summaryrefslogtreecommitdiffstats
path: root/JLanguageTool/src/dev
diff options
context:
space:
mode:
Diffstat (limited to 'JLanguageTool/src/dev')
-rw-r--r--JLanguageTool/src/dev/de/danielnaber/languagetool/dev/CheckBNC.java105
-rw-r--r--JLanguageTool/src/dev/de/danielnaber/languagetool/dev/ContextFinder.java122
-rw-r--r--JLanguageTool/src/dev/de/danielnaber/languagetool/dev/ExportGermanNouns.java86
-rw-r--r--JLanguageTool/src/dev/de/danielnaber/languagetool/dev/Indexer.java100
-rw-r--r--JLanguageTool/src/dev/de/danielnaber/languagetool/dev/POSTagAnalyzer.java49
-rw-r--r--JLanguageTool/src/dev/de/danielnaber/languagetool/dev/POSTagFilter.java94
-rw-r--r--JLanguageTool/src/dev/de/danielnaber/languagetool/dev/POSTagLanguageModel.java147
-rw-r--r--JLanguageTool/src/dev/de/danielnaber/languagetool/dev/PrintLocales.java94
-rw-r--r--JLanguageTool/src/dev/de/danielnaber/languagetool/dev/RuleOverview.java195
-rw-r--r--JLanguageTool/src/dev/de/danielnaber/languagetool/dev/tools/RomanianDiacriticsModifier.java99
-rw-r--r--JLanguageTool/src/dev/de/danielnaber/languagetool/dev/wikipedia/BaseWikipediaDumpHandler.java155
-rw-r--r--JLanguageTool/src/dev/de/danielnaber/languagetool/dev/wikipedia/CheckWikipediaDump.java143
-rw-r--r--JLanguageTool/src/dev/de/danielnaber/languagetool/dev/wikipedia/DatabaseDumpHandler.java90
-rw-r--r--JLanguageTool/src/dev/de/danielnaber/languagetool/dev/wikipedia/OutputDumpHandler.java60
-rw-r--r--JLanguageTool/src/dev/de/danielnaber/languagetool/dev/wikipedia/WikipediaTextFilter.java52
-rw-r--r--JLanguageTool/src/dev/tools/add_short.xsl59
-rw-r--r--JLanguageTool/src/dev/tools/convert.xsl50
-rw-r--r--JLanguageTool/src/dev/tools/print.xsl200
-rw-r--r--JLanguageTool/src/dev/tools/stats.awk62
19 files changed, 1962 insertions, 0 deletions
diff --git a/JLanguageTool/src/dev/de/danielnaber/languagetool/dev/CheckBNC.java b/JLanguageTool/src/dev/de/danielnaber/languagetool/dev/CheckBNC.java
new file mode 100644
index 0000000..24931e6
--- /dev/null
+++ b/JLanguageTool/src/dev/de/danielnaber/languagetool/dev/CheckBNC.java
@@ -0,0 +1,105 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+
+package de.danielnaber.languagetool.dev;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.util.List;
+
+import de.danielnaber.languagetool.JLanguageTool;
+import de.danielnaber.languagetool.Language;
+import de.danielnaber.languagetool.TextFilter;
+import de.danielnaber.languagetool.tokenizers.SentenceTokenizer;
+import de.danielnaber.languagetool.tools.StringTools;
+import de.danielnaber.languagetool.tools.Tools;
+
+/**
+ * Uses JLanguageTol recursively on the files of the BNC (British National Corpus).
+ *
+ * @author Daniel Naber
+ */
+public final class CheckBNC {
+
+ private JLanguageTool langTool = null;
+ private final TextFilter textFilter = new BNCTextFilter();
+
+ static final boolean CHECK_BY_SENTENCE = true;
+
+ public static void main(String[] args) throws Exception {
+ if (args.length != 1) {
+ System.out.println("Usage: CheckBNC <directory>");
+ System.exit(1);
+ }
+ final CheckBNC prg = new CheckBNC();
+ prg.run(new File(args[0]));
+ }
+
+ private CheckBNC() throws IOException {
+ langTool = new JLanguageTool(Language.ENGLISH);
+ langTool.activateDefaultPatternRules();
+ final String[] disRules = new String[] {"UPPERCASE_SENTENCE_START", "COMMA_PARENTHESIS_WHITESPACE",
+ "WORD_REPEAT_RULE", "DOUBLE_PUNCTUATION"};
+ System.err.println("Note: disabling the following rules:");
+ for (String disRule : disRules) {
+ langTool.disableRule(disRule);
+ System.err.println(" " + disRule);
+ }
+ }
+
+ private void run(final File file) throws IOException {
+ if (file.isDirectory()) {
+ final File[] files = file.listFiles();
+ for (File file1 : files) {
+ run(new File(file, file1.getName()));
+ }
+ } else {
+ System.out.println("Checking " + file.getAbsolutePath());
+ String text = StringTools.readFile(new FileInputStream(file.getAbsolutePath()));
+ text = textFilter.filter(text);
+ if (CHECK_BY_SENTENCE) {
+ final SentenceTokenizer st = new SentenceTokenizer();
+ final List<String> sentences = st.tokenize(text);
+ for (String sentence : sentences) {
+ Tools.checkText(sentence, langTool, false, 1000);
+ }
+ } else {
+ Tools.checkText(text, langTool);
+ }
+ }
+ }
+
+}
+
+class BNCTextFilter implements TextFilter {
+
+ public String filter(String text) {
+ text = text.replaceAll("(?s)<header.*?>.*?</header>", "");
+ text = text.replaceAll("<w.*?>", "");
+ text = text.replaceAll("<c.*?>", "");
+ text = text.replaceAll("<.*?>", "");
+ text = text.replaceAll(" +", " ");
+ text = text.replaceAll("&bquo|&equo", "\"");
+ text = text.replaceAll("&mdash;?", "--");
+ text = text.replaceAll("&amp;?", "&");
+ return text;
+ }
+
+}
diff --git a/JLanguageTool/src/dev/de/danielnaber/languagetool/dev/ContextFinder.java b/JLanguageTool/src/dev/de/danielnaber/languagetool/dev/ContextFinder.java
new file mode 100644
index 0000000..0154b33
--- /dev/null
+++ b/JLanguageTool/src/dev/de/danielnaber/languagetool/dev/ContextFinder.java
@@ -0,0 +1,122 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2006 Daniel Naber (http://www.danielnaber.de)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+package de.danielnaber.languagetool.dev;
+
+import java.io.IOException;
+
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermEnum;
+import org.apache.lucene.search.Hits;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.PhraseQuery;
+
+/**
+ * Compare the one-word right or left context of two words. This is useful
+ * to find potential rules for similar words, i.e. contexts that are typical
+ * for only one of the words.
+ *
+ * @author Daniel Naber
+ */
+public class ContextFinder {
+
+ private ContextFinder() {}
+
+ public static void main(String[] args) throws IOException {
+ if (args.length != 4 || !args[3].startsWith("--context")) {
+ printUsageAndExit();
+ }
+ final ContextFinder prg = new ContextFinder();
+ if (args[3].endsWith("=right"))
+ prg.run(args[0], args[1], args[2], true);
+ else if (args[3].endsWith("=left"))
+ prg.run(args[0], args[1], args[2], false);
+ else
+ printUsageAndExit();
+ }
+
+ private static void printUsageAndExit() {
+ System.err.println("Usage: ContextFinder <indexDir> <term1> <term2> --context=right|left");
+ System.exit(1);
+ }
+
+ private void run(String indexDir, String term1, String term2, boolean rightContext) throws IOException {
+ final IndexReader reader = IndexReader.open(indexDir);
+ final IndexSearcher searcher = new IndexSearcher(reader);
+ final TermEnum termEnum = reader.terms();
+ int termCount = 0;
+ System.out.println(term1 + ": " + reader.docFreq(new Term(Indexer.BODY_FIELD, term1)) + "x");
+ System.out.println(term2 + ": " + reader.docFreq(new Term(Indexer.BODY_FIELD, term2)) + "x");
+ while (termEnum.next()) {
+ final Term t = termEnum.term();
+ if (isPOSTag(t))
+ continue;
+ // first term:
+ final PhraseQuery pq1 = makeQuery(t, term1, rightContext);
+ final int hits1 = search(pq1, searcher);
+ // second term:
+ final PhraseQuery pq2 = makeQuery(t, term2, rightContext);
+ final int hits2 = search(pq2, searcher);
+ final float rel = (float)(hits1+1) / (float)(hits2+1);
+ if (rel > 1.0f)
+ System.out.println("#1: " + rel + ": " + myToString(pq1) + ": " + hits1 + " <-> " + myToString(pq2) + ": " + hits2);
+ else if (rel < 1.0f)
+ System.out.println("#2: " + rel + ": " + myToString(pq1) + ": " + hits1 + " <-> " + myToString(pq2) + ": " + hits2);
+ termCount++;
+ }
+ System.out.println("termCount = " + termCount);
+ searcher.close();
+ reader.close();
+ }
+
+ private String myToString(PhraseQuery pq) {
+ return pq.toString().replaceAll("body:", "");
+ }
+
+ private PhraseQuery makeQuery(Term t, String term1, boolean rightContext) {
+ final PhraseQuery pq = new PhraseQuery();
+ if (rightContext) {
+ pq.add(new Term(Indexer.BODY_FIELD, term1));
+ pq.add(new Term(Indexer.BODY_FIELD, t.text()));
+ } else {
+ pq.add(new Term(Indexer.BODY_FIELD, t.text()));
+ pq.add(new Term(Indexer.BODY_FIELD, term1));
+ }
+ return pq;
+ }
+
+ private int search(PhraseQuery pq, IndexSearcher searcher) throws IOException {
+ //long time = System.currentTimeMillis();
+ final Hits h = searcher.search(pq);
+ //long searchTime = System.currentTimeMillis()-time;
+ if (h.length() > 0) {
+ //System.err.println(h.length() + " " + pq);
+ //System.err.println(" " + searchTime + "ms");
+ }
+ return h.length();
+ }
+
+ private boolean isPOSTag(Term t) {
+ if (t.text().equals(t.text().toUpperCase())) { // e.g. "VER:1:PLU:KJ2:NON:NEB"
+ return true;
+ }
+ return false;
+ }
+
+}
diff --git a/JLanguageTool/src/dev/de/danielnaber/languagetool/dev/ExportGermanNouns.java b/JLanguageTool/src/dev/de/danielnaber/languagetool/dev/ExportGermanNouns.java
new file mode 100644
index 0000000..ad7d231
--- /dev/null
+++ b/JLanguageTool/src/dev/de/danielnaber/languagetool/dev/ExportGermanNouns.java
@@ -0,0 +1,86 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+
+/*
+ * Created on 06.05.2007
+ */
+package de.danielnaber.languagetool.dev;
+
+import java.io.*;
+import java.nio.ByteBuffer;
+import java.util.HashSet;
+import java.util.Set;
+
+import de.danielnaber.languagetool.JLanguageTool;
+
+import morfologik.fsa.FSA;
+
+/**
+ * Export German nouns as a serialized Java HashSet, to be used
+ * by jWordSplitter.
+ *
+ * @author Daniel Naber
+ */
+public class ExportGermanNouns {
+
+  private static final String DICT_FILENAME = "/de/german.dict";
+
+  private ExportGermanNouns() {
+  }
+
+  /**
+   * Collects the lower-cased words of all dictionary entries that contain
+   * "+SUB:" but not ":ADJ".
+   *
+   * @return the set of words, without duplicates
+   * @throws IOException if the dictionary cannot be read
+   */
+  private Set<String> getWords() throws IOException {
+    final FSA fsa = FSA.getInstance(JLanguageTool.getDataBroker().getFromResourceDirAsStream(DICT_FILENAME));
+    final Set<String> words = new HashSet<String>();
+    for (ByteBuffer bb : fsa) {
+      final byte[] sequence = new byte[bb.remaining()];
+      bb.get(sequence);
+      final String entry = new String(sequence, "iso-8859-1");
+      if (entry.indexOf("+SUB:") != -1 && entry.indexOf(":ADJ") == -1) {
+        // the word is the part in front of the first '+'; the set takes care
+        // of entries that repeat the same word
+        final String[] parts = entry.split("\\+");
+        words.add(parts[0].toLowerCase());
+      }
+    }
+    return words;
+  }
+
+  /**
+   * Writes the set to the given file using Java serialization.
+   *
+   * @throws IOException if the file cannot be written
+   */
+  private void serialize(Set<String> words, File outputFile) throws IOException {
+    final FileOutputStream fos = new FileOutputStream(outputFile);
+    try {
+      final ObjectOutputStream oos = new ObjectOutputStream(fos);
+      oos.writeObject(words);
+      // flushes the object stream's buffer into the file
+      oos.close();
+    } finally {
+      // also reached if writing failed, so that the file handle is not leaked
+      fos.close();
+    }
+  }
+
+  /**
+   * Command line entry point.
+   *
+   * @param args exactly one element: the file to write the set to
+   * @throws IOException if reading the dictionary or writing the file fails
+   */
+  public static void main(String[] args) throws IOException {
+    if (args.length != 1) {
+      System.out.println("Usage: ExportGermanNouns <outputFile>");
+      System.exit(1);
+    }
+    final ExportGermanNouns prg = new ExportGermanNouns();
+    final Set<String> words = prg.getWords();
+    prg.serialize(words, new File(args[0]));
+  }
+
+}
diff --git a/JLanguageTool/src/dev/de/danielnaber/languagetool/dev/Indexer.java b/JLanguageTool/src/dev/de/danielnaber/languagetool/dev/Indexer.java
new file mode 100644
index 0000000..fc3392b
--- /dev/null
+++ b/JLanguageTool/src/dev/de/danielnaber/languagetool/dev/Indexer.java
@@ -0,0 +1,100 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2006 Daniel Naber (http://www.danielnaber.de)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+package de.danielnaber.languagetool.dev;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.index.IndexWriter;
+
+import de.danielnaber.languagetool.Language;
+import de.danielnaber.languagetool.tools.StringTools;
+
+/**
+ * POS tag and index text files using Lucene. Required for ContextFinder.java.
+ * TODO: hard-coded to index a specific kind of XML.
+ *
+ * @author Daniel Naber
+ */
+public class Indexer {
+
+ static final String BODY_FIELD = "body";
+
+ private Indexer() {}
+
+ public static void main(String[] args) throws IOException {
+ final Indexer prg = new Indexer();
+ if (args.length != 2) {
+ System.err.println("Usage: Indexer <dataDir> <indexDir>");
+ System.exit(1);
+ }
+ // FIXME: make this an option:
+ final Language lang = Language.GERMAN;
+ prg.run(args[0], args[1], lang);
+ }
+
+ private void run(String dataDir, String indexDir, Language lang) throws IOException {
+ final IndexWriter iw = new IndexWriter(indexDir, new POSTagAnalyzer(lang.getTagger()), true);
+ iw.setMaxBufferedDocs(100);
+ index(iw, new File(dataDir), 1);
+ System.out.println("Optimizing index...");
+ iw.optimize();
+ iw.close();
+ System.out.println("Done.");
+ }
+
+ private void index(IndexWriter iw, File dir, int count) throws IOException {
+ if (dir.isDirectory()) {
+ final File[] files = dir.listFiles();
+ for (File file : files) {
+ index(iw, file, ++count);
+ }
+ } else {
+ final Document doc = new Document();
+ if (count % 50 == 0)
+ System.out.println("Indexing file #" + count);
+ String s = StringTools.readFile(new FileInputStream(dir.getAbsolutePath()), "iso-8859-1");
+ // XML data:
+ s = getParagraphs(s);
+ //s = s.replaceAll("(\\w)([.,?!])", "$1 $2");
+ //s = s.replaceAll("<.*?>", "");
+ //System.err.println(">"+s);
+ doc.add(new Field(BODY_FIELD, s, Field.Store.YES, Field.Index.TOKENIZED));
+ iw.addDocument(doc);
+ }
+ }
+
+ private String getParagraphs(String xml) {
+ final StringBuilder sb = new StringBuilder();
+ final Pattern pattern = Pattern.compile("<p>(.*?)</p>", Pattern.DOTALL);
+ final Matcher matcher = pattern.matcher(xml);
+ int pos = 0;
+ while (matcher.find(pos)) {
+ sb.append(matcher.group(1));
+ pos = matcher.end();
+ }
+ return sb.toString();
+ }
+
+}
diff --git a/JLanguageTool/src/dev/de/danielnaber/languagetool/dev/POSTagAnalyzer.java b/JLanguageTool/src/dev/de/danielnaber/languagetool/dev/POSTagAnalyzer.java
new file mode 100644
index 0000000..06d6cd4
--- /dev/null
+++ b/JLanguageTool/src/dev/de/danielnaber/languagetool/dev/POSTagAnalyzer.java
@@ -0,0 +1,49 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2006 Daniel Naber (http://www.danielnaber.de)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+package de.danielnaber.languagetool.dev;
+
+import java.io.Reader;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+
+import de.danielnaber.languagetool.tagging.Tagger;
+
+/**
+ * Lucene analyzer whose token stream is a {@link StandardTokenizer}
+ * wrapped in a {@link POSTagFilter}.
+ *
+ * @author Daniel Naber
+ */
+class POSTagAnalyzer extends Analyzer {
+
+  /** Tagger that is handed on to every {@link POSTagFilter} created by this analyzer. */
+  private final Tagger tagger;
+
+  /**
+   * @param tagger the tagger to be used by the filter
+   */
+  public POSTagAnalyzer(Tagger tagger) {
+    this.tagger = tagger;
+  }
+
+  /**
+   * Creates the token stream for the given reader. The field name is ignored
+   * and no lower-casing filter is part of the chain.
+   */
+  public TokenStream tokenStream(@SuppressWarnings("unused")String fieldName, Reader reader) {
+    final TokenStream tokenizer = new StandardTokenizer(reader);
+    return new POSTagFilter(tokenizer, tagger);
+  }
+
+}
diff --git a/JLanguageTool/src/dev/de/danielnaber/languagetool/dev/POSTagFilter.java b/JLanguageTool/src/dev/de/danielnaber/languagetool/dev/POSTagFilter.java
new file mode 100644
index 0000000..01fc600
--- /dev/null
+++ b/JLanguageTool/src/dev/de/danielnaber/languagetool/dev/POSTagFilter.java
@@ -0,0 +1,94 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2006 Daniel Naber (http://www.danielnaber.de)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+package de.danielnaber.languagetool.dev;
+
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+import java.util.Stack;
+
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+
+import de.danielnaber.languagetool.AnalyzedToken;
+import de.danielnaber.languagetool.AnalyzedTokenReadings;
+import de.danielnaber.languagetool.tagging.Tagger;
+import de.danielnaber.languagetool.tagging.de.AnalyzedGermanTokenReadings;
+
+/**
+ * Filter that puts the words of a text, the base form, and word's POS tags at the
+ * same index position.
+ * <p>
+ * For every input token the lower-cased text is returned with the prefix
+ * "T_". The POS tags of its readings (unchanged) and its lemmas (lower-cased,
+ * with the prefix "B_") follow as tokens with a position increment of 0.
+ *
+ * @author Daniel Naber
+ */
+class POSTagFilter extends TokenFilter {
+
+  private static final String BASEFORM_PREFIX = "B_";
+  private static final String TEXTFORM_PREFIX = "T_";
+
+  /** Additional tokens of the current word that have not been returned yet. */
+  private final Stack<Token> stack = new Stack<Token>();
+  private final Tagger tagger;
+
+  /**
+   * @param in the stream of words to be tagged
+   * @param tagger the tagger that delivers POS tags and lemmas
+   */
+  public POSTagFilter(TokenStream in, Tagger tagger) {
+    super(in);
+    this.tagger = tagger;
+  }
+
+  /**
+   * Returns the pending additional tokens of the previous word first; once
+   * there are none left, the next word is read from the input and tagged.
+   *
+   * @return the next token, or null at the end of the input
+   */
+  public final org.apache.lucene.analysis.Token next() throws java.io.IOException {
+    if (!stack.isEmpty()) {
+      return stack.pop();
+    }
+    final Token t = input.next();
+    if (t == null) {
+      return null;
+    }
+    final String text = t.termText();
+    final List<String> wordList = new ArrayList<String>();
+    wordList.add(text);
+    // one set per word, shared by all its readings, so that a lemma which
+    // several readings have in common is indexed only once
+    final Set<String> indexedLemmas = new HashSet<String>();
+    final List<AnalyzedTokenReadings> taggedWords = tagger.tag(wordList);
+    for (AnalyzedTokenReadings taggedWord : taggedWords) {
+      // NOTE(review): the cast limits this filter to taggers that return German readings
+      final AnalyzedGermanTokenReadings germanReadings = (AnalyzedGermanTokenReadings) taggedWord;
+      final List<AnalyzedToken> readings = germanReadings.getReadings();
+      for (AnalyzedToken reading : readings) {
+        if (reading.getPOSTag() != null) {
+          pushAtSamePosition(reading.getPOSTag(), t);
+        }
+        if (reading.getLemma() != null) {
+          final String lemma = reading.getLemma().toLowerCase();
+          // a lemma that equals the word itself adds no information
+          if (!lemma.equalsIgnoreCase(text) && indexedLemmas.add(lemma)) {
+            pushAtSamePosition(BASEFORM_PREFIX + lemma, t);
+          }
+        }
+      }
+    }
+    return new Token(TEXTFORM_PREFIX + text.toLowerCase(), t.startOffset(), t.endOffset());
+  }
+
+  /** Queues a token with the offsets of <code>source</code> and a position increment of 0. */
+  private void pushAtSamePosition(String term, Token source) {
+    final Token extra = new Token(term, source.startOffset(), source.endOffset());
+    extra.setPositionIncrement(0);
+    stack.push(extra);
+  }
+
+}
diff --git a/JLanguageTool/src/dev/de/danielnaber/languagetool/dev/POSTagLanguageModel.java b/JLanguageTool/src/dev/de/danielnaber/languagetool/dev/POSTagLanguageModel.java
new file mode 100644
index 0000000..68439cc
--- /dev/null
+++ b/JLanguageTool/src/dev/de/danielnaber/languagetool/dev/POSTagLanguageModel.java
@@ -0,0 +1,147 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2009 Daniel Naber (http://www.danielnaber.de)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+
+package de.danielnaber.languagetool.dev;
+
+import java.io.BufferedInputStream;
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.util.ArrayList;
+import java.util.List;
+
+import de.danielnaber.languagetool.AnalyzedSentence;
+import de.danielnaber.languagetool.AnalyzedTokenReadings;
+import de.danielnaber.languagetool.JLanguageTool;
+import de.danielnaber.languagetool.Language;
+
+/**
+ * Tag text and display only POS tags to create an n-gram language model.
+ * <p>
+ * The text is read from standard input; for every sentence one line of the
+ * form <code>&lt;S&gt;tags...&lt;/S&gt;</code> is written to standard output.
+ *
+ * @author Marcin Milkowski
+ */
+public class POSTagLanguageModel {
+
+  /** Number of buffered characters after which the text is tagged even without an empty line. */
+  private static final int MAX_FILE_SIZE = 64000;
+
+  /**
+   * Command line entry point.
+   *
+   * @param args exactly one element: the short name of the language
+   * @throws IOException if reading standard input or tagging fails
+   */
+  public static void main(final String[] args) throws IOException {
+    if (args.length == 1) {
+      final Language language = getLanguageOrExit(args[0]);
+      final JLanguageTool lt = new JLanguageTool(language, null);
+      runOnStdIn(lt);
+    } else {
+      exitWithUsageMessage();
+    }
+  }
+
+  /**
+   * Returns the language with the given short name. If there is none, the
+   * known short names and the usage message are printed and the program exits.
+   */
+  private static Language getLanguageOrExit(final String lang) {
+    final List<String> supportedLanguages = new ArrayList<String>();
+    for (final Language candidate : Language.LANGUAGES) {
+      if (lang.equals(candidate.getShortName())) {
+        return candidate;
+      }
+      supportedLanguages.add(candidate.getShortName());
+    }
+    System.out.println("Unknown language '" + lang
+        + "'. Supported languages are: " + supportedLanguages);
+    exitWithUsageMessage();
+    // not reached, exitWithUsageMessage() terminates the JVM
+    return null;
+  }
+
+  /** Prints the usage message and terminates the program with exit code 1. */
+  private static void exitWithUsageMessage() {
+    System.out
+        .println("Usage: java de.danielnaber.languagetool.dev.POSTagLanguageModel language");
+    System.exit(1);
+  }
+
+  /**
+   * Reads standard input line by line and tags the collected text after every
+   * line if single line breaks mark paragraphs for the language, otherwise
+   * after an empty line or as soon as {@link #MAX_FILE_SIZE} characters have
+   * been collected. What is left at the end of the input is tagged as well.
+   */
+  private static void runOnStdIn(final JLanguageTool lt) throws IOException {
+    final BufferedReader br = new BufferedReader(
+        new InputStreamReader(new BufferedInputStream(System.in)));
+    try {
+      final StringBuilder sb = new StringBuilder();
+      String line;
+      while ((line = br.readLine()) != null) {
+        sb.append(line);
+        sb.append('\n');
+        final boolean flush = lt.getLanguage().getSentenceTokenizer().singleLineBreaksMarksPara()
+            || "".equals(line) || sb.length() >= MAX_FILE_SIZE;
+        if (flush) {
+          tagText(sb.toString(), lt);
+          sb.setLength(0);
+        }
+      }
+      if (sb.length() > 0) {
+        tagText(sb.toString(), lt);
+      }
+    } finally {
+      // closes the wrapped readers and streams, too
+      br.close();
+    }
+  }
+
+  /** Splits the text into sentences and prints the POS tag line of each of them. */
+  private static void tagText(final String contents, final JLanguageTool lt)
+      throws IOException {
+    for (final String sentence : lt.sentenceTokenize(contents)) {
+      final AnalyzedSentence analyzedSentence = lt.getAnalyzedSentence(sentence);
+      System.out.println(getSentence(analyzedSentence));
+    }
+  }
+
+  /**
+   * Returns the tags of all non-whitespace tokens, each followed by a space,
+   * enclosed in <code>&lt;S&gt;</code> and <code>&lt;/S&gt;</code>.
+   */
+  private static String getSentence(final AnalyzedSentence sent) {
+    final StringBuilder sb = new StringBuilder();
+    sb.append("<S>");
+    for (final AnalyzedTokenReadings atr : sent.getTokensWithoutWhitespace()) {
+      sb.append(getPOS(atr));
+      sb.append(' ');
+    }
+    sb.append("</S>");
+    return sb.toString();
+  }
+
+  /**
+   * Returns the POS tags of all readings of the token joined by '+', or an
+   * empty string for a whitespace token.
+   */
+  private static String getPOS(final AnalyzedTokenReadings atr) {
+    if (atr.isWhitespace()) {
+      return "";
+    }
+    final StringBuilder sb = new StringBuilder();
+    final int readNum = atr.getReadingsLength();
+    for (int i = 0; i < readNum; i++) {
+      if (i > 0) {
+        sb.append('+');
+      }
+      sb.append(atr.getAnalyzedToken(i).getPOSTag());
+    }
+    return sb.toString();
+  }
+
+}
diff --git a/JLanguageTool/src/dev/de/danielnaber/languagetool/dev/PrintLocales.java b/JLanguageTool/src/dev/de/danielnaber/languagetool/dev/PrintLocales.java
new file mode 100644
index 0000000..a946e07
--- /dev/null
+++ b/JLanguageTool/src/dev/de/danielnaber/languagetool/dev/PrintLocales.java
@@ -0,0 +1,94 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2006 Daniel Naber (http://www.danielnaber.de)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+package de.danielnaber.languagetool.dev;
+
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.util.Properties;
+
+import de.danielnaber.languagetool.Language;
+import de.danielnaber.languagetool.tools.StringTools;
+
+/**
+ * Used for creating ooolocales.properties file that defines a property that is
+ * needed to build Linguistic.xcu. Run internally by the ant build.
+ * <p>
+ * The file is only rewritten if the list of locales differs from the value
+ * of the property "countryvariants" that it already contains.
+ *
+ * @author Marcin Miłkowski
+ */
+public final class PrintLocales {
+
+  /** Name of the properties file that is read and, if necessary, rewritten. */
+  final static String FILENAME = "ooolocales.properties";
+
+  public static void main(final String[] args) throws IOException {
+    new PrintLocales().run();
+  }
+
+  /**
+   * Compares the current list of locales with the one stored in
+   * {@link #FILENAME} and stores the new list if the two differ.
+   */
+  private void run() throws IOException {
+    final String locales = collectLocales();
+
+    final Properties stored = new Properties();
+    final FileInputStream fIn = new FileInputStream(FILENAME);
+    try {
+      stored.load(fIn);
+    } finally {
+      fIn.close();
+    }
+    if (locales.equals(stored.getProperty("countryvariants"))) {
+      // nothing has changed, leave the file alone
+      return;
+    }
+
+    final Properties updated = new Properties();
+    updated.setProperty("countryvariants", locales);
+    FileOutputStream fOut = null;
+    try {
+      fOut = new FileOutputStream(FILENAME);
+      updated.store(fOut, "Locales");
+    } finally {
+      if (fOut == null) {
+        // the file could not even be opened for writing
+        System.err.println("Cannot save new locales!");
+        System.exit(1);
+      } else {
+        fOut.close();
+      }
+    }
+  }
+
+  /**
+   * Builds the space-separated list of locales: one entry per country variant
+   * of every language except the demo language, written as the language's
+   * short name, followed by "-" and the variant unless the variant is empty.
+   */
+  private String collectLocales() {
+    final StringBuilder locales = new StringBuilder();
+    for (final Language language : Language.LANGUAGES) {
+      if (language.equals(Language.DEMO)) {
+        continue;
+      }
+      for (final String variant : language.getCountryVariants()) {
+        if (locales.length() > 0) {
+          locales.append(' ');
+        }
+        locales.append(language.getShortName());
+        if (!StringTools.isEmpty(variant)) {
+          locales.append('-').append(variant);
+        }
+      }
+    }
+    return locales.toString();
+  }
+
+}
diff --git a/JLanguageTool/src/dev/de/danielnaber/languagetool/dev/RuleOverview.java b/JLanguageTool/src/dev/de/danielnaber/languagetool/dev/RuleOverview.java
new file mode 100644
index 0000000..c29b074
--- /dev/null
+++ b/JLanguageTool/src/dev/de/danielnaber/languagetool/dev/RuleOverview.java
@@ -0,0 +1,195 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2006 Daniel Naber (http://www.danielnaber.de)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+package de.danielnaber.languagetool.dev;
+
+import java.io.File;
+import java.io.FileFilter;
+import java.io.IOException;
+import java.text.SimpleDateFormat;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Date;
+import java.util.List;
+
+import de.danielnaber.languagetool.JLanguageTool;
+import de.danielnaber.languagetool.Language;
+import de.danielnaber.languagetool.language.Contributor;
+import de.danielnaber.languagetool.tools.StringTools;
+import de.danielnaber.languagetool.tools.Tools;
+
+/**
+ * Command line tool to list supported languages and their number of rules.
+ *
+ * @author Daniel Naber
+ */
+public final class RuleOverview {
+
+  /** Prefix of the repository links printed for grammar.xml and false-friends.xml. */
+  private static final String RULES_BASE_URL =
+      "http://languagetool.cvs.sourceforge.net/*checkout*/languagetool/JLanguageTool/src/rules/";
+
+  public static void main(final String[] args) throws IOException {
+    final RuleOverview prg = new RuleOverview();
+    prg.run();
+  }
+
+  private RuleOverview() {
+    // no constructor
+  }
+
+  /**
+   * Prints an HTML table to stdout with one row per language (except the demo
+   * language), sorted by language name. Each row has the same eight cells as
+   * the header: name, XML rule count, spacer, Java rule count, spacer,
+   * false-friend count, spacer, maintainers.
+   *
+   * @throws IOException if a rule file cannot be read
+   */
+  private void run() throws IOException {
+    printHeader();
+
+    final List<String> sortedLanguages = new ArrayList<String>();
+    for (Language element : Language.LANGUAGES) {
+      if (element != Language.DEMO) {
+        sortedLanguages.add(element.getName());
+      }
+    }
+    Collections.sort(sortedLanguages);
+
+    final String rulesDir = JLanguageTool.getDataBroker().getRulesDir();
+
+    // setup false friends counting; the file is only read when it can be found,
+    // otherwise every language simply gets a count of 0
+    final String falseFriendFile = rulesDir + File.separator + "false-friends.xml";
+    final java.net.URL falseFriendUrl = this.getClass().getResource(falseFriendFile);
+    final String falseFriendRules = falseFriendUrl == null
+        ? ""
+        : stripCommentsAndRulesTag(StringTools.readFile(Tools.getStream(falseFriendFile)));
+
+    for (final String langName : sortedLanguages) {
+      final Language lang = Language.getLanguageForName(langName);
+      System.out.print("<tr>");
+      System.out.print("<td>" + lang.getName() + "</td>");
+      printXmlRuleCell(lang, rulesDir);
+      System.out.print("<td></td>");
+      printJavaRuleCell(lang, rulesDir);
+      System.out.println("<td></td>");
+
+      // false friends
+      final int falseFriendCount =
+          countOccurrences(falseFriendRules, "<pattern lang=\"" + lang.getShortName());
+      System.out.print("<td align=\"right\">" + falseFriendCount + "</td>");
+
+      // maintainer information: printed for every row so that the row always
+      // matches the header, even without a false-friends file
+      System.out.print("<td></td>");
+      System.out.print("<td align=\"left\">" + buildMaintainerInfo(lang) + "</td>");
+
+      System.out.println("</tr>");
+    }
+
+    System.out.println("</table>");
+  }
+
+  /** Prints the title, the current date and the table header row. */
+  private void printHeader() {
+    System.out.println("<b>Rules in LanguageTool " + JLanguageTool.VERSION + "</b><br />");
+    System.out.println("Date: " + new SimpleDateFormat("yyyy-MM-dd").format(new Date()) + "<br /><br />\n");
+    System.out.println("<table>");
+    System.out.println("<tr>");
+    System.out.println(" <th></th>");
+    System.out.println(" <th align=\"right\">XML rules</th>");
+    System.out.println(" <th>&nbsp;&nbsp;</th>");
+    System.out.println(" <th align=\"right\">Java rules</th>");
+    System.out.println(" <th>&nbsp;&nbsp;</th>");
+    System.out.println(" <th align=\"right\">" +
+        "<a href=\"" + RULES_BASE_URL + "false-friends.xml\">False friends</a></th>");
+    System.out.println(" <th>&nbsp;&nbsp;</th>");
+    System.out.println(" <th align=\"left\">Rule Maintainers</th>");
+    System.out.println("</tr>");
+  }
+
+  /**
+   * Prints the cell with the number of XML rules of the given language, i.e.
+   * the number of "&lt;rule " plus "&lt;rule&gt;" occurrences in its grammar.xml,
+   * or 0 if that file cannot be found.
+   */
+  private void printXmlRuleCell(final Language lang, final String rulesDir) throws IOException {
+    final String xmlFile = rulesDir + File.separator + lang.getShortName() + File.separator + "grammar.xml";
+    final java.net.URL url = this.getClass().getResource(xmlFile);
+    if (url == null) {
+      System.out.println("<td align=\"right\">0</td>");
+      return;
+    }
+    final String xmlRules = stripCommentsAndRulesTag(StringTools.readFile(Tools.getStream(xmlFile)));
+    final int count = countOccurrences(xmlRules, "<rule ");
+    final int countInRuleGroup = countOccurrences(xmlRules, "<rule>");
+    System.out.print("<td align=\"right\">" + (count + countInRuleGroup) + " (" +
+        "<a href=\"" + RULES_BASE_URL + lang.getShortName() + "/grammar.xml\">show</a>/" +
+        "<a href=\"http://community.languagetool.org/rule/list?lang=" +
+        lang.getShortName() + "\">browse</a>" +
+        ")</td>");
+  }
+
+  /**
+   * Prints the cell with the number of Java rules of the given language: the
+   * number of .java files in the language's rule source directory minus one.
+   * A missing or unreadable directory counts as 0, and the result is never negative.
+   */
+  private void printJavaRuleCell(final Language lang, final String rulesDir) {
+    final File dir = new File("src/java/de/danielnaber/languagetool" +
+        rulesDir + "/" + lang.getShortName());
+    int javaCount = 0;
+    if (dir.exists()) {
+      // listFiles() returns null if dir is not a directory or cannot be read
+      final File[] javaRules = dir.listFiles(new JavaFilter());
+      if (javaRules != null) {
+        // minus 1: one is always "<Language>Rule.java"
+        javaCount = Math.max(0, javaRules.length - 1);
+      }
+    }
+    System.out.print("<td align=\"right\">" + javaCount + "</td>");
+  }
+
+  /** Builds the comma-separated maintainer list (name, optional link, optional remark). */
+  private String buildMaintainerInfo(final Language lang) {
+    final StringBuilder maintainerInfo = new StringBuilder();
+    if (lang.getMaintainers() != null) {
+      for (Contributor contributor : lang.getMaintainers()) {
+        if (maintainerInfo.length() > 0) {
+          maintainerInfo.append(", ");
+        }
+        if (contributor.getUrl() != null) {
+          maintainerInfo.append("<a href=\"");
+          maintainerInfo.append(contributor.getUrl());
+          maintainerInfo.append("\">");
+        }
+        maintainerInfo.append(contributor.getName());
+        if (contributor.getUrl() != null) {
+          maintainerInfo.append("</a>");
+        }
+        if (contributor.getRemark() != null) {
+          maintainerInfo.append("&nbsp;(").append(contributor.getRemark()).append(")");
+        }
+      }
+    }
+    return maintainerInfo.toString();
+  }
+
+  /** Removes XML comments and the opening "rules" tag so that they are not counted. */
+  private static String stripCommentsAndRulesTag(final String xml) {
+    return xml.replaceAll("(?s)<!--.*?-->", "").replaceAll("(?s)<rules.*?>", "");
+  }
+
+  /**
+   * Counts the occurrences of <code>needle</code> in <code>text</code>. The
+   * search starts at index 0, so a match at the very beginning is counted too.
+   */
+  private static int countOccurrences(final String text, final String needle) {
+    int count = 0;
+    int pos = text.indexOf(needle);
+    while (pos != -1) {
+      count++;
+      pos = text.indexOf(needle, pos + 1);
+    }
+    return count;
+  }
+
+}
+
+class JavaFilter implements FileFilter {
+
+  /**
+   * Accepts a file if and only if its name ends with ".java" (case-sensitive).
+   */
+  public boolean accept(final File candidate) {
+    return candidate.getName().endsWith(".java");
+  }
+
+}
diff --git a/JLanguageTool/src/dev/de/danielnaber/languagetool/dev/tools/RomanianDiacriticsModifier.java b/JLanguageTool/src/dev/de/danielnaber/languagetool/dev/tools/RomanianDiacriticsModifier.java
new file mode 100644
index 0000000..6fc90bf
--- /dev/null
+++ b/JLanguageTool/src/dev/de/danielnaber/languagetool/dev/tools/RomanianDiacriticsModifier.java
@@ -0,0 +1,99 @@
+package de.danielnaber.languagetool.dev.tools;
+
+/**
+ *
+ * Helper class for Romanian diacritics correction. Many Romanian texts
+ * (including the Romanian Wikipedia) contain wrong diacritics: <b>ş</b> instead of
+ * <b>ș</b> and <b>ţ</b> instead of <b>ț</b>.
+ *
+ * @author Ionuț Păduraru
+ * @since 14.04.2009 12:27:24
+ */
+public final class RomanianDiacriticsModifier {
+
+  private RomanianDiacriticsModifier() {
+    // private constructor
+  }
+
+  /**
+   * Single character correction. The characters are written as Unicode
+   * escapes so that the result does not depend on the encoding used to
+   * compile this file.
+   *
+   * @param c the character to check
+   * @return the comma-below variant if <code>c</code> is one of the four
+   *         cedilla characters, otherwise <code>c</code> unchanged
+   */
+  private static char diac(final char c) {
+    switch (c) {
+      case '\u015F': // small s with cedilla
+        return '\u0219'; // small s with comma below
+      case '\u0163': // small t with cedilla
+        return '\u021B'; // small t with comma below
+      case '\u0162': // capital T with cedilla
+        return '\u021A'; // capital T with comma below
+      case '\u015E': // capital S with cedilla
+        return '\u0218'; // capital S with comma below
+      default:
+        return c;
+    }
+  }
+
+  /**
+   * Romanian diacritics correction: replaces s/t with cedilla by s/t with
+   * comma below (including upper-case variants). <br/>
+   * Thread-safe: the method keeps no shared state. The former shared lookup
+   * table had no entry for U+FFFF, so input containing that character raised
+   * an ArrayIndexOutOfBoundsException.
+   *
+   * @param s the text to correct, may be <code>null</code>
+   * @return the corrected text, or <code>null</code> if <code>s</code> is <code>null</code>
+   */
+  public static String correctDiacritrics(String s) {
+    if (null == s) {
+      return null;
+    }
+    final char[] chars = s.toCharArray();
+    for (int i = 0; i < chars.length; i++) {
+      chars[i] = diac(chars[i]);
+    }
+    return new String(chars);
+  }
+
+} \ No newline at end of file
diff --git a/JLanguageTool/src/dev/de/danielnaber/languagetool/dev/wikipedia/BaseWikipediaDumpHandler.java b/JLanguageTool/src/dev/de/danielnaber/languagetool/dev/wikipedia/BaseWikipediaDumpHandler.java
new file mode 100644
index 0000000..589c0e2
--- /dev/null
+++ b/JLanguageTool/src/dev/de/danielnaber/languagetool/dev/wikipedia/BaseWikipediaDumpHandler.java
@@ -0,0 +1,155 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2010 Daniel Naber (http://www.danielnaber.de)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+package de.danielnaber.languagetool.dev.wikipedia;
+
+import java.io.IOException;
+import java.util.Date;
+import java.util.List;
+
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+import de.danielnaber.languagetool.JLanguageTool;
+import de.danielnaber.languagetool.Language;
+import de.danielnaber.languagetool.TextFilter;
+import de.danielnaber.languagetool.dev.tools.RomanianDiacriticsModifier;
+import de.danielnaber.languagetool.rules.RuleMatch;
+
+/**
+ * Read the Wikipedia XML dump, check texts with LanguageTool, and
+ * let result be handled in sub classes.
+ */
+abstract class BaseWikipediaDumpHandler extends DefaultHandler {
+
+  protected static final int CONTEXT_SIZE = 50;
+  protected static final String MARKER_START = "<err>";
+  protected static final String MARKER_END = "</err>";
+  protected static final String LANG_MARKER = "XX";
+  protected static final String URL_PREFIX = "http://" + LANG_MARKER + ".wikipedia.org/wiki/";
+
+  protected Date dumpDate;
+  protected String langCode;
+
+  private final JLanguageTool languageTool;
+  private final int maxArticles;
+  private final TextFilter textFilter;
+
+  private int ruleMatchCount = 0;
+  private int articleCount = 0;
+
+  /** True while the parser is inside a "title" or "text" element. */
+  private boolean inText = false;
+  /** Collects the character data of the current "title" or "text" element. */
+  private StringBuilder text = new StringBuilder();
+
+  private String title;
+
+  /**
+   * @param languageTool the checker used for every article
+   * @param maxArticles maximum number of articles to check, values &lt;= 0 mean no limit
+   * @param dumpDate date of the dump, made available to subclasses
+   * @param langCode language code, made available to subclasses
+   * @param lang language of the dump, selects the text filter
+   */
+  protected BaseWikipediaDumpHandler(JLanguageTool languageTool, int maxArticles, Date dumpDate,
+      String langCode, Language lang) {
+    this.languageTool = languageTool;
+    this.maxArticles = maxArticles;
+    this.dumpDate = dumpDate;
+    this.langCode = langCode;
+    this.textFilter = createTextFilter(lang);
+  }
+
+  /**
+   * Creates the filter that turns article markup into plain text. For
+   * Romanian the diacritics are corrected after filtering.
+   */
+  private static TextFilter createTextFilter(final Language lang) {
+    if (Language.ROMANIAN == lang) {
+      return new WikipediaTextFilter() {
+        @Override
+        public String filter(String arg0) {
+          final String tmp = super.filter(arg0);
+          // diacritics correction (comma-below instead of cedilla for s and t)
+          return RomanianDiacriticsModifier.correctDiacritrics(tmp);
+        }
+      };
+    }
+    return new WikipediaTextFilter();
+  }
+
+  //===========================================================
+  // SAX DocumentHandler methods
+  //===========================================================
+
+  @Override
+  public void startElement(String namespaceURI, String lName, String qName,
+      Attributes attrs) throws SAXException {
+    if (qName.equals("title") || qName.equals("text")) {
+      inText = true;
+    }
+  }
+
+  @Override
+  public void endElement(String namespaceURI, String sName, String qName) {
+    if (qName.equals("title")) {
+      title = text.toString();
+      text = new StringBuilder();
+    } else if (qName.equals("text")) {
+      final String textToCheck = textFilter.filter(text.toString());
+      if (!textToCheck.contains("#REDIRECT")) {
+        checkArticle(textToCheck);
+      }
+      text = new StringBuilder();
+    }
+    inText = false;
+  }
+
+  /**
+   * Checks one article and passes the matches to {@link #handleResult}. Once
+   * <code>maxArticles</code> articles have been checked, the next article
+   * makes the handler print a summary, call {@link #close()} and exit the JVM.
+   */
+  private void checkArticle(final String textToCheck) {
+    // test the limit before counting, so that the summary reports the number
+    // of articles that were really checked
+    if (maxArticles > 0 && articleCount >= maxArticles) {
+      System.out.printf("Maximum number of articles reached. Found %d matches in %d articles\n",
+          ruleMatchCount, articleCount);
+      // System.exit() does not run any finally blocks of the caller
+      close();
+      System.exit(0);
+    }
+    articleCount++;
+    final List<RuleMatch> ruleMatches;
+    try {
+      ruleMatches = languageTool.check(textToCheck);
+    } catch (IOException e) {
+      throw new RuntimeException(e);
+    }
+    System.out.println("Checking article " + articleCount + " (" +
+        textToCheck.length()/1024 + "KB, '" + title + "')" +
+        ", found " + ruleMatches.size() + " matches");
+    try {
+      handleResult(title, ruleMatches, textToCheck, languageTool.getLanguage());
+    } catch (Exception e) {
+      throw new RuntimeException(e);
+    }
+    ruleMatchCount += ruleMatches.size();
+  }
+
+  @Override
+  public void characters(char buf[], int offset, int len) {
+    if (inText) {
+      // append directly, no temporary String needed
+      text.append(buf, offset, len);
+    }
+  }
+
+  /**
+   * Called once per checked article with the matches found in it.
+   *
+   * @param title the article's title
+   * @param ruleMatches matches found in <code>text</code>
+   * @param text the filtered article text that was checked
+   * @param language the language of the checker
+   * @throws Exception any failure, it is rethrown wrapped in a RuntimeException
+   */
+  abstract protected void handleResult(String title, List<RuleMatch> ruleMatches,
+      String text, Language language) throws Exception;
+
+  /** Releases the resources held by the handler. */
+  abstract protected void close();
+
+}
diff --git a/JLanguageTool/src/dev/de/danielnaber/languagetool/dev/wikipedia/CheckWikipediaDump.java b/JLanguageTool/src/dev/de/danielnaber/languagetool/dev/wikipedia/CheckWikipediaDump.java
new file mode 100644
index 0000000..3eabdd8
--- /dev/null
+++ b/JLanguageTool/src/dev/de/danielnaber/languagetool/dev/wikipedia/CheckWikipediaDump.java
@@ -0,0 +1,143 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+
+/*
+ *
+ * Created on 21.12.2006
+ */
+package de.danielnaber.languagetool.dev.wikipedia;
+
+import java.io.File;
+import java.io.IOException;
+import java.text.ParseException;
+import java.text.SimpleDateFormat;
+import java.util.Date;
+
+import javax.xml.parsers.ParserConfigurationException;
+import javax.xml.parsers.SAXParser;
+import javax.xml.parsers.SAXParserFactory;
+
+import org.xml.sax.SAXException;
+
+import de.danielnaber.languagetool.JLanguageTool;
+import de.danielnaber.languagetool.Language;
+
+/**
+ * Command-line tool that checks texts from Wikipedia (download "pages-articles.xml.bz2" from
+ * http://download.wikimedia.org/backup-index.html, e.g.
+ * http://download.wikimedia.org/dewiki/latest/dewiki-latest-pages-articles.xml.bz2)
+ * and either stores the result in a database or prints it to stdout.
+ *
+ * @author Daniel Naber
+ */
+public class CheckWikipediaDump {
+
+ private CheckWikipediaDump() {
+ // no public constructor
+ }
+
+ public static void main(String[] args) throws IOException, SAXException, ParserConfigurationException {
+ final CheckWikipediaDump prg = new CheckWikipediaDump();
+ if (args.length < 3 || args.length > 4) {
+ System.err.println("Usage: CheckWikipediaDump <propertyFile> <language> <filename> [maxArticleCheck]");
+ System.err.println("\tpropertyFile a file to set database access properties. Use '-' to print results to stdout.");
+ System.err.println("\tlanguage languagecode like 'en' or 'de'");
+ System.err.println("\tfilename path to unpacked Wikipedia XML dump");
+ System.err.println("\tmaxArticleCheck optional: maximum number of articles to check");
+ System.exit(1);
+ }
+ int maxArticles = 0;
+ if (args.length == 4) {
+ maxArticles = Integer.parseInt(args[3]);
+ }
+ File propFile = null;
+ if (!"-".equals(args[0])) {
+ propFile = new File(args[0]);
+ if (!propFile.exists() || propFile.isDirectory()) {
+ throw new IOException("file not found or isn't a file: " + propFile.getAbsolutePath());
+ }
+ }
+ prg.run(propFile, args[1], args[2], maxArticles);
+ }
+
+ private void run(File propFile, String language, String textFilename, int maxArticles)
+ throws IOException, SAXException, ParserConfigurationException {
+ final File file = new File(textFilename);
+ if (!file.exists() || !file.isFile()) {
+ throw new IOException("File doesn't exist or isn't a file: " + textFilename);
+ }
+ final Language lang = Language.getLanguageForShortName(language);
+ if (lang == null) {
+ System.err.println("Language not supported: " + language);
+ System.exit(1);
+ }
+ final JLanguageTool languageTool = new JLanguageTool(lang);
+ languageTool.activateDefaultPatternRules();
+ // useful settings (avoid false alarms) because text extraction
+ // from Wikipedia isn't clean yet:
+ languageTool.disableRule("DE_CASE"); // too many false hits
+ languageTool.disableRule("UNPAIRED_BRACKETS");
+ languageTool.disableRule("UPPERCASE_SENTENCE_START");
+ languageTool.disableRule("WORD_REPEAT_RULE");
+ languageTool.disableRule("COMMA_PARENTHESIS_WHITESPACE");
+ languageTool.disableRule("WHITESPACE_RULE");
+ languageTool.disableRule("EN_QUOTES"); // en
+ languageTool.disableRule("CUDZYSLOW_DRUKARSKI"); // pl
+ languageTool.disableRule("POMIŠLJAJ_1"); // sl
+ languageTool.disableRule("POMIŠLJAJ_2"); // sl
+ languageTool.disableRule("POMIŠLJAJ_3"); // sl
+ /*
+ List rules = lt.getAllRules();
+ for (Iterator iter = rules.iterator(); iter.hasNext();) {
+ Rule element = (Rule) iter.next();
+ lt.disableRule(element.getId());
+ }
+ lt.enableRule("DE_AGREEMENT");
+ */
+ System.err.println("These rules are disabled: " + languageTool.getDisabledRules());
+ final Date dumpDate = getDumpDate(file);
+ System.out.println("Dump date: " + dumpDate + ", language: " + language);
+ final BaseWikipediaDumpHandler handler;
+ if (propFile != null) {
+ handler = new DatabaseDumpHandler(languageTool, maxArticles, dumpDate,
+ language, propFile, lang);
+ } else {
+ handler = new OutputDumpHandler(languageTool, maxArticles, dumpDate,
+ language, lang);
+ }
+ final SAXParserFactory factory = SAXParserFactory.newInstance();
+ final SAXParser saxParser = factory.newSAXParser();
+ saxParser.parse(file, handler);
+ }
+
+ private Date getDumpDate(File file) throws IOException {
+ final String filename = file.getName();
+ final String[] parts = filename.split("-");
+ if (parts.length < 3) {
+ throw new IOException("Unexpected filename format: " + file.getName());
+ }
+ final SimpleDateFormat sdf = new SimpleDateFormat("yyyyMMdd");
+ try {
+ return sdf.parse(parts[1]);
+ } catch (ParseException e) {
+ throw new IOException("Unexpected date format: " + parts[1], e);
+ }
+ }
+
+}
diff --git a/JLanguageTool/src/dev/de/danielnaber/languagetool/dev/wikipedia/DatabaseDumpHandler.java b/JLanguageTool/src/dev/de/danielnaber/languagetool/dev/wikipedia/DatabaseDumpHandler.java
new file mode 100644
index 0000000..a5ad6fb
--- /dev/null
+++ b/JLanguageTool/src/dev/de/danielnaber/languagetool/dev/wikipedia/DatabaseDumpHandler.java
@@ -0,0 +1,90 @@
+/*
+ * Created on 04.04.2010
+ */
+package de.danielnaber.languagetool.dev.wikipedia;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.sql.Connection;
+import java.sql.DriverManager;
+import java.sql.PreparedStatement;
+import java.sql.SQLException;
+import java.util.Date;
+import java.util.List;
+import java.util.Properties;
+
+import de.danielnaber.languagetool.JLanguageTool;
+import de.danielnaber.languagetool.Language;
+import de.danielnaber.languagetool.gui.Tools;
+import de.danielnaber.languagetool.rules.RuleMatch;
+
+/**
+ * Writes result of LanguageTool check to database. Used for community.languagetool.org.
+ *
+ * @author Daniel Naber
+ */
+class DatabaseDumpHandler extends BaseWikipediaDumpHandler {
+
+  private final Connection conn;
+
+  /**
+   * Opens the database connection described by the keys dbDriver, dbUrl,
+   * dbUser and dbPassword of the given properties file.
+   *
+   * @throws IOException if the properties file cannot be read
+   * @throws RuntimeException if a required key is missing, the driver class
+   *         cannot be loaded or the connection cannot be opened
+   */
+  DatabaseDumpHandler(JLanguageTool lt, int maxArticles, Date dumpDate, String langCode,
+      File propertiesFile, Language lang) throws IOException {
+    super(lt, maxArticles, dumpDate, langCode, lang);
+    final Properties dbProperties = loadProperties(propertiesFile);
+    final String dbDriver = getProperty(dbProperties, "dbDriver");
+    final String dbUrl = getProperty(dbProperties, "dbUrl");
+    final String dbUser = getProperty(dbProperties, "dbUser");
+    final String dbPassword = getProperty(dbProperties, "dbPassword");
+    try {
+      Class.forName(dbDriver);
+      conn = DriverManager.getConnection(dbUrl, dbUser, dbPassword);
+    } catch (ClassNotFoundException e) {
+      throw new RuntimeException(e);
+    } catch (SQLException e) {
+      throw new RuntimeException(e);
+    }
+  }
+
+  /** Loads the properties file and closes the underlying stream again. */
+  private static Properties loadProperties(final File propertiesFile) throws IOException {
+    final Properties properties = new Properties();
+    final FileInputStream in = new FileInputStream(propertiesFile);
+    try {
+      properties.load(in);
+    } finally {
+      in.close();
+    }
+    return properties;
+  }
+
+  @Override
+  protected void close() {
+    if (conn != null) {
+      try {
+        conn.close();
+      } catch (SQLException e) {
+        throw new RuntimeException(e);
+      }
+    }
+  }
+
+  /**
+   * @return the value of <code>key</code>
+   * @throws RuntimeException if the key is not set
+   */
+  private static String getProperty(Properties prop, String key) {
+    final String value = prop.getProperty(key);
+    if (value == null) {
+      throw new RuntimeException("required key '" +key+ "' not found in properties");
+    }
+    return value;
+  }
+
+  /**
+   * Inserts one row into the corpus_match table for every match. The
+   * statement is closed when the article is done, so statements no longer
+   * pile up on the connection.
+   */
+  @Override
+  protected void handleResult(String title, List<RuleMatch> ruleMatches,
+      String text, Language language) throws SQLException {
+    final String sql = "INSERT INTO corpus_match " +
+        "(version, language_code, ruleid, message, error_context, corpus_date, " +
+        "check_date, sourceuri, is_visible) "+
+        "VALUES (0, ?, ?, ?, ?, ?, ?, ?, 1)";
+    // the same for every match of this article; replace() substitutes the
+    // marker literally instead of treating it as a regular expression
+    final String sourceUri = URL_PREFIX.replace(LANG_MARKER, langCode) + title;
+    final java.sql.Date corpusDate = new java.sql.Date(dumpDate.getTime());
+    final java.sql.Date checkDate = new java.sql.Date(new Date().getTime());
+    final PreparedStatement prepSt = conn.prepareStatement(sql);
+    try {
+      for (RuleMatch match : ruleMatches) {
+        prepSt.setString(1, language.getShortName());
+        prepSt.setString(2, match.getRule().getId());
+        prepSt.setString(3, match.getMessage());
+        prepSt.setString(4, Tools.getContext(match.getFromPos(),
+            match.getToPos(), text, CONTEXT_SIZE, MARKER_START, MARKER_END));
+        prepSt.setDate(5, corpusDate);
+        prepSt.setDate(6, checkDate);
+        prepSt.setString(7, sourceUri);
+        prepSt.executeUpdate();
+      }
+    } finally {
+      prepSt.close();
+    }
+  }
+
+}
diff --git a/JLanguageTool/src/dev/de/danielnaber/languagetool/dev/wikipedia/OutputDumpHandler.java b/JLanguageTool/src/dev/de/danielnaber/languagetool/dev/wikipedia/OutputDumpHandler.java
new file mode 100644
index 0000000..3a880fe
--- /dev/null
+++ b/JLanguageTool/src/dev/de/danielnaber/languagetool/dev/wikipedia/OutputDumpHandler.java
@@ -0,0 +1,60 @@
+/*
+ * Created on 04.04.2010
+ */
+package de.danielnaber.languagetool.dev.wikipedia;
+
+import java.util.Date;
+import java.util.List;
+
+import de.danielnaber.languagetool.JLanguageTool;
+import de.danielnaber.languagetool.Language;
+import de.danielnaber.languagetool.rules.RuleMatch;
+import de.danielnaber.languagetool.rules.patterns.PatternRule;
+import de.danielnaber.languagetool.tools.StringTools;
+
+/**
+ * Writes result of LanguageTool check to stdout.
+ *
+ * @author Daniel Naber
+ */
+class OutputDumpHandler extends BaseWikipediaDumpHandler {
+
+  OutputDumpHandler(JLanguageTool lt, int maxArticles, Date dumpDate, String langCode,
+      Language lang) {
+    super(lt, maxArticles, dumpDate, langCode, lang);
+  }
+
+  /** Nothing to release: this handler only writes to stdout. */
+  @Override
+  protected void close() {
+  }
+
+  /**
+   * Prints the article title followed by a numbered entry for every match:
+   * position and rule id, message, suggestions (if any) and the error context.
+   * Articles without matches produce no output.
+   */
+  @Override
+  protected void handleResult(String title, List<RuleMatch> ruleMatches,
+      String text, Language language) {
+    if (ruleMatches.isEmpty()) {
+      return;
+    }
+    System.out.println("\nTitle: " + title);
+    int number = 0;
+    for (RuleMatch match : ruleMatches) {
+      number++;
+      final StringBuilder header = new StringBuilder();
+      header.append(number).append(".) Line ").append(match.getLine() + 1);
+      header.append(", column ").append(match.getColumn());
+      header.append(", Rule ID: ").append(match.getRule().getId());
+      if (match.getRule() instanceof PatternRule) {
+        final PatternRule patternRule = (PatternRule) match.getRule();
+        header.append("[").append(patternRule.getSubId()).append("]");
+      }
+      System.out.println(header.toString());
+      final String message = match.getMessage()
+          .replaceAll("<suggestion>", "'")
+          .replaceAll("</suggestion>", "'");
+      System.out.println("Message: " + message);
+      final List<String> suggestions = match.getSuggestedReplacements();
+      if (!suggestions.isEmpty()) {
+        System.out.println("Suggestion: " + StringTools.listToString(suggestions, "; "));
+      }
+      System.out.println(StringTools.getContext(match.getFromPos(), match.getToPos(),
+          text, CONTEXT_SIZE));
+    }
+  }
+
+}
diff --git a/JLanguageTool/src/dev/de/danielnaber/languagetool/dev/wikipedia/WikipediaTextFilter.java b/JLanguageTool/src/dev/de/danielnaber/languagetool/dev/wikipedia/WikipediaTextFilter.java
new file mode 100644
index 0000000..49646e2
--- /dev/null
+++ b/JLanguageTool/src/dev/de/danielnaber/languagetool/dev/wikipedia/WikipediaTextFilter.java
@@ -0,0 +1,52 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2010 Daniel Naber (http://www.danielnaber.de)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+package de.danielnaber.languagetool.dev.wikipedia;
+
+import info.bliki.wiki.model.WikiModel;
+
+import org.apache.commons.lang.StringEscapeUtils;
+
+import de.danielnaber.languagetool.TextFilter;
+
+/**
+ * Convert Wikipedia syntax to HTML using Bliki and then try to clean it up (this is
+ * rather ugly).
+ */
+class WikipediaTextFilter implements TextFilter {
+
+ public String filter(String s) {
+ // TODO: find general HTML to Text converter?!:
+ final WikiModel wikiModel = new WikiModel("${image}", "${title}");
+ s = wikiModel.render(s);
+ //System.out.println("0####"+s);
+ s = s.replaceAll("\\{\\{.*?\\}\\}", "");
+ s = s.replaceAll("</p>", "\n\n");
+ s = s.replaceAll("</dt>", "\n\n");
+ s = s.replaceAll("</dl>", "\n\n");
+ s = s.replaceAll("</h\\d>", "\n\n");
+ s = s.replaceAll("<a href=\"http://[a-zA-Z-]+\\.wikipedia\\.org/wiki/.*?\">.*?</a>", "");
+ s = s.replaceAll("<.*?>", "");
+ s = s.replaceAll("\n\n*", "\n\n"); // single line break isn't detected as paragraph in LT by default
+ s = StringEscapeUtils.unescapeHtml(s);
+ //System.out.println("1############################################\n"+s);
+ //System.out.println("/############################################"+s);
+ return s;
+ }
+
+}
diff --git a/JLanguageTool/src/dev/tools/add_short.xsl b/JLanguageTool/src/dev/tools/add_short.xsl
new file mode 100644
index 0000000..571e41a
--- /dev/null
+++ b/JLanguageTool/src/dev/tools/add_short.xsl
@@ -0,0 +1,59 @@
+<?xml version="1.0" ?>
+<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+ version="1.0">
+<!--
+ A simple stylesheet that adds "short" element with category name to grammar files
+ Copyright (C) 2008 Marcin Miłkowski
+
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+
+Note: remove the DOCTYPE declaration before the conversion and add it back afterwards. Otherwise, you'd get
+all default values in the grammar.xml!!!
+
+Usage:
+
+java -jar saxon8.jar grammar.xml add_short.xsl >new_grammar.xml
+
+Then rename new_grammar.xml to grammar.xml, after making a backup of grammar.xml
+-->
+
+ <xsl:output method="xml" encoding="utf-8" indent="no"/>
+
+ <!-- Identity template: copies every attribute and node that no other template matches. -->
+ <xsl:template match="@*|node()">
+ <xsl:copy>
+ <xsl:apply-templates select="@*|node()"/>
+ </xsl:copy>
+ </xsl:template>
+
+ <!-- Empty template: xml:space attributes are not copied to the output. -->
+ <xsl:template match="@xml:space"/>
+
+ <!-- Copies each "message" element and emits a "short" element right after it.
+      The content of "short" is the name attribute of the message's grandparent or,
+      when that grandparent is a "rulegroup", of the element one level further up
+      (according to the header comment this is the category name). -->
+ <xsl:template match="message">
+ <xsl:copy>
+ <xsl:apply-templates select="@*|node()"/>
+ </xsl:copy>
+ <xsl:text>
+ </xsl:text>
+ <xsl:element name="short">
+ <xsl:choose>
+ <xsl:when test="name(../..)='rulegroup'">
+ <xsl:value-of select="../../../@name"></xsl:value-of>
+ </xsl:when>
+ <xsl:otherwise><xsl:value-of select="../../@name"/></xsl:otherwise>
+ </xsl:choose>
+ </xsl:element>
+</xsl:template>
+
+</xsl:stylesheet> \ No newline at end of file
diff --git a/JLanguageTool/src/dev/tools/convert.xsl b/JLanguageTool/src/dev/tools/convert.xsl
new file mode 100644
index 0000000..3e70426
--- /dev/null
+++ b/JLanguageTool/src/dev/tools/convert.xsl
@@ -0,0 +1,50 @@
+<?xml version="1.0" ?>
+<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+ version="1.0">
+<!-- XSLT stylesheet to convert <em> elements in grammar.xml: <em> inside <message> becomes <suggestion>, <em> inside <example> becomes <marker>
+
+ Copyright (C) 2008 Marcin Miłkowski.
+
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+
+Note: it's obsolete and useless for current grammar.xml files.
+
+usage:
+
+java -jar saxon8.jar grammar.xml convert.xsl
+
+-->
+ <xsl:output method="xml" encoding="utf-8" indent="yes"/>
+
+ <!-- Identity transform: every node is copied through unchanged unless a more
+      specific template in this stylesheet matches it. Attributes are processed
+      first, then child nodes. -->
+ <xsl:template match="node()|@*">
+   <xsl:copy>
+     <xsl:apply-templates select="@*"/>
+     <xsl:apply-templates select="node()"/>
+   </xsl:copy>
+ </xsl:template>
+
+ <!-- Replaces an <em> child of <message> with a <suggestion> element.
+      The content is the string value of the whole <em> element (select=".").
+      Selecting "./text()" in a version 1.0 stylesheet would keep only the
+      first text node and silently drop any text that follows a nested element
+      or comment. Attributes of <em> are not carried over. -->
+ <xsl:template match="//message/em">
+   <suggestion>
+     <xsl:value-of select="."/>
+   </suggestion>
+ </xsl:template>
+
+ <!-- Replaces an <em> child of <example> with a <marker> element.
+      The content is the string value of the whole <em> element (select=".").
+      Selecting "./text()" in a version 1.0 stylesheet would keep only the
+      first text node and silently drop any text that follows a nested element
+      or comment. Attributes of <em> are not carried over. -->
+ <xsl:template match="//example/em">
+   <marker>
+     <xsl:value-of select="."/>
+   </marker>
+ </xsl:template>
+
+</xsl:stylesheet> \ No newline at end of file
diff --git a/JLanguageTool/src/dev/tools/print.xsl b/JLanguageTool/src/dev/tools/print.xsl
new file mode 100644
index 0000000..2e775d6
--- /dev/null
+++ b/JLanguageTool/src/dev/tools/print.xsl
@@ -0,0 +1,200 @@
+<?xml version="1.0"?>
+<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+ version="2.0">
+ <!-- XSLT stylesheet to pretty print grammar.xml
+
+Copyright (C) 2008 Marcin Miłkowski
+
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+
+ usage:
+
+ java -jar saxon8.jar grammar.xml print.xsl
+
+ This version doesn't work in Firefox, unfortunately...
+
+ -->
+ <xsl:output method="html" encoding="UTF-8" indent="no" />
+
+ <!-- Suppress all text nodes by default; only templates with a more specific
+      match pattern write text to the output. -->
+ <xsl:template match="text()" />
+
+ <!-- Fallback for elements without a dedicated template: produce no output for
+      the element itself and continue with its child elements, ordered by their
+      name attribute. -->
+ <xsl:template match="*">
+ <xsl:apply-templates select="*">
+ <xsl:sort select="@name"/>
+ </xsl:apply-templates>
+ </xsl:template>
+
+ <!-- Renders one category as two parts:
+      1. a div that starts hidden (display:none), identified by generate-id(),
+         containing a heading and an ordered list of the category's children
+         sorted by their name attribute;
+      2. a second heading after the div that is always visible.
+      Both headings show the category name as a link that calls toggleDiv()
+      with the div id, followed by the number of direct rule and rulegroup
+      children that have a non-empty id. -->
+ <xsl:template match="//category">
+   <xsl:variable name="cat_id" select="generate-id()"/>
+   <xsl:variable name="category_name" select="@name"/>
+   <div id="{$cat_id}" style="display:none">
+     <h4>
+       <a href="javascript:;" onmousedown="toggleDiv('{$cat_id}');">
+         <xsl:value-of select="$category_name"/>
+       </a>
+       (<xsl:value-of select="count(rule[@id!=''])+count(rulegroup[@id!=''])"/>)
+     </h4>
+     <ol>
+       <xsl:apply-templates select="*">
+         <xsl:sort select="@name"/>
+       </xsl:apply-templates>
+     </ol>
+   </div>
+   <h4>
+     <a href="javascript:;" onmousedown="toggleDiv('{$cat_id}');">
+       <xsl:value-of select="$category_name"/>
+     </a>
+     (<xsl:value-of select="count(rule[@id!=''])+count(rulegroup[@id!=''])"/>)
+   </h4>
+ </xsl:template>
+
+
+ <!-- Renders a rule that has a non-empty id as one list item: the rule name
+      followed by a nested unordered list built from the rule's child elements.
+      The nested list is placed inside the <li>, because a list element may
+      only contain <li> children; emitting <ul> next to the <li> would produce
+      invalid HTML. -->
+ <xsl:template match="//rule[@id!='']">
+   <li>
+     <xsl:value-of select="@name" />
+     <ul>
+       <xsl:apply-templates select="*" />
+     </ul>
+   </li>
+ </xsl:template>
+
+ <!-- Renders a rulegroup as one list item: the group name followed by a nested
+      unordered list built from the group's child elements.
+      The nested list is placed inside the <li>, because a list element may
+      only contain <li> children; emitting <ul> next to the <li> would produce
+      invalid HTML. -->
+ <xsl:template match="//rulegroup">
+   <li>
+     <xsl:value-of select="@name" />
+     <ul>
+       <xsl:apply-templates select="*" />
+     </ul>
+   </li>
+ </xsl:template>
+
+
+ <!-- Renders an example of type "incorrect" as one list item:
+      1. the example content (child elements and text nodes via their own templates),
+      2. the text of the sibling <short> element followed by a full stop, if it is non-empty,
+      3. if the example has a non-empty correction attribute, the sibling <message>
+         text with the correction(s) inserted in green bold type.
+      Step 3 relies on the text nodes of <message>; presumably these are the pieces of
+      text around its child elements (TODO confirm against the grammar format). -->
+ <xsl:template match="//rule/example[@type='incorrect']">
+ <li>
+ <xsl:apply-templates select="*|text()" /> <br/>
+ <xsl:if test="../short/text()!=''">
+ <xsl:value-of select="../short/text()"/>.
+ </xsl:if>
+ <xsl:if test="@correction !=''">
+ <!-- Simple case: a single correction (no '|'), no backslash in the first message
+      text node and fewer than three message text nodes. Output is
+      first text node, correction, second text node. -->
+ <xsl:choose>
+ <xsl:when test="not(contains(@correction, '|')) and not(contains(../message/text()[1], '\')) and count(../message/text()) &lt; 3">
+ <xsl:copy-of select="../message/text()[1]"/>
+ <strong style="color: #339900;"><xsl:value-of select="@correction"/></strong>
+ <xsl:copy-of select="../message/text()[2]"/>
+ </xsl:when>
+ <xsl:otherwise>
+<!--
+Remaining problem: replace \1 in message text with pattern/token[1]
+
+
+ <xsl:choose>
+ <xsl:when test="//rules[@lang='pl']">Poprawnie: </xsl:when>
+ <xsl:when test="//rules[@lang='en']">Correctly: </xsl:when>
+ <xsl:when test="//rules[@lang='de']">Korrekt: </xsl:when>
+ <xsl:when test="//rules[@lang='fr']">Correctement : </xsl:when>
+ <xsl:when test="//rules[@lang='nl']">Correct: </xsl:when>
+ <xsl:when test="//rules[@lang='es']">Correctamente: </xsl:when>
+ </xsl:choose>
+
+ <strong style="color: #339900;">
+ <xsl:value-of select="@correction"/>
+ </strong>
+ -->
+
+ <!-- General case: split the correction attribute on '|' and, for the i-th piece,
+      output the i-th message text node followed by that piece in green bold type.
+      After the last piece, also output the message text node at position last()+1.
+      tokenize() is an XSLT 2.0 function. -->
+ <xsl:variable name="message" select="../message/text()"/>
+ <xsl:for-each select="tokenize(@correction,'\|')">
+ <xsl:variable name="message_cnt" select="position()"/>
+ <xsl:value-of select="$message[$message_cnt]"/>
+ <strong style="color: #339900;">
+ <xsl:value-of select="."/>
+ </strong>
+ <xsl:if test="position()=last()">
+ <xsl:variable name="last" select="last()+1"/>
+ <xsl:value-of select="$message[$last]"/>
+ </xsl:if>
+ </xsl:for-each>
+ </xsl:otherwise>
+ </xsl:choose>
+ </xsl:if>
+ </li>
+ </xsl:template>
+
+ <!-- Text directly inside an incorrect example is written to the output as is
+      (this overrides the stylesheet's default suppression of text nodes). -->
+ <xsl:template match="//rule/example[@type='incorrect']/text()">
+   <xsl:value-of select="."/>
+ </xsl:template>
+
+ <!-- The marked part of an incorrect example is shown in red bold type. -->
+ <xsl:template match="//rule/example[@type='incorrect']/marker">
+   <strong style="color: rgb(255, 0, 0);">
+     <xsl:value-of select="text()"/>
+   </strong>
+ </xsl:template>
+
+ <!-- Root template: builds the HTML page.
+      head: content-type declaration and the toggleDiv() script used by the
+            category headings to show or hide a block by id.
+      body: three counters (all rule elements, messages that have a non-empty
+            suggestion child, rule and rulegroup elements with a non-empty id),
+            labelled in Polish when the rules element has lang="pl" and in
+            English otherwise, followed by the child elements sorted by name.
+      The meta element is placed inside head (it is not allowed as a direct
+      child of html), and the script element uses the type attribute instead
+      of the obsolete language attribute. -->
+ <xsl:template match="//rules">
+   <html>
+     <head>
+       <meta http-equiv="content-type" content="text/html; charset=UTF-8"/>
+       <script type="text/javascript">
+         <xsl:text>
+ function toggleDiv(divid){
+   if(document.getElementById(divid).style.display == 'none'){
+     document.getElementById(divid).style.display = 'block';
+   }else{
+     document.getElementById(divid).style.display = 'none';
+   }
+ }
+ </xsl:text>
+       </script>
+     </head>
+     <body>
+       <noscript><p><strong>Note:</strong> this page requires Javascript to work</p></noscript>
+       <xsl:choose>
+         <xsl:when test="//rules[@lang='pl']">Łączna liczba reguł: </xsl:when>
+         <xsl:otherwise>Total number of rules: </xsl:otherwise>
+       </xsl:choose>
+       <strong>
+         <xsl:value-of select="count(//rule)"/>
+       </strong>
+       <br/>
+       <xsl:choose>
+         <xsl:when test="//rules[@lang='pl']">W tym z podpowiedziami: </xsl:when>
+         <xsl:otherwise>Rules with suggestions: </xsl:otherwise>
+       </xsl:choose>
+       <strong>
+         <xsl:value-of select="count(//message[suggestion!=''])"/>
+       </strong>
+       <br/>
+       <xsl:choose>
+         <xsl:when test="//rules[@lang='pl']">Liczba widocznych typów reguł: </xsl:when>
+         <xsl:otherwise>Total number of visible rule types: </xsl:otherwise>
+       </xsl:choose>
+       <strong>
+         <xsl:value-of select="count(//rule[@id!=''])+count(//rulegroup[@id!=''])"/>
+       </strong>
+       <br/>
+
+       <xsl:apply-templates select="*">
+         <xsl:sort select="@name"/>
+       </xsl:apply-templates>
+     </body>
+   </html>
+ </xsl:template>
+
+</xsl:stylesheet> \ No newline at end of file
diff --git a/JLanguageTool/src/dev/tools/stats.awk b/JLanguageTool/src/dev/tools/stats.awk
new file mode 100644
index 0000000..aa1760e
--- /dev/null
+++ b/JLanguageTool/src/dev/tools/stats.awk
@@ -0,0 +1,62 @@
+#Script to sort rule matches from LanguageTool
+#Usage: gawk -f stats.awk <file_created_by_LanguageTool>
+#(c) 2008, Marcin Milkowski
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+# USA
+
+# A line starting with digits followed by ".)" opens a new match. Everything up
+# to and including "ID: " is removed so that $0 holds the rule id; the id is
+# remembered, its match counter is incremented and the per-match line counter
+# is reset.
+/^[0-9]+\.\)/ {
+    gsub(/^.*ID: /, "")
+    current_rule = $0
+    rule_cnt[current_rule]++
+    rulematch = 1
+    linecnt = 0
+}
+
+# Message and suggestion lines are appended to the text collected for the
+# current rule.
+/^(Message: |Suggestion:)/ {
+    comments[current_rule] = comments[current_rule] "\n" $0
+    linecnt++
+}
+
+# Once at least one message/suggestion line has been seen for the current
+# match, any other non-empty line that is neither a "Time:" line nor a line
+# containing " ^" is appended as well.
+linecnt > 0 && !/^($|Message: |Suggestion:|Time:)/ && !/ \^/ {
+    comments[current_rule] = comments[current_rule] "\n" $0
+}
+
+# A line that starts with a space and contains " ^" is appended followed by an
+# extra newline.
+/^ / && / \^/ {
+    comments[current_rule] = comments[current_rule] "\n" $0 "\n"
+}
+# Report: print every rule id with its number of matches and the collected
+# text, highest match count first. Rules with the same count appear in the
+# order produced by asorti() on the rule ids.
+END {
+    # The heading is only printed when at least one match was seen.
+    if (rulematch == 1) {
+        print "LanguageTool rule matches in descending order"
+        print "============================================="
+        print ""
+    }
+
+    total = asorti(rule_cnt, rule_names)   # rule ids, sorted
+    asort(rule_cnt, counts)                # match counts, ascending
+
+    # Walk the sorted counts from largest to smallest; for each count print all
+    # rules that have exactly this count and have not been printed yet.
+    for (rank = total; rank >= 1; rank--) {
+        for (k = 1; k <= total; k++) {
+            id = rule_names[k]
+            if (rule_cnt[id] != counts[rank] || printed[id] == "done")
+                continue
+            printed[id] = "done"
+            print "Rule ID: " id ", matches: " rule_cnt[id]
+            print comments[id]
+            print "============="
+        }
+    }
+} \ No newline at end of file