diff options
Diffstat (limited to 'JLanguageTool/src/dev')
19 files changed, 1962 insertions, 0 deletions
diff --git a/JLanguageTool/src/dev/de/danielnaber/languagetool/dev/CheckBNC.java b/JLanguageTool/src/dev/de/danielnaber/languagetool/dev/CheckBNC.java new file mode 100644 index 0000000..24931e6 --- /dev/null +++ b/JLanguageTool/src/dev/de/danielnaber/languagetool/dev/CheckBNC.java @@ -0,0 +1,105 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ + +package de.danielnaber.languagetool.dev; + +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.util.List; + +import de.danielnaber.languagetool.JLanguageTool; +import de.danielnaber.languagetool.Language; +import de.danielnaber.languagetool.TextFilter; +import de.danielnaber.languagetool.tokenizers.SentenceTokenizer; +import de.danielnaber.languagetool.tools.StringTools; +import de.danielnaber.languagetool.tools.Tools; + +/** + * Uses JLanguageTool recursively on the files of the BNC (British National Corpus). 
+ * + * @author Daniel Naber + */ +public final class CheckBNC { + + private JLanguageTool langTool = null; + private final TextFilter textFilter = new BNCTextFilter(); + + static final boolean CHECK_BY_SENTENCE = true; + + public static void main(String[] args) throws Exception { + if (args.length != 1) { + System.out.println("Usage: CheckBNC <directory>"); + System.exit(1); + } + final CheckBNC prg = new CheckBNC(); + prg.run(new File(args[0])); + } + + private CheckBNC() throws IOException { + langTool = new JLanguageTool(Language.ENGLISH); + langTool.activateDefaultPatternRules(); + final String[] disRules = new String[] {"UPPERCASE_SENTENCE_START", "COMMA_PARENTHESIS_WHITESPACE", + "WORD_REPEAT_RULE", "DOUBLE_PUNCTUATION"}; + System.err.println("Note: disabling the following rules:"); + for (String disRule : disRules) { + langTool.disableRule(disRule); + System.err.println(" " + disRule); + } + } + + private void run(final File file) throws IOException { + if (file.isDirectory()) { + final File[] files = file.listFiles(); + for (File file1 : files) { + run(new File(file, file1.getName())); + } + } else { + System.out.println("Checking " + file.getAbsolutePath()); + String text = StringTools.readFile(new FileInputStream(file.getAbsolutePath())); + text = textFilter.filter(text); + if (CHECK_BY_SENTENCE) { + final SentenceTokenizer st = new SentenceTokenizer(); + final List<String> sentences = st.tokenize(text); + for (String sentence : sentences) { + Tools.checkText(sentence, langTool, false, 1000); + } + } else { + Tools.checkText(text, langTool); + } + } + } + +} + +class BNCTextFilter implements TextFilter { + + public String filter(String text) { + text = text.replaceAll("(?s)<header.*?>.*?</header>", ""); + text = text.replaceAll("<w.*?>", ""); + text = text.replaceAll("<c.*?>", ""); + text = text.replaceAll("<.*?>", ""); + text = text.replaceAll(" +", " "); + text = text.replaceAll("&bquo|&equo", "\""); + text = text.replaceAll("—?", "--"); + text = 
text.replaceAll("&?", "&"); + return text; + } + +} diff --git a/JLanguageTool/src/dev/de/danielnaber/languagetool/dev/ContextFinder.java b/JLanguageTool/src/dev/de/danielnaber/languagetool/dev/ContextFinder.java new file mode 100644 index 0000000..0154b33 --- /dev/null +++ b/JLanguageTool/src/dev/de/danielnaber/languagetool/dev/ContextFinder.java @@ -0,0 +1,122 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2006 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.dev; + +import java.io.IOException; + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.Term; +import org.apache.lucene.index.TermEnum; +import org.apache.lucene.search.Hits; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.PhraseQuery; + +/** + * Compare the one-word right or left context of two words. This is useful + * to find potential rules for similar words, i.e. contexts that are typical + * for only one of the words. 
+ * + * @author Daniel Naber + */ +public class ContextFinder { + + private ContextFinder() {} + + public static void main(String[] args) throws IOException { + if (args.length != 4 || !args[3].startsWith("--context")) { + printUsageAndExit(); + } + final ContextFinder prg = new ContextFinder(); + if (args[3].endsWith("=right")) + prg.run(args[0], args[1], args[2], true); + else if (args[3].endsWith("=left")) + prg.run(args[0], args[1], args[2], false); + else + printUsageAndExit(); + } + + private static void printUsageAndExit() { + System.err.println("Usage: ContextFinder <indexDir> <term1> <term2> --context=right|left"); + System.exit(1); + } + + private void run(String indexDir, String term1, String term2, boolean rightContext) throws IOException { + final IndexReader reader = IndexReader.open(indexDir); + final IndexSearcher searcher = new IndexSearcher(reader); + final TermEnum termEnum = reader.terms(); + int termCount = 0; + System.out.println(term1 + ": " + reader.docFreq(new Term(Indexer.BODY_FIELD, term1)) + "x"); + System.out.println(term2 + ": " + reader.docFreq(new Term(Indexer.BODY_FIELD, term2)) + "x"); + while (termEnum.next()) { + final Term t = termEnum.term(); + if (isPOSTag(t)) + continue; + // first term: + final PhraseQuery pq1 = makeQuery(t, term1, rightContext); + final int hits1 = search(pq1, searcher); + // second term: + final PhraseQuery pq2 = makeQuery(t, term2, rightContext); + final int hits2 = search(pq2, searcher); + final float rel = (float)(hits1+1) / (float)(hits2+1); + if (rel > 1.0f) + System.out.println("#1: " + rel + ": " + myToString(pq1) + ": " + hits1 + " <-> " + myToString(pq2) + ": " + hits2); + else if (rel < 1.0f) + System.out.println("#2: " + rel + ": " + myToString(pq1) + ": " + hits1 + " <-> " + myToString(pq2) + ": " + hits2); + termCount++; + } + System.out.println("termCount = " + termCount); + searcher.close(); + reader.close(); + } + + private String myToString(PhraseQuery pq) { + return 
pq.toString().replaceAll("body:", ""); + } + + private PhraseQuery makeQuery(Term t, String term1, boolean rightContext) { + final PhraseQuery pq = new PhraseQuery(); + if (rightContext) { + pq.add(new Term(Indexer.BODY_FIELD, term1)); + pq.add(new Term(Indexer.BODY_FIELD, t.text())); + } else { + pq.add(new Term(Indexer.BODY_FIELD, t.text())); + pq.add(new Term(Indexer.BODY_FIELD, term1)); + } + return pq; + } + + private int search(PhraseQuery pq, IndexSearcher searcher) throws IOException { + //long time = System.currentTimeMillis(); + final Hits h = searcher.search(pq); + //long searchTime = System.currentTimeMillis()-time; + if (h.length() > 0) { + //System.err.println(h.length() + " " + pq); + //System.err.println(" " + searchTime + "ms"); + } + return h.length(); + } + + private boolean isPOSTag(Term t) { + if (t.text().equals(t.text().toUpperCase())) { // e.g. "VER:1:PLU:KJ2:NON:NEB" + return true; + } + return false; + } + +} diff --git a/JLanguageTool/src/dev/de/danielnaber/languagetool/dev/ExportGermanNouns.java b/JLanguageTool/src/dev/de/danielnaber/languagetool/dev/ExportGermanNouns.java new file mode 100644 index 0000000..ad7d231 --- /dev/null +++ b/JLanguageTool/src/dev/de/danielnaber/languagetool/dev/ExportGermanNouns.java @@ -0,0 +1,86 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ + +/* + * Created on 06.05.2007 + */ +package de.danielnaber.languagetool.dev; + +import java.io.*; +import java.nio.ByteBuffer; +import java.util.HashSet; +import java.util.Set; + +import de.danielnaber.languagetool.JLanguageTool; + +import morfologik.fsa.FSA; +
/**
 * Export German nouns as a serialized Java HashSet, to be used
 * by jWordSplitter.
 *
 * @author Daniel Naber
 */
public class ExportGermanNouns {

  private static final String DICT_FILENAME = "/de/german.dict";

  private ExportGermanNouns() {
  }

  /**
   * Collects the lower-cased word form of every dictionary entry that carries a
   * "+SUB:" tag and no ":ADJ" tag.
   *
   * @return the set of lower-cased nouns
   * @throws IOException if the dictionary cannot be loaded
   */
  private Set<String> getWords() throws IOException {
    final FSA fsa = FSA.getInstance(JLanguageTool.getDataBroker().getFromResourceDirAsStream(DICT_FILENAME));
    final Set<String> words = new HashSet<String>();
    for (ByteBuffer bb : fsa) {
      final byte[] sequence = new byte[bb.remaining()];
      bb.get(sequence);
      final String entry = new String(sequence, "iso-8859-1");
      if (entry.contains("+SUB:") && !entry.contains(":ADJ")) {
        // the word form is everything in front of the first '+'
        final String term = entry.split("\\+")[0].toLowerCase();
        // the set drops repeated forms by itself, no extra "last term" bookkeeping is needed
        words.add(term);
      }
    }
    return words;
  }

  /**
   * Writes the set to the given file using Java serialization.
   *
   * @param words      the words to store
   * @param outputFile target file, overwritten if it exists
   * @throws IOException if the file cannot be written
   */
  private void serialize(Set<String> words, File outputFile) throws IOException {
    final FileOutputStream fos = new FileOutputStream(outputFile);
    try {
      final ObjectOutputStream oos = new ObjectOutputStream(fos);
      oos.writeObject(words);
      oos.close();
    } finally {
      // releases the file handle even if serialization failed half way
      fos.close();
    }
  }

  /**
   * Command line entry point.
   *
   * @param args exactly one argument: the output file
   * @throws IOException if reading the dictionary or writing the output fails
   */
  public static void main(String[] args) throws IOException {
    if (args.length != 1) {
      System.out.println("Usage: ExportGermanNouns <outputFile>");
      System.exit(1);
    }
    final ExportGermanNouns prg = new ExportGermanNouns();
    final Set<String> words = prg.getWords();
    prg.serialize(words, new File(args[0]));
  }

}
diff --git a/JLanguageTool/src/dev/de/danielnaber/languagetool/dev/Indexer.java b/JLanguageTool/src/dev/de/danielnaber/languagetool/dev/Indexer.java new file mode 100644 index 0000000..fc3392b --- /dev/null +++ b/JLanguageTool/src/dev/de/danielnaber/languagetool/dev/Indexer.java @@ -0,0 +1,100 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2006 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.dev; + +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.index.IndexWriter; + +import de.danielnaber.languagetool.Language; +import de.danielnaber.languagetool.tools.StringTools; + +/** + * POS tag and index text files using Lucene. Required for ContextFinder.java. + * TODO: hard-coded to index a specific kind of XML. 
+ * + * @author Daniel Naber + */
public class Indexer {

  static final String BODY_FIELD = "body";

  /** Matches one &lt;p&gt; element; DOTALL lets a paragraph span several lines. */
  private static final Pattern PARAGRAPH = Pattern.compile("<p>(.*?)</p>", Pattern.DOTALL);

  private Indexer() {}

  /**
   * Command line entry point.
   *
   * @param args the data directory and the index directory
   * @throws IOException if reading the data or writing the index fails
   */
  public static void main(String[] args) throws IOException {
    if (args.length != 2) {
      System.err.println("Usage: Indexer <dataDir> <indexDir>");
      System.exit(1);
    }
    final Indexer prg = new Indexer();
    // FIXME: make this an option:
    final Language lang = Language.GERMAN;
    prg.run(args[0], args[1], lang);
  }

  /**
   * Creates a new index in <code>indexDir</code> from all files below <code>dataDir</code>.
   */
  private void run(String dataDir, String indexDir, Language lang) throws IOException {
    final IndexWriter iw = new IndexWriter(indexDir, new POSTagAnalyzer(lang.getTagger()), true);
    try {
      iw.setMaxBufferedDocs(100);
      index(iw, new File(dataDir), 1);
      System.out.println("Optimizing index...");
      iw.optimize();
    } finally {
      // close the writer even when indexing failed
      iw.close();
    }
    System.out.println("Done.");
  }

  /**
   * Indexes a file, or recursively all entries of a directory.
   *
   * @param iw    writer to add the documents to
   * @param dir   file or directory to index
   * @param count running number of this entry, used for the progress output
   * @return the last running number used in this subtree, so the caller can
   *         continue counting after a nested directory
   * @throws IOException if a file cannot be read or added
   */
  private int index(IndexWriter iw, File dir, int count) throws IOException {
    if (dir.isDirectory()) {
      final File[] files = dir.listFiles();
      if (files == null) {
        // listFiles() returns null on I/O errors or missing permissions
        System.err.println("Cannot list directory, skipping: " + dir.getAbsolutePath());
        return count;
      }
      for (File file : files) {
        count = index(iw, file, count + 1);
      }
      return count;
    }
    if (count % 50 == 0) {
      System.out.println("Indexing file #" + count);
    }
    final String xml;
    final FileInputStream in = new FileInputStream(dir);
    try {
      xml = StringTools.readFile(in, "iso-8859-1");
    } finally {
      // closing again is harmless should readFile() already have closed the stream
      in.close();
    }
    final Document doc = new Document();
    doc.add(new Field(BODY_FIELD, getParagraphs(xml), Field.Store.YES, Field.Index.TOKENIZED));
    iw.addDocument(doc);
    return count;
  }

  /**
   * Extracts the content of all &lt;p&gt; elements.
   *
   * @param xml the document text
   * @return the paragraph contents, each followed by a line break so the last
   *         word of one paragraph is not fused with the first word of the next
   */
  private String getParagraphs(String xml) {
    final StringBuilder sb = new StringBuilder();
    final Matcher matcher = PARAGRAPH.matcher(xml);
    while (matcher.find()) {
      sb.append(matcher.group(1));
      sb.append('\n');
    }
    return sb.toString();
  }

}
diff --git a/JLanguageTool/src/dev/de/danielnaber/languagetool/dev/POSTagAnalyzer.java 
b/JLanguageTool/src/dev/de/danielnaber/languagetool/dev/POSTagAnalyzer.java new file mode 100644 index 0000000..06d6cd4 --- /dev/null +++ b/JLanguageTool/src/dev/de/danielnaber/languagetool/dev/POSTagAnalyzer.java @@ -0,0 +1,49 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2006 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.dev; + +import java.io.Reader; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.standard.StandardTokenizer; + +import de.danielnaber.languagetool.tagging.Tagger; + +/** + * Analyzer that stores text and its POS analysis. 
+ * + * @author Daniel Naber + */ +class POSTagAnalyzer extends Analyzer { + + private Tagger tagger = null; + + public POSTagAnalyzer(Tagger tagger) { + this.tagger = tagger; + } + + public TokenStream tokenStream(@SuppressWarnings("unused")String fieldName, Reader reader) { + TokenStream result = new StandardTokenizer(reader); + //result = new LowerCaseFilter(result); + result = new POSTagFilter(result, tagger); + return result; + } + +} diff --git a/JLanguageTool/src/dev/de/danielnaber/languagetool/dev/POSTagFilter.java b/JLanguageTool/src/dev/de/danielnaber/languagetool/dev/POSTagFilter.java new file mode 100644 index 0000000..01fc600 --- /dev/null +++ b/JLanguageTool/src/dev/de/danielnaber/languagetool/dev/POSTagFilter.java @@ -0,0 +1,94 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2006 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.dev; + +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Set; +import java.util.Stack; + +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; + +import de.danielnaber.languagetool.AnalyzedToken; +import de.danielnaber.languagetool.AnalyzedTokenReadings; +import de.danielnaber.languagetool.tagging.Tagger; +import de.danielnaber.languagetool.tagging.de.AnalyzedGermanTokenReadings; +
/**
 * Filter that puts the words of a text, the base form, and word's POS tags at the
 * same index position.
 *
 * @author Daniel Naber
 */
class POSTagFilter extends TokenFilter {

  private static final String BASEFORM_PREFIX = "B_";
  private static final String TEXTFORM_PREFIX = "T_";

  /** Extra tokens (POS tags, base forms) still to be returned for the current word. */
  private final Stack<Token> stack = new Stack<Token>();
  private final Tagger tagger;

  /**
   * @param in     the token stream to decorate
   * @param tagger tagger used to look up POS tags and lemmas of each token
   */
  public POSTagFilter(TokenStream in, Tagger tagger) {
    super(in);
    this.tagger = tagger;
  }

  /**
   * Returns the next token. For every input word this yields the lower-cased
   * word prefixed with "T_", followed on later calls by one token per POS tag
   * and one "B_"-prefixed token per distinct lemma that differs from the word.
   * The additional tokens have a position increment of 0.
   *
   * @return the next token, or null at the end of the stream
   * @throws java.io.IOException if reading the input or tagging fails
   */
  public final org.apache.lucene.analysis.Token next() throws java.io.IOException {
    if (!stack.isEmpty()) {
      return stack.pop();
    }
    final Token t = input.next();
    if (t == null) {
      return null;
    }
    final String word = t.termText();
    final List<String> wordList = new ArrayList<String>();
    wordList.add(word);
    // One set per input token: it has to outlive the loop over the readings,
    // otherwise a lemma shared by several readings is indexed once per reading.
    final Set<String> indexLemmas = new HashSet<String>();
    final List<AnalyzedTokenReadings> taggedWords = tagger.tag(wordList);
    for (AnalyzedTokenReadings taggedWord : taggedWords) {
      // NOTE(review): this cast only works with the German tagger - confirm before using other languages
      final AnalyzedGermanTokenReadings readings = (AnalyzedGermanTokenReadings) taggedWord;
      final List<AnalyzedToken> analyzedTokens = readings.getReadings();
      for (AnalyzedToken at : analyzedTokens) {
        if (at.getPOSTag() != null) {
          final Token posToken = new Token(at.getPOSTag(), t.startOffset(), t.endOffset());
          posToken.setPositionIncrement(0);
          stack.push(posToken);
        }
        if (at.getLemma() != null) {
          final String lemma = at.getLemma().toLowerCase();
          // Set.add() returns false for a lemma that was already indexed for this word
          if (!lemma.equalsIgnoreCase(word) && indexLemmas.add(lemma)) {
            final Token lemmaToken = new Token(BASEFORM_PREFIX + lemma, t.startOffset(), t.endOffset());
            lemmaToken.setPositionIncrement(0);
            stack.push(lemmaToken);
          }
        }
      }
    }
    return new Token(TEXTFORM_PREFIX + word.toLowerCase(), t.startOffset(), t.endOffset());
  }

}
diff --git a/JLanguageTool/src/dev/de/danielnaber/languagetool/dev/POSTagLanguageModel.java b/JLanguageTool/src/dev/de/danielnaber/languagetool/dev/POSTagLanguageModel.java new file mode 100644 index 0000000..68439cc --- /dev/null +++ b/JLanguageTool/src/dev/de/danielnaber/languagetool/dev/POSTagLanguageModel.java @@ -0,0 +1,147 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2009 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ + +package de.danielnaber.languagetool.dev; + +import java.io.BufferedInputStream; +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; +import java.util.ArrayList; +import java.util.List; + +import de.danielnaber.languagetool.AnalyzedSentence; +import de.danielnaber.languagetool.AnalyzedTokenReadings; +import de.danielnaber.languagetool.JLanguageTool; +import de.danielnaber.languagetool.Language; +
/**
 * Tag text and display only POS tags to create an n-gram language model.
 *
 * @author Marcin Milkowski
 */
public class POSTagLanguageModel {

  /**
   * Reads text from standard input and prints its POS tags sentence by sentence.
   *
   * @param args exactly one argument: the short name of the language
   * @throws IOException if reading the input or tagging fails
   */
  public static void main(final String[] args) throws IOException {
    if (args.length == 1) {
      final Language language = getLanguageOrExit(args[0]);
      final JLanguageTool lt = new JLanguageTool(language, null);
      runOnStdIn(lt);
    } else {
      exitWithUsageMessage();
    }
  }

  /**
   * Looks up a language by its short name. Prints the supported languages and
   * terminates the program if the name is unknown.
   */
  private static Language getLanguageOrExit(final String lang) {
    for (final Language candidate : Language.LANGUAGES) {
      if (lang.equals(candidate.getShortName())) {
        return candidate;
      }
    }
    final List<String> supportedLanguages = new ArrayList<String>();
    for (final Language candidate : Language.LANGUAGES) {
      supportedLanguages.add(candidate.getShortName());
    }
    System.out.println("Unknown language '" + lang
        + "'. Supported languages are: " + supportedLanguages);
    exitWithUsageMessage();
    return null; // not reached, exitWithUsageMessage() terminates the JVM
  }

  /**
   * Prints the usage message and terminates with exit code 1, so callers never
   * continue with a missing language.
   */
  private static void exitWithUsageMessage() {
    System.out
        .println("Usage: java de.danielnaber.languagetool.dev.POSTagLanguageModel language");
    System.exit(1);
  }

  /**
   * Reads standard input line by line and tags it in blocks. A block ends after
   * every line if the language's sentence tokenizer treats single line breaks
   * as paragraph breaks, otherwise at an empty line or once the buffered text
   * reaches the size limit.
   */
  private static void runOnStdIn(final JLanguageTool lt) throws IOException {
    final int MAX_FILE_SIZE = 64000;
    final boolean lineIsParagraph =
        lt.getLanguage().getSentenceTokenizer().singleLineBreaksMarksPara();
    final BufferedReader br =
        new BufferedReader(new InputStreamReader(new BufferedInputStream(System.in)));
    try {
      final StringBuilder sb = new StringBuilder();
      String line;
      while ((line = br.readLine()) != null) {
        sb.append(line);
        sb.append('\n');
        if (lineIsParagraph || "".equals(line) || sb.length() >= MAX_FILE_SIZE) {
          tagText(sb.toString(), lt);
          sb.setLength(0);
        }
      }
      // tag whatever is left after the last block boundary
      if (sb.length() > 0) {
        tagText(sb.toString(), lt);
      }
    } finally {
      // closes the wrapped readers and streams as well, also after a failure
      br.close();
    }
  }

  /** Splits the text into sentences and prints one line of POS tags per sentence. */
  private static void tagText(final String contents, final JLanguageTool lt)
      throws IOException {
    final List<String> sentences = lt.sentenceTokenize(contents);
    for (final String sentence : sentences) {
      final AnalyzedSentence analyzedText = lt.getAnalyzedSentence(sentence);
      System.out.println(getSentence(analyzedText));
    }
  }

  /**
   * Formats a sentence as "&lt;S&gt;", the POS tags of every non-whitespace
   * token each followed by a space, and "&lt;/S&gt;".
   */
  private static String getSentence(final AnalyzedSentence sent) {
    final StringBuilder sb = new StringBuilder();
    sb.append("<S>");
    for (final AnalyzedTokenReadings atr : sent.getTokensWithoutWhitespace()) {
      sb.append(getPOS(atr));
      sb.append(' ');
    }
    sb.append("</S>");
    return sb.toString();
  }

  /**
   * Joins the POS tags of all readings of a token with '+'.
   *
   * @return the joined tags, or an empty string for a whitespace token
   */
  private static String getPOS(final AnalyzedTokenReadings atr) {
    if (atr.isWhitespace()) {
      return "";
    }
    final StringBuilder sb = new StringBuilder();
    final int readNum = atr.getReadingsLength();
    for (int i = 0; i < readNum; i++) {
      if (i > 0) {
        sb.append('+');
      }
      sb.append(atr.getAnalyzedToken(i).getPOSTag());
    }
    return sb.toString();
  }

}
diff --git a/JLanguageTool/src/dev/de/danielnaber/languagetool/dev/PrintLocales.java b/JLanguageTool/src/dev/de/danielnaber/languagetool/dev/PrintLocales.java new file mode 100644 index 0000000..a946e07 --- /dev/null +++ b/JLanguageTool/src/dev/de/danielnaber/languagetool/dev/PrintLocales.java @@ -0,0 +1,94 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2006 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.dev; + +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.IOException; +import java.util.Properties; + +import de.danielnaber.languagetool.Language; +import de.danielnaber.languagetool.tools.StringTools; + +/** + * Used for creating ooolocales.properties file that defines a property that is + * needed to build Linguistic.xcu. Run internally by the ant build. 
+ * + * @author Marcin Miłkowski + */ +public final class PrintLocales { + + final static String FILENAME = "ooolocales.properties"; + + public static void main(final String[] args) throws IOException { + final PrintLocales prg = new PrintLocales(); + prg.run(); + } + + private void run() throws IOException { + String locales = ""; + for (final Language element : Language.LANGUAGES) { + if (!element.equals(Language.DEMO)) { + String var; + for (final String variant : element.getCountryVariants()) { + + if (StringTools.isEmpty(variant)) { + var = ""; + } else { + var = "-" + variant; + } + + if (!StringTools.isEmpty(locales)) { + locales = locales + " " + element.getShortName() + var; + } else { + locales = element.getShortName() + var; + } + } + } + } + // change attribute to writable as the property file is in the repo + final Properties checkPropLoc = new Properties(); + FileInputStream fIn = null; + try { + fIn = new FileInputStream(FILENAME); + checkPropLoc.load(fIn); + } finally { + if (fIn != null) + fIn.close(); + } + final String oldLocales = checkPropLoc.getProperty("countryvariants"); + if (!locales.equals(oldLocales)) { + final Properties propLoc = new Properties(); + propLoc.setProperty("countryvariants", locales); + FileOutputStream fOut = null; + try { + fOut = new FileOutputStream(FILENAME); + propLoc.store(fOut, "Locales"); + } finally { + if (fOut != null) { + fOut.close(); + } else { + System.err.println("Cannot save new locales!"); + System.exit(1); + } + } + } + } + +} diff --git a/JLanguageTool/src/dev/de/danielnaber/languagetool/dev/RuleOverview.java b/JLanguageTool/src/dev/de/danielnaber/languagetool/dev/RuleOverview.java new file mode 100644 index 0000000..c29b074 --- /dev/null +++ b/JLanguageTool/src/dev/de/danielnaber/languagetool/dev/RuleOverview.java @@ -0,0 +1,195 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2006 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can 
redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.dev; + +import java.io.File; +import java.io.FileFilter; +import java.io.IOException; +import java.text.SimpleDateFormat; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Date; +import java.util.List; + +import de.danielnaber.languagetool.JLanguageTool; +import de.danielnaber.languagetool.Language; +import de.danielnaber.languagetool.language.Contributor; +import de.danielnaber.languagetool.tools.StringTools; +import de.danielnaber.languagetool.tools.Tools; + +/** + * Command line tool to list supported languages and their number of rules. 
+ * + * @author Daniel Naber + */ +public final class RuleOverview { + + public static void main(final String[] args) throws IOException { + final RuleOverview prg = new RuleOverview(); + prg.run(); + } + + private RuleOverview() { + // no constructor + } + + private void run() throws IOException { + System.out.println("<b>Rules in LanguageTool " + JLanguageTool.VERSION + "</b><br />"); + System.out.println("Date: " + new SimpleDateFormat("yyyy-MM-dd").format(new Date()) + "<br /><br />\n"); + System.out.println("<table>"); + System.out.println("<tr>"); + System.out.println(" <th></th>"); + System.out.println(" <th align=\"right\">XML rules</th>"); + System.out.println(" <th> </th>"); + System.out.println(" <th align=\"right\">Java rules</th>"); + System.out.println(" <th> </th>"); + System.out.println(" <th align=\"right\">" + + "<a href=\"http://languagetool.cvs.sourceforge.net/*checkout*/languagetool/" + + "JLanguageTool/src/rules/false-friends.xml\">False friends</a></th>"); + System.out.println(" <th> </th>"); + System.out.println(" <th align=\"left\">Rule Maintainers</th>"); + System.out.println("</tr>"); + final List<String> sortedLanguages = new ArrayList<String>(); + for (Language element : Language.LANGUAGES) { + if (element == Language.DEMO) { + continue; + } + sortedLanguages.add(element.getName()); + } + Collections.sort(sortedLanguages); + + //setup false friends counting + final String falseFriendFile = JLanguageTool.getDataBroker().getRulesDir() + File.separator + "false-friends.xml"; + final java.net.URL falseFriendUrl = this.getClass().getResource(falseFriendFile); + final String falseFriendRules = StringTools.readFile(Tools.getStream(falseFriendFile)) + .replaceAll("(?s)<!--.*?-->", "") + .replaceAll("(?s)<rules.*?>", ""); + + for (final String langName : sortedLanguages) { + final Language lang = Language.getLanguageForName(langName); + System.out.print("<tr>"); + System.out.print("<td>" + lang.getName() + "</td>"); + final String xmlFile = 
JLanguageTool.getDataBroker().getRulesDir() + File.separator + lang.getShortName() + File.separator + "grammar.xml"; + final java.net.URL url = this.getClass().getResource(xmlFile); + if (url == null) { + System.out.println("<td align=\"right\">0</td>"); + } else { + // count XML rules: + String xmlRules = StringTools.readFile(Tools.getStream(xmlFile)); + xmlRules = xmlRules.replaceAll("(?s)<!--.*?-->", ""); + xmlRules = xmlRules.replaceAll("(?s)<rules.*?>", ""); + int pos = 0; + int count = 0; + while (true) { + pos = xmlRules.indexOf("<rule ", pos + 1); + if (pos == -1) { + break; + } + count++; + } + pos = 0; + int countInRuleGroup = 0; + while (true) { + pos = xmlRules.indexOf("<rule>", pos + 1); + if (pos == -1) { + break; + } + countInRuleGroup++; + } + System.out.print("<td align=\"right\">" + (count + countInRuleGroup) + " (" + + "<a href=\"http://languagetool.cvs.sourceforge.net/*checkout*/languagetool/" + + "JLanguageTool/src/rules/" + lang.getShortName() + "/grammar.xml\">show</a>/" + + "<a href=\"http://community.languagetool.org/rule/list?lang=" + + lang.getShortName() + "\">browse</a>" + + ")</td>"); + } + System.out.print("<td></td>"); + + // count Java rules: + final File dir = new File("src/java/de/danielnaber/languagetool" + + JLanguageTool.getDataBroker().getRulesDir() + "/" + lang.getShortName()); + if (!dir.exists()) { + System.out.print("<td align=\"right\">0</td>"); + } else { + final File[] javaRules = dir.listFiles(new JavaFilter()); + final int javaCount = javaRules.length-1; // minus 1: one is always "<Language>Rule.java" + System.out.print("<td align=\"right\">" + javaCount + "</td>"); + } + + // false friends + System.out.println("<td></td>"); + if (falseFriendUrl == null) { + System.out.println("<td align=\"right\">0</td>"); + } else { + // count XML rules: + int pos = 0; + int count = 0; + while (true) { + pos = falseFriendRules.indexOf("<pattern lang=\""+ lang.getShortName(), pos + 1); + if (pos == -1) { + break; + } + count++; + } + 
System.out.print("<td align=\"right\">" + count + "</td>"); + + // maintainer information: + System.out.print("<td></td>"); + final StringBuilder maintainerInfo = new StringBuilder(); + if (lang.getMaintainers() != null) { + for (Contributor contributor : lang.getMaintainers()) { + if (!StringTools.isEmpty(maintainerInfo. toString())) { + maintainerInfo.append(", "); + } + if (contributor.getUrl() != null) { + maintainerInfo.append("<a href=\""); + maintainerInfo.append(contributor.getUrl()); + maintainerInfo.append("\">"); + } + maintainerInfo.append(contributor.getName()); + if (contributor.getUrl() != null) { + maintainerInfo.append("</a>"); + } + if (contributor.getRemark() != null) { + maintainerInfo.append(" (" + contributor.getRemark() + ")"); + } + } + } + System.out.print("<td align=\"left\">" + maintainerInfo.toString() + + "</td>"); + } + + System.out.println("</tr>"); + } + + System.out.println("</table>"); + } + +} + +class JavaFilter implements FileFilter { + + public boolean accept(final File f) { + if (f.getName().endsWith(".java")) { + return true; + } + return false; + } + +} diff --git a/JLanguageTool/src/dev/de/danielnaber/languagetool/dev/tools/RomanianDiacriticsModifier.java b/JLanguageTool/src/dev/de/danielnaber/languagetool/dev/tools/RomanianDiacriticsModifier.java new file mode 100644 index 0000000..6fc90bf --- /dev/null +++ b/JLanguageTool/src/dev/de/danielnaber/languagetool/dev/tools/RomanianDiacriticsModifier.java @@ -0,0 +1,99 @@ +package de.danielnaber.languagetool.dev.tools; + +/** + * + * Helper class for romanian diacritics correction. Many romanian texts + * (including Romanian wikipedia) contains wrong diacritics: <b>ş</b> instead of + * <b>ș</b> and <b>ţ</b> instead of <b>ț</b>. 
/**
 * Helper class for Romanian diacritics correction. Many Romanian texts
 * (including the Romanian Wikipedia) contain wrong diacritics: s and t with
 * cedilla (U+015F, U+0163) instead of s and t with comma below (U+0219,
 * U+021B).
 *
 * @author Ionu&#539; P&#259;duraru
 * @since 14.04.2009 12:27:24
 */
public final class RomanianDiacriticsModifier {

  private RomanianDiacriticsModifier() {
    // utility class, not meant to be instantiated
  }

  /**
   * Single character correction. The characters are written as Unicode
   * escapes so that the result does not depend on the encoding used to
   * compile this file.
   *
   * @param c the character to correct
   * @return the comma-below variant if <code>c</code> is one of the four
   *         cedilla characters, otherwise <code>c</code> itself
   */
  private static char diac(final char c) {
    switch (c) {
      case '\u015F': // small s with cedilla
        return '\u0219';
      case '\u0163': // small t with cedilla
        return '\u021B';
      case '\u0162': // capital T with cedilla
        return '\u021A';
      case '\u015E': // capital S with cedilla
        return '\u0218';
      default:
        return c;
    }
  }

  /**
   * Romanian diacritics correction: replaces s and t with cedilla by s and t
   * with comma below (including the upper-case variants). All other
   * characters, up to and including U+FFFF, are left untouched.
   * <br/>
   * Thread-safe: the method works on a private copy of the characters and
   * uses no shared mutable state.
   *
   * @param s the text to correct, may be <code>null</code>
   * @return the corrected text, or <code>null</code> if <code>s</code> is
   *         <code>null</code>
   */
  public static String correctDiacritrics(String s) {
    if (null == s) {
      return null;
    }
    final char[] chars = s.toCharArray();
    for (int i = 0; i < chars.length; i++) {
      chars[i] = diac(chars[i]);
    }
    return new String(chars);
  }

}
\ No newline at end of file diff --git a/JLanguageTool/src/dev/de/danielnaber/languagetool/dev/wikipedia/BaseWikipediaDumpHandler.java b/JLanguageTool/src/dev/de/danielnaber/languagetool/dev/wikipedia/BaseWikipediaDumpHandler.java new file mode 100644 index 0000000..589c0e2 --- /dev/null +++ b/JLanguageTool/src/dev/de/danielnaber/languagetool/dev/wikipedia/BaseWikipediaDumpHandler.java @@ -0,0 +1,155 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2010 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.dev.wikipedia; + +import java.io.IOException; +import java.util.Date; +import java.util.List; + +import org.xml.sax.Attributes; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.DefaultHandler; + +import de.danielnaber.languagetool.JLanguageTool; +import de.danielnaber.languagetool.Language; +import de.danielnaber.languagetool.TextFilter; +import de.danielnaber.languagetool.dev.tools.RomanianDiacriticsModifier; +import de.danielnaber.languagetool.rules.RuleMatch; + +/** + * Read the Wikipedia XML dump, check texts with LanguageTool, and + * let result be handled in sub classes. 
+ */ +abstract class BaseWikipediaDumpHandler extends DefaultHandler { + + protected static final int CONTEXT_SIZE = 50; + protected static final String MARKER_START = "<err>"; + protected static final String MARKER_END = "</err>"; + protected static final String LANG_MARKER = "XX"; + protected static final String URL_PREFIX = "http://" + LANG_MARKER + ".wikipedia.org/wiki/"; + + protected Date dumpDate; + protected String langCode; + + private final JLanguageTool languageTool; + private int ruleMatchCount = 0; + private int articleCount = 0; + private int maxArticles = 0; + + private boolean inText = false; + private StringBuilder text = new StringBuilder(); + + private TextFilter textFilter = new WikipediaTextFilter(); + + private String title; + private final Language lang; + + //=========================================================== + // SAX DocumentHandler methods + //=========================================================== + + protected BaseWikipediaDumpHandler(JLanguageTool languageTool, int maxArticles, Date dumpDate, + String langCode, Language lang) { + this.lang = lang; + this.languageTool = languageTool; + this.maxArticles = maxArticles; + this.dumpDate = dumpDate; + this.langCode = langCode; + initTextFilter(); + } + + /** + * initialize textFilter field + */ + private void initTextFilter() { + if (Language.ROMANIAN == lang) { + textFilter = new WikipediaTextFilter() { + @Override + public String filter(String arg0) { + final String tmp = super.filter(arg0); + // diacritics correction (comma-bellow instead of sedilla for ș and ț) + return RomanianDiacriticsModifier.correctDiacritrics(tmp); + } + }; + } else { + textFilter = new WikipediaTextFilter(); + } + } + + @SuppressWarnings("unused") + public void startElement(String namespaceURI, String lName, String qName, + Attributes attrs) throws SAXException { + if (qName.equals("title")) { + inText = true; + } else if (qName.equals("text")) { + inText = true; + } + } + + @SuppressWarnings("unused") 
+ public void endElement(String namespaceURI, String sName, String qName) { + if (qName.equals("title")) { + title = text.toString(); + text = new StringBuilder(); + } else if (qName.equals("text")) { + //System.err.println(text.length() + " " + text.substring(0, Math.min(50, text.length()))); + final String textToCheck = textFilter.filter(text.toString()); + //System.out.println(textToCheck); + if (!textToCheck.contains("#REDIRECT")) { + //System.err.println("#########################"); + //System.err.println(textToCheck); + try { + articleCount++; + if (maxArticles > 0 && articleCount > maxArticles) { + System.out.printf("Maximum number of articles reached. Found %d matches in %d articles\n", + ruleMatchCount, articleCount); + System.exit(0); + } + final List<RuleMatch> ruleMatches = languageTool.check(textToCheck); + System.out.println("Checking article " + articleCount + " (" + + textToCheck.length()/1024 + "KB, '" + title + "')" + + ", found " + ruleMatches.size() + " matches"); + try { + handleResult(title, ruleMatches, textToCheck, languageTool.getLanguage()); + } catch (Exception e) { + throw new RuntimeException(e); + } + ruleMatchCount += ruleMatches.size(); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + text = new StringBuilder(); + } + inText = false; + } + + public void characters(char buf[], int offset, int len) { + final String s = new String(buf, offset, len); + if (inText) { + text.append(s); + } + } + + abstract protected void handleResult(String title, List<RuleMatch> ruleMatches, + String text, Language language) throws Exception; + + abstract protected void close(); + +} diff --git a/JLanguageTool/src/dev/de/danielnaber/languagetool/dev/wikipedia/CheckWikipediaDump.java b/JLanguageTool/src/dev/de/danielnaber/languagetool/dev/wikipedia/CheckWikipediaDump.java new file mode 100644 index 0000000..3eabdd8 --- /dev/null +++ b/JLanguageTool/src/dev/de/danielnaber/languagetool/dev/wikipedia/CheckWikipediaDump.java @@ -0,0 
+1,143 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ + +/* + * + * Created on 21.12.2006 + */ +package de.danielnaber.languagetool.dev.wikipedia; + +import java.io.File; +import java.io.IOException; +import java.text.ParseException; +import java.text.SimpleDateFormat; +import java.util.Date; + +import javax.xml.parsers.ParserConfigurationException; +import javax.xml.parsers.SAXParser; +import javax.xml.parsers.SAXParserFactory; + +import org.xml.sax.SAXException; + +import de.danielnaber.languagetool.JLanguageTool; +import de.danielnaber.languagetool.Language; + +/** + * Command-line tool that checks texts from Wikipedia (download "pages-articles.xml.bz2" from + * http://download.wikimedia.org/backup-index.html, e.g. + * http://download.wikimedia.org/dewiki/latest/dewiki-latest-pages-articles.xml.bz2) + * and stores the result in a database. 
+ * + * @author Daniel Naber + */ +public class CheckWikipediaDump { + + private CheckWikipediaDump() { + // no public constructor + } + + public static void main(String[] args) throws IOException, SAXException, ParserConfigurationException { + final CheckWikipediaDump prg = new CheckWikipediaDump(); + if (args.length < 3 || args.length > 4) { + System.err.println("Usage: CheckWikipediaDump <propertyFile> <language> <filename> [maxArticleCheck]"); + System.err.println("\tpropertyFile a file to set database access properties. Use '-' to print results to stdout."); + System.err.println("\tlanguage languagecode like 'en' or 'de'"); + System.err.println("\tfilename path to unpacked Wikipedia XML dump"); + System.err.println("\tmaxArticleCheck optional: maximum number of articles to check"); + System.exit(1); + } + int maxArticles = 0; + if (args.length == 4) { + maxArticles = Integer.parseInt(args[3]); + } + File propFile = null; + if (!"-".equals(args[0])) { + propFile = new File(args[0]); + if (!propFile.exists() || propFile.isDirectory()) { + throw new IOException("file not found or isn't a file: " + propFile.getAbsolutePath()); + } + } + prg.run(propFile, args[1], args[2], maxArticles); + } + + private void run(File propFile, String language, String textFilename, int maxArticles) + throws IOException, SAXException, ParserConfigurationException { + final File file = new File(textFilename); + if (!file.exists() || !file.isFile()) { + throw new IOException("File doesn't exist or isn't a file: " + textFilename); + } + final Language lang = Language.getLanguageForShortName(language); + if (lang == null) { + System.err.println("Language not supported: " + language); + System.exit(1); + } + final JLanguageTool languageTool = new JLanguageTool(lang); + languageTool.activateDefaultPatternRules(); + // useful settings (avoid false alarms) because text extraction + // from Wikipedia isn't clean yet: + languageTool.disableRule("DE_CASE"); // too many false hits + 
languageTool.disableRule("UNPAIRED_BRACKETS"); + languageTool.disableRule("UPPERCASE_SENTENCE_START"); + languageTool.disableRule("WORD_REPEAT_RULE"); + languageTool.disableRule("COMMA_PARENTHESIS_WHITESPACE"); + languageTool.disableRule("WHITESPACE_RULE"); + languageTool.disableRule("EN_QUOTES"); // en + languageTool.disableRule("CUDZYSLOW_DRUKARSKI"); // pl + languageTool.disableRule("POMIŠLJAJ_1"); // sl + languageTool.disableRule("POMIŠLJAJ_2"); // sl + languageTool.disableRule("POMIŠLJAJ_3"); // sl + /* + List rules = lt.getAllRules(); + for (Iterator iter = rules.iterator(); iter.hasNext();) { + Rule element = (Rule) iter.next(); + lt.disableRule(element.getId()); + } + lt.enableRule("DE_AGREEMENT"); + */ + System.err.println("These rules are disabled: " + languageTool.getDisabledRules()); + final Date dumpDate = getDumpDate(file); + System.out.println("Dump date: " + dumpDate + ", language: " + language); + final BaseWikipediaDumpHandler handler; + if (propFile != null) { + handler = new DatabaseDumpHandler(languageTool, maxArticles, dumpDate, + language, propFile, lang); + } else { + handler = new OutputDumpHandler(languageTool, maxArticles, dumpDate, + language, lang); + } + final SAXParserFactory factory = SAXParserFactory.newInstance(); + final SAXParser saxParser = factory.newSAXParser(); + saxParser.parse(file, handler); + } + + private Date getDumpDate(File file) throws IOException { + final String filename = file.getName(); + final String[] parts = filename.split("-"); + if (parts.length < 3) { + throw new IOException("Unexpected filename format: " + file.getName()); + } + final SimpleDateFormat sdf = new SimpleDateFormat("yyyyMMdd"); + try { + return sdf.parse(parts[1]); + } catch (ParseException e) { + throw new IOException("Unexpected date format: " + parts[1], e); + } + } + +} diff --git a/JLanguageTool/src/dev/de/danielnaber/languagetool/dev/wikipedia/DatabaseDumpHandler.java 
b/JLanguageTool/src/dev/de/danielnaber/languagetool/dev/wikipedia/DatabaseDumpHandler.java new file mode 100644 index 0000000..a5ad6fb --- /dev/null +++ b/JLanguageTool/src/dev/de/danielnaber/languagetool/dev/wikipedia/DatabaseDumpHandler.java @@ -0,0 +1,90 @@ +/* + * Created on 04.04.2010 + */ +package de.danielnaber.languagetool.dev.wikipedia; + +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.sql.Connection; +import java.sql.DriverManager; +import java.sql.PreparedStatement; +import java.sql.SQLException; +import java.util.Date; +import java.util.List; +import java.util.Properties; + +import de.danielnaber.languagetool.JLanguageTool; +import de.danielnaber.languagetool.Language; +import de.danielnaber.languagetool.gui.Tools; +import de.danielnaber.languagetool.rules.RuleMatch; + +/** + * Writes result of LanguageTool check to database. Used for community.languagetool.org. + * + * @author Daniel Naber + */ +class DatabaseDumpHandler extends BaseWikipediaDumpHandler { + + private final Connection conn; + + DatabaseDumpHandler(JLanguageTool lt, int maxArticles, Date dumpDate, String langCode, + File propertiesFile, Language lang) throws IOException { + super(lt, maxArticles, dumpDate, langCode, lang); + try { + final Properties dbProperties = new Properties(); + dbProperties.load(new FileInputStream(propertiesFile)); + final String dbDriver = getProperty(dbProperties, "dbDriver"); + final String dbUrl = getProperty(dbProperties, "dbUrl"); + final String dbUser = getProperty(dbProperties, "dbUser"); + final String dbPassword = getProperty(dbProperties, "dbPassword"); + Class.forName(dbDriver); + conn = DriverManager.getConnection(dbUrl, dbUser, dbPassword); + } catch (ClassNotFoundException e) { + throw new RuntimeException(e); + } catch (SQLException e) { + throw new RuntimeException(e); + } + } + + @Override + protected void close() { + if (conn != null) { + try { + conn.close(); + } catch (SQLException e) { + throw 
new RuntimeException(e); + } + } + } + + private String getProperty(Properties prop, String key) { + final String value = prop.getProperty(key); + if (value == null) { + throw new RuntimeException("required key '" +key+ "' not found in properties"); + } + return value; + } + + @Override + protected void handleResult(String title, List<RuleMatch> ruleMatches, + String text, Language language) throws SQLException { + final String sql = "INSERT INTO corpus_match " + + "(version, language_code, ruleid, message, error_context, corpus_date, " + + "check_date, sourceuri, is_visible) "+ + "VALUES (0, ?, ?, ?, ?, ?, ?, ?, 1)"; + final PreparedStatement prepSt = conn.prepareStatement(sql); + for (RuleMatch match : ruleMatches) { + prepSt.setString(1, language.getShortName()); + prepSt.setString(2, match.getRule().getId()); + prepSt.setString(3, match.getMessage()); + prepSt.setString(4, Tools.getContext(match.getFromPos(), + match.getToPos(), text, CONTEXT_SIZE, MARKER_START, MARKER_END)); + prepSt.setDate(5, new java.sql.Date(dumpDate.getTime())); + prepSt.setDate(6, new java.sql.Date(new Date().getTime())); + prepSt.setString(7, URL_PREFIX.replaceAll(LANG_MARKER, langCode) + title); + prepSt.executeUpdate(); + } + } + +} diff --git a/JLanguageTool/src/dev/de/danielnaber/languagetool/dev/wikipedia/OutputDumpHandler.java b/JLanguageTool/src/dev/de/danielnaber/languagetool/dev/wikipedia/OutputDumpHandler.java new file mode 100644 index 0000000..3a880fe --- /dev/null +++ b/JLanguageTool/src/dev/de/danielnaber/languagetool/dev/wikipedia/OutputDumpHandler.java @@ -0,0 +1,60 @@ +/* + * Created on 04.04.2010 + */ +package de.danielnaber.languagetool.dev.wikipedia; + +import java.util.Date; +import java.util.List; + +import de.danielnaber.languagetool.JLanguageTool; +import de.danielnaber.languagetool.Language; +import de.danielnaber.languagetool.rules.RuleMatch; +import de.danielnaber.languagetool.rules.patterns.PatternRule; +import de.danielnaber.languagetool.tools.StringTools; + 
+/** + * Writes result of LanguageTool check to stdout. + * + * @author Daniel Naber + */ +class OutputDumpHandler extends BaseWikipediaDumpHandler { + + OutputDumpHandler(JLanguageTool lt, int maxArticles, Date dumpDate, String langCode, + Language lang) { + super(lt, maxArticles, dumpDate, langCode, lang); + } + + @Override + protected void close() { + } + + @Override + protected void handleResult(String title, List<RuleMatch> ruleMatches, + String text, Language language) { + if (ruleMatches.size() > 0) { + int i = 1; + System.out.println("\nTitle: " + title); + for (RuleMatch match : ruleMatches) { + String output = i + ".) Line " + (match.getLine() + 1) + ", column " + + match.getColumn() + ", Rule ID: " + match.getRule().getId(); + if (match.getRule() instanceof PatternRule) { + final PatternRule pRule = (PatternRule) match.getRule(); + output += "[" + pRule.getSubId() + "]"; + } + System.out.println(output); + String msg = match.getMessage(); + msg = msg.replaceAll("<suggestion>", "'"); + msg = msg.replaceAll("</suggestion>", "'"); + System.out.println("Message: " + msg); + final List<String> replacements = match.getSuggestedReplacements(); + if (!replacements.isEmpty()) { + System.out.println("Suggestion: " + StringTools.listToString(replacements, "; ")); + } + System.out.println(StringTools.getContext(match.getFromPos(), match + .getToPos(), text, CONTEXT_SIZE)); + i++; + } + } + } + +} diff --git a/JLanguageTool/src/dev/de/danielnaber/languagetool/dev/wikipedia/WikipediaTextFilter.java b/JLanguageTool/src/dev/de/danielnaber/languagetool/dev/wikipedia/WikipediaTextFilter.java new file mode 100644 index 0000000..49646e2 --- /dev/null +++ b/JLanguageTool/src/dev/de/danielnaber/languagetool/dev/wikipedia/WikipediaTextFilter.java @@ -0,0 +1,52 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2010 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of 
the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.dev.wikipedia; + +import info.bliki.wiki.model.WikiModel; + +import org.apache.commons.lang.StringEscapeUtils; + +import de.danielnaber.languagetool.TextFilter; + +/** + * Convert Wikipedia syntax to HTML using Bliki and then try to clean it up (this is + * rather ugly). + */ +class WikipediaTextFilter implements TextFilter { + + public String filter(String s) { + // TODO: find general HTML to Text converter?!: + final WikiModel wikiModel = new WikiModel("${image}", "${title}"); + s = wikiModel.render(s); + //System.out.println("0####"+s); + s = s.replaceAll("\\{\\{.*?\\}\\}", ""); + s = s.replaceAll("</p>", "\n\n"); + s = s.replaceAll("</dt>", "\n\n"); + s = s.replaceAll("</dl>", "\n\n"); + s = s.replaceAll("</h\\d>", "\n\n"); + s = s.replaceAll("<a href=\"http://[a-zA-Z-]+\\.wikipedia\\.org/wiki/.*?\">.*?</a>", ""); + s = s.replaceAll("<.*?>", ""); + s = s.replaceAll("\n\n*", "\n\n"); // single line break isn't detected as paragraph in LT by default + s = StringEscapeUtils.unescapeHtml(s); + //System.out.println("1############################################\n"+s); + //System.out.println("/############################################"+s); + return s; + } + +} diff --git a/JLanguageTool/src/dev/tools/add_short.xsl b/JLanguageTool/src/dev/tools/add_short.xsl new file mode 100644 index 
<?xml version="1.0" ?>
<xsl:stylesheet version="1.0"
                xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
<!--
  A simple stylesheet that adds "short" element with category name to grammar files
  Copyright (C) 2008 Marcin Miłkowski

  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
  * License along with this library; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
  * USA

Note: remove the DOCTYPE declaration before the conversion and add it again
afterwards. Otherwise, you'd get all default values in the grammar.xml!!!

Usage:

java -jar saxon8.jar grammar.xml add_short.xsl >new_grammar.xml

Then rename new_grammar.xml to grammar.xml, after making a backup of grammar.xml
-->

  <xsl:output method="xml" encoding="utf-8" indent="no"/>

  <!-- Identity transformation: copy everything that has no more specific template. -->
  <xsl:template match="@*|node()">
    <xsl:copy>
      <xsl:apply-templates select="@*|node()"/>
    </xsl:copy>
  </xsl:template>

  <!-- Drop xml:space attributes. -->
  <xsl:template match="@xml:space"/>

  <!-- Copy the message and append a "short" element holding the name of the
       element two levels up, or three levels up when the message's rule
       lives inside a rulegroup. -->
  <xsl:template match="message">
    <xsl:copy>
      <xsl:apply-templates select="@*|node()"/>
    </xsl:copy>
    <!-- NOTE(review): line break plus indentation before "short"; the exact
         amount of indentation of the original is not recoverable here -->
    <xsl:text>
    </xsl:text>
    <short>
      <xsl:choose>
        <xsl:when test="name(../..)='rulegroup'">
          <xsl:value-of select="../../../@name"/>
        </xsl:when>
        <xsl:otherwise>
          <xsl:value-of select="../../@name"/>
        </xsl:otherwise>
      </xsl:choose>
    </short>
  </xsl:template>

</xsl:stylesheet>
<?xml version="1.0" ?>
<xsl:stylesheet version="1.0"
                xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
<!-- XSLT stylesheet to convert grammar.xml <em> elements

  Copyright (C) 2008 Marcin Miłkowski.

  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
  * License along with this library; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
  * USA

Note: it's obsolete and useless for current grammar.xml files.

usage:

java -jar saxon8.jar grammar.xml convert.xsl

-->
  <xsl:output method="xml" encoding="utf-8" indent="yes"/>

  <!-- Identity transformation for everything without a more specific template. -->
  <xsl:template match="@*|node()">
    <xsl:copy>
      <xsl:apply-templates select="@*|node()"/>
    </xsl:copy>
  </xsl:template>

  <!-- An "em" inside a message becomes a "suggestion". -->
  <xsl:template match="message/em">
    <suggestion>
      <xsl:value-of select="text()"/>
    </suggestion>
  </xsl:template>

  <!-- An "em" inside an example becomes a "marker". -->
  <xsl:template match="example/em">
    <marker>
      <xsl:value-of select="text()"/>
    </marker>
  </xsl:template>

</xsl:stylesheet>
<?xml version="1.0"?>
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
  version="2.0">
  <!-- XSLT stylesheet to pretty print grammar.xml

Copyright (C) 2008 Marcin Miłkowski

  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
  * License along with this library; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
  * USA

  usage:

  java -jar saxon8.jar grammar.xml print.xsl

  This version doesn't work in Firefox, unfortunately...
  (It calls tokenize() and declares version 2.0.)

  -->
  <xsl:output method="html" encoding="UTF-8" indent="no" />

  <!-- Text is only printed where a template asks for it explicitly. -->
  <xsl:template match="text()" />

  <!-- Default: descend into the child elements, sorted by name. -->
  <xsl:template match="*">
    <xsl:apply-templates select="*">
      <xsl:sort select="@name"/>
    </xsl:apply-templates>
  </xsl:template>

  <!-- A category: an initially hidden div with headline and rule list,
       followed by a headline that stays visible. Both headlines toggle
       the div. -->
  <xsl:template match="//category">
    <xsl:variable name="category_name" select="@name"/>
    <xsl:variable name="cat_id" select="generate-id()"/>
    <xsl:element name="div">
      <xsl:attribute name="id"><xsl:copy-of select="$cat_id"/></xsl:attribute>
      <xsl:attribute name="style">display:none</xsl:attribute>
      <h4>
        <xsl:element name="a">
          <xsl:attribute name="href">javascript:;</xsl:attribute>
          <xsl:attribute name="onmousedown">toggleDiv('<xsl:copy-of select="$cat_id"/>');</xsl:attribute>
          <xsl:value-of select="$category_name"/>
        </xsl:element>
        (<xsl:value-of select="count(rule[@id!=''])+count(rulegroup[@id!=''])"/>)
      </h4>
      <ol>
        <xsl:apply-templates select="*">
          <xsl:sort select="@name"/>
        </xsl:apply-templates>
      </ol>
    </xsl:element>
    <h4>
      <xsl:element name="a">
        <xsl:attribute name="href">javascript:;</xsl:attribute>
        <xsl:attribute name="onmousedown">toggleDiv('<xsl:copy-of select="$cat_id"/>');</xsl:attribute>
        <xsl:value-of select="$category_name"/>
      </xsl:element>
      (<xsl:value-of select="count(rule[@id!=''])+count(rulegroup[@id!=''])"/>)
    </h4>
  </xsl:template>


  <!-- A rule with an id: its name, then its examples. -->
  <xsl:template match="//rule[@id!='']">
    <li>
      <xsl:value-of select="@name" />
    </li>
    <ul>
      <xsl:apply-templates select="*" />
    </ul>
  </xsl:template>

  <!-- A rule group: its name, then the examples of its rules. -->
  <xsl:template match="//rulegroup">
    <li>
      <xsl:value-of select="@name" />
    </li>
    <ul>
      <xsl:apply-templates select="*" />
    </ul>
  </xsl:template>


  <!-- An incorrect example: the sentence, the short description and the
       message with the corrections from the "correction" attribute put in
       between the message's text nodes. -->
  <xsl:template match="//rule/example[@type='incorrect']">
    <li>
      <xsl:apply-templates select="*|text()" /> <br/>
      <xsl:if test="../short/text()!=''">
        <xsl:value-of select="../short/text()"/>.
      </xsl:if>
      <xsl:if test="@correction !=''">
        <xsl:choose>
          <!-- simple case: one correction, no back reference in the message -->
          <xsl:when test="not(contains(@correction, '|')) and not(contains(../message/text()[1], '\')) and count(../message/text()) &lt; 3">
            <xsl:copy-of select="../message/text()[1]"/>
            <strong style="color: #339900;"><xsl:value-of select="@correction"/></strong>
            <xsl:copy-of select="../message/text()[2]"/>
          </xsl:when>
          <xsl:otherwise>
<!--
Remaining problem: replace \1 in message text with pattern/token[1]


        <xsl:choose>
          <xsl:when test="//rules[@lang='pl']">Poprawnie: </xsl:when>
          <xsl:when test="//rules[@lang='en']">Correctly: </xsl:when>
          <xsl:when test="//rules[@lang='de']">Korrekt: </xsl:when>
          <xsl:when test="//rules[@lang='fr']">Correctement : </xsl:when>
          <xsl:when test="//rules[@lang='nl']">Correct: </xsl:when>
          <xsl:when test="//rules[@lang='es']">Correctamente: </xsl:when>
        </xsl:choose>

        <strong style="color: #339900;">
          <xsl:value-of select="@correction"/>
        </strong>
 -->

            <!-- several corrections: the n-th correction follows the n-th
                 text node of the message, the text node after the last
                 correction closes the sentence -->
            <xsl:variable name="message" select="../message/text()"/>
            <xsl:for-each select="tokenize(@correction,'\|')">
              <xsl:variable name="message_cnt" select="position()"/>
              <xsl:value-of select="$message[$message_cnt]"/>
              <strong style="color: #339900;">
                <xsl:value-of select="."/>
              </strong>
              <xsl:if test="position()=last()">
                <xsl:variable name="last" select="last()+1"/>
                <xsl:value-of select="$message[$last]"/>
              </xsl:if>
            </xsl:for-each>
          </xsl:otherwise>
        </xsl:choose>
      </xsl:if>
    </li>
  </xsl:template>

  <!-- Text of an incorrect example is printed as is. -->
  <xsl:template match="//rule/example[@type='incorrect']/text()">
    <xsl:copy-of select="." />
  </xsl:template>

  <!-- The marked error of an incorrect example is printed in red. -->
  <xsl:template match="//rule/example[@type='incorrect']/marker">
    <strong style="color: rgb(255, 0, 0);">
      <xsl:value-of select="./text()" />
    </strong>
  </xsl:template>

  <!-- Document root: page skeleton, statistics and the categories. -->
  <xsl:template match="//rules">
    <html>
      <head>
        <!-- meta information belongs inside head -->
        <meta http-equiv="content-type" content="text/html; charset=UTF-8"/>
        <script language="javascript">
          <xsl:text>
          function toggleDiv(divid){
            if(document.getElementById(divid).style.display == 'none'){
              document.getElementById(divid).style.display = 'block';
            }else{
              document.getElementById(divid).style.display = 'none';
            }
          }
          </xsl:text>
        </script>
      </head>
      <body>
        <noscript><p><strong>Note:</strong> this page requires Javascript to work</p></noscript>
        <xsl:choose>
          <xsl:when test="//rules[@lang='pl']">Łączna liczba reguł: </xsl:when>
          <xsl:otherwise>Total number of rules: </xsl:otherwise>
        </xsl:choose>
        <strong>
          <xsl:value-of select="count(//rule)"/>
        </strong>
        <br/>
        <xsl:choose>
          <xsl:when test="//rules[@lang='pl']">W tym z podpowiedziami: </xsl:when>
          <xsl:otherwise>Rules with suggestions: </xsl:otherwise>
        </xsl:choose>
        <strong>
          <xsl:value-of select="count(//message[suggestion!=''])"/>
        </strong>
        <br/>
        <xsl:choose>
          <xsl:when test="//rules[@lang='pl']">Liczba widocznych typów reguł: </xsl:when>
          <xsl:otherwise>Total number of visible rule types: </xsl:otherwise>
        </xsl:choose>
        <strong>
          <xsl:value-of select="count(//rule[@id!=''])+count(//rulegroup[@id!=''])"/>
        </strong>
        <br/>

        <xsl:apply-templates select="*">
          <xsl:sort select="@name"/>
        </xsl:apply-templates>
      </body>
    </html>
  </xsl:template>

</xsl:stylesheet>
# stats.awk (JLanguageTool/src/dev/tools)
# Script to sort rule matches from LanguageTool
# Usage: gawk -f stats.awk <file_created_by_LanguageTool>
# (c) 2008, Marcin Milkowski
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
# USA
#
# Requires gawk: asort() and asorti() are gawk extensions.

# Appends "\n" followed by text to what has been collected for the rule
# whose match is currently being read.
function remember(text) {
    comments[current_rule] = comments[current_rule] "\n" text
}

# First line of a match (starts with "<number>.)"): everything up to and
# including the last "ID: " is cut off, which leaves the rule ID in $0.
# Note that the patterns below are then tested against this shortened $0.
/^[0-9]+\.\)/ {
    sub(/^.*ID: /, "")
    current_rule = $0
    rule_cnt[current_rule]++
    rulematch = 1
    linecnt = 0
}

# Message and suggestion lines are always kept and counted.
/^(Message: |Suggestion:)/ {
    remember($0)
    linecnt++
}

# Any other non-empty line that is neither a "Time:" line nor a " ^" marker
# line is kept only after at least one message/suggestion line of the
# current match has been seen.
linecnt > 0 && !/^($|Message: |Suggestion:|Time:)/ && !/ \^/ {
    remember($0)
}

# The indented " ^" marker line is kept together with a trailing blank line.
/^ / && / \^/ {
    remember($0 "\n")
}

END {
    # The title is printed only if at least one match was read.
    if (rulematch == 1) {
        print "LanguageTool rule matches in descending order"
        print "============================================="
        print ""
    }

    total = asorti(rule_cnt, rule_names)   # rule IDs, sorted
    asort(rule_cnt, counts)                # match counts, ascending

    # Walk the counts from highest to lowest. For every distinct count all
    # rules having it are printed in rule_names order; equal counts are
    # adjacent in the sorted array, so a repeated count is skipped.
    for (i = total; i >= 1; i--) {
        if (i < total && counts[i] == counts[i + 1])
            continue
        for (j = 1; j <= total; j++) {
            name = rule_names[j]
            if (rule_cnt[name] == counts[i]) {
                print "Rule ID: " name ", matches: " rule_cnt[name]
                print comments[name]
                print "============="
            }
        }
    }
}
\ No newline at end of file |