diff options
author | Arno Teigseth <arno@teigseth.no> | 2011-02-05 08:48:27 +0000 |
---|---|---|
committer | Arno Teigseth <arno@teigseth.no> | 2011-02-05 08:48:27 +0000 |
commit | 4f3d565a5e5ede6eb6fd1f276d4e8ad37b67b5ce (patch) | |
tree | 7af736540eca93034428a975bd850e709fbbe2e5 /JLanguageTool/src/java/de/danielnaber/languagetool/Main.java | |
parent | ecaee85ab5984ebadd56721c295dc26b3335f7ce (diff) | |
download | grammar-norwegian-4f3d565a5e5ede6eb6fd1f276d4e8ad37b67b5ce.tar.gz grammar-norwegian-4f3d565a5e5ede6eb6fd1f276d4e8ad37b67b5ce.tar.bz2 grammar-norwegian-4f3d565a5e5ede6eb6fd1f276d4e8ad37b67b5ce.tar.xz |
Diffstat (limited to 'JLanguageTool/src/java/de/danielnaber/languagetool/Main.java')
-rw-r--r-- | JLanguageTool/src/java/de/danielnaber/languagetool/Main.java | 567 |
1 files changed, 567 insertions, 0 deletions
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/Main.java b/JLanguageTool/src/java/de/danielnaber/languagetool/Main.java new file mode 100644 index 0000000..f2f2cc6 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/Main.java @@ -0,0 +1,567 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool; + +import java.io.BufferedInputStream; +import java.io.BufferedReader; +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStreamReader; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashSet; +import java.util.List; +import java.util.Locale; +import java.util.Set; + +import javax.xml.parsers.ParserConfigurationException; + +import org.xml.sax.SAXException; + +import de.danielnaber.languagetool.bitext.StringPair; +import de.danielnaber.languagetool.bitext.TabBitextReader; +import de.danielnaber.languagetool.rules.Rule; +import de.danielnaber.languagetool.rules.bitext.BitextRule; +import de.danielnaber.languagetool.tools.StringTools; +import de.danielnaber.languagetool.tools.Tools; + +/** + * The command line tool to check plain text files. + * + * @author Daniel Naber + */ +class Main { + + private JLanguageTool lt; + private final boolean verbose; + private final boolean apiFormat; + private final boolean taggerOnly; + private final boolean applySuggestions; + private boolean profileRules; + private boolean bitextMode; + private JLanguageTool srcLt; + List<BitextRule> bRules; + private Rule currentRule; + + /* maximum file size to read in a single read */ + private static final int MAX_FILE_SIZE = 64000; + + Main(final boolean verbose, final boolean taggerOnly, + final Language language, final Language motherTongue, + final String[] disabledRules, final String[] enabledRules, + final boolean apiFormat, boolean applySuggestions) throws IOException, + SAXException, ParserConfigurationException { + this.verbose = verbose; + this.apiFormat = apiFormat; + this.taggerOnly = taggerOnly; + this.applySuggestions = applySuggestions; + profileRules = false; + bitextMode = false; + srcLt = null; + bRules = null; + lt = new JLanguageTool(language, motherTongue); + lt.activateDefaultPatternRules(); + lt.activateDefaultFalseFriendRules(); + selectRules(lt, disabledRules, enabledRules); + } + + private void selectRules(final JLanguageTool lt, final String[] disabledRules, final String[] enabledRules) { + // disable rules that are disabled explicitly: + for (final String disabledRule : disabledRules) { + lt.disableRule(disabledRule); + } + // disable all rules except those enabled explicitly, if any: + if (enabledRules.length > 0) { + final Set<String> enabledRuleIDs = new HashSet<String>(Arrays + .asList(enabledRules)); + for (String ruleName : enabledRuleIDs) { + lt.enableDefaultOffRule(ruleName); + lt.enableRule(ruleName); + } + for (Rule rule : lt.getAllRules()) { + if (!enabledRuleIDs.contains(rule.getId())) { + lt.disableRule(rule.getId()); + } + } + } + } + + private void setListUnknownWords(final boolean listUnknownWords) { + lt.setListUnknownWords(listUnknownWords); + } + + private void setProfilingMode() { + profileRules = true; + } + + private final void setBitextMode(final Language sourceLang, + final String[] disabledRules, final String[] enabledRules) throws IOException, ParserConfigurationException, SAXException { + bitextMode = true; + Language target = lt.getLanguage(); + lt = new JLanguageTool(target, null); + srcLt = new JLanguageTool(sourceLang); + lt.activateDefaultPatternRules(); + selectRules(lt, disabledRules, enabledRules); + selectRules(srcLt, disabledRules, enabledRules); + bRules = Tools.getBitextRules(sourceLang, lt.getLanguage()); + + List<BitextRule> bRuleList = new ArrayList<BitextRule>(bRules); + for (final BitextRule br : bRules) { + for (final String disabledRule : disabledRules) { + if (br.getId().equals(disabledRule)) { + bRuleList.remove(br); + } + } + } + bRules = bRuleList; + if (enabledRules.length > 0) { + bRuleList = new ArrayList<BitextRule>(); + for (final String enabledRule : enabledRules) { + for (final BitextRule br : bRules) { + if (br.getId().equals(enabledRule)) { + bRuleList.add(br); + } + } + } + bRules = bRuleList; + } + } + + JLanguageTool getJLanguageTool() { + return lt; + } + + private void runOnFile(final String filename, final String encoding, + final boolean listUnknownWords) throws IOException { + boolean oneTime = false; + if (!"-".equals(filename)) { + final File file = new File(filename); + // run once on file if the file size < MAXFILESIZE or + // when we use the bitext mode (we use a bitext reader + // instead of a direct file access) + oneTime = file.length() < MAX_FILE_SIZE || bitextMode; + } + if (oneTime) { + if (bitextMode) { + //TODO: add parameter to set different readers + TabBitextReader reader = new TabBitextReader(filename, encoding); + if (applySuggestions) { + Tools.correctBitext(reader, srcLt, lt, bRules); + } else { + Tools.checkBitext(reader, srcLt, lt, bRules, + apiFormat); + } + } else { + final String text = getFilteredText(filename, encoding); + if (applySuggestions) { + System.out.print(Tools.correctText(text, lt)); + } else if (profileRules) { + Tools.profileRulesOnText(text, lt); + } else if (!taggerOnly) { + Tools.checkText(text, lt, apiFormat, 0); + } else { + Tools.tagText(text, lt); + } + if (listUnknownWords) { + System.out.println("Unknown words: " + lt.getUnknownWords()); + } + } + } else { + if (verbose) { + lt.setOutput(System.err); + } + if (!apiFormat && !applySuggestions) { + if ("-".equals(filename)) { + System.out.println("Working on STDIN..."); + } else { + System.out.println("Working on " + filename + "..."); + } + } + int runCount = 1; + final List<Rule> rules = lt.getAllRules(); + if (profileRules) { + System.out.printf("Testing %d rules\n", rules.size()); + System.out.println("Rule ID\tTime\tSentences\tMatches\tSentences per sec."); + runCount = rules.size(); + } + InputStreamReader isr = null; + BufferedReader br = null; + int lineOffset = 0; + int tmpLineOffset = 0; + final List<String> unknownWords = new ArrayList<String>(); + StringBuilder sb = new StringBuilder(); + for (int ruleIndex = 0; ruleIndex <runCount; ruleIndex++) { + currentRule = rules.get(ruleIndex); + int matches = 0; + long sentences = 0; + final long startTime = System.currentTimeMillis(); + try { + if (!"-".equals(filename)) { + final File file = new File(filename); + if (encoding != null) { + isr = new InputStreamReader(new BufferedInputStream( + new FileInputStream(file.getAbsolutePath())), encoding); + } else { + isr = new InputStreamReader(new BufferedInputStream( + new FileInputStream(file.getAbsolutePath()))); + } + } else { + if (encoding != null) { + isr = new InputStreamReader(new BufferedInputStream(System.in), + encoding); + } else { + isr = new InputStreamReader(new BufferedInputStream(System.in)); + } + } + br = new BufferedReader(isr); + String line; + while ((line = br.readLine()) != null) { + sb.append(line); + sb.append('\n'); + tmpLineOffset++; + if (lt.getLanguage().getSentenceTokenizer() + .singleLineBreaksMarksPara()) { + matches = handleLine(matches, lineOffset, sb); + sentences += lt.getSentenceCount(); + if (profileRules) { + sentences += lt.sentenceTokenize(sb.toString()).size(); + } + if (listUnknownWords && !taggerOnly) { + for (String word : lt.getUnknownWords()) + if (!unknownWords.contains(word)) { + unknownWords.add(word); + } + } + sb = new StringBuilder(); + lineOffset = tmpLineOffset; + } else { + if ("".equals(line) || sb.length() >= MAX_FILE_SIZE) { + matches = handleLine(matches, lineOffset, sb); + sentences += lt.getSentenceCount(); + if (profileRules) { + sentences += lt.sentenceTokenize(sb.toString()).size(); + } + if (listUnknownWords && !taggerOnly) { + for (String word : lt.getUnknownWords()) + if (!unknownWords.contains(word)) { + unknownWords.add(word); + } + } + sb = new StringBuilder(); + lineOffset = tmpLineOffset; + } + } + } + } finally { + + if (sb.length() > 0) { + matches = handleLine(matches, tmpLineOffset - 1, sb); + sentences += lt.getSentenceCount(); + if (profileRules) { + sentences += lt.sentenceTokenize(sb.toString()).size(); + } + if (listUnknownWords && !taggerOnly) { + for (String word : lt.getUnknownWords()) + if (!unknownWords.contains(word)) { + unknownWords.add(word); + } + } + } + + printTimingInformation(listUnknownWords, rules, unknownWords, ruleIndex, matches, sentences, startTime); + + if (br != null) { + br.close(); + } + if (isr != null) { + isr.close(); + } + } + } + } + } + + private void printTimingInformation(final boolean listUnknownWords, final List<Rule> rules, + final List<String> unknownWords, final int ruleIndex, final int matches, final long sentences, final long startTime) { + if (!applySuggestions) { + final long endTime = System.currentTimeMillis(); + final long time = endTime - startTime; + final float timeInSeconds = time / 1000.0f; + final float sentencesPerSecond = sentences / timeInSeconds; + if (apiFormat) { + System.out.println("<!--"); + } + if (profileRules) { + //TODO: run 10 times, line in runOnce mode, and use median + System.out.printf(Locale.ENGLISH, + "%s\t%d\t%d\t%d\t%.1f", rules.get(ruleIndex).getId(), + time, sentences, matches, sentencesPerSecond); + System.out.println(); + } else { + System.out.printf(Locale.ENGLISH, + "Time: %dms for %d sentences (%.1f sentences/sec)", time, + sentences, sentencesPerSecond); + System.out.println(); + } + if (listUnknownWords) { + Collections.sort(unknownWords); + System.out.println("Unknown words: " + unknownWords); + } + if (apiFormat) { + System.out.println("-->"); + } + } + } + + private int handleLine(final int matchNo, final int lineOffset, + final StringBuilder sb) throws IOException { + int matches = matchNo; + if (applySuggestions) { + System.out.print(Tools.correctText(StringTools.filterXML(sb.toString()), + lt)); + } else if (profileRules) { + matches += Tools.profileRulesOnLine(StringTools.filterXML(sb.toString()), + lt, currentRule); + } else if (!taggerOnly) { + if (matches == 0) { + matches += Tools.checkText(StringTools.filterXML(sb.toString()), lt, + apiFormat, -1, lineOffset, matches, + StringTools.XmlPrintMode.START_XML); + } else { + matches += Tools.checkText(StringTools.filterXML(sb.toString()), lt, + apiFormat, -1, lineOffset, matches, + StringTools.XmlPrintMode.CONTINUE_XML); + } + } else { + Tools.tagText(StringTools.filterXML(sb.toString()), lt); + } + return matches; + } + + private void runRecursive(final String filename, final String encoding, + final boolean listUnknown) throws IOException, + ParserConfigurationException, SAXException { + final File dir = new File(filename); + if (!dir.isDirectory()) { + throw new IllegalArgumentException(dir.getAbsolutePath() + + " is not a directory, cannot use recursion"); + } + final File[] files = dir.listFiles(); + for (final File file : files) { + if (file.isDirectory()) { + runRecursive(file.getAbsolutePath(), encoding, listUnknown); + } else { + runOnFile(file.getAbsolutePath(), encoding, listUnknown); + } + } + } + + /** + * Loads filename and filters out XML. Note that the XML + * filtering can lead to incorrect positions in the list of matching rules. + * + * @param filename + * @throws IOException + */ + private String getFilteredText(final String filename, final String encoding) + throws IOException { + if (verbose) { + lt.setOutput(System.err); + } + if (!apiFormat && !applySuggestions) { + System.out.println("Working on " + filename + "..."); + } + final String fileContents = StringTools.readFile(new FileInputStream( + filename), encoding); + return StringTools.filterXML(fileContents); + } + + private static void exitWithUsageMessage() { + System.out + .println("Usage: java de.danielnaber.languagetool.Main " + + "[-r|--recursive] [-v|--verbose] [-l|--language LANG] [-m|--mothertongue LANG] [-d|--disable RULES] " + + "[-e|--enable RULES] [-c|--encoding] [-u|--list-unknown] [-t|--taggeronly] [-b] [--api] [-a|--apply] " + + "[-b2|--bitext] <file>"); + System.exit(1); + } + + /** + * Command line tool to check plain text files. + */ + public static void main(final String[] args) throws IOException, + ParserConfigurationException, SAXException { + if (args.length < 1 || args.length > 9) { + exitWithUsageMessage(); + } + boolean verbose = false; + boolean recursive = false; + boolean taggerOnly = false; + boolean singleLineBreakMarksParagraph = false; + boolean apiFormat = false; + boolean listUnknown = false; + boolean applySuggestions = false; + boolean profile = false; + boolean bitext = false; + Language language = null; + Language motherTongue = null; + String encoding = null; + String filename = null; + String[] disabledRules = new String[0]; + String[] enabledRules = new String[0]; + for (int i = 0; i < args.length; i++) { + if (args[i].equals("-h") || args[i].equals("-help") + || args[i].equals("--help") || args[i].equals("--?")) { + exitWithUsageMessage(); + } else if (args[i].equals("-v") || args[i].equals("--verbose")) { + verbose = true; + } else if (args[i].equals("-t") || args[i].equals("--taggeronly")) { + taggerOnly = true; + if (listUnknown) { + throw new IllegalArgumentException( + "You cannot list unknown words when tagging only."); + } + if (applySuggestions) { + throw new IllegalArgumentException( + "You cannot apply suggestions when tagging only."); + } + } else if (args[i].equals("-r") || args[i].equals("--recursive")) { + recursive = true; + } else if (args[i].equals("-b2") || args[i].equals("--bitext")) { + bitext = true; + } else if (args[i].equals("-d") || args[i].equals("--disable")) { + if (enabledRules.length > 0) { + throw new IllegalArgumentException( + "You cannot specify both enabled and disabled rules"); + } + final String rules = args[++i]; + disabledRules = rules.split(","); + } else if (args[i].equals("-e") || args[i].equals("--enable")) { + if (disabledRules.length > 0) { + throw new IllegalArgumentException( + "You cannot specify both enabled and disabled rules"); + } + final String rules = args[++i]; + enabledRules = rules.split(","); + } else if (args[i].equals("-l") || args[i].equals("--language")) { + language = getLanguageOrExit(args[++i]); + } else if (args[i].equals("-m") || args[i].equals("--mothertongue")) { + motherTongue = getLanguageOrExit(args[++i]); + } else if (args[i].equals("-c") || args[i].equals("--encoding")) { + encoding = args[++i]; + } else if (args[i].equals("-u") || args[i].equals("--list-unknown")) { + listUnknown = true; + if (taggerOnly) { + throw new IllegalArgumentException( + "You cannot list unknown words when tagging only."); + } + } else if (args[i].equals("-b")) { + singleLineBreakMarksParagraph = true; + } else if (args[i].equals("--api")) { + apiFormat = true; + if (applySuggestions) { + throw new IllegalArgumentException( + "API format makes no sense for automatic application of suggestions."); + } + } else if (args[i].equals("-a") || args[i].equals("--apply")) { + applySuggestions = true; + if (taggerOnly) { + throw new IllegalArgumentException( + "You cannot apply suggestions when tagging only."); + } + if (apiFormat) { + throw new IllegalArgumentException( + "API format makes no sense for automatic application of suggestions."); + } + } else if (args[i].equals("-p") || args[i].equals("--profile")) { + profile = true; + if (apiFormat) { + throw new IllegalArgumentException( + "API format makes no sense for profiling."); + } + if (applySuggestions) { + throw new IllegalArgumentException( + "Applying suggestions makes no sense for profiling."); + } + if (taggerOnly) { + throw new IllegalArgumentException( + "Tagging makes no sense for profiling."); + } + } else if (i == args.length - 1) { + filename = args[i]; + } else { + System.err.println("Unknown option: " + args[i]); + exitWithUsageMessage(); + } + } + if (filename == null) { + filename = "-"; + } + if (language == null) { + if (!apiFormat) { + System.err.println("No language specified, using English"); + } + language = Language.ENGLISH; + } else if (!apiFormat && !applySuggestions) { + System.out.println("Expected text language: " + language.getName()); + } + language.getSentenceTokenizer().setSingleLineBreaksMarksParagraph( + singleLineBreakMarksParagraph); + final Main prg = new Main(verbose, taggerOnly, language, motherTongue, + disabledRules, enabledRules, apiFormat, applySuggestions); + prg.setListUnknownWords(listUnknown); + if (profile) { + prg.setProfilingMode(); + } + if (bitext) { + if (motherTongue == null) { + throw new IllegalArgumentException( + "You have to set the source language (as mother tongue)."); + } + prg.setBitextMode(motherTongue, disabledRules, enabledRules); + } + if (recursive) { + prg.runRecursive(filename, encoding, listUnknown); + } else { + prg.runOnFile(filename, encoding, listUnknown); + } + } + + private static Language getLanguageOrExit(final String lang) { + Language language = null; + boolean foundLanguage = false; + final List<String> supportedLanguages = new ArrayList<String>(); + for (final Language tmpLang : Language.LANGUAGES) { + supportedLanguages.add(tmpLang.getShortName()); + if (lang.equals(tmpLang.getShortName())) { + language = tmpLang; + foundLanguage = true; + break; + } + } + if (!foundLanguage) { + System.out.println("Unknown language '" + lang + + "'. Supported languages are: " + supportedLanguages); + exitWithUsageMessage(); + } + return language; + } + +} |