author    Arno Teigseth <arno@teigseth.no>    2011-02-05 08:48:27 +0000
committer Arno Teigseth <arno@teigseth.no>    2011-02-05 08:48:27 +0000
commit    4f3d565a5e5ede6eb6fd1f276d4e8ad37b67b5ce (patch)
tree      7af736540eca93034428a975bd850e709fbbe2e5 /JLanguageTool/src/java/de/danielnaber/languagetool/JLanguageTool.java
parent    ecaee85ab5984ebadd56721c295dc26b3335f7ce (diff)
added more files, to complete languagetool upload (HEAD, master)
Diffstat (limited to 'JLanguageTool/src/java/de/danielnaber/languagetool/JLanguageTool.java')
-rw-r--r--    JLanguageTool/src/java/de/danielnaber/languagetool/JLanguageTool.java    802
1 file changed, 802 insertions, 0 deletions
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/JLanguageTool.java b/JLanguageTool/src/java/de/danielnaber/languagetool/JLanguageTool.java
new file mode 100644
index 0000000..44bdfec
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/JLanguageTool.java
@@ -0,0 +1,802 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+package de.danielnaber.languagetool;
+
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.PrintStream;
+import java.lang.reflect.Constructor;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+import java.util.MissingResourceException;
+import java.util.ResourceBundle;
+import java.util.Set;
+
+import javax.xml.parsers.ParserConfigurationException;
+
+import org.xml.sax.SAXException;
+
+import de.danielnaber.languagetool.databroker.DefaultResourceDataBroker;
+import de.danielnaber.languagetool.databroker.ResourceDataBroker;
+import de.danielnaber.languagetool.rules.Rule;
+import de.danielnaber.languagetool.rules.RuleMatch;
+import de.danielnaber.languagetool.rules.patterns.FalseFriendRuleLoader;
+import de.danielnaber.languagetool.rules.patterns.PatternRule;
+import de.danielnaber.languagetool.rules.patterns.PatternRuleLoader;
+import de.danielnaber.languagetool.tagging.Tagger;
+import de.danielnaber.languagetool.tagging.disambiguation.Disambiguator;
+import de.danielnaber.languagetool.tokenizers.Tokenizer;
+import de.danielnaber.languagetool.tools.ReflectionUtils;
+
+/**
+ * The main class used for checking text against different rules:
+ * <ul>
+ * <li>the built-in rules (<i>a</i> vs. <i>an</i>, whitespace after commas, ...)
+ * <li>pattern rules loaded from external XML files with
+ * {@link #loadPatternRules(String)}
+ * <li>your own implementation of the abstract {@link Rule} classes added with
+ * {@link #addRule(Rule)}
+ * </ul>
+ *
+ * <p>
+ * Note that the constructors create a language checker that uses the built-in
+ * rules only. Other rules (e.g. from XML) need to be added explicitly.
+ *
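+ * <p>
+ * A minimal usage sketch (not from the original documentation; it assumes a
+ * suitable {@link Language} constant, e.g. an English implementation, is
+ * available):
+ *
+ * <pre>
+ * JLanguageTool langTool = new JLanguageTool(Language.ENGLISH);
+ * langTool.activateDefaultPatternRules();
+ * List&lt;RuleMatch&gt; matches = langTool.check("This is a example text.");
+ * </pre>
+ *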
+ * @author Daniel Naber
+ */
+@SuppressWarnings({"UnusedDeclaration"})
+public final class JLanguageTool {
+
+ public static final String VERSION = "1.3-dev"; // keep in sync with
+ // build.properties!
+
+ private static ResourceDataBroker dataBroker = new DefaultResourceDataBroker();
+ public static final String PATTERN_FILE = "grammar.xml";
+ public static final String FALSE_FRIEND_FILE = "false-friends.xml";
+
+ public static final String SENTENCE_START_TAGNAME = "SENT_START";
+ public static final String SENTENCE_END_TAGNAME = "SENT_END";
+ public static final String PARAGRAPH_END_TAGNAME = "PARA_END";
+
+ private final List<Rule> builtinRules = new ArrayList<Rule>();
+ private final List<Rule> userRules = new ArrayList<Rule>(); // rules added via addRule() method
+ private final Set<String> disabledRules = new HashSet<String>();
+ private final Set<String> enabledRules = new HashSet<String>();
+
+ private final Set<String> disabledCategories = new HashSet<String>();
+
+ private Language language;
+ private Language motherTongue;
+ private Disambiguator disambiguator;
+ private Tagger tagger;
+ private Tokenizer sentenceTokenizer;
+ private Tokenizer wordTokenizer;
+
+ private PrintStream printStream;
+
+ private int sentenceCount;
+
+ private boolean listUnknownWords;
+ private Set<String> unknownWords;
+
+ /**
+ * Constants for correct paragraph-rule handling.
+ */
+ public static enum paragraphHandling {
+ /**
+ * Handle normally - all kinds of rules run.
+ */
+ NORMAL,
+ /**
+ * Run only paragraph-level rules.
+ */
+ ONLYPARA,
+ /**
+ * Run only sentence-level rules.
+ */
+ ONLYNONPARA
+ }
+
+ // just for testing:
+ /*
+ * private Rule[] allBuiltinRules = new Rule[] { new
+ * UppercaseSentenceStartRule() };
+ */
+
+ /**
+ * Create a JLanguageTool and set up the built-in rules appropriate for the
+ * given language, ignoring false friend hints.
+ *
+ * @throws IOException
+ */
+ public JLanguageTool(final Language language) throws IOException {
+ this(language, null);
+ }
+
+ /**
+ * Create a JLanguageTool and set up the built-in rules appropriate for the
+ * given language.
+ *
+ * @param language
+ * the language to be used.
+ * @param motherTongue
+ * the user's mother tongue or <code>null</code>. The mother tongue
+ * may also be used as a source language for checking bilingual texts.
+ *
+ * @throws IOException
+ */
+ public JLanguageTool(final Language language, final Language motherTongue)
+ throws IOException {
+ if (language == null) {
+ throw new NullPointerException("language cannot be null");
+ }
+ this.language = language;
+ this.motherTongue = motherTongue;
+ final ResourceBundle messages = getMessageBundle(language);
+ final Rule[] allBuiltinRules = getAllBuiltinRules(language, messages);
+ for (final Rule element : allBuiltinRules) {
+ if (element.supportsLanguage(language)) {
+ builtinRules.add(element);
+ }
+ }
+ disambiguator = language.getDisambiguator();
+ tagger = language.getTagger();
+ sentenceTokenizer = language.getSentenceTokenizer();
+ wordTokenizer = language.getWordTokenizer();
+ }
+
+ /**
+ * The grammar checker needs resources from the following
+ * directories:
+ *
+ * <ul style="list-type: circle">
+ * <li>{@code /resource}</li>
+ * <li>{@code /rules}</li>
+ * </ul>
+ *
+ * This method is thread-safe.
+ *
+ * @return The currently set data broker, which allows obtaining
+ * resources from the directories mentioned above. If no
+ * data broker was set, a new {@link DefaultResourceDataBroker} will
+ * be instantiated and returned.
+ * @since 1.0.1
+ */
+ public static synchronized ResourceDataBroker getDataBroker() {
+ if (JLanguageTool.dataBroker == null) {
+ JLanguageTool.dataBroker = new DefaultResourceDataBroker();
+ }
+ return JLanguageTool.dataBroker;
+ }
+
+ /**
+ * The grammar checker needs resources from the following
+ * directories:
+ *
+ * <ul style="list-type: circle">
+ * <li>{@code /resource}</li>
+ * <li>{@code /rules}</li>
+ * </ul>
+ *
+ * This method is thread-safe.
+ *
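+ * <p>For example, to explicitly (re)install the default broker (a sketch,
+ * not from the original documentation):
+ *
+ * <pre>
+ * JLanguageTool.setDataBroker(new DefaultResourceDataBroker());
+ * </pre>
+ *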
+ * @param broker The new resource broker to be used.
+ * @since 1.0.1
+ */
+ public static synchronized void setDataBroker(ResourceDataBroker broker) {
+ JLanguageTool.dataBroker = broker;
+ }
+
+ /**
+ * Whether the check() method stores unknown words. If set to
+ * <code>true</code> (default: false), you can get the list of unknown words
+ * using getUnknownWords().
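+ *
+ * <p>Usage sketch (assumes an already configured {@code langTool} instance):
+ *
+ * <pre>
+ * langTool.setListUnknownWords(true);
+ * langTool.check(someText);
+ * List&lt;String&gt; unknown = langTool.getUnknownWords();
+ * </pre>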
+ */
+ public void setListUnknownWords(final boolean listUnknownWords) {
+ this.listUnknownWords = listUnknownWords;
+ }
+
+ /**
+ * Gets the ResourceBundle for the default language of the user's system.
+ */
+ public static ResourceBundle getMessageBundle() {
+ try {
+ return ResourceBundle
+ .getBundle("de.danielnaber.languagetool.MessagesBundle");
+ } catch (final MissingResourceException e) {
+ return ResourceBundle.getBundle(
+ "de.danielnaber.languagetool.MessagesBundle", Locale.ENGLISH);
+ }
+ }
+
+ /**
+ * Gets the ResourceBundle for the given user interface language.
+ */
+ private static ResourceBundle getMessageBundle(final Language lang) {
+ try {
+ return ResourceBundle.getBundle(
+ "de.danielnaber.languagetool.MessagesBundle", lang.getLocale());
+ } catch (final MissingResourceException e) {
+ return ResourceBundle.getBundle(
+ "de.danielnaber.languagetool.MessagesBundle", Locale.ENGLISH);
+ }
+ }
+
+ private Rule[] getAllBuiltinRules(final Language language,
+ final ResourceBundle messages) {
+ // use reflection to get a list of all non-pattern rules under
+ // "de.danielnaber.languagetool.rules"
+ // generic rules first, then language-specific ones
+ // TODO: the order of loading classes is not guaranteed, so we may want to
+ // implement rule precedence
+
+ final List<Rule> rules = new ArrayList<Rule>();
+ try {
+ // we pass ".*Rule$" regexp to improve efficiency, see javadoc
+ final Class[] classes1 = ReflectionUtils.findClasses(Rule.class
+ .getClassLoader(), Rule.class.getPackage().getName(), ".*Rule$", 0,
+ Rule.class, null);
+ final Class[] classes2 = ReflectionUtils.findClasses(Rule.class
+ .getClassLoader(), Rule.class.getPackage().getName() + "."
+ + language.getShortName(), ".*Rule$", 0, Rule.class, null);
+
+ final List<Class> classes = new ArrayList<Class>();
+ classes.addAll(Arrays.asList(classes1));
+ classes.addAll(Arrays.asList(classes2));
+
+ for (final Class class1 : classes) {
+ final Constructor[] constructors = class1.getConstructors();
+ for (final Constructor constructor : constructors) {
+ final Class[] paramTypes = constructor.getParameterTypes();
+ if (paramTypes.length == 1
+ && paramTypes[0].equals(ResourceBundle.class)) {
+ rules.add((Rule) constructor.newInstance(messages));
+ break;
+ }
+ if (paramTypes.length == 2
+ && paramTypes[0].equals(ResourceBundle.class)
+ && paramTypes[1].equals(Language.class)) {
+ rules.add((Rule) constructor.newInstance(messages, language));
+ break;
+ }
+ throw new RuntimeException("Unknown constructor for rule class: "
+ + class1.getName());
+ }
+ }
+ } catch (final Exception e) {
+ throw new RuntimeException("Failed to load rules for language " + language, e);
+ }
+ // System.err.println("Loaded " + rules.size() + " rules");
+ return rules.toArray(new Rule[rules.size()]);
+ }
+
+ /**
+ * Set a PrintStream that will receive verbose output. Set to
+ * <code>null</code> to disable verbose output.
+ */
+ public void setOutput(final PrintStream printStream) {
+ this.printStream = printStream;
+ }
+
+ /**
+ * Load pattern rules from an XML file. Use {@link #addRule} to add these
+ * rules to the checking process.
+ *
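+ * <p>For example (a sketch; the file name is hypothetical):
+ *
+ * <pre>
+ * for (PatternRule rule : langTool.loadPatternRules("/path/to/grammar.xml")) {
+ *   langTool.addRule(rule);
+ * }
+ * </pre>
+ *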
+ * @throws IOException
+ * @return a List of {@link PatternRule} objects
+ */
+ public List<PatternRule> loadPatternRules(final String filename)
+ throws IOException {
+ final PatternRuleLoader ruleLoader = new PatternRuleLoader();
+ InputStream is = this.getClass().getResourceAsStream(filename);
+ if (is == null) {
+ // happens for external rules plugged in as an XML file:
+ is = new FileInputStream(filename);
+ }
+ return ruleLoader.getRules(is, filename);
+ }
+
+ /**
+ * Load false friend rules from an XML file. Only those rule pairs that match
+ * the current text language and the mother tongue specified in the
+ * JLanguageTool constructor will be loaded. Use {@link #addRule} to add these rules to the
+ * checking process.
+ *
+ * @throws ParserConfigurationException
+ * @throws SAXException
+ * @throws IOException
+ * @return a List of {@link PatternRule} objects
+ */
+ public List<PatternRule> loadFalseFriendRules(final String filename)
+ throws ParserConfigurationException, SAXException, IOException {
+ if (motherTongue == null) {
+ return new ArrayList<PatternRule>();
+ }
+ final FalseFriendRuleLoader ruleLoader = new FalseFriendRuleLoader();
+ return ruleLoader.getRules(this.getClass().getResourceAsStream(filename),
+ language, motherTongue);
+ }
+
+ /**
+ * Loads and activates the pattern rules from
+ * <code>rules/&lt;language&gt;/grammar.xml</code>.
+ *
+ * @throws IOException
+ */
+ public void activateDefaultPatternRules() throws IOException {
+ final String defaultPatternFilename = language.getRuleFileName();
+ final List<PatternRule> patternRules = loadPatternRules(defaultPatternFilename);
+ userRules.addAll(patternRules);
+ }
+
+ /**
+ * Loads and activates the false friend rules from
+ * <code>rules/false-friends.xml</code>.
+ *
+ * @throws ParserConfigurationException
+ * @throws SAXException
+ * @throws IOException
+ */
+ public void activateDefaultFalseFriendRules()
+ throws ParserConfigurationException, SAXException, IOException {
+ final String falseFriendRulesFilename = JLanguageTool.getDataBroker().getRulesDir() + "/" + FALSE_FRIEND_FILE;
+ final List<PatternRule> patternRules = loadFalseFriendRules(falseFriendRulesFilename);
+ userRules.addAll(patternRules);
+ }
+
+ /**
+ * Add a rule to be used by the next call to {@link #check}.
+ */
+ public void addRule(final Rule rule) {
+ userRules.add(rule);
+ }
+
+ /**
+ * Disable a given rule so {@link #check} won't use it.
+ *
+ * @param ruleId
+ * the id of the rule to disable
+ */
+ public void disableRule(final String ruleId) {
+ // TODO: check if such a rule exists
+ disabledRules.add(ruleId);
+ }
+
+ /**
+ * Disable a given category so {@link #check} won't use it.
+ *
+ * @param categoryName
+ * the name of the category to disable
+ */
+ public void disableCategory(final String categoryName) {
+ // TODO: check if such a category exists
+ disabledCategories.add(categoryName);
+ }
+
+ /**
+ * Get the language that was used to configure this instance.
+ */
+ public Language getLanguage() {
+ return language;
+ }
+
+ /**
+ * Get rule ids of the rules that have been explicitly disabled.
+ */
+ public Set<String> getDisabledRules() {
+ return disabledRules;
+ }
+
+ /**
+ * Enable a rule that was switched off by default.
+ *
+ * @param ruleId
+ * the id of the rule to enable (a rule that is off by default).
+ *
+ */
+ public void enableDefaultOffRule(final String ruleId) {
+ enabledRules.add(ruleId);
+ }
+
+ /**
+ * Get category ids of the rules that have been explicitly disabled.
+ */
+ public Set<String> getDisabledCategories() {
+ return disabledCategories;
+ }
+
+ /**
+ * Re-enable a given rule so {@link #check} will use it.
+ *
+ * @param ruleId
+ * the id of the rule to enable
+ */
+ public void enableRule(final String ruleId) {
+ if (disabledRules.contains(ruleId)) {
+ disabledRules.remove(ruleId);
+ }
+ }
+
+ /**
+ * Returns tokenized sentences.
+ */
+ public List<String> sentenceTokenize(final String text) {
+ return sentenceTokenizer.tokenize(text);
+ }
+
+ /**
+ * The main check method. Tokenizes the text into sentences and matches these
+ * sentences against all currently active rules.
+ *
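+ * <p>A sketch of inspecting the results (the getters shown are the ones used
+ * elsewhere in this class):
+ *
+ * <pre>
+ * for (RuleMatch match : langTool.check(someText)) {
+ *   System.out.println(match.getFromPos() + "-" + match.getToPos()
+ *       + ": " + match.getMessage());
+ * }
+ * </pre>
+ *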
+ * @param text
+ * the text to check
+ * @return a List of {@link RuleMatch} objects
+ * @throws IOException
+ */
+ public List<RuleMatch> check(final String text) throws IOException {
+ return check(text, true, paragraphHandling.NORMAL);
+ }
+
+
+ /**
+ * The main check method. Tokenizes the text into sentences and matches these
+ * sentences against all currently active rules.
+ *
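+ * <p>For example, to run only the paragraph-level rules on text that is
+ * already split into sentences (a sketch, not from the original sources):
+ *
+ * <pre>
+ * List&lt;RuleMatch&gt; matches =
+ *     langTool.check(sentence, false, paragraphHandling.ONLYPARA);
+ * </pre>
+ *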
+ * @param text
+ * the text to check
+ * @param tokenizeText
+ * If true, then the text is tokenized into sentences.
+ * Otherwise, it is assumed it's already tokenized.
+ * @param paraMode
+ * Determines whether only paragraph-level rules, only sentence-level
+ * rules, or all rules are run (see {@link paragraphHandling}).
+ *
+ * @return a List of {@link RuleMatch} objects
+ * @throws IOException
+ */
+ public List<RuleMatch> check(final String text, boolean tokenizeText, final paragraphHandling paraMode) throws IOException {
+ sentenceCount = 0;
+ final List<String> sentences;
+ if (tokenizeText) {
+ sentences = sentenceTokenize(text);
+ } else {
+ sentences = new ArrayList<String>();
+ sentences.add(text);
+ }
+ final List<RuleMatch> ruleMatches = new ArrayList<RuleMatch>();
+ final List<Rule> allRules = getAllRules();
+ printIfVerbose(allRules.size() + " rules activated for language "
+ + language);
+ int charCount = 0;
+ int lineCount = 0;
+ int columnCount = 1;
+ unknownWords = new HashSet<String>();
+ for (final String sentence : sentences) {
+ sentenceCount++;
+ AnalyzedSentence analyzedText = getAnalyzedSentence(sentence);
+ rememberUnknownWords(analyzedText);
+
+ if (sentenceCount == sentences.size()) {
+ final AnalyzedTokenReadings[] anTokens = analyzedText.getTokens();
+ anTokens[anTokens.length - 1].setParaEnd();
+ analyzedText = new AnalyzedSentence(anTokens);
+ }
+
+ printIfVerbose(analyzedText.toString());
+ final List<RuleMatch> sentenceMatches =
+ checkAnalyzedSentence(paraMode, allRules, charCount, lineCount,
+ columnCount, sentence, analyzedText);
+
+ Collections.sort(sentenceMatches);
+ ruleMatches.addAll(sentenceMatches);
+ charCount += sentence.length();
+ lineCount += countLineBreaks(sentence);
+
+ // calculate matching column:
+ final int lineBreakPos = sentence.indexOf('\n');
+ if (lineBreakPos == -1) {
+ columnCount += sentence.length() -1;
+ } else {
+ if (lineBreakPos == 0) {
+ columnCount = sentence.length();
+ if (!language.getSentenceTokenizer().
+ singleLineBreaksMarksPara()) {
+ columnCount--;
+ }
+ } else {
+ columnCount = 1;
+ }
+ }
+ }
+
+ if (!paraMode.equals(paragraphHandling.ONLYNONPARA)) {
+ // removing false positives in paragraph-level rules
+ for (final Rule rule : allRules) {
+ if (rule.isParagraphBackTrack() && (rule.getMatches() != null)) {
+ final List<RuleMatch> rm = rule.getMatches();
+ for (final RuleMatch r : rm) {
+ if (rule.isInRemoved(r)) {
+ ruleMatches.remove(r);
+ }
+ }
+ }
+ }
+ }
+
+ return ruleMatches;
+ }
+
+ public List<RuleMatch> checkAnalyzedSentence(final paragraphHandling paraMode,
+ final List<Rule> allRules, int tokenCount, int lineCount,
+ int columnCount, final String sentence, AnalyzedSentence analyzedText)
+ throws IOException {
+ final List<RuleMatch> sentenceMatches = new ArrayList<RuleMatch>();
+ for (final Rule rule : allRules) {
+ if (disabledRules.contains(rule.getId())
+ || (rule.isDefaultOff() && !enabledRules.contains(rule.getId()))) {
+ continue;
+ }
+
+ if (disabledCategories.contains(rule.getCategory().getName())) {
+ continue;
+ }
+
+ switch (paraMode) {
+ case ONLYNONPARA: {
+ if (rule.isParagraphBackTrack()) {
+ continue;
+ }
+ break;
+ }
+ case ONLYPARA: {
+ if (!rule.isParagraphBackTrack()) {
+ continue;
+ }
+ break;
+ }
+ case NORMAL:
+ default:
+ }
+
+ final RuleMatch[] thisMatches = rule.match(analyzedText);
+ for (final RuleMatch element1 : thisMatches) {
+ RuleMatch thisMatch = adjustRuleMatchPos(element1,
+ tokenCount, columnCount, lineCount, sentence);
+ sentenceMatches.add(thisMatch);
+ if (rule.isParagraphBackTrack()) {
+ rule.addRuleMatch(thisMatch);
+ }
+ }
+ }
+ return sentenceMatches;
+ }
+
+ /**
+ * Change RuleMatch positions so they are relative to the complete text,
+ * not just to the sentence.
+ * @param rm RuleMatch
+ * @param sentLen Number of characters in the text before the current sentence
+ * @param columnCount Current column number
+ * @param lineCount Current line number
+ * @param sentence The sentence being checked
+ * @return The RuleMatch object with adjusted positions
+ */
+ public RuleMatch adjustRuleMatchPos(final RuleMatch rm, int sentLen,
+ int columnCount, int lineCount, final String sentence) {
+ final RuleMatch thisMatch = new RuleMatch(rm.getRule(),
+ rm.getFromPos() + sentLen, rm.getToPos()
+ + sentLen, rm.getMessage(), rm
+ .getShortMessage());
+ thisMatch.setSuggestedReplacements(rm
+ .getSuggestedReplacements());
+ final String sentencePartToError = sentence.substring(0, rm
+ .getFromPos());
+ final String sentencePartToEndOfError = sentence.substring(0,
+ rm.getToPos());
+ final int lastLineBreakPos = sentencePartToError.lastIndexOf('\n');
+ final int column;
+ final int endColumn;
+ if (lastLineBreakPos == -1) {
+ column = sentencePartToError.length() + columnCount;
+ } else {
+ column = sentencePartToError.length() - lastLineBreakPos;
+ }
+ final int lastLineBreakPosInError = sentencePartToEndOfError
+ .lastIndexOf('\n');
+ if (lastLineBreakPosInError == -1) {
+ endColumn = sentencePartToEndOfError.length() + columnCount + 1;
+ } else {
+ endColumn = sentencePartToEndOfError.length() - lastLineBreakPos;
+ }
+ final int lineBreaksToError = countLineBreaks(sentencePartToError);
+ final int lineBreaksToEndOfError = countLineBreaks(sentencePartToEndOfError);
+ thisMatch.setLine(lineCount + lineBreaksToError);
+ thisMatch.setEndLine(lineCount + lineBreaksToEndOfError);
+ thisMatch.setColumn(column);
+ thisMatch.setEndColumn(endColumn);
+ thisMatch.setOffset(rm.getFromPos() + sentLen);
+ return thisMatch;
+ }
+
+ private void rememberUnknownWords(final AnalyzedSentence analyzedText) {
+ if (listUnknownWords) {
+ final AnalyzedTokenReadings[] atr = analyzedText
+ .getTokensWithoutWhitespace();
+ for (final AnalyzedTokenReadings t : atr) {
+ if (t.getReadings().toString().contains("null]")) {
+ unknownWords.add(t.getToken());
+ }
+ }
+ }
+ }
+
+ /**
+ * Get the list of unknown words in the last run of the check() method.
+ *
+ * @throws IllegalStateException
+ * if listUnknownWords is set to <code>false</code>
+ */
+ public List<String> getUnknownWords() {
+ if (!listUnknownWords) {
+ throw new IllegalStateException(
+ "listUnknownWords is set to false, unknown words not stored");
+ }
+ final List<String> words = new ArrayList<String>(unknownWords);
+ Collections.sort(words);
+ return words;
+ }
+
+ static int countLineBreaks(final String s) {
+ int pos = -1;
+ int count = 0;
+ while (true) {
+ final int nextPos = s.indexOf('\n', pos + 1);
+ if (nextPos == -1) {
+ break;
+ }
+ pos = nextPos;
+ count++;
+ }
+ return count;
+ }
+
+ /**
+ * Tokenizes the given <code>sentence</code> into words and analyzes it,
+ * and then disambiguates POS tags.
+ *
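+ * <p>A sketch for inspecting the analysis ({@code toString()} is also what
+ * the verbose mode prints):
+ *
+ * <pre>
+ * System.out.println(langTool.getAnalyzedSentence("This is a test."));
+ * </pre>
+ *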
+ * @throws IOException
+ */
+ public AnalyzedSentence getAnalyzedSentence(final String sentence)
+ throws IOException {
+ // disambiguate assigned tags & return
+ return disambiguator.disambiguate(getRawAnalyzedSentence(sentence));
+ }
+
+ /**
+ * Tokenizes the given <code>sentence</code> into words and analyzes it.
+ *
+ * @since 0.9.8
+ * @param sentence
+ * Sentence to be analyzed
+ * @return AnalyzedSentence
+ * @throws IOException
+ */
+ public AnalyzedSentence getRawAnalyzedSentence(final String sentence) throws IOException {
+ final List<String> tokens = wordTokenizer.tokenize(sentence);
+ final Map<Integer, String> softHyphenTokens = new HashMap<Integer, String>();
+
+ //for soft hyphens inside words, happens especially in OOo:
+ for (int i = 0; i < tokens.size(); i++) {
+ if (tokens.get(i).indexOf('\u00ad') != -1) {
+ softHyphenTokens.put(i, tokens.get(i));
+ tokens.set(i, tokens.get(i).replaceAll("\u00ad", ""));
+ }
+ }
+
+ final List<AnalyzedTokenReadings> aTokens = tagger.tag(tokens);
+ final int numTokens = aTokens.size();
+ int posFix = 0;
+ for (int i = 1; i < numTokens; i++) {
+ aTokens.get(i).setWhitespaceBefore(aTokens.get(i - 1).isWhitespace());
+ aTokens.get(i).setStartPos(aTokens.get(i).getStartPos() + posFix);
+ if (!softHyphenTokens.isEmpty()) {
+ if (softHyphenTokens.get(i) != null) {
+ aTokens.get(i).addReading(tagger.createToken(softHyphenTokens.get(i), null));
+ posFix += softHyphenTokens.get(i).length() - aTokens.get(i).getToken().length();
+ }
+ }
+ }
+
+ final AnalyzedTokenReadings[] tokenArray = new AnalyzedTokenReadings[tokens
+ .size() + 1];
+ final AnalyzedToken[] startTokenArray = new AnalyzedToken[1];
+ int toArrayCount = 0;
+ final AnalyzedToken sentenceStartToken = new AnalyzedToken("", SENTENCE_START_TAGNAME, null);
+ startTokenArray[0] = sentenceStartToken;
+ tokenArray[toArrayCount++] = new AnalyzedTokenReadings(startTokenArray, 0);
+ int startPos = 0;
+ for (final AnalyzedTokenReadings posTag : aTokens) {
+ posTag.setStartPos(startPos);
+ tokenArray[toArrayCount++] = posTag;
+ startPos += posTag.getToken().length();
+ }
+
+ // add additional tags
+ int lastToken = toArrayCount - 1;
+ // make SENT_END appear at last not whitespace token
+ for (int i = 0; i < toArrayCount - 1; i++) {
+ if (!tokenArray[lastToken - i].isWhitespace()) {
+ lastToken -= i;
+ break;
+ }
+ }
+
+ tokenArray[lastToken].setSentEnd();
+
+ if (tokenArray.length == lastToken + 1 && tokenArray[lastToken].isLinebreak()) {
+ tokenArray[lastToken].setParaEnd();
+ }
+ return new AnalyzedSentence(tokenArray);
+ }
+
+ /**
+ * Get all rules for the current language that are built-in or that have been
+ * added using {@link #addRule}.
+ * @return a List of {@link Rule} objects
+ */
+ public List<Rule> getAllRules() {
+ final List<Rule> rules = new ArrayList<Rule>();
+ rules.addAll(builtinRules);
+ rules.addAll(userRules);
+ // Some rules have an internal state so they can do checks over sentence
+ // boundaries. These need to be reset so the checks don't suddenly
+ // work on different texts with the same data. However, it could be useful
+ // to keep the state information if we're checking a continuous text.
+ for (final Rule rule : rules) {
+ rule.reset();
+ }
+ return rules;
+ }
+
+ /**
+ * Number of sentences the latest call to check() has checked.
+ */
+ public int getSentenceCount() {
+ return sentenceCount;
+
+ }
+
+ private void printIfVerbose(final String s) {
+ if (printStream != null) {
+ printStream.println(s);
+ }
+ }
+
+}