/* LanguageTool, a natural language style checker
* Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
* USA
*/
package de.danielnaber.languagetool;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintStream;
import java.lang.reflect.Constructor;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.MissingResourceException;
import java.util.ResourceBundle;
import java.util.Set;
import javax.xml.parsers.ParserConfigurationException;
import org.xml.sax.SAXException;
import de.danielnaber.languagetool.databroker.DefaultResourceDataBroker;
import de.danielnaber.languagetool.databroker.ResourceDataBroker;
import de.danielnaber.languagetool.rules.Rule;
import de.danielnaber.languagetool.rules.RuleMatch;
import de.danielnaber.languagetool.rules.patterns.FalseFriendRuleLoader;
import de.danielnaber.languagetool.rules.patterns.PatternRule;
import de.danielnaber.languagetool.rules.patterns.PatternRuleLoader;
import de.danielnaber.languagetool.tagging.Tagger;
import de.danielnaber.languagetool.tagging.disambiguation.Disambiguator;
import de.danielnaber.languagetool.tokenizers.Tokenizer;
import de.danielnaber.languagetool.tools.ReflectionUtils;
/**
* The main class used for checking text against different rules:
* <ul>
* <li>the built-in rules (a vs. an, whitespace after commas, ...)</li>
* <li>pattern rules loaded from external XML files with
* {@link #loadPatternRules(String)}</li>
* <li>your own implementation of the abstract {@link Rule} classes added with
* {@link #addRule(Rule)}</li>
* </ul>
*
* <p>Note that the constructors create a language checker that uses the built-in
* rules only. Other rules (e.g. from XML) need to be added explicitly.
*
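* <p>A typical invocation, as a sketch (it assumes that {@code Language.ENGLISH}
* is defined in this version and that the default pattern rules are wanted):
* <pre>
* JLanguageTool langTool = new JLanguageTool(Language.ENGLISH);
* langTool.activateDefaultPatternRules();
* List&lt;RuleMatch&gt; matches = langTool.check("This are an example sentence.");
* for (RuleMatch match : matches) {
*   System.out.println(match);
* }
* </pre>
*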
* @author Daniel Naber
*/
@SuppressWarnings({"UnusedDeclaration"})
public final class JLanguageTool {
public static final String VERSION = "1.3-dev"; // keep in sync with
// build.properties!
private static ResourceDataBroker dataBroker = new DefaultResourceDataBroker();
public static final String PATTERN_FILE = "grammar.xml";
public static final String FALSE_FRIEND_FILE = "false-friends.xml";
public static final String SENTENCE_START_TAGNAME = "SENT_START";
public static final String SENTENCE_END_TAGNAME = "SENT_END";
public static final String PARAGRAPH_END_TAGNAME = "PARA_END";
private final List<Rule> builtinRules = new ArrayList<Rule>();
private final List<Rule> userRules = new ArrayList<Rule>(); // rules added via addRule() method
private final Set<String> disabledRules = new HashSet<String>();
private final Set<String> enabledRules = new HashSet<String>();
private final Set<String> disabledCategories = new HashSet<String>();
private Language language;
private Language motherTongue;
private Disambiguator disambiguator;
private Tagger tagger;
private Tokenizer sentenceTokenizer;
private Tokenizer wordTokenizer;
private PrintStream printStream;
private int sentenceCount;
private boolean listUnknownWords;
private Set<String> unknownWords;
/**
* Constants for correct paragraph-rule handling.
*/
public static enum paragraphHandling {
/**
* Handle normally - all kinds of rules run.
*/
NORMAL,
/**
* Run only paragraph-level rules.
*/
ONLYPARA,
/**
* Run only sentence-level rules.
*/
ONLYNONPARA
}
// just for testing:
/*
* private Rule[] allBuiltinRules = new Rule[] { new
* UppercaseSentenceStartRule() };
*/
/**
* Create a JLanguageTool and set up the built-in rules appropriate for the
* given language, ignoring false friend hints.
*
* @throws IOException
*/
public JLanguageTool(final Language language) throws IOException {
this(language, null);
}
/**
* Create a JLanguageTool and set up the built-in rules appropriate for the
* given language.
*
* @param language
*          the language to be used.
* @param motherTongue
*          the user's mother tongue or {@code null}. The mother tongue
*          may also be used as a source language for checking bilingual texts.
*
* @throws IOException
*/
public JLanguageTool(final Language language, final Language motherTongue)
throws IOException {
if (language == null) {
throw new NullPointerException("language cannot be null");
}
this.language = language;
this.motherTongue = motherTongue;
final ResourceBundle messages = getMessageBundle(language);
final Rule[] allBuiltinRules = getAllBuiltinRules(language, messages);
for (final Rule element : allBuiltinRules) {
if (element.supportsLanguage(language)) {
builtinRules.add(element);
}
}
disambiguator = language.getDisambiguator();
tagger = language.getTagger();
sentenceTokenizer = language.getSentenceTokenizer();
wordTokenizer = language.getWordTokenizer();
}
/**
* The grammar checker needs resources from the following
* directories:
*
* <ul>
* <li>{@code /resource}</li>
* <li>{@code /rules}</li>
* </ul>
*
* <p>This method is thread-safe.
*
* @return The currently set data broker, which allows obtaining
*         resources from the directories mentioned above. If no
*         data broker has been set, a new {@link DefaultResourceDataBroker}
*         will be instantiated and returned.
* @since 1.0.1
*/
public static synchronized ResourceDataBroker getDataBroker() {
if (JLanguageTool.dataBroker == null) {
JLanguageTool.dataBroker = new DefaultResourceDataBroker();
}
return JLanguageTool.dataBroker;
}
/**
* The grammar checker needs resources from the following
* directories:
*
* <ul>
* <li>{@code /resource}</li>
* <li>{@code /rules}</li>
* </ul>
*
* <p>This method is thread-safe.
*
* @param broker The new resource broker to be used.
* @since 1.0.1
*/
public static synchronized void setDataBroker(ResourceDataBroker broker) {
JLanguageTool.dataBroker = broker;
}
/**
* Whether the {@link #check(String)} method stores unknown words. If set to
* {@code true} (default: {@code false}), you can get the list of unknown words
* using {@link #getUnknownWords()}.
*/
public void setListUnknownWords(final boolean listUnknownWords) {
this.listUnknownWords = listUnknownWords;
}
/**
* Gets the ResourceBundle for the default language of the user's system.
*/
public static ResourceBundle getMessageBundle() {
try {
return ResourceBundle
.getBundle("de.danielnaber.languagetool.MessagesBundle");
} catch (final MissingResourceException e) {
return ResourceBundle.getBundle(
"de.danielnaber.languagetool.MessagesBundle", Locale.ENGLISH);
}
}
/**
* Gets the ResourceBundle for the given user interface language.
*/
private static ResourceBundle getMessageBundle(final Language lang) {
try {
return ResourceBundle.getBundle(
"de.danielnaber.languagetool.MessagesBundle", lang.getLocale());
} catch (final MissingResourceException e) {
return ResourceBundle.getBundle(
"de.danielnaber.languagetool.MessagesBundle", Locale.ENGLISH);
}
}
private Rule[] getAllBuiltinRules(final Language language,
final ResourceBundle messages) {
// use reflection to get a list of all non-pattern rules under
// "de.danielnaber.languagetool.rules"
// generic rules first, then language-specific ones
// TODO: the order of loading classes is not guaranteed so we may want to
// implement rule precedence
final List<Rule> rules = new ArrayList<Rule>();
try {
// we pass ".*Rule$" regexp to improve efficiency, see javadoc
final Class[] classes1 = ReflectionUtils.findClasses(Rule.class
.getClassLoader(), Rule.class.getPackage().getName(), ".*Rule$", 0,
Rule.class, null);
final Class[] classes2 = ReflectionUtils.findClasses(Rule.class
.getClassLoader(), Rule.class.getPackage().getName() + "."
+ language.getShortName(), ".*Rule$", 0, Rule.class, null);
final List<Class> classes = new ArrayList<Class>();
classes.addAll(Arrays.asList(classes1));
classes.addAll(Arrays.asList(classes2));
for (final Class class1 : classes) {
final Constructor[] constructors = class1.getConstructors();
for (final Constructor constructor : constructors) {
final Class[] paramTypes = constructor.getParameterTypes();
if (paramTypes.length == 1
&& paramTypes[0].equals(ResourceBundle.class)) {
rules.add((Rule) constructor.newInstance(messages));
break;
}
if (paramTypes.length == 2
&& paramTypes[0].equals(ResourceBundle.class)
&& paramTypes[1].equals(Language.class)) {
rules.add((Rule) constructor.newInstance(messages, language));
break;
}
throw new RuntimeException("Unknown constructor for rule class: "
+ class1.getName());
}
}
} catch (final Exception e) {
throw new RuntimeException("Failed to load rules for language " + language, e);
}
// System.err.println("Loaded " + rules.size() + " rules");
return rules.toArray(new Rule[rules.size()]);
}
/**
* Set a PrintStream that will receive verbose output. Set to
* {@code null} to disable verbose output.
*/
public void setOutput(final PrintStream printStream) {
this.printStream = printStream;
}
/**
* Load pattern rules from an XML file. Use {@link #addRule} to add these
* rules to the checking process.
*
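* <p>A usage sketch (the file path shown is only an example):
* <pre>
* List&lt;PatternRule&gt; rules = langTool.loadPatternRules(
*     JLanguageTool.getDataBroker().getRulesDir() + "/en/" + JLanguageTool.PATTERN_FILE);
* for (PatternRule rule : rules) {
*   langTool.addRule(rule);
* }
* </pre>
*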
* @throws IOException
* @return a List of {@link PatternRule} objects
*/
public List<PatternRule> loadPatternRules(final String filename)
throws IOException {
final PatternRuleLoader ruleLoader = new PatternRuleLoader();
InputStream is = this.getClass().getResourceAsStream(filename);
if (is == null) {
// happens for external rules plugged in as an XML file:
is = new FileInputStream(filename);
}
return ruleLoader.getRules(is, filename);
}
/**
* Load false friend rules from an XML file. Only those pairs will be loaded
* that match the current text language and the mother tongue specified in the
* JLanguageTool constructor. Use {@link #addRule} to add these rules to the
* checking process.
*
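* <p>A usage sketch; the file name mirrors what {@link #activateDefaultFalseFriendRules()} uses:
* <pre>
* List&lt;PatternRule&gt; rules = langTool.loadFalseFriendRules(
*     JLanguageTool.getDataBroker().getRulesDir() + "/" + JLanguageTool.FALSE_FRIEND_FILE);
* for (PatternRule rule : rules) {
*   langTool.addRule(rule);
* }
* </pre>
*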
* @throws ParserConfigurationException
* @throws SAXException
* @throws IOException
* @return a List of {@link PatternRule} objects
*/
public List<PatternRule> loadFalseFriendRules(final String filename)
throws ParserConfigurationException, SAXException, IOException {
if (motherTongue == null) {
return new ArrayList<PatternRule>();
}
final FalseFriendRuleLoader ruleLoader = new FalseFriendRuleLoader();
return ruleLoader.getRules(this.getClass().getResourceAsStream(filename),
language, motherTongue);
}
/**
* Loads and activates the pattern rules from
* {@code rules/<language>/grammar.xml}.
*
* @throws IOException
*/
public void activateDefaultPatternRules() throws IOException {
final String defaultPatternFilename = language.getRuleFileName();
final List<PatternRule> patternRules = loadPatternRules(defaultPatternFilename);
userRules.addAll(patternRules);
}
/**
* Loads and activates the false friend rules from
* {@code rules/false-friends.xml}.
*
* @throws ParserConfigurationException
* @throws SAXException
* @throws IOException
*/
public void activateDefaultFalseFriendRules()
throws ParserConfigurationException, SAXException, IOException {
final String falseFriendRulesFilename = JLanguageTool.getDataBroker().getRulesDir() + "/" + FALSE_FRIEND_FILE;
final List<PatternRule> patternRules = loadFalseFriendRules(falseFriendRulesFilename);
userRules.addAll(patternRules);
}
/**
* Add a rule to be used by the next call to {@link #check}.
*/
public void addRule(final Rule rule) {
userRules.add(rule);
}
/**
* Disable a given rule so {@link #check} won't use it.
*
* @param ruleId
* the id of the rule to disable
*/
public void disableRule(final String ruleId) {
// TODO: check if such a rule exists
disabledRules.add(ruleId);
}
/**
* Disable a given category so {@link #check} won't use it.
*
* @param categoryName
*          the name of the category to disable
*/
public void disableCategory(final String categoryName) {
// TODO: check if such a category exists
disabledCategories.add(categoryName);
}
/**
* Get the language that was used to configure this instance.
*/
public Language getLanguage() {
return language;
}
/**
* Get rule ids of the rules that have been explicitly disabled.
*/
public Set<String> getDisabledRules() {
return disabledRules;
}
/**
* Enable a rule that was switched off by default.
*
* @param ruleId
*          the id of the rule that is switched off by default and should be enabled.
*
*/
public void enableDefaultOffRule(final String ruleId) {
enabledRules.add(ruleId);
}
/**
* Get the names of the categories that have been explicitly disabled.
*/
public Set<String> getDisabledCategories() {
return disabledCategories;
}
/**
* Re-enable a given rule so {@link #check} will use it.
*
* @param ruleId
* the id of the rule to enable
*/
public void enableRule(final String ruleId) {
if (disabledRules.contains(ruleId)) {
disabledRules.remove(ruleId);
}
}
/**
* Returns tokenized sentences.
*/
public List<String> sentenceTokenize(final String text) {
return sentenceTokenizer.tokenize(text);
}
/**
* The main check method. Tokenizes the text into sentences and matches these
* sentences against all currently active rules.
*
* @param text
* the text to check
* @return a List of {@link RuleMatch} objects
* @throws IOException
*/
public List<RuleMatch> check(final String text) throws IOException {
return check(text, true, paragraphHandling.NORMAL);
}
/**
* The main check method. Tokenizes the text into sentences and matches these
* sentences against all currently active rules.
*
* @param text
* the text to check
* @param tokenizeText
*          If true, then the text is tokenized into sentences.
*          Otherwise, the whole text is treated as a single sentence.
* @param paraMode
*          Determines whether only paragraph-level rules, only
*          sentence-level rules, or all rules are run.
* @return a List of {@link RuleMatch} objects
* @throws IOException
*/
public List<RuleMatch> check(final String text, boolean tokenizeText, final paragraphHandling paraMode) throws IOException {
sentenceCount = 0;
final List<String> sentences;
if (tokenizeText) {
sentences = sentenceTokenize(text);
} else {
sentences = new ArrayList<String>();
sentences.add(text);
}
final List<RuleMatch> ruleMatches = new ArrayList<RuleMatch>();
final List<Rule> allRules = getAllRules();
printIfVerbose(allRules.size() + " rules activated for language "
+ language);
int charCount = 0;
int lineCount = 0;
int columnCount = 1;
unknownWords = new HashSet<String>();
for (final String sentence : sentences) {
sentenceCount++;
AnalyzedSentence analyzedText = getAnalyzedSentence(sentence);
rememberUnknownWords(analyzedText);
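// mark the last token of the last sentence as the end of the paragraph: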
if (sentenceCount == sentences.size()) {
final AnalyzedTokenReadings[] anTokens = analyzedText.getTokens();
anTokens[anTokens.length - 1].setParaEnd();
analyzedText = new AnalyzedSentence(anTokens);
}
printIfVerbose(analyzedText.toString());
final List<RuleMatch> sentenceMatches =
checkAnalyzedSentence(paraMode, allRules, charCount, lineCount,
columnCount, sentence, analyzedText);
Collections.sort(sentenceMatches);
ruleMatches.addAll(sentenceMatches);
charCount += sentence.length();
lineCount += countLineBreaks(sentence);
// calculate matching column:
final int lineBreakPos = sentence.indexOf('\n');
if (lineBreakPos == -1) {
columnCount += sentence.length() - 1;
} else {
if (lineBreakPos == 0) {
columnCount = sentence.length();
if (!language.getSentenceTokenizer().
singleLineBreaksMarksPara()) {
columnCount--;
}
} else {
columnCount = 1;
}
}
}
if (!paraMode.equals(paragraphHandling.ONLYNONPARA)) {
// removing false positives in paragraph-level rules
for (final Rule rule : allRules) {
if (rule.isParagraphBackTrack() && (rule.getMatches() != null)) {
final List<RuleMatch> rm = rule.getMatches();
for (final RuleMatch r : rm) {
if (rule.isInRemoved(r)) {
ruleMatches.remove(r);
}
}
}
}
}
return ruleMatches;
}
public List<RuleMatch> checkAnalyzedSentence(final paragraphHandling paraMode,
final List<Rule> allRules, int charCount, int lineCount,
int columnCount, final String sentence, AnalyzedSentence analyzedText)
throws IOException {
final List<RuleMatch> sentenceMatches = new ArrayList<RuleMatch>();
for (final Rule rule : allRules) {
if (disabledRules.contains(rule.getId())
|| (rule.isDefaultOff() && !enabledRules.contains(rule.getId()))) {
continue;
}
if (disabledCategories.contains(rule.getCategory().getName())) {
continue;
}
switch (paraMode) {
case ONLYNONPARA: {
if (rule.isParagraphBackTrack()) {
continue;
}
break;
}
case ONLYPARA: {
if (!rule.isParagraphBackTrack()) {
continue;
}
break;
}
case NORMAL:
default:
}
final RuleMatch[] thisMatches = rule.match(analyzedText);
for (final RuleMatch element1 : thisMatches) {
RuleMatch thisMatch = adjustRuleMatchPos(element1,
charCount, columnCount, lineCount, sentence);
sentenceMatches.add(thisMatch);
if (rule.isParagraphBackTrack()) {
rule.addRuleMatch(thisMatch);
}
}
}
return sentenceMatches;
}
/**
* Change RuleMatch positions so that they are relative to the complete text,
* not just to the sentence.
*
* @param rm the RuleMatch to adjust
* @param sentLen number of characters that precede the current sentence in the complete text
* @param columnCount current column number
* @param lineCount current line number
* @param sentence the sentence being checked
* @return a new RuleMatch object with adjusted positions
*/
public RuleMatch adjustRuleMatchPos(final RuleMatch rm, int sentLen,
int columnCount, int lineCount, final String sentence) {
final RuleMatch thisMatch = new RuleMatch(rm.getRule(),
rm.getFromPos() + sentLen, rm.getToPos()
+ sentLen, rm.getMessage(), rm
.getShortMessage());
thisMatch.setSuggestedReplacements(rm
.getSuggestedReplacements());
final String sentencePartToError = sentence.substring(0, rm
.getFromPos());
final String sentencePartToEndOfError = sentence.substring(0,
rm.getToPos());
final int lastLineBreakPos = sentencePartToError.lastIndexOf('\n');
final int column;
final int endColumn;
if (lastLineBreakPos == -1) {
column = sentencePartToError.length() + columnCount;
} else {
column = sentencePartToError.length() - lastLineBreakPos;
}
final int lastLineBreakPosInError = sentencePartToEndOfError
.lastIndexOf('\n');
if (lastLineBreakPosInError == -1) {
endColumn = sentencePartToEndOfError.length() + columnCount + 1;
} else {
endColumn = sentencePartToEndOfError.length() - lastLineBreakPosInError;
}
final int lineBreaksToError = countLineBreaks(sentencePartToError);
final int lineBreaksToEndOfError = countLineBreaks(sentencePartToEndOfError);
thisMatch.setLine(lineCount + lineBreaksToError);
thisMatch.setEndLine(lineCount + lineBreaksToEndOfError);
thisMatch.setColumn(column);
thisMatch.setEndColumn(endColumn);
thisMatch.setOffset(rm.getFromPos() + sentLen);
return thisMatch;
}
private void rememberUnknownWords(final AnalyzedSentence analyzedText) {
if (listUnknownWords) {
final AnalyzedTokenReadings[] atr = analyzedText
.getTokensWithoutWhitespace();
for (final AnalyzedTokenReadings t : atr) {
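// unknown words have no POS tag, which shows up as "null]" in the readings' string representation: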
if (t.getReadings().toString().contains("null]")) {
unknownWords.add(t.getToken());
}
}
}
}
/**
* Get the list of unknown words in the last run of the check() method.
*
* @throws IllegalStateException
* if listUnknownWords is set to false
*/
public List<String> getUnknownWords() {
if (!listUnknownWords) {
throw new IllegalStateException(
"listUnknownWords is set to false, unknown words not stored");
}
final List<String> words = new ArrayList<String>(unknownWords);
Collections.sort(words);
return words;
}
static int countLineBreaks(final String s) {
int pos = -1;
int count = 0;
while (true) {
final int nextPos = s.indexOf('\n', pos + 1);
if (nextPos == -1) {
break;
}
pos = nextPos;
count++;
}
return count;
}
/**
* Tokenizes the given sentence into words, analyzes it,
* and then disambiguates the POS tags.
*
* @throws IOException
*/
public AnalyzedSentence getAnalyzedSentence(final String sentence)
throws IOException {
// disambiguate assigned tags & return
return disambiguator.disambiguate(getRawAnalyzedSentence(sentence));
}
/**
* Tokenizes the given sentence into words and analyzes it,
* without disambiguating the POS tags.
*
* @since 0.9.8
* @param sentence sentence to be analyzed
* @return the analyzed sentence
* @throws IOException
*/
public AnalyzedSentence getRawAnalyzedSentence(final String sentence) throws IOException {
final List<String> tokens = wordTokenizer.tokenize(sentence);
final Map<Integer, String> softHyphenTokens = new HashMap<Integer, String>();
// for soft hyphens inside words, which happen especially in OOo:
for (int i = 0; i < tokens.size(); i++) {
if (tokens.get(i).indexOf('\u00ad') != -1) {
softHyphenTokens.put(i, tokens.get(i));
tokens.set(i, tokens.get(i).replaceAll("\u00ad", ""));
}
}
final List<AnalyzedTokenReadings> aTokens = tagger.tag(tokens);
final int numTokens = aTokens.size();
int posFix = 0;
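// posFix compensates for the soft-hyphen characters that were removed above: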
for (int i = 1; i < numTokens; i++) {
aTokens.get(i).setWhitespaceBefore(aTokens.get(i - 1).isWhitespace());
aTokens.get(i).setStartPos(aTokens.get(i).getStartPos() + posFix);
if (!softHyphenTokens.isEmpty()) {
if (softHyphenTokens.get(i) != null) {
aTokens.get(i).addReading(tagger.createToken(softHyphenTokens.get(i), null));
posFix += softHyphenTokens.get(i).length() - aTokens.get(i).getToken().length();
}
}
}
final AnalyzedTokenReadings[] tokenArray = new AnalyzedTokenReadings[tokens
.size() + 1];
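// the first token of each sentence is an artificial token carrying the SENT_START tag: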
final AnalyzedToken[] startTokenArray = new AnalyzedToken[1];
int toArrayCount = 0;
final AnalyzedToken sentenceStartToken = new AnalyzedToken("", SENTENCE_START_TAGNAME, null);
startTokenArray[0] = sentenceStartToken;
tokenArray[toArrayCount++] = new AnalyzedTokenReadings(startTokenArray, 0);
int startPos = 0;
for (final AnalyzedTokenReadings posTag : aTokens) {
posTag.setStartPos(startPos);
tokenArray[toArrayCount++] = posTag;
startPos += posTag.getToken().length();
}
// add additional tags
int lastToken = toArrayCount - 1;
// make SENT_END appear at last not whitespace token
for (int i = 0; i < toArrayCount - 1; i++) {
if (!tokenArray[lastToken - i].isWhitespace()) {
lastToken -= i;
break;
}
}
tokenArray[lastToken].setSentEnd();
if (tokenArray.length == lastToken + 1 && tokenArray[lastToken].isLinebreak()) {
tokenArray[lastToken].setParaEnd();
}
return new AnalyzedSentence(tokenArray);
}
/**
* Get all rules for the current language that are built-in or that have been
* added using {@link #addRule}.
* @return a List of {@link Rule} objects
*/
public List<Rule> getAllRules() {
final List<Rule> rules = new ArrayList<Rule>();
rules.addAll(builtinRules);
rules.addAll(userRules);
// Some rules have an internal state so they can do checks over sentence
// boundaries. These need to be reset so the checks don't suddenly
// work on different texts with the same data. However, it could be useful
// to keep the state information if we're checking a continuous text.
for (final Rule rule : rules) {
rule.reset();
}
return rules;
}
/**
* Number of sentences the latest call to check() has checked.
*/
public int getSentenceCount() {
return sentenceCount;
}
private void printIfVerbose(final String s) {
if (printStream != null) {
printStream.println(s);
}
}
}