summaryrefslogtreecommitdiffstats
path: root/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns
diff options
context:
space:
mode:
Diffstat (limited to 'JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns')
-rw-r--r--JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/AbstractPatternRule.java223
-rw-r--r--JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/Element.java803
-rw-r--r--JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/FalseFriendRuleLoader.java356
-rw-r--r--JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/Match.java551
-rw-r--r--JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/PatternRule.java652
-rw-r--r--JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/PatternRuleLoader.java369
-rw-r--r--JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/Unifier.java432
-rw-r--r--JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/XMLRuleHandler.java568
-rw-r--r--JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/bitext/BitextPatternRule.java93
-rw-r--r--JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/bitext/BitextPatternRuleLoader.java413
-rw-r--r--JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/bitext/BitextXMLRuleHandler.java56
-rw-r--r--JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/bitext/FalseFriendsAsBitextLoader.java72
12 files changed, 4588 insertions, 0 deletions
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/AbstractPatternRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/AbstractPatternRule.java
new file mode 100644
index 0000000..d172134
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/AbstractPatternRule.java
@@ -0,0 +1,223 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2008 Daniel Naber (http://www.danielnaber.de)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+
+package de.danielnaber.languagetool.rules.patterns;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import de.danielnaber.languagetool.AnalyzedSentence;
+import de.danielnaber.languagetool.AnalyzedToken;
+import de.danielnaber.languagetool.AnalyzedTokenReadings;
+import de.danielnaber.languagetool.Language;
+import de.danielnaber.languagetool.rules.Rule;
+import de.danielnaber.languagetool.rules.RuleMatch;
+
+/**
+ * An Abstract Pattern Rule that describes a pattern of words or part-of-speech tags
+ * used for PatternRule and DisambiguationPatternRule.
+ *
+ * Introduced to minimize code duplication between those classes.
+ *
+ * @author Marcin Miłkowski
+ */
+
+public abstract class AbstractPatternRule extends Rule {
+
+ private final String id;
+
+ private final String description;
+
+ protected final List<Element> patternElements;
+
+ protected Unifier unifier;
+
+ protected final Language language;
+
+ protected int startPositionCorrection;
+
+ protected int endPositionCorrection;
+
+ protected boolean prevMatched;
+
+ protected final boolean testUnification;
+
+ private final boolean getUnified;
+
+ private boolean groupsOrUnification;
+
+ protected AnalyzedTokenReadings[] unifiedTokens;
+
+ protected final boolean sentStart;
+
+ /**
+  * Creates a pattern rule from its identifier, description, language and
+  * the list of pattern elements.
+  *
+  * @param id
+  *          Identifier returned by {@link #getId()}.
+  * @param description
+  *          Text returned by {@link #getDescription()}.
+  * @param language
+  *          Language of the rule; also supplies the unifier.
+  * @param elements
+  *          Pattern elements; the list is copied, and its first element
+  *          decides whether the pattern starts at the sentence start.
+  * @param getUnified
+  *          True if the unified tokens should be stored after unification.
+  */
+ public AbstractPatternRule(final String id,
+     final String description,
+     final Language language,
+     final List<Element> elements,
+     boolean getUnified) {
+   this.id = id;
+   this.description = description;
+   // keep a private copy so later changes to the caller's list have no effect
+   this.patternElements = new ArrayList<Element>(elements);
+   this.language = language;
+   this.getUnified = getUnified;
+   this.unifier = language.getUnifier();
+   this.testUnification = initUnifier();
+   this.sentStart = patternElements.get(0).isSentStart();
+   // the extra per-reading checks are needed for unification or for any AND group
+   boolean extraChecksNeeded = testUnification;
+   for (int i = 0; !extraChecksNeeded && i < patternElements.size(); i++) {
+     extraChecksNeeded = patternElements.get(i).hasAndGroup();
+   }
+   this.groupsOrUnification = extraChecksNeeded;
+ }
+
+ /**
+  * Determines whether unification has to be tested for this pattern.
+  *
+  * @return true if at least one pattern element is marked as unified.
+  */
+ private boolean initUnifier() {
+   boolean anyUnified = false;
+   for (int i = 0; i < patternElements.size() && !anyUnified; i++) {
+     anyUnified = patternElements.get(i).isUnified();
+   }
+   return anyUnified;
+ }
+
+ /**
+  * @return the rule id, the pattern elements and the description, separated
+  *         by colons.
+  */
+ @Override
+ public final String toString() {
+   final StringBuilder text = new StringBuilder();
+   text.append(id).append(":").append(patternElements).append(":").append(description);
+   return text.toString();
+ }
+
+ @Override
+ public String getDescription() {
+ return description;
+ }
+
+ @Override
+ public String getId() {
+ return id;
+ }
+
+ /**
+  * Placeholder implementation that performs no matching.
+  *
+  * @param text
+  *          The analyzed sentence (ignored here).
+  * @return always null in this class.
+  *         NOTE(review): concrete rules presumably override this method;
+  *         callers of this base implementation must cope with null.
+  */
+ @Override
+ public RuleMatch[] match(AnalyzedSentence text) throws IOException {
+ // TODO Auto-generated method stub
+ return null;
+ }
+
+ @Override
+ public void reset() {
+ // TODO Auto-generated method stub
+ }
+
+ public final void setStartPositionCorrection(final int startPositionCorrection) {
+ this.startPositionCorrection = startPositionCorrection;
+ }
+
+ public final void setEndPositionCorrection(final int endPositionCorrection) {
+ this.endPositionCorrection = endPositionCorrection;
+ }
+
+
+ protected void setupAndGroup(final int firstMatchToken,
+ final Element elem, final AnalyzedTokenReadings[] tokens)
+ throws IOException {
+ if (elem.hasAndGroup()) {
+ for (final Element andElement : elem.getAndGroup()) {
+ if (andElement.isReferenceElement()) {
+ setupRef(firstMatchToken, andElement, tokens);
+ }
+ }
+ elem.setupAndGroup();
+ }
+ }
+
+ /**
+  * Compiles an element that refers to another token, using the token at
+  * {@code firstMatchToken} plus the element's token reference. Elements
+  * that are not references, and references pointing past the end of
+  * {@code tokens}, are left untouched.
+  */
+ //TODO: add .compile for all exceptions of the element?
+ protected void setupRef(final int firstMatchToken, final Element elem,
+     final AnalyzedTokenReadings[] tokens) throws IOException {
+   if (!elem.isReferenceElement()) {
+     return;
+   }
+   final int referencedPosition = firstMatchToken + elem.getMatch().getTokenRef();
+   if (referencedPosition >= tokens.length) {
+     return;
+   }
+   elem.compile(tokens[referencedPosition], language.getSynthesizer());
+ }
+
+ /**
+  * Tests every reading of the token at {@code tokenNo} against an element.
+  *
+  * Side effect: updates the {@code prevMatched} field when a scope-next
+  * exception of the previous element matches one of the readings.
+  *
+  * @param tokens
+  *          The tokens of the sentence.
+  * @param elem
+  *          The element to test.
+  * @param prevElement
+  *          The previously tested element; may be null.
+  * @param tokenNo
+  *          Index of the token to test.
+  * @param firstMatchToken
+  *          Position of the first matched token, used to resolve references.
+  * @param prevSkipNext
+  *          Skip value of the previous element; its scope-next exceptions are
+  *          only consulted when this is greater than zero.
+  * @return true if the element matches the token and no exception applies.
+  */
+ protected boolean testAllReadings(final AnalyzedTokenReadings[] tokens,
+ final Element elem, final Element prevElement, final int tokenNo,
+ final int firstMatchToken, final int prevSkipNext) throws IOException {
+ boolean thisMatched = false;
+ final int numberOfReadings = tokens[tokenNo].getReadingsLength();
+ // resolve references inside the AND group and reset its match state
+ setupAndGroup(firstMatchToken, elem, tokens);
+ for (int l = 0; l < numberOfReadings; l++) {
+ final AnalyzedToken matchToken = tokens[tokenNo].getAnalyzedToken(l);
+ // && binds tighter than ||: once prevMatched is true it stays true,
+ // otherwise the scope-next exception of the previous element is tested
+ prevMatched = prevMatched || prevSkipNext > 0 && prevElement != null
+ && prevElement.isMatchedByScopeNextException(matchToken);
+ if (prevMatched) {
+ return false;
+ }
+ // short-circuit: after the first matching reading, isMatched() is no
+ // longer called for the remaining readings
+ thisMatched = thisMatched || elem.isMatched(matchToken);
+ if (!thisMatched && !elem.isInflected() && elem.getPOStag() == null
+ && (prevElement != null && prevElement.getExceptionList() == null)) {
+ return false; // the token is the same, we will not get a match
+ }
+ if (groupsOrUnification) {
+ // the second argument tells the callee whether this is the last reading
+ thisMatched &= testUnificationAndGroups(thisMatched,
+ l + 1 == numberOfReadings, matchToken, elem);
+ }
+ }
+ if (thisMatched) {
+ // an exception matching any reading of the token cancels the match
+ for (int l = 0; l < numberOfReadings; l++) {
+ if (elem.isExceptionMatchedCompletely(tokens[tokenNo].getAnalyzedToken(l)))
+ return false;
+ }
+ // exceptions with scope "previous" are tested against the preceding token
+ if (tokenNo > 0 && elem.hasPreviousException()) {
+ if (elem.isMatchedByPreviousException(tokens[tokenNo - 1]))
+ return false;
+ }
+ }
+ return thisMatched;
+ }
+
+ /**
+  * Applies the unification test (if the pattern uses unification) and the
+  * AND-group bookkeeping for a single reading of a token.
+  *
+  * @param matched
+  *          Whether the element has matched so far.
+  * @param lastReading
+  *          True if {@code matchToken} is the last reading of the token; the
+  *          AND group is only evaluated then.
+  * @param matchToken
+  *          The reading being tested.
+  * @param elem
+  *          The element being tested.
+  * @return the match value after unification and AND-group checks.
+  */
+ protected boolean testUnificationAndGroups(final boolean matched,
+ final boolean lastReading, final AnalyzedToken matchToken,
+ final Element elem) {
+ boolean thisMatched = matched;
+ if (testUnification) {
+ if (matched && elem.isUnified()) {
+ thisMatched = thisMatched && unifier.isUnified(matchToken, elem.getUniFeatures(),
+ elem.isUniNegated(), lastReading);
+ }
+ if (thisMatched && getUnified) {
+ // remember the unified tokens for subclasses that asked for them
+ unifiedTokens = unifier.getFinalUnified();
+ }
+ if (!elem.isUnified()) {
+ // an element outside the unification resets the unifier state
+ unifier.reset();
+ }
+ }
+ // record which members of the AND group match this reading
+ elem.addMemberAndGroup(matchToken);
+ if (lastReading) {
+ thisMatched &= elem.checkAndGroup(thisMatched);
+ }
+ return thisMatched;
+ }
+
+
+}
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/Element.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/Element.java
new file mode 100644
index 0000000..0ad7c1f
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/Element.java
@@ -0,0 +1,803 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+package de.danielnaber.languagetool.rules.patterns;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Map;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import de.danielnaber.languagetool.AnalyzedToken;
+import de.danielnaber.languagetool.AnalyzedTokenReadings;
+import de.danielnaber.languagetool.JLanguageTool;
+import de.danielnaber.languagetool.synthesis.Synthesizer;
+import de.danielnaber.languagetool.tools.StringTools;
+
+/**
+ * A part of a pattern.
+ *
+ * @author Daniel Naber
+ */
+public class Element {
+
+ private String stringToken;
+ private String posToken;
+ private String regToken;
+ private boolean posRegExp;
+
+ private boolean negation;
+ private boolean posNegation;
+
+ private final boolean caseSensitive;
+ private final boolean stringRegExp;
+ private boolean inflected;
+
+ private boolean testWhitespace;
+ private boolean whitespaceBefore;
+
+ /**
+ * List of exceptions that are valid for the current token and / or some next
+ * tokens.
+ */
+ private List<Element> exceptionList;
+
+ /**
+ * True if scope=="next".
+ */
+ private boolean exceptionValidNext;
+
+ /**
+ * True if any exception with a scope=="current" or scope=="next" is set for
+ * the element.
+ */
+ private boolean exceptionSet;
+
+ /**
+ * True if attribute scope=="previous".
+ */
+ private boolean exceptionValidPrevious;
+
+ /**
+ * List of exceptions that are valid for a previous token.
+ */
+ private List<Element> previousExceptionList;
+
+ private List<Element> andGroupList;
+ private boolean andGroupSet;
+ private boolean[] andGroupCheck;
+
+ private int skip;
+
+ private Pattern p;
+ private Pattern pPos;
+
+ private Matcher m;
+ private Matcher mPos;
+
+ /** The reference to another element in the pattern. **/
+ private Match tokenReference;
+
+ /**
+ * True when the element stores a formatted reference to another element of
+ * the pattern.
+ */
+ private boolean containsMatches;
+
+ /** Matches only tokens without any POS tag. **/
+ private static final String UNKNOWN_TAG = "UNKNOWN";
+
+ /**
+ * Parameter passed to regular expression matcher to enable case insensitive
+ * Unicode matching.
+ */
+ private static final String CASE_INSENSITIVE = "(?iu)";
+
+ private String referenceString;
+
+ /** String ID of the phrase the element is in. **/
+ private String phraseName;
+
+ /**
+ * Set by {@link #setStringElement}: true when the element has a non-empty
+ * string token, so that {@link #isMatched} has to test it. String matching
+ * takes most time, so it's best to skip it whenever possible.
+ **/
+ private boolean testString;
+
+ /**
+ * Tells if the element is inside the unification, so that {@link Unifier}
+ * tests it.
+ */
+ private boolean unified;
+ private boolean uniNegation;
+
+ private Map<String, List<String>> unificationFeatures;
+
+ /**
+ * Creates Element that is used to match tokens in the text.
+ *
+ * @param token
+ * String to be matched
+ * @param caseSensitive
+ * True if the check is case-sensitive.
+ * @param regExp
+ * True if the check uses regular expressions.
+ * @param inflected
+ * True if the check refers to base forms (lemmas).
+ */
+ public Element(final String token, final boolean caseSensitive,
+ final boolean regExp, final boolean inflected) {
+ this.caseSensitive = caseSensitive;
+ this.stringRegExp = regExp;
+ this.inflected = inflected;
+ setStringElement(token);
+ }
+
+ /**
+  * Checks whether the rule element matches the token given as a parameter.
+  * The whitespace requirement (if set), the string token and the POS token
+  * must all agree, each taking its own negation flag into account. If the
+  * element has an AND group, the result is also recorded in the first slot
+  * of the group's match state.
+  *
+  * @param token
+  *          {@link AnalyzedToken} to check matching against
+  * @return True if token matches, false otherwise.
+  */
+ public final boolean isMatched(final AnalyzedToken token) {
+   if (testWhitespace && !isWhitespaceBefore(token)) {
+     return false;
+   }
+   // without a string token only the negation flag decides the string part
+   final boolean stringPartOk = testString
+       ? isStringTokenMatched(token) != negation
+       : !negation;
+   final boolean result = stringPartOk
+       && isPosTokenMatched(token) != posNegation;
+   if (andGroupSet) {
+     andGroupCheck[0] = andGroupCheck[0] || result;
+   }
+   return result;
+ }
+
+ /**
+  * Checks whether an exception that applies to the current token matches.
+  * Exceptions with scope "next" are ignored here.
+  *
+  * @param token
+  *          {@link AnalyzedToken} to check matching against
+  * @return True if any of the exceptions matches (logical disjunction).
+  */
+ public final boolean isExceptionMatched(final AnalyzedToken token) {
+   if (!exceptionSet) {
+     return false;
+   }
+   for (final Element candidate : exceptionList) {
+     if (!candidate.exceptionValidNext && candidate.isMatched(token)) {
+       return true;
+     }
+   }
+   return false;
+ }
+
+ /**
+ * Enables testing multiple conditions specified by different elements.
+ * Doesn't test exceptions.
+ *
+ * Works as logical AND operator only if preceded with
+ * {@link #setupAndGroup()}, and followed by {@link #checkAndGroup(boolean)}.
+ *
+ * @param token
+ * AnalyzedToken - the token checked.
+ */
+ public final void addMemberAndGroup(final AnalyzedToken token) {
+ if (andGroupSet) {
+ for (int i = 0; i < andGroupList.size(); i++) {
+ if (!andGroupCheck[i + 1]) {
+ final Element testAndGroup = andGroupList.get(i);
+ if (testAndGroup.isMatched(token)) {
+ andGroupCheck[i + 1] = true;
+ }
+ }
+ }
+ }
+ }
+
+ /**
+  * Resets the match state of the AND group: one slot for the element itself
+  * plus one slot per group member, all initially false. Does nothing if the
+  * element has no AND group.
+  */
+ public final void setupAndGroup() {
+   if (!andGroupSet) {
+     return;
+   }
+   // a freshly allocated boolean array is already filled with false
+   andGroupCheck = new boolean[andGroupList.size() + 1];
+ }
+
+ /**
+  * Evaluates the AND group after all readings of a token have been tested.
+  *
+  * @param previousValue
+  *          Value to return unchanged if the element has no AND group.
+  * @return true if the element and every member of its AND group matched;
+  *         {@code previousValue} if there is no AND group.
+  */
+ public final boolean checkAndGroup(final boolean previousValue) {
+   if (!andGroupSet) {
+     return previousValue;
+   }
+   for (final boolean slotMatched : andGroupCheck) {
+     if (!slotMatched) {
+       return false;
+     }
+   }
+   return true;
+ }
+
+ /**
+  * Tests the exceptions of the elements in this element's AND group.
+  *
+  * The members are combined as a logical disjunction: a matching exception
+  * in any single member of the group is enough.
+  *
+  * @param token
+  *          AnalyzedToken - the token checked for exceptions.
+  * @return true if an exception of at least one group member matches, false
+  *         otherwise (also when there is no AND group).
+  */
+ public final boolean isAndExceptionGroupMatched(final AnalyzedToken token) {
+   if (!andGroupSet) {
+     return false;
+   }
+   for (final Element member : andGroupList) {
+     if (member.isExceptionMatched(token)) {
+       return true;
+     }
+   }
+   return false;
+ }
+
+ /**
+  * This method checks exceptions both in the token and in its AND-group.
+  * Introduced for clarity.
+  *
+  * @param token
+  *          Token to match
+  * @return True if matched.
+  */
+ public final boolean isExceptionMatchedCompletely(final AnalyzedToken token) {
+   // the AND group is only consulted when the element's own exceptions fail
+   if (isExceptionMatched(token)) {
+     return true;
+   }
+   return isAndExceptionGroupMatched(token);
+ }
+
+ /**
+  * Adds an element to the AND group of this element, creating the group on
+  * first use. A null argument is ignored.
+  *
+  * @param andToken
+  *          Element that has to match in addition to this one.
+  */
+ public final void setAndGroupElement(final Element andToken) {
+   if (andToken == null) {
+     return;
+   }
+   if (andGroupList == null) {
+     andGroupList = new ArrayList<Element>();
+   }
+   andGroupSet = true;
+   andGroupList.add(andToken);
+ }
+
+ /**
+ * Checks if this element has an AND group associated with it.
+ *
+ * @return true if the element has a group of elements that all should match.
+ */
+ public final boolean hasAndGroup() {
+ return andGroupSet;
+ }
+
+ /**
+ * Returns the group of elements linked with AND operator.
+ *
+ * @return List of Elements.
+ */
+ public final List<Element> getAndGroup() {
+ return andGroupList;
+ }
+
+ /**
+  * Checks whether a previously set exception matches (in case the exception
+  * had scope == "next"). Exceptions with any other scope are ignored.
+  *
+  * @param token
+  *          {@link AnalyzedToken} to check matching against.
+  * @return True if any of the exceptions matches.
+  */
+ public final boolean isMatchedByScopeNextException(final AnalyzedToken token) {
+   if (!exceptionSet) {
+     return false;
+   }
+   for (final Element candidate : exceptionList) {
+     if (candidate.exceptionValidNext && candidate.isMatched(token)) {
+       return true;
+     }
+   }
+   return false;
+ }
+
+ /**
+  * Checks whether an exception for a previous token matches (in case the
+  * exception had scope == "previous").
+  *
+  * @param token
+  *          {@link AnalyzedToken} to check matching against.
+  * @return True if any of the exceptions matches.
+  */
+ public final boolean isMatchedByPreviousException(final AnalyzedToken token) {
+   if (!exceptionValidPrevious) {
+     return false;
+   }
+   for (final Element candidate : previousExceptionList) {
+     if (!candidate.exceptionValidNext && candidate.isMatched(token)) {
+       return true;
+     }
+   }
+   return false;
+ }
+
+ /**
+  * Checks whether an exception for a previous token matches at least one
+  * reading of a given token (in case the exception had scope == "previous").
+  *
+  * @param prevToken
+  *          {@link AnalyzedTokenReadings} to check matching against.
+  * @return true if any of the exceptions matches any of the readings.
+  */
+ public final boolean isMatchedByPreviousException(
+     final AnalyzedTokenReadings prevToken) {
+   final int readingCount = prevToken.getReadingsLength();
+   int index = 0;
+   while (index < readingCount) {
+     if (isMatchedByPreviousException(prevToken.getAnalyzedToken(index))) {
+       return true;
+     }
+     index++;
+   }
+   return false;
+ }
+
+ /**
+ * Checks if the token is a SENT_START.
+ *
+ * @return True if the element starts the sentence and the element hasn't been
+ * set to have negated POS token.
+ *
+ */
+ public final boolean isSentStart() {
+ return JLanguageTool.SENTENCE_START_TAGNAME.equals(posToken)
+ && !posNegation;
+ }
+
+ @Override
+ public final String toString() {
+ final StringBuilder sb = new StringBuilder();
+ if (negation) {
+ sb.append('!');
+ }
+ sb.append(stringToken);
+ if (phraseName != null) {
+ sb.append(" {");
+ sb.append(phraseName);
+ sb.append('}');
+ }
+ if (posToken != null) {
+ sb.append('/');
+ sb.append(posToken);
+ }
+ return sb.toString();
+ }
+
+ /**
+  * Sets the part-of-speech tag the element has to match.
+  *
+  * @param posToken
+  *          The POS tag, or a regular expression over POS tags.
+  * @param regExp
+  *          True if {@code posToken} is a regular expression; it is compiled
+  *          immediately.
+  * @param negation
+  *          True if the POS test is negated.
+  */
+ public final void setPosElement(final String posToken, final boolean regExp,
+     final boolean negation) {
+   this.posToken = posToken;
+   this.posNegation = negation;
+   this.posRegExp = regExp;
+   if (regExp) {
+     this.pPos = Pattern.compile(posToken);
+   }
+ }
+
+ public final String getString() {
+ return stringToken;
+ }
+
+ /**
+  * Sets the string the element has to match. For regular-expression
+  * elements the pattern is compiled here (case-insensitively unless the
+  * element is case sensitive); the literal token "\0" is stored but not
+  * compiled.
+  *
+  * @param token
+  *          String to be matched; an empty value disables string testing.
+  */
+ public final void setStringElement(final String token) {
+   this.stringToken = token;
+   this.testString = !StringTools.isEmpty(stringToken);
+   if (!testString || !stringRegExp) {
+     return;
+   }
+   regToken = caseSensitive ? stringToken : CASE_INSENSITIVE + stringToken;
+   if (!"\\0".equals(token)) {
+     p = Pattern.compile(regToken);
+   }
+ }
+
+ /**
+ * Sets a POS-type exception for matching string tokens.
+ *
+ * @param posToken
+ * The part of the speech tag in the exception.
+ * @param regExp
+ * True if the POS is specified as a regular expression.
+ * @param negation
+ * True if the exception is negated.
+ * @param scopeNext
+ * True if the exception scope is next tokens.
+ * @param scopePrevious
+ * True if the exception should match only a single previous token.
+ */
+ public final void setPosException(final String posToken,
+ final boolean regExp, final boolean negation, final boolean scopeNext,
+ final boolean scopePrevious) {
+ final Element posException = new Element("", this.caseSensitive, false,
+ false);
+ posException.setPosElement(posToken, regExp, negation);
+ posException.exceptionValidNext = scopeNext;
+ setException(posException, scopePrevious);
+ }
+
+ /**
+ * Sets a string-type exception for matching string tokens.
+ *
+ * @param token
+ * The string in the exception.
+ * @param regExp
+ * True if the string is specified as a regular expression.
+ * @param inflected
+ * True if the string is a base form (lemma).
+ * @param negation
+ * True if the exception is negated.
+ * @param scopeNext
+ * True if the exception scope is next tokens.
+ * @param scopePrevious
+ * True if the exception should match only a single previous token.
+ */
+ public final void setStringException(final String token,
+ final boolean regExp, final boolean inflected, final boolean negation,
+ final boolean scopeNext, final boolean scopePrevious) {
+ final Element stringException = new Element(token, this.caseSensitive,
+ regExp, inflected);
+ stringException.setNegation(negation);
+ stringException.exceptionValidNext = scopeNext;
+ setException(stringException, scopePrevious);
+ }
+
+ /**
+  * Stores an exception element in the list that corresponds to its scope.
+  *
+  * @param elem
+  *          The exception to add.
+  * @param scopePrevious
+  *          True if the exception applies to the previous token; otherwise it
+  *          is added to the current/next-scope exceptions.
+  */
+ private void setException(final Element elem, final boolean scopePrevious) {
+   if (scopePrevious) {
+     exceptionValidPrevious = true;
+     if (previousExceptionList == null) {
+       previousExceptionList = new ArrayList<Element>();
+     }
+     previousExceptionList.add(elem);
+   } else {
+     if (exceptionList == null) {
+       exceptionList = new ArrayList<Element>();
+     }
+     exceptionSet = true;
+     exceptionList.add(elem);
+   }
+ }
+
+ /**
+ * Tests if part of speech matches a given string.
+ *
+ * The POS negation flag is not applied here; callers combine the result
+ * with it themselves.
+ *
+ * @param token
+ * Token to test.
+ * @return true if matches; always true when the element has no POS token.
+ *
+ * Special value UNKNOWN_TAG matches null POS tags.
+ *
+ */
+ private boolean isPosTokenMatched(final AnalyzedToken token) {
+ // if no POS set
+ // defaulting to true
+ if (posToken == null) {
+ return true;
+ }
+ if (token.getPOSTag() == null) {
+ // a token without a POS tag is treated as if it were tagged UNKNOWN_TAG
+ if (posRegExp) {
+ // reuse the matcher between calls instead of creating a new one
+ if (mPos == null) {
+ mPos = pPos.matcher(UNKNOWN_TAG);
+ } else {
+ mPos.reset(UNKNOWN_TAG);
+ }
+ return mPos.matches();
+ }
+ if (UNKNOWN_TAG.equals(posToken)) {
+ return true;
+ }
+ // otherwise fall through: posToken.equals(null) below yields false
+ }
+ boolean match;
+ if (posRegExp) {
+ if (mPos == null) {
+ mPos = pPos.matcher(token.getPOSTag());
+ } else {
+ mPos.reset(token.getPOSTag());
+ }
+ match = mPos.matches();
+ } else {
+ match = posToken.equals(token.getPOSTag());
+ }
+ // tokens carrying only the sentence-end or paragraph-end tag also count
+ // as UNKNOWN
+ if (!match && UNKNOWN_TAG.equals(posToken)) { // these are helper tags,
+ // ignore them
+ match = JLanguageTool.SENTENCE_END_TAGNAME.equals(token.getPOSTag())
+ || JLanguageTool.PARAGRAPH_END_TAGNAME.equals(token.getPOSTag());
+ }
+ return match;
+ }
+
+ /**
+  * Tests whether the string token element matches a given token. Depending
+  * on the element, the comparison is a regular-expression match, an exact
+  * comparison or a case-insensitive comparison.
+  *
+  * @param token
+  *          {@link AnalyzedToken} to match against.
+  * @return True if matches.
+  */
+ private boolean isStringTokenMatched(final AnalyzedToken token) {
+   final String candidate = getTestToken(token);
+   if (!stringRegExp) {
+     return caseSensitive
+         ? stringToken.equals(candidate)
+         : stringToken.equalsIgnoreCase(candidate);
+   }
+   // reuse the matcher between calls instead of creating a new one
+   if (m != null) {
+     m.reset(candidate);
+   } else {
+     m = p.matcher(candidate);
+   }
+   return m.matches();
+ }
+
+ /**
+  * Selects the form of the token to compare with: the inflected form
+  * returned by the token for inflected elements, the token text otherwise.
+  * This enables using words with lemmas and without lemmas in the same
+  * regexp with inflected="yes".
+  */
+ private String getTestToken(final AnalyzedToken token) {
+   return inflected ? token.getTokenInflected() : token.getToken();
+ }
+
+ /**
+ * Gets the exception scope length.
+ *
+ * @return Scope length.
+ */
+ public final int getSkipNext() {
+ return skip;
+ }
+
+ /**
+ * Sets the exception scope length.
+ *
+ * @param i
+ * Exception scope length.
+ */
+ public final void setSkipNext(final int i) {
+ skip = i;
+ }
+
+ /**
+ * Checks if the element has an exception for a previous token.
+ *
+ * @return True if the element has a previous token matching exception.
+ */
+ public final boolean hasPreviousException() {
+ return exceptionValidPrevious;
+ }
+
+ /**
+ * Negates the meaning of match().
+ *
+ * @param negation
+ * - true if the meaning of match() is to be negated.
+ */
+ public final void setNegation(final boolean negation) {
+ this.negation = negation;
+ }
+
+ /**
+ * see {@link #setNegation}
+ *
+ * @since 0.9.3
+ */
+ public final boolean getNegation() {
+ return this.negation;
+ }
+
+ /**
+ *
+ * @return true when this element refers to another token.
+ */
+ public final boolean isReferenceElement() {
+ return containsMatches;
+ }
+
+ /**
+ * Sets the reference to another token.
+ *
+ * @param match
+ * Formatting object for the token reference.
+ */
+ public final void setMatch(final Match match) {
+ tokenReference = match;
+ containsMatches = true;
+ }
+
+ public final Match getMatch() {
+ return tokenReference;
+ }
+
+ /**
+ * Prepare Element for matching by formatting its string token and POS (if the
+ * Element is supposed to refer to some other token).
+ *
+ * Discards the cached string pattern and matcher, and rebuilds the string
+ * token from the original reference string on every call.
+ *
+ * @param token
+ * the token specified as {@link AnalyzedTokenReadings}
+ * @param synth
+ * the language synthesizer ({@link Synthesizer})
+ *
+ */
+ public final void compile(final AnalyzedTokenReadings token,
+ final Synthesizer synth) throws IOException {
+
+ // the old pattern and matcher belong to the previously referenced token
+ m = null;
+ p = null;
+ tokenReference.setToken(token);
+ tokenReference.setSynthesizer(synth);
+
+ // remember the unformatted string on the first call, because
+ // setStringElement() below overwrites stringToken
+ if (StringTools.isEmpty(referenceString)) {
+ referenceString = stringToken;
+ }
+ if (tokenReference.setsPos()) {
+ final String posReference = tokenReference.getTargetPosTag();
+ if (posReference != null) {
+ if (mPos != null) {
+ mPos = null;
+ }
+ // NOTE(review): passes the string negation flag as the POS negation;
+ // confirm that this is intended
+ setPosElement(posReference, tokenReference.posRegExp(), negation);
+ }
+ // the back-reference ("\" + reference number) is removed from the string
+ setStringElement(referenceString.replace("\\"
+ + tokenReference.getTokenRef(), ""));
+ inflected = true;
+ } else {
+ // the back-reference is replaced with the referenced token's string
+ setStringElement(referenceString.replace("\\"
+ + tokenReference.getTokenRef(), tokenReference.toTokenString()));
+ }
+ }
+
+ /**
+ * Sets the phrase the element is in.
+ *
+ * @param s
+ * ID of the phrase.
+ */
+ public final void setPhraseName(final String s) {
+ phraseName = s;
+ }
+
+ /**
+ * Checks if the Element is in any phrase.
+ *
+ * @return True if the Element is contained in the phrase.
+ */
+ public final boolean isPartOfPhrase() {
+ return phraseName != null;
+ }
+
+ /**
+ * Whether the element matches case sensitively.
+ *
+ * @since 0.9.3
+ */
+ public final boolean getCaseSensitive() {
+ return caseSensitive;
+ }
+
+ /**
+ * Tests whether the element matches a regular expression.
+ *
+ * @since 0.9.6
+ */
+ public final boolean isRegularExpression() {
+ return stringRegExp;
+ }
+
+ /**
+ * @return the POS of the Element
+ * @since 0.9.6
+ */
+ public final String getPOStag() {
+ return posToken;
+ }
+
+ /**
+ * Tests whether the POS is negated.
+ *
+ * @return true if so.
+ */
+ public final boolean getPOSNegation() {
+ return posNegation;
+ }
+
+ /**
+ * Whether the token is inflected.
+ *
+ * @return True if so.
+ */
+ public final boolean isInflected() {
+ return inflected;
+ }
+
+ /**
+ * Gets the phrase the element is in.
+ *
+ * @return String The name of the phrase.
+ */
+ public final String getPhraseName() {
+ return phraseName;
+ }
+
+ public final boolean isUnified() {
+ return unified;
+ }
+
+ public final void setUnification(final Map<String, List<String>> uniFeatures) {
+ unificationFeatures = uniFeatures;
+ unified = true;
+ }
+
+ /**
+ * Get unification features and types.
+ * @return A map from features to a list of types.
+ * @since 1.0.1
+ */
+ public final Map<String, List<String>> getUniFeatures() {
+ return unificationFeatures;
+ }
+
+ public final void setUniNegation() {
+ uniNegation = true;
+ }
+
+ public final boolean isUniNegated() {
+ return uniNegation;
+ }
+
+ public final void setWhitespaceBefore(final boolean isWhite) {
+ whitespaceBefore = isWhite;
+ testWhitespace = true;
+ }
+
+ /**
+  * Sets the whitespace requirement on the exception that was added last to
+  * the list of current/next-scope exceptions. Does nothing if no such
+  * exception has been set.
+  *
+  * @param isWhite
+  *          True if the exception should only match tokens preceded by
+  *          whitespace, false if only tokens not preceded by whitespace.
+  */
+ public final void setExceptionSpaceBefore(final boolean isWhite) {
+   if (exceptionList != null && !exceptionList.isEmpty()) {
+     // the last element lives at size() - 1; get(size()) always throws
+     // IndexOutOfBoundsException
+     exceptionList.get(exceptionList.size() - 1).setWhitespaceBefore(isWhite);
+   }
+ }
+
+ public final boolean isWhitespaceBefore(final AnalyzedToken token) {
+ return whitespaceBefore == token.isWhitespaceBefore();
+ }
+
+ /**
+ * Since 1.0.0
+ * @return A List of Exceptions. Used for testing.
+ */
+ public final List<Element> getExceptionList() {
+ return exceptionList;
+ }
+}
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/FalseFriendRuleLoader.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/FalseFriendRuleLoader.java
new file mode 100644
index 0000000..94c6515
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/FalseFriendRuleLoader.java
@@ -0,0 +1,356 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+package de.danielnaber.languagetool.rules.patterns;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.text.MessageFormat;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.ResourceBundle;
+
+import javax.xml.parsers.ParserConfigurationException;
+import javax.xml.parsers.SAXParser;
+import javax.xml.parsers.SAXParserFactory;
+
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+import de.danielnaber.languagetool.JLanguageTool;
+import de.danielnaber.languagetool.Language;
+import de.danielnaber.languagetool.rules.Category;
+import de.danielnaber.languagetool.rules.IncorrectExample;
+import de.danielnaber.languagetool.tools.StringTools;
+
+/**
+ * Loads {@link PatternRule}s from a false friends XML file.
+ *
+ * @author Daniel Naber
+ */
+public class FalseFriendRuleLoader extends DefaultHandler {
+
+ /** Creates a loader; the constructor body is empty. */
+ public FalseFriendRuleLoader() {
+ }
+
+ /**
+  * Parses a false-friends XML stream and returns the rules collected by the
+  * handler; rules that have suggestions get them appended to their message.
+  *
+  * @param file stream with the false-friends XML; this method does not
+  *        close it explicitly
+  * @param textLanguage language passed to the handler as the text language
+  * @param motherTongue language passed to the handler as the mother tongue;
+  *        its locale also selects the message bundle
+  * @return the rules produced by the handler
+  * @throws ParserConfigurationException if the SAX parser cannot be created
+  * @throws SAXException if parsing fails
+  * @throws IOException if the stream cannot be read
+  */
+ public final List<PatternRule> getRules(final InputStream file,
+     final Language textLanguage, final Language motherTongue)
+     throws ParserConfigurationException, SAXException, IOException {
+   final FalseFriendRuleHandler ruleHandler =
+       new FalseFriendRuleHandler(textLanguage, motherTongue);
+   final SAXParser parser = SAXParserFactory.newInstance().newSAXParser();
+   // Keep the parser from loading the external DTD named in the rule file.
+   parser.getXMLReader().setFeature(
+       "http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
+   parser.parse(file, ruleHandler);
+   final List<PatternRule> loadedRules = ruleHandler.getRules();
+   final ResourceBundle bundle = ResourceBundle.getBundle(
+       "de.danielnaber.languagetool.MessagesBundle", motherTongue.getLocale());
+   // Add suggestions to each rule that has an entry in the suggestion map.
+   for (final PatternRule rule : loadedRules) {
+     final List<String> ruleSuggestions =
+         ruleHandler.getSuggestionMap().get(rule.getId());
+     if (ruleSuggestions == null) {
+       continue;
+     }
+     final MessageFormat suggestionFormat =
+         new MessageFormat(bundle.getString("false_friend_suggestion"));
+     final Object[] formatArgs = { formatSuggestions(ruleSuggestions) };
+     rule.setMessage(rule.getMessage() + " " + suggestionFormat.format(formatArgs));
+   }
+   return loadedRules;
+ }
+
+ /**
+  * Wraps every entry in <code>&lt;suggestion&gt;</code> tags and joins the
+  * results with ", ".
+  *
+  * @param l the suggestions to format
+  * @return the joined string; empty for an empty list
+  */
+ private String formatSuggestions(final List<String> l) {
+   final StringBuilder joined = new StringBuilder();
+   boolean first = true;
+   for (final String suggestion : l) {
+     if (!first) {
+       joined.append(", ");
+     }
+     joined.append("<suggestion>").append(suggestion).append("</suggestion>");
+     first = false;
+   }
+   return joined.toString();
+ }
+
+ /**
+  * Testing only: prints the false-friend rules loaded from
+  * <code>/false-friends.xml</code>, first for English text and German
+  * mother tongue, then for German text and English mother tongue.
+  * NOTE(review): the method is not static, so the JVM cannot use it as an
+  * entry point; the signature is kept unchanged - confirm the intent.
+  *
+  * @param args not used
+  * @throws ParserConfigurationException if the SAX parser cannot be created
+  * @throws SAXException if the rule file cannot be parsed
+  * @throws IOException if the rule file cannot be read or closed
+  */
+ public final void main(final String[] args)
+     throws ParserConfigurationException, SAXException, IOException {
+   List<PatternRule> l = loadFalseFriendRules(Language.ENGLISH, Language.GERMAN);
+   System.out.println("Hints for German native speakers:");
+   for (final PatternRule rule : l) {
+     System.out.println(rule);
+   }
+   System.out.println("=======================================");
+   System.out.println("Hints for English native speakers:");
+   l = loadFalseFriendRules(Language.GERMAN, Language.ENGLISH);
+   for (final PatternRule rule : l) {
+     System.out.println(rule);
+   }
+ }
+
+ /** Loads the rules from the rules directory and always closes the stream it opened. */
+ private List<PatternRule> loadFalseFriendRules(final Language textLanguage,
+     final Language motherTongue)
+     throws ParserConfigurationException, SAXException, IOException {
+   final InputStream rulesStream = JLanguageTool.getDataBroker()
+       .getFromRulesDirAsStream("/false-friends.xml");
+   try {
+     return getRules(rulesStream, textLanguage, motherTongue);
+   } finally {
+     // The stream was opened here, so it is released here even if parsing fails.
+     if (rulesStream != null) {
+       rulesStream.close();
+     }
+   }
+ }
+
+}
+
+class FalseFriendRuleHandler extends XMLRuleHandler {
+
+ private final ResourceBundle messages;
+ private final MessageFormat formatter;
+
+ private final Language textLanguage;
+ private final Language motherTongue;
+
+ private boolean defaultOff;
+
+ private Language language;
+ private Language translationLanguage;
+ private Language currentTranslationLanguage;
+ private List<StringBuilder> translations = new ArrayList<StringBuilder>();
+ private StringBuilder translation = new StringBuilder();
+ private final List<String> suggestions = new ArrayList<String>();
+ // rule ID -> list of translations:
+ private final Map<String, List<String>> suggestionMap = new HashMap<String, List<String>>();
+
+ private boolean inTranslation;
+
+ /**
+  * Creates a handler for one language pair.
+  *
+  * @param textLanguage language stored as the text language
+  * @param motherTongue language stored as the mother tongue; its locale is
+  *        used for the message bundle and for the message formatter
+  */
+ public FalseFriendRuleHandler(final Language textLanguage,
+     final Language motherTongue) {
+   this.textLanguage = textLanguage;
+   this.motherTongue = motherTongue;
+   this.messages = ResourceBundle.getBundle(
+       "de.danielnaber.languagetool.MessagesBundle", motherTongue.getLocale());
+   this.formatter = new MessageFormat("");
+   this.formatter.setLocale(motherTongue.getLocale());
+ }
+
+ /**
+ * Gets the suggestions collected while parsing.
+ *
+ * @return the internal map from rule ID to list of translations; it is
+ * returned as is, not copied.
+ */
+ public Map<String, List<String>> getSuggestionMap() {
+ return suggestionMap;
+ }
+
+ // ===========================================================
+ // SAX DocumentHandler methods
+ // ===========================================================
+
+ /**
+  * Handles an opening XML tag of the false-friends file and records its
+  * attributes in the handler state.
+  *
+  * @param namespaceURI not used
+  * @param lName not used
+  * @param qName qualified name of the element that starts
+  * @param attrs attributes of that element
+  * @throws SAXException if a <code>pattern</code> element names an unknown
+  *         language
+  */
+ @Override
+ public void startElement(final String namespaceURI, final String lName,
+     final String qName, final Attributes attrs) throws SAXException {
+   if (qName.equals("rule")) {
+     translations = new ArrayList<StringBuilder>();
+     id = attrs.getValue("id");
+     // Inside a rule group that is already off, the rule's own "default"
+     // attribute is not consulted.
+     if (!(inRuleGroup && defaultOff)) {
+       defaultOff = "off".equals(attrs.getValue("default"));
+     }
+     if (inRuleGroup && id == null) {
+       id = ruleGroupId;
+     }
+     correctExamples = new ArrayList<String>();
+     incorrectExamples = new ArrayList<IncorrectExample>();
+   } else if (qName.equals("pattern")) {
+     inPattern = true;
+     final String languageStr = attrs.getValue("lang");
+     language = Language.getLanguageForShortName(languageStr);
+     if (language == null) {
+       throw new SAXException("Unknown language '" + languageStr + "'");
+     }
+   } else if (qName.equals("exception")) {
+     inException = true;
+     exceptions = new StringBuilder();
+
+     if (attrs.getValue(NEGATE) != null) {
+       exceptionStringNegation = attrs.getValue(NEGATE).equals(YES);
+     }
+     if (attrs.getValue(SCOPE) != null) {
+       exceptionValidNext = attrs.getValue(SCOPE).equals("next");
+       exceptionValidPrev = attrs.getValue(SCOPE).equals("previous");
+     }
+     if (attrs.getValue(INFLECTED) != null) {
+       exceptionStringInflected = attrs.getValue(INFLECTED).equals(YES);
+     }
+     if (attrs.getValue(POSTAG) != null) {
+       exceptionPosToken = attrs.getValue(POSTAG);
+       if (attrs.getValue(POSTAG_REGEXP) != null) {
+         exceptionPosRegExp = attrs.getValue(POSTAG_REGEXP).equals(YES);
+       }
+       if (attrs.getValue(NEGATE_POS) != null) {
+         exceptionPosNegation = attrs.getValue(NEGATE_POS).equals(YES);
+       }
+     }
+     if (attrs.getValue(REGEXP) != null) {
+       exceptionStringRegExp = attrs.getValue(REGEXP).equals(YES);
+     }
+
+   } else if (qName.equals(TOKEN)) {
+     setToken(attrs);
+   } else if (qName.equals("translation")) {
+     inTranslation = true;
+     final String languageStr = attrs.getValue("lang");
+     final Language tmpLang = Language.getLanguageForShortName(languageStr);
+     currentTranslationLanguage = tmpLang;
+     if (tmpLang == motherTongue) {
+       translationLanguage = tmpLang;
+       // NOTE(review): this check only runs when tmpLang == motherTongue, and
+       // the constructor already dereferences motherTongue, so it looks
+       // unreachable - confirm whether unknown translation languages should
+       // be rejected instead.
+       if (translationLanguage == null) {
+         throw new SAXException("Unknown language '" + languageStr + "'");
+       }
+     }
+   } else if (qName.equals(EXAMPLE)
+       && "correct".equals(attrs.getValue(TYPE))) {
+     // Constant-first comparison: getValue returns null when the attribute
+     // is missing, which must not end in a NullPointerException.
+     inCorrectExample = true;
+     correctExample = new StringBuilder();
+   } else if (qName.equals(EXAMPLE)
+       && "incorrect".equals(attrs.getValue(TYPE))) {
+     inIncorrectExample = true;
+     incorrectExample = new StringBuilder();
+   } else if (qName.equals("message")) {
+     inMessage = true;
+     message = new StringBuilder();
+   } else if (qName.equals("rulegroup")) {
+     ruleGroupId = attrs.getValue("id");
+     inRuleGroup = true;
+     defaultOff = "off".equals(attrs.getValue(DEFAULT));
+   }
+ }
+
+ /**
+ * Handles a closing XML tag: finishes the rule, exception, token, pattern,
+ * translation, example, message or rule group that has just ended.
+ *
+ * @param namespaceURI not used
+ * @param sName not used
+ * @param qName qualified name of the element that ends
+ */
+ @Override
+ public void endElement(final String namespaceURI, final String sName,
+ final String qName) {
+ if (qName.equals("rule")) {
+ // A rule is only created when the pattern is in the text language, the
+ // mother tongue differs from it, and at least one translation into the
+ // mother tongue was collected.
+ if (language == textLanguage && translationLanguage != null
+ && translationLanguage == motherTongue && language != motherTongue
+ && !translations.isEmpty()) {
+ formatter.applyPattern(messages.getString("false_friend_hint"));
+ final Object[] messageArguments = {
+ elements.toString().replace('|', '/'),
+ messages.getString(textLanguage.getShortName()),
+ formatTranslations(translations),
+ messages.getString(motherTongue.getShortName()) };
+ final String description = formatter.format(messageArguments);
+ final PatternRule rule = new PatternRule(id, language, elementList,
+ messages.getString("false_friend_desc") + " "
+ + elements.toString().replace('|', '/'), description, messages
+ .getString("false_friend"));
+ rule.setCorrectExamples(correctExamples);
+ rule.setIncorrectExamples(incorrectExamples);
+ rule.setCategory(new Category(messages
+ .getString("category_false_friend")));
+ if (defaultOff) {
+ rule.setDefaultOff();
+ }
+ rules.add(rule);
+ }
+
+ // The element list is emptied whether or not a rule was created.
+ if (elementList != null) {
+ elementList.clear();
+ }
+
+ } else if (qName.equals("exception")) {
+ inException = false;
+ // The token element is created on the first exception only.
+ if (!exceptionSet) {
+ tokenElement = new Element(elements.toString(), caseSensitive,
+ regExpression, tokenInflected);
+ exceptionSet = true;
+ }
+ tokenElement.setNegation(tokenNegated);
+ if (!StringTools.isEmpty(exceptions.toString())) {
+ tokenElement.setStringException(exceptions.toString(),
+ exceptionStringRegExp, exceptionStringInflected,
+ exceptionStringNegation, exceptionValidNext, exceptionValidPrev);
+ }
+ if (exceptionPosToken != null) {
+ tokenElement.setPosException(exceptionPosToken, exceptionPosRegExp,
+ exceptionPosNegation, exceptionValidNext, exceptionValidPrev);
+ exceptionPosToken = null;
+ }
+ } else if (qName.equals(TOKEN)) {
+ finalizeTokens();
+ } else if (qName.equals("pattern")) {
+ inPattern = false;
+ } else if (qName.equals("translation")) {
+ // Translations into the mother tongue go into the rule message;
+ // translations into the text language are kept as suggestions.
+ if (currentTranslationLanguage == motherTongue) {
+ translations.add(translation);
+ }
+ if (currentTranslationLanguage == textLanguage) {
+ suggestions.add(translation.toString());
+ }
+ translation = new StringBuilder();
+ inTranslation = false;
+ currentTranslationLanguage = null;
+ } else if (qName.equals(EXAMPLE)) {
+ if (inCorrectExample) {
+ correctExamples.add(correctExample.toString());
+ } else if (inIncorrectExample) {
+ incorrectExamples
+ .add(new IncorrectExample(incorrectExample.toString()));
+ }
+ inCorrectExample = false;
+ inIncorrectExample = false;
+ correctExample = new StringBuilder();
+ incorrectExample = new StringBuilder();
+ } else if (qName.equals("message")) {
+ inMessage = false;
+ } else if (qName.equals("rulegroup")) {
+ // The collected suggestions are stored under whatever value id holds
+ // when the group closes; a copy is stored because the list is cleared.
+ if (!suggestions.isEmpty()) {
+ final List<String> l = new ArrayList<String>(suggestions);
+ suggestionMap.put(id, l);
+ suggestions.clear();
+ }
+ inRuleGroup = false;
+ }
+ }
+
+ /**
+  * Puts every translation in double quotes and joins the results with ", ".
+  *
+  * @param translations the translations to format
+  * @return the joined string; empty for an empty list
+  */
+ private String formatTranslations(final List<StringBuilder> translations) {
+   final StringBuilder joined = new StringBuilder();
+   boolean first = true;
+   for (final StringBuilder trans : translations) {
+     if (!first) {
+       joined.append(", ");
+     }
+     joined.append('"').append(trans.toString()).append('"');
+     first = false;
+   }
+   return joined.toString();
+ }
+
+ /**
+  * Appends character data to the buffer of the element that is currently
+  * open. The first matching state wins, in this order: exception, token
+  * inside a pattern, correct example, incorrect example, translation. Text
+  * outside these states is dropped.
+  *
+  * @param buf character buffer supplied by the parser
+  * @param offset start of the text within the buffer
+  * @param len length of the text
+  */
+ @Override
+ public void characters(final char[] buf, final int offset, final int len) {
+   final String text = new String(buf, offset, len);
+   if (inException) {
+     exceptions.append(text);
+     return;
+   }
+   if (inToken && inPattern) {
+     elements.append(text);
+     return;
+   }
+   if (inCorrectExample) {
+     correctExample.append(text);
+     return;
+   }
+   if (inIncorrectExample) {
+     incorrectExample.append(text);
+     return;
+   }
+   if (inTranslation) {
+     translation.append(text);
+   }
+ }
+
+}
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/Match.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/Match.java
new file mode 100644
index 0000000..0519f2c
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/Match.java
@@ -0,0 +1,551 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+package de.danielnaber.languagetool.rules.patterns;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.TreeSet;
+import java.util.regex.Pattern;
+
+import de.danielnaber.languagetool.AnalyzedToken;
+import de.danielnaber.languagetool.AnalyzedTokenReadings;
+import de.danielnaber.languagetool.JLanguageTool;
+import de.danielnaber.languagetool.synthesis.Synthesizer;
+import de.danielnaber.languagetool.tools.StringTools;
+
+/**
+ * Reference to a matched token in a pattern, can be formatted and used for
+ * matching &amp; suggestions.
+ *
+ * @author Marcin Miłkowski
+ */
+public class Match {
+
+ /** Possible string case conversions. **/
+ public enum CaseConversion {
+   NONE, STARTLOWER, STARTUPPER, ALLLOWER, ALLUPPER;
+
+   /**
+    * Looks up the constant whose name equals the given string exactly.
+    *
+    * @param str
+    *          name of the constant; the comparison is case-sensitive.
+    * @return the matching constant, or NONE if there is none (this includes
+    *         a null argument).
+    */
+   public static CaseConversion toCase(final String str) {
+     for (final CaseConversion candidate : values()) {
+       if (candidate.name().equals(str)) {
+         return candidate;
+       }
+     }
+     return NONE;
+   }
+ }
+
+ /** Tells which skipped tokens are included in the formatted match. **/
+ public enum IncludeRange {
+   NONE, FOLLOWING, ALL;
+
+   /**
+    * Looks up the constant whose name equals the given string exactly.
+    *
+    * @param str
+    *          name of the constant; the comparison is case-sensitive.
+    * @return the matching constant, or NONE if there is none (this includes
+    *         a null argument).
+    */
+   public static IncludeRange toRange(final String str) {
+     for (final IncludeRange candidate : values()) {
+       if (candidate.name().equals(str)) {
+         return candidate;
+       }
+     }
+     return NONE;
+   }
+ }
+
+ private final String posTag;
+ private boolean postagRegexp;
+ private final String regexReplace;
+ private final String posTagReplace;
+ private final CaseConversion caseConversionType;
+
+ private final IncludeRange includeSkipped;
+ private String skippedTokens;
+
+ /**
+ * True if this match element formats a statically defined lemma which is
+ * enclosed by the element, e.g., <tt>&lt;match...&gt;word&lt;/match&gt;</tt>.
+ */
+ private boolean staticLemma;
+
+ /**
+ * True if this match element is used for formatting POS token.
+ */
+ private final boolean setPos;
+
+ private AnalyzedTokenReadings formattedToken;
+ private AnalyzedTokenReadings matchedToken;
+
+ private int tokenRef;
+
+ /** Word form generator for POS tags. **/
+ private Synthesizer synthesizer;
+
+ /** Pattern used to define parts of the matched token. **/
+ private Pattern pRegexMatch;
+
+ /** Pattern used to define parts of the matched POS token. **/
+ private Pattern pPosRegexMatch;
+
+ /**
+ * True when the match is not in the suggestion.
+ */
+ private boolean inMessageOnly;
+
+ /**
+  * Creates a match and compiles the regular expressions it was given.
+  *
+  * @param posTag POS tag; compiled as a regular expression when
+  *        <code>postagRegexp</code> is true and the tag is not null
+  * @param posTagReplace replacement applied to matching POS tags
+  * @param postagRegexp whether <code>posTag</code> is a regular expression
+  * @param regexMatch regular expression applied to the token, or null
+  * @param regexReplace replacement used together with <code>regexMatch</code>
+  * @param caseConversionType case conversion applied to the token
+  * @param setPOS whether the match is used for formatting the POS token
+  * @param includeSkipped which skipped tokens to include
+  */
+ public Match(final String posTag, final String posTagReplace,
+     final boolean postagRegexp, final String regexMatch,
+     final String regexReplace, final CaseConversion caseConversionType,
+     final boolean setPOS,
+     final IncludeRange includeSkipped) {
+   this.posTag = posTag;
+   this.posTagReplace = posTagReplace;
+   this.postagRegexp = postagRegexp;
+   this.regexReplace = regexReplace;
+   this.caseConversionType = caseConversionType;
+   this.setPos = setPOS;
+   this.includeSkipped = includeSkipped;
+
+   // Patterns are compiled once here; the token pattern comes first.
+   if (regexMatch != null) {
+     pRegexMatch = Pattern.compile(regexMatch);
+   }
+   if (postagRegexp && posTag != null) {
+     pPosRegexMatch = Pattern.compile(posTag);
+   }
+ }
+
+ /**
+  * Sets the token that will be formatted or otherwise used in the class.
+  * When a static lemma has been set, the token is stored as the matched
+  * token and the lemma stays the token to format.
+  *
+  * @param token the token to store
+  */
+ public final void setToken(final AnalyzedTokenReadings token) {
+   if (!staticLemma) {
+     formattedToken = token;
+     return;
+   }
+   matchedToken = token;
+ }
+
+ /**
+  * Sets the token to be formatted etc. and includes the support for
+  * including the skipped tokens.
+  *
+  * @param tokens Array of tokens
+  * @param index Index of the token to be formatted
+  * @param next Position of the next token; the skipped tokens are
+  *        tokens[index + 1] up to, but not including, tokens[index + next]
+  */
+ public final void setToken(final AnalyzedTokenReadings[] tokens, final int index, final int next) {
+   setToken(tokens[index]);
+   if (next <= 1 || includeSkipped == IncludeRange.NONE) {
+     skippedTokens = "";
+     return;
+   }
+   // With FOLLOWING only the skipped tokens are kept, not the token itself.
+   if (includeSkipped == IncludeRange.FOLLOWING) {
+     formattedToken = null;
+   }
+   final StringBuilder skipped = new StringBuilder();
+   final int firstSkipped = index + 1;
+   final int end = index + next;
+   for (int pos = firstSkipped; pos < end; pos++) {
+     // No separator is put in front of the first skipped token.
+     if (pos > firstSkipped && tokens[pos].isWhitespaceBefore()) {
+       skipped.append(' ');
+     }
+     skipped.append(tokens[pos].getToken());
+   }
+   skippedTokens = skipped.toString();
+ }
+
+ /**
+ private String[] addSkipped(final String[] formattedString) {
+ if (skippedTokens != null && !"".equals(skippedTokens)) {
+ String[] finalStrings = new String[formattedString.length];
+ for (int i = 1; i <= formattedString.length; i++)
+ }
+ }
+
+ **/
+
+ /**
+ * Checks if the Match element is used for setting the part of speech Element.
+ *
+ * @return True if Match sets POS; this is the <code>setPOS</code> value
+ * given to the constructor.
+ */
+ public final boolean setsPos() {
+ return setPos;
+ }
+
+ /**
+ * Checks if the Match element uses regexp-based form of the POS tag.
+ *
+ * @return True if regexp is used in POS. The flag comes from the
+ * constructor and is also switched on by {@link #setLemmaString(String)}.
+ */
+ public final boolean posRegExp() {
+ return postagRegexp;
+ }
+
+ /**
+  * Sets a base form (lemma) that will be formatted, or synthesized, using the
+  * specified POS regular expressions. An empty lemma leaves the match
+  * unchanged; otherwise the POS tag is treated as a regular expression from
+  * then on.
+  *
+  * @param lemmaString String that specifies the base form.
+  */
+ public final void setLemmaString(final String lemmaString) {
+   if (StringTools.isEmpty(lemmaString)) {
+     return;
+   }
+   final AnalyzedToken lemmaToken =
+       new AnalyzedToken(lemmaString, posTag, lemmaString);
+   formattedToken = new AnalyzedTokenReadings(lemmaToken, 0);
+   staticLemma = true;
+   postagRegexp = true;
+   if (posTag != null) {
+     pPosRegexMatch = Pattern.compile(posTag);
+   }
+ }
+
+ /**
+ * Sets a synthesizer used for grammatical synthesis of forms based on
+ * formatted POS values.
+ *
+ * @param synth Synthesizer class; it is stored as given, without a null
+ * check.
+ */
+ public final void setSynthesizer(final Synthesizer synth) {
+ synthesizer = synth;
+ }
+
+ /**
+ * Gets all strings formatted using the match element.
+ *
+ * @return array of strings; when no token has been set it has one element,
+ * which is null unless skipped tokens are appended.
+ * @throws IOException
+ * in case of synthesizer-related disk problems.
+ */
+ public final String[] toFinalString() throws IOException {
+ String[] formattedString = new String[1];
+ if (formattedToken != null) {
+ final int readingCount = formattedToken.getReadingsLength();
+ formattedString[0] = formattedToken.getToken();
+ if (pRegexMatch != null) {
+ formattedString[0] = pRegexMatch.matcher(formattedString[0])
+ .replaceAll(regexReplace);
+ }
+ formattedString[0] = convertCase(formattedString[0]);
+ if (posTag != null) {
+ if (synthesizer == null) {
+ // Without a synthesizer the unformatted token is used; this overwrites
+ // the regexp and case formatting done above.
+ formattedString[0] = formattedToken.getToken();
+ } else if (postagRegexp) {
+ // TreeSet: the word forms end up sorted and free of duplicates.
+ final TreeSet<String> wordForms = new TreeSet<String>();
+ boolean oneForm = false;
+ // Readings without a lemma decide whether the token is kept as its
+ // only form instead of being synthesized.
+ for (int k = 0; k < readingCount; k++) {
+ if (formattedToken.getAnalyzedToken(k).getLemma() == null) {
+ final String posUnique = formattedToken.getAnalyzedToken(k)
+ .getPOSTag();
+ if (posUnique == null) {
+ wordForms.add(formattedToken.getToken());
+ oneForm = true;
+ } else {
+ if (JLanguageTool.SENTENCE_START_TAGNAME.equals(posUnique)
+ || JLanguageTool.SENTENCE_END_TAGNAME.equals(posUnique)
+ || JLanguageTool.PARAGRAPH_END_TAGNAME.equals(posUnique)) {
+ if (!oneForm) {
+ wordForms.add(formattedToken.getToken());
+ }
+ oneForm = true;
+ } else {
+ oneForm = false;
+ }
+ }
+ }
+ }
+ final String targetPosTag = getTargetPosTag();
+ if (!oneForm) {
+ for (int i = 0; i < readingCount; i++) {
+ final String[] possibleWordForms = synthesizer.synthesize(
+ formattedToken.getAnalyzedToken(i), targetPosTag, true);
+ if (possibleWordForms != null) {
+ wordForms.addAll(Arrays.asList(possibleWordForms));
+ }
+ }
+ }
+ if (wordForms.isEmpty()) {
+ // Nothing could be synthesized: the token is returned in parentheses.
+ formattedString[0] = "(" + formattedToken.getToken() + ")";
+ } else {
+ formattedString = wordForms.toArray(new String[wordForms.size()]);
+ }
+ } else {
+ // Plain POS tag: synthesize every reading with the tag as given.
+ final TreeSet<String> wordForms = new TreeSet<String>();
+ for (int i = 0; i < readingCount; i++) {
+ final String[] possibleWordForms = synthesizer.synthesize(
+ formattedToken.getAnalyzedToken(i), posTag);
+ if (possibleWordForms != null) {
+ wordForms.addAll(Arrays.asList(possibleWordForms));
+ }
+ }
+ formattedString = wordForms.toArray(new String[wordForms.size()]);
+ }
+ }
+ }
+ // The skipped tokens are appended to every form; a null form counts as
+ // an empty string.
+ if (includeSkipped != IncludeRange.NONE
+ && skippedTokens != null && !"".equals(skippedTokens)) {
+ final String[] helper = new String[formattedString.length];
+ for (int i = 0; i < formattedString.length; i++) {
+ if (formattedString[i] == null) {
+ formattedString[i] = "";
+ }
+ helper[i] = formattedString[i] + skippedTokens;
+ }
+ formattedString = helper;
+ }
+ return formattedString;
+ }
+
+ /**
+  * Format POS tag using parameters already defined in the class.
+  * <p>
+  * With a static lemma the readings of the matched token are examined and the
+  * last tag that matches the POS pattern is used; otherwise the readings of
+  * the formatted token are examined and, if a replacement is defined, all
+  * matching tags are rewritten and joined with "|". Without a compiled POS
+  * pattern the POS tag is returned as it was given.
+  *
+  * @return Formatted POS tag as String.
+  */
+ // FIXME: gets only the first POS tag that matches, this can be wrong
+ // on the other hand, many POS tags = too many suggestions?
+ public final String getTargetPosTag() {
+   String targetPosTag = posTag;
+   if (staticLemma) {
+     final int numRead = matchedToken.getReadingsLength();
+     for (int i = 0; i < numRead; i++) {
+       final String tst = matchedToken.getAnalyzedToken(i).getPOSTag();
+       if (matchesPosPattern(tst)) {
+         targetPosTag = tst;
+       }
+     }
+     if (pPosRegexMatch != null && posTagReplace != null) {
+       targetPosTag = pPosRegexMatch.matcher(targetPosTag).replaceAll(
+           posTagReplace);
+     }
+   } else {
+     final List<String> posTags = new ArrayList<String>();
+     final int numRead = formattedToken.getReadingsLength();
+     for (int i = 0; i < numRead; i++) {
+       final String tst = formattedToken.getAnalyzedToken(i).getPOSTag();
+       if (matchesPosPattern(tst)) {
+         targetPosTag = tst;
+         posTags.add(tst);
+       }
+     }
+     if (pPosRegexMatch != null && posTagReplace != null) {
+       // No reading matched: fall back to rewriting the tag as given.
+       if (posTags.isEmpty()) {
+         posTags.add(targetPosTag);
+       }
+       final StringBuilder sb = new StringBuilder();
+       final int posTagLen = posTags.size();
+       int l = 0;
+       for (String lposTag : posTags) {
+         l++;
+         lposTag = pPosRegexMatch.matcher(lposTag).replaceAll(posTagReplace);
+         if (setPos) {
+           lposTag = synthesizer.getPosTagCorrection(lposTag);
+         }
+         sb.append(lposTag);
+         if (l < posTagLen) {
+           sb.append('|');
+         }
+       }
+       targetPosTag = sb.toString();
+     }
+   }
+   return targetPosTag;
+ }
+
+ /** Tells whether the tag is non-null and fully matches the POS pattern, if one was compiled. */
+ private boolean matchesPosPattern(final String tag) {
+   // The pattern only exists for regexp POS tags; without this guard a
+   // non-regexp match would fail with a NullPointerException.
+   return tag != null && pPosRegexMatch != null
+       && pPosRegexMatch.matcher(tag).matches();
+ }
+
+ /**
+  * Method for getting the formatted match as a single string. In case of
+  * multiple matches, it joins them using a regular expression operator "|".
+  *
+  * @return Formatted string of the matched token.
+  * @throws IOException if {@link #toFinalString()} fails.
+  */
+ public final String toTokenString() throws IOException {
+   final String[] forms = toFinalString();
+   final StringBuilder joined = new StringBuilder();
+   for (int i = 0; i < forms.length; i++) {
+     if (i > 0) {
+       joined.append('|');
+     }
+     joined.append(forms[i]);
+   }
+   return joined.toString();
+ }
+
+ /**
+ * Sets the token number referenced by the match.
+ *
+ * @param i Token number; stored without any range check.
+ */
+ public final void setTokenRef(final int i) {
+ tokenRef = i;
+ }
+
+ /**
+ * Gets the token number referenced by the match.
+ *
+ * @return int - token number, as set by {@link #setTokenRef(int)}.
+ */
+ public final int getTokenRef() {
+ return tokenRef;
+ }
+
+ /**
+  * Converts case of the string token according to match element attributes.
+  *
+  * @param s Token to be converted.
+  * @return Converted string; an empty token (as judged by
+  *         <code>StringTools.isEmpty</code>) is returned unchanged.
+  */
+ private String convertCase(final String s) {
+   if (StringTools.isEmpty(s)) {
+     return s;
+   }
+   switch (caseConversionType) {
+     case STARTLOWER:
+       return s.substring(0, 1).toLowerCase() + s.substring(1);
+     case STARTUPPER:
+       return s.substring(0, 1).toUpperCase() + s.substring(1);
+     case ALLUPPER:
+       return s.toUpperCase();
+     case ALLLOWER:
+       return s.toLowerCase();
+     case NONE:
+     default:
+       return s;
+   }
+ }
+
+ /**
+ * Used to let LT know that it should change the case of the match.
+ *
+ * @return true if match converts the case of the token, i.e. the case
+ * conversion type is anything but NONE.
+ */
+ public final boolean convertsCase() {
+ return !caseConversionType.equals(CaseConversion.NONE);
+ }
+
+ /**
+ * Builds new readings for the token: the token text is run through the
+ * regexp replacement and the case conversion, and the POS tags are set
+ * according to the POS settings of the match.
+ *
+ * @return the new readings, or the formatted token itself when no new
+ * reading was created (null when no token was set).
+ */
+ public final AnalyzedTokenReadings filterReadings() {
+ final ArrayList<AnalyzedToken> l = new ArrayList<AnalyzedToken>();
+ if (formattedToken != null) {
+ if (staticLemma) {
+ // Static lemma: combine the text and position of the matched token with
+ // the lemma held so far in formattedToken.
+ formattedToken = new AnalyzedTokenReadings(new AnalyzedToken(
+ matchedToken.getToken(), posTag, formattedToken.getToken()),
+ matchedToken.getStartPos());
+ formattedToken.setWhitespaceBefore(matchedToken.isWhitespaceBefore());
+ }
+ String token = formattedToken.getToken();
+ if (pRegexMatch != null) {
+ token = pRegexMatch.matcher(token).replaceAll(regexReplace);
+ }
+ token = convertCase(token);
+ if (posTag != null) {
+ final int numRead = formattedToken.getReadingsLength();
+ if (postagRegexp) {
+ String targetPosTag = posTag;
+ // Keep one reading per POS tag that matches the pattern, with the tag
+ // rewritten if a replacement is defined.
+ for (int i = 0; i < numRead; i++) {
+ final String tst = formattedToken.getAnalyzedToken(i).getPOSTag();
+ if (tst != null && pPosRegexMatch.matcher(tst).matches()) {
+ targetPosTag = formattedToken.getAnalyzedToken(i).getPOSTag();
+ if (posTagReplace != null) {
+ targetPosTag = pPosRegexMatch.matcher(targetPosTag).replaceAll(
+ posTagReplace);
+ }
+ l
+ .add(new AnalyzedToken(token, targetPosTag, formattedToken
+ .getAnalyzedToken(i).getLemma()));
+ l.get(l.size() - 1).setWhitespaceBefore(formattedToken.isWhitespaceBefore());
+ }
+ }
+ // No tag matched: fall back to readings that carry posTag itself.
+ if (l.isEmpty()) {
+ for (final AnalyzedToken anaTok : getNewToken(numRead, token)) {
+ l.add(anaTok);
+ }
+ }
+ } else {
+ for (final AnalyzedToken anaTok : getNewToken(numRead, token)) {
+ l.add(anaTok);
+ }
+ }
+ // Sentence-end and paragraph-end markers of the token are carried over
+ // as extra readings.
+ if (formattedToken.isSentEnd()) {
+ l.add(new AnalyzedToken(formattedToken.getToken(),
+ JLanguageTool.SENTENCE_END_TAGNAME,
+ formattedToken.getAnalyzedToken(0).getLemma()));
+ }
+ if (formattedToken.isParaEnd()) {
+ l.add(new AnalyzedToken(formattedToken.getToken(),
+ JLanguageTool.PARAGRAPH_END_TAGNAME,
+ formattedToken.getAnalyzedToken(0).getLemma()));
+ }
+ }
+ }
+ if (l.isEmpty()) {
+ return formattedToken;
+ }
+ return new AnalyzedTokenReadings(l.toArray(new AnalyzedToken[l.size()]), formattedToken.getStartPos());
+ }
+
+ /**
+  * Creates one reading with the match's POS tag for every reading of the
+  * formatted token that has a POS tag.
+  *
+  * @param numRead number of readings to examine
+  * @param token token text of the new readings
+  * @return the new readings; empty if no reading has a POS tag
+  */
+ private AnalyzedToken[] getNewToken(final int numRead, final String token) {
+   final List<AnalyzedToken> created = new ArrayList<AnalyzedToken>();
+   // The lemma is carried over from one reading to the next once found.
+   String lemma = "";
+   for (int j = 0; j < numRead; j++) {
+     final AnalyzedToken reading = formattedToken.getAnalyzedToken(j);
+     final String readingPos = reading.getPOSTag();
+     if (readingPos == null) {
+       continue;
+     }
+     if (readingPos.equals(posTag) && reading.getLemma() != null) {
+       lemma = reading.getLemma();
+     }
+     if (StringTools.isEmpty(lemma)) {
+       lemma = formattedToken.getAnalyzedToken(0).getLemma();
+     }
+     final AnalyzedToken replacement = new AnalyzedToken(token, posTag, lemma);
+     replacement.setWhitespaceBefore(formattedToken.isWhitespaceBefore());
+     created.add(replacement);
+   }
+   return created.toArray(new AnalyzedToken[created.size()]);
+ }
+
+ /**
+ * Sets whether the match is used in the message only, i.e. not in the
+ * suggestion.
+ *
+ * @param inMessageOnly
+ * the inMessageOnly to set
+ */
+ public void setInMessageOnly(final boolean inMessageOnly) {
+ this.inMessageOnly = inMessageOnly;
+ }
+
+ /**
+ * Tells whether the match is used in the message only.
+ *
+ * @return the inMessageOnly; true when the match is not in the suggestion.
+ */
+ public boolean isInMessageOnly() {
+ return inMessageOnly;
+ }
+
+}
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/PatternRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/PatternRule.java
new file mode 100644
index 0000000..843ef98
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/PatternRule.java
@@ -0,0 +1,652 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+package de.danielnaber.languagetool.rules.patterns;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+import de.danielnaber.languagetool.AnalyzedSentence;
+import de.danielnaber.languagetool.AnalyzedTokenReadings;
+import de.danielnaber.languagetool.Language;
+import de.danielnaber.languagetool.rules.IncorrectExample;
+import de.danielnaber.languagetool.rules.RuleMatch;
+import de.danielnaber.languagetool.tools.StringTools;
+
+/**
+ * A Rule that describes a language error as a simple pattern of words or of
+ * part-of-speech tags.
+ *
+ * @author Daniel Naber
+ */
+public class PatternRule extends AbstractPatternRule {
+
+ private static final String SUGG_TAG = "<suggestion>";
+ private static final String END_SUGG_TAG = "</suggestion>";
+
+ private String subId; // because there can be more than one rule in a rule
+ // group
+
+ private String message;
+ private String shortMessage;
+
+ /** Formatted suggestion elements. **/
+ private List<Match> suggestionMatches;
+
+ /**
+ * A list of elements as they appear in XML file (phrases count as single
+ * tokens in case of matches or skipping).
+ */
+ private List<Integer> elementNo;
+
+ /**
+ * This property is used for short-circuiting evaluation of the elementNo list
+ * order.
+ */
+ private boolean useList;
+
+ /**
+ * Marks whether the rule is a member of a disjunctive set (in case of OR
+ * operation on phraserefs).
+ **/
+ private boolean isMemberOfDisjunctiveSet;
+
+ /**
+ * Creates a pattern rule and fills {@code elementNo}, the list that records
+ * how many pattern elements each entry of the rule occupies (1 for an
+ * ordinary element, a running count for elements that belong to a phrase).
+ *
+ * @param id
+ * Id of the Rule
+ * @param language
+ * Language of the Rule
+ * @param elements
+ * Element (token) list
+ * @param description
+ * Description to be shown (name)
+ * @param message
+ * Message to be displayed to the user
+ * @param shortMessage
+ * Short variant of the message; it is stored and handed to every
+ * rule match created by this rule
+ * @throws NullPointerException
+ * if id, language, elements or description is null
+ */
+
+ public PatternRule(final String id, final Language language,
+ final List<Element> elements, final String description,
+ final String message, final String shortMessage) {
+ super(id, description, language, elements, false);
+ // NOTE(review): these checks run only after the superclass constructor
+ // has already received the same arguments.
+ if (id == null) {
+ throw new NullPointerException("id cannot be null");
+ }
+ if (language == null) {
+ throw new NullPointerException("language cannot be null");
+ }
+ if (elements == null) {
+ throw new NullPointerException("elements cannot be null");
+ }
+ if (description == null) {
+ throw new NullPointerException("description cannot be null");
+ }
+
+ this.message = message;
+ this.shortMessage = shortMessage;
+ this.elementNo = new ArrayList<Integer>();
+ // prevName/curName track the phrase name of the previous/current element,
+ // cnt counts the elements of the phrase currently being read and loopCnt
+ // counts all elements visited so far.
+ String prevName = "";
+ String curName = "";
+ int cnt = 0;
+ int loopCnt = 0;
+ for (final Element e : patternElements) {
+ if (e.isPartOfPhrase()) {
+ curName = e.getPhraseName();
+ if (prevName.equals(curName) || StringTools.isEmpty(prevName)) {
+ // same phrase as before (or first phrase element seen): extend the count
+ cnt++;
+ // translateElementNo() and phraseLen() only consult elementNo when this is set
+ useList = true;
+ } else {
+ // the phrase name changed: store the count of the previous phrase
+ // and restart the bookkeeping
+ elementNo.add(cnt);
+ prevName = "";
+ curName = "";
+ cnt = 0;
+ }
+ prevName = curName;
+ loopCnt++;
+ // a phrase that ends the pattern still has to be recorded
+ if (loopCnt == patternElements.size() && !StringTools.isEmpty(prevName)) {
+ elementNo.add(cnt);
+ }
+ } else {
+ // NOTE(review): cnt is not reset in this branch, so a positive count is
+ // appended again before every later non-phrase element -- confirm intended.
+ if (cnt > 0) {
+ elementNo.add(cnt);
+ }
+ // an element outside of any phrase always occupies exactly one position
+ elementNo.add(1);
+ loopCnt++;
+ }
+ }
+ }
+
+ /**
+ * Creates a pattern rule like the six-argument constructor and additionally
+ * records whether the rule belongs to a disjunctive set of rules.
+ *
+ * @param isMember
+ * value stored in {@code isMemberOfDisjunctiveSet}; the remaining
+ * parameters are passed unchanged to the six-argument constructor
+ */
+ public PatternRule(final String id, final Language language,
+ final List<Element> elements, final String description,
+ final String message, final String shortMessage, final boolean isMember) {
+ this(id, language, elements, description, message, shortMessage);
+ this.isMemberOfDisjunctiveSet = isMember;
+ }
+
+ /**
+ * @return the sub-id of this rule (there can be more than one rule in a
+ * rule group), or null if none has been set
+ */
+ public final String getSubId() {
+ return subId;
+ }
+
+ /**
+ * @param subId
+ * the sub-id that distinguishes this rule from other rules of the
+ * same rule group
+ */
+ public final void setSubId(final String subId) {
+ this.subId = subId;
+ }
+
+ /**
+ * @return the unformatted message of this rule, as passed to the
+ * constructor or to {@link #setMessage(String)}
+ */
+ public final String getMessage() {
+ return message;
+ }
+
+ /**
+ * Used for testing rules: only one rule of a disjunctive set can match.
+ *
+ * @return true if the rule is a member of a disjunctive set of rules
+ * (generated by phraseref in an includephrases element) and may
+ * therefore legitimately not match.
+ */
+ public final boolean isWithComplexPhrase() {
+ return isMemberOfDisjunctiveSet;
+ }
+
+ /**
+ * Resets the complex status, i.e. marks the rule as not being a member of a
+ * disjunctive set - used for testing.
+ **/
+ public final void notComplexPhrase() {
+ isMemberOfDisjunctiveSet = false;
+ }
+
+ /**
+  * Renders the pattern as a list of its elements.
+  *
+  * @return the {@code toString()} value of every pattern element, joined
+  *         with ", " by {@code StringTools.listToString}
+  * @since 0.9.2
+  */
+ public final String toPatternString() {
+   final List<String> rendered = new ArrayList<String>();
+   final int count = patternElements.size();
+   for (int idx = 0; idx < count; idx++) {
+     rendered.add(patternElements.get(idx).toString());
+   }
+   return StringTools.listToString(rendered, ", ");
+ }
+
+ /**
+  * Returns the rule as an XML string: id, name, pattern (with mark_from,
+  * mark_to and tokens), message and examples. FIXME: this is not complete,
+  * information might be lost!
+  *
+  * @return the XML representation of this rule
+  * @since 0.9.3
+  */
+ public final String toXML() {
+   final StringBuilder sb = new StringBuilder();
+   sb.append("<rule id=\"");
+   sb.append(StringTools.escapeXML(getId()));
+   sb.append("\" name=\"");
+   sb.append(StringTools.escapeXML(getDescription()));
+   sb.append("\">\n");
+   sb.append("<pattern mark_from=\"");
+   sb.append(startPositionCorrection);
+   sb.append("\" mark_to=\"");
+   sb.append(endPositionCorrection);
+   sb.append('"');
+   // for now, case sensitivity is per pattern, not per element,
+   // so just use the setting of the first element:
+   if (!patternElements.isEmpty() && patternElements.get(0).getCaseSensitive()) {
+     sb.append(" case_sensitive=\"yes\"");
+   }
+   sb.append(">\n");
+   for (Element patternElement : patternElements) {
+     sb.append("<token");
+     if (patternElement.getNegation()) {
+       sb.append(" negate=\"yes\"");
+     }
+     if (patternElement.isRegularExpression()) {
+       sb.append(" regexp=\"yes\"");
+     }
+     if (patternElement.getPOStag() != null) {
+       sb.append(" postag=\"");
+       // escaped like the id and name attributes, so that markup characters
+       // in a POS tag cannot break the generated XML
+       sb.append(StringTools.escapeXML(patternElement.getPOStag()));
+       sb.append('"');
+     }
+     if (patternElement.getPOSNegation()) {
+       sb.append(" negate_pos=\"yes\"");
+     }
+     if (patternElement.isInflected()) {
+       sb.append(" inflected=\"yes\"");
+     }
+     sb.append('>');
+     // TODO: an element without a string is currently written as an empty token
+     if (patternElement.getString() != null) {
+       sb.append(StringTools.escapeXML(patternElement.getString()));
+     }
+     sb.append("</token>\n");
+   }
+   sb.append("</pattern>\n");
+   sb.append("<message>");
+   sb.append(StringTools.escapeXML(message));
+   sb.append("</message>\n");
+   if (getIncorrectExamples() != null) {
+     for (IncorrectExample example : getIncorrectExamples()) {
+       sb.append("<example type=\"incorrect\">");
+       sb.append(StringTools.escapeXML(example.getExample()));
+       sb.append("</example>\n");
+     }
+   }
+   if (getCorrectExamples() != null) {
+     for (String example : getCorrectExamples()) {
+       sb.append("<example type=\"correct\">");
+       sb.append(StringTools.escapeXML(example));
+       sb.append("</example>\n");
+     }
+   }
+   sb.append("</rule>");
+   return sb.toString();
+ }
+
+ /**
+ * @param message
+ * the new unformatted message of this rule; it replaces the message
+ * given to the constructor
+ */
+ public final void setMessage(final String message) {
+ this.message = message;
+ }
+
+ /**
+ * Matches the pattern against the non-whitespace tokens of a sentence,
+ * trying successive token positions as the start of a match.
+ *
+ * @param text
+ * the analyzed sentence to check
+ * @return the rule matches found, possibly an empty array
+ * @throws IOException
+ * propagated from the creation of a rule match
+ */
+ @Override
+ public final RuleMatch[] match(final AnalyzedSentence text)
+ throws IOException {
+ final List<RuleMatch> ruleMatches = new ArrayList<RuleMatch>();
+ final AnalyzedTokenReadings[] tokens = text.getTokensWithoutWhitespace();
+ // tokenPositions[n] receives (number of skipped tokens + 1) for the n-th
+ // matched pattern element
+ final int[] tokenPositions = new int[tokens.length + 1];
+ final int patternSize = patternElements.size();
+ // start positions at or beyond limit leave too few tokens for the pattern
+ final int limit = Math.max(0, tokens.length - patternSize + 1);
+ // declared outside the loops: the element of the previous step is handed
+ // to testAllReadings() as prevElement, also across start positions
+ Element elem = null;
+ int i = 0;
+ // when sentStart is set, only start position 0 is tried
+ while (i < limit && !(sentStart && i > 0)) {
+ boolean allElementsMatch = false;
+ int firstMatchToken = -1;
+ int lastMatchToken = -1;
+ int matchingTokens = 0;
+ int prevSkipNext = 0;
+ // this variable keeps the total number
+ // of tokens skipped
+ int skipShiftTotal = 0;
+ if (testUnification) {
+ unifier.reset();
+ }
+ for (int k = 0; k < patternSize; k++) {
+ final Element prevElement = elem;
+ elem = patternElements.get(k);
+ setupRef(firstMatchToken, elem, tokens);
+ // first token position at which element k may match
+ final int nextPos = i + k + skipShiftTotal;
+ prevMatched = false;
+ // a negative skip value, or one reaching past the sentence, is clamped
+ // to the number of tokens remaining after nextPos
+ if (prevSkipNext + nextPos >= tokens.length || prevSkipNext < 0) { // SENT_END?
+ prevSkipNext = tokens.length - (nextPos + 1);
+ }
+ // last candidate position: bounded by the allowed skip and by the
+ // tokens that the remaining pattern elements still need
+ final int maxTok = Math.min(nextPos + prevSkipNext, tokens.length - (patternSize - k));
+ for (int m = nextPos; m <= maxTok; m++) {
+ allElementsMatch = testAllReadings(tokens, elem, prevElement, m,
+ firstMatchToken, prevSkipNext);
+ if (allElementsMatch) {
+ lastMatchToken = m;
+ final int skipShift = lastMatchToken - nextPos;
+ tokenPositions[matchingTokens] = skipShift + 1;
+ // the skip allowance of this element applies to the next element
+ prevSkipNext = translateElementNo(elem.getSkipNext());
+ matchingTokens++;
+ skipShiftTotal += skipShift;
+ if (firstMatchToken == -1) {
+ firstMatchToken = lastMatchToken;
+ }
+ break;
+ }
+ }
+ // one element without a matching token fails this start position
+ if (!allElementsMatch) {
+ break;
+ }
+ }
+
+ if (allElementsMatch && matchingTokens == patternSize) {
+ final RuleMatch rM = createRuleMatch(tokenPositions, tokens,
+ firstMatchToken, lastMatchToken, matchingTokens);
+ // createRuleMatch() returns null when no valid span could be built
+ if (rM != null) {
+ ruleMatches.add(rM);
+ }
+ }
+ i++;
+ }
+ return ruleMatches.toArray(new RuleMatch[ruleMatches.size()]);
+ }
+
+ /**
+ * Builds the rule match for a successfully matched pattern: formats the
+ * message, applies the start/end position corrections and computes the
+ * character span of the match.
+ *
+ * @param tokenPositions
+ * per matched element, the number of skipped tokens plus one
+ * @param tokens
+ * the non-whitespace tokens of the sentence
+ * @param firstMatchToken
+ * index of the first matched token
+ * @param lastMatchToken
+ * index of the last matched token
+ * @param matchingTokens
+ * number of matched pattern elements
+ * @return the rule match, or null if the computed start position is not
+ * before the computed end position
+ * @throws IOException
+ * propagated from message formatting
+ */
+ private RuleMatch createRuleMatch(final int[] tokenPositions,
+ final AnalyzedTokenReadings[] tokens, final int firstMatchToken,
+ final int lastMatchToken, final int matchingTokens) throws IOException {
+ final String errMessage = formatMatches(tokens, tokenPositions,
+ firstMatchToken, message);
+ // token offset to add to firstMatchToken: the per-element distances are
+ // accumulated up to the element at index startPositionCorrection
+ int correctedStPos = 0;
+ if (startPositionCorrection > 0) {
+ for (int l = 0; l <= startPositionCorrection; l++) {
+ correctedStPos += tokenPositions[l];
+ }
+ correctedStPos--;
+ }
+ // token offset (zero or negative) to add to lastMatchToken: the distances
+ // of the last -endPositionCorrection matched elements are subtracted
+ int correctedEndPos = 0;
+ if (endPositionCorrection < 0) {
+ int l = 0;
+ while (l > endPositionCorrection) {
+ correctedEndPos -= tokenPositions[matchingTokens + l - 1];
+ l--;
+ }
+ }
+ AnalyzedTokenReadings firstMatchTokenObj = tokens[firstMatchToken
+ + correctedStPos];
+ // a match that converts case itself suppresses the uppercase flag
+ boolean startsWithUppercase = StringTools
+ .startsWithUppercase(firstMatchTokenObj.getToken())
+ && !matchConvertsCase();
+
+ if (firstMatchTokenObj.isSentStart()
+ && tokens.length > firstMatchToken + correctedStPos + 1) {
+ // make uppercasing work also at sentence start:
+ firstMatchTokenObj = tokens[firstMatchToken + correctedStPos + 1];
+ startsWithUppercase = StringTools.startsWithUppercase(firstMatchTokenObj
+ .getToken());
+ }
+ int fromPos = tokens[firstMatchToken + correctedStPos].getStartPos();
+ // FIXME: this is fishy, assumes that comma should always come before
+ // whitespace
+ // if a suggestion starts with a comma, the match starts right after the
+ // end of the preceding token
+ if (errMessage.contains(SUGG_TAG + ",")
+ && firstMatchToken + correctedStPos >= 1) {
+ fromPos = tokens[firstMatchToken + correctedStPos - 1].getStartPos()
+ + tokens[firstMatchToken + correctedStPos - 1].getToken().length();
+ }
+
+ // the match ends after the last character of the (corrected) last token
+ final int toPos = tokens[lastMatchToken + correctedEndPos].getStartPos()
+ + tokens[lastMatchToken + correctedEndPos].getToken().length();
+ if (fromPos < toPos) { // this can happen with some skip="-1" when the last
+ // token is not matched
+ return new RuleMatch(this, fromPos, toPos,
+ errMessage, shortMessage, startsWithUppercase);
+ } // failed to create any rule match...
+ return null;
+ }
+
+ /**
+  * Checks if the suggestion starts with a match that is supposed to convert
+  * case. If it does, stop the default conversion to uppercase.
+  *
+  * @return true if the message contains a suggestion whose first character
+  *         is a backslash and at least one suggestion match that is not
+  *         message-only converts case; false otherwise, in particular when
+  *         the message contains no suggestion at all.
+  */
+ private boolean matchConvertsCase() {
+   if (suggestionMatches == null || suggestionMatches.isEmpty()) {
+     return false;
+   }
+   final int tagPos = message.indexOf(SUGG_TAG);
+   if (tagPos < 0) {
+     // no suggestion in the message: without this guard the index below
+     // would point at an unrelated character or past the end of the message
+     return false;
+   }
+   final int sugStart = tagPos + SUGG_TAG.length();
+   // the check does not depend on the individual match, so do it only once
+   if (sugStart >= message.length() || message.charAt(sugStart) != '\\') {
+     return false;
+   }
+   for (Match sMatch : suggestionMatches) {
+     if (!sMatch.isInMessageOnly() && sMatch.convertsCase()) {
+       return true;
+     }
+   }
+   return false;
+ }
+
+ /**
+ * Appends a match to the list of formatted suggestion elements; the list is
+ * created on first use.
+ *
+ * @param m
+ * the match to append
+ */
+ public final void addSuggestionMatch(final Match m) {
+ if (suggestionMatches == null) {
+ suggestionMatches = new ArrayList<Match>();
+ }
+ suggestionMatches.add(m);
+ }
+
+ /**
+  * Translates an element count as written in the XML file into a count of
+  * pattern elements, by summing the first {@code i} entries of
+  * {@code elementNo}. Without phrases in the rule, and for negative values,
+  * the argument is returned unchanged.
+  *
+  * @param i
+  *          Element count (or a negative value) to translate.
+  * @return int The translated value.
+  */
+ private int translateElementNo(final int i) {
+   if (useList && i >= 0) {
+     int total = 0;
+     int pos = 0;
+     while (pos < i) {
+       total += elementNo.get(pos);
+       pos++;
+     }
+     return total;
+   }
+   return i;
+ }
+
+ /**
+ * Returns true when the token in the rule references a phrase composed of
+ * many tokens.
+ *
+ * @param i
+ * The index of the token.
+ * @return true if the phrase is under the index, false otherwise.
+ **/
+ private int phraseLen(final int i) {
+ if (!useList || i > (elementNo.size() - 1)) {
+ return 1;
+ }
+ return elementNo.get(i);
+ }
+
+ /**
+  * Creates a Cartesian product of the arrays stored in the input array. Each
+  * combination is rendered as one string; between two neighbouring parts the
+  * result of {@code StringTools.addSpace} for the following part is inserted.
+  *
+  * @param input
+  *          Array of string arrays to combine.
+  * @param output
+  *          Work array of strings, holds the combination being built.
+  * @param r
+  *          Starting parameter (use 0 to get all combinations).
+  * @param lang
+  *          Text language for adding spaces in some languages.
+  * @return Array of combined strings.
+  */
+ private static String[] combineLists(final String[][] input,
+     final String[] output, final int r, final Language lang) {
+   final List<String> combined = new ArrayList<String>();
+   if (r != input.length) {
+     // choose every alternative for slot r and recurse into the next slot
+     for (final String alternative : input[r]) {
+       output[r] = alternative;
+       combined.addAll(Arrays.asList(combineLists(input, output, r + 1, lang)));
+     }
+     return combined.toArray(new String[combined.size()]);
+   }
+   // all slots are filled: join the current combination
+   final StringBuilder joined = new StringBuilder();
+   final int last = output.length - 1;
+   for (int pos = 0; pos <= last; pos++) {
+     joined.append(output[pos]);
+     if (pos < last) {
+       joined.append(StringTools.addSpace(output[pos + 1], lang));
+     }
+   }
+   combined.add(joined.toString());
+   return combined.toArray(new String[combined.size()]);
+ }
+
+ /**
+  * Concatenates the matches, and takes care of phrases (including inflection
+  * using synthesis).
+  *
+  * @param start
+  *          Index of the match in the list of suggestion matches.
+  * @param index
+  *          The index of the element, used to look up the phrase length.
+  * @param tokenIndex
+  *          The position of the token; the token at {@code tokenIndex - 1}
+  *          is handed to the match.
+  * @param tokens
+  *          Array of AnalyzedTokenReadings
+  * @param nextTokenPos
+  *          Position used to compute the number of skipped tokens that is
+  *          handed to the match.
+  * @return Array of concatenated strings, or null if there is no match
+  *         stored at {@code start}.
+  * @throws IOException
+  *           in case disk operations (used in synthesizer) go wrong.
+  */
+ private String[] concatMatches(final int start, final int index,
+     final int tokenIndex, final AnalyzedTokenReadings[] tokens,
+     final int nextTokenPos)
+     throws IOException {
+   final Match suggestion = suggestionMatches.get(start);
+   if (suggestion == null) {
+     return null;
+   }
+   final int len = phraseLen(index);
+   if (len == 1) {
+     suggestion.setToken(tokens, tokenIndex - 1, nextTokenPos - tokenIndex);
+     suggestion.setSynthesizer(language.getSynthesizer());
+     return suggestion.toFinalString();
+   }
+   // a phrase: format each of its tokens, then build all combinations
+   final List<String[]> perToken = new ArrayList<String[]>();
+   for (int offset = 0; offset < len; offset++) {
+     suggestion.setToken(tokens, tokenIndex - 1 + offset,
+         nextTokenPos - (tokenIndex + offset));
+     suggestion.setSynthesizer(language.getSynthesizer());
+     perToken.add(suggestion.toFinalString());
+   }
+   return combineLists(perToken.toArray(new String[perToken.size()][]),
+       new String[perToken.size()], 0, language);
+ }
+
+ /**
+  * Replace back references generated with &lt;match&gt; and \\1 in message
+  * using Match class, and take care of skipping. A back reference is a
+  * backslash followed by a number; a backslash at index 0 of the message is
+  * not treated as a reference.
+  *
+  * @param tokenReadings
+  *          Array of AnalyzedTokenReadings that were matched against the
+  *          pattern
+  * @param positions
+  *          Array of relative positions of matched tokens
+  * @param firstMatchTok
+  *          Position of the first matched token
+  * @param errorMsg
+  *          String containing suggestion markup
+  * @return String Formatted message.
+  * @throws IOException
+  *           in case the synthesis of a match fails
+  **/
+ private String formatMatches(final AnalyzedTokenReadings[] tokenReadings,
+     final int[] positions, final int firstMatchTok, final String errorMsg)
+     throws IOException {
+   String errorMessage = errorMsg;
+   int matchCounter = 0;
+   final int[] numbersToMatches = new int[errorMsg.length()];
+   boolean newWay = false;
+   int errLen = errorMessage.length();
+   int errMarker = errorMessage.indexOf('\\');
+   boolean numberFollows = false;
+   if (errMarker > 0 && errMarker < errLen - 1) {
+     numberFollows = StringTools.isPositiveNumber(errorMessage
+         .charAt(errMarker + 1));
+   }
+   while (errMarker > 0 && numberFollows) {
+     final int ind = errorMessage.indexOf('\\');
+     if (ind > 0 && StringTools.isPositiveNumber(errorMessage.charAt(ind + 1))) {
+       // determine the length of "\<number>", including the backslash
+       int numLen = 1;
+       while (ind + numLen < errorMessage.length()
+           && StringTools.isPositiveNumber(errorMessage.charAt(ind + numLen))) {
+         numLen++;
+       }
+       // zero-based number of the referenced element
+       final int j = Integer.parseInt(errorMessage.substring(ind + 1, ind
+           + numLen)) - 1;
+       int repTokenPos = 0;
+       int nextTokenPos = 0;
+       for (int l = 0; l <= j; l++) {
+         repTokenPos += positions[l];
+       }
+       // positions[j + 1] is only read if it exists (the previous check,
+       // j <= positions.length, still allowed an index past the array end)
+       if (j + 1 < positions.length) {
+         nextTokenPos = firstMatchTok + repTokenPos + positions[j + 1];
+       }
+       if (suggestionMatches != null) {
+         if (matchCounter < suggestionMatches.size()) {
+           numbersToMatches[j] = matchCounter;
+           if (suggestionMatches.get(matchCounter) != null) {
+             final String[] matches = concatMatches(matchCounter, j,
+                 firstMatchTok + repTokenPos, tokenReadings, nextTokenPos);
+             final String leftSide = errorMessage.substring(0, ind);
+             final String rightSide = errorMessage.substring(ind + numLen);
+             if (matches.length == 1) {
+               errorMessage = leftSide + matches[0] + rightSide;
+             } else {
+               errorMessage = formatMultipleSynthesis(matches, leftSide,
+                   rightSide);
+             }
+             matchCounter++;
+             newWay = true;
+           }
+         } else {
+           // FIXME: is this correct? this is how we deal with multiple matches
+           suggestionMatches.add(suggestionMatches.get(numbersToMatches[j]));
+         }
+       }
+
+       if (!newWay) {
+         // in case <match> elements weren't used (yet)
+         errorMessage = errorMessage.replace("\\" + (j + 1),
+             tokenReadings[firstMatchTok + repTokenPos - 1].getToken());
+       }
+     }
+     // look for the next back reference in the updated message
+     errMarker = errorMessage.indexOf('\\');
+     numberFollows = false;
+     errLen = errorMessage.length();
+     if (errMarker > 0 && errMarker < errLen - 1) {
+       numberFollows = StringTools.isPositiveNumber(errorMessage
+           .charAt(errMarker + 1));
+     }
+   }
+   return errorMessage;
+ }
+
+ /**
+  * Builds the message for a back reference that was synthesized into several
+  * alternatives. The text between the last suggestion start tag of the left
+  * side and the reference, and the text between the reference and the first
+  * suggestion end tag of the right side, is repeated around every
+  * alternative, so that each alternative becomes a complete suggestion.
+  *
+  * @param matches
+  *          the alternative replacements for the back reference
+  * @param leftSide
+  *          the message text before the back reference
+  * @param rightSide
+  *          the message text after the back reference
+  * @return the message with all alternatives inserted
+  */
+ private static String formatMultipleSynthesis(final String[] matches,
+     final String leftSide, final String rightSide) {
+   String errorMessage = "";
+   String suggestionLeft = "";
+   String suggestionRight = "";
+   String rightSideNew = rightSide;
+   final int sPos = leftSide.lastIndexOf(SUGG_TAG);
+   // ">= 0", not "> 0": lastIndexOf() returns -1 when the tag is absent, and
+   // a message may begin with the suggestion tag (index 0)
+   if (sPos >= 0) {
+     suggestionLeft = leftSide.substring(sPos + SUGG_TAG.length());
+   }
+   if (StringTools.isEmpty(suggestionLeft)) {
+     errorMessage = leftSide;
+   } else {
+     // cut the shared prefix off; it is re-added before every alternative
+     errorMessage = leftSide.substring(0, sPos) + SUGG_TAG;
+   }
+   final int rPos = rightSide.indexOf(END_SUGG_TAG);
+   if (rPos > 0) {
+     suggestionRight = rightSide.substring(0, rPos);
+   }
+   if (!StringTools.isEmpty(suggestionRight)) {
+     rightSideNew = rightSide.substring(rPos);
+   }
+   // NOTE(review): despite its name this is the position of the FIRST end
+   // tag in the left side (indexOf) -- confirm that this is intended
+   final int lastLeftSugEnd = leftSide.indexOf(END_SUGG_TAG);
+   final int lastLeftSugStart = sPos;
+   final StringBuilder sb = new StringBuilder();
+   sb.append(errorMessage);
+   for (int z = 0; z < matches.length; z++) {
+     sb.append(suggestionLeft);
+     sb.append(matches[z]);
+     sb.append(suggestionRight);
+     // separate the alternatives only if the first end tag of the left side
+     // lies before its last start tag
+     if ((z < matches.length - 1) && lastLeftSugEnd < lastLeftSugStart) {
+       sb.append(END_SUGG_TAG);
+       sb.append(", ");
+       sb.append(SUGG_TAG);
+     }
+   }
+   sb.append(rightSideNew);
+   return sb.toString();
+ }
+
+ /**
+ * For testing only.
+ *
+ * @return the list of pattern elements of this rule (the internal list,
+ * not a copy)
+ */
+ public final List<Element> getElements() {
+ return patternElements;
+ }
+
+}
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/PatternRuleLoader.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/PatternRuleLoader.java
new file mode 100644
index 0000000..8156a6e
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/PatternRuleLoader.java
@@ -0,0 +1,369 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+package de.danielnaber.languagetool.rules.patterns;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+
+import javax.xml.parsers.SAXParser;
+import javax.xml.parsers.SAXParserFactory;
+
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+import de.danielnaber.languagetool.JLanguageTool;
+import de.danielnaber.languagetool.Language;
+import de.danielnaber.languagetool.rules.Category;
+import de.danielnaber.languagetool.rules.IncorrectExample;
+
+/**
+ * Loads {@link PatternRule}s from an XML file.
+ *
+ * @author Daniel Naber
+ */
+ public class PatternRuleLoader extends DefaultHandler {
+
+   /**
+    * Parses the given XML stream and returns the pattern rules it defines.
+    * Loading of external DTDs is switched off for the parser.
+    *
+    * @param is
+    *          stream with the XML rule definitions; it is not closed here
+    * @param filename
+    *          name of the source, used only in the error message
+    * @return the rules collected by the handler
+    * @throws IOException
+    *           if the stream cannot be read or parsed; the original exception
+    *           is attached as the cause
+    */
+   public final List<PatternRule> getRules(final InputStream is,
+       final String filename) throws IOException {
+     try {
+       final PatternRuleHandler handler = new PatternRuleHandler();
+       final SAXParserFactory factory = SAXParserFactory.newInstance();
+       final SAXParser saxParser = factory.newSAXParser();
+       saxParser.getXMLReader().setFeature(
+           "http://apache.org/xml/features/nonvalidating/load-external-dtd",
+           false);
+       saxParser.parse(is, handler);
+       return handler.getRules();
+     } catch (final Exception e) {
+       final IOException ioe = new IOException("Cannot load or parse '"
+           + filename + "'");
+       ioe.initCause(e);
+       throw ioe;
+     }
+   }
+
+   /**
+    * Testing only: loads the German grammar rules and prints them. The method
+    * is static so that the class can actually be started from the command
+    * line; as an instance method it was not a valid entry point.
+    *
+    * @param args
+    *          ignored
+    * @throws IOException
+    *           if the rules cannot be loaded
+    */
+   public static void main(final String[] args) throws IOException {
+     final PatternRuleLoader prg = new PatternRuleLoader();
+     final String name = "/de/grammar.xml";
+     final List<PatternRule> l = prg.getRules(JLanguageTool.getDataBroker().getFromRulesDirAsStream(name), name);
+     System.out.println(l);
+   }
+
+ }
+
+class PatternRuleHandler extends XMLRuleHandler {
+
+ private int subId;
+
+ private boolean defaultOff;
+ private boolean defaultOn;
+
+ private Category category;
+ private String description;
+ private String ruleGroupDescription;
+
+ // ===========================================================
+ // SAX DocumentHandler methods
+ // ===========================================================
+
+ /**
+  * Handles the start of an XML element: sets the state flags and creates the
+  * buffers that the other SAX callbacks of this handler use to build rules.
+  *
+  * @param namespaceURI
+  *          not used
+  * @param lName
+  *          not used
+  * @param qName
+  *          qualified name of the element, decides what is done
+  * @param attrs
+  *          attributes of the element
+  * @throws SAXException
+  *           if the rules element names an unknown language
+  */
+ @Override
+ public void startElement(final String namespaceURI, final String lName,
+     final String qName, final Attributes attrs) throws SAXException {
+   if ("category".equals(qName)) {
+     final String catName = attrs.getValue("name");
+     final String priorityStr = attrs.getValue("priority");
+     if (priorityStr == null) {
+       category = new Category(catName);
+     } else {
+       category = new Category(catName, Integer.parseInt(priorityStr));
+     }
+
+     if ("off".equals(attrs.getValue(DEFAULT))) {
+       category.setDefaultOff();
+     }
+
+   } else if ("rules".equals(qName)) {
+     final String languageStr = attrs.getValue("lang");
+     language = Language.getLanguageForShortName(languageStr);
+     if (language == null) {
+       throw new SAXException("Unknown language '" + languageStr + "'");
+     }
+   } else if ("rule".equals(qName)) {
+     id = attrs.getValue("id");
+     if (inRuleGroup) {
+       subId++;
+     }
+     // inside a rule group, a default set on the group is kept for its rules
+     if (!(inRuleGroup && defaultOff)) {
+       defaultOff = "off".equals(attrs.getValue(DEFAULT));
+     }
+
+     if (!(inRuleGroup && defaultOn)) {
+       defaultOn = "on".equals(attrs.getValue(DEFAULT));
+     }
+     // rules of a group inherit id and name of the group if they have none
+     if (inRuleGroup && id == null) {
+       id = ruleGroupId;
+     }
+     description = attrs.getValue("name");
+     if (inRuleGroup && description == null) {
+       description = ruleGroupDescription;
+     }
+     correctExamples = new ArrayList<String>();
+     incorrectExamples = new ArrayList<IncorrectExample>();
+     if (suggestionMatches != null) {
+       suggestionMatches.clear();
+     }
+     // start every rule with an empty short message; the buffer is otherwise
+     // only replaced when a <short> element starts, so a rule without one
+     // would reuse the text collected for an earlier rule
+     shortMessage = new StringBuilder();
+   } else if (PATTERN.equals(qName)) {
+     startPattern(attrs);
+   } else if (AND.equals(qName)) {
+     inAndGroup = true;
+   } else if ("unify".equals(qName)) {
+     inUnification = true;
+     uniNegation = YES.equals(attrs.getValue(NEGATE));
+   } else if ("feature".equals(qName)) {
+     uFeature = attrs.getValue("id");
+   } else if (qName.equals(TYPE)) {
+     uType = attrs.getValue("id");
+     uTypeList.add(uType);
+   } else if (qName.equals(TOKEN)) {
+     setToken(attrs);
+   } else if (EXCEPTION.equals(qName)) {
+     setExceptions(attrs);
+   } else if (qName.equals(EXAMPLE)
+       && "correct".equals(attrs.getValue(TYPE))) {
+     // literal-first comparison: getValue() returns null for an example
+     // without a type attribute, which is then skipped like an unknown type
+     inCorrectExample = true;
+     correctExample = new StringBuilder();
+   } else if (qName.equals(EXAMPLE)
+       && "incorrect".equals(attrs.getValue(TYPE))) {
+     inIncorrectExample = true;
+     incorrectExample = new StringBuilder();
+     exampleCorrection = new StringBuilder();
+     if (attrs.getValue("correction") != null) {
+       exampleCorrection.append(attrs.getValue("correction"));
+     }
+   } else if ("message".equals(qName)) {
+     inMessage = true;
+     inSuggestion = false;
+     message = new StringBuilder();
+   } else if ("short".equals(qName)) {
+     inShortMessage = true;
+     shortMessage = new StringBuilder();
+   } else if ("rulegroup".equals(qName)) {
+     ruleGroupId = attrs.getValue("id");
+     ruleGroupDescription = attrs.getValue("name");
+     defaultOff = "off".equals(attrs.getValue(DEFAULT));
+     defaultOn = "on".equals(attrs.getValue(DEFAULT));
+     inRuleGroup = true;
+     subId = 0;
+   } else if ("suggestion".equals(qName) && inMessage) {
+     // the suggestion markup is kept as text inside the message
+     message.append("<suggestion>");
+     inSuggestion = true;
+   } else if ("match".equals(qName)) {
+     setMatchElement(attrs);
+   } else if (qName.equals(MARKER) && inCorrectExample) {
+     correctExample.append("<marker>");
+   } else if (qName.equals(MARKER) && inIncorrectExample) {
+     incorrectExample.append("<marker>");
+   } else if (UNIFICATION.equals(qName)) {
+     uFeature = attrs.getValue("feature");
+     inUnificationDef = true;
+   } else if ("equivalence".equals(qName)) {
+     uType = attrs.getValue(TYPE);
+   } else if (PHRASES.equals(qName)) {
+     inPhrases = true;
+   } else if ("includephrases".equals(qName)) {
+     phraseElementInit();
+   } else if ("phrase".equals(qName) && inPhrases) {
+     phraseId = attrs.getValue("id");
+   } else if ("phraseref".equals(qName) && (attrs.getValue("idref") != null)) {
+     preparePhrase(attrs);
+   }
+ }
+
+ /**
+ * Handles the end of an XML element: finishes the object that belongs to
+ * the element (rule, token, exception, example, ...) and resets the state
+ * flags that were set when the element started.
+ *
+ * @param namespaceURI
+ * not used
+ * @param sName
+ * not used
+ * @param qName
+ * qualified name of the element, decides what is done
+ * @throws SAXException
+ * declared by the SAX interface
+ */
+ @Override
+ public void endElement(final String namespaceURI, final String sName,
+ final String qName) throws SAXException {
+ if ("rule".equals(qName)) {
+ phraseElementInit();
+ if (phraseElementList.isEmpty()) {
+ // no phrases: the collected elements form exactly one rule
+ final PatternRule rule = new PatternRule(id, language, elementList,
+ description, message.toString(), shortMessage.toString());
+ prepareRule(rule);
+ rules.add(rule);
+ } else {
+ // append the remaining elements to every phrase variant
+ if (!elementList.isEmpty()) {
+ for (final ArrayList<Element> ph : phraseElementList) {
+ ph.addAll(new ArrayList<Element>(elementList));
+ }
+ }
+
+ // one rule per phrase variant; with more than one variant the rules
+ // are marked as members of a disjunctive set
+ for (final ArrayList<Element> phraseElement : phraseElementList) {
+ processElement(phraseElement);
+ final PatternRule rule = new PatternRule(id, language, phraseElement,
+ description, message.toString(), shortMessage.toString(),
+ phraseElementList.size() > 1);
+ prepareRule(rule);
+ rules.add(rule);
+ }
+ }
+ elementList.clear();
+ if (phraseElementList != null) {
+ phraseElementList.clear();
+ }
+
+ } else if (qName.equals(EXCEPTION)) {
+ finalizeExceptions();
+ } else if (qName.equals(AND)) {
+ inAndGroup = false;
+ andGroupCounter = 0;
+ tokenCounter++;
+ } else if (qName.equals(TOKEN)) {
+ finalizeTokens();
+ } else if (qName.equals(PATTERN)) {
+ checkMarkPositions();
+ inPattern = false;
+ if (lastPhrase) {
+ elementList.clear();
+ }
+ if (phraseElementList == null || phraseElementList.isEmpty()) {
+ checkPositions(0);
+ } else {
+ for (List<Element> elements : phraseElementList) {
+ checkPositions(elements.size());
+ }
+ }
+ tokenCounter = 0;
+ } else if (qName.equals(EXAMPLE)) {
+ if (inCorrectExample) {
+ correctExamples.add(correctExample.toString());
+ } else if (inIncorrectExample) {
+ IncorrectExample example = null;
+ // the correction attribute may list several corrections separated by '|'
+ final String[] corrections = exampleCorrection.toString().split("\\|");
+ if (corrections.length > 0 && corrections[0].length() > 0) {
+ example = new IncorrectExample(incorrectExample.toString(),
+ corrections);
+ } else {
+ example = new IncorrectExample(incorrectExample.toString());
+ }
+ incorrectExamples.add(example);
+ }
+ // reset the example state for the next example
+ inCorrectExample = false;
+ inIncorrectExample = false;
+ correctExample = new StringBuilder();
+ incorrectExample = new StringBuilder();
+ exampleCorrection = new StringBuilder();
+ } else if ("message".equals(qName)) {
+ suggestionMatches = addLegacyMatches();
+ inMessage = false;
+ } else if ("short".equals(qName)) {
+ inShortMessage = false;
+ } else if ("match".equals(qName)) {
+ // the text collected inside the match element becomes its lemma string
+ if (inMessage) {
+ suggestionMatches.get(suggestionMatches.size() - 1).setLemmaString(
+ match.toString());
+ } else if (inToken) {
+ tokenReference.setLemmaString(match.toString());
+ }
+ inMatch = false;
+ } else if ("rulegroup".equals(qName)) {
+ inRuleGroup = false;
+ } else if ("suggestion".equals(qName) && inMessage) {
+ message.append("</suggestion>");
+ inSuggestion = false;
+ } else if (qName.equals(MARKER) && inCorrectExample) {
+ correctExample.append("</marker>");
+ } else if (qName.equals(MARKER) && inIncorrectExample) {
+ incorrectExample.append("</marker>");
+ } else if ("phrase".equals(qName) && inPhrases) {
+ finalizePhrase();
+ } else if ("includephrases".equals(qName)) {
+ elementList.clear();
+ } else if (PHRASES.equals(qName) && inPhrases) {
+ inPhrases = false;
+ } else if (UNIFICATION.equals(qName)) {
+ inUnificationDef = false;
+ } else if ("feature".equals(qName)) {
+ // store the types collected for the feature and start a new type list
+ equivalenceFeatures.put(uFeature, uTypeList);
+ uTypeList = new ArrayList<String>();
+ } else if ("unify".equals(qName)) {
+ inUnification = false;
+ //clear the features...
+ equivalenceFeatures = new HashMap<String, List<String>>();
+ }
+ }
+
+ private void prepareRule(final PatternRule rule) {
+ rule.setStartPositionCorrection(startPositionCorrection);
+ rule.setEndPositionCorrection(endPositionCorrection);
+ startPositionCorrection = 0;
+ endPositionCorrection = 0;
+ rule.setCorrectExamples(correctExamples);
+ rule.setIncorrectExamples(incorrectExamples);
+ rule.setCategory(category);
+ if (inRuleGroup) {
+ rule.setSubId(Integer.toString(subId));
+ }
+ else {
+ rule.setSubId("1");
+ }
+ caseSensitive = false;
+ if (suggestionMatches != null) {
+ for (final Match m : suggestionMatches) {
+ rule.addSuggestionMatch(m);
+ }
+ if (phraseElementList.size() <= 1) {
+ suggestionMatches.clear();
+ }
+ }
+ if (defaultOff) {
+ rule.setDefaultOff();
+ }
+
+ if (category.isDefaultOff() && !defaultOn) {
+ rule.setDefaultOff();
+ }
+
+ }
+
+ /**
+  * Appends character data to the buffer of the innermost element that is
+  * currently being read. The state flags are checked in a fixed order
+  * (exception, token, correct example, incorrect example, match, message,
+  * short message); text outside of these elements is ignored.
+  *
+  * @param buf
+  *          character buffer supplied by the parser
+  * @param offset
+  *          start of the text in the buffer
+  * @param len
+  *          length of the text
+  */
+ @Override
+ public void characters(final char[] buf, final int offset, final int len) {
+   final String text = String.valueOf(buf, offset, len);
+   if (inException) {
+     exceptions.append(text);
+     return;
+   }
+   if (inToken) {
+     elements.append(text);
+     return;
+   }
+   if (inCorrectExample) {
+     correctExample.append(text);
+     return;
+   }
+   if (inIncorrectExample) {
+     incorrectExample.append(text);
+     return;
+   }
+   if (inMatch) {
+     match.append(text);
+     return;
+   }
+   if (inMessage) {
+     message.append(text);
+     return;
+   }
+   if (inShortMessage) {
+     shortMessage.append(text);
+   }
+ }
+
+}
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/Unifier.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/Unifier.java
new file mode 100644
index 0000000..7fbb35d
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/Unifier.java
@@ -0,0 +1,432 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2006 Daniel Naber (http://www.danielnaber.de)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+
+package de.danielnaber.languagetool.rules.patterns;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import de.danielnaber.languagetool.AnalyzedToken;
+import de.danielnaber.languagetool.AnalyzedTokenReadings;
+
+/**
+ * Implements unification of features over tokens.
+ *
+ * @author Marcin Milkowski
+ */
+public class Unifier {
+
+ //TODO: add a possibility to negate some features but not all
+ /**
+ * Negates the meaning of unification just like negation in Element tokens.
+ */
+ private boolean negation;
+
+ /**
+ * Set to true by {@link #startUnify()} and back to false by {@link #reset()}.
+ * While true, {@link #isSatisfied} delegates to checkNext() instead of
+ * recording new equivalences.
+ */
+ private boolean allFeatsIn;
+
+ /**
+ * Index into {@link #equivalencesMatched}. Starts at -1 and is incremented
+ * by {@link #isSatisfied} while {@link #allFeatsIn} is false.
+ */
+ private int tokCnt;
+
+ /**
+ * Index into {@link #tokSequence} used by checkNext(). Starts at 1 and is
+ * incremented by {@link #startNextToken()}.
+ */
+ private int readingsCounter;
+
+ /**
+ * The tokens collected so far; returned by {@link #getUnifiedTokens()}.
+ */
+ private final List<AnalyzedTokenReadings> tokSequence;
+
+ /**
+ * A Map for storing the equivalence types for features. Every pair of
+ * feature and type (both Strings, wrapped in an EquivalenceTypeLocator) maps
+ * to the Element that tests for this type.
+ */
+ private final Map<EquivalenceTypeLocator, Element> equivalenceTypes;
+
+ /**
+ * A Map that stores all possible equivalence types listed for features.
+ */
+ private final Map<String, List<String>> equivalenceFeatures;
+
+ /**
+ * List (indexed by {@link #tokCnt}) of maps from a feature to the set of its
+ * types matched in the unified sequence.
+ */
+ private final List<Map<String, Set<String>>> equivalencesMatched;
+
+ /**
+ * Marks found interpretations in subsequent tokens.
+ */
+ private List<Boolean> featuresFound;
+
+ /**
+ * For checking the current token.
+ */
+ private List<Boolean> tmpFeaturesFound;
+
+ /**
+ * Internal flag for checking whether the first token in tokSequence has to be
+ * yet unified.
+ */
+ private boolean firstUnified;
+
+ // State of the simple test method isUnified(); cleared by reset().
+ // True once the last reading of the first token was passed to isUnified().
+ private boolean inUnification;
+ // True if any reading of the current token was satisfied so far.
+ private boolean uniMatched;
+ // Copy of uniMatched taken after each reading; the result of isUnified().
+ private boolean uniAllMatched;
+ // Result of getUnifiedTokens() stored after the last reading of a token.
+ private AnalyzedTokenReadings[] unifiedTokens;
+
+ /**
+  * Creates an empty unifier: no equivalences are defined, no tokens are
+  * collected and the counters are at their start values.
+  */
+ public Unifier() {
+   equivalenceTypes = new HashMap<EquivalenceTypeLocator, Element>();
+   equivalenceFeatures = new HashMap<String, List<String>>();
+   equivalencesMatched = new ArrayList<Map<String, Set<String>>>();
+   tokSequence = new ArrayList<AnalyzedTokenReadings>();
+   featuresFound = new ArrayList<Boolean>();
+   tmpFeaturesFound = new ArrayList<Boolean>();
+   readingsCounter = 1;
+   tokCnt = -1;
+ }
+
+ /**
+ * Prepares equivalence types for features to be tested. All equivalence types
+ * are given as {@link Element}s. They create an equivalence set (with
+ * abstraction).
+ *
+ * @param feature
+ * Feature to be tested, like gender, grammatical case or number.
+ * @param type
+ * Type of equivalence for the feature, for example plural, first
+ * person, genitive.
+ * @param elem
+ * Element specifying the equivalence.
+ */
+ public final void setEquivalence(final String feature, final String type,
+ final Element elem) {
+ if (equivalenceTypes.containsKey(new EquivalenceTypeLocator(feature, type))) {
+ return;
+ }
+ equivalenceTypes.put(new EquivalenceTypeLocator(feature, type), elem);
+ List<String> lTypes;
+ if (equivalenceFeatures.containsKey(feature)) {
+ lTypes = equivalenceFeatures.get(feature);
+ } else {
+ lTypes = new ArrayList<String>();
+ }
+ lTypes.add(type);
+ equivalenceFeatures.put(feature, lTypes);
+ }
+
+ /**
+  * Tests if a token reading shares features with other tokens.
+  * <p>
+  * Before {@link #startUnify()} every call records, under a new index, which
+  * equivalence types of the requested features the reading matches. After
+  * {@link #startUnify()} the reading is only compared with the equivalences
+  * recorded earlier.
+  *
+  * @param aToken
+  *          - token reading to be tested
+  * @param uFeatures
+  *          - features to be tested, each mapped to the types of equivalence
+  *          to check; a null or empty type list stands for all types
+  *          registered for the feature with {@link #setEquivalence}
+  * @return true if the token shares the features with other tokens, inverted
+  *         if negation is set. Negation is not applied and false is returned
+  *         if uFeatures is null, if nothing was recorded before
+  *         {@link #startUnify()}, or if, before {@link #startUnify()}, a
+  *         feature or type is tested that was never registered.
+  */
+ protected final boolean isSatisfied(final AnalyzedToken aToken,
+     final Map<String, List<String>> uFeatures) {
+
+   if (allFeatsIn && equivalencesMatched.isEmpty()) {
+     return false;
+   }
+   // Error: no feature given!
+   if (uFeatures == null) {
+     return false; // throw exception??
+   }
+   boolean unified = true;
+
+   if (allFeatsIn) {
+     unified = checkNext(aToken, uFeatures);
+   } else {
+     tokCnt++;
+     while (equivalencesMatched.size() <= tokCnt) {
+       equivalencesMatched.add(new HashMap<String, Set<String>>());
+     }
+     final Map<String, Set<String>> matched = equivalencesMatched.get(tokCnt);
+     for (final Map.Entry<String, List<String>> feat : uFeatures.entrySet()) {
+       List<String> types = feat.getValue();
+       if (types == null || types.isEmpty()) {
+         types = equivalenceFeatures.get(feat.getKey());
+       }
+       if (types == null) {
+         // no type was ever registered for this feature: handle it like an
+         // unknown type below instead of failing with a NullPointerException
+         return false;
+       }
+       for (final String typename : types) {
+         final Element testElem = equivalenceTypes
+             .get(new EquivalenceTypeLocator(feat.getKey(), typename));
+         if (testElem == null) {
+           return false;
+         }
+         if (testElem.isMatched(aToken)) {
+           Set<String> typeSet = matched.get(feat.getKey());
+           if (typeSet == null) {
+             typeSet = new HashSet<String>();
+             matched.put(feat.getKey(), typeSet);
+           }
+           typeSet.add(typename);
+         }
+       }
+       // every requested feature needs at least one matching type
+       unified &= matched.containsKey(feat.getKey());
+       if (!unified) {
+         break;
+       }
+     }
+     if (unified) {
+       if (tokCnt == 0 || tokSequence.isEmpty()) {
+         tokSequence.add(new AnalyzedTokenReadings(aToken, 0));
+       } else {
+         tokSequence.get(0).addReading(aToken);
+       }
+     }
+   }
+   return unified ^ negation;
+ }
+
+ /**
+  * Compares a reading with the equivalences recorded before
+  * {@link #startUnify()}. Only called by {@link #isSatisfied} while
+  * allFeatsIn is true.
+  * <p>
+  * For every recorded index the reading has to match, for each feature, at
+  * least one of the types recorded there. If this holds for any index the
+  * reading is added to the token at position readingsCounter of tokSequence
+  * and tmpFeaturesFound is replaced by the per-index results.
+  *
+  * @param aToken
+  *          token reading to be tested
+  * @param uFeatures
+  *          features to be tested, see {@link #isSatisfied}; a feature
+  *          without registered types never unifies
+  * @return true if the reading unifies with at least one recorded index
+  */
+ private boolean checkNext(final AnalyzedToken aToken,
+     final Map<String, List<String>> uFeatures) {
+   boolean anyFeatUnified = false;
+   final List<Boolean> tokenFeaturesFound = new ArrayList<Boolean>(tmpFeaturesFound);
+   for (int i = 0; i <= tokCnt; i++) {
+     boolean allFeatsUnified = true;
+     for (final Map.Entry<String, List<String>> feat : uFeatures.entrySet()) {
+       boolean featUnified = false;
+       List<String> types = feat.getValue();
+       if (types == null || types.isEmpty()) {
+         types = equivalenceFeatures.get(feat.getKey());
+       }
+       // types is null for a feature that was never registered; such a
+       // feature cannot be unified
+       if (types != null) {
+         for (final String typename : types) {
+           if (featuresFound.get(i)
+               && equivalencesMatched.get(i).containsKey(feat.getKey())
+               && equivalencesMatched.get(i).get(feat.getKey()).contains(typename)) {
+             final Element testElem = equivalenceTypes
+                 .get(new EquivalenceTypeLocator(feat.getKey(), typename));
+             featUnified = featUnified || testElem.isMatched(aToken);
+           }
+         }
+       }
+       allFeatsUnified &= featUnified;
+     }
+     tokenFeaturesFound.set(i, allFeatsUnified);
+     anyFeatUnified = anyFeatUnified || allFeatsUnified;
+   }
+   if (anyFeatUnified) {
+     if (tokSequence.size() == readingsCounter) {
+       tokSequence.add(new AnalyzedTokenReadings(aToken, 0));
+     } else {
+       tokSequence.get(readingsCounter).addReading(aToken);
+     }
+     tmpFeaturesFound = tokenFeaturesFound;
+   }
+   return anyFeatUnified;
+ }
+
+ /**
+  * Call after every complete token (AnalyzedTokenReadings) checked: moves on
+  * to the next position of the token sequence and takes over the results
+  * collected for the token just finished.
+  */
+ public final void startNextToken() {
+   readingsCounter++;
+   featuresFound = new ArrayList<Boolean>(tmpFeaturesFound);
+ }
+
+ /**
+ * Starts testing only those equivalences that were previously matched.
+ */
+ public final void startUnify() {
+ allFeatsIn = true;
+ for (int i = 0; i <= tokCnt; i++) {
+ featuresFound.add(true);
+ }
+ tmpFeaturesFound = new ArrayList<Boolean>(featuresFound);
+ }
+
+ /**
+  * Switches negation of the unification result on or off.
+  *
+  * @param neg
+  *          true if the result of the test is to be inverted
+  */
+ public final void setNegation(final boolean neg) {
+   this.negation = neg;
+ }
+
+ /**
+  * @return true if the result of the unification test is inverted
+  */
+ public final boolean getNegation() {
+   return this.negation;
+ }
+
+ /**
+  * Resets after use of unification. Required. The registered equivalence
+  * types are kept; everything collected for the last sequence is dropped.
+  */
+ public final void reset() {
+   // collected data
+   equivalencesMatched.clear();
+   featuresFound.clear();
+   tmpFeaturesFound.clear();
+   tokSequence.clear();
+   // counters
+   tokCnt = -1;
+   readingsCounter = 1;
+   // flags
+   allFeatsIn = false;
+   negation = false;
+   firstUnified = false;
+   inUnification = false;
+   uniMatched = false;
+   uniAllMatched = false;
+ }
+
+ /**
+  * Gets a full sequence of filtered tokens. On the first successful call the
+  * first token of the sequence is reduced to the readings that are still
+  * marked as found.
+  *
+  * @return Array of AnalyzedTokenReadings that match equivalence relation
+  *         defined for features tested, or null if no token was collected or
+  *         no reading is marked as found.
+  */
+ public final AnalyzedTokenReadings[] getUnifiedTokens() {
+   if (tokSequence.isEmpty()) {
+     return null;
+   }
+   if (!firstUnified) {
+     // index of the first reading that is still marked as found
+     final int first = tmpFeaturesFound.indexOf(Boolean.TRUE);
+     if (first < 0) {
+       return null;
+     }
+     // FIXME: why this happens??
+     final AnalyzedTokenReadings firstToken = tokSequence.get(0);
+     final int numRead = firstToken.getReadingsLength();
+     if (first < numRead) {
+       final AnalyzedTokenReadings filtered = new AnalyzedTokenReadings(
+           firstToken.getAnalyzedToken(first), 0);
+       final int lastIndex = Math.min(numRead - 1, tokCnt);
+       for (int i = first + 1; i <= lastIndex; i++) {
+         if (tmpFeaturesFound.get(i)) {
+           filtered.addReading(firstToken.getAnalyzedToken(i));
+         }
+       }
+       tokSequence.set(0, filtered);
+     }
+     firstUnified = true;
+   }
+   return tokSequence.toArray(new AnalyzedTokenReadings[tokSequence.size()]);
+ }
+
+ /**
+  * Tests if the token sequence is unified. The readings of the first token
+  * are only recorded and always accepted; the readings of all following
+  * tokens are compared with what was recorded.
+  *
+  * @param matchToken
+  *          AnalyzedToken token to unify
+  * @param uFeatures
+  *          features to unify over, each mapped to the value types of the
+  *          feature
+  * @param isUniNegated
+  *          if true, then return negated result
+  * @param lastReading
+  *          true when the matchToken is the last reading in the
+  *          AnalyzedReadings
+  * @return True if the tokens in the sequence are unified.
+  */
+ public final boolean isUnified(final AnalyzedToken matchToken,
+     final Map<String, List<String>> uFeatures, final boolean isUniNegated,
+     final boolean lastReading) {
+   if (!inUnification) {
+     // first token: record its equivalences, the outcome is not used
+     if (isUniNegated) {
+       setNegation(true);
+     }
+     isSatisfied(matchToken, uFeatures);
+     if (lastReading) {
+       inUnification = true;
+       uniMatched = false;
+       startUnify();
+     }
+     return true;
+   }
+   // following tokens: one satisfied reading is enough for the token
+   uniMatched |= isSatisfied(matchToken, uFeatures);
+   uniAllMatched = uniMatched;
+   if (lastReading) {
+     startNextToken();
+     unifiedTokens = getUnifiedTokens();
+     uniMatched = false;
+   }
+   return uniAllMatched;
+ }
+
+ /**
+  * Used for getting a unified sequence in case when simple test method
+  * {@link #isUnified} was used.
+  *
+  * @return An array of {@link AnalyzedTokenReadings}, or null as long as
+  *         {@link #isUnified} has not finished the first token
+  */
+ public final AnalyzedTokenReadings[] getFinalUnified() {
+   return inUnification ? unifiedTokens : null;
+ }
+}
+
+/**
+ * Immutable pair of a feature name and a type name. Two locators are equal if
+ * both names are equal, so the pair can be used as a key of a hash map.
+ */
+class EquivalenceTypeLocator {
+
+  private final String feature;
+  private final String type;
+
+  EquivalenceTypeLocator(final String feature, final String type) {
+    this.feature = feature;
+    this.type = type;
+  }
+
+  @Override
+  public int hashCode() {
+    final int featureHash = (feature == null) ? 0 : feature.hashCode();
+    final int typeHash = (type == null) ? 0 : type.hashCode();
+    return 31 * (31 + featureHash) + typeHash;
+  }
+
+  @Override
+  public boolean equals(final Object obj) {
+    if (this == obj) {
+      return true;
+    }
+    if (obj == null || getClass() != obj.getClass()) {
+      return false;
+    }
+    final EquivalenceTypeLocator that = (EquivalenceTypeLocator) obj;
+    return sameText(feature, that.feature) && sameText(type, that.type);
+  }
+
+  /** Null-safe comparison of two strings. */
+  private static boolean sameText(final String left, final String right) {
+    return (left == null) ? right == null : left.equals(right);
+  }
+
+} \ No newline at end of file
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/XMLRuleHandler.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/XMLRuleHandler.java
new file mode 100644
index 0000000..72a852a
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/XMLRuleHandler.java
@@ -0,0 +1,568 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2006 Daniel Naber (http://www.danielnaber.de)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+package de.danielnaber.languagetool.rules.patterns;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.xml.sax.Attributes;
+import org.xml.sax.Locator;
+import org.xml.sax.SAXException;
+import org.xml.sax.SAXParseException;
+import org.xml.sax.helpers.DefaultHandler;
+
+import de.danielnaber.languagetool.Language;
+import de.danielnaber.languagetool.rules.IncorrectExample;
+import de.danielnaber.languagetool.tools.StringTools;
+
+/**
+ * XML rule handler that loads rules from XML and throws
+ * exceptions on errors and warnings.
+ *
+ * @author Daniel Naber
+ */
+public class XMLRuleHandler extends DefaultHandler {
+
+ /**
+  * Creates a handler with an empty element list and empty unification
+  * feature and type collections.
+  */
+ public XMLRuleHandler() {
+   uTypeList = new ArrayList<String>();
+   equivalenceFeatures = new HashMap<String, List<String>>();
+   elementList = new ArrayList<Element>();
+ }
+
+ List<PatternRule> rules = new ArrayList<PatternRule>();
+
+ protected Language language;
+
+ protected StringBuilder correctExample = new StringBuilder();
+ protected StringBuilder incorrectExample = new StringBuilder();
+ protected StringBuilder exampleCorrection = new StringBuilder();
+ protected StringBuilder message = new StringBuilder();
+ protected StringBuilder match = new StringBuilder();
+ protected StringBuilder elements;
+ protected StringBuilder exceptions;
+
+ List<String> correctExamples = new ArrayList<String>();
+ List<IncorrectExample> incorrectExamples = new ArrayList<IncorrectExample>();
+
+ protected boolean inPattern;
+ protected boolean inCorrectExample;
+ protected boolean inIncorrectExample;
+ protected boolean inMessage;
+ protected boolean inSuggestion;
+ protected boolean inMatch;
+ protected boolean inRuleGroup;
+ protected boolean inToken;
+ protected boolean inException;
+ protected boolean inPhrases;
+ protected boolean inAndGroup;
+
+ protected boolean tokenSpaceBefore;
+ protected boolean tokenSpaceBeforeSet;
+ protected String posToken;
+ protected boolean posNegation;
+ protected boolean posRegExp;
+
+ protected boolean caseSensitive;
+ protected boolean regExpression;
+ protected boolean tokenNegated;
+ protected boolean tokenInflected;
+
+ protected String exceptionPosToken;
+ protected boolean exceptionStringRegExp;
+ protected boolean exceptionStringNegation;
+ protected boolean exceptionStringInflected;
+ protected boolean exceptionPosNegation;
+ protected boolean exceptionPosRegExp;
+ protected boolean exceptionValidNext;
+ protected boolean exceptionValidPrev;
+ protected boolean exceptionSet;
+ protected boolean exceptionSpaceBefore;
+ protected boolean exceptionSpaceBeforeSet;
+
+ /** List of elements as specified by tokens. **/
+ protected List<Element> elementList;
+
+ /** true when phraseref is the last element in the rule. **/
+ protected boolean lastPhrase;
+
+ /** ID reference to the phrase. **/
+ protected String phraseIdRef;
+
+ /** Current phrase ID. **/
+ protected String phraseId;
+
+ protected int skipPos;
+
+ protected String ruleGroupId;
+
+ protected String id;
+
+ protected Element tokenElement;
+
+ protected Match tokenReference;
+
+ protected List<Match> suggestionMatches;
+
+ protected Locator pLocator;
+
+ protected int startPositionCorrection;
+ protected int endPositionCorrection;
+ protected int tokenCounter;
+
+ /** Phrase store - elementLists keyed by phraseIds. **/
+ protected Map<String, List<List<Element>>> phraseMap;
+
+ /**
+ * Logically forking element list, used for including multiple phrases in the
+ * current one.
+ **/
+ protected List<ArrayList<Element>> phraseElementList;
+
+ protected int andGroupCounter;
+
+ protected StringBuilder shortMessage = new StringBuilder();
+ protected boolean inShortMessage;
+
+ protected boolean inUnification;
+ protected boolean inUnificationDef;
+ protected boolean uniNegation;
+
+ protected String uFeature;
+ protected String uType = "";
+
+ protected List<String> uTypeList;
+
+ protected Map<String, List<String>> equivalenceFeatures;
+
+
+ /** Definitions of values in XML files. */
+ protected static final String YES = "yes";
+ protected static final String POSTAG = "postag";
+ protected static final String POSTAG_REGEXP = "postag_regexp";
+ protected static final String REGEXP = "regexp";
+ protected static final String NEGATE = "negate";
+ protected static final String INFLECTED = "inflected";
+ protected static final String NEGATE_POS = "negate_pos";
+ protected static final String MARKER = "marker";
+ protected static final String DEFAULT = "default";
+ protected static final String TYPE = "type";
+ protected static final String SPACEBEFORE = "spacebefore";
+ protected static final String EXAMPLE = "example";
+ protected static final String SCOPE = "scope";
+ protected static final String IGNORE = "ignore";
+ protected static final String SKIP = "skip";
+ protected static final String TOKEN = "token";
+ protected static final String FEATURE = "feature";
+ protected static final String UNIFY = "unify";
+ protected static final String AND = "and";
+ protected static final String EXCEPTION = "exception";
+ protected static final String CASE_SENSITIVE = "case_sensitive";
+ protected static final String PATTERN = "pattern";
+ protected static final String MATCH = "match";
+ protected static final String UNIFICATION = "unification";
+ protected static final String RULEGROUP = "rulegroup";
+ protected static final String NO = "no";
+ protected static final String MARK_TO = "mark_to";
+ protected static final String MARK_FROM = "mark_from";
+ protected static final String PHRASES = "phrases";
+ protected static final String MESSAGE = "message";
+
+
+ /**
+  * @return the list that holds the rules of this handler; the list itself is
+  *         returned, not a copy
+  */
+ public List<PatternRule> getRules() {
+   return this.rules;
+ }
+
+ /**
+  * Treats a parser warning as fatal: the warning is rethrown so that loading
+  * the rule file fails.
+  *
+  * @param e
+  *          the warning reported by the parser
+  * @throws SAXException
+  *           always; it is the given warning itself
+  */
+ @Override
+ public void warning (final SAXParseException e) throws SAXException {
+   throw e;
+ }
+
+ /**
+  * Treats a recoverable parser error as fatal: the error is rethrown so that
+  * loading the rule file fails.
+  *
+  * @param e
+  *          the error reported by the parser
+  * @throws SAXException
+  *           always; it is the given error itself
+  */
+ @Override
+ public void error (final SAXParseException e) throws SAXException {
+   throw e;
+ }
+
+ /**
+  * Keeps the locator so that error messages can report the line and column
+  * of the offending element.
+  */
+ @Override
+ public void setDocumentLocator(final Locator locator) {
+   super.setDocumentLocator(locator);
+   this.pLocator = locator;
+ }
+
+ /**
+  * Clears the state collected for the token that was just finished,
+  * including the state of its exceptions and its match reference.
+  */
+ protected void resetToken() {
+   inToken = false;
+   posRegExp = false;
+   posNegation = false;
+   tokenSpaceBeforeSet = false;
+   tokenSpaceBefore = false;
+
+   resetException();
+   tokenReference = null;
+   exceptionSet = false;
+ }
+
+ /**
+  * Sets all flags that describe the current exception element back to false.
+  */
+ protected void resetException() {
+   // string part
+   exceptionStringRegExp = false;
+   exceptionStringInflected = false;
+   exceptionStringNegation = false;
+   // part-of-speech part
+   exceptionPosRegExp = false;
+   exceptionPosNegation = false;
+   // scope
+   exceptionValidPrev = false;
+   exceptionValidNext = false;
+   // whitespace
+   exceptionSpaceBeforeSet = false;
+   exceptionSpaceBefore = false;
+ }
+
+ /**
+  * Creates the phrase element list on first use; an existing list is kept.
+  */
+ protected void phraseElementInit() {
+   if (phraseElementList != null) {
+     return;
+   }
+   phraseElementList = new ArrayList<ArrayList<Element>>();
+ }
+ /**
+  * Expands a reference to a stored phrase. For every variant stored under
+  * the referenced id a new element list is added to the phrase element list;
+  * it consists of the elements collected so far followed by the elements of
+  * the variant. A reference to an unknown id is ignored.
+  *
+  * @param attrs
+  *          attributes of the reference; "idref" names the phrase
+  */
+ protected void preparePhrase(final Attributes attrs) {
+   phraseIdRef = attrs.getValue("idref");
+   // phraseMap is only created when the first phrase is stored, so a
+   // reference may arrive while it is still null: treat it as unknown
+   if (phraseMap == null || !phraseMap.containsKey(phraseIdRef)) {
+     return;
+   }
+   // the target list is created lazily as well
+   phraseElementInit();
+   for (final List<Element> curPhrEl : phraseMap.get(phraseIdRef)) {
+     for (final Element e : curPhrEl) {
+       e.setPhraseName(phraseIdRef);
+     }
+     final ArrayList<Element> expanded = new ArrayList<Element>(elementList);
+     expanded.addAll(curPhrEl);
+     phraseElementList.add(expanded);
+   }
+   lastPhrase = true;
+ }
+
+ protected void finalizePhrase() {
+ // lazy init
+ if (phraseMap == null) {
+ phraseMap = new HashMap<String, List<List<Element>>>();
+ }
+ phraseElementInit();
+ if (phraseElementList.isEmpty()) {
+ phraseElementList.add(new ArrayList<Element>(elementList));
+ } else {
+ for (final ArrayList<Element> ph : phraseElementList) {
+ ph.addAll(new ArrayList<Element>(elementList));
+ }
+ }
+
+ phraseMap.put(phraseId, new ArrayList<List<Element>>(phraseElementList));
+ elementList.clear();
+
+ phraseElementList.clear();
+ }
+
+ protected void startPattern(final Attributes attrs) throws SAXException {
+ inPattern = true;
+ if (attrs.getValue(MARK_FROM) != null) {
+ startPositionCorrection = Integer.parseInt(attrs.getValue(MARK_FROM));
+ }
+ if (attrs.getValue(MARK_TO) != null) {
+ endPositionCorrection = Integer.parseInt(attrs.getValue(MARK_TO));
+ if (endPositionCorrection > 0) {
+ throw new SAXException("End position correction (mark_to="+ endPositionCorrection
+ + ") cannot be larger than 0: " + "\n Line: "
+ + pLocator.getLineNumber() + ", column: "
+ + pLocator.getColumnNumber() + ".");
+ }
+ }
+ caseSensitive = YES.equals(attrs.getValue(CASE_SENSITIVE));
+ }
+
+
+ /**
+ * Calculates the offset of the match reference (if any) in case the match
+ * element has been used in the group.
+ *
+ * @param elList
+ * Element list where the match element was used. It is directly changed.
+ */
+ protected void processElement(final List<Element> elList) {
+ int counter = 0;
+ for (final Element elTest : elList) {
+ if (elTest.getPhraseName() != null && counter > 0) {
+ if (elTest.isReferenceElement()) {
+ final int tokRef = elTest.getMatch().getTokenRef();
+ elTest.getMatch().setTokenRef(tokRef + counter - 1);
+ final String offsetToken = elTest.getString().replace("\\" + tokRef,
+ "\\" + (tokRef + counter - 1));
+ elTest.setStringElement(offsetToken);
+ }
+ }
+ counter++;
+ }
+ }
+
+ /**
+  * Handles the start of a match element. Inside a message the match is
+  * stored as a suggestion match and a marker (the character U+0001 followed
+  * by a backslash and the reference number) is appended to the message.
+  * Inside a token the match becomes the token reference and a backslash
+  * followed by the reference number is appended to the token text.
+  *
+  * @param attrs
+  *          attributes of the match element
+  * @throws SAXException
+  *           if a reference inside a message is empty or smaller than 1, or
+  *           if a reference inside a token points to a token that is not yet
+  *           part of the element list
+  */
+ protected void setMatchElement(final Attributes attrs) throws SAXException {
+   inMatch = true;
+   match = new StringBuilder();
+   // The attribute values are converted to upper case independently of the
+   // default locale: with a Turkish default locale "following" would become
+   // "FOLLOWİNG" (dotted capital I).
+   Match.CaseConversion caseConversion = Match.CaseConversion.NONE;
+   if (attrs.getValue("case_conversion") != null) {
+     caseConversion = Match.CaseConversion.toCase(attrs
+         .getValue("case_conversion").toUpperCase(java.util.Locale.ENGLISH));
+   }
+   Match.IncludeRange includeRange = Match.IncludeRange.NONE;
+   if (attrs.getValue("include_skipped") != null) {
+     includeRange = Match.IncludeRange.toRange(attrs
+         .getValue("include_skipped").toUpperCase(java.util.Locale.ENGLISH));
+   }
+   final Match mWorker = new Match(attrs.getValue(POSTAG), attrs
+       .getValue("postag_replace"), YES
+       .equals(attrs.getValue(POSTAG_REGEXP)), attrs
+       .getValue("regexp_match"), attrs.getValue("regexp_replace"),
+       caseConversion, YES.equals(attrs.getValue("setpos")),
+       includeRange);
+   mWorker.setInMessageOnly(!inSuggestion);
+   if (inMessage) {
+     if (suggestionMatches == null) {
+       suggestionMatches = new ArrayList<Match>();
+     }
+     suggestionMatches.add(mWorker);
+     //add incorrect XML character for simplicity
+     message.append("\u0001\\");
+     message.append(attrs.getValue("no"));
+     if (StringTools.isEmpty(attrs.getValue("no"))) {
+       throw new SAXException("References cannot be empty: " + "\n Line: "
+           + pLocator.getLineNumber() + ", column: "
+           + pLocator.getColumnNumber() + ".");
+     } else if (Integer.parseInt(attrs.getValue("no")) < 1) {
+       throw new SAXException("References must be larger than 0: "
+           + attrs.getValue("no") + "\n Line: " + pLocator.getLineNumber()
+           + ", column: " + pLocator.getColumnNumber() + ".");
+     }
+   } else if (inToken && attrs.getValue("no") != null) {
+     final int refNumber = Integer.parseInt(attrs.getValue("no"));
+     // a token can only refer to tokens that were already added
+     if (refNumber > elementList.size()) {
+       throw new SAXException(
+           "Only backward references in match elements are possible, tried to specify token "
+           + refNumber
+           + "\n Line: "
+           + pLocator.getLineNumber()
+           + ", column: " + pLocator.getColumnNumber() + ".");
+     }
+     mWorker.setTokenRef(refNumber);
+     tokenReference = mWorker;
+     elements.append('\\');
+     elements.append(refNumber);
+   }
+ }
+
+ protected void setExceptions(final Attributes attrs) {
+ inException = true;
+ exceptions = new StringBuilder();
+ resetException();
+
+ exceptionStringNegation = YES.equals(attrs.getValue(NEGATE));
+ exceptionValidNext = "next".equals(attrs.getValue(SCOPE));
+ exceptionValidPrev = "previous".equals(attrs.getValue(SCOPE));
+ exceptionStringInflected = YES.equals(attrs.getValue(INFLECTED));
+
+ if (attrs.getValue(POSTAG) != null) {
+ exceptionPosToken = attrs.getValue(POSTAG);
+ exceptionPosRegExp = YES.equals(attrs.getValue(POSTAG_REGEXP));
+ exceptionPosNegation = YES.equals(attrs.getValue(NEGATE_POS));
+ }
+ exceptionStringRegExp = YES.equals(attrs.getValue(REGEXP));
+ if (attrs.getValue(SPACEBEFORE) != null) {
+ exceptionSpaceBefore = YES.equals(attrs.getValue(SPACEBEFORE));
+ exceptionSpaceBeforeSet = !"ignore".equals(attrs.getValue(SPACEBEFORE));
+ }
+ }
+
+ protected void finalizeExceptions() {
+ inException = false;
+ if (!exceptionSet) {
+ tokenElement = new Element(StringTools.trimWhitespace(elements
+ .toString()), caseSensitive, regExpression, tokenInflected);
+ exceptionSet = true;
+ }
+ tokenElement.setNegation(tokenNegated);
+ if (!StringTools.isEmpty(exceptions.toString())) {
+ tokenElement.setStringException(StringTools.trimWhitespace(exceptions
+ .toString()), exceptionStringRegExp, exceptionStringInflected,
+ exceptionStringNegation, exceptionValidNext, exceptionValidPrev);
+ }
+ if (exceptionPosToken != null) {
+ tokenElement.setPosException(exceptionPosToken, exceptionPosRegExp,
+ exceptionPosNegation, exceptionValidNext, exceptionValidPrev);
+ exceptionPosToken = null;
+ }
+ if (exceptionSpaceBeforeSet) {
+ tokenElement.setExceptionSpaceBefore(exceptionSpaceBefore);
+ }
+ resetException();
+ }
+
+ protected void setToken(final Attributes attrs) {
+ inToken = true;
+
+ if (lastPhrase) {
+ elementList.clear();
+ }
+
+ lastPhrase = false;
+ tokenNegated = YES.equals(attrs.getValue(NEGATE));
+ tokenInflected = YES.equals(attrs.getValue(INFLECTED));
+ if (attrs.getValue("skip") != null) {
+ skipPos = Integer.parseInt(attrs.getValue("skip"));
+ }
+ elements = new StringBuilder();
+ // POSElement creation
+ if (attrs.getValue(POSTAG) != null) {
+ posToken = attrs.getValue(POSTAG);
+ posRegExp = YES.equals(attrs.getValue(POSTAG_REGEXP));
+ posNegation = YES.equals(attrs.getValue(NEGATE_POS));
+ }
+ regExpression = YES.equals(attrs.getValue(REGEXP));
+
+ if (attrs.getValue(SPACEBEFORE) != null) {
+ tokenSpaceBefore = YES.equals(attrs.getValue(SPACEBEFORE));
+ tokenSpaceBeforeSet = !"ignore".equals(attrs.getValue(SPACEBEFORE));
+ }
+
+ if (!inAndGroup) {
+ tokenCounter++;
+ }
+ }
+
+ protected void checkPositions(final int add) throws SAXException {
+ if (startPositionCorrection >= tokenCounter + add) {
+ throw new SAXException(
+ "Attempt to mark a token no. ("+ startPositionCorrection +") that is outside the pattern ("
+ + tokenCounter + "). Pattern elements are numbered starting from 0!" + "\n Line: "
+ + pLocator.getLineNumber() + ", column: "
+ + pLocator.getColumnNumber() + ".");
+ }
+ if (tokenCounter +add - endPositionCorrection < 0) {
+ throw new SAXException(
+ "Attempt to mark a token no. ("+ endPositionCorrection +") that is outside the pattern ("
+ + tokenCounter + " elements). End positions should be negative but not larger than the token count!"
+ + "\n Line: "
+ + pLocator.getLineNumber() + ", column: "
+ + pLocator.getColumnNumber() + ".");
+ }
+ }
+
+ /**
+  * Checks that the position corrections leave at least one token marked.
+  * The check is skipped if the phrase element list has entries.
+  *
+  * @throws RuntimeException
+  *           if the corrected end position is not behind the corrected start
+  *           position
+  */
+ protected void checkMarkPositions() {
+   if (phraseElementList != null && phraseElementList.size() != 0) {
+     return;
+   }
+   final int endMarker = elementList.size() + endPositionCorrection;
+   if (endMarker > startPositionCorrection) {
+     return;
+   }
+   throw new RuntimeException("Invalid combination of mark_from (" + startPositionCorrection
+       + ") and mark_to (" + endPositionCorrection + ") for rule " + id
+       + " with " + elementList.size()
+       + " tokens: the error position created by mark_from and mark_to is less than one token");
+ }
+
+ /**
+  * Adds Match objects for all references to tokens
+  * (including '\1' and the like).
+  * <p>
+  * The message is scanned for backslashes that are followed by a digit. A
+  * reference that is preceded by the marker character U+0001 belongs to a
+  * match element: the next stored suggestion match is used for it and the
+  * marker is removed from the message. Any other reference gets a new plain
+  * Match that is used in the message only.
+  *
+  * @return the matches in the order of the references in the message; the
+  *         stored suggestion matches if the message has no reference; null
+  *         if there are no stored suggestion matches
+  */
+ protected List<Match> addLegacyMatches() {
+   if (suggestionMatches == null || suggestionMatches.isEmpty()) {
+     return null;
+   }
+   final List<Match> sugMatch = new ArrayList<Match>();
+   final String messageStr = message.toString();
+   int matchCounter = 0;
+   // start at index 0 so that a reference at the very beginning is found
+   int pos = messageStr.indexOf('\\');
+   while (pos != -1) {
+     // a backslash at the end of the message has no following character
+     if (pos + 1 < messageStr.length()
+         && Character.isDigit(messageStr.charAt(pos + 1))) {
+       if (pos > 0 && messageStr.charAt(pos - 1) == '\u0001') { // real suggestion marker
+         sugMatch.add(suggestionMatches.get(matchCounter));
+         // messageStr is a snapshot: correct the index by the number of
+         // markers that were already deleted from the message
+         message.deleteCharAt(pos - 1 - matchCounter);
+         matchCounter++;
+       } else {
+         final Match mWorker = new Match(null, null, false, null,
+             null, Match.CaseConversion.NONE, false, Match.IncludeRange.NONE);
+         mWorker.setInMessageOnly(true);
+         sugMatch.add(mWorker);
+       }
+     }
+     pos = messageStr.indexOf('\\', pos + 1);
+   }
+   if (sugMatch.isEmpty()) {
+     return suggestionMatches;
+   }
+   return sugMatch;
+ }
+
+ /**
+ * Handles the end of a token element: completes the token element from the
+ * state collected since the start of the token, stores it and resets the
+ * token state.
+ */
+ protected void finalizeTokens() {
+ // An exception element may already have created the token element (see
+ // finalizeExceptions()); then only its string is updated.
+ if (!exceptionSet || tokenElement == null) {
+ tokenElement = new Element(StringTools.trimWhitespace(elements
+ .toString()), caseSensitive, regExpression, tokenInflected);
+ tokenElement.setNegation(tokenNegated);
+ } else {
+ tokenElement.setStringElement(StringTools.trimWhitespace(elements
+ .toString()));
+ }
+
+ // skipPos and posToken are cleared after use so that they do not leak into
+ // the next token
+ if (skipPos != 0) {
+ tokenElement.setSkipNext(skipPos);
+ skipPos = 0;
+ }
+ if (posToken != null) {
+ tokenElement.setPosElement(posToken, posRegExp, posNegation);
+ posToken = null;
+ }
+
+ if (tokenReference != null) {
+ tokenElement.setMatch(tokenReference);
+ }
+
+ // Inside an and-group only the first token is added to the list; every
+ // following token is attached to the last element of the list.
+ if (inAndGroup && andGroupCounter > 0) {
+ elementList.get(elementList.size() - 1)
+ .setAndGroupElement(tokenElement);
+ } else {
+ elementList.add(tokenElement);
+ }
+ if (inAndGroup) {
+ andGroupCounter++;
+ }
+
+ if (inUnification) {
+ tokenElement.setUnification(equivalenceFeatures);
+ if (uniNegation) {
+ tokenElement.setUniNegation();
+ }
+ }
+
+ // Inside a unification definition the token is registered as an
+ // equivalence with the unifier of the language and the element list is
+ // emptied.
+ if (inUnificationDef) {
+ language.getUnifier().setEquivalence(uFeature, uType, tokenElement);
+ elementList.clear();
+ }
+ if (tokenSpaceBeforeSet) {
+ tokenElement.setWhitespaceBefore(tokenSpaceBefore);
+ }
+ resetToken();
+ }
+
+}
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/bitext/BitextPatternRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/bitext/BitextPatternRule.java
new file mode 100644
index 0000000..1d42a17
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/bitext/BitextPatternRule.java
@@ -0,0 +1,93 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2010 Marcin Miłkowski (www.languagetool.org)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+
+package de.danielnaber.languagetool.rules.patterns.bitext;
+
+import java.io.IOException;
+
+import de.danielnaber.languagetool.AnalyzedSentence;
+import de.danielnaber.languagetool.rules.Rule;
+import de.danielnaber.languagetool.rules.RuleMatch;
+import de.danielnaber.languagetool.rules.bitext.BitextRule;
+import de.danielnaber.languagetool.rules.patterns.PatternRule;
+
+/**
+ * A pattern rule for bitext, i.e. for a pre-analyzed pair of source and
+ * target text. It wraps two {@link PatternRule}s: the target rule is only
+ * applied when the source rule matches the source sentence, see
+ * {@link #match(AnalyzedSentence, AnalyzedSentence)}. It uses the syntax
+ * of XML files similar to normal PatternRules.
+ *
+ * @author Marcin Miłkowski
+ */
+public class BitextPatternRule extends BitextRule {
+
+  private final PatternRule srcRule;
+  private final PatternRule trgRule;
+
+  /**
+   * @param src rule that is applied to the source sentence
+   * @param trg rule that is applied to the target sentence
+   */
+  BitextPatternRule(final PatternRule src, final PatternRule trg) {
+    this.srcRule = src;
+    this.trgRule = trg;
+  }
+
+  /**
+   * @return the rule applied to the source sentence
+   */
+  public PatternRule getSrcRule() {
+    return this.srcRule;
+  }
+
+  /**
+   * @return the rule applied to the target sentence
+   */
+  public PatternRule getTrgRule() {
+    return this.trgRule;
+  }
+
+  /**
+   * @return the description of the source rule
+   */
+  @Override
+  public String getDescription() {
+    return this.srcRule.getDescription();
+  }
+
+  /**
+   * @return the message of the target rule
+   */
+  public String getMessage() {
+    return this.trgRule.getMessage();
+  }
+
+  /**
+   * @return the id of the source rule
+   */
+  @Override
+  public String getId() {
+    return this.srcRule.getId();
+  }
+
+  /**
+   * This method always returns an empty array, a single sentence is never
+   * checked by this rule.
+   */
+  @Override
+  public RuleMatch[] match(AnalyzedSentence text) throws IOException {
+    return new RuleMatch[0];
+  }
+
+  /**
+   * Applies the target rule to the target sentence, but only if the source
+   * rule has at least one match in the source sentence.
+   *
+   * @param sourceText analyzed sentence of the source text
+   * @param targetText analyzed sentence of the target text
+   * @return the matches of the target rule, or an empty array if the source
+   *         rule does not match
+   */
+  @Override
+  public RuleMatch[] match(AnalyzedSentence sourceText,
+      AnalyzedSentence targetText) throws IOException {
+    final boolean sourceMatches = this.srcRule.match(sourceText).length > 0;
+    if (!sourceMatches) {
+      return new RuleMatch[0];
+    }
+    return this.trgRule.match(targetText);
+  }
+
+  /**
+   * Intentionally empty: this method does nothing, in particular it does not
+   * reset the two wrapped rules.
+   */
+  @Override
+  public void reset() {
+  }
+
+}
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/bitext/BitextPatternRuleLoader.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/bitext/BitextPatternRuleLoader.java
new file mode 100644
index 0000000..508f381
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/bitext/BitextPatternRuleLoader.java
@@ -0,0 +1,413 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+package de.danielnaber.languagetool.rules.patterns.bitext;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+
+import javax.xml.parsers.SAXParser;
+import javax.xml.parsers.SAXParserFactory;
+
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+import de.danielnaber.languagetool.Language;
+import de.danielnaber.languagetool.bitext.StringPair;
+import de.danielnaber.languagetool.rules.Category;
+import de.danielnaber.languagetool.rules.IncorrectExample;
+import de.danielnaber.languagetool.rules.bitext.IncorrectBitextExample;
+import de.danielnaber.languagetool.rules.patterns.Element;
+import de.danielnaber.languagetool.rules.patterns.Match;
+import de.danielnaber.languagetool.rules.patterns.PatternRule;
+
+/**
+ * Loads {@link BitextPatternRule}s from an XML file.
+ *
+ * @author Marcin Miłkowski
+ */
+public class BitextPatternRuleLoader extends DefaultHandler {
+
+  /**
+   * Parses the given XML stream with a SAX parser and returns the bitext
+   * rules collected by the handler.
+   *
+   * @param is stream with the XML rule definitions
+   * @param filename name of the file, only used for the error message
+   * @return the rules found in the stream
+   * @throws IOException if loading or parsing fails for any reason; the
+   *         original exception is set as the cause
+   */
+  public final List<BitextPatternRule> getRules(final InputStream is,
+      final String filename) throws IOException {
+    try {
+      final PatternRuleHandler ruleHandler = new PatternRuleHandler();
+      final SAXParser parser = SAXParserFactory.newInstance().newSAXParser();
+      // note: the "load-external-dtd" feature is not touched here, so the
+      // parser's default setting applies
+      parser.parse(is, ruleHandler);
+      return ruleHandler.getBitextRules();
+    } catch (final Exception e) {
+      final IOException wrapped = new IOException("Cannot load or parse '"
+          + filename + "'");
+      wrapped.initCause(e);
+      throw wrapped;
+    }
+  }
+
+}
+
+class PatternRuleHandler extends BitextXMLRuleHandler {
+
+ private int subId;
+
+ private boolean defaultOff;
+ private boolean defaultOn;
+
+ private Category category;
+ private String description;
+ private String ruleGroupDescription;
+
+ private PatternRule srcRule;
+ private PatternRule trgRule;
+
+ private IncorrectExample trgExample;
+ private IncorrectExample srcExample;
+
+ private Language srcLang;
+
+ // ===========================================================
+ // SAX DocumentHandler methods
+ // ===========================================================
+
+  /**
+   * SAX callback for an opening tag. Dispatches on the element name and
+   * updates the parser state (category, rule and rule group attributes,
+   * example and message buffers, unification and phrase flags) accordingly.
+   *
+   * @param namespaceURI not used
+   * @param lName not used
+   * @param qName qualified name of the element that starts
+   * @param attrs attributes of that element
+   * @throws SAXException if the <code>targetLang</code> attribute of the
+   *         <code>rules</code> element does not name a known language
+   */
+  @Override
+  public void startElement(final String namespaceURI, final String lName,
+      final String qName, final Attributes attrs) throws SAXException {
+    if (qName.equals("category")) {
+      final String catName = attrs.getValue("name");
+      final String priorityStr = attrs.getValue("priority");
+      if (priorityStr != null) {
+        category = new Category(catName, Integer.parseInt(priorityStr));
+      } else {
+        category = new Category(catName);
+      }
+
+      if ("off".equals(attrs.getValue(DEFAULT))) {
+        category.setDefaultOff();
+      }
+
+    } else if (qName.equals("rules")) {
+      final String languageStr = attrs.getValue("targetLang");
+      language = Language.getLanguageForShortName(languageStr);
+      if (language == null) {
+        throw new SAXException("Unknown language '" + languageStr + "'");
+      }
+    } else if (qName.equals("rule")) {
+      id = attrs.getValue("id");
+      if (inRuleGroup) {
+        subId++;
+      }
+      // a default set on the rule group takes precedence over the rule's own
+      if (!(inRuleGroup && defaultOff)) {
+        defaultOff = "off".equals(attrs.getValue(DEFAULT));
+      }
+
+      if (!(inRuleGroup && defaultOn)) {
+        defaultOn = "on".equals(attrs.getValue(DEFAULT));
+      }
+      // rules inside a group inherit the group's id and name if they
+      // do not define their own
+      if (inRuleGroup && id == null) {
+        id = ruleGroupId;
+      }
+      description = attrs.getValue("name");
+      if (inRuleGroup && description == null) {
+        description = ruleGroupDescription;
+      }
+      correctExamples = new ArrayList<StringPair>();
+      incorrectExamples = new ArrayList<IncorrectBitextExample>();
+      if (suggestionMatches != null) {
+        suggestionMatches.clear();
+      }
+    } else if (PATTERN.equals(qName) || "target".equals(qName)) {
+      startPattern(attrs);
+    } else if (AND.equals(qName)) {
+      inAndGroup = true;
+    } else if (UNIFY.equals(qName)) {
+      inUnification = true;
+      uniNegation = YES.equals(attrs.getValue(NEGATE));
+    } else if (qName.equals("feature")) {
+      uFeature = attrs.getValue("id");
+    } else if (qName.equals(TYPE)) {
+      uType = attrs.getValue("id");
+      uTypeList.add(uType);
+    } else if (qName.equals(TOKEN)) {
+      setToken(attrs);
+    } else if (qName.equals(EXCEPTION)) {
+      setExceptions(attrs);
+    } else if (qName.equals(EXAMPLE)
+        && "correct".equals(attrs.getValue(TYPE))) {
+      // constant-first comparison: getValue() returns null if the attribute
+      // is missing, which must not end in a NullPointerException
+      inCorrectExample = true;
+      correctExample = new StringBuilder();
+    } else if (EXAMPLE.equals(qName)
+        && "incorrect".equals(attrs.getValue(TYPE))) {
+      inIncorrectExample = true;
+      incorrectExample = new StringBuilder();
+      exampleCorrection = new StringBuilder();
+      final String correction = attrs.getValue("correction");
+      if (correction != null) {
+        exampleCorrection.append(correction);
+      }
+    } else if (MESSAGE.equals(qName)) {
+      inMessage = true;
+      message = new StringBuilder();
+    } else if (qName.equals("short")) {
+      inShortMessage = true;
+      shortMessage = new StringBuilder();
+    } else if (qName.equals(RULEGROUP)) {
+      ruleGroupId = attrs.getValue("id");
+      ruleGroupDescription = attrs.getValue("name");
+      defaultOff = "off".equals(attrs.getValue(DEFAULT));
+      defaultOn = "on".equals(attrs.getValue(DEFAULT));
+      inRuleGroup = true;
+      subId = 0;
+    } else if (qName.equals("suggestion") && inMessage) {
+      message.append("<suggestion>");
+      inSuggestion = true;
+    } else if (qName.equals("match")) {
+      setMatchElement(attrs);
+    } else if (qName.equals(MARKER) && inCorrectExample) {
+      correctExample.append("<marker>");
+    } else if (qName.equals(MARKER) && inIncorrectExample) {
+      incorrectExample.append("<marker>");
+    } else if (qName.equals("unification")) {
+      uFeature = attrs.getValue("feature");
+      inUnificationDef = true;
+    } else if (qName.equals("equivalence")) {
+      uType = attrs.getValue(TYPE);
+    } else if (qName.equals("phrases")) {
+      inPhrases = true;
+    } else if (qName.equals("includephrases")) {
+      phraseElementInit();
+    } else if (qName.equals("phrase") && inPhrases) {
+      phraseId = attrs.getValue("id");
+    } else if (qName.equals("phraseref") && (attrs.getValue("idref") != null)) {
+      preparePhrase(attrs);
+    } else if (qName.equals("source")) {
+      // NOTE(review): unlike "targetLang" above, an unknown source language
+      // is not rejected here and leaves srcLang null - confirm this is intended
+      srcLang = Language.getLanguageForShortName(attrs.getValue("lang"));
+    }
+  }
+
+ /**
+ * SAX callback for a closing tag. Dispatches on the element name: finishes
+ * the source and target rules, assembles the {@link BitextPatternRule} when
+ * a <code>rule</code> element ends, stores the collected examples and
+ * resets the state flags that were set for the corresponding opening tag.
+ *
+ * @param namespaceURI not used
+ * @param sName not used
+ * @param qName qualified name of the element that ends
+ */
+ @Override
+ public void endElement(final String namespaceURI, final String sName,
+ final String qName) throws SAXException {
+
+ if (qName.equals("source")) {
+ checkMarkPositions();
+ srcRule = finalizeRule();
+ } else if ("target".equals(qName)) {
+ checkMarkPositions();
+ trgRule = finalizeRule();
+ } else if ("rule".equals(qName)) {
+ // the message and the suggestion matches are attached to the target rule only
+ trgRule.setMessage(message.toString());
+ if (suggestionMatches != null) {
+ for (final Match m : suggestionMatches) {
+ trgRule.addSuggestionMatch(m);
+ }
+ if (phraseElementList.size() <= 1) {
+ suggestionMatches.clear();
+ }
+ }
+ // combine both rules and the collected examples into one bitext rule
+ final BitextPatternRule bRule = new BitextPatternRule(srcRule, trgRule);
+ bRule.setCorrectBitextExamples(correctExamples);
+ bRule.setIncorrectBitextExamples(incorrectExamples);
+ bRule.setSourceLang(srcLang);
+ rules.add(bRule);
+ } else if (qName.equals(EXCEPTION)) {
+ finalizeExceptions();
+ } else if (qName.equals(AND)) {
+ inAndGroup = false;
+ andGroupCounter = 0;
+ tokenCounter++;
+ } else if (qName.equals(TOKEN)) {
+ finalizeTokens();
+ } else if (qName.equals(PATTERN)) {
+ inPattern = false;
+ if (lastPhrase) {
+ elementList.clear();
+ }
+ // check the positions once without phrases, otherwise once per phrase
+ // with the number of elements of that phrase
+ if (phraseElementList == null || phraseElementList.isEmpty()) {
+ checkPositions(0);
+ } else {
+ for (List<Element> elements : phraseElementList) {
+ checkPositions(elements.size());
+ }
+ }
+ tokenCounter = 0;
+ } else if (qName.equals("trgExample")) {
+ trgExample = setExample();
+ } else if (qName.equals("srcExample")) {
+ srcExample = setExample();
+ } else if (qName.equals("example")) {
+ // NOTE(review): srcExample and trgExample are dereferenced without a null
+ // check and are not cleared afterwards - presumably every example contains
+ // both a srcExample and a trgExample element; confirm against the rule files
+ if (inCorrectExample) {
+ correctExamples.add(new StringPair(srcExample.getExample(), trgExample.getExample()));
+ } else if (inIncorrectExample) {
+ if (trgExample.getCorrections() == null) {
+ incorrectExamples.add(
+ new IncorrectBitextExample(
+ new StringPair(
+ srcExample.getExample(), trgExample.getExample())
+ ));
+ } else {
+ // pass the corrections of the target example on as an array
+ List<String> l = trgExample.getCorrections();
+ String str [] = l.toArray (new String [l.size ()]);
+ incorrectExamples.add(
+ new IncorrectBitextExample(
+ new StringPair(srcExample.getExample(),
+ trgExample.getExample()), str)
+ );
+ }
+ }
+ inCorrectExample = false;
+ inIncorrectExample = false;
+ } else if (qName.equals("message")) {
+ suggestionMatches = addLegacyMatches();
+ inMessage = false;
+ } else if (qName.equals("short")) {
+ inShortMessage = false;
+ } else if (qName.equals("match")) {
+ // the text collected inside the match element becomes the lemma string of
+ // either the latest suggestion match or the token reference
+ if (inMessage) {
+ suggestionMatches.get(suggestionMatches.size() - 1).setLemmaString(
+ match.toString());
+ } else if (inToken) {
+ tokenReference.setLemmaString(match.toString());
+ }
+ inMatch = false;
+ } else if (qName.equals("rulegroup")) {
+ inRuleGroup = false;
+ } else if (qName.equals("suggestion") && inMessage) {
+ message.append("</suggestion>");
+ inSuggestion = false;
+ } else if (qName.equals(MARKER) && inCorrectExample) {
+ correctExample.append("</marker>");
+ } else if (qName.equals(MARKER) && inIncorrectExample) {
+ incorrectExample.append("</marker>");
+ } else if (qName.equals("phrase") && inPhrases) {
+ finalizePhrase();
+ } else if (qName.equals("includephrases")) {
+ elementList.clear();
+ } else if (qName.equals("phrases") && inPhrases) {
+ inPhrases = false;
+ } else if (qName.equals("unification")) {
+ inUnificationDef = false;
+ } else if (qName.equals("feature")) {
+ // store the types collected for this feature and start a new type list
+ equivalenceFeatures.put(uFeature, uTypeList);
+ uTypeList = new ArrayList<String>();
+ } else if (qName.equals("unify")) {
+ inUnification = false;
+ //clear the features...
+ equivalenceFeatures = new HashMap<String, List<String>>();
+ }
+ }
+
+  /**
+   * Builds an example object from the text collected for the current
+   * <code>srcExample</code> or <code>trgExample</code> element and clears
+   * the example buffers afterwards.
+   *
+   * @return the example; with corrections only for an incorrect example
+   *         whose correction text is not empty; <code>null</code> if the
+   *         parser is neither inside a correct nor an incorrect example
+   */
+  private IncorrectExample setExample() {
+    final IncorrectExample result;
+    if (inCorrectExample) {
+      result = new IncorrectExample(correctExample.toString());
+    } else if (inIncorrectExample) {
+      // several corrections are separated by '|'
+      final String[] corrections = exampleCorrection.toString().split("\\|");
+      final boolean hasCorrections =
+          corrections.length > 0 && corrections[0].length() > 0;
+      final String wrongText = incorrectExample.toString();
+      if (hasCorrections) {
+        result = new IncorrectExample(wrongText, corrections);
+      } else {
+        result = new IncorrectExample(wrongText);
+      }
+    } else {
+      result = null;
+    }
+    // start with empty buffers for the next example element
+    correctExample = new StringBuilder();
+    incorrectExample = new StringBuilder();
+    exampleCorrection = new StringBuilder();
+    return result;
+  }
+
+  /**
+   * Creates a {@link PatternRule} from the elements collected so far and
+   * clears the element and phrase lists as well as the position corrections.
+   * Without phrases a single rule with an empty message is built from the
+   * element list. With phrases the collected elements are appended to every
+   * phrase and one rule is built per phrase.
+   *
+   * @return the created rule; if several phrases exist, only the rule that
+   *         was created last is returned
+   */
+  private PatternRule finalizeRule() {
+    PatternRule lastRule = null;
+    phraseElementInit();
+    if (!phraseElementList.isEmpty()) {
+      if (!elementList.isEmpty()) {
+        for (final ArrayList<Element> phrase : phraseElementList) {
+          // every phrase gets its own copy of the collected elements
+          phrase.addAll(new ArrayList<Element>(elementList));
+        }
+      }
+
+      for (final ArrayList<Element> phrase : phraseElementList) {
+        processElement(phrase);
+        lastRule = new PatternRule(id, language, phrase,
+            description, message.toString(), shortMessage.toString(),
+            phraseElementList.size() > 1);
+        prepareRule(lastRule);
+      }
+    } else {
+      lastRule = new PatternRule(id, language, elementList,
+          description, "", shortMessage.toString());
+      prepareRule(lastRule);
+    }
+    elementList.clear();
+    if (phraseElementList != null) {
+      phraseElementList.clear();
+    }
+    startPositionCorrection = 0;
+    endPositionCorrection = 0;
+    return lastRule;
+  }
+  /**
+   * Transfers the state collected by the parser to a newly created rule:
+   * position corrections, category, sub id and the default-off setting.
+   * The position corrections and the case sensitivity flag of the parser
+   * are reset on the way.
+   *
+   * @param rule the rule to set up
+   */
+  private void prepareRule(final PatternRule rule) {
+    rule.setStartPositionCorrection(startPositionCorrection);
+    rule.setEndPositionCorrection(endPositionCorrection);
+    startPositionCorrection = 0;
+    endPositionCorrection = 0;
+    rule.setCategory(category);
+    // rules outside of a rule group always get the sub id "1"
+    rule.setSubId(inRuleGroup ? Integer.toString(subId) : "1");
+    caseSensitive = false;
+    if (defaultOff) {
+      rule.setDefaultOff();
+    }
+
+    // a category that is off by default switches the rule off as well,
+    // unless the rule is explicitly switched on
+    final boolean switchedOffByCategory = category.isDefaultOff() && !defaultOn;
+    if (switchedOffByCategory) {
+      rule.setDefaultOff();
+    }
+
+  }
+
+  /**
+   * SAX callback for character data. Appends the text to the buffer that
+   * belongs to the innermost context the parser is in; the contexts are
+   * checked in a fixed order and only the first one that applies receives
+   * the text. Text outside of all these contexts is dropped.
+   *
+   * @param buf character buffer of the parser
+   * @param offset start of the text in <code>buf</code>
+   * @param len number of characters to read
+   */
+  @Override
+  public void characters(final char[] buf, final int offset, final int len) {
+    final String text = String.valueOf(buf, offset, len);
+    if (inException) {
+      exceptions.append(text);
+      return;
+    }
+    if (inToken) {
+      elements.append(text);
+      return;
+    }
+    if (inCorrectExample) {
+      correctExample.append(text);
+      return;
+    }
+    if (inIncorrectExample) {
+      incorrectExample.append(text);
+      return;
+    }
+    if (inMatch) {
+      match.append(text);
+      return;
+    }
+    if (inMessage) {
+      message.append(text);
+      return;
+    }
+    if (inShortMessage) {
+      shortMessage.append(text);
+    }
+  }
+
+}
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/bitext/BitextXMLRuleHandler.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/bitext/BitextXMLRuleHandler.java
new file mode 100644
index 0000000..02f5a04
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/bitext/BitextXMLRuleHandler.java
@@ -0,0 +1,56 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2006 Daniel Naber (http://www.danielnaber.de)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+package de.danielnaber.languagetool.rules.patterns.bitext;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.xml.sax.SAXException;
+import org.xml.sax.SAXParseException;
+
+import de.danielnaber.languagetool.bitext.StringPair;
+import de.danielnaber.languagetool.rules.bitext.IncorrectBitextExample;
+import de.danielnaber.languagetool.rules.patterns.XMLRuleHandler;
+
+/**
+ * XML rule handler that loads bitext rules from XML and throws
+ * exceptions on errors and warnings.
+ *
+ * @author Daniel Naber
+ */
+class BitextXMLRuleHandler extends XMLRuleHandler {
+
+  /** The bitext rules created so far. */
+  List<BitextPatternRule> rules = new ArrayList<BitextPatternRule>();
+
+  /** Correct examples (pairs of source and target text) of the current rule. */
+  List<StringPair> correctExamples = new ArrayList<StringPair>();
+  /** Incorrect examples of the current rule. */
+  List<IncorrectBitextExample> incorrectExamples = new ArrayList<IncorrectBitextExample>();
+
+  /**
+   * @return the list of rules created so far; this is the internal list,
+   *         not a copy
+   */
+  List<BitextPatternRule> getBitextRules() {
+    return rules;
+  }
+
+  /**
+   * Turns a parser warning into a failure by rethrowing it.
+   *
+   * @param e the warning reported by the parser
+   * @throws SAXException always, <code>e</code> itself
+   */
+  @Override
+  public void warning(final SAXParseException e) throws SAXException {
+    throw e;
+  }
+
+  /**
+   * Turns a recoverable parser error into a failure by rethrowing it.
+   *
+   * @param e the error reported by the parser
+   * @throws SAXException always, <code>e</code> itself
+   */
+  @Override
+  public void error(final SAXParseException e) throws SAXException {
+    throw e;
+  }
+
+}
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/bitext/FalseFriendsAsBitextLoader.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/bitext/FalseFriendsAsBitextLoader.java
new file mode 100644
index 0000000..87c30a5
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/bitext/FalseFriendsAsBitextLoader.java
@@ -0,0 +1,72 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2010 Marcin Miłkowski (www.languagetool.org)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+
+package de.danielnaber.languagetool.rules.patterns.bitext;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+
+import javax.xml.parsers.ParserConfigurationException;
+
+import org.xml.sax.SAXException;
+
+import de.danielnaber.languagetool.Language;
+import de.danielnaber.languagetool.rules.patterns.FalseFriendRuleLoader;
+import de.danielnaber.languagetool.rules.patterns.PatternRule;
+
+/**
+ * Loads the false friend rules as bitext pattern rules. Note that the resulting
+ * rules have suggestions that are not really customizable, in contradistinction
+ * to the 'real' bitext pattern rules.
+ *
+ * @author Marcin Miłkowski
+ *
+ */
+public class FalseFriendsAsBitextLoader {
+
+  /**
+   * Loads the false friend file twice, once for each direction of the
+   * language pair, and combines the rules that have the same id in both
+   * directions into bitext rules. Rules that exist in one direction only
+   * are ignored.
+   *
+   * @param filename resource name of the false friend XML file, resolved
+   *        relative to this class
+   * @param motherTongue language that is set as source language of the
+   *        resulting rules
+   * @param language the other language of the pair
+   * @return the bitext rules, in the order of the second loading pass
+   * @throws IOException if the resource cannot be found or read
+   */
+  public List<BitextPatternRule> getFalseFriendsAsBitext(final String filename,
+      final Language motherTongue, final Language language)
+      throws ParserConfigurationException, SAXException, IOException {
+    final FalseFriendRuleLoader ruleLoader = new FalseFriendRuleLoader();
+    final List<PatternRule> rules1 =
+        loadRules(ruleLoader, filename, motherTongue, language);
+    final List<PatternRule> rules2 =
+        loadRules(ruleLoader, filename, language, motherTongue);
+
+    // index the rules of the first pass by id to find the counterparts
+    final HashMap<String, PatternRule> srcRules = new HashMap<String, PatternRule>();
+    for (final PatternRule rule : rules1) {
+      srcRules.put(rule.getId(), rule);
+    }
+
+    final List<BitextPatternRule> bRules = new ArrayList<BitextPatternRule>();
+    for (final PatternRule rule : rules2) {
+      final PatternRule srcRule = srcRules.get(rule.getId());
+      if (srcRule != null) {
+        final BitextPatternRule bRule = new BitextPatternRule(srcRule, rule);
+        bRule.setSourceLang(motherTongue);
+        bRule.setCategory(rule.getCategory());
+        bRules.add(bRule);
+      }
+    }
+    return bRules;
+  }
+
+  /**
+   * Opens the resource, lets the loader parse it and always closes the
+   * stream again, so that no stream is leaked even if parsing fails.
+   */
+  private List<PatternRule> loadRules(final FalseFriendRuleLoader ruleLoader,
+      final String filename, final Language first, final Language second)
+      throws ParserConfigurationException, SAXException, IOException {
+    final InputStream is = this.getClass().getResourceAsStream(filename);
+    if (is == null) {
+      // getResourceAsStream() signals a missing resource with null
+      throw new IOException("Cannot find false friend rules '" + filename + "'");
+    }
+    try {
+      return ruleLoader.getRules(is, first, second);
+    } finally {
+      is.close();
+    }
+  }
+
+}
+