diff options
Diffstat (limited to 'JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns')
12 files changed, 4588 insertions, 0 deletions
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/AbstractPatternRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/AbstractPatternRule.java new file mode 100644 index 0000000..d172134 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/AbstractPatternRule.java @@ -0,0 +1,223 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2008 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ + +package de.danielnaber.languagetool.rules.patterns; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import de.danielnaber.languagetool.AnalyzedSentence; +import de.danielnaber.languagetool.AnalyzedToken; +import de.danielnaber.languagetool.AnalyzedTokenReadings; +import de.danielnaber.languagetool.Language; +import de.danielnaber.languagetool.rules.Rule; +import de.danielnaber.languagetool.rules.RuleMatch; + +/** + * An Abstract Pattern Rule that describes a pattern of words or part-of-speech tags + * used for PatternRule and DisambiguationPatternRule. + * + * Introduced to minimize code duplication between those classes. 
+ * + * @author Marcin Miłkowski + */ + +public abstract class AbstractPatternRule extends Rule { + + private final String id; + + private final String description; + + protected final List<Element> patternElements; + + protected Unifier unifier; + + protected final Language language; + + protected int startPositionCorrection; + + protected int endPositionCorrection; + + protected boolean prevMatched; + + protected final boolean testUnification; + + private final boolean getUnified; + + private boolean groupsOrUnification; + + protected AnalyzedTokenReadings[] unifiedTokens; + + protected final boolean sentStart; + + public AbstractPatternRule(final String id, + final String description, + final Language language, + final List<Element> elements, + boolean getUnified) { + this.id = id; + this.description = description; + this.patternElements = new ArrayList<Element>(elements); // copy elements + this.language = language; + this.getUnified = getUnified; + unifier = language.getUnifier(); + testUnification = initUnifier(); + sentStart = patternElements.get(0).isSentStart(); + if (!testUnification) { + for (Element elem : patternElements) { + if (elem.hasAndGroup()) { + groupsOrUnification = true; + break; + } + } + } else { + groupsOrUnification = true; + } + } + + private boolean initUnifier() { + for (final Element elem : patternElements) { + if (elem.isUnified()) { + return true; + } + } + return false; + } + + @Override + public final String toString() { + return id + ":" + patternElements + ":" + description; + } + + @Override + public String getDescription() { + return description; + } + + @Override + public String getId() { + return id; + } + + @Override + public RuleMatch[] match(AnalyzedSentence text) throws IOException { + // TODO Auto-generated method stub + return null; + } + + @Override + public void reset() { + // TODO Auto-generated method stub + } + + public final void setStartPositionCorrection(final int startPositionCorrection) { + 
this.startPositionCorrection = startPositionCorrection; + } + + public final void setEndPositionCorrection(final int endPositionCorrection) { + this.endPositionCorrection = endPositionCorrection; + } + + + protected void setupAndGroup(final int firstMatchToken, + final Element elem, final AnalyzedTokenReadings[] tokens) + throws IOException { + if (elem.hasAndGroup()) { + for (final Element andElement : elem.getAndGroup()) { + if (andElement.isReferenceElement()) { + setupRef(firstMatchToken, andElement, tokens); + } + } + elem.setupAndGroup(); + } + } + + //TODO: add .compile for all exceptions of the element? + protected void setupRef(final int firstMatchToken, final Element elem, + final AnalyzedTokenReadings[] tokens) throws IOException { + if (elem.isReferenceElement()) { + final int refPos = firstMatchToken + elem.getMatch().getTokenRef(); + if (refPos < tokens.length) { + elem.compile(tokens[refPos], language.getSynthesizer()); + } + } + } + + protected boolean testAllReadings(final AnalyzedTokenReadings[] tokens, + final Element elem, final Element prevElement, final int tokenNo, + final int firstMatchToken, final int prevSkipNext) throws IOException { + boolean thisMatched = false; + final int numberOfReadings = tokens[tokenNo].getReadingsLength(); + setupAndGroup(firstMatchToken, elem, tokens); + for (int l = 0; l < numberOfReadings; l++) { + final AnalyzedToken matchToken = tokens[tokenNo].getAnalyzedToken(l); + prevMatched = prevMatched || prevSkipNext > 0 && prevElement != null + && prevElement.isMatchedByScopeNextException(matchToken); + if (prevMatched) { + return false; + } + thisMatched = thisMatched || elem.isMatched(matchToken); + if (!thisMatched && !elem.isInflected() && elem.getPOStag() == null + && (prevElement != null && prevElement.getExceptionList() == null)) { + return false; // the token is the same, we will not get a match + } + if (groupsOrUnification) { + thisMatched &= testUnificationAndGroups(thisMatched, + l + 1 == 
numberOfReadings, matchToken, elem); + } + } + if (thisMatched) { + for (int l = 0; l < numberOfReadings; l++) { + if (elem.isExceptionMatchedCompletely(tokens[tokenNo].getAnalyzedToken(l))) + return false; + } + if (tokenNo > 0 && elem.hasPreviousException()) { + if (elem.isMatchedByPreviousException(tokens[tokenNo - 1])) + return false; + } + } + return thisMatched; + } + + protected boolean testUnificationAndGroups(final boolean matched, + final boolean lastReading, final AnalyzedToken matchToken, + final Element elem) { + boolean thisMatched = matched; + if (testUnification) { + if (matched && elem.isUnified()) { + thisMatched = thisMatched && unifier.isUnified(matchToken, elem.getUniFeatures(), + elem.isUniNegated(), lastReading); + } + if (thisMatched && getUnified) { + unifiedTokens = unifier.getFinalUnified(); + } + if (!elem.isUnified()) { + unifier.reset(); + } + } + elem.addMemberAndGroup(matchToken); + if (lastReading) { + thisMatched &= elem.checkAndGroup(thisMatched); + } + return thisMatched; + } + + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/Element.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/Element.java new file mode 100644 index 0000000..0ad7c1f --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/Element.java @@ -0,0 +1,803 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.rules.patterns; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Map; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import de.danielnaber.languagetool.AnalyzedToken; +import de.danielnaber.languagetool.AnalyzedTokenReadings; +import de.danielnaber.languagetool.JLanguageTool; +import de.danielnaber.languagetool.synthesis.Synthesizer; +import de.danielnaber.languagetool.tools.StringTools; + +/** + * A part of a pattern. + * + * @author Daniel Naber + */ +public class Element { + + private String stringToken; + private String posToken; + private String regToken; + private boolean posRegExp; + + private boolean negation; + private boolean posNegation; + + private final boolean caseSensitive; + private final boolean stringRegExp; + private boolean inflected; + + private boolean testWhitespace; + private boolean whitespaceBefore; + + /** + * List of exceptions that are valid for the current token and / or some next + * tokens. + */ + private List<Element> exceptionList; + + /** + * True if scope=="next". + */ + private boolean exceptionValidNext; + + /** + * True if any exception with a scope=="current" or scope=="next" is set for + * the element. + */ + private boolean exceptionSet; + + /** + * True if attribute scope=="previous". + */ + private boolean exceptionValidPrevious; + + /** + * List of exceptions that are valid for a previous token. 
+ */ + private List<Element> previousExceptionList; + + private List<Element> andGroupList; + private boolean andGroupSet; + private boolean[] andGroupCheck; + + private int skip; + + private Pattern p; + private Pattern pPos; + + private Matcher m; + private Matcher mPos; + + /** The reference to another element in the pattern. **/ + private Match tokenReference; + + /** + * True when the element stores a formatted reference to another element of + * the pattern. + */ + private boolean containsMatches; + + /** Matches only tokens without any POS tag. **/ + private static final String UNKNOWN_TAG = "UNKNOWN"; + + /** + * Parameter passed to regular expression matcher to enable case insensitive + * Unicode matching. + */ + private static final String CASE_INSENSITIVE = "(?iu)"; + + private String referenceString; + + /** String ID of the phrase the element is in. **/ + private String phraseName; + + /** + * This var is used to determine if calling {@link #setStringElement} makes + * sense. This method takes most time so it's best to reduce the number of its + * calls. + **/ + private boolean testString; + + /** + * Tells if the element is inside the unification, so that {@link Unifier} + * tests it. + */ + private boolean unified; + private boolean uniNegation; + + private Map<String, List<String>> unificationFeatures; + + /** + * Creates Element that is used to match tokens in the text. + * + * @param token + * String to be matched + * @param caseSensitive + * True if the check is case-sensitive. + * @param regExp + * True if the check uses regular expressions. + * @param inflected + * True if the check refers to base forms (lemmas). + */ + public Element(final String token, final boolean caseSensitive, + final boolean regExp, final boolean inflected) { + this.caseSensitive = caseSensitive; + this.stringRegExp = regExp; + this.inflected = inflected; + setStringElement(token); + } + + /** + * Checks whether the rule element matches the token given as a parameter. 
+ * + * @param token + * @AnalyzedToken to check matching against + * @return True if token matches, false otherwise. + */ + public final boolean isMatched(final AnalyzedToken token) { + if (testWhitespace && !isWhitespaceBefore(token)) { + return false; + } + boolean matched = false; + if (testString) { + matched = (isStringTokenMatched(token) ^ negation) + && (isPosTokenMatched(token) ^ posNegation); + } else { + matched = (!negation) && (isPosTokenMatched(token) ^ posNegation); + } + + if (andGroupSet) { + andGroupCheck[0] |= matched; + } + return matched; + } + + /** + * Checks whether an exception matches. + * + * @param token + * @AnalyzedToken to check matching against + * @return True if any of the exceptions matches (logical disjunction). + */ + public final boolean isExceptionMatched(final AnalyzedToken token) { + if (exceptionSet) { + for (final Element testException : exceptionList) { + if (!testException.exceptionValidNext) { + if (testException.isMatched(token)) { + return true; + } + } + } + } + return false; + } + + /** + * Enables testing multiple conditions specified by different elements. + * Doesn't test exceptions. + * + * Works as logical AND operator only if preceded with + * {@link #setupAndGroup()}, and followed by {@link #checkAndGroup(boolean)}. + * + * @param token + * AnalyzedToken - the token checked. 
+ */ + public final void addMemberAndGroup(final AnalyzedToken token) { + if (andGroupSet) { + for (int i = 0; i < andGroupList.size(); i++) { + if (!andGroupCheck[i + 1]) { + final Element testAndGroup = andGroupList.get(i); + if (testAndGroup.isMatched(token)) { + andGroupCheck[i + 1] = true; + } + } + } + } + } + + public final void setupAndGroup() { + if (andGroupSet) { + andGroupCheck = new boolean[andGroupList.size() + 1]; + Arrays.fill(andGroupCheck, false); + } + } + + public final boolean checkAndGroup(final boolean previousValue) { + if (andGroupSet) { + boolean allConditionsMatch = true; + for (final boolean testValue : andGroupCheck) { + allConditionsMatch &= testValue; + } + return allConditionsMatch; + } + return previousValue; + } + + /** + * Enables testing multiple conditions specified by multiple element + * exceptions. + * + * Works as logical AND operator. + * + * @param token + * AnalyzedToken - the token checked for exceptions. + * @return true if all conditions are met, false otherwise. + */ + public final boolean isAndExceptionGroupMatched(final AnalyzedToken token) { + if (andGroupSet) { + for (final Element testAndGroup : andGroupList) { + if (testAndGroup.isExceptionMatched(token)) { + return true; + } + } + } + return false; + } + + /** + * This method checks exceptions both in AND-group and the token. Introduced + * to for clarity. + * + * @param token + * Token to match + * @return True if matched. + */ + public final boolean isExceptionMatchedCompletely(final AnalyzedToken token) { + // note: short-circuiting possible + return isExceptionMatched(token) || isAndExceptionGroupMatched(token); + } + + public final void setAndGroupElement(final Element andToken) { + if (andToken != null) { + if (andGroupList == null) { + andGroupList = new ArrayList<Element>(); + } + if (!andGroupSet) { + andGroupSet = true; + } + andGroupList.add(andToken); + } + } + + /** + * Checks if this element has an AND group associated with it. 
+ * + * @return true if the element has a group of elements that all should match. + */ + public final boolean hasAndGroup() { + return andGroupSet; + } + + /** + * Returns the group of elements linked with AND operator. + * + * @return List of Elements. + */ + public final List<Element> getAndGroup() { + return andGroupList; + } + + /** + * Checks whether a previously set exception matches (in case the exception + * had scope == "next"). + * + * @param token + * @AnalyzedToken to check matching against. + * @return True if any of the exceptions matches. + */ + public final boolean isMatchedByScopeNextException(final AnalyzedToken token) { + if (exceptionSet) { + for (final Element testException : exceptionList) { + if (testException.exceptionValidNext) { + if (testException.isMatched(token)) { + return true; + } + } + } + } + return false; + } + + /** + * Checks whether an exception for a previous token matches (in case the + * exception had scope == "previous"). + * + * @param token + * {@link AnalyzedToken} to check matching against. + * @return True if any of the exceptions matches. + */ + public final boolean isMatchedByPreviousException(final AnalyzedToken token) { + if (exceptionValidPrevious) { + for (final Element testException : previousExceptionList) { + if (!testException.exceptionValidNext) { + if (testException.isMatched(token)) { + return true; + } + } + } + } + return false; + } + + /** + * Checks whether an exception for a previous token matches all readings of a + * given token (in case the exception had scope == "previous"). + * + * @param prevToken + * {@link AnalyzedTokenReadings} to check matching against. + * @return true if any of the exceptions matches. 
+ */ + public final boolean isMatchedByPreviousException( + final AnalyzedTokenReadings prevToken) { + final int numReadings = prevToken.getReadingsLength(); + for (int i = 0; i < numReadings; i++) { + if (isMatchedByPreviousException(prevToken.getAnalyzedToken(i))) { + return true; + } + } + return false; + } + + /** + * Checks if the token is a SENT_START. + * + * @return True if the element starts the sentence and the element hasn't been + * set to have negated POS token. + * + */ + public final boolean isSentStart() { + return JLanguageTool.SENTENCE_START_TAGNAME.equals(posToken) + && !posNegation; + } + + @Override + public final String toString() { + final StringBuilder sb = new StringBuilder(); + if (negation) { + sb.append('!'); + } + sb.append(stringToken); + if (phraseName != null) { + sb.append(" {"); + sb.append(phraseName); + sb.append('}'); + } + if (posToken != null) { + sb.append('/'); + sb.append(posToken); + } + return sb.toString(); + } + + public final void setPosElement(final String posToken, final boolean regExp, + final boolean negation) { + this.posToken = posToken; + this.posNegation = negation; + posRegExp = regExp; + if (posRegExp) { + pPos = Pattern.compile(posToken); + } + } + + public final String getString() { + return stringToken; + } + + public final void setStringElement(final String token) { + this.stringToken = token; + testString = !StringTools.isEmpty(stringToken); + if (testString && stringRegExp) { + regToken = stringToken; + if (!caseSensitive) { + regToken = CASE_INSENSITIVE + stringToken; + } + if (!"\\0".equals(token)) { + p = Pattern.compile(regToken); + } + } + } + + /** + * Sets a POS-type exception for matching string tokens. + * + * @param posToken + * The part of the speech tag in the exception. + * @param regExp + * True if the POS is specified as a regular expression. + * @param negation + * True if the exception is negated. + * @param scopeNext + * True if the exception scope is next tokens. 
+ * @param scopePrevious + * True if the exception should match only a single previous token. + */ + public final void setPosException(final String posToken, + final boolean regExp, final boolean negation, final boolean scopeNext, + final boolean scopePrevious) { + final Element posException = new Element("", this.caseSensitive, false, + false); + posException.setPosElement(posToken, regExp, negation); + posException.exceptionValidNext = scopeNext; + setException(posException, scopePrevious); + } + + /** + * Sets a string-type exception for matching string tokens. + * + * @param token + * The string in the exception. + * @param regExp + * True if the string is specified as a regular expression. + * @param inflected + * True if the string is a base form (lemma). + * @param negation + * True if the exception is negated. + * @param scopeNext + * True if the exception scope is next tokens. + * @param scopePrevious + * True if the exception should match only a single previous token. + */ + public final void setStringException(final String token, + final boolean regExp, final boolean inflected, final boolean negation, + final boolean scopeNext, final boolean scopePrevious) { + final Element stringException = new Element(token, this.caseSensitive, + regExp, inflected); + stringException.setNegation(negation); + stringException.exceptionValidNext = scopeNext; + setException(stringException, scopePrevious); + } + + private void setException(final Element elem, final boolean scopePrevious) { + exceptionValidPrevious |= scopePrevious; + if (exceptionList == null && !scopePrevious) { + exceptionList = new ArrayList<Element>(); + } + if (previousExceptionList == null && scopePrevious) { + previousExceptionList = new ArrayList<Element>(); + } + if (scopePrevious) { + previousExceptionList.add(elem); + } else { + if (!exceptionSet) { + exceptionSet = true; + } + if (exceptionSet) { + exceptionList.add(elem); + } + } + } + + /** + * Tests if part of speech matches a given string. 
+ * + * @param token + * Token to test. + * @return true if matches + * + * Special value UNKNOWN_TAG matches null POS tags. + * + */ + private boolean isPosTokenMatched(final AnalyzedToken token) { + // if no POS set + // defaulting to true + if (posToken == null) { + return true; + } + if (token.getPOSTag() == null) { + if (posRegExp) { + if (mPos == null) { + mPos = pPos.matcher(UNKNOWN_TAG); + } else { + mPos.reset(UNKNOWN_TAG); + } + return mPos.matches(); + } + if (UNKNOWN_TAG.equals(posToken)) { + return true; + } + } + boolean match; + if (posRegExp) { + if (mPos == null) { + mPos = pPos.matcher(token.getPOSTag()); + } else { + mPos.reset(token.getPOSTag()); + } + match = mPos.matches(); + } else { + match = posToken.equals(token.getPOSTag()); + } + if (!match && UNKNOWN_TAG.equals(posToken)) { // these are helper tags, + // ignore them + match = JLanguageTool.SENTENCE_END_TAGNAME.equals(token.getPOSTag()) + || JLanguageTool.PARAGRAPH_END_TAGNAME.equals(token.getPOSTag()); + } + return match; + } + + /** + * Tests whether the string token element matches a given token. + * + * @param token + * {@link AnalyzedToken} to match against. + * @return True if matches. + */ + private boolean isStringTokenMatched(final AnalyzedToken token) { + final String testToken = getTestToken(token); + if (stringRegExp) { + if (m == null) { + m = p.matcher(testToken); + } else { + m.reset(testToken); + } + return m.matches(); + } + if (caseSensitive) { + return stringToken.equals(testToken); + } + return stringToken.equalsIgnoreCase(testToken); + } + + private String getTestToken(final AnalyzedToken token) { + // enables using words with lemmas and without lemmas + // in the same regexp with inflected="yes" + if (inflected) { + return token.getTokenInflected(); + } + return token.getToken(); + } + + /** + * Gets the exception scope length. + * + * @return Scope length. + */ + public final int getSkipNext() { + return skip; + } + + /** + * Sets the exception scope length. 
+ * + * @param i + * Exception scope length. + */ + public final void setSkipNext(final int i) { + skip = i; + } + + /** + * Checks if the element has an exception for a previous token. + * + * @return True if the element has a previous token matching exception. + */ + public final boolean hasPreviousException() { + return exceptionValidPrevious; + } + + /** + * Negates the meaning of match(). + * + * @param negation + * - true if the meaning of match() is to be negated. + */ + public final void setNegation(final boolean negation) { + this.negation = negation; + } + + /** + * see {@link #setNegation} + * + * @since 0.9.3 + */ + public final boolean getNegation() { + return this.negation; + } + + /** + * + * @return true when this element refers to another token. + */ + public final boolean isReferenceElement() { + return containsMatches; + } + + /** + * Sets the reference to another token. + * + * @param match + * Formatting object for the token reference. + */ + public final void setMatch(final Match match) { + tokenReference = match; + containsMatches = true; + } + + public final Match getMatch() { + return tokenReference; + } + + /** + * Prepare Element for matching by formatting its string token and POS (if the + * Element is supposed to refer to some other token). 
+ * + * @param token + * the token specified as {@link AnalyzedTokenReadings} + * @param synth + * the language synthesizer ({@link Synthesizer}) + * + */ + public final void compile(final AnalyzedTokenReadings token, + final Synthesizer synth) throws IOException { + + m = null; + p = null; + tokenReference.setToken(token); + tokenReference.setSynthesizer(synth); + + if (StringTools.isEmpty(referenceString)) { + referenceString = stringToken; + } + if (tokenReference.setsPos()) { + final String posReference = tokenReference.getTargetPosTag(); + if (posReference != null) { + if (mPos != null) { + mPos = null; + } + setPosElement(posReference, tokenReference.posRegExp(), negation); + } + setStringElement(referenceString.replace("\\" + + tokenReference.getTokenRef(), "")); + inflected = true; + } else { + setStringElement(referenceString.replace("\\" + + tokenReference.getTokenRef(), tokenReference.toTokenString())); + } + } + + /** + * Sets the phrase the element is in. + * + * @param s + * ID of the phrase. + */ + public final void setPhraseName(final String s) { + phraseName = s; + } + + /** + * Checks if the Element is in any phrase. + * + * @return True if the Element is contained in the phrase. + */ + public final boolean isPartOfPhrase() { + return phraseName != null; + } + + /** + * Whether the element matches case sensitively. + * + * @since 0.9.3 + */ + public final boolean getCaseSensitive() { + return caseSensitive; + } + + /** + * Tests whether the element matches a regular expression. + * + * @since 0.9.6 + */ + public final boolean isRegularExpression() { + return stringRegExp; + } + + /** + * @return the POS of the Element + * @since 0.9.6 + */ + public final String getPOStag() { + return posToken; + } + + /** + * Tests whether the POS is negated. + * + * @return true if so. + */ + public final boolean getPOSNegation() { + return posNegation; + } + + /** + * Whether the token is inflected. + * + * @return True if so. 
+ */ + public final boolean isInflected() { + return inflected; + } + + /** + * Gets the phrase the element is in. + * + * @return String The name of the phrase. + */ + public final String getPhraseName() { + return phraseName; + } + + public final boolean isUnified() { + return unified; + } + + public final void setUnification(final Map<String, List<String>> uniFeatures) { + unificationFeatures = uniFeatures; + unified = true; + } + + /** + * Get unification features and types. + * @return A map from features to a list of types. + * @since 1.0.1 + */ + public final Map<String, List<String>> getUniFeatures() { + return unificationFeatures; + } + + public final void setUniNegation() { + uniNegation = true; + } + + public final boolean isUniNegated() { + return uniNegation; + } + + public final void setWhitespaceBefore(final boolean isWhite) { + whitespaceBefore = isWhite; + testWhitespace = true; + } + + public final void setExceptionSpaceBefore(final boolean isWhite) { + if (exceptionList != null) { + exceptionList.get(exceptionList.size()).setWhitespaceBefore(isWhite); + } + } + + public final boolean isWhitespaceBefore(final AnalyzedToken token) { + return whitespaceBefore == token.isWhitespaceBefore(); + } + + /** + * Since 1.0.0 + * @return A List of Exceptions. Used for testing. 
+ */ + public final List<Element> getExceptionList() { + return exceptionList; + } +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/FalseFriendRuleLoader.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/FalseFriendRuleLoader.java new file mode 100644 index 0000000..94c6515 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/FalseFriendRuleLoader.java @@ -0,0 +1,356 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.rules.patterns; + +import java.io.IOException; +import java.io.InputStream; +import java.text.MessageFormat; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.ResourceBundle; + +import javax.xml.parsers.ParserConfigurationException; +import javax.xml.parsers.SAXParser; +import javax.xml.parsers.SAXParserFactory; + +import org.xml.sax.Attributes; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.DefaultHandler; + +import de.danielnaber.languagetool.JLanguageTool; +import de.danielnaber.languagetool.Language; +import de.danielnaber.languagetool.rules.Category; +import de.danielnaber.languagetool.rules.IncorrectExample; +import de.danielnaber.languagetool.tools.StringTools; + +/** + * Loads {@link PatternRule}s from a false friends XML file. 
+ * + * @author Daniel Naber + */ +public class FalseFriendRuleLoader extends DefaultHandler { + + public FalseFriendRuleLoader() { + } + + public final List<PatternRule> getRules(final InputStream file, + final Language textLanguage, final Language motherTongue) + throws ParserConfigurationException, SAXException, IOException { + final FalseFriendRuleHandler handler = new FalseFriendRuleHandler( + textLanguage, motherTongue); + final SAXParserFactory factory = SAXParserFactory.newInstance(); + final SAXParser saxParser = factory.newSAXParser(); + saxParser.getXMLReader() + .setFeature( + "http://apache.org/xml/features/nonvalidating/load-external-dtd", + false); + saxParser.parse(file, handler); + final List<PatternRule> rules = handler.getRules(); + // Add suggestions to each rule: + final ResourceBundle messages = ResourceBundle.getBundle( + "de.danielnaber.languagetool.MessagesBundle", motherTongue.getLocale()); + for (final PatternRule rule : rules) { + final List<String> suggestionMap = handler.getSuggestionMap().get(rule.getId()); + if (suggestionMap != null) { + final MessageFormat msgFormat = new MessageFormat(messages + .getString("false_friend_suggestion")); + final Object[] msg = new Object[] { formatSuggestions(suggestionMap) }; + rule.setMessage(rule.getMessage() + " " + msgFormat.format(msg)); + } + } + return rules; + } + + private String formatSuggestions(final List<String> l) { + final StringBuilder sb = new StringBuilder(); + for (final Iterator<String> iter = l.iterator(); iter.hasNext();) { + final String s = iter.next(); + sb.append("<suggestion>"); + sb.append(s); + sb.append("</suggestion>"); + if (iter.hasNext()) { + sb.append(", "); + } + } + return sb.toString(); + } + + /** Testing only. 
*/ + public final void main(final String[] args) + throws ParserConfigurationException, SAXException, IOException { + final FalseFriendRuleLoader prg = new FalseFriendRuleLoader(); + List<PatternRule> l = prg.getRules(JLanguageTool.getDataBroker() + .getFromRulesDirAsStream("/false-friends.xml"), Language.ENGLISH, + Language.GERMAN); + System.out.println("Hints for German native speakers:"); + for (final PatternRule rule : l) { + System.out.println(rule); + } + System.out.println("======================================="); + System.out.println("Hints for English native speakers:"); + l = prg.getRules(JLanguageTool.getDataBroker() + .getFromRulesDirAsStream("/false-friends.xml"), + Language.GERMAN, Language.ENGLISH); + for (final PatternRule rule : l) { + System.out.println(rule); + } + } + +} + +class FalseFriendRuleHandler extends XMLRuleHandler { + + private final ResourceBundle messages; + private final MessageFormat formatter; + + private final Language textLanguage; + private final Language motherTongue; + + private boolean defaultOff; + + private Language language; + private Language translationLanguage; + private Language currentTranslationLanguage; + private List<StringBuilder> translations = new ArrayList<StringBuilder>(); + private StringBuilder translation = new StringBuilder(); + private final List<String> suggestions = new ArrayList<String>(); + // rule ID -> list of translations: + private final Map<String, List<String>> suggestionMap = new HashMap<String, List<String>>(); + + private boolean inTranslation; + + public FalseFriendRuleHandler(final Language textLanguage, + final Language motherTongue) { + messages = ResourceBundle.getBundle( + "de.danielnaber.languagetool.MessagesBundle", motherTongue.getLocale()); + formatter = new MessageFormat(""); + formatter.setLocale(motherTongue.getLocale()); + this.textLanguage = textLanguage; + this.motherTongue = motherTongue; + } + + public Map<String, List<String>> getSuggestionMap() { + return 
suggestionMap; + } + + // =========================================================== + // SAX DocumentHandler methods + // =========================================================== + + @Override + public void startElement(final String namespaceURI, final String lName, + final String qName, final Attributes attrs) throws SAXException { + if (qName.equals("rule")) { + translations = new ArrayList<StringBuilder>(); + id = attrs.getValue("id"); + if (!(inRuleGroup && defaultOff)) { + defaultOff = "off".equals(attrs.getValue("default")); + } + if (inRuleGroup && id == null) { + id = ruleGroupId; + } + correctExamples = new ArrayList<String>(); + incorrectExamples = new ArrayList<IncorrectExample>(); + } else if (qName.equals("pattern")) { + inPattern = true; + final String languageStr = attrs.getValue("lang"); + language = Language.getLanguageForShortName(languageStr); + if (language == null) { + throw new SAXException("Unknown language '" + languageStr + "'"); + } + } else if (qName.equals("exception")) { + inException = true; + exceptions = new StringBuilder(); + + if (attrs.getValue(NEGATE) != null) { + exceptionStringNegation = attrs.getValue(NEGATE).equals(YES); + } + if (attrs.getValue(SCOPE) != null) { + exceptionValidNext = attrs.getValue(SCOPE).equals("next"); + exceptionValidPrev = attrs.getValue(SCOPE).equals("previous"); + } + if (attrs.getValue(INFLECTED) != null) { + exceptionStringInflected = attrs.getValue(INFLECTED).equals(YES); + } + if (attrs.getValue(POSTAG) != null) { + exceptionPosToken = attrs.getValue(POSTAG); + if (attrs.getValue(POSTAG_REGEXP) != null) { + exceptionPosRegExp = attrs.getValue(POSTAG_REGEXP).equals(YES); + } + if (attrs.getValue(NEGATE_POS) != null) { + exceptionPosNegation = attrs.getValue(NEGATE_POS).equals(YES); + } + } + if (attrs.getValue(REGEXP) != null) { + exceptionStringRegExp = attrs.getValue(REGEXP).equals(YES); + } + + } else if (qName.equals(TOKEN)) { + setToken(attrs); + } else if (qName.equals("translation")) { 
+ inTranslation = true; + final String languageStr = attrs.getValue("lang"); + final Language tmpLang = Language.getLanguageForShortName(languageStr); + currentTranslationLanguage = tmpLang; + if (tmpLang == motherTongue) { + translationLanguage = tmpLang; + if (translationLanguage == null) { + throw new SAXException("Unknown language '" + languageStr + "'"); + } + } + } else if (qName.equals(EXAMPLE) + && attrs.getValue(TYPE).equals("correct")) { + inCorrectExample = true; + correctExample = new StringBuilder(); + } else if (qName.equals(EXAMPLE) + && attrs.getValue(TYPE).equals("incorrect")) { + inIncorrectExample = true; + incorrectExample = new StringBuilder(); + } else if (qName.equals("message")) { + inMessage = true; + message = new StringBuilder(); + } else if (qName.equals("rulegroup")) { + ruleGroupId = attrs.getValue("id"); + inRuleGroup = true; + defaultOff = "off".equals(attrs.getValue(DEFAULT)); + } + } + + @Override + public void endElement(final String namespaceURI, final String sName, + final String qName) { + if (qName.equals("rule")) { + if (language == textLanguage && translationLanguage != null + && translationLanguage == motherTongue && language != motherTongue + && !translations.isEmpty()) { + formatter.applyPattern(messages.getString("false_friend_hint")); + final Object[] messageArguments = { + elements.toString().replace('|', '/'), + messages.getString(textLanguage.getShortName()), + formatTranslations(translations), + messages.getString(motherTongue.getShortName()) }; + final String description = formatter.format(messageArguments); + final PatternRule rule = new PatternRule(id, language, elementList, + messages.getString("false_friend_desc") + " " + + elements.toString().replace('|', '/'), description, messages + .getString("false_friend")); + rule.setCorrectExamples(correctExamples); + rule.setIncorrectExamples(incorrectExamples); + rule.setCategory(new Category(messages + .getString("category_false_friend"))); + if (defaultOff) { + 
rule.setDefaultOff(); + } + rules.add(rule); + } + + if (elementList != null) { + elementList.clear(); + } + + } else if (qName.equals("exception")) { + inException = false; + if (!exceptionSet) { + tokenElement = new Element(elements.toString(), caseSensitive, + regExpression, tokenInflected); + exceptionSet = true; + } + tokenElement.setNegation(tokenNegated); + if (!StringTools.isEmpty(exceptions.toString())) { + tokenElement.setStringException(exceptions.toString(), + exceptionStringRegExp, exceptionStringInflected, + exceptionStringNegation, exceptionValidNext, exceptionValidPrev); + } + if (exceptionPosToken != null) { + tokenElement.setPosException(exceptionPosToken, exceptionPosRegExp, + exceptionPosNegation, exceptionValidNext, exceptionValidPrev); + exceptionPosToken = null; + } + } else if (qName.equals(TOKEN)) { + finalizeTokens(); + } else if (qName.equals("pattern")) { + inPattern = false; + } else if (qName.equals("translation")) { + if (currentTranslationLanguage == motherTongue) { + translations.add(translation); + } + if (currentTranslationLanguage == textLanguage) { + suggestions.add(translation.toString()); + } + translation = new StringBuilder(); + inTranslation = false; + currentTranslationLanguage = null; + } else if (qName.equals(EXAMPLE)) { + if (inCorrectExample) { + correctExamples.add(correctExample.toString()); + } else if (inIncorrectExample) { + incorrectExamples + .add(new IncorrectExample(incorrectExample.toString())); + } + inCorrectExample = false; + inIncorrectExample = false; + correctExample = new StringBuilder(); + incorrectExample = new StringBuilder(); + } else if (qName.equals("message")) { + inMessage = false; + } else if (qName.equals("rulegroup")) { + if (!suggestions.isEmpty()) { + final List<String> l = new ArrayList<String>(suggestions); + suggestionMap.put(id, l); + suggestions.clear(); + } + inRuleGroup = false; + } + } + + private String formatTranslations(final List<StringBuilder> translations) { + final 
StringBuilder sb = new StringBuilder(); + for (final Iterator<StringBuilder> iter = translations.iterator(); iter + .hasNext();) { + final StringBuilder trans = iter.next(); + sb.append('"'); + sb.append(trans.toString()); + sb.append('"'); + if (iter.hasNext()) { + sb.append(", "); + } + } + return sb.toString(); + } + + @Override + public void characters(final char[] buf, final int offset, final int len) { + final String s = new String(buf, offset, len); + if (inException) { + exceptions.append(s); + } else if (inToken && inPattern) { + elements.append(s); + } else if (inCorrectExample) { + correctExample.append(s); + } else if (inIncorrectExample) { + incorrectExample.append(s); + } else if (inTranslation) { + translation.append(s); + } + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/Match.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/Match.java new file mode 100644 index 0000000..0519f2c --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/Match.java @@ -0,0 +1,551 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.rules.patterns; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.TreeSet; +import java.util.regex.Pattern; + +import de.danielnaber.languagetool.AnalyzedToken; +import de.danielnaber.languagetool.AnalyzedTokenReadings; +import de.danielnaber.languagetool.JLanguageTool; +import de.danielnaber.languagetool.synthesis.Synthesizer; +import de.danielnaber.languagetool.tools.StringTools; + +/** + * Reference to a matched token in a pattern, can be formatted and used for + * matching & suggestions. + * + * @author Marcin Miłkowski + */ +public class Match { + + /** Possible string case conversions. **/ + public enum CaseConversion { + NONE, STARTLOWER, STARTUPPER, ALLLOWER, ALLUPPER; + + /** + * Converts string to the constant enum. + * + * @param str + * String value to be converted. + * @return CaseConversion enum. + */ + public static CaseConversion toCase(final String str) { + try { + return valueOf(str); + } catch (final Exception ex) { + return NONE; + } + } + } + + public enum IncludeRange { + NONE, FOLLOWING, ALL; + + /** + * Converts string to the constant enum. + * + * @param str + * String value to be converted. + * @return IncludeRange enum. 
+ */ + public static IncludeRange toRange(final String str) { + try { + return valueOf(str); + } catch (final Exception ex) { + return NONE; + } + } + } + + private final String posTag; + private boolean postagRegexp; + private final String regexReplace; + private final String posTagReplace; + private final CaseConversion caseConversionType; + + private final IncludeRange includeSkipped; + private String skippedTokens; + + /** + * True if this match element formats a statically defined lemma which is + * enclosed by the element, e.g., <tt><match...>word</word></tt>. + */ + private boolean staticLemma; + + /** + * True if this match element is used for formatting POS token. + */ + private final boolean setPos; + + private AnalyzedTokenReadings formattedToken; + private AnalyzedTokenReadings matchedToken; + + private int tokenRef; + + /** Word form generator for POS tags. **/ + private Synthesizer synthesizer; + + /** Pattern used to define parts of the matched token. **/ + private Pattern pRegexMatch; + + /** Pattern used to define parts of the matched POS token. **/ + private Pattern pPosRegexMatch; + + /** + * True when the match is not in the suggestion. + */ + private boolean inMessageOnly; + + public Match(final String posTag, final String posTagReplace, + final boolean postagRegexp, final String regexMatch, + final String regexReplace, final CaseConversion caseConversionType, + final boolean setPOS, + final IncludeRange includeSkipped) { + this.posTag = posTag; + this.postagRegexp = postagRegexp; + this.caseConversionType = caseConversionType; + + if (regexMatch != null) { + pRegexMatch = Pattern.compile(regexMatch); + } + if (postagRegexp && posTag != null) { + pPosRegexMatch = Pattern.compile(posTag); + } + + this.regexReplace = regexReplace; + this.posTagReplace = posTagReplace; + this.setPos = setPOS; + this.includeSkipped = includeSkipped; + } + + /** + * Sets the token that will be formatted or otherwise used in the class. 
+ */ + public final void setToken(final AnalyzedTokenReadings token) { + if (staticLemma) { + matchedToken = token; + } else { + formattedToken = token; + } + } + + /** + * Sets the token to be formatted etc. and includes the support for + * including the skipped tokens. + * @param tokens Array of tokens + * @param index Index of the token to be formatted + * @param next Position of the next token (the skipped tokens + * are the ones between the tokens[index] and tokens[next] + */ + public final void setToken(final AnalyzedTokenReadings[] tokens, final int index, final int next) { + setToken(tokens[index]); + if (next > 1 && includeSkipped != IncludeRange.NONE) { + final StringBuilder sb = new StringBuilder(); + if (includeSkipped == IncludeRange.FOLLOWING) { + formattedToken = null; + } + for (int k = index + 1; k < index + next; k++) { + if (k > index + 1 && + tokens[k].isWhitespaceBefore()) { + sb.append(' '); + } + sb.append(tokens[k].getToken()); + } + skippedTokens = sb.toString(); + } else { + skippedTokens = ""; + } + } + + /** + private String[] addSkipped(final String[] formattedString) { + if (skippedTokens != null && !"".equals(skippedTokens)) { + String[] finalStrings = new String[formattedString.length]; + for (int i = 1; i <= formattedString.length; i++) + } + } + + **/ + + /** + * Checks if the Match element is used for setting the part of speech Element. + * + * @return True if Match sets POS. + */ + public final boolean setsPos() { + return setPos; + } + + /** + * Checks if the Match element uses regexp-based form of the POS tag. + * + * @return True if regexp is used in POS. + */ + public final boolean posRegExp() { + return postagRegexp; + } + + /** + * Sets a base form (lemma) that will be formatted, or synthesized, using the + * specified POS regular expressions. + * + * @param lemmaString String that specifies the base form. 
*/
  public final void setLemmaString(final String lemmaString) {
    if (!StringTools.isEmpty(lemmaString)) {
      // The lemma is wrapped in a synthetic token reading (position 0) that
      // uses the lemma as both token text and base form.
      formattedToken = new AnalyzedTokenReadings(new AnalyzedToken(lemmaString,
          posTag, lemmaString), 0);
      staticLemma = true;
      // A static lemma always switches the POS tag to regexp mode.
      postagRegexp = true;
      if (posTag != null) {
        pPosRegexMatch = Pattern.compile(posTag);
      }
    }
  }

  /**
   * Sets a synthesizer used for grammatical synthesis of forms based on
   * formatted POS values.
   *
   * @param synth Synthesizer class.
   */
  public final void setSynthesizer(final Synthesizer synth) {
    synthesizer = synth;
  }

  /**
   * Gets all strings formatted using the match element.
   *
   * Without a formatted token the result is a one-element array holding
   * null (or only the skipped tokens, when those are included). Without a
   * POS tag the result is the token text after regexp replacement and case
   * conversion. With a POS tag and a synthesizer the result is the sorted
   * set of synthesized word forms.
   *
   * @return array of strings
   * @throws IOException
   *           in case of synthesizer-related disk problems.
   */
  public final String[] toFinalString() throws IOException {
    String[] formattedString = new String[1];
    if (formattedToken != null) {
      final int readingCount = formattedToken.getReadingsLength();
      formattedString[0] = formattedToken.getToken();
      if (pRegexMatch != null) {
        formattedString[0] = pRegexMatch.matcher(formattedString[0])
            .replaceAll(regexReplace);
      }
      formattedString[0] = convertCase(formattedString[0]);
      if (posTag != null) {
        if (synthesizer == null) {
          // NOTE(review): this overwrites the regexp-replaced and
          // case-converted text with the raw token - confirm it is intended.
          formattedString[0] = formattedToken.getToken();
        } else if (postagRegexp) {
          final TreeSet<String> wordForms = new TreeSet<String>();
          // oneForm is re-evaluated for every reading without a lemma, so
          // the last such reading decides whether synthesis is skipped.
          boolean oneForm = false;
          for (int k = 0; k < readingCount; k++) {
            if (formattedToken.getAnalyzedToken(k).getLemma() == null) {
              final String posUnique = formattedToken.getAnalyzedToken(k)
                  .getPOSTag();
              if (posUnique == null) {
                // Untagged reading: keep the surface form as it is.
                wordForms.add(formattedToken.getToken());
                oneForm = true;
              } else {
                // Sentence/paragraph marker tags also keep the surface form.
                if (JLanguageTool.SENTENCE_START_TAGNAME.equals(posUnique)
                    || JLanguageTool.SENTENCE_END_TAGNAME.equals(posUnique)
                    || JLanguageTool.PARAGRAPH_END_TAGNAME.equals(posUnique)) {
                  if (!oneForm) {
                    wordForms.add(formattedToken.getToken());
                  }
                  oneForm = true;
                } else {
                  oneForm = false;
                }
              }
            }
          }
          final String targetPosTag = getTargetPosTag();
          if (!oneForm) {
            for (int i = 0; i < readingCount; i++) {
              final String[] possibleWordForms = synthesizer.synthesize(
                  formattedToken.getAnalyzedToken(i), targetPosTag, true);
              if (possibleWordForms != null) {
                wordForms.addAll(Arrays.asList(possibleWordForms));
              }
            }
          }
          if (wordForms.isEmpty()) {
            // Nothing could be synthesized: return the token in parentheses.
            formattedString[0] = "(" + formattedToken.getToken() + ")";
          } else {
            formattedString = wordForms.toArray(new String[wordForms.size()]);
          }
        } else {
          final TreeSet<String> wordForms = new TreeSet<String>();
          for (int i = 0; i < readingCount; i++) {
            final String[] possibleWordForms = synthesizer.synthesize(
                formattedToken.getAnalyzedToken(i), posTag);
            if (possibleWordForms != null) {
              wordForms.addAll(Arrays.asList(possibleWordForms));
            }
          }
          // Unlike the regexp branch, this may produce an empty array.
          formattedString = wordForms.toArray(new String[wordForms.size()]);
        }
      }
    }
    if (includeSkipped != IncludeRange.NONE
        && skippedTokens != null && !"".equals(skippedTokens)) {
      // Append the skipped tokens to every form; a missing form counts as "".
      final String[] helper = new String[formattedString.length];
      for (int i = 0; i < formattedString.length; i++) {
        if (formattedString[i] == null) {
          formattedString[i] = "";
        }
        helper[i] = formattedString[i] + skippedTokens;
      }
      formattedString = helper;
    }
    return formattedString;
  }

  /**
   * Format POS tag using parameters already defined in the class.
   *
   * @return Formatted POS tag as String.
   */
  // FIXME: gets only the first POS tag that matches, this can be wrong
  // on the other hand, many POS tags = too many suggestions?
+ public final String getTargetPosTag() { + String targetPosTag = posTag; + final List<String> posTags = new ArrayList<String>(); + if (staticLemma) { + final int numRead = matchedToken.getReadingsLength(); + for (int i = 0; i < numRead; i++) { + final String tst = matchedToken.getAnalyzedToken(i).getPOSTag(); + if (tst != null && pPosRegexMatch.matcher(tst).matches()) { + targetPosTag = matchedToken.getAnalyzedToken(i).getPOSTag(); + posTags.add(targetPosTag); + } + } + if (pPosRegexMatch != null && posTagReplace != null) { + targetPosTag = pPosRegexMatch.matcher(targetPosTag).replaceAll( + posTagReplace); + } + } else { + final int numRead = formattedToken.getReadingsLength(); + for (int i = 0; i < numRead; i++) { + final String tst = formattedToken.getAnalyzedToken(i).getPOSTag(); + if (tst != null && pPosRegexMatch.matcher(tst).matches()) { + targetPosTag = formattedToken.getAnalyzedToken(i).getPOSTag(); + posTags.add(targetPosTag); + } + } + if (pPosRegexMatch != null && posTagReplace != null) { + if (posTags.isEmpty()) { + posTags.add(targetPosTag); + } + final StringBuilder sb = new StringBuilder(); + final int posTagLen = posTags.size(); + int l = 0; + for (String lposTag : posTags) { + l++; + lposTag = pPosRegexMatch.matcher(lposTag).replaceAll(posTagReplace); + if (setPos) { + lposTag = synthesizer.getPosTagCorrection(lposTag); + } + sb.append(lposTag); + if (l < posTagLen) { + sb.append('|'); + } + } + targetPosTag = sb.toString(); + } + } + return targetPosTag; + } + + /** + * Method for getting the formatted match as a single string. In case of + * multiple matches, it joins them using a regular expression operator "|". + * + * @return Formatted string of the matched token. 
+ */ + public final String toTokenString() throws IOException { + final StringBuilder output = new StringBuilder(); + final String[] stringToFormat = toFinalString(); + for (int i = 0; i < stringToFormat.length; i++) { + output.append(stringToFormat[i]); + if (i + 1 < stringToFormat.length) { + output.append('|'); + } + } + return output.toString(); + } + + /** + * Sets the token number referenced by the match. + * + * @param i Token number. + */ + public final void setTokenRef(final int i) { + tokenRef = i; + } + + /** + * Gets the token number referenced by the match. + * + * @return int - token number. + */ + public final int getTokenRef() { + return tokenRef; + } + + /** + * Converts case of the string token according to match element attributes. + * + * @param s Token to be converted. + * @return Converted string. + */ + private String convertCase(final String s) { + if (StringTools.isEmpty(s)) { + return s; + } + String token = s; + switch (caseConversionType) { + case NONE: + break; + case STARTLOWER: + token = token.substring(0, 1).toLowerCase() + token.substring(1); + break; + case STARTUPPER: + token = token.substring(0, 1).toUpperCase() + token.substring(1); + break; + case ALLUPPER: + token = token.toUpperCase(); + break; + case ALLLOWER: + token = token.toLowerCase(); + break; + default: + break; + } + return token; + } + + /** + * Used to let LT know that it should change the case of the match. + * + * @return true if match converts the case of the token. 
+ */ + public final boolean convertsCase() { + return !caseConversionType.equals(CaseConversion.NONE); + } + + public final AnalyzedTokenReadings filterReadings() { + final ArrayList<AnalyzedToken> l = new ArrayList<AnalyzedToken>(); + if (formattedToken != null) { + if (staticLemma) { + formattedToken = new AnalyzedTokenReadings(new AnalyzedToken( + matchedToken.getToken(), posTag, formattedToken.getToken()), + matchedToken.getStartPos()); + formattedToken.setWhitespaceBefore(matchedToken.isWhitespaceBefore()); + } + String token = formattedToken.getToken(); + if (pRegexMatch != null) { + token = pRegexMatch.matcher(token).replaceAll(regexReplace); + } + token = convertCase(token); + if (posTag != null) { + final int numRead = formattedToken.getReadingsLength(); + if (postagRegexp) { + String targetPosTag = posTag; + for (int i = 0; i < numRead; i++) { + final String tst = formattedToken.getAnalyzedToken(i).getPOSTag(); + if (tst != null && pPosRegexMatch.matcher(tst).matches()) { + targetPosTag = formattedToken.getAnalyzedToken(i).getPOSTag(); + if (posTagReplace != null) { + targetPosTag = pPosRegexMatch.matcher(targetPosTag).replaceAll( + posTagReplace); + } + l + .add(new AnalyzedToken(token, targetPosTag, formattedToken + .getAnalyzedToken(i).getLemma())); + l.get(l.size() - 1).setWhitespaceBefore(formattedToken.isWhitespaceBefore()); + } + } + if (l.isEmpty()) { + for (final AnalyzedToken anaTok : getNewToken(numRead, token)) { + l.add(anaTok); + } + } + } else { + for (final AnalyzedToken anaTok : getNewToken(numRead, token)) { + l.add(anaTok); + } + } + if (formattedToken.isSentEnd()) { + l.add(new AnalyzedToken(formattedToken.getToken(), + JLanguageTool.SENTENCE_END_TAGNAME, + formattedToken.getAnalyzedToken(0).getLemma())); + } + if (formattedToken.isParaEnd()) { + l.add(new AnalyzedToken(formattedToken.getToken(), + JLanguageTool.PARAGRAPH_END_TAGNAME, + formattedToken.getAnalyzedToken(0).getLemma())); + } + } + } + if (l.isEmpty()) { + return 
formattedToken; + } + return new AnalyzedTokenReadings(l.toArray(new AnalyzedToken[l.size()]), formattedToken.getStartPos()); + } + + private AnalyzedToken[] getNewToken(final int numRead, final String token) { + final List<AnalyzedToken> list = new ArrayList<AnalyzedToken>(); + String lemma = ""; + for (int j = 0; j < numRead; j++) { + if (formattedToken.getAnalyzedToken(j).getPOSTag() != null) { + if (formattedToken.getAnalyzedToken(j).getPOSTag().equals(posTag) + && (formattedToken.getAnalyzedToken(j).getLemma() != null)) { + lemma = formattedToken.getAnalyzedToken(j).getLemma(); + } + if (StringTools.isEmpty(lemma)) { + lemma = formattedToken.getAnalyzedToken(0).getLemma(); + } + list.add(new AnalyzedToken(token, posTag, lemma)); + list.get(list.size() - 1). + setWhitespaceBefore(formattedToken.isWhitespaceBefore()); + } + } + return list.toArray(new AnalyzedToken[list.size()]); + } + + /** + * @param inMessageOnly + * the inMessageOnly to set + */ + public void setInMessageOnly(final boolean inMessageOnly) { + this.inMessageOnly = inMessageOnly; + } + + /** + * @return the inMessageOnly + */ + public boolean isInMessageOnly() { + return inMessageOnly; + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/PatternRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/PatternRule.java new file mode 100644 index 0000000..843ef98 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/PatternRule.java @@ -0,0 +1,652 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.rules.patterns; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import de.danielnaber.languagetool.AnalyzedSentence; +import de.danielnaber.languagetool.AnalyzedTokenReadings; +import de.danielnaber.languagetool.Language; +import de.danielnaber.languagetool.rules.IncorrectExample; +import de.danielnaber.languagetool.rules.RuleMatch; +import de.danielnaber.languagetool.tools.StringTools; + +/** + * A Rule that describes a language error as a simple pattern of words or of + * part-of-speech tags. + * + * @author Daniel Naber + */ +public class PatternRule extends AbstractPatternRule { + + private static final String SUGG_TAG = "<suggestion>"; + private static final String END_SUGG_TAG = "</suggestion>"; + + private String subId; // because there can be more than one rule in a rule + // group + + private String message; + private String shortMessage; + + /** Formatted suggestion elements. **/ + private List<Match> suggestionMatches; + + /** + * A list of elements as they appear in XML file (phrases count as single + * tokens in case of matches or skipping). + */ + private List<Integer> elementNo; + + /** + * This property is used for short-circuiting evaluation of the elementNo list + * order. + */ + private boolean useList; + + /** + * Marks whether the rule is a member of a disjunctive set (in case of OR + * operation on phraserefs). 
+ **/ + private boolean isMemberOfDisjunctiveSet; + + /** + * @param id + * Id of the Rule + * @param language + * Language of the Rule + * @param elements + * Element (token) list + * @param description + * Description to be shown (name) + * @param message + * Message to be displayed to the user + */ + + public PatternRule(final String id, final Language language, + final List<Element> elements, final String description, + final String message, final String shortMessage) { + super(id, description, language, elements, false); + if (id == null) { + throw new NullPointerException("id cannot be null"); + } + if (language == null) { + throw new NullPointerException("language cannot be null"); + } + if (elements == null) { + throw new NullPointerException("elements cannot be null"); + } + if (description == null) { + throw new NullPointerException("description cannot be null"); + } + + this.message = message; + this.shortMessage = shortMessage; + this.elementNo = new ArrayList<Integer>(); + String prevName = ""; + String curName = ""; + int cnt = 0; + int loopCnt = 0; + for (final Element e : patternElements) { + if (e.isPartOfPhrase()) { + curName = e.getPhraseName(); + if (prevName.equals(curName) || StringTools.isEmpty(prevName)) { + cnt++; + useList = true; + } else { + elementNo.add(cnt); + prevName = ""; + curName = ""; + cnt = 0; + } + prevName = curName; + loopCnt++; + if (loopCnt == patternElements.size() && !StringTools.isEmpty(prevName)) { + elementNo.add(cnt); + } + } else { + if (cnt > 0) { + elementNo.add(cnt); + } + elementNo.add(1); + loopCnt++; + } + } + } + + public PatternRule(final String id, final Language language, + final List<Element> elements, final String description, + final String message, final String shortMessage, final boolean isMember) { + this(id, language, elements, description, message, shortMessage); + this.isMemberOfDisjunctiveSet = isMember; + } + + public final String getSubId() { + return subId; + } + + public final void 
setSubId(final String subId) { + this.subId = subId; + } + + public final String getMessage() { + return message; + } + + /** + * Used for testing rules: only one of the set can match. + * + * @return Whether the rule can non-match (as a member of disjunctive set of + * rules generated by phraseref in includephrases element). + */ + public final boolean isWithComplexPhrase() { + return isMemberOfDisjunctiveSet; + } + + /** Reset complex status - used for testing. **/ + public final void notComplexPhrase() { + isMemberOfDisjunctiveSet = false; + } + + /** + * Return the pattern as a string. + * + * @since 0.9.2 + */ + public final String toPatternString() { + final List<String> strList = new ArrayList<String>(); + for (Element patternElement : patternElements) { + strList.add(patternElement.toString()); + } + return StringTools.listToString(strList, ", "); + } + + /** + * Return the pattern as an XML string. FIXME: this is not complete, information might be lost! + * + * @since 0.9.3 + */ + public final String toXML() { + final StringBuilder sb = new StringBuilder(); + sb.append("<rule id=\""); + sb.append(StringTools.escapeXML(getId())); + sb.append("\" name=\""); + sb.append(StringTools.escapeXML(getDescription())); + sb.append("\">\n"); + sb.append("<pattern mark_from=\""); + sb.append(startPositionCorrection); + sb.append("\" mark_to=\""); + sb.append(endPositionCorrection); + sb.append('"'); + // for now, case sensitivity is per pattern, not per element, + // so just use the setting of the first element: + if (!patternElements.isEmpty() && patternElements.get(0).getCaseSensitive()) { + sb.append(" case_sensitive=\"yes\""); + } + sb.append(">\n"); + for (Element patternElement : patternElements) { + sb.append("<token"); + if (patternElement.getNegation()) { + sb.append(" negate=\"yes\""); + } + if (patternElement.isRegularExpression()) { + sb.append(" regexp=\"yes\""); + } + if (patternElement.getPOStag() != null) { + sb.append(" postag=\""); + 
sb.append(patternElement.getPOStag()); + sb.append('"'); + } + if (patternElement.getPOSNegation()) { + sb.append(" negate_pos=\"yes\""); + } + if (patternElement.isInflected()) { + sb.append(" inflected=\"yes\""); + } + sb.append('>'); + if (patternElement.getString() != null) { + sb.append(StringTools.escapeXML(patternElement.getString())); + } else { + // TODO + } + sb.append("</token>\n"); + } + sb.append("</pattern>\n"); + sb.append("<message>"); + sb.append(StringTools.escapeXML(message)); + sb.append("</message>\n"); + if (getIncorrectExamples() != null) { + for (IncorrectExample example : getIncorrectExamples()) { + sb.append("<example type=\"incorrect\">"); + sb.append(StringTools.escapeXML(example.getExample())); + sb.append("</example>\n"); + } + } + if (getCorrectExamples() != null) { + for (String example : getCorrectExamples()) { + sb.append("<example type=\"correct\">"); + sb.append(StringTools.escapeXML(example)); + sb.append("</example>\n"); + } + } + sb.append("</rule>"); + return sb.toString(); + } + + /* Replaces the message text stored in this rule. */ public final void setMessage(final String message) { + this.message = message; + } + + /* Tries the pattern at successive start positions of the whitespace-free token array; every start position at which all pattern elements match is handed to createRuleMatch, and the non-null results are returned as an array (empty when nothing matched). */ @Override + public final RuleMatch[] match(final AnalyzedSentence text) + throws IOException { + final List<RuleMatch> ruleMatches = new ArrayList<RuleMatch>(); + final AnalyzedTokenReadings[] tokens = text.getTokensWithoutWhitespace(); + final int[] tokenPositions = new int[tokens.length + 1]; + final int patternSize = patternElements.size(); + final int limit = Math.max(0, tokens.length - patternSize + 1); + Element elem = null; + int i = 0; + /* when sentStart is set, the loop body runs for start position 0 only */ while (i < limit && !(sentStart && i > 0)) { + boolean allElementsMatch = false; + int firstMatchToken = -1; + int lastMatchToken = -1; + int matchingTokens = 0; + int prevSkipNext = 0; + // this variable keeps the total number + // of tokens skipped + int skipShiftTotal = 0; + if (testUnification) { + unifier.reset(); + } + for (int k = 0; k < patternSize; k++) { + final Element prevElement = elem; + elem = 
patternElements.get(k); + setupRef(firstMatchToken, elem, tokens); + final int nextPos = i + k + skipShiftTotal; + prevMatched = false; + if (prevSkipNext + nextPos >= tokens.length || prevSkipNext < 0) { // SENT_END? + prevSkipNext = tokens.length - (nextPos + 1); + } + final int maxTok = Math.min(nextPos + prevSkipNext, tokens.length - (patternSize - k)); + /* try the current element at nextPos and at the following tokens up to maxTok; the first token that matches wins */ for (int m = nextPos; m <= maxTok; m++) { + allElementsMatch = testAllReadings(tokens, elem, prevElement, m, + firstMatchToken, prevSkipNext); + if (allElementsMatch) { + lastMatchToken = m; + final int skipShift = lastMatchToken - nextPos; + tokenPositions[matchingTokens] = skipShift + 1; + prevSkipNext = translateElementNo(elem.getSkipNext()); + matchingTokens++; + skipShiftTotal += skipShift; + if (firstMatchToken == -1) { + firstMatchToken = lastMatchToken; + } + break; + } + } + if (!allElementsMatch) { + break; + } + } + + if (allElementsMatch && matchingTokens == patternSize) { + final RuleMatch rM = createRuleMatch(tokenPositions, tokens, + firstMatchToken, lastMatchToken, matchingTokens); + if (rM != null) { + ruleMatches.add(rM); + } + } + i++; + } + return ruleMatches.toArray(new RuleMatch[ruleMatches.size()]); + } + + /* Builds the RuleMatch for one complete pattern match: formats the message with formatMatches, shifts the first and last matched token according to startPositionCorrection and endPositionCorrection, and returns null when the resulting character range is empty (fromPos is not smaller than toPos). */ private RuleMatch createRuleMatch(final int[] tokenPositions, + final AnalyzedTokenReadings[] tokens, final int firstMatchToken, + final int lastMatchToken, final int matchingTokens) throws IOException { + final String errMessage = formatMatches(tokens, tokenPositions, + firstMatchToken, message); + int correctedStPos = 0; + if (startPositionCorrection > 0) { + for (int l = 0; l <= startPositionCorrection; l++) { + correctedStPos += tokenPositions[l]; + } + correctedStPos--; + } + int correctedEndPos = 0; + if (endPositionCorrection < 0) { + int l = 0; + while (l > endPositionCorrection) { + correctedEndPos -= tokenPositions[matchingTokens + l - 1]; + l--; + } + } + AnalyzedTokenReadings firstMatchTokenObj = tokens[firstMatchToken + + correctedStPos]; + boolean startsWithUppercase = 
StringTools + .startsWithUppercase(firstMatchTokenObj.getToken()) + && !matchConvertsCase(); + + if (firstMatchTokenObj.isSentStart() + && tokens.length > firstMatchToken + correctedStPos + 1) { + // make uppercasing work also at sentence start: + firstMatchTokenObj = tokens[firstMatchToken + correctedStPos + 1]; + startsWithUppercase = StringTools.startsWithUppercase(firstMatchTokenObj + .getToken()); + } + int fromPos = tokens[firstMatchToken + correctedStPos].getStartPos(); + // FIXME: this is fishy, assumes that comma should always come before + // whitespace + /* when the formatted message has a suggestion that starts with a comma, the match is extended back to the end of the preceding token */ if (errMessage.contains(SUGG_TAG + ",") + && firstMatchToken + correctedStPos >= 1) { + fromPos = tokens[firstMatchToken + correctedStPos - 1].getStartPos() + + tokens[firstMatchToken + correctedStPos - 1].getToken().length(); + } + + final int toPos = tokens[lastMatchToken + correctedEndPos].getStartPos() + + tokens[lastMatchToken + correctedEndPos].getToken().length(); + if (fromPos < toPos) { // this can happen with some skip="-1" when the last + // token is not matched + return new RuleMatch(this, fromPos, toPos, + errMessage, shortMessage, startsWithUppercase); + } // failed to create any rule match... + return null; + } + + /** + * Checks if the suggestion starts with a match that is supposed to convert + * case. If it does, stop the default conversion to uppercase. + * + * @return true, if the match converts the case of the token. 
+ */ + private boolean matchConvertsCase() { + if (suggestionMatches != null && !suggestionMatches.isEmpty()) { + /* index of the first character after the first SUGG_TAG in the message */ final int sugStart = message.indexOf(SUGG_TAG) + SUGG_TAG.length(); + for (Match sMatch : suggestionMatches) { + if (!sMatch.isInMessageOnly() && sMatch.convertsCase() + && message.charAt(sugStart) == '\\') { + return true; + } + } + } + return false; + } + + /* Appends a suggestion match; the list is created lazily on first use. */ public final void addSuggestionMatch(final Match m) { + if (suggestionMatches == null) { + suggestionMatches = new ArrayList<Match>(); + } + suggestionMatches.add(m); + } + + /** + * Gets the index of the element indexed by i, adding any offsets because of + * the phrases in the rule. + * + * @param i + * Current element index. + * @return i unchanged when useList is false or i is negative; otherwise the + * sum of the first i entries of elementNo. + */ + private int translateElementNo(final int i) { + if (!useList || i < 0) { + return i; + } + int j = 0; + for (int k = 0; k < i; k++) { + j += elementNo.get(k); + } + return j; + } + + /** + * Returns the length of the phrase referenced by the token in the rule, + * as stored in elementNo (presumably counted in tokens; confirm where + * elementNo is filled). + * + * @param i + * The index of the token. + * @return 1 when useList is false or i lies beyond the last entry of + * elementNo; otherwise the entry of elementNo at index i. + **/ + private int phraseLen(final int i) { + if (!useList || i > (elementNo.size() - 1)) { + return 1; + } + return elementNo.get(i); + } + + /** + * Creates a Cartesian product of the arrays stored in the input array. + * + * @param input + * Array of string arrays to combine. + * @param output + * Work array of strings. + * @param r + * Starting parameter (use 0 to get all combinations). + * @param lang + * Text language for adding spaces in some languages. + * @return Combined array of @String. 
+ */ + private static String[] combineLists(final String[][] input, + final String[] output, final int r, final Language lang) { + final List<String> outputList = new ArrayList<String>(); + if (r == input.length) { + /* every slot of output has been filled: join the slots into one string, separated by whatever StringTools.addSpace returns for the following slot */ final StringBuilder sb = new StringBuilder(); + for (int k = 0; k < output.length; k++) { + sb.append(output[k]); + if (k < output.length - 1) { + sb.append(StringTools.addSpace(output[k + 1], lang)); + } + } + outputList.add(sb.toString()); + } else { + /* put each alternative of input[r] into slot r and recurse on the remaining slots */ for (int c = 0; c < input[r].length; c++) { + output[r] = input[r][c]; + final String[] sList = combineLists(input, output, r + 1, lang); + outputList.addAll(Arrays.asList(sList)); + } + } + return outputList.toArray(new String[outputList.size()]); + } + + /** + * Concatenates the matches, and takes care of phrases (including inflection + * using synthesis). + * + * @param start + * Index into suggestionMatches of the match to apply (the position + * of the element as referenced by match element in the rule). + * @param index + * The index of the element found in the matching sentence; passed + * to phraseLen to decide between the single-token and the phrase + * case. + * @param tokenIndex + * The position of the token in the AnalyzedTokenReadings array. + * @param tokens + * Array of @AnalyzedTokenReadings + * @param nextTokenPos + * Token position used to derive the number of skipped tokens + * handed to the match (nextTokenPos minus the token position). + * @return @String[] Array of concatenated strings, or null when + * suggestionMatches holds no match at index start + * @throws IOException + * in case disk operations (used in synthesizer) go wrong. 
+ */ + private String[] concatMatches(final int start, final int index, + final int tokenIndex, final AnalyzedTokenReadings[] tokens, + final int nextTokenPos) + throws IOException { + String[] finalMatch = null; + if (suggestionMatches.get(start) != null) { + final int len = phraseLen(index); + if (len == 1) { + /* single token: let the match synthesize its final strings directly */ final int skippedTokens = nextTokenPos - tokenIndex; + suggestionMatches.get(start).setToken(tokens, tokenIndex - 1, skippedTokens); + suggestionMatches.get(start).setSynthesizer(language.getSynthesizer()); + finalMatch = suggestionMatches.get(start).toFinalString(); + } else { + /* phrase: collect the final strings of each of the len tokens, then combine all alternatives with combineLists */ final List<String[]> matchList = new ArrayList<String[]>(); + for (int i = 0; i < len; i++) { + final int skippedTokens = nextTokenPos - (tokenIndex + i); + suggestionMatches.get(start).setToken(tokens, tokenIndex - 1 + i, skippedTokens); + suggestionMatches.get(start) + .setSynthesizer(language.getSynthesizer()); + matchList.add(suggestionMatches.get(start).toFinalString()); + } + return combineLists(matchList.toArray(new String[matchList.size()][]), + new String[matchList.size()], 0, language); + } + } + return finalMatch; + } + + /** + * Replace back references generated with <match> and \\1 in message + * using Match class, and take care of skipping. * + * + * @param tokenReadings + * Array of AnalyzedTokenReadings that were matched against the + * pattern + * @param positions + * Array of relative positions of matched tokens + * @param firstMatchTok + * Position of the first matched token + * @param errorMsg + * String containing suggestion markup + * @return String Formatted message. 
+ * @throws IOException + * + **/ + private String formatMatches(final AnalyzedTokenReadings[] tokenReadings, + final int[] positions, final int firstMatchTok, final String errorMsg) + throws IOException { + String errorMessage = errorMsg; + int matchCounter = 0; + final int[] numbersToMatches = new int[errorMsg.length()]; + boolean newWay = false; + int errLen = errorMessage.length(); + int errMarker = errorMessage.indexOf('\\'); + boolean numberFollows = false; + if (errMarker > 0 && errMarker < errLen - 1) { + numberFollows = StringTools.isPositiveNumber(errorMessage + .charAt(errMarker + 1)); + } + /* NOTE(review): each round looks only at the first backslash of the message, so a backslash at index 0, or a first backslash that is not followed by a digit, ends the replacement loop */ while (errMarker > 0 && numberFollows) { + final int ind = errorMessage.indexOf('\\'); + if (ind > 0 && StringTools.isPositiveNumber(errorMessage.charAt(ind + 1))) { + /* numLen ends up as 1 (the backslash) plus the number of digits that follow it */ int numLen = 1; + while (ind + numLen < errorMessage.length() + && StringTools.isPositiveNumber(errorMessage.charAt(ind + numLen))) { + numLen++; + } + /* j is the zero-based index of the referenced pattern element */ final int j = Integer.parseInt(errorMessage.substring(ind + 1, ind + + numLen)) - 1; + int repTokenPos = 0; + int nextTokenPos = 0; + for (int l = 0; l <= j; l++) { + repTokenPos += positions[l]; + } + /* NOTE(review): this guard still admits j + 1 == positions.length, for which positions[j + 1] would be out of range - confirm the intended bound */ if (j <= positions.length) { + nextTokenPos = firstMatchTok + repTokenPos + positions[j + 1]; + } + if (suggestionMatches != null) { + if (matchCounter < suggestionMatches.size()) { + numbersToMatches[j] = matchCounter; + if (suggestionMatches.get(matchCounter) != null) { + final String[] matches = concatMatches(matchCounter, j, + firstMatchTok + repTokenPos, tokenReadings, nextTokenPos); + final String leftSide = errorMessage.substring(0, ind); + final String rightSide = errorMessage.substring(ind + numLen); + if (matches.length == 1) { + errorMessage = leftSide + matches[0] + rightSide; + } else { + errorMessage = formatMultipleSynthesis(matches, leftSide, + rightSide); + } + matchCounter++; + newWay = true; + } + } else { + // FIXME: is this correct? 
this is how we deal with multiple matches + suggestionMatches.add(suggestionMatches.get(numbersToMatches[j])); + } + } + + if (!newWay) { + // in case <match> elements weren't used (yet) + errorMessage = errorMessage.replace("\\" + (j + 1), + tokenReadings[firstMatchTok + repTokenPos - 1].getToken()); + } + } + errMarker = errorMessage.indexOf('\\'); + numberFollows = false; + errLen = errorMessage.length(); + if (errMarker > 0 && errMarker < errLen - 1) { + numberFollows = StringTools.isPositiveNumber(errorMessage + .charAt(errMarker + 1)); + } + } + return errorMessage; + } + + /* Builds the message for several alternative match strings: the text after the last SUGG_TAG of leftSide and the text before the first END_SUGG_TAG of rightSide are repeated around every entry of matches, and consecutive entries are separated by END_SUGG_TAG, a comma and SUGG_TAG when the first END_SUGG_TAG of leftSide lies before its last SUGG_TAG. */ private static String formatMultipleSynthesis(final String[] matches, + final String leftSide, final String rightSide) { + String errorMessage = ""; + String suggestionLeft = ""; + String suggestionRight = ""; + String rightSideNew = rightSide; + final int sPos = leftSide.lastIndexOf(SUGG_TAG); + if (sPos > 0) { + suggestionLeft = leftSide.substring(sPos + SUGG_TAG.length()); + } + if (StringTools.isEmpty(suggestionLeft)) { + errorMessage = leftSide; + } else { + errorMessage = leftSide.substring(0, leftSide.lastIndexOf(SUGG_TAG)) + + SUGG_TAG; + } + final int rPos = rightSide.indexOf(END_SUGG_TAG); + if (rPos > 0) { + suggestionRight = rightSide.substring(0, rPos); + } + if (!StringTools.isEmpty(suggestionRight)) { + rightSideNew = rightSide.substring(rightSide.indexOf(END_SUGG_TAG)); + } + /* NOTE(review): despite its name this is the FIRST END_SUGG_TAG of leftSide (indexOf, not lastIndexOf) - confirm that is intended */ final int lastLeftSugEnd = leftSide.indexOf(END_SUGG_TAG); + final int lastLeftSugStart = leftSide.lastIndexOf(SUGG_TAG); + final StringBuilder sb = new StringBuilder(); + sb.append(errorMessage); + for (int z = 0; z < matches.length; z++) { + sb.append(suggestionLeft); + sb.append(matches[z]); + sb.append(suggestionRight); + if ((z < matches.length - 1) && lastLeftSugEnd < lastLeftSugStart) { + sb.append(END_SUGG_TAG); + sb.append(", "); + sb.append(SUGG_TAG); + } + } + sb.append(rightSideNew); + return sb.toString(); + } + + /** + * For testing only. 
+ */ + /* Hands out the internal patternElements list itself; no copy is made. */ public final List<Element> getElements() { + return patternElements; + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/PatternRuleLoader.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/PatternRuleLoader.java new file mode 100644 index 0000000..8156a6e --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/PatternRuleLoader.java @@ -0,0 +1,369 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.rules.patterns; + +import java.io.IOException; +import java.io.InputStream; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; + +import javax.xml.parsers.SAXParser; +import javax.xml.parsers.SAXParserFactory; + +import org.xml.sax.Attributes; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.DefaultHandler; + +import de.danielnaber.languagetool.JLanguageTool; +import de.danielnaber.languagetool.Language; +import de.danielnaber.languagetool.rules.Category; +import de.danielnaber.languagetool.rules.IncorrectExample; + +/** + * Loads {@link PatternRule}s from an XML file. 
+ * + * @author Daniel Naber + */ +public class PatternRuleLoader extends DefaultHandler { + + /* Parses the rule XML from the stream with a SAX parser (loading of external DTDs is switched off) and returns the rules collected by a PatternRuleHandler; any exception is rethrown as an IOException that names the file and carries the original exception as its cause. */ public final List<PatternRule> getRules(final InputStream is, + final String filename) throws IOException { + try { + final PatternRuleHandler handler = new PatternRuleHandler(); + final SAXParserFactory factory = SAXParserFactory.newInstance(); + final SAXParser saxParser = factory.newSAXParser(); + saxParser.getXMLReader().setFeature( + "http://apache.org/xml/features/nonvalidating/load-external-dtd", + false); + saxParser.parse(is, handler); + return handler.getRules(); + } catch (final Exception e) { + final IOException ioe = new IOException("Cannot load or parse '" + + filename + "'"); + ioe.initCause(e); + throw ioe; + } + } + + /** Testing only. Note that this is declared as an instance method (it is not static), so the JVM will not use it as a program entry point. */ + public final void main(final String[] args) throws IOException { + final PatternRuleLoader prg = new PatternRuleLoader(); + final String name = "/de/grammar.xml"; + final List<PatternRule> l = prg.getRules(JLanguageTool.getDataBroker().getFromRulesDirAsStream(name), name); + System.out.println(l); + } + +} + +/* SAX handler that turns the elements of a rule XML file into PatternRule objects, one or more per rule element. */ class PatternRuleHandler extends XMLRuleHandler { + + private int subId; + + private boolean defaultOff; + private boolean defaultOn; + + private Category category; + private String description; + private String ruleGroupDescription; + + // =========================================================== + // SAX DocumentHandler methods + // =========================================================== + + @Override + public void startElement(final String namespaceURI, final String lName, + final String qName, final Attributes attrs) throws SAXException { + if ("category".equals(qName)) { + final String catName = attrs.getValue("name"); + final String priorityStr = attrs.getValue("priority"); + // int prio = 0; + if (priorityStr == null) { + category = new Category(catName); + } else { + category = new Category(catName, Integer.parseInt(priorityStr)); + } + + if ("off".equals(attrs.getValue(DEFAULT))) { + 
category.setDefaultOff(); + } + + } else if ("rules".equals(qName)) { + final String languageStr = attrs.getValue("lang"); + language = Language.getLanguageForShortName(languageStr); + if (language == null) { + throw new SAXException("Unknown language '" + languageStr + "'"); + } + } else if ("rule".equals(qName)) { + id = attrs.getValue("id"); + if (inRuleGroup) { + subId++; + } + /* inside a rule group an already set defaultOff or defaultOn flag is kept; otherwise the flag is taken from the rule's own attribute */ if (!(inRuleGroup && defaultOff)) { + defaultOff = "off".equals(attrs.getValue(DEFAULT)); + } + + if (!(inRuleGroup && defaultOn)) { + defaultOn = "on".equals(attrs.getValue(DEFAULT)); + } + /* a rule inside a group that has no id or name of its own inherits the group's */ if (inRuleGroup && id == null) { + id = ruleGroupId; + } + description = attrs.getValue("name"); + if (inRuleGroup && description == null) { + description = ruleGroupDescription; + } + correctExamples = new ArrayList<String>(); + incorrectExamples = new ArrayList<IncorrectExample>(); + if (suggestionMatches != null) { + suggestionMatches.clear(); + } + } else if (PATTERN.equals(qName)) { + startPattern(attrs); + } else if (AND.equals(qName)) { + inAndGroup = true; + } else if ("unify".equals(qName)) { + inUnification = true; + uniNegation = YES.equals(attrs.getValue(NEGATE)); + } else if ("feature".equals(qName)) { + uFeature = attrs.getValue("id"); + } else if (qName.equals(TYPE)) { + uType = attrs.getValue("id"); + uTypeList.add(uType); + } else if (qName.equals(TOKEN)) { + setToken(attrs); + } else if (EXCEPTION.equals(qName)) { + setExceptions(attrs); + /* NOTE(review): for an example element without the TYPE attribute, attrs.getValue(TYPE) is null and the equals call below would throw a NullPointerException - confirm the schema makes the attribute mandatory */ } else if (qName.equals(EXAMPLE) + && attrs.getValue(TYPE).equals("correct")) { + inCorrectExample = true; + correctExample = new StringBuilder(); + } else if (qName.equals(EXAMPLE) + && attrs.getValue(TYPE).equals("incorrect")) { + inIncorrectExample = true; + incorrectExample = new StringBuilder(); + exampleCorrection = new StringBuilder(); + if (attrs.getValue("correction") != null) { + exampleCorrection.append(attrs.getValue("correction")); + } + } else if ("message".equals(qName)) { + inMessage = true; + inSuggestion = false; + message = new 
StringBuilder(); + } else if ("short".equals(qName)) { + inShortMessage = true; + shortMessage = new StringBuilder(); + } else if ("rulegroup".equals(qName)) { + ruleGroupId = attrs.getValue("id"); + ruleGroupDescription = attrs.getValue("name"); + defaultOff = "off".equals(attrs.getValue(DEFAULT)); + defaultOn = "on".equals(attrs.getValue(DEFAULT)); + inRuleGroup = true; + subId = 0; + } else if ("suggestion".equals(qName) && inMessage) { + message.append("<suggestion>"); + inSuggestion = true; + } else if ("match".equals(qName)) { + setMatchElement(attrs); + } else if (qName.equals(MARKER) && inCorrectExample) { + correctExample.append("<marker>"); + } else if (qName.equals(MARKER) && inIncorrectExample) { + incorrectExample.append("<marker>"); + } else if (UNIFICATION.equals(qName)) { + uFeature = attrs.getValue("feature"); + inUnificationDef = true; + } else if ("equivalence".equals(qName)) { + uType = attrs.getValue(TYPE); + } else if (PHRASES.equals(qName)) { + inPhrases = true; + } else if ("includephrases".equals(qName)) { + phraseElementInit(); + } else if ("phrase".equals(qName) && inPhrases) { + phraseId = attrs.getValue("id"); + } else if ("phraseref".equals(qName) && (attrs.getValue("idref") != null)) { + preparePhrase(attrs); + } + } + + /* Finishes the element that just closed. When a rule element closes, a single PatternRule is built from elementList if phraseElementList is empty; otherwise one PatternRule is built per entry of phraseElementList, after elementList has been appended to each entry. */ @Override + public void endElement(final String namespaceURI, final String sName, + final String qName) throws SAXException { + if ("rule".equals(qName)) { + phraseElementInit(); + if (phraseElementList.isEmpty()) { + final PatternRule rule = new PatternRule(id, language, elementList, + description, message.toString(), shortMessage.toString()); + prepareRule(rule); + rules.add(rule); + } else { + if (!elementList.isEmpty()) { + for (final ArrayList<Element> ph : phraseElementList) { + ph.addAll(new ArrayList<Element>(elementList)); + } + } + + for (final ArrayList<Element> phraseElement : phraseElementList) { + processElement(phraseElement); + final PatternRule rule = new PatternRule(id, language, phraseElement, + 
description, message.toString(), shortMessage.toString(), + phraseElementList.size() > 1); + prepareRule(rule); + rules.add(rule); + } + } + elementList.clear(); + /* NOTE(review): phraseElementList was already dereferenced earlier in this branch, so this null check looks redundant - confirm */ if (phraseElementList != null) { + phraseElementList.clear(); + } + + } else if (qName.equals(EXCEPTION)) { + finalizeExceptions(); + } else if (qName.equals(AND)) { + inAndGroup = false; + andGroupCounter = 0; + tokenCounter++; + } else if (qName.equals(TOKEN)) { + finalizeTokens(); + } else if (qName.equals(PATTERN)) { + checkMarkPositions(); + inPattern = false; + if (lastPhrase) { + elementList.clear(); + } + if (phraseElementList == null || phraseElementList.isEmpty()) { + checkPositions(0); + } else { + for (List<Element> elements : phraseElementList) { + checkPositions(elements.size()); + } + } + tokenCounter = 0; + } else if (qName.equals(EXAMPLE)) { + if (inCorrectExample) { + correctExamples.add(correctExample.toString()); + } else if (inIncorrectExample) { + IncorrectExample example = null; + /* the correction attribute lists its alternatives separated by the pipe character */ final String[] corrections = exampleCorrection.toString().split("\\|"); + if (corrections.length > 0 && corrections[0].length() > 0) { + example = new IncorrectExample(incorrectExample.toString(), + corrections); + } else { + example = new IncorrectExample(incorrectExample.toString()); + } + incorrectExamples.add(example); + } + inCorrectExample = false; + inIncorrectExample = false; + correctExample = new StringBuilder(); + incorrectExample = new StringBuilder(); + exampleCorrection = new StringBuilder(); + } else if ("message".equals(qName)) { + suggestionMatches = addLegacyMatches(); + inMessage = false; + } else if ("short".equals(qName)) { + inShortMessage = false; + } else if ("match".equals(qName)) { + if (inMessage) { + suggestionMatches.get(suggestionMatches.size() - 1).setLemmaString( + match.toString()); + } else if (inToken) { + tokenReference.setLemmaString(match.toString()); + } + inMatch = false; + } else if ("rulegroup".equals(qName)) { + inRuleGroup = false; + } else if 
("suggestion".equals(qName) && inMessage) { + message.append("</suggestion>"); + inSuggestion = false; + } else if (qName.equals(MARKER) && inCorrectExample) { + correctExample.append("</marker>"); + } else if (qName.equals(MARKER) && inIncorrectExample) { + incorrectExample.append("</marker>"); + } else if ("phrase".equals(qName) && inPhrases) { + finalizePhrase(); + } else if ("includephrases".equals(qName)) { + elementList.clear(); + } else if (PHRASES.equals(qName) && inPhrases) { + inPhrases = false; + } else if (UNIFICATION.equals(qName)) { + inUnificationDef = false; + } else if ("feature".equals(qName)) { + equivalenceFeatures.put(uFeature, uTypeList); + uTypeList = new ArrayList<String>(); + } else if ("unify".equals(qName)) { + inUnification = false; + //clear the features... + equivalenceFeatures = new HashMap<String, List<String>>(); + } + } + + /* Transfers the state collected for the current rule (position corrections, examples, category, sub-id, suggestion matches and default-off status) to the rule and resets the position corrections and the caseSensitive flag for the next rule. */ private void prepareRule(final PatternRule rule) { + rule.setStartPositionCorrection(startPositionCorrection); + rule.setEndPositionCorrection(endPositionCorrection); + startPositionCorrection = 0; + endPositionCorrection = 0; + rule.setCorrectExamples(correctExamples); + rule.setIncorrectExamples(incorrectExamples); + rule.setCategory(category); + if (inRuleGroup) { + rule.setSubId(Integer.toString(subId)); + } + else { + rule.setSubId("1"); + } + caseSensitive = false; + if (suggestionMatches != null) { + for (final Match m : suggestionMatches) { + rule.addSuggestionMatch(m); + } + /* the matches are kept while more than one phrase variant of the same rule is being built */ if (phraseElementList.size() <= 1) { + suggestionMatches.clear(); + } + } + if (defaultOff) { + rule.setDefaultOff(); + } + + /* a rule in a default-off category is switched off unless it is explicitly marked as on */ if (category.isDefaultOff() && !defaultOn) { + rule.setDefaultOff(); + } + + } + + /* Appends the character data to exactly one buffer, chosen by the first state flag that is set in the order tested below. */ @Override + public void characters(final char[] buf, final int offset, final int len) { + final String s = new String(buf, offset, len); + if (inException) { + exceptions.append(s); + } else if (inToken) { + elements.append(s); + } else if (inCorrectExample) { + correctExample.append(s); + } else if (inIncorrectExample) { + 
incorrectExample.append(s); + } else if (inMatch) { + match.append(s); + } else if (inMessage) { + message.append(s); + } else if (inShortMessage) { + shortMessage.append(s); + } + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/Unifier.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/Unifier.java new file mode 100644 index 0000000..7fbb35d --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/Unifier.java @@ -0,0 +1,432 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2006 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ + +package de.danielnaber.languagetool.rules.patterns; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import de.danielnaber.languagetool.AnalyzedToken; +import de.danielnaber.languagetool.AnalyzedTokenReadings; + +/** + * Implements unification of features over tokens. + * + * @author Marcin Milkowski + */ +public class Unifier { + + //TODO: add a possibility to negate some features but not all + /** + * Negates the meaning of unification just like negation in Element tokens. 
+ */ + private boolean negation; + + /* set by startUnify() and cleared by reset(); while true, isSatisfied delegates to checkNext */ private boolean allFeatsIn; + + /* index of the most recent reading recorded by isSatisfied before startUnify(); starts at -1 */ private int tokCnt; + + /* index into tokSequence of the token currently being collected by checkNext; starts at 1 and is advanced by startNextToken() */ private int readingsCounter; + + /* the tokens collected so far, each holding only the readings that were accepted */ private final List<AnalyzedTokenReadings> tokSequence; + + /** + * Maps each (feature, type) pair, wrapped in an EquivalenceTypeLocator, to + * the Element that defines that equivalence type. Filled by + * setEquivalence. + */ + private final Map<EquivalenceTypeLocator, Element> equivalenceTypes; + + /** + * Maps each feature name to the list of all equivalence type names that + * were registered for it through setEquivalence. + */ + private final Map<String, List<String>> equivalenceFeatures; + + /** + * List, indexed by tokCnt, of maps from a feature name to the set of + * equivalence type names that matched for that entry. + */ + private final List<Map<String, Set<String>>> equivalencesMatched; + + /** + * Marks found interpretations in subsequent tokens. + */ + private List<Boolean> featuresFound; + + /** + * For checking the current token. + */ + private List<Boolean> tmpFeaturesFound; + + /** + * Internal flag for checking whether the first token in tokSequence has to be + * yet unified. + */ + private boolean firstUnified; + + private boolean inUnification; + private boolean uniMatched; + private boolean uniAllMatched; + private AnalyzedTokenReadings[] unifiedTokens; + + /** + * Instantiates the unifier with empty equivalence definitions and the + * same counter values that reset() restores. + */ + public Unifier() { + tokCnt = -1; + readingsCounter = 1; + equivalencesMatched = new ArrayList<Map<String, Set<String>>>(); + equivalenceTypes = new HashMap<EquivalenceTypeLocator, Element>(); + equivalenceFeatures = new HashMap<String, List<String>>(); + featuresFound = new ArrayList<Boolean>(); + tmpFeaturesFound = new ArrayList<Boolean>(); + tokSequence = new ArrayList<AnalyzedTokenReadings>(); + } + + /** + * Prepares equivalence types for features to be tested. All equivalence types + * are given as {@link Element}s. They create an equivalence set (with + * abstraction). + * + * @param feature + * Feature to be tested, like gender, grammatical case or number. 
+ * @param type + * Type of equivalence for the feature, for example plural, first + * person, genitive. + * @param elem + * Element specifying the equivalence. + */ + public final void setEquivalence(final String feature, final String type, + final Element elem) { + /* a (feature, type) pair that is already registered is left untouched */ if (equivalenceTypes.containsKey(new EquivalenceTypeLocator(feature, type))) { + return; + } + equivalenceTypes.put(new EquivalenceTypeLocator(feature, type), elem); + List<String> lTypes; + if (equivalenceFeatures.containsKey(feature)) { + lTypes = equivalenceFeatures.get(feature); + } else { + lTypes = new ArrayList<String>(); + } + lTypes.add(type); + equivalenceFeatures.put(feature, lTypes); + } + + /** + * Tests if a token has shared features with other tokens. + * + * @param aToken + * - token to be tested + * @param uFeatures + * - map from each feature to be tested to the equivalence type + * names to test for it; a null or empty list stands for all types + * registered for that feature + * @return the unification result combined by exclusive-or with the negation + * flag; plain false when uFeatures is null, when an equivalence + * type is not registered, or when unification has started but no + * equivalences were recorded + */ + protected final boolean isSatisfied(final AnalyzedToken aToken, + final Map<String, List<String>> uFeatures) { + + if (allFeatsIn && equivalencesMatched.isEmpty()) { + return false; + } + // Error: no feature given! + if (uFeatures == null) { + return false; // throw exception?? 
+ } + boolean unified = true; + List<String> types; + + if (allFeatsIn) { + unified &= checkNext(aToken, uFeatures); + } else { + /* first token: record, under a fresh index tokCnt, which equivalence types this reading matches for every feature */ tokCnt++; + while (equivalencesMatched.size() <= tokCnt) { + equivalencesMatched.add(new HashMap<String, Set<String>>()); + } + for (final Map.Entry<String, List<String>> feat : uFeatures.entrySet()) { + types = feat.getValue(); + if (types == null || types.isEmpty()) { + types = equivalenceFeatures.get(feat.getKey()); + } + for (final String typename : types) { + final Element testElem = equivalenceTypes + .get(new EquivalenceTypeLocator(feat.getKey(), typename)); + if (testElem == null) { + return false; + } + if (testElem.isMatched(aToken)) { + if (!equivalencesMatched.get(tokCnt).containsKey(feat.getKey())) { + final Set<String> typeSet = new HashSet<String>(); + typeSet.add(typename); + equivalencesMatched.get(tokCnt).put(feat.getKey(), typeSet); + } else { + equivalencesMatched.get(tokCnt).get(feat.getKey()).add(typename); + } + } + } + /* the reading only counts when at least one type matched for every feature */ unified &= equivalencesMatched.get(tokCnt).containsKey(feat.getKey()); + if (!unified) { + break; + } + } + if (unified) { + if (tokCnt == 0 || tokSequence.isEmpty()) { + tokSequence.add(new AnalyzedTokenReadings(aToken, 0)); + } else { + tokSequence.get(0).addReading(aToken); + } + } + } + return unified ^ negation; + } + + /* Checks aToken against the equivalences recorded for each earlier entry i (0 to tokCnt) that is still flagged in featuresFound. An entry unifies when, for every feature in uFeatures, one of its recorded types also matches aToken; the call succeeds when at least one entry unifies, and then aToken is added to tokSequence and the per-entry results become tmpFeaturesFound. */ private boolean checkNext(final AnalyzedToken aToken, + final Map<String, List<String>> uFeatures) { + boolean unifiedNext = true; + boolean anyFeatUnified = false; + List<String> types; + ArrayList<Boolean> tokenFeaturesFound = new ArrayList<Boolean>(tmpFeaturesFound); + if (allFeatsIn) { + for (int i = 0; i <= tokCnt; i++) { + boolean allFeatsUnified = true; + for (Map.Entry<String, List<String>> feat : uFeatures.entrySet()) { + boolean featUnified = false; + types = feat.getValue(); + if (types == null || types.isEmpty()) { + types = equivalenceFeatures.get(feat.getKey()); + } + for (final String typename : types) { + if (featuresFound.get(i) + && 
equivalencesMatched.get(i).containsKey(feat.getKey()) + && equivalencesMatched.get(i).get(feat.getKey()).contains(typename)) { + final Element testElem = equivalenceTypes + .get(new EquivalenceTypeLocator(feat.getKey(), typename)); + featUnified = featUnified || testElem.isMatched(aToken); + } + } + allFeatsUnified &= featUnified; + } + tokenFeaturesFound.set(i, allFeatsUnified); + anyFeatUnified = anyFeatUnified || allFeatsUnified; + } + unifiedNext &= anyFeatUnified; + if (unifiedNext) { + if (tokSequence.size() == readingsCounter) { + tokSequence.add(new AnalyzedTokenReadings(aToken, 0)); + } else { + tokSequence.get(readingsCounter).addReading(aToken); + } + tmpFeaturesFound = tokenFeaturesFound; + } + } + return unifiedNext; + } + + /** + * Call after every complete token (AnalyzedTokenReadings) checked. Copies + * tmpFeaturesFound into featuresFound and advances readingsCounter. + */ + public final void startNextToken() { + featuresFound = new ArrayList<Boolean>(tmpFeaturesFound); + readingsCounter++; + } + + /** + * Starts testing only those equivalences that were previously matched. + * Flags every entry recorded so far (0 to tokCnt) as found. + */ + public final void startUnify() { + allFeatsIn = true; + for (int i = 0; i <= tokCnt; i++) { + featuresFound.add(true); + } + tmpFeaturesFound = new ArrayList<Boolean>(featuresFound); + } + + public final void setNegation(final boolean neg) { + negation = neg; + } + + public final boolean getNegation() { + return negation; + } + + /** + * Resets after use of unification. Required. The equivalence definitions + * registered through setEquivalence are not cleared. + */ + public final void reset() { + equivalencesMatched.clear(); + allFeatsIn = false; + negation = false; + tokCnt = -1; + featuresFound.clear(); + tmpFeaturesFound.clear(); + tokSequence.clear(); + readingsCounter = 1; + firstUnified = false; + uniMatched = false; + uniAllMatched = false; + inUnification = false; + } + + /** + * Gets a full sequence of filtered tokens. + * + * @return Array of AnalyzedTokenReadings that match equivalence relation + * defined for features tested. 
+ */ + public final AnalyzedTokenReadings[] getUnifiedTokens() { + if (tokSequence.isEmpty()) { + return null; + } + if (!firstUnified) { + AnalyzedTokenReadings tmpATR; + int first = 0; + tmpFeaturesFound.add(true); // Bentley's search idea + while (!tmpFeaturesFound.get(first)) { + first++; + } + tmpFeaturesFound.remove(tmpFeaturesFound.size() - 1); + if (first >= tmpFeaturesFound.size()) { + return null; + } + // FIXME: why this happens?? + final int numRead = tokSequence.get(0).getReadingsLength(); + if (first < numRead) { + tmpATR = new AnalyzedTokenReadings(tokSequence.get(0).getAnalyzedToken( + first), 0); + for (int i = first + 1; i <= Math.min(numRead - 1, tokCnt); i++) { + if (tmpFeaturesFound.get(i)) { + tmpATR.addReading(tokSequence.get(0).getAnalyzedToken(i)); + } + } + tokSequence.set(0, tmpATR); + } + firstUnified = true; + } + final AnalyzedTokenReadings[] atr = tokSequence + .toArray(new AnalyzedTokenReadings[tokSequence.size()]); + return atr; + } + + /** + * Tests if the token sequence is unified. + * + * @param matchToken + * AnalyzedToken token to unify + * @param feature + * String: feature to unify over + * @param type + * String: value types of the feature + * @param isUniNegated + * if true, then return negated result + * @param lastReading + * true when the matchToken is the last reading in the + * AnalyzedReadings + * @return True if the tokens in the sequence are unified. 
+ */ + public final boolean isUnified(final AnalyzedToken matchToken, + final Map<String, List<String>> uFeatures, final boolean isUniNegated, + final boolean lastReading) { + if (inUnification) { + uniMatched |= isSatisfied(matchToken, uFeatures); + uniAllMatched = uniMatched; + if (lastReading) { /* token complete: snapshot the unified sequence and start the next token with a clean flag */ + startNextToken(); + unifiedTokens = getUnifiedTokens(); + uniMatched = false; + } + return uniAllMatched; + } + if (isUniNegated) { + setNegation(true); + } + /* unification has not started yet: the result of isSatisfied is not used here */ isSatisfied(matchToken, uFeatures); + if (lastReading) { + inUnification = true; + uniMatched = false; + startUnify(); + } + return true; + } + + /** + * Used for getting a unified sequence in case when simple test method + * {@link #isUnified} was used. + * + * @return An array of {@link AnalyzedTokenReadings}, or null if unification has not started + */ + public final AnalyzedTokenReadings[] getFinalUnified() { + if (inUnification) { + return unifiedTokens; + } + return null; + } +} + +/** Key made of a feature name and a type name; two locators are equal when both strings are equal (null matches only null). */ class EquivalenceTypeLocator { + + @Override + public int hashCode() { + final int prime = 31; + int result = 1; + result = prime * result + ((feature == null) ? 0 : feature.hashCode()); + result = prime * result + ((type == null) ? 0 : type.hashCode()); + return result; + } + + @Override + public boolean equals(final Object obj) { + if (this == obj) { + return true; + } + if (obj == null) { + return false; + } + if (getClass() != obj.getClass()) { + return false; + } + final EquivalenceTypeLocator other = (EquivalenceTypeLocator) obj; + if (feature == null) { + if (other.feature != null) { + return false; + } + } else if (!feature.equals(other.feature)) { + return false; + } + if (type == null) { + if (other.type != null) { + return false; + } + } else if (!type.equals(other.type)) { + return false; + } + return true; + } + + private final String feature; + private final String type; + + EquivalenceTypeLocator(final String feature, final String type) { + this.feature = feature; + this.type = type; + } + +}
\ No newline at end of file diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/XMLRuleHandler.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/XMLRuleHandler.java new file mode 100644 index 0000000..72a852a --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/XMLRuleHandler.java @@ -0,0 +1,568 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2006 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.rules.patterns; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import org.xml.sax.Attributes; +import org.xml.sax.Locator; +import org.xml.sax.SAXException; +import org.xml.sax.SAXParseException; +import org.xml.sax.helpers.DefaultHandler; + +import de.danielnaber.languagetool.Language; +import de.danielnaber.languagetool.rules.IncorrectExample; +import de.danielnaber.languagetool.tools.StringTools; + +/** + * XML rule handler that loads rules from XML and throws + * exceptions on errors and warnings. 
+ * + * @author Daniel Naber + */ +public class XMLRuleHandler extends DefaultHandler { + + public XMLRuleHandler() { + elementList = new ArrayList<Element>(); + equivalenceFeatures = new HashMap<String, List<String>>(); + uTypeList = new ArrayList<String>(); + } + + List<PatternRule> rules = new ArrayList<PatternRule>(); + + protected Language language; + + protected StringBuilder correctExample = new StringBuilder(); + protected StringBuilder incorrectExample = new StringBuilder(); + protected StringBuilder exampleCorrection = new StringBuilder(); + protected StringBuilder message = new StringBuilder(); + protected StringBuilder match = new StringBuilder(); + protected StringBuilder elements; + protected StringBuilder exceptions; + + List<String> correctExamples = new ArrayList<String>(); + List<IncorrectExample> incorrectExamples = new ArrayList<IncorrectExample>(); + + protected boolean inPattern; + protected boolean inCorrectExample; + protected boolean inIncorrectExample; + protected boolean inMessage; + protected boolean inSuggestion; + protected boolean inMatch; + protected boolean inRuleGroup; + protected boolean inToken; + protected boolean inException; + protected boolean inPhrases; + protected boolean inAndGroup; + + protected boolean tokenSpaceBefore; + protected boolean tokenSpaceBeforeSet; + protected String posToken; + protected boolean posNegation; + protected boolean posRegExp; + + protected boolean caseSensitive; + protected boolean regExpression; + protected boolean tokenNegated; + protected boolean tokenInflected; + + protected String exceptionPosToken; + protected boolean exceptionStringRegExp; + protected boolean exceptionStringNegation; + protected boolean exceptionStringInflected; + protected boolean exceptionPosNegation; + protected boolean exceptionPosRegExp; + protected boolean exceptionValidNext; + protected boolean exceptionValidPrev; + protected boolean exceptionSet; + protected boolean exceptionSpaceBefore; + protected boolean 
exceptionSpaceBeforeSet; + + /** List of elements as specified by tokens. **/ + protected List<Element> elementList; + + /** true when phraseref is the last element in the rule. **/ + protected boolean lastPhrase; + + /** ID reference to the phrase. **/ + protected String phraseIdRef; + + /** Current phrase ID. **/ + protected String phraseId; + + protected int skipPos; + + protected String ruleGroupId; + + protected String id; + + protected Element tokenElement; + + protected Match tokenReference; + + protected List<Match> suggestionMatches; + + protected Locator pLocator; + + protected int startPositionCorrection; + protected int endPositionCorrection; + protected int tokenCounter; + + /** Phrase store - elementLists keyed by phraseIds. **/ + protected Map<String, List<List<Element>>> phraseMap; + + /** + * Logically forking element list, used for including multiple phrases in the + * current one. + **/ + protected List<ArrayList<Element>> phraseElementList; + + protected int andGroupCounter; + + protected StringBuilder shortMessage = new StringBuilder(); + protected boolean inShortMessage; + + protected boolean inUnification; + protected boolean inUnificationDef; + protected boolean uniNegation; + + protected String uFeature; + protected String uType = ""; + + protected List<String> uTypeList; + + protected Map<String, List<String>> equivalenceFeatures; + + + /** Definitions of values in XML files. 
*/ + protected static final String YES = "yes"; + protected static final String POSTAG = "postag"; + protected static final String POSTAG_REGEXP = "postag_regexp"; + protected static final String REGEXP = "regexp"; + protected static final String NEGATE = "negate"; + protected static final String INFLECTED = "inflected"; + protected static final String NEGATE_POS = "negate_pos"; + protected static final String MARKER = "marker"; + protected static final String DEFAULT = "default"; + protected static final String TYPE = "type"; + protected static final String SPACEBEFORE = "spacebefore"; + protected static final String EXAMPLE = "example"; + protected static final String SCOPE = "scope"; + protected static final String IGNORE = "ignore"; + protected static final String SKIP = "skip"; + protected static final String TOKEN = "token"; + protected static final String FEATURE = "feature"; + protected static final String UNIFY = "unify"; + protected static final String AND = "and"; + protected static final String EXCEPTION = "exception"; + protected static final String CASE_SENSITIVE = "case_sensitive"; + protected static final String PATTERN = "pattern"; + protected static final String MATCH = "match"; + protected static final String UNIFICATION = "unification"; + protected static final String RULEGROUP = "rulegroup"; + protected static final String NO = "no"; + protected static final String MARK_TO = "mark_to"; + protected static final String MARK_FROM = "mark_from"; + protected static final String PHRASES = "phrases"; + protected static final String MESSAGE = "message"; + + + /** Returns the list of rules held by this handler (the list itself, not a copy). */ public List<PatternRule> getRules() { + return rules; + } + + /** Rethrows the parser warning it is given, so that the warning is reported as an exception. */ public void warning (final SAXParseException e) throws SAXException { + throw e; + } + + /** Rethrows the parser error it is given. */ public void error (final SAXParseException e) throws SAXException { + throw e; + } + + /** Stores the locator in pLocator, which the error messages of this class read for line and column numbers, and passes it on to the superclass. */ @Override + public void setDocumentLocator(final Locator locator) { + pLocator = locator; + super.setDocumentLocator(locator); + } + + protected void resetToken() 
{ + posNegation = false; + posRegExp = false; + inToken = false; + tokenSpaceBefore = false; + tokenSpaceBeforeSet = false; + + resetException(); + exceptionSet = false; + tokenReference = null; + } + + /** Clears every per-exception flag so that the next exception element starts from the defaults. */ protected void resetException() { + exceptionStringNegation = false; + exceptionStringInflected = false; + exceptionPosNegation = false; + exceptionPosRegExp = false; + exceptionStringRegExp = false; + exceptionValidNext = false; + exceptionValidPrev = false; + exceptionSpaceBefore = false; + exceptionSpaceBeforeSet = false; + } + + /** Creates phraseElementList on first use; does nothing when it already exists. */ protected void phraseElementInit() { + // lazy init + if (phraseElementList == null) { + phraseElementList = new ArrayList<ArrayList<Element>>(); + } + } + /** Expands a phrase reference: every element list stored under the idref attribute is tagged with the phrase name and added to phraseElementList, preceded by a copy of the current elementList when that is not empty. Unknown ids are ignored, and so is a reference made before any phrase has been stored (phraseMap is created lazily by finalizePhrase and may still be null here). */ protected void preparePhrase(final Attributes attrs) { + phraseIdRef = attrs.getValue("idref"); + if (phraseMap != null && phraseMap.containsKey(phraseIdRef)) { + for (final List<Element> curPhrEl : phraseMap.get(phraseIdRef)) { + for (final Element e : curPhrEl) { + e.setPhraseName(phraseIdRef); + } + if (elementList.isEmpty()) { + phraseElementList.add(new ArrayList<Element>(curPhrEl)); + } else { + final ArrayList<Element> prevList = new ArrayList<Element>( + elementList); + prevList.addAll(curPhrEl); + phraseElementList.add(new ArrayList<Element>(prevList)); + prevList.clear(); + } + } + lastPhrase = true; + } + } + + /** Stores the phrase just read under phraseId: appends elementList to every list in phraseElementList (or makes it the only list), saves a copy in phraseMap and clears both working lists. */ protected void finalizePhrase() { + // lazy init + if (phraseMap == null) { + phraseMap = new HashMap<String, List<List<Element>>>(); + } + phraseElementInit(); + if (phraseElementList.isEmpty()) { + phraseElementList.add(new ArrayList<Element>(elementList)); + } else { + for (final ArrayList<Element> ph : phraseElementList) { + ph.addAll(new ArrayList<Element>(elementList)); + } + } + + phraseMap.put(phraseId, new ArrayList<List<Element>>(phraseElementList)); + elementList.clear(); + + phraseElementList.clear(); + } + + /** Starts a pattern: reads the mark_from, mark_to and case_sensitive attributes; a mark_to larger than 0 is rejected with a SAXException. */ protected void startPattern(final Attributes attrs) throws SAXException { + inPattern = true; + if (attrs.getValue(MARK_FROM) != null) { + startPositionCorrection = 
Integer.parseInt(attrs.getValue(MARK_FROM)); + } + if (attrs.getValue(MARK_TO) != null) { + endPositionCorrection = Integer.parseInt(attrs.getValue(MARK_TO)); + if (endPositionCorrection > 0) { + throw new SAXException("End position correction (mark_to="+ endPositionCorrection + + ") cannot be larger than 0: " + "\n Line: " + + pLocator.getLineNumber() + ", column: " + + pLocator.getColumnNumber() + "."); + } + } + caseSensitive = YES.equals(attrs.getValue(CASE_SENSITIVE)); + } + + + /** + * Calculates the offset of the match reference (if any) in case the match + * element has been used in the group. + * + * @param elList + * Element list where the match element was used. It is directly changed. + */ + protected void processElement(final List<Element> elList) { + int counter = 0; + for (final Element elTest : elList) { + if (elTest.getPhraseName() != null && counter > 0) { + if (elTest.isReferenceElement()) { + final int tokRef = elTest.getMatch().getTokenRef(); + elTest.getMatch().setTokenRef(tokRef + counter - 1); + final String offsetToken = elTest.getString().replace("\\" + tokRef, + "\\" + (tokRef + counter - 1)); + elTest.setStringElement(offsetToken); + } + } + counter++; + } + } + + /** Handles a match element: builds a Match from its attributes and registers it either as a suggestion match (inside a message) or as the back-reference of the current token. The case_conversion and include_skipped keywords are upper-cased with a fixed locale, so the result does not depend on the default locale of the JVM (under a Turkish locale "following" would otherwise become "FOLLOWİNG"). */ protected void setMatchElement(final Attributes attrs) throws SAXException { + inMatch = true; + match = new StringBuilder(); + Match.CaseConversion caseConversion = Match.CaseConversion.NONE; + if (attrs.getValue("case_conversion") != null) { + caseConversion = Match.CaseConversion.toCase(attrs + .getValue("case_conversion").toUpperCase(java.util.Locale.ENGLISH)); + } + Match.IncludeRange includeRange = Match.IncludeRange.NONE; + if (attrs.getValue("include_skipped") != null) { + includeRange = Match.IncludeRange.toRange(attrs + .getValue("include_skipped").toUpperCase(java.util.Locale.ENGLISH)); + } + final Match mWorker = new Match(attrs.getValue(POSTAG), attrs + .getValue("postag_replace"), YES + .equals(attrs.getValue(POSTAG_REGEXP)), attrs + .getValue("regexp_match"), attrs.getValue("regexp_replace"), + caseConversion, 
YES.equals(attrs.getValue("setpos")), + includeRange); + mWorker.setInMessageOnly(!inSuggestion); + if (inMessage) { + if (suggestionMatches == null) { + suggestionMatches = new ArrayList<Match>(); + } + suggestionMatches.add(mWorker); + //add incorrect XML character for simplicity + message.append("\u0001\\"); + message.append(attrs.getValue("no")); + if (StringTools.isEmpty(attrs.getValue("no"))) { + throw new SAXException("References cannot be empty: " + "\n Line: " + + pLocator.getLineNumber() + ", column: " + + pLocator.getColumnNumber() + "."); + } else if (Integer.parseInt(attrs.getValue("no")) < 1) { + throw new SAXException("References must be larger than 0: " + + attrs.getValue("no") + "\n Line: " + pLocator.getLineNumber() + + ", column: " + pLocator.getColumnNumber() + "."); + } + } else if (inToken && attrs.getValue("no") != null) { + final int refNumber = Integer.parseInt(attrs.getValue("no")); + if (refNumber > elementList.size()) { + throw new SAXException( + "Only backward references in match elements are possible, tried to specify token " + + refNumber + + "\n Line: " + + pLocator.getLineNumber() + + ", column: " + pLocator.getColumnNumber() + "."); + } + mWorker.setTokenRef(refNumber); + tokenReference = mWorker; + elements.append('\\'); + elements.append(refNumber); + } + } + + /** Starts an exception element: resets the exception flags, then reads the negate, scope, inflected, postag, postag_regexp, negate_pos, regexp and spacebefore attributes into the corresponding exception fields. */ protected void setExceptions(final Attributes attrs) { + inException = true; + exceptions = new StringBuilder(); + resetException(); + + exceptionStringNegation = YES.equals(attrs.getValue(NEGATE)); + exceptionValidNext = "next".equals(attrs.getValue(SCOPE)); + exceptionValidPrev = "previous".equals(attrs.getValue(SCOPE)); + exceptionStringInflected = YES.equals(attrs.getValue(INFLECTED)); + + if (attrs.getValue(POSTAG) != null) { + exceptionPosToken = attrs.getValue(POSTAG); + exceptionPosRegExp = YES.equals(attrs.getValue(POSTAG_REGEXP)); + exceptionPosNegation = YES.equals(attrs.getValue(NEGATE_POS)); + } + exceptionStringRegExp = YES.equals(attrs.getValue(REGEXP)); + 
if (attrs.getValue(SPACEBEFORE) != null) { + exceptionSpaceBefore = YES.equals(attrs.getValue(SPACEBEFORE)); + exceptionSpaceBeforeSet = !"ignore".equals(attrs.getValue(SPACEBEFORE)); + } + } + + /** Ends an exception element: creates tokenElement if this is the first exception of the token, then copies the collected string, POS and space-before exception settings onto it and resets the exception flags. */ protected void finalizeExceptions() { + inException = false; + if (!exceptionSet) { + tokenElement = new Element(StringTools.trimWhitespace(elements + .toString()), caseSensitive, regExpression, tokenInflected); + exceptionSet = true; + } + tokenElement.setNegation(tokenNegated); + if (!StringTools.isEmpty(exceptions.toString())) { + tokenElement.setStringException(StringTools.trimWhitespace(exceptions + .toString()), exceptionStringRegExp, exceptionStringInflected, + exceptionStringNegation, exceptionValidNext, exceptionValidPrev); + } + if (exceptionPosToken != null) { + tokenElement.setPosException(exceptionPosToken, exceptionPosRegExp, + exceptionPosNegation, exceptionValidNext, exceptionValidPrev); + exceptionPosToken = null; + } + if (exceptionSpaceBeforeSet) { + tokenElement.setExceptionSpaceBefore(exceptionSpaceBefore); + } + resetException(); + } + + /** Starts a token element: drops the element list left over from a preceding phrase reference, reads the token attributes (negate, inflected, skip, postag, postag_regexp, negate_pos, regexp, spacebefore) and counts the token unless it belongs to an and-group. */ protected void setToken(final Attributes attrs) { + inToken = true; + + if (lastPhrase) { + elementList.clear(); + } + + lastPhrase = false; + tokenNegated = YES.equals(attrs.getValue(NEGATE)); + tokenInflected = YES.equals(attrs.getValue(INFLECTED)); + if (attrs.getValue("skip") != null) { + skipPos = Integer.parseInt(attrs.getValue("skip")); + } + elements = new StringBuilder(); + // POSElement creation + if (attrs.getValue(POSTAG) != null) { + posToken = attrs.getValue(POSTAG); + posRegExp = YES.equals(attrs.getValue(POSTAG_REGEXP)); + posNegation = YES.equals(attrs.getValue(NEGATE_POS)); + } + regExpression = YES.equals(attrs.getValue(REGEXP)); + + if (attrs.getValue(SPACEBEFORE) != null) { + tokenSpaceBefore = YES.equals(attrs.getValue(SPACEBEFORE)); + tokenSpaceBeforeSet = !"ignore".equals(attrs.getValue(SPACEBEFORE)); + } + + if (!inAndGroup) { + tokenCounter++; + } + } + + /** Checks the mark_from and mark_to corrections against the token count of the pattern plus the given number of additional elements. */ protected void checkPositions(final 
int add) throws SAXException { + if (startPositionCorrection >= tokenCounter + add) { + throw new SAXException( + "Attempt to mark a token no. ("+ startPositionCorrection +") that is outside the pattern (" + + tokenCounter + "). Pattern elements are numbered starting from 0!" + "\n Line: " + + pLocator.getLineNumber() + ", column: " + + pLocator.getColumnNumber() + "."); + } + if (tokenCounter +add - endPositionCorrection < 0) { + throw new SAXException( + "Attempt to mark a token no. ("+ endPositionCorrection +") that is outside the pattern (" + + tokenCounter + " elements). End positions should be negative but not larger than the token count!" + + "\n Line: " + + pLocator.getLineNumber() + ", column: " + + pLocator.getColumnNumber() + "."); + } + } + + protected void checkMarkPositions() { + if (phraseElementList == null || phraseElementList.size() == 0) { + final int endMarker = elementList.size() + endPositionCorrection; + if (endMarker <= startPositionCorrection) { + throw new RuntimeException("Invalid combination of mark_from (" + startPositionCorrection + + ") and mark_to (" + endPositionCorrection + ") for rule " + id + + " with " + elementList.size() + + " tokens: the error position created by mark_from and mark_to is less than one token"); + } + } + } + + /** + * Adds Match objects for all references to tokens + * (including '\1' and the like). 
+ */ + protected List<Match> addLegacyMatches() { + if (suggestionMatches == null || suggestionMatches.isEmpty()) { + return null; + } + final List<Match> sugMatch = new ArrayList<Match>(); + final String messageStr = message.toString(); + int pos = 0; + int ind = 0; + int matchCounter = 0; + while (pos != -1) { /* visit every backslash of the message, including one at index 0 */ + pos = messageStr.indexOf('\\', ind); + if (pos != -1 && messageStr.length() > pos + 1) { /* a trailing backslash has no following character to inspect */ + if (Character.isDigit(messageStr.charAt(pos + 1))) { + if (pos == 0 || messageStr.charAt(pos - 1) != '\u0001') { /* plain '\N' reference typed in the message text */ + final Match mWorker = new Match(null, null, false, null, + null, Match.CaseConversion.NONE, false, Match.IncludeRange.NONE); + mWorker.setInMessageOnly(true); + sugMatch.add(mWorker); + } else if (messageStr.charAt(pos - 1) == '\u0001') { // real suggestion marker + sugMatch.add(suggestionMatches.get(matchCounter)); + message.deleteCharAt(pos - 1 - matchCounter); + matchCounter++; + } + } + } + ind = pos + 1; + } + if (sugMatch.isEmpty()) { + return suggestionMatches; + } + return sugMatch; + } + + /** Ends a token element: builds tokenElement from the collected text (or only sets its string when an exception already created it), applies skip, POS and match-reference settings and adds it to elementList, or to the preceding element when inside an and-group. */ protected void finalizeTokens() { + if (!exceptionSet || tokenElement == null) { + tokenElement = new Element(StringTools.trimWhitespace(elements + .toString()), caseSensitive, regExpression, tokenInflected); + tokenElement.setNegation(tokenNegated); + } else { + tokenElement.setStringElement(StringTools.trimWhitespace(elements + .toString())); + } + + if (skipPos != 0) { + tokenElement.setSkipNext(skipPos); + skipPos = 0; + } + if (posToken != null) { + tokenElement.setPosElement(posToken, posRegExp, posNegation); + posToken = null; + } + + if (tokenReference != null) { + tokenElement.setMatch(tokenReference); + } + + if (inAndGroup && andGroupCounter > 0) { + elementList.get(elementList.size() - 1) + .setAndGroupElement(tokenElement); + } else { + elementList.add(tokenElement); + } + if (inAndGroup) { + andGroupCounter++; + } + + if (inUnification) { + tokenElement.setUnification(equivalenceFeatures); + if (uniNegation) { + tokenElement.setUniNegation(); + } + 
} + + if (inUnificationDef) { + language.getUnifier().setEquivalence(uFeature, uType, tokenElement); + elementList.clear(); + } + if (tokenSpaceBeforeSet) { + tokenElement.setWhitespaceBefore(tokenSpaceBefore); + } + resetToken(); + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/bitext/BitextPatternRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/bitext/BitextPatternRule.java new file mode 100644 index 0000000..1d42a17 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/bitext/BitextPatternRule.java @@ -0,0 +1,93 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2010 Marcin Miłkowski (www.languagetool.org) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ + +package de.danielnaber.languagetool.rules.patterns.bitext; + +import java.io.IOException; + +import de.danielnaber.languagetool.AnalyzedSentence; +import de.danielnaber.languagetool.rules.Rule; +import de.danielnaber.languagetool.rules.RuleMatch; +import de.danielnaber.languagetool.rules.bitext.BitextRule; +import de.danielnaber.languagetool.rules.patterns.PatternRule; + +/** + * A bitext pattern rule class. 
A BitextPatternRule describes a language error and + * can test whether a given pre-analyzed pair of source and target text + * contains that error using the {@link Rule#match} method. It uses the syntax + * of XML files similar to normal PatternRules. + * + * @author Marcin Miłkowski + */ +public class BitextPatternRule extends BitextRule { + + private final PatternRule srcRule; + private final PatternRule trgRule; + + /** Pairs the rule that is matched against the source text with the rule that is matched against the target text. */ BitextPatternRule(final PatternRule src, final PatternRule trg) { + srcRule = src; + trgRule = trg; + } + + /** Returns the rule that is matched against the source text. */ public PatternRule getSrcRule() { + return srcRule; + } + + /** Returns the rule that is matched against the target text. */ public PatternRule getTrgRule() { + return trgRule; + } + + /** Returns the description of the source rule. */ @Override + public String getDescription() { + return srcRule.getDescription(); + } + + /** Returns the message of the target rule. */ public String getMessage() { + return trgRule.getMessage(); + } + + /** Returns the id of the source rule. */ @Override + public String getId() { + return srcRule.getId(); + } + + /** + * This method always returns an empty array. + */ + @Override + public RuleMatch[] match(AnalyzedSentence text) throws IOException { + return new RuleMatch[0]; + } + + /** Returns the matches of the target rule on targetText when the source rule matches sourceText at least once; otherwise returns an empty array. */ @Override + public RuleMatch[] match(AnalyzedSentence sourceText, + AnalyzedSentence targetText) throws IOException { + if (srcRule.match(sourceText).length > 0) { + return trgRule.match(targetText); + } + return new RuleMatch[0]; + } + + @Override + public void reset() { + // TODO Auto-generated method stub + + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/bitext/BitextPatternRuleLoader.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/bitext/BitextPatternRuleLoader.java new file mode 100644 index 0000000..508f381 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/bitext/BitextPatternRuleLoader.java @@ -0,0 +1,413 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the 
GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.rules.patterns.bitext; + +import java.io.IOException; +import java.io.InputStream; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; + +import javax.xml.parsers.SAXParser; +import javax.xml.parsers.SAXParserFactory; + +import org.xml.sax.Attributes; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.DefaultHandler; + +import de.danielnaber.languagetool.Language; +import de.danielnaber.languagetool.bitext.StringPair; +import de.danielnaber.languagetool.rules.Category; +import de.danielnaber.languagetool.rules.IncorrectExample; +import de.danielnaber.languagetool.rules.bitext.IncorrectBitextExample; +import de.danielnaber.languagetool.rules.patterns.Element; +import de.danielnaber.languagetool.rules.patterns.Match; +import de.danielnaber.languagetool.rules.patterns.PatternRule; + +/** + * Loads {@link PatternRule}s from an XML file. 
+ * + * @author Marcin Miłkowski + */ +public class BitextPatternRuleLoader extends DefaultHandler { + + /** Parses the given stream with a SAX parser and returns the bitext rules collected by the handler. Any failure is rethrown as an IOException that names filename and carries the original exception as its cause. NOTE(review): the parser is used with its default features; the load-external-dtd switch below is commented out. */ public final List<BitextPatternRule> getRules(final InputStream is, + final String filename) throws IOException { + final List<BitextPatternRule> rules; + try { + final PatternRuleHandler handler = new PatternRuleHandler(); + final SAXParserFactory factory = SAXParserFactory.newInstance(); + final SAXParser saxParser = factory.newSAXParser(); + /* saxParser.getXMLReader().setFeature( + "http://apache.org/xml/features/nonvalidating/load-external-dtd", + false); + */ + saxParser.parse(is, handler); + rules = handler.getBitextRules(); + return rules; + } catch (final Exception e) { + final IOException ioe = new IOException("Cannot load or parse '" + + filename + "'"); + ioe.initCause(e); + throw ioe; + } + } + +} + +/** SAX handler that builds the bitext pattern rules while the XML rule file is read. */ class PatternRuleHandler extends BitextXMLRuleHandler { + + private int subId; + + private boolean defaultOff; + private boolean defaultOn; + + private Category category; + private String description; + private String ruleGroupDescription; + + private PatternRule srcRule; + private PatternRule trgRule; + + private IncorrectExample trgExample; + private IncorrectExample srcExample; + + private Language srcLang; + + // =========================================================== + // SAX DocumentHandler methods + // =========================================================== + + /** Dispatches on the element name and records the state needed to build the current rule. */ @Override + public void startElement(final String namespaceURI, final String lName, + final String qName, final Attributes attrs) throws SAXException { + if (qName.equals("category")) { + final String catName = attrs.getValue("name"); + final String priorityStr = attrs.getValue("priority"); + // int prio = 0; + if (priorityStr != null) { + category = new Category(catName, Integer.parseInt(priorityStr)); + } else { + category = new Category(catName); + } + + if ("off".equals(attrs.getValue(DEFAULT))) { + category.setDefaultOff(); + } + + } else if (qName.equals("rules")) { + final 
String languageStr = attrs.getValue("targetLang"); + language = Language.getLanguageForShortName(languageStr); + if (language == null) { + throw new SAXException("Unknown language '" + languageStr + "'"); + } + } else if (qName.equals("rule")) { + id = attrs.getValue("id"); + if (inRuleGroup) { + subId++; } + if (!(inRuleGroup && defaultOff)) { + defaultOff = "off".equals(attrs.getValue(DEFAULT)); + } + + if (!(inRuleGroup && defaultOn)) { + defaultOn = "on".equals(attrs.getValue(DEFAULT)); + } + if (inRuleGroup && id == null) { /* a rule inside a group inherits the id and name of the group when it has none of its own */ + id = ruleGroupId; + } + description = attrs.getValue("name"); + if (inRuleGroup && description == null) { + description = ruleGroupDescription; + } + correctExamples = new ArrayList<StringPair>(); + incorrectExamples = new ArrayList<IncorrectBitextExample>(); + if (suggestionMatches != null) { + suggestionMatches.clear(); + } + } else if (PATTERN.equals(qName) || "target".equals(qName)) { + startPattern(attrs); + } else if (AND.equals(qName)) { + inAndGroup = true; + } else if (UNIFY.equals(qName)) { + inUnification = true; + uniNegation = YES.equals(attrs.getValue(NEGATE)); + } else if (qName.equals("feature")) { + uFeature = attrs.getValue("id"); + } else if (qName.equals(TYPE)) { + uType = attrs.getValue("id"); + uTypeList.add(uType); + } else if (qName.equals(TOKEN)) { + setToken(attrs); + } else if (qName.equals(EXCEPTION)) { + setExceptions(attrs); + } else if (qName.equals(EXAMPLE) + && "correct".equals(attrs.getValue(TYPE))) { /* constant-first equals: getValue returns null when the type attribute is missing */ + inCorrectExample = true; + correctExample = new StringBuilder(); + } else if (EXAMPLE.equals(qName) + && "incorrect".equals(attrs.getValue(TYPE))) { + inIncorrectExample = true; + incorrectExample = new StringBuilder(); + exampleCorrection = new StringBuilder(); + if (attrs.getValue("correction") != null) { + exampleCorrection.append(attrs.getValue("correction")); + } + } else if (MESSAGE.equals(qName)) { + inMessage = true; + message = new StringBuilder(); + } else if (qName.equals("short")) { + 
inShortMessage = true; + shortMessage = new StringBuilder(); + } else if (qName.equals(RULEGROUP)) { + ruleGroupId = attrs.getValue("id"); + ruleGroupDescription = attrs.getValue("name"); + defaultOff = "off".equals(attrs.getValue(DEFAULT)); + defaultOn = "on".equals(attrs.getValue(DEFAULT)); + inRuleGroup = true; + subId = 0; + } else if (qName.equals("suggestion") && inMessage) { + message.append("<suggestion>"); + inSuggestion = true; + } else if (qName.equals("match")) { + setMatchElement(attrs); + } else if (qName.equals(MARKER) && inCorrectExample) { + correctExample.append("<marker>"); + } else if (qName.equals(MARKER) && inIncorrectExample) { + incorrectExample.append("<marker>"); + } else if (qName.equals("unification")) { + uFeature = attrs.getValue("feature"); + inUnificationDef = true; + } else if (qName.equals("equivalence")) { + uType = attrs.getValue(TYPE); + } else if (qName.equals("phrases")) { + inPhrases = true; + } else if (qName.equals("includephrases")) { + phraseElementInit(); + } else if (qName.equals("phrase") && inPhrases) { + phraseId = attrs.getValue("id"); + } else if (qName.equals("phraseref") && (attrs.getValue("idref") != null)) { + preparePhrase(attrs); + } else if (qName.equals("source")) { + srcLang = Language.getLanguageForShortName(attrs.getValue("lang")); + } + } + + @Override + public void endElement(final String namespaceURI, final String sName, + final String qName) throws SAXException { + + if (qName.equals("source")) { + checkMarkPositions(); + srcRule = finalizeRule(); + } else if ("target".equals(qName)) { + checkMarkPositions(); + trgRule = finalizeRule(); + } else if ("rule".equals(qName)) { + trgRule.setMessage(message.toString()); + if (suggestionMatches != null) { + for (final Match m : suggestionMatches) { + trgRule.addSuggestionMatch(m); + } + if (phraseElementList.size() <= 1) { + suggestionMatches.clear(); + } + } + final BitextPatternRule bRule = new BitextPatternRule(srcRule, trgRule); + 
bRule.setCorrectBitextExamples(correctExamples); + bRule.setIncorrectBitextExamples(incorrectExamples); + bRule.setSourceLang(srcLang); + rules.add(bRule); + } else if (qName.equals(EXCEPTION)) { + finalizeExceptions(); + } else if (qName.equals(AND)) { + inAndGroup = false; + andGroupCounter = 0; + tokenCounter++; + } else if (qName.equals(TOKEN)) { + finalizeTokens(); + } else if (qName.equals(PATTERN)) { + inPattern = false; + if (lastPhrase) { + elementList.clear(); + } + if (phraseElementList == null || phraseElementList.isEmpty()) { + checkPositions(0); + } else { + for (List<Element> elements : phraseElementList) { + checkPositions(elements.size()); + } + } + tokenCounter = 0; + } else if (qName.equals("trgExample")) { + trgExample = setExample(); + } else if (qName.equals("srcExample")) { + srcExample = setExample(); + } else if (qName.equals("example")) { + if (inCorrectExample) { + correctExamples.add(new StringPair(srcExample.getExample(), trgExample.getExample())); + } else if (inIncorrectExample) { + if (trgExample.getCorrections() == null) { + incorrectExamples.add( + new IncorrectBitextExample( + new StringPair( + srcExample.getExample(), trgExample.getExample()) + )); + } else { + List<String> l = trgExample.getCorrections(); + String str [] = l.toArray (new String [l.size ()]); + incorrectExamples.add( + new IncorrectBitextExample( + new StringPair(srcExample.getExample(), + trgExample.getExample()), str) + ); + } + } + inCorrectExample = false; + inIncorrectExample = false; + } else if (qName.equals("message")) { + suggestionMatches = addLegacyMatches(); + inMessage = false; + } else if (qName.equals("short")) { + inShortMessage = false; + } else if (qName.equals("match")) { + if (inMessage) { + suggestionMatches.get(suggestionMatches.size() - 1).setLemmaString( + match.toString()); + } else if (inToken) { + tokenReference.setLemmaString(match.toString()); + } + inMatch = false; + } else if (qName.equals("rulegroup")) { + inRuleGroup = false; + } 
else if (qName.equals("suggestion") && inMessage) { + message.append("</suggestion>"); + inSuggestion = false; + } else if (qName.equals(MARKER) && inCorrectExample) { + correctExample.append("</marker>"); + } else if (qName.equals(MARKER) && inIncorrectExample) { + incorrectExample.append("</marker>"); + } else if (qName.equals("phrase") && inPhrases) { + finalizePhrase(); + } else if (qName.equals("includephrases")) { + elementList.clear(); + } else if (qName.equals("phrases") && inPhrases) { + inPhrases = false; + } else if (qName.equals("unification")) { + inUnificationDef = false; + } else if (qName.equals("feature")) { + equivalenceFeatures.put(uFeature, uTypeList); + uTypeList = new ArrayList<String>(); + } else if (qName.equals("unify")) { + inUnification = false; + //clear the features... + equivalenceFeatures = new HashMap<String, List<String>>(); + } + } + + private IncorrectExample setExample() { + IncorrectExample example = null; + if (inCorrectExample) { + example = new IncorrectExample(correctExample.toString()); + } else if (inIncorrectExample) { + final String[] corrections = exampleCorrection.toString().split("\\|"); + if (corrections.length > 0 && corrections[0].length() > 0) { + example = new IncorrectExample(incorrectExample.toString(), + corrections); + } else { + example = new IncorrectExample(incorrectExample.toString()); + } + } + correctExample = new StringBuilder(); + incorrectExample = new StringBuilder(); + exampleCorrection = new StringBuilder(); + return example; + } + + private PatternRule finalizeRule() { + PatternRule rule = null; + phraseElementInit(); + if (phraseElementList.isEmpty()) { + rule = new PatternRule(id, language, elementList, + description, "", shortMessage.toString()); + prepareRule(rule); + } else { + if (!elementList.isEmpty()) { + for (final ArrayList<Element> ph : phraseElementList) { + ph.addAll(new ArrayList<Element>(elementList)); + } + } + + for (final ArrayList<Element> phraseElement : phraseElementList) { 
+ processElement(phraseElement); + rule = new PatternRule(id, language, phraseElement, + description, message.toString(), shortMessage.toString(), + phraseElementList.size() > 1); + prepareRule(rule); + } + } + elementList.clear(); + if (phraseElementList != null) { + phraseElementList.clear(); + } + startPositionCorrection = 0; + endPositionCorrection = 0; + return rule; + } + private void prepareRule(final PatternRule rule) { + rule.setStartPositionCorrection(startPositionCorrection); + rule.setEndPositionCorrection(endPositionCorrection); + startPositionCorrection = 0; + endPositionCorrection = 0; + rule.setCategory(category); + if (inRuleGroup) + rule.setSubId(Integer.toString(subId)); + else + rule.setSubId("1"); + caseSensitive = false; + if (defaultOff) { + rule.setDefaultOff(); + } + + if (category.isDefaultOff() && !defaultOn) { + rule.setDefaultOff(); + } + + } + + @Override + public void characters(final char[] buf, final int offset, final int len) { + final String s = new String(buf, offset, len); + if (inException) { + exceptions.append(s); + } else if (inToken) { + elements.append(s); + } else if (inCorrectExample) { + correctExample.append(s); + } else if (inIncorrectExample) { + incorrectExample.append(s); + } else if (inMatch) { + match.append(s); + } else if (inMessage) { + message.append(s); + } else if (inShortMessage) { + shortMessage.append(s); + } + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/bitext/BitextXMLRuleHandler.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/bitext/BitextXMLRuleHandler.java new file mode 100644 index 0000000..02f5a04 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/bitext/BitextXMLRuleHandler.java @@ -0,0 +1,56 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2006 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under 
the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.rules.patterns.bitext; + +import java.util.ArrayList; +import java.util.List; + +import org.xml.sax.SAXException; +import org.xml.sax.SAXParseException; + +import de.danielnaber.languagetool.bitext.StringPair; +import de.danielnaber.languagetool.rules.bitext.IncorrectBitextExample; +import de.danielnaber.languagetool.rules.patterns.XMLRuleHandler; + +/** + * XML rule handler that loads rules from XML and throws + * exceptions on errors and warnings. 
+ * + * @author Daniel Naber + */ +class BitextXMLRuleHandler extends XMLRuleHandler { + + List<BitextPatternRule> rules = new ArrayList<BitextPatternRule>(); + + List<StringPair> correctExamples = new ArrayList<StringPair>(); + List<IncorrectBitextExample> incorrectExamples = new ArrayList<IncorrectBitextExample>(); + + List<BitextPatternRule> getBitextRules() { + return rules; + } + + public void warning (final SAXParseException e) throws SAXException { + throw e; + } + + public void error (final SAXParseException e) throws SAXException { + throw e; + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/bitext/FalseFriendsAsBitextLoader.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/bitext/FalseFriendsAsBitextLoader.java new file mode 100644 index 0000000..87c30a5 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/bitext/FalseFriendsAsBitextLoader.java @@ -0,0 +1,72 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2010 Marcin Miłkowski (www.languagetool.org) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ + +package de.danielnaber.languagetool.rules.patterns.bitext; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; + +import javax.xml.parsers.ParserConfigurationException; + +import org.xml.sax.SAXException; + +import de.danielnaber.languagetool.Language; +import de.danielnaber.languagetool.rules.patterns.FalseFriendRuleLoader; +import de.danielnaber.languagetool.rules.patterns.PatternRule; + +/** + * Loads the false friend rules as bitext pattern rules. Note that the resulting + * rules have suggestions that are not really customizable, in contradistinction + * to the 'real' bitext pattern rules. + * + * @author Marcin Miłkowski + * + */ +public class FalseFriendsAsBitextLoader { + + public List<BitextPatternRule> getFalseFriendsAsBitext(final String filename, + final Language motherTongue, final Language language) throws ParserConfigurationException, SAXException, IOException { + final FalseFriendRuleLoader ruleLoader = new FalseFriendRuleLoader(); + List<BitextPatternRule> bRules = new ArrayList<BitextPatternRule>(); + List<PatternRule> rules1 = + ruleLoader.getRules(this.getClass().getResourceAsStream(filename), + motherTongue, language); + List<PatternRule> rules2 = + ruleLoader.getRules(this.getClass().getResourceAsStream(filename), + language, motherTongue); + HashMap<String, PatternRule> srcRules = new HashMap<String, PatternRule>(); + for (PatternRule rule : rules1) { + srcRules.put(rule.getId(), rule); + } + for (PatternRule rule : rules2) { + if (srcRules.containsKey(rule.getId())) { + BitextPatternRule bRule = new BitextPatternRule( + srcRules.get(rule.getId()), rule); + bRule.setSourceLang(motherTongue); + bRule.setCategory(rule.getCategory()); + bRules.add(bRule); + 
} + } + return bRules; + } + +} + |