summaryrefslogtreecommitdiffstats
path: root/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/Element.java
diff options
context:
space:
mode:
Diffstat (limited to 'JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/Element.java')
-rw-r--r--JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/Element.java803
1 files changed, 803 insertions, 0 deletions
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/Element.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/Element.java
new file mode 100644
index 0000000..0ad7c1f
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/Element.java
@@ -0,0 +1,803 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+package de.danielnaber.languagetool.rules.patterns;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Map;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import de.danielnaber.languagetool.AnalyzedToken;
+import de.danielnaber.languagetool.AnalyzedTokenReadings;
+import de.danielnaber.languagetool.JLanguageTool;
+import de.danielnaber.languagetool.synthesis.Synthesizer;
+import de.danielnaber.languagetool.tools.StringTools;
+
+/**
+ * A part of a pattern.
+ *
+ * @author Daniel Naber
+ */
+public class Element {
+
+ private String stringToken;
+ private String posToken;
+ private String regToken;
+ private boolean posRegExp;
+
+ private boolean negation;
+ private boolean posNegation;
+
+ private final boolean caseSensitive;
+ private final boolean stringRegExp;
+ private boolean inflected;
+
+ private boolean testWhitespace;
+ private boolean whitespaceBefore;
+
+ /**
+ * List of exceptions that are valid for the current token and / or some next
+ * tokens.
+ */
+ private List<Element> exceptionList;
+
+ /**
+ * True if scope=="next".
+ */
+ private boolean exceptionValidNext;
+
+ /**
+ * True if any exception with a scope=="current" or scope=="next" is set for
+ * the element.
+ */
+ private boolean exceptionSet;
+
+ /**
+ * True if attribute scope=="previous".
+ */
+ private boolean exceptionValidPrevious;
+
+ /**
+ * List of exceptions that are valid for a previous token.
+ */
+ private List<Element> previousExceptionList;
+
+ private List<Element> andGroupList;
+ private boolean andGroupSet;
+ private boolean[] andGroupCheck;
+
+ private int skip;
+
+ private Pattern p;
+ private Pattern pPos;
+
+ private Matcher m;
+ private Matcher mPos;
+
+ /** The reference to another element in the pattern. **/
+ private Match tokenReference;
+
+ /**
+ * True when the element stores a formatted reference to another element of
+ * the pattern.
+ */
+ private boolean containsMatches;
+
+ /** Matches only tokens without any POS tag. **/
+ private static final String UNKNOWN_TAG = "UNKNOWN";
+
+ /**
+ * Parameter passed to regular expression matcher to enable case insensitive
+ * Unicode matching.
+ */
+ private static final String CASE_INSENSITIVE = "(?iu)";
+
+ private String referenceString;
+
+ /** String ID of the phrase the element is in. **/
+ private String phraseName;
+
+ /**
+ * True if the string token is non-empty and therefore has to be compared in
+ * {@link #isMatched}. The flag is set by {@link #setStringElement}. String
+ * matching takes most of the time, so it is best to skip it whenever there
+ * is nothing to compare.
+ **/
+ private boolean testString;
+
+ /**
+ * Tells if the element is inside the unification, so that {@link Unifier}
+ * tests it.
+ */
+ private boolean unified;
+ private boolean uniNegation;
+
+ private Map<String, List<String>> unificationFeatures;
+
+  /**
+   * Creates an Element that is used to match tokens in the text.
+   *
+   * @param token
+   *          String to be matched.
+   * @param caseSensitive
+   *          True if the check is case-sensitive.
+   * @param regExp
+   *          True if the string is to be treated as a regular expression.
+   * @param inflected
+   *          True if the check refers to base forms (lemmas).
+   */
+  public Element(final String token, final boolean caseSensitive,
+      final boolean regExp, final boolean inflected) {
+    // Both flags have to be assigned before setStringElement() runs,
+    // because it reads them when deciding how to build the pattern.
+    this.stringRegExp = regExp;
+    this.caseSensitive = caseSensitive;
+    this.inflected = inflected;
+    setStringElement(token);
+  }
+
+  /**
+   * Checks whether the rule element matches the token given as a parameter.
+   * If this element has an AND group, the result is also accumulated in the
+   * first slot of the AND-group check array.
+   *
+   * @param token
+   *          {@link AnalyzedToken} to check matching against.
+   * @return True if token matches, false otherwise.
+   */
+  public final boolean isMatched(final AnalyzedToken token) {
+    if (testWhitespace && !isWhitespaceBefore(token)) {
+      return false;
+    }
+    // Without a string to test, only the negation flag decides the string
+    // part of the condition.
+    final boolean stringPartOk;
+    if (testString) {
+      stringPartOk = isStringTokenMatched(token) != negation;
+    } else {
+      stringPartOk = !negation;
+    }
+    final boolean result = stringPartOk
+        && (isPosTokenMatched(token) != posNegation);
+
+    if (andGroupSet) {
+      andGroupCheck[0] = andGroupCheck[0] | result;
+    }
+    return result;
+  }
+
+  /**
+   * Checks whether any exception that is not of scope "next" matches the
+   * token.
+   *
+   * @param token
+   *          {@link AnalyzedToken} to check matching against.
+   * @return True if any of the exceptions matches (logical disjunction).
+   */
+  public final boolean isExceptionMatched(final AnalyzedToken token) {
+    if (!exceptionSet) {
+      return false;
+    }
+    for (final Element candidate : exceptionList) {
+      // exceptions with scope "next" are handled separately
+      if (!candidate.exceptionValidNext && candidate.isMatched(token)) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  /**
+   * Enables testing multiple conditions specified by different elements.
+   * Doesn't test exceptions.
+   *
+   * Works as logical AND operator only if preceded with
+   * {@link #setupAndGroup()}, and followed by {@link #checkAndGroup(boolean)}.
+   * Every AND-group member that has not matched yet is tested against the
+   * token, and its slot is marked once it matches.
+   *
+   * @param token
+   *          AnalyzedToken - the token checked.
+   */
+  public final void addMemberAndGroup(final AnalyzedToken token) {
+    if (!andGroupSet) {
+      return;
+    }
+    final int members = andGroupList.size();
+    for (int member = 0; member < members; member++) {
+      // slot 0 belongs to this element itself, members start at slot 1
+      final int slot = member + 1;
+      if (!andGroupCheck[slot] && andGroupList.get(member).isMatched(token)) {
+        andGroupCheck[slot] = true;
+      }
+    }
+  }
+
+  /**
+   * Resets the AND-group bookkeeping: allocates a fresh check array with one
+   * slot for this element plus one slot per AND-group member, all false.
+   * Does nothing if the element has no AND group.
+   */
+  public final void setupAndGroup() {
+    if (!andGroupSet) {
+      return;
+    }
+    // a newly allocated boolean array is already filled with false
+    andGroupCheck = new boolean[1 + andGroupList.size()];
+  }
+
+  /**
+   * Evaluates the AND group after its members have been tested.
+   *
+   * @param previousValue
+   *          Value to return when the element has no AND group.
+   * @return True if every slot of the AND-group check array is set; if the
+   *         element has no AND group, <code>previousValue</code> is returned
+   *         unchanged.
+   */
+  public final boolean checkAndGroup(final boolean previousValue) {
+    if (!andGroupSet) {
+      return previousValue;
+    }
+    for (final boolean conditionMet : andGroupCheck) {
+      if (!conditionMet) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  /**
+   * Tests the exceptions of all elements in the AND group against the token.
+   *
+   * The result is true as soon as the exceptions of any AND-group member
+   * match, i.e. the members are combined as a logical disjunction.
+   *
+   * @param token
+   *          AnalyzedToken - the token checked for exceptions.
+   * @return true if an exception of any AND-group member matches, false
+   *         otherwise (also when there is no AND group).
+   */
+  public final boolean isAndExceptionGroupMatched(final AnalyzedToken token) {
+    if (!andGroupSet) {
+      return false;
+    }
+    for (final Element member : andGroupList) {
+      if (member.isExceptionMatched(token)) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  /**
+   * Checks exceptions both of this element and of its AND group. Introduced
+   * for clarity.
+   *
+   * @param token
+   *          Token to match
+   * @return True if matched.
+   */
+  public final boolean isExceptionMatchedCompletely(final AnalyzedToken token) {
+    // the AND group is consulted only if the element's own exceptions fail
+    if (isExceptionMatched(token)) {
+      return true;
+    }
+    return isAndExceptionGroupMatched(token);
+  }
+
+  /**
+   * Adds an element to the AND group of this element and marks the AND group
+   * as set. A null argument is ignored.
+   *
+   * @param andToken
+   *          Element to be added to the AND group.
+   */
+  public final void setAndGroupElement(final Element andToken) {
+    if (andToken == null) {
+      return;
+    }
+    if (andGroupList == null) {
+      andGroupList = new ArrayList<Element>();
+    }
+    andGroupSet = true;
+    andGroupList.add(andToken);
+  }
+
+  /**
+   * Tells whether an AND group has been attached to this element.
+   *
+   * @return true if the element has a group of elements that all should match.
+   */
+  public final boolean hasAndGroup() {
+    return this.andGroupSet;
+  }
+
+  /**
+   * Returns the group of elements linked with the AND operator.
+   *
+   * @return List of Elements; null if no element has been added with
+   *         {@link #setAndGroupElement(Element)}.
+   */
+  public final List<Element> getAndGroup() {
+    return this.andGroupList;
+  }
+
+  /**
+   * Checks whether a previously set exception matches (in case the exception
+   * had scope == "next").
+   *
+   * @param token
+   *          {@link AnalyzedToken} to check matching against.
+   * @return True if any of the exceptions matches.
+   */
+  public final boolean isMatchedByScopeNextException(final AnalyzedToken token) {
+    if (!exceptionSet) {
+      return false;
+    }
+    for (final Element candidate : exceptionList) {
+      // only exceptions with scope "next" are relevant here
+      if (candidate.exceptionValidNext && candidate.isMatched(token)) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  /**
+   * Checks whether an exception for a previous token matches (in case the
+   * exception had scope == "previous"). Exceptions that are also flagged with
+   * scope "next" are skipped.
+   *
+   * @param token
+   *          {@link AnalyzedToken} to check matching against.
+   * @return True if any of the exceptions matches.
+   */
+  public final boolean isMatchedByPreviousException(final AnalyzedToken token) {
+    if (!exceptionValidPrevious) {
+      return false;
+    }
+    for (final Element candidate : previousExceptionList) {
+      if (!candidate.exceptionValidNext && candidate.isMatched(token)) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  /**
+   * Checks whether an exception for a previous token matches any reading of
+   * a given token (in case the exception had scope == "previous").
+   *
+   * @param prevToken
+   *          {@link AnalyzedTokenReadings} to check matching against.
+   * @return true if any of the exceptions matches at least one reading.
+   */
+  public final boolean isMatchedByPreviousException(
+      final AnalyzedTokenReadings prevToken) {
+    final int readingCount = prevToken.getReadingsLength();
+    int reading = 0;
+    while (reading < readingCount) {
+      if (isMatchedByPreviousException(prevToken.getAnalyzedToken(reading))) {
+        return true;
+      }
+      reading++;
+    }
+    return false;
+  }
+
+  /**
+   * Checks if the element refers to the sentence start.
+   *
+   * @return True if the POS of the element is the sentence start tag and the
+   *         element hasn't been set to have a negated POS token.
+   */
+  public final boolean isSentStart() {
+    if (posNegation) {
+      return false;
+    }
+    return JLanguageTool.SENTENCE_START_TAGNAME.equals(posToken);
+  }
+
+  /**
+   * Builds a short textual form of the element: an optional '!' for
+   * negation, the string token, the phrase name in braces (if set) and the
+   * POS token after a slash (if set).
+   */
+  @Override
+  public final String toString() {
+    final StringBuilder text = new StringBuilder();
+    if (negation) {
+      text.append('!');
+    }
+    text.append(stringToken);
+    if (phraseName != null) {
+      text.append(" {").append(phraseName).append('}');
+    }
+    if (posToken != null) {
+      text.append('/').append(posToken);
+    }
+    return text.toString();
+  }
+
+  /**
+   * Sets the part-of-speech condition of the element.
+   *
+   * @param posToken
+   *          The POS tag, or a regular expression over POS tags.
+   * @param regExp
+   *          True if the POS is specified as a regular expression; the
+   *          expression is compiled immediately.
+   * @param negation
+   *          True if the POS condition is negated.
+   */
+  public final void setPosElement(final String posToken, final boolean regExp,
+      final boolean negation) {
+    this.posToken = posToken;
+    this.posNegation = negation;
+    this.posRegExp = regExp;
+    if (regExp) {
+      this.pPos = Pattern.compile(posToken);
+    }
+  }
+
+  /**
+   * Returns the string token of the element.
+   *
+   * @return The string last passed to {@link #setStringElement(String)}.
+   */
+  public final String getString() {
+    return this.stringToken;
+  }
+
+  /**
+   * Sets the string token of the element. For a non-empty token of an
+   * element that uses regular expressions, the pattern is compiled here
+   * (prefixed with the case-insensitivity flags if the element is not
+   * case-sensitive).
+   *
+   * @param token
+   *          The string (or regular expression) to be matched.
+   */
+  public final void setStringElement(final String token) {
+    stringToken = token;
+    testString = !StringTools.isEmpty(stringToken);
+    if (!testString || !stringRegExp) {
+      return;
+    }
+    regToken = caseSensitive ? stringToken : CASE_INSENSITIVE + stringToken;
+    // the token "\0" is the only one that is deliberately left uncompiled
+    final boolean isBareReference = "\\0".equals(token);
+    if (!isBareReference) {
+      p = Pattern.compile(regToken);
+    }
+  }
+
+  /**
+   * Adds a POS-type exception to the element.
+   *
+   * @param posToken
+   *          The part of the speech tag in the exception.
+   * @param regExp
+   *          True if the POS is specified as a regular expression.
+   * @param negation
+   *          True if the exception is negated.
+   * @param scopeNext
+   *          True if the exception scope is next tokens.
+   * @param scopePrevious
+   *          True if the exception should match only a single previous token.
+   */
+  public final void setPosException(final String posToken,
+      final boolean regExp, final boolean negation, final boolean scopeNext,
+      final boolean scopePrevious) {
+    // an empty string token: the exception tests the POS only
+    final Element exception = new Element("", caseSensitive, false, false);
+    exception.exceptionValidNext = scopeNext;
+    exception.setPosElement(posToken, regExp, negation);
+    setException(exception, scopePrevious);
+  }
+
+  /**
+   * Adds a string-type exception to the element. The exception inherits the
+   * case sensitivity of this element.
+   *
+   * @param token
+   *          The string in the exception.
+   * @param regExp
+   *          True if the string is specified as a regular expression.
+   * @param inflected
+   *          True if the string is a base form (lemma).
+   * @param negation
+   *          True if the exception is negated.
+   * @param scopeNext
+   *          True if the exception scope is next tokens.
+   * @param scopePrevious
+   *          True if the exception should match only a single previous token.
+   */
+  public final void setStringException(final String token,
+      final boolean regExp, final boolean inflected, final boolean negation,
+      final boolean scopeNext, final boolean scopePrevious) {
+    final Element exception = new Element(token, caseSensitive, regExp,
+        inflected);
+    exception.exceptionValidNext = scopeNext;
+    exception.setNegation(negation);
+    setException(exception, scopePrevious);
+  }
+
+  /**
+   * Stores an exception in the list that corresponds to its scope, creating
+   * the list on first use and raising the matching "is set" flag.
+   *
+   * @param elem
+   *          The exception element.
+   * @param scopePrevious
+   *          True if the exception applies to the previous token; otherwise
+   *          it goes to the list of current / next scope exceptions.
+   */
+  private void setException(final Element elem, final boolean scopePrevious) {
+    if (scopePrevious) {
+      exceptionValidPrevious = true;
+      if (previousExceptionList == null) {
+        previousExceptionList = new ArrayList<Element>();
+      }
+      previousExceptionList.add(elem);
+      return;
+    }
+    if (exceptionList == null) {
+      exceptionList = new ArrayList<Element>();
+    }
+    exceptionSet = true;
+    exceptionList.add(elem);
+  }
+
+ /**
+ * Tests if part of speech matches a given string.
+ *
+ * @param token
+ * Token to test.
+ * @return true if matches
+ *
+ * Special value UNKNOWN_TAG matches null POS tags.
+ *
+ */
+ private boolean isPosTokenMatched(final AnalyzedToken token) {
+ // if no POS set
+ // defaulting to true
+ if (posToken == null) {
+ return true;
+ }
+ if (token.getPOSTag() == null) {
+ if (posRegExp) {
+ if (mPos == null) {
+ mPos = pPos.matcher(UNKNOWN_TAG);
+ } else {
+ mPos.reset(UNKNOWN_TAG);
+ }
+ return mPos.matches();
+ }
+ if (UNKNOWN_TAG.equals(posToken)) {
+ return true;
+ }
+ }
+ boolean match;
+ if (posRegExp) {
+ if (mPos == null) {
+ mPos = pPos.matcher(token.getPOSTag());
+ } else {
+ mPos.reset(token.getPOSTag());
+ }
+ match = mPos.matches();
+ } else {
+ match = posToken.equals(token.getPOSTag());
+ }
+ if (!match && UNKNOWN_TAG.equals(posToken)) { // these are helper tags,
+ // ignore them
+ match = JLanguageTool.SENTENCE_END_TAGNAME.equals(token.getPOSTag())
+ || JLanguageTool.PARAGRAPH_END_TAGNAME.equals(token.getPOSTag());
+ }
+ return match;
+ }
+
+  /**
+   * Tests whether the string token element matches a given token, either as
+   * a regular expression or by plain (optionally case-insensitive)
+   * comparison.
+   *
+   * @param token
+   *          {@link AnalyzedToken} to match against.
+   * @return True if matches.
+   */
+  private boolean isStringTokenMatched(final AnalyzedToken token) {
+    final String candidate = getTestToken(token);
+    if (!stringRegExp) {
+      return caseSensitive ? stringToken.equals(candidate)
+          : stringToken.equalsIgnoreCase(candidate);
+    }
+    // reuse the matcher if there is one, it only needs new input
+    if (m != null) {
+      m.reset(candidate);
+    } else {
+      m = p.matcher(candidate);
+    }
+    return m.matches();
+  }
+
+  /**
+   * Selects the form of the token that the string condition is tested
+   * against: the inflected form if the element is inflected, the token
+   * itself otherwise.
+   */
+  private String getTestToken(final AnalyzedToken token) {
+    // enables using words with lemmas and without lemmas
+    // in the same regexp with inflected="yes"
+    return inflected ? token.getTokenInflected() : token.getToken();
+  }
+
+  /**
+   * Gets the exception scope length, as stored by {@link #setSkipNext(int)}.
+   *
+   * @return Scope length.
+   */
+  public final int getSkipNext() {
+    return this.skip;
+  }
+
+  /**
+   * Sets the exception scope length returned by {@link #getSkipNext()}.
+   *
+   * @param i
+   *          Exception scope length.
+   */
+  public final void setSkipNext(final int i) {
+    this.skip = i;
+  }
+
+  /**
+   * Tells whether any exception with scope "previous" was added to the
+   * element.
+   *
+   * @return True if the element has a previous token matching exception.
+   */
+  public final boolean hasPreviousException() {
+    return this.exceptionValidPrevious;
+  }
+
+  /**
+   * Negates the string condition tested by
+   * {@link #isMatched(AnalyzedToken)}.
+   *
+   * @param negation
+   *          true if the string condition is to be negated.
+   */
+  public final void setNegation(final boolean negation) {
+    this.negation = negation;
+  }
+
+  /**
+   * Returns the flag set by {@link #setNegation}.
+   *
+   * @return true if the string condition is negated.
+   * @since 0.9.3
+   */
+  public final boolean getNegation() {
+    return negation;
+  }
+
+  /**
+   * Tells whether a reference to another token was set with
+   * {@link #setMatch(Match)}.
+   *
+   * @return true when this element refers to another token.
+   */
+  public final boolean isReferenceElement() {
+    return this.containsMatches;
+  }
+
+  /**
+   * Sets the reference to another token and marks this element as a
+   * reference element.
+   *
+   * @param match
+   *          Formatting object for the token reference.
+   */
+  public final void setMatch(final Match match) {
+    this.containsMatches = true;
+    this.tokenReference = match;
+  }
+
+  /**
+   * Returns the reference to another token.
+   *
+   * @return The object passed to {@link #setMatch(Match)}, or null if none
+   *         was set.
+   */
+  public final Match getMatch() {
+    return this.tokenReference;
+  }
+
+  /**
+   * Prepare Element for matching by formatting its string token and POS (if the
+   * Element is supposed to refer to some other token).
+   *
+   * The reference ("\" followed by the token reference number) inside the
+   * string token is replaced: with nothing if the reference sets the POS (the
+   * element then becomes inflected), with the formatted token otherwise.
+   *
+   * @param token
+   *          the token specified as {@link AnalyzedTokenReadings}
+   * @param synth
+   *          the language synthesizer ({@link Synthesizer})
+   * @throws IOException
+   *           declared for the calls made on the token reference.
+   */
+  public final void compile(final AnalyzedTokenReadings token,
+      final Synthesizer synth) throws IOException {
+
+    // drop the old pattern and matcher, they are rebuilt below
+    m = null;
+    p = null;
+    tokenReference.setToken(token);
+    tokenReference.setSynthesizer(synth);
+
+    // remember the original string with the reference on the first call
+    if (StringTools.isEmpty(referenceString)) {
+      referenceString = stringToken;
+    }
+    if (!tokenReference.setsPos()) {
+      final String marker = "\\" + tokenReference.getTokenRef();
+      final String replacement = tokenReference.toTokenString();
+      setStringElement(referenceString.replace(marker, replacement));
+      return;
+    }
+    final String referencedPos = tokenReference.getTargetPosTag();
+    if (referencedPos != null) {
+      mPos = null;
+      setPosElement(referencedPos, tokenReference.posRegExp(), negation);
+    }
+    final String marker = "\\" + tokenReference.getTokenRef();
+    setStringElement(referenceString.replace(marker, ""));
+    inflected = true;
+  }
+
+  /**
+   * Sets the phrase the element is in.
+   *
+   * @param s
+   *          ID of the phrase.
+   */
+  public final void setPhraseName(final String s) {
+    this.phraseName = s;
+  }
+
+  /**
+   * Checks if the Element is in any phrase, i.e. if a phrase name is set.
+   *
+   * @return True if the Element is contained in a phrase.
+   */
+  public final boolean isPartOfPhrase() {
+    return null != this.phraseName;
+  }
+
+  /**
+   * Tells whether the element matches case sensitively.
+   *
+   * @return true if the string check is case-sensitive.
+   * @since 0.9.3
+   */
+  public final boolean getCaseSensitive() {
+    return this.caseSensitive;
+  }
+
+  /**
+   * Tells whether the string token of the element is a regular expression.
+   *
+   * @return true if the string check uses regular expressions.
+   * @since 0.9.6
+   */
+  public final boolean isRegularExpression() {
+    return this.stringRegExp;
+  }
+
+  /**
+   * Returns the POS condition of the element.
+   *
+   * @return the POS of the Element, or null if no POS has been set.
+   * @since 0.9.6
+   */
+  public final String getPOStag() {
+    return this.posToken;
+  }
+
+  /**
+   * Tells whether the POS condition is negated.
+   *
+   * @return true if so.
+   */
+  public final boolean getPOSNegation() {
+    return this.posNegation;
+  }
+
+  /**
+   * Tells whether the string check refers to the inflected form of the
+   * token.
+   *
+   * @return True if so.
+   */
+  public final boolean isInflected() {
+    return this.inflected;
+  }
+
+  /**
+   * Gets the phrase the element is in.
+   *
+   * @return String The name of the phrase, or null if none has been set.
+   */
+  public final String getPhraseName() {
+    return this.phraseName;
+  }
+
+  /**
+   * Tells whether unification features were set for the element with
+   * {@link #setUnification(Map)}.
+   *
+   * @return true if the element takes part in unification.
+   */
+  public final boolean isUnified() {
+    return this.unified;
+  }
+
+  /**
+   * Stores the unification features and marks the element as unified.
+   *
+   * @param uniFeatures
+   *          A map from features to a list of types.
+   */
+  public final void setUnification(final Map<String, List<String>> uniFeatures) {
+    this.unified = true;
+    this.unificationFeatures = uniFeatures;
+  }
+
+  /**
+   * Gets the unification features and types.
+   *
+   * @return A map from features to a list of types, as passed to
+   *         {@link #setUnification(Map)}.
+   * @since 1.0.1
+   */
+  public final Map<String, List<String>> getUniFeatures() {
+    return this.unificationFeatures;
+  }
+
+  /**
+   * Marks the unification of the element as negated. The flag cannot be
+   * cleared again through this method.
+   */
+  public final void setUniNegation() {
+    this.uniNegation = true;
+  }
+
+  /**
+   * Tells whether {@link #setUniNegation()} has been called.
+   *
+   * @return true if the unification is negated.
+   */
+  public final boolean isUniNegated() {
+    return this.uniNegation;
+  }
+
+  /**
+   * Makes the element test the whitespace before the token.
+   *
+   * @param isWhite
+   *          The value the token's "whitespace before" flag must have for
+   *          the element to match.
+   */
+  public final void setWhitespaceBefore(final boolean isWhite) {
+    this.testWhitespace = true;
+    this.whitespaceBefore = isWhite;
+  }
+
+  /**
+   * Sets the whitespace condition on the exception that was added last to
+   * the list of current / next scope exceptions. Does nothing if there is no
+   * such exception.
+   *
+   * @param isWhite
+   *          The value the token's "whitespace before" flag must have for
+   *          the exception to match.
+   */
+  public final void setExceptionSpaceBefore(final boolean isWhite) {
+    if (exceptionList != null && !exceptionList.isEmpty()) {
+      // the last element is at index size() - 1; get(size()) always throws
+      // IndexOutOfBoundsException
+      exceptionList.get(exceptionList.size() - 1).setWhitespaceBefore(isWhite);
+    }
+  }
+
+  /**
+   * Compares the whitespace condition of the element with the token.
+   *
+   * @param token
+   *          {@link AnalyzedToken} to check.
+   * @return true if the token's "whitespace before" flag equals the value
+   *         set with {@link #setWhitespaceBefore(boolean)}.
+   */
+  public final boolean isWhitespaceBefore(final AnalyzedToken token) {
+    final boolean actual = token.isWhitespaceBefore();
+    return actual == whitespaceBefore;
+  }
+
+  /**
+   * Returns the exceptions with scope "current" or "next". Used for testing.
+   *
+   * @return A List of Exceptions; null if no such exception has been added.
+   * @since 1.0.0
+   */
+  public final List<Element> getExceptionList() {
+    return this.exceptionList;
+  }
+}