diff options
Diffstat (limited to 'JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/Element.java')
-rw-r--r-- | JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/Element.java | 803 |
1 files changed, 803 insertions, 0 deletions
/* LanguageTool, a natural language style checker 
 * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
 * 
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
 * USA
 */
package de.danielnaber.languagetool.rules.patterns;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import de.danielnaber.languagetool.AnalyzedToken;
import de.danielnaber.languagetool.AnalyzedTokenReadings;
import de.danielnaber.languagetool.JLanguageTool;
import de.danielnaber.languagetool.synthesis.Synthesizer;
import de.danielnaber.languagetool.tools.StringTools;

/**
 * A part of a pattern.
 * 
 * <p>An element combines an optional string test (literal or regular
 * expression, on the surface form or on the lemma), an optional POS tag test,
 * optional exceptions, and an optional group of further elements that are
 * evaluated together (the "AND group").
 * 
 * <p>Instances cache {@link Matcher} objects in fields and reuse them between
 * calls, so the matching methods mutate the instance.
 * 
 * @author Daniel Naber
 */
public class Element {

  private String stringToken;
  private String posToken;
  private String regToken;
  private boolean posRegExp;

  private boolean negation;
  private boolean posNegation;

  private final boolean caseSensitive;
  private final boolean stringRegExp;
  private boolean inflected;

  private boolean testWhitespace;
  private boolean whitespaceBefore;

  /**
   * List of exceptions that are valid for the current token and / or some next
   * tokens.
   */
  private List<Element> exceptionList;

  /**
   * True if scope=="next".
   */
  private boolean exceptionValidNext;

  /**
   * True if any exception with a scope=="current" or scope=="next" is set for
   * the element.
   */
  private boolean exceptionSet;

  /**
   * True if attribute scope=="previous".
   */
  private boolean exceptionValidPrevious;

  /**
   * List of exceptions that are valid for a previous token.
   */
  private List<Element> previousExceptionList;

  private List<Element> andGroupList;
  private boolean andGroupSet;

  /**
   * Per-condition results of the AND group: index 0 belongs to this element,
   * index {@code i + 1} to {@code andGroupList.get(i)}. Allocated by
   * {@link #setupAndGroup()}.
   */
  private boolean[] andGroupCheck;

  private int skip;

  private Pattern p;
  private Pattern pPos;

  /** Cached matcher for {@link #p}; created lazily and reused via reset(). */
  private Matcher m;
  /** Cached matcher for {@link #pPos}; created lazily and reused via reset(). */
  private Matcher mPos;

  /** The reference to another element in the pattern. **/
  private Match tokenReference;

  /**
   * True when the element stores a formatted reference to another element of
   * the pattern.
   */
  private boolean containsMatches;

  /** Matches only tokens without any POS tag. **/
  private static final String UNKNOWN_TAG = "UNKNOWN";

  /**
   * Parameter passed to regular expression matcher to enable case insensitive
   * Unicode matching.
   */
  private static final String CASE_INSENSITIVE = "(?iu)";

  private String referenceString;

  /** String ID of the phrase the element is in. **/
  private String phraseName;

  /**
   * True when the string token is non-empty, i.e. when the string test has to
   * be run at all. The string test takes most of the matching time, so it is
   * skipped whenever possible. Maintained by {@link #setStringElement}.
   **/
  private boolean testString;

  /**
   * Tells if the element is inside the unification, so that {@link Unifier}
   * tests it.
   */
  private boolean unified;
  private boolean uniNegation;

  private Map<String, List<String>> unificationFeatures;

  /**
   * Creates Element that is used to match tokens in the text.
   * 
   * @param token
   *          String to be matched
   * @param caseSensitive
   *          True if the check is case-sensitive.
   * @param regExp
   *          True if the check uses regular expressions.
   * @param inflected
   *          True if the check refers to base forms (lemmas).
   */
  public Element(final String token, final boolean caseSensitive,
      final boolean regExp, final boolean inflected) {
    this.caseSensitive = caseSensitive;
    this.stringRegExp = regExp;
    this.inflected = inflected;
    setStringElement(token);
  }

  /**
   * Checks whether the rule element matches the token given as a parameter.
   * 
   * <p>If a whitespace test was requested and fails, the result is false
   * without evaluating anything else. Otherwise the (possibly negated) string
   * test and the (possibly negated) POS test must both succeed. When the
   * element has an AND group, the result is also OR-ed into slot 0 of the
   * group's result array, which must have been allocated by
   * {@link #setupAndGroup()} beforehand.
   * 
   * @param token
   *          {@link AnalyzedToken} to check matching against
   * @return True if token matches, false otherwise.
   */
  public final boolean isMatched(final AnalyzedToken token) {
    if (testWhitespace && !isWhitespaceBefore(token)) {
      return false;
    }
    final boolean matched;
    if (testString) {
      matched = (isStringTokenMatched(token) ^ negation)
          && (isPosTokenMatched(token) ^ posNegation);
    } else {
      // no string to test: a negated empty string can never match
      matched = (!negation) && (isPosTokenMatched(token) ^ posNegation);
    }

    if (andGroupSet) {
      andGroupCheck[0] |= matched;
    }
    return matched;
  }

  /**
   * Checks whether an exception matches. Only exceptions that are not scoped
   * to next tokens are considered.
   * 
   * @param token
   *          {@link AnalyzedToken} to check matching against
   * @return True if any of the exceptions matches (logical disjunction).
   */
  public final boolean isExceptionMatched(final AnalyzedToken token) {
    if (exceptionSet) {
      for (final Element testException : exceptionList) {
        if (!testException.exceptionValidNext
            && testException.isMatched(token)) {
          return true;
        }
      }
    }
    return false;
  }

  /**
   * Enables testing multiple conditions specified by different elements.
   * Doesn't test exceptions.
   * 
   * Works as logical AND operator only if preceded with
   * {@link #setupAndGroup()}, and followed by {@link #checkAndGroup(boolean)}.
   * 
   * <p>Conditions that were already satisfied by an earlier token are not
   * re-tested.
   * 
   * @param token
   *          AnalyzedToken - the token checked.
   */
  public final void addMemberAndGroup(final AnalyzedToken token) {
    if (andGroupSet) {
      for (int i = 0; i < andGroupList.size(); i++) {
        // slot 0 is reserved for this element, group members start at 1
        if (!andGroupCheck[i + 1] && andGroupList.get(i).isMatched(token)) {
          andGroupCheck[i + 1] = true;
        }
      }
    }
  }

  /**
   * Allocates a fresh, all-false result array for the AND group (one slot for
   * this element plus one per group member). Does nothing if the element has
   * no AND group.
   */
  public final void setupAndGroup() {
    if (andGroupSet) {
      andGroupCheck = new boolean[andGroupList.size() + 1];
      Arrays.fill(andGroupCheck, false);
    }
  }

  /**
   * Evaluates the AND group.
   * 
   * @param previousValue
   *          Value to return unchanged when the element has no AND group.
   * @return True if every slot of the AND group result array is true; if the
   *         element has no AND group, {@code previousValue}.
   */
  public final boolean checkAndGroup(final boolean previousValue) {
    if (andGroupSet) {
      boolean allConditionsMatch = true;
      for (final boolean testValue : andGroupCheck) {
        allConditionsMatch &= testValue;
      }
      return allConditionsMatch;
    }
    return previousValue;
  }

  /**
   * Tests the exceptions of the elements in the AND group.
   * 
   * @param token
   *          AnalyzedToken - the token checked for exceptions.
   * @return true if an exception of at least one element of the AND group
   *         matches the token, false otherwise (also when there is no AND
   *         group).
   */
  public final boolean isAndExceptionGroupMatched(final AnalyzedToken token) {
    if (andGroupSet) {
      for (final Element testAndGroup : andGroupList) {
        if (testAndGroup.isExceptionMatched(token)) {
          return true;
        }
      }
    }
    return false;
  }

  /**
   * This method checks exceptions both in AND-group and the token. Introduced
   * for clarity.
   * 
   * @param token
   *          Token to match
   * @return True if matched.
   */
  public final boolean isExceptionMatchedCompletely(final AnalyzedToken token) {
    // note: short-circuiting possible
    return isExceptionMatched(token) || isAndExceptionGroupMatched(token);
  }

  /**
   * Adds an element to the AND group of this element. A null argument is
   * ignored.
   * 
   * @param andToken
   *          Element that has to match together with this one.
   */
  public final void setAndGroupElement(final Element andToken) {
    if (andToken != null) {
      if (andGroupList == null) {
        andGroupList = new ArrayList<Element>();
      }
      andGroupSet = true;
      andGroupList.add(andToken);
    }
  }

  /**
   * Checks if this element has an AND group associated with it.
   * 
   * @return true if the element has a group of elements that all should match.
   */
  public final boolean hasAndGroup() {
    return andGroupSet;
  }

  /**
   * Returns the group of elements linked with AND operator.
   * 
   * @return List of Elements; null if no element was ever added to the group.
   */
  public final List<Element> getAndGroup() {
    return andGroupList;
  }

  /**
   * Checks whether a previously set exception matches (in case the exception
   * had scope == "next").
   * 
   * @param token
   *          {@link AnalyzedToken} to check matching against.
   * @return True if any of the exceptions matches.
   */
  public final boolean isMatchedByScopeNextException(final AnalyzedToken token) {
    if (exceptionSet) {
      for (final Element testException : exceptionList) {
        if (testException.exceptionValidNext
            && testException.isMatched(token)) {
          return true;
        }
      }
    }
    return false;
  }

  /**
   * Checks whether an exception for a previous token matches (in case the
   * exception had scope == "previous").
   * 
   * @param token
   *          {@link AnalyzedToken} to check matching against.
   * @return True if any of the exceptions matches.
   */
  public final boolean isMatchedByPreviousException(final AnalyzedToken token) {
    if (exceptionValidPrevious) {
      for (final Element testException : previousExceptionList) {
        if (!testException.exceptionValidNext
            && testException.isMatched(token)) {
          return true;
        }
      }
    }
    return false;
  }

  /**
   * Checks whether an exception for a previous token matches at least one
   * reading of a given token (in case the exception had scope == "previous").
   * 
   * @param prevToken
   *          {@link AnalyzedTokenReadings} to check matching against.
   * @return true if any of the exceptions matches any of the readings.
   */
  public final boolean isMatchedByPreviousException(
      final AnalyzedTokenReadings prevToken) {
    final int numReadings = prevToken.getReadingsLength();
    for (int i = 0; i < numReadings; i++) {
      if (isMatchedByPreviousException(prevToken.getAnalyzedToken(i))) {
        return true;
      }
    }
    return false;
  }

  /**
   * Checks if the token is a SENT_START.
   * 
   * @return True if the element starts the sentence and the element hasn't been
   *         set to have negated POS token.
   * 
   */
  public final boolean isSentStart() {
    return JLanguageTool.SENTENCE_START_TAGNAME.equals(posToken)
        && !posNegation;
  }

  /**
   * Builds a short textual form of the element: an optional leading
   * {@code '!'} for negation, the string token, the phrase name in braces (if
   * any) and the POS token after a slash (if any).
   */
  @Override
  public final String toString() {
    final StringBuilder sb = new StringBuilder();
    if (negation) {
      sb.append('!');
    }
    sb.append(stringToken);
    if (phraseName != null) {
      sb.append(" {");
      sb.append(phraseName);
      sb.append('}');
    }
    if (posToken != null) {
      sb.append('/');
      sb.append(posToken);
    }
    return sb.toString();
  }

  /**
   * Sets the POS tag test of the element.
   * 
   * @param posToken
   *          The POS tag, or a regular expression over POS tags.
   * @param regExp
   *          True if {@code posToken} is a regular expression.
   * @param negation
   *          True if the result of the POS test is to be negated.
   */
  public final void setPosElement(final String posToken, final boolean regExp,
      final boolean negation) {
    this.posToken = posToken;
    this.posNegation = negation;
    posRegExp = regExp;
    if (posRegExp) {
      pPos = Pattern.compile(posToken);
      // the cached matcher belongs to the previous pattern, drop it
      mPos = null;
    }
  }

  /**
   * Gets the string token of the element.
   * 
   * @return The string (or regular expression source) this element tests.
   */
  public final String getString() {
    return stringToken;
  }

  /**
   * Sets the string test of the element. For regular-expression elements the
   * pattern is compiled here, prefixed with the case-insensitivity flags when
   * the element is not case-sensitive. The special token {@code \0} is not
   * compiled.
   * 
   * @param token
   *          String or regular expression to be matched; an empty value
   *          disables the string test.
   */
  public final void setStringElement(final String token) {
    this.stringToken = token;
    testString = !StringTools.isEmpty(stringToken);
    if (testString && stringRegExp) {
      regToken = stringToken;
      if (!caseSensitive) {
        regToken = CASE_INSENSITIVE + stringToken;
      }
      if (!"\\0".equals(token)) {
        p = Pattern.compile(regToken);
        // the cached matcher belongs to the previous pattern, drop it
        m = null;
      }
    }
  }

  /**
   * Sets a POS-type exception for matching string tokens.
   * 
   * @param posToken
   *          The part of the speech tag in the exception.
   * @param regExp
   *          True if the POS is specified as a regular expression.
   * @param negation
   *          True if the exception is negated.
   * @param scopeNext
   *          True if the exception scope is next tokens.
   * @param scopePrevious
   *          True if the exception should match only a single previous token.
   */
  public final void setPosException(final String posToken,
      final boolean regExp, final boolean negation, final boolean scopeNext,
      final boolean scopePrevious) {
    final Element posException = new Element("", this.caseSensitive, false,
        false);
    posException.setPosElement(posToken, regExp, negation);
    posException.exceptionValidNext = scopeNext;
    setException(posException, scopePrevious);
  }

  /**
   * Sets a string-type exception for matching string tokens.
   * 
   * @param token
   *          The string in the exception.
   * @param regExp
   *          True if the string is specified as a regular expression.
   * @param inflected
   *          True if the string is a base form (lemma).
   * @param negation
   *          True if the exception is negated.
   * @param scopeNext
   *          True if the exception scope is next tokens.
   * @param scopePrevious
   *          True if the exception should match only a single previous token.
   */
  public final void setStringException(final String token,
      final boolean regExp, final boolean inflected, final boolean negation,
      final boolean scopeNext, final boolean scopePrevious) {
    final Element stringException = new Element(token, this.caseSensitive,
        regExp, inflected);
    stringException.setNegation(negation);
    stringException.exceptionValidNext = scopeNext;
    setException(stringException, scopePrevious);
  }

  /**
   * Stores an exception in the list matching its scope: exceptions for a
   * previous token go to {@link #previousExceptionList}, all others to
   * {@link #exceptionList}. The lists are created on first use.
   * 
   * @param elem
   *          The exception to store.
   * @param scopePrevious
   *          True if the exception applies to the previous token.
   */
  private void setException(final Element elem, final boolean scopePrevious) {
    exceptionValidPrevious |= scopePrevious;
    if (scopePrevious) {
      if (previousExceptionList == null) {
        previousExceptionList = new ArrayList<Element>();
      }
      previousExceptionList.add(elem);
    } else {
      if (exceptionList == null) {
        exceptionList = new ArrayList<Element>();
      }
      exceptionSet = true;
      exceptionList.add(elem);
    }
  }

  /**
   * Tests if part of speech matches a given string.
   * 
   * <p>Without a POS token set, every token matches. A token with a null POS
   * tag is tested as if its tag were the special value UNKNOWN_TAG. A POS
   * token equal to UNKNOWN_TAG additionally matches the sentence-end and
   * paragraph-end helper tags.
   * 
   * @param token
   *          Token to test.
   * @return true if matches
   */
  private boolean isPosTokenMatched(final AnalyzedToken token) {
    // if no POS set
    // defaulting to true
    if (posToken == null) {
      return true;
    }
    if (token.getPOSTag() == null) {
      if (posRegExp) {
        if (mPos == null) {
          mPos = pPos.matcher(UNKNOWN_TAG);
        } else {
          mPos.reset(UNKNOWN_TAG);
        }
        return mPos.matches();
      }
      if (UNKNOWN_TAG.equals(posToken)) {
        return true;
      }
    }
    boolean match;
    if (posRegExp) {
      if (mPos == null) {
        mPos = pPos.matcher(token.getPOSTag());
      } else {
        mPos.reset(token.getPOSTag());
      }
      match = mPos.matches();
    } else {
      match = posToken.equals(token.getPOSTag());
    }
    if (!match && UNKNOWN_TAG.equals(posToken)) { // these are helper tags,
      // ignore them
      match = JLanguageTool.SENTENCE_END_TAGNAME.equals(token.getPOSTag())
          || JLanguageTool.PARAGRAPH_END_TAGNAME.equals(token.getPOSTag());
    }
    return match;
  }

  /**
   * Tests whether the string token element matches a given token.
   * 
   * <p>Regular-expression elements must match the whole test string; literal
   * elements are compared with or without regard to case, depending on the
   * element's case sensitivity.
   * 
   * @param token
   *          {@link AnalyzedToken} to match against.
   * @return True if matches.
   */
  private boolean isStringTokenMatched(final AnalyzedToken token) {
    final String testToken = getTestToken(token);
    if (stringRegExp) {
      if (m == null) {
        m = p.matcher(testToken);
      } else {
        m.reset(testToken);
      }
      return m.matches();
    }
    if (caseSensitive) {
      return stringToken.equals(testToken);
    }
    return stringToken.equalsIgnoreCase(testToken);
  }

  /**
   * Selects the string the element is tested against.
   * 
   * @param token
   *          The token being matched.
   * @return The token's inflected form if the element is inflected, otherwise
   *         its surface form.
   */
  private String getTestToken(final AnalyzedToken token) {
    // enables using words with lemmas and without lemmas
    // in the same regexp with inflected="yes"
    if (inflected) {
      return token.getTokenInflected();
    }
    return token.getToken();
  }

  /**
   * Gets the exception scope length.
   * 
   * @return Scope length.
   */
  public final int getSkipNext() {
    return skip;
  }

  /**
   * Sets the exception scope length.
   * 
   * @param i
   *          Exception scope length.
   */
  public final void setSkipNext(final int i) {
    skip = i;
  }

  /**
   * Checks if the element has an exception for a previous token.
   * 
   * @return True if the element has a previous token matching exception.
   */
  public final boolean hasPreviousException() {
    return exceptionValidPrevious;
  }

  /**
   * Negates the meaning of the string test in {@link #isMatched}.
   * 
   * @param negation
   *          - true if the meaning of the string test is to be negated.
   */
  public final void setNegation(final boolean negation) {
    this.negation = negation;
  }

  /**
   * see {@link #setNegation}
   * 
   * @return true if the string test is negated.
   * @since 0.9.3
   */
  public final boolean getNegation() {
    return this.negation;
  }

  /**
   * 
   * @return true when this element refers to another token.
   */
  public final boolean isReferenceElement() {
    return containsMatches;
  }

  /**
   * Sets the reference to another token.
   * 
   * @param match
   *          Formatting object for the token reference.
   */
  public final void setMatch(final Match match) {
    tokenReference = match;
    containsMatches = true;
  }

  /**
   * Gets the reference to another token.
   * 
   * @return The object set by {@link #setMatch}, or null if none was set.
   */
  public final Match getMatch() {
    return tokenReference;
  }

  /**
   * Prepare Element for matching by formatting its string token and POS (if the
   * Element is supposed to refer to some other token).
   * 
   * <p>The token reference is dereferenced unconditionally, so this must only
   * be called on elements for which {@link #setMatch} was called. The original
   * string token is remembered on the first call, so that the reference
   * placeholder can be substituted again on later calls.
   * 
   * @param token
   *          the token specified as {@link AnalyzedTokenReadings}
   * @param synth
   *          the language synthesizer ({@link Synthesizer})
   * @throws IOException
   *           declared by this method; presumably raised by the token
   *           reference while formatting the token (TODO confirm).
   */
  public final void compile(final AnalyzedTokenReadings token,
      final Synthesizer synth) throws IOException {

    m = null;
    p = null;
    tokenReference.setToken(token);
    tokenReference.setSynthesizer(synth);

    if (StringTools.isEmpty(referenceString)) {
      referenceString = stringToken;
    }
    final String placeholder = "\\" + tokenReference.getTokenRef();
    if (tokenReference.setsPos()) {
      final String posReference = tokenReference.getTargetPosTag();
      if (posReference != null) {
        mPos = null;
        // NOTE(review): the string negation flag is passed as the POS
        // negation here - confirm this is intended.
        setPosElement(posReference, tokenReference.posRegExp(), negation);
      }
      setStringElement(referenceString.replace(placeholder, ""));
      inflected = true;
    } else {
      setStringElement(referenceString.replace(placeholder,
          tokenReference.toTokenString()));
    }
  }

  /**
   * Sets the phrase the element is in.
   * 
   * @param s
   *          ID of the phrase.
   */
  public final void setPhraseName(final String s) {
    phraseName = s;
  }

  /**
   * Checks if the Element is in any phrase.
   * 
   * @return True if the Element is contained in the phrase.
   */
  public final boolean isPartOfPhrase() {
    return phraseName != null;
  }

  /**
   * Whether the element matches case sensitively.
   * 
   * @return true if so.
   * @since 0.9.3
   */
  public final boolean getCaseSensitive() {
    return caseSensitive;
  }

  /**
   * Tests whether the element matches a regular expression.
   * 
   * @return true if the string token is a regular expression.
   * @since 0.9.6
   */
  public final boolean isRegularExpression() {
    return stringRegExp;
  }

  /**
   * @return the POS of the Element
   * @since 0.9.6
   */
  public final String getPOStag() {
    return posToken;
  }

  /**
   * Tests whether the POS is negated.
   * 
   * @return true if so.
   */
  public final boolean getPOSNegation() {
    return posNegation;
  }

  /**
   * Whether the token is inflected.
   * 
   * @return True if so.
   */
  public final boolean isInflected() {
    return inflected;
  }

  /**
   * Gets the phrase the element is in.
   * 
   * @return String The name of the phrase.
   */
  public final String getPhraseName() {
    return phraseName;
  }

  /**
   * Tells if the element takes part in unification.
   * 
   * @return true if {@link #setUnification} was called.
   */
  public final boolean isUnified() {
    return unified;
  }

  /**
   * Sets the unification features and marks the element as unified.
   * 
   * @param uniFeatures
   *          A map from features to a list of types.
   */
  public final void setUnification(final Map<String, List<String>> uniFeatures) {
    unificationFeatures = uniFeatures;
    unified = true;
  }

  /**
   * Get unification features and types.
   * @return A map from features to a list of types.
   * @since 1.0.1
   */
  public final Map<String, List<String>> getUniFeatures() {
    return unificationFeatures;
  }

  /**
   * Marks the unification of this element as negated.
   */
  public final void setUniNegation() {
    uniNegation = true;
  }

  /**
   * Tells if the unification of this element is negated.
   * 
   * @return true if {@link #setUniNegation()} was called.
   */
  public final boolean isUniNegated() {
    return uniNegation;
  }

  /**
   * Makes the element test for whitespace before the token.
   * 
   * @param isWhite
   *          The whitespace-before value a token must have to match.
   */
  public final void setWhitespaceBefore(final boolean isWhite) {
    whitespaceBefore = isWhite;
    testWhitespace = true;
  }

  /**
   * Sets the whitespace test on the exception that was added last to the list
   * of current / next exceptions. Exceptions with scope "previous" are kept in
   * a separate list and are not affected. Does nothing if there is no such
   * exception.
   * 
   * @param isWhite
   *          The whitespace-before value a token must have to match the
   *          exception.
   */
  public final void setExceptionSpaceBefore(final boolean isWhite) {
    if (exceptionList != null && !exceptionList.isEmpty()) {
      // the last element is at size() - 1; get(size()) is always out of range
      exceptionList.get(exceptionList.size() - 1).setWhitespaceBefore(isWhite);
    }
  }

  /**
   * Compares the expected whitespace-before value with the token's.
   * 
   * @param token
   *          The token to test.
   * @return true if the token's whitespace-before flag equals the value set
   *         with {@link #setWhitespaceBefore}.
   */
  public final boolean isWhitespaceBefore(final AnalyzedToken token) {
    return whitespaceBefore == token.isWhitespaceBefore();
  }

  /**
   * @return A List of Exceptions (the internal list, not a copy); null if no
   *         exception for the current or next tokens was set. Used for
   *         testing.
   * @since 1.0.0
   */
  public final List<Element> getExceptionList() {
    return exceptionList;
  }
}