diff options
Diffstat (limited to 'JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/Unifier.java')
-rw-r--r-- | JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/Unifier.java | 432 |
1 files changed, 432 insertions, 0 deletions
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/Unifier.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/Unifier.java new file mode 100644 index 0000000..7fbb35d --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/Unifier.java @@ -0,0 +1,432 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2006 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ + +package de.danielnaber.languagetool.rules.patterns; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import de.danielnaber.languagetool.AnalyzedToken; +import de.danielnaber.languagetool.AnalyzedTokenReadings; + +/** + * Implements unification of features over tokens. + * + * @author Marcin Milkowski + */ +public class Unifier { + + //TODO: add a possibility to negate some features but not all + /** + * Negates the meaning of unification just like negation in Element tokens. + */ + private boolean negation; + + private boolean allFeatsIn; + + private int tokCnt; + + private int readingsCounter; + + private final List<AnalyzedTokenReadings> tokSequence; + + /** + * A Map for storing the equivalence types for features. Features are + * specified as Strings, and map into types defined as maps from Strings to + * Elements. + */ + private final Map<EquivalenceTypeLocator, Element> equivalenceTypes; + + /** + * A Map that stores all possible equivalence types listed for features. + */ + private final Map<String, List<String>> equivalenceFeatures; + + /** + * Map of sets of matched equivalences in the unified sequence. + */ + private final List<Map<String, Set<String>>> equivalencesMatched; + + /** + * Marks found interpretations in subsequent tokens. + */ + private List<Boolean> featuresFound; + + /** + * For checking the current token. + */ + private List<Boolean> tmpFeaturesFound; + + /** + * Internal flag for checking whether the first token in tokSequence has to be + * yet unified. + */ + private boolean firstUnified; + + private boolean inUnification; + private boolean uniMatched; + private boolean uniAllMatched; + private AnalyzedTokenReadings[] unifiedTokens; + + /** + * Instantiates the unifier. + */ + public Unifier() { + tokCnt = -1; + readingsCounter = 1; + equivalencesMatched = new ArrayList<Map<String, Set<String>>>(); + equivalenceTypes = new HashMap<EquivalenceTypeLocator, Element>(); + equivalenceFeatures = new HashMap<String, List<String>>(); + featuresFound = new ArrayList<Boolean>(); + tmpFeaturesFound = new ArrayList<Boolean>(); + tokSequence = new ArrayList<AnalyzedTokenReadings>(); + } + + /** + * Prepares equivalence types for features to be tested. All equivalence types + * are given as {@link Element}s. They create an equivalence set (with + * abstraction). + * + * @param feature + * Feature to be tested, like gender, grammatical case or number. + * @param type + * Type of equivalence for the feature, for example plural, first + * person, genitive. + * @param elem + * Element specifying the equivalence. + */ + public final void setEquivalence(final String feature, final String type, + final Element elem) { + if (equivalenceTypes.containsKey(new EquivalenceTypeLocator(feature, type))) { + return; + } + equivalenceTypes.put(new EquivalenceTypeLocator(feature, type), elem); + List<String> lTypes; + if (equivalenceFeatures.containsKey(feature)) { + lTypes = equivalenceFeatures.get(feature); + } else { + lTypes = new ArrayList<String>(); + } + lTypes.add(type); + equivalenceFeatures.put(feature, lTypes); + } + + /** + * Tests if a token has shared features with other tokens. + * + * @param aToken + * - token to be tested + * @param feature + * - feature to be tested + * @param type + * - type of equivalence relation for the feature + * @return true if the token shares this type of feature with other tokens + */ + protected final boolean isSatisfied(final AnalyzedToken aToken, + final Map<String, List<String>> uFeatures) { + + if (allFeatsIn && equivalencesMatched.isEmpty()) { + return false; + } + // Error: no feature given! + if (uFeatures == null) { + return false; // throw exception?? + } + boolean unified = true; + List<String> types; + + if (allFeatsIn) { + unified &= checkNext(aToken, uFeatures); + } else { + tokCnt++; + while (equivalencesMatched.size() <= tokCnt) { + equivalencesMatched.add(new HashMap<String, Set<String>>()); + } + for (final Map.Entry<String, List<String>> feat : uFeatures.entrySet()) { + types = feat.getValue(); + if (types == null || types.isEmpty()) { + types = equivalenceFeatures.get(feat.getKey()); + } + for (final String typename : types) { + final Element testElem = equivalenceTypes + .get(new EquivalenceTypeLocator(feat.getKey(), typename)); + if (testElem == null) { + return false; + } + if (testElem.isMatched(aToken)) { + if (!equivalencesMatched.get(tokCnt).containsKey(feat.getKey())) { + final Set<String> typeSet = new HashSet<String>(); + typeSet.add(typename); + equivalencesMatched.get(tokCnt).put(feat.getKey(), typeSet); + } else { + equivalencesMatched.get(tokCnt).get(feat.getKey()).add(typename); + } + } + } + unified &= equivalencesMatched.get(tokCnt).containsKey(feat.getKey()); + if (!unified) { + break; + } + } + if (unified) { + if (tokCnt == 0 || tokSequence.isEmpty()) { + tokSequence.add(new AnalyzedTokenReadings(aToken, 0)); + } else { + tokSequence.get(0).addReading(aToken); + } + } + } + return unified ^ negation; + } + + private boolean checkNext(final AnalyzedToken aToken, + final Map<String, List<String>> uFeatures) { + boolean unifiedNext = true; + boolean anyFeatUnified = false; + List<String> types; + ArrayList<Boolean> tokenFeaturesFound = new ArrayList<Boolean>(tmpFeaturesFound); + if (allFeatsIn) { + for (int i = 0; i <= tokCnt; i++) { + boolean allFeatsUnified = true; + for (Map.Entry<String, List<String>> feat : uFeatures.entrySet()) { + boolean featUnified = false; + types = feat.getValue(); + if (types == null || types.isEmpty()) { + types = equivalenceFeatures.get(feat.getKey()); + } + for (final String typename : types) { + if (featuresFound.get(i) + && equivalencesMatched.get(i).containsKey(feat.getKey()) + && equivalencesMatched.get(i).get(feat.getKey()).contains(typename)) { + final Element testElem = equivalenceTypes + .get(new EquivalenceTypeLocator(feat.getKey(), typename)); + featUnified = featUnified || testElem.isMatched(aToken); + } + } + allFeatsUnified &= featUnified; + } + tokenFeaturesFound.set(i, allFeatsUnified); + anyFeatUnified = anyFeatUnified || allFeatsUnified; + } + unifiedNext &= anyFeatUnified; + if (unifiedNext) { + if (tokSequence.size() == readingsCounter) { + tokSequence.add(new AnalyzedTokenReadings(aToken, 0)); + } else { + tokSequence.get(readingsCounter).addReading(aToken); + } + tmpFeaturesFound = tokenFeaturesFound; + } + } + return unifiedNext; + } + + /** + * Call after every complete token (AnalyzedTokenReadings) checked. + */ + public final void startNextToken() { + featuresFound = new ArrayList<Boolean>(tmpFeaturesFound); + readingsCounter++; + } + + /** + * Starts testing only those equivalences that were previously matched. + */ + public final void startUnify() { + allFeatsIn = true; + for (int i = 0; i <= tokCnt; i++) { + featuresFound.add(true); + } + tmpFeaturesFound = new ArrayList<Boolean>(featuresFound); + } + + public final void setNegation(final boolean neg) { + negation = neg; + } + + public final boolean getNegation() { + return negation; + } + + /** + * Resets after use of unification. Required. + */ + public final void reset() { + equivalencesMatched.clear(); + allFeatsIn = false; + negation = false; + tokCnt = -1; + featuresFound.clear(); + tmpFeaturesFound.clear(); + tokSequence.clear(); + readingsCounter = 1; + firstUnified = false; + uniMatched = false; + uniAllMatched = false; + inUnification = false; + } + + /** + * Gets a full sequence of filtered tokens. + * + * @return Array of AnalyzedTokenReadings that match equivalence relation + * defined for features tested. + */ + public final AnalyzedTokenReadings[] getUnifiedTokens() { + if (tokSequence.isEmpty()) { + return null; + } + if (!firstUnified) { + AnalyzedTokenReadings tmpATR; + int first = 0; + tmpFeaturesFound.add(true); // Bentley's search idea + while (!tmpFeaturesFound.get(first)) { + first++; + } + tmpFeaturesFound.remove(tmpFeaturesFound.size() - 1); + if (first >= tmpFeaturesFound.size()) { + return null; + } + // FIXME: why this happens?? + final int numRead = tokSequence.get(0).getReadingsLength(); + if (first < numRead) { + tmpATR = new AnalyzedTokenReadings(tokSequence.get(0).getAnalyzedToken( + first), 0); + for (int i = first + 1; i <= Math.min(numRead - 1, tokCnt); i++) { + if (tmpFeaturesFound.get(i)) { + tmpATR.addReading(tokSequence.get(0).getAnalyzedToken(i)); + } + } + tokSequence.set(0, tmpATR); + } + firstUnified = true; + } + final AnalyzedTokenReadings[] atr = tokSequence + .toArray(new AnalyzedTokenReadings[tokSequence.size()]); + return atr; + } + + /** + * Tests if the token sequence is unified. + * + * @param matchToken + * AnalyzedToken token to unify + * @param feature + * String: feature to unify over + * @param type + * String: value types of the feature + * @param isUniNegated + * if true, then return negated result + * @param lastReading + * true when the matchToken is the last reading in the + * AnalyzedReadings + * @return True if the tokens in the sequence are unified. + */ + public final boolean isUnified(final AnalyzedToken matchToken, + final Map<String, List<String>> uFeatures, final boolean isUniNegated, + final boolean lastReading) { + if (inUnification) { + uniMatched |= isSatisfied(matchToken, uFeatures); + uniAllMatched = uniMatched; + if (lastReading) { + startNextToken(); + unifiedTokens = getUnifiedTokens(); + uniMatched = false; + } + return uniAllMatched; + } + if (isUniNegated) { + setNegation(true); + } + isSatisfied(matchToken, uFeatures); + if (lastReading) { + inUnification = true; + uniMatched = false; + startUnify(); + } + return true; + } + + /** + * Used for getting a unified sequence in case when simple test method + * {@link #isUnified} was used. + * + * @return An array of {@link AnalyzedTokenReadings} + */ + public final AnalyzedTokenReadings[] getFinalUnified() { + if (inUnification) { + return unifiedTokens; + } + return null; + } +} + +class EquivalenceTypeLocator { + + @Override + public int hashCode() { + final int prime = 31; + int result = 1; + result = prime * result + ((feature == null) ? 0 : feature.hashCode()); + result = prime * result + ((type == null) ? 0 : type.hashCode()); + return result; + } + + @Override + public boolean equals(final Object obj) { + if (this == obj) { + return true; + } + if (obj == null) { + return false; + } + if (getClass() != obj.getClass()) { + return false; + } + final EquivalenceTypeLocator other = (EquivalenceTypeLocator) obj; + if (feature == null) { + if (other.feature != null) { + return false; + } + } else if (!feature.equals(other.feature)) { + return false; + } + if (type == null) { + if (other.type != null) { + return false; + } + } else if (!type.equals(other.type)) { + return false; + } + return true; + } + + private final String feature; + private final String type; + + EquivalenceTypeLocator(final String feature, final String type) { + this.feature = feature; + this.type = type; + } + +}
\ No newline at end of file |