diff options
Diffstat (limited to 'JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/PatternRule.java')
-rw-r--r-- | JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/PatternRule.java | 652 |
1 files changed, 652 insertions, 0 deletions
/* LanguageTool, a natural language style checker
 * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
 * USA
 */
package de.danielnaber.languagetool.rules.patterns;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import de.danielnaber.languagetool.AnalyzedSentence;
import de.danielnaber.languagetool.AnalyzedTokenReadings;
import de.danielnaber.languagetool.Language;
import de.danielnaber.languagetool.rules.IncorrectExample;
import de.danielnaber.languagetool.rules.RuleMatch;
import de.danielnaber.languagetool.tools.StringTools;

/**
 * A Rule that describes a language error as a simple pattern of words or of
 * part-of-speech tags.
 *
 * @author Daniel Naber
 */
public class PatternRule extends AbstractPatternRule {

  private static final String SUGG_TAG = "<suggestion>";
  private static final String END_SUGG_TAG = "</suggestion>";

  /** Sub-id, because there can be more than one rule in a rule group. */
  private String subId;

  private String message;
  private final String shortMessage;

  /** Formatted suggestion elements. Lazily created by {@link #addSuggestionMatch(Match)}. **/
  private List<Match> suggestionMatches;

  /**
   * A list of elements as they appear in XML file (phrases count as single
   * tokens in case of matches or skipping).
   */
  private final List<Integer> elementNo;

  /**
   * This property is used for short-circuiting evaluation of the elementNo list
   * order.
   */
  private boolean useList;

  /**
   * Marks whether the rule is a member of a disjunctive set (in case of OR
   * operation on phraserefs).
   **/
  private boolean isMemberOfDisjunctiveSet;

  /**
   * @param id
   *          Id of the Rule
   * @param language
   *          Language of the Rule
   * @param elements
   *          Element (token) list
   * @param description
   *          Description to be shown (name)
   * @param message
   *          Message to be displayed to the user
   * @param shortMessage
   *          Short message, passed on unchanged to every created
   *          {@link RuleMatch}
   * @throws NullPointerException
   *           if id, language, elements or description is null
   */
  public PatternRule(final String id, final Language language,
      final List<Element> elements, final String description,
      final String message, final String shortMessage) {
    super(id, description, language, elements, false);
    if (id == null) {
      throw new NullPointerException("id cannot be null");
    }
    if (language == null) {
      throw new NullPointerException("language cannot be null");
    }
    if (elements == null) {
      throw new NullPointerException("elements cannot be null");
    }
    if (description == null) {
      throw new NullPointerException("description cannot be null");
    }

    this.message = message;
    this.shortMessage = shortMessage;
    this.elementNo = new ArrayList<Integer>();
    // Walk over the pattern and record, per XML element, how many pattern
    // elements it covers: consecutive elements with the same phrase name are
    // counted together, every other element counts as 1.
    // NOTE(review): cnt is not reset in the non-phrase branch and the element
    // that starts a different phrase is not counted; kept exactly as found
    // because the intended bookkeeping is not evident from this file alone.
    String prevName = "";
    String curName = "";
    int cnt = 0;
    int loopCnt = 0;
    for (final Element e : patternElements) {
      if (e.isPartOfPhrase()) {
        curName = e.getPhraseName();
        if (prevName.equals(curName) || StringTools.isEmpty(prevName)) {
          cnt++;
          useList = true;
        } else {
          elementNo.add(cnt);
          prevName = "";
          curName = "";
          cnt = 0;
        }
        prevName = curName;
        loopCnt++;
        if (loopCnt == patternElements.size() && !StringTools.isEmpty(prevName)) {
          elementNo.add(cnt);
        }
      } else {
        if (cnt > 0) {
          elementNo.add(cnt);
        }
        elementNo.add(1);
        loopCnt++;
      }
    }
  }

  /**
   * Same as the six-argument constructor, additionally setting whether the
   * rule is a member of a disjunctive set.
   *
   * @param isMember
   *          value returned afterwards by {@link #isWithComplexPhrase()}
   */
  public PatternRule(final String id, final Language language,
      final List<Element> elements, final String description,
      final String message, final String shortMessage, final boolean isMember) {
    this(id, language, elements, description, message, shortMessage);
    this.isMemberOfDisjunctiveSet = isMember;
  }

  public final String getSubId() {
    return subId;
  }

  public final void setSubId(final String subId) {
    this.subId = subId;
  }

  public final String getMessage() {
    return message;
  }

  /**
   * Used for testing rules: only one of the set can match.
   *
   * @return Whether the rule can non-match (as a member of disjunctive set of
   *         rules generated by phraseref in includephrases element).
   */
  public final boolean isWithComplexPhrase() {
    return isMemberOfDisjunctiveSet;
  }

  /** Reset complex status - used for testing. **/
  public final void notComplexPhrase() {
    isMemberOfDisjunctiveSet = false;
  }

  /**
   * Return the pattern as a string.
   *
   * @since 0.9.2
   */
  public final String toPatternString() {
    final List<String> strList = new ArrayList<String>();
    for (final Element patternElement : patternElements) {
      strList.add(patternElement.toString());
    }
    return StringTools.listToString(strList, ", ");
  }

  /**
   * Return the pattern as an XML string. FIXME: this is not complete,
   * information might be lost!
   *
   * @since 0.9.3
   */
  public final String toXML() {
    final StringBuilder sb = new StringBuilder();
    sb.append("<rule id=\"");
    sb.append(StringTools.escapeXML(getId()));
    sb.append("\" name=\"");
    sb.append(StringTools.escapeXML(getDescription()));
    sb.append("\">\n");
    sb.append("<pattern mark_from=\"");
    sb.append(startPositionCorrection);
    sb.append("\" mark_to=\"");
    sb.append(endPositionCorrection);
    sb.append('"');
    // for now, case sensitivity is per pattern, not per element,
    // so just use the setting of the first element:
    if (!patternElements.isEmpty() && patternElements.get(0).getCaseSensitive()) {
      sb.append(" case_sensitive=\"yes\"");
    }
    sb.append(">\n");
    for (final Element patternElement : patternElements) {
      sb.append("<token");
      if (patternElement.getNegation()) {
        sb.append(" negate=\"yes\"");
      }
      if (patternElement.isRegularExpression()) {
        sb.append(" regexp=\"yes\"");
      }
      if (patternElement.getPOStag() != null) {
        sb.append(" postag=\"");
        // escaped like the other attribute values, so that a tag containing
        // XML special characters cannot break the generated markup
        sb.append(StringTools.escapeXML(patternElement.getPOStag()));
        sb.append('"');
      }
      if (patternElement.getPOSNegation()) {
        sb.append(" negate_pos=\"yes\"");
      }
      if (patternElement.isInflected()) {
        sb.append(" inflected=\"yes\"");
      }
      sb.append('>');
      if (patternElement.getString() != null) {
        sb.append(StringTools.escapeXML(patternElement.getString()));
      } else {
        // TODO
      }
      sb.append("</token>\n");
    }
    sb.append("</pattern>\n");
    sb.append("<message>");
    sb.append(StringTools.escapeXML(message));
    sb.append("</message>\n");
    if (getIncorrectExamples() != null) {
      for (final IncorrectExample example : getIncorrectExamples()) {
        sb.append("<example type=\"incorrect\">");
        sb.append(StringTools.escapeXML(example.getExample()));
        sb.append("</example>\n");
      }
    }
    if (getCorrectExamples() != null) {
      for (final String example : getCorrectExamples()) {
        sb.append("<example type=\"correct\">");
        sb.append(StringTools.escapeXML(example));
        sb.append("</example>\n");
      }
    }
    sb.append("</rule>");
    return sb.toString();
  }

  public final void setMessage(final String message) {
    this.message = message;
  }

  /**
   * Tries to match the pattern elements against the non-whitespace tokens of
   * the sentence, starting at every token position in turn (only at the first
   * position when {@code sentStart} is set).
   *
   * @param text
   *          the analyzed sentence to check
   * @return the rule matches found, possibly an empty array
   * @throws IOException
   *           propagated from building the match message
   */
  @Override
  public final RuleMatch[] match(final AnalyzedSentence text)
      throws IOException {
    final List<RuleMatch> ruleMatches = new ArrayList<RuleMatch>();
    final AnalyzedTokenReadings[] tokens = text.getTokensWithoutWhitespace();
    // tokenPositions[n] = distance (skipped tokens + 1) consumed by the n-th
    // matched pattern element
    final int[] tokenPositions = new int[tokens.length + 1];
    final int patternSize = patternElements.size();
    final int limit = Math.max(0, tokens.length - patternSize + 1);
    Element elem = null;
    int i = 0;
    while (i < limit && !(sentStart && i > 0)) {
      boolean allElementsMatch = false;
      int firstMatchToken = -1;
      int lastMatchToken = -1;
      int matchingTokens = 0;
      int prevSkipNext = 0;
      // this variable keeps the total number
      // of tokens skipped
      int skipShiftTotal = 0;
      if (testUnification) {
        unifier.reset();
      }
      for (int k = 0; k < patternSize; k++) {
        final Element prevElement = elem;
        elem = patternElements.get(k);
        setupRef(firstMatchToken, elem, tokens);
        final int nextPos = i + k + skipShiftTotal;
        prevMatched = false;
        if (prevSkipNext + nextPos >= tokens.length || prevSkipNext < 0) { // SENT_END?
          prevSkipNext = tokens.length - (nextPos + 1);
        }
        final int maxTok = Math.min(nextPos + prevSkipNext,
            tokens.length - (patternSize - k));
        for (int m = nextPos; m <= maxTok; m++) {
          allElementsMatch = testAllReadings(tokens, elem, prevElement, m,
              firstMatchToken, prevSkipNext);
          if (allElementsMatch) {
            lastMatchToken = m;
            final int skipShift = lastMatchToken - nextPos;
            tokenPositions[matchingTokens] = skipShift + 1;
            prevSkipNext = translateElementNo(elem.getSkipNext());
            matchingTokens++;
            skipShiftTotal += skipShift;
            if (firstMatchToken == -1) {
              firstMatchToken = lastMatchToken;
            }
            break;
          }
        }
        if (!allElementsMatch) {
          break;
        }
      }

      if (allElementsMatch && matchingTokens == patternSize) {
        final RuleMatch rM = createRuleMatch(tokenPositions, tokens,
            firstMatchToken, lastMatchToken, matchingTokens);
        if (rM != null) {
          ruleMatches.add(rM);
        }
      }
      i++;
    }
    return ruleMatches.toArray(new RuleMatch[ruleMatches.size()]);
  }

  /**
   * Builds the {@link RuleMatch} for a successfully matched token sequence,
   * applying the start/end position corrections of the pattern.
   *
   * @return the rule match, or null if the corrected start position is not
   *         before the corrected end position
   */
  private RuleMatch createRuleMatch(final int[] tokenPositions,
      final AnalyzedTokenReadings[] tokens, final int firstMatchToken,
      final int lastMatchToken, final int matchingTokens) throws IOException {
    final String errMessage = formatMatches(tokens, tokenPositions,
        firstMatchToken, message);
    int correctedStPos = 0;
    if (startPositionCorrection > 0) {
      for (int l = 0; l <= startPositionCorrection; l++) {
        correctedStPos += tokenPositions[l];
      }
      correctedStPos--;
    }
    int correctedEndPos = 0;
    if (endPositionCorrection < 0) {
      int l = 0;
      while (l > endPositionCorrection) {
        correctedEndPos -= tokenPositions[matchingTokens + l - 1];
        l--;
      }
    }
    AnalyzedTokenReadings firstMatchTokenObj = tokens[firstMatchToken
        + correctedStPos];
    boolean startsWithUppercase = StringTools
        .startsWithUppercase(firstMatchTokenObj.getToken())
        && !matchConvertsCase();

    if (firstMatchTokenObj.isSentStart()
        && tokens.length > firstMatchToken + correctedStPos + 1) {
      // make uppercasing work also at sentence start:
      firstMatchTokenObj = tokens[firstMatchToken + correctedStPos + 1];
      startsWithUppercase = StringTools.startsWithUppercase(firstMatchTokenObj
          .getToken());
    }
    int fromPos = tokens[firstMatchToken + correctedStPos].getStartPos();
    // FIXME: this is fishy, assumes that comma should always come before
    // whitespace
    if (errMessage.contains(SUGG_TAG + ",")
        && firstMatchToken + correctedStPos >= 1) {
      fromPos = tokens[firstMatchToken + correctedStPos - 1].getStartPos()
          + tokens[firstMatchToken + correctedStPos - 1].getToken().length();
    }

    final int toPos = tokens[lastMatchToken + correctedEndPos].getStartPos()
        + tokens[lastMatchToken + correctedEndPos].getToken().length();
    if (fromPos < toPos) { // this can happen with some skip="-1" when the last
      // token is not matched
      return new RuleMatch(this, fromPos, toPos,
          errMessage, shortMessage, startsWithUppercase);
    } // failed to create any rule match...
    return null;
  }

  /**
   * Checks if the suggestion starts with a match that is supposed to convert
   * case. If it does, stop the default conversion to uppercase.
   *
   * @return true, if the match converts the case of the token.
   */
  private boolean matchConvertsCase() {
    if (suggestionMatches == null || suggestionMatches.isEmpty()) {
      return false;
    }
    final int tagPos = message.indexOf(SUGG_TAG);
    if (tagPos < 0) {
      // no suggestion in the message, so no suggestion can start with a match
      return false;
    }
    final int sugStart = tagPos + SUGG_TAG.length();
    // the message may end right after the opening tag; avoid charAt() past
    // the end of the string
    if (sugStart >= message.length() || message.charAt(sugStart) != '\\') {
      return false;
    }
    for (final Match sMatch : suggestionMatches) {
      if (!sMatch.isInMessageOnly() && sMatch.convertsCase()) {
        return true;
      }
    }
    return false;
  }

  /**
   * Adds a match element to the list of suggestion matches, creating the list
   * on first use.
   *
   * @param m
   *          the match to append
   */
  public final void addSuggestionMatch(final Match m) {
    if (suggestionMatches == null) {
      suggestionMatches = new ArrayList<Match>();
    }
    suggestionMatches.add(m);
  }

  /**
   * Gets the index of the element indexed by i, adding any offsets because of
   * the phrases in the rule.
   *
   * @param i
   *          Current element index.
   * @return int Index translated into XML element no.
   */
  private int translateElementNo(final int i) {
    if (!useList || i < 0) {
      return i;
    }
    int j = 0;
    for (int k = 0; k < i; k++) {
      j += elementNo.get(k);
    }
    return j;
  }

  /**
   * Returns the number of tokens recorded for the rule element under the given
   * index, which is greater than 1 for a phrase composed of many tokens.
   *
   * @param i
   *          The index of the token.
   * @return the recorded length, or 1 if the rule contains no phrases or the
   *         index is beyond the recorded elements.
   **/
  private int phraseLen(final int i) {
    if (!useList || i > (elementNo.size() - 1)) {
      return 1;
    }
    return elementNo.get(i);
  }

  /**
   * Creates a Cartesian product of the arrays stored in the input array.
   *
   * @param input
   *          Array of string arrays to combine.
   * @param output
   *          Work array of strings.
   * @param r
   *          Starting parameter (use 0 to get all combinations).
   * @param lang
   *          Text language for adding spaces in some languages.
   * @return Combined array of strings.
   */
  private static String[] combineLists(final String[][] input,
      final String[] output, final int r, final Language lang) {
    final List<String> outputList = new ArrayList<String>();
    if (r == input.length) {
      // all positions are filled in: join the current combination
      final StringBuilder sb = new StringBuilder();
      for (int k = 0; k < output.length; k++) {
        sb.append(output[k]);
        if (k < output.length - 1) {
          sb.append(StringTools.addSpace(output[k + 1], lang));
        }
      }
      outputList.add(sb.toString());
    } else {
      for (int c = 0; c < input[r].length; c++) {
        output[r] = input[r][c];
        final String[] sList = combineLists(input, output, r + 1, lang);
        outputList.addAll(Arrays.asList(sList));
      }
    }
    return outputList.toArray(new String[outputList.size()]);
  }

  /**
   * Concatenates the matches, and takes care of phrases (including inflection
   * using synthesis).
   *
   * @param start
   *          Position of the element as referenced by match element in the
   *          rule.
   * @param index
   *          The index of the element found in the matching sentence.
   * @param tokenIndex
   *          The position of the token in the AnalyzedTokenReadings array.
   * @param tokens
   *          Array of AnalyzedTokenReadings
   * @param nextTokenPos
   *          Position used to compute the number of skipped tokens handed to
   *          the match (nextTokenPos minus the token position)
   * @return Array of concatenated strings, or null if there is no match
   *         element at position start
   * @throws IOException
   *           in case disk operations (used in synthesizer) go wrong.
   */
  private String[] concatMatches(final int start, final int index,
      final int tokenIndex, final AnalyzedTokenReadings[] tokens,
      final int nextTokenPos)
      throws IOException {
    final Match match = suggestionMatches.get(start);
    if (match == null) {
      return null;
    }
    final int len = phraseLen(index);
    if (len == 1) {
      final int skippedTokens = nextTokenPos - tokenIndex;
      match.setToken(tokens, tokenIndex - 1, skippedTokens);
      match.setSynthesizer(language.getSynthesizer());
      return match.toFinalString();
    }
    // a phrase: synthesize every token of it and combine all alternatives
    final List<String[]> matchList = new ArrayList<String[]>();
    for (int i = 0; i < len; i++) {
      final int skippedTokens = nextTokenPos - (tokenIndex + i);
      match.setToken(tokens, tokenIndex - 1 + i, skippedTokens);
      match.setSynthesizer(language.getSynthesizer());
      matchList.add(match.toFinalString());
    }
    return combineLists(matchList.toArray(new String[matchList.size()][]),
        new String[matchList.size()], 0, language);
  }

  /**
   * Replace back references generated with &lt;match&gt; and {@code \1} in
   * message using Match class, and take care of skipping.
   *
   * @param tokenReadings
   *          Array of AnalyzedTokenReadings that were matched against the
   *          pattern
   * @param positions
   *          Array of relative positions of matched tokens
   * @param firstMatchTok
   *          Position of the first matched token
   * @param errorMsg
   *          String containing suggestion markup
   * @return String Formatted message.
   * @throws IOException
   *           in case the synthesizer used for the matches fails
   **/
  private String formatMatches(final AnalyzedTokenReadings[] tokenReadings,
      final int[] positions, final int firstMatchTok, final String errorMsg)
      throws IOException {
    String errorMessage = errorMsg;
    int matchCounter = 0;
    final int[] numbersToMatches = new int[errorMsg.length()];
    boolean newWay = false;
    int errLen = errorMessage.length();
    int errMarker = errorMessage.indexOf('\\');
    boolean numberFollows = false;
    if (errMarker > 0 && errMarker < errLen - 1) {
      numberFollows = StringTools.isPositiveNumber(errorMessage
          .charAt(errMarker + 1));
    }
    while (errMarker > 0 && numberFollows) {
      final int ind = errorMessage.indexOf('\\');
      if (ind > 0 && StringTools.isPositiveNumber(errorMessage.charAt(ind + 1))) {
        // numLen = length of the back reference including the backslash
        int numLen = 1;
        while (ind + numLen < errorMessage.length()
            && StringTools.isPositiveNumber(errorMessage.charAt(ind + numLen))) {
          numLen++;
        }
        final int j = Integer.parseInt(errorMessage.substring(ind + 1, ind
            + numLen)) - 1;
        int repTokenPos = 0;
        int nextTokenPos = 0;
        for (int l = 0; l <= j; l++) {
          repTokenPos += positions[l];
        }
        // only look at the following position if there is one
        if (j + 1 < positions.length) {
          nextTokenPos = firstMatchTok + repTokenPos + positions[j + 1];
        }
        if (suggestionMatches != null) {
          if (matchCounter < suggestionMatches.size()) {
            numbersToMatches[j] = matchCounter;
            if (suggestionMatches.get(matchCounter) != null) {
              final String[] matches = concatMatches(matchCounter, j,
                  firstMatchTok + repTokenPos, tokenReadings, nextTokenPos);
              final String leftSide = errorMessage.substring(0, ind);
              final String rightSide = errorMessage.substring(ind + numLen);
              if (matches.length == 1) {
                errorMessage = leftSide + matches[0] + rightSide;
              } else {
                errorMessage = formatMultipleSynthesis(matches, leftSide,
                    rightSide);
              }
              matchCounter++;
              newWay = true;
            }
          } else {
            // FIXME: is this correct? this is how we deal with multiple matches
            suggestionMatches.add(suggestionMatches.get(numbersToMatches[j]));
          }
        }

        if (!newWay) {
          // in case <match> elements weren't used (yet)
          errorMessage = errorMessage.replace("\\" + (j + 1),
              tokenReadings[firstMatchTok + repTokenPos - 1].getToken());
        }
      }
      errMarker = errorMessage.indexOf('\\');
      numberFollows = false;
      errLen = errorMessage.length();
      if (errMarker > 0 && errMarker < errLen - 1) {
        numberFollows = StringTools.isPositiveNumber(errorMessage
            .charAt(errMarker + 1));
      }
    }
    return errorMessage;
  }

  /**
   * Inserts several synthesized alternatives for one back reference into the
   * message. The text between the last opening suggestion tag on the left and
   * the first closing suggestion tag on the right is repeated around every
   * alternative; alternatives are separated by closing and re-opening the
   * suggestion markup.
   *
   * @param matches
   *          the alternative replacement strings
   * @param leftSide
   *          message text before the back reference
   * @param rightSide
   *          message text after the back reference
   * @return the message with all alternatives inserted
   */
  private static String formatMultipleSynthesis(final String[] matches,
      final String leftSide, final String rightSide) {
    String errorMessage = "";
    String suggestionLeft = "";
    String suggestionRight = "";
    String rightSideNew = rightSide;
    final int sPos = leftSide.lastIndexOf(SUGG_TAG);
    // >= 0: the tag may legitimately be the very first thing in the message
    if (sPos >= 0) {
      suggestionLeft = leftSide.substring(sPos + SUGG_TAG.length());
    }
    if (StringTools.isEmpty(suggestionLeft)) {
      errorMessage = leftSide;
    } else {
      errorMessage = leftSide.substring(0, sPos) + SUGG_TAG;
    }
    final int rPos = rightSide.indexOf(END_SUGG_TAG);
    if (rPos >= 0) {
      suggestionRight = rightSide.substring(0, rPos);
    }
    if (!StringTools.isEmpty(suggestionRight)) {
      rightSideNew = rightSide.substring(rPos);
    }
    final int lastLeftSugEnd = leftSide.indexOf(END_SUGG_TAG);
    final int lastLeftSugStart = leftSide.lastIndexOf(SUGG_TAG);
    final StringBuilder sb = new StringBuilder();
    sb.append(errorMessage);
    for (int z = 0; z < matches.length; z++) {
      sb.append(suggestionLeft);
      sb.append(matches[z]);
      sb.append(suggestionRight);
      if ((z < matches.length - 1) && lastLeftSugEnd < lastLeftSugStart) {
        sb.append(END_SUGG_TAG);
        sb.append(", ");
        sb.append(SUGG_TAG);
      }
    }
    sb.append(rightSideNew);
    return sb.toString();
  }

  /**
   * For testing only.
   */
  public final List<Element> getElements() {
    return patternElements;
  }

}