/* LanguageTool, a natural language style checker * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 * USA */ package de.danielnaber.languagetool.rules.patterns; import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; import java.util.List; import de.danielnaber.languagetool.AnalyzedSentence; import de.danielnaber.languagetool.AnalyzedTokenReadings; import de.danielnaber.languagetool.Language; import de.danielnaber.languagetool.rules.IncorrectExample; import de.danielnaber.languagetool.rules.RuleMatch; import de.danielnaber.languagetool.tools.StringTools; /** * A Rule that describes a language error as a simple pattern of words or of * part-of-speech tags. * * @author Daniel Naber */ public class PatternRule extends AbstractPatternRule { private static final String SUGG_TAG = ""; private static final String END_SUGG_TAG = ""; private String subId; // because there can be more than one rule in a rule // group private String message; private String shortMessage; /** Formatted suggestion elements. **/ private List suggestionMatches; /** * A list of elements as they appear in XML file (phrases count as single * tokens in case of matches or skipping). */ private List elementNo; /** * This property is used for short-circuiting evaluation of the elementNo list * order. */ private boolean useList; /** * Marks whether the rule is a member of a disjunctive set (in case of OR * operation on phraserefs). **/ private boolean isMemberOfDisjunctiveSet; /** * @param id * Id of the Rule * @param language * Language of the Rule * @param elements * Element (token) list * @param description * Description to be shown (name) * @param message * Message to be displayed to the user */ public PatternRule(final String id, final Language language, final List elements, final String description, final String message, final String shortMessage) { super(id, description, language, elements, false); if (id == null) { throw new NullPointerException("id cannot be null"); } if (language == null) { throw new NullPointerException("language cannot be null"); } if (elements == null) { throw new NullPointerException("elements cannot be null"); } if (description == null) { throw new NullPointerException("description cannot be null"); } this.message = message; this.shortMessage = shortMessage; this.elementNo = new ArrayList(); String prevName = ""; String curName = ""; int cnt = 0; int loopCnt = 0; for (final Element e : patternElements) { if (e.isPartOfPhrase()) { curName = e.getPhraseName(); if (prevName.equals(curName) || StringTools.isEmpty(prevName)) { cnt++; useList = true; } else { elementNo.add(cnt); prevName = ""; curName = ""; cnt = 0; } prevName = curName; loopCnt++; if (loopCnt == patternElements.size() && !StringTools.isEmpty(prevName)) { elementNo.add(cnt); } } else { if (cnt > 0) { elementNo.add(cnt); } elementNo.add(1); loopCnt++; } } } public PatternRule(final String id, final Language language, final List elements, final String description, final String message, final String shortMessage, final boolean isMember) { this(id, language, elements, description, message, shortMessage); this.isMemberOfDisjunctiveSet = isMember; } public final String getSubId() { return subId; } public final void setSubId(final String subId) { this.subId = subId; } public final String getMessage() { return message; } /** * Used for testing rules: only one of the set can match. * * @return Whether the rule can non-match (as a member of disjunctive set of * rules generated by phraseref in includephrases element). */ public final boolean isWithComplexPhrase() { return isMemberOfDisjunctiveSet; } /** Reset complex status - used for testing. **/ public final void notComplexPhrase() { isMemberOfDisjunctiveSet = false; } /** * Return the pattern as a string. * * @since 0.9.2 */ public final String toPatternString() { final List strList = new ArrayList(); for (Element patternElement : patternElements) { strList.add(patternElement.toString()); } return StringTools.listToString(strList, ", "); } /** * Return the pattern as an XML string. FIXME: this is not complete, information might be lost! * * @since 0.9.3 */ public final String toXML() { final StringBuilder sb = new StringBuilder(); sb.append("\n"); sb.append("\n"); for (Element patternElement : patternElements) { sb.append("'); if (patternElement.getString() != null) { sb.append(StringTools.escapeXML(patternElement.getString())); } else { // TODO } sb.append("\n"); } sb.append("\n"); sb.append(""); sb.append(StringTools.escapeXML(message)); sb.append("\n"); if (getIncorrectExamples() != null) { for (IncorrectExample example : getIncorrectExamples()) { sb.append(""); sb.append(StringTools.escapeXML(example.getExample())); sb.append("\n"); } } if (getCorrectExamples() != null) { for (String example : getCorrectExamples()) { sb.append(""); sb.append(StringTools.escapeXML(example)); sb.append("\n"); } } sb.append(""); return sb.toString(); } public final void setMessage(final String message) { this.message = message; } @Override public final RuleMatch[] match(final AnalyzedSentence text) throws IOException { final List ruleMatches = new ArrayList(); final AnalyzedTokenReadings[] tokens = text.getTokensWithoutWhitespace(); final int[] tokenPositions = new int[tokens.length + 1]; final int patternSize = patternElements.size(); final int limit = Math.max(0, tokens.length - patternSize + 1); Element elem = null; int i = 0; while (i < limit && !(sentStart && i > 0)) { boolean allElementsMatch = false; int firstMatchToken = -1; int lastMatchToken = -1; int matchingTokens = 0; int prevSkipNext = 0; // this variable keeps the total number // of tokens skipped int skipShiftTotal = 0; if (testUnification) { unifier.reset(); } for (int k = 0; k < patternSize; k++) { final Element prevElement = elem; elem = patternElements.get(k); setupRef(firstMatchToken, elem, tokens); final int nextPos = i + k + skipShiftTotal; prevMatched = false; if (prevSkipNext + nextPos >= tokens.length || prevSkipNext < 0) { // SENT_END? prevSkipNext = tokens.length - (nextPos + 1); } final int maxTok = Math.min(nextPos + prevSkipNext, tokens.length - (patternSize - k)); for (int m = nextPos; m <= maxTok; m++) { allElementsMatch = testAllReadings(tokens, elem, prevElement, m, firstMatchToken, prevSkipNext); if (allElementsMatch) { lastMatchToken = m; final int skipShift = lastMatchToken - nextPos; tokenPositions[matchingTokens] = skipShift + 1; prevSkipNext = translateElementNo(elem.getSkipNext()); matchingTokens++; skipShiftTotal += skipShift; if (firstMatchToken == -1) { firstMatchToken = lastMatchToken; } break; } } if (!allElementsMatch) { break; } } if (allElementsMatch && matchingTokens == patternSize) { final RuleMatch rM = createRuleMatch(tokenPositions, tokens, firstMatchToken, lastMatchToken, matchingTokens); if (rM != null) { ruleMatches.add(rM); } } i++; } return ruleMatches.toArray(new RuleMatch[ruleMatches.size()]); } private RuleMatch createRuleMatch(final int[] tokenPositions, final AnalyzedTokenReadings[] tokens, final int firstMatchToken, final int lastMatchToken, final int matchingTokens) throws IOException { final String errMessage = formatMatches(tokens, tokenPositions, firstMatchToken, message); int correctedStPos = 0; if (startPositionCorrection > 0) { for (int l = 0; l <= startPositionCorrection; l++) { correctedStPos += tokenPositions[l]; } correctedStPos--; } int correctedEndPos = 0; if (endPositionCorrection < 0) { int l = 0; while (l > endPositionCorrection) { correctedEndPos -= tokenPositions[matchingTokens + l - 1]; l--; } } AnalyzedTokenReadings firstMatchTokenObj = tokens[firstMatchToken + correctedStPos]; boolean startsWithUppercase = StringTools .startsWithUppercase(firstMatchTokenObj.getToken()) && !matchConvertsCase(); if (firstMatchTokenObj.isSentStart() && tokens.length > firstMatchToken + correctedStPos + 1) { // make uppercasing work also at sentence start: firstMatchTokenObj = tokens[firstMatchToken + correctedStPos + 1]; startsWithUppercase = StringTools.startsWithUppercase(firstMatchTokenObj .getToken()); } int fromPos = tokens[firstMatchToken + correctedStPos].getStartPos(); // FIXME: this is fishy, assumes that comma should always come before // whitespace if (errMessage.contains(SUGG_TAG + ",") && firstMatchToken + correctedStPos >= 1) { fromPos = tokens[firstMatchToken + correctedStPos - 1].getStartPos() + tokens[firstMatchToken + correctedStPos - 1].getToken().length(); } final int toPos = tokens[lastMatchToken + correctedEndPos].getStartPos() + tokens[lastMatchToken + correctedEndPos].getToken().length(); if (fromPos < toPos) { // this can happen with some skip="-1" when the last // token is not matched return new RuleMatch(this, fromPos, toPos, errMessage, shortMessage, startsWithUppercase); } // failed to create any rule match... return null; } /** * Checks if the suggestion starts with a match that is supposed to convert * case. If it does, stop the default conversion to uppercase. * * @return true, if the match converts the case of the token. */ private boolean matchConvertsCase() { if (suggestionMatches != null && !suggestionMatches.isEmpty()) { final int sugStart = message.indexOf(SUGG_TAG) + SUGG_TAG.length(); for (Match sMatch : suggestionMatches) { if (!sMatch.isInMessageOnly() && sMatch.convertsCase() && message.charAt(sugStart) == '\\') { return true; } } } return false; } public final void addSuggestionMatch(final Match m) { if (suggestionMatches == null) { suggestionMatches = new ArrayList(); } suggestionMatches.add(m); } /** * Gets the index of the element indexed by i, adding any offsets because of * the phrases in the rule. * * @param i * Current element index. * @return int Index translated into XML element no. */ private int translateElementNo(final int i) { if (!useList || i < 0) { return i; } int j = 0; for (int k = 0; k < i; k++) { j += elementNo.get(k); } return j; } /** * Returns true when the token in the rule references a phrase composed of * many tokens. * * @param i * The index of the token. * @return true if the phrase is under the index, false otherwise. **/ private int phraseLen(final int i) { if (!useList || i > (elementNo.size() - 1)) { return 1; } return elementNo.get(i); } /** * Creates a Cartesian product of the arrays stored in the input array. * * @param input * Array of string arrays to combine. * @param output * Work array of strings. * @param r * Starting parameter (use 0 to get all combinations). * @param lang * Text language for adding spaces in some languages. * @return Combined array of @String. */ private static String[] combineLists(final String[][] input, final String[] output, final int r, final Language lang) { final List outputList = new ArrayList(); if (r == input.length) { final StringBuilder sb = new StringBuilder(); for (int k = 0; k < output.length; k++) { sb.append(output[k]); if (k < output.length - 1) { sb.append(StringTools.addSpace(output[k + 1], lang)); } } outputList.add(sb.toString()); } else { for (int c = 0; c < input[r].length; c++) { output[r] = input[r][c]; final String[] sList = combineLists(input, output, r + 1, lang); outputList.addAll(Arrays.asList(sList)); } } return outputList.toArray(new String[outputList.size()]); } /** * Concatenates the matches, and takes care of phrases (including inflection * using synthesis). * * @param start * Position of the element as referenced by match element in the * rule. * @param index * The index of the element found in the matching sentence. * @param tokenIndex * The position of the token in the AnalyzedTokenReadings array. * @param tokens * Array of @AnalyzedTokenReadings * @return @String[] Array of concatenated strings * @throws IOException * in case disk operations (used in synthesizer) go wrong. */ private String[] concatMatches(final int start, final int index, final int tokenIndex, final AnalyzedTokenReadings[] tokens, final int nextTokenPos) throws IOException { String[] finalMatch = null; if (suggestionMatches.get(start) != null) { final int len = phraseLen(index); if (len == 1) { final int skippedTokens = nextTokenPos - tokenIndex; suggestionMatches.get(start).setToken(tokens, tokenIndex - 1, skippedTokens); suggestionMatches.get(start).setSynthesizer(language.getSynthesizer()); finalMatch = suggestionMatches.get(start).toFinalString(); } else { final List matchList = new ArrayList(); for (int i = 0; i < len; i++) { final int skippedTokens = nextTokenPos - (tokenIndex + i); suggestionMatches.get(start).setToken(tokens, tokenIndex - 1 + i, skippedTokens); suggestionMatches.get(start) .setSynthesizer(language.getSynthesizer()); matchList.add(suggestionMatches.get(start).toFinalString()); } return combineLists(matchList.toArray(new String[matchList.size()][]), new String[matchList.size()], 0, language); } } return finalMatch; } /** * Replace back references generated with <match> and \\1 in message * using Match class, and take care of skipping. * * * @param tokenReadings * Array of AnalyzedTokenReadings that were matched against the * pattern * @param positions * Array of relative positions of matched tokens * @param firstMatchTok * Position of the first matched token * @param errorMsg * String containing suggestion markup * @return String Formatted message. * @throws IOException * **/ private String formatMatches(final AnalyzedTokenReadings[] tokenReadings, final int[] positions, final int firstMatchTok, final String errorMsg) throws IOException { String errorMessage = errorMsg; int matchCounter = 0; final int[] numbersToMatches = new int[errorMsg.length()]; boolean newWay = false; int errLen = errorMessage.length(); int errMarker = errorMessage.indexOf('\\'); boolean numberFollows = false; if (errMarker > 0 && errMarker < errLen - 1) { numberFollows = StringTools.isPositiveNumber(errorMessage .charAt(errMarker + 1)); } while (errMarker > 0 && numberFollows) { final int ind = errorMessage.indexOf('\\'); if (ind > 0 && StringTools.isPositiveNumber(errorMessage.charAt(ind + 1))) { int numLen = 1; while (ind + numLen < errorMessage.length() && StringTools.isPositiveNumber(errorMessage.charAt(ind + numLen))) { numLen++; } final int j = Integer.parseInt(errorMessage.substring(ind + 1, ind + numLen)) - 1; int repTokenPos = 0; int nextTokenPos = 0; for (int l = 0; l <= j; l++) { repTokenPos += positions[l]; } if (j <= positions.length) { nextTokenPos = firstMatchTok + repTokenPos + positions[j + 1]; } if (suggestionMatches != null) { if (matchCounter < suggestionMatches.size()) { numbersToMatches[j] = matchCounter; if (suggestionMatches.get(matchCounter) != null) { final String[] matches = concatMatches(matchCounter, j, firstMatchTok + repTokenPos, tokenReadings, nextTokenPos); final String leftSide = errorMessage.substring(0, ind); final String rightSide = errorMessage.substring(ind + numLen); if (matches.length == 1) { errorMessage = leftSide + matches[0] + rightSide; } else { errorMessage = formatMultipleSynthesis(matches, leftSide, rightSide); } matchCounter++; newWay = true; } } else { // FIXME: is this correct? this is how we deal with multiple matches suggestionMatches.add(suggestionMatches.get(numbersToMatches[j])); } } if (!newWay) { // in case elements weren't used (yet) errorMessage = errorMessage.replace("\\" + (j + 1), tokenReadings[firstMatchTok + repTokenPos - 1].getToken()); } } errMarker = errorMessage.indexOf('\\'); numberFollows = false; errLen = errorMessage.length(); if (errMarker > 0 && errMarker < errLen - 1) { numberFollows = StringTools.isPositiveNumber(errorMessage .charAt(errMarker + 1)); } } return errorMessage; } private static String formatMultipleSynthesis(final String[] matches, final String leftSide, final String rightSide) { String errorMessage = ""; String suggestionLeft = ""; String suggestionRight = ""; String rightSideNew = rightSide; final int sPos = leftSide.lastIndexOf(SUGG_TAG); if (sPos > 0) { suggestionLeft = leftSide.substring(sPos + SUGG_TAG.length()); } if (StringTools.isEmpty(suggestionLeft)) { errorMessage = leftSide; } else { errorMessage = leftSide.substring(0, leftSide.lastIndexOf(SUGG_TAG)) + SUGG_TAG; } final int rPos = rightSide.indexOf(END_SUGG_TAG); if (rPos > 0) { suggestionRight = rightSide.substring(0, rPos); } if (!StringTools.isEmpty(suggestionRight)) { rightSideNew = rightSide.substring(rightSide.indexOf(END_SUGG_TAG)); } final int lastLeftSugEnd = leftSide.indexOf(END_SUGG_TAG); final int lastLeftSugStart = leftSide.lastIndexOf(SUGG_TAG); final StringBuilder sb = new StringBuilder(); sb.append(errorMessage); for (int z = 0; z < matches.length; z++) { sb.append(suggestionLeft); sb.append(matches[z]); sb.append(suggestionRight); if ((z < matches.length - 1) && lastLeftSugEnd < lastLeftSugStart) { sb.append(END_SUGG_TAG); sb.append(", "); sb.append(SUGG_TAG); } } sb.append(rightSideNew); return sb.toString(); } /** * For testing only. */ public final List getElements() { return patternElements; } }