diff options
Diffstat (limited to 'JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/Match.java')
-rw-r--r-- | JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/Match.java | 551 |
1 files changed, 551 insertions, 0 deletions
/* LanguageTool, a natural language style checker
 * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
 * USA
 */
package de.danielnaber.languagetool.rules.patterns;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.TreeSet;
import java.util.regex.Pattern;

import de.danielnaber.languagetool.AnalyzedToken;
import de.danielnaber.languagetool.AnalyzedTokenReadings;
import de.danielnaber.languagetool.JLanguageTool;
import de.danielnaber.languagetool.synthesis.Synthesizer;
import de.danielnaber.languagetool.tools.StringTools;

/**
 * Reference to a matched token in a pattern, can be formatted and used for
 * matching &amp; suggestions.
 *
 * @author Marcin Miłkowski
 */
public class Match {

  /** Possible string case conversions. **/
  public enum CaseConversion {
    NONE, STARTLOWER, STARTUPPER, ALLLOWER, ALLUPPER;

    /**
     * Converts string to the constant enum.
     *
     * @param str
     *          String value to be converted.
     * @return CaseConversion enum, or {@link #NONE} when {@code str} is
     *         {@code null} or does not name a constant.
     */
    public static CaseConversion toCase(final String str) {
      if (str == null) {
        return NONE;
      }
      try {
        return valueOf(str);
      } catch (final IllegalArgumentException ignored) {
        // Not the name of a constant: fall back to "no conversion".
        return NONE;
      }
    }
  }

  /** Possible ways of including skipped tokens in the formatted output. **/
  public enum IncludeRange {
    NONE, FOLLOWING, ALL;

    /**
     * Converts string to the constant enum.
     *
     * @param str
     *          String value to be converted.
     * @return IncludeRange enum, or {@link #NONE} when {@code str} is
     *         {@code null} or does not name a constant.
     */
    public static IncludeRange toRange(final String str) {
      if (str == null) {
        return NONE;
      }
      try {
        return valueOf(str);
      } catch (final IllegalArgumentException ignored) {
        // Not the name of a constant: fall back to "include nothing".
        return NONE;
      }
    }
  }

  private final String posTag;
  private boolean postagRegexp;
  private final String regexReplace;
  private final String posTagReplace;
  private final CaseConversion caseConversionType;

  private final IncludeRange includeSkipped;
  private String skippedTokens;

  /**
   * True if this match element formats a statically defined lemma which is
   * enclosed by the element, e.g., {@code <match...>word</match>}.
   */
  private boolean staticLemma;

  /**
   * True if this match element is used for formatting POS token.
   */
  private final boolean setPos;

  private AnalyzedTokenReadings formattedToken;
  private AnalyzedTokenReadings matchedToken;

  private int tokenRef;

  /** Word form generator for POS tags. **/
  private Synthesizer synthesizer;

  /** Pattern used to define parts of the matched token. **/
  private Pattern pRegexMatch;

  /** Pattern used to define parts of the matched POS token. **/
  private Pattern pPosRegexMatch;

  /**
   * True when the match is not in the suggestion.
   */
  private boolean inMessageOnly;

  /**
   * Creates a match element.
   *
   * @param posTag POS tag (or POS tag regular expression) used by the match,
   *          may be {@code null}.
   * @param posTagReplace Replacement applied to POS tags matching
   *          {@code posTag}, may be {@code null}.
   * @param postagRegexp True if {@code posTag} is a regular expression.
   * @param regexMatch Regular expression applied to the token text, may be
   *          {@code null}.
   * @param regexReplace Replacement used together with {@code regexMatch}.
   * @param caseConversionType Case conversion applied to the token text.
   * @param setPOS True if the match is used for setting the POS tag.
   * @param includeSkipped Which skipped tokens are appended to the output.
   */
  public Match(final String posTag, final String posTagReplace,
      final boolean postagRegexp, final String regexMatch,
      final String regexReplace, final CaseConversion caseConversionType,
      final boolean setPOS,
      final IncludeRange includeSkipped) {
    this.posTag = posTag;
    this.postagRegexp = postagRegexp;
    this.caseConversionType = caseConversionType;

    if (regexMatch != null) {
      pRegexMatch = Pattern.compile(regexMatch);
    }
    if (postagRegexp && posTag != null) {
      pPosRegexMatch = Pattern.compile(posTag);
    }

    this.regexReplace = regexReplace;
    this.posTagReplace = posTagReplace;
    this.setPos = setPOS;
    this.includeSkipped = includeSkipped;
  }

  /**
   * Sets the token that will be formatted or otherwise used in the class.
   * When a static lemma has been set, the token is stored as the matched
   * token and the lemma remains the token to be formatted.
   *
   * @param token Token to be used.
   */
  public final void setToken(final AnalyzedTokenReadings token) {
    if (staticLemma) {
      matchedToken = token;
    } else {
      formattedToken = token;
    }
  }

  /**
   * Sets the token to be formatted etc. and includes the support for
   * including the skipped tokens.
   *
   * @param tokens Array of tokens
   * @param index Index of the token to be formatted
   * @param next Position of the next token (the skipped tokens
   *          are the ones between the tokens[index] and tokens[next])
   */
  public final void setToken(final AnalyzedTokenReadings[] tokens, final int index, final int next) {
    setToken(tokens[index]);
    if (next > 1 && includeSkipped != IncludeRange.NONE) {
      final StringBuilder sb = new StringBuilder();
      if (includeSkipped == IncludeRange.FOLLOWING) {
        // Only the skipped tokens are wanted, not the referenced token itself.
        formattedToken = null;
      }
      for (int k = index + 1; k < index + next; k++) {
        // No separator is emitted before the first skipped token.
        if (k > index + 1
            && tokens[k].isWhitespaceBefore()) {
          sb.append(' ');
        }
        sb.append(tokens[k].getToken());
      }
      skippedTokens = sb.toString();
    } else {
      skippedTokens = "";
    }
  }

  /**
   * Checks if the Match element is used for setting the part of speech Element.
   *
   * @return True if Match sets POS.
   */
  public final boolean setsPos() {
    return setPos;
  }

  /**
   * Checks if the Match element uses regexp-based form of the POS tag.
   *
   * @return True if regexp is used in POS.
   */
  public final boolean posRegExp() {
    return postagRegexp;
  }

  /**
   * Sets a base form (lemma) that will be formatted, or synthesized, using the
   * specified POS regular expressions. Empty strings are ignored.
   *
   * @param lemmaString String that specifies the base form.
   */
  public final void setLemmaString(final String lemmaString) {
    if (!StringTools.isEmpty(lemmaString)) {
      formattedToken = new AnalyzedTokenReadings(new AnalyzedToken(lemmaString,
          posTag, lemmaString), 0);
      staticLemma = true;
      // A static lemma always treats the POS tag as a regular expression.
      postagRegexp = true;
      if (posTag != null) {
        pPosRegexMatch = Pattern.compile(posTag);
      }
    }
  }

  /**
   * Sets a synthesizer used for grammatical synthesis of forms based on
   * formatted POS values.
   *
   * @param synth Synthesizer class.
   */
  public final void setSynthesizer(final Synthesizer synth) {
    synthesizer = synth;
  }

  /**
   * Gets all strings formatted using the match element.
   *
   * @return array of strings
   * @throws IOException
   *           in case of synthesizer-related disk problems.
   */
  public final String[] toFinalString() throws IOException {
    String[] formattedString = new String[1];
    if (formattedToken != null) {
      final int readingCount = formattedToken.getReadingsLength();
      formattedString[0] = formattedToken.getToken();
      if (pRegexMatch != null) {
        formattedString[0] = pRegexMatch.matcher(formattedString[0])
            .replaceAll(regexReplace);
      }
      formattedString[0] = convertCase(formattedString[0]);
      if (posTag != null) {
        if (synthesizer == null) {
          // Without a synthesizer the unmodified token text is used.
          formattedString[0] = formattedToken.getToken();
        } else if (postagRegexp) {
          final TreeSet<String> wordForms = new TreeSet<String>();
          // oneForm is set when the last lemma-less reading is untagged or
          // carries only a sentence/paragraph boundary tag; synthesis is
          // skipped in that case.
          boolean oneForm = false;
          for (int k = 0; k < readingCount; k++) {
            if (formattedToken.getAnalyzedToken(k).getLemma() == null) {
              final String posUnique = formattedToken.getAnalyzedToken(k)
                  .getPOSTag();
              if (posUnique == null) {
                wordForms.add(formattedToken.getToken());
                oneForm = true;
              } else {
                if (JLanguageTool.SENTENCE_START_TAGNAME.equals(posUnique)
                    || JLanguageTool.SENTENCE_END_TAGNAME.equals(posUnique)
                    || JLanguageTool.PARAGRAPH_END_TAGNAME.equals(posUnique)) {
                  if (!oneForm) {
                    wordForms.add(formattedToken.getToken());
                  }
                  oneForm = true;
                } else {
                  oneForm = false;
                }
              }
            }
          }
          final String targetPosTag = getTargetPosTag();
          if (!oneForm) {
            for (int i = 0; i < readingCount; i++) {
              final String[] possibleWordForms = synthesizer.synthesize(
                  formattedToken.getAnalyzedToken(i), targetPosTag, true);
              if (possibleWordForms != null) {
                wordForms.addAll(Arrays.asList(possibleWordForms));
              }
            }
          }
          if (wordForms.isEmpty()) {
            // Nothing could be synthesized: return the token in parentheses.
            formattedString[0] = "(" + formattedToken.getToken() + ")";
          } else {
            formattedString = wordForms.toArray(new String[wordForms.size()]);
          }
        } else {
          final TreeSet<String> wordForms = new TreeSet<String>();
          for (int i = 0; i < readingCount; i++) {
            final String[] possibleWordForms = synthesizer.synthesize(
                formattedToken.getAnalyzedToken(i), posTag);
            if (possibleWordForms != null) {
              wordForms.addAll(Arrays.asList(possibleWordForms));
            }
          }
          formattedString = wordForms.toArray(new String[wordForms.size()]);
        }
      }
    }
    if (includeSkipped != IncludeRange.NONE
        && skippedTokens != null && !"".equals(skippedTokens)) {
      // Append the skipped tokens to every generated form.
      final String[] helper = new String[formattedString.length];
      for (int i = 0; i < formattedString.length; i++) {
        if (formattedString[i] == null) {
          formattedString[i] = "";
        }
        helper[i] = formattedString[i] + skippedTokens;
      }
      formattedString = helper;
    }
    return formattedString;
  }

  /**
   * Format POS tag using parameters already defined in the class.
   * <p>
   * For a static lemma, the last POS tag of the matched token that matches
   * the POS pattern is used (falling back to the match's own POS tag) and the
   * POS replacement, if any, is applied to it. Otherwise all matching POS
   * tags of the formatted token are transformed with the POS replacement and
   * joined with "|"; when no replacement is defined, the last matching tag
   * (or the match's own POS tag) is returned unchanged.
   * <p>
   * A missing POS pattern, token or synthesizer is tolerated: the
   * corresponding step is skipped instead of failing.
   *
   * @return Formatted POS tag as String.
   */
  // FIXME: gets only the first POS tag that matches, this can be wrong
  // on the other hand, many POS tags = too many suggestions?
  public final String getTargetPosTag() {
    final boolean canReplace = pPosRegexMatch != null && posTagReplace != null;
    if (staticLemma) {
      final List<String> matching = collectMatchingPosTags(matchedToken);
      String targetPosTag = lastOrDefault(matching, posTag);
      if (canReplace) {
        targetPosTag = pPosRegexMatch.matcher(targetPosTag).replaceAll(
            posTagReplace);
      }
      return targetPosTag;
    }
    final List<String> posTags = collectMatchingPosTags(formattedToken);
    if (!canReplace) {
      return lastOrDefault(posTags, posTag);
    }
    if (posTags.isEmpty()) {
      // posTag is non-null here: the POS pattern is only ever compiled from it.
      posTags.add(posTag);
    }
    final StringBuilder sb = new StringBuilder();
    final int posTagLen = posTags.size();
    int l = 0;
    for (String lposTag : posTags) {
      l++;
      lposTag = pPosRegexMatch.matcher(lposTag).replaceAll(posTagReplace);
      if (setPos && synthesizer != null) {
        lposTag = synthesizer.getPosTagCorrection(lposTag);
      }
      sb.append(lposTag);
      if (l < posTagLen) {
        sb.append('|');
      }
    }
    return sb.toString();
  }

  /**
   * Collects, in reading order, the POS tags of the given token that fully
   * match the POS pattern.
   *
   * @param readings Token whose readings are scanned, may be {@code null}.
   * @return Mutable list of matching POS tags; empty when the token or the
   *         POS pattern is missing.
   */
  private List<String> collectMatchingPosTags(final AnalyzedTokenReadings readings) {
    final List<String> matching = new ArrayList<String>();
    if (readings == null || pPosRegexMatch == null) {
      return matching;
    }
    final int numRead = readings.getReadingsLength();
    for (int i = 0; i < numRead; i++) {
      final String tag = readings.getAnalyzedToken(i).getPOSTag();
      if (tag != null && pPosRegexMatch.matcher(tag).matches()) {
        matching.add(tag);
      }
    }
    return matching;
  }

  /** Returns the last element of the list, or the fallback if it is empty. */
  private static String lastOrDefault(final List<String> values, final String fallback) {
    return values.isEmpty() ? fallback : values.get(values.size() - 1);
  }

  /**
   * Method for getting the formatted match as a single string. In case of
   * multiple matches, it joins them using a regular expression operator "|".
   *
   * @return Formatted string of the matched token.
   * @throws IOException
   *           in case of synthesizer-related disk problems.
   */
  public final String toTokenString() throws IOException {
    final StringBuilder output = new StringBuilder();
    final String[] stringToFormat = toFinalString();
    for (int i = 0; i < stringToFormat.length; i++) {
      output.append(stringToFormat[i]);
      if (i + 1 < stringToFormat.length) {
        output.append('|');
      }
    }
    return output.toString();
  }

  /**
   * Sets the token number referenced by the match.
   *
   * @param i Token number.
   */
  public final void setTokenRef(final int i) {
    tokenRef = i;
  }

  /**
   * Gets the token number referenced by the match.
   *
   * @return int - token number.
   */
  public final int getTokenRef() {
    return tokenRef;
  }

  /**
   * Converts case of the string token according to match element attributes.
   *
   * @param s Token to be converted.
   * @return Converted string; empty input is returned unchanged.
   */
  private String convertCase(final String s) {
    if (StringTools.isEmpty(s)) {
      return s;
    }
    String token = s;
    switch (caseConversionType) {
      case NONE:
        break;
      case STARTLOWER:
        token = token.substring(0, 1).toLowerCase() + token.substring(1);
        break;
      case STARTUPPER:
        token = token.substring(0, 1).toUpperCase() + token.substring(1);
        break;
      case ALLUPPER:
        token = token.toUpperCase();
        break;
      case ALLLOWER:
        token = token.toLowerCase();
        break;
      default:
        break;
    }
    return token;
  }

  /**
   * Used to let LT know that it should change the case of the match.
   *
   * @return true if match converts the case of the token.
   */
  public final boolean convertsCase() {
    return !caseConversionType.equals(CaseConversion.NONE);
  }

  /**
   * Builds the readings of the formatted token after applying the token
   * regular expression, the case conversion and the POS settings of this
   * match.
   *
   * @return Token with the new readings, or the formatted token itself when
   *         no new reading was produced (this may be {@code null} if no
   *         token has been set).
   */
  public final AnalyzedTokenReadings filterReadings() {
    final ArrayList<AnalyzedToken> l = new ArrayList<AnalyzedToken>();
    if (formattedToken != null) {
      if (staticLemma) {
        // Combine the surface form of the matched token with the static lemma.
        // NOTE(review): this overwrites formattedToken, so a repeated call
        // would take the matched surface form as the lemma - confirm callers
        // invoke this only once per lemma.
        formattedToken = new AnalyzedTokenReadings(new AnalyzedToken(
            matchedToken.getToken(), posTag, formattedToken.getToken()),
            matchedToken.getStartPos());
        formattedToken.setWhitespaceBefore(matchedToken.isWhitespaceBefore());
      }
      String token = formattedToken.getToken();
      if (pRegexMatch != null) {
        token = pRegexMatch.matcher(token).replaceAll(regexReplace);
      }
      token = convertCase(token);
      if (posTag != null) {
        final int numRead = formattedToken.getReadingsLength();
        if (postagRegexp) {
          String targetPosTag = posTag;
          for (int i = 0; i < numRead; i++) {
            final String tst = formattedToken.getAnalyzedToken(i).getPOSTag();
            if (tst != null && pPosRegexMatch.matcher(tst).matches()) {
              targetPosTag = formattedToken.getAnalyzedToken(i).getPOSTag();
              if (posTagReplace != null) {
                targetPosTag = pPosRegexMatch.matcher(targetPosTag).replaceAll(
                    posTagReplace);
              }
              l.add(new AnalyzedToken(token, targetPosTag, formattedToken
                  .getAnalyzedToken(i).getLemma()));
              l.get(l.size() - 1).setWhitespaceBefore(formattedToken.isWhitespaceBefore());
            }
          }
          if (l.isEmpty()) {
            // No reading matched the POS pattern: assign posTag directly.
            for (final AnalyzedToken anaTok : getNewToken(numRead, token)) {
              l.add(anaTok);
            }
          }
        } else {
          for (final AnalyzedToken anaTok : getNewToken(numRead, token)) {
            l.add(anaTok);
          }
        }
        // Keep the sentence/paragraph end markers of the original token.
        if (formattedToken.isSentEnd()) {
          l.add(new AnalyzedToken(formattedToken.getToken(),
              JLanguageTool.SENTENCE_END_TAGNAME,
              formattedToken.getAnalyzedToken(0).getLemma()));
        }
        if (formattedToken.isParaEnd()) {
          l.add(new AnalyzedToken(formattedToken.getToken(),
              JLanguageTool.PARAGRAPH_END_TAGNAME,
              formattedToken.getAnalyzedToken(0).getLemma()));
        }
      }
    }
    if (l.isEmpty()) {
      return formattedToken;
    }
    return new AnalyzedTokenReadings(l.toArray(new AnalyzedToken[l.size()]), formattedToken.getStartPos());
  }

  /**
   * Creates one new reading tagged with {@code posTag} for every reading of
   * the formatted token that has a POS tag.
   *
   * @param numRead Number of readings of the formatted token to inspect.
   * @param token Token text used for the new readings.
   * @return The new readings.
   */
  private AnalyzedToken[] getNewToken(final int numRead, final String token) {
    final List<AnalyzedToken> list = new ArrayList<AnalyzedToken>();
    // Kept across iterations: once a lemma has been found it is reused for
    // the following readings.
    String lemma = "";
    for (int j = 0; j < numRead; j++) {
      if (formattedToken.getAnalyzedToken(j).getPOSTag() != null) {
        if (formattedToken.getAnalyzedToken(j).getPOSTag().equals(posTag)
            && (formattedToken.getAnalyzedToken(j).getLemma() != null)) {
          lemma = formattedToken.getAnalyzedToken(j).getLemma();
        }
        if (StringTools.isEmpty(lemma)) {
          lemma = formattedToken.getAnalyzedToken(0).getLemma();
        }
        list.add(new AnalyzedToken(token, posTag, lemma));
        list.get(list.size() - 1).
          setWhitespaceBefore(formattedToken.isWhitespaceBefore());
      }
    }
    return list.toArray(new AnalyzedToken[list.size()]);
  }

  /**
   * @param inMessageOnly
   *          the inMessageOnly to set
   */
  public void setInMessageOnly(final boolean inMessageOnly) {
    this.inMessageOnly = inMessageOnly;
  }

  /**
   * @return the inMessageOnly
   */
  public boolean isInMessageOnly() {
    return inMessageOnly;
  }

}