summaryrefslogtreecommitdiffstats
path: root/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/Match.java
diff options
context:
space:
mode:
Diffstat (limited to 'JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/Match.java')
-rw-r--r--JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/Match.java551
1 files changed, 551 insertions, 0 deletions
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/Match.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/Match.java
new file mode 100644
index 0000000..0519f2c
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/Match.java
@@ -0,0 +1,551 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+package de.danielnaber.languagetool.rules.patterns;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.TreeSet;
+import java.util.regex.Pattern;
+
+import de.danielnaber.languagetool.AnalyzedToken;
+import de.danielnaber.languagetool.AnalyzedTokenReadings;
+import de.danielnaber.languagetool.JLanguageTool;
+import de.danielnaber.languagetool.synthesis.Synthesizer;
+import de.danielnaber.languagetool.tools.StringTools;
+
+/**
+ * Reference to a matched token in a pattern, can be formatted and used for
+ * matching & suggestions.
+ *
+ * @author Marcin MiƂkowski
+ */
+public class Match {
+
+ /** Possible string case conversions. **/
+ public enum CaseConversion {
+ NONE, STARTLOWER, STARTUPPER, ALLLOWER, ALLUPPER;
+
+ /**
+ * Converts string to the constant enum.
+ *
+ * @param str
+ * String value to be converted.
+ * @return CaseConversion enum.
+ */
+ public static CaseConversion toCase(final String str) {
+ try {
+ return valueOf(str);
+ } catch (final Exception ex) {
+ return NONE;
+ }
+ }
+ }
+
+ public enum IncludeRange {
+ NONE, FOLLOWING, ALL;
+
+ /**
+ * Converts string to the constant enum.
+ *
+ * @param str
+ * String value to be converted.
+ * @return IncludeRange enum.
+ */
+ public static IncludeRange toRange(final String str) {
+ try {
+ return valueOf(str);
+ } catch (final Exception ex) {
+ return NONE;
+ }
+ }
+ }
+
+ private final String posTag;
+ private boolean postagRegexp;
+ private final String regexReplace;
+ private final String posTagReplace;
+ private final CaseConversion caseConversionType;
+
+ private final IncludeRange includeSkipped;
+ private String skippedTokens;
+
+ /**
+ * True if this match element formats a statically defined lemma which is
+ * enclosed by the element, e.g., <tt>&lt;match...&gt;word&lt;/word&gt;</tt>.
+ */
+ private boolean staticLemma;
+
+ /**
+ * True if this match element is used for formatting POS token.
+ */
+ private final boolean setPos;
+
+ private AnalyzedTokenReadings formattedToken;
+ private AnalyzedTokenReadings matchedToken;
+
+ private int tokenRef;
+
+ /** Word form generator for POS tags. **/
+ private Synthesizer synthesizer;
+
+ /** Pattern used to define parts of the matched token. **/
+ private Pattern pRegexMatch;
+
+ /** Pattern used to define parts of the matched POS token. **/
+ private Pattern pPosRegexMatch;
+
+ /**
+ * True when the match is not in the suggestion.
+ */
+ private boolean inMessageOnly;
+
+ public Match(final String posTag, final String posTagReplace,
+ final boolean postagRegexp, final String regexMatch,
+ final String regexReplace, final CaseConversion caseConversionType,
+ final boolean setPOS,
+ final IncludeRange includeSkipped) {
+ this.posTag = posTag;
+ this.postagRegexp = postagRegexp;
+ this.caseConversionType = caseConversionType;
+
+ if (regexMatch != null) {
+ pRegexMatch = Pattern.compile(regexMatch);
+ }
+ if (postagRegexp && posTag != null) {
+ pPosRegexMatch = Pattern.compile(posTag);
+ }
+
+ this.regexReplace = regexReplace;
+ this.posTagReplace = posTagReplace;
+ this.setPos = setPOS;
+ this.includeSkipped = includeSkipped;
+ }
+
+ /**
+ * Sets the token that will be formatted or otherwise used in the class.
+ */
+ public final void setToken(final AnalyzedTokenReadings token) {
+ if (staticLemma) {
+ matchedToken = token;
+ } else {
+ formattedToken = token;
+ }
+ }
+
+ /**
+ * Sets the token to be formatted etc. and includes the support for
+ * including the skipped tokens.
+ * @param tokens Array of tokens
+ * @param index Index of the token to be formatted
+ * @param next Position of the next token (the skipped tokens
+ * are the ones between the tokens[index] and tokens[next]
+ */
+ public final void setToken(final AnalyzedTokenReadings[] tokens, final int index, final int next) {
+ setToken(tokens[index]);
+ if (next > 1 && includeSkipped != IncludeRange.NONE) {
+ final StringBuilder sb = new StringBuilder();
+ if (includeSkipped == IncludeRange.FOLLOWING) {
+ formattedToken = null;
+ }
+ for (int k = index + 1; k < index + next; k++) {
+ if (k > index + 1 &&
+ tokens[k].isWhitespaceBefore()) {
+ sb.append(' ');
+ }
+ sb.append(tokens[k].getToken());
+ }
+ skippedTokens = sb.toString();
+ } else {
+ skippedTokens = "";
+ }
+ }
+
+ /**
+ private String[] addSkipped(final String[] formattedString) {
+ if (skippedTokens != null && !"".equals(skippedTokens)) {
+ String[] finalStrings = new String[formattedString.length];
+ for (int i = 1; i <= formattedString.length; i++)
+ }
+ }
+
+ **/
+
+ /**
+ * Checks if the Match element is used for setting the part of speech Element.
+ *
+ * @return True if Match sets POS.
+ */
+ public final boolean setsPos() {
+ return setPos;
+ }
+
+ /**
+ * Checks if the Match element uses regexp-based form of the POS tag.
+ *
+ * @return True if regexp is used in POS.
+ */
+ public final boolean posRegExp() {
+ return postagRegexp;
+ }
+
+ /**
+ * Sets a base form (lemma) that will be formatted, or synthesized, using the
+ * specified POS regular expressions.
+ *
+ * @param lemmaString String that specifies the base form.
+ */
+ public final void setLemmaString(final String lemmaString) {
+ if (!StringTools.isEmpty(lemmaString)) {
+ formattedToken = new AnalyzedTokenReadings(new AnalyzedToken(lemmaString,
+ posTag, lemmaString), 0);
+ staticLemma = true;
+ postagRegexp = true;
+ if (posTag != null) {
+ pPosRegexMatch = Pattern.compile(posTag);
+ }
+ }
+ }
+
+ /**
+ * Sets a synthesizer used for grammatical synthesis of forms based on
+ * formatted POS values.
+ *
+ * @param synth Synthesizer class.
+ */
+ public final void setSynthesizer(final Synthesizer synth) {
+ synthesizer = synth;
+ }
+
+ /**
+ * Gets all strings formatted using the match element.
+ *
+ * @return array of strings
+ * @throws IOException
+ * in case of synthesizer-related disk problems.
+ */
+ public final String[] toFinalString() throws IOException {
+ String[] formattedString = new String[1];
+ if (formattedToken != null) {
+ final int readingCount = formattedToken.getReadingsLength();
+ formattedString[0] = formattedToken.getToken();
+ if (pRegexMatch != null) {
+ formattedString[0] = pRegexMatch.matcher(formattedString[0])
+ .replaceAll(regexReplace);
+ }
+ formattedString[0] = convertCase(formattedString[0]);
+ if (posTag != null) {
+ if (synthesizer == null) {
+ formattedString[0] = formattedToken.getToken();
+ } else if (postagRegexp) {
+ final TreeSet<String> wordForms = new TreeSet<String>();
+ boolean oneForm = false;
+ for (int k = 0; k < readingCount; k++) {
+ if (formattedToken.getAnalyzedToken(k).getLemma() == null) {
+ final String posUnique = formattedToken.getAnalyzedToken(k)
+ .getPOSTag();
+ if (posUnique == null) {
+ wordForms.add(formattedToken.getToken());
+ oneForm = true;
+ } else {
+ if (JLanguageTool.SENTENCE_START_TAGNAME.equals(posUnique)
+ || JLanguageTool.SENTENCE_END_TAGNAME.equals(posUnique)
+ || JLanguageTool.PARAGRAPH_END_TAGNAME.equals(posUnique)) {
+ if (!oneForm) {
+ wordForms.add(formattedToken.getToken());
+ }
+ oneForm = true;
+ } else {
+ oneForm = false;
+ }
+ }
+ }
+ }
+ final String targetPosTag = getTargetPosTag();
+ if (!oneForm) {
+ for (int i = 0; i < readingCount; i++) {
+ final String[] possibleWordForms = synthesizer.synthesize(
+ formattedToken.getAnalyzedToken(i), targetPosTag, true);
+ if (possibleWordForms != null) {
+ wordForms.addAll(Arrays.asList(possibleWordForms));
+ }
+ }
+ }
+ if (wordForms.isEmpty()) {
+ formattedString[0] = "(" + formattedToken.getToken() + ")";
+ } else {
+ formattedString = wordForms.toArray(new String[wordForms.size()]);
+ }
+ } else {
+ final TreeSet<String> wordForms = new TreeSet<String>();
+ for (int i = 0; i < readingCount; i++) {
+ final String[] possibleWordForms = synthesizer.synthesize(
+ formattedToken.getAnalyzedToken(i), posTag);
+ if (possibleWordForms != null) {
+ wordForms.addAll(Arrays.asList(possibleWordForms));
+ }
+ }
+ formattedString = wordForms.toArray(new String[wordForms.size()]);
+ }
+ }
+ }
+ if (includeSkipped != IncludeRange.NONE
+ && skippedTokens != null && !"".equals(skippedTokens)) {
+ final String[] helper = new String[formattedString.length];
+ for (int i = 0; i < formattedString.length; i++) {
+ if (formattedString[i] == null) {
+ formattedString[i] = "";
+ }
+ helper[i] = formattedString[i] + skippedTokens;
+ }
+ formattedString = helper;
+ }
+ return formattedString;
+ }
+
+ /**
+ * Format POS tag using parameters already defined in the class.
+ *
+ * @return Formatted POS tag as String.
+ */
+ // FIXME: gets only the first POS tag that matches, this can be wrong
+ // on the other hand, many POS tags = too many suggestions?
+ public final String getTargetPosTag() {
+ String targetPosTag = posTag;
+ final List<String> posTags = new ArrayList<String>();
+ if (staticLemma) {
+ final int numRead = matchedToken.getReadingsLength();
+ for (int i = 0; i < numRead; i++) {
+ final String tst = matchedToken.getAnalyzedToken(i).getPOSTag();
+ if (tst != null && pPosRegexMatch.matcher(tst).matches()) {
+ targetPosTag = matchedToken.getAnalyzedToken(i).getPOSTag();
+ posTags.add(targetPosTag);
+ }
+ }
+ if (pPosRegexMatch != null && posTagReplace != null) {
+ targetPosTag = pPosRegexMatch.matcher(targetPosTag).replaceAll(
+ posTagReplace);
+ }
+ } else {
+ final int numRead = formattedToken.getReadingsLength();
+ for (int i = 0; i < numRead; i++) {
+ final String tst = formattedToken.getAnalyzedToken(i).getPOSTag();
+ if (tst != null && pPosRegexMatch.matcher(tst).matches()) {
+ targetPosTag = formattedToken.getAnalyzedToken(i).getPOSTag();
+ posTags.add(targetPosTag);
+ }
+ }
+ if (pPosRegexMatch != null && posTagReplace != null) {
+ if (posTags.isEmpty()) {
+ posTags.add(targetPosTag);
+ }
+ final StringBuilder sb = new StringBuilder();
+ final int posTagLen = posTags.size();
+ int l = 0;
+ for (String lposTag : posTags) {
+ l++;
+ lposTag = pPosRegexMatch.matcher(lposTag).replaceAll(posTagReplace);
+ if (setPos) {
+ lposTag = synthesizer.getPosTagCorrection(lposTag);
+ }
+ sb.append(lposTag);
+ if (l < posTagLen) {
+ sb.append('|');
+ }
+ }
+ targetPosTag = sb.toString();
+ }
+ }
+ return targetPosTag;
+ }
+
+ /**
+ * Method for getting the formatted match as a single string. In case of
+ * multiple matches, it joins them using a regular expression operator "|".
+ *
+ * @return Formatted string of the matched token.
+ */
+ public final String toTokenString() throws IOException {
+ final StringBuilder output = new StringBuilder();
+ final String[] stringToFormat = toFinalString();
+ for (int i = 0; i < stringToFormat.length; i++) {
+ output.append(stringToFormat[i]);
+ if (i + 1 < stringToFormat.length) {
+ output.append('|');
+ }
+ }
+ return output.toString();
+ }
+
+ /**
+ * Sets the token number referenced by the match.
+ *
+ * @param i Token number.
+ */
+ public final void setTokenRef(final int i) {
+ tokenRef = i;
+ }
+
+ /**
+ * Gets the token number referenced by the match.
+ *
+ * @return int - token number.
+ */
+ public final int getTokenRef() {
+ return tokenRef;
+ }
+
+ /**
+ * Converts case of the string token according to match element attributes.
+ *
+ * @param s Token to be converted.
+ * @return Converted string.
+ */
+ private String convertCase(final String s) {
+ if (StringTools.isEmpty(s)) {
+ return s;
+ }
+ String token = s;
+ switch (caseConversionType) {
+ case NONE:
+ break;
+ case STARTLOWER:
+ token = token.substring(0, 1).toLowerCase() + token.substring(1);
+ break;
+ case STARTUPPER:
+ token = token.substring(0, 1).toUpperCase() + token.substring(1);
+ break;
+ case ALLUPPER:
+ token = token.toUpperCase();
+ break;
+ case ALLLOWER:
+ token = token.toLowerCase();
+ break;
+ default:
+ break;
+ }
+ return token;
+ }
+
+ /**
+ * Used to let LT know that it should change the case of the match.
+ *
+ * @return true if match converts the case of the token.
+ */
+ public final boolean convertsCase() {
+ return !caseConversionType.equals(CaseConversion.NONE);
+ }
+
+ public final AnalyzedTokenReadings filterReadings() {
+ final ArrayList<AnalyzedToken> l = new ArrayList<AnalyzedToken>();
+ if (formattedToken != null) {
+ if (staticLemma) {
+ formattedToken = new AnalyzedTokenReadings(new AnalyzedToken(
+ matchedToken.getToken(), posTag, formattedToken.getToken()),
+ matchedToken.getStartPos());
+ formattedToken.setWhitespaceBefore(matchedToken.isWhitespaceBefore());
+ }
+ String token = formattedToken.getToken();
+ if (pRegexMatch != null) {
+ token = pRegexMatch.matcher(token).replaceAll(regexReplace);
+ }
+ token = convertCase(token);
+ if (posTag != null) {
+ final int numRead = formattedToken.getReadingsLength();
+ if (postagRegexp) {
+ String targetPosTag = posTag;
+ for (int i = 0; i < numRead; i++) {
+ final String tst = formattedToken.getAnalyzedToken(i).getPOSTag();
+ if (tst != null && pPosRegexMatch.matcher(tst).matches()) {
+ targetPosTag = formattedToken.getAnalyzedToken(i).getPOSTag();
+ if (posTagReplace != null) {
+ targetPosTag = pPosRegexMatch.matcher(targetPosTag).replaceAll(
+ posTagReplace);
+ }
+ l
+ .add(new AnalyzedToken(token, targetPosTag, formattedToken
+ .getAnalyzedToken(i).getLemma()));
+ l.get(l.size() - 1).setWhitespaceBefore(formattedToken.isWhitespaceBefore());
+ }
+ }
+ if (l.isEmpty()) {
+ for (final AnalyzedToken anaTok : getNewToken(numRead, token)) {
+ l.add(anaTok);
+ }
+ }
+ } else {
+ for (final AnalyzedToken anaTok : getNewToken(numRead, token)) {
+ l.add(anaTok);
+ }
+ }
+ if (formattedToken.isSentEnd()) {
+ l.add(new AnalyzedToken(formattedToken.getToken(),
+ JLanguageTool.SENTENCE_END_TAGNAME,
+ formattedToken.getAnalyzedToken(0).getLemma()));
+ }
+ if (formattedToken.isParaEnd()) {
+ l.add(new AnalyzedToken(formattedToken.getToken(),
+ JLanguageTool.PARAGRAPH_END_TAGNAME,
+ formattedToken.getAnalyzedToken(0).getLemma()));
+ }
+ }
+ }
+ if (l.isEmpty()) {
+ return formattedToken;
+ }
+ return new AnalyzedTokenReadings(l.toArray(new AnalyzedToken[l.size()]), formattedToken.getStartPos());
+ }
+
+ private AnalyzedToken[] getNewToken(final int numRead, final String token) {
+ final List<AnalyzedToken> list = new ArrayList<AnalyzedToken>();
+ String lemma = "";
+ for (int j = 0; j < numRead; j++) {
+ if (formattedToken.getAnalyzedToken(j).getPOSTag() != null) {
+ if (formattedToken.getAnalyzedToken(j).getPOSTag().equals(posTag)
+ && (formattedToken.getAnalyzedToken(j).getLemma() != null)) {
+ lemma = formattedToken.getAnalyzedToken(j).getLemma();
+ }
+ if (StringTools.isEmpty(lemma)) {
+ lemma = formattedToken.getAnalyzedToken(0).getLemma();
+ }
+ list.add(new AnalyzedToken(token, posTag, lemma));
+ list.get(list.size() - 1).
+ setWhitespaceBefore(formattedToken.isWhitespaceBefore());
+ }
+ }
+ return list.toArray(new AnalyzedToken[list.size()]);
+ }
+
+ /**
+ * @param inMessageOnly
+ * the inMessageOnly to set
+ */
+ public void setInMessageOnly(final boolean inMessageOnly) {
+ this.inMessageOnly = inMessageOnly;
+ }
+
+ /**
+ * @return the inMessageOnly
+ */
+ public boolean isInMessageOnly() {
+ return inMessageOnly;
+ }
+
+}