/* LanguageTool, a natural language style checker
* Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
* USA
*/
package de.danielnaber.languagetool.rules.patterns;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.TreeSet;
import java.util.regex.Pattern;
import de.danielnaber.languagetool.AnalyzedToken;
import de.danielnaber.languagetool.AnalyzedTokenReadings;
import de.danielnaber.languagetool.JLanguageTool;
import de.danielnaber.languagetool.synthesis.Synthesizer;
import de.danielnaber.languagetool.tools.StringTools;
/**
* Reference to a matched token in a pattern, can be formatted and used for
* matching & suggestions.
*
* @author Marcin MiĆkowski
*/
public class Match {
/** Possible string case conversions. **/
public enum CaseConversion {
NONE, STARTLOWER, STARTUPPER, ALLLOWER, ALLUPPER;
/**
* Converts string to the constant enum.
*
* @param str
* String value to be converted.
* @return CaseConversion enum.
*/
public static CaseConversion toCase(final String str) {
try {
return valueOf(str);
} catch (final Exception ex) {
return NONE;
}
}
}
public enum IncludeRange {
NONE, FOLLOWING, ALL;
/**
* Converts string to the constant enum.
*
* @param str
* String value to be converted.
* @return IncludeRange enum.
*/
public static IncludeRange toRange(final String str) {
try {
return valueOf(str);
} catch (final Exception ex) {
return NONE;
}
}
}
private final String posTag;
private boolean postagRegexp;
private final String regexReplace;
private final String posTagReplace;
private final CaseConversion caseConversionType;
private final IncludeRange includeSkipped;
private String skippedTokens;
/**
* True if this match element formats a statically defined lemma which is
* enclosed by the element, e.g., <match...>word</word>.
*/
private boolean staticLemma;
/**
* True if this match element is used for formatting POS token.
*/
private final boolean setPos;
private AnalyzedTokenReadings formattedToken;
private AnalyzedTokenReadings matchedToken;
private int tokenRef;
/** Word form generator for POS tags. **/
private Synthesizer synthesizer;
/** Pattern used to define parts of the matched token. **/
private Pattern pRegexMatch;
/** Pattern used to define parts of the matched POS token. **/
private Pattern pPosRegexMatch;
/**
* True when the match is not in the suggestion.
*/
private boolean inMessageOnly;
public Match(final String posTag, final String posTagReplace,
final boolean postagRegexp, final String regexMatch,
final String regexReplace, final CaseConversion caseConversionType,
final boolean setPOS,
final IncludeRange includeSkipped) {
this.posTag = posTag;
this.postagRegexp = postagRegexp;
this.caseConversionType = caseConversionType;
if (regexMatch != null) {
pRegexMatch = Pattern.compile(regexMatch);
}
if (postagRegexp && posTag != null) {
pPosRegexMatch = Pattern.compile(posTag);
}
this.regexReplace = regexReplace;
this.posTagReplace = posTagReplace;
this.setPos = setPOS;
this.includeSkipped = includeSkipped;
}
/**
* Sets the token that will be formatted or otherwise used in the class.
*/
public final void setToken(final AnalyzedTokenReadings token) {
if (staticLemma) {
matchedToken = token;
} else {
formattedToken = token;
}
}
/**
* Sets the token to be formatted etc. and includes the support for
* including the skipped tokens.
* @param tokens Array of tokens
* @param index Index of the token to be formatted
* @param next Position of the next token (the skipped tokens
* are the ones between the tokens[index] and tokens[next]
*/
public final void setToken(final AnalyzedTokenReadings[] tokens, final int index, final int next) {
setToken(tokens[index]);
if (next > 1 && includeSkipped != IncludeRange.NONE) {
final StringBuilder sb = new StringBuilder();
if (includeSkipped == IncludeRange.FOLLOWING) {
formattedToken = null;
}
for (int k = index + 1; k < index + next; k++) {
if (k > index + 1 &&
tokens[k].isWhitespaceBefore()) {
sb.append(' ');
}
sb.append(tokens[k].getToken());
}
skippedTokens = sb.toString();
} else {
skippedTokens = "";
}
}
/**
private String[] addSkipped(final String[] formattedString) {
if (skippedTokens != null && !"".equals(skippedTokens)) {
String[] finalStrings = new String[formattedString.length];
for (int i = 1; i <= formattedString.length; i++)
}
}
**/
/**
* Checks if the Match element is used for setting the part of speech Element.
*
* @return True if Match sets POS.
*/
public final boolean setsPos() {
return setPos;
}
/**
* Checks if the Match element uses regexp-based form of the POS tag.
*
* @return True if regexp is used in POS.
*/
public final boolean posRegExp() {
return postagRegexp;
}
/**
* Sets a base form (lemma) that will be formatted, or synthesized, using the
* specified POS regular expressions.
*
* @param lemmaString String that specifies the base form.
*/
public final void setLemmaString(final String lemmaString) {
if (!StringTools.isEmpty(lemmaString)) {
formattedToken = new AnalyzedTokenReadings(new AnalyzedToken(lemmaString,
posTag, lemmaString), 0);
staticLemma = true;
postagRegexp = true;
if (posTag != null) {
pPosRegexMatch = Pattern.compile(posTag);
}
}
}
/**
* Sets a synthesizer used for grammatical synthesis of forms based on
* formatted POS values.
*
* @param synth Synthesizer class.
*/
public final void setSynthesizer(final Synthesizer synth) {
synthesizer = synth;
}
/**
* Gets all strings formatted using the match element.
*
* @return array of strings
* @throws IOException
* in case of synthesizer-related disk problems.
*/
public final String[] toFinalString() throws IOException {
String[] formattedString = new String[1];
if (formattedToken != null) {
final int readingCount = formattedToken.getReadingsLength();
formattedString[0] = formattedToken.getToken();
if (pRegexMatch != null) {
formattedString[0] = pRegexMatch.matcher(formattedString[0])
.replaceAll(regexReplace);
}
formattedString[0] = convertCase(formattedString[0]);
if (posTag != null) {
if (synthesizer == null) {
formattedString[0] = formattedToken.getToken();
} else if (postagRegexp) {
final TreeSet wordForms = new TreeSet();
boolean oneForm = false;
for (int k = 0; k < readingCount; k++) {
if (formattedToken.getAnalyzedToken(k).getLemma() == null) {
final String posUnique = formattedToken.getAnalyzedToken(k)
.getPOSTag();
if (posUnique == null) {
wordForms.add(formattedToken.getToken());
oneForm = true;
} else {
if (JLanguageTool.SENTENCE_START_TAGNAME.equals(posUnique)
|| JLanguageTool.SENTENCE_END_TAGNAME.equals(posUnique)
|| JLanguageTool.PARAGRAPH_END_TAGNAME.equals(posUnique)) {
if (!oneForm) {
wordForms.add(formattedToken.getToken());
}
oneForm = true;
} else {
oneForm = false;
}
}
}
}
final String targetPosTag = getTargetPosTag();
if (!oneForm) {
for (int i = 0; i < readingCount; i++) {
final String[] possibleWordForms = synthesizer.synthesize(
formattedToken.getAnalyzedToken(i), targetPosTag, true);
if (possibleWordForms != null) {
wordForms.addAll(Arrays.asList(possibleWordForms));
}
}
}
if (wordForms.isEmpty()) {
formattedString[0] = "(" + formattedToken.getToken() + ")";
} else {
formattedString = wordForms.toArray(new String[wordForms.size()]);
}
} else {
final TreeSet wordForms = new TreeSet();
for (int i = 0; i < readingCount; i++) {
final String[] possibleWordForms = synthesizer.synthesize(
formattedToken.getAnalyzedToken(i), posTag);
if (possibleWordForms != null) {
wordForms.addAll(Arrays.asList(possibleWordForms));
}
}
formattedString = wordForms.toArray(new String[wordForms.size()]);
}
}
}
if (includeSkipped != IncludeRange.NONE
&& skippedTokens != null && !"".equals(skippedTokens)) {
final String[] helper = new String[formattedString.length];
for (int i = 0; i < formattedString.length; i++) {
if (formattedString[i] == null) {
formattedString[i] = "";
}
helper[i] = formattedString[i] + skippedTokens;
}
formattedString = helper;
}
return formattedString;
}
/**
* Format POS tag using parameters already defined in the class.
*
* @return Formatted POS tag as String.
*/
// FIXME: gets only the first POS tag that matches, this can be wrong
// on the other hand, many POS tags = too many suggestions?
public final String getTargetPosTag() {
String targetPosTag = posTag;
final List posTags = new ArrayList();
if (staticLemma) {
final int numRead = matchedToken.getReadingsLength();
for (int i = 0; i < numRead; i++) {
final String tst = matchedToken.getAnalyzedToken(i).getPOSTag();
if (tst != null && pPosRegexMatch.matcher(tst).matches()) {
targetPosTag = matchedToken.getAnalyzedToken(i).getPOSTag();
posTags.add(targetPosTag);
}
}
if (pPosRegexMatch != null && posTagReplace != null) {
targetPosTag = pPosRegexMatch.matcher(targetPosTag).replaceAll(
posTagReplace);
}
} else {
final int numRead = formattedToken.getReadingsLength();
for (int i = 0; i < numRead; i++) {
final String tst = formattedToken.getAnalyzedToken(i).getPOSTag();
if (tst != null && pPosRegexMatch.matcher(tst).matches()) {
targetPosTag = formattedToken.getAnalyzedToken(i).getPOSTag();
posTags.add(targetPosTag);
}
}
if (pPosRegexMatch != null && posTagReplace != null) {
if (posTags.isEmpty()) {
posTags.add(targetPosTag);
}
final StringBuilder sb = new StringBuilder();
final int posTagLen = posTags.size();
int l = 0;
for (String lposTag : posTags) {
l++;
lposTag = pPosRegexMatch.matcher(lposTag).replaceAll(posTagReplace);
if (setPos) {
lposTag = synthesizer.getPosTagCorrection(lposTag);
}
sb.append(lposTag);
if (l < posTagLen) {
sb.append('|');
}
}
targetPosTag = sb.toString();
}
}
return targetPosTag;
}
/**
* Method for getting the formatted match as a single string. In case of
* multiple matches, it joins them using a regular expression operator "|".
*
* @return Formatted string of the matched token.
*/
public final String toTokenString() throws IOException {
final StringBuilder output = new StringBuilder();
final String[] stringToFormat = toFinalString();
for (int i = 0; i < stringToFormat.length; i++) {
output.append(stringToFormat[i]);
if (i + 1 < stringToFormat.length) {
output.append('|');
}
}
return output.toString();
}
/**
* Sets the token number referenced by the match.
*
* @param i Token number.
*/
public final void setTokenRef(final int i) {
tokenRef = i;
}
/**
* Gets the token number referenced by the match.
*
* @return int - token number.
*/
public final int getTokenRef() {
return tokenRef;
}
/**
* Converts case of the string token according to match element attributes.
*
* @param s Token to be converted.
* @return Converted string.
*/
private String convertCase(final String s) {
if (StringTools.isEmpty(s)) {
return s;
}
String token = s;
switch (caseConversionType) {
case NONE:
break;
case STARTLOWER:
token = token.substring(0, 1).toLowerCase() + token.substring(1);
break;
case STARTUPPER:
token = token.substring(0, 1).toUpperCase() + token.substring(1);
break;
case ALLUPPER:
token = token.toUpperCase();
break;
case ALLLOWER:
token = token.toLowerCase();
break;
default:
break;
}
return token;
}
/**
* Used to let LT know that it should change the case of the match.
*
* @return true if match converts the case of the token.
*/
public final boolean convertsCase() {
return !caseConversionType.equals(CaseConversion.NONE);
}
public final AnalyzedTokenReadings filterReadings() {
final ArrayList l = new ArrayList();
if (formattedToken != null) {
if (staticLemma) {
formattedToken = new AnalyzedTokenReadings(new AnalyzedToken(
matchedToken.getToken(), posTag, formattedToken.getToken()),
matchedToken.getStartPos());
formattedToken.setWhitespaceBefore(matchedToken.isWhitespaceBefore());
}
String token = formattedToken.getToken();
if (pRegexMatch != null) {
token = pRegexMatch.matcher(token).replaceAll(regexReplace);
}
token = convertCase(token);
if (posTag != null) {
final int numRead = formattedToken.getReadingsLength();
if (postagRegexp) {
String targetPosTag = posTag;
for (int i = 0; i < numRead; i++) {
final String tst = formattedToken.getAnalyzedToken(i).getPOSTag();
if (tst != null && pPosRegexMatch.matcher(tst).matches()) {
targetPosTag = formattedToken.getAnalyzedToken(i).getPOSTag();
if (posTagReplace != null) {
targetPosTag = pPosRegexMatch.matcher(targetPosTag).replaceAll(
posTagReplace);
}
l
.add(new AnalyzedToken(token, targetPosTag, formattedToken
.getAnalyzedToken(i).getLemma()));
l.get(l.size() - 1).setWhitespaceBefore(formattedToken.isWhitespaceBefore());
}
}
if (l.isEmpty()) {
for (final AnalyzedToken anaTok : getNewToken(numRead, token)) {
l.add(anaTok);
}
}
} else {
for (final AnalyzedToken anaTok : getNewToken(numRead, token)) {
l.add(anaTok);
}
}
if (formattedToken.isSentEnd()) {
l.add(new AnalyzedToken(formattedToken.getToken(),
JLanguageTool.SENTENCE_END_TAGNAME,
formattedToken.getAnalyzedToken(0).getLemma()));
}
if (formattedToken.isParaEnd()) {
l.add(new AnalyzedToken(formattedToken.getToken(),
JLanguageTool.PARAGRAPH_END_TAGNAME,
formattedToken.getAnalyzedToken(0).getLemma()));
}
}
}
if (l.isEmpty()) {
return formattedToken;
}
return new AnalyzedTokenReadings(l.toArray(new AnalyzedToken[l.size()]), formattedToken.getStartPos());
}
private AnalyzedToken[] getNewToken(final int numRead, final String token) {
final List list = new ArrayList();
String lemma = "";
for (int j = 0; j < numRead; j++) {
if (formattedToken.getAnalyzedToken(j).getPOSTag() != null) {
if (formattedToken.getAnalyzedToken(j).getPOSTag().equals(posTag)
&& (formattedToken.getAnalyzedToken(j).getLemma() != null)) {
lemma = formattedToken.getAnalyzedToken(j).getLemma();
}
if (StringTools.isEmpty(lemma)) {
lemma = formattedToken.getAnalyzedToken(0).getLemma();
}
list.add(new AnalyzedToken(token, posTag, lemma));
list.get(list.size() - 1).
setWhitespaceBefore(formattedToken.isWhitespaceBefore());
}
}
return list.toArray(new AnalyzedToken[list.size()]);
}
/**
* @param inMessageOnly
* the inMessageOnly to set
*/
public void setInMessageOnly(final boolean inMessageOnly) {
this.inMessageOnly = inMessageOnly;
}
/**
* @return the inMessageOnly
*/
public boolean isInMessageOnly() {
return inMessageOnly;
}
}