diff options
Diffstat (limited to 'JLanguageTool/src/java/de/danielnaber/languagetool/rules')
66 files changed, 10897 insertions, 0 deletions
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/AbstractCompoundRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/AbstractCompoundRule.java
new file mode 100644
index 0000000..8ef9119
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/AbstractCompoundRule.java
@@ -0,0 +1,279 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2006 Daniel Naber (http://www.danielnaber.de)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+package de.danielnaber.languagetool.rules;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Queue;
+import java.util.ResourceBundle;
+import java.util.Set;
+import java.util.concurrent.ArrayBlockingQueue;
+
+import de.danielnaber.languagetool.AnalyzedSentence;
+import de.danielnaber.languagetool.AnalyzedToken;
+import de.danielnaber.languagetool.AnalyzedTokenReadings;
+import de.danielnaber.languagetool.tools.StringTools;
+
+/**
+ * Checks that compounds (if in the list) are not written as separate words.
+ * The list of compounds is filled by {@link #loadCompoundFile(InputStream, String)};
+ * the messages attached to matches are supplied through
+ * {@link #setMsg(String, String, String)} and {@link #setShort(String)}.
+ *
+ * @author Daniel Naber & Marcin Miłkowski (refactoring)
+ */
+
+public abstract class AbstractCompoundRule extends Rule {
+
+  /** Maximum number of parts of a compound; also the size of the token window used by match(). */
+  private static final int MAX_TERMS = 5;
+
+  /** Lower-cased, space-separated spellings that are reported as errors. */
+  private final Set<String> incorrectCompounds = new HashSet<String>();
+  /** Entries that ended with "+" in the compound file: no hyphenated suggestion is offered. */
+  private final Set<String> noDashSuggestion = new HashSet<String>();
+  /** Entries that ended with "*" in the compound file: no written-as-one suggestion is offered. */
+  private final Set<String> onlyDashSuggestion = new HashSet<String>();
+
+  private String withHyphenMessage;
+  private String asOneMessage;
+  private String withOrWithoutHyphenMessage;
+
+  private String shortDesc;
+
+  /** Compounds with more than maxUnHyphenatedWordCount parts should always use hyphens */
+  private int maxUnHyphenatedWordCount = 2;
+
+  /** Flag to indicate if the hyphen is ignored in the text entered by the user.
+   * Set this to false if you want the rule to offer suggestions for words like [ro] "câte-și-trei" (with hyphen), not only for "câte și trei" (with spaces)
+   * This is only available for languages with hyphen as a word separator (i.e. not available for English, available for Romanian)
+   * See Language.getWordTokenizer()
+   */
+  private boolean hyphenIgnored = true;
+
+  /**
+   * Creates the rule and, if a bundle is given, assigns the category named by
+   * its "category_misc" entry.
+   *
+   * @param messages resource bundle; may be null, in which case no category is set here
+   * @throws IOException declared for subclasses; nothing in this constructor performs I/O
+   */
+  public AbstractCompoundRule(final ResourceBundle messages) throws IOException {
+    if (messages != null)
+      super.setCategory(new Category(messages.getString("category_misc")));
+  }
+
+  public abstract String getId();
+
+  public abstract String getDescription();
+
+  /**
+   * Sets the short description passed to every RuleMatch created by match().
+   */
+  public void setShort(final String shortDescription) {
+    shortDesc = shortDescription;
+  }
+
+  /**
+   * Sets the three messages used by match().
+   *
+   * @param withHyphenMessage used when only the hyphenated spelling is suggested
+   * @param asOneMessage used when only the written-as-one spelling is suggested
+   * @param withHyphenOrNotMessage used when both (or no) spellings are suggested
+   */
+  public void setMsg(final String withHyphenMessage, final String asOneMessage, final String withHyphenOrNotMessage) {
+    this.withHyphenMessage = withHyphenMessage;
+    this.asOneMessage = asOneMessage;
+    withOrWithoutHyphenMessage = withHyphenOrNotMessage;
+  }
+
+  public boolean isHyphenIgnored() {
+    return hyphenIgnored;
+  }
+
+  public void setHyphenIgnored(boolean ignoreHyphen) {
+    this.hyphenIgnored = ignoreHyphen;
+  }
+
+  public int getMaxUnHyphenatedWordCount() {
+    return maxUnHyphenatedWordCount;
+  }
+
+  public void setMaxUnHyphenatedWordCount(int maxNoHyphensSize) {
+    this.maxUnHyphenatedWordCount = maxNoHyphensSize;
+  }
+
+  /**
+   * Slides a window of at most MAX_TERMS non-whitespace tokens over the sentence
+   * and reports every window prefix (two or more tokens) whose normalized text
+   * is contained in the list of incorrect compounds.
+   *
+   * @param text the sentence to check
+   * @return the matches found, each carrying one or two suggested replacements
+   */
+  public RuleMatch[] match(final AnalyzedSentence text) {
+    final List<RuleMatch> ruleMatches = new ArrayList<RuleMatch>();
+    final AnalyzedTokenReadings[] tokens = text.getTokensWithoutWhitespace();
+
+    RuleMatch prevRuleMatch = null;
+    // bounded FIFO holding the tokens seen before the current one (see addToQueue)
+    final Queue<AnalyzedTokenReadings> prevTokens = new ArrayBlockingQueue<AnalyzedTokenReadings>(MAX_TERMS);
+    for (int i = 0; i < tokens.length + MAX_TERMS-1; i++) {
+      AnalyzedTokenReadings token = null;
+      // we need to extend the token list so we find matches at the end of the original list:
+      // NOTE(review): peek() returns null on an empty queue, i.e. this line would fail
+      // for a sentence with zero tokens - presumably that never happens; confirm.
+      if (i >= tokens.length)
+        token = new AnalyzedTokenReadings(new AnalyzedToken("", "", null), prevTokens.peek().getStartPos());
+      else
+        token = tokens[i];
+      if (i == 0) {
+        addToQueue(token, prevTokens);
+        continue;
+      }
+
+      // Build the cumulative strings "t0 t1", "t0 t1 t2", ... from the queued tokens.
+      // The current token is only queued at the end of this iteration, so it is
+      // not part of the strings checked now.
+      final StringBuilder sb = new StringBuilder();
+      int j = 0;
+      AnalyzedTokenReadings firstMatchToken = null;
+      final List<String> stringsToCheck = new ArrayList<String>();
+      final List<String> origStringsToCheck = new ArrayList<String>();    // original upper/lowercase spelling
+      // maps a normalized string to the last token it covers (first occurrence wins)
+      final Map<String, AnalyzedTokenReadings> stringToToken = new HashMap<String, AnalyzedTokenReadings>();
+      for (AnalyzedTokenReadings atr : prevTokens) {
+        if (j == 0)
+          firstMatchToken = atr;
+        sb.append(' ');
+        sb.append(atr.getToken());
+        if (j >= 1) {
+          final String stringToCheck = normalize(sb.toString());
+          stringsToCheck.add(stringToCheck);
+          origStringsToCheck.add(sb.toString().trim());
+          if (!stringToToken.containsKey(stringToCheck))
+            stringToToken.put(stringToCheck, atr);
+        }
+        j++;
+      }
+      // iterate backwards over all potentially incorrect strings to make
+      // sure we match longer strings first:
+      for (int k = stringsToCheck.size()-1; k >= 0; k--) {
+        final String stringToCheck = stringsToCheck.get(k);
+        final String origStringToCheck = origStringsToCheck.get(k);
+        if (incorrectCompounds.contains(stringToCheck)) {
+          final AnalyzedTokenReadings atr = stringToToken.get(stringToCheck);
+          String msg = null;
+          final List<String> replacement = new ArrayList<String>();
+          if (!noDashSuggestion.contains(stringToCheck)) {
+            replacement.add(origStringToCheck.replace(' ', '-'));
+            msg = withHyphenMessage;
+          }
+          // assume that compounds with more than maxUnHyphenatedWordCount (default: two) parts should always use hyphens:
+          if (!hasAllUppercaseParts(origStringToCheck) && countParts(stringToCheck) <= getMaxUnHyphenatedWordCount()
+              && !onlyDashSuggestion.contains(stringToCheck)) {
+            replacement.add(mergeCompound(origStringToCheck));
+            msg = asOneMessage;
+          }
+          final String[] parts = stringToCheck.split(" ");
+          // a one-character first part gets the hyphenated suggestion only,
+          // overriding whatever was collected above
+          if (parts.length > 0 && parts[0].length() == 1) {
+            replacement.clear();
+            replacement.add(origStringToCheck.replace(' ', '-'));
+            msg = withHyphenMessage;
+          } else if (replacement.isEmpty() || replacement.size() == 2) {     // isEmpty shouldn't happen
+            msg = withOrWithoutHyphenMessage;
+          }
+          final RuleMatch ruleMatch = new RuleMatch(this, firstMatchToken.getStartPos(),
+              atr.getStartPos() + atr.getToken().length(), msg, shortDesc);
+          // avoid duplicate matches:
+          // (a match starting where the previous one started is dropped, not added)
+          if (prevRuleMatch != null && prevRuleMatch.getFromPos() == ruleMatch.getFromPos()) {
+            prevRuleMatch = ruleMatch;
+            break;
+          }
+          prevRuleMatch = ruleMatch;
+          ruleMatch.setSuggestedReplacements(replacement);
+          ruleMatches.add(ruleMatch);
+          break;
+        }
+      }
+      addToQueue(token, prevTokens);
+    }
+    return toRuleMatchArray(ruleMatches);
+  }
+
+  /**
+   * Trims and lower-cases (default locale) the given string. If it contains
+   * both a hyphen and a space, hyphens are turned into spaces when hyphens are
+   * ignored; otherwise only " - " sequences are collapsed to a single space.
+   */
+  private String normalize(final String inStr) {
+    String str = inStr.trim().toLowerCase();
+    if (str.indexOf('-') != -1 && str.indexOf(' ') != -1) {
+      if (isHyphenIgnored()) {
+        // e.g. "E-Mail Adresse" -> "E Mail Adresse" so the error can be detected:
+        str = str.replace('-', ' ');
+      } else {
+        str = str.replace(" - ", " ");
+      }
+    }
+    return str;
+  }
+
+  /**
+   * Returns true as soon as ONE space-separated part is all-uppercase according
+   * to StringTools.isAllUppercase() - despite the name, not all parts have to be.
+   */
+  private boolean hasAllUppercaseParts(final String str) {
+    final String[] parts = str.split(" ");
+    for (String part : parts) {
+      if (isHyphenIgnored() || !"-".equals(part)) { // do not treat '-' as an upper-case word
+        if (StringTools.isAllUppercase(part)) {
+          return true;
+        }
+      }
+    }
+    return false;
+  }
+
+  /** Number of space-separated parts of the given string. */
+  private int countParts(final String str) {
+    return str.split(" ").length;
+  }
+
+  /**
+   * Joins the space-separated parts into one word: the first part is kept as
+   * is, all following parts are lower-cased. When hyphens are not ignored,
+   * parts that are exactly "-" are left out.
+   */
+  private String mergeCompound(final String str) {
+    final String[] stringParts = str.split(" ");
+    final StringBuilder sb = new StringBuilder();
+    for (int k = 0; k < stringParts.length; k++) {
+      if (isHyphenIgnored() || !"-".equals(stringParts[k])) {
+        if (k == 0)
+          sb.append(stringParts[k]);
+        else
+          sb.append(stringParts[k].toLowerCase());
+      }
+    }
+    return sb.toString();
+  }
+
+  /**
+   * Appends the token to the queue; if the queue is full, the oldest element
+   * is removed first so the queue always holds the most recent tokens.
+   */
+  private void addToQueue(final AnalyzedTokenReadings token, final Queue<AnalyzedTokenReadings> prevTokens) {
+    final boolean inserted = prevTokens.offer(token);
+    if (!inserted) {
+      prevTokens.poll();
+      prevTokens.offer(token);
+    }
+  }
+
+  /**
+   * Reads the list of compounds, one per line. Empty lines and lines starting
+   * with '#' are skipped. Hyphens are replaced by spaces before the entry is
+   * stored lower-cased. A trailing "+" marks an entry for which no hyphenated
+   * spelling is suggested, a trailing "*" one for which only the hyphenated
+   * spelling is suggested; the marker itself is not stored.
+   *
+   * @param file stream to read from; it is closed when this method returns
+   * @param encoding name of the character encoding of the stream
+   * @throws IOException on read errors, if an entry has more than MAX_TERMS
+   *         parts, or if an entry consists of a single part
+   */
+  public void loadCompoundFile(final InputStream file, final String encoding) throws IOException {
+    InputStreamReader isr = null;
+    BufferedReader br = null;
+    try {
+      isr = new InputStreamReader(file, encoding);
+      br = new BufferedReader(isr);
+      String line;
+      while ((line = br.readLine()) != null) {
+        line = line.trim();
+        if (line.length() < 1) {
+          continue;
+        }
+        if (line.charAt(0) == '#') {      // ignore comments
+          continue;
+        }
+        // the set contains the incorrect spellings, i.e. the ones without hyphen
+        line = line.replace('-', ' ');
+        final String[] parts = line.split(" ");
+        if (parts.length > MAX_TERMS)
+          throw new IOException("Too many compound parts: " + line + ", maximum allowed: " + MAX_TERMS);
+        if (parts.length == 1)
+          throw new IOException("Not a compound: " + line);
+        if (line.endsWith("+")) {
+          line = line.substring(0, line.length() - 1);    // cut off "+"
+          noDashSuggestion.add(line.toLowerCase());
+        } else if (line.endsWith("*")) {
+          line = line.substring(0, line.length() - 1);    // cut off "*"
+          onlyDashSuggestion.add(line.toLowerCase());
+        }
+        incorrectCompounds.add(line.toLowerCase());
+      }
+    } finally {
+      if (br != null) br.close();
+      if (isr != null) isr.close();
+    }
+  }
+
+  /** No state is kept between sentences, so there is nothing to reset. */
+  public void reset() {
+  }
+
+}
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/AbstractPunctuationCheckRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/AbstractPunctuationCheckRule.java
new file mode 100644
index 0000000..89d216b
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/AbstractPunctuationCheckRule.java
@@ -0,0 +1,93 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+package de.danielnaber.languagetool.rules;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.ResourceBundle;
+
+import de.danielnaber.languagetool.AnalyzedSentence;
+import de.danielnaber.languagetool.AnalyzedTokenReadings;
+
+/**
+ * Flags runs of two or more consecutive punctuation tokens that the concrete
+ * language does not accept, e.g. "..", "::" or "-," (but not "...", "!..",
+ * "?!!", ",-" etc.). Subclasses decide what counts as punctuation via
+ * <code>isPunctuation()</code> and which joined runs are acceptable via
+ * <code>isPunctsJoinOk()</code>.
+ *
+ * @author Andriy Rysin
+ */
+public abstract class AbstractPunctuationCheckRule extends Rule {
+
+  /**
+   * @param messages bundle providing the "category_misc" category name
+   */
+  public AbstractPunctuationCheckRule(final ResourceBundle messages) {
+    super(messages);
+    final String categoryName = messages.getString("category_misc");
+    super.setCategory(new Category(categoryName));
+  }
+
+  public String getId() {
+    return "PUNCTUATION_GENERIC_CHECK";
+  }
+
+  public String getDescription() {
+    return "Use of unusual combination of punctuation characters";
+  }
+
+  /**
+   * @param tkns the concatenated text of a run of punctuation tokens
+   * @return true if this combination is acceptable and must not be reported
+   */
+  protected abstract boolean isPunctsJoinOk(String tkns);
+
+  /**
+   * @param token text of a single token
+   * @return true if the token is to be treated as punctuation
+   */
+  protected abstract boolean isPunctuation(String token);
+
+  /**
+   * Collects each maximal run of punctuation tokens and reports it when it is
+   * at least two characters long and rejected by <code>isPunctsJoinOk()</code>.
+   * The suggested replacement is the first character of the run.
+   */
+  public RuleMatch[] match(final AnalyzedSentence text) {
+    final AnalyzedTokenReadings[] tokens = text.getTokens();
+    final List<RuleMatch> found = new ArrayList<RuleMatch>();
+    final StringBuilder run = new StringBuilder();
+    final int lastIdx = tokens.length - 1;
+    int runStartIdx = -1;
+
+    for (int idx = 0; idx < tokens.length; idx++) {
+      final String current = tokens[idx].getToken();
+
+      if (isPunctuation(current)) {
+        if (runStartIdx == -1) {
+          runStartIdx = idx;
+        }
+        run.append(current);
+        if (idx < lastIdx) {
+          // the run may still grow; evaluate it once it ends
+          continue;
+        }
+      }
+
+      // reached on the first non-punctuation token after a run, or on the last token
+      final String joined = run.toString();
+      if (joined.length() >= 2 && !isPunctsJoinOk(joined)) {
+        final int fromPos = tokens[runStartIdx].getStartPos();
+        final RuleMatch ruleMatch = new RuleMatch(this, fromPos, fromPos + joined.length(),
+            "bad duplication or combination of punctuation signs", "Punctuation problem");
+        ruleMatch.setSuggestedReplacement(joined.substring(0, 1));
+        found.add(ruleMatch);
+      }
+      run.setLength(0);
+      runStartIdx = -1;
+    }
+
+    return toRuleMatchArray(found);
+  }
+
+  public void reset() {
+    // this rule keeps no state between sentences
+  }
+
+}
\ No newline at end of file
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/AbstractSimpleReplaceRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/AbstractSimpleReplaceRule.java
new file mode 100644
index 0000000..13288a2
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/AbstractSimpleReplaceRule.java
@@ -0,0 +1,159 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+package de.danielnaber.languagetool.rules;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+import java.util.ResourceBundle;
+
+import de.danielnaber.languagetool.AnalyzedSentence;
+import de.danielnaber.languagetool.AnalyzedTokenReadings;
+import de.danielnaber.languagetool.JLanguageTool;
+import de.danielnaber.languagetool.tools.StringTools;
+
+/**
+ * A rule that matches words or phrases which should not be used and suggests
+ * correct ones instead. Loads the relevant words from
+ * <code>rules/XX/replace.txt</code>, where XX is a code of the language.
+ *
+ * @author Andriy Rysin
+ */
+public abstract class AbstractSimpleReplaceRule extends Rule {
+
+  private static final String FILE_ENCODING = "utf-8";
+
+  /** Maps a wrong word to its replacement, e.g. "врешті решт" -> "зрештою". Assigned once in the constructor. */
+  private final Map<String, String> wrongWords;
+
+  /**
+   * @return name of the word list file, resolved relative to the rules directory
+   */
+  public abstract String getFileName();
+
+  /**
+   * @return name of the character encoding used to read the word list
+   */
+  public String getEncoding() {
+    return FILE_ENCODING;
+  }
+
+  /**
+   * Indicates if the rule is case-sensitive. Default value is <code>true</code>.
+   * @return true if the rule is case-sensitive, false otherwise.
+   */
+  public boolean isCaseSensitive() {
+    return true;
+  }
+
+  /**
+   * @return the locale used for case conversion when {@link #isCaseSensitive()} is set to <code>false</code>.
+   */
+  public Locale getLocale() {
+    return Locale.getDefault();
+  }
+
+  /**
+   * Creates the rule and loads the word list named by {@link #getFileName()}.
+   * Note that {@link #getFileName()} and {@link #getEncoding()} are called
+   * from this constructor, i.e. before a subclass constructor body has run.
+   *
+   * @param messages resource bundle; may be null, in which case no category is set here
+   * @throws IOException if the word list cannot be read or contains a malformed line
+   */
+  public AbstractSimpleReplaceRule(final ResourceBundle messages) throws IOException {
+    if (messages != null) {
+      super.setCategory(new Category(messages.getString("category_misc")));
+    }
+    wrongWords = loadWords(JLanguageTool.getDataBroker().getFromRulesDirAsStream(getFileName()));
+  }
+
+  public String getId() {
+    return "SIMPLE_REPLACE";
+  }
+
+  public String getDescription() {
+    return "Checks for wrong words/phrases";
+  }
+
+  /**
+   * @return the text placed between the wrong word and its replacement in the match message
+   */
+  public String getSuggestion() {
+    return " is not valid, use ";
+  }
+
+  public String getShort() {
+    return "Wrong word";
+  }
+
+  /**
+   * Looks up every token except the first one (index 0) in the word list and
+   * reports those that have a replacement. In case-insensitive mode the lookup
+   * uses the lower-cased token, and the suggestion gets an uppercase first
+   * character if the token started with one.
+   */
+  public final RuleMatch[] match(final AnalyzedSentence text) {
+    final List<RuleMatch> ruleMatches = new ArrayList<RuleMatch>();
+    final AnalyzedTokenReadings[] tokens = text.getTokensWithoutWhitespace();
+    final boolean caseSensitive = isCaseSensitive();
+
+    for (int i = 1; i < tokens.length; i++) {
+      final String token = tokens[i].getToken();
+      final String key = caseSensitive ? token : token.toLowerCase(getLocale());
+      final String replacement = wrongWords.get(key);
+      if (replacement == null) {
+        continue;
+      }
+      final String msg = token + getSuggestion() + replacement;
+      final int pos = tokens[i].getStartPos();
+      final RuleMatch potentialRuleMatch = new RuleMatch(this, pos, pos
+          + token.length(), msg, getShort());
+      if (!caseSensitive && StringTools.startsWithUppercase(token)) {
+        potentialRuleMatch.setSuggestedReplacement(StringTools.uppercaseFirstChar(replacement));
+      } else {
+        potentialRuleMatch.setSuggestedReplacement(replacement);
+      }
+      ruleMatches.add(potentialRuleMatch);
+    }
+    return toRuleMatchArray(ruleMatches);
+  }
+
+
+  /**
+   * Reads "wrong=replacement" pairs, one per line. Empty lines and lines
+   * starting with '#' are skipped.
+   *
+   * @param file stream to read from; it is closed when this method returns
+   * @return map from wrong word to replacement
+   * @throws IOException on read errors or if a line does not consist of exactly
+   *         two parts separated by '='
+   */
+  private Map<String, String> loadWords(final InputStream file) throws IOException {
+    final Map<String, String> map = new HashMap<String, String>();
+    InputStreamReader isr = null;
+    BufferedReader br = null;
+    try {
+      isr = new InputStreamReader(file, getEncoding());
+      br = new BufferedReader(isr);
+      String line;
+
+      while ((line = br.readLine()) != null) {
+        line = line.trim();
+        if (line.length() < 1) {
+          continue;
+        }
+        if (line.charAt(0) == '#') { // ignore comments
+          continue;
+        }
+        final String[] parts = line.split("=");
+        if (parts.length != 2) {
+          throw new IOException("Format error in file "
+              + JLanguageTool.getDataBroker().getFromRulesDirAsUrl(getFileName()) + ", line: " + line);
+        }
+        map.put(parts[0], parts[1]);
+      }
+
+    } finally {
+      // Close the outermost wrapper that was created; closing it also closes
+      // what it wraps. If the reader could not even be created (e.g. unsupported
+      // encoding name), close the raw stream so it does not leak.
+      if (br != null) {
+        br.close();
+      } else if (isr != null) {
+        isr.close();
+      } else if (file != null) {
+        file.close();
+      }
+    }
+    return map;
+  }
+
+  /** No state is kept between sentences, so there is nothing to reset. */
+  public void reset() {
+  }
+
+}
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/Category.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/Category.java
new file mode 100644
index 0000000..95a3b44
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/Category.java
@@ -0,0 +1,85 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2006 Daniel Naber (http://www.danielnaber.de)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your
+ option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+package de.danielnaber.languagetool.rules;
+
+/**
+ * A rule's category. Categories are used to group rules for
+ * a better overview. Name and priority are fixed at construction time;
+ * only the "off by default" flag can be changed afterwards.
+ *
+ * @author Daniel Naber
+ */
+public class Category {
+
+  private static final int DEFAULT_PRIORITY = 50;
+
+  private final int priority;
+  private final String name;
+  private boolean defaultOff;
+
+  /**
+   * Create a new category with the given name and priority.
+   * @param name name of the category
+   * @param priority a value between 0 and 100 (inclusive)
+   * @throws IllegalArgumentException if the priority is outside 0 - 100
+   */
+  public Category(final String name, final int priority) {
+    if (priority < 0 || priority > 100) {
+      throw new IllegalArgumentException("priority must be in range 0 - 100");
+    }
+    this.name = name;
+    this.priority = priority;
+  }
+
+  /**
+   * Create a new category with the default priority (50).
+   * @param name name of the category
+   */
+  public Category(final String name) {
+    this(name, DEFAULT_PRIORITY);
+  }
+
+  /**
+   * @return the name given at construction time
+   */
+  public String getName() {
+    return name;
+  }
+
+  /**
+   * @return the priority, a value between 0 and 100 (inclusive)
+   */
+  public int getPriority() {
+    return priority;
+  }
+
+  /**
+   * @return the name followed by the priority, e.g. <code>Misc(prio=50)</code>
+   */
+  @Override
+  public String toString() {
+    return name + "(prio=" + priority + ")";
+  }
+
+  /**
+   * Checks whether the category has been turned off
+   * by default by the category author.
+   * @return True if the category is turned off by
+   * default.
+   */
+  public final boolean isDefaultOff() {
+    return defaultOff;
+  }
+
+  /**
+   * Turns the category off by default. There is no way to turn it on again.
+   **/
+  public final void setDefaultOff() {
+    defaultOff = true;
+  }
+
+
+}
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/CommaWhitespaceRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/CommaWhitespaceRule.java
new file mode 100644
index 0000000..0636a1f
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/CommaWhitespaceRule.java
@@ -0,0 +1,170 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+package de.danielnaber.languagetool.rules;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.ResourceBundle;
+
+import de.danielnaber.languagetool.AnalyzedSentence;
+import de.danielnaber.languagetool.AnalyzedTokenReadings;
+
+/**
+ * A rule that matches commas and closing parenthesis preceded by whitespace and
+ * opening parenthesis followed by whitespace.
+ *
+ * @author Daniel Naber
+ */
+
+public class CommaWhitespaceRule extends Rule {
+
+  /**
+   * @param messages bundle providing the category name and the match messages
+   */
+  public CommaWhitespaceRule(final ResourceBundle messages) {
+    super(messages);
+    super.setCategory(new Category(messages.getString("category_misc")));
+  }
+
+  public final String getId() {
+    return "COMMA_PARENTHESIS_WHITESPACE";
+  }
+
+  public final String getDescription() {
+    return messages.getString("desc_comma_whitespace");
+  }
+
+  /**
+   * Walks over all tokens and reports
+   * <ul>
+   * <li>whitespace directly after an opening bracket,</li>
+   * <li>a comma directly followed by a non-whitespace token (unless quotes,
+   * hyphens, numbers or a second comma are involved),</li>
+   * <li>whitespace directly before a closing bracket, a comma or a dot (with
+   * exceptions for doubled commas and for dots followed by a digit or
+   * another dot).</li>
+   * </ul>
+   * Every match starts at the previous token and spans the previous token
+   * plus <code>fixLen</code> characters of the current one.
+   */
+  public final RuleMatch[] match(final AnalyzedSentence text) {
+    final List<RuleMatch> ruleMatches = new ArrayList<RuleMatch>();
+    final AnalyzedTokenReadings[] tokens = text.getTokens();
+    String prevToken = "";
+    String prevPrevToken = "";
+    boolean prevWhite = false;
+    int prevLen = 0;
+    for (int i = 0; i < tokens.length; i++) {
+      final String token = tokens[i].getToken();
+      final boolean isWhite = tokens[i].isWhitespace()
+          || tokens[i].isFieldCode();
+      String msg = null;
+      int fixLen = 0;
+      String suggestionText = null;
+      if (isWhite && isLeftBracket(prevToken)) {
+        msg = messages.getString("no_space_after");
+        suggestionText = prevToken;
+        fixLen = 1;
+      } else if (!isWhite && prevToken.equals(",")
+          && isNotQuoteOrHyphen(token)
+          && containsNoNumber(prevPrevToken)
+          && containsNoNumber(token)
+          && !",".equals(prevPrevToken)) {
+        msg = messages.getString("missing_space_after_comma");
+        suggestionText = ", ";
+      } else if (prevWhite) {
+        if (isRightBracket(token)) {
+          msg = messages.getString("no_space_before");
+          suggestionText = token;
+          fixLen = 1;
+        } else if (token.equals(",")) {
+          msg = messages.getString("space_after_comma");
+          suggestionText = ",";
+          fixLen = 1;
+          //exception for duplicated comma (we already have another rule for that)
+          if (i + 1 < tokens.length
+              && ",".equals(tokens[i + 1].getToken())) {
+            msg = null;
+          }
+        } else if (token.equals(".")) {
+          msg = messages.getString("no_space_before_dot");
+          suggestionText = ".";
+          fixLen = 1;
+          // exception case for figures such as ".5" and ellipsis
+          if (i + 1 < tokens.length
+              && isNumberOrDot(tokens[i + 1].getToken())) {
+            msg = null;
+          }
+        }
+      }
+      if (msg != null) {
+        // msg can only be set when the previous token matched a condition, which
+        // is impossible for i == 0 (prevToken is "" and prevWhite is false)
+        final int fromPos = tokens[i - 1].getStartPos();
+        final int toPos = tokens[i - 1].getStartPos() + fixLen + prevLen;
+        // TODO: add some good short comment here
+        final RuleMatch ruleMatch = new RuleMatch(this, fromPos, toPos, msg);
+        ruleMatch.setSuggestedReplacement(suggestionText);
+        ruleMatches.add(ruleMatch);
+      }
+      prevPrevToken = prevToken;
+      prevToken = token;
+      prevWhite = isWhite && !tokens[i].isFieldCode(); //OOo code before comma/dot
+      prevLen = tokens[i].getToken().length();
+    }
+
+    return toRuleMatchArray(ruleMatches);
+  }
+
+  /**
+   * @return false for a single quote, hyphen or comma character, for the
+   * literal HTML entity <code>&amp;quot;</code>, and for any other string
+   * that is not exactly one character long and contains a digit; true otherwise
+   */
+  static boolean isNotQuoteOrHyphen(final String str) {
+    if (str.length() == 1) {
+      final char c = str.charAt(0);
+      if (c =='\'' || c == '-' || c == '”'
+        || c =='’' || c == '"' || c == '“'
+        || c == ',') {
+        return false;
+      }
+    } else {
+      // the quote character written as an HTML entity
+      if ("&quot;".equals(str)) {
+        return false;
+      }
+      return containsNoNumber(str);
+    }
+    return true;
+  }
+
+  /**
+   * @return true if the string starts with a dot or a digit; false otherwise,
+   * including for the empty string
+   */
+  static boolean isNumberOrDot(final String str) {
+    if (str.length() == 0) {
+      return false;
+    }
+    final char c = str.charAt(0);
+    return (c == '.' || Character.isDigit(c));
+  }
+
+  /**
+   * @return true if the string starts with '(', '[' or '{'
+   */
+  static boolean isLeftBracket(final String str) {
+    if (str.length() == 0) {
+      return false;
+    }
+    final char c = str.charAt(0);
+    return (c == '(' || c == '[' || c == '{');
+  }
+
+  /**
+   * @return true if the string starts with ')', ']' or '}'
+   */
+  static boolean isRightBracket(final String str) {
+    if (str.length() == 0) {
+      return false;
+    }
+    final char c = str.charAt(0);
+    return (c == ')' || c == ']' || c == '}');
+  }
+
+  /**
+   * @return true if no character of the string is a digit (also true for the empty string)
+   */
+  static boolean containsNoNumber(final String str) {
+    for (int i = 0; i < str.length(); i++) {
+      if (Character.isDigit(str.charAt(i))) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  public void reset() {
+    // nothing
+  }
+
+}
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/DoublePunctuationRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/DoublePunctuationRule.java
new file mode 100644
index 0000000..3a6a4e1
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/DoublePunctuationRule.java
@@ -0,0 +1,99 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+package de.danielnaber.languagetool.rules;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.ResourceBundle;
+
+import de.danielnaber.languagetool.AnalyzedSentence;
+import de.danielnaber.languagetool.AnalyzedTokenReadings;
+
+/**
+ * A rule that matches ".." (but not "..." etc) and ",,".
+ *
+ * @author Daniel Naber
+ */
+public class DoublePunctuationRule extends Rule {
+
+  /**
+   * @param messages bundle providing the category name and the match messages
+   */
+  public DoublePunctuationRule(final ResourceBundle messages) {
+    super(messages);
+    super.setCategory(new Category(messages.getString("category_misc")));
+  }
+
+  public final String getId() {
+    return "DOUBLE_PUNCTUATION";
+  }
+
+  public final String getDescription() {
+    return messages.getString("desc_double_punct");
+  }
+
+  /**
+   * Counts consecutive "." and "," tokens. A run of exactly two identical
+   * marks (i.e. the second one is not followed by a third) is reported and a
+   * single mark is suggested as replacement. Any other token resets both
+   * counters.
+   */
+  public final RuleMatch[] match(final AnalyzedSentence text) {
+    final List<RuleMatch> ruleMatches = new ArrayList<RuleMatch>();
+    final AnalyzedTokenReadings[] tokens = text.getTokens();
+    int startPos = 0;   // start position of the most recent "." or "," token
+    int dotCount = 0;
+    int commaCount = 0;
+    for (int i = 0; i < tokens.length; i++) {
+      final String token = tokens[i].getToken();
+      String nextToken = null;
+      if (i < tokens.length - 1) {
+        nextToken = tokens[i + 1].getToken();
+      }
+      if (".".equals(token)) {
+        dotCount++;
+        commaCount = 0;
+        startPos = tokens[i].getStartPos();
+      } else if (",".equals(token)) {
+        commaCount++;
+        dotCount = 0;
+        startPos = tokens[i].getStartPos();
+      }
+      if (dotCount == 2 && !".".equals(nextToken)) {
+        final String msg = messages.getString("two_dots");
+        // startPos points at the second mark; the first one is taken to sit
+        // directly before it (assumes one-character, adjacent tokens)
+        final int fromPos = Math.max(0, startPos - 1);
+        final RuleMatch ruleMatch = new RuleMatch(this, fromPos, startPos + 1,
+            msg, messages.getString("double_dots_short"));
+        ruleMatch.setSuggestedReplacement(".");
+        ruleMatches.add(ruleMatch);
+        dotCount = 0;
+      } else if (commaCount == 2 && !",".equals(nextToken)) {
+        final String msg = messages.getString("two_commas");
+        // cover BOTH commas, exactly like the dot case above; starting at
+        // startPos would span only the second comma, so replacing it with ","
+        // would leave the text unchanged
+        final int fromPos = Math.max(0, startPos - 1);
+        final RuleMatch ruleMatch = new RuleMatch(this, fromPos, startPos + 1,
+            msg, messages.getString("double_commas_short"));
+        ruleMatch.setSuggestedReplacement(",");
+        ruleMatches.add(ruleMatch);
+        commaCount = 0;
+      }
+      if (!".".equals(token) && !",".equals(token)) {
+        dotCount = 0;
+        commaCount = 0;
+      }
+    }
+
+    return toRuleMatchArray(ruleMatches);
+  }
+
+  public void reset() {
+    // nothing
+  }
+
+}
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/GenericUnpairedBracketsRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/GenericUnpairedBracketsRule.java
new file mode 100644
index 0000000..a2cd35c
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/GenericUnpairedBracketsRule.java
@@ -0,0 +1,314 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2009 Daniel Naber (http://www.danielnaber.de)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ + +package de.danielnaber.languagetool.rules; + +import java.util.ArrayList; +import java.util.List; +import java.util.ResourceBundle; +import java.util.regex.Pattern; + +import de.danielnaber.languagetool.AnalyzedSentence; +import de.danielnaber.languagetool.AnalyzedTokenReadings; +import de.danielnaber.languagetool.Language; +import de.danielnaber.languagetool.tools.UnsyncStack; +import de.danielnaber.languagetool.tools.SymbolLocator; + +/** + * Rule that finds unpaired quotes, brackets etc. + * + * @author Marcin Miłkowski + */ +public class GenericUnpairedBracketsRule extends Rule { + + /** + * Note that there must be equal length of both arrays, and the sequence of + * starting symbols must match exactly the sequence of ending symbols. + */ + private static final String[] START_SYMBOLS = { "[", "(", "{", "\"", "'" }; + private static final String[] END_SYMBOLS = { "]", ")", "}", "\"", "'" }; + + protected String[] startSymbols; + protected String[] endSymbols; + + private static final String[] SL_START_SYMBOLS = { "[", "(", "{", "„", "»", "\"" }; + private static final String[] SL_END_SYMBOLS = { "]", ")", "}", "”", "«", "\"" }; + + private static final String[] SK_START_SYMBOLS = { "[", "(", "{", "„", "»", "\"" }; + private static final String[] SK_END_SYMBOLS = { "]", ")", "}", "“", "«", "\"" }; + + private static final String[] RO_START_SYMBOLS = { "[", "(", "{", "„", "«" }; + private static final String[] RO_END_SYMBOLS = { "]", ")", "}", "”", "»" }; + + private static final String[] FR_START_SYMBOLS = { "[", "(", "{", "«", /*"‘"*/ }; + private static final String[] FR_END_SYMBOLS = { "]", ")", "}", "»", /*"’" used in "d’arm" and many other words */ }; + + private static final String[] DE_START_SYMBOLS = { "[", "(", "{", "„", 
"»", "‘" }; + private static final String[] DE_END_SYMBOLS = { "]", ")", "}", "“", "«", "’" }; + + private static final String[] GL_START_SYMBOLS = { "[", "(", "{", "“", "«", "‘", "\"", "'" }; + private static final String[] GL_END_SYMBOLS = { "]", ")", "}", "”", "»", "’", "\"", "'" }; + + private static final String[] ES_START_SYMBOLS = { "[", "(", "{", "“", "«", "¿", "¡" }; + private static final String[] ES_END_SYMBOLS = { "]", ")", "}", "”", "»", "?", "!" }; + + private static final String[] UK_START_SYMBOLS = { "[", "(", "{", "„", "«" }; + private static final String[] UK_END_SYMBOLS = { "]", ")", "}", "“", "»" }; + + private static final String[] NL_START_SYMBOLS = { "[", "(", "{", "“", "\u2039", "\u201c", "\u201e" }; + private static final String[] NL_END_SYMBOLS = { "]", ")", "}", "”", "\u203a", "\u201d", "\u201d" }; + + private static final String[] IT_START_SYMBOLS = { "[", "(", "{", "»", /*"‘"*/ }; + private static final String[] IT_END_SYMBOLS = { "]", ")", "}", "«", /*"’"*/ }; + + private static final String[] DK_START_SYMBOLS = { "[", "(", "{", "\"", "”" }; + private static final String[] DK_END_SYMBOLS = { "]", ")", "}", "\"", "”" }; + + + + /** + * The stack for pairing symbols. + */ + protected final UnsyncStack<SymbolLocator> symbolStack = new UnsyncStack<SymbolLocator>(); + + /** + * Stack of rule matches. 
+ */ + private final UnsyncStack<RuleMatchLocator> ruleMatchStack = new UnsyncStack<RuleMatchLocator>(); + + private boolean endOfParagraph; + + private final Language ruleLang; + + private static final Pattern PUNCTUATION = Pattern.compile("\\p{Punct}"); + private static final Pattern PUNCTUATION_NO_DOT = Pattern + .compile("[\\p{Punct}&&[^\\.]]"); + private static final Pattern NUMERALS = Pattern + .compile("(?i)\\d{1,2}?[a-z']*|M*(D?C{0,3}|C[DM])(L?X{0,3}|X[LC])(V?I{0,3}|I[VX])$"); + + private int ruleMatchIndex; + private List<RuleMatch> ruleMatches; + + public GenericUnpairedBracketsRule(final ResourceBundle messages, + final Language language) { + super(messages); + super.setCategory(new Category(messages.getString("category_misc"))); + + setParagraphBackTrack(true); + if (language.equals(Language.SLOVAK)) { + startSymbols = SK_START_SYMBOLS; + endSymbols = SK_END_SYMBOLS; } + else if (language.equals(Language.SLOVENIAN)) { + startSymbols = SL_START_SYMBOLS; + endSymbols = SL_END_SYMBOLS; + } else if (language.equals(Language.FRENCH)) { + startSymbols = FR_START_SYMBOLS; + endSymbols = FR_END_SYMBOLS; + } else if (language.equals(Language.GERMAN)) { + startSymbols = DE_START_SYMBOLS; + endSymbols = DE_END_SYMBOLS; + } else if (language.equals(Language.GALICIAN)) { + startSymbols = GL_START_SYMBOLS; + endSymbols = GL_END_SYMBOLS; + } else if (language.equals(Language.DUTCH)) { + startSymbols = NL_START_SYMBOLS; + endSymbols = NL_END_SYMBOLS; + } else if (language.equals(Language.SPANISH)) { + startSymbols = ES_START_SYMBOLS; + endSymbols = ES_END_SYMBOLS; + } else if (language.equals(Language.UKRAINIAN)) { + startSymbols = UK_START_SYMBOLS; + endSymbols = UK_END_SYMBOLS; + } else if (language.equals(Language.ITALIAN)) { + startSymbols = IT_START_SYMBOLS; + endSymbols = IT_END_SYMBOLS; + } else if (language.equals(Language.ROMANIAN)) { + startSymbols = RO_START_SYMBOLS; + endSymbols = RO_END_SYMBOLS; + } else if (language.equals(Language.DANISH)) { + 
startSymbols = DK_START_SYMBOLS; + endSymbols = DK_END_SYMBOLS; + } else { + startSymbols = START_SYMBOLS; + endSymbols = END_SYMBOLS; + } + + ruleLang = language; + } + + public String getId() { + return "UNPAIRED_BRACKETS"; + } + + public String getDescription() { + return messages.getString("desc_unpaired_brackets"); + } + + /** + * Generic method to specify an exception. For unspecified + * language, it simply returns true, which means no exception. + * @param token + * String token + * @param tokens + * Sentence tokens + * @param i + * Current token index + * @param precSpace + * boolean: is preceded with space + * @param follSpace + * boolean: is followed with space + * @return + */ + protected boolean isNoException(final String token, + final AnalyzedTokenReadings[] tokens, final int i, final int j, + final boolean precSpace, + final boolean follSpace) { + return true; + } + + public final RuleMatch[] match(final AnalyzedSentence text) { + ruleMatches = new ArrayList<RuleMatch>(); + final AnalyzedTokenReadings[] tokens = text.getTokensWithoutWhitespace(); + + if (endOfParagraph) { + reset(); + } + + ruleMatchIndex = getMatchesIndex(); + + for (int i = 1; i < tokens.length; i++) { + for (int j = 0; j < startSymbols.length; j++) { + + final String token = tokens[i].getToken(); + if (token.equals(startSymbols[j]) || token.equals(endSymbols[j])) { + boolean precededByWhitespace = true; + if (startSymbols[j].equals(endSymbols[j])) { + precededByWhitespace = tokens[i - 1].isSentStart() + || tokens[i].isWhitespaceBefore() + || PUNCTUATION_NO_DOT.matcher(tokens[i - 1].getToken()) + .matches(); + } + + boolean followedByWhitespace = true; + if (i < tokens.length - 1 && startSymbols[j].equals(endSymbols[j])) { + followedByWhitespace = tokens[i + 1].isWhitespaceBefore() + || PUNCTUATION.matcher(tokens[i + 1].getToken()).matches(); + } + + final boolean noException = isNoException(token, tokens, i, j, + precededByWhitespace, followedByWhitespace); + + if (noException && 
precededByWhitespace + && token.equals(startSymbols[j])) { + symbolStack.push(new SymbolLocator(startSymbols[j], i)); + } else if (noException && followedByWhitespace + && token.equals(endSymbols[j])) { + if (i > 1 && endSymbols[j].equals(")") + && (NUMERALS.matcher(tokens[i - 1].getToken()).matches() + && !(!symbolStack.empty() + && "(".equals(symbolStack.peek().symbol)))) { + } else { + if (symbolStack.empty()) { + symbolStack.push(new SymbolLocator(endSymbols[j], i)); + } else { + if (symbolStack.peek().symbol.equals(startSymbols[j])) { + symbolStack.pop(); + } else { + symbolStack.push(new SymbolLocator(endSymbols[j], i)); + } + } + } + } + } + } + } + for (final SymbolLocator sLoc : symbolStack) { + final RuleMatch rMatch = createMatch(tokens[sLoc.index].getStartPos(), + sLoc.symbol); + if (rMatch != null) { + ruleMatches.add(rMatch); + } + } + symbolStack.clear(); + if (tokens[tokens.length - 1].isParaEnd()) { + endOfParagraph = true; + } + + return toRuleMatchArray(ruleMatches); + } + + private RuleMatch createMatch(final int startPos, final String symbol) { + if (!ruleMatchStack.empty()) { + final int index = findSymbolNum(symbol); + if (index >= 0) { + final RuleMatchLocator rLoc = ruleMatchStack.peek(); + if (rLoc.symbol.equals(startSymbols[index])) { + if (ruleMatches.size() > rLoc.myIndex) { + ruleMatches.remove(rLoc.myIndex); + ruleMatchStack.pop(); + return null; + // if (ruleMatches.get(rLoc.myIndex).getFromPos()) + } + if (isInMatches(rLoc.index)) { + setAsDeleted(rLoc.index); + ruleMatchStack.pop(); + return null; + } + } + } + } + ruleMatchStack.push(new RuleMatchLocator(symbol, ruleMatchIndex, + ruleMatches.size())); + ruleMatchIndex++; + return new RuleMatch(this, startPos, startPos + symbol.length(), messages + .getString("unpaired_brackets")); + } + + private int findSymbolNum(final String ch) { + for (int i = 0; i < endSymbols.length; i++) { + if (ch.equals(endSymbols[i])) { + return i; + } + } + return -1; + } + + /** + * Reset the state 
information for the rule, including paragraph-level + * information. + */ + public final void reset() { + ruleMatchStack.clear(); + symbolStack.clear(); + if (!endOfParagraph) { + clearMatches(); + } + endOfParagraph = false; + } + +} + +class RuleMatchLocator extends SymbolLocator { + public int myIndex; + + RuleMatchLocator(final String sym, final int ind, final int myInd) { + super(sym, ind); + myIndex = myInd; + } +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/IncorrectExample.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/IncorrectExample.java new file mode 100644 index 0000000..0d3478f --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/IncorrectExample.java @@ -0,0 +1,62 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2008 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.rules; + +import java.util.Arrays; +import java.util.List; + +/** + * A text, typically a sentence, that contains an error. 
/**
 * A text, typically a sentence, that contains an error, optionally together
 * with the corrections for that error.
 *
 * @since 0.9.2
 * @author Daniel Naber
 */
public class IncorrectExample {

  /** The text containing the error. */
  private final String example;

  /** The corrections, or null if the example was created without any. */
  private final List<String> corrections;

  /**
   * Creates an example without corrections.
   *
   * @param example the text that contains the error
   */
  public IncorrectExample(final String example) {
    this.example = example;
    this.corrections = null;
  }

  /**
   * Creates an example with corrections.
   *
   * @param example the text that contains the error
   * @param corrections the possible corrections; must not be null
   */
  public IncorrectExample(final String example, final String[] corrections) {
    this.example = example;
    this.corrections = Arrays.asList(corrections);
  }

  /**
   * Return the example that contains the error.
   */
  public String getExample() {
    return example;
  }

  /**
   * Return the possible corrections. May be null.
   */
  public List<String> getCorrections() {
    return corrections;
  }

  /**
   * Returns the example text, a space, and the corrections list
   * ("null" if there are no corrections).
   */
  @Override
  public String toString() {
    final StringBuilder text = new StringBuilder();
    text.append(example).append(' ').append(corrections);
    return text.toString();
  }

}
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.rules; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.ResourceBundle; +import java.util.Set; + +import de.danielnaber.languagetool.AnalyzedSentence; +import de.danielnaber.languagetool.Language; + +/** + * Abstract rule class. A Rule describes a language error and can test whether a + * given pre-analyzed text contains that error using the {@link Rule#match} + * method. + * + * @author Daniel Naber + */ +public abstract class Rule { + + private List<String> correctExamples; + private List<IncorrectExample> incorrectExamples; + private Category category; + + /** + * If true, then the rule is turned off by default. + */ + private boolean defaultOff; + + protected ResourceBundle messages; + + /** + * Called by language-dependent rules. + */ + public Rule() { + } + + /** + * Called by language-independent rules. + */ + public Rule(final ResourceBundle messages) { + this.messages = messages; + } + + public abstract String getId(); + + public abstract String getDescription(); + + /** + * Used by paragraph rules to signal that they can remove previous rule + * matches. + */ + private boolean paragraphBackTrack; + + /** + * The final list of RuleMatches, without removed matches. + */ + private List<RuleMatch> previousMatches; + + private List<RuleMatch> removedMatches; + + /** + * Check whether the given text matches this error rule, i.e. whether the text + * contains this error. + * + * @param text + * a pre-analyzed sentence + * @return an array of RuleMatch object for each match. 
+ */ + public abstract RuleMatch[] match(AnalyzedSentence text) throws IOException; + + /** + * If a rule keeps its state over more than the check of one sentence, this + * must be implemented so the internal state is reset. It will be called + * before a new text is going to be checked. + */ + public abstract void reset(); + + /** + * Whether this rule can be used for text in the given language. + */ + public final boolean supportsLanguage(final Language language) { + final Set<String> relevantIDs = language.getRelevantRuleIDs(); + return relevantIDs != null && relevantIDs.contains(getId()); + } + + /** + * Set the examples that are correct and thus do not trigger the rule. + */ + public final void setCorrectExamples(final List<String> correctExamples) { + this.correctExamples = correctExamples; + } + + /** + * Get example sentences that are correct and thus will not match this rule. + */ + public final List<String> getCorrectExamples() { + return correctExamples; + } + + /** + * Set the examples that are incorrect and thus do trigger the rule. + */ + public final void setIncorrectExamples( + final List<IncorrectExample> incorrectExamples) { + this.incorrectExamples = incorrectExamples; + } + + /** + * Get example sentences that are incorrect and thus will match this rule. + */ + public final List<IncorrectExample> getIncorrectExamples() { + return incorrectExamples; + } + + public final Category getCategory() { + return category; + } + + public final void setCategory(final Category category) { + this.category = category; + } + + protected final RuleMatch[] toRuleMatchArray(final List<RuleMatch> ruleMatches) { + return ruleMatches.toArray(new RuleMatch[ruleMatches.size()]); + } + + public final boolean isParagraphBackTrack() { + return paragraphBackTrack; + } + + public final void setParagraphBackTrack(final boolean backTrack) { + paragraphBackTrack = backTrack; + } + + /** + * Method to add matches. 
+ * + * @param r + * RuleMatch - matched rule added by check() + */ + public final void addRuleMatch(final RuleMatch r) { + if (previousMatches == null) { + previousMatches = new ArrayList<RuleMatch>(); + } + previousMatches.add(r); + } + + /** + * Deletes (or disables) previously matched rule. + * + * @param i + * Index of the rule that should be deleted. + */ + public final void setAsDeleted(final int i) { + if (removedMatches == null) { + removedMatches = new ArrayList<RuleMatch>(); + } + removedMatches.add(previousMatches.get(i)); + } + + public final boolean isInRemoved(final RuleMatch r) { + if (removedMatches == null) { + return false; + } + return removedMatches.contains(r); + } + + public final boolean isInMatches(final int i) { + if (previousMatches == null) { + return false; + } + if (previousMatches.size() > i) { + return previousMatches.get(i) != null; + } + return false; + } + + public final void clearMatches() { + if (previousMatches != null) { + previousMatches.clear(); + } + } + + public final int getMatchesIndex() { + if (previousMatches == null) { + return 0; + } + return previousMatches.size(); + } + + public final List<RuleMatch> getMatches() { + return previousMatches; + } + + /** + * Checks whether the rule has been turned off by default by the rule author. + * + * @return True if the rule is turned off by default. + */ + public final boolean isDefaultOff() { + return defaultOff; + } + + /** + * Turns the rule by default off. 
+ **/ + public final void setDefaultOff() { + defaultOff = true; + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/RuleMatch.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/RuleMatch.java new file mode 100644 index 0000000..05746fb --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/RuleMatch.java @@ -0,0 +1,239 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.rules; + +import java.util.ArrayList; +import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import de.danielnaber.languagetool.tools.StringTools; + +/** + * A class that holds information about where a rule matches text. 
+ * + * @author Daniel Naber + */ +public class RuleMatch implements Comparable<RuleMatch> { + + private static final Pattern SUGGESTION_PATTERN = Pattern.compile("<suggestion>(.*?)</suggestion>"); + + private int fromLine = -1; + private int column = -1; + private int offset = -1; + private int endLine = -1; + private int endColumn = -1; + + private Rule rule; + private int fromPos; + private int toPos; + private String message; + // for OOo context menu + private String shortMessage; + + private List<String> suggestedReplacements = new ArrayList<String>(); + +//TODO: remove this one after all rules get their short comments in place + public RuleMatch(Rule rule, int fromPos, int toPos, String message) { + this(rule, fromPos, toPos, message, null, false); + } + + // TODO: remove this constructor? + public RuleMatch(Rule rule, int fromPos, int toPos, String message, String shortMessage) { + this(rule, fromPos, toPos, message, shortMessage, false); + } + + /** + * Creates a RuleMatch object, taking the rule that triggered + * this match, position of the match and an explanation message. + * This message is scanned for <suggestion>...</suggestion> to get suggested + * fixes for the problem detected by this rule. 
+ * + * @param startWithUppercase whether the original text at the position + * of the match start with an uppercase character + */ + public RuleMatch(Rule rule, int fromPos, int toPos, String message, String shortMessage, + boolean startWithUppercase) { + this.rule = rule; + this.fromPos = fromPos; + this.toPos = toPos; + this.message = message; + this.shortMessage = shortMessage; + // extract suggestion from <suggestion>...</suggestion> in message: + final Matcher matcher = SUGGESTION_PATTERN.matcher(message); + int pos = 0; + while (matcher.find(pos)) { + pos = matcher.end(); + String repl = matcher.group(1); + if (startWithUppercase) + repl = StringTools.uppercaseFirstChar(repl); + suggestedReplacements.add(repl); + } + } + + public Rule getRule() { + return rule; + } + + /** + * Set the line number in which the match occurs. + */ + public void setLine(final int fromLine) { + this.fromLine = fromLine; + } + + /** + * Get the line number in which the match occurs. + */ + public int getLine() { + return fromLine; + } + + /** + * Set the line number in which the match ends. + */ + public void setEndLine(final int endLine) { + this.endLine = endLine; + } + + /** + * Get the line number in which the match ends. + */ + public int getEndLine() { + return endLine; + } + + /** + * Set the column number in which the match occurs. + */ + public void setColumn(final int column) { + this.column = column; + } + + /** + * Get the column number in which the match occurs. + */ + public int getColumn() { + return column; + } + + /** + * Set the column number in which the match ends. + */ + public void setEndColumn(final int endColumn) { + this.endColumn = endColumn; + } + + /** + * Get the column number in which the match ends. + */ + public int getEndColumn() { + return endColumn; + } + + /** + * Set the character offset at which the match occurs. 
+ */ + public void setOffset(final int offset) { + this.offset = offset; + } + + /** + * Get the character offset at which the match occurs. + */ + public int getOffset() { + return offset; + } + + /** + * Position of the start of the error (in characters). + */ + public int getFromPos() { + return fromPos; + } + + /** + * Position of the end of the error (in characters). + */ + public int getToPos() { + return toPos; + } + + /** + * A human-readable explanation describing the error. + */ + public String getMessage() { + return message; + } + + /** + * A shorter human-readable explanation describing the error. + */ + public String getShortMessage() { + return shortMessage; + } + + + /** + * @see #getSuggestedReplacements() + */ + public void setSuggestedReplacement(final String replacement) { + if (replacement == null) + throw new NullPointerException("replacement might be empty but not null"); + final List<String> fixes = new ArrayList<String>(); + fixes.add(replacement); + setSuggestedReplacements(fixes); + } + + /** + * @see #getSuggestedReplacements() + */ + public void setSuggestedReplacements(final List<String> replacement) { + if (replacement == null) + throw new NullPointerException("replacement might be empty but not null"); + this.suggestedReplacements = replacement; + } + + /** + * The text fragments which might be an appropriate fix for the problem. One + * of these fragments can be used to replace the old text between getFromPos() + * to getToPos(). Text between <suggestion> and </suggestion> is + * taken as the suggested replacement. 
+ * @return List of String objects or an empty List + */ + public List<String> getSuggestedReplacements() { + return suggestedReplacements; + } + + @Override + public String toString() { + return rule.getId() + ":" + fromPos + "-" + toPos + ":" + message; + } + + public int compareTo(final RuleMatch other) { + if (other == null) + throw new ClassCastException(); + if (getFromPos() < other.getFromPos()) + return -1; + if (getFromPos() > other.getFromPos()) + return 1; + return 0; + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/UppercaseSentenceStartRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/UppercaseSentenceStartRule.java new file mode 100644 index 0000000..35ecfa4 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/UppercaseSentenceStartRule.java @@ -0,0 +1,136 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.rules; + +import java.util.ArrayList; +import java.util.List; +import java.util.ResourceBundle; + +import de.danielnaber.languagetool.AnalyzedSentence; +import de.danielnaber.languagetool.AnalyzedTokenReadings; +import de.danielnaber.languagetool.Language; + +/** + * Checks that a sentence starts with an uppercase letter. + * + * @author Daniel Naber + */ +public class UppercaseSentenceStartRule extends Rule { + + private final Language language; + + private String lastParagraphString = ""; + + public UppercaseSentenceStartRule(final ResourceBundle messages, + final Language language) { + super(messages); + super.setCategory(new Category(messages.getString("category_case"))); + this.language = language; + } + + public final String getId() { + return "UPPERCASE_SENTENCE_START"; + } + + public final String getDescription() { + return messages.getString("desc_uppercase_sentence"); + } + + public final RuleMatch[] match(final AnalyzedSentence text) { + final List<RuleMatch> ruleMatches = new ArrayList<RuleMatch>(); + final AnalyzedTokenReadings[] tokens = text.getTokensWithoutWhitespace(); + if (tokens.length < 2) { + return toRuleMatchArray(ruleMatches); + } + int matchTokenPos = 1; // 0 = SENT_START + final String firstToken = tokens[matchTokenPos].getToken(); + String secondToken = null; + String thirdToken = null; + // ignore quote characters: + if (tokens.length >= 3 + && ("'".equals(firstToken) || "\"".equals(firstToken) || "„" + .equals(firstToken))) { + matchTokenPos = 2; + secondToken = tokens[matchTokenPos].getToken(); + } + final String firstDutchToken = dutchSpecialCase(firstToken, secondToken, + tokens); + if (firstDutchToken != null) { + thirdToken = firstDutchToken; + matchTokenPos = 3; + } 
+ + String checkToken = firstToken; + if (thirdToken != null) { + checkToken = thirdToken; + } else if (secondToken != null) { + checkToken = secondToken; + } + + final String lastToken = tokens[tokens.length - 1].getToken(); + + boolean noException = false; + //fix for lists; note - this will not always work for the last point in OOo, + //as OOo might serve paragraphs in any order. + if ((language == Language.RUSSIAN || language == Language.POLISH) + && (";".equals(lastParagraphString) || ";".equals(lastToken) + || ",".equals(lastParagraphString) || ",".equals(lastToken))) { + noException = true; + } + //fix for comma in last paragraph; note - this will not always work for the last point in OOo, + //as OOo might serve paragraphs in any order. + if ((language == Language.RUSSIAN || language == Language.ITALIAN + || language == Language.POLISH || language == Language.GERMAN) + && (",".equals(lastParagraphString))) { + noException = true; + } + + lastParagraphString = lastToken; + + if (checkToken.length() > 0) { + final char firstChar = checkToken.charAt(0); + if (Character.isLowerCase(firstChar) && (!noException)) { + final RuleMatch ruleMatch = new RuleMatch(this, tokens[matchTokenPos] + .getStartPos(), tokens[matchTokenPos].getStartPos() + + tokens[matchTokenPos].getToken().length(), messages + .getString("incorrect_case")); + ruleMatch.setSuggestedReplacement(Character.toUpperCase(firstChar) + + checkToken.substring(1)); + ruleMatches.add(ruleMatch); + } + } + return toRuleMatchArray(ruleMatches); + } + + private String dutchSpecialCase(final String firstToken, + final String secondToken, final AnalyzedTokenReadings[] tokens) { + if (language != Language.DUTCH) { + return null; + } + if (tokens.length >= 3 && firstToken.equals("'") + && secondToken.matches("k|m|n|r|s|t")) { + return tokens[3].getToken(); + } + return null; + } + + public void reset() { + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/WhitespaceRule.java 
b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/WhitespaceRule.java new file mode 100644 index 0000000..61f1ca6 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/WhitespaceRule.java @@ -0,0 +1,91 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ + +package de.danielnaber.languagetool.rules; + +import java.util.ArrayList; +import java.util.List; +import java.util.ResourceBundle; + +import de.danielnaber.languagetool.AnalyzedSentence; +import de.danielnaber.languagetool.AnalyzedTokenReadings; +import de.danielnaber.languagetool.Language; + +/** + * Check if there is duplicated whitespace in a sentence. + * Considers two spaces as incorrect, and proposes a single space instead. 
+ * + * @author Marcin Miłkowski + */ + +public class WhitespaceRule extends Rule { + + public WhitespaceRule(final ResourceBundle messages, final Language language) { + super(messages); + super.setCategory(new Category(messages.getString("category_misc"))); + } + + @Override + public final String getId() { + return "WHITESPACE_RULE"; + } + + @Override + public final String getDescription() { + return messages.getString("desc_whitespacerepetition"); + } + + @Override + public RuleMatch[] match(final AnalyzedSentence text) { + final List<RuleMatch> ruleMatches = new ArrayList<RuleMatch>(); + final AnalyzedTokenReadings[] tokens = text.getTokens(); + boolean prevWhite = false; + int prevLen = 0; + int prevPos = 0; + //note: we start from token 1 + //token no. 0 is guaranteed to be SENT_START + int i = 1; + while (i < tokens.length) { + if (tokens[i].isWhitespace() && prevWhite && !tokens[i -1].isLinebreak()) { + final int pos = tokens[i -1].getStartPos(); + while (i < tokens.length && tokens[i].isWhitespace()) { + prevLen += tokens[i].getToken().length(); + i++; + } + final RuleMatch ruleMatch = new RuleMatch(this, prevPos, pos + prevLen, messages + .getString("whitespace_repetition")); + ruleMatch.setSuggestedReplacement(" "); + ruleMatches.add(ruleMatch); + } + if (i < tokens.length) { + prevWhite = tokens[i].isWhitespace(); + prevLen = tokens[i].getToken().length(); + prevPos = tokens[i].getStartPos(); + i++; + } + } + return toRuleMatchArray(ruleMatches); + } + + @Override + public void reset() { + // nothing + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/WordRepeatRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/WordRepeatRule.java new file mode 100644 index 0000000..c8060a5 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/WordRepeatRule.java @@ -0,0 +1,101 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * 
This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.rules; + +import java.util.ArrayList; +import java.util.List; +import java.util.ResourceBundle; + +import de.danielnaber.languagetool.AnalyzedSentence; +import de.danielnaber.languagetool.AnalyzedTokenReadings; +import de.danielnaber.languagetool.Language; + +/** + * Check if a word is repeated twice, e.g. "the the". + * + * @author Daniel Naber + */ +public class WordRepeatRule extends Rule { + + public WordRepeatRule(final ResourceBundle messages, final Language language) { + super(messages); + super.setCategory(new Category(messages.getString("category_misc"))); + } + + /** + * Implement this method to return <code>true</code> if there's + * a potential word repetition at the current position should be ignored, + * i.e. if no error should be created. 
+ * + * @param tokens the tokens of the sentence currently being checked + * @param position the current position in the tokens + * @return this implementation always returns false + */ + public boolean ignore(final AnalyzedTokenReadings[] tokens, final int position) { + return false; + } + + @Override + public String getId() { + return "WORD_REPEAT_RULE"; + } + + @Override + public String getDescription() { + return messages.getString("desc_repetition"); + } + + @Override + public RuleMatch[] match(final AnalyzedSentence text) { + final List<RuleMatch> ruleMatches = new ArrayList<RuleMatch>(); + final AnalyzedTokenReadings[] tokens = text.getTokensWithoutWhitespace(); + String prevToken = ""; + //note: we start from token 1 + //token no. 0 is guaranteed to be SENT_START + for (int i = 1; i < tokens.length; i++) { + final String token = tokens[i].getToken(); + // avoid "..." etc. to be matched: + boolean isWord = true; + if (token.length() == 1) { + final char c = token.charAt(0); + if (!Character.isLetter(c)) { + isWord = false; + } + } + final boolean isException = ignore(tokens, i); + if (isWord && prevToken.toLowerCase().equals(token.toLowerCase()) && !isException) { + final String msg = messages.getString("repetition"); + final int prevPos = tokens[i - 1].getStartPos(); + final int pos = tokens[i].getStartPos(); + final RuleMatch ruleMatch = new RuleMatch(this, prevPos, pos+prevToken.length(), msg, + messages.getString("desc_repetition_short")); + ruleMatch.setSuggestedReplacement(prevToken); + ruleMatches.add(ruleMatch); + } + prevToken = token; + } + return toRuleMatchArray(ruleMatches); + } + + @Override + public void reset() { + // nothing + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/bitext/BitextRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/bitext/BitextRule.java new file mode 100644 index 0000000..d508ae5 --- /dev/null +++ 
b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/bitext/BitextRule.java @@ -0,0 +1,106 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2010 Marcin Miłkowski (www.languagetool.org) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ + +package de.danielnaber.languagetool.rules.bitext; + +import java.io.IOException; +import java.util.List; + +import de.danielnaber.languagetool.AnalyzedSentence; +import de.danielnaber.languagetool.bitext.StringPair; +import de.danielnaber.languagetool.rules.Rule; +import de.danielnaber.languagetool.rules.RuleMatch; +import de.danielnaber.languagetool.Language; + +/** + * Abstract bitext rule class. A BitextRule describes a language error and + * can test whether a given pre-analyzed pair of source and target text + * contains that error using the {@link Rule#match} method. 
+ * + * @author Marcin Miłkowski + */ + +public abstract class BitextRule extends Rule { + + private List<StringPair> correctExamples; + private List<IncorrectBitextExample> incorrectExamples; + + private Language sourceLanguage; + + @Override + public abstract String getDescription(); + + public abstract String getMessage(); + + @Override + public abstract String getId(); + + @Override + public abstract RuleMatch[] match(AnalyzedSentence text) throws IOException; + + public abstract RuleMatch[] match(AnalyzedSentence sourceText, + AnalyzedSentence targetText) throws IOException; + + @Override + public abstract void reset(); + + /** + * Set the source language. If the language is not supported + * by LT, you need to use the default tokenizers etc. + * @param lang - Source Language + */ + public final void setSourceLang(final Language lang) { + sourceLanguage = lang; + } + + public final Language getSourceLang() { + return sourceLanguage; + } + + /** + * Set the examples that are correct and thus do not trigger the rule. + */ + public final void setCorrectBitextExamples(final List<StringPair> correctExamples) { + this.correctExamples = correctExamples; + } + + /** + * Get example sentences that are correct and thus will not match this rule. + */ + public final List<StringPair> getCorrectBitextExamples() { + return correctExamples; + } + + /** + * Set the examples that are incorrect and thus do trigger the rule. + */ + public final void setIncorrectBitextExamples( + final List<IncorrectBitextExample> incorrectExamples) { + this.incorrectExamples = incorrectExamples; + } + + /** + * Get example sentences that are incorrect and thus will match this rule. 
+ */ + public final List<IncorrectBitextExample> getIncorrectBitextExamples() { + return incorrectExamples; + } + + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/bitext/DifferentLengthRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/bitext/DifferentLengthRule.java new file mode 100644 index 0000000..995772c --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/bitext/DifferentLengthRule.java @@ -0,0 +1,93 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2010 Marcin Miłkowski (www.languagetool.org) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.rules.bitext; + +import java.io.IOException; + +import de.danielnaber.languagetool.AnalyzedSentence; +import de.danielnaber.languagetool.AnalyzedTokenReadings; +import de.danielnaber.languagetool.rules.RuleMatch; + +/** + * Checks if the translation has a really different length than the source + * (smaller than 30% or longer by 250%). 
+ * + * @author Marcin Miłkowski + * + */ +public class DifferentLengthRule extends BitextRule { + + static final String MSG = "Source and target translation lengths are very different!"; + + @Override + public String getDescription() { + return "Check if translation length is similar to source length"; + } + + @Override + public String getId() { + return "TRANSLATION_LENGTH"; + } + + public String getMessage() { + return MSG; + } + + /** + * This method makes no sense for bitext, return null?? + */ + @Override + public RuleMatch[] match(AnalyzedSentence text) throws IOException { + // TODO Auto-generated method stub + return null; + } + + @Override + public RuleMatch[] match(AnalyzedSentence sourceText, + AnalyzedSentence targetText) throws IOException { + + if (isLengthDifferent( + getPureText(sourceText), getPureText(targetText))) { + final RuleMatch[] rm = new RuleMatch[1]; + final AnalyzedTokenReadings[] tokens = targetText.getTokens(); + final int len = tokens[tokens.length - 1].getStartPos() + tokens[tokens.length - 1].getToken().length(); + rm[0] = new RuleMatch(this, 1, len, + MSG); + return rm; + } + return new RuleMatch[0]; + } + + static boolean isLengthDifferent(final String src, final String trg) { + final double skew = (((double) src.length() / (double) trg.length()) * 100.00); + return (skew > 250 || skew < 30); + } + + private static String getPureText(AnalyzedSentence text) { + final StringBuilder sb = new StringBuilder(); + for (AnalyzedTokenReadings token : text.getTokens()) { + sb.append(token.getToken()); + } + return sb.toString(); + } + + public void reset() { + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/bitext/IncorrectBitextExample.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/bitext/IncorrectBitextExample.java new file mode 100644 index 0000000..e877826 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/bitext/IncorrectBitextExample.java @@ -0,0 +1,64 @@ +/* 
LanguageTool, a natural language style checker + * Copyright (C) 2010 Marcin Miłkowski (http://www.languagetool.org) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.rules.bitext; + +import java.util.Arrays; +import java.util.List; + +import de.danielnaber.languagetool.bitext.StringPair; + +/** + * A text, typically a pair of sentences that contains an error. + * + * @since 1.0.1 + * @author Marcin Miłkowski + */ +public class IncorrectBitextExample { + + private StringPair example; + private List<String> corrections; + + public IncorrectBitextExample(final StringPair example) { + this.example = example; + } + + public IncorrectBitextExample(final StringPair example, final String[] corrections) { + this(example); + this.corrections = Arrays.asList(corrections); + } + + /** + * Return the example that contains the error. + */ + public StringPair getExample() { + return example; + } + + /** + * Return the possible corrections. May be null. 
+ */ + public List<String> getCorrections() { + return corrections; + } + + public String toString() { + return example.getSource() + "/ " + example.getTarget() + " " + corrections; + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/bitext/SameTranslationRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/bitext/SameTranslationRule.java new file mode 100644 index 0000000..c9e1ace --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/bitext/SameTranslationRule.java @@ -0,0 +1,88 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2010 Marcin Miłkowski (www.languagetool.org) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.rules.bitext; + +import java.io.IOException; + +import de.danielnaber.languagetool.AnalyzedSentence; +import de.danielnaber.languagetool.AnalyzedTokenReadings; +import de.danielnaber.languagetool.rules.RuleMatch; + +/** + * Checks if the translation for segments that have more than two words + * is different. 
+ * + * @author Marcin Miłkowski + * + */ +public class SameTranslationRule extends BitextRule { + + static final String MSG = "Source and target translation are the same!"; + + @Override + public String getDescription() { + return "Check if translation is the same as source"; + } + + @Override + public String getId() { + return "SAME_TRANSLATION"; + } + + public String getMessage() { + return MSG; + } + + /** + * This method makes no sense for bitext, return null?? + */ + @Override + public RuleMatch[] match(AnalyzedSentence text) throws IOException { + // TODO Auto-generated method stub + return null; + } + + @Override + public RuleMatch[] match(AnalyzedSentence sourceText, + AnalyzedSentence targetText) throws IOException { + + //This is just heuristics, checking word count + if (sourceText.getTokensWithoutWhitespace().length > 3 + && getPureText(sourceText).equals(getPureText(targetText))) { + final RuleMatch[] rm = new RuleMatch[1]; + final AnalyzedTokenReadings[] tokens = targetText.getTokens(); + final int len = tokens[tokens.length - 1].getStartPos() + tokens[tokens.length - 1].getToken().length(); + rm[0] = new RuleMatch(this, 1, len, MSG); + return rm; + } + return new RuleMatch[0]; + } + + private static String getPureText(AnalyzedSentence text) { + final StringBuilder sb = new StringBuilder(); + for (AnalyzedTokenReadings token : text.getTokens()) { + sb.append(token.getToken()); + } + return sb.toString(); + } + + public void reset() { + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/ca/AccentuacioReplaceRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/ca/AccentuacioReplaceRule.java new file mode 100644 index 0000000..eb5a3fa --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/ca/AccentuacioReplaceRule.java @@ -0,0 +1,90 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you 
can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.rules.ca; + +import java.io.IOException; +import java.util.Locale; +import java.util.ResourceBundle; + +import de.danielnaber.languagetool.rules.AbstractSimpleReplaceRule; + +/** + * A rule that matches words or phrases which should not be used and suggests + * correct ones instead. + * + * Catalan implementations for accentuation errors. + * This is basically the same as CastellanismesReplaceRule.java + * with a different error message. + * + * Loads the list of words from <code>rules/ca/accentuacio.txt</code>. + * + * TODO: Some of the entries are proper names (Greek gods, etc.), which + * aren't currently checked. 
+ * + * @author Jimmy O'Regan + * + * Based on pl/SimpleReplaceRule.java + */ +public class AccentuacioReplaceRule extends AbstractSimpleReplaceRule { + + public static final String CATALAN_ACCENTUACIO_REPLACE_RULE = "CA_ACCENTUACIO_REPLACE"; + + private static final String FILE_NAME = "/ca/accentuacio.txt"; + // locale used on case-conversion + private static final Locale CA_LOCALE = new Locale("ca"); + + public final String getFileName() { + return FILE_NAME; + } + + public AccentuacioReplaceRule(final ResourceBundle messages) throws IOException { + super(messages); + } + + public final String getId() { + return CATALAN_ACCENTUACIO_REPLACE_RULE; + } + + public String getDescription() { + return "Errors d'accentuació"; + } + + public String getShort() { + return "Accentuació"; + } + + public String getSuggestion() { + return " es un error d'accentuació, cal dir: "; + } + + /** + * use case-insensitive matching. + */ + public boolean isCaseSensitive() { + return false; + } + + /** + * locale used on case-conversion + */ + public Locale getLocale() { + return CA_LOCALE; + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/ca/CastellanismesReplaceRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/ca/CastellanismesReplaceRule.java new file mode 100644 index 0000000..3169b66 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/ca/CastellanismesReplaceRule.java @@ -0,0 +1,85 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.rules.ca; + +import java.io.IOException; +import java.util.Locale; +import java.util.ResourceBundle; + +import de.danielnaber.languagetool.rules.AbstractSimpleReplaceRule; + +/** + * A rule that matches words or phrases which should not be used and suggests + * correct ones instead. + * + * Catalan implementations for Castelianisms, kept separate for an individual + * error message. + * Loads the list of words from <code>rules/ca/castellanismes.txt</code>. + * + * @author Jimmy O'Regan + * + * Based on pl/SimpleReplaceRule.java + */ +public class CastellanismesReplaceRule extends AbstractSimpleReplaceRule { + + public static final String CATALAN_CASTELLANISMES_REPLACE_RULE = "CA_CASTELLANISMES_REPLACE"; + + private static final String FILE_NAME = "/ca/castellanismes.txt"; + // locale used on case-conversion + private static final Locale caLocale = new Locale("ca"); + + public final String getFileName() { + return FILE_NAME; + } + + public CastellanismesReplaceRule(final ResourceBundle messages) throws IOException { + super(messages); + } + + public final String getId() { + return CATALAN_CASTELLANISMES_REPLACE_RULE; + } + + public String getDescription() { + return "Barbarismes (Castellanismes)"; + } + + public String getShort() { + return "Castellanismes"; + } + + public String getSuggestion() { + return " es un castellanisme, cal dir: "; + } + + /** + * use case-insensitive matching. 
+ */ + public boolean isCaseSensitive() { + return false; + } + + /** + * locale used on case-conversion + */ + public Locale getLocale() { + return caLocale; + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/de/AgreementRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/de/AgreementRule.java new file mode 100644 index 0000000..8afff0c --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/de/AgreementRule.java @@ -0,0 +1,405 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.rules.de; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.ResourceBundle; +import java.util.Set; + +import de.danielnaber.languagetool.AnalyzedSentence; +import de.danielnaber.languagetool.AnalyzedTokenReadings; +import de.danielnaber.languagetool.JLanguageTool; +import de.danielnaber.languagetool.rules.Category; +import de.danielnaber.languagetool.rules.RuleMatch; +import de.danielnaber.languagetool.tagging.de.AnalyzedGermanToken; +import de.danielnaber.languagetool.tagging.de.AnalyzedGermanTokenReadings; +import de.danielnaber.languagetool.tagging.de.GermanTagger; +import de.danielnaber.languagetool.tagging.de.GermanToken; +import de.danielnaber.languagetool.tagging.de.GermanToken.POSType; +import de.danielnaber.languagetool.tools.StringTools; + +/** + * Simple agreement checker for German noun phrases. Checks agreement in: + * + * <ul> + * <li>DET/PRO NOUN: e.g. "mein Auto", "der Mann", "die Frau" (correct), "die Haus" (incorrect)</li> + * <li>DET/PRO ADJ NOUN: e.g. "der riesige Tisch" (correct), "die riesigen Tisch" (incorrect)</li> + * </ul> + * + * Note that this rule only checks agreement inside the noun phrase, not whether + * e.g. the correct case is used. For example, "Es ist das Haus dem Mann" is not + * detected as incorrect. + * + * @author Daniel Naber + */ +public class AgreementRule extends GermanRule { + + private static final String KASUS = "Kasus"; + private static final String NUMERUS = "Numerus"; + private static final String GENUS = "Genus"; + + /* + * City names are incoherently tagged in the Morphy data. 
To avoid + * false alarms on phrases like "das Berliner Auto" we have to + * explicitly add these adjective readings to "Berliner" and to all + * other potential city names: + */ + private static final String[] ADJ_READINGS = new String[] { + // singular: + "ADJ:NOM:SIN:MAS:GRU", "ADJ:NOM:SIN:NEU:GRU", "ADJ:NOM:SIN:FEM:GRU", // das Berliner Auto + "ADJ:GEN:SIN:MAS:GRU", "ADJ:GEN:SIN:NEU:GRU", "ADJ:GEN:SIN:FEM:GRU", // des Berliner Autos + "ADJ:DAT:SIN:MAS:GRU", "ADJ:DAT:SIN:NEU:GRU", "ADJ:DAT:SIN:FEM:GRU", // dem Berliner Auto + "ADJ:AKK:SIN:MAS:GRU", "ADJ:AKK:SIN:NEU:GRU", "ADJ:AKK:SIN:FEM:GRU", // den Berliner Bewohner + // plural: + "ADJ:NOM:PLU:MAS:GRU", "ADJ:NOM:PLU:NEU:GRU", "ADJ:NOM:PLU:FEM:GRU", // die Berliner Autos + "ADJ:GEN:PLU:MAS:GRU", "ADJ:GEN:PLU:NEU:GRU", "ADJ:GEN:PLU:FEM:GRU", // der Berliner Autos + "ADJ:DAT:PLU:MAS:GRU", "ADJ:DAT:PLU:NEU:GRU", "ADJ:DAT:PLU:FEM:GRU", // den Berliner Autos + "ADJ:AKK:PLU:MAS:GRU", "ADJ:AKK:PLU:NEU:GRU", "ADJ:AKK:PLU:FEM:GRU", // den Berliner Bewohnern + }; + + + private static final Set<String> REL_PRONOUN = new HashSet<String>(); + static { + REL_PRONOUN.add("der"); + REL_PRONOUN.add("die"); + REL_PRONOUN.add("das"); + REL_PRONOUN.add("dessen"); + REL_PRONOUN.add("deren"); + REL_PRONOUN.add("dem"); + REL_PRONOUN.add("den"); + REL_PRONOUN.add("welche"); + REL_PRONOUN.add("welcher"); + REL_PRONOUN.add("welchen"); + REL_PRONOUN.add("welchem"); + REL_PRONOUN.add("welches"); + } + + private static final Set<String> PREPOSITIONS = new HashSet<String>(); + static { + PREPOSITIONS.add("in"); + PREPOSITIONS.add("auf"); + PREPOSITIONS.add("an"); + PREPOSITIONS.add("ab"); + PREPOSITIONS.add("für"); + PREPOSITIONS.add("zu"); + // TODO: add more + } + + public AgreementRule(final ResourceBundle messages) { + if (messages != null) + super.setCategory(new Category(messages.getString("category_grammar"))); + } + + public String getId() { + return "DE_AGREEMENT"; + } + + public String getDescription() { + return "Kongruenz von 
Nominalphrasen (unvollständig!), z.B. 'mein kleiner(kleines) Haus'"; + } + + public RuleMatch[] match(final AnalyzedSentence text) { + final List<RuleMatch> ruleMatches = new ArrayList<RuleMatch>(); + final AnalyzedTokenReadings[] tokens = text.getTokensWithoutWhitespace(); + int pos = 0; + for (int i = 0; i < tokens.length; i++) { + //defaulting to the first reading + //TODO: check for all readings + //and replace GermanTokenReading + final String posToken = tokens[i].getAnalyzedToken(0).getPOSTag(); + if (posToken != null && posToken.equals(JLanguageTool.SENTENCE_START_TAGNAME)) + continue; + //AnalyzedGermanToken analyzedToken = new AnalyzedGermanToken(tokens[i]); + + final AnalyzedGermanTokenReadings analyzedToken = (AnalyzedGermanTokenReadings)tokens[i]; + final boolean relevantPronoun = isRelevantPronoun(tokens, i); + + boolean ignore = couldBeRelativeClause(tokens, i); + if (i > 0) { + final String prevToken = tokens[i-1].getToken().toLowerCase(); + if ((prevToken.equals("der") || prevToken.equals("die") || prevToken.equals("das")) + && tokens[i].getToken().equals("eine")) { + // TODO: "der eine Polizist" -> nicht ignorieren, sondern "der polizist" checken + ignore = true; + } + } + + // avoid false alarm on "nichts Gutes": + if (analyzedToken.getToken().equals("nichts")) { + ignore = true; + } + + if ((analyzedToken.hasReadingOfType(POSType.DETERMINER) || relevantPronoun) && !ignore) { + int tokenPos = i + 1; + if (tokenPos >= tokens.length) + break; + AnalyzedGermanTokenReadings nextToken = (AnalyzedGermanTokenReadings)tokens[tokenPos]; + nextToken = maybeAddAdjectiveReadings(nextToken, tokens, tokenPos); + if (nextToken.hasReadingOfType(POSType.ADJEKTIV)) { + tokenPos = i + 2; + if (tokenPos >= tokens.length) + break; + final AnalyzedGermanTokenReadings nextNextToken = (AnalyzedGermanTokenReadings)tokens[tokenPos]; + if (nextNextToken.hasReadingOfType(POSType.NOMEN)) { + // TODO: add a case (checkAdjNounAgreement) for special cases like "deren", + // e.g. 
"deren komisches Geschenke" isn't yet detected as incorrect + final RuleMatch ruleMatch = checkDetAdjNounAgreement((AnalyzedGermanTokenReadings)tokens[i], + nextToken, (AnalyzedGermanTokenReadings)tokens[i+2]); + if (ruleMatch != null) { + ruleMatches.add(ruleMatch); + } + } + } else if (nextToken.hasReadingOfType(POSType.NOMEN)) { + final RuleMatch ruleMatch = checkDetNounAgreement((AnalyzedGermanTokenReadings)tokens[i], + (AnalyzedGermanTokenReadings)tokens[i+1]); + if (ruleMatch != null) { + ruleMatches.add(ruleMatch); + } + } + } + + pos += tokens[i].getToken().length(); + } + return toRuleMatchArray(ruleMatches); + } + + private boolean isRelevantPronoun(AnalyzedTokenReadings[] tokens, int pos) { + final AnalyzedGermanTokenReadings analyzedToken = (AnalyzedGermanTokenReadings)tokens[pos]; + boolean relevantPronoun = analyzedToken.hasReadingOfType(POSType.PRONOMEN); + // avoid false alarms: + final String token = tokens[pos].getToken(); + if (pos > 0 && tokens[pos-1].getToken().equalsIgnoreCase("vor") && tokens[pos].getToken().equalsIgnoreCase("allem")) + relevantPronoun = false; + else if (token.equalsIgnoreCase("er") || token.equalsIgnoreCase("sie") || token.equalsIgnoreCase("es")) + relevantPronoun = false; + else if (token.equalsIgnoreCase("ich")) + relevantPronoun = false; + else if (token.equalsIgnoreCase("du")) + relevantPronoun = false; + else if (token.equalsIgnoreCase("dessen")) // avoid false alarm on: "..., dessen Leiche" + relevantPronoun = false; + else if (token.equalsIgnoreCase("deren")) + relevantPronoun = false; + else if (token.equalsIgnoreCase("sich")) // avoid false alarm + relevantPronoun = false; + else if (token.equalsIgnoreCase("unser")) // avoid false alarm "unser Produkt": TODO! 
+ relevantPronoun = false; + else if (token.equalsIgnoreCase("aller")) + relevantPronoun = false; + else if (token.equalsIgnoreCase("man")) + relevantPronoun = false; + else if (token.equalsIgnoreCase("beiden")) + relevantPronoun = false; + else if (token.equalsIgnoreCase("wessen")) + relevantPronoun = false; + else if (token.equalsIgnoreCase("a")) + relevantPronoun = false; + else if (token.equalsIgnoreCase("alle")) + relevantPronoun = false; + else if (token.equalsIgnoreCase("etwas")) // TODO: doesn't have case -- but don't just ignore + relevantPronoun = false; + else if (token.equalsIgnoreCase("was")) // TODO: doesn't have case -- but don't just ignore + relevantPronoun = false; + else if (token.equalsIgnoreCase("wer")) + relevantPronoun = false; + return relevantPronoun; + } + + // see the comment at ADJ_READINGS: + private AnalyzedGermanTokenReadings maybeAddAdjectiveReadings(AnalyzedGermanTokenReadings nextToken, + AnalyzedTokenReadings[] tokens, int tokenPos) { + final String nextTerm = nextToken.getToken(); + // Just a heuristic: nouns and proper nouns that end with "er" are considered + // city names: + if (nextTerm.endsWith("er") && tokens.length > tokenPos+1) { + final AnalyzedGermanTokenReadings nextNextToken = (AnalyzedGermanTokenReadings)tokens[tokenPos+1]; + final GermanTagger tagger = new GermanTagger(); + try { + final AnalyzedGermanTokenReadings nextATR = tagger.lookup(nextTerm.substring(0, nextTerm.length()-2)); + final AnalyzedGermanTokenReadings nextNextATR = tagger.lookup(nextNextToken.getToken()); + //System.err.println("nextATR: " + nextATR); + //System.err.println("nextNextATR: " + nextNextATR); + // "Münchner": special case as cutting off last two characters doesn't produce city name: + if ("Münchner".equals(nextTerm) || + (nextATR != null && + // tagging in Morphy for cities is not coherent: + (nextATR.hasReadingOfType(POSType.PROPER_NOUN) || nextATR.hasReadingOfType(POSType.NOMEN) && + nextNextATR != null && 
nextNextATR.hasReadingOfType(POSType.NOMEN)))) { + final AnalyzedGermanToken[] adjReadings = new AnalyzedGermanToken[ADJ_READINGS.length]; + for (int j = 0; j < ADJ_READINGS.length; j++) { + adjReadings[j] = new AnalyzedGermanToken(nextTerm, ADJ_READINGS[j], null); + } + nextToken = new AnalyzedGermanTokenReadings(adjReadings, nextToken.getStartPos()); + } + } catch (IOException e) { + throw new RuntimeException(e); + } + } + return nextToken; + } + + // TODO: improve this so it only returns true for real relative clauses + private boolean couldBeRelativeClause(AnalyzedTokenReadings[] tokens, int pos) { + boolean comma; + boolean relPronoun; + if (pos >= 1) { + // avoid false alarm: "Das Wahlrecht, das Frauen zugesprochen bekamen." etc: + comma = tokens[pos-1].getToken().equals(","); + final String term = tokens[pos].getToken().toLowerCase(); + relPronoun = REL_PRONOUN.contains(term); + if (comma && relPronoun) + return true; + } + if (pos >= 2) { + // avoid false alarm: "Der Mann, in dem quadratische Fische schwammen." + comma = tokens[pos-2].getToken().equals(","); + final String term1 = tokens[pos-1].getToken().toLowerCase(); + final String term2 = tokens[pos].getToken().toLowerCase(); + final boolean prep = PREPOSITIONS.contains(term1); + relPronoun = REL_PRONOUN.contains(term2); + return comma && prep && relPronoun; + } + return false; + } + + private RuleMatch checkDetNounAgreement(final AnalyzedGermanTokenReadings token1, + final AnalyzedGermanTokenReadings token2) { + // avoid false alarm: "Gebt ihm Macht." 
+ if (token1.getToken().equalsIgnoreCase("ihm")) + return null; + RuleMatch ruleMatch = null; + final Set<String> set1 = getAgreementCategories(token1); + if (set1 == null) + return null; // word not known, assume it's correct + final Set<String> set2 = getAgreementCategories(token2); + if (set2 == null) + return null; + /*System.err.println("#"+set1); + System.err.println("#"+set2); + System.err.println("");*/ + set1.retainAll(set2); + if (set1.size() == 0) { + // TODO: better error message than just 'agreement error' + final String msg = "Möglicherweise fehlende Übereinstimmung (Kongruenz) zwischen Artikel und Nomen " + + "bezüglich Kasus, Numerus oder Genus. Beispiel: 'meine Haus' statt 'mein Haus'"; + ruleMatch = new RuleMatch(this, token1.getStartPos(), + token2.getStartPos()+token2.getToken().length(), msg); + } + return ruleMatch; + } + + private RuleMatch checkDetAdjNounAgreement(final AnalyzedGermanTokenReadings token1, + final AnalyzedGermanTokenReadings token2, final AnalyzedGermanTokenReadings token3) { + final Set<String> relax = new HashSet<String>(); + final Set<String> set = retainCommonCategories(token1, token2, token3, relax); + RuleMatch ruleMatch = null; + if (set.size() == 0) { + // TODO: more detailed error message: + /*relax.add(KASUS); + set = retainCommonCategories(token1, token2, token3, relax); + if (set.size() > 0) { + System.err.println("KASUS!"); + } + relax.clear(); + relax.add(NUMERUS); + set = retainCommonCategories(token1, token2, token3, relax); + if (set.size() > 0) { + System.err.println("NUMERUS!"); + } + relax.clear(); + relax.add(GENUS); + set = retainCommonCategories(token1, token2, token3, relax); + if (set.size() > 0) { + System.err.println("GENUS!"); + }*/ + final String msg = "Möglicherweise fehlende Übereinstimmung (Kongruenz) zwischen Artikel, Adjektiv und " + + "Nomen bezüglich Kasus, Numerus oder Genus. 
Beispiel: 'mein kleiner Haus' " + + "statt 'mein kleines Haus'"; + ruleMatch = new RuleMatch(this, token1.getStartPos(), + token3.getStartPos()+token3.getToken().length(), msg); + } + return ruleMatch; + } + + private Set<String> retainCommonCategories(final AnalyzedGermanTokenReadings token1, + final AnalyzedGermanTokenReadings token2, final AnalyzedGermanTokenReadings token3, + Set<String> relax) { + final Set<String> set1 = getAgreementCategories(token1, relax); + if (set1 == null) + return null; // word not known, assume it's correct + final Set<String> set2 = getAgreementCategories(token2, relax); + if (set2 == null) + return null; + final Set<String> set3 = getAgreementCategories(token3, relax); + if (set3 == null) + return null; + /*System.err.println(token1.getToken()+"#"+set1); + System.err.println(token2.getToken()+"#"+set2); + System.err.println(token3.getToken()+"#"+set3); + System.err.println("");*/ + set1.retainAll(set2); + set1.retainAll(set3); + return set1; + } + + private Set<String> getAgreementCategories(final AnalyzedGermanTokenReadings aToken) { + return getAgreementCategories(aToken, new HashSet<String>()); + } + + /** Return Kasus, Numerus, Genus. */ + private Set<String> getAgreementCategories(final AnalyzedGermanTokenReadings aToken, Set<String> omit) { + final Set<String> set = new HashSet<String>(); + final List<AnalyzedGermanToken> readings = aToken.getGermanReadings(); + for (AnalyzedGermanToken reading : readings) { + if (reading.getCasus() == null && reading.getNumerus() == null && + reading.getGenus() == null) + continue; + if (reading.getGenus() == null) { + // "ich" and "wir" contains genus=ALG in the original data. 
Not sure if + // this is allowed, but expand this so "Ich Arbeiter" doesn't get flagged + // as incorrect: + set.add(makeString(reading.getCasus(), reading.getNumerus(), GermanToken.Genus.MASKULINUM, omit)); + set.add(makeString(reading.getCasus(), reading.getNumerus(), GermanToken.Genus.FEMININUM, omit)); + set.add(makeString(reading.getCasus(), reading.getNumerus(), GermanToken.Genus.NEUTRUM, omit)); + } else { + set.add(makeString(reading.getCasus(), reading.getNumerus(), reading.getGenus(), omit)); + } + } + return set; + } + + private String makeString(GermanToken.Kasus casus, GermanToken.Numerus num, GermanToken.Genus gen, + Set<String> omit) { + final List<String> l = new ArrayList<String>(); + if (casus != null && !omit.contains(KASUS)) + l.add(casus.toString()); + if (num != null && !omit.contains(NUMERUS)) + l.add(num.toString()); + if (gen != null && !omit.contains(GENUS)) + l.add(gen.toString()); + return StringTools.listToString(l, "/"); + } + + public void reset() { + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/de/CaseRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/de/CaseRule.java new file mode 100644 index 0000000..663e9ff --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/de/CaseRule.java @@ -0,0 +1,358 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.rules.de; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.ResourceBundle; +import java.util.Set; + +import de.danielnaber.languagetool.AnalyzedSentence; +import de.danielnaber.languagetool.AnalyzedTokenReadings; +import de.danielnaber.languagetool.JLanguageTool; +import de.danielnaber.languagetool.rules.Category; +import de.danielnaber.languagetool.rules.RuleMatch; +import de.danielnaber.languagetool.tagging.de.AnalyzedGermanToken; +import de.danielnaber.languagetool.tagging.de.AnalyzedGermanTokenReadings; +import de.danielnaber.languagetool.tagging.de.GermanTagger; +import de.danielnaber.languagetool.tagging.de.GermanToken; +import de.danielnaber.languagetool.tagging.de.GermanToken.POSType; +import de.danielnaber.languagetool.tools.StringTools; + +/** + * Check that adjectives and verbs are not written with an uppercase + * first letter (except at the start of a sentence) and cases + * like this: <tt>Das laufen fällt mir leicht.</tt> (<tt>laufen</tt> needs + * to be uppercased). + * + * @author Daniel Naber + */ +public class CaseRule extends GermanRule { + + private final GermanTagger tagger = new GermanTagger(); + + // wenn hinter diesen Wörtern ein Verb steht, ist es wohl ein substantiviertes Verb, + // muss also groß geschrieben werden: + private static final Set<String> nounIndicators = new HashSet<String>(); + static { + nounIndicators.add("das"); + nounIndicators.add("sein"); + //indicator.add("seines"); // TODO: ? + //nounIndicators.add("ihr"); // would cause false alarm e.g. "Auf ihr stehen die Ruinen..." 
+ nounIndicators.add("mein"); + nounIndicators.add("dein"); + nounIndicators.add("euer"); + //indicator.add("ihres"); + //indicator.add("ihren"); + } + + private static final Set<String> sentenceStartExceptions = new HashSet<String>(); + static { + sentenceStartExceptions.add("("); + sentenceStartExceptions.add(":"); + sentenceStartExceptions.add("\""); + sentenceStartExceptions.add("'"); + sentenceStartExceptions.add("„"); + sentenceStartExceptions.add("“"); + sentenceStartExceptions.add("«"); + sentenceStartExceptions.add("»"); + } + + private static final Set<String> exceptions = new HashSet<String>(); + static { + exceptions.add("Für"); // "das Für und Wider" + exceptions.add("Wider"); // "das Für und Wider" + exceptions.add("Nachts"); // "des Nachts", "eines Nachts" + exceptions.add("Genüge"); + exceptions.add("Zusage"); + exceptions.add("Nachfrage"); + exceptions.add("Sachverständiger"); + exceptions.add("Nr"); + exceptions.add("Sankt"); + exceptions.add("Toter"); + exceptions.add("Verantwortlicher"); + exceptions.add("Wichtiges"); + exceptions.add("Dr"); + exceptions.add("Prof"); + exceptions.add("Mr"); + exceptions.add("Mrs"); + exceptions.add("De"); // "De Morgan" etc + exceptions.add("Le"); // "Le Monde" etc + exceptions.add("Ihr"); + exceptions.add("Ihre"); + exceptions.add("Ihres"); + exceptions.add("Ihren"); + exceptions.add("Ihnen"); + exceptions.add("Ihrem"); + exceptions.add("Ihrer"); + exceptions.add("Sie"); + exceptions.add("Aus"); // "vor dem Aus stehen" + exceptions.add("Oder"); // der Fluss + exceptions.add("tun"); // "Sie müssen das tun" + exceptions.add("St"); // Paris St. Germain + exceptions.add("Las"); // Las Vegas, nicht "lesen" + exceptions.add("Folgendes"); // je nach Kontext groß (TODO)... 
+ exceptions.add("besonderes"); // je nach Kontext groß (TODO): "etwas Besonderes" + exceptions.add("Hundert"); // je nach Kontext groß (TODO) + exceptions.add("Tausend"); // je nach Kontext groß (TODO) + exceptions.add("Übrigen"); // je nach Kontext groß (TODO), z.B. "im Übrigen" + exceptions.add("Unvorhergesehenes"); // je nach Kontext groß (TODO), z.B. "etwas Unvorhergesehenes" + + exceptions.add("Englisch"); // TODO: alle Sprachen + exceptions.add("Deutsch"); + exceptions.add("Französisch"); + exceptions.add("Spanisch"); + exceptions.add("Italienisch"); + exceptions.add("Portugiesisch"); + exceptions.add("Dänisch"); + exceptions.add("Norwegisch"); + exceptions.add("Schwedisch"); + exceptions.add("Finnisch"); + exceptions.add("Holländisch"); + exceptions.add("Niederländisch"); + exceptions.add("Polnisch"); + exceptions.add("Tschechisch"); + exceptions.add("Arabisch"); + exceptions.add("Persisch"); + + exceptions.add("Schuld"); + exceptions.add("Erwachsener"); + exceptions.add("Jugendlicher"); + exceptions.add("Link"); + exceptions.add("Ausdrücke"); + exceptions.add("Landwirtschaft"); + exceptions.add("Flöße"); + exceptions.add("Wild"); + exceptions.add("Vorsitzender"); + exceptions.add("Mrd"); + exceptions.add("Links"); + // Änderungen an der Rechtschreibreform 2006 erlauben hier Großschreibung: + exceptions.add("Du"); + exceptions.add("Dir"); + exceptions.add("Dich"); + exceptions.add("Deine"); + exceptions.add("Deinen"); + exceptions.add("Deinem"); + exceptions.add("Deines"); + exceptions.add("Deiner"); + exceptions.add("Euch"); + + exceptions.add("Neuem"); + exceptions.add("Weitem"); + exceptions.add("Weiteres"); + exceptions.add("Langem"); + exceptions.add("Längerem"); + exceptions.add("Kurzem"); + exceptions.add("Schwarzes"); // Schwarzes Brett + exceptions.add("Goldener"); // Goldener Schnitt + // TODO: add more exceptions here + } + + private static final Set<String> myExceptionPhrases = new HashSet<String>(); + static { + // use proper upper/lowercase 
spelling here: + myExceptionPhrases.add("ohne Wenn und Aber"); + myExceptionPhrases.add("Große Koalition"); + myExceptionPhrases.add("Großen Koalition"); + myExceptionPhrases.add("im Großen und Ganzen"); + myExceptionPhrases.add("Im Großen und Ganzen"); + myExceptionPhrases.add("im Guten wie im Schlechten"); + myExceptionPhrases.add("Im Guten wie im Schlechten"); + } + + private static final Set<String> substVerbenExceptions = new HashSet<String>(); + static { + substVerbenExceptions.add("gehören"); + substVerbenExceptions.add("bedeutet"); // "und das bedeutet..." + substVerbenExceptions.add("ermöglicht"); // "und das ermöglicht..." + substVerbenExceptions.add("sollen"); + substVerbenExceptions.add("werden"); + substVerbenExceptions.add("dürfen"); + substVerbenExceptions.add("müssen"); + substVerbenExceptions.add("so"); + substVerbenExceptions.add("ist"); + substVerbenExceptions.add("können"); + substVerbenExceptions.add("muss"); + substVerbenExceptions.add("muß"); + substVerbenExceptions.add("wollen"); + substVerbenExceptions.add("habe"); + substVerbenExceptions.add("ein"); // nicht "einen" (Verb) + substVerbenExceptions.add("tun"); // "...dann wird er das tun." 
+ substVerbenExceptions.add("bestätigt"); + substVerbenExceptions.add("bestätigte"); + substVerbenExceptions.add("bestätigten"); + substVerbenExceptions.add("bekommen"); + } + + public CaseRule(final ResourceBundle messages) { + if (messages != null) + super.setCategory(new Category(messages.getString("category_case"))); + } + + public String getId() { + return "DE_CASE"; + } + + public String getDescription() { + return "Großschreibung von Nomen und substantivierten Verben"; + } + + public RuleMatch[] match(final AnalyzedSentence text) { + final List<RuleMatch> ruleMatches = new ArrayList<RuleMatch>(); + final AnalyzedTokenReadings[] tokens = text.getTokensWithoutWhitespace(); + + int pos = 0; + boolean prevTokenIsDas = false; + for (int i = 0; i < tokens.length; i++) { + //FIXME: defaulting to the first analysis + //don't know if it's safe + final String posToken = tokens[i].getAnalyzedToken(0).getPOSTag(); + if (posToken != null && posToken.equals(JLanguageTool.SENTENCE_START_TAGNAME)) + continue; + if (i == 1) { // don't care about first word, UppercaseSentenceStartRule does this already + if (nounIndicators.contains(tokens[i].getToken().toLowerCase())) { + prevTokenIsDas = true; + } + continue; + } + final AnalyzedGermanTokenReadings analyzedToken = (AnalyzedGermanTokenReadings)tokens[i]; + final String token = analyzedToken.getToken(); + List<AnalyzedGermanToken> readings = analyzedToken.getGermanReadings(); + AnalyzedGermanTokenReadings analyzedGermanToken2 = null; + + boolean isBaseform = false; + if (analyzedToken.getReadingsLength() > 1 && token.equals(analyzedToken.getAnalyzedToken(0).getLemma())) { + isBaseform = true; + } + if ((readings == null || analyzedToken.getAnalyzedToken(0).getPOSTag() == null || analyzedToken.hasReadingOfType(GermanToken.POSType.VERB)) + && isBaseform) { + // no match, e.g. 
for "Groß": try if there's a match for the lowercased word: + + try { + analyzedGermanToken2 = tagger.lookup(token.toLowerCase()); + if (analyzedGermanToken2 != null) { + readings = analyzedGermanToken2.getGermanReadings(); + } + } catch (IOException e) { + throw new RuntimeException(e); + } + if (prevTokenIsDas) { + // e.g. essen -> Essen + final String newToken = StringTools.uppercaseFirstChar(token); + try { + analyzedGermanToken2 = tagger.lookup(newToken); + //analyzedGermanToken2.hasReadingOfType(GermanToken.POSType.VERB) + } catch (IOException e) { + throw new RuntimeException(e); + } + if (Character.isLowerCase(token.charAt(0)) && !substVerbenExceptions.contains(token)) { + final String msg = "Substantivierte Verben werden groß geschrieben."; + final RuleMatch ruleMatch = new RuleMatch(this, tokens[i].getStartPos(), + tokens[i].getStartPos()+token.length(), msg); + final String word = tokens[i].getToken(); + final String fixedWord = StringTools.uppercaseFirstChar(word); + ruleMatch.setSuggestedReplacement(fixedWord); + ruleMatches.add(ruleMatch); + } + } + } + prevTokenIsDas = nounIndicators.contains(tokens[i].getToken().toLowerCase()); + if (readings == null) + continue; + final boolean hasNounReading = analyzedToken.hasReadingOfType(GermanToken.POSType.NOMEN); + if (hasNounReading) // it's the spell checker's task to check that nouns are uppercase + continue; + try { + // TODO: this lookup should only happen once: + analyzedGermanToken2 = tagger.lookup(token.toLowerCase()); + } catch (IOException e) { + throw new RuntimeException(e); + } + if (analyzedToken.getAnalyzedToken(0).getPOSTag() == null && analyzedGermanToken2 == null) { + continue; + } + if (analyzedToken.getAnalyzedToken(0).getPOSTag() == null && analyzedGermanToken2 != null + && analyzedGermanToken2.getAnalyzedToken(0).getPOSTag() == null) { + // unknown word, probably a name etc + continue; + } + + if (Character.isUpperCase(token.charAt(0)) && + token.length() > 1 && // length limit = ignore 
abbreviations + !sentenceStartExceptions.contains(tokens[i-1].getToken()) && + !StringTools.isAllUppercase(token) && + !exceptions.contains(token) && + !analyzedToken.hasReadingOfType(POSType.PROPER_NOUN) && + !analyzedToken.isSentenceEnd() && + !isExceptionPhrase(i, tokens)) { + final String msg = "Außer am Satzanfang werden nur Nomen und Eigennamen groß geschrieben"; + final RuleMatch ruleMatch = new RuleMatch(this, tokens[i].getStartPos(), + tokens[i].getStartPos()+token.length(), msg); + final String word = tokens[i].getToken(); + final String fixedWord = Character.toLowerCase(word.charAt(0)) + word.substring(1); + ruleMatch.setSuggestedReplacement(fixedWord); + ruleMatches.add(ruleMatch); + } + pos += token.length(); + } + return toRuleMatchArray(ruleMatches); + } + + private boolean isExceptionPhrase(int i, AnalyzedTokenReadings[] tokens) { + // TODO: speed up? + for (String exc : myExceptionPhrases) { + final String[] parts = exc.split(" "); + for (int j = 0; j < parts.length; j++) { + if (parts[j].equals(tokens[i].getToken())) { + /*System.err.println("*******"+j + " of " + parts.length + ": " + parts[j]); + System.err.println("start:" + tokens[i-j].getToken()); + System.err.println("end:" + tokens[i-j+parts.length-1].getToken());*/ + final int startIndex = i-j; + if (compareLists(tokens, startIndex, startIndex+parts.length-1, parts)) { + return true; + } + } + } + } + return false; + } + + private boolean compareLists(AnalyzedTokenReadings[] tokens, int startIndex, int endIndex, String[] parts) { + if (startIndex < 0) + return false; + int i = 0; + for (int j = startIndex; j <= endIndex; j++) { + //System.err.println("**" +tokens[j].getToken() + " <-> "+ parts[i]); + if (i >= parts.length) + return false; + if (!tokens[j].getToken().equals(parts[i])) { + return false; + } + i++; + } + return true; + } + + public void reset() { + // nothing + } + +}
\ No newline at end of file diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/de/CompoundRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/de/CompoundRule.java new file mode 100644 index 0000000..f180acc --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/de/CompoundRule.java @@ -0,0 +1,53 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2006 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.rules.de; + +import java.io.IOException; +import java.util.ResourceBundle; + +import de.danielnaber.languagetool.JLanguageTool; +import de.danielnaber.languagetool.rules.AbstractCompoundRule; + +/** + * Checks that compounds (if in the list) are not written as separate words. 
+ * + * @author Daniel Naber + */ +public class CompoundRule extends AbstractCompoundRule { + + private static final String FILE_NAME = "/de/compounds.txt"; + + public CompoundRule(final ResourceBundle messages) throws IOException { + super(messages); + loadCompoundFile(JLanguageTool.getDataBroker().getFromResourceDirAsStream(FILE_NAME), "UTF-8"); + super.setShort("Hyphenation problem"); + super.setMsg("Dieses Kompositum wird mit Bindestrich geschrieben.", + "Dieses Kompositum wird zusammengeschrieben.", + "Dieses Kompositum wird zusammen oder mit Bindestrich geschrieben."); + } + + + public String getId() { + return "DE_COMPOUNDS"; + } + + public String getDescription() { + return "Zusammenschreibung von Komposita, z.B. 'CD-ROM' statt 'CD ROM'"; + } +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/de/DashRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/de/DashRule.java new file mode 100644 index 0000000..18bb670 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/de/DashRule.java @@ -0,0 +1,84 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.rules.de; + +import java.util.ArrayList; +import java.util.List; +import java.util.ResourceBundle; + +import de.danielnaber.languagetool.AnalyzedSentence; +import de.danielnaber.languagetool.AnalyzedTokenReadings; +import de.danielnaber.languagetool.rules.Category; +import de.danielnaber.languagetool.rules.RuleMatch; + +/** + * Prüft, dass in Bindestrich-Komposita kein Leerzeichen eingefügt wird (wie z.B. in 'Diäten- Erhöhung'). + * + * @author Daniel Naber + */ +public class DashRule extends GermanRule { + + public DashRule(final ResourceBundle messages) { + if (messages != null) + super.setCategory(new Category(messages.getString("category_misc"))); + } + + public String getId() { + return "DE_DASH"; + } + + public String getDescription() { + return "Keine Leerzeichen in Bindestrich-Komposita (wie z.B. 
in 'Diäten- Erhöhung')"; + } + + public RuleMatch[] match(final AnalyzedSentence text) { + final List<RuleMatch> ruleMatches = new ArrayList<RuleMatch>(); + final AnalyzedTokenReadings[] tokens = text.getTokensWithoutWhitespace(); + int pos = 0; + String prevToken = null; + for (int i = 0; i < tokens.length; i++) { + final String token = tokens[i].getToken(); + if (tokens[i].isWhitespace()) { + // ignore + continue; + } + if (prevToken != null && !prevToken.equals("-") && prevToken.indexOf("--") == -1 + && prevToken.indexOf("–-") == -1 // first char is some special kind of dash, found in Wikipedia + && prevToken.endsWith("-")) { + final char firstChar = token.charAt(0); + if (Character.isUpperCase(firstChar)) { + final String msg = "Möglicherweise fehlt ein 'und' oder es wurde nach dem Wort " + + "ein überflüssiges Leerzeichen eingefügt."; + final RuleMatch ruleMatch = new RuleMatch(this, tokens[i-1].getStartPos(), + tokens[i-1].getStartPos()+prevToken.length()+1, msg); + ruleMatch.setSuggestedReplacement(tokens[i-1].getToken()); + ruleMatches.add(ruleMatch); + } + } + prevToken = token; + pos += token.length(); + } + return toRuleMatchArray(ruleMatches); + } + + public void reset() { + // nothing + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/de/GermanLemmatizer.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/de/GermanLemmatizer.java new file mode 100644 index 0000000..ddcac98 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/de/GermanLemmatizer.java @@ -0,0 +1,84 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.rules.de; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.util.HashMap; +import java.util.Map; + +import de.danielnaber.languagetool.JLanguageTool; + +/** + * Trivial German lemmatizer that can simply find the baseforms of + * those fullforms listed in <code>rules/de/fullform2baseform.txt</code>. + * + * @author Daniel Naber + */ +class GermanLemmatizer { + + private static final String FILE_NAME = "/de/fullform2baseform.txt"; + private static final String FILE_ENCODING = "utf-8"; + + private final Map<String, String> fullform2baseform; + + GermanLemmatizer() throws IOException { + fullform2baseform = loadWords(JLanguageTool.getDataBroker().getFromRulesDirAsStream(FILE_NAME)); + } + + String getBaseform(final String fullform) { + return fullform2baseform.get(fullform); + } + + private Map<String, String> loadWords(InputStream file) throws IOException { + final Map<String, String> map = new HashMap<String, String>(); + InputStreamReader isr = null; + BufferedReader br = null; + try { + isr = new InputStreamReader(file, FILE_ENCODING); + br = new BufferedReader(isr); + String line; + while ((line = br.readLine()) != null) { + line = line.trim(); + if (line.length() < 1) { //ignore empty lines + continue; + } + if (line.charAt(0) == '#') { // ignore comments + continue; + } + final String[] parts = line.split(":"); + if (parts.length != 2) { + throw new 
IOException("Format error in file " +JLanguageTool.getDataBroker().getFromRulesDirAsUrl(FILE_NAME)+", line: " + line); + } + final String baseform = parts[0]; + final String[] fullforms = parts[1].split(","); + for (String fullform : fullforms) { + map.put(fullform.trim(), baseform); + } + } + } finally { + if (br != null) br.close(); + if (isr != null) isr.close(); + } + return map; + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/de/GermanRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/de/GermanRule.java new file mode 100644 index 0000000..1fca395 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/de/GermanRule.java @@ -0,0 +1,30 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.rules.de; + +import de.danielnaber.languagetool.rules.Rule; + +/** + * Abstract base class for rules for the German language. 
+ * + * @author Daniel Naber + */ +public abstract class GermanRule extends Rule { + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/de/GermanWordRepeatRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/de/GermanWordRepeatRule.java new file mode 100644 index 0000000..55f98b4 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/de/GermanWordRepeatRule.java @@ -0,0 +1,39 @@ +/* + * Created on 03.10.2009 + */ +package de.danielnaber.languagetool.rules.de; + +import java.util.ResourceBundle; + +import de.danielnaber.languagetool.AnalyzedTokenReadings; +import de.danielnaber.languagetool.Language; +import de.danielnaber.languagetool.rules.WordRepeatRule; + +/** + * Check if a word is repeated twice, taking into account an exception + * for German where e.g. "..., die die ..." is often okay. + * + * @author Daniel Naber + */ +public class GermanWordRepeatRule extends WordRepeatRule { + + public GermanWordRepeatRule(final ResourceBundle messages, final Language language) { + super(messages, language); + } + + @Override + public String getId() { + return "GERMAN_WORD_REPEAT_RULE"; + } + + @Override + public boolean ignore(final AnalyzedTokenReadings[] tokens, final int position) { + // Don't mark error for cases like: + // "wie Honda und Samsung, die die Bezahlung ihrer Firmenchefs..." 
+ if (position >= 2 && ",".equals(tokens[position - 2].getToken())) { + return true; + } + return false; + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/de/WiederVsWiderRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/de/WiederVsWiderRule.java new file mode 100644 index 0000000..ea1c2aa --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/de/WiederVsWiderRule.java @@ -0,0 +1,91 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.rules.de; + +import java.util.ArrayList; +import java.util.List; +import java.util.ResourceBundle; + +import de.danielnaber.languagetool.AnalyzedSentence; +import de.danielnaber.languagetool.AnalyzedTokenReadings; +import de.danielnaber.languagetool.rules.Category; +import de.danielnaber.languagetool.rules.RuleMatch; + +/** + * Check incorrect use of "spiegelt ... wider", namely using "wieder" instead + * of "wider", e.g. in "Das spiegelt die Situation wieder" (incorrect). 
+ * + * @author Daniel Naber + */ +public class WiederVsWiderRule extends GermanRule { + + public WiederVsWiderRule(ResourceBundle messages) { + if (messages != null) + super.setCategory(new Category(messages.getString("category_typo"))); + } + + public String getId() { + return "DE_WIEDER_VS_WIDER"; + } + + public String getDescription() { + return "Möglicher Tippfehler 'spiegeln ... wieder(wider)'"; + } + + public RuleMatch[] match(AnalyzedSentence text) { + final List<RuleMatch> ruleMatches = new ArrayList<RuleMatch>(); + final AnalyzedTokenReadings[] tokens = text.getTokens(); + int pos = 0; + boolean foundSpiegelt = false; + boolean foundWieder = false; + boolean foundWider = false; + for (AnalyzedTokenReadings token1 : tokens) { + final String token = token1.getToken(); + if (token.trim().equals("")) { + // ignore + } else { + if (token.equalsIgnoreCase("spiegelt") || token.equalsIgnoreCase("spiegeln") || token.equalsIgnoreCase("spiegelte") + || token.equalsIgnoreCase("spiegelten") || token.equalsIgnoreCase("spiegelst")) { + foundSpiegelt = true; + } else if (token.equalsIgnoreCase("wieder") && foundSpiegelt) { + foundWieder = true; + } else if (token.equalsIgnoreCase("wider") && foundSpiegelt) { + foundWider = true; + } + if (foundSpiegelt && foundWieder && !foundWider) { + final String msg = "'wider' in 'widerspiegeln' wird mit 'i' statt mit 'ie' " + + "geschrieben, z.B. 
'Das spiegelt die Situation gut wider.'"; + final RuleMatch ruleMatch = new RuleMatch(this, pos, pos + token.length(), msg); + ruleMatch.setSuggestedReplacement("wider"); + ruleMatches.add(ruleMatch); + foundSpiegelt = false; + foundWieder = false; + foundWider = false; + } + } + pos += token.length(); + } + return toRuleMatchArray(ruleMatches); + } + + public void reset() { + // nothing + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/de/WordCoherencyRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/de/WordCoherencyRule.java new file mode 100644 index 0000000..2bba43a --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/de/WordCoherencyRule.java @@ -0,0 +1,156 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.rules.de; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.ResourceBundle; + +import de.danielnaber.languagetool.AnalyzedSentence; +import de.danielnaber.languagetool.AnalyzedToken; +import de.danielnaber.languagetool.AnalyzedTokenReadings; +import de.danielnaber.languagetool.JLanguageTool; +import de.danielnaber.languagetool.rules.Category; +import de.danielnaber.languagetool.rules.RuleMatch; + +/** + * A rule that matches words for which two different spellings are used + * throughout the document. Currently only implemented for German. Loads + * the relevant word from <code>rules/de/coherency.txt</code>. + * + * <p>Note that this should not be used for language variations like + * American English vs. British English or German "alte Rechtschreibung" + * vs. "neue Rechtschreibung" -- that's the task of a spell checker. + * + * @author Daniel Naber + */ +public class WordCoherencyRule extends GermanRule { + + private static final String FILE_NAME = "/de/coherency.txt"; + private static final String FILE_ENCODING = "utf-8"; + + private final Map<String, String> relevantWords; // e.g. "aufwendig -> aufwändig" + private Map<String, RuleMatch> shouldNotAppearWord = new HashMap<String, RuleMatch>(); // e.g. 
aufwändig -> RuleMatch of aufwendig + + private final GermanLemmatizer germanLemmatizer; + + public WordCoherencyRule(ResourceBundle messages) throws IOException { + if (messages != null) + super.setCategory(new Category(messages.getString("category_misc"))); + relevantWords = loadWords(JLanguageTool.getDataBroker().getFromRulesDirAsStream(FILE_NAME)); + germanLemmatizer = new GermanLemmatizer(); + } + + public String getId() { + return "DE_WORD_COHERENCY"; + } + + public String getDescription() { + return "Einheitliche Schreibweise für Wörter mit mehr als einer korrekten Schreibweise"; + } + + public RuleMatch[] match(AnalyzedSentence text) { + final List<RuleMatch> ruleMatches = new ArrayList<RuleMatch>(); + final AnalyzedTokenReadings[] tokens = text.getTokens(); + int pos = 0; + for (AnalyzedTokenReadings tmpToken : tokens) { + //TODO: definitely should be changed + //if the general lemmatizer is working + //defaulting to the first element because the + //general German lemmatizer is not (yet) there + String token = tmpToken.getToken(); + if (tmpToken.isWhitespace()) { + // ignore + } else { + final String origToken = token; + final List<AnalyzedToken> readings = tmpToken.getReadings(); + // TODO: in theory we need to care about the other readings, too: + if (readings != null && readings.size() > 0) { + final String baseform = readings.get(0).getLemma(); + if (baseform != null) { + token = baseform; + } else { + // not all words are known by the Tagger (esp. 
compounds), so use the + // file lookup: + final String manualLookup = germanLemmatizer.getBaseform(origToken); + if (manualLookup != null) + token = manualLookup; + } + } + if (shouldNotAppearWord.containsKey(token)) { + final RuleMatch otherMatch = shouldNotAppearWord.get(token); + final String otherSpelling = otherMatch.getMessage(); + final String msg = "'" + token + "' und '" + otherSpelling + + "' sollten nicht gleichzeitig benutzt werden"; + final RuleMatch ruleMatch = new RuleMatch(this, pos, pos + origToken.length(), msg); + ruleMatch.setSuggestedReplacement(otherSpelling); + ruleMatches.add(ruleMatch); + } else if (relevantWords.containsKey(token)) { + final String shouldNotAppear = relevantWords.get(token); + // only used to display this spelling variation if the other one really occurs: + final RuleMatch potentialRuleMatch = new RuleMatch(this, pos, pos + origToken.length(), token); + shouldNotAppearWord.put(shouldNotAppear, potentialRuleMatch); + } + } + pos += tmpToken.getToken().length(); + } + return toRuleMatchArray(ruleMatches); + } + + private Map<String, String> loadWords(InputStream file) throws IOException { + final Map<String, String> map = new HashMap<String, String>(); + InputStreamReader isr = null; + BufferedReader br = null; + try { + isr = new InputStreamReader(file, FILE_ENCODING); + br = new BufferedReader(isr); + String line; + while ((line = br.readLine()) != null) { + line = line.trim(); + if (line.length() < 1) { + continue; + } + if (line.charAt(0) == '#') { // ignore comments + continue; + } + final String[] parts = line.split(";"); + if (parts.length != 2) { + throw new IOException("Format error in file " + JLanguageTool.getDataBroker().getFromRulesDirAsUrl(FILE_NAME) + ", line: " + line); + } + map.put(parts[0], parts[1]); + map.put(parts[1], parts[0]); + } + } finally { + if (br != null) br.close(); + if (isr != null) isr.close(); + } + return map; + } + + public void reset() { + shouldNotAppearWord = new HashMap<String, 
RuleMatch>(); + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/en/AvsAnRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/en/AvsAnRule.java new file mode 100644 index 0000000..ae02ef5 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/en/AvsAnRule.java @@ -0,0 +1,251 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.rules.en; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.util.ArrayList; +import java.util.List; +import java.util.ResourceBundle; +import java.util.TreeSet; + +import de.danielnaber.languagetool.AnalyzedSentence; +import de.danielnaber.languagetool.AnalyzedTokenReadings; +import de.danielnaber.languagetool.JLanguageTool; +import de.danielnaber.languagetool.rules.Category; +import de.danielnaber.languagetool.rules.RuleMatch; +import de.danielnaber.languagetool.tools.StringTools; + +/** + * Check if the determiner (if any) preceding a word is: + * <ul> + * <li><i>an</i> if the next word starts with a vowel + * <li><i>a</i> if the next word 
does not start with a vowel + * </ul> + * This rule loads some exceptions from external files (e.g. <i>an hour</i>). + * + * @author Daniel Naber + */ +public class AvsAnRule extends EnglishRule { + + private static final String FILENAME_A = "/en/det_a.txt"; + private static final String FILENAME_AN = "/en/det_an.txt"; + + private final TreeSet<String> requiresA; + private final TreeSet<String> requiresAn; + + public AvsAnRule(final ResourceBundle messages) throws IOException { + if (messages != null) { + super.setCategory(new Category(messages.getString("category_misc"))); + } + requiresA = loadWords(JLanguageTool.getDataBroker().getFromRulesDirAsStream(FILENAME_A)); + requiresAn = loadWords(JLanguageTool.getDataBroker().getFromRulesDirAsStream(FILENAME_AN)); + } + + @Override + public String getId() { + return "EN_A_VS_AN"; + } + + @Override + public String getDescription() { + return "Use of 'a' vs. 'an'"; + } + + @Override + public RuleMatch[] match(final AnalyzedSentence text) { + final List<RuleMatch> ruleMatches = new ArrayList<RuleMatch>(); + final AnalyzedTokenReadings[] tokens = text.getTokensWithoutWhitespace(); + String prevToken = ""; + int prevPos = 0; + //ignoring token 0, i.e., SENT_START + for (int i = 1; i < tokens.length; i++) { + String token = tokens[i].getToken(); + boolean doesRequireA = false; + boolean doesRequireAn = false; + // check for exceptions: + boolean isException = false; + final String[] parts = token.split("[-']"); // for example, in "one-way" only "one" is relevant + if (parts.length >= 1 && + !parts[0].equalsIgnoreCase("a")) { // avoid false alarm on "A-levels are..." + token = parts[0]; + } + token = token.replaceAll("[^a-zA-Z0-9\\.']", ""); // e.g. 
>>an "industry party"<< + if (StringTools.isEmpty(token)) { + continue; + } + final char tokenFirstChar = token.charAt(0); + if (requiresA.contains(token.toLowerCase()) || requiresA.contains(token)) { + isException = true; + doesRequireA = true; + } + if (requiresAn.contains(token.toLowerCase()) || requiresAn.contains(token)) { + if (isException) { + throw new IllegalStateException(token + " is listed in both det_a.txt and det_an.txt"); + } + isException = true; + doesRequireAn = true; + } + + if (!isException) { + if (StringTools.isAllUppercase(token) || StringTools.isMixedCase(token)) { + // we don't know how all-uppercase and mixed case words (often abbreviations) are pronounced, + // so never complain about these: + doesRequireAn = false; + doesRequireA = false; + } else if (isVowel(tokenFirstChar)) { + doesRequireAn = true; + } else { + doesRequireA = true; + } + } + //System.err.println(prevToken + " " +token + ", a="+doesRequireA + ", an="+doesRequireAn); + String msg = null; + if (prevToken.equalsIgnoreCase("a") && doesRequireAn) { + String replacement = "an"; + if (prevToken.equals("A")) { + replacement = "An"; + } + msg = "Use <suggestion>" +replacement+ "</suggestion> instead of '" +prevToken+ "' if the following "+ + "word starts with a vowel sound, e.g. 'an article', " + + "'an hour'"; + } else if (prevToken.equalsIgnoreCase("an") && doesRequireA) { + String replacement = "a"; + if (prevToken.equals("An")) { + replacement = "A"; + } + msg = "Use <suggestion>" +replacement+ "</suggestion> instead of '" +prevToken+ "' if the following "+ + "word doesn't start with a vowel sound, e.g. 
'a sentence', " + + "'a university'"; + } + if (msg != null) { + final RuleMatch ruleMatch = new RuleMatch(this, prevPos, prevPos+prevToken.length(), msg, "Wrong article"); + ruleMatches.add(ruleMatch); + } + if (tokens[i].hasPosTag("DT")) { + prevToken = token; + prevPos = tokens[i].getStartPos(); + } else { + prevToken = ""; + } + } + return toRuleMatchArray(ruleMatches); + } + + /** + * Adds "a" or "an" to the English noun. + * Used for suggesting the proper form of the + * indefinite article. + * @param noun Word that needs an article. + * @return String containing the word with a determiner, + * or just the word if the word is an abbreviation. + */ + public final String suggestAorAn(final String noun) { + String word = noun; + boolean doesRequireA = false; + boolean doesRequireAn = false; + // check for exceptions: + boolean isException = false; + final String[] parts = word.split("[-']"); // for example, in "one-way" only "one" is relevant + if (parts.length >= 1 && + !parts[0].equalsIgnoreCase("a")) { // avoid false alarm on "A-levels are..." + word = parts[0]; + } + //html entities! + word = word.replaceAll(""|&|<|>|[^a-zA-Z0-9]", ""); // e.g. 
>>an "industry party"<< + if (StringTools.isEmpty(word)) { + return word; + } + final char tokenFirstChar = word.charAt(0); + if (requiresA.contains(word.toLowerCase()) || requiresA.contains(word)) { + isException = true; + doesRequireA = true; + } + if (requiresAn.contains(word.toLowerCase()) || requiresAn.contains(word)) { + if (isException) { + throw new IllegalStateException(word + " is listed in both det_a.txt and det_an.txt"); + } + isException = true; + doesRequireAn = true; + } + if (!isException) { + if (StringTools.isAllUppercase(word) || StringTools.isMixedCase(word)) { + // we don't know how all-uppercase words (often abbreviations) are pronounced, + // so never complain about these: + doesRequireAn = false; + doesRequireA = false; + } else if (isVowel(tokenFirstChar)) { + doesRequireAn = true; + } else { + doesRequireA = true; + } + } + if (doesRequireA) { + return "a " + noun; + } else if (doesRequireAn) { + return "an " + noun; + } else { + return noun; + } + } + + private static boolean isVowel(char c) { + c = Character.toLowerCase(c); + return c == 'a' || c == 'e' || c == 'i' || c == 'o' || c == 'u'; + } + + /** + * Load words, normalized to lowercase. 
+ */ + private TreeSet<String> loadWords(final InputStream file) throws IOException { + BufferedReader br = null; + final TreeSet<String> set = new TreeSet<String>(); + try { + br = new BufferedReader(new InputStreamReader(file)); + String line; + while ((line = br.readLine()) != null) { + line = line.trim(); + if (line.length() < 1) { + continue; + } + if (line.charAt(0) == '#') { + continue; + } + if (line.charAt(0) == '*') { + set.add(line.substring(1)); + } else { + set.add(line.toLowerCase()); + } + } + } finally { + if (br != null) { + br.close(); + } + } + return set; + } + + @Override + public void reset() { + // nothing + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/en/CompoundRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/en/CompoundRule.java new file mode 100644 index 0000000..0e01523 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/en/CompoundRule.java @@ -0,0 +1,55 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2006 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.rules.en; + +import java.io.IOException; +import java.util.ResourceBundle; + +import de.danielnaber.languagetool.JLanguageTool; +import de.danielnaber.languagetool.rules.AbstractCompoundRule; + +/** + * Checks that compounds (if in the list) are not written as separate words. + * + * @author Marcin Miłkowski, based on code by Daniel Naber + */ + +public class CompoundRule extends AbstractCompoundRule { + + private static final String FILE_NAME = "/en/compounds.txt"; + + public CompoundRule(final ResourceBundle messages) throws IOException { + super(messages); + loadCompoundFile(JLanguageTool.getDataBroker().getFromResourceDirAsStream(FILE_NAME), "UTF-8"); + super.setShort("Hyphenation problem"); + super.setMsg("This word is normally spelled with hyphen.", + "This word is normally spelled as one.", + "This expression is normally spelled as one or with hyphen."); + } + + public String getId() { + return "EN_COMPOUNDS"; + } + + public String getDescription() { + return "Hyphenated words, e.g., 'case-sensitive' instead of 'case sensitive'"; + } + + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/en/EnglishRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/en/EnglishRule.java new file mode 100644 index 0000000..cd0036d --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/en/EnglishRule.java @@ -0,0 +1,30 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the 
License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.rules.en; + +import de.danielnaber.languagetool.rules.Rule; + +/** + * Abstract base class for rules for the English language. + * + * @author Daniel Naber + */ +public abstract class EnglishRule extends Rule { + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/en/EnglishUnpairedBracketsRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/en/EnglishUnpairedBracketsRule.java new file mode 100644 index 0000000..4b32c05 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/en/EnglishUnpairedBracketsRule.java @@ -0,0 +1,89 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2010 Daniel Naber (http://www.languagetool.org) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ + +package de.danielnaber.languagetool.rules.en; + +import java.util.ResourceBundle; +import java.util.regex.Pattern; + +import de.danielnaber.languagetool.AnalyzedTokenReadings; +import de.danielnaber.languagetool.Language; +import de.danielnaber.languagetool.rules.GenericUnpairedBracketsRule; + +public class EnglishUnpairedBracketsRule extends GenericUnpairedBracketsRule { + + private static final String[] EN_START_SYMBOLS = { "[", "(", "{", "“", "\"", "'" }; + private static final String[] EN_END_SYMBOLS = { "]", ")", "}", "”", "\"", "'" }; + + private static final Pattern NUMBER = Pattern.compile("\\d+"); + + public EnglishUnpairedBracketsRule(final ResourceBundle messages, + final Language language) { + super(messages, language); + startSymbols = EN_START_SYMBOLS; + endSymbols = EN_END_SYMBOLS; + } + + public String getId() { + return "EN_UNPAIRED_BRACKETS"; + } + + protected boolean isNoException(final String token, + final AnalyzedTokenReadings[] tokens, final int i, final int j, final boolean precSpace, + final boolean follSpace) { + + +//TODO: add an', o', 'till, 'tain't, 'cept, 'fore in the disambiguator +//and mark up as contractions somehow +// add exception for dates like '52 + + if (i <= 1) { + return true; + } + + if (!precSpace && follSpace) { + // exception for English inches, e.g., 20" + if ("\"".equals(token) + && NUMBER.matcher(tokens[i - 1].getToken()).matches()) { + return false; + } + // Exception for English plural Saxon genetive + // current disambiguation scheme is a bit too greedy + // for adjectives + if ("'".equals(token) && tokens[i].hasPosTag("POS")) { + return false; + } + // puttin' on the Ritz + if ("'".equals(token) && tokens[i - 1].hasPosTag("VBG") + && tokens[i - 1].getToken().endsWith("in")) { + 
return false; + } + } + if (precSpace && !follSpace) { + // hold 'em! + if ("'".equals(token) && i + 1 < tokens.length + && "em".equals(tokens[i + 1].getToken())) { + return false; + } + } + return true; + } + + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/es/ElwithFemRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/es/ElwithFemRule.java new file mode 100644 index 0000000..c22b9a3 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/es/ElwithFemRule.java @@ -0,0 +1,179 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.rules.es; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.util.ArrayList; +import java.util.List; +import java.util.ResourceBundle; +import java.util.TreeSet; + +import de.danielnaber.languagetool.AnalyzedSentence; +import de.danielnaber.languagetool.AnalyzedTokenReadings; +import de.danielnaber.languagetool.JLanguageTool; +import de.danielnaber.languagetool.rules.Category; +import de.danielnaber.languagetool.rules.RuleMatch; +import de.danielnaber.languagetool.tools.StringTools; + +/** + * Check if the determiner (if any) preceding a feminine noun is "el". This + * rule loads a list of words (feminine nouns starting with stressed ha- or a-) + * from an external file. These words enforce the use of 'el' as determiner + * instead of 'la' (also with 'un', 'algun' and 'ningun'). 
+ * + * Sample + * + * *la alma -> el alma + * *la hambre -> el hambre + * + * http://blog.lengua-e.com/2007/el-arma-determinante-masculino-ante-nombre-femenino/ + * http://tinyurl.com/m9uzte + * + * + * @author Susana Sotelo Docio + * + * based on English AvsAnRule rule + */ +public class ElwithFemRule extends SpanishRule { + + private static final String FILENAME_EL = "/es/el.txt"; + private final TreeSet<String> requiresEl; + + public ElwithFemRule(final ResourceBundle messages) throws IOException { + if (messages != null) { + super.setCategory(new Category(messages.getString("category_misc"))); + } + requiresEl = loadWords(JLanguageTool.getDataBroker().getFromRulesDirAsStream(FILENAME_EL)); + } + + @Override + public String getId() { + return "EL_WITH_FEM"; + } + + @Override + public String getDescription() { + return "Uso de 'el' con sustantivos femeninos que comienzan por a- o ha- t\u00f3nicas"; + } + + @Override + public RuleMatch[] match(final AnalyzedSentence text) { + final List<RuleMatch> ruleMatches = new ArrayList<RuleMatch>(); + final AnalyzedTokenReadings[] tokens = text.getTokensWithoutWhitespace(); + String prevToken = ""; + int prevPos = 0; + //ignoring token 0, i.e., SENT_START + for (int i = 1; i < tokens.length; i++) { + String token = tokens[i].getToken(); + boolean doesRequireEl = false; + + token = token.replaceAll("[^a-záéíóúñüA-ZÁÉÍÓÚÑÜ0-9\\.']", ""); // el 'alma' + if (StringTools.isEmpty(token)) { + continue; + } + if (requiresEl.contains(token.toLowerCase()) || requiresEl.contains(token)) { + doesRequireEl = true; + } + + // FIXME: temporal solution for "La Haya" (change) + if (prevToken.equals("La") && token.equals("Haya")) { + doesRequireEl = false; + } + + String msg = null; + String replacement = null; + if (prevToken.equalsIgnoreCase("la") && doesRequireEl) + { + replacement = "el"; + if (prevToken.equals("La")) { replacement = "El"; } + } + else if (prevToken.equalsIgnoreCase("una") && doesRequireEl) + { + replacement = "un"; + if 
(prevToken.equals("Una")) { replacement = "Un"; } + } + else if (prevToken.equalsIgnoreCase("alguna") && doesRequireEl) + { + replacement = "alg\u00fan"; + if (prevToken.equals("Alguna")) { replacement = "Alg\u00fan"; } + } + else if (prevToken.equalsIgnoreCase("ninguna") && doesRequireEl) + { + replacement = "ning\u00fan"; + if (prevToken.equals("Ninguna")) { replacement = "Ning\u00fan"; } + } + + msg = "Use <suggestion>" +replacement+ "</suggestion> en lugar de '" +prevToken+ "' si la siguiente "+ + "palabra comienza por 'a' o 'ha' t\u00f3nicas, por ejemplo 'el hampa', " + + "'un agua'"; + + + if (replacement != null) { + final RuleMatch ruleMatch = new RuleMatch(this, prevPos, prevPos+prevToken.length(), msg, "Art\u00edculo incorrecto"); + ruleMatches.add(ruleMatch); + } + if (tokens[i].hasPosTag("DA0FS0") || tokens[i].hasPosTag("DI0FS0") ) { + prevToken = token; + prevPos = tokens[i].getStartPos(); + } else { + prevToken = ""; + } + } + return toRuleMatchArray(ruleMatches); + } + + /** + * Load words, normalized to lowercase. 
+ */ + private TreeSet<String> loadWords(final InputStream file) throws IOException { + BufferedReader br = null; + final TreeSet<String> set = new TreeSet<String>(); + try { + br = new BufferedReader(new InputStreamReader(file, "utf-8")); + String line; + while ((line = br.readLine()) != null) { + line = line.trim(); + if (line.length() < 1) { + continue; + } + if (line.charAt(0) == '#') { + continue; + } + if (line.charAt(0) == '*') { + set.add(line.substring(1)); + } else { + set.add(line.toLowerCase()); + } + } + } finally { + if (br != null) { + br.close(); + } + } + return set; + } + + @Override + public void reset() { + // nothing + } +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/es/SpanishRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/es/SpanishRule.java new file mode 100644 index 0000000..4aaa297 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/es/SpanishRule.java @@ -0,0 +1,32 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.rules.es; + +import de.danielnaber.languagetool.rules.Rule; + +/** + * Abstract base class for rules for Spanish. + * + * @author Susana Sotelo Docio + * + * based on English rules + */ +public abstract class SpanishRule extends Rule { + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/fr/FrenchRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/fr/FrenchRule.java new file mode 100644 index 0000000..2ad4bcc --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/fr/FrenchRule.java @@ -0,0 +1,31 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ + +package de.danielnaber.languagetool.rules.fr; + +import de.danielnaber.languagetool.rules.Rule; + +/** + * Abstract base class for French rules. 
+ * + * @author Marcin Milkowski + */ +public abstract class FrenchRule extends Rule { + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/fr/QuestionWhitespaceRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/fr/QuestionWhitespaceRule.java new file mode 100644 index 0000000..4c03049 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/fr/QuestionWhitespaceRule.java @@ -0,0 +1,161 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.rules.fr; + +import java.util.ArrayList; +import java.util.List; +import java.util.ResourceBundle; + +import de.danielnaber.languagetool.AnalyzedSentence; +import de.danielnaber.languagetool.AnalyzedTokenReadings; +import de.danielnaber.languagetool.rules.Category; +import de.danielnaber.languagetool.rules.RuleMatch; +import de.danielnaber.languagetool.tools.StringTools; + +/** + * A rule that matches spaces before ?,:,; and ! (required for correct French + * punctuation). 
*
 * @author Marcin Miłkowski
 */
public class QuestionWhitespaceRule extends FrenchRule {

  /**
   * Non-breaking space. Written as an escape so that it cannot be confused
   * with (or silently converted to) an ordinary space in the source file.
   */
  private static final String NBSP = "\u00a0";

  /**
   * Creates the rule and assigns it to the "category_misc" category.
   *
   * @param messages
   *          bundle providing the localized category name
   */
  public QuestionWhitespaceRule(final ResourceBundle messages) {
    // The parent constructor is deliberately not called with the bundle.
    super.setCategory(new Category(messages.getString("category_misc")));
  }

  @Override
  public String getId() {
    return "FRENCH_WHITESPACE";
  }

  @Override
  public String getDescription() {
    return "Insertion des espaces fines insécables";
  }

  /**
   * Reports the punctuation marks {@code ? ! ; : »} that are not preceded by
   * a non-breaking space and opening guillemets ({@code «}) that are not
   * followed by one.
   *
   * @param text
   *          the analyzed sentence to check
   * @return the matches found, possibly empty
   */
  @Override
  public RuleMatch[] match(final AnalyzedSentence text) {
    final List<RuleMatch> ruleMatches = new ArrayList<RuleMatch>();
    final AnalyzedTokenReadings[] tokens = text.getTokens();
    String prevToken = "";
    for (int i = 1; i < tokens.length; i++) {
      final String token = tokens[i].getToken();
      final boolean whiteBefore = tokens[i].isWhitespaceBefore();
      String msg = null;
      int fixLen = 0;
      String suggestionText = null;

      final String punctuationMsg = punctuationMessage(token, whiteBefore);
      if (punctuationMsg != null) {
        if (whiteBefore) {
          // The previous token is the whitespace itself: replace it
          // together with the punctuation mark.
          msg = punctuationMsg;
          suggestionText = NBSP + token;
          fixLen = 1;
        } else if (!prevToken.equals(NBSP) && !isCombinedMark(token, prevToken)) {
          // No space at all: insert one between the previous token and the mark.
          msg = punctuationMsg;
          suggestionText = prevToken + NBSP + token;
          fixLen = 1;
        }
      }

      // The opening guillemet check is evaluated last and therefore takes
      // precedence over a punctuation match on the same token.
      if (prevToken.equals("«")) {
        if (StringTools.isEmpty(token)) {
          msg = "Le guillemet ouvrant est suivi d'une espace fine insécable.";
          suggestionText = "«" + NBSP;
          fixLen = 1;
        } else if (!token.equals(NBSP)) {
          msg = "Le guillemet ouvrant est suivi d'une espace fine insécable.";
          suggestionText = "«" + NBSP;
          fixLen = 0;
        }
      }

      if (msg != null) {
        final int fromPos = tokens[i - 1].getStartPos();
        final int toPos = fromPos + fixLen + tokens[i - 1].getToken().length();
        final RuleMatch ruleMatch = new RuleMatch(this, fromPos, toPos, msg,
            "Insérer un espace insécable");
        if (suggestionText != null) {
          ruleMatch.setSuggestedReplacement(suggestionText);
        }
        ruleMatches.add(ruleMatch);
      }
      prevToken = token;
    }

    return toRuleMatchArray(ruleMatches);
  }

  /**
   * Returns the message for a punctuation mark that requires a preceding
   * non-breaking space, or {@code null} if the token is not such a mark.
   *
   * @param token
   *          the token text
   * @param whiteBefore
   *          whether the token is preceded by whitespace; only the wording
   *          of the colon message depends on it
   */
  private static String punctuationMessage(final String token,
      final boolean whiteBefore) {
    if (token.equals("?")) {
      return "Point d'interrogation est précédé d'une espace fine insécable.";
    }
    if (token.equals("!")) {
      return "Point d'exclamation est précédé d'une espace fine insécable.";
    }
    if (token.equals("»")) {
      return "Le guillemet fermant est précédé d'une espace fine insécable.";
    }
    if (token.equals(";")) {
      return "Point-virgule est précédé d'une espace fine insécable.";
    }
    if (token.equals(":")) {
      if (whiteBefore) {
        return "Deux-points sont précédé d'une espace fine insécable.";
      }
      return "Deux-points précédés d'une espace fine insécable.";
    }
    return null;
  }

  /**
   * Tells whether the token directly continues a "?!" or "!?" sequence, in
   * which case no space is requested between the two marks.
   */
  private static boolean isCombinedMark(final String token,
      final String prevToken) {
    return token.equals("?") && prevToken.equals("!")
        || token.equals("!") && prevToken.equals("?");
  }

  @Override
  public void reset() {
    // nothing
  }

}
diff --git
a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/AbstractPatternRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/AbstractPatternRule.java new file mode 100644 index 0000000..d172134 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/AbstractPatternRule.java @@ -0,0 +1,223 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2008 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ + +package de.danielnaber.languagetool.rules.patterns; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import de.danielnaber.languagetool.AnalyzedSentence; +import de.danielnaber.languagetool.AnalyzedToken; +import de.danielnaber.languagetool.AnalyzedTokenReadings; +import de.danielnaber.languagetool.Language; +import de.danielnaber.languagetool.rules.Rule; +import de.danielnaber.languagetool.rules.RuleMatch; + +/** + * An Abstract Pattern Rule that describes a pattern of words or part-of-speech tags + * used for PatternRule and DisambiguationPatternRule. + * + * Introduced to minimize code duplication between those classes. 
+ * + * @author Marcin Miłkowski + */ + +public abstract class AbstractPatternRule extends Rule { + + private final String id; + + private final String description; + + protected final List<Element> patternElements; + + protected Unifier unifier; + + protected final Language language; + + protected int startPositionCorrection; + + protected int endPositionCorrection; + + protected boolean prevMatched; + + protected final boolean testUnification; + + private final boolean getUnified; + + private boolean groupsOrUnification; + + protected AnalyzedTokenReadings[] unifiedTokens; + + protected final boolean sentStart; + + public AbstractPatternRule(final String id, + final String description, + final Language language, + final List<Element> elements, + boolean getUnified) { + this.id = id; + this.description = description; + this.patternElements = new ArrayList<Element>(elements); // copy elements + this.language = language; + this.getUnified = getUnified; + unifier = language.getUnifier(); + testUnification = initUnifier(); + sentStart = patternElements.get(0).isSentStart(); + if (!testUnification) { + for (Element elem : patternElements) { + if (elem.hasAndGroup()) { + groupsOrUnification = true; + break; + } + } + } else { + groupsOrUnification = true; + } + } + + private boolean initUnifier() { + for (final Element elem : patternElements) { + if (elem.isUnified()) { + return true; + } + } + return false; + } + + @Override + public final String toString() { + return id + ":" + patternElements + ":" + description; + } + + @Override + public String getDescription() { + return description; + } + + @Override + public String getId() { + return id; + } + + @Override + public RuleMatch[] match(AnalyzedSentence text) throws IOException { + // TODO Auto-generated method stub + return null; + } + + @Override + public void reset() { + // TODO Auto-generated method stub + } + + public final void setStartPositionCorrection(final int startPositionCorrection) { + 
this.startPositionCorrection = startPositionCorrection; + } + + public final void setEndPositionCorrection(final int endPositionCorrection) { + this.endPositionCorrection = endPositionCorrection; + } + + + protected void setupAndGroup(final int firstMatchToken, + final Element elem, final AnalyzedTokenReadings[] tokens) + throws IOException { + if (elem.hasAndGroup()) { + for (final Element andElement : elem.getAndGroup()) { + if (andElement.isReferenceElement()) { + setupRef(firstMatchToken, andElement, tokens); + } + } + elem.setupAndGroup(); + } + } + + //TODO: add .compile for all exceptions of the element? + protected void setupRef(final int firstMatchToken, final Element elem, + final AnalyzedTokenReadings[] tokens) throws IOException { + if (elem.isReferenceElement()) { + final int refPos = firstMatchToken + elem.getMatch().getTokenRef(); + if (refPos < tokens.length) { + elem.compile(tokens[refPos], language.getSynthesizer()); + } + } + } + + protected boolean testAllReadings(final AnalyzedTokenReadings[] tokens, + final Element elem, final Element prevElement, final int tokenNo, + final int firstMatchToken, final int prevSkipNext) throws IOException { + boolean thisMatched = false; + final int numberOfReadings = tokens[tokenNo].getReadingsLength(); + setupAndGroup(firstMatchToken, elem, tokens); + for (int l = 0; l < numberOfReadings; l++) { + final AnalyzedToken matchToken = tokens[tokenNo].getAnalyzedToken(l); + prevMatched = prevMatched || prevSkipNext > 0 && prevElement != null + && prevElement.isMatchedByScopeNextException(matchToken); + if (prevMatched) { + return false; + } + thisMatched = thisMatched || elem.isMatched(matchToken); + if (!thisMatched && !elem.isInflected() && elem.getPOStag() == null + && (prevElement != null && prevElement.getExceptionList() == null)) { + return false; // the token is the same, we will not get a match + } + if (groupsOrUnification) { + thisMatched &= testUnificationAndGroups(thisMatched, + l + 1 == 
numberOfReadings, matchToken, elem); + } + } + if (thisMatched) { + for (int l = 0; l < numberOfReadings; l++) { + if (elem.isExceptionMatchedCompletely(tokens[tokenNo].getAnalyzedToken(l))) + return false; + } + if (tokenNo > 0 && elem.hasPreviousException()) { + if (elem.isMatchedByPreviousException(tokens[tokenNo - 1])) + return false; + } + } + return thisMatched; + } + + protected boolean testUnificationAndGroups(final boolean matched, + final boolean lastReading, final AnalyzedToken matchToken, + final Element elem) { + boolean thisMatched = matched; + if (testUnification) { + if (matched && elem.isUnified()) { + thisMatched = thisMatched && unifier.isUnified(matchToken, elem.getUniFeatures(), + elem.isUniNegated(), lastReading); + } + if (thisMatched && getUnified) { + unifiedTokens = unifier.getFinalUnified(); + } + if (!elem.isUnified()) { + unifier.reset(); + } + } + elem.addMemberAndGroup(matchToken); + if (lastReading) { + thisMatched &= elem.checkAndGroup(thisMatched); + } + return thisMatched; + } + + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/Element.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/Element.java new file mode 100644 index 0000000..0ad7c1f --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/Element.java @@ -0,0 +1,803 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.rules.patterns; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Map; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import de.danielnaber.languagetool.AnalyzedToken; +import de.danielnaber.languagetool.AnalyzedTokenReadings; +import de.danielnaber.languagetool.JLanguageTool; +import de.danielnaber.languagetool.synthesis.Synthesizer; +import de.danielnaber.languagetool.tools.StringTools; + +/** + * A part of a pattern. + * + * @author Daniel Naber + */ +public class Element { + + private String stringToken; + private String posToken; + private String regToken; + private boolean posRegExp; + + private boolean negation; + private boolean posNegation; + + private final boolean caseSensitive; + private final boolean stringRegExp; + private boolean inflected; + + private boolean testWhitespace; + private boolean whitespaceBefore; + + /** + * List of exceptions that are valid for the current token and / or some next + * tokens. + */ + private List<Element> exceptionList; + + /** + * True if scope=="next". + */ + private boolean exceptionValidNext; + + /** + * True if any exception with a scope=="current" or scope=="next" is set for + * the element. + */ + private boolean exceptionSet; + + /** + * True if attribute scope=="previous". + */ + private boolean exceptionValidPrevious; + + /** + * List of exceptions that are valid for a previous token. 
+ */ + private List<Element> previousExceptionList; + + private List<Element> andGroupList; + private boolean andGroupSet; + private boolean[] andGroupCheck; + + private int skip; + + private Pattern p; + private Pattern pPos; + + private Matcher m; + private Matcher mPos; + + /** The reference to another element in the pattern. **/ + private Match tokenReference; + + /** + * True when the element stores a formatted reference to another element of + * the pattern. + */ + private boolean containsMatches; + + /** Matches only tokens without any POS tag. **/ + private static final String UNKNOWN_TAG = "UNKNOWN"; + + /** + * Parameter passed to regular expression matcher to enable case insensitive + * Unicode matching. + */ + private static final String CASE_INSENSITIVE = "(?iu)"; + + private String referenceString; + + /** String ID of the phrase the element is in. **/ + private String phraseName; + + /** + * This var is used to determine if calling {@link #setStringElement} makes + * sense. This method takes most time so it's best to reduce the number of its + * calls. + **/ + private boolean testString; + + /** + * Tells if the element is inside the unification, so that {@link Unifier} + * tests it. + */ + private boolean unified; + private boolean uniNegation; + + private Map<String, List<String>> unificationFeatures; + + /** + * Creates Element that is used to match tokens in the text. + * + * @param token + * String to be matched + * @param caseSensitive + * True if the check is case-sensitive. + * @param regExp + * True if the check uses regular expressions. + * @param inflected + * True if the check refers to base forms (lemmas). + */ + public Element(final String token, final boolean caseSensitive, + final boolean regExp, final boolean inflected) { + this.caseSensitive = caseSensitive; + this.stringRegExp = regExp; + this.inflected = inflected; + setStringElement(token); + } + + /** + * Checks whether the rule element matches the token given as a parameter. 
+ * + * @param token + * @AnalyzedToken to check matching against + * @return True if token matches, false otherwise. + */ + public final boolean isMatched(final AnalyzedToken token) { + if (testWhitespace && !isWhitespaceBefore(token)) { + return false; + } + boolean matched = false; + if (testString) { + matched = (isStringTokenMatched(token) ^ negation) + && (isPosTokenMatched(token) ^ posNegation); + } else { + matched = (!negation) && (isPosTokenMatched(token) ^ posNegation); + } + + if (andGroupSet) { + andGroupCheck[0] |= matched; + } + return matched; + } + + /** + * Checks whether an exception matches. + * + * @param token + * @AnalyzedToken to check matching against + * @return True if any of the exceptions matches (logical disjunction). + */ + public final boolean isExceptionMatched(final AnalyzedToken token) { + if (exceptionSet) { + for (final Element testException : exceptionList) { + if (!testException.exceptionValidNext) { + if (testException.isMatched(token)) { + return true; + } + } + } + } + return false; + } + + /** + * Enables testing multiple conditions specified by different elements. + * Doesn't test exceptions. + * + * Works as logical AND operator only if preceded with + * {@link #setupAndGroup()}, and followed by {@link #checkAndGroup(boolean)}. + * + * @param token + * AnalyzedToken - the token checked. 
+ */ + public final void addMemberAndGroup(final AnalyzedToken token) { + if (andGroupSet) { + for (int i = 0; i < andGroupList.size(); i++) { + if (!andGroupCheck[i + 1]) { + final Element testAndGroup = andGroupList.get(i); + if (testAndGroup.isMatched(token)) { + andGroupCheck[i + 1] = true; + } + } + } + } + } + + public final void setupAndGroup() { + if (andGroupSet) { + andGroupCheck = new boolean[andGroupList.size() + 1]; + Arrays.fill(andGroupCheck, false); + } + } + + public final boolean checkAndGroup(final boolean previousValue) { + if (andGroupSet) { + boolean allConditionsMatch = true; + for (final boolean testValue : andGroupCheck) { + allConditionsMatch &= testValue; + } + return allConditionsMatch; + } + return previousValue; + } + + /** + * Enables testing multiple conditions specified by multiple element + * exceptions. + * + * Works as logical AND operator. + * + * @param token + * AnalyzedToken - the token checked for exceptions. + * @return true if all conditions are met, false otherwise. + */ + public final boolean isAndExceptionGroupMatched(final AnalyzedToken token) { + if (andGroupSet) { + for (final Element testAndGroup : andGroupList) { + if (testAndGroup.isExceptionMatched(token)) { + return true; + } + } + } + return false; + } + + /** + * This method checks exceptions both in AND-group and the token. Introduced + * to for clarity. + * + * @param token + * Token to match + * @return True if matched. + */ + public final boolean isExceptionMatchedCompletely(final AnalyzedToken token) { + // note: short-circuiting possible + return isExceptionMatched(token) || isAndExceptionGroupMatched(token); + } + + public final void setAndGroupElement(final Element andToken) { + if (andToken != null) { + if (andGroupList == null) { + andGroupList = new ArrayList<Element>(); + } + if (!andGroupSet) { + andGroupSet = true; + } + andGroupList.add(andToken); + } + } + + /** + * Checks if this element has an AND group associated with it. 
+ * + * @return true if the element has a group of elements that all should match. + */ + public final boolean hasAndGroup() { + return andGroupSet; + } + + /** + * Returns the group of elements linked with AND operator. + * + * @return List of Elements. + */ + public final List<Element> getAndGroup() { + return andGroupList; + } + + /** + * Checks whether a previously set exception matches (in case the exception + * had scope == "next"). + * + * @param token + * @AnalyzedToken to check matching against. + * @return True if any of the exceptions matches. + */ + public final boolean isMatchedByScopeNextException(final AnalyzedToken token) { + if (exceptionSet) { + for (final Element testException : exceptionList) { + if (testException.exceptionValidNext) { + if (testException.isMatched(token)) { + return true; + } + } + } + } + return false; + } + + /** + * Checks whether an exception for a previous token matches (in case the + * exception had scope == "previous"). + * + * @param token + * {@link AnalyzedToken} to check matching against. + * @return True if any of the exceptions matches. + */ + public final boolean isMatchedByPreviousException(final AnalyzedToken token) { + if (exceptionValidPrevious) { + for (final Element testException : previousExceptionList) { + if (!testException.exceptionValidNext) { + if (testException.isMatched(token)) { + return true; + } + } + } + } + return false; + } + + /** + * Checks whether an exception for a previous token matches all readings of a + * given token (in case the exception had scope == "previous"). + * + * @param prevToken + * {@link AnalyzedTokenReadings} to check matching against. + * @return true if any of the exceptions matches. 
+ */ + public final boolean isMatchedByPreviousException( + final AnalyzedTokenReadings prevToken) { + final int numReadings = prevToken.getReadingsLength(); + for (int i = 0; i < numReadings; i++) { + if (isMatchedByPreviousException(prevToken.getAnalyzedToken(i))) { + return true; + } + } + return false; + } + + /** + * Checks if the token is a SENT_START. + * + * @return True if the element starts the sentence and the element hasn't been + * set to have negated POS token. + * + */ + public final boolean isSentStart() { + return JLanguageTool.SENTENCE_START_TAGNAME.equals(posToken) + && !posNegation; + } + + @Override + public final String toString() { + final StringBuilder sb = new StringBuilder(); + if (negation) { + sb.append('!'); + } + sb.append(stringToken); + if (phraseName != null) { + sb.append(" {"); + sb.append(phraseName); + sb.append('}'); + } + if (posToken != null) { + sb.append('/'); + sb.append(posToken); + } + return sb.toString(); + } + + public final void setPosElement(final String posToken, final boolean regExp, + final boolean negation) { + this.posToken = posToken; + this.posNegation = negation; + posRegExp = regExp; + if (posRegExp) { + pPos = Pattern.compile(posToken); + } + } + + public final String getString() { + return stringToken; + } + + public final void setStringElement(final String token) { + this.stringToken = token; + testString = !StringTools.isEmpty(stringToken); + if (testString && stringRegExp) { + regToken = stringToken; + if (!caseSensitive) { + regToken = CASE_INSENSITIVE + stringToken; + } + if (!"\\0".equals(token)) { + p = Pattern.compile(regToken); + } + } + } + + /** + * Sets a POS-type exception for matching string tokens. + * + * @param posToken + * The part of the speech tag in the exception. + * @param regExp + * True if the POS is specified as a regular expression. + * @param negation + * True if the exception is negated. + * @param scopeNext + * True if the exception scope is next tokens. 
+ * @param scopePrevious + * True if the exception should match only a single previous token. + */ + public final void setPosException(final String posToken, + final boolean regExp, final boolean negation, final boolean scopeNext, + final boolean scopePrevious) { + final Element posException = new Element("", this.caseSensitive, false, + false); + posException.setPosElement(posToken, regExp, negation); + posException.exceptionValidNext = scopeNext; + setException(posException, scopePrevious); + } + + /** + * Sets a string-type exception for matching string tokens. + * + * @param token + * The string in the exception. + * @param regExp + * True if the string is specified as a regular expression. + * @param inflected + * True if the string is a base form (lemma). + * @param negation + * True if the exception is negated. + * @param scopeNext + * True if the exception scope is next tokens. + * @param scopePrevious + * True if the exception should match only a single previous token. + */ + public final void setStringException(final String token, + final boolean regExp, final boolean inflected, final boolean negation, + final boolean scopeNext, final boolean scopePrevious) { + final Element stringException = new Element(token, this.caseSensitive, + regExp, inflected); + stringException.setNegation(negation); + stringException.exceptionValidNext = scopeNext; + setException(stringException, scopePrevious); + } + + private void setException(final Element elem, final boolean scopePrevious) { + exceptionValidPrevious |= scopePrevious; + if (exceptionList == null && !scopePrevious) { + exceptionList = new ArrayList<Element>(); + } + if (previousExceptionList == null && scopePrevious) { + previousExceptionList = new ArrayList<Element>(); + } + if (scopePrevious) { + previousExceptionList.add(elem); + } else { + if (!exceptionSet) { + exceptionSet = true; + } + if (exceptionSet) { + exceptionList.add(elem); + } + } + } + + /** + * Tests if part of speech matches a given string. 
+ * + * @param token + * Token to test. + * @return true if matches + * + * Special value UNKNOWN_TAG matches null POS tags. + * + */ + private boolean isPosTokenMatched(final AnalyzedToken token) { + // if no POS set + // defaulting to true + if (posToken == null) { + return true; + } + if (token.getPOSTag() == null) { + if (posRegExp) { + if (mPos == null) { + mPos = pPos.matcher(UNKNOWN_TAG); + } else { + mPos.reset(UNKNOWN_TAG); + } + return mPos.matches(); + } + if (UNKNOWN_TAG.equals(posToken)) { + return true; + } + } + boolean match; + if (posRegExp) { + if (mPos == null) { + mPos = pPos.matcher(token.getPOSTag()); + } else { + mPos.reset(token.getPOSTag()); + } + match = mPos.matches(); + } else { + match = posToken.equals(token.getPOSTag()); + } + if (!match && UNKNOWN_TAG.equals(posToken)) { // these are helper tags, + // ignore them + match = JLanguageTool.SENTENCE_END_TAGNAME.equals(token.getPOSTag()) + || JLanguageTool.PARAGRAPH_END_TAGNAME.equals(token.getPOSTag()); + } + return match; + } + + /** + * Tests whether the string token element matches a given token. + * + * @param token + * {@link AnalyzedToken} to match against. + * @return True if matches. + */ + private boolean isStringTokenMatched(final AnalyzedToken token) { + final String testToken = getTestToken(token); + if (stringRegExp) { + if (m == null) { + m = p.matcher(testToken); + } else { + m.reset(testToken); + } + return m.matches(); + } + if (caseSensitive) { + return stringToken.equals(testToken); + } + return stringToken.equalsIgnoreCase(testToken); + } + + private String getTestToken(final AnalyzedToken token) { + // enables using words with lemmas and without lemmas + // in the same regexp with inflected="yes" + if (inflected) { + return token.getTokenInflected(); + } + return token.getToken(); + } + + /** + * Gets the exception scope length. + * + * @return Scope length. + */ + public final int getSkipNext() { + return skip; + } + + /** + * Sets the exception scope length. 
+ * + * @param i + * Exception scope length. + */ + public final void setSkipNext(final int i) { + skip = i; + } + + /** + * Checks if the element has an exception for a previous token. + * + * @return True if the element has a previous token matching exception. + */ + public final boolean hasPreviousException() { + return exceptionValidPrevious; + } + + /** + * Negates the meaning of match(). + * + * @param negation + * - true if the meaning of match() is to be negated. + */ + public final void setNegation(final boolean negation) { + this.negation = negation; + } + + /** + * see {@link #setNegation} + * + * @since 0.9.3 + */ + public final boolean getNegation() { + return this.negation; + } + + /** + * + * @return true when this element refers to another token. + */ + public final boolean isReferenceElement() { + return containsMatches; + } + + /** + * Sets the reference to another token. + * + * @param match + * Formatting object for the token reference. + */ + public final void setMatch(final Match match) { + tokenReference = match; + containsMatches = true; + } + + public final Match getMatch() { + return tokenReference; + } + + /** + * Prepare Element for matching by formatting its string token and POS (if the + * Element is supposed to refer to some other token). 
+ * + * @param token + * the token specified as {@link AnalyzedTokenReadings} + * @param synth + * the language synthesizer ({@link Synthesizer}) + * + */ + public final void compile(final AnalyzedTokenReadings token, + final Synthesizer synth) throws IOException { + + m = null; + p = null; + tokenReference.setToken(token); + tokenReference.setSynthesizer(synth); + + if (StringTools.isEmpty(referenceString)) { + referenceString = stringToken; + } + if (tokenReference.setsPos()) { + final String posReference = tokenReference.getTargetPosTag(); + if (posReference != null) { + if (mPos != null) { + mPos = null; + } + setPosElement(posReference, tokenReference.posRegExp(), negation); + } + setStringElement(referenceString.replace("\\" + + tokenReference.getTokenRef(), "")); + inflected = true; + } else { + setStringElement(referenceString.replace("\\" + + tokenReference.getTokenRef(), tokenReference.toTokenString())); + } + } + + /** + * Sets the phrase the element is in. + * + * @param s + * ID of the phrase. + */ + public final void setPhraseName(final String s) { + phraseName = s; + } + + /** + * Checks if the Element is in any phrase. + * + * @return True if the Element is contained in the phrase. + */ + public final boolean isPartOfPhrase() { + return phraseName != null; + } + + /** + * Whether the element matches case sensitively. + * + * @since 0.9.3 + */ + public final boolean getCaseSensitive() { + return caseSensitive; + } + + /** + * Tests whether the element matches a regular expression. + * + * @since 0.9.6 + */ + public final boolean isRegularExpression() { + return stringRegExp; + } + + /** + * @return the POS of the Element + * @since 0.9.6 + */ + public final String getPOStag() { + return posToken; + } + + /** + * Tests whether the POS is negated. + * + * @return true if so. + */ + public final boolean getPOSNegation() { + return posNegation; + } + + /** + * Whether the token is inflected. + * + * @return True if so. 
+ */ + public final boolean isInflected() { + return inflected; + } + + /** + * Gets the phrase the element is in. + * + * @return String The name of the phrase. + */ + public final String getPhraseName() { + return phraseName; + } + + public final boolean isUnified() { + return unified; + } + + public final void setUnification(final Map<String, List<String>> uniFeatures) { + unificationFeatures = uniFeatures; + unified = true; + } + + /** + * Get unification features and types. + * @return A map from features to a list of types. + * @since 1.0.1 + */ + public final Map<String, List<String>> getUniFeatures() { + return unificationFeatures; + } + + public final void setUniNegation() { + uniNegation = true; + } + + public final boolean isUniNegated() { + return uniNegation; + } + + public final void setWhitespaceBefore(final boolean isWhite) { + whitespaceBefore = isWhite; + testWhitespace = true; + } + + public final void setExceptionSpaceBefore(final boolean isWhite) { + if (exceptionList != null) { + exceptionList.get(exceptionList.size()).setWhitespaceBefore(isWhite); + } + } + + public final boolean isWhitespaceBefore(final AnalyzedToken token) { + return whitespaceBefore == token.isWhitespaceBefore(); + } + + /** + * Since 1.0.0 + * @return A List of Exceptions. Used for testing. 
+ */ + public final List<Element> getExceptionList() { + return exceptionList; + } +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/FalseFriendRuleLoader.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/FalseFriendRuleLoader.java new file mode 100644 index 0000000..94c6515 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/FalseFriendRuleLoader.java @@ -0,0 +1,356 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.rules.patterns; + +import java.io.IOException; +import java.io.InputStream; +import java.text.MessageFormat; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.ResourceBundle; + +import javax.xml.parsers.ParserConfigurationException; +import javax.xml.parsers.SAXParser; +import javax.xml.parsers.SAXParserFactory; + +import org.xml.sax.Attributes; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.DefaultHandler; + +import de.danielnaber.languagetool.JLanguageTool; +import de.danielnaber.languagetool.Language; +import de.danielnaber.languagetool.rules.Category; +import de.danielnaber.languagetool.rules.IncorrectExample; +import de.danielnaber.languagetool.tools.StringTools; + +/** + * Loads {@link PatternRule}s from a false friends XML file. 
+ * + * @author Daniel Naber + */ +public class FalseFriendRuleLoader extends DefaultHandler { + + public FalseFriendRuleLoader() { + } + + public final List<PatternRule> getRules(final InputStream file, + final Language textLanguage, final Language motherTongue) + throws ParserConfigurationException, SAXException, IOException { + final FalseFriendRuleHandler handler = new FalseFriendRuleHandler( + textLanguage, motherTongue); + final SAXParserFactory factory = SAXParserFactory.newInstance(); + final SAXParser saxParser = factory.newSAXParser(); + saxParser.getXMLReader() + .setFeature( + "http://apache.org/xml/features/nonvalidating/load-external-dtd", + false); + saxParser.parse(file, handler); + final List<PatternRule> rules = handler.getRules(); + // Add suggestions to each rule: + final ResourceBundle messages = ResourceBundle.getBundle( + "de.danielnaber.languagetool.MessagesBundle", motherTongue.getLocale()); + for (final PatternRule rule : rules) { + final List<String> suggestionMap = handler.getSuggestionMap().get(rule.getId()); + if (suggestionMap != null) { + final MessageFormat msgFormat = new MessageFormat(messages + .getString("false_friend_suggestion")); + final Object[] msg = new Object[] { formatSuggestions(suggestionMap) }; + rule.setMessage(rule.getMessage() + " " + msgFormat.format(msg)); + } + } + return rules; + } + + private String formatSuggestions(final List<String> l) { + final StringBuilder sb = new StringBuilder(); + for (final Iterator<String> iter = l.iterator(); iter.hasNext();) { + final String s = iter.next(); + sb.append("<suggestion>"); + sb.append(s); + sb.append("</suggestion>"); + if (iter.hasNext()) { + sb.append(", "); + } + } + return sb.toString(); + } + + /** Testing only. 
*/ + public final void main(final String[] args) + throws ParserConfigurationException, SAXException, IOException { + final FalseFriendRuleLoader prg = new FalseFriendRuleLoader(); + List<PatternRule> l = prg.getRules(JLanguageTool.getDataBroker() + .getFromRulesDirAsStream("/false-friends.xml"), Language.ENGLISH, + Language.GERMAN); + System.out.println("Hints for German native speakers:"); + for (final PatternRule rule : l) { + System.out.println(rule); + } + System.out.println("======================================="); + System.out.println("Hints for English native speakers:"); + l = prg.getRules(JLanguageTool.getDataBroker() + .getFromRulesDirAsStream("/false-friends.xml"), + Language.GERMAN, Language.ENGLISH); + for (final PatternRule rule : l) { + System.out.println(rule); + } + } + +} + +class FalseFriendRuleHandler extends XMLRuleHandler { + + private final ResourceBundle messages; + private final MessageFormat formatter; + + private final Language textLanguage; + private final Language motherTongue; + + private boolean defaultOff; + + private Language language; + private Language translationLanguage; + private Language currentTranslationLanguage; + private List<StringBuilder> translations = new ArrayList<StringBuilder>(); + private StringBuilder translation = new StringBuilder(); + private final List<String> suggestions = new ArrayList<String>(); + // rule ID -> list of translations: + private final Map<String, List<String>> suggestionMap = new HashMap<String, List<String>>(); + + private boolean inTranslation; + + public FalseFriendRuleHandler(final Language textLanguage, + final Language motherTongue) { + messages = ResourceBundle.getBundle( + "de.danielnaber.languagetool.MessagesBundle", motherTongue.getLocale()); + formatter = new MessageFormat(""); + formatter.setLocale(motherTongue.getLocale()); + this.textLanguage = textLanguage; + this.motherTongue = motherTongue; + } + + public Map<String, List<String>> getSuggestionMap() { + return 
suggestionMap; + } + + // =========================================================== + // SAX DocumentHandler methods + // =========================================================== + + @Override + public void startElement(final String namespaceURI, final String lName, + final String qName, final Attributes attrs) throws SAXException { + if (qName.equals("rule")) { + translations = new ArrayList<StringBuilder>(); + id = attrs.getValue("id"); + if (!(inRuleGroup && defaultOff)) { + defaultOff = "off".equals(attrs.getValue("default")); + } + if (inRuleGroup && id == null) { + id = ruleGroupId; + } + correctExamples = new ArrayList<String>(); + incorrectExamples = new ArrayList<IncorrectExample>(); + } else if (qName.equals("pattern")) { + inPattern = true; + final String languageStr = attrs.getValue("lang"); + language = Language.getLanguageForShortName(languageStr); + if (language == null) { + throw new SAXException("Unknown language '" + languageStr + "'"); + } + } else if (qName.equals("exception")) { + inException = true; + exceptions = new StringBuilder(); + + if (attrs.getValue(NEGATE) != null) { + exceptionStringNegation = attrs.getValue(NEGATE).equals(YES); + } + if (attrs.getValue(SCOPE) != null) { + exceptionValidNext = attrs.getValue(SCOPE).equals("next"); + exceptionValidPrev = attrs.getValue(SCOPE).equals("previous"); + } + if (attrs.getValue(INFLECTED) != null) { + exceptionStringInflected = attrs.getValue(INFLECTED).equals(YES); + } + if (attrs.getValue(POSTAG) != null) { + exceptionPosToken = attrs.getValue(POSTAG); + if (attrs.getValue(POSTAG_REGEXP) != null) { + exceptionPosRegExp = attrs.getValue(POSTAG_REGEXP).equals(YES); + } + if (attrs.getValue(NEGATE_POS) != null) { + exceptionPosNegation = attrs.getValue(NEGATE_POS).equals(YES); + } + } + if (attrs.getValue(REGEXP) != null) { + exceptionStringRegExp = attrs.getValue(REGEXP).equals(YES); + } + + } else if (qName.equals(TOKEN)) { + setToken(attrs); + } else if (qName.equals("translation")) { 
+ inTranslation = true; + final String languageStr = attrs.getValue("lang"); + final Language tmpLang = Language.getLanguageForShortName(languageStr); + currentTranslationLanguage = tmpLang; + if (tmpLang == motherTongue) { + translationLanguage = tmpLang; + if (translationLanguage == null) { + throw new SAXException("Unknown language '" + languageStr + "'"); + } + } + } else if (qName.equals(EXAMPLE) + && attrs.getValue(TYPE).equals("correct")) { + inCorrectExample = true; + correctExample = new StringBuilder(); + } else if (qName.equals(EXAMPLE) + && attrs.getValue(TYPE).equals("incorrect")) { + inIncorrectExample = true; + incorrectExample = new StringBuilder(); + } else if (qName.equals("message")) { + inMessage = true; + message = new StringBuilder(); + } else if (qName.equals("rulegroup")) { + ruleGroupId = attrs.getValue("id"); + inRuleGroup = true; + defaultOff = "off".equals(attrs.getValue(DEFAULT)); + } + } + + @Override + public void endElement(final String namespaceURI, final String sName, + final String qName) { + if (qName.equals("rule")) { + if (language == textLanguage && translationLanguage != null + && translationLanguage == motherTongue && language != motherTongue + && !translations.isEmpty()) { + formatter.applyPattern(messages.getString("false_friend_hint")); + final Object[] messageArguments = { + elements.toString().replace('|', '/'), + messages.getString(textLanguage.getShortName()), + formatTranslations(translations), + messages.getString(motherTongue.getShortName()) }; + final String description = formatter.format(messageArguments); + final PatternRule rule = new PatternRule(id, language, elementList, + messages.getString("false_friend_desc") + " " + + elements.toString().replace('|', '/'), description, messages + .getString("false_friend")); + rule.setCorrectExamples(correctExamples); + rule.setIncorrectExamples(incorrectExamples); + rule.setCategory(new Category(messages + .getString("category_false_friend"))); + if (defaultOff) { + 
rule.setDefaultOff(); + } + rules.add(rule); + } + + if (elementList != null) { + elementList.clear(); + } + + } else if (qName.equals("exception")) { + inException = false; + if (!exceptionSet) { + tokenElement = new Element(elements.toString(), caseSensitive, + regExpression, tokenInflected); + exceptionSet = true; + } + tokenElement.setNegation(tokenNegated); + if (!StringTools.isEmpty(exceptions.toString())) { + tokenElement.setStringException(exceptions.toString(), + exceptionStringRegExp, exceptionStringInflected, + exceptionStringNegation, exceptionValidNext, exceptionValidPrev); + } + if (exceptionPosToken != null) { + tokenElement.setPosException(exceptionPosToken, exceptionPosRegExp, + exceptionPosNegation, exceptionValidNext, exceptionValidPrev); + exceptionPosToken = null; + } + } else if (qName.equals(TOKEN)) { + finalizeTokens(); + } else if (qName.equals("pattern")) { + inPattern = false; + } else if (qName.equals("translation")) { + if (currentTranslationLanguage == motherTongue) { + translations.add(translation); + } + if (currentTranslationLanguage == textLanguage) { + suggestions.add(translation.toString()); + } + translation = new StringBuilder(); + inTranslation = false; + currentTranslationLanguage = null; + } else if (qName.equals(EXAMPLE)) { + if (inCorrectExample) { + correctExamples.add(correctExample.toString()); + } else if (inIncorrectExample) { + incorrectExamples + .add(new IncorrectExample(incorrectExample.toString())); + } + inCorrectExample = false; + inIncorrectExample = false; + correctExample = new StringBuilder(); + incorrectExample = new StringBuilder(); + } else if (qName.equals("message")) { + inMessage = false; + } else if (qName.equals("rulegroup")) { + if (!suggestions.isEmpty()) { + final List<String> l = new ArrayList<String>(suggestions); + suggestionMap.put(id, l); + suggestions.clear(); + } + inRuleGroup = false; + } + } + + private String formatTranslations(final List<StringBuilder> translations) { + final 
StringBuilder sb = new StringBuilder(); + for (final Iterator<StringBuilder> iter = translations.iterator(); iter + .hasNext();) { + final StringBuilder trans = iter.next(); + sb.append('"'); + sb.append(trans.toString()); + sb.append('"'); + if (iter.hasNext()) { + sb.append(", "); + } + } + return sb.toString(); + } + + @Override + public void characters(final char[] buf, final int offset, final int len) { + final String s = new String(buf, offset, len); + if (inException) { + exceptions.append(s); + } else if (inToken && inPattern) { + elements.append(s); + } else if (inCorrectExample) { + correctExample.append(s); + } else if (inIncorrectExample) { + incorrectExample.append(s); + } else if (inTranslation) { + translation.append(s); + } + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/Match.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/Match.java new file mode 100644 index 0000000..0519f2c --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/Match.java @@ -0,0 +1,551 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.rules.patterns; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.TreeSet; +import java.util.regex.Pattern; + +import de.danielnaber.languagetool.AnalyzedToken; +import de.danielnaber.languagetool.AnalyzedTokenReadings; +import de.danielnaber.languagetool.JLanguageTool; +import de.danielnaber.languagetool.synthesis.Synthesizer; +import de.danielnaber.languagetool.tools.StringTools; + +/** + * Reference to a matched token in a pattern, can be formatted and used for + * matching & suggestions. + * + * @author Marcin Miłkowski + */ +public class Match { + + /** Possible string case conversions. **/ + public enum CaseConversion { + NONE, STARTLOWER, STARTUPPER, ALLLOWER, ALLUPPER; + + /** + * Converts string to the constant enum. + * + * @param str + * String value to be converted. + * @return CaseConversion enum. + */ + public static CaseConversion toCase(final String str) { + try { + return valueOf(str); + } catch (final Exception ex) { + return NONE; + } + } + } + + public enum IncludeRange { + NONE, FOLLOWING, ALL; + + /** + * Converts string to the constant enum. + * + * @param str + * String value to be converted. + * @return IncludeRange enum. 
+ */ + public static IncludeRange toRange(final String str) { + try { + return valueOf(str); + } catch (final Exception ex) { + return NONE; + } + } + } + + private final String posTag; + private boolean postagRegexp; + private final String regexReplace; + private final String posTagReplace; + private final CaseConversion caseConversionType; + + private final IncludeRange includeSkipped; + private String skippedTokens; + + /** + * True if this match element formats a statically defined lemma which is + * enclosed by the element, e.g., <tt><match...>word</word></tt>. + */ + private boolean staticLemma; + + /** + * True if this match element is used for formatting POS token. + */ + private final boolean setPos; + + private AnalyzedTokenReadings formattedToken; + private AnalyzedTokenReadings matchedToken; + + private int tokenRef; + + /** Word form generator for POS tags. **/ + private Synthesizer synthesizer; + + /** Pattern used to define parts of the matched token. **/ + private Pattern pRegexMatch; + + /** Pattern used to define parts of the matched POS token. **/ + private Pattern pPosRegexMatch; + + /** + * True when the match is not in the suggestion. + */ + private boolean inMessageOnly; + + public Match(final String posTag, final String posTagReplace, + final boolean postagRegexp, final String regexMatch, + final String regexReplace, final CaseConversion caseConversionType, + final boolean setPOS, + final IncludeRange includeSkipped) { + this.posTag = posTag; + this.postagRegexp = postagRegexp; + this.caseConversionType = caseConversionType; + + if (regexMatch != null) { + pRegexMatch = Pattern.compile(regexMatch); + } + if (postagRegexp && posTag != null) { + pPosRegexMatch = Pattern.compile(posTag); + } + + this.regexReplace = regexReplace; + this.posTagReplace = posTagReplace; + this.setPos = setPOS; + this.includeSkipped = includeSkipped; + } + + /** + * Sets the token that will be formatted or otherwise used in the class. 
+ */ + public final void setToken(final AnalyzedTokenReadings token) { + if (staticLemma) { + matchedToken = token; + } else { + formattedToken = token; + } + } + + /** + * Sets the token to be formatted etc. and includes the support for + * including the skipped tokens. + * @param tokens Array of tokens + * @param index Index of the token to be formatted + * @param next Position of the next token (the skipped tokens + * are the ones between the tokens[index] and tokens[next] + */ + public final void setToken(final AnalyzedTokenReadings[] tokens, final int index, final int next) { + setToken(tokens[index]); + if (next > 1 && includeSkipped != IncludeRange.NONE) { + final StringBuilder sb = new StringBuilder(); + if (includeSkipped == IncludeRange.FOLLOWING) { + formattedToken = null; + } + for (int k = index + 1; k < index + next; k++) { + if (k > index + 1 && + tokens[k].isWhitespaceBefore()) { + sb.append(' '); + } + sb.append(tokens[k].getToken()); + } + skippedTokens = sb.toString(); + } else { + skippedTokens = ""; + } + } + + /** + private String[] addSkipped(final String[] formattedString) { + if (skippedTokens != null && !"".equals(skippedTokens)) { + String[] finalStrings = new String[formattedString.length]; + for (int i = 1; i <= formattedString.length; i++) + } + } + + **/ + + /** + * Checks if the Match element is used for setting the part of speech Element. + * + * @return True if Match sets POS. + */ + public final boolean setsPos() { + return setPos; + } + + /** + * Checks if the Match element uses regexp-based form of the POS tag. + * + * @return True if regexp is used in POS. + */ + public final boolean posRegExp() { + return postagRegexp; + } + + /** + * Sets a base form (lemma) that will be formatted, or synthesized, using the + * specified POS regular expressions. + * + * @param lemmaString String that specifies the base form. 
+ */ + public final void setLemmaString(final String lemmaString) { + if (!StringTools.isEmpty(lemmaString)) { + formattedToken = new AnalyzedTokenReadings(new AnalyzedToken(lemmaString, + posTag, lemmaString), 0); + staticLemma = true; + postagRegexp = true; + if (posTag != null) { + pPosRegexMatch = Pattern.compile(posTag); + } + } + } + + /** + * Sets a synthesizer used for grammatical synthesis of forms based on + * formatted POS values. + * + * @param synth Synthesizer class. + */ + public final void setSynthesizer(final Synthesizer synth) { + synthesizer = synth; + } + + /** + * Gets all strings formatted using the match element. + * + * @return array of strings + * @throws IOException + * in case of synthesizer-related disk problems. + */ + public final String[] toFinalString() throws IOException { + String[] formattedString = new String[1]; + if (formattedToken != null) { + final int readingCount = formattedToken.getReadingsLength(); + formattedString[0] = formattedToken.getToken(); + if (pRegexMatch != null) { + formattedString[0] = pRegexMatch.matcher(formattedString[0]) + .replaceAll(regexReplace); + } + formattedString[0] = convertCase(formattedString[0]); + if (posTag != null) { + if (synthesizer == null) { + formattedString[0] = formattedToken.getToken(); + } else if (postagRegexp) { + final TreeSet<String> wordForms = new TreeSet<String>(); + boolean oneForm = false; + for (int k = 0; k < readingCount; k++) { + if (formattedToken.getAnalyzedToken(k).getLemma() == null) { + final String posUnique = formattedToken.getAnalyzedToken(k) + .getPOSTag(); + if (posUnique == null) { + wordForms.add(formattedToken.getToken()); + oneForm = true; + } else { + if (JLanguageTool.SENTENCE_START_TAGNAME.equals(posUnique) + || JLanguageTool.SENTENCE_END_TAGNAME.equals(posUnique) + || JLanguageTool.PARAGRAPH_END_TAGNAME.equals(posUnique)) { + if (!oneForm) { + wordForms.add(formattedToken.getToken()); + } + oneForm = true; + } else { + oneForm = false; + } + } + } + } 
+ final String targetPosTag = getTargetPosTag(); + if (!oneForm) { + for (int i = 0; i < readingCount; i++) { + final String[] possibleWordForms = synthesizer.synthesize( + formattedToken.getAnalyzedToken(i), targetPosTag, true); + if (possibleWordForms != null) { + wordForms.addAll(Arrays.asList(possibleWordForms)); + } + } + } + if (wordForms.isEmpty()) { + formattedString[0] = "(" + formattedToken.getToken() + ")"; + } else { + formattedString = wordForms.toArray(new String[wordForms.size()]); + } + } else { + final TreeSet<String> wordForms = new TreeSet<String>(); + for (int i = 0; i < readingCount; i++) { + final String[] possibleWordForms = synthesizer.synthesize( + formattedToken.getAnalyzedToken(i), posTag); + if (possibleWordForms != null) { + wordForms.addAll(Arrays.asList(possibleWordForms)); + } + } + formattedString = wordForms.toArray(new String[wordForms.size()]); + } + } + } + if (includeSkipped != IncludeRange.NONE + && skippedTokens != null && !"".equals(skippedTokens)) { + final String[] helper = new String[formattedString.length]; + for (int i = 0; i < formattedString.length; i++) { + if (formattedString[i] == null) { + formattedString[i] = ""; + } + helper[i] = formattedString[i] + skippedTokens; + } + formattedString = helper; + } + return formattedString; + } + + /** + * Format POS tag using parameters already defined in the class. + * + * @return Formatted POS tag as String. + */ + // FIXME: gets only the first POS tag that matches, this can be wrong + // on the other hand, many POS tags = too many suggestions? 
+ public final String getTargetPosTag() { + String targetPosTag = posTag; + final List<String> posTags = new ArrayList<String>(); + if (staticLemma) { + final int numRead = matchedToken.getReadingsLength(); + for (int i = 0; i < numRead; i++) { + final String tst = matchedToken.getAnalyzedToken(i).getPOSTag(); + if (tst != null && pPosRegexMatch.matcher(tst).matches()) { + targetPosTag = matchedToken.getAnalyzedToken(i).getPOSTag(); + posTags.add(targetPosTag); + } + } + if (pPosRegexMatch != null && posTagReplace != null) { + targetPosTag = pPosRegexMatch.matcher(targetPosTag).replaceAll( + posTagReplace); + } + } else { + final int numRead = formattedToken.getReadingsLength(); + for (int i = 0; i < numRead; i++) { + final String tst = formattedToken.getAnalyzedToken(i).getPOSTag(); + if (tst != null && pPosRegexMatch.matcher(tst).matches()) { + targetPosTag = formattedToken.getAnalyzedToken(i).getPOSTag(); + posTags.add(targetPosTag); + } + } + if (pPosRegexMatch != null && posTagReplace != null) { + if (posTags.isEmpty()) { + posTags.add(targetPosTag); + } + final StringBuilder sb = new StringBuilder(); + final int posTagLen = posTags.size(); + int l = 0; + for (String lposTag : posTags) { + l++; + lposTag = pPosRegexMatch.matcher(lposTag).replaceAll(posTagReplace); + if (setPos) { + lposTag = synthesizer.getPosTagCorrection(lposTag); + } + sb.append(lposTag); + if (l < posTagLen) { + sb.append('|'); + } + } + targetPosTag = sb.toString(); + } + } + return targetPosTag; + } + + /** + * Method for getting the formatted match as a single string. In case of + * multiple matches, it joins them using a regular expression operator "|". + * + * @return Formatted string of the matched token. 
+ */ + public final String toTokenString() throws IOException { + final StringBuilder output = new StringBuilder(); + final String[] stringToFormat = toFinalString(); + for (int i = 0; i < stringToFormat.length; i++) { + output.append(stringToFormat[i]); + if (i + 1 < stringToFormat.length) { + output.append('|'); + } + } + return output.toString(); + } + + /** + * Sets the token number referenced by the match. + * + * @param i Token number. + */ + public final void setTokenRef(final int i) { + tokenRef = i; + } + + /** + * Gets the token number referenced by the match. + * + * @return int - token number. + */ + public final int getTokenRef() { + return tokenRef; + } + + /** + * Converts case of the string token according to match element attributes. + * + * @param s Token to be converted. + * @return Converted string. + */ + private String convertCase(final String s) { + if (StringTools.isEmpty(s)) { + return s; + } + String token = s; + switch (caseConversionType) { + case NONE: + break; + case STARTLOWER: + token = token.substring(0, 1).toLowerCase() + token.substring(1); + break; + case STARTUPPER: + token = token.substring(0, 1).toUpperCase() + token.substring(1); + break; + case ALLUPPER: + token = token.toUpperCase(); + break; + case ALLLOWER: + token = token.toLowerCase(); + break; + default: + break; + } + return token; + } + + /** + * Used to let LT know that it should change the case of the match. + * + * @return true if match converts the case of the token. 
+ */ + public final boolean convertsCase() { + return !caseConversionType.equals(CaseConversion.NONE); + } + + public final AnalyzedTokenReadings filterReadings() { + final ArrayList<AnalyzedToken> l = new ArrayList<AnalyzedToken>(); + if (formattedToken != null) { + if (staticLemma) { + formattedToken = new AnalyzedTokenReadings(new AnalyzedToken( + matchedToken.getToken(), posTag, formattedToken.getToken()), + matchedToken.getStartPos()); + formattedToken.setWhitespaceBefore(matchedToken.isWhitespaceBefore()); + } + String token = formattedToken.getToken(); + if (pRegexMatch != null) { + token = pRegexMatch.matcher(token).replaceAll(regexReplace); + } + token = convertCase(token); + if (posTag != null) { + final int numRead = formattedToken.getReadingsLength(); + if (postagRegexp) { + String targetPosTag = posTag; + for (int i = 0; i < numRead; i++) { + final String tst = formattedToken.getAnalyzedToken(i).getPOSTag(); + if (tst != null && pPosRegexMatch.matcher(tst).matches()) { + targetPosTag = formattedToken.getAnalyzedToken(i).getPOSTag(); + if (posTagReplace != null) { + targetPosTag = pPosRegexMatch.matcher(targetPosTag).replaceAll( + posTagReplace); + } + l + .add(new AnalyzedToken(token, targetPosTag, formattedToken + .getAnalyzedToken(i).getLemma())); + l.get(l.size() - 1).setWhitespaceBefore(formattedToken.isWhitespaceBefore()); + } + } + if (l.isEmpty()) { + for (final AnalyzedToken anaTok : getNewToken(numRead, token)) { + l.add(anaTok); + } + } + } else { + for (final AnalyzedToken anaTok : getNewToken(numRead, token)) { + l.add(anaTok); + } + } + if (formattedToken.isSentEnd()) { + l.add(new AnalyzedToken(formattedToken.getToken(), + JLanguageTool.SENTENCE_END_TAGNAME, + formattedToken.getAnalyzedToken(0).getLemma())); + } + if (formattedToken.isParaEnd()) { + l.add(new AnalyzedToken(formattedToken.getToken(), + JLanguageTool.PARAGRAPH_END_TAGNAME, + formattedToken.getAnalyzedToken(0).getLemma())); + } + } + } + if (l.isEmpty()) { + return 
formattedToken; + } + return new AnalyzedTokenReadings(l.toArray(new AnalyzedToken[l.size()]), formattedToken.getStartPos()); + } + + private AnalyzedToken[] getNewToken(final int numRead, final String token) { + final List<AnalyzedToken> list = new ArrayList<AnalyzedToken>(); + String lemma = ""; + for (int j = 0; j < numRead; j++) { + if (formattedToken.getAnalyzedToken(j).getPOSTag() != null) { + if (formattedToken.getAnalyzedToken(j).getPOSTag().equals(posTag) + && (formattedToken.getAnalyzedToken(j).getLemma() != null)) { + lemma = formattedToken.getAnalyzedToken(j).getLemma(); + } + if (StringTools.isEmpty(lemma)) { + lemma = formattedToken.getAnalyzedToken(0).getLemma(); + } + list.add(new AnalyzedToken(token, posTag, lemma)); + list.get(list.size() - 1). + setWhitespaceBefore(formattedToken.isWhitespaceBefore()); + } + } + return list.toArray(new AnalyzedToken[list.size()]); + } + + /** + * @param inMessageOnly + * the inMessageOnly to set + */ + public void setInMessageOnly(final boolean inMessageOnly) { + this.inMessageOnly = inMessageOnly; + } + + /** + * @return the inMessageOnly + */ + public boolean isInMessageOnly() { + return inMessageOnly; + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/PatternRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/PatternRule.java new file mode 100644 index 0000000..843ef98 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/PatternRule.java @@ -0,0 +1,652 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.rules.patterns; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import de.danielnaber.languagetool.AnalyzedSentence; +import de.danielnaber.languagetool.AnalyzedTokenReadings; +import de.danielnaber.languagetool.Language; +import de.danielnaber.languagetool.rules.IncorrectExample; +import de.danielnaber.languagetool.rules.RuleMatch; +import de.danielnaber.languagetool.tools.StringTools; + +/** + * A Rule that describes a language error as a simple pattern of words or of + * part-of-speech tags. + * + * @author Daniel Naber + */ +public class PatternRule extends AbstractPatternRule { + + private static final String SUGG_TAG = "<suggestion>"; + private static final String END_SUGG_TAG = "</suggestion>"; + + private String subId; // because there can be more than one rule in a rule + // group + + private String message; + private String shortMessage; + + /** Formatted suggestion elements. **/ + private List<Match> suggestionMatches; + + /** + * A list of elements as they appear in XML file (phrases count as single + * tokens in case of matches or skipping). + */ + private List<Integer> elementNo; + + /** + * This property is used for short-circuiting evaluation of the elementNo list + * order. + */ + private boolean useList; + + /** + * Marks whether the rule is a member of a disjunctive set (in case of OR + * operation on phraserefs). 
+ **/ + private boolean isMemberOfDisjunctiveSet; + + /** + * @param id + * Id of the Rule + * @param language + * Language of the Rule + * @param elements + * Element (token) list + * @param description + * Description to be shown (name) + * @param message + * Message to be displayed to the user + */ + + public PatternRule(final String id, final Language language, + final List<Element> elements, final String description, + final String message, final String shortMessage) { + super(id, description, language, elements, false); + if (id == null) { + throw new NullPointerException("id cannot be null"); + } + if (language == null) { + throw new NullPointerException("language cannot be null"); + } + if (elements == null) { + throw new NullPointerException("elements cannot be null"); + } + if (description == null) { + throw new NullPointerException("description cannot be null"); + } + + this.message = message; + this.shortMessage = shortMessage; + this.elementNo = new ArrayList<Integer>(); + String prevName = ""; + String curName = ""; + int cnt = 0; + int loopCnt = 0; + for (final Element e : patternElements) { + if (e.isPartOfPhrase()) { + curName = e.getPhraseName(); + if (prevName.equals(curName) || StringTools.isEmpty(prevName)) { + cnt++; + useList = true; + } else { + elementNo.add(cnt); + prevName = ""; + curName = ""; + cnt = 0; + } + prevName = curName; + loopCnt++; + if (loopCnt == patternElements.size() && !StringTools.isEmpty(prevName)) { + elementNo.add(cnt); + } + } else { + if (cnt > 0) { + elementNo.add(cnt); + } + elementNo.add(1); + loopCnt++; + } + } + } + + public PatternRule(final String id, final Language language, + final List<Element> elements, final String description, + final String message, final String shortMessage, final boolean isMember) { + this(id, language, elements, description, message, shortMessage); + this.isMemberOfDisjunctiveSet = isMember; + } + + public final String getSubId() { + return subId; + } + + public final void 
setSubId(final String subId) { + this.subId = subId; + } + + public final String getMessage() { + return message; + } + + /** + * Used for testing rules: only one of the set can match. + * + * @return Whether the rule can non-match (as a member of disjunctive set of + * rules generated by phraseref in includephrases element). + */ + public final boolean isWithComplexPhrase() { + return isMemberOfDisjunctiveSet; + } + + /** Reset complex status - used for testing. **/ + public final void notComplexPhrase() { + isMemberOfDisjunctiveSet = false; + } + + /** + * Return the pattern as a string. + * + * @since 0.9.2 + */ + public final String toPatternString() { + final List<String> strList = new ArrayList<String>(); + for (Element patternElement : patternElements) { + strList.add(patternElement.toString()); + } + return StringTools.listToString(strList, ", "); + } + + /** + * Return the pattern as an XML string. FIXME: this is not complete, information might be lost! + * + * @since 0.9.3 + */ + public final String toXML() { + final StringBuilder sb = new StringBuilder(); + sb.append("<rule id=\""); + sb.append(StringTools.escapeXML(getId())); + sb.append("\" name=\""); + sb.append(StringTools.escapeXML(getDescription())); + sb.append("\">\n"); + sb.append("<pattern mark_from=\""); + sb.append(startPositionCorrection); + sb.append("\" mark_to=\""); + sb.append(endPositionCorrection); + sb.append('"'); + // for now, case sensitivity is per pattern, not per element, + // so just use the setting of the first element: + if (!patternElements.isEmpty() && patternElements.get(0).getCaseSensitive()) { + sb.append(" case_sensitive=\"yes\""); + } + sb.append(">\n"); + for (Element patternElement : patternElements) { + sb.append("<token"); + if (patternElement.getNegation()) { + sb.append(" negate=\"yes\""); + } + if (patternElement.isRegularExpression()) { + sb.append(" regexp=\"yes\""); + } + if (patternElement.getPOStag() != null) { + sb.append(" postag=\""); + 
sb.append(patternElement.getPOStag()); + sb.append('"'); + } + if (patternElement.getPOSNegation()) { + sb.append(" negate_pos=\"yes\""); + } + if (patternElement.isInflected()) { + sb.append(" inflected=\"yes\""); + } + sb.append('>'); + if (patternElement.getString() != null) { + sb.append(StringTools.escapeXML(patternElement.getString())); + } else { + // TODO + } + sb.append("</token>\n"); + } + sb.append("</pattern>\n"); + sb.append("<message>"); + sb.append(StringTools.escapeXML(message)); + sb.append("</message>\n"); + if (getIncorrectExamples() != null) { + for (IncorrectExample example : getIncorrectExamples()) { + sb.append("<example type=\"incorrect\">"); + sb.append(StringTools.escapeXML(example.getExample())); + sb.append("</example>\n"); + } + } + if (getCorrectExamples() != null) { + for (String example : getCorrectExamples()) { + sb.append("<example type=\"correct\">"); + sb.append(StringTools.escapeXML(example)); + sb.append("</example>\n"); + } + } + sb.append("</rule>"); + return sb.toString(); + } + + public final void setMessage(final String message) { + this.message = message; + } + + @Override + public final RuleMatch[] match(final AnalyzedSentence text) + throws IOException { + final List<RuleMatch> ruleMatches = new ArrayList<RuleMatch>(); + final AnalyzedTokenReadings[] tokens = text.getTokensWithoutWhitespace(); + final int[] tokenPositions = new int[tokens.length + 1]; + final int patternSize = patternElements.size(); + final int limit = Math.max(0, tokens.length - patternSize + 1); + Element elem = null; + int i = 0; + while (i < limit && !(sentStart && i > 0)) { + boolean allElementsMatch = false; + int firstMatchToken = -1; + int lastMatchToken = -1; + int matchingTokens = 0; + int prevSkipNext = 0; + // this variable keeps the total number + // of tokens skipped + int skipShiftTotal = 0; + if (testUnification) { + unifier.reset(); + } + for (int k = 0; k < patternSize; k++) { + final Element prevElement = elem; + elem = 
patternElements.get(k); + setupRef(firstMatchToken, elem, tokens); + final int nextPos = i + k + skipShiftTotal; + prevMatched = false; + if (prevSkipNext + nextPos >= tokens.length || prevSkipNext < 0) { // SENT_END? + prevSkipNext = tokens.length - (nextPos + 1); + } + final int maxTok = Math.min(nextPos + prevSkipNext, tokens.length - (patternSize - k)); + for (int m = nextPos; m <= maxTok; m++) { + allElementsMatch = testAllReadings(tokens, elem, prevElement, m, + firstMatchToken, prevSkipNext); + if (allElementsMatch) { + lastMatchToken = m; + final int skipShift = lastMatchToken - nextPos; + tokenPositions[matchingTokens] = skipShift + 1; + prevSkipNext = translateElementNo(elem.getSkipNext()); + matchingTokens++; + skipShiftTotal += skipShift; + if (firstMatchToken == -1) { + firstMatchToken = lastMatchToken; + } + break; + } + } + if (!allElementsMatch) { + break; + } + } + + if (allElementsMatch && matchingTokens == patternSize) { + final RuleMatch rM = createRuleMatch(tokenPositions, tokens, + firstMatchToken, lastMatchToken, matchingTokens); + if (rM != null) { + ruleMatches.add(rM); + } + } + i++; + } + return ruleMatches.toArray(new RuleMatch[ruleMatches.size()]); + } + + private RuleMatch createRuleMatch(final int[] tokenPositions, + final AnalyzedTokenReadings[] tokens, final int firstMatchToken, + final int lastMatchToken, final int matchingTokens) throws IOException { + final String errMessage = formatMatches(tokens, tokenPositions, + firstMatchToken, message); + int correctedStPos = 0; + if (startPositionCorrection > 0) { + for (int l = 0; l <= startPositionCorrection; l++) { + correctedStPos += tokenPositions[l]; + } + correctedStPos--; + } + int correctedEndPos = 0; + if (endPositionCorrection < 0) { + int l = 0; + while (l > endPositionCorrection) { + correctedEndPos -= tokenPositions[matchingTokens + l - 1]; + l--; + } + } + AnalyzedTokenReadings firstMatchTokenObj = tokens[firstMatchToken + + correctedStPos]; + boolean startsWithUppercase = 
StringTools + .startsWithUppercase(firstMatchTokenObj.getToken()) + && !matchConvertsCase(); + + if (firstMatchTokenObj.isSentStart() + && tokens.length > firstMatchToken + correctedStPos + 1) { + // make uppercasing work also at sentence start: + firstMatchTokenObj = tokens[firstMatchToken + correctedStPos + 1]; + startsWithUppercase = StringTools.startsWithUppercase(firstMatchTokenObj + .getToken()); + } + int fromPos = tokens[firstMatchToken + correctedStPos].getStartPos(); + // FIXME: this is fishy, assumes that comma should always come before + // whitespace + if (errMessage.contains(SUGG_TAG + ",") + && firstMatchToken + correctedStPos >= 1) { + fromPos = tokens[firstMatchToken + correctedStPos - 1].getStartPos() + + tokens[firstMatchToken + correctedStPos - 1].getToken().length(); + } + + final int toPos = tokens[lastMatchToken + correctedEndPos].getStartPos() + + tokens[lastMatchToken + correctedEndPos].getToken().length(); + if (fromPos < toPos) { // this can happen with some skip="-1" when the last + // token is not matched + return new RuleMatch(this, fromPos, toPos, + errMessage, shortMessage, startsWithUppercase); + } // failed to create any rule match... + return null; + } + + /** + * Checks if the suggestion starts with a match that is supposed to convert + * case. If it does, stop the default conversion to uppercase. + * + * @return true, if the match converts the case of the token. 
+ */ + private boolean matchConvertsCase() { + if (suggestionMatches != null && !suggestionMatches.isEmpty()) { + final int sugStart = message.indexOf(SUGG_TAG) + SUGG_TAG.length(); + for (Match sMatch : suggestionMatches) { + if (!sMatch.isInMessageOnly() && sMatch.convertsCase() + && message.charAt(sugStart) == '\\') { + return true; + } + } + } + return false; + } + + public final void addSuggestionMatch(final Match m) { + if (suggestionMatches == null) { + suggestionMatches = new ArrayList<Match>(); + } + suggestionMatches.add(m); + } + + /** + * Gets the index of the element indexed by i, adding any offsets because of + * the phrases in the rule. + * + * @param i + * Current element index. + * @return int Index translated into XML element no. + */ + private int translateElementNo(final int i) { + if (!useList || i < 0) { + return i; + } + int j = 0; + for (int k = 0; k < i; k++) { + j += elementNo.get(k); + } + return j; + } + + /** + * Returns true when the token in the rule references a phrase composed of + * many tokens. + * + * @param i + * The index of the token. + * @return true if the phrase is under the index, false otherwise. + **/ + private int phraseLen(final int i) { + if (!useList || i > (elementNo.size() - 1)) { + return 1; + } + return elementNo.get(i); + } + + /** + * Creates a Cartesian product of the arrays stored in the input array. + * + * @param input + * Array of string arrays to combine. + * @param output + * Work array of strings. + * @param r + * Starting parameter (use 0 to get all combinations). + * @param lang + * Text language for adding spaces in some languages. + * @return Combined array of @String. 
+ */ + private static String[] combineLists(final String[][] input, + final String[] output, final int r, final Language lang) { + final List<String> outputList = new ArrayList<String>(); + if (r == input.length) { + final StringBuilder sb = new StringBuilder(); + for (int k = 0; k < output.length; k++) { + sb.append(output[k]); + if (k < output.length - 1) { + sb.append(StringTools.addSpace(output[k + 1], lang)); + } + } + outputList.add(sb.toString()); + } else { + for (int c = 0; c < input[r].length; c++) { + output[r] = input[r][c]; + final String[] sList = combineLists(input, output, r + 1, lang); + outputList.addAll(Arrays.asList(sList)); + } + } + return outputList.toArray(new String[outputList.size()]); + } + + /** + * Concatenates the matches, and takes care of phrases (including inflection + * using synthesis). + * + * @param start + * Position of the element as referenced by match element in the + * rule. + * @param index + * The index of the element found in the matching sentence. + * @param tokenIndex + * The position of the token in the AnalyzedTokenReadings array. + * @param tokens + * Array of @AnalyzedTokenReadings + * @return @String[] Array of concatenated strings + * @throws IOException + * in case disk operations (used in synthesizer) go wrong. 
+ */ + private String[] concatMatches(final int start, final int index, + final int tokenIndex, final AnalyzedTokenReadings[] tokens, + final int nextTokenPos) + throws IOException { + String[] finalMatch = null; + if (suggestionMatches.get(start) != null) { + final int len = phraseLen(index); + if (len == 1) { + final int skippedTokens = nextTokenPos - tokenIndex; + suggestionMatches.get(start).setToken(tokens, tokenIndex - 1, skippedTokens); + suggestionMatches.get(start).setSynthesizer(language.getSynthesizer()); + finalMatch = suggestionMatches.get(start).toFinalString(); + } else { + final List<String[]> matchList = new ArrayList<String[]>(); + for (int i = 0; i < len; i++) { + final int skippedTokens = nextTokenPos - (tokenIndex + i); + suggestionMatches.get(start).setToken(tokens, tokenIndex - 1 + i, skippedTokens); + suggestionMatches.get(start) + .setSynthesizer(language.getSynthesizer()); + matchList.add(suggestionMatches.get(start).toFinalString()); + } + return combineLists(matchList.toArray(new String[matchList.size()][]), + new String[matchList.size()], 0, language); + } + } + return finalMatch; + } + + /** + * Replace back references generated with <match> and \\1 in message + * using Match class, and take care of skipping. * + * + * @param tokenReadings + * Array of AnalyzedTokenReadings that were matched against the + * pattern + * @param positions + * Array of relative positions of matched tokens + * @param firstMatchTok + * Position of the first matched token + * @param errorMsg + * String containing suggestion markup + * @return String Formatted message. 
+ * @throws IOException + * + **/ + private String formatMatches(final AnalyzedTokenReadings[] tokenReadings, + final int[] positions, final int firstMatchTok, final String errorMsg) + throws IOException { + String errorMessage = errorMsg; + int matchCounter = 0; + final int[] numbersToMatches = new int[errorMsg.length()]; + boolean newWay = false; + int errLen = errorMessage.length(); + int errMarker = errorMessage.indexOf('\\'); + boolean numberFollows = false; + if (errMarker > 0 && errMarker < errLen - 1) { + numberFollows = StringTools.isPositiveNumber(errorMessage + .charAt(errMarker + 1)); + } + while (errMarker > 0 && numberFollows) { + final int ind = errorMessage.indexOf('\\'); + if (ind > 0 && StringTools.isPositiveNumber(errorMessage.charAt(ind + 1))) { + int numLen = 1; + while (ind + numLen < errorMessage.length() + && StringTools.isPositiveNumber(errorMessage.charAt(ind + numLen))) { + numLen++; + } + final int j = Integer.parseInt(errorMessage.substring(ind + 1, ind + + numLen)) - 1; + int repTokenPos = 0; + int nextTokenPos = 0; + for (int l = 0; l <= j; l++) { + repTokenPos += positions[l]; + } + if (j <= positions.length) { + nextTokenPos = firstMatchTok + repTokenPos + positions[j + 1]; + } + if (suggestionMatches != null) { + if (matchCounter < suggestionMatches.size()) { + numbersToMatches[j] = matchCounter; + if (suggestionMatches.get(matchCounter) != null) { + final String[] matches = concatMatches(matchCounter, j, + firstMatchTok + repTokenPos, tokenReadings, nextTokenPos); + final String leftSide = errorMessage.substring(0, ind); + final String rightSide = errorMessage.substring(ind + numLen); + if (matches.length == 1) { + errorMessage = leftSide + matches[0] + rightSide; + } else { + errorMessage = formatMultipleSynthesis(matches, leftSide, + rightSide); + } + matchCounter++; + newWay = true; + } + } else { + // FIXME: is this correct? 
this is how we deal with multiple matches + suggestionMatches.add(suggestionMatches.get(numbersToMatches[j])); + } + } + + if (!newWay) { + // in case <match> elements weren't used (yet) + errorMessage = errorMessage.replace("\\" + (j + 1), + tokenReadings[firstMatchTok + repTokenPos - 1].getToken()); + } + } + errMarker = errorMessage.indexOf('\\'); + numberFollows = false; + errLen = errorMessage.length(); + if (errMarker > 0 && errMarker < errLen - 1) { + numberFollows = StringTools.isPositiveNumber(errorMessage + .charAt(errMarker + 1)); + } + } + return errorMessage; + } + + private static String formatMultipleSynthesis(final String[] matches, + final String leftSide, final String rightSide) { + String errorMessage = ""; + String suggestionLeft = ""; + String suggestionRight = ""; + String rightSideNew = rightSide; + final int sPos = leftSide.lastIndexOf(SUGG_TAG); + if (sPos > 0) { + suggestionLeft = leftSide.substring(sPos + SUGG_TAG.length()); + } + if (StringTools.isEmpty(suggestionLeft)) { + errorMessage = leftSide; + } else { + errorMessage = leftSide.substring(0, leftSide.lastIndexOf(SUGG_TAG)) + + SUGG_TAG; + } + final int rPos = rightSide.indexOf(END_SUGG_TAG); + if (rPos > 0) { + suggestionRight = rightSide.substring(0, rPos); + } + if (!StringTools.isEmpty(suggestionRight)) { + rightSideNew = rightSide.substring(rightSide.indexOf(END_SUGG_TAG)); + } + final int lastLeftSugEnd = leftSide.indexOf(END_SUGG_TAG); + final int lastLeftSugStart = leftSide.lastIndexOf(SUGG_TAG); + final StringBuilder sb = new StringBuilder(); + sb.append(errorMessage); + for (int z = 0; z < matches.length; z++) { + sb.append(suggestionLeft); + sb.append(matches[z]); + sb.append(suggestionRight); + if ((z < matches.length - 1) && lastLeftSugEnd < lastLeftSugStart) { + sb.append(END_SUGG_TAG); + sb.append(", "); + sb.append(SUGG_TAG); + } + } + sb.append(rightSideNew); + return sb.toString(); + } + + /** + * For testing only. 
+ */ + public final List<Element> getElements() { + return patternElements; + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/PatternRuleLoader.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/PatternRuleLoader.java new file mode 100644 index 0000000..8156a6e --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/PatternRuleLoader.java @@ -0,0 +1,369 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.rules.patterns; + +import java.io.IOException; +import java.io.InputStream; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; + +import javax.xml.parsers.SAXParser; +import javax.xml.parsers.SAXParserFactory; + +import org.xml.sax.Attributes; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.DefaultHandler; + +import de.danielnaber.languagetool.JLanguageTool; +import de.danielnaber.languagetool.Language; +import de.danielnaber.languagetool.rules.Category; +import de.danielnaber.languagetool.rules.IncorrectExample; + +/** + * Loads {@link PatternRule}s from an XML file. 
+ * + * @author Daniel Naber + */ +public class PatternRuleLoader extends DefaultHandler { + + public final List<PatternRule> getRules(final InputStream is, + final String filename) throws IOException { + try { + final PatternRuleHandler handler = new PatternRuleHandler(); + final SAXParserFactory factory = SAXParserFactory.newInstance(); + final SAXParser saxParser = factory.newSAXParser(); + saxParser.getXMLReader().setFeature( + "http://apache.org/xml/features/nonvalidating/load-external-dtd", + false); + saxParser.parse(is, handler); + return handler.getRules(); + } catch (final Exception e) { + final IOException ioe = new IOException("Cannot load or parse '" + + filename + "'"); + ioe.initCause(e); + throw ioe; + } + } + + /** Testing only. */ + public final void main(final String[] args) throws IOException { + final PatternRuleLoader prg = new PatternRuleLoader(); + final String name = "/de/grammar.xml"; + final List<PatternRule> l = prg.getRules(JLanguageTool.getDataBroker().getFromRulesDirAsStream(name), name); + System.out.println(l); + } + +} + +class PatternRuleHandler extends XMLRuleHandler { + + private int subId; + + private boolean defaultOff; + private boolean defaultOn; + + private Category category; + private String description; + private String ruleGroupDescription; + + // =========================================================== + // SAX DocumentHandler methods + // =========================================================== + + @Override + public void startElement(final String namespaceURI, final String lName, + final String qName, final Attributes attrs) throws SAXException { + if ("category".equals(qName)) { + final String catName = attrs.getValue("name"); + final String priorityStr = attrs.getValue("priority"); + // int prio = 0; + if (priorityStr == null) { + category = new Category(catName); + } else { + category = new Category(catName, Integer.parseInt(priorityStr)); + } + + if ("off".equals(attrs.getValue(DEFAULT))) { + 
category.setDefaultOff(); + } + + } else if ("rules".equals(qName)) { + final String languageStr = attrs.getValue("lang"); + language = Language.getLanguageForShortName(languageStr); + if (language == null) { + throw new SAXException("Unknown language '" + languageStr + "'"); + } + } else if ("rule".equals(qName)) { + id = attrs.getValue("id"); + if (inRuleGroup) { + subId++; + } + if (!(inRuleGroup && defaultOff)) { + defaultOff = "off".equals(attrs.getValue(DEFAULT)); + } + + if (!(inRuleGroup && defaultOn)) { + defaultOn = "on".equals(attrs.getValue(DEFAULT)); + } + if (inRuleGroup && id == null) { + id = ruleGroupId; + } + description = attrs.getValue("name"); + if (inRuleGroup && description == null) { + description = ruleGroupDescription; + } + correctExamples = new ArrayList<String>(); + incorrectExamples = new ArrayList<IncorrectExample>(); + if (suggestionMatches != null) { + suggestionMatches.clear(); + } + } else if (PATTERN.equals(qName)) { + startPattern(attrs); + } else if (AND.equals(qName)) { + inAndGroup = true; + } else if ("unify".equals(qName)) { + inUnification = true; + uniNegation = YES.equals(attrs.getValue(NEGATE)); + } else if ("feature".equals(qName)) { + uFeature = attrs.getValue("id"); + } else if (qName.equals(TYPE)) { + uType = attrs.getValue("id"); + uTypeList.add(uType); + } else if (qName.equals(TOKEN)) { + setToken(attrs); + } else if (EXCEPTION.equals(qName)) { + setExceptions(attrs); + } else if (qName.equals(EXAMPLE) + && attrs.getValue(TYPE).equals("correct")) { + inCorrectExample = true; + correctExample = new StringBuilder(); + } else if (qName.equals(EXAMPLE) + && attrs.getValue(TYPE).equals("incorrect")) { + inIncorrectExample = true; + incorrectExample = new StringBuilder(); + exampleCorrection = new StringBuilder(); + if (attrs.getValue("correction") != null) { + exampleCorrection.append(attrs.getValue("correction")); + } + } else if ("message".equals(qName)) { + inMessage = true; + inSuggestion = false; + message = new 
StringBuilder(); + } else if ("short".equals(qName)) { + inShortMessage = true; + shortMessage = new StringBuilder(); + } else if ("rulegroup".equals(qName)) { + ruleGroupId = attrs.getValue("id"); + ruleGroupDescription = attrs.getValue("name"); + defaultOff = "off".equals(attrs.getValue(DEFAULT)); + defaultOn = "on".equals(attrs.getValue(DEFAULT)); + inRuleGroup = true; + subId = 0; + } else if ("suggestion".equals(qName) && inMessage) { + message.append("<suggestion>"); + inSuggestion = true; + } else if ("match".equals(qName)) { + setMatchElement(attrs); + } else if (qName.equals(MARKER) && inCorrectExample) { + correctExample.append("<marker>"); + } else if (qName.equals(MARKER) && inIncorrectExample) { + incorrectExample.append("<marker>"); + } else if (UNIFICATION.equals(qName)) { + uFeature = attrs.getValue("feature"); + inUnificationDef = true; + } else if ("equivalence".equals(qName)) { + uType = attrs.getValue(TYPE); + } else if (PHRASES.equals(qName)) { + inPhrases = true; + } else if ("includephrases".equals(qName)) { + phraseElementInit(); + } else if ("phrase".equals(qName) && inPhrases) { + phraseId = attrs.getValue("id"); + } else if ("phraseref".equals(qName) && (attrs.getValue("idref") != null)) { + preparePhrase(attrs); + } + } + + @Override + public void endElement(final String namespaceURI, final String sName, + final String qName) throws SAXException { + if ("rule".equals(qName)) { + phraseElementInit(); + if (phraseElementList.isEmpty()) { + final PatternRule rule = new PatternRule(id, language, elementList, + description, message.toString(), shortMessage.toString()); + prepareRule(rule); + rules.add(rule); + } else { + if (!elementList.isEmpty()) { + for (final ArrayList<Element> ph : phraseElementList) { + ph.addAll(new ArrayList<Element>(elementList)); + } + } + + for (final ArrayList<Element> phraseElement : phraseElementList) { + processElement(phraseElement); + final PatternRule rule = new PatternRule(id, language, phraseElement, + 
description, message.toString(), shortMessage.toString(), + phraseElementList.size() > 1); + prepareRule(rule); + rules.add(rule); + } + } + elementList.clear(); + if (phraseElementList != null) { + phraseElementList.clear(); + } + + } else if (qName.equals(EXCEPTION)) { + finalizeExceptions(); + } else if (qName.equals(AND)) { + inAndGroup = false; + andGroupCounter = 0; + tokenCounter++; + } else if (qName.equals(TOKEN)) { + finalizeTokens(); + } else if (qName.equals(PATTERN)) { + checkMarkPositions(); + inPattern = false; + if (lastPhrase) { + elementList.clear(); + } + if (phraseElementList == null || phraseElementList.isEmpty()) { + checkPositions(0); + } else { + for (List<Element> elements : phraseElementList) { + checkPositions(elements.size()); + } + } + tokenCounter = 0; + } else if (qName.equals(EXAMPLE)) { + if (inCorrectExample) { + correctExamples.add(correctExample.toString()); + } else if (inIncorrectExample) { + IncorrectExample example = null; + final String[] corrections = exampleCorrection.toString().split("\\|"); + if (corrections.length > 0 && corrections[0].length() > 0) { + example = new IncorrectExample(incorrectExample.toString(), + corrections); + } else { + example = new IncorrectExample(incorrectExample.toString()); + } + incorrectExamples.add(example); + } + inCorrectExample = false; + inIncorrectExample = false; + correctExample = new StringBuilder(); + incorrectExample = new StringBuilder(); + exampleCorrection = new StringBuilder(); + } else if ("message".equals(qName)) { + suggestionMatches = addLegacyMatches(); + inMessage = false; + } else if ("short".equals(qName)) { + inShortMessage = false; + } else if ("match".equals(qName)) { + if (inMessage) { + suggestionMatches.get(suggestionMatches.size() - 1).setLemmaString( + match.toString()); + } else if (inToken) { + tokenReference.setLemmaString(match.toString()); + } + inMatch = false; + } else if ("rulegroup".equals(qName)) { + inRuleGroup = false; + } else if 
("suggestion".equals(qName) && inMessage) { + message.append("</suggestion>"); + inSuggestion = false; + } else if (qName.equals(MARKER) && inCorrectExample) { + correctExample.append("</marker>"); + } else if (qName.equals(MARKER) && inIncorrectExample) { + incorrectExample.append("</marker>"); + } else if ("phrase".equals(qName) && inPhrases) { + finalizePhrase(); + } else if ("includephrases".equals(qName)) { + elementList.clear(); + } else if (PHRASES.equals(qName) && inPhrases) { + inPhrases = false; + } else if (UNIFICATION.equals(qName)) { + inUnificationDef = false; + } else if ("feature".equals(qName)) { + equivalenceFeatures.put(uFeature, uTypeList); + uTypeList = new ArrayList<String>(); + } else if ("unify".equals(qName)) { + inUnification = false; + //clear the features... + equivalenceFeatures = new HashMap<String, List<String>>(); + } + } + + private void prepareRule(final PatternRule rule) { + rule.setStartPositionCorrection(startPositionCorrection); + rule.setEndPositionCorrection(endPositionCorrection); + startPositionCorrection = 0; + endPositionCorrection = 0; + rule.setCorrectExamples(correctExamples); + rule.setIncorrectExamples(incorrectExamples); + rule.setCategory(category); + if (inRuleGroup) { + rule.setSubId(Integer.toString(subId)); + } + else { + rule.setSubId("1"); + } + caseSensitive = false; + if (suggestionMatches != null) { + for (final Match m : suggestionMatches) { + rule.addSuggestionMatch(m); + } + if (phraseElementList.size() <= 1) { + suggestionMatches.clear(); + } + } + if (defaultOff) { + rule.setDefaultOff(); + } + + if (category.isDefaultOff() && !defaultOn) { + rule.setDefaultOff(); + } + + } + + @Override + public void characters(final char[] buf, final int offset, final int len) { + final String s = new String(buf, offset, len); + if (inException) { + exceptions.append(s); + } else if (inToken) { + elements.append(s); + } else if (inCorrectExample) { + correctExample.append(s); + } else if (inIncorrectExample) { + 
incorrectExample.append(s); + } else if (inMatch) { + match.append(s); + } else if (inMessage) { + message.append(s); + } else if (inShortMessage) { + shortMessage.append(s); + } + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/Unifier.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/Unifier.java new file mode 100644 index 0000000..7fbb35d --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/Unifier.java @@ -0,0 +1,432 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2006 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ + +package de.danielnaber.languagetool.rules.patterns; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import de.danielnaber.languagetool.AnalyzedToken; +import de.danielnaber.languagetool.AnalyzedTokenReadings; + +/** + * Implements unification of features over tokens. + * + * @author Marcin Milkowski + */ +public class Unifier { + + //TODO: add a possibility to negate some features but not all + /** + * Negates the meaning of unification just like negation in Element tokens. 
+ */ + private boolean negation; + + private boolean allFeatsIn; + + private int tokCnt; + + private int readingsCounter; + + private final List<AnalyzedTokenReadings> tokSequence; + + /** + * A Map for storing the equivalence types for features. Features are + * specified as Strings, and map into types defined as maps from Strings to + * Elements. + */ + private final Map<EquivalenceTypeLocator, Element> equivalenceTypes; + + /** + * A Map that stores all possible equivalence types listed for features. + */ + private final Map<String, List<String>> equivalenceFeatures; + + /** + * Map of sets of matched equivalences in the unified sequence. + */ + private final List<Map<String, Set<String>>> equivalencesMatched; + + /** + * Marks found interpretations in subsequent tokens. + */ + private List<Boolean> featuresFound; + + /** + * For checking the current token. + */ + private List<Boolean> tmpFeaturesFound; + + /** + * Internal flag for checking whether the first token in tokSequence has to be + * yet unified. + */ + private boolean firstUnified; + + private boolean inUnification; + private boolean uniMatched; + private boolean uniAllMatched; + private AnalyzedTokenReadings[] unifiedTokens; + + /** + * Instantiates the unifier. + */ + public Unifier() { + tokCnt = -1; + readingsCounter = 1; + equivalencesMatched = new ArrayList<Map<String, Set<String>>>(); + equivalenceTypes = new HashMap<EquivalenceTypeLocator, Element>(); + equivalenceFeatures = new HashMap<String, List<String>>(); + featuresFound = new ArrayList<Boolean>(); + tmpFeaturesFound = new ArrayList<Boolean>(); + tokSequence = new ArrayList<AnalyzedTokenReadings>(); + } + + /** + * Prepares equivalence types for features to be tested. All equivalence types + * are given as {@link Element}s. They create an equivalence set (with + * abstraction). + * + * @param feature + * Feature to be tested, like gender, grammatical case or number. 
+ * @param type + * Type of equivalence for the feature, for example plural, first + * person, genitive. + * @param elem + * Element specifying the equivalence. + */ + public final void setEquivalence(final String feature, final String type, + final Element elem) { + if (equivalenceTypes.containsKey(new EquivalenceTypeLocator(feature, type))) { + return; + } + equivalenceTypes.put(new EquivalenceTypeLocator(feature, type), elem); + List<String> lTypes; + if (equivalenceFeatures.containsKey(feature)) { + lTypes = equivalenceFeatures.get(feature); + } else { + lTypes = new ArrayList<String>(); + } + lTypes.add(type); + equivalenceFeatures.put(feature, lTypes); + } + + /** + * Tests if a token has shared features with other tokens. + * + * @param aToken + * - token to be tested + * @param feature + * - feature to be tested + * @param type + * - type of equivalence relation for the feature + * @return true if the token shares this type of feature with other tokens + */ + protected final boolean isSatisfied(final AnalyzedToken aToken, + final Map<String, List<String>> uFeatures) { + + if (allFeatsIn && equivalencesMatched.isEmpty()) { + return false; + } + // Error: no feature given! + if (uFeatures == null) { + return false; // throw exception?? 
+ } + boolean unified = true; + List<String> types; + + if (allFeatsIn) { + unified &= checkNext(aToken, uFeatures); + } else { + tokCnt++; + while (equivalencesMatched.size() <= tokCnt) { + equivalencesMatched.add(new HashMap<String, Set<String>>()); + } + for (final Map.Entry<String, List<String>> feat : uFeatures.entrySet()) { + types = feat.getValue(); + if (types == null || types.isEmpty()) { + types = equivalenceFeatures.get(feat.getKey()); + } + for (final String typename : types) { + final Element testElem = equivalenceTypes + .get(new EquivalenceTypeLocator(feat.getKey(), typename)); + if (testElem == null) { + return false; + } + if (testElem.isMatched(aToken)) { + if (!equivalencesMatched.get(tokCnt).containsKey(feat.getKey())) { + final Set<String> typeSet = new HashSet<String>(); + typeSet.add(typename); + equivalencesMatched.get(tokCnt).put(feat.getKey(), typeSet); + } else { + equivalencesMatched.get(tokCnt).get(feat.getKey()).add(typename); + } + } + } + unified &= equivalencesMatched.get(tokCnt).containsKey(feat.getKey()); + if (!unified) { + break; + } + } + if (unified) { + if (tokCnt == 0 || tokSequence.isEmpty()) { + tokSequence.add(new AnalyzedTokenReadings(aToken, 0)); + } else { + tokSequence.get(0).addReading(aToken); + } + } + } + return unified ^ negation; + } + + private boolean checkNext(final AnalyzedToken aToken, + final Map<String, List<String>> uFeatures) { + boolean unifiedNext = true; + boolean anyFeatUnified = false; + List<String> types; + ArrayList<Boolean> tokenFeaturesFound = new ArrayList<Boolean>(tmpFeaturesFound); + if (allFeatsIn) { + for (int i = 0; i <= tokCnt; i++) { + boolean allFeatsUnified = true; + for (Map.Entry<String, List<String>> feat : uFeatures.entrySet()) { + boolean featUnified = false; + types = feat.getValue(); + if (types == null || types.isEmpty()) { + types = equivalenceFeatures.get(feat.getKey()); + } + for (final String typename : types) { + if (featuresFound.get(i) + && 
equivalencesMatched.get(i).containsKey(feat.getKey()) + && equivalencesMatched.get(i).get(feat.getKey()).contains(typename)) { + final Element testElem = equivalenceTypes + .get(new EquivalenceTypeLocator(feat.getKey(), typename)); + featUnified = featUnified || testElem.isMatched(aToken); + } + } + allFeatsUnified &= featUnified; + } + tokenFeaturesFound.set(i, allFeatsUnified); + anyFeatUnified = anyFeatUnified || allFeatsUnified; + } + unifiedNext &= anyFeatUnified; + if (unifiedNext) { + if (tokSequence.size() == readingsCounter) { + tokSequence.add(new AnalyzedTokenReadings(aToken, 0)); + } else { + tokSequence.get(readingsCounter).addReading(aToken); + } + tmpFeaturesFound = tokenFeaturesFound; + } + } + return unifiedNext; + } + + /** + * Call after every complete token (AnalyzedTokenReadings) checked. + */ + public final void startNextToken() { + featuresFound = new ArrayList<Boolean>(tmpFeaturesFound); + readingsCounter++; + } + + /** + * Starts testing only those equivalences that were previously matched. + */ + public final void startUnify() { + allFeatsIn = true; + for (int i = 0; i <= tokCnt; i++) { + featuresFound.add(true); + } + tmpFeaturesFound = new ArrayList<Boolean>(featuresFound); + } + + public final void setNegation(final boolean neg) { + negation = neg; + } + + public final boolean getNegation() { + return negation; + } + + /** + * Resets after use of unification. Required. + */ + public final void reset() { + equivalencesMatched.clear(); + allFeatsIn = false; + negation = false; + tokCnt = -1; + featuresFound.clear(); + tmpFeaturesFound.clear(); + tokSequence.clear(); + readingsCounter = 1; + firstUnified = false; + uniMatched = false; + uniAllMatched = false; + inUnification = false; + } + + /** + * Gets a full sequence of filtered tokens. + * + * @return Array of AnalyzedTokenReadings that match equivalence relation + * defined for features tested. 
+ */ + public final AnalyzedTokenReadings[] getUnifiedTokens() { + if (tokSequence.isEmpty()) { + return null; + } + if (!firstUnified) { + AnalyzedTokenReadings tmpATR; + int first = 0; + tmpFeaturesFound.add(true); // Bentley's search idea + while (!tmpFeaturesFound.get(first)) { + first++; + } + tmpFeaturesFound.remove(tmpFeaturesFound.size() - 1); + if (first >= tmpFeaturesFound.size()) { + return null; + } + // FIXME: why this happens?? + final int numRead = tokSequence.get(0).getReadingsLength(); + if (first < numRead) { + tmpATR = new AnalyzedTokenReadings(tokSequence.get(0).getAnalyzedToken( + first), 0); + for (int i = first + 1; i <= Math.min(numRead - 1, tokCnt); i++) { + if (tmpFeaturesFound.get(i)) { + tmpATR.addReading(tokSequence.get(0).getAnalyzedToken(i)); + } + } + tokSequence.set(0, tmpATR); + } + firstUnified = true; + } + final AnalyzedTokenReadings[] atr = tokSequence + .toArray(new AnalyzedTokenReadings[tokSequence.size()]); + return atr; + } + + /** + * Tests if the token sequence is unified. + * + * @param matchToken + * AnalyzedToken token to unify + * @param feature + * String: feature to unify over + * @param type + * String: value types of the feature + * @param isUniNegated + * if true, then return negated result + * @param lastReading + * true when the matchToken is the last reading in the + * AnalyzedReadings + * @return True if the tokens in the sequence are unified. 
+ */ + public final boolean isUnified(final AnalyzedToken matchToken, + final Map<String, List<String>> uFeatures, final boolean isUniNegated, + final boolean lastReading) { + if (inUnification) { + uniMatched |= isSatisfied(matchToken, uFeatures); + uniAllMatched = uniMatched; + if (lastReading) { + startNextToken(); + unifiedTokens = getUnifiedTokens(); + uniMatched = false; + } + return uniAllMatched; + } + if (isUniNegated) { + setNegation(true); + } + isSatisfied(matchToken, uFeatures); + if (lastReading) { + inUnification = true; + uniMatched = false; + startUnify(); + } + return true; + } + + /** + * Used for getting a unified sequence in case when simple test method + * {@link #isUnified} was used. + * + * @return An array of {@link AnalyzedTokenReadings} + */ + public final AnalyzedTokenReadings[] getFinalUnified() { + if (inUnification) { + return unifiedTokens; + } + return null; + } +} + +class EquivalenceTypeLocator { + + @Override + public int hashCode() { + final int prime = 31; + int result = 1; + result = prime * result + ((feature == null) ? 0 : feature.hashCode()); + result = prime * result + ((type == null) ? 0 : type.hashCode()); + return result; + } + + @Override + public boolean equals(final Object obj) { + if (this == obj) { + return true; + } + if (obj == null) { + return false; + } + if (getClass() != obj.getClass()) { + return false; + } + final EquivalenceTypeLocator other = (EquivalenceTypeLocator) obj; + if (feature == null) { + if (other.feature != null) { + return false; + } + } else if (!feature.equals(other.feature)) { + return false; + } + if (type == null) { + if (other.type != null) { + return false; + } + } else if (!type.equals(other.type)) { + return false; + } + return true; + } + + private final String feature; + private final String type; + + EquivalenceTypeLocator(final String feature, final String type) { + this.feature = feature; + this.type = type; + } + +}
\ No newline at end of file diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/XMLRuleHandler.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/XMLRuleHandler.java new file mode 100644 index 0000000..72a852a --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/XMLRuleHandler.java @@ -0,0 +1,568 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2006 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.rules.patterns; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import org.xml.sax.Attributes; +import org.xml.sax.Locator; +import org.xml.sax.SAXException; +import org.xml.sax.SAXParseException; +import org.xml.sax.helpers.DefaultHandler; + +import de.danielnaber.languagetool.Language; +import de.danielnaber.languagetool.rules.IncorrectExample; +import de.danielnaber.languagetool.tools.StringTools; + +/** + * XML rule handler that loads rules from XML and throws + * exceptions on errors and warnings. 
+ * + * @author Daniel Naber + */ +public class XMLRuleHandler extends DefaultHandler { + + public XMLRuleHandler() { + elementList = new ArrayList<Element>(); + equivalenceFeatures = new HashMap<String, List<String>>(); + uTypeList = new ArrayList<String>(); + } + + List<PatternRule> rules = new ArrayList<PatternRule>(); + + protected Language language; + + protected StringBuilder correctExample = new StringBuilder(); + protected StringBuilder incorrectExample = new StringBuilder(); + protected StringBuilder exampleCorrection = new StringBuilder(); + protected StringBuilder message = new StringBuilder(); + protected StringBuilder match = new StringBuilder(); + protected StringBuilder elements; + protected StringBuilder exceptions; + + List<String> correctExamples = new ArrayList<String>(); + List<IncorrectExample> incorrectExamples = new ArrayList<IncorrectExample>(); + + protected boolean inPattern; + protected boolean inCorrectExample; + protected boolean inIncorrectExample; + protected boolean inMessage; + protected boolean inSuggestion; + protected boolean inMatch; + protected boolean inRuleGroup; + protected boolean inToken; + protected boolean inException; + protected boolean inPhrases; + protected boolean inAndGroup; + + protected boolean tokenSpaceBefore; + protected boolean tokenSpaceBeforeSet; + protected String posToken; + protected boolean posNegation; + protected boolean posRegExp; + + protected boolean caseSensitive; + protected boolean regExpression; + protected boolean tokenNegated; + protected boolean tokenInflected; + + protected String exceptionPosToken; + protected boolean exceptionStringRegExp; + protected boolean exceptionStringNegation; + protected boolean exceptionStringInflected; + protected boolean exceptionPosNegation; + protected boolean exceptionPosRegExp; + protected boolean exceptionValidNext; + protected boolean exceptionValidPrev; + protected boolean exceptionSet; + protected boolean exceptionSpaceBefore; + protected boolean 
exceptionSpaceBeforeSet; + + /** List of elements as specified by tokens. **/ + protected List<Element> elementList; + + /** true when phraseref is the last element in the rule. **/ + protected boolean lastPhrase; + + /** ID reference to the phrase. **/ + protected String phraseIdRef; + + /** Current phrase ID. **/ + protected String phraseId; + + protected int skipPos; + + protected String ruleGroupId; + + protected String id; + + protected Element tokenElement; + + protected Match tokenReference; + + protected List<Match> suggestionMatches; + + protected Locator pLocator; + + protected int startPositionCorrection; + protected int endPositionCorrection; + protected int tokenCounter; + + /** Phrase store - elementLists keyed by phraseIds. **/ + protected Map<String, List<List<Element>>> phraseMap; + + /** + * Logically forking element list, used for including multiple phrases in the + * current one. + **/ + protected List<ArrayList<Element>> phraseElementList; + + protected int andGroupCounter; + + protected StringBuilder shortMessage = new StringBuilder(); + protected boolean inShortMessage; + + protected boolean inUnification; + protected boolean inUnificationDef; + protected boolean uniNegation; + + protected String uFeature; + protected String uType = ""; + + protected List<String> uTypeList; + + protected Map<String, List<String>> equivalenceFeatures; + + + /** Definitions of values in XML files. 
*/ + protected static final String YES = "yes"; + protected static final String POSTAG = "postag"; + protected static final String POSTAG_REGEXP = "postag_regexp"; + protected static final String REGEXP = "regexp"; + protected static final String NEGATE = "negate"; + protected static final String INFLECTED = "inflected"; + protected static final String NEGATE_POS = "negate_pos"; + protected static final String MARKER = "marker"; + protected static final String DEFAULT = "default"; + protected static final String TYPE = "type"; + protected static final String SPACEBEFORE = "spacebefore"; + protected static final String EXAMPLE = "example"; + protected static final String SCOPE = "scope"; + protected static final String IGNORE = "ignore"; + protected static final String SKIP = "skip"; + protected static final String TOKEN = "token"; + protected static final String FEATURE = "feature"; + protected static final String UNIFY = "unify"; + protected static final String AND = "and"; + protected static final String EXCEPTION = "exception"; + protected static final String CASE_SENSITIVE = "case_sensitive"; + protected static final String PATTERN = "pattern"; + protected static final String MATCH = "match"; + protected static final String UNIFICATION = "unification"; + protected static final String RULEGROUP = "rulegroup"; + protected static final String NO = "no"; + protected static final String MARK_TO = "mark_to"; + protected static final String MARK_FROM = "mark_from"; + protected static final String PHRASES = "phrases"; + protected static final String MESSAGE = "message"; + + + public List<PatternRule> getRules() { + return rules; + } + + public void warning (final SAXParseException e) throws SAXException { + throw e; + } + + public void error (final SAXParseException e) throws SAXException { + throw e; + } + + @Override + public void setDocumentLocator(final Locator locator) { + pLocator = locator; + super.setDocumentLocator(locator); + } + + protected void resetToken() 
{ + posNegation = false; + posRegExp = false; + inToken = false; + tokenSpaceBefore = false; + tokenSpaceBeforeSet = false; + + resetException(); + exceptionSet = false; + tokenReference = null; + } + + protected void resetException() { + exceptionStringNegation = false; + exceptionStringInflected = false; + exceptionPosNegation = false; + exceptionPosRegExp = false; + exceptionStringRegExp = false; + exceptionValidNext = false; + exceptionValidPrev = false; + exceptionSpaceBefore = false; + exceptionSpaceBeforeSet = false; + } + + protected void phraseElementInit() { + // lazy init + if (phraseElementList == null) { + phraseElementList = new ArrayList<ArrayList<Element>>(); + } + } + protected void preparePhrase(final Attributes attrs) { + phraseIdRef = attrs.getValue("idref"); + if (phraseMap.containsKey(phraseIdRef)) { + for (final List<Element> curPhrEl : phraseMap.get(phraseIdRef)) { + for (final Element e : curPhrEl) { + e.setPhraseName(phraseIdRef); + } + if (elementList.isEmpty()) { + phraseElementList.add(new ArrayList<Element>(curPhrEl)); + } else { + final ArrayList<Element> prevList = new ArrayList<Element>( + elementList); + prevList.addAll(curPhrEl); + phraseElementList.add(new ArrayList<Element>(prevList)); + prevList.clear(); + } + } + lastPhrase = true; + } + } + + protected void finalizePhrase() { + // lazy init + if (phraseMap == null) { + phraseMap = new HashMap<String, List<List<Element>>>(); + } + phraseElementInit(); + if (phraseElementList.isEmpty()) { + phraseElementList.add(new ArrayList<Element>(elementList)); + } else { + for (final ArrayList<Element> ph : phraseElementList) { + ph.addAll(new ArrayList<Element>(elementList)); + } + } + + phraseMap.put(phraseId, new ArrayList<List<Element>>(phraseElementList)); + elementList.clear(); + + phraseElementList.clear(); + } + + protected void startPattern(final Attributes attrs) throws SAXException { + inPattern = true; + if (attrs.getValue(MARK_FROM) != null) { + startPositionCorrection = 
Integer.parseInt(attrs.getValue(MARK_FROM)); + } + if (attrs.getValue(MARK_TO) != null) { + endPositionCorrection = Integer.parseInt(attrs.getValue(MARK_TO)); + if (endPositionCorrection > 0) { + throw new SAXException("End position correction (mark_to="+ endPositionCorrection + + ") cannot be larger than 0: " + "\n Line: " + + pLocator.getLineNumber() + ", column: " + + pLocator.getColumnNumber() + "."); + } + } + caseSensitive = YES.equals(attrs.getValue(CASE_SENSITIVE)); + } + + + /** + * Calculates the offset of the match reference (if any) in case the match + * element has been used in the group. + * + * @param elList + * Element list where the match element was used. It is directly changed. + */ + protected void processElement(final List<Element> elList) { + int counter = 0; + for (final Element elTest : elList) { + if (elTest.getPhraseName() != null && counter > 0) { + if (elTest.isReferenceElement()) { + final int tokRef = elTest.getMatch().getTokenRef(); + elTest.getMatch().setTokenRef(tokRef + counter - 1); + final String offsetToken = elTest.getString().replace("\\" + tokRef, + "\\" + (tokRef + counter - 1)); + elTest.setStringElement(offsetToken); + } + } + counter++; + } + } + + protected void setMatchElement(final Attributes attrs) throws SAXException { + inMatch = true; + match = new StringBuilder(); + Match.CaseConversion caseConversion = Match.CaseConversion.NONE; + if (attrs.getValue("case_conversion") != null) { + caseConversion = Match.CaseConversion.toCase(attrs + .getValue("case_conversion").toUpperCase()); + } + Match.IncludeRange includeRange = Match.IncludeRange.NONE; + if (attrs.getValue("include_skipped") != null) { + includeRange = Match.IncludeRange.toRange(attrs + .getValue("include_skipped").toUpperCase()); + } + final Match mWorker = new Match(attrs.getValue(POSTAG), attrs + .getValue("postag_replace"), YES + .equals(attrs.getValue(POSTAG_REGEXP)), attrs + .getValue("regexp_match"), attrs.getValue("regexp_replace"), + caseConversion, 
YES.equals(attrs.getValue("setpos")), + includeRange); + mWorker.setInMessageOnly(!inSuggestion); + if (inMessage) { + if (suggestionMatches == null) { + suggestionMatches = new ArrayList<Match>(); + } + suggestionMatches.add(mWorker); + //add incorrect XML character for simplicity + message.append("\u0001\\"); + message.append(attrs.getValue("no")); + if (StringTools.isEmpty(attrs.getValue("no"))) { + throw new SAXException("References cannot be empty: " + "\n Line: " + + pLocator.getLineNumber() + ", column: " + + pLocator.getColumnNumber() + "."); + } else if (Integer.parseInt(attrs.getValue("no")) < 1) { + throw new SAXException("References must be larger than 0: " + + attrs.getValue("no") + "\n Line: " + pLocator.getLineNumber() + + ", column: " + pLocator.getColumnNumber() + "."); + } + } else if (inToken && attrs.getValue("no") != null) { + final int refNumber = Integer.parseInt(attrs.getValue("no")); + if (refNumber > elementList.size()) { + throw new SAXException( + "Only backward references in match elements are possible, tried to specify token " + + refNumber + + "\n Line: " + + pLocator.getLineNumber() + + ", column: " + pLocator.getColumnNumber() + "."); + } + mWorker.setTokenRef(refNumber); + tokenReference = mWorker; + elements.append('\\'); + elements.append(refNumber); + } + } + + protected void setExceptions(final Attributes attrs) { + inException = true; + exceptions = new StringBuilder(); + resetException(); + + exceptionStringNegation = YES.equals(attrs.getValue(NEGATE)); + exceptionValidNext = "next".equals(attrs.getValue(SCOPE)); + exceptionValidPrev = "previous".equals(attrs.getValue(SCOPE)); + exceptionStringInflected = YES.equals(attrs.getValue(INFLECTED)); + + if (attrs.getValue(POSTAG) != null) { + exceptionPosToken = attrs.getValue(POSTAG); + exceptionPosRegExp = YES.equals(attrs.getValue(POSTAG_REGEXP)); + exceptionPosNegation = YES.equals(attrs.getValue(NEGATE_POS)); + } + exceptionStringRegExp = YES.equals(attrs.getValue(REGEXP)); + 
if (attrs.getValue(SPACEBEFORE) != null) { + exceptionSpaceBefore = YES.equals(attrs.getValue(SPACEBEFORE)); + exceptionSpaceBeforeSet = !"ignore".equals(attrs.getValue(SPACEBEFORE)); + } + } + + protected void finalizeExceptions() { + inException = false; + if (!exceptionSet) { + tokenElement = new Element(StringTools.trimWhitespace(elements + .toString()), caseSensitive, regExpression, tokenInflected); + exceptionSet = true; + } + tokenElement.setNegation(tokenNegated); + if (!StringTools.isEmpty(exceptions.toString())) { + tokenElement.setStringException(StringTools.trimWhitespace(exceptions + .toString()), exceptionStringRegExp, exceptionStringInflected, + exceptionStringNegation, exceptionValidNext, exceptionValidPrev); + } + if (exceptionPosToken != null) { + tokenElement.setPosException(exceptionPosToken, exceptionPosRegExp, + exceptionPosNegation, exceptionValidNext, exceptionValidPrev); + exceptionPosToken = null; + } + if (exceptionSpaceBeforeSet) { + tokenElement.setExceptionSpaceBefore(exceptionSpaceBefore); + } + resetException(); + } + + protected void setToken(final Attributes attrs) { + inToken = true; + + if (lastPhrase) { + elementList.clear(); + } + + lastPhrase = false; + tokenNegated = YES.equals(attrs.getValue(NEGATE)); + tokenInflected = YES.equals(attrs.getValue(INFLECTED)); + if (attrs.getValue("skip") != null) { + skipPos = Integer.parseInt(attrs.getValue("skip")); + } + elements = new StringBuilder(); + // POSElement creation + if (attrs.getValue(POSTAG) != null) { + posToken = attrs.getValue(POSTAG); + posRegExp = YES.equals(attrs.getValue(POSTAG_REGEXP)); + posNegation = YES.equals(attrs.getValue(NEGATE_POS)); + } + regExpression = YES.equals(attrs.getValue(REGEXP)); + + if (attrs.getValue(SPACEBEFORE) != null) { + tokenSpaceBefore = YES.equals(attrs.getValue(SPACEBEFORE)); + tokenSpaceBeforeSet = !"ignore".equals(attrs.getValue(SPACEBEFORE)); + } + + if (!inAndGroup) { + tokenCounter++; + } + } + + protected void checkPositions(final 
int add) throws SAXException { + if (startPositionCorrection >= tokenCounter + add) { + throw new SAXException( + "Attempt to mark a token no. ("+ startPositionCorrection +") that is outside the pattern (" + + tokenCounter + "). Pattern elements are numbered starting from 0!" + "\n Line: " + + pLocator.getLineNumber() + ", column: " + + pLocator.getColumnNumber() + "."); + } + if (tokenCounter +add - endPositionCorrection < 0) { + throw new SAXException( + "Attempt to mark a token no. ("+ endPositionCorrection +") that is outside the pattern (" + + tokenCounter + " elements). End positions should be negative but not larger than the token count!" + + "\n Line: " + + pLocator.getLineNumber() + ", column: " + + pLocator.getColumnNumber() + "."); + } + } + + protected void checkMarkPositions() { + if (phraseElementList == null || phraseElementList.size() == 0) { + final int endMarker = elementList.size() + endPositionCorrection; + if (endMarker <= startPositionCorrection) { + throw new RuntimeException("Invalid combination of mark_from (" + startPositionCorrection + + ") and mark_to (" + endPositionCorrection + ") for rule " + id + + " with " + elementList.size() + + " tokens: the error position created by mark_from and mark_to is less than one token"); + } + } + } + + /** + * Adds Match objects for all references to tokens + * (including '\1' and the like). 
+ */ + protected List<Match> addLegacyMatches() { + if (suggestionMatches == null || suggestionMatches.isEmpty()) { + return null; + } + final List<Match> sugMatch = new ArrayList<Match>(); + final String messageStr = message.toString(); + int pos = 0; + int ind = 0; + int matchCounter = 0; + while (pos != -1) { + pos = messageStr.indexOf('\\', ind + 1); + if (pos != -1 && messageStr.length() > pos) { + if (Character.isDigit(messageStr.charAt(pos + 1))) { + if (pos == 1 || messageStr.charAt(pos - 1) != '\u0001') { + final Match mWorker = new Match(null, null, false, null, + null, Match.CaseConversion.NONE, false, Match.IncludeRange.NONE); + mWorker.setInMessageOnly(true); + sugMatch.add(mWorker); + } else if (messageStr.charAt(pos - 1) == '\u0001') { // real suggestion marker + sugMatch.add(suggestionMatches.get(matchCounter)); + message.deleteCharAt(pos - 1 - matchCounter); + matchCounter++; + } + } + } + ind = pos; + } + if (sugMatch.isEmpty()) { + return suggestionMatches; + } + return sugMatch; + } + + protected void finalizeTokens() { + if (!exceptionSet || tokenElement == null) { + tokenElement = new Element(StringTools.trimWhitespace(elements + .toString()), caseSensitive, regExpression, tokenInflected); + tokenElement.setNegation(tokenNegated); + } else { + tokenElement.setStringElement(StringTools.trimWhitespace(elements + .toString())); + } + + if (skipPos != 0) { + tokenElement.setSkipNext(skipPos); + skipPos = 0; + } + if (posToken != null) { + tokenElement.setPosElement(posToken, posRegExp, posNegation); + posToken = null; + } + + if (tokenReference != null) { + tokenElement.setMatch(tokenReference); + } + + if (inAndGroup && andGroupCounter > 0) { + elementList.get(elementList.size() - 1) + .setAndGroupElement(tokenElement); + } else { + elementList.add(tokenElement); + } + if (inAndGroup) { + andGroupCounter++; + } + + if (inUnification) { + tokenElement.setUnification(equivalenceFeatures); + if (uniNegation) { + tokenElement.setUniNegation(); + } + 
} + + if (inUnificationDef) { + language.getUnifier().setEquivalence(uFeature, uType, tokenElement); + elementList.clear(); + } + if (tokenSpaceBeforeSet) { + tokenElement.setWhitespaceBefore(tokenSpaceBefore); + } + resetToken(); + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/bitext/BitextPatternRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/bitext/BitextPatternRule.java new file mode 100644 index 0000000..1d42a17 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/bitext/BitextPatternRule.java @@ -0,0 +1,93 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2010 Marcin Miłkowski (www.languagetool.org) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ + +package de.danielnaber.languagetool.rules.patterns.bitext; + +import java.io.IOException; + +import de.danielnaber.languagetool.AnalyzedSentence; +import de.danielnaber.languagetool.rules.Rule; +import de.danielnaber.languagetool.rules.RuleMatch; +import de.danielnaber.languagetool.rules.bitext.BitextRule; +import de.danielnaber.languagetool.rules.patterns.PatternRule; + +/** + * A bitext pattern rule class. 
A BitextPatternRule describes a language error and + * can test whether a given pre-analyzed pair of source and target text + * contains that error using the {@link Rule#match} method. It uses the syntax + * of XML files similar to normal PatternRules. + * + * @author Marcin Miłkowski + */ +public class BitextPatternRule extends BitextRule { + + private final PatternRule srcRule; + private final PatternRule trgRule; + + BitextPatternRule(final PatternRule src, final PatternRule trg) { + srcRule = src; + trgRule = trg; + } + + public PatternRule getSrcRule() { + return srcRule; + } + + public PatternRule getTrgRule() { + return trgRule; + } + + @Override + public String getDescription() { + return srcRule.getDescription(); + } + + public String getMessage() { + return trgRule.getMessage(); + } + + @Override + public String getId() { + return srcRule.getId(); + } + + /** + * This method always returns an empty array. + */ + @Override + public RuleMatch[] match(AnalyzedSentence text) throws IOException { + return new RuleMatch[0]; + } + + @Override + public RuleMatch[] match(AnalyzedSentence sourceText, + AnalyzedSentence targetText) throws IOException { + if (srcRule.match(sourceText).length > 0) { + return trgRule.match(targetText); + } + return new RuleMatch[0]; + } + + @Override + public void reset() { + // TODO Auto-generated method stub + + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/bitext/BitextPatternRuleLoader.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/bitext/BitextPatternRuleLoader.java new file mode 100644 index 0000000..508f381 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/bitext/BitextPatternRuleLoader.java @@ -0,0 +1,413 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the 
GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.rules.patterns.bitext; + +import java.io.IOException; +import java.io.InputStream; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; + +import javax.xml.parsers.SAXParser; +import javax.xml.parsers.SAXParserFactory; + +import org.xml.sax.Attributes; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.DefaultHandler; + +import de.danielnaber.languagetool.Language; +import de.danielnaber.languagetool.bitext.StringPair; +import de.danielnaber.languagetool.rules.Category; +import de.danielnaber.languagetool.rules.IncorrectExample; +import de.danielnaber.languagetool.rules.bitext.IncorrectBitextExample; +import de.danielnaber.languagetool.rules.patterns.Element; +import de.danielnaber.languagetool.rules.patterns.Match; +import de.danielnaber.languagetool.rules.patterns.PatternRule; + +/** + * Loads {@link PatternRule}s from an XML file. 
+ * + * @author Marcin Miłkowski + */ +public class BitextPatternRuleLoader extends DefaultHandler { + + public final List<BitextPatternRule> getRules(final InputStream is, + final String filename) throws IOException { + final List<BitextPatternRule> rules; + try { + final PatternRuleHandler handler = new PatternRuleHandler(); + final SAXParserFactory factory = SAXParserFactory.newInstance(); + final SAXParser saxParser = factory.newSAXParser(); + /* saxParser.getXMLReader().setFeature( + "http://apache.org/xml/features/nonvalidating/load-external-dtd", + false); + */ + saxParser.parse(is, handler); + rules = handler.getBitextRules(); + return rules; + } catch (final Exception e) { + final IOException ioe = new IOException("Cannot load or parse '" + + filename + "'"); + ioe.initCause(e); + throw ioe; + } + } + +} + +class PatternRuleHandler extends BitextXMLRuleHandler { + + private int subId; + + private boolean defaultOff; + private boolean defaultOn; + + private Category category; + private String description; + private String ruleGroupDescription; + + private PatternRule srcRule; + private PatternRule trgRule; + + private IncorrectExample trgExample; + private IncorrectExample srcExample; + + private Language srcLang; + + // =========================================================== + // SAX DocumentHandler methods + // =========================================================== + + @Override + public void startElement(final String namespaceURI, final String lName, + final String qName, final Attributes attrs) throws SAXException { + if (qName.equals("category")) { + final String catName = attrs.getValue("name"); + final String priorityStr = attrs.getValue("priority"); + // int prio = 0; + if (priorityStr != null) { + category = new Category(catName, Integer.parseInt(priorityStr)); + } else { + category = new Category(catName); + } + + if ("off".equals(attrs.getValue(DEFAULT))) { + category.setDefaultOff(); + } + + } else if (qName.equals("rules")) { + final 
String languageStr = attrs.getValue("targetLang"); + language = Language.getLanguageForShortName(languageStr); + if (language == null) { + throw new SAXException("Unknown language '" + languageStr + "'"); + } + } else if (qName.equals("rule")) { + id = attrs.getValue("id"); + if (inRuleGroup) + subId++; + if (!(inRuleGroup && defaultOff)) { + defaultOff = "off".equals(attrs.getValue(DEFAULT)); + } + + if (!(inRuleGroup && defaultOn)) { + defaultOn = "on".equals(attrs.getValue(DEFAULT)); + } + if (inRuleGroup && id == null) { + id = ruleGroupId; + } + description = attrs.getValue("name"); + if (inRuleGroup && description == null) { + description = ruleGroupDescription; + } + correctExamples = new ArrayList<StringPair>(); + incorrectExamples = new ArrayList<IncorrectBitextExample>(); + if (suggestionMatches != null) { + suggestionMatches.clear(); + } + } else if (PATTERN.equals(qName) || "target".equals(qName)) { + startPattern(attrs); + } else if (AND.equals(qName)) { + inAndGroup = true; + } else if (UNIFY.equals(qName)) { + inUnification = true; + uniNegation = YES.equals(attrs.getValue(NEGATE)); + } else if (qName.equals("feature")) { + uFeature = attrs.getValue("id"); + } else if (qName.equals(TYPE)) { + uType = attrs.getValue("id"); + uTypeList.add(uType); + } else if (qName.equals(TOKEN)) { + setToken(attrs); + } else if (qName.equals(EXCEPTION)) { + setExceptions(attrs); + } else if (qName.equals(EXAMPLE) + && attrs.getValue(TYPE).equals("correct")) { + inCorrectExample = true; + correctExample = new StringBuilder(); + } else if (EXAMPLE.equals(qName) + && attrs.getValue(TYPE).equals("incorrect")) { + inIncorrectExample = true; + incorrectExample = new StringBuilder(); + exampleCorrection = new StringBuilder(); + if (attrs.getValue("correction") != null) { + exampleCorrection.append(attrs.getValue("correction")); + } + } else if (MESSAGE.equals(qName)) { + inMessage = true; + message = new StringBuilder(); + } else if (qName.equals("short")) { + 
/**
 * SAX callback for a closing tag. Dispatches on {@code qName} and updates the
 * parser state accordingly; the branches are tested in order, so the first
 * matching condition wins.
 *
 * @param namespaceURI not read by this method
 * @param sName not read by this method
 * @param qName qualified name of the element being closed
 * @throws SAXException declared by the handler contract
 */
@Override
public void endElement(final String namespaceURI, final String sName,
    final String qName) throws SAXException {

  if (qName.equals("source")) {
    // end of the source side: validate marker positions, then build the source rule
    checkMarkPositions();
    srcRule = finalizeRule();
  } else if ("target".equals(qName)) {
    // end of the target side: same steps, result kept as the target rule
    checkMarkPositions();
    trgRule = finalizeRule();
  } else if ("rule".equals(qName)) {
    // the message and the suggestion matches are attached to the target rule only
    trgRule.setMessage(message.toString());
    if (suggestionMatches != null) {
      for (final Match m : suggestionMatches) {
        trgRule.addSuggestionMatch(m);
      }
      // NOTE(review): phraseElementList is dereferenced here without a null check,
      // although the PATTERN branch below guards against null - confirm it is
      // always initialised at this point
      if (phraseElementList.size() <= 1) {
        suggestionMatches.clear();
      }
    }
    // pair both sides into one bitext rule and register it
    final BitextPatternRule bRule = new BitextPatternRule(srcRule, trgRule);
    bRule.setCorrectBitextExamples(correctExamples);
    bRule.setIncorrectBitextExamples(incorrectExamples);
    bRule.setSourceLang(srcLang);
    rules.add(bRule);
  } else if (qName.equals(EXCEPTION)) {
    finalizeExceptions();
  } else if (qName.equals(AND)) {
    // leaving an and-group: reset its counter and advance the token counter once
    inAndGroup = false;
    andGroupCounter = 0;
    tokenCounter++;
  } else if (qName.equals(TOKEN)) {
    finalizeTokens();
  } else if (qName.equals(PATTERN)) {
    inPattern = false;
    if (lastPhrase) {
      elementList.clear();
    }
    if (phraseElementList == null || phraseElementList.isEmpty()) {
      checkPositions(0);
    } else {
      // one position check per phrase variant, using that variant's element count
      for (List<Element> elements : phraseElementList) {
        checkPositions(elements.size());
      }
    }
    tokenCounter = 0;
  } else if (qName.equals("trgExample")) {
    trgExample = setExample();
  } else if (qName.equals("srcExample")) {
    srcExample = setExample();
  } else if (qName.equals("example")) {
    // combine the previously captured source/target examples into one bitext example
    if (inCorrectExample) {
      correctExamples.add(new StringPair(srcExample.getExample(), trgExample.getExample()));
    } else if (inIncorrectExample) {
      if (trgExample.getCorrections() == null) {
        incorrectExamples.add(
            new IncorrectBitextExample(
                new StringPair(
                    srcExample.getExample(), trgExample.getExample())
                ));
      } else {
        // corrections are passed on as a String array
        List<String> l = trgExample.getCorrections();
        String str [] = l.toArray (new String [l.size ()]);
        incorrectExamples.add(
            new IncorrectBitextExample(
                new StringPair(srcExample.getExample(),
                    trgExample.getExample()), str)
            );
      }
    }
    inCorrectExample = false;
    inIncorrectExample = false;
  } else if (qName.equals("message")) {
    suggestionMatches = addLegacyMatches();
    inMessage = false;
  } else if (qName.equals("short")) {
    inShortMessage = false;
  } else if (qName.equals("match")) {
    // the text collected for <match> becomes the lemma string of either the
    // most recent suggestion match (inside a message) or the token reference
    if (inMessage) {
      suggestionMatches.get(suggestionMatches.size() - 1).setLemmaString(
          match.toString());
    } else if (inToken) {
      tokenReference.setLemmaString(match.toString());
    }
    inMatch = false;
  } else if (qName.equals("rulegroup")) {
    inRuleGroup = false;
  } else if (qName.equals("suggestion") && inMessage) {
    // the closing tag is copied into the message text
    message.append("</suggestion>");
    inSuggestion = false;
  } else if (qName.equals(MARKER) && inCorrectExample) {
    correctExample.append("</marker>");
  } else if (qName.equals(MARKER) && inIncorrectExample) {
    incorrectExample.append("</marker>");
  } else if (qName.equals("phrase") && inPhrases) {
    finalizePhrase();
  } else if (qName.equals("includephrases")) {
    elementList.clear();
  } else if (qName.equals("phrases") && inPhrases) {
    inPhrases = false;
  } else if (qName.equals("unification")) {
    inUnificationDef = false;
  } else if (qName.equals("feature")) {
    // store the types collected for this feature and start a fresh type list
    equivalenceFeatures.put(uFeature, uTypeList);
    uTypeList = new ArrayList<String>();
  } else if (qName.equals("unify")) {
    inUnification = false;
    //clear the features...
    equivalenceFeatures = new HashMap<String, List<String>>();
  }
}

/**
 * Wraps the example text collected so far into an {@link IncorrectExample}
 * (used for correct examples as well) and resets the three example buffers.
 *
 * @return the example for the current side, or {@code null} when neither
 *         {@code inCorrectExample} nor {@code inIncorrectExample} is set
 */
private IncorrectExample setExample() {
  IncorrectExample example = null;
  if (inCorrectExample) {
    example = new IncorrectExample(correctExample.toString());
  } else if (inIncorrectExample) {
    // the correction attribute may list several alternatives separated by '|'
    final String[] corrections = exampleCorrection.toString().split("\\|");
    if (corrections.length > 0 && corrections[0].length() > 0) {
      example = new IncorrectExample(incorrectExample.toString(),
          corrections);
    } else {
      example = new IncorrectExample(incorrectExample.toString());
    }
  }
  correctExample = new StringBuilder();
  incorrectExample = new StringBuilder();
  exampleCorrection = new StringBuilder();
  return example;
}

/**
 * Builds a {@link PatternRule} from the elements collected for the current
 * source or target pattern, then clears the element lists and the position
 * corrections.
 *
 * @return the rule that was built; when several phrase variants exist, a rule
 *         is built and prepared for each one but only the last is returned
 */
private PatternRule finalizeRule() {
  PatternRule rule = null;
  phraseElementInit();
  if (phraseElementList.isEmpty()) {
    // no phrases: a single rule over elementList, created with an empty message
    rule = new PatternRule(id, language, elementList,
        description, "", shortMessage.toString());
    prepareRule(rule);
  } else {
    if (!elementList.isEmpty()) {
      // append a copy of the plain elements to every phrase variant
      for (final ArrayList<Element> ph : phraseElementList) {
        ph.addAll(new ArrayList<Element>(elementList));
      }
    }

    for (final ArrayList<Element> phraseElement : phraseElementList) {
      processElement(phraseElement);
      // 'rule' is overwritten on every iteration
      rule = new PatternRule(id, language, phraseElement,
          description, message.toString(), shortMessage.toString(),
          phraseElementList.size() > 1);
      prepareRule(rule);
    }
  }
  elementList.clear();
  // NOTE(review): this null check comes after phraseElementList was already
  // dereferenced above - presumably phraseElementInit() guarantees non-null; confirm
  if (phraseElementList != null) {
    phraseElementList.clear();
  }
  startPositionCorrection = 0;
  endPositionCorrection = 0;
  return rule;
}

/**
 * Copies the handler's current settings (position corrections, category,
 * sub-id, default-off state) onto the given rule and resets the position
 * corrections and the {@code caseSensitive} flag.
 *
 * @param rule the freshly created rule to configure
 */
private void prepareRule(final PatternRule rule) {
  rule.setStartPositionCorrection(startPositionCorrection);
  rule.setEndPositionCorrection(endPositionCorrection);
  startPositionCorrection = 0;
  endPositionCorrection = 0;
  rule.setCategory(category);
  // rules inside a rule group use the running sub-id, stand-alone rules use "1"
  if (inRuleGroup)
    rule.setSubId(Integer.toString(subId));
  else
    rule.setSubId("1");
  caseSensitive = false;
  if (defaultOff) {
    rule.setDefaultOff();
  }

  // a default-off category switches the rule off unless it is explicitly default-on
  if (category.isDefaultOff() && !defaultOn) {
    rule.setDefaultOff();
  }

}

/**
 * SAX callback for character data. Appends the text to the buffer of the
 * innermost active context; the first flag that is set in the chain below
 * decides where the text goes.
 *
 * @param buf character buffer supplied by the parser
 * @param offset start of the text inside {@code buf}
 * @param len number of characters to read
 */
@Override
public void characters(final char[] buf, final int offset, final int len) {
  final String s = new String(buf, offset, len);
  if (inException) {
    exceptions.append(s);
  } else if (inToken) {
    elements.append(s);
  } else if (inCorrectExample) {
    correctExample.append(s);
  } else if (inIncorrectExample) {
    incorrectExample.append(s);
  } else if (inMatch) {
    match.append(s);
  } else if (inMessage) {
    message.append(s);
  } else if (inShortMessage) {
    shortMessage.append(s);
  }
}
the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.rules.patterns.bitext; + +import java.util.ArrayList; +import java.util.List; + +import org.xml.sax.SAXException; +import org.xml.sax.SAXParseException; + +import de.danielnaber.languagetool.bitext.StringPair; +import de.danielnaber.languagetool.rules.bitext.IncorrectBitextExample; +import de.danielnaber.languagetool.rules.patterns.XMLRuleHandler; + +/** + * XML rule handler that loads rules from XML and throws + * exceptions on errors and warnings. 
+ * + * @author Daniel Naber + */ +class BitextXMLRuleHandler extends XMLRuleHandler { + + List<BitextPatternRule> rules = new ArrayList<BitextPatternRule>(); + + List<StringPair> correctExamples = new ArrayList<StringPair>(); + List<IncorrectBitextExample> incorrectExamples = new ArrayList<IncorrectBitextExample>(); + + List<BitextPatternRule> getBitextRules() { + return rules; + } + + public void warning (final SAXParseException e) throws SAXException { + throw e; + } + + public void error (final SAXParseException e) throws SAXException { + throw e; + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/bitext/FalseFriendsAsBitextLoader.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/bitext/FalseFriendsAsBitextLoader.java new file mode 100644 index 0000000..87c30a5 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/bitext/FalseFriendsAsBitextLoader.java @@ -0,0 +1,72 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2010 Marcin Miłkowski (www.languagetool.org) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ + +package de.danielnaber.languagetool.rules.patterns.bitext; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; + +import javax.xml.parsers.ParserConfigurationException; + +import org.xml.sax.SAXException; + +import de.danielnaber.languagetool.Language; +import de.danielnaber.languagetool.rules.patterns.FalseFriendRuleLoader; +import de.danielnaber.languagetool.rules.patterns.PatternRule; + +/** + * Loads the false friend rules as bitext pattern rules. Note that the resulting + * rules have suggestions that are not really customizable, in contradistinction + * to the 'real' bitext pattern rules. + * + * @author Marcin Miłkowski + * + */ +public class FalseFriendsAsBitextLoader { + + public List<BitextPatternRule> getFalseFriendsAsBitext(final String filename, + final Language motherTongue, final Language language) throws ParserConfigurationException, SAXException, IOException { + final FalseFriendRuleLoader ruleLoader = new FalseFriendRuleLoader(); + List<BitextPatternRule> bRules = new ArrayList<BitextPatternRule>(); + List<PatternRule> rules1 = + ruleLoader.getRules(this.getClass().getResourceAsStream(filename), + motherTongue, language); + List<PatternRule> rules2 = + ruleLoader.getRules(this.getClass().getResourceAsStream(filename), + language, motherTongue); + HashMap<String, PatternRule> srcRules = new HashMap<String, PatternRule>(); + for (PatternRule rule : rules1) { + srcRules.put(rule.getId(), rule); + } + for (PatternRule rule : rules2) { + if (srcRules.containsKey(rule.getId())) { + BitextPatternRule bRule = new BitextPatternRule( + srcRules.get(rule.getId()), rule); + bRule.setSourceLang(motherTongue); + bRule.setCategory(rule.getCategory()); + bRules.add(bRule); + 
} + } + return bRules; + } + +} + diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/pl/CompoundRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/pl/CompoundRule.java new file mode 100644 index 0000000..6d2ff17 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/pl/CompoundRule.java @@ -0,0 +1,55 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2006 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.rules.pl; + +import java.io.IOException; +import java.util.ResourceBundle; + +import de.danielnaber.languagetool.JLanguageTool; +import de.danielnaber.languagetool.rules.AbstractCompoundRule; + +/** + * Checks that compounds (if in the list) are not written as separate words. 
+ * + * @author Marcin Miłkowski, based on code by Daniel Naber + */ + +public final class CompoundRule extends AbstractCompoundRule { + + private static final String FILE_NAME = "/pl/compounds.txt"; + + public CompoundRule(final ResourceBundle messages) throws IOException { + super(messages); + loadCompoundFile(JLanguageTool.getDataBroker().getFromResourceDirAsStream(FILE_NAME), "UTF-8"); + super.setShort("Brak łącznika lub zbędny łącznik"); + super.setMsg("Ten wyraz pisze się z łącznikiem.", + "Ten wyraz pisze się razem (bez spacji ani łącznika).", + "Ten wyraz pisze się z łącznikiem lub bez niego."); + } + + public final String getId() { + return "PL_COMPOUNDS"; + } + + public final String getDescription() { + return "Sprawdza wyrazy z łącznikiem, np. „łapu capu” zamiast „łapu-capu”"; + } + + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/pl/PolishRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/pl/PolishRule.java new file mode 100644 index 0000000..0a6f01b --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/pl/PolishRule.java @@ -0,0 +1,31 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
/**
 * Abstract base class for Polish rules. It declares no members of its own;
 * it only gives Polish-specific rules a common supertype below {@link Rule}.
 *
 * @author Marcin Miłkowski
 */
public abstract class PolishRule extends Rule {

}
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ + +package de.danielnaber.languagetool.rules.pl; + +import java.util.ResourceBundle; + +import de.danielnaber.languagetool.Language; +import de.danielnaber.languagetool.rules.GenericUnpairedBracketsRule; + +public class PolishUnpairedBracketsRule extends GenericUnpairedBracketsRule { + + private static final String[] PL_START_SYMBOLS = { "[", "(", "{", "„", "»", "\"" }; + private static final String[] PL_END_SYMBOLS = { "]", ")", "}", "”", "«", "\"" }; + + public PolishUnpairedBracketsRule(final ResourceBundle messages, + final Language language) { + super(messages, language); + startSymbols = PL_START_SYMBOLS; + endSymbols = PL_END_SYMBOLS; + } + + public String getId() { + return "PL_UNPAIRED_BRACKETS"; + } +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/pl/PolishWordRepeatRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/pl/PolishWordRepeatRule.java new file mode 100644 index 0000000..a7dbb5e --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/pl/PolishWordRepeatRule.java @@ -0,0 +1,200 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.rules.pl; + +import java.util.ArrayList; +import java.util.List; +import java.util.ResourceBundle; +import java.util.TreeSet; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import de.danielnaber.languagetool.AnalyzedSentence; +import de.danielnaber.languagetool.AnalyzedTokenReadings; +import de.danielnaber.languagetool.rules.Category; +import de.danielnaber.languagetool.rules.RuleMatch; +import de.danielnaber.languagetool.tools.StringTools; + +/** + * @author Marcin Miłkowski + * + * Rule for detecting same words in the sentence but not just in a row + * + */ +public class PolishWordRepeatRule extends PolishRule { + + /** + * Excluded dictionary words. + */ + private static final Pattern EXC_WORDS = Pattern + .compile("nie|tuż|aż|to|siebie|być|ani|ni|albo|" + + "lub|czy|bądź|jako|zł|np|coraz" + + "|bardzo|bardziej|proc|ten|jak|mln|tys|swój|mój|" + + "twój|nasz|wasz|i|zbyt"); + + /** + * Excluded part of speech classes. + */ + private static final Pattern EXC_POS = Pattern.compile("prep:.*|ppron.*"); + + /** + * Excluded non-words (special symbols, Roman numerals etc. 
+ */ + private static final Pattern EXC_NONWORDS = Pattern + .compile(""|>|<|&|[0-9].*|" + + "M*(D?C{0,3}|C[DM])(L?X{0,3}|X[LC])(V?I{0,3}|I[VX])$"); + + public PolishWordRepeatRule(final ResourceBundle messages) { + if (messages != null) { + super.setCategory(new Category(messages.getString("category_misc"))); + } + setDefaultOff(); + } + + /* + * (non-Javadoc) + * + * @see de.danielnaber.languagetool.rules.Rule#getId() + */ + @Override + public final String getId() { + return "PL_WORD_REPEAT"; + } + + /* + * (non-Javadoc) + * + * @see de.danielnaber.languagetool.rules.Rule#getDescription() + */ + @Override + public final String getDescription() { + return "Powtórzenia wyrazów w zdaniu (monotonia stylistyczna)"; + } + + /* + * Tests if any word form is repeated in the sentence. + */ + @Override + public final RuleMatch[] match(final AnalyzedSentence text) { + final List<RuleMatch> ruleMatches = new ArrayList<RuleMatch>(); + final AnalyzedTokenReadings[] tokens = text.getTokensWithoutWhitespace(); + boolean repetition = false; + final TreeSet<String> inflectedWords = new TreeSet<String>(); + String prevLemma, curLemma; + // start from real token, 0 = SENT_START + for (int i = 1; i < tokens.length; i++) { + final String token = tokens[i].getToken(); + // avoid "..." etc. 
to be matched: + boolean isWord = true; + boolean hasLemma = true; + + if (token.length() < 2) { + isWord = false; + } + + final int readingsLen = tokens[i].getReadingsLength(); + for (int k = 0; k < readingsLen; k++) { + final String posTag = tokens[i].getAnalyzedToken(k).getPOSTag(); + if (posTag != null) { + if (StringTools.isEmpty(posTag)) { + isWord = false; + break; + } + // FIXME: too many false alarms here: + final String lemma = tokens[i].getAnalyzedToken(k).getLemma(); + if (lemma == null) { + hasLemma = false; + break; + } + final Matcher m1 = EXC_WORDS.matcher(lemma); + if (m1.matches()) { + isWord = false; + break; + } + + final Matcher m2 = EXC_POS.matcher(posTag); + if (m2.matches()) { + isWord = false; + break; + } + } else { + hasLemma = false; + } + + } + + final Matcher m1 = EXC_NONWORDS.matcher(tokens[i].getToken()); + if (m1.matches()) { + isWord = false; + } + + prevLemma = ""; + if (isWord) { + boolean notSentEnd = false; + for (int j = 0; j < readingsLen; j++) { + final String pos = tokens[i].getAnalyzedToken(j).getPOSTag(); + if (pos != null) { + notSentEnd |= "SENT_END".equals(pos); + } + if (hasLemma) { + curLemma = tokens[i].getAnalyzedToken(j).getLemma(); + if (!prevLemma.equals(curLemma) && !notSentEnd) { + if (inflectedWords.contains(curLemma)) { + repetition = true; + } else { + inflectedWords.add(tokens[i].getAnalyzedToken(j).getLemma()); + } + } + prevLemma = curLemma; + } else { + if (inflectedWords.contains(tokens[i].getToken()) && !notSentEnd) { + repetition = true; + } else { + inflectedWords.add(tokens[i].getToken()); + } + } + + } + } + + if (repetition) { + final String msg = "Powtórzony wyraz w zdaniu"; + final int pos = tokens[i].getStartPos(); + final RuleMatch ruleMatch = new RuleMatch(this, pos, pos + + token.length(), msg, "Powtórzenie wyrazu"); + ruleMatches.add(ruleMatch); + repetition = false; + } + + } + return toRuleMatchArray(ruleMatches); + } + + /* + * (non-Javadoc) + * + * @see 
de.danielnaber.languagetool.rules.Rule#reset() + */ + @Override + public void reset() { + // nothing + + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/pl/SimpleReplaceRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/pl/SimpleReplaceRule.java new file mode 100644 index 0000000..90708d9 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/pl/SimpleReplaceRule.java @@ -0,0 +1,82 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.rules.pl; + +import java.io.IOException; +import java.util.Locale; +import java.util.ResourceBundle; + +import de.danielnaber.languagetool.rules.AbstractSimpleReplaceRule; + +/** + * A rule that matches words or phrases which should not be used and suggests + * correct ones instead. + * + * Polish implementations. Loads the list of words from + * <code>rules/pl/replace.txt</code>. 
+ * + * @author Marcin Miłkowski + */ +public class SimpleReplaceRule extends AbstractSimpleReplaceRule { + + public static final String POLISH_SIMPLE_REPLACE_RULE = "PL_SIMPLE_REPLACE"; + + private static final String FILE_NAME = "/pl/replace.txt"; + // locale used on case-conversion + private static final Locale PL_LOCALE = new Locale("pl"); + + public final String getFileName() { + return FILE_NAME; + } + + public SimpleReplaceRule(final ResourceBundle messages) throws IOException { + super(messages); + } + + public final String getId() { + return POLISH_SIMPLE_REPLACE_RULE; + } + + public String getDescription() { + return "Typowe literówki"; + } + + public String getShort() { + return "Literówka"; + } + + public String getSuggestion() { + return " to typowa literówka, poprawnie: "; + } + + /** + * use case-insensitive matching. + */ + public boolean isCaseSensitive() { + return false; + } + + /** + * locale used on case-conversion + */ + public Locale getLocale() { + return PL_LOCALE; + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/ro/CompoundRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/ro/CompoundRule.java new file mode 100644 index 0000000..bb9dea8 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/ro/CompoundRule.java @@ -0,0 +1,58 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2006 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.rules.ro; + +import java.io.IOException; +import java.util.ResourceBundle; + +import de.danielnaber.languagetool.JLanguageTool; +import de.danielnaber.languagetool.rules.AbstractCompoundRule; + +/** + * Checks that compounds (if in the list) are not written as separate words. + * + * @author Ionuț Păduraru, based on code by Daniel Naber + */ +public class CompoundRule extends AbstractCompoundRule { + + public static final String ROMANIAN_COMPOUND_RULE = "RO_COMPOUND"; + private static final String FILE_NAME = "/ro/compounds.txt"; + + public CompoundRule(final ResourceBundle messages) throws IOException { + super(messages); + loadCompoundFile(JLanguageTool.getDataBroker().getFromRulesDirAsStream(FILE_NAME), "UTF-8"); + super.setShort("Problemă de scriere (cratimă, spațiu, etc.)"); + super.setMsg("Cuvântul se scrie cu cratimă.", + "Cuvântul se scrie legat.", + "Cuvântul se scrie legat sau cu cratimă."); + // default value (2) is not ok for Romanian + setMaxUnHyphenatedWordCount(Integer.MAX_VALUE); + // there are words that should not be written with hyphen but as one word + setHyphenIgnored(false); + } + + public String getId() { + return ROMANIAN_COMPOUND_RULE; + } + + public String getDescription() { + return "Greșeală de scriere (cuvinte scrise legat sau cu cratimă)"; + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/ro/SimpleReplaceRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/ro/SimpleReplaceRule.java new file mode 100644 index 0000000..9e96513 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/ro/SimpleReplaceRule.java @@ -0,0 +1,264 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber 
(http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.rules.ro; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.Queue; +import java.util.ResourceBundle; +import java.util.concurrent.ArrayBlockingQueue; + +import de.danielnaber.languagetool.AnalyzedSentence; +import de.danielnaber.languagetool.AnalyzedTokenReadings; +import de.danielnaber.languagetool.JLanguageTool; +import de.danielnaber.languagetool.Language; +import de.danielnaber.languagetool.rules.AbstractSimpleReplaceRule; +import de.danielnaber.languagetool.rules.Category; +import de.danielnaber.languagetool.rules.Rule; +import de.danielnaber.languagetool.rules.RuleMatch; +import de.danielnaber.languagetool.tokenizers.Tokenizer; +import de.danielnaber.languagetool.tools.StringTools; + +/** + * A rule that matches words which should not be used and suggests correct ones instead. <br/> + * Romanian implementations. 
Loads the list of words from + * <code>/ro/replace.txt</code>.<br/><br/> + * + * Unlike AbstractSimpleReplaceRule, supports multiple words (Ex: "aqua forte" => "acvaforte").<br/><br/> + * + * Note: Merge this into {@link AbstractSimpleReplaceRule} eventually and simply extend from AbstractSimpleReplaceRule.<br/> + * + * @author Ionuț Păduraru + * @version $Id$ + * + */ +public class SimpleReplaceRule extends Rule { + + public static final String ROMANIAN_SIMPLE_REPLACE_RULE = "RO_SIMPLE_REPLACE"; + + private static final String FILE_NAME = "/ro/replace.txt"; + private static final String FILE_ENCODING = "utf-8"; + // locale used on case-conversion + private static Locale roLocale = new Locale("ro"); + + // list of maps containing error-corrections pairs. + // the n-th map contains key strings of (n+1) words + private List<Map<String, String>> wrongWords; + + public final String getFileName() { + return FILE_NAME; + } + + public SimpleReplaceRule(final ResourceBundle messages) throws IOException { + super(messages); + if (messages != null) { + super.setCategory(new Category(messages.getString("category_misc"))); + } + wrongWords = loadWords(JLanguageTool.getDataBroker().getFromRulesDirAsStream(getFileName())); + } + + public final String getId() { + return ROMANIAN_SIMPLE_REPLACE_RULE; + } + + public String getDescription() { + return "Cuvinte sau grupuri de cuvinte incorecte sau ieșite din uz"; + } + + public String getShort() { + return "Cuvânt incorect sau ieșit din uz"; + } + + public String getSuggestion() { + return " este incorect sau ieșit din uz, folosiți "; + } + + /** + * @return the word used to separate multiple suggestions; used only before last suggestion, the rest are comma-separated. + */ + public String getSuggestionsSeparator() { + return " sau "; + } + + /** + * use case-insensitive matching. 
+ */ + public boolean isCaseSensitive() { + return false; + } + + /** + * locale used on case-conversion + */ + public Locale getLocale() { + return roLocale; + } + + public String getEncoding() { + return FILE_ENCODING; + } + + /** + * @return the word tokenizer used for tokenization on loading words. + */ + protected Tokenizer getWordTokenizer() { + return Language.ROMANIAN.getWordTokenizer(); + } + + /** + * @return the list of wrong words for which this rule can suggest correction. The list cannot be modified. + */ + public List<Map<String, String>> getWrongWords() { + return wrongWords; + } + + /** + * Load the list of words. <br/> + * Same as {@link AbstractSimpleReplaceRule#loadWords} but allows multiple words. + * @param file the file to load. + * @return the list of maps containing the error-corrections pairs. <br/>The n-th map contains key strings of (n+1) words. + * @throws IOException when the file contains errors. + * @see #getWordTokenizer + */ + private List<Map<String, String>> loadWords(final InputStream file) + throws IOException { + final List<Map<String, String>> list = new ArrayList<Map<String, String>>(); + InputStreamReader isr = null; + BufferedReader br = null; + try { + isr = new InputStreamReader(file, getEncoding()); + br = new BufferedReader(isr); + String line; + + while ((line = br.readLine()) != null) { + line = line.trim(); + if (line.length() < 1 || line.charAt(0) == '#') { // ignore comments + continue; + } + final String[] parts = line.split("="); + if (parts.length != 2) { + throw new IOException("Format error in file " + + JLanguageTool.getDataBroker().getFromRulesDirAsUrl(getFileName()) + + ", line: " + line); + } + final String[] wrongForms = parts[0].split("\\|"); // multiple incorect forms + for (String wrongForm : wrongForms) { + int wordCount = 0; + final List<String> tokens = getWordTokenizer().tokenize(wrongForm); + for (String token : tokens) { + if (!StringTools.isWhitespace(token)) { + wordCount++; + } + } + // grow 
if necessary + for (int i = list.size(); i < wordCount; i++) { + list.add(new HashMap<String, String>()); + } + list.get(wordCount - 1).put(wrongForm, parts[1]); + } + } + + } finally { + if (br != null) { + br.close(); + } + if (isr != null) { + isr.close(); + } + } + // seal the result (prevent modification from outside this class) + List<Map<String,String>> result = new ArrayList<Map<String, String>>(); + for (Map<String, String> map : list) { + result.add(Collections.unmodifiableMap(map)); + } + return Collections.unmodifiableList(result); + } + + private void addToQueue(AnalyzedTokenReadings token, + Queue<AnalyzedTokenReadings> prevTokens) { + final boolean inserted = prevTokens.offer(token); + if (!inserted) { + prevTokens.poll(); + prevTokens.offer(token); + } + } + + public RuleMatch[] match(final AnalyzedSentence text) { + final List<RuleMatch> ruleMatches = new ArrayList<RuleMatch>(); + final AnalyzedTokenReadings[] tokens = text + .getTokensWithoutWhitespace(); + + final Queue<AnalyzedTokenReadings> prevTokens = new ArrayBlockingQueue<AnalyzedTokenReadings>(wrongWords.size()); + + for (int i = 1; i < tokens.length; i++) { + addToQueue(tokens[i], prevTokens); + final StringBuilder sb = new StringBuilder(); + final ArrayList<String> variants = new ArrayList<String>(); + final List<AnalyzedTokenReadings> prevTokensList = Arrays.asList(prevTokens.toArray(new AnalyzedTokenReadings[] {})); + for (int j = prevTokensList.size() - 1; j >= 0; j--) { + if (j != prevTokensList.size() - 1 && prevTokensList.get(j + 1).isWhitespaceBefore()) + sb.insert(0, " "); + sb.insert(0, prevTokensList.get(j).getToken()); + variants.add(0, sb.toString()); + } + final int len = variants.size(); // prevTokensList and variants have now the same length + for (int j = 0; j < len; j++) { // longest words first + final String crt = variants.get(j); + final int crtWordCount = len - j; + final String crtMatch = isCaseSensitive() ? 
wrongWords.get(crtWordCount - 1).get(crt) : wrongWords.get(crtWordCount- 1).get(crt.toLowerCase(getLocale())); + if (crtMatch != null) { + final List<String> replacements = Arrays.asList(crtMatch.split("\\|")); + String msg = crt + getSuggestion(); + for (int k = 0; k < replacements.size(); k++) { + if (k > 0) { + msg = msg + (k == replacements.size() - 1 ? getSuggestionsSeparator(): ", "); + } + msg += "<suggestion>" + replacements.get(k) + "</suggestion>"; + } + final int startPos = prevTokensList.get(len - crtWordCount).getStartPos(); + final int endPos = prevTokensList.get(len - 1).getStartPos() + prevTokensList.get(len - 1).getToken().length(); + final RuleMatch potentialRuleMatch = new RuleMatch(this, startPos, endPos, msg, getShort()); + + if (!isCaseSensitive() && StringTools.startsWithUppercase(crt)) { + for (int k = 0; k < replacements.size(); k++) { + replacements.set(k, StringTools.uppercaseFirstChar(replacements.get(k))); + } + } + potentialRuleMatch.setSuggestedReplacements(replacements); + ruleMatches.add(potentialRuleMatch); + break; + } + } + } + return toRuleMatchArray(ruleMatches); + } + + public void reset() { + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/ru/RuSimpleReplaceRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/ru/RuSimpleReplaceRule.java new file mode 100644 index 0000000..4076a9c --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/ru/RuSimpleReplaceRule.java @@ -0,0 +1,80 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.rules.ru; + +import java.io.IOException; +import java.util.Locale; +import java.util.ResourceBundle; + +import de.danielnaber.languagetool.rules.AbstractSimpleReplaceRule; + +/** + * A rule that matches words or phrases which should not be used and suggests + * correct ones instead. + * + * Russian implementations. Loads the + * relevant words from <code>rules/ru/replace.txt</code>. + * + * @author Yakov Reztsov + */ +public class RuSimpleReplaceRule extends AbstractSimpleReplaceRule { + + private static final String FILE_NAME = "/ru/replace.txt"; + + // locale used on case-conversion + private static final Locale RU_LOCALE = new Locale("ru"); + + + public final String getFileName() { + return FILE_NAME; + } + public RuSimpleReplaceRule(final ResourceBundle messages) throws IOException { + super(messages); + } + + public final String getId() { + return "RU_SIMPLE_REPLACE"; + } + + public String getDescription() { + return "Поиск ошибочных слов/фраз"; + } + +public String getShort() { + return "Ошибка?"; + } + + public String getSuggestion() { + return " - ошибочное слово/фраза, исправление: "; + } + + /** + * use case-insensitive matching. 
+ */ + public boolean isCaseSensitive() { + return false; + } + + /** + * locale used on case-conversion + */ + public Locale getLocale() { + return RU_LOCALE; + } +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/ru/RussianCompoundRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/ru/RussianCompoundRule.java new file mode 100644 index 0000000..3e7d889 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/ru/RussianCompoundRule.java @@ -0,0 +1,57 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2006 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.rules.ru; + +import java.io.IOException; +import java.util.ResourceBundle; + +import de.danielnaber.languagetool.JLanguageTool; +import de.danielnaber.languagetool.rules.AbstractCompoundRule; + +/** + * Checks that compounds (if in the list) are not written as separate words. + * Russian compounds rule. + * @author Yakov Reztsov + * + * Based on German compounds rule. 
+ * @author Daniel Naber + * + */ +public class RussianCompoundRule extends AbstractCompoundRule { + + private static final String FILE_NAME = "/ru/compounds_ru.txt"; + + public RussianCompoundRule(final ResourceBundle messages) throws IOException { + super(messages); + loadCompoundFile(JLanguageTool.getDataBroker().getFromResourceDirAsStream(FILE_NAME), "UTF-8"); + super.setMsg("Эти слова должны быть написаны через дефис.", + "Эти слова должны быть написаны слитно.", + "Эти слова могут быть написаны через дефис или слитно."); + + } + + public String getId() { + return "RU_COMPOUNDS"; + } + + public String getDescription() { + return "Правописание через дефис"; + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/ru/RussianRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/ru/RussianRule.java new file mode 100644 index 0000000..030abf2 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/ru/RussianRule.java @@ -0,0 +1,30 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.rules.ru; + +import de.danielnaber.languagetool.rules.Rule; + +/** + * Abstract base class for rules for the Russian language. + * + * @author + */ +public abstract class RussianRule extends Rule { + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/ru/RussianUnpairedBracketsRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/ru/RussianUnpairedBracketsRule.java new file mode 100644 index 0000000..75dd86b --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/ru/RussianUnpairedBracketsRule.java @@ -0,0 +1,62 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2010 Marcin Miłkowski (http://www.languagetool.org) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ + +package de.danielnaber.languagetool.rules.ru; + +import java.util.ResourceBundle; +import java.util.regex.Pattern; + +import de.danielnaber.languagetool.AnalyzedTokenReadings; +import de.danielnaber.languagetool.Language; +import de.danielnaber.languagetool.rules.GenericUnpairedBracketsRule; + +public class RussianUnpairedBracketsRule extends GenericUnpairedBracketsRule { + + private static final String[] RU_START_SYMBOLS = { "[", "(", "{", "„", "«", "\"", "'" }; + private static final String[] RU_END_SYMBOLS = { "]", ")", "}", "“", "»", "\"", "'" }; + + private static final Pattern NUMERALS_RU = Pattern + .compile("(?i)\\d{1,2}?[а-я]*|[а-я]|[А-Я]|[а-я][а-я]|[А-Я][А-Я]"); + + + protected boolean isNoException(final String token, + final AnalyzedTokenReadings[] tokens, final int i, final int j, + final boolean precSpace, + final boolean follSpace) { + // exception for Russian bullets: а), б), Д)..., ДД), аа) and 1а). 
+ if (i > 1 && endSymbols[j].equals(")") && + NUMERALS_RU.matcher(tokens[i - 1].getToken()).matches() && + !(!symbolStack.empty() && "(".equals(symbolStack.peek().symbol))) { + return false; + } + return true; + } + + public RussianUnpairedBracketsRule(final ResourceBundle messages, + final Language language) { + super(messages, language); + startSymbols = RU_START_SYMBOLS; + endSymbols = RU_END_SYMBOLS; + } + + public String getId() { + return "RU_UNPAIRED_BRACKETS"; + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/sk/CompoundRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/sk/CompoundRule.java new file mode 100644 index 0000000..d5260bf --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/sk/CompoundRule.java @@ -0,0 +1,55 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2006 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.rules.sk; + +import java.io.IOException; +import java.util.ResourceBundle; + +import de.danielnaber.languagetool.JLanguageTool; +import de.danielnaber.languagetool.rules.AbstractCompoundRule; + +/** + * Checks that compounds (if in the list) are not written as separate words. + * + * @author Zdenko Podobný based on code by Marcin Miłkowski, Daniel Naber + */ + +public final class CompoundRule extends AbstractCompoundRule { + + private static final String FILE_NAME = "/sk/compounds.txt"; + + public CompoundRule(final ResourceBundle messages) throws IOException { + super(messages); + loadCompoundFile(JLanguageTool.getDataBroker().getFromResourceDirAsStream(FILE_NAME), "UTF-8"); + super.setShort("Problém spájania slov"); + super.setMsg("Toto slovo sa zvyčajne píše so spojovníkom.", + "Toto slovo sa obvykle píše bez spojovníka.", + "Tento výraz sa bežne píše s alebo bez spojovníka."); + } + + public final String getId() { + return "SK_COMPOUNDS"; + } + + public final String getDescription() { + return "Slová so spojovníkom napr. 
použite „česko-slovenský” namiesto „česko slovenský”"; + } + + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/sk/SlovakRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/sk/SlovakRule.java new file mode 100644 index 0000000..f28067a --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/sk/SlovakRule.java @@ -0,0 +1,31 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.rules.sk; + +import de.danielnaber.languagetool.rules.Rule; + +/** + * Abstract base class for Polish rules. 
+ * + * @author Zdenko Podobný based on Polish rules + * + */
public abstract class SlovakRule extends Rule {
  // Declares no members of its own; it only gives the sk rules a common parent type.
}
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/sk/SlovakVes.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/sk/SlovakVes.java new file mode 100644 index 0000000..3fff582 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/sk/SlovakVes.java @@ -0,0 +1,146 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2010 Luboš Lehotský lubo.lehotsky (at) gmail (dot) com + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details.
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ + +package de.danielnaber.languagetool.rules.sk; + + +import java.util.ArrayList; +import java.util.List; +import java.util.ResourceBundle; + +import de.danielnaber.languagetool.AnalyzedSentence; +import de.danielnaber.languagetool.AnalyzedTokenReadings; +import de.danielnaber.languagetool.rules.Category; +import de.danielnaber.languagetool.rules.RuleMatch; + + +public class SlovakVes extends SlovakRule { + + public SlovakVes(final ResourceBundle messages) { + if (messages != null) { + super.setCategory(new Category(messages.getString("category_misc"))); + } + setDefaultOff(); + } + + @Override + public final String getId() { + return "SK_VES"; + } + + @Override + public final String getDescription() { + return "Názvy obcí, v ktorých je \"Ves\""; + } + + @Override + public final RuleMatch[] match(final AnalyzedSentence text) { + final List<RuleMatch> ruleMatches = new ArrayList<RuleMatch>(); + final AnalyzedTokenReadings[] tokens = text.getTokensWithoutWhitespace(); + // never read boolean prve_uvodzovky; + boolean tag, tag2, tag3; + final String pomoc; + final char znak; + +// never read prve_uvodzovky = false; + tag = false; + tag2 = false; + tag3 = false; + + pomoc = tokens[1].getToken(); + if (pomoc.length() >= 1) { + znak = pomoc.charAt(0); + } else { + znak = '.'; + } + + if (znak == '?') { +// never read prve_uvodzovky = true; + } + for (int i = 1; i < tokens.length; i++) { + final String token = tokens[i].getToken(); +// never read String premenna = token.toString(); + final char pomocnik; +// never read final int help; + boolean bodka; + boolean pady; + + pady = false; + pomocnik = token.charAt(0); + bodka = false; + if (token.charAt(0) == '.' || token.charAt(0) == '?' 
+ || token.charAt(0) == '!') { + bodka = true; + } + + if (tokens[i].hasPosTag("AAfs1x") || tokens[i].hasPosTag("AAfs2x") + || tokens[i].hasPosTag("AAfs3x") + || tokens[i].hasPosTag("AAfs4x") + || tokens[i].hasPosTag("AAfs6x") + || tokens[i].hasPosTag("AAfs7x")) { + pady = true; + } + if (pady && Character.isUpperCase(pomocnik)) { + tag = true; + } + + if (tag && !tag2) { + if (pady && Character.isLowerCase(pomocnik)) { + tag2 = true; + // premenna = tokens[i].getToken(); + } + + } + + if (tag2) { + if (token.equals("Ves") || token.equals("Vsi") + || token.equals("Vsou")) { + tag3 = true; + } + } + if (tag3 && !bodka) { + String spravne; + char prve; + + prve = tokens[i - 1].getToken().charAt(0); + prve = Character.toUpperCase(prve); + spravne = tokens[i - 1].getToken().substring(1, + tokens[i - 1].getToken().length()); + + final String msg = "Zmeňte začiatočné písmeno na veľké: <suggestion> " + + prve + spravne + " </suggestion>"; + final int pos = tokens[i - 1].getStartPos(); + final int pos2 = tokens[i - 1].getToken().length(); + final RuleMatch ruleMatch = new RuleMatch(this, pos, pos + pos2, + msg, "Zmeňte začiatočné písmeno na veľké: "); + + ruleMatches.add(ruleMatch); + + } + + } + return toRuleMatchArray(ruleMatches); + } + + @Override + public void reset() {// nothing + } + +} + diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/sv/CompoundRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/sv/CompoundRule.java new file mode 100644 index 0000000..b3087cd --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/sv/CompoundRule.java @@ -0,0 +1,247 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2006 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your 
option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.rules.sv; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Queue; +import java.util.ResourceBundle; +import java.util.Set; +import java.util.concurrent.ArrayBlockingQueue; + +import de.danielnaber.languagetool.AnalyzedSentence; +import de.danielnaber.languagetool.AnalyzedToken; +import de.danielnaber.languagetool.AnalyzedTokenReadings; +import de.danielnaber.languagetool.JLanguageTool; +import de.danielnaber.languagetool.rules.Category; +import de.danielnaber.languagetool.rules.RuleMatch; +import de.danielnaber.languagetool.tools.StringTools; + +/** + * Checks that compounds (if in the list) are not written as separate words. + * + * @author Daniel Naber + */ +public class CompoundRule extends SwedishRule { + //TODO for words with more then one part check if parts of it is compounded. + //in env. 
allt-i-genom+ should match "allt i genom", "allt igenom" as well as "allti genom" + private static final String FILE_NAME = "/sv/compounds.txt"; + + private final static int MAX_TERMS = 5; + + private final Set<String> incorrectCompounds = new HashSet<String>(); + private final Set<String> noDashSuggestion = new HashSet<String>(); + private final Set<String> onlyDashSuggestion = new HashSet<String>(); + + public CompoundRule(final ResourceBundle messages) throws IOException { + if (messages != null) + super.setCategory(new Category(messages.getString("category_misc"))); + loadCompoundFile(JLanguageTool.getDataBroker().getFromResourceDirAsStream(FILE_NAME), "UTF-8"); + } + + public String getId() { + return "SV_COMPOUNDS"; + } + + public String getDescription() { + return "Särskrivningar, t.ex. 'cd rom' bör skrivas 'cd-rom'"; + } + + public RuleMatch[] match(final AnalyzedSentence text) { + final List<RuleMatch> ruleMatches = new ArrayList<RuleMatch>(); + final AnalyzedTokenReadings[] tokens = text.getTokensWithoutWhitespace(); + + RuleMatch prevRuleMatch = null; + final Queue<AnalyzedTokenReadings> prevTokens = new ArrayBlockingQueue<AnalyzedTokenReadings>(MAX_TERMS); + for (int i = 0; i < tokens.length + MAX_TERMS-1; i++) { + AnalyzedTokenReadings token = null; + // we need to extend the token list so we find matches at the end of the original list: + if (i >= tokens.length) + token = new AnalyzedTokenReadings(new AnalyzedToken("", "", null), prevTokens.peek().getStartPos()); + else + token = tokens[i]; + if (i == 0) { + addToQueue(token, prevTokens); + continue; + } + + final StringBuilder sb = new StringBuilder(); + int j = 0; + AnalyzedTokenReadings firstMatchToken = null; + final List<String> stringsToCheck = new ArrayList<String>(); + final List<String> origStringsToCheck = new ArrayList<String>(); // original upper/lowercase spelling + final Map<String, AnalyzedTokenReadings> stringToToken = new HashMap<String, AnalyzedTokenReadings>(); + for 
(AnalyzedTokenReadings atr : prevTokens) { + if (j == 0) + firstMatchToken = atr; + sb.append(' '); + sb.append(atr.getToken()); + if (j >= 1) { + final String stringToCheck = normalize(sb.toString()); + stringsToCheck.add(stringToCheck); + origStringsToCheck.add(sb.toString().trim()); + if (!stringToToken.containsKey(stringToCheck)) + stringToToken.put(stringToCheck, atr); + } + j++; + } + // iterate backwards over all potentially incorrect strings to make + // sure we match longer strings first: + for (int k = stringsToCheck.size()-1; k >= 0; k--) { + final String stringToCheck = stringsToCheck.get(k); + final String origStringToCheck = origStringsToCheck.get(k); + //System.err.println("##"+stringtoCheck+"#"); + if (incorrectCompounds.contains(stringToCheck)) { + final AnalyzedTokenReadings atr = stringToToken.get(stringToCheck); + String msg = null; + final List<String> repl = new ArrayList<String>(); + if (!noDashSuggestion.contains(stringToCheck)) { + repl.add(origStringToCheck.replace(' ', '-')); + msg = "Dessa ord skrivs samman med bindesträck."; + } + // Do not assume that compounds with more than two parts should always use hyphens: + if (!hasAllUppercaseParts(origStringToCheck) && !onlyDashSuggestion.contains(stringToCheck)) { + repl.add(mergeCompound(origStringToCheck)); + msg = "Dessa ord skrivs samman."; + } + final String[] parts = stringToCheck.split(" "); + if (parts.length > 0) { + repl.clear(); + repl.add(origStringToCheck.replace(' ', '-')); + msg = "Dessa ord skrivs samman med bindesträck."; + } else if (repl.size() == 0 || repl.size() == 2) { // == 0 shouldn't happen + // did not work as expected so I added repl. explicitly. 
+ msg = "Dessa ord skrivs samman med eller utan bindesträck."; + repl.clear(); + repl.add(origStringToCheck.replace(' ', '-')); + repl.add(mergeCompound(origStringToCheck)); + } + final RuleMatch ruleMatch = new RuleMatch(this, firstMatchToken.getStartPos(), + atr.getStartPos() + atr.getToken().length(), msg); + // avoid duplicate matches: + if (prevRuleMatch != null && prevRuleMatch.getFromPos() == ruleMatch.getFromPos()) { + prevRuleMatch = ruleMatch; + break; + } + prevRuleMatch = ruleMatch; + ruleMatch.setSuggestedReplacements(repl); + ruleMatches.add(ruleMatch); + break; + } + } + addToQueue(token, prevTokens); + } + return toRuleMatchArray(ruleMatches); + } + + /** + * Replaces dashes with whitespace + * e.g. "E-Mail Adresse" -> "E Mail Adresse" so the error can be detected: + * @param str + * @return str + */ + private String normalize(String str) { + str = str.trim().toLowerCase(); + if (str.indexOf('-') != -1 && str.indexOf(' ') != -1) { + // e.g. "E-Mail Adresse" -> "E Mail Adresse" so the error can be detected: + str = str.replace('-', ' '); + } + return str; + } + + private boolean hasAllUppercaseParts(String str) { + final String[] parts = str.split(" "); + for (String part : parts) { + if (StringTools.isAllUppercase(part)) { + return true; + } + } + return false; + } + + private String mergeCompound(String str) { + final String[] stringParts = str.split(" "); + final StringBuilder sb = new StringBuilder(); + for (int k = 0; k < stringParts.length; k++) { + if (k == 0) + sb.append(stringParts[k]); + else + sb.append(stringParts[k].toLowerCase()); + } + return sb.toString(); + } + + private void addToQueue(AnalyzedTokenReadings token, Queue<AnalyzedTokenReadings> prevTokens) { + final boolean inserted = prevTokens.offer(token); + if (!inserted) { + prevTokens.poll(); + prevTokens.offer(token); + } + } + + private void loadCompoundFile(final InputStream file, final String encoding) throws IOException { + InputStreamReader isr = null; + BufferedReader br 
= null; + try { + isr = new InputStreamReader(file, encoding); + br = new BufferedReader(isr); + String line; + while ((line = br.readLine()) != null) { + line = line.trim(); + if (line.length() < 1) { + continue; + } + if (line.charAt(0) == '#') { // ignore comments + continue; + } + // the set contains the incorrect spellings, i.e. the ones without hyphen + line = line.replace('-', ' '); + final String[] parts = line.split(" "); + if (parts.length > MAX_TERMS) + throw new IOException("För många ord sammansatta: " + line + ", max antal tillåtna ord: " + MAX_TERMS); + if (parts.length == 1) + throw new IOException("Inget sammansatt ord: " + line); + if (line.endsWith("+")) { + line = line.substring(0, line.length() - 1); // cut off "+" + noDashSuggestion.add(line.toLowerCase()); + } else if (line.endsWith("*")) { + line = line.substring(0, line.length() - 1); // cut off "*" + onlyDashSuggestion.add(line.toLowerCase()); + } + incorrectCompounds.add(line.toLowerCase()); + } + } finally { + if (br != null) br.close(); + if (isr != null) isr.close(); + } + } + + public void reset() { + } + +} + + diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/sv/SwedishRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/sv/SwedishRule.java new file mode 100644 index 0000000..73af8fe --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/sv/SwedishRule.java @@ -0,0 +1,31 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.rules.sv; + +import de.danielnaber.languagetool.rules.Rule; + +/** + * Abstract base class for Swedish rules. + * + * @author Marcin Miłkowski + * + */ +public abstract class SwedishRule extends Rule { + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/uk/PunctuationCheckRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/uk/PunctuationCheckRule.java new file mode 100644 index 0000000..5abc339 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/uk/PunctuationCheckRule.java @@ -0,0 +1,76 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.rules.uk; + +import java.util.ResourceBundle; + +import de.danielnaber.languagetool.rules.AbstractPunctuationCheckRule; + +/** + * A rule that matches "..", "::", "-," but not "...", "!..", "?!!", ",-" etc + * TODO: spaces seem to be special, extract from regexp? + * + * @author Andriy Rysin + */ +public class PunctuationCheckRule extends AbstractPunctuationCheckRule { + + public PunctuationCheckRule(final ResourceBundle messages) { + super(messages); + // super.setCategory(new Category(messages.getString("category_misc"))); + } + + // private boolean isTripleOk(String token) { + // return token.matches("^[.!?]$"); + // } + + /* + * (non-Javadoc) + * + * @see + * de.danielnaber.languagetool.rules.AbstractPunctuationCheckRule#isPunctsJoinOk + * (java.lang.String) + */ + protected final boolean isPunctsJoinOk(final String tokens) { + return // we ignore duplicated spaces - too many errors + tokens.matches("([,:] | *- |,- | ) *") // internal puctuation + || tokens + .matches("([.!?]|!!!|\\?\\?\\?|\\?!!|!\\.\\.|\\?\\.\\.|\\.\\.\\.) 
*"); + } + + /* + * (non-Javadoc) + * + * @see + * de.danielnaber.languagetool.rules.AbstractPunctuationCheckRule#isPunctuation + * (java.lang.String) + */ + protected final boolean isPunctuation(final String token) { + return token.matches("^[.,!?: -]$"); + } + + /* + * (non-Javadoc) + * + * @see de.danielnaber.languagetool.rules.AbstractPunctuationCheckRule#reset() + */ + public void reset() { + // nothing + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/uk/SimpleReplaceRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/uk/SimpleReplaceRule.java new file mode 100644 index 0000000..3bba01c --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/uk/SimpleReplaceRule.java @@ -0,0 +1,50 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.rules.uk; + +import java.io.IOException; +import java.util.ResourceBundle; + +import de.danielnaber.languagetool.rules.AbstractSimpleReplaceRule; + +/** + * A rule that matches words or phrases which should not be used and suggests + * correct ones instead. + * + * Ukrainian implementations. 
Loads the + * relevant words from <code>rules/uk/replace.txt</code>. + * + * @author Andriy Rysin + */ +public class SimpleReplaceRule extends AbstractSimpleReplaceRule { + + private static final String FILE_NAME = "/uk/replace.txt"; + + public final String getFileName() { + return FILE_NAME; + } + public SimpleReplaceRule(final ResourceBundle messages) throws IOException { + super(messages); + } + + public final String getId() { + return "UK_SIMPLE_REPLACE"; + } + +} |