summaryrefslogtreecommitdiffstats
path: root/JLanguageTool/src/java/de/danielnaber/languagetool/rules
diff options
context:
space:
mode:
Diffstat (limited to 'JLanguageTool/src/java/de/danielnaber/languagetool/rules')
-rw-r--r--JLanguageTool/src/java/de/danielnaber/languagetool/rules/AbstractCompoundRule.java279
-rw-r--r--JLanguageTool/src/java/de/danielnaber/languagetool/rules/AbstractPunctuationCheckRule.java93
-rw-r--r--JLanguageTool/src/java/de/danielnaber/languagetool/rules/AbstractSimpleReplaceRule.java159
-rw-r--r--JLanguageTool/src/java/de/danielnaber/languagetool/rules/Category.java85
-rw-r--r--JLanguageTool/src/java/de/danielnaber/languagetool/rules/CommaWhitespaceRule.java170
-rw-r--r--JLanguageTool/src/java/de/danielnaber/languagetool/rules/DoublePunctuationRule.java99
-rw-r--r--JLanguageTool/src/java/de/danielnaber/languagetool/rules/GenericUnpairedBracketsRule.java314
-rw-r--r--JLanguageTool/src/java/de/danielnaber/languagetool/rules/IncorrectExample.java62
-rw-r--r--JLanguageTool/src/java/de/danielnaber/languagetool/rules/Rule.java230
-rw-r--r--JLanguageTool/src/java/de/danielnaber/languagetool/rules/RuleMatch.java239
-rw-r--r--JLanguageTool/src/java/de/danielnaber/languagetool/rules/UppercaseSentenceStartRule.java136
-rw-r--r--JLanguageTool/src/java/de/danielnaber/languagetool/rules/WhitespaceRule.java91
-rw-r--r--JLanguageTool/src/java/de/danielnaber/languagetool/rules/WordRepeatRule.java101
-rw-r--r--JLanguageTool/src/java/de/danielnaber/languagetool/rules/bitext/BitextRule.java106
-rw-r--r--JLanguageTool/src/java/de/danielnaber/languagetool/rules/bitext/DifferentLengthRule.java93
-rw-r--r--JLanguageTool/src/java/de/danielnaber/languagetool/rules/bitext/IncorrectBitextExample.java64
-rw-r--r--JLanguageTool/src/java/de/danielnaber/languagetool/rules/bitext/SameTranslationRule.java88
-rw-r--r--JLanguageTool/src/java/de/danielnaber/languagetool/rules/ca/AccentuacioReplaceRule.java90
-rw-r--r--JLanguageTool/src/java/de/danielnaber/languagetool/rules/ca/CastellanismesReplaceRule.java85
-rw-r--r--JLanguageTool/src/java/de/danielnaber/languagetool/rules/de/AgreementRule.java405
-rw-r--r--JLanguageTool/src/java/de/danielnaber/languagetool/rules/de/CaseRule.java358
-rw-r--r--JLanguageTool/src/java/de/danielnaber/languagetool/rules/de/CompoundRule.java53
-rw-r--r--JLanguageTool/src/java/de/danielnaber/languagetool/rules/de/DashRule.java84
-rw-r--r--JLanguageTool/src/java/de/danielnaber/languagetool/rules/de/GermanLemmatizer.java84
-rw-r--r--JLanguageTool/src/java/de/danielnaber/languagetool/rules/de/GermanRule.java30
-rw-r--r--JLanguageTool/src/java/de/danielnaber/languagetool/rules/de/GermanWordRepeatRule.java39
-rw-r--r--JLanguageTool/src/java/de/danielnaber/languagetool/rules/de/WiederVsWiderRule.java91
-rw-r--r--JLanguageTool/src/java/de/danielnaber/languagetool/rules/de/WordCoherencyRule.java156
-rw-r--r--JLanguageTool/src/java/de/danielnaber/languagetool/rules/en/AvsAnRule.java251
-rw-r--r--JLanguageTool/src/java/de/danielnaber/languagetool/rules/en/CompoundRule.java55
-rw-r--r--JLanguageTool/src/java/de/danielnaber/languagetool/rules/en/EnglishRule.java30
-rw-r--r--JLanguageTool/src/java/de/danielnaber/languagetool/rules/en/EnglishUnpairedBracketsRule.java89
-rw-r--r--JLanguageTool/src/java/de/danielnaber/languagetool/rules/es/ElwithFemRule.java179
-rw-r--r--JLanguageTool/src/java/de/danielnaber/languagetool/rules/es/SpanishRule.java32
-rw-r--r--JLanguageTool/src/java/de/danielnaber/languagetool/rules/fr/FrenchRule.java31
-rw-r--r--JLanguageTool/src/java/de/danielnaber/languagetool/rules/fr/QuestionWhitespaceRule.java161
-rw-r--r--JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/AbstractPatternRule.java223
-rw-r--r--JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/Element.java803
-rw-r--r--JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/FalseFriendRuleLoader.java356
-rw-r--r--JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/Match.java551
-rw-r--r--JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/PatternRule.java652
-rw-r--r--JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/PatternRuleLoader.java369
-rw-r--r--JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/Unifier.java432
-rw-r--r--JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/XMLRuleHandler.java568
-rw-r--r--JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/bitext/BitextPatternRule.java93
-rw-r--r--JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/bitext/BitextPatternRuleLoader.java413
-rw-r--r--JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/bitext/BitextXMLRuleHandler.java56
-rw-r--r--JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/bitext/FalseFriendsAsBitextLoader.java72
-rw-r--r--JLanguageTool/src/java/de/danielnaber/languagetool/rules/pl/CompoundRule.java55
-rw-r--r--JLanguageTool/src/java/de/danielnaber/languagetool/rules/pl/PolishRule.java31
-rw-r--r--JLanguageTool/src/java/de/danielnaber/languagetool/rules/pl/PolishUnpairedBracketsRule.java42
-rw-r--r--JLanguageTool/src/java/de/danielnaber/languagetool/rules/pl/PolishWordRepeatRule.java200
-rw-r--r--JLanguageTool/src/java/de/danielnaber/languagetool/rules/pl/SimpleReplaceRule.java82
-rw-r--r--JLanguageTool/src/java/de/danielnaber/languagetool/rules/ro/CompoundRule.java58
-rw-r--r--JLanguageTool/src/java/de/danielnaber/languagetool/rules/ro/SimpleReplaceRule.java264
-rw-r--r--JLanguageTool/src/java/de/danielnaber/languagetool/rules/ru/RuSimpleReplaceRule.java80
-rw-r--r--JLanguageTool/src/java/de/danielnaber/languagetool/rules/ru/RussianCompoundRule.java57
-rw-r--r--JLanguageTool/src/java/de/danielnaber/languagetool/rules/ru/RussianRule.java30
-rw-r--r--JLanguageTool/src/java/de/danielnaber/languagetool/rules/ru/RussianUnpairedBracketsRule.java62
-rw-r--r--JLanguageTool/src/java/de/danielnaber/languagetool/rules/sk/CompoundRule.java55
-rw-r--r--JLanguageTool/src/java/de/danielnaber/languagetool/rules/sk/SlovakRule.java31
-rw-r--r--JLanguageTool/src/java/de/danielnaber/languagetool/rules/sk/SlovakVes.java146
-rw-r--r--JLanguageTool/src/java/de/danielnaber/languagetool/rules/sv/CompoundRule.java247
-rw-r--r--JLanguageTool/src/java/de/danielnaber/languagetool/rules/sv/SwedishRule.java31
-rw-r--r--JLanguageTool/src/java/de/danielnaber/languagetool/rules/uk/PunctuationCheckRule.java76
-rw-r--r--JLanguageTool/src/java/de/danielnaber/languagetool/rules/uk/SimpleReplaceRule.java50
66 files changed, 10897 insertions, 0 deletions
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/AbstractCompoundRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/AbstractCompoundRule.java
new file mode 100644
index 0000000..8ef9119
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/AbstractCompoundRule.java
@@ -0,0 +1,279 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2006 Daniel Naber (http://www.danielnaber.de)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+package de.danielnaber.languagetool.rules;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Queue;
+import java.util.ResourceBundle;
+import java.util.Set;
+import java.util.concurrent.ArrayBlockingQueue;
+
+import de.danielnaber.languagetool.AnalyzedSentence;
+import de.danielnaber.languagetool.AnalyzedToken;
+import de.danielnaber.languagetool.AnalyzedTokenReadings;
+import de.danielnaber.languagetool.tools.StringTools;
+
+/**
+ * Checks that compounds (if in the list) are not written as separate words.
+ *
+ * @author Daniel Naber & Marcin Miłkowski (refactoring)
+ */
+
+public abstract class AbstractCompoundRule extends Rule {
+
+  /** Maximum number of parts a compound may have; also the size of the sliding token window. */
+  private static final int MAX_TERMS = 5;
+
+  /** Lower-cased, space-separated spellings that are reported as errors. */
+  private final Set<String> incorrectCompounds = new HashSet<String>();
+  /** Entries marked with a trailing "+" in the data file: no hyphenated form is suggested. */
+  private final Set<String> noDashSuggestion = new HashSet<String>();
+  /** Entries marked with a trailing "*" in the data file: only the hyphenated form is suggested. */
+  private final Set<String> onlyDashSuggestion = new HashSet<String>();
+
+  private String withHyphenMessage;
+  private String asOneMessage;
+  private String withOrWithoutHyphenMessage;
+
+  private String shortDesc;
+
+  /** Compounds with more than maxUnHyphenatedWordCount parts should always use hyphens. */
+  private int maxUnHyphenatedWordCount = 2;
+
+  /**
+   * Flag to indicate if the hyphen is ignored in the text entered by the user.
+   * Set this to false if you want the rule to offer suggestions for words like
+   * [ro] "câte-și-trei" (with hyphen), not only for "câte și trei" (with spaces).
+   * This is only available for languages with hyphen as a word separator
+   * (i.e. not available for English, available for Romanian).
+   * See Language.getWordTokenizer()
+   */
+  private boolean hyphenIgnored = true;
+
+  /**
+   * Creates the rule and, if a bundle is given, puts it into the "category_misc" category.
+   *
+   * @param messages resource bundle providing the category name; may be <code>null</code>
+   * @throws IOException declared so that subclasses can load their data files in the constructor
+   */
+  public AbstractCompoundRule(final ResourceBundle messages) throws IOException {
+    if (messages != null) {
+      super.setCategory(new Category(messages.getString("category_misc")));
+    }
+  }
+
+  public abstract String getId();
+
+  public abstract String getDescription();
+
+  /**
+   * Sets the short description that is attached to every match of this rule.
+   */
+  public void setShort(final String shortDescription) {
+    shortDesc = shortDescription;
+  }
+
+  /**
+   * Sets the messages used for the three kinds of suggestions.
+   *
+   * @param withHyphenMessage message used when only the hyphenated spelling is suggested
+   * @param asOneMessage message used when only the merged spelling is suggested
+   * @param withHyphenOrNotMessage message used when both spellings (or none) are suggested
+   */
+  public void setMsg(final String withHyphenMessage, final String asOneMessage, final String withHyphenOrNotMessage) {
+    this.withHyphenMessage = withHyphenMessage;
+    this.asOneMessage = asOneMessage;
+    withOrWithoutHyphenMessage = withHyphenOrNotMessage;
+  }
+
+  public boolean isHyphenIgnored() {
+    return hyphenIgnored;
+  }
+
+  public void setHyphenIgnored(boolean ignoreHyphen) {
+    this.hyphenIgnored = ignoreHyphen;
+  }
+
+  public int getMaxUnHyphenatedWordCount() {
+    return maxUnHyphenatedWordCount;
+  }
+
+  public void setMaxUnHyphenatedWordCount(int maxNoHyphensSize) {
+    this.maxUnHyphenatedWordCount = maxNoHyphensSize;
+  }
+
+  /**
+   * Slides a window of at most {@link #MAX_TERMS} tokens over the sentence and reports
+   * every token sequence whose normalized form is contained in the loaded compound list.
+   * For each window the longest candidate is tried first, and a match that starts at the
+   * same position as the previously found one is not reported again.
+   *
+   * @param text the sentence to check
+   * @return the matches found, possibly empty
+   */
+  public RuleMatch[] match(final AnalyzedSentence text) {
+    final List<RuleMatch> ruleMatches = new ArrayList<RuleMatch>();
+    final AnalyzedTokenReadings[] tokens = text.getTokensWithoutWhitespace();
+    if (tokens.length == 0) {
+      // nothing to check; without this guard the padding branch below would
+      // call peek() on an empty queue and fail with a NullPointerException
+      return toRuleMatchArray(ruleMatches);
+    }
+
+    RuleMatch prevRuleMatch = null;
+    final Queue<AnalyzedTokenReadings> prevTokens = new ArrayBlockingQueue<AnalyzedTokenReadings>(MAX_TERMS);
+    for (int i = 0; i < tokens.length + MAX_TERMS - 1; i++) {
+      final AnalyzedTokenReadings token;
+      // we need to extend the token list so we find matches at the end of the original list:
+      if (i >= tokens.length) {
+        token = new AnalyzedTokenReadings(new AnalyzedToken("", "", null), prevTokens.peek().getStartPos());
+      } else {
+        token = tokens[i];
+      }
+      if (i == 0) {
+        addToQueue(token, prevTokens);
+        continue;
+      }
+
+      final StringBuilder sb = new StringBuilder();
+      int j = 0;
+      AnalyzedTokenReadings firstMatchToken = null;
+      final List<String> stringsToCheck = new ArrayList<String>();
+      final List<String> origStringsToCheck = new ArrayList<String>(); // original upper/lowercase spelling
+      // maps each normalized candidate to the last token it covers (first occurrence wins)
+      final Map<String, AnalyzedTokenReadings> stringToToken = new HashMap<String, AnalyzedTokenReadings>();
+      for (AnalyzedTokenReadings atr : prevTokens) {
+        if (j == 0) {
+          firstMatchToken = atr;
+        }
+        sb.append(' ');
+        sb.append(atr.getToken());
+        if (j >= 1) {
+          final String stringToCheck = normalize(sb.toString());
+          stringsToCheck.add(stringToCheck);
+          origStringsToCheck.add(sb.toString().trim());
+          if (!stringToToken.containsKey(stringToCheck)) {
+            stringToToken.put(stringToCheck, atr);
+          }
+        }
+        j++;
+      }
+      // iterate backwards over all potentially incorrect strings to make
+      // sure we match longer strings first:
+      for (int k = stringsToCheck.size() - 1; k >= 0; k--) {
+        final String stringToCheck = stringsToCheck.get(k);
+        final String origStringToCheck = origStringsToCheck.get(k);
+        if (incorrectCompounds.contains(stringToCheck)) {
+          final AnalyzedTokenReadings atr = stringToToken.get(stringToCheck);
+          String msg = null;
+          final List<String> replacement = new ArrayList<String>();
+          if (!noDashSuggestion.contains(stringToCheck)) {
+            replacement.add(origStringToCheck.replace(' ', '-'));
+            msg = withHyphenMessage;
+          }
+          // assume that compounds with more than maxUnHyphenatedWordCount (default: two) parts should always use hyphens:
+          if (!hasAllUppercaseParts(origStringToCheck) && countParts(stringToCheck) <= getMaxUnHyphenatedWordCount()
+              && !onlyDashSuggestion.contains(stringToCheck)) {
+            replacement.add(mergeCompound(origStringToCheck));
+            msg = asOneMessage;
+          }
+          final String[] parts = stringToCheck.split(" ");
+          if (parts.length > 0 && parts[0].length() == 1) {
+            // a single-letter first part: only the hyphenated form is offered
+            replacement.clear();
+            replacement.add(origStringToCheck.replace(' ', '-'));
+            msg = withHyphenMessage;
+          } else if (replacement.isEmpty() || replacement.size() == 2) { // isEmpty shouldn't happen
+            msg = withOrWithoutHyphenMessage;
+          }
+          final RuleMatch ruleMatch = new RuleMatch(this, firstMatchToken.getStartPos(),
+              atr.getStartPos() + atr.getToken().length(), msg, shortDesc);
+          // avoid duplicate matches:
+          if (prevRuleMatch != null && prevRuleMatch.getFromPos() == ruleMatch.getFromPos()) {
+            prevRuleMatch = ruleMatch;
+            break;
+          }
+          prevRuleMatch = ruleMatch;
+          ruleMatch.setSuggestedReplacements(replacement);
+          ruleMatches.add(ruleMatch);
+          break;
+        }
+      }
+      addToQueue(token, prevTokens);
+    }
+    return toRuleMatchArray(ruleMatches);
+  }
+
+  /**
+   * Trims and lower-cases the candidate and, if it contains both a hyphen and a space,
+   * replaces hyphens so that the result can be looked up in the compound list.
+   */
+  private String normalize(final String inStr) {
+    String str = inStr.trim().toLowerCase();
+    if (str.indexOf('-') != -1 && str.indexOf(' ') != -1) {
+      if (isHyphenIgnored()) {
+        // e.g. "E-Mail Adresse" -> "E Mail Adresse" so the error can be detected:
+        str = str.replace('-', ' ');
+      } else {
+        str = str.replace(" - ", " ");
+      }
+    }
+    return str;
+  }
+
+  /**
+   * Returns true as soon as one space-separated part is written completely in
+   * upper case (despite the method name, a single such part is sufficient).
+   */
+  private boolean hasAllUppercaseParts(final String str) {
+    final String[] parts = str.split(" ");
+    for (String part : parts) {
+      if (isHyphenIgnored() || !"-".equals(part)) { // do not treat '-' as an upper-case word
+        if (StringTools.isAllUppercase(part)) {
+          return true;
+        }
+      }
+    }
+    return false;
+  }
+
+  /** Returns the number of space-separated parts in the given string. */
+  private int countParts(final String str) {
+    return str.split(" ").length;
+  }
+
+  /**
+   * Joins the space-separated parts into one word, keeping the first part as is
+   * and lower-casing all following parts.
+   */
+  private String mergeCompound(final String str) {
+    final String[] stringParts = str.split(" ");
+    final StringBuilder sb = new StringBuilder();
+    for (int k = 0; k < stringParts.length; k++) {
+      if (isHyphenIgnored() || !"-".equals(stringParts[k])) {
+        if (k == 0) {
+          sb.append(stringParts[k]);
+        } else {
+          sb.append(stringParts[k].toLowerCase());
+        }
+      }
+    }
+    return sb.toString();
+  }
+
+  /** Appends the token to the bounded queue, dropping the oldest token if the queue is full. */
+  private void addToQueue(final AnalyzedTokenReadings token, final Queue<AnalyzedTokenReadings> prevTokens) {
+    final boolean inserted = prevTokens.offer(token);
+    if (!inserted) {
+      prevTokens.poll();
+      prevTokens.offer(token);
+    }
+  }
+
+  /**
+   * Loads the compound list. Each non-empty line that does not start with "#" is one
+   * compound whose parts are separated by spaces or hyphens. A trailing "+" means that
+   * no hyphenated suggestion is made for the entry, a trailing "*" means that only the
+   * hyphenated suggestion is made. The stream is closed before this method returns.
+   *
+   * @param file stream to read the compound list from
+   * @param encoding name of the character encoding of the stream
+   * @throws IOException if reading fails, the encoding is not supported, a line has
+   *         more than {@link #MAX_TERMS} parts or a line consists of a single part
+   */
+  public void loadCompoundFile(final InputStream file, final String encoding) throws IOException {
+    BufferedReader br = null;
+    try {
+      br = new BufferedReader(new InputStreamReader(file, encoding));
+      String line;
+      while ((line = br.readLine()) != null) {
+        line = line.trim();
+        if (line.length() < 1) {
+          continue;
+        }
+        if (line.charAt(0) == '#') { // ignore comments
+          continue;
+        }
+        // the set contains the incorrect spellings, i.e. the ones without hyphen
+        line = line.replace('-', ' ');
+        final String[] parts = line.split(" ");
+        if (parts.length > MAX_TERMS) {
+          throw new IOException("Too many compound parts: " + line + ", maximum allowed: " + MAX_TERMS);
+        }
+        if (parts.length == 1) {
+          throw new IOException("Not a compound: " + line);
+        }
+        if (line.endsWith("+")) {
+          line = line.substring(0, line.length() - 1); // cut off "+"
+          noDashSuggestion.add(line.toLowerCase());
+        } else if (line.endsWith("*")) {
+          line = line.substring(0, line.length() - 1); // cut off "*"
+          onlyDashSuggestion.add(line.toLowerCase());
+        }
+        incorrectCompounds.add(line.toLowerCase());
+      }
+    } finally {
+      if (br != null) {
+        // closing the buffered reader also closes the wrapped reader and stream
+        br.close();
+      } else if (file != null) {
+        // the reader could not be created (e.g. unsupported encoding),
+        // so the raw stream has to be closed here to avoid a leak
+        file.close();
+      }
+    }
+  }
+
+  public void reset() {
+  }
+
+}
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/AbstractPunctuationCheckRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/AbstractPunctuationCheckRule.java
new file mode 100644
index 0000000..89d216b
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/AbstractPunctuationCheckRule.java
@@ -0,0 +1,93 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+package de.danielnaber.languagetool.rules;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.ResourceBundle;
+
+import de.danielnaber.languagetool.AnalyzedSentence;
+import de.danielnaber.languagetool.AnalyzedTokenReadings;
+
+/**
+ * A rule that matches "..", "::", "-," but not "...", "!..", "?!!", ",-" etc.
+ * Languages will have to subclass it and override <code>isPunctsJoinOk()</code>
+ * and <code>isPunctuation()</code> to provide language-specific checking
+ *
+ * @author Andriy Rysin
+ */
+public abstract class AbstractPunctuationCheckRule extends Rule {
+
+  /**
+   * Creates the rule and assigns it to the "category_misc" category.
+   *
+   * @param messages resource bundle that provides the category name
+   */
+  public AbstractPunctuationCheckRule(final ResourceBundle messages) {
+    super(messages);
+    super.setCategory(new Category(messages.getString("category_misc")));
+  }
+
+  public String getId() {
+    return "PUNCTUATION_GENERIC_CHECK";
+  }
+
+  public String getDescription() {
+    return "Use of unusual combination of punctuation characters";
+  }
+
+  /**
+   * Decides whether the given run of adjacent punctuation tokens is acceptable.
+   *
+   * @param tkns the concatenated punctuation tokens (at least two characters long)
+   */
+  protected abstract boolean isPunctsJoinOk(String tkns);
+
+  /**
+   * Decides whether the given token counts as punctuation for this rule.
+   */
+  protected abstract boolean isPunctuation(String token);
+
+  /**
+   * Collects runs of adjacent punctuation tokens and reports every run of two or
+   * more characters that {@link #isPunctsJoinOk(String)} rejects. The first
+   * character of the run is offered as replacement.
+   */
+  public RuleMatch[] match(final AnalyzedSentence text) {
+    final AnalyzedTokenReadings[] tokens = text.getTokens();
+    final List<RuleMatch> found = new ArrayList<RuleMatch>();
+
+    final StringBuilder run = new StringBuilder(); // punctuation collected so far
+    int runStart = -1;                             // index of the first token of the run
+    final int lastIdx = tokens.length - 1;
+    for (int idx = 0; idx <= lastIdx; idx++) {
+      final String current = tokens[idx].getToken();
+
+      if (isPunctuation(current)) {
+        run.append(current);
+        if (runStart < 0) {
+          runStart = idx;
+        }
+        // keep collecting unless this is the last token of the sentence
+        if (idx != lastIdx) {
+          continue;
+        }
+      }
+
+      if (run.length() >= 2) {
+        final String joined = run.toString();
+        if (!isPunctsJoinOk(joined)) {
+          final int from = tokens[runStart].getStartPos();
+          final String msg = "bad duplication or combination of punctuation signs";
+          final RuleMatch ruleMatch = new RuleMatch(this, from, from + joined.length(), msg,
+              "Punctuation problem");
+          ruleMatch.setSuggestedReplacement(joined.substring(0, 1));
+          found.add(ruleMatch);
+        }
+      }
+      run.setLength(0);
+      runStart = -1;
+    }
+
+    return toRuleMatchArray(found);
+  }
+
+  public void reset() {
+    // nothing
+  }
+
+} \ No newline at end of file
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/AbstractSimpleReplaceRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/AbstractSimpleReplaceRule.java
new file mode 100644
index 0000000..13288a2
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/AbstractSimpleReplaceRule.java
@@ -0,0 +1,159 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+package de.danielnaber.languagetool.rules;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+import java.util.ResourceBundle;
+
+import de.danielnaber.languagetool.AnalyzedSentence;
+import de.danielnaber.languagetool.AnalyzedTokenReadings;
+import de.danielnaber.languagetool.JLanguageTool;
+import de.danielnaber.languagetool.tools.StringTools;
+
+/**
+ * A rule that matches words or phrases which should not be used and suggests
+ * correct ones instead. Loads the relevant words from
+ * <code>rules/XX/replace.txt</code>, where XX is a code of the language.
+ *
+ * @author Andriy Rysin
+ */
+public abstract class AbstractSimpleReplaceRule extends Rule {
+
+  private static final String FILE_ENCODING = "utf-8";
+
+  /** Maps a wrong word to its replacement, e.g. "врешті решт" -> "зрештою". */
+  private final Map<String, String> wrongWords;
+
+  /**
+   * @return the name of the replacement file, resolved relative to the rules directory
+   */
+  public abstract String getFileName();
+
+  /**
+   * @return the name of the character encoding used to read the replacement file
+   */
+  public String getEncoding() {
+    return FILE_ENCODING;
+  }
+
+  /**
+   * Indicates if the rule is case-sensitive. Default value is <code>true</code>.
+   * @return true if the rule is case-sensitive, false otherwise.
+   */
+  public boolean isCaseSensitive() {
+    return true;
+  }
+
+  /**
+   * @return the locale used for case conversion when {@link #isCaseSensitive()} is set to <code>false</code>.
+   */
+  public Locale getLocale() {
+    return Locale.getDefault();
+  }
+
+  /**
+   * Creates the rule and loads the replacement list named by {@link #getFileName()}.
+   *
+   * @param messages resource bundle providing the category name; may be <code>null</code>
+   * @throws IOException if the replacement file cannot be read or is malformed
+   */
+  public AbstractSimpleReplaceRule(final ResourceBundle messages) throws IOException {
+    if (messages != null) {
+      super.setCategory(new Category(messages.getString("category_misc")));
+    }
+    wrongWords = loadWords(JLanguageTool.getDataBroker().getFromRulesDirAsStream(getFileName()));
+  }
+
+  public String getId() {
+    return "SIMPLE_REPLACE";
+  }
+
+  public String getDescription() {
+    return "Checks for wrong words/phrases";
+  }
+
+  /**
+   * @return the text placed between the wrong word and its replacement in the match message
+   */
+  public String getSuggestion() {
+    return " is not valid, use ";
+  }
+
+  public String getShort() {
+    return "Wrong word";
+  }
+
+  /**
+   * Reports every token (the token at index 0 is skipped) that is listed as a wrong word.
+   * If the rule is not case-sensitive, the lookup uses the lower-cased token and the
+   * suggestion is capitalized when the token starts with an upper-case letter.
+   */
+  public final RuleMatch[] match(final AnalyzedSentence text) {
+    final List<RuleMatch> ruleMatches = new ArrayList<RuleMatch>();
+    final AnalyzedTokenReadings[] tokens = text.getTokensWithoutWhitespace();
+
+    for (int i = 1; i < tokens.length; i++) {
+      final String token = tokens[i].getToken();
+
+      final String key = isCaseSensitive() ? token : token.toLowerCase(getLocale());
+      final String replacement = wrongWords.get(key);
+      if (replacement != null) {
+        final String msg = token + getSuggestion() + replacement;
+        final int pos = tokens[i].getStartPos();
+        final RuleMatch potentialRuleMatch = new RuleMatch(this, pos, pos
+            + token.length(), msg, getShort());
+        if (!isCaseSensitive() && StringTools.startsWithUppercase(token)) {
+          potentialRuleMatch.setSuggestedReplacement(StringTools.uppercaseFirstChar(replacement));
+        } else {
+          potentialRuleMatch.setSuggestedReplacement(replacement);
+        }
+        ruleMatches.add(potentialRuleMatch);
+      }
+    }
+    return toRuleMatchArray(ruleMatches);
+  }
+
+
+  /**
+   * Reads "wrong=replacement" pairs, one per line. Empty lines and lines starting
+   * with "#" are ignored. The stream is closed before this method returns.
+   *
+   * @param file stream to read the pairs from
+   * @return map from wrong word to replacement
+   * @throws IOException if reading fails, the encoding is not supported or a line
+   *         does not consist of exactly two parts separated by "="
+   */
+  private Map<String, String> loadWords(final InputStream file) throws IOException {
+    final Map<String, String> map = new HashMap<String, String>();
+    BufferedReader br = null;
+    try {
+      br = new BufferedReader(new InputStreamReader(file, getEncoding()));
+      String line;
+
+      while ((line = br.readLine()) != null) {
+        line = line.trim();
+        if (line.length() < 1) {
+          continue;
+        }
+        if (line.charAt(0) == '#') { // ignore comments
+          continue;
+        }
+        final String[] parts = line.split("=");
+        if (parts.length != 2) {
+          throw new IOException("Format error in file "
+              + JLanguageTool.getDataBroker().getFromRulesDirAsUrl(getFileName()) + ", line: " + line);
+        }
+        map.put(parts[0], parts[1]);
+      }
+
+    } finally {
+      if (br != null) {
+        // closing the buffered reader also closes the wrapped reader and stream
+        br.close();
+      } else if (file != null) {
+        // the reader could not be created (e.g. unsupported encoding),
+        // so the raw stream has to be closed here to avoid a leak
+        file.close();
+      }
+    }
+    return map;
+  }
+
+  public void reset() {
+  }
+
+}
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/Category.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/Category.java
new file mode 100644
index 0000000..95a3b44
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/Category.java
@@ -0,0 +1,85 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2006 Daniel Naber (http://www.danielnaber.de)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+package de.danielnaber.languagetool.rules;
+
+/**
+ * A rule's category. Categories are used to group rules for
+ * a better overview.
+ *
+ * @author Daniel Naber
+ */
+public class Category {
+
+  private static final int DEFAULT_PRIORITY = 50;
+
+  private final String name;
+  private final int priority;
+  private boolean defaultOff;
+
+  /**
+   * Creates a category with an explicit priority.
+   * @param name name of the category
+   * @param priority a value between 0 and 100 (inclusive)
+   * @throws IllegalArgumentException if the priority is outside of that range
+   */
+  public Category(final String name, final int priority) {
+    final boolean inRange = priority >= 0 && priority <= 100;
+    if (!inRange) {
+      throw new IllegalArgumentException("priority must be in range 0 - 100");
+    }
+    this.priority = priority;
+    this.name = name;
+  }
+
+  /**
+   * Creates a category that uses the default priority (50).
+   * @param name name of the category
+   */
+  public Category(final String name) {
+    this(name, DEFAULT_PRIORITY);
+  }
+
+  /**
+   * @return the name given at construction time
+   */
+  public String getName() {
+    return this.name;
+  }
+
+  /**
+   * @return the priority given at construction time, or 50 if none was given
+   */
+  public int getPriority() {
+    return this.priority;
+  }
+
+  /**
+   * @return the name followed by the priority, in the form <code>name(prio=N)</code>
+   */
+  public String toString() {
+    final StringBuilder text = new StringBuilder();
+    text.append(name);
+    text.append("(prio=");
+    text.append(priority);
+    text.append(")");
+    return text.toString();
+  }
+
+  /**
+   * Tells whether the category author has switched
+   * this category off by default.
+   * @return True if the category is turned off by
+   * default.
+   */
+  public final boolean isDefaultOff() {
+    return this.defaultOff;
+  }
+
+  /**
+   * Marks the category as switched off by default.
+   **/
+  public final void setDefaultOff() {
+    this.defaultOff = true;
+  }
+
+
+}
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/CommaWhitespaceRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/CommaWhitespaceRule.java
new file mode 100644
index 0000000..0636a1f
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/CommaWhitespaceRule.java
@@ -0,0 +1,170 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+package de.danielnaber.languagetool.rules;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.ResourceBundle;
+
+import de.danielnaber.languagetool.AnalyzedSentence;
+import de.danielnaber.languagetool.AnalyzedTokenReadings;
+
+/**
+ * A rule that matches commas and closing parenthesis preceded by whitespace and
+ * opening parenthesis followed by whitespace.
+ *
+ * @author Daniel Naber
+ */
+
+public class CommaWhitespaceRule extends Rule {
+
+  /**
+   * Creates the rule and assigns it to the "category_misc" category.
+   *
+   * @param messages resource bundle that provides the category name and the match messages
+   */
+  public CommaWhitespaceRule(final ResourceBundle messages) {
+    super(messages);
+    super.setCategory(new Category(messages.getString("category_misc")));
+  }
+
+  public final String getId() {
+    return "COMMA_PARENTHESIS_WHITESPACE";
+  }
+
+  public final String getDescription() {
+    return messages.getString("desc_comma_whitespace");
+  }
+
+  /**
+   * Walks over all tokens (including whitespace) and reports
+   * <ul>
+   * <li>whitespace after an opening bracket,</li>
+   * <li>a missing space after a comma (unless numbers, quotes, hyphens or another comma are involved),</li>
+   * <li>whitespace before a closing bracket, a comma or a dot (with exceptions for a
+   * doubled comma and for a dot followed by a digit or another dot).</li>
+   * </ul>
+   * Each match starts at the previous token and carries one suggested replacement.
+   */
+  public final RuleMatch[] match(final AnalyzedSentence text) {
+    final List<RuleMatch> ruleMatches = new ArrayList<RuleMatch>();
+    final AnalyzedTokenReadings[] tokens = text.getTokens();
+    String prevToken = "";
+    String prevPrevToken = "";
+    boolean prevWhite = false;
+    int prevLen = 0;
+    for (int i = 0; i < tokens.length; i++) {
+      final String token = tokens[i].getToken();
+      final boolean isWhite = tokens[i].isWhitespace()
+          || tokens[i].isFieldCode();
+      String msg = null;
+      // number of characters of the current token that belong to the match
+      int fixLen = 0;
+      String suggestionText = null;
+      if (isWhite && isLeftBracket(prevToken)) {
+        msg = messages.getString("no_space_after");
+        suggestionText = prevToken;
+        fixLen = 1;
+      } else if (!isWhite && prevToken.equals(",")
+          && isNotQuoteOrHyphen(token)
+          && containsNoNumber(prevPrevToken)
+          && containsNoNumber(token)
+          && !",".equals(prevPrevToken)) {
+        msg = messages.getString("missing_space_after_comma");
+        suggestionText = ", ";
+      } else if (prevWhite) {
+        if (isRightBracket(token)) {
+          msg = messages.getString("no_space_before");
+          suggestionText = token;
+          fixLen = 1;
+        } else if (token.equals(",")) {
+          msg = messages.getString("space_after_comma");
+          suggestionText = ",";
+          fixLen = 1;
+          //exception for duplicated comma (we already have another rule for that)
+          if (i + 1 < tokens.length
+              && ",".equals(tokens[i + 1].getToken())) {
+            msg = null;
+          }
+        } else if (token.equals(".")) {
+          msg = messages.getString("no_space_before_dot");
+          suggestionText = ".";
+          fixLen = 1;
+          // exception case for figures such as ".5" and ellipsis
+          if (i + 1 < tokens.length
+              && isNumberOrDot(tokens[i + 1].getToken())) {
+            msg = null;
+          }
+        }
+      }
+      if (msg != null) {
+        // every branch above needs a non-empty previous token or prevWhite, so i >= 1 here
+        final int fromPos = tokens[i - 1].getStartPos();
+        final int toPos = tokens[i - 1].getStartPos() + fixLen + prevLen;
+        // TODO: add some good short comment here
+        final RuleMatch ruleMatch = new RuleMatch(this, fromPos, toPos, msg);
+        ruleMatch.setSuggestedReplacement(suggestionText);
+        ruleMatches.add(ruleMatch);
+      }
+      prevPrevToken = prevToken;
+      prevToken = token;
+      prevWhite = isWhite && !tokens[i].isFieldCode(); //OOo code before comma/dot
+      prevLen = tokens[i].getToken().length();
+    }
+
+    return toRuleMatchArray(ruleMatches);
+  }
+
+  /**
+   * Returns false for a single quote, hyphen or comma character, for the string
+   * "&quot" and for any longer (or empty) string that contains a digit; true otherwise.
+   */
+  static boolean isNotQuoteOrHyphen(final String str) {
+    if (str.length() == 1) {
+      final char c = str.charAt(0);
+      if (c =='\'' || c == '-' || c == '”'
+        || c =='’' || c == '"' || c == '“'
+        || c == ',') {
+        return false;
+      }
+    } else {
+      if ("&quot".equals(str)) {
+        return false;
+      }
+      return containsNoNumber(str);
+    }
+    return true;
+  }
+
+  /**
+   * Returns true if the string starts with a dot or a digit; an empty string yields false.
+   */
+  static boolean isNumberOrDot(final String str) {
+    if (str.length() == 0) {
+      // same guard as isLeftBracket/isRightBracket: charAt(0) would throw otherwise
+      return false;
+    }
+    final char c = str.charAt(0);
+    return (c == '.' || Character.isDigit(c));
+  }
+
+  /**
+   * Returns true if the string starts with '(', '[' or '{'; an empty string yields false.
+   */
+  static boolean isLeftBracket(final String str) {
+    if (str.length() == 0) {
+      return false;
+    }
+    final char c = str.charAt(0);
+    return (c == '(' || c == '[' || c == '{');
+  }
+
+  /**
+   * Returns true if the string starts with ')', ']' or '}'; an empty string yields false.
+   */
+  static boolean isRightBracket(final String str) {
+    if (str.length() == 0) {
+      return false;
+    }
+    final char c = str.charAt(0);
+    return (c == ')' || c == ']' || c == '}');
+  }
+
+  /**
+   * Returns true if no character of the string is a digit (true for the empty string).
+   */
+  static boolean containsNoNumber(final String str) {
+    for (int i = 0; i < str.length(); i++) {
+      if (Character.isDigit(str.charAt(i))) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  public void reset() {
+    // nothing
+  }
+
+}
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/DoublePunctuationRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/DoublePunctuationRule.java
new file mode 100644
index 0000000..3a6a4e1
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/DoublePunctuationRule.java
@@ -0,0 +1,99 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+package de.danielnaber.languagetool.rules;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.ResourceBundle;
+
+import de.danielnaber.languagetool.AnalyzedSentence;
+import de.danielnaber.languagetool.AnalyzedTokenReadings;
+
+/**
+ * A rule that matches ".." (but not "..." etc) and ",,".
+ *
+ * @author Daniel Naber
+ */
+public class DoublePunctuationRule extends Rule {
+
+ public DoublePunctuationRule(final ResourceBundle messages) {
+ super(messages);
+ super.setCategory(new Category(messages.getString("category_misc")));
+ }
+
+ public final String getId() {
+ return "DOUBLE_PUNCTUATION";
+ }
+
+ public final String getDescription() {
+ return messages.getString("desc_double_punct");
+ }
+
+ public final RuleMatch[] match(final AnalyzedSentence text) {
+ final List<RuleMatch> ruleMatches = new ArrayList<RuleMatch>();
+ final AnalyzedTokenReadings[] tokens = text.getTokens();
+ int startPos = 0;
+ int dotCount = 0;
+ int commaCount = 0;
+ for (int i = 0; i < tokens.length; i++) {
+ final String token = tokens[i].getToken();
+ String nextToken = null;
+ if (i < tokens.length - 1) {
+ nextToken = tokens[i + 1].getToken();
+ }
+ if (".".equals(token)) {
+ dotCount++;
+ commaCount = 0;
+ startPos = tokens[i].getStartPos();
+ } else if (",".equals(token)) {
+ commaCount++;
+ dotCount = 0;
+ startPos = tokens[i].getStartPos();
+ }
+ if (dotCount == 2 && !".".equals(nextToken)) {
+ final String msg = messages.getString("two_dots");
+ final int fromPos = Math.max(0, startPos - 1);
+ final RuleMatch ruleMatch = new RuleMatch(this, fromPos, startPos + 1,
+ msg, messages.getString("double_dots_short"));
+ ruleMatch.setSuggestedReplacement(".");
+ ruleMatches.add(ruleMatch);
+ dotCount = 0;
+ } else if (commaCount == 2 && !",".equals(nextToken)) {
+ final String msg = messages.getString("two_commas");
+ final int fromPos = Math.max(0, startPos);
+ final RuleMatch ruleMatch = new RuleMatch(this, fromPos, startPos + 1,
+ msg, messages.getString("double_commas_short"));
+ ruleMatch.setSuggestedReplacement(",");
+ ruleMatches.add(ruleMatch);
+ commaCount = 0;
+ }
+ if (!".".equals(token) && !",".equals(token)) {
+ dotCount = 0;
+ commaCount = 0;
+ }
+ }
+
+ return toRuleMatchArray(ruleMatches);
+ }
+
+ public void reset() {
+ // nothing
+ }
+
+}
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/GenericUnpairedBracketsRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/GenericUnpairedBracketsRule.java
new file mode 100644
index 0000000..a2cd35c
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/GenericUnpairedBracketsRule.java
@@ -0,0 +1,314 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2009 Daniel Naber (http://www.danielnaber.de)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+
+package de.danielnaber.languagetool.rules;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.ResourceBundle;
+import java.util.regex.Pattern;
+
+import de.danielnaber.languagetool.AnalyzedSentence;
+import de.danielnaber.languagetool.AnalyzedTokenReadings;
+import de.danielnaber.languagetool.Language;
+import de.danielnaber.languagetool.tools.UnsyncStack;
+import de.danielnaber.languagetool.tools.SymbolLocator;
+
+/**
+ * Rule that finds unpaired quotes, brackets etc.
+ *
+ * @author Marcin Miłkowski
+ */
+public class GenericUnpairedBracketsRule extends Rule {
+
+ /**
+ * Note that there must be equal length of both arrays, and the sequence of
+ * starting symbols must match exactly the sequence of ending symbols.
+ */
+ private static final String[] START_SYMBOLS = { "[", "(", "{", "\"", "'" };
+ private static final String[] END_SYMBOLS = { "]", ")", "}", "\"", "'" };
+
+ protected String[] startSymbols;
+ protected String[] endSymbols;
+
+ private static final String[] SL_START_SYMBOLS = { "[", "(", "{", "„", "»", "\"" };
+ private static final String[] SL_END_SYMBOLS = { "]", ")", "}", "”", "«", "\"" };
+
+ private static final String[] SK_START_SYMBOLS = { "[", "(", "{", "„", "»", "\"" };
+ private static final String[] SK_END_SYMBOLS = { "]", ")", "}", "“", "«", "\"" };
+
+ private static final String[] RO_START_SYMBOLS = { "[", "(", "{", "„", "«" };
+ private static final String[] RO_END_SYMBOLS = { "]", ")", "}", "”", "»" };
+
+ private static final String[] FR_START_SYMBOLS = { "[", "(", "{", "«", /*"‘"*/ };
+ private static final String[] FR_END_SYMBOLS = { "]", ")", "}", "»", /*"’" used in "d’arm" and many other words */ };
+
+ private static final String[] DE_START_SYMBOLS = { "[", "(", "{", "„", "»", "‘" };
+ private static final String[] DE_END_SYMBOLS = { "]", ")", "}", "“", "«", "’" };
+
+ private static final String[] GL_START_SYMBOLS = { "[", "(", "{", "“", "«", "‘", "\"", "'" };
+ private static final String[] GL_END_SYMBOLS = { "]", ")", "}", "”", "»", "’", "\"", "'" };
+
+ private static final String[] ES_START_SYMBOLS = { "[", "(", "{", "“", "«", "¿", "¡" };
+ private static final String[] ES_END_SYMBOLS = { "]", ")", "}", "”", "»", "?", "!" };
+
+ private static final String[] UK_START_SYMBOLS = { "[", "(", "{", "„", "«" };
+ private static final String[] UK_END_SYMBOLS = { "]", ")", "}", "“", "»" };
+
+ private static final String[] NL_START_SYMBOLS = { "[", "(", "{", "“", "\u2039", "\u201c", "\u201e" };
+ private static final String[] NL_END_SYMBOLS = { "]", ")", "}", "”", "\u203a", "\u201d", "\u201d" };
+
+ private static final String[] IT_START_SYMBOLS = { "[", "(", "{", "»", /*"‘"*/ };
+ private static final String[] IT_END_SYMBOLS = { "]", ")", "}", "«", /*"’"*/ };
+
+ private static final String[] DK_START_SYMBOLS = { "[", "(", "{", "\"", "”" };
+ private static final String[] DK_END_SYMBOLS = { "]", ")", "}", "\"", "”" };
+
+
+
+ /**
+ * The stack for pairing symbols.
+ */
+ protected final UnsyncStack<SymbolLocator> symbolStack = new UnsyncStack<SymbolLocator>();
+
+ /**
+ * Stack of rule matches.
+ */
+ private final UnsyncStack<RuleMatchLocator> ruleMatchStack = new UnsyncStack<RuleMatchLocator>();
+
+ private boolean endOfParagraph;
+
+ private final Language ruleLang;
+
+ private static final Pattern PUNCTUATION = Pattern.compile("\\p{Punct}");
+ private static final Pattern PUNCTUATION_NO_DOT = Pattern
+ .compile("[\\p{Punct}&&[^\\.]]");
+ private static final Pattern NUMERALS = Pattern
+ .compile("(?i)\\d{1,2}?[a-z']*|M*(D?C{0,3}|C[DM])(L?X{0,3}|X[LC])(V?I{0,3}|I[VX])$");
+
+ private int ruleMatchIndex;
+ private List<RuleMatch> ruleMatches;
+
+ /**
+ * Creates the rule, enables paragraph back-tracking and selects the pair of
+ * start/end symbol tables for the given language. Languages without a
+ * dedicated table use START_SYMBOLS / END_SYMBOLS.
+ *
+ * @param messages
+ * resource bundle providing the category name and messages
+ * @param language
+ * language whose symbol tables are used; it is dereferenced, so it
+ * must not be null
+ */
+ public GenericUnpairedBracketsRule(final ResourceBundle messages,
+ final Language language) {
+ super(messages);
+ super.setCategory(new Category(messages.getString("category_misc")));
+
+ setParagraphBackTrack(true);
+ if (language.equals(Language.SLOVAK)) {
+ startSymbols = SK_START_SYMBOLS;
+ endSymbols = SK_END_SYMBOLS; }
+ else if (language.equals(Language.SLOVENIAN)) {
+ startSymbols = SL_START_SYMBOLS;
+ endSymbols = SL_END_SYMBOLS;
+ } else if (language.equals(Language.FRENCH)) {
+ startSymbols = FR_START_SYMBOLS;
+ endSymbols = FR_END_SYMBOLS;
+ } else if (language.equals(Language.GERMAN)) {
+ startSymbols = DE_START_SYMBOLS;
+ endSymbols = DE_END_SYMBOLS;
+ } else if (language.equals(Language.GALICIAN)) {
+ startSymbols = GL_START_SYMBOLS;
+ endSymbols = GL_END_SYMBOLS;
+ } else if (language.equals(Language.DUTCH)) {
+ startSymbols = NL_START_SYMBOLS;
+ endSymbols = NL_END_SYMBOLS;
+ } else if (language.equals(Language.SPANISH)) {
+ startSymbols = ES_START_SYMBOLS;
+ endSymbols = ES_END_SYMBOLS;
+ } else if (language.equals(Language.UKRAINIAN)) {
+ startSymbols = UK_START_SYMBOLS;
+ endSymbols = UK_END_SYMBOLS;
+ } else if (language.equals(Language.ITALIAN)) {
+ startSymbols = IT_START_SYMBOLS;
+ endSymbols = IT_END_SYMBOLS;
+ } else if (language.equals(Language.ROMANIAN)) {
+ startSymbols = RO_START_SYMBOLS;
+ endSymbols = RO_END_SYMBOLS;
+ } else if (language.equals(Language.DANISH)) {
+ startSymbols = DK_START_SYMBOLS;
+ endSymbols = DK_END_SYMBOLS;
+ } else {
+ startSymbols = START_SYMBOLS;
+ endSymbols = END_SYMBOLS;
+ }
+
+ ruleLang = language;
+ }
+
+ public String getId() {
+ return "UNPAIRED_BRACKETS";
+ }
+
+ public String getDescription() {
+ return messages.getString("desc_unpaired_brackets");
+ }
+
+ /**
+ * Generic method to specify an exception. For unspecified
+ * language, it simply returns true, which means no exception.
+ * @param token
+ * String token
+ * @param tokens
+ * Sentence tokens
+ * @param i
+ * Current token index
+ * @param j
+ * Index of the matched symbol pair in the start/end symbol arrays
+ * @param precSpace
+ * boolean: is preceded with space
+ * @param follSpace
+ * boolean: is followed with space
+ * @return true if no exception applies, i.e. the symbol is handled as a
+ * regular opening/closing symbol; false makes match() ignore it.
+ * This base implementation always returns true.
+ */
+ protected boolean isNoException(final String token,
+ final AnalyzedTokenReadings[] tokens, final int i, final int j,
+ final boolean precSpace,
+ final boolean follSpace) {
+ return true;
+ }
+
+ /**
+ * Pairs the start and end symbols of one sentence with a stack and reports
+ * every symbol that is left without a partner.
+ *
+ * If the previous call ended a paragraph, the rule state is reset first.
+ * Symbols left unpaired are handed to createMatch(), which may cancel an
+ * unpaired symbol reported earlier instead of producing a new match.
+ *
+ * @param text
+ * a pre-analyzed sentence
+ * @return the matches created for this sentence
+ */
+ public final RuleMatch[] match(final AnalyzedSentence text) {
+ ruleMatches = new ArrayList<RuleMatch>();
+ final AnalyzedTokenReadings[] tokens = text.getTokensWithoutWhitespace();
+
+ if (endOfParagraph) {
+ reset();
+ }
+
+ // index the next match of this sentence will get in the list of previous matches
+ ruleMatchIndex = getMatchesIndex();
+
+ // starts at 1: the token before the current one is inspected below
+ for (int i = 1; i < tokens.length; i++) {
+ for (int j = 0; j < startSymbols.length; j++) {
+
+ final String token = tokens[i].getToken();
+ if (token.equals(startSymbols[j]) || token.equals(endSymbols[j])) {
+ // the context only matters for symbols whose start and end form are identical
+ boolean precededByWhitespace = true;
+ if (startSymbols[j].equals(endSymbols[j])) {
+ precededByWhitespace = tokens[i - 1].isSentStart()
+ || tokens[i].isWhitespaceBefore()
+ || PUNCTUATION_NO_DOT.matcher(tokens[i - 1].getToken())
+ .matches();
+ }
+
+ boolean followedByWhitespace = true;
+ if (i < tokens.length - 1 && startSymbols[j].equals(endSymbols[j])) {
+ followedByWhitespace = tokens[i + 1].isWhitespaceBefore()
+ || PUNCTUATION.matcher(tokens[i + 1].getToken()).matches();
+ }
+
+ final boolean noException = isNoException(token, tokens, i, j,
+ precededByWhitespace, followedByWhitespace);
+
+ if (noException && precededByWhitespace
+ && token.equals(startSymbols[j])) {
+ symbolStack.push(new SymbolLocator(startSymbols[j], i));
+ } else if (noException && followedByWhitespace
+ && token.equals(endSymbols[j])) {
+ if (i > 1 && endSymbols[j].equals(")")
+ && (NUMERALS.matcher(tokens[i - 1].getToken()).matches()
+ && !(!symbolStack.empty()
+ && "(".equals(symbolStack.peek().symbol)))) {
+ // ")" right after a numeral with no "(" on top of the stack is ignored
+ // (presumably a list label such as "1)" or "iv)")
+ } else {
+ if (symbolStack.empty()) {
+ symbolStack.push(new SymbolLocator(endSymbols[j], i));
+ } else {
+ if (symbolStack.peek().symbol.equals(startSymbols[j])) {
+ symbolStack.pop();
+ } else {
+ symbolStack.push(new SymbolLocator(endSymbols[j], i));
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ // whatever is still on the stack found no partner in this sentence
+ for (final SymbolLocator sLoc : symbolStack) {
+ final RuleMatch rMatch = createMatch(tokens[sLoc.index].getStartPos(),
+ sLoc.symbol);
+ if (rMatch != null) {
+ ruleMatches.add(rMatch);
+ }
+ }
+ symbolStack.clear();
+ // remember the paragraph end so that the next call starts with a reset
+ if (tokens[tokens.length - 1].isParaEnd()) {
+ endOfParagraph = true;
+ }
+
+ return toRuleMatchArray(ruleMatches);
+ }
+
+ /**
+ * Creates the match for an unpaired symbol, or cancels an earlier one.
+ *
+ * If the symbol is an end symbol and the most recently reported unpaired
+ * symbol is the corresponding start symbol, the two are treated as a pair:
+ * the earlier match is removed from the current list or marked as deleted
+ * among the previous matches, and null is returned. Otherwise the symbol is
+ * recorded on the rule match stack and a new match is returned.
+ *
+ * @param startPos
+ * start position of the symbol
+ * @param symbol
+ * the unpaired symbol
+ * @return the new match, or null if an earlier match was cancelled instead
+ */
+ private RuleMatch createMatch(final int startPos, final String symbol) {
+ if (!ruleMatchStack.empty()) {
+ final int index = findSymbolNum(symbol);
+ if (index >= 0) {
+ final RuleMatchLocator rLoc = ruleMatchStack.peek();
+ if (rLoc.symbol.equals(startSymbols[index])) {
+ // NOTE(review): myIndex was taken from the ruleMatches list that existed when
+ // the locator was pushed; confirm it cannot point into a later sentence's list
+ if (ruleMatches.size() > rLoc.myIndex) {
+ ruleMatches.remove(rLoc.myIndex);
+ ruleMatchStack.pop();
+ return null;
+ // if (ruleMatches.get(rLoc.myIndex).getFromPos())
+ }
+ if (isInMatches(rLoc.index)) {
+ setAsDeleted(rLoc.index);
+ ruleMatchStack.pop();
+ return null;
+ }
+ }
+ }
+ }
+ ruleMatchStack.push(new RuleMatchLocator(symbol, ruleMatchIndex,
+ ruleMatches.size()));
+ ruleMatchIndex++;
+ return new RuleMatch(this, startPos, startPos + symbol.length(), messages
+ .getString("unpaired_brackets"));
+ }
+
+ /**
+  * Looks up a symbol among the end symbols.
+  *
+  * @param ch symbol to look for
+  * @return index of the first end symbol equal to ch, or -1 if there is none
+  */
+ private int findSymbolNum(final String ch) {
+   int position = 0;
+   for (final String endSymbol : endSymbols) {
+     if (ch.equals(endSymbol)) {
+       return position;
+     }
+     position++;
+   }
+   return -1;
+ }
+
+ /**
+ * Reset the state information for the rule, including paragraph-level
+ * information. Both stacks are emptied. The matches recorded so far are
+ * cleared only when the reset was not caused by a paragraph end seen in
+ * match(); in either case the end-of-paragraph flag is cleared.
+ */
+ public final void reset() {
+ ruleMatchStack.clear();
+ symbolStack.clear();
+ if (!endOfParagraph) {
+ clearMatches();
+ }
+ endOfParagraph = false;
+ }
+
+}
+
+ /**
+  * A symbol locator that additionally stores the position its match had in
+  * the rule's match list at the time it was created.
+  */
+ class RuleMatchLocator extends SymbolLocator {
+
+   /** Size of the match list when this locator was created. */
+   public int myIndex;
+
+   RuleMatchLocator(final String sym, final int ind, final int myInd) {
+     super(sym, ind);
+     this.myIndex = myInd;
+   }
+ }
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/IncorrectExample.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/IncorrectExample.java
new file mode 100644
index 0000000..0d3478f
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/IncorrectExample.java
@@ -0,0 +1,62 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2008 Daniel Naber (http://www.danielnaber.de)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+package de.danielnaber.languagetool.rules;
+
+import java.util.Arrays;
+import java.util.List;
+
+/**
+ * A text, typically a sentence, that contains an error.
+ *
+ * @since 0.9.2
+ * @author Daniel Naber
+ */
+ public class IncorrectExample {
+
+   private final String example;
+   /** Possible corrections; null when none were given. */
+   private final List<String> corrections;
+
+   /**
+    * Creates an example without corrections.
+    *
+    * @param example the text that contains the error
+    */
+   public IncorrectExample(final String example) {
+     this.example = example;
+     this.corrections = null;
+   }
+
+   /**
+    * Creates an example together with its possible corrections.
+    *
+    * @param example the text that contains the error
+    * @param corrections the possible corrections; the array is copied, so
+    *        later changes to it do not affect this object. A null array is
+    *        treated as "no corrections".
+    */
+   public IncorrectExample(final String example, final String[] corrections) {
+     this.example = example;
+     // Arrays.asList() is a view of the array it is given, so wrap a private copy
+     this.corrections = corrections == null ? null : Arrays.asList(corrections.clone());
+   }
+
+   /**
+    * Return the example that contains the error.
+    */
+   public String getExample() {
+     return example;
+   }
+
+   /**
+    * Return the possible corrections. May be null.
+    */
+   public List<String> getCorrections() {
+     return corrections;
+   }
+
+   /**
+    * Returns the example followed by a space and the list of corrections.
+    */
+   @Override
+   public String toString() {
+     return example + " " + corrections;
+   }
+
+ }
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/Rule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/Rule.java
new file mode 100644
index 0000000..210754c
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/Rule.java
@@ -0,0 +1,230 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+package de.danielnaber.languagetool.rules;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.ResourceBundle;
+import java.util.Set;
+
+import de.danielnaber.languagetool.AnalyzedSentence;
+import de.danielnaber.languagetool.Language;
+
+/**
+ * Abstract rule class. A Rule describes a language error and can test whether a
+ * given pre-analyzed text contains that error using the {@link Rule#match}
+ * method.
+ *
+ * @author Daniel Naber
+ */
+public abstract class Rule {
+
+ private List<String> correctExamples;
+ private List<IncorrectExample> incorrectExamples;
+ private Category category;
+
+ /**
+ * If true, then the rule is turned off by default.
+ */
+ private boolean defaultOff;
+
+ protected ResourceBundle messages;
+
+ /**
+ * Called by language-dependent rules.
+ */
+ public Rule() {
+ }
+
+ /**
+ * Called by language-independent rules.
+ */
+ public Rule(final ResourceBundle messages) {
+ this.messages = messages;
+ }
+
+ public abstract String getId();
+
+ public abstract String getDescription();
+
+ /**
+ * Used by paragraph rules to signal that they can remove previous rule
+ * matches.
+ */
+ private boolean paragraphBackTrack;
+
+ /**
+ * The final list of RuleMatches, without removed matches.
+ */
+ private List<RuleMatch> previousMatches;
+
+ private List<RuleMatch> removedMatches;
+
+ /**
+ * Check whether the given text matches this error rule, i.e. whether the text
+ * contains this error.
+ *
+ * @param text
+ * a pre-analyzed sentence
+ * @return an array of RuleMatch object for each match.
+ */
+ public abstract RuleMatch[] match(AnalyzedSentence text) throws IOException;
+
+ /**
+ * If a rule keeps its state over more than the check of one sentence, this
+ * must be implemented so the internal state is reset. It will be called
+ * before a new text is going to be checked.
+ */
+ public abstract void reset();
+
+ /**
+ * Whether this rule can be used for text in the given language.
+ */
+ public final boolean supportsLanguage(final Language language) {
+ final Set<String> relevantIDs = language.getRelevantRuleIDs();
+ return relevantIDs != null && relevantIDs.contains(getId());
+ }
+
+ /**
+ * Set the examples that are correct and thus do not trigger the rule.
+ */
+ public final void setCorrectExamples(final List<String> correctExamples) {
+ this.correctExamples = correctExamples;
+ }
+
+ /**
+ * Get example sentences that are correct and thus will not match this rule.
+ */
+ public final List<String> getCorrectExamples() {
+ return correctExamples;
+ }
+
+ /**
+ * Set the examples that are incorrect and thus do trigger the rule.
+ */
+ public final void setIncorrectExamples(
+ final List<IncorrectExample> incorrectExamples) {
+ this.incorrectExamples = incorrectExamples;
+ }
+
+ /**
+ * Get example sentences that are incorrect and thus will match this rule.
+ */
+ public final List<IncorrectExample> getIncorrectExamples() {
+ return incorrectExamples;
+ }
+
+ public final Category getCategory() {
+ return category;
+ }
+
+ public final void setCategory(final Category category) {
+ this.category = category;
+ }
+
+ protected final RuleMatch[] toRuleMatchArray(final List<RuleMatch> ruleMatches) {
+ return ruleMatches.toArray(new RuleMatch[ruleMatches.size()]);
+ }
+
+ public final boolean isParagraphBackTrack() {
+ return paragraphBackTrack;
+ }
+
+ public final void setParagraphBackTrack(final boolean backTrack) {
+ paragraphBackTrack = backTrack;
+ }
+
+ /**
+ * Method to add matches.
+ *
+ * @param r
+ * RuleMatch - matched rule added by check()
+ */
+ public final void addRuleMatch(final RuleMatch r) {
+ if (previousMatches == null) {
+ previousMatches = new ArrayList<RuleMatch>();
+ }
+ previousMatches.add(r);
+ }
+
+ /**
+ * Deletes (or disables) previously matched rule by adding the match to the
+ * list of removed matches; the list of previous matches is left unchanged.
+ *
+ * @param i
+ * Index of the rule that should be deleted.
+ * NOTE(review): the index is not validated here, so callers
+ * presumably have to check it with isInMatches() first.
+ */
+ public final void setAsDeleted(final int i) {
+ if (removedMatches == null) {
+ removedMatches = new ArrayList<RuleMatch>();
+ }
+ removedMatches.add(previousMatches.get(i));
+ }
+
+ /**
+  * Tells whether the given match was marked as deleted.
+  *
+  * @param r the match to look for
+  * @return true if r is contained in the list of removed matches
+  */
+ public final boolean isInRemoved(final RuleMatch r) {
+   return removedMatches != null && removedMatches.contains(r);
+ }
+
+ /**
+  * Tells whether a previous match is stored at the given index.
+  *
+  * @param i index into the list of previous matches
+  * @return true if the list exists, is longer than i and holds a non-null
+  *         entry at i
+  */
+ public final boolean isInMatches(final int i) {
+   return previousMatches != null
+       && previousMatches.size() > i
+       && previousMatches.get(i) != null;
+ }
+
+ public final void clearMatches() {
+ if (previousMatches != null) {
+ previousMatches.clear();
+ }
+ }
+
+ /**
+  * @return the number of previous matches recorded so far, 0 if none were
+  *         ever added
+  */
+ public final int getMatchesIndex() {
+   return previousMatches == null ? 0 : previousMatches.size();
+ }
+
+ public final List<RuleMatch> getMatches() {
+ return previousMatches;
+ }
+
+ /**
+ * Checks whether the rule has been turned off by default by the rule author.
+ *
+ * @return True if the rule is turned off by default.
+ */
+ public final boolean isDefaultOff() {
+ return defaultOff;
+ }
+
+ /**
+ * Turns the rule by default off.
+ **/
+ public final void setDefaultOff() {
+ defaultOff = true;
+ }
+
+}
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/RuleMatch.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/RuleMatch.java
new file mode 100644
index 0000000..05746fb
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/RuleMatch.java
@@ -0,0 +1,239 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+package de.danielnaber.languagetool.rules;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import de.danielnaber.languagetool.tools.StringTools;
+
+/**
+ * A class that holds information about where a rule matches text.
+ *
+ * @author Daniel Naber
+ */
+public class RuleMatch implements Comparable<RuleMatch> {
+
+ private static final Pattern SUGGESTION_PATTERN = Pattern.compile("<suggestion>(.*?)</suggestion>");
+
+ private int fromLine = -1;
+ private int column = -1;
+ private int offset = -1;
+ private int endLine = -1;
+ private int endColumn = -1;
+
+ private Rule rule;
+ private int fromPos;
+ private int toPos;
+ private String message;
+ // for OOo context menu
+ private String shortMessage;
+
+ private List<String> suggestedReplacements = new ArrayList<String>();
+
+//TODO: remove this one after all rules get their short comments in place
+ public RuleMatch(Rule rule, int fromPos, int toPos, String message) {
+ this(rule, fromPos, toPos, message, null, false);
+ }
+
+ // TODO: remove this constructor?
+ public RuleMatch(Rule rule, int fromPos, int toPos, String message, String shortMessage) {
+ this(rule, fromPos, toPos, message, shortMessage, false);
+ }
+
+ /**
+  * Creates a RuleMatch object from the rule that triggered the match, the
+  * position of the match and an explanation message. Every
+  * &lt;suggestion>...&lt;/suggestion> span found in the message becomes one
+  * suggested replacement, in order of appearance.
+  *
+  * @param rule the rule that triggered this match
+  * @param fromPos start position of the match
+  * @param toPos end position of the match
+  * @param message explanation of the problem; must not be null
+  * @param shortMessage shorter explanation, may be null
+  * @param startWithUppercase whether the original text at the position
+  *        of the match start with an uppercase character; if true, the first
+  *        character of every extracted suggestion is uppercased
+  */
+ public RuleMatch(Rule rule, int fromPos, int toPos, String message, String shortMessage,
+     boolean startWithUppercase) {
+   this.rule = rule;
+   this.fromPos = fromPos;
+   this.toPos = toPos;
+   this.message = message;
+   this.shortMessage = shortMessage;
+   // collect the text of every <suggestion>...</suggestion> span in the message
+   final Matcher suggestionMatcher = SUGGESTION_PATTERN.matcher(message);
+   for (int searchFrom = 0; suggestionMatcher.find(searchFrom);
+       searchFrom = suggestionMatcher.end()) {
+     final String found = suggestionMatcher.group(1);
+     suggestedReplacements.add(
+         startWithUppercase ? StringTools.uppercaseFirstChar(found) : found);
+   }
+ }
+
+ public Rule getRule() {
+ return rule;
+ }
+
+ /**
+ * Set the line number in which the match occurs.
+ */
+ public void setLine(final int fromLine) {
+ this.fromLine = fromLine;
+ }
+
+ /**
+ * Get the line number in which the match occurs.
+ */
+ public int getLine() {
+ return fromLine;
+ }
+
+ /**
+ * Set the line number in which the match ends.
+ */
+ public void setEndLine(final int endLine) {
+ this.endLine = endLine;
+ }
+
+ /**
+ * Get the line number in which the match ends.
+ */
+ public int getEndLine() {
+ return endLine;
+ }
+
+ /**
+ * Set the column number in which the match occurs.
+ */
+ public void setColumn(final int column) {
+ this.column = column;
+ }
+
+ /**
+ * Get the column number in which the match occurs.
+ */
+ public int getColumn() {
+ return column;
+ }
+
+ /**
+ * Set the column number in which the match ends.
+ */
+ public void setEndColumn(final int endColumn) {
+ this.endColumn = endColumn;
+ }
+
+ /**
+ * Get the column number in which the match ends.
+ */
+ public int getEndColumn() {
+ return endColumn;
+ }
+
+ /**
+ * Set the character offset at which the match occurs.
+ */
+ public void setOffset(final int offset) {
+ this.offset = offset;
+ }
+
+ /**
+ * Get the character offset at which the match occurs.
+ */
+ public int getOffset() {
+ return offset;
+ }
+
+ /**
+ * Position of the start of the error (in characters).
+ */
+ public int getFromPos() {
+ return fromPos;
+ }
+
+ /**
+ * Position of the end of the error (in characters).
+ */
+ public int getToPos() {
+ return toPos;
+ }
+
+ /**
+ * A human-readable explanation describing the error.
+ */
+ public String getMessage() {
+ return message;
+ }
+
+ /**
+ * A shorter human-readable explanation describing the error.
+ */
+ public String getShortMessage() {
+ return shortMessage;
+ }
+
+
+ /**
+ * @see #getSuggestedReplacements()
+ */
+ public void setSuggestedReplacement(final String replacement) {
+ if (replacement == null)
+ throw new NullPointerException("replacement might be empty but not null");
+ final List<String> fixes = new ArrayList<String>();
+ fixes.add(replacement);
+ setSuggestedReplacements(fixes);
+ }
+
+ /**
+ * @see #getSuggestedReplacements()
+ */
+ public void setSuggestedReplacements(final List<String> replacement) {
+ if (replacement == null)
+ throw new NullPointerException("replacement might be empty but not null");
+ this.suggestedReplacements = replacement;
+ }
+
+ /**
+ * The text fragments which might be an appropriate fix for the problem. One
+ * of these fragments can be used to replace the old text between getFromPos()
+ * to getToPos(). Text between &lt;suggestion> and &lt;/suggestion> is
+ * taken as the suggested replacement.
+ * @return List of String objects or an empty List
+ */
+ public List<String> getSuggestedReplacements() {
+ return suggestedReplacements;
+ }
+
+ @Override
+ public String toString() {
+ return rule.getId() + ":" + fromPos + "-" + toPos + ":" + message;
+ }
+
+ /**
+  * Orders matches by their start position only.
+  *
+  * @param other the match to compare with
+  * @return -1, 0 or 1 if this match starts before, at or after the other one
+  * @throws ClassCastException if other is null
+  */
+ public int compareTo(final RuleMatch other) {
+   if (other == null) {
+     throw new ClassCastException();
+   }
+   final int mine = getFromPos();
+   final int theirs = other.getFromPos();
+   if (mine == theirs) {
+     return 0;
+   }
+   return mine < theirs ? -1 : 1;
+ }
+
+}
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/UppercaseSentenceStartRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/UppercaseSentenceStartRule.java
new file mode 100644
index 0000000..35ecfa4
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/UppercaseSentenceStartRule.java
@@ -0,0 +1,136 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+package de.danielnaber.languagetool.rules;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.ResourceBundle;
+
+import de.danielnaber.languagetool.AnalyzedSentence;
+import de.danielnaber.languagetool.AnalyzedTokenReadings;
+import de.danielnaber.languagetool.Language;
+
+/**
+ * Checks that a sentence starts with an uppercase letter.
+ *
+ * @author Daniel Naber
+ */
+public class UppercaseSentenceStartRule extends Rule {
+
+ private final Language language;
+
+ private String lastParagraphString = "";
+
+ public UppercaseSentenceStartRule(final ResourceBundle messages,
+ final Language language) {
+ super(messages);
+ super.setCategory(new Category(messages.getString("category_case")));
+ this.language = language;
+ }
+
+ public final String getId() {
+ return "UPPERCASE_SENTENCE_START";
+ }
+
+ public final String getDescription() {
+ return messages.getString("desc_uppercase_sentence");
+ }
+
+ /**
+ * Checks whether the first word of the sentence starts with a lowercase
+ * letter. A leading quote character is skipped, and for Dutch the token
+ * chosen by dutchSpecialCase() is checked instead. For some languages no
+ * match is created when this sentence or the previously checked one ends
+ * with ";" or ",".
+ *
+ * @param text
+ * a pre-analyzed sentence
+ * @return at most one match, suggesting the token with an uppercase first
+ * letter
+ */
+ public final RuleMatch[] match(final AnalyzedSentence text) {
+ final List<RuleMatch> ruleMatches = new ArrayList<RuleMatch>();
+ final AnalyzedTokenReadings[] tokens = text.getTokensWithoutWhitespace();
+ if (tokens.length < 2) {
+ return toRuleMatchArray(ruleMatches);
+ }
+ int matchTokenPos = 1; // 0 = SENT_START
+ final String firstToken = tokens[matchTokenPos].getToken();
+ String secondToken = null;
+ String thirdToken = null;
+ // ignore quote characters:
+ if (tokens.length >= 3
+ && ("'".equals(firstToken) || "\"".equals(firstToken) || "„"
+ .equals(firstToken))) {
+ matchTokenPos = 2;
+ secondToken = tokens[matchTokenPos].getToken();
+ }
+ final String firstDutchToken = dutchSpecialCase(firstToken, secondToken,
+ tokens);
+ if (firstDutchToken != null) {
+ thirdToken = firstDutchToken;
+ matchTokenPos = 3;
+ }
+
+ // check the token furthest into the sentence that was selected above
+ String checkToken = firstToken;
+ if (thirdToken != null) {
+ checkToken = thirdToken;
+ } else if (secondToken != null) {
+ checkToken = secondToken;
+ }
+
+ final String lastToken = tokens[tokens.length - 1].getToken();
+
+ // note: despite its name, true means that the lowercase start is accepted
+ boolean noException = false;
+ //fix for lists; note - this will not always work for the last point in OOo,
+ //as OOo might serve paragraphs in any order.
+ if ((language == Language.RUSSIAN || language == Language.POLISH)
+ && (";".equals(lastParagraphString) || ";".equals(lastToken)
+ || ",".equals(lastParagraphString) || ",".equals(lastToken))) {
+ noException = true;
+ }
+ //fix for comma in last paragraph; note - this will not always work for the last point in OOo,
+ //as OOo might serve paragraphs in any order.
+ if ((language == Language.RUSSIAN || language == Language.ITALIAN
+ || language == Language.POLISH || language == Language.GERMAN)
+ && (",".equals(lastParagraphString))) {
+ noException = true;
+ }
+
+ // remembered for the check of the next sentence
+ lastParagraphString = lastToken;
+
+ if (checkToken.length() > 0) {
+ final char firstChar = checkToken.charAt(0);
+ if (Character.isLowerCase(firstChar) && (!noException)) {
+ final RuleMatch ruleMatch = new RuleMatch(this, tokens[matchTokenPos]
+ .getStartPos(), tokens[matchTokenPos].getStartPos()
+ + tokens[matchTokenPos].getToken().length(), messages
+ .getString("incorrect_case"));
+ ruleMatch.setSuggestedReplacement(Character.toUpperCase(firstChar)
+ + checkToken.substring(1));
+ ruleMatches.add(ruleMatch);
+ }
+ }
+ return toRuleMatchArray(ruleMatches);
+ }
+
+ /**
+  * Handles Dutch sentences that start with an apostrophe followed by one of
+  * the single letters k, m, n, r, s or t: for those, the token after the
+  * letter is the one whose case has to be checked.
+  *
+  * @param firstToken first token after the sentence start
+  * @param secondToken token after a leading quote character, may be null
+  * @param tokens all tokens of the sentence (index 0 is the sentence start)
+  * @return the text of tokens[3] if the special case applies, otherwise null
+  *         (always null for languages other than Dutch)
+  */
+ private String dutchSpecialCase(final String firstToken,
+     final String secondToken, final AnalyzedTokenReadings[] tokens) {
+   if (language != Language.DUTCH) {
+     return null;
+   }
+   // tokens[3] is read below, so four tokens are needed; the former
+   // ">= 3" check let a three-token sentence run past the end of the array
+   if (tokens.length > 3 && "'".equals(firstToken)
+       && secondToken != null && secondToken.matches("k|m|n|r|s|t")) {
+     return tokens[3].getToken();
+   }
+   return null;
+ }
+
+ /**
+ * Intentionally empty.
+ * NOTE(review): lastParagraphString is carried over between calls of match()
+ * and is not cleared here - confirm that this is intended.
+ */
+ public void reset() {
+ }
+
+}
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/WhitespaceRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/WhitespaceRule.java
new file mode 100644
index 0000000..61f1ca6
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/WhitespaceRule.java
@@ -0,0 +1,91 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+
+package de.danielnaber.languagetool.rules;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.ResourceBundle;
+
+import de.danielnaber.languagetool.AnalyzedSentence;
+import de.danielnaber.languagetool.AnalyzedTokenReadings;
+import de.danielnaber.languagetool.Language;
+
+/**
+ * Check if there is duplicated whitespace in a sentence.
+ * Considers two spaces as incorrect, and proposes a single space instead.
+ *
+ * @author Marcin Miłkowski
+ */
+
+public class WhitespaceRule extends Rule {
+
+ /**
+ * Creates the rule and assigns it to the category named by the
+ * <code>category_misc</code> message.
+ *
+ * @param messages resource bundle used for the category name and rule texts
+ * @param language accepted but not used by this constructor
+ */
+ public WhitespaceRule(final ResourceBundle messages, final Language language) {
+ super(messages);
+ super.setCategory(new Category(messages.getString("category_misc")));
+ }
+
+ @Override
+ public final String getId() {
+ return "WHITESPACE_RULE";
+ }
+
+ @Override
+ public final String getDescription() {
+ return messages.getString("desc_whitespacerepetition");
+ }
+
+ /**
+ * Scans the sentence tokens (whitespace included) and reports every run of
+ * two or more consecutive whitespace tokens, unless the token right before
+ * the second one is a line break. Each match suggests a single space.
+ *
+ * @param text the analyzed sentence to check
+ * @return one match per whitespace run found, possibly none
+ */
+ @Override
+ public RuleMatch[] match(final AnalyzedSentence text) {
+ final List<RuleMatch> ruleMatches = new ArrayList<RuleMatch>();
+ final AnalyzedTokenReadings[] tokens = text.getTokens();
+ // prevWhite/prevLen/prevPos describe the token handled in the previous step
+ boolean prevWhite = false;
+ int prevLen = 0;
+ int prevPos = 0;
+ //note: we start from token 1
+ //token no. 0 is guaranteed to be SENT_START
+ int i = 1;
+ while (i < tokens.length) {
+ if (tokens[i].isWhitespace() && prevWhite && !tokens[i -1].isLinebreak()) {
+ final int pos = tokens[i -1].getStartPos();
+ // swallow the rest of the whitespace run, adding its length to the
+ // length of the whitespace token that started the run
+ while (i < tokens.length && tokens[i].isWhitespace()) {
+ prevLen += tokens[i].getToken().length();
+ i++;
+ }
+ // the match spans the whole run: start of its first token up to the
+ // end of its last token
+ final RuleMatch ruleMatch = new RuleMatch(this, prevPos, pos + prevLen, messages
+ .getString("whitespace_repetition"));
+ ruleMatch.setSuggestedReplacement(" ");
+ ruleMatches.add(ruleMatch);
+ }
+ // i may already be past the end if the run reached the last token
+ if (i < tokens.length) {
+ prevWhite = tokens[i].isWhitespace();
+ prevLen = tokens[i].getToken().length();
+ prevPos = tokens[i].getStartPos();
+ i++;
+ }
+ }
+ return toRuleMatchArray(ruleMatches);
+ }
+
+ @Override
+ public void reset() {
+ // nothing
+ }
+
+}
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/WordRepeatRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/WordRepeatRule.java
new file mode 100644
index 0000000..c8060a5
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/WordRepeatRule.java
@@ -0,0 +1,101 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+package de.danielnaber.languagetool.rules;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.ResourceBundle;
+
+import de.danielnaber.languagetool.AnalyzedSentence;
+import de.danielnaber.languagetool.AnalyzedTokenReadings;
+import de.danielnaber.languagetool.Language;
+
+/**
+ * Check if a word is repeated twice, e.g. "the the".
+ *
+ * @author Daniel Naber
+ */
+public class WordRepeatRule extends Rule {
+
+  /**
+   * Creates the rule and files it under the category named by the
+   * <code>category_misc</code> message.
+   *
+   * @param messages resource bundle providing the rule's texts
+   * @param language accepted but not used by this constructor
+   */
+  public WordRepeatRule(final ResourceBundle messages, final Language language) {
+    super(messages);
+    super.setCategory(new Category(messages.getString("category_misc")));
+  }
+
+  /**
+   * Hook for subclasses: return <code>true</code> to suppress the error for a
+   * potential word repetition at the given position.
+   *
+   * @param tokens the tokens of the sentence currently being checked
+   * @param position the current position in the tokens
+   * @return this implementation always returns false
+   */
+  public boolean ignore(final AnalyzedTokenReadings[] tokens, final int position) {
+    return false;
+  }
+
+  @Override
+  public String getId() {
+    return "WORD_REPEAT_RULE";
+  }
+
+  @Override
+  public String getDescription() {
+    return messages.getString("desc_repetition");
+  }
+
+  /**
+   * Reports every token that equals its predecessor when both are lower-cased,
+   * skipping single non-letter characters and positions for which
+   * {@link #ignore} returns true. The suggested replacement is the first of
+   * the two tokens.
+   *
+   * @param text the analyzed sentence to check
+   * @return one match per repeated token pair, possibly none
+   */
+  @Override
+  public RuleMatch[] match(final AnalyzedSentence text) {
+    final List<RuleMatch> found = new ArrayList<RuleMatch>();
+    final AnalyzedTokenReadings[] tokens = text.getTokensWithoutWhitespace();
+    String previous = "";
+    // index 0 always holds SENT_START, so scanning begins at index 1
+    int i = 1;
+    while (i < tokens.length) {
+      final String current = tokens[i].getToken();
+      // a lone non-letter character (e.g. each "." of "...") is not a word
+      final boolean singleNonLetter =
+          current.length() == 1 && !Character.isLetter(current.charAt(0));
+      // the hook is consulted for every token, whether or not it repeats
+      final boolean suppressed = ignore(tokens, i);
+      if (!singleNonLetter && !suppressed
+          && previous.toLowerCase().equals(current.toLowerCase())) {
+        final String msg = messages.getString("repetition");
+        final int from = tokens[i - 1].getStartPos();
+        final int to = tokens[i].getStartPos() + previous.length();
+        final RuleMatch ruleMatch = new RuleMatch(this, from, to, msg,
+            messages.getString("desc_repetition_short"));
+        ruleMatch.setSuggestedReplacement(previous);
+        found.add(ruleMatch);
+      }
+      previous = current;
+      i++;
+    }
+    return toRuleMatchArray(found);
+  }
+
+  @Override
+  public void reset() {
+    // nothing
+  }
+
+}
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/bitext/BitextRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/bitext/BitextRule.java
new file mode 100644
index 0000000..d508ae5
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/bitext/BitextRule.java
@@ -0,0 +1,106 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2010 Marcin Miłkowski (www.languagetool.org)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+
+package de.danielnaber.languagetool.rules.bitext;
+
+import java.io.IOException;
+import java.util.List;
+
+import de.danielnaber.languagetool.AnalyzedSentence;
+import de.danielnaber.languagetool.bitext.StringPair;
+import de.danielnaber.languagetool.rules.Rule;
+import de.danielnaber.languagetool.rules.RuleMatch;
+import de.danielnaber.languagetool.Language;
+
+/**
+ * Abstract bitext rule class. A BitextRule describes a language error and
+ * can test whether a given pre-analyzed pair of source and target text
+ * contains that error using the {@link Rule#match} method.
+ *
+ * @author Marcin Miłkowski
+ */
+
+public abstract class BitextRule extends Rule {
+
+  private Language srcLanguage;
+
+  private List<StringPair> goodExamples;
+  private List<IncorrectBitextExample> badExamples;
+
+  @Override
+  public abstract String getId();
+
+  @Override
+  public abstract String getDescription();
+
+  public abstract String getMessage();
+
+  @Override
+  public abstract RuleMatch[] match(AnalyzedSentence text) throws IOException;
+
+  public abstract RuleMatch[] match(AnalyzedSentence sourceText,
+      AnalyzedSentence targetText) throws IOException;
+
+  @Override
+  public abstract void reset();
+
+  /**
+   * Returns the source language previously stored with
+   * {@link #setSourceLang}, or null if none was set.
+   */
+  public final Language getSourceLang() {
+    return this.srcLanguage;
+  }
+
+  /**
+   * Stores the source language. If the language is not supported
+   * by LT, you need to use the default tokenizers etc.
+   * @param lang - Source Language
+   */
+  public final void setSourceLang(final Language lang) {
+    this.srcLanguage = lang;
+  }
+
+  /**
+   * Returns the example pairs that are correct and thus will not match this rule.
+   */
+  public final List<StringPair> getCorrectBitextExamples() {
+    return this.goodExamples;
+  }
+
+  /**
+   * Stores the example pairs that are correct and thus do not trigger the rule.
+   */
+  public final void setCorrectBitextExamples(final List<StringPair> correctExamples) {
+    this.goodExamples = correctExamples;
+  }
+
+  /**
+   * Returns the example pairs that are incorrect and thus will match this rule.
+   */
+  public final List<IncorrectBitextExample> getIncorrectBitextExamples() {
+    return this.badExamples;
+  }
+
+  /**
+   * Stores the example pairs that are incorrect and thus do trigger the rule.
+   */
+  public final void setIncorrectBitextExamples(
+      final List<IncorrectBitextExample> incorrectExamples) {
+    this.badExamples = incorrectExamples;
+  }
+
+}
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/bitext/DifferentLengthRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/bitext/DifferentLengthRule.java
new file mode 100644
index 0000000..995772c
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/bitext/DifferentLengthRule.java
@@ -0,0 +1,93 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2010 Marcin Miłkowski (www.languagetool.org)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+package de.danielnaber.languagetool.rules.bitext;
+
+import java.io.IOException;
+
+import de.danielnaber.languagetool.AnalyzedSentence;
+import de.danielnaber.languagetool.AnalyzedTokenReadings;
+import de.danielnaber.languagetool.rules.RuleMatch;
+
+/**
+ * Checks if the translation has a really different length than the source
+ * (smaller than 30% or longer by 250%).
+ *
+ * @author Marcin Miłkowski
+ *
+ */
+public class DifferentLengthRule extends BitextRule {
+
+  static final String MSG = "Source and target translation lengths are very different!";
+
+  @Override
+  public String getDescription() {
+    return "Check if translation length is similar to source length";
+  }
+
+  @Override
+  public String getId() {
+    return "TRANSLATION_LENGTH";
+  }
+
+  @Override
+  public String getMessage() {
+    return MSG;
+  }
+
+  /**
+   * Checking a single sentence makes no sense for a bitext rule, so this
+   * never reports anything.
+   *
+   * @param text ignored
+   * @return always an empty array (never null), like the no-match result of
+   *         the two-argument overload, so callers can iterate it safely
+   */
+  @Override
+  public RuleMatch[] match(AnalyzedSentence text) throws IOException {
+    return new RuleMatch[0];
+  }
+
+  /**
+   * Reports one match covering the target sentence when the source and target
+   * texts differ too much in length (see {@link #isLengthDifferent}).
+   *
+   * @param sourceText the analyzed source sentence
+   * @param targetText the analyzed target (translated) sentence
+   * @return a one-element array on a length mismatch, otherwise an empty array
+   */
+  @Override
+  public RuleMatch[] match(AnalyzedSentence sourceText,
+      AnalyzedSentence targetText) throws IOException {
+    if (!isLengthDifferent(getPureText(sourceText), getPureText(targetText))) {
+      return new RuleMatch[0];
+    }
+    final AnalyzedTokenReadings[] tokens = targetText.getTokens();
+    final AnalyzedTokenReadings last = tokens[tokens.length - 1];
+    // end of the last target token
+    final int len = last.getStartPos() + last.getToken().length();
+    // NOTE(review): the match starts at offset 1 rather than 0 - confirm intended
+    return new RuleMatch[] { new RuleMatch(this, 1, len, MSG) };
+  }
+
+  /**
+   * Compares the character lengths of source and target.
+   *
+   * @param src the source text
+   * @param trg the target text
+   * @return true if the source length is more than 250% or less than 30% of
+   *         the target length; an empty target with a non-empty source counts
+   *         as different, two empty strings do not
+   */
+  static boolean isLengthDifferent(final String src, final String trg) {
+    // floating-point division: a zero-length target yields Infinity or NaN
+    // instead of throwing
+    final double skew = (((double) src.length() / (double) trg.length()) * 100.00);
+    return (skew > 250 || skew < 30);
+  }
+
+  /** Concatenates the text of all tokens of the sentence, whitespace included. */
+  private static String getPureText(AnalyzedSentence text) {
+    final StringBuilder sb = new StringBuilder();
+    for (AnalyzedTokenReadings token : text.getTokens()) {
+      sb.append(token.getToken());
+    }
+    return sb.toString();
+  }
+
+  @Override
+  public void reset() {
+    // nothing
+  }
+
+}
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/bitext/IncorrectBitextExample.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/bitext/IncorrectBitextExample.java
new file mode 100644
index 0000000..e877826
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/bitext/IncorrectBitextExample.java
@@ -0,0 +1,64 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2010 Marcin Miłkowski (http://www.languagetool.org)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+package de.danielnaber.languagetool.rules.bitext;
+
+import java.util.Arrays;
+import java.util.List;
+
+import de.danielnaber.languagetool.bitext.StringPair;
+
+/**
+ * A text, typically a pair of sentences that contains an error.
+ *
+ * @since 1.0.1
+ * @author Marcin Miłkowski
+ */
+public class IncorrectBitextExample {
+
+  private StringPair example;
+  private List<String> corrections;
+
+  /**
+   * Creates an example without corrections; {@link #getCorrections()} will
+   * return null.
+   */
+  public IncorrectBitextExample(final StringPair example) {
+    this.example = example;
+  }
+
+  /**
+   * Creates an example together with its possible corrections. The list
+   * returned by {@link #getCorrections()} is a view of the given array.
+   */
+  public IncorrectBitextExample(final StringPair example, final String[] corrections) {
+    this.example = example;
+    this.corrections = Arrays.asList(corrections);
+  }
+
+  /**
+   * Return the example that contains the error.
+   */
+  public StringPair getExample() {
+    return this.example;
+  }
+
+  /**
+   * Return the possible corrections. May be null.
+   */
+  public List<String> getCorrections() {
+    return this.corrections;
+  }
+
+  /**
+   * Renders the example as source, "/ ", target, a space and the corrections.
+   */
+  @Override
+  public String toString() {
+    final StringBuilder text = new StringBuilder();
+    text.append(this.example.getSource());
+    text.append("/ ");
+    text.append(this.example.getTarget());
+    text.append(" ");
+    text.append(this.corrections);
+    return text.toString();
+  }
+
+}
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/bitext/SameTranslationRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/bitext/SameTranslationRule.java
new file mode 100644
index 0000000..c9e1ace
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/bitext/SameTranslationRule.java
@@ -0,0 +1,88 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2010 Marcin Miłkowski (www.languagetool.org)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+package de.danielnaber.languagetool.rules.bitext;
+
+import java.io.IOException;
+
+import de.danielnaber.languagetool.AnalyzedSentence;
+import de.danielnaber.languagetool.AnalyzedTokenReadings;
+import de.danielnaber.languagetool.rules.RuleMatch;
+
+/**
+ * Checks if the translation for segments that have more than two words
+ * is different.
+ *
+ * @author Marcin Miłkowski
+ *
+ */
+public class SameTranslationRule extends BitextRule {
+
+  static final String MSG = "Source and target translation are the same!";
+
+  @Override
+  public String getDescription() {
+    return "Check if translation is the same as source";
+  }
+
+  @Override
+  public String getId() {
+    return "SAME_TRANSLATION";
+  }
+
+  @Override
+  public String getMessage() {
+    return MSG;
+  }
+
+  /**
+   * Checking a single sentence makes no sense for a bitext rule, so this
+   * never reports anything.
+   *
+   * @param text ignored
+   * @return always an empty array (never null), like the no-match result of
+   *         the two-argument overload, so callers can iterate it safely
+   */
+  @Override
+  public RuleMatch[] match(AnalyzedSentence text) throws IOException {
+    return new RuleMatch[0];
+  }
+
+  /**
+   * Reports one match covering the target sentence when the source has more
+   * than three non-whitespace tokens and its text is identical to the target
+   * text.
+   *
+   * @param sourceText the analyzed source sentence
+   * @param targetText the analyzed target (translated) sentence
+   * @return a one-element array if the texts are identical, otherwise an
+   *         empty array
+   */
+  @Override
+  public RuleMatch[] match(AnalyzedSentence sourceText,
+      AnalyzedSentence targetText) throws IOException {
+    // This is just a heuristic: very short segments are not checked
+    final boolean longEnough = sourceText.getTokensWithoutWhitespace().length > 3;
+    if (!longEnough || !getPureText(sourceText).equals(getPureText(targetText))) {
+      return new RuleMatch[0];
+    }
+    final AnalyzedTokenReadings[] tokens = targetText.getTokens();
+    final AnalyzedTokenReadings last = tokens[tokens.length - 1];
+    // end of the last target token
+    final int len = last.getStartPos() + last.getToken().length();
+    // NOTE(review): the match starts at offset 1 rather than 0 - confirm intended
+    return new RuleMatch[] { new RuleMatch(this, 1, len, MSG) };
+  }
+
+  /** Concatenates the text of all tokens of the sentence, whitespace included. */
+  private static String getPureText(AnalyzedSentence text) {
+    final StringBuilder sb = new StringBuilder();
+    for (AnalyzedTokenReadings token : text.getTokens()) {
+      sb.append(token.getToken());
+    }
+    return sb.toString();
+  }
+
+  @Override
+  public void reset() {
+    // nothing
+  }
+
+}
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/ca/AccentuacioReplaceRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/ca/AccentuacioReplaceRule.java
new file mode 100644
index 0000000..eb5a3fa
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/ca/AccentuacioReplaceRule.java
@@ -0,0 +1,90 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+package de.danielnaber.languagetool.rules.ca;
+
+import java.io.IOException;
+import java.util.Locale;
+import java.util.ResourceBundle;
+
+import de.danielnaber.languagetool.rules.AbstractSimpleReplaceRule;
+
+/**
+ * A rule that matches words or phrases which should not be used and suggests
+ * correct ones instead.
+ *
+ * Catalan implementations for accentuation errors.
+ * This is basically the same as CastellanismesReplaceRule.java
+ * with a different error message.
+ *
+ * Loads the list of words from <code>rules/ca/accentuacio.txt</code>.
+ *
+ * TODO: Some of the entries are proper names (Greek gods, etc.), which
+ * aren't currently checked.
+ *
+ * @author Jimmy O'Regan
+ *
+ * Based on pl/SimpleReplaceRule.java
+ */
+public class AccentuacioReplaceRule extends AbstractSimpleReplaceRule {
+
+  /** Rule id returned by {@link #getId()}. */
+  public static final String CATALAN_ACCENTUACIO_REPLACE_RULE = "CA_ACCENTUACIO_REPLACE";
+
+  private static final String FILE_NAME = "/ca/accentuacio.txt";
+  // locale used on case-conversion
+  private static final Locale CA_LOCALE = new Locale("ca");
+
+  /**
+   * Returns the path of the word list used by this rule.
+   */
+  public final String getFileName() {
+    return FILE_NAME;
+  }
+
+  public AccentuacioReplaceRule(final ResourceBundle messages) throws IOException {
+    super(messages);
+  }
+
+  public final String getId() {
+    return CATALAN_ACCENTUACIO_REPLACE_RULE;
+  }
+
+  public String getDescription() {
+    return "Errors d'accentuació";
+  }
+
+  public String getShort() {
+    return "Accentuació";
+  }
+
+  /**
+   * Returns the Catalan message fragment for an accentuation error.
+   * The verb form is "és" (with accent); the unaccented "es" is a pronoun.
+   */
+  public String getSuggestion() {
+    return " és un error d'accentuació, cal dir: ";
+  }
+
+  /**
+   * use case-insensitive matching.
+   */
+  public boolean isCaseSensitive() {
+    return false;
+  }
+
+  /**
+   * locale used on case-conversion
+   */
+  public Locale getLocale() {
+    return CA_LOCALE;
+  }
+
+}
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/ca/CastellanismesReplaceRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/ca/CastellanismesReplaceRule.java
new file mode 100644
index 0000000..3169b66
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/ca/CastellanismesReplaceRule.java
@@ -0,0 +1,85 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+package de.danielnaber.languagetool.rules.ca;
+
+import java.io.IOException;
+import java.util.Locale;
+import java.util.ResourceBundle;
+
+import de.danielnaber.languagetool.rules.AbstractSimpleReplaceRule;
+
+/**
+ * A rule that matches words or phrases which should not be used and suggests
+ * correct ones instead.
+ *
+ * Catalan implementations for Castelianisms, kept separate for an individual
+ * error message.
+ * Loads the list of words from <code>rules/ca/castellanismes.txt</code>.
+ *
+ * @author Jimmy O'Regan
+ *
+ * Based on pl/SimpleReplaceRule.java
+ */
+public class CastellanismesReplaceRule extends AbstractSimpleReplaceRule {
+
+  /** Rule id returned by {@link #getId()}. */
+  public static final String CATALAN_CASTELLANISMES_REPLACE_RULE = "CA_CASTELLANISMES_REPLACE";
+
+  private static final String FILE_NAME = "/ca/castellanismes.txt";
+  // locale used on case-conversion; named like the constant in AccentuacioReplaceRule
+  private static final Locale CA_LOCALE = new Locale("ca");
+
+  /**
+   * Returns the path of the word list used by this rule.
+   */
+  public final String getFileName() {
+    return FILE_NAME;
+  }
+
+  public CastellanismesReplaceRule(final ResourceBundle messages) throws IOException {
+    super(messages);
+  }
+
+  public final String getId() {
+    return CATALAN_CASTELLANISMES_REPLACE_RULE;
+  }
+
+  public String getDescription() {
+    return "Barbarismes (Castellanismes)";
+  }
+
+  public String getShort() {
+    return "Castellanismes";
+  }
+
+  /**
+   * Returns the Catalan message fragment for a Castilianism.
+   * The verb form is "és" (with accent); the unaccented "es" is a pronoun.
+   */
+  public String getSuggestion() {
+    return " és un castellanisme, cal dir: ";
+  }
+
+  /**
+   * use case-insensitive matching.
+   */
+  public boolean isCaseSensitive() {
+    return false;
+  }
+
+  /**
+   * locale used on case-conversion
+   */
+  public Locale getLocale() {
+    return CA_LOCALE;
+  }
+
+}
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/de/AgreementRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/de/AgreementRule.java
new file mode 100644
index 0000000..8afff0c
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/de/AgreementRule.java
@@ -0,0 +1,405 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+package de.danielnaber.languagetool.rules.de;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.ResourceBundle;
+import java.util.Set;
+
+import de.danielnaber.languagetool.AnalyzedSentence;
+import de.danielnaber.languagetool.AnalyzedTokenReadings;
+import de.danielnaber.languagetool.JLanguageTool;
+import de.danielnaber.languagetool.rules.Category;
+import de.danielnaber.languagetool.rules.RuleMatch;
+import de.danielnaber.languagetool.tagging.de.AnalyzedGermanToken;
+import de.danielnaber.languagetool.tagging.de.AnalyzedGermanTokenReadings;
+import de.danielnaber.languagetool.tagging.de.GermanTagger;
+import de.danielnaber.languagetool.tagging.de.GermanToken;
+import de.danielnaber.languagetool.tagging.de.GermanToken.POSType;
+import de.danielnaber.languagetool.tools.StringTools;
+
+/**
+ * Simple agreement checker for German noun phrases. Checks agreement in:
+ *
+ * <ul>
+ * <li>DET/PRO NOUN: e.g. "mein Auto", "der Mann", "die Frau" (correct), "die Haus" (incorrect)</li>
+ * <li>DET/PRO ADJ NOUN: e.g. "der riesige Tisch" (correct), "die riesigen Tisch" (incorrect)</li>
+ * </ul>
+ *
+ * Note that this rule only checks agreement inside the noun phrase, not whether
+ * e.g. the correct case is used. For example, "Es ist das Haus dem Mann" is not
+ * detected as incorrect.
+ *
+ * @author Daniel Naber
+ */
+public class AgreementRule extends GermanRule {
+
+ // Names of the three agreement categories. makeString() leaves a category out
+ // of its result when the category's name is contained in its 'omit' set.
+ private static final String KASUS = "Kasus";
+ private static final String NUMERUS = "Numerus";
+ private static final String GENUS = "Genus";
+
+ /*
+ * City names are incoherently tagged in the Morphy data. To avoid
+ * false alarms on phrases like "das Berliner Auto" we have to
+ * explicitly add these adjective readings to "Berliner" and to all
+ * other potential city names:
+ *
+ * (maybeAddAdjectiveReadings() creates one reading per entry of this array.)
+ */
+ private static final String[] ADJ_READINGS = new String[] {
+ // singular:
+ "ADJ:NOM:SIN:MAS:GRU", "ADJ:NOM:SIN:NEU:GRU", "ADJ:NOM:SIN:FEM:GRU", // das Berliner Auto
+ "ADJ:GEN:SIN:MAS:GRU", "ADJ:GEN:SIN:NEU:GRU", "ADJ:GEN:SIN:FEM:GRU", // des Berliner Autos
+ "ADJ:DAT:SIN:MAS:GRU", "ADJ:DAT:SIN:NEU:GRU", "ADJ:DAT:SIN:FEM:GRU", // dem Berliner Auto
+ "ADJ:AKK:SIN:MAS:GRU", "ADJ:AKK:SIN:NEU:GRU", "ADJ:AKK:SIN:FEM:GRU", // den Berliner Bewohner
+ // plural:
+ "ADJ:NOM:PLU:MAS:GRU", "ADJ:NOM:PLU:NEU:GRU", "ADJ:NOM:PLU:FEM:GRU", // die Berliner Autos
+ "ADJ:GEN:PLU:MAS:GRU", "ADJ:GEN:PLU:NEU:GRU", "ADJ:GEN:PLU:FEM:GRU", // der Berliner Autos
+ "ADJ:DAT:PLU:MAS:GRU", "ADJ:DAT:PLU:NEU:GRU", "ADJ:DAT:PLU:FEM:GRU", // den Berliner Autos
+ "ADJ:AKK:PLU:MAS:GRU", "ADJ:AKK:PLU:NEU:GRU", "ADJ:AKK:PLU:FEM:GRU", // den Berliner Bewohnern
+ };
+
+
+ private static final Set<String> REL_PRONOUN = new HashSet<String>();
+ static {
+ REL_PRONOUN.add("der");
+ REL_PRONOUN.add("die");
+ REL_PRONOUN.add("das");
+ REL_PRONOUN.add("dessen");
+ REL_PRONOUN.add("deren");
+ REL_PRONOUN.add("dem");
+ REL_PRONOUN.add("den");
+ REL_PRONOUN.add("welche");
+ REL_PRONOUN.add("welcher");
+ REL_PRONOUN.add("welchen");
+ REL_PRONOUN.add("welchem");
+ REL_PRONOUN.add("welches");
+ }
+
+ /**
+  * Lowercase prepositions that couldBeRelativeClause() accepts between a comma
+  * and a relative pronoun.
+  */
+ private static final Set<String> PREPOSITIONS = new HashSet<String>();
+ static {
+   // TODO: add more
+   final String[] prepositions = { "in", "auf", "an", "ab", "für", "zu" };
+   for (final String preposition : prepositions) {
+     PREPOSITIONS.add(preposition);
+   }
+ }
+
+ public AgreementRule(final ResourceBundle messages) {
+ if (messages != null)
+ super.setCategory(new Category(messages.getString("category_grammar")));
+ }
+
+ /**
+  * @return the ID of this rule, <code>DE_AGREEMENT</code>
+  */
+ public String getId() {
+ return "DE_AGREEMENT";
+ }
+
+ /**
+  * @return the German description of this rule (agreement inside noun phrases,
+  *         marked as incomplete, with an example)
+  */
+ public String getDescription() {
+ return "Kongruenz von Nominalphrasen (unvollständig!), z.B. 'mein kleiner(kleines) Haus'";
+ }
+
+ /**
+  * Looks for determiners (and relevant pronouns) that are directly followed by
+  * a noun, or by an adjective and a noun, and checks those two or three tokens
+  * for agreement.
+  *
+  * Tokens are skipped when couldBeRelativeClause() is true for them, when the
+  * token is "eine" after "der"/"die"/"das", or when the token is "nichts".
+  *
+  * @param text the analyzed sentence; its tokens are cast to
+  *        AnalyzedGermanTokenReadings
+  * @return the matches found, converted with toRuleMatchArray()
+  */
+ public RuleMatch[] match(final AnalyzedSentence text) {
+   final List<RuleMatch> ruleMatches = new ArrayList<RuleMatch>();
+   final AnalyzedTokenReadings[] tokens = text.getTokensWithoutWhitespace();
+   for (int i = 0; i < tokens.length; i++) {
+     //defaulting to the first reading
+     //TODO: check for all readings
+     //and replace GermanTokenReading
+     final String posToken = tokens[i].getAnalyzedToken(0).getPOSTag();
+     if (posToken != null && posToken.equals(JLanguageTool.SENTENCE_START_TAGNAME)) {
+       continue;
+     }
+
+     final AnalyzedGermanTokenReadings analyzedToken = (AnalyzedGermanTokenReadings)tokens[i];
+     final boolean relevantPronoun = isRelevantPronoun(tokens, i);
+
+     boolean ignore = couldBeRelativeClause(tokens, i);
+     if (i > 0) {
+       final String prevToken = tokens[i-1].getToken().toLowerCase();
+       if ((prevToken.equals("der") || prevToken.equals("die") || prevToken.equals("das"))
+           && tokens[i].getToken().equals("eine")) {
+         // TODO: "der eine Polizist" -> don't ignore, check "der Polizist" instead
+         ignore = true;
+       }
+     }
+
+     // avoid false alarm on "nichts Gutes":
+     if (analyzedToken.getToken().equals("nichts")) {
+       ignore = true;
+     }
+
+     if ((analyzedToken.hasReadingOfType(POSType.DETERMINER) || relevantPronoun) && !ignore) {
+       int tokenPos = i + 1;
+       if (tokenPos >= tokens.length) {
+         break;
+       }
+       AnalyzedGermanTokenReadings nextToken = (AnalyzedGermanTokenReadings)tokens[tokenPos];
+       nextToken = maybeAddAdjectiveReadings(nextToken, tokens, tokenPos);
+       if (nextToken.hasReadingOfType(POSType.ADJEKTIV)) {
+         tokenPos = i + 2;
+         if (tokenPos >= tokens.length) {
+           break;
+         }
+         final AnalyzedGermanTokenReadings nextNextToken = (AnalyzedGermanTokenReadings)tokens[tokenPos];
+         if (nextNextToken.hasReadingOfType(POSType.NOMEN)) {
+           // TODO: add a case (checkAdjNounAgreement) for special cases like "deren",
+           // e.g. "deren komisches Geschenke" isn't yet detected as incorrect
+           final RuleMatch ruleMatch = checkDetAdjNounAgreement(analyzedToken, nextToken, nextNextToken);
+           if (ruleMatch != null) {
+             ruleMatches.add(ruleMatch);
+           }
+         }
+       } else if (nextToken.hasReadingOfType(POSType.NOMEN)) {
+         // deliberately pass the token as found in the sentence, not the result
+         // of maybeAddAdjectiveReadings():
+         final RuleMatch ruleMatch = checkDetNounAgreement(analyzedToken,
+             (AnalyzedGermanTokenReadings)tokens[i+1]);
+         if (ruleMatch != null) {
+           ruleMatches.add(ruleMatch);
+         }
+       }
+     }
+   }
+   return toRuleMatchArray(ruleMatches);
+ }
+
+ private boolean isRelevantPronoun(AnalyzedTokenReadings[] tokens, int pos) {
+ final AnalyzedGermanTokenReadings analyzedToken = (AnalyzedGermanTokenReadings)tokens[pos];
+ boolean relevantPronoun = analyzedToken.hasReadingOfType(POSType.PRONOMEN);
+ // avoid false alarms:
+ final String token = tokens[pos].getToken();
+ if (pos > 0 && tokens[pos-1].getToken().equalsIgnoreCase("vor") && tokens[pos].getToken().equalsIgnoreCase("allem"))
+ relevantPronoun = false;
+ else if (token.equalsIgnoreCase("er") || token.equalsIgnoreCase("sie") || token.equalsIgnoreCase("es"))
+ relevantPronoun = false;
+ else if (token.equalsIgnoreCase("ich"))
+ relevantPronoun = false;
+ else if (token.equalsIgnoreCase("du"))
+ relevantPronoun = false;
+ else if (token.equalsIgnoreCase("dessen")) // avoid false alarm on: "..., dessen Leiche"
+ relevantPronoun = false;
+ else if (token.equalsIgnoreCase("deren"))
+ relevantPronoun = false;
+ else if (token.equalsIgnoreCase("sich")) // avoid false alarm
+ relevantPronoun = false;
+ else if (token.equalsIgnoreCase("unser")) // avoid false alarm "unser Produkt": TODO!
+ relevantPronoun = false;
+ else if (token.equalsIgnoreCase("aller"))
+ relevantPronoun = false;
+ else if (token.equalsIgnoreCase("man"))
+ relevantPronoun = false;
+ else if (token.equalsIgnoreCase("beiden"))
+ relevantPronoun = false;
+ else if (token.equalsIgnoreCase("wessen"))
+ relevantPronoun = false;
+ else if (token.equalsIgnoreCase("a"))
+ relevantPronoun = false;
+ else if (token.equalsIgnoreCase("alle"))
+ relevantPronoun = false;
+ else if (token.equalsIgnoreCase("etwas")) // TODO: doesn't have case -- but don't just ignore
+ relevantPronoun = false;
+ else if (token.equalsIgnoreCase("was")) // TODO: doesn't have case -- but don't just ignore
+ relevantPronoun = false;
+ else if (token.equalsIgnoreCase("wer"))
+ relevantPronoun = false;
+ return relevantPronoun;
+ }
+
+ // see the comment at ADJ_READINGS:
+ /**
+  * Returns a replacement for <code>nextToken</code> that carries all readings
+  * listed in ADJ_READINGS if the token looks like a city-name adjective
+  * (e.g. "Berliner"); otherwise returns <code>nextToken</code> unchanged.
+  *
+  * Only tokens ending in "er" that are followed by at least one more token
+  * are considered.
+  *
+  * @param nextToken the token that follows the determiner
+  * @param tokens the tokens of the sentence
+  * @param tokenPos index of <code>nextToken</code> in <code>tokens</code>
+  * @return a new token with only the ADJ_READINGS readings and the start
+  *         position of <code>nextToken</code>, or <code>nextToken</code> itself
+  * @throws RuntimeException wrapping an IOException thrown by the tagger lookup
+  */
+ private AnalyzedGermanTokenReadings maybeAddAdjectiveReadings(AnalyzedGermanTokenReadings nextToken,
+ AnalyzedTokenReadings[] tokens, int tokenPos) {
+ final String nextTerm = nextToken.getToken();
+ // Just a heuristic: nouns and proper nouns that end with "er" are considered
+ // city names:
+ if (nextTerm.endsWith("er") && tokens.length > tokenPos+1) {
+ final AnalyzedGermanTokenReadings nextNextToken = (AnalyzedGermanTokenReadings)tokens[tokenPos+1];
+ // NOTE(review): a new GermanTagger is created on every call that gets here;
+ // whether that is expensive depends on GermanTagger, which is not visible here.
+ final GermanTagger tagger = new GermanTagger();
+ try {
+ // look up the term without its "er" suffix, e.g. "Berlin" for "Berliner":
+ final AnalyzedGermanTokenReadings nextATR = tagger.lookup(nextTerm.substring(0, nextTerm.length()-2));
+ final AnalyzedGermanTokenReadings nextNextATR = tagger.lookup(nextNextToken.getToken());
+ //System.err.println("nextATR: " + nextATR);
+ //System.err.println("nextNextATR: " + nextNextATR);
+ // "Münchner": special case as cutting off last two characters doesn't produce city name:
+ // NOTE(review): '&&' binds tighter than '||', so the condition below reads
+ // "PROPER_NOUN, or (NOMEN and the following word is a noun)". A proper-noun
+ // stem therefore matches regardless of the following word -- confirm that
+ // this is the intended grouping.
+ if ("Münchner".equals(nextTerm) ||
+ (nextATR != null &&
+ // tagging in Morphy for cities is not coherent:
+ (nextATR.hasReadingOfType(POSType.PROPER_NOUN) || nextATR.hasReadingOfType(POSType.NOMEN) &&
+ nextNextATR != null && nextNextATR.hasReadingOfType(POSType.NOMEN)))) {
+ // build one adjective reading per ADJ_READINGS entry; the original readings
+ // of nextToken are not carried over:
+ final AnalyzedGermanToken[] adjReadings = new AnalyzedGermanToken[ADJ_READINGS.length];
+ for (int j = 0; j < ADJ_READINGS.length; j++) {
+ adjReadings[j] = new AnalyzedGermanToken(nextTerm, ADJ_READINGS[j], null);
+ }
+ nextToken = new AnalyzedGermanTokenReadings(adjReadings, nextToken.getStartPos());
+ }
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+ }
+ return nextToken;
+ }
+
+ // TODO: improve this so it only returns true for real relative clauses
+ /**
+  * Heuristic that tells whether the token at <code>pos</code> might start a
+  * relative clause: it is a word from REL_PRONOUN that follows either a comma
+  * or a comma plus a word from PREPOSITIONS.
+  *
+  * @param tokens the tokens of the sentence
+  * @param pos index of the token to look at
+  * @return <code>true</code> if one of the two patterns matches
+  */
+ private boolean couldBeRelativeClause(AnalyzedTokenReadings[] tokens, int pos) {
+   if (pos < 1) {
+     return false;
+   }
+   final boolean isRelPronoun = REL_PRONOUN.contains(tokens[pos].getToken().toLowerCase());
+   // avoid false alarm: "Das Wahlrecht, das Frauen zugesprochen bekamen." etc:
+   final boolean commaBefore = tokens[pos-1].getToken().equals(",");
+   if (commaBefore && isRelPronoun) {
+     return true;
+   }
+   if (pos < 2) {
+     return false;
+   }
+   // avoid false alarm: "Der Mann, in dem quadratische Fische schwammen."
+   final boolean commaTwoBefore = tokens[pos-2].getToken().equals(",");
+   final boolean prepositionBefore = PREPOSITIONS.contains(tokens[pos-1].getToken().toLowerCase());
+   return commaTwoBefore && prepositionBefore && isRelPronoun;
+ }
+
+ /**
+  * Checks a determiner and the noun that follows it for agreement.
+  *
+  * @param token1 the determiner (or pronoun)
+  * @param token2 the noun
+  * @return a match from the start of <code>token1</code> to the end of
+  *         <code>token2</code> if the two tokens share no agreement category,
+  *         otherwise <code>null</code>
+  */
+ private RuleMatch checkDetNounAgreement(final AnalyzedGermanTokenReadings token1,
+     final AnalyzedGermanTokenReadings token2) {
+   // avoid false alarm: "Gebt ihm Macht."
+   if (token1.getToken().equalsIgnoreCase("ihm")) {
+     return null;
+   }
+   final Set<String> detCategories = getAgreementCategories(token1);
+   if (detCategories == null) {
+     return null;  // word not known, assume it's correct
+   }
+   final Set<String> nounCategories = getAgreementCategories(token2);
+   if (nounCategories == null) {
+     return null;
+   }
+   // keep only the categories both tokens have in common:
+   detCategories.retainAll(nounCategories);
+   if (!detCategories.isEmpty()) {
+     return null;
+   }
+   // TODO: better error message than just 'agreement error'
+   final String msg = "Möglicherweise fehlende Übereinstimmung (Kongruenz) zwischen Artikel und Nomen " +
+       "bezüglich Kasus, Numerus oder Genus. Beispiel: 'meine Haus' statt 'mein Haus'";
+   final int fromPos = token1.getStartPos();
+   final int toPos = token2.getStartPos()+token2.getToken().length();
+   return new RuleMatch(this, fromPos, toPos, msg);
+ }
+
+ /**
+  * Checks a determiner, an adjective and a noun for agreement.
+  *
+  * @param token1 the determiner (or pronoun)
+  * @param token2 the adjective
+  * @param token3 the noun
+  * @return a match from the start of <code>token1</code> to the end of
+  *         <code>token3</code> if the three tokens share no agreement category;
+  *         <code>null</code> if they do, or if retainCommonCategories() returns
+  *         <code>null</code> (word not known, assumed correct)
+  */
+ private RuleMatch checkDetAdjNounAgreement(final AnalyzedGermanTokenReadings token1,
+     final AnalyzedGermanTokenReadings token2, final AnalyzedGermanTokenReadings token3) {
+   final Set<String> relax = new HashSet<String>();
+   final Set<String> set = retainCommonCategories(token1, token2, token3, relax);
+   if (set == null) {
+     // retainCommonCategories() uses null for "word not known"; treat that as
+     // correct, like checkDetNounAgreement() does, instead of dereferencing it
+     return null;
+   }
+   if (!set.isEmpty()) {
+     return null;
+   }
+   // TODO: more detailed error message: call retainCommonCategories() again with
+   // KASUS, NUMERUS or GENUS in 'relax'; a non-empty result then shows which
+   // category is the one that doesn't agree
+   final String msg = "Möglicherweise fehlende Übereinstimmung (Kongruenz) zwischen Artikel, Adjektiv und " +
+       "Nomen bezüglich Kasus, Numerus oder Genus. Beispiel: 'mein kleiner Haus' " +
+       "statt 'mein kleines Haus'";
+   return new RuleMatch(this, token1.getStartPos(),
+       token3.getStartPos()+token3.getToken().length(), msg);
+ }
+
+ /**
+  * Computes the agreement categories that all three tokens have in common.
+  *
+  * @param token1 first token
+  * @param token2 second token
+  * @param token3 third token
+  * @param relax names of categories to leave out, passed on to getAgreementCategories()
+  * @return the common categories (possibly empty), or <code>null</code> if
+  *         getAgreementCategories() returns <code>null</code> for one of the tokens
+  */
+ private Set<String> retainCommonCategories(final AnalyzedGermanTokenReadings token1,
+     final AnalyzedGermanTokenReadings token2, final AnalyzedGermanTokenReadings token3,
+     Set<String> relax) {
+   final AnalyzedGermanTokenReadings[] phrase = { token1, token2, token3 };
+   Set<String> common = null;
+   for (final AnalyzedGermanTokenReadings phraseToken : phrase) {
+     final Set<String> categories = getAgreementCategories(phraseToken, relax);
+     if (categories == null) {
+       return null;  // word not known, assume it's correct
+     }
+     if (common == null) {
+       common = categories;
+     } else {
+       common.retainAll(categories);
+     }
+   }
+   return common;
+ }
+
+ /**
+  * Same as the two-argument variant, called with an empty <code>omit</code> set.
+  */
+ private Set<String> getAgreementCategories(final AnalyzedGermanTokenReadings aToken) {
+ return getAgreementCategories(aToken, new HashSet<String>());
+ }
+
+ /** Return Kasus, Numerus, Genus. */
+ private Set<String> getAgreementCategories(final AnalyzedGermanTokenReadings aToken, Set<String> omit) {
+ final Set<String> set = new HashSet<String>();
+ final List<AnalyzedGermanToken> readings = aToken.getGermanReadings();
+ for (AnalyzedGermanToken reading : readings) {
+ if (reading.getCasus() == null && reading.getNumerus() == null &&
+ reading.getGenus() == null)
+ continue;
+ if (reading.getGenus() == null) {
+ // "ich" and "wir" contains genus=ALG in the original data. Not sure if
+ // this is allowed, but expand this so "Ich Arbeiter" doesn't get flagged
+ // as incorrect:
+ set.add(makeString(reading.getCasus(), reading.getNumerus(), GermanToken.Genus.MASKULINUM, omit));
+ set.add(makeString(reading.getCasus(), reading.getNumerus(), GermanToken.Genus.FEMININUM, omit));
+ set.add(makeString(reading.getCasus(), reading.getNumerus(), GermanToken.Genus.NEUTRUM, omit));
+ } else {
+ set.add(makeString(reading.getCasus(), reading.getNumerus(), reading.getGenus(), omit));
+ }
+ }
+ return set;
+ }
+
+ /**
+  * Joins case, number and gender into one string, separated by "/".
+  * A value is left out if it is <code>null</code> or if the name of its
+  * category (KASUS, NUMERUS, GENUS) is contained in <code>omit</code>.
+  *
+  * @param casus the case, may be <code>null</code>
+  * @param num the number, may be <code>null</code>
+  * @param gen the gender, may be <code>null</code>
+  * @param omit names of the categories to leave out
+  * @return the remaining values, joined by StringTools.listToString()
+  */
+ private String makeString(GermanToken.Kasus casus, GermanToken.Numerus num, GermanToken.Genus gen,
+     Set<String> omit) {
+   final Object[] values = { casus, num, gen };
+   final String[] categoryNames = { KASUS, NUMERUS, GENUS };
+   final List<String> parts = new ArrayList<String>();
+   for (int k = 0; k < values.length; k++) {
+     if (values[k] != null && !omit.contains(categoryNames[k])) {
+       parts.add(values[k].toString());
+     }
+   }
+   return StringTools.listToString(parts, "/");
+ }
+
+ /** Does nothing. */
+ public void reset() {
+ }
+
+}
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/de/CaseRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/de/CaseRule.java
new file mode 100644
index 0000000..663e9ff
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/de/CaseRule.java
@@ -0,0 +1,358 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+package de.danielnaber.languagetool.rules.de;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.ResourceBundle;
+import java.util.Set;
+
+import de.danielnaber.languagetool.AnalyzedSentence;
+import de.danielnaber.languagetool.AnalyzedTokenReadings;
+import de.danielnaber.languagetool.JLanguageTool;
+import de.danielnaber.languagetool.rules.Category;
+import de.danielnaber.languagetool.rules.RuleMatch;
+import de.danielnaber.languagetool.tagging.de.AnalyzedGermanToken;
+import de.danielnaber.languagetool.tagging.de.AnalyzedGermanTokenReadings;
+import de.danielnaber.languagetool.tagging.de.GermanTagger;
+import de.danielnaber.languagetool.tagging.de.GermanToken;
+import de.danielnaber.languagetool.tagging.de.GermanToken.POSType;
+import de.danielnaber.languagetool.tools.StringTools;
+
+/**
+ * Check that adjectives and verbs are not written with an uppercase
+ * first letter (except at the start of a sentence) and cases
+ * like this: <tt>Das laufen f&auml;llt mir leicht.</tt> (<tt>laufen</tt> needs
+ * to be uppercased).
+ *
+ * @author Daniel Naber
+ */
+public class CaseRule extends GermanRule {
+
+ // used by match() to look up readings of lowercased and re-capitalized word forms
+ private final GermanTagger tagger = new GermanTagger();
+
+ // if a verb follows one of these words, it is probably a nominalized verb
+ // and must therefore be written with an uppercase first letter:
+ private static final Set<String> nounIndicators = new HashSet<String>();
+ static {
+ nounIndicators.add("das");
+ nounIndicators.add("sein");
+ //indicator.add("seines"); // TODO: ?
+ //nounIndicators.add("ihr"); // would cause false alarm e.g. "Auf ihr stehen die Ruinen..."
+ nounIndicators.add("mein");
+ nounIndicators.add("dein");
+ nounIndicators.add("euer");
+ //indicator.add("ihres");
+ //indicator.add("ihren");
+ }
+
+ /**
+  * Tokens (opening parenthesis, colon, quotation marks) after which match()
+  * does not report an uppercase word.
+  */
+ private static final Set<String> sentenceStartExceptions = new HashSet<String>();
+ static {
+   final String[] startTokens = { "(", ":", "\"", "'", "„", "“", "«", "»" };
+   for (final String startToken : startTokens) {
+     sentenceStartExceptions.add(startToken);
+   }
+ }
+
+ // Words that match() does not report as wrongly uppercased. The comparison is
+ // case-sensitive.
+ // NOTE(review): match() only consults this set for tokens that start with an
+ // uppercase letter, so the lowercase entries ("tun", "besonderes") look
+ // ineffective there -- confirm whether they are used anywhere else.
+ private static final Set<String> exceptions = new HashSet<String>();
+ static {
+ exceptions.add("Für"); // "das Für und Wider"
+ exceptions.add("Wider"); // "das Für und Wider"
+ exceptions.add("Nachts"); // "des Nachts", "eines Nachts"
+ exceptions.add("Genüge");
+ exceptions.add("Zusage");
+ exceptions.add("Nachfrage");
+ exceptions.add("Sachverständiger");
+ exceptions.add("Nr");
+ exceptions.add("Sankt");
+ exceptions.add("Toter");
+ exceptions.add("Verantwortlicher");
+ exceptions.add("Wichtiges");
+ exceptions.add("Dr");
+ exceptions.add("Prof");
+ exceptions.add("Mr");
+ exceptions.add("Mrs");
+ exceptions.add("De"); // "De Morgan" etc
+ exceptions.add("Le"); // "Le Monde" etc
+ exceptions.add("Ihr");
+ exceptions.add("Ihre");
+ exceptions.add("Ihres");
+ exceptions.add("Ihren");
+ exceptions.add("Ihnen");
+ exceptions.add("Ihrem");
+ exceptions.add("Ihrer");
+ exceptions.add("Sie");
+ exceptions.add("Aus"); // "vor dem Aus stehen"
+ exceptions.add("Oder"); // the river
+ exceptions.add("tun"); // "Sie müssen das tun"
+ exceptions.add("St"); // Paris St. Germain
+ exceptions.add("Las"); // Las Vegas, not a form of "lesen"
+ exceptions.add("Folgendes"); // uppercase depending on context (TODO)...
+ exceptions.add("besonderes"); // uppercase depending on context (TODO): "etwas Besonderes"
+ exceptions.add("Hundert"); // uppercase depending on context (TODO)
+ exceptions.add("Tausend"); // uppercase depending on context (TODO)
+ exceptions.add("Übrigen"); // uppercase depending on context (TODO), e.g. "im Übrigen"
+ exceptions.add("Unvorhergesehenes"); // uppercase depending on context (TODO), e.g. "etwas Unvorhergesehenes"
+
+ exceptions.add("Englisch"); // TODO: all languages
+ exceptions.add("Deutsch");
+ exceptions.add("Französisch");
+ exceptions.add("Spanisch");
+ exceptions.add("Italienisch");
+ exceptions.add("Portugiesisch");
+ exceptions.add("Dänisch");
+ exceptions.add("Norwegisch");
+ exceptions.add("Schwedisch");
+ exceptions.add("Finnisch");
+ exceptions.add("Holländisch");
+ exceptions.add("Niederländisch");
+ exceptions.add("Polnisch");
+ exceptions.add("Tschechisch");
+ exceptions.add("Arabisch");
+ exceptions.add("Persisch");
+
+ exceptions.add("Schuld");
+ exceptions.add("Erwachsener");
+ exceptions.add("Jugendlicher");
+ exceptions.add("Link");
+ exceptions.add("Ausdrücke");
+ exceptions.add("Landwirtschaft");
+ exceptions.add("Flöße");
+ exceptions.add("Wild");
+ exceptions.add("Vorsitzender");
+ exceptions.add("Mrd");
+ exceptions.add("Links");
+ // the 2006 revision of the spelling reform allows uppercase spelling here:
+ exceptions.add("Du");
+ exceptions.add("Dir");
+ exceptions.add("Dich");
+ exceptions.add("Deine");
+ exceptions.add("Deinen");
+ exceptions.add("Deinem");
+ exceptions.add("Deines");
+ exceptions.add("Deiner");
+ exceptions.add("Euch");
+
+ exceptions.add("Neuem");
+ exceptions.add("Weitem");
+ exceptions.add("Weiteres");
+ exceptions.add("Langem");
+ exceptions.add("Längerem");
+ exceptions.add("Kurzem");
+ exceptions.add("Schwarzes"); // Schwarzes Brett
+ exceptions.add("Goldener"); // Goldener Schnitt
+ // TODO: add more exceptions here
+ }
+
+ private static final Set<String> myExceptionPhrases = new HashSet<String>();
+ static {
+ // use proper upper/lowercase spelling here:
+ myExceptionPhrases.add("ohne Wenn und Aber");
+ myExceptionPhrases.add("Große Koalition");
+ myExceptionPhrases.add("Großen Koalition");
+ myExceptionPhrases.add("im Großen und Ganzen");
+ myExceptionPhrases.add("Im Großen und Ganzen");
+ myExceptionPhrases.add("im Guten wie im Schlechten");
+ myExceptionPhrases.add("Im Guten wie im Schlechten");
+ }
+
+ // Lowercase words that match() does not report as nominalized verbs even
+ // when they follow one of the nounIndicators. The comparison is case-sensitive.
+ private static final Set<String> substVerbenExceptions = new HashSet<String>();
+ static {
+ substVerbenExceptions.add("gehören");
+ substVerbenExceptions.add("bedeutet"); // "und das bedeutet..."
+ substVerbenExceptions.add("ermöglicht"); // "und das ermöglicht..."
+ substVerbenExceptions.add("sollen");
+ substVerbenExceptions.add("werden");
+ substVerbenExceptions.add("dürfen");
+ substVerbenExceptions.add("müssen");
+ substVerbenExceptions.add("so");
+ substVerbenExceptions.add("ist");
+ substVerbenExceptions.add("können");
+ substVerbenExceptions.add("muss");
+ substVerbenExceptions.add("muß");
+ substVerbenExceptions.add("wollen");
+ substVerbenExceptions.add("habe");
+ substVerbenExceptions.add("ein"); // not the verb "einen"
+ substVerbenExceptions.add("tun"); // "...dann wird er das tun."
+ substVerbenExceptions.add("bestätigt");
+ substVerbenExceptions.add("bestätigte");
+ substVerbenExceptions.add("bestätigten");
+ substVerbenExceptions.add("bekommen");
+ }
+
+ public CaseRule(final ResourceBundle messages) {
+ if (messages != null)
+ super.setCategory(new Category(messages.getString("category_case")));
+ }
+
+ /**
+  * @return the ID of this rule, <code>DE_CASE</code>
+  */
+ public String getId() {
+ return "DE_CASE";
+ }
+
+ /**
+  * @return the German description of this rule (uppercase spelling of nouns
+  *         and nominalized verbs)
+  */
+ public String getDescription() {
+ return "Großschreibung von Nomen und substantivierten Verben";
+ }
+
+ /**
+  * Reports two kinds of problems, both with a suggested replacement:
+  * <ul>
+  * <li>a lowercase word after one of the nounIndicators that is taken to be
+  * a nominalized verb and should therefore start with an uppercase letter</li>
+  * <li>an uppercase word that has no noun or proper noun reading and is not
+  * covered by one of the exception lists</li>
+  * </ul>
+  * The sentence start token and the token at index 1 are never reported.
+  *
+  * @param text the analyzed sentence; its tokens are cast to
+  *        AnalyzedGermanTokenReadings
+  * @return the matches found, converted with toRuleMatchArray()
+  * @throws RuntimeException wrapping an IOException thrown by a tagger lookup
+  */
+ public RuleMatch[] match(final AnalyzedSentence text) {
+ final List<RuleMatch> ruleMatches = new ArrayList<RuleMatch>();
+ final AnalyzedTokenReadings[] tokens = text.getTokensWithoutWhitespace();
+
+ // NOTE(review): 'pos' is only ever incremented, its value is never read
+ int pos = 0;
+ // true if the previous token is one of the nounIndicators (not only "das"):
+ boolean prevTokenIsDas = false;
+ for (int i = 0; i < tokens.length; i++) {
+ //FIXME: defaulting to the first analysis
+ //don't know if it's safe
+ final String posToken = tokens[i].getAnalyzedToken(0).getPOSTag();
+ if (posToken != null && posToken.equals(JLanguageTool.SENTENCE_START_TAGNAME))
+ continue;
+ if (i == 1) { // don't care about first word, UppercaseSentenceStartRule does this already
+ // ...but remember whether it is a noun indicator for the token that follows:
+ if (nounIndicators.contains(tokens[i].getToken().toLowerCase())) {
+ prevTokenIsDas = true;
+ }
+ continue;
+ }
+ final AnalyzedGermanTokenReadings analyzedToken = (AnalyzedGermanTokenReadings)tokens[i];
+ final String token = analyzedToken.getToken();
+ List<AnalyzedGermanToken> readings = analyzedToken.getGermanReadings();
+ AnalyzedGermanTokenReadings analyzedGermanToken2 = null;
+
+ // "base form" here means: more than one reading, and the token text equals
+ // the lemma of the first reading
+ boolean isBaseform = false;
+ if (analyzedToken.getReadingsLength() > 1 && token.equals(analyzedToken.getAnalyzedToken(0).getLemma())) {
+ isBaseform = true;
+ }
+ if ((readings == null || analyzedToken.getAnalyzedToken(0).getPOSTag() == null || analyzedToken.hasReadingOfType(GermanToken.POSType.VERB))
+ && isBaseform) {
+ // no match, e.g. for "Groß": try if there's a match for the lowercased word:
+
+ try {
+ analyzedGermanToken2 = tagger.lookup(token.toLowerCase());
+ if (analyzedGermanToken2 != null) {
+ readings = analyzedGermanToken2.getGermanReadings();
+ }
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+ if (prevTokenIsDas) {
+ // e.g. essen -> Essen
+ final String newToken = StringTools.uppercaseFirstChar(token);
+ try {
+ // NOTE(review): the result of this lookup is never read -- the variable
+ // is assigned again further down before its next use
+ analyzedGermanToken2 = tagger.lookup(newToken);
+ //analyzedGermanToken2.hasReadingOfType(GermanToken.POSType.VERB)
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+ if (Character.isLowerCase(token.charAt(0)) && !substVerbenExceptions.contains(token)) {
+ final String msg = "Substantivierte Verben werden groß geschrieben.";
+ final RuleMatch ruleMatch = new RuleMatch(this, tokens[i].getStartPos(),
+ tokens[i].getStartPos()+token.length(), msg);
+ final String word = tokens[i].getToken();
+ final String fixedWord = StringTools.uppercaseFirstChar(word);
+ ruleMatch.setSuggestedReplacement(fixedWord);
+ ruleMatches.add(ruleMatch);
+ }
+ }
+ }
+ // update the flag for the next token before any of the 'continue's below:
+ prevTokenIsDas = nounIndicators.contains(tokens[i].getToken().toLowerCase());
+ if (readings == null)
+ continue;
+ final boolean hasNounReading = analyzedToken.hasReadingOfType(GermanToken.POSType.NOMEN);
+ if (hasNounReading) // it's the spell checker's task to check that nouns are uppercase
+ continue;
+ try {
+ // TODO: this lookup should only happen once:
+ analyzedGermanToken2 = tagger.lookup(token.toLowerCase());
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+ // skip words that are known neither as written nor in lowercase:
+ if (analyzedToken.getAnalyzedToken(0).getPOSTag() == null && analyzedGermanToken2 == null) {
+ continue;
+ }
+ if (analyzedToken.getAnalyzedToken(0).getPOSTag() == null && analyzedGermanToken2 != null
+ && analyzedGermanToken2.getAnalyzedToken(0).getPOSTag() == null) {
+ // unknown word, probably a name etc
+ continue;
+ }
+
+ // NOTE(review): charAt(0) is evaluated before the length check, so an empty
+ // token would throw here -- presumably tokens are never empty; confirm.
+ // tokens[i-1] relies on i >= 2, which holds when token 0 is the sentence
+ // start token (skipped above) and token 1 is skipped by the i == 1 branch.
+ if (Character.isUpperCase(token.charAt(0)) &&
+ token.length() > 1 && // length limit = ignore abbreviations
+ !sentenceStartExceptions.contains(tokens[i-1].getToken()) &&
+ !StringTools.isAllUppercase(token) &&
+ !exceptions.contains(token) &&
+ !analyzedToken.hasReadingOfType(POSType.PROPER_NOUN) &&
+ !analyzedToken.isSentenceEnd() &&
+ !isExceptionPhrase(i, tokens)) {
+ final String msg = "Außer am Satzanfang werden nur Nomen und Eigennamen groß geschrieben";
+ final RuleMatch ruleMatch = new RuleMatch(this, tokens[i].getStartPos(),
+ tokens[i].getStartPos()+token.length(), msg);
+ final String word = tokens[i].getToken();
+ // suggest the same word with a lowercase first letter:
+ final String fixedWord = Character.toLowerCase(word.charAt(0)) + word.substring(1);
+ ruleMatch.setSuggestedReplacement(fixedWord);
+ ruleMatches.add(ruleMatch);
+ }
+ pos += token.length();
+ }
+ return toRuleMatchArray(ruleMatches);
+ }
+
+ /**
+  * Tells whether the token at index <code>i</code> is part of one of the
+  * phrases in myExceptionPhrases. For every phrase word that equals the token,
+  * the phrase is aligned with the sentence at that word and compared word by
+  * word using compareLists().
+  *
+  * @param i index of the token to look at
+  * @param tokens the tokens of the sentence
+  * @return <code>true</code> if a phrase matches the tokens around <code>i</code>
+  */
+ private boolean isExceptionPhrase(int i, AnalyzedTokenReadings[] tokens) {
+   // TODO: speed up?
+   for (String phrase : myExceptionPhrases) {
+     final String[] words = phrase.split(" ");
+     for (int offset = 0; offset < words.length; offset++) {
+       if (!words[offset].equals(tokens[i].getToken())) {
+         continue;
+       }
+       // the phrase would have to start 'offset' tokens before the current one:
+       final int firstIndex = i-offset;
+       final int lastIndex = firstIndex+words.length-1;
+       if (compareLists(tokens, firstIndex, lastIndex, words)) {
+         return true;
+       }
+     }
+   }
+   return false;
+ }
+
+ /**
+  * Compares the tokens from <code>startIndex</code> to <code>endIndex</code>
+  * (inclusive) with <code>parts</code>, word by word and case-sensitively.
+  *
+  * @param tokens the tokens of the sentence
+  * @param startIndex index of the first token to compare
+  * @param endIndex index of the last token to compare
+  * @param parts the expected words
+  * @return <code>true</code> if every token in the range equals the word at the
+  *         same offset in <code>parts</code>; <code>false</code> otherwise, also
+  *         if the range is not completely inside <code>tokens</code> or is
+  *         longer than <code>parts</code>
+  */
+ private boolean compareLists(AnalyzedTokenReadings[] tokens, int startIndex, int endIndex, String[] parts) {
+   // A phrase that would start before the first token or end after the last
+   // one cannot match. Without the upper bound check, a sentence that ends in
+   // the middle of a phrase (e.g. "... im Guten wie im") ran past the end of
+   // the array.
+   if (startIndex < 0 || endIndex >= tokens.length) {
+     return false;
+   }
+   int i = 0;
+   for (int j = startIndex; j <= endIndex; j++) {
+     if (i >= parts.length) {
+       return false;
+     }
+     if (!tokens[j].getToken().equals(parts[i])) {
+       return false;
+     }
+     i++;
+   }
+   return true;
+ }
+
+ /** Does nothing. */
+ public void reset() {
+ // nothing
+ }
+
+} \ No newline at end of file
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/de/CompoundRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/de/CompoundRule.java
new file mode 100644
index 0000000..f180acc
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/de/CompoundRule.java
@@ -0,0 +1,53 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2006 Daniel Naber (http://www.danielnaber.de)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+package de.danielnaber.languagetool.rules.de;
+
+import java.io.IOException;
+import java.util.ResourceBundle;
+
+import de.danielnaber.languagetool.JLanguageTool;
+import de.danielnaber.languagetool.rules.AbstractCompoundRule;
+
+/**
+ * Checks that compounds (if in the list) are not written as separate words.
+ * The list is read from <code>/de/compounds.txt</code>; the messages are German.
+ *
+ * @author Daniel Naber
+ */
+public class CompoundRule extends AbstractCompoundRule {
+
+  private static final String FILE_NAME = "/de/compounds.txt";
+
+  /**
+   * Loads the compound list as UTF-8 and sets the short message and the three
+   * German messages of the rule.
+   *
+   * @param messages the message bundle, handed on unchanged to the superclass constructor
+   * @throws IOException declared because the superclass constructor or
+   *         loading the compound file may throw it
+   */
+  public CompoundRule(final ResourceBundle messages) throws IOException {
+    super(messages);
+    loadCompoundFile(JLanguageTool.getDataBroker().getFromResourceDirAsStream(FILE_NAME), "UTF-8");
+    super.setShort("Hyphenation problem");
+    final String withHyphenMsg = "Dieses Kompositum wird mit Bindestrich geschrieben.";
+    final String asOneWordMsg = "Dieses Kompositum wird zusammengeschrieben.";
+    final String eitherWayMsg = "Dieses Kompositum wird zusammen oder mit Bindestrich geschrieben.";
+    super.setMsg(withHyphenMsg, asOneWordMsg, eitherWayMsg);
+  }
+
+  /**
+   * @return the German description of this rule, with an example
+   */
+  public String getDescription() {
+    return "Zusammenschreibung von Komposita, z.B. 'CD-ROM' statt 'CD ROM'";
+  }
+
+  /**
+   * @return the ID of this rule, <code>DE_COMPOUNDS</code>
+   */
+  public String getId() {
+    return "DE_COMPOUNDS";
+  }
+}
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/de/DashRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/de/DashRule.java
new file mode 100644
index 0000000..18bb670
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/de/DashRule.java
@@ -0,0 +1,84 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+package de.danielnaber.languagetool.rules.de;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.ResourceBundle;
+
+import de.danielnaber.languagetool.AnalyzedSentence;
+import de.danielnaber.languagetool.AnalyzedTokenReadings;
+import de.danielnaber.languagetool.rules.Category;
+import de.danielnaber.languagetool.rules.RuleMatch;
+
+/**
+ * Checks that no space is inserted into hyphenated compounds
+ * (as in 'Di&auml;ten- Erh&ouml;hung').
+ *
+ * @author Daniel Naber
+ */
+public class DashRule extends GermanRule {
+
+  /**
+   * @param messages resource bundle providing the category name; may be
+   *        <code>null</code>, in which case no category is set
+   */
+  public DashRule(final ResourceBundle messages) {
+    if (messages != null) {
+      super.setCategory(new Category(messages.getString("category_misc")));
+    }
+  }
+
+  @Override
+  public String getId() {
+    return "DE_DASH";
+  }
+
+  @Override
+  public String getDescription() {
+    return "Keine Leerzeichen in Bindestrich-Komposita (wie z.B. in 'Diäten- Erhöhung')";
+  }
+
+  /**
+   * Reports every word that ends with a hyphen and is followed by a token
+   * starting with an uppercase letter.
+   *
+   * @param text the sentence to check
+   * @return the matches found, each spanning the hyphenated word plus one
+   *         following character and suggesting the word itself as replacement
+   */
+  @Override
+  public RuleMatch[] match(final AnalyzedSentence text) {
+    final List<RuleMatch> ruleMatches = new ArrayList<RuleMatch>();
+    final AnalyzedTokenReadings[] tokens = text.getTokensWithoutWhitespace();
+    String prevToken = null;
+    for (int i = 0; i < tokens.length; i++) {
+      if (tokens[i].isWhitespace()) {
+        // ignore
+        continue;
+      }
+      final String token = tokens[i].getToken();
+      // the length check keeps charAt(0) from failing on an empty token
+      if (isHyphenatedPrefix(prevToken) && token.length() > 0
+          && Character.isUpperCase(token.charAt(0))) {
+        final String msg = "Möglicherweise fehlt ein 'und' oder es wurde nach dem Wort " +
+            "ein überflüssiges Leerzeichen eingefügt.";
+        final int start = tokens[i - 1].getStartPos();
+        // "+ 1" extends the match by one character past the hyphen
+        final RuleMatch ruleMatch = new RuleMatch(this, start,
+            start + prevToken.length() + 1, msg);
+        ruleMatch.setSuggestedReplacement(tokens[i - 1].getToken());
+        ruleMatches.add(ruleMatch);
+      }
+      prevToken = token;
+    }
+    return toRuleMatchArray(ruleMatches);
+  }
+
+  /**
+   * Tells whether the given token is a word ending with a single hyphen,
+   * i.e. not a lone "-" and not something containing a double dash.
+   */
+  private static boolean isHyphenatedPrefix(final String word) {
+    return word != null && !word.equals("-") && word.indexOf("--") == -1
+        && word.indexOf("–-") == -1    // first char is some special kind of dash, found in Wikipedia
+        && word.endsWith("-");
+  }
+
+  @Override
+  public void reset() {
+    // nothing
+  }
+
+}
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/de/GermanLemmatizer.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/de/GermanLemmatizer.java
new file mode 100644
index 0000000..ddcac98
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/de/GermanLemmatizer.java
@@ -0,0 +1,84 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+package de.danielnaber.languagetool.rules.de;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.util.HashMap;
+import java.util.Map;
+
+import de.danielnaber.languagetool.JLanguageTool;
+
+/**
+ * Minimal German lemmatizer backed by a lookup table: it only knows the
+ * fullforms listed in <code>rules/de/fullform2baseform.txt</code>.
+ *
+ * @author Daniel Naber
+ */
+class GermanLemmatizer {
+
+  private static final String FILE_NAME = "/de/fullform2baseform.txt";
+  private static final String FILE_ENCODING = "utf-8";
+
+  private final Map<String, String> fullform2baseform;
+
+  /**
+   * Reads the lookup table from the rules directory.
+   *
+   * @throws IOException if the file cannot be read or contains a malformed line
+   */
+  GermanLemmatizer() throws IOException {
+    final InputStream wordList = JLanguageTool.getDataBroker().getFromRulesDirAsStream(FILE_NAME);
+    fullform2baseform = loadWords(wordList);
+  }
+
+  /**
+   * @param fullform the inflected form to look up
+   * @return the baseform listed for it, or <code>null</code> if it is not in the table
+   */
+  String getBaseform(final String fullform) {
+    return fullform2baseform.get(fullform);
+  }
+
+  /**
+   * Parses lines of the form <code>baseform:fullform1,fullform2,...</code>.
+   * Empty lines and lines starting with '#' are skipped.
+   *
+   * @param file stream to read from, decoded with {@link #FILE_ENCODING}
+   * @return map from each (trimmed) fullform to its baseform
+   * @throws IOException on read errors or if a line does not have exactly two ':'-separated parts
+   */
+  private Map<String, String> loadWords(InputStream file) throws IOException {
+    final Map<String, String> baseformByFullform = new HashMap<String, String>();
+    InputStreamReader streamReader = null;
+    BufferedReader lineReader = null;
+    try {
+      streamReader = new InputStreamReader(file, FILE_ENCODING);
+      lineReader = new BufferedReader(streamReader);
+      for (String rawLine = lineReader.readLine(); rawLine != null; rawLine = lineReader.readLine()) {
+        final String entry = rawLine.trim();
+        final boolean skip = entry.length() < 1 || entry.charAt(0) == '#';
+        if (!skip) {
+          final String[] fields = entry.split(":");
+          if (fields.length != 2) {
+            throw new IOException("Format error in file " +JLanguageTool.getDataBroker().getFromRulesDirAsUrl(FILE_NAME)+", line: " + entry);
+          }
+          final String baseform = fields[0];
+          for (String inflected : fields[1].split(",")) {
+            baseformByFullform.put(inflected.trim(), baseform);
+          }
+        }
+      }
+    } finally {
+      if (lineReader != null) {
+        lineReader.close();
+      }
+      if (streamReader != null) {
+        streamReader.close();
+      }
+    }
+    return baseformByFullform;
+  }
+
+}
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/de/GermanRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/de/GermanRule.java
new file mode 100644
index 0000000..1fca395
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/de/GermanRule.java
@@ -0,0 +1,30 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+package de.danielnaber.languagetool.rules.de;
+
+import de.danielnaber.languagetool.rules.Rule;
+
+/**
+ * Abstract base class for rules for the German language.
+ * It declares no members of its own and only gives the German rules a
+ * common supertype below {@link Rule}.
+ *
+ * @author Daniel Naber
+ */
+public abstract class GermanRule extends Rule {
+
+}
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/de/GermanWordRepeatRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/de/GermanWordRepeatRule.java
new file mode 100644
index 0000000..55f98b4
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/de/GermanWordRepeatRule.java
@@ -0,0 +1,39 @@
+/*
+ * Created on 03.10.2009
+ */
+package de.danielnaber.languagetool.rules.de;
+
+import java.util.ResourceBundle;
+
+import de.danielnaber.languagetool.AnalyzedTokenReadings;
+import de.danielnaber.languagetool.Language;
+import de.danielnaber.languagetool.rules.WordRepeatRule;
+
+/**
+ * Word-repetition check with a German-specific exception: a repetition
+ * directly after a comma, as in "..., die die ...", is often fine.
+ *
+ * @author Daniel Naber
+ */
+public class GermanWordRepeatRule extends WordRepeatRule {
+
+  public GermanWordRepeatRule(final ResourceBundle messages, final Language language) {
+    super(messages, language);
+  }
+
+  @Override
+  public String getId() {
+    return "GERMAN_WORD_REPEAT_RULE";
+  }
+
+  /**
+   * Suppresses the error when the token two places before <code>position</code>
+   * is a comma, e.g. in
+   * "wie Honda und Samsung, die die Bezahlung ihrer Firmenchefs...".
+   *
+   * @param tokens the tokens of the sentence
+   * @param position index of the token being checked
+   * @return <code>true</code> if the repetition should not be reported
+   */
+  @Override
+  public boolean ignore(final AnalyzedTokenReadings[] tokens, final int position) {
+    final int beforeRepetition = position - 2;
+    return beforeRepetition >= 0 && ",".equals(tokens[beforeRepetition].getToken());
+  }
+
+}
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/de/WiederVsWiderRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/de/WiederVsWiderRule.java
new file mode 100644
index 0000000..ea1c2aa
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/de/WiederVsWiderRule.java
@@ -0,0 +1,91 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+package de.danielnaber.languagetool.rules.de;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.ResourceBundle;
+
+import de.danielnaber.languagetool.AnalyzedSentence;
+import de.danielnaber.languagetool.AnalyzedTokenReadings;
+import de.danielnaber.languagetool.rules.Category;
+import de.danielnaber.languagetool.rules.RuleMatch;
+
+/**
+ * Detects "wieder" used where "wider" is meant in "spiegelt ... wider",
+ * e.g. the incorrect "Das spiegelt die Situation wieder".
+ *
+ * @author Daniel Naber
+ */
+public class WiederVsWiderRule extends GermanRule {
+
+  /** Forms of "spiegeln" that arm the check (compared case-insensitively). */
+  private static final String[] SPIEGELN_FORMS = {
+    "spiegelt", "spiegeln", "spiegelte", "spiegelten", "spiegelst"
+  };
+
+  public WiederVsWiderRule(ResourceBundle messages) {
+    if (messages != null) {
+      super.setCategory(new Category(messages.getString("category_typo")));
+    }
+  }
+
+  public String getId() {
+    return "DE_WIEDER_VS_WIDER";
+  }
+
+  public String getDescription() {
+    return "Möglicher Tippfehler 'spiegeln ... wieder(wider)'";
+  }
+
+  /**
+   * Walks the sentence once and reports "wieder" when it follows a form of
+   * "spiegeln" and no "wider" has been seen after that form.
+   *
+   * @param text the sentence to check
+   * @return the matches found, each covering the offending "wieder"
+   */
+  public RuleMatch[] match(AnalyzedSentence text) {
+    final List<RuleMatch> matches = new ArrayList<RuleMatch>();
+    // character offset of the current token, summed from the token lengths
+    int offset = 0;
+    boolean verbSeen = false;
+    boolean wiederSeen = false;
+    boolean widerSeen = false;
+    for (AnalyzedTokenReadings reading : text.getTokens()) {
+      final String word = reading.getToken();
+      if (!word.trim().equals("")) {
+        if (isFormOfSpiegeln(word)) {
+          verbSeen = true;
+        } else if (verbSeen && word.equalsIgnoreCase("wieder")) {
+          wiederSeen = true;
+        } else if (verbSeen && word.equalsIgnoreCase("wider")) {
+          widerSeen = true;
+        }
+        if (verbSeen && wiederSeen && !widerSeen) {
+          final String msg = "'wider' in 'widerspiegeln' wird mit 'i' statt mit 'ie' " +
+              "geschrieben, z.B. 'Das spiegelt die Situation gut wider.'";
+          final RuleMatch found = new RuleMatch(this, offset, offset + word.length(), msg);
+          found.setSuggestedReplacement("wider");
+          matches.add(found);
+          // start over so that a later "spiegelt ... wieder" is reported, too
+          verbSeen = false;
+          wiederSeen = false;
+          widerSeen = false;
+        }
+      }
+      offset += word.length();
+    }
+    return toRuleMatchArray(matches);
+  }
+
+  /** Tells whether the word is one of the listed forms of "spiegeln". */
+  private static boolean isFormOfSpiegeln(final String word) {
+    for (String form : SPIEGELN_FORMS) {
+      if (word.equalsIgnoreCase(form)) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  public void reset() {
+    // nothing
+  }
+
+}
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/de/WordCoherencyRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/de/WordCoherencyRule.java
new file mode 100644
index 0000000..2bba43a
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/de/WordCoherencyRule.java
@@ -0,0 +1,156 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+package de.danielnaber.languagetool.rules.de;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.ResourceBundle;
+
+import de.danielnaber.languagetool.AnalyzedSentence;
+import de.danielnaber.languagetool.AnalyzedToken;
+import de.danielnaber.languagetool.AnalyzedTokenReadings;
+import de.danielnaber.languagetool.JLanguageTool;
+import de.danielnaber.languagetool.rules.Category;
+import de.danielnaber.languagetool.rules.RuleMatch;
+
+/**
+ * A rule that matches words for which two different spellings are used
+ * throughout the document. Currently only implemented for German. Loads
+ * the relevant word from <code>rules/de/coherency.txt</code>.
+ *
+ * <p>Note that this should not be used for language variations like
+ * American English vs. British English or German "alte Rechtschreibung"
+ * vs. "neue Rechtschreibung" -- that's the task of a spell checker.
+ *
+ * <p>The rule keeps state between calls of {@link #match(AnalyzedSentence)}
+ * until {@link #reset()} is called.
+ *
+ * @author Daniel Naber
+ */
+public class WordCoherencyRule extends GermanRule {
+
+  private static final String FILE_NAME = "/de/coherency.txt";
+  private static final String FILE_ENCODING = "utf-8";
+
+  private final Map<String, String> relevantWords;        // e.g. "aufwendig -> aufwändig"
+  // spelling that must not appear anymore -> spelling that was already seen,
+  // e.g. "aufwändig -> aufwendig" once "aufwendig" occurred
+  private final Map<String, String> shouldNotAppearWord = new HashMap<String, String>();
+
+  private final GermanLemmatizer germanLemmatizer;
+
+  /**
+   * @param messages resource bundle providing the category name; may be
+   *        <code>null</code>, in which case no category is set
+   * @throws IOException if one of the word lists cannot be loaded
+   */
+  public WordCoherencyRule(ResourceBundle messages) throws IOException {
+    if (messages != null) {
+      super.setCategory(new Category(messages.getString("category_misc")));
+    }
+    relevantWords = loadWords(JLanguageTool.getDataBroker().getFromRulesDirAsStream(FILE_NAME));
+    germanLemmatizer = new GermanLemmatizer();
+  }
+
+  @Override
+  public String getId() {
+    return "DE_WORD_COHERENCY";
+  }
+
+  @Override
+  public String getDescription() {
+    return "Einheitliche Schreibweise für Wörter mit mehr als einer korrekten Schreibweise";
+  }
+
+  /**
+   * Reports every word whose alternative spelling has been seen before,
+   * either earlier in this sentence or in a previous call since the last
+   * {@link #reset()}.
+   *
+   * @param text the sentence to check
+   * @return the matches found, each suggesting the spelling seen first
+   */
+  @Override
+  public RuleMatch[] match(AnalyzedSentence text) {
+    final List<RuleMatch> ruleMatches = new ArrayList<RuleMatch>();
+    final AnalyzedTokenReadings[] tokens = text.getTokens();
+    // character offset of the current token, summed from the token lengths
+    int pos = 0;
+    for (AnalyzedTokenReadings tmpToken : tokens) {
+      final String origToken = tmpToken.getToken();
+      if (!tmpToken.isWhitespace()) {
+        final String token = getBaseformOrToken(tmpToken);
+        final String otherSpelling = shouldNotAppearWord.get(token);
+        if (otherSpelling != null) {
+          final String msg = "'" + token + "' und '" + otherSpelling +
+              "' sollten nicht gleichzeitig benutzt werden";
+          final RuleMatch ruleMatch = new RuleMatch(this, pos, pos + origToken.length(), msg);
+          ruleMatch.setSuggestedReplacement(otherSpelling);
+          ruleMatches.add(ruleMatch);
+        } else {
+          final String shouldNotAppear = relevantWords.get(token);
+          if (shouldNotAppear != null) {
+            // remember this spelling; it is only reported if the other one really occurs
+            shouldNotAppearWord.put(shouldNotAppear, token);
+          }
+        }
+      }
+      pos += origToken.length();
+    }
+    return toRuleMatchArray(ruleMatches);
+  }
+
+  /**
+   * Maps a token to the form used for the lookup: the lemma of its first
+   * reading, else the baseform from the file-based lemmatizer, else the
+   * token itself.
+   */
+  private String getBaseformOrToken(final AnalyzedTokenReadings tokenReadings) {
+    //TODO: definitely should be changed
+    //if the general lemmatizer is working
+    //defaulting to the first element because the
+    //general German lemmatizer is not (yet) there
+    final String origToken = tokenReadings.getToken();
+    final List<AnalyzedToken> readings = tokenReadings.getReadings();
+    // TODO: in theory we need to care about the other readings, too:
+    if (readings == null || readings.size() == 0) {
+      return origToken;
+    }
+    final String baseform = readings.get(0).getLemma();
+    if (baseform != null) {
+      return baseform;
+    }
+    // not all words are known by the Tagger (esp. compounds), so use the
+    // file lookup:
+    final String manualLookup = germanLemmatizer.getBaseform(origToken);
+    return manualLookup != null ? manualLookup : origToken;
+  }
+
+  /**
+   * Parses lines of the form <code>spelling1;spelling2</code>. Empty lines
+   * and lines starting with '#' are skipped.
+   *
+   * @param file stream to read from, decoded with {@link #FILE_ENCODING}
+   * @return map containing each pair in both directions
+   * @throws IOException on read errors or if a line does not have exactly two ';'-separated parts
+   */
+  private Map<String, String> loadWords(InputStream file) throws IOException {
+    final Map<String, String> map = new HashMap<String, String>();
+    InputStreamReader isr = null;
+    BufferedReader br = null;
+    try {
+      isr = new InputStreamReader(file, FILE_ENCODING);
+      br = new BufferedReader(isr);
+      String line;
+      while ((line = br.readLine()) != null) {
+        line = line.trim();
+        if (line.length() < 1) {
+          continue;
+        }
+        if (line.charAt(0) == '#') {      // ignore comments
+          continue;
+        }
+        final String[] parts = line.split(";");
+        if (parts.length != 2) {
+          throw new IOException("Format error in file " + JLanguageTool.getDataBroker().getFromRulesDirAsUrl(FILE_NAME) + ", line: " + line);
+        }
+        map.put(parts[0], parts[1]);
+        map.put(parts[1], parts[0]);
+      }
+    } finally {
+      if (br != null) {
+        br.close();
+      }
+      if (isr != null) {
+        isr.close();
+      }
+    }
+    return map;
+  }
+
+  /** Forgets all spellings seen so far. */
+  @Override
+  public void reset() {
+    shouldNotAppearWord.clear();
+  }
+
+}
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/en/AvsAnRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/en/AvsAnRule.java
new file mode 100644
index 0000000..ae02ef5
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/en/AvsAnRule.java
@@ -0,0 +1,251 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+package de.danielnaber.languagetool.rules.en;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.ResourceBundle;
+import java.util.TreeSet;
+
+import de.danielnaber.languagetool.AnalyzedSentence;
+import de.danielnaber.languagetool.AnalyzedTokenReadings;
+import de.danielnaber.languagetool.JLanguageTool;
+import de.danielnaber.languagetool.rules.Category;
+import de.danielnaber.languagetool.rules.RuleMatch;
+import de.danielnaber.languagetool.tools.StringTools;
+
+/**
+ * Check if the determiner (if any) preceding a word is:
+ * <ul>
+ *   <li><i>an</i> if the next word starts with a vowel
+ *   <li><i>a</i> if the next word does not start with a vowel
+ * </ul>
+ * This rule loads some exceptions from external files (e.g. <i>an hour</i>).
+ *
+ * @author Daniel Naber
+ */
+public class AvsAnRule extends EnglishRule {
+
+  private static final String FILENAME_A = "/en/det_a.txt";
+  private static final String FILENAME_AN = "/en/det_an.txt";
+  // read the word lists with a fixed encoding instead of the platform default
+  private static final String FILE_ENCODING = "utf-8";
+
+  /** The indefinite article a word calls for. */
+  private enum Determiner {
+    A,
+    AN,
+    /** No decision, used for all-uppercase and mixed-case words. */
+    UNKNOWN
+  }
+
+  private final TreeSet<String> requiresA;
+  private final TreeSet<String> requiresAn;
+
+  /**
+   * @param messages resource bundle providing the category name; may be
+   *        <code>null</code>, in which case no category is set
+   * @throws IOException if one of the exception lists cannot be loaded
+   */
+  public AvsAnRule(final ResourceBundle messages) throws IOException {
+    if (messages != null) {
+      super.setCategory(new Category(messages.getString("category_misc")));
+    }
+    requiresA = loadWords(JLanguageTool.getDataBroker().getFromRulesDirAsStream(FILENAME_A));
+    requiresAn = loadWords(JLanguageTool.getDataBroker().getFromRulesDirAsStream(FILENAME_AN));
+  }
+
+  @Override
+  public String getId() {
+    return "EN_A_VS_AN";
+  }
+
+  @Override
+  public String getDescription() {
+    return "Use of 'a' vs. 'an'";
+  }
+
+  /**
+   * Reports "a" before a word that requires "an" and vice versa.
+   *
+   * @param text the sentence to check
+   * @return the matches found, each covering the wrong article
+   * @throws IllegalStateException if a word is listed in both exception files
+   */
+  @Override
+  public RuleMatch[] match(final AnalyzedSentence text) {
+    final List<RuleMatch> ruleMatches = new ArrayList<RuleMatch>();
+    final AnalyzedTokenReadings[] tokens = text.getTokensWithoutWhitespace();
+    String prevToken = "";
+    int prevPos = 0;
+    //ignoring token 0, i.e., SENT_START
+    for (int i = 1; i < tokens.length; i++) {
+      String token = getRelevantPart(tokens[i].getToken());
+      token = token.replaceAll("[^a-zA-Z0-9\\.']", "");         // e.g. >>an "industry party"<<
+      if (StringTools.isEmpty(token)) {
+        // skipped without touching prevToken, so an article before e.g. a quote is still remembered
+        continue;
+      }
+      final Determiner required = getRequiredDeterminer(token);
+      String msg = null;
+      if (prevToken.equalsIgnoreCase("a") && required == Determiner.AN) {
+        final String replacement = prevToken.equals("A") ? "An" : "an";
+        msg = "Use <suggestion>" +replacement+ "</suggestion> instead of '" +prevToken+ "' if the following "+
+            "word starts with a vowel sound, e.g. 'an article', "
+            + "'an hour'";
+      } else if (prevToken.equalsIgnoreCase("an") && required == Determiner.A) {
+        final String replacement = prevToken.equals("An") ? "A" : "a";
+        msg = "Use <suggestion>" +replacement+ "</suggestion> instead of '" +prevToken+ "' if the following "+
+            "word doesn't start with a vowel sound, e.g. 'a sentence', "
+            + "'a university'";
+      }
+      if (msg != null) {
+        final RuleMatch ruleMatch = new RuleMatch(this, prevPos, prevPos+prevToken.length(), msg, "Wrong article");
+        ruleMatches.add(ruleMatch);
+      }
+      if (tokens[i].hasPosTag("DT")) {
+        prevToken = token;
+        prevPos = tokens[i].getStartPos();
+      } else {
+        prevToken = "";
+      }
+    }
+    return toRuleMatchArray(ruleMatches);
+  }
+
+  /**
+   * Adds "a" or "an" to the English noun.
+   * Used for suggesting the proper form of the
+   * indefinite article.
+   * @param noun Word that needs an article.
+   * @return String containing the word with a determiner,
+   * or just the word if the word is an abbreviation. An empty
+   * string is returned if nothing is left of the word after
+   * removing entities and non-alphanumeric characters.
+   * @throws IllegalStateException if the word is listed in both exception files
+   */
+  public final String suggestAorAn(final String noun) {
+    String word = getRelevantPart(noun);
+    //html entities!
+    word = word.replaceAll("&quot|&amp|&lt|&gt|[^a-zA-Z0-9]", "");         // e.g. >>an "industry party"<<
+    if (StringTools.isEmpty(word)) {
+      return word;
+    }
+    switch (getRequiredDeterminer(word)) {
+      case A:
+        return "a " + noun;
+      case AN:
+        return "an " + noun;
+      default:
+        return noun;
+    }
+  }
+
+  /**
+   * Returns the part of the word before the first hyphen or apostrophe,
+   * or the whole word if that part is "a" (in any case) or missing.
+   */
+  private static String getRelevantPart(final String word) {
+    final String[] parts = word.split("[-']");  // for example, in "one-way" only "one" is relevant
+    if (parts.length >= 1 &&
+        !parts[0].equalsIgnoreCase("a")) {  // avoid false alarm on "A-levels are..."
+      return parts[0];
+    }
+    return word;
+  }
+
+  /**
+   * Decides which article the given non-empty word requires: the exception
+   * lists win, otherwise the first letter decides.
+   *
+   * @throws IllegalStateException if the word is listed in both exception files
+   */
+  private Determiner getRequiredDeterminer(final String word) {
+    final String lowercaseWord = word.toLowerCase();
+    final boolean listedForA = requiresA.contains(lowercaseWord) || requiresA.contains(word);
+    final boolean listedForAn = requiresAn.contains(lowercaseWord) || requiresAn.contains(word);
+    if (listedForA && listedForAn) {
+      throw new IllegalStateException(word + " is listed in both det_a.txt and det_an.txt");
+    }
+    if (listedForA) {
+      return Determiner.A;
+    }
+    if (listedForAn) {
+      return Determiner.AN;
+    }
+    if (StringTools.isAllUppercase(word) || StringTools.isMixedCase(word)) {
+      // we don't know how all-uppercase and mixed case words (often abbreviations) are pronounced,
+      // so never complain about these:
+      return Determiner.UNKNOWN;
+    }
+    return isVowel(word.charAt(0)) ? Determiner.AN : Determiner.A;
+  }
+
+  private static boolean isVowel(char c) {
+    c = Character.toLowerCase(c);
+    return c == 'a' || c == 'e' || c == 'i' || c == 'o' || c == 'u';
+  }
+
+  /**
+   * Load words from the given stream. Entries are normalized to lowercase,
+   * except for lines starting with '*': these are added without the '*'
+   * and with their case kept. Empty lines and lines starting with '#'
+   * are skipped.
+   */
+  private TreeSet<String> loadWords(final InputStream file) throws IOException {
+    BufferedReader br = null;
+    final TreeSet<String> set = new TreeSet<String>();
+    try {
+      br = new BufferedReader(new InputStreamReader(file, FILE_ENCODING));
+      String line;
+      while ((line = br.readLine()) != null) {
+        line = line.trim();
+        if (line.length() < 1) {
+          continue;
+        }
+        if (line.charAt(0) == '#') {
+          continue;
+        }
+        if (line.charAt(0) == '*') {
+          set.add(line.substring(1));
+        } else {
+          set.add(line.toLowerCase());
+        }
+      }
+    } finally {
+      if (br != null) {
+        br.close();
+      }
+    }
+    return set;
+  }
+
+  @Override
+  public void reset() {
+    // nothing
+  }
+
+}
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/en/CompoundRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/en/CompoundRule.java
new file mode 100644
index 0000000..0e01523
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/en/CompoundRule.java
@@ -0,0 +1,55 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2006 Daniel Naber (http://www.danielnaber.de)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+package de.danielnaber.languagetool.rules.en;
+
+import java.io.IOException;
+import java.util.ResourceBundle;
+
+import de.danielnaber.languagetool.JLanguageTool;
+import de.danielnaber.languagetool.rules.AbstractCompoundRule;
+
+/**
+ * Checks that compounds (if in the list) are not written as separate words.
+ * This class only supplies the English word list and the English message texts;
+ * it defines no matching logic of its own.
+ *
+ * @author Marcin Miłkowski, based on code by Daniel Naber
+ */
+
+public class CompoundRule extends AbstractCompoundRule {
+
+ // Path of the compound list, resolved through the data broker's resource directory.
+ private static final String FILE_NAME = "/en/compounds.txt";
+
+ /**
+ * Loads the English compound list (read as UTF-8) and sets the message texts.
+ *
+ * @param messages resource bundle, handed on unchanged to the superclass constructor
+ * @throws IOException presumably raised when the compound list cannot be read --
+ * TODO confirm against AbstractCompoundRule.loadCompoundFile
+ */
+ public CompoundRule(final ResourceBundle messages) throws IOException {
+ super(messages);
+ loadCompoundFile(JLanguageTool.getDataBroker().getFromResourceDirAsStream(FILE_NAME), "UTF-8");
+ super.setShort("Hyphenation problem");
+ // NOTE(review): judging by the texts, the three messages cover "hyphen required",
+ // "written as one word" and "either form" -- confirm the parameter order against setMsg.
+ super.setMsg("This word is normally spelled with hyphen.",
+ "This word is normally spelled as one.",
+ "This expression is normally spelled as one or with hyphen.");
+ }
+
+ /** Returns the fixed identifier of this rule. */
+ public String getId() {
+ return "EN_COMPOUNDS";
+ }
+
+ /** Returns the English description of this rule. */
+ public String getDescription() {
+ return "Hyphenated words, e.g., 'case-sensitive' instead of 'case sensitive'";
+ }
+
+
+}
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/en/EnglishRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/en/EnglishRule.java
new file mode 100644
index 0000000..cd0036d
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/en/EnglishRule.java
@@ -0,0 +1,30 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+package de.danielnaber.languagetool.rules.en;
+
+import de.danielnaber.languagetool.rules.Rule;
+
+/**
+ * Abstract base class for rules for the English language.
+ * It declares no members of its own and only gives the English rules a
+ * common supertype below {@link Rule}.
+ *
+ * @author Daniel Naber
+ */
+public abstract class EnglishRule extends Rule {
+
+}
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/en/EnglishUnpairedBracketsRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/en/EnglishUnpairedBracketsRule.java
new file mode 100644
index 0000000..4b32c05
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/en/EnglishUnpairedBracketsRule.java
@@ -0,0 +1,89 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2010 Daniel Naber (http://www.languagetool.org)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+
+package de.danielnaber.languagetool.rules.en;
+
+import java.util.ResourceBundle;
+import java.util.regex.Pattern;
+
+import de.danielnaber.languagetool.AnalyzedTokenReadings;
+import de.danielnaber.languagetool.Language;
+import de.danielnaber.languagetool.rules.GenericUnpairedBracketsRule;
+
+/**
+ * Unpaired-bracket rule configured with the English bracket and quote
+ * symbols, plus a few English-specific cases in which a lone quote or
+ * apostrophe is not reported.
+ */
+public class EnglishUnpairedBracketsRule extends GenericUnpairedBracketsRule {
+
+  // Opening symbols, and the closing symbols listed in the same order.
+  private static final String[] EN_START_SYMBOLS = { "[", "(", "{", "“", "\"", "'" };
+  private static final String[] EN_END_SYMBOLS = { "]", ")", "}", "”", "\"", "'" };
+
+  // Matches a token that consists of digits only.
+  private static final Pattern NUMBER = Pattern.compile("\\d+");
+
+  /**
+   * Creates the rule and installs the English symbol tables in the fields
+   * {@code startSymbols} and {@code endSymbols}.
+   *
+   * @param messages passed through to the superclass constructor
+   * @param language passed through to the superclass constructor
+   */
+  public EnglishUnpairedBracketsRule(final ResourceBundle messages,
+      final Language language) {
+    super(messages, language);
+    startSymbols = EN_START_SYMBOLS;
+    endSymbols = EN_END_SYMBOLS;
+  }
+
+  /**
+   * Returns the identifier string of this rule.
+   *
+   * @return the constant id {@code EN_UNPAIRED_BRACKETS}
+   */
+  public String getId() {
+    final String ruleId = "EN_UNPAIRED_BRACKETS";
+    return ruleId;
+  }
+
+  /**
+   * Tells whether the symbol at position {@code i} is an ordinary symbol
+   * ({@code true}) or one of the English special cases handled here
+   * ({@code false}): inch marks after a number, the plural Saxon genitive,
+   * clipped "-in'" verb forms and the contraction "'em".
+   *
+   * @param token the symbol being examined
+   * @param tokens the tokens of the sentence
+   * @param i index of {@code token} within {@code tokens}
+   * @param j not used by this implementation
+   * @param precSpace presumably whether whitespace precedes the symbol
+   *          (defined by the caller; verify against the superclass)
+   * @param follSpace presumably whether whitespace follows the symbol
+   *          (defined by the caller; verify against the superclass)
+   * @return {@code false} if one of the special cases applies,
+   *         {@code true} otherwise
+   */
+  protected boolean isNoException(final String token,
+      final AnalyzedTokenReadings[] tokens, final int i, final int j,
+      final boolean precSpace, final boolean follSpace) {
+
+    // TODO: add an', o', 'till, 'tain't, 'cept, 'fore in the disambiguator
+    // and mark up as contractions somehow
+    // add exception for dates like '52
+
+    // The checks below look at tokens[i - 1]; the first positions are
+    // never treated as a special case.
+    if (i <= 1) {
+      return true;
+    }
+
+    final boolean apostrophe = "'".equals(token);
+
+    if (follSpace && !precSpace) {
+      // English inches, e.g., 20"
+      if ("\"".equals(token)
+          && NUMBER.matcher(tokens[i - 1].getToken()).matches()) {
+        return false;
+      }
+      if (apostrophe) {
+        // English plural Saxon genitive; the current disambiguation
+        // scheme is a bit too greedy for adjectives
+        if (tokens[i].hasPosTag("POS")) {
+          return false;
+        }
+        // puttin' on the Ritz
+        final AnalyzedTokenReadings previous = tokens[i - 1];
+        if (previous.hasPosTag("VBG") && previous.getToken().endsWith("in")) {
+          return false;
+        }
+      }
+      return true;
+    }
+
+    if (precSpace && !follSpace) {
+      // hold 'em!
+      final boolean holdEm = apostrophe && i + 1 < tokens.length
+          && "em".equals(tokens[i + 1].getToken());
+      return !holdEm;
+    }
+    return true;
+  }
+
+}
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/es/ElwithFemRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/es/ElwithFemRule.java
new file mode 100644
index 0000000..c22b9a3
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/es/ElwithFemRule.java
@@ -0,0 +1,179 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+package de.danielnaber.languagetool.rules.es;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.ResourceBundle;
+import java.util.TreeSet;
+
+import de.danielnaber.languagetool.AnalyzedSentence;
+import de.danielnaber.languagetool.AnalyzedTokenReadings;
+import de.danielnaber.languagetool.JLanguageTool;
+import de.danielnaber.languagetool.rules.Category;
+import de.danielnaber.languagetool.rules.RuleMatch;
+import de.danielnaber.languagetool.tools.StringTools;
+
+/**
+ * Check if the determiner (if any) preceding a feminine noun is "el". This
+ * rule loads a list of words (feminine nouns starting with stressed ha- or a-)
+ * from an external file. These words enforce the use of 'el' as determiner
+ * instead of 'la' (also with 'un', 'algun' and 'ningun').
+ *
+ * Sample
+ *
+ * *la alma -> el alma
+ * *la hambre -> el hambre
+ *
+ * http://blog.lengua-e.com/2007/el-arma-determinante-masculino-ante-nombre-femenino/
+ * http://tinyurl.com/m9uzte
+ *
+ *
+ * @author Susana Sotelo Docio
+ *
+ * based on English AvsAnRule rule
+ */
+public class ElwithFemRule extends SpanishRule {
+
+  private static final String FILENAME_EL = "/es/el.txt";
+
+  /**
+   * Matches every character that is not a letter of the listed set, a
+   * digit, a dot or an apostrophe. Compiled once instead of once per token.
+   * The type is written fully qualified so that the import list of the file
+   * does not have to change.
+   */
+  private static final java.util.regex.Pattern NON_WORD_CHARS =
+      java.util.regex.Pattern.compile("[^a-záéíóúñüA-ZÁÉÍÓÚÑÜ0-9\\.']");
+
+  /** Words that require the determiner 'el', as loaded by {@link #loadWords}. */
+  private final TreeSet<String> requiresEl;
+
+  /**
+   * Creates the rule and loads the word list from {@code /es/el.txt}.
+   *
+   * @param messages bundle providing the {@code category_misc} text; when
+   *          {@code null} no category is set
+   * @throws IOException if the word list cannot be read
+   */
+  public ElwithFemRule(final ResourceBundle messages) throws IOException {
+    if (messages != null) {
+      super.setCategory(new Category(messages.getString("category_misc")));
+    }
+    requiresEl = loadWords(JLanguageTool.getDataBroker().getFromRulesDirAsStream(FILENAME_EL));
+  }
+
+  @Override
+  public String getId() {
+    return "EL_WITH_FEM";
+  }
+
+  @Override
+  public String getDescription() {
+    return "Uso de 'el' con sustantivos femeninos que comienzan por a- o ha- t\u00f3nicas";
+  }
+
+  /**
+   * Reports a feminine determiner ('la', 'una', 'alguna', 'ninguna') that
+   * directly precedes a word from the loaded list.
+   *
+   * @param text the analyzed sentence
+   * @return one match per offending determiner, spanning that determiner
+   */
+  @Override
+  public RuleMatch[] match(final AnalyzedSentence text) {
+    final List<RuleMatch> ruleMatches = new ArrayList<RuleMatch>();
+    final AnalyzedTokenReadings[] tokens = text.getTokensWithoutWhitespace();
+    // Cleaned text and start offset of the previous token, kept only while
+    // that token carried one of the determiner tags checked below.
+    String prevToken = "";
+    int prevPos = 0;
+    // ignoring token 0, i.e., SENT_START
+    for (int i = 1; i < tokens.length; i++) {
+      // keep only letters, digits, dots and apostrophes (original note: el 'alma')
+      final String token = NON_WORD_CHARS.matcher(tokens[i].getToken()).replaceAll("");
+      if (StringTools.isEmpty(token)) {
+        continue;
+      }
+
+      boolean doesRequireEl =
+          requiresEl.contains(token.toLowerCase()) || requiresEl.contains(token);
+
+      // FIXME: temporal solution for "La Haya" (change)
+      if (prevToken.equals("La") && token.equals("Haya")) {
+        doesRequireEl = false;
+      }
+
+      final String replacement = doesRequireEl ? masculineFormOf(prevToken) : null;
+      if (replacement != null) {
+        // The message is only assembled when a match is actually reported.
+        final String msg = "Use <suggestion>" + replacement
+            + "</suggestion> en lugar de '" + prevToken + "' si la siguiente "
+            + "palabra comienza por 'a' o 'ha' t\u00f3nicas, por ejemplo 'el hampa', "
+            + "'un agua'";
+        ruleMatches.add(new RuleMatch(this, prevPos, prevPos + prevToken.length(),
+            msg, "Art\u00edculo incorrecto"));
+      }
+
+      if (tokens[i].hasPosTag("DA0FS0") || tokens[i].hasPosTag("DI0FS0")) {
+        prevToken = token;
+        prevPos = tokens[i].getStartPos();
+      } else {
+        prevToken = "";
+      }
+    }
+    return toRuleMatchArray(ruleMatches);
+  }
+
+  /**
+   * Maps a feminine determiner to the form suggested by this rule. The
+   * suggestion is capitalised only when the determiner is written with
+   * exactly one leading capital (e.g. "La").
+   *
+   * @param determiner the text of the preceding token
+   * @return the suggested form, or {@code null} if the token is not one of
+   *         the four handled determiners
+   */
+  private static String masculineFormOf(final String determiner) {
+    if (determiner.equalsIgnoreCase("la")) {
+      return determiner.equals("La") ? "El" : "el";
+    }
+    if (determiner.equalsIgnoreCase("una")) {
+      return determiner.equals("Una") ? "Un" : "un";
+    }
+    if (determiner.equalsIgnoreCase("alguna")) {
+      return determiner.equals("Alguna") ? "Alg\u00fan" : "alg\u00fan";
+    }
+    if (determiner.equalsIgnoreCase("ninguna")) {
+      return determiner.equals("Ninguna") ? "Ning\u00fan" : "ning\u00fan";
+    }
+    return null;
+  }
+
+  /**
+   * Loads the word list, one entry per line, decoding the stream as UTF-8.
+   * Blank lines and lines starting with '#' are skipped. A line starting
+   * with '*' is stored without the asterisk and with its case unchanged;
+   * every other line is stored in lower case.
+   *
+   * @param file the stream to read; closed before this method returns
+   * @return the set of loaded words
+   * @throws IOException if the stream is {@code null} or cannot be read
+   */
+  private TreeSet<String> loadWords(final InputStream file) throws IOException {
+    if (file == null) {
+      // Fail with a descriptive error instead of a bare NullPointerException.
+      throw new IOException("Cannot load word list: input stream is null");
+    }
+    final TreeSet<String> set = new TreeSet<String>();
+    final BufferedReader br = new BufferedReader(new InputStreamReader(file, "utf-8"));
+    try {
+      String line;
+      while ((line = br.readLine()) != null) {
+        line = line.trim();
+        if (line.length() < 1 || line.charAt(0) == '#') {
+          continue;
+        }
+        if (line.charAt(0) == '*') {
+          set.add(line.substring(1));
+        } else {
+          set.add(line.toLowerCase());
+        }
+      }
+    } finally {
+      br.close();
+    }
+    return set;
+  }
+
+  @Override
+  public void reset() {
+    // nothing
+  }
+}
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/es/SpanishRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/es/SpanishRule.java
new file mode 100644
index 0000000..4aaa297
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/es/SpanishRule.java
@@ -0,0 +1,32 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+package de.danielnaber.languagetool.rules.es;
+
+import de.danielnaber.languagetool.rules.Rule;
+
+/**
+ * Abstract base class for rules for Spanish.
+ * <p>
+ * The class declares no members of its own; it only extends {@link Rule},
+ * giving Spanish-specific rules a common supertype.
+ *
+ * @author Susana Sotelo Docio
+ *
+ * based on English rules
+ */
+public abstract class SpanishRule extends Rule {
+
+}
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/fr/FrenchRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/fr/FrenchRule.java
new file mode 100644
index 0000000..2ad4bcc
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/fr/FrenchRule.java
@@ -0,0 +1,31 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+
+package de.danielnaber.languagetool.rules.fr;
+
+import de.danielnaber.languagetool.rules.Rule;
+
+/**
+ * Abstract base class for French rules.
+ * <p>
+ * The class declares no members of its own; it only extends {@link Rule},
+ * giving French-specific rules a common supertype.
+ *
+ * @author Marcin Milkowski
+ */
+public abstract class FrenchRule extends Rule {
+
+}
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/fr/QuestionWhitespaceRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/fr/QuestionWhitespaceRule.java
new file mode 100644
index 0000000..4c03049
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/fr/QuestionWhitespaceRule.java
@@ -0,0 +1,161 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+package de.danielnaber.languagetool.rules.fr;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.ResourceBundle;
+
+import de.danielnaber.languagetool.AnalyzedSentence;
+import de.danielnaber.languagetool.AnalyzedTokenReadings;
+import de.danielnaber.languagetool.rules.Category;
+import de.danielnaber.languagetool.rules.RuleMatch;
+import de.danielnaber.languagetool.tools.StringTools;
+
+/**
+ * A rule that matches spaces before ?,:,; and ! (required for correct French
+ * punctuation).
+ *
+ * @author Marcin Miłkowski
+ */
+public class QuestionWhitespaceRule extends FrenchRule {
+
+  /**
+   * No-break space (U+00A0), the character this rule accepts before the
+   * punctuation marks and inserts in its suggestions. It is spelled as an
+   * escape because a literal U+00A0 inside a string cannot be told apart
+   * from an ordinary space and is easily lost when the file is edited or
+   * its whitespace is normalised.
+   */
+  private static final String NBSP = "\u00a0";
+
+  private static final String OPENING_QUOTE_MSG =
+      "Le guillemet ouvrant est suivi d'une espace fine insécable.";
+
+  /**
+   * Creates the rule.
+   *
+   * @param messages bundle providing the {@code category_misc} text; when
+   *          {@code null} no category is set
+   */
+  public QuestionWhitespaceRule(final ResourceBundle messages) {
+    if (messages != null) {
+      super.setCategory(new Category(messages.getString("category_misc")));
+    }
+  }
+
+  @Override
+  public String getId() {
+    return "FRENCH_WHITESPACE";
+  }
+
+  @Override
+  public String getDescription() {
+    return "Insertion des espaces fines insécables";
+  }
+
+  /**
+   * Reports '?', '!', ';', ':' and '»' that are not preceded by a no-break
+   * space, and '«' that is not followed by one. Every match carries a
+   * suggested replacement that contains the no-break space.
+   *
+   * @param text the analyzed sentence
+   * @return the matches found, anchored at the token before the current one
+   */
+  @Override
+  public RuleMatch[] match(final AnalyzedSentence text) {
+    final List<RuleMatch> ruleMatches = new ArrayList<RuleMatch>();
+    final AnalyzedTokenReadings[] tokens = text.getTokens();
+    String prevToken = "";
+    for (int i = 1; i < tokens.length; i++) {
+      final String token = tokens[i].getToken();
+      String msg = null;
+      String suggestionText = null;
+      // Number of characters after the previous token that the match covers.
+      int fixLen = 0;
+
+      if (tokens[i].isWhitespaceBefore()) {
+        msg = messageFor(token, true);
+        if (msg != null) {
+          suggestionText = NBSP + token;
+          fixLen = 1;
+        }
+      } else if (!prevToken.equals(NBSP) && !isMixedMarkPair(prevToken, token)) {
+        msg = messageFor(token, false);
+        if (msg != null) {
+          suggestionText = prevToken + NBSP + token;
+          fixLen = 1;
+        }
+      }
+
+      // The opening guillemet check runs last and overrides the result above.
+      if (prevToken.equals("«")) {
+        if (StringTools.isEmpty(token)) {
+          msg = OPENING_QUOTE_MSG;
+          suggestionText = "«" + NBSP;
+          fixLen = 1;
+        } else if (!token.equals(NBSP)) {
+          msg = OPENING_QUOTE_MSG;
+          suggestionText = "«" + NBSP;
+          fixLen = 0;
+        }
+      }
+
+      if (msg != null) {
+        final int fromPos = tokens[i - 1].getStartPos();
+        final int toPos = fromPos + fixLen + tokens[i - 1].getToken().length();
+        final RuleMatch ruleMatch = new RuleMatch(this, fromPos, toPos, msg,
+            "Insérer un espace insécable");
+        ruleMatch.setSuggestedReplacement(suggestionText);
+        ruleMatches.add(ruleMatch);
+      }
+      prevToken = token;
+    }
+
+    return toRuleMatchArray(ruleMatches);
+  }
+
+  /**
+   * Tells whether the two tokens form "!?" or "?!"; such a pair is not
+   * reported when the marks directly follow each other.
+   */
+  private static boolean isMixedMarkPair(final String prevToken, final String token) {
+    return (token.equals("?") && prevToken.equals("!"))
+        || (token.equals("!") && prevToken.equals("?"));
+  }
+
+  /**
+   * Returns the message for a punctuation token handled by this rule.
+   *
+   * @param token the current token
+   * @param whitespaceBefore whether the token was preceded by whitespace;
+   *          only the wording used for ':' depends on it
+   * @return the message, or {@code null} if the token is not handled
+   */
+  private static String messageFor(final String token, final boolean whitespaceBefore) {
+    if (token.equals("?")) {
+      return "Point d'interrogation est précédé d'une espace fine insécable.";
+    }
+    if (token.equals("!")) {
+      return "Point d'exclamation est précédé d'une espace fine insécable.";
+    }
+    if (token.equals("»")) {
+      return "Le guillemet fermant est précédé d'une espace fine insécable.";
+    }
+    if (token.equals(";")) {
+      return "Point-virgule est précédé d'une espace fine insécable.";
+    }
+    if (token.equals(":")) {
+      return whitespaceBefore
+          ? "Deux-points sont précédé d'une espace fine insécable."
+          : "Deux-points précédés d'une espace fine insécable.";
+    }
+    return null;
+  }
+
+  @Override
+  public void reset() {
+    // nothing
+  }
+
+}
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/AbstractPatternRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/AbstractPatternRule.java
new file mode 100644
index 0000000..d172134
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/AbstractPatternRule.java
@@ -0,0 +1,223 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2008 Daniel Naber (http://www.danielnaber.de)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+
+package de.danielnaber.languagetool.rules.patterns;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import de.danielnaber.languagetool.AnalyzedSentence;
+import de.danielnaber.languagetool.AnalyzedToken;
+import de.danielnaber.languagetool.AnalyzedTokenReadings;
+import de.danielnaber.languagetool.Language;
+import de.danielnaber.languagetool.rules.Rule;
+import de.danielnaber.languagetool.rules.RuleMatch;
+
+/**
+ * An Abstract Pattern Rule that describes a pattern of words or part-of-speech tags
+ * used for PatternRule and DisambiguationPatternRule.
+ *
+ * Introduced to minimize code duplication between those classes.
+ *
+ * @author Marcin Miłkowski
+ */
+
+public abstract class AbstractPatternRule extends Rule {
+
+ // Rule id as passed to the constructor; returned by getId().
+ private final String id;
+
+ // Rule description as passed to the constructor; returned by getDescription().
+ private final String description;
+
+ // Private copy of the element list passed to the constructor.
+ protected final List<Element> patternElements;
+
+ // Obtained from language.getUnifier() in the constructor.
+ protected Unifier unifier;
+
+ protected final Language language;
+
+ // Set through setStartPositionCorrection(); not read inside this class.
+ protected int startPositionCorrection;
+
+ // Set through setEndPositionCorrection(); not read inside this class.
+ protected int endPositionCorrection;
+
+ // Read and updated by testAllReadings(); once true, that method returns
+ // false. This class never resets it (presumably subclasses do - verify).
+ protected boolean prevMatched;
+
+ // True if at least one pattern element reports isUnified().
+ protected final boolean testUnification;
+
+ // When true, unifiedTokens is filled from the unifier after a successful
+ // unification test.
+ private final boolean getUnified;
+
+ // True if unification is tested or any pattern element has an AND group;
+ // enables the call to testUnificationAndGroups() in testAllReadings().
+ private boolean groupsOrUnification;
+
+ // Result of unifier.getFinalUnified(); only assigned when getUnified is set.
+ protected AnalyzedTokenReadings[] unifiedTokens;
+
+ // Value of isSentStart() of the first pattern element.
+ protected final boolean sentStart;
+
+ /**
+ * Stores the rule data and derives the unification and AND-group flags
+ * from the pattern elements.
+ *
+ * @param id rule id
+ * @param description rule description
+ * @param language language used to obtain the unifier and, in setupRef(),
+ * the synthesizer
+ * @param elements pattern elements; the list is copied and must contain at
+ * least one element, because the first one is read unconditionally
+ * @param getUnified whether unified tokens are to be collected
+ */
+ public AbstractPatternRule(final String id,
+ final String description,
+ final Language language,
+ final List<Element> elements,
+ boolean getUnified) {
+ this.id = id;
+ this.description = description;
+ this.patternElements = new ArrayList<Element>(elements); // copy elements
+ this.language = language;
+ this.getUnified = getUnified;
+ unifier = language.getUnifier();
+ testUnification = initUnifier();
+ // get(0) throws IndexOutOfBoundsException for an empty element list
+ sentStart = patternElements.get(0).isSentStart();
+ if (!testUnification) {
+ for (Element elem : patternElements) {
+ if (elem.hasAndGroup()) {
+ groupsOrUnification = true;
+ break;
+ }
+ }
+ } else {
+ groupsOrUnification = true;
+ }
+ }
+
+ /**
+ * @return true if any pattern element reports isUnified()
+ */
+ private boolean initUnifier() {
+ for (final Element elem : patternElements) {
+ if (elem.isUnified()) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ @Override
+ public final String toString() {
+ return id + ":" + patternElements + ":" + description;
+ }
+
+ @Override
+ public String getDescription() {
+ return description;
+ }
+
+ @Override
+ public String getId() {
+ return id;
+ }
+
+ /**
+ * Placeholder implementation: always returns null, not an empty array.
+ * NOTE(review): concrete subclasses presumably override this method -
+ * confirm before calling it on this type directly.
+ */
+ @Override
+ public RuleMatch[] match(AnalyzedSentence text) throws IOException {
+ // TODO Auto-generated method stub
+ return null;
+ }
+
+ /**
+ * Placeholder implementation: does nothing.
+ */
+ @Override
+ public void reset() {
+ // TODO Auto-generated method stub
+ }
+
+ public final void setStartPositionCorrection(final int startPositionCorrection) {
+ this.startPositionCorrection = startPositionCorrection;
+ }
+
+ public final void setEndPositionCorrection(final int endPositionCorrection) {
+ this.endPositionCorrection = endPositionCorrection;
+ }
+
+
+ /**
+ * Prepares the AND group of an element, if it has one: every member that
+ * is a reference element is passed to setupRef(), then the element's
+ * setupAndGroup() is called.
+ *
+ * @param firstMatchToken index of the first matched token, used to
+ * resolve references
+ * @param elem the element whose AND group is prepared
+ * @param tokens the tokens of the sentence
+ */
+ protected void setupAndGroup(final int firstMatchToken,
+ final Element elem, final AnalyzedTokenReadings[] tokens)
+ throws IOException {
+ if (elem.hasAndGroup()) {
+ for (final Element andElement : elem.getAndGroup()) {
+ if (andElement.isReferenceElement()) {
+ setupRef(firstMatchToken, andElement, tokens);
+ }
+ }
+ elem.setupAndGroup();
+ }
+ }
+
+ /**
+ * For a reference element, compiles it against the token it refers to.
+ * The referenced position is firstMatchToken plus the token reference of
+ * the element's match; nothing happens if that position lies beyond the
+ * end of the token array.
+ */
+ //TODO: add .compile for all exceptions of the element?
+ protected void setupRef(final int firstMatchToken, final Element elem,
+ final AnalyzedTokenReadings[] tokens) throws IOException {
+ if (elem.isReferenceElement()) {
+ final int refPos = firstMatchToken + elem.getMatch().getTokenRef();
+ if (refPos < tokens.length) {
+ elem.compile(tokens[refPos], language.getSynthesizer());
+ }
+ }
+ }
+
+ /**
+ * Tests the readings of tokens[tokenNo] against an element.
+ *
+ * @param tokens the tokens of the sentence
+ * @param elem the element to test
+ * @param prevElement the preceding element, may be null
+ * @param tokenNo index of the token whose readings are tested
+ * @param firstMatchToken index of the first matched token, used to
+ * resolve references in the AND group
+ * @param prevSkipNext a scope-next exception of prevElement is only
+ * consulted when this value is greater than 0
+ * @return true if the element matched and no exception applied
+ */
+ protected boolean testAllReadings(final AnalyzedTokenReadings[] tokens,
+ final Element elem, final Element prevElement, final int tokenNo,
+ final int firstMatchToken, final int prevSkipNext) throws IOException {
+ boolean thisMatched = false;
+ final int numberOfReadings = tokens[tokenNo].getReadingsLength();
+ setupAndGroup(firstMatchToken, elem, tokens);
+ for (int l = 0; l < numberOfReadings; l++) {
+ final AnalyzedToken matchToken = tokens[tokenNo].getAnalyzedToken(l);
+ // && binds tighter than ||: prevMatched stays true once set, otherwise
+ // it becomes true when a scope-next exception of prevElement matches
+ prevMatched = prevMatched || prevSkipNext > 0 && prevElement != null
+ && prevElement.isMatchedByScopeNextException(matchToken);
+ if (prevMatched) {
+ return false;
+ }
+ // short-circuit: isMatched() is not called again once a reading matched
+ thisMatched = thisMatched || elem.isMatched(matchToken);
+ if (!thisMatched && !elem.isInflected() && elem.getPOStag() == null
+ && (prevElement != null && prevElement.getExceptionList() == null)) {
+ return false; // the token is the same, we will not get a match
+ }
+ if (groupsOrUnification) {
+ // second argument: true only for the last reading of the token
+ thisMatched &= testUnificationAndGroups(thisMatched,
+ l + 1 == numberOfReadings, matchToken, elem);
+ }
+ }
+ if (thisMatched) {
+ // a match is cancelled if any reading is hit by an exception ...
+ for (int l = 0; l < numberOfReadings; l++) {
+ if (elem.isExceptionMatchedCompletely(tokens[tokenNo].getAnalyzedToken(l)))
+ return false;
+ }
+ // ... or if the previous token is hit by a scope-previous exception
+ if (tokenNo > 0 && elem.hasPreviousException()) {
+ if (elem.isMatchedByPreviousException(tokens[tokenNo - 1]))
+ return false;
+ }
+ }
+ return thisMatched;
+ }
+
+ /**
+ * Applies the unification test and the AND-group bookkeeping for one
+ * reading.
+ *
+ * @param matched whether the element has matched so far
+ * @param lastReading true if matchToken is the last reading of its token;
+ * the AND group is only evaluated then
+ * @param matchToken the reading being tested
+ * @param elem the element being tested
+ * @return the updated match state
+ */
+ protected boolean testUnificationAndGroups(final boolean matched,
+ final boolean lastReading, final AnalyzedToken matchToken,
+ final Element elem) {
+ boolean thisMatched = matched;
+ if (testUnification) {
+ if (matched && elem.isUnified()) {
+ thisMatched = thisMatched && unifier.isUnified(matchToken, elem.getUniFeatures(),
+ elem.isUniNegated(), lastReading);
+ }
+ if (thisMatched && getUnified) {
+ unifiedTokens = unifier.getFinalUnified();
+ }
+ // an element outside the unification resets the unifier
+ if (!elem.isUnified()) {
+ unifier.reset();
+ }
+ }
+ elem.addMemberAndGroup(matchToken);
+ if (lastReading) {
+ thisMatched &= elem.checkAndGroup(thisMatched);
+ }
+ return thisMatched;
+ }
+
+
+}
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/Element.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/Element.java
new file mode 100644
index 0000000..0ad7c1f
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/Element.java
@@ -0,0 +1,803 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+package de.danielnaber.languagetool.rules.patterns;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Map;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import de.danielnaber.languagetool.AnalyzedToken;
+import de.danielnaber.languagetool.AnalyzedTokenReadings;
+import de.danielnaber.languagetool.JLanguageTool;
+import de.danielnaber.languagetool.synthesis.Synthesizer;
+import de.danielnaber.languagetool.tools.StringTools;
+
+/**
+ * A part of a pattern.
+ *
+ * @author Daniel Naber
+ */
+public class Element {
+
+ private String stringToken;
+ private String posToken;
+ private String regToken;
+ private boolean posRegExp;
+
+ private boolean negation;
+ private boolean posNegation;
+
+ private final boolean caseSensitive;
+ private final boolean stringRegExp;
+ private boolean inflected;
+
+ private boolean testWhitespace;
+ private boolean whitespaceBefore;
+
+ /**
+ * List of exceptions that are valid for the current token and / or some next
+ * tokens.
+ */
+ private List<Element> exceptionList;
+
+ /**
+ * True if scope=="next".
+ */
+ private boolean exceptionValidNext;
+
+ /**
+ * True if any exception with a scope=="current" or scope=="next" is set for
+ * the element.
+ */
+ private boolean exceptionSet;
+
+ /**
+ * True if attribute scope=="previous".
+ */
+ private boolean exceptionValidPrevious;
+
+ /**
+ * List of exceptions that are valid for a previous token.
+ */
+ private List<Element> previousExceptionList;
+
+ private List<Element> andGroupList;
+ private boolean andGroupSet;
+ private boolean[] andGroupCheck;
+
+ private int skip;
+
+ private Pattern p;
+ private Pattern pPos;
+
+ private Matcher m;
+ private Matcher mPos;
+
+ /** The reference to another element in the pattern. **/
+ private Match tokenReference;
+
+ /**
+ * True when the element stores a formatted reference to another element of
+ * the pattern.
+ */
+ private boolean containsMatches;
+
+ /** Matches only tokens without any POS tag. **/
+ private static final String UNKNOWN_TAG = "UNKNOWN";
+
+ /**
+ * Parameter passed to regular expression matcher to enable case insensitive
+ * Unicode matching.
+ */
+ private static final String CASE_INSENSITIVE = "(?iu)";
+
+ private String referenceString;
+
+ /** String ID of the phrase the element is in. **/
+ private String phraseName;
+
+ /**
+ * This var is used to determine if calling {@link #setStringElement} makes
+ * sense. This method takes most time so it's best to reduce the number of its
+ * calls.
+ **/
+ private boolean testString;
+
+ /**
+ * Tells if the element is inside the unification, so that {@link Unifier}
+ * tests it.
+ */
+ private boolean unified;
+ private boolean uniNegation;
+
+ private Map<String, List<String>> unificationFeatures;
+
+  /**
+   * Builds an element that is matched against tokens of the text.
+   *
+   * @param token
+   *          the string to be matched
+   * @param caseSensitive
+   *          whether the check is case-sensitive
+   * @param regExp
+   *          whether the check uses regular expressions
+   * @param inflected
+   *          whether the check refers to base forms (lemmas)
+   */
+  public Element(final String token, final boolean caseSensitive,
+      final boolean regExp, final boolean inflected) {
+    this.inflected = inflected;
+    this.stringRegExp = regExp;
+    this.caseSensitive = caseSensitive;
+    // Kept after the flag assignments; setStringElement presumably reads them.
+    setStringElement(token);
+  }
+
+  /**
+   * Tests this element against a single token reading. The whitespace
+   * condition is checked first; then the string condition (only if string
+   * testing is enabled) and the POS condition are combined, each inverted
+   * by its own negation flag. If the element has an AND group, slot 0 of
+   * the group's check array records that the element itself matched.
+   *
+   * @param token
+   *          the {@link AnalyzedToken} to check matching against
+   * @return true if the token matches, false otherwise
+   */
+  public final boolean isMatched(final AnalyzedToken token) {
+    if (testWhitespace && !isWhitespaceBefore(token)) {
+      return false;
+    }
+    final boolean stringPartHolds = testString
+        ? isStringTokenMatched(token) ^ negation
+        : !negation;
+    final boolean matched = stringPartHolds
+        && (isPosTokenMatched(token) ^ posNegation);
+
+    if (andGroupSet) {
+      andGroupCheck[0] = andGroupCheck[0] | matched;
+    }
+    return matched;
+  }
+
+  /**
+   * Tests the token against the exceptions of this element, skipping
+   * exceptions whose scope is "next".
+   *
+   * @param token
+   *          the {@link AnalyzedToken} to check matching against
+   * @return true if any of the tested exceptions matches (logical
+   *         disjunction)
+   */
+  public final boolean isExceptionMatched(final AnalyzedToken token) {
+    if (!exceptionSet) {
+      return false;
+    }
+    for (final Element candidate : exceptionList) {
+      if (!candidate.exceptionValidNext && candidate.isMatched(token)) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  /**
+   * Records which members of the AND group match the given token. A member
+   * that already matched an earlier token is not tested again. Exceptions
+   * are not tested here.
+   *
+   * This works as a logical AND only when {@link #setupAndGroup()} is called
+   * beforehand and {@link #checkAndGroup(boolean)} afterwards.
+   *
+   * @param token
+   *          the token to test the group members against
+   */
+  public final void addMemberAndGroup(final AnalyzedToken token) {
+    if (!andGroupSet) {
+      return;
+    }
+    // Slot 0 belongs to the element itself; members start at slot 1.
+    int slot = 1;
+    for (final Element member : andGroupList) {
+      if (!andGroupCheck[slot] && member.isMatched(token)) {
+        andGroupCheck[slot] = true;
+      }
+      slot++;
+    }
+  }
+
+  /**
+   * Starts a new AND-group test by replacing the check array with one that
+   * has a slot for the element itself plus one per group member, all false.
+   * Does nothing if the element has no AND group.
+   */
+  public final void setupAndGroup() {
+    if (!andGroupSet) {
+      return;
+    }
+    // A newly allocated boolean[] is already all false.
+    andGroupCheck = new boolean[1 + andGroupList.size()];
+  }
+
+  /**
+   * Evaluates the AND group after the tokens have been tested.
+   *
+   * @param previousValue
+   *          the value returned unchanged when the element has no AND group
+   * @return for an element with an AND group, true only if every slot of
+   *         the check array is set; otherwise {@code previousValue}
+   */
+  public final boolean checkAndGroup(final boolean previousValue) {
+    if (!andGroupSet) {
+      return previousValue;
+    }
+    for (final boolean slotMatched : andGroupCheck) {
+      if (!slotMatched) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  /**
+   * Tests the token against the exceptions of the AND-group members.
+   *
+   * Note that the result is a disjunction: one member with a matching
+   * exception is enough.
+   *
+   * @param token
+   *          the token checked for exceptions
+   * @return true if an exception of at least one AND-group member matches,
+   *         false otherwise or if there is no AND group
+   */
+  public final boolean isAndExceptionGroupMatched(final AnalyzedToken token) {
+    if (!andGroupSet) {
+      return false;
+    }
+    for (final Element member : andGroupList) {
+      if (member.isExceptionMatched(token)) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  /**
+   * Checks the exceptions of the element itself and those of its AND
+   * group. Introduced for clarity.
+   *
+   * @param token
+   *          the token to match
+   * @return true if either kind of exception matches
+   */
+  public final boolean isExceptionMatchedCompletely(final AnalyzedToken token) {
+    // The AND group is only consulted when the element's own exceptions
+    // do not match.
+    if (isExceptionMatched(token)) {
+      return true;
+    }
+    return isAndExceptionGroupMatched(token);
+  }
+
+  /**
+   * Adds an element to the AND group of this element, creating the group
+   * on first use. A {@code null} argument is ignored.
+   *
+   * @param andToken
+   *          the element to add
+   */
+  public final void setAndGroupElement(final Element andToken) {
+    if (andToken == null) {
+      return;
+    }
+    if (andGroupList == null) {
+      andGroupList = new ArrayList<Element>();
+    }
+    andGroupSet = true;
+    andGroupList.add(andToken);
+  }
+
+ /**
+ * Checks if this element has an AND group associated with it.
+ *
+ * @return true if the element has a group of elements that all should match.
+ */
+ public final boolean hasAndGroup() {
+ return andGroupSet;
+ }
+
+ /**
+ * Returns the group of elements linked with AND operator.
+ *
+ * The internal list itself is returned, not a copy.
+ *
+ * @return List of Elements; presumably null when no AND-group member was
+ * ever added (the list is created lazily in setAndGroupElement) -
+ * TODO confirm against the field declaration.
+ */
+ public final List<Element> getAndGroup() {
+ return andGroupList;
+ }
+
+ /**
+ * Checks whether a previously set exception with scope == "next" matches
+ * the given token.
+ *
+ * @param token
+ * {@link AnalyzedToken} to check matching against.
+ * @return True if any of the scope-next exceptions matches.
+ */
+ public final boolean isMatchedByScopeNextException(final AnalyzedToken token) {
+ if (!exceptionSet) {
+ return false;
+ }
+ for (final Element candidate : exceptionList) {
+ if (candidate.exceptionValidNext && candidate.isMatched(token)) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ /**
+ * Checks whether an exception registered for the previous token (scope ==
+ * "previous") matches the given token. Exceptions flagged as scope-next
+ * are skipped.
+ *
+ * @param token
+ * {@link AnalyzedToken} to check matching against.
+ * @return True if any of the exceptions matches.
+ */
+ public final boolean isMatchedByPreviousException(final AnalyzedToken token) {
+ if (!exceptionValidPrevious) {
+ return false;
+ }
+ for (final Element candidate : previousExceptionList) {
+ if (!candidate.exceptionValidNext && candidate.isMatched(token)) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ /**
+ * Checks whether an exception for a previous token matches any reading of a
+ * given token (in case the exception had scope == "previous").
+ *
+ * @param prevToken
+ * {@link AnalyzedTokenReadings} to check matching against.
+ * @return true if any of the exceptions matches at least one reading.
+ */
+ public final boolean isMatchedByPreviousException(
+ final AnalyzedTokenReadings prevToken) {
+ final int numReadings = prevToken.getReadingsLength();
+ for (int i = 0; i < numReadings; i++) {
+ // delegate to the single-reading overload; the first hit wins
+ if (isMatchedByPreviousException(prevToken.getAnalyzedToken(i))) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ /**
+ * Checks if the token is a SENT_START.
+ *
+ * @return True if the element starts the sentence and the element hasn't been
+ * set to have negated POS token.
+ *
+ */
+ public final boolean isSentStart() {
+ return JLanguageTool.SENTENCE_START_TAGNAME.equals(posToken)
+ && !posNegation;
+ }
+
+ /**
+ * Builds a compact text form of the element: an optional leading '!' for
+ * negation, the string token, the phrase name in braces (if any) and the
+ * POS tag after a slash (if any).
+ */
+ @Override
+ public final String toString() {
+ final StringBuilder text = new StringBuilder();
+ if (negation) {
+ text.append('!');
+ }
+ text.append(stringToken);
+ if (phraseName != null) {
+ text.append(" {").append(phraseName).append('}');
+ }
+ if (posToken != null) {
+ text.append('/').append(posToken);
+ }
+ return text.toString();
+ }
+
+ /**
+ * Sets the part-of-speech condition of this element.
+ *
+ * @param posToken
+ * POS tag, or a regular expression over POS tags if regExp is true.
+ * @param regExp
+ * True if posToken is a regular expression; it is then compiled
+ * into a {@link Pattern}.
+ * @param negation
+ * Stored as the POS negation flag of this element.
+ */
+ public final void setPosElement(final String posToken, final boolean regExp,
+ final boolean negation) {
+ this.posToken = posToken;
+ this.posNegation = negation;
+ posRegExp = regExp;
+ if (posRegExp) {
+ pPos = Pattern.compile(posToken);
+ }
+ }
+
+ public final String getString() {
+ return stringToken;
+ }
+
+ /**
+ * Sets the string this element is matched against and, for regular
+ * expression elements, compiles the corresponding pattern.
+ *
+ * @param token
+ * String (or regular expression) to match; if it is empty per
+ * StringTools.isEmpty, testString is set to false and nothing is
+ * compiled.
+ */
+ public final void setStringElement(final String token) {
+ this.stringToken = token;
+ testString = !StringTools.isEmpty(stringToken);
+ if (testString && stringRegExp) {
+ regToken = stringToken;
+ if (!caseSensitive) {
+ // CASE_INSENSITIVE is prepended to the regexp - presumably an
+ // embedded flag such as (?i); confirm against its declaration
+ regToken = CASE_INSENSITIVE + stringToken;
+ }
+ // The literal \0 is not compiled here - presumably it is a token
+ // reference placeholder that compile() resolves later; TODO confirm
+ if (!"\\0".equals(token)) {
+ p = Pattern.compile(regToken);
+ }
+ }
+ }
+
+ /**
+ * Sets a POS-type exception for matching string tokens.
+ *
+ * @param posToken
+ * The part of the speech tag in the exception.
+ * @param regExp
+ * True if the POS is specified as a regular expression.
+ * @param negation
+ * True if the exception is negated.
+ * @param scopeNext
+ * True if the exception scope is next tokens.
+ * @param scopePrevious
+ * True if the exception should match only a single previous token.
+ */
+ public final void setPosException(final String posToken,
+ final boolean regExp, final boolean negation, final boolean scopeNext,
+ final boolean scopePrevious) {
+ final Element posException = new Element("", this.caseSensitive, false,
+ false);
+ posException.setPosElement(posToken, regExp, negation);
+ posException.exceptionValidNext = scopeNext;
+ setException(posException, scopePrevious);
+ }
+
+ /**
+ * Sets a string-type exception for matching string tokens.
+ *
+ * @param token
+ * The string in the exception.
+ * @param regExp
+ * True if the string is specified as a regular expression.
+ * @param inflected
+ * True if the string is a base form (lemma).
+ * @param negation
+ * True if the exception is negated.
+ * @param scopeNext
+ * True if the exception scope is next tokens.
+ * @param scopePrevious
+ * True if the exception should match only a single previous token.
+ */
+ public final void setStringException(final String token,
+ final boolean regExp, final boolean inflected, final boolean negation,
+ final boolean scopeNext, final boolean scopePrevious) {
+ final Element stringException = new Element(token, this.caseSensitive,
+ regExp, inflected);
+ stringException.setNegation(negation);
+ stringException.exceptionValidNext = scopeNext;
+ setException(stringException, scopePrevious);
+ }
+
+ /**
+ * Stores an exception element in the list matching its scope, creating
+ * that list on first use.
+ *
+ * @param elem
+ * the exception to store.
+ * @param scopePrevious
+ * true to store it in the previous-token exception list, false to
+ * store it in the regular exception list.
+ */
+ private void setException(final Element elem, final boolean scopePrevious) {
+ exceptionValidPrevious |= scopePrevious;
+ if (scopePrevious) {
+ if (previousExceptionList == null) {
+ previousExceptionList = new ArrayList<Element>();
+ }
+ previousExceptionList.add(elem);
+ return;
+ }
+ if (exceptionList == null) {
+ exceptionList = new ArrayList<Element>();
+ }
+ exceptionSet = true;
+ exceptionList.add(elem);
+ }
+
+ /**
+ * Tests if part of speech matches a given string.
+ *
+ * Special value UNKNOWN_TAG matches null POS tags, and also the
+ * sentence-end and paragraph-end helper tags. The POS negation flag is not
+ * applied in this method.
+ *
+ * @param token
+ * Token to test.
+ * @return true if matches; always true when this element has no POS set.
+ */
+ private boolean isPosTokenMatched(final AnalyzedToken token) {
+ // if no POS set
+ // defaulting to true
+ if (posToken == null) {
+ return true;
+ }
+ if (token.getPOSTag() == null) {
+ // untagged token: test this element's POS against UNKNOWN_TAG instead
+ if (posRegExp) {
+ // the Matcher is created once and afterwards reused via reset()
+ if (mPos == null) {
+ mPos = pPos.matcher(UNKNOWN_TAG);
+ } else {
+ mPos.reset(UNKNOWN_TAG);
+ }
+ return mPos.matches();
+ }
+ if (UNKNOWN_TAG.equals(posToken)) {
+ return true;
+ }
+ // otherwise fall through: the plain equals() below yields false for a
+ // null tag
+ }
+ boolean match;
+ if (posRegExp) {
+ if (mPos == null) {
+ mPos = pPos.matcher(token.getPOSTag());
+ } else {
+ mPos.reset(token.getPOSTag());
+ }
+ match = mPos.matches();
+ } else {
+ match = posToken.equals(token.getPOSTag());
+ }
+ if (!match && UNKNOWN_TAG.equals(posToken)) { // these are helper tags,
+ // ignore them
+ match = JLanguageTool.SENTENCE_END_TAGNAME.equals(token.getPOSTag())
+ || JLanguageTool.PARAGRAPH_END_TAGNAME.equals(token.getPOSTag());
+ }
+ return match;
+ }
+
+ /**
+ * Tests whether this element's string condition matches a given token,
+ * either as a regular expression or as a plain (optionally
+ * case-insensitive) comparison.
+ *
+ * @param token
+ * {@link AnalyzedToken} to match against.
+ * @return True if matches.
+ */
+ private boolean isStringTokenMatched(final AnalyzedToken token) {
+ final String candidate = getTestToken(token);
+ if (!stringRegExp) {
+ return caseSensitive ? stringToken.equals(candidate)
+ : stringToken.equalsIgnoreCase(candidate);
+ }
+ // the Matcher is created lazily and reused for later tokens
+ if (m == null) {
+ m = p.matcher(candidate);
+ } else {
+ m.reset(candidate);
+ }
+ return m.matches();
+ }
+
+ /**
+ * Picks the string of the token that is compared against this element:
+ * the value of getTokenInflected() for inflected elements, the surface
+ * token otherwise.
+ */
+ private String getTestToken(final AnalyzedToken token) {
+ // enables using words with lemmas and without lemmas
+ // in the same regexp with inflected="yes"
+ return inflected ? token.getTokenInflected() : token.getToken();
+ }
+
+ /**
+ * Gets the exception scope length.
+ *
+ * @return Scope length.
+ */
+ public final int getSkipNext() {
+ return skip;
+ }
+
+ /**
+ * Sets the exception scope length.
+ *
+ * @param i
+ * Exception scope length.
+ */
+ public final void setSkipNext(final int i) {
+ skip = i;
+ }
+
+ /**
+ * Checks if the element has an exception for a previous token.
+ *
+ * @return True if the element has a previous token matching exception.
+ */
+ public final boolean hasPreviousException() {
+ return exceptionValidPrevious;
+ }
+
+ /**
+ * Negates the meaning of match().
+ *
+ * @param negation
+ * - true if the meaning of match() is to be negated.
+ */
+ public final void setNegation(final boolean negation) {
+ this.negation = negation;
+ }
+
+ /**
+ * see {@link #setNegation}
+ *
+ * @since 0.9.3
+ */
+ public final boolean getNegation() {
+ return this.negation;
+ }
+
+ /**
+ *
+ * @return true when this element refers to another token.
+ */
+ public final boolean isReferenceElement() {
+ return containsMatches;
+ }
+
+ /**
+ * Sets the reference to another token.
+ *
+ * @param match
+ * Formatting object for the token reference.
+ */
+ public final void setMatch(final Match match) {
+ tokenReference = match;
+ containsMatches = true;
+ }
+
+ public final Match getMatch() {
+ return tokenReference;
+ }
+
+ /**
+ * Prepare Element for matching by formatting its string token and POS (if the
+ * Element is supposed to refer to some other token).
+ *
+ * The token reference is dereferenced unconditionally, so it must be
+ * non-null when this method is called (it is assigned by setMatch).
+ *
+ * @param token
+ * the token specified as {@link AnalyzedTokenReadings}
+ * @param synth
+ * the language synthesizer ({@link Synthesizer})
+ * @throws IOException
+ * declared by this method; presumably raised while the token
+ * reference is formatted - TODO confirm.
+ */
+ public final void compile(final AnalyzedTokenReadings token,
+ final Synthesizer synth) throws IOException {
+
+ // drop the cached string matcher and pattern; setStringElement() below
+ // recompiles the pattern for regexp elements
+ m = null;
+ p = null;
+ tokenReference.setToken(token);
+ tokenReference.setSynthesizer(synth);
+
+ // remember the original (uncompiled) string on first use so that later
+ // calls start from the same template
+ if (StringTools.isEmpty(referenceString)) {
+ referenceString = stringToken;
+ }
+ if (tokenReference.setsPos()) {
+ final String posReference = tokenReference.getTargetPosTag();
+ if (posReference != null) {
+ // discard the cached POS matcher, the POS pattern is about to change
+ if (mPos != null) {
+ mPos = null;
+ }
+ setPosElement(posReference, tokenReference.posRegExp(), negation);
+ }
+ // the reference only contributes the POS: strip the \N placeholder
+ setStringElement(referenceString.replace("\\"
+ + tokenReference.getTokenRef(), ""));
+ inflected = true;
+ } else {
+ // substitute the \N placeholder with the formatted referenced token
+ setStringElement(referenceString.replace("\\"
+ + tokenReference.getTokenRef(), tokenReference.toTokenString()));
+ }
+ }
+
+ /**
+ * Sets the phrase the element is in.
+ *
+ * @param s
+ * ID of the phrase.
+ */
+ public final void setPhraseName(final String s) {
+ phraseName = s;
+ }
+
+ /**
+ * Checks if the Element is in any phrase.
+ *
+ * @return True if the Element is contained in the phrase.
+ */
+ public final boolean isPartOfPhrase() {
+ return phraseName != null;
+ }
+
+ /**
+ * Whether the element matches case sensitively.
+ *
+ * @since 0.9.3
+ */
+ public final boolean getCaseSensitive() {
+ return caseSensitive;
+ }
+
+ /**
+ * Tests whether the element matches a regular expression.
+ *
+ * @since 0.9.6
+ */
+ public final boolean isRegularExpression() {
+ return stringRegExp;
+ }
+
+ /**
+ * @return the POS of the Element
+ * @since 0.9.6
+ */
+ public final String getPOStag() {
+ return posToken;
+ }
+
+ /**
+ * Tests whether the POS is negated.
+ *
+ * @return true if so.
+ */
+ public final boolean getPOSNegation() {
+ return posNegation;
+ }
+
+ /**
+ * Whether the token is inflected.
+ *
+ * @return True if so.
+ */
+ public final boolean isInflected() {
+ return inflected;
+ }
+
+ /**
+ * Gets the phrase the element is in.
+ *
+ * @return String The name of the phrase.
+ */
+ public final String getPhraseName() {
+ return phraseName;
+ }
+
+ public final boolean isUnified() {
+ return unified;
+ }
+
+ public final void setUnification(final Map<String, List<String>> uniFeatures) {
+ unificationFeatures = uniFeatures;
+ unified = true;
+ }
+
+ /**
+ * Get unification features and types.
+ * @return A map from features to a list of types.
+ * @since 1.0.1
+ */
+ public final Map<String, List<String>> getUniFeatures() {
+ return unificationFeatures;
+ }
+
+ public final void setUniNegation() {
+ uniNegation = true;
+ }
+
+ public final boolean isUniNegated() {
+ return uniNegation;
+ }
+
+ public final void setWhitespaceBefore(final boolean isWhite) {
+ whitespaceBefore = isWhite;
+ testWhitespace = true;
+ }
+
+ /**
+ * Sets the whitespace-before condition on the most recently added
+ * exception of this element. Does nothing if no exception has been added.
+ *
+ * @param isWhite
+ * value passed on to the exception's setWhitespaceBefore.
+ */
+ public final void setExceptionSpaceBefore(final boolean isWhite) {
+ if (exceptionList != null && !exceptionList.isEmpty()) {
+ // the last valid index is size() - 1; get(size()) always throws
+ exceptionList.get(exceptionList.size() - 1).setWhitespaceBefore(isWhite);
+ }
+ }
+
+ public final boolean isWhitespaceBefore(final AnalyzedToken token) {
+ return whitespaceBefore == token.isWhitespaceBefore();
+ }
+
+ /**
+ * Returns the internal list of exceptions (not a copy). Used for testing.
+ *
+ * @return A List of Exceptions.
+ * @since 1.0.0
+ */
+ public final List<Element> getExceptionList() {
+ return exceptionList;
+ }
+}
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/FalseFriendRuleLoader.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/FalseFriendRuleLoader.java
new file mode 100644
index 0000000..94c6515
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/FalseFriendRuleLoader.java
@@ -0,0 +1,356 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+package de.danielnaber.languagetool.rules.patterns;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.text.MessageFormat;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.ResourceBundle;
+
+import javax.xml.parsers.ParserConfigurationException;
+import javax.xml.parsers.SAXParser;
+import javax.xml.parsers.SAXParserFactory;
+
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+import de.danielnaber.languagetool.JLanguageTool;
+import de.danielnaber.languagetool.Language;
+import de.danielnaber.languagetool.rules.Category;
+import de.danielnaber.languagetool.rules.IncorrectExample;
+import de.danielnaber.languagetool.tools.StringTools;
+
+/**
+ * Loads {@link PatternRule}s from a false friends XML file.
+ *
+ * @author Daniel Naber
+ */
+public class FalseFriendRuleLoader extends DefaultHandler {
+
+ public FalseFriendRuleLoader() {
+ }
+
+ /**
+ * Parses a false friends XML stream and returns the rules it produces,
+ * with a localized suggestion sentence appended to the message of every
+ * rule that has an entry in the handler's suggestion map.
+ *
+ * The stream is not closed by this method.
+ *
+ * @param file
+ * stream with the false friends XML.
+ * @param textLanguage
+ * language of the text to be checked.
+ * @param motherTongue
+ * native language of the user; its locale selects the message
+ * bundle.
+ * @return the rules collected by the handler.
+ */
+ public final List<PatternRule> getRules(final InputStream file,
+ final Language textLanguage, final Language motherTongue)
+ throws ParserConfigurationException, SAXException, IOException {
+ final FalseFriendRuleHandler handler = new FalseFriendRuleHandler(
+ textLanguage, motherTongue);
+ final SAXParserFactory factory = SAXParserFactory.newInstance();
+ final SAXParser saxParser = factory.newSAXParser();
+ // do not fetch the external DTD referenced by the XML file
+ saxParser.getXMLReader()
+ .setFeature(
+ "http://apache.org/xml/features/nonvalidating/load-external-dtd",
+ false);
+ saxParser.parse(file, handler);
+ final List<PatternRule> rules = handler.getRules();
+ // Add suggestions to each rule:
+ final ResourceBundle messages = ResourceBundle.getBundle(
+ "de.danielnaber.languagetool.MessagesBundle", motherTongue.getLocale());
+ for (final PatternRule rule : rules) {
+ final List<String> suggestionMap = handler.getSuggestionMap().get(rule.getId());
+ if (suggestionMap != null) {
+ final MessageFormat msgFormat = new MessageFormat(messages
+ .getString("false_friend_suggestion"));
+ final Object[] msg = new Object[] { formatSuggestions(suggestionMap) };
+ rule.setMessage(rule.getMessage() + " " + msgFormat.format(msg));
+ }
+ }
+ return rules;
+ }
+
+ /**
+ * Wraps every entry in a suggestion tag and joins the results with ", ".
+ *
+ * @param l
+ * the suggestions to format.
+ * @return the joined markup; empty if the list is empty.
+ */
+ private String formatSuggestions(final List<String> l) {
+ final StringBuilder joined = new StringBuilder();
+ String separator = "";
+ for (final String suggestion : l) {
+ joined.append(separator);
+ joined.append("<suggestion>").append(suggestion).append("</suggestion>");
+ separator = ", ";
+ }
+ return joined.toString();
+ }
+
+ /**
+ * Testing only: loads the false friend rules for English text / German
+ * native speakers and for German text / English native speakers and prints
+ * them to standard output.
+ *
+ * Declared static so that it can actually serve as a program entry point.
+ *
+ * @param args
+ * ignored.
+ */
+ public static void main(final String[] args)
+ throws ParserConfigurationException, SAXException, IOException {
+ final FalseFriendRuleLoader prg = new FalseFriendRuleLoader();
+ List<PatternRule> l = prg.getRules(JLanguageTool.getDataBroker()
+ .getFromRulesDirAsStream("/false-friends.xml"), Language.ENGLISH,
+ Language.GERMAN);
+ System.out.println("Hints for German native speakers:");
+ for (final PatternRule rule : l) {
+ System.out.println(rule);
+ }
+ System.out.println("=======================================");
+ System.out.println("Hints for English native speakers:");
+ l = prg.getRules(JLanguageTool.getDataBroker()
+ .getFromRulesDirAsStream("/false-friends.xml"),
+ Language.GERMAN, Language.ENGLISH);
+ for (final PatternRule rule : l) {
+ System.out.println(rule);
+ }
+ }
+
+}
+
+class FalseFriendRuleHandler extends XMLRuleHandler {
+
+ private final ResourceBundle messages;
+ private final MessageFormat formatter;
+
+ private final Language textLanguage;
+ private final Language motherTongue;
+
+ private boolean defaultOff;
+
+ private Language language;
+ private Language translationLanguage;
+ private Language currentTranslationLanguage;
+ private List<StringBuilder> translations = new ArrayList<StringBuilder>();
+ private StringBuilder translation = new StringBuilder();
+ private final List<String> suggestions = new ArrayList<String>();
+ // rule ID -> list of translations:
+ private final Map<String, List<String>> suggestionMap = new HashMap<String, List<String>>();
+
+ private boolean inTranslation;
+
+ public FalseFriendRuleHandler(final Language textLanguage,
+ final Language motherTongue) {
+ messages = ResourceBundle.getBundle(
+ "de.danielnaber.languagetool.MessagesBundle", motherTongue.getLocale());
+ formatter = new MessageFormat("");
+ formatter.setLocale(motherTongue.getLocale());
+ this.textLanguage = textLanguage;
+ this.motherTongue = motherTongue;
+ }
+
+ public Map<String, List<String>> getSuggestionMap() {
+ return suggestionMap;
+ }
+
+ // ===========================================================
+ // SAX DocumentHandler methods
+ // ===========================================================
+
+ /**
+ * Handles an opening tag of the false friends file: resets per-rule state
+ * on "rule", resolves the language on "pattern", reads the exception and
+ * token attributes, tracks the translation language and opens the example
+ * and message buffers.
+ *
+ * @throws SAXException
+ * if a pattern names an unknown language.
+ */
+ @Override
+ public void startElement(final String namespaceURI, final String lName,
+ final String qName, final Attributes attrs) throws SAXException {
+ if (qName.equals("rule")) {
+ translations = new ArrayList<StringBuilder>();
+ id = attrs.getValue("id");
+ // a rule group that is switched off keeps all of its rules off
+ if (!(inRuleGroup && defaultOff)) {
+ defaultOff = "off".equals(attrs.getValue("default"));
+ }
+ if (inRuleGroup && id == null) {
+ id = ruleGroupId;
+ }
+ correctExamples = new ArrayList<String>();
+ incorrectExamples = new ArrayList<IncorrectExample>();
+ } else if (qName.equals("pattern")) {
+ inPattern = true;
+ final String languageStr = attrs.getValue("lang");
+ language = Language.getLanguageForShortName(languageStr);
+ if (language == null) {
+ throw new SAXException("Unknown language '" + languageStr + "'");
+ }
+ } else if (qName.equals("exception")) {
+ inException = true;
+ exceptions = new StringBuilder();
+
+ if (attrs.getValue(NEGATE) != null) {
+ exceptionStringNegation = attrs.getValue(NEGATE).equals(YES);
+ }
+ if (attrs.getValue(SCOPE) != null) {
+ exceptionValidNext = attrs.getValue(SCOPE).equals("next");
+ exceptionValidPrev = attrs.getValue(SCOPE).equals("previous");
+ }
+ if (attrs.getValue(INFLECTED) != null) {
+ exceptionStringInflected = attrs.getValue(INFLECTED).equals(YES);
+ }
+ if (attrs.getValue(POSTAG) != null) {
+ exceptionPosToken = attrs.getValue(POSTAG);
+ if (attrs.getValue(POSTAG_REGEXP) != null) {
+ exceptionPosRegExp = attrs.getValue(POSTAG_REGEXP).equals(YES);
+ }
+ if (attrs.getValue(NEGATE_POS) != null) {
+ exceptionPosNegation = attrs.getValue(NEGATE_POS).equals(YES);
+ }
+ }
+ if (attrs.getValue(REGEXP) != null) {
+ exceptionStringRegExp = attrs.getValue(REGEXP).equals(YES);
+ }
+
+ } else if (qName.equals(TOKEN)) {
+ setToken(attrs);
+ } else if (qName.equals("translation")) {
+ inTranslation = true;
+ final String languageStr = attrs.getValue("lang");
+ final Language tmpLang = Language.getLanguageForShortName(languageStr);
+ currentTranslationLanguage = tmpLang;
+ if (tmpLang == motherTongue) {
+ translationLanguage = tmpLang;
+ // NOTE(review): this check can only fire if motherTongue itself is
+ // null; translations in unknown languages are silently skipped.
+ if (translationLanguage == null) {
+ throw new SAXException("Unknown language '" + languageStr + "'");
+ }
+ }
+ } else if (qName.equals(EXAMPLE)
+ && "correct".equals(attrs.getValue(TYPE))) {
+ // constant-first equals: a missing type attribute yields null and must
+ // not cause a NullPointerException
+ inCorrectExample = true;
+ correctExample = new StringBuilder();
+ } else if (qName.equals(EXAMPLE)
+ && "incorrect".equals(attrs.getValue(TYPE))) {
+ inIncorrectExample = true;
+ incorrectExample = new StringBuilder();
+ } else if (qName.equals("message")) {
+ inMessage = true;
+ message = new StringBuilder();
+ } else if (qName.equals("rulegroup")) {
+ ruleGroupId = attrs.getValue("id");
+ inRuleGroup = true;
+ defaultOff = "off".equals(attrs.getValue(DEFAULT));
+ }
+ }
+
+ /**
+ * Handles a closing tag of the false friends file. On "rule" a
+ * {@link PatternRule} is created if the rule applies to the current
+ * text language / mother tongue pair; on "translation" the collected text is
+ * stored; on "rulegroup" the collected suggestions are filed under the
+ * current rule id.
+ */
+ @Override
+ public void endElement(final String namespaceURI, final String sName,
+ final String qName) {
+ if (qName.equals("rule")) {
+ // only build a rule when the pattern is in the checked language, the
+ // two languages differ, and a mother-tongue translation was collected
+ if (language == textLanguage && translationLanguage != null
+ && translationLanguage == motherTongue && language != motherTongue
+ && !translations.isEmpty()) {
+ formatter.applyPattern(messages.getString("false_friend_hint"));
+ final Object[] messageArguments = {
+ elements.toString().replace('|', '/'),
+ messages.getString(textLanguage.getShortName()),
+ formatTranslations(translations),
+ messages.getString(motherTongue.getShortName()) };
+ final String description = formatter.format(messageArguments);
+ final PatternRule rule = new PatternRule(id, language, elementList,
+ messages.getString("false_friend_desc") + " "
+ + elements.toString().replace('|', '/'), description, messages
+ .getString("false_friend"));
+ rule.setCorrectExamples(correctExamples);
+ rule.setIncorrectExamples(incorrectExamples);
+ rule.setCategory(new Category(messages
+ .getString("category_false_friend")));
+ if (defaultOff) {
+ rule.setDefaultOff();
+ }
+ rules.add(rule);
+ }
+
+ if (elementList != null) {
+ elementList.clear();
+ }
+
+ } else if (qName.equals("exception")) {
+ inException = false;
+ // the token element is created on the first exception of a token
+ if (!exceptionSet) {
+ tokenElement = new Element(elements.toString(), caseSensitive,
+ regExpression, tokenInflected);
+ exceptionSet = true;
+ }
+ tokenElement.setNegation(tokenNegated);
+ if (!StringTools.isEmpty(exceptions.toString())) {
+ tokenElement.setStringException(exceptions.toString(),
+ exceptionStringRegExp, exceptionStringInflected,
+ exceptionStringNegation, exceptionValidNext, exceptionValidPrev);
+ }
+ if (exceptionPosToken != null) {
+ tokenElement.setPosException(exceptionPosToken, exceptionPosRegExp,
+ exceptionPosNegation, exceptionValidNext, exceptionValidPrev);
+ exceptionPosToken = null;
+ }
+ } else if (qName.equals(TOKEN)) {
+ finalizeTokens();
+ } else if (qName.equals("pattern")) {
+ inPattern = false;
+ } else if (qName.equals("translation")) {
+ // mother-tongue translations go into the rule message ...
+ if (currentTranslationLanguage == motherTongue) {
+ translations.add(translation);
+ }
+ // ... translations in the text language become suggestions
+ if (currentTranslationLanguage == textLanguage) {
+ suggestions.add(translation.toString());
+ }
+ translation = new StringBuilder();
+ inTranslation = false;
+ currentTranslationLanguage = null;
+ } else if (qName.equals(EXAMPLE)) {
+ if (inCorrectExample) {
+ correctExamples.add(correctExample.toString());
+ } else if (inIncorrectExample) {
+ incorrectExamples
+ .add(new IncorrectExample(incorrectExample.toString()));
+ }
+ inCorrectExample = false;
+ inIncorrectExample = false;
+ correctExample = new StringBuilder();
+ incorrectExample = new StringBuilder();
+ } else if (qName.equals("message")) {
+ inMessage = false;
+ } else if (qName.equals("rulegroup")) {
+ // the suggestions are stored under the id that is current at this
+ // point, i.e. the id set by the last rule of the group
+ if (!suggestions.isEmpty()) {
+ final List<String> l = new ArrayList<String>(suggestions);
+ suggestionMap.put(id, l);
+ suggestions.clear();
+ }
+ inRuleGroup = false;
+ }
+ }
+
+ /**
+ * Puts every translation in double quotes and joins them with ", ".
+ *
+ * @param translations
+ * the translations to format.
+ * @return the joined text; empty if there are no translations.
+ */
+ private String formatTranslations(final List<StringBuilder> translations) {
+ final StringBuilder joined = new StringBuilder();
+ String separator = "";
+ for (final StringBuilder trans : translations) {
+ joined.append(separator);
+ joined.append('"').append(trans.toString()).append('"');
+ separator = ", ";
+ }
+ return joined.toString();
+ }
+
+ /**
+ * Appends character data to the buffer of the element currently being
+ * read. Text is appended rather than assigned because a SAX parser may
+ * deliver the content of one element in several chunks.
+ *
+ * The first matching state wins, in this order: exception, token inside a
+ * pattern, correct example, incorrect example, translation. Text in any
+ * other position is dropped.
+ */
+ @Override
+ public void characters(final char[] buf, final int offset, final int len) {
+ final String s = new String(buf, offset, len);
+ if (inException) {
+ exceptions.append(s);
+ } else if (inToken && inPattern) {
+ elements.append(s);
+ } else if (inCorrectExample) {
+ correctExample.append(s);
+ } else if (inIncorrectExample) {
+ incorrectExample.append(s);
+ } else if (inTranslation) {
+ translation.append(s);
+ }
+ }
+
+}
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/Match.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/Match.java
new file mode 100644
index 0000000..0519f2c
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/Match.java
@@ -0,0 +1,551 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+package de.danielnaber.languagetool.rules.patterns;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.TreeSet;
+import java.util.regex.Pattern;
+
+import de.danielnaber.languagetool.AnalyzedToken;
+import de.danielnaber.languagetool.AnalyzedTokenReadings;
+import de.danielnaber.languagetool.JLanguageTool;
+import de.danielnaber.languagetool.synthesis.Synthesizer;
+import de.danielnaber.languagetool.tools.StringTools;
+
+/**
+ * Reference to a matched token in a pattern, can be formatted and used for
+ * matching &amp; suggestions.
+ *
+ * @author Marcin Miłkowski
+ */
+public class Match {
+
+ /** Possible string case conversions. **/
+ public enum CaseConversion {
+ NONE, STARTLOWER, STARTUPPER, ALLLOWER, ALLUPPER;
+
+ /**
+ * Converts string to the constant enum.
+ *
+ * The lookup uses valueOf, so the name must match a constant exactly
+ * (including case). Any failure - unknown name or null - yields NONE.
+ *
+ * @param str
+ * String value to be converted.
+ * @return CaseConversion enum; NONE if str names no constant.
+ */
+ public static CaseConversion toCase(final String str) {
+ try {
+ return valueOf(str);
+ } catch (final Exception ex) {
+ // fallback: an unrecognized value means "no conversion"
+ return NONE;
+ }
+ }
+ }
+
+ /**
+ * Which skipped tokens are included when a match is formatted; see the
+ * array variant of setToken for how FOLLOWING and NONE are handled.
+ */
+ public enum IncludeRange {
+ NONE, FOLLOWING, ALL;
+
+ /**
+ * Converts string to the constant enum.
+ *
+ * The lookup uses valueOf, so the name must match a constant exactly
+ * (including case). Any failure - unknown name or null - yields NONE.
+ *
+ * @param str
+ * String value to be converted.
+ * @return IncludeRange enum; NONE if str names no constant.
+ */
+ public static IncludeRange toRange(final String str) {
+ try {
+ return valueOf(str);
+ } catch (final Exception ex) {
+ // fallback: an unrecognized value means "include nothing"
+ return NONE;
+ }
+ }
+ }
+
+ private final String posTag;
+ private boolean postagRegexp;
+ private final String regexReplace;
+ private final String posTagReplace;
+ private final CaseConversion caseConversionType;
+
+ private final IncludeRange includeSkipped;
+ private String skippedTokens;
+
+ /**
+ * True if this match element formats a statically defined lemma which is
+ * enclosed by the element, e.g., <tt>&lt;match...&gt;word&lt;/match&gt;</tt>.
+ */
+ private boolean staticLemma;
+
+ /**
+ * True if this match element is used for formatting POS token.
+ */
+ private final boolean setPos;
+
+ private AnalyzedTokenReadings formattedToken;
+ private AnalyzedTokenReadings matchedToken;
+
+ private int tokenRef;
+
+ /** Word form generator for POS tags. **/
+ private Synthesizer synthesizer;
+
+ /** Pattern used to define parts of the matched token. **/
+ private Pattern pRegexMatch;
+
+ /** Pattern used to define parts of the matched POS token. **/
+ private Pattern pPosRegexMatch;
+
+ /**
+ * True when the match is not in the suggestion.
+ */
+ private boolean inMessageOnly;
+
+ /**
+ * Creates a match element.
+ *
+ * @param posTag
+ * POS tag of the match; compiled as a regular expression when
+ * postagRegexp is true and the tag is not null.
+ * @param posTagReplace
+ * replacement string for the POS tag.
+ * @param postagRegexp
+ * true if posTag is a regular expression.
+ * @param regexMatch
+ * regular expression applied to the token string, or null for
+ * none.
+ * @param regexReplace
+ * replacement used together with regexMatch.
+ * @param caseConversionType
+ * case conversion to apply.
+ * @param setPOS
+ * true if this match is used for setting the POS of an element.
+ * @param includeSkipped
+ * which skipped tokens are to be included.
+ */
+ public Match(final String posTag, final String posTagReplace,
+ final boolean postagRegexp, final String regexMatch,
+ final String regexReplace, final CaseConversion caseConversionType,
+ final boolean setPOS,
+ final IncludeRange includeSkipped) {
+ this.posTag = posTag;
+ this.postagRegexp = postagRegexp;
+ this.caseConversionType = caseConversionType;
+
+ if (regexMatch != null) {
+ pRegexMatch = Pattern.compile(regexMatch);
+ }
+ if (postagRegexp && posTag != null) {
+ pPosRegexMatch = Pattern.compile(posTag);
+ }
+
+ this.regexReplace = regexReplace;
+ this.posTagReplace = posTagReplace;
+ this.setPos = setPOS;
+ this.includeSkipped = includeSkipped;
+ }
+
+ /**
+ * Sets the token that will be formatted or otherwise used in the class.
+ *
+ * When a static lemma has been set (see setLemmaString), the lemma stays
+ * the formatted token and the given token is only stored as the matched
+ * token; otherwise the given token becomes the formatted token.
+ *
+ * @param token
+ * the token to use.
+ */
+ public final void setToken(final AnalyzedTokenReadings token) {
+ if (staticLemma) {
+ matchedToken = token;
+ } else {
+ formattedToken = token;
+ }
+ }
+
+ /**
+ * Sets the token to be formatted etc. and includes the support for
+ * including the skipped tokens.
+ * @param tokens Array of tokens
+ * @param index Index of the token to be formatted
+ * @param next Offset relative to index: the skipped tokens are
+ * tokens[index + 1] up to and including tokens[index + next - 1].
+ * Skipped tokens are only collected when next is greater than 1
+ * and the include range is not NONE.
+ */
+ public final void setToken(final AnalyzedTokenReadings[] tokens, final int index, final int next) {
+ setToken(tokens[index]);
+ if (next > 1 && includeSkipped != IncludeRange.NONE) {
+ final StringBuilder sb = new StringBuilder();
+ if (includeSkipped == IncludeRange.FOLLOWING) {
+ // only the following tokens are wanted: drop the token itself
+ formattedToken = null;
+ }
+ for (int k = index + 1; k < index + next; k++) {
+ // separate skipped tokens by a space where the text had whitespace,
+ // but never put one in front of the first skipped token
+ if (k > index + 1 &&
+ tokens[k].isWhitespaceBefore()) {
+ sb.append(' ');
+ }
+ sb.append(tokens[k].getToken());
+ }
+ skippedTokens = sb.toString();
+ } else {
+ skippedTokens = "";
+ }
+ }
+
+ /**
+ private String[] addSkipped(final String[] formattedString) {
+ if (skippedTokens != null && !"".equals(skippedTokens)) {
+ String[] finalStrings = new String[formattedString.length];
+ for (int i = 1; i <= formattedString.length; i++)
+ }
+ }
+
+ **/
+
+ /**
+ * Checks if the Match element is used for setting the part of speech Element.
+ *
+ * @return True if Match sets POS.
+ */
+ public final boolean setsPos() {
+ return setPos;
+ }
+
+ /**
+ * Checks if the Match element uses regexp-based form of the POS tag.
+ *
+ * @return True if regexp is used in POS.
+ */
+ public final boolean posRegExp() {
+ return postagRegexp;
+ }
+
+ /**
+ * Sets a base form (lemma) that will be formatted, or synthesized, using the
+ * specified POS regular expressions.
+ *
+ * An empty or null lemma (per StringTools.isEmpty) is ignored. Otherwise
+ * the match is switched to static-lemma mode and its POS tag is from then
+ * on treated as a regular expression.
+ *
+ * @param lemmaString String that specifies the base form.
+ */
+ public final void setLemmaString(final String lemmaString) {
+ if (!StringTools.isEmpty(lemmaString)) {
+ // the lemma serves both as token text and as base form of the reading
+ formattedToken = new AnalyzedTokenReadings(new AnalyzedToken(lemmaString,
+ posTag, lemmaString), 0);
+ staticLemma = true;
+ postagRegexp = true;
+ if (posTag != null) {
+ pPosRegexMatch = Pattern.compile(posTag);
+ }
+ }
+ }
+
+ /**
+ * Sets a synthesizer used for grammatical synthesis of forms based on
+ * formatted POS values. The reference is stored as-is; passing null makes
+ * toFinalString() fall back to the plain token text when a POS tag is set.
+ *
+ * @param synth Synthesizer instance to use (may be null).
+ */
+ public final void setSynthesizer(final Synthesizer synth) {
+ synthesizer = synth;
+ }
+
+ /**
+ * Gets all strings formatted using the match element.
+ * <p>
+ * The token text is first passed through the optional regexp replacement and
+ * the case conversion. When a POS tag is set, word forms are synthesized
+ * instead (one array entry per distinct form, sorted by the TreeSet). When
+ * skipped tokens are to be included, their text is appended to every entry.
+ *
+ * @return array of strings; if no formatted token is set and no skipped text
+ * is appended, the single element is null
+ * @throws IOException
+ * in case of synthesizer-related disk problems.
+ */
+ public final String[] toFinalString() throws IOException {
+ String[] formattedString = new String[1];
+ if (formattedToken != null) {
+ final int readingCount = formattedToken.getReadingsLength();
+ formattedString[0] = formattedToken.getToken();
+ if (pRegexMatch != null) {
+ formattedString[0] = pRegexMatch.matcher(formattedString[0])
+ .replaceAll(regexReplace);
+ }
+ formattedString[0] = convertCase(formattedString[0]);
+ if (posTag != null) {
+ if (synthesizer == null) {
+ // No synthesizer: fall back to the raw token text. This overwrites
+ // the regexp replacement and case conversion applied above.
+ formattedString[0] = formattedToken.getToken();
+ } else if (postagRegexp) {
+ final TreeSet<String> wordForms = new TreeSet<String>();
+ // oneForm ends up true when the token itself is to be used instead
+ // of synthesized forms; it reflects the last lemma-less reading seen.
+ boolean oneForm = false;
+ for (int k = 0; k < readingCount; k++) {
+ if (formattedToken.getAnalyzedToken(k).getLemma() == null) {
+ final String posUnique = formattedToken.getAnalyzedToken(k)
+ .getPOSTag();
+ if (posUnique == null) {
+ wordForms.add(formattedToken.getToken());
+ oneForm = true;
+ } else {
+ // Sentence/paragraph boundary tags are markers, not real POS tags.
+ if (JLanguageTool.SENTENCE_START_TAGNAME.equals(posUnique)
+ || JLanguageTool.SENTENCE_END_TAGNAME.equals(posUnique)
+ || JLanguageTool.PARAGRAPH_END_TAGNAME.equals(posUnique)) {
+ if (!oneForm) {
+ wordForms.add(formattedToken.getToken());
+ }
+ oneForm = true;
+ } else {
+ oneForm = false;
+ }
+ }
+ }
+ }
+ final String targetPosTag = getTargetPosTag();
+ if (!oneForm) {
+ // Synthesize forms for every reading using the target POS tag.
+ for (int i = 0; i < readingCount; i++) {
+ final String[] possibleWordForms = synthesizer.synthesize(
+ formattedToken.getAnalyzedToken(i), targetPosTag, true);
+ if (possibleWordForms != null) {
+ wordForms.addAll(Arrays.asList(possibleWordForms));
+ }
+ }
+ }
+ if (wordForms.isEmpty()) {
+ // Nothing could be produced: output the token in parentheses.
+ formattedString[0] = "(" + formattedToken.getToken() + ")";
+ } else {
+ formattedString = wordForms.toArray(new String[wordForms.size()]);
+ }
+ } else {
+ // Plain (non-regexp) POS tag: synthesize with the tag as given.
+ // NOTE(review): if no form is synthesized the result is an empty array.
+ final TreeSet<String> wordForms = new TreeSet<String>();
+ for (int i = 0; i < readingCount; i++) {
+ final String[] possibleWordForms = synthesizer.synthesize(
+ formattedToken.getAnalyzedToken(i), posTag);
+ if (possibleWordForms != null) {
+ wordForms.addAll(Arrays.asList(possibleWordForms));
+ }
+ }
+ formattedString = wordForms.toArray(new String[wordForms.size()]);
+ }
+ }
+ }
+ // Append the skipped text to every produced form; null entries (no
+ // formatted token) are treated as the empty string.
+ if (includeSkipped != IncludeRange.NONE
+ && skippedTokens != null && !"".equals(skippedTokens)) {
+ final String[] helper = new String[formattedString.length];
+ for (int i = 0; i < formattedString.length; i++) {
+ if (formattedString[i] == null) {
+ formattedString[i] = "";
+ }
+ helper[i] = formattedString[i] + skippedTokens;
+ }
+ formattedString = helper;
+ }
+ return formattedString;
+ }
+
+ /**
+  * Formats the POS tag using parameters already defined in the class.
+  * <p>
+  * With a static lemma, the readings of the matched token are scanned and the
+  * POS tag of the last reading matching the POS regexp is used, optionally
+  * rewritten with the POS replacement string. Otherwise the readings of the
+  * formatted token are scanned and, if a replacement is defined, all matching
+  * tags are rewritten and joined with "|".
+  * <p>
+  * When no POS regexp has been compiled, no reading can match and the plain
+  * POS tag is returned instead of failing with a NullPointerException.
+  *
+  * @return Formatted POS tag as String.
+  */
+ // FIXME: gets only the first POS tag that matches, this can be wrong
+ // on the other hand, many POS tags = too many suggestions?
+ public final String getTargetPosTag() {
+   String targetPosTag = posTag;
+   // The pattern is only compiled when a POS tag is available, so it has to be
+   // checked before it is used for matching as well as for replacing.
+   final boolean hasPosRegex = pPosRegexMatch != null;
+   if (staticLemma) {
+     final int numRead = matchedToken.getReadingsLength();
+     for (int i = 0; i < numRead; i++) {
+       final String tst = matchedToken.getAnalyzedToken(i).getPOSTag();
+       if (tst != null && hasPosRegex && pPosRegexMatch.matcher(tst).matches()) {
+         targetPosTag = matchedToken.getAnalyzedToken(i).getPOSTag();
+       }
+     }
+     if (hasPosRegex && posTagReplace != null) {
+       targetPosTag = pPosRegexMatch.matcher(targetPosTag).replaceAll(
+           posTagReplace);
+     }
+   } else {
+     final List<String> posTags = new ArrayList<String>();
+     final int numRead = formattedToken.getReadingsLength();
+     for (int i = 0; i < numRead; i++) {
+       final String tst = formattedToken.getAnalyzedToken(i).getPOSTag();
+       if (tst != null && hasPosRegex && pPosRegexMatch.matcher(tst).matches()) {
+         targetPosTag = formattedToken.getAnalyzedToken(i).getPOSTag();
+         posTags.add(targetPosTag);
+       }
+     }
+     if (hasPosRegex && posTagReplace != null) {
+       if (posTags.isEmpty()) {
+         // nothing matched: rewrite the plain POS tag instead
+         posTags.add(targetPosTag);
+       }
+       final StringBuilder sb = new StringBuilder();
+       final int posTagLen = posTags.size();
+       int l = 0;
+       for (String lposTag : posTags) {
+         l++;
+         lposTag = pPosRegexMatch.matcher(lposTag).replaceAll(posTagReplace);
+         if (setPos) {
+           lposTag = synthesizer.getPosTagCorrection(lposTag);
+         }
+         sb.append(lposTag);
+         if (l < posTagLen) {
+           sb.append('|');
+         }
+       }
+       targetPosTag = sb.toString();
+     }
+   }
+   return targetPosTag;
+ }
+
+ /**
+ * Method for getting the formatted match as a single string. In case of
+ * multiple matches, it joins them using a regular expression operator "|".
+ *
+ * @return Formatted string of the matched token.
+ */
+ public final String toTokenString() throws IOException {
+ final StringBuilder output = new StringBuilder();
+ final String[] stringToFormat = toFinalString();
+ for (int i = 0; i < stringToFormat.length; i++) {
+ output.append(stringToFormat[i]);
+ if (i + 1 < stringToFormat.length) {
+ output.append('|');
+ }
+ }
+ return output.toString();
+ }
+
+ /**
+ * Sets the token number referenced by the match. The value is stored
+ * without validation.
+ *
+ * @param i Token number.
+ */
+ public final void setTokenRef(final int i) {
+ tokenRef = i;
+ }
+
+ /**
+ * Gets the token number referenced by the match, as stored by
+ * setTokenRef(int).
+ *
+ * @return int - token number.
+ */
+ public final int getTokenRef() {
+ return tokenRef;
+ }
+
+ /**
+  * Applies the case conversion configured for this match element to a token.
+  * Tokens for which {@code StringTools.isEmpty} returns true are returned
+  * unchanged.
+  *
+  * @param s Token to be converted.
+  * @return Converted string.
+  */
+ private String convertCase(final String s) {
+   if (StringTools.isEmpty(s)) {
+     return s;
+   }
+   switch (caseConversionType) {
+     case STARTLOWER:
+       return s.substring(0, 1).toLowerCase() + s.substring(1);
+     case STARTUPPER:
+       return s.substring(0, 1).toUpperCase() + s.substring(1);
+     case ALLUPPER:
+       return s.toUpperCase();
+     case ALLLOWER:
+       return s.toLowerCase();
+     case NONE:
+     default:
+       return s;
+   }
+ }
+
+ /**
+ * Used to let LT know that it should change the case of the match.
+ *
+ * @return true if the configured case conversion is anything other than
+ * {@code CaseConversion.NONE}.
+ */
+ public final boolean convertsCase() {
+ return !caseConversionType.equals(CaseConversion.NONE);
+ }
+
+ /**
+ * Builds the readings of the token produced by this match.
+ * <p>
+ * The token text is passed through the optional regexp replacement and the
+ * case conversion. If a POS tag is set, new readings are created: with a
+ * regexp POS tag, one per reading whose POS tag matches the pattern
+ * (optionally rewritten with the POS replacement), otherwise - or when
+ * nothing matched - the readings returned by getNewToken().
+ *
+ * @return a new AnalyzedTokenReadings holding the created readings, or the
+ * current formatted token (possibly null) when no reading was created.
+ */
+ public final AnalyzedTokenReadings filterReadings() {
+ final ArrayList<AnalyzedToken> l = new ArrayList<AnalyzedToken>();
+ if (formattedToken != null) {
+ if (staticLemma) {
+ // Static lemma: rebuild the formatted token from the matched token's
+ // text and position, using the formatted token's text as the lemma.
+ formattedToken = new AnalyzedTokenReadings(new AnalyzedToken(
+ matchedToken.getToken(), posTag, formattedToken.getToken()),
+ matchedToken.getStartPos());
+ formattedToken.setWhitespaceBefore(matchedToken.isWhitespaceBefore());
+ }
+ String token = formattedToken.getToken();
+ if (pRegexMatch != null) {
+ token = pRegexMatch.matcher(token).replaceAll(regexReplace);
+ }
+ token = convertCase(token);
+ if (posTag != null) {
+ final int numRead = formattedToken.getReadingsLength();
+ if (postagRegexp) {
+ String targetPosTag = posTag;
+ for (int i = 0; i < numRead; i++) {
+ final String tst = formattedToken.getAnalyzedToken(i).getPOSTag();
+ if (tst != null && pPosRegexMatch.matcher(tst).matches()) {
+ targetPosTag = formattedToken.getAnalyzedToken(i).getPOSTag();
+ if (posTagReplace != null) {
+ targetPosTag = pPosRegexMatch.matcher(targetPosTag).replaceAll(
+ posTagReplace);
+ }
+ l
+ .add(new AnalyzedToken(token, targetPosTag, formattedToken
+ .getAnalyzedToken(i).getLemma()));
+ l.get(l.size() - 1).setWhitespaceBefore(formattedToken.isWhitespaceBefore());
+ }
+ }
+ // No reading matched the POS pattern: fall back to getNewToken().
+ if (l.isEmpty()) {
+ for (final AnalyzedToken anaTok : getNewToken(numRead, token)) {
+ l.add(anaTok);
+ }
+ }
+ } else {
+ for (final AnalyzedToken anaTok : getNewToken(numRead, token)) {
+ l.add(anaTok);
+ }
+ }
+ // Keep the sentence-end / paragraph-end marker readings of the token.
+ if (formattedToken.isSentEnd()) {
+ l.add(new AnalyzedToken(formattedToken.getToken(),
+ JLanguageTool.SENTENCE_END_TAGNAME,
+ formattedToken.getAnalyzedToken(0).getLemma()));
+ }
+ if (formattedToken.isParaEnd()) {
+ l.add(new AnalyzedToken(formattedToken.getToken(),
+ JLanguageTool.PARAGRAPH_END_TAGNAME,
+ formattedToken.getAnalyzedToken(0).getLemma()));
+ }
+ }
+ }
+ if (l.isEmpty()) {
+ return formattedToken;
+ }
+ return new AnalyzedTokenReadings(l.toArray(new AnalyzedToken[l.size()]), formattedToken.getStartPos());
+ }
+
+ /**
+  * Creates one new reading, tagged with the match's POS tag, for every reading
+  * of the formatted token that has a POS tag.
+  * <p>
+  * The lemma is taken from a reading whose POS tag equals the match's POS tag
+  * and is kept for the following readings; while no such lemma is known, the
+  * lemma of the first reading is used.
+  *
+  * @param numRead number of readings of the formatted token to inspect
+  * @param token token text for the new readings
+  * @return the newly created readings, possibly an empty array
+  */
+ private AnalyzedToken[] getNewToken(final int numRead, final String token) {
+   final List<AnalyzedToken> created = new ArrayList<AnalyzedToken>();
+   String lemma = "";
+   for (int j = 0; j < numRead; j++) {
+     final AnalyzedToken reading = formattedToken.getAnalyzedToken(j);
+     if (reading.getPOSTag() == null) {
+       continue;
+     }
+     if (reading.getPOSTag().equals(posTag) && reading.getLemma() != null) {
+       lemma = reading.getLemma();
+     }
+     if (StringTools.isEmpty(lemma)) {
+       lemma = formattedToken.getAnalyzedToken(0).getLemma();
+     }
+     final AnalyzedToken fresh = new AnalyzedToken(token, posTag, lemma);
+     created.add(fresh);
+     fresh.setWhitespaceBefore(formattedToken.isWhitespaceBefore());
+   }
+   return created.toArray(new AnalyzedToken[created.size()]);
+ }
+
+ /**
+ * Sets the {@code inMessageOnly} flag of this match.
+ *
+ * @param inMessageOnly
+ * the inMessageOnly to set
+ */
+ public void setInMessageOnly(final boolean inMessageOnly) {
+ this.inMessageOnly = inMessageOnly;
+ }
+
+ /**
+ * Returns the {@code inMessageOnly} flag of this match.
+ *
+ * @return the inMessageOnly
+ */
+ public boolean isInMessageOnly() {
+ return inMessageOnly;
+ }
+
+}
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/PatternRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/PatternRule.java
new file mode 100644
index 0000000..843ef98
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/PatternRule.java
@@ -0,0 +1,652 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+package de.danielnaber.languagetool.rules.patterns;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+import de.danielnaber.languagetool.AnalyzedSentence;
+import de.danielnaber.languagetool.AnalyzedTokenReadings;
+import de.danielnaber.languagetool.Language;
+import de.danielnaber.languagetool.rules.IncorrectExample;
+import de.danielnaber.languagetool.rules.RuleMatch;
+import de.danielnaber.languagetool.tools.StringTools;
+
+/**
+ * A Rule that describes a language error as a simple pattern of words or of
+ * part-of-speech tags.
+ *
+ * @author Daniel Naber
+ */
+public class PatternRule extends AbstractPatternRule {
+
+ private static final String SUGG_TAG = "<suggestion>";
+ private static final String END_SUGG_TAG = "</suggestion>";
+
+ private String subId; // because there can be more than one rule in a rule
+ // group
+
+ private String message;
+ private String shortMessage;
+
+ /** Formatted suggestion elements. **/
+ private List<Match> suggestionMatches;
+
+ /**
+ * A list of elements as they appear in XML file (phrases count as single
+ * tokens in case of matches or skipping).
+ */
+ private List<Integer> elementNo;
+
+ /**
+ * This property is used for short-circuiting evaluation of the elementNo list
+ * order.
+ */
+ private boolean useList;
+
+ /**
+ * Marks whether the rule is a member of a disjunctive set (in case of OR
+ * operation on phraserefs).
+ **/
+ private boolean isMemberOfDisjunctiveSet;
+
+ /**
+ * Creates a pattern rule and computes, for the given elements, how many
+ * pattern elements each XML-level element (token or phrase) spans.
+ *
+ * @param id
+ * Id of the Rule
+ * @param language
+ * Language of the Rule
+ * @param elements
+ * Element (token) list
+ * @param description
+ * Description to be shown (name)
+ * @param message
+ * Message to be displayed to the user
+ * @param shortMessage
+ * Short variant of the message; stored as-is
+ * @throws NullPointerException
+ * if id, language, elements or description is null
+ */
+
+ public PatternRule(final String id, final Language language,
+ final List<Element> elements, final String description,
+ final String message, final String shortMessage) {
+ super(id, description, language, elements, false);
+ // NOTE(review): these checks run after the superclass constructor has
+ // already received the arguments.
+ if (id == null) {
+ throw new NullPointerException("id cannot be null");
+ }
+ if (language == null) {
+ throw new NullPointerException("language cannot be null");
+ }
+ if (elements == null) {
+ throw new NullPointerException("elements cannot be null");
+ }
+ if (description == null) {
+ throw new NullPointerException("description cannot be null");
+ }
+
+ this.message = message;
+ this.shortMessage = shortMessage;
+ this.elementNo = new ArrayList<Integer>();
+ // Fill elementNo: a run of elements belonging to the same phrase is
+ // recorded as its length, every non-phrase element as 1.
+ String prevName = "";
+ String curName = "";
+ int cnt = 0;
+ int loopCnt = 0;
+ for (final Element e : patternElements) {
+ if (e.isPartOfPhrase()) {
+ curName = e.getPhraseName();
+ if (prevName.equals(curName) || StringTools.isEmpty(prevName)) {
+ cnt++;
+ useList = true;
+ } else {
+ // Phrase name changed: close the previous run.
+ // NOTE(review): the counter restarts at 0, so the element that
+ // triggered the change does not seem to be counted - confirm.
+ elementNo.add(cnt);
+ prevName = "";
+ curName = "";
+ cnt = 0;
+ }
+ prevName = curName;
+ loopCnt++;
+ // Last element of the pattern: close the still-open phrase run.
+ if (loopCnt == patternElements.size() && !StringTools.isEmpty(prevName)) {
+ elementNo.add(cnt);
+ }
+ } else {
+ // NOTE(review): cnt and prevName are not reset here after the pending
+ // run has been recorded - confirm this is intended.
+ if (cnt > 0) {
+ elementNo.add(cnt);
+ }
+ elementNo.add(1);
+ loopCnt++;
+ }
+ }
+ }
+
+ /**
+ * Creates a pattern rule like the six-argument constructor and additionally
+ * records whether the rule is a member of a disjunctive set.
+ *
+ * @param isMember
+ * value for the disjunctive-set membership flag
+ */
+ public PatternRule(final String id, final Language language,
+ final List<Element> elements, final String description,
+ final String message, final String shortMessage, final boolean isMember) {
+ this(id, language, elements, description, message, shortMessage);
+ this.isMemberOfDisjunctiveSet = isMember;
+ }
+
+ /**
+ * @return the sub-id of this rule within its rule group; null until
+ * setSubId(String) has been called.
+ */
+ public final String getSubId() {
+ return subId;
+ }
+
+ /**
+ * @param subId
+ * the sub-id of this rule within its rule group
+ */
+ public final void setSubId(final String subId) {
+ this.subId = subId;
+ }
+
+ /**
+ * @return the unformatted message of this rule, as passed to the constructor
+ * or to setMessage(String).
+ */
+ public final String getMessage() {
+ return message;
+ }
+
+ /**
+ * Used for testing rules: only one of the set can match.
+ *
+ * @return the disjunctive-set membership flag, i.e. whether the rule may
+ * legitimately not match (as a member of a disjunctive set of rules
+ * generated by phraseref in includephrases element).
+ */
+ public final boolean isWithComplexPhrase() {
+ return isMemberOfDisjunctiveSet;
+ }
+
+ /**
+ * Resets the complex status by clearing the disjunctive-set membership
+ * flag - used for testing.
+ **/
+ public final void notComplexPhrase() {
+ isMemberOfDisjunctiveSet = false;
+ }
+
+ /**
+  * Returns the pattern as a string: the string form of every pattern element,
+  * joined with ", ".
+  *
+  * @since 0.9.2
+  */
+ public final String toPatternString() {
+   final int count = patternElements.size();
+   final List<String> parts = new ArrayList<String>(count);
+   for (int idx = 0; idx < count; idx++) {
+     parts.add(patternElements.get(idx).toString());
+   }
+   return StringTools.listToString(parts, ", ");
+ }
+
+ /**
+  * Returns the pattern as an XML string, consisting of the rule header, the
+  * pattern with its tokens, the message and the incorrect/correct examples.
+  * All text and attribute values taken from the rule, including POS tags, are
+  * XML-escaped.
+  * FIXME: this is not complete, information might be lost!
+  *
+  * @return the XML representation of this rule
+  * @since 0.9.3
+  */
+ public final String toXML() {
+   final StringBuilder sb = new StringBuilder();
+   sb.append("<rule id=\"");
+   sb.append(StringTools.escapeXML(getId()));
+   sb.append("\" name=\"");
+   sb.append(StringTools.escapeXML(getDescription()));
+   sb.append("\">\n");
+   sb.append("<pattern mark_from=\"");
+   sb.append(startPositionCorrection);
+   sb.append("\" mark_to=\"");
+   sb.append(endPositionCorrection);
+   sb.append('"');
+   // for now, case sensitivity is per pattern, not per element,
+   // so just use the setting of the first element:
+   if (!patternElements.isEmpty() && patternElements.get(0).getCaseSensitive()) {
+     sb.append(" case_sensitive=\"yes\"");
+   }
+   sb.append(">\n");
+   for (Element patternElement : patternElements) {
+     sb.append("<token");
+     if (patternElement.getNegation()) {
+       sb.append(" negate=\"yes\"");
+     }
+     if (patternElement.isRegularExpression()) {
+       sb.append(" regexp=\"yes\"");
+     }
+     if (patternElement.getPOStag() != null) {
+       sb.append(" postag=\"");
+       // POS tags may be regular expressions containing XML-special characters,
+       // so they must be escaped like every other attribute value
+       sb.append(StringTools.escapeXML(patternElement.getPOStag()));
+       sb.append('"');
+     }
+     if (patternElement.getPOSNegation()) {
+       sb.append(" negate_pos=\"yes\"");
+     }
+     if (patternElement.isInflected()) {
+       sb.append(" inflected=\"yes\"");
+     }
+     sb.append('>');
+     if (patternElement.getString() != null) {
+       sb.append(StringTools.escapeXML(patternElement.getString()));
+     } else {
+       // TODO
+     }
+     sb.append("</token>\n");
+   }
+   sb.append("</pattern>\n");
+   sb.append("<message>");
+   sb.append(StringTools.escapeXML(message));
+   sb.append("</message>\n");
+   if (getIncorrectExamples() != null) {
+     for (IncorrectExample example : getIncorrectExamples()) {
+       sb.append("<example type=\"incorrect\">");
+       sb.append(StringTools.escapeXML(example.getExample()));
+       sb.append("</example>\n");
+     }
+   }
+   if (getCorrectExamples() != null) {
+     for (String example : getCorrectExamples()) {
+       sb.append("<example type=\"correct\">");
+       sb.append(StringTools.escapeXML(example));
+       sb.append("</example>\n");
+     }
+   }
+   sb.append("</rule>");
+   return sb.toString();
+ }
+
+ /**
+ * @param message
+ * the (unformatted) message to be displayed to the user
+ */
+ public final void setMessage(final String message) {
+ this.message = message;
+ }
+
+ /**
+ * Matches the pattern against the sentence, trying every token position as
+ * a possible start of the pattern.
+ *
+ * @param text the analyzed sentence to check
+ * @return the rule matches found, possibly an empty array
+ * @throws IOException if formatting a match message fails
+ */
+ @Override
+ public final RuleMatch[] match(final AnalyzedSentence text)
+ throws IOException {
+ final List<RuleMatch> ruleMatches = new ArrayList<RuleMatch>();
+ final AnalyzedTokenReadings[] tokens = text.getTokensWithoutWhitespace();
+ // tokenPositions[n]: distance (in tokens) of the n-th matched element from
+ // the position where it was expected, plus one.
+ final int[] tokenPositions = new int[tokens.length + 1];
+ final int patternSize = patternElements.size();
+ // last start position at which the whole pattern still fits
+ final int limit = Math.max(0, tokens.length - patternSize + 1);
+ Element elem = null;
+ int i = 0;
+ // when sentStart is set, only start position 0 is tried
+ while (i < limit && !(sentStart && i > 0)) {
+ boolean allElementsMatch = false;
+ int firstMatchToken = -1;
+ int lastMatchToken = -1;
+ int matchingTokens = 0;
+ int prevSkipNext = 0;
+ // this variable keeps the total number
+ // of tokens skipped
+ int skipShiftTotal = 0;
+ if (testUnification) {
+ unifier.reset();
+ }
+ for (int k = 0; k < patternSize; k++) {
+ final Element prevElement = elem;
+ elem = patternElements.get(k);
+ setupRef(firstMatchToken, elem, tokens);
+ final int nextPos = i + k + skipShiftTotal;
+ prevMatched = false;
+ // a negative skip or one reaching past the sentence is clamped to the
+ // number of remaining tokens
+ if (prevSkipNext + nextPos >= tokens.length || prevSkipNext < 0) { // SENT_END?
+ prevSkipNext = tokens.length - (nextPos + 1);
+ }
+ // leave enough tokens for the remaining pattern elements
+ final int maxTok = Math.min(nextPos + prevSkipNext, tokens.length - (patternSize - k));
+ for (int m = nextPos; m <= maxTok; m++) {
+ allElementsMatch = testAllReadings(tokens, elem, prevElement, m,
+ firstMatchToken, prevSkipNext);
+ if (allElementsMatch) {
+ lastMatchToken = m;
+ final int skipShift = lastMatchToken - nextPos;
+ tokenPositions[matchingTokens] = skipShift + 1;
+ prevSkipNext = translateElementNo(elem.getSkipNext());
+ matchingTokens++;
+ skipShiftTotal += skipShift;
+ if (firstMatchToken == -1) {
+ firstMatchToken = lastMatchToken;
+ }
+ break;
+ }
+ }
+ // the current element matched nowhere in its window: give up on this
+ // start position
+ if (!allElementsMatch) {
+ break;
+ }
+ }
+
+ if (allElementsMatch && matchingTokens == patternSize) {
+ final RuleMatch rM = createRuleMatch(tokenPositions, tokens,
+ firstMatchToken, lastMatchToken, matchingTokens);
+ if (rM != null) {
+ ruleMatches.add(rM);
+ }
+ }
+ i++;
+ }
+ return ruleMatches.toArray(new RuleMatch[ruleMatches.size()]);
+ }
+
+ /**
+ * Creates the RuleMatch for a successfully matched pattern: formats the
+ * message and computes the character range to be marked, taking the
+ * start/end position corrections into account.
+ *
+ * @param tokenPositions relative positions of the matched tokens
+ * @param tokens tokens of the sentence (without whitespace tokens)
+ * @param firstMatchToken index of the first matched token
+ * @param lastMatchToken index of the last matched token
+ * @param matchingTokens number of matched pattern elements
+ * @return the rule match, or null if the computed range is empty
+ * @throws IOException if formatting the message fails
+ */
+ private RuleMatch createRuleMatch(final int[] tokenPositions,
+ final AnalyzedTokenReadings[] tokens, final int firstMatchToken,
+ final int lastMatchToken, final int matchingTokens) throws IOException {
+ final String errMessage = formatMatches(tokens, tokenPositions,
+ firstMatchToken, message);
+ // translate the start correction (counted in pattern elements) into a
+ // token offset by summing the relative positions
+ int correctedStPos = 0;
+ if (startPositionCorrection > 0) {
+ for (int l = 0; l <= startPositionCorrection; l++) {
+ correctedStPos += tokenPositions[l];
+ }
+ correctedStPos--;
+ }
+ // same for the (negative) end correction, counted from the last element
+ int correctedEndPos = 0;
+ if (endPositionCorrection < 0) {
+ int l = 0;
+ while (l > endPositionCorrection) {
+ correctedEndPos -= tokenPositions[matchingTokens + l - 1];
+ l--;
+ }
+ }
+ AnalyzedTokenReadings firstMatchTokenObj = tokens[firstMatchToken
+ + correctedStPos];
+ boolean startsWithUppercase = StringTools
+ .startsWithUppercase(firstMatchTokenObj.getToken())
+ && !matchConvertsCase();
+
+ if (firstMatchTokenObj.isSentStart()
+ && tokens.length > firstMatchToken + correctedStPos + 1) {
+ // make uppercasing work also at sentence start:
+ firstMatchTokenObj = tokens[firstMatchToken + correctedStPos + 1];
+ startsWithUppercase = StringTools.startsWithUppercase(firstMatchTokenObj
+ .getToken());
+ }
+ int fromPos = tokens[firstMatchToken + correctedStPos].getStartPos();
+ // FIXME: this is fishy, assumes that comma should always come before
+ // whitespace
+ // if a suggestion starts with a comma, start marking right after the
+ // previous token
+ if (errMessage.contains(SUGG_TAG + ",")
+ && firstMatchToken + correctedStPos >= 1) {
+ fromPos = tokens[firstMatchToken + correctedStPos - 1].getStartPos()
+ + tokens[firstMatchToken + correctedStPos - 1].getToken().length();
+ }
+
+ final int toPos = tokens[lastMatchToken + correctedEndPos].getStartPos()
+ + tokens[lastMatchToken + correctedEndPos].getToken().length();
+ if (fromPos < toPos) { // this can happen with some skip="-1" when the last
+ // token is not matched
+ return new RuleMatch(this, fromPos, toPos,
+ errMessage, shortMessage, startsWithUppercase);
+ } // failed to create any rule match...
+ return null;
+ }
+
+ /**
+  * Checks if the suggestion starts with a match that is supposed to convert
+  * case. If it does, stop the default conversion to uppercase.
+  * <p>
+  * The first suggestion of the message must start with a back reference
+  * (a backslash directly after the opening suggestion tag). A message without
+  * a suggestion tag, or with the tag at its very end, never converts case.
+  *
+  * @return true, if the match converts the case of the token.
+  */
+ private boolean matchConvertsCase() {
+   if (suggestionMatches == null || suggestionMatches.isEmpty()) {
+     return false;
+   }
+   final int tagPos = message.indexOf(SUGG_TAG);
+   if (tagPos < 0) {
+     // no suggestion at all: without this check the computed offset would point
+     // at an unrelated character or beyond the end of the message
+     return false;
+   }
+   final int sugStart = tagPos + SUGG_TAG.length();
+   if (sugStart >= message.length() || message.charAt(sugStart) != '\\') {
+     return false;
+   }
+   for (Match sMatch : suggestionMatches) {
+     if (sMatch != null && !sMatch.isInMessageOnly() && sMatch.convertsCase()) {
+       return true;
+     }
+   }
+   return false;
+ }
+
+ /**
+ * Appends a match element to the list of suggestion matches, creating the
+ * list on first use.
+ *
+ * @param m the match element to add
+ */
+ public final void addSuggestionMatch(final Match m) {
+ if (suggestionMatches == null) {
+ suggestionMatches = new ArrayList<Match>();
+ }
+ suggestionMatches.add(m);
+ }
+
+ /**
+  * Translates an element index into a token count by summing the lengths
+  * recorded for the first {@code i} XML elements, so that phrases spanning
+  * several tokens are accounted for. Negative values, and any value when the
+  * rule contains no phrases, are returned unchanged.
+  *
+  * @param i
+  *          Current element index.
+  * @return int Index translated into XML element no.
+  */
+ private int translateElementNo(final int i) {
+   if (i < 0 || !useList) {
+     return i;
+   }
+   int total = 0;
+   for (final Integer width : elementNo.subList(0, i)) {
+     total += width;
+   }
+   return total;
+ }
+
+ /**
+ * Returns the number of tokens covered by the rule element under the given
+ * index: the length recorded in elementNo, or 1 when the rule contains no
+ * phrases or the index lies beyond the recorded elements.
+ *
+ * @param i
+ * The index of the token.
+ * @return the length of the phrase under the index, 1 for a plain token.
+ **/
+ private int phraseLen(final int i) {
+ if (!useList || i > (elementNo.size() - 1)) {
+ return 1;
+ }
+ return elementNo.get(i);
+ }
+
+ /**
+  * Builds the Cartesian product of the string arrays in {@code input}: every
+  * result takes one string from each array, in order, and the chosen strings
+  * are joined using the language-dependent spacing from
+  * {@code StringTools.addSpace}.
+  *
+  * @param input
+  *          Array of string arrays to combine.
+  * @param output
+  *          Work array of strings; holds the current selection.
+  * @param r
+  *          Starting parameter (use 0 to get all combinations).
+  * @param lang
+  *          Text language for adding spaces in some languages.
+  * @return Combined array of @String.
+  */
+ private static String[] combineLists(final String[][] input,
+     final String[] output, final int r, final Language lang) {
+   if (r == input.length) {
+     // every position has been chosen: join the current selection
+     final StringBuilder joined = new StringBuilder();
+     final int last = output.length - 1;
+     for (int k = 0; k <= last; k++) {
+       joined.append(output[k]);
+       if (k < last) {
+         joined.append(StringTools.addSpace(output[k + 1], lang));
+       }
+     }
+     return new String[] {joined.toString()};
+   }
+   // try every candidate for position r and recurse for the remaining ones
+   final List<String> combinations = new ArrayList<String>();
+   for (final String candidate : input[r]) {
+     output[r] = candidate;
+     combinations.addAll(Arrays.asList(combineLists(input, output, r + 1, lang)));
+   }
+   return combinations.toArray(new String[combinations.size()]);
+ }
+
+ /**
+  * Formats the suggestion match with the given number against the matched
+  * tokens. A plain token yields the forms produced by the match element; a
+  * phrase is formatted token by token and the per-token forms are combined
+  * into all possible concatenations (including inflection using synthesis).
+  *
+  * @param start
+  *          Position of the element as referenced by match element in the
+  *          rule.
+  * @param index
+  *          The index of the element found in the matching sentence.
+  * @param tokenIndex
+  *          The position of the token in the AnalyzedTokenReadings array.
+  * @param tokens
+  *          Array of @AnalyzedTokenReadings
+  * @param nextTokenPos
+  *          Position of the next matched token; the difference to the current
+  *          token position is passed to the match as the skipped range.
+  * @return @String[] Array of concatenated strings, or null if there is no
+  *         match element under {@code start}
+  * @throws IOException
+  *           in case disk operations (used in synthesizer) go wrong.
+  */
+ private String[] concatMatches(final int start, final int index,
+     final int tokenIndex, final AnalyzedTokenReadings[] tokens,
+     final int nextTokenPos)
+     throws IOException {
+   final Match match = suggestionMatches.get(start);
+   if (match == null) {
+     return null;
+   }
+   final int len = phraseLen(index);
+   if (len == 1) {
+     match.setToken(tokens, tokenIndex - 1, nextTokenPos - tokenIndex);
+     match.setSynthesizer(language.getSynthesizer());
+     return match.toFinalString();
+   }
+   final List<String[]> formsPerToken = new ArrayList<String[]>();
+   for (int offset = 0; offset < len; offset++) {
+     match.setToken(tokens, tokenIndex - 1 + offset,
+         nextTokenPos - (tokenIndex + offset));
+     match.setSynthesizer(language.getSynthesizer());
+     formsPerToken.add(match.toFinalString());
+   }
+   final String[][] input = formsPerToken.toArray(new String[formsPerToken.size()][]);
+   return combineLists(input, new String[formsPerToken.size()], 0, language);
+ }
+
+ /**
+  * Replace back references generated with &lt;match&gt; and \\1 in message
+  * using Match class, and take care of skipping.
+  * <p>
+  * The message is processed from left to right: as long as its first
+  * backslash is directly followed by a positive number, that reference is
+  * replaced either by the output of the corresponding match element or, if no
+  * match elements are in use, by the text of the referenced token. A reference
+  * at the very beginning of the message is handled like any other.
+  *
+  * @param tokenReadings
+  *          Array of AnalyzedTokenReadings that were matched against the
+  *          pattern
+  * @param positions
+  *          Array of relative positions of matched tokens
+  * @param firstMatchTok
+  *          Position of the first matched token
+  * @param errorMsg
+  *          String containing suggestion markup
+  * @return String Formatted message.
+  * @throws IOException
+  *           in case formatting a match element (synthesis) fails.
+  **/
+ private String formatMatches(final AnalyzedTokenReadings[] tokenReadings,
+     final int[] positions, final int firstMatchTok, final String errorMsg)
+     throws IOException {
+   String errorMessage = errorMsg;
+   int matchCounter = 0;
+   final int[] numbersToMatches = new int[errorMsg.length()];
+   boolean newWay = false;
+   int errLen = errorMessage.length();
+   int errMarker = errorMessage.indexOf('\\');
+   boolean numberFollows = false;
+   // indexOf() returns -1 when nothing is found, so index 0 is a valid hit
+   if (errMarker >= 0 && errMarker < errLen - 1) {
+     numberFollows = StringTools.isPositiveNumber(errorMessage
+         .charAt(errMarker + 1));
+   }
+   while (errMarker >= 0 && numberFollows) {
+     // the loop condition guarantees a backslash followed by a number here
+     final int ind = errMarker;
+     int numLen = 1;
+     while (ind + numLen < errorMessage.length()
+         && StringTools.isPositiveNumber(errorMessage.charAt(ind + numLen))) {
+       numLen++;
+     }
+     // zero-based number of the referenced pattern element
+     final int j = Integer.parseInt(errorMessage.substring(ind + 1, ind
+         + numLen)) - 1;
+     int repTokenPos = 0;
+     int nextTokenPos = 0;
+     for (int l = 0; l <= j; l++) {
+       repTokenPos += positions[l];
+     }
+     // only look at the following position if it exists
+     if (j + 1 < positions.length) {
+       nextTokenPos = firstMatchTok + repTokenPos + positions[j + 1];
+     }
+     if (suggestionMatches != null) {
+       if (matchCounter < suggestionMatches.size()) {
+         numbersToMatches[j] = matchCounter;
+         if (suggestionMatches.get(matchCounter) != null) {
+           final String[] matches = concatMatches(matchCounter, j,
+               firstMatchTok + repTokenPos, tokenReadings, nextTokenPos);
+           final String leftSide = errorMessage.substring(0, ind);
+           final String rightSide = errorMessage.substring(ind + numLen);
+           if (matches.length == 1) {
+             errorMessage = leftSide + matches[0] + rightSide;
+           } else {
+             errorMessage = formatMultipleSynthesis(matches, leftSide,
+                 rightSide);
+           }
+           matchCounter++;
+           newWay = true;
+         }
+       } else {
+         // FIXME: is this correct? this is how we deal with multiple matches
+         suggestionMatches.add(suggestionMatches.get(numbersToMatches[j]));
+       }
+     }
+
+     if (!newWay) {
+       // in case <match> elements weren't used (yet)
+       errorMessage = errorMessage.replace("\\" + (j + 1),
+           tokenReadings[firstMatchTok + repTokenPos - 1].getToken());
+     }
+     // look for the next back reference in the updated message
+     errMarker = errorMessage.indexOf('\\');
+     numberFollows = false;
+     errLen = errorMessage.length();
+     if (errMarker >= 0 && errMarker < errLen - 1) {
+       numberFollows = StringTools.isPositiveNumber(errorMessage
+           .charAt(errMarker + 1));
+     }
+   }
+   return errorMessage;
+ }
+
+ /**
+  * Inserts several alternative forms for one back reference into a message.
+  * <p>
+  * If the reference stands inside a suggestion (the last opening suggestion
+  * tag on the left side is not yet closed), every form becomes a suggestion
+  * of its own: the text between the opening tag and the reference, and the
+  * text between the reference and the closing tag, are repeated for each form
+  * and the suggestions are separated by ", ".
+  *
+  * @param matches
+  *          the alternative forms to insert
+  * @param leftSide
+  *          message text before the back reference
+  * @param rightSide
+  *          message text after the back reference
+  * @return the message with all forms inserted
+  */
+ private static String formatMultipleSynthesis(final String[] matches,
+     final String leftSide, final String rightSide) {
+   String errorMessage = "";
+   String suggestionLeft = "";
+   String suggestionRight = "";
+   String rightSideNew = rightSide;
+   final int sPos = leftSide.lastIndexOf(SUGG_TAG);
+   // lastIndexOf() returns -1 when nothing is found, so a tag at index 0 counts
+   if (sPos >= 0) {
+     suggestionLeft = leftSide.substring(sPos + SUGG_TAG.length());
+   }
+   if (StringTools.isEmpty(suggestionLeft)) {
+     errorMessage = leftSide;
+   } else {
+     errorMessage = leftSide.substring(0, sPos) + SUGG_TAG;
+   }
+   final int rPos = rightSide.indexOf(END_SUGG_TAG);
+   if (rPos > 0) {
+     suggestionRight = rightSide.substring(0, rPos);
+   }
+   if (!StringTools.isEmpty(suggestionRight)) {
+     rightSideNew = rightSide.substring(rPos);
+   }
+   // the reference is inside a suggestion only if the LAST closing tag on the
+   // left comes before the last opening tag (or there is no closing tag)
+   final int lastLeftSugEnd = leftSide.lastIndexOf(END_SUGG_TAG);
+   final int lastLeftSugStart = sPos;
+   final StringBuilder sb = new StringBuilder();
+   sb.append(errorMessage);
+   for (int z = 0; z < matches.length; z++) {
+     sb.append(suggestionLeft);
+     sb.append(matches[z]);
+     sb.append(suggestionRight);
+     if ((z < matches.length - 1) && lastLeftSugEnd < lastLeftSugStart) {
+       sb.append(END_SUGG_TAG);
+       sb.append(", ");
+       sb.append(SUGG_TAG);
+     }
+   }
+   sb.append(rightSideNew);
+   return sb.toString();
+ }
+
+ /**
+ * For testing only.
+ *
+ * @return the pattern element list of this rule (the list itself, not a
+ * copy).
+ */
+ public final List<Element> getElements() {
+ return patternElements;
+ }
+
+}
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/PatternRuleLoader.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/PatternRuleLoader.java
new file mode 100644
index 0000000..8156a6e
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/PatternRuleLoader.java
@@ -0,0 +1,369 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+package de.danielnaber.languagetool.rules.patterns;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+
+import javax.xml.parsers.SAXParser;
+import javax.xml.parsers.SAXParserFactory;
+
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+import de.danielnaber.languagetool.JLanguageTool;
+import de.danielnaber.languagetool.Language;
+import de.danielnaber.languagetool.rules.Category;
+import de.danielnaber.languagetool.rules.IncorrectExample;
+
+/**
+ * Loads {@link PatternRule}s from an XML file.
+ *
+ * @author Daniel Naber
+ */
+public class PatternRuleLoader extends DefaultHandler {
+
+  /**
+   * Parses the given XML stream with a {@link PatternRuleHandler} and returns
+   * the rules it collected. Loading of external DTDs is switched off on the
+   * underlying XML reader before parsing.
+   *
+   * @param is stream to read the rule XML from
+   * @param filename name used only in the error message
+   * @return the rules collected by the handler
+   * @throws IOException if anything goes wrong while setting up the parser or
+   *         parsing; the original exception is attached as the cause
+   */
+  public final List<PatternRule> getRules(final InputStream is,
+      final String filename) throws IOException {
+    try {
+      final PatternRuleHandler handler = new PatternRuleHandler();
+      final SAXParserFactory factory = SAXParserFactory.newInstance();
+      final SAXParser saxParser = factory.newSAXParser();
+      saxParser.getXMLReader().setFeature(
+          "http://apache.org/xml/features/nonvalidating/load-external-dtd",
+          false);
+      saxParser.parse(is, handler);
+      return handler.getRules();
+    } catch (final Exception e) {
+      final IOException ioe = new IOException("Cannot load or parse '"
+          + filename + "'");
+      ioe.initCause(e);
+      throw ioe;
+    }
+  }
+
+  /**
+   * Testing only: loads {@code /de/grammar.xml} from the rules directory and
+   * prints the resulting rule list.
+   * <p>
+   * Declared {@code static} so that it is a real entry point; as an instance
+   * method it could not be started by the Java launcher.
+   *
+   * @param args ignored
+   * @throws IOException if the rule file cannot be loaded or parsed
+   */
+  public static void main(final String[] args) throws IOException {
+    final PatternRuleLoader prg = new PatternRuleLoader();
+    final String name = "/de/grammar.xml";
+    final List<PatternRule> l = prg.getRules(JLanguageTool.getDataBroker().getFromRulesDirAsStream(name), name);
+    System.out.println(l);
+  }
+
+}
+
+class PatternRuleHandler extends XMLRuleHandler {
+
+ private int subId;
+
+ private boolean defaultOff;
+ private boolean defaultOn;
+
+ private Category category;
+ private String description;
+ private String ruleGroupDescription;
+
+ // ===========================================================
+ // SAX DocumentHandler methods
+ // ===========================================================
+
+ /**
+ * Handles the start of an XML element by dispatching on {@code qName} and
+ * updating the parser state: creates the current {@link Category}, resolves
+ * the rule language, starts rule and rule-group bookkeeping, opens the text
+ * buffers for messages and examples, and delegates pattern, token, exception,
+ * match and phrase handling to inherited helpers.
+ *
+ * @param namespaceURI not used by this method
+ * @param lName not used by this method
+ * @param qName qualified element name that selects the branch to run
+ * @param attrs attributes of the element
+ * @throws SAXException if the {@code lang} attribute of {@code rules} names an
+ * unknown language, or if a delegated helper throws it
+ */
+ @Override
+ public void startElement(final String namespaceURI, final String lName,
+ final String qName, final Attributes attrs) throws SAXException {
+ if ("category".equals(qName)) {
+ final String catName = attrs.getValue("name");
+ final String priorityStr = attrs.getValue("priority");
+ // int prio = 0;
+ if (priorityStr == null) {
+ category = new Category(catName);
+ } else {
+ category = new Category(catName, Integer.parseInt(priorityStr));
+ }
+
+ if ("off".equals(attrs.getValue(DEFAULT))) {
+ category.setDefaultOff();
+ }
+
+ } else if ("rules".equals(qName)) {
+ final String languageStr = attrs.getValue("lang");
+ language = Language.getLanguageForShortName(languageStr);
+ if (language == null) {
+ throw new SAXException("Unknown language '" + languageStr + "'");
+ }
+ } else if ("rule".equals(qName)) {
+ id = attrs.getValue("id");
+ if (inRuleGroup) {
+ subId++;
+ }
+ // A default="off"/"on" already set while inside a rule group is kept;
+ // otherwise the flag is taken from the rule's own attribute.
+ if (!(inRuleGroup && defaultOff)) {
+ defaultOff = "off".equals(attrs.getValue(DEFAULT));
+ }
+
+ if (!(inRuleGroup && defaultOn)) {
+ defaultOn = "on".equals(attrs.getValue(DEFAULT));
+ }
+ // Rules inside a group fall back to the group's id and name.
+ if (inRuleGroup && id == null) {
+ id = ruleGroupId;
+ }
+ description = attrs.getValue("name");
+ if (inRuleGroup && description == null) {
+ description = ruleGroupDescription;
+ }
+ correctExamples = new ArrayList<String>();
+ incorrectExamples = new ArrayList<IncorrectExample>();
+ if (suggestionMatches != null) {
+ suggestionMatches.clear();
+ }
+ } else if (PATTERN.equals(qName)) {
+ startPattern(attrs);
+ } else if (AND.equals(qName)) {
+ inAndGroup = true;
+ } else if ("unify".equals(qName)) {
+ inUnification = true;
+ uniNegation = YES.equals(attrs.getValue(NEGATE));
+ } else if ("feature".equals(qName)) {
+ uFeature = attrs.getValue("id");
+ } else if (qName.equals(TYPE)) {
+ uType = attrs.getValue("id");
+ uTypeList.add(uType);
+ } else if (qName.equals(TOKEN)) {
+ setToken(attrs);
+ } else if (EXCEPTION.equals(qName)) {
+ setExceptions(attrs);
+ // NOTE(review): attrs.getValue(TYPE) is dereferenced without a null check
+ // in the two example branches; presumably the type attribute is always
+ // present on example elements - confirm against the rule schema.
+ } else if (qName.equals(EXAMPLE)
+ && attrs.getValue(TYPE).equals("correct")) {
+ inCorrectExample = true;
+ correctExample = new StringBuilder();
+ } else if (qName.equals(EXAMPLE)
+ && attrs.getValue(TYPE).equals("incorrect")) {
+ inIncorrectExample = true;
+ incorrectExample = new StringBuilder();
+ exampleCorrection = new StringBuilder();
+ if (attrs.getValue("correction") != null) {
+ exampleCorrection.append(attrs.getValue("correction"));
+ }
+ } else if ("message".equals(qName)) {
+ inMessage = true;
+ inSuggestion = false;
+ message = new StringBuilder();
+ } else if ("short".equals(qName)) {
+ inShortMessage = true;
+ shortMessage = new StringBuilder();
+ } else if ("rulegroup".equals(qName)) {
+ ruleGroupId = attrs.getValue("id");
+ ruleGroupDescription = attrs.getValue("name");
+ defaultOff = "off".equals(attrs.getValue(DEFAULT));
+ defaultOn = "on".equals(attrs.getValue(DEFAULT));
+ inRuleGroup = true;
+ subId = 0;
+ } else if ("suggestion".equals(qName) && inMessage) {
+ // The tag is copied into the message text as literal markup.
+ message.append("<suggestion>");
+ inSuggestion = true;
+ } else if ("match".equals(qName)) {
+ setMatchElement(attrs);
+ } else if (qName.equals(MARKER) && inCorrectExample) {
+ correctExample.append("<marker>");
+ } else if (qName.equals(MARKER) && inIncorrectExample) {
+ incorrectExample.append("<marker>");
+ } else if (UNIFICATION.equals(qName)) {
+ uFeature = attrs.getValue("feature");
+ inUnificationDef = true;
+ } else if ("equivalence".equals(qName)) {
+ uType = attrs.getValue(TYPE);
+ } else if (PHRASES.equals(qName)) {
+ inPhrases = true;
+ } else if ("includephrases".equals(qName)) {
+ phraseElementInit();
+ } else if ("phrase".equals(qName) && inPhrases) {
+ phraseId = attrs.getValue("id");
+ } else if ("phraseref".equals(qName) && (attrs.getValue("idref") != null)) {
+ preparePhrase(attrs);
+ }
+ }
+
+ /**
+ * Handles the end of an XML element by dispatching on {@code qName}. Closing
+ * a {@code rule} creates the {@link PatternRule} object(s) and adds them to
+ * {@code rules}; the other branches finish tokens, exceptions, patterns,
+ * examples, messages, matches, phrases and unification state.
+ *
+ * @param namespaceURI not used by this method
+ * @param sName not used by this method
+ * @param qName qualified element name that selects the branch to run
+ * @throws SAXException declared by the overridden method; may also come from
+ * delegated helpers
+ */
+ @Override
+ public void endElement(final String namespaceURI, final String sName,
+ final String qName) throws SAXException {
+ if ("rule".equals(qName)) {
+ phraseElementInit();
+ if (phraseElementList.isEmpty()) {
+ // No phrase expansions: one rule built from the plain element list.
+ final PatternRule rule = new PatternRule(id, language, elementList,
+ description, message.toString(), shortMessage.toString());
+ prepareRule(rule);
+ rules.add(rule);
+ } else {
+ // Append the remaining elements of the rule to every phrase variant.
+ if (!elementList.isEmpty()) {
+ for (final ArrayList<Element> ph : phraseElementList) {
+ ph.addAll(new ArrayList<Element>(elementList));
+ }
+ }
+
+ // One rule per phrase variant; the last constructor argument is true
+ // when there is more than one variant.
+ for (final ArrayList<Element> phraseElement : phraseElementList) {
+ processElement(phraseElement);
+ final PatternRule rule = new PatternRule(id, language, phraseElement,
+ description, message.toString(), shortMessage.toString(),
+ phraseElementList.size() > 1);
+ prepareRule(rule);
+ rules.add(rule);
+ }
+ }
+ elementList.clear();
+ if (phraseElementList != null) {
+ phraseElementList.clear();
+ }
+
+ } else if (qName.equals(EXCEPTION)) {
+ finalizeExceptions();
+ } else if (qName.equals(AND)) {
+ inAndGroup = false;
+ andGroupCounter = 0;
+ tokenCounter++;
+ } else if (qName.equals(TOKEN)) {
+ finalizeTokens();
+ } else if (qName.equals(PATTERN)) {
+ checkMarkPositions();
+ inPattern = false;
+ if (lastPhrase) {
+ elementList.clear();
+ }
+ if (phraseElementList == null || phraseElementList.isEmpty()) {
+ checkPositions(0);
+ } else {
+ for (List<Element> elements : phraseElementList) {
+ checkPositions(elements.size());
+ }
+ }
+ tokenCounter = 0;
+ } else if (qName.equals(EXAMPLE)) {
+ if (inCorrectExample) {
+ correctExamples.add(correctExample.toString());
+ } else if (inIncorrectExample) {
+ IncorrectExample example = null;
+ // The correction attribute may hold several alternatives separated by '|'.
+ final String[] corrections = exampleCorrection.toString().split("\\|");
+ if (corrections.length > 0 && corrections[0].length() > 0) {
+ example = new IncorrectExample(incorrectExample.toString(),
+ corrections);
+ } else {
+ example = new IncorrectExample(incorrectExample.toString());
+ }
+ incorrectExamples.add(example);
+ }
+ inCorrectExample = false;
+ inIncorrectExample = false;
+ correctExample = new StringBuilder();
+ incorrectExample = new StringBuilder();
+ exampleCorrection = new StringBuilder();
+ } else if ("message".equals(qName)) {
+ suggestionMatches = addLegacyMatches();
+ inMessage = false;
+ } else if ("short".equals(qName)) {
+ inShortMessage = false;
+ } else if ("match".equals(qName)) {
+ // Text collected inside the match element is stored as its lemma string,
+ // on the latest suggestion match or on the token reference.
+ if (inMessage) {
+ suggestionMatches.get(suggestionMatches.size() - 1).setLemmaString(
+ match.toString());
+ } else if (inToken) {
+ tokenReference.setLemmaString(match.toString());
+ }
+ inMatch = false;
+ } else if ("rulegroup".equals(qName)) {
+ inRuleGroup = false;
+ } else if ("suggestion".equals(qName) && inMessage) {
+ message.append("</suggestion>");
+ inSuggestion = false;
+ } else if (qName.equals(MARKER) && inCorrectExample) {
+ correctExample.append("</marker>");
+ } else if (qName.equals(MARKER) && inIncorrectExample) {
+ incorrectExample.append("</marker>");
+ } else if ("phrase".equals(qName) && inPhrases) {
+ finalizePhrase();
+ } else if ("includephrases".equals(qName)) {
+ elementList.clear();
+ } else if (PHRASES.equals(qName) && inPhrases) {
+ inPhrases = false;
+ } else if (UNIFICATION.equals(qName)) {
+ inUnificationDef = false;
+ } else if ("feature".equals(qName)) {
+ // Store the types collected for this feature and start a fresh list.
+ equivalenceFeatures.put(uFeature, uTypeList);
+ uTypeList = new ArrayList<String>();
+ } else if ("unify".equals(qName)) {
+ inUnification = false;
+ //clear the features...
+ equivalenceFeatures = new HashMap<String, List<String>>();
+ }
+ }
+
+ /**
+  * Copies the state collected while parsing a rule element onto the freshly
+  * created rule and resets the per-rule position corrections and the
+  * case-sensitivity flag.
+  *
+  * @param rule the rule to configure
+  */
+ private void prepareRule(final PatternRule rule) {
+   rule.setStartPositionCorrection(startPositionCorrection);
+   rule.setEndPositionCorrection(endPositionCorrection);
+   startPositionCorrection = 0;
+   endPositionCorrection = 0;
+
+   rule.setCorrectExamples(correctExamples);
+   rule.setIncorrectExamples(incorrectExamples);
+   rule.setCategory(category);
+   // Rules outside a rule group always get the sub id "1".
+   rule.setSubId(inRuleGroup ? Integer.toString(subId) : "1");
+   caseSensitive = false;
+
+   if (suggestionMatches != null) {
+     for (final Match suggestion : suggestionMatches) {
+       rule.addSuggestionMatch(suggestion);
+     }
+     // With several phrase variants the matches are kept so that the next
+     // variant's rule receives them as well.
+     if (phraseElementList.size() <= 1) {
+       suggestionMatches.clear();
+     }
+   }
+
+   if (defaultOff) {
+     rule.setDefaultOff();
+   }
+   // A category that is off by default switches the rule off unless the rule
+   // (or its group) was explicitly marked default="on".
+   final boolean offByCategory = category.isDefaultOff() && !defaultOn;
+   if (offByCategory) {
+     rule.setDefaultOff();
+   }
+ }
+
+ /**
+  * Appends character data to the buffer that belongs to the innermost
+  * element currently being read. The state flags are checked in a fixed
+  * priority order; text outside all of these states is dropped.
+  *
+  * @param buf character array supplied by the parser
+  * @param offset start of the text in {@code buf}
+  * @param len number of characters to use
+  */
+ @Override
+ public void characters(final char[] buf, final int offset, final int len) {
+   final String text = new String(buf, offset, len);
+   final StringBuilder sink;
+   if (inException) {
+     sink = exceptions;
+   } else if (inToken) {
+     sink = elements;
+   } else if (inCorrectExample) {
+     sink = correctExample;
+   } else if (inIncorrectExample) {
+     sink = incorrectExample;
+   } else if (inMatch) {
+     sink = match;
+   } else if (inMessage) {
+     sink = message;
+   } else if (inShortMessage) {
+     sink = shortMessage;
+   } else {
+     return;
+   }
+   sink.append(text);
+ }
+
+}
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/Unifier.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/Unifier.java
new file mode 100644
index 0000000..7fbb35d
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/Unifier.java
@@ -0,0 +1,432 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2006 Daniel Naber (http://www.danielnaber.de)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+
+package de.danielnaber.languagetool.rules.patterns;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import de.danielnaber.languagetool.AnalyzedToken;
+import de.danielnaber.languagetool.AnalyzedTokenReadings;
+
+/**
+ * Implements unification of features over tokens.
+ *
+ * @author Marcin Milkowski
+ */
+public class Unifier {
+
+ //TODO: add a possibility to negate some features but not all
+ /**
+ * Negates the meaning of unification just like negation in Element tokens.
+ */
+ private boolean negation;
+
+ private boolean allFeatsIn;
+
+ private int tokCnt;
+
+ private int readingsCounter;
+
+ private final List<AnalyzedTokenReadings> tokSequence;
+
+ /**
+ * A Map for storing the equivalence types for features. Features are
+ * specified as Strings, and map into types defined as maps from Strings to
+ * Elements.
+ */
+ private final Map<EquivalenceTypeLocator, Element> equivalenceTypes;
+
+ /**
+ * A Map that stores all possible equivalence types listed for features.
+ */
+ private final Map<String, List<String>> equivalenceFeatures;
+
+ /**
+ * Map of sets of matched equivalences in the unified sequence.
+ */
+ private final List<Map<String, Set<String>>> equivalencesMatched;
+
+ /**
+ * Marks found interpretations in subsequent tokens.
+ */
+ private List<Boolean> featuresFound;
+
+ /**
+ * For checking the current token.
+ */
+ private List<Boolean> tmpFeaturesFound;
+
+ /**
+ * Internal flag for checking whether the first token in tokSequence has to be
+ * yet unified.
+ */
+ private boolean firstUnified;
+
+ private boolean inUnification;
+ private boolean uniMatched;
+ private boolean uniAllMatched;
+ private AnalyzedTokenReadings[] unifiedTokens;
+
+ /**
+  * Creates a unifier with empty equivalence tables and counters in their
+  * initial state ({@code tokCnt} at -1, {@code readingsCounter} at 1).
+  */
+ public Unifier() {
+   // equivalence definitions
+   equivalenceTypes = new HashMap<EquivalenceTypeLocator, Element>();
+   equivalenceFeatures = new HashMap<String, List<String>>();
+   // per-sequence matching state
+   equivalencesMatched = new ArrayList<Map<String, Set<String>>>();
+   tokSequence = new ArrayList<AnalyzedTokenReadings>();
+   featuresFound = new ArrayList<Boolean>();
+   tmpFeaturesFound = new ArrayList<Boolean>();
+   // counters
+   readingsCounter = 1;
+   tokCnt = -1;
+ }
+
+ /**
+  * Registers an equivalence type for a feature. Equivalence types are given
+  * as {@link Element}s. A feature/type pair that is already registered is
+  * left untouched.
+  *
+  * @param feature
+  *          Feature to be tested, like gender, grammatical case or number.
+  * @param type
+  *          Type of equivalence for the feature, for example plural, first
+  *          person, genitive.
+  * @param elem
+  *          Element specifying the equivalence.
+  */
+ public final void setEquivalence(final String feature, final String type,
+     final Element elem) {
+   final EquivalenceTypeLocator locator = new EquivalenceTypeLocator(feature, type);
+   if (equivalenceTypes.containsKey(locator)) {
+     return;
+   }
+   equivalenceTypes.put(locator, elem);
+   // Extend the list of type names known for this feature, creating it on
+   // first use.
+   final List<String> knownTypes = equivalenceFeatures.containsKey(feature)
+       ? equivalenceFeatures.get(feature)
+       : new ArrayList<String>();
+   knownTypes.add(type);
+   equivalenceFeatures.put(feature, knownTypes);
+ }
+
+ /**
+ * Tests if a token has shared features with other tokens.
+ * <p>
+ * Before {@link #startUnify()} has been called, each call records which
+ * equivalence types the given reading matches and adds the reading to the
+ * first entry of the token sequence. Afterwards the check is delegated to
+ * {@code checkNext}.
+ *
+ * @param aToken
+ * - token to be tested
+ * @param uFeatures
+ * - features to be tested, mapped to the equivalence types to
+ * consider; a null or empty type list means all types registered for
+ * that feature
+ * @return the unification result XOR-ed with the negation flag; false
+ * (regardless of negation) if {@code uFeatures} is null, if nothing was
+ * recorded before unification started, or if a requested type is not
+ * registered
+ */
+ protected final boolean isSatisfied(final AnalyzedToken aToken,
+ final Map<String, List<String>> uFeatures) {
+
+ if (allFeatsIn && equivalencesMatched.isEmpty()) {
+ return false;
+ }
+ // Error: no feature given!
+ if (uFeatures == null) {
+ return false; // throw exception??
+ }
+ boolean unified = true;
+ List<String> types;
+
+ if (allFeatsIn) {
+ unified &= checkNext(aToken, uFeatures);
+ } else {
+ // Recording phase: one slot in equivalencesMatched per call.
+ tokCnt++;
+ while (equivalencesMatched.size() <= tokCnt) {
+ equivalencesMatched.add(new HashMap<String, Set<String>>());
+ }
+ for (final Map.Entry<String, List<String>> feat : uFeatures.entrySet()) {
+ types = feat.getValue();
+ if (types == null || types.isEmpty()) {
+ types = equivalenceFeatures.get(feat.getKey());
+ }
+ for (final String typename : types) {
+ final Element testElem = equivalenceTypes
+ .get(new EquivalenceTypeLocator(feat.getKey(), typename));
+ if (testElem == null) {
+ return false;
+ }
+ if (testElem.isMatched(aToken)) {
+ if (!equivalencesMatched.get(tokCnt).containsKey(feat.getKey())) {
+ final Set<String> typeSet = new HashSet<String>();
+ typeSet.add(typename);
+ equivalencesMatched.get(tokCnt).put(feat.getKey(), typeSet);
+ } else {
+ equivalencesMatched.get(tokCnt).get(feat.getKey()).add(typename);
+ }
+ }
+ }
+ // Every feature needs at least one matched type.
+ unified &= equivalencesMatched.get(tokCnt).containsKey(feat.getKey());
+ if (!unified) {
+ break;
+ }
+ }
+ if (unified) {
+ if (tokCnt == 0 || tokSequence.isEmpty()) {
+ tokSequence.add(new AnalyzedTokenReadings(aToken, 0));
+ } else {
+ tokSequence.get(0).addReading(aToken);
+ }
+ }
+ }
+ return unified ^ negation;
+ }
+
+ /**
+ * Checks a reading of a later token against the equivalences recorded in
+ * {@code equivalencesMatched}. For every recorded slot {@code i} that is
+ * still marked in {@code featuresFound}, the reading must match, for each
+ * feature, at least one of the types stored in that slot. The per-slot
+ * results are written to a copy of {@code tmpFeaturesFound}, which replaces
+ * {@code tmpFeaturesFound} only if at least one slot was satisfied; in that
+ * case the reading is also added to {@code tokSequence}.
+ *
+ * @param aToken reading to test
+ * @param uFeatures features mapped to the types to consider; a null or
+ * empty list means all types registered for the feature
+ * @return true if the reading agrees with at least one recorded slot; also
+ * true, without any check, when {@code allFeatsIn} is not set
+ */
+ private boolean checkNext(final AnalyzedToken aToken,
+ final Map<String, List<String>> uFeatures) {
+ boolean unifiedNext = true;
+ boolean anyFeatUnified = false;
+ List<String> types;
+ ArrayList<Boolean> tokenFeaturesFound = new ArrayList<Boolean>(tmpFeaturesFound);
+ if (allFeatsIn) {
+ for (int i = 0; i <= tokCnt; i++) {
+ boolean allFeatsUnified = true;
+ for (Map.Entry<String, List<String>> feat : uFeatures.entrySet()) {
+ boolean featUnified = false;
+ types = feat.getValue();
+ if (types == null || types.isEmpty()) {
+ types = equivalenceFeatures.get(feat.getKey());
+ }
+ for (final String typename : types) {
+ if (featuresFound.get(i)
+ && equivalencesMatched.get(i).containsKey(feat.getKey())
+ && equivalencesMatched.get(i).get(feat.getKey()).contains(typename)) {
+ final Element testElem = equivalenceTypes
+ .get(new EquivalenceTypeLocator(feat.getKey(), typename));
+ featUnified = featUnified || testElem.isMatched(aToken);
+ }
+ }
+ allFeatsUnified &= featUnified;
+ }
+ tokenFeaturesFound.set(i, allFeatsUnified);
+ anyFeatUnified = anyFeatUnified || allFeatsUnified;
+ }
+ unifiedNext &= anyFeatUnified;
+ if (unifiedNext) {
+ // Start a new entry for the current token, or add another reading to it.
+ if (tokSequence.size() == readingsCounter) {
+ tokSequence.add(new AnalyzedTokenReadings(aToken, 0));
+ } else {
+ tokSequence.get(readingsCounter).addReading(aToken);
+ }
+ tmpFeaturesFound = tokenFeaturesFound;
+ }
+ }
+ return unifiedNext;
+ }
+
+ /**
+ * Call after every complete token (AnalyzedTokenReadings) checked.
+ * Replaces {@code featuresFound} with a copy of {@code tmpFeaturesFound}
+ * and increments {@code readingsCounter}.
+ */
+ public final void startNextToken() {
+ featuresFound = new ArrayList<Boolean>(tmpFeaturesFound);
+ readingsCounter++;
+ }
+
+ /**
+  * Switches to the phase in which only previously matched equivalences are
+  * tested. One {@code true} flag is added to {@code featuresFound} for each
+  * of the {@code tokCnt + 1} recorded slots, and {@code tmpFeaturesFound}
+  * becomes a copy of the result.
+  */
+ public final void startUnify() {
+   allFeatsIn = true;
+   for (int remaining = tokCnt + 1; remaining > 0; remaining--) {
+     featuresFound.add(Boolean.TRUE);
+   }
+   tmpFeaturesFound = new ArrayList<Boolean>(featuresFound);
+ }
+
+ /**
+ * Sets the negation flag.
+ *
+ * @param neg new value of the negation flag
+ */
+ public final void setNegation(final boolean neg) {
+ negation = neg;
+ }
+
+ /**
+ * @return the current value of the negation flag
+ */
+ public final boolean getNegation() {
+ return negation;
+ }
+
+ /**
+  * Resets after use of unification. Required. Clears the matching state and
+  * the collected token sequence, and restores all flags and counters to
+  * their initial values; the registered equivalence definitions are kept.
+  */
+ public final void reset() {
+   // collected data
+   equivalencesMatched.clear();
+   featuresFound.clear();
+   tmpFeaturesFound.clear();
+   tokSequence.clear();
+   // counters
+   tokCnt = -1;
+   readingsCounter = 1;
+   // flags
+   allFeatsIn = false;
+   negation = false;
+   firstUnified = false;
+   inUnification = false;
+   uniMatched = false;
+   uniAllMatched = false;
+ }
+
+ /**
+ * Gets a full sequence of filtered tokens.
+ * <p>
+ * On the first successful call after a reset, the first entry of the
+ * sequence is rebuilt so that it keeps only the readings whose flag in
+ * {@code tmpFeaturesFound} is set.
+ *
+ * @return Array of AnalyzedTokenReadings that match equivalence relation
+ * defined for features tested, or null if the sequence is empty or no
+ * flag in {@code tmpFeaturesFound} is set.
+ */
+ public final AnalyzedTokenReadings[] getUnifiedTokens() {
+ if (tokSequence.isEmpty()) {
+ return null;
+ }
+ if (!firstUnified) {
+ AnalyzedTokenReadings tmpATR;
+ int first = 0;
+ // A temporary 'true' appended as a sentinel lets the loop below run
+ // without a bounds check; it is removed again right after.
+ tmpFeaturesFound.add(true); // Bentley's search idea
+ while (!tmpFeaturesFound.get(first)) {
+ first++;
+ }
+ tmpFeaturesFound.remove(tmpFeaturesFound.size() - 1);
+ // Only the sentinel was found, so no real flag is set.
+ if (first >= tmpFeaturesFound.size()) {
+ return null;
+ }
+ // FIXME: why this happens??
+ final int numRead = tokSequence.get(0).getReadingsLength();
+ if (first < numRead) {
+ tmpATR = new AnalyzedTokenReadings(tokSequence.get(0).getAnalyzedToken(
+ first), 0);
+ for (int i = first + 1; i <= Math.min(numRead - 1, tokCnt); i++) {
+ if (tmpFeaturesFound.get(i)) {
+ tmpATR.addReading(tokSequence.get(0).getAnalyzedToken(i));
+ }
+ }
+ tokSequence.set(0, tmpATR);
+ }
+ firstUnified = true;
+ }
+ final AnalyzedTokenReadings[] atr = tokSequence
+ .toArray(new AnalyzedTokenReadings[tokSequence.size()]);
+ return atr;
+ }
+
+ /**
+ * Tests if the token sequence is unified.
+ * <p>
+ * Readings passed in before the first {@code lastReading == true} call are
+ * only recorded, and the method returns true for them. After that call,
+ * each reading is checked against the recorded equivalences.
+ *
+ * @param matchToken
+ * AnalyzedToken token to unify
+ * @param uFeatures
+ * features to unify over, mapped to the value types of each feature
+ * @param isUniNegated
+ * if true, the negation flag is set before the reading is recorded
+ * @param lastReading
+ * true when the matchToken is the last reading in the
+ * AnalyzedReadings
+ * @return True if the tokens in the sequence are unified.
+ */
+ public final boolean isUnified(final AnalyzedToken matchToken,
+ final Map<String, List<String>> uFeatures, final boolean isUniNegated,
+ final boolean lastReading) {
+ if (inUnification) {
+ // Accumulate over the readings of the current token: one satisfied
+ // reading is enough.
+ uniMatched |= isSatisfied(matchToken, uFeatures);
+ uniAllMatched = uniMatched;
+ if (lastReading) {
+ startNextToken();
+ unifiedTokens = getUnifiedTokens();
+ uniMatched = false;
+ }
+ return uniAllMatched;
+ }
+ if (isUniNegated) {
+ setNegation(true);
+ }
+ // Recording phase: the result is deliberately not used here.
+ isSatisfied(matchToken, uFeatures);
+ if (lastReading) {
+ inUnification = true;
+ uniMatched = false;
+ startUnify();
+ }
+ return true;
+ }
+
+ /**
+  * Used for getting a unified sequence in case when simple test method
+  * {@link #isUnified} was used.
+  *
+  * @return the array cached by {@link #isUnified}, or null when the unifier
+  *         is not in its unification phase
+  */
+ public final AnalyzedTokenReadings[] getFinalUnified() {
+   return inUnification ? unifiedTokens : null;
+ }
+}
+
+/**
+ * Immutable key made of a feature name and an equivalence type name, with
+ * value-based {@code equals} and {@code hashCode}. Either part may be null.
+ */
+class EquivalenceTypeLocator {
+
+  private final String feature;
+  private final String type;
+
+  EquivalenceTypeLocator(final String feature, final String type) {
+    this.feature = feature;
+    this.type = type;
+  }
+
+  @Override
+  public int hashCode() {
+    final int featureHash = feature == null ? 0 : feature.hashCode();
+    final int typeHash = type == null ? 0 : type.hashCode();
+    return 31 * (31 + featureHash) + typeHash;
+  }
+
+  @Override
+  public boolean equals(final Object obj) {
+    if (this == obj) {
+      return true;
+    }
+    if (obj == null || getClass() != obj.getClass()) {
+      return false;
+    }
+    final EquivalenceTypeLocator that = (EquivalenceTypeLocator) obj;
+    return sameText(feature, that.feature) && sameText(type, that.type);
+  }
+
+  /** Null-safe string comparison: two nulls are equal. */
+  private static boolean sameText(final String left, final String right) {
+    return left == null ? right == null : left.equals(right);
+  }
+
+} \ No newline at end of file
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/XMLRuleHandler.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/XMLRuleHandler.java
new file mode 100644
index 0000000..72a852a
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/XMLRuleHandler.java
@@ -0,0 +1,568 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2006 Daniel Naber (http://www.danielnaber.de)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+package de.danielnaber.languagetool.rules.patterns;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.xml.sax.Attributes;
+import org.xml.sax.Locator;
+import org.xml.sax.SAXException;
+import org.xml.sax.SAXParseException;
+import org.xml.sax.helpers.DefaultHandler;
+
+import de.danielnaber.languagetool.Language;
+import de.danielnaber.languagetool.rules.IncorrectExample;
+import de.danielnaber.languagetool.tools.StringTools;
+
+/**
+ * XML rule handler that loads rules from XML and throws
+ * exceptions on errors and warnings.
+ *
+ * @author Daniel Naber
+ */
+public class XMLRuleHandler extends DefaultHandler {
+
+ /**
+ * Creates a handler with an empty element list, an empty feature map and
+ * an empty type list.
+ */
+ public XMLRuleHandler() {
+ elementList = new ArrayList<Element>();
+ equivalenceFeatures = new HashMap<String, List<String>>();
+ uTypeList = new ArrayList<String>();
+ }
+
+ List<PatternRule> rules = new ArrayList<PatternRule>();
+
+ protected Language language;
+
+ protected StringBuilder correctExample = new StringBuilder();
+ protected StringBuilder incorrectExample = new StringBuilder();
+ protected StringBuilder exampleCorrection = new StringBuilder();
+ protected StringBuilder message = new StringBuilder();
+ protected StringBuilder match = new StringBuilder();
+ protected StringBuilder elements;
+ protected StringBuilder exceptions;
+
+ List<String> correctExamples = new ArrayList<String>();
+ List<IncorrectExample> incorrectExamples = new ArrayList<IncorrectExample>();
+
+ protected boolean inPattern;
+ protected boolean inCorrectExample;
+ protected boolean inIncorrectExample;
+ protected boolean inMessage;
+ protected boolean inSuggestion;
+ protected boolean inMatch;
+ protected boolean inRuleGroup;
+ protected boolean inToken;
+ protected boolean inException;
+ protected boolean inPhrases;
+ protected boolean inAndGroup;
+
+ protected boolean tokenSpaceBefore;
+ protected boolean tokenSpaceBeforeSet;
+ protected String posToken;
+ protected boolean posNegation;
+ protected boolean posRegExp;
+
+ protected boolean caseSensitive;
+ protected boolean regExpression;
+ protected boolean tokenNegated;
+ protected boolean tokenInflected;
+
+ protected String exceptionPosToken;
+ protected boolean exceptionStringRegExp;
+ protected boolean exceptionStringNegation;
+ protected boolean exceptionStringInflected;
+ protected boolean exceptionPosNegation;
+ protected boolean exceptionPosRegExp;
+ protected boolean exceptionValidNext;
+ protected boolean exceptionValidPrev;
+ protected boolean exceptionSet;
+ protected boolean exceptionSpaceBefore;
+ protected boolean exceptionSpaceBeforeSet;
+
+ /** List of elements as specified by tokens. **/
+ protected List<Element> elementList;
+
+ /** true when phraseref is the last element in the rule. **/
+ protected boolean lastPhrase;
+
+ /** ID reference to the phrase. **/
+ protected String phraseIdRef;
+
+ /** Current phrase ID. **/
+ protected String phraseId;
+
+ protected int skipPos;
+
+ protected String ruleGroupId;
+
+ protected String id;
+
+ protected Element tokenElement;
+
+ protected Match tokenReference;
+
+ protected List<Match> suggestionMatches;
+
+ protected Locator pLocator;
+
+ protected int startPositionCorrection;
+ protected int endPositionCorrection;
+ protected int tokenCounter;
+
+ /** Phrase store - elementLists keyed by phraseIds. **/
+ protected Map<String, List<List<Element>>> phraseMap;
+
+ /**
+ * Logically forking element list, used for including multiple phrases in the
+ * current one.
+ **/
+ protected List<ArrayList<Element>> phraseElementList;
+
+ protected int andGroupCounter;
+
+ protected StringBuilder shortMessage = new StringBuilder();
+ protected boolean inShortMessage;
+
+ protected boolean inUnification;
+ protected boolean inUnificationDef;
+ protected boolean uniNegation;
+
+ protected String uFeature;
+ protected String uType = "";
+
+ protected List<String> uTypeList;
+
+ protected Map<String, List<String>> equivalenceFeatures;
+
+
+ /** Definitions of values in XML files. */
+ protected static final String YES = "yes";
+ protected static final String POSTAG = "postag";
+ protected static final String POSTAG_REGEXP = "postag_regexp";
+ protected static final String REGEXP = "regexp";
+ protected static final String NEGATE = "negate";
+ protected static final String INFLECTED = "inflected";
+ protected static final String NEGATE_POS = "negate_pos";
+ protected static final String MARKER = "marker";
+ protected static final String DEFAULT = "default";
+ protected static final String TYPE = "type";
+ protected static final String SPACEBEFORE = "spacebefore";
+ protected static final String EXAMPLE = "example";
+ protected static final String SCOPE = "scope";
+ protected static final String IGNORE = "ignore";
+ protected static final String SKIP = "skip";
+ protected static final String TOKEN = "token";
+ protected static final String FEATURE = "feature";
+ protected static final String UNIFY = "unify";
+ protected static final String AND = "and";
+ protected static final String EXCEPTION = "exception";
+ protected static final String CASE_SENSITIVE = "case_sensitive";
+ protected static final String PATTERN = "pattern";
+ protected static final String MATCH = "match";
+ protected static final String UNIFICATION = "unification";
+ protected static final String RULEGROUP = "rulegroup";
+ protected static final String NO = "no";
+ protected static final String MARK_TO = "mark_to";
+ protected static final String MARK_FROM = "mark_from";
+ protected static final String PHRASES = "phrases";
+ protected static final String MESSAGE = "message";
+
+
+ /**
+ * @return the {@code rules} list itself; no copy is made
+ */
+ public List<PatternRule> getRules() {
+ return rules;
+ }
+
+ /**
+  * Treats parser warnings as fatal by rethrowing them.
+  *
+  * @param e the warning reported by the parser
+  * @throws SAXException always; the given exception itself is thrown
+  */
+ @Override
+ public void warning (final SAXParseException e) throws SAXException {
+   throw e;
+ }
+
+ /**
+  * Treats recoverable parser errors as fatal by rethrowing them.
+  *
+  * @param e the error reported by the parser
+  * @throws SAXException always; the given exception itself is thrown
+  */
+ @Override
+ public void error (final SAXParseException e) throws SAXException {
+   throw e;
+ }
+
+ /**
+ * Stores the locator in {@code pLocator} (read later when building error
+ * messages) and passes it on to the superclass.
+ *
+ * @param locator locator supplied by the parser
+ */
+ @Override
+ public void setDocumentLocator(final Locator locator) {
+ pLocator = locator;
+ super.setDocumentLocator(locator);
+ }
+
+ /**
+  * Clears the per-token parsing flags, resets the exception flags via
+  * {@link #resetException()}, and drops the current token reference.
+  */
+ protected void resetToken() {
+   posNegation = posRegExp = inToken = false;
+   tokenSpaceBefore = tokenSpaceBeforeSet = false;
+
+   resetException();
+   exceptionSet = false;
+   tokenReference = null;
+ }
+
+ /**
+  * Sets all per-exception parsing flags back to false.
+  */
+ protected void resetException() {
+   // string-related flags
+   exceptionStringNegation = exceptionStringInflected = exceptionStringRegExp = false;
+   // part-of-speech flags
+   exceptionPosNegation = exceptionPosRegExp = false;
+   // scope flags
+   exceptionValidNext = exceptionValidPrev = false;
+   // whitespace flags
+   exceptionSpaceBefore = exceptionSpaceBeforeSet = false;
+ }
+
+ /**
+  * Creates {@code phraseElementList} on first use; later calls leave the
+  * existing list untouched.
+  */
+ protected void phraseElementInit() {
+   if (phraseElementList != null) {
+     return;
+   }
+   phraseElementList = new ArrayList<ArrayList<Element>>();
+ }
+ /**
+  * Expands a phrase reference. For every element list stored under the
+  * referenced phrase id, a new list made of the elements collected so far
+  * followed by the phrase's elements is added to {@code phraseElementList}.
+  * The phrase's elements are tagged with the phrase id. A reference to an
+  * unknown phrase is ignored.
+  *
+  * @param attrs attributes of the {@code phraseref} element; {@code idref}
+  *          names the phrase
+  */
+ protected void preparePhrase(final Attributes attrs) {
+   phraseIdRef = attrs.getValue("idref");
+   // phraseMap is created lazily by finalizePhrase(), so it is still null
+   // when a reference is seen before any phrase has been defined. Treat that
+   // like any other unknown id instead of failing with a NullPointerException.
+   if (phraseMap == null || !phraseMap.containsKey(phraseIdRef)) {
+     return;
+   }
+   // The target list is lazily created as well.
+   phraseElementInit();
+   for (final List<Element> curPhrEl : phraseMap.get(phraseIdRef)) {
+     for (final Element e : curPhrEl) {
+       e.setPhraseName(phraseIdRef);
+     }
+     // Copying elementList first also covers the case where it is empty.
+     final ArrayList<Element> expanded = new ArrayList<Element>(elementList);
+     expanded.addAll(curPhrEl);
+     phraseElementList.add(expanded);
+   }
+   lastPhrase = true;
+ }
+
+ protected void finalizePhrase() {
+ // lazy init
+ if (phraseMap == null) {
+ phraseMap = new HashMap<String, List<List<Element>>>();
+ }
+ phraseElementInit();
+ if (phraseElementList.isEmpty()) {
+ phraseElementList.add(new ArrayList<Element>(elementList));
+ } else {
+ for (final ArrayList<Element> ph : phraseElementList) {
+ ph.addAll(new ArrayList<Element>(elementList));
+ }
+ }
+
+ phraseMap.put(phraseId, new ArrayList<List<Element>>(phraseElementList));
+ elementList.clear();
+
+ phraseElementList.clear();
+ }
+
+ protected void startPattern(final Attributes attrs) throws SAXException {
+ inPattern = true;
+ if (attrs.getValue(MARK_FROM) != null) {
+ startPositionCorrection = Integer.parseInt(attrs.getValue(MARK_FROM));
+ }
+ if (attrs.getValue(MARK_TO) != null) {
+ endPositionCorrection = Integer.parseInt(attrs.getValue(MARK_TO));
+ if (endPositionCorrection > 0) {
+ throw new SAXException("End position correction (mark_to="+ endPositionCorrection
+ + ") cannot be larger than 0: " + "\n Line: "
+ + pLocator.getLineNumber() + ", column: "
+ + pLocator.getColumnNumber() + ".");
+ }
+ }
+ caseSensitive = YES.equals(attrs.getValue(CASE_SENSITIVE));
+ }
+
+
+ /**
+  * Shifts the token references of phrase elements that are reference
+  * elements. For every element after the first one that carries a phrase
+  * name and is a reference element, the token reference of its match and the
+  * corresponding {@code \N} marker in its string are both increased by
+  * (position - 1), where position is the element's zero-based index.
+  *
+  * @param elList
+  *          Element list to adjust. It is modified in place.
+  */
+ protected void processElement(final List<Element> elList) {
+   int nextIndex = 0;
+   for (final Element current : elList) {
+     final int index = nextIndex++;
+     // Only phrase elements behind the first position that refer to
+     // another token need their reference moved.
+     if (current.getPhraseName() == null || index == 0
+         || !current.isReferenceElement()) {
+       continue;
+     }
+     final int oldRef = current.getMatch().getTokenRef();
+     final int newRef = oldRef + index - 1;
+     current.getMatch().setTokenRef(newRef);
+     final String shifted =
+         current.getString().replace("\\" + oldRef, "\\" + newRef);
+     current.setStringElement(shifted);
+   }
+ }
+
+ /**
+  * Handles the start of a match element. A {@link Match} is built from the
+  * attributes and then either
+  * <ul>
+  * <li>registered as a suggestion match when inside a message (a marker
+  * consisting of the character U+0001, a backslash and the reference number
+  * is appended to the message), or</li>
+  * <li>stored as the token reference when inside a token that has a
+  * {@code no} attribute (a backslash and the number are appended to the
+  * token text).</li>
+  * </ul>
+  *
+  * @param attrs attributes of the match element
+  * @throws SAXException if a reference inside a message is empty or smaller
+  *           than 1, or if a reference inside a token points to a token that
+  *           has not been read yet
+  */
+ protected void setMatchElement(final Attributes attrs) throws SAXException {
+   inMatch = true;
+   match = new StringBuilder();
+
+   // The attribute values are turned into enum names. Upper-casing must not
+   // depend on the default locale: with a Turkish locale "following" would
+   // become "FOLLOWİNG" and the lookup would fail.
+   Match.CaseConversion caseConversion = Match.CaseConversion.NONE;
+   final String caseConversionAttr = attrs.getValue("case_conversion");
+   if (caseConversionAttr != null) {
+     caseConversion = Match.CaseConversion.toCase(
+         caseConversionAttr.toUpperCase(java.util.Locale.ENGLISH));
+   }
+   Match.IncludeRange includeRange = Match.IncludeRange.NONE;
+   final String includeSkippedAttr = attrs.getValue("include_skipped");
+   if (includeSkippedAttr != null) {
+     includeRange = Match.IncludeRange.toRange(
+         includeSkippedAttr.toUpperCase(java.util.Locale.ENGLISH));
+   }
+
+   final Match mWorker = new Match(attrs.getValue(POSTAG),
+       attrs.getValue("postag_replace"),
+       YES.equals(attrs.getValue(POSTAG_REGEXP)),
+       attrs.getValue("regexp_match"), attrs.getValue("regexp_replace"),
+       caseConversion, YES.equals(attrs.getValue("setpos")),
+       includeRange);
+   mWorker.setInMessageOnly(!inSuggestion);
+
+   final String refAttr = attrs.getValue("no");
+   if (inMessage) {
+     if (suggestionMatches == null) {
+       suggestionMatches = new ArrayList<Match>();
+     }
+     suggestionMatches.add(mWorker);
+     //add incorrect XML character for simplicity
+     message.append("\u0001\\");
+     message.append(refAttr);
+     if (StringTools.isEmpty(refAttr)) {
+       throw new SAXException("References cannot be empty: " + "\n Line: "
+           + pLocator.getLineNumber() + ", column: "
+           + pLocator.getColumnNumber() + ".");
+     } else if (Integer.parseInt(refAttr) < 1) {
+       throw new SAXException("References must be larger than 0: "
+           + refAttr + "\n Line: " + pLocator.getLineNumber()
+           + ", column: " + pLocator.getColumnNumber() + ".");
+     }
+   } else if (inToken && refAttr != null) {
+     final int refNumber = Integer.parseInt(refAttr);
+     // A token may only refer to tokens that precede it.
+     if (refNumber > elementList.size()) {
+       throw new SAXException(
+           "Only backward references in match elements are possible, tried to specify token "
+               + refNumber
+               + "\n Line: "
+               + pLocator.getLineNumber()
+               + ", column: " + pLocator.getColumnNumber() + ".");
+     }
+     mWorker.setTokenRef(refNumber);
+     tokenReference = mWorker;
+     elements.append('\\');
+     elements.append(refNumber);
+   }
+ }
+
+ protected void setExceptions(final Attributes attrs) {
+ inException = true;
+ exceptions = new StringBuilder();
+ resetException();
+
+ exceptionStringNegation = YES.equals(attrs.getValue(NEGATE));
+ exceptionValidNext = "next".equals(attrs.getValue(SCOPE));
+ exceptionValidPrev = "previous".equals(attrs.getValue(SCOPE));
+ exceptionStringInflected = YES.equals(attrs.getValue(INFLECTED));
+
+ if (attrs.getValue(POSTAG) != null) {
+ exceptionPosToken = attrs.getValue(POSTAG);
+ exceptionPosRegExp = YES.equals(attrs.getValue(POSTAG_REGEXP));
+ exceptionPosNegation = YES.equals(attrs.getValue(NEGATE_POS));
+ }
+ exceptionStringRegExp = YES.equals(attrs.getValue(REGEXP));
+ if (attrs.getValue(SPACEBEFORE) != null) {
+ exceptionSpaceBefore = YES.equals(attrs.getValue(SPACEBEFORE));
+ exceptionSpaceBeforeSet = !"ignore".equals(attrs.getValue(SPACEBEFORE));
+ }
+ }
+
+ /**
+ * Handles the end of an exception element: attaches the collected string,
+ * POS and whitespace exceptions to the current token element and resets the
+ * exception flags.
+ */
+ protected void finalizeExceptions() {
+ inException = false;
+ // The first exception of a token creates the token element from the text
+ // collected so far; later exceptions reuse that element.
+ if (!exceptionSet) {
+ tokenElement = new Element(StringTools.trimWhitespace(elements
+ .toString()), caseSensitive, regExpression, tokenInflected);
+ exceptionSet = true;
+ }
+ tokenElement.setNegation(tokenNegated);
+ // A string exception is only set when the exception element had text.
+ if (!StringTools.isEmpty(exceptions.toString())) {
+ tokenElement.setStringException(StringTools.trimWhitespace(exceptions
+ .toString()), exceptionStringRegExp, exceptionStringInflected,
+ exceptionStringNegation, exceptionValidNext, exceptionValidPrev);
+ }
+ // The POS exception is consumed here so it cannot leak into the next one.
+ if (exceptionPosToken != null) {
+ tokenElement.setPosException(exceptionPosToken, exceptionPosRegExp,
+ exceptionPosNegation, exceptionValidNext, exceptionValidPrev);
+ exceptionPosToken = null;
+ }
+ if (exceptionSpaceBeforeSet) {
+ tokenElement.setExceptionSpaceBefore(exceptionSpaceBefore);
+ }
+ resetException();
+ }
+
+ protected void setToken(final Attributes attrs) {
+ inToken = true;
+
+ if (lastPhrase) {
+ elementList.clear();
+ }
+
+ lastPhrase = false;
+ tokenNegated = YES.equals(attrs.getValue(NEGATE));
+ tokenInflected = YES.equals(attrs.getValue(INFLECTED));
+ if (attrs.getValue("skip") != null) {
+ skipPos = Integer.parseInt(attrs.getValue("skip"));
+ }
+ elements = new StringBuilder();
+ // POSElement creation
+ if (attrs.getValue(POSTAG) != null) {
+ posToken = attrs.getValue(POSTAG);
+ posRegExp = YES.equals(attrs.getValue(POSTAG_REGEXP));
+ posNegation = YES.equals(attrs.getValue(NEGATE_POS));
+ }
+ regExpression = YES.equals(attrs.getValue(REGEXP));
+
+ if (attrs.getValue(SPACEBEFORE) != null) {
+ tokenSpaceBefore = YES.equals(attrs.getValue(SPACEBEFORE));
+ tokenSpaceBeforeSet = !"ignore".equals(attrs.getValue(SPACEBEFORE));
+ }
+
+ if (!inAndGroup) {
+ tokenCounter++;
+ }
+ }
+
+ protected void checkPositions(final int add) throws SAXException {
+ if (startPositionCorrection >= tokenCounter + add) {
+ throw new SAXException(
+ "Attempt to mark a token no. ("+ startPositionCorrection +") that is outside the pattern ("
+ + tokenCounter + "). Pattern elements are numbered starting from 0!" + "\n Line: "
+ + pLocator.getLineNumber() + ", column: "
+ + pLocator.getColumnNumber() + ".");
+ }
+ if (tokenCounter +add - endPositionCorrection < 0) {
+ throw new SAXException(
+ "Attempt to mark a token no. ("+ endPositionCorrection +") that is outside the pattern ("
+ + tokenCounter + " elements). End positions should be negative but not larger than the token count!"
+ + "\n Line: "
+ + pLocator.getLineNumber() + ", column: "
+ + pLocator.getColumnNumber() + ".");
+ }
+ }
+
+ /**
+  * Checks that {@code mark_from} and {@code mark_to} leave at least one
+  * token marked. The check is skipped while phrase element lists are
+  * pending.
+  *
+  * @throws RuntimeException if the marked span would be shorter than one
+  *           token
+  */
+ protected void checkMarkPositions() {
+   final boolean phrasesPending =
+       phraseElementList != null && phraseElementList.size() != 0;
+   if (phrasesPending) {
+     return;
+   }
+   final int endMarker = elementList.size() + endPositionCorrection;
+   if (endMarker > startPositionCorrection) {
+     return;
+   }
+   throw new RuntimeException("Invalid combination of mark_from (" + startPositionCorrection
+       + ") and mark_to (" + endPositionCorrection + ") for rule " + id
+       + " with " + elementList.size()
+       + " tokens: the error position created by mark_from and mark_to is less than one token");
+ }
+
+ /**
+  * Adds Match objects for all references to tokens
+  * (including '\1' and the like).
+  * <p>
+  * The message is scanned for a backslash followed by a digit. A reference
+  * preceded by the marker character U+0001 belongs to a match element: the
+  * next entry of {@code suggestionMatches} is used and the marker is removed
+  * from {@code message}. Any other reference is a plain one and gets a new,
+  * empty, message-only {@link Match}.
+  *
+  * @return {@code null} if there are no suggestion matches; otherwise the
+  *         matches in the order of the references in the message, or the
+  *         unchanged suggestion matches if the message has no references
+  */
+ protected List<Match> addLegacyMatches() {
+   if (suggestionMatches == null || suggestionMatches.isEmpty()) {
+     return null;
+   }
+   final List<Match> sugMatch = new ArrayList<Match>();
+   final String messageStr = message.toString();
+   int matchCounter = 0;
+   // Start at index 0 so that a reference at the very beginning of the
+   // message is not skipped.
+   int pos = messageStr.indexOf('\\');
+   while (pos != -1) {
+     // A backslash that ends the message has no following character.
+     final boolean hasNext = pos + 1 < messageStr.length();
+     if (hasNext && Character.isDigit(messageStr.charAt(pos + 1))) {
+       final boolean marked = pos > 0 && messageStr.charAt(pos - 1) == '\u0001';
+       if (marked) { // real suggestion marker
+         sugMatch.add(suggestionMatches.get(matchCounter));
+         // messageStr is a snapshot; every marker removed so far shifts the
+         // positions in the live buffer by one.
+         message.deleteCharAt(pos - 1 - matchCounter);
+         matchCounter++;
+       } else {
+         final Match mWorker = new Match(null, null, false, null,
+             null, Match.CaseConversion.NONE, false, Match.IncludeRange.NONE);
+         mWorker.setInMessageOnly(true);
+         sugMatch.add(mWorker);
+       }
+     }
+     pos = messageStr.indexOf('\\', pos + 1);
+   }
+   if (sugMatch.isEmpty()) {
+     return suggestionMatches;
+   }
+   return sugMatch;
+ }
+
+ /**
+ * Handles the end of a token element: completes the token element from the
+ * collected state, adds it to the element list (or to the current and-group),
+ * applies unification settings and resets the per-token state.
+ */
+ protected void finalizeTokens() {
+ // Without a preceding exception no element exists yet, so create it;
+ // otherwise only update the text of the element the exception created.
+ if (!exceptionSet || tokenElement == null) {
+ tokenElement = new Element(StringTools.trimWhitespace(elements
+ .toString()), caseSensitive, regExpression, tokenInflected);
+ tokenElement.setNegation(tokenNegated);
+ } else {
+ tokenElement.setStringElement(StringTools.trimWhitespace(elements
+ .toString()));
+ }
+
+ // skipPos and posToken are consumed so they do not carry over to the
+ // next token.
+ if (skipPos != 0) {
+ tokenElement.setSkipNext(skipPos);
+ skipPos = 0;
+ }
+ if (posToken != null) {
+ tokenElement.setPosElement(posToken, posRegExp, posNegation);
+ posToken = null;
+ }
+
+ if (tokenReference != null) {
+ tokenElement.setMatch(tokenReference);
+ }
+
+ // The first token of an and-group is added to the list; the following
+ // ones are attached to that last list entry.
+ if (inAndGroup && andGroupCounter > 0) {
+ elementList.get(elementList.size() - 1)
+ .setAndGroupElement(tokenElement);
+ } else {
+ elementList.add(tokenElement);
+ }
+ if (inAndGroup) {
+ andGroupCounter++;
+ }
+
+ if (inUnification) {
+ tokenElement.setUnification(equivalenceFeatures);
+ if (uniNegation) {
+ tokenElement.setUniNegation();
+ }
+ }
+
+ // Inside a unification definition the token is registered with the
+ // language's unifier and the element list is emptied again.
+ if (inUnificationDef) {
+ language.getUnifier().setEquivalence(uFeature, uType, tokenElement);
+ elementList.clear();
+ }
+ if (tokenSpaceBeforeSet) {
+ tokenElement.setWhitespaceBefore(tokenSpaceBefore);
+ }
+ resetToken();
+ }
+
+}
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/bitext/BitextPatternRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/bitext/BitextPatternRule.java
new file mode 100644
index 0000000..1d42a17
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/bitext/BitextPatternRule.java
@@ -0,0 +1,93 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2010 Marcin Miłkowski (www.languagetool.org)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+
+package de.danielnaber.languagetool.rules.patterns.bitext;
+
+import java.io.IOException;
+
+import de.danielnaber.languagetool.AnalyzedSentence;
+import de.danielnaber.languagetool.rules.Rule;
+import de.danielnaber.languagetool.rules.RuleMatch;
+import de.danielnaber.languagetool.rules.bitext.BitextRule;
+import de.danielnaber.languagetool.rules.patterns.PatternRule;
+
+/**
+ * A pattern rule for bitext, i.e. for a pair of source and target sentences.
+ * It combines two {@link PatternRule}s: the target rule's matches are
+ * reported only if the source rule matches the source sentence. The
+ * single-sentence {@link Rule#match} variant never reports anything. The
+ * rules are defined in XML files with a syntax similar to that of normal
+ * PatternRules.
+ *
+ * @author Marcin Miłkowski
+ */
+public class BitextPatternRule extends BitextRule {
+
+  private final PatternRule srcRule;
+  private final PatternRule trgRule;
+
+  /**
+   * @param src rule applied to the source sentence
+   * @param trg rule applied to the target sentence
+   */
+  BitextPatternRule(final PatternRule src, final PatternRule trg) {
+    this.srcRule = src;
+    this.trgRule = trg;
+  }
+
+  /** @return the rule applied to the source sentence */
+  public PatternRule getSrcRule() {
+    return this.srcRule;
+  }
+
+  /** @return the rule applied to the target sentence */
+  public PatternRule getTrgRule() {
+    return this.trgRule;
+  }
+
+  /** @return the description of the source rule */
+  @Override
+  public String getDescription() {
+    return this.srcRule.getDescription();
+  }
+
+  /** @return the message of the target rule */
+  public String getMessage() {
+    return this.trgRule.getMessage();
+  }
+
+  /** @return the id of the source rule */
+  @Override
+  public String getId() {
+    return this.srcRule.getId();
+  }
+
+  /**
+   * This method always returns an empty array.
+   */
+  @Override
+  public RuleMatch[] match(AnalyzedSentence text) throws IOException {
+    return new RuleMatch[0];
+  }
+
+  /**
+   * Applies the source rule to the source sentence and, only if it matches
+   * at least once, the target rule to the target sentence.
+   *
+   * @return the matches of the target rule, or an empty array if the source
+   *         rule did not match
+   */
+  @Override
+  public RuleMatch[] match(AnalyzedSentence sourceText,
+      AnalyzedSentence targetText) throws IOException {
+    final boolean sourceMatched = srcRule.match(sourceText).length > 0;
+    if (!sourceMatched) {
+      return new RuleMatch[0];
+    }
+    return trgRule.match(targetText);
+  }
+
+  @Override
+  public void reset() {
+    // This class has no mutable state of its own, so there is nothing to do.
+  }
+
+}
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/bitext/BitextPatternRuleLoader.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/bitext/BitextPatternRuleLoader.java
new file mode 100644
index 0000000..508f381
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/bitext/BitextPatternRuleLoader.java
@@ -0,0 +1,413 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+package de.danielnaber.languagetool.rules.patterns.bitext;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+
+import javax.xml.parsers.SAXParser;
+import javax.xml.parsers.SAXParserFactory;
+
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+import de.danielnaber.languagetool.Language;
+import de.danielnaber.languagetool.bitext.StringPair;
+import de.danielnaber.languagetool.rules.Category;
+import de.danielnaber.languagetool.rules.IncorrectExample;
+import de.danielnaber.languagetool.rules.bitext.IncorrectBitextExample;
+import de.danielnaber.languagetool.rules.patterns.Element;
+import de.danielnaber.languagetool.rules.patterns.Match;
+import de.danielnaber.languagetool.rules.patterns.PatternRule;
+
+/**
+ * Loads {@link BitextPatternRule}s from an XML file.
+ *
+ * @author Marcin Miłkowski
+ */
+public class BitextPatternRuleLoader extends DefaultHandler {
+
+  /**
+   * Parses the given stream with a SAX parser and returns the bitext rules
+   * collected by the handler. The stream is not closed by this method.
+   *
+   * @param is stream providing the rule XML
+   * @param filename name used in the error message only
+   * @return the rules found in the stream
+   * @throws IOException if anything goes wrong while setting up the parser
+   *           or while parsing; the original exception is set as the cause
+   */
+  public final List<BitextPatternRule> getRules(final InputStream is,
+      final String filename) throws IOException {
+    try {
+      final PatternRuleHandler handler = new PatternRuleHandler();
+      final SAXParser saxParser = SAXParserFactory.newInstance().newSAXParser();
+      /* saxParser.getXMLReader().setFeature(
+      "http://apache.org/xml/features/nonvalidating/load-external-dtd",
+      false);
+      */
+      saxParser.parse(is, handler);
+      return handler.getBitextRules();
+    } catch (final Exception cause) {
+      final IOException wrapped = new IOException("Cannot load or parse '"
+          + filename + "'");
+      wrapped.initCause(cause);
+      throw wrapped;
+    }
+  }
+
+}
+
+class PatternRuleHandler extends BitextXMLRuleHandler {
+
+ // Running number of the current rule inside its rule group; reset to 0
+ // when a rule group starts and incremented for each rule in the group.
+ private int subId;
+
+ // Whether the "default" attribute of the rule or rule group is "off" / "on".
+ private boolean defaultOff;
+ private boolean defaultOn;
+
+ // Category, description and rule-group description taken from the
+ // "category", "rule" and "rulegroup" elements.
+ private Category category;
+ private String description;
+ private String ruleGroupDescription;
+
+ // Rules built when the "source" and "target" elements end.
+ private PatternRule srcRule;
+ private PatternRule trgRule;
+
+ // Examples built when the "trgExample" and "srcExample" elements end.
+ private IncorrectExample trgExample;
+ private IncorrectExample srcExample;
+
+ // Language looked up from the "lang" attribute of the "source" element.
+ private Language srcLang;
+
+ // ===========================================================
+ // SAX DocumentHandler methods
+ // ===========================================================
+
+ /**
+  * Handles the start of an XML element by updating the parsing state for
+  * the element in question (category, rules, rule, pattern/target, token,
+  * exception, example, message, rule group, match, phrase handling, ...).
+  * Elements that are not recognized are ignored.
+  *
+  * @param namespaceURI not used
+  * @param lName not used
+  * @param qName qualified name of the element
+  * @param attrs attributes of the element
+  * @throws SAXException if the target language of the rule file is unknown
+  *           or a delegated handler method rejects the attributes
+  */
+ @Override
+ public void startElement(final String namespaceURI, final String lName,
+     final String qName, final Attributes attrs) throws SAXException {
+   if (qName.equals("category")) {
+     final String catName = attrs.getValue("name");
+     final String priorityStr = attrs.getValue("priority");
+     if (priorityStr != null) {
+       category = new Category(catName, Integer.parseInt(priorityStr));
+     } else {
+       category = new Category(catName);
+     }
+
+     if ("off".equals(attrs.getValue(DEFAULT))) {
+       category.setDefaultOff();
+     }
+
+   } else if (qName.equals("rules")) {
+     final String languageStr = attrs.getValue("targetLang");
+     language = Language.getLanguageForShortName(languageStr);
+     if (language == null) {
+       throw new SAXException("Unknown language '" + languageStr + "'");
+     }
+   } else if (qName.equals("rule")) {
+     id = attrs.getValue("id");
+     if (inRuleGroup) {
+       subId++;
+     }
+     // A setting made on the rule group wins over the one on the rule.
+     if (!(inRuleGroup && defaultOff)) {
+       defaultOff = "off".equals(attrs.getValue(DEFAULT));
+     }
+
+     if (!(inRuleGroup && defaultOn)) {
+       defaultOn = "on".equals(attrs.getValue(DEFAULT));
+     }
+     // Id and description fall back to those of the rule group.
+     if (inRuleGroup && id == null) {
+       id = ruleGroupId;
+     }
+     description = attrs.getValue("name");
+     if (inRuleGroup && description == null) {
+       description = ruleGroupDescription;
+     }
+     correctExamples = new ArrayList<StringPair>();
+     incorrectExamples = new ArrayList<IncorrectBitextExample>();
+     if (suggestionMatches != null) {
+       suggestionMatches.clear();
+     }
+   } else if (PATTERN.equals(qName) || "target".equals(qName)) {
+     startPattern(attrs);
+   } else if (AND.equals(qName)) {
+     inAndGroup = true;
+   } else if (UNIFY.equals(qName)) {
+     inUnification = true;
+     uniNegation = YES.equals(attrs.getValue(NEGATE));
+   } else if (qName.equals("feature")) {
+     uFeature = attrs.getValue("id");
+   } else if (qName.equals(TYPE)) {
+     uType = attrs.getValue("id");
+     uTypeList.add(uType);
+   } else if (qName.equals(TOKEN)) {
+     setToken(attrs);
+   } else if (qName.equals(EXCEPTION)) {
+     setExceptions(attrs);
+   } else if (qName.equals(EXAMPLE)
+       && "correct".equals(attrs.getValue(TYPE))) {
+     // Constant-first comparison: an example without a type attribute must
+     // not cause a NullPointerException.
+     inCorrectExample = true;
+     correctExample = new StringBuilder();
+   } else if (EXAMPLE.equals(qName)
+       && "incorrect".equals(attrs.getValue(TYPE))) {
+     inIncorrectExample = true;
+     incorrectExample = new StringBuilder();
+     exampleCorrection = new StringBuilder();
+     if (attrs.getValue("correction") != null) {
+       exampleCorrection.append(attrs.getValue("correction"));
+     }
+   } else if (MESSAGE.equals(qName)) {
+     inMessage = true;
+     message = new StringBuilder();
+   } else if (qName.equals("short")) {
+     inShortMessage = true;
+     shortMessage = new StringBuilder();
+   } else if (qName.equals(RULEGROUP)) {
+     ruleGroupId = attrs.getValue("id");
+     ruleGroupDescription = attrs.getValue("name");
+     defaultOff = "off".equals(attrs.getValue(DEFAULT));
+     defaultOn = "on".equals(attrs.getValue(DEFAULT));
+     inRuleGroup = true;
+     subId = 0;
+   } else if (qName.equals("suggestion") && inMessage) {
+     message.append("<suggestion>");
+     inSuggestion = true;
+   } else if (qName.equals("match")) {
+     setMatchElement(attrs);
+   } else if (qName.equals(MARKER) && inCorrectExample) {
+     correctExample.append("<marker>");
+   } else if (qName.equals(MARKER) && inIncorrectExample) {
+     incorrectExample.append("<marker>");
+   } else if (qName.equals("unification")) {
+     uFeature = attrs.getValue("feature");
+     inUnificationDef = true;
+   } else if (qName.equals("equivalence")) {
+     uType = attrs.getValue(TYPE);
+   } else if (qName.equals("phrases")) {
+     inPhrases = true;
+   } else if (qName.equals("includephrases")) {
+     phraseElementInit();
+   } else if (qName.equals("phrase") && inPhrases) {
+     phraseId = attrs.getValue("id");
+   } else if (qName.equals("phraseref") && (attrs.getValue("idref") != null)) {
+     preparePhrase(attrs);
+   } else if (qName.equals("source")) {
+     srcLang = Language.getLanguageForShortName(attrs.getValue("lang"));
+   }
+ }
+
+ /**
+  * Handles the end of an XML element: builds the source and target rules,
+  * assembles the bitext rule when a rule ends, finishes tokens, exceptions,
+  * examples, messages and phrases, and resets the related state flags.
+  * Elements that are not recognized are ignored.
+  *
+  * @param namespaceURI not used
+  * @param sName not used
+  * @param qName qualified name of the element
+  * @throws SAXException if the mark positions of a pattern are invalid
+  */
+ @Override
+ public void endElement(final String namespaceURI, final String sName,
+     final String qName) throws SAXException {
+
+   if (qName.equals("source")) {
+     checkMarkPositions();
+     srcRule = finalizeRule();
+   } else if ("target".equals(qName)) {
+     checkMarkPositions();
+     trgRule = finalizeRule();
+     // startElement() opens a pattern for <target> via startPattern(), so
+     // its end has to close it again. Otherwise the token count would leak
+     // into the next pattern and weaken its position checks.
+     inPattern = false;
+     tokenCounter = 0;
+   } else if ("rule".equals(qName)) {
+     trgRule.setMessage(message.toString());
+     if (suggestionMatches != null) {
+       for (final Match m : suggestionMatches) {
+         trgRule.addSuggestionMatch(m);
+       }
+       // phraseElementList is created lazily and may not exist yet.
+       if (phraseElementList == null || phraseElementList.size() <= 1) {
+         suggestionMatches.clear();
+       }
+     }
+     final BitextPatternRule bRule = new BitextPatternRule(srcRule, trgRule);
+     bRule.setCorrectBitextExamples(correctExamples);
+     bRule.setIncorrectBitextExamples(incorrectExamples);
+     bRule.setSourceLang(srcLang);
+     rules.add(bRule);
+   } else if (qName.equals(EXCEPTION)) {
+     finalizeExceptions();
+   } else if (qName.equals(AND)) {
+     inAndGroup = false;
+     andGroupCounter = 0;
+     // the whole and-group counts as a single token
+     tokenCounter++;
+   } else if (qName.equals(TOKEN)) {
+     finalizeTokens();
+   } else if (qName.equals(PATTERN)) {
+     inPattern = false;
+     if (lastPhrase) {
+       elementList.clear();
+     }
+     if (phraseElementList == null || phraseElementList.isEmpty()) {
+       checkPositions(0);
+     } else {
+       for (final List<Element> phrase : phraseElementList) {
+         checkPositions(phrase.size());
+       }
+     }
+     tokenCounter = 0;
+   } else if (qName.equals("trgExample")) {
+     trgExample = setExample();
+   } else if (qName.equals("srcExample")) {
+     srcExample = setExample();
+   } else if (qName.equals("example")) {
+     if (inCorrectExample) {
+       correctExamples.add(new StringPair(srcExample.getExample(), trgExample.getExample()));
+     } else if (inIncorrectExample) {
+       if (trgExample.getCorrections() == null) {
+         incorrectExamples.add(
+             new IncorrectBitextExample(
+                 new StringPair(
+                     srcExample.getExample(), trgExample.getExample())
+             ));
+       } else {
+         final List<String> l = trgExample.getCorrections();
+         final String[] str = l.toArray(new String[l.size()]);
+         incorrectExamples.add(
+             new IncorrectBitextExample(
+                 new StringPair(srcExample.getExample(),
+                     trgExample.getExample()), str)
+         );
+       }
+     }
+     inCorrectExample = false;
+     inIncorrectExample = false;
+   } else if (qName.equals("message")) {
+     suggestionMatches = addLegacyMatches();
+     inMessage = false;
+   } else if (qName.equals("short")) {
+     inShortMessage = false;
+   } else if (qName.equals("match")) {
+     // the text inside the match element is its lemma
+     if (inMessage) {
+       suggestionMatches.get(suggestionMatches.size() - 1).setLemmaString(
+           match.toString());
+     } else if (inToken) {
+       tokenReference.setLemmaString(match.toString());
+     }
+     inMatch = false;
+   } else if (qName.equals("rulegroup")) {
+     inRuleGroup = false;
+   } else if (qName.equals("suggestion") && inMessage) {
+     message.append("</suggestion>");
+     inSuggestion = false;
+   } else if (qName.equals(MARKER) && inCorrectExample) {
+     correctExample.append("</marker>");
+   } else if (qName.equals(MARKER) && inIncorrectExample) {
+     incorrectExample.append("</marker>");
+   } else if (qName.equals("phrase") && inPhrases) {
+     finalizePhrase();
+   } else if (qName.equals("includephrases")) {
+     elementList.clear();
+   } else if (qName.equals("phrases") && inPhrases) {
+     inPhrases = false;
+   } else if (qName.equals("unification")) {
+     inUnificationDef = false;
+   } else if (qName.equals("feature")) {
+     equivalenceFeatures.put(uFeature, uTypeList);
+     uTypeList = new ArrayList<String>();
+   } else if (qName.equals("unify")) {
+     inUnification = false;
+     //clear the features...
+     equivalenceFeatures = new HashMap<String, List<String>>();
+   }
+ }
+
+ /**
+  * Builds an example object from the text buffers filled while reading a
+  * source or target example and then starts fresh buffers.
+  *
+  * @return the example built from the correct-example buffer, or from the
+  *         incorrect-example buffer (with corrections split at '|' if the
+  *         first one is not empty); {@code null} when the handler is in
+  *         neither kind of example
+  */
+ private IncorrectExample setExample() {
+   final IncorrectExample result;
+   if (inCorrectExample) {
+     result = new IncorrectExample(correctExample.toString());
+   } else if (inIncorrectExample) {
+     final String[] corrections = exampleCorrection.toString().split("\\|");
+     final boolean hasCorrection =
+         corrections.length > 0 && corrections[0].length() > 0;
+     final String text = incorrectExample.toString();
+     result = hasCorrection
+         ? new IncorrectExample(text, corrections)
+         : new IncorrectExample(text);
+   } else {
+     result = null;
+   }
+
+   // fresh buffers for the next example
+   correctExample = new StringBuilder();
+   incorrectExample = new StringBuilder();
+   exampleCorrection = new StringBuilder();
+   return result;
+ }
+
+ /**
+ * Builds a PatternRule from the elements collected so far and resets the
+ * element lists and the position corrections.
+ *
+ * Without pending phrase element lists one rule is built from elementList
+ * with an empty message. With phrase element lists one rule is built per
+ * list, but only the last one is returned.
+ *
+ * @return the rule built last
+ */
+ private PatternRule finalizeRule() {
+ PatternRule rule = null;
+ phraseElementInit();
+ if (phraseElementList.isEmpty()) {
+ rule = new PatternRule(id, language, elementList,
+ description, "", shortMessage.toString());
+ prepareRule(rule);
+ } else {
+ // Tokens that follow the last phrase reference are appended to every
+ // phrase variant.
+ if (!elementList.isEmpty()) {
+ for (final ArrayList<Element> ph : phraseElementList) {
+ ph.addAll(new ArrayList<Element>(elementList));
+ }
+ }
+
+ // NOTE(review): each iteration overwrites 'rule', so rules built for
+ // earlier phrase variants are discarded. prepareRule() also zeroes the
+ // position corrections, so only the first variant receives the values
+ // read from the XML - confirm whether this is intended.
+ // NOTE(review): 'message' is read here although the message element is
+ // presumably parsed after source/target - verify it cannot be null or
+ // stale at this point.
+ for (final ArrayList<Element> phraseElement : phraseElementList) {
+ processElement(phraseElement);
+ rule = new PatternRule(id, language, phraseElement,
+ description, message.toString(), shortMessage.toString(),
+ phraseElementList.size() > 1);
+ prepareRule(rule);
+ }
+ }
+ elementList.clear();
+ // phraseElementList cannot be null here (phraseElementInit() was called
+ // above); the check is redundant but harmless.
+ if (phraseElementList != null) {
+ phraseElementList.clear();
+ }
+ startPositionCorrection = 0;
+ endPositionCorrection = 0;
+ return rule;
+ }
+ private void prepareRule(final PatternRule rule) {
+ rule.setStartPositionCorrection(startPositionCorrection);
+ rule.setEndPositionCorrection(endPositionCorrection);
+ startPositionCorrection = 0;
+ endPositionCorrection = 0;
+ rule.setCategory(category);
+ if (inRuleGroup)
+ rule.setSubId(Integer.toString(subId));
+ else
+ rule.setSubId("1");
+ caseSensitive = false;
+ if (defaultOff) {
+ rule.setDefaultOff();
+ }
+
+ if (category.isDefaultOff() && !defaultOn) {
+ rule.setDefaultOff();
+ }
+
+ }
+
+ /**
+  * Appends character data to the buffer that belongs to the innermost
+  * element currently being read. The first state that applies wins, in this
+  * order: exception, token, correct example, incorrect example, match,
+  * message, short message. Text outside of these is dropped.
+  *
+  * @param buf character buffer
+  * @param offset start of the text in the buffer
+  * @param len length of the text
+  */
+ @Override
+ public void characters(final char[] buf, final int offset, final int len) {
+   final String text = new String(buf, offset, len);
+   final StringBuilder target;
+   if (inException) {
+     target = exceptions;
+   } else if (inToken) {
+     target = elements;
+   } else if (inCorrectExample) {
+     target = correctExample;
+   } else if (inIncorrectExample) {
+     target = incorrectExample;
+   } else if (inMatch) {
+     target = match;
+   } else if (inMessage) {
+     target = message;
+   } else if (inShortMessage) {
+     target = shortMessage;
+   } else {
+     return;
+   }
+   target.append(text);
+ }
+
+}
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/bitext/BitextXMLRuleHandler.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/bitext/BitextXMLRuleHandler.java
new file mode 100644
index 0000000..02f5a04
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/bitext/BitextXMLRuleHandler.java
@@ -0,0 +1,56 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2006 Daniel Naber (http://www.danielnaber.de)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+package de.danielnaber.languagetool.rules.patterns.bitext;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.xml.sax.SAXException;
+import org.xml.sax.SAXParseException;
+
+import de.danielnaber.languagetool.bitext.StringPair;
+import de.danielnaber.languagetool.rules.bitext.IncorrectBitextExample;
+import de.danielnaber.languagetool.rules.patterns.XMLRuleHandler;
+
+/**
+ * XML rule handler that loads rules from XML and throws
+ * exceptions on errors and warnings. In addition to what it inherits from
+ * XMLRuleHandler it holds the bitext rules and the bitext examples collected
+ * while parsing.
+ *
+ * @author Daniel Naber
+ */
+class BitextXMLRuleHandler extends XMLRuleHandler {
+
+ // Bitext rules collected so far.
+ // NOTE(review): the superclass appears to have its own "rules" field (see
+ // its getRules()); this field would hide it - confirm this is intended.
+ List<BitextPatternRule> rules = new ArrayList<BitextPatternRule>();
+
+ // Examples of the rule that is currently being read.
+ List<StringPair> correctExamples = new ArrayList<StringPair>();
+ List<IncorrectBitextExample> incorrectExamples = new ArrayList<IncorrectBitextExample>();
+
+ /**
+ * Returns the list of bitext rules collected by this handler. The list
+ * itself is returned, not a copy.
+ */
+ List<BitextPatternRule> getBitextRules() {
+ return rules;
+ }
+
+ /**
+ * Rethrows the given exception unchanged, so a warning aborts parsing.
+ *
+ * @throws SAXException always; the thrown object is {@code e} itself
+ */
+ public void warning (final SAXParseException e) throws SAXException {
+ throw e;
+ }
+
+ /**
+ * Rethrows the given exception unchanged, so an error aborts parsing.
+ *
+ * @throws SAXException always; the thrown object is {@code e} itself
+ */
+ public void error (final SAXParseException e) throws SAXException {
+ throw e;
+ }
+
+}
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/bitext/FalseFriendsAsBitextLoader.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/bitext/FalseFriendsAsBitextLoader.java
new file mode 100644
index 0000000..87c30a5
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/bitext/FalseFriendsAsBitextLoader.java
@@ -0,0 +1,72 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2010 Marcin Miłkowski (www.languagetool.org)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+
+package de.danielnaber.languagetool.rules.patterns.bitext;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+
+import javax.xml.parsers.ParserConfigurationException;
+
+import org.xml.sax.SAXException;
+
+import de.danielnaber.languagetool.Language;
+import de.danielnaber.languagetool.rules.patterns.FalseFriendRuleLoader;
+import de.danielnaber.languagetool.rules.patterns.PatternRule;
+
+/**
+ * Loads the false friend rules as bitext pattern rules. Note that the resulting
+ * rules have suggestions that are not really customizable, in contradistinction
+ * to the 'real' bitext pattern rules.
+ *
+ * @author Marcin Miłkowski
+ *
+ */
+public class FalseFriendsAsBitextLoader {
+
+  /**
+   * Loads the false friend rules twice, once for each direction of the
+   * language pair, and combines rules with the same id into bitext rules.
+   * The rule loaded for (motherTongue, language) becomes the source rule, the
+   * one loaded for (language, motherTongue) the target rule. Rules whose id
+   * exists in only one direction are dropped.
+   *
+   * @param filename resource name of the false friend rule file, resolved
+   *          relative to this class
+   * @param motherTongue language used as source language of the bitext rules
+   * @param language the other language of the pair
+   * @return the combined bitext rules, possibly empty
+   */
+  public List<BitextPatternRule> getFalseFriendsAsBitext(final String filename,
+      final Language motherTongue, final Language language) throws ParserConfigurationException, SAXException, IOException {
+    final FalseFriendRuleLoader ruleLoader = new FalseFriendRuleLoader();
+    final List<BitextPatternRule> bRules = new ArrayList<BitextPatternRule>();
+    final List<PatternRule> rules1 =
+        loadRules(ruleLoader, filename, motherTongue, language);
+    final List<PatternRule> rules2 =
+        loadRules(ruleLoader, filename, language, motherTongue);
+    final HashMap<String, PatternRule> srcRules = new HashMap<String, PatternRule>();
+    for (final PatternRule rule : rules1) {
+      srcRules.put(rule.getId(), rule);
+    }
+    for (final PatternRule rule : rules2) {
+      final PatternRule srcRule = srcRules.get(rule.getId());
+      if (srcRule != null) {
+        final BitextPatternRule bRule = new BitextPatternRule(srcRule, rule);
+        bRule.setSourceLang(motherTongue);
+        bRule.setCategory(rule.getCategory());
+        bRules.add(bRule);
+      }
+    }
+    return bRules;
+  }
+
+  /**
+   * Opens the rule file, loads the rules for one direction and always closes
+   * the stream again, so no resource stream is left open.
+   */
+  private List<PatternRule> loadRules(final FalseFriendRuleLoader ruleLoader,
+      final String filename, final Language first, final Language second)
+      throws ParserConfigurationException, SAXException, IOException {
+    final java.io.InputStream stream =
+        this.getClass().getResourceAsStream(filename);
+    try {
+      return ruleLoader.getRules(stream, first, second);
+    } finally {
+      // getResourceAsStream() returns null for a missing resource; the
+      // loader is left to report that case as before.
+      if (stream != null) {
+        stream.close();
+      }
+    }
+  }
+
+}
+
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/pl/CompoundRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/pl/CompoundRule.java
new file mode 100644
index 0000000..6d2ff17
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/pl/CompoundRule.java
@@ -0,0 +1,55 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2006 Daniel Naber (http://www.danielnaber.de)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+package de.danielnaber.languagetool.rules.pl;
+
+import java.io.IOException;
+import java.util.ResourceBundle;
+
+import de.danielnaber.languagetool.JLanguageTool;
+import de.danielnaber.languagetool.rules.AbstractCompoundRule;
+
+/**
+ * Polish compound rule: checks that compounds (if in the list) are not
+ * written as separate words.
+ *
+ * @author Marcin Miłkowski, based on code by Daniel Naber
+ */
+public final class CompoundRule extends AbstractCompoundRule {
+
+  /** Name of the compound list handed to the data broker. */
+  private static final String COMPOUNDS_PATH = "/pl/compounds.txt";
+
+  /**
+   * Loads the Polish compound list (read as UTF-8) and sets the Polish short
+   * description and the three messages of the rule.
+   *
+   * @param messages resource bundle passed on to the superclass
+   * @throws IOException declared for the superclass constructor and the
+   *           loading of the compound file
+   */
+  public CompoundRule(final ResourceBundle messages) throws IOException {
+    super(messages);
+    loadCompoundFile(
+        JLanguageTool.getDataBroker().getFromResourceDirAsStream(COMPOUNDS_PATH),
+        "UTF-8");
+    // "Missing hyphen or superfluous hyphen"
+    setShort("Brak łącznika lub zbędny łącznik");
+    // "written with a hyphen" / "written as one word" / "with or without a hyphen"
+    setMsg("Ten wyraz pisze się z łącznikiem.",
+        "Ten wyraz pisze się razem (bez spacji ani łącznika).",
+        "Ten wyraz pisze się z łącznikiem lub bez niego.");
+  }
+
+  /**
+   * @return the fixed id of this rule
+   */
+  public String getId() {
+    return "PL_COMPOUNDS";
+  }
+
+  /**
+   * @return the Polish description of this rule
+   */
+  public String getDescription() {
+    return "Sprawdza wyrazy z łącznikiem, np. „łapu capu” zamiast „łapu-capu”";
+  }
+
+}
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/pl/PolishRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/pl/PolishRule.java
new file mode 100644
index 0000000..0a6f01b
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/pl/PolishRule.java
@@ -0,0 +1,31 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+package de.danielnaber.languagetool.rules.pl;
+
+import de.danielnaber.languagetool.rules.Rule;
+
+/**
+ * Abstract base class for Polish rules. It declares no members of its own
+ * and only provides a common supertype below {@link Rule}.
+ *
+ * @author Marcin Miłkowski
+ */
+public abstract class PolishRule extends Rule {
+
+}
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/pl/PolishUnpairedBracketsRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/pl/PolishUnpairedBracketsRule.java
new file mode 100644
index 0000000..3b83133
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/pl/PolishUnpairedBracketsRule.java
@@ -0,0 +1,42 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2010 Marcin Miłkowski (http://www.languagetool.org)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+
+package de.danielnaber.languagetool.rules.pl;
+
+import java.util.ResourceBundle;
+
+import de.danielnaber.languagetool.Language;
+import de.danielnaber.languagetool.rules.GenericUnpairedBracketsRule;
+
+/**
+ * Unpaired brackets rule with the bracket and quotation mark symbols used
+ * for Polish. The two symbol arrays are presumably paired by index
+ * (TODO confirm against GenericUnpairedBracketsRule); note that
+ * <code>»</code> is listed as a start symbol and <code>«</code> as an end symbol.
+ */
+public class PolishUnpairedBracketsRule extends GenericUnpairedBracketsRule {
+
+  private static final String[] OPENING = { "[", "(", "{", "„", "»", "\"" };
+  private static final String[] CLOSING = { "]", ")", "}", "”", "«", "\"" };
+
+  /**
+   * Replaces the symbol tables of the generic rule by the Polish ones.
+   *
+   * @param messages resource bundle passed on to the superclass
+   * @param language language passed on to the superclass
+   */
+  public PolishUnpairedBracketsRule(final ResourceBundle messages,
+      final Language language) {
+    super(messages, language);
+    endSymbols = CLOSING;
+    startSymbols = OPENING;
+  }
+
+  /**
+   * @return the fixed id of this rule
+   */
+  public String getId() {
+    return "PL_UNPAIRED_BRACKETS";
+  }
+}
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/pl/PolishWordRepeatRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/pl/PolishWordRepeatRule.java
new file mode 100644
index 0000000..a7dbb5e
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/pl/PolishWordRepeatRule.java
@@ -0,0 +1,200 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+package de.danielnaber.languagetool.rules.pl;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.ResourceBundle;
+import java.util.TreeSet;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import de.danielnaber.languagetool.AnalyzedSentence;
+import de.danielnaber.languagetool.AnalyzedTokenReadings;
+import de.danielnaber.languagetool.rules.Category;
+import de.danielnaber.languagetool.rules.RuleMatch;
+import de.danielnaber.languagetool.tools.StringTools;
+
+/**
+ * Rule for detecting the same word occurring more than once in a sentence,
+ * not only directly in a row. Words are compared by lemma when every reading
+ * of the token has one, and by surface form otherwise. The rule is switched
+ * off by default.
+ *
+ * @author Marcin Miłkowski
+ */
+public class PolishWordRepeatRule extends PolishRule {
+
+  /**
+   * Excluded dictionary words (matched against the lemma).
+   */
+  private static final Pattern EXC_WORDS = Pattern
+      .compile("nie|tuż|aż|to|siebie|być|ani|ni|albo|"
+          + "lub|czy|bądź|jako|zł|np|coraz"
+          + "|bardzo|bardziej|proc|ten|jak|mln|tys|swój|mój|"
+          + "twój|nasz|wasz|i|zbyt");
+
+  /**
+   * Excluded part of speech classes.
+   */
+  private static final Pattern EXC_POS = Pattern.compile("prep:.*|ppron.*");
+
+  /**
+   * Excluded non-words (special symbols, Roman numerals etc.), matched
+   * against the token text.
+   */
+  private static final Pattern EXC_NONWORDS = Pattern
+      .compile("&quot|&gt|&lt|&amp|[0-9].*|"
+          + "M*(D?C{0,3}|C[DM])(L?X{0,3}|X[LC])(V?I{0,3}|I[VX])$");
+
+  /**
+   * @param messages bundle providing the <code>category_misc</code> category
+   *          name; when <code>null</code> no category is set
+   */
+  public PolishWordRepeatRule(final ResourceBundle messages) {
+    if (messages != null) {
+      super.setCategory(new Category(messages.getString("category_misc")));
+    }
+    setDefaultOff();
+  }
+
+  /*
+   * (non-Javadoc)
+   *
+   * @see de.danielnaber.languagetool.rules.Rule#getId()
+   */
+  @Override
+  public final String getId() {
+    return "PL_WORD_REPEAT";
+  }
+
+  /*
+   * (non-Javadoc)
+   *
+   * @see de.danielnaber.languagetool.rules.Rule#getDescription()
+   */
+  @Override
+  public final String getDescription() {
+    return "Powtórzenia wyrazów w zdaniu (monotonia stylistyczna)";
+  }
+
+  /**
+   * Tests if any word form is repeated in the sentence.
+   *
+   * @param text the analyzed sentence to check
+   * @return one match per token that repeats a word seen earlier in the
+   *         same sentence
+   */
+  @Override
+  public final RuleMatch[] match(final AnalyzedSentence text) {
+    final List<RuleMatch> ruleMatches = new ArrayList<RuleMatch>();
+    final AnalyzedTokenReadings[] tokens = text.getTokensWithoutWhitespace();
+    // lemmas (or surface forms of tokens without lemma) seen so far
+    final TreeSet<String> inflectedWords = new TreeSet<String>();
+    // start from real token, 0 = SENT_START
+    for (int i = 1; i < tokens.length; i++) {
+      final String token = tokens[i].getToken();
+      // avoid "..." etc. to be matched:
+      boolean isWord = token.length() >= 2;
+      boolean hasLemma = true;
+
+      final int readingsLen = tokens[i].getReadingsLength();
+      for (int k = 0; k < readingsLen; k++) {
+        final String posTag = tokens[i].getAnalyzedToken(k).getPOSTag();
+        if (posTag == null) {
+          hasLemma = false;
+          continue;
+        }
+        if (StringTools.isEmpty(posTag)) {
+          isWord = false;
+          break;
+        }
+        // FIXME: too many false alarms here:
+        final String lemma = tokens[i].getAnalyzedToken(k).getLemma();
+        if (lemma == null) {
+          hasLemma = false;
+          break;
+        }
+        if (EXC_WORDS.matcher(lemma).matches()
+            || EXC_POS.matcher(posTag).matches()) {
+          isWord = false;
+          break;
+        }
+      }
+
+      if (EXC_NONWORDS.matcher(token).matches()) {
+        isWord = false;
+      }
+      if (!isWord) {
+        continue;
+      }
+
+      // hasLemma is still true here only if every reading has a POS tag and a lemma
+      final boolean repetition = hasLemma
+          ? isLemmaRepeated(tokens[i], inflectedWords)
+          : isSurfaceFormRepeated(tokens[i], inflectedWords);
+
+      if (repetition) {
+        final String msg = "Powtórzony wyraz w zdaniu";
+        final int pos = tokens[i].getStartPos();
+        final RuleMatch ruleMatch = new RuleMatch(this, pos, pos
+            + token.length(), msg, "Powtórzenie wyrazu");
+        ruleMatches.add(ruleMatch);
+      }
+    }
+    return toRuleMatchArray(ruleMatches);
+  }
+
+  /**
+   * Registers the lemmas of the token and reports whether one of them was
+   * already registered by an earlier token. Readings from the first SENT_END
+   * reading onwards are ignored. Each distinct lemma of the token is looked
+   * at only once, so readings such as A, B, A can no longer make a token
+   * count as a repetition of itself; empty lemmas are skipped.
+   */
+  private boolean isLemmaRepeated(final AnalyzedTokenReadings token,
+      final TreeSet<String> seenWords) {
+    boolean repeated = false;
+    final TreeSet<String> ownLemmas = new TreeSet<String>();
+    final int readingsLen = token.getReadingsLength();
+    for (int j = 0; j < readingsLen; j++) {
+      if ("SENT_END".equals(token.getAnalyzedToken(j).getPOSTag())) {
+        break;
+      }
+      final String lemma = token.getAnalyzedToken(j).getLemma();
+      if (lemma.length() == 0 || !ownLemmas.add(lemma)) {
+        continue;
+      }
+      if (!seenWords.add(lemma)) {
+        repeated = true;
+      }
+    }
+    return repeated;
+  }
+
+  /**
+   * Registers the surface form of a token without usable lemma and reports
+   * whether it was already registered by an earlier token. The form is
+   * tested exactly once per token: testing it once per reading made every
+   * such token with several readings match the entry it had just added.
+   * No repetition is reported when the first reading is SENT_END.
+   */
+  private boolean isSurfaceFormRepeated(final AnalyzedTokenReadings token,
+      final TreeSet<String> seenWords) {
+    if (token.getReadingsLength() == 0) {
+      return false;
+    }
+    final boolean sentEnd =
+        "SENT_END".equals(token.getAnalyzedToken(0).getPOSTag());
+    final boolean alreadySeen = !seenWords.add(token.getToken());
+    return alreadySeen && !sentEnd;
+  }
+
+  /*
+   * (non-Javadoc)
+   *
+   * @see de.danielnaber.languagetool.rules.Rule#reset()
+   */
+  @Override
+  public void reset() {
+    // nothing
+  }
+
+}
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/pl/SimpleReplaceRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/pl/SimpleReplaceRule.java
new file mode 100644
index 0000000..90708d9
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/pl/SimpleReplaceRule.java
@@ -0,0 +1,82 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+package de.danielnaber.languagetool.rules.pl;
+
+import java.io.IOException;
+import java.util.Locale;
+import java.util.ResourceBundle;
+
+import de.danielnaber.languagetool.rules.AbstractSimpleReplaceRule;
+
+/**
+ * Polish implementation of the simple replace rule: matches words or phrases
+ * which should not be used and suggests correct ones instead. The list of
+ * words is loaded from <code>rules/pl/replace.txt</code>.
+ *
+ * @author Marcin Miłkowski
+ */
+public class SimpleReplaceRule extends AbstractSimpleReplaceRule {
+
+  /** Rule id returned by {@link #getId()}. */
+  public static final String POLISH_SIMPLE_REPLACE_RULE = "PL_SIMPLE_REPLACE";
+
+  private static final String FILE_NAME = "/pl/replace.txt";
+  // locale used on case-conversion
+  private static final Locale PL_LOCALE = new Locale("pl");
+
+  /**
+   * @param messages resource bundle passed on to the superclass
+   * @throws IOException declared for the superclass constructor
+   */
+  public SimpleReplaceRule(final ResourceBundle messages) throws IOException {
+    super(messages);
+  }
+
+  /**
+   * @return the name of the Polish replacement list
+   */
+  public final String getFileName() {
+    return FILE_NAME;
+  }
+
+  /**
+   * @return {@link #POLISH_SIMPLE_REPLACE_RULE}
+   */
+  public final String getId() {
+    return POLISH_SIMPLE_REPLACE_RULE;
+  }
+
+  /**
+   * @return the Polish description ("typical typos")
+   */
+  public String getDescription() {
+    return "Typowe literówki";
+  }
+
+  /**
+   * @return the Polish short message ("typo")
+   */
+  public String getShort() {
+    return "Literówka";
+  }
+
+  /**
+   * @return the Polish text fragment " is a typical typo, correct: "
+   */
+  public String getSuggestion() {
+    return " to typowa literówka, poprawnie: ";
+  }
+
+  /**
+   * @return <code>false</code>, i.e. case-insensitive matching is used
+   */
+  public boolean isCaseSensitive() {
+    return false;
+  }
+
+  /**
+   * @return the Polish locale, used on case-conversion
+   */
+  public Locale getLocale() {
+    return PL_LOCALE;
+  }
+
+}
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/ro/CompoundRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/ro/CompoundRule.java
new file mode 100644
index 0000000..bb9dea8
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/ro/CompoundRule.java
@@ -0,0 +1,58 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2006 Daniel Naber (http://www.danielnaber.de)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+package de.danielnaber.languagetool.rules.ro;
+
+import java.io.IOException;
+import java.util.ResourceBundle;
+
+import de.danielnaber.languagetool.JLanguageTool;
+import de.danielnaber.languagetool.rules.AbstractCompoundRule;
+
+/**
+ * Checks that compounds (if in the list) are not written as separate words.
+ * Romanian implementation; the compound list is read from
+ * <code>/ro/compounds.txt</code>.
+ *
+ * @author Ionuț Păduraru, based on code by Daniel Naber
+ */
+public class CompoundRule extends AbstractCompoundRule {
+
+ /** Rule id returned by {@link #getId()}. */
+ public static final String ROMANIAN_COMPOUND_RULE = "RO_COMPOUND";
+ // name of the compound list as handed to the data broker
+ private static final String FILE_NAME = "/ro/compounds.txt";
+
+ /**
+ * Loads the compound list (read as UTF-8), sets the Romanian messages and
+ * adjusts two settings of the superclass for Romanian.
+ *
+ * @param messages resource bundle passed on to the superclass
+ * @throws IOException declared for the superclass constructor and the
+ * loading of the compound file
+ */
+ public CompoundRule(final ResourceBundle messages) throws IOException {
+ super(messages);
+ // NOTE(review): this list is fetched with getFromRulesDirAsStream, while the
+ // Polish and Russian compound rules of this change use
+ // getFromResourceDirAsStream - confirm the rules directory is intended
+ loadCompoundFile(JLanguageTool.getDataBroker().getFromRulesDirAsStream(FILE_NAME), "UTF-8");
+ // "Spelling problem (hyphen, space, etc.)"
+ super.setShort("Problemă de scriere (cratimă, spațiu, etc.)");
+ // "written with a hyphen" / "written as one word" / "as one word or with a hyphen"
+ super.setMsg("Cuvântul se scrie cu cratimă.",
+ "Cuvântul se scrie legat.",
+ "Cuvântul se scrie legat sau cu cratimă.");
+ // default value (2) is not ok for Romanian, so the limit is effectively removed
+ setMaxUnHyphenatedWordCount(Integer.MAX_VALUE);
+ // there are words that should not be written with hyphen but as one word
+ setHyphenIgnored(false);
+ }
+
+ /**
+ * @return {@link #ROMANIAN_COMPOUND_RULE}
+ */
+ public String getId() {
+ return ROMANIAN_COMPOUND_RULE;
+ }
+
+ /**
+ * @return the Romanian description of this rule
+ */
+ public String getDescription() {
+ return "Greșeală de scriere (cuvinte scrise legat sau cu cratimă)";
+ }
+
+}
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/ro/SimpleReplaceRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/ro/SimpleReplaceRule.java
new file mode 100644
index 0000000..9e96513
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/ro/SimpleReplaceRule.java
@@ -0,0 +1,264 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+package de.danielnaber.languagetool.rules.ro;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+import java.util.Queue;
+import java.util.ResourceBundle;
+import java.util.concurrent.ArrayBlockingQueue;
+
+import de.danielnaber.languagetool.AnalyzedSentence;
+import de.danielnaber.languagetool.AnalyzedTokenReadings;
+import de.danielnaber.languagetool.JLanguageTool;
+import de.danielnaber.languagetool.Language;
+import de.danielnaber.languagetool.rules.AbstractSimpleReplaceRule;
+import de.danielnaber.languagetool.rules.Category;
+import de.danielnaber.languagetool.rules.Rule;
+import de.danielnaber.languagetool.rules.RuleMatch;
+import de.danielnaber.languagetool.tokenizers.Tokenizer;
+import de.danielnaber.languagetool.tools.StringTools;
+
+/**
+ * A rule that matches words which should not be used and suggests correct ones instead. <br/>
+ * Romanian implementations. Loads the list of words from
+ * <code>/ro/replace.txt</code>.<br/><br/>
+ *
+ * Unlike AbstractSimpleReplaceRule, supports multiple words (Ex: "aqua forte" => "acvaforte").<br/><br/>
+ *
+ * Note: Merge this into {@link AbstractSimpleReplaceRule} eventually and simply extend from AbstractSimpleReplaceRule.<br/>
+ *
+ * @author Ionuț Păduraru
+ * @version $Id$
+ *
+ */
+public class SimpleReplaceRule extends Rule {
+
+ public static final String ROMANIAN_SIMPLE_REPLACE_RULE = "RO_SIMPLE_REPLACE";
+
+ private static final String FILE_NAME = "/ro/replace.txt";
+ private static final String FILE_ENCODING = "utf-8";
+ // locale used on case-conversion
+ private static Locale roLocale = new Locale("ro");
+
+ // list of maps containing error-corrections pairs.
+ // the n-th map contains key strings of (n+1) words
+ private List<Map<String, String>> wrongWords;
+
+ public final String getFileName() {
+ return FILE_NAME;
+ }
+
+ public SimpleReplaceRule(final ResourceBundle messages) throws IOException {
+ super(messages);
+ if (messages != null) {
+ super.setCategory(new Category(messages.getString("category_misc")));
+ }
+ wrongWords = loadWords(JLanguageTool.getDataBroker().getFromRulesDirAsStream(getFileName()));
+ }
+
+ public final String getId() {
+ return ROMANIAN_SIMPLE_REPLACE_RULE;
+ }
+
+ public String getDescription() {
+ return "Cuvinte sau grupuri de cuvinte incorecte sau ieșite din uz";
+ }
+
+ public String getShort() {
+ return "Cuvânt incorect sau ieșit din uz";
+ }
+
+ public String getSuggestion() {
+ return " este incorect sau ieșit din uz, folosiți ";
+ }
+
+ /**
+ * @return the word used to separate multiple suggestions; used only before last suggestion, the rest are comma-separated.
+ */
+ public String getSuggestionsSeparator() {
+ return " sau ";
+ }
+
+ /**
+ * use case-insensitive matching.
+ */
+ public boolean isCaseSensitive() {
+ return false;
+ }
+
+ /**
+ * locale used on case-conversion
+ */
+ public Locale getLocale() {
+ return roLocale;
+ }
+
+ public String getEncoding() {
+ return FILE_ENCODING;
+ }
+
+ /**
+ * @return the word tokenizer used for tokenization on loading words.
+ */
+ protected Tokenizer getWordTokenizer() {
+ return Language.ROMANIAN.getWordTokenizer();
+ }
+
+ /**
+ * @return the list of wrong words for which this rule can suggest correction. The list cannot be modified.
+ */
+ public List<Map<String, String>> getWrongWords() {
+ return wrongWords;
+ }
+
+ /**
+ * Load the list of words. <br/>
+ * Same as {@link AbstractSimpleReplaceRule#loadWords} but allows multiple words.
+ * @param file the file to load.
+ * @return the list of maps containing the error-corrections pairs. <br/>The n-th map contains key strings of (n+1) words.
+ * @throws IOException when the file contains errors.
+ * @see #getWordTokenizer
+ */
+ private List<Map<String, String>> loadWords(final InputStream file)
+ throws IOException {
+ final List<Map<String, String>> list = new ArrayList<Map<String, String>>();
+ InputStreamReader isr = null;
+ BufferedReader br = null;
+ try {
+ isr = new InputStreamReader(file, getEncoding());
+ br = new BufferedReader(isr);
+ String line;
+
+ while ((line = br.readLine()) != null) {
+ line = line.trim();
+ if (line.length() < 1 || line.charAt(0) == '#') { // ignore comments
+ continue;
+ }
+ final String[] parts = line.split("=");
+ if (parts.length != 2) {
+ throw new IOException("Format error in file "
+ + JLanguageTool.getDataBroker().getFromRulesDirAsUrl(getFileName())
+ + ", line: " + line);
+ }
+ final String[] wrongForms = parts[0].split("\\|"); // multiple incorect forms
+ for (String wrongForm : wrongForms) {
+ int wordCount = 0;
+ final List<String> tokens = getWordTokenizer().tokenize(wrongForm);
+ for (String token : tokens) {
+ if (!StringTools.isWhitespace(token)) {
+ wordCount++;
+ }
+ }
+ // grow if necessary
+ for (int i = list.size(); i < wordCount; i++) {
+ list.add(new HashMap<String, String>());
+ }
+ list.get(wordCount - 1).put(wrongForm, parts[1]);
+ }
+ }
+
+ } finally {
+ if (br != null) {
+ br.close();
+ }
+ if (isr != null) {
+ isr.close();
+ }
+ }
+ // seal the result (prevent modification from outside this class)
+ List<Map<String,String>> result = new ArrayList<Map<String, String>>();
+ for (Map<String, String> map : list) {
+ result.add(Collections.unmodifiableMap(map));
+ }
+ return Collections.unmodifiableList(result);
+ }
+
+ private void addToQueue(AnalyzedTokenReadings token,
+ Queue<AnalyzedTokenReadings> prevTokens) {
+ final boolean inserted = prevTokens.offer(token);
+ if (!inserted) {
+ prevTokens.poll();
+ prevTokens.offer(token);
+ }
+ }
+
+ public RuleMatch[] match(final AnalyzedSentence text) {
+ final List<RuleMatch> ruleMatches = new ArrayList<RuleMatch>();
+ final AnalyzedTokenReadings[] tokens = text
+ .getTokensWithoutWhitespace();
+
+ final Queue<AnalyzedTokenReadings> prevTokens = new ArrayBlockingQueue<AnalyzedTokenReadings>(wrongWords.size());
+
+ for (int i = 1; i < tokens.length; i++) {
+ addToQueue(tokens[i], prevTokens);
+ final StringBuilder sb = new StringBuilder();
+ final ArrayList<String> variants = new ArrayList<String>();
+ final List<AnalyzedTokenReadings> prevTokensList = Arrays.asList(prevTokens.toArray(new AnalyzedTokenReadings[] {}));
+ for (int j = prevTokensList.size() - 1; j >= 0; j--) {
+ if (j != prevTokensList.size() - 1 && prevTokensList.get(j + 1).isWhitespaceBefore())
+ sb.insert(0, " ");
+ sb.insert(0, prevTokensList.get(j).getToken());
+ variants.add(0, sb.toString());
+ }
+ final int len = variants.size(); // prevTokensList and variants have now the same length
+ for (int j = 0; j < len; j++) { // longest words first
+ final String crt = variants.get(j);
+ final int crtWordCount = len - j;
+ final String crtMatch = isCaseSensitive() ? wrongWords.get(crtWordCount - 1).get(crt) : wrongWords.get(crtWordCount- 1).get(crt.toLowerCase(getLocale()));
+ if (crtMatch != null) {
+ final List<String> replacements = Arrays.asList(crtMatch.split("\\|"));
+ String msg = crt + getSuggestion();
+ for (int k = 0; k < replacements.size(); k++) {
+ if (k > 0) {
+ msg = msg + (k == replacements.size() - 1 ? getSuggestionsSeparator(): ", ");
+ }
+ msg += "<suggestion>" + replacements.get(k) + "</suggestion>";
+ }
+ final int startPos = prevTokensList.get(len - crtWordCount).getStartPos();
+ final int endPos = prevTokensList.get(len - 1).getStartPos() + prevTokensList.get(len - 1).getToken().length();
+ final RuleMatch potentialRuleMatch = new RuleMatch(this, startPos, endPos, msg, getShort());
+
+ if (!isCaseSensitive() && StringTools.startsWithUppercase(crt)) {
+ for (int k = 0; k < replacements.size(); k++) {
+ replacements.set(k, StringTools.uppercaseFirstChar(replacements.get(k)));
+ }
+ }
+ potentialRuleMatch.setSuggestedReplacements(replacements);
+ ruleMatches.add(potentialRuleMatch);
+ break;
+ }
+ }
+ }
+ return toRuleMatchArray(ruleMatches);
+ }
+
+ public void reset() {
+ }
+
+}
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/ru/RuSimpleReplaceRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/ru/RuSimpleReplaceRule.java
new file mode 100644
index 0000000..4076a9c
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/ru/RuSimpleReplaceRule.java
@@ -0,0 +1,80 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+package de.danielnaber.languagetool.rules.ru;
+
+import java.io.IOException;
+import java.util.Locale;
+import java.util.ResourceBundle;
+
+import de.danielnaber.languagetool.rules.AbstractSimpleReplaceRule;
+
+/**
+ * Russian implementation of the simple replace rule: matches words or
+ * phrases which should not be used and suggests correct ones instead. The
+ * relevant words are loaded from <code>rules/ru/replace.txt</code>.
+ *
+ * @author Yakov Reztsov
+ */
+public class RuSimpleReplaceRule extends AbstractSimpleReplaceRule {
+
+  private static final String FILE_NAME = "/ru/replace.txt";
+
+  // locale used on case-conversion
+  private static final Locale RU_LOCALE = new Locale("ru");
+
+  /**
+   * @param messages resource bundle passed on to the superclass
+   * @throws IOException declared for the superclass constructor
+   */
+  public RuSimpleReplaceRule(final ResourceBundle messages) throws IOException {
+    super(messages);
+  }
+
+  /**
+   * @return the name of the Russian replacement list
+   */
+  public final String getFileName() {
+    return FILE_NAME;
+  }
+
+  /**
+   * @return the fixed id of this rule
+   */
+  public final String getId() {
+    return "RU_SIMPLE_REPLACE";
+  }
+
+  /**
+   * @return the Russian description ("search for erroneous words/phrases")
+   */
+  public String getDescription() {
+    return "Поиск ошибочных слов/фраз";
+  }
+
+  /**
+   * @return the Russian short message ("Error?")
+   */
+  public String getShort() {
+    return "Ошибка?";
+  }
+
+  /**
+   * @return the Russian text fragment " - erroneous word/phrase, correction: "
+   */
+  public String getSuggestion() {
+    return " - ошибочное слово/фраза, исправление: ";
+  }
+
+  /**
+   * @return <code>false</code>, i.e. case-insensitive matching is used
+   */
+  public boolean isCaseSensitive() {
+    return false;
+  }
+
+  /**
+   * @return the Russian locale, used on case-conversion
+   */
+  public Locale getLocale() {
+    return RU_LOCALE;
+  }
+}
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/ru/RussianCompoundRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/ru/RussianCompoundRule.java
new file mode 100644
index 0000000..3e7d889
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/ru/RussianCompoundRule.java
@@ -0,0 +1,57 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2006 Daniel Naber (http://www.danielnaber.de)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+package de.danielnaber.languagetool.rules.ru;
+
+import java.io.IOException;
+import java.util.ResourceBundle;
+
+import de.danielnaber.languagetool.JLanguageTool;
+import de.danielnaber.languagetool.rules.AbstractCompoundRule;
+
+/**
+ * Russian compounds rule: checks that compounds (if in the list) are not
+ * written as separate words. Based on the German compounds rule by
+ * Daniel Naber.
+ *
+ * @author Yakov Reztsov
+ * @author Daniel Naber
+ */
+public class RussianCompoundRule extends AbstractCompoundRule {
+
+ // name of the compound list as handed to the data broker
+ private static final String FILE_NAME = "/ru/compounds_ru.txt";
+
+ /**
+ * Loads the Russian compound list (read as UTF-8) and sets the three
+ * Russian messages of the rule.
+ *
+ * @param messages resource bundle passed on to the superclass
+ * @throws IOException declared for the superclass constructor and the
+ * loading of the compound file
+ */
+ public RussianCompoundRule(final ResourceBundle messages) throws IOException {
+ super(messages);
+ loadCompoundFile(JLanguageTool.getDataBroker().getFromResourceDirAsStream(FILE_NAME), "UTF-8");
+ // "must be written with a hyphen" / "must be written as one word" /
+ // "may be written with a hyphen or as one word"
+ super.setMsg("Эти слова должны быть написаны через дефис.",
+ "Эти слова должны быть написаны слитно.",
+ "Эти слова могут быть написаны через дефис или слитно.");
+
+ }
+
+ /**
+ * @return the fixed id of this rule
+ */
+ public String getId() {
+ return "RU_COMPOUNDS";
+ }
+
+ /**
+ * @return the Russian description ("spelling with a hyphen")
+ */
+ public String getDescription() {
+ return "Правописание через дефис";
+ }
+
+}
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/ru/RussianRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/ru/RussianRule.java
new file mode 100644
index 0000000..030abf2
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/ru/RussianRule.java
@@ -0,0 +1,30 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+package de.danielnaber.languagetool.rules.ru;
+
+import de.danielnaber.languagetool.rules.Rule;
+
+/**
+ * Abstract base class for rules for the Russian language. It declares no
+ * members of its own and only provides a common supertype below {@link Rule}.
+ */
+public abstract class RussianRule extends Rule {
+
+}
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/ru/RussianUnpairedBracketsRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/ru/RussianUnpairedBracketsRule.java
new file mode 100644
index 0000000..75dd86b
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/ru/RussianUnpairedBracketsRule.java
@@ -0,0 +1,62 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2010 Marcin Miłkowski (http://www.languagetool.org)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+
+package de.danielnaber.languagetool.rules.ru;
+
+import java.util.ResourceBundle;
+import java.util.regex.Pattern;
+
+import de.danielnaber.languagetool.AnalyzedTokenReadings;
+import de.danielnaber.languagetool.Language;
+import de.danielnaber.languagetool.rules.GenericUnpairedBracketsRule;
+
+/**
+ * Unpaired-brackets rule configured with the opening and closing symbols used
+ * for Russian, plus an exception for list bullets such as "а)" or "1а)".
+ */
+public class RussianUnpairedBracketsRule extends GenericUnpairedBracketsRule {
+
+  private static final String[] RU_START_SYMBOLS = { "[", "(", "{", "„", "«", "\"", "'" };
+  private static final String[] RU_END_SYMBOLS = { "]", ")", "}", "“", "»", "\"", "'" };
+
+  // Tokens that may precede a bullet's closing parenthesis.
+  private static final Pattern NUMERALS_RU = Pattern
+      .compile("(?i)\\d{1,2}?[а-я]*|[а-я]|[А-Я]|[а-я][а-я]|[А-Я][А-Я]");
+
+  /**
+   * Creates the rule and installs the Russian start/end symbol tables.
+   *
+   * @param messages resource bundle handed to the superclass
+   * @param language language handed to the superclass
+   */
+  public RussianUnpairedBracketsRule(final ResourceBundle messages,
+      final Language language) {
+    super(messages, language);
+    startSymbols = RU_START_SYMBOLS;
+    endSymbols = RU_END_SYMBOLS;
+  }
+
+  public String getId() {
+    return "RU_UNPAIRED_BRACKETS";
+  }
+
+  /**
+   * Decides whether the symbol at position {@code i} is to be treated normally.
+   *
+   * @return {@code false} only for a ")" that follows a bullet-like token
+   *         (а), б), Д)..., ДД), аа) and 1а)) while no "(" is on top of the
+   *         symbol stack; {@code true} in every other case
+   */
+  protected boolean isNoException(final String token,
+      final AnalyzedTokenReadings[] tokens, final int i, final int j,
+      final boolean precSpace,
+      final boolean follSpace) {
+    // Only a closing parenthesis beyond the first real token can be a bullet.
+    if (i <= 1 || !endSymbols[j].equals(")")) {
+      return true;
+    }
+    final String previousToken = tokens[i - 1].getToken();
+    if (!NUMERALS_RU.matcher(previousToken).matches()) {
+      return true;
+    }
+    // A pending "(" means this ")" closes a real pair, so it is not a bullet.
+    final boolean openParenPending =
+        !symbolStack.empty() && "(".equals(symbolStack.peek().symbol);
+    return openParenPending;
+  }
+
+}
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/sk/CompoundRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/sk/CompoundRule.java
new file mode 100644
index 0000000..d5260bf
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/sk/CompoundRule.java
@@ -0,0 +1,55 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2006 Daniel Naber (http://www.danielnaber.de)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+package de.danielnaber.languagetool.rules.sk;
+
+import java.io.IOException;
+import java.util.ResourceBundle;
+
+import de.danielnaber.languagetool.JLanguageTool;
+import de.danielnaber.languagetool.rules.AbstractCompoundRule;
+
+/**
+ * Checks that compounds (if in the list) are not written as separate words.
+ *
+ * @author Zdenko Podobný based on code by Marcin Miłkowski, Daniel Naber
+ */
+
+public final class CompoundRule extends AbstractCompoundRule {
+
+  private static final String FILE_NAME = "/sk/compounds.txt";
+
+  // User-visible Slovak texts handed to the superclass.
+  private static final String SHORT_MESSAGE = "Problém spájania slov";
+  private static final String WITH_HYPHEN_MESSAGE = "Toto slovo sa zvyčajne píše so spojovníkom.";
+  private static final String WITHOUT_HYPHEN_MESSAGE = "Toto slovo sa obvykle píše bez spojovníka.";
+  private static final String EITHER_WAY_MESSAGE = "Tento výraz sa bežne píše s alebo bez spojovníka.";
+
+  /**
+   * Creates the rule, loads the Slovak compound list as UTF-8 and sets the
+   * Slovak messages.
+   *
+   * @param messages resource bundle handed to the superclass
+   * @throws IOException propagated from loading the compound file
+   */
+  public CompoundRule(final ResourceBundle messages) throws IOException {
+    super(messages);
+    loadCompoundFile(JLanguageTool.getDataBroker().getFromResourceDirAsStream(FILE_NAME), "UTF-8");
+    setShort(SHORT_MESSAGE);
+    setMsg(WITH_HYPHEN_MESSAGE, WITHOUT_HYPHEN_MESSAGE, EITHER_WAY_MESSAGE);
+  }
+
+  /** @return the constant rule identifier "SK_COMPOUNDS" */
+  public final String getId() {
+    return "SK_COMPOUNDS";
+  }
+
+  /** @return a fixed Slovak description containing an example of a hyphenated compound */
+  public final String getDescription() {
+    return "Slová so spojovníkom napr. použite „česko-slovenský” namiesto „česko slovenský”";
+  }
+
+}
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/sk/SlovakRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/sk/SlovakRule.java
new file mode 100644
index 0000000..f28067a
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/sk/SlovakRule.java
@@ -0,0 +1,31 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+package de.danielnaber.languagetool.rules.sk;
+
+import de.danielnaber.languagetool.rules.Rule;
+
+/**
+ * Abstract base class for Slovak rules.
+ *
+ * <p>The class declares no members of its own; everything is inherited from
+ * {@link Rule}.
+ *
+ * @author Zdenko Podobný based on Polish rules
+ *
+ */
+public abstract class SlovakRule extends Rule {
+
+}
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/sk/SlovakVes.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/sk/SlovakVes.java
new file mode 100644
index 0000000..3fff582
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/sk/SlovakVes.java
@@ -0,0 +1,146 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2010 Luboš Lehotský lubo.lehotsky (at) gmail (dot) com
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+
+package de.danielnaber.languagetool.rules.sk;
+
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.ResourceBundle;
+
+import de.danielnaber.languagetool.AnalyzedSentence;
+import de.danielnaber.languagetool.AnalyzedTokenReadings;
+import de.danielnaber.languagetool.rules.Category;
+import de.danielnaber.languagetool.rules.RuleMatch;
+
+
+/**
+ * Rule for place names containing "Ves": suggests capitalising the word that
+ * precedes "Ves", "Vsi" or "Vsou" once a capitalised and then a lower-case
+ * token carrying one of the relevant POS tags have been seen in the sentence.
+ * The rule is switched off by default.
+ */
+public class SlovakVes extends SlovakRule {
+
+  // POS tags that make a token relevant for this rule.
+  // NOTE(review): these look like feminine singular adjective tags, one per
+  // grammatical case - confirm against the Slovak tagset.
+  private static final String[] RELEVANT_POS_TAGS = {
+    "AAfs1x", "AAfs2x", "AAfs3x", "AAfs4x", "AAfs6x", "AAfs7x"
+  };
+
+  /**
+   * Creates the rule and marks it as off by default.
+   *
+   * @param messages resource bundle used to look up the "category_misc"
+   *        category name; when {@code null}, no category is set
+   */
+  public SlovakVes(final ResourceBundle messages) {
+    if (messages != null) {
+      super.setCategory(new Category(messages.getString("category_misc")));
+    }
+    setDefaultOff();
+  }
+
+  @Override
+  public final String getId() {
+    return "SK_VES";
+  }
+
+  @Override
+  public final String getDescription() {
+    return "Názvy obcí, v ktorých je \"Ves\"";
+  }
+
+  /**
+   * Scans the non-whitespace tokens of the sentence, starting at index 1.
+   *
+   * <p>Three flags are raised in order: a capitalised token with a relevant
+   * POS tag, then a lower-case token with a relevant POS tag, then one of the
+   * literal tokens "Ves", "Vsi" or "Vsou". While the last flag is set, each
+   * token that does not start with '.', '?' or '!' produces a match on the
+   * token before it, suggesting that token with its first letter upper-cased.
+   *
+   * @param text the analysed sentence
+   * @return the matches found; empty when the sentence has no token after
+   *         index 0
+   */
+  @Override
+  public final RuleMatch[] match(final AnalyzedSentence text) {
+    final List<RuleMatch> ruleMatches = new ArrayList<RuleMatch>();
+    final AnalyzedTokenReadings[] tokens = text.getTokensWithoutWhitespace();
+    boolean upperCaseSeen = false;
+    boolean lowerCaseSeen = false;
+    boolean vesSeen = false;
+
+    // The former unconditional read of tokens[1] is gone: its value was never
+    // used and it failed for sentences that have no token at index 1.
+    for (int i = 1; i < tokens.length; i++) {
+      final String token = tokens[i].getToken();
+      if (token.length() == 0) {
+        // nothing to inspect, and charAt(0) would fail
+        continue;
+      }
+      final char first = token.charAt(0);
+      final boolean startsWithEndMark = first == '.' || first == '?' || first == '!';
+      final boolean relevant = hasRelevantPosTag(tokens[i]);
+
+      if (relevant && Character.isUpperCase(first)) {
+        upperCaseSeen = true;
+      }
+      if (upperCaseSeen && !lowerCaseSeen && relevant && Character.isLowerCase(first)) {
+        lowerCaseSeen = true;
+      }
+      if (lowerCaseSeen
+          && (token.equals("Ves") || token.equals("Vsi") || token.equals("Vsou"))) {
+        vesSeen = true;
+      }
+
+      // NOTE(review): none of the flags is ever cleared, so after the first
+      // hit every following token that does not start with '.', '?' or '!'
+      // produces another match on its predecessor - confirm this is intended.
+      if (vesSeen && !startsWithEndMark) {
+        final String previous = tokens[i - 1].getToken();
+        if (previous.length() == 0) {
+          continue;
+        }
+        final String capitalized =
+            Character.toUpperCase(previous.charAt(0)) + previous.substring(1);
+        final String msg = "Zmeňte začiatočné písmeno na veľké: <suggestion> "
+            + capitalized + " </suggestion>";
+        final int from = tokens[i - 1].getStartPos();
+        final int to = from + previous.length();
+        ruleMatches.add(new RuleMatch(this, from, to,
+            msg, "Zmeňte začiatočné písmeno na veľké: "));
+      }
+    }
+    return toRuleMatchArray(ruleMatches);
+  }
+
+  // Returns true if the token carries at least one of RELEVANT_POS_TAGS.
+  private static boolean hasRelevantPosTag(final AnalyzedTokenReadings token) {
+    for (final String posTag : RELEVANT_POS_TAGS) {
+      if (token.hasPosTag(posTag)) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  @Override
+  public void reset() {// nothing
+  }
+
+}
+
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/sv/CompoundRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/sv/CompoundRule.java
new file mode 100644
index 0000000..b3087cd
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/sv/CompoundRule.java
@@ -0,0 +1,247 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2006 Daniel Naber (http://www.danielnaber.de)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+package de.danielnaber.languagetool.rules.sv;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Queue;
+import java.util.ResourceBundle;
+import java.util.Set;
+import java.util.concurrent.ArrayBlockingQueue;
+
+import de.danielnaber.languagetool.AnalyzedSentence;
+import de.danielnaber.languagetool.AnalyzedToken;
+import de.danielnaber.languagetool.AnalyzedTokenReadings;
+import de.danielnaber.languagetool.JLanguageTool;
+import de.danielnaber.languagetool.rules.Category;
+import de.danielnaber.languagetool.rules.RuleMatch;
+import de.danielnaber.languagetool.tools.StringTools;
+
+/**
+ * Checks that compounds (if in the list) are not written as separate words.
+ *
+ * <p>The list is read from <code>/sv/compounds.txt</code> when the rule is
+ * constructed. A run of consecutive tokens is reported when its trimmed,
+ * lower-cased, space-joined form is found in that list.
+ *
+ * @author Daniel Naber
+ */
+public class CompoundRule extends SwedishRule {
+ // TODO: for words with more than one part, check whether only some of the parts are compounded;
+ // e.g. allt-i-genom+ should match "allt i genom", "allt igenom" as well as "allti genom"
+ private static final String FILE_NAME = "/sv/compounds.txt";
+
+ // Maximum number of words per entry in the compound file; also the capacity of the
+ // token window used in match().
+ private final static int MAX_TERMS = 5;
+
+ // Lower-cased entries of the compound file with every '-' replaced by a space.
+ private final Set<String> incorrectCompounds = new HashSet<String>();
+ // Entries that ended with "+" in the compound file (marker removed).
+ private final Set<String> noDashSuggestion = new HashSet<String>();
+ // Entries that ended with "*" in the compound file (marker removed).
+ private final Set<String> onlyDashSuggestion = new HashSet<String>();
+
+ /**
+ * Creates the rule and loads the compound list as UTF-8.
+ *
+ * @param messages resource bundle used to look up the "category_misc" category name;
+ * when null, no category is set
+ * @throws IOException if reading the compound file fails or an entry has one word
+ * or more than MAX_TERMS words
+ */
+ public CompoundRule(final ResourceBundle messages) throws IOException {
+ if (messages != null)
+ super.setCategory(new Category(messages.getString("category_misc")));
+ loadCompoundFile(JLanguageTool.getDataBroker().getFromResourceDirAsStream(FILE_NAME), "UTF-8");
+ }
+
+ public String getId() {
+ return "SV_COMPOUNDS";
+ }
+
+ public String getDescription() {
+ return "Särskrivningar, t.ex. 'cd rom' bör skrivas 'cd-rom'";
+ }
+
+ /**
+ * Slides a window of at most MAX_TERMS tokens over the sentence and reports the
+ * longest run, starting at the oldest token in the window, whose normalized form
+ * is contained in the compound list.
+ *
+ * @param text the analysed sentence
+ * @return the matches found, each with its suggested replacements
+ */
+ public RuleMatch[] match(final AnalyzedSentence text) {
+ final List<RuleMatch> ruleMatches = new ArrayList<RuleMatch>();
+ final AnalyzedTokenReadings[] tokens = text.getTokensWithoutWhitespace();
+
+ RuleMatch prevRuleMatch = null;
+ // bounded queue: holds the tokens seen most recently, oldest first
+ final Queue<AnalyzedTokenReadings> prevTokens = new ArrayBlockingQueue<AnalyzedTokenReadings>(MAX_TERMS);
+ for (int i = 0; i < tokens.length + MAX_TERMS-1; i++) {
+ AnalyzedTokenReadings token = null;
+ // we need to extend the token list so we find matches at the end of the original list:
+ if (i >= tokens.length)
+ token = new AnalyzedTokenReadings(new AnalyzedToken("", "", null), prevTokens.peek().getStartPos());
+ else
+ token = tokens[i];
+ // the first token only seeds the window; there is nothing to compare yet
+ if (i == 0) {
+ addToQueue(token, prevTokens);
+ continue;
+ }
+
+ // Build every prefix of the window that has at least two tokens, both in
+ // normalized form (for the lookup) and in original spelling (for suggestions).
+ final StringBuilder sb = new StringBuilder();
+ int j = 0;
+ AnalyzedTokenReadings firstMatchToken = null;
+ final List<String> stringsToCheck = new ArrayList<String>();
+ final List<String> origStringsToCheck = new ArrayList<String>(); // original upper/lowercase spelling
+ // maps a normalized prefix to the last token of that prefix (first occurrence wins)
+ final Map<String, AnalyzedTokenReadings> stringToToken = new HashMap<String, AnalyzedTokenReadings>();
+ for (AnalyzedTokenReadings atr : prevTokens) {
+ if (j == 0)
+ firstMatchToken = atr;
+ sb.append(' ');
+ sb.append(atr.getToken());
+ if (j >= 1) {
+ final String stringToCheck = normalize(sb.toString());
+ stringsToCheck.add(stringToCheck);
+ origStringsToCheck.add(sb.toString().trim());
+ if (!stringToToken.containsKey(stringToCheck))
+ stringToToken.put(stringToCheck, atr);
+ }
+ j++;
+ }
+ // iterate backwards over all potentially incorrect strings to make
+ // sure we match longer strings first:
+ for (int k = stringsToCheck.size()-1; k >= 0; k--) {
+ final String stringToCheck = stringsToCheck.get(k);
+ final String origStringToCheck = origStringsToCheck.get(k);
+ //System.err.println("##"+stringtoCheck+"#");
+ if (incorrectCompounds.contains(stringToCheck)) {
+ final AnalyzedTokenReadings atr = stringToToken.get(stringToCheck);
+ String msg = null;
+ final List<String> repl = new ArrayList<String>();
+ // hyphenated suggestion, unless the entry was marked with "+"
+ if (!noDashSuggestion.contains(stringToCheck)) {
+ repl.add(origStringToCheck.replace(' ', '-'));
+ msg = "Dessa ord skrivs samman med bindesträck.";
+ }
+ // Do not assume that compounds with more than two parts should always use hyphens:
+ if (!hasAllUppercaseParts(origStringToCheck) && !onlyDashSuggestion.contains(stringToCheck)) {
+ repl.add(mergeCompound(origStringToCheck));
+ msg = "Dessa ord skrivs samman.";
+ }
+ // NOTE(review): for a trimmed, non-empty stringToCheck the condition parts.length > 0
+ // appears to be always true. If so, this branch always discards the suggestions
+ // built above and offers only the hyphenated form, and the else-branch below is
+ // unreachable - confirm whether a stricter condition was intended.
+ final String[] parts = stringToCheck.split(" ");
+ if (parts.length > 0) {
+ repl.clear();
+ repl.add(origStringToCheck.replace(' ', '-'));
+ msg = "Dessa ord skrivs samman med bindesträck.";
+ } else if (repl.size() == 0 || repl.size() == 2) { // == 0 shouldn't happen
+ // did not work as expected so I added repl. explicitly.
+ msg = "Dessa ord skrivs samman med eller utan bindesträck.";
+ repl.clear();
+ repl.add(origStringToCheck.replace(' ', '-'));
+ repl.add(mergeCompound(origStringToCheck));
+ }
+ // the match spans from the first token of the window to the end of the last matched token
+ final RuleMatch ruleMatch = new RuleMatch(this, firstMatchToken.getStartPos(),
+ atr.getStartPos() + atr.getToken().length(), msg);
+ // avoid duplicate matches:
+ if (prevRuleMatch != null && prevRuleMatch.getFromPos() == ruleMatch.getFromPos()) {
+ prevRuleMatch = ruleMatch;
+ break;
+ }
+ prevRuleMatch = ruleMatch;
+ ruleMatch.setSuggestedReplacements(repl);
+ ruleMatches.add(ruleMatch);
+ break;
+ }
+ }
+ addToQueue(token, prevTokens);
+ }
+ return toRuleMatchArray(ruleMatches);
+ }
+
+ /**
+ * Trims and lower-cases the given string. If the result contains both a dash
+ * and a space, every dash is replaced by a space,
+ * e.g. "E-Mail Adresse" -> "e mail adresse", so the error can be detected.
+ * @param str the space-joined tokens to normalize
+ * @return the normalized string
+ */
+ private String normalize(String str) {
+ str = str.trim().toLowerCase();
+ if (str.indexOf('-') != -1 && str.indexOf(' ') != -1) {
+ // e.g. "E-Mail Adresse" -> "E Mail Adresse" so the error can be detected:
+ str = str.replace('-', ' ');
+ }
+ return str;
+ }
+
+ /**
+ * Splits the string at single spaces and checks the parts with
+ * StringTools.isAllUppercase.
+ * NOTE(review): despite the name, this returns true as soon as ONE part is
+ * reported as all-uppercase, not only when all parts are - confirm the intent.
+ * @param str the space-separated words
+ * @return true if at least one part is reported as all-uppercase
+ */
+ private boolean hasAllUppercaseParts(String str) {
+ final String[] parts = str.split(" ");
+ for (String part : parts) {
+ if (StringTools.isAllUppercase(part)) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ /**
+ * Joins the space-separated parts into one word: the first part is kept as
+ * written, every following part is lower-cased.
+ * @param str the space-separated words
+ * @return the merged word
+ */
+ private String mergeCompound(String str) {
+ final String[] stringParts = str.split(" ");
+ final StringBuilder sb = new StringBuilder();
+ for (int k = 0; k < stringParts.length; k++) {
+ if (k == 0)
+ sb.append(stringParts[k]);
+ else
+ sb.append(stringParts[k].toLowerCase());
+ }
+ return sb.toString();
+ }
+
+ /**
+ * Appends the token to the queue; if the queue rejects it (offer returns
+ * false), the head element is removed first and the token is offered again.
+ */
+ private void addToQueue(AnalyzedTokenReadings token, Queue<AnalyzedTokenReadings> prevTokens) {
+ final boolean inserted = prevTokens.offer(token);
+ if (!inserted) {
+ prevTokens.poll();
+ prevTokens.offer(token);
+ }
+ }
+
+ /**
+ * Reads the compound list line by line and fills incorrectCompounds,
+ * noDashSuggestion and onlyDashSuggestion. Blank lines and lines starting
+ * with '#' are skipped.
+ * @param file stream to read from; the readers wrapping it are closed before returning
+ * @param encoding character encoding of the stream
+ * @throws IOException if reading fails, or if a line has exactly one word or
+ * more than MAX_TERMS words
+ */
+ private void loadCompoundFile(final InputStream file, final String encoding) throws IOException {
+ InputStreamReader isr = null;
+ BufferedReader br = null;
+ try {
+ isr = new InputStreamReader(file, encoding);
+ br = new BufferedReader(isr);
+ String line;
+ while ((line = br.readLine()) != null) {
+ line = line.trim();
+ if (line.length() < 1) {
+ continue;
+ }
+ if (line.charAt(0) == '#') { // ignore comments
+ continue;
+ }
+ // the set contains the incorrect spellings, i.e. the ones without hyphen
+ line = line.replace('-', ' ');
+ // the word count is checked before a trailing "+" or "*" marker is cut off
+ final String[] parts = line.split(" ");
+ if (parts.length > MAX_TERMS)
+ throw new IOException("För många ord sammansatta: " + line + ", max antal tillåtna ord: " + MAX_TERMS);
+ if (parts.length == 1)
+ throw new IOException("Inget sammansatt ord: " + line);
+ if (line.endsWith("+")) {
+ line = line.substring(0, line.length() - 1); // cut off "+"
+ noDashSuggestion.add(line.toLowerCase());
+ } else if (line.endsWith("*")) {
+ line = line.substring(0, line.length() - 1); // cut off "*"
+ onlyDashSuggestion.add(line.toLowerCase());
+ }
+ incorrectCompounds.add(line.toLowerCase());
+ }
+ } finally {
+ if (br != null) br.close();
+ if (isr != null) isr.close();
+ }
+ }
+
+ // Intentionally empty: this method clears nothing.
+ public void reset() {
+ }
+
+}
+
+
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/sv/SwedishRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/sv/SwedishRule.java
new file mode 100644
index 0000000..73af8fe
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/sv/SwedishRule.java
@@ -0,0 +1,31 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+package de.danielnaber.languagetool.rules.sv;
+
+import de.danielnaber.languagetool.rules.Rule;
+
+/**
+ * Abstract base class for Swedish rules.
+ *
+ * <p>The class declares no members of its own; everything is inherited from
+ * {@link Rule}.
+ *
+ * @author Marcin Miłkowski
+ *
+ */
+public abstract class SwedishRule extends Rule {
+
+}
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/uk/PunctuationCheckRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/uk/PunctuationCheckRule.java
new file mode 100644
index 0000000..5abc339
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/uk/PunctuationCheckRule.java
@@ -0,0 +1,76 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+package de.danielnaber.languagetool.rules.uk;
+
+import java.util.ResourceBundle;
+
+import de.danielnaber.languagetool.rules.AbstractPunctuationCheckRule;
+
+/**
+ * A rule that matches "..", "::", "-," but not "...", "!..", "?!!", ",-" etc
+ * TODO: spaces seem to be special, extract from regexp?
+ *
+ * @author Andriy Rysin
+ */
+public class PunctuationCheckRule extends AbstractPunctuationCheckRule {
+
+  // The patterns are compiled once; String.matches() would recompile its
+  // regular expression on every call. Fully qualified names are used so the
+  // file's import list does not need to change.
+
+  // internal punctuation, optionally followed by spaces
+  private static final java.util.regex.Pattern INTERNAL_PUNCTUATION =
+      java.util.regex.Pattern.compile("([,:] | *- |,- | ) *");
+
+  // a single '.', '!' or '?', or one of the accepted three-character groups,
+  // optionally followed by spaces
+  private static final java.util.regex.Pattern TERMINAL_PUNCTUATION =
+      java.util.regex.Pattern.compile(
+          "([.!?]|!!!|\\?\\?\\?|\\?!!|!\\.\\.|\\?\\.\\.|\\.\\.\\.) *");
+
+  // exactly one character out of . , ! ? : space -
+  private static final java.util.regex.Pattern SINGLE_PUNCTUATION =
+      java.util.regex.Pattern.compile("^[.,!?: -]$");
+
+  /**
+   * Creates the rule.
+   *
+   * @param messages resource bundle handed to the superclass
+   */
+  public PunctuationCheckRule(final ResourceBundle messages) {
+    super(messages);
+  }
+
+  /**
+   * Tells whether a run of punctuation tokens is acceptable.
+   *
+   * @param tokens the concatenated punctuation tokens
+   * @return {@code true} if the whole string matches the internal or the
+   *         terminal punctuation pattern
+   */
+  @Override
+  protected final boolean isPunctsJoinOk(final String tokens) {
+    // we ignore duplicated spaces - too many errors
+    return INTERNAL_PUNCTUATION.matcher(tokens).matches()
+        || TERMINAL_PUNCTUATION.matcher(tokens).matches();
+  }
+
+  /**
+   * Tells whether a token counts as punctuation for this rule.
+   *
+   * @param token the token text
+   * @return {@code true} if the token is exactly one of the characters
+   *         {@code . , ! ? :}, a space or {@code -}
+   */
+  @Override
+  protected final boolean isPunctuation(final String token) {
+    return SINGLE_PUNCTUATION.matcher(token).matches();
+  }
+
+  @Override
+  public void reset() {
+    // nothing
+  }
+
+}
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/uk/SimpleReplaceRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/uk/SimpleReplaceRule.java
new file mode 100644
index 0000000..3bba01c
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/uk/SimpleReplaceRule.java
@@ -0,0 +1,50 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+package de.danielnaber.languagetool.rules.uk;
+
+import java.io.IOException;
+import java.util.ResourceBundle;
+
+import de.danielnaber.languagetool.rules.AbstractSimpleReplaceRule;
+
+/**
+ * A rule that matches words or phrases which should not be used and suggests
+ * correct ones instead.
+ *
+ * Ukrainian implementations. Loads the
+ * relevant words from <code>rules/uk/replace.txt</code>.
+ *
+ * @author Andriy Rysin
+ */
+public class SimpleReplaceRule extends AbstractSimpleReplaceRule {
+
+  private static final String FILE_NAME = "/uk/replace.txt";
+
+  /**
+   * Creates the rule.
+   *
+   * @param messages resource bundle handed to the superclass
+   * @throws IOException propagated from the superclass constructor
+   */
+  public SimpleReplaceRule(final ResourceBundle messages) throws IOException {
+    super(messages);
+  }
+
+  /** @return the constant resource path "/uk/replace.txt" */
+  public final String getFileName() {
+    return FILE_NAME;
+  }
+
+  /** @return the constant rule identifier "UK_SIMPLE_REPLACE" */
+  public final String getId() {
+    return "UK_SIMPLE_REPLACE";
+  }
+
+}