summaryrefslogtreecommitdiffstats
path: root/JLanguageTool/src/java/de/danielnaber/languagetool/rules/GenericUnpairedBracketsRule.java
diff options
context:
space:
mode:
Diffstat (limited to 'JLanguageTool/src/java/de/danielnaber/languagetool/rules/GenericUnpairedBracketsRule.java')
-rw-r--r--JLanguageTool/src/java/de/danielnaber/languagetool/rules/GenericUnpairedBracketsRule.java314
1 files changed, 314 insertions, 0 deletions
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/GenericUnpairedBracketsRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/GenericUnpairedBracketsRule.java
new file mode 100644
index 0000000..a2cd35c
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/GenericUnpairedBracketsRule.java
@@ -0,0 +1,314 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2009 Daniel Naber (http://www.danielnaber.de)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+
+package de.danielnaber.languagetool.rules;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.ResourceBundle;
+import java.util.regex.Pattern;
+
+import de.danielnaber.languagetool.AnalyzedSentence;
+import de.danielnaber.languagetool.AnalyzedTokenReadings;
+import de.danielnaber.languagetool.Language;
+import de.danielnaber.languagetool.tools.UnsyncStack;
+import de.danielnaber.languagetool.tools.SymbolLocator;
+
+/**
+ * Rule that finds unpaired quotes, brackets etc.
+ *
+ * @author Marcin Miłkowski
+ */
+public class GenericUnpairedBracketsRule extends Rule {
+
+ /**
+ * Note that there must be equal length of both arrays, and the sequence of
+ * starting symbols must match exactly the sequence of ending symbols.
+ */
+ private static final String[] START_SYMBOLS = { "[", "(", "{", "\"", "'" };
+ private static final String[] END_SYMBOLS = { "]", ")", "}", "\"", "'" };
+
+ protected String[] startSymbols;
+ protected String[] endSymbols;
+
+ private static final String[] SL_START_SYMBOLS = { "[", "(", "{", "„", "»", "\"" };
+ private static final String[] SL_END_SYMBOLS = { "]", ")", "}", "”", "«", "\"" };
+
+ private static final String[] SK_START_SYMBOLS = { "[", "(", "{", "„", "»", "\"" };
+ private static final String[] SK_END_SYMBOLS = { "]", ")", "}", "“", "«", "\"" };
+
+ private static final String[] RO_START_SYMBOLS = { "[", "(", "{", "„", "«" };
+ private static final String[] RO_END_SYMBOLS = { "]", ")", "}", "”", "»" };
+
+ private static final String[] FR_START_SYMBOLS = { "[", "(", "{", "«", /*"‘"*/ };
+ private static final String[] FR_END_SYMBOLS = { "]", ")", "}", "»", /*"’" used in "d’arm" and many other words */ };
+
+ private static final String[] DE_START_SYMBOLS = { "[", "(", "{", "„", "»", "‘" };
+ private static final String[] DE_END_SYMBOLS = { "]", ")", "}", "“", "«", "’" };
+
+ private static final String[] GL_START_SYMBOLS = { "[", "(", "{", "“", "«", "‘", "\"", "'" };
+ private static final String[] GL_END_SYMBOLS = { "]", ")", "}", "”", "»", "’", "\"", "'" };
+
+ private static final String[] ES_START_SYMBOLS = { "[", "(", "{", "“", "«", "¿", "¡" };
+ private static final String[] ES_END_SYMBOLS = { "]", ")", "}", "”", "»", "?", "!" };
+
+ private static final String[] UK_START_SYMBOLS = { "[", "(", "{", "„", "«" };
+ private static final String[] UK_END_SYMBOLS = { "]", ")", "}", "“", "»" };
+
+ private static final String[] NL_START_SYMBOLS = { "[", "(", "{", "“", "\u2039", "\u201c", "\u201e" };
+ private static final String[] NL_END_SYMBOLS = { "]", ")", "}", "”", "\u203a", "\u201d", "\u201d" };
+
+ private static final String[] IT_START_SYMBOLS = { "[", "(", "{", "»", /*"‘"*/ };
+ private static final String[] IT_END_SYMBOLS = { "]", ")", "}", "«", /*"’"*/ };
+
+ private static final String[] DK_START_SYMBOLS = { "[", "(", "{", "\"", "”" };
+ private static final String[] DK_END_SYMBOLS = { "]", ")", "}", "\"", "”" };
+
+
+
+ /**
+ * The stack for pairing symbols.
+ */
+ protected final UnsyncStack<SymbolLocator> symbolStack = new UnsyncStack<SymbolLocator>();
+
+ /**
+ * Stack of rule matches.
+ */
+ private final UnsyncStack<RuleMatchLocator> ruleMatchStack = new UnsyncStack<RuleMatchLocator>();
+
+ private boolean endOfParagraph;
+
+ private final Language ruleLang;
+
+ private static final Pattern PUNCTUATION = Pattern.compile("\\p{Punct}");
+ private static final Pattern PUNCTUATION_NO_DOT = Pattern
+ .compile("[\\p{Punct}&&[^\\.]]");
+ private static final Pattern NUMERALS = Pattern
+ .compile("(?i)\\d{1,2}?[a-z']*|M*(D?C{0,3}|C[DM])(L?X{0,3}|X[LC])(V?I{0,3}|I[VX])$");
+
+ private int ruleMatchIndex;
+ private List<RuleMatch> ruleMatches;
+
+ public GenericUnpairedBracketsRule(final ResourceBundle messages,
+ final Language language) {
+ super(messages);
+ super.setCategory(new Category(messages.getString("category_misc")));
+
+ setParagraphBackTrack(true);
+ if (language.equals(Language.SLOVAK)) {
+ startSymbols = SK_START_SYMBOLS;
+ endSymbols = SK_END_SYMBOLS; }
+ else if (language.equals(Language.SLOVENIAN)) {
+ startSymbols = SL_START_SYMBOLS;
+ endSymbols = SL_END_SYMBOLS;
+ } else if (language.equals(Language.FRENCH)) {
+ startSymbols = FR_START_SYMBOLS;
+ endSymbols = FR_END_SYMBOLS;
+ } else if (language.equals(Language.GERMAN)) {
+ startSymbols = DE_START_SYMBOLS;
+ endSymbols = DE_END_SYMBOLS;
+ } else if (language.equals(Language.GALICIAN)) {
+ startSymbols = GL_START_SYMBOLS;
+ endSymbols = GL_END_SYMBOLS;
+ } else if (language.equals(Language.DUTCH)) {
+ startSymbols = NL_START_SYMBOLS;
+ endSymbols = NL_END_SYMBOLS;
+ } else if (language.equals(Language.SPANISH)) {
+ startSymbols = ES_START_SYMBOLS;
+ endSymbols = ES_END_SYMBOLS;
+ } else if (language.equals(Language.UKRAINIAN)) {
+ startSymbols = UK_START_SYMBOLS;
+ endSymbols = UK_END_SYMBOLS;
+ } else if (language.equals(Language.ITALIAN)) {
+ startSymbols = IT_START_SYMBOLS;
+ endSymbols = IT_END_SYMBOLS;
+ } else if (language.equals(Language.ROMANIAN)) {
+ startSymbols = RO_START_SYMBOLS;
+ endSymbols = RO_END_SYMBOLS;
+ } else if (language.equals(Language.DANISH)) {
+ startSymbols = DK_START_SYMBOLS;
+ endSymbols = DK_END_SYMBOLS;
+ } else {
+ startSymbols = START_SYMBOLS;
+ endSymbols = END_SYMBOLS;
+ }
+
+ ruleLang = language;
+ }
+
+ public String getId() {
+ return "UNPAIRED_BRACKETS";
+ }
+
+ public String getDescription() {
+ return messages.getString("desc_unpaired_brackets");
+ }
+
+ /**
+ * Generic method to specify an exception. For unspecified
+ * language, it simply returns true, which means no exception.
+ * @param token
+ * String token
+ * @param tokens
+ * Sentence tokens
+ * @param i
+ * Current token index
+ * @param precSpace
+ * boolean: is preceded with space
+ * @param follSpace
+ * boolean: is followed with space
+ * @return
+ */
+ protected boolean isNoException(final String token,
+ final AnalyzedTokenReadings[] tokens, final int i, final int j,
+ final boolean precSpace,
+ final boolean follSpace) {
+ return true;
+ }
+
+ public final RuleMatch[] match(final AnalyzedSentence text) {
+ ruleMatches = new ArrayList<RuleMatch>();
+ final AnalyzedTokenReadings[] tokens = text.getTokensWithoutWhitespace();
+
+ if (endOfParagraph) {
+ reset();
+ }
+
+ ruleMatchIndex = getMatchesIndex();
+
+ for (int i = 1; i < tokens.length; i++) {
+ for (int j = 0; j < startSymbols.length; j++) {
+
+ final String token = tokens[i].getToken();
+ if (token.equals(startSymbols[j]) || token.equals(endSymbols[j])) {
+ boolean precededByWhitespace = true;
+ if (startSymbols[j].equals(endSymbols[j])) {
+ precededByWhitespace = tokens[i - 1].isSentStart()
+ || tokens[i].isWhitespaceBefore()
+ || PUNCTUATION_NO_DOT.matcher(tokens[i - 1].getToken())
+ .matches();
+ }
+
+ boolean followedByWhitespace = true;
+ if (i < tokens.length - 1 && startSymbols[j].equals(endSymbols[j])) {
+ followedByWhitespace = tokens[i + 1].isWhitespaceBefore()
+ || PUNCTUATION.matcher(tokens[i + 1].getToken()).matches();
+ }
+
+ final boolean noException = isNoException(token, tokens, i, j,
+ precededByWhitespace, followedByWhitespace);
+
+ if (noException && precededByWhitespace
+ && token.equals(startSymbols[j])) {
+ symbolStack.push(new SymbolLocator(startSymbols[j], i));
+ } else if (noException && followedByWhitespace
+ && token.equals(endSymbols[j])) {
+ if (i > 1 && endSymbols[j].equals(")")
+ && (NUMERALS.matcher(tokens[i - 1].getToken()).matches()
+ && !(!symbolStack.empty()
+ && "(".equals(symbolStack.peek().symbol)))) {
+ } else {
+ if (symbolStack.empty()) {
+ symbolStack.push(new SymbolLocator(endSymbols[j], i));
+ } else {
+ if (symbolStack.peek().symbol.equals(startSymbols[j])) {
+ symbolStack.pop();
+ } else {
+ symbolStack.push(new SymbolLocator(endSymbols[j], i));
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ for (final SymbolLocator sLoc : symbolStack) {
+ final RuleMatch rMatch = createMatch(tokens[sLoc.index].getStartPos(),
+ sLoc.symbol);
+ if (rMatch != null) {
+ ruleMatches.add(rMatch);
+ }
+ }
+ symbolStack.clear();
+ if (tokens[tokens.length - 1].isParaEnd()) {
+ endOfParagraph = true;
+ }
+
+ return toRuleMatchArray(ruleMatches);
+ }
+
+ private RuleMatch createMatch(final int startPos, final String symbol) {
+ if (!ruleMatchStack.empty()) {
+ final int index = findSymbolNum(symbol);
+ if (index >= 0) {
+ final RuleMatchLocator rLoc = ruleMatchStack.peek();
+ if (rLoc.symbol.equals(startSymbols[index])) {
+ if (ruleMatches.size() > rLoc.myIndex) {
+ ruleMatches.remove(rLoc.myIndex);
+ ruleMatchStack.pop();
+ return null;
+ // if (ruleMatches.get(rLoc.myIndex).getFromPos())
+ }
+ if (isInMatches(rLoc.index)) {
+ setAsDeleted(rLoc.index);
+ ruleMatchStack.pop();
+ return null;
+ }
+ }
+ }
+ }
+ ruleMatchStack.push(new RuleMatchLocator(symbol, ruleMatchIndex,
+ ruleMatches.size()));
+ ruleMatchIndex++;
+ return new RuleMatch(this, startPos, startPos + symbol.length(), messages
+ .getString("unpaired_brackets"));
+ }
+
+ private int findSymbolNum(final String ch) {
+ for (int i = 0; i < endSymbols.length; i++) {
+ if (ch.equals(endSymbols[i])) {
+ return i;
+ }
+ }
+ return -1;
+ }
+
+ /**
+ * Reset the state information for the rule, including paragraph-level
+ * information.
+ */
+ public final void reset() {
+ ruleMatchStack.clear();
+ symbolStack.clear();
+ if (!endOfParagraph) {
+ clearMatches();
+ }
+ endOfParagraph = false;
+ }
+
+}
+
+class RuleMatchLocator extends SymbolLocator {
+ public int myIndex;
+
+ RuleMatchLocator(final String sym, final int ind, final int myInd) {
+ super(sym, ind);
+ myIndex = myInd;
+ }
+}