summaryrefslogtreecommitdiffstats
path: root/JLanguageTool/src/java/de/danielnaber/languagetool/rules/en
diff options
context:
space:
mode:
Diffstat (limited to 'JLanguageTool/src/java/de/danielnaber/languagetool/rules/en')
-rw-r--r--JLanguageTool/src/java/de/danielnaber/languagetool/rules/en/AvsAnRule.java251
-rw-r--r--JLanguageTool/src/java/de/danielnaber/languagetool/rules/en/CompoundRule.java55
-rw-r--r--JLanguageTool/src/java/de/danielnaber/languagetool/rules/en/EnglishRule.java30
-rw-r--r--JLanguageTool/src/java/de/danielnaber/languagetool/rules/en/EnglishUnpairedBracketsRule.java89
4 files changed, 425 insertions, 0 deletions
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/en/AvsAnRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/en/AvsAnRule.java
new file mode 100644
index 0000000..ae02ef5
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/en/AvsAnRule.java
@@ -0,0 +1,251 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+package de.danielnaber.languagetool.rules.en;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Locale;
+import java.util.ResourceBundle;
+import java.util.TreeSet;
+
+import de.danielnaber.languagetool.AnalyzedSentence;
+import de.danielnaber.languagetool.AnalyzedTokenReadings;
+import de.danielnaber.languagetool.JLanguageTool;
+import de.danielnaber.languagetool.rules.Category;
+import de.danielnaber.languagetool.rules.RuleMatch;
+import de.danielnaber.languagetool.tools.StringTools;
+
+/**
+ * Check if the determiner (if any) preceding a word is:
+ * <ul>
+ *   <li><i>an</i> if the next word starts with a vowel
+ *   <li><i>a</i> if the next word does not start with a vowel
+ * </ul>
+ * This rule loads some exceptions from external files (e.g. <i>an hour</i>):
+ * words listed in <code>det_a.txt</code> always require <i>a</i>, words listed in
+ * <code>det_an.txt</code> always require <i>an</i>. Words that are in neither list
+ * and are written in all-uppercase or mixed case are never complained about,
+ * because their pronunciation is unknown.
+ *
+ * @author Daniel Naber
+ */
+public class AvsAnRule extends EnglishRule {
+
+  private static final String FILENAME_A = "/en/det_a.txt";
+  private static final String FILENAME_AN = "/en/det_an.txt";
+
+  /** Encoding used to read the exception lists, independent of the platform default. */
+  private static final String FILE_ENCODING = "UTF-8";
+
+  /** The indefinite article a word calls for. */
+  private enum Determiner {
+    /** The word must be preceded by "a". */
+    A,
+    /** The word must be preceded by "an". */
+    AN,
+    /** No decision is made (all-uppercase or mixed-case word that is in neither list). */
+    UNKNOWN
+  }
+
+  private final TreeSet<String> requiresA;
+  private final TreeSet<String> requiresAn;
+
+  /**
+   * Creates the rule and loads both exception lists.
+   *
+   * @param messages resource bundle used to look up the category name;
+   *        if <code>null</code>, no category is set
+   * @throws IOException if one of the exception lists cannot be read
+   */
+  public AvsAnRule(final ResourceBundle messages) throws IOException {
+    if (messages != null) {
+      super.setCategory(new Category(messages.getString("category_misc")));
+    }
+    requiresA = loadWords(JLanguageTool.getDataBroker().getFromRulesDirAsStream(FILENAME_A));
+    requiresAn = loadWords(JLanguageTool.getDataBroker().getFromRulesDirAsStream(FILENAME_AN));
+  }
+
+  @Override
+  public String getId() {
+    return "EN_A_VS_AN";
+  }
+
+  @Override
+  public String getDescription() {
+    return "Use of 'a' vs. 'an'";
+  }
+
+  /**
+   * Reports every "a"/"an" that does not fit the word following it.
+   *
+   * @param text the sentence to check
+   * @return the matches found, one per wrong article
+   * @throws IllegalStateException if a word is listed in both exception files
+   */
+  @Override
+  public RuleMatch[] match(final AnalyzedSentence text) {
+    final List<RuleMatch> ruleMatches = new ArrayList<RuleMatch>();
+    final AnalyzedTokenReadings[] tokens = text.getTokensWithoutWhitespace();
+    String prevToken = "";
+    int prevPos = 0;
+    // ignoring token 0, i.e., SENT_START
+    for (int i = 1; i < tokens.length; i++) {
+      String token = firstRelevantPart(tokens[i].getToken());
+      token = token.replaceAll("[^a-zA-Z0-9\\.']", ""); // e.g. >>an "industry party"<<
+      if (StringTools.isEmpty(token)) {
+        // nothing left to judge (e.g. a quote character): keep the remembered
+        // determiner so that the next real word is checked against it
+        continue;
+      }
+      final Determiner required = requiredDeterminer(token);
+      String msg = null;
+      if (prevToken.equalsIgnoreCase("a") && required == Determiner.AN) {
+        // keep the capitalization of the article that gets replaced
+        final String replacement = prevToken.equals("A") ? "An" : "an";
+        msg = "Use <suggestion>" + replacement + "</suggestion> instead of '" + prevToken
+            + "' if the following word starts with a vowel sound, e.g. 'an article', "
+            + "'an hour'";
+      } else if (prevToken.equalsIgnoreCase("an") && required == Determiner.A) {
+        final String replacement = prevToken.equals("An") ? "A" : "a";
+        msg = "Use <suggestion>" + replacement + "</suggestion> instead of '" + prevToken
+            + "' if the following word doesn't start with a vowel sound, e.g. 'a sentence', "
+            + "'a university'";
+      }
+      if (msg != null) {
+        // the match covers the article, not the word that follows it
+        ruleMatches.add(
+            new RuleMatch(this, prevPos, prevPos + prevToken.length(), msg, "Wrong article"));
+      }
+      // only a token tagged as determiner is remembered for the next iteration
+      if (tokens[i].hasPosTag("DT")) {
+        prevToken = token;
+        prevPos = tokens[i].getStartPos();
+      } else {
+        prevToken = "";
+      }
+    }
+    return toRuleMatchArray(ruleMatches);
+  }
+
+  /**
+   * Adds "a" or "an" to the English noun.
+   * Used for suggesting the proper form of the
+   * indefinite article.
+   * @param noun Word that needs an article.
+   * @return String containing the word with a determiner,
+   *         or just the word if the word is an abbreviation
+   *         (all-uppercase or mixed case and in neither exception list).
+   *         If nothing is left of the word after removing HTML entities and
+   *         non-alphanumeric characters, an empty string is returned.
+   * @throws IllegalStateException if the word is listed in both exception files
+   */
+  public final String suggestAorAn(final String noun) {
+    String word = firstRelevantPart(noun);
+    //html entities!
+    word = word.replaceAll("&quot|&amp|&lt|&gt|[^a-zA-Z0-9]", ""); // e.g. >>an "industry party"<<
+    if (StringTools.isEmpty(word)) {
+      return word;
+    }
+    final Determiner required = requiredDeterminer(word);
+    if (required == Determiner.A) {
+      return "a " + noun;
+    } else if (required == Determiner.AN) {
+      return "an " + noun;
+    } else {
+      return noun;
+    }
+  }
+
+  /**
+   * Returns the part of the word before the first hyphen or apostrophe,
+   * for example, in "one-way" only "one" is relevant. The word is returned
+   * unchanged if that part is "a" (avoids a false alarm on "A-levels are...")
+   * or if splitting leaves no parts at all.
+   */
+  private static String firstRelevantPart(final String word) {
+    final String[] parts = word.split("[-']");
+    if (parts.length >= 1 && !parts[0].equalsIgnoreCase("a")) {
+      return parts[0];
+    }
+    return word;
+  }
+
+  /**
+   * Decides which article the given non-empty, already cleaned-up word requires.
+   * The exception lists take precedence over the first-letter heuristic.
+   *
+   * @throws IllegalStateException if the word is listed in both exception files
+   */
+  private Determiner requiredDeterminer(final String word) {
+    // fixed locale: the default locale must not influence the lookup (e.g. Turkish "I")
+    final String lowercaseWord = word.toLowerCase(Locale.ENGLISH);
+    final boolean listedForA = requiresA.contains(lowercaseWord) || requiresA.contains(word);
+    final boolean listedForAn = requiresAn.contains(lowercaseWord) || requiresAn.contains(word);
+    if (listedForA && listedForAn) {
+      throw new IllegalStateException(word + " is listed in both det_a.txt and det_an.txt");
+    }
+    if (listedForA) {
+      return Determiner.A;
+    }
+    if (listedForAn) {
+      return Determiner.AN;
+    }
+    if (StringTools.isAllUppercase(word) || StringTools.isMixedCase(word)) {
+      // we don't know how all-uppercase and mixed case words (often abbreviations)
+      // are pronounced, so never complain about these:
+      return Determiner.UNKNOWN;
+    }
+    return isVowel(word.charAt(0)) ? Determiner.AN : Determiner.A;
+  }
+
+  private static boolean isVowel(char c) {
+    c = Character.toLowerCase(c);
+    return c == 'a' || c == 'e' || c == 'i' || c == 'o' || c == 'u';
+  }
+
+  /**
+   * Loads a word list with one entry per line, read as UTF-8. Empty lines and
+   * lines starting with <code>#</code> are skipped. Entries are normalized to
+   * lowercase, except entries starting with <code>*</code>: these are stored
+   * without the asterisk and with their case preserved.
+   *
+   * @param file stream to read from; it is closed before this method returns
+   * @return the words that were read
+   * @throws IOException if reading fails
+   */
+  private static TreeSet<String> loadWords(final InputStream file) throws IOException {
+    final TreeSet<String> words = new TreeSet<String>();
+    final BufferedReader reader =
+        new BufferedReader(new InputStreamReader(file, FILE_ENCODING));
+    try {
+      String line;
+      while ((line = reader.readLine()) != null) {
+        line = line.trim();
+        if (line.length() < 1 || line.charAt(0) == '#') {
+          continue;
+        }
+        if (line.charAt(0) == '*') {
+          words.add(line.substring(1));
+        } else {
+          words.add(line.toLowerCase(Locale.ENGLISH));
+        }
+      }
+    } finally {
+      reader.close();
+    }
+    return words;
+  }
+
+  @Override
+  public void reset() {
+    // nothing
+  }
+
+}
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/en/CompoundRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/en/CompoundRule.java
new file mode 100644
index 0000000..0e01523
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/en/CompoundRule.java
@@ -0,0 +1,55 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2006 Daniel Naber (http://www.danielnaber.de)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+package de.danielnaber.languagetool.rules.en;
+
+import java.io.IOException;
+import java.util.ResourceBundle;
+
+import de.danielnaber.languagetool.JLanguageTool;
+import de.danielnaber.languagetool.rules.AbstractCompoundRule;
+
+/**
+ * Checks that compounds (if in the list) are not written as separate words.
+ * The list is read from <code>/en/compounds.txt</code> in the resource
+ * directory, using UTF-8.
+ *
+ * @author Marcin Miłkowski, based on code by Daniel Naber
+ */
+public class CompoundRule extends AbstractCompoundRule {
+
+  private static final String FILE_NAME = "/en/compounds.txt";
+
+  /**
+   * Creates the rule, loads the compound list and sets the English messages.
+   *
+   * @param messages resource bundle that is handed on to the superclass
+   * @throws IOException declared because the compound list is loaded here
+   */
+  public CompoundRule(final ResourceBundle messages) throws IOException {
+    super(messages);
+    loadCompoundFile(JLanguageTool.getDataBroker().getFromResourceDirAsStream(FILE_NAME), "UTF-8");
+    super.setShort("Hyphenation problem");
+    super.setMsg("This word is normally spelled with hyphen.",
+        "This word is normally spelled as one.",
+        "This expression is normally spelled as one or with hyphen.");
+  }
+
+  @Override
+  public String getId() {
+    return "EN_COMPOUNDS";
+  }
+
+  @Override
+  public String getDescription() {
+    return "Hyphenated words, e.g., 'case-sensitive' instead of 'case sensitive'";
+  }
+
+}
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/en/EnglishRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/en/EnglishRule.java
new file mode 100644
index 0000000..cd0036d
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/en/EnglishRule.java
@@ -0,0 +1,30 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+package de.danielnaber.languagetool.rules.en;
+
+import de.danielnaber.languagetool.rules.Rule;
+
+/**
+ * Abstract base class for rules for the English language.
+ * <p>
+ * It declares no members of its own; it only provides a common supertype
+ * for English rules such as {@link AvsAnRule}.
+ *
+ * @author Daniel Naber
+ */
+public abstract class EnglishRule extends Rule {
+
+}
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/en/EnglishUnpairedBracketsRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/en/EnglishUnpairedBracketsRule.java
new file mode 100644
index 0000000..4b32c05
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/en/EnglishUnpairedBracketsRule.java
@@ -0,0 +1,89 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2010 Daniel Naber (http://www.languagetool.org)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+
+package de.danielnaber.languagetool.rules.en;
+
+import java.util.ResourceBundle;
+import java.util.regex.Pattern;
+
+import de.danielnaber.languagetool.AnalyzedTokenReadings;
+import de.danielnaber.languagetool.Language;
+import de.danielnaber.languagetool.rules.GenericUnpairedBracketsRule;
+
+/**
+ * Unpaired brackets rule for English: configures the English start and end
+ * symbols and adds exceptions for uses of <code>"</code> and <code>'</code>
+ * that are not quotation marks (inches, possessives, contractions).
+ */
+public class EnglishUnpairedBracketsRule extends GenericUnpairedBracketsRule {
+
+  private static final String[] EN_START_SYMBOLS = { "[", "(", "{", "“", "\"", "'" };
+  private static final String[] EN_END_SYMBOLS = { "]", ")", "}", "”", "\"", "'" };
+
+  /** Matches tokens that consist of digits only. */
+  private static final Pattern NUMBER = Pattern.compile("\\d+");
+
+  /**
+   * Creates the rule and replaces the start and end symbols with the English ones.
+   *
+   * @param messages resource bundle that is handed on to the superclass
+   * @param language language that is handed on to the superclass
+   */
+  public EnglishUnpairedBracketsRule(final ResourceBundle messages,
+      final Language language) {
+    super(messages, language);
+    startSymbols = EN_START_SYMBOLS;
+    endSymbols = EN_END_SYMBOLS;
+  }
+
+  @Override
+  public String getId() {
+    return "EN_UNPAIRED_BRACKETS";
+  }
+
+  /**
+   * Tells whether the symbol at position <code>i</code> is a regular symbol
+   * (<code>true</code>) or one of the English exceptions (<code>false</code>).
+   * The exceptions are: inches (<code>20"</code>), the apostrophe tagged
+   * <code>POS</code>, a dropped "g" (<code>puttin'</code>) and <code>'em</code>.
+   *
+   * @param token the symbol to check
+   * @param tokens the tokens that contain the symbol
+   * @param i index of the symbol in <code>tokens</code>; for <code>i &lt;= 1</code>
+   *        no exception applies
+   * @param j not used by this implementation
+   * @param precSpace presumably whether whitespace precedes the symbol
+   *        (TODO confirm against the superclass)
+   * @param follSpace presumably whether whitespace follows the symbol
+   *        (TODO confirm against the superclass)
+   * @return <code>false</code> if the symbol is one of the exceptions,
+   *         <code>true</code> otherwise
+   */
+  @Override
+  protected boolean isNoException(final String token,
+      final AnalyzedTokenReadings[] tokens, final int i, final int j, final boolean precSpace,
+      final boolean follSpace) {
+
+    //TODO: add an', o', 'till, 'tain't, 'cept, 'fore in the disambiguator
+    //and mark up as contractions somehow
+    // add exception for dates like '52
+
+    if (i <= 1) {
+      return true;
+    }
+
+    final boolean isApostrophe = "'".equals(token);
+
+    if (!precSpace && follSpace) {
+      // exception for English inches, e.g., 20"
+      if ("\"".equals(token)
+          && NUMBER.matcher(tokens[i - 1].getToken()).matches()) {
+        return false;
+      }
+      // Exception for English plural Saxon genitive
+      // current disambiguation scheme is a bit too greedy
+      // for adjectives
+      if (isApostrophe && tokens[i].hasPosTag("POS")) {
+        return false;
+      }
+      // puttin' on the Ritz
+      if (isApostrophe && tokens[i - 1].hasPosTag("VBG")
+          && tokens[i - 1].getToken().endsWith("in")) {
+        return false;
+      }
+    }
+    if (precSpace && !follSpace) {
+      // hold 'em!
+      if (isApostrophe && i + 1 < tokens.length
+          && "em".equals(tokens[i + 1].getToken())) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+}