diff options
Diffstat (limited to 'JLanguageTool/src/java/de/danielnaber/languagetool/rules/AbstractCompoundRule.java')
-rw-r--r-- | JLanguageTool/src/java/de/danielnaber/languagetool/rules/AbstractCompoundRule.java | 279 |
1 files changed, 279 insertions, 0 deletions
/* LanguageTool, a natural language style checker
 * Copyright (C) 2006 Daniel Naber (http://www.danielnaber.de)
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
 * USA
 */
package de.danielnaber.languagetool.rules;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Queue;
import java.util.ResourceBundle;
import java.util.Set;
import java.util.concurrent.ArrayBlockingQueue;

import de.danielnaber.languagetool.AnalyzedSentence;
import de.danielnaber.languagetool.AnalyzedToken;
import de.danielnaber.languagetool.AnalyzedTokenReadings;
import de.danielnaber.languagetool.tools.StringTools;

/**
 * Checks that compounds (if in the list) are not written as separate words.
 *
 * <p>The list of compounds is filled by {@link #loadCompoundFile(InputStream, String)};
 * the messages attached to the matches are set with
 * {@link #setMsg(String, String, String)} and {@link #setShort(String)}.
 *
 * @author Daniel Naber &amp; Marcin Miłkowski (refactoring)
 */
public abstract class AbstractCompoundRule extends Rule {

  /** Maximum number of words a compound may consist of; also the size of the token window. */
  private static final int MAX_TERMS = 5;

  /** Lower-cased, space-separated (i.e. incorrect) spellings of all known compounds. */
  private final Set<String> incorrectCompounds = new HashSet<String>();
  /** Compounds marked with a trailing "+" in the compound file: no hyphenated suggestion. */
  private final Set<String> noDashSuggestion = new HashSet<String>();
  /** Compounds marked with a trailing "*" in the compound file: no merged-word suggestion. */
  private final Set<String> onlyDashSuggestion = new HashSet<String>();

  private String withHyphenMessage;
  private String asOneMessage;
  private String withOrWithoutHyphenMessage;

  private String shortDesc;

  /** Compounds with more than maxUnHyphenatedWordCount parts should always use hyphens */
  private int maxUnHyphenatedWordCount = 2;

  /**
   * Flag to indicate if the hyphen is ignored in the text entered by the user.
   * Set this to false if you want the rule to offer suggestions for words like
   * [ro] "câte-și-trei" (with hyphen), not only for "câte și trei" (with spaces).
   * This is only available for languages with hyphen as a word separator
   * (i.e. not available for English, available for Romanian).
   * See Language.getWordTokenizer()
   */
  private boolean hyphenIgnored = true;

  /**
   * Creates the rule and, if a bundle is given, assigns it to the category named by
   * the bundle's "category_misc" entry.
   *
   * @param messages the resource bundle to read the category name from; may be null,
   *        in which case no category is set here
   * @throws IOException declared for subclasses; not thrown by this constructor itself
   */
  public AbstractCompoundRule(final ResourceBundle messages) throws IOException {
    if (messages != null) {
      super.setCategory(new Category(messages.getString("category_misc")));
    }
  }

  public abstract String getId();

  public abstract String getDescription();

  /**
   * Sets the short description that is attached to every match of this rule.
   *
   * @param shortDescription the short description
   */
  public void setShort(final String shortDescription) {
    shortDesc = shortDescription;
  }

  /**
   * Sets the messages used for the three kinds of suggestions.
   *
   * @param withHyphenMessage message used when only the hyphenated spelling is suggested
   * @param asOneMessage message used when only the merged (one word) spelling is suggested
   * @param withHyphenOrNotMessage message used when both spellings (or none) are suggested
   */
  public void setMsg(final String withHyphenMessage, final String asOneMessage,
      final String withHyphenOrNotMessage) {
    this.withHyphenMessage = withHyphenMessage;
    this.asOneMessage = asOneMessage;
    withOrWithoutHyphenMessage = withHyphenOrNotMessage;
  }

  /**
   * @return whether hyphens in the checked text are treated like spaces
   */
  public boolean isHyphenIgnored() {
    return hyphenIgnored;
  }

  /**
   * @param ignoreHyphen true to treat hyphens in the checked text like spaces
   */
  public void setHyphenIgnored(boolean ignoreHyphen) {
    this.hyphenIgnored = ignoreHyphen;
  }

  /**
   * @return the maximum number of parts for which the merged (one word) spelling is suggested
   */
  public int getMaxUnHyphenatedWordCount() {
    return maxUnHyphenatedWordCount;
  }

  /**
   * @param maxNoHyphensSize the maximum number of parts for which the merged (one word)
   *        spelling is still suggested
   */
  public void setMaxUnHyphenatedWordCount(int maxNoHyphensSize) {
    this.maxUnHyphenatedWordCount = maxNoHyphensSize;
  }

  /**
   * Looks for sequences of up to {@link #MAX_TERMS} tokens that, joined by spaces, are
   * contained in the list of incorrect compound spellings, and suggests the hyphenated
   * and/or merged spelling for them.
   *
   * @param text the sentence to check
   * @return the matches found; empty if there are none
   */
  public RuleMatch[] match(final AnalyzedSentence text) {
    final List<RuleMatch> ruleMatches = new ArrayList<RuleMatch>();
    final AnalyzedTokenReadings[] tokens = text.getTokensWithoutWhitespace();
    if (tokens.length == 0) {
      // nothing to check; without this guard the padding branch below would call
      // peek() on an empty queue and fail with a NullPointerException
      return toRuleMatchArray(ruleMatches);
    }

    RuleMatch prevRuleMatch = null;
    // sliding window over the tokens that precede the current one:
    final Queue<AnalyzedTokenReadings> prevTokens =
        new ArrayBlockingQueue<AnalyzedTokenReadings>(MAX_TERMS);
    for (int i = 0; i < tokens.length + MAX_TERMS - 1; i++) {
      final AnalyzedTokenReadings token;
      // we need to extend the token list so we find matches at the end of the original list:
      if (i >= tokens.length) {
        token = new AnalyzedTokenReadings(new AnalyzedToken("", "", null),
            prevTokens.peek().getStartPos());
      } else {
        token = tokens[i];
      }
      if (i == 0) {
        addToQueue(token, prevTokens);
        continue;
      }

      final StringBuilder sb = new StringBuilder();
      int j = 0;
      AnalyzedTokenReadings firstMatchToken = null;
      final List<String> stringsToCheck = new ArrayList<String>();
      final List<String> origStringsToCheck = new ArrayList<String>(); // original upper/lowercase spelling
      final Map<String, AnalyzedTokenReadings> stringToToken =
          new HashMap<String, AnalyzedTokenReadings>();
      // build all candidate strings that start at the oldest token in the window:
      for (AnalyzedTokenReadings atr : prevTokens) {
        if (j == 0) {
          firstMatchToken = atr;
        }
        sb.append(' ');
        sb.append(atr.getToken());
        if (j >= 1) {
          final String stringToCheck = normalize(sb.toString());
          stringsToCheck.add(stringToCheck);
          origStringsToCheck.add(sb.toString().trim());
          // keep the first token that produced this string, so that tokens which do not
          // change the normalized string (e.g. the empty padding tokens) do not move the
          // end of the match:
          if (!stringToToken.containsKey(stringToCheck)) {
            stringToToken.put(stringToCheck, atr);
          }
        }
        j++;
      }
      // iterate backwards over all potentially incorrect strings to make
      // sure we match longer strings first:
      for (int k = stringsToCheck.size() - 1; k >= 0; k--) {
        final String stringToCheck = stringsToCheck.get(k);
        final String origStringToCheck = origStringsToCheck.get(k);
        if (incorrectCompounds.contains(stringToCheck)) {
          final AnalyzedTokenReadings atr = stringToToken.get(stringToCheck);
          String msg = null;
          final List<String> replacement = new ArrayList<String>();
          if (!noDashSuggestion.contains(stringToCheck)) {
            replacement.add(origStringToCheck.replace(' ', '-'));
            msg = withHyphenMessage;
          }
          // assume that compounds with more than maxUnHyphenatedWordCount (default: two)
          // parts should always use hyphens:
          if (!hasAllUppercaseParts(origStringToCheck)
              && countParts(stringToCheck) <= getMaxUnHyphenatedWordCount()
              && !onlyDashSuggestion.contains(stringToCheck)) {
            replacement.add(mergeCompound(origStringToCheck));
            msg = asOneMessage;
          }
          final String[] parts = stringToCheck.split(" ");
          if (parts.length > 0 && parts[0].length() == 1) {
            // a single-character first part: only the hyphenated spelling is suggested
            replacement.clear();
            replacement.add(origStringToCheck.replace(' ', '-'));
            msg = withHyphenMessage;
          } else if (replacement.isEmpty() || replacement.size() == 2) { // isEmpty shouldn't happen
            msg = withOrWithoutHyphenMessage;
          }
          final RuleMatch ruleMatch = new RuleMatch(this, firstMatchToken.getStartPos(),
              atr.getStartPos() + atr.getToken().length(), msg, shortDesc);
          // avoid duplicate matches:
          if (prevRuleMatch != null && prevRuleMatch.getFromPos() == ruleMatch.getFromPos()) {
            prevRuleMatch = ruleMatch;
            break;
          }
          prevRuleMatch = ruleMatch;
          ruleMatch.setSuggestedReplacements(replacement);
          ruleMatches.add(ruleMatch);
          break;
        }
      }
      addToQueue(token, prevTokens);
    }
    return toRuleMatchArray(ruleMatches);
  }

  /**
   * Trims and lower-cases the given string and, if it contains both a hyphen and a space,
   * removes the hyphens so it can be looked up in the set of incorrect compounds.
   *
   * @param inStr the space-joined token texts
   * @return the normalized string
   */
  private String normalize(final String inStr) {
    // NOTE(review): toLowerCase() uses the default locale here and in loadCompoundFile()
    String str = inStr.trim().toLowerCase();
    if (str.indexOf('-') != -1 && str.indexOf(' ') != -1) {
      if (isHyphenIgnored()) {
        // e.g. "E-Mail Adresse" -> "E Mail Adresse" so the error can be detected:
        str = str.replace('-', ' ');
      } else {
        str = str.replace(" - ", " ");
      }
    }
    return str;
  }

  /**
   * Tells whether at least one of the space-separated parts of the given string is
   * written completely in upper case (despite the method name, one such part is enough).
   *
   * @param str the space-separated parts to inspect
   * @return true if any inspected part is all upper case
   */
  private boolean hasAllUppercaseParts(final String str) {
    final String[] parts = str.split(" ");
    for (String part : parts) {
      if (isHyphenIgnored() || !"-".equals(part)) { // do not treat '-' as an upper-case word
        if (StringTools.isAllUppercase(part)) {
          return true;
        }
      }
    }
    return false;
  }

  /**
   * @param str a space-separated string
   * @return the number of elements {@code str.split(" ")} yields
   */
  private int countParts(final String str) {
    return str.split(" ").length;
  }

  /**
   * Joins the space-separated parts of the given string into one word. The first part
   * keeps its spelling, all following parts are lower-cased. If hyphens are not ignored,
   * parts that consist of a single "-" are left out.
   *
   * @param str the space-separated parts
   * @return the merged word
   */
  private String mergeCompound(final String str) {
    final String[] stringParts = str.split(" ");
    final StringBuilder sb = new StringBuilder();
    for (int k = 0; k < stringParts.length; k++) {
      if (isHyphenIgnored() || !"-".equals(stringParts[k])) {
        if (k == 0) {
          sb.append(stringParts[k]);
        } else {
          sb.append(stringParts[k].toLowerCase());
        }
      }
    }
    return sb.toString();
  }

  /**
   * Appends the token to the window; if the window is full, its oldest token is dropped first.
   *
   * @param token the token to append
   * @param prevTokens the bounded window of previous tokens
   */
  private void addToQueue(final AnalyzedTokenReadings token,
      final Queue<AnalyzedTokenReadings> prevTokens) {
    final boolean inserted = prevTokens.offer(token);
    if (!inserted) {
      prevTokens.poll();
      prevTokens.offer(token);
    }
  }

  /**
   * Reads the list of compounds, one per line. Empty lines and lines starting with "#"
   * are skipped. Hyphens are replaced by spaces, and a trailing "+" (no hyphenated
   * suggestion) or "*" (no merged suggestion) is cut off before the lower-cased line is stored.
   *
   * @param file the stream to read from; it is closed when this method returns
   * @param encoding the name of the character encoding of the stream
   * @throws IOException if reading fails, if a line has more than {@link #MAX_TERMS} parts,
   *         or if a line consists of a single part only
   */
  public void loadCompoundFile(final InputStream file, final String encoding) throws IOException {
    InputStreamReader isr = null;
    BufferedReader br = null;
    try {
      isr = new InputStreamReader(file, encoding);
      br = new BufferedReader(isr);
      String line;
      while ((line = br.readLine()) != null) {
        line = line.trim();
        if (line.length() < 1) {
          continue;
        }
        if (line.charAt(0) == '#') { // ignore comments
          continue;
        }
        // the set contains the incorrect spellings, i.e. the ones without hyphen
        line = line.replace('-', ' ');
        final String[] parts = line.split(" ");
        if (parts.length > MAX_TERMS) {
          throw new IOException("Too many compound parts: " + line + ", maximum allowed: " + MAX_TERMS);
        }
        if (parts.length == 1) {
          throw new IOException("Not a compound: " + line);
        }
        if (line.endsWith("+")) {
          line = line.substring(0, line.length() - 1); // cut off "+"
          noDashSuggestion.add(line.toLowerCase());
        } else if (line.endsWith("*")) {
          line = line.substring(0, line.length() - 1); // cut off "*"
          onlyDashSuggestion.add(line.toLowerCase());
        }
        incorrectCompounds.add(line.toLowerCase());
      }
    } finally {
      if (br != null) {
        br.close();
      }
      if (isr != null) {
        isr.close();
      }
    }
  }

  /** Does nothing. */
  public void reset() {
  }

}