diff options
Diffstat (limited to 'JLanguageTool/src/java/de/danielnaber/languagetool/rules/sv/CompoundRule.java')
-rw-r--r-- | JLanguageTool/src/java/de/danielnaber/languagetool/rules/sv/CompoundRule.java | 247 |
1 files changed, 247 insertions, 0 deletions
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/sv/CompoundRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/sv/CompoundRule.java new file mode 100644 index 0000000..b3087cd --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/sv/CompoundRule.java @@ -0,0 +1,247 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2006 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.rules.sv; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Queue; +import java.util.ResourceBundle; +import java.util.Set; +import java.util.concurrent.ArrayBlockingQueue; + +import de.danielnaber.languagetool.AnalyzedSentence; +import de.danielnaber.languagetool.AnalyzedToken; +import de.danielnaber.languagetool.AnalyzedTokenReadings; +import de.danielnaber.languagetool.JLanguageTool; +import de.danielnaber.languagetool.rules.Category; +import de.danielnaber.languagetool.rules.RuleMatch; +import de.danielnaber.languagetool.tools.StringTools; + +/** + * Checks that compounds (if in the list) are not written as separate words. + * + * @author Daniel Naber + */ +public class CompoundRule extends SwedishRule { + //TODO for words with more then one part check if parts of it is compounded. + //in env. allt-i-genom+ should match "allt i genom", "allt igenom" as well as "allti genom" + private static final String FILE_NAME = "/sv/compounds.txt"; + + private final static int MAX_TERMS = 5; + + private final Set<String> incorrectCompounds = new HashSet<String>(); + private final Set<String> noDashSuggestion = new HashSet<String>(); + private final Set<String> onlyDashSuggestion = new HashSet<String>(); + + public CompoundRule(final ResourceBundle messages) throws IOException { + if (messages != null) + super.setCategory(new Category(messages.getString("category_misc"))); + loadCompoundFile(JLanguageTool.getDataBroker().getFromResourceDirAsStream(FILE_NAME), "UTF-8"); + } + + public String getId() { + return "SV_COMPOUNDS"; + } + + public String getDescription() { + return "Särskrivningar, t.ex. 'cd rom' bör skrivas 'cd-rom'"; + } + + public RuleMatch[] match(final AnalyzedSentence text) { + final List<RuleMatch> ruleMatches = new ArrayList<RuleMatch>(); + final AnalyzedTokenReadings[] tokens = text.getTokensWithoutWhitespace(); + + RuleMatch prevRuleMatch = null; + final Queue<AnalyzedTokenReadings> prevTokens = new ArrayBlockingQueue<AnalyzedTokenReadings>(MAX_TERMS); + for (int i = 0; i < tokens.length + MAX_TERMS-1; i++) { + AnalyzedTokenReadings token = null; + // we need to extend the token list so we find matches at the end of the original list: + if (i >= tokens.length) + token = new AnalyzedTokenReadings(new AnalyzedToken("", "", null), prevTokens.peek().getStartPos()); + else + token = tokens[i]; + if (i == 0) { + addToQueue(token, prevTokens); + continue; + } + + final StringBuilder sb = new StringBuilder(); + int j = 0; + AnalyzedTokenReadings firstMatchToken = null; + final List<String> stringsToCheck = new ArrayList<String>(); + final List<String> origStringsToCheck = new ArrayList<String>(); // original upper/lowercase spelling + final Map<String, AnalyzedTokenReadings> stringToToken = new HashMap<String, AnalyzedTokenReadings>(); + for (AnalyzedTokenReadings atr : prevTokens) { + if (j == 0) + firstMatchToken = atr; + sb.append(' '); + sb.append(atr.getToken()); + if (j >= 1) { + final String stringToCheck = normalize(sb.toString()); + stringsToCheck.add(stringToCheck); + origStringsToCheck.add(sb.toString().trim()); + if (!stringToToken.containsKey(stringToCheck)) + stringToToken.put(stringToCheck, atr); + } + j++; + } + // iterate backwards over all potentially incorrect strings to make + // sure we match longer strings first: + for (int k = stringsToCheck.size()-1; k >= 0; k--) { + final String stringToCheck = stringsToCheck.get(k); + final String origStringToCheck = origStringsToCheck.get(k); + //System.err.println("##"+stringtoCheck+"#"); + if (incorrectCompounds.contains(stringToCheck)) { + final AnalyzedTokenReadings atr = stringToToken.get(stringToCheck); + String msg = null; + final List<String> repl = new ArrayList<String>(); + if (!noDashSuggestion.contains(stringToCheck)) { + repl.add(origStringToCheck.replace(' ', '-')); + msg = "Dessa ord skrivs samman med bindesträck."; + } + // Do not assume that compounds with more than two parts should always use hyphens: + if (!hasAllUppercaseParts(origStringToCheck) && !onlyDashSuggestion.contains(stringToCheck)) { + repl.add(mergeCompound(origStringToCheck)); + msg = "Dessa ord skrivs samman."; + } + final String[] parts = stringToCheck.split(" "); + if (parts.length > 0) { + repl.clear(); + repl.add(origStringToCheck.replace(' ', '-')); + msg = "Dessa ord skrivs samman med bindesträck."; + } else if (repl.size() == 0 || repl.size() == 2) { // == 0 shouldn't happen + // did not work as expected so I added repl. explicitly. + msg = "Dessa ord skrivs samman med eller utan bindesträck."; + repl.clear(); + repl.add(origStringToCheck.replace(' ', '-')); + repl.add(mergeCompound(origStringToCheck)); + } + final RuleMatch ruleMatch = new RuleMatch(this, firstMatchToken.getStartPos(), + atr.getStartPos() + atr.getToken().length(), msg); + // avoid duplicate matches: + if (prevRuleMatch != null && prevRuleMatch.getFromPos() == ruleMatch.getFromPos()) { + prevRuleMatch = ruleMatch; + break; + } + prevRuleMatch = ruleMatch; + ruleMatch.setSuggestedReplacements(repl); + ruleMatches.add(ruleMatch); + break; + } + } + addToQueue(token, prevTokens); + } + return toRuleMatchArray(ruleMatches); + } + + /** + * Replaces dashes with whitespace + * e.g. "E-Mail Adresse" -> "E Mail Adresse" so the error can be detected: + * @param str + * @return str + */ + private String normalize(String str) { + str = str.trim().toLowerCase(); + if (str.indexOf('-') != -1 && str.indexOf(' ') != -1) { + // e.g. "E-Mail Adresse" -> "E Mail Adresse" so the error can be detected: + str = str.replace('-', ' '); + } + return str; + } + + private boolean hasAllUppercaseParts(String str) { + final String[] parts = str.split(" "); + for (String part : parts) { + if (StringTools.isAllUppercase(part)) { + return true; + } + } + return false; + } + + private String mergeCompound(String str) { + final String[] stringParts = str.split(" "); + final StringBuilder sb = new StringBuilder(); + for (int k = 0; k < stringParts.length; k++) { + if (k == 0) + sb.append(stringParts[k]); + else + sb.append(stringParts[k].toLowerCase()); + } + return sb.toString(); + } + + private void addToQueue(AnalyzedTokenReadings token, Queue<AnalyzedTokenReadings> prevTokens) { + final boolean inserted = prevTokens.offer(token); + if (!inserted) { + prevTokens.poll(); + prevTokens.offer(token); + } + } + + private void loadCompoundFile(final InputStream file, final String encoding) throws IOException { + InputStreamReader isr = null; + BufferedReader br = null; + try { + isr = new InputStreamReader(file, encoding); + br = new BufferedReader(isr); + String line; + while ((line = br.readLine()) != null) { + line = line.trim(); + if (line.length() < 1) { + continue; + } + if (line.charAt(0) == '#') { // ignore comments + continue; + } + // the set contains the incorrect spellings, i.e. the ones without hyphen + line = line.replace('-', ' '); + final String[] parts = line.split(" "); + if (parts.length > MAX_TERMS) + throw new IOException("För många ord sammansatta: " + line + ", max antal tillåtna ord: " + MAX_TERMS); + if (parts.length == 1) + throw new IOException("Inget sammansatt ord: " + line); + if (line.endsWith("+")) { + line = line.substring(0, line.length() - 1); // cut off "+" + noDashSuggestion.add(line.toLowerCase()); + } else if (line.endsWith("*")) { + line = line.substring(0, line.length() - 1); // cut off "*" + onlyDashSuggestion.add(line.toLowerCase()); + } + incorrectCompounds.add(line.toLowerCase()); + } + } finally { + if (br != null) br.close(); + if (isr != null) isr.close(); + } + } + + public void reset() { + } + +} + + |