diff options
Diffstat (limited to 'JLanguageTool/src/java/de/danielnaber/languagetool/rules/ro/SimpleReplaceRule.java')
-rw-r--r-- | JLanguageTool/src/java/de/danielnaber/languagetool/rules/ro/SimpleReplaceRule.java | 264 |
1 file changed, 264 insertions, 0 deletions
/* LanguageTool, a natural language style checker
 * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
 * USA
 */
package de.danielnaber.languagetool.rules.ro;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Queue;
import java.util.ResourceBundle;
import java.util.concurrent.ArrayBlockingQueue;

import de.danielnaber.languagetool.AnalyzedSentence;
import de.danielnaber.languagetool.AnalyzedTokenReadings;
import de.danielnaber.languagetool.JLanguageTool;
import de.danielnaber.languagetool.Language;
import de.danielnaber.languagetool.rules.AbstractSimpleReplaceRule;
import de.danielnaber.languagetool.rules.Category;
import de.danielnaber.languagetool.rules.Rule;
import de.danielnaber.languagetool.rules.RuleMatch;
import de.danielnaber.languagetool.tokenizers.Tokenizer;
import de.danielnaber.languagetool.tools.StringTools;

/**
 * A rule that matches words which should not be used and suggests correct ones
 * instead. <br/>
 * Romanian implementation. Loads the list of words from
 * <code>/ro/replace.txt</code>.<br/><br/>
 *
 * Unlike {@link AbstractSimpleReplaceRule}, supports multi-word entries
 * (e.g. "aqua forte" =&gt; "acvaforte").<br/><br/>
 *
 * Note: Merge this into {@link AbstractSimpleReplaceRule} eventually and
 * simply extend from AbstractSimpleReplaceRule.<br/>
 *
 * @author Ionuț Păduraru
 * @version $Id$
 */
public class SimpleReplaceRule extends Rule {

  public static final String ROMANIAN_SIMPLE_REPLACE_RULE = "RO_SIMPLE_REPLACE";

  private static final String FILE_NAME = "/ro/replace.txt";
  private static final String FILE_ENCODING = "utf-8";
  // locale used on case-conversion
  private static Locale roLocale = new Locale("ro");

  // list of maps containing error-correction pairs.
  // the n-th map contains key strings of (n+1) words
  private List<Map<String, String>> wrongWords;

  /**
   * @return the resource path of the file holding the error-correction pairs.
   */
  public final String getFileName() {
    return FILE_NAME;
  }

  /**
   * Creates the rule and loads the error-correction pairs from
   * {@link #getFileName()}.
   *
   * @param messages localized messages used to set the rule category;
   *   may be <code>null</code> (no category is set then).
   * @throws IOException if the word list cannot be read or is malformed.
   */
  public SimpleReplaceRule(final ResourceBundle messages) throws IOException {
    super(messages);
    if (messages != null) {
      super.setCategory(new Category(messages.getString("category_misc")));
    }
    wrongWords = loadWords(JLanguageTool.getDataBroker()
        .getFromRulesDirAsStream(getFileName()));
  }

  public final String getId() {
    return ROMANIAN_SIMPLE_REPLACE_RULE;
  }

  public String getDescription() {
    return "Cuvinte sau grupuri de cuvinte incorecte sau ieșite din uz";
  }

  /**
   * @return the short message shown for each match.
   */
  public String getShort() {
    return "Cuvânt incorect sau ieșit din uz";
  }

  /**
   * @return the text inserted between the matched phrase and its suggestions
   *   when building the match message.
   */
  public String getSuggestion() {
    return " este incorect sau ieșit din uz, folosiți ";
  }

  /**
   * @return the word used to separate multiple suggestions; used only before
   *   the last suggestion, the rest are comma-separated.
   */
  public String getSuggestionsSeparator() {
    return " sau ";
  }

  /**
   * Use case-insensitive matching.
   */
  public boolean isCaseSensitive() {
    return false;
  }

  /**
   * Locale used on case-conversion.
   */
  public Locale getLocale() {
    return roLocale;
  }

  /**
   * @return the character encoding of the word-list file.
   */
  public String getEncoding() {
    return FILE_ENCODING;
  }

  /**
   * @return the word tokenizer used for tokenization on loading words.
   */
  protected Tokenizer getWordTokenizer() {
    return Language.ROMANIAN.getWordTokenizer();
  }

  /**
   * @return the list of wrong words for which this rule can suggest
   *   corrections. The list cannot be modified.
   */
  public List<Map<String, String>> getWrongWords() {
    return wrongWords;
  }

  /**
   * Load the list of words. <br/>
   * Same as {@link AbstractSimpleReplaceRule#loadWords} but allows multiple
   * words per entry. Lines are of the form
   * <code>wrong1|wrong2=correct1|correct2</code>; empty lines and lines
   * starting with <code>#</code> are ignored.
   *
   * @param file the stream to load (read with {@link #getEncoding()}).
   * @return the list of maps containing the error-correction pairs.
   *   <br/>The n-th map contains key strings of (n+1) words.
   * @throws IOException when the file cannot be read or a line is malformed.
   * @see #getWordTokenizer
   */
  private List<Map<String, String>> loadWords(final InputStream file)
      throws IOException {
    final List<Map<String, String>> list = new ArrayList<Map<String, String>>();
    InputStreamReader isr = null;
    BufferedReader br = null;
    try {
      isr = new InputStreamReader(file, getEncoding());
      br = new BufferedReader(isr);
      String line;

      while ((line = br.readLine()) != null) {
        line = line.trim();
        if (line.length() < 1 || line.charAt(0) == '#') { // ignore comments
          continue;
        }
        final String[] parts = line.split("=");
        if (parts.length != 2) {
          throw new IOException("Format error in file "
              + JLanguageTool.getDataBroker().getFromRulesDirAsUrl(getFileName())
              + ", line: " + line);
        }
        final String[] wrongForms = parts[0].split("\\|"); // multiple incorrect forms
        for (String wrongForm : wrongForms) {
          // count the words of this form to pick the right bucket
          int wordCount = 0;
          final List<String> tokens = getWordTokenizer().tokenize(wrongForm);
          for (String token : tokens) {
            if (!StringTools.isWhitespace(token)) {
              wordCount++;
            }
          }
          // grow the bucket list if necessary
          for (int i = list.size(); i < wordCount; i++) {
            list.add(new HashMap<String, String>());
          }
          list.get(wordCount - 1).put(wrongForm, parts[1]);
        }
      }

    } finally {
      if (br != null) {
        br.close();
      }
      if (isr != null) {
        isr.close();
      }
    }
    // seal the result (prevent modification from outside this class)
    final List<Map<String, String>> result = new ArrayList<Map<String, String>>();
    for (Map<String, String> map : list) {
      result.add(Collections.unmodifiableMap(map));
    }
    return Collections.unmodifiableList(result);
  }

  /**
   * Adds <code>token</code> to the bounded queue, evicting the oldest element
   * when the queue is full (ring-buffer behavior).
   */
  private void addToQueue(AnalyzedTokenReadings token,
      Queue<AnalyzedTokenReadings> prevTokens) {
    final boolean inserted = prevTokens.offer(token);
    if (!inserted) {
      prevTokens.poll();
      prevTokens.offer(token);
    }
  }

  public RuleMatch[] match(final AnalyzedSentence text) {
    final List<RuleMatch> ruleMatches = new ArrayList<RuleMatch>();
    // nothing to match; also avoids creating a zero-capacity queue below,
    // which ArrayBlockingQueue rejects with IllegalArgumentException
    if (wrongWords.isEmpty()) {
      return toRuleMatchArray(ruleMatches);
    }
    final AnalyzedTokenReadings[] tokens = text.getTokensWithoutWhitespace();

    // sliding window over the last N tokens, N = longest known wrong phrase
    final Queue<AnalyzedTokenReadings> prevTokens =
        new ArrayBlockingQueue<AnalyzedTokenReadings>(wrongWords.size());

    for (int i = 1; i < tokens.length; i++) {
      addToQueue(tokens[i], prevTokens);
      final StringBuilder sb = new StringBuilder();
      final List<String> variants = new ArrayList<String>();
      final List<AnalyzedTokenReadings> prevTokensList = Arrays
          .asList(prevTokens.toArray(new AnalyzedTokenReadings[] {}));
      // build candidate phrases ending at the current token, re-inserting a
      // single space where the original text had whitespace between tokens
      for (int j = prevTokensList.size() - 1; j >= 0; j--) {
        if (j != prevTokensList.size() - 1
            && prevTokensList.get(j + 1).isWhitespaceBefore()) {
          sb.insert(0, " ");
        }
        sb.insert(0, prevTokensList.get(j).getToken());
        variants.add(0, sb.toString());
      }
      final int len = variants.size(); // prevTokensList and variants have the same length
      for (int j = 0; j < len; j++) { // longest phrases first
        final String crt = variants.get(j);
        final int crtWordCount = len - j;
        final String crtMatch = isCaseSensitive()
            ? wrongWords.get(crtWordCount - 1).get(crt)
            : wrongWords.get(crtWordCount - 1).get(crt.toLowerCase(getLocale()));
        if (crtMatch != null) {
          // mutable copy — Arrays.asList() is fixed-size and backed by the array
          final List<String> replacements =
              new ArrayList<String>(Arrays.asList(crtMatch.split("\\|")));
          // adjust case BEFORE building the message so the <suggestion> tags
          // in the message agree with the suggested replacements
          if (!isCaseSensitive() && StringTools.startsWithUppercase(crt)) {
            for (int k = 0; k < replacements.size(); k++) {
              replacements.set(k, StringTools.uppercaseFirstChar(replacements.get(k)));
            }
          }
          String msg = crt + getSuggestion();
          for (int k = 0; k < replacements.size(); k++) {
            if (k > 0) {
              msg = msg + (k == replacements.size() - 1
                  ? getSuggestionsSeparator() : ", ");
            }
            msg += "<suggestion>" + replacements.get(k) + "</suggestion>";
          }
          final int startPos = prevTokensList.get(len - crtWordCount).getStartPos();
          final int endPos = prevTokensList.get(len - 1).getStartPos()
              + prevTokensList.get(len - 1).getToken().length();
          final RuleMatch potentialRuleMatch =
              new RuleMatch(this, startPos, endPos, msg, getShort());
          potentialRuleMatch.setSuggestedReplacements(replacements);
          ruleMatches.add(potentialRuleMatch);
          break; // keep only the longest match ending at this token
        }
      }
    }
    return toRuleMatchArray(ruleMatches);
  }

  public void reset() {
    // nothing to reset: the rule keeps no per-sentence state
  }

}