diff options
Diffstat (limited to 'JLanguageTool/src/java/de/danielnaber/languagetool/rules/pl')
5 files changed, 410 insertions, 0 deletions
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/pl/CompoundRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/pl/CompoundRule.java new file mode 100644 index 0000000..6d2ff17 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/pl/CompoundRule.java @@ -0,0 +1,55 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2006 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.rules.pl; + +import java.io.IOException; +import java.util.ResourceBundle; + +import de.danielnaber.languagetool.JLanguageTool; +import de.danielnaber.languagetool.rules.AbstractCompoundRule; + +/** + * Checks that compounds (if in the list) are not written as separate words. + * + * @author Marcin Miłkowski, based on code by Daniel Naber + */ + +public final class CompoundRule extends AbstractCompoundRule { + + private static final String FILE_NAME = "/pl/compounds.txt"; + + public CompoundRule(final ResourceBundle messages) throws IOException { + super(messages); + loadCompoundFile(JLanguageTool.getDataBroker().getFromResourceDirAsStream(FILE_NAME), "UTF-8"); + super.setShort("Brak łącznika lub zbędny łącznik"); + super.setMsg("Ten wyraz pisze się z łącznikiem.", + "Ten wyraz pisze się razem (bez spacji ani łącznika).", + "Ten wyraz pisze się z łącznikiem lub bez niego."); + } + + public final String getId() { + return "PL_COMPOUNDS"; + } + + public final String getDescription() { + return "Sprawdza wyrazy z łącznikiem, np. „łapu capu” zamiast „łapu-capu”"; + } + + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/pl/PolishRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/pl/PolishRule.java new file mode 100644 index 0000000..0a6f01b --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/pl/PolishRule.java @@ -0,0 +1,31 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.rules.pl; + +import de.danielnaber.languagetool.rules.Rule; + +/** + * Abstract base class for Polish rules. + * + * @author Marcin Miłkowski + * + */ +public abstract class PolishRule extends Rule { + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/pl/PolishUnpairedBracketsRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/pl/PolishUnpairedBracketsRule.java new file mode 100644 index 0000000..3b83133 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/pl/PolishUnpairedBracketsRule.java @@ -0,0 +1,42 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2010 Marcin Miłkowski (http://www.languagetool.org) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ + +package de.danielnaber.languagetool.rules.pl; + +import java.util.ResourceBundle; + +import de.danielnaber.languagetool.Language; +import de.danielnaber.languagetool.rules.GenericUnpairedBracketsRule; + +public class PolishUnpairedBracketsRule extends GenericUnpairedBracketsRule { + + private static final String[] PL_START_SYMBOLS = { "[", "(", "{", "„", "»", "\"" }; + private static final String[] PL_END_SYMBOLS = { "]", ")", "}", "”", "«", "\"" }; + + public PolishUnpairedBracketsRule(final ResourceBundle messages, + final Language language) { + super(messages, language); + startSymbols = PL_START_SYMBOLS; + endSymbols = PL_END_SYMBOLS; + } + + public String getId() { + return "PL_UNPAIRED_BRACKETS"; + } +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/pl/PolishWordRepeatRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/pl/PolishWordRepeatRule.java new file mode 100644 index 0000000..a7dbb5e --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/pl/PolishWordRepeatRule.java @@ -0,0 +1,200 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.rules.pl; + +import java.util.ArrayList; +import java.util.List; +import java.util.ResourceBundle; +import java.util.TreeSet; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import de.danielnaber.languagetool.AnalyzedSentence; +import de.danielnaber.languagetool.AnalyzedTokenReadings; +import de.danielnaber.languagetool.rules.Category; +import de.danielnaber.languagetool.rules.RuleMatch; +import de.danielnaber.languagetool.tools.StringTools; + +/** + * @author Marcin Miłkowski + * + * Rule for detecting same words in the sentence but not just in a row + * + */ +public class PolishWordRepeatRule extends PolishRule { + + /** + * Excluded dictionary words. + */ + private static final Pattern EXC_WORDS = Pattern + .compile("nie|tuż|aż|to|siebie|być|ani|ni|albo|" + + "lub|czy|bądź|jako|zł|np|coraz" + + "|bardzo|bardziej|proc|ten|jak|mln|tys|swój|mój|" + + "twój|nasz|wasz|i|zbyt"); + + /** + * Excluded part of speech classes. + */ + private static final Pattern EXC_POS = Pattern.compile("prep:.*|ppron.*"); + + /** + * Excluded non-words (special symbols, Roman numerals etc. + */ + private static final Pattern EXC_NONWORDS = Pattern + .compile(""|>|<|&|[0-9].*|" + + "M*(D?C{0,3}|C[DM])(L?X{0,3}|X[LC])(V?I{0,3}|I[VX])$"); + + public PolishWordRepeatRule(final ResourceBundle messages) { + if (messages != null) { + super.setCategory(new Category(messages.getString("category_misc"))); + } + setDefaultOff(); + } + + /* + * (non-Javadoc) + * + * @see de.danielnaber.languagetool.rules.Rule#getId() + */ + @Override + public final String getId() { + return "PL_WORD_REPEAT"; + } + + /* + * (non-Javadoc) + * + * @see de.danielnaber.languagetool.rules.Rule#getDescription() + */ + @Override + public final String getDescription() { + return "Powtórzenia wyrazów w zdaniu (monotonia stylistyczna)"; + } + + /* + * Tests if any word form is repeated in the sentence. + */ + @Override + public final RuleMatch[] match(final AnalyzedSentence text) { + final List<RuleMatch> ruleMatches = new ArrayList<RuleMatch>(); + final AnalyzedTokenReadings[] tokens = text.getTokensWithoutWhitespace(); + boolean repetition = false; + final TreeSet<String> inflectedWords = new TreeSet<String>(); + String prevLemma, curLemma; + // start from real token, 0 = SENT_START + for (int i = 1; i < tokens.length; i++) { + final String token = tokens[i].getToken(); + // avoid "..." etc. to be matched: + boolean isWord = true; + boolean hasLemma = true; + + if (token.length() < 2) { + isWord = false; + } + + final int readingsLen = tokens[i].getReadingsLength(); + for (int k = 0; k < readingsLen; k++) { + final String posTag = tokens[i].getAnalyzedToken(k).getPOSTag(); + if (posTag != null) { + if (StringTools.isEmpty(posTag)) { + isWord = false; + break; + } + // FIXME: too many false alarms here: + final String lemma = tokens[i].getAnalyzedToken(k).getLemma(); + if (lemma == null) { + hasLemma = false; + break; + } + final Matcher m1 = EXC_WORDS.matcher(lemma); + if (m1.matches()) { + isWord = false; + break; + } + + final Matcher m2 = EXC_POS.matcher(posTag); + if (m2.matches()) { + isWord = false; + break; + } + } else { + hasLemma = false; + } + + } + + final Matcher m1 = EXC_NONWORDS.matcher(tokens[i].getToken()); + if (m1.matches()) { + isWord = false; + } + + prevLemma = ""; + if (isWord) { + boolean notSentEnd = false; + for (int j = 0; j < readingsLen; j++) { + final String pos = tokens[i].getAnalyzedToken(j).getPOSTag(); + if (pos != null) { + notSentEnd |= "SENT_END".equals(pos); + } + if (hasLemma) { + curLemma = tokens[i].getAnalyzedToken(j).getLemma(); + if (!prevLemma.equals(curLemma) && !notSentEnd) { + if (inflectedWords.contains(curLemma)) { + repetition = true; + } else { + inflectedWords.add(tokens[i].getAnalyzedToken(j).getLemma()); + } + } + prevLemma = curLemma; + } else { + if (inflectedWords.contains(tokens[i].getToken()) && !notSentEnd) { + repetition = true; + } else { + inflectedWords.add(tokens[i].getToken()); + } + } + + } + } + + if (repetition) { + final String msg = "Powtórzony wyraz w zdaniu"; + final int pos = tokens[i].getStartPos(); + final RuleMatch ruleMatch = new RuleMatch(this, pos, pos + + token.length(), msg, "Powtórzenie wyrazu"); + ruleMatches.add(ruleMatch); + repetition = false; + } + + } + return toRuleMatchArray(ruleMatches); + } + + /* + * (non-Javadoc) + * + * @see de.danielnaber.languagetool.rules.Rule#reset() + */ + @Override + public void reset() { + // nothing + + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/pl/SimpleReplaceRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/pl/SimpleReplaceRule.java new file mode 100644 index 0000000..90708d9 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/pl/SimpleReplaceRule.java @@ -0,0 +1,82 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.rules.pl; + +import java.io.IOException; +import java.util.Locale; +import java.util.ResourceBundle; + +import de.danielnaber.languagetool.rules.AbstractSimpleReplaceRule; + +/** + * A rule that matches words or phrases which should not be used and suggests + * correct ones instead. + * + * Polish implementations. Loads the list of words from + * <code>rules/pl/replace.txt</code>. + * + * @author Marcin Miłkowski + */ +public class SimpleReplaceRule extends AbstractSimpleReplaceRule { + + public static final String POLISH_SIMPLE_REPLACE_RULE = "PL_SIMPLE_REPLACE"; + + private static final String FILE_NAME = "/pl/replace.txt"; + // locale used on case-conversion + private static final Locale PL_LOCALE = new Locale("pl"); + + public final String getFileName() { + return FILE_NAME; + } + + public SimpleReplaceRule(final ResourceBundle messages) throws IOException { + super(messages); + } + + public final String getId() { + return POLISH_SIMPLE_REPLACE_RULE; + } + + public String getDescription() { + return "Typowe literówki"; + } + + public String getShort() { + return "Literówka"; + } + + public String getSuggestion() { + return " to typowa literówka, poprawnie: "; + } + + /** + * use case-insensitive matching. + */ + public boolean isCaseSensitive() { + return false; + } + + /** + * locale used on case-conversion + */ + public Locale getLocale() { + return PL_LOCALE; + } + +} |