diff options
Diffstat (limited to 'JLanguageTool/src/java/de/danielnaber/languagetool/rules/de/AgreementRule.java')
-rw-r--r-- | JLanguageTool/src/java/de/danielnaber/languagetool/rules/de/AgreementRule.java | 405 |
1 files changed, 405 insertions, 0 deletions
/* LanguageTool, a natural language style checker
 * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
 * USA
 */
package de.danielnaber.languagetool.rules.de;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.ResourceBundle;
import java.util.Set;

import de.danielnaber.languagetool.AnalyzedSentence;
import de.danielnaber.languagetool.AnalyzedTokenReadings;
import de.danielnaber.languagetool.JLanguageTool;
import de.danielnaber.languagetool.rules.Category;
import de.danielnaber.languagetool.rules.RuleMatch;
import de.danielnaber.languagetool.tagging.de.AnalyzedGermanToken;
import de.danielnaber.languagetool.tagging.de.AnalyzedGermanTokenReadings;
import de.danielnaber.languagetool.tagging.de.GermanTagger;
import de.danielnaber.languagetool.tagging.de.GermanToken;
import de.danielnaber.languagetool.tagging.de.GermanToken.POSType;
import de.danielnaber.languagetool.tools.StringTools;

/**
 * Simple agreement checker for German noun phrases. Checks agreement in:
 *
 * <ul>
 *  <li>DET/PRO NOUN: e.g. "mein Auto", "der Mann", "die Frau" (correct), "die Haus" (incorrect)</li>
 *  <li>DET/PRO ADJ NOUN: e.g. "der riesige Tisch" (correct), "die riesigen Tisch" (incorrect)</li>
 * </ul>
 *
 * Note that this rule only checks agreement inside the noun phrase, not whether
 * e.g. the correct case is used. For example, "Es ist das Haus dem Mann" is not
 * detected as incorrect.
 *
 * @author Daniel Naber
 */
public class AgreementRule extends GermanRule {

  /** Names of the agreement categories; used as keys in the "omit"/"relax" sets. */
  private static final String KASUS = "Kasus";
  private static final String NUMERUS = "Numerus";
  private static final String GENUS = "Genus";

  /*
   * City names are incoherently tagged in the Morphy data. To avoid
   * false alarms on phrases like "das Berliner Auto" we have to
   * explicitly add these adjective readings to "Berliner" and to all
   * other potential city names:
   */
  private static final String[] ADJ_READINGS = new String[] {
    // singular:
    "ADJ:NOM:SIN:MAS:GRU", "ADJ:NOM:SIN:NEU:GRU", "ADJ:NOM:SIN:FEM:GRU",    // das Berliner Auto
    "ADJ:GEN:SIN:MAS:GRU", "ADJ:GEN:SIN:NEU:GRU", "ADJ:GEN:SIN:FEM:GRU",    // des Berliner Autos
    "ADJ:DAT:SIN:MAS:GRU", "ADJ:DAT:SIN:NEU:GRU", "ADJ:DAT:SIN:FEM:GRU",    // dem Berliner Auto
    "ADJ:AKK:SIN:MAS:GRU", "ADJ:AKK:SIN:NEU:GRU", "ADJ:AKK:SIN:FEM:GRU",    // den Berliner Bewohner
    // plural:
    "ADJ:NOM:PLU:MAS:GRU", "ADJ:NOM:PLU:NEU:GRU", "ADJ:NOM:PLU:FEM:GRU",    // die Berliner Autos
    "ADJ:GEN:PLU:MAS:GRU", "ADJ:GEN:PLU:NEU:GRU", "ADJ:GEN:PLU:FEM:GRU",    // der Berliner Autos
    "ADJ:DAT:PLU:MAS:GRU", "ADJ:DAT:PLU:NEU:GRU", "ADJ:DAT:PLU:FEM:GRU",    // den Berliner Autos
    "ADJ:AKK:PLU:MAS:GRU", "ADJ:AKK:PLU:NEU:GRU", "ADJ:AKK:PLU:FEM:GRU",    // den Berliner Bewohnern
  };

  /** Lower-case words that may introduce a relative clause (see couldBeRelativeClause). */
  private static final Set<String> REL_PRONOUN = new HashSet<String>();
  static {
    REL_PRONOUN.add("der");
    REL_PRONOUN.add("die");
    REL_PRONOUN.add("das");
    REL_PRONOUN.add("dessen");
    REL_PRONOUN.add("deren");
    REL_PRONOUN.add("dem");
    REL_PRONOUN.add("den");
    REL_PRONOUN.add("welche");
    REL_PRONOUN.add("welcher");
    REL_PRONOUN.add("welchen");
    REL_PRONOUN.add("welchem");
    REL_PRONOUN.add("welches");
  }

  /** Lower-case prepositions that may precede a relative pronoun (see couldBeRelativeClause). */
  private static final Set<String> PREPOSITIONS = new HashSet<String>();
  static {
    PREPOSITIONS.add("in");
    PREPOSITIONS.add("auf");
    PREPOSITIONS.add("an");
    PREPOSITIONS.add("ab");
    PREPOSITIONS.add("für");
    PREPOSITIONS.add("zu");
    // TODO: add more
  }

  /*
   * Pronouns that never start an agreement check (compared case-insensitively),
   * each one excluded to avoid a false alarm:
   *  - "dessen": avoids a false alarm on "..., dessen Leiche"
   *  - "unser": avoids a false alarm on "unser Produkt" (TODO: handle properly)
   *  - "etwas", "was": don't have case (TODO: don't just ignore them)
   */
  private static final String[] NON_RELEVANT_PRONOUNS = new String[] {
    "er", "sie", "es", "ich", "du", "dessen", "deren", "sich", "unser",
    "aller", "man", "beiden", "wessen", "a", "alle", "etwas", "was", "wer",
  };

  /**
   * Creates the rule.
   *
   * @param messages resource bundle providing the "category_grammar" text; may be
   *        <code>null</code>, in which case no category is set
   */
  public AgreementRule(final ResourceBundle messages) {
    if (messages != null) {
      super.setCategory(new Category(messages.getString("category_grammar")));
    }
  }

  public String getId() {
    return "DE_AGREEMENT";
  }

  public String getDescription() {
    return "Kongruenz von Nominalphrasen (unvollständig!), z.B. 'mein kleiner(kleines) Haus'";
  }

  /**
   * Scans the sentence for "DET/PRO NOUN" and "DET/PRO ADJ NOUN" sequences and
   * reports those whose parts share no common case/number/gender reading.
   *
   * @param text the analyzed sentence to check
   * @return the matches found, converted via <code>toRuleMatchArray</code>
   */
  public RuleMatch[] match(final AnalyzedSentence text) {
    final List<RuleMatch> ruleMatches = new ArrayList<RuleMatch>();
    final AnalyzedTokenReadings[] tokens = text.getTokensWithoutWhitespace();
    for (int i = 0; i < tokens.length; i++) {
      // defaulting to the first reading
      // TODO: check for all readings and replace GermanTokenReading
      final String posToken = tokens[i].getAnalyzedToken(0).getPOSTag();
      if (posToken != null && posToken.equals(JLanguageTool.SENTENCE_START_TAGNAME)) {
        continue;
      }

      final AnalyzedGermanTokenReadings analyzedToken = (AnalyzedGermanTokenReadings) tokens[i];
      final boolean relevantPronoun = isRelevantPronoun(tokens, i);

      boolean ignore = couldBeRelativeClause(tokens, i);
      if (i > 0) {
        // A fixed locale keeps the comparison independent of the JVM default locale.
        final String prevToken = tokens[i - 1].getToken().toLowerCase(Locale.GERMAN);
        if ((prevToken.equals("der") || prevToken.equals("die") || prevToken.equals("das"))
            && tokens[i].getToken().equals("eine")) {
          // TODO: "der eine Polizist" -> don't ignore, check "der Polizist" instead
          ignore = true;
        }
      }

      // avoid false alarm on "nichts Gutes":
      if (analyzedToken.getToken().equals("nichts")) {
        ignore = true;
      }

      if ((analyzedToken.hasReadingOfType(POSType.DETERMINER) || relevantPronoun) && !ignore) {
        int tokenPos = i + 1;
        if (tokenPos >= tokens.length) {
          break;    // determiner is the last token, nothing left to check
        }
        AnalyzedGermanTokenReadings nextToken = (AnalyzedGermanTokenReadings) tokens[tokenPos];
        nextToken = maybeAddAdjectiveReadings(nextToken, tokens, tokenPos);
        if (nextToken.hasReadingOfType(POSType.ADJEKTIV)) {
          tokenPos = i + 2;
          if (tokenPos >= tokens.length) {
            break;    // adjective is the last token, nothing left to check
          }
          final AnalyzedGermanTokenReadings nextNextToken = (AnalyzedGermanTokenReadings) tokens[tokenPos];
          if (nextNextToken.hasReadingOfType(POSType.NOMEN)) {
            // TODO: add a case (checkAdjNounAgreement) for special cases like "deren",
            // e.g. "deren komisches Geschenke" isn't yet detected as incorrect
            final RuleMatch ruleMatch = checkDetAdjNounAgreement(analyzedToken, nextToken, nextNextToken);
            if (ruleMatch != null) {
              ruleMatches.add(ruleMatch);
            }
          }
        } else if (nextToken.hasReadingOfType(POSType.NOMEN)) {
          final RuleMatch ruleMatch = checkDetNounAgreement(analyzedToken,
              (AnalyzedGermanTokenReadings) tokens[i + 1]);
          if (ruleMatch != null) {
            ruleMatches.add(ruleMatch);
          }
        }
      }
    }
    return toRuleMatchArray(ruleMatches);
  }

  /**
   * Tells whether the token at <code>pos</code> is a pronoun that should start an
   * agreement check. Tokens with a pronoun reading are relevant unless they are
   * the "allem" of "vor allem" or one of {@link #NON_RELEVANT_PRONOUNS}.
   */
  private boolean isRelevantPronoun(AnalyzedTokenReadings[] tokens, int pos) {
    final AnalyzedGermanTokenReadings analyzedToken = (AnalyzedGermanTokenReadings) tokens[pos];
    final String token = tokens[pos].getToken();
    // avoid false alarms:
    if (pos > 0 && tokens[pos - 1].getToken().equalsIgnoreCase("vor")
        && token.equalsIgnoreCase("allem")) {
      return false;
    }
    for (String nonRelevant : NON_RELEVANT_PRONOUNS) {
      if (token.equalsIgnoreCase(nonRelevant)) {
        return false;
      }
    }
    return analyzedToken.hasReadingOfType(POSType.PRONOMEN);
  }

  /**
   * Returns <code>nextToken</code>, or a replacement carrying all
   * {@link #ADJ_READINGS} if the token looks like an adjective derived from a
   * city name; see the comment at ADJ_READINGS.
   *
   * @param nextToken the token following the determiner/pronoun
   * @param tokens all tokens of the sentence
   * @param tokenPos index of <code>nextToken</code> in <code>tokens</code>
   * @throws RuntimeException wrapping an IOException thrown by the tagger lookup
   */
  private AnalyzedGermanTokenReadings maybeAddAdjectiveReadings(AnalyzedGermanTokenReadings nextToken,
      AnalyzedTokenReadings[] tokens, int tokenPos) {
    final String nextTerm = nextToken.getToken();
    // Just a heuristic: nouns and proper nouns that end with "er" are considered
    // city names:
    if (nextTerm.endsWith("er") && tokens.length > tokenPos + 1) {
      final AnalyzedGermanTokenReadings nextNextToken = (AnalyzedGermanTokenReadings) tokens[tokenPos + 1];
      // NOTE(review): a new tagger is created on every call; consider reusing one
      // instance if constructing it turns out to be costly.
      final GermanTagger tagger = new GermanTagger();
      try {
        // look up the term without its "er" suffix, e.g. "Berliner" -> "Berlin":
        final AnalyzedGermanTokenReadings nextATR = tagger.lookup(nextTerm.substring(0, nextTerm.length() - 2));
        final AnalyzedGermanTokenReadings nextNextATR = tagger.lookup(nextNextToken.getToken());
        // "Münchner": special case as cutting off last two characters doesn't produce city name.
        // Otherwise the stem must be a proper noun, or a noun that is followed by a noun
        // (tagging in Morphy for cities is not coherent):
        if ("Münchner".equals(nextTerm)
            || (nextATR != null
                && (nextATR.hasReadingOfType(POSType.PROPER_NOUN)
                    || (nextATR.hasReadingOfType(POSType.NOMEN)
                        && nextNextATR != null && nextNextATR.hasReadingOfType(POSType.NOMEN))))) {
          final AnalyzedGermanToken[] adjReadings = new AnalyzedGermanToken[ADJ_READINGS.length];
          for (int j = 0; j < ADJ_READINGS.length; j++) {
            adjReadings[j] = new AnalyzedGermanToken(nextTerm, ADJ_READINGS[j], null);
          }
          nextToken = new AnalyzedGermanTokenReadings(adjReadings, nextToken.getStartPos());
        }
      } catch (IOException e) {
        throw new RuntimeException(e);
      }
    }
    return nextToken;
  }

  /**
   * Tells whether the token at <code>pos</code> might start a relative clause,
   * i.e. it is a relative pronoun preceded by a comma, or by a comma plus a
   * known preposition.
   */
  // TODO: improve this so it only returns true for real relative clauses
  private boolean couldBeRelativeClause(AnalyzedTokenReadings[] tokens, int pos) {
    if (pos < 1) {
      return false;
    }
    // A fixed locale keeps the set lookups independent of the JVM default locale.
    final String term = tokens[pos].getToken().toLowerCase(Locale.GERMAN);
    final boolean relPronoun = REL_PRONOUN.contains(term);
    // avoid false alarm: "Das Wahlrecht, das Frauen zugesprochen bekamen." etc:
    if (relPronoun && tokens[pos - 1].getToken().equals(",")) {
      return true;
    }
    if (pos >= 2) {
      // avoid false alarm: "Der Mann, in dem quadratische Fische schwammen."
      final boolean comma = tokens[pos - 2].getToken().equals(",");
      final String prevTerm = tokens[pos - 1].getToken().toLowerCase(Locale.GERMAN);
      final boolean prep = PREPOSITIONS.contains(prevTerm);
      return comma && prep && relPronoun;
    }
    return false;
  }

  /**
   * Checks a determiner/pronoun followed by a noun.
   *
   * @return a match spanning both tokens if they share no agreement category,
   *         <code>null</code> otherwise
   */
  private RuleMatch checkDetNounAgreement(final AnalyzedGermanTokenReadings token1,
      final AnalyzedGermanTokenReadings token2) {
    // avoid false alarm: "Gebt ihm Macht."
    if (token1.getToken().equalsIgnoreCase("ihm")) {
      return null;
    }
    final Set<String> set1 = getAgreementCategories(token1);
    if (set1 == null) {
      return null;  // word not known, assume it's correct
    }
    final Set<String> set2 = getAgreementCategories(token2);
    if (set2 == null) {
      return null;
    }
    set1.retainAll(set2);
    if (!set1.isEmpty()) {
      return null;
    }
    // TODO: better error message than just 'agreement error'
    final String msg = "Möglicherweise fehlende Übereinstimmung (Kongruenz) zwischen Artikel und Nomen " +
          "bezüglich Kasus, Numerus oder Genus. Beispiel: 'meine Haus' statt 'mein Haus'";
    return new RuleMatch(this, token1.getStartPos(),
        token2.getStartPos() + token2.getToken().length(), msg);
  }

  /**
   * Checks a determiner/pronoun followed by an adjective and a noun.
   *
   * @return a match spanning all three tokens if they share no agreement
   *         category, <code>null</code> otherwise
   */
  private RuleMatch checkDetAdjNounAgreement(final AnalyzedGermanTokenReadings token1,
      final AnalyzedGermanTokenReadings token2, final AnalyzedGermanTokenReadings token3) {
    final Set<String> relax = new HashSet<String>();
    final Set<String> set = retainCommonCategories(token1, token2, token3, relax);
    if (set == null) {
      // retainCommonCategories signals "word not known" with null; assume it's correct
      // (same policy as checkDetNounAgreement) instead of failing on the null set.
      return null;
    }
    if (!set.isEmpty()) {
      return null;
    }
    // TODO: more detailed error message: re-run retainCommonCategories with exactly one
    // of KASUS, NUMERUS, GENUS added to 'relax'; a non-empty result then shows which
    // category causes the disagreement.
    final String msg = "Möglicherweise fehlende Übereinstimmung (Kongruenz) zwischen Artikel, Adjektiv und " +
          "Nomen bezüglich Kasus, Numerus oder Genus. Beispiel: 'mein kleiner Haus' " +
          "statt 'mein kleines Haus'";
    return new RuleMatch(this, token1.getStartPos(),
        token3.getStartPos() + token3.getToken().length(), msg);
  }

  /**
   * Intersects the agreement categories of the three tokens.
   *
   * @param relax names of categories (KASUS, NUMERUS, GENUS) to leave out of the comparison
   * @return the categories common to all three tokens, or <code>null</code> if the
   *         categories of one token are not available
   */
  private Set<String> retainCommonCategories(final AnalyzedGermanTokenReadings token1,
      final AnalyzedGermanTokenReadings token2, final AnalyzedGermanTokenReadings token3,
      Set<String> relax) {
    final Set<String> set1 = getAgreementCategories(token1, relax);
    if (set1 == null) {
      return null;  // word not known, assume it's correct
    }
    final Set<String> set2 = getAgreementCategories(token2, relax);
    if (set2 == null) {
      return null;
    }
    final Set<String> set3 = getAgreementCategories(token3, relax);
    if (set3 == null) {
      return null;
    }
    set1.retainAll(set2);
    set1.retainAll(set3);
    return set1;
  }

  /** Return Kasus, Numerus, Genus of all readings, with no category omitted. */
  private Set<String> getAgreementCategories(final AnalyzedGermanTokenReadings aToken) {
    return getAgreementCategories(aToken, new HashSet<String>());
  }

  /**
   * Return Kasus, Numerus, Genus.
   *
   * @param aToken the token whose German readings are inspected
   * @param omit names of categories (KASUS, NUMERUS, GENUS) to leave out of the result strings
   * @return one string per reading, as built by makeString; readings that have neither
   *         case nor number nor gender are skipped
   */
  private Set<String> getAgreementCategories(final AnalyzedGermanTokenReadings aToken, Set<String> omit) {
    final Set<String> set = new HashSet<String>();
    final List<AnalyzedGermanToken> readings = aToken.getGermanReadings();
    for (AnalyzedGermanToken reading : readings) {
      if (reading.getCasus() == null && reading.getNumerus() == null &&
          reading.getGenus() == null) {
        continue;
      }
      if (reading.getGenus() == null) {
        // "ich" and "wir" contains genus=ALG in the original data. Not sure if
        // this is allowed, but expand this so "Ich Arbeiter" doesn't get flagged
        // as incorrect:
        set.add(makeString(reading.getCasus(), reading.getNumerus(), GermanToken.Genus.MASKULINUM, omit));
        set.add(makeString(reading.getCasus(), reading.getNumerus(), GermanToken.Genus.FEMININUM, omit));
        set.add(makeString(reading.getCasus(), reading.getNumerus(), GermanToken.Genus.NEUTRUM, omit));
      } else {
        set.add(makeString(reading.getCasus(), reading.getNumerus(), reading.getGenus(), omit));
      }
    }
    return set;
  }

  /**
   * Joins the non-null, non-omitted categories with "/" in the order
   * case, number, gender.
   */
  private String makeString(GermanToken.Kasus casus, GermanToken.Numerus num, GermanToken.Genus gen,
      Set<String> omit) {
    final List<String> l = new ArrayList<String>();
    if (casus != null && !omit.contains(KASUS)) {
      l.add(casus.toString());
    }
    if (num != null && !omit.contains(NUMERUS)) {
      l.add(num.toString());
    }
    if (gen != null && !omit.contains(GENUS)) {
      l.add(gen.toString());
    }
    return StringTools.listToString(l, "/");
  }

  public void reset() {
  }

}