/* LanguageTool, a natural language style checker
 * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
 * USA
 */
package de.danielnaber.languagetool.rules.patterns;

import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.regex.Pattern;
import java.util.regex.Matcher;

import junit.framework.TestCase;
import de.danielnaber.languagetool.AnalyzedSentence;
import de.danielnaber.languagetool.JLanguageTool;
import de.danielnaber.languagetool.Language;
import de.danielnaber.languagetool.TestTools;
import de.danielnaber.languagetool.rules.IncorrectExample;
import de.danielnaber.languagetool.rules.Rule;
import de.danielnaber.languagetool.rules.RuleMatch;

/**
 * Tests for {@link PatternRule}: runs the correct/incorrect example sentences
 * of every language's {@code grammar.xml} against its rules, prints warnings
 * about suspicious pattern elements, and exercises hand-built pattern rules.
 *
 * @author Daniel Naber
 */
public class PatternRuleTest extends TestCase {

  /** Shared English instance used by the hand-built rule tests; created lazily in {@link #setUp()}. */
  private static JLanguageTool langTool;

  /** Heuristic: finds text that looks like regular expression syntax. */
  private static final Pattern PROBABLE_REGEX = Pattern.compile("[^\\[\\]\\*\\+\\|\\^\\{\\}\\?][\\[\\]\\*\\+\\|\\^\\{\\}\\?]|\\\\[^0-9]|\\(.+\\)|\\..");

  /** Finds two-character classes such as {@code [Aa]}. */
  private static final Pattern CASE_REGEX = Pattern.compile("\\[(.)(.)\\]");

  /** Opening tag of the error position markup in incorrect examples. */
  private static final String MARKER_START = "<marker>";

  /** Closing tag of the error position markup in incorrect examples. */
  private static final String MARKER_END = "</marker>";

  /** Tag that a rule message must contain when an example specifies corrections. */
  private static final String SUGGESTION_TAG = "<suggestion>";

  /**
   * Creates the shared English {@link JLanguageTool} on first use.
   * Public (rather than protected) because {@link #main(String[])} calls it.
   */
  @Override
  public void setUp() throws IOException {
    if (langTool == null) {
      langTool = new JLanguageTool(Language.ENGLISH);
    }
  }

  /** Runs the XML example tests for all languages, without progress output. */
  public void testGrammarRulesFromXML() throws IOException {
    testGrammarRulesFromXML(null, false);
  }

  /**
   * Loads {@code /<lang>/grammar.xml} for every language in
   * {@link Language#LANGUAGES} and tests its rules.
   *
   * @param ignoredLanguages languages to skip, or {@code null} to test all
   * @param verbose whether to print progress to standard output
   */
  private void testGrammarRulesFromXML(final Set<Language> ignoredLanguages,
      final boolean verbose) throws IOException {
    for (final Language lang : Language.LANGUAGES) {
      if (ignoredLanguages != null && ignoredLanguages.contains(lang)) {
        if (verbose) {
          System.out.println("Ignoring tests for " + lang.getName());
        }
        continue;
      }
      if (verbose) {
        System.out.println("Running tests for " + lang.getName() + "...");
      }
      final PatternRuleLoader ruleLoader = new PatternRuleLoader();
      final JLanguageTool languageTool = new JLanguageTool(lang);
      final String name = "/" + lang.getShortName() + "/grammar.xml";
      final List<PatternRule> rules = ruleLoader.getRules(
          JLanguageTool.getDataBroker().getFromRulesDirAsStream(name), name);
      warnIfRegexpSyntax(rules, lang);
      testGrammarRulesFromXML(rules, languageTool, lang);
    }
  }

  // TODO: probably this would be more useful for exceptions
  // instead of adding next methods to PatternRule
  // we can probably validate using XSD and specify regexes straight there
  /**
   * Prints warnings for every element (and every exception of an element)
   * of the given rules that looks suspicious.
   */
  private void warnIfRegexpSyntax(final List<PatternRule> rules, final Language lang) {
    for (final PatternRule rule : rules) {
      int i = 0;
      for (final Element element : rule.getElements()) {
        i++;
        warnIfElementNotKosher(element, lang, rule.getId());
        if (element.getExceptionList() != null) {
          for (final Element exception : element.getExceptionList()) {
            warnIfElementNotKosher(exception, lang, rule.getId()
                + " (exception in token [" + i + "]:" + element + ") ");
          }
        }
      }
    }
  }

  /**
   * Prints a warning to standard error for each suspicious property of the
   * element: regex-looking text not marked as regex (and vice versa), empty
   * regex or inflected elements, character classes like {@code [Aa]} in a
   * case-insensitive element, and duplicated disjunction parts.
   * Never fails the test; the output is advisory only.
   */
  private void warnIfElementNotKosher(final Element element, final Language lang,
      final String ruleId) {
    if (!element.isRegularExpression()
        && (PROBABLE_REGEX.matcher(element.getString()).find())) {
      System.err.println("The " + lang.toString() + " rule: " + ruleId
          + " contains element " + "\"" + element
          + "\" that is not marked as regular expression"
          + " but probably is one.");
    }
    if (element.isRegularExpression() && "".equals(element.getString())) {
      System.err.println("The " + lang.toString() + " rule: " + ruleId
          + " contains an empty string element " + "\"" + element
          + "\" that is marked as regular expression (don't look at the POS tag, it might be OK).");
    } else if (element.isRegularExpression()
        && !PROBABLE_REGEX.matcher(element.getString()).find()) {
      System.err.println("The " + lang.toString() + " rule: " + ruleId
          + " contains element " + "\"" + element
          + "\" that is marked as regular expression"
          + " but probably is not one.");
    }
    if (element.isInflected() && "".equals(element.getString())) {
      System.err.println("The " + lang.toString() + " rule: " + ruleId
          + " contains element " + "\"" + element
          + "\" that is marked as inflected"
          + " but is empty, so the attribute is redundant.");
    }
    if (element.isRegularExpression() && !element.getCaseSensitive()) {
      final Matcher matcher = CASE_REGEX.matcher(element.getString());
      if (matcher.find()) {
        final String letter1 = matcher.group(1);
        final String letter2 = matcher.group(2);
        if (!letter1.equals(letter2)
            && letter1.toLowerCase().equals(letter2.toLowerCase())) {
          System.err.println("The " + lang.toString() + " rule: " + ruleId
              + " contains regexp part [" + letter1 + letter2
              + "] which is useless without case_sensitive=\"yes\".");
        }
      }
    }
    if (element.isRegularExpression() && element.getString().contains("|")) {
      // Each ")"-terminated chunk is checked separately, so that equal parts
      // in different groups are not reported as duplicates.
      final String[] groups = element.getString().split("\\)");
      final boolean caseSensitive = element.getCaseSensitive();
      for (final String group : groups) {
        final String[] alt = group.split("\\|");
        final Set<String> partSet = new HashSet<String>();
        final Set<String> partSetNoCase = new HashSet<String>();
        for (final String part : alt) {
          final String partNoCase = caseSensitive ? part : part.toLowerCase();
          if (partSetNoCase.contains(partNoCase)) {
            if (partSet.contains(part)) {
              // Duplicate disjunction parts "foo|foo".
              System.err.println("The " + lang.toString() + " rule : " + ruleId
                  + " contains duplicated disjunction part (" + part
                  + ") within the element " + "\"" + element + "\".");
            } else {
              // Duplicate disjunction parts "Foo|foo" since element ignores case.
              System.err.println("The " + lang.toString() + " rule : " + ruleId
                  + " contains duplicated non case sensitive disjunction part ("
                  + part + ") within the element " + "\"" + element + "\". Did you "
                  + "forget case_sensitive=\"yes\"?");
            }
          }
          partSetNoCase.add(partNoCase);
          partSet.add(part);
        }
      }
    }
  }

  /**
   * Checks every rule against its own examples: correct examples must not
   * match; incorrect examples must match exactly once, at the position given
   * by the {@code <marker>...</marker>} markup, with the expected
   * suggestions, and applying a suggestion must not trigger the rule again.
   * <p>
   * Rules built from complex phrases only need to match an incorrect example
   * in one of their variants. Variants that never matched are re-tested as
   * ordinary rules at the end, so that they produce a regular failure.
   */
  private void testGrammarRulesFromXML(final List<PatternRule> rules,
      final JLanguageTool languageTool, final Language lang) throws IOException {
    // key: rule id + bad sentence; value: a variant that has not matched the
    // sentence, or null once some variant matched it
    final HashMap<String, PatternRule> complexRules = new HashMap<String, PatternRule>();
    for (final PatternRule rule : rules) {
      final List<String> goodSentences = rule.getCorrectExamples();
      for (String goodSentence : goodSentences) {
        // enable indentation use
        goodSentence = goodSentence.replaceAll("[\\n\\t]+", "");
        goodSentence = cleanXML(goodSentence);
        assertTrue(goodSentence.trim().length() > 0);
        assertFalse(lang + ": Did not expect error in: " + goodSentence
            + " (Rule: " + rule + ")", match(rule, goodSentence, languageTool));
      }
      final List<IncorrectExample> badSentences = rule.getIncorrectExamples();
      for (final IncorrectExample origBadExample : badSentences) {
        // enable indentation use
        final String origBadSentence = origBadExample.getExample().replaceAll(
            "[\\n\\t]+", "");
        final List<String> suggestedCorrection = origBadExample.getCorrections();
        final int startMarkerPos = origBadSentence.indexOf(MARKER_START);
        final int endMarkerPos = origBadSentence.indexOf(MARKER_END);
        // Test the raw indices: after subtracting the marker length a missing
        // end marker would no longer be -1 and would go unnoticed.
        if (startMarkerPos == -1 || endMarkerPos == -1) {
          fail(lang
              + ": No error position markup ('<marker>...</marker>') in bad example in rule "
              + rule);
        }
        final int expectedMatchStart = startMarkerPos;
        // The opening tag is removed by cleanXML(), which shifts the end position left.
        final int expectedMatchEnd = endMarkerPos - MARKER_START.length();
        final String badSentence = cleanXML(origBadSentence);
        assertTrue(badSentence.trim().length() > 0);
        RuleMatch[] matches = getMatches(rule, badSentence, languageTool);
        if (!rule.isWithComplexPhrase()) {
          assertTrue(lang + ": Did expect one error in: \"" + badSentence
              + "\" (Rule: " + rule + "), got " + matches.length
              + ". Additional info:" + rule.getMessage(), matches.length == 1);
          assertMatchPosition(lang, rule, expectedMatchStart, expectedMatchEnd, matches[0]);
          // make sure suggestion is what we expect it to be
          if (suggestedCorrection != null && suggestedCorrection.size() > 0) {
            assertTrue("You specified a correction but your message has no suggestions in rule "
                + rule, rule.getMessage().contains(SUGGESTION_TAG));
          }
          assertExpectedSuggestions(lang, rule, suggestedCorrection, matches[0]);
          // make sure the suggested correction doesn't produce an error:
          final int fromPos = matches[0].getFromPos();
          final int toPos = matches[0].getToPos();
          for (final String repl : matches[0].getSuggestedReplacements()) {
            final String fixedSentence = badSentence.substring(0, fromPos)
                + repl + badSentence.substring(toPos);
            matches = getMatches(rule, fixedSentence, languageTool);
            if (matches.length > 0) {
              fail("Incorrect input:\n"
                  + " " + badSentence
                  + "\nCorrected sentence:\n"
                  + " " + fixedSentence
                  + "\nBy Rule:\n"
                  + " " + rule
                  + "\nThe correction triggered an error itself:\n"
                  + " " + matches[0] + "\n");
            }
          }
        } else {
          // for multiple rules created with complex phrases
          final String key = rule.getId() + badSentence;
          // NOTE(review): re-runs the match already computed above; looks
          // redundant, kept to preserve the original call sequence - confirm.
          matches = getMatches(rule, badSentence, languageTool);
          if (matches.length == 0 && !complexRules.containsKey(key)) {
            complexRules.put(key, rule);
          }
          if (matches.length != 0) {
            complexRules.put(key, null);
            assertTrue(lang + ": Did expect one error in: \"" + badSentence
                + "\" (Rule: " + rule + "), got " + matches.length,
                matches.length == 1);
            assertMatchPosition(lang, rule, expectedMatchStart, expectedMatchEnd, matches[0]);
            // make sure suggestion is what we expect it to be
            assertExpectedSuggestions(lang, rule, suggestedCorrection, matches[0]);
            // make sure the suggested correction doesn't produce an error:
            final int fromPos = matches[0].getFromPos();
            final int toPos = matches[0].getToPos();
            for (final String repl : matches[0].getSuggestedReplacements()) {
              final String fixedSentence = badSentence.substring(0, fromPos)
                  + repl + badSentence.substring(toPos);
              matches = getMatches(rule, fixedSentence, languageTool);
              assertEquals("Corrected sentence for rule " + rule
                  + " triggered error: " + fixedSentence, 0, matches.length);
            }
          }
        }
      }
    }
    if (!complexRules.isEmpty()) {
      final Set<String> set = complexRules.keySet();
      final List<PatternRule> badRules = new ArrayList<PatternRule>();
      final Iterator<String> iter = set.iterator();
      while (iter.hasNext()) {
        final PatternRule badRule = complexRules.get(iter.next());
        if (badRule != null) {
          badRule.notComplexPhrase();
          badRule.setMessage("The rule contains a phrase that never matched any incorrect example.");
          badRules.add(badRule);
        }
      }
      if (!badRules.isEmpty()) {
        testGrammarRulesFromXML(badRules, languageTool, lang);
      }
    }
  }

  /** Asserts that the match covers exactly the span given by the example's markup. */
  private void assertMatchPosition(final Language lang, final PatternRule rule,
      final int expectedStart, final int expectedEnd, final RuleMatch match) {
    assertEquals(lang + ": Incorrect match position markup (start) for rule " + rule,
        expectedStart, match.getFromPos());
    assertEquals(lang + ": Incorrect match position markup (end) for rule " + rule,
        expectedEnd, match.getToPos());
  }

  /**
   * Asserts that the match's suggested replacements equal the corrections
   * listed in the example; does nothing if the example lists none.
   */
  private void assertExpectedSuggestions(final Language lang, final PatternRule rule,
      final List<String> suggestedCorrection, final RuleMatch match) {
    if (suggestedCorrection != null && suggestedCorrection.size() > 0) {
      assertTrue(lang + ": Incorrect suggestions: "
          + suggestedCorrection.toString() + " != "
          + match.getSuggestedReplacements() + " for rule " + rule,
          suggestedCorrection.equals(match.getSuggestedReplacements()));
    }
  }

  /** Removes XML tags (text between angle brackets) from the given string. */
  protected String cleanXML(final String str) {
    return str.replaceAll("<([^<].*?)>", "");
  }

  /** Returns whether the rule produces at least one match for the sentence. */
  private boolean match(final Rule rule, final String sentence,
      final JLanguageTool languageTool) throws IOException {
    return getMatches(rule, sentence, languageTool).length > 0;
  }

  /** Analyzes the sentence with the given tool and returns the rule's matches. */
  private RuleMatch[] getMatches(final Rule rule, final String sentence,
      final JLanguageTool languageTool) throws IOException {
    final AnalyzedSentence text = languageTool.getAnalyzedSentence(sentence);
    return rule.match(text);
  }

  public void testUppercasingSuggestion() throws IOException {
    // local name differs from the static field to avoid shadowing it
    final JLanguageTool englishTool = new JLanguageTool(Language.ENGLISH);
    englishTool.activateDefaultPatternRules();
    final List<RuleMatch> matches = englishTool.check("Were are in the process of ...");
    assertEquals(1, matches.size());
    final RuleMatch match = matches.get(0);
    final List<String> sugg = match.getSuggestedReplacements();
    assertEquals(2, sugg.size());
    assertEquals("Where", sugg.get(0));
    assertEquals("We", sugg.get(1));
  }

  public void testRule() throws IOException {
    PatternRule pr;
    RuleMatch[] matches;

    pr = makePatternRule("one");
    matches = pr.match(langTool.getAnalyzedSentence("A non-matching sentence."));
    assertEquals(0, matches.length);
    matches = pr.match(langTool.getAnalyzedSentence("A matching sentence with one match."));
    assertEquals(1, matches.length);
    assertEquals(25, matches[0].getFromPos());
    assertEquals(28, matches[0].getToPos());
    // these two are not set if the rule is called standalone (not via
    // JLanguageTool):
    assertEquals(-1, matches[0].getColumn());
    assertEquals(-1, matches[0].getLine());
    assertEquals("ID1", matches[0].getRule().getId());
    assertTrue(matches[0].getMessage().equals("user visible message"));
    assertTrue(matches[0].getShortMessage().equals("short comment"));
    matches = pr.match(langTool.getAnalyzedSentence("one one and one: three matches"));
    assertEquals(3, matches.length);

    pr = makePatternRule("one two");
    matches = pr.match(langTool.getAnalyzedSentence("this is one not two"));
    assertEquals(0, matches.length);
    matches = pr.match(langTool.getAnalyzedSentence("this is two one"));
    assertEquals(0, matches.length);
    matches = pr.match(langTool.getAnalyzedSentence("this is one two three"));
    assertEquals(1, matches.length);
    matches = pr.match(langTool.getAnalyzedSentence("one two"));
    assertEquals(1, matches.length);

    pr = makePatternRule("one|foo|xxxx two", false, true);
    matches = pr.match(langTool.getAnalyzedSentence("one foo three"));
    assertEquals(0, matches.length);
    matches = pr.match(langTool.getAnalyzedSentence("one two"));
    assertEquals(1, matches.length);
    matches = pr.match(langTool.getAnalyzedSentence("foo two"));
    assertEquals(1, matches.length);
    matches = pr.match(langTool.getAnalyzedSentence("one foo two"));
    assertEquals(1, matches.length);
    matches = pr.match(langTool.getAnalyzedSentence("y x z one two blah foo"));
    assertEquals(1, matches.length);

    pr = makePatternRule("one|foo|xxxx two|yyy", false, true);
    matches = pr.match(langTool.getAnalyzedSentence("one, yyy"));
    assertEquals(0, matches.length);
    matches = pr.match(langTool.getAnalyzedSentence("one yyy"));
    assertEquals(1, matches.length);
    matches = pr.match(langTool.getAnalyzedSentence("xxxx two"));
    assertEquals(1, matches.length);
    matches = pr.match(langTool.getAnalyzedSentence("xxxx yyy"));
    assertEquals(1, matches.length);
  }

  /** Builds a case-insensitive, non-regex rule from space-separated tokens. */
  private PatternRule makePatternRule(final String s) {
    return makePatternRule(s, false, false);
  }

  /**
   * Builds an English rule with id "ID1" that has one element per
   * space-separated token of {@code s}. The token {@code SENT_START} becomes
   * an element with empty text whose POS is set to that token; any other
   * token becomes a plain text element.
   */
  private PatternRule makePatternRule(final String s, final boolean caseSensitive,
      final boolean regex) {
    final List<Element> elems = new ArrayList<Element>();
    for (final String part : s.split(" ")) {
      final Element elem;
      if (part.equals("SENT_START")) {
        elem = new Element("", caseSensitive, regex, false);
        elem.setPosElement(part, false, false);
      } else {
        elem = new Element(part, caseSensitive, regex, false);
      }
      elems.add(elem);
    }
    return new PatternRule("ID1", Language.ENGLISH, elems,
        "test rule", "user visible message", "short comment");
  }

  public void testSentenceStart() throws IOException {
    PatternRule pr;
    RuleMatch[] matches;

    pr = makePatternRule("SENT_START One");
    matches = pr.match(langTool.getAnalyzedSentence("Not One word."));
    assertEquals(0, matches.length);
    matches = pr.match(langTool.getAnalyzedSentence("One word."));
    assertEquals(1, matches.length);
  }

  /** Calls {@code PatternRule.formatMultipleSynthesis} through {@link TestTools}. */
  private static String callFormatMultipleSynthesis(final String[] suggs,
      final String left, final String right) throws IllegalArgumentException,
      SecurityException, InvocationTargetException, IllegalAccessException,
      NoSuchMethodException {
    final Class[] argClasses = { String[].class, String.class, String.class };
    final Object[] argObjects = { suggs, left, right };
    return TestTools.callStringStaticMethod(PatternRule.class,
        "formatMultipleSynthesis", argClasses, argObjects);
  }

  /* test private methods as well */
  public void testformatMultipleSynthesis() throws IllegalArgumentException,
      SecurityException, InvocationTargetException, IllegalAccessException,
      NoSuchMethodException {
    final String[] suggArray = { "blah blah", "foo bar" };
    assertEquals(
        "This is how you should write: <suggestion>blah blah</suggestion>, <suggestion>foo bar</suggestion>.",
        callFormatMultipleSynthesis(suggArray, "This is how you should write: <suggestion>", "</suggestion>."));
    final String[] suggArray2 = { "test", " " };
    assertEquals(
        "This is how you should write: <suggestion>test</suggestion>, <suggestion> </suggestion>.",
        callFormatMultipleSynthesis(suggArray2, "This is how you should write: <suggestion>", "</suggestion>."));
  }

  /**
   * Test XML patterns, as a help for people developing rules that are not
   * programmers.
   */
  public static void main(final String[] args) throws IOException {
    final PatternRuleTest prt = new PatternRuleTest();
    System.out.println("Running XML pattern tests...");
    prt.setUp();
    final Set<Language> ignoredLanguages = new HashSet<Language>();
    // ignoredLanguages.add(Language.CZECH); // has no XML rules yet
    prt.testGrammarRulesFromXML(ignoredLanguages, true);
    System.out.println("Tests successful.");
  }

}