summaryrefslogtreecommitdiffstats
path: root/JLanguageTool/src/test/de/danielnaber/languagetool/rules/patterns/PatternRuleTest.java
blob: a1dfeaa39aa88f0e12732d984ad143f1b53840cd (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
/* LanguageTool, a natural language style checker 
 * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
 * 
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
 * USA
 */
package de.danielnaber.languagetool.rules.patterns;

import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.regex.Pattern;
import java.util.regex.Matcher;

import junit.framework.TestCase;
import de.danielnaber.languagetool.AnalyzedSentence;
import de.danielnaber.languagetool.JLanguageTool;
import de.danielnaber.languagetool.Language;
import de.danielnaber.languagetool.TestTools;
import de.danielnaber.languagetool.rules.IncorrectExample;
import de.danielnaber.languagetool.rules.Rule;
import de.danielnaber.languagetool.rules.RuleMatch;

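/**
 * Tests for {@link PatternRule}: checks the example sentences of the rules
 * loaded from each language's {@code grammar.xml}, prints warnings about
 * suspicious element definitions, and exercises a few hand-built rules.
 *
 * @author Daniel Naber
 */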
public class PatternRuleTest extends TestCase {

  // Shared English tool; created once by setUp() and used by the hand-built
  // rule tests (testRule, testSentenceStart) to analyze sentences.
  private static JLanguageTool langTool;

  // Heuristic for "this string looks like a regular expression". It finds any of:
  //  - a non-metacharacter followed by one of [ ] * + | ^ { } ?
  //  - a backslash followed by a non-digit
  //  - a parenthesized group with at least one character inside
  //  - a dot followed by any character
  private static final Pattern PROBABLE_REGEX = Pattern.compile("[^\\[\\]\\*\\+\\|\\^\\{\\}\\?][\\[\\]\\*\\+\\|\\^\\{\\}\\?]|\\\\[^0-9]|\\(.+\\)|\\..");

  // Finds a two-character bracket expression such as "[Aa]"; the two characters
  // are captured as groups 1 and 2 so they can be compared ignoring case.
  private static final Pattern CASE_REGEX = Pattern.compile("\\[(.)(.)\\]");

  
  /**
   * Creates the shared English {@link JLanguageTool} the first time it is
   * needed; subsequent calls leave the existing instance in place.
   */
  @Override
  public void setUp() throws IOException {
    if (langTool != null) {
      return;
    }
    langTool = new JLanguageTool(Language.ENGLISH);
  }

  /**
   * Checks the XML grammar rules of all languages, skipping none and without
   * progress output.
   */
  public void testGrammarRulesFromXML() throws IOException {
    final Set<Language> ignoredLanguages = null;
    final boolean verbose = false;
    testGrammarRulesFromXML(ignoredLanguages, verbose);
  }

  /**
   * Loads {@code /<short name>/grammar.xml} for each known language, prints
   * warnings about suspicious elements and then checks the rules' examples.
   *
   * @param ignoredLanguages languages to skip; may be {@code null} to skip none
   * @param verbose whether to print a progress line per language
   */
  private void testGrammarRulesFromXML(final Set<Language> ignoredLanguages,
      final boolean verbose) throws IOException {
    for (final Language lang : Language.LANGUAGES) {
      final boolean skip = ignoredLanguages != null
          && ignoredLanguages.contains(lang);
      if (verbose) {
        System.out.println(skip
            ? "Ignoring tests for " + lang.getName()
            : "Running tests for " + lang.getName() + "...");
      }
      if (skip) {
        continue;
      }
      final PatternRuleLoader loader = new PatternRuleLoader();
      final JLanguageTool toolForLang = new JLanguageTool(lang);
      final String grammarFile = "/" + lang.getShortName() + "/grammar.xml";
      final List<PatternRule> loadedRules = loader.getRules(
          JLanguageTool.getDataBroker().getFromRulesDirAsStream(grammarFile),
          grammarFile);
      warnIfRegexpSyntax(loadedRules, lang);
      testGrammarRulesFromXML(loadedRules, toolForLang, lang);
    }
  }

  // TODO: probably this would be more useful for exceptions
  // instead of adding next methods to PatternRule
  // we can probably validate using XSD and specify regexes straight there
  /**
   * Runs the element sanity checks on every token of every rule, and on each
   * token's exceptions (if it has any).
   *
   * @param rules the rules to inspect
   * @param lang the language the rules belong to, used in the warnings
   */
  private void warnIfRegexpSyntax(final List<PatternRule> rules,
      final Language lang) {
    for (final PatternRule rule : rules) {
      final String ruleId = rule.getId();
      // 1-based position of the token inside the rule, used in warnings
      int tokenNo = 0;
      for (final Element token : rule.getElements()) {
        tokenNo++;
        warnIfElementNotKosher(token, lang, ruleId);
        if (token.getExceptionList() == null) {
          continue;
        }
        for (final Element exception : token.getExceptionList()) {
          final String where = ruleId + " (exception in token [" + tokenNo
              + "]:" + token + ") ";
          warnIfElementNotKosher(exception, lang, where);
        }
      }
    }
  }

  /**
   * Prints warnings to {@code System.err} for element definitions that look
   * like mistakes. Nothing is asserted; the test does not fail on these.
   * <p>
   * The checks are:
   * <ul>
   * <li>text that looks like a regular expression but is not marked as one,</li>
   * <li>an element marked as regular expression that is empty or does not
   * look like one,</li>
   * <li>an empty element marked as inflected,</li>
   * <li>a bracket pair such as {@code [Aa]} in a regular expression that is
   * not case sensitive,</li>
   * <li>repeated alternatives in a {@code |} disjunction.</li>
   * </ul>
   *
   * @param element the token or exception to inspect
   * @param lang the language of the rule, used in the warnings
   * @param ruleId text identifying the rule in the warnings
   */
  private void warnIfElementNotKosher(final Element element, 
      final Language lang, final String ruleId) {
    final String text = element.getString();
    final boolean markedAsRegex = element.isRegularExpression();
    final boolean looksLikeRegex = PROBABLE_REGEX.matcher(text).find();
    final boolean emptyText = text.length() == 0;

    if (!markedAsRegex) {
      if (looksLikeRegex) {
        System.err.println("The " + lang.toString() + " rule: " + ruleId
            + " contains element \"" + element
            + "\" that is not marked as regular expression but probably is one.");
      }
    } else if (emptyText) {
      System.err.println("The " + lang.toString() + " rule: " + ruleId
          + " contains an empty string element \"" + element
          + "\" that is marked as regular expression (don't look at the POS tag, it might be OK).");
    } else if (!looksLikeRegex) {
      System.err.println("The " + lang.toString() + " rule: " + ruleId
          + " contains element \"" + element
          + "\" that is marked as regular expression but probably is not one.");
    }

    if (element.isInflected() && emptyText) {
      System.err.println("The " + lang.toString() + " rule: " + ruleId
          + " contains element \"" + element
          + "\" that is marked as inflected but is empty, so the attribute is redundant.");
    }

    if (!markedAsRegex) {
      // the remaining checks only concern regular expressions
      return;
    }

    final boolean caseSensitive = element.getCaseSensitive();

    if (!caseSensitive) {
      // only the first bracket pair in the text is examined
      final Matcher bracketPair = CASE_REGEX.matcher(text);
      if (bracketPair.find()) {
        final String first = bracketPair.group(1);
        final String second = bracketPair.group(2);
        final boolean sameLetterOtherCase = !first.equals(second)
            && first.toLowerCase().equals(second.toLowerCase());
        if (sameLetterOtherCase) {
          System.err.println("The " + lang.toString() + " rule: " + ruleId
              + " contains regexp part [" + first + second
              + "] which is useless without case_sensitive=\"yes\".");
        }
      }
    }

    if (text.contains("|")) {
      // alternatives are compared within each chunk delimited by ')'
      for (final String chunk : text.split("\\)")) {
        final Set<String> seenExact = new HashSet<String>();
        final Set<String> seenFolded = new HashSet<String>();
        for (final String part : chunk.split("\\|")) {
          final String folded = caseSensitive ? part : part.toLowerCase();
          // add() returns false when the value was already present
          final boolean exactRepeat = !seenExact.add(part);
          final boolean foldedRepeat = !seenFolded.add(folded);
          if (!foldedRepeat) {
            continue;
          }
          if (exactRepeat) {
            // Duplicate disjunction parts "foo|foo".
            System.err.println("The " + lang.toString() + " rule : " + ruleId
                + " contains duplicated disjunction part (" + part
                + ") within the element \"" + element + "\".");
          } else {
            // Duplicate disjunction parts "Foo|foo" since element ignores case.
            System.err.println("The " + lang.toString() + " rule : " + ruleId
                + " contains duplicated non case sensitive disjunction part ("
                + part + ") within the element \"" + element
                + "\". Did you forget case_sensitive=\"yes\"?");
          }
        }
      }
    }
  }
  
  
  /**
   * Checks the example sentences attached to each rule against the rule.
   * <p>
   * Every correct example must produce no match. Every incorrect example must
   * contain {@code <marker>...</marker>} markup and, for ordinary rules,
   * produce exactly one match at the marked position. If the example lists
   * corrections they must equal the match's suggested replacements, and
   * applying any suggested replacement must yield a sentence the rule no
   * longer matches.
   * <p>
   * Rules for which {@code isWithComplexPhrase()} is true are only checked
   * when they match. Incorrect examples that no rule with the same id matched
   * are collected, and the corresponding rules are re-run as ordinary rules
   * with an explanatory message so that they fail visibly.
   *
   * @param rules the rules whose examples are checked
   * @param languageTool tool used to analyze the example sentences
   * @param lang language of the rules, used in failure messages
   * @throws IOException if analyzing a sentence fails
   */
  private void testGrammarRulesFromXML(final List<PatternRule> rules,
      final JLanguageTool languageTool, final Language lang) throws IOException {
    // key: rule id + cleaned bad sentence; value: a rule that did not match the
    // sentence, or null once some rule with that id did match it
    final HashMap<String, PatternRule> complexRules = new HashMap<String, PatternRule>();
    for (final PatternRule rule : rules) {
      final List<String> goodSentences = rule.getCorrectExamples();
      for (String goodSentence : goodSentences) {
        // enable indentation use
        goodSentence = goodSentence.replaceAll("[\\n\\t]+", "");
        goodSentence = cleanXML(goodSentence);
        assertTrue(goodSentence.trim().length() > 0);
        assertFalse(lang + ": Did not expect error in: " + goodSentence
            + " (Rule: " + rule + ")", match(rule, goodSentence, languageTool));
      }
      final List<IncorrectExample> badSentences = rule.getIncorrectExamples();
      for (final IncorrectExample origBadExample : badSentences) {
        // enable indentation use
        final String origBadSentence = origBadExample.getExample().replaceAll(
            "[\\n\\t]+", "");
        final List<String> suggestedCorrection = origBadExample
            .getCorrections();
        final int expectedMatchStart = origBadSentence.indexOf("<marker>");
        // Look up the closing tag on its own: testing for -1 only after the
        // opening tag's length has been subtracted would never detect a
        // missing "</marker>".
        final int closingMarkerPos = origBadSentence.indexOf("</marker>");
        if (expectedMatchStart == -1 || closingMarkerPos == -1) {
          fail(lang
              + ": No error position markup ('<marker>...</marker>') in bad example in rule "
              + rule);
        }
        // positions refer to the sentence with the markup removed
        final int expectedMatchEnd = closingMarkerPos - "<marker>".length();
        final String badSentence = cleanXML(origBadSentence);
        assertTrue(badSentence.trim().length() > 0);
        final RuleMatch[] matches = getMatches(rule, badSentence, languageTool);
        if (!rule.isWithComplexPhrase()) {
          assertTrue(lang + ": Did expect one error in: \"" + badSentence
              + "\" (Rule: " + rule + "), got " + matches.length
              + ". Additional info:" + rule.getMessage(), matches.length == 1);
          final RuleMatch found = matches[0];
          assertEquals(lang
              + ": Incorrect match position markup (start) for rule " + rule,
              expectedMatchStart, found.getFromPos());
          assertEquals(lang
              + ": Incorrect match position markup (end) for rule " + rule,
              expectedMatchEnd, found.getToPos());
          // make sure suggestion is what we expect it to be
          if (suggestedCorrection != null && suggestedCorrection.size() > 0) {
            assertTrue("You specified a correction but your message has no suggestions in rule " + rule,
                rule.getMessage().contains("<suggestion>"));
            assertTrue(lang + ": Incorrect suggestions: "
                + suggestedCorrection.toString() + " != "
                + found.getSuggestedReplacements() + " for rule " + rule,
                suggestedCorrection.equals(found.getSuggestedReplacements()));
          }
          // make sure the suggested correction doesn't produce an error:
          final int fromPos = found.getFromPos();
          final int toPos = found.getToPos();
          for (final String repl : found.getSuggestedReplacements()) {
            final String fixedSentence = badSentence.substring(0, fromPos)
                + repl + badSentence.substring(toPos);
            final RuleMatch[] fixedMatches = getMatches(rule, fixedSentence,
                languageTool);
            if (fixedMatches.length > 0) {
              fail("Incorrect input:\n"
                  + "  " + badSentence
                  + "\nCorrected sentence:\n"
                  + "  " + fixedSentence
                  + "\nBy Rule:\n"
                  + "  " + rule
                  + "\nThe correction triggered an error itself:\n"
                  + "  " + fixedMatches[0] + "\n");
            }
          }
        } else { // for multiple rules created with complex phrases
          final String key = rule.getId() + badSentence;
          if (matches.length == 0) {
            // remember the rule unless the sentence is already tracked
            if (!complexRules.containsKey(key)) {
              complexRules.put(key, rule);
            }
          } else {
            // the sentence was matched, so it is no longer a failure candidate
            complexRules.put(key, null);
            assertTrue(lang + ": Did expect one error in: \"" + badSentence
                + "\" (Rule: " + rule + "), got " + matches.length,
                matches.length == 1);
            final RuleMatch found = matches[0];
            assertEquals(lang
                + ": Incorrect match position markup (start) for rule " + rule,
                expectedMatchStart, found.getFromPos());
            assertEquals(lang
                + ": Incorrect match position markup (end) for rule " + rule,
                expectedMatchEnd, found.getToPos());
            // make sure suggestion is what we expect it to be
            if (suggestedCorrection != null && suggestedCorrection.size() > 0) {
              assertTrue(
                  lang + ": Incorrect suggestions: "
                      + suggestedCorrection.toString() + " != "
                      + found.getSuggestedReplacements() + " for rule "
                      + rule, suggestedCorrection.equals(found
                      .getSuggestedReplacements()));
            }
            // make sure the suggested correction doesn't produce an error:
            final int fromPos = found.getFromPos();
            final int toPos = found.getToPos();
            for (final String repl : found.getSuggestedReplacements()) {
              final String fixedSentence = badSentence.substring(0, fromPos)
                  + repl + badSentence.substring(toPos);
              final RuleMatch[] fixedMatches = getMatches(rule, fixedSentence,
                  languageTool);
              assertEquals("Corrected sentence for rule " + rule
                  + " triggered error: " + fixedSentence, 0,
                  fixedMatches.length);
            }
          }
        }
      }
    }
    // Re-run, as ordinary rules, those complex-phrase rules whose incorrect
    // example was never matched; the ordinary path then reports the failure.
    final List<PatternRule> badRules = new ArrayList<PatternRule>();
    for (final PatternRule badRule : complexRules.values()) {
      if (badRule != null) {
        badRule.notComplexPhrase();
        badRule
            .setMessage("The rule contains a phrase that never matched any incorrect example.");
        badRules.add(badRule);
      }
    }
    if (!badRules.isEmpty()) {
      testGrammarRulesFromXML(badRules, languageTool, lang);
    }
  }

  /**
   * Removes XML-like tags from the given text: every {@code <...>} span whose
   * first inner character is not another {@code <}, matched non-greedily.
   *
   * @param str the text possibly containing markup
   * @return the text with all such spans removed
   */
  protected String cleanXML(final String str) {
    final Matcher tagMatcher = Pattern.compile("<([^<].*?)>").matcher(str);
    return tagMatcher.replaceAll("");
  }

  /**
   * Tells whether the rule produces at least one match for the sentence.
   *
   * @param rule the rule to apply
   * @param sentence the plain sentence text
   * @param languageTool tool used to analyze the sentence
   * @return {@code true} if the rule matched one or more times
   */
  private boolean match(final Rule rule, final String sentence,
      final JLanguageTool languageTool) throws IOException {
    return rule.match(languageTool.getAnalyzedSentence(sentence)).length > 0;
  }

  /**
   * Analyzes the sentence and returns whatever the rule matches in it.
   *
   * @param rule the rule to apply
   * @param sentence the plain sentence text
   * @param languageTool tool used to analyze the sentence
   * @return the rule's matches for the analyzed sentence
   */
  private RuleMatch[] getMatches(final Rule rule, final String sentence,
      final JLanguageTool languageTool) throws IOException {
    final AnalyzedSentence analyzed = languageTool.getAnalyzedSentence(sentence);
    return rule.match(analyzed);
  }

  /**
   * Checks that the suggestions for a sentence-initial "Were" are the
   * capitalized forms "Where" and "We", in that order.
   */
  public void testUppercasingSuggestion() throws IOException {
    final JLanguageTool englishTool = new JLanguageTool(Language.ENGLISH);
    englishTool.activateDefaultPatternRules();
    final List<RuleMatch> found = englishTool
        .check("Were are in the process of ...");
    assertEquals(1, found.size());
    final List<String> replacements = found.get(0).getSuggestedReplacements();
    final String[] expected = { "Where", "We" };
    assertEquals(expected.length, replacements.size());
    for (int i = 0; i < expected.length; i++) {
      assertEquals(expected[i], replacements.get(i));
    }
  }

  /**
   * Exercises hand-built rules: a single token, a two-token sequence, and
   * regular-expression tokens with alternatives.
   */
  public void testRule() throws IOException {
    // single plain token
    PatternRule rule = makePatternRule("one");
    assertEquals(0, countMatches(rule, "A non-matching sentence."));
    final RuleMatch[] single = rule.match(langTool
        .getAnalyzedSentence("A matching sentence with one match."));
    assertEquals(1, single.length);
    final RuleMatch hit = single[0];
    assertEquals(25, hit.getFromPos());
    assertEquals(28, hit.getToPos());
    // these two are not set if the rule is called standalone (not via
    // JLanguageTool):
    assertEquals(-1, hit.getColumn());
    assertEquals(-1, hit.getLine());
    assertEquals("ID1", hit.getRule().getId());
    assertTrue(hit.getMessage().equals("user visible message"));
    assertTrue(hit.getShortMessage().equals("short comment"));
    assertEquals(3, countMatches(rule, "one one and one: three matches"));

    // two plain tokens that must be adjacent and in order
    rule = makePatternRule("one two");
    assertEquals(0, countMatches(rule, "this is one not two"));
    assertEquals(0, countMatches(rule, "this is two one"));
    assertEquals(1, countMatches(rule, "this is one two three"));
    assertEquals(1, countMatches(rule, "one two"));

    // first token is a regular expression with alternatives
    rule = makePatternRule("one|foo|xxxx two", false, true);
    assertEquals(0, countMatches(rule, "one foo three"));
    assertEquals(1, countMatches(rule, "one two"));
    assertEquals(1, countMatches(rule, "foo two"));
    assertEquals(1, countMatches(rule, "one foo two"));
    assertEquals(1, countMatches(rule, "y x z one two blah foo"));

    // both tokens are regular expressions with alternatives
    rule = makePatternRule("one|foo|xxxx two|yyy", false, true);
    assertEquals(0, countMatches(rule, "one, yyy"));
    assertEquals(1, countMatches(rule, "one yyy"));
    assertEquals(1, countMatches(rule, "xxxx two"));
    assertEquals(1, countMatches(rule, "xxxx yyy"));
  }

  /** Returns how many matches the rule yields for the sentence, analyzed with the shared tool. */
  private int countMatches(final PatternRule rule, final String sentence)
      throws IOException {
    return rule.match(langTool.getAnalyzedSentence(sentence)).length;
  }

  /**
   * Builds a test rule from space-separated tokens, with case sensitivity
   * and regular-expression matching both switched off.
   */
  private PatternRule makePatternRule(final String s) {
    final boolean caseSensitive = false;
    final boolean regex = false;
    return makePatternRule(s, caseSensitive, regex);
  }

  /**
   * Builds a test rule with id "ID1" from space-separated tokens. The token
   * {@code SENT_START} becomes an element with empty text and that value as
   * its POS element; every other token becomes a plain text element.
   *
   * @param s the tokens, separated by single spaces
   * @param caseSensitive passed to each created element
   * @param regex passed to each created element
   * @return the assembled English rule
   */
  private PatternRule makePatternRule(final String s,
      final boolean caseSensitive, final boolean regex) {
    final List<Element> elements = new ArrayList<Element>();
    for (final String token : s.split(" ")) {
      final Element built;
      if (token.equals("SENT_START")) {
        built = new Element("", caseSensitive, regex, false);
        built.setPosElement(token, false, false);
      } else {
        built = new Element(token, caseSensitive, regex, false);
      }
      elements.add(built);
    }
    return new PatternRule("ID1", Language.ENGLISH, elements,
        "test rule", "user visible message", "short comment");
  }

  /**
   * Checks that a rule starting with {@code SENT_START} only matches "One"
   * when it is the first word of the sentence.
   */
  public void testSentenceStart() throws IOException {
    final PatternRule startRule = makePatternRule("SENT_START One");

    final RuleMatch[] notAtStart = startRule.match(
        langTool.getAnalyzedSentence("Not One word."));
    assertEquals(0, notAtStart.length);

    final RuleMatch[] atStart = startRule.match(
        langTool.getAnalyzedSentence("One word."));
    assertEquals(1, atStart.length);
  }

  /**
   * Invokes {@code PatternRule}'s {@code formatMultipleSynthesis} method by
   * name through {@code TestTools.callStringStaticMethod}, so that the test
   * can reach it even though it is not accessible directly.
   *
   * @param suggs the suggestions to format
   * @param left text passed as the second argument
   * @param right text passed as the third argument
   * @return the string returned by the invoked method
   */
  private static String callFormatMultipleSynthesis(final String[] suggs,
      final String left, final String right) throws IllegalArgumentException,
      SecurityException, InvocationTargetException, IllegalAccessException,
      NoSuchMethodException {
    // parameter types of the target method, in declaration order
    final Class<?>[] argClasses = { String[].class, String.class, String.class };
    final Object[] argObjects = { suggs, left, right };
    return TestTools.callStringStaticMethod(PatternRule.class,
        "formatMultipleSynthesis", argClasses, argObjects);
  }

  /**
   * Tests the private {@code formatMultipleSynthesis} method as well: each
   * suggestion must end up in its own {@code <suggestion>} element, separated
   * by ", ".
   */
  public void testformatMultipleSynthesis() throws IllegalArgumentException,
      SecurityException, InvocationTargetException, IllegalAccessException,
      NoSuchMethodException {
    final String left = "This is how you should write: <suggestion>";
    final String right = "</suggestion>.";

    final String twoPhrases = callFormatMultipleSynthesis(
        new String[] { "blah blah", "foo bar" }, left, right);
    assertEquals(
        "This is how you should write: <suggestion>blah blah</suggestion>, <suggestion>foo bar</suggestion>.",
        twoPhrases);

    // a suggestion consisting of a single space must be kept as is
    final String wordAndSpace = callFormatMultipleSynthesis(
        new String[] { "test", " " }, left, right);
    assertEquals(
        "This is how you should write: <suggestion>test</suggestion>, <suggestion> </suggestion>.",
        wordAndSpace);
  }

  /**
   * Command-line entry point that runs the XML pattern tests with progress
   * output, as a help for rule authors who are not programmers.
   *
   * @param args not used
   */
  public static void main(final String[] args) throws IOException {
    final PatternRuleTest test = new PatternRuleTest();
    System.out.println("Running XML pattern tests...");
    test.setUp();
    // Languages to skip; none by default. To skip one, add it here, e.g.
    // skipped.add(Language.CZECH); // has no XML rules yet
    final Set<Language> skipped = new HashSet<Language>();
    final boolean verbose = true;
    test.testGrammarRulesFromXML(skipped, verbose);
    System.out.println("Tests successful.");
  }

}