diff options
Diffstat (limited to 'JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/XMLRuleHandler.java')
-rw-r--r-- | JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/XMLRuleHandler.java | 568 |
1 files changed, 568 insertions, 0 deletions
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/XMLRuleHandler.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/XMLRuleHandler.java new file mode 100644 index 0000000..72a852a --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/XMLRuleHandler.java @@ -0,0 +1,568 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2006 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.rules.patterns; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import org.xml.sax.Attributes; +import org.xml.sax.Locator; +import org.xml.sax.SAXException; +import org.xml.sax.SAXParseException; +import org.xml.sax.helpers.DefaultHandler; + +import de.danielnaber.languagetool.Language; +import de.danielnaber.languagetool.rules.IncorrectExample; +import de.danielnaber.languagetool.tools.StringTools; + +/** + * XML rule handler that loads rules from XML and throws + * exceptions on errors and warnings. + * + * @author Daniel Naber + */ +public class XMLRuleHandler extends DefaultHandler { + + public XMLRuleHandler() { + elementList = new ArrayList<Element>(); + equivalenceFeatures = new HashMap<String, List<String>>(); + uTypeList = new ArrayList<String>(); + } + + List<PatternRule> rules = new ArrayList<PatternRule>(); + + protected Language language; + + protected StringBuilder correctExample = new StringBuilder(); + protected StringBuilder incorrectExample = new StringBuilder(); + protected StringBuilder exampleCorrection = new StringBuilder(); + protected StringBuilder message = new StringBuilder(); + protected StringBuilder match = new StringBuilder(); + protected StringBuilder elements; + protected StringBuilder exceptions; + + List<String> correctExamples = new ArrayList<String>(); + List<IncorrectExample> incorrectExamples = new ArrayList<IncorrectExample>(); + + protected boolean inPattern; + protected boolean inCorrectExample; + protected boolean inIncorrectExample; + protected boolean inMessage; + protected boolean inSuggestion; + protected boolean inMatch; + protected boolean inRuleGroup; + protected boolean inToken; + protected boolean inException; + protected boolean inPhrases; + protected boolean inAndGroup; + + protected boolean tokenSpaceBefore; + protected boolean tokenSpaceBeforeSet; + protected String posToken; + protected boolean posNegation; + protected boolean posRegExp; + + protected boolean caseSensitive; + protected boolean regExpression; + protected boolean tokenNegated; + protected boolean tokenInflected; + + protected String exceptionPosToken; + protected boolean exceptionStringRegExp; + protected boolean exceptionStringNegation; + protected boolean exceptionStringInflected; + protected boolean exceptionPosNegation; + protected boolean exceptionPosRegExp; + protected boolean exceptionValidNext; + protected boolean exceptionValidPrev; + protected boolean exceptionSet; + protected boolean exceptionSpaceBefore; + protected boolean exceptionSpaceBeforeSet; + + /** List of elements as specified by tokens. **/ + protected List<Element> elementList; + + /** true when phraseref is the last element in the rule. **/ + protected boolean lastPhrase; + + /** ID reference to the phrase. **/ + protected String phraseIdRef; + + /** Current phrase ID. **/ + protected String phraseId; + + protected int skipPos; + + protected String ruleGroupId; + + protected String id; + + protected Element tokenElement; + + protected Match tokenReference; + + protected List<Match> suggestionMatches; + + protected Locator pLocator; + + protected int startPositionCorrection; + protected int endPositionCorrection; + protected int tokenCounter; + + /** Phrase store - elementLists keyed by phraseIds. **/ + protected Map<String, List<List<Element>>> phraseMap; + + /** + * Logically forking element list, used for including multiple phrases in the + * current one. + **/ + protected List<ArrayList<Element>> phraseElementList; + + protected int andGroupCounter; + + protected StringBuilder shortMessage = new StringBuilder(); + protected boolean inShortMessage; + + protected boolean inUnification; + protected boolean inUnificationDef; + protected boolean uniNegation; + + protected String uFeature; + protected String uType = ""; + + protected List<String> uTypeList; + + protected Map<String, List<String>> equivalenceFeatures; + + + /** Definitions of values in XML files. */ + protected static final String YES = "yes"; + protected static final String POSTAG = "postag"; + protected static final String POSTAG_REGEXP = "postag_regexp"; + protected static final String REGEXP = "regexp"; + protected static final String NEGATE = "negate"; + protected static final String INFLECTED = "inflected"; + protected static final String NEGATE_POS = "negate_pos"; + protected static final String MARKER = "marker"; + protected static final String DEFAULT = "default"; + protected static final String TYPE = "type"; + protected static final String SPACEBEFORE = "spacebefore"; + protected static final String EXAMPLE = "example"; + protected static final String SCOPE = "scope"; + protected static final String IGNORE = "ignore"; + protected static final String SKIP = "skip"; + protected static final String TOKEN = "token"; + protected static final String FEATURE = "feature"; + protected static final String UNIFY = "unify"; + protected static final String AND = "and"; + protected static final String EXCEPTION = "exception"; + protected static final String CASE_SENSITIVE = "case_sensitive"; + protected static final String PATTERN = "pattern"; + protected static final String MATCH = "match"; + protected static final String UNIFICATION = "unification"; + protected static final String RULEGROUP = "rulegroup"; + protected static final String NO = "no"; + protected static final String MARK_TO = "mark_to"; + protected static final String MARK_FROM = "mark_from"; + protected static final String PHRASES = "phrases"; + protected static final String MESSAGE = "message"; + + + public List<PatternRule> getRules() { + return rules; + } + + public void warning (final SAXParseException e) throws SAXException { + throw e; + } + + public void error (final SAXParseException e) throws SAXException { + throw e; + } + + @Override + public void setDocumentLocator(final Locator locator) { + pLocator = locator; + super.setDocumentLocator(locator); + } + + protected void resetToken() { + posNegation = false; + posRegExp = false; + inToken = false; + tokenSpaceBefore = false; + tokenSpaceBeforeSet = false; + + resetException(); + exceptionSet = false; + tokenReference = null; + } + + protected void resetException() { + exceptionStringNegation = false; + exceptionStringInflected = false; + exceptionPosNegation = false; + exceptionPosRegExp = false; + exceptionStringRegExp = false; + exceptionValidNext = false; + exceptionValidPrev = false; + exceptionSpaceBefore = false; + exceptionSpaceBeforeSet = false; + } + + protected void phraseElementInit() { + // lazy init + if (phraseElementList == null) { + phraseElementList = new ArrayList<ArrayList<Element>>(); + } + } + protected void preparePhrase(final Attributes attrs) { + phraseIdRef = attrs.getValue("idref"); + if (phraseMap.containsKey(phraseIdRef)) { + for (final List<Element> curPhrEl : phraseMap.get(phraseIdRef)) { + for (final Element e : curPhrEl) { + e.setPhraseName(phraseIdRef); + } + if (elementList.isEmpty()) { + phraseElementList.add(new ArrayList<Element>(curPhrEl)); + } else { + final ArrayList<Element> prevList = new ArrayList<Element>( + elementList); + prevList.addAll(curPhrEl); + phraseElementList.add(new ArrayList<Element>(prevList)); + prevList.clear(); + } + } + lastPhrase = true; + } + } + + protected void finalizePhrase() { + // lazy init + if (phraseMap == null) { + phraseMap = new HashMap<String, List<List<Element>>>(); + } + phraseElementInit(); + if (phraseElementList.isEmpty()) { + phraseElementList.add(new ArrayList<Element>(elementList)); + } else { + for (final ArrayList<Element> ph : phraseElementList) { + ph.addAll(new ArrayList<Element>(elementList)); + } + } + + phraseMap.put(phraseId, new ArrayList<List<Element>>(phraseElementList)); + elementList.clear(); + + phraseElementList.clear(); + } + + protected void startPattern(final Attributes attrs) throws SAXException { + inPattern = true; + if (attrs.getValue(MARK_FROM) != null) { + startPositionCorrection = Integer.parseInt(attrs.getValue(MARK_FROM)); + } + if (attrs.getValue(MARK_TO) != null) { + endPositionCorrection = Integer.parseInt(attrs.getValue(MARK_TO)); + if (endPositionCorrection > 0) { + throw new SAXException("End position correction (mark_to="+ endPositionCorrection + + ") cannot be larger than 0: " + "\n Line: " + + pLocator.getLineNumber() + ", column: " + + pLocator.getColumnNumber() + "."); + } + } + caseSensitive = YES.equals(attrs.getValue(CASE_SENSITIVE)); + } + + + /** + * Calculates the offset of the match reference (if any) in case the match + * element has been used in the group. + * + * @param elList + * Element list where the match element was used. It is directly changed. + */ + protected void processElement(final List<Element> elList) { + int counter = 0; + for (final Element elTest : elList) { + if (elTest.getPhraseName() != null && counter > 0) { + if (elTest.isReferenceElement()) { + final int tokRef = elTest.getMatch().getTokenRef(); + elTest.getMatch().setTokenRef(tokRef + counter - 1); + final String offsetToken = elTest.getString().replace("\\" + tokRef, + "\\" + (tokRef + counter - 1)); + elTest.setStringElement(offsetToken); + } + } + counter++; + } + } + + protected void setMatchElement(final Attributes attrs) throws SAXException { + inMatch = true; + match = new StringBuilder(); + Match.CaseConversion caseConversion = Match.CaseConversion.NONE; + if (attrs.getValue("case_conversion") != null) { + caseConversion = Match.CaseConversion.toCase(attrs + .getValue("case_conversion").toUpperCase()); + } + Match.IncludeRange includeRange = Match.IncludeRange.NONE; + if (attrs.getValue("include_skipped") != null) { + includeRange = Match.IncludeRange.toRange(attrs + .getValue("include_skipped").toUpperCase()); + } + final Match mWorker = new Match(attrs.getValue(POSTAG), attrs + .getValue("postag_replace"), YES + .equals(attrs.getValue(POSTAG_REGEXP)), attrs + .getValue("regexp_match"), attrs.getValue("regexp_replace"), + caseConversion, YES.equals(attrs.getValue("setpos")), + includeRange); + mWorker.setInMessageOnly(!inSuggestion); + if (inMessage) { + if (suggestionMatches == null) { + suggestionMatches = new ArrayList<Match>(); + } + suggestionMatches.add(mWorker); + //add incorrect XML character for simplicity + message.append("\u0001\\"); + message.append(attrs.getValue("no")); + if (StringTools.isEmpty(attrs.getValue("no"))) { + throw new SAXException("References cannot be empty: " + "\n Line: " + + pLocator.getLineNumber() + ", column: " + + pLocator.getColumnNumber() + "."); + } else if (Integer.parseInt(attrs.getValue("no")) < 1) { + throw new SAXException("References must be larger than 0: " + + attrs.getValue("no") + "\n Line: " + pLocator.getLineNumber() + + ", column: " + pLocator.getColumnNumber() + "."); + } + } else if (inToken && attrs.getValue("no") != null) { + final int refNumber = Integer.parseInt(attrs.getValue("no")); + if (refNumber > elementList.size()) { + throw new SAXException( + "Only backward references in match elements are possible, tried to specify token " + + refNumber + + "\n Line: " + + pLocator.getLineNumber() + + ", column: " + pLocator.getColumnNumber() + "."); + } + mWorker.setTokenRef(refNumber); + tokenReference = mWorker; + elements.append('\\'); + elements.append(refNumber); + } + } + + protected void setExceptions(final Attributes attrs) { + inException = true; + exceptions = new StringBuilder(); + resetException(); + + exceptionStringNegation = YES.equals(attrs.getValue(NEGATE)); + exceptionValidNext = "next".equals(attrs.getValue(SCOPE)); + exceptionValidPrev = "previous".equals(attrs.getValue(SCOPE)); + exceptionStringInflected = YES.equals(attrs.getValue(INFLECTED)); + + if (attrs.getValue(POSTAG) != null) { + exceptionPosToken = attrs.getValue(POSTAG); + exceptionPosRegExp = YES.equals(attrs.getValue(POSTAG_REGEXP)); + exceptionPosNegation = YES.equals(attrs.getValue(NEGATE_POS)); + } + exceptionStringRegExp = YES.equals(attrs.getValue(REGEXP)); + if (attrs.getValue(SPACEBEFORE) != null) { + exceptionSpaceBefore = YES.equals(attrs.getValue(SPACEBEFORE)); + exceptionSpaceBeforeSet = !"ignore".equals(attrs.getValue(SPACEBEFORE)); + } + } + + protected void finalizeExceptions() { + inException = false; + if (!exceptionSet) { + tokenElement = new Element(StringTools.trimWhitespace(elements + .toString()), caseSensitive, regExpression, tokenInflected); + exceptionSet = true; + } + tokenElement.setNegation(tokenNegated); + if (!StringTools.isEmpty(exceptions.toString())) { + tokenElement.setStringException(StringTools.trimWhitespace(exceptions + .toString()), exceptionStringRegExp, exceptionStringInflected, + exceptionStringNegation, exceptionValidNext, exceptionValidPrev); + } + if (exceptionPosToken != null) { + tokenElement.setPosException(exceptionPosToken, exceptionPosRegExp, + exceptionPosNegation, exceptionValidNext, exceptionValidPrev); + exceptionPosToken = null; + } + if (exceptionSpaceBeforeSet) { + tokenElement.setExceptionSpaceBefore(exceptionSpaceBefore); + } + resetException(); + } + + protected void setToken(final Attributes attrs) { + inToken = true; + + if (lastPhrase) { + elementList.clear(); + } + + lastPhrase = false; + tokenNegated = YES.equals(attrs.getValue(NEGATE)); + tokenInflected = YES.equals(attrs.getValue(INFLECTED)); + if (attrs.getValue("skip") != null) { + skipPos = Integer.parseInt(attrs.getValue("skip")); + } + elements = new StringBuilder(); + // POSElement creation + if (attrs.getValue(POSTAG) != null) { + posToken = attrs.getValue(POSTAG); + posRegExp = YES.equals(attrs.getValue(POSTAG_REGEXP)); + posNegation = YES.equals(attrs.getValue(NEGATE_POS)); + } + regExpression = YES.equals(attrs.getValue(REGEXP)); + + if (attrs.getValue(SPACEBEFORE) != null) { + tokenSpaceBefore = YES.equals(attrs.getValue(SPACEBEFORE)); + tokenSpaceBeforeSet = !"ignore".equals(attrs.getValue(SPACEBEFORE)); + } + + if (!inAndGroup) { + tokenCounter++; + } + } + + protected void checkPositions(final int add) throws SAXException { + if (startPositionCorrection >= tokenCounter + add) { + throw new SAXException( + "Attempt to mark a token no. ("+ startPositionCorrection +") that is outside the pattern (" + + tokenCounter + "). Pattern elements are numbered starting from 0!" + "\n Line: " + + pLocator.getLineNumber() + ", column: " + + pLocator.getColumnNumber() + "."); + } + if (tokenCounter +add - endPositionCorrection < 0) { + throw new SAXException( + "Attempt to mark a token no. ("+ endPositionCorrection +") that is outside the pattern (" + + tokenCounter + " elements). End positions should be negative but not larger than the token count!" + + "\n Line: " + + pLocator.getLineNumber() + ", column: " + + pLocator.getColumnNumber() + "."); + } + } + + protected void checkMarkPositions() { + if (phraseElementList == null || phraseElementList.size() == 0) { + final int endMarker = elementList.size() + endPositionCorrection; + if (endMarker <= startPositionCorrection) { + throw new RuntimeException("Invalid combination of mark_from (" + startPositionCorrection + + ") and mark_to (" + endPositionCorrection + ") for rule " + id + + " with " + elementList.size() + + " tokens: the error position created by mark_from and mark_to is less than one token"); + } + } + } + + /** + * Adds Match objects for all references to tokens + * (including '\1' and the like). + */ + protected List<Match> addLegacyMatches() { + if (suggestionMatches == null || suggestionMatches.isEmpty()) { + return null; + } + final List<Match> sugMatch = new ArrayList<Match>(); + final String messageStr = message.toString(); + int pos = 0; + int ind = 0; + int matchCounter = 0; + while (pos != -1) { + pos = messageStr.indexOf('\\', ind + 1); + if (pos != -1 && messageStr.length() > pos) { + if (Character.isDigit(messageStr.charAt(pos + 1))) { + if (pos == 1 || messageStr.charAt(pos - 1) != '\u0001') { + final Match mWorker = new Match(null, null, false, null, + null, Match.CaseConversion.NONE, false, Match.IncludeRange.NONE); + mWorker.setInMessageOnly(true); + sugMatch.add(mWorker); + } else if (messageStr.charAt(pos - 1) == '\u0001') { // real suggestion marker + sugMatch.add(suggestionMatches.get(matchCounter)); + message.deleteCharAt(pos - 1 - matchCounter); + matchCounter++; + } + } + } + ind = pos; + } + if (sugMatch.isEmpty()) { + return suggestionMatches; + } + return sugMatch; + } + + protected void finalizeTokens() { + if (!exceptionSet || tokenElement == null) { + tokenElement = new Element(StringTools.trimWhitespace(elements + .toString()), caseSensitive, regExpression, tokenInflected); + tokenElement.setNegation(tokenNegated); + } else { + tokenElement.setStringElement(StringTools.trimWhitespace(elements + .toString())); + } + + if (skipPos != 0) { + tokenElement.setSkipNext(skipPos); + skipPos = 0; + } + if (posToken != null) { + tokenElement.setPosElement(posToken, posRegExp, posNegation); + posToken = null; + } + + if (tokenReference != null) { + tokenElement.setMatch(tokenReference); + } + + if (inAndGroup && andGroupCounter > 0) { + elementList.get(elementList.size() - 1) + .setAndGroupElement(tokenElement); + } else { + elementList.add(tokenElement); + } + if (inAndGroup) { + andGroupCounter++; + } + + if (inUnification) { + tokenElement.setUnification(equivalenceFeatures); + if (uniNegation) { + tokenElement.setUniNegation(); + } + } + + if (inUnificationDef) { + language.getUnifier().setEquivalence(uFeature, uType, tokenElement); + elementList.clear(); + } + if (tokenSpaceBeforeSet) { + tokenElement.setWhitespaceBefore(tokenSpaceBefore); + } + resetToken(); + } + +} |