diff options
Diffstat (limited to 'JLanguageTool/src/java/de/danielnaber')
219 files changed, 29932 insertions, 0 deletions
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/AnalyzedSentence.java b/JLanguageTool/src/java/de/danielnaber/languagetool/AnalyzedSentence.java new file mode 100644 index 0000000..6c50282 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/AnalyzedSentence.java @@ -0,0 +1,197 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import de.danielnaber.languagetool.tagging.de.AnalyzedGermanTokenReadings; + +/** + * A sentence that has been tokenized and analyzed. 
+ * + * @author Daniel Naber + */ +public class AnalyzedSentence { + + @Override + public int hashCode() { + final int prime = 31; + int result = 1; + result = prime * result + Arrays.hashCode(nonBlankTokens); + result = prime * result + Arrays.hashCode(tokens); + result = prime * result + Arrays.hashCode(whPositions); + return result; + } + + @Override + public boolean equals(Object obj) { + if (this == obj) + return true; + if (obj == null) + return false; + if (getClass() != obj.getClass()) + return false; + final AnalyzedSentence other = (AnalyzedSentence) obj; + if (!Arrays.equals(nonBlankTokens, other.nonBlankTokens)) + return false; + if (!Arrays.equals(tokens, other.tokens)) + return false; + if (!Arrays.equals(whPositions, other.whPositions)) + return false; + return true; + } + + private AnalyzedTokenReadings[] tokens; + + private AnalyzedTokenReadings[] nonBlankTokens; + + /** + * Array mapping positions of tokens as returned with + * getTokensWithoutWhitespace() to the internal tokens array. + */ + private int[] whPositions; + + /** + * Sets {@link AnalyzedTokenReadings}. Whitespace is also a token. + */ + public AnalyzedSentence(final AnalyzedTokenReadings[] tokens) { + this.tokens = tokens; + } + + public AnalyzedSentence(final AnalyzedTokenReadings[] tokens, final + int[] whPositions) { + this.tokens = tokens; + this.setWhPositions(whPositions); + getTokensWithoutWhitespace(); + } + + /** + * Returns the {@link AnalyzedTokenReadings} of the analyzed text. Whitespace + * is also a token. + */ + public final AnalyzedTokenReadings[] getTokens() { + return tokens; + } + + /** + * Returns the {@link AnalyzedTokenReadings} of the analyzed text, with + * whitespace tokens removed but with the artificial <code>SENT_START</code> + * token included. 
+ */ + public final AnalyzedTokenReadings[] getTokensWithoutWhitespace() { + if (nonBlankTokens == null) { + int whCounter = 0; + int nonWhCounter = 0; + final int[] mapping = new int[tokens.length + 1]; + final List<AnalyzedTokenReadings> l = new ArrayList<AnalyzedTokenReadings>(); + for (final AnalyzedTokenReadings token : tokens) { + if (!token.isWhitespace() || token.isSentStart() || token.isSentEnd() + || token.isParaEnd()) { + l.add(token); + mapping[nonWhCounter] = whCounter; + nonWhCounter++; + } + whCounter++; + } + setNonBlankTokens(l.toArray(new AnalyzedTokenReadings[l.size()])); + setWhPositions(mapping.clone()); + } + return nonBlankTokens.clone(); + } + + /** + * Get a position of a non-whitespace token in the original sentence with + * whitespace. + * + * @param nonWhPosition + * Position of a non-whitespace token + * @return int position in the original sentence. + */ + public final int getOriginalPosition(final int nonWhPosition) { + if (nonBlankTokens == null) { + getTokensWithoutWhitespace(); + } + return getWhPositions()[nonWhPosition]; + } + + @Override + public final String toString() { + final StringBuilder sb = new StringBuilder(); + for (final AnalyzedTokenReadings element : tokens) { + if (!element.isWhitespace()) { + sb.append(element.getToken()); + sb.append('['); + } + for (int j = 0; j < element.getReadingsLength(); j++) { + final String posTag = element.getAnalyzedToken(j).getPOSTag(); + if (element.isSentStart()) { + sb.append("<S>"); + } else if (JLanguageTool.SENTENCE_END_TAGNAME.equals(element + .getAnalyzedToken(j).getPOSTag())) { + sb.append("</S>"); + } else if (JLanguageTool.PARAGRAPH_END_TAGNAME.equals(element + .getAnalyzedToken(j).getPOSTag())) { + sb.append("<P/>"); + } else if (element.getAnalyzedToken(j) != null && posTag == null + && !(element instanceof AnalyzedGermanTokenReadings)) { + // FIXME: don't depend on AnalyzedGermanTokenReadings here + sb.append(element.getAnalyzedToken(j).getToken()); + } else { + if 
(!element.isWhitespace()) { + sb.append(element.getAnalyzedToken(j)); + if (j < element.getReadingsLength() - 1) { + sb.append(','); + } + } + } + } + if (!element.isWhitespace()) { + sb.append(']'); + } else { + sb.append(' '); + } + + } + return sb.toString(); + } + + /** + * @param whPositions the whPositions to set + */ + public void setWhPositions(int[] whPositions) { + this.whPositions = whPositions; + } + + /** + * @return the whPositions + */ + public int[] getWhPositions() { + return whPositions; + } + + /** + * @param nonBlankTokens the nonBlankTokens to set + */ + public void setNonBlankTokens(AnalyzedTokenReadings[] nonBlankTokens) { + this.nonBlankTokens = nonBlankTokens; + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/AnalyzedToken.java b/JLanguageTool/src/java/de/danielnaber/languagetool/AnalyzedToken.java new file mode 100644 index 0000000..d36274a --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/AnalyzedToken.java @@ -0,0 +1,137 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool; + +/** + * A word (or punctuation, or whitespace) and its part-of-speech tag. + * + * @author Daniel Naber + */ +public class AnalyzedToken { + + private final String token; + private final String posTag; + private final String lemma; + + /** + * used only for matching with Elements + */ + private final String tokenInflected; + + private boolean isWhitespaceBefore; + + public AnalyzedToken(final String token, final String posTag, final String lemma) { + if (token == null) { + throw new NullPointerException("Token cannot be null!"); + } + this.token = token; + this.posTag = posTag; + this.lemma = lemma; + if (lemma == null) { + tokenInflected = token; + } else { + tokenInflected = lemma; + } + } + + public final String getToken() { + return token; + } + + public final String getPOSTag() { + return posTag; + } + + public final String getLemma() { + return lemma; + } + + public final String getTokenInflected() { + return tokenInflected; + } + + public final void setWhitespaceBefore(final boolean isWhite) { + isWhitespaceBefore = isWhite; + } + + public final boolean isWhitespaceBefore() { + return isWhitespaceBefore; + } + + public String toString() { + final StringBuilder sb = new StringBuilder(); + sb.append(tokenInflected); + sb.append('/'); + sb.append(posTag); + return sb.toString(); + } + + @Override + public final int hashCode() { + // TODO: use Apache Commons Lang HashCodeBuilder + final int prime = 31; + int result = 1; + result = prime * result + (isWhitespaceBefore ? 1231 : 1237); + result = prime * result + ((lemma == null) ? 0 : lemma.hashCode()); + result = prime * result + ((posTag == null) ? 0 : posTag.hashCode()); + result = prime * result + ((token == null) ? 
0 : token.hashCode()); + return result; + } + + @Override + public final boolean equals(final Object obj) { + // TODO: use Apache Commons Lang EqualsBuilder + if (this == obj) { + return true; + } + if (obj == null) { + return false; + } + if (getClass() != obj.getClass()) { + return false; + } + final AnalyzedToken other = (AnalyzedToken) obj; + if (isWhitespaceBefore != other.isWhitespaceBefore) { + return false; + } + if (lemma == null) { + if (other.lemma != null) { + return false; + } + } else if (!lemma.equals(other.lemma)) { + return false; + } + if (posTag == null) { + if (other.posTag != null) { + return false; + } + } else if (!posTag.equals(other.posTag)) { + return false; + } + if (token == null) { + if (other.token != null) { + return false; + } + } else if (!token.equals(other.token)) { + return false; + } + return true; + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/AnalyzedTokenReadings.java b/JLanguageTool/src/java/de/danielnaber/languagetool/AnalyzedTokenReadings.java new file mode 100644 index 0000000..ac6dc54 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/AnalyzedTokenReadings.java @@ -0,0 +1,284 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ + +package de.danielnaber.languagetool; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import de.danielnaber.languagetool.tools.StringTools; + +/** + * An array of {@link AnalyzedToken}s used to store multiple POS tags and lemmas + * for a given single token. + * + * @author Marcin Milkowski + */ +public class AnalyzedTokenReadings { + + @Override + public int hashCode() { + final int prime = 31; + int result = 1; + result = prime * result + Arrays.hashCode(anTokReadings); + result = prime * result + (isLinebreak ? 1231 : 1237); + result = prime * result + (isParaEnd ? 1231 : 1237); + result = prime * result + (isSentEnd ? 1231 : 1237); + result = prime * result + (isSentStart ? 1231 : 1237); + result = prime * result + (isWhitespace ? 1231 : 1237); + result = prime * result + (isWhitespaceBefore ? 1231 : 1237); + result = prime * result + startPos; + result = prime * result + ((token == null) ? 
0 : token.hashCode()); + return result; + } + + @Override + public boolean equals(Object obj) { + if (this == obj) + return true; + if (obj == null) + return false; + if (getClass() != obj.getClass()) + return false; + final AnalyzedTokenReadings other = (AnalyzedTokenReadings) obj; + if (!Arrays.equals(anTokReadings, other.anTokReadings)) + return false; + if (isLinebreak != other.isLinebreak) + return false; + if (isParaEnd != other.isParaEnd) + return false; + if (isSentEnd != other.isSentEnd) + return false; + if (isSentStart != other.isSentStart) + return false; + if (isWhitespace != other.isWhitespace) + return false; + if (isWhitespaceBefore != other.isWhitespaceBefore) + return false; + if (startPos != other.startPos) + return false; + if (token == null) { + if (other.token != null) + return false; + } else if (!token.equals(other.token)) + return false; + return true; + } + + protected AnalyzedToken[] anTokReadings; + private int startPos; + private String token; + + private boolean isWhitespace; + private boolean isLinebreak; + private boolean isSentEnd; + private boolean isSentStart; + private boolean isParaEnd; + + private boolean isWhitespaceBefore; + + public AnalyzedTokenReadings(final AnalyzedToken[] r, final int startPos) { + anTokReadings = r.clone(); + this.startPos = startPos; + init(); + } + + public AnalyzedTokenReadings(final List<AnalyzedToken> list, final int startPos) { + anTokReadings = list.toArray(new AnalyzedToken[list.size()]); + this.startPos = startPos; + init(); + } + + AnalyzedTokenReadings(final AnalyzedToken at) { + anTokReadings = new AnalyzedToken[1]; + anTokReadings[0] = at; + isWhitespaceBefore = at.isWhitespaceBefore(); + init(); + } + + public AnalyzedTokenReadings(final AnalyzedToken at, final int startPos) { + this(at); + this.startPos = startPos; + } + + private void init() { + token = anTokReadings[0].getToken(); + isWhitespace = StringTools.isWhitespace(token); + isLinebreak = "\n".equals(token) || 
"\r\n".equals(token) + || "\r".equals(token) || "\n\r".equals(token); + isSentStart = JLanguageTool.SENTENCE_START_TAGNAME.equals(anTokReadings[0] + .getPOSTag()); + isParaEnd = hasPosTag(JLanguageTool.PARAGRAPH_END_TAGNAME); + isSentEnd = hasPosTag(JLanguageTool.SENTENCE_END_TAGNAME); + } + + public final List<AnalyzedToken> getReadings() { + return Arrays.asList(anTokReadings); + } + + /** + * Checks if the token has a particular POS tag. + * + * @param pos + * POS Tag to check + * @return True if it does. + */ + public final boolean hasPosTag(final String pos) { + boolean found = false; + for (final AnalyzedToken reading : anTokReadings) { + if (reading.getPOSTag() != null) { + found = pos.equals(reading.getPOSTag()); + if (found) { + break; + } + } + } + return found; + } + + public final AnalyzedToken getAnalyzedToken(final int i) { + return anTokReadings[i]; + } + + public final void addReading(final AnalyzedToken tok) { + final ArrayList<AnalyzedToken> l = new ArrayList<AnalyzedToken>(); + for (int i = 0; i < anTokReadings.length - 1; i++) { + l.add(anTokReadings[i]); + } + if (anTokReadings[anTokReadings.length - 1].getPOSTag() != null) { + l.add(anTokReadings[anTokReadings.length - 1]); + } + tok.setWhitespaceBefore(isWhitespaceBefore); + l.add(tok); + anTokReadings = l.toArray(new AnalyzedToken[l.size()]); + if (tok.getToken().length() > token.length()) { //in case a longer token is added + token = tok.getToken(); + } + anTokReadings[anTokReadings.length - 1]. 
+ setWhitespaceBefore(isWhitespaceBefore); + isParaEnd = hasPosTag(JLanguageTool.PARAGRAPH_END_TAGNAME); + isSentEnd = hasPosTag(JLanguageTool.SENTENCE_END_TAGNAME); + } + + public final void removeReading(final AnalyzedToken tok) { + final ArrayList<AnalyzedToken> l = new ArrayList<AnalyzedToken>(); + final AnalyzedToken tmpTok = new AnalyzedToken(tok.getToken(), tok + .getPOSTag(), tok.getLemma()); + tmpTok.setWhitespaceBefore(isWhitespaceBefore); + for (AnalyzedToken anTokReading : anTokReadings) { + if (!anTokReading.equals(tmpTok)) { + l.add(anTokReading); + } + } + anTokReadings = l.toArray(new AnalyzedToken[l.size()]); + } + + public final int getReadingsLength() { + return anTokReadings.length; + } + + public final boolean isWhitespace() { + return isWhitespace; + } + + /** + * Returns true if the token equals \n, \r\n \n\r or \r\n. + */ + public final boolean isLinebreak() { + return isLinebreak; + } + + public final boolean isSentStart() { + return isSentStart; + } + + /** + * @return true when the token is a last token in a paragraph. + */ + public final boolean isParaEnd() { + return isParaEnd; + } + + /** + * Add PARA_END tag. + */ + public void setParaEnd() { + final AnalyzedToken paragraphEnd = new AnalyzedToken(getToken(), + JLanguageTool.PARAGRAPH_END_TAGNAME, getAnalyzedToken(0).getLemma()); + addReading(paragraphEnd); + } + + /** + * @return true when the token is a last token in a sentence. + */ + public final boolean isSentEnd() { + return isSentEnd; + } + + /** + * @since 0.9.9 + * @return true if the token is OpenOffice field code. + */ + public final boolean isFieldCode() { + return "\u0001".equals(token) || "\u0002".equals(token); + } + + /** + * Add a SENT_END tag. 
+ */ + public final void setSentEnd() { + final AnalyzedToken sentenceEnd = new AnalyzedToken(getToken(), + JLanguageTool.SENTENCE_END_TAGNAME, getAnalyzedToken(0).getLemma()); + addReading(sentenceEnd); + } + + public final int getStartPos() { + return startPos; + } + + public final void setStartPos(final int position) { + startPos = position; + } + + public final String getToken() { + return token; + } + + public final void setWhitespaceBefore(final boolean isWhite) { + isWhitespaceBefore = isWhite; + for (final AnalyzedToken aTok : anTokReadings) { + aTok.setWhitespaceBefore(isWhite); + } + } + + public final boolean isWhitespaceBefore() { + return isWhitespaceBefore; + } + + @Override + public String toString() { + final StringBuilder sb = new StringBuilder(); + for (final AnalyzedToken element : anTokReadings) { + sb.append(element); + } + return sb.toString(); + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/JLanguageTool.java b/JLanguageTool/src/java/de/danielnaber/languagetool/JLanguageTool.java new file mode 100644 index 0000000..44bdfec --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/JLanguageTool.java @@ -0,0 +1,802 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool; + +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.PrintStream; +import java.lang.reflect.Constructor; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashSet; +import java.util.HashMap; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.MissingResourceException; +import java.util.ResourceBundle; +import java.util.Set; + +import javax.xml.parsers.ParserConfigurationException; + +import org.xml.sax.SAXException; + +import de.danielnaber.languagetool.databroker.DefaultResourceDataBroker; +import de.danielnaber.languagetool.databroker.ResourceDataBroker; +import de.danielnaber.languagetool.rules.Rule; +import de.danielnaber.languagetool.rules.RuleMatch; +import de.danielnaber.languagetool.rules.patterns.FalseFriendRuleLoader; +import de.danielnaber.languagetool.rules.patterns.PatternRule; +import de.danielnaber.languagetool.rules.patterns.PatternRuleLoader; +import de.danielnaber.languagetool.tagging.Tagger; +import de.danielnaber.languagetool.tagging.disambiguation.Disambiguator; +import de.danielnaber.languagetool.tokenizers.Tokenizer; +import de.danielnaber.languagetool.tools.ReflectionUtils; + +/** + * The main class used for checking text against different rules: + * <ul> + * <li>the built-in rules (<i>a</i> vs. <i>an</i>, whitespace after commas, ...) 
+ * <li>pattern rules loaded from external XML files with
+ * {@link #loadPatternRules(String)}
+ * <li>your own implementation of the abstract {@link Rule} classes added with
+ * {@link #addRule(Rule)}
+ * </ul>
+ *
+ * <p>
+ * Note that the constructors create a language checker that uses the built-in
+ * rules only. Other rules (e.g. from XML) need to be added explicitly.
+ *
+ * @author Daniel Naber
+ */
+@SuppressWarnings({"UnusedDeclaration"})
+public final class JLanguageTool {
+
+  public static final String VERSION = "1.3-dev"; // keep in sync with
+                                                  // build.properties!
+
+  // shared by all instances; read and replaced via getDataBroker()/setDataBroker()
+  private static ResourceDataBroker dataBroker = new DefaultResourceDataBroker();
+  public static final String PATTERN_FILE = "grammar.xml";
+  public static final String FALSE_FRIEND_FILE = "false-friends.xml";
+
+  // POS tag names of the artificial sentence/paragraph boundary readings
+  public static final String SENTENCE_START_TAGNAME = "SENT_START";
+  public static final String SENTENCE_END_TAGNAME = "SENT_END";
+  public static final String PARAGRAPH_END_TAGNAME = "PARA_END";
+
+  // filled by the constructor with the built-in rules that support the language
+  private final List<Rule> builtinRules = new ArrayList<Rule>();
+  private final List<Rule> userRules = new ArrayList<Rule>(); // rules added via addRule() method
+  private final Set<String> disabledRules = new HashSet<String>();
+  // ids of default-off rules that have been switched on
+  private final Set<String> enabledRules = new HashSet<String>();
+
+  private final Set<String> disabledCategories = new HashSet<String>();
+
+  private Language language;
+  private Language motherTongue;
+  private Disambiguator disambiguator;
+  private Tagger tagger;
+  private Tokenizer sentenceTokenizer;
+  private Tokenizer wordTokenizer;
+
+  // receiver of verbose output, see setOutput()
+  private PrintStream printStream;
+
+  private int sentenceCount;
+
+  private boolean listUnknownWords;
+  private Set<String> unknownWords;
+
+  /**
+   * Constants for correct paragraph-rule handling.
+   */
+  public static enum paragraphHandling {
+    /**
+     * Handle normally - all kinds of rules run.
+     */
+    NORMAL,
+    /**
+     * Run only paragraph-level rules.
+     */
+    ONLYPARA,
+    /**
+     * Run only sentence-level rules.
+     */
+    ONLYNONPARA
+  }
+
+  // just for testing:
+  /*
+   * private Rule[] allBuiltinRules = new Rule[] { new
+   * UppercaseSentenceStartRule() };
+   */
+
+  /**
+   * Create a JLanguageTool and setup the built-in rules appropriate for the
+   * given language, ignoring false friend hints. Equivalent to calling the
+   * two-argument constructor with a <code>null</code> mother tongue.
+   *
+   * @param language
+   *          the language to be used, must not be <code>null</code>.
+   * @throws IOException declared by the two-argument constructor
+   * @throws NullPointerException if <code>language</code> is <code>null</code>
+   */
+  public JLanguageTool(final Language language) throws IOException {
+    this(language, null);
+  }
+
+  /**
+   * Create a JLanguageTool and setup the built-in rules appropriate for the
+   * given language. Of all built-in rules found, only those whose
+   * <code>supportsLanguage()</code> accepts the language are kept. The
+   * disambiguator, tagger and tokenizers are taken from the language.
+   *
+   * @param language
+   *          the language to be used, must not be <code>null</code>.
+   * @param motherTongue
+   *          the user's mother tongue or <code>null</code>. The mother tongue
+   *          may also be used as a source language for checking bilingual texts.
+   *
+   * @throws IOException declared in the signature; NOTE(review): nothing in
+   *           this body visibly throws it, confirm whether a callee does
+   * @throws NullPointerException if <code>language</code> is <code>null</code>
+   */
+  public JLanguageTool(final Language language, final Language motherTongue)
+      throws IOException {
+    if (language == null) {
+      throw new NullPointerException("language cannot be null");
+    }
+    this.language = language;
+    this.motherTongue = motherTongue;
+    final ResourceBundle messages = getMessageBundle(language);
+    final Rule[] allBuiltinRules = getAllBuiltinRules(language, messages);
+    for (final Rule element : allBuiltinRules) {
+      if (element.supportsLanguage(language)) {
+        builtinRules.add(element);
+      }
+    }
+    disambiguator = language.getDisambiguator();
+    tagger = language.getTagger();
+    sentenceTokenizer = language.getSentenceTokenizer();
+    wordTokenizer = language.getWordTokenizer();
+  }
+
+  /**
+   * The grammar checker needs resources from the following
+   * directories:
+   *
+   * <ul style="list-type: circle">
+   * <li>{@code /resource}</li>
+   * <li>{@code /rules}</li>
+   * </ul>
+   *
+   * This method is thread-safe.
+   *
+   * @return The currently set data broker which allows to obtain
+   * resources from the mentioned directories above. If no
+   * data broker was set, a new {@link DefaultResourceDataBroker} will
+   * be instantiated and returned.
+ * @since 1.0.1 + */ + public static synchronized ResourceDataBroker getDataBroker() { + if (JLanguageTool.dataBroker == null) { + JLanguageTool.dataBroker = new DefaultResourceDataBroker(); + } + return JLanguageTool.dataBroker; + } + + /** + * The grammar checker does need resources from following + * directories: + * + * <ul style="list-type: circle"> + * <li>{@code /resource}</li> + * <li>{@code /rules}</li> + * </ul> + * + * This method is thread-safe. + * + * @param broker The new resource broker to be used. + * @since 1.0.1 + */ + public static synchronized void setDataBroker(ResourceDataBroker broker) { + JLanguageTool.dataBroker = broker; + } + + /** + * Whether the check() method stores unknown words. If set to + * <code>true</code> (default: false), you can get the list of unknown words + * using getUnknownWords(). + */ + public void setListUnknownWords(final boolean listUnknownWords) { + this.listUnknownWords = listUnknownWords; + } + + /** + * Gets the ResourceBundle for the default language of the user's system. + */ + public static ResourceBundle getMessageBundle() { + try { + return ResourceBundle + .getBundle("de.danielnaber.languagetool.MessagesBundle"); + } catch (final MissingResourceException e) { + return ResourceBundle.getBundle( + "de.danielnaber.languagetool.MessagesBundle", Locale.ENGLISH); + } + } + + /** + * Gets the ResourceBundle for the given user interface language. 
+ */ + private static ResourceBundle getMessageBundle(final Language lang) { + try { + return ResourceBundle.getBundle( + "de.danielnaber.languagetool.MessagesBundle", lang.getLocale()); + } catch (final MissingResourceException e) { + return ResourceBundle.getBundle( + "de.danielnaber.languagetool.MessagesBundle", Locale.ENGLISH); + } + } + + private Rule[] getAllBuiltinRules(final Language language, + final ResourceBundle messages) { + // use reflection to get a list of all non-pattern rules under + // "de.danielnaber.languagetool.rules" + // generic rules first, then language-specific ones + // TODO: the order of loading classes is not guaranteed so we may want to + // implement rule + // precedence + + final List<Rule> rules = new ArrayList<Rule>(); + try { + // we pass ".*Rule$" regexp to improve efficiency, see javadoc + final Class[] classes1 = ReflectionUtils.findClasses(Rule.class + .getClassLoader(), Rule.class.getPackage().getName(), ".*Rule$", 0, + Rule.class, null); + final Class[] classes2 = ReflectionUtils.findClasses(Rule.class + .getClassLoader(), Rule.class.getPackage().getName() + "." 
+ + language.getShortName(), ".*Rule$", 0, Rule.class, null); + + final List<Class> classes = new ArrayList<Class>(); + classes.addAll(Arrays.asList(classes1)); + classes.addAll(Arrays.asList(classes2)); + + for (final Class class1 : classes) { + final Constructor[] constructors = class1.getConstructors(); + for (final Constructor constructor : constructors) { + final Class[] paramTypes = constructor.getParameterTypes(); + if (paramTypes.length == 1 + && paramTypes[0].equals(ResourceBundle.class)) { + rules.add((Rule) constructor.newInstance(messages)); + break; + } + if (paramTypes.length == 2 + && paramTypes[0].equals(ResourceBundle.class) + && paramTypes[1].equals(Language.class)) { + rules.add((Rule) constructor.newInstance(messages, language)); + break; + } + throw new RuntimeException("Unknown constructor for rule class: " + + class1.getName()); + } + } + } catch (final Exception e) { + throw new RuntimeException("Failed to load rules for language " + language, e); + } + // System.err.println("Loaded " + rules.size() + " rules"); + return rules.toArray(new Rule[rules.size()]); + } + + /** + * Set a PrintStream that will receive verbose output. Set to + * <code>null</code> to disable verbose output. + */ + public void setOutput(final PrintStream printStream) { + this.printStream = printStream; + } + + /** + * Load pattern rules from an XML file. Use {@link #addRule} to add these + * rules to the checking process. + * + * @throws IOException + * @return a List of {@link PatternRule} objects + */ + public List<PatternRule> loadPatternRules(final String filename) + throws IOException { + final PatternRuleLoader ruleLoader = new PatternRuleLoader(); + InputStream is = this.getClass().getResourceAsStream(filename); + if (is == null) { + // happens for external rules plugged in as an XML file: + is = new FileInputStream(filename); + } + return ruleLoader.getRules(is, filename); + } + + /** + * Load false friend rules from an XML file. 
Only those pairs will be loaded + * that match the current text language and the mother tongue specified in the + * JLanguageTool constructor. Use {@link #addRule} to add these rules to the + * checking process. + * + * @throws ParserConfigurationException + * @throws SAXException + * @throws IOException + * @return a List of {@link PatternRule} objects + */ + public List<PatternRule> loadFalseFriendRules(final String filename) + throws ParserConfigurationException, SAXException, IOException { + if (motherTongue == null) { + return new ArrayList<PatternRule>(); + } + final FalseFriendRuleLoader ruleLoader = new FalseFriendRuleLoader(); + return ruleLoader.getRules(this.getClass().getResourceAsStream(filename), + language, motherTongue); + } + + /** + * Loads and activates the pattern rules from + * <code>rules/<language>/grammar.xml</code>. + * + * @throws ParserConfigurationException + * @throws SAXException + * @throws IOException + */ + public void activateDefaultPatternRules() throws IOException { + final String defaultPatternFilename = language.getRuleFileName(); + final List<PatternRule> patternRules = loadPatternRules(defaultPatternFilename); + userRules.addAll(patternRules); + } + + /** + * Loads and activates the false friend rules from + * <code>rules/false-friends.xml</code>. + * + * @throws ParserConfigurationException + * @throws SAXException + * @throws IOException + */ + public void activateDefaultFalseFriendRules() + throws ParserConfigurationException, SAXException, IOException { + final String falseFriendRulesFilename = JLanguageTool.getDataBroker().getRulesDir() + "/" + FALSE_FRIEND_FILE; + final List<PatternRule> patternRules = loadFalseFriendRules(falseFriendRulesFilename); + userRules.addAll(patternRules); + } + + /** + * Add a rule to be used by the next call to {@link #check}. + */ + public void addRule(final Rule rule) { + userRules.add(rule); + } + + /** + * Disable a given rule so {@link #check} won't use it. 
+ * + * @param ruleId + * the id of the rule to disable + */ + public void disableRule(final String ruleId) { + // TODO: check if such a rule exists + disabledRules.add(ruleId); + } + + /** + * Disable a given category so {@link #check} won't use it. + * + * @param categoryName + * the id of the category to disable + */ + public void disableCategory(final String categoryName) { + // TODO: check if such a rule exists + disabledCategories.add(categoryName); + } + + /** + * Get the language that was used to configure this instance. + */ + public Language getLanguage() { + return language; + } + + /** + * Get rule ids of the rules that have been explicitly disabled. + */ + public Set<String> getDisabledRules() { + return disabledRules; + } + + /** + * Enable a rule that was switched off by default. + * + * @param ruleId + * the id of the turned off rule to enable. + * + */ + public void enableDefaultOffRule(final String ruleId) { + enabledRules.add(ruleId); + } + + /** + * Get category ids of the rules that have been explicitly disabled. + */ + public Set<String> getDisabledCategories() { + return disabledCategories; + } + + /** + * Re-enable a given rule so {@link #check} will use it. + * + * @param ruleId + * the id of the rule to enable + */ + public void enableRule(final String ruleId) { + if (disabledRules.contains(ruleId)) { + disabledRules.remove(ruleId); + } + } + + /** + * Returns tokenized sentences. + */ + public List<String> sentenceTokenize(final String text) { + return sentenceTokenizer.tokenize(text); + } + + /** + * The main check method. Tokenizes the text into sentences and matches these + * sentences against all currently active rules. + * + * @param text + * the text to check + * @return a List of {@link RuleMatch} objects + * @throws IOException + */ + public List<RuleMatch> check(final String text) throws IOException { + return check(text, true, paragraphHandling.NORMAL); + } + + + /** + * The main check method. 
Tokenizes the text into sentences and matches these + * sentences against all currently active rules. + * + * @param text + * the text to check + * @param tokenizeText + * If true, then the text is tokenized into sentences. + * Otherwise, it is assumed it's already tokenized. + * @param paraMode + * Uses paragraph-level rules only if true. + + * @return a List of {@link RuleMatch} objects + * @throws IOException + */ + public List<RuleMatch> check(final String text, boolean tokenizeText, final paragraphHandling paraMode) throws IOException { + sentenceCount = 0; + final List<String> sentences; + if (tokenizeText) { + sentences = sentenceTokenize(text); + } else { + sentences = new ArrayList<String>(); + sentences.add(text); + } + final List<RuleMatch> ruleMatches = new ArrayList<RuleMatch>(); + final List<Rule> allRules = getAllRules(); + printIfVerbose(allRules.size() + " rules activated for language " + + language); + int charCount = 0; + int lineCount = 0; + int columnCount = 1; + unknownWords = new HashSet<String>(); + for (final String sentence : sentences) { + sentenceCount++; + AnalyzedSentence analyzedText = getAnalyzedSentence(sentence); + rememberUnknownWords(analyzedText); + + if (sentenceCount == sentences.size()) { + final AnalyzedTokenReadings[] anTokens = analyzedText.getTokens(); + anTokens[anTokens.length - 1].setParaEnd(); + analyzedText = new AnalyzedSentence(anTokens); + } + + printIfVerbose(analyzedText.toString()); + final List<RuleMatch> sentenceMatches = + checkAnalyzedSentence(paraMode, allRules, charCount, lineCount, + columnCount, sentence, analyzedText); + + Collections.sort(sentenceMatches); + ruleMatches.addAll(sentenceMatches); + charCount += sentence.length(); + lineCount += countLineBreaks(sentence); + + // calculate matching column: + final int lineBreakPos = sentence.indexOf('\n'); + if (lineBreakPos == -1) { + columnCount += sentence.length() -1; + } else { + if (lineBreakPos == 0) { + columnCount = sentence.length(); + if 
(!language.getSentenceTokenizer(). + singleLineBreaksMarksPara()) { + columnCount--; + } + } else { + columnCount = 1; + } + } + } + + if (!paraMode.equals(paragraphHandling.ONLYNONPARA)) { + // removing false positives in paragraph-level rules + for (final Rule rule : allRules) { + if (rule.isParagraphBackTrack() && (rule.getMatches() != null)) { + final List<RuleMatch> rm = rule.getMatches(); + for (final RuleMatch r : rm) { + if (rule.isInRemoved(r)) { + ruleMatches.remove(r); + } + } + } + } + } + + return ruleMatches; + } + + public List<RuleMatch> checkAnalyzedSentence(final paragraphHandling paraMode, + final List<Rule> allRules, int tokenCount, int lineCount, + int columnCount, final String sentence, AnalyzedSentence analyzedText) + throws IOException { + final List<RuleMatch> sentenceMatches = new ArrayList<RuleMatch>(); + for (final Rule rule : allRules) { + if (disabledRules.contains(rule.getId()) + || (rule.isDefaultOff() && !enabledRules.contains(rule.getId()))) { + continue; + } + + if (disabledCategories.contains(rule.getCategory().getName())) { + continue; + } + + switch (paraMode) { + case ONLYNONPARA: { + if (rule.isParagraphBackTrack()) { + continue; + } + break; + } + case ONLYPARA: { + if (!rule.isParagraphBackTrack()) { + continue; + } + break; + } + case NORMAL: + default: + } + + final RuleMatch[] thisMatches = rule.match(analyzedText); + for (final RuleMatch element1 : thisMatches) { + RuleMatch thisMatch = adjustRuleMatchPos(element1, + tokenCount, columnCount, lineCount, sentence); + sentenceMatches.add(thisMatch); + if (rule.isParagraphBackTrack()) { + rule.addRuleMatch(thisMatch); + } + } + } + return sentenceMatches; + } + + /** + * Change RuleMatch positions so they are relative to the complete text, + * not just to the sentence: + * @param rm RuleMatch + * @param sentLen Count of characters + * @param columnCount Current column number + * @param lineCount Current line number + * @param sentence The text being checked + * @return + * 
The RuleMatch object with adjustments. + */ + public RuleMatch adjustRuleMatchPos(final RuleMatch rm, int sentLen, + int columnCount, int lineCount, final String sentence) { + final RuleMatch thisMatch = new RuleMatch(rm.getRule(), + rm.getFromPos() + sentLen, rm.getToPos() + + sentLen, rm.getMessage(), rm + .getShortMessage()); + thisMatch.setSuggestedReplacements(rm + .getSuggestedReplacements()); + final String sentencePartToError = sentence.substring(0, rm + .getFromPos()); + final String sentencePartToEndOfError = sentence.substring(0, + rm.getToPos()); + final int lastLineBreakPos = sentencePartToError.lastIndexOf('\n'); + final int column; + final int endColumn; + if (lastLineBreakPos == -1) { + column = sentencePartToError.length() + columnCount; + } else { + column = sentencePartToError.length() - lastLineBreakPos; + } + final int lastLineBreakPosInError = sentencePartToEndOfError + .lastIndexOf('\n'); + if (lastLineBreakPosInError == -1) { + endColumn = sentencePartToEndOfError.length() + columnCount + 1; + } else { + endColumn = sentencePartToEndOfError.length() - lastLineBreakPos; + } + final int lineBreaksToError = countLineBreaks(sentencePartToError); + final int lineBreaksToEndOfError = countLineBreaks(sentencePartToEndOfError); + thisMatch.setLine(lineCount + lineBreaksToError); + thisMatch.setEndLine(lineCount + lineBreaksToEndOfError); + thisMatch.setColumn(column); + thisMatch.setEndColumn(endColumn); + thisMatch.setOffset(rm.getFromPos() + sentLen); + return thisMatch; + } + + private void rememberUnknownWords(final AnalyzedSentence analyzedText) { + if (listUnknownWords) { + final AnalyzedTokenReadings[] atr = analyzedText + .getTokensWithoutWhitespace(); + for (final AnalyzedTokenReadings t : atr) { + if (t.getReadings().toString().contains("null]")) { + unknownWords.add(t.getToken()); + } + } + } + } + + /** + * Get the list of unknown words in the last run of the check() method. 
+ * + * @throws IllegalStateException + * if listUnknownWords is set to <code>false</code> + */ + public List<String> getUnknownWords() { + if (!listUnknownWords) { + throw new IllegalStateException( + "listUnknownWords is set to false, unknown words not stored"); + } + final List<String> words = new ArrayList<String>(unknownWords); + Collections.sort(words); + return words; + } + + static int countLineBreaks(final String s) { + int pos = -1; + int count = 0; + while (true) { + final int nextPos = s.indexOf('\n', pos + 1); + if (nextPos == -1) { + break; + } + pos = nextPos; + count++; + } + return count; + } + + /** + * Tokenizes the given <code>sentence</code> into words and analyzes it, + * and then disambiguates POS tags. + * + * @throws IOException + */ + public AnalyzedSentence getAnalyzedSentence(final String sentence) + throws IOException { + // disambiguate assigned tags & return + return disambiguator.disambiguate(getRawAnalyzedSentence(sentence)); + } + + /** + * Tokenizes the given <code>sentence</code> into words and analyzes it. 
+ * + * @since 0.9.8 + * @param sentence + * Sentence to be analyzed + * @return + * AnalyzedSentence + * @throws IOException + */ + public AnalyzedSentence getRawAnalyzedSentence(final String sentence) throws IOException { + final List<String> tokens = wordTokenizer.tokenize(sentence); + final Map<Integer, String> softHyphenTokens = new HashMap<Integer, String>(); + + //for soft hyphens inside words, happens especially in OOo: + for (int i = 0; i < tokens.size(); i++) { + if (tokens.get(i).indexOf('\u00ad') != -1) { + softHyphenTokens.put(i, tokens.get(i)); + tokens.set(i, tokens.get(i).replaceAll("\u00ad", "")); + } + } + + final List<AnalyzedTokenReadings> aTokens = tagger.tag(tokens); + final int numTokens = aTokens.size(); + int posFix = 0; + for (int i = 1; i < numTokens; i++) { + aTokens.get(i).setWhitespaceBefore(aTokens.get(i - 1).isWhitespace()); + aTokens.get(i).setStartPos(aTokens.get(i).getStartPos() + posFix); + if (!softHyphenTokens.isEmpty()) { + if (softHyphenTokens.get(i) != null) { + aTokens.get(i).addReading(tagger.createToken(softHyphenTokens.get(i), null)); + posFix += softHyphenTokens.get(i).length() - aTokens.get(i).getToken().length(); + } + } + } + + final AnalyzedTokenReadings[] tokenArray = new AnalyzedTokenReadings[tokens + .size() + 1]; + final AnalyzedToken[] startTokenArray = new AnalyzedToken[1]; + int toArrayCount = 0; + final AnalyzedToken sentenceStartToken = new AnalyzedToken("", SENTENCE_START_TAGNAME, null); + startTokenArray[0] = sentenceStartToken; + tokenArray[toArrayCount++] = new AnalyzedTokenReadings(startTokenArray, 0); + int startPos = 0; + for (final AnalyzedTokenReadings posTag : aTokens) { + posTag.setStartPos(startPos); + tokenArray[toArrayCount++] = posTag; + startPos += posTag.getToken().length(); + } + + // add additional tags + int lastToken = toArrayCount - 1; + // make SENT_END appear at last not whitespace token + for (int i = 0; i < toArrayCount - 1; i++) { + if (!tokenArray[lastToken - i].isWhitespace()) { 
+ lastToken -= i; + break; + } + } + + tokenArray[lastToken].setSentEnd(); + + if (tokenArray.length == lastToken + 1 && tokenArray[lastToken].isLinebreak()) { + tokenArray[lastToken].setParaEnd(); + } + return new AnalyzedSentence(tokenArray); + } + + /** + * Get all rules for the current language that are built-in or that have been + * added using {@link #addRule}. + * @return a List of {@link Rule} objects + */ + public List<Rule> getAllRules() { + final List<Rule> rules = new ArrayList<Rule>(); + rules.addAll(builtinRules); + rules.addAll(userRules); + // Some rules have an internal state so they can do checks over sentence + // boundaries. These need to be reset so the checks don't suddenly + // work on different texts with the same data. However, it could be useful + // to keep the state information if we're checking a continuous text. + for (final Rule rule : rules) { + rule.reset(); + } + return rules; + } + + /** + * Number of sentences the latest call to check() has checked. + */ + public int getSentenceCount() { + return sentenceCount; + + } + + private void printIfVerbose(final String s) { + if (printStream != null) { + printStream.println(s); + } + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/Language.java b/JLanguageTool/src/java/de/danielnaber/languagetool/Language.java new file mode 100644 index 0000000..a565058 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/Language.java @@ -0,0 +1,336 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.Locale; +import java.util.MissingResourceException; +import java.util.ResourceBundle; +import java.util.Set; + +import de.danielnaber.languagetool.language.Belarusian; +import de.danielnaber.languagetool.language.Catalan; +import de.danielnaber.languagetool.language.Contributor; +import de.danielnaber.languagetool.language.Demo; +import de.danielnaber.languagetool.language.Danish; +import de.danielnaber.languagetool.language.Bokmal; +import de.danielnaber.languagetool.language.Dutch; +import de.danielnaber.languagetool.language.English; +import de.danielnaber.languagetool.language.Esperanto; +import de.danielnaber.languagetool.language.French; +import de.danielnaber.languagetool.language.Galician; +import de.danielnaber.languagetool.language.German; +import de.danielnaber.languagetool.language.Icelandic; +import de.danielnaber.languagetool.language.Italian; +import de.danielnaber.languagetool.language.Lithuanian; +import de.danielnaber.languagetool.language.Malayalam; +import de.danielnaber.languagetool.language.Polish; +import de.danielnaber.languagetool.language.Romanian; +import de.danielnaber.languagetool.language.Russian; +import de.danielnaber.languagetool.language.Slovak; +import de.danielnaber.languagetool.language.Slovenian; +import de.danielnaber.languagetool.language.Spanish; +import de.danielnaber.languagetool.language.Swedish; 
+import de.danielnaber.languagetool.language.Ukrainian; +import de.danielnaber.languagetool.rules.patterns.Unifier; +import de.danielnaber.languagetool.synthesis.Synthesizer; +import de.danielnaber.languagetool.tagging.Tagger; +import de.danielnaber.languagetool.tagging.disambiguation.Disambiguator; +import de.danielnaber.languagetool.tagging.disambiguation.xx.DemoDisambiguator; +import de.danielnaber.languagetool.tagging.xx.DemoTagger; +import de.danielnaber.languagetool.tokenizers.SentenceTokenizer; +import de.danielnaber.languagetool.tokenizers.Tokenizer; +import de.danielnaber.languagetool.tokenizers.WordTokenizer; +import de.danielnaber.languagetool.tools.StringTools; + +/** + * Base class for any supported language (English, German, etc). + * + * @author Daniel Naber + */ +public abstract class Language { + + // NOTE: keep in sync with array below! + //public final static Language CZECH = new Czech(); + public static final Language DANISH = new Danish(); + public static final Language BOKMAL = new Bokmal(); + public static final Language DUTCH = new Dutch(); + public static final Language ENGLISH = new English(); + public static final Language ESPERANTO = new Esperanto(); + public static final Language FRENCH = new French(); + public static final Language GERMAN = new German(); + public static final Language ITALIAN = new Italian(); + public static final Language LITHUANIAN = new Lithuanian(); + public static final Language POLISH = new Polish(); + public static final Language SLOVAK = new Slovak(); + public static final Language SLOVENIAN = new Slovenian(); + public static final Language SPANISH = new Spanish(); + public static final Language SWEDISH = new Swedish(); + public static final Language UKRAINIAN = new Ukrainian(); + public static final Language RUSSIAN = new Russian(); + public static final Language ROMANIAN = new Romanian(); + public static final Language ICELANDIC = new Icelandic(); + public static final Language GALICIAN = new Galician(); + 
public static final Language CATALAN = new Catalan(); + public static final Language MALAYALAM = new Malayalam(); + public static final Language BELARUSIAN = new Belarusian(); + + public static final Language DEMO = new Demo(); + + private static List<Language> externalLanguages = new ArrayList<Language>(); + + /** + * All languages supported by LanguageTool. + */ + public static Language[] LANGUAGES = { + ENGLISH, GERMAN, POLISH, FRENCH, SPANISH, ITALIAN, DUTCH, LITHUANIAN, UKRAINIAN, RUSSIAN, + SLOVAK, SLOVENIAN, SWEDISH, ROMANIAN, ICELANDIC, GALICIAN, CATALAN, DANISH, + MALAYALAM, BELARUSIAN, ESPERANTO, BOKMAL, + DEMO + // FIXME: load dynamically from classpath + }; + + /** + * All languages supported by LanguageTool, but without the demo language. + */ + public static Language[] REAL_LANGUAGES = new Language[LANGUAGES.length-1]; + static { + int i = 0; + for (final Language lang : LANGUAGES) { + if (lang != DEMO) { + REAL_LANGUAGES[i] = lang; + i++; + } + } + } + + private static final Language[] BUILTIN_LANGUAGES = LANGUAGES; + + private static final Disambiguator DEMO_DISAMBIGUATOR = new DemoDisambiguator(); + private static final Tagger DEMO_TAGGER = new DemoTagger(); + private static final SentenceTokenizer SENTENCE_TOKENIZER = new SentenceTokenizer(); + private static final WordTokenizer WORD_TOKENIZER = new WordTokenizer(); + private static final Unifier MATCH_UNIFIER = new Unifier(); + + // ------------------------------------------------------------------------- + + /** + * Get this language's two character code, e.g. <code>en</code> for English. + * @return String - language code + */ + public abstract String getShortName(); + + /** + * Get this language's name in English, e.g. <code>English</code> or <code>German</code>. + * @return String - language name + */ + public abstract String getName(); + + /** + * Get this language's variants, e.g. <code>US</code> (as in <code>en_US</code>) or + * <code>PL</code> (as in <code>pl_PL</code>). 
* @return String[] - array of country variants for the language.
   */
  public abstract String[] getCountryVariants();

  /**
   * Get this language's Java locale.
   */
  public abstract Locale getLocale();

  /**
   * Get the name(s) of the maintainer(s) for this language or <code>null</code>.
   */
  public abstract Contributor[] getMaintainers();

  /**
   * Get the IDs of the global rules that should run for texts in this language
   * or <code>null</code>.
   */
  public abstract Set<String> getRelevantRuleIDs();

  // -------------------------------------------------------------------------

  /**
   * Get the location of the rule file, built as
   * <code>&lt;rules dir&gt;/&lt;short name&gt;/&lt;pattern file&gt;</code>.
   */
  public String getRuleFileName() {
    return JLanguageTool.getDataBroker().getRulesDir() + "/" + getShortName() + "/" + JLanguageTool.PATTERN_FILE;
  }

  /**
   * Get this language's part-of-speech disambiguator implementation.
   * This base implementation returns the shared demo disambiguator.
   */
  public Disambiguator getDisambiguator() {
    return DEMO_DISAMBIGUATOR;
  }

  /**
   * Get this language's part-of-speech tagger implementation.
   * This base implementation returns the shared demo tagger.
   */
  public Tagger getTagger() {
    return DEMO_TAGGER;
  }

  /**
   * Get this language's sentence tokenizer implementation.
   * This base implementation returns a shared default tokenizer.
   */
  public SentenceTokenizer getSentenceTokenizer() {
    return SENTENCE_TOKENIZER;
  }

  /**
   * Get this language's word tokenizer implementation.
   * This base implementation returns a shared default tokenizer.
   */
  public Tokenizer getWordTokenizer() {
    return WORD_TOKENIZER;
  }

  /**
   * Get this language's part-of-speech synthesizer implementation or <code>null</code>.
   * This base implementation always returns <code>null</code>.
   */
  public Synthesizer getSynthesizer() {
    return null;
  }

  /**
   * Get this language's feature unifier.
   * @return Feature unifier for analyzed tokens.
   */
  public Unifier getUnifier() {
    return MATCH_UNIFIER;
  }

  /**
   * Get this language's feature unifier used for disambiguation.
   * Note: it might be different from the normal rule unifier.
   * @return Feature unifier for analyzed tokens.
+ */ + public Unifier getDisambiguationUnifier() { + return MATCH_UNIFIER; + } + + /** + * Get the name of the language translated to the current locale, + * if available. Otherwise, get the untranslated name. + */ + public final String getTranslatedName(final ResourceBundle messages) { + try { + return messages.getString(getShortName()); + } catch (final MissingResourceException e) { + return getName(); + } + } + + // ------------------------------------------------------------------------- + + /** + * Re-inits the built-in languages and adds the specified ones. + */ + public static void reInit(final List<Language> languages) { + LANGUAGES = new Language[BUILTIN_LANGUAGES.length + languages.size()]; + int i = BUILTIN_LANGUAGES.length; + System.arraycopy(BUILTIN_LANGUAGES, 0, + LANGUAGES, 0, BUILTIN_LANGUAGES.length); + for (final Language lang : languages) { + LANGUAGES[i++] = lang; + } + externalLanguages = languages; + } + + /** + * Return languages that are not built-in but have been added manually. + */ + public static List<Language> getExternalLanguages() { + return externalLanguages; + } + + /** + * Get the Language object for the given short language name. + * + * @param shortLanguageCode e.g. <code>en</code> or <code>de</code> + * @return a Language object or <code>null</code> + */ + public static Language getLanguageForShortName(final String shortLanguageCode) { + StringTools.assureSet(shortLanguageCode, "shortLanguageCode"); + if (shortLanguageCode.length() != "xx".length()) { + throw new IllegalArgumentException("'" + shortLanguageCode + "' isn't a two-character code"); + } + for (Language element : Language.LANGUAGES) { + if (shortLanguageCode.equals(element.getShortName())) { + return element; + } + } + return null; + } + + /** + * Get the Language object for the given language name. + * + * @param languageName e.g. 
<code>English</code> or <code>German</code> (case is significant) + * @return a Language object or <code>null</code> + */ + public static Language getLanguageForName(final String languageName) { + for (Language element : Language.LANGUAGES) { + if (languageName.equals(element.getName())) { + return element; + } + } + return null; + } + + @Override + public final String toString() { + return getName(); + } + + /** + * Get sorted info about all maintainers. + * @since 0.9.9 + * @param messages + * {{@link ResourceBundle} language bundle to translate + * the info + * @return + * A sorted list of maintainers. + */ + public static String getAllMaintainers(final ResourceBundle messages) { + final StringBuilder maintainersInfo = new StringBuilder(); + final List<String> toSort = new ArrayList<String>(); + for (final Language lang : Language.LANGUAGES) { + if (lang != Language.DEMO) { + if (lang.getMaintainers() != null) { + final List<String> names = new ArrayList<String>(); + for (Contributor contributor : lang.getMaintainers()) { + names.add(contributor.getName()); + } + toSort.add(messages.getString(lang.getShortName()) + + ": " + StringTools.listToString(names, ", ")); + } + } + } + Collections.sort(toSort); + for (final String lElem : toSort) { + maintainersInfo.append(lElem); + maintainersInfo.append('\n'); + } + return maintainersInfo.toString(); + } +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/Main.java b/JLanguageTool/src/java/de/danielnaber/languagetool/Main.java new file mode 100644 index 0000000..f2f2cc6 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/Main.java @@ -0,0 +1,567 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the 
License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool; + +import java.io.BufferedInputStream; +import java.io.BufferedReader; +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStreamReader; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashSet; +import java.util.List; +import java.util.Locale; +import java.util.Set; + +import javax.xml.parsers.ParserConfigurationException; + +import org.xml.sax.SAXException; + +import de.danielnaber.languagetool.bitext.StringPair; +import de.danielnaber.languagetool.bitext.TabBitextReader; +import de.danielnaber.languagetool.rules.Rule; +import de.danielnaber.languagetool.rules.bitext.BitextRule; +import de.danielnaber.languagetool.tools.StringTools; +import de.danielnaber.languagetool.tools.Tools; + +/** + * The command line tool to check plain text files. 
+ * + * @author Daniel Naber + */ +class Main { + + private JLanguageTool lt; + private final boolean verbose; + private final boolean apiFormat; + private final boolean taggerOnly; + private final boolean applySuggestions; + private boolean profileRules; + private boolean bitextMode; + private JLanguageTool srcLt; + List<BitextRule> bRules; + private Rule currentRule; + + /* maximum file size to read in a single read */ + private static final int MAX_FILE_SIZE = 64000; + + Main(final boolean verbose, final boolean taggerOnly, + final Language language, final Language motherTongue, + final String[] disabledRules, final String[] enabledRules, + final boolean apiFormat, boolean applySuggestions) throws IOException, + SAXException, ParserConfigurationException { + this.verbose = verbose; + this.apiFormat = apiFormat; + this.taggerOnly = taggerOnly; + this.applySuggestions = applySuggestions; + profileRules = false; + bitextMode = false; + srcLt = null; + bRules = null; + lt = new JLanguageTool(language, motherTongue); + lt.activateDefaultPatternRules(); + lt.activateDefaultFalseFriendRules(); + selectRules(lt, disabledRules, enabledRules); + } + + private void selectRules(final JLanguageTool lt, final String[] disabledRules, final String[] enabledRules) { + // disable rules that are disabled explicitly: + for (final String disabledRule : disabledRules) { + lt.disableRule(disabledRule); + } + // disable all rules except those enabled explicitly, if any: + if (enabledRules.length > 0) { + final Set<String> enabledRuleIDs = new HashSet<String>(Arrays + .asList(enabledRules)); + for (String ruleName : enabledRuleIDs) { + lt.enableDefaultOffRule(ruleName); + lt.enableRule(ruleName); + } + for (Rule rule : lt.getAllRules()) { + if (!enabledRuleIDs.contains(rule.getId())) { + lt.disableRule(rule.getId()); + } + } + } + } + + private void setListUnknownWords(final boolean listUnknownWords) { + lt.setListUnknownWords(listUnknownWords); + } + + private void setProfilingMode() { 
+ profileRules = true; + } + + private final void setBitextMode(final Language sourceLang, + final String[] disabledRules, final String[] enabledRules) throws IOException, ParserConfigurationException, SAXException { + bitextMode = true; + Language target = lt.getLanguage(); + lt = new JLanguageTool(target, null); + srcLt = new JLanguageTool(sourceLang); + lt.activateDefaultPatternRules(); + selectRules(lt, disabledRules, enabledRules); + selectRules(srcLt, disabledRules, enabledRules); + bRules = Tools.getBitextRules(sourceLang, lt.getLanguage()); + + List<BitextRule> bRuleList = new ArrayList<BitextRule>(bRules); + for (final BitextRule br : bRules) { + for (final String disabledRule : disabledRules) { + if (br.getId().equals(disabledRule)) { + bRuleList.remove(br); + } + } + } + bRules = bRuleList; + if (enabledRules.length > 0) { + bRuleList = new ArrayList<BitextRule>(); + for (final String enabledRule : enabledRules) { + for (final BitextRule br : bRules) { + if (br.getId().equals(enabledRule)) { + bRuleList.add(br); + } + } + } + bRules = bRuleList; + } + } + + JLanguageTool getJLanguageTool() { + return lt; + } + + private void runOnFile(final String filename, final String encoding, + final boolean listUnknownWords) throws IOException { + boolean oneTime = false; + if (!"-".equals(filename)) { + final File file = new File(filename); + // run once on file if the file size < MAXFILESIZE or + // when we use the bitext mode (we use a bitext reader + // instead of a direct file access) + oneTime = file.length() < MAX_FILE_SIZE || bitextMode; + } + if (oneTime) { + if (bitextMode) { + //TODO: add parameter to set different readers + TabBitextReader reader = new TabBitextReader(filename, encoding); + if (applySuggestions) { + Tools.correctBitext(reader, srcLt, lt, bRules); + } else { + Tools.checkBitext(reader, srcLt, lt, bRules, + apiFormat); + } + } else { + final String text = getFilteredText(filename, encoding); + if (applySuggestions) { + 
System.out.print(Tools.correctText(text, lt)); + } else if (profileRules) { + Tools.profileRulesOnText(text, lt); + } else if (!taggerOnly) { + Tools.checkText(text, lt, apiFormat, 0); + } else { + Tools.tagText(text, lt); + } + if (listUnknownWords) { + System.out.println("Unknown words: " + lt.getUnknownWords()); + } + } + } else { + if (verbose) { + lt.setOutput(System.err); + } + if (!apiFormat && !applySuggestions) { + if ("-".equals(filename)) { + System.out.println("Working on STDIN..."); + } else { + System.out.println("Working on " + filename + "..."); + } + } + int runCount = 1; + final List<Rule> rules = lt.getAllRules(); + if (profileRules) { + System.out.printf("Testing %d rules\n", rules.size()); + System.out.println("Rule ID\tTime\tSentences\tMatches\tSentences per sec."); + runCount = rules.size(); + } + InputStreamReader isr = null; + BufferedReader br = null; + int lineOffset = 0; + int tmpLineOffset = 0; + final List<String> unknownWords = new ArrayList<String>(); + StringBuilder sb = new StringBuilder(); + for (int ruleIndex = 0; ruleIndex <runCount; ruleIndex++) { + currentRule = rules.get(ruleIndex); + int matches = 0; + long sentences = 0; + final long startTime = System.currentTimeMillis(); + try { + if (!"-".equals(filename)) { + final File file = new File(filename); + if (encoding != null) { + isr = new InputStreamReader(new BufferedInputStream( + new FileInputStream(file.getAbsolutePath())), encoding); + } else { + isr = new InputStreamReader(new BufferedInputStream( + new FileInputStream(file.getAbsolutePath()))); + } + } else { + if (encoding != null) { + isr = new InputStreamReader(new BufferedInputStream(System.in), + encoding); + } else { + isr = new InputStreamReader(new BufferedInputStream(System.in)); + } + } + br = new BufferedReader(isr); + String line; + while ((line = br.readLine()) != null) { + sb.append(line); + sb.append('\n'); + tmpLineOffset++; + if (lt.getLanguage().getSentenceTokenizer() + .singleLineBreaksMarksPara()) 
{ + matches = handleLine(matches, lineOffset, sb); + sentences += lt.getSentenceCount(); + if (profileRules) { + sentences += lt.sentenceTokenize(sb.toString()).size(); + } + if (listUnknownWords && !taggerOnly) { + for (String word : lt.getUnknownWords()) + if (!unknownWords.contains(word)) { + unknownWords.add(word); + } + } + sb = new StringBuilder(); + lineOffset = tmpLineOffset; + } else { + if ("".equals(line) || sb.length() >= MAX_FILE_SIZE) { + matches = handleLine(matches, lineOffset, sb); + sentences += lt.getSentenceCount(); + if (profileRules) { + sentences += lt.sentenceTokenize(sb.toString()).size(); + } + if (listUnknownWords && !taggerOnly) { + for (String word : lt.getUnknownWords()) + if (!unknownWords.contains(word)) { + unknownWords.add(word); + } + } + sb = new StringBuilder(); + lineOffset = tmpLineOffset; + } + } + } + } finally { + + if (sb.length() > 0) { + matches = handleLine(matches, tmpLineOffset - 1, sb); + sentences += lt.getSentenceCount(); + if (profileRules) { + sentences += lt.sentenceTokenize(sb.toString()).size(); + } + if (listUnknownWords && !taggerOnly) { + for (String word : lt.getUnknownWords()) + if (!unknownWords.contains(word)) { + unknownWords.add(word); + } + } + } + + printTimingInformation(listUnknownWords, rules, unknownWords, ruleIndex, matches, sentences, startTime); + + if (br != null) { + br.close(); + } + if (isr != null) { + isr.close(); + } + } + } + } + } + + private void printTimingInformation(final boolean listUnknownWords, final List<Rule> rules, + final List<String> unknownWords, final int ruleIndex, final int matches, final long sentences, final long startTime) { + if (!applySuggestions) { + final long endTime = System.currentTimeMillis(); + final long time = endTime - startTime; + final float timeInSeconds = time / 1000.0f; + final float sentencesPerSecond = sentences / timeInSeconds; + if (apiFormat) { + System.out.println("<!--"); + } + if (profileRules) { + //TODO: run 10 times, line in runOnce 
mode, and use median + System.out.printf(Locale.ENGLISH, + "%s\t%d\t%d\t%d\t%.1f", rules.get(ruleIndex).getId(), + time, sentences, matches, sentencesPerSecond); + System.out.println(); + } else { + System.out.printf(Locale.ENGLISH, + "Time: %dms for %d sentences (%.1f sentences/sec)", time, + sentences, sentencesPerSecond); + System.out.println(); + } + if (listUnknownWords) { + Collections.sort(unknownWords); + System.out.println("Unknown words: " + unknownWords); + } + if (apiFormat) { + System.out.println("-->"); + } + } + } + + private int handleLine(final int matchNo, final int lineOffset, + final StringBuilder sb) throws IOException { + int matches = matchNo; + if (applySuggestions) { + System.out.print(Tools.correctText(StringTools.filterXML(sb.toString()), + lt)); + } else if (profileRules) { + matches += Tools.profileRulesOnLine(StringTools.filterXML(sb.toString()), + lt, currentRule); + } else if (!taggerOnly) { + if (matches == 0) { + matches += Tools.checkText(StringTools.filterXML(sb.toString()), lt, + apiFormat, -1, lineOffset, matches, + StringTools.XmlPrintMode.START_XML); + } else { + matches += Tools.checkText(StringTools.filterXML(sb.toString()), lt, + apiFormat, -1, lineOffset, matches, + StringTools.XmlPrintMode.CONTINUE_XML); + } + } else { + Tools.tagText(StringTools.filterXML(sb.toString()), lt); + } + return matches; + } + + private void runRecursive(final String filename, final String encoding, + final boolean listUnknown) throws IOException, + ParserConfigurationException, SAXException { + final File dir = new File(filename); + if (!dir.isDirectory()) { + throw new IllegalArgumentException(dir.getAbsolutePath() + + " is not a directory, cannot use recursion"); + } + final File[] files = dir.listFiles(); + for (final File file : files) { + if (file.isDirectory()) { + runRecursive(file.getAbsolutePath(), encoding, listUnknown); + } else { + runOnFile(file.getAbsolutePath(), encoding, listUnknown); + } + } + } + + /** + * Loads filename and 
filters out XML. Note that the XML + * filtering can lead to incorrect positions in the list of matching rules. + * + * @param filename + * @throws IOException + */ + private String getFilteredText(final String filename, final String encoding) + throws IOException { + if (verbose) { + lt.setOutput(System.err); + } + if (!apiFormat && !applySuggestions) { + System.out.println("Working on " + filename + "..."); + } + final String fileContents = StringTools.readFile(new FileInputStream( + filename), encoding); + return StringTools.filterXML(fileContents); + } + + private static void exitWithUsageMessage() { + System.out + .println("Usage: java de.danielnaber.languagetool.Main " + + "[-r|--recursive] [-v|--verbose] [-l|--language LANG] [-m|--mothertongue LANG] [-d|--disable RULES] " + + "[-e|--enable RULES] [-c|--encoding] [-u|--list-unknown] [-t|--taggeronly] [-b] [--api] [-a|--apply] " + + "[-b2|--bitext] <file>"); + System.exit(1); + } + + /** + * Command line tool to check plain text files. 
+ */ + public static void main(final String[] args) throws IOException, + ParserConfigurationException, SAXException { + if (args.length < 1 || args.length > 9) { + exitWithUsageMessage(); + } + boolean verbose = false; + boolean recursive = false; + boolean taggerOnly = false; + boolean singleLineBreakMarksParagraph = false; + boolean apiFormat = false; + boolean listUnknown = false; + boolean applySuggestions = false; + boolean profile = false; + boolean bitext = false; + Language language = null; + Language motherTongue = null; + String encoding = null; + String filename = null; + String[] disabledRules = new String[0]; + String[] enabledRules = new String[0]; + for (int i = 0; i < args.length; i++) { + if (args[i].equals("-h") || args[i].equals("-help") + || args[i].equals("--help") || args[i].equals("--?")) { + exitWithUsageMessage(); + } else if (args[i].equals("-v") || args[i].equals("--verbose")) { + verbose = true; + } else if (args[i].equals("-t") || args[i].equals("--taggeronly")) { + taggerOnly = true; + if (listUnknown) { + throw new IllegalArgumentException( + "You cannot list unknown words when tagging only."); + } + if (applySuggestions) { + throw new IllegalArgumentException( + "You cannot apply suggestions when tagging only."); + } + } else if (args[i].equals("-r") || args[i].equals("--recursive")) { + recursive = true; + } else if (args[i].equals("-b2") || args[i].equals("--bitext")) { + bitext = true; + } else if (args[i].equals("-d") || args[i].equals("--disable")) { + if (enabledRules.length > 0) { + throw new IllegalArgumentException( + "You cannot specify both enabled and disabled rules"); + } + final String rules = args[++i]; + disabledRules = rules.split(","); + } else if (args[i].equals("-e") || args[i].equals("--enable")) { + if (disabledRules.length > 0) { + throw new IllegalArgumentException( + "You cannot specify both enabled and disabled rules"); + } + final String rules = args[++i]; + enabledRules = rules.split(","); + } else if 
(args[i].equals("-l") || args[i].equals("--language")) { + language = getLanguageOrExit(args[++i]); + } else if (args[i].equals("-m") || args[i].equals("--mothertongue")) { + motherTongue = getLanguageOrExit(args[++i]); + } else if (args[i].equals("-c") || args[i].equals("--encoding")) { + encoding = args[++i]; + } else if (args[i].equals("-u") || args[i].equals("--list-unknown")) { + listUnknown = true; + if (taggerOnly) { + throw new IllegalArgumentException( + "You cannot list unknown words when tagging only."); + } + } else if (args[i].equals("-b")) { + singleLineBreakMarksParagraph = true; + } else if (args[i].equals("--api")) { + apiFormat = true; + if (applySuggestions) { + throw new IllegalArgumentException( + "API format makes no sense for automatic application of suggestions."); + } + } else if (args[i].equals("-a") || args[i].equals("--apply")) { + applySuggestions = true; + if (taggerOnly) { + throw new IllegalArgumentException( + "You cannot apply suggestions when tagging only."); + } + if (apiFormat) { + throw new IllegalArgumentException( + "API format makes no sense for automatic application of suggestions."); + } + } else if (args[i].equals("-p") || args[i].equals("--profile")) { + profile = true; + if (apiFormat) { + throw new IllegalArgumentException( + "API format makes no sense for profiling."); + } + if (applySuggestions) { + throw new IllegalArgumentException( + "Applying suggestions makes no sense for profiling."); + } + if (taggerOnly) { + throw new IllegalArgumentException( + "Tagging makes no sense for profiling."); + } + } else if (i == args.length - 1) { + filename = args[i]; + } else { + System.err.println("Unknown option: " + args[i]); + exitWithUsageMessage(); + } + } + if (filename == null) { + filename = "-"; + } + if (language == null) { + if (!apiFormat) { + System.err.println("No language specified, using English"); + } + language = Language.ENGLISH; + } else if (!apiFormat && !applySuggestions) { + System.out.println("Expected 
text language: " + language.getName()); + } + language.getSentenceTokenizer().setSingleLineBreaksMarksParagraph( + singleLineBreakMarksParagraph); + final Main prg = new Main(verbose, taggerOnly, language, motherTongue, + disabledRules, enabledRules, apiFormat, applySuggestions); + prg.setListUnknownWords(listUnknown); + if (profile) { + prg.setProfilingMode(); + } + if (bitext) { + if (motherTongue == null) { + throw new IllegalArgumentException( + "You have to set the source language (as mother tongue)."); + } + prg.setBitextMode(motherTongue, disabledRules, enabledRules); + } + if (recursive) { + prg.runRecursive(filename, encoding, listUnknown); + } else { + prg.runOnFile(filename, encoding, listUnknown); + } + } + + private static Language getLanguageOrExit(final String lang) { + Language language = null; + boolean foundLanguage = false; + final List<String> supportedLanguages = new ArrayList<String>(); + for (final Language tmpLang : Language.LANGUAGES) { + supportedLanguages.add(tmpLang.getShortName()); + if (lang.equals(tmpLang.getShortName())) { + language = tmpLang; + foundLanguage = true; + break; + } + } + if (!foundLanguage) { + System.out.println("Unknown language '" + lang + + "'. 
Supported languages are: " + supportedLanguages); + exitWithUsageMessage(); + } + return language; + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/MessagesBundle.properties b/JLanguageTool/src/java/de/danielnaber/languagetool/MessagesBundle.properties new file mode 100644 index 0000000..921a510 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/MessagesBundle.properties @@ -0,0 +1,186 @@ +# English translation of LanguageTool +# Copyright (C) 2006 Daniel Naber (http://www.danielnaber.de), Marcin Milkowski + +be = Belarusian + +ca = Catalan + +category_case = Capitalization + +category_false_friend = False friends + +category_grammar = Grammar + +category_misc = Miscellaneous + +category_typo = Possible Typo + +checkDone = Check done, {0} potential problems found + +checkText = &Check Text + +correctionMessage = Correction: + +cs = Czech + +da = Danish + +de = German + +desc_comma_whitespace = Use of whitespace before comma and before/after parentheses + +desc_double_punct = Use of two consecutive dots or commas + +desc_repetition = Word repetition (e.g. 'will will') + +desc_repetition_short = Word repetition + +desc_unpaired_brackets = Unpaired braces, brackets, quotation marks and similar symbols + +desc_uppercase_sentence = Checks that a sentence starts with an uppercase letter + +desc_whitespacerepetition = Whitespace repetition (bad formatting) + +double_dots_short = Two consecutive dots + +double_commas_short = Two consecutive comma + +en = English + +enterText = Please type or paste text to check in the top area + +enterText2 = Please insert text to check here + +eo = Esperanto + +errorContext = Context: + +errorMessage = Message: + +es = Spanish + +false_friend = False friend + +false_friend_desc = false friend hint for: + +false_friend_hint = Hint: "{0}" ({1}) means {2} ({3}). + +false_friend_suggestion = Did you mean {0}? 
+ +fr = French + +gl = Galician + +guiCancelButton = Cancel + +guiCheckComplete = LanguageTool check is complete. + +guiConfigWindowTitle = LanguageTool Options + +guiDemoText = This is a example input to to show you how LanguageTool works. Note, however, that it does not include a spell checka. + +guiMatchCount = Potential errors: + +guiMenuAbout = &About... + +guiMenuAddRules = Load &Rule File + +guiMenuCheckClipboard = &Check Text in Clipboard + +guiMenuFile = &File + +guiMenuHelp = &Help + +guiMenuHide = &Hide to System Tray + +guiMenuOpen = &Open... + +guiMenuOptions = Option&s... + +guiMenuQuit = &Quit + +guiMenuShowMainWindow = Open Main Window + +guiMotherTongue = Your mother tongue: + +guiNoErrorsFound = No errors or warnings found (language: {0}) + +guiNoErrorsFoundSelectedText = No errors or warnings found in selected text (language: {0}) + +guiOKButton = &OK + +guiOOoChangeButton = &Change + +guiOOoCloseButton = Close + +guiOOoIgnoreAllButton = Ignore All + +guiOOoIgnoreButton = Ignore + +guiOOoOptionsButton = Options... + +guiProgressWindowTitle = LanguageTool: Checking Text... + +guiReplaceWindowTitle = Replace text + +guiReplaceWithOtherText = <other text> + +guiRunOnPort = Run as server on po&rt + +guiSelectionCheckComplete = LanguageTool check of selected text is complete. + +incorrect_case = This sentence does not start with an uppercase letter + +is = Icelandic + +it = Italian + +lt = Lithuanian + +missing_space_after_comma = Put a space after the comma + +ml= Malayalam + +nl = Dutch + +no_space_after = Don't put a space after the opening parenthesis + +no_space_before = Don't put a space before the closing parenthesis + +no_space_before_dot = Don't put a space before the full stop + +pl = Polish + +repetition = Possible typo: you repeated a word + +result1 = <br><b> {0}. 
Line {1}, column {2}</b><br> + +resultAreaText = Results will appear here + +resultTime = <br>Time: {0}ms (including {1}ms for rule matching)<br> + +ru = Russian + +sk = Slovak + +sl = Slovenian + +space_after_comma = Put a space after the comma, but not before the comma + +startChecking = Starting check in {0} + +sv = Swedish + +textLanguage = Text Language: + +two_commas = Two consecutive commas + +two_dots = Two consecutive dots + +uk = Ukrainian + +unpaired_brackets = Unpaired bracket or similar symbol + +whitespace_repetition = Possible typo: you repeated a whitespace + +ro = Romanian
\ No newline at end of file diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/MessagesBundle_be.properties b/JLanguageTool/src/java/de/danielnaber/languagetool/MessagesBundle_be.properties new file mode 100644 index 0000000..9a7212b --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/MessagesBundle_be.properties @@ -0,0 +1,93 @@ +#X-Generator: crowdin.net +eo=Esperanto +be=\u0411\u0435\u043b\u0430\u0440\u0443\u0441\u043a\u0430\u044f +ca=\u041a\u0430\u0442\u0430\u043b\u043e\u043d\u0441\u043a\u0430\u044f +category_case=\u0412\u044f\u043b\u0456\u043a\u0456\u044f \u043b\u0456\u0442\u0430\u0440\u044b +category_false_friend=\u0410\u043c\u043e\u043d\u0456\u043c\u044b +category_grammar=\u0413\u0440\u0430\u043c\u0430\u0442\u044b\u043a\u0430 +category_misc=\u0420\u043e\u0437\u043d\u0430\u0435 +category_typo=\u041c\u0430\u0433\u0447\u044b\u043c\u044b\u044f \u043f\u0430\u043c\u044b\u043b\u043a\u0456 \u043d\u0430\u0431\u043e\u0440\u0443 +checkDone=\u0421\u043f\u0440\u0430\u045e\u0434\u0436\u0430\u043d\u043d\u0435 \u0437\u0430\u0432\u0435\u0440\u0448\u0430\u043d\u0430, \u0437\u043d\u043e\u0439\u0434\u0437\u0435\u043d\u0430 {0} \u043c\u0430\u0433\u0447\u044b\u043c\u044b\u0445 \u043f\u0430\u043c\u044b\u043b\u0430\u043a +checkText=\u0421\u043f\u0440\u0430\u045e\u0434\u0437\u0456\u0446\u044c \u0442\u044d\u043a\u0441\u0442 +correctionMessage=\u0412\u044b\u043f\u0440\u0430\u045e\u043b\u0435\u043d\u043d\u0456\: +cs=\u0427\u044d\u0448\u0441\u043a\u0430\u044f +da=\u0414\u0430\u0446\u043a\u0430\u044f +de=\u041d\u044f\u043c\u0435\u0446\u043a\u0430\u044f +desc_comma_whitespace=\u0412\u044b\u043a\u0430\u0440\u044b\u0441\u0442\u0430\u043d\u043d\u0435 \u043f\u0440\u0430\u0431\u0435\u043b\u0443 \u043f\u0435\u0440\u0430\u0434 \u043a\u043e\u0441\u043a\u0430\u0439 \u0446\u0456 \u0434\u0430 \u0456 \u043f\u0430\u0441\u043b\u044f \u0434\u0443\u0436\u043a\u0456 +desc_double_punct=\u0412\u044b\u043a\u0430\u0440\u044b\u0441\u0442\u0430\u043d\u043d\u0435 
\u0434\u0432\u0443\u0445 \u043f\u0430\u0441\u043b\u044f\u0434\u043e\u045e\u043d\u044b\u0445 \u043a\u0440\u043e\u043f\u0430\u043a \u0456 \u043a\u043e\u0441\u043a\u0430\u0445 +desc_repetition=\u041f\u0430\u045e\u0442\u0430\u0440\u044d\u043d\u043d\u0435 \u0441\u043b\u043e\u0432\u0430\u045e (\u043d\u0430\u043f\u0440\u044b\u043a\u043b\u0430\u0434, "\u0431\u0443\u0434\u0443 \u0431\u0443\u0434\u0443") +desc_repetition_short=\u041f\u0430\u045e\u0442\u0430\u0440\u044d\u043d\u043d\u0435 \u0441\u043b\u043e\u0432\u0430\u045e +desc_unpaired_brackets=\u041d\u044f\u043f\u0430\u0440\u043d\u044b\u044f \u0434\u0443\u0436\u043a\u0456, \u0434\u0432\u0443\u043a\u043e\u0441\u0441\u0456 \u0446\u0456 \u043f\u0430\u0434\u043e\u0431\u043d\u044b\u044f \u0441\u0456\u043c\u0432\u0430\u043b\u044b +desc_uppercase_sentence=\u041f\u0440\u0430\u0432\u044f\u0440\u0430\u0435, \u0448\u0442\u043e \u0441\u043a\u0430\u0437 \u043f\u0430\u0447\u044b\u043d\u0430\u0435\u0446\u0446\u0430 \u0437 \u0432\u044f\u043b\u0456\u043a\u0430\u0439 \u043b\u0456\u0442\u0430\u0440\u044b +desc_whitespacerepetition=\u041f\u0430\u045e\u0442\u0430\u0440\u044d\u043d\u043d\u0435 \u043f\u0440\u0430\u0431\u0435\u043b\u0430\u045e (\u0434\u0440\u044d\u043d\u043d\u0430\u0435 \u0444\u0430\u0440\u043c\u0430\u0442\u0430\u0432\u0430\u043d\u043d\u0435) +double_dots_short=\u0414\u0437\u0432\u0435 \u043f\u0430\u0441\u043b\u044f\u0434\u043e\u045e\u043d\u044b\u044f \u043a\u0440\u043e\u043f\u043a\u0456 +double_commas_short=\u0414\u0437\u0432\u0435 \u043f\u0430\u0441\u043b\u044f\u0434\u043e\u045e\u043d\u044b\u044f \u043a\u043e\u0441\u043a\u0456 +en=\u0410\u043d\u0433\u043b\u0456\u0439\u0441\u043a\u0430\u044f +enterText=\u041a\u0430\u0431 \u0441\u043f\u0440\u0430\u045e\u0434\u0437\u0456\u0446\u044c, \u043d\u0430\u0431\u044f\u0440\u044b\u0446\u0435 \u0446\u0456 \u045e\u0441\u0442\u0430\u045e\u0446\u0435 \u0442\u044d\u043a\u0441\u0442 \u0443\u0432\u0435\u0440\u0441\u0435 +enterText2=\u041a\u0430\u043b\u0456 \u043b\u0430\u0441\u043a\u0430, 
\u045e\u0441\u0442\u0430\u045e\u0446\u0435 \u0442\u044d\u043a\u0441\u0442 \u043a\u0430\u0431 \u0441\u043f\u0440\u0430\u045e\u0434\u0437\u0456\u0446\u044c +errorContext=\u041a\u0430\u043d\u0442\u044d\u043a\u0441\u0442\: +errorMessage=\u041f\u0430\u0432\u0435\u0434\u0430\u043c\u043b\u0435\u043d\u043d\u0435\: +es=\u0406\u0441\u043f\u0430\u043d\u0441\u043a\u0430\u044f +false_friend=\u0410\u043c\u043e\u043d\u0456\u043c +false_friend_desc=\u043f\u0430\u0434\u043a\u0430\u0437\u043a\u0430 \u0430\u043c\u043e\u043d\u0456\u043c\u0430 \u0434\u043b\u044f\: +false_friend_hint=\u041f\u0430\u0434\u043a\u0430\u0437\u043a\u0430\: "{0}" ({1}) \u0430\u0437\u043d\u0430\u0447\u0430\u0435 {2} ({3}). +false_friend_suggestion=\u0412\u044b \u043c\u0435\u043b\u0456 \u043d\u0430 \u045e\u0432\u0430\u0437\u0435 {0} ? +fr=\u0424\u0440\u0430\u043d\u0446\u0443\u0437\u0441\u043a\u0430\u044f +gl=\u0413\u0430\u043b\u0456\u0441\u0456\u0439\u0441\u043a\u0430\u044f +guiCancelButton=\u0421\u043a\u0430\u0441\u0430\u0432\u0430\u0446\u044c +guiCheckComplete=LanguageTool \u0437\u0430\u0432\u044f\u0440\u0448\u044b\u045e \u0441\u043f\u0440\u0430\u045e\u0434\u0436\u0432\u0430\u043d\u043d\u0435. +guiConfigWindowTitle=\u041d\u0430\u0441\u0442\u0430\u045e\u043b\u0435\u043d\u043d\u0456 LanguageTool +guiDemoText=\u0413\u044d\u0442\u0430 \u043f\u0440\u044b\u043a\u043b\u0430\u0434 \u0442\u044d\u043a\u0441\u0442\u0443 \u043a\u0430\u0431 \u043f\u0430\u043a\u0430\u0437\u0430\u0446\u044c \u0432\u0430\u043c, \u044f\u043a \u043f\u0440\u0430\u0446\u0443\u0435 LanguageTool. \u041c\u0430\u0439\u0446\u0435 \u043d\u0430 \u045e\u0432\u0430\u0437\u0435, \u0430\u0434\u043d\u0430\u043a, \u0448\u0442\u043e \u044f\u043d\u043e \u043d\u0435 \u045e\u043a\u043b\u044e\u0447\u0430\u0435 \u0441\u043f\u0440\u0430\u045e\u0434\u0436\u0432\u0430\u043d\u043d\u0435 \u0430\u0440\u0444\u0430\u0433\u0440\u0430\u0444\u0456\u0456. 
+guiMatchCount=\u041c\u0430\u0433\u0447\u044b\u043c\u044b\u044f \u043f\u0430\u043c\u044b\u043b\u043a\u0456\: +guiMenuAbout=\u0410\u0431 \u043f\u0440\u0430\u0433\u0440\u0430\u043c\u0435 ... +guiMenuAddRules=\u0417\u0430\u0433\u0440\u0443\u0437\u0456\u0446\u044c \u0444\u0430\u0439\u043b \u043f\u0440\u0430\u0432\u0456\u043b\u0430\u045e +guiMenuCheckClipboard=\u0421\u043f\u0440\u0430\u045e\u0434\u0437\u0456\u0446\u044c \u0442\u044d\u043a\u0441\u0442 \u0443 \u0430\u0431\u043c\u0435\u043d\u043d\u0456\u043a\u0443 +guiMenuFile=\u0424\u0430\u0439\u043b +guiMenuHelp=\u0414\u0430\u0432\u0435\u0434\u043a\u0430 +guiMenuHide=\u0421\u0445\u0430\u0432\u0430\u0446\u044c \u045e \u0441\u0456\u0441\u0442\u044d\u043c\u043d\u044b \u0442\u0440\u044d\u0439 +guiMenuOpen=\u0410\u0434\u043a\u0440\u044b\u0446\u044c +guiMenuOptions=\u041d\u0430\u0441\u0442\u0430\u045e\u043b\u0435\u043d\u043d\u0456 +guiMenuQuit=\u0412\u044b\u0439\u0441\u0446\u0456 +guiMenuShowMainWindow=\u0410\u0434\u043a\u0440\u044b\u0446\u044c \u0433\u0430\u043b\u043e\u045e\u043d\u0430\u0435 \u0430\u043a\u043d\u043e +guiMotherTongue=\u0412\u0430\u0448\u0430 \u0440\u043e\u0434\u043d\u0430\u044f \u043c\u043e\u0432\u0430\: +guiNoErrorsFound=\u041d\u044f \u0437\u043d\u043e\u0439\u0434\u0437\u0435\u043d\u0430 \u043f\u0430\u043c\u044b\u043b\u0430\u043a \u0430\u0431\u043e \u043f\u0430\u043f\u044f\u0440\u044d\u0434\u0436\u0430\u043d\u043d\u044f\u045e (\u043c\u043e\u0432\u0430\: {0}) +guiNoErrorsFoundSelectedText=\u041d\u044f \u0437\u043d\u043e\u0439\u0434\u0437\u0435\u043d\u0430 \u043f\u0430\u043c\u044b\u043b\u0430\u043a \u0430\u0431\u043e \u043f\u0430\u043f\u044f\u0440\u044d\u0434\u0436\u0430\u043d\u043d\u044f\u045e \u0443 \u0430\u0431\u0440\u0430\u043d\u044b\u043c \u0442\u044d\u043a\u0441\u0446\u0435 (\u043c\u043e\u0432\u0430\: {0}) +guiOKButton=&OK +guiOOoChangeButton=\u0417\u043c\u044f\u043d\u0456\u0446\u044c +guiOOoCloseButton=\u0417\u0430\u043a\u0440\u044b\u0446\u044c 
+guiOOoIgnoreAllButton=\u0406\u0433\u043d\u0430\u0440\u0430\u0432\u0430\u0446\u044c \u0443\u0441\u0435 +guiOOoIgnoreButton=\u0406\u0433\u043d\u0430\u0440\u0430\u0432\u0430\u0446\u044c +guiOOoOptionsButton=\u041d\u0430\u0441\u0442\u0430\u045e\u043b\u0435\u043d\u043d\u0456... +guiProgressWindowTitle=LanguageTool\: \u0441\u043f\u0440\u0430\u045e\u0434\u0436\u0432\u0430\u043d\u043d\u0435 \u0442\u044d\u043a\u0441\u0442\u0443 ... +guiReplaceWindowTitle=\u0417\u0430\u043c\u044f\u043d\u0456\u0446\u044c \u0442\u044d\u043a\u0441\u0442\u0443 +guiReplaceWithOtherText=<\u0456\u043d\u0448\u044b \u0442\u044d\u043a\u0441\u0442> +guiRunOnPort=\u0417\u0430\u043f\u0443\u0441\u0446\u0456\u0446\u044c \u044f\u043a \u0441\u044d\u0440\u0432\u0435\u0440 \u043d\u0430 \u043f\u043e\u0440\u0446\u0435 +guiSelectionCheckComplete=\u0421\u043f\u0440\u0430\u045e\u0434\u0436\u0432\u0430\u043d\u043d\u0435 LanguageTool \u0430\u0431\u0440\u0430\u043d\u0430\u0433\u0430 \u0442\u044d\u043a\u0441\u0442\u0443 \u0437\u0430\u0432\u0435\u0440\u0448\u0430\u043d\u0430. 
+incorrect_case=\u0413\u044d\u0442\u044b \u0441\u043a\u0430\u0437 \u043d\u0435 \u043f\u0430\u0447\u044b\u043d\u0430\u0435\u0446\u0446\u0430 \u0437 \u0432\u044f\u043b\u0456\u043a\u0430\u0439 \u043b\u0456\u0442\u0430\u0440\u044b +is=\u0406\u0441\u043b\u0430\u043d\u0434\u0441\u043a\u0430\u044f +it=\u0406\u0442\u0430\u043b\u044c\u044f\u043d\u0441\u043a\u0430\u044f +lt=\u041b\u0456\u0442\u043e\u045e\u0441\u043a\u0430\u044f +missing_space_after_comma=\u041f\u0430\u0441\u0442\u0430\u0432\u0456\u0446\u044c \u043f\u0440\u0430\u0431\u0435\u043b \u043f\u0430\u0441\u043b\u044f \u043a\u043e\u0441\u043a\u0456 +ml=\u041c\u0430\u043b\u0430\u044f\u043b\u0430\u043c\u0441\u043a\u0430\u044f +nl=\u0413\u0430\u043b\u0430\u043d\u0434\u0441\u043a\u0430\u044f +no_space_after=\u041d\u0435 \u0441\u0442\u0430\u045e\u0446\u0435 \u043f\u0440\u0430\u0431\u0435\u043b \u043f\u0430\u0441\u043b\u044f \u043b\u0435\u0432\u0430\u0439 \u0434\u0443\u0436\u043a\u0456 +no_space_before=\u041d\u0435 \u0441\u0442\u0430\u045e\u0446\u0435 \u043f\u0440\u0430\u0431\u0435\u043b \u043f\u0435\u0440\u0430\u0434 \u043f\u0440\u0430\u0432\u0430\u0439 \u0434\u0443\u0436\u043a\u0456 +no_space_before_dot=\u041d\u0435 \u043f\u0430\u043a\u0456\u0434\u0430\u0439\u0446\u0435 \u043f\u0440\u0430\u0431\u0435\u043b \u043f\u0435\u0440\u0430\u0434 \u043a\u0440\u043e\u043f\u043a\u0430\u0439 +pl=\u041f\u043e\u043b\u044c\u0441\u043a\u0430\u044f +repetition=\u041c\u0430\u0433\u0447\u044b\u043c\u0430\u044f \u043f\u0430\u043c\u044b\u043b\u043a\u0430 \u043d\u0430\u0431\u043e\u0440\u0443\: \u0432\u044b \u043f\u0430\u045e\u0442\u0430\u0440\u044b\u043b\u0456 \u0441\u043b\u043e\u0432\u0430 +result1=<br><b> {0}. 
\u0420\u0430\u0434\u043e\u043a {1}, \u043a\u0430\u043b\u043e\u043d\u043a\u0430 {2}</b><br> +resultAreaText=\u0412\u044b\u043d\u0456\u043a\u0456 \u0437'\u044f\u0432\u044f\u0446\u0446\u0430 \u0442\u0443\u0442 +resultTime=<br>\u0427\u0430\u0441\: {0}\u043c\u0441 (\u0443\u043a\u043b\u044e\u0447\u0430\u044f {1}\u043c\u0441 \u043d\u0430 \u0441\u043f\u0440\u0430\u045e\u0434\u0436\u0432\u0430\u043d\u043d\u0435 \u043f\u0440\u0430\u0432\u0456\u043b\u0430\u045e)<br> +ru=\u0420\u0443\u0441\u043a\u0430\u044f +sk=\u0421\u043b\u0430\u0432\u0430\u0446\u043a\u0430\u044f +sl=\u0421\u043b\u0430\u0432\u0435\u043d\u0441\u043a\u0430\u044f +space_after_comma=\u041f\u0430\u0441\u0442\u0430\u045e\u0446\u0435 \u043f\u0440\u0430\u0431\u0435\u043b \u043f\u0430\u0441\u043b\u044f \u043a\u043e\u0441\u043a\u0456, \u0430\u043b\u0435 \u043d\u0435 \u043f\u0435\u0440\u0430\u0434 \u043a\u043e\u0441\u043a\u0430\u0439 +startChecking=\u0421\u043f\u0440\u0430\u045e\u0434\u0436\u0432\u0430\u043d\u043d\u0435 \u043d\u0430 \u043c\u043e\u0432\u0435\: {0} +sv=\u0428\u0432\u0435\u0434\u0441\u043a\u0430\u044f +textLanguage=\u041c\u043e\u0432\u0430 \u0442\u044d\u043a\u0441\u0442\u0443\: +two_commas=\u0414\u0437\u0432\u0435 \u043f\u0430\u0441\u043b\u044f\u0434\u043e\u045e\u043d\u044b\u044f \u043a\u043e\u0441\u043a\u0456 +two_dots=\u0414\u0437\u0432\u0435 \u043f\u0430\u0441\u043b\u044f\u0434\u043e\u045e\u043d\u044b\u044f \u043a\u0440\u043e\u043f\u043a\u0456 +uk=\u0423\u043a\u0440\u0430\u0456\u043d\u0441\u043a\u0430\u044f +unpaired_brackets=\u041d\u044f\u043f\u0430\u0440\u043d\u044b\u044f \u0441\u043a\u043e\u0431\u043a\u0456 \u0430\u0431\u043e \u043f\u0430\u0434\u043e\u0431\u043d\u044b\u044f \u0441\u0456\u043c\u0432\u0430\u043b\u044b +whitespace_repetition=\u041c\u0430\u0433\u0447\u044b\u043c\u0430\u044f \u043f\u0430\u043c\u044b\u043b\u043a\u0430 \u043d\u0430\u0431\u043e\u0440\u0443\: \u0432\u044b \u043f\u0430\u045e\u0442\u0430\u0440\u044b\u043b\u0456 \u043f\u0440\u0430\u0431\u0435\u043b 
+ro=\u0420\u0443\u043c\u044b\u043d\u0441\u043a\u0430\u044f diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/MessagesBundle_ca.properties b/JLanguageTool/src/java/de/danielnaber/languagetool/MessagesBundle_ca.properties new file mode 100644 index 0000000..ef81842 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/MessagesBundle_ca.properties @@ -0,0 +1,93 @@ +#X-Generator: crowdin.net +eo=Esperanto +be=Belarusian +ca=Catalan +category_case=Capitalization +category_false_friend=False friends +category_grammar=Grammar +category_misc=Miscellaneous +category_typo=Possible Typo +checkDone=Check done, {0} potential problems found +checkText=&Check Text +correctionMessage=Correction\: +cs=Czech +da=Danish +de=German +desc_comma_whitespace=Use of whitespace before comma and before/after parentheses +desc_double_punct=Use of two consecutive dots or commas +desc_repetition=Word repetition (e.g. 'will will') +desc_repetition_short=Word repetition +desc_unpaired_brackets=Unpaired braces, brackets, quotation marks and similar symbols +desc_uppercase_sentence=Checks that a sentence starts with an uppercase letter +desc_whitespacerepetition=Whitespace repetition (bad formatting) +double_dots_short=Two consecutive dots +double_commas_short=Two consecutive comma +en=English +enterText=Please type or paste text to check in the top area +enterText2=Please insert text to check here +errorContext=Context\: +errorMessage=Message\: +es=Spanish +false_friend=False friend +false_friend_desc=false friend hint for\: +false_friend_hint=Hint\: "{0}" ({1}) means {2} ({3}). +false_friend_suggestion=Did you mean {0}? +fr=French +gl=Galician +guiCancelButton=Cancel +guiCheckComplete=LanguageTool check is complete. +guiConfigWindowTitle=LanguageTool Options +guiDemoText=This is a example input to to show you how LanguageTool works. Note, however, that it does not include a spell checka. +guiMatchCount=Potential errors\: +guiMenuAbout=&About... 
+guiMenuAddRules=Load &Rule File +guiMenuCheckClipboard=&Check Text in Clipboard +guiMenuFile=&File +guiMenuHelp=&Help +guiMenuHide=&Hide to System Tray +guiMenuOpen=&Open... +guiMenuOptions=Option&s... +guiMenuQuit=&Quit +guiMenuShowMainWindow=Open Main Window +guiMotherTongue=Your mother tongue\: +guiNoErrorsFound=No errors or warnings found (language\: {0}) +guiNoErrorsFoundSelectedText=No errors or warnings found in selected text (language\: {0}) +guiOKButton=&OK +guiOOoChangeButton=&Change +guiOOoCloseButton=Close +guiOOoIgnoreAllButton=Ignore All +guiOOoIgnoreButton=Ignore +guiOOoOptionsButton=Options... +guiProgressWindowTitle=LanguageTool\: Checking Text... +guiReplaceWindowTitle=Replace text +guiReplaceWithOtherText=<other text> +guiRunOnPort=Run as server on po&rt +guiSelectionCheckComplete=LanguageTool check of selected text is complete. +incorrect_case=This sentence does not start with an uppercase letter +is=Icelandic +it=Italian +lt=Lithuanian +missing_space_after_comma=Put a space after the comma +ml=Malayalam +nl=Dutch +no_space_after=Don't put a space after the opening parenthesis +no_space_before=Don't put a space before the closing parenthesis +no_space_before_dot=Don't put a space before the full stop +pl=Polish +repetition=Possible typo\: you repeated a word +result1=<br><b> {0}. 
Line {1}, column {2}</b><br> +resultAreaText=Results will appear here +resultTime=<br>Time\: {0}ms (including {1}ms for rule matching)<br> +ru=Russian +sk=Slovak +sl=Slovenian +space_after_comma=Put a space after the comma, but not before the comma +startChecking=Starting check in {0} +sv=Swedish +textLanguage=Text Language\: +two_commas=Two consecutive commas +two_dots=Two consecutive dots +uk=Ukrainian +unpaired_brackets=Unpaired bracket or similar symbol +whitespace_repetition=Possible typo\: you repeated a whitespace +ro=Romanian diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/MessagesBundle_cs.properties b/JLanguageTool/src/java/de/danielnaber/languagetool/MessagesBundle_cs.properties new file mode 100644 index 0000000..6484c20 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/MessagesBundle_cs.properties @@ -0,0 +1,93 @@ +#X-Generator: crowdin.net +eo=Esperanto +be=B\u011blorusky +ca=Katal\u00e1nsky +category_case=Kapitalizace +category_false_friend=Fale\u0161n\u00ed p\u0159\u00e1tel\u00e9 +category_grammar=Gramatika +category_misc=R\u016fzn\u00e9 +category_typo=Mo\u017en\u00fd p\u0159eklep +checkDone=Kontrola ukon\u010dena, {0} nalezen\u00fdch potenci\u00e1ln\u00edch chyb. +checkText=Prov\u00e9st kontrolu +correctionMessage=Oprava\: +cs=\u010cesky +da=D\u00e1nsky +de=N\u011bmecky +desc_comma_whitespace=Pou\u017eit\u00ed mezery p\u0159ed \u010d\u00e1rkou a p\u0159ed/za z\u00e1vorkami +desc_double_punct=Pou\u017eit\u00ed dvou za sebou jdouc\u00edch \u010d\u00e1rek nebo te\u010dek +desc_repetition=Opakov\u00e1n\u00ed slov (nap\u0159. 
'bude bude') +desc_repetition_short=Opakov\u00e1n\u00ed slov +desc_unpaired_brackets=Nesp\u00e1rovan\u00e9 z\u00e1vorky, uvozovky nebo podobn\u00e9 symboly +desc_uppercase_sentence=Zkontroluje zda v\u011bta za\u010d\u00edn\u00e1 velk\u00fdm po\u010d\u00e1te\u010dn\u00edm p\u00edsmenem +desc_whitespacerepetition=Opakov\u00e1n\u00ed mezery(\u0161patn\u00e9 form\u00e1tov\u00e1n\u00ed) +double_dots_short=Dv\u011b za sebou jdouc\u00ed te\u010dky +double_commas_short=Dv\u011b za sebou jdouc\u00ed \u010d\u00e1rky +en=Anglicky +enterText=Pros\u00edm zadejte nebo vlo\u017ete text, kter\u00fd chcete zkontrolovat, do vrchn\u00ed oblasti +enterText2=Zde zadejte po\u017eadovan\u00fd text +errorContext=Kontext\: +errorMessage=Zpr\u00e1va\: +es=\u0160pan\u011blsky +false_friend=Fale\u0161n\u00fd p\u0159\u00edtel +false_friend_desc=\u0161patn\u00fd p\u0159itel, n\u00e1pov\u011bda\: +false_friend_hint=N\u00e1pov\u011bda\: "{0}" ({1}) znamen\u00e1 {2} ({3}). +false_friend_suggestion=Mysleli ste {0}? +fr=Francouzsky +gl=Galicij\u0161tinsky +guiCancelButton=Storno +guiCheckComplete=LanguageTool kontrola byla ukon\u010dena +guiConfigWindowTitle=LanguageTool nastaven\u00ed +guiDemoText=Tohle je uk\u00e1zkov\u00fd vstup, aby p\u0159edvedl jak LanguageTool funguje. Pamatujte, pros\u00edm, \u017ee neobsahuje kontrolu pravopisu (spellchecker). +guiMatchCount=Potenci\u00e1ln\u00ed chyby\: +guiMenuAbout=O Aplikaci... +guiMenuAddRules=Load Rule File +guiMenuCheckClipboard=Zkontrolovat text ve zchr\u00e1nce +guiMenuFile=Soubor +guiMenuHelp=N\u00e1pov\u011bda +guiMenuHide=Schovat do syst\u00e9move li\u0161ty +guiMenuOpen=Otev\u0159\u00edt... +guiMenuOptions=Nastaven\u00ed... 
+guiMenuQuit=Konec +guiMenuShowMainWindow=Open main window +guiMotherTongue=V\u00e1\u0161 mate\u0159sk\u00fd jazyk\: +guiNoErrorsFound=\u017d\u00e1dn\u00e9 chyby ani varov\u00e1n\u00ed nebyly nalezeny (jazyk\: {0}) +guiNoErrorsFoundSelectedText=\u017d\u00e1dn\u00e9 chyby ani varov\u00e1n\u00ed nebyly vo vyzna\u010den\u00e9m textu nalezeny (jazyk\: {0}) +guiOKButton=&OK +guiOOoChangeButton=Zam\u011bnit +guiOOoCloseButton=Zav\u0159\u00edt +guiOOoIgnoreAllButton=Ignorovat v\u0161e +guiOOoIgnoreButton=Ignorovat +guiOOoOptionsButton=Nastaven\u00ed... +guiProgressWindowTitle=LanguageTool\: Kontrola textu... +guiReplaceWindowTitle=Zam\u011bnit text +guiReplaceWithOtherText=<jin\u00fd text> +guiRunOnPort=Pob\u011b\u017e\u00ed jako server na portu +guiSelectionCheckComplete=LanguageTool kontrola ozna\u010den\u00e9ho textu je ukon\u010dena. +incorrect_case=Tato v\u011bta neza\u010d\u00edn\u00e1 z velk\u00fdm p\u00edsmenem +is=Islandsky +it=Italsky +lt=Litevsky +missing_space_after_comma=Vlo\u017eit mezeru za \u010d\u00e1rku +ml=Malaj\u00e1lamsky +nl=Nizozemsky +no_space_after=Nevkl\u00e1dat mezeru za otev\u00edrac\u00ed z\u00e1vorku +no_space_before=Nevkl\u00e1dat mezeru za uzav\u00edrac\u00ed z\u00e1vorku +no_space_before_dot=Nevkl\u00e1dejte mezeru p\u0159ed te\u010dku +pl=Polsky +repetition=Mo\u017en\u00fd p\u0159eklep\: zopakovali jste slovo +result1=<br><b> {0}. 
\u0158\u00e1dek {1}, sloupec {2}</b><br> +resultAreaText=V\u00fdsledek se zobraz\u00ed zde +resultTime=<br>\u010cas\: {0}ms (v\u010detn\u011b {1}ms na pou\u017eit\u00ed pravidel)<br> +ru=Rusky +sk=Slovensky +sl=Slovinsky +space_after_comma=Vlo\u017eit mezeru za \u010d\u00e1rku, ale ne p\u0159ed \u010d\u00e1rku +startChecking=Za\u010d\u00e1tek kontroly v {0} +sv=\u0160v\u00e9dsky +textLanguage=Jazyk textu\: +two_commas=Dv\u011b po sebe jdouc\u00ed \u010d\u00e1rky +two_dots=Dv\u011b po sebe jdouc\u00ed te\u010dky +uk=Ukrajinsky +unpaired_brackets=Nesp\u00e1rovan\u00e9 z\u00e1vorky nebo podobn\u00fd symbol +whitespace_repetition=Mo\u017en\u00fd p\u0159eklep\: zopakovali jste mezeru +ro=Rumunsky diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/MessagesBundle_da.properties b/JLanguageTool/src/java/de/danielnaber/languagetool/MessagesBundle_da.properties new file mode 100644 index 0000000..419d33d --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/MessagesBundle_da.properties @@ -0,0 +1,93 @@ +#X-Generator: crowdin.net +eo=Esperanto +be=Hviderussisk +ca=Catalansk +category_case=STORE/sm\u00e5 bogstaver +category_false_friend=Falske venner +category_grammar=Grammatik +category_misc=Diverse +category_typo=Mulig sl\u00e5fejl +checkDone=Kontrol gennemf\u00f8rt, {0} mulige problemer fundet +checkText=&Kontrollere tekst +correctionMessage=Korrektion\: +cs=Tjekkisk +da=Dansk +de=Tysk +desc_comma_whitespace=Mellemrum f\u00f8r komma og f\u00f8r/efter parenteser +desc_double_punct=To p\u00e5 hinanden f\u00f8lgende punktummer eller kommaer +desc_repetition=Ordgentagelse (f.eks. 
'den den') +desc_repetition_short=Ordgentagelse +desc_unpaired_brackets=Ikke parret parenteser, tuborgklammer, citationstegn og lignende symboler +desc_uppercase_sentence=Kontroller at s\u00e6tningen starter med et stort begyndelsesbogstav +desc_whitespacerepetition=Gentagende mellemrum (d\u00e5rlig formatering) +double_dots_short=To p\u00e5 hinanden f\u00f8lgende punktummer +double_commas_short=To p\u00e5 hinanden f\u00f8lgende kommaer +en=Engelsk +enterText=Indtast eller inds\u00e6t teksten der skal kontrolleres i det \u00f8verste felt +enterText2=Inds\u00e6t teksten der skal kontrolleres her +errorContext=Sammenh\u00e6ng\: +errorMessage=Meddelelse\: +es=Spansk +false_friend=Falske venner +false_friend_desc=Tip om falske venner\: +false_friend_hint=Tip\: "{0}" ({1}) betyder {2} ({3}). +false_friend_suggestion=Mente du {0}? +fr=Fransk +gl=Galicisk +guiCancelButton=Annuller +guiCheckComplete=LanguageTools kontrol er f\u00e6rdig. +guiConfigWindowTitle=LanguageTool Indstillinger +guiDemoText=Dette er et teksteksempel for at at vise hvordan LanguageTool virker. Bem\u00e6rk dog, at den ikke indeholder en stavekontrol. +guiMatchCount=Mulig fejl\: +guiMenuAbout=&Om LanguageTool +guiMenuAddRules=Hent &regelfil +guiMenuCheckClipboard=&Kontroller tekst i klipholderen +guiMenuFile=&Filer +guiMenuHelp=&Hj\u00e6lp +guiMenuHide=&Skjul til systembakken +guiMenuOpen=&\u00c5ben... +guiMenuOptions=&Indstillinger... +guiMenuQuit=&Afslut +guiMenuShowMainWindow=\u00c5ben hovedvinduet +guiMotherTongue=Dit modersm\u00e5l\: +guiNoErrorsFound=Ingen fejl eller advarsler fundet (sprog\: {0}) +guiNoErrorsFoundSelectedText=Ingen fejl eller advarsler fundet i den markerede tekst (sprog\: {0}) +guiOKButton=&OK +guiOOoChangeButton=&Change +guiOOoCloseButton=&Luk +guiOOoIgnoreAllButton=I&gnorer alle +guiOOoIgnoreButton=&Ignorer her +guiOOoOptionsButton=In&dstillinger... +guiProgressWindowTitle=LanguageTool\: Kontrollere tekst...
+guiReplaceWindowTitle=Erstat tekst +guiReplaceWithOtherText=<anden tekst> +guiRunOnPort=K\u00f8r som service p\u00e5 po&rt +guiSelectionCheckComplete=LanguageTools kontrol af markerede tekst er f\u00e6rdig. +incorrect_case=Denne s\u00e6tning starter ikke med et stort begyndelsesbogstav +is=Islandsk +it=Italiensk +lt=Litauisk +missing_space_after_comma=Inds\u00e6t et mellemrum efter kommaet +ml=Malayalam +nl=Hollandsk +no_space_after=Inds\u00e6t ikke et mellemrum efter parentesbegynd +no_space_before=Inds\u00e6t ikke et mellemrum f\u00f8r parentesslut +no_space_before_dot=Inds\u00e6t ikke et mellemrum f\u00f8r punktum +pl=Polsk +repetition=Mulig sl\u00e5fejl\: du har gentaget et ord +result1=<br><b> {0}. Linje {1}, kolonne {2}</b><br> +resultAreaText=Resultater vil vise sig her +resultTime=<br>Tid\: {0}ms (inklusiv {1}ms til regelafpr\u00f8vning)<br> +ru=Russisk +sk=Slovakisk +sl=Slovensk +space_after_comma=Inds\u00e6t et mellemrum efter kommaet, ikke f\u00f8r det. +startChecking=Starter kontrollen om {0} +sv=Svensk +textLanguage=Tekstens sprog\: +two_commas=To p\u00e5 hinanden f\u00f8lgende kommaer +two_dots=To p\u00e5 hinanden f\u00f8lgende punktummer +uk=Ukrainsk +unpaired_brackets=Ikke parret parenteser eller lignende symboler +whitespace_repetition=Mulig sl\u00e5fejl\: du har gentaget et mellemrum +ro=Rum\u00e6nsk diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/MessagesBundle_de.properties b/JLanguageTool/src/java/de/danielnaber/languagetool/MessagesBundle_de.properties new file mode 100644 index 0000000..ccffd28 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/MessagesBundle_de.properties @@ -0,0 +1,93 @@ +#X-Generator: crowdin.net +be=Wei\u00dfrussisch +ca=Katalanisch +category_case=Gro\u00df-/Kleinschreibung +category_false_friend=Falsche Freunde +category_grammar=Grammatik +category_misc=Sonstiges +category_typo=M\u00f6gliche Tippfehler +checkDone=Pr\u00fcfung beendet, {0} m\u00f6gliche Probleme gefunden +checkText=Text
&pr\u00fcfen +correctionMessage=Korrektur\: +cs=Tschechisch +da=D\u00e4nisch +de=Deutsch +desc_comma_whitespace=Leerzeichen vor/hinter Kommas und Klammern +desc_double_punct=Zwei aufeinander folgende Kommas oder Punkte +desc_repetition=Wortwiederholung (z.B. 'als als') +desc_repetition_short=Wortwiederholung +desc_unpaired_brackets=Unpaarige Anf\u00fchrungszeichen und Klammern +desc_uppercase_sentence=Gro\u00dfschreibung am Satzanfang +desc_whitespacerepetition=Wiederholung von Leerzeichen +double_dots_short=Zwei aufeinander folgende Punkte +double_commas_short=Zwei aufeinander folgende Kommas +en=Englisch +eo=Esperanto +enterText=Bitte Text ins obere Feld eintippen oder hineinkopieren +enterText2=Bitte Text hier eintippen oder hineinkopieren +errorContext=Text\: +errorMessage=Hinweis\: +es=Spanisch +false_friend=Falscher Freund +false_friend_desc=Falscher-Freund-Hinweis f\u00fcr\: +false_friend_hint=Hinweis\: "{0}" ({1}) bedeutet {2} ({3}). +false_friend_suggestion=Meinten Sie vielleicht {0}? +fr=Franz\u00f6sisch +gl=Galicisch +guiCancelButton=Abbrechen +guiCheckComplete=LanguageTool-Pr\u00fcfung beendet. +guiConfigWindowTitle=LanguageTool Optionen +guiDemoText=Dies ist ein Beispiel-Text, um zu zeigen zeigen, wie LanguageTool funktioniert. Wie man sieht, ist keine R\u00e4chtshreibpr\u00fcfung enthalten. +guiMatchCount=M\u00f6gliche Fehler\: +guiMenuAbout=\u00dcber... +guiMenuAddRules=Regeldatei laden... +guiMenuCheckClipboard=Text in der Zwischenablage pr\u00fcfen +guiMenuFile=Datei +guiMenuHelp=Hilfe +guiMenuHide=In den System Tray verkleinern +guiMenuOpen=\u00d6ffnen... +guiMenuOptions=Optionen... 
+guiMenuQuit=Beenden +guiMenuShowMainWindow=Hauptfenster \u00f6ffnen +guiMotherTongue=Ihre Muttersprache\: +guiNoErrorsFound=Keine Fehler und Warnungen gefunden (Textsprache\: {0}) +guiNoErrorsFoundSelectedText=Keine Fehler und Warnungen im selektierten Text gefunden (Textsprache\: {0}) +guiOKButton=&OK +guiOOoChangeButton=\u00c4ndern +guiOOoCloseButton=Schlie\u00dfen +guiOOoIgnoreAllButton=Alle ignorieren +guiOOoIgnoreButton=Ignorieren +guiOOoOptionsButton=Optionen... +guiProgressWindowTitle=LanguageTool\: Text pr\u00fcfen... +guiReplaceWindowTitle=Text ersetzen +guiReplaceWithOtherText=<anderer Text> +guiRunOnPort=Als Server laufen auf Po&rt +guiSelectionCheckComplete=LanguageTool-Pr\u00fcfung des selektierten Textes ist beendet. +incorrect_case=Dieser Satz f\u00e4ngt nicht mit einem gro\u00df geschriebenen Wort an +is=Isl\u00e4ndisch +it=Italienisch +lt=Litauisch +missing_space_after_comma=Hinter einem Komma sollte ein Leerzeichen stehen. +ml=Malayalam +nl=Niederl\u00e4ndisch +no_space_after=Hinter einer \u00f6ffnenden Klammer wird kein Leerzeichen eingef\u00fcgt. +no_space_before=Vor einer schlie\u00dfenden Klammer wird kein Leerzeichen eingef\u00fcgt. +no_space_before_dot=Vor dem Punkt sollte kein Leerzeichen stehen +pl=Polnisch +repetition=M\u00f6glicher Tippfehler\: ein Wort wird wiederholt +result1=<br><b> {0}. Zeile {1}, Spalte {2}</b><br> +resultAreaText=Hier erscheint das Ergebnis der Textpr\u00fcfung +resultTime=<br>Zeit\: {0}ms (davon {1}ms f\u00fcr Regelpr\u00fcfungen)<br> +ru=Russisch +sk=Slowakisch +sl=Slowenisch +space_after_comma=Nur hinter einem Komma steht ein Leerzeichen, aber nicht davor. +startChecking=Beginne Pr\u00fcfung in {0} +sv=Schwedisch +textLanguage=Textsprache\: +two_commas=Zwei aufeinander folgende Kommas. +two_dots=Zwei aufeinander folgende Punkte. +uk=Ukrainisch +unpaired_brackets=Es fehlt eine Klammer usw.
+whitespace_repetition=M\u00f6glicher Tippfehler\: mehr als ein Leerzeichen hintereinander +ro=Rum\u00e4nisch diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/MessagesBundle_en.properties b/JLanguageTool/src/java/de/danielnaber/languagetool/MessagesBundle_en.properties new file mode 100644 index 0000000..5133e9a --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/MessagesBundle_en.properties @@ -0,0 +1,189 @@ +#Generated by ResourceBundle Editor (http://eclipse-rbe.sourceforge.net) +# English translation of LanguageTool +# Copyright (C) 2006 Daniel Naber (http://www.danielnaber.de), Marcin Milkowski + +be = Belarusian + +ca = Catalan + +category_case = Capitalization + +category_false_friend = False friends + +category_grammar = Grammar + +category_misc = Miscellaneous + +category_typo = Possible Typo + +checkDone = Check done, {0} potential problems found + +checkText = &Check Text + +correctionMessage = Correction: + +cs = Czech + +da = Danish + +de = German + +nb = Norwegian (Bokmål) + +desc_comma_whitespace = Use of whitespace before comma and before/after parentheses + +desc_double_punct = Use of two consecutive dots or commas + +desc_repetition = Word repetition (e.g. 'will will') + +desc_repetition_short = Word repetition + +desc_unpaired_brackets = Unpaired braces, brackets, quotation marks and similar symbols + +desc_uppercase_sentence = Checks that a sentence starts with an uppercase letter + +desc_whitespacerepetition = Whitespace repetition (bad formatting) + +double_dots_short = Two consecutive dots + +double_commas_short = Two consecutive commas + +en = English + +enterText = Please type or paste text to check in the top area + +enterText2 = Please insert text to check here + +eo = Esperanto + +errorContext = Context: + +errorMessage = Message: + +es = Spanish + +false_friend = False friend + +false_friend_desc = false friend hint for: + +false_friend_hint = Hint: "{0}" ({1}) means {2} ({3}).
+ +false_friend_suggestion = Did you mean {0}? + +fr = French + +gl = Galician + +guiCancelButton = Cancel + +guiCheckComplete = LanguageTool check is complete. + +guiConfigWindowTitle = LanguageTool Options + +guiDemoText = This is a example input to to show you how LanguageTool works. Note, however, that it does not include a spell checka. + +guiMatchCount = Potential errors: + +guiMenuAbout = &About... + +guiMenuAddRules = Load &Rule File + +guiMenuCheckClipboard = &Check Text in Clipboard + +guiMenuFile = &File + +guiMenuHelp = &Help + +guiMenuHide = &Hide to System Tray + +guiMenuOpen = &Open... + +guiMenuOptions = Option&s... + +guiMenuQuit = &Quit + +guiMenuShowMainWindow = Open Main Window + +guiMotherTongue = Your mother tongue: + +guiNoErrorsFound = No errors or warnings found (language: {0}) + +guiNoErrorsFoundSelectedText = No errors or warnings found in selected text (language: {0}) + +guiOKButton = &OK + +guiOOoChangeButton = &Change + +guiOOoCloseButton = Close + +guiOOoIgnoreAllButton = Ignore All + +guiOOoIgnoreButton = Ignore + +guiOOoOptionsButton = Options... + +guiProgressWindowTitle = LanguageTool: Checking Text... + +guiReplaceWindowTitle = Replace text + +guiReplaceWithOtherText = <other text> + +guiRunOnPort = Run as server on po&rt + +guiSelectionCheckComplete = LanguageTool check of selected text is complete. + +incorrect_case = This sentence does not start with an uppercase letter + +is = Icelandic + +it = Italian + +lt = Lithuanian + +missing_space_after_comma = Put a space after the comma + +ml = Malayalam + +nl = Dutch + +no_space_after = Don't put a space after the opening parenthesis + +no_space_before = Don't put a space before the closing parenthesis + +no_space_before_dot = Don't put a space before the full stop + +pl = Polish + +repetition = Possible typo: you repeated a word + +result1 = <br><b> {0}. 
Line {1}, column {2}</b><br> + +resultAreaText = Results will appear here + +resultTime = <br>Time: {0}ms (including {1}ms for rule matching)<br> + +ru = Russian + +sk = Slovak + +sl = Slovenian + +space_after_comma = Put a space after the comma, but not before the comma + +startChecking = Starting check in {0} + +sv = Swedish + +textLanguage = Text Language: + +two_commas = Two consecutive commas + +two_dots = Two consecutive dots + +uk = Ukrainian + +unpaired_brackets = Unpaired bracket or similar symbol + +whitespace_repetition = Possible typo: you repeated a whitespace + +ro = Romanian diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/MessagesBundle_eo.properties b/JLanguageTool/src/java/de/danielnaber/languagetool/MessagesBundle_eo.properties new file mode 100644 index 0000000..f962fa7 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/MessagesBundle_eo.properties @@ -0,0 +1,198 @@ +# Esperanto translation of LanguageTool +# Copyright (C) 2010 Daniel Naber (http://www.danielnaber.de), Marcin Milkowski +# +# Vim Command to run before editing (to see accentuated letters rather +# their code): +# :v/^#/s:\\u0109:ĉ:gI |s:\\u0108:Ĉ:gI |s:\\u011d:ĝ:gI |s:\\u0135:ĵ:gI |s:\\u0125:ĥ:gI |s:\\u016d:ŭ:gI |s:\\u015c:Ŝ:gI +# +# Vim command to run before saving file: +# :v/^#/s:ĉ:\\u0109:gI |s:Ĉ:\\u0108:gI |s:ĝ:\\u011d:gI |s:ĵ:\\u0135:gI |s:ĥ:\\u0125:gI |s:ŭ:\\u016d:gI |s:Ŝ:\\u015c:gI +# +# Sendu komentojn aŭ korektojn al: +# Send comments or corrections to: Dominique Pellé <dominique.pelle@gmail.com> +# +# +be = Belorusa + +ca = Kataluna + +category_case = Uskleco + +category_false_friend = Falsaj amikoj + +category_grammar = Gramatiko + +category_misc = Diversa\u0135oj + +category_typo = Ebla misliterumo + +checkDone = Kontrolo farita, {0} eblaj problemoj trovitaj + +checkText = &Kontroli tekston + +correctionMessage = Korektado: + +cs = \u0108e\u0125a + +da = Dana + +de = Germana + +desc_comma_whitespace = Uzo de spaceton anta\u016d komo 
kaj anta\u016d/malanta\u016d krampoj + +desc_double_punct = Uzo de sinsekvaj punktoj a\u016d komoj + +desc_repetition = Ripetita vorto (ekz. 'li li') + +desc_repetition_short = Ripetita vorto + +desc_unpaired_brackets = Nekongruaj krampoj, rektaj krampoj, citiloj kaj similaj signoj + +desc_uppercase_sentence = Kontrolas, \u0109u frazo komenci\u011das per majuskla litero + +desc_whitespacerepetition = Ripetita spaceto (ne\u011dusta formato) + +double_dots_short = Du sinsekvaj punktoj + +double_commas_short = Du sinsekvaj komoj + +en = Angla + +enterText = Bonvolu tajpi a\u016d alglui kontrolendan tekston en la supra kampo + +enterText2 = Bonvolu enmeti la kontrolendajn tekstojn \u0109i tie + +eo = Esperanto + +errorContext = Kunteksto: + +errorMessage = Mesa\u011do: + +es = Hispana + +false_friend = Falsaj amikoj + +false_friend_desc = false friend hint for: + +false_friend_hint = Konsilo: "{0}" ({1}) signifas {2} ({3}). + +false_friend_suggestion = \u0108u vi intencis {0}? + +fr = Franca + +gl = Galega + +guiCancelButton = Rezigni + +guiCheckComplete = Kontrolado de LingvoIlo fini\u011dis. + +guiConfigWindowTitle = Opcioj de LingvoIlo + +# Errors are on purpose in this demo text. +guiDemoText = \u0108i tiu ekzemplo estas ekzemplo por por montri kiel funkcias LingvoIlo. Rimarku, tamen, ke \u011di ne inkluzivas litterumulon. + +guiMatchCount = Eblaj eraroj: + +guiMenuAbout = &Pri... + +guiMenuAddRules = \u015cargi dosieron de &reguloj + +guiMenuCheckClipboard = &Kontroli tekston en la tondujo + +guiMenuFile = &Dosiero + +guiMenuHelp = &Helpo + +guiMenuHide = K&a\u015di en la taskopleto + +guiMenuOpen = &Malfermi... + +guiMenuOptions = &Opcioj...
+ +guiMenuQuit = &Eliti + +guiMenuShowMainWindow = Malfermi la \u0109efan fenestron + +guiMotherTongue = Via denaska lingvo: + +guiNoErrorsFound = Neniuj eraroj a\u016d avertoj trovitaj (lingvo: {0}) + +guiNoErrorsFoundSelectedText = Neniuj eraroj a\u016d avertoj trovitaj en la apartigita teksto (lingvo: {0}) + +guiOKButton = &Bone + +guiOOoChangeButton = \u015ca&n\u011di + +guiOOoCloseButton = Fermi + +guiOOoIgnoreAllButton = Ignori \u0109iujn + +guiOOoIgnoreButton = Ignori + +guiOOoOptionsButton = Opcioj... + +guiProgressWindowTitle = LingvoIlo: kontrolado de teksto... + +guiReplaceWindowTitle = Anstata\u016digi tekston + +guiReplaceWithOtherText = <alia teksto> + +guiRunOnPort = Run as server on po&rt + +guiSelectionCheckComplete = Kontrolo per LingvoIlo de apartigita teksto fini\u011dis. + +incorrect_case = Tiu frazo ne komenci\u011das per majuskla litero + +is = Islanda + +it = Itala + +lt = Litova + +missing_space_after_comma = Enmetu spaceton post la komo + +ml= Malajala + +nl = Nederlanda + +no_space_after = Ne metu spaceton malanta\u016d malfermantaj krampoj + +no_space_before = Ne metu spaceton anta\u016d fermanta krampo + +no_space_before_dot = Ne enmetu spaceton anta\u016d punkto + +pl = Pola + +repetition = Ebla mistajpa\u0135o: vi ripetis vorton + +result1 = <br><b> {0}.
Linio {1}, kolumno {2}</b><br> + +resultAreaText = Rezultoj aperos tie + +resultTime = <br>Tempo: {0}ms (inkluzive {1}ms por rekono de reguloj)<br> + +ru = Rusa + +sk = Slovaka + +sl = Slovena + +space_after_comma = Enmeti spaceton post la komo, sed ne anta\u016d la komo + +startChecking = Ekkontroli en {0} + +sv = Sveda + +textLanguage = Lingvo de teksto: + +two_commas = Du sinsekvaj komoj + +two_dots = Du sinsekvaj punktoj + +uk = Ukraina + +unpaired_brackets = Nekongruaj krampoj a\u016d similaj simboloj + +whitespace_repetition = Ebla mistajpa\u0135o: vi ripetis spaceton + +ro = Rumana diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/MessagesBundle_es.properties b/JLanguageTool/src/java/de/danielnaber/languagetool/MessagesBundle_es.properties new file mode 100644 index 0000000..e95e2b4 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/MessagesBundle_es.properties @@ -0,0 +1,93 @@ +#X-Generator: crowdin.net +eo=Esperanto +be=Bielorruso +ca=Catal\u00e1n +category_case=May\u00fasculas y min\u00fasculas +category_false_friend=Falsos amigos +category_grammar=Gram\u00e1tica +category_misc=Diversas +category_typo=Posible error tipogr\u00e1fico +checkDone=Revisi\u00f3n completa, {0} posibles problemas encontrados +checkText=&Revisar Texto +correctionMessage=Correcci\u00f3n\: +cs=checo +da=dan\u00e9s +de=alem\u00e1n +desc_comma_whitespace=Espacios en blanco antes de coma y antes/despu\u00e9s de par\u00e9ntesis +desc_double_punct=Dos puntos o comas consecutivos +desc_repetition=Repetici\u00f3n de una palabra (p. ej.
'soy soy') +desc_repetition_short=Repetici\u00f3n de una palabra +desc_unpaired_brackets=Par\u00e9ntesis, comillas, signos de exclamaci\u00f3n, interrogaci\u00f3n y similares desparejados +desc_uppercase_sentence=Comprobar si la frase se inicia con una letra may\u00fascula +desc_whitespacerepetition=M\u00faltiples espacios en blanco +double_dots_short=Dos puntos consecutivos +double_commas_short=Dos comas consecutivas +en=ingl\u00e9s +enterText=Escriba o pegue el texto a revisar en el cuadro superior +enterText2=Escriba o pegue el texto a revisar aqu\u00ed +errorContext=Contexto\: +errorMessage=Mensaje\: +es=espa\u00f1ol +false_friend=Falso amigo +false_friend_desc=Nota para falso amigo\: +false_friend_hint=Nota\: "{0}" ({1}) significa {2} ({3}). +false_friend_suggestion=\u00bfQuiere decir {0}? +fr=franc\u00e9s +gl=gallego +guiCancelButton=Cancelar +guiCheckComplete=Se ha completado la revisi\u00f3n de LanguageTool. +guiConfigWindowTitle=LanguageTool - Opciones +guiDemoText=Ese es un un ejemplo del texto que mostra como funcciona LanguageTool. Ese programma no incluieee ninguna revisi\u00f3n ortogr\u00e1fica\! +guiMatchCount=Posibles errores\: +guiMenuAbout=Acerca de... +guiMenuAddRules=Cargar fichero de reglas +guiMenuCheckClipboard=Revisar el texto del portapapeles +guiMenuFile=Archivo +guiMenuHelp=Ayuda +guiMenuHide=Minimizar a la bandeja del sistema +guiMenuOpen=Abrir... +guiMenuOptions=Opciones... +guiMenuQuit=Terminar +guiMenuShowMainWindow=Abrir ventana principal +guiMotherTongue=Idioma nativo\: +guiNoErrorsFound=No se han encontrado errores ni advertencias (idioma\: {0}) +guiNoErrorsFoundSelectedText=No se han encontrado errores ni advertencias en el texto seleccionado (idioma\: {0}) +guiOKButton=Aceptar +guiOOoChangeButton=Cambiar +guiOOoCloseButton=Cerrar +guiOOoIgnoreAllButton=Ignorar siempre +guiOOoIgnoreButton=Ignorar +guiOOoOptionsButton=Opciones... +guiProgressWindowTitle=LanguageTool\: Comprobaci\u00f3n del texto... 
+guiReplaceWindowTitle=Sustituir texto +guiReplaceWithOtherText=<otro texto> +guiRunOnPort=Ejecutar como servidor en puerto +guiSelectionCheckComplete=Se ha completado la verificaci\u00f3n del texto seleccionado en LanguageTool. +incorrect_case=Esa frase no se inicia con may\u00fascula +is=island\u00e9s +it=italiano +lt=lituano +missing_space_after_comma=Deja un espacio despu\u00e9s de coma +ml=malayalam +nl=holand\u00e9s +no_space_after=No se deja un espacio despu\u00e9s de un par\u00e9ntesis izquierdo +no_space_before=No se deja un espacio antes de un par\u00e9ntesis derecho +no_space_before_dot=No se deja un espacio antes del punto +pl=polaco +repetition=Posible error tipogr\u00e1fico\: repetici\u00f3n de una palabra +result1=<br><b> {0}. L\u00ednea {1}, Columna {2}</b><br> +resultAreaText=Los resultados aparecer\u00e1n aqu\u00ed. +resultTime=<br>Tiempo\: {0}ms (incluye {1}ms para la coincidencia de reglas)<br> +ru=ruso +sk=eslovaco +sl=esloveno +space_after_comma=Se deja un espacio despu\u00e9s de coma y nunca antes del signo ortogr\u00e1fico.
+startChecking=Inicio de verificaci\u00f3n en {0} +sv=sueco +textLanguage=Idioma del texto\: +two_commas=Dos comas consecutivas +two_dots=Dos puntos consecutivos +uk=ucraniano +unpaired_brackets=Se ha encontrado un error en los par\u00e9ntesis, comillas, signos de exclamaci\u00f3n o interrogaci\u00f3n +whitespace_repetition=Posible error tipogr\u00e1fico\: m\u00faltiples espacios en blanco +ro=rumano diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/MessagesBundle_fr.properties b/JLanguageTool/src/java/de/danielnaber/languagetool/MessagesBundle_fr.properties new file mode 100644 index 0000000..1befa1f --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/MessagesBundle_fr.properties @@ -0,0 +1,93 @@ +#X-Generator: crowdin.net +eo=Esperanto +be=Bi\u00e9lorusse +ca=catalan +category_case=Majuscules +category_false_friend=Faux amis +category_grammar=Grammaire +category_misc=R\u00e8gles de base +category_typo=Faute de frappe possible +checkDone=V\u00e9rification termin\u00e9e. Nombre d'alertes \: {0} +checkText=V\u00e9rifier le texte +correctionMessage=Correction \: +cs=tch\u00e8que +da=danois +de=allemand +desc_comma_whitespace=Espace devant \u00ab\u00a0,\u00a0\u00bb, \u00ab\u00a0)\u00a0\u00bb ou apr\u00e8s \u00ab\u00a0(\u00a0\u00bb +desc_double_punct=Virgules ou points cons\u00e9cutifs +desc_repetition=Doublon (\u00ab\u00a0pour pour\u00a0\u00bb, \u00ab\u00a0je je\u00a0\u00bb, etc.) 
+desc_repetition_short=Doublon +desc_unpaired_brackets=Guillemet fermant ou ouvrant manquant +desc_uppercase_sentence=Majuscule en d\u00e9but de phrase +desc_whitespacerepetition=Plusieurs espaces blanches (mauvais format) +double_dots_short=Points cons\u00e9cutifs +double_commas_short=Virgules cons\u00e9cutives +en=anglais +enterText=Ins\u00e9rez le texte \u00e0 v\u00e9rifier dans l'espace du haut +enterText2=Placez le texte \u00e0 v\u00e9rifier ici +errorContext=Contexte \: +errorMessage=Message \: +es=espagnol +false_friend=Faux ami +false_friend_desc=note concernant les faux-amis \: +false_friend_hint=Note \: "{0}" ({1}) signifie {2} ({3}). +false_friend_suggestion=Voulez-vous dire {0} ? +fr=fran\u00e7ais +gl=galicien +guiCancelButton=Annuler +guiCheckComplete=La v\u00e9rification de LanguageTool est termin\u00e9e. +guiConfigWindowTitle=Options de LanguageTool +guiDemoText=Se texte est un exemple pour pour vous montrer le fonctionnement de LanguageTool. notez que LanguageTool ne comporte pas de correcteur orthographique. +guiMatchCount=Erreurs possibles \: +guiMenuAbout=\u00c0 propos... +guiMenuAddRules=Charger un fichier de r\u00e8gles +guiMenuCheckClipboard=V\u00e9rifier le texte dans le presse-papiers +guiMenuFile=Fichier +guiMenuHelp=Aide +guiMenuHide=R\u00e9duire dans la barre des t\u00e2ches +guiMenuOpen=Ouvrir... +guiMenuOptions=Options... +guiMenuQuit=&Quitter +guiMenuShowMainWindow=Ouvrir la fen\u00eatre principale +guiMotherTongue=Langue maternelle \: +guiNoErrorsFound=Ni alerte ni erreur n'ont \u00e9t\u00e9 trouv\u00e9es (langue\: {0}) +guiNoErrorsFoundSelectedText=Ni alerte ni erreur n'ont \u00e9t\u00e9 trouv\u00e9es pour le texte s\u00e9lectionn\u00e9 (langue\: {0}) +guiOKButton=&OK +guiOOoChangeButton=Changer +guiOOoCloseButton=Fermer +guiOOoIgnoreAllButton=Ignorer tout +guiOOoIgnoreButton=Ignorer +guiOOoOptionsButton=Options... +guiProgressWindowTitle=LanguageTool \: V\u00e9rification en cours...
+guiReplaceWindowTitle=Remplacer le texte +guiReplaceWithOtherText=<other text> +guiRunOnPort=Ex\u00e9cuter en tant que serveur sur le port +guiSelectionCheckComplete=La v\u00e9rification de LanguageTool pour le texte s\u00e9lectionn\u00e9 est termin\u00e9e. +incorrect_case=Cette phrase ne commence pas par une majuscule +is=islandais +it=italien +lt=lituanien +missing_space_after_comma=Cr\u00e9ez une espace apr\u00e8s la virgule +ml=malayalam +nl=n\u00e9erlandais +no_space_after=Ne placez pas d'espace apr\u00e8s une parenth\u00e8se ouvrante +no_space_before=Ne placez pas d'espace avant une parenth\u00e8se fermante +no_space_before_dot=Ne placez pas d'espace avant le point +pl=polonais +repetition=Faute de frappe possible \: un mot est r\u00e9p\u00e9t\u00e9 +result1=<br><b> {0}. Ligne {1}, colonne {2}</b><br> +resultAreaText=Les r\u00e9sultats se trouveront ici. +resultTime=<br>Temps \: {0}ms (dont {1}ms pour le filtrage de r\u00e8gles)<br> +ru=russe +sk=slovaque +sl=slov\u00e9nien +space_after_comma=Placer l'espace apr\u00e8s la virgule et non avant +startChecking=Commencement de la v\u00e9rification en {0} +sv=su\u00e9dois +textLanguage=Langue du texte \: +two_commas=Deux virgules cons\u00e9cutives +two_dots=Deux points cons\u00e9cutifs +uk=ukrainien +unpaired_brackets=Il manque une parenth\u00e8se fermante ou ouvrante, un guillemet fermant ou ouvrant, etc. 
+whitespace_repetition=Faute de frappe possible \: vous avez r\u00e9p\u00e9t\u00e9 une espace +ro=roumain diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/MessagesBundle_gl.properties b/JLanguageTool/src/java/de/danielnaber/languagetool/MessagesBundle_gl.properties new file mode 100644 index 0000000..c5a7489 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/MessagesBundle_gl.properties @@ -0,0 +1,93 @@ +#X-Generator: crowdin.net +eo=Esperanto +be=Bielorruso +ca=catal\u00e1n +category_case=Mai\u00fasculas e min\u00fasculas +category_false_friend=Falsos amigos +category_grammar=Gram\u00e1tica +category_misc=Miscel\u00e1nea +category_typo=Pos\u00edbeis erros tipogr\u00e1ficos +checkDone=Comprobaci\u00f3n feita, atop\u00e1ronse {0} problemas potenciais +checkText=&Comprobar o texto +correctionMessage=Correcci\u00f3n\: +cs=checo +da=dan\u00e9s +de=alem\u00e1n +desc_comma_whitespace=Uso de espazos en branco diante dunha coma ou antes/despois de par\u00e9ntese +desc_double_punct=Uso de dous puntos ou comas consecutivos +desc_repetition=Repetici\u00f3n dunha palabra (por exemplo, 'vai vai') +desc_repetition_short=Repetici\u00f3n dunha palabra +desc_unpaired_brackets=Par\u00e9nteses, comi\u00f1as e s\u00edmbolos similares desemparellados +desc_uppercase_sentence=Comproba que unha oraci\u00f3n comece con mai\u00fascula +desc_whitespacerepetition=M\u00faltiples espazos en branco (erro de formato) +double_dots_short=Dous puntos consecutivos +double_commas_short=D\u00faas comas consecutivas +en=ingl\u00e9s +enterText=Por favor, teclee ou pegue o texto a corrixir na \u00e1rea superior +enterText2=Por favor, insira aqu\u00ed o texto a corrixir +errorContext=Contexto\: +errorMessage=Mensaxe\: +es=espa\u00f1ol +false_friend=Falso amigo +false_friend_desc=Nota referente aos falsos amigos\: +false_friend_hint=Consello\: "{0}" ({1}) significa {2} ({3}). +false_friend_suggestion=Quer\u00eda vostede dicir {0}?
+fr=franc\u00e9s +gl=galego +guiCancelButton=Cancelar +guiCheckComplete=Completouse a correcci\u00f3n de LanguageTool. +guiConfigWindowTitle=Opci\u00f3ns de LanguageTool +guiDemoText=Esta vai a ser unha mostra de de exemplo para amosar o funcionamento de LanguageTool. Por\u00e9n, te\u00f1a en conta que non incl\u00fae un corretor de ortografia. +guiMatchCount=Erros potenciais\: +guiMenuAbout=&Acerca de... +guiMenuAddRules=Cargar ficheiro de &regras +guiMenuCheckClipboard=&Corrixir texto do portaretallos +guiMenuFile=&Ficheiro +guiMenuHelp=A&xuda +guiMenuHide=&Minimizar \u00e1 bandexa do sistema +guiMenuOpen=A&brir... +guiMenuOptions=Opci\u00f3n&s... +guiMenuQuit=Sa\u00ed&r +guiMenuShowMainWindow=Abrir fiestra principal +guiMotherTongue=A s\u00faa lingua materna\: +guiNoErrorsFound=Non se atoparon erros nin advertencias (lingua\: {0}) +guiNoErrorsFoundSelectedText=Non se atoparon erros ou advertencias no texto seleccionado (lingua\: {0}) +guiOKButton=&Aceptar +guiOOoChangeButton=&Cambiar +guiOOoCloseButton=Pechar +guiOOoIgnoreAllButton=Ignorar todos +guiOOoIgnoreButton=Ignorar +guiOOoOptionsButton=Opci\u00f3ns... +guiProgressWindowTitle=LanguageTool\: Comprobaci\u00f3n do texto... +guiReplaceWindowTitle=Substitu\u00edr o texto +guiReplaceWithOtherText=<outro texto> +guiRunOnPort=Executar como servidor no por&to +guiSelectionCheckComplete=LanguageTool completou a correcci\u00f3n do texto seleccionado. +incorrect_case=Esta oraci\u00f3n non comeza cunha letra mai\u00fascula +is=island\u00e9s +it=italiano +lt=lituano +missing_space_after_comma=Po\u00f1a un espazo detr\u00e1s da coma +ml=malaiala +nl=holand\u00e9s +no_space_after=Non debe usar espazos detr\u00e1s dos par\u00e9nteses de apertura +no_space_before=Non debe usar espazos antes dos par\u00e9nteses de peche +no_space_before_dot=Non debe po\u00f1er espazos antes dun punto. +pl=polaco +repetition=Pos\u00edbel erro tipogr\u00e1fico\: repet\u00edu unha palabra +result1=<br><b> {0}.
Li\u00f1a {1}, columna {2}</b><br> +resultAreaText=Os resultados aparecer\u00e1n aqu\u00ed +resultTime=<br>Tempo\: {0}ms (isto incl\u00fae {1}ms de coincidencia de regras)<br> +ru=ruso +sk=eslovaco +sl=esloveno +space_after_comma=Po\u00f1a un espazo en branco despois da coma, pero nunca antes +startChecking=Inicio da verificaci\u00f3n en {0} +sv=sueco +textLanguage=Lingua do texto\: +two_commas=D\u00faas comas consecutivas +two_dots=Dous puntos consecutivos +uk=ucra\u00edno +unpaired_brackets=Par\u00e9nteses ou s\u00edmbolos similares desemparellados +whitespace_repetition=Pos\u00edbel erro tipogr\u00e1fico\: repet\u00edu un espazo en branco +ro=roman\u00e9s diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/MessagesBundle_is.properties b/JLanguageTool/src/java/de/danielnaber/languagetool/MessagesBundle_is.properties new file mode 100644 index 0000000..488aea9 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/MessagesBundle_is.properties @@ -0,0 +1,93 @@ +#X-Generator: crowdin.net +eo=Esperanto +be=Belarusian +ca=Catalan +category_case=H\u00e1stafur/l\u00e1gstafur +category_false_friend=Falskir vinir +category_grammar=M\u00e1lfr\u00e6\u00f0i +category_misc=\u00ddmislegt +category_typo=Hugsanleg ritvilla +checkDone=Yfirlestri loki\u00f0, {0} hugsanlegar villur fundust +checkText=&Yfirlestur +correctionMessage=Lei\u00f0r\u00e9tting\: +cs=t\u00e9kkneska +da=Danish +de=\u00fe\u00fdska +desc_comma_whitespace=Bil \u00e1 undan kommu og \u00e1 undan/eftir sviga +desc_double_punct=Tv\u00edtekinn punktur e\u00f0a komma +desc_repetition=Endurteki\u00f0 or\u00f0 (t.d. 
'mun mun') +desc_repetition_short=Endurteki\u00f0 or\u00f0 +desc_unpaired_brackets=Svigar, hornklofar e\u00f0a \u00f6nnur greinarmerki standast ekki \u00e1 +desc_uppercase_sentence=Athuga hvort setning hefst \u00e1 st\u00f3rum staf +desc_whitespacerepetition=Tv\u00f6falt bil (galli \u00e1 uppsetningu) +double_dots_short=Tveir punktar \u00ed r\u00f6\u00f0 +double_commas_short=Tv\u00e6r kommur \u00ed r\u00f6\u00f0 +en=enska +enterText=Skrifa\u00f0u e\u00f0a l\u00edmdu texta til a\u00f0 lesa yfir \u00ed efsta reitinn +enterText2=Settu texta inn h\u00e9r +errorContext=Samhengi\: +errorMessage=Skilabo\u00f0\: +es=sp\u00e6nska +false_friend=Falskur vinur +false_friend_desc=\u00c1bending um falskan vin\: +false_friend_hint=Athuga\u00f0u\: "{0}" ({1}) merkir {2} ({3}). +false_friend_suggestion=\u00c1ttir\u00f0u vi\u00f0 {0}? +fr=franska +gl=Galician +guiCancelButton=H\u00e6tta vi\u00f0 +guiCheckComplete=LanguageTool yfirlestri er loki\u00f0. +guiConfigWindowTitle=LanguageTool valkostir +guiDemoText=\u00deetta er d\u00e6mi um texta sem \u00e1 a\u00f0 s\u00edna farm \u00e1 hvernig LanguageTool virkar. \u00dea\u00f0 er \u00fe\u00f3 h\u00e9rme\u00f0 gert lj\u00f3st a\u00f0 forriti\u00f0 framkv\u00e6mir ekki hef\u00f0bundna ritvilluleit. +guiMatchCount=Hugsanlegar villur\: +guiMenuAbout=&Um... +guiMenuAddRules=Hla\u00f0a inn &reglum +guiMenuCheckClipboard=&Yfirlesa texta \u00e1 klemmuspjaldi +guiMenuFile=&Skr\u00e1 +guiMenuHelp=&Hj\u00e1lp +guiMenuHide=&Fela \u00e1 t\u00e6kjasl\u00e1 +guiMenuOpen=&Opna... +guiMenuOptions=&Valkostir... +guiMenuQuit=&Loka +guiMenuShowMainWindow=Opna a\u00f0alglugga +guiMotherTongue=\u00deitt m\u00f3\u00f0urm\u00e1l\: +guiNoErrorsFound=Engar villur fundust (tungum\u00e1l\: {0}) +guiNoErrorsFoundSelectedText=Engar villur fundust \u00ed v\u00f6ldum texta (tungum\u00e1l\: {0}) +guiOKButton=&\u00cd lagi +guiOOoChangeButton=&Breyta +guiOOoCloseButton=Loka +guiOOoIgnoreAllButton=Sleppa \u00f6llu +guiOOoIgnoreButton=Sleppa +guiOOoOptionsButton=Valkostir...
+guiProgressWindowTitle=LanguageTool\: Les yfir texta... +guiReplaceWindowTitle=Skipta \u00fat texta +guiReplaceWithOtherText=<annar texti> +guiRunOnPort=Keyra \u00fej\u00f3n \u00e1 netg\u00e1tt nr. +guiSelectionCheckComplete=LanguageTool hefur loki\u00f0 yfirlestri. +incorrect_case=\u00deessi setning hefst ekki \u00e1 h\u00e1staf +is=Icelandic +it=\u00edtalska +lt=lith\u00e1\u00edska +missing_space_after_comma=Bil vantar \u00e1 eftir kommu +ml=Malayalam +nl=hollenska +no_space_after=Ekki setja bil eftir a\u00f0 svigi er opna\u00f0ur +no_space_before=Ekki setja bil \u00e1\u00f0ur en sviga er loka\u00f0 +no_space_before_dot=Ekki setja bil \u00e1 undan punkti +pl=p\u00f3lska +repetition=Hugsanleg ritvilla\: or\u00f0 endurteki\u00f0 +result1=<br><b> {0}. L\u00edna {1}, d\u00e1lkur {2}</b><br> +resultAreaText=Ni\u00f0urst\u00f6\u00f0ur birtast h\u00e9r +resultTime=<br>T\u00edmi\: {0}ms (\u00fear af {1}ms til a\u00f0 m\u00e1ta reglur)<br> +ru=r\u00fassneska +sk=Slovak +sl=sl\u00f3venska +space_after_comma=Bil skal vera \u00e1 eftir kommu, ekki \u00e1 undan henni +startChecking=Hef yfirlestur\: {0} +sv=s\u00e6nska +textLanguage=Tungum\u00e1l texta\: +two_commas=Tv\u00e6r kommur \u00ed r\u00f6\u00f0 +two_dots=Tveir punktar \u00ed r\u00f6\u00f0 +uk=\u00fakra\u00ednska +unpaired_brackets=Svigar (e\u00f0a svipu\u00f0 t\u00e1kn) standast ekki \u00e1 +whitespace_repetition=Hugsanleg ritvilla\: endurteki\u00f0 bil +ro=Romanian diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/MessagesBundle_it.properties b/JLanguageTool/src/java/de/danielnaber/languagetool/MessagesBundle_it.properties new file mode 100644 index 0000000..ddb2e9e --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/MessagesBundle_it.properties @@ -0,0 +1,93 @@ +#X-Generator: crowdin.net +eo=Esperanto +be=Bielorusso +ca=Catalano +category_case=Uso delle maiuscole +category_false_friend=Falsi amici +category_grammar=Grammatica +category_misc=Altre +category_typo=Possibile errore di 
battitura +checkDone=Controllo completato, sono stati trovati {0} possibili errori +checkText=&Controlla Testo +correctionMessage=Correzione\: +cs=Ceco +da=Danese +de=Tedesco +desc_comma_whitespace=Utilizzo dello spazio prima della virgola e prima/dopo le parentesi +desc_double_punct=Doppia battitura di punti o di virgole +desc_repetition=Parola ripetuta (es. 'casa casa') +desc_repetition_short=Ripetizione +desc_unpaired_brackets=Non chiusura di parentesi, virgolette e altra punteggiatura simile +desc_uppercase_sentence=Controlla che la frase inizi con una maiuscola +desc_whitespacerepetition=Ripetizione dello spazio (brutta formattazione) +double_dots_short=Due punti consecutivi +double_commas_short=Due virgole consecutive +en=Inglese +enterText=Si prega di scrivere o di incollare il testo da controllare nel campo in alto +enterText2=Si prega di inserire il testo da controllare qui +errorContext=Contesto\: +errorMessage=Messaggio\: +es=Spagnolo +false_friend=Falso amico +false_friend_desc=suggerimento di falso amico per\: +false_friend_hint=Suggerimento\: "{0}" ({1}) significa {2} ({3}). +false_friend_suggestion=Intendevi forse {0}? +fr=Francese +gl=Galiziano +guiCancelButton=Annulla +guiCheckComplete=LanguageTool ha completato il controllo. +guiConfigWindowTitle=Opzioni LanguageTool +guiDemoText=Questo \u00e8 un esempio di input per dimostrare il funzionamento di LanguageTool. Si noti, per\u00f2, che non include il controllo ortografico.works. +guiMatchCount=Probabili errori\: +guiMenuAbout=&Informazioni su... +guiMenuAddRules=Carica file delle &Regole +guiMenuCheckClipboard=&Controlla il testo nella Clipboard +guiMenuFile=&File +guiMenuHelp=&Aiuto +guiMenuHide=&Minimizza nel System Tray +guiMenuOpen=&Apri... +guiMenuOptions=Opzio&ni...
+guiMenuQuit=&Esci +guiMenuShowMainWindow=Apri la Finestra Principale +guiMotherTongue=La tua lingua madre\: +guiNoErrorsFound=Niente da segnalare (linguaggio\: {0}) +guiNoErrorsFoundSelectedText=Niente da segnalare nel testo selezionato (linguaggio\: {0}) +guiOKButton=&OK +guiOOoChangeButton=&Cambia +guiOOoCloseButton=Chiudi +guiOOoIgnoreAllButton=Ignora Tutti +guiOOoIgnoreButton=Ignora +guiOOoOptionsButton=Opzioni... +guiProgressWindowTitle=LanguageTool\: Controllo in corso... +guiReplaceWindowTitle=Rimpiazza il testo +guiReplaceWithOtherText=<altro testo> +guiRunOnPort=Esegui come server sulla po&rta +guiSelectionCheckComplete=Il controllo LanguageTool del testo selezionato \u00e8 completo. +incorrect_case=Questa frase non inizia con una maiuscola +is=Islandese +it=Italiano +lt=Lituano +missing_space_after_comma=Inserire uno spazio dopo la virgola +ml=Malayalam +nl=Olandese +no_space_after=Non inserire lo spazio dopo l'apertura di parentesi +no_space_before=Non inserire lo spazio prima della chiusura di parentesi +no_space_before_dot=Non inserire lo spazio prima del punto +pl=Polacco +repetition=Possibile errore di battitura\: parola ripetuta +result1=<br><b> {0}.
Linea {1}, colonna {2}</b><br> +resultAreaText=I risultati appariranno qui +resultTime=<br>Time\: {0}ms (inclusi {1}ms per il rule matching)<br> +ru=Russo +sk=Slovacco +sl=Sloveno +space_after_comma=Inserire lo spazio dopo la virgola e non prima +startChecking=Inizio controllo a {0} +sv=Svedese +textLanguage=Linguaggio del testo\: +two_commas=Due virgole consecutive +two_dots=Due punti consecutivi +uk=Ucraino +unpaired_brackets=Non chiusura di parentesi o di simboli simili +whitespace_repetition=Probabile errore\: ripetizione di spazio +ro=Rumeno diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/MessagesBundle_lt.properties b/JLanguageTool/src/java/de/danielnaber/languagetool/MessagesBundle_lt.properties new file mode 100644 index 0000000..1d3ea9d --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/MessagesBundle_lt.properties @@ -0,0 +1,93 @@ +#X-Generator: crowdin.net +eo=Esperanto +be=Baltarusi\u0173 +ca=Katalon\u0173 +category_case=Did\u017eiosios/ma\u017eosios raid\u0117s +category_false_friend=Netikras draugas +category_grammar=Gramatika +category_misc=Kiti +category_typo=Possible Typo +checkDone=Tikrinimas baigtas, rasta galim\u0173 klaid\u0173\: {0} +checkText=Tikrinti tekst\u0105 +correctionMessage=Correction\: +cs=\u010dek\u0173 +da=Dan\u0173 +de=vokie\u010di\u0173 +desc_comma_whitespace=Tarp\u0173 naudojimas prie\u0161/po kablel\u012f bei skliaustelius +desc_double_punct=Ar n\u0117ra dviej\u0173 pasikartojan\u010di\u0173 ta\u0161k\u0173 ar kableli\u0173 +desc_repetition=Pasikartojantys \u017eod\u017eiai (pvz.
"ar ar") +desc_repetition_short=Word repetition +desc_unpaired_brackets=Unpaired braces, brackets, quotation marks and similar symbols +desc_uppercase_sentence=Tikrinti ar sakinys prasideda i\u0161 did\u017eiosios raid\u0117s +desc_whitespacerepetition=Whitespace repetition (bad formatting) +double_dots_short=Two consecutive dots +double_commas_short=Two consecutive comma +en=angl\u0173 +enterText=Tikrinimui skirt\u0105 tekst\u0105 ra\u0161ykite arba \u012fd\u0117kite vir\u0161uje +enterText2=\u010cia para\u0161ykite arba \u012fd\u0117kite tikrinimui skirt\u0105 tekst\u0105 +errorContext=Tekstas\: +errorMessage=Klaida\: +es=ispan\u0173 +false_friend=False friend +false_friend_desc=netikras draugas\: +false_friend_hint=Hint\: "{0}" ({1}) means {2} ({3}). +false_friend_suggestion=Did you mean {0}? +fr=pranc\u016bz\u0173 +gl=Galician +guiCancelButton=At\u0161aukti +guiCheckComplete=Gramatikos tikrinimas baigtas. +guiConfigWindowTitle=Gramatikos \u012frankio nustatymai +guiDemoText=\u010dia yra pavyzdinis tekstas gramatikos tikrinimui tikrinimui. Atsiminkite , kad \u017eod\u017ei\u0173 ra\u0161yba netikrynama. +guiMatchCount=Rasta galim\u0173 klaid\u0173\: +guiMenuAbout=Apie... +guiMenuAddRules=Load Rule File +guiMenuCheckClipboard=Tikrinti nukopijuot\u0105 tekst\u0105 i\u0161karpin\u0117je +guiMenuFile=Failas +guiMenuHelp=Pagalba +guiMenuHide=Pasl\u0117pti sistemos prane\u0161im\u0173 vietoje +guiMenuOpen=Atverti... +guiMenuOptions=Nustatymai... +guiMenuQuit=I\u0161eiti +guiMenuShowMainWindow=Open main window +guiMotherTongue=Gimtoji kalba\: +guiNoErrorsFound=Klaid\u0173 nerasta (teksto kalba\: {0}) +guiNoErrorsFoundSelectedText=Pa\u017eym\u0117tame tekste klaid\u0173 nerasta (teksto kalba\: {0}) +guiOKButton=Gerai +guiOOoChangeButton=Pakeisti +guiOOoCloseButton=U\u017edaryti +guiOOoIgnoreAllButton=Nepaisyti visur +guiOOoIgnoreButton=Nepaisyti +guiOOoOptionsButton=Nustatymai... +guiProgressWindowTitle=Gramatika\: Tikrinamas tekstas... 
+guiReplaceWindowTitle=Pakeisti tekst\u0105 +guiReplaceWithOtherText=<kitas tekstas> +guiRunOnPort=Paleisti tikrinimo server\u012f, prievadas\: +guiSelectionCheckComplete=Pa\u017eym\u0117to teksto gramatikos tikrinimas baigtas. +incorrect_case=Sakinys turi prasid\u0117ti i\u0161 did\u017eiosios raid\u0117s +is=island\u0173 +it=ital\u0173 +lt=lietuvi\u0173 +missing_space_after_comma=Po kablelio reikia pad\u0117ti tarp\u0105 +ml=Malajalam\u0173 +nl=oland\u0173 +no_space_after=Nereikia d\u0117ti tarpo po atidaran\u010dio skliaustelio +no_space_before=Nereikia d\u0117ti tarpo prie\u0161 u\u017edarant\u012f skliaustel\u012f +no_space_before_dot=Don't put a space before the full stop +pl=lenk\u0173 +repetition=Possible typo\: you repeated a word +result1=<br><b> {0}. Eilut\u0117 {1}, simbolis {2}</b><br> +resultAreaText=Rezultatai bus pateikti \u010dia +resultTime=<br>nLaikas\: {0}ms (\u012fskaitant {1}ms taisykli\u0173 tikrinimui)<br> +ru=rus\u0173 +sk=slovak\u0173 +sl=Slovenian +space_after_comma=Tarp\u0105 reikia d\u0117ti po kablelio, o ne prie\u0161 j\u012f +startChecking=Pradedamas tikrinimas kalbai\: {0} +sv=\u0161ved\u0173 +textLanguage=Teksto kalba\: +two_commas=Du pasikartojantys kableliai +two_dots=Du pasikartojantys ta\u0161kai +uk=ukrainie\u010di\u0173 +unpaired_brackets=Unpaired bracket or similar symbol +whitespace_repetition=Possible typo\: you repeated a whitespace +ro=Rom\u00e2n\u0103 diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/MessagesBundle_nb.properties b/JLanguageTool/src/java/de/danielnaber/languagetool/MessagesBundle_nb.properties new file mode 100644 index 0000000..520908b --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/MessagesBundle_nb.properties @@ -0,0 +1,188 @@ +# Norwegian Bokmål translation of LanguageTool +# Copyright (C) 2006 Daniel Naber (http://www.danielnaber.de), Marcin Milkowski + +be = Hviterrussisk + +ca = Katalansk + +category_case = Capitalization + +category_false_friend = False 
friends + +category_grammar = Grammatikk + +category_misc = Diverse + +category_typo = Mulig slagfeil + +checkDone = Kontrollert ferdig, fant {0} m + +checkText = &Kontroller tekst + +correctionMessage = Rettelse: + +cs = Tsjekkisk + +da = Dansk + +de = German + +nb = Norsk Bokmål + +desc_comma_whitespace = Use of whitespace before comma and before/after parentheses + +desc_double_punct = Use of two consecutive dots or commas + +desc_repetition = Repetert ord (f.eks. 'skal skal') + +desc_repetition_short = Repetert ord + +desc_unpaired_brackets = Unpaired braces, brackets, quotation marks and similar symbols + +desc_uppercase_sentence = Checks that a sentence starts with an uppercase letter + +desc_whitespacerepetition = Whitespace repetition (bad formatting) + +double_dots_short = Two consecutive dots + +double_commas_short = Two consecutive comma + +en = Engelsk + +enterText = Please type or paste text to check in the top area + +enterText2 = Please insert text to check here + +eo = Esperanto + +errorContext = Sammenheng: + +errorMessage = Melding: + +es = Spansk + +false_friend = False friend + +false_friend_desc = false friend hint for: + +false_friend_hint = Vink: "{0}" ({1}) betyr {2} ({3}). + +false_friend_suggestion = Mente du {0}? + +fr = Fransk + +gl = Galician + +guiCancelButton = Avbryt + +guiCheckComplete = LanguageTool check is complete. + +guiConfigWindowTitle = LanguageTool Options + +guiDemoText = This is a example input to to show you how LanguageTool works. Note, however, that it does not include a spell checka. + +guiMatchCount = Potential errors: + +guiMenuAbout = &About... + +guiMenuAddRules = Load &Rule File + +guiMenuCheckClipboard = &Check Text in Clipboard + +guiMenuFile = &File + +guiMenuHelp = &Help + +guiMenuHide = &Hide to System Tray + +guiMenuOpen = &Open... + +guiMenuOptions = Option&s... 
+ +guiMenuQuit = &Quit + +guiMenuShowMainWindow = Open Main Window + +guiMotherTongue = Your mother tongue: + +guiNoErrorsFound = No errors or warnings found (language: {0}) + +guiNoErrorsFoundSelectedText = No errors or warnings found in selected text (language: {0}) + +guiOKButton = &OK + +guiOOoChangeButton = &Change + +guiOOoCloseButton = Close + +guiOOoIgnoreAllButton = Ignore All + +guiOOoIgnoreButton = Ignore + +guiOOoOptionsButton = Options... + +guiProgressWindowTitle = LanguageTool: Checking Text... + +guiReplaceWindowTitle = Replace text + +guiReplaceWithOtherText = <other text> + +guiRunOnPort = Run as server on po&rt + +guiSelectionCheckComplete = LanguageTool check of selected text is complete. + +incorrect_case = This sentence does not start with an uppercase letter + +is = Icelandic + +it = Italian + +lt = Lithuanian + +missing_space_after_comma = Put a space after the comma + +ml= Malayalam + +nl = Dutch + +no_space_after = Don't put a space after the opening parenthesis + +no_space_before = Don't put a space before the closing parenthesis + +no_space_before_dot = Don't put a space before the full stop + +pl = Polish + +repetition = Possible typo: you repeated a word + +result1 = <br><b> {0}. 
Line {1}, column {2}</b><br> + +resultAreaText = Results will appear here + +resultTime = <br>Time: {0}ms (including {1}ms for rule matching)<br> + +ru = Russian + +sk = Slovak + +sl = Slovenian + +space_after_comma = Put a space after the comma, but not before the comma + +startChecking = Starting check in {0} + +sv = Swedish + +textLanguage = Text Language: + +two_commas = Two consecutive commas + +two_dots = Two consecutive dots + +uk = Ukrainian + +unpaired_brackets = Unpaired bracket or similar symbol + +whitespace_repetition = Possible typo: you repeated a whitespace + +ro = Romanian diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/MessagesBundle_nl.properties b/JLanguageTool/src/java/de/danielnaber/languagetool/MessagesBundle_nl.properties new file mode 100644 index 0000000..0825e8b --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/MessagesBundle_nl.properties @@ -0,0 +1,93 @@ +#X-Generator: crowdin.net +eo=Esperanto +be=Witrussisch +ca=Catalaans +category_case=Hoofdlettergebruik +category_false_friend=Begripsverwarring +category_grammar=Grammatica +category_misc=Diversen +category_typo=Mogelijke typefouten +checkDone=Klaar, {0} aandachtspunten +checkText=Tekst controleren +correctionMessage=Suggestie\: +cs=Tsjechisch +da=Deens +de=Duits +desc_comma_whitespace=Spatie voor of achter haakje +desc_double_punct=Twee komma's of punten +desc_repetition=Herhaling (bijv. 
'de de') +desc_repetition_short=Herhaling +desc_unpaired_brackets=Onjuist gecombineerde leestekens +desc_uppercase_sentence=Controleert of een zin begint met een hoofdletter +desc_whitespacerepetition=Teveel witruimte tussen woorden +double_dots_short=Teveel punten +double_commas_short=Teveel komma's +en=Engels +enterText=Voer in het bovenste vak de te controleren tekst in +enterText2=Voer hier de te controleren tekst in +errorContext=Context\: +errorMessage=Melding\: +es=Spaans +false_friend=Betekenisverwarring +false_friend_desc=Hint bij mogelijke verwisseling van begrippen\: +false_friend_hint=Hint\: "{0}" ({1}) betekent {2} ({3}). +false_friend_suggestion=Bedoelde u {0}? +fr=Frans +gl=Galicisch +guiCancelButton=Annuleren +guiCheckComplete=LanguageTool-controle gereed. +guiConfigWindowTitle=LanguageTool-opties +guiDemoText=Dit is een voorbeeld om te laten zien hoe Languagetool werkt. Besef wel dat het geen spellingcontrole bevat. +guiMatchCount=Aandachtspunten\: +guiMenuAbout=Over... +guiMenuAddRules=Inlezen regelbestand +guiMenuCheckClipboard=Tekst van klembord controleren +guiMenuFile=Bestand +guiMenuHelp=Hulp +guiMenuHide=Naar systeemvak +guiMenuOpen=Openen... +guiMenuOptions=Opties... +guiMenuQuit=Afsluiten +guiMenuShowMainWindow=Open het hoofdvenster +guiMotherTongue=Moedertaal\: +guiNoErrorsFound=Geen aandachtspunten gevonden (taal\: {0}) +guiNoErrorsFoundSelectedText=Geen aandachtspunten in de geselecteerde tekst(taal\: {0}) +guiOKButton=&Ok\u00e9 +guiOOoChangeButton=Wijzigen +guiOOoCloseButton=Sluiten +guiOOoIgnoreAllButton=Alles negeren +guiOOoIgnoreButton=Negeer +guiOOoOptionsButton=Opties... +guiProgressWindowTitle=LanguageTool\: Tekst controleren... +guiReplaceWindowTitle=Tekst vervangen +guiReplaceWithOtherText=<nieuwe tekst> +guiRunOnPort=Voer uit als server op poort +guiSelectionCheckComplete=LanguageTool\: controle van geselecteerde tekst gereed. 
+incorrect_case=Deze zin begint niet met een hoofdletter +is=IJslands +it=Italiaans +lt=Litouws +missing_space_after_comma=Zet een spatie na de komma +ml=Maleis +nl=Nederlands +no_space_after=Zet geen spatie na een haakje openen +no_space_before=Zet geen spatie voor een haakje sluiten +no_space_before_dot=Zet geen spatie voor een punt +pl=Pools +repetition=Mogelijke typefout\: herhaling van woord +result1=<br><b> {0}. Regel {1}, positie {2}</b><br> +resultAreaText=Resultaten verschijnen hier. Suggesties voor (verbeteringen van) Nederlandse regels kunt u inbrengen via\: www.opentaal.org en opentaal@lists.sf.own-it.nl. +resultTime=<br>Duur\: {0}ms (inclusief {1}ms voor de regels)<br> +ru=Russisch +sk=Slovaaks +sl=Sloveens +space_after_comma=Zet een spatie na een komma, maar niet ervoor +startChecking=Start controle in {0} +sv=Zweeds +textLanguage=Teksttaal\: +two_commas=Twee opeenvolgende komma's +two_dots=Twee opeenvolgende punten +uk=Oekra\u00efens +unpaired_brackets=Oneven aantal teksthaken +whitespace_repetition=Teveel witruimte +ro=Roemeens diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/MessagesBundle_pl.properties b/JLanguageTool/src/java/de/danielnaber/languagetool/MessagesBundle_pl.properties new file mode 100644 index 0000000..ca00fb5 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/MessagesBundle_pl.properties @@ -0,0 +1,93 @@ +#X-Generator: crowdin.net +eo=esperanto +be=bia\u0142oruski +ca=katalo\u0144ski +category_case=Pisownia ma\u0142\u0105 i wielk\u0105 liter\u0105 +category_false_friend=Fa\u0142szywi przyjaciele +category_grammar=Gramatyka +category_misc=B\u0142\u0119dy r\u00f3\u017cne +category_typo=Prawdopodobna liter\u00f3wka +checkDone=Sprawdzono, liczba znalezionych potencjalnych problem\u00f3w\: {0} +checkText=&Sprawd\u017a tekst +correctionMessage=Proponowana poprawka\: +cs=Czeski +da=Du\u0144ski +de=Niemiecki +desc_comma_whitespace=Odst\u0119py przed przecinkami oraz przed nawiasami i po nawiasach 
+desc_double_punct=Podw\u00f3jne kropki lub przecinki +desc_repetition=Powt\u00f3rzenie wyrazu (np. \u201ejest jest\u201d) +desc_repetition_short=Powt\u00f3rzenie wyrazu +desc_unpaired_brackets=Niesparowane nawiasy, cudzys\u0142owy i podobne +desc_uppercase_sentence=Test, czy zdanie zaczyna si\u0119 wielk\u0105 liter\u0105 +desc_whitespacerepetition=Powt\u00f3rzenie spacji (b\u0142\u0119dne formatowanie) +double_dots_short=Dwie kropki +double_commas_short=Dwa przecinki +en=Angielski +enterText=Wpisz lub wklej tekst do sprawdzenia w g\u00f3rnym polu +enterText2=Wstaw tutaj tekst do sprawdzenia +errorContext=Kontekst\: +errorMessage=Komunikat\: +es=Hiszpa\u0144ski +false_friend=Fa\u0142szywi przyjaciele +false_friend_desc=wskaz\u00f3wka dotycz\u0105ca fa\u0142szywych przyjaci\u00f3\u0142\: +false_friend_hint=Wskaz\u00f3wka\: "{0}" ({1}) oznacza {2} ({3}) +false_friend_suggestion=Czy chodzi o {0}? +fr=francuski +gl=galisyjski +guiCancelButton=Anuluj +guiCheckComplete=Program LanguageTool zako\u0144czy\u0142 sprawdzanie. +guiConfigWindowTitle=Opcje LanguageTool +guiDemoText=To jest przyk\u0142adowy tekst kt\u00f3ry pokazuje, jak jak dzia\u0142a LanguageTool. LanguageTool nie zawiera jadnak korektora psowni. +guiMatchCount=Potencjalne b\u0142\u0119dy\: +guiMenuAbout=&Informacje... +guiMenuAddRules=Otw\u00f3rz plik &regu\u0142 +guiMenuCheckClipboard=Sprawd\u017a &tekst ze schowka +guiMenuFile=&Plik +guiMenuHelp=Pomo&c +guiMenuHide=Schowaj do paska &zada\u0144 +guiMenuOpen=&Otw\u00f3rz... +guiMenuOptions=&Opcje...
+guiMenuQuit=Za&ko\u0144cz +guiMenuShowMainWindow=Otw\u00f3rz g\u0142\u00f3wne okno +guiMotherTongue=J\u0119zyk ojczysty\: +guiNoErrorsFound=Nie znaleziono b\u0142\u0119d\u00f3w ani usterek (j\u0119zyk\: {0}) +guiNoErrorsFoundSelectedText=W zaznaczonym tek\u015bcie nie znaleziono b\u0142\u0119d\u00f3w ani usterek (j\u0119zyk\: {0}) +guiOKButton=&OK +guiOOoChangeButton=&Zmie\u0144 +guiOOoCloseButton=Zamknij +guiOOoIgnoreAllButton=Ignoruj wszystkie +guiOOoIgnoreButton=Ignoruj +guiOOoOptionsButton=&Opcje... +guiProgressWindowTitle=LanguageTool\: Sprawdzanie tekstu... +guiReplaceWindowTitle=Zast\u0105p tekst +guiReplaceWithOtherText=<inny tekst> +guiRunOnPort=Uruchom jako serwer na po&rcie +guiSelectionCheckComplete=Sprawdzanie zaznaczonego tekstu w programie LanguageTool zosta\u0142o zako\u0144czone. +incorrect_case=To zdanie nie zaczyna si\u0119 wielk\u0105 liter\u0105 +is=islandzki +it=w\u0142oski +lt=litewski +missing_space_after_comma=Po przecinku wstawiamy spacj\u0119 +ml=malayalam +nl=niderlandzki +no_space_after=Nie wstawiamy spacji po nawiasie otwieraj\u0105cym +no_space_before=Nie wstawiamy spacji przed nawiasem zamykaj\u0105cym +no_space_before_dot=Nie wstawiamy spacji przed kropk\u0105 +pl=polski +repetition=Prawdopodobna liter\u00f3wka\: powt\u00f3rzony wyraz +result1=<br><b> {0}. Wiersz {1}, kolumna {2}</b><br> +resultAreaText=Miejsce na wyniki +resultTime=<br>Czas\: {0}ms (w tym dopasowywanie regu\u0142\: {1}ms)<br> +ru=rosyjski +sk=s\u0142owacki +sl=s\u0142owe\u0144ski +space_after_comma=Spacj\u0119 wstawiamy po przecinku, nie przed przecinkiem +startChecking=Sprawdzanie w j\u0119zyku\: {0} +sv=szwedzki +textLanguage=J\u0119zyk tekstu\: +two_commas=Dwa przecinki +two_dots=Dwie kropki +uk=ukrai\u0144ski +unpaired_brackets=Niesparowany cudzys\u0142\u00f3w, nawias itd. 
+whitespace_repetition=Prawdopodobna liter\u00f3wka\: wiele spacji z rz\u0119du +ro=rumu\u0144ski diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/MessagesBundle_ro.properties b/JLanguageTool/src/java/de/danielnaber/languagetool/MessagesBundle_ro.properties new file mode 100644 index 0000000..80f49e1 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/MessagesBundle_ro.properties @@ -0,0 +1,93 @@ +#X-Generator: crowdin.net +eo=Esperanto +be=Bielorus\u0103 +ca=Catalan\u0103 +category_case=Capitalizare +category_false_friend=Prieteni fal\u0219i +category_grammar=Gramatic\u0103 +category_misc=Diverse +category_typo=Posibil\u0103 gre\u0219eal\u0103 de tastare +checkDone=Verificare complet\u0103, s-au g\u0103sit {0} probleme probabile +checkText=Verifi&c\u0103 textul +correctionMessage=Corectare\: +cs=Ceh\u0103 +da=Danez\u0103 +de=German\u0103 +desc_comma_whitespace=Spa\u021bii puse \u00eenainte de virgul\u0103 sau \u00eenainte/dup\u0103 paranteze +desc_double_punct=S-au folosit dou\u0103 puncte sau virgule consecutive +desc_repetition=Cuv\u00e2nt repetat (ex\: \u201evoi voi\u201d) +desc_repetition_short=Cuv\u00e2nt repetat +desc_unpaired_brackets=Acolade, paranteze, ghilimele sau alte simboluri similare desperecheate +desc_uppercase_sentence=Verific\u0103 dac\u0103 propozi\u021bia \u00eencepe cu liter\u0103 mare +desc_whitespacerepetition=Spa\u021biu repetat (formatare gre\u0219it\u0103) +double_dots_short=Dou\u0103 puncte consecutive +double_commas_short=Dou\u0103 virgule consecutive +en=Englez\u0103 +enterText=V\u0103 rug\u0103m s\u0103 tasta\u021bi sau s\u0103 lipi\u021bi textul de verificat \u00een zona de sus +enterText2=V\u0103 rug\u0103m s\u0103 insera\u021bi textul de verificat aici +errorContext=Context\: +errorMessage=Mesaj\: +es=Spaniol\u0103 +false_friend=Prieten fals +false_friend_desc=indiciu de prieten fals pentru\: +false_friend_hint=Indiciu\: \u201e{0}\u201d ({1}) \u00eenseamn\u0103 {2} ({3}). 
+false_friend_suggestion=A\u021bi vrut s\u0103 scrie\u021bi {0}? +fr=Francez\u0103 +gl=Galician +guiCancelButton=Renun\u021b\u0103 +guiCheckComplete=Verificarea f\u0103cut\u0103 de LanguageTool este complet\u0103. +guiConfigWindowTitle=Op\u021biuni pentru LanguageTool +guiDemoText=Acesta este un exemplu pentru a v\u0103 v\u0103 ar\u0103ta cum func\u021bioneaz\u0103 LanguageTool. Re\u021bine\u021bi c\u0103 nu include \u0219i verificare ortografic\u0103. +guiMatchCount=Posibile erori\: +guiMenuAbout=&Despre... +guiMenuAddRules=\u00cencarc\u0103 un fi\u0219ier de &reguli +guiMenuCheckClipboard=Verifi&c\u0103 textul din memorie +guiMenuFile=&Fi\u0219ier +guiMenuHelp=&Ajutor +guiMenuHide=&Ascunde \u00een zona de notificare +guiMenuOpen=&Deschide... +guiMenuOptions=&Op\u021biuni... +guiMenuQuit=&Ie\u0219ire +guiMenuShowMainWindow=Deschide fereastra principal\u0103 +guiMotherTongue=Limba dumneavoastr\u0103 matern\u0103\: +guiNoErrorsFound=Nu s-au g\u0103sit erori sau avertismente (limba {0}) +guiNoErrorsFoundSelectedText=Nu s-au g\u0103sit erori sau avertismente \u00een textul selectat (limba {0}) +guiOKButton=&OK +guiOOoChangeButton=S&chimb\u0103 +guiOOoCloseButton=\u00cenchide +guiOOoIgnoreAllButton=Ignor\u0103 tot +guiOOoIgnoreButton=Ignor\u0103 +guiOOoOptionsButton=Op\u021biuni... +guiProgressWindowTitle=LanguageTool\: Se verific\u0103 textul... +guiReplaceWindowTitle=\u00cenlocuire text +guiReplaceWithOtherText=<alt text> +guiRunOnPort=Ruleaz\u0103 ca server pe po&rtul +guiSelectionCheckComplete=Verificarea LanguageTool a textului selectat este complet\u0103.
+incorrect_case=Propozi\u021bia nu \u00eencepe cu liter\u0103 mare +is=Islandez\u0103 +it=Italian\u0103 +lt=Lituanian\u0103 +missing_space_after_comma=Pune\u021bi un spa\u021biu dup\u0103 virgul\u0103 +ml=Malayalam +nl=Olandez\u0103 +no_space_after=Nu pune\u021bi spa\u021biu dup\u0103 deschiderea parantezei +no_space_before=Nu pune\u021bi spa\u021biu dup\u0103 \u00eenchiderea parantezei +no_space_before_dot=Nu pune\u021bi spa\u021biu \u00eenainte de punct +pl=Polonez\u0103 +repetition=Posibil\u0103 gre\u0219eal\u0103\: a\u021bi repetat un cuv\u00e2nt +result1=<br><b> {0}. Linia {1}, coloana {2}</b><br> +resultAreaText=Rezultatele vor ap\u0103rea aici +resultTime=<br>Durat\u0103\: {0}ms (inclusiv {1}ms pentru potrivirea regulilor)<br> +ru=Rus\u0103 +sk=Slovac\u0103 +sl=Sloven\u0103 +space_after_comma=Pune un spa\u021biu dup\u0103 virgul\u0103, dar nu \u00eenainte de virgul\u0103 +startChecking=Verificarea \u00eencepe \u00een {0} +sv=Suedez\u0103 +textLanguage=Limba textului\: +two_commas=Dou\u0103 virgule consecutive +two_dots=Dou\u0103 puncte consecutive +uk=Ukrainian +unpaired_brackets=Parantez\u0103 nepereche +whitespace_repetition=Posibi\u0103 gre\u0219eal\u0103\: a\u021bi repetat un spa\u021biu +ro=Rom\u00e2n\u0103 diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/MessagesBundle_ru.properties b/JLanguageTool/src/java/de/danielnaber/languagetool/MessagesBundle_ru.properties new file mode 100644 index 0000000..5c27320 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/MessagesBundle_ru.properties @@ -0,0 +1,93 @@ +#X-Generator: crowdin.net +eo=Esperanto +be=\u0411\u0435\u043b\u043e\u0440\u0443\u0441\u0441\u043a\u0438\u0439 +ca=\u041a\u0430\u0442\u0430\u043b\u0430\u043d\u0441\u043a\u0438\u0439 +category_case=\u0417\u0430\u0433\u043b\u0430\u0432\u043d\u044b\u0435 \u0431\u0443\u043a\u0432\u044b +category_false_friend=\u041e\u043c\u043e\u043d\u0438\u043c\u044b 
+category_grammar=\u0413\u0440\u0430\u043c\u043c\u0430\u0442\u0438\u043a\u0430 +category_misc=\u041d\u0430\u0441\u0442\u0440\u043e\u0439\u043a\u0438 +category_typo=\u041e\u043f\u0435\u0447\u0430\u0442\u043a\u0430 +checkDone=\u041f\u0440\u043e\u0432\u0435\u0440\u043a\u0430 \u0437\u0430\u043a\u043e\u043d\u0447\u0435\u043d\u0430, {0} \u043f\u043e\u0442\u0435\u043d\u0446\u0438\u0430\u043b\u044c\u043d\u044b\u0445 \u043e\u0448\u0438\u0431\u043e\u043a \u043d\u0430\u0439\u0434\u0435\u043d\u043e +checkText=&\u041f\u0440\u043e\u0432\u0435\u0440\u0438\u0442\u044c \u0442\u0435\u043a\u0441\u0442 +correctionMessage=\u0418\u0441\u043f\u0440\u0430\u0432\u043b\u0435\u043d\u0438\u044f\: +cs=\u0427\u0435\u0448\u0441\u043a\u0438\u0439 +da=\u0414\u0430\u0442\u0441\u043a\u0438\u0439 +de=\u041d\u0435\u043c\u0435\u0446\u043a\u0438\u0439 +desc_comma_whitespace=\u041f\u0440\u043e\u0431\u0435\u043b\u044b \u043f\u0435\u0440\u0435\u0434 \u0437\u0430\u043f\u044f\u0442\u043e\u0439 \u0438\u043b\u0438 \u043f\u0435\u0440\u0435\u0434/\u043f\u043e\u0441\u043b\u0435 \u0441\u043a\u043e\u0431\u043e\u043a +desc_double_punct=\u0414\u0432\u0435 \u0437\u0430\u043f\u044f\u0442\u044b\u0435 \u0438\u043b\u0438 \u0442\u043e\u0447\u043a\u0438 \u043f\u043e\u0434\u0440\u044f\u0434 +desc_repetition=\u041f\u043e\u0432\u0442\u043e\u0440 \u0441\u043b\u043e\u0432 (\u043d\u0430\u043f\u0440\u0438\u043c\u0435\u0440\: \u201c\u043e\u043d \u043e\u043d\u201d) +desc_repetition_short=\u041f\u043e\u0432\u0442\u043e\u0440 \u0441\u043b\u043e\u0432\u0430 +desc_unpaired_brackets=\u041d\u0435\u043f\u0430\u0440\u043d\u044b\u0435 \u0441\u043a\u043e\u0431\u043a\u0438 \u0438\u043b\u0438 \u0430\u043f\u043e\u0441\u0442\u0440\u043e\u0444\u044b +desc_uppercase_sentence=\u041f\u0440\u0435\u0434\u043b\u043e\u0436\u0435\u043d\u0438\u0435 \u0434\u043e\u043b\u0436\u043d\u043e \u043d\u0430\u0447\u0438\u043d\u0430\u0442\u044c\u0441\u044f \u0441 \u0437\u0430\u0433\u043b\u0430\u0432\u043d\u043e\u0439 \u0431\u0443\u043a\u0432\u044b 
+desc_whitespacerepetition=\u041f\u043e\u0432\u0442\u043e\u0440 \u043f\u0440\u043e\u0431\u0435\u043b\u0430 +double_dots_short=\u0414\u0432\u0435 \u0442\u043e\u0447\u043a\u0438 +double_commas_short=\u0414\u0432\u0435 \u0437\u0430\u043f\u044f\u0442\u044b\u0435 +en=\u0410\u043d\u0433\u043b\u0438\u0439\u0441\u043a\u0438\u0439 +enterText=\u041d\u0430\u043f\u0435\u0447\u0430\u0442\u0430\u0439\u0442\u0435 \u0438\u043b\u0438 \u0432\u0441\u0442\u0430\u0432\u044c\u0442\u0435 \u0442\u0435\u043a\u0441\u0442 \u0434\u043b\u044f \u043f\u0440\u043e\u0432\u0435\u0440\u043a\u0438 +enterText2=\u041d\u0430\u043f\u0435\u0447\u0430\u0442\u0430\u0439\u0442\u0435 \u0438\u043b\u0438 \u0432\u0441\u0442\u0430\u0432\u044c\u0442\u0435 \u0442\u0435\u043a\u0441\u0442 \u0434\u043b\u044f \u043f\u0440\u043e\u0432\u0435\u0440\u043a\u0438 +errorContext=\u041e\u0448\u0438\u0431\u043a\u0430 \u043a\u043e\u043d\u0442\u0435\u043a\u0441\u0442\u0430\: +errorMessage=\u041e\u0448\u0438\u0431\u043a\u0430\: +es=\u0418\u0441\u043f\u0430\u043d\u0441\u043a\u0438\u0439 +false_friend=\u041e\u043c\u043e\u043d\u0438\u043c\u044b +false_friend_desc=\u0417\u043d\u0430\u0447\u0435\u043d\u0438\u0435 \u043e\u043c\u043e\u043d\u0438\u043c\u043e\u0432\: +false_friend_hint=\u041f\u043e\u0434\u0441\u043a\u0430\u0437\u043a\u0430\: "{0}" ({1}) \u043e\u0437\u043d\u0430\u0447\u0430\u0435\u0442 {2} ({3}). +false_friend_suggestion=\u041f\u0440\u0435\u0434\u043f\u043e\u043b\u0430\u0433\u0430\u0435\u0442\u0441\u044f {0}? +fr=\u0424\u0440\u0430\u043d\u0446\u0443\u0437\u0441\u043a\u0438\u0439 +gl=\u0413\u0430\u043b\u0438\u0441\u0438\u0439\u0441\u043a\u0438\u0439 +guiCancelButton=\u041e\u0442\u043c\u0435\u043d\u0430 +guiCheckComplete=LanguageTool \u0437\u0430\u0432\u0435\u0440\u0448\u0438\u043b \u043f\u0440\u043e\u0432\u0435\u0440\u043a\u0443. 
+guiConfigWindowTitle=\u041d\u0430\u0441\u0442\u0440\u043e\u0439\u043a\u0438 LanguageTool +guiDemoText=\u042d\u0442\u043e \u043f\u0440\u0438\u043c\u0435\u0440 \u0440\u0430\u0431\u043e\u0442\u044b LanguageTool. \u041e\u043d \u043e\u043d \u0441\u043e\u0437\u0434\u0430\u043d \u0434\u043b\u044f \u043f\u043e\u0438\u0441\u043a\u0430 \u0433\u0440\u0430\u043c\u043c\u0430\u0442\u0438\u0447\u0435\u0441\u043a\u0438\u0445 \u043e\u0448\u0438\u0431\u043e\u043a.. +guiMatchCount=\u041f\u043e\u0442\u0435\u043d\u0446\u0438\u0430\u043b\u044c\u043d\u044b\u0435 \u043e\u0448\u0438\u0431\u043a\u0438\: +guiMenuAbout=&\u041e... +guiMenuAddRules=\u0417\u0430\u0433\u0440\u0443\u0437\u0438\u0442\u044c \u0444\u0430\u0439\u043b &\u041f\u0440\u0430\u0432\u0438\u043b +guiMenuCheckClipboard=\u041f&\u0440\u043e\u0432\u0435\u0440\u0438\u0442\u044c \u0442\u0435\u043a\u0441\u0442 \u0432 \u0431\u0443\u0444\u0435\u0440\u0435 \u043e\u0431\u043c\u0435\u043d\u0430 +guiMenuFile=&\u0424\u0430\u0439\u043b +guiMenuHelp=\u041f\u043e&\u043c\u043e\u0449\u044c +guiMenuHide=&\u0421\u043f\u0440\u044f\u0442\u0430\u0442\u044c \u0432 \u0442\u0440\u0435\u0439 +guiMenuOpen=\u041e&\u0442\u043a\u0440\u044b\u0442\u044c... +guiMenuOptions=\u041a\u043e&\u043d\u0444\u0438\u0433\u0443\u0440\u0430\u0446\u0438\u044f... 
+guiMenuQuit=&\u0412\u044b\u0445\u043e\u0434 +guiMenuShowMainWindow=\u041e\u0442\u043a\u0440\u044b\u0442\u044c \u0433\u043b\u0430\u0432\u043d\u043e\u0435 \u043e\u043a\u043d\u043e +guiMotherTongue=\u042f\u0437\u044b\u043a\: +guiNoErrorsFound=\u041d\u0435 \u043d\u0430\u0439\u0434\u0435\u043d\u043e \u043e\u0448\u0438\u0431\u043e\u043a \u0438 \u043f\u0440\u0435\u0434\u0443\u043f\u0440\u0435\u0436\u0434\u0435\u043d\u0438\u0439 (\u044f\u0437\u044b\u043a\: {0}) +guiNoErrorsFoundSelectedText=\u041d\u0435 \u043d\u0430\u0439\u0434\u0435\u043d\u043e \u043e\u0448\u0438\u0431\u043e\u043a \u0438 \u043f\u0440\u0435\u0434\u0443\u043f\u0440\u0435\u0436\u0434\u0435\u043d\u0438\u0439 (\u044f\u0437\u044b\u043a\: {0}) +guiOKButton=&OK +guiOOoChangeButton=&\u0418\u0437\u043c\u0435\u043d\u0438\u0442\u044c +guiOOoCloseButton=\u0417\u0430\u043a\u0440\u044b\u0442\u044c +guiOOoIgnoreAllButton=\u0418\u0433\u043d\u043e\u0440\u0438\u0440\u043e\u0432\u0430\u0442\u044c \u0432\u0441\u0451 +guiOOoIgnoreButton=\u0418\u0433\u043d\u043e\u0440\u0438\u0440\u043e\u0432\u0430\u0442\u044c +guiOOoOptionsButton=\u041e\u043f\u0446\u0438\u0438... +guiProgressWindowTitle=LanguageTool\: \u041f\u0440\u043e\u0432\u0435\u0440\u043a\u0430 \u0442\u0435\u043a\u0441\u0442\u0430... +guiReplaceWindowTitle=\u0417\u0430\u043c\u0435\u043d\u0438\u0442\u044c \u0442\u0435\u043a\u0441\u0442 +guiReplaceWithOtherText=<\u043f\u0440\u043e\u0447\u0438\u0439 \u0442\u0435\u043a\u0441\u0442> +guiRunOnPort=\u0417\u0430\u043f&\u0443\u0441\u0442\u0438\u0442\u044c \u043a\u0430\u043a \u0441\u0435\u0442\u0435\u0432\u043e\u0439 \u0441\u0435\u0440\u0432\u0435\u0440. \u041f\u043e\u0440\u0442\: +guiSelectionCheckComplete=LanguageTool \u0437\u0430\u0432\u0435\u0440\u0448\u0438\u043b \u043f\u0440\u043e\u0432\u0435\u0440\u043a\u0443 \u0432\u044b\u0434\u0435\u043b\u0435\u043d\u043d\u043e\u0433\u043e \u0442\u0435\u043a\u0441\u0442\u0430. 
+incorrect_case=\u042d\u0442\u043e \u043f\u0440\u0435\u0434\u043b\u043e\u0436\u0435\u043d\u0438\u0435 \u043d\u0435 \u043d\u0430\u0447\u0438\u043d\u0430\u0435\u0442\u0441\u044f \u0441 \u0437\u0430\u0433\u043b\u0430\u0432\u043d\u043e\u0439 \u0431\u0443\u043a\u0432\u044b +is=\u0418\u0441\u043b\u0430\u043d\u0434\u0441\u043a\u0438\u0439 +it=\u0418\u0442\u0430\u043b\u044c\u044f\u043d\u0441\u043a\u0438\u0439 +lt=\u041b\u0438\u0442\u043e\u0432\u0441\u043a\u0438\u0439 +missing_space_after_comma=\u041f\u043e\u0441\u0442\u0430\u0432\u044c\u0442\u0435 \u043f\u0440\u043e\u0431\u0435\u043b \u043f\u043e\u0441\u043b\u0435 \u0437\u0430\u043f\u044f\u0442\u043e\u0439 +ml=\u041c\u0430\u043b\u0430\u044f\u043b\u0430\u043c +nl=\u041d\u0438\u0434\u0435\u0440\u043b\u0430\u043d\u0434\u0441\u043a\u0438\u0439 +no_space_after=\u041d\u0435 \u0441\u0442\u0430\u0432\u044c\u0442\u0435 \u043f\u0440\u043e\u0431\u0435\u043b \u043f\u043e\u0441\u043b\u0435 \u043e\u0442\u043a\u0440\u044b\u0432\u0430\u044e\u0449\u0435\u0439\u0441\u044f \u0441\u043a\u043e\u0431\u043a\u0438 +no_space_before=\u041d\u0435 \u0441\u0442\u0430\u0432\u044c\u0442\u0435 \u043f\u0440\u043e\u0431\u0435\u043b \u0434\u043e \u0437\u0430\u043a\u0440\u044b\u0432\u0430\u044e\u0449\u0435\u0439\u0441\u044f \u0441\u043a\u043e\u0431\u043a\u0438 +no_space_before_dot=\u041d\u0435 \u0441\u0442\u0430\u0432\u044c\u0442\u0435 \u043f\u0440\u043e\u0431\u0435\u043b \u043f\u0435\u0440\u0435\u0434 \u0442\u043e\u0447\u043a\u043e\u0439 \u0432 \u043a\u043e\u043d\u0446\u0435 \u043f\u0440\u0435\u0434\u043b\u043e\u0436\u0435\u043d\u0438\u044f +pl=\u041f\u043e\u043b\u044c\u0441\u043a\u0438\u0439 +repetition=\u0412\u043e\u0437\u043c\u043e\u0436\u043d\u0430\u044f \u043e\u043f\u0435\u0447\u0430\u0442\u043a\u0430\: \u043f\u043e\u0432\u0442\u043e\u0440 \u0441\u043b\u043e\u0432\u0430 +result1=<br><b> {0}. 
\u0421\u0442\u0440\u043e\u043a\u0430 {1}, \u0421\u0442\u043e\u043b\u0431\u0435\u0446 {2}</b><br> +resultAreaText=\u0420\u0435\u0437\u0443\u043b\u044c\u0442\u0430\u0442 \u043f\u0440\u043e\u0432\u0435\u0440\u043a\u0438 \u0431\u0443\u0434\u0435\u0442 \u043f\u0440\u0435\u0434\u0441\u0442\u0430\u0432\u043b\u0435\u043d \u0437\u0434\u0435\u0441\u044c +resultTime=<br>\u0412\u0440\u0435\u043c\u044f\: {0}ms (\u0432\u043a\u043b\u044e\u0447\u0430\u044f {1}ms \u0434\u043b\u044f \u043e\u0431\u0440\u0430\u0431\u043e\u0442\u043a\u0438 \u043f\u0440\u0430\u0432\u0438\u043b)<br> +ru=\u0420\u0443\u0441\u0441\u043a\u0438\u0439 +sk=\u0421\u043b\u043e\u0432\u0430\u0446\u043a\u0438\u0439 +sl=\u0421\u043b\u043e\u0432\u0435\u043d\u0441\u043a\u0438\u0439 +space_after_comma=\u041f\u043e\u0441\u0442\u0430\u0432\u044c\u0442\u0435 \u043f\u0440\u043e\u0431\u0435\u043b \u043f\u043e\u0441\u043b\u0435 \u0437\u0430\u043f\u044f\u0442\u043e\u0439, \u0430 \u043d\u0435 \u043f\u0435\u0440\u0435\u0434 \u043d\u0435\u0439 +startChecking=\u041d\u0430\u0447\u0430\u043b\u043e \u043f\u0440\u043e\u0432\u0435\u0440\u043a\u0438. 
\u042f\u0437\u044b\u043a\: {0} +sv=\u0428\u0432\u0435\u0434\u0441\u043a\u0438\u0439 +textLanguage=\u042f\u0437\u044b\u043a \u0442\u0435\u043a\u0441\u0442\u0430\: +two_commas=\u0414\u0432\u0435 \u0437\u0430\u043f\u044f\u0442\u044b\u0435 \u043f\u043e\u0434\u0440\u044f\u0434 +two_dots=\u0414\u0432\u0435 \u0442\u043e\u0447\u043a\u0438 \u043f\u043e\u0434\u0440\u044f\u0434 +uk=\u0423\u043a\u0440\u0430\u0438\u043d\u0441\u043a\u0438\u0439 +unpaired_brackets=\u041d\u0435\u043f\u0430\u0440\u043d\u0430\u044f \u0441\u043a\u043e\u0431\u043a\u0430, \u0430\u043f\u043e\u0441\u0442\u0440\u043e\u0444 \u0438\u043b\u0438 \u043a\u0430\u0432\u044b\u0447\u043a\u0430 +whitespace_repetition=\u041f\u043e\u0432\u0442\u043e\u0440 \u043f\u0440\u043e\u0431\u0435\u043b\u0430 +ro=\u0420\u0443\u043c\u044b\u043d\u0441\u043a\u0438\u0439 diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/MessagesBundle_sk.properties b/JLanguageTool/src/java/de/danielnaber/languagetool/MessagesBundle_sk.properties new file mode 100644 index 0000000..6a71621 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/MessagesBundle_sk.properties @@ -0,0 +1,93 @@ +#X-Generator: crowdin.net +eo=Esperanto +be=Bielorusky +ca=Katal\u00e1nsky +category_case=Ve\u013ek\u00e9 a mal\u00e9 p\u00edsmen\u00e1 +category_false_friend=Falo\u0161n\u00fd priatelia +category_grammar=Gramatika +category_misc=R\u00f4zne +category_typo=Mo\u017en\u00fd preklep +checkDone=Kontrola je ukon\u010den\u00e1, {0} n\u00e1jden\u00fdch potenci\u00e1lnych ch\u00fdb. +checkText=S&kontrolova\u0165 +correctionMessage=Oprava\: +cs=\u010cesky +da=D\u00e1nsky +de=Nemecky +desc_comma_whitespace=Pou\u017eitie medzery pred \u010diarkou a pred/za z\u00e1tvorkami +desc_double_punct=Pou\u017eitie dvoch za sebou id\u00facich bodiek alebo \u010diarok +desc_repetition=Opakovanie slov (napr. 
'bude bude') +desc_repetition_short=Opakovanie slov +desc_unpaired_brackets=Nesp\u00e1rovan\u00e9 z\u00e1tvorky, \u00favodzovky alebo podobn\u00e9 symboly +desc_uppercase_sentence=Skontrolujte, \u010di veta za\u010d\u00edna ve\u013ek\u00fdmi po\u010diato\u010dn\u00fdmi p\u00edsmenami +desc_whitespacerepetition=Opakovanie "bielych znakov" napr. medzier (zl\u00e9 form\u00e1tovanie) +double_dots_short=Dve za sebou id\u00face bodky +double_commas_short=Dve za sebou id\u00face \u010diarky +en=Anglicky +enterText=Vlo\u017ete alebo nap\u00ed\u0161te text, ktor\u00fd chcete skontrolova\u0165 do hornej \u010dasti +enterText2=Sem vlo\u017ete po\u017eadovan\u00fd text +errorContext=Kontext\: +errorMessage=Spr\u00e1va\: +es=\u0160panielsky +false_friend=Falo\u0161n\u00fd priate\u013e +false_friend_desc=falo\u0161n\u00fd priate\u013e, tip\: +false_friend_hint=Tip\: "{0}" ({1}) znamen\u00e1 {2} ({3}). +false_friend_suggestion=M\u00e1te na mysli {0}? +fr=Franc\u00fazsky +gl=Gal\u00edcky +guiCancelButton=Zru\u0161i\u0165 +guiCheckComplete=LanguageTool kontrola bola ukon\u010den\u00e1. +guiConfigWindowTitle=LanguageTool Nastavenia +guiDemoText=Toto je uk\u00e1\u017ekov\u00fd vstup, na predvedenie funk\u010dnosti LanguageTool. Pam\u00e4tajte si si, \u017ee neobsahuje "kontrolu" preklepo. +guiMatchCount=Potenci\u00e1lne chyby\: +guiMenuAbout=&O programe... +guiMenuAddRules=Na\u010d\u00edta\u0165 s\u00fabor s p&ravidlami +guiMenuCheckClipboard=S&kontrolova\u0165 text v schr\u00e1nke (clipboard) +guiMenuFile=&S\u00fabor +guiMenuHelp=&Pomocn\u00edk +guiMenuHide=Skry\u0165 do syst\u00e9movej &li\u0161ty +guiMenuOpen=&Otvori\u0165... +guiMenuOptions=Nastaveni&a... 
+guiMenuQuit=&Koniec +guiMenuShowMainWindow=Otvori\u0165 hlavn\u00e9 okno +guiMotherTongue=V\u00e1\u0161 rodn\u00fd jazyk\: +guiNoErrorsFound=Bez ch\u00fdb alebo varovan\u00ed (jazyk\: {0}) +guiNoErrorsFoundSelectedText=Neboli n\u00e1jden\u00e9 chyby alebo varovania pre zvolen\u00fd text (jazyk\: {0}) +guiOKButton=&OK +guiOOoChangeButton=&Zmeni\u0165 +guiOOoCloseButton=Zatvori\u0165 +guiOOoIgnoreAllButton=Ignorova\u0165 v\u0161etko +guiOOoIgnoreButton=Ignorova\u0165 +guiOOoOptionsButton=Nastavenia... +guiProgressWindowTitle=LanguageTool\: kontrolujem text... +guiReplaceWindowTitle=Nahradi\u0165 text +guiReplaceWithOtherText=<in\u00fd text> +guiRunOnPort=Spusti\u0165 ako server na po&rte +guiSelectionCheckComplete=LanguageTool kontrola ozna\u010den\u00e9ho textu je dokon\u010den\u00e1. +incorrect_case=T\u00e1to veta neza\u010d\u00edna s ve\u013ek\u00fdm p\u00edsmenom +is=Islandsky +it=Taliansky +lt=Litovsk\u00fd +missing_space_after_comma=Vlo\u017ete medzeru za \u010diarku +ml=Malaj\u00e1lamsky +nl=Holandsky +no_space_after=Nevlo\u017ei\u0165 medzeru za otv\u00e1raciu z\u00e1tvorku +no_space_before=Nevlo\u017ei\u0165 medzeru pred zatv\u00e1raciu z\u00e1tvorku +no_space_before_dot=Nevlo\u017ei\u0165 medzeru pred bodku +pl=Po\u013esky +repetition=Mo\u017en\u00fd preklep\: zopakovali ste slovo +result1=<br><b> {0}. 
Riadok {1}, st\u013apec {2}</b><br> +resultAreaText=Tu sa zobraz\u00ed v\u00fdsledok +resultTime=<br>\u010cas\: {0}ms (vr\u00e1tane {1}ms na pou\u017eitie pravidiel)<br> +ru=Rusky +sk=Slovensky +sl=Slovinsky +space_after_comma=Vlo\u017ei\u0165 medzeru za \u010diarku, ale nie pred \u010diarku +startChecking=Za\u010diatok kontroly po {0} +sv=\u0160v\u00e9dsky +textLanguage=Jazyk textu\: +two_commas=Dve po sebe id\u00face \u010diarky +two_dots=Dve po sebe id\u00face bodky +uk=Ukrajinsky +unpaired_brackets=Nevyp\u00e1rovan\u00e9 z\u00e1tvorky alebo podobn\u00fd symbol +whitespace_repetition=Mo\u017en\u00fd preklep\: zopakovali ste "biely znak" (whitespace) +ro=Rumunsky diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/MessagesBundle_sl.properties b/JLanguageTool/src/java/de/danielnaber/languagetool/MessagesBundle_sl.properties new file mode 100644 index 0000000..ff90555 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/MessagesBundle_sl.properties @@ -0,0 +1,93 @@ +#X-Generator: crowdin.net +eo=Esperanto +be=beloruski +ca=katalonski +category_case=Velike za\u010detnice +category_false_friend=La\u017eni prijatelji +category_grammar=Slovnica +category_misc=Razno +category_typo=Mo\u017ena tipkarska napaka +checkDone=Preverjanje kon\u010dano, najdenih {0} potencialnih te\u017eav +checkText=&Preveri besedilo +correctionMessage=Popravek\: +cs=\u010de\u0161ki +da=danski +de=nem\u0161ki +desc_comma_whitespace=Uporaba presledka, tabulatorja ali preloma vrstice pred vejico in pred/po oklepaju +desc_double_punct=Uporaba dveh zaporednih pik ali vejic +desc_repetition=Podvojena beseda (npr. 
'bo bo') +desc_repetition_short=Podvojena beseda +desc_unpaired_brackets=Neparni oklepaji, zaviti oklepaji, narekovaji in podobni znaki +desc_uppercase_sentence=Preveri, da se poved za\u010dne z veliko za\u010detnico +desc_whitespacerepetition=Ponovljen presledek (nepravilno oblikovanje) +double_dots_short=Zaporedni piki +double_commas_short=Zaporedni vejici +en=angle\u0161ki +enterText=Prosimo, vnesite ali prilepite besedilo za preverjanje v zgornje podro\u010dje +enterText2=Prosimo, vnesite besedilo za preverjanje semkaj +errorContext=Kontekst\: +errorMessage=Sporo\u010dilo\: +es=\u0161panski +false_friend=La\u017eni prijatelj +false_friend_desc=namig o napa\u010dnem prijatelju za\: +false_friend_hint=Namig\: "{0}" ({1}) pomeni {2} ({3}). +false_friend_suggestion=Ste imeli v mislih {0}? +fr=francoski +gl=galicijski +guiCancelButton=Prekli\u010di +guiCheckComplete=Preverjanje z LanguageTool je dokon\u010dano. +guiConfigWindowTitle=Mo\u017enosti LanguageTool +guiDemoText=To je primer vnosa za potrebe prikaza delovanja orodja LanguageTool. Upo\u0161tevajte, da ne vklju\u010duje preverjanja \u010drkovanja. +guiMatchCount=Mo\u017ene napake\: +guiMenuAbout=&O raz\u0161iritvi ... +guiMenuAddRules=Nalo\u017ei &datoteko s pravili +guiMenuCheckClipboard=&Preveri besedilo na odlo\u017ei\u0161\u010du +guiMenuFile=&Datoteka +guiMenuHelp=Po&mo\u010d +guiMenuHide=&Skrij v sistemski pladenj +guiMenuOpen=&Odpri ... +guiMenuOptions=&Mo\u017enosti ... +guiMenuQuit=I&zhod +guiMenuShowMainWindow=Odpri glavno okno +guiMotherTongue=Va\u0161 materni jezik\: +guiNoErrorsFound=Ni najdenih napak ali opozoril (jezik\: {0}) +guiNoErrorsFoundSelectedText=V izbranem besedilu ni najdenih napak ali opozoril (jezik\: {0}) +guiOKButton=V &redu +guiOOoChangeButton=&Spremeni +guiOOoCloseButton=Zapri +guiOOoIgnoreAllButton=Prezri vse +guiOOoIgnoreButton=Prezri +guiOOoOptionsButton=Mo\u017enosti ... +guiProgressWindowTitle=LanguageTool\: preverjanje besedila ... 
+guiReplaceWindowTitle=Zamenjaj besedilo +guiReplaceWithOtherText=<drugo besedilo> +guiRunOnPort=Po\u017eeni kot stre\u017enik na v&ratih +guiSelectionCheckComplete=Preverjanje izbranega besedila z LanguageTool je dokon\u010dano. +incorrect_case=Ta poved se ne za\u010denja z veliko za\u010detnico +is=islandski +it=italijanski +lt=litovski +missing_space_after_comma=Po vejici vstavi presledek +ml=malajalamski +nl=nizozemski +no_space_after=Ne postavljaj presledka za oklepaj +no_space_before=Ne postavljaj presledka pred zaklepaj +no_space_before_dot=Ne postavljaj presledka pred piko +pl=poljski +repetition=Mo\u017ena tipkarska napaka\: ponovili ste besedo +result1=<br><b> {0}. Vrstica {1}, stolpec {2}</b><br> +resultAreaText=Tukaj se bodo izpisali rezultati +resultTime=<br>\u010cas\: {0}ms (vklju\u010dno z {1}ms za ujemanje pravil)<br> +ru=ruski +sk=slova\u0161ki +sl=slovenski +space_after_comma=Presledek vstavi po vejici, ne pa pred vejico +startChecking=Za\u010detek preverjanja v {0} +sv=\u0161vedski +textLanguage=Jezik besedila\: +two_commas=Dve zaporedni vejici +two_dots=Dve zaporedni piki +uk=ukrajinski +unpaired_brackets=Neparni oklepaji ali podobni znaki +whitespace_repetition=Mo\u017ena tipkarska napaka\: ponovili ste presledek +ro=romunski diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/MessagesBundle_sv.properties b/JLanguageTool/src/java/de/danielnaber/languagetool/MessagesBundle_sv.properties new file mode 100644 index 0000000..796aabf --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/MessagesBundle_sv.properties @@ -0,0 +1,93 @@ +#X-Generator: crowdin.net +eo=Esperanto +be=Belarusian +ca=Catalan +category_case=Stor eller liten bokstav +category_false_friend=Falska v\u00e4nner +category_grammar=Grammatik +category_misc=\u00d6vrigt +category_typo=Eventuellt ett stavfel +checkDone=Kontroll utf\u00f6rd, Hittade {0} potentiella problem +checkText=&Kontrollera Text +correctionMessage=R\u00e4ttelse\: +cs=Tjeckiska +da=Danish 
+de=Tyska +desc_comma_whitespace=Anv\u00e4ndande av blanksteg f\u00f6re kommatecken eller f\u00f6re/efter parentes +desc_double_punct=Anv\u00e4ndande av dubbla punkter eller kommatecken +desc_repetition=Repetition av ord (exempelvis 'till till') +desc_repetition_short=Repetition av ord +desc_unpaired_brackets=Icke ihopparade parenteser, citattecken eller liknande symboler +desc_uppercase_sentence=Kontrollera att meningar b\u00f6rjar med stor bokstav +desc_whitespacerepetition=Upprepade mellanslag (d\u00e5lig formateringsvana) +double_dots_short=Two consecutive dots +double_commas_short=Two consecutive comma +en=Engelska +enterText=Skriv eller klistra in text i \u00f6vre f\u00e4ltet +enterText2=L\u00e4gg in text f\u00f6r kontroll h\u00e4r +errorContext=Sammanhang\: +errorMessage=Meddelande\: +es=Spanska +false_friend=False friend +false_friend_desc=Hint om falska v\u00e4nner\: +false_friend_hint=Hint\: "{0}" ({1}) betyder {2} ({3}). +false_friend_suggestion=Menade du {0}? +fr=Franska +gl=Galician +guiCancelButton=Avbryt +guiCheckComplete=LanguageTool kontroll \u00e4r f\u00e4rdig. +guiConfigWindowTitle=LanguageTool Alternativ +guiDemoText=Detta \u00e4r en en exempeltext f\u00f6r att visa hur LanguageTool fungerar.. Notera att den inte inneh\u00e5lle n\u00e5gon stavningskontrol. +guiMatchCount=M\u00f6jliga fel\: +guiMenuAbout=&Om... +guiMenuAddRules=Ladda in &Regelfil +guiMenuCheckClipboard=&Kontrollera texten i Urklipp +guiMenuFile=&Arkiv +guiMenuHelp=&Hj\u00e4lp +guiMenuHide=&G\u00f6m i systemf\u00e4ltet +guiMenuOpen=&\u00f6ppna... +guiMenuOptions=&Alternativ... 
+guiMenuQuit=&Avsluta +guiMenuShowMainWindow=\u00d6ppna huvudf\u00f6nster +guiMotherTongue=Ditt modersm\u00e5l\: +guiNoErrorsFound=Hittade inga fel eller varningar (spr\u00e5k\: {0}) +guiNoErrorsFoundSelectedText=Hittade inga fel eller varningar i markerad text (spr\u00e5k\: {0}) +guiOKButton=&OK +guiOOoChangeButton=&\u00c4ndra +guiOOoCloseButton=St\u00e4ng +guiOOoIgnoreAllButton=Ignorera Alla +guiOOoIgnoreButton=Ignorera +guiOOoOptionsButton=Alternativ... +guiProgressWindowTitle=LanguageTool\: Kontrollerar text... +guiReplaceWindowTitle=Ers\u00e4tt text +guiReplaceWithOtherText=<annan text> +guiRunOnPort=K\u00f6r server p\u00e5 po&rt +guiSelectionCheckComplete=LanguageTool kontroll av markerad text \u00e4r utf\u00f6rd. +incorrect_case=Denna mening b\u00f6rjar med liten bokstav +is=Icelandic +it=Italienska +lt=Litauiska +missing_space_after_comma=L\u00e4gg till ett blanksteg efter kommatecknet +ml=Malayalam +nl=Holl\u00e4ndska +no_space_after=Ta bort blanksteg efter \u00f6ppnande parentesen +no_space_before=Ta bort blanksteg f\u00f6re avslutande parentesen +no_space_before_dot=Don't put a space before the full stop +pl=Polska +repetition=M\u00f6jlig felskrivning\: du repeterade ett ord +result1=<br><b> {0}. Rad {1}, kolumn {2}</b><br> +resultAreaText=Resultatet visas h\u00e4r +resultTime=<br>Tid\: {0}ms (inklusive {1}ms f\u00f6r regelmatchning)<br> +ru=Ryska +sk=Slovakiska +sl=Slovenska +space_after_comma=L\u00e4gg till ett mellanrum efter kommatecknet, men inte f\u00f6re +startChecking=P\u00e5b\u00f6rjar kontroll om {0} +sv=Svenska +textLanguage=Textens spr\u00e5k\: +two_commas=Dubbla kommatecken +two_dots=Dubbla punkter +uk=Ukrainska +unpaired_brackets=Icke ihopparade parenteser eller liknande symboler +whitespace_repetition=M\u00f6jlig felskrivning\: du har gjort upprepade mellanslag. 
+ro=Rom\u00e2n\u0103 diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/MessagesBundle_uk.properties b/JLanguageTool/src/java/de/danielnaber/languagetool/MessagesBundle_uk.properties new file mode 100644 index 0000000..67704db --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/MessagesBundle_uk.properties @@ -0,0 +1,93 @@ +#X-Generator: crowdin.net +be=Belarusian +ca=Catalan +category_case=\u0412\u0435\u043b\u0438\u043a\u0456 \u043b\u0456\u0442\u0435\u0440\u0438 +category_false_friend=\u0424\u0430\u043b\u044c\u0448\u0438\u0432\u0456 \u0434\u0440\u0443\u0437\u0456 +category_grammar=\u0413\u0440\u0430\u043c\u0430\u0442\u0438\u043a\u0430 +category_misc=\u0406\u043d\u0448\u0435 +category_typo=\u041c\u043e\u0436\u043b\u0438\u0432\u0430 \u043c\u0435\u0445\u0430\u043d\u0456\u0447\u043d\u0430 \u043f\u043e\u043c\u0438\u043b\u043a\u0430 +checkDone=\u041f\u0435\u0440\u0435\u0432\u0456\u0440\u043a\u0443 \u0437\u0430\u0432\u0435\u0440\u0448\u0435\u043d\u043e, \u0437\u043d\u0430\u0439\u0434\u0435\u043d\u043e {0} \u043f\u043e\u0442\u0435\u043d\u0446\u0456\u0430\u043b\u044c\u043d\u0438\u0445 \u043f\u043e\u043c\u0438\u043b\u043e\u043a +checkText=\u041f\u0435\u0440\u0435\u0432\u0456\u0440\u0438\u0442\u0438 \u0442\u0435\u043a\u0441\u0442 +correctionMessage=\u0412\u0438\u043f\u0440\u0430\u0432\u043b\u0435\u043d\u043d\u044f\: +cs=\u0427\u0435\u0441\u044c\u043a\u0430 +da=Danish +de=\u041D\u0456\u043C\u0435\u0446\u044C\u043A\u0430 +desc_comma_whitespace=\u0412\u0436\u0438\u0432\u0430\u043D\u043D\u044F \u043F\u0440\u043E\u0433\u0430\u043B\u0438\u043D\u0438 \u043F\u0435\u0440\u0435\u0434 \u043A\u043E\u043C\u043E\u044E \u0442\u0430 \u043F\u0435\u0440\u0435\u0434/\u043F\u0456\u0441\u043B\u044F \u0434\u0443\u0436\u043E\u043A +desc_double_punct=\u0412\u0436\u0438\u0432\u0430\u043D\u043D\u044F \u043F\u043E\u0434\u0432\u0456\u0439\u043D\u0438\u0445 \u043A\u043E\u043C, \u043A\u0440\u0430\u043F\u043E\u043A... 
+desc_repetition=\u041F\u043E\u0432\u0442\u043E\u0440\u0435\u043D\u043D\u044F \u0441\u043B\u0456\u0432 (\u043D\u0430\u043F\u0440., '\u0431\u0443\u0434\u0435 \u0431\u0443\u0434\u0435') +desc_repetition_short=\u041F\u043E\u0432\u0442\u043E\u0440\u0435\u043D\u043D\u044F \u0441\u043B\u0456\u0432 +desc_unpaired_brackets=\u041D\u0435\u043F\u0430\u0440\u043D\u0456 \u0434\u0443\u0436\u043A\u0438, \u043B\u0430\u043F\u043A\u0438 \u0430\u0431\u043E \u0456\u043D\u0448\u0456 \u0441\u0445\u043E\u0436\u0456 \u0441\u0438\u043C\u0432\u043E\u043B\u0438 +desc_uppercase_sentence=\u041F\u0435\u0440\u0435\u0432\u0456\u0440\u044F\u0454, \u0447\u0438 \u0440\u0435\u0447\u0435\u043D\u043D\u044F \u043F\u043E\u0447\u0438\u043D\u0430\u0454\u0442\u044C\u0441\u044F \u0437 \u0432\u0435\u043B\u0438\u043A\u043E\u0457 \u043B\u0456\u0442\u0435\u0440\u0438 +desc_whitespacerepetition=\u041F\u043E\u0432\u0442\u043E\u0440 \u043F\u0440\u043E\u0431\u0456\u043B\u0443 +double_dots_short=\u041F\u043E\u0434\u0432\u0456\u0439\u043D\u0430 \u043A\u0440\u0430\u043F\u043A\u0430 +double_commas_short=\u041F\u043E\u0434\u0432\u0456\u0439\u043D\u0430 \u043A\u043E\u043C\u0430 +en=\u0410\u043D\u0433\u043B\u0456\u0439\u0441\u044C\u043A\u0430 +enterText=\u0412\u0432\u0435\u0434\u0456\u0442\u044C \u0430\u0431\u043E \u0432\u0441\u0442\u0430\u0432\u0442\u0435 \u0442\u0435\u043A\u0441\u0442 \u0434\u043B\u044F \u043F\u0435\u0440\u0435\u0432\u0456\u0440\u044F\u043D\u043D\u044F \u0432\u0433\u043E\u0440\u0456 +enterText2=\u0412\u0441\u0442\u0430\u0432\u0442\u0435 \u0442\u0435\u043A\u0441\u0442 \u0434\u043B\u044F \u043F\u0435\u0440\u0435\u0432\u0456\u0440\u044F\u043D\u043D\u044F \u0432\u0433\u043E\u0440\u0456 +eo = \u0415\u0441\u043F\u0435\u0440\u0430\u043D\u0442\u043E +errorContext=\u041A\u043E\u043D\u0442\u0435\u043A\u0441\u0442\: +errorMessage=\u041F\u043E\u0432\u0456\u0434\u043E\u043C\u043B\u0435\u043D\u043D\u044F\: +es=\u0406\u0441\u043F\u0430\u043D\u0441\u044C\u043A\u0430 
+false_friend=\u041E\u043C\u043E\u043D\u0456\u043C\u0438 +false_friend_desc=\u043F\u0456\u0434\u043A\u0430\u0437\u043A\u0430 \u043D\u0435\u043F\u0440\u0430\u0432\u0438\u043B\u044C\u043D\u043E\u0457 \u043F\u0430\u0440\u0438 \u0434\u043B\u044F\: +false_friend_hint=\u041F\u0456\u0434\u043A\u0430\u0437\u043A\u0430\: "{0}" ({1}) \u043E\u0437\u043D\u0430\u0447\u0430\u0454 {2} ({3}). +false_friend_suggestion=\u0412\u0438 \u043C\u0430\u043B\u0438 \u043D\u0430 \u0443\u0432\u0430\u0437\u0456 {0}? +fr=\u0424\u0440\u0430\u043D\u0446\u0443\u0437\u044C\u043A\u0430 +gl=Galician +guiCancelButton=\u0421\u043a\u0430\u0441\u0443\u0432\u0430\u0442\u0438 +guiCheckComplete=\u041f\u0435\u0440\u0435\u0432\u0456\u0440\u043a\u0443 \u0432 LanguageTool \u0437\u0430\u043a\u0456\u043d\u0447\u0435\u043d\u043e. +guiConfigWindowTitle=\u041f\u0430\u0440\u0430\u043c\u0435\u0442\u0440\u0438 LanguageTool +guiDemoText=\u0426\u0435 \u043f\u0440\u0438\u043a\u043b\u0430\u0434 \u0432\u0445\u0456\u0434\u043d\u043e\u0433\u043e \u0442\u0435\u043a\u0441\u0442\u0443 \u0434\u043b\u044f \u0434\u0435\u043c\u043e\u043d\u0441\u0442\u0440\u0430\u0446\u0456\u0457 \u0440\u043e\u0431\u043e\u0442\u0438 LanguageTool. \u0417\u0430\u0443\u0432\u0430\u0436\u0442\u0435, \u0449\u043e \u0446\u0435 \u043d\u0435 \u0432\u043a\u043b\u044e\u0447\u0430\u0454 \u043f\u0435\u0440\u0435\u0432\u0456\u0440\u043a\u0443 \u043e\u0440\u0444\u043e\u0433\u0440\u0430\u0444\u0456\u0457. +guiMatchCount=\u041f\u043e\u0442\u0435\u043d\u0446\u0456\u0439\u043d\u0438\u0445 \u043f\u043e\u043c\u0438\u043b\u043e\u043a\: +guiMenuAbout=\u041f\u0440\u043e... 
+guiMenuAddRules=Load Rule File +guiMenuCheckClipboard=\u041f\u0435\u0440\u0435\u0432\u0456\u0440\u0438\u0442\u0438 \u0442\u0435\u043a\u0441\u0442 \u0437 \u043a\u0438\u0448\u0435\u043d\u0456 +guiMenuFile=\u0424\u0430\u0439\u043b +guiMenuHelp=\u0414\u043e\u0432\u0456\u0434\u043a\u0430 +guiMenuHide=\u0421\u0445\u043e\u0432\u0430\u0442\u0438 \u0432 \u0441\u0438\u0441\u0442\u0435\u043c\u043d\u0438\u0439 \u043b\u043e\u0442\u043e\u043a +guiMenuOpen=\u0412\u0456\u0434\u043a\u0440\u0438\u0442\u0438... +guiMenuOptions=\u041f\u0430\u0440\u0430\u043c\u0435\u0442\u0440\u0438... +guiMenuQuit=\u0412\u0438\u0439\u0442\u0438 +guiMenuShowMainWindow=Open main window +guiMotherTongue=\u0412\u0430\u0448\u0430 \u0440\u0456\u0434\u043d\u0430 \u043c\u043e\u0432\u0430\: +guiNoErrorsFound=\u041d\u0435 \u0437\u043d\u0430\u0439\u0434\u0435\u043d\u043e \u0436\u043e\u0434\u043d\u043e\u0457 \u043f\u043e\u043c\u0438\u043b\u043a\u0438 \u0430\u0431\u043e \u043f\u043e\u043f\u0435\u0440\u0435\u0434\u0436\u0435\u043d\u043d\u044f (\u043c\u043e\u0432\u0430\: {0}) +guiNoErrorsFoundSelectedText=\u041d\u0435 \u0437\u043d\u0430\u0439\u0434\u0435\u043d\u043e \u0436\u043e\u0434\u043d\u043e\u0457 \u043f\u043e\u043c\u0438\u043b\u043a\u0438 \u0430\u0431\u043e \u043f\u043e\u043f\u0435\u0440\u0435\u0434\u0436\u0435\u043d\u043d\u044f \u0443 \u0432\u0438\u0431\u0440\u0430\u043d\u043e\u043c\u0443 \u0442\u0435\u043a\u0441\u0442\u0456 (\u043c\u043e\u0432\u0430\: {0}) +guiOKButton=\u0413\u0430\u0440\u0430\u0437\u0434 +guiOOoChangeButton=\u0417\u043c\u0456\u043d\u0438\u0442\u0438 +guiOOoCloseButton=\u0417\u0430\u043a\u0440\u0438\u0442\u0438 +guiOOoIgnoreAllButton=\u041f\u0440\u043e\u043f\u0443\u0441\u0442\u0438\u0442\u0438 \u0432\u0441\u0456 +guiOOoIgnoreButton=\u041f\u0440\u043e\u043f\u0443\u0441\u0442\u0438\u0442\u0438 +guiOOoOptionsButton=\u041f\u0430\u0440\u0430\u043c\u0435\u0442\u0440\u0438... 
+guiProgressWindowTitle=LanguageTool\: \u0442\u0435\u043a\u0441\u0442 \u043f\u0435\u0440\u0435\u0432\u0456\u0440\u044f\u0454\u0442\u044c\u0441\u044f... +guiReplaceWindowTitle=\u0417\u0430\u043c\u0456\u043d\u0438\u0442\u0438 \u0442\u0435\u043a\u0441\u0442 +guiReplaceWithOtherText=<\u0456\u043d\u0448\u0438\u0439 \u0442\u0435\u043a\u0441\u0442> +guiRunOnPort=\u0417\u0430\u043f\u0443\u0441\u0442\u0438\u0442\u0438, \u044f\u043a \u0441\u0435\u0440\u0432\u0435\u0440 \u043d\u0430 \u043f\u043e\u0440\u0442\u0443 +guiSelectionCheckComplete=\u041f\u0435\u0440\u0435\u0432\u0456\u0440\u044f\u043d\u043d\u044f LanguageTool \u0432\u0438\u0431\u0440\u0430\u043d\u043e\u0433\u043e \u0442\u0435\u043a\u0441\u0442\u0443 \u0437\u0430\u0432\u0435\u0440\u0448\u0435\u043d\u043e. +incorrect_case=\u0426\u0435 \u0440\u0435\u0447\u0435\u043d\u043d\u044f \u043d\u0435 \u043f\u043e\u0447\u0438\u043d\u0430\u0454\u0442\u044c\u0441\u044f \u0437 \u0432\u0435\u043b\u0438\u043a\u043e\u0457 \u043b\u0456\u0442\u0435\u0440\u0438 +is=\u0406\u0441\u043b\u0430\u043d\u0434\u0441\u044c\u043a\u0430 +it=\u0406\u0442\u0430\u043b\u0456\u0439\u0441\u044c\u043a\u0430 +lt=\u041b\u0438\u0442\u043e\u0432\u0441\u044c\u043a\u0430 +missing_space_after_comma=\u0412\u0441\u0442\u0430\u0432\u0442\u0435 \u043f\u0440\u043e\u0433\u0430\u043b\u0438\u043d\u0443 \u043f\u0456\u0441\u043b\u044f \u043a\u043e\u043c\u0438 +ml=Malayalam +nl=\u0413\u043e\u043b\u0430\u043d\u0434\u0441\u044c\u043a\u0430 +no_space_after=\u041d\u0435 \u0441\u0442\u0430\u0432\u0442\u0435 \u043f\u0440\u043e\u0433\u0430\u043b\u0438\u043d\u0443 \u043f\u0456\u0441\u043b\u044f \u043b\u0456\u0432\u043e\u0457 \u0434\u0443\u0436\u043a\u0438 +no_space_before=\u041d\u0435 \u0441\u0442\u0430\u0432\u0442\u0435 \u043f\u0440\u043e\u0433\u0430\u043b\u0438\u043d\u0443 \u043f\u0435\u0440\u0435\u0434 \u043f\u0440\u0430\u0432\u043e\u044e \u0434\u0443\u0436\u043a\u043e\u044e +no_space_before_dot=\u041d\u0435 \u0441\u0442\u0430\u0432\u0442\u0435 
\u043f\u0440\u043e\u0433\u0430\u043b\u0438\u043d\u0443 \u043f\u0435\u0440\u0435\u0434 \u043a\u0440\u0430\u043f\u043a\u043e\u044e +pl=\u041f\u043e\u043b\u044c\u0441\u044c\u043a\u0430 +repetition=\u041c\u043e\u0436\u043b\u0438\u0432\u0430 \u043c\u0435\u0445\u0430\u043d\u0456\u0447\u043d\u0430 \u043f\u043e\u043c\u0438\u043b\u043a\u0430\: \u043f\u043e\u0432\u0442\u043e\u0440\u0435\u043d\u043d\u044f \u0441\u043b\u043e\u0432\u0430 +result1=<br><b> {0}. \u0420\u044f\u0434\u043e\u043a {1}, \u0441\u0442\u043e\u0432\u043f\u0447\u0438\u043a {2}</b><br> +resultAreaText=\u0422\u0443\u0442 \u0437'\u044f\u0432\u043b\u044f\u0442\u044c\u0441\u044f \u0440\u0435\u0437\u0443\u043b\u044c\u0442\u0430\u0442\u0438 +resultTime=<br>\u0417\u0430\u0442\u0440\u0430\u0447\u0435\u043d\u043e\: {0}\u043c\u0441 (\u0432\u043a\u043b\u044e\u0447\u043d\u043e \u0437 {1}\u043c\u0441 \u043d\u0430 \u043f\u0435\u0440\u0435\u0432\u0456\u0440\u044f\u043d\u043d\u044f \u043f\u0440\u0430\u0432\u0438\u043b)<br> +ru=\u0420\u043e\u0441\u0456\u0301\u0439\u0441\u044c\u043a\u0430 +sk=\u0421\u043b\u043e\u0432\u0430\u0446\u044c\u043a\u0430 +sl=\u0421\u043b\u043e\u0432\u0435\u043d\u0441\u044c\u043a\u0430 +space_after_comma=\u041f\u043e\u0441\u0442\u0430\u0432\u0442\u0435 \u043f\u0440\u043e\u0433\u0430\u043b\u0438\u043d\u0443 \u043f\u0456\u0441\u043b\u044f \u043a\u043e\u043c\u0438, \u0430 \u043d\u0435 \u043f\u0435\u0440\u0435\u0434 \u043a\u043e\u043c\u043e\u044e +startChecking=\u041f\u043e\u0447\u0430\u0442\u043e\u043a \u043f\u0435\u0440\u0435\u0432\u0456\u0440\u044f\u043d\u043d\u044f \u0432 {0} +sv=\u0428\u0432\u0435\u0434\u0441\u044c\u043a\u0430 +textLanguage=\u041c\u043e\u0432\u0430 \u0442\u0435\u043a\u0441\u0442\u0443\: +two_commas=\u041f\u043e\u0434\u0432\u0456\u0439\u043d\u0430 \u043a\u043e\u043c\u0430 +two_dots=\u041f\u043e\u0434\u0432\u0456\u0439\u043d\u0430 \u043a\u0440\u0430\u043f\u043a\u0430 +uk=\u0423\u043a\u0440\u0430\u0457\u043d\u0441\u044c\u043a\u0430 
+unpaired_brackets=\u041d\u0435\u043f\u0430\u0440\u043d\u0456 \u0434\u0443\u0436\u043a\u0438 \u0430\u0431\u043e \u0456\u043d\u0448\u0456 \u0432\u0438\u043e\u043a\u0440\u0435\u043c\u043b\u044e\u0432\u0430\u043b\u043d\u0456 \u0441\u0438\u043c\u0432\u043e\u043b\u0438 +whitespace_repetition=\u041f\u043e\u0432\u0442\u043e\u0440 \u043f\u0440\u043e\u0431\u0456\u043b\u0443 +ro=Rom\u00e2n\u0103 diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/TextFilter.java b/JLanguageTool/src/java/de/danielnaber/languagetool/TextFilter.java new file mode 100644 index 0000000..a1eaad6 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/TextFilter.java @@ -0,0 +1,30 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2007 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool; + +/** + * Filter, i.e. clean up, text before it is checked. 
+ * + * @author Daniel Naber + */ +public interface TextFilter { + + public String filter(String text); + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/bitext/BitextReader.java b/JLanguageTool/src/java/de/danielnaber/languagetool/bitext/BitextReader.java new file mode 100644 index 0000000..0770dcd --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/bitext/BitextReader.java @@ -0,0 +1,62 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ + +package de.danielnaber.languagetool.bitext; + +/** + * Interface for classes that implement reading from bitext files, + * such as translation memory files, glossary files, aligned text... + * + * @author Marcin Miłkowski + */ + +public interface BitextReader extends Iterable<StringPair> { + + /** + * Get the current line number in the file. + * @return The current line number. + */ + public int getLineCount(); + + /** + * Get the current column number in the file. + * @return The current column number. + */ + public int getColumnCount(); + + /** + * Get the current target column number in the file. + * @return The current target column number. 
+ */ + public int getTargetColumnCount(); + + + /** + * Get the current target sentence position in the file. + * @return The current sentence position. + */ + public int getSentencePosition(); + + /** + * Get the current line of the bitext input. + * @return The complete line (including source, if any). + */ + public String getCurrentLine(); + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/bitext/StringPair.java b/JLanguageTool/src/java/de/danielnaber/languagetool/bitext/StringPair.java new file mode 100644 index 0000000..7677d1d --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/bitext/StringPair.java @@ -0,0 +1,49 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2010 Marcin Miłkowski (www.languagetool.org) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ + +package de.danielnaber.languagetool.bitext; + +/** + * A convenience class to work with bitext strings. 
/**
 * An immutable pair of a source string and its target string, used to hand
 * bitext segments around.
 * <p>
 * Two pairs are equal when both their source and their target strings are
 * equal; {@code null} components are allowed and compare equal to each other.
 *
 * @author Marcin Miłkowski
 */
public class StringPair {

  private final String sourceString;

  private final String targetString;

  /**
   * @param source the source-language text, stored as given
   * @param target the target-language text, stored as given
   */
  public StringPair(final String source, final String target) {
    sourceString = source;
    targetString = target;
  }

  /**
   * @return the source string passed to the constructor
   */
  public String getSource() {
    return sourceString;
  }

  /**
   * @return the target string passed to the constructor
   */
  public String getTarget() {
    return targetString;
  }

  @Override
  public boolean equals(final Object obj) {
    if (this == obj) {
      return true;
    }
    if (obj == null || getClass() != obj.getClass()) {
      return false;
    }
    final StringPair other = (StringPair) obj;
    return sameText(sourceString, other.sourceString)
        && sameText(targetString, other.targetString);
  }

  @Override
  public int hashCode() {
    final int prime = 31;
    int result = 1;
    result = prime * result + (sourceString == null ? 0 : sourceString.hashCode());
    result = prime * result + (targetString == null ? 0 : targetString.hashCode());
    return result;
  }

  /**
   * @return the source and the target string joined by {@code " & "}
   */
  @Override
  public String toString() {
    return sourceString + " & " + targetString;
  }

  /** Null-safe string comparison (java.util.Objects is not available before Java 7). */
  private static boolean sameText(final String first, final String second) {
    return first == null ? second == null : first.equals(second);
  }
}
+ * + * @author Marcin Miłkowski + */ +public class TabBitextReader implements BitextReader { + + protected BufferedReader in; + protected StringPair nextPair; + protected String nextLine; + private String prevLine; + + private int lineCount = -1; + protected int sentencePos; + + public TabBitextReader(final String filename, final String encoding) { + try { + if (encoding == null) { + in = new BufferedReader(new InputStreamReader(new FileInputStream(filename))); + } else { + in = new BufferedReader(new InputStreamReader(new FileInputStream(filename), encoding)); + } + nextLine = in.readLine(); + prevLine = ""; + nextPair = tab2StringPair(nextLine); + } catch(IOException e) { + throw new IllegalArgumentException(e); + } + } + + protected StringPair tab2StringPair(final String line) { + if (line == null) { + return null; + } + final String[] fields = line.split("\t"); + return new StringPair(fields[0], fields[1]); + } + + @Override + public Iterator<StringPair> iterator() { + return new TabReader(); + } + + class TabReader implements Iterator<StringPair> { + + public boolean hasNext() { + return nextLine != null; + } + + public StringPair next() { + try { + final StringPair result = nextPair; + sentencePos = nextPair.getSource().length() + 1; + if (nextLine != null) { + prevLine = nextLine; + nextLine = in.readLine(); + nextPair = tab2StringPair(nextLine); + lineCount++; + if (nextLine == null) { + in.close(); + } + } + return result; + } catch(IOException e) { + throw new IllegalArgumentException(e); + } + } + + // The file is read-only. 
+ public void remove() { + throw new UnsupportedOperationException(); + } + } + + @Override + public int getColumnCount() { + return sentencePos; + } + + @Override + public int getTargetColumnCount() { + return 1; + } + + @Override + public int getLineCount() { + return lineCount; + } + + @Override + public int getSentencePosition() { + return sentencePos; + } + + @Override + public String getCurrentLine() { + return prevLine; + } + + + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/bitext/WordFastTMReader.java b/JLanguageTool/src/java/de/danielnaber/languagetool/bitext/WordFastTMReader.java new file mode 100644 index 0000000..cadad69 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/bitext/WordFastTMReader.java @@ -0,0 +1,87 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2010 Marcin Miłkowski (http://www.languagetool.org) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ + +package de.danielnaber.languagetool.bitext; + +import java.io.IOException; +import java.util.Iterator; + +/** + * Reader of WordFast Translation Memory text files. + * They are simple tab-delimited text files. 
+ * + * @author Marcin Miłkowski + */ +public class WordFastTMReader extends TabBitextReader { + + public WordFastTMReader(final String filename, final String encoding) throws IOException { + super(filename, encoding); + //skip the header (first line) + if (nextLine != null) { + nextLine = in.readLine(); + nextPair = tab2StringPair(nextLine); + } + } + + public final StringPair tab2StringPair(final String line) { + if (line == null) { + return null; + } + final String[] fields = line.split("\t"); + sentencePos = fields[4].length() + 1; + return new StringPair(fields[4], fields[6]); + } + + @Override + public Iterator<StringPair> iterator() { + return new TabReader(); + } + + class TabReader implements Iterator<StringPair> { + + public boolean hasNext() { + return nextLine != null; + } + + public StringPair next() { + try { + final StringPair result = nextPair; + + if (nextLine != null) { + nextLine = in.readLine(); + nextPair = tab2StringPair(nextLine); + if (nextLine == null) { + in.close(); + } + } + return result; + } catch(IOException e) { + throw new IllegalArgumentException(e); + } + } + + // The file is read-only. + public void remove() { + throw new UnsupportedOperationException(); + } + } + +} + + diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/databroker/DefaultResourceDataBroker.java b/JLanguageTool/src/java/de/danielnaber/languagetool/databroker/DefaultResourceDataBroker.java new file mode 100644 index 0000000..d365ea5 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/databroker/DefaultResourceDataBroker.java @@ -0,0 +1,360 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2006 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.databroker; + +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.net.MalformedURLException; +import java.net.URL; +import java.net.URLConnection; +import java.security.Permission; + +import de.danielnaber.languagetool.JLanguageTool; + +/** + * Responsible for getting any items from the grammar checker's resource + * directories. This default data broker assumes that they are accessible + * directly via class-path and the directory names are like specified in: + * + * <ul style="list-type: circle"> + * <li>{@link ResourceDataBroker#RESOURCE_DIR}</li> + * <li>{@link ResourceDataBroker#RULES_DIR}</li> + * </ul> + * <p> + * + * If you'd like to determine another resource directory location this default + * data broker provides proper methods. 
+ * Assuming your {@code /rules} and {@code /resource} directories are accessible + * via class-path with following path information: + * + * <ul style="list-type: circle"> + * <li>{@code /res/grammarchecker/rulesdirname}</li> + * <li>{@code /res/grammarchecker/resourcedirname}</li> + * </ul> + * + * In this case you have to invoke the methods + * {@link ResourceDataBroker#setRulesDir(String)} and + * {@link ResourceDataBroker#setResourceDir(String)} with following arguments: + * + * <ul style="list-type: circle"> + * <li>{@code /res/grammarchecker/rulesdirname}</li> + * <li>{@code /res/grammarchecker/resourcedirname}</li> + * </ul> + * <p> + * + * Make sure that you never obtain any grammar checker resources by calling + * {@code Object.class.getResource(String)} or {@code + * Object.class.getResourceAsStream(String)} directly. If you would like to + * obtain something from these directories do always use + * {@link JLanguageTool#getDataBroker()} which provides proper methods for + * reading the directories above. + * <p> + * + * For example, if you want to get the {@link URL} of {@code + * /rules/de/grammar.xml} just invoke + * {@link ResourceDataBroker#getFromRulesDirAsUrl(String)} and pass {@code + * /de/grammar.xml} as a string. Note: The {@code /rules} directory's name isn't + * passed, because its name might have changed. The same usage does apply for the + * {@code /resource} directory. + * + * @see ResourceDataBroker + * @author PAX + * @since 1.0.1 + */ +public class DefaultResourceDataBroker implements ResourceDataBroker { + + /** + * The directory's name of the grammar checker's resource directory. The + * default value equals {@link ResourceDataBroker#RESOURCE_DIR}. + */ + protected String resourceDir; + + /** + * The directory's name of the grammar checker's rules directory. The + * default value equals {@link ResourceDataBroker#RULES_DIR}. 
+ */ + protected String rulesDir; + + /** + * Instantiates this data broker with the default resource directory names + * as specified in: + * + * <ul> + * <li>{@link ResourceDataBroker#RESOURCE_DIR}</li> + * <li>{@link ResourceDataBroker#RULES_DIR}</li> + * </ul> + */ + public DefaultResourceDataBroker() { + this(ResourceDataBroker.RESOURCE_DIR, ResourceDataBroker.RULES_DIR); + } + + /** + * Instantiates this data broker with the passed resource directory names. + * + * @param resourceDir + * The directory's name of the grammar checker's resource + * directory. The default value equals + * {@link ResourceDataBroker#RESOURCE_DIR}. + * @param rulesDir + * The directory's name of the grammar checker's rules directory. + * The default value equals + * {@link ResourceDataBroker#RULES_DIR}. + */ + public DefaultResourceDataBroker(final String resourceDir, final String rulesDir) { + this.setResourceDir(resourceDir); + this.setRulesDir(rulesDir); + } + + /** + * See: + * {@link ResourceDataBroker#getFromResourceDirAsStream(java.lang.String)} + * + * @param path + * The relative path to the item inside of the {@code /resource} + * directory. Please start your path information with {@code /} + * because it will be concatenated with the directory's name: + * /resource<b>/yourpath</b>. + * @return An {@link InputStream} object to the requested item or {@code + * null} if it wasn't found. + */ + @Override + public InputStream getFromResourceDirAsStream(final String path) { + final String completePath = this.getCompleteResourceUrl(path); + return ResourceDataBroker.class.getResourceAsStream(completePath); + } + + /** + * See: + * {@link ResourceDataBroker#getFromResourceDirAsUrl(java.lang.String)} + * + * @param path + * The relative path to the item inside of the {@code /resource} + * directory. Please start your path information with {@code /} + * because it will be concatenated with the directory's name: + * /resource<b>/yourpath</b>. 
+ * @return An {@link URL} object to the requested item or {@code null} if it + * wasn't found. + */ + @Override + public URL getFromResourceDirAsUrl(final String path) { + final String completePath = this.getCompleteResourceUrl(path); + return getFixedJarURL(ResourceDataBroker.class.getResource(completePath)); + } + + /** + * Concatenates the passed resource path with the currently set {@code + * resource} directory path. + * + * @param path + * The relative path to a resource item inside of the {@code + * resource} directory. + * @return The full relative path to the resource including the path to the + * {@code resource} directory. + */ + private String getCompleteResourceUrl(final String path) { + final StringBuffer completePath = new StringBuffer(this.getResourceDir()); + + if (!this.getResourceDir().endsWith("/") && !(path.charAt(0)=='/')) { + completePath.append('/'); + } + + if (this.getResourceDir().endsWith("/") && (path.charAt(0)=='/') + && path.length() > 1) { + completePath.append(path.substring(1)); + } else { + completePath.append(path); + } + + return completePath.toString(); + } + + /** + * See: + * {@link ResourceDataBroker#getFromRulesDirAsStream(java.lang.String)} + * + * @param path + * The relative path to the item inside of the {@code /rules} + * directory. Please start your path information with {@code /} + * because it will be concatenated with the directory's name: + * /rules<b>/yourpath</b>. + * @return An {@link InputStream} object to the requested item or {@code + * null} if it wasn't found. + */ + @Override + public InputStream getFromRulesDirAsStream(final String path) { + final StringBuffer completePath = this.getCompleteRulesUrl(path); + return ResourceDataBroker.class.getResourceAsStream(completePath.toString()); + } + + /** + * See: {@link ResourceDataBroker#getFromRulesDirAsUrl(java.lang.String)} + * + * @param path + * The relative path to the item inside of the {@code /rules} + * directory. 
Please start your path information with {@code /} + * because it will be concatenated with the directory's name: + * /rules<b>/yourpath</b>. + * @return An {@link URL} object to the requested item or {@code null} if it + * wasn't found. + */ + @Override + public URL getFromRulesDirAsUrl(final String path) { + final StringBuffer completePath = this.getCompleteRulesUrl(path); + return getFixedJarURL(ResourceDataBroker.class.getResource(completePath.toString())); + } + + /** + * Concatenates the passed resource path with the currently set {@code + * rules} directory path. + * + * @param path + * The relative path to a resource item inside of the {@code + * rules} directory. + * @return The full relative path to the resource including the path to the + * {@code rules} directory. + */ + private StringBuffer getCompleteRulesUrl(final String path) { + final StringBuffer completePath = new StringBuffer(this.getRulesDir()); + + if (!this.getRulesDir().endsWith("/") && !(path.charAt(0)=='/')) { + completePath.append('/'); + } + + if (this.getRulesDir().endsWith("/") && (path.charAt(0)=='/') && path.length() > 1) { + completePath.append(path.substring(1)); + } else { + completePath.append(path); + } + + return completePath; + } + + /** + * @return The directory's name of the grammar checker's resource directory. + * The default value equals + * {@link ResourceDataBroker#RESOURCE_DIR}. + */ + @Override + public String getResourceDir() { + return this.resourceDir; + } + + /** + * @param resourceDir + * The directory's name of the grammar checker's resource + * directory. The default value was + * {@link ResourceDataBroker#RESOURCE_DIR}. Please let this + * string start with {@code '/'} and use this character as path + * separator. Don't set this character to the string's end. Valid + * example value: {@code /subdir/furtherdir/resourcedir}. + */ + @Override + public void setResourceDir(final String resourceDir) { + this.resourceDir = (resourceDir == null) ? 
"" : resourceDir; + } + + /** + * @return The directory's name of the grammar checker's rules directory. + * The default value equals {@link ResourceDataBroker#RULES_DIR}. + */ + @Override + public String getRulesDir() { + return this.rulesDir; + } + + /** + * @param rulesDir + * The directory's name of the grammar checker's rules directory. + * The default value was {@link ResourceDataBroker#RULES_DIR}. + * Please let this string start with {@code '/'} and use this + * character as path separator. Don't set this character to the + * string's end. Valid example value: {@code + * /subdir/furtherdir/rulesdir}. + */ + @Override + public void setRulesDir(final String rulesDir) { + this.rulesDir = (rulesDir == null) ? "" : rulesDir; + } + + /** + * Fixes the getResource bug if you want to obtain any resource from a JAR file under Java + * 1.5.0_16 Webstart. (Workaround by {@code mevanclark} from http://forums.sun.com) + * + * @param url The {@link URL} to be fixed. + * @return The fixed version if necessary. 
+ */ + private static URL getFixedJarURL(URL url) { + if (url == null) { + return url; + } + + final String originalURLProtocol = url.getProtocol(); + if (!"jar".equalsIgnoreCase(originalURLProtocol)) { + return url; + } + + final String originalURLString = url.toString(); + final int bangSlashIndex = originalURLString.indexOf("!/"); + if (bangSlashIndex > -1) { + return url; + } + + final String originalURLPath = url.getPath(); + final URLConnection urlConnection; + try { + urlConnection = url.openConnection(); + if (urlConnection == null) { + throw new IOException("urlConnection is null"); + } + } catch (IOException e) { + return url; + } + + final Permission urlConnectionPermission; + try { + urlConnectionPermission = urlConnection.getPermission(); + if (urlConnectionPermission == null) { + throw new IOException("urlConnectionPermission is null"); + } + } catch (IOException e) { + return url; + } + + final String urlConnectionPermissionName = urlConnectionPermission.getName(); + if (urlConnectionPermissionName == null) { + return url; + } + + final File file = new File(urlConnectionPermissionName); + if (!file.exists()) { + return url; + } + + try { + final String newURLStr = "jar:" + file.toURI().toURL().toExternalForm() + "!/" + originalURLPath; + url = new URL(newURLStr); + } catch (MalformedURLException e) { + return url; + } + + return url; + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/databroker/ResourceDataBroker.java b/JLanguageTool/src/java/de/danielnaber/languagetool/databroker/ResourceDataBroker.java new file mode 100644 index 0000000..eac263b --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/databroker/ResourceDataBroker.java @@ -0,0 +1,139 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2006 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as 
published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.databroker; + +import java.io.InputStream; +import java.net.URL; + +import de.danielnaber.languagetool.JLanguageTool; + +/** + * Is responsible for getting the necessary resources for the grammar checker + * library. Following directories are currently needed by a couple of classes: + * + * <ul style="list-type: circle"> + * <li>{@code /resource}</li> + * <li>{@code /rules}</li> + * </ul> + * + * This interface determines methods to obtain any contents from these + * directories. + * <p> + * + * Make sure that you never obtain any grammar checker resources by calling + * {@code Object.class.getResource(String)} or {@code + * Object.class.getResourceAsStream(String)} directly. If you would like to + * obtain something from these directories do always use + * {@link JLanguageTool#getDataBroker()} which provides proper methods for + * reading the directories above. + * <p> + * + * For example, if you want to get the {@link URL} of {@code + * /rules/de/grammar.xml} just invoke + * {@link ResourceDataBroker#getFromRulesDirAsUrl(String)} and pass {@code + * /de/grammar.xml} as a string. Note: The {@code /rules} directory's name isn't + * passed, because its name might have changed. The same usage does apply for the + * {@code /resource} directory. 
/**
 * Gives access to the resources the grammar checker library needs. Two
 * directories are currently used by a couple of classes: {@code /resource}
 * and {@code /rules}.
 * <p>
 * Never obtain grammar checker resources by calling {@code
 * Object.class.getResource(String)} or {@code
 * Object.class.getResourceAsStream(String)} directly; always go through
 * {@link JLanguageTool#getDataBroker()}.
 * <p>
 * For example, to get the {@link URL} of {@code /rules/de/grammar.xml} invoke
 * {@link ResourceDataBroker#getFromRulesDirAsUrl(String)} with {@code
 * /de/grammar.xml}. The {@code /rules} directory's name isn't passed, because
 * its name might have changed. The same applies to the {@code /resource}
 * directory.
 *
 * @author PAX
 * @since 1.0.1
 */
public interface ResourceDataBroker {

  /**
   * The directory name of the {@code /resource} directory.
   */
  String RESOURCE_DIR = "/resource";

  /**
   * The directory name of the {@code /rules} directory.
   */
  String RULES_DIR = "/rules";

  /**
   * Looks up an item in the grammar checker's {@code /resource} directory.
   *
   * @param path path to an item from the {@code /resource} directory
   * @return an {@link URL} to the requested item or {@code null} if it
   *         wasn't found
   */
  URL getFromResourceDirAsUrl(String path);

  /**
   * Looks up an item in the grammar checker's {@code /resource} directory.
   *
   * @param path path to an item from the {@code /resource} directory
   * @return an {@link InputStream} to the requested item or {@code null} if
   *         it wasn't found
   */
  InputStream getFromResourceDirAsStream(String path);

  /**
   * Looks up an item in the grammar checker's {@code /rules} directory.
   *
   * @param path path to an item from the {@code /rules} directory
   * @return an {@link URL} to the requested item or {@code null} if it
   *         wasn't found
   */
  URL getFromRulesDirAsUrl(String path);

  /**
   * Looks up an item in the grammar checker's {@code /rules} directory.
   *
   * @param path path to an item from the {@code /rules} directory
   * @return an {@link InputStream} to the requested item or {@code null} if
   *         it wasn't found
   */
  InputStream getFromRulesDirAsStream(String path);

  /**
   * @return the currently set resource directory path; the expected format
   *         is {@code /subdir/furtherdir/resourcedir}
   */
  String getResourceDir();

  /**
   * @param resourceDir the directory path to use as the {@code /resource}
   *          directory; the expected format is
   *          {@code /subdir/furtherdir/resourcedir}
   */
  void setResourceDir(String resourceDir);

  /**
   * @return the currently set rules directory path; the expected format is
   *         {@code /subdir/furtherdir/rulesdir}
   */
  String getRulesDir();

  /**
   * @param rulesDir the directory path to use as the {@code /rules}
   *          directory; the expected format is
   *          {@code /subdir/furtherdir/rulesdir}
   */
  void setRulesDir(String rulesDir);
}
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.gui; + +import java.util.ResourceBundle; + +import javax.swing.JOptionPane; + +import de.danielnaber.languagetool.JLanguageTool; +import de.danielnaber.languagetool.Language; +import de.danielnaber.languagetool.tools.StringTools; + +/** + * A dialog with version and copyright information. + * + * @author Daniel Naber + */ +public class AboutDialog { + + protected final ResourceBundle messages; + + public AboutDialog(final ResourceBundle messages) { + this.messages = messages; + } + + public void show() { + final String aboutText = + StringTools.getLabel(messages.getString("guiMenuAbout")); + JOptionPane.showMessageDialog(null, getAboutText(), + aboutText, JOptionPane.INFORMATION_MESSAGE); + } + + protected String getAboutText() { + return "LanguageTool " + JLanguageTool.VERSION + "\n" + + "Copyright (C) 2005-2010 Daniel Naber\n" + + "This software is licensed under the GNU Lesser General Public License.\n" + + "LanguageTool Homepage: http://www.languagetool.org\n\n" + + "Maintainers of the language modules:\n\n" + + Language.getAllMaintainers(messages); + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/gui/Configuration.java b/JLanguageTool/src/java/de/danielnaber/languagetool/gui/Configuration.java new file mode 100644 index 0000000..932e1fe --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/gui/Configuration.java @@ -0,0 +1,233 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of 
the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.gui; + +import java.io.File; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.FileOutputStream; +import java.io.IOException; +import java.util.*; + +import de.danielnaber.languagetool.Language; +import de.danielnaber.languagetool.server.HTTPServer; + +/** + * Configuration -- currently this is just a list of disabled rule IDs. + * Configuration is loaded from and stored to a properties file. 
+ * + * @author Daniel Naber + */ +public class Configuration { + + private static final String CONFIG_FILE = "languagetool.properties"; + private static final String DISABLED_RULES_CONFIG_KEY = "disabledRules"; + private static final String ENABLED_RULES_CONFIG_KEY = "enabledRules"; + private static final String DISABLED_CATEGORIES_CONFIG_KEY = "disabledCategories"; + private static final String MOTHER_TONGUE_CONFIG_KEY = "motherTongue"; + private static final String SERVER_RUN_CONFIG_KEY = "serverMode"; + private static final String SERVER_PORT_CONFIG_KEY = "serverPort"; + + private File configFile; + + private Set<String> disabledRuleIds = new HashSet<String>(); + private Set<String> enabledRuleIds = new HashSet<String>(); + private Set<String> disabledCategoryNames = new HashSet<String>(); + private Language motherTongue; + private boolean runServer; + private int serverPort = HTTPServer.DEFAULT_PORT; + + public Configuration(final File baseDir, final String filename) + throws IOException { + if (!baseDir.isDirectory()) { + throw new IllegalArgumentException("Not a directory: " + baseDir); + } + configFile = new File(baseDir, filename); + loadConfiguration(); + } + + public Configuration(final File baseDir) throws IOException { + this(baseDir, CONFIG_FILE); + } + + public Set<String> getDisabledRuleIds() { + return disabledRuleIds; + } + + public Set<String> getEnabledRuleIds() { + return enabledRuleIds; + } + + public Set<String> getDisabledCategoryNames() { + return disabledCategoryNames; + } + + public void setDisabledRuleIds(final Set<String> ruleIDs) { + disabledRuleIds = ruleIDs; + } + + public void setEnabledRuleIds(final Set<String> ruleIDs) { + enabledRuleIds = ruleIDs; + } + + public void setDisabledCategoryNames(final Set<String> categoryNames) { + disabledCategoryNames = categoryNames; + } + + public Language getMotherTongue() { + return motherTongue; + } + + public void setMotherTongue(final Language motherTongue) { + this.motherTongue = 
motherTongue; + } + + public boolean getRunServer() { + return runServer; + } + + public void setRunServer(final boolean runServer) { + this.runServer = runServer; + } + + public int getServerPort() { + return serverPort; + } + + public void setServerPort(final int serverPort) { + this.serverPort = serverPort; + } + + private void loadConfiguration() throws IOException { + + // FIXME: disabling a rule X in language Y should not disable it in all + // languages - need to add a language parameter + + FileInputStream fis = null; + try { + fis = new FileInputStream(configFile); + final Properties props = new Properties(); + props.load(fis); + final String val = (String) props.get(DISABLED_RULES_CONFIG_KEY); + if (val != null) { + final String[] ids = val.split(","); + disabledRuleIds.addAll(Arrays.asList(ids)); + } + + final String enRul = (String) props.get(ENABLED_RULES_CONFIG_KEY); + if (enRul != null) { + final String[] ids = enRul.split(","); + enabledRuleIds.addAll(Arrays.asList(ids)); + } + + final String cat = (String) props.get(DISABLED_CATEGORIES_CONFIG_KEY); + if (cat != null) { + final String[] names = cat.split(","); + disabledCategoryNames.addAll(Arrays.asList(names)); + } + + final String motherTongueStr = (String) props + .get(MOTHER_TONGUE_CONFIG_KEY); + if (motherTongueStr != null) { + motherTongue = Language.getLanguageForShortName(motherTongueStr); + } + final String runServerString = (String) props.get(SERVER_RUN_CONFIG_KEY); + if (runServerString != null) { + runServer = runServerString.equals("true"); + } + final String serverPortString = (String) props + .get(SERVER_PORT_CONFIG_KEY); + if (serverPortString != null) { + serverPort = Integer.parseInt(serverPortString); + } + } catch (final FileNotFoundException e) { + // file not found: okay, leave disabledRuleIds empty + } finally { + if (fis != null) { + fis.close(); + } + } + } + + public void saveConfiguration() throws IOException { + final Properties props = new Properties(); + + if 
(disabledRuleIds == null) { + props.setProperty(DISABLED_RULES_CONFIG_KEY, ""); + } else { + final StringBuilder sb = new StringBuilder(); + for (final Iterator<String> iter = disabledRuleIds.iterator(); iter + .hasNext();) { + final String id = iter.next(); + sb.append(id); + if (iter.hasNext()) { + sb.append(','); + } + } + props.setProperty(DISABLED_RULES_CONFIG_KEY, sb.toString()); + } + + if (enabledRuleIds == null) { + props.setProperty(ENABLED_RULES_CONFIG_KEY, ""); + } else { + final StringBuilder sb = new StringBuilder(); + for (final Iterator<String> iter = enabledRuleIds.iterator(); iter.hasNext();) { + final String id = iter.next(); + sb.append(id); + if (iter.hasNext()) { + sb.append(','); + } + } + props.setProperty(ENABLED_RULES_CONFIG_KEY, sb.toString()); + } + + if (disabledCategoryNames == null) { + props.setProperty(DISABLED_CATEGORIES_CONFIG_KEY, ""); + } else { + final StringBuilder sb = new StringBuilder(); + for (final Iterator<String> iter = disabledCategoryNames.iterator(); iter + .hasNext();) { + final String name = iter.next(); + sb.append(name); + if (iter.hasNext()) { + sb.append(','); + } + } + props.setProperty(DISABLED_CATEGORIES_CONFIG_KEY, sb.toString()); + } + + if (motherTongue != null) { + props.setProperty(MOTHER_TONGUE_CONFIG_KEY, motherTongue.getShortName()); + } + props.setProperty(SERVER_RUN_CONFIG_KEY, Boolean.valueOf(runServer) + .toString()); + props.setProperty(SERVER_PORT_CONFIG_KEY, Integer.valueOf(serverPort) + .toString()); + FileOutputStream fos = null; + try { + fos = new FileOutputStream(configFile); + props.store(fos, "LanguageTool configuration"); + } finally { + if (fos != null) { + fos.close(); + } + } + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/gui/ConfigurationDialog.java b/JLanguageTool/src/java/de/danielnaber/languagetool/gui/ConfigurationDialog.java new file mode 100644 index 0000000..d78ea08 --- /dev/null +++ 
b/JLanguageTool/src/java/de/danielnaber/languagetool/gui/ConfigurationDialog.java @@ -0,0 +1,497 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.gui; + +import java.awt.Container; +import java.awt.Dimension; +import java.awt.Frame; +import java.awt.GridBagConstraints; +import java.awt.GridBagLayout; +import java.awt.Insets; +import java.awt.Toolkit; +import java.awt.event.ActionEvent; +import java.awt.event.ActionListener; +import java.awt.event.KeyEvent; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.HashSet; +import java.util.List; +import java.util.ResourceBundle; +import java.util.Set; + +import javax.swing.JButton; +import javax.swing.JCheckBox; +import javax.swing.JComboBox; +import javax.swing.JComponent; +import javax.swing.JDialog; +import javax.swing.JLabel; +import javax.swing.JPanel; +import javax.swing.JRootPane; +import javax.swing.JScrollPane; +import javax.swing.JTextField; +import javax.swing.KeyStroke; + +import de.danielnaber.languagetool.JLanguageTool; +import de.danielnaber.languagetool.Language; +import 
de.danielnaber.languagetool.rules.Rule; +import de.danielnaber.languagetool.server.HTTPServer; +import de.danielnaber.languagetool.tools.StringTools; + +/** + * Dialog that offers the available rules so they can be turned on/off + * individually. + * + * @author Daniel Naber + */ +public class ConfigurationDialog implements ActionListener { + + private static final String NO_MOTHER_TONGUE = "---"; + + private JButton okButton; + private JButton cancelButton; + + private final ResourceBundle messages; + private JDialog dialog; + + private JComboBox motherTongueBox; + + private JCheckBox serverCheckbox; + private JTextField serverPortField; + + private final List<JCheckBox> checkBoxes = new ArrayList<JCheckBox>(); + private final List<String> checkBoxesRuleIds = new ArrayList<String>(); + private final List<String> checkBoxesCategories = new ArrayList<String>(); + + private final List<String> defaultOffRules = new ArrayList<String>(); + + private Set<String> inactiveRuleIds = new HashSet<String>(); + private Set<String> enabledRuleIds = new HashSet<String>(); + private Set<String> inactiveCategoryNames = new HashSet<String>(); + private final List<JCheckBox> categoryCheckBoxes = new ArrayList<JCheckBox>(); + private final List<String> checkBoxesCategoryNames = new ArrayList<String>(); + private Language motherTongue; + private boolean serverMode; + private int serverPort; + + private final Frame owner; + private final boolean insideOOo; + + public ConfigurationDialog(Frame owner, boolean insideOOo) { + this.owner = owner; + this.insideOOo = insideOOo; + messages = JLanguageTool.getMessageBundle(); + } + + public void show(List<Rule> rules) { + dialog = new JDialog(owner, true); + dialog.setTitle(messages.getString("guiConfigWindowTitle")); + checkBoxes.clear(); + checkBoxesRuleIds.clear(); + categoryCheckBoxes.clear(); + checkBoxesCategoryNames.clear(); + + Collections.sort(rules, new CategoryComparator()); + + // close dialog when user presses Escape key: + final 
KeyStroke stroke = KeyStroke.getKeyStroke(KeyEvent.VK_ESCAPE, 0); + final ActionListener actionListener = new ActionListener() { + public void actionPerformed(@SuppressWarnings("unused") ActionEvent actionEvent) { + dialog.setVisible(false); + } + }; + final JRootPane rootPane = dialog.getRootPane(); + rootPane.registerKeyboardAction(actionListener, stroke, + JComponent.WHEN_IN_FOCUSED_WINDOW); + + // JPanel + final JPanel checkBoxPanel = new JPanel(); + checkBoxPanel.setLayout(new GridBagLayout()); + GridBagConstraints cons = new GridBagConstraints(); + cons.anchor = GridBagConstraints.NORTHWEST; + cons.gridx = 0; + int row = 0; + String prevID = null; + String prevCategory = null; + for (final Rule rule : rules) { + // avoid displaying rules from rule groups more than once: + if (prevID == null || !rule.getId().equals(prevID)) { + cons.gridy = row; + final JCheckBox checkBox = new JCheckBox(rule.getDescription()); + if (inactiveRuleIds != null + && (inactiveRuleIds.contains(rule.getId()) || inactiveCategoryNames + .contains(rule.getCategory().getName()))) { + checkBox.setSelected(false); + } else { + checkBox.setSelected(true); + } + + if (rule.isDefaultOff() && !enabledRuleIds.contains(rule.getId())) { + checkBox.setSelected(false); + } + + if (rule.isDefaultOff()) { + defaultOffRules.add(rule.getId()); + if (rule.getCategory().isDefaultOff()) { + inactiveCategoryNames.add(rule.getCategory().getName()); + } + } else { + if (rule.getCategory().isDefaultOff()) { + inactiveCategoryNames.remove(rule.getCategory().getName()); + } + } + + final ActionListener ruleCheckBoxListener = new ActionListener() { + public void actionPerformed(final ActionEvent actionEvent) { + final JCheckBox cBox = (JCheckBox) actionEvent.getSource(); + final boolean selected = cBox.getModel().isSelected(); + int i = 0; + for (final JCheckBox chBox : checkBoxes) { + if (chBox.equals(cBox)) { + final int catNo = checkBoxesCategoryNames + .indexOf(checkBoxesCategories.get(i)); + if (selected && 
!categoryCheckBoxes.get(catNo).isSelected()) { + categoryCheckBoxes.get(catNo).setSelected(true); + } + } + i++; + } + } + }; + checkBox.addActionListener(ruleCheckBoxListener); + checkBoxes.add(checkBox); + checkBoxesRuleIds.add(rule.getId()); + checkBoxesCategories.add(rule.getCategory().getName()); + final boolean showHeadline = rule.getCategory() != null + && !rule.getCategory().getName().equals(prevCategory); + if ((showHeadline || prevCategory == null) + && rule.getCategory() != null) { + + // TODO: maybe use a Tree of Checkboxes here, like in: + // http://www.javaworld.com/javaworld/jw-09-2007/jw-09-checkboxtree.html + final JCheckBox categoryCheckBox = new JCheckBox(rule.getCategory() + .getName()); + if (inactiveCategoryNames != null + && inactiveCategoryNames.contains(rule.getCategory().getName())) { + categoryCheckBox.setSelected(false); + } else { + categoryCheckBox.setSelected(true); + } + + final ActionListener categoryCheckBoxListener = new ActionListener() { + public void actionPerformed(final ActionEvent actionEvent) { + final JCheckBox cBox = (JCheckBox) actionEvent.getSource(); + final boolean selected = cBox.getModel().isSelected(); + int i = 0; + for (final JCheckBox ruleBox : checkBoxes) { + if (ruleBox.isSelected() != selected) { + if (checkBoxesCategories.get(i).equals(cBox.getText())) { + ruleBox.setSelected(selected); + } + } + i++; + } + } + }; + + categoryCheckBox.addActionListener(categoryCheckBoxListener); + categoryCheckBoxes.add(categoryCheckBox); + checkBoxesCategoryNames.add(rule.getCategory().getName()); + checkBoxPanel.add(categoryCheckBox, cons); + prevCategory = rule.getCategory().getName(); + cons.gridy++; + row++; + } + checkBox.setMargin(new Insets(0, 20, 0, 0)); // indent + checkBoxPanel.add(checkBox, cons); + row++; + } + prevID = rule.getId(); + } + + final JPanel motherTonguePanel = new JPanel(); + motherTonguePanel.add(new JLabel(messages.getString("guiMotherTongue")), + cons); + motherTongueBox = new 
JComboBox(getPossibleMotherTongues()); + if (motherTongue != null) { + if (motherTongue == Language.DEMO) { + motherTongueBox.setSelectedItem(NO_MOTHER_TONGUE); + } else { + motherTongueBox.setSelectedItem(messages.getString(motherTongue + .getShortName())); + } + } + motherTonguePanel.add(motherTongueBox, cons); + + final JPanel portPanel = new JPanel(); + portPanel.setLayout(new GridBagLayout()); + // TODO: why is this now left-aligned?!?! + cons = new GridBagConstraints(); + cons.insets = new Insets(0, 4, 0, 0); + cons.gridx = 0; + cons.gridy = 0; + cons.anchor = GridBagConstraints.WEST; + cons.fill = GridBagConstraints.NONE; + cons.weightx = 0.0f; + if (!insideOOo) { + serverCheckbox = new JCheckBox(StringTools.getLabel(messages + .getString("guiRunOnPort"))); + serverCheckbox.setMnemonic(StringTools.getMnemonic(messages + .getString("guiRunOnPort"))); + serverCheckbox.setSelected(serverMode); + portPanel.add(serverCheckbox, cons); + serverPortField = new JTextField(Integer.toString(serverPort)); + serverPortField.setEnabled(serverCheckbox.isSelected()); + // TODO: without this the box is just a few pixels small, but why??: + serverPortField.setMinimumSize(new Dimension(100, 25)); + cons.gridx = 1; + serverCheckbox.addActionListener(new ActionListener() { + public void actionPerformed(@SuppressWarnings("unused") ActionEvent e) { + serverPortField.setEnabled(serverCheckbox.isSelected()); + } + }); + portPanel.add(serverPortField, cons); + } + + final JPanel buttonPanel = new JPanel(); + buttonPanel.setLayout(new GridBagLayout()); + okButton = new JButton(StringTools.getLabel(messages + .getString("guiOKButton"))); + okButton.setMnemonic(StringTools.getMnemonic(messages + .getString("guiOKButton"))); + okButton.addActionListener(this); + cancelButton = new JButton(StringTools.getLabel(messages + .getString("guiCancelButton"))); + cancelButton.setMnemonic(StringTools.getMnemonic(messages + .getString("guiCancelButton"))); + cancelButton.addActionListener(this); + 
cons = new GridBagConstraints(); + cons.insets = new Insets(0, 4, 0, 0); + buttonPanel.add(okButton, cons); + buttonPanel.add(cancelButton, cons); + + final Container contentPane = dialog.getContentPane(); + contentPane.setLayout(new GridBagLayout()); + cons = new GridBagConstraints(); + cons.insets = new Insets(4, 4, 4, 4); + cons.gridx = 0; + cons.gridy = 0; + cons.weightx = 10.0f; + cons.weighty = 10.0f; + cons.fill = GridBagConstraints.BOTH; + contentPane.add(new JScrollPane(checkBoxPanel), cons); + + cons.gridx = 0; + cons.gridy = 1; + cons.weightx = 0.0f; + cons.weighty = 0.0f; + cons.fill = GridBagConstraints.NONE; + cons.anchor = GridBagConstraints.WEST; + contentPane.add(motherTonguePanel, cons); + + cons.gridx = 0; + cons.gridy = 2; + cons.weightx = 0.0f; + cons.weighty = 0.0f; + cons.fill = GridBagConstraints.NONE; + cons.anchor = GridBagConstraints.WEST; + contentPane.add(portPanel, cons); + + cons.gridx = 0; + cons.gridy = 3; + cons.weightx = 0.0f; + cons.weighty = 0.0f; + cons.fill = GridBagConstraints.NONE; + cons.anchor = GridBagConstraints.EAST; + contentPane.add(buttonPanel, cons); + + dialog.pack(); + dialog.setSize(500, 500); + // center on screen: + final Dimension screenSize = Toolkit.getDefaultToolkit().getScreenSize(); + final Dimension frameSize = dialog.getSize(); + dialog.setLocation(screenSize.width / 2 - frameSize.width / 2, + screenSize.height / 2 - frameSize.height / 2); + dialog.setVisible(true); + } + + private Object[] getPossibleMotherTongues() { + final List<Object> motherTongues = new ArrayList<Object>(); + motherTongues.add(NO_MOTHER_TONGUE); + for (final Language lang : Language.LANGUAGES) { + if (lang != Language.DEMO) { + motherTongues.add(messages.getString(lang.getShortName())); + } + } + return motherTongues.toArray(); + } + + public void actionPerformed(ActionEvent e) { + if (e.getSource() == okButton) { + int i = 0; + inactiveCategoryNames.clear(); + for (final JCheckBox checkBox : categoryCheckBoxes) { + if 
(!checkBox.isSelected()) { + final String categoryName = checkBoxesCategoryNames.get(i); + inactiveCategoryNames.add(categoryName); + } + i++; + } + i = 0; + inactiveRuleIds.clear(); + enabledRuleIds.clear(); + for (final JCheckBox checkBox : checkBoxes) { + if (!checkBox.isSelected()) { + final String ruleId = checkBoxesRuleIds.get(i); + if (!defaultOffRules.contains(ruleId)) { + inactiveRuleIds.add(ruleId); + } + } + + if (checkBox.isSelected()) { + final String ruleId = checkBoxesRuleIds.get(i); + if (defaultOffRules.contains(ruleId)) { + enabledRuleIds.add(ruleId); + } + } + + i++; + } + + if (motherTongueBox.getSelectedItem() instanceof String) { + motherTongue = getLanguageForLocalizedName(motherTongueBox + .getSelectedItem().toString()); + } else { + motherTongue = (Language) motherTongueBox.getSelectedItem(); + } + if (serverCheckbox != null) { + serverMode = serverCheckbox.isSelected(); + serverPort = Integer.parseInt(serverPortField.getText()); + } + dialog.setVisible(false); + } else if (e.getSource() == cancelButton) { + dialog.setVisible(false); + } + } + + public void setDisabledRules(Set<String> ruleIDs) { + inactiveRuleIds = ruleIDs; + } + + public Set<String> getDisabledRuleIds() { + return inactiveRuleIds; + } + + public void setEnabledRules(Set<String> ruleIDs) { + enabledRuleIds = ruleIDs; + } + + public Set<String> getEnabledRuleIds() { + return enabledRuleIds; + } + + public void setDisabledCategories(Set<String> categoryNames) { + inactiveCategoryNames = categoryNames; + } + + public Set<String> getDisabledCategoryNames() { + return inactiveCategoryNames; + } + + public void setMotherTongue(Language motherTongue) { + this.motherTongue = motherTongue; + } + + public Language getMotherTongue() { + return motherTongue; + } + + /** + * Get the Language object for the given localized language name. + * + * @param languageName + * e.g. 
<code>English</code> or <code>German</code> (case is + * significant) + * @return a Language object or <code>null</code> + */ + private Language getLanguageForLocalizedName(final String languageName) { + for (final Language element : Language.LANGUAGES) { + if (NO_MOTHER_TONGUE.equals(languageName)) { + return Language.DEMO; + } + if (languageName.equals(messages.getString(element.getShortName()))) { + return element; + } + } + return null; + } + + public void setRunServer(boolean serverMode) { + this.serverMode = serverMode; + } + + public boolean getRunServer() { + if (serverCheckbox == null) { + return false; + } + return serverCheckbox.isSelected(); + } + + public void setServerPort(int serverPort) { + this.serverPort = serverPort; + } + + public int getServerPort() { + if (serverPortField == null) { + return HTTPServer.DEFAULT_PORT; + } + return Integer.parseInt(serverPortField.getText()); + } + + /** + * Opens the dialog - for internal testing only. + */ + public static void main(String[] args) throws IOException { + final ConfigurationDialog dlg = new ConfigurationDialog(null, false); + final List<Rule> rules = new ArrayList<Rule>(); + final JLanguageTool lt = new JLanguageTool(Language.ENGLISH); + lt.activateDefaultPatternRules(); + rules.addAll(lt.getAllRules()); + dlg.show(rules); + } + +} + +class CategoryComparator implements Comparator<Rule> { + + public int compare(final Rule r1, final Rule r2) { + final boolean hasCat = r1.getCategory() != null && r2.getCategory() != null; + if (hasCat) { + final int res = r1.getCategory().getName().compareTo( + r2.getCategory().getName()); + if (res == 0) { + return r1.getDescription().compareToIgnoreCase(r2.getDescription()); + } + return res; + } + return r1.getDescription().compareToIgnoreCase(r2.getDescription()); + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/gui/LanguageManagerDialog.java b/JLanguageTool/src/java/de/danielnaber/languagetool/gui/LanguageManagerDialog.java new file mode 
100644 index 0000000..18c5b26 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/gui/LanguageManagerDialog.java @@ -0,0 +1,184 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2007 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.gui; + +import java.awt.Container; +import java.awt.Dimension; +import java.awt.Frame; +import java.awt.GridBagConstraints; +import java.awt.GridBagLayout; +import java.awt.Insets; +import java.awt.Toolkit; +import java.awt.event.ActionEvent; +import java.awt.event.ActionListener; +import java.awt.event.KeyEvent; +import java.io.File; +import java.util.ArrayList; +import java.util.List; + +import javax.swing.JButton; +import javax.swing.JComponent; +import javax.swing.JDialog; +import javax.swing.JList; +import javax.swing.JPanel; +import javax.swing.JRootPane; +import javax.swing.JScrollPane; +import javax.swing.KeyStroke; +import javax.swing.filechooser.FileFilter; + +import de.danielnaber.languagetool.Language; +import de.danielnaber.languagetool.language.LanguageBuilder; + +/** + * Dialog for managing externally loaded rules. 
+ * + * @author Daniel Naber + */ +public class LanguageManagerDialog implements ActionListener { + + private JDialog dialog; + + private JList list; + private JButton addButton; + private JButton removeButton; + private JButton closeButton; + private final List<File> ruleFiles = new ArrayList<File>(); + + private final Frame owner; + //private ResourceBundle messages = null; + + public LanguageManagerDialog(Frame owner, List<Language> languages) { + this.owner = owner; + for (Language lang : languages) { + ruleFiles.add(new File(lang.getRuleFileName())); + } + //messages = JLanguageTool.getMessageBundle(); + } + + public void show() { + dialog = new JDialog(owner, true); + dialog.setTitle("Language Module Manager"); // FIXME: i18n + + // close dialog when user presses Escape key: + // TODO: taken from ConfigurationDialog, avoid duplication: + final KeyStroke stroke = KeyStroke.getKeyStroke(KeyEvent.VK_ESCAPE, 0); + final ActionListener actionListener = new ActionListener() { + @SuppressWarnings("unused") + public void actionPerformed(ActionEvent actionEvent) { + dialog.setVisible(false); + } + }; + final JRootPane rootPane = dialog.getRootPane(); + rootPane.registerKeyboardAction(actionListener, stroke, JComponent.WHEN_IN_FOCUSED_WINDOW); + + final Container contentPane = dialog.getContentPane(); + contentPane.setLayout(new GridBagLayout()); + + list = new JList(ruleFiles.toArray(new File[]{})); + GridBagConstraints cons = new GridBagConstraints(); + cons.insets = new Insets(4, 4, 4, 4); + cons.gridx = 0; + cons.gridy = 0; + cons.fill = GridBagConstraints.BOTH; + cons.weightx = 2.0f; + cons.weighty = 2.0f; + contentPane.add(new JScrollPane(list), cons); + + cons = new GridBagConstraints(); + cons.insets = new Insets(4, 4, 4, 4); + cons.fill = GridBagConstraints.HORIZONTAL; + + final JPanel buttonPanel = new JPanel(); + buttonPanel.setLayout(new GridBagLayout()); + addButton = new JButton("Add..."); // FIXME: i18n + addButton.addActionListener(this); + cons.gridx = 
1; + cons.gridy = 0; + buttonPanel.add(addButton, cons); + + removeButton = new JButton("Remove"); // FIXME: i18n + removeButton.addActionListener(this); + cons.gridx = 1; + cons.gridy = 1; + buttonPanel.add(removeButton, cons); + + closeButton = new JButton("Close"); // FIXME: i18n + closeButton.addActionListener(this); + cons.gridx = 1; + cons.gridy = 2; + buttonPanel.add(closeButton, cons); + + cons.gridx = 1; + cons.gridy = 0; + cons = new GridBagConstraints(); + cons.anchor = GridBagConstraints.NORTH; + contentPane.add(buttonPanel, cons); + + dialog.pack(); + dialog.setSize(300, 200); + // center on screen: + final Dimension screenSize = Toolkit.getDefaultToolkit().getScreenSize(); + final Dimension frameSize = dialog.getSize(); + dialog.setLocation(screenSize.width/2 - (frameSize.width/2), screenSize.height/2 - (frameSize.height/2)); + dialog.setVisible(true); + } + + public void actionPerformed(ActionEvent e) { + if (e.getSource() == addButton) { + final File ruleFile = Tools.openFileDialog(null, new XMLFileFilter()); + // TODO: avoid duplicate files! + ruleFiles.add(ruleFile); + list.setListData(ruleFiles.toArray(new File[]{})); + } else if (e.getSource() == removeButton) { + if (list.getSelectedIndex() != -1) { + ruleFiles.remove(list.getSelectedIndex()); + list.setListData(ruleFiles.toArray(new File[]{})); + } + } else if (e.getSource() == closeButton) { + dialog.setVisible(false); + } else { + throw new IllegalArgumentException("Don't know how to handle " + e); + } + } + + /** + * Return all external Languages. 
+ */ + List<Language> getLanguages() { + final List<Language> languages = new ArrayList<Language>(); + for (File ruleFile : ruleFiles) { + final Language newLanguage = LanguageBuilder.makeLanguage(ruleFile); + languages.add(newLanguage); + } + return languages; + } + + static class XMLFileFilter extends FileFilter { + public boolean accept(final File f) { + if (f.getName().toLowerCase().endsWith(".xml") || f.isDirectory()) { + return true; + } + return false; + } + public String getDescription() { + return "*.xml"; + } + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/gui/Main.java b/JLanguageTool/src/java/de/danielnaber/languagetool/gui/Main.java new file mode 100644 index 0000000..eb73813 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/gui/Main.java @@ -0,0 +1,738 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.gui; + +import java.awt.AWTException; +import java.awt.Container; +import java.awt.GridBagConstraints; +import java.awt.GridBagLayout; +import java.awt.Image; +import java.awt.Insets; +import java.awt.MenuItem; +import java.awt.PopupMenu; +import java.awt.Toolkit; +import java.awt.datatransfer.Clipboard; +import java.awt.datatransfer.DataFlavor; +import java.awt.datatransfer.Transferable; +import java.awt.event.ActionEvent; +import java.awt.event.ActionListener; +import java.awt.event.MouseEvent; +import java.awt.event.MouseListener; +import java.awt.event.WindowEvent; +import java.awt.event.WindowListener; +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.Reader; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Enumeration; +import java.util.HashMap; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.MissingResourceException; +import java.util.ResourceBundle; +import java.util.Set; + +import javax.swing.Icon; +import javax.swing.ImageIcon; +import javax.swing.JButton; +import javax.swing.JComboBox; +import javax.swing.JFrame; +import javax.swing.JLabel; +import javax.swing.JOptionPane; +import javax.swing.JPanel; +import javax.swing.JScrollPane; +import javax.swing.JSplitPane; +import javax.swing.JTextArea; +import javax.swing.JTextPane; +import javax.swing.UIManager; +import javax.swing.WindowConstants; +import javax.swing.filechooser.FileFilter; +import javax.xml.parsers.ParserConfigurationException; + +import org.jdesktop.jdic.tray.SystemTray; +import org.jdesktop.jdic.tray.TrayIcon; +import org.xml.sax.SAXException; + +import de.danielnaber.languagetool.JLanguageTool; +import 
de.danielnaber.languagetool.Language; +import de.danielnaber.languagetool.language.RuleFilenameException; +import de.danielnaber.languagetool.rules.Rule; +import de.danielnaber.languagetool.rules.RuleMatch; +import de.danielnaber.languagetool.server.HTTPServer; +import de.danielnaber.languagetool.server.PortBindingException; +import de.danielnaber.languagetool.tools.StringTools; + +/** + * A simple GUI to check texts with. + * + * @author Daniel Naber + */ +public final class Main implements ActionListener { + + private final ResourceBundle messages; + + private static final String HTML_FONT_START = "<font face='Arial,Helvetica'>"; + private static final String HTML_FONT_END = "</font>"; + + private final static String SYSTEM_TRAY_ICON_NAME = "/TrayIcon.png"; + private static final String SYSTEM_TRAY_TOOLTIP = "LanguageTool"; + private static final String CONFIG_FILE = ".languagetool.cfg"; + + private final Configuration config; + + private JFrame frame; + private JTextArea textArea; + private JTextPane resultArea; + private JComboBox langBox; + + private HTTPServer httpServer; + + private final Map<Language, ConfigurationDialog> configDialogs = new HashMap<Language, ConfigurationDialog>(); + + // whether clicking on the window close button hides to system tray: + private boolean trayMode; + + private boolean isInTray; + + private Main() throws IOException { + config = new Configuration(new File(System.getProperty("user.home")), + CONFIG_FILE); + messages = JLanguageTool.getMessageBundle(); + maybeStartServer(); + } + + private void createGUI() { + frame = new JFrame("LanguageTool " + JLanguageTool.VERSION); + + try { + for (UIManager.LookAndFeelInfo info : UIManager + .getInstalledLookAndFeels()) { + if ("Nimbus".equals(info.getName())) { + UIManager.setLookAndFeel(info.getClassName()); + break; + } + } + } catch (Exception ex) { + // Well, what can we do... 
+ } + + frame.setDefaultCloseOperation(WindowConstants.DO_NOTHING_ON_CLOSE); + frame.addWindowListener(new CloseListener()); + frame.setIconImage(new ImageIcon(JLanguageTool.getDataBroker().getFromResourceDirAsUrl( + Main.SYSTEM_TRAY_ICON_NAME)).getImage()); + frame.setJMenuBar(new MainMenuBar(this, messages)); + + textArea = new JTextArea(messages.getString("guiDemoText")); + // TODO: wrong line number is displayed for lines that are wrapped + // automatically: + textArea.setLineWrap(true); + textArea.setWrapStyleWord(true); + resultArea = new JTextPane(); + resultArea.setContentType("text/html"); + resultArea.setText(HTML_FONT_START + messages.getString("resultAreaText") + + HTML_FONT_END); + resultArea.setEditable(false); + final JLabel label = new JLabel(messages.getString("enterText")); + final JButton button = new JButton(StringTools.getLabel(messages + .getString("checkText"))); + button + .setMnemonic(StringTools.getMnemonic(messages.getString("checkText"))); + button.addActionListener(this); + + final JPanel panel = new JPanel(); + panel.setLayout(new GridBagLayout()); + final GridBagConstraints buttonCons = new GridBagConstraints(); + buttonCons.gridx = 0; + buttonCons.gridy = 0; + panel.add(button, buttonCons); + buttonCons.gridx = 1; + buttonCons.gridy = 0; + panel.add(new JLabel(" " + messages.getString("textLanguage") + " "), + buttonCons); + buttonCons.gridx = 2; + buttonCons.gridy = 0; + langBox = new JComboBox(); + populateLanguageBox(); + // use the system default language to preselect the language from the combo + // box: + try { + final Locale defaultLocale = Locale.getDefault(); + langBox.setSelectedItem(messages.getString(defaultLocale.getLanguage())); + } catch (final MissingResourceException e) { + // language not supported, so don't select a default + } + panel.add(langBox, buttonCons); + + final Container contentPane = frame.getContentPane(); + final GridBagLayout gridLayout = new GridBagLayout(); + contentPane.setLayout(gridLayout); + 
final GridBagConstraints cons = new GridBagConstraints(); + cons.insets = new Insets(5, 5, 5, 5); + cons.fill = GridBagConstraints.BOTH; + cons.weightx = 10.0f; + cons.weighty = 10.0f; + cons.gridx = 0; + cons.gridy = 1; + cons.weighty = 5.0f; + final JSplitPane splitPane = new JSplitPane(JSplitPane.VERTICAL_SPLIT, + new JScrollPane(textArea), new JScrollPane(resultArea)); + splitPane.setDividerLocation(200); + contentPane.add(splitPane, cons); + + cons.fill = GridBagConstraints.NONE; + cons.gridx = 0; + cons.gridy = 2; + cons.weighty = 0.0f; + cons.insets = new Insets(3, 3, 3, 3); + // cons.fill = GridBagConstraints.NONE; + contentPane.add(label, cons); + cons.gridy = 3; + contentPane.add(panel, cons); + + frame.pack(); + frame.setSize(600, 550); + } + + private void populateLanguageBox() { + final List<String> toSort = new ArrayList<String>(); + langBox.removeAllItems(); + for (final Language lang : Language.LANGUAGES) { + if (lang != Language.DEMO) { + try { + toSort.add(messages.getString(lang.getShortName())); + } catch (final MissingResourceException e) { + // can happen with external rules: + toSort.add(lang.getName()); + } + } + } + Collections.sort(toSort); + for (final String lng : toSort) { + langBox.addItem(lng); + } + } + + private void showGUI() { + frame.setVisible(true); + } + + public void actionPerformed(final ActionEvent e) { + try { + if (e.getActionCommand().equals( + StringTools.getLabel(messages.getString("checkText")))) { + final JLanguageTool langTool = getCurrentLanguageTool(); + checkTextAndDisplayResults(langTool, getCurrentLanguage()); + } else { + throw new IllegalArgumentException("Unknown action " + e); + } + } catch (final Exception exc) { + Tools.showError(exc); + } + } + + void loadFile() { + final File file = Tools.openFileDialog(frame, new PlainTextFilter()); + if (file == null) { + return; + } + try { + final String fileContents = StringTools.readFile(new FileInputStream(file + .getAbsolutePath())); + 
textArea.setText(fileContents); + final JLanguageTool langTool = getCurrentLanguageTool(); + checkTextAndDisplayResults(langTool, getCurrentLanguage()); + } catch (final IOException e) { + Tools.showError(e); + } + } + + void hideToTray() { + final String version = System.getProperty("java.version"); + if (!isInTray && version.startsWith("1.5")) { // we don't run under <= 1.4, + // so we don't check for that + TrayIcon trayIcon = null; + try { + final Icon sysTrayIcon = new ImageIcon(JLanguageTool.getDataBroker().getFromResourceDirAsUrl(Main.SYSTEM_TRAY_ICON_NAME)); + trayIcon = new TrayIcon(sysTrayIcon); + } catch (final NoClassDefFoundError e) { + throw new MissingJdicException(e); + } + final SystemTray tray = SystemTray.getDefaultSystemTray(); + trayIcon.addActionListener(new TrayActionListener()); + trayIcon.setToolTip(SYSTEM_TRAY_TOOLTIP); + tray.addTrayIcon(trayIcon); + } else if (!isInTray) { + // Java 1.6 or later + final java.awt.SystemTray tray = java.awt.SystemTray.getSystemTray(); + final Image img = Toolkit.getDefaultToolkit().getImage( + JLanguageTool.getDataBroker().getFromResourceDirAsUrl(Main.SYSTEM_TRAY_ICON_NAME)); + final PopupMenu popup = makePopupMenu(); + try { + final java.awt.TrayIcon trayIcon = new java.awt.TrayIcon(img, + "tooltip", popup); + trayIcon.addMouseListener(new TrayActionListener()); + trayIcon.setToolTip(SYSTEM_TRAY_TOOLTIP); + tray.add(trayIcon); + } catch (final AWTException e1) { + // thrown if there's no system tray + Tools.showError(e1); + } + } + isInTray = true; + frame.setVisible(false); + } + + private PopupMenu makePopupMenu() { + final PopupMenu popup = new PopupMenu(); + final ActionListener rmbListener = new TrayActionRMBListener(); + // Check clipboard text: + final MenuItem checkClipboardItem = new MenuItem(StringTools + .getLabel(messages.getString("guiMenuCheckClipboard"))); + checkClipboardItem.addActionListener(rmbListener); + popup.add(checkClipboardItem); + // Open main window: + final MenuItem 
restoreItem = new MenuItem(StringTools.getLabel(messages + .getString("guiMenuShowMainWindow"))); + restoreItem.addActionListener(rmbListener); + popup.add(restoreItem); + // Exit: + final MenuItem exitItem = new MenuItem(StringTools.getLabel(messages + .getString("guiMenuQuit"))); + exitItem.addActionListener(rmbListener); + popup.add(exitItem); + return popup; + } + + void addLanguage() { + final LanguageManagerDialog lmd = new LanguageManagerDialog(frame, Language + .getExternalLanguages()); + lmd.show(); + try { + Language.reInit(lmd.getLanguages()); + } catch (final RuleFilenameException e) { + Tools.showErrorMessage(e); + } + populateLanguageBox(); + } + + void showOptions() { + final JLanguageTool langTool = getCurrentLanguageTool(); + final List<Rule> rules = langTool.getAllRules(); + final ConfigurationDialog configDialog = getCurrentConfigDialog(); + configDialog.show(rules); // this blocks until OK/Cancel is clicked in the + // dialog + config.setDisabledRuleIds(configDialog.getDisabledRuleIds()); + config.setEnabledRuleIds(configDialog.getEnabledRuleIds()); + config.setDisabledCategoryNames(configDialog.getDisabledCategoryNames()); + config.setMotherTongue(configDialog.getMotherTongue()); + config.setRunServer(configDialog.getRunServer()); + config.setServerPort(configDialog.getServerPort()); + // Stop server, start new server if requested: + stopServer(); + maybeStartServer(); + } + + private void restoreFromTray() { + frame.setVisible(true); + } + + // show GUI and check the text from clipboard/selection: + private void restoreFromTrayAndCheck() { + final String s = getClipboardText(); + restoreFromTray(); + textArea.setText(s); + final JLanguageTool langTool = getCurrentLanguageTool(); + checkTextAndDisplayResults(langTool, getCurrentLanguage()); + } + + void checkClipboardText() { + final String s = getClipboardText(); + textArea.setText(s); + final JLanguageTool langTool = getCurrentLanguageTool(); + checkTextAndDisplayResults(langTool, 
getCurrentLanguage()); + } + + private String getClipboardText() { + // get text from clipboard or selection: + Clipboard clipboard = Toolkit.getDefaultToolkit().getSystemSelection(); + if (clipboard == null) { // on Windows + clipboard = Toolkit.getDefaultToolkit().getSystemClipboard(); + } + String s = null; + final Transferable data = clipboard.getContents(this); + try { + if (data != null + && data.isDataFlavorSupported(DataFlavor.getTextPlainUnicodeFlavor())) { + final DataFlavor df = DataFlavor.getTextPlainUnicodeFlavor(); + final Reader sr = df.getReaderForText(data); + s = StringTools.readerToString(sr); + } else { + s = ""; + } + } catch (final Exception ex) { + ex.printStackTrace(); + if (data != null) { + s = data.toString(); + } else { + s = ""; + } + } + return s; + } + + void quitOrHide() { + if (trayMode) { + hideToTray(); + } else { + quit(); + } + } + + void quit() { + stopServer(); + try { + config.saveConfiguration(); + } catch (final IOException e) { + Tools.showError(e); + } + frame.setVisible(false); + System.exit(0); + } + + private void maybeStartServer() { + if (config.getRunServer()) { + httpServer = new HTTPServer(config.getServerPort()); + try { + httpServer.run(); + } catch (final PortBindingException e) { + JOptionPane.showMessageDialog(null, e.getMessage(), "Error", + JOptionPane.ERROR_MESSAGE); + } + } + } + + private void stopServer() { + if (httpServer != null) { + httpServer.stop(); + httpServer = null; + } + } + + private Language getCurrentLanguage() { + final String langName = langBox.getSelectedItem().toString(); + String lang = langName; + for (final Enumeration<String> e = messages.getKeys(); e.hasMoreElements();) { + final String elem = e.nextElement(); + if (messages.getString(elem).equals(langName)) { + lang = elem; + break; + } + } + // external rules: + if (lang.length() > 2) { + return Language.getLanguageForName(lang); + } + return Language.getLanguageForShortName(lang); + } + + private ConfigurationDialog 
getCurrentConfigDialog() { + final Language language = getCurrentLanguage(); + ConfigurationDialog configDialog = null; + if (configDialogs.containsKey(language)) { + configDialog = configDialogs.get(language); + } else { + configDialog = new ConfigurationDialog(frame, false); + configDialog.setMotherTongue(config.getMotherTongue()); + configDialog.setDisabledRules(config.getDisabledRuleIds()); + configDialog.setEnabledRules(config.getEnabledRuleIds()); + configDialog.setDisabledCategories(config.getDisabledCategoryNames()); + configDialog.setRunServer(config.getRunServer()); + configDialog.setServerPort(config.getServerPort()); + configDialogs.put(language, configDialog); + } + return configDialog; + } + + private JLanguageTool getCurrentLanguageTool() { + final JLanguageTool langTool; + try { + final ConfigurationDialog configDialog = getCurrentConfigDialog(); + langTool = new JLanguageTool(getCurrentLanguage(), configDialog + .getMotherTongue()); + langTool.activateDefaultPatternRules(); + langTool.activateDefaultFalseFriendRules(); + final Set<String> disabledRules = configDialog.getDisabledRuleIds(); + if (disabledRules != null) { + for (final String ruleId : disabledRules) { + langTool.disableRule(ruleId); + } + } + final Set<String> disabledCategories = configDialog + .getDisabledCategoryNames(); + if (disabledCategories != null) { + for (final String categoryName : disabledCategories) { + langTool.disableCategory(categoryName); + } + } + final Set<String> enabledRules = configDialog.getEnabledRuleIds(); + if (enabledRules != null) { + for (String ruleName : enabledRules) { + langTool.enableDefaultOffRule(ruleName); + langTool.enableRule(ruleName); + } + } + } catch (final IOException ioe) { + throw new RuntimeException(ioe); + } catch (final ParserConfigurationException ex) { + throw new RuntimeException(ex); + } catch (final SAXException ex) { + throw new RuntimeException(ex); + } + return langTool; + } + + private void checkTextAndDisplayResults(final 
JLanguageTool langTool, + final Language lang) { + if (StringTools.isEmpty(textArea.getText().trim())) { + textArea.setText(messages.getString("enterText2")); + } else { + final StringBuilder sb = new StringBuilder(); + final String startCheckText = Tools.makeTexti18n(messages, + "startChecking", new Object[] { lang.getTranslatedName(messages) }); + resultArea.setText(HTML_FONT_START + startCheckText + "<br>\n" + + HTML_FONT_END); + resultArea.repaint(); // FIXME: why doesn't this work? + // TODO: resultArea.setCursor(new Cursor(Cursor.WAIT_CURSOR)); + sb.append(startCheckText); + sb.append("...<br>\n"); + int matches = 0; + try { + matches = checkText(langTool, textArea.getText(), sb); + } catch (final Exception ex) { + sb.append("<br><br><b><font color=\"red\">" + ex.toString() + "<br>"); + final StackTraceElement[] elements = ex.getStackTrace(); + for (final StackTraceElement element : elements) { + sb.append(element); + sb.append("<br>"); + } + sb.append("</font></b><br>"); + ex.printStackTrace(); + } + final String checkDone = Tools.makeTexti18n(messages, "checkDone", + new Object[] {matches}); + sb.append(checkDone); + sb.append("<br>\n"); + resultArea.setText(HTML_FONT_START + sb.toString() + HTML_FONT_END); + resultArea.setCaretPosition(0); + } + } + + private int checkText(final JLanguageTool langTool, final String text, + final StringBuilder sb) throws IOException { + final long startTime = System.currentTimeMillis(); + final List<RuleMatch> ruleMatches = langTool.check(text); + final long startTimeMatching = System.currentTimeMillis(); + int i = 0; + for (final RuleMatch match : ruleMatches) { + final String output = Tools.makeTexti18n(messages, "result1", + new Object[] {i + 1, + match.getLine() + 1, + match.getColumn()}); + sb.append(output); + String msg = match.getMessage(); + msg = msg.replaceAll("<suggestion>", "<b>"); + msg = msg.replaceAll("</suggestion>", "</b>"); + msg = msg.replaceAll("<old>", "<b>"); + msg = msg.replaceAll("</old>", "</b>"); 
+ sb.append("<b>" + messages.getString("errorMessage") + "</b> " + msg + "<br>\n"); + if (match.getSuggestedReplacements().size() > 0) { + final String repl = StringTools.listToString(match + .getSuggestedReplacements(), "; "); + sb.append("<b>" + messages.getString("correctionMessage") + "</b> " + + repl + "<br>\n"); + } + final String context = Tools.getContext(match.getFromPos(), match + .getToPos(), text); + sb.append("<b>" + messages.getString("errorContext") + "</b> " + context); + sb.append("<br>\n"); + i++; + } + final long endTime = System.currentTimeMillis(); + sb.append(Tools.makeTexti18n(messages, "resultTime", new Object[] { + endTime - startTime, + endTime - startTimeMatching})); + return ruleMatches.size(); + } + + private void setTrayMode(boolean trayMode) { + this.trayMode = trayMode; + } + + public static void main(final String[] args) { + try { + final Main prg = new Main(); + if (args.length == 1 + && (args[0].equals("-t") || args[0].equals("--tray"))) { + // dock to systray on startup + javax.swing.SwingUtilities.invokeLater(new Runnable() { + public void run() { + try { + prg.createGUI(); + prg.setTrayMode(true); + prg.hideToTray(); + } catch (final MissingJdicException e) { + JOptionPane.showMessageDialog(null, e.getMessage(), "Error", + JOptionPane.ERROR_MESSAGE); + System.exit(1); + } catch (final Exception e) { + Tools.showError(e); + System.exit(1); + } + } + }); + } else if (args.length >= 1) { + System.out + .println("Usage: java de.danielnaber.languagetool.gui.Main [-t|--tray]"); + System.out + .println(" -t, --tray: dock LanguageTool to system tray on startup"); + } else { + javax.swing.SwingUtilities.invokeLater(new Runnable() { + public void run() { + try { + prg.createGUI(); + prg.showGUI(); + } catch (final Exception e) { + Tools.showError(e); + } + } + }); + } + } catch (final Exception e) { + Tools.showError(e); + } + } + + // + // The System Tray stuff + // + + class TrayActionRMBListener implements ActionListener { + + public 
void actionPerformed(ActionEvent e) { + if (e.getActionCommand().equalsIgnoreCase( + StringTools.getLabel(messages.getString("guiMenuCheckClipboard")))) { + restoreFromTrayAndCheck(); + } else if (e.getActionCommand().equalsIgnoreCase( + StringTools.getLabel(messages.getString("guiMenuShowMainWindow")))) { + restoreFromTray(); + } else if (e.getActionCommand().equalsIgnoreCase( + StringTools.getLabel(messages.getString("guiMenuQuit")))) { + quit(); + } else { + JOptionPane.showMessageDialog(null, "Unknown action: " + + e.getActionCommand(), "Error", JOptionPane.ERROR_MESSAGE); + } + } + + } + + class TrayActionListener implements ActionListener, MouseListener { + + // for Java 1.5 / Jdic: + public void actionPerformed(@SuppressWarnings("unused")ActionEvent e) { + handleClick(); + } + + // Java 1.6: + public void mouseClicked(@SuppressWarnings("unused")MouseEvent e) { + handleClick(); + } + + private void handleClick() { + if (frame.isVisible() && frame.isActive()) { + frame.setVisible(false); + } else if (frame.isVisible() && !frame.isActive()) { + frame.toFront(); + restoreFromTrayAndCheck(); + } else { + restoreFromTrayAndCheck(); + } + } + + public void mouseEntered(@SuppressWarnings("unused") MouseEvent e) { + } + + public void mouseExited(@SuppressWarnings("unused")MouseEvent e) { + } + + public void mousePressed(@SuppressWarnings("unused")MouseEvent e) { + } + + public void mouseReleased(@SuppressWarnings("unused")MouseEvent e) { + } + + } + + class CloseListener implements WindowListener { + + public void windowClosing(@SuppressWarnings("unused")WindowEvent e) { + quitOrHide(); + } + + public void windowActivated(@SuppressWarnings("unused")WindowEvent e) { + } + + public void windowClosed(@SuppressWarnings("unused")WindowEvent e) { + } + + public void windowDeactivated(@SuppressWarnings("unused")WindowEvent e) { + } + + public void windowDeiconified(@SuppressWarnings("unused")WindowEvent e) { + } + + public void 
windowIconified(@SuppressWarnings("unused")WindowEvent e) { + } + + public void windowOpened(@SuppressWarnings("unused")WindowEvent e) { + } + + } + + static class PlainTextFilter extends FileFilter { + + @Override + public boolean accept(final File f) { + if (f.getName().toLowerCase().endsWith(".txt")) { + return true; + } + return false; + } + + @Override + public String getDescription() { + return "*.txt"; + } + + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/gui/MainMenuBar.java b/JLanguageTool/src/java/de/danielnaber/languagetool/gui/MainMenuBar.java new file mode 100644 index 0000000..72e3191 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/gui/MainMenuBar.java @@ -0,0 +1,170 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.gui; + +import java.awt.Event; +import java.awt.event.ActionEvent; +import java.awt.event.ActionListener; +import java.awt.event.KeyEvent; +import java.util.ResourceBundle; + +import javax.swing.JMenu; +import javax.swing.JMenuBar; +import javax.swing.JMenuItem; +import javax.swing.JOptionPane; +import javax.swing.KeyStroke; + +import de.danielnaber.languagetool.tools.StringTools; + +/** + * The menu bar of the main dialog. + * + * @author Daniel Naber + */ +class MainMenuBar extends JMenuBar implements ActionListener { + + private static final long serialVersionUID = -7160998682243081767L; + + private final ResourceBundle messages; + + // File: + private String openText; + private String checkClipboardText; + private String dockToTrayText; + private String addLanguageText; + private String optionsText; + private String quitText; + // Help: + private String aboutText; + + private final Main prg; + private JMenu fileMenu; + private JMenu helpMenu; + + MainMenuBar(Main prg, ResourceBundle messages) { + this.prg = prg; + this.messages = messages; + initStrings(); + fileMenu.setMnemonic(StringTools.getMnemonic( + messages.getString("guiMenuFile"))); + helpMenu.setMnemonic(StringTools.getMnemonic( + messages.getString("guiMenuHelp"))); + // "Open": + final JMenuItem openItem = new JMenuItem(openText); + openItem.setAccelerator(KeyStroke.getKeyStroke(KeyEvent.VK_O, Event.CTRL_MASK)); + openItem.setMnemonic(StringTools.getMnemonic( + messages.getString("guiMenuOpen"))); + openItem.addActionListener(this); + fileMenu.add(openItem); + // "Check Text in Clipboard": + final JMenuItem checkClipboardItem = new JMenuItem(checkClipboardText); + checkClipboardItem.setAccelerator(KeyStroke.getKeyStroke(KeyEvent.VK_Y, 
Event.CTRL_MASK)); + checkClipboardItem.setMnemonic(StringTools.getMnemonic( + messages.getString("guiMenuCheckClipboard"))); + checkClipboardItem.addActionListener(this); + fileMenu.add(checkClipboardItem); + // "Hide to System Tray": + final JMenuItem dockToTrayItem = new JMenuItem(dockToTrayText); + dockToTrayItem.setMnemonic(StringTools.getMnemonic( + messages.getString("guiMenuHide"))); + dockToTrayItem.setAccelerator(KeyStroke.getKeyStroke(KeyEvent.VK_D, Event.CTRL_MASK)); + dockToTrayItem.addActionListener(this); + fileMenu.add(dockToTrayItem); + // "Add Language": + final JMenuItem addLanguageItem = new JMenuItem(addLanguageText); + addLanguageItem.setMnemonic(StringTools.getMnemonic( + messages.getString("guiMenuAddRules"))); + addLanguageItem.setAccelerator(KeyStroke.getKeyStroke(KeyEvent.VK_A, Event.CTRL_MASK)); + addLanguageItem.addActionListener(this); + fileMenu.add(addLanguageItem); + // "Options": + final JMenuItem optionsItem = new JMenuItem(optionsText); + optionsItem.setMnemonic(StringTools.getMnemonic( + messages.getString("guiMenuOptions"))); + optionsItem.setAccelerator(KeyStroke.getKeyStroke(KeyEvent.VK_S, Event.CTRL_MASK)); + optionsItem.addActionListener(this); + fileMenu.add(optionsItem); + // "Quit": + final JMenuItem quitItem = new JMenuItem(quitText); + quitItem.setMnemonic(StringTools.getMnemonic( + messages.getString("guiMenuQuit"))); + quitItem.setAccelerator(KeyStroke.getKeyStroke(KeyEvent.VK_Q, Event.CTRL_MASK)); + quitItem.addActionListener(this); + fileMenu.add(quitItem); + // "About": + final JMenuItem helpItem = new JMenuItem(aboutText); + helpItem.addActionListener(this); + helpItem.setMnemonic(StringTools.getMnemonic( + messages.getString("guiMenuAbout"))); + helpMenu.add(helpItem); + // add menus: + add(fileMenu); + add(helpMenu); + } + + private void initStrings() { + fileMenu = new JMenu(StringTools.getLabel( + messages.getString("guiMenuFile"))); + helpMenu = new JMenu(StringTools.getLabel( + 
messages.getString("guiMenuHelp"))); + // File: + openText = StringTools.getLabel( + messages.getString("guiMenuOpen")); + checkClipboardText = StringTools.getLabel( + messages.getString("guiMenuCheckClipboard")); + dockToTrayText = StringTools.getLabel( + messages.getString("guiMenuHide")); + addLanguageText = StringTools.getLabel( + messages.getString("guiMenuAddRules")); + optionsText = StringTools.getLabel( + messages.getString("guiMenuOptions")); + quitText = StringTools.getLabel( + messages.getString("guiMenuQuit")); + // Help: + aboutText = StringTools.getLabel( + messages.getString("guiMenuAbout")); + } + + public void actionPerformed(ActionEvent e) { + if (e.getActionCommand().equals(openText)) { + prg.loadFile(); + } else if (e.getActionCommand().equals(checkClipboardText)) { + prg.checkClipboardText(); + } else if (e.getActionCommand().equals(dockToTrayText)) { + try { + prg.hideToTray(); + } catch (MissingJdicException ex) { + JOptionPane.showMessageDialog(null, ex.getMessage(), "Error", + JOptionPane.ERROR_MESSAGE); + } + } else if (e.getActionCommand().equals(addLanguageText)) { + prg.addLanguage(); + } else if (e.getActionCommand().equals(optionsText)) { + prg.showOptions(); + } else if (e.getActionCommand().equals(quitText)) { + prg.quit(); + } else if (e.getActionCommand().equals(aboutText)) { + final AboutDialog about = new AboutDialog(messages); + about.show(); + } else { + throw new IllegalArgumentException("Unknown action " + e); + } + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/gui/MissingJdicException.java b/JLanguageTool/src/java/de/danielnaber/languagetool/gui/MissingJdicException.java new file mode 100644 index 0000000..6dcf5de --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/gui/MissingJdicException.java @@ -0,0 +1,38 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2007 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can 
redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.gui; + +/** + * Exception thrown with Java 1.5 if the jdic library cannot be found. + * + * @author Daniel Naber + */ +public class MissingJdicException extends RuntimeException { + + /** + * + */ + private static final long serialVersionUID = 8822404582351420654L; + + public MissingJdicException(Throwable throwable) { + super("TrayIcon class not found. Please unzip " + + "'standalone-libs.zip' in your LanguageTool installation directory.", throwable); + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/gui/Tools.java b/JLanguageTool/src/java/de/danielnaber/languagetool/gui/Tools.java new file mode 100644 index 0000000..5abe803 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/gui/Tools.java @@ -0,0 +1,192 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.gui; + +import java.awt.Frame; +import java.io.File; +import java.text.MessageFormat; +import java.util.ResourceBundle; + +import javax.swing.JFileChooser; +import javax.swing.JOptionPane; +import javax.swing.filechooser.FileFilter; + +import de.danielnaber.languagetool.tools.StringTools; + +/** + * GUI-related tools. + * + * @author Daniel Naber + */ +public class Tools { + + private static final int DEFAULT_CONTEXT_SIZE = 40; // characters + private static final String MARKER_START = "<b><font color=\"red\">"; + private static final String MARKER_END = "</font></b>"; + + private Tools() { + // no constructor + } + + public static String makeTexti18n(final ResourceBundle messages, final String key, + final Object[] messageArguments) { + final MessageFormat formatter = new MessageFormat(""); + formatter.applyPattern(messages.getString(key)); + return formatter.format(messageArguments); + } + + /** + * Get the default context (40 characters) of the given text range, + * highlighting the range with HTML. + */ + public static String getContext(final int fromPos, final int toPos, final String text) { + return getContext(fromPos, toPos, text, DEFAULT_CONTEXT_SIZE); + } + + /** + * Get the context (<code>contextSize</code> characters) of the given text + * range, highlighting the range with HTML code. 
+ */ + public static String getContext(final int fromPos, final int toPos, final String fileContents, + int contextSize) { + return getContext(fromPos, toPos, fileContents, contextSize, MARKER_START, + MARKER_END, true); + } + + /** + * Get the context (<code>contextSize</code> characters) of the given text + * range, highlighting the range with the given marker strings, not escaping + * HTML. + */ + public static String getContext(final int fromPos, final int toPos, + final String fileContents, final int contextSize, + final String markerStart, final String markerEnd) { + return getContext(fromPos, toPos, fileContents, contextSize, markerStart, + markerEnd, false); + } + /** + * Get the context (<code>contextSize</code> characters) of the given text + * range, highlighting the range with the given marker strings. + * + * @param fromPos + * the start position of the error in characters + * @param toPos + * the end position of the error in characters + * @param text + * the text from which the context should be taken + * @param contextSize + * the size of the context in characters + * @param markerStart + * the string used to mark the beginning of the error + * @param markerEnd + * the string used to mark the end of the error + * @param escapeHTML + * whether HTML/XML characters should be escaped + */ + public static String getContext(final int fromPos, final int toPos, + String text, final int contextSize, final String markerStart, + final String markerEnd, final boolean escapeHTML) { + text = text.replace('\n', ' '); + // calculate context region: + int startContent = fromPos - contextSize; + String prefix = "..."; + String postfix = "..."; + String markerPrefix = " "; + if (startContent < 0) { + prefix = ""; + markerPrefix = ""; + startContent = 0; + } + int endContent = toPos + contextSize; + final int fileLen = text.length(); + if (endContent > fileLen) { + postfix = ""; + endContent = fileLen; + } + // make "^" marker. 
inefficient but robust implementation: + final StringBuilder marker = new StringBuilder(); + final int totalLen = fileLen + prefix.length(); + for (int i = 0; i < totalLen; i++) { + if (i >= fromPos && i < toPos) { + marker.append('^'); + } else { + marker.append(' '); + } + } + // now build context string plus marker: + final StringBuilder sb = new StringBuilder(); + sb.append(prefix); + sb.append(text.substring(startContent, endContent)); + final String markerStr = markerPrefix + + marker.substring(startContent, endContent); + sb.append(postfix); + final int startMark = markerStr.indexOf('^'); + final int endMark = markerStr.lastIndexOf('^'); + String result = sb.toString(); + if (escapeHTML) { + result = StringTools.escapeHTML(result.substring(0, startMark)) + + markerStart + + StringTools.escapeHTML(result.substring(startMark, endMark + 1)) + + markerEnd + StringTools.escapeHTML(result.substring(endMark + 1)); + } else { + result = result.substring(0, startMark) + markerStart + + result.substring(startMark, endMark + 1) + markerEnd + + result.substring(endMark + 1); + } + return result; + } + + /** + * Show a file chooser dialog and return the file selected by the user or + * <code>null</code>. + */ + static File openFileDialog(final Frame frame, final FileFilter fileFilter) { + final JFileChooser jfc = new JFileChooser(); + jfc.setFileFilter(fileFilter); + jfc.showOpenDialog(frame); + final File file = jfc.getSelectedFile(); + if (file == null) { + return null; + } + return file; + } + + /** + * Show the exception (with stacktrace) in a dialog and print it to STDERR. + */ + static void showError(final Exception e) { + final String msg = de.danielnaber.languagetool.tools.Tools + .getFullStackTrace(e); + JOptionPane + .showMessageDialog(null, msg, "Error", JOptionPane.ERROR_MESSAGE); + e.printStackTrace(); + } + + /** + * Show the exception (message without stacktrace) in a dialog and print it to + * STDERR. 
+ */ + static void showErrorMessage(final Exception e) { + final String msg = e.getMessage(); + JOptionPane + .showMessageDialog(null, msg, "Error", JOptionPane.ERROR_MESSAGE); + e.printStackTrace(); + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/language/Belarusian.java b/JLanguageTool/src/java/de/danielnaber/languagetool/language/Belarusian.java new file mode 100644 index 0000000..fb1df60 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/language/Belarusian.java @@ -0,0 +1,72 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.language; + +import java.util.HashSet; +import java.util.Locale; +import java.util.Set; + +import de.danielnaber.languagetool.Language; +import de.danielnaber.languagetool.tagging.Tagger; +import de.danielnaber.languagetool.tagging.be.BelarusianTagger; + +/** + * Belarusian language declarations. 
+ * + * Copyright (C) 2010 Alex Buloichik (alex73mail@gmail.com) + */ +public class Belarusian extends Language { + + private static final String[] COUNTRIES = { "BY" }; + + private final Tagger tagger = new BelarusianTagger(); + + public Locale getLocale() { + return new Locale(getShortName()); + } + + public String getName() { + return "Belarusian"; + } + + public String getShortName() { + return "be"; + } + + public String[] getCountryVariants() { + return COUNTRIES; + } + + public Tagger getTagger() { + return tagger; + } + + public Contributor[] getMaintainers() { + return new Contributor[] { new Contributor("Alex Buloichik") }; + } + + public Set<String> getRelevantRuleIDs() { + final Set<String> ids = new HashSet<String>(); + ids.add("COMMA_PARENTHESIS_WHITESPACE"); + ids.add("DOUBLE_PUNCTUATION"); + ids.add("UPPERCASE_SENTENCE_START"); + ids.add("WHITESPACE_RULE"); + return ids; + } +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/language/Bokmal.java b/JLanguageTool/src/java/de/danielnaber/languagetool/language/Bokmal.java new file mode 100644 index 0000000..77d79ae --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/language/Bokmal.java @@ -0,0 +1,104 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2007 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.language; + +import java.util.HashSet; +import java.util.Locale; +import java.util.Set; + +import de.danielnaber.languagetool.Language; +import de.danielnaber.languagetool.synthesis.Synthesizer; +/*import de.danielnaber.languagetool.synthesis.en.EnglishSynthesizer; */ +import de.danielnaber.languagetool.tagging.Tagger; +import de.danielnaber.languagetool.tagging.disambiguation.Disambiguator; +/*import de.danielnaber.languagetool.tagging.disambiguation.rules.en.EnglishRuleDisambiguator;*/ +/*import de.danielnaber.languagetool.tagging.en.EnglishTagger;*/ +import de.danielnaber.languagetool.tagging.nb.BokmalTagger; +import de.danielnaber.languagetool.tokenizers.SRXSentenceTokenizer; +import de.danielnaber.languagetool.tokenizers.SentenceTokenizer; +import de.danielnaber.languagetool.tokenizers.Tokenizer; +/*import de.danielnaber.languagetool.tokenizers.en.EnglishWordTokenizer;*/ +

/**
 * Language declarations for Bokmal (short code "nb").
 *
 * This class supplies a tagger and a sentence tokenizer only; it defines no
 * word tokenizer, synthesizer or disambiguator of its own.
 */
public class Bokmal extends Language {

  /** Country codes handed out by {@link #getCountryVariants()}. */
  private static final String[] COUNTRIES = {"NO"};

  /**
   * Rule IDs copied into the set built by {@link #getRelevantRuleIDs()}.
   * NOTE(review): "EN_UNPAIRED_BRACKETS" is the ID the English class uses,
   * while the other languages list "UNPAIRED_BRACKETS" - confirm it is intended.
   */
  private static final String[] RULE_IDS = {
    "COMMA_PARENTHESIS_WHITESPACE",
    "DOUBLE_PUNCTUATION",
    "EN_UNPAIRED_BRACKETS",
    "UPPERCASE_SENTENCE_START",
    "WORD_REPEAT_RULE",
    "WHITESPACE_RULE"
  };

  private final Tagger tagger = new BokmalTagger();
  private final SentenceTokenizer sentenceTokenizer = new SRXSentenceTokenizer("nb");

  /** @return the display name, "Bokmal" */
  public final String getName() {
    return "Bokmal";
  }

  /** @return the short language code, "nb" */
  public final String getShortName() {
    return "nb";
  }

  /** @return a new locale built from {@link #getShortName()} */
  public final Locale getLocale() {
    final String code = getShortName();
    return new Locale(code);
  }

  /** @return the shared country-code array (not copied) */
  public final String[] getCountryVariants() {
    return COUNTRIES;
  }

  /** @return the tagger held by this instance */
  public final Tagger getTagger() {
    return tagger;
  }

  /** @return the SRX sentence tokenizer created with the "nb" code */
  public final SentenceTokenizer getSentenceTokenizer() {
    return sentenceTokenizer;
  }

  /** @return a fresh one-element array naming the maintainer */
  public final Contributor[] getMaintainers() {
    final Contributor maintainer = new Contributor("Arno Teigseth");
    return new Contributor[] {maintainer};
  }

  /** @return a new, mutable set holding the rule IDs listed for this language */
  public final Set<String> getRelevantRuleIDs() {
    final Set<String> ruleIds = new HashSet<String>();
    for (final String ruleId : RULE_IDS) {
      ruleIds.add(ruleId);
    }
    return ruleIds;
  }

}
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/language/Catalan.java b/JLanguageTool/src/java/de/danielnaber/languagetool/language/Catalan.java new file mode 100644 index 0000000..4e0eb67 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/language/Catalan.java @@ -0,0 +1,91 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2009 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details.
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.language; + +import java.util.HashSet; +import java.util.Locale; +import java.util.Set; + +import de.danielnaber.languagetool.Language; +import de.danielnaber.languagetool.synthesis.Synthesizer; +import de.danielnaber.languagetool.synthesis.ca.CatalanSynthesizer; +import de.danielnaber.languagetool.tagging.Tagger; +import de.danielnaber.languagetool.tagging.ca.CatalanTagger; +import de.danielnaber.languagetool.tokenizers.SRXSentenceTokenizer; +import de.danielnaber.languagetool.tokenizers.SentenceTokenizer; +import de.danielnaber.languagetool.rules.ca.CastellanismesReplaceRule; +import de.danielnaber.languagetool.rules.ca.AccentuacioReplaceRule; +

/**
 * Language declarations for Catalan (short code "ca"): tagger, sentence
 * tokenizer, synthesizer and the list of rule IDs.
 */
public class Catalan extends Language {

  /** Country codes handed out by {@link #getCountryVariants()}. */
  private static final String[] COUNTRIES = {
    "ES"
  };

  /** Rule IDs given as string literals; the two Catalan rule constants are added separately. */
  private static final String[] COMMON_RULE_IDS = {
    "COMMA_PARENTHESIS_WHITESPACE",
    "DOUBLE_PUNCTUATION",
    "UNPAIRED_BRACKETS",
    "UPPERCASE_SENTENCE_START",
    "WHITESPACE_RULE"
  };

  private final Tagger tagger = new CatalanTagger();
  private final SentenceTokenizer sentenceTokenizer = new SRXSentenceTokenizer("ca");
  private final Synthesizer synthesizer = new CatalanSynthesizer();

  /** @return the display name, "Catalan" */
  public String getName() {
    return "Catalan";
  }

  /** @return the short language code, "ca" */
  public String getShortName() {
    return "ca";
  }

  /** @return a new locale built from {@link #getShortName()} */
  public Locale getLocale() {
    final String code = getShortName();
    return new Locale(code);
  }

  /** @return the shared country-code array (not copied) */
  @Override
  public String[] getCountryVariants() {
    return COUNTRIES;
  }

  /** @return a fresh one-element array naming the maintainer */
  public Contributor[] getMaintainers() {
    final Contributor maintainer = new Contributor("Ricard Roca");
    return new Contributor[] {maintainer};
  }

  /**
   * @return a new, mutable set with the literal rule IDs plus the IDs
   *         exposed as constants by the two Catalan replace rules
   */
  public Set<String> getRelevantRuleIDs() {
    final Set<String> ruleIds = new HashSet<String>();
    for (final String ruleId : COMMON_RULE_IDS) {
      ruleIds.add(ruleId);
    }
    ruleIds.add(CastellanismesReplaceRule.CATALAN_CASTELLANISMES_REPLACE_RULE);
    ruleIds.add(AccentuacioReplaceRule.CATALAN_ACCENTUACIO_REPLACE_RULE);
    return ruleIds;
  }

  /** @return the tagger held by this instance */
  public final Tagger getTagger() {
    return tagger;
  }

  /** @return the synthesizer held by this instance */
  public final Synthesizer getSynthesizer() {
    return synthesizer;
  }

  /** @return the SRX sentence tokenizer created with the "ca" code */
  public final SentenceTokenizer getSentenceTokenizer() {
    return sentenceTokenizer;
  }

}
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/language/Contributor.java b/JLanguageTool/src/java/de/danielnaber/languagetool/language/Contributor.java new file mode 100644 index 0000000..e38d635 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/language/Contributor.java @@ -0,0 +1,63 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2007 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.language; + +/** + * A person that contributed rules or code to LanguageTool.
+ * + * @author Daniel Naber + */ +public class Contributor { + + private final String name; + private String remark; + private String url; + + Contributor(String name) { + if (name == null) { + throw new NullPointerException("name cannot be null"); + } + this.name = name; + } + + public String getName() { + return name; + } + + public final String toString() { + return getName(); + } + + public String getRemark() { + return remark; + } + + public void setRemark(final String remark) { + this.remark = remark; + } + + public String getUrl() { + return url; + } + + public void setUrl(final String url) { + this.url = url; + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/language/Czech.java b/JLanguageTool/src/java/de/danielnaber/languagetool/language/Czech.java new file mode 100644 index 0000000..d3154d7 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/language/Czech.java @@ -0,0 +1,73 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2007 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.language; + +import java.util.HashSet; +import java.util.Locale; +import java.util.Set; + +import de.danielnaber.languagetool.Language; +import de.danielnaber.languagetool.tagging.Tagger; +import de.danielnaber.languagetool.tagging.cs.CzechTagger; +import de.danielnaber.languagetool.tokenizers.SentenceTokenizer; +import de.danielnaber.languagetool.tokenizers.cs.CzechSentenceTokenizer; +

/**
 * Language declarations for Czech (short code "cs"): tagger, sentence
 * tokenizer and the list of rule IDs.
 */
public class Czech extends Language {

  /** Country codes handed out by {@link #getCountryVariants()}. */
  private static final String[] COUNTRIES = {"CZ"};

  /** Rule IDs copied into the set built by {@link #getRelevantRuleIDs()}. */
  private static final String[] RULE_IDS = {
    "COMMA_PARENTHESIS_WHITESPACE",
    "DOUBLE_PUNCTUATION"
  };

  private final Tagger tagger = new CzechTagger();
  private final SentenceTokenizer sentenceTokenizer = new CzechSentenceTokenizer();

  /** @return the display name, "Czech" */
  public String getName() {
    return "Czech";
  }

  /** @return the short language code, "cs" */
  public String getShortName() {
    return "cs";
  }

  /** @return a new locale built from {@link #getShortName()} */
  public Locale getLocale() {
    final String code = getShortName();
    return new Locale(code);
  }

  /** @return the shared country-code array (not copied) */
  public String[] getCountryVariants() {
    return COUNTRIES;
  }

  /** @return the tagger held by this instance */
  public Tagger getTagger() {
    return tagger;
  }

  /** @return the Czech sentence tokenizer held by this instance */
  public SentenceTokenizer getSentenceTokenizer() {
    return sentenceTokenizer;
  }

  /** @return a fresh one-element array naming the maintainer */
  public Contributor[] getMaintainers() {
    final Contributor maintainer = new Contributor("Jozef Ličko");
    return new Contributor[] {maintainer};
  }

  /** @return a new, mutable set holding the rule IDs listed for this language */
  public Set<String> getRelevantRuleIDs() {
    final Set<String> ruleIds = new HashSet<String>();
    for (final String ruleId : RULE_IDS) {
      ruleIds.add(ruleId);
    }
    return ruleIds;
  }

}
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/language/Danish.java b/JLanguageTool/src/java/de/danielnaber/languagetool/language/Danish.java new file mode 100644 index 0000000..d114c40 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/language/Danish.java @@ -0,0 +1,78 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2007 Daniel Naber
(http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.language; + +import java.util.HashSet; +import java.util.Locale; +import java.util.Set; + +import de.danielnaber.languagetool.Language; +import de.danielnaber.languagetool.tagging.Tagger; +import de.danielnaber.languagetool.tagging.da.DanishTagger; +import de.danielnaber.languagetool.tokenizers.SRXSentenceTokenizer; +import de.danielnaber.languagetool.tokenizers.SentenceTokenizer; + +public class Danish extends Language { + + private final Tagger tagger = new DanishTagger(); + private final SentenceTokenizer sentenceTokenizer = new SRXSentenceTokenizer("da"); + + private static final String[] COUNTRIES = {"DK"}; + + public final Locale getLocale() { + return new Locale(getShortName()); + } + + public final String getName() { + return "Danish"; + } + + public final String getShortName() { + return "da"; + } + + public final String[] getCountryVariants() { + return COUNTRIES; + } + + public final Tagger getTagger() { + return tagger; + } + + public SentenceTokenizer getSentenceTokenizer() { + return sentenceTokenizer; + } + + public final Contributor[] getMaintainers() { + return new Contributor[] {new Contributor("Esben Aaberg")}; + } + + public final 
Set<String> getRelevantRuleIDs() { + final Set<String> ids = new HashSet<String>(); + ids.add("COMMA_PARENTHESIS_WHITESPACE"); + ids.add("DOUBLE_PUNCTUATION"); + ids.add("UNPAIRED_BRACKETS"); // correction for genitive apostrophes eg. "Lis' hund" made in UnpairedQuotesBracketsRule + ids.add("UPPERCASE_SENTENCE_START"); // abbreviation exceptions, done in DanishSentenceTokenizer + // "WORD_REPEAT_RULE" implemented in grammar.xml + ids.add("WHITESPACE_RULE"); + // specific to Danish: + return ids; + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/language/Demo.java b/JLanguageTool/src/java/de/danielnaber/languagetool/language/Demo.java new file mode 100644 index 0000000..ab4284b --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/language/Demo.java @@ -0,0 +1,60 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2007 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.language; + +import java.util.Locale; +import java.util.Set; + +import de.danielnaber.languagetool.Language; +import de.danielnaber.languagetool.tagging.Tagger; +import de.danielnaber.languagetool.tagging.xx.DemoTagger; + +public class Demo extends Language { + + private final Tagger tagger = new DemoTagger(); + + public Locale getLocale() { + return new Locale("en"); + } + + public String getName() { + return "Testlanguage"; + } + + public String getShortName() { + return "xx"; + } + + public String[] getCountryVariants() { + return new String[] {"XX"}; + } + + public Tagger getTagger() { + return tagger; + } + + public Contributor[] getMaintainers() { + return null; + } + + public Set<String> getRelevantRuleIDs() { + return null; + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/language/Dutch.java b/JLanguageTool/src/java/de/danielnaber/languagetool/language/Dutch.java new file mode 100644 index 0000000..0670736 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/language/Dutch.java @@ -0,0 +1,99 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2007 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.language; + +import java.util.HashSet; +import java.util.Locale; +import java.util.Set; + +import de.danielnaber.languagetool.Language; +import de.danielnaber.languagetool.synthesis.Synthesizer; +import de.danielnaber.languagetool.synthesis.nl.DutchSynthesizer; +import de.danielnaber.languagetool.tagging.Tagger; +import de.danielnaber.languagetool.tagging.disambiguation.Disambiguator; +import de.danielnaber.languagetool.tagging.disambiguation.rules.nl.DutchRuleDisambiguator; +import de.danielnaber.languagetool.tagging.nl.DutchTagger; +import de.danielnaber.languagetool.tokenizers.SRXSentenceTokenizer; +import de.danielnaber.languagetool.tokenizers.SentenceTokenizer; +import de.danielnaber.languagetool.tokenizers.Tokenizer; +import de.danielnaber.languagetool.tokenizers.nl.DutchWordTokenizer; +

/**
 * Language declarations for Dutch (short code "nl"): tagger, tokenizers,
 * synthesizer, disambiguator and the list of rule IDs.
 */
public class Dutch extends Language {

  /** Country codes handed out by {@link #getCountryVariants()}. */
  private static final String[] COUNTRIES = { "NL", "BE" };

  /** Rule IDs copied into the set built by {@link #getRelevantRuleIDs()}. */
  private static final String[] RULE_IDS = {
    "COMMA_PARENTHESIS_WHITESPACE",
    "DOUBLE_PUNCTUATION",
    "UNPAIRED_BRACKETS",
    "UPPERCASE_SENTENCE_START",
    "WHITESPACE_RULE"
  };

  private final Tagger tagger = new DutchTagger();
  private final SentenceTokenizer sentenceTokenizer = new SRXSentenceTokenizer("nl");
  private final Synthesizer synthesizer = new DutchSynthesizer();
  private final Disambiguator disambiguator = new DutchRuleDisambiguator();
  private final Tokenizer wordTokenizer = new DutchWordTokenizer();

  /** @return the display name, "Dutch" */
  public final String getName() {
    return "Dutch";
  }

  /** @return the short language code, "nl" */
  public final String getShortName() {
    return "nl";
  }

  /** @return a new locale built from {@link #getShortName()} */
  public final Locale getLocale() {
    final String code = getShortName();
    return new Locale(code);
  }

  /** @return the shared country-code array (not copied) */
  public final String[] getCountryVariants() {
    return COUNTRIES;
  }

  /** @return the tagger held by this instance */
  public final Tagger getTagger() {
    return tagger;
  }

  /** @return the synthesizer held by this instance */
  public final Synthesizer getSynthesizer() {
    return synthesizer;
  }

  /** @return the SRX sentence tokenizer created with the "nl" code */
  public final SentenceTokenizer getSentenceTokenizer() {
    return sentenceTokenizer;
  }

  /** @return the Dutch word tokenizer held by this instance */
  public final Tokenizer getWordTokenizer() {
    return wordTokenizer;
  }

  /** @return the rule disambiguator held by this instance */
  public final Disambiguator getDisambiguator() {
    return disambiguator;
  }

  /** @return a fresh one-element array; the maintainer carries a URL */
  public final Contributor[] getMaintainers() {
    final Contributor maintainer = new Contributor("Ruud Baars");
    maintainer.setUrl("http://www.opentaal.org");
    return new Contributor[] { maintainer };
  }

  /** @return a new, mutable set holding the rule IDs listed for this language */
  public final Set<String> getRelevantRuleIDs() {
    final Set<String> ruleIds = new HashSet<String>();
    for (final String ruleId : RULE_IDS) {
      ruleIds.add(ruleId);
    }
    return ruleIds;
  }

}
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/language/English.java b/JLanguageTool/src/java/de/danielnaber/languagetool/language/English.java new file mode 100644 index 0000000..0bf16e8 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/language/English.java @@ -0,0 +1,103 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2007 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details.
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.language; + +import java.util.HashSet; +import java.util.Locale; +import java.util.Set; + +import de.danielnaber.languagetool.Language; +import de.danielnaber.languagetool.synthesis.Synthesizer; +import de.danielnaber.languagetool.synthesis.en.EnglishSynthesizer; +import de.danielnaber.languagetool.tagging.Tagger; +import de.danielnaber.languagetool.tagging.disambiguation.Disambiguator; +import de.danielnaber.languagetool.tagging.disambiguation.rules.en.EnglishRuleDisambiguator; +import de.danielnaber.languagetool.tagging.en.EnglishTagger; +import de.danielnaber.languagetool.tokenizers.SRXSentenceTokenizer; +import de.danielnaber.languagetool.tokenizers.SentenceTokenizer; +import de.danielnaber.languagetool.tokenizers.Tokenizer; +import de.danielnaber.languagetool.tokenizers.en.EnglishWordTokenizer; +

/**
 * Language declarations for English (short code "en"): tagger, tokenizers,
 * synthesizer, disambiguator and the list of rule IDs.
 */
public class English extends Language {

  /** Country codes handed out by {@link #getCountryVariants()}. */
  private static final String[] COUNTRIES = {"GB", "US", "AU", "CA", "NZ", "ZA" };

  /** Rule IDs copied into the set built by {@link #getRelevantRuleIDs()}. */
  private static final String[] RULE_IDS = {
    "COMMA_PARENTHESIS_WHITESPACE",
    "DOUBLE_PUNCTUATION",
    "EN_UNPAIRED_BRACKETS",
    "UPPERCASE_SENTENCE_START",
    "WORD_REPEAT_RULE",
    "WHITESPACE_RULE",
    // specific to English:
    "EN_A_VS_AN",
    "EN_COMPOUNDS"
  };

  private final Tagger tagger = new EnglishTagger();
  private final Tokenizer wordTokenizer = new EnglishWordTokenizer();
  private final SentenceTokenizer sentenceTokenizer = new SRXSentenceTokenizer("en");
  private final Synthesizer synthesizer = new EnglishSynthesizer();
  private final Disambiguator disambiguator = new EnglishRuleDisambiguator();

  /** @return the display name, "English" */
  public final String getName() {
    return "English";
  }

  /** @return the short language code, "en" */
  public final String getShortName() {
    return "en";
  }

  /** @return a new locale built from {@link #getShortName()} */
  public final Locale getLocale() {
    final String code = getShortName();
    return new Locale(code);
  }

  /** @return the shared country-code array (not copied) */
  public final String[] getCountryVariants() {
    return COUNTRIES;
  }

  /** @return the SRX sentence tokenizer created with the "en" code */
  public final SentenceTokenizer getSentenceTokenizer() {
    return sentenceTokenizer;
  }

  /** @return the tagger held by this instance */
  public final Tagger getTagger() {
    return tagger;
  }

  /** @return the English word tokenizer held by this instance */
  public final Tokenizer getWordTokenizer() {
    return wordTokenizer;
  }

  /** @return the synthesizer held by this instance */
  public final Synthesizer getSynthesizer() {
    return synthesizer;
  }

  /** @return the rule disambiguator held by this instance */
  public final Disambiguator getDisambiguator() {
    return disambiguator;
  }

  /** @return a fresh two-element array naming the maintainers */
  public final Contributor[] getMaintainers() {
    final Contributor first = new Contributor("Marcin Miłkowski");
    final Contributor second = new Contributor("Daniel Naber");
    return new Contributor[] {first, second};
  }

  /** @return a new, mutable set holding the rule IDs listed for this language */
  public final Set<String> getRelevantRuleIDs() {
    final Set<String> ruleIds = new HashSet<String>();
    for (final String ruleId : RULE_IDS) {
      ruleIds.add(ruleId);
    }
    return ruleIds;
  }

}
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/language/Esperanto.java b/JLanguageTool/src/java/de/danielnaber/languagetool/language/Esperanto.java new file mode 100644 index 0000000..0e48d98 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/language/Esperanto.java @@ -0,0 +1,72 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2007 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details.
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.language; + +import java.util.HashSet; +import java.util.Locale; +import java.util.Set; + +import de.danielnaber.languagetool.Language; +import de.danielnaber.languagetool.tagging.Tagger; +import de.danielnaber.languagetool.tagging.eo.EsperantoTagger; +

/**
 * Language declarations for Esperanto (short code "eo"): tagger and the
 * list of rule IDs.
 */
public class Esperanto extends Language {

  /** Rule IDs copied into the set built by {@link #getRelevantRuleIDs()}. */
  private static final String[] RULE_IDS = {
    "COMMA_PARENTHESIS_WHITESPACE",
    "DOUBLE_PUNCTUATION",
    "UNPAIRED_BRACKETS",
    "UPPERCASE_SENTENCE_START",
    "WORD_REPEAT_RULE",
    "WHITESPACE_RULE",
    "FRENCH_WHITESPACE"
  };

  /** Tagger created once per language instance. */
  private final Tagger tagger = new EsperantoTagger();

  /** @return the display name, "Esperanto" */
  public String getName() {
    return "Esperanto";
  }

  /** @return the short language code, "eo" */
  public String getShortName() {
    return "eo";
  }

  /** @return a new locale for the literal code "eo" */
  public Locale getLocale() {
    final Locale esperanto = new Locale("eo");
    return esperanto;
  }

  /**
   * @return a fresh one-element array containing "ANY", used as a
   *         "country-less" placeholder for OOo
   */
  public String[] getCountryVariants() {
    final String[] variants = {"ANY"};
    return variants;
  }

  /** @return the tagger held by this instance */
  public Tagger getTagger() {
    return tagger;
  }

  /** @return a fresh one-element array naming the maintainer */
  public Contributor[] getMaintainers() {
    final Contributor maintainer = new Contributor("Dominique Pellé");
    return new Contributor[] {maintainer};
  }

  /** @return a new, mutable set holding the rule IDs listed for this language */
  public Set<String> getRelevantRuleIDs() {
    final Set<String> ruleIds = new HashSet<String>();
    for (final String ruleId : RULE_IDS) {
      ruleIds.add(ruleId);
    }
    return ruleIds;
  }

}
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/language/French.java b/JLanguageTool/src/java/de/danielnaber/languagetool/language/French.java new file mode 100644 index 0000000..96dc5fc --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/language/French.java @@ -0,0 +1,90 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2007 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the
terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.language; + +import java.util.HashSet; +import java.util.Locale; +import java.util.Set; + +import de.danielnaber.languagetool.Language; +import de.danielnaber.languagetool.rules.patterns.Unifier; +import de.danielnaber.languagetool.tagging.Tagger; +import de.danielnaber.languagetool.tagging.disambiguation.Disambiguator; +import de.danielnaber.languagetool.tagging.disambiguation.rules.fr.FrenchRuleDisambiguator; +import de.danielnaber.languagetool.tagging.fr.FrenchTagger; +

/**
 * Language declarations for French (short code "fr"): tagger, disambiguator,
 * unifier and the list of rule IDs.
 */
public class French extends Language {

  private final Tagger tagger = new FrenchTagger();
  private final Disambiguator disambiguator = new FrenchRuleDisambiguator();

  /** Single unifier instance shared by all French objects. */
  private static final Unifier FRENCH_UNIFIER = new Unifier();

  /**
   * Country codes handed out by {@link #getCountryVariants()}; the list also
   * contains an empty entry. Haiti is "HT": the previous "HI" is not an
   * assigned ISO 3166-1 code.
   */
  private static final String[] COUNTRIES = {"FR", "", "BE", "CH", "CA",
    "LU", "MC", "CM", "CI", "HT", "ML", "SN", "CD", "MA", "RE"
  };

  /** @return a new locale built from {@link #getShortName()} */
  public Locale getLocale() {
    return new Locale(getShortName());
  }

  /** @return the display name, "French" */
  public String getName() {
    return "French";
  }

  /** @return the short language code, "fr" */
  public String getShortName() {
    return "fr";
  }

  /** @return the shared country-code array (not copied) */
  public String[] getCountryVariants() {
    return COUNTRIES;
  }

  /** @return the tagger held by this instance */
  public Tagger getTagger() {
    return tagger;
  }

  /** @return the rule disambiguator held by this instance */
  public Disambiguator getDisambiguator() {
    return disambiguator;
  }

  /** @return the unifier shared by all instances of this class */
  public Unifier getUnifier() {
    return FRENCH_UNIFIER;
  }

  /** @return a fresh two-element array; the second entry carries a remark */
  public Contributor[] getMaintainers() {
    final Contributor hVoisard = new Contributor("Hugo Voisard");
    hVoisard.setRemark("2006-2007");
    return new Contributor[] {
        new Contributor("Agnes Souque"),
        hVoisard
    };
  }

  /** @return a new, mutable set holding the rule IDs listed for this language */
  public Set<String> getRelevantRuleIDs() {
    final Set<String> ids = new HashSet<String>();
    ids.add("COMMA_PARENTHESIS_WHITESPACE");
    ids.add("DOUBLE_PUNCTUATION");
    ids.add("UNPAIRED_BRACKETS");
    ids.add("UPPERCASE_SENTENCE_START");
    ids.add("WHITESPACE_RULE");
    ids.add("FRENCH_WHITESPACE");
    return ids;
  }

}
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/language/Galician.java b/JLanguageTool/src/java/de/danielnaber/languagetool/language/Galician.java new file mode 100644 index 0000000..abd2158 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/language/Galician.java @@ -0,0 +1,86 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2007 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details.
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.language; + +import java.util.HashSet; +import java.util.Locale; +import java.util.Set; + +import de.danielnaber.languagetool.Language; +import de.danielnaber.languagetool.tagging.Tagger; +import de.danielnaber.languagetool.tagging.gl.GalicianTagger; +import de.danielnaber.languagetool.tokenizers.SRXSentenceTokenizer; +import de.danielnaber.languagetool.tokenizers.SentenceTokenizer; +import de.danielnaber.languagetool.tokenizers.Tokenizer; +import de.danielnaber.languagetool.tokenizers.gl.GalicianWordTokenizer; + +/** Language definition for Galician (short name gl) with its own tagger, word tokenizer and SRX sentence tokenizer. */ public class Galician extends Language { + + private final Tagger tagger = new GalicianTagger(); + private final Tokenizer wordTokenizer = new GalicianWordTokenizer(); + /** SRX sentence tokenizer constructed with the language code gl. */ private final SentenceTokenizer sentenceTokenizer = new SRXSentenceTokenizer("gl"); + + private static final String[] COUNTRIES = {"ES"}; + + /** Returns a new Locale built from getShortName() on every call. */ public final Locale getLocale() { + return new Locale(getShortName()); + } + + public final SentenceTokenizer getSentenceTokenizer() { + return sentenceTokenizer; + } + + public final String getName() { + return "Galician"; + } + + public final String getShortName() { + return "gl"; + } + + /** Returns the shared COUNTRIES array itself, not a copy. */ public final String[] getCountryVariants() { + return COUNTRIES; + } + + public final Tagger getTagger() { + return tagger; + } + + public final Tokenizer getWordTokenizer() { + return wordTokenizer; + } + + /** Returns a one-element array with a contributor whose URL is set via setUrl. */ public Contributor[] getMaintainers() { + final Contributor contributor = new Contributor("Susana Sotelo Docío"); + contributor.setUrl("http://www.g11n.net/languagetool-gl"); + return new Contributor[] { contributor }; + } + + /** Returns a newly allocated, mutable set holding the rule IDs added below. */ public final Set<String> getRelevantRuleIDs() { + final Set<String> ids = new HashSet<String>(); + ids.add("COMMA_PARENTHESIS_WHITESPACE"); + ids.add("DOUBLE_PUNCTUATION");
+ ids.add("UNPAIRED_BRACKETS"); + ids.add("UPPERCASE_SENTENCE_START"); + ids.add("WORD_REPEAT_RULE"); + ids.add("WHITESPACE_RULE"); + return ids; + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/language/German.java b/JLanguageTool/src/java/de/danielnaber/languagetool/language/German.java new file mode 100644 index 0000000..2df4cd4 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/language/German.java @@ -0,0 +1,87 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2007 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details.
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.language; + +import java.util.HashSet; +import java.util.Locale; +import java.util.Set; + +import de.danielnaber.languagetool.Language; +import de.danielnaber.languagetool.tagging.Tagger; +import de.danielnaber.languagetool.tagging.de.GermanTagger; +import de.danielnaber.languagetool.tokenizers.SRXSentenceTokenizer; +import de.danielnaber.languagetool.tokenizers.SentenceTokenizer; + +/** Language definition for German (short name de) with a GermanTagger and an SRX sentence tokenizer. */ public class German extends Language { + + private final Tagger tagger = new GermanTagger(); + /** SRX sentence tokenizer constructed with the language code de. */ private final SentenceTokenizer sentenceTokenizer = new SRXSentenceTokenizer("de"); + + /** Country codes returned by getCountryVariants(). */ private static final String[] COUNTRIES = { + "DE", "CH", "AT", "LU", "LI", "BE" + }; + + /** Returns a new Locale built from getShortName() on every call. */ public Locale getLocale() { + return new Locale(getShortName()); + } + + public String getName() { + return "German"; + } + + public String getShortName() { + return "de"; + } + + @Override + public String[] getCountryVariants() { + return COUNTRIES; + } + + public Tagger getTagger() { + return tagger; + } + + public SentenceTokenizer getSentenceTokenizer() { + return sentenceTokenizer; + } + + public Contributor[] getMaintainers() { + return new Contributor[] {new Contributor("Daniel Naber")}; + } + + /** Returns a newly allocated, mutable set holding the rule IDs added below, including the DE_ prefixed ones. */ public Set<String> getRelevantRuleIDs() { + final Set<String> ids = new HashSet<String>(); + ids.add("COMMA_PARENTHESIS_WHITESPACE"); + ids.add("DOUBLE_PUNCTUATION"); + ids.add("UNPAIRED_BRACKETS"); + ids.add("UPPERCASE_SENTENCE_START"); + ids.add("GERMAN_WORD_REPEAT_RULE"); + ids.add("WHITESPACE_RULE"); + // specific to German: + ids.add("DE_AGREEMENT"); + ids.add("DE_CASE"); + ids.add("DE_COMPOUNDS"); + ids.add("DE_DASH"); + ids.add("DE_WORD_COHERENCY"); + ids.add("DE_WIEDER_VS_WIDER"); + return ids; + } + +} diff --git
a/JLanguageTool/src/java/de/danielnaber/languagetool/language/Icelandic.java b/JLanguageTool/src/java/de/danielnaber/languagetool/language/Icelandic.java new file mode 100644 index 0000000..e48fb6a --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/language/Icelandic.java @@ -0,0 +1,86 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ + +package de.danielnaber.languagetool.language; + +/** + * @author Anton Karl Ingason + */ + +import java.util.HashSet; +import java.util.Locale; +import java.util.Set; +import de.danielnaber.languagetool.Language; +import de.danielnaber.languagetool.tagging.Tagger; +import de.danielnaber.languagetool.tagging.xx.DemoTagger; +import de.danielnaber.languagetool.tokenizers.SRXSentenceTokenizer; +import de.danielnaber.languagetool.tokenizers.SentenceTokenizer; + +/** Language definition for Icelandic (short name is); its tagger is a DemoTagger from the xx tagging package. */ public class Icelandic extends Language { + + private final Tagger tagger = new DemoTagger(); + /** SRX sentence tokenizer constructed with the language code is. */ private final SentenceTokenizer sentenceTokenizer = new SRXSentenceTokenizer("is"); + private static final String[] COUNTRIES = { "IS" }; + + @Override + public String[] getCountryVariants() { + return COUNTRIES; + } + + @Override + public Locale
getLocale() { + /* a new Locale is built from getShortName() on every call */ return new Locale(getShortName()); + } + + @Override + public Contributor[] getMaintainers() { + return new Contributor[] {new Contributor("Anton Karl Ingason")}; + } + + @Override + public String getName() { + return "Icelandic"; + } + + public Tagger getTagger() { + return tagger; + } + + public SentenceTokenizer getSentenceTokenizer() { + return sentenceTokenizer; + } + + @Override + /** Returns a newly allocated, mutable set holding the rule IDs added below. */ public Set<String> getRelevantRuleIDs() { + final Set<String> ids = new HashSet<String>(); + ids.add("COMMA_PARENTHESIS_WHITESPACE"); + ids.add("DOUBLE_PUNCTUATION"); + ids.add("UNPAIRED_BRACKETS"); + ids.add("UPPERCASE_SENTENCE_START"); + ids.add("WORD_REPEAT_RULE"); + ids.add("WHITESPACE_RULE"); + return ids; + } + + @Override + public String getShortName() { + return "is"; + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/language/Italian.java b/JLanguageTool/src/java/de/danielnaber/languagetool/language/Italian.java new file mode 100644 index 0000000..986b7f5 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/language/Italian.java @@ -0,0 +1,74 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2007 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details.
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.language; + +import java.util.HashSet; +import java.util.Locale; +import java.util.Set; + +import de.danielnaber.languagetool.Language; +import de.danielnaber.languagetool.tagging.Tagger; +import de.danielnaber.languagetool.tagging.it.ItalianTagger; + +/** Language definition for Italian (short name it); its tagger is an ItalianTagger. */ public class Italian extends Language { + + /** Country codes returned by getCountryVariants(). */ private static final String[] COUNTRIES = { + "IT", "CH" + }; + + private final Tagger tagger = new ItalianTagger(); + + /** Returns a new Locale built from getShortName() on every call. */ public Locale getLocale() { + return new Locale(getShortName()); + } + + public String getName() { + return "Italian"; + } + + public String getShortName() { + return "it"; + } + + @Override + public String[] getCountryVariants() { + return COUNTRIES; + } + + public Tagger getTagger() { + return tagger; + } + + public Contributor[] getMaintainers() { + final Contributor contributor = new Contributor("Paolo Bianchini"); + return new Contributor[] { contributor }; + } + + /** Returns a newly allocated, mutable set holding the rule IDs added below. */ public Set<String> getRelevantRuleIDs() { + final Set<String> ids = new HashSet<String>(); + ids.add("COMMA_PARENTHESIS_WHITESPACE"); + ids.add("DOUBLE_PUNCTUATION"); + ids.add("UNPAIRED_BRACKETS"); + ids.add("UPPERCASE_SENTENCE_START"); + ids.add("WORD_REPEAT_RULE"); + ids.add("WHITESPACE_RULE"); + return ids; + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/language/LanguageBuilder.java b/JLanguageTool/src/java/de/danielnaber/languagetool/language/LanguageBuilder.java new file mode 100644 index 0000000..201a8b5 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/language/LanguageBuilder.java @@ -0,0 +1,80 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2007 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it
and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.language; + +import java.io.File; +import java.util.Locale; +import java.util.Set; + +import de.danielnaber.languagetool.Language; + +/** + * Create a language by specifying the language's XML rule file. + * Static factory only: the sole constructor is private and empty. + * @author Daniel Naber + */ +public class LanguageBuilder { + + /** Private so that the class cannot be instantiated from outside. */ private LanguageBuilder() { + } + + /** + * Takes an XML file named <tt>rules-xx-language.xml</tt>, + * e.g. <tt>rules-de-German.xml</tt> and builds + * a Language object for that language. The name must end in .xml and split on the dash into exactly three parts: the word rules, a two-character code, and the language name. Throws NullPointerException for a null file and RuleFilenameException for a name that does not match.
+ */ + public static Language makeLanguage(final File file) { + if (file == null) { + throw new NullPointerException("file argument cannot be null"); + } + if (!file.getName().endsWith(".xml")) { + throw new RuleFilenameException(file); + } + /* only the file name is split, not the full path */ final String[] parts = file.getName().split("-"); + if (parts.length != 3 || !parts[0].equals("rules") || parts[1].length() != 2) { + throw new RuleFilenameException(file); + } + + /* anonymous Language subclass whose answers are derived from the file name parts */ final Language newLanguage = new Language() { + public Locale getLocale() { + return new Locale(getShortName()); + } + /* no maintainers are known for a language built from a file: returns null, not an empty array */ public Contributor[] getMaintainers() { + return null; + } + public String getShortName() { + return parts[1]; + } + public String[] getCountryVariants() { + return new String[] {""}; + } + /* third file name part with every occurrence of .xml removed */ public String getName() { + return parts[2].replace(".xml", ""); + } + /* returns null, not an empty set */ public Set<String> getRelevantRuleIDs() { + return null; + } + /* absolute path of the file passed to makeLanguage */ public String getRuleFileName() { + return file.getAbsolutePath(); + } + }; + return newLanguage; + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/language/Lithuanian.java b/JLanguageTool/src/java/de/danielnaber/languagetool/language/Lithuanian.java new file mode 100644 index 0000000..6401195 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/language/Lithuanian.java @@ -0,0 +1,70 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2007 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details.
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.language; + +import java.util.HashSet; +import java.util.Locale; +import java.util.Set; + +import de.danielnaber.languagetool.Language; +import de.danielnaber.languagetool.tagging.Tagger; +import de.danielnaber.languagetool.tagging.xx.DemoTagger; + +/** Language definition for Lithuanian (short name lt); its tagger is a DemoTagger from the xx tagging package. */ public class Lithuanian extends Language { + + /** Country codes returned by getCountryVariants(). */ private static final String[] COUNTRIES = { + "LT" + }; + + /** Returns a new Locale built from getShortName() on every call. */ public Locale getLocale() { + return new Locale(getShortName()); + } + + public String getName() { + return "Lithuanian"; + } + + @Override + public String[] getCountryVariants() { + return COUNTRIES; + } + + public String getShortName() { + return "lt"; + } + + /** Returns a freshly constructed DemoTagger on every call; no instance is cached in a field. */ public Tagger getTagger() { + return new DemoTagger(); + } + + public Contributor[] getMaintainers() { + return new Contributor[] {new Contributor("Mantas Kriaučiūnas")}; + } + + /** Returns a newly allocated, mutable set holding the rule IDs added below. */ public Set<String> getRelevantRuleIDs() { + final Set<String> ids = new HashSet<String>(); + ids.add("COMMA_PARENTHESIS_WHITESPACE"); + ids.add("DOUBLE_PUNCTUATION"); + ids.add("UNPAIRED_BRACKETS"); + ids.add("UPPERCASE_SENTENCE_START"); + ids.add("WHITESPACE_RULE"); + return ids; + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/language/Malayalam.java b/JLanguageTool/src/java/de/danielnaber/languagetool/language/Malayalam.java new file mode 100644 index 0000000..f15ca5c --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/language/Malayalam.java @@ -0,0 +1,86 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2007 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + *
version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.language; + +import java.util.HashSet; +import java.util.Locale; +import java.util.Set; + +import de.danielnaber.languagetool.Language; +import de.danielnaber.languagetool.tagging.Tagger; +import de.danielnaber.languagetool.tagging.ml.MalayalamTagger; +import de.danielnaber.languagetool.tokenizers.SRXSentenceTokenizer; +import de.danielnaber.languagetool.tokenizers.SentenceTokenizer; +import de.danielnaber.languagetool.tokenizers.ml.MalayalamWordTokenizer; +import de.danielnaber.languagetool.tokenizers.Tokenizer; + +/** Language definition for Malayalam (short name ml) with its own tagger and word tokenizer. */ public class Malayalam extends Language { + + private final Tagger tagger = new MalayalamTagger(); + /** NOTE(review): constructed with the code en rather than ml; confirm this is intentional. */ private final SentenceTokenizer sentenceTokenizer = new SRXSentenceTokenizer("en"); + private final Tokenizer wordTokenizer = new MalayalamWordTokenizer(); + + private static final String[] COUNTRIES = {"IN"}; + + /** Returns a new Locale built from getShortName() on every call. */ public final Locale getLocale() { + return new Locale(getShortName()); + } + + public final SentenceTokenizer getSentenceTokenizer() { + return sentenceTokenizer; + } + + public final String getName() { + return "Malayalam"; + } + + public final String getShortName() { + return "ml"; + } + + public final Tokenizer getWordTokenizer() { + return wordTokenizer; + } + + public final String[] getCountryVariants() { + return COUNTRIES; + } + + public final Tagger getTagger() { + return tagger; + } + + public final Contributor[] getMaintainers() { + return new
Contributor[] {new Contributor("Jithesh.V.S") + }; + } + + /** Returns a newly allocated, mutable set holding the rule IDs added below. */ public final Set<String> getRelevantRuleIDs() { + final Set<String> ids = new HashSet<String>(); + ids.add("COMMA_PARENTHESIS_WHITESPACE"); + ids.add("DOUBLE_PUNCTUATION"); + ids.add("UNPAIRED_BRACKETS"); + ids.add("UPPERCASE_SENTENCE_START"); + ids.add("WORD_REPEAT_RULE"); + ids.add("WHITESPACE_RULE"); + // specific to Malayalam...: + return ids; + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/language/Polish.java b/JLanguageTool/src/java/de/danielnaber/languagetool/language/Polish.java new file mode 100644 index 0000000..13b4faf --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/language/Polish.java @@ -0,0 +1,116 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2007 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details.
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.language; + +import java.util.HashSet; +import java.util.Locale; +import java.util.Set; + +import de.danielnaber.languagetool.Language; +import de.danielnaber.languagetool.rules.patterns.Unifier; +import de.danielnaber.languagetool.synthesis.Synthesizer; +import de.danielnaber.languagetool.synthesis.pl.PolishSynthesizer; +import de.danielnaber.languagetool.tagging.Tagger; +import de.danielnaber.languagetool.tagging.disambiguation.Disambiguator; +import de.danielnaber.languagetool.tagging.disambiguation.pl.PolishHybridDisambiguator; +import de.danielnaber.languagetool.tagging.pl.PolishTagger; +import de.danielnaber.languagetool.tokenizers.SentenceTokenizer; +import de.danielnaber.languagetool.tokenizers.SRXSentenceTokenizer; + +/** Language definition for Polish (short name pl) with tagger, disambiguator, synthesizer, SRX sentence tokenizer and two unifiers. */ public class Polish extends Language { + + private final Tagger tagger = new PolishTagger(); + private final SentenceTokenizer sentenceTokenizer = new SRXSentenceTokenizer("pl"); + private final Disambiguator disambiguator = new PolishHybridDisambiguator(); + private final Synthesizer synthesizer = new PolishSynthesizer(); + /** Static Unifier returned by getUnifier(); shared by all Polish instances. */ private static final Unifier POLISH_UNIFIER = new Unifier(); + /** Separate static Unifier returned by getDisambiguationUnifier(). */ private static final Unifier POLISH_DISAMB_UNIFIER = new Unifier(); + + private static final String[] COUNTRIES = {"PL"}; + + @Override + public Locale getLocale() { + return new Locale(getShortName()); + } + + @Override + public String getName() { + return "Polish"; + } + + @Override + public String getShortName() { + return "pl"; + } + + @Override + public String[] getCountryVariants() { + return COUNTRIES; + } + + @Override + public Tagger getTagger() { + return tagger; + } + + @Override + public SentenceTokenizer getSentenceTokenizer() { + return sentenceTokenizer; + } + + @Override + public
Disambiguator getDisambiguator() { + return disambiguator; + } + + public Unifier getUnifier() { + return POLISH_UNIFIER; + } + + public Unifier getDisambiguationUnifier() { + return POLISH_DISAMB_UNIFIER; + } + + @Override + public Synthesizer getSynthesizer() { + return synthesizer; + } + + @Override + public Contributor[] getMaintainers() { + return new Contributor[] {new Contributor("Marcin Miłkowski")}; + } + + @Override + /** Returns a newly allocated, mutable set holding the rule IDs added below; the generic UNPAIRED_BRACKETS ID is not added, PL_UNPAIRED_BRACKETS is. */ public Set<String> getRelevantRuleIDs() { + final Set<String> ids = new HashSet<String>(); + ids.add("COMMA_PARENTHESIS_WHITESPACE"); + ids.add("DOUBLE_PUNCTUATION"); + ids.add("UPPERCASE_SENTENCE_START"); + ids.add("WORD_REPEAT_RULE"); + ids.add("WHITESPACE_RULE"); + // specific to Polish: + ids.add("PL_UNPAIRED_BRACKETS"); + ids.add("PL_WORD_REPEAT"); + ids.add("PL_COMPOUNDS"); + ids.add("PL_SIMPLE_REPLACE"); + return ids; + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/language/Romanian.java b/JLanguageTool/src/java/de/danielnaber/languagetool/language/Romanian.java new file mode 100644 index 0000000..96d6a6b --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/language/Romanian.java @@ -0,0 +1,112 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2007 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details.
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.language; + +import java.util.HashSet; +import java.util.Locale; +import java.util.Set; + +import de.danielnaber.languagetool.Language; +import de.danielnaber.languagetool.rules.ro.CompoundRule; +import de.danielnaber.languagetool.rules.ro.SimpleReplaceRule; +import de.danielnaber.languagetool.synthesis.Synthesizer; +import de.danielnaber.languagetool.synthesis.ro.RomanianSynthesizer; +import de.danielnaber.languagetool.tagging.Tagger; +import de.danielnaber.languagetool.tagging.disambiguation.Disambiguator; +import de.danielnaber.languagetool.tagging.disambiguation.rules.ro.RomanianRuleDisambiguator; +import de.danielnaber.languagetool.tagging.ro.RomanianTagger; +import de.danielnaber.languagetool.tokenizers.SRXSentenceTokenizer; +import de.danielnaber.languagetool.tokenizers.SentenceTokenizer; +import de.danielnaber.languagetool.tokenizers.Tokenizer; +import de.danielnaber.languagetool.tokenizers.ro.RomanianWordTokenizer; + +/** + * Language definition for Romanian (short name ro) with tagger, synthesizer, disambiguator and both tokenizers. + * @author Ionuț Păduraru + * @since 24.02.2009 22:18:21 + */ +public class Romanian extends Language { + + private static final String[] COUNTRIES = { "RO" }; + + private final Tagger tagger = new RomanianTagger(); + private final Synthesizer synthesizer = new RomanianSynthesizer(); + private final Disambiguator disambiguator = new RomanianRuleDisambiguator(); + private final Tokenizer wdTokenizer = new RomanianWordTokenizer(); + private final SentenceTokenizer sentenceTokenizer = new SRXSentenceTokenizer("ro"); + + /** Returns a new Locale built from getShortName() on every call. */ public Locale getLocale() { + return new Locale(getShortName()); + } + + public String getName() { + return "Romanian"; + } + + public String getShortName() { + return "ro"; + } + + @Override + public String[] getCountryVariants() { + return COUNTRIES; + } +
+ public Tagger getTagger() { + return tagger; + } + + public Contributor[] getMaintainers() { + final Contributor contributor = new Contributor("Ionuț Păduraru"); + contributor.setUrl("http://www.archeus.ro"); + return new Contributor[] { contributor }; + } + + /** Returns a newly allocated, mutable set holding the rule IDs added below; two of them come from constants on the Romanian rule classes. */ public Set<String> getRelevantRuleIDs() { + final Set<String> ids = new HashSet<String>(); + ids.add("COMMA_PARENTHESIS_WHITESPACE"); + ids.add("DOUBLE_PUNCTUATION"); + ids.add("UPPERCASE_SENTENCE_START"); + ids.add("WHITESPACE_RULE"); + ids.add("UNPAIRED_BRACKETS"); + /* same ID as two adds earlier; redundant but harmless because ids is a Set */ ids.add("UPPERCASE_SENTENCE_START"); + ids.add("WORD_REPEAT_RULE"); + // specific to romanian + ids.add(SimpleReplaceRule.ROMANIAN_SIMPLE_REPLACE_RULE); + ids.add(CompoundRule.ROMANIAN_COMPOUND_RULE); + + return ids; + } + + public final Synthesizer getSynthesizer() { + return synthesizer; + } + + public final Disambiguator getDisambiguator() { + return disambiguator; + } + + public final Tokenizer getWordTokenizer() { + return wdTokenizer; + } + + public SentenceTokenizer getSentenceTokenizer() { + return sentenceTokenizer; + } +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/language/RuleFilenameException.java b/JLanguageTool/src/java/de/danielnaber/languagetool/language/RuleFilenameException.java new file mode 100644 index 0000000..715bdc9 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/language/RuleFilenameException.java @@ -0,0 +1,42 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2007 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version.
+ * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.language; + +import java.io.File; + +/** + * Thrown if external rule filename doesn't match the required format. + * Unchecked: extends RuntimeException, so callers are not forced to catch it. + * @author Daniel Naber + */ +public class RuleFilenameException extends RuntimeException { + + /** + * Serialization version identifier for this exception class. + */ + private static final long serialVersionUID = 6642163394764392897L; + + /** Builds the message from the expected naming pattern plus the name of the given file; a null file makes file.getName() throw NullPointerException here. */ public RuleFilenameException(File file) { + super("Rule file must be named rules-<xx>-<lang>.xml (<xx> = language code, " + + "<lang> = language name),\n" + + "for example: rules-en-English.xml\n" + + "Current name: " + file.getName()); + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/language/Russian.java b/JLanguageTool/src/java/de/danielnaber/languagetool/language/Russian.java new file mode 100644 index 0000000..8491d65 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/language/Russian.java @@ -0,0 +1,114 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2007 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.language; + +import java.util.HashSet; +import java.util.Locale; +import java.util.Set; + +import de.danielnaber.languagetool.Language; +import de.danielnaber.languagetool.rules.patterns.Unifier; +import de.danielnaber.languagetool.synthesis.Synthesizer; +import de.danielnaber.languagetool.synthesis.ru.RussianSynthesizer; +import de.danielnaber.languagetool.tagging.Tagger; +import de.danielnaber.languagetool.tagging.disambiguation.Disambiguator; +import de.danielnaber.languagetool.tagging.disambiguation.rules.ru.RussianRuleDisambiguator; +import de.danielnaber.languagetool.tagging.ru.RussianTagger; +//import de.danielnaber.languagetool.tokenizers.Tokenizer; +//import de.danielnaber.languagetool.tokenizers.ru.RussianWordTokenizer; +import de.danielnaber.languagetool.tokenizers.SRXSentenceTokenizer; // new Tokenizer +import de.danielnaber.languagetool.tokenizers.SentenceTokenizer; +// import de.danielnaber.languagetool.tokenizers.ru.RussianSentenceTokenizer; // old Tokenizer + + +/** Language definition for Russian (short name ru) with tagger, rule disambiguator, synthesizer, unifier and SRX sentence tokenizer; the word tokenizer is commented out. */ public class Russian extends Language { + + private static final String[] COUNTRIES = { + "RU" + }; + + private final Tagger tagger = new RussianTagger(); + private final Disambiguator disambiguator = new RussianRuleDisambiguator(); + /** Static Unifier returned by getUnifier(); shared by all Russian instances. */ private static final Unifier RUSSIAN_UNIFIER = new Unifier(); +// private Tokenizer wordTokenizer = new RussianWordTokenizer(); + private final Synthesizer synthesizer = new RussianSynthesizer(); +// private SentenceTokenizer sentenceTokenizer = new RussianSentenceTokenizer(); // old Tokenizer + private final SentenceTokenizer sentenceTokenizer = new SRXSentenceTokenizer("ru"); // new Tokenizer + public Locale getLocale() { + return
new Locale(getShortName()); + } + + public String getName() { + return "Russian"; + } + + public String getShortName() { + return "ru"; + } + + @Override + public String[] getCountryVariants() { + return COUNTRIES; + } + + public Tagger getTagger() { + return tagger; + } + + public Disambiguator getDisambiguator() { + return disambiguator; + } + +// public Tokenizer getWordTokenizer() { +// return wordTokenizer; +// } + + public Synthesizer getSynthesizer() { + return synthesizer; + } + + public SentenceTokenizer getSentenceTokenizer() { + return sentenceTokenizer; + } + + + public Unifier getUnifier() { + return RUSSIAN_UNIFIER; + } + + public Contributor[] getMaintainers() { + return new Contributor[] {new Contributor("Yakov Reztsov")}; + } + + /** Returns a newly allocated, mutable set holding the rule IDs added below; the generic UNPAIRED_BRACKETS ID is not added, RU_UNPAIRED_BRACKETS is. */ public Set<String> getRelevantRuleIDs() { + final Set<String> ids = new HashSet<String>(); + ids.add("COMMA_PARENTHESIS_WHITESPACE"); + ids.add("DOUBLE_PUNCTUATION"); + ids.add("UPPERCASE_SENTENCE_START"); + ids.add("WORD_REPEAT_RULE"); + ids.add("WHITESPACE_RULE"); + // specific to Russian : + ids.add("RU_UNPAIRED_BRACKETS"); + ids.add("RU_COMPOUNDS"); + ids.add("RU_SIMPLE_REPLACE"); + return ids; + + } + +}
\ No newline at end of file diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/language/Slovak.java b/JLanguageTool/src/java/de/danielnaber/languagetool/language/Slovak.java new file mode 100644 index 0000000..eecb54b --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/language/Slovak.java @@ -0,0 +1,93 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2007 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.language; + +import java.util.HashSet; +import java.util.Locale; +import java.util.Set; + +import de.danielnaber.languagetool.Language; +import de.danielnaber.languagetool.synthesis.Synthesizer; +import de.danielnaber.languagetool.tagging.Tagger; +import de.danielnaber.languagetool.tagging.sk.SlovakTagger; +import de.danielnaber.languagetool.synthesis.sk.SlovakSynthesizer; +import de.danielnaber.languagetool.tokenizers.SRXSentenceTokenizer; +import de.danielnaber.languagetool.tokenizers.SentenceTokenizer; + +public class Slovak extends Language { + + private final SentenceTokenizer sentenceTokenizer = new SRXSentenceTokenizer("sk"); + private final Tagger tagger = new SlovakTagger(); + private final Synthesizer synthesizer = new 
SlovakSynthesizer(); + + private static final String[] COUNTRIES = { + "SK" + }; + + public Locale getLocale() { + return new Locale(getShortName()); + } + + public String getName() { + return "Slovak"; + } + + public String getShortName() { + return "sk"; + } + + @Override + public String[] getCountryVariants() { + return COUNTRIES; + } + + public Tagger getTagger() { + return tagger; + } + + @Override + public Synthesizer getSynthesizer() { + return synthesizer; + } + + public SentenceTokenizer getSentenceTokenizer() { + return sentenceTokenizer; + } + + public Contributor[] getMaintainers() { + final Contributor contributor = new Contributor("Zdenko Podobný"); + contributor.setUrl("http://sk-spell.sk.cx"); + return new Contributor[] { contributor }; + } + + public Set<String> getRelevantRuleIDs() { + final Set<String> ids = new HashSet<String>(); + ids.add("COMMA_PARENTHESIS_WHITESPACE"); + ids.add("DOUBLE_PUNCTUATION"); + ids.add("UNPAIRED_BRACKETS"); + ids.add("UPPERCASE_SENTENCE_START"); + ids.add("WORD_REPEAT_RULE"); + ids.add("WHITESPACE_RULE"); + // specific to Slovak: + ids.add("SK_COMPOUNDS"); + ids.add("SK_VES"); + return ids; + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/language/Slovenian.java b/JLanguageTool/src/java/de/danielnaber/languagetool/language/Slovenian.java new file mode 100644 index 0000000..cc945f3 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/language/Slovenian.java @@ -0,0 +1,75 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2007 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.language; + +import java.util.HashSet; +import java.util.Locale; +import java.util.Set; + +import de.danielnaber.languagetool.Language; +import de.danielnaber.languagetool.tokenizers.SRXSentenceTokenizer; +import de.danielnaber.languagetool.tokenizers.SentenceTokenizer; + +public class Slovenian extends Language { + + private final SentenceTokenizer sentenceTokenizer = new SRXSentenceTokenizer("sl"); + + private static final String[] COUNTRIES = { + "SI" + }; + + public Locale getLocale() { + return new Locale(getShortName()); + } + + public String getName() { + return "Slovenian"; + } + + public String getShortName() { + return "sl"; + } + + @Override + public String[] getCountryVariants() { + return COUNTRIES; + } + + public SentenceTokenizer getSentenceTokenizer() { + return sentenceTokenizer; + } + + public Contributor[] getMaintainers() { + return new Contributor[] {new Contributor("Martin Srebotnjak")}; + } + + public Set<String> getRelevantRuleIDs() { + final Set<String> ids = new HashSet<String>(); + ids.add("COMMA_PARENTHESIS_WHITESPACE"); + ids.add("DOUBLE_PUNCTUATION"); + ids.add("UNPAIRED_BRACKETS"); + ids.add("UPPERCASE_SENTENCE_START"); + ids.add("WORD_REPEAT_RULE"); + ids.add("WHITESPACE_RULE"); + // specific to Slovenian: none + return ids; + + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/language/Spanish.java b/JLanguageTool/src/java/de/danielnaber/languagetool/language/Spanish.java new file mode 100644 
index 0000000..ba646d6 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/language/Spanish.java @@ -0,0 +1,94 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2007 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.language; + +import java.util.HashSet; +import java.util.Locale; +import java.util.Set; + +import de.danielnaber.languagetool.Language; +import de.danielnaber.languagetool.synthesis.Synthesizer; +import de.danielnaber.languagetool.synthesis.es.SpanishSynthesizer; +import de.danielnaber.languagetool.tagging.Tagger; +import de.danielnaber.languagetool.tagging.es.SpanishTagger; +import de.danielnaber.languagetool.tokenizers.SRXSentenceTokenizer; +import de.danielnaber.languagetool.tokenizers.SentenceTokenizer; + +public class Spanish extends Language { + + private final SentenceTokenizer sentenceTokenizer = new SRXSentenceTokenizer("es"); + private final Synthesizer synthesizer = new SpanishSynthesizer(); + + private static final String[] COUNTRIES = { + "ES", "", "MX", "GT", "CR", "PA", "DO", + "VE", "PE", "AR", "EC", "CL", "UY", "PY", + "BO", "SV", "HN", "NI", "PR", "US", "CU" + }; + + private final Tagger tagger = new SpanishTagger(); + + 
public Locale getLocale() { + return new Locale(getShortName()); + } + + public String getName() { + return "Spanish"; + } + + public String getShortName() { + return "es"; + } + + @Override + public String[] getCountryVariants() { + return COUNTRIES; + } + + public Tagger getTagger() { + return tagger; + } + + public final Synthesizer getSynthesizer() { + return synthesizer; + } + + public final SentenceTokenizer getSentenceTokenizer() { + return sentenceTokenizer; + } + + public Contributor[] getMaintainers() { + final Contributor contributor = new Contributor("Juan Martorell"); + contributor.setUrl("http://languagetool-es.blogspot.com/"); + return new Contributor[] { contributor }; + } + + public Set<String> getRelevantRuleIDs() { + final Set<String> ids = new HashSet<String>(); + ids.add("COMMA_PARENTHESIS_WHITESPACE"); + ids.add("DOUBLE_PUNCTUATION"); + ids.add("UNPAIRED_BRACKETS"); + ids.add("UPPERCASE_SENTENCE_START"); + ids.add("WORD_REPEAT_RULE"); + ids.add("WHITESPACE_RULE"); + // specific to Spanish: + // ids.add("EL_WITH_FEM"); + return ids; + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/language/Swedish.java b/JLanguageTool/src/java/de/danielnaber/languagetool/language/Swedish.java new file mode 100644 index 0000000..1b99f9a --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/language/Swedish.java @@ -0,0 +1,75 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2007 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.language; + +import java.util.HashSet; +import java.util.Locale; +import java.util.Set; + +import de.danielnaber.languagetool.Language; +import de.danielnaber.languagetool.tagging.Tagger; +import de.danielnaber.languagetool.tagging.sv.SwedishTagger; + +public class Swedish extends Language { + + private static final String[] COUNTRIES = { + "SE", "FI" + }; + + private final Tagger tagger = new SwedishTagger(); + + public final Locale getLocale() { + return new Locale(getShortName()); + } + + public final String getName() { + return "Swedish"; + } + + public final String getShortName() { + return "sv"; + } + + @Override + public final String[] getCountryVariants() { + return COUNTRIES; + } + + public final Tagger getTagger() { + return tagger; + } + + public final Contributor[] getMaintainers() { + return new Contributor[] {new Contributor("Niklas Johansson")}; + } + + public final Set<String> getRelevantRuleIDs() { + final Set<String> ids = new HashSet<String>(); + ids.add("COMMA_PARENTHESIS_WHITESPACE"); + ids.add("DOUBLE_PUNCTUATION"); + ids.add("UNPAIRED_BRACKETS"); + ids.add("UPPERCASE_SENTENCE_START"); + ids.add("WORD_REPEAT_RULE"); + ids.add("WHITESPACE_RULE"); + // specific to Swedish: + ids.add("SV_COMPOUNDS"); + return ids; + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/language/Ukrainian.java b/JLanguageTool/src/java/de/danielnaber/languagetool/language/Ukrainian.java new file mode 100644 index 0000000..c426100 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/language/Ukrainian.java @@ -0,0 +1,73 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2007 Daniel 
Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.language; + +import java.util.HashSet; +import java.util.Locale; +import java.util.Set; + +import de.danielnaber.languagetool.Language; +import de.danielnaber.languagetool.tagging.Tagger; +import de.danielnaber.languagetool.tagging.uk.UkrainianTagger; + +public class Ukrainian extends Language { + + private static final String[] COUNTRIES = { + "UA" + }; + + private final Tagger tagger = new UkrainianTagger(); + + public Locale getLocale() { + return new Locale(getShortName()); + } + + public String getName() { + return "Ukrainian"; + } + + public String getShortName() { + return "uk"; + } + + @Override + public String[] getCountryVariants() { + return COUNTRIES; + } + + public Tagger getTagger() { + return tagger; + } + + public Contributor[] getMaintainers() { + return new Contributor[] {new Contributor("Andriy Rysin")}; + } + + public Set<String> getRelevantRuleIDs() { + final Set<String> ids = new HashSet<String>(); + ids.add("COMMA_PARENTHESIS_WHITESPACE"); + ids.add("DOUBLE_PUNCTUATION"); + ids.add("UPPERCASE_SENTENCE_START"); + ids.add("WHITESPACE_RULE"); + // specific to Ukrainian: + ids.add("UK_SIMPLE_REPLACE"); + return ids; + } + +} diff 
--git a/JLanguageTool/src/java/de/danielnaber/languagetool/openoffice/ConfigThread.java b/JLanguageTool/src/java/de/danielnaber/languagetool/openoffice/ConfigThread.java new file mode 100644 index 0000000..52aae8b --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/openoffice/ConfigThread.java @@ -0,0 +1,78 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.openoffice; + +import java.util.Set; + +import de.danielnaber.languagetool.JLanguageTool; +import de.danielnaber.languagetool.Language; +import de.danielnaber.languagetool.gui.Configuration; +import de.danielnaber.languagetool.gui.ConfigurationDialog; + +/** + * A thread that shows the configuration dialog which lets the + * user enable/disable rules. 
+ * + * @author Marcin Miłkowski + * @author Daniel Naber + */ +class ConfigThread extends Thread { + + private final Language docLanguage; + private final Configuration config; + private final de.danielnaber.languagetool.openoffice.Main mainThread; + + private final ConfigurationDialog cfgDialog; + + ConfigThread(final Language docLanguage, final Configuration config, + final de.danielnaber.languagetool.openoffice.Main main) { + this.docLanguage = docLanguage; + this.config = config; + mainThread = main; + cfgDialog = new ConfigurationDialog(null, true); + cfgDialog.setDisabledRules(config.getDisabledRuleIds()); + cfgDialog.setEnabledRules(config.getEnabledRuleIds()); + cfgDialog.setDisabledCategories(config.getDisabledCategoryNames()); + cfgDialog.setMotherTongue(config.getMotherTongue()); + } + + public Set<String> getDisabledRuleIds() { + return cfgDialog.getDisabledRuleIds(); + } + + public void run() { + try { + final JLanguageTool langTool = new JLanguageTool(docLanguage, cfgDialog.getMotherTongue()); + langTool.activateDefaultPatternRules(); + langTool.activateDefaultFalseFriendRules(); + cfgDialog.show(langTool.getAllRules()); + config.setDisabledRuleIds(cfgDialog.getDisabledRuleIds()); + config.setEnabledRuleIds(cfgDialog.getEnabledRuleIds()); + config.setDisabledCategoryNames(cfgDialog.getDisabledCategoryNames()); + config.setMotherTongue(cfgDialog.getMotherTongue()); + config.saveConfiguration(); + if (mainThread != null) { + mainThread.resetDocument(); + } + } catch (Throwable e) { + Main.showError(e); + } + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/openoffice/Main.java b/JLanguageTool/src/java/de/danielnaber/languagetool/openoffice/Main.java new file mode 100644 index 0000000..3eaecda --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/openoffice/Main.java @@ -0,0 +1,760 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This 
library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ + +package de.danielnaber.languagetool.openoffice; + +/** OpenOffice 3.x Integration + * + * @author Marcin Miłkowski + */ +import java.io.File; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Comparator; +import java.util.HashSet; +import java.util.List; +import java.util.ResourceBundle; +import java.util.Set; + +import javax.swing.JOptionPane; +import javax.swing.UIManager; + +import com.sun.star.awt.XWindow; +import com.sun.star.awt.XWindowPeer; +import com.sun.star.beans.PropertyValue; +import com.sun.star.beans.XPropertySet; +import com.sun.star.frame.XDesktop; +import com.sun.star.frame.XModel; +import com.sun.star.lang.IllegalArgumentException; +import com.sun.star.lang.Locale; +import com.sun.star.lang.XComponent; +import com.sun.star.lang.XMultiComponentFactory; +import com.sun.star.lang.XServiceDisplayName; +import com.sun.star.lang.XServiceInfo; +import com.sun.star.lang.XSingleComponentFactory; +import com.sun.star.lib.uno.helper.Factory; +import com.sun.star.lib.uno.helper.WeakBase; +import com.sun.star.linguistic2.ProofreadingResult; +import com.sun.star.linguistic2.SingleProofreadingError; +import com.sun.star.linguistic2.XLinguServiceEventBroadcaster; +import 
com.sun.star.linguistic2.XLinguServiceEventListener; +import com.sun.star.linguistic2.XProofreader; +import com.sun.star.registry.XRegistryKey; +import com.sun.star.task.XJobExecutor; +import com.sun.star.text.XTextViewCursor; +import com.sun.star.text.XTextViewCursorSupplier; +import com.sun.star.uno.UnoRuntime; +import com.sun.star.uno.XComponentContext; + +import de.danielnaber.languagetool.JLanguageTool; +import de.danielnaber.languagetool.Language; +import de.danielnaber.languagetool.gui.Configuration; +import de.danielnaber.languagetool.rules.RuleMatch; +import de.danielnaber.languagetool.tools.StringTools; + +public class Main extends WeakBase implements XJobExecutor, + XServiceDisplayName, XServiceInfo, XProofreader, + XLinguServiceEventBroadcaster { + + private Configuration config; + private JLanguageTool langTool; + private Language docLanguage; + + private String docID; + + /* + * Rules disabled using the config dialog box rather than Spelling dialog box + * or the context menu. + */ + private Set<String> disabledRules; + + private Set<String> disabledRulesUI; + + private List<XLinguServiceEventListener> xEventListeners; + + /** + * Make another instance of JLanguageTool and assign it to langTool if true. + */ + private boolean recheck; + + /** + * Sentence tokenization-related members. + */ + + private String currentPara; + private List<String> tokenizedSentences; + private int position; + private List<RuleMatch> paragraphMatches; + + /** + * Service name required by the OOo API && our own name. 
+ */ + private static final String[] SERVICE_NAMES = { + "com.sun.star.linguistic2.Proofreader", + "de.danielnaber.languagetool.openoffice.Main" }; + + // use a different name than the stand-alone version to avoid conflicts: + private static final String CONFIG_FILE = ".languagetool-ooo.cfg"; + + private static final ResourceBundle MESSAGES = JLanguageTool + .getMessageBundle(); + + private XComponentContext xContext; + + public Main(final XComponentContext xCompContext) { + try { + changeContext(xCompContext); + final File homeDir = getHomeDir(); + config = new Configuration(homeDir, CONFIG_FILE); + disabledRules = config.getDisabledRuleIds(); + if (disabledRules == null) { + disabledRules = new HashSet<String>(); + } + disabledRulesUI = new HashSet<String>(disabledRules); + xEventListeners = new ArrayList<XLinguServiceEventListener>(); + } catch (final Throwable t) { + showError(t); + } + } + + public final void changeContext(final XComponentContext xCompContext) { + xContext = xCompContext; + } + + private XComponent getxComponent() { + try { + final XMultiComponentFactory xMCF = xContext.getServiceManager(); + final Object desktop = xMCF.createInstanceWithContext( + "com.sun.star.frame.Desktop", xContext); + final XDesktop xDesktop = (XDesktop) UnoRuntime.queryInterface( + XDesktop.class, desktop); + return xDesktop.getCurrentComponent(); + } catch (final Throwable t) { + showError(t); + return null; + } + } + + /** + * Checks the language under the cursor. Used for opening the configuration + * dialog. + * + * @return Language - the language under the visible cursor. 
+ */ + private Language getLanguage() { + final XComponent xComponent = getxComponent(); + if (xComponent == null) { + return Language.ENGLISH; // for testing with local main() method only + } + final Locale charLocale; + final XPropertySet xCursorProps; + try { + final XModel model = (XModel) UnoRuntime.queryInterface(XModel.class, + xComponent); + final XTextViewCursorSupplier xViewCursorSupplier = (XTextViewCursorSupplier) UnoRuntime + .queryInterface(XTextViewCursorSupplier.class, model + .getCurrentController()); + final XTextViewCursor xCursor = xViewCursorSupplier.getViewCursor(); + if (xCursor.isCollapsed()) { // no text selection + xCursorProps = (XPropertySet) UnoRuntime.queryInterface( + XPropertySet.class, xCursor); + } else { // text is selected, need to create another cursor + // as multiple languages can occur here - we care only + // about character under the cursor, which might be wrong + // but it applies only to the checking dialog to be removed + xCursorProps = (XPropertySet) UnoRuntime.queryInterface( + XPropertySet.class, xCursor.getText().createTextCursorByRange( + xCursor.getStart())); + } + final Object obj = xCursorProps.getPropertyValue("CharLocale"); + if (obj == null) { + return Language.ENGLISH; // fallback + } + charLocale = (Locale) obj; + boolean langIsSupported = false; + for (Language element : Language.LANGUAGES) { + if (element.getShortName().equals(charLocale.Language)) { + langIsSupported = true; + break; + } + } + if (!langIsSupported) { + // FIXME: i18n + JOptionPane.showMessageDialog(null, + "Error: Sorry, the document language '" + charLocale.Language + + "' is not supported by LanguageTool."); + return null; + } + } catch (final Throwable t) { + showError(t); + return null; + } + return Language.getLanguageForShortName(charLocale.Language); + } + + /** + * Runs the grammar checker on paragraph text. 
+ * + * @param docID - document ID + * @param paraText - paragraph text + * @param locale Locale - the text Locale + * @param startOfSentencePos start of sentence position + * @param nSuggestedBehindEndOfSentencePosition end of sentence position + * @param props - properties + * @return ProofreadingResult containing the results of the check. + * @throws IllegalArgumentException + * (not really, LT simply returns the ProofreadingResult with the + * values supplied) + */ + public final ProofreadingResult doProofreading(final String docID, + final String paraText, final Locale locale, final int startOfSentencePos, + final int nSuggestedBehindEndOfSentencePosition, + final PropertyValue[] props) { + final ProofreadingResult paRes = new ProofreadingResult(); + try { + paRes.nStartOfSentencePosition = startOfSentencePos; + paRes.xProofreader = this; + paRes.aLocale = locale; + paRes.aDocumentIdentifier = docID; + paRes.aText = paraText; + paRes.aProperties = props; + return doGrammarCheckingInternal(paraText, locale, paRes); + } catch (final Throwable t) { + showError(t); + return paRes; + } + } + + synchronized private ProofreadingResult doGrammarCheckingInternal( + final String paraText, final Locale locale, final ProofreadingResult paRes) { + + if (!StringTools.isEmpty(paraText) + && hasLocale(locale)) { + // caching the instance of LT + if (!Language.getLanguageForShortName(locale.Language).equals( + docLanguage) + || langTool == null || recheck) { + docLanguage = Language.getLanguageForShortName(locale.Language); + if (docLanguage == null) { + return paRes; + } + try { + langTool = new JLanguageTool(docLanguage, config.getMotherTongue()); + langTool.activateDefaultPatternRules(); + langTool.activateDefaultFalseFriendRules(); + recheck = false; + } catch (final Throwable t) { + showError(t); + } + } + + if (config.getDisabledRuleIds() != null) { + for (final String id : config.getDisabledRuleIds()) { + langTool.disableRule(id); + } + } + final Set<String> 
disabledCategories = config + .getDisabledCategoryNames(); + if (disabledCategories != null) { + for (final String categoryName : disabledCategories) { + langTool.disableCategory(categoryName); + } + } + final Set<String> enabledRules = config.getEnabledRuleIds(); + if (enabledRules != null) { + for (String ruleName : enabledRules) { + langTool.enableDefaultOffRule(ruleName); + langTool.enableRule(ruleName); + } + } + try { + final String sentence = getSentence(paraText, + paRes.nStartOfSentencePosition); + paRes.nStartOfSentencePosition = position; + paRes.nStartOfNextSentencePosition = position + sentence.length(); + paRes.nBehindEndOfSentencePosition = paRes.nStartOfNextSentencePosition; + if (!StringTools.isEmpty(sentence)) { + final List<RuleMatch> ruleMatches = langTool.check(sentence, false, + JLanguageTool.paragraphHandling.ONLYNONPARA); + final SingleProofreadingError[] pErrors = checkParaRules(paraText, + locale, paRes.nStartOfSentencePosition, + paRes.nStartOfNextSentencePosition, paRes.aDocumentIdentifier); + int pErrorCount = 0; + if (pErrors != null) { + pErrorCount = pErrors.length; + } + if (!ruleMatches.isEmpty()) { + final SingleProofreadingError[] errorArray = new SingleProofreadingError[ruleMatches + .size() + + pErrorCount]; + int i = 0; + for (final RuleMatch myRuleMatch : ruleMatches) { + errorArray[i] = createOOoError(myRuleMatch, paRes.nStartOfSentencePosition); + i++; + } + // add para matches + if (pErrors != null) { + for (SingleProofreadingError paraError : pErrors) { + if (paraError != null) { + errorArray[i] = paraError; + i++; + } + } + } + Arrays.sort(errorArray, new ErrorPositionComparator()); + paRes.aErrors = errorArray; + + } else { + if (pErrors != null) { + paRes.aErrors = pErrors; + } + } + } + } catch (final Throwable t) { + showError(t); + paRes.nBehindEndOfSentencePosition = paraText.length(); + } + } + return paRes; + } + + synchronized private String getSentence(final String paraText, + final int startPos) { + if 
(paraText.equals(currentPara) && tokenizedSentences != null) { + int i = 0; + int index = -1; + while (index < startPos && i < tokenizedSentences.size()) { + index += tokenizedSentences.get(i).length(); + if (index < startPos) { + i++; + } + } + position = index + 1; + if (i < tokenizedSentences.size()) { + position -= tokenizedSentences.get(i).length(); + return tokenizedSentences.get(i); + } + return ""; + } + currentPara = paraText; + tokenizedSentences = langTool.sentenceTokenize(paraText); + position = 0; + if (!tokenizedSentences.isEmpty()) { + return tokenizedSentences.get(0); + } + return ""; + } + + synchronized private SingleProofreadingError[] checkParaRules( + final String paraText, final Locale locale, final int startPos, + final int endPos, final String docID) { + if (startPos == 0) { + try { + paragraphMatches = langTool.check(paraText, false, + JLanguageTool.paragraphHandling.ONLYPARA); + this.docID = docID; + } catch (final Throwable t) { + showError(t); + } + } + if (paragraphMatches != null && !paragraphMatches.isEmpty() + && docID.equals(this.docID)) { + final List<SingleProofreadingError> errorList = new ArrayList<SingleProofreadingError>( + paragraphMatches.size()); + for (final RuleMatch myRuleMatch : paragraphMatches) { + final int startErrPos = myRuleMatch.getFromPos(); + final int endErrPos = myRuleMatch.getToPos(); + if (startErrPos >= startPos && startErrPos < endPos + && endErrPos >= startPos && endErrPos < endPos) { + errorList.add(createOOoError(myRuleMatch, 0)); + } + } + if (!errorList.isEmpty()) { + final SingleProofreadingError[] errorArray = errorList.toArray(new SingleProofreadingError[errorList.size()]); + Arrays.sort(errorArray, new ErrorPositionComparator()); + return errorArray; + } + } + return null; + } + + /** + * Creates a SingleGrammarError object for use in OOo. 
+ * @param myMatch + * ruleMatch - LT rule match + * + * @return SingleGrammarError - object for OOo checker integration + */ + private SingleProofreadingError createOOoError(final RuleMatch myMatch, + final int startIndex) { + final SingleProofreadingError aError = new SingleProofreadingError(); + aError.nErrorType = com.sun.star.text.TextMarkupType.PROOFREADING; + // the API currently has no support for formatting text in comments + final String comment = myMatch.getMessage() + .replaceAll("<suggestion>", "\"").replaceAll("</suggestion>", "\"") + .replaceAll("([\r]*\n)", " "); // convert line ends to spaces + aError.aFullComment = comment; + // not all rules have short comments + if (!StringTools.isEmpty(myMatch.getShortMessage())) { + aError.aShortComment = myMatch.getShortMessage(); + } else { + aError.aShortComment = aError.aFullComment; + } + aError.aSuggestions = myMatch.getSuggestedReplacements().toArray( + new String[myMatch.getSuggestedReplacements().size()]); + aError.nErrorStart = myMatch.getFromPos() + startIndex; + aError.nErrorLength = myMatch.getToPos() - myMatch.getFromPos(); + aError.aRuleIdentifier = myMatch.getRule().getId(); + aError.aProperties = new PropertyValue[0]; + return aError; + } + + /** + * LT does not support spell-checking, so we return false. + * + * @return false + */ + public final boolean isSpellChecker() { + return false; + } + + /** + * Runs LT options dialog box. + **/ + public final void runOptionsDialog() { + final Language lang = getLanguage(); + if (lang == null) { + return; + } + final ConfigThread configThread = new ConfigThread(lang, config, this); + configThread.start(); + } + + /** + * @return An array of Locales supported by LT. 
+   */
+  public final Locale[] getLocales() {
+    try {
+      // one Locale per (language, country variant) pair
+      final List<Locale> locales = new ArrayList<Locale>();
+      for (final Language language : Language.LANGUAGES) {
+        for (final String country : language.getCountryVariants()) {
+          locales.add(new Locale(language.getShortName(), country, ""));
+        }
+      }
+      return locales.toArray(new Locale[locales.size()]);
+    } catch (final Throwable t) {
+      showError(t);
+      return new Locale[0];
+    }
+  }
+
+  /**
+   * Tells whether one of LT's languages has the short name given in the
+   * locale's language field. Errors are reported via showError and
+   * answered with false.
+   *
+   * @param locale
+   *          The Locale to check.
+   * @return true if LT supports the language of a given locale.
+   */
+  public final boolean hasLocale(final Locale locale) {
+    boolean supported = false;
+    try {
+      for (final Language language : Language.LANGUAGES) {
+        if (language.getShortName().equals(locale.Language)) {
+          supported = true;
+          break;
+        }
+      }
+    } catch (final Throwable t) {
+      showError(t);
+    }
+    return supported;
+  }
+
+  /**
+   * Registers a listener so that the document can be re-checked after the
+   * options were changed in the configuration dialog box.
+   *
+   * @param xLinEvLis
+   *          - the listener to be added
+   * @return true if listener is non-null and has been added, false otherwise.
+   */
+  public final boolean addLinguServiceEventListener(
+      final XLinguServiceEventListener xLinEvLis) {
+    if (xLinEvLis != null) {
+      xEventListeners.add(xLinEvLis);
+      return true;
+    }
+    return false;
+  }
+
+  /**
+   * Unregisters a listener from the event listeners list.
+   *
+   * @param xLinEvLis
+   *          - the listener to be removed
+   * @return true if listener is non-null and has been removed, false otherwise.
+   */
+  public final boolean removeLinguServiceEventListener(
+      final XLinguServiceEventListener xLinEvLis) {
+    if (xLinEvLis == null || !xEventListeners.contains(xLinEvLis)) {
+      return false;
+    }
+    xEventListeners.remove(xLinEvLis);
+    return true;
+  }
+
+  /**
+   * Inform listener (grammar checking iterator) that options have changed and
+   * the doc should be rechecked.
+   *
+   */
+  public final void resetDocument() {
+    if (!xEventListeners.isEmpty()) {
+      for (final XLinguServiceEventListener xEvLis : xEventListeners) {
+        if (xEvLis != null) {
+          final com.sun.star.linguistic2.LinguServiceEvent xEvent = new com.sun.star.linguistic2.LinguServiceEvent();
+          xEvent.nEvent = com.sun.star.linguistic2.LinguServiceEventFlags.PROOFREAD_AGAIN;
+          xEvLis.processLinguServiceEvent(xEvent);
+        }
+      }
+      recheck = true;
+      disabledRules = config.getDisabledRuleIds();
+      if (disabledRules == null) {
+        disabledRules = new HashSet<String>();
+      }
+    }
+  }
+
+  /** @return the same names as {@link #getServiceNames()} */
+  public String[] getSupportedServiceNames() {
+    return getServiceNames();
+  }
+
+  /** @return the SERVICE_NAMES array itself (not a copy) */
+  public static String[] getServiceNames() {
+    return SERVICE_NAMES;
+  }
+
+  /**
+   * @param sServiceName name to look up
+   * @return true if the name equals one of the entries of SERVICE_NAMES
+   */
+  public boolean supportsService(final String sServiceName) {
+    for (final String sName : SERVICE_NAMES) {
+      if (sServiceName.equals(sName)) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  /** @return the fully qualified name of this class */
+  public String getImplementationName() {
+    return Main.class.getName();
+  }
+
+  /**
+   * @param sImplName requested implementation name
+   * @return a new SingletonFactory if the name is this class's name,
+   *         null otherwise
+   */
+  public static XSingleComponentFactory __getComponentFactory(
+      final String sImplName) {
+    SingletonFactory xFactory = null;
+    if (sImplName.equals(Main.class.getName())) {
+      xFactory = new SingletonFactory();
+    }
+    return xFactory;
+  }
+
+  public static boolean __writeRegistryServiceInfo(final XRegistryKey regKey) {
+    return Factory.writeRegistryServiceInfo(Main.class.getName(), Main
+        .getServiceNames(), regKey);
+  }
+
+  /**
+   * Dispatches a named event: "configure" opens the options dialog,
+   * "about" opens the about dialog; anything else is logged to stderr.
+   * Does nothing if the Java version check fails.
+   *
+   * @param sEvent name of the event
+   */
+  public void trigger(final String sEvent) {
+    if (!javaVersionOkay()) {
+      return;
+    }
+    try {
+      if ("configure".equals(sEvent)) {
+        runOptionsDialog();
+      } else if ("about".equals(sEvent)) {
+        final AboutDialogThread aboutThread = new AboutDialogThread(MESSAGES);
+        aboutThread.start();
+      } else {
+        System.err.println("Sorry, don't know what to do, sEvent = " + sEvent);
+      }
+    } catch (final Throwable e) {
+      showError(e);
+    }
+  }
+
+  /**
+   * Shows an error dialog and returns false for Java versions 1.0 to 1.4;
+   * otherwise tries to switch to the Nimbus look and feel and returns true.
+   */
+  private boolean javaVersionOkay() {
+    final String version = System.getProperty("java.version");
+    if (version != null
+        && (version.startsWith("1.0") || version.startsWith("1.1")
+            || version.startsWith("1.2") || version.startsWith("1.3") || version
+            .startsWith("1.4"))) {
+      final DialogThread dt = new DialogThread(
+          "Error: LanguageTool requires Java 1.5 or later. Current version: "
+              + version);
+      dt.start();
+      return false;
+    }
+    try {
+      for (UIManager.LookAndFeelInfo info : UIManager
+          .getInstalledLookAndFeels()) {
+        if ("Nimbus".equals(info.getName())) {
+          UIManager.setLookAndFeel(info.getClassName());
+          break;
+        }
+      }
+    } catch (Exception ex) {
+      // Well, what can we do...
+    }
+
+    return true;
+  }
+
+  /**
+   * Shows the given error, its stack trace and some OS/JVM information in a
+   * message dialog running in its own thread. Never rethrows.
+   *
+   * @param e the error to report
+   */
+  static void showError(final Throwable e) {
+    // StringBuilder instead of repeated String += inside the loop
+    final StringBuilder msg = new StringBuilder();
+    msg.append("An error has occurred in LanguageTool ").append(JLanguageTool.VERSION)
+        .append(":\n").append(e.toString()).append("\nStacktrace:\n");
+    for (final StackTraceElement element : e.getStackTrace()) {
+      msg.append(element.toString()).append("\n");
+    }
+    msg.append("OS: ").append(System.getProperty("os.name"))
+        .append(" on ").append(System.getProperty("os.arch"))
+        .append(", Java version ").append(System.getProperty("java.vm.version"))
+        .append(" from ").append(System.getProperty("java.vm.vendor"));
+    final DialogThread dt = new DialogThread(msg.toString());
+    dt.start();
+    // e.printStackTrace();
+    // OOo crashes when we throw an Exception :-(
+    // throw new RuntimeException(e);
+  }
+
+  /**
+   * @return the directory named by the "user.home" system property
+   */
+  private File getHomeDir() {
+    final String homeDir = System.getProperty("user.home");
+    if (homeDir == null) {
+      @SuppressWarnings({"ThrowableInstanceNeverThrown"})
+      final RuntimeException ex = new RuntimeException("Could not get home directory");
+      showError(ex);
+    }
+    // NOTE(review): with homeDir == null this still throws a
+    // NullPointerException right after the dialog was requested; a fallback
+    // directory should be chosen together with the callers.
+    return new File(homeDir);
+  }
+
+  /** Thread that shows the about dialog on top of the current document window. */
+  private class AboutDialogThread extends Thread {
+
+    private final ResourceBundle messages;
+
+    AboutDialogThread(final ResourceBundle messages) {
+      this.messages = messages;
+    }
+
+    @Override
+    public void run() {
+      final XModel model = (XModel) UnoRuntime.queryInterface(XModel.class,
+          getxComponent());
+      final XWindow parentWindow = model.getCurrentController().getFrame()
+          .getContainerWindow();
+      final XWindowPeer parentWindowPeer = (XWindowPeer) UnoRuntime
+          .queryInterface(XWindowPeer.class, parentWindow);
+      final OOoAboutDialog about = new OOoAboutDialog(messages,
+          parentWindowPeer);
+      about.show();
+    }
+  }
+
+  /**
+   * Adds the rule to the UI-disabled rules, saves the configuration and
+   * requests a recheck.
+   *
+   * @param ruleId ID of the rule to ignore
+   * @param locale currently not used (see TODO)
+   */
+  public void ignoreRule(final String ruleId, final Locale locale)
+      throws IllegalArgumentException {
+    // TODO: config should be locale-dependent
+    disabledRulesUI.add(ruleId);
+    config.setDisabledRuleIds(disabledRulesUI);
+    try {
+      config.saveConfiguration();
+    } catch (final Throwable t) {
+      showError(t);
+    }
+    recheck = true;
+  }
+
+  /**
+   * Called on rechecking the document - resets the ignore status for rules that
+   * was set in the spelling dialog box or in the context menu.
+   *
+   * The rules disabled in the config dialog box are left intact.
+   */
+  public void resetIgnoreRules() {
+    config.setDisabledRuleIds(disabledRules);
+    try {
+      config.saveConfiguration();
+    } catch (final Throwable t) {
+      showError(t);
+    }
+    recheck = true;
+  }
+
+  /** @return the fixed display name "LanguageTool", regardless of locale */
+  public String getServiceDisplayName(Locale locale) {
+    return "LanguageTool";
+  }
+
+}
+
+/**
+ * A simple comparator for sorting errors by their position.
+ *
+ * Errors without suggestions are placed after errors that have some; ties
+ * are broken by start position, then by number of suggestions, then by
+ * rule identifier.
+ */
+class ErrorPositionComparator implements Comparator<SingleProofreadingError> {
+
+  public int compare(final SingleProofreadingError match1,
+      final SingleProofreadingError match2) {
+    final int count1 = match1.aSuggestions.length;
+    final int count2 = match2.aSuggestions.length;
+    if (count1 == 0 && count2 > 0) {
+      return 1;
+    }
+    if (count2 == 0 && count1 > 0) {
+      return -1;
+    }
+    final int start1 = match1.nErrorStart;
+    final int start2 = match2.nErrorStart;
+    if (start1 != start2) {
+      return start1 > start2 ? 1 : -1;
+    }
+    // same position: fewer suggestions first (only if both have some)
+    if (count1 != 0 && count2 != 0 && count1 != count2) {
+      return count1 < count2 ? -1 : 1;
+    }
+    return match1.aRuleIdentifier.compareTo(match2.aRuleIdentifier);
+  }
+}
+
+/** Thread that shows a text in a Swing message dialog without a parent. */
+class DialogThread extends Thread {
+  private final String text;
+
+  DialogThread(final String text) {
+    this.text = text;
+  }
+
+  @Override
+  public void run() {
+    JOptionPane.showMessageDialog(null, text);
+  }
+}
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/openoffice/OOoAboutDialog.java b/JLanguageTool/src/java/de/danielnaber/languagetool/openoffice/OOoAboutDialog.java
new file mode 100644
index 0000000..35fbb2c
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/openoffice/OOoAboutDialog.java
@@ -0,0 +1,64 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2007 Daniel Naber (http://www.danielnaber.de)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+package de.danielnaber.languagetool.openoffice;
+
+import com.sun.star.awt.Rectangle;
+import com.sun.star.awt.XMessageBox;
+import com.sun.star.awt.XMessageBoxFactory;
+import com.sun.star.awt.XWindowPeer;
+import com.sun.star.uno.UnoRuntime;
+import de.danielnaber.languagetool.gui.AboutDialog;
+import de.danielnaber.languagetool.tools.StringTools;
+
+import java.util.ResourceBundle;
+
+/**
+ * Dialog that displays version and copyright information in a message
+ * box created through the toolkit of a given window peer.
+ *
+ * @author Marcin Miłkowski
+ */
+public class OOoAboutDialog extends AboutDialog {
+
+  private final XWindowPeer winPeer;
+
+  /**
+   * @param messages resource bundle passed on to the superclass
+   * @param parentWindowPeer window peer used as parent of the message box
+   */
+  public OOoAboutDialog(final ResourceBundle messages,
+      final XWindowPeer parentWindowPeer) {
+    super(messages);
+    winPeer = parentWindowPeer;
+  }
+
+  /** Creates an "infobox" message box with the about text and executes it. */
+  @Override
+  public void show() {
+    final String title = StringTools.getLabel(messages.getString("guiMenuAbout"));
+    final Object toolkit = winPeer.getToolkit();
+    final XMessageBoxFactory factory = (XMessageBoxFactory) UnoRuntime
+        .queryInterface(XMessageBoxFactory.class, toolkit);
+    final XMessageBox aboutBox = factory.createMessageBox(winPeer,
+        new Rectangle(), "infobox", 0, title, getAboutText());
+    aboutBox.execute();
+  }
+
+}
\ No newline at end of file
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/openoffice/SingletonFactory.java b/JLanguageTool/src/java/de/danielnaber/languagetool/openoffice/SingletonFactory.java
new file mode 100644
index 0000000..ba43cbf
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/openoffice/SingletonFactory.java
@@ -0,0 +1,48 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2006 Daniel Naber (http://www.danielnaber.de)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+package de.danielnaber.languagetool.openoffice;
+
+import com.sun.star.lang.XSingleComponentFactory;
+import com.sun.star.uno.XComponentContext;
+
+/**
+ * This class is a factory that creates only a single instance,
+ * or a singleton, of the Main class. Used for performance
+ * reasons and to allow various parts of code to interact.
+ *
+ * @author Marcin Miłkowski
+ */
+public class SingletonFactory implements XSingleComponentFactory {
+
+  /** The one Main instance handed out by this factory; created on first use. */
+  private transient Main instance;
+
+  /** Ignores the arguments and delegates to {@link #createInstanceWithContext}. */
+  public final Object createInstanceWithArgumentsAndContext(final Object[] arguments,
+      final XComponentContext xContext) throws com.sun.star.uno.Exception {
+    return createInstanceWithContext(xContext);
+  }
+
+  /**
+   * Creates the Main instance on the first call; on later calls the
+   * existing instance is given the new context and returned again.
+   */
+  public final Object createInstanceWithContext(final XComponentContext xContext) throws com.sun.star.uno.Exception {
+    if (instance != null) {
+      instance.changeContext(xContext);
+      return instance;
+    }
+    instance = new Main(xContext);
+    return instance;
+  }
+}
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/AbstractCompoundRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/AbstractCompoundRule.java
new file mode 100644
index 0000000..8ef9119
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/AbstractCompoundRule.java
@@ -0,0 +1,279 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2006 Daniel Naber (http://www.danielnaber.de)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+package de.danielnaber.languagetool.rules;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Queue;
+import java.util.ResourceBundle;
+import java.util.Set;
+import java.util.concurrent.ArrayBlockingQueue;
+
+import de.danielnaber.languagetool.AnalyzedSentence;
+import de.danielnaber.languagetool.AnalyzedToken;
+import de.danielnaber.languagetool.AnalyzedTokenReadings;
+import de.danielnaber.languagetool.tools.StringTools;
+
+/**
+ * Checks that compounds (if in the list) are not written as separate words.
+ * The list is filled with {@link #loadCompoundFile(InputStream, String)}.
+ *
+ * @author Daniel Naber & Marcin Miłkowski (refactoring)
+ */
+
+public abstract class AbstractCompoundRule extends Rule {
+
+  /** Maximum number of parts a compound in the list may have. */
+  private static final int MAX_TERMS = 5;
+
+  /** All listed compounds, lowercased, with spaces between the parts. */
+  private final Set<String> incorrectCompounds = new HashSet<String>();
+  /** Compounds marked with a trailing "+" in the list file. */
+  private final Set<String> noDashSuggestion = new HashSet<String>();
+  /** Compounds marked with a trailing "*" in the list file. */
+  private final Set<String> onlyDashSuggestion = new HashSet<String>();
+
+  private String withHyphenMessage;
+  private String asOneMessage;
+  private String withOrWithoutHyphenMessage;
+
+  private String shortDesc;
+
+  /** Compounds with more than maxUnHyphenatedWordCount parts should always use hyphens */
+  private int maxUnHyphenatedWordCount = 2;
+
+  /** Flag to indicate if the hyphen is ignored in the text entered by the user.
+   * Set this to false if you want the rule to offer suggestions for words like [ro] "câte-și-trei" (with hyphen), not only for "câte și trei" (with spaces)
+   * This is only available for languages with hyphen as a word separator (ie: not available for english, available for Romanian)
+   * See Language.getWordTokenizer()
+   */
+  private boolean hyphenIgnored = true;
+
+  /**
+   * @param messages bundle providing "category_misc"; if null no category is set
+   */
+  public AbstractCompoundRule(final ResourceBundle messages) throws IOException {
+    if (messages != null)
+      super.setCategory(new Category(messages.getString("category_misc")));
+  }
+
+  public abstract String getId();
+
+  public abstract String getDescription();
+
+  /** Sets the short description passed to every RuleMatch of this rule. */
+  public void setShort(final String shortDescription) {
+    shortDesc = shortDescription;
+  }
+
+  /**
+   * Sets the three messages used by {@link #match(AnalyzedSentence)}.
+   *
+   * @param withHyphenMessage used when only the hyphenated form is suggested
+   * @param asOneMessage used when only the merged form is suggested
+   * @param withHyphenOrNotMessage used when both forms are suggested
+   */
+  public void setMsg(final String withHyphenMessage, final String asOneMessage, final String withHyphenOrNotMessage) {
+    this.withHyphenMessage = withHyphenMessage;
+    this.asOneMessage = asOneMessage;
+    withOrWithoutHyphenMessage = withHyphenOrNotMessage;
+  }
+
+  public boolean isHyphenIgnored() {
+    return hyphenIgnored;
+  }
+
+  public void setHyphenIgnored(boolean ignoreHyphen) {
+    this.hyphenIgnored = ignoreHyphen;
+  }
+
+  public int getMaxUnHyphenatedWordCount() {
+    return maxUnHyphenatedWordCount;
+  }
+
+  public void setMaxUnHyphenatedWordCount(int maxNoHyphensSize) {
+    this.maxUnHyphenatedWordCount = maxNoHyphensSize;
+  }
+
+  /**
+   * Slides a window of up to MAX_TERMS previous tokens over the sentence and
+   * reports every token sequence that is contained in the compound list.
+   */
+  public RuleMatch[] match(final AnalyzedSentence text) {
+    final List<RuleMatch> ruleMatches = new ArrayList<RuleMatch>();
+    final AnalyzedTokenReadings[] tokens = text.getTokensWithoutWhitespace();
+
+    RuleMatch prevRuleMatch = null;
+    final Queue<AnalyzedTokenReadings> prevTokens = new ArrayBlockingQueue<AnalyzedTokenReadings>(MAX_TERMS);
+    for (int i = 0; i < tokens.length + MAX_TERMS-1; i++) {
+      AnalyzedTokenReadings token = null;
+      // we need to extend the token list so we find matches at the end of the original list:
+      if (i >= tokens.length)
+        token = new AnalyzedTokenReadings(new AnalyzedToken("", "", null), prevTokens.peek().getStartPos());
+      else
+        token = tokens[i];
+      if (i == 0) {
+        addToQueue(token, prevTokens);
+        continue;
+      }
+
+      // build all prefixes (two tokens and longer) of the current window
+      final StringBuilder sb = new StringBuilder();
+      int j = 0;
+      AnalyzedTokenReadings firstMatchToken = null;
+      final List<String> stringsToCheck = new ArrayList<String>();
+      final List<String> origStringsToCheck = new ArrayList<String>(); // original upper/lowercase spelling
+      final Map<String, AnalyzedTokenReadings> stringToToken = new HashMap<String, AnalyzedTokenReadings>();
+      for (AnalyzedTokenReadings atr : prevTokens) {
+        if (j == 0)
+          firstMatchToken = atr;
+        sb.append(' ');
+        sb.append(atr.getToken());
+        if (j >= 1) {
+          final String stringToCheck = normalize(sb.toString());
+          stringsToCheck.add(stringToCheck);
+          origStringsToCheck.add(sb.toString().trim());
+          if (!stringToToken.containsKey(stringToCheck))
+            stringToToken.put(stringToCheck, atr);
+        }
+        j++;
+      }
+      // iterate backwards over all potentially incorrect strings to make
+      // sure we match longer strings first:
+      for (int k = stringsToCheck.size()-1; k >= 0; k--) {
+        final String stringToCheck = stringsToCheck.get(k);
+        final String origStringToCheck = origStringsToCheck.get(k);
+        if (incorrectCompounds.contains(stringToCheck)) {
+          final AnalyzedTokenReadings atr = stringToToken.get(stringToCheck);
+          String msg = null;
+          final List<String> replacement = new ArrayList<String>();
+          if (!noDashSuggestion.contains(stringToCheck)) {
+            replacement.add(origStringToCheck.replace(' ', '-'));
+            msg = withHyphenMessage;
+          }
+          // assume that compounds with more than maxUnHyphenatedWordCount (default: two) parts should always use hyphens:
+          if (!hasAllUppercaseParts(origStringToCheck) && countParts(stringToCheck) <= getMaxUnHyphenatedWordCount()
+              && !onlyDashSuggestion.contains(stringToCheck)) {
+            replacement.add(mergeCompound(origStringToCheck));
+            msg = asOneMessage;
+          }
+          final String[] parts = stringToCheck.split(" ");
+          if (parts.length > 0 && parts[0].length() == 1) {
+            // single-character first part: only the hyphenated form is offered
+            replacement.clear();
+            replacement.add(origStringToCheck.replace(' ', '-'));
+            msg = withHyphenMessage;
+          } else if (replacement.isEmpty() || replacement.size() == 2) { // isEmpty shouldn't happen
+            msg = withOrWithoutHyphenMessage;
+          }
+          final RuleMatch ruleMatch = new RuleMatch(this, firstMatchToken.getStartPos(),
+              atr.getStartPos() + atr.getToken().length(), msg, shortDesc);
+          // avoid duplicate matches:
+          if (prevRuleMatch != null && prevRuleMatch.getFromPos() == ruleMatch.getFromPos()) {
+            prevRuleMatch = ruleMatch;
+            break;
+          }
+          prevRuleMatch = ruleMatch;
+          ruleMatch.setSuggestedReplacements(replacement);
+          ruleMatches.add(ruleMatch);
+          break;
+        }
+      }
+      addToQueue(token, prevTokens);
+    }
+    return toRuleMatchArray(ruleMatches);
+  }
+
+  /** Trims and lowercases the string and neutralizes hyphens in multi-word strings. */
+  private String normalize(final String inStr) {
+    String str = inStr.trim().toLowerCase();
+    if (str.indexOf('-') != -1 && str.indexOf(' ') != -1) {
+      if (isHyphenIgnored()) {
+        // e.g. "E-Mail Adresse" -> "E Mail Adresse" so the error can be detected:
+        str = str.replace('-', ' ');
+      } else {
+        str = str.replace(" - ", " ");
+      }
+    }
+    return str;
+  }
+
+  /**
+   * Despite its name, this returns true as soon as ONE of the
+   * space-separated parts is all-uppercase.
+   */
+  private boolean hasAllUppercaseParts(final String str) {
+    final String[] parts = str.split(" ");
+    for (String part : parts) {
+      if (isHyphenIgnored() || !"-".equals(part)) { // do not treat '-' as an upper-case word
+        if (StringTools.isAllUppercase(part)) {
+          return true;
+        }
+      }
+    }
+    return false;
+  }
+
+  /** @return the number of space-separated parts of the string */
+  private int countParts(final String str) {
+    return str.split(" ").length;
+  }
+
+  /**
+   * Joins the space-separated parts into one word; every part but the
+   * first is lowercased.
+   */
+  private String mergeCompound(final String str) {
+    final String[] stringParts = str.split(" ");
+    final StringBuilder sb = new StringBuilder();
+    for (int k = 0; k < stringParts.length; k++) {
+      if (isHyphenIgnored() || !"-".equals(stringParts[k])) {
+        if (k == 0)
+          sb.append(stringParts[k]);
+        else
+          sb.append(stringParts[k].toLowerCase());
+      }
+    }
+    return sb.toString();
+  }
+
+  /** Appends the token to the queue, dropping the oldest entry if it is full. */
+  private void addToQueue(final AnalyzedTokenReadings token, final Queue<AnalyzedTokenReadings> prevTokens) {
+    final boolean inserted = prevTokens.offer(token);
+    if (!inserted) {
+      prevTokens.poll();
+      prevTokens.offer(token);
+    }
+  }
+
+  /**
+   * Reads the compound list. Empty lines and lines starting with "#" are
+   * skipped; hyphens are turned into spaces; a trailing "+" or "*" puts the
+   * entry into the no-dash or only-dash set, respectively. The stream is
+   * closed in all cases.
+   *
+   * @param file stream to read from
+   * @param encoding name of the character encoding of the stream
+   * @throws IOException on read errors, if an entry has more than MAX_TERMS
+   *           parts or if it has only one part
+   */
+  public void loadCompoundFile(final InputStream file, final String encoding) throws IOException {
+    InputStreamReader isr = null;
+    BufferedReader br = null;
+    try {
+      isr = new InputStreamReader(file, encoding);
+      br = new BufferedReader(isr);
+      String line;
+      while ((line = br.readLine()) != null) {
+        line = line.trim();
+        if (line.length() < 1) {
+          continue;
+        }
+        if (line.charAt(0) == '#') { // ignore comments
+          continue;
+        }
+        // the set contains the incorrect spellings, i.e. the ones without hyphen
+        line = line.replace('-', ' ');
+        final String[] parts = line.split(" ");
+        if (parts.length > MAX_TERMS)
+          throw new IOException("Too many compound parts: " + line + ", maximum allowed: " + MAX_TERMS);
+        if (parts.length == 1)
+          throw new IOException("Not a compound: " + line);
+        if (line.endsWith("+")) {
+          line = line.substring(0, line.length() - 1); // cut off "+"
+          noDashSuggestion.add(line.toLowerCase());
+        } else if (line.endsWith("*")) {
+          line = line.substring(0, line.length() - 1); // cut off "*"
+          onlyDashSuggestion.add(line.toLowerCase());
+        }
+        incorrectCompounds.add(line.toLowerCase());
+      }
+    } finally {
+      if (br != null) {
+        br.close(); // closes the wrapped reader and stream as well
+      } else if (isr != null) {
+        isr.close();
+      } else if (file != null) {
+        // the reader could not be created (e.g. unsupported encoding):
+        // still close the stream so it does not leak
+        file.close();
+      }
+    }
+  }
+
+  public void reset() {
+  }
+
+}
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/AbstractPunctuationCheckRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/AbstractPunctuationCheckRule.java
new file mode 100644
index 0000000..89d216b
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/AbstractPunctuationCheckRule.java
@@ -0,0 +1,93 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License
as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+package de.danielnaber.languagetool.rules;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.ResourceBundle;
+
+import de.danielnaber.languagetool.AnalyzedSentence;
+import de.danielnaber.languagetool.AnalyzedTokenReadings;
+
+/**
+ * A rule that matches "..", "::", "-," but not "...", "!..", "?!!", ",-" etc.
+ * Languages will have to subclass it and override <code>isPunctsJoinOk()</code>
+ * and <code>isPunctuation()</code> to provide language-specific checking
+ *
+ * @author Andriy Rysin
+ */
+public abstract class AbstractPunctuationCheckRule extends Rule {
+
+  public AbstractPunctuationCheckRule(final ResourceBundle messages) {
+    super(messages);
+    super.setCategory(new Category(messages.getString("category_misc")));
+  }
+
+  public String getId() {
+    return "PUNCTUATION_GENERIC_CHECK";
+  }
+
+  public String getDescription() {
+    return "Use of unusual combination of punctuation characters";
+  }
+
+  /**
+   * @param tkns the concatenated run of adjacent punctuation tokens
+   * @return true if this combination is acceptable
+   */
+  protected abstract boolean isPunctsJoinOk(String tkns);
+
+  /**
+   * @param token a single token
+   * @return true if the token counts as punctuation
+   */
+  protected abstract boolean isPunctuation(String token);
+
+  /**
+   * Collects runs of adjacent punctuation tokens and reports every run of
+   * two or more characters that <code>isPunctsJoinOk()</code> rejects. The
+   * first character of the run is offered as replacement.
+   */
+  public RuleMatch[] match(final AnalyzedSentence text) {
+    final List<RuleMatch> found = new ArrayList<RuleMatch>();
+    final AnalyzedTokenReadings[] tokens = text.getTokens();
+    final int lastIdx = tokens.length - 1;
+
+    final StringBuilder run = new StringBuilder();
+    int runStart = -1;
+    for (int idx = 0; idx < tokens.length; idx++) {
+      final String current = tokens[idx].getToken();
+
+      if (isPunctuation(current)) {
+        run.append(current);
+        if (runStart == -1) {
+          runStart = idx;
+        }
+        // keep collecting unless this is the last token of the sentence
+        if (idx < lastIdx) {
+          continue;
+        }
+      }
+
+      final String joined = run.toString();
+      if (joined.length() >= 2 && !isPunctsJoinOk(joined)) {
+        final String msg = "bad duplication or combination of punctuation signs";
+        final int from = tokens[runStart].getStartPos();
+        final RuleMatch ruleMatch = new RuleMatch(this, from,
+            from + joined.length(), msg, "Punctuation problem");
+        ruleMatch.setSuggestedReplacement(joined.substring(0, 1));
+        found.add(ruleMatch);
+      }
+      run.setLength(0);
+      runStart = -1;
+    }
+
+    return toRuleMatchArray(found);
+  }
+
+  public void reset() {
+    // nothing
+  }
+
+}
\ No newline at end of file
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/AbstractSimpleReplaceRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/AbstractSimpleReplaceRule.java
new file mode 100644
index 0000000..13288a2
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/AbstractSimpleReplaceRule.java
@@ -0,0 +1,159 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+package de.danielnaber.languagetool.rules;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+import java.util.ResourceBundle;
+
+import de.danielnaber.languagetool.AnalyzedSentence;
+import de.danielnaber.languagetool.AnalyzedTokenReadings;
+import de.danielnaber.languagetool.JLanguageTool;
+import de.danielnaber.languagetool.tools.StringTools;
+
+/**
+ * A rule that matches words or phrases which should not be used and suggests
+ * correct ones instead. Loads the relevant words from
+ * <code>rules/XX/replace.txt</code>, where XX is a code of the language.
+ *
+ * @author Andriy Rysin
+ */
+public abstract class AbstractSimpleReplaceRule extends Rule {
+
+  private static final String FILE_ENCODING = "utf-8";
+
+  private Map<String, String> wrongWords; // e.g. "врешті решт" -> "зрештою"
+
+  /** @return name of the word list, resolved against the rules directory */
+  public abstract String getFileName();
+
+  /** @return the encoding used to read the word list, "utf-8" by default */
+  public String getEncoding() {
+    return FILE_ENCODING;
+  }
+
+  /**
+   * Indicates if the rule is case-sensitive. Default value is <code>true</code>.
+   * @return true if the rule is case-sensitive, false otherwise.
+   */
+  public boolean isCaseSensitive() {
+    return true;
+  }
+
+  /**
+   * @return the locale used for case conversion when {@link #isCaseSensitive()} is set to <code>false</code>.
+   */
+  public Locale getLocale() {
+    return Locale.getDefault();
+  }
+
+  /**
+   * Loads the word list right away; note that this calls the overridable
+   * {@link #getFileName()} and {@link #getEncoding()} from the constructor.
+   *
+   * @param messages bundle providing "category_misc"; if null no category is set
+   * @throws IOException if the word list cannot be read or is malformed
+   */
+  public AbstractSimpleReplaceRule(final ResourceBundle messages) throws IOException {
+    if (messages != null) {
+      super.setCategory(new Category(messages.getString("category_misc")));
+    }
+    wrongWords = loadWords(JLanguageTool.getDataBroker().getFromRulesDirAsStream(getFileName()));
+  }
+
+  public String getId() {
+    return "SIMPLE_REPLACE";
+  }
+
+  public String getDescription() {
+    return "Checks for wrong words/phrases";
+  }
+
+  /** @return text placed between the wrong word and its replacement in the message */
+  public String getSuggestion() {
+    return " is not valid, use ";
+  }
+
+  public String getShort() {
+    return "Wrong word";
+  }
+
+  /**
+   * Looks up every token except the first one in the word list and creates a
+   * match with the listed replacement for each hit. For case-insensitive
+   * rules the lookup uses the lowercased token, and the replacement gets an
+   * uppercase first character if the token started with one.
+   */
+  public final RuleMatch[] match(final AnalyzedSentence text) {
+    final List<RuleMatch> ruleMatches = new ArrayList<RuleMatch>();
+    final AnalyzedTokenReadings[] tokens = text.getTokensWithoutWhitespace();
+    final boolean caseSensitive = isCaseSensitive();
+
+    for (int i = 1; i < tokens.length; i++) {
+      final String token = tokens[i].getToken();
+      final String key = caseSensitive ? token : token.toLowerCase(getLocale());
+      final String replacement = wrongWords.get(key);
+      if (replacement == null) {
+        continue;
+      }
+      final String msg = token + getSuggestion() + replacement;
+      final int pos = tokens[i].getStartPos();
+      final RuleMatch potentialRuleMatch = new RuleMatch(this, pos, pos
+          + token.length(), msg, getShort());
+      if (!caseSensitive && StringTools.startsWithUppercase(token)) {
+        potentialRuleMatch.setSuggestedReplacement(StringTools.uppercaseFirstChar(replacement));
+      } else {
+        potentialRuleMatch.setSuggestedReplacement(replacement);
+      }
+      ruleMatches.add(potentialRuleMatch);
+    }
+    return toRuleMatchArray(ruleMatches);
+  }
+
+
+  /**
+   * Reads "wrong=right" pairs, one per line. Empty lines and lines starting
+   * with "#" are skipped. The stream is closed in all cases.
+   *
+   * @throws IOException on read errors or if a line does not split into
+   *           exactly two parts at "="
+   */
+  private Map<String, String> loadWords(final InputStream file) throws IOException {
+    final Map<String, String> map = new HashMap<String, String>();
+    InputStreamReader isr = null;
+    BufferedReader br = null;
+    try {
+      isr = new InputStreamReader(file, getEncoding());
+      br = new BufferedReader(isr);
+      String line;
+
+      while ((line = br.readLine()) != null) {
+        line = line.trim();
+        if (line.length() < 1) {
+          continue;
+        }
+        if (line.charAt(0) == '#') { // ignore comments
+          continue;
+        }
+        final String[] parts = line.split("=");
+        if (parts.length != 2) {
+          throw new IOException("Format error in file "
+              + JLanguageTool.getDataBroker().getFromRulesDirAsUrl(getFileName()) + ", line: " + line);
+        }
+        map.put(parts[0], parts[1]);
+      }
+
+    } finally {
+      if (br != null) {
+        br.close(); // closes the wrapped reader and stream as well
+      } else if (isr != null) {
+        isr.close();
+      } else if (file != null) {
+        // the reader could not be created (e.g. unsupported encoding):
+        // still close the stream so it does not leak
+        file.close();
+      }
+    }
+    return map;
+  }
+
+  public void reset() {
+  }
+
+}
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/Category.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/Category.java
new file mode 100644
index 0000000..95a3b44
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/Category.java
@@ -0,0 +1,85 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2006 Daniel Naber (http://www.danielnaber.de)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your
/**
 * A named group of rules. Categories exist so that rules can be presented
 * in groups; every category carries a priority between 0 and 100 and may be
 * flagged as switched off by default.
 *
 * @author Daniel Naber
 */
public class Category {

  private static final int DEFAULT_PRIORITY = 50;

  private final String name;
  private final int priority;
  private boolean defaultOff;

  /**
   * Create a new category with the default priority (50).
   * @param name name of the category
   */
  public Category(final String name) {
    this(name, DEFAULT_PRIORITY);
  }

  /**
   * Create a new category with the given name and priority.
   * @param name name of the category
   * @param priority a value between 0 and 100 (inclusive)
   * @throws IllegalArgumentException if the priority is outside 0..100
   */
  public Category(final String name, final int priority) {
    final boolean inRange = priority >= 0 && priority <= 100;
    if (!inRange) {
      throw new IllegalArgumentException("priority must be in range 0 - 100");
    }
    this.name = name;
    this.priority = priority;
  }

  /**
   * @return the category name as passed to the constructor.
   */
  public String getName() {
    return name;
  }

  /**
   * @return the category priority (0..100).
   */
  public int getPriority() {
    return priority;
  }

  /**
   * @return the name followed by the priority, e.g. <code>Misc(prio=50)</code>.
   */
  @Override
  public String toString() {
    final StringBuilder text = new StringBuilder();
    text.append(name);
    text.append("(prio=");
    text.append(priority);
    text.append(")");
    return text.toString();
  }

  /**
   * Checks whether the category has been turned off
   * by default by the category author.
   * @return True if the category is turned off by
   * default.
   */
  public final boolean isDefaultOff() {
    return defaultOff;
  }

  /**
   * Marks the category as turned off by default. There is no way to undo
   * this on an existing instance.
   */
  public final void setDefaultOff() {
    defaultOff = true;
  }

}
+ * + * @author Daniel Naber + */ + +public class CommaWhitespaceRule extends Rule { + + public CommaWhitespaceRule(final ResourceBundle messages) { + super(messages); + super.setCategory(new Category(messages.getString("category_misc"))); + } + + public final String getId() { + return "COMMA_PARENTHESIS_WHITESPACE"; + } + + public final String getDescription() { + return messages.getString("desc_comma_whitespace"); + } + + public final RuleMatch[] match(final AnalyzedSentence text) { + final List<RuleMatch> ruleMatches = new ArrayList<RuleMatch>(); + final AnalyzedTokenReadings[] tokens = text.getTokens(); + String prevToken = ""; + String prevPrevToken = ""; + boolean prevWhite = false; + int pos = 0; + int prevLen = 0; + for (int i = 0; i < tokens.length; i++) { + final String token = tokens[i].getToken(); + final boolean isWhite = tokens[i].isWhitespace() + || tokens[i].isFieldCode(); + pos += token.length(); + String msg = null; + int fixLen = 0; + String suggestionText = null; + if (isWhite && isLeftBracket(prevToken)) { + msg = messages.getString("no_space_after"); + suggestionText = prevToken; + fixLen = 1; + } else if (!isWhite && prevToken.equals(",") + && isNotQuoteOrHyphen(token) + && containsNoNumber(prevPrevToken) + && containsNoNumber(token) + && !",".equals(prevPrevToken)) { + msg = messages.getString("missing_space_after_comma"); + suggestionText = ", "; + } else if (prevWhite) { + if (isRightBracket(token)) { + msg = messages.getString("no_space_before"); + suggestionText = token; + fixLen = 1; + } else if (token.equals(",")) { + msg = messages.getString("space_after_comma"); + suggestionText = ","; + fixLen = 1; + //exception for duplicated comma (we already have another rule for that) + if (i + 1 < tokens.length + && ",".equals(tokens[i + 1].getToken())) { + msg = null; + } + } else if (token.equals(".")) { + msg = messages.getString("no_space_before_dot"); + suggestionText = "."; + fixLen = 1; + // exception case for figures such as ".5" and 
ellipsis + if (i + 1 < tokens.length + && isNumberOrDot(tokens[i + 1].getToken())) { + msg = null; + } + } + } + if (msg != null) { + final int fromPos = tokens[i - 1].getStartPos(); + final int toPos = tokens[i - 1].getStartPos() + fixLen + prevLen; + // TODO: add some good short comment here + final RuleMatch ruleMatch = new RuleMatch(this, fromPos, toPos, msg); + ruleMatch.setSuggestedReplacement(suggestionText); + ruleMatches.add(ruleMatch); + } + prevPrevToken = prevToken; + prevToken = token; + prevWhite = isWhite && !tokens[i].isFieldCode(); //OOo code before comma/dot + prevLen = tokens[i].getToken().length(); + } + + return toRuleMatchArray(ruleMatches); + } + + static boolean isNotQuoteOrHyphen(final String str) { + if (str.length() == 1) { + final char c = str.charAt(0); + if (c =='\'' || c == '-' || c == '”' + || c =='’' || c == '"' || c == '“' + || c == ',') { + return false; + } + } else { + if (""".equals(str)) { + return false; + } + return containsNoNumber(str); + } + return true; + } + + static boolean isNumberOrDot(final String str) { + final char c = str.charAt(0); + return (c == '.' 
|| Character.isDigit(c)); + } + + static boolean isLeftBracket(final String str) { + if (str.length() == 0) { + return false; + } + final char c = str.charAt(0); + return (c == '(' || c == '[' || c == '{'); + } + + static boolean isRightBracket(final String str) { + if (str.length() == 0) { + return false; + } + final char c = str.charAt(0); + return (c == ')' || c == ']' || c == '}'); + } + + static boolean containsNoNumber(final String str) { + for (int i = 0; i < str.length(); i++) { + if (Character.isDigit(str.charAt(i))) { + return false; + } + } + return true; + } + + public void reset() { + // nothing + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/DoublePunctuationRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/DoublePunctuationRule.java new file mode 100644 index 0000000..3a6a4e1 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/DoublePunctuationRule.java @@ -0,0 +1,99 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.rules; + +import java.util.ArrayList; +import java.util.List; +import java.util.ResourceBundle; + +import de.danielnaber.languagetool.AnalyzedSentence; +import de.danielnaber.languagetool.AnalyzedTokenReadings; + +/** + * A rule that matches ".." (but not "..." etc) and ",,". + * + * @author Daniel Naber + */ +public class DoublePunctuationRule extends Rule { + + public DoublePunctuationRule(final ResourceBundle messages) { + super(messages); + super.setCategory(new Category(messages.getString("category_misc"))); + } + + public final String getId() { + return "DOUBLE_PUNCTUATION"; + } + + public final String getDescription() { + return messages.getString("desc_double_punct"); + } + + public final RuleMatch[] match(final AnalyzedSentence text) { + final List<RuleMatch> ruleMatches = new ArrayList<RuleMatch>(); + final AnalyzedTokenReadings[] tokens = text.getTokens(); + int startPos = 0; + int dotCount = 0; + int commaCount = 0; + for (int i = 0; i < tokens.length; i++) { + final String token = tokens[i].getToken(); + String nextToken = null; + if (i < tokens.length - 1) { + nextToken = tokens[i + 1].getToken(); + } + if (".".equals(token)) { + dotCount++; + commaCount = 0; + startPos = tokens[i].getStartPos(); + } else if (",".equals(token)) { + commaCount++; + dotCount = 0; + startPos = tokens[i].getStartPos(); + } + if (dotCount == 2 && !".".equals(nextToken)) { + final String msg = messages.getString("two_dots"); + final int fromPos = Math.max(0, startPos - 1); + final RuleMatch ruleMatch = new RuleMatch(this, fromPos, startPos + 1, + msg, messages.getString("double_dots_short")); + ruleMatch.setSuggestedReplacement("."); + ruleMatches.add(ruleMatch); + dotCount = 0; + } else if (commaCount == 2 && 
!",".equals(nextToken)) { + final String msg = messages.getString("two_commas"); + final int fromPos = Math.max(0, startPos); + final RuleMatch ruleMatch = new RuleMatch(this, fromPos, startPos + 1, + msg, messages.getString("double_commas_short")); + ruleMatch.setSuggestedReplacement(","); + ruleMatches.add(ruleMatch); + commaCount = 0; + } + if (!".".equals(token) && !",".equals(token)) { + dotCount = 0; + commaCount = 0; + } + } + + return toRuleMatchArray(ruleMatches); + } + + public void reset() { + // nothing + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/GenericUnpairedBracketsRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/GenericUnpairedBracketsRule.java new file mode 100644 index 0000000..a2cd35c --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/GenericUnpairedBracketsRule.java @@ -0,0 +1,314 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2009 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
/**
 * Rule that finds unpaired quotes, brackets etc.
 * <p>
 * The set of symbol pairs is chosen per language in the constructor. Within
 * one sentence, opening and closing symbols are paired on
 * {@link #symbolStack}; whatever is left unpaired at the end of the sentence
 * is turned into a rule match. The rule also keeps state across sentences
 * ({@link #ruleMatchStack}, <code>endOfParagraph</code>) so that a closing
 * symbol in a later sentence can cancel the match reported for an opening
 * symbol in an earlier one; that state is cleared by {@link #reset()}.
 * 
 * @author Marcin Miłkowski
 */
public class GenericUnpairedBracketsRule extends Rule {

  /**
   * Note that there must be equal length of both arrays, and the sequence of
   * starting symbols must match exactly the sequence of ending symbols.
   */
  private static final String[] START_SYMBOLS = { "[", "(", "{", "\"", "'" };
  private static final String[] END_SYMBOLS = { "]", ")", "}", "\"", "'" };

  // The symbol pairs actually used by this instance; assigned in the
  // constructor from one of the per-language tables below. Index j of
  // startSymbols pairs with index j of endSymbols.
  protected String[] startSymbols;
  protected String[] endSymbols;

  // Slovenian
  private static final String[] SL_START_SYMBOLS = { "[", "(", "{", "„", "»", "\"" };
  private static final String[] SL_END_SYMBOLS = { "]", ")", "}", "”", "«", "\"" };

  // Slovak
  private static final String[] SK_START_SYMBOLS = { "[", "(", "{", "„", "»", "\"" };
  private static final String[] SK_END_SYMBOLS = { "]", ")", "}", "“", "«", "\"" };

  // Romanian
  private static final String[] RO_START_SYMBOLS = { "[", "(", "{", "„", "«" };
  private static final String[] RO_END_SYMBOLS = { "]", ")", "}", "”", "»" };

  // French
  private static final String[] FR_START_SYMBOLS = { "[", "(", "{", "«", /*"‘"*/ };
  private static final String[] FR_END_SYMBOLS = { "]", ")", "}", "»", /*"’" used in "d’arm" and many other words */ };

  // German
  private static final String[] DE_START_SYMBOLS = { "[", "(", "{", "„", "»", "‘" };
  private static final String[] DE_END_SYMBOLS = { "]", ")", "}", "“", "«", "’" };

  // Galician
  private static final String[] GL_START_SYMBOLS = { "[", "(", "{", "“", "«", "‘", "\"", "'" };
  private static final String[] GL_END_SYMBOLS = { "]", ")", "}", "”", "»", "’", "\"", "'" };

  // Spanish
  private static final String[] ES_START_SYMBOLS = { "[", "(", "{", "“", "«", "¿", "¡" };
  private static final String[] ES_END_SYMBOLS = { "]", ")", "}", "”", "»", "?", "!" };

  // Ukrainian
  private static final String[] UK_START_SYMBOLS = { "[", "(", "{", "„", "«" };
  private static final String[] UK_END_SYMBOLS = { "]", ")", "}", "“", "»" };

  // Dutch
  private static final String[] NL_START_SYMBOLS = { "[", "(", "{", "“", "\u2039", "\u201c", "\u201e" };
  private static final String[] NL_END_SYMBOLS = { "]", ")", "}", "”", "\u203a", "\u201d", "\u201d" };

  // Italian
  private static final String[] IT_START_SYMBOLS = { "[", "(", "{", "»", /*"‘"*/ };
  private static final String[] IT_END_SYMBOLS = { "]", ")", "}", "«", /*"’"*/ };

  // Danish
  private static final String[] DK_START_SYMBOLS = { "[", "(", "{", "\"", "”" };
  private static final String[] DK_END_SYMBOLS = { "]", ")", "}", "\"", "”" };

  /**
   * The stack for pairing symbols. It is filled while a sentence is scanned
   * and cleared again at the end of every {@link #match} call.
   */
  protected final UnsyncStack<SymbolLocator> symbolStack = new UnsyncStack<SymbolLocator>();

  /**
   * Stack of rule matches. Unlike {@link #symbolStack} it is only cleared in
   * {@link #reset()}, so it survives from one sentence to the next.
   */
  private final UnsyncStack<RuleMatchLocator> ruleMatchStack = new UnsyncStack<RuleMatchLocator>();

  // Set when the last token of a sentence reports isParaEnd(); makes the
  // next match() call start with reset().
  private boolean endOfParagraph;

  // The language given to the constructor; stored but not read anywhere in
  // this class.
  private final Language ruleLang;

  // A single punctuation character.
  private static final Pattern PUNCTUATION = Pattern.compile("\\p{Punct}");
  // A single punctuation character other than the dot.
  private static final Pattern PUNCTUATION_NO_DOT = Pattern
      .compile("[\\p{Punct}&&[^\\.]]");
  // One or two digits optionally followed by letters/apostrophes, or a
  // roman numeral (case-insensitive).
  private static final Pattern NUMERALS = Pattern
      .compile("(?i)\\d{1,2}?[a-z']*|M*(D?C{0,3}|C[DM])(L?X{0,3}|X[LC])(V?I{0,3}|I[VX])$");

  // Value of getMatchesIndex() at the start of the current match() call,
  // incremented for every match created afterwards.
  private int ruleMatchIndex;
  // Matches collected for the sentence currently being checked; recreated
  // on every match() call.
  private List<RuleMatch> ruleMatches;

  /**
   * Creates the rule and selects the symbol pairs for the given language;
   * languages without a dedicated table get {@link #START_SYMBOLS} /
   * {@link #END_SYMBOLS}. Also flags the rule as a paragraph back-tracking
   * rule via <code>setParagraphBackTrack(true)</code>.
   *
   * @param messages bundle providing <code>category_misc</code>,
   *        <code>desc_unpaired_brackets</code> and
   *        <code>unpaired_brackets</code>
   * @param language the language whose symbol table is used
   */
  public GenericUnpairedBracketsRule(final ResourceBundle messages,
      final Language language) {
    super(messages);
    super.setCategory(new Category(messages.getString("category_misc")));

    setParagraphBackTrack(true);
    if (language.equals(Language.SLOVAK)) {
      startSymbols = SK_START_SYMBOLS;
      endSymbols = SK_END_SYMBOLS;
    } else if (language.equals(Language.SLOVENIAN)) {
      startSymbols = SL_START_SYMBOLS;
      endSymbols = SL_END_SYMBOLS;
    } else if (language.equals(Language.FRENCH)) {
      startSymbols = FR_START_SYMBOLS;
      endSymbols = FR_END_SYMBOLS;
    } else if (language.equals(Language.GERMAN)) {
      startSymbols = DE_START_SYMBOLS;
      endSymbols = DE_END_SYMBOLS;
    } else if (language.equals(Language.GALICIAN)) {
      startSymbols = GL_START_SYMBOLS;
      endSymbols = GL_END_SYMBOLS;
    } else if (language.equals(Language.DUTCH)) {
      startSymbols = NL_START_SYMBOLS;
      endSymbols = NL_END_SYMBOLS;
    } else if (language.equals(Language.SPANISH)) {
      startSymbols = ES_START_SYMBOLS;
      endSymbols = ES_END_SYMBOLS;
    } else if (language.equals(Language.UKRAINIAN)) {
      startSymbols = UK_START_SYMBOLS;
      endSymbols = UK_END_SYMBOLS;
    } else if (language.equals(Language.ITALIAN)) {
      startSymbols = IT_START_SYMBOLS;
      endSymbols = IT_END_SYMBOLS;
    } else if (language.equals(Language.ROMANIAN)) {
      startSymbols = RO_START_SYMBOLS;
      endSymbols = RO_END_SYMBOLS;
    } else if (language.equals(Language.DANISH)) {
      startSymbols = DK_START_SYMBOLS;
      endSymbols = DK_END_SYMBOLS;
    } else {
      startSymbols = START_SYMBOLS;
      endSymbols = END_SYMBOLS;
    }

    ruleLang = language;
  }

  public String getId() {
    return "UNPAIRED_BRACKETS";
  }

  public String getDescription() {
    return messages.getString("desc_unpaired_brackets");
  }

  /**
   * Generic method to specify an exception. For unspecified
   * language, it simply returns true, which means no exception.
   * Subclasses can override it to make {@link #match} ignore a symbol.
   * @param token
   *          String token
   * @param tokens
   *          Sentence tokens
   * @param i
   *          Current token index
   * @param j
   *          Index of the symbol pair (in startSymbols/endSymbols) that the
   *          token was matched against
   * @param precSpace
   *          boolean: is preceded with space
   * @param follSpace
   *          boolean: is followed with space
   * @return true if the token should be handled normally, false if it is an
   *         exception and must be skipped
   */
  protected boolean isNoException(final String token,
      final AnalyzedTokenReadings[] tokens, final int i, final int j,
      final boolean precSpace,
      final boolean follSpace) {
    return true;
  }

  /**
   * Pairs the symbols of one sentence and reports those left unpaired.
   * <p>
   * If the previous sentence ended a paragraph, the cross-sentence state is
   * reset first. Unpaired symbols are passed to {@link #createMatch}, which
   * may suppress the new match and cancel an earlier one instead.
   *
   * @param text a pre-analyzed sentence
   * @return the matches created for this sentence
   */
  public final RuleMatch[] match(final AnalyzedSentence text) {
    ruleMatches = new ArrayList<RuleMatch>();
    final AnalyzedTokenReadings[] tokens = text.getTokensWithoutWhitespace();

    if (endOfParagraph) {
      reset();
    }

    // number of matches already recorded on the Rule for earlier sentences
    ruleMatchIndex = getMatchesIndex();

    // index 0 is skipped; tokens[i - 1] is therefore always available
    for (int i = 1; i < tokens.length; i++) {
      for (int j = 0; j < startSymbols.length; j++) {

        final String token = tokens[i].getToken();
        if (token.equals(startSymbols[j]) || token.equals(endSymbols[j])) {
          // For pairs whose opening and closing symbol are identical (e.g.
          // the straight double quote) the surrounding context decides
          // whether the token opens or closes; for all other pairs both
          // flags stay true.
          boolean precededByWhitespace = true;
          if (startSymbols[j].equals(endSymbols[j])) {
            precededByWhitespace = tokens[i - 1].isSentStart()
                || tokens[i].isWhitespaceBefore()
                || PUNCTUATION_NO_DOT.matcher(tokens[i - 1].getToken())
                    .matches();
          }

          boolean followedByWhitespace = true;
          if (i < tokens.length - 1 && startSymbols[j].equals(endSymbols[j])) {
            followedByWhitespace = tokens[i + 1].isWhitespaceBefore()
                || PUNCTUATION.matcher(tokens[i + 1].getToken()).matches();
          }

          final boolean noException = isNoException(token, tokens, i, j,
              precededByWhitespace, followedByWhitespace);

          if (noException && precededByWhitespace
              && token.equals(startSymbols[j])) {
            // opening symbol: remember it together with its token index
            symbolStack.push(new SymbolLocator(startSymbols[j], i));
          } else if (noException && followedByWhitespace
              && token.equals(endSymbols[j])) {
            if (i > 1 && endSymbols[j].equals(")")
                && (NUMERALS.matcher(tokens[i - 1].getToken()).matches()
                    && !(!symbolStack.empty()
                        && "(".equals(symbolStack.peek().symbol)))) {
              // Intentionally empty: a ")" directly after a numeral, with no
              // "(" on top of the stack, is neither pushed nor popped -
              // presumably an enumeration marker such as "1)" or "iv)".
            } else {
              if (symbolStack.empty()) {
                // closing symbol without any open symbol
                symbolStack.push(new SymbolLocator(endSymbols[j], i));
              } else {
                if (symbolStack.peek().symbol.equals(startSymbols[j])) {
                  // closes the most recently opened symbol
                  symbolStack.pop();
                } else {
                  // does not fit the most recently opened symbol
                  symbolStack.push(new SymbolLocator(endSymbols[j], i));
                }
              }
            }
          }
        }
      }
    }
    // everything still on the stack is unpaired within this sentence
    for (final SymbolLocator sLoc : symbolStack) {
      final RuleMatch rMatch = createMatch(tokens[sLoc.index].getStartPos(),
          sLoc.symbol);
      if (rMatch != null) {
        ruleMatches.add(rMatch);
      }
    }
    symbolStack.clear();
    if (tokens[tokens.length - 1].isParaEnd()) {
      endOfParagraph = true;
    }

    return toRuleMatchArray(ruleMatches);
  }

  /**
   * Creates the match for an unpaired symbol, unless the symbol is a closing
   * symbol that pairs with the unpaired opening symbol on top of
   * {@link #ruleMatchStack}. In that case the earlier match is removed (from
   * the current list, or marked as deleted on the Rule) and no new match is
   * created.
   *
   * @param startPos start position of the symbol
   * @param symbol the unpaired symbol
   * @return the new match, or null if the symbol cancelled an earlier match
   */
  private RuleMatch createMatch(final int startPos, final String symbol) {
    if (!ruleMatchStack.empty()) {
      // only closing symbols yield an index >= 0
      final int index = findSymbolNum(symbol);
      if (index >= 0) {
        final RuleMatchLocator rLoc = ruleMatchStack.peek();
        if (rLoc.symbol.equals(startSymbols[index])) {
          // NOTE(review): myIndex was recorded against the ruleMatches list
          // of the sentence in which the locator was pushed, but ruleMatches
          // is recreated on every match() call - confirm that this branch
          // cannot remove an unrelated match of the current sentence.
          if (ruleMatches.size() > rLoc.myIndex) {
            ruleMatches.remove(rLoc.myIndex);
            ruleMatchStack.pop();
            return null;
            // if (ruleMatches.get(rLoc.myIndex).getFromPos())
          }
          // otherwise the earlier match may already be stored on the Rule
          if (isInMatches(rLoc.index)) {
            setAsDeleted(rLoc.index);
            ruleMatchStack.pop();
            return null;
          }
        }
      }
    }
    ruleMatchStack.push(new RuleMatchLocator(symbol, ruleMatchIndex,
        ruleMatches.size()));
    ruleMatchIndex++;
    return new RuleMatch(this, startPos, startPos + symbol.length(), messages
        .getString("unpaired_brackets"));
  }

  /**
   * @param ch a symbol
   * @return the index of the symbol in endSymbols, or -1 if it is not a
   *         closing symbol
   */
  private int findSymbolNum(final String ch) {
    for (int i = 0; i < endSymbols.length; i++) {
      if (ch.equals(endSymbols[i])) {
        return i;
      }
    }
    return -1;
  }

  /**
   * Reset the state information for the rule, including paragraph-level
   * information. The matches stored on the Rule are only cleared when the
   * reset was not triggered by an end of paragraph.
   */
  public final void reset() {
    ruleMatchStack.clear();
    symbolStack.clear();
    if (!endOfParagraph) {
      clearMatches();
    }
    endOfParagraph = false;
  }

}

/**
 * A symbol locator that additionally remembers where the match created for
 * the symbol sits in the per-sentence match list. The inherited
 * <code>index</code> is given the running rule-match index by
 * GenericUnpairedBracketsRule.
 */
class RuleMatchLocator extends SymbolLocator {
  // size of the per-sentence match list at the time the locator was created
  public int myIndex;

  RuleMatchLocator(final String sym, final int ind, final int myInd) {
    super(sym, ind);
    myIndex = myInd;
  }
}
/**
 * An erroneous piece of text - usually one sentence - optionally paired with
 * the corrections that would fix it.
 *
 * @since 0.9.2
 * @author Daniel Naber
 */
public class IncorrectExample {

  private final String example;
  private final List<String> corrections;

  /**
   * Creates an example without corrections.
   *
   * @param example the text that contains the error
   */
  public IncorrectExample(final String example) {
    this.example = example;
    this.corrections = null;
  }

  /**
   * Creates an example together with its corrections.
   *
   * @param example the text that contains the error
   * @param corrections the possible corrections; must not be null
   */
  public IncorrectExample(final String example, final String[] corrections) {
    this.example = example;
    this.corrections = Arrays.asList(corrections);
  }

  /**
   * Return the example that contains the error.
   */
  public String getExample() {
    return example;
  }

  /**
   * Return the possible corrections. May be null.
   */
  public List<String> getCorrections() {
    return corrections;
  }

  /**
   * @return the example, a space, and the list of corrections (the text
   *         <code>null</code> when there are none).
   */
  @Override
  public String toString() {
    final StringBuilder text = new StringBuilder();
    text.append(example).append(" ").append(corrections);
    return text.toString();
  }

}
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.rules; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.ResourceBundle; +import java.util.Set; + +import de.danielnaber.languagetool.AnalyzedSentence; +import de.danielnaber.languagetool.Language; + +/** + * Abstract rule class. A Rule describes a language error and can test whether a + * given pre-analyzed text contains that error using the {@link Rule#match} + * method. + * + * @author Daniel Naber + */ +public abstract class Rule { + + private List<String> correctExamples; + private List<IncorrectExample> incorrectExamples; + private Category category; + + /** + * If true, then the rule is turned off by default. + */ + private boolean defaultOff; + + protected ResourceBundle messages; + + /** + * Called by language-dependent rules. + */ + public Rule() { + } + + /** + * Called by language-independent rules. + */ + public Rule(final ResourceBundle messages) { + this.messages = messages; + } + + public abstract String getId(); + + public abstract String getDescription(); + + /** + * Used by paragraph rules to signal that they can remove previous rule + * matches. + */ + private boolean paragraphBackTrack; + + /** + * The final list of RuleMatches, without removed matches. + */ + private List<RuleMatch> previousMatches; + + private List<RuleMatch> removedMatches; + + /** + * Check whether the given text matches this error rule, i.e. whether the text + * contains this error. + * + * @param text + * a pre-analyzed sentence + * @return an array of RuleMatch object for each match. 
+ */ + public abstract RuleMatch[] match(AnalyzedSentence text) throws IOException; + + /** + * If a rule keeps its state over more than the check of one sentence, this + * must be implemented so the internal state is reset. It will be called + * before a new text is going to be checked. + */ + public abstract void reset(); + + /** + * Whether this rule can be used for text in the given language. + */ + public final boolean supportsLanguage(final Language language) { + final Set<String> relevantIDs = language.getRelevantRuleIDs(); + return relevantIDs != null && relevantIDs.contains(getId()); + } + + /** + * Set the examples that are correct and thus do not trigger the rule. + */ + public final void setCorrectExamples(final List<String> correctExamples) { + this.correctExamples = correctExamples; + } + + /** + * Get example sentences that are correct and thus will not match this rule. + */ + public final List<String> getCorrectExamples() { + return correctExamples; + } + + /** + * Set the examples that are incorrect and thus do trigger the rule. + */ + public final void setIncorrectExamples( + final List<IncorrectExample> incorrectExamples) { + this.incorrectExamples = incorrectExamples; + } + + /** + * Get example sentences that are incorrect and thus will match this rule. + */ + public final List<IncorrectExample> getIncorrectExamples() { + return incorrectExamples; + } + + public final Category getCategory() { + return category; + } + + public final void setCategory(final Category category) { + this.category = category; + } + + protected final RuleMatch[] toRuleMatchArray(final List<RuleMatch> ruleMatches) { + return ruleMatches.toArray(new RuleMatch[ruleMatches.size()]); + } + + public final boolean isParagraphBackTrack() { + return paragraphBackTrack; + } + + public final void setParagraphBackTrack(final boolean backTrack) { + paragraphBackTrack = backTrack; + } + + /** + * Method to add matches. 
+ * + * @param r + * RuleMatch - matched rule added by check() + */ + public final void addRuleMatch(final RuleMatch r) { + if (previousMatches == null) { + previousMatches = new ArrayList<RuleMatch>(); + } + previousMatches.add(r); + } + + /** + * Deletes (or disables) previously matched rule. + * + * @param i + * Index of the rule that should be deleted. + */ + public final void setAsDeleted(final int i) { + if (removedMatches == null) { + removedMatches = new ArrayList<RuleMatch>(); + } + removedMatches.add(previousMatches.get(i)); + } + + public final boolean isInRemoved(final RuleMatch r) { + if (removedMatches == null) { + return false; + } + return removedMatches.contains(r); + } + + public final boolean isInMatches(final int i) { + if (previousMatches == null) { + return false; + } + if (previousMatches.size() > i) { + return previousMatches.get(i) != null; + } + return false; + } + + public final void clearMatches() { + if (previousMatches != null) { + previousMatches.clear(); + } + } + + public final int getMatchesIndex() { + if (previousMatches == null) { + return 0; + } + return previousMatches.size(); + } + + public final List<RuleMatch> getMatches() { + return previousMatches; + } + + /** + * Checks whether the rule has been turned off by default by the rule author. + * + * @return True if the rule is turned off by default. + */ + public final boolean isDefaultOff() { + return defaultOff; + } + + /** + * Turns the rule by default off. 
+ **/ + public final void setDefaultOff() { + defaultOff = true; + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/RuleMatch.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/RuleMatch.java new file mode 100644 index 0000000..05746fb --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/RuleMatch.java @@ -0,0 +1,239 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.rules; + +import java.util.ArrayList; +import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import de.danielnaber.languagetool.tools.StringTools; + +/** + * A class that holds information about where a rule matches text. 
+ * + * @author Daniel Naber + */ +public class RuleMatch implements Comparable<RuleMatch> { + + private static final Pattern SUGGESTION_PATTERN = Pattern.compile("<suggestion>(.*?)</suggestion>"); + + private int fromLine = -1; + private int column = -1; + private int offset = -1; + private int endLine = -1; + private int endColumn = -1; + + private Rule rule; + private int fromPos; + private int toPos; + private String message; + // for OOo context menu + private String shortMessage; + + private List<String> suggestedReplacements = new ArrayList<String>(); + +//TODO: remove this one after all rules get their short comments in place + public RuleMatch(Rule rule, int fromPos, int toPos, String message) { + this(rule, fromPos, toPos, message, null, false); + } + + // TODO: remove this constructor? + public RuleMatch(Rule rule, int fromPos, int toPos, String message, String shortMessage) { + this(rule, fromPos, toPos, message, shortMessage, false); + } + + /** + * Creates a RuleMatch object, taking the rule that triggered + * this match, position of the match and an explanation message. + * This message is scanned for <suggestion>...</suggestion> to get suggested + * fixes for the problem detected by this rule. 
+ * + * @param startWithUppercase whether the original text at the position + * of the match start with an uppercase character + */ + public RuleMatch(Rule rule, int fromPos, int toPos, String message, String shortMessage, + boolean startWithUppercase) { + this.rule = rule; + this.fromPos = fromPos; + this.toPos = toPos; + this.message = message; + this.shortMessage = shortMessage; + // extract suggestion from <suggestion>...</suggestion> in message: + final Matcher matcher = SUGGESTION_PATTERN.matcher(message); + int pos = 0; + while (matcher.find(pos)) { + pos = matcher.end(); + String repl = matcher.group(1); + if (startWithUppercase) + repl = StringTools.uppercaseFirstChar(repl); + suggestedReplacements.add(repl); + } + } + + public Rule getRule() { + return rule; + } + + /** + * Set the line number in which the match occurs. + */ + public void setLine(final int fromLine) { + this.fromLine = fromLine; + } + + /** + * Get the line number in which the match occurs. + */ + public int getLine() { + return fromLine; + } + + /** + * Set the line number in which the match ends. + */ + public void setEndLine(final int endLine) { + this.endLine = endLine; + } + + /** + * Get the line number in which the match ends. + */ + public int getEndLine() { + return endLine; + } + + /** + * Set the column number in which the match occurs. + */ + public void setColumn(final int column) { + this.column = column; + } + + /** + * Get the column number in which the match occurs. + */ + public int getColumn() { + return column; + } + + /** + * Set the column number in which the match ends. + */ + public void setEndColumn(final int endColumn) { + this.endColumn = endColumn; + } + + /** + * Get the column number in which the match ends. + */ + public int getEndColumn() { + return endColumn; + } + + /** + * Set the character offset at which the match occurs. 
+ */ + public void setOffset(final int offset) { + this.offset = offset; + } + + /** + * Get the character offset at which the match occurs. + */ + public int getOffset() { + return offset; + } + + /** + * Position of the start of the error (in characters). + */ + public int getFromPos() { + return fromPos; + } + + /** + * Position of the end of the error (in characters). + */ + public int getToPos() { + return toPos; + } + + /** + * A human-readable explanation describing the error. + */ + public String getMessage() { + return message; + } + + /** + * A shorter human-readable explanation describing the error. + */ + public String getShortMessage() { + return shortMessage; + } + + + /** + * @see #getSuggestedReplacements() + */ + public void setSuggestedReplacement(final String replacement) { + if (replacement == null) + throw new NullPointerException("replacement might be empty but not null"); + final List<String> fixes = new ArrayList<String>(); + fixes.add(replacement); + setSuggestedReplacements(fixes); + } + + /** + * @see #getSuggestedReplacements() + */ + public void setSuggestedReplacements(final List<String> replacement) { + if (replacement == null) + throw new NullPointerException("replacement might be empty but not null"); + this.suggestedReplacements = replacement; + } + + /** + * The text fragments which might be an appropriate fix for the problem. One + * of these fragments can be used to replace the old text between getFromPos() + * to getToPos(). Text between <suggestion> and </suggestion> is + * taken as the suggested replacement. 
+ * @return List of String objects or an empty List + */ + public List<String> getSuggestedReplacements() { + return suggestedReplacements; + } + + @Override + public String toString() { + return rule.getId() + ":" + fromPos + "-" + toPos + ":" + message; + } + + public int compareTo(final RuleMatch other) { + if (other == null) + throw new ClassCastException(); + if (getFromPos() < other.getFromPos()) + return -1; + if (getFromPos() > other.getFromPos()) + return 1; + return 0; + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/UppercaseSentenceStartRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/UppercaseSentenceStartRule.java new file mode 100644 index 0000000..35ecfa4 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/UppercaseSentenceStartRule.java @@ -0,0 +1,136 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.rules; + +import java.util.ArrayList; +import java.util.List; +import java.util.ResourceBundle; + +import de.danielnaber.languagetool.AnalyzedSentence; +import de.danielnaber.languagetool.AnalyzedTokenReadings; +import de.danielnaber.languagetool.Language; + +/** + * Checks that a sentence starts with an uppercase letter. + * + * @author Daniel Naber + */ +public class UppercaseSentenceStartRule extends Rule { + + private final Language language; + + private String lastParagraphString = ""; + + public UppercaseSentenceStartRule(final ResourceBundle messages, + final Language language) { + super(messages); + super.setCategory(new Category(messages.getString("category_case"))); + this.language = language; + } + + public final String getId() { + return "UPPERCASE_SENTENCE_START"; + } + + public final String getDescription() { + return messages.getString("desc_uppercase_sentence"); + } + + public final RuleMatch[] match(final AnalyzedSentence text) { + final List<RuleMatch> ruleMatches = new ArrayList<RuleMatch>(); + final AnalyzedTokenReadings[] tokens = text.getTokensWithoutWhitespace(); + if (tokens.length < 2) { + return toRuleMatchArray(ruleMatches); + } + int matchTokenPos = 1; // 0 = SENT_START + final String firstToken = tokens[matchTokenPos].getToken(); + String secondToken = null; + String thirdToken = null; + // ignore quote characters: + if (tokens.length >= 3 + && ("'".equals(firstToken) || "\"".equals(firstToken) || "„" + .equals(firstToken))) { + matchTokenPos = 2; + secondToken = tokens[matchTokenPos].getToken(); + } + final String firstDutchToken = dutchSpecialCase(firstToken, secondToken, + tokens); + if (firstDutchToken != null) { + thirdToken = firstDutchToken; + matchTokenPos = 3; + } 
+ + String checkToken = firstToken; + if (thirdToken != null) { + checkToken = thirdToken; + } else if (secondToken != null) { + checkToken = secondToken; + } + + final String lastToken = tokens[tokens.length - 1].getToken(); + + boolean noException = false; + //fix for lists; note - this will not always work for the last point in OOo, + //as OOo might serve paragraphs in any order. + if ((language == Language.RUSSIAN || language == Language.POLISH) + && (";".equals(lastParagraphString) || ";".equals(lastToken) + || ",".equals(lastParagraphString) || ",".equals(lastToken))) { + noException = true; + } + //fix for comma in last paragraph; note - this will not always work for the last point in OOo, + //as OOo might serve paragraphs in any order. + if ((language == Language.RUSSIAN || language == Language.ITALIAN + || language == Language.POLISH || language == Language.GERMAN) + && (",".equals(lastParagraphString))) { + noException = true; + } + + lastParagraphString = lastToken; + + if (checkToken.length() > 0) { + final char firstChar = checkToken.charAt(0); + if (Character.isLowerCase(firstChar) && (!noException)) { + final RuleMatch ruleMatch = new RuleMatch(this, tokens[matchTokenPos] + .getStartPos(), tokens[matchTokenPos].getStartPos() + + tokens[matchTokenPos].getToken().length(), messages + .getString("incorrect_case")); + ruleMatch.setSuggestedReplacement(Character.toUpperCase(firstChar) + + checkToken.substring(1)); + ruleMatches.add(ruleMatch); + } + } + return toRuleMatchArray(ruleMatches); + } + + private String dutchSpecialCase(final String firstToken, + final String secondToken, final AnalyzedTokenReadings[] tokens) { + if (language != Language.DUTCH) { + return null; + } + if (tokens.length >= 3 && firstToken.equals("'") + && secondToken.matches("k|m|n|r|s|t")) { + return tokens[3].getToken(); + } + return null; + } + + public void reset() { + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/WhitespaceRule.java 
b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/WhitespaceRule.java new file mode 100644 index 0000000..61f1ca6 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/WhitespaceRule.java @@ -0,0 +1,91 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ + +package de.danielnaber.languagetool.rules; + +import java.util.ArrayList; +import java.util.List; +import java.util.ResourceBundle; + +import de.danielnaber.languagetool.AnalyzedSentence; +import de.danielnaber.languagetool.AnalyzedTokenReadings; +import de.danielnaber.languagetool.Language; + +/** + * Check if there is duplicated whitespace in a sentence. + * Considers two spaces as incorrect, and proposes a single space instead. 
+ * + * @author Marcin Miłkowski + */ + +public class WhitespaceRule extends Rule { + + public WhitespaceRule(final ResourceBundle messages, final Language language) { + super(messages); + super.setCategory(new Category(messages.getString("category_misc"))); + } + + @Override + public final String getId() { + return "WHITESPACE_RULE"; + } + + @Override + public final String getDescription() { + return messages.getString("desc_whitespacerepetition"); + } + + @Override + public RuleMatch[] match(final AnalyzedSentence text) { + final List<RuleMatch> ruleMatches = new ArrayList<RuleMatch>(); + final AnalyzedTokenReadings[] tokens = text.getTokens(); + boolean prevWhite = false; + int prevLen = 0; + int prevPos = 0; + //note: we start from token 1 + //token no. 0 is guaranteed to be SENT_START + int i = 1; + while (i < tokens.length) { + if (tokens[i].isWhitespace() && prevWhite && !tokens[i -1].isLinebreak()) { + final int pos = tokens[i -1].getStartPos(); + while (i < tokens.length && tokens[i].isWhitespace()) { + prevLen += tokens[i].getToken().length(); + i++; + } + final RuleMatch ruleMatch = new RuleMatch(this, prevPos, pos + prevLen, messages + .getString("whitespace_repetition")); + ruleMatch.setSuggestedReplacement(" "); + ruleMatches.add(ruleMatch); + } + if (i < tokens.length) { + prevWhite = tokens[i].isWhitespace(); + prevLen = tokens[i].getToken().length(); + prevPos = tokens[i].getStartPos(); + i++; + } + } + return toRuleMatchArray(ruleMatches); + } + + @Override + public void reset() { + // nothing + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/WordRepeatRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/WordRepeatRule.java new file mode 100644 index 0000000..c8060a5 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/WordRepeatRule.java @@ -0,0 +1,101 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * 
This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.rules; + +import java.util.ArrayList; +import java.util.List; +import java.util.ResourceBundle; + +import de.danielnaber.languagetool.AnalyzedSentence; +import de.danielnaber.languagetool.AnalyzedTokenReadings; +import de.danielnaber.languagetool.Language; + +/** + * Check if a word is repeated twice, e.g. "the the". + * + * @author Daniel Naber + */ +public class WordRepeatRule extends Rule { + + public WordRepeatRule(final ResourceBundle messages, final Language language) { + super(messages); + super.setCategory(new Category(messages.getString("category_misc"))); + } + + /** + * Implement this method to return <code>true</code> if there's + * a potential word repetition at the current position should be ignored, + * i.e. if no error should be created. 
+ * + * @param tokens the tokens of the sentence currently being checked + * @param position the current position in the tokens + * @return this implementation always returns false + */ + public boolean ignore(final AnalyzedTokenReadings[] tokens, final int position) { + return false; + } + + @Override + public String getId() { + return "WORD_REPEAT_RULE"; + } + + @Override + public String getDescription() { + return messages.getString("desc_repetition"); + } + + @Override + public RuleMatch[] match(final AnalyzedSentence text) { + final List<RuleMatch> ruleMatches = new ArrayList<RuleMatch>(); + final AnalyzedTokenReadings[] tokens = text.getTokensWithoutWhitespace(); + String prevToken = ""; + //note: we start from token 1 + //token no. 0 is guaranteed to be SENT_START + for (int i = 1; i < tokens.length; i++) { + final String token = tokens[i].getToken(); + // avoid "..." etc. to be matched: + boolean isWord = true; + if (token.length() == 1) { + final char c = token.charAt(0); + if (!Character.isLetter(c)) { + isWord = false; + } + } + final boolean isException = ignore(tokens, i); + if (isWord && prevToken.toLowerCase().equals(token.toLowerCase()) && !isException) { + final String msg = messages.getString("repetition"); + final int prevPos = tokens[i - 1].getStartPos(); + final int pos = tokens[i].getStartPos(); + final RuleMatch ruleMatch = new RuleMatch(this, prevPos, pos+prevToken.length(), msg, + messages.getString("desc_repetition_short")); + ruleMatch.setSuggestedReplacement(prevToken); + ruleMatches.add(ruleMatch); + } + prevToken = token; + } + return toRuleMatchArray(ruleMatches); + } + + @Override + public void reset() { + // nothing + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/bitext/BitextRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/bitext/BitextRule.java new file mode 100644 index 0000000..d508ae5 --- /dev/null +++ 
b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/bitext/BitextRule.java @@ -0,0 +1,106 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2010 Marcin Miłkowski (www.languagetool.org) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ + +package de.danielnaber.languagetool.rules.bitext; + +import java.io.IOException; +import java.util.List; + +import de.danielnaber.languagetool.AnalyzedSentence; +import de.danielnaber.languagetool.bitext.StringPair; +import de.danielnaber.languagetool.rules.Rule; +import de.danielnaber.languagetool.rules.RuleMatch; +import de.danielnaber.languagetool.Language; + +/** + * Abstract bitext rule class. A BitextRule describes a language error and + * can test whether a given pre-analyzed pair of source and target text + * contains that error using the {@link Rule#match} method. 
+ * + * @author Marcin Miłkowski + */ + +public abstract class BitextRule extends Rule { + + private List<StringPair> correctExamples; + private List<IncorrectBitextExample> incorrectExamples; + + private Language sourceLanguage; + + @Override + public abstract String getDescription(); + + public abstract String getMessage(); + + @Override + public abstract String getId(); + + @Override + public abstract RuleMatch[] match(AnalyzedSentence text) throws IOException; + + public abstract RuleMatch[] match(AnalyzedSentence sourceText, + AnalyzedSentence targetText) throws IOException; + + @Override + public abstract void reset(); + + /** + * Set the source language. If the language is not supported + * by LT, you need to use the default tokenizers etc. + * @param lang - Source Language + */ + public final void setSourceLang(final Language lang) { + sourceLanguage = lang; + } + + public final Language getSourceLang() { + return sourceLanguage; + } + + /** + * Set the examples that are correct and thus do not trigger the rule. + */ + public final void setCorrectBitextExamples(final List<StringPair> correctExamples) { + this.correctExamples = correctExamples; + } + + /** + * Get example sentences that are correct and thus will not match this rule. + */ + public final List<StringPair> getCorrectBitextExamples() { + return correctExamples; + } + + /** + * Set the examples that are incorrect and thus do trigger the rule. + */ + public final void setIncorrectBitextExamples( + final List<IncorrectBitextExample> incorrectExamples) { + this.incorrectExamples = incorrectExamples; + } + + /** + * Get example sentences that are incorrect and thus will match this rule. 
+ */ + public final List<IncorrectBitextExample> getIncorrectBitextExamples() { + return incorrectExamples; + } + + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/bitext/DifferentLengthRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/bitext/DifferentLengthRule.java new file mode 100644 index 0000000..995772c --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/bitext/DifferentLengthRule.java @@ -0,0 +1,93 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2010 Marcin Miłkowski (www.languagetool.org) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.rules.bitext; + +import java.io.IOException; + +import de.danielnaber.languagetool.AnalyzedSentence; +import de.danielnaber.languagetool.AnalyzedTokenReadings; +import de.danielnaber.languagetool.rules.RuleMatch; + +/** + * Checks if the translation has a really different length than the source + * (smaller than 30% or longer by 250%). 
+ * + * @author Marcin Miłkowski + * + */ +public class DifferentLengthRule extends BitextRule { + + static final String MSG = "Source and target translation lengths are very different!"; + + @Override + public String getDescription() { + return "Check if translation length is similar to source length"; + } + + @Override + public String getId() { + return "TRANSLATION_LENGTH"; + } + + public String getMessage() { + return MSG; + } + + /** + * This method makes no sense for bitext, return null?? + */ + @Override + public RuleMatch[] match(AnalyzedSentence text) throws IOException { + // TODO Auto-generated method stub + return null; + } + + @Override + public RuleMatch[] match(AnalyzedSentence sourceText, + AnalyzedSentence targetText) throws IOException { + + if (isLengthDifferent( + getPureText(sourceText), getPureText(targetText))) { + final RuleMatch[] rm = new RuleMatch[1]; + final AnalyzedTokenReadings[] tokens = targetText.getTokens(); + final int len = tokens[tokens.length - 1].getStartPos() + tokens[tokens.length - 1].getToken().length(); + rm[0] = new RuleMatch(this, 1, len, + MSG); + return rm; + } + return new RuleMatch[0]; + } + + static boolean isLengthDifferent(final String src, final String trg) { + final double skew = (((double) src.length() / (double) trg.length()) * 100.00); + return (skew > 250 || skew < 30); + } + + private static String getPureText(AnalyzedSentence text) { + final StringBuilder sb = new StringBuilder(); + for (AnalyzedTokenReadings token : text.getTokens()) { + sb.append(token.getToken()); + } + return sb.toString(); + } + + public void reset() { + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/bitext/IncorrectBitextExample.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/bitext/IncorrectBitextExample.java new file mode 100644 index 0000000..e877826 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/bitext/IncorrectBitextExample.java @@ -0,0 +1,64 @@ +/* 
LanguageTool, a natural language style checker + * Copyright (C) 2010 Marcin Miłkowski (http://www.languagetool.org) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.rules.bitext; + +import java.util.Arrays; +import java.util.List; + +import de.danielnaber.languagetool.bitext.StringPair; + +/** + * A text, typically a pair of sentences that contains an error. + * + * @since 1.0.1 + * @author Marcin Miłkowski + */ +public class IncorrectBitextExample { + + private StringPair example; + private List<String> corrections; + + public IncorrectBitextExample(final StringPair example) { + this.example = example; + } + + public IncorrectBitextExample(final StringPair example, final String[] corrections) { + this(example); + this.corrections = Arrays.asList(corrections); + } + + /** + * Return the example that contains the error. + */ + public StringPair getExample() { + return example; + } + + /** + * Return the possible corrections. May be null. 
+ */ + public List<String> getCorrections() { + return corrections; + } + + public String toString() { + return example.getSource() + "/ " + example.getTarget() + " " + corrections; + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/bitext/SameTranslationRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/bitext/SameTranslationRule.java new file mode 100644 index 0000000..c9e1ace --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/bitext/SameTranslationRule.java @@ -0,0 +1,88 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2010 Marcin Miłkowski (www.languagetool.org) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.rules.bitext; + +import java.io.IOException; + +import de.danielnaber.languagetool.AnalyzedSentence; +import de.danielnaber.languagetool.AnalyzedTokenReadings; +import de.danielnaber.languagetool.rules.RuleMatch; + +/** + * Checks if the translation for segments that have more than two words + * is different. 
+ * + * @author Marcin Miłkowski + * + */ +public class SameTranslationRule extends BitextRule { + + static final String MSG = "Source and target translation are the same!"; + + @Override + public String getDescription() { + return "Check if translation is the same as source"; + } + + @Override + public String getId() { + return "SAME_TRANSLATION"; + } + + public String getMessage() { + return MSG; + } + + /** + * This method makes no sense for bitext, return null?? + */ + @Override + public RuleMatch[] match(AnalyzedSentence text) throws IOException { + // TODO Auto-generated method stub + return null; + } + + @Override + public RuleMatch[] match(AnalyzedSentence sourceText, + AnalyzedSentence targetText) throws IOException { + + //This is just heuristics, checking word count + if (sourceText.getTokensWithoutWhitespace().length > 3 + && getPureText(sourceText).equals(getPureText(targetText))) { + final RuleMatch[] rm = new RuleMatch[1]; + final AnalyzedTokenReadings[] tokens = targetText.getTokens(); + final int len = tokens[tokens.length - 1].getStartPos() + tokens[tokens.length - 1].getToken().length(); + rm[0] = new RuleMatch(this, 1, len, MSG); + return rm; + } + return new RuleMatch[0]; + } + + private static String getPureText(AnalyzedSentence text) { + final StringBuilder sb = new StringBuilder(); + for (AnalyzedTokenReadings token : text.getTokens()) { + sb.append(token.getToken()); + } + return sb.toString(); + } + + public void reset() { + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/ca/AccentuacioReplaceRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/ca/AccentuacioReplaceRule.java new file mode 100644 index 0000000..eb5a3fa --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/ca/AccentuacioReplaceRule.java @@ -0,0 +1,90 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you 
can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.rules.ca; + +import java.io.IOException; +import java.util.Locale; +import java.util.ResourceBundle; + +import de.danielnaber.languagetool.rules.AbstractSimpleReplaceRule; + +/** + * A rule that matches words or phrases which should not be used and suggests + * correct ones instead. + * + * Catalan implementations for accentuation errors. + * This is basically the same as CastellanismesReplaceRule.java + * with a different error message. + * + * Loads the list of words from <code>rules/ca/accentuacio.txt</code>. + * + * TODO: Some of the entries are proper names (Greek gods, etc.), which + * aren't currently checked. 
+ * + * @author Jimmy O'Regan + * + * Based on pl/SimpleReplaceRule.java + */ +public class AccentuacioReplaceRule extends AbstractSimpleReplaceRule { + + public static final String CATALAN_ACCENTUACIO_REPLACE_RULE = "CA_ACCENTUACIO_REPLACE"; + + private static final String FILE_NAME = "/ca/accentuacio.txt"; + // locale used on case-conversion + private static final Locale CA_LOCALE = new Locale("ca"); + + public final String getFileName() { + return FILE_NAME; + } + + public AccentuacioReplaceRule(final ResourceBundle messages) throws IOException { + super(messages); + } + + public final String getId() { + return CATALAN_ACCENTUACIO_REPLACE_RULE; + } + + public String getDescription() { + return "Errors d'accentuació"; + } + + public String getShort() { + return "Accentuació"; + } + + public String getSuggestion() { + return " es un error d'accentuació, cal dir: "; + } + + /** + * use case-insensitive matching. + */ + public boolean isCaseSensitive() { + return false; + } + + /** + * locale used on case-conversion + */ + public Locale getLocale() { + return CA_LOCALE; + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/ca/CastellanismesReplaceRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/ca/CastellanismesReplaceRule.java new file mode 100644 index 0000000..3169b66 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/ca/CastellanismesReplaceRule.java @@ -0,0 +1,85 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.rules.ca; + +import java.io.IOException; +import java.util.Locale; +import java.util.ResourceBundle; + +import de.danielnaber.languagetool.rules.AbstractSimpleReplaceRule; + +/** + * A rule that matches words or phrases which should not be used and suggests + * correct ones instead. + * + * Catalan implementations for Castelianisms, kept separate for an individual + * error message. + * Loads the list of words from <code>rules/ca/castellanismes.txt</code>. + * + * @author Jimmy O'Regan + * + * Based on pl/SimpleReplaceRule.java + */ +public class CastellanismesReplaceRule extends AbstractSimpleReplaceRule { + + public static final String CATALAN_CASTELLANISMES_REPLACE_RULE = "CA_CASTELLANISMES_REPLACE"; + + private static final String FILE_NAME = "/ca/castellanismes.txt"; + // locale used on case-conversion + private static final Locale caLocale = new Locale("ca"); + + public final String getFileName() { + return FILE_NAME; + } + + public CastellanismesReplaceRule(final ResourceBundle messages) throws IOException { + super(messages); + } + + public final String getId() { + return CATALAN_CASTELLANISMES_REPLACE_RULE; + } + + public String getDescription() { + return "Barbarismes (Castellanismes)"; + } + + public String getShort() { + return "Castellanismes"; + } + + public String getSuggestion() { + return " es un castellanisme, cal dir: "; + } + + /** + * use case-insensitive matching. 
+ */ + public boolean isCaseSensitive() { + return false; + } + + /** + * locale used on case-conversion + */ + public Locale getLocale() { + return caLocale; + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/de/AgreementRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/de/AgreementRule.java new file mode 100644 index 0000000..8afff0c --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/de/AgreementRule.java @@ -0,0 +1,405 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.rules.de; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.ResourceBundle; +import java.util.Set; + +import de.danielnaber.languagetool.AnalyzedSentence; +import de.danielnaber.languagetool.AnalyzedTokenReadings; +import de.danielnaber.languagetool.JLanguageTool; +import de.danielnaber.languagetool.rules.Category; +import de.danielnaber.languagetool.rules.RuleMatch; +import de.danielnaber.languagetool.tagging.de.AnalyzedGermanToken; +import de.danielnaber.languagetool.tagging.de.AnalyzedGermanTokenReadings; +import de.danielnaber.languagetool.tagging.de.GermanTagger; +import de.danielnaber.languagetool.tagging.de.GermanToken; +import de.danielnaber.languagetool.tagging.de.GermanToken.POSType; +import de.danielnaber.languagetool.tools.StringTools; + +/** + * Simple agreement checker for German noun phrases. Checks agreement in: + * + * <ul> + * <li>DET/PRO NOUN: e.g. "mein Auto", "der Mann", "die Frau" (correct), "die Haus" (incorrect)</li> + * <li>DET/PRO ADJ NOUN: e.g. "der riesige Tisch" (correct), "die riesigen Tisch" (incorrect)</li> + * </ul> + * + * Note that this rule only checks agreement inside the noun phrase, not whether + * e.g. the correct case is used. For example, "Es ist das Haus dem Mann" is not + * detected as incorrect. + * + * @author Daniel Naber + */ +public class AgreementRule extends GermanRule { + + private static final String KASUS = "Kasus"; + private static final String NUMERUS = "Numerus"; + private static final String GENUS = "Genus"; + + /* + * City names are incoherently tagged in the Morphy data. 
To avoid + * false alarms on phrases like "das Berliner Auto" we have to + * explicitly add these adjective readings to "Berliner" and to all + * other potential city names: + */ + private static final String[] ADJ_READINGS = new String[] { + // singular: + "ADJ:NOM:SIN:MAS:GRU", "ADJ:NOM:SIN:NEU:GRU", "ADJ:NOM:SIN:FEM:GRU", // das Berliner Auto + "ADJ:GEN:SIN:MAS:GRU", "ADJ:GEN:SIN:NEU:GRU", "ADJ:GEN:SIN:FEM:GRU", // des Berliner Autos + "ADJ:DAT:SIN:MAS:GRU", "ADJ:DAT:SIN:NEU:GRU", "ADJ:DAT:SIN:FEM:GRU", // dem Berliner Auto + "ADJ:AKK:SIN:MAS:GRU", "ADJ:AKK:SIN:NEU:GRU", "ADJ:AKK:SIN:FEM:GRU", // den Berliner Bewohner + // plural: + "ADJ:NOM:PLU:MAS:GRU", "ADJ:NOM:PLU:NEU:GRU", "ADJ:NOM:PLU:FEM:GRU", // die Berliner Autos + "ADJ:GEN:PLU:MAS:GRU", "ADJ:GEN:PLU:NEU:GRU", "ADJ:GEN:PLU:FEM:GRU", // der Berliner Autos + "ADJ:DAT:PLU:MAS:GRU", "ADJ:DAT:PLU:NEU:GRU", "ADJ:DAT:PLU:FEM:GRU", // den Berliner Autos + "ADJ:AKK:PLU:MAS:GRU", "ADJ:AKK:PLU:NEU:GRU", "ADJ:AKK:PLU:FEM:GRU", // den Berliner Bewohnern + }; + + + private static final Set<String> REL_PRONOUN = new HashSet<String>(); + static { + REL_PRONOUN.add("der"); + REL_PRONOUN.add("die"); + REL_PRONOUN.add("das"); + REL_PRONOUN.add("dessen"); + REL_PRONOUN.add("deren"); + REL_PRONOUN.add("dem"); + REL_PRONOUN.add("den"); + REL_PRONOUN.add("welche"); + REL_PRONOUN.add("welcher"); + REL_PRONOUN.add("welchen"); + REL_PRONOUN.add("welchem"); + REL_PRONOUN.add("welches"); + } + + private static final Set<String> PREPOSITIONS = new HashSet<String>(); + static { + PREPOSITIONS.add("in"); + PREPOSITIONS.add("auf"); + PREPOSITIONS.add("an"); + PREPOSITIONS.add("ab"); + PREPOSITIONS.add("für"); + PREPOSITIONS.add("zu"); + // TODO: add more + } + + public AgreementRule(final ResourceBundle messages) { + if (messages != null) + super.setCategory(new Category(messages.getString("category_grammar"))); + } + + public String getId() { + return "DE_AGREEMENT"; + } + + public String getDescription() { + return "Kongruenz von 
Nominalphrasen (unvollständig!), z.B. 'mein kleiner(kleines) Haus'"; + } + + public RuleMatch[] match(final AnalyzedSentence text) { + final List<RuleMatch> ruleMatches = new ArrayList<RuleMatch>(); + final AnalyzedTokenReadings[] tokens = text.getTokensWithoutWhitespace(); + int pos = 0; + for (int i = 0; i < tokens.length; i++) { + //defaulting to the first reading + //TODO: check for all readings + //and replace GermanTokenReading + final String posToken = tokens[i].getAnalyzedToken(0).getPOSTag(); + if (posToken != null && posToken.equals(JLanguageTool.SENTENCE_START_TAGNAME)) + continue; + //AnalyzedGermanToken analyzedToken = new AnalyzedGermanToken(tokens[i]); + + final AnalyzedGermanTokenReadings analyzedToken = (AnalyzedGermanTokenReadings)tokens[i]; + final boolean relevantPronoun = isRelevantPronoun(tokens, i); + + boolean ignore = couldBeRelativeClause(tokens, i); + if (i > 0) { + final String prevToken = tokens[i-1].getToken().toLowerCase(); + if ((prevToken.equals("der") || prevToken.equals("die") || prevToken.equals("das")) + && tokens[i].getToken().equals("eine")) { + // TODO: "der eine Polizist" -> nicht ignorieren, sondern "der polizist" checken + ignore = true; + } + } + + // avoid false alarm on "nichts Gutes": + if (analyzedToken.getToken().equals("nichts")) { + ignore = true; + } + + if ((analyzedToken.hasReadingOfType(POSType.DETERMINER) || relevantPronoun) && !ignore) { + int tokenPos = i + 1; + if (tokenPos >= tokens.length) + break; + AnalyzedGermanTokenReadings nextToken = (AnalyzedGermanTokenReadings)tokens[tokenPos]; + nextToken = maybeAddAdjectiveReadings(nextToken, tokens, tokenPos); + if (nextToken.hasReadingOfType(POSType.ADJEKTIV)) { + tokenPos = i + 2; + if (tokenPos >= tokens.length) + break; + final AnalyzedGermanTokenReadings nextNextToken = (AnalyzedGermanTokenReadings)tokens[tokenPos]; + if (nextNextToken.hasReadingOfType(POSType.NOMEN)) { + // TODO: add a case (checkAdjNounAgreement) for special cases like "deren", + // e.g. 
"deren komisches Geschenke" isn't yet detected as incorrect + final RuleMatch ruleMatch = checkDetAdjNounAgreement((AnalyzedGermanTokenReadings)tokens[i], + nextToken, (AnalyzedGermanTokenReadings)tokens[i+2]); + if (ruleMatch != null) { + ruleMatches.add(ruleMatch); + } + } + } else if (nextToken.hasReadingOfType(POSType.NOMEN)) { + final RuleMatch ruleMatch = checkDetNounAgreement((AnalyzedGermanTokenReadings)tokens[i], + (AnalyzedGermanTokenReadings)tokens[i+1]); + if (ruleMatch != null) { + ruleMatches.add(ruleMatch); + } + } + } + + pos += tokens[i].getToken().length(); + } + return toRuleMatchArray(ruleMatches); + } + + private boolean isRelevantPronoun(AnalyzedTokenReadings[] tokens, int pos) { + final AnalyzedGermanTokenReadings analyzedToken = (AnalyzedGermanTokenReadings)tokens[pos]; + boolean relevantPronoun = analyzedToken.hasReadingOfType(POSType.PRONOMEN); + // avoid false alarms: + final String token = tokens[pos].getToken(); + if (pos > 0 && tokens[pos-1].getToken().equalsIgnoreCase("vor") && tokens[pos].getToken().equalsIgnoreCase("allem")) + relevantPronoun = false; + else if (token.equalsIgnoreCase("er") || token.equalsIgnoreCase("sie") || token.equalsIgnoreCase("es")) + relevantPronoun = false; + else if (token.equalsIgnoreCase("ich")) + relevantPronoun = false; + else if (token.equalsIgnoreCase("du")) + relevantPronoun = false; + else if (token.equalsIgnoreCase("dessen")) // avoid false alarm on: "..., dessen Leiche" + relevantPronoun = false; + else if (token.equalsIgnoreCase("deren")) + relevantPronoun = false; + else if (token.equalsIgnoreCase("sich")) // avoid false alarm + relevantPronoun = false; + else if (token.equalsIgnoreCase("unser")) // avoid false alarm "unser Produkt": TODO! 
+ relevantPronoun = false; + else if (token.equalsIgnoreCase("aller")) + relevantPronoun = false; + else if (token.equalsIgnoreCase("man")) + relevantPronoun = false; + else if (token.equalsIgnoreCase("beiden")) + relevantPronoun = false; + else if (token.equalsIgnoreCase("wessen")) + relevantPronoun = false; + else if (token.equalsIgnoreCase("a")) + relevantPronoun = false; + else if (token.equalsIgnoreCase("alle")) + relevantPronoun = false; + else if (token.equalsIgnoreCase("etwas")) // TODO: doesn't have case -- but don't just ignore + relevantPronoun = false; + else if (token.equalsIgnoreCase("was")) // TODO: doesn't have case -- but don't just ignore + relevantPronoun = false; + else if (token.equalsIgnoreCase("wer")) + relevantPronoun = false; + return relevantPronoun; + } + + // see the comment at ADJ_READINGS: + private AnalyzedGermanTokenReadings maybeAddAdjectiveReadings(AnalyzedGermanTokenReadings nextToken, + AnalyzedTokenReadings[] tokens, int tokenPos) { + final String nextTerm = nextToken.getToken(); + // Just a heuristic: nouns and proper nouns that end with "er" are considered + // city names: + if (nextTerm.endsWith("er") && tokens.length > tokenPos+1) { + final AnalyzedGermanTokenReadings nextNextToken = (AnalyzedGermanTokenReadings)tokens[tokenPos+1]; + final GermanTagger tagger = new GermanTagger(); + try { + final AnalyzedGermanTokenReadings nextATR = tagger.lookup(nextTerm.substring(0, nextTerm.length()-2)); + final AnalyzedGermanTokenReadings nextNextATR = tagger.lookup(nextNextToken.getToken()); + //System.err.println("nextATR: " + nextATR); + //System.err.println("nextNextATR: " + nextNextATR); + // "Münchner": special case as cutting off last two characters doesn't produce city name: + if ("Münchner".equals(nextTerm) || + (nextATR != null && + // tagging in Morphy for cities is not coherent: + (nextATR.hasReadingOfType(POSType.PROPER_NOUN) || nextATR.hasReadingOfType(POSType.NOMEN) && + nextNextATR != null && 
nextNextATR.hasReadingOfType(POSType.NOMEN)))) { + final AnalyzedGermanToken[] adjReadings = new AnalyzedGermanToken[ADJ_READINGS.length]; + for (int j = 0; j < ADJ_READINGS.length; j++) { + adjReadings[j] = new AnalyzedGermanToken(nextTerm, ADJ_READINGS[j], null); + } + nextToken = new AnalyzedGermanTokenReadings(adjReadings, nextToken.getStartPos()); + } + } catch (IOException e) { + throw new RuntimeException(e); + } + } + return nextToken; + } + + // TODO: improve this so it only returns true for real relative clauses + private boolean couldBeRelativeClause(AnalyzedTokenReadings[] tokens, int pos) { + boolean comma; + boolean relPronoun; + if (pos >= 1) { + // avoid false alarm: "Das Wahlrecht, das Frauen zugesprochen bekamen." etc: + comma = tokens[pos-1].getToken().equals(","); + final String term = tokens[pos].getToken().toLowerCase(); + relPronoun = REL_PRONOUN.contains(term); + if (comma && relPronoun) + return true; + } + if (pos >= 2) { + // avoid false alarm: "Der Mann, in dem quadratische Fische schwammen." + comma = tokens[pos-2].getToken().equals(","); + final String term1 = tokens[pos-1].getToken().toLowerCase(); + final String term2 = tokens[pos].getToken().toLowerCase(); + final boolean prep = PREPOSITIONS.contains(term1); + relPronoun = REL_PRONOUN.contains(term2); + return comma && prep && relPronoun; + } + return false; + } + + private RuleMatch checkDetNounAgreement(final AnalyzedGermanTokenReadings token1, + final AnalyzedGermanTokenReadings token2) { + // avoid false alarm: "Gebt ihm Macht." 
+ if (token1.getToken().equalsIgnoreCase("ihm")) + return null; + RuleMatch ruleMatch = null; + final Set<String> set1 = getAgreementCategories(token1); + if (set1 == null) + return null; // word not known, assume it's correct + final Set<String> set2 = getAgreementCategories(token2); + if (set2 == null) + return null; + /*System.err.println("#"+set1); + System.err.println("#"+set2); + System.err.println("");*/ + set1.retainAll(set2); + if (set1.size() == 0) { + // TODO: better error message than just 'agreement error' + final String msg = "Möglicherweise fehlende Übereinstimmung (Kongruenz) zwischen Artikel und Nomen " + + "bezüglich Kasus, Numerus oder Genus. Beispiel: 'meine Haus' statt 'mein Haus'"; + ruleMatch = new RuleMatch(this, token1.getStartPos(), + token2.getStartPos()+token2.getToken().length(), msg); + } + return ruleMatch; + } + + private RuleMatch checkDetAdjNounAgreement(final AnalyzedGermanTokenReadings token1, + final AnalyzedGermanTokenReadings token2, final AnalyzedGermanTokenReadings token3) { + final Set<String> relax = new HashSet<String>(); + final Set<String> set = retainCommonCategories(token1, token2, token3, relax); + RuleMatch ruleMatch = null; + if (set.size() == 0) { + // TODO: more detailed error message: + /*relax.add(KASUS); + set = retainCommonCategories(token1, token2, token3, relax); + if (set.size() > 0) { + System.err.println("KASUS!"); + } + relax.clear(); + relax.add(NUMERUS); + set = retainCommonCategories(token1, token2, token3, relax); + if (set.size() > 0) { + System.err.println("NUMERUS!"); + } + relax.clear(); + relax.add(GENUS); + set = retainCommonCategories(token1, token2, token3, relax); + if (set.size() > 0) { + System.err.println("GENUS!"); + }*/ + final String msg = "Möglicherweise fehlende Übereinstimmung (Kongruenz) zwischen Artikel, Adjektiv und " + + "Nomen bezüglich Kasus, Numerus oder Genus. 
Beispiel: 'mein kleiner Haus' " + + "statt 'mein kleines Haus'"; + ruleMatch = new RuleMatch(this, token1.getStartPos(), + token3.getStartPos()+token3.getToken().length(), msg); + } + return ruleMatch; + } + + private Set<String> retainCommonCategories(final AnalyzedGermanTokenReadings token1, + final AnalyzedGermanTokenReadings token2, final AnalyzedGermanTokenReadings token3, + Set<String> relax) { + final Set<String> set1 = getAgreementCategories(token1, relax); + if (set1 == null) + return null; // word not known, assume it's correct + final Set<String> set2 = getAgreementCategories(token2, relax); + if (set2 == null) + return null; + final Set<String> set3 = getAgreementCategories(token3, relax); + if (set3 == null) + return null; + /*System.err.println(token1.getToken()+"#"+set1); + System.err.println(token2.getToken()+"#"+set2); + System.err.println(token3.getToken()+"#"+set3); + System.err.println("");*/ + set1.retainAll(set2); + set1.retainAll(set3); + return set1; + } + + private Set<String> getAgreementCategories(final AnalyzedGermanTokenReadings aToken) { + return getAgreementCategories(aToken, new HashSet<String>()); + } + + /** Return Kasus, Numerus, Genus. */ + private Set<String> getAgreementCategories(final AnalyzedGermanTokenReadings aToken, Set<String> omit) { + final Set<String> set = new HashSet<String>(); + final List<AnalyzedGermanToken> readings = aToken.getGermanReadings(); + for (AnalyzedGermanToken reading : readings) { + if (reading.getCasus() == null && reading.getNumerus() == null && + reading.getGenus() == null) + continue; + if (reading.getGenus() == null) { + // "ich" and "wir" contains genus=ALG in the original data. 
Not sure if + // this is allowed, but expand this so "Ich Arbeiter" doesn't get flagged + // as incorrect: + set.add(makeString(reading.getCasus(), reading.getNumerus(), GermanToken.Genus.MASKULINUM, omit)); + set.add(makeString(reading.getCasus(), reading.getNumerus(), GermanToken.Genus.FEMININUM, omit)); + set.add(makeString(reading.getCasus(), reading.getNumerus(), GermanToken.Genus.NEUTRUM, omit)); + } else { + set.add(makeString(reading.getCasus(), reading.getNumerus(), reading.getGenus(), omit)); + } + } + return set; + } + + private String makeString(GermanToken.Kasus casus, GermanToken.Numerus num, GermanToken.Genus gen, + Set<String> omit) { + final List<String> l = new ArrayList<String>(); + if (casus != null && !omit.contains(KASUS)) + l.add(casus.toString()); + if (num != null && !omit.contains(NUMERUS)) + l.add(num.toString()); + if (gen != null && !omit.contains(GENUS)) + l.add(gen.toString()); + return StringTools.listToString(l, "/"); + } + + public void reset() { + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/de/CaseRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/de/CaseRule.java new file mode 100644 index 0000000..663e9ff --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/de/CaseRule.java @@ -0,0 +1,358 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.rules.de; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.ResourceBundle; +import java.util.Set; + +import de.danielnaber.languagetool.AnalyzedSentence; +import de.danielnaber.languagetool.AnalyzedTokenReadings; +import de.danielnaber.languagetool.JLanguageTool; +import de.danielnaber.languagetool.rules.Category; +import de.danielnaber.languagetool.rules.RuleMatch; +import de.danielnaber.languagetool.tagging.de.AnalyzedGermanToken; +import de.danielnaber.languagetool.tagging.de.AnalyzedGermanTokenReadings; +import de.danielnaber.languagetool.tagging.de.GermanTagger; +import de.danielnaber.languagetool.tagging.de.GermanToken; +import de.danielnaber.languagetool.tagging.de.GermanToken.POSType; +import de.danielnaber.languagetool.tools.StringTools; + +/** + * Check that adjectives and verbs are not written with an uppercase + * first letter (except at the start of a sentence) and cases + * like this: <tt>Das laufen fällt mir leicht.</tt> (<tt>laufen</tt> needs + * to be uppercased). + * + * @author Daniel Naber + */ +public class CaseRule extends GermanRule { + + private final GermanTagger tagger = new GermanTagger(); + + // wenn hinter diesen Wörtern ein Verb steht, ist es wohl ein substantiviertes Verb, + // muss also groß geschrieben werden: + private static final Set<String> nounIndicators = new HashSet<String>(); + static { + nounIndicators.add("das"); + nounIndicators.add("sein"); + //indicator.add("seines"); // TODO: ? + //nounIndicators.add("ihr"); // would cause false alarm e.g. "Auf ihr stehen die Ruinen..." 
+ nounIndicators.add("mein"); + nounIndicators.add("dein"); + nounIndicators.add("euer"); + //indicator.add("ihres"); + //indicator.add("ihren"); + } + + private static final Set<String> sentenceStartExceptions = new HashSet<String>(); + static { + sentenceStartExceptions.add("("); + sentenceStartExceptions.add(":"); + sentenceStartExceptions.add("\""); + sentenceStartExceptions.add("'"); + sentenceStartExceptions.add("„"); + sentenceStartExceptions.add("“"); + sentenceStartExceptions.add("«"); + sentenceStartExceptions.add("»"); + } + + private static final Set<String> exceptions = new HashSet<String>(); + static { + exceptions.add("Für"); // "das Für und Wider" + exceptions.add("Wider"); // "das Für und Wider" + exceptions.add("Nachts"); // "des Nachts", "eines Nachts" + exceptions.add("Genüge"); + exceptions.add("Zusage"); + exceptions.add("Nachfrage"); + exceptions.add("Sachverständiger"); + exceptions.add("Nr"); + exceptions.add("Sankt"); + exceptions.add("Toter"); + exceptions.add("Verantwortlicher"); + exceptions.add("Wichtiges"); + exceptions.add("Dr"); + exceptions.add("Prof"); + exceptions.add("Mr"); + exceptions.add("Mrs"); + exceptions.add("De"); // "De Morgan" etc + exceptions.add("Le"); // "Le Monde" etc + exceptions.add("Ihr"); + exceptions.add("Ihre"); + exceptions.add("Ihres"); + exceptions.add("Ihren"); + exceptions.add("Ihnen"); + exceptions.add("Ihrem"); + exceptions.add("Ihrer"); + exceptions.add("Sie"); + exceptions.add("Aus"); // "vor dem Aus stehen" + exceptions.add("Oder"); // der Fluss + exceptions.add("tun"); // "Sie müssen das tun" + exceptions.add("St"); // Paris St. Germain + exceptions.add("Las"); // Las Vegas, nicht "lesen" + exceptions.add("Folgendes"); // je nach Kontext groß (TODO)... 
+ exceptions.add("besonderes"); // je nach Kontext groß (TODO): "etwas Besonderes" + exceptions.add("Hundert"); // je nach Kontext groß (TODO) + exceptions.add("Tausend"); // je nach Kontext groß (TODO) + exceptions.add("Übrigen"); // je nach Kontext groß (TODO), z.B. "im Übrigen" + exceptions.add("Unvorhergesehenes"); // je nach Kontext groß (TODO), z.B. "etwas Unvorhergesehenes" + + exceptions.add("Englisch"); // TODO: alle Sprachen + exceptions.add("Deutsch"); + exceptions.add("Französisch"); + exceptions.add("Spanisch"); + exceptions.add("Italienisch"); + exceptions.add("Portugiesisch"); + exceptions.add("Dänisch"); + exceptions.add("Norwegisch"); + exceptions.add("Schwedisch"); + exceptions.add("Finnisch"); + exceptions.add("Holländisch"); + exceptions.add("Niederländisch"); + exceptions.add("Polnisch"); + exceptions.add("Tschechisch"); + exceptions.add("Arabisch"); + exceptions.add("Persisch"); + + exceptions.add("Schuld"); + exceptions.add("Erwachsener"); + exceptions.add("Jugendlicher"); + exceptions.add("Link"); + exceptions.add("Ausdrücke"); + exceptions.add("Landwirtschaft"); + exceptions.add("Flöße"); + exceptions.add("Wild"); + exceptions.add("Vorsitzender"); + exceptions.add("Mrd"); + exceptions.add("Links"); + // Änderungen an der Rechtschreibreform 2006 erlauben hier Großschreibung: + exceptions.add("Du"); + exceptions.add("Dir"); + exceptions.add("Dich"); + exceptions.add("Deine"); + exceptions.add("Deinen"); + exceptions.add("Deinem"); + exceptions.add("Deines"); + exceptions.add("Deiner"); + exceptions.add("Euch"); + + exceptions.add("Neuem"); + exceptions.add("Weitem"); + exceptions.add("Weiteres"); + exceptions.add("Langem"); + exceptions.add("Längerem"); + exceptions.add("Kurzem"); + exceptions.add("Schwarzes"); // Schwarzes Brett + exceptions.add("Goldener"); // Goldener Schnitt + // TODO: add more exceptions here + } + + private static final Set<String> myExceptionPhrases = new HashSet<String>(); + static { + // use proper upper/lowercase 
spelling here: + myExceptionPhrases.add("ohne Wenn und Aber"); + myExceptionPhrases.add("Große Koalition"); + myExceptionPhrases.add("Großen Koalition"); + myExceptionPhrases.add("im Großen und Ganzen"); + myExceptionPhrases.add("Im Großen und Ganzen"); + myExceptionPhrases.add("im Guten wie im Schlechten"); + myExceptionPhrases.add("Im Guten wie im Schlechten"); + } + + private static final Set<String> substVerbenExceptions = new HashSet<String>(); + static { + substVerbenExceptions.add("gehören"); + substVerbenExceptions.add("bedeutet"); // "und das bedeutet..." + substVerbenExceptions.add("ermöglicht"); // "und das ermöglicht..." + substVerbenExceptions.add("sollen"); + substVerbenExceptions.add("werden"); + substVerbenExceptions.add("dürfen"); + substVerbenExceptions.add("müssen"); + substVerbenExceptions.add("so"); + substVerbenExceptions.add("ist"); + substVerbenExceptions.add("können"); + substVerbenExceptions.add("muss"); + substVerbenExceptions.add("muß"); + substVerbenExceptions.add("wollen"); + substVerbenExceptions.add("habe"); + substVerbenExceptions.add("ein"); // nicht "einen" (Verb) + substVerbenExceptions.add("tun"); // "...dann wird er das tun." 
+ substVerbenExceptions.add("bestätigt"); + substVerbenExceptions.add("bestätigte"); + substVerbenExceptions.add("bestätigten"); + substVerbenExceptions.add("bekommen"); + } + + public CaseRule(final ResourceBundle messages) { + if (messages != null) + super.setCategory(new Category(messages.getString("category_case"))); + } + + public String getId() { + return "DE_CASE"; + } + + public String getDescription() { + return "Großschreibung von Nomen und substantivierten Verben"; + } + + public RuleMatch[] match(final AnalyzedSentence text) { + final List<RuleMatch> ruleMatches = new ArrayList<RuleMatch>(); + final AnalyzedTokenReadings[] tokens = text.getTokensWithoutWhitespace(); + + int pos = 0; + boolean prevTokenIsDas = false; + for (int i = 0; i < tokens.length; i++) { + //FIXME: defaulting to the first analysis + //don't know if it's safe + final String posToken = tokens[i].getAnalyzedToken(0).getPOSTag(); + if (posToken != null && posToken.equals(JLanguageTool.SENTENCE_START_TAGNAME)) + continue; + if (i == 1) { // don't care about first word, UppercaseSentenceStartRule does this already + if (nounIndicators.contains(tokens[i].getToken().toLowerCase())) { + prevTokenIsDas = true; + } + continue; + } + final AnalyzedGermanTokenReadings analyzedToken = (AnalyzedGermanTokenReadings)tokens[i]; + final String token = analyzedToken.getToken(); + List<AnalyzedGermanToken> readings = analyzedToken.getGermanReadings(); + AnalyzedGermanTokenReadings analyzedGermanToken2 = null; + + boolean isBaseform = false; + if (analyzedToken.getReadingsLength() > 1 && token.equals(analyzedToken.getAnalyzedToken(0).getLemma())) { + isBaseform = true; + } + if ((readings == null || analyzedToken.getAnalyzedToken(0).getPOSTag() == null || analyzedToken.hasReadingOfType(GermanToken.POSType.VERB)) + && isBaseform) { + // no match, e.g. 
for "Groß": try if there's a match for the lowercased word: + + try { + analyzedGermanToken2 = tagger.lookup(token.toLowerCase()); + if (analyzedGermanToken2 != null) { + readings = analyzedGermanToken2.getGermanReadings(); + } + } catch (IOException e) { + throw new RuntimeException(e); + } + if (prevTokenIsDas) { + // e.g. essen -> Essen + final String newToken = StringTools.uppercaseFirstChar(token); + try { + analyzedGermanToken2 = tagger.lookup(newToken); + //analyzedGermanToken2.hasReadingOfType(GermanToken.POSType.VERB) + } catch (IOException e) { + throw new RuntimeException(e); + } + if (Character.isLowerCase(token.charAt(0)) && !substVerbenExceptions.contains(token)) { + final String msg = "Substantivierte Verben werden groß geschrieben."; + final RuleMatch ruleMatch = new RuleMatch(this, tokens[i].getStartPos(), + tokens[i].getStartPos()+token.length(), msg); + final String word = tokens[i].getToken(); + final String fixedWord = StringTools.uppercaseFirstChar(word); + ruleMatch.setSuggestedReplacement(fixedWord); + ruleMatches.add(ruleMatch); + } + } + } + prevTokenIsDas = nounIndicators.contains(tokens[i].getToken().toLowerCase()); + if (readings == null) + continue; + final boolean hasNounReading = analyzedToken.hasReadingOfType(GermanToken.POSType.NOMEN); + if (hasNounReading) // it's the spell checker's task to check that nouns are uppercase + continue; + try { + // TODO: this lookup should only happen once: + analyzedGermanToken2 = tagger.lookup(token.toLowerCase()); + } catch (IOException e) { + throw new RuntimeException(e); + } + if (analyzedToken.getAnalyzedToken(0).getPOSTag() == null && analyzedGermanToken2 == null) { + continue; + } + if (analyzedToken.getAnalyzedToken(0).getPOSTag() == null && analyzedGermanToken2 != null + && analyzedGermanToken2.getAnalyzedToken(0).getPOSTag() == null) { + // unknown word, probably a name etc + continue; + } + + if (Character.isUpperCase(token.charAt(0)) && + token.length() > 1 && // length limit = ignore 
abbreviations + !sentenceStartExceptions.contains(tokens[i-1].getToken()) && + !StringTools.isAllUppercase(token) && + !exceptions.contains(token) && + !analyzedToken.hasReadingOfType(POSType.PROPER_NOUN) && + !analyzedToken.isSentenceEnd() && + !isExceptionPhrase(i, tokens)) { + final String msg = "Außer am Satzanfang werden nur Nomen und Eigennamen groß geschrieben"; + final RuleMatch ruleMatch = new RuleMatch(this, tokens[i].getStartPos(), + tokens[i].getStartPos()+token.length(), msg); + final String word = tokens[i].getToken(); + final String fixedWord = Character.toLowerCase(word.charAt(0)) + word.substring(1); + ruleMatch.setSuggestedReplacement(fixedWord); + ruleMatches.add(ruleMatch); + } + pos += token.length(); + } + return toRuleMatchArray(ruleMatches); + } + + private boolean isExceptionPhrase(int i, AnalyzedTokenReadings[] tokens) { + // TODO: speed up? + for (String exc : myExceptionPhrases) { + final String[] parts = exc.split(" "); + for (int j = 0; j < parts.length; j++) { + if (parts[j].equals(tokens[i].getToken())) { + /*System.err.println("*******"+j + " of " + parts.length + ": " + parts[j]); + System.err.println("start:" + tokens[i-j].getToken()); + System.err.println("end:" + tokens[i-j+parts.length-1].getToken());*/ + final int startIndex = i-j; + if (compareLists(tokens, startIndex, startIndex+parts.length-1, parts)) { + return true; + } + } + } + } + return false; + } + + private boolean compareLists(AnalyzedTokenReadings[] tokens, int startIndex, int endIndex, String[] parts) { + if (startIndex < 0) + return false; + int i = 0; + for (int j = startIndex; j <= endIndex; j++) { + //System.err.println("**" +tokens[j].getToken() + " <-> "+ parts[i]); + if (i >= parts.length) + return false; + if (!tokens[j].getToken().equals(parts[i])) { + return false; + } + i++; + } + return true; + } + + public void reset() { + // nothing + } + +}
\ No newline at end of file diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/de/CompoundRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/de/CompoundRule.java new file mode 100644 index 0000000..f180acc --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/de/CompoundRule.java @@ -0,0 +1,53 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2006 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.rules.de; + +import java.io.IOException; +import java.util.ResourceBundle; + +import de.danielnaber.languagetool.JLanguageTool; +import de.danielnaber.languagetool.rules.AbstractCompoundRule; + +/** + * Checks that compounds (if in the list) are not written as separate words. 
+ * + * @author Daniel Naber + */ +public class CompoundRule extends AbstractCompoundRule { + + private static final String FILE_NAME = "/de/compounds.txt"; + + public CompoundRule(final ResourceBundle messages) throws IOException { + super(messages); + loadCompoundFile(JLanguageTool.getDataBroker().getFromResourceDirAsStream(FILE_NAME), "UTF-8"); + super.setShort("Hyphenation problem"); + super.setMsg("Dieses Kompositum wird mit Bindestrich geschrieben.", + "Dieses Kompositum wird zusammengeschrieben.", + "Dieses Kompositum wird zusammen oder mit Bindestrich geschrieben."); + } + + + public String getId() { + return "DE_COMPOUNDS"; + } + + public String getDescription() { + return "Zusammenschreibung von Komposita, z.B. 'CD-ROM' statt 'CD ROM'"; + } +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/de/DashRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/de/DashRule.java new file mode 100644 index 0000000..18bb670 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/de/DashRule.java @@ -0,0 +1,84 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.rules.de; + +import java.util.ArrayList; +import java.util.List; +import java.util.ResourceBundle; + +import de.danielnaber.languagetool.AnalyzedSentence; +import de.danielnaber.languagetool.AnalyzedTokenReadings; +import de.danielnaber.languagetool.rules.Category; +import de.danielnaber.languagetool.rules.RuleMatch; + +/** + * Prüft, dass in Bindestrich-Komposita kein Leerzeichen eingefügt wird (wie z.B. in 'Diäten- Erhöhung'). + * + * @author Daniel Naber + */ +public class DashRule extends GermanRule { + + public DashRule(final ResourceBundle messages) { + if (messages != null) + super.setCategory(new Category(messages.getString("category_misc"))); + } + + public String getId() { + return "DE_DASH"; + } + + public String getDescription() { + return "Keine Leerzeichen in Bindestrich-Komposita (wie z.B. 
in 'Diäten- Erhöhung')"; + } + + public RuleMatch[] match(final AnalyzedSentence text) { + final List<RuleMatch> ruleMatches = new ArrayList<RuleMatch>(); + final AnalyzedTokenReadings[] tokens = text.getTokensWithoutWhitespace(); + int pos = 0; + String prevToken = null; + for (int i = 0; i < tokens.length; i++) { + final String token = tokens[i].getToken(); + if (tokens[i].isWhitespace()) { + // ignore + continue; + } + if (prevToken != null && !prevToken.equals("-") && prevToken.indexOf("--") == -1 + && prevToken.indexOf("–-") == -1 // first char is some special kind of dash, found in Wikipedia + && prevToken.endsWith("-")) { + final char firstChar = token.charAt(0); + if (Character.isUpperCase(firstChar)) { + final String msg = "Möglicherweise fehlt ein 'und' oder es wurde nach dem Wort " + + "ein überflüssiges Leerzeichen eingefügt."; + final RuleMatch ruleMatch = new RuleMatch(this, tokens[i-1].getStartPos(), + tokens[i-1].getStartPos()+prevToken.length()+1, msg); + ruleMatch.setSuggestedReplacement(tokens[i-1].getToken()); + ruleMatches.add(ruleMatch); + } + } + prevToken = token; + pos += token.length(); + } + return toRuleMatchArray(ruleMatches); + } + + public void reset() { + // nothing + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/de/GermanLemmatizer.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/de/GermanLemmatizer.java new file mode 100644 index 0000000..ddcac98 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/de/GermanLemmatizer.java @@ -0,0 +1,84 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.rules.de; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.util.HashMap; +import java.util.Map; + +import de.danielnaber.languagetool.JLanguageTool; + +/** + * Trivial German lemmatizer that can simply find the baseforms of + * those fullforms listed in <code>rules/de/fullform2baseform.txt</code>. + * + * @author Daniel Naber + */ +class GermanLemmatizer { + + private static final String FILE_NAME = "/de/fullform2baseform.txt"; + private static final String FILE_ENCODING = "utf-8"; + + private final Map<String, String> fullform2baseform; + + GermanLemmatizer() throws IOException { + fullform2baseform = loadWords(JLanguageTool.getDataBroker().getFromRulesDirAsStream(FILE_NAME)); + } + + String getBaseform(final String fullform) { + return fullform2baseform.get(fullform); + } + + private Map<String, String> loadWords(InputStream file) throws IOException { + final Map<String, String> map = new HashMap<String, String>(); + InputStreamReader isr = null; + BufferedReader br = null; + try { + isr = new InputStreamReader(file, FILE_ENCODING); + br = new BufferedReader(isr); + String line; + while ((line = br.readLine()) != null) { + line = line.trim(); + if (line.length() < 1) { //ignore empty lines + continue; + } + if (line.charAt(0) == '#') { // ignore comments + continue; + } + final String[] parts = line.split(":"); + if (parts.length != 2) { + throw new 
IOException("Format error in file " +JLanguageTool.getDataBroker().getFromRulesDirAsUrl(FILE_NAME)+", line: " + line); + } + final String baseform = parts[0]; + final String[] fullforms = parts[1].split(","); + for (String fullform : fullforms) { + map.put(fullform.trim(), baseform); + } + } + } finally { + if (br != null) br.close(); + if (isr != null) isr.close(); + } + return map; + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/de/GermanRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/de/GermanRule.java new file mode 100644 index 0000000..1fca395 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/de/GermanRule.java @@ -0,0 +1,30 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.rules.de; + +import de.danielnaber.languagetool.rules.Rule; + +/** + * Abstract base class for rules for the German language. 
+ * + * @author Daniel Naber + */ +public abstract class GermanRule extends Rule { + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/de/GermanWordRepeatRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/de/GermanWordRepeatRule.java new file mode 100644 index 0000000..55f98b4 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/de/GermanWordRepeatRule.java @@ -0,0 +1,39 @@ +/* + * Created on 03.10.2009 + */ +package de.danielnaber.languagetool.rules.de; + +import java.util.ResourceBundle; + +import de.danielnaber.languagetool.AnalyzedTokenReadings; +import de.danielnaber.languagetool.Language; +import de.danielnaber.languagetool.rules.WordRepeatRule; + +/** + * Check if a word is repeated twice, taking into account an exception + * for German where e.g. "..., die die ..." is often okay. + * + * @author Daniel Naber + */ +public class GermanWordRepeatRule extends WordRepeatRule { + + public GermanWordRepeatRule(final ResourceBundle messages, final Language language) { + super(messages, language); + } + + @Override + public String getId() { + return "GERMAN_WORD_REPEAT_RULE"; + } + + @Override + public boolean ignore(final AnalyzedTokenReadings[] tokens, final int position) { + // Don't mark error for cases like: + // "wie Honda und Samsung, die die Bezahlung ihrer Firmenchefs..." 
+ if (position >= 2 && ",".equals(tokens[position - 2].getToken())) { + return true; + } + return false; + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/de/WiederVsWiderRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/de/WiederVsWiderRule.java new file mode 100644 index 0000000..ea1c2aa --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/de/WiederVsWiderRule.java @@ -0,0 +1,91 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.rules.de; + +import java.util.ArrayList; +import java.util.List; +import java.util.ResourceBundle; + +import de.danielnaber.languagetool.AnalyzedSentence; +import de.danielnaber.languagetool.AnalyzedTokenReadings; +import de.danielnaber.languagetool.rules.Category; +import de.danielnaber.languagetool.rules.RuleMatch; + +/** + * Check incorrect use of "spiegelt ... wider", namely using "wieder" instead + * of "wider", e.g. in "Das spiegelt die Situation wieder" (incorrect). 
+ * + * @author Daniel Naber + */ +public class WiederVsWiderRule extends GermanRule { + + public WiederVsWiderRule(ResourceBundle messages) { + if (messages != null) + super.setCategory(new Category(messages.getString("category_typo"))); + } + + public String getId() { + return "DE_WIEDER_VS_WIDER"; + } + + public String getDescription() { + return "Möglicher Tippfehler 'spiegeln ... wieder(wider)'"; + } + + public RuleMatch[] match(AnalyzedSentence text) { + final List<RuleMatch> ruleMatches = new ArrayList<RuleMatch>(); + final AnalyzedTokenReadings[] tokens = text.getTokens(); + int pos = 0; + boolean foundSpiegelt = false; + boolean foundWieder = false; + boolean foundWider = false; + for (AnalyzedTokenReadings token1 : tokens) { + final String token = token1.getToken(); + if (token.trim().equals("")) { + // ignore + } else { + if (token.equalsIgnoreCase("spiegelt") || token.equalsIgnoreCase("spiegeln") || token.equalsIgnoreCase("spiegelte") + || token.equalsIgnoreCase("spiegelten") || token.equalsIgnoreCase("spiegelst")) { + foundSpiegelt = true; + } else if (token.equalsIgnoreCase("wieder") && foundSpiegelt) { + foundWieder = true; + } else if (token.equalsIgnoreCase("wider") && foundSpiegelt) { + foundWider = true; + } + if (foundSpiegelt && foundWieder && !foundWider) { + final String msg = "'wider' in 'widerspiegeln' wird mit 'i' statt mit 'ie' " + + "geschrieben, z.B. 
'Das spiegelt die Situation gut wider.'"; + final RuleMatch ruleMatch = new RuleMatch(this, pos, pos + token.length(), msg); + ruleMatch.setSuggestedReplacement("wider"); + ruleMatches.add(ruleMatch); + foundSpiegelt = false; + foundWieder = false; + foundWider = false; + } + } + pos += token.length(); + } + return toRuleMatchArray(ruleMatches); + } + + public void reset() { + // nothing + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/de/WordCoherencyRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/de/WordCoherencyRule.java new file mode 100644 index 0000000..2bba43a --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/de/WordCoherencyRule.java @@ -0,0 +1,156 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.rules.de; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.ResourceBundle; + +import de.danielnaber.languagetool.AnalyzedSentence; +import de.danielnaber.languagetool.AnalyzedToken; +import de.danielnaber.languagetool.AnalyzedTokenReadings; +import de.danielnaber.languagetool.JLanguageTool; +import de.danielnaber.languagetool.rules.Category; +import de.danielnaber.languagetool.rules.RuleMatch; + +/** + * A rule that matches words for which two different spellings are used + * throughout the document. Currently only implemented for German. Loads + * the relevant word from <code>rules/de/coherency.txt</code>. + * + * <p>Note that this should not be used for language variations like + * American English vs. British English or German "alte Rechtschreibung" + * vs. "neue Rechtschreibung" -- that's the task of a spell checker. + * + * @author Daniel Naber + */ +public class WordCoherencyRule extends GermanRule { + + private static final String FILE_NAME = "/de/coherency.txt"; + private static final String FILE_ENCODING = "utf-8"; + + private final Map<String, String> relevantWords; // e.g. "aufwendig -> aufwändig" + private Map<String, RuleMatch> shouldNotAppearWord = new HashMap<String, RuleMatch>(); // e.g. 
aufwändig -> RuleMatch of aufwendig + + private final GermanLemmatizer germanLemmatizer; + + public WordCoherencyRule(ResourceBundle messages) throws IOException { + if (messages != null) + super.setCategory(new Category(messages.getString("category_misc"))); + relevantWords = loadWords(JLanguageTool.getDataBroker().getFromRulesDirAsStream(FILE_NAME)); + germanLemmatizer = new GermanLemmatizer(); + } + + public String getId() { + return "DE_WORD_COHERENCY"; + } + + public String getDescription() { + return "Einheitliche Schreibweise für Wörter mit mehr als einer korrekten Schreibweise"; + } + + public RuleMatch[] match(AnalyzedSentence text) { + final List<RuleMatch> ruleMatches = new ArrayList<RuleMatch>(); + final AnalyzedTokenReadings[] tokens = text.getTokens(); + int pos = 0; + for (AnalyzedTokenReadings tmpToken : tokens) { + //TODO: definitely should be changed + //if the general lemmatizer is working + //defaulting to the first element because the + //general German lemmatizer is not (yet) there + String token = tmpToken.getToken(); + if (tmpToken.isWhitespace()) { + // ignore + } else { + final String origToken = token; + final List<AnalyzedToken> readings = tmpToken.getReadings(); + // TODO: in theory we need to care about the other readings, too: + if (readings != null && readings.size() > 0) { + final String baseform = readings.get(0).getLemma(); + if (baseform != null) { + token = baseform; + } else { + // not all words are known by the Tagger (esp. 
compounds), so use the + // file lookup: + final String manualLookup = germanLemmatizer.getBaseform(origToken); + if (manualLookup != null) + token = manualLookup; + } + } + if (shouldNotAppearWord.containsKey(token)) { + final RuleMatch otherMatch = shouldNotAppearWord.get(token); + final String otherSpelling = otherMatch.getMessage(); + final String msg = "'" + token + "' und '" + otherSpelling + + "' sollten nicht gleichzeitig benutzt werden"; + final RuleMatch ruleMatch = new RuleMatch(this, pos, pos + origToken.length(), msg); + ruleMatch.setSuggestedReplacement(otherSpelling); + ruleMatches.add(ruleMatch); + } else if (relevantWords.containsKey(token)) { + final String shouldNotAppear = relevantWords.get(token); + // only used to display this spelling variation if the other one really occurs: + final RuleMatch potentialRuleMatch = new RuleMatch(this, pos, pos + origToken.length(), token); + shouldNotAppearWord.put(shouldNotAppear, potentialRuleMatch); + } + } + pos += tmpToken.getToken().length(); + } + return toRuleMatchArray(ruleMatches); + } + + private Map<String, String> loadWords(InputStream file) throws IOException { + final Map<String, String> map = new HashMap<String, String>(); + InputStreamReader isr = null; + BufferedReader br = null; + try { + isr = new InputStreamReader(file, FILE_ENCODING); + br = new BufferedReader(isr); + String line; + while ((line = br.readLine()) != null) { + line = line.trim(); + if (line.length() < 1) { + continue; + } + if (line.charAt(0) == '#') { // ignore comments + continue; + } + final String[] parts = line.split(";"); + if (parts.length != 2) { + throw new IOException("Format error in file " + JLanguageTool.getDataBroker().getFromRulesDirAsUrl(FILE_NAME) + ", line: " + line); + } + map.put(parts[0], parts[1]); + map.put(parts[1], parts[0]); + } + } finally { + if (br != null) br.close(); + if (isr != null) isr.close(); + } + return map; + } + + public void reset() { + shouldNotAppearWord = new HashMap<String, 
RuleMatch>(); + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/en/AvsAnRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/en/AvsAnRule.java new file mode 100644 index 0000000..ae02ef5 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/en/AvsAnRule.java @@ -0,0 +1,251 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.rules.en; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.util.ArrayList; +import java.util.List; +import java.util.ResourceBundle; +import java.util.TreeSet; + +import de.danielnaber.languagetool.AnalyzedSentence; +import de.danielnaber.languagetool.AnalyzedTokenReadings; +import de.danielnaber.languagetool.JLanguageTool; +import de.danielnaber.languagetool.rules.Category; +import de.danielnaber.languagetool.rules.RuleMatch; +import de.danielnaber.languagetool.tools.StringTools; + +/** + * Check if the determiner (if any) preceding a word is: + * <ul> + * <li><i>an</i> if the next word starts with a vowel + * <li><i>a</i> if the next word 
does not start with a vowel + * </ul> + * This rule loads some exceptions from external files (e.g. <i>an hour</i>). + * + * @author Daniel Naber + */ +public class AvsAnRule extends EnglishRule { + + private static final String FILENAME_A = "/en/det_a.txt"; + private static final String FILENAME_AN = "/en/det_an.txt"; + + private final TreeSet<String> requiresA; + private final TreeSet<String> requiresAn; + + public AvsAnRule(final ResourceBundle messages) throws IOException { + if (messages != null) { + super.setCategory(new Category(messages.getString("category_misc"))); + } + requiresA = loadWords(JLanguageTool.getDataBroker().getFromRulesDirAsStream(FILENAME_A)); + requiresAn = loadWords(JLanguageTool.getDataBroker().getFromRulesDirAsStream(FILENAME_AN)); + } + + @Override + public String getId() { + return "EN_A_VS_AN"; + } + + @Override + public String getDescription() { + return "Use of 'a' vs. 'an'"; + } + + @Override + public RuleMatch[] match(final AnalyzedSentence text) { + final List<RuleMatch> ruleMatches = new ArrayList<RuleMatch>(); + final AnalyzedTokenReadings[] tokens = text.getTokensWithoutWhitespace(); + String prevToken = ""; + int prevPos = 0; + //ignoring token 0, i.e., SENT_START + for (int i = 1; i < tokens.length; i++) { + String token = tokens[i].getToken(); + boolean doesRequireA = false; + boolean doesRequireAn = false; + // check for exceptions: + boolean isException = false; + final String[] parts = token.split("[-']"); // for example, in "one-way" only "one" is relevant + if (parts.length >= 1 && + !parts[0].equalsIgnoreCase("a")) { // avoid false alarm on "A-levels are..." + token = parts[0]; + } + token = token.replaceAll("[^a-zA-Z0-9\\.']", ""); // e.g. 
>>an "industry party"<< + if (StringTools.isEmpty(token)) { + continue; + } + final char tokenFirstChar = token.charAt(0); + if (requiresA.contains(token.toLowerCase()) || requiresA.contains(token)) { + isException = true; + doesRequireA = true; + } + if (requiresAn.contains(token.toLowerCase()) || requiresAn.contains(token)) { + if (isException) { + throw new IllegalStateException(token + " is listed in both det_a.txt and det_an.txt"); + } + isException = true; + doesRequireAn = true; + } + + if (!isException) { + if (StringTools.isAllUppercase(token) || StringTools.isMixedCase(token)) { + // we don't know how all-uppercase and mixed case words (often abbreviations) are pronounced, + // so never complain about these: + doesRequireAn = false; + doesRequireA = false; + } else if (isVowel(tokenFirstChar)) { + doesRequireAn = true; + } else { + doesRequireA = true; + } + } + //System.err.println(prevToken + " " +token + ", a="+doesRequireA + ", an="+doesRequireAn); + String msg = null; + if (prevToken.equalsIgnoreCase("a") && doesRequireAn) { + String replacement = "an"; + if (prevToken.equals("A")) { + replacement = "An"; + } + msg = "Use <suggestion>" +replacement+ "</suggestion> instead of '" +prevToken+ "' if the following "+ + "word starts with a vowel sound, e.g. 'an article', " + + "'an hour'"; + } else if (prevToken.equalsIgnoreCase("an") && doesRequireA) { + String replacement = "a"; + if (prevToken.equals("An")) { + replacement = "A"; + } + msg = "Use <suggestion>" +replacement+ "</suggestion> instead of '" +prevToken+ "' if the following "+ + "word doesn't start with a vowel sound, e.g. 
'a sentence', " + + "'a university'"; + } + if (msg != null) { + final RuleMatch ruleMatch = new RuleMatch(this, prevPos, prevPos+prevToken.length(), msg, "Wrong article"); + ruleMatches.add(ruleMatch); + } + if (tokens[i].hasPosTag("DT")) { + prevToken = token; + prevPos = tokens[i].getStartPos(); + } else { + prevToken = ""; + } + } + return toRuleMatchArray(ruleMatches); + } + + /** + * Adds "a" or "an" to the English noun. + * Used for suggesting the proper form of the + * indefinite article. + * @param noun Word that needs an article. + * @return String containing the word with a determiner, + * or just the word if the word is an abbreviation. + */ + public final String suggestAorAn(final String noun) { + String word = noun; + boolean doesRequireA = false; + boolean doesRequireAn = false; + // check for exceptions: + boolean isException = false; + final String[] parts = word.split("[-']"); // for example, in "one-way" only "one" is relevant + if (parts.length >= 1 && + !parts[0].equalsIgnoreCase("a")) { // avoid false alarm on "A-levels are..." + word = parts[0]; + } + //html entities! + word = word.replaceAll(""|&|<|>|[^a-zA-Z0-9]", ""); // e.g. 
>>an "industry party"<< + if (StringTools.isEmpty(word)) { + return word; + } + final char tokenFirstChar = word.charAt(0); + if (requiresA.contains(word.toLowerCase()) || requiresA.contains(word)) { + isException = true; + doesRequireA = true; + } + if (requiresAn.contains(word.toLowerCase()) || requiresAn.contains(word)) { + if (isException) { + throw new IllegalStateException(word + " is listed in both det_a.txt and det_an.txt"); + } + isException = true; + doesRequireAn = true; + } + if (!isException) { + if (StringTools.isAllUppercase(word) || StringTools.isMixedCase(word)) { + // we don't know how all-uppercase words (often abbreviations) are pronounced, + // so never complain about these: + doesRequireAn = false; + doesRequireA = false; + } else if (isVowel(tokenFirstChar)) { + doesRequireAn = true; + } else { + doesRequireA = true; + } + } + if (doesRequireA) { + return "a " + noun; + } else if (doesRequireAn) { + return "an " + noun; + } else { + return noun; + } + } + + private static boolean isVowel(char c) { + c = Character.toLowerCase(c); + return c == 'a' || c == 'e' || c == 'i' || c == 'o' || c == 'u'; + } + + /** + * Load words, normalized to lowercase. 
+ */ + private TreeSet<String> loadWords(final InputStream file) throws IOException { + BufferedReader br = null; + final TreeSet<String> set = new TreeSet<String>(); + try { + br = new BufferedReader(new InputStreamReader(file)); + String line; + while ((line = br.readLine()) != null) { + line = line.trim(); + if (line.length() < 1) { + continue; + } + if (line.charAt(0) == '#') { + continue; + } + if (line.charAt(0) == '*') { + set.add(line.substring(1)); + } else { + set.add(line.toLowerCase()); + } + } + } finally { + if (br != null) { + br.close(); + } + } + return set; + } + + @Override + public void reset() { + // nothing + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/en/CompoundRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/en/CompoundRule.java new file mode 100644 index 0000000..0e01523 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/en/CompoundRule.java @@ -0,0 +1,55 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2006 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.rules.en; + +import java.io.IOException; +import java.util.ResourceBundle; + +import de.danielnaber.languagetool.JLanguageTool; +import de.danielnaber.languagetool.rules.AbstractCompoundRule; + +/** + * Checks that compounds (if in the list) are not written as separate words. + * + * @author Marcin Miłkowski, based on code by Daniel Naber + */ + +public class CompoundRule extends AbstractCompoundRule { + + private static final String FILE_NAME = "/en/compounds.txt"; + + public CompoundRule(final ResourceBundle messages) throws IOException { + super(messages); + loadCompoundFile(JLanguageTool.getDataBroker().getFromResourceDirAsStream(FILE_NAME), "UTF-8"); + super.setShort("Hyphenation problem"); + super.setMsg("This word is normally spelled with hyphen.", + "This word is normally spelled as one.", + "This expression is normally spelled as one or with hyphen."); + } + + public String getId() { + return "EN_COMPOUNDS"; + } + + public String getDescription() { + return "Hyphenated words, e.g., 'case-sensitive' instead of 'case sensitive'"; + } + + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/en/EnglishRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/en/EnglishRule.java new file mode 100644 index 0000000..cd0036d --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/en/EnglishRule.java @@ -0,0 +1,30 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the 
License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.rules.en; + +import de.danielnaber.languagetool.rules.Rule; + +/** + * Abstract base class for rules for the English language. + * + * @author Daniel Naber + */ +public abstract class EnglishRule extends Rule { + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/en/EnglishUnpairedBracketsRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/en/EnglishUnpairedBracketsRule.java new file mode 100644 index 0000000..4b32c05 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/en/EnglishUnpairedBracketsRule.java @@ -0,0 +1,89 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2010 Daniel Naber (http://www.languagetool.org) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ + +package de.danielnaber.languagetool.rules.en; + +import java.util.ResourceBundle; +import java.util.regex.Pattern; + +import de.danielnaber.languagetool.AnalyzedTokenReadings; +import de.danielnaber.languagetool.Language; +import de.danielnaber.languagetool.rules.GenericUnpairedBracketsRule; + +public class EnglishUnpairedBracketsRule extends GenericUnpairedBracketsRule { + + private static final String[] EN_START_SYMBOLS = { "[", "(", "{", "“", "\"", "'" }; + private static final String[] EN_END_SYMBOLS = { "]", ")", "}", "”", "\"", "'" }; + + private static final Pattern NUMBER = Pattern.compile("\\d+"); + + public EnglishUnpairedBracketsRule(final ResourceBundle messages, + final Language language) { + super(messages, language); + startSymbols = EN_START_SYMBOLS; + endSymbols = EN_END_SYMBOLS; + } + + public String getId() { + return "EN_UNPAIRED_BRACKETS"; + } + + protected boolean isNoException(final String token, + final AnalyzedTokenReadings[] tokens, final int i, final int j, final boolean precSpace, + final boolean follSpace) { + + +//TODO: add an', o', 'till, 'tain't, 'cept, 'fore in the disambiguator +//and mark up as contractions somehow +// add exception for dates like '52 + + if (i <= 1) { + return true; + } + + if (!precSpace && follSpace) { + // exception for English inches, e.g., 20" + if ("\"".equals(token) + && NUMBER.matcher(tokens[i - 1].getToken()).matches()) { + return false; + } + // Exception for English plural Saxon genetive + // current disambiguation scheme is a bit too greedy + // for adjectives + if ("'".equals(token) && tokens[i].hasPosTag("POS")) { + return false; + } + // puttin' on the Ritz + if ("'".equals(token) && tokens[i - 1].hasPosTag("VBG") + && tokens[i - 1].getToken().endsWith("in")) { + 
return false; + } + } + if (precSpace && !follSpace) { + // hold 'em! + if ("'".equals(token) && i + 1 < tokens.length + && "em".equals(tokens[i + 1].getToken())) { + return false; + } + } + return true; + } + + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/es/ElwithFemRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/es/ElwithFemRule.java new file mode 100644 index 0000000..c22b9a3 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/es/ElwithFemRule.java @@ -0,0 +1,179 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.rules.es; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.util.ArrayList; +import java.util.List; +import java.util.ResourceBundle; +import java.util.TreeSet; + +import de.danielnaber.languagetool.AnalyzedSentence; +import de.danielnaber.languagetool.AnalyzedTokenReadings; +import de.danielnaber.languagetool.JLanguageTool; +import de.danielnaber.languagetool.rules.Category; +import de.danielnaber.languagetool.rules.RuleMatch; +import de.danielnaber.languagetool.tools.StringTools; + +/** + * Check if the determiner (if any) preceding a feminine noun is "el". This + * rule loads a list of words (feminine nouns starting with stressed ha- or a-) + * from an external file. These words enforce the use of 'el' as determiner + * instead of 'la' (also with 'un', 'algun' and 'ningun'). 
+ * + * Sample + * + * *la alma -> el alma + * *la hambre -> el hambre + * + * http://blog.lengua-e.com/2007/el-arma-determinante-masculino-ante-nombre-femenino/ + * http://tinyurl.com/m9uzte + * + * + * @author Susana Sotelo Docio + * + * based on English AvsAnRule rule + */ +public class ElwithFemRule extends SpanishRule { + + private static final String FILENAME_EL = "/es/el.txt"; + private final TreeSet<String> requiresEl; + + public ElwithFemRule(final ResourceBundle messages) throws IOException { + if (messages != null) { + super.setCategory(new Category(messages.getString("category_misc"))); + } + requiresEl = loadWords(JLanguageTool.getDataBroker().getFromRulesDirAsStream(FILENAME_EL)); + } + + @Override + public String getId() { + return "EL_WITH_FEM"; + } + + @Override + public String getDescription() { + return "Uso de 'el' con sustantivos femeninos que comienzan por a- o ha- t\u00f3nicas"; + } + + @Override + public RuleMatch[] match(final AnalyzedSentence text) { + final List<RuleMatch> ruleMatches = new ArrayList<RuleMatch>(); + final AnalyzedTokenReadings[] tokens = text.getTokensWithoutWhitespace(); + String prevToken = ""; + int prevPos = 0; + //ignoring token 0, i.e., SENT_START + for (int i = 1; i < tokens.length; i++) { + String token = tokens[i].getToken(); + boolean doesRequireEl = false; + + token = token.replaceAll("[^a-záéíóúñüA-ZÁÉÍÓÚÑÜ0-9\\.']", ""); // el 'alma' + if (StringTools.isEmpty(token)) { + continue; + } + if (requiresEl.contains(token.toLowerCase()) || requiresEl.contains(token)) { + doesRequireEl = true; + } + + // FIXME: temporal solution for "La Haya" (change) + if (prevToken.equals("La") && token.equals("Haya")) { + doesRequireEl = false; + } + + String msg = null; + String replacement = null; + if (prevToken.equalsIgnoreCase("la") && doesRequireEl) + { + replacement = "el"; + if (prevToken.equals("La")) { replacement = "El"; } + } + else if (prevToken.equalsIgnoreCase("una") && doesRequireEl) + { + replacement = "un"; + if 
(prevToken.equals("Una")) { replacement = "Un"; } + } + else if (prevToken.equalsIgnoreCase("alguna") && doesRequireEl) + { + replacement = "alg\u00fan"; + if (prevToken.equals("Alguna")) { replacement = "Alg\u00fan"; } + } + else if (prevToken.equalsIgnoreCase("ninguna") && doesRequireEl) + { + replacement = "ning\u00fan"; + if (prevToken.equals("Ninguna")) { replacement = "Ning\u00fan"; } + } + + msg = "Use <suggestion>" +replacement+ "</suggestion> en lugar de '" +prevToken+ "' si la siguiente "+ + "palabra comienza por 'a' o 'ha' t\u00f3nicas, por ejemplo 'el hampa', " + + "'un agua'"; + + + if (replacement != null) { + final RuleMatch ruleMatch = new RuleMatch(this, prevPos, prevPos+prevToken.length(), msg, "Art\u00edculo incorrecto"); + ruleMatches.add(ruleMatch); + } + if (tokens[i].hasPosTag("DA0FS0") || tokens[i].hasPosTag("DI0FS0") ) { + prevToken = token; + prevPos = tokens[i].getStartPos(); + } else { + prevToken = ""; + } + } + return toRuleMatchArray(ruleMatches); + } + + /** + * Load words, normalized to lowercase. 
+ */ + private TreeSet<String> loadWords(final InputStream file) throws IOException { + BufferedReader br = null; + final TreeSet<String> set = new TreeSet<String>(); + try { + br = new BufferedReader(new InputStreamReader(file, "utf-8")); + String line; + while ((line = br.readLine()) != null) { + line = line.trim(); + if (line.length() < 1) { + continue; + } + if (line.charAt(0) == '#') { + continue; + } + if (line.charAt(0) == '*') { + set.add(line.substring(1)); + } else { + set.add(line.toLowerCase()); + } + } + } finally { + if (br != null) { + br.close(); + } + } + return set; + } + + @Override + public void reset() { + // nothing + } +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/es/SpanishRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/es/SpanishRule.java new file mode 100644 index 0000000..4aaa297 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/es/SpanishRule.java @@ -0,0 +1,32 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.rules.es; + +import de.danielnaber.languagetool.rules.Rule; + +/** + * Abstract base class for rules for Spanish. + * Declares no members of its own; it only extends {@link Rule}. + * ElwithFemRule in this package extends it. + * + * @author Susana Sotelo Docio + * + * based on English rules + */ +public abstract class SpanishRule extends Rule { + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/fr/FrenchRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/fr/FrenchRule.java new file mode 100644 index 0000000..2ad4bcc --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/fr/FrenchRule.java @@ -0,0 +1,31 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ + +package de.danielnaber.languagetool.rules.fr; + +import de.danielnaber.languagetool.rules.Rule; + +/** + * Abstract base class for French rules. 
+ * + * @author Marcin Milkowski + */ +public abstract class FrenchRule extends Rule { + /* Declares no members of its own; it only extends Rule. QuestionWhitespaceRule in this package extends it. */ +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/fr/QuestionWhitespaceRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/fr/QuestionWhitespaceRule.java new file mode 100644 index 0000000..4c03049 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/fr/QuestionWhitespaceRule.java @@ -0,0 +1,161 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.rules.fr; + +import java.util.ArrayList; +import java.util.List; +import java.util.ResourceBundle; + +import de.danielnaber.languagetool.AnalyzedSentence; +import de.danielnaber.languagetool.AnalyzedTokenReadings; +import de.danielnaber.languagetool.rules.Category; +import de.danielnaber.languagetool.rules.RuleMatch; +import de.danielnaber.languagetool.tools.StringTools; + +/** + * A rule that matches spaces before ?,:,; and ! (required for correct French + * punctuation). 
+ * + * @author Marcin Miłkowski + */ +public class QuestionWhitespaceRule extends FrenchRule { + + public QuestionWhitespaceRule(final ResourceBundle messages) { + // super(messages); + super.setCategory(new Category(messages.getString("category_misc"))); + } + + @Override + public String getId() { + return "FRENCH_WHITESPACE"; + } + + @Override + public String getDescription() { + return "Insertion des espaces fines insécables"; + } + + @Override + public RuleMatch[] match(final AnalyzedSentence text) { + final List<RuleMatch> ruleMatches = new ArrayList<RuleMatch>(); + final AnalyzedTokenReadings[] tokens = text.getTokens(); + String prevToken = ""; + int pos = 0; + for (int i = 1; i < tokens.length; i++) { + final String token = tokens[i].getToken(); + final boolean isWhiteBefore = tokens[i].isWhitespaceBefore(); + pos += token.length(); + String msg = null; + final int fixPos = 0; + int fixLen = 0; + String suggestionText = null; + if (isWhiteBefore) { + if (token.equals("?")) { + msg = "Point d'interrogation est précédé d'une espace fine insécable."; + // non-breaking space + suggestionText = " ?"; + fixLen = 1; + } else if (token.equals("!")) { + msg = "Point d'exclamation est précédé d'une espace fine insécable."; + // non-breaking space + suggestionText = " !"; + fixLen = 1; + } else if (token.equals("»")) { + msg = "Le guillemet fermant est précédé d'une espace fine insécable."; + // non-breaking space + suggestionText = " »"; + fixLen = 1; + } else if (token.equals(";")) { + msg = "Point-virgule est précédé d'une espace fine insécable."; + // non-breaking space + suggestionText = " ;"; + fixLen = 1; + } else if (token.equals(":")) { + msg = "Deux-points sont précédé d'une espace fine insécable."; + // non-breaking space + suggestionText = " :"; + fixLen = 1; + } + } else { + if (token.equals("?") && !prevToken.equals("!") + && !prevToken.equals("\u00a0")) { + msg = "Point d'interrogation est précédé d'une espace fine insécable."; + // non-breaking space 
+ suggestionText = prevToken + " ?"; + fixLen = 1; + } else if (token.equals("!") && !prevToken.equals("?") + && !prevToken.equals("\u00a0")) { + msg = "Point d'exclamation est précédé d'une espace fine insécable."; + // non-breaking space + suggestionText = prevToken + " !"; + fixLen = 1; + } else if (token.equals(";") && !prevToken.equals("\u00a0")) { + msg = "Point-virgule est précédé d'une espace fine insécable."; + // non-breaking space + suggestionText = prevToken + " ;"; + fixLen = 1; + } else if (token.equals(":") && !prevToken.equals("\u00a0")) { + msg = "Deux-points précédés d'une espace fine insécable."; + // non-breaking space + suggestionText = prevToken + " :"; + fixLen = 1; + } else if (token.equals("»") && !prevToken.equals("\u00a0")) { + msg = "Le guillemet fermant est précédé d'une espace fine insécable."; + // non-breaking space + suggestionText = prevToken + " »"; + fixLen = 1; + } + } + + if (StringTools.isEmpty(token) && prevToken.equals("«")) { + msg = "Le guillemet ouvrant est suivi d'une espace fine insécable."; + // non-breaking space + suggestionText = "« "; + fixLen = 1; + } else if (!StringTools.isEmpty(token) && !token.equals("\u00a0") + && prevToken.equals("«")) { + msg = "Le guillemet ouvrant est suivi d'une espace fine insécable."; + // non-breaking space + suggestionText = "« "; + fixLen = 0; + } + + if (msg != null) { + final int fromPos = tokens[i - 1].getStartPos() + fixPos; + final int toPos = tokens[i - 1].getStartPos() + fixPos + fixLen + + tokens[i - 1].getToken().length(); + final RuleMatch ruleMatch = new RuleMatch(this, fromPos, toPos, msg, + "Insérer un espace insécable"); + if (suggestionText != null) { + ruleMatch.setSuggestedReplacement(suggestionText); + } + ruleMatches.add(ruleMatch); + } + prevToken = token; + } + + return toRuleMatchArray(ruleMatches); + } + + @Override + public void reset() { + // nothing + } + +} diff --git 
a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/AbstractPatternRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/AbstractPatternRule.java new file mode 100644 index 0000000..d172134 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/AbstractPatternRule.java @@ -0,0 +1,223 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2008 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ + +package de.danielnaber.languagetool.rules.patterns; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import de.danielnaber.languagetool.AnalyzedSentence; +import de.danielnaber.languagetool.AnalyzedToken; +import de.danielnaber.languagetool.AnalyzedTokenReadings; +import de.danielnaber.languagetool.Language; +import de.danielnaber.languagetool.rules.Rule; +import de.danielnaber.languagetool.rules.RuleMatch; + +/** + * An Abstract Pattern Rule that describes a pattern of words or part-of-speech tags + * used for PatternRule and DisambiguationPatternRule. + * + * Introduced to minimize code duplication between those classes. 
+ * + * @author Marcin Miłkowski + */ + +public abstract class AbstractPatternRule extends Rule { + + private final String id; + + private final String description; + + protected final List<Element> patternElements; + + protected Unifier unifier; + + protected final Language language; + + protected int startPositionCorrection; + + protected int endPositionCorrection; + + protected boolean prevMatched; + + protected final boolean testUnification; + + private final boolean getUnified; + + private boolean groupsOrUnification; + + protected AnalyzedTokenReadings[] unifiedTokens; + + protected final boolean sentStart; + + public AbstractPatternRule(final String id, + final String description, + final Language language, + final List<Element> elements, + boolean getUnified) { + this.id = id; + this.description = description; + this.patternElements = new ArrayList<Element>(elements); // copy elements + this.language = language; + this.getUnified = getUnified; + unifier = language.getUnifier(); + testUnification = initUnifier(); + sentStart = patternElements.get(0).isSentStart(); + if (!testUnification) { + for (Element elem : patternElements) { + if (elem.hasAndGroup()) { + groupsOrUnification = true; + break; + } + } + } else { + groupsOrUnification = true; + } + } + + private boolean initUnifier() { + for (final Element elem : patternElements) { + if (elem.isUnified()) { + return true; + } + } + return false; + } + + @Override + public final String toString() { + return id + ":" + patternElements + ":" + description; + } + + @Override + public String getDescription() { + return description; + } + + @Override + public String getId() { + return id; + } + + @Override + public RuleMatch[] match(AnalyzedSentence text) throws IOException { + // TODO Auto-generated method stub + return null; + } + + @Override + public void reset() { + // TODO Auto-generated method stub + } + + public final void setStartPositionCorrection(final int startPositionCorrection) { + 
this.startPositionCorrection = startPositionCorrection; + } + + public final void setEndPositionCorrection(final int endPositionCorrection) { + this.endPositionCorrection = endPositionCorrection; + } + + + protected void setupAndGroup(final int firstMatchToken, + final Element elem, final AnalyzedTokenReadings[] tokens) + throws IOException { + if (elem.hasAndGroup()) { + for (final Element andElement : elem.getAndGroup()) { + if (andElement.isReferenceElement()) { + setupRef(firstMatchToken, andElement, tokens); + } + } + elem.setupAndGroup(); + } + } + + //TODO: add .compile for all exceptions of the element? + protected void setupRef(final int firstMatchToken, final Element elem, + final AnalyzedTokenReadings[] tokens) throws IOException { + if (elem.isReferenceElement()) { + final int refPos = firstMatchToken + elem.getMatch().getTokenRef(); + if (refPos < tokens.length) { + elem.compile(tokens[refPos], language.getSynthesizer()); + } + } + } + + protected boolean testAllReadings(final AnalyzedTokenReadings[] tokens, + final Element elem, final Element prevElement, final int tokenNo, + final int firstMatchToken, final int prevSkipNext) throws IOException { + boolean thisMatched = false; + final int numberOfReadings = tokens[tokenNo].getReadingsLength(); + setupAndGroup(firstMatchToken, elem, tokens); + for (int l = 0; l < numberOfReadings; l++) { + final AnalyzedToken matchToken = tokens[tokenNo].getAnalyzedToken(l); + prevMatched = prevMatched || prevSkipNext > 0 && prevElement != null + && prevElement.isMatchedByScopeNextException(matchToken); + if (prevMatched) { + return false; + } + thisMatched = thisMatched || elem.isMatched(matchToken); + if (!thisMatched && !elem.isInflected() && elem.getPOStag() == null + && (prevElement != null && prevElement.getExceptionList() == null)) { + return false; // the token is the same, we will not get a match + } + if (groupsOrUnification) { + thisMatched &= testUnificationAndGroups(thisMatched, + l + 1 == 
numberOfReadings, matchToken, elem); + } + } + if (thisMatched) { + for (int l = 0; l < numberOfReadings; l++) { + if (elem.isExceptionMatchedCompletely(tokens[tokenNo].getAnalyzedToken(l))) + return false; + } + if (tokenNo > 0 && elem.hasPreviousException()) { + if (elem.isMatchedByPreviousException(tokens[tokenNo - 1])) + return false; + } + } + return thisMatched; + } + + protected boolean testUnificationAndGroups(final boolean matched, + final boolean lastReading, final AnalyzedToken matchToken, + final Element elem) { + boolean thisMatched = matched; + if (testUnification) { + if (matched && elem.isUnified()) { + thisMatched = thisMatched && unifier.isUnified(matchToken, elem.getUniFeatures(), + elem.isUniNegated(), lastReading); + } + if (thisMatched && getUnified) { + unifiedTokens = unifier.getFinalUnified(); + } + if (!elem.isUnified()) { + unifier.reset(); + } + } + elem.addMemberAndGroup(matchToken); + if (lastReading) { + thisMatched &= elem.checkAndGroup(thisMatched); + } + return thisMatched; + } + + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/Element.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/Element.java new file mode 100644 index 0000000..0ad7c1f --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/Element.java @@ -0,0 +1,803 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.rules.patterns; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Map; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import de.danielnaber.languagetool.AnalyzedToken; +import de.danielnaber.languagetool.AnalyzedTokenReadings; +import de.danielnaber.languagetool.JLanguageTool; +import de.danielnaber.languagetool.synthesis.Synthesizer; +import de.danielnaber.languagetool.tools.StringTools; + +/** + * A part of a pattern. + * + * @author Daniel Naber + */ +public class Element { + + private String stringToken; + private String posToken; + private String regToken; + private boolean posRegExp; + + private boolean negation; + private boolean posNegation; + + private final boolean caseSensitive; + private final boolean stringRegExp; + private boolean inflected; + + private boolean testWhitespace; + private boolean whitespaceBefore; + + /** + * List of exceptions that are valid for the current token and / or some next + * tokens. + */ + private List<Element> exceptionList; + + /** + * True if scope=="next". + */ + private boolean exceptionValidNext; + + /** + * True if any exception with a scope=="current" or scope=="next" is set for + * the element. + */ + private boolean exceptionSet; + + /** + * True if attribute scope=="previous". + */ + private boolean exceptionValidPrevious; + + /** + * List of exceptions that are valid for a previous token. 
+ */ + private List<Element> previousExceptionList; + + private List<Element> andGroupList; + private boolean andGroupSet; + private boolean[] andGroupCheck; + + private int skip; + + private Pattern p; + private Pattern pPos; + + private Matcher m; + private Matcher mPos; + + /** The reference to another element in the pattern. **/ + private Match tokenReference; + + /** + * True when the element stores a formatted reference to another element of + * the pattern. + */ + private boolean containsMatches; + + /** Matches only tokens without any POS tag. **/ + private static final String UNKNOWN_TAG = "UNKNOWN"; + + /** + * Parameter passed to regular expression matcher to enable case insensitive + * Unicode matching. + */ + private static final String CASE_INSENSITIVE = "(?iu)"; + + private String referenceString; + + /** String ID of the phrase the element is in. **/ + private String phraseName; + + /** + * This var is used to determine if calling {@link #setStringElement} makes + * sense. This method takes most time so it's best to reduce the number of its + * calls. + **/ + private boolean testString; + + /** + * Tells if the element is inside the unification, so that {@link Unifier} + * tests it. + */ + private boolean unified; + private boolean uniNegation; + + private Map<String, List<String>> unificationFeatures; + + /** + * Creates Element that is used to match tokens in the text. + * + * @param token + * String to be matched + * @param caseSensitive + * True if the check is case-sensitive. + * @param regExp + * True if the check uses regular expressions. + * @param inflected + * True if the check refers to base forms (lemmas). + */ + public Element(final String token, final boolean caseSensitive, + final boolean regExp, final boolean inflected) { + this.caseSensitive = caseSensitive; + this.stringRegExp = regExp; + this.inflected = inflected; + setStringElement(token); + } + + /** + * Checks whether the rule element matches the token given as a parameter. 
+ * + * @param token + * @AnalyzedToken to check matching against + * @return True if token matches, false otherwise. + */ + public final boolean isMatched(final AnalyzedToken token) { + if (testWhitespace && !isWhitespaceBefore(token)) { + return false; + } + boolean matched = false; + if (testString) { + matched = (isStringTokenMatched(token) ^ negation) + && (isPosTokenMatched(token) ^ posNegation); + } else { + matched = (!negation) && (isPosTokenMatched(token) ^ posNegation); + } + + if (andGroupSet) { + andGroupCheck[0] |= matched; + } + return matched; + } + + /** + * Checks whether an exception matches. + * + * @param token + * @AnalyzedToken to check matching against + * @return True if any of the exceptions matches (logical disjunction). + */ + public final boolean isExceptionMatched(final AnalyzedToken token) { + if (exceptionSet) { + for (final Element testException : exceptionList) { + if (!testException.exceptionValidNext) { + if (testException.isMatched(token)) { + return true; + } + } + } + } + return false; + } + + /** + * Enables testing multiple conditions specified by different elements. + * Doesn't test exceptions. + * + * Works as logical AND operator only if preceded with + * {@link #setupAndGroup()}, and followed by {@link #checkAndGroup(boolean)}. + * + * @param token + * AnalyzedToken - the token checked. 
+ */ + public final void addMemberAndGroup(final AnalyzedToken token) { + if (andGroupSet) { + for (int i = 0; i < andGroupList.size(); i++) { + if (!andGroupCheck[i + 1]) { + final Element testAndGroup = andGroupList.get(i); + if (testAndGroup.isMatched(token)) { + andGroupCheck[i + 1] = true; + } + } + } + } + } + + public final void setupAndGroup() { + if (andGroupSet) { + andGroupCheck = new boolean[andGroupList.size() + 1]; + Arrays.fill(andGroupCheck, false); + } + } + + public final boolean checkAndGroup(final boolean previousValue) { + if (andGroupSet) { + boolean allConditionsMatch = true; + for (final boolean testValue : andGroupCheck) { + allConditionsMatch &= testValue; + } + return allConditionsMatch; + } + return previousValue; + } + + /** + * Enables testing multiple conditions specified by multiple element + * exceptions. + * + * Works as logical AND operator. + * + * @param token + * AnalyzedToken - the token checked for exceptions. + * @return true if all conditions are met, false otherwise. + */ + public final boolean isAndExceptionGroupMatched(final AnalyzedToken token) { + if (andGroupSet) { + for (final Element testAndGroup : andGroupList) { + if (testAndGroup.isExceptionMatched(token)) { + return true; + } + } + } + return false; + } + + /** + * This method checks exceptions both in AND-group and the token. Introduced + * to for clarity. + * + * @param token + * Token to match + * @return True if matched. + */ + public final boolean isExceptionMatchedCompletely(final AnalyzedToken token) { + // note: short-circuiting possible + return isExceptionMatched(token) || isAndExceptionGroupMatched(token); + } + + public final void setAndGroupElement(final Element andToken) { + if (andToken != null) { + if (andGroupList == null) { + andGroupList = new ArrayList<Element>(); + } + if (!andGroupSet) { + andGroupSet = true; + } + andGroupList.add(andToken); + } + } + + /** + * Checks if this element has an AND group associated with it. 
+ * + * @return true if the element has a group of elements that all should match. + */ + public final boolean hasAndGroup() { + return andGroupSet; + } + + /** + * Returns the group of elements linked with AND operator. + * + * @return List of Elements. + */ + public final List<Element> getAndGroup() { + return andGroupList; + } + + /** + * Checks whether a previously set exception matches (in case the exception + * had scope == "next"). + * + * @param token + * @AnalyzedToken to check matching against. + * @return True if any of the exceptions matches. + */ + public final boolean isMatchedByScopeNextException(final AnalyzedToken token) { + if (exceptionSet) { + for (final Element testException : exceptionList) { + if (testException.exceptionValidNext) { + if (testException.isMatched(token)) { + return true; + } + } + } + } + return false; + } + + /** + * Checks whether an exception for a previous token matches (in case the + * exception had scope == "previous"). + * + * @param token + * {@link AnalyzedToken} to check matching against. + * @return True if any of the exceptions matches. + */ + public final boolean isMatchedByPreviousException(final AnalyzedToken token) { + if (exceptionValidPrevious) { + for (final Element testException : previousExceptionList) { + if (!testException.exceptionValidNext) { + if (testException.isMatched(token)) { + return true; + } + } + } + } + return false; + } + + /** + * Checks whether an exception for a previous token matches all readings of a + * given token (in case the exception had scope == "previous"). + * + * @param prevToken + * {@link AnalyzedTokenReadings} to check matching against. + * @return true if any of the exceptions matches. 
+ */ + public final boolean isMatchedByPreviousException( + final AnalyzedTokenReadings prevToken) { + final int numReadings = prevToken.getReadingsLength(); + for (int i = 0; i < numReadings; i++) { + if (isMatchedByPreviousException(prevToken.getAnalyzedToken(i))) { + return true; + } + } + return false; + } + + /** + * Checks if the token is a SENT_START. + * + * @return True if the element starts the sentence and the element hasn't been + * set to have negated POS token. + * + */ + public final boolean isSentStart() { + return JLanguageTool.SENTENCE_START_TAGNAME.equals(posToken) + && !posNegation; + } + + @Override + public final String toString() { + final StringBuilder sb = new StringBuilder(); + if (negation) { + sb.append('!'); + } + sb.append(stringToken); + if (phraseName != null) { + sb.append(" {"); + sb.append(phraseName); + sb.append('}'); + } + if (posToken != null) { + sb.append('/'); + sb.append(posToken); + } + return sb.toString(); + } + + public final void setPosElement(final String posToken, final boolean regExp, + final boolean negation) { + this.posToken = posToken; + this.posNegation = negation; + posRegExp = regExp; + if (posRegExp) { + pPos = Pattern.compile(posToken); + } + } + + public final String getString() { + return stringToken; + } + + public final void setStringElement(final String token) { + this.stringToken = token; + testString = !StringTools.isEmpty(stringToken); + if (testString && stringRegExp) { + regToken = stringToken; + if (!caseSensitive) { + regToken = CASE_INSENSITIVE + stringToken; + } + if (!"\\0".equals(token)) { + p = Pattern.compile(regToken); + } + } + } + + /** + * Sets a POS-type exception for matching string tokens. + * + * @param posToken + * The part of the speech tag in the exception. + * @param regExp + * True if the POS is specified as a regular expression. + * @param negation + * True if the exception is negated. + * @param scopeNext + * True if the exception scope is next tokens. 
+ * @param scopePrevious + * True if the exception should match only a single previous token. + */ + public final void setPosException(final String posToken, + final boolean regExp, final boolean negation, final boolean scopeNext, + final boolean scopePrevious) { + final Element posException = new Element("", this.caseSensitive, false, + false); + posException.setPosElement(posToken, regExp, negation); + posException.exceptionValidNext = scopeNext; + setException(posException, scopePrevious); + } + + /** + * Sets a string-type exception for matching string tokens. + * + * @param token + * The string in the exception. + * @param regExp + * True if the string is specified as a regular expression. + * @param inflected + * True if the string is a base form (lemma). + * @param negation + * True if the exception is negated. + * @param scopeNext + * True if the exception scope is next tokens. + * @param scopePrevious + * True if the exception should match only a single previous token. + */ + public final void setStringException(final String token, + final boolean regExp, final boolean inflected, final boolean negation, + final boolean scopeNext, final boolean scopePrevious) { + final Element stringException = new Element(token, this.caseSensitive, + regExp, inflected); + stringException.setNegation(negation); + stringException.exceptionValidNext = scopeNext; + setException(stringException, scopePrevious); + } + + private void setException(final Element elem, final boolean scopePrevious) { + exceptionValidPrevious |= scopePrevious; + if (exceptionList == null && !scopePrevious) { + exceptionList = new ArrayList<Element>(); + } + if (previousExceptionList == null && scopePrevious) { + previousExceptionList = new ArrayList<Element>(); + } + if (scopePrevious) { + previousExceptionList.add(elem); + } else { + if (!exceptionSet) { + exceptionSet = true; + } + if (exceptionSet) { + exceptionList.add(elem); + } + } + } + + /** + * Tests if part of speech matches a given string. 
+ * + * @param token + * Token to test. + * @return true if matches + * + * Special value UNKNOWN_TAG matches null POS tags. + * + */ + private boolean isPosTokenMatched(final AnalyzedToken token) { + // if no POS set + // defaulting to true + if (posToken == null) { + return true; + } + if (token.getPOSTag() == null) { + if (posRegExp) { + if (mPos == null) { + mPos = pPos.matcher(UNKNOWN_TAG); + } else { + mPos.reset(UNKNOWN_TAG); + } + return mPos.matches(); + } + if (UNKNOWN_TAG.equals(posToken)) { + return true; + } + } + boolean match; + if (posRegExp) { + if (mPos == null) { + mPos = pPos.matcher(token.getPOSTag()); + } else { + mPos.reset(token.getPOSTag()); + } + match = mPos.matches(); + } else { + match = posToken.equals(token.getPOSTag()); + } + if (!match && UNKNOWN_TAG.equals(posToken)) { // these are helper tags, + // ignore them + match = JLanguageTool.SENTENCE_END_TAGNAME.equals(token.getPOSTag()) + || JLanguageTool.PARAGRAPH_END_TAGNAME.equals(token.getPOSTag()); + } + return match; + } + + /** + * Tests whether the string token element matches a given token. + * + * @param token + * {@link AnalyzedToken} to match against. + * @return True if matches. + */ + private boolean isStringTokenMatched(final AnalyzedToken token) { + final String testToken = getTestToken(token); + if (stringRegExp) { + if (m == null) { + m = p.matcher(testToken); + } else { + m.reset(testToken); + } + return m.matches(); + } + if (caseSensitive) { + return stringToken.equals(testToken); + } + return stringToken.equalsIgnoreCase(testToken); + } + + private String getTestToken(final AnalyzedToken token) { + // enables using words with lemmas and without lemmas + // in the same regexp with inflected="yes" + if (inflected) { + return token.getTokenInflected(); + } + return token.getToken(); + } + + /** + * Gets the exception scope length. + * + * @return Scope length. + */ + public final int getSkipNext() { + return skip; + } + + /** + * Sets the exception scope length. 
+   *
+   * @param i
+   *          Exception scope length; stored as-is in the skip field.
+   */
+  public final void setSkipNext(final int i) {
+    skip = i;
+  }
+
+  /**
+   * Checks if the element has an exception for a previous token.
+   *
+   * @return True if the element has a previous token matching exception.
+   */
+  public final boolean hasPreviousException() {
+    return exceptionValidPrevious;
+  }
+
+  /**
+   * Negates the meaning of match().
+   *
+   * @param negation
+   *          True if the meaning of match() is to be negated.
+   */
+  public final void setNegation(final boolean negation) {
+    this.negation = negation;
+  }
+
+  /**
+   * Returns the negation flag, see {@link #setNegation}.
+   *
+   * @return the value last passed to {@link #setNegation}.
+   * @since 0.9.3
+   */
+  public final boolean getNegation() {
+    return this.negation;
+  }
+
+  /**
+   * Tells whether a {@link Match} has been attached with {@link #setMatch}.
+   *
+   * @return true when this element refers to another token.
+   */
+  public final boolean isReferenceElement() {
+    return containsMatches;
+  }
+
+  /**
+   * Sets the reference to another token and marks the element as a reference
+   * element.
+   *
+   * @param match
+   *          Formatting object for the token reference.
+   */
+  public final void setMatch(final Match match) {
+    tokenReference = match;
+    containsMatches = true;
+  }
+
+  /**
+   * @return the token reference set with {@link #setMatch}.
+   */
+  public final Match getMatch() {
+    return tokenReference;
+  }
+
+  /**
+   * Prepare Element for matching by formatting its string token and POS (if the
+   * Element is supposed to refer to some other token).
+ * + * @param token + * the token specified as {@link AnalyzedTokenReadings} + * @param synth + * the language synthesizer ({@link Synthesizer}) + * + */ + public final void compile(final AnalyzedTokenReadings token, + final Synthesizer synth) throws IOException { + + m = null; + p = null; + tokenReference.setToken(token); + tokenReference.setSynthesizer(synth); + + if (StringTools.isEmpty(referenceString)) { + referenceString = stringToken; + } + if (tokenReference.setsPos()) { + final String posReference = tokenReference.getTargetPosTag(); + if (posReference != null) { + if (mPos != null) { + mPos = null; + } + setPosElement(posReference, tokenReference.posRegExp(), negation); + } + setStringElement(referenceString.replace("\\" + + tokenReference.getTokenRef(), "")); + inflected = true; + } else { + setStringElement(referenceString.replace("\\" + + tokenReference.getTokenRef(), tokenReference.toTokenString())); + } + } + + /** + * Sets the phrase the element is in. + * + * @param s + * ID of the phrase. + */ + public final void setPhraseName(final String s) { + phraseName = s; + } + + /** + * Checks if the Element is in any phrase. + * + * @return True if the Element is contained in the phrase. + */ + public final boolean isPartOfPhrase() { + return phraseName != null; + } + + /** + * Whether the element matches case sensitively. + * + * @since 0.9.3 + */ + public final boolean getCaseSensitive() { + return caseSensitive; + } + + /** + * Tests whether the element matches a regular expression. + * + * @since 0.9.6 + */ + public final boolean isRegularExpression() { + return stringRegExp; + } + + /** + * @return the POS of the Element + * @since 0.9.6 + */ + public final String getPOStag() { + return posToken; + } + + /** + * Tests whether the POS is negated. + * + * @return true if so. + */ + public final boolean getPOSNegation() { + return posNegation; + } + + /** + * Whether the token is inflected. + * + * @return True if so. 
+ */ + public final boolean isInflected() { + return inflected; + } + + /** + * Gets the phrase the element is in. + * + * @return String The name of the phrase. + */ + public final String getPhraseName() { + return phraseName; + } + + public final boolean isUnified() { + return unified; + } + + public final void setUnification(final Map<String, List<String>> uniFeatures) { + unificationFeatures = uniFeatures; + unified = true; + } + + /** + * Get unification features and types. + * @return A map from features to a list of types. + * @since 1.0.1 + */ + public final Map<String, List<String>> getUniFeatures() { + return unificationFeatures; + } + + public final void setUniNegation() { + uniNegation = true; + } + + public final boolean isUniNegated() { + return uniNegation; + } + + public final void setWhitespaceBefore(final boolean isWhite) { + whitespaceBefore = isWhite; + testWhitespace = true; + } + + public final void setExceptionSpaceBefore(final boolean isWhite) { + if (exceptionList != null) { + exceptionList.get(exceptionList.size()).setWhitespaceBefore(isWhite); + } + } + + public final boolean isWhitespaceBefore(final AnalyzedToken token) { + return whitespaceBefore == token.isWhitespaceBefore(); + } + + /** + * Since 1.0.0 + * @return A List of Exceptions. Used for testing. 
+ */ + public final List<Element> getExceptionList() { + return exceptionList; + } +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/FalseFriendRuleLoader.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/FalseFriendRuleLoader.java new file mode 100644 index 0000000..94c6515 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/FalseFriendRuleLoader.java @@ -0,0 +1,356 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.rules.patterns; + +import java.io.IOException; +import java.io.InputStream; +import java.text.MessageFormat; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.ResourceBundle; + +import javax.xml.parsers.ParserConfigurationException; +import javax.xml.parsers.SAXParser; +import javax.xml.parsers.SAXParserFactory; + +import org.xml.sax.Attributes; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.DefaultHandler; + +import de.danielnaber.languagetool.JLanguageTool; +import de.danielnaber.languagetool.Language; +import de.danielnaber.languagetool.rules.Category; +import de.danielnaber.languagetool.rules.IncorrectExample; +import de.danielnaber.languagetool.tools.StringTools; + +/** + * Loads {@link PatternRule}s from a false friends XML file. 
+ * + * @author Daniel Naber + */ +public class FalseFriendRuleLoader extends DefaultHandler { + + public FalseFriendRuleLoader() { + } + + public final List<PatternRule> getRules(final InputStream file, + final Language textLanguage, final Language motherTongue) + throws ParserConfigurationException, SAXException, IOException { + final FalseFriendRuleHandler handler = new FalseFriendRuleHandler( + textLanguage, motherTongue); + final SAXParserFactory factory = SAXParserFactory.newInstance(); + final SAXParser saxParser = factory.newSAXParser(); + saxParser.getXMLReader() + .setFeature( + "http://apache.org/xml/features/nonvalidating/load-external-dtd", + false); + saxParser.parse(file, handler); + final List<PatternRule> rules = handler.getRules(); + // Add suggestions to each rule: + final ResourceBundle messages = ResourceBundle.getBundle( + "de.danielnaber.languagetool.MessagesBundle", motherTongue.getLocale()); + for (final PatternRule rule : rules) { + final List<String> suggestionMap = handler.getSuggestionMap().get(rule.getId()); + if (suggestionMap != null) { + final MessageFormat msgFormat = new MessageFormat(messages + .getString("false_friend_suggestion")); + final Object[] msg = new Object[] { formatSuggestions(suggestionMap) }; + rule.setMessage(rule.getMessage() + " " + msgFormat.format(msg)); + } + } + return rules; + } + + private String formatSuggestions(final List<String> l) { + final StringBuilder sb = new StringBuilder(); + for (final Iterator<String> iter = l.iterator(); iter.hasNext();) { + final String s = iter.next(); + sb.append("<suggestion>"); + sb.append(s); + sb.append("</suggestion>"); + if (iter.hasNext()) { + sb.append(", "); + } + } + return sb.toString(); + } + + /** Testing only. 
*/ + public final void main(final String[] args) + throws ParserConfigurationException, SAXException, IOException { + final FalseFriendRuleLoader prg = new FalseFriendRuleLoader(); + List<PatternRule> l = prg.getRules(JLanguageTool.getDataBroker() + .getFromRulesDirAsStream("/false-friends.xml"), Language.ENGLISH, + Language.GERMAN); + System.out.println("Hints for German native speakers:"); + for (final PatternRule rule : l) { + System.out.println(rule); + } + System.out.println("======================================="); + System.out.println("Hints for English native speakers:"); + l = prg.getRules(JLanguageTool.getDataBroker() + .getFromRulesDirAsStream("/false-friends.xml"), + Language.GERMAN, Language.ENGLISH); + for (final PatternRule rule : l) { + System.out.println(rule); + } + } + +} + +class FalseFriendRuleHandler extends XMLRuleHandler { + + private final ResourceBundle messages; + private final MessageFormat formatter; + + private final Language textLanguage; + private final Language motherTongue; + + private boolean defaultOff; + + private Language language; + private Language translationLanguage; + private Language currentTranslationLanguage; + private List<StringBuilder> translations = new ArrayList<StringBuilder>(); + private StringBuilder translation = new StringBuilder(); + private final List<String> suggestions = new ArrayList<String>(); + // rule ID -> list of translations: + private final Map<String, List<String>> suggestionMap = new HashMap<String, List<String>>(); + + private boolean inTranslation; + + public FalseFriendRuleHandler(final Language textLanguage, + final Language motherTongue) { + messages = ResourceBundle.getBundle( + "de.danielnaber.languagetool.MessagesBundle", motherTongue.getLocale()); + formatter = new MessageFormat(""); + formatter.setLocale(motherTongue.getLocale()); + this.textLanguage = textLanguage; + this.motherTongue = motherTongue; + } + + public Map<String, List<String>> getSuggestionMap() { + return 
suggestionMap; + } + + // =========================================================== + // SAX DocumentHandler methods + // =========================================================== + + @Override + public void startElement(final String namespaceURI, final String lName, + final String qName, final Attributes attrs) throws SAXException { + if (qName.equals("rule")) { + translations = new ArrayList<StringBuilder>(); + id = attrs.getValue("id"); + if (!(inRuleGroup && defaultOff)) { + defaultOff = "off".equals(attrs.getValue("default")); + } + if (inRuleGroup && id == null) { + id = ruleGroupId; + } + correctExamples = new ArrayList<String>(); + incorrectExamples = new ArrayList<IncorrectExample>(); + } else if (qName.equals("pattern")) { + inPattern = true; + final String languageStr = attrs.getValue("lang"); + language = Language.getLanguageForShortName(languageStr); + if (language == null) { + throw new SAXException("Unknown language '" + languageStr + "'"); + } + } else if (qName.equals("exception")) { + inException = true; + exceptions = new StringBuilder(); + + if (attrs.getValue(NEGATE) != null) { + exceptionStringNegation = attrs.getValue(NEGATE).equals(YES); + } + if (attrs.getValue(SCOPE) != null) { + exceptionValidNext = attrs.getValue(SCOPE).equals("next"); + exceptionValidPrev = attrs.getValue(SCOPE).equals("previous"); + } + if (attrs.getValue(INFLECTED) != null) { + exceptionStringInflected = attrs.getValue(INFLECTED).equals(YES); + } + if (attrs.getValue(POSTAG) != null) { + exceptionPosToken = attrs.getValue(POSTAG); + if (attrs.getValue(POSTAG_REGEXP) != null) { + exceptionPosRegExp = attrs.getValue(POSTAG_REGEXP).equals(YES); + } + if (attrs.getValue(NEGATE_POS) != null) { + exceptionPosNegation = attrs.getValue(NEGATE_POS).equals(YES); + } + } + if (attrs.getValue(REGEXP) != null) { + exceptionStringRegExp = attrs.getValue(REGEXP).equals(YES); + } + + } else if (qName.equals(TOKEN)) { + setToken(attrs); + } else if (qName.equals("translation")) { 
+ inTranslation = true; + final String languageStr = attrs.getValue("lang"); + final Language tmpLang = Language.getLanguageForShortName(languageStr); + currentTranslationLanguage = tmpLang; + if (tmpLang == motherTongue) { + translationLanguage = tmpLang; + if (translationLanguage == null) { + throw new SAXException("Unknown language '" + languageStr + "'"); + } + } + } else if (qName.equals(EXAMPLE) + && attrs.getValue(TYPE).equals("correct")) { + inCorrectExample = true; + correctExample = new StringBuilder(); + } else if (qName.equals(EXAMPLE) + && attrs.getValue(TYPE).equals("incorrect")) { + inIncorrectExample = true; + incorrectExample = new StringBuilder(); + } else if (qName.equals("message")) { + inMessage = true; + message = new StringBuilder(); + } else if (qName.equals("rulegroup")) { + ruleGroupId = attrs.getValue("id"); + inRuleGroup = true; + defaultOff = "off".equals(attrs.getValue(DEFAULT)); + } + } + + @Override + public void endElement(final String namespaceURI, final String sName, + final String qName) { + if (qName.equals("rule")) { + if (language == textLanguage && translationLanguage != null + && translationLanguage == motherTongue && language != motherTongue + && !translations.isEmpty()) { + formatter.applyPattern(messages.getString("false_friend_hint")); + final Object[] messageArguments = { + elements.toString().replace('|', '/'), + messages.getString(textLanguage.getShortName()), + formatTranslations(translations), + messages.getString(motherTongue.getShortName()) }; + final String description = formatter.format(messageArguments); + final PatternRule rule = new PatternRule(id, language, elementList, + messages.getString("false_friend_desc") + " " + + elements.toString().replace('|', '/'), description, messages + .getString("false_friend")); + rule.setCorrectExamples(correctExamples); + rule.setIncorrectExamples(incorrectExamples); + rule.setCategory(new Category(messages + .getString("category_false_friend"))); + if (defaultOff) { + 
rule.setDefaultOff(); + } + rules.add(rule); + } + + if (elementList != null) { + elementList.clear(); + } + + } else if (qName.equals("exception")) { + inException = false; + if (!exceptionSet) { + tokenElement = new Element(elements.toString(), caseSensitive, + regExpression, tokenInflected); + exceptionSet = true; + } + tokenElement.setNegation(tokenNegated); + if (!StringTools.isEmpty(exceptions.toString())) { + tokenElement.setStringException(exceptions.toString(), + exceptionStringRegExp, exceptionStringInflected, + exceptionStringNegation, exceptionValidNext, exceptionValidPrev); + } + if (exceptionPosToken != null) { + tokenElement.setPosException(exceptionPosToken, exceptionPosRegExp, + exceptionPosNegation, exceptionValidNext, exceptionValidPrev); + exceptionPosToken = null; + } + } else if (qName.equals(TOKEN)) { + finalizeTokens(); + } else if (qName.equals("pattern")) { + inPattern = false; + } else if (qName.equals("translation")) { + if (currentTranslationLanguage == motherTongue) { + translations.add(translation); + } + if (currentTranslationLanguage == textLanguage) { + suggestions.add(translation.toString()); + } + translation = new StringBuilder(); + inTranslation = false; + currentTranslationLanguage = null; + } else if (qName.equals(EXAMPLE)) { + if (inCorrectExample) { + correctExamples.add(correctExample.toString()); + } else if (inIncorrectExample) { + incorrectExamples + .add(new IncorrectExample(incorrectExample.toString())); + } + inCorrectExample = false; + inIncorrectExample = false; + correctExample = new StringBuilder(); + incorrectExample = new StringBuilder(); + } else if (qName.equals("message")) { + inMessage = false; + } else if (qName.equals("rulegroup")) { + if (!suggestions.isEmpty()) { + final List<String> l = new ArrayList<String>(suggestions); + suggestionMap.put(id, l); + suggestions.clear(); + } + inRuleGroup = false; + } + } + + private String formatTranslations(final List<StringBuilder> translations) { + final 
StringBuilder sb = new StringBuilder(); + for (final Iterator<StringBuilder> iter = translations.iterator(); iter + .hasNext();) { + final StringBuilder trans = iter.next(); + sb.append('"'); + sb.append(trans.toString()); + sb.append('"'); + if (iter.hasNext()) { + sb.append(", "); + } + } + return sb.toString(); + } + + @Override + public void characters(final char[] buf, final int offset, final int len) { + final String s = new String(buf, offset, len); + if (inException) { + exceptions.append(s); + } else if (inToken && inPattern) { + elements.append(s); + } else if (inCorrectExample) { + correctExample.append(s); + } else if (inIncorrectExample) { + incorrectExample.append(s); + } else if (inTranslation) { + translation.append(s); + } + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/Match.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/Match.java new file mode 100644 index 0000000..0519f2c --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/Match.java @@ -0,0 +1,551 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.rules.patterns; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.TreeSet; +import java.util.regex.Pattern; + +import de.danielnaber.languagetool.AnalyzedToken; +import de.danielnaber.languagetool.AnalyzedTokenReadings; +import de.danielnaber.languagetool.JLanguageTool; +import de.danielnaber.languagetool.synthesis.Synthesizer; +import de.danielnaber.languagetool.tools.StringTools; + +/** + * Reference to a matched token in a pattern, can be formatted and used for + * matching & suggestions. + * + * @author Marcin Miłkowski + */ +public class Match { + + /** Possible string case conversions. **/ + public enum CaseConversion { + NONE, STARTLOWER, STARTUPPER, ALLLOWER, ALLUPPER; + + /** + * Converts string to the constant enum. + * + * @param str + * String value to be converted. + * @return CaseConversion enum. + */ + public static CaseConversion toCase(final String str) { + try { + return valueOf(str); + } catch (final Exception ex) { + return NONE; + } + } + } + + public enum IncludeRange { + NONE, FOLLOWING, ALL; + + /** + * Converts string to the constant enum. + * + * @param str + * String value to be converted. + * @return IncludeRange enum. 
+ */ + public static IncludeRange toRange(final String str) { + try { + return valueOf(str); + } catch (final Exception ex) { + return NONE; + } + } + } + + private final String posTag; + private boolean postagRegexp; + private final String regexReplace; + private final String posTagReplace; + private final CaseConversion caseConversionType; + + private final IncludeRange includeSkipped; + private String skippedTokens; + + /** + * True if this match element formats a statically defined lemma which is + * enclosed by the element, e.g., <tt><match...>word</word></tt>. + */ + private boolean staticLemma; + + /** + * True if this match element is used for formatting POS token. + */ + private final boolean setPos; + + private AnalyzedTokenReadings formattedToken; + private AnalyzedTokenReadings matchedToken; + + private int tokenRef; + + /** Word form generator for POS tags. **/ + private Synthesizer synthesizer; + + /** Pattern used to define parts of the matched token. **/ + private Pattern pRegexMatch; + + /** Pattern used to define parts of the matched POS token. **/ + private Pattern pPosRegexMatch; + + /** + * True when the match is not in the suggestion. + */ + private boolean inMessageOnly; + + public Match(final String posTag, final String posTagReplace, + final boolean postagRegexp, final String regexMatch, + final String regexReplace, final CaseConversion caseConversionType, + final boolean setPOS, + final IncludeRange includeSkipped) { + this.posTag = posTag; + this.postagRegexp = postagRegexp; + this.caseConversionType = caseConversionType; + + if (regexMatch != null) { + pRegexMatch = Pattern.compile(regexMatch); + } + if (postagRegexp && posTag != null) { + pPosRegexMatch = Pattern.compile(posTag); + } + + this.regexReplace = regexReplace; + this.posTagReplace = posTagReplace; + this.setPos = setPOS; + this.includeSkipped = includeSkipped; + } + + /** + * Sets the token that will be formatted or otherwise used in the class. 
+ */ + public final void setToken(final AnalyzedTokenReadings token) { + if (staticLemma) { + matchedToken = token; + } else { + formattedToken = token; + } + } + + /** + * Sets the token to be formatted etc. and includes the support for + * including the skipped tokens. + * @param tokens Array of tokens + * @param index Index of the token to be formatted + * @param next Position of the next token (the skipped tokens + * are the ones between the tokens[index] and tokens[next] + */ + public final void setToken(final AnalyzedTokenReadings[] tokens, final int index, final int next) { + setToken(tokens[index]); + if (next > 1 && includeSkipped != IncludeRange.NONE) { + final StringBuilder sb = new StringBuilder(); + if (includeSkipped == IncludeRange.FOLLOWING) { + formattedToken = null; + } + for (int k = index + 1; k < index + next; k++) { + if (k > index + 1 && + tokens[k].isWhitespaceBefore()) { + sb.append(' '); + } + sb.append(tokens[k].getToken()); + } + skippedTokens = sb.toString(); + } else { + skippedTokens = ""; + } + } + + /** + private String[] addSkipped(final String[] formattedString) { + if (skippedTokens != null && !"".equals(skippedTokens)) { + String[] finalStrings = new String[formattedString.length]; + for (int i = 1; i <= formattedString.length; i++) + } + } + + **/ + + /** + * Checks if the Match element is used for setting the part of speech Element. + * + * @return True if Match sets POS. + */ + public final boolean setsPos() { + return setPos; + } + + /** + * Checks if the Match element uses regexp-based form of the POS tag. + * + * @return True if regexp is used in POS. + */ + public final boolean posRegExp() { + return postagRegexp; + } + + /** + * Sets a base form (lemma) that will be formatted, or synthesized, using the + * specified POS regular expressions. + * + * @param lemmaString String that specifies the base form. 
+   */
+  public final void setLemmaString(final String lemmaString) {
+    // an empty lemma leaves the match untouched
+    if (!StringTools.isEmpty(lemmaString)) {
+      formattedToken = new AnalyzedTokenReadings(new AnalyzedToken(lemmaString,
+          posTag, lemmaString), 0);
+      staticLemma = true;
+      // a static lemma always treats the POS tag as a regular expression
+      postagRegexp = true;
+      if (posTag != null) {
+        pPosRegexMatch = Pattern.compile(posTag);
+      }
+    }
+  }
+
+  /**
+   * Sets a synthesizer used for grammatical synthesis of forms based on
+   * formatted POS values.
+   *
+   * @param synth Synthesizer class.
+   */
+  public final void setSynthesizer(final Synthesizer synth) {
+    synthesizer = synth;
+  }
+
+  /**
+   * Gets all strings formatted using the match element.
+   *
+   * @return array of strings
+   * @throws IOException
+   *           in case of synthesizer-related disk problems.
+   */
+  public final String[] toFinalString() throws IOException {
+    String[] formattedString = new String[1];
+    if (formattedToken != null) {
+      final int readingCount = formattedToken.getReadingsLength();
+      // start from the surface form, apply the regexp replacement and the
+      // case conversion
+      formattedString[0] = formattedToken.getToken();
+      if (pRegexMatch != null) {
+        formattedString[0] = pRegexMatch.matcher(formattedString[0])
+            .replaceAll(regexReplace);
+      }
+      formattedString[0] = convertCase(formattedString[0]);
+      if (posTag != null) {
+        if (synthesizer == null) {
+          // without a synthesizer the unmodified token is used
+          formattedString[0] = formattedToken.getToken();
+        } else if (postagRegexp) {
+          final TreeSet<String> wordForms = new TreeSet<String>();
+          // oneForm reflects the last reading without a lemma: true if that
+          // reading has no POS tag or only a sentence/paragraph helper tag
+          boolean oneForm = false;
+          for (int k = 0; k < readingCount; k++) {
+            if (formattedToken.getAnalyzedToken(k).getLemma() == null) {
+              final String posUnique = formattedToken.getAnalyzedToken(k)
+                  .getPOSTag();
+              if (posUnique == null) {
+                wordForms.add(formattedToken.getToken());
+                oneForm = true;
+              } else {
+                if (JLanguageTool.SENTENCE_START_TAGNAME.equals(posUnique)
+                    || JLanguageTool.SENTENCE_END_TAGNAME.equals(posUnique)
+                    || JLanguageTool.PARAGRAPH_END_TAGNAME.equals(posUnique)) {
+                  if (!oneForm) {
+                    wordForms.add(formattedToken.getToken());
+                  }
+                  oneForm = true;
+                } else {
+                  oneForm = false;
+                }
+              }
+            }
+          }
+          final String targetPosTag = getTargetPosTag();
+          if (!oneForm) {
+            // synthesize the forms of every reading for the target POS tag
+            for (int i = 0; i < readingCount; i++) {
+              final String[] possibleWordForms = synthesizer.synthesize(
+                  formattedToken.getAnalyzedToken(i), targetPosTag, true);
+              if (possibleWordForms != null) {
+                wordForms.addAll(Arrays.asList(possibleWordForms));
+              }
+            }
+          }
+          if (wordForms.isEmpty()) {
+            // nothing could be produced: return the token in parentheses
+            formattedString[0] = "(" + formattedToken.getToken() + ")";
+          } else {
+            formattedString = wordForms.toArray(new String[wordForms.size()]);
+          }
+        } else {
+          final TreeSet<String> wordForms = new TreeSet<String>();
+          for (int i = 0; i < readingCount; i++) {
+            final String[] possibleWordForms = synthesizer.synthesize(
+                formattedToken.getAnalyzedToken(i), posTag);
+            if (possibleWordForms != null) {
+              wordForms.addAll(Arrays.asList(possibleWordForms));
+            }
+          }
+          // NOTE(review): unlike the regexp branch, an empty result yields an
+          // empty array here - confirm callers cope with that
+          formattedString = wordForms.toArray(new String[wordForms.size()]);
+        }
+      }
+    }
+    if (includeSkipped != IncludeRange.NONE
+        && skippedTokens != null && !"".equals(skippedTokens)) {
+      // append the skipped tokens to every form; missing forms count as ""
+      final String[] helper = new String[formattedString.length];
+      for (int i = 0; i < formattedString.length; i++) {
+        if (formattedString[i] == null) {
+          formattedString[i] = "";
+        }
+        helper[i] = formattedString[i] + skippedTokens;
+      }
+      formattedString = helper;
+    }
+    return formattedString;
+  }
+
+  /**
+   * Format POS tag using parameters already defined in the class.
+   *
+   * @return Formatted POS tag as String.
+   */
+  // FIXME: gets only the first POS tag that matches, this can be wrong
+  // on the other hand, many POS tags = too many suggestions?
+ public final String getTargetPosTag() { + String targetPosTag = posTag; + final List<String> posTags = new ArrayList<String>(); + if (staticLemma) { + final int numRead = matchedToken.getReadingsLength(); + for (int i = 0; i < numRead; i++) { + final String tst = matchedToken.getAnalyzedToken(i).getPOSTag(); + if (tst != null && pPosRegexMatch.matcher(tst).matches()) { + targetPosTag = matchedToken.getAnalyzedToken(i).getPOSTag(); + posTags.add(targetPosTag); + } + } + if (pPosRegexMatch != null && posTagReplace != null) { + targetPosTag = pPosRegexMatch.matcher(targetPosTag).replaceAll( + posTagReplace); + } + } else { + final int numRead = formattedToken.getReadingsLength(); + for (int i = 0; i < numRead; i++) { + final String tst = formattedToken.getAnalyzedToken(i).getPOSTag(); + if (tst != null && pPosRegexMatch.matcher(tst).matches()) { + targetPosTag = formattedToken.getAnalyzedToken(i).getPOSTag(); + posTags.add(targetPosTag); + } + } + if (pPosRegexMatch != null && posTagReplace != null) { + if (posTags.isEmpty()) { + posTags.add(targetPosTag); + } + final StringBuilder sb = new StringBuilder(); + final int posTagLen = posTags.size(); + int l = 0; + for (String lposTag : posTags) { + l++; + lposTag = pPosRegexMatch.matcher(lposTag).replaceAll(posTagReplace); + if (setPos) { + lposTag = synthesizer.getPosTagCorrection(lposTag); + } + sb.append(lposTag); + if (l < posTagLen) { + sb.append('|'); + } + } + targetPosTag = sb.toString(); + } + } + return targetPosTag; + } + + /** + * Method for getting the formatted match as a single string. In case of + * multiple matches, it joins them using a regular expression operator "|". + * + * @return Formatted string of the matched token. 
+ */ + public final String toTokenString() throws IOException { + final StringBuilder output = new StringBuilder(); + final String[] stringToFormat = toFinalString(); + for (int i = 0; i < stringToFormat.length; i++) { + output.append(stringToFormat[i]); + if (i + 1 < stringToFormat.length) { + output.append('|'); + } + } + return output.toString(); + } + + /** + * Sets the token number referenced by the match. + * + * @param i Token number. + */ + public final void setTokenRef(final int i) { + tokenRef = i; + } + + /** + * Gets the token number referenced by the match. + * + * @return int - token number. + */ + public final int getTokenRef() { + return tokenRef; + } + + /** + * Converts case of the string token according to match element attributes. + * + * @param s Token to be converted. + * @return Converted string. + */ + private String convertCase(final String s) { + if (StringTools.isEmpty(s)) { + return s; + } + String token = s; + switch (caseConversionType) { + case NONE: + break; + case STARTLOWER: + token = token.substring(0, 1).toLowerCase() + token.substring(1); + break; + case STARTUPPER: + token = token.substring(0, 1).toUpperCase() + token.substring(1); + break; + case ALLUPPER: + token = token.toUpperCase(); + break; + case ALLLOWER: + token = token.toLowerCase(); + break; + default: + break; + } + return token; + } + + /** + * Used to let LT know that it should change the case of the match. + * + * @return true if match converts the case of the token. 
   */
  public final boolean convertsCase() {
    return !caseConversionType.equals(CaseConversion.NONE);
  }

  /**
   * Builds the readings of the formatted token with this match's token-level
   * settings applied: regular-expression replacement of the token text, case
   * conversion and, when a POS tag is configured, re-tagging of the readings.
   * Note that with a static lemma this method replaces the
   * <code>formattedToken</code> field as a side effect.
   *
   * @return the newly built readings, or <code>formattedToken</code> itself
   *         (possibly <code>null</code>) when no reading was produced.
   */
  public final AnalyzedTokenReadings filterReadings() {
    final ArrayList<AnalyzedToken> l = new ArrayList<AnalyzedToken>();
    if (formattedToken != null) {
      if (staticLemma) {
        // Rebuild the token: surface form and position come from the matched
        // token, the lemma is the current formatted token's text.
        formattedToken = new AnalyzedTokenReadings(new AnalyzedToken(
            matchedToken.getToken(), posTag, formattedToken.getToken()),
            matchedToken.getStartPos());
        formattedToken.setWhitespaceBefore(matchedToken.isWhitespaceBefore());
      }
      String token = formattedToken.getToken();
      if (pRegexMatch != null) {
        token = pRegexMatch.matcher(token).replaceAll(regexReplace);
      }
      token = convertCase(token);
      if (posTag != null) {
        final int numRead = formattedToken.getReadingsLength();
        if (postagRegexp) {
          String targetPosTag = posTag;
          // Keep one reading per POS tag that matches the POS pattern,
          // optionally rewriting the tag.
          for (int i = 0; i < numRead; i++) {
            final String tst = formattedToken.getAnalyzedToken(i).getPOSTag();
            if (tst != null && pPosRegexMatch.matcher(tst).matches()) {
              targetPosTag = formattedToken.getAnalyzedToken(i).getPOSTag();
              if (posTagReplace != null) {
                targetPosTag = pPosRegexMatch.matcher(targetPosTag).replaceAll(
                    posTagReplace);
              }
              l
                  .add(new AnalyzedToken(token, targetPosTag, formattedToken
                      .getAnalyzedToken(i).getLemma()));
              l.get(l.size() - 1).setWhitespaceBefore(formattedToken.isWhitespaceBefore());
            }
          }
          // Nothing matched the pattern: fall back to plain re-tagging.
          if (l.isEmpty()) {
            for (final AnalyzedToken anaTok : getNewToken(numRead, token)) {
              l.add(anaTok);
            }
          }
        } else {
          for (final AnalyzedToken anaTok : getNewToken(numRead, token)) {
            l.add(anaTok);
          }
        }
        // Carry the sentence-end / paragraph-end markers over as extra readings.
        if (formattedToken.isSentEnd()) {
          l.add(new AnalyzedToken(formattedToken.getToken(),
              JLanguageTool.SENTENCE_END_TAGNAME,
              formattedToken.getAnalyzedToken(0).getLemma()));
        }
        if (formattedToken.isParaEnd()) {
          l.add(new AnalyzedToken(formattedToken.getToken(),
              JLanguageTool.PARAGRAPH_END_TAGNAME,
              formattedToken.getAnalyzedToken(0).getLemma()));
        }
      }
    }
    if (l.isEmpty()) {
      return formattedToken;
    }
    return new AnalyzedTokenReadings(l.toArray(new AnalyzedToken[l.size()]), formattedToken.getStartPos());
  }

  /**
   * Creates one reading tagged with <code>posTag</code> for every reading of
   * the formatted token that has a non-null POS tag.
   *
   * @param numRead number of readings of the formatted token to inspect.
   * @param token token text to use for the new readings.
   * @return the new readings; empty if no inspected reading has a POS tag.
   */
  private AnalyzedToken[] getNewToken(final int numRead, final String token) {
    final List<AnalyzedToken> list = new ArrayList<AnalyzedToken>();
    // Declared outside the loop on purpose or by accident: once set, the
    // lemma is reused for all later readings until another one replaces it.
    String lemma = "";
    for (int j = 0; j < numRead; j++) {
      if (formattedToken.getAnalyzedToken(j).getPOSTag() != null) {
        if (formattedToken.getAnalyzedToken(j).getPOSTag().equals(posTag)
            && (formattedToken.getAnalyzedToken(j).getLemma() != null)) {
          lemma = formattedToken.getAnalyzedToken(j).getLemma();
        }
        // No lemma found so far: use the lemma of the first reading.
        if (StringTools.isEmpty(lemma)) {
          lemma = formattedToken.getAnalyzedToken(0).getLemma();
        }
        list.add(new AnalyzedToken(token, posTag, lemma));
        list.get(list.size() - 1).
            setWhitespaceBefore(formattedToken.isWhitespaceBefore());
      }
    }
    return list.toArray(new AnalyzedToken[list.size()]);
  }

  /**
   * @param inMessageOnly
   *          the inMessageOnly to set
   */
  public void setInMessageOnly(final boolean inMessageOnly) {
    this.inMessageOnly = inMessageOnly;
  }

  /**
   * @return the inMessageOnly
   */
  public boolean isInMessageOnly() {
    return inMessageOnly;
  }

}
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/PatternRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/PatternRule.java
new file mode 100644
index 0000000..843ef98
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/PatternRule.java
@@ -0,0 +1,652 @@
/* LanguageTool, a natural language style checker
 * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
+ * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.rules.patterns; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import de.danielnaber.languagetool.AnalyzedSentence; +import de.danielnaber.languagetool.AnalyzedTokenReadings; +import de.danielnaber.languagetool.Language; +import de.danielnaber.languagetool.rules.IncorrectExample; +import de.danielnaber.languagetool.rules.RuleMatch; +import de.danielnaber.languagetool.tools.StringTools; + +/** + * A Rule that describes a language error as a simple pattern of words or of + * part-of-speech tags. + * + * @author Daniel Naber + */ +public class PatternRule extends AbstractPatternRule { + + private static final String SUGG_TAG = "<suggestion>"; + private static final String END_SUGG_TAG = "</suggestion>"; + + private String subId; // because there can be more than one rule in a rule + // group + + private String message; + private String shortMessage; + + /** Formatted suggestion elements. **/ + private List<Match> suggestionMatches; + + /** + * A list of elements as they appear in XML file (phrases count as single + * tokens in case of matches or skipping). + */ + private List<Integer> elementNo; + + /** + * This property is used for short-circuiting evaluation of the elementNo list + * order. + */ + private boolean useList; + + /** + * Marks whether the rule is a member of a disjunctive set (in case of OR + * operation on phraserefs). 
+ **/ + private boolean isMemberOfDisjunctiveSet; + + /** + * @param id + * Id of the Rule + * @param language + * Language of the Rule + * @param elements + * Element (token) list + * @param description + * Description to be shown (name) + * @param message + * Message to be displayed to the user + */ + + public PatternRule(final String id, final Language language, + final List<Element> elements, final String description, + final String message, final String shortMessage) { + super(id, description, language, elements, false); + if (id == null) { + throw new NullPointerException("id cannot be null"); + } + if (language == null) { + throw new NullPointerException("language cannot be null"); + } + if (elements == null) { + throw new NullPointerException("elements cannot be null"); + } + if (description == null) { + throw new NullPointerException("description cannot be null"); + } + + this.message = message; + this.shortMessage = shortMessage; + this.elementNo = new ArrayList<Integer>(); + String prevName = ""; + String curName = ""; + int cnt = 0; + int loopCnt = 0; + for (final Element e : patternElements) { + if (e.isPartOfPhrase()) { + curName = e.getPhraseName(); + if (prevName.equals(curName) || StringTools.isEmpty(prevName)) { + cnt++; + useList = true; + } else { + elementNo.add(cnt); + prevName = ""; + curName = ""; + cnt = 0; + } + prevName = curName; + loopCnt++; + if (loopCnt == patternElements.size() && !StringTools.isEmpty(prevName)) { + elementNo.add(cnt); + } + } else { + if (cnt > 0) { + elementNo.add(cnt); + } + elementNo.add(1); + loopCnt++; + } + } + } + + public PatternRule(final String id, final Language language, + final List<Element> elements, final String description, + final String message, final String shortMessage, final boolean isMember) { + this(id, language, elements, description, message, shortMessage); + this.isMemberOfDisjunctiveSet = isMember; + } + + public final String getSubId() { + return subId; + } + + public final void 
setSubId(final String subId) { + this.subId = subId; + } + + public final String getMessage() { + return message; + } + + /** + * Used for testing rules: only one of the set can match. + * + * @return Whether the rule can non-match (as a member of disjunctive set of + * rules generated by phraseref in includephrases element). + */ + public final boolean isWithComplexPhrase() { + return isMemberOfDisjunctiveSet; + } + + /** Reset complex status - used for testing. **/ + public final void notComplexPhrase() { + isMemberOfDisjunctiveSet = false; + } + + /** + * Return the pattern as a string. + * + * @since 0.9.2 + */ + public final String toPatternString() { + final List<String> strList = new ArrayList<String>(); + for (Element patternElement : patternElements) { + strList.add(patternElement.toString()); + } + return StringTools.listToString(strList, ", "); + } + + /** + * Return the pattern as an XML string. FIXME: this is not complete, information might be lost! + * + * @since 0.9.3 + */ + public final String toXML() { + final StringBuilder sb = new StringBuilder(); + sb.append("<rule id=\""); + sb.append(StringTools.escapeXML(getId())); + sb.append("\" name=\""); + sb.append(StringTools.escapeXML(getDescription())); + sb.append("\">\n"); + sb.append("<pattern mark_from=\""); + sb.append(startPositionCorrection); + sb.append("\" mark_to=\""); + sb.append(endPositionCorrection); + sb.append('"'); + // for now, case sensitivity is per pattern, not per element, + // so just use the setting of the first element: + if (!patternElements.isEmpty() && patternElements.get(0).getCaseSensitive()) { + sb.append(" case_sensitive=\"yes\""); + } + sb.append(">\n"); + for (Element patternElement : patternElements) { + sb.append("<token"); + if (patternElement.getNegation()) { + sb.append(" negate=\"yes\""); + } + if (patternElement.isRegularExpression()) { + sb.append(" regexp=\"yes\""); + } + if (patternElement.getPOStag() != null) { + sb.append(" postag=\""); + 
sb.append(patternElement.getPOStag()); + sb.append('"'); + } + if (patternElement.getPOSNegation()) { + sb.append(" negate_pos=\"yes\""); + } + if (patternElement.isInflected()) { + sb.append(" inflected=\"yes\""); + } + sb.append('>'); + if (patternElement.getString() != null) { + sb.append(StringTools.escapeXML(patternElement.getString())); + } else { + // TODO + } + sb.append("</token>\n"); + } + sb.append("</pattern>\n"); + sb.append("<message>"); + sb.append(StringTools.escapeXML(message)); + sb.append("</message>\n"); + if (getIncorrectExamples() != null) { + for (IncorrectExample example : getIncorrectExamples()) { + sb.append("<example type=\"incorrect\">"); + sb.append(StringTools.escapeXML(example.getExample())); + sb.append("</example>\n"); + } + } + if (getCorrectExamples() != null) { + for (String example : getCorrectExamples()) { + sb.append("<example type=\"correct\">"); + sb.append(StringTools.escapeXML(example)); + sb.append("</example>\n"); + } + } + sb.append("</rule>"); + return sb.toString(); + } + + public final void setMessage(final String message) { + this.message = message; + } + + @Override + public final RuleMatch[] match(final AnalyzedSentence text) + throws IOException { + final List<RuleMatch> ruleMatches = new ArrayList<RuleMatch>(); + final AnalyzedTokenReadings[] tokens = text.getTokensWithoutWhitespace(); + final int[] tokenPositions = new int[tokens.length + 1]; + final int patternSize = patternElements.size(); + final int limit = Math.max(0, tokens.length - patternSize + 1); + Element elem = null; + int i = 0; + while (i < limit && !(sentStart && i > 0)) { + boolean allElementsMatch = false; + int firstMatchToken = -1; + int lastMatchToken = -1; + int matchingTokens = 0; + int prevSkipNext = 0; + // this variable keeps the total number + // of tokens skipped + int skipShiftTotal = 0; + if (testUnification) { + unifier.reset(); + } + for (int k = 0; k < patternSize; k++) { + final Element prevElement = elem; + elem = 
patternElements.get(k); + setupRef(firstMatchToken, elem, tokens); + final int nextPos = i + k + skipShiftTotal; + prevMatched = false; + if (prevSkipNext + nextPos >= tokens.length || prevSkipNext < 0) { // SENT_END? + prevSkipNext = tokens.length - (nextPos + 1); + } + final int maxTok = Math.min(nextPos + prevSkipNext, tokens.length - (patternSize - k)); + for (int m = nextPos; m <= maxTok; m++) { + allElementsMatch = testAllReadings(tokens, elem, prevElement, m, + firstMatchToken, prevSkipNext); + if (allElementsMatch) { + lastMatchToken = m; + final int skipShift = lastMatchToken - nextPos; + tokenPositions[matchingTokens] = skipShift + 1; + prevSkipNext = translateElementNo(elem.getSkipNext()); + matchingTokens++; + skipShiftTotal += skipShift; + if (firstMatchToken == -1) { + firstMatchToken = lastMatchToken; + } + break; + } + } + if (!allElementsMatch) { + break; + } + } + + if (allElementsMatch && matchingTokens == patternSize) { + final RuleMatch rM = createRuleMatch(tokenPositions, tokens, + firstMatchToken, lastMatchToken, matchingTokens); + if (rM != null) { + ruleMatches.add(rM); + } + } + i++; + } + return ruleMatches.toArray(new RuleMatch[ruleMatches.size()]); + } + + private RuleMatch createRuleMatch(final int[] tokenPositions, + final AnalyzedTokenReadings[] tokens, final int firstMatchToken, + final int lastMatchToken, final int matchingTokens) throws IOException { + final String errMessage = formatMatches(tokens, tokenPositions, + firstMatchToken, message); + int correctedStPos = 0; + if (startPositionCorrection > 0) { + for (int l = 0; l <= startPositionCorrection; l++) { + correctedStPos += tokenPositions[l]; + } + correctedStPos--; + } + int correctedEndPos = 0; + if (endPositionCorrection < 0) { + int l = 0; + while (l > endPositionCorrection) { + correctedEndPos -= tokenPositions[matchingTokens + l - 1]; + l--; + } + } + AnalyzedTokenReadings firstMatchTokenObj = tokens[firstMatchToken + + correctedStPos]; + boolean startsWithUppercase = 
StringTools + .startsWithUppercase(firstMatchTokenObj.getToken()) + && !matchConvertsCase(); + + if (firstMatchTokenObj.isSentStart() + && tokens.length > firstMatchToken + correctedStPos + 1) { + // make uppercasing work also at sentence start: + firstMatchTokenObj = tokens[firstMatchToken + correctedStPos + 1]; + startsWithUppercase = StringTools.startsWithUppercase(firstMatchTokenObj + .getToken()); + } + int fromPos = tokens[firstMatchToken + correctedStPos].getStartPos(); + // FIXME: this is fishy, assumes that comma should always come before + // whitespace + if (errMessage.contains(SUGG_TAG + ",") + && firstMatchToken + correctedStPos >= 1) { + fromPos = tokens[firstMatchToken + correctedStPos - 1].getStartPos() + + tokens[firstMatchToken + correctedStPos - 1].getToken().length(); + } + + final int toPos = tokens[lastMatchToken + correctedEndPos].getStartPos() + + tokens[lastMatchToken + correctedEndPos].getToken().length(); + if (fromPos < toPos) { // this can happen with some skip="-1" when the last + // token is not matched + return new RuleMatch(this, fromPos, toPos, + errMessage, shortMessage, startsWithUppercase); + } // failed to create any rule match... + return null; + } + + /** + * Checks if the suggestion starts with a match that is supposed to convert + * case. If it does, stop the default conversion to uppercase. + * + * @return true, if the match converts the case of the token. 
+ */ + private boolean matchConvertsCase() { + if (suggestionMatches != null && !suggestionMatches.isEmpty()) { + final int sugStart = message.indexOf(SUGG_TAG) + SUGG_TAG.length(); + for (Match sMatch : suggestionMatches) { + if (!sMatch.isInMessageOnly() && sMatch.convertsCase() + && message.charAt(sugStart) == '\\') { + return true; + } + } + } + return false; + } + + public final void addSuggestionMatch(final Match m) { + if (suggestionMatches == null) { + suggestionMatches = new ArrayList<Match>(); + } + suggestionMatches.add(m); + } + + /** + * Gets the index of the element indexed by i, adding any offsets because of + * the phrases in the rule. + * + * @param i + * Current element index. + * @return int Index translated into XML element no. + */ + private int translateElementNo(final int i) { + if (!useList || i < 0) { + return i; + } + int j = 0; + for (int k = 0; k < i; k++) { + j += elementNo.get(k); + } + return j; + } + + /** + * Returns true when the token in the rule references a phrase composed of + * many tokens. + * + * @param i + * The index of the token. + * @return true if the phrase is under the index, false otherwise. + **/ + private int phraseLen(final int i) { + if (!useList || i > (elementNo.size() - 1)) { + return 1; + } + return elementNo.get(i); + } + + /** + * Creates a Cartesian product of the arrays stored in the input array. + * + * @param input + * Array of string arrays to combine. + * @param output + * Work array of strings. + * @param r + * Starting parameter (use 0 to get all combinations). + * @param lang + * Text language for adding spaces in some languages. + * @return Combined array of @String. 
+ */ + private static String[] combineLists(final String[][] input, + final String[] output, final int r, final Language lang) { + final List<String> outputList = new ArrayList<String>(); + if (r == input.length) { + final StringBuilder sb = new StringBuilder(); + for (int k = 0; k < output.length; k++) { + sb.append(output[k]); + if (k < output.length - 1) { + sb.append(StringTools.addSpace(output[k + 1], lang)); + } + } + outputList.add(sb.toString()); + } else { + for (int c = 0; c < input[r].length; c++) { + output[r] = input[r][c]; + final String[] sList = combineLists(input, output, r + 1, lang); + outputList.addAll(Arrays.asList(sList)); + } + } + return outputList.toArray(new String[outputList.size()]); + } + + /** + * Concatenates the matches, and takes care of phrases (including inflection + * using synthesis). + * + * @param start + * Position of the element as referenced by match element in the + * rule. + * @param index + * The index of the element found in the matching sentence. + * @param tokenIndex + * The position of the token in the AnalyzedTokenReadings array. + * @param tokens + * Array of @AnalyzedTokenReadings + * @return @String[] Array of concatenated strings + * @throws IOException + * in case disk operations (used in synthesizer) go wrong. 
+ */ + private String[] concatMatches(final int start, final int index, + final int tokenIndex, final AnalyzedTokenReadings[] tokens, + final int nextTokenPos) + throws IOException { + String[] finalMatch = null; + if (suggestionMatches.get(start) != null) { + final int len = phraseLen(index); + if (len == 1) { + final int skippedTokens = nextTokenPos - tokenIndex; + suggestionMatches.get(start).setToken(tokens, tokenIndex - 1, skippedTokens); + suggestionMatches.get(start).setSynthesizer(language.getSynthesizer()); + finalMatch = suggestionMatches.get(start).toFinalString(); + } else { + final List<String[]> matchList = new ArrayList<String[]>(); + for (int i = 0; i < len; i++) { + final int skippedTokens = nextTokenPos - (tokenIndex + i); + suggestionMatches.get(start).setToken(tokens, tokenIndex - 1 + i, skippedTokens); + suggestionMatches.get(start) + .setSynthesizer(language.getSynthesizer()); + matchList.add(suggestionMatches.get(start).toFinalString()); + } + return combineLists(matchList.toArray(new String[matchList.size()][]), + new String[matchList.size()], 0, language); + } + } + return finalMatch; + } + + /** + * Replace back references generated with <match> and \\1 in message + * using Match class, and take care of skipping. * + * + * @param tokenReadings + * Array of AnalyzedTokenReadings that were matched against the + * pattern + * @param positions + * Array of relative positions of matched tokens + * @param firstMatchTok + * Position of the first matched token + * @param errorMsg + * String containing suggestion markup + * @return String Formatted message. 
+ * @throws IOException + * + **/ + private String formatMatches(final AnalyzedTokenReadings[] tokenReadings, + final int[] positions, final int firstMatchTok, final String errorMsg) + throws IOException { + String errorMessage = errorMsg; + int matchCounter = 0; + final int[] numbersToMatches = new int[errorMsg.length()]; + boolean newWay = false; + int errLen = errorMessage.length(); + int errMarker = errorMessage.indexOf('\\'); + boolean numberFollows = false; + if (errMarker > 0 && errMarker < errLen - 1) { + numberFollows = StringTools.isPositiveNumber(errorMessage + .charAt(errMarker + 1)); + } + while (errMarker > 0 && numberFollows) { + final int ind = errorMessage.indexOf('\\'); + if (ind > 0 && StringTools.isPositiveNumber(errorMessage.charAt(ind + 1))) { + int numLen = 1; + while (ind + numLen < errorMessage.length() + && StringTools.isPositiveNumber(errorMessage.charAt(ind + numLen))) { + numLen++; + } + final int j = Integer.parseInt(errorMessage.substring(ind + 1, ind + + numLen)) - 1; + int repTokenPos = 0; + int nextTokenPos = 0; + for (int l = 0; l <= j; l++) { + repTokenPos += positions[l]; + } + if (j <= positions.length) { + nextTokenPos = firstMatchTok + repTokenPos + positions[j + 1]; + } + if (suggestionMatches != null) { + if (matchCounter < suggestionMatches.size()) { + numbersToMatches[j] = matchCounter; + if (suggestionMatches.get(matchCounter) != null) { + final String[] matches = concatMatches(matchCounter, j, + firstMatchTok + repTokenPos, tokenReadings, nextTokenPos); + final String leftSide = errorMessage.substring(0, ind); + final String rightSide = errorMessage.substring(ind + numLen); + if (matches.length == 1) { + errorMessage = leftSide + matches[0] + rightSide; + } else { + errorMessage = formatMultipleSynthesis(matches, leftSide, + rightSide); + } + matchCounter++; + newWay = true; + } + } else { + // FIXME: is this correct? 
this is how we deal with multiple matches + suggestionMatches.add(suggestionMatches.get(numbersToMatches[j])); + } + } + + if (!newWay) { + // in case <match> elements weren't used (yet) + errorMessage = errorMessage.replace("\\" + (j + 1), + tokenReadings[firstMatchTok + repTokenPos - 1].getToken()); + } + } + errMarker = errorMessage.indexOf('\\'); + numberFollows = false; + errLen = errorMessage.length(); + if (errMarker > 0 && errMarker < errLen - 1) { + numberFollows = StringTools.isPositiveNumber(errorMessage + .charAt(errMarker + 1)); + } + } + return errorMessage; + } + + private static String formatMultipleSynthesis(final String[] matches, + final String leftSide, final String rightSide) { + String errorMessage = ""; + String suggestionLeft = ""; + String suggestionRight = ""; + String rightSideNew = rightSide; + final int sPos = leftSide.lastIndexOf(SUGG_TAG); + if (sPos > 0) { + suggestionLeft = leftSide.substring(sPos + SUGG_TAG.length()); + } + if (StringTools.isEmpty(suggestionLeft)) { + errorMessage = leftSide; + } else { + errorMessage = leftSide.substring(0, leftSide.lastIndexOf(SUGG_TAG)) + + SUGG_TAG; + } + final int rPos = rightSide.indexOf(END_SUGG_TAG); + if (rPos > 0) { + suggestionRight = rightSide.substring(0, rPos); + } + if (!StringTools.isEmpty(suggestionRight)) { + rightSideNew = rightSide.substring(rightSide.indexOf(END_SUGG_TAG)); + } + final int lastLeftSugEnd = leftSide.indexOf(END_SUGG_TAG); + final int lastLeftSugStart = leftSide.lastIndexOf(SUGG_TAG); + final StringBuilder sb = new StringBuilder(); + sb.append(errorMessage); + for (int z = 0; z < matches.length; z++) { + sb.append(suggestionLeft); + sb.append(matches[z]); + sb.append(suggestionRight); + if ((z < matches.length - 1) && lastLeftSugEnd < lastLeftSugStart) { + sb.append(END_SUGG_TAG); + sb.append(", "); + sb.append(SUGG_TAG); + } + } + sb.append(rightSideNew); + return sb.toString(); + } + + /** + * For testing only. 
   */
  public final List<Element> getElements() {
    // Hands out the patternElements list itself, not a copy.
    return patternElements;
  }

}
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/PatternRuleLoader.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/PatternRuleLoader.java
new file mode 100644
index 0000000..8156a6e
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/PatternRuleLoader.java
@@ -0,0 +1,369 @@
/* LanguageTool, a natural language style checker
 * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
 * USA
 */
package de.danielnaber.languagetool.rules.patterns;

import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;

import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;

import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

import de.danielnaber.languagetool.JLanguageTool;
import de.danielnaber.languagetool.Language;
import de.danielnaber.languagetool.rules.Category;
import de.danielnaber.languagetool.rules.IncorrectExample;

/**
 * Loads {@link PatternRule}s from an XML file.
+ * + * @author Daniel Naber + */ +public class PatternRuleLoader extends DefaultHandler { + + public final List<PatternRule> getRules(final InputStream is, + final String filename) throws IOException { + try { + final PatternRuleHandler handler = new PatternRuleHandler(); + final SAXParserFactory factory = SAXParserFactory.newInstance(); + final SAXParser saxParser = factory.newSAXParser(); + saxParser.getXMLReader().setFeature( + "http://apache.org/xml/features/nonvalidating/load-external-dtd", + false); + saxParser.parse(is, handler); + return handler.getRules(); + } catch (final Exception e) { + final IOException ioe = new IOException("Cannot load or parse '" + + filename + "'"); + ioe.initCause(e); + throw ioe; + } + } + + /** Testing only. */ + public final void main(final String[] args) throws IOException { + final PatternRuleLoader prg = new PatternRuleLoader(); + final String name = "/de/grammar.xml"; + final List<PatternRule> l = prg.getRules(JLanguageTool.getDataBroker().getFromRulesDirAsStream(name), name); + System.out.println(l); + } + +} + +class PatternRuleHandler extends XMLRuleHandler { + + private int subId; + + private boolean defaultOff; + private boolean defaultOn; + + private Category category; + private String description; + private String ruleGroupDescription; + + // =========================================================== + // SAX DocumentHandler methods + // =========================================================== + + @Override + public void startElement(final String namespaceURI, final String lName, + final String qName, final Attributes attrs) throws SAXException { + if ("category".equals(qName)) { + final String catName = attrs.getValue("name"); + final String priorityStr = attrs.getValue("priority"); + // int prio = 0; + if (priorityStr == null) { + category = new Category(catName); + } else { + category = new Category(catName, Integer.parseInt(priorityStr)); + } + + if ("off".equals(attrs.getValue(DEFAULT))) { + 
category.setDefaultOff(); + } + + } else if ("rules".equals(qName)) { + final String languageStr = attrs.getValue("lang"); + language = Language.getLanguageForShortName(languageStr); + if (language == null) { + throw new SAXException("Unknown language '" + languageStr + "'"); + } + } else if ("rule".equals(qName)) { + id = attrs.getValue("id"); + if (inRuleGroup) { + subId++; + } + if (!(inRuleGroup && defaultOff)) { + defaultOff = "off".equals(attrs.getValue(DEFAULT)); + } + + if (!(inRuleGroup && defaultOn)) { + defaultOn = "on".equals(attrs.getValue(DEFAULT)); + } + if (inRuleGroup && id == null) { + id = ruleGroupId; + } + description = attrs.getValue("name"); + if (inRuleGroup && description == null) { + description = ruleGroupDescription; + } + correctExamples = new ArrayList<String>(); + incorrectExamples = new ArrayList<IncorrectExample>(); + if (suggestionMatches != null) { + suggestionMatches.clear(); + } + } else if (PATTERN.equals(qName)) { + startPattern(attrs); + } else if (AND.equals(qName)) { + inAndGroup = true; + } else if ("unify".equals(qName)) { + inUnification = true; + uniNegation = YES.equals(attrs.getValue(NEGATE)); + } else if ("feature".equals(qName)) { + uFeature = attrs.getValue("id"); + } else if (qName.equals(TYPE)) { + uType = attrs.getValue("id"); + uTypeList.add(uType); + } else if (qName.equals(TOKEN)) { + setToken(attrs); + } else if (EXCEPTION.equals(qName)) { + setExceptions(attrs); + } else if (qName.equals(EXAMPLE) + && attrs.getValue(TYPE).equals("correct")) { + inCorrectExample = true; + correctExample = new StringBuilder(); + } else if (qName.equals(EXAMPLE) + && attrs.getValue(TYPE).equals("incorrect")) { + inIncorrectExample = true; + incorrectExample = new StringBuilder(); + exampleCorrection = new StringBuilder(); + if (attrs.getValue("correction") != null) { + exampleCorrection.append(attrs.getValue("correction")); + } + } else if ("message".equals(qName)) { + inMessage = true; + inSuggestion = false; + message = new 
StringBuilder(); + } else if ("short".equals(qName)) { + inShortMessage = true; + shortMessage = new StringBuilder(); + } else if ("rulegroup".equals(qName)) { + ruleGroupId = attrs.getValue("id"); + ruleGroupDescription = attrs.getValue("name"); + defaultOff = "off".equals(attrs.getValue(DEFAULT)); + defaultOn = "on".equals(attrs.getValue(DEFAULT)); + inRuleGroup = true; + subId = 0; + } else if ("suggestion".equals(qName) && inMessage) { + message.append("<suggestion>"); + inSuggestion = true; + } else if ("match".equals(qName)) { + setMatchElement(attrs); + } else if (qName.equals(MARKER) && inCorrectExample) { + correctExample.append("<marker>"); + } else if (qName.equals(MARKER) && inIncorrectExample) { + incorrectExample.append("<marker>"); + } else if (UNIFICATION.equals(qName)) { + uFeature = attrs.getValue("feature"); + inUnificationDef = true; + } else if ("equivalence".equals(qName)) { + uType = attrs.getValue(TYPE); + } else if (PHRASES.equals(qName)) { + inPhrases = true; + } else if ("includephrases".equals(qName)) { + phraseElementInit(); + } else if ("phrase".equals(qName) && inPhrases) { + phraseId = attrs.getValue("id"); + } else if ("phraseref".equals(qName) && (attrs.getValue("idref") != null)) { + preparePhrase(attrs); + } + } + + @Override + public void endElement(final String namespaceURI, final String sName, + final String qName) throws SAXException { + if ("rule".equals(qName)) { + phraseElementInit(); + if (phraseElementList.isEmpty()) { + final PatternRule rule = new PatternRule(id, language, elementList, + description, message.toString(), shortMessage.toString()); + prepareRule(rule); + rules.add(rule); + } else { + if (!elementList.isEmpty()) { + for (final ArrayList<Element> ph : phraseElementList) { + ph.addAll(new ArrayList<Element>(elementList)); + } + } + + for (final ArrayList<Element> phraseElement : phraseElementList) { + processElement(phraseElement); + final PatternRule rule = new PatternRule(id, language, phraseElement, + 
description, message.toString(), shortMessage.toString(), + phraseElementList.size() > 1); + prepareRule(rule); + rules.add(rule); + } + } + elementList.clear(); + if (phraseElementList != null) { + phraseElementList.clear(); + } + + } else if (qName.equals(EXCEPTION)) { + finalizeExceptions(); + } else if (qName.equals(AND)) { + inAndGroup = false; + andGroupCounter = 0; + tokenCounter++; + } else if (qName.equals(TOKEN)) { + finalizeTokens(); + } else if (qName.equals(PATTERN)) { + checkMarkPositions(); + inPattern = false; + if (lastPhrase) { + elementList.clear(); + } + if (phraseElementList == null || phraseElementList.isEmpty()) { + checkPositions(0); + } else { + for (List<Element> elements : phraseElementList) { + checkPositions(elements.size()); + } + } + tokenCounter = 0; + } else if (qName.equals(EXAMPLE)) { + if (inCorrectExample) { + correctExamples.add(correctExample.toString()); + } else if (inIncorrectExample) { + IncorrectExample example = null; + final String[] corrections = exampleCorrection.toString().split("\\|"); + if (corrections.length > 0 && corrections[0].length() > 0) { + example = new IncorrectExample(incorrectExample.toString(), + corrections); + } else { + example = new IncorrectExample(incorrectExample.toString()); + } + incorrectExamples.add(example); + } + inCorrectExample = false; + inIncorrectExample = false; + correctExample = new StringBuilder(); + incorrectExample = new StringBuilder(); + exampleCorrection = new StringBuilder(); + } else if ("message".equals(qName)) { + suggestionMatches = addLegacyMatches(); + inMessage = false; + } else if ("short".equals(qName)) { + inShortMessage = false; + } else if ("match".equals(qName)) { + if (inMessage) { + suggestionMatches.get(suggestionMatches.size() - 1).setLemmaString( + match.toString()); + } else if (inToken) { + tokenReference.setLemmaString(match.toString()); + } + inMatch = false; + } else if ("rulegroup".equals(qName)) { + inRuleGroup = false; + } else if 
("suggestion".equals(qName) && inMessage) { + message.append("</suggestion>"); + inSuggestion = false; + } else if (qName.equals(MARKER) && inCorrectExample) { + correctExample.append("</marker>"); + } else if (qName.equals(MARKER) && inIncorrectExample) { + incorrectExample.append("</marker>"); + } else if ("phrase".equals(qName) && inPhrases) { + finalizePhrase(); + } else if ("includephrases".equals(qName)) { + elementList.clear(); + } else if (PHRASES.equals(qName) && inPhrases) { + inPhrases = false; + } else if (UNIFICATION.equals(qName)) { + inUnificationDef = false; + } else if ("feature".equals(qName)) { + equivalenceFeatures.put(uFeature, uTypeList); + uTypeList = new ArrayList<String>(); + } else if ("unify".equals(qName)) { + inUnification = false; + //clear the features... + equivalenceFeatures = new HashMap<String, List<String>>(); + } + } + + private void prepareRule(final PatternRule rule) { + rule.setStartPositionCorrection(startPositionCorrection); + rule.setEndPositionCorrection(endPositionCorrection); + startPositionCorrection = 0; + endPositionCorrection = 0; + rule.setCorrectExamples(correctExamples); + rule.setIncorrectExamples(incorrectExamples); + rule.setCategory(category); + if (inRuleGroup) { + rule.setSubId(Integer.toString(subId)); + } + else { + rule.setSubId("1"); + } + caseSensitive = false; + if (suggestionMatches != null) { + for (final Match m : suggestionMatches) { + rule.addSuggestionMatch(m); + } + if (phraseElementList.size() <= 1) { + suggestionMatches.clear(); + } + } + if (defaultOff) { + rule.setDefaultOff(); + } + + if (category.isDefaultOff() && !defaultOn) { + rule.setDefaultOff(); + } + + } + + @Override + public void characters(final char[] buf, final int offset, final int len) { + final String s = new String(buf, offset, len); + if (inException) { + exceptions.append(s); + } else if (inToken) { + elements.append(s); + } else if (inCorrectExample) { + correctExample.append(s); + } else if (inIncorrectExample) { + 
incorrectExample.append(s); + } else if (inMatch) { + match.append(s); + } else if (inMessage) { + message.append(s); + } else if (inShortMessage) { + shortMessage.append(s); + } + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/Unifier.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/Unifier.java new file mode 100644 index 0000000..7fbb35d --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/Unifier.java @@ -0,0 +1,432 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2006 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ + +package de.danielnaber.languagetool.rules.patterns; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import de.danielnaber.languagetool.AnalyzedToken; +import de.danielnaber.languagetool.AnalyzedTokenReadings; + +/** + * Implements unification of features over tokens. + * + * @author Marcin Milkowski + */ +public class Unifier { + + //TODO: add a possibility to negate some features but not all + /** + * Negates the meaning of unification just like negation in Element tokens. 
/**
 * Creates a unifier whose equivalence tables and token sequence are empty
 * and whose counters hold their initial values.
 */
public Unifier() {
  // Equivalence definitions.
  equivalenceTypes = new HashMap<EquivalenceTypeLocator, Element>();
  equivalenceFeatures = new HashMap<String, List<String>>();

  // State collected while a token sequence is being unified.
  equivalencesMatched = new ArrayList<Map<String, Set<String>>>();
  tokSequence = new ArrayList<AnalyzedTokenReadings>();
  featuresFound = new ArrayList<Boolean>();
  tmpFeaturesFound = new ArrayList<Boolean>();

  // Counters: no reading registered yet, next token sequence slot is 1.
  readingsCounter = 1;
  tokCnt = -1;
}
+ * @param type + * Type of equivalence for the feature, for example plural, first + * person, genitive. + * @param elem + * Element specifying the equivalence. + */ + public final void setEquivalence(final String feature, final String type, + final Element elem) { + if (equivalenceTypes.containsKey(new EquivalenceTypeLocator(feature, type))) { + return; + } + equivalenceTypes.put(new EquivalenceTypeLocator(feature, type), elem); + List<String> lTypes; + if (equivalenceFeatures.containsKey(feature)) { + lTypes = equivalenceFeatures.get(feature); + } else { + lTypes = new ArrayList<String>(); + } + lTypes.add(type); + equivalenceFeatures.put(feature, lTypes); + } + + /** + * Tests if a token has shared features with other tokens. + * + * @param aToken + * - token to be tested + * @param feature + * - feature to be tested + * @param type + * - type of equivalence relation for the feature + * @return true if the token shares this type of feature with other tokens + */ + protected final boolean isSatisfied(final AnalyzedToken aToken, + final Map<String, List<String>> uFeatures) { + + if (allFeatsIn && equivalencesMatched.isEmpty()) { + return false; + } + // Error: no feature given! + if (uFeatures == null) { + return false; // throw exception?? 
+ } + boolean unified = true; + List<String> types; + + if (allFeatsIn) { + unified &= checkNext(aToken, uFeatures); + } else { + tokCnt++; + while (equivalencesMatched.size() <= tokCnt) { + equivalencesMatched.add(new HashMap<String, Set<String>>()); + } + for (final Map.Entry<String, List<String>> feat : uFeatures.entrySet()) { + types = feat.getValue(); + if (types == null || types.isEmpty()) { + types = equivalenceFeatures.get(feat.getKey()); + } + for (final String typename : types) { + final Element testElem = equivalenceTypes + .get(new EquivalenceTypeLocator(feat.getKey(), typename)); + if (testElem == null) { + return false; + } + if (testElem.isMatched(aToken)) { + if (!equivalencesMatched.get(tokCnt).containsKey(feat.getKey())) { + final Set<String> typeSet = new HashSet<String>(); + typeSet.add(typename); + equivalencesMatched.get(tokCnt).put(feat.getKey(), typeSet); + } else { + equivalencesMatched.get(tokCnt).get(feat.getKey()).add(typename); + } + } + } + unified &= equivalencesMatched.get(tokCnt).containsKey(feat.getKey()); + if (!unified) { + break; + } + } + if (unified) { + if (tokCnt == 0 || tokSequence.isEmpty()) { + tokSequence.add(new AnalyzedTokenReadings(aToken, 0)); + } else { + tokSequence.get(0).addReading(aToken); + } + } + } + return unified ^ negation; + } + + private boolean checkNext(final AnalyzedToken aToken, + final Map<String, List<String>> uFeatures) { + boolean unifiedNext = true; + boolean anyFeatUnified = false; + List<String> types; + ArrayList<Boolean> tokenFeaturesFound = new ArrayList<Boolean>(tmpFeaturesFound); + if (allFeatsIn) { + for (int i = 0; i <= tokCnt; i++) { + boolean allFeatsUnified = true; + for (Map.Entry<String, List<String>> feat : uFeatures.entrySet()) { + boolean featUnified = false; + types = feat.getValue(); + if (types == null || types.isEmpty()) { + types = equivalenceFeatures.get(feat.getKey()); + } + for (final String typename : types) { + if (featuresFound.get(i) + && 
equivalencesMatched.get(i).containsKey(feat.getKey()) + && equivalencesMatched.get(i).get(feat.getKey()).contains(typename)) { + final Element testElem = equivalenceTypes + .get(new EquivalenceTypeLocator(feat.getKey(), typename)); + featUnified = featUnified || testElem.isMatched(aToken); + } + } + allFeatsUnified &= featUnified; + } + tokenFeaturesFound.set(i, allFeatsUnified); + anyFeatUnified = anyFeatUnified || allFeatsUnified; + } + unifiedNext &= anyFeatUnified; + if (unifiedNext) { + if (tokSequence.size() == readingsCounter) { + tokSequence.add(new AnalyzedTokenReadings(aToken, 0)); + } else { + tokSequence.get(readingsCounter).addReading(aToken); + } + tmpFeaturesFound = tokenFeaturesFound; + } + } + return unifiedNext; + } + + /** + * Call after every complete token (AnalyzedTokenReadings) checked. + */ + public final void startNextToken() { + featuresFound = new ArrayList<Boolean>(tmpFeaturesFound); + readingsCounter++; + } + + /** + * Starts testing only those equivalences that were previously matched. + */ + public final void startUnify() { + allFeatsIn = true; + for (int i = 0; i <= tokCnt; i++) { + featuresFound.add(true); + } + tmpFeaturesFound = new ArrayList<Boolean>(featuresFound); + } + + public final void setNegation(final boolean neg) { + negation = neg; + } + + public final boolean getNegation() { + return negation; + } + + /** + * Resets after use of unification. Required. + */ + public final void reset() { + equivalencesMatched.clear(); + allFeatsIn = false; + negation = false; + tokCnt = -1; + featuresFound.clear(); + tmpFeaturesFound.clear(); + tokSequence.clear(); + readingsCounter = 1; + firstUnified = false; + uniMatched = false; + uniAllMatched = false; + inUnification = false; + } + + /** + * Gets a full sequence of filtered tokens. + * + * @return Array of AnalyzedTokenReadings that match equivalence relation + * defined for features tested. 
+ */ + public final AnalyzedTokenReadings[] getUnifiedTokens() { + if (tokSequence.isEmpty()) { + return null; + } + if (!firstUnified) { + AnalyzedTokenReadings tmpATR; + int first = 0; + tmpFeaturesFound.add(true); // Bentley's search idea + while (!tmpFeaturesFound.get(first)) { + first++; + } + tmpFeaturesFound.remove(tmpFeaturesFound.size() - 1); + if (first >= tmpFeaturesFound.size()) { + return null; + } + // FIXME: why this happens?? + final int numRead = tokSequence.get(0).getReadingsLength(); + if (first < numRead) { + tmpATR = new AnalyzedTokenReadings(tokSequence.get(0).getAnalyzedToken( + first), 0); + for (int i = first + 1; i <= Math.min(numRead - 1, tokCnt); i++) { + if (tmpFeaturesFound.get(i)) { + tmpATR.addReading(tokSequence.get(0).getAnalyzedToken(i)); + } + } + tokSequence.set(0, tmpATR); + } + firstUnified = true; + } + final AnalyzedTokenReadings[] atr = tokSequence + .toArray(new AnalyzedTokenReadings[tokSequence.size()]); + return atr; + } + + /** + * Tests if the token sequence is unified. + * + * @param matchToken + * AnalyzedToken token to unify + * @param feature + * String: feature to unify over + * @param type + * String: value types of the feature + * @param isUniNegated + * if true, then return negated result + * @param lastReading + * true when the matchToken is the last reading in the + * AnalyzedReadings + * @return True if the tokens in the sequence are unified. 
+ */ + public final boolean isUnified(final AnalyzedToken matchToken, + final Map<String, List<String>> uFeatures, final boolean isUniNegated, + final boolean lastReading) { + if (inUnification) { + uniMatched |= isSatisfied(matchToken, uFeatures); + uniAllMatched = uniMatched; + if (lastReading) { + startNextToken(); + unifiedTokens = getUnifiedTokens(); + uniMatched = false; + } + return uniAllMatched; + } + if (isUniNegated) { + setNegation(true); + } + isSatisfied(matchToken, uFeatures); + if (lastReading) { + inUnification = true; + uniMatched = false; + startUnify(); + } + return true; + } + + /** + * Used for getting a unified sequence in case when simple test method + * {@link #isUnified} was used. + * + * @return An array of {@link AnalyzedTokenReadings} + */ + public final AnalyzedTokenReadings[] getFinalUnified() { + if (inUnification) { + return unifiedTokens; + } + return null; + } +} + +class EquivalenceTypeLocator { + + @Override + public int hashCode() { + final int prime = 31; + int result = 1; + result = prime * result + ((feature == null) ? 0 : feature.hashCode()); + result = prime * result + ((type == null) ? 0 : type.hashCode()); + return result; + } + + @Override + public boolean equals(final Object obj) { + if (this == obj) { + return true; + } + if (obj == null) { + return false; + } + if (getClass() != obj.getClass()) { + return false; + } + final EquivalenceTypeLocator other = (EquivalenceTypeLocator) obj; + if (feature == null) { + if (other.feature != null) { + return false; + } + } else if (!feature.equals(other.feature)) { + return false; + } + if (type == null) { + if (other.type != null) { + return false; + } + } else if (!type.equals(other.type)) { + return false; + } + return true; + } + + private final String feature; + private final String type; + + EquivalenceTypeLocator(final String feature, final String type) { + this.feature = feature; + this.type = type; + } + +}
\ No newline at end of file diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/XMLRuleHandler.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/XMLRuleHandler.java new file mode 100644 index 0000000..72a852a --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/XMLRuleHandler.java @@ -0,0 +1,568 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2006 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.rules.patterns; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import org.xml.sax.Attributes; +import org.xml.sax.Locator; +import org.xml.sax.SAXException; +import org.xml.sax.SAXParseException; +import org.xml.sax.helpers.DefaultHandler; + +import de.danielnaber.languagetool.Language; +import de.danielnaber.languagetool.rules.IncorrectExample; +import de.danielnaber.languagetool.tools.StringTools; + +/** + * XML rule handler that loads rules from XML and throws + * exceptions on errors and warnings. 
/**
 * Creates a handler with an empty element list, an empty list of
 * unification types and an empty equivalence feature map.
 */
public XMLRuleHandler() {
  // Unification bookkeeping.
  uTypeList = new ArrayList<String>();
  equivalenceFeatures = new HashMap<String, List<String>>();
  // Elements of the pattern that is currently being read.
  elementList = new ArrayList<Element>();
}
exceptionSpaceBeforeSet; + + /** List of elements as specified by tokens. **/ + protected List<Element> elementList; + + /** true when phraseref is the last element in the rule. **/ + protected boolean lastPhrase; + + /** ID reference to the phrase. **/ + protected String phraseIdRef; + + /** Current phrase ID. **/ + protected String phraseId; + + protected int skipPos; + + protected String ruleGroupId; + + protected String id; + + protected Element tokenElement; + + protected Match tokenReference; + + protected List<Match> suggestionMatches; + + protected Locator pLocator; + + protected int startPositionCorrection; + protected int endPositionCorrection; + protected int tokenCounter; + + /** Phrase store - elementLists keyed by phraseIds. **/ + protected Map<String, List<List<Element>>> phraseMap; + + /** + * Logically forking element list, used for including multiple phrases in the + * current one. + **/ + protected List<ArrayList<Element>> phraseElementList; + + protected int andGroupCounter; + + protected StringBuilder shortMessage = new StringBuilder(); + protected boolean inShortMessage; + + protected boolean inUnification; + protected boolean inUnificationDef; + protected boolean uniNegation; + + protected String uFeature; + protected String uType = ""; + + protected List<String> uTypeList; + + protected Map<String, List<String>> equivalenceFeatures; + + + /** Definitions of values in XML files. 
*/ + protected static final String YES = "yes"; + protected static final String POSTAG = "postag"; + protected static final String POSTAG_REGEXP = "postag_regexp"; + protected static final String REGEXP = "regexp"; + protected static final String NEGATE = "negate"; + protected static final String INFLECTED = "inflected"; + protected static final String NEGATE_POS = "negate_pos"; + protected static final String MARKER = "marker"; + protected static final String DEFAULT = "default"; + protected static final String TYPE = "type"; + protected static final String SPACEBEFORE = "spacebefore"; + protected static final String EXAMPLE = "example"; + protected static final String SCOPE = "scope"; + protected static final String IGNORE = "ignore"; + protected static final String SKIP = "skip"; + protected static final String TOKEN = "token"; + protected static final String FEATURE = "feature"; + protected static final String UNIFY = "unify"; + protected static final String AND = "and"; + protected static final String EXCEPTION = "exception"; + protected static final String CASE_SENSITIVE = "case_sensitive"; + protected static final String PATTERN = "pattern"; + protected static final String MATCH = "match"; + protected static final String UNIFICATION = "unification"; + protected static final String RULEGROUP = "rulegroup"; + protected static final String NO = "no"; + protected static final String MARK_TO = "mark_to"; + protected static final String MARK_FROM = "mark_from"; + protected static final String PHRASES = "phrases"; + protected static final String MESSAGE = "message"; + + + public List<PatternRule> getRules() { + return rules; + } + + public void warning (final SAXParseException e) throws SAXException { + throw e; + } + + public void error (final SAXParseException e) throws SAXException { + throw e; + } + + @Override + public void setDocumentLocator(final Locator locator) { + pLocator = locator; + super.setDocumentLocator(locator); + } + + protected void resetToken() 
{ + posNegation = false; + posRegExp = false; + inToken = false; + tokenSpaceBefore = false; + tokenSpaceBeforeSet = false; + + resetException(); + exceptionSet = false; + tokenReference = null; + } + + protected void resetException() { + exceptionStringNegation = false; + exceptionStringInflected = false; + exceptionPosNegation = false; + exceptionPosRegExp = false; + exceptionStringRegExp = false; + exceptionValidNext = false; + exceptionValidPrev = false; + exceptionSpaceBefore = false; + exceptionSpaceBeforeSet = false; + } + + protected void phraseElementInit() { + // lazy init + if (phraseElementList == null) { + phraseElementList = new ArrayList<ArrayList<Element>>(); + } + } + protected void preparePhrase(final Attributes attrs) { + phraseIdRef = attrs.getValue("idref"); + if (phraseMap.containsKey(phraseIdRef)) { + for (final List<Element> curPhrEl : phraseMap.get(phraseIdRef)) { + for (final Element e : curPhrEl) { + e.setPhraseName(phraseIdRef); + } + if (elementList.isEmpty()) { + phraseElementList.add(new ArrayList<Element>(curPhrEl)); + } else { + final ArrayList<Element> prevList = new ArrayList<Element>( + elementList); + prevList.addAll(curPhrEl); + phraseElementList.add(new ArrayList<Element>(prevList)); + prevList.clear(); + } + } + lastPhrase = true; + } + } + + protected void finalizePhrase() { + // lazy init + if (phraseMap == null) { + phraseMap = new HashMap<String, List<List<Element>>>(); + } + phraseElementInit(); + if (phraseElementList.isEmpty()) { + phraseElementList.add(new ArrayList<Element>(elementList)); + } else { + for (final ArrayList<Element> ph : phraseElementList) { + ph.addAll(new ArrayList<Element>(elementList)); + } + } + + phraseMap.put(phraseId, new ArrayList<List<Element>>(phraseElementList)); + elementList.clear(); + + phraseElementList.clear(); + } + + protected void startPattern(final Attributes attrs) throws SAXException { + inPattern = true; + if (attrs.getValue(MARK_FROM) != null) { + startPositionCorrection = 
Integer.parseInt(attrs.getValue(MARK_FROM)); + } + if (attrs.getValue(MARK_TO) != null) { + endPositionCorrection = Integer.parseInt(attrs.getValue(MARK_TO)); + if (endPositionCorrection > 0) { + throw new SAXException("End position correction (mark_to="+ endPositionCorrection + + ") cannot be larger than 0: " + "\n Line: " + + pLocator.getLineNumber() + ", column: " + + pLocator.getColumnNumber() + "."); + } + } + caseSensitive = YES.equals(attrs.getValue(CASE_SENSITIVE)); + } + + + /** + * Calculates the offset of the match reference (if any) in case the match + * element has been used in the group. + * + * @param elList + * Element list where the match element was used. It is directly changed. + */ + protected void processElement(final List<Element> elList) { + int counter = 0; + for (final Element elTest : elList) { + if (elTest.getPhraseName() != null && counter > 0) { + if (elTest.isReferenceElement()) { + final int tokRef = elTest.getMatch().getTokenRef(); + elTest.getMatch().setTokenRef(tokRef + counter - 1); + final String offsetToken = elTest.getString().replace("\\" + tokRef, + "\\" + (tokRef + counter - 1)); + elTest.setStringElement(offsetToken); + } + } + counter++; + } + } + + protected void setMatchElement(final Attributes attrs) throws SAXException { + inMatch = true; + match = new StringBuilder(); + Match.CaseConversion caseConversion = Match.CaseConversion.NONE; + if (attrs.getValue("case_conversion") != null) { + caseConversion = Match.CaseConversion.toCase(attrs + .getValue("case_conversion").toUpperCase()); + } + Match.IncludeRange includeRange = Match.IncludeRange.NONE; + if (attrs.getValue("include_skipped") != null) { + includeRange = Match.IncludeRange.toRange(attrs + .getValue("include_skipped").toUpperCase()); + } + final Match mWorker = new Match(attrs.getValue(POSTAG), attrs + .getValue("postag_replace"), YES + .equals(attrs.getValue(POSTAG_REGEXP)), attrs + .getValue("regexp_match"), attrs.getValue("regexp_replace"), + caseConversion, 
YES.equals(attrs.getValue("setpos")), + includeRange); + mWorker.setInMessageOnly(!inSuggestion); + if (inMessage) { + if (suggestionMatches == null) { + suggestionMatches = new ArrayList<Match>(); + } + suggestionMatches.add(mWorker); + //add incorrect XML character for simplicity + message.append("\u0001\\"); + message.append(attrs.getValue("no")); + if (StringTools.isEmpty(attrs.getValue("no"))) { + throw new SAXException("References cannot be empty: " + "\n Line: " + + pLocator.getLineNumber() + ", column: " + + pLocator.getColumnNumber() + "."); + } else if (Integer.parseInt(attrs.getValue("no")) < 1) { + throw new SAXException("References must be larger than 0: " + + attrs.getValue("no") + "\n Line: " + pLocator.getLineNumber() + + ", column: " + pLocator.getColumnNumber() + "."); + } + } else if (inToken && attrs.getValue("no") != null) { + final int refNumber = Integer.parseInt(attrs.getValue("no")); + if (refNumber > elementList.size()) { + throw new SAXException( + "Only backward references in match elements are possible, tried to specify token " + + refNumber + + "\n Line: " + + pLocator.getLineNumber() + + ", column: " + pLocator.getColumnNumber() + "."); + } + mWorker.setTokenRef(refNumber); + tokenReference = mWorker; + elements.append('\\'); + elements.append(refNumber); + } + } + + protected void setExceptions(final Attributes attrs) { + inException = true; + exceptions = new StringBuilder(); + resetException(); + + exceptionStringNegation = YES.equals(attrs.getValue(NEGATE)); + exceptionValidNext = "next".equals(attrs.getValue(SCOPE)); + exceptionValidPrev = "previous".equals(attrs.getValue(SCOPE)); + exceptionStringInflected = YES.equals(attrs.getValue(INFLECTED)); + + if (attrs.getValue(POSTAG) != null) { + exceptionPosToken = attrs.getValue(POSTAG); + exceptionPosRegExp = YES.equals(attrs.getValue(POSTAG_REGEXP)); + exceptionPosNegation = YES.equals(attrs.getValue(NEGATE_POS)); + } + exceptionStringRegExp = YES.equals(attrs.getValue(REGEXP)); + 
if (attrs.getValue(SPACEBEFORE) != null) { + exceptionSpaceBefore = YES.equals(attrs.getValue(SPACEBEFORE)); + exceptionSpaceBeforeSet = !"ignore".equals(attrs.getValue(SPACEBEFORE)); + } + } + + protected void finalizeExceptions() { + inException = false; + if (!exceptionSet) { + tokenElement = new Element(StringTools.trimWhitespace(elements + .toString()), caseSensitive, regExpression, tokenInflected); + exceptionSet = true; + } + tokenElement.setNegation(tokenNegated); + if (!StringTools.isEmpty(exceptions.toString())) { + tokenElement.setStringException(StringTools.trimWhitespace(exceptions + .toString()), exceptionStringRegExp, exceptionStringInflected, + exceptionStringNegation, exceptionValidNext, exceptionValidPrev); + } + if (exceptionPosToken != null) { + tokenElement.setPosException(exceptionPosToken, exceptionPosRegExp, + exceptionPosNegation, exceptionValidNext, exceptionValidPrev); + exceptionPosToken = null; + } + if (exceptionSpaceBeforeSet) { + tokenElement.setExceptionSpaceBefore(exceptionSpaceBefore); + } + resetException(); + } + + protected void setToken(final Attributes attrs) { + inToken = true; + + if (lastPhrase) { + elementList.clear(); + } + + lastPhrase = false; + tokenNegated = YES.equals(attrs.getValue(NEGATE)); + tokenInflected = YES.equals(attrs.getValue(INFLECTED)); + if (attrs.getValue("skip") != null) { + skipPos = Integer.parseInt(attrs.getValue("skip")); + } + elements = new StringBuilder(); + // POSElement creation + if (attrs.getValue(POSTAG) != null) { + posToken = attrs.getValue(POSTAG); + posRegExp = YES.equals(attrs.getValue(POSTAG_REGEXP)); + posNegation = YES.equals(attrs.getValue(NEGATE_POS)); + } + regExpression = YES.equals(attrs.getValue(REGEXP)); + + if (attrs.getValue(SPACEBEFORE) != null) { + tokenSpaceBefore = YES.equals(attrs.getValue(SPACEBEFORE)); + tokenSpaceBeforeSet = !"ignore".equals(attrs.getValue(SPACEBEFORE)); + } + + if (!inAndGroup) { + tokenCounter++; + } + } + + protected void checkPositions(final 
int add) throws SAXException { + if (startPositionCorrection >= tokenCounter + add) { + throw new SAXException( + "Attempt to mark a token no. ("+ startPositionCorrection +") that is outside the pattern (" + + tokenCounter + "). Pattern elements are numbered starting from 0!" + "\n Line: " + + pLocator.getLineNumber() + ", column: " + + pLocator.getColumnNumber() + "."); + } + if (tokenCounter +add - endPositionCorrection < 0) { + throw new SAXException( + "Attempt to mark a token no. ("+ endPositionCorrection +") that is outside the pattern (" + + tokenCounter + " elements). End positions should be negative but not larger than the token count!" + + "\n Line: " + + pLocator.getLineNumber() + ", column: " + + pLocator.getColumnNumber() + "."); + } + } + + protected void checkMarkPositions() { + if (phraseElementList == null || phraseElementList.size() == 0) { + final int endMarker = elementList.size() + endPositionCorrection; + if (endMarker <= startPositionCorrection) { + throw new RuntimeException("Invalid combination of mark_from (" + startPositionCorrection + + ") and mark_to (" + endPositionCorrection + ") for rule " + id + + " with " + elementList.size() + + " tokens: the error position created by mark_from and mark_to is less than one token"); + } + } + } + + /** + * Adds Match objects for all references to tokens + * (including '\1' and the like). 
+ */ + protected List<Match> addLegacyMatches() { + if (suggestionMatches == null || suggestionMatches.isEmpty()) { + return null; + } + final List<Match> sugMatch = new ArrayList<Match>(); + final String messageStr = message.toString(); + int pos = 0; + int ind = 0; + int matchCounter = 0; + while (pos != -1) { + pos = messageStr.indexOf('\\', ind + 1); + if (pos != -1 && messageStr.length() > pos) { + if (Character.isDigit(messageStr.charAt(pos + 1))) { + if (pos == 1 || messageStr.charAt(pos - 1) != '\u0001') { + final Match mWorker = new Match(null, null, false, null, + null, Match.CaseConversion.NONE, false, Match.IncludeRange.NONE); + mWorker.setInMessageOnly(true); + sugMatch.add(mWorker); + } else if (messageStr.charAt(pos - 1) == '\u0001') { // real suggestion marker + sugMatch.add(suggestionMatches.get(matchCounter)); + message.deleteCharAt(pos - 1 - matchCounter); + matchCounter++; + } + } + } + ind = pos; + } + if (sugMatch.isEmpty()) { + return suggestionMatches; + } + return sugMatch; + } + + protected void finalizeTokens() { + if (!exceptionSet || tokenElement == null) { + tokenElement = new Element(StringTools.trimWhitespace(elements + .toString()), caseSensitive, regExpression, tokenInflected); + tokenElement.setNegation(tokenNegated); + } else { + tokenElement.setStringElement(StringTools.trimWhitespace(elements + .toString())); + } + + if (skipPos != 0) { + tokenElement.setSkipNext(skipPos); + skipPos = 0; + } + if (posToken != null) { + tokenElement.setPosElement(posToken, posRegExp, posNegation); + posToken = null; + } + + if (tokenReference != null) { + tokenElement.setMatch(tokenReference); + } + + if (inAndGroup && andGroupCounter > 0) { + elementList.get(elementList.size() - 1) + .setAndGroupElement(tokenElement); + } else { + elementList.add(tokenElement); + } + if (inAndGroup) { + andGroupCounter++; + } + + if (inUnification) { + tokenElement.setUnification(equivalenceFeatures); + if (uniNegation) { + tokenElement.setUniNegation(); + } + 
} + + if (inUnificationDef) { + language.getUnifier().setEquivalence(uFeature, uType, tokenElement); + elementList.clear(); + } + if (tokenSpaceBeforeSet) { + tokenElement.setWhitespaceBefore(tokenSpaceBefore); + } + resetToken(); + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/bitext/BitextPatternRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/bitext/BitextPatternRule.java new file mode 100644 index 0000000..1d42a17 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/bitext/BitextPatternRule.java @@ -0,0 +1,93 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2010 Marcin Miłkowski (www.languagetool.org) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ + +package de.danielnaber.languagetool.rules.patterns.bitext; + +import java.io.IOException; + +import de.danielnaber.languagetool.AnalyzedSentence; +import de.danielnaber.languagetool.rules.Rule; +import de.danielnaber.languagetool.rules.RuleMatch; +import de.danielnaber.languagetool.rules.bitext.BitextRule; +import de.danielnaber.languagetool.rules.patterns.PatternRule; + +/** + * A bitext pattern rule class. 
A BitextPatternRule describes a language error and + * can test whether a given pre-analyzed pair of source and target text + * contains that error using the {@link Rule#match} method. It uses the syntax + * of XML files similar to normal PatternRules. + * + * @author Marcin Miłkowski + */ +public class BitextPatternRule extends BitextRule { + + private final PatternRule srcRule; + private final PatternRule trgRule; + + BitextPatternRule(final PatternRule src, final PatternRule trg) { + srcRule = src; + trgRule = trg; + } + + public PatternRule getSrcRule() { + return srcRule; + } + + public PatternRule getTrgRule() { + return trgRule; + } + + @Override + public String getDescription() { + return srcRule.getDescription(); + } + + public String getMessage() { + return trgRule.getMessage(); + } + + @Override + public String getId() { + return srcRule.getId(); + } + + /** + * This method always returns an empty array. + */ + @Override + public RuleMatch[] match(AnalyzedSentence text) throws IOException { + return new RuleMatch[0]; + } + + @Override + public RuleMatch[] match(AnalyzedSentence sourceText, + AnalyzedSentence targetText) throws IOException { + if (srcRule.match(sourceText).length > 0) { + return trgRule.match(targetText); + } + return new RuleMatch[0]; + } + + @Override + public void reset() { + // TODO Auto-generated method stub + + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/bitext/BitextPatternRuleLoader.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/bitext/BitextPatternRuleLoader.java new file mode 100644 index 0000000..508f381 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/bitext/BitextPatternRuleLoader.java @@ -0,0 +1,413 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the 
GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.rules.patterns.bitext; + +import java.io.IOException; +import java.io.InputStream; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; + +import javax.xml.parsers.SAXParser; +import javax.xml.parsers.SAXParserFactory; + +import org.xml.sax.Attributes; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.DefaultHandler; + +import de.danielnaber.languagetool.Language; +import de.danielnaber.languagetool.bitext.StringPair; +import de.danielnaber.languagetool.rules.Category; +import de.danielnaber.languagetool.rules.IncorrectExample; +import de.danielnaber.languagetool.rules.bitext.IncorrectBitextExample; +import de.danielnaber.languagetool.rules.patterns.Element; +import de.danielnaber.languagetool.rules.patterns.Match; +import de.danielnaber.languagetool.rules.patterns.PatternRule; + +/** + * Loads {@link PatternRule}s from an XML file. 
+ * + * @author Marcin Miłkowski + */ +public class BitextPatternRuleLoader extends DefaultHandler { + + public final List<BitextPatternRule> getRules(final InputStream is, + final String filename) throws IOException { + final List<BitextPatternRule> rules; + try { + final PatternRuleHandler handler = new PatternRuleHandler(); + final SAXParserFactory factory = SAXParserFactory.newInstance(); + final SAXParser saxParser = factory.newSAXParser(); + /* saxParser.getXMLReader().setFeature( + "http://apache.org/xml/features/nonvalidating/load-external-dtd", + false); + */ + saxParser.parse(is, handler); + rules = handler.getBitextRules(); + return rules; + } catch (final Exception e) { + final IOException ioe = new IOException("Cannot load or parse '" + + filename + "'"); + ioe.initCause(e); + throw ioe; + } + } + +} + +class PatternRuleHandler extends BitextXMLRuleHandler { + + private int subId; + + private boolean defaultOff; + private boolean defaultOn; + + private Category category; + private String description; + private String ruleGroupDescription; + + private PatternRule srcRule; + private PatternRule trgRule; + + private IncorrectExample trgExample; + private IncorrectExample srcExample; + + private Language srcLang; + + // =========================================================== + // SAX DocumentHandler methods + // =========================================================== + + @Override + public void startElement(final String namespaceURI, final String lName, + final String qName, final Attributes attrs) throws SAXException { + if (qName.equals("category")) { + final String catName = attrs.getValue("name"); + final String priorityStr = attrs.getValue("priority"); + // int prio = 0; + if (priorityStr != null) { + category = new Category(catName, Integer.parseInt(priorityStr)); + } else { + category = new Category(catName); + } + + if ("off".equals(attrs.getValue(DEFAULT))) { + category.setDefaultOff(); + } + + } else if (qName.equals("rules")) { + final 
String languageStr = attrs.getValue("targetLang"); + language = Language.getLanguageForShortName(languageStr); + if (language == null) { + throw new SAXException("Unknown language '" + languageStr + "'"); + } + } else if (qName.equals("rule")) { + id = attrs.getValue("id"); + if (inRuleGroup) + subId++; + if (!(inRuleGroup && defaultOff)) { + defaultOff = "off".equals(attrs.getValue(DEFAULT)); + } + + if (!(inRuleGroup && defaultOn)) { + defaultOn = "on".equals(attrs.getValue(DEFAULT)); + } + if (inRuleGroup && id == null) { + id = ruleGroupId; + } + description = attrs.getValue("name"); + if (inRuleGroup && description == null) { + description = ruleGroupDescription; + } + correctExamples = new ArrayList<StringPair>(); + incorrectExamples = new ArrayList<IncorrectBitextExample>(); + if (suggestionMatches != null) { + suggestionMatches.clear(); + } + } else if (PATTERN.equals(qName) || "target".equals(qName)) { + startPattern(attrs); + } else if (AND.equals(qName)) { + inAndGroup = true; + } else if (UNIFY.equals(qName)) { + inUnification = true; + uniNegation = YES.equals(attrs.getValue(NEGATE)); + } else if (qName.equals("feature")) { + uFeature = attrs.getValue("id"); + } else if (qName.equals(TYPE)) { + uType = attrs.getValue("id"); + uTypeList.add(uType); + } else if (qName.equals(TOKEN)) { + setToken(attrs); + } else if (qName.equals(EXCEPTION)) { + setExceptions(attrs); + } else if (qName.equals(EXAMPLE) + && attrs.getValue(TYPE).equals("correct")) { + inCorrectExample = true; + correctExample = new StringBuilder(); + } else if (EXAMPLE.equals(qName) + && attrs.getValue(TYPE).equals("incorrect")) { + inIncorrectExample = true; + incorrectExample = new StringBuilder(); + exampleCorrection = new StringBuilder(); + if (attrs.getValue("correction") != null) { + exampleCorrection.append(attrs.getValue("correction")); + } + } else if (MESSAGE.equals(qName)) { + inMessage = true; + message = new StringBuilder(); + } else if (qName.equals("short")) { + 
inShortMessage = true; + shortMessage = new StringBuilder(); + } else if (qName.equals(RULEGROUP)) { + ruleGroupId = attrs.getValue("id"); + ruleGroupDescription = attrs.getValue("name"); + defaultOff = "off".equals(attrs.getValue(DEFAULT)); + defaultOn = "on".equals(attrs.getValue(DEFAULT)); + inRuleGroup = true; + subId = 0; + } else if (qName.equals("suggestion") && inMessage) { + message.append("<suggestion>"); + inSuggestion = true; + } else if (qName.equals("match")) { + setMatchElement(attrs); + } else if (qName.equals(MARKER) && inCorrectExample) { + correctExample.append("<marker>"); + } else if (qName.equals(MARKER) && inIncorrectExample) { + incorrectExample.append("<marker>"); + } else if (qName.equals("unification")) { + uFeature = attrs.getValue("feature"); + inUnificationDef = true; + } else if (qName.equals("equivalence")) { + uType = attrs.getValue(TYPE); + } else if (qName.equals("phrases")) { + inPhrases = true; + } else if (qName.equals("includephrases")) { + phraseElementInit(); + } else if (qName.equals("phrase") && inPhrases) { + phraseId = attrs.getValue("id"); + } else if (qName.equals("phraseref") && (attrs.getValue("idref") != null)) { + preparePhrase(attrs); + } else if (qName.equals("source")) { + srcLang = Language.getLanguageForShortName(attrs.getValue("lang")); + } + } + + @Override + public void endElement(final String namespaceURI, final String sName, + final String qName) throws SAXException { + + if (qName.equals("source")) { + checkMarkPositions(); + srcRule = finalizeRule(); + } else if ("target".equals(qName)) { + checkMarkPositions(); + trgRule = finalizeRule(); + } else if ("rule".equals(qName)) { + trgRule.setMessage(message.toString()); + if (suggestionMatches != null) { + for (final Match m : suggestionMatches) { + trgRule.addSuggestionMatch(m); + } + if (phraseElementList.size() <= 1) { + suggestionMatches.clear(); + } + } + final BitextPatternRule bRule = new BitextPatternRule(srcRule, trgRule); + 
bRule.setCorrectBitextExamples(correctExamples); + bRule.setIncorrectBitextExamples(incorrectExamples); + bRule.setSourceLang(srcLang); + rules.add(bRule); + } else if (qName.equals(EXCEPTION)) { + finalizeExceptions(); + } else if (qName.equals(AND)) { + inAndGroup = false; + andGroupCounter = 0; + tokenCounter++; + } else if (qName.equals(TOKEN)) { + finalizeTokens(); + } else if (qName.equals(PATTERN)) { + inPattern = false; + if (lastPhrase) { + elementList.clear(); + } + if (phraseElementList == null || phraseElementList.isEmpty()) { + checkPositions(0); + } else { + for (List<Element> elements : phraseElementList) { + checkPositions(elements.size()); + } + } + tokenCounter = 0; + } else if (qName.equals("trgExample")) { + trgExample = setExample(); + } else if (qName.equals("srcExample")) { + srcExample = setExample(); + } else if (qName.equals("example")) { + if (inCorrectExample) { + correctExamples.add(new StringPair(srcExample.getExample(), trgExample.getExample())); + } else if (inIncorrectExample) { + if (trgExample.getCorrections() == null) { + incorrectExamples.add( + new IncorrectBitextExample( + new StringPair( + srcExample.getExample(), trgExample.getExample()) + )); + } else { + List<String> l = trgExample.getCorrections(); + String str [] = l.toArray (new String [l.size ()]); + incorrectExamples.add( + new IncorrectBitextExample( + new StringPair(srcExample.getExample(), + trgExample.getExample()), str) + ); + } + } + inCorrectExample = false; + inIncorrectExample = false; + } else if (qName.equals("message")) { + suggestionMatches = addLegacyMatches(); + inMessage = false; + } else if (qName.equals("short")) { + inShortMessage = false; + } else if (qName.equals("match")) { + if (inMessage) { + suggestionMatches.get(suggestionMatches.size() - 1).setLemmaString( + match.toString()); + } else if (inToken) { + tokenReference.setLemmaString(match.toString()); + } + inMatch = false; + } else if (qName.equals("rulegroup")) { + inRuleGroup = false; + } 
else if (qName.equals("suggestion") && inMessage) { + message.append("</suggestion>"); + inSuggestion = false; + } else if (qName.equals(MARKER) && inCorrectExample) { + correctExample.append("</marker>"); + } else if (qName.equals(MARKER) && inIncorrectExample) { + incorrectExample.append("</marker>"); + } else if (qName.equals("phrase") && inPhrases) { + finalizePhrase(); + } else if (qName.equals("includephrases")) { + elementList.clear(); + } else if (qName.equals("phrases") && inPhrases) { + inPhrases = false; + } else if (qName.equals("unification")) { + inUnificationDef = false; + } else if (qName.equals("feature")) { + equivalenceFeatures.put(uFeature, uTypeList); + uTypeList = new ArrayList<String>(); + } else if (qName.equals("unify")) { + inUnification = false; + //clear the features... + equivalenceFeatures = new HashMap<String, List<String>>(); + } + } + + private IncorrectExample setExample() { + IncorrectExample example = null; + if (inCorrectExample) { + example = new IncorrectExample(correctExample.toString()); + } else if (inIncorrectExample) { + final String[] corrections = exampleCorrection.toString().split("\\|"); + if (corrections.length > 0 && corrections[0].length() > 0) { + example = new IncorrectExample(incorrectExample.toString(), + corrections); + } else { + example = new IncorrectExample(incorrectExample.toString()); + } + } + correctExample = new StringBuilder(); + incorrectExample = new StringBuilder(); + exampleCorrection = new StringBuilder(); + return example; + } + + private PatternRule finalizeRule() { + PatternRule rule = null; + phraseElementInit(); + if (phraseElementList.isEmpty()) { + rule = new PatternRule(id, language, elementList, + description, "", shortMessage.toString()); + prepareRule(rule); + } else { + if (!elementList.isEmpty()) { + for (final ArrayList<Element> ph : phraseElementList) { + ph.addAll(new ArrayList<Element>(elementList)); + } + } + + for (final ArrayList<Element> phraseElement : phraseElementList) { 
+ processElement(phraseElement); + rule = new PatternRule(id, language, phraseElement, + description, message.toString(), shortMessage.toString(), + phraseElementList.size() > 1); + prepareRule(rule); + } + } + elementList.clear(); + if (phraseElementList != null) { + phraseElementList.clear(); + } + startPositionCorrection = 0; + endPositionCorrection = 0; + return rule; + } + private void prepareRule(final PatternRule rule) { + rule.setStartPositionCorrection(startPositionCorrection); + rule.setEndPositionCorrection(endPositionCorrection); + startPositionCorrection = 0; + endPositionCorrection = 0; + rule.setCategory(category); + if (inRuleGroup) + rule.setSubId(Integer.toString(subId)); + else + rule.setSubId("1"); + caseSensitive = false; + if (defaultOff) { + rule.setDefaultOff(); + } + + if (category.isDefaultOff() && !defaultOn) { + rule.setDefaultOff(); + } + + } + + @Override + public void characters(final char[] buf, final int offset, final int len) { + final String s = new String(buf, offset, len); + if (inException) { + exceptions.append(s); + } else if (inToken) { + elements.append(s); + } else if (inCorrectExample) { + correctExample.append(s); + } else if (inIncorrectExample) { + incorrectExample.append(s); + } else if (inMatch) { + match.append(s); + } else if (inMessage) { + message.append(s); + } else if (inShortMessage) { + shortMessage.append(s); + } + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/bitext/BitextXMLRuleHandler.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/bitext/BitextXMLRuleHandler.java new file mode 100644 index 0000000..02f5a04 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/bitext/BitextXMLRuleHandler.java @@ -0,0 +1,56 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2006 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under 
the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.rules.patterns.bitext; + +import java.util.ArrayList; +import java.util.List; + +import org.xml.sax.SAXException; +import org.xml.sax.SAXParseException; + +import de.danielnaber.languagetool.bitext.StringPair; +import de.danielnaber.languagetool.rules.bitext.IncorrectBitextExample; +import de.danielnaber.languagetool.rules.patterns.XMLRuleHandler; + +/** + * XML rule handler that loads rules from XML and throws + * exceptions on errors and warnings. 
+ * + * @author Daniel Naber + */ +class BitextXMLRuleHandler extends XMLRuleHandler { + + List<BitextPatternRule> rules = new ArrayList<BitextPatternRule>(); + + List<StringPair> correctExamples = new ArrayList<StringPair>(); + List<IncorrectBitextExample> incorrectExamples = new ArrayList<IncorrectBitextExample>(); + + List<BitextPatternRule> getBitextRules() { + return rules; + } + + public void warning (final SAXParseException e) throws SAXException { + throw e; + } + + public void error (final SAXParseException e) throws SAXException { + throw e; + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/bitext/FalseFriendsAsBitextLoader.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/bitext/FalseFriendsAsBitextLoader.java new file mode 100644 index 0000000..87c30a5 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/patterns/bitext/FalseFriendsAsBitextLoader.java @@ -0,0 +1,72 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2010 Marcin Miłkowski (www.languagetool.org) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ + +package de.danielnaber.languagetool.rules.patterns.bitext; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; + +import javax.xml.parsers.ParserConfigurationException; + +import org.xml.sax.SAXException; + +import de.danielnaber.languagetool.Language; +import de.danielnaber.languagetool.rules.patterns.FalseFriendRuleLoader; +import de.danielnaber.languagetool.rules.patterns.PatternRule; + +/** + * Loads the false friend rules as bitext pattern rules. Note that the resulting + * rules have suggestions that are not really customizable, in contradistinction + * to the 'real' bitext pattern rules. + * + * @author Marcin Miłkowski + * + */ +public class FalseFriendsAsBitextLoader { + + public List<BitextPatternRule> getFalseFriendsAsBitext(final String filename, + final Language motherTongue, final Language language) throws ParserConfigurationException, SAXException, IOException { + final FalseFriendRuleLoader ruleLoader = new FalseFriendRuleLoader(); + List<BitextPatternRule> bRules = new ArrayList<BitextPatternRule>(); + List<PatternRule> rules1 = + ruleLoader.getRules(this.getClass().getResourceAsStream(filename), + motherTongue, language); + List<PatternRule> rules2 = + ruleLoader.getRules(this.getClass().getResourceAsStream(filename), + language, motherTongue); + HashMap<String, PatternRule> srcRules = new HashMap<String, PatternRule>(); + for (PatternRule rule : rules1) { + srcRules.put(rule.getId(), rule); + } + for (PatternRule rule : rules2) { + if (srcRules.containsKey(rule.getId())) { + BitextPatternRule bRule = new BitextPatternRule( + srcRules.get(rule.getId()), rule); + bRule.setSourceLang(motherTongue); + bRule.setCategory(rule.getCategory()); + bRules.add(bRule); + 
} + } + return bRules; + } + +} + diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/pl/CompoundRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/pl/CompoundRule.java new file mode 100644 index 0000000..6d2ff17 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/pl/CompoundRule.java @@ -0,0 +1,55 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2006 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.rules.pl; + +import java.io.IOException; +import java.util.ResourceBundle; + +import de.danielnaber.languagetool.JLanguageTool; +import de.danielnaber.languagetool.rules.AbstractCompoundRule; + +/** + * Checks that compounds (if in the list) are not written as separate words. 
+ * + * @author Marcin Miłkowski, based on code by Daniel Naber + */ + +public final class CompoundRule extends AbstractCompoundRule { + + private static final String FILE_NAME = "/pl/compounds.txt"; + + public CompoundRule(final ResourceBundle messages) throws IOException { + super(messages); + loadCompoundFile(JLanguageTool.getDataBroker().getFromResourceDirAsStream(FILE_NAME), "UTF-8"); + super.setShort("Brak łącznika lub zbędny łącznik"); + super.setMsg("Ten wyraz pisze się z łącznikiem.", + "Ten wyraz pisze się razem (bez spacji ani łącznika).", + "Ten wyraz pisze się z łącznikiem lub bez niego."); + } + + public final String getId() { + return "PL_COMPOUNDS"; + } + + public final String getDescription() { + return "Sprawdza wyrazy z łącznikiem, np. „łapu capu” zamiast „łapu-capu”"; + } + + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/pl/PolishRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/pl/PolishRule.java new file mode 100644 index 0000000..0a6f01b --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/pl/PolishRule.java @@ -0,0 +1,31 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.rules.pl; + +import de.danielnaber.languagetool.rules.Rule; + +/** + * Abstract base class for Polish rules. + * + * @author Marcin Miłkowski + * + */ +public abstract class PolishRule extends Rule { + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/pl/PolishUnpairedBracketsRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/pl/PolishUnpairedBracketsRule.java new file mode 100644 index 0000000..3b83133 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/pl/PolishUnpairedBracketsRule.java @@ -0,0 +1,42 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2010 Marcin Miłkowski (http://www.languagetool.org) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ + +package de.danielnaber.languagetool.rules.pl; + +import java.util.ResourceBundle; + +import de.danielnaber.languagetool.Language; +import de.danielnaber.languagetool.rules.GenericUnpairedBracketsRule; + +public class PolishUnpairedBracketsRule extends GenericUnpairedBracketsRule { + + private static final String[] PL_START_SYMBOLS = { "[", "(", "{", "„", "»", "\"" }; + private static final String[] PL_END_SYMBOLS = { "]", ")", "}", "”", "«", "\"" }; + + public PolishUnpairedBracketsRule(final ResourceBundle messages, + final Language language) { + super(messages, language); + startSymbols = PL_START_SYMBOLS; + endSymbols = PL_END_SYMBOLS; + } + + public String getId() { + return "PL_UNPAIRED_BRACKETS"; + } +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/pl/PolishWordRepeatRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/pl/PolishWordRepeatRule.java new file mode 100644 index 0000000..a7dbb5e --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/pl/PolishWordRepeatRule.java @@ -0,0 +1,200 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.rules.pl; + +import java.util.ArrayList; +import java.util.List; +import java.util.ResourceBundle; +import java.util.TreeSet; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import de.danielnaber.languagetool.AnalyzedSentence; +import de.danielnaber.languagetool.AnalyzedTokenReadings; +import de.danielnaber.languagetool.rules.Category; +import de.danielnaber.languagetool.rules.RuleMatch; +import de.danielnaber.languagetool.tools.StringTools; +
/**
 * Rule for detecting the same word used more than once in a sentence, not
 * only in directly adjacent positions. Words are compared by lemma when every
 * reading of the token has one, otherwise by surface form. The rule is
 * switched off by default.
 *
 * @author Marcin Miłkowski
 */
public class PolishWordRepeatRule extends PolishRule {

  /**
   * Excluded dictionary words (matched against the lemma of each reading).
   */
  private static final Pattern EXC_WORDS = Pattern
      .compile("nie|tuż|aż|to|siebie|być|ani|ni|albo|"
          + "lub|czy|bądź|jako|zł|np|coraz"
          + "|bardzo|bardziej|proc|ten|jak|mln|tys|swój|mój|"
          + "twój|nasz|wasz|i|zbyt");

  /**
   * Excluded part of speech classes.
   */
  private static final Pattern EXC_POS = Pattern.compile("prep:.*|ppron.*");

  /**
   * Excluded non-words (special symbols, Roman numerals etc.), matched
   * against the token text. The first four alternatives are the XML entity
   * spellings of the quote, greater-than, less-than and ampersand characters.
   */
  private static final Pattern EXC_NONWORDS = Pattern
      .compile("&quot;|&gt;|&lt;|&amp;|[0-9].*|"
          + "M*(D?C{0,3}|C[DM])(L?X{0,3}|X[LC])(V?I{0,3}|I[VX])$");

  /**
   * Creates the rule, assigns it to the "category_misc" category when a
   * message bundle is available, and turns it off by default.
   *
   * @param messages localized messages; may be {@code null}, in which case no
   *          category is set
   */
  public PolishWordRepeatRule(final ResourceBundle messages) {
    if (messages != null) {
      super.setCategory(new Category(messages.getString("category_misc")));
    }
    setDefaultOff();
  }

  /*
   * (non-Javadoc)
   *
   * @see de.danielnaber.languagetool.rules.Rule#getId()
   */
  @Override
  public final String getId() {
    return "PL_WORD_REPEAT";
  }

  /*
   * (non-Javadoc)
   *
   * @see de.danielnaber.languagetool.rules.Rule#getDescription()
   */
  @Override
  public final String getDescription() {
    return "Powtórzenia wyrazów w zdaniu (monotonia stylistyczna)";
  }

  /**
   * Tests if any word form is repeated in the sentence.
   *
   * @param text the analyzed sentence to check
   * @return one match per token whose lemma (or surface form) was already
   *         seen earlier in the same sentence
   */
  @Override
  public final RuleMatch[] match(final AnalyzedSentence text) {
    final List<RuleMatch> ruleMatches = new ArrayList<RuleMatch>();
    final AnalyzedTokenReadings[] tokens = text.getTokensWithoutWhitespace();
    // lemmas / surface forms seen so far in this sentence
    final TreeSet<String> inflectedWords = new TreeSet<String>();
    // start from real token, 0 = SENT_START
    for (int i = 1; i < tokens.length; i++) {
      final String token = tokens[i].getToken();
      // avoid "..." etc. to be matched: one-character tokens are never words
      boolean isWord = token.length() >= 2;
      boolean hasLemma = true;

      final int readingsLen = tokens[i].getReadingsLength();
      for (int k = 0; k < readingsLen; k++) {
        final String posTag = tokens[i].getAnalyzedToken(k).getPOSTag();
        if (posTag == null) {
          // an untagged reading forces comparison by surface form
          hasLemma = false;
          continue;
        }
        if (StringTools.isEmpty(posTag)) {
          isWord = false;
          break;
        }
        // FIXME: too many false alarms here:
        final String lemma = tokens[i].getAnalyzedToken(k).getLemma();
        if (lemma == null) {
          hasLemma = false;
          break;
        }
        if (EXC_WORDS.matcher(lemma).matches()
            || EXC_POS.matcher(posTag).matches()) {
          isWord = false;
          break;
        }
      }

      if (EXC_NONWORDS.matcher(token).matches()) {
        isWord = false;
      }

      boolean repetition = false;
      if (isWord) {
        // becomes (and stays) true once a reading tagged SENT_END was seen;
        // from then on the token neither counts as a repetition nor, in the
        // lemma branch, gets recorded
        boolean sentEnd = false;
        String prevLemma = "";
        for (int j = 0; j < readingsLen; j++) {
          final String pos = tokens[i].getAnalyzedToken(j).getPOSTag();
          if (pos != null) {
            sentEnd |= "SENT_END".equals(pos);
          }
          if (hasLemma) {
            final String curLemma = tokens[i].getAnalyzedToken(j).getLemma();
            // consecutive readings sharing a lemma are counted only once
            if (!prevLemma.equals(curLemma) && !sentEnd) {
              if (inflectedWords.contains(curLemma)) {
                repetition = true;
              } else {
                inflectedWords.add(curLemma);
              }
            }
            prevLemma = curLemma;
          } else {
            if (inflectedWords.contains(token) && !sentEnd) {
              repetition = true;
            } else {
              inflectedWords.add(token);
            }
          }
        }
      }

      if (repetition) {
        final String msg = "Powtórzony wyraz w zdaniu";
        final int pos = tokens[i].getStartPos();
        final RuleMatch ruleMatch = new RuleMatch(this, pos, pos
            + token.length(), msg, "Powtórzenie wyrazu");
        ruleMatches.add(ruleMatch);
      }
    }
    return toRuleMatchArray(ruleMatches);
  }

  /*
   * (non-Javadoc)
   *
   * @see de.danielnaber.languagetool.rules.Rule#reset()
   */
  @Override
  public void reset() {
    // nothing: all state is local to match()
  }

}
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/pl/SimpleReplaceRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/pl/SimpleReplaceRule.java new file mode 100644 index 0000000..90708d9 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/pl/SimpleReplaceRule.java @@ -0,0 +1,82 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.rules.pl; + +import java.io.IOException; +import java.util.Locale; +import java.util.ResourceBundle; + +import de.danielnaber.languagetool.rules.AbstractSimpleReplaceRule; + +/** + * A rule that matches words or phrases which should not be used and suggests + * correct ones instead. + * + * Polish implementations. Loads the list of words from + * <code>rules/pl/replace.txt</code>.
+ * + * @author Marcin Miłkowski + */ +public class SimpleReplaceRule extends AbstractSimpleReplaceRule { + + public static final String POLISH_SIMPLE_REPLACE_RULE = "PL_SIMPLE_REPLACE"; + + private static final String FILE_NAME = "/pl/replace.txt"; + // locale used on case-conversion + private static final Locale PL_LOCALE = new Locale("pl"); + + public final String getFileName() { + return FILE_NAME; + } + + public SimpleReplaceRule(final ResourceBundle messages) throws IOException { + super(messages); + } + + public final String getId() { + return POLISH_SIMPLE_REPLACE_RULE; + } + + public String getDescription() { + return "Typowe literówki"; + } + + public String getShort() { + return "Literówka"; + } + + public String getSuggestion() { + return " to typowa literówka, poprawnie: "; + } + + /** + * use case-insensitive matching. + */ + public boolean isCaseSensitive() { + return false; + } + + /** + * locale used on case-conversion + */ + public Locale getLocale() { + return PL_LOCALE; + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/ro/CompoundRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/ro/CompoundRule.java new file mode 100644 index 0000000..bb9dea8 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/ro/CompoundRule.java @@ -0,0 +1,58 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2006 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.rules.ro; + +import java.io.IOException; +import java.util.ResourceBundle; + +import de.danielnaber.languagetool.JLanguageTool; +import de.danielnaber.languagetool.rules.AbstractCompoundRule; + +/** + * Checks that compounds (if in the list) are not written as separate words. + * + * @author Ionuț Păduraru, based on code by Daniel Naber + */ +public class CompoundRule extends AbstractCompoundRule { + + public static final String ROMANIAN_COMPOUND_RULE = "RO_COMPOUND"; + private static final String FILE_NAME = "/ro/compounds.txt"; + + public CompoundRule(final ResourceBundle messages) throws IOException { + super(messages); + loadCompoundFile(JLanguageTool.getDataBroker().getFromRulesDirAsStream(FILE_NAME), "UTF-8"); + super.setShort("Problemă de scriere (cratimă, spațiu, etc.)"); + super.setMsg("Cuvântul se scrie cu cratimă.", + "Cuvântul se scrie legat.", + "Cuvântul se scrie legat sau cu cratimă."); + // default value (2) is not ok for Romanian + setMaxUnHyphenatedWordCount(Integer.MAX_VALUE); + // there are words that should not be written with hyphen but as one word + setHyphenIgnored(false); + } + + public String getId() { + return ROMANIAN_COMPOUND_RULE; + } + + public String getDescription() { + return "Greșeală de scriere (cuvinte scrise legat sau cu cratimă)"; + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/ro/SimpleReplaceRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/ro/SimpleReplaceRule.java new file mode 100644 index 0000000..9e96513 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/ro/SimpleReplaceRule.java @@ -0,0 +1,264 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber 
(http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.rules.ro; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.Queue; +import java.util.ResourceBundle; +import java.util.concurrent.ArrayBlockingQueue; + +import de.danielnaber.languagetool.AnalyzedSentence; +import de.danielnaber.languagetool.AnalyzedTokenReadings; +import de.danielnaber.languagetool.JLanguageTool; +import de.danielnaber.languagetool.Language; +import de.danielnaber.languagetool.rules.AbstractSimpleReplaceRule; +import de.danielnaber.languagetool.rules.Category; +import de.danielnaber.languagetool.rules.Rule; +import de.danielnaber.languagetool.rules.RuleMatch; +import de.danielnaber.languagetool.tokenizers.Tokenizer; +import de.danielnaber.languagetool.tools.StringTools; + +/** + * A rule that matches words which should not be used and suggests correct ones instead. <br/> + * Romanian implementations. 
Loads the list of words from + * <code>/ro/replace.txt</code>.<br/><br/> + * + * Unlike AbstractSimpleReplaceRule, supports multiple words (Ex: "aqua forte" => "acvaforte").<br/><br/> + * + * Note: Merge this into {@link AbstractSimpleReplaceRule} eventually and simply extend from AbstractSimpleReplaceRule.<br/> + * + * @author Ionuț Păduraru + * @version $Id$ + * + */ +public class SimpleReplaceRule extends Rule { + + public static final String ROMANIAN_SIMPLE_REPLACE_RULE = "RO_SIMPLE_REPLACE"; + + private static final String FILE_NAME = "/ro/replace.txt"; + private static final String FILE_ENCODING = "utf-8"; + // locale used on case-conversion + private static Locale roLocale = new Locale("ro"); + + // list of maps containing error-corrections pairs. + // the n-th map contains key strings of (n+1) words + private List<Map<String, String>> wrongWords; + + public final String getFileName() { + return FILE_NAME; + } + + public SimpleReplaceRule(final ResourceBundle messages) throws IOException { + super(messages); + if (messages != null) { + super.setCategory(new Category(messages.getString("category_misc"))); + } + wrongWords = loadWords(JLanguageTool.getDataBroker().getFromRulesDirAsStream(getFileName())); + } + + public final String getId() { + return ROMANIAN_SIMPLE_REPLACE_RULE; + } + + public String getDescription() { + return "Cuvinte sau grupuri de cuvinte incorecte sau ieșite din uz"; + } + + public String getShort() { + return "Cuvânt incorect sau ieșit din uz"; + } + + public String getSuggestion() { + return " este incorect sau ieșit din uz, folosiți "; + } + + /** + * @return the word used to separate multiple suggestions; used only before last suggestion, the rest are comma-separated. + */ + public String getSuggestionsSeparator() { + return " sau "; + } + + /** + * use case-insensitive matching. 
+ */ + public boolean isCaseSensitive() { + return false; + } + + /** + * locale used on case-conversion + */ + public Locale getLocale() { + return roLocale; + } + + public String getEncoding() { + return FILE_ENCODING; + } + + /** + * @return the word tokenizer used for tokenization on loading words. + */ + protected Tokenizer getWordTokenizer() { + return Language.ROMANIAN.getWordTokenizer(); + } + + /** + * @return the list of wrong words for which this rule can suggest correction. The list cannot be modified. + */ + public List<Map<String, String>> getWrongWords() { + return wrongWords; + } + + /** + * Load the list of words. <br/> + * Same as {@link AbstractSimpleReplaceRule#loadWords} but allows multiple words. + * @param file the file to load. + * @return the list of maps containing the error-corrections pairs. <br/>The n-th map contains key strings of (n+1) words. + * @throws IOException when the file contains errors. + * @see #getWordTokenizer + */ + private List<Map<String, String>> loadWords(final InputStream file) + throws IOException { + final List<Map<String, String>> list = new ArrayList<Map<String, String>>(); + InputStreamReader isr = null; + BufferedReader br = null; + try { + isr = new InputStreamReader(file, getEncoding()); + br = new BufferedReader(isr); + String line; + + while ((line = br.readLine()) != null) { + line = line.trim(); + if (line.length() < 1 || line.charAt(0) == '#') { // ignore comments + continue; + } + final String[] parts = line.split("="); + if (parts.length != 2) { + throw new IOException("Format error in file " + + JLanguageTool.getDataBroker().getFromRulesDirAsUrl(getFileName()) + + ", line: " + line); + } + final String[] wrongForms = parts[0].split("\\|"); // multiple incorect forms + for (String wrongForm : wrongForms) { + int wordCount = 0; + final List<String> tokens = getWordTokenizer().tokenize(wrongForm); + for (String token : tokens) { + if (!StringTools.isWhitespace(token)) { + wordCount++; + } + } + // grow 
if necessary + for (int i = list.size(); i < wordCount; i++) { + list.add(new HashMap<String, String>()); + } + list.get(wordCount - 1).put(wrongForm, parts[1]); + } + } + + } finally { + if (br != null) { + br.close(); + } + if (isr != null) { + isr.close(); + } + } + // seal the result (prevent modification from outside this class) + List<Map<String,String>> result = new ArrayList<Map<String, String>>(); + for (Map<String, String> map : list) { + result.add(Collections.unmodifiableMap(map)); + } + return Collections.unmodifiableList(result); + } + + private void addToQueue(AnalyzedTokenReadings token, + Queue<AnalyzedTokenReadings> prevTokens) { + final boolean inserted = prevTokens.offer(token); + if (!inserted) { + prevTokens.poll(); + prevTokens.offer(token); + } + } + + public RuleMatch[] match(final AnalyzedSentence text) { + final List<RuleMatch> ruleMatches = new ArrayList<RuleMatch>(); + final AnalyzedTokenReadings[] tokens = text + .getTokensWithoutWhitespace(); + + final Queue<AnalyzedTokenReadings> prevTokens = new ArrayBlockingQueue<AnalyzedTokenReadings>(wrongWords.size()); + + for (int i = 1; i < tokens.length; i++) { + addToQueue(tokens[i], prevTokens); + final StringBuilder sb = new StringBuilder(); + final ArrayList<String> variants = new ArrayList<String>(); + final List<AnalyzedTokenReadings> prevTokensList = Arrays.asList(prevTokens.toArray(new AnalyzedTokenReadings[] {})); + for (int j = prevTokensList.size() - 1; j >= 0; j--) { + if (j != prevTokensList.size() - 1 && prevTokensList.get(j + 1).isWhitespaceBefore()) + sb.insert(0, " "); + sb.insert(0, prevTokensList.get(j).getToken()); + variants.add(0, sb.toString()); + } + final int len = variants.size(); // prevTokensList and variants have now the same length + for (int j = 0; j < len; j++) { // longest words first + final String crt = variants.get(j); + final int crtWordCount = len - j; + final String crtMatch = isCaseSensitive() ? 
wrongWords.get(crtWordCount - 1).get(crt) : wrongWords.get(crtWordCount- 1).get(crt.toLowerCase(getLocale())); + if (crtMatch != null) { + final List<String> replacements = Arrays.asList(crtMatch.split("\\|")); + String msg = crt + getSuggestion(); + for (int k = 0; k < replacements.size(); k++) { + if (k > 0) { + msg = msg + (k == replacements.size() - 1 ? getSuggestionsSeparator(): ", "); + } + msg += "<suggestion>" + replacements.get(k) + "</suggestion>"; + } + final int startPos = prevTokensList.get(len - crtWordCount).getStartPos(); + final int endPos = prevTokensList.get(len - 1).getStartPos() + prevTokensList.get(len - 1).getToken().length(); + final RuleMatch potentialRuleMatch = new RuleMatch(this, startPos, endPos, msg, getShort()); + + if (!isCaseSensitive() && StringTools.startsWithUppercase(crt)) { + for (int k = 0; k < replacements.size(); k++) { + replacements.set(k, StringTools.uppercaseFirstChar(replacements.get(k))); + } + } + potentialRuleMatch.setSuggestedReplacements(replacements); + ruleMatches.add(potentialRuleMatch); + break; + } + } + } + return toRuleMatchArray(ruleMatches); + } + + public void reset() { + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/ru/RuSimpleReplaceRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/ru/RuSimpleReplaceRule.java new file mode 100644 index 0000000..4076a9c --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/ru/RuSimpleReplaceRule.java @@ -0,0 +1,80 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.rules.ru; + +import java.io.IOException; +import java.util.Locale; +import java.util.ResourceBundle; + +import de.danielnaber.languagetool.rules.AbstractSimpleReplaceRule; +
/**
 * Russian simple-replace rule: matches words or phrases which should not be
 * used and suggests correct ones instead. The word list is the file
 * <code>rules/ru/replace.txt</code>.
 *
 * @author Yakov Reztsov
 */
public class RuSimpleReplaceRule extends AbstractSimpleReplaceRule {

  // value handed out by getFileName()
  private static final String REPLACEMENTS_FILE = "/ru/replace.txt";

  // locale used on case-conversion
  private static final Locale RUSSIAN = new Locale("ru");

  /**
   * Creates the rule; all set-up is delegated to the superclass constructor.
   *
   * @param messages handed to the superclass constructor
   * @throws IOException propagated from the superclass constructor
   */
  public RuSimpleReplaceRule(final ResourceBundle messages) throws IOException {
    super(messages);
  }

  /**
   * @return the constant {@code "/ru/replace.txt"}
   */
  public final String getFileName() {
    return REPLACEMENTS_FILE;
  }

  /**
   * @return the constant rule id {@code "RU_SIMPLE_REPLACE"}
   */
  public final String getId() {
    return "RU_SIMPLE_REPLACE";
  }

  /**
   * @return the Russian description text of this rule
   */
  public String getDescription() {
    return "Поиск ошибочных слов/фраз";
  }

  /**
   * @return the Russian short message of this rule
   */
  public String getShort() {
    return "Ошибка?";
  }

  /**
   * @return the Russian text fragment used when building a suggestion message
   */
  public String getSuggestion() {
    return " - ошибочное слово/фраза, исправление: ";
  }

  /**
   * Use case-insensitive matching.
   *
   * @return always {@code false}
   */
  public boolean isCaseSensitive() {
    return false;
  }

  /**
   * @return the locale used on case-conversion (a {@code "ru"} locale)
   */
  public Locale getLocale() {
    return RUSSIAN;
  }
}
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/ru/RussianCompoundRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/ru/RussianCompoundRule.java new file mode 100644 index 0000000..3e7d889 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/ru/RussianCompoundRule.java @@ -0,0 +1,57 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2006 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.rules.ru; + +import java.io.IOException; +import java.util.ResourceBundle; + +import de.danielnaber.languagetool.JLanguageTool; +import de.danielnaber.languagetool.rules.AbstractCompoundRule; + +/** + * Checks that compounds (if in the list) are not written as separate words. + * Russian compounds rule. + * @author Yakov Reztsov + * + * Based on German compounds rule.
+ * @author Daniel Naber + * + */ +public class RussianCompoundRule extends AbstractCompoundRule { + + private static final String FILE_NAME = "/ru/compounds_ru.txt"; + + public RussianCompoundRule(final ResourceBundle messages) throws IOException { + super(messages); + loadCompoundFile(JLanguageTool.getDataBroker().getFromResourceDirAsStream(FILE_NAME), "UTF-8"); + super.setMsg("Эти слова должны быть написаны через дефис.", + "Эти слова должны быть написаны слитно.", + "Эти слова могут быть написаны через дефис или слитно."); + + } + + public String getId() { + return "RU_COMPOUNDS"; + } + + public String getDescription() { + return "Правописание через дефис"; + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/ru/RussianRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/ru/RussianRule.java new file mode 100644 index 0000000..030abf2 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/ru/RussianRule.java @@ -0,0 +1,30 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.rules.ru; + +import de.danielnaber.languagetool.rules.Rule; + +/** + * Abstract base class for rules for the Russian language. + * Declares no members of its own; it only gives Russian-specific rules a common supertype below {@link Rule}. + * NOTE(review): the author tag below was left empty in the original file. + * + * @author + */ +public abstract class RussianRule extends Rule { + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/ru/RussianUnpairedBracketsRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/ru/RussianUnpairedBracketsRule.java new file mode 100644 index 0000000..75dd86b --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/ru/RussianUnpairedBracketsRule.java @@ -0,0 +1,62 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2010 Marcin Miłkowski (http://www.languagetool.org) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details.
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ + +package de.danielnaber.languagetool.rules.ru; + +import java.util.ResourceBundle; +import java.util.regex.Pattern; + +import de.danielnaber.languagetool.AnalyzedTokenReadings; +import de.danielnaber.languagetool.Language; +import de.danielnaber.languagetool.rules.GenericUnpairedBracketsRule; +
/**
 * Unpaired-bracket rule for Russian. It installs the Russian lists of opening
 * and closing symbols and adds one exception for list bullets such as
 * "а)" or "1а)".
 */
public class RussianUnpairedBracketsRule extends GenericUnpairedBracketsRule {

  // Symbols assigned to the inherited startSymbols field.
  private static final String[] OPENING_SYMBOLS = { "[", "(", "{", "„", "«", "\"", "'" };
  // Symbols assigned to the inherited endSymbols field, in the same order.
  private static final String[] CLOSING_SYMBOLS = { "]", ")", "}", "“", "»", "\"", "'" };

  // Text that may precede ")" in a bullet: one or two digits optionally
  // followed by lowercase Cyrillic letters, or one or two Cyrillic letters.
  private static final Pattern BULLET_LABEL = Pattern
      .compile("(?i)\\d{1,2}?[а-я]*|[а-я]|[А-Я]|[а-я][а-я]|[А-Я][А-Я]");

  /**
   * Creates the rule and replaces the inherited symbol lists with the Russian
   * ones.
   *
   * @param messages handed to the superclass constructor
   * @param language handed to the superclass constructor
   */
  public RussianUnpairedBracketsRule(final ResourceBundle messages,
      final Language language) {
    super(messages, language);
    this.startSymbols = OPENING_SYMBOLS;
    this.endSymbols = CLOSING_SYMBOLS;
  }

  /**
   * Decides whether the symbol at position {@code i} is treated normally.
   * Returns {@code false} only for a ")" that follows a bullet label (see the
   * pattern above) while no "(" is on top of the symbol stack; in every other
   * case it returns {@code true}.
   *
   * @param token not used by this implementation
   * @param tokens the tokens being checked
   * @param i index of the current token in {@code tokens}
   * @param j index of the current symbol in {@code endSymbols}
   * @param precSpace not used by this implementation
   * @param follSpace not used by this implementation
   * @return {@code false} for a Russian bullet's closing parenthesis,
   *         {@code true} otherwise
   */
  protected boolean isNoException(final String token,
      final AnalyzedTokenReadings[] tokens, final int i, final int j,
      final boolean precSpace,
      final boolean follSpace) {
    // exception for Russian bullets: а), б), Д)..., ДД), аа) and 1а).
    if (i <= 1 || !endSymbols[j].equals(")")) {
      return true;
    }
    if (!BULLET_LABEL.matcher(tokens[i - 1].getToken()).matches()) {
      return true;
    }
    // a ")" that closes a pending "(" is an ordinary bracket, not a bullet
    final boolean closesOpenParenthesis = !symbolStack.empty()
        && "(".equals(symbolStack.peek().symbol);
    return closesOpenParenthesis;
  }

  /**
   * @return the constant rule id {@code "RU_UNPAIRED_BRACKETS"}
   */
  public String getId() {
    return "RU_UNPAIRED_BRACKETS";
  }

}
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/sk/CompoundRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/sk/CompoundRule.java new file mode 100644 index 0000000..d5260bf --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/sk/CompoundRule.java @@ -0,0 +1,55 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2006 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details.
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.rules.sk; + +import java.io.IOException; +import java.util.ResourceBundle; + +import de.danielnaber.languagetool.JLanguageTool; +import de.danielnaber.languagetool.rules.AbstractCompoundRule; + +/** + * Checks that compounds (if in the list) are not written as separate words. + * + * @author Zdenko Podobný based on code by Marcin Miłkowski, Daniel Naber + */ + +public final class CompoundRule extends AbstractCompoundRule { + + private static final String FILE_NAME = "/sk/compounds.txt"; + + public CompoundRule(final ResourceBundle messages) throws IOException { + super(messages); + loadCompoundFile(JLanguageTool.getDataBroker().getFromResourceDirAsStream(FILE_NAME), "UTF-8"); + super.setShort("Problém spájania slov"); + super.setMsg("Toto slovo sa zvyčajne píše so spojovníkom.", + "Toto slovo sa obvykle píše bez spojovníka.", + "Tento výraz sa bežne píše s alebo bez spojovníka."); + } + + public final String getId() { + return "SK_COMPOUNDS"; + } + + public final String getDescription() { + return "Slová so spojovníkom napr. 
použite „česko-slovenský” namiesto „česko slovenský”"; + } + + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/sk/SlovakRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/sk/SlovakRule.java new file mode 100644 index 0000000..f28067a --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/sk/SlovakRule.java @@ -0,0 +1,31 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.rules.sk; + +import de.danielnaber.languagetool.rules.Rule; + +/** + * Abstract base class for Polish rules. 
+ * + * NOTE(review): the summary sentence says "Polish", but this class is in the sk package and is extended by Slovak rules such as SlovakVes; the wording appears to be carried over from the Polish base class it is based on. The class declares no members of its own. + * + * @author Zdenko Podobný based on Polish rules + * + */ +public abstract class SlovakRule extends Rule { + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/sk/SlovakVes.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/sk/SlovakVes.java new file mode 100644 index 0000000..3fff582 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/sk/SlovakVes.java @@ -0,0 +1,146 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2010 Luboš Lehotský lubo.lehotsky (at) gmail (dot) com + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details.
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ + +package de.danielnaber.languagetool.rules.sk; + + +import java.util.ArrayList; +import java.util.List; +import java.util.ResourceBundle; + +import de.danielnaber.languagetool.AnalyzedSentence; +import de.danielnaber.languagetool.AnalyzedTokenReadings; +import de.danielnaber.languagetool.rules.Category; +import de.danielnaber.languagetool.rules.RuleMatch; + + +
/**
 * Looks for place names ending in "Ves" (or its forms "Vsi", "Vsou") whose
 * preceding word is written with a lower-case initial, and suggests
 * capitalizing that word.
 * <p>
 * The rule is switched off by default (see the constructor).
 */
public class SlovakVes extends SlovakRule {

  /**
   * @param messages resource bundle used to look up the category name; may be
   *        <code>null</code>, in which case no category is set
   */
  public SlovakVes(final ResourceBundle messages) {
    if (messages != null) {
      super.setCategory(new Category(messages.getString("category_misc")));
    }
    setDefaultOff();
  }

  @Override
  public final String getId() {
    return "SK_VES";
  }

  @Override
  public final String getDescription() {
    return "Názvy obcí, v ktorých je \"Ves\"";
  }

  /**
   * Scans the sentence with a small three-step state machine:
   * <ol>
   * <li>a capitalized token carrying one of the tracked POS tags,</li>
   * <li>later, a lower-case token carrying one of the tracked POS tags,</li>
   * <li>later, the token "Ves", "Vsi" or "Vsou".</li>
   * </ol>
   * When step 3 is reached, the token directly before the "Ves" form is
   * reported with a capitalized suggestion and the state machine starts over,
   * so that one place name yields exactly one match.
   *
   * @param text the analyzed sentence
   * @return the matches found, possibly empty
   */
  @Override
  public final RuleMatch[] match(final AnalyzedSentence text) {
    final List<RuleMatch> ruleMatches = new ArrayList<RuleMatch>();
    final AnalyzedTokenReadings[] tokens = text.getTokensWithoutWhitespace();

    boolean capitalizedSeen = false;
    boolean lowercaseSeen = false;

    // index 0 is skipped, as in the original loop (sentence start token)
    for (int i = 1; i < tokens.length; i++) {
      final String token = tokens[i].getToken();
      if (token.length() == 0) {
        // nothing to classify; also keeps charAt(0) below from throwing
        continue;
      }
      final char firstChar = token.charAt(0);
      final boolean tracked = hasTrackedPosTag(tokens[i]);

      if (tracked && Character.isUpperCase(firstChar)) {
        capitalizedSeen = true;
      }
      if (capitalizedSeen && !lowercaseSeen
          && tracked && Character.isLowerCase(firstChar)) {
        lowercaseSeen = true;
      }
      if (lowercaseSeen && isVesForm(token)) {
        final AnalyzedTokenReadings previous = tokens[i - 1];
        final String previousToken = previous.getToken();
        if (previousToken.length() > 0) {
          final String capitalized =
              Character.toUpperCase(previousToken.charAt(0)) + previousToken.substring(1);
          final String msg = "Zmeňte začiatočné písmeno na veľké: <suggestion> "
              + capitalized + " </suggestion>";
          final int fromPos = previous.getStartPos();
          final int toPos = fromPos + previousToken.length();
          ruleMatches.add(new RuleMatch(this, fromPos, toPos,
              msg, "Zmeňte začiatočné písmeno na veľké: "));
        }
        // Start over. Without this reset every following token of the
        // sentence produced another (bogus) match on its predecessor.
        capitalizedSeen = false;
        lowercaseSeen = false;
      }
    }
    return toRuleMatchArray(ruleMatches);
  }

  /**
   * Tells whether the token has one of the POS tags this rule reacts to.
   * NOTE(review): the tags look like feminine singular adjective tags in
   * cases 1-4, 6 and 7 - confirm against the Slovak tagset.
   */
  private static boolean hasTrackedPosTag(final AnalyzedTokenReadings token) {
    return token.hasPosTag("AAfs1x") || token.hasPosTag("AAfs2x")
        || token.hasPosTag("AAfs3x")
        || token.hasPosTag("AAfs4x")
        || token.hasPosTag("AAfs6x")
        || token.hasPosTag("AAfs7x");
  }

  /** Tells whether the token is one of the capitalized forms of "Ves" the rule knows. */
  private static boolean isVesForm(final String token) {
    return token.equals("Ves") || token.equals("Vsi") || token.equals("Vsou");
  }

  @Override
  public void reset() {// nothing
  }

}

diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/sv/CompoundRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/sv/CompoundRule.java new file mode 100644 index 0000000..b3087cd --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/sv/CompoundRule.java @@ -0,0 +1,247 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2006 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your 
option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.rules.sv; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Queue; +import java.util.ResourceBundle; +import java.util.Set; +import java.util.concurrent.ArrayBlockingQueue; + +import de.danielnaber.languagetool.AnalyzedSentence; +import de.danielnaber.languagetool.AnalyzedToken; +import de.danielnaber.languagetool.AnalyzedTokenReadings; +import de.danielnaber.languagetool.JLanguageTool; +import de.danielnaber.languagetool.rules.Category; +import de.danielnaber.languagetool.rules.RuleMatch; +import de.danielnaber.languagetool.tools.StringTools; + +
/**
 * Checks that compounds (if in the list) are not written as separate words.
 * <p>
 * The list is loaded from the resource <code>/sv/compounds.txt</code>. An entry
 * ending in <code>+</code> is never suggested with a hyphen, an entry ending
 * in <code>*</code> is only suggested with a hyphen.
 *
 * @author Daniel Naber
 */
public class CompoundRule extends SwedishRule {
  //TODO for words with more than one part check if parts of it is compounded.
  //in env. allt-i-genom+ should match "allt i genom", "allt igenom" as well as "allti genom"
  private static final String FILE_NAME = "/sv/compounds.txt";

  /** Maximum number of words one compound entry may consist of. */
  private final static int MAX_TERMS = 5;

  /** Lower-cased, space-separated spellings that are considered wrong. */
  private final Set<String> incorrectCompounds = new HashSet<String>();
  /** Entries marked with a trailing "+" in the data file. */
  private final Set<String> noDashSuggestion = new HashSet<String>();
  /** Entries marked with a trailing "*" in the data file. */
  private final Set<String> onlyDashSuggestion = new HashSet<String>();

  /**
   * @param messages resource bundle used to look up the category name; may be
   *        <code>null</code>, in which case no category is set
   * @throws IOException if the compound list cannot be read or is malformed
   */
  public CompoundRule(final ResourceBundle messages) throws IOException {
    if (messages != null)
      super.setCategory(new Category(messages.getString("category_misc")));
    loadCompoundFile(JLanguageTool.getDataBroker().getFromResourceDirAsStream(FILE_NAME), "UTF-8");
  }

  @Override
  public String getId() {
    return "SV_COMPOUNDS";
  }

  @Override
  public String getDescription() {
    return "Särskrivningar, t.ex. 'cd rom' bör skrivas 'cd-rom'";
  }

  /**
   * Slides a window of up to {@link #MAX_TERMS} tokens over the sentence and
   * reports every window prefix (longest first) that is a listed compound
   * written as separate words.
   *
   * @param text the analyzed sentence
   * @return the matches found, possibly empty
   */
  @Override
  public RuleMatch[] match(final AnalyzedSentence text) {
    final List<RuleMatch> ruleMatches = new ArrayList<RuleMatch>();
    final AnalyzedTokenReadings[] tokens = text.getTokensWithoutWhitespace();
    if (tokens.length == 0) {
      // the padding below needs at least one real token in the queue
      return toRuleMatchArray(ruleMatches);
    }

    RuleMatch prevRuleMatch = null;
    final Queue<AnalyzedTokenReadings> prevTokens = new ArrayBlockingQueue<AnalyzedTokenReadings>(MAX_TERMS);
    for (int i = 0; i < tokens.length + MAX_TERMS-1; i++) {
      AnalyzedTokenReadings token = null;
      // we need to extend the token list so we find matches at the end of the original list:
      if (i >= tokens.length)
        token = new AnalyzedTokenReadings(new AnalyzedToken("", "", null), prevTokens.peek().getStartPos());
      else
        token = tokens[i];
      if (i == 0) {
        addToQueue(token, prevTokens);
        continue;
      }

      final StringBuilder sb = new StringBuilder();
      int j = 0;
      AnalyzedTokenReadings firstMatchToken = null;
      final List<String> stringsToCheck = new ArrayList<String>();
      final List<String> origStringsToCheck = new ArrayList<String>();    // original upper/lowercase spelling
      final Map<String, AnalyzedTokenReadings> stringToToken = new HashMap<String, AnalyzedTokenReadings>();
      for (AnalyzedTokenReadings atr : prevTokens) {
        if (j == 0)
          firstMatchToken = atr;
        sb.append(' ');
        sb.append(atr.getToken());
        if (j >= 1) {
          final String stringToCheck = normalize(sb.toString());
          stringsToCheck.add(stringToCheck);
          origStringsToCheck.add(sb.toString().trim());
          if (!stringToToken.containsKey(stringToCheck))
            stringToToken.put(stringToCheck, atr);
        }
        j++;
      }
      // iterate backwards over all potentially incorrect strings to make
      // sure we match longer strings first:
      for (int k = stringsToCheck.size()-1; k >= 0; k--) {
        final String stringToCheck = stringsToCheck.get(k);
        final String origStringToCheck = origStringsToCheck.get(k);
        if (incorrectCompounds.contains(stringToCheck)) {
          final AnalyzedTokenReadings atr = stringToToken.get(stringToCheck);
          String msg = null;
          final List<String> repl = new ArrayList<String>();
          if (!noDashSuggestion.contains(stringToCheck)) {
            repl.add(origStringToCheck.replace(' ', '-'));
            msg = "Dessa ord skrivs samman med bindesträck.";
          }
          // Do not assume that compounds with more than two parts should always use hyphens:
          if (!hasAllUppercaseParts(origStringToCheck) && !onlyDashSuggestion.contains(stringToCheck)) {
            repl.add(mergeCompound(origStringToCheck));
            msg = "Dessa ord skrivs samman.";
          }
          final String[] parts = stringToCheck.split(" ");
          // Only a one-letter first part forces the hyphenated form. The
          // previous condition (parts.length > 0) was always true, which made
          // the "+" / "*" markers and the branch below unreachable.
          if (parts.length > 0 && parts[0].length() == 1) {
            repl.clear();
            repl.add(origStringToCheck.replace(' ', '-'));
            msg = "Dessa ord skrivs samman med bindesträck.";
          } else if (repl.size() == 0 || repl.size() == 2) {     // == 0 shouldn't happen
            // both spellings are acceptable: offer them explicitly
            msg = "Dessa ord skrivs samman med eller utan bindesträck.";
            repl.clear();
            repl.add(origStringToCheck.replace(' ', '-'));
            repl.add(mergeCompound(origStringToCheck));
          }
          final RuleMatch ruleMatch = new RuleMatch(this, firstMatchToken.getStartPos(),
              atr.getStartPos() + atr.getToken().length(), msg);
          // avoid duplicate matches:
          if (prevRuleMatch != null && prevRuleMatch.getFromPos() == ruleMatch.getFromPos()) {
            prevRuleMatch = ruleMatch;
            break;
          }
          prevRuleMatch = ruleMatch;
          ruleMatch.setSuggestedReplacements(repl);
          ruleMatches.add(ruleMatch);
          break;
        }
      }
      addToQueue(token, prevTokens);
    }
    return toRuleMatchArray(ruleMatches);
  }

  /**
   * Trims and lower-cases the string; if it contains both a dash and a space,
   * dashes are replaced with spaces,
   * e.g. "E-Mail Adresse" -> "e mail adresse", so the error can be detected.
   *
   * @param str the raw token sequence
   * @return the normalized string used for the lookup
   */
  private String normalize(String str) {
    str = str.trim().toLowerCase();
    if (str.indexOf('-') != -1 && str.indexOf(' ') != -1) {
      str = str.replace('-', ' ');
    }
    return str;
  }

  /**
   * Despite its name, returns <code>true</code> as soon as <em>one</em> of the
   * space-separated parts is all upper case. Such strings get no merged
   * suggestion in {@link #match(AnalyzedSentence)}.
   */
  private boolean hasAllUppercaseParts(String str) {
    final String[] parts = str.split(" ");
    for (String part : parts) {
      if (StringTools.isAllUppercase(part)) {
        return true;
      }
    }
    return false;
  }

  /**
   * Joins the space-separated parts into one word; the first part keeps its
   * spelling, all following parts are lower-cased.
   */
  private String mergeCompound(String str) {
    final String[] stringParts = str.split(" ");
    final StringBuilder sb = new StringBuilder();
    for (int k = 0; k < stringParts.length; k++) {
      if (k == 0)
        sb.append(stringParts[k]);
      else
        sb.append(stringParts[k].toLowerCase());
    }
    return sb.toString();
  }

  /** Appends the token to the window, dropping the oldest token when the window is full. */
  private void addToQueue(AnalyzedTokenReadings token, Queue<AnalyzedTokenReadings> prevTokens) {
    final boolean inserted = prevTokens.offer(token);
    if (!inserted) {
      prevTokens.poll();
      prevTokens.offer(token);
    }
  }

  /**
   * Reads the compound list. Empty lines and lines starting with '#' are
   * skipped; dashes are replaced with spaces so that the sets hold the
   * incorrect (separated) spellings.
   *
   * @param file stream to read from; it is closed by this method
   * @param encoding character encoding of the stream
   * @throws IOException on read errors, or if an entry has only one word or
   *         more than {@link #MAX_TERMS} words
   */
  private void loadCompoundFile(final InputStream file, final String encoding) throws IOException {
    BufferedReader br = null;
    try {
      br = new BufferedReader(new InputStreamReader(file, encoding));
      String line;
      while ((line = br.readLine()) != null) {
        line = line.trim();
        if (line.length() < 1) {
          continue;
        }
        if (line.charAt(0) == '#') {      // ignore comments
          continue;
        }
        // the set contains the incorrect spellings, i.e. the ones without hyphen
        line = line.replace('-', ' ');
        final String[] parts = line.split(" ");
        if (parts.length > MAX_TERMS)
          throw new IOException("För många ord sammansatta: " + line + ", max antal tillåtna ord: " + MAX_TERMS);
        if (parts.length == 1)
          throw new IOException("Inget sammansatt ord: " + line);
        if (line.endsWith("+")) {
          line = line.substring(0, line.length() - 1);    // cut off "+"
          noDashSuggestion.add(line.toLowerCase());
        } else if (line.endsWith("*")) {
          line = line.substring(0, line.length() - 1);    // cut off "*"
          onlyDashSuggestion.add(line.toLowerCase());
        }
        incorrectCompounds.add(line.toLowerCase());
      }
    } finally {
      if (br != null) {
        br.close();     // also closes the wrapped stream
      } else if (file != null) {
        file.close();   // the reader could not be created, e.g. unsupported encoding
      }
    }
  }

  @Override
  public void reset() {
  }

}


diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/sv/SwedishRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/sv/SwedishRule.java new file mode 100644 index 0000000..73af8fe --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/sv/SwedishRule.java @@ -0,0 +1,31 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.rules.sv; + +import de.danielnaber.languagetool.rules.Rule; + +/** + * Abstract base class for Swedish rules. It declares no members of its own + * beyond what {@link Rule} provides. + * + * @author Marcin Miłkowski + * + */ +public abstract class SwedishRule extends Rule { + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/uk/PunctuationCheckRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/uk/PunctuationCheckRule.java new file mode 100644 index 0000000..5abc339 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/uk/PunctuationCheckRule.java @@ -0,0 +1,76 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.rules.uk; + +import java.util.ResourceBundle; + +import de.danielnaber.languagetool.rules.AbstractPunctuationCheckRule; + +/** + * A rule that matches "..", "::", "-," but not "...", "!..", "?!!", ",-" etc + * TODO: spaces seem to be special, extract from regexp? + * + * @author Andriy Rysin + */ +public class PunctuationCheckRule extends AbstractPunctuationCheckRule { + + public PunctuationCheckRule(final ResourceBundle messages) { + super(messages); + // super.setCategory(new Category(messages.getString("category_misc"))); + } + + // private boolean isTripleOk(String token) { + // return token.matches("^[.!?]$"); + // } + + /* + * (non-Javadoc) + * + * @see + * de.danielnaber.languagetool.rules.AbstractPunctuationCheckRule#isPunctsJoinOk + * (java.lang.String) + */ + protected final boolean isPunctsJoinOk(final String tokens) { + return // we ignore duplicated spaces - too many errors + tokens.matches("([,:] | *- |,- | ) *") // internal puctuation + || tokens + .matches("([.!?]|!!!|\\?\\?\\?|\\?!!|!\\.\\.|\\?\\.\\.|\\.\\.\\.) 
*"); + } + + /* + * (non-Javadoc) + * + * @see + * de.danielnaber.languagetool.rules.AbstractPunctuationCheckRule#isPunctuation + * (java.lang.String) + */ + protected final boolean isPunctuation(final String token) { + return token.matches("^[.,!?: -]$"); + } + + /* + * (non-Javadoc) + * + * @see de.danielnaber.languagetool.rules.AbstractPunctuationCheckRule#reset() + */ + public void reset() { + // nothing + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/uk/SimpleReplaceRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/uk/SimpleReplaceRule.java new file mode 100644 index 0000000..3bba01c --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/uk/SimpleReplaceRule.java @@ -0,0 +1,50 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.rules.uk; + +import java.io.IOException; +import java.util.ResourceBundle; + +import de.danielnaber.languagetool.rules.AbstractSimpleReplaceRule; + +/** + * A rule that matches words or phrases which should not be used and suggests + * correct ones instead. + * + * Ukrainian implementations. 
Loads the + * relevant words from <code>rules/uk/replace.txt</code>. + * + * @author Andriy Rysin + */ +public class SimpleReplaceRule extends AbstractSimpleReplaceRule { + + private static final String FILE_NAME = "/uk/replace.txt"; + + public final String getFileName() { + return FILE_NAME; + } + public SimpleReplaceRule(final ResourceBundle messages) throws IOException { + super(messages); + } + + public final String getId() { + return "UK_SIMPLE_REPLACE"; + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/server/HTTPServer.java b/JLanguageTool/src/java/de/danielnaber/languagetool/server/HTTPServer.java new file mode 100644 index 0000000..7e1dc99 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/server/HTTPServer.java @@ -0,0 +1,341 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2006 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.server; + +import java.io.UnsupportedEncodingException; +import java.net.URLDecoder; +import java.text.SimpleDateFormat; +import java.util.Arrays; +import java.util.Collections; +import java.util.Comparator; +import java.util.Date; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import com.prolixtech.jaminid.ContentOracle; +import com.prolixtech.jaminid.Daemon; +import com.prolixtech.jaminid.ProtocolResponseHeader; +import com.prolixtech.jaminid.Request; +import com.prolixtech.jaminid.Response; + +import de.danielnaber.languagetool.JLanguageTool; +import de.danielnaber.languagetool.Language; +import de.danielnaber.languagetool.rules.RuleMatch; +import de.danielnaber.languagetool.tools.StringTools; + +
/**
 * A small embedded HTTP server that checks text. Returns XML, prints debugging
 * to stdout/stderr.
 *
 * @author Daniel Naber
 */
public class HTTPServer extends ContentOracle {

  /**
   * JLanguageTool instances for each language (created and configured on first use).
   * Instances are organized by language and mother language.
   * This is like a tree: first level contain the Languages, next level contains JLanguageTool instances for each mother tongue.
   * Requests are handled while holding the monitor of this map.
   */
  private static final Map<Language, Map<Language, JLanguageTool>> instances = new HashMap<Language, Map<Language, JLanguageTool>>();
  /**
   * The default port on which the server is running (8081).
   */
  public static final int DEFAULT_PORT = 8081;

  private static final int CONTEXT_SIZE = 40; // characters

  private Daemon daemon;
  private int port = DEFAULT_PORT;
  private boolean verbose;

  private static final Set<String> allowedIPs = new HashSet<String>();
  static {
    // accept only requests from localhost.
    // TODO: find a cleaner solution
    allowedIPs.add("/0:0:0:0:0:0:0:1"); // Suse Linux IPv6 stuff
    allowedIPs.add("/0:0:0:0:0:0:0:1%0"); // some(?) Mac OS X
    allowedIPs.add("/127.0.0.1");
  }

  /**
   * Prepare a server - use run() to start it.
   */
  public HTTPServer() {
  }

  /**
   * Prepare a server on the given port - use run() to start it.
   */
  public HTTPServer(int port) {
    this(port, false);
  }

  /**
   * Prepare a server on the given port - use run() to start it.
   *
   * @param verbose
   *          if true, the text to check will be displayed in case of exceptions
   *          (default: false)
   */
  public HTTPServer(int port, boolean verbose) {
    this.port = port;
    this.verbose = verbose;
  }

  /**
   * Start the server.
   *
   * @throws PortBindingException if the daemon is not running after start-up
   */
  public void run() {
    System.out.println("Starting server on port " + port + "...");
    daemon = new Daemon(port, this);
    if (daemon.isRunning())
      System.out.println("Server started");
    else
      throw new PortBindingException(
          "LanguageTool server could not be started " + "on port " + port
              + ", maybe something else is running on that port already?");
  }

  /**
   * Handles one request: either lists the supported languages (location
   * ending in <code>/Languages</code>) or checks the <code>text</code>
   * parameter with the language given in <code>language</code> (and the
   * optional <code>motherTongue</code>).
   * <p>
   * Requests with an empty location or from an address that is not
   * white-listed are answered with status 403, all other failures with
   * status 500. In both cases the body is "Error: " plus the escaped
   * exception text.
   *
   * @return the XML response, or the error text
   */
  public String demultiplex(Request connRequest, Response connResponse) {
    synchronized(instances){
      final long timeStart = System.currentTimeMillis();
      String text = null;
      // Status reported by the catch block. It used to be a hard-coded 500
      // there, which overwrote the 403 set for rejected requests.
      int errorStatus = 500;
      try {
        if (StringTools.isEmpty(connRequest.getLocation())) {
          errorStatus = 403;
          throw new RuntimeException("Error: Access to "
              + connRequest.getLocation() + " denied");
        }
        if (!allowedIPs.contains(connRequest.getIPAddressString())) {
          errorStatus = 403;
          throw new RuntimeException("Error: Access from "
              + connRequest.getIPAddressString() + " denied");
        }
        // TODO: temporary fix until jaminid bug is fixed (it seems that non-ASCII characters are not handled correctly)
        // see https://sourceforge.net/tracker/?func=detail&aid=2876507&group_id=127764&atid=709370
        fixRequestParamMap(connRequest);

        // return content based on request string.
        // Refactor this when the number of known request types gets too big.

        // NOTE(review): Content-Encoding normally names a compression scheme;
        // the charset presumably belongs into Content-Type - confirm what
        // jaminid and the existing clients expect before changing it.

        // request type: list known languages
        if (connRequest.getLocation().endsWith("/Languages")) {
          connResponse.setHeaderLine(ProtocolResponseHeader.Content_Type, "text/xml");
          connResponse.setHeaderLine(ProtocolResponseHeader.Content_Encoding, "UTF-8");
          return getSupportedLanguagesAsXML();
        }

        // request type: grammar checking (default type)
        final String langParam = connRequest.getParamOrNull("language");
        if (langParam == null)
          throw new IllegalArgumentException("Missing 'language' parameter");
        final Language lang = Language.getLanguageForShortName(langParam);
        if (lang == null)
          throw new IllegalArgumentException("Unknown language '" + langParam
              + "'");
        final String motherTongueParam = connRequest.getParamOrNull("motherTongue");
        Language motherTongue = null;
        if (null != motherTongueParam)
          motherTongue = Language.getLanguageForShortName(motherTongueParam);
        final JLanguageTool lt = getLanguageToolInstance(lang, motherTongue);
        // TODO: how to take options from the client?
        // TODO: customize lt here after reading client options
        text = connRequest.getParamOrNull("text");
        if (text == null)
          throw new IllegalArgumentException("Missing 'text' parameter");
        print("Checking " + text.length() + " characters of text, language "
            + langParam);
        final List<RuleMatch> matches = lt.check(text);
        connResponse.setHeaderLine(ProtocolResponseHeader.Content_Type,
            "text/xml");
        // TODO: how to set the encoding to utf-8 if we can just return a
        // String?
        connResponse.setHeaderLine(ProtocolResponseHeader.Content_Encoding,
            "UTF-8");
        final String response = StringTools.ruleMatchesToXML(matches, text,
            CONTEXT_SIZE, StringTools.XmlPrintMode.NORMAL_XML);
        print("Check done in " + (System.currentTimeMillis() - timeStart)
            + "ms");
        return response;
      } catch (Exception e) {
        if (verbose)
          print("Exceptions was caused by this text: " + text);
        e.printStackTrace();
        connResponse.setStatus(errorStatus);
        // escape input to avoid XSS attacks:
        return "Error: " + StringTools.escapeXML(e.toString());
      }
    }
  }

  /** Prints the message to stdout, prefixed with the current date. */
  private void print(String s) {
    System.out.println(getDate() + " " + s);
  }

  /** Formats the current date with a default SimpleDateFormat. */
  private String getDate() {
    final SimpleDateFormat sdf = new SimpleDateFormat();
    return sdf.format(new Date());
  }

  /**
   * Stop the server process. Does not fail if the server was never started.
   */
  public void stop() {
    System.out.println("Stopping server...");
    if (daemon != null) {
      daemon.tearDown();
    }
    System.out.println("Server stopped");
  }

  /** Prints the command line syntax and terminates the JVM with exit code 1. */
  private static void printUsageAndExit() {
    System.out.println("Usage: HTTPServer [-v|--verbose] [-p|--port port]");
    System.exit(1);
  }

  /**
   * Private fix until jaminid bug is fixed (it seems that non-ASCII characters are not handled correctly)
   * see https://sourceforge.net/tracker/?func=detail&aid=2876507&group_id=127764&atid=709370
   * Replaces the content of the request's parameter map with the result of
   * {@link #getParamMap(Request)}.
   *
   * @param connRequest the Request object from jaminid ContentOracle.
   * @throws UnsupportedEncodingException If character encoding needs to be consulted, but named character encoding is not supported.
   */
  private void fixRequestParamMap(final Request connRequest) throws UnsupportedEncodingException {
    final Map<String, String> paramMap = getParamMap(connRequest);
    connRequest.getParamMap().clear();
    connRequest.getParamMap().putAll(paramMap);
  }

  /**
   * Private fix until jaminid bug is fixed (it seems that non-ASCII characters are not handled correctly)
   * see https://sourceforge.net/tracker/?func=detail&aid=2876507&group_id=127764&atid=709370
   * Method to get the request parameters from the request string. The default implementation can't handle
   * the UTF-8 characters (like șțîâ). We just use URLDecoder.decode() instead of the default unescape private method.
   * The body is used if it is not empty (POST), otherwise the parameter string (GET).
   * Only the values are decoded; a component without '=' is stored with an empty value.
   *
   * @param connRequest the Request object from jaminid ContentOracle.
   * @return the parameters map.
   * @throws UnsupportedEncodingException If character encoding needs to be consulted, but named character encoding is not supported
   */
  private Map<String, String> getParamMap(Request connRequest) throws UnsupportedEncodingException {
    final Map<String, String> paramMap = new HashMap<String, String>();
    if (null == connRequest)
      return paramMap;
    String requestStr = null;
    if (!StringTools.isEmpty(connRequest.getBody())) {
      requestStr = connRequest.getBody(); // POST
    } else {
      requestStr = connRequest.getParamString(); // GET
    }
    if (StringTools.isEmpty(requestStr))
      return paramMap;

    final String[] comps = requestStr.split("&");
    for (String comp : comps) {
      final int equalsLoc = comp.indexOf("=");
      if (equalsLoc > 0) {
        paramMap.put(comp.substring(0, equalsLoc),
            URLDecoder.decode(comp.substring(equalsLoc + 1), "UTF-8"));
        // TODO: Find some way to determine the encoding used on client-side
        // maybe "Accept-Charset" request header could be used.
        // UTF-8 will work on most platforms and browsers.
      } else {
        paramMap.put(comp, "");
      }
    }
    return paramMap;
  }

  /**
   * Find or create a JLanguageTool instance for a specific language and mother tongue.
   * The instance will be reused. If any customization is required (like disabled rules),
   * it will be done after acquiring this instance.
   *
   * @param lang the language to be used.
   * @param motherTongue the user's mother tongue or <code>null</code>
   * @return a JLanguageTool instance for a specific language and mother tongue.
   * @throws Exception when JLanguageTool creation failed
   */
  private JLanguageTool getLanguageToolInstance(Language lang, Language motherTongue)
    throws Exception {
    Map<Language, JLanguageTool> languageTools = instances.get(lang);
    if (null == languageTools) {
      // first call using this language
      languageTools = new HashMap<Language, JLanguageTool>();
      instances.put(lang, languageTools);
    }
    final JLanguageTool languageTool = languageTools.get(motherTongue);
    if (null == languageTool) {
      print("Creating JLanguageTool instance for language " + lang + ((null != motherTongue)?(" and mother tongue " + motherTongue):""));
      final JLanguageTool newLanguageTool = new JLanguageTool(lang, motherTongue);
      newLanguageTool.activateDefaultPatternRules();
      newLanguageTool.activateDefaultFalseFriendRules();
      languageTools.put(motherTongue, newLanguageTool);
      return newLanguageTool;
    }
    return languageTool;
  }

  /**
   * Construct an xml string containing all supported languages. <br/>The xml format is:<br/>
   * &lt;languages&gt;<br/>
   *    &lt;language name="Catalan" abbr="ca" /&gt;<br/>
   *    &lt;language name="Dutch" abbr="nl" /&gt;<br/>
   *    ...<br/>
   * &lt;/languages&gt;<br/>
   * The languages are alphabetically sorted.
   * @return an xml string containing all supported languages.
   */
  public static String getSupportedLanguagesAsXML() {
    // Sort a copy: Arrays.asList() is only a view, so sorting it directly
    // would reorder the shared Language.REAL_LANGUAGES array itself.
    final List<Language> languages = Arrays.asList(Language.REAL_LANGUAGES.clone());
    Collections.sort(languages,
      new Comparator<Language>() {
        public int compare(Language o1, Language o2) {
          return o1.getName().compareTo(o2.getName());
        }
    });
    final StringBuilder xmlBuffer = new StringBuilder("<?xml version='1.0' encoding='UTF-8'?>\n<languages>\n");
    for (Language lang : languages) {
      xmlBuffer.append(String.format("\t<language name=\"%s\" abbr=\"%s\" /> \n", lang.getName(), lang.getShortName()));
    }
    xmlBuffer.append("</languages>\n");
    return xmlBuffer.toString();
  }

  /**
   * Start the server from command line. Usage:
   * <tt>HTTPServer [-v|--verbose] [-p|--port port]</tt>
   * <p>
   * Prints the usage and exits if there are too many arguments, if the port
   * option has no value, or if the value is not a number.
   */
  public static void main(String[] args) {
    if (args.length > 3) {
      printUsageAndExit();
      return;
    }
    boolean verbose = false;
    int port = DEFAULT_PORT;
    for (int i = 0; i < args.length; i++) {
      if ("-p".equals(args[i]) || "--port".equals(args[i])) {
        if (i + 1 >= args.length) {
          // option given without a value
          printUsageAndExit();
          return;
        }
        try {
          port = Integer.parseInt(args[++i]);
        } catch (NumberFormatException e) {
          printUsageAndExit();
          return;
        }
      } else if ("-v".equals(args[i]) || "--verbose".equals(args[i])) {
        verbose = true;
      }
    }
    final HTTPServer server = new HTTPServer(port, verbose);
    server.run();
  }

}
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/server/PortBindingException.java b/JLanguageTool/src/java/de/danielnaber/languagetool/server/PortBindingException.java new file mode 100644 index 0000000..de3ae56 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/server/PortBindingException.java @@ -0,0 +1,36 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2007 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.server; + +public class PortBindingException extends RuntimeException { + + /** + * + */ + private static final long serialVersionUID = -8416700513887041339L; + + PortBindingException(String message) { + super(message); + } + + PortBindingException(String message, Throwable cause) { + super(message, cause); + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/synthesis/BaseSynthesizer.java b/JLanguageTool/src/java/de/danielnaber/languagetool/synthesis/BaseSynthesizer.java new file mode 100644 index 0000000..49ce47c --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/synthesis/BaseSynthesizer.java @@ -0,0 +1,87 @@ +package de.danielnaber.languagetool.synthesis; + +import java.io.IOException; +import java.net.URL; +import java.util.ArrayList; +import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import morfologik.stemming.Dictionary; +import morfologik.stemming.DictionaryLookup; +import morfologik.stemming.IStemmer; +import morfologik.stemming.WordData; + +import de.danielnaber.languagetool.AnalyzedToken; +import de.danielnaber.languagetool.tools.Tools; + +public class BaseSynthesizer implements Synthesizer { + + protected IStemmer synthesizer; + + private ArrayList<String> possibleTags; + + private final String tagsFileName; + private final String resourceFileName; + + public BaseSynthesizer(final String resFile, final String tagFile) { + tagsFileName = tagFile; + 
resourceFileName = resFile; + } + + /** + * Get a form of a given AnalyzedToken, where the form is defined by a + * part-of-speech tag. + * + * @param token + * AnalyzedToken to be inflected. + * @param posTag + * A desired part-of-speech tag. + * @return String value - inflected word. + */ + public String[] synthesize(final AnalyzedToken token, final String posTag) + throws IOException { + if (synthesizer == null) { + final URL url = this.getClass().getResource(resourceFileName); + synthesizer = new DictionaryLookup(Dictionary.read(url)); + } + final List<WordData> wordData = synthesizer.lookup(token.getLemma() + "|" + posTag); + final List<String> wordForms = new ArrayList<String>(); + for (WordData wd : wordData) { + wordForms.add(wd.getStem().toString()); + } + return wordForms.toArray(new String[wordForms.size()]); + } + + public String[] synthesize(final AnalyzedToken token, final String posTag, + final boolean posTagRegExp) throws IOException { + if (posTagRegExp) { + if (possibleTags == null) { + possibleTags = SynthesizerTools.loadWords(Tools + .getStream(tagsFileName)); + } + if (synthesizer == null) { + final URL url = this.getClass().getResource(resourceFileName); + synthesizer = new DictionaryLookup(Dictionary.read(url)); + } + final Pattern p = Pattern.compile(posTag); + final ArrayList<String> results = new ArrayList<String>(); + for (final String tag : possibleTags) { + final Matcher m = p.matcher(tag); + if (m.matches()) { + final List<WordData> wordForms = synthesizer.lookup(token.getLemma() + "|" + tag); + for (WordData wd : wordForms) + results.add(wd.getStem().toString()); + } + } + return results.toArray(new String[results.size()]); + } + return synthesize(token, posTag); + } + + public String getPosTagCorrection(final String posTag) { + return posTag; + } + + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/synthesis/Synthesizer.java b/JLanguageTool/src/java/de/danielnaber/languagetool/synthesis/Synthesizer.java new file 
mode 100644 index 0000000..359bb20 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/synthesis/Synthesizer.java @@ -0,0 +1,58 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.synthesis; + +import java.io.IOException; + +import de.danielnaber.languagetool.AnalyzedToken; + +/** + * Part-of-speech synthesizer interface. Implementations are + * heavily language-dependent. + * + * @author Marcin Miłkowski + */ + +public interface Synthesizer { + + /** Generates a form of the word with a given POS tag for a given lemma. + * @param token the token to be used for synthesis + * @param posTag POS tag of the form to be generated. + **/ + public String[] synthesize(AnalyzedToken token, String posTag) throws IOException; + + /** Generates a form of the word with a given POS tag for a given lemma. + * POS tag can be specified using regular expressions. + * @param token the token to be used for synthesis + * @param posTag POS tag of the form to be generated. + * @param posTagRegExp Specifies whether the posTag string is a + * regular expression. 
+ **/ + public String[] synthesize(AnalyzedToken token, String posTag, boolean posTagRegExp) throws IOException; + + /** + * Gets a corrected version of the POS tag used for synthesis. + * Useful when the tagset defines special disjunction that + * need to be converted into regexp disjunctions. + * @param posTag Original POS tag. + * to correct. + * @return @String Converted POS tag. + */ + public String getPosTagCorrection(String posTag); +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/synthesis/SynthesizerTools.java b/JLanguageTool/src/java/de/danielnaber/languagetool/synthesis/SynthesizerTools.java new file mode 100644 index 0000000..f91614e --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/synthesis/SynthesizerTools.java @@ -0,0 +1,64 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2008 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
/**
 * Static helpers shared by the synthesizers.
 */
public class SynthesizerTools {

  private SynthesizerTools() {
    // static methods only, no public constructor
  }

  /**
   * Reads a word list, one entry per line, from the given stream. Lines are
   * trimmed; empty lines and lines whose first character is <code>#</code>
   * (comments) are skipped. The stream is decoded as UTF-8 regardless of the
   * platform default charset, and it is closed before this method returns.
   *
   * @param file the stream to read from
   * @return the entries in file order
   * @throws IOException if reading from the stream fails
   */
  public static ArrayList<String> loadWords(final InputStream file) throws IOException {
    final ArrayList<String> words = new ArrayList<String>();
    // Explicit charset: the no-argument InputStreamReader constructor would use
    // whatever the platform default happens to be.
    final BufferedReader reader = new BufferedReader(new InputStreamReader(file, "UTF-8"));
    try {
      String line;
      while ((line = reader.readLine()) != null) {
        line = line.trim();
        if (line.length() == 0 || line.charAt(0) == '#') {
          continue; // blank line or comment
        }
        words.add(line);
      }
    } finally {
      // closing the BufferedReader also closes the wrapped reader and stream
      reader.close();
    }
    return words;
  }

}
+ * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.synthesis.ca; + +import de.danielnaber.languagetool.JLanguageTool; +import de.danielnaber.languagetool.synthesis.BaseSynthesizer; + +/** + * Catalan word form synthesizer. <br/> + * + * @author Marcin Miłkowski + */ + +public class CatalanSynthesizer extends BaseSynthesizer { + + private static final String RESOURCE_FILENAME = "/ca/catalan_synth.dict"; + + private static final String TAGS_FILE_NAME = "/ca/catalan_tags.txt"; + + public CatalanSynthesizer() { + super(JLanguageTool.getDataBroker().getResourceDir() + RESOURCE_FILENAME, + JLanguageTool.getDataBroker().getResourceDir() + TAGS_FILE_NAME); + } + + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/synthesis/en/EnglishSynthesizer.java b/JLanguageTool/src/java/de/danielnaber/languagetool/synthesis/en/EnglishSynthesizer.java new file mode 100644 index 0000000..12935bf --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/synthesis/en/EnglishSynthesizer.java @@ -0,0 +1,99 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.synthesis.en; + +import java.io.IOException; +import java.net.URL; +import java.util.ArrayList; +import java.util.List; + +import morfologik.stemming.Dictionary; +import morfologik.stemming.DictionaryLookup; +import morfologik.stemming.WordData; +import de.danielnaber.languagetool.AnalyzedToken; +import de.danielnaber.languagetool.JLanguageTool; +import de.danielnaber.languagetool.rules.en.AvsAnRule; +import de.danielnaber.languagetool.synthesis.BaseSynthesizer; + +/** + * English word form synthesizer. <br/> + * Based on part-of-speech lists in Public Domain. See readme.txt for details, + * the POS tagset is described in tagset.txt. + * + * There are to special additions: + * <ol> + * <li>+DT - tag that adds "a" or "an" (according to the way the word is + * pronounced) and "the"</li> + * <li>+INDT - a tag that adds only "a" or "an"</li> + * </ol> + * + * @author Marcin Miłkowski + */ + +public class EnglishSynthesizer extends BaseSynthesizer { + + private static final String RESOURCE_FILENAME = "/en/english_synth.dict"; + + private static final String TAGS_FILE_NAME = "/en/english_tags.txt"; + + /** A special tag to add determiners. **/ + private static final String ADD_DETERMINER = "+DT"; + + /** A special tag to add only indefinite articles. 
**/ + private static final String ADD_IND_DETERMINER = "+INDT"; + + public EnglishSynthesizer() { + super(JLanguageTool.getDataBroker().getResourceDir() + RESOURCE_FILENAME, + JLanguageTool.getDataBroker().getResourceDir() + TAGS_FILE_NAME); + } + + /** + * Get a form of a given AnalyzedToken, where the form is defined by a + * part-of-speech tag. + * + * @param token + * AnalyzedToken to be inflected. + * @param posTag + * A desired part-of-speech tag. + * @return String value - inflected word. + */ + public String[] synthesize(final AnalyzedToken token, final String posTag) + throws IOException { + if (ADD_DETERMINER.equals(posTag)) { + final AvsAnRule rule = new AvsAnRule(null); + return new String[] { rule.suggestAorAn(token.getToken()), + "the " + token.getToken() }; + } else if (ADD_IND_DETERMINER.equals(posTag)) { + final AvsAnRule rule = new AvsAnRule(null); + return new String[] { rule.suggestAorAn(token.getToken()) }; + } else { + if (synthesizer == null) { + final URL url = JLanguageTool.getDataBroker().getFromResourceDirAsUrl(RESOURCE_FILENAME); + synthesizer = new DictionaryLookup(Dictionary.read(url)); + } + final List<WordData> wordData = synthesizer.lookup(token.getLemma() + "|" + posTag); + final List<String> wordForms = new ArrayList<String>(); + for (WordData wd : wordData) { + wordForms.add(wd.getStem().toString()); + } + return wordForms.toArray(new String[wordForms.size()]); + } + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/synthesis/es/SpanishSynthesizer.java b/JLanguageTool/src/java/de/danielnaber/languagetool/synthesis/es/SpanishSynthesizer.java new file mode 100644 index 0000000..48ffe93 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/synthesis/es/SpanishSynthesizer.java @@ -0,0 +1,44 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under 
the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.synthesis.es; + +import de.danielnaber.languagetool.JLanguageTool; +import de.danielnaber.languagetool.synthesis.BaseSynthesizer; + +/** + * Spanish word form synthesizer. <br/> + * + * Based on Dutch word from synthesizer + * + * @author Juan Martorell + */ + +public class SpanishSynthesizer extends BaseSynthesizer { + + private static final String RESOURCE_FILENAME = "/es/spanish_synth.dict"; + + private static final String TAGS_FILE_NAME = "/es/spanish_tags.txt"; + + public SpanishSynthesizer() { + super(JLanguageTool.getDataBroker().getResourceDir() + RESOURCE_FILENAME, + JLanguageTool.getDataBroker().getResourceDir() + TAGS_FILE_NAME); + } + + +}
\ No newline at end of file diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/synthesis/nl/DutchSynthesizer.java b/JLanguageTool/src/java/de/danielnaber/languagetool/synthesis/nl/DutchSynthesizer.java new file mode 100644 index 0000000..8c85755 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/synthesis/nl/DutchSynthesizer.java @@ -0,0 +1,42 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.synthesis.nl; + +import de.danielnaber.languagetool.JLanguageTool; +import de.danielnaber.languagetool.synthesis.BaseSynthesizer; + +/** + * Dutch word form synthesizer. 
<br/> + * + * @author Marcin Miłkowski + */ + +public class DutchSynthesizer extends BaseSynthesizer { + + private static final String RESOURCE_FILENAME = "/nl/dutch_synth.dict"; + + private static final String TAGS_FILE_NAME = "/nl/dutch_tags.txt"; + + public DutchSynthesizer() { + super(JLanguageTool.getDataBroker().getResourceDir() + RESOURCE_FILENAME, + JLanguageTool.getDataBroker().getResourceDir() + TAGS_FILE_NAME); + } + + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/synthesis/pl/PolishSynthesizer.java b/JLanguageTool/src/java/de/danielnaber/languagetool/synthesis/pl/PolishSynthesizer.java new file mode 100644 index 0000000..e86312c --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/synthesis/pl/PolishSynthesizer.java @@ -0,0 +1,171 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.synthesis.pl; + +import java.io.IOException; +import java.net.URL; +import java.util.ArrayList; +import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import morfologik.stemming.Dictionary; +import morfologik.stemming.DictionaryLookup; +import morfologik.stemming.IStemmer; +import morfologik.stemming.WordData; +import de.danielnaber.languagetool.AnalyzedToken; +import de.danielnaber.languagetool.JLanguageTool; +import de.danielnaber.languagetool.synthesis.Synthesizer; +import de.danielnaber.languagetool.synthesis.SynthesizerTools; + +/** + * Polish word form synthesizer. Based on project Morfologik. + * + * @author Marcin Milkowski + */ + +public class PolishSynthesizer implements Synthesizer { + + private static final String RESOURCE_FILENAME = "/pl/polish_synth.dict"; + + private static final String TAGS_FILE_NAME = "/pl/polish_tags.txt"; + + private static final String POTENTIAL_NEGATION_TAG = ":aff"; + private static final String NEGATION_TAG = ":neg"; + private static final String COMP_TAG = "comp"; + private static final String SUP_TAG = "sup"; + + private IStemmer synthesizer; + + private ArrayList<String> possibleTags; + + public final String[] synthesize(final AnalyzedToken token, + final String posTag) throws IOException { + if (posTag == null) { + return null; + } + if (synthesizer == null) { + final URL url = JLanguageTool.getDataBroker().getFromResourceDirAsUrl(RESOURCE_FILENAME); + synthesizer = new DictionaryLookup(Dictionary.read(url)); + } + boolean isNegated = false; + if (token.getPOSTag() != null) { + isNegated = posTag.indexOf(NEGATION_TAG) > 0 + || token.getPOSTag().indexOf(NEGATION_TAG) > 0 + && !(posTag.indexOf(COMP_TAG) > 0) && 
!(posTag.indexOf(SUP_TAG) > 0); + } + if (posTag.indexOf('+') > 0) { + return synthesize(token, posTag, true); + } + final List<String> forms = getWordForms(token, posTag, isNegated); + return forms.toArray(new String[forms.size()]); + } + + public final String[] synthesize(final AnalyzedToken token, final String pos, + final boolean posTagRegExp) throws IOException { + if (pos == null) { + return null; + } + String posTag = pos; + if (posTagRegExp) { + if (possibleTags == null) { + possibleTags = SynthesizerTools.loadWords(JLanguageTool.getDataBroker(). + getFromResourceDirAsStream(TAGS_FILE_NAME)); + } + if (synthesizer == null) { + final URL url = JLanguageTool.getDataBroker().getFromResourceDirAsUrl(RESOURCE_FILENAME); + synthesizer = new DictionaryLookup(Dictionary.read(url)); + } + final ArrayList<String> results = new ArrayList<String>(); + + boolean isNegated = false; + if (token.getPOSTag() != null) { + isNegated = posTag.indexOf(NEGATION_TAG) > 0 + || token.getPOSTag().indexOf(NEGATION_TAG) > 0 + && !(posTag.indexOf(COMP_TAG) > 0) + && !(posTag.indexOf(SUP_TAG) > 0); + } + + if (isNegated) { + posTag = posTag.replaceAll(NEGATION_TAG, POTENTIAL_NEGATION_TAG + "?"); + } + + final Pattern p = Pattern.compile(posTag.replace('+', '|').replaceAll( + "m[1-3]", "m[1-3]?")); + + for (final String tag : possibleTags) { + final Matcher m = p.matcher(tag); + if (m.matches()) { + final List<String> wordForms = getWordForms(token, tag, isNegated); + if (wordForms != null) { + results.addAll(wordForms); + } + } + } + return results.toArray(new String[results.size()]); + } + return synthesize(token, posTag); + } + + public final String getPosTagCorrection(final String posTag) { + if (posTag.contains(".")) { + final String[] tags = posTag.split(":"); + int pos = -1; + for (int i = 0; i < tags.length; i++) { + if (tags[i].matches(".*[a-z]\\.[a-z].*")) { + tags[i] = "(.*" + tags[i].replace(".", ".*|.*") + ".*)"; + pos = i; + } + } + if (pos == -1) { + return posTag; + } + 
final StringBuilder sb = new StringBuilder(); + sb.append(tags[0]); + for (int i = 1; i < tags.length; i++) { + sb.append(':'); + sb.append(tags[i]); + } + return sb.toString(); + } + return posTag; + } + + private List<String> getWordForms(final AnalyzedToken token, final String posTag, + final boolean isNegated) { + final List<String> forms = new ArrayList<String>(); + final List<WordData> wordForms; + if (isNegated) { + wordForms = synthesizer.lookup(token.getLemma() + "|" + + posTag.replaceFirst(NEGATION_TAG, POTENTIAL_NEGATION_TAG)); + if (wordForms != null) { + for (WordData wd : wordForms) { + forms.add("nie" + wd.getStem().toString()); + } + } + } else { + wordForms = synthesizer.lookup(token.getLemma() + "|" + posTag); + for (WordData wd : wordForms) { + forms.add(wd.getStem().toString()); + } + } + return forms; + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/synthesis/ro/RomanianSynthesizer.java b/JLanguageTool/src/java/de/danielnaber/languagetool/synthesis/ro/RomanianSynthesizer.java new file mode 100644 index 0000000..123bb62 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/synthesis/ro/RomanianSynthesizer.java @@ -0,0 +1,40 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.synthesis.ro; + +import de.danielnaber.languagetool.JLanguageTool; +import de.danielnaber.languagetool.synthesis.BaseSynthesizer; + +/** + * Romanian word form synthesizer. <br/> + * + * @author Ionuț Păduraru + */ + +public class RomanianSynthesizer extends BaseSynthesizer { + + private static final String RESOURCE_FILENAME = "/ro/romanian_synth.dict"; + + private static final String TAGS_FILE_NAME = "/ro/romanian_tags.txt"; + + public RomanianSynthesizer() { + super(JLanguageTool.getDataBroker().getResourceDir() + RESOURCE_FILENAME, + JLanguageTool.getDataBroker().getResourceDir() + TAGS_FILE_NAME); + } +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/synthesis/ru/RussianSynthesizer.java b/JLanguageTool/src/java/de/danielnaber/languagetool/synthesis/ru/RussianSynthesizer.java new file mode 100644 index 0000000..7fd404b --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/synthesis/ru/RussianSynthesizer.java @@ -0,0 +1,44 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.synthesis.ru; + +import de.danielnaber.languagetool.JLanguageTool; +import de.danielnaber.languagetool.synthesis.BaseSynthesizer; + +/** + * Russian word form synthesizer. <br/> + * @author Yakov Reztsov + * + * Based on Dutch word from synthesizer + * + * @author Marcin Miłkowski + */ + +public class RussianSynthesizer extends BaseSynthesizer { + + private static final String RESOURCE_FILENAME = "/ru/russian_synth.dict"; + + private static final String TAGS_FILE_NAME = "/ru/tags_russian.txt"; + + public RussianSynthesizer() { + super(JLanguageTool.getDataBroker().getResourceDir() + RESOURCE_FILENAME, + JLanguageTool.getDataBroker().getResourceDir() + TAGS_FILE_NAME); + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/synthesis/sk/SlovakSynthesizer.java b/JLanguageTool/src/java/de/danielnaber/languagetool/synthesis/sk/SlovakSynthesizer.java new file mode 100644 index 0000000..0c2c018 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/synthesis/sk/SlovakSynthesizer.java @@ -0,0 +1,40 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.synthesis.sk; + +import de.danielnaber.languagetool.JLanguageTool; +import de.danielnaber.languagetool.synthesis.BaseSynthesizer; + +/** + * Slovak word form synthesizer. <br/> + * + * @author Marcin Miłkowski + */ + +public class SlovakSynthesizer extends BaseSynthesizer { + + private static final String RESOURCE_FILENAME = "/sk/slovak_synth.dict"; + + private static final String TAGS_FILE_NAME = "/sk/slovak_tags.txt"; + + public SlovakSynthesizer() { + super(JLanguageTool.getDataBroker().getResourceDir() + RESOURCE_FILENAME, + JLanguageTool.getDataBroker().getResourceDir() + TAGS_FILE_NAME); + } +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/BaseTagger.java b/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/BaseTagger.java new file mode 100644 index 0000000..d8399e1 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/BaseTagger.java @@ -0,0 +1,152 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2006 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.tagging; + +import java.io.IOException; +import java.net.URL; +import java.util.ArrayList; +import java.util.List; +import java.util.Locale; + +import morfologik.stemming.Dictionary; +import morfologik.stemming.DictionaryLookup; +import morfologik.stemming.IStemmer; +import morfologik.stemming.WordData; +import de.danielnaber.languagetool.AnalyzedToken; +import de.danielnaber.languagetool.AnalyzedTokenReadings; +import de.danielnaber.languagetool.tools.StringTools; + +/** + * Base tagger using Lametyzator. + * + * @author Marcin Milkowski + */ +public abstract class BaseTagger implements Tagger { + + private IStemmer morfologik; + private Locale conversionLocale = Locale.getDefault(); + + /** + * Get the filename, e.g., <tt>/resource/fr/french.dict</tt>. 
+ **/ + public abstract String getFileName(); + + public void setLocale(Locale loc) { + conversionLocale = loc; + } + + public List<AnalyzedTokenReadings> tag(final List<String> sentenceTokens) + throws IOException { + List<AnalyzedToken> taggerTokens; + List<AnalyzedToken> lowerTaggerTokens; + List<AnalyzedToken> upperTaggerTokens; + final List<AnalyzedTokenReadings> tokenReadings = new ArrayList<AnalyzedTokenReadings>(); + int pos = 0; + // caching IStemmer instance - lazy init + if (morfologik == null) { + final URL url = this.getClass().getResource(getFileName()); + morfologik = new DictionaryLookup(Dictionary.read(url)); + } + + for (String word : sentenceTokens) { + final List<AnalyzedToken> l = new ArrayList<AnalyzedToken>(); + final String lowerWord = word.toLowerCase(conversionLocale); + taggerTokens = asAnalyzedTokenList(word, morfologik.lookup(word)); + lowerTaggerTokens = asAnalyzedTokenList(word, morfologik.lookup(lowerWord)); + final boolean isLowercase = word.equals(lowerWord); + + //normal case + addTokens(taggerTokens, l); + + if (!isLowercase) { + //lowercase + addTokens(lowerTaggerTokens, l); + } + + //uppercase + if (lowerTaggerTokens.isEmpty() && taggerTokens.isEmpty()) { + if (isLowercase) { + upperTaggerTokens = asAnalyzedTokenList(word, + morfologik.lookup(StringTools + .uppercaseFirstChar(word))); + if (!upperTaggerTokens.isEmpty()) { + addTokens(upperTaggerTokens, l); + } else { + l.add(new AnalyzedToken(word, null, null)); + } + } else { + l.add(new AnalyzedToken(word, null, null)); + } + } + tokenReadings.add(new AnalyzedTokenReadings(l, pos)); + pos += word.length(); + } + + return tokenReadings; + + } + + protected List<AnalyzedToken> asAnalyzedTokenList(final String word, final List<WordData> wdList) { + final List<AnalyzedToken> aTokenList = new ArrayList<AnalyzedToken>(); + for (WordData wd : wdList) { + aTokenList.add(asAnalyzedToken(word, wd)); + } + return aTokenList; + } + protected AnalyzedToken asAnalyzedToken(final String 
word, final WordData wd) { + return new AnalyzedToken( + word, + StringTools.asString(wd.getTag()), + StringTools.asString(wd.getStem())); + } + + private void addTokens(final List<AnalyzedToken> taggedTokens, + final List<AnalyzedToken> l) { + if (taggedTokens != null) { + for (AnalyzedToken at : taggedTokens) { + /* + if (!StringTools.isEmpty(at.getPOSTag())) { + l.add(at); + } else { + l.add(new AnalyzedToken(at.getToken(), null, null)); + } + */ + l.add(at); + } + } + } + + + /* + * (non-Javadoc) + * + * @see + * de.danielnaber.languagetool.tagging.Tagger#createNullToken(java.lang.String + * , int) + */ + public final AnalyzedTokenReadings createNullToken(final String token, + final int startPos) { + return new AnalyzedTokenReadings(new AnalyzedToken(token, null, null), startPos); + } + + public AnalyzedToken createToken(String token, String posTag) { + return new AnalyzedToken(token, posTag, null); + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/ManualTagger.java b/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/ManualTagger.java new file mode 100644 index 0000000..ae726fa --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/ManualTagger.java @@ -0,0 +1,127 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2007 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.tagging; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import de.danielnaber.languagetool.tools.StringTools; + + + +/** + * A tagger that reads the POS information from a plain (UTF-8) text file. This + * makes it possible for the user to edit the text file to let the system know + * about new words or missing readings in the *.dict file. + * + * <p> + * File Format: <tt>fullform baseform postags</tt> (tab separated) + * + * @author Daniel Naber + */ +public class ManualTagger { + + private final Map<String, List<LookedUpTerm>> mapping; + + public ManualTagger(final InputStream file) throws IOException { + mapping = loadMapping(file, "utf8"); + } + + /** + * Look up a word's baseform and POS information. + * + * @param term + * @return an array with the baseform (at position 0, 2, ...) and the POS + * information (at position 1, 3, ...) 
or <code>null</code> if the + * word is unknown + */ + public String[] lookup(final String term) { + final List<LookedUpTerm> l = mapping.get(term); + if (l == null) { + return null; + } + final List<String> plainResult = new ArrayList<String>(); + for (final Object element : l) { + final LookedUpTerm lookedUpTerm = (LookedUpTerm) element; + plainResult.add(lookedUpTerm.baseform); + plainResult.add(lookedUpTerm.postags); + } + if (plainResult.isEmpty()) { + return null; + } + return plainResult.toArray(new String[]{}); + } + + private Map<String, List<LookedUpTerm>> loadMapping(final InputStream file, + final String encoding) throws IOException { + final Map<String, List<LookedUpTerm>> map = new HashMap<String, List<LookedUpTerm>>(); + InputStreamReader isr = null; + BufferedReader br = null; + try { + isr = new InputStreamReader(file, encoding); + br = new BufferedReader(isr); + String line; + while ((line = br.readLine()) != null) { + if (StringTools.isEmpty(line) || line.charAt(0)=='#') { + continue; + } + final String[] parts = line.split("\t"); + if (parts.length != 3) { + throw new IOException("Unknown format in " + file + ": " + line); + } + if (map.containsKey(parts[0])) { + final List<LookedUpTerm> l = map.get(parts[0]); + l.add(new LookedUpTerm(parts[1], parts[2])); + map.put(parts[0], l); + } else { + final List<LookedUpTerm> l = new ArrayList<LookedUpTerm>(); + l.add(new LookedUpTerm(parts[1], parts[2])); + map.put(parts[0], l); + } + } + } finally { + if (br != null) { + br.close(); + } + if (isr != null) { + isr.close(); + } + } + return map; + } + +} + +class LookedUpTerm { + + String baseform; + String postags; + + LookedUpTerm(final String baseform, final String postags) { + this.baseform = baseform; + this.postags = postags; + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/Tagger.java b/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/Tagger.java new file mode 100644 index 0000000..64a1dbd --- /dev/null 
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/Tagger.java
@@ -0,0 +1,57 @@
/* LanguageTool, a natural language style checker
 * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
 * USA
 */
package de.danielnaber.languagetool.tagging;

import java.io.IOException;
import java.util.List;

import de.danielnaber.languagetool.AnalyzedToken;
import de.danielnaber.languagetool.AnalyzedTokenReadings;

/**
 * The part-of-speech tagger interface, whose implementations are usually language-dependent.
 *
 * @author Daniel Naber
 */
public interface Tagger {

  /**
   * Returns a list of {@link AnalyzedTokenReadings} that assigns each term in the
   * sentence some kind of part-of-speech information (not necessarily just one tag).
   *
   * <p>Note that this method takes exactly one sentence. Its implementation
   * may implement special cases for the first word of a sentence, which is
   * usually written with an uppercase letter.
   *
   * @param sentenceTokens the text as returned by a WordTokenizer but without whitespace tokens.
   * @return the readings of the given tokens
   * @throws IOException if an implementation cannot read the data it needs for tagging
   */
  public List<AnalyzedTokenReadings> tag(List<String> sentenceTokens) throws IOException;

  /**
   * Create the {@link AnalyzedTokenReadings} used for whitespace and other non-words. Use <code>null</code>
   * as the POS tag for this token.
   *
   * @param token the text of the token
   * @param startPos the start position to store in the readings
   */
  public AnalyzedTokenReadings createNullToken(String token, int startPos);

  /**
   * Create a token specific to the language of the implementing class.
   *
   * @param token the text of the token
   * @param posTag the POS tag to assign to the token
   */
  public AnalyzedToken createToken(String token, String posTag);

}
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/be/BelarusianTagger.java b/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/be/BelarusianTagger.java
new file mode 100644
index 0000000..8e10360
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/be/BelarusianTagger.java
@@ -0,0 +1,58 @@
/* LanguageTool, a natural language style checker
 * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
 * USA
 */
package de.danielnaber.languagetool.tagging.be;

import java.util.ArrayList;
import java.util.List;

import de.danielnaber.languagetool.AnalyzedToken;
import de.danielnaber.languagetool.AnalyzedTokenReadings;
import de.danielnaber.languagetool.tagging.Tagger;

/**
 * Belarusian Tagger.
+ * + * Copyright (C) 2010 Alex Buloichik (alex73mail@gmail.com) + */ +public class BelarusianTagger implements Tagger { + + public List<AnalyzedTokenReadings> tag(List<String> sentenceTokens) { + final List<AnalyzedTokenReadings> tokenReadings = new ArrayList<AnalyzedTokenReadings>(); + int pos = 0; + for (String word : sentenceTokens) { + final List<AnalyzedToken> l = new ArrayList<AnalyzedToken>(); + // a real tagger would need to assign a POS tag + // in the next line instead of null: + l.add(new AnalyzedToken(word, null, null)); + pos += word.length(); + tokenReadings.add(new AnalyzedTokenReadings(l + .toArray(new AnalyzedToken[0]), 0)); + } + return tokenReadings; + } + + public AnalyzedTokenReadings createNullToken(String token, int startPos) { + return new AnalyzedTokenReadings(new AnalyzedToken(token, null, null), + startPos); + } + + public AnalyzedToken createToken(String token, String posTag) { + return new AnalyzedToken(token, posTag, null); + } +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/ca/CatalanTagger.java b/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/ca/CatalanTagger.java new file mode 100644 index 0000000..b0c266a --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/ca/CatalanTagger.java @@ -0,0 +1,42 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2006 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.tagging.ca; + +import java.util.Locale; + +import de.danielnaber.languagetool.JLanguageTool; +import de.danielnaber.languagetool.tagging.BaseTagger; + +/** Catalan Tagger + * + * Based on FreeLing tagger dictionary + * + * @author Marcin Milkowski + */ +public class CatalanTagger extends BaseTagger { + + public final String getFileName() { + return JLanguageTool.getDataBroker().getResourceDir() + "/ca/catalan.dict"; + } + + public CatalanTagger() { + super(); + setLocale(new Locale("ca")); + } +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/cs/CzechTagger.java b/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/cs/CzechTagger.java new file mode 100644 index 0000000..b33fd00 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/cs/CzechTagger.java @@ -0,0 +1,115 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2006 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.tagging.cs; + +import java.io.IOException; +import java.net.URL; +import java.util.ArrayList; +import java.util.List; +import java.util.Locale; + +import morfologik.stemming.Dictionary; +import morfologik.stemming.DictionaryLookup; +import morfologik.stemming.IStemmer; +import de.danielnaber.languagetool.AnalyzedToken; +import de.danielnaber.languagetool.AnalyzedTokenReadings; +import de.danielnaber.languagetool.JLanguageTool; +import de.danielnaber.languagetool.tagging.BaseTagger; +import de.danielnaber.languagetool.tools.StringTools; + +/** + * Czech POS tagger based on FSA morphological dictionaries. + * + * @author Jozef Licko + */ +public class CzechTagger extends BaseTagger { + + private static final String RESOURCE_FILENAME = "/cs/czech.dict"; + + private IStemmer morfologik; + private final Locale csLocale = new Locale("cs"); + + public final String getFileName() { + return JLanguageTool.getDataBroker().getResourceDir() + RESOURCE_FILENAME; + } + + @Override + public final List<AnalyzedTokenReadings> tag(final List<String> sentenceTokens) + throws IOException { + List<AnalyzedToken> taggerTokens; + List<AnalyzedToken> lowerTaggerTokens; + List<AnalyzedToken> upperTaggerTokens; + final List<AnalyzedTokenReadings> tokenReadings = new ArrayList<AnalyzedTokenReadings>(); + int pos = 0; + // caching Lametyzator instance - lazy init + if (morfologik == null) { + final URL url = JLanguageTool.getDataBroker().getFromResourceDirAsUrl(RESOURCE_FILENAME); + morfologik = new DictionaryLookup(Dictionary.read(url)); + } + + for (String word : sentenceTokens) { + final List<AnalyzedToken> l = new ArrayList<AnalyzedToken>(); + final String lowerWord = word.toLowerCase(csLocale); + taggerTokens = 
asAnalyzedTokenList(word, morfologik.lookup(word)); + lowerTaggerTokens = asAnalyzedTokenList(word, morfologik.lookup(lowerWord)); + final boolean isLowercase = word.equals(lowerWord); + + //normal case + addTokens(taggerTokens, l); + + if (!isLowercase) { + //lowercase + addTokens(lowerTaggerTokens, l); + } + + //uppercase + if (lowerTaggerTokens.isEmpty() && taggerTokens.isEmpty()) { + if (isLowercase) { + upperTaggerTokens = asAnalyzedTokenList(word, morfologik.lookup(StringTools + .uppercaseFirstChar(word))); + if (!upperTaggerTokens.isEmpty()) { + addTokens(upperTaggerTokens, l); + } else { + l.add(new AnalyzedToken(word, null, null)); + } + } else { + l.add(new AnalyzedToken(word, null, null)); + } + } + tokenReadings.add(new AnalyzedTokenReadings(l, pos)); + pos += word.length(); + } + + return tokenReadings; + } + + private void addTokens(final List<AnalyzedToken> taggedTokens, + final List<AnalyzedToken> l) { + if (taggedTokens != null) { + for (AnalyzedToken at : taggedTokens) { + final String[] tagsArr = StringTools.asString(at.getPOSTag()).split("\\+"); + for (final String currTag : tagsArr) { + l.add(new AnalyzedToken(at.getToken(), currTag, + at.getLemma())); + } + } + } + } + +}
\ No newline at end of file diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/da/DanishTagger.java b/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/da/DanishTagger.java new file mode 100644 index 0000000..e383272 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/da/DanishTagger.java @@ -0,0 +1,50 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2006 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.tagging.da; + +import java.util.Locale; + +import de.danielnaber.languagetool.JLanguageTool; +import de.danielnaber.languagetool.tagging.BaseTagger; + +/** + * Danish Part-of-speech tagger. + * + * Based on the spellcheck dictionary Stavekontrolden + * published by Foreningen for frit tilgængelige sprogværktøjer + * under the terms of the GNU LGPL version 2.1 and Mozilla MPL version 1.1. + * + * www.stavekontrolden.dk + * + * Stavekontrolden is based on data from Det Danske Sprog- og Litteraturselskab + * (The Danish Society for Language and Literature), http://www.dsl.dk. 
+ * + * @author Esben Aaberg + */ +public class DanishTagger extends BaseTagger { + + public final String getFileName() { + return JLanguageTool.getDataBroker().getResourceDir() + "/da/danish.dict"; + } + + public DanishTagger() { + super(); + setLocale(new Locale("da")); + } +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/de/AnalyzedGermanToken.java b/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/de/AnalyzedGermanToken.java new file mode 100644 index 0000000..dcd5bc8 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/de/AnalyzedGermanToken.java @@ -0,0 +1,136 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.tagging.de; + +import de.danielnaber.languagetool.AnalyzedToken; +import de.danielnaber.languagetool.tagging.de.GermanToken.Genus; +import de.danielnaber.languagetool.tagging.de.GermanToken.Kasus; +import de.danielnaber.languagetool.tagging.de.GermanToken.Numerus; +import de.danielnaber.languagetool.tagging.de.GermanToken.POSType; + +/** + * One reading of a German word. Many words can have more + * than one reading, e.g. 
"Tische" can be both Nominativ Plural + * and Genitiv Plural (among other readings). + * + * @author Daniel Naber + */ +public class AnalyzedGermanToken extends AnalyzedToken { + + private POSType type; + private Kasus casus; + private Numerus numerus; + private Genus genus; + + public AnalyzedGermanToken(String token, String posTag) { + super(token, posTag, null); + initFromPOSTagString(posTag); + } + + public AnalyzedGermanToken(String token, String posTag, String lemma) { + super(token, posTag, lemma); + initFromPOSTagString(posTag); + } + + private void initFromPOSTagString(String posTagString) { + if (posTagString == null) { + return; + } + final String[] parts = posTagString.split(":"); + if (parts.length < 3) { + //FIXME ?? + //System.err.println(posTagString); + return; + } + + //System.err.println(fullform + " " + posTagString); + for (String part : parts) { + if (part.equals("EIG")) + type = POSType.PROPER_NOUN; + else if (part.equals("SUB") && type == null) + type = POSType.NOMEN; + else if (part.equals("PA1") || part.equals("PA2")) + type = POSType.PARTIZIP; + else if (part.equals("VER") && type == null) + type = POSType.VERB; + else if (part.equals("ADJ") && type == null) + type = POSType.ADJEKTIV; + else if (part.equals("PRO") && type == null) + type = POSType.PRONOMEN; + else if (part.equals("ART") && type == null) + type = POSType.DETERMINER; + + else if (part.equals("AKK")) + casus = Kasus.AKKUSATIV; + else if (part.equals("GEN")) + casus = Kasus.GENITIV; + else if (part.equals("NOM")) + casus = Kasus.NOMINATIV; + else if (part.equals("DAT")) + casus = Kasus.DATIV; + + else if (part.equals("PLU")) + numerus = Numerus.PLURAL; + else if (part.equals("SIN")) + numerus = Numerus.SINGULAR; + + else if (part.equals("MAS")) + genus = Genus.MASKULINUM; + else if (part.equals("FEM")) + genus = Genus.FEMININUM; + else if (part.equals("NEU")) + genus = Genus.NEUTRUM; + else if (part.equals("NOG")) + genus = Genus.FEMININUM; // NOG = no genus because only used 
as plural + + else if (part.equals("DEF")) + ; // not yet used + else if (part.equals("DEM")) //??? + ; // not yet used + else if (part.equals("PER")) + ; // not yet used + + //else + //System.err.println("unknown: " + posTagString + " for fullform " + fullform); + // TODO: add else here that throws execption?! + } + + } + + public POSType getType() { + return type; + } + + public Kasus getCasus() { + return casus; + } + + public Numerus getNumerus() { + return numerus; + } + + public Genus getGenus() { + return genus; + } + + public String toString() { + return getPOSTag(); + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/de/AnalyzedGermanTokenReadings.java b/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/de/AnalyzedGermanTokenReadings.java new file mode 100644 index 0000000..81e5895 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/de/AnalyzedGermanTokenReadings.java @@ -0,0 +1,172 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.tagging.de; + +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Set; +import java.util.TreeSet; + +import de.danielnaber.languagetool.AnalyzedToken; +import de.danielnaber.languagetool.AnalyzedTokenReadings; +import de.danielnaber.languagetool.tagging.de.GermanToken.POSType; +import de.danielnaber.languagetool.tools.StringTools; +import de.danielnaber.languagetool.JLanguageTool; + +/** + * All possible readings of an analyzed German word. + * + * @author Daniel Naber + */ +public class AnalyzedGermanTokenReadings extends AnalyzedTokenReadings { + + public AnalyzedGermanTokenReadings(AnalyzedGermanToken[] aTokens, final int startPos) { + super(aTokens, startPos); + } + + public AnalyzedGermanTokenReadings(AnalyzedGermanToken aToken, final int startPos) { + super(aToken, startPos); + } + + /** + * @return a list of {@link AnalyzedGermanToken}s. 
+ */ + public List<AnalyzedGermanToken> getGermanReadings() { + final List<AnalyzedGermanToken> tokens = new ArrayList<AnalyzedGermanToken>(); + for (AnalyzedToken reading : anTokReadings) { + if (reading.getPOSTag() != null) { + if (!reading.getPOSTag().equals(JLanguageTool.SENTENCE_END_TAGNAME) && !reading.getPOSTag().equals(JLanguageTool.PARAGRAPH_END_TAGNAME)) { + tokens.add((AnalyzedGermanToken)reading); + } + } else { + tokens.add((AnalyzedGermanToken)reading); + } + + } + return tokens; + } + + public boolean hasReadingOfType(POSType type) { + if (anTokReadings == null) + return false; + for (AnalyzedToken reading : anTokReadings) { + if (reading.getPOSTag() != null) { + if (reading.getPOSTag().equals(JLanguageTool.SENTENCE_END_TAGNAME) || reading.getPOSTag().equals(JLanguageTool.PARAGRAPH_END_TAGNAME)) { + return false; + } + } + final AnalyzedGermanToken germanReading = (AnalyzedGermanToken) reading; + if (germanReading.getType() == type) + return true; + } + return false; + } + + /** + * Return true if the analyzed word is a sentence or paragraph end. 
+ */ + public boolean isSentenceEnd() { + if (anTokReadings == null) { + return false; + } + for (AnalyzedToken reading : anTokReadings) { + if (reading.getPOSTag() != null) { + if (reading.getPOSTag().equals(JLanguageTool.SENTENCE_END_TAGNAME) || reading.getPOSTag().equals(JLanguageTool.PARAGRAPH_END_TAGNAME)) { + return true; + } + } + } + return false; + } + + public boolean hasReading(GermanToken.Kasus kasus) { + if (anTokReadings == null) + return false; + for (AnalyzedToken reading : anTokReadings) { + final AnalyzedGermanToken germanReading = (AnalyzedGermanToken) reading; + if (germanReading.getCasus() == kasus) + return true; + } + return false; + } + + public boolean hasReading(GermanToken.Numerus numerus) { + if (anTokReadings == null) + return false; + for (AnalyzedToken reading : anTokReadings) { + final AnalyzedGermanToken germanReading = (AnalyzedGermanToken) reading; + if (germanReading.getNumerus() == numerus) + return true; + } + return false; + } + + public boolean hasReading(GermanToken.Genus genus) { + if (anTokReadings == null) + return false; + for (AnalyzedToken reading : anTokReadings) { + final AnalyzedGermanToken germanReading = (AnalyzedGermanToken) reading; + if (germanReading.getGenus() == genus) + return true; + } + return false; + } + + public String toString() { + if (anTokReadings == null) { + return super.getAnalyzedToken(0).getToken() + "[?]"; + } + final StringBuilder sb = new StringBuilder(super.getAnalyzedToken(0).getToken()); + final Set<String> printed = new HashSet<String>(); + sb.append('['); + for (AnalyzedToken reading : anTokReadings) { + if (!printed.contains(reading.toString())) { + if (printed.size() > 0) + sb.append(", "); + sb.append(reading.toString()); + } + printed.add(reading.toString()); + } + sb.append(']'); + return sb.toString(); + } + + /** + * Returns a string representation like {@code toString()}, but sorts + * the elements alphabetically. 
+ */ + public String toSortedString() { + if (anTokReadings == null) { + return super.getAnalyzedToken(0).getToken() + "[?]"; + } + final StringBuilder sb = new StringBuilder(super.getAnalyzedToken(0).getToken()); + final Set<String> elements = new TreeSet<String>(); + sb.append('['); + for (AnalyzedToken reading : anTokReadings) { + if (!elements.contains(reading.toString())) { + elements.add(reading.toString()); + } + } + sb.append(StringTools.listToString(elements, ", ")); + sb.append(']'); + return sb.toString(); + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/de/GermanTagger.java b/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/de/GermanTagger.java new file mode 100644 index 0000000..b8adf5b --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/de/GermanTagger.java @@ -0,0 +1,201 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2006 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.tagging.de; + +import java.io.IOException; +import java.net.URL; +import java.util.ArrayList; +import java.util.List; + +import morfologik.stemming.Dictionary; +import morfologik.stemming.DictionaryLookup; +import morfologik.stemming.IStemmer; +import morfologik.stemming.WordData; +import de.danielnaber.languagetool.AnalyzedToken; +import de.danielnaber.languagetool.AnalyzedTokenReadings; +import de.danielnaber.languagetool.JLanguageTool; +import de.danielnaber.languagetool.tagging.ManualTagger; +import de.danielnaber.languagetool.tagging.Tagger; +import de.danielnaber.languagetool.tokenizers.de.GermanCompoundTokenizer; +import de.danielnaber.languagetool.tools.StringTools; + +/** + * German tagger, requires data file in <code>resource/de/german.dict</code>. 
+ * + * @author Marcin Milkowski, Daniel Naber + */ +public class GermanTagger implements Tagger { + + private static final String DICT_FILENAME = "/de/german.dict"; + private static final String USER_DICT_FILENAME = "/de/added.txt"; + + private static IStemmer morfologik; + private static ManualTagger manualTagger; + private static GermanCompoundTokenizer compoundTokenizer; + + public GermanTagger() { + } + + public AnalyzedGermanTokenReadings lookup(final String word) throws IOException { + final List<String> words = new ArrayList<String>(); + words.add(word); + final List<AnalyzedTokenReadings> result = tag(words, false); + final AnalyzedGermanTokenReadings atr = (AnalyzedGermanTokenReadings) result.get(0); + if (atr.getAnalyzedToken(0).getPOSTag() == null) + return null; + return atr; + } + + public List<AnalyzedTokenReadings> tag(final List<String> sentenceTokens) throws IOException { + return tag(sentenceTokens, true); + } + + public List<AnalyzedTokenReadings> tag(final List<String> sentenceTokens, final boolean ignoreCase) throws IOException { + String[] taggerTokens; + boolean firstWord = true; + final List<AnalyzedTokenReadings> tokenReadings = new ArrayList<AnalyzedTokenReadings>(); + int pos = 0; + // caching Lametyzator instance - lazy init + if (morfologik == null) { + final URL url = JLanguageTool.getDataBroker().getFromResourceDirAsUrl(DICT_FILENAME); + morfologik = new DictionaryLookup(Dictionary.read(url)); + } + if (manualTagger == null) { + manualTagger = new ManualTagger(JLanguageTool.getDataBroker().getFromResourceDirAsStream(USER_DICT_FILENAME)); + } + if (compoundTokenizer == null) { + compoundTokenizer = new GermanCompoundTokenizer(); + } + + for (String word: sentenceTokens) { + final List<AnalyzedGermanToken> l = new ArrayList<AnalyzedGermanToken>(); + taggerTokens = lexiconLookup(word); + if (firstWord && taggerTokens == null && ignoreCase) { // e.g. 
"Das" -> "das" at start of sentence + taggerTokens = lexiconLookup(word.toLowerCase()); + firstWord = false; + } + if (taggerTokens != null) { + tagWord(taggerTokens, word, l); + } else { + // word not known, try to decompose it and use the last part for POS tagging: + if (!StringTools.isEmpty(word.trim())) { + final List<String> compoundParts = compoundTokenizer.tokenize(word); + if (compoundParts.size() <= 1) { + l.add(new AnalyzedGermanToken(word, null, null)); + } else { + // last part governs a word's POS: + String lastPart = compoundParts.get(compoundParts.size()-1); + if (StringTools.startsWithUppercase(word)) { + lastPart = StringTools.uppercaseFirstChar(lastPart); + } + taggerTokens = lexiconLookup(lastPart); + if (taggerTokens != null) { + tagWord(taggerTokens, word, l, compoundParts); + } else { + l.add(new AnalyzedGermanToken(word, null, null)); + } + } + } else { + l.add(new AnalyzedGermanToken(word, null, null)); + } + } + + //tokenReadings.add(new AnalyzedGermanToken(new AnalyzedTokenReadings((AnalyzedToken[]) l.toArray(new AnalyzedToken[0])))); + tokenReadings.add(new AnalyzedGermanTokenReadings(l.toArray(new AnalyzedGermanToken[l.size()]), pos)); + pos += word.length(); + } + return tokenReadings; + } + + private void tagWord(String[] taggerTokens, String word, List<AnalyzedGermanToken> l) { + tagWord(taggerTokens, word, l, null); + } + + /** + * @param compoundParts all compound parts of the complete word or <code>null</code>, + * if the original input is not a compound + */ + private void tagWord(String[] taggerTokens, String word, List<AnalyzedGermanToken> l, + List<String> compoundParts) { + int i = 0; + while (i < taggerTokens.length) { + // Lametyzator returns data as String[] + // first lemma, then annotations + if (compoundParts != null) { + // was originally a compound word + final List<String> allButLastPart = compoundParts.subList(0, compoundParts.size() - 1); + final String lemma = StringTools.listToString(allButLastPart, "") + + 
StringTools.lowercaseFirstChar(taggerTokens[i]); + l.add(new AnalyzedGermanToken(word, taggerTokens[i + 1], lemma)); + } else { + l.add(new AnalyzedGermanToken(word, taggerTokens[i + 1], taggerTokens[i])); + } + i = i + 2; + } + } + + private String[] lexiconLookup(final String word) { + final String[] posTagsFromUserDict = manualTagger.lookup(word); + final List<WordData> posTagsFromDict = morfologik.lookup(word); + if (posTagsFromUserDict != null && !posTagsFromDict.isEmpty()) { + final String[] allPosTags = new String[posTagsFromUserDict.length + posTagsFromDict.size() * 2]; + //System.arraycopy(posTagsFromDict, 0, allPosTags, 0, posTagsFromDict.size()); + int i = 0; + for (WordData wd : posTagsFromDict) { + allPosTags[i] = wd.getStem().toString(); + allPosTags[i + 1] = wd.getTag().toString(); + i = i + 2; + } + System.arraycopy(posTagsFromUserDict, 0, allPosTags, posTagsFromDict.size() * 2, posTagsFromUserDict.length); + return allPosTags; + } else if (posTagsFromUserDict == null && !posTagsFromDict.isEmpty()) { + final String[] allPosTags = new String[posTagsFromDict.size() * 2]; + int i = 0; + for (WordData wd : posTagsFromDict) { + allPosTags[i] = wd.getStem().toString(); + allPosTags[i + 1] = wd.getTag().toString(); + i = i + 2; + } + return allPosTags; + } else { + return posTagsFromUserDict; + } + } + + public final AnalyzedGermanTokenReadings createNullToken(final String token, final int startPos) { + return new AnalyzedGermanTokenReadings(new AnalyzedGermanToken(token, null, null), startPos); + } + + public AnalyzedToken createToken(String token, String posTag) { + return new AnalyzedGermanToken(token, posTag); + } + + /** + * Test only + */ + public static void main(final String[] args) throws IOException { + final GermanTagger gt = new GermanTagger(); + final List<String> l = new ArrayList<String>(); + l.add("Einfacher"); + //System.err.println(gt.lookup("Treffen", 0)); + final List<AnalyzedTokenReadings> res = gt.tag(l); + System.err.println(res); + 
} + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/de/GermanToken.java b/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/de/GermanToken.java new file mode 100644 index 0000000..d6011a0 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/de/GermanToken.java @@ -0,0 +1,104 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.tagging.de; + +/** + * Constants used to describe the properties of German tokens. 
+ * + * @author Daniel Naber + */ +public final class GermanToken { + + private GermanToken() { + // only static stuff + } + + public static final class POSType { + public static final POSType NOMEN = new POSType("Nomen"); + public static final POSType VERB = new POSType("Verb"); + public static final POSType ADJEKTIV = new POSType("Adjektiv"); + public static final POSType DETERMINER = new POSType("Determiner"); + public static final POSType PRONOMEN = new POSType("Pronomen"); + public static final POSType PARTIZIP = new POSType("Partizip"); + public static final POSType PROPER_NOUN = new POSType("Eigenname"); + public static final POSType OTHER = new POSType("Other"); // e.g. sentence start + + private final String name; + + private POSType(final String name) { + this.name = name; + } + + public String toString() { + return name; + } + } + + public static final class Kasus { + public static final Kasus NOMINATIV = new Kasus("Nominativ"); + public static final Kasus AKKUSATIV = new Kasus("Akkusativ"); + public static final Kasus DATIV = new Kasus("Dativ"); + public static final Kasus GENITIV = new Kasus("Genitiv"); + public static final Kasus OTHER = new Kasus("Other"); + + private final String name; + + private Kasus(final String name) { + this.name = name; + } + + public String toString() { + return name; + } + } + + public static final class Numerus { + public static final Numerus SINGULAR = new Numerus("Singular"); + public static final Numerus PLURAL = new Numerus("Plural"); + public static final Numerus OTHER = new Numerus("Other"); + + private final String name; + + private Numerus(final String name) { + this.name = name; + } + + public String toString() { + return name; + } + } + + public static final class Genus { + public static final Genus NEUTRUM = new Genus("Neutrum"); + public static final Genus MASKULINUM = new Genus("Maskulinum"); + public static final Genus FEMININUM = new Genus("Femininum"); + public static final Genus OTHER = new Genus("Other"); 
+ + private final String name; + + private Genus(final String name) { + this.name = name; + } + + public String toString() { + return name; + } + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/disambiguation/Disambiguator.java b/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/disambiguation/Disambiguator.java new file mode 100644 index 0000000..88c5455 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/disambiguation/Disambiguator.java @@ -0,0 +1,50 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2007 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.tagging.disambiguation; + +import java.io.IOException; + +import de.danielnaber.languagetool.AnalyzedSentence; + +/** + * Disambiguator interface. Particular implementations are language-dependent. + * + * <p> + * The POS tagger might assign multiple tags to the token. + * + * The goal is to filter out the incorrect tags and leave ideally only one per + * token. + * + * @author Jozef Licko + */ +public interface Disambiguator { + + /** + * If possible, filters out the wrong POS tags. + * + * @param input + * The sentence with already tagged words. 
The words are expected to + * have multiple tags. + * @return Analyzed sentence, where each word has only one (possibly the most + * correct) tag. + * @throws IOException + */ + AnalyzedSentence disambiguate(AnalyzedSentence input) throws IOException; + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/disambiguation/pl/PolishChunker.java b/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/disambiguation/pl/PolishChunker.java new file mode 100644 index 0000000..a190cfb --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/disambiguation/pl/PolishChunker.java @@ -0,0 +1,199 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2007 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ + +package de.danielnaber.languagetool.tagging.disambiguation.pl; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import de.danielnaber.languagetool.AnalyzedSentence; +import de.danielnaber.languagetool.AnalyzedToken; +import de.danielnaber.languagetool.AnalyzedTokenReadings; +import de.danielnaber.languagetool.JLanguageTool; +import de.danielnaber.languagetool.tagging.disambiguation.Disambiguator; + +/** + * Multiword tagger-chunker for Polish. + * + * @author Marcin Miłkowski + */ +public class PolishChunker implements Disambiguator { + + private Map<String, String> mStartSpace; + private Map<String, String> mStartNoSpace; + private Map<String, String> mFull; + + private static final String FILENAME = "/pl/multiwords.txt"; + + /* + * Lazy init, thanks to Artur Trzewik + */ + private void lazyInit() throws IOException { + + if (mStartSpace != null) + return; + + mStartSpace = new HashMap<String, String>(); + mStartNoSpace = new HashMap<String, String>(); + mFull = new HashMap<String, String>(); + + final List<String> posTokens = loadWords(JLanguageTool.getDataBroker().getFromResourceDirAsStream(FILENAME)); + for (String posToken : posTokens) { + final String[] tokenAndTag = posToken.split("\t"); + final boolean containsSpace = tokenAndTag[0].indexOf(' ') > 0; + String firstToken = ""; + final String[] firstTokens; + if (!containsSpace) { + firstTokens = new String[tokenAndTag[0].length()]; + firstToken = tokenAndTag[0].substring(0, 1); + for (int i = 1; i < tokenAndTag[0].length(); i++) { + firstTokens[i] = tokenAndTag[0].substring(0 + (i - 1), i); + } + 
if (mStartNoSpace.containsKey(firstToken)) { + if (Integer.parseInt(mStartNoSpace.get(firstToken)) < firstTokens.length) { + mStartNoSpace.put(firstToken, Integer.toString(firstTokens.length)); + } + } else { + mStartNoSpace.put(firstToken, Integer.toString(firstTokens.length)); + } + } else { + firstTokens = tokenAndTag[0].split(" "); + firstToken = firstTokens[0]; + + if (mStartSpace.containsKey(firstToken)) { + if (Integer.parseInt(mStartSpace.get(firstToken)) < firstTokens.length) { + mStartSpace.put(firstToken, Integer.toString(firstTokens.length)); + } + } else { + mStartSpace.put(firstToken, Integer.toString(firstTokens.length)); + } + } + mFull.put(tokenAndTag[0], tokenAndTag[1]); + } + } + + /** + * Implements multiword POS tags, e.g., <ELLIPSIS> for ellipsis (...) + * start, and </ELLIPSIS> for ellipsis end. + * + * @param input + * The tokens to be chunked. + * @return AnalyzedSentence with additional markers. + * @throws IOException + */ + public final AnalyzedSentence disambiguate(final AnalyzedSentence input) throws IOException { + + lazyInit(); + + final AnalyzedTokenReadings[] anTokens = input.getTokens(); + final AnalyzedTokenReadings[] output = anTokens; + + for (int i = 0; i < anTokens.length; i++) { + final String tok = output[i].getToken(); + final StringBuilder tokens = new StringBuilder(); + + int finalLen = 0; + if (mStartSpace.containsKey(tok)) { + final int len = Integer.parseInt(mStartSpace.get(tok)); + int j = i; + int lenCounter = 0; + while (j < anTokens.length) { + if (!anTokens[j].isWhitespace()) { + tokens.append(anTokens[j].getToken()); + if (mFull.containsKey(tokens.toString())) { + final AnalyzedToken tokenStart = new AnalyzedToken(tok, "<" + + mFull.get(tokens.toString()) + ">", tokens.toString()); + output[i].addReading(tokenStart); + final AnalyzedToken tokenEnd = new AnalyzedToken( + anTokens[finalLen].getToken(), "</" + + mFull.get(tokens.toString()) + ">", tokens.toString()); + output[finalLen].addReading(tokenEnd); + } + 
lenCounter++; + if (lenCounter == len) { + break; + } + tokens.append(' '); + } + j++; + finalLen = j; + } + } + + if (mStartNoSpace.containsKey(tok)) { + final int len = Integer.parseInt(mStartNoSpace.get(tok)); + if (i + len <= anTokens.length) { + for (int j = i; j < i + len; j++) { + tokens.append(anTokens[j].getToken()); + if (mFull.containsKey(tokens.toString())) { + final AnalyzedToken tokenStart = new AnalyzedToken(tok, "<" + + mFull.get(tokens.toString()) + ">", tokens.toString()); + output[i].addReading(tokenStart); + final AnalyzedToken tokenEnd = new AnalyzedToken(anTokens + [i + len - 1].getToken(), + "</" + mFull.get(tokens.toString()) + ">", + tokens.toString()); + output[i + len - 1].addReading(tokenEnd); + } + } + } + } + } + + return new AnalyzedSentence(output); + } + + private List<String> loadWords(final InputStream file) throws IOException { + InputStreamReader isr = null; + BufferedReader br = null; + final List<String> lines = new ArrayList<String>(); + try { + isr = new InputStreamReader(file, "UTF-8"); + br = new BufferedReader(isr); + String line; + + while ((line = br.readLine()) != null) { + line = line.trim(); + if (line.length() < 1) { + continue; + } + if (line.charAt(0) == '#') { // ignore comments + continue; + } + lines.add(line); + } + + } finally { + if (br != null) { + br.close(); + } + if (isr != null) { + isr.close(); + } + } + return lines; + } + +}
\ No newline at end of file
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/disambiguation/pl/PolishHybridDisambiguator.java b/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/disambiguation/pl/PolishHybridDisambiguator.java new file mode 100644 index 0000000..b06cbdb --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/disambiguation/pl/PolishHybridDisambiguator.java @@ -0,0 +1,48 @@
/* LanguageTool, a natural language style checker
 * Copyright (C) 2007 Daniel Naber (http://www.danielnaber.de)
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
 * USA
 */

package de.danielnaber.languagetool.tagging.disambiguation.pl;

import java.io.IOException;

import de.danielnaber.languagetool.AnalyzedSentence;
import de.danielnaber.languagetool.tagging.disambiguation.Disambiguator;
import de.danielnaber.languagetool.tagging.disambiguation.rules.pl.PolishRuleDisambiguator;

/**
 * Hybrid chunker-disambiguator for Polish.
 *
 * @author Marcin Miłkowski
 */

public class PolishHybridDisambiguator implements Disambiguator {

  private final Disambiguator chunker = new PolishChunker();
  private final Disambiguator disambiguator = new PolishRuleDisambiguator();

  /**
   * Runs the sentence through the chunker first and hands the chunker's
   * result to the rule-based disambiguator.
   *
   * @param input the sentence to process
   * @return the output of the rule-based disambiguator
   * @throws IOException if one of the two stages throws it
   */
  public final AnalyzedSentence disambiguate(AnalyzedSentence input)
      throws IOException {
    final AnalyzedSentence chunked = chunker.disambiguate(input);
    return disambiguator.disambiguate(chunked);
  }

}
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/disambiguation/rules/AbstractRuleDisambiguator.java b/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/disambiguation/rules/AbstractRuleDisambiguator.java new file mode 100644 index 0000000..4605fb7 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/disambiguation/rules/AbstractRuleDisambiguator.java @@ -0,0 +1,83 @@
/* LanguageTool, a natural language style checker
 * Copyright (C) 2007 Daniel Naber (http://www.danielnaber.de)
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ + +package de.danielnaber.languagetool.tagging.disambiguation.rules; + +import java.io.IOException; +import java.util.List; + +import javax.xml.parsers.ParserConfigurationException; + +import de.danielnaber.languagetool.JLanguageTool; +import de.danielnaber.languagetool.Language; +import org.xml.sax.SAXException; + +import de.danielnaber.languagetool.AnalyzedSentence; +import de.danielnaber.languagetool.tagging.disambiguation.Disambiguator; +import de.danielnaber.languagetool.tools.Tools; + +/** + * Rule-based disambiguator. + * Implements an idea by Agnes Souque. + * + * @author Marcin Miłkowski + * + */ +public abstract class AbstractRuleDisambiguator implements Disambiguator { + + protected static final String DISAMB_FILE = "disambiguation.xml"; + protected List<DisambiguationPatternRule> disambiguationRules; + + protected abstract Language getLanguage(); + + @Override + public AnalyzedSentence disambiguate(final AnalyzedSentence input) throws IOException { + AnalyzedSentence sentence = input; + if (disambiguationRules == null) { + final String defaultPatternFilename = + JLanguageTool.getDataBroker().getResourceDir() + "/" + getLanguage().getShortName() + "/" + DISAMB_FILE; + try { + disambiguationRules = loadPatternRules(defaultPatternFilename); + } catch (final Exception e) { + throw new RuntimeException("Problems with parsing disambiguation file: " + + defaultPatternFilename, e); + } + } + for (final DisambiguationPatternRule patternRule : disambiguationRules) { + sentence = patternRule.replace(sentence); + } + return sentence; + } + + /** + * Load disambiguation rules from an XML file. Use {@link de.danielnaber.languagetool.JLanguageTool#addRule} to add + * these rules to the checking process. 
+ * + * @throws ParserConfigurationException + * @throws SAXException + * @throws IOException + * @return a List of {@link DisambiguationPatternRule} objects + */ + protected List<DisambiguationPatternRule> loadPatternRules(final String filename) throws ParserConfigurationException, SAXException, IOException { + final DisambiguationRuleLoader ruleLoader = new DisambiguationRuleLoader(); + return ruleLoader.getRules(Tools.getStream(filename)); + } + + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/disambiguation/rules/DisambXMLRuleHandler.java b/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/disambiguation/rules/DisambXMLRuleHandler.java new file mode 100644 index 0000000..5154009 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/disambiguation/rules/DisambXMLRuleHandler.java @@ -0,0 +1,52 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2006 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.tagging.disambiguation.rules; + +import java.util.ArrayList; +import java.util.List; + +import org.xml.sax.SAXException; +import org.xml.sax.SAXParseException; +import de.danielnaber.languagetool.rules.patterns.XMLRuleHandler; + +/** + * XML rule handler that loads disambiguation rules from XML and throws + * exceptions on errors and warnings. + * + * @author Daniel Naber + */ +class DisambXMLRuleHandler extends XMLRuleHandler { + + final List<DisambiguationPatternRule> rules = new ArrayList<DisambiguationPatternRule>(); + + boolean inDisambiguation; + + List<DisambiguationPatternRule> getDisambRules() { + return rules; + } + + public void warning (final SAXParseException e) throws SAXException { + throw e; + } + + public void error (final SAXParseException e) throws SAXException { + throw e; + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/disambiguation/rules/DisambiguatedExample.java b/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/disambiguation/rules/DisambiguatedExample.java new file mode 100644 index 0000000..127e0a4 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/disambiguation/rules/DisambiguatedExample.java @@ -0,0 +1,77 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2007 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ + +package de.danielnaber.languagetool.tagging.disambiguation.rules; + +/** + * Disambiguated example. Used for testing + * disambiguator rules. + * @author Marcin Milkowski + * @since 0.9.8 + */ +public class DisambiguatedExample { + + private String example; + private String inputForms; + private String outputForms; + + + public DisambiguatedExample(final String example) { + this.example = example; + } + + /** + * @param example + * Example sentence + * @param input + * Ambiguous forms of a token + * (specify in 'word[lemma/POS]' format) + * @param output + * Disambiguated forms of a token + * (specify in 'word[lemma/POS]' format) + */ + public DisambiguatedExample(final String example, final String input, final String output) { + this(example); + inputForms = input; + outputForms = output; + } + + /** + * Return the example that contains the error. + */ + public String getExample() { + return example; + } + + /** + * Return the possible corrections. May be null. 
+ */ + public String getDisambiguated() { + return outputForms; + } + + public String getAmbiguous() { + return inputForms; + } + + public String toString() { + return example + ": " + inputForms + " -> " + outputForms; + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/disambiguation/rules/DisambiguationPatternRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/disambiguation/rules/DisambiguationPatternRule.java new file mode 100644 index 0000000..6b5d3a8 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/disambiguation/rules/DisambiguationPatternRule.java @@ -0,0 +1,357 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.tagging.disambiguation.rules; + +import java.io.IOException; +import java.util.List; + +import de.danielnaber.languagetool.AnalyzedSentence; +import de.danielnaber.languagetool.AnalyzedToken; +import de.danielnaber.languagetool.AnalyzedTokenReadings; +import de.danielnaber.languagetool.Language; +import de.danielnaber.languagetool.rules.patterns.AbstractPatternRule; +import de.danielnaber.languagetool.rules.patterns.Element; +import de.danielnaber.languagetool.rules.patterns.Match; +import de.danielnaber.languagetool.tools.StringTools; + +/** + * A Rule that describes a pattern of words or part-of-speech tags used for + * disambiguation. + * + * @author Marcin Miłkowski + */ +public class DisambiguationPatternRule extends AbstractPatternRule { + + /** Possible disambiguator actions. **/ + public enum DisambiguatorAction { + ADD, FILTER, REMOVE, REPLACE, UNIFY; + + /** + * Converts string to the constant enum. + * + * @param str + * String value to be converted. + * @return DisambiguatorAction enum. 
*/
    public static DisambiguatorAction toAction(final String str) {
      try {
        return valueOf(str);
      } catch (final Exception ex) {
        // valueOf() rejects unknown names (and null); every such input is
        // mapped to the REPLACE action instead of propagating the failure.
        return REPLACE;
      }
    }
  }

  /** POS tag handed to the constructor as <code>disamb</code>; may be null for some actions. */
  private final String disambiguatedPOS;

  /** Optional match element handed to the constructor as <code>posSelect</code>. */
  private final Match matchElement;

  /** Action that {@link #executeAction} performs on a matched token sequence. */
  private final DisambiguatorAction disAction;

  /** Readings used by the ADD and REMOVE actions; set via {@link #setNewInterpretations}. */
  private AnalyzedToken[] newTokenReadings;

  /** Examples attached to this rule; only stored and returned by this class. */
  private List<DisambiguatedExample> examples;

  /** Examples of sentences the rule should leave alone; only stored and returned here. */
  private List<String> untouchedExamples;

  /**
   * Creates a disambiguation rule.
   * 
   * Note that the superclass constructor runs before any of the null checks
   * below, so it already receives the unchecked arguments.
   * 
   * @param id
   *          Id of the Rule
   * @param description
   *          Description to be shown (name)
   * @param language
   *          Language of the Rule
   * @param elements
   *          Element (token) list
   * @param disamb
   *          the disambiguated POS tag; may only be null if
   *          <code>posSelect</code> is given or the action is unify, add or
   *          remove
   * @param posSelect
   *          match element used instead of a plain POS tag; may be null
   * @param disambAction
   *          - the action to be executed on found token(s), one of the
   *          following: add, filter, remove, replace, unify.
   * @throws NullPointerException
   *           if id, language, elements or description is null, or if the
   *           disambiguated POS is missing for an action that needs it
   */

  DisambiguationPatternRule(final String id, final String description,
      final Language language, final List<Element> elements,
      final String disamb, final Match posSelect,
      final DisambiguatorAction disambAction) {
    super(id, description, language, elements, true);
    if (id == null) {
      throw new NullPointerException("id cannot be null");
    }
    if (language == null) {
      throw new NullPointerException("language cannot be null");
    }
    if (elements == null) {
      throw new NullPointerException("elements cannot be null");
    }
    if (description == null) {
      throw new NullPointerException("description cannot be null");
    }
    // Only UNIFY, ADD and REMOVE can work without a POS tag or a match element.
    if (disamb == null && posSelect == null
        && disambAction != DisambiguatorAction.UNIFY
        && disambAction != DisambiguatorAction.ADD
        && disambAction != DisambiguatorAction.REMOVE) {
      throw new NullPointerException("disambiguated POS cannot be null");
    }
    this.disambiguatedPOS = disamb;
    this.matchElement = posSelect;
    this.disAction = disambAction;
    this.unifier = language.getDisambiguationUnifier();
  }

  /**
   * Used to add new interpretations. The array is copied, so later changes to
   * the caller's array do not affect the rule.
   * 
   * @param newReadings
   *          An array of AnalyzedTokens. The length of the array should be the
   *          same as the number of the tokens matched and selected by
   *          mark/mark_from & mark_to attributes (>1).
   */
  public final void setNewInterpretations(final AnalyzedToken[] newReadings) {
    newTokenReadings = newReadings.clone();
  }

  /**
   * Performs disambiguation on the source sentence.
   * 
   * The pattern is tried at successive start positions of the whitespace-free
   * token array; for every complete match the rule's action is applied to the
   * full token array (the one that still contains whitespace tokens).
   * 
   * @param text
   *          {@link AnalyzedSentence} Sentence to be disambiguated.
   * @return {@link AnalyzedSentence} Disambiguated sentence (might be
   *         unchanged).
   * @throws IOException
   */
  public final AnalyzedSentence replace(final AnalyzedSentence text)
      throws IOException {
    final AnalyzedTokenReadings[] tokens = text.getTokensWithoutWhitespace();
    AnalyzedTokenReadings[] whTokens = text.getTokens();
    // For each matched pattern element: how far the match advanced (skipped tokens + 1).
    final int[] tokenPositions = new int[tokens.length + 1];
    final int patternSize = patternElements.size();
    // Number of start positions at which the whole pattern can still fit.
    final int limit = Math.max(0, tokens.length - patternSize + 1);
    Element elem = null;
    boolean changed = false;
    // sentStart is declared outside this excerpt; when it is set, only start
    // position 0 is tried.
    for (int i = 0; i < limit && !(sentStart && i > 0); i++) {
      boolean allElementsMatch = false;
      unifiedTokens = null;
      int matchingTokens = 0;
      int skipShiftTotal = 0;
      int firstMatchToken = -1;
      int prevSkipNext = 0;
      if (testUnification) {
        unifier.reset();
      }
      for (int k = 0; k < patternSize; k++) {
        // elem is not reset per start position, so for k == 0 prevElement is
        // the element left over from the previous iteration (or null).
        final Element prevElement = elem;
        elem = patternElements.get(k);
        setupRef(firstMatchToken, elem, tokens);
        final int nextPos = i + k + skipShiftTotal;
        prevMatched = false;
        // Clamp the skip window to the end of the sentence; a negative skip
        // value is handled the same way, i.e. the window reaches the last token.
        if (prevSkipNext + nextPos >= tokens.length || prevSkipNext < 0) { // SENT_END?
          prevSkipNext = tokens.length - (nextPos + 1);
        }
        // Leave enough room for the pattern elements that still have to match.
        final int maxTok = Math.min(nextPos + prevSkipNext, tokens.length - (patternSize - k));
        for (int m = nextPos; m <= maxTok; m++) {
          allElementsMatch = testAllReadings(tokens, elem, prevElement, m,
              firstMatchToken, prevSkipNext);
          if (allElementsMatch) {
            final int skipShift = m - nextPos;
            tokenPositions[matchingTokens] = skipShift + 1;
            prevSkipNext = elem.getSkipNext();
            matchingTokens++;
            skipShiftTotal += skipShift;
            if (firstMatchToken == -1) {
              firstMatchToken = m;
            }
            break;
          }
        }
        if (!allElementsMatch) {
          break;
        }
      }
      if (allElementsMatch && matchingTokens == patternSize) {
        // Each match works on the result of the previous one.
        whTokens = executeAction(text, whTokens, unifiedTokens,
            firstMatchToken, matchingTokens, tokenPositions);
        changed = true;
      }
    }
    if (changed) {
      return new AnalyzedSentence(whTokens, text.getWhPositions());
    }
    return text;
  }

  /**
   * Applies this rule's action to one match and returns the resulting token
   * array.
   * 
   * The array itself is cloned, but the clone is shallow: the ADD and REMOVE
   * actions call addReading()/removeReading() on token objects that are shared
   * with the input array.
   * 
   * @param text
   *          the sentence being disambiguated; used to map positions in the
   *          whitespace-free token array to positions in the full array
   * @param whiteTokens
   *          all tokens of the sentence, including whitespace
   * @param unifiedTokens
   *          tokens used by the UNIFY action; may be null
   * @param firstMatchToken
   *          index (in the whitespace-free array) of the first matched token
   * @param matchingTokens
   *          number of pattern elements that matched
   * @param tokenPositions
   *          per matched element, the number of tokens the match advanced
   * @return a copy of <code>whiteTokens</code> with the action applied
   */
  private AnalyzedTokenReadings[] executeAction(final AnalyzedSentence text,
      final AnalyzedTokenReadings[] whiteTokens,
      final AnalyzedTokenReadings[] unifiedTokens, final int firstMatchToken,
      final int matchingTokens, final int[] tokenPositions) {
    final AnalyzedTokenReadings[] whTokens = whiteTokens.clone();
    // Offset from the first matched token to the first selected token,
    // taking tokens skipped between pattern elements into account.
    int correctedStPos = 0;
    if (startPositionCorrection > 0) {
      for (int l = 0; l <= startPositionCorrection; l++) {
        correctedStPos += tokenPositions[l];
      }
      correctedStPos--;
    }
    // NOTE(review): correctedEndPos is computed here but not read anywhere
    // else in this method.
    int correctedEndPos = 0;
    if (endPositionCorrection < 0) {
      int l = 0;
      while (l > endPositionCorrection) {
        correctedEndPos -= tokenPositions[matchingTokens + l - 1];
        l--;
      }
    }
    final int fromPos = text.getOriginalPosition(firstMatchToken
        + correctedStPos);
    final int numRead = whTokens[fromPos].getReadingsLength();
    final boolean spaceBefore = whTokens[fromPos].isWhitespaceBefore();
    boolean filtered = false;
    switch (disAction) {
      case UNIFY:
        // Replace the selected tokens by the unified ones, but only if their
        // number equals the number of selected tokens.
        if (unifiedTokens != null) {
          if (unifiedTokens.length == matchingTokens - startPositionCorrection
              + endPositionCorrection) {
            // Carry the sentence-end flag over to the last replacement token.
            if (whTokens[text.getOriginalPosition(firstMatchToken + correctedStPos
                + unifiedTokens.length - 1)].isSentEnd()) {
              unifiedTokens[unifiedTokens.length - 1].setSentEnd();
            }
            for (int i = 0; i < unifiedTokens.length; i++) {
              unifiedTokens[i].setStartPos(whTokens[text.getOriginalPosition(firstMatchToken + correctedStPos
                  + i)].getStartPos());
              whTokens[text.getOriginalPosition(firstMatchToken + correctedStPos
                  + i)] = unifiedTokens[i];
            }
          }
        }
        break;
      case REMOVE:
        // Remove one configured reading from each selected token.
        if (newTokenReadings != null) {
          if (newTokenReadings.length == matchingTokens - startPositionCorrection
              + endPositionCorrection) {
            for (int i = 0; i < newTokenReadings.length; i++) {
              whTokens[text.getOriginalPosition(firstMatchToken + correctedStPos
                  + i)].removeReading(newTokenReadings[i]);
            }
          }
        }
        break;
      case ADD:
        // Add one configured reading to each selected token.
        if (newTokenReadings != null) {
          if (newTokenReadings.length == matchingTokens - startPositionCorrection
              + endPositionCorrection) {
            String lemma = "";
            String token = "";
            for (int i = 0; i < newTokenReadings.length; i++) {
              // An empty token text means: use the text of the sentence token.
              if ("".equals(newTokenReadings[i].getToken())) { //empty token
                token = whTokens[text.getOriginalPosition(firstMatchToken + correctedStPos
                    + i)].getToken();
              } else {
                token = newTokenReadings[i].getToken();
              }
              // A missing lemma defaults to the token text.
              if (newTokenReadings[i].getLemma() == null) { //empty lemma
                lemma = token;
              } else {
                lemma = newTokenReadings[i].getLemma();
              }
              final AnalyzedToken newTok = new AnalyzedToken(token, newTokenReadings[i].getPOSTag(), lemma);
              whTokens[text.getOriginalPosition(firstMatchToken + correctedStPos
                  + i)].addReading(newTok);
            }
          }
        }
        break;
      case FILTER:
        if (matchElement == null) { // same as REPLACE if using <match>
          // Build a temporary Match from the POS tag (third argument true:
          // the tag is passed as a regular-expression POS) and let it filter
          // the readings of the selected token.
          final Match tmpMatchToken = new Match(disambiguatedPOS, null, true,
              disambiguatedPOS, null, Match.CaseConversion.NONE,
              false, Match.IncludeRange.NONE);
          tmpMatchToken.setToken(whTokens[fromPos]);
          whTokens[fromPos] = tmpMatchToken.filterReadings();
          filtered = true;
        }
        // No break here: control falls through to REPLACE. The 'filtered'
        // flag makes that branch a no-op when filtering was done above.
      case REPLACE:
      default:
        if (!filtered) {
          if (matchElement == null) {
            // Take the lemma of the last reading that carries the target POS
            // tag and has a lemma; otherwise use the lemma of the first reading.
            String lemma = "";
            for (int l = 0; l < numRead; l++) {
              if (whTokens[fromPos].getAnalyzedToken(l).getPOSTag() != null
                  && (whTokens[fromPos].getAnalyzedToken(l).getPOSTag().equals(
                      disambiguatedPOS) && (whTokens[fromPos].getAnalyzedToken(l)
                      .getLemma() != null))) {
                lemma = whTokens[fromPos].getAnalyzedToken(l).getLemma();
              }
            }
            if (StringTools.isEmpty(lemma)) {
              lemma = whTokens[fromPos].getAnalyzedToken(0).getLemma();
            }

            // Replace the token by one with a single reading, then restore the
            // flags of the token it replaces.
            final AnalyzedTokenReadings toReplace = new AnalyzedTokenReadings(
                new AnalyzedToken(whTokens[fromPos].getToken(), disambiguatedPOS,
                    lemma), whTokens[fromPos].getStartPos());
            final boolean isSentEnd = whTokens[fromPos].isSentEnd();
            final boolean isParaEnd = whTokens[fromPos].isParaEnd();
            whTokens[fromPos] = toReplace;
            if (isSentEnd) {
              whTokens[fromPos].setSentEnd();
            }
            if (isParaEnd) {
              whTokens[fromPos].setParaEnd();
            }
            whTokens[fromPos].setWhitespaceBefore(spaceBefore);
          } else {
            // using the match element
            matchElement.setToken(whTokens[fromPos]);
            whTokens[fromPos] = matchElement.filterReadings();
            whTokens[fromPos].setWhitespaceBefore(spaceBefore);
          }
        }
    }
    return whTokens;
  }

  /**
   * @param examples
   *          the examples to set
   */
  public void setExamples(final List<DisambiguatedExample> examples) {
    this.examples = examples;
  }

  /**
   * @return the examples
   */
  public List<DisambiguatedExample> getExamples() {
    return examples;
  }

  /**
   * @param untouchedExamples
   *          the untouchedExamples to set
   */
  public void setUntouchedExamples(final List<String> untouchedExamples) {
    this.untouchedExamples = untouchedExamples;
  }

  /**
   * @return the untouchedExamples
   */
  public List<String> getUntouchedExamples() {
    return untouchedExamples;
  }

}
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/disambiguation/rules/DisambiguationRuleLoader.java
b/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/disambiguation/rules/DisambiguationRuleLoader.java new file mode 100644 index 0000000..5e38956 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/disambiguation/rules/DisambiguationRuleLoader.java @@ -0,0 +1,453 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.tagging.disambiguation.rules; + +import java.io.IOException; +import java.io.InputStream; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; + +import javax.xml.parsers.ParserConfigurationException; +import javax.xml.parsers.SAXParser; +import javax.xml.parsers.SAXParserFactory; + +import org.xml.sax.Attributes; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.DefaultHandler; + +import de.danielnaber.languagetool.AnalyzedToken; +import de.danielnaber.languagetool.Language; +import de.danielnaber.languagetool.rules.patterns.Element; +import de.danielnaber.languagetool.rules.patterns.Match; +import de.danielnaber.languagetool.tagging.disambiguation.rules.DisambiguationPatternRule.DisambiguatorAction; + +/** + * Loads {@link 
DisambiguationPatternRule}s from a disambiguation rules XML + * file. + * + * @author Marcin Miłkowski + */ +public class DisambiguationRuleLoader extends DefaultHandler { + + public DisambiguationRuleLoader() { + super(); + } + + public final List<DisambiguationPatternRule> getRules(final InputStream file) + throws ParserConfigurationException, SAXException, IOException { + final DisambiguationRuleHandler handler = new DisambiguationRuleHandler(); + final SAXParserFactory factory = SAXParserFactory.newInstance(); + final SAXParser saxParser = factory.newSAXParser(); + saxParser.parse(file, handler); + return handler.getDisambRules(); + } + +} + +class DisambiguationRuleHandler extends DisambXMLRuleHandler { + + private static final String MARK = "mark"; + private static final String WD = "wd"; + private static final String ACTION = "action"; + private static final String DISAMBIG = "disambig"; + + private String name; + private String ruleGroupId; + private String ruleGroupName; + private StringBuilder disamb = new StringBuilder(); + private StringBuilder wd = new StringBuilder(); + private StringBuilder example = new StringBuilder(); + + private boolean inWord; + + private String disambiguatedPOS; + + private int positionCorrection; + private int endPositionCorrection; + private boolean singleTokenCorrection; + + private Match posSelector; + + private int uniCounter; + + private List<AnalyzedToken> newWdList; + private String wdLemma; + private String wdPos; + + private boolean inExample; + private boolean untouched; + private List<String> untouchedExamples; + private List<DisambiguatedExample> disambExamples; + private String input; + private String output; + + private DisambiguationPatternRule.DisambiguatorAction disambigAction; + + + // =========================================================== + // SAX DocumentHandler methods + // =========================================================== + + @Override + public void startElement(final String namespaceURI, 
final String lName, + final String qName, final Attributes attrs) throws SAXException { + if (qName.equals("rule")) { + id = attrs.getValue("id"); + name = attrs.getValue("name"); + if (inRuleGroup && id == null) { + id = ruleGroupId; + } + if (inRuleGroup && name == null) { + name = ruleGroupName; + } + } else if ("rules".equals(qName)) { + language = Language.getLanguageForShortName(attrs.getValue("lang")); + } else if (qName.equals(PATTERN)) { + inPattern = true; + if (attrs.getValue(MARK) != null && (attrs.getValue(MARK_FROM) != null)) { + throw new SAXException( + "You cannot use both mark and mark_from attributes." + "\n Line: " + + pLocator.getLineNumber() + ", column: " + + pLocator.getColumnNumber() + "."); + } + if (attrs.getValue(MARK) != null && (attrs.getValue(MARK_TO) != null)) { + throw new SAXException( + "You cannot use both mark and mark_to attributes." + "\n Line: " + + pLocator.getLineNumber() + ", column: " + + pLocator.getColumnNumber() + "."); + } + + if (attrs.getValue(MARK) != null) { + positionCorrection = Integer.parseInt(attrs.getValue(MARK)); + } + if (attrs.getValue(MARK_FROM) != null) { + positionCorrection = Integer.parseInt(attrs.getValue(MARK_FROM)); + } + if (attrs.getValue(MARK_TO) == null) { + singleTokenCorrection = true; + } else { + endPositionCorrection = Integer.parseInt(attrs.getValue(MARK_TO)); + if (endPositionCorrection > 0) { + throw new SAXException("End position correction (mark_to=" + + endPositionCorrection + + ") cannot be larger than 0: " + "\n Line: " + + pLocator.getLineNumber() + ", column: " + + pLocator.getColumnNumber() + "."); + } + singleTokenCorrection = false; + } + if (attrs.getValue(CASE_SENSITIVE) != null + && YES.equals(attrs.getValue(CASE_SENSITIVE))) { + caseSensitive = true; + } + } else if (qName.equals(EXCEPTION)) { + setExceptions(attrs); + } else if (qName.equals(AND)) { + inAndGroup = true; + } else if (qName.equals(UNIFY)) { + inUnification = true; + uniNegation = 
YES.equals(attrs.getValue(NEGATE)); + uniCounter = 0; + } else if ("feature".equals(qName)) { + uFeature = attrs.getValue("id"); + } else if (qName.equals(TYPE)) { + uType = attrs.getValue("id"); + uTypeList.add(uType); + } else if (qName.equals(TOKEN)) { + setToken(attrs); + } else if (qName.equals(DISAMBIG)) { + inDisambiguation = true; + disambiguatedPOS = attrs.getValue(POSTAG); + if (attrs.getValue(ACTION) == null) { + // default mode: + disambigAction = DisambiguationPatternRule.DisambiguatorAction + .toAction("REPLACE"); + } else { + disambigAction = DisambiguationPatternRule.DisambiguatorAction + .toAction(attrs.getValue(ACTION).toUpperCase()); + } + disamb = new StringBuilder(); + } else if (qName.equals(MATCH)) { + inMatch = true; + match = new StringBuilder(); + Match.CaseConversion caseConversion = Match.CaseConversion.NONE; + if (attrs.getValue("case_conversion") != null) { + caseConversion = Match.CaseConversion.toCase(attrs + .getValue("case_conversion").toUpperCase()); + } + Match.IncludeRange includeRange = Match.IncludeRange.NONE; + if (attrs.getValue("include_skipped") != null) { + includeRange = Match.IncludeRange.toRange(attrs + .getValue("include_skipped").toUpperCase()); + } + final Match mWorker = new Match(attrs.getValue(POSTAG), attrs + .getValue("postag_replace"), YES + .equals(attrs.getValue(POSTAG_REGEXP)), attrs + .getValue("regexp_match"), attrs.getValue("regexp_replace"), + caseConversion, YES.equals(attrs.getValue("setpos")), + includeRange); + if (inDisambiguation) { + if (attrs.getValue(NO) != null) { + final int refNumber = Integer.parseInt(attrs.getValue(NO)); + if (refNumber > elementList.size()) { + throw new SAXException( + "Only backward references in match elements are possible, tried to specify token " + + refNumber + + "\n Line: " + + pLocator.getLineNumber() + + ", column: " + pLocator.getColumnNumber() + "."); + } + mWorker.setTokenRef(refNumber); + posSelector = mWorker; + } + } else if (inToken) { + if 
(attrs.getValue(NO) != null) { + final int refNumber = Integer.parseInt(attrs.getValue(NO)); + if (refNumber > elementList.size()) { + throw new SAXException( + "Only backward references in match elements are possible, tried to specify token " + + refNumber + + "\n Line: " + + pLocator.getLineNumber() + + ", column: " + pLocator.getColumnNumber() + "."); + } + mWorker.setTokenRef(refNumber); + tokenReference = mWorker; + elements.append('\\'); + elements.append(refNumber); + } + } + } else if (qName.equals(RULEGROUP)) { + ruleGroupId = attrs.getValue("id"); + ruleGroupName = attrs.getValue("name"); + inRuleGroup = true; + } else if (qName.equals(UNIFICATION)) { + uFeature = attrs.getValue(FEATURE); + inUnificationDef = true; + } else if ("equivalence".equals(qName)) { + uType = attrs.getValue(TYPE); + } else if (qName.equals(WD)) { + wdLemma = attrs.getValue("lemma"); + wdPos = attrs.getValue("pos"); + inWord = true; + wd = new StringBuilder(); + } else if (qName.equals(EXAMPLE)) { + inExample = true; + if (untouchedExamples == null) { + untouchedExamples = new ArrayList<String>(); + } + if (disambExamples == null) { + disambExamples = new ArrayList<DisambiguatedExample>(); + } + untouched = attrs.getValue(TYPE).equals("untouched"); + if (attrs.getValue(TYPE).equals("ambiguous")) { + input = attrs.getValue("inputform"); + output = attrs.getValue("outputform"); + } + example = new StringBuilder(); + } else if ("marker".equals(qName)) { + example.append("<marker>"); + } + } + + @Override + public void endElement(final String namespaceURI, final String sName, + final String qName) throws SAXException { + if ("rule".equals(qName)) { + final DisambiguationPatternRule rule = new DisambiguationPatternRule(id, + name, language, elementList, disambiguatedPOS, posSelector, + disambigAction); + rule.setStartPositionCorrection(positionCorrection); + if (singleTokenCorrection) { + endPositionCorrection = 1 - (elementList.size() - positionCorrection); + 
rule.setEndPositionCorrection(endPositionCorrection); + } else { + rule.setEndPositionCorrection(endPositionCorrection); + } + if (newWdList != null) { + if (disambigAction == DisambiguatorAction.ADD + || disambigAction == DisambiguatorAction.REMOVE) { + if (newWdList.size() != (elementList.size() - positionCorrection + endPositionCorrection)) { + throw new SAXException( + language.getName() + " rule error. The number of interpretations specified with wd: " + + newWdList.size() + + " must be equal to the number of matched tokens (" + (elementList.size() - positionCorrection + endPositionCorrection) + ")" + + "\n Line: " + pLocator.getLineNumber() + ", column: " + + pLocator.getColumnNumber() + "."); + } + rule.setNewInterpretations(newWdList + .toArray(new AnalyzedToken[newWdList.size()])); + } + newWdList.clear(); + } + caseSensitive = false; + if (disambExamples != null) { + rule.setExamples(disambExamples); + } + if (untouchedExamples != null) { + rule.setUntouchedExamples(untouchedExamples); + } + rules.add(rule); + if (disambigAction == DisambiguatorAction.UNIFY + && (elementList.size() - positionCorrection + endPositionCorrection) != uniCounter) { + throw new SAXException(language.getName() + " rule error. The number unified tokens: " + + uniCounter + " must be equal to the number of matched tokens." + + "\n Line: " + pLocator.getLineNumber() + ", column: " + + pLocator.getColumnNumber() + "."); + } + if ((!singleTokenCorrection && (disambigAction == DisambiguatorAction.FILTER || disambigAction == DisambiguatorAction.REPLACE)) + && ((elementList.size() - positionCorrection + endPositionCorrection) > 1)) { + throw new SAXException( + language.getName() + " rule error. Cannot replace or filter more than one token at a time." 
+ + "\n Line: " + pLocator.getLineNumber() + ", column: " + + pLocator.getColumnNumber() + "."); + } + elementList.clear(); + posSelector = null; + disambExamples = null; + untouchedExamples = null; + } else if (qName.equals(EXCEPTION)) { + finalizeExceptions(); + } else if (qName.equals(AND)) { + inAndGroup = false; + andGroupCounter = 0; + tokenCounter++; + } else if (qName.equals(TOKEN)) { + if (!exceptionSet || tokenElement == null) { + tokenElement = new Element(elements.toString(), caseSensitive, + regExpression, tokenInflected); + tokenElement.setNegation(tokenNegated); + } else { + tokenElement.setStringElement(elements.toString()); + } + if (skipPos != 0) { + tokenElement.setSkipNext(skipPos); + skipPos = 0; + } + if (posToken != null) { + tokenElement.setPosElement(posToken, posRegExp, posNegation); + posToken = null; + } + + if (tokenReference != null) { + tokenElement.setMatch(tokenReference); + } + + if (inAndGroup && andGroupCounter > 0) { + elementList.get(elementList.size() - 1) + .setAndGroupElement(tokenElement); + } else { + elementList.add(tokenElement); + } + if (inAndGroup) { + andGroupCounter++; + } + if (inUnification) { + tokenElement.setUnification(equivalenceFeatures); + if (uniNegation) { + tokenElement.setUniNegation(); + } + uniCounter++; + } + if (inUnificationDef) { + language.getDisambiguationUnifier().setEquivalence(uFeature, uType, tokenElement); + elementList.clear(); + } + if (tokenSpaceBeforeSet) { + tokenElement.setWhitespaceBefore(tokenSpaceBefore); + } + resetToken(); + } else if (qName.equals(PATTERN)) { + inPattern = false; + if (positionCorrection >= tokenCounter) { + throw new SAXException( + "Attempt to mark a token no. ("+ positionCorrection +") that is outside the pattern (" + tokenCounter + "). Pattern elements are numbered starting from 0!" 
+ "\n Line: " + + pLocator.getLineNumber() + ", column: " + + pLocator.getColumnNumber() + "."); + } + if (tokenCounter - endPositionCorrection < 0 ) { + throw new SAXException( + "Attempt to mark a token no. ("+ endPositionCorrection +") that is outside the pattern (" + tokenCounter + "). Pattern elements are numbered starting from 0!" + "\n Line: " + + pLocator.getLineNumber() + ", column: " + + pLocator.getColumnNumber() + "."); + } + tokenCounter = 0; + } else if (qName.equals(MATCH)) { + if (inDisambiguation) { + posSelector.setLemmaString(match.toString()); + } else if (inToken) { + tokenReference.setLemmaString(match.toString()); + } + inMatch = false; + } else if (qName.equals(DISAMBIG)) { + inDisambiguation = false; + } else if (qName.equals(RULEGROUP)) { + inRuleGroup = false; + } else if (qName.equals(UNIFICATION) && inUnificationDef) { + inUnificationDef = false; + } else if ("feature".equals(qName)) { + equivalenceFeatures.put(uFeature, uTypeList); + uTypeList = new ArrayList<String>(); + } else if (qName.equals(UNIFY)) { + inUnification = false; + equivalenceFeatures = new HashMap<String, List<String>>(); + } else if (qName.equals(WD)) { + addNewWord(wd.toString(), wdLemma, wdPos); + inWord = false; + } else if (EXAMPLE.equals(qName)) { + inExample = false; + if (untouched) { + untouchedExamples.add(example.toString()); + } else { + disambExamples.add(new DisambiguatedExample(example.toString(), input, output)); + } + } else if ("marker".equals(qName)) { + example.append("</marker>"); + } + } + + private void addNewWord(final String word, final String lemma, + final String pos) { + final AnalyzedToken newWd = new AnalyzedToken(word, pos, lemma); + if (newWdList == null) { + newWdList = new ArrayList<AnalyzedToken>(); + } + newWdList.add(newWd); + } + + @Override + public final void characters(final char[] buf, final int offset, final int len) { + final String s = new String(buf, offset, len); + if (inException) { + exceptions.append(s); + } else if 
(inToken && inPattern) { + elements.append(s); + } else if (inMatch) { + match.append(s); + } else if (inWord) { + wd.append(s); + } else if (inDisambiguation) { + disamb.append(s); + } else if (inExample) { + example.append(s); + } + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/disambiguation/rules/en/EnglishRuleDisambiguator.java b/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/disambiguation/rules/en/EnglishRuleDisambiguator.java new file mode 100644 index 0000000..6202dcb --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/disambiguation/rules/en/EnglishRuleDisambiguator.java @@ -0,0 +1,32 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2007 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ + +package de.danielnaber.languagetool.tagging.disambiguation.rules.en; + +import de.danielnaber.languagetool.Language; +import de.danielnaber.languagetool.tagging.disambiguation.rules.AbstractRuleDisambiguator; + +public class EnglishRuleDisambiguator extends AbstractRuleDisambiguator { + + @Override + protected Language getLanguage() { + return Language.ENGLISH; + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/disambiguation/rules/fr/FrenchRuleDisambiguator.java b/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/disambiguation/rules/fr/FrenchRuleDisambiguator.java new file mode 100644 index 0000000..6d81940 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/disambiguation/rules/fr/FrenchRuleDisambiguator.java @@ -0,0 +1,32 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2007 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ + +package de.danielnaber.languagetool.tagging.disambiguation.rules.fr; + +import de.danielnaber.languagetool.Language; +import de.danielnaber.languagetool.tagging.disambiguation.rules.AbstractRuleDisambiguator; + +public class FrenchRuleDisambiguator extends AbstractRuleDisambiguator { + + @Override + protected Language getLanguage() { + return Language.FRENCH; + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/disambiguation/rules/nl/DutchRuleDisambiguator.java b/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/disambiguation/rules/nl/DutchRuleDisambiguator.java new file mode 100644 index 0000000..0d28afc --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/disambiguation/rules/nl/DutchRuleDisambiguator.java @@ -0,0 +1,32 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2007 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ + +package de.danielnaber.languagetool.tagging.disambiguation.rules.nl; + +import de.danielnaber.languagetool.Language; +import de.danielnaber.languagetool.tagging.disambiguation.rules.AbstractRuleDisambiguator; + +public class DutchRuleDisambiguator extends AbstractRuleDisambiguator { + + @Override + protected Language getLanguage() { + return Language.DUTCH; + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/disambiguation/rules/pl/PolishRuleDisambiguator.java b/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/disambiguation/rules/pl/PolishRuleDisambiguator.java new file mode 100644 index 0000000..8dc18eb --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/disambiguation/rules/pl/PolishRuleDisambiguator.java @@ -0,0 +1,31 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2007 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.tagging.disambiguation.rules.pl; + +import de.danielnaber.languagetool.Language; +import de.danielnaber.languagetool.tagging.disambiguation.rules.AbstractRuleDisambiguator; + +public class PolishRuleDisambiguator extends AbstractRuleDisambiguator { + + @Override + protected Language getLanguage() { + return Language.POLISH; + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/disambiguation/rules/ro/RomanianRuleDisambiguator.java b/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/disambiguation/rules/ro/RomanianRuleDisambiguator.java new file mode 100644 index 0000000..5b90bb0 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/disambiguation/rules/ro/RomanianRuleDisambiguator.java @@ -0,0 +1,32 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2007 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ + +package de.danielnaber.languagetool.tagging.disambiguation.rules.ro; + +import de.danielnaber.languagetool.Language; +import de.danielnaber.languagetool.tagging.disambiguation.rules.AbstractRuleDisambiguator; + +public class RomanianRuleDisambiguator extends AbstractRuleDisambiguator { + + @Override + protected Language getLanguage() { + return Language.ROMANIAN; + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/disambiguation/rules/ru/RussianRuleDisambiguator.java b/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/disambiguation/rules/ru/RussianRuleDisambiguator.java new file mode 100644 index 0000000..f200de3 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/disambiguation/rules/ru/RussianRuleDisambiguator.java @@ -0,0 +1,36 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2007 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ + +package de.danielnaber.languagetool.tagging.disambiguation.rules.ru; + +import de.danielnaber.languagetool.Language; +import de.danielnaber.languagetool.tagging.disambiguation.rules.AbstractRuleDisambiguator; + +/** + * Disambiguator for Russian + */ + +public class RussianRuleDisambiguator extends AbstractRuleDisambiguator { + + @Override + protected Language getLanguage() { + return Language.RUSSIAN; + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/disambiguation/xx/DemoDisambiguator.java b/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/disambiguation/xx/DemoDisambiguator.java new file mode 100644 index 0000000..5ebec17 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/disambiguation/xx/DemoDisambiguator.java @@ -0,0 +1,38 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2007 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ + +package de.danielnaber.languagetool.tagging.disambiguation.xx; + +import de.danielnaber.languagetool.AnalyzedSentence; +import de.danielnaber.languagetool.tagging.disambiguation.Disambiguator; + +/** + * Trivial disambiguator. + * Does nothing at all. Just copies input to output. + * + * @author Jozef Licko + * + */ +public class DemoDisambiguator implements Disambiguator { + + public final AnalyzedSentence disambiguate(final AnalyzedSentence input) { + return input; + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/disambiguation/xx/TrimDisambiguator.java b/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/disambiguation/xx/TrimDisambiguator.java new file mode 100644 index 0000000..783dcd8 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/disambiguation/xx/TrimDisambiguator.java @@ -0,0 +1,53 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2007 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ + +package de.danielnaber.languagetool.tagging.disambiguation.xx; + +import de.danielnaber.languagetool.AnalyzedSentence; +import de.danielnaber.languagetool.AnalyzedToken; +import de.danielnaber.languagetool.AnalyzedTokenReadings; +import de.danielnaber.languagetool.tagging.disambiguation.Disambiguator; + +/** + * Trivial disambiguator. Just cuts out tags from the token. It leaves only the + * first tag. + * + * @author Jozef Licko + */ +public class TrimDisambiguator implements Disambiguator { + + public final AnalyzedSentence disambiguate(final AnalyzedSentence input) { + + final AnalyzedTokenReadings[] anTokens = input.getTokens(); + final AnalyzedTokenReadings[] output = new AnalyzedTokenReadings[anTokens.length]; + + for (int i = 0; i < anTokens.length; i++) { + + if (anTokens[i].getReadingsLength() > 1) { + final AnalyzedToken[] firstToken = new AnalyzedToken[1]; + firstToken[0] = anTokens[i].getAnalyzedToken(0); + output[i] = new AnalyzedTokenReadings(firstToken, anTokens[i].getStartPos()); + } else { + output[i] = anTokens[i]; + } + } + return new AnalyzedSentence(output); + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/en/EnglishTagger.java b/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/en/EnglishTagger.java new file mode 100644 index 0000000..5bee4ad --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/en/EnglishTagger.java @@ -0,0 +1,43 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2006 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * 
version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.tagging.en; + +import java.util.Locale; + +import de.danielnaber.languagetool.JLanguageTool; +import de.danielnaber.languagetool.tagging.BaseTagger; + +/** English Part-of-speech tagger. + * Based on part-of-speech lists in Public Domain. + * see readme.txt for details, the POS tagset is + * described in tagset.txt + * + * @author Marcin Milkowski + */ +public class EnglishTagger extends BaseTagger { + + public final String getFileName() { + return JLanguageTool.getDataBroker().getResourceDir() + "/en/english.dict"; + } + + public EnglishTagger() { + super(); + setLocale(Locale.ENGLISH); + } +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/eo/EsperantoTagger.java b/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/eo/EsperantoTagger.java new file mode 100644 index 0000000..cff2e77 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/eo/EsperantoTagger.java @@ -0,0 +1,360 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2007 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ + +/* + * Created on 01.10.2010 + */ +package de.danielnaber.languagetool.tagging.eo; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.util.Arrays; +import java.util.ArrayList; +import java.util.List; +import java.util.HashSet; +import java.util.Set; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + + +import de.danielnaber.languagetool.AnalyzedToken; +import de.danielnaber.languagetool.AnalyzedTokenReadings; +import de.danielnaber.languagetool.JLanguageTool; +import de.danielnaber.languagetool.tagging.Tagger; + +/** + * A part-of-speech tagger for Esperanto. + * + * @author Dominique Pellé + */ +public class EsperantoTagger implements Tagger { + + // These words don't need to be tagged. + private final static String wordsNotTagged[] = { + "ajn", "ĉi", "ĉu", "des", "do", "ja", "ju", "ke", "malpli", + "ne", "nek", "ol", "pli" + }; + + private final static Set setWordsNotTagged = new HashSet<String>(Arrays.asList(wordsNotTagged)); + + // Following preposition are never followed by accusative. 
+ private final static String prepositionsNoAccusative[] = { + "al", "apud", "cis", "da", "de", "disde", "dum", "ekde", "el", + "far", "ĝis", "je", "kun", "laŭ", "malgraŭ", "na", + "per", "po", "post", "por", "pri", "pro", "sen", "super", "tra" + }; + + private final static Set setPrepositionsNoAccusative = + new HashSet<String>(Arrays.asList(prepositionsNoAccusative)); + + // Following preposition may be followed by accusative. + private final static String prepositionsAccusative[] = { + "anstataŭ", "en", "kontraŭ", "krom", "sur", "sub", "trans", + "preter", "ĉirkaŭ", "antaŭ", "malantaŭ", "ekster", "inter", "ĉe" + }; + + private final Set setPrepositionsAccusative = + new HashSet<String>(Arrays.asList(prepositionsAccusative)); + + // Conjunctions. + private final static String conjunctions[] = { + "ĉar", "kaj", "aŭ", "sed", "plus", "minus", "tamen" + }; + + private final static Set setConjunctions = new HashSet<String>(Arrays.asList(conjunctions)); + + // Numbers. + private final static String numbers[] = { + "nul", "unu", "du", "tri", "kvar", "kvin", "ses", + "sep", "ok", "naŭ", "dek", "cent", "mil" + }; + + private final static Set setNumbers = new HashSet<String>(Arrays.asList(numbers)); + + // Adverbs which do not end in -e + private final static String adverbs[] = { + "ankoraŭ", "almenaŭ", "apenaŭ", "baldaŭ", "preskaŭ", "eĉ", + "jam", "jen", "ĵus", "morgaŭ", "hodiaŭ", "hieraŭ", "nun", + "nur", "plu", "tre", "tro", "tuj", "for" + }; + + private final static Set setAdverbs = new HashSet<String>(Arrays.asList(adverbs)); + + // Set of transitive verbs and non-transitive verbs. + private Set setTransitiveVerbs = null; + private Set setNonTransitiveVerbs = null; + + // Verbs always end with this pattern. + private final static Pattern patternVerb1 = Pattern.compile("(.*)(as|os|is|us|u|i)$"); + private final static Pattern patternVerb2 = Pattern.compile(".*(ig|iĝ)(.s|.)$"); + + // Particips -ant-, -int, ont-, -it-, -it-, -ot- + // TODO: this is not used yet. 
+ final Pattern patternParticiple = + Pattern.compile("(.*)([aio])(n?)t([aoe])(j?)(n?)$"); + // Groups 11 22222 33 44444 55 66 + + // Pattern 'tabelvortoj'. + final Pattern patternTabelvorto = + Pattern.compile("^(i|ti|ki|ĉi|neni)((([uoae])(j?)(n?))|(am|al|es|el|om))$"); + // Groups 111111111111111 22222222222222222222222222222222 + // 3333333333333333 77777777777 + // 444444 55 66 + + // Pattern of 'tabelvortoj' which are also tagged adverbs. + final Pattern patternTabelvortoAdverb = + Pattern.compile("(ti|i|ĉi|neni)(am|om|el|e)"); + + /** + * Load list of words from UTF-8 file (one word per line). + */ + private Set loadWords(final InputStream file) throws IOException { + InputStreamReader isr = null; + BufferedReader br = null; + final Set<String> words = new HashSet<String>(); + try { + isr = new InputStreamReader(file, "UTF-8"); + br = new BufferedReader(isr); + String line; + + while ((line = br.readLine()) != null) { + line = line.trim(); + if (line.length() < 1) { + continue; + } + if (line.charAt(0) == '#') { // ignore comments + continue; + } + words.add(line); + } + } finally { + if (br != null) { + br.close(); + } + if (isr != null) { + isr.close(); + } + } + return words; + } + + private void lazyInit() throws IOException { + if (setTransitiveVerbs != null) { + return; + } + + // Load set of transitive and non-transitive verbs. Files don't contain + // verbs with suffix -iĝ or -ig since transitivity is obvious for those verbs. + // They also don't contain verbs with prefixes mal-, ek-, re-, mis- fi- and + // suffixes -ad, -aĉ, -et, -eg since these affixes never alter transitivity. 
+ setTransitiveVerbs = loadWords(JLanguageTool.getDataBroker().getFromRulesDirAsStream("/eo/verb-tr.txt")); + setNonTransitiveVerbs = loadWords(JLanguageTool.getDataBroker().getFromRulesDirAsStream("/eo/verb-ntr.txt")); + } + + public List<AnalyzedTokenReadings> tag(final List<String> sentenceTokens) throws IOException { + + lazyInit(); + + Matcher matcher; + + final List<AnalyzedTokenReadings> tokenReadings = + new ArrayList<AnalyzedTokenReadings>(); + int pos = 0; + for (String word : sentenceTokens) { + final List<AnalyzedToken> l = new ArrayList<AnalyzedToken>(); + final String lWord = word.toLowerCase(); + + if (lWord.equals(".")) { + l.add(new AnalyzedToken(word, "M fino", lWord)); + + } else if (lWord.equals("?")) { + l.add(new AnalyzedToken(word, "M fino dem", lWord)); + + } else if (lWord.equals("!")) { + l.add(new AnalyzedToken(word, "M fino kri", lWord)); + + } else if (lWord.equals("la")) { + l.add(new AnalyzedToken(word, "D", lWord)); + + } else if (setAdverbs.contains(lWord)) { + l.add(new AnalyzedToken(word, "E nak", lWord)); + + } else if (setWordsNotTagged.contains(lWord)) { + l.add(new AnalyzedToken(word, null, lWord)); + + // Pronouns. 
+ } else if (lWord.equals("mi") || lWord.equals("ci") + || lWord.equals("li") || lWord.equals("ŝi") + || lWord.equals("ĝi") || lWord.equals("si") + || lWord.equals("oni")) { + l.add(new AnalyzedToken(word, "R nak np", lWord)); + } else if (lWord.equals("min") || lWord.equals("cin") + || lWord.equals("lin") || lWord.equals("ŝin") + || lWord.equals("ĝin") || lWord.equals("sin")) { + l.add(new AnalyzedToken(word, "R akz np", lWord.substring(0, lWord.length() - 1))); + } else if (lWord.equals("ni") || lWord.equals("ili")) { + l.add(new AnalyzedToken(word, "R nak pl", lWord)); + } else if (lWord.equals("nin") || lWord.equals("ilin")) { + l.add(new AnalyzedToken(word, "R akz pl", lWord.substring(0, lWord.length() - 1))); + } else if (lWord.equals("vi")) { + l.add(new AnalyzedToken(word, "R nak pn", lWord)); + } else if (lWord.equals("vin")) { + l.add(new AnalyzedToken(word, "R akz pn", lWord.substring(0, lWord.length() - 1))); + + // Conjunctions (kaj, sed, ...) + } else if (setConjunctions.contains(lWord)) { + l.add(new AnalyzedToken(word, "K", lWord)); + + // Prepositions. + } else if (setPrepositionsNoAccusative.contains(lWord)) { + l.add(new AnalyzedToken(word, "P sak", lWord)); + } else if (setPrepositionsAccusative.contains(lWord)) { + l.add(new AnalyzedToken(word, "P kak", lWord)); + + } else if (setNumbers.contains(lWord)) { + l.add(new AnalyzedToken(word, "N", lWord)); + + // Tiu, kiu (tabelvortoj). + } else if ((matcher = patternTabelvorto.matcher(lWord)).find()) { + final String type1Group = matcher.group(1).substring(0, 1).toLowerCase(); + final String type2Group = matcher.group(4); + final String plGroup = matcher.group(5); + final String accGroup = matcher.group(6); + final String type3Group = matcher.group(7); + final String type; + final String plural; + final String accusative; + + if (accGroup == null) { + accusative = "xxx"; + } else { + accusative = accGroup.toLowerCase().equals("n") ? 
"akz" : "nak"; + } + if (plGroup == null) { + plural = " pn "; + } else { + plural = plGroup.toLowerCase().equals("j") ? " pl " : " np "; + } + type = ((type2Group == null) ? type3Group : type2Group).toLowerCase(); + + l.add(new AnalyzedToken(word, "T " + + accusative + plural + type1Group + " " + type, null)); + + if ((matcher = patternTabelvortoAdverb.matcher(lWord)).find()) { + l.add(new AnalyzedToken(word, "E nak", lWord)); + } + + // Words ending in .*oj?n? are nouns. + } else if (lWord.endsWith("o")) { + l.add(new AnalyzedToken(word, "O nak np", lWord)); + } else if (lWord.endsWith("oj")) { + l.add(new AnalyzedToken(word, "O nak pl", lWord.substring(0, lWord.length() - 1))); + } else if (lWord.endsWith("on")) { + l.add(new AnalyzedToken(word, "O akz np", lWord.substring(0, lWord.length() - 1))); + } else if (lWord.endsWith("ojn")) { + l.add(new AnalyzedToken(word, "O akz pl", lWord.substring(0, lWord.length() - 2))); + + // Words ending in .*aj?n? are nouns. + } else if (lWord.endsWith("a")) { + l.add(new AnalyzedToken(word, "A nak np", lWord)); + } else if (lWord.endsWith("aj")) { + l.add(new AnalyzedToken(word, "A nak pl", lWord.substring(0, lWord.length() - 1))); + } else if (lWord.endsWith("an")) { + l.add(new AnalyzedToken(word, "A akz np", lWord.substring(0, lWord.length() - 1))); + } else if (lWord.endsWith("ajn")) { + l.add(new AnalyzedToken(word, "A akz pl", lWord.substring(0, lWord.length() - 2))); + + // Words ending in .*en? are adverbs. + } else if (lWord.endsWith("e")) { + l.add(new AnalyzedToken(word, "E nak", lWord)); + } else if (lWord.endsWith("en")) { + l.add(new AnalyzedToken(word, "E akz", lWord.substring(0, lWord.length() - 1))); + + // Verbs. 
+ } else if ((matcher = patternVerb1.matcher(lWord)).find()) { + final String verb = matcher.group(1) + "i"; + final String tense = matcher.group(2); + final String transitive; + + final Matcher matcher2 = patternVerb2.matcher(lWord); + if (matcher2.find()) { + transitive = matcher2.group(1).equals("ig") ? "tr" : "nt"; + } else { + final boolean isTransitive = setTransitiveVerbs.contains(verb); + final boolean isIntransitive = setNonTransitiveVerbs.contains(verb); + + if (isTransitive) { + transitive = isIntransitive ? "tn" : "tr"; + } else { + transitive = isIntransitive ? "nt" : "tn"; + } + } + l.add(new AnalyzedToken(word, "V " + transitive + " " + tense, verb)); + + // Irregular word (no tag). + } else { + l.add(new AnalyzedToken(word, null, null)); + } + + // Participle (can be combined with other tags). + if ((matcher = patternParticiple.matcher(lWord)).find()) { + final String verb = matcher.group(1) + "i"; + final String aio = matcher.group(2); + final String antAt = matcher.group(3).equals("n") ? "n" : "-"; + final String aoe = matcher.group(4); + final String plural = matcher.group(5).equals("j") ? "pl" : "np"; + final String accusative = matcher.group(6).equals("n") ? "akz" : "nak"; + final String transitive; + + final Matcher matcher2 = patternVerb2.matcher(lWord); + if (matcher2.find()) { + transitive = matcher2.group(1).equals("ig") ? "tr" : "nt"; + } else { + final boolean isTransitive = setTransitiveVerbs.contains(verb); + final boolean isIntransitive = setNonTransitiveVerbs.contains(verb); + + if (isTransitive) { + transitive = isIntransitive ? "tn" : "tr"; + } else { + transitive = isIntransitive ? 
"nt" : "tn"; + } + } + l.add(new AnalyzedToken(word, "C " + accusative + " " + plural + " " + + transitive + " " + aio + " " + antAt + " " + aoe, + verb)); + } + + pos += word.length(); + tokenReadings.add(new AnalyzedTokenReadings( + l.toArray(new AnalyzedToken[0]), 0)); + } + return tokenReadings; + } + + public AnalyzedTokenReadings createNullToken(String token, int startPos) { + return new AnalyzedTokenReadings( + new AnalyzedToken(token, null, null), startPos); + } + + public AnalyzedToken createToken(String token, String posTag) { + return new AnalyzedToken(token, posTag, null); + } +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/es/SpanishTagger.java b/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/es/SpanishTagger.java new file mode 100644 index 0000000..fa0700c --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/es/SpanishTagger.java @@ -0,0 +1,43 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2006 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.tagging.es; + +import java.util.Locale; + +import de.danielnaber.languagetool.JLanguageTool; +import de.danielnaber.languagetool.tagging.BaseTagger; + +/** Spanish Tagger + * + * Based on FreeLing tagger dictionary + * and Spanish Wikipedia corpus tagged with FreeLing. + * + * @author Marcin Milkowski + */ +public class SpanishTagger extends BaseTagger { + + public final String getFileName() { + return JLanguageTool.getDataBroker().getResourceDir() + "/es/spanish.dict"; + } + + public SpanishTagger() { + super(); + setLocale(new Locale("es")); + } +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/fr/FrenchTagger.java b/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/fr/FrenchTagger.java new file mode 100644 index 0000000..cbdea19 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/fr/FrenchTagger.java @@ -0,0 +1,42 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2006 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.tagging.fr; + +import java.util.Locale; + +import de.danielnaber.languagetool.JLanguageTool; +import de.danielnaber.languagetool.tagging.BaseTagger; + +/** French Tagger + * + * Based on inDICO, implemented in FSA. + * + * @author Marcin Milkowski + */ +public class FrenchTagger extends BaseTagger { + + public final String getFileName() { + return JLanguageTool.getDataBroker().getResourceDir() + "/fr/french.dict"; + } + + public FrenchTagger() { + super(); + setLocale(Locale.FRENCH); + } +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/gl/GalicianTagger.java b/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/gl/GalicianTagger.java new file mode 100644 index 0000000..9827d5d --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/gl/GalicianTagger.java @@ -0,0 +1,43 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2006 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.tagging.gl; + +import java.util.Locale; + +import de.danielnaber.languagetool.JLanguageTool; +import de.danielnaber.languagetool.tagging.BaseTagger; + +/** Galician Part-of-speech tagger. + * Based on English tagger. + * + * @author Marcin Milkowski + * + * modified by Susana Sotelo Docio + */ +public class GalicianTagger extends BaseTagger { + + public final String getFileName() { + return JLanguageTool.getDataBroker().getResourceDir() + "/gl/galician.dict"; + } + + public GalicianTagger() { + super(); + setLocale(new Locale("gl")); + } +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/it/ItalianTagger.java b/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/it/ItalianTagger.java new file mode 100644 index 0000000..5ae55ac --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/it/ItalianTagger.java @@ -0,0 +1,46 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2006 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.tagging.it; + +import java.util.Locale; + +import de.danielnaber.languagetool.JLanguageTool; +import de.danielnaber.languagetool.tagging.BaseTagger; + +/** + * Italian tagger + * + * Uses morph-it! lexicon compiled by Marco Baroni and Eros Zanchetta + * + * see resource/it/readme-morph-it.txt for tagset + * + * @author Marcin Milkowski + */ +public class ItalianTagger extends BaseTagger { + + public final String getFileName() { + return JLanguageTool.getDataBroker().getResourceDir() + "/it/italian.dict"; + } + + public ItalianTagger() { + super(); + setLocale(Locale.ITALIAN); + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/ml/MalayalamTagger.java b/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/ml/MalayalamTagger.java new file mode 100644 index 0000000..b15ee58 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/ml/MalayalamTagger.java @@ -0,0 +1,42 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2010 Daniel Naber, Marcin Miłkowski (http://www.languagetool.org) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ + +package de.danielnaber.languagetool.tagging.ml; + +import java.util.Locale; + +import de.danielnaber.languagetool.JLanguageTool; +import de.danielnaber.languagetool.tagging.BaseTagger; + +/** Malayalam Part-of-speech tagger. + * + * @author Marcin Milkowski + */ +public class MalayalamTagger extends BaseTagger { + + public final String getFileName() { + return JLanguageTool.getDataBroker().getResourceDir() + "/ml/malayalam.dict"; + } + + public MalayalamTagger() { + super(); + setLocale(new Locale("ml")); + } +} + diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/nb/BokmalTagger.java b/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/nb/BokmalTagger.java new file mode 100644 index 0000000..f7a5e09 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/nb/BokmalTagger.java @@ -0,0 +1,43 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2006 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.tagging.nb; + +import java.util.Locale; + +import de.danielnaber.languagetool.JLanguageTool; +import de.danielnaber.languagetool.tagging.BaseTagger; + +/** Bokmål Part-of-speech tagger. + * Based on part-of-speech lists in Public Domain. + * see readme.txt for details, the POS tagset is + * described in tagset.txt + * + * @author Marcin Milkowski + */ +public class BokmalTagger extends BaseTagger { + + public final String getFileName() { + return JLanguageTool.getDataBroker().getResourceDir() + "/nb/bokmal.dict"; + } + + public BokmalTagger() { + super(); + setLocale(Locale.FRENCH); + } +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/nl/DutchTagger.java b/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/nl/DutchTagger.java new file mode 100644 index 0000000..ece7dee --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/nl/DutchTagger.java @@ -0,0 +1,41 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2006 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.tagging.nl; + +import java.util.Locale; + +import de.danielnaber.languagetool.JLanguageTool; +import de.danielnaber.languagetool.tagging.BaseTagger; + +/** + * Dutch tagger. + * + * @author Marcin Milkowski + */ +public class DutchTagger extends BaseTagger { + + public final String getFileName() { + return JLanguageTool.getDataBroker().getResourceDir() + "/nl/dutch.dict"; + } + + public DutchTagger() { + super(); + setLocale(new Locale("nl")); + } +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/pl/PolishTagger.java b/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/pl/PolishTagger.java new file mode 100644 index 0000000..881655e --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/pl/PolishTagger.java @@ -0,0 +1,117 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2006 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.tagging.pl; + +import java.io.IOException; +import java.net.URL; +import java.util.ArrayList; +import java.util.List; +import java.util.Locale; + +import morfologik.stemming.Dictionary; +import morfologik.stemming.DictionaryLookup; +import morfologik.stemming.IStemmer; +import de.danielnaber.languagetool.AnalyzedToken; +import de.danielnaber.languagetool.AnalyzedTokenReadings; +import de.danielnaber.languagetool.JLanguageTool; +import de.danielnaber.languagetool.tagging.BaseTagger; +import de.danielnaber.languagetool.tools.StringTools; + +/** + * Polish POS tagger based on FSA morphological dictionaries. + * + * @author Marcin Milkowski + */ + +public class PolishTagger extends BaseTagger { + + private static final String RESOURCE_FILENAME = "/pl/polish.dict"; + private IStemmer morfologik; + private final Locale plLocale = new Locale("pl"); + + @Override + public final String getFileName() { + return JLanguageTool.getDataBroker().getResourceDir() + RESOURCE_FILENAME; + } + + @Override + public final List<AnalyzedTokenReadings> tag(final List<String> sentenceTokens) + throws IOException { + List<AnalyzedToken> taggerTokens; + List<AnalyzedToken> lowerTaggerTokens; + List<AnalyzedToken> upperTaggerTokens; + final List<AnalyzedTokenReadings> tokenReadings = new ArrayList<AnalyzedTokenReadings>(); + int pos = 0; + // caching Lametyzator instance - lazy init + if (morfologik == null) { + final URL url = JLanguageTool.getDataBroker().getFromResourceDirAsUrl(RESOURCE_FILENAME); + morfologik = new DictionaryLookup(Dictionary.read(url)); + } + + for (String word : sentenceTokens) { + final List<AnalyzedToken> l = new ArrayList<AnalyzedToken>(); + final String lowerWord = word.toLowerCase(plLocale); + 
taggerTokens = asAnalyzedTokenList(word, morfologik.lookup(word)); + lowerTaggerTokens = asAnalyzedTokenList(word, morfologik.lookup(lowerWord)); + final boolean isLowercase = word.equals(lowerWord); + + //normal case + addTokens(taggerTokens, l); + + if (!isLowercase) { + //lowercase + addTokens(lowerTaggerTokens, l); + } + + //uppercase + if (lowerTaggerTokens.isEmpty() && taggerTokens.isEmpty()) { + if (isLowercase) { + upperTaggerTokens = asAnalyzedTokenList(word, morfologik.lookup(StringTools + .uppercaseFirstChar(word))); + if (!upperTaggerTokens.isEmpty()) { + addTokens(upperTaggerTokens, l); + } else { + l.add(new AnalyzedToken(word, null, null)); + } + } else { + l.add(new AnalyzedToken(word, null, null)); + } + } + tokenReadings.add(new AnalyzedTokenReadings(l, pos)); + pos += word.length(); + } + + return tokenReadings; + } + + private void addTokens(final List<AnalyzedToken> taggedTokens, + final List<AnalyzedToken> l) { + if (taggedTokens != null) { + for (AnalyzedToken at : taggedTokens) { + final String[] tagsArr = StringTools.asString(at.getPOSTag()).split("\\+"); + for (final String currTag : tagsArr) { + l.add(new AnalyzedToken(at.getToken(), currTag, + at.getLemma())); + } + } + } + } + + +}
\ No newline at end of file diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/ro/RomanianTagger.java b/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/ro/RomanianTagger.java new file mode 100644 index 0000000..76ffe2e --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/ro/RomanianTagger.java @@ -0,0 +1,102 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2006 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.tagging.ro; + +import java.io.IOException; +import java.net.URL; +import java.util.ArrayList; +import java.util.List; +import java.util.Locale; + +import morfologik.stemming.Dictionary; +import morfologik.stemming.DictionaryLookup; +import morfologik.stemming.IStemmer; +import morfologik.stemming.WordData; + +import de.danielnaber.languagetool.AnalyzedToken; +import de.danielnaber.languagetool.AnalyzedTokenReadings; +import de.danielnaber.languagetool.JLanguageTool; +import de.danielnaber.languagetool.tagging.BaseTagger; + +/** + * Romanian Part-of-speech tagger + * + * @author Ionuț Păduraru + */ +public class RomanianTagger extends BaseTagger { + + private String RESOURCE_FILENAME = "/ro/romanian.dict"; + + private IStemmer morfologik; + private static final Locale roLocale = new Locale("ro"); + + public final String getFileName() { + return JLanguageTool.getDataBroker().getResourceDir() + RESOURCE_FILENAME; + } + + public RomanianTagger() { + super(); + setLocale(roLocale); + } + + public RomanianTagger(final String fileName) { + super(); + RESOURCE_FILENAME = fileName; + setLocale(roLocale); + } + + @Override + public final List<AnalyzedTokenReadings> tag( + final List<String> sentenceTokens) throws IOException { + List<WordData> taggerTokens; + + final List<AnalyzedTokenReadings> tokenReadings = new ArrayList<AnalyzedTokenReadings>(); + int pos = 0; + // caching Lametyzator instance - lazy init + if (morfologik == null) { + final URL url = JLanguageTool.getDataBroker().getFromResourceDirAsUrl(RESOURCE_FILENAME); + morfologik = new DictionaryLookup(Dictionary.read(url)); + } + + for (final String word : sentenceTokens) { + final List<AnalyzedToken> l = new ArrayList<AnalyzedToken>(); + taggerTokens 
= morfologik.lookup(word.toLowerCase(roLocale)); + if (taggerTokens != null) { + for (WordData wd : taggerTokens) { + final String[] tagsArr = wd.getStem().toString().split("\\+"); + for (final String currTag : tagsArr) { + l.add(new AnalyzedToken(word, + wd.getTag().toString(), currTag)); + } + } + } + + if (taggerTokens == null || taggerTokens.isEmpty()) { + l.add(new AnalyzedToken(word, null, null)); + } + tokenReadings.add(new AnalyzedTokenReadings(l + .toArray(new AnalyzedToken[l.size()]), pos)); + pos += word.length(); + } + + return tokenReadings; + + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/ru/RussianTagger.java b/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/ru/RussianTagger.java new file mode 100644 index 0000000..b31d29b --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/ru/RussianTagger.java @@ -0,0 +1,42 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2006 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.tagging.ru; + +import java.util.Locale; + +import de.danielnaber.languagetool.JLanguageTool; +import de.danielnaber.languagetool.tagging.BaseTagger; + +/** Part-of-speech tagger. + * Russian dictionary originally developed by www.aot.ru and licensed under LGPL. + * see readme.txt for details, the POS tagset is + * described in russian_tags.txt + * + * */ +public class RussianTagger extends BaseTagger { + + public final String getFileName() { + return JLanguageTool.getDataBroker().getResourceDir() + "/ru/russian.dict"; + } + + public RussianTagger() { + super(); + setLocale(new Locale("ru")); + } +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/sk/SlovakTagger.java b/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/sk/SlovakTagger.java new file mode 100644 index 0000000..cd43a7c --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/sk/SlovakTagger.java @@ -0,0 +1,40 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2006 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.tagging.sk; + +import java.util.Locale; + +import de.danielnaber.languagetool.JLanguageTool; +import de.danielnaber.languagetool.tagging.BaseTagger; + +/** Slovak Part-of-speech tagger based on Russian Part-of-speech tagger. + * @author Zdenko Podobný + */ + +public class SlovakTagger extends BaseTagger { + + public final String getFileName() { + return JLanguageTool.getDataBroker().getResourceDir() + "/sk/slovak.dict"; + } + + public SlovakTagger() { + super(); + setLocale(new Locale("sk")); + } +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/sv/SwedishTagger.java b/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/sv/SwedishTagger.java new file mode 100644 index 0000000..78bad25 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/sv/SwedishTagger.java @@ -0,0 +1,39 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2006 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.tagging.sv; + +import java.util.Locale; + +import de.danielnaber.languagetool.JLanguageTool; +import de.danielnaber.languagetool.tagging.BaseTagger; + +/** Swedish Part-of-speech tagger. + * Based on DSSO. Will be expanded upon. + */ +public class SwedishTagger extends BaseTagger { + + public final String getFileName() { + return JLanguageTool.getDataBroker().getResourceDir() + "/sv/swedish.dict"; + } + + public SwedishTagger() { + super(); + setLocale(new Locale("sv")); + } +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/uk/IPOSTag.java b/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/uk/IPOSTag.java new file mode 100644 index 0000000..d31cd4d --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/uk/IPOSTag.java @@ -0,0 +1,30 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2007 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ + +package de.danielnaber.languagetool.tagging.uk; + +public interface IPOSTag { + + public static final String TAG_ADJ = "adj"; + public static final String TAG_NOUN = "noun"; + public static final String TAG_PLURAL = "plural"; + public static final String TAG_VERB = "verb"; + public static final String TAG_REFL = "refl"; + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/uk/UkrainianMorfoTagger.java b/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/uk/UkrainianMorfoTagger.java new file mode 100644 index 0000000..350b4ba --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/uk/UkrainianMorfoTagger.java @@ -0,0 +1,35 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2006 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.tagging.uk; + +import de.danielnaber.languagetool.JLanguageTool; +import de.danielnaber.languagetool.tagging.BaseTagger; + +/** Ukrainian Part-of-speech tagger. 
+ * + * @author Adriy Rysin + */ +public class UkrainianMorfoTagger extends BaseTagger { + + private static final String RESOURCE_FILENAME = "/uk/ukrainian.dict"; + + public final String getFileName() { + return JLanguageTool.getDataBroker().getResourceDir() + RESOURCE_FILENAME; + } +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/uk/UkrainianMyspellTagger.java b/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/uk/UkrainianMyspellTagger.java new file mode 100644 index 0000000..aa6bf8e --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/uk/UkrainianMyspellTagger.java @@ -0,0 +1,149 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2006 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.tagging.uk; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.nio.charset.Charset; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; + +import de.danielnaber.languagetool.AnalyzedToken; +import de.danielnaber.languagetool.AnalyzedTokenReadings; +import de.danielnaber.languagetool.JLanguageTool; +import de.danielnaber.languagetool.tagging.Tagger; + +/** + * Ukrainian Part-of-speech tagger. This class uses myspell uk_UA.dic dictionary + * file to assign tags to words. It only supports lemmas and three main parts of + * speech: noun, verb and adjective + * + * @author Adriy Rysin + */ +public class UkrainianMyspellTagger implements Tagger { + + private static final String RESOURCE_FILENAME = "/uk/ukrainian.dict"; + + // private Lametyzator morfologik = null; + private HashMap<String, String[]> wordsToPos; + + + public final List<AnalyzedTokenReadings> tag(final List<String> sentenceTokens) + throws IOException { + + final List<AnalyzedTokenReadings> tokenReadings = new ArrayList<AnalyzedTokenReadings>(); + int pos = 0; + // caching Lametyzator instance - lazy init + // if (morfologik == null) { + // File resourceFile = JLanguageTool.getAbsoluteFile(JLanguageTool.getDataBroker().getResourceDir() + RESOURCE_FILENAME); + // morfologik = new + // Lametyzator(Tools.getInputStream(resourceFile.getAbsolutePath()), + // "utf-8", '+'); + // } + if (wordsToPos == null) { + wordsToPos = new HashMap<String, String[]>(); + final InputStream resourceFile = JLanguageTool.getDataBroker().getFromResourceDirAsStream(RESOURCE_FILENAME); + // System.err.println("reading dict: " + resourceFile); + + final 
BufferedReader input = new BufferedReader(new InputStreamReader( + resourceFile, Charset.forName("UTF-8"))); + + String line; + while ((line = input.readLine()) != null) { + line = line.trim(); + if (line.matches("^[0-9]") || line.length() == 0) { + continue; + } + + final String[] wrd = line.split("/"); + if (wrd.length > 1) { + final String flags = wrd[1]; + final List<String> posTags = new ArrayList<String>(); + + if (flags.matches("[abcdefghijklmnop]+")) { + posTags.add(IPOSTag.TAG_NOUN); + if (flags.equals("b")) { + posTags.add(IPOSTag.TAG_PLURAL); + } + } else if (flags.matches("[ABCDEFGHIJKLMN]+")) { + posTags.add(IPOSTag.TAG_VERB); + if (flags.matches("^[BDFHJLN]+")) { + posTags.add(IPOSTag.TAG_REFL); + } + } else if (flags.matches("[UV]+")) { + posTags.add(IPOSTag.TAG_ADJ); + } + + if (posTags.size() > 0) { + wordsToPos.put(wrd[0], posTags.toArray(new String[0])); + } + } + } + // System.err.println("POSed words: " + wordsToPos.size()); + input.close(); + } + + for (final String word : sentenceTokens) { + final List<AnalyzedToken> analyzedTokens = new ArrayList<AnalyzedToken>(); + + final String[] posTags = wordsToPos.get(word); + String[] lowerPosTags = null; + + if (posTags != null) { + for (String posTag : posTags) + analyzedTokens.add(new AnalyzedToken(word, posTag, word)); + } else { + final String lowerWord = word.toLowerCase(); + if (!word.equals(lowerWord)) { + lowerPosTags = wordsToPos.get(lowerWord); + if (lowerPosTags != null) { + for (String lowerPosTag : lowerPosTags) + analyzedTokens.add(new AnalyzedToken(lowerWord, lowerPosTag, + lowerWord)); + } + } + // else { + // analyzedTokens.add(new AnalyzedToken(word, null, word)); + // } + } + + if (posTags == null && lowerPosTags == null) { + analyzedTokens.add(new AnalyzedToken(word, null, null)); + } + + tokenReadings.add(new AnalyzedTokenReadings(analyzedTokens + .toArray(new AnalyzedToken[analyzedTokens.size()]), pos)); + pos += word.length(); + } + + return tokenReadings; + } + + public final 
AnalyzedTokenReadings createNullToken(final String token, final int startPos) { + return new AnalyzedTokenReadings(new AnalyzedToken(token, null, null), startPos); + } + + public AnalyzedToken createToken(String token, String posTag) { + return new AnalyzedToken(token, posTag, null); + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/uk/UkrainianTagger.java b/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/uk/UkrainianTagger.java new file mode 100644 index 0000000..5785766 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/tagging/uk/UkrainianTagger.java @@ -0,0 +1,26 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2007 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
/**
 * Ukrainian tagger. Declares no members of its own; all behavior is
 * inherited from {@link UkrainianMyspellTagger}.
 */
public class UkrainianTagger extends UkrainianMyspellTagger implements Tagger {

}
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ + +/* + * Created on 23.12.2005 + */ +package de.danielnaber.languagetool.tagging.xx; + +import java.util.ArrayList; +import java.util.List; + +import de.danielnaber.languagetool.AnalyzedToken; +import de.danielnaber.languagetool.AnalyzedTokenReadings; +import de.danielnaber.languagetool.tagging.Tagger; + +/** + * A trivial tagger that does nothing than assign null + * tags to words. + * + * @author Daniel Naber + */ +public class DemoTagger implements Tagger { + + public List<AnalyzedTokenReadings> tag(final List<String> sentenceTokens) { + + final List<AnalyzedTokenReadings> tokenReadings = new ArrayList<AnalyzedTokenReadings>(); + int pos = 0; + for (String word : sentenceTokens) { + final List<AnalyzedToken> l = new ArrayList<AnalyzedToken>(); + // a real tagger would need to assign a POS tag + // in the next line instead of null: + l.add(new AnalyzedToken(word, null, null)); + pos += word.length(); + tokenReadings.add(new AnalyzedTokenReadings(l.toArray(new AnalyzedToken[0]), 0)); + } + return tokenReadings; + } + + public AnalyzedTokenReadings createNullToken(String token, int startPos) { + return new AnalyzedTokenReadings(new AnalyzedToken(token, null, null), startPos); + } + + public AnalyzedToken createToken(String token, String posTag) { + return new AnalyzedToken(token, posTag, null); + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/tokenizers/SRXSentenceTokenizer.java b/JLanguageTool/src/java/de/danielnaber/languagetool/tokenizers/SRXSentenceTokenizer.java new file mode 100644 index 0000000..dc11420 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/tokenizers/SRXSentenceTokenizer.java @@ -0,0 +1,99 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2009 Daniel 
Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.tokenizers; + +import java.io.BufferedReader; +import java.io.InputStreamReader; +import java.util.ArrayList; +import java.util.List; + +import net.sourceforge.segment.TextIterator; +import net.sourceforge.segment.srx.SrxDocument; +import net.sourceforge.segment.srx.SrxParser; +import net.sourceforge.segment.srx.SrxTextIterator; +import net.sourceforge.segment.srx.io.Srx2Parser; +import de.danielnaber.languagetool.JLanguageTool; + +/** + * Class to tokenize sentences using an SRX file. 
+ * + * @author Marcin Miłkowski + * + */ +public class SRXSentenceTokenizer extends SentenceTokenizer { + + private BufferedReader srxReader; + private final SrxDocument document; + private final String language; + private String parCode; + + static final String RULES = "/segment.srx"; + + public SRXSentenceTokenizer(final String language) { + this.language = language; + try { + srxReader = new BufferedReader(new InputStreamReader( + JLanguageTool.getDataBroker().getFromResourceDirAsStream(RULES), "utf-8")); + } catch (Exception e) { + throw new RuntimeException("Could not load rules " + RULES + " from resource dir " + + JLanguageTool.getDataBroker().getResourceDir()); + } + final SrxParser srxParser = new Srx2Parser(); + document = srxParser.parse(srxReader); + setSingleLineBreaksMarksParagraph(false); + } + + @Override + public final List<String> tokenize(final String text) { + final List<String> segments = new ArrayList<String>(); + final TextIterator textIterator = new SrxTextIterator(document, language + + parCode, text); + while (textIterator.hasNext()) { + segments.add(textIterator.next()); + } + return segments; + } + + public final boolean singleLineBreaksMarksPara() { + return "_one".equals(parCode); + } + + /** + * @param lineBreakParagraphs + * if <code>true</code>, single lines breaks are assumed to end a + * paragraph, with <code>false</code>, only two ore more consecutive + * line breaks end a paragraph + */ + public final void setSingleLineBreaksMarksParagraph( + final boolean lineBreakParagraphs) { + if (lineBreakParagraphs) { + parCode = "_one"; + } else { + parCode = "_two"; + } + } + + protected final void finalize() throws Throwable { + if (srxReader != null) { + srxReader.close(); + } + super.finalize(); + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/tokenizers/SentenceTokenizer.java b/JLanguageTool/src/java/de/danielnaber/languagetool/tokenizers/SentenceTokenizer.java new file mode 100644 index 0000000..55d1ec6 --- 
/dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/tokenizers/SentenceTokenizer.java @@ -0,0 +1,250 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.tokenizers; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashSet; +import java.util.List; +import java.util.Set; +import java.util.StringTokenizer; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/** + * Tokenizes text into sentences by looking for typical end-of-sentence markers, + * but considering exceptions (e.g. abbreviations). 
+ * + * @author Daniel Naber + */ +public class SentenceTokenizer implements Tokenizer { + + // end of sentence marker: + protected static final String EOS = "\0"; + //private final static String EOS = "#"; // for testing only + protected static final String P = "[\\.!?…]"; // PUNCTUATION + protected static final String AP = "(?:'|«|\"||\\)|\\]|\\})?"; // AFTER PUNCTUATION + protected static final String PAP = P + AP; + protected static final String PARENS = "[\\(\\)\\[\\]]"; // parentheses + + // Check out the private methods for comments and examples about these + // regular expressions: + + private Pattern paragraph; + private static final Pattern paragraphByTwoLineBreaks = Pattern.compile("([\\n\\r]\\s*[\\n\\r])"); + private static final Pattern paragraphByLineBreak = Pattern.compile("([\\n\\r])"); + + // add unbreakable field, for example footnote, if it's at the end of the sentence + private static final Pattern punctWhitespace = Pattern.compile("(" + PAP + "(\u0002)?\\s)"); + // \p{Lu} = uppercase, with obeying Unicode (\p{Upper} is just US-ASCII!): + private static final Pattern punctUpperLower = Pattern.compile("(" + PAP + + ")([\\p{Lu}][^\\p{Lu}.])"); + private static final Pattern letterPunct = Pattern.compile("(\\s[\\wüöäÜÖÄß]" + P + ")"); + private static final Pattern abbrev1 = Pattern.compile("([^-\\wüöäÜÖÄß][\\wüöäÜÖÄß]" + PAP + "\\s)" + EOS); + private static final Pattern abbrev2 = Pattern.compile("([^-\\wüöäÜÖÄß][\\wüöäÜÖÄß]" + P + ")" + EOS); + private static final Pattern abbrev3 = Pattern.compile("(\\s[\\wüöäÜÖÄß]\\.\\s+)" + EOS); + private static final Pattern abbrev4 = Pattern.compile("(\\.\\.\\. )" + EOS + "([\\p{Ll}])"); + private static final Pattern abbrev5 = Pattern.compile("(['\"]" + P + "['\"]\\s+)" + EOS); + private static final Pattern abbrev6 = Pattern.compile("([\"']\\s*)" + EOS + "(\\s*[\\p{Ll}])"); + private static final Pattern abbrev7 = Pattern.compile("(\\s" + PAP + "\\s)" + EOS); + // z.b. 3.10. 
(im Datum): + private static final Pattern abbrev8 = Pattern.compile("(\\d{1,2}\\.\\d{1,2}\\.\\s+)" + EOS); + private static final Pattern repair1 = Pattern.compile("('[\\wüöäÜÖÄß]" + P + ")(\\s)"); + private static final Pattern repair2 = Pattern.compile("(\\sno\\.)(\\s+)(?!\\d)"); + private static final Pattern repair3 = Pattern.compile("([ap]\\.m\\.\\s+)([\\p{Lu}])"); + + private static final Pattern repair10 = Pattern.compile("([\\(\\[])([!?]+)([\\]\\)]) " + EOS); + private static final Pattern repair11 = Pattern.compile("([!?]+)([\\)\\]]) " + EOS); + private static final Pattern repair12 = Pattern.compile("(" + PARENS + ") " + EOS); + + // some abbreviations: + private static final String[] ABBREV_LIST = { + // English -- but these work globally for all languages: + "Mr", "Mrs", "No", "pp", "St", "no", + "Sr", "Jr", "Bros", "etc", "vs", "esp", "Fig", "fig", "Jan", "Feb", "Mar", "Apr", "Jun", "Jul", + "Aug", "Sep", "Sept", "Oct", "Okt", "Nov", "Dec", "Ph.D", "PhD", + "al", // in "et al." + "cf", "Inc", "Ms", "Gen", "Sen", "Prof", "Corp", "Co" + }; + + private final Set<Pattern> abbreviationPatterns = new HashSet<Pattern>(); + + /** + * Month names like "Dezember" that should not be considered a sentence + * boundary in string like "13. Dezember". May also contain other + * words that indicate there's no sentence boundary when preceded + * by a number and a dot. + */ + protected String[] monthNames; + + /** + * Create a sentence tokenizer that uses the built-in abbreviations. + */ + public SentenceTokenizer() { + this(new String[]{}); + } + + /** + * Create a sentence tokenizer with the given list of abbreviations, + * additionally to the built-in ones. 
+ */ + public SentenceTokenizer(final String[] abbrevList) { + final List<String> allAbbreviations = new ArrayList<String>(); + allAbbreviations.addAll(Arrays.asList(abbrevList)); + allAbbreviations.addAll(Arrays.asList(ABBREV_LIST)); + for (String element : allAbbreviations) { + final Pattern pattern = Pattern.compile("(\\b" + element + PAP + "\\s)" + EOS); + abbreviationPatterns.add(pattern); + } + setSingleLineBreaksMarksParagraph(false); + } + + /** + * @param lineBreakParagraphs if <code>true</code>, single lines breaks are assumed to end a paragraph, + * with <code>false</code>, only two ore more consecutive line breaks end a paragraph + */ + public void setSingleLineBreaksMarksParagraph(final boolean lineBreakParagraphs) { + if (lineBreakParagraphs) { + paragraph = paragraphByLineBreak; + } else { + paragraph = paragraphByTwoLineBreaks; + } + } + + public boolean singleLineBreaksMarksPara() { + return paragraph == paragraphByLineBreak; + } + + /** + * Tokenize the given string to sentences. + */ + public List<String> tokenize(String s) { + s = firstSentenceSplitting(s); + s = removeFalseEndOfSentence(s); + s = splitUnsplitStuff(s); + final StringTokenizer stringTokenizer = + new StringTokenizer(s, EOS); + final List<String> l = new ArrayList<String>(); + while (stringTokenizer.hasMoreTokens()) { + final String sentence = stringTokenizer.nextToken(); + l.add(sentence); + } + return l; + } + + /** + * Add a special break character at all places with typical sentence delimiters. 
+ */ + private String firstSentenceSplitting(String s) { + // A paragraph break (as selected via setSingleLineBreaksMarksParagraph) means a new sentence: + s = paragraph.matcher(s).replaceAll("$1" + EOS); + // Punctuation followed by whitespace means a new sentence: + s = punctWhitespace.matcher(s).replaceAll("$1" + EOS); + // New (compared to the perl module): Punctuation followed by uppercase followed + // by non-uppercase character (except dot) means a new sentence: + s = punctUpperLower.matcher(s).replaceAll("$1" + EOS + "$2"); + // Break also when single letter comes before punctuation: + s = letterPunct.matcher(s).replaceAll("$1" + EOS); + return s; + } + + /** + * Repair some positions that don't require a split, i.e. remove the special break character at + * those positions. Subclasses such as GermanSentenceTokenizer override this to add their own repairs. + * + * @param s the text with EOS markers as produced by firstSentenceSplitting() + * @return the text with the EOS markers at false sentence ends removed + */ + protected String removeFalseEndOfSentence(String s) { + // Don't split at e.g. "U. S. A.": + s = abbrev1.matcher(s).replaceAll("$1"); + // Don't split at e.g. "U.S.A.": + s = abbrev2.matcher(s).replaceAll("$1"); + // Don't split after a white-space followed by a single letter followed + // by a dot followed by another whitespace. + // e.g. " p. " + s = abbrev3.matcher(s).replaceAll("$1"); + // Don't split at "bla bla... yada yada" (TODO: use \.\.\.\s+ instead?) + s = abbrev4.matcher(s).replaceAll("$1$2"); + // Don't split [.?!] when they're quoted: + s = abbrev5.matcher(s).replaceAll("$1"); + + // Don't split at abbreviations: + for (final Pattern abbrevPattern : abbreviationPatterns) { + final Matcher matcher = abbrevPattern.matcher(s); + s = matcher.replaceAll("$1"); + } + // Don't break after quote unless there's a capital letter: + // e.g.: "That's right!" he said. + s = abbrev6.matcher(s).replaceAll("$1$2"); + + // fixme? not sure where this should occur, leaving it commented out: + // don't break: text . . some more text. + // text=~s/(\s\.\s)$EOS(\s*)/$1$2/sg; + + // e.g. "Das ist . so." -> assume one sentence + s = abbrev7.matcher(s).replaceAll("$1"); + + // e.g. a date like "3.10. " 
-> assume one sentence + s = abbrev8.matcher(s).replaceAll("$1"); + + // extension by dnaber --commented out, doesn't help: + // text = re.compile("(:\s+)%s(\s*[%s])" % (self.EOS, string.lowercase), + // re.DOTALL).sub("\\1\\2", text) + + // "13. Dezember" etc. -> no sentence boundary. + // NOTE(review): String.replaceAll compiles a new regex for every entry on every call. + if (monthNames != null) { + for (String element : monthNames) { + s = s.replaceAll("(\\d+\\.) " + EOS + "(" + element + ")", "$1 $2"); + } + } + + // e.g. "Das hier ist ein(!) Satz." + s = repair10.matcher(s).replaceAll("$1$2$3 "); + + // e.g. "Das hier ist (genau!) ein Satz." + s = repair11.matcher(s).replaceAll("$1$2 "); + + // e.g. "bla (...) blubb" -> no end of sentence + s = repair12.matcher(s).replaceAll("$1 "); + + return s; + } + + /** + * Treat some more special cases that make up a sentence boundary. Insert the special break + * character at these positions. + */ + private String splitUnsplitStuff(String s) { + // e.g. "x5. bla..." -- not sure, leaving commented out: + // text = re.compile("(\D\d+)(%s)(\s+)" % self.P, re.DOTALL).sub("\\1\\2%s\\3" % self.EOS, text) + // Not sure about this one, leaving it out for now: + // text = re.compile("(%s\s)(\s*\()" % self.PAP, re.DOTALL).sub("\\1%s\\2" % self.EOS, text) + // Split e.g.: He won't. #Really. + s = repair1.matcher(s).replaceAll("$1" + EOS + "$2"); + // Split e.g.: He won't say no. Not really. + s = repair2.matcher(s).replaceAll("$1" + EOS + "$2"); + // Split at "a.m." or "p.m." followed by a capital letter. + s = repair3.matcher(s).replaceAll("$1" + EOS + "$2"); + return s; + } + + /*public static void main(final String[] args) { + final SentenceTokenizer st = new GermanSentenceTokenizer(); + st.tokenize("Er sagte (...) 
und"); + }*/ + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/tokenizers/Tokenizer.java b/JLanguageTool/src/java/de/danielnaber/languagetool/tokenizers/Tokenizer.java new file mode 100644 index 0000000..9a49fbe --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/tokenizers/Tokenizer.java @@ -0,0 +1,32 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.tokenizers; + +import java.util.List; + +/** + * Interface for classes that tokenize text into smaller units. 
+ * + * @author Daniel Naber + */ +public interface Tokenizer { + + public abstract List<String> tokenize(String text); + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/tokenizers/WordTokenizer.java b/JLanguageTool/src/java/de/danielnaber/languagetool/tokenizers/WordTokenizer.java new file mode 100644 index 0000000..6764c34 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/tokenizers/WordTokenizer.java @@ -0,0 +1,59 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.tokenizers; + +import java.util.ArrayList; +import java.util.List; +import java.util.StringTokenizer; + +/** + * Tokenizes a sentence into words. + * Punctuation and whitespace gets its own token. 
+ * + * @author Daniel Naber + */ +public class WordTokenizer implements Tokenizer { + + public WordTokenizer() { + } + + /** + * Splits the text at every character contained in the delimiter string below. The delimiters + * are returned as well, so each delimiter character becomes a one-character token of its own. + * + * @param text the text to tokenize + * @return the tokens in text order + */ + public List<String> tokenize(final String text) { + final List<String> l = new ArrayList<String>(); + final StringTokenizer st = new StringTokenizer(text, + "\u0020\u00A0\u115f\u1160\u1680" + + "\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007" + + "\u2008\u2009\u200A\u200B\u200c\u200d\u200e\u200f" + + "\u2028\u2029\u202a\u202b\u202c\u202d\u202e\u202f" + + "\u205F\u2060\u2061\u2062\u2063\u206A\u206b\u206c\u206d" + + "\u206E\u206F\u3000\u3164\ufeff\uffa0\ufff9\ufffa\ufffb" + + ",.;()[]{}<>!?:/\\\"'«»„”“‘`’…¿¡\t\n\r", true); + while (st.hasMoreElements()) { + l.add(st.nextToken()); + } + return l; + } + +} + + + + + + +
\ No newline at end of file diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/tokenizers/cs/CzechSentenceTokenizer.java b/JLanguageTool/src/java/de/danielnaber/languagetool/tokenizers/cs/CzechSentenceTokenizer.java new file mode 100644 index 0000000..2f0a4f4 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/tokenizers/cs/CzechSentenceTokenizer.java @@ -0,0 +1,228 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2007 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ + +/* + * CzechSentenceTokenizer.java + * + * Created on 25.1.2007, 11:45 + */ + +package de.danielnaber.languagetool.tokenizers.cs; + +import java.util.ArrayList; +import java.util.List; +import java.util.StringTokenizer; +import java.util.regex.Pattern; + +import de.danielnaber.languagetool.tokenizers.SentenceTokenizer; + +/** + * + * @author Jozef Licko + */ +public class CzechSentenceTokenizer extends SentenceTokenizer { + + // End of sentence marker. + private static final String EOS = "\0"; + + // private final static String EOS = "#"; // for testing only + + // Punctuation. + private static final String P = "[\\.!?…]"; + + // After punctuation. 
+ private static final String AP = "(?:'|«|\"|”|\\)|\\]|\\})?"; + + private static final String PAP = P + AP; + + // Check out the private methods for comments and examples about these + // regular expressions: + + private static final Pattern paragraphByTwoLineBreaks = Pattern.compile("(\\n\\s*\\n)"); + + private static final Pattern paragraphByLineBreak = Pattern.compile("(\\n)"); + + // add unbreakable field, for example footnote, if it's at the end of the sentence + private static final Pattern punctWhitespace = Pattern.compile("(" + PAP + "(\u0002)?\\s)"); + + // \p{Lu} = uppercase, with obeying Unicode (\p{Upper} is just US-ASCII!): + private static final Pattern punctUpperLower = Pattern.compile("(" + PAP + + ")([\\p{Lu}][^\\p{Lu}.])"); + + private static final Pattern letterPunct = Pattern + .compile("(\\s[\\wáčďéěíňóřšťúůýžÁČĎÉĚÍŇÓŘŠŤÚŮÝŽ]" + P + ")"); + + private static final Pattern abbrev1 = Pattern + .compile("([^-\\wáčďéěíňóřšťúůýžÁČĎÉĚÍŇÓŘŠŤÚŮÝŽ][\\wáčďéěíňóřšťúůýžÁČĎÉĚÍŇÓŘŠŤÚŮÝŽ]" + PAP + + "\\s)" + EOS); + + private static final Pattern abbrev2 = Pattern + .compile("([^-\\wáčďéěíňóřšťúůýžÁČĎÉĚÍŇÓŘŠŤÚŮÝŽ][\\wáčďéěíňóřšťúůýžÁČĎÉĚÍŇÓŘŠŤÚŮÝŽ]" + P + + ")" + EOS); + + private static final Pattern abbrev3 = Pattern + .compile("(\\s[\\wáčďéěíňóřšťúůýžÁČĎÉĚÍŇÓŘŠŤÚŮÝŽ]\\.\\s+)" + EOS); + + private static final Pattern abbrev4 = Pattern.compile("(\\.\\.\\. )" + EOS + "([\\p{Ll}])"); + private static final Pattern abbrev5 = Pattern.compile("(['\"]" + P + "['\"]\\s+)" + EOS); + private static final Pattern abbrev6 = Pattern.compile("([\"']\\s*)" + EOS + "(\\s*[\\p{Ll}])"); + private static final Pattern abbrev7 = Pattern.compile("(\\s" + PAP + "\\s)" + EOS); + // z.b. 3.10. 
(im Datum): + private static final Pattern abbrev8 = Pattern.compile("(\\d{1,2}\\.\\d{1,2}\\.\\s+)" + EOS); + private static final Pattern repair1 = Pattern.compile("('[\\wáčďéěíňóřšťúůýžÁČĎÉĚÍŇÓŘŠŤÚŮÝŽ]" + + P + ")(\\s)"); + private static final Pattern repair2 = Pattern.compile("(\\sno\\.)(\\s+)(?!\\d)"); + + // Czech abbreviations (ver. 0.2) + + // various titles + private static final String TITLES = "Bc|BcA|Ing|Ing.arch|MUDr|MVDr|MgA|Mgr|JUDr|PhDr|" + + "RNDr|PharmDr|ThLic|ThDr|Ph.D|Th.D|prof|doc|CSc|DrSc|dr. h. c|PaedDr|Dr|PhMr|DiS"; + + // as a single regexp: + private static final String ABBREVIATIONS = "abt|ad|a.i|aj|angl|anon|apod|atd|atp|aut|bd|biogr|" + + "b.m|b.p|b.r|cca|cit|cizojaz|c.k|col|čes|čín|čj|ed|facs|fasc|fol|fot|franc|h.c|hist|hl|" + + "hrsg|ibid|il|ind|inv.č|jap|jhdt|jv|koed|kol|korej|kl|krit|lat|lit|m.a|maď|mj|mp|násl|" + + "např|nepubl|něm|no|nr|n.s|okr|odd|odp|obr|opr|orig|phil|pl|pokrač|pol|port|pozn|př.kr|" + + "př.n.l|přel|přeprac|příl|pseud|pt|red|repr|resp|revid|rkp|roč|roz|rozš|samost|sect|" + + "sest|seš|sign|sl|srv|stol|sv|šk|šk.ro|špan|tab|t.č|tis|tj|tř|tzv|univ|uspoř|vol|" + + "vl.jm|vs|vyd|vyobr|zal|zejm|zkr|zprac|zvl|n.p" + + "|" + TITLES; + + private Pattern paragraph; + + /** + * Create a sentence tokenizer. 
+ */ + public CzechSentenceTokenizer() { + setSingleLineBreaksMarksParagraph(false); + } + + /** + * Selects the pattern that firstSentenceSplitting() uses to detect paragraph ends. + * NOTE(review): this assigns the <code>paragraph</code> field declared in this class; check that the + * inherited singleLineBreaksMarksPara() still reports the mode set here. + * + * @param lineBreakParagraphs if <code>true</code>, single line breaks are assumed to end a paragraph, + * with <code>false</code>, only two or more consecutive line breaks end a paragraph + */ + public final void setSingleLineBreaksMarksParagraph(final boolean lineBreakParagraphs) { + if (lineBreakParagraphs) + paragraph = paragraphByLineBreak; + else + paragraph = paragraphByTwoLineBreaks; + } + + /** + * Tokenize the given string to sentences: runs this class's firstSentenceSplitting(), + * removeFalseEndOfSentence() and splitUnsplitStuff() and splits the result at the EOS markers. + * + * @param s the text to split + * @return the sentences in text order, without the EOS markers + */ + public final List<String> tokenize(String s) { + s = firstSentenceSplitting(s); + s = removeFalseEndOfSentence(s); + s = splitUnsplitStuff(s); + final StringTokenizer stringTokenizer = + new StringTokenizer(s, EOS); + List<String> l = new ArrayList<String>(); + while (stringTokenizer.hasMoreTokens()) { + String sentence = stringTokenizer.nextToken(); + l.add(sentence); + } + return l; + } + + /** + * Add a special break character at all places with typical sentence delimiters. + */ + private String firstSentenceSplitting(String s) { + // A paragraph break (as selected via setSingleLineBreaksMarksParagraph) means a new sentence: + s = paragraph.matcher(s).replaceAll("$1" + EOS); + // Punctuation followed by whitespace means a new sentence: + s = punctWhitespace.matcher(s).replaceAll("$1" + EOS); + // New (compared to the perl module): Punctuation followed by uppercase followed + // by non-uppercase character (except dot) means a new sentence: + s = punctUpperLower.matcher(s).replaceAll("$1" + EOS + "$2"); + // Break also when single letter comes before punctuation: + s = letterPunct.matcher(s).replaceAll("$1" + EOS); + return s; + } + + /** + * Repair some positions that don't require a split, i.e. remove the special break character at + * those positions. + * + * @param s the text with EOS markers as produced by firstSentenceSplitting() + * @return the text with the EOS markers at false sentence ends removed + */ + protected String removeFalseEndOfSentence(String s) { + // Don't split at e.g. "U. S. A.": + s = abbrev1.matcher(s).replaceAll("$1"); + // Don't split at e.g. 
"U.S.A.": + s = abbrev2.matcher(s).replaceAll("$1"); + // Don't split after a white-space followed by a single letter followed + // by a dot followed by another whitespace. + // e.g. " p. " + s = abbrev3.matcher(s).replaceAll("$1"); + // Don't split at "bla bla... yada yada" (TODO: use \.\.\.\s+ instead?) + s = abbrev4.matcher(s).replaceAll("$1$2"); + // Don't split [.?!] when they're quoted: + s = abbrev5.matcher(s).replaceAll("$1"); + + // Don't split at abbreviations, treat them case insensitive + // NOTE(review): only (?u) (UNICODE_CASE) is set below, not (?i); as written the match looks case-sensitive -- confirm. + //TODO: don't split at some abbreviations followed by uppercase + //E.g., "Wojna rozpoczęła się w 1918 r. To była krwawa jatka" + //should be split at "r."... But + //"Ks. Jankowski jest analfabetą" shouldn't be split... + //this requires a special list of abbrevs used before names etc. + + //removing the loop and using only one regexp - this is definitely much, much faster + // NOTE(review): the pattern is built from constants only but is recompiled on every call; + // the dots inside the ABBREVIATIONS entries are unescaped and therefore match any character. + Pattern pattern = Pattern.compile("(?u)(\\b(" + ABBREVIATIONS + ")" + PAP + "\\s)" + EOS); + s = pattern.matcher(s).replaceAll("$1"); + + // Don't break after quote unless there's a capital letter: + // e.g.: "That's right!" he said. + s = abbrev6.matcher(s).replaceAll("$1$2"); + + // fixme? not sure where this should occur, leaving it commented out: + // don't break: text . . some more text. + // text=~s/(\s\.\s)$EOS(\s*)/$1$2/sg; + + // e.g. "Das ist . so." -> assume one sentence + s = abbrev7.matcher(s).replaceAll("$1"); + + // e.g. a date like "3.10. " -> assume one sentence + s = abbrev8.matcher(s).replaceAll("$1"); + + // extension by dnaber --commented out, doesn't help: + // text = re.compile("(:\s+)%s(\s*[%s])" % (self.EOS, string.lowercase), + // re.DOTALL).sub("\\1\\2", text) + + // Don't split after a number and a dot when a run of non-uppercase letters follows: + s = s.replaceAll("(\\d+\\.) " + EOS + "([\\p{L}&&[^\\p{Lu}]]+)", "$1 $2"); + + // e.g. "Das hier ist ein(!) Satz." + s = s.replaceAll("\\(([!?]+)\\) " + EOS, "($1) "); + return s; + } + + /** + * Treat some more special cases that make up a sentence boundary. Insert the special break + * character at these positions. 
+ */ + private String splitUnsplitStuff(String s) { + // e.g. "x5. bla..." -- not sure, leaving commented out: + // text = re.compile("(\D\d+)(%s)(\s+)" % self.P, re.DOTALL).sub("\\1\\2%s\\3" % self.EOS, text) + // Not sure about this one, leaving it out for now: + // text = re.compile("(%s\s)(\s*\()" % self.PAP, re.DOTALL).sub("\\1%s\\2" % self.EOS, text) + // Split e.g.: He won't. #Really. + s = repair1.matcher(s).replaceAll("$1" + EOS + "$2"); + // Split e.g.: He won't say no. Not really. + s = repair2.matcher(s).replaceAll("$1" + EOS + "$2"); + return s; + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/tokenizers/da/DanishSentenceTokenizer.java b/JLanguageTool/src/java/de/danielnaber/languagetool/tokenizers/da/DanishSentenceTokenizer.java new file mode 100644 index 0000000..32db6d5 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/tokenizers/da/DanishSentenceTokenizer.java @@ -0,0 +1,43 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2006 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.tokenizers.da; + +import de.danielnaber.languagetool.tokenizers.SentenceTokenizer; + +/** + * @deprecated use {@code new SRXSentenceTokenizer("da")} instead + * @author Daniel Naber + */ +public class DanishSentenceTokenizer extends SentenceTokenizer { + + private static final String[] ABBREV_LIST = { +"abs", "abstr", "adj", "adm", "adr", "adv", "afd", "afg", "afl", "afs", "afvig", "agro", "akad", "akk", "allr", "alm", "amer", "anat", "ang", "anm", "anv", "apot", "appos", "apr", "arab", "arkais", "arkæol", "arp", "arr", "art", "ass", "astr", "att", "attrib", "aud", "aug", "aut", "bag", "barb", "barnespr", "bd", "bdt", "beg", "besl", "best", "bet", "bhk", "biavl", "bibet", "bibl", "bibliot", "billard", "billedl", "biol", "bjergv", "bk", "bl", "bogb", "bogh", "bogtr", "bornh", "bot", "br", "bryg", "bto", "bygn", "bødk", "ca", "cand", "Chr", "cirk", "cit", "co", "d", "da", "dagl", "dans", "dat", "dec", "def", "demonstr", "dep", "dial", "diam", "dim", "disp", "distr", "distrib", "dobb", "dr", "dvs", "e", "egl", "eks", "eksam", "ekskl", "eksp", "ekspl", "el", "ell", "ellipt", "emb", "endv", "eng", "enk", "ent", "etc", "etnogr", "eufem", "eur", "event", "evt", "f", "fagl", "fakt", "farv", "feb", "ff", "fhv", "fig", "filos", "fisk", "fk", "fl", "flg", "flt", "flyv", "fmd", "fon", "foragt", "forb", "foreg", "forf", "forsikr", "forsk", "forst", "foræld", "fot", "fr", "fre", "fris", "frk", "fsv", "fuldm", "fx", "fys", "fysiol", "fægt", "gart", "gartn", "garv", "gdr", "gen", "genopt", "geogr", "geol", "geom", "germ", "gl", "glarm", "glda", "gldgs", "glholl", "glno", "gns", "got", "gr", "gradbøjn", "gram", "gross", "grundbet", "græc", "guldsm", "gym", "h", "hat", "hd", "hebr", "henh", "hensobj", "herald", "hhv", 
"hist", "hj", "holl", "hovedbet", "hr", "hty", "højtid", "haandarb", "haandv", "i", "if", "ifm", "ift", "iht", "imp", "indb", "indik", "inf", "ing", "Inkl", "inkl", "insp", "instr", "interj", "intk", "intr", "iron", "isl", "ital", "jan", "jarg", "jf", "jnr", "jr", "jul", "jun", "jur", "jy", "jæg", "jærnb", "jød", "Kbh", "kbh", "kem", "kgl", "kirk", "kl", "kld", "knsp", "kog", "koll", "komm", "komp", "konj", "konkr", "kons", "Kr", "kr", "kurv", "kvt", "køkkenspr", "l", "landbr", "landmaaling", "lat", "lb", "lic", "lign", "litt", "Ll", "log", "Loll", "loll", "lrs", "lør", "m", "maj", "maks", "mal", "man", "mar", "mat", "mdl", "mdr", "med", "medl", "meng", "merc", "meteorol", "meton", "metr", "mf", "mfl", "mht", "mia", "min", "mineral", "mio", "ml", "mlat", "mm", "mnt", "mods", "modsætn", "modt", "mr", "mrk", "mur", "mv", "mvh", "mytol", "møl", "mønt", "n", "naturv", "ndf", "Ndr", "nedsæt", "nht", "no", "nom", "nov", "nr", "nt", "num", "nyda", "nydann", "nylat", "naal", "obj", "obl", "oblik", "obs", "odont", "oecon", "oeng", "ofl", "ogs", "oht", "okt", "oldfr", "oldfris", "oldn", "olgn", "omg", "omkr", "omtr", "ons", "opr", "ordspr", "org", "osax", "osv", "ovenst", "overf", "overs", "ovf", "p", "pag", "part", "pass", "pct", "perf", "pga", "ph", "pharm", "phil", "pk", "pkt", "pl", "plur", "poet", "polit", "port", "poss", "post", "pott", "pr", "pron", "propr", "prov", "præd", "præp", "præs", "præt", "psych", "pt", "pæd", "paavirkn", "reb", "ref", "refl", "regn", "relat", "relig", "resp", "retor", "rid", "rigsspr", "run", "russ", "s", "sa", "sanskr", "scient", "sdjy", "sdr", "sek", "sen", "sep", "sept", "shetl", "sj", "sjæll", "skibsbygn", "sko", "skol", "skr", "skriftspr", "skræd", "Skt", "slagt", "slutn", "smed", "sml", "smsat", "smst", "snedk", "soldat", "sp", "spec", "sport", "spot", "spr", "sprogv", "spøg", "ssg", "ssgr", "st", "stk", "str", "stud", "subj", "subst", "superl", "sv", "sætn", "søn", "talem", "talespr", "tandl", "td", "tdl", "teat", "techn", "telef", 
"telegr", "teol", "th", "theol", "tir", "tirs", "tlf", "told", "tor", "tors", "trans", "tsk", "ty", "tyrk", "tøm", "u", "ubesl", "ubest", "udd", "udenl", "udg", "udtr", "uegl", "ugtl", "ult", "underbet", "undt", "univ", "upers", "ur", "urnord", "v", "var", "vbs", "vedk", "vedl", "vedr", "vejl", "verb", "vet", "vha", "vol", "vs", "vsa", "vulg", "væv", "zool", "æ", "æda", "ænht", "ænyd", "æstet", "ø", "å", "årg", "årh" + }; + + // Month names like "januar" that should not be considered a sentence + // boundary in string like "13. januar". + private static final String[] MONTH_NAMES = { "januar", "februar", "marts", "april", "maj", + "juni", "juli", "august", "september", "oktober", "november", "december" }; + + public DanishSentenceTokenizer() { + super(ABBREV_LIST); + super.monthNames = MONTH_NAMES; + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/tokenizers/de/GermanCompoundTokenizer.java b/JLanguageTool/src/java/de/danielnaber/languagetool/tokenizers/de/GermanCompoundTokenizer.java new file mode 100644 index 0000000..fb141a0 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/tokenizers/de/GermanCompoundTokenizer.java @@ -0,0 +1,47 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2007 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.tokenizers.de; + +import java.io.IOException; +import java.util.List; + +import de.abelssoft.wordtools.jWordSplitter.impl.GermanWordSplitter; +import de.danielnaber.languagetool.tokenizers.Tokenizer; + +/** + * Split German nouns using the jWordSplitter library. + * + * @author Daniel Naber + */ +public class GermanCompoundTokenizer implements Tokenizer { + + private final GermanWordSplitter wordSplitter; + + /** + * Creates the underlying GermanWordSplitter with strict mode enabled and a minimum word length of 3. + * + * @throws IOException declared because the GermanWordSplitter constructor is called here; + * presumably thrown when its data cannot be loaded -- TODO confirm + */ + public GermanCompoundTokenizer() throws IOException { + wordSplitter = new GermanWordSplitter(false); + wordSplitter.setStrictMode(true); // required for now to make minimum length work + wordSplitter.setMinimumWordLength(3); + } + + /** + * Splits a single word into its parts by delegating to GermanWordSplitter.splitWord(). + * NOTE(review): the result is cast to {@code List<String>} without a check. + * + * @param word the word to split + * @return the parts as returned by the word splitter + */ + public List<String> tokenize(String word) { + return (List<String>) wordSplitter.splitWord(word); + } + +} + diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/tokenizers/de/GermanSentenceTokenizer.java b/JLanguageTool/src/java/de/danielnaber/languagetool/tokenizers/de/GermanSentenceTokenizer.java new file mode 100644 index 0000000..31dab43 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/tokenizers/de/GermanSentenceTokenizer.java @@ -0,0 +1,95 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2006 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.tokenizers.de; + +import java.util.regex.Pattern; + +import de.danielnaber.languagetool.tokenizers.SentenceTokenizer; +import org.apache.commons.lang.ArrayUtils; + +/** + * @deprecated use {@code new SRXSentenceTokenizer("de")} instead + */ +public class GermanSentenceTokenizer extends SentenceTokenizer { + + private static final String[] ABBREV_LIST = { + "d", "Übers", "usw", "bzw", "Abh", "Abk", "Abt", "ahd", "Akk", + "allg", "alltagsspr", "altdt", "alttest", "amerikan", "Anh", + "Ank", "Anm", "Art", "Az", "Bat", "bayr", "Bd", "Bde", "Bed", + "Bem", "bes", "bez", "Bez", "Bhf", "bspw", "btto", "bw", "bzw", + "cts", "ct", "Ca", "ca", "chem", "chin", "Chr", "cresc", "dat", "Dat", + "desgl", "ders", "dgl", "Di", "Dipl", "Dir", "Do", "Doz", "Dr", + "dt", "ebd", "Ed", "eigtl", "engl", "Erg", "al", "etc", "etw", + "ev", "evtl", "Evtl", "exkl", "Expl", "Exz", "ff", "Fa", "fachspr", "fam", + "fem", "Fem", "Fr", "fr", "franz", "frz", "frdl", "Frl", + "Fut", "Gd", "geb", "gebr", "Gebr", "geh", "geh", "geleg", "gen", + "Gen", "germ", "gesch", "ges", "get", "ggf", "Ggs", "ggT", + "griech", "hebr", "hg", "Hrsg", "Hg", "hist", "hochd", "hochspr", + "Hptst", "Hr", "Allg", "ill", "inkl", "incl", "Ind", "Inf", "Ing", + "ital", "Tr", "Jb", "Jg", "Jh", "Jhd", "jmd", "jmdm", "jmdn", "jmds", + "jur", "Kap", "kart", "kath", "kfm", "kaufm", "Kfm", "kgl", + "Kl", "Konj", "Krs", "Kr", "Kto", "lat", "lfd", "Lit", "lt", + "Lz", "Mask", "mask", "Mrd", "mdal", "med", "met", "mhd", "Mi", + "Mio", "min", "Mo", "mod", "nachm", "nördlBr", "neutr", + "Nhd", "Nom", "Nr", "Nrn", "Num", "Obj", "od", "dgl", "offz", + "Part", "Perf", "Pers", "Pfd", "Pl", "Plur", + "pl", 
"Plusq", "Pos", "pp", "Präp", "Präs", "Prät", "Prov", "Prof", + "rd", "reg", "resp", "Rhld", "rit", "Sa", "südl", "Br", + "sel", "sen", "Sept", "Sing", "sign", "So", "sog", "Sp", "St", + "St", "St", "Std", "stacc", "Str", "stud", "Subst", "sva", "svw", + "sZ", "Tel", "Temp", "trans", "Tsd", "übertr", "übl", "ff", "ugs", "univ", + "urspr", "usw", "vgl", "Vol", "vorm", "vorm", "Vp", "Vs", + "vs", "wg", "Hd", "Ztr", "zus", "Zus", "zzt", "zz", "Zz", "Zt", + "Min", "bzgl"}; + + // einige deutsche Monate, vor denen eine Zahl erscheinen kann, + // ohne dass eine Satzgrenze erkannt wird (z.B. "am 13. Dezember" -> keine Satzgrenze) + private static final String[] MONTH_NAMES = { "Januar", "Februar", "März", "April", "Mai", + "Juni", "Juli", "August", "September", "Oktober", "November", "Dezember", + // ähnliche Fälle außerhalb der Monatsnamen: + "Jh", "Jhd", "Jahrhundert", "Jahrhunderts", "Geburtstag", "Geburtstags", "Platz", "Platzes"}; + + /** don't split at cases like "Friedrich II. wird auch..." */ + private static final Pattern REPAIR_NAME_PATTERN = Pattern.compile("( [IVX]+\\.) " + EOS + "([^A-ZÖÄÜ]+)"); + + /** don't split at cases like "im 13. oder 14. Jahrhundert" */ + private static final Pattern REPAIR_NUMBER_PATTERN = Pattern.compile("(\\d+\\.)(\\s+)" + EOS + "(und|oder|bis) "); + + public GermanSentenceTokenizer() { + super(ABBREV_LIST); + super.monthNames = MONTH_NAMES; + } + + /** + * Create a sentence tokenizer with the given list of abbreviations, + * additionally to the built-in ones. 
+ */ + public GermanSentenceTokenizer(final String[] abbrevList) { + super((String[]) ArrayUtils.addAll(ABBREV_LIST, abbrevList)); + super.monthNames = MONTH_NAMES; + } + + /** + * Applies the generic repairs of the superclass and then two German-specific ones: the EOS + * marker is removed after a Roman numeral as in "Friedrich II. wird auch..." and after a + * number followed by "und", "oder" or "bis" as in "im 13. oder 14. Jahrhundert". + * + * @param s the text containing EOS markers + * @return the text with the EOS markers at false sentence ends removed + */ + protected String removeFalseEndOfSentence(String s) { + s = super.removeFalseEndOfSentence(s); + s = REPAIR_NAME_PATTERN.matcher(s).replaceAll("$1 $2"); + s = REPAIR_NUMBER_PATTERN.matcher(s).replaceAll("$1$2$3 "); + return s; + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/tokenizers/en/EnglishWordTokenizer.java b/JLanguageTool/src/java/de/danielnaber/languagetool/tokenizers/en/EnglishWordTokenizer.java new file mode 100644 index 0000000..5b29d18 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/tokenizers/en/EnglishWordTokenizer.java @@ -0,0 +1,53 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.tokenizers.en; + +import java.util.ArrayList; +import java.util.List; +import java.util.StringTokenizer; + +import de.danielnaber.languagetool.tokenizers.Tokenizer; + +/** + * Tokenizes a sentence into words. Punctuation and whitespace gets its own token. 
+ * + * @author Daniel Naber + */ +public class EnglishWordTokenizer implements Tokenizer { + + public EnglishWordTokenizer() { + } + + /** + * Splits the text at every character contained in the delimiter string below. The delimiters + * are returned as well, so each delimiter character becomes a one-character token of its own. + * + * @param text the text to tokenize + * @return the tokens in text order + */ + public List<String> tokenize(final String text) { + final List<String> tokens = new ArrayList<String>(); + final StringTokenizer st = new StringTokenizer(text, + "\u0020\u00A0\u115f\u1160\u1680" + + "\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007" + + "\u2008\u2009\u200A\u200B\u200c\u200d\u200e\u200f" + + "\u2028\u2029\u202a\u202b\u202c\u202d\u202e\u202f" + + "\u205F\u2060\u2061\u2062\u2063\u206A\u206b\u206c\u206d" + + "\u206E\u206F\u3000\u3164\ufeff\uffa0\ufff9\ufffa\ufffb" + + ",.;()[]{}!?:\"'’‘„“”…\\/\t\n", true); + while (st.hasMoreElements()) { + tokens.add(st.nextToken()); + } + return tokens; + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/tokenizers/gl/GalicianWordTokenizer.java b/JLanguageTool/src/java/de/danielnaber/languagetool/tokenizers/gl/GalicianWordTokenizer.java new file mode 100644 index 0000000..6a1919e --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/tokenizers/gl/GalicianWordTokenizer.java @@ -0,0 +1,53 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.tokenizers.gl; + +import java.util.ArrayList; +import java.util.List; +import java.util.StringTokenizer; + +import de.danielnaber.languagetool.tokenizers.Tokenizer; + +/** + * Tokenizes a sentence into words. Punctuation and whitespace gets its own token. + * + * @author Daniel Naber + */ +public class GalicianWordTokenizer implements Tokenizer { + + public GalicianWordTokenizer() { + } + + public List<String> tokenize(final String text) { + final List<String> tokens = new ArrayList<String>(); + final StringTokenizer st = new StringTokenizer(text, + "\u0020\u00A0\u115f\u1160\u1680" + + "\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007" + + "\u2008\u2009\u200A\u200B\u200c\u200d\u200e\u200f" + + "\u2028\u2029\u202a\u202b\u202c\u202d\u202e\u202f" + + "\u205F\u2060\u2061\u2062\u2063\u206A\u206b\u206c\u206d" + + "\u206E\u206F\u3000\u3164\ufeff\uffa0\ufff9\ufffa\ufffb" + + ",.;()[]{}¿¡!?:\"'’‘„“”…\\/\t\n", true); + while (st.hasMoreElements()) { + tokens.add(st.nextToken()); + } + return tokens; + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/tokenizers/ml/MalayalamWordTokenizer.java b/JLanguageTool/src/java/de/danielnaber/languagetool/tokenizers/ml/MalayalamWordTokenizer.java new file mode 100644 index 0000000..ebd7ce3 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/tokenizers/ml/MalayalamWordTokenizer.java @@ -0,0 +1,55 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the 
License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.tokenizers.ml; + +import java.util.ArrayList; +import java.util.List; +import java.util.StringTokenizer; + +import de.danielnaber.languagetool.tokenizers.Tokenizer; + +/** + * Tokenizes a sentence into words. Punctuation and whitespace gets its own token. + * + * @author Daniel Naber + */ +public class MalayalamWordTokenizer implements Tokenizer { + + public MalayalamWordTokenizer() { + } + + public List<String> tokenize(final String text) { + final List<String> tokens = new ArrayList<String>(); + final StringTokenizer st = new StringTokenizer(text, + "\u0020\u00A0\u115f\u1160\u1680" + /** + + "\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007" + + "\u2008\u2009\u200A\u200B\u200c\u200d\u200e\u200f" + + "\u2028\u2029\u202a\u202b\u202c\u202d\u202e\u202f" + + "\u205F\u2060\u2061\u2062\u2063\u206A\u206b\u206c\u206d" + + "\u206E\u206F\u3000\u3164\ufeff\uffa0\ufff9\ufffa\ufffb" + **/ + + ",.;()[]{}!?:\"'’‘„“”…\\/\t\n", true); + while (st.hasMoreElements()) { + tokens.add(st.nextToken()); + } + return tokens; + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/tokenizers/nl/DutchWordTokenizer.java b/JLanguageTool/src/java/de/danielnaber/languagetool/tokenizers/nl/DutchWordTokenizer.java new file mode 100644 index 0000000..7b12536 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/tokenizers/nl/DutchWordTokenizer.java @@ -0,0 +1,53 @@ +/* LanguageTool, a natural 
language style checker + * Copyright (C) 2008 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ + +package de.danielnaber.languagetool.tokenizers.nl; + +import java.util.Arrays; +import java.util.List; + +import de.danielnaber.languagetool.tokenizers.WordTokenizer; + +public class DutchWordTokenizer extends WordTokenizer { + + public DutchWordTokenizer() { + } + + /** + * Tokenizes just like WordTokenizer with the exception for words such as + * "oma's" that contains an apostrophe in their middle. + * + * @param text + * - Text to tokenize + * @return List of tokens. + * + * Note: a special string ##NL_APOS## is used to replace apostrophe + * during tokenizing. 
+ */ + public List<String> tokenize(final String text) { + // TODO: find a cleaner implementation, this is a hack + final List<String> tokenList = super.tokenize(text.replaceAll( + "([\\p{L}])'([\\p{L}])", "$1##NL_APOS##$2")); + final String[] tokens = tokenList.toArray(new String[tokenList.size()]); + for (int i = 0; i < tokens.length; i++) { + tokens[i] = tokens[i].replace("##NL_APOS##", "'"); + } + return Arrays.asList(tokens); + } +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/tokenizers/ro/RomanianWordTokenizer.java b/JLanguageTool/src/java/de/danielnaber/languagetool/tokenizers/ro/RomanianWordTokenizer.java new file mode 100644 index 0000000..42fa76e --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/tokenizers/ro/RomanianWordTokenizer.java @@ -0,0 +1,56 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.tokenizers.ro; + +import java.util.ArrayList; +import java.util.List; +import java.util.StringTokenizer; + +import de.danielnaber.languagetool.tokenizers.Tokenizer; + +/** + * Tokenizes a sentence into words. Punctuation and whitespace gets its own + * token. 
Like EnglishWordTokenizer except for some characters: eg: "-' + * + * @author Ionuț Păduraru + * @since 20.02.2009 19:53:50 + */ +public class RomanianWordTokenizer implements Tokenizer { + + public RomanianWordTokenizer() { + } + + public List<String> tokenize(final String text) { + List<String> l = new ArrayList<String>(); + StringTokenizer st = new StringTokenizer( + text, + "\u0020\u00A0\u115f\u1160\u1680" + + "\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007" + + "\u2008\u2009\u200A\u200B\u200c\u200d\u200e\u200f" + + "\u2028\u2029\u202a\u202b\u202c\u202d\u202e\u202f" + + "\u205F\u2060\u2061\u2062\u2063\u206A\u206b\u206c\u206d" + + "\u206E\u206F\u3000\u3164\ufeff\uffa0\ufff9\ufffa\ufffb" + + ",.;()[]{}!?:\"'’‘„“”…\\/\t\n\r«»<>%°" + "-|=", true); + while (st.hasMoreElements()) { + l.add(st.nextToken()); + } + return l; + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/tools/.cvsignore b/JLanguageTool/src/java/de/danielnaber/languagetool/tools/.cvsignore new file mode 100644 index 0000000..b71c741 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/tools/.cvsignore @@ -0,0 +1 @@ +EnglishTaggerExtract.java diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/tools/ReflectionUtils.java b/JLanguageTool/src/java/de/danielnaber/languagetool/tools/ReflectionUtils.java new file mode 100644 index 0000000..9735cac --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/tools/ReflectionUtils.java @@ -0,0 +1,232 @@ +/* ReflectionUtils, helper methods to load classes dynamically + * Copyright (C) 2007 Andriy Rysin, Marcin Milkowski, Daniel Naber + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
/**
 * Helper methods to locate and load the classes of a package at runtime,
 * either from a directory on the class path or from a jar file.
 */
public final class ReflectionUtils {

  private static final String CLASS_SUFFIX = ".class";

  private ReflectionUtils() {
    // only static methods
  }

  /**
   * Finds the concrete (non-abstract, non-interface) classes of a package.
   *
   * @param classLoader
   *          class loader used to look up the package's resource locations
   * @param packageName
   *          name of the package to scan; the empty string denotes the default
   *          package
   * @param classNameRegEx
   *          if not null, only classes whose simple name matches this regular
   *          expression are considered. The check happens before a class is
   *          loaded, so it can be used to avoid loading unrelated classes.
   * @param subdirLevel
   *          how many levels of sub-packages are scanned in addition to the
   *          package itself; 0 scans only the package itself. Checked before
   *          a class is loaded.
   * @param classExtends
   *          if not null, only classes that have this class among their
   *          superclasses are returned
   * @param interfaceImplements
   *          if not null, only classes that implement this interface (directly
   *          or through a superclass or super-interface) are returned
   * @return all classes that pass every filter that was given; never null
   * @throws ClassNotFoundException
   *           if scanning or loading fails for any reason; the original
   *           exception is attached as the cause
   */
  public static Class[] findClasses(final ClassLoader classLoader,
      final String packageName, final String classNameRegEx,
      final int subdirLevel, final Class classExtends,
      final Class interfaceImplements) throws ClassNotFoundException {
    final Map<Class<?>, String> foundClasses = new HashMap<Class<?>, String>();

    try {
      final String packagePath = packageName.replace('.', '/');
      final Enumeration<URL> resources = classLoader.getResources(packagePath);

      // the same location may be reported more than once; scan each only once
      final Set<URI> uniqResources = new HashSet<URI>();
      while (resources.hasMoreElements()) {
        uniqResources.add(resources.nextElement().toURI());
      }

      for (final URI res : uniqResources) {
        final URL resource = res.toURL();
        // jars and directories are treated differently
        if (resource.getProtocol().startsWith("jar")) {
          findClassesInJar(packageName, classNameRegEx, subdirLevel,
              classExtends, interfaceImplements, foundClasses, resource);
        } else {
          findClassesInDirectory(classLoader, packageName, classNameRegEx,
              subdirLevel, classExtends, interfaceImplements, foundClasses,
              resource);
        }
      }
    } catch (final Exception ex) {
      // deliberately broad: every failure is reported as ClassNotFoundException
      throw new ClassNotFoundException("Loading rules failed: "
          + ex.getMessage(), ex);
    }

    return foundClasses.keySet().toArray(new Class[foundClasses.size()]);
  }

  /**
   * Scans one directory for class files and, if requested, its
   * sub-directories; matching classes are added to <code>foundClasses</code>
   * together with the path they were found at.
   */
  private static void findClassesInDirectory(final ClassLoader classLoader,
      final String packageName, final String classNameRegEx,
      final int subdirLevel, final Class<?> classExtends,
      final Class<?> interfaceImplements,
      final Map<Class<?>, String> foundClasses, final URL resource)
      throws IOException, URISyntaxException, ClassNotFoundException {
    final File directory = new File(resource.toURI());

    // isDirectory() is false for missing paths as well as for regular files
    if (!directory.isDirectory()) {
      throw new IOException("directory does not exist or is not a directory: "
          + directory.getAbsolutePath());
    }
    // listFiles() returns null when the directory cannot be read
    final File[] entries = directory.listFiles();
    if (entries == null) {
      throw new IOException("directory cannot be listed: "
          + directory.getAbsolutePath());
    }
    final String packagePrefix = packagePrefix(packageName);

    // read classes
    for (final File file : entries) {
      if (!file.isFile() || !file.getName().endsWith(CLASS_SUFFIX)) {
        continue;
      }
      final String classShortNm = file.getName().substring(0,
          file.getName().lastIndexOf('.'));
      if (classNameRegEx != null && !classShortNm.matches(classNameRegEx)) {
        continue;
      }
      final Class<?> clazz = Class.forName(packagePrefix + classShortNm);
      if (isMaterial(clazz)
          && passesFilters(clazz, classExtends, interfaceImplements)) {
        foundClasses.put(clazz, file.getAbsolutePath());
      }
    }

    // then subdirectories if we're traversing
    if (subdirLevel > 0) {
      for (final File dir : entries) {
        if (dir.isDirectory()) {
          final Class[] subLevelClasses = findClasses(classLoader,
              packagePrefix + dir.getName(), classNameRegEx, subdirLevel - 1,
              classExtends, interfaceImplements);
          for (final Class<?> tmpClass : subLevelClasses) {
            foundClasses.put(tmpClass, "dir:" + dir.getAbsolutePath());
          }
        }
      }
    }
  }

  /**
   * Scans the jar file behind <code>resource</code> for classes of the given
   * package; matching classes are added to <code>foundClasses</code> together
   * with the jar's name.
   */
  private static void findClassesInJar(final String packageName,
      final String classNameRegEx, final int subdirLevel,
      final Class<?> classExtends, final Class<?> interfaceImplements,
      final Map<Class<?>, String> foundClasses, final URL resource)
      throws IOException, ClassNotFoundException {
    final JarURLConnection conn = (JarURLConnection) resource.openConnection();
    // NOTE(review): the JarFile is not closed here, as in the original code;
    // presumably it may be a cached instance shared with the class loader -
    // confirm before adding a close().
    final JarFile currentFile = conn.getJarFile();
    final String packagePrefix = packagePrefix(packageName);

    // jars are flat containers:
    for (final Enumeration<JarEntry> e = currentFile.entries(); e
        .hasMoreElements();) {
      final String name = e.nextElement().getName();
      if (!name.endsWith(CLASS_SUFFIX)) {
        continue;
      }
      // strip only the trailing ".class", then turn the path into a class name
      final String classNm = name.substring(0,
          name.length() - CLASS_SUFFIX.length()).replace('/', '.');
      // comparing against "package." keeps out sibling packages that merely
      // share a name prefix
      if (!classNm.startsWith(packagePrefix)) {
        continue;
      }
      final String classShortNm = classNm.substring(classNm.lastIndexOf('.') + 1);
      if (classNameRegEx != null && !classShortNm.matches(classNameRegEx)) {
        continue;
      }
      // every '.' left after the package prefix is one sub-package level
      final String subName = classNm.substring(packagePrefix.length());
      if (countOccurrences(subName, '.') > subdirLevel) {
        continue;
      }

      final Class<?> clazz = Class.forName(classNm);
      if (foundClasses.containsKey(clazz)) {
        throw new RuntimeException("Duplicate class definition:\n"
            + clazz.getName() + ", found in\n" + currentFile.getName() + " and\n"
            + foundClasses.get(clazz));
      }
      if (isMaterial(clazz)
          && passesFilters(clazz, classExtends, interfaceImplements)) {
        foundClasses.put(clazz, currentFile.getName());
      }
    }
  }

  /**
   * @return the prefix to put in front of a simple class or sub-package name:
   *         empty for the default package, otherwise the package name
   *         followed by a dot
   */
  private static String packagePrefix(final String packageName) {
    return packageName.length() == 0 ? "" : packageName + ".";
  }

  /**
   * @return true if clazz satisfies the superclass filter and the interface
   *         filter; a null filter is always satisfied
   */
  private static boolean passesFilters(final Class<?> clazz,
      final Class<?> classExtends, final Class<?> interfaceImplements) {
    final boolean extendsOk = classExtends == null
        || isExtending(clazz, classExtends.getName());
    final boolean implementsOk = interfaceImplements == null
        || isImplementing(clazz, interfaceImplements);
    return extendsOk && implementsOk;
  }

  /**
   * @return number of times ch occurs in str
   */
  private static int countOccurrences(final String str, final char ch) {
    int count = 0;
    int pos = str.indexOf(ch, 0);
    while (pos != -1) {
      count++;
      pos = str.indexOf(ch, pos + 1);
    }
    return count;
  }

  /**
   * @return true if clazz is neither abstract nor an interface
   */
  private static boolean isMaterial(final Class<?> clazz) {
    final int mod = clazz.getModifiers();
    return !Modifier.isAbstract(mod) && !Modifier.isInterface(mod);
  }

  /**
   * @return true if a class named superClassName is among the superclasses of
   *         clazz (clazz itself is not considered)
   */
  private static boolean isExtending(final Class<?> clazz,
      final String superClassName) {
    Class<?> tmpSuperClass = clazz.getSuperclass();
    while (tmpSuperClass != null) {
      if (superClassName.equals(tmpSuperClass.getName())) {
        return true;
      }
      tmpSuperClass = tmpSuperClass.getSuperclass();
    }
    return false;
  }

  /**
   * @return true if clazz can be assigned to interfaze, i.e. implements it
   *         directly or inherits it
   */
  private static boolean isImplementing(final Class<?> clazz,
      final Class<?> interfaze) {
    return interfaze.isAssignableFrom(clazz);
  }

}
index 0000000..af266f3 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/tools/StringTools.java @@ -0,0 +1,581 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.tools; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.Reader; +import java.util.regex.Pattern; +import java.util.Collection; +import java.util.Iterator; +import java.util.List; + +import de.danielnaber.languagetool.Language; +import de.danielnaber.languagetool.gui.Tools; +import de.danielnaber.languagetool.rules.RuleMatch; +import de.danielnaber.languagetool.rules.patterns.PatternRule; + +/** + * Tools for reading files etc. + * + * @author Daniel Naber + */ +public final class StringTools { + + private static final int DEFAULT_CONTEXT_SIZE = 25; + + /** + * Constants for printing XML rule matches. + */ + public static enum XmlPrintMode { + /** + * Normally output the rule matches by starting and + * ending the XML output on every call. + */ + NORMAL_XML, + /** + * Start XML output by printing the preamble and the + * start of the root element. 
+ */ + START_XML, + /** + * End XML output by closing the root element. + */ + END_XML, + /** + * Simply continue rule match output. + */ + CONTINUE_XML + } + + private static final Pattern XML_COMMENT_PATTERN = Pattern.compile("<!--.*?-->", Pattern.DOTALL); + private static final Pattern XML_PATTERN = Pattern.compile("(?<!<)<[^<>]+>", Pattern.DOTALL); + + + private StringTools() { + // only static stuff + } + + /** + * Throw exception if the given string is null or empty or only whitespace. + */ + public static void assureSet(final String s, final String varName) { + if (s == null) { + throw new NullPointerException(varName + " cannot be null"); + } + if (isEmpty(s.trim())) { + throw new IllegalArgumentException(varName + + " cannot be empty or whitespace only"); + } + } + + /** + * Read a file's content. + */ + public static String readFile(final InputStream file) throws IOException { + return readFile(file, null); + } + + /** + * Read the text file using the given encoding. + * + * @param file + * InputStream to a file to be read + * @param encoding + * the file's character encoding (e.g. 
<code>iso-8859-1</code>) + * @return a string with the file's content, lines separated by + * <code>\n</code> + * @throws IOException + */ + public static String readFile(final InputStream file, final String encoding) + throws IOException { + InputStreamReader isr = null; + BufferedReader br = null; + final StringBuilder sb = new StringBuilder(); + try { + if (encoding == null) { + isr = new InputStreamReader(file); + } else { + isr = new InputStreamReader(file, encoding); + } + br = new BufferedReader(isr); + String line; + while ((line = br.readLine()) != null) { + sb.append(line); + sb.append('\n'); + } + } finally { + if (br != null) { + br.close(); + } + if (isr != null) { + isr.close(); + } + } + return sb.toString(); + } + + /** + * Returns true if <code>str</code> is made up of all-uppercase characters + * (ignoring characters for which no upper-/lowercase distinction exists). + */ + public static boolean isAllUppercase(final String str) { + return str.equals(str.toUpperCase()); + } + + /** + * @param str - input str + * Returns true if str is MixedCase. + */ + public static boolean isMixedCase(final String str) { + return !isAllUppercase(str) + && !isCapitalizedWord(str) + && !str.equals(str.toLowerCase()); + } + + /** + * @param str - input string + */ + public static boolean isCapitalizedWord(final String str) { + if (isEmpty(str)) { + return false; + } + final char firstChar = str.charAt(0); + if (Character.isUpperCase(firstChar)) { + return str.substring(1).equals(str.substring(1).toLowerCase()); + } + return false; + } + + /** + * Whether the first character of <code>str</code> is an uppercase character. + */ + public static boolean startsWithUppercase(final String str) { + if (isEmpty(str)) { + return false; + } + final char firstChar = str.charAt(0); + if (Character.isUpperCase(firstChar)) { + return true; + } + return false; + } + + /** + * Return <code>str</code> modified so that its first character is now an + * uppercase character. 
If <code>str</code> starts with non-alphabetic + * characters, such as quotes or parentheses, the first character is + * determined as the first alphabetic character. + */ + public static String uppercaseFirstChar(final String str) { + return changeFirstCharCase(str, true); + } + + /** + * Return <code>str</code> modified so that its first character is now an + * lowercase character. If <code>str</code> starts with non-alphabetic + * characters, such as quotes or parentheses, the first character is + * determined as the first alphabetic character. + */ + public static String lowercaseFirstChar(final String str) { + return changeFirstCharCase(str, false); + } + + /** + * Return <code>str</code> modified so that its first character is now an + * lowercase or uppercase character, depending on <code>toUpperCase</code>. + * If <code>str</code> starts with non-alphabetic + * characters, such as quotes or parentheses, the first character is + * determined as the first alphabetic character. + */ + private static String changeFirstCharCase(final String str, final boolean toUpperCase) { + if (isEmpty(str)) { + return str; + } + if (str.length() == 1) { + return toUpperCase ? str.toUpperCase() : str.toLowerCase(); + } + int pos = 0; + final int len = str.length() - 1; + while (!Character.isLetterOrDigit(str.charAt(pos)) && len > pos) { + pos++; + } + final char firstChar = str.charAt(pos); + return str.substring(0, pos) + + (toUpperCase ? 
Character.toUpperCase(firstChar) : Character.toLowerCase(firstChar)) + + str.substring(pos + 1); + } + + public static String readerToString(final Reader reader) throws IOException { + final StringBuilder sb = new StringBuilder(); + int readBytes = 0; + final char[] chars = new char[4000]; + while (readBytes >= 0) { + readBytes = reader.read(chars, 0, 4000); + if (readBytes <= 0) { + break; + } + sb.append(new String(chars, 0, readBytes)); + } + return sb.toString(); + } + + public static String streamToString(final InputStream is) throws IOException { + final InputStreamReader isr = new InputStreamReader(is); + try { + return readerToString(isr); + } finally { + isr.close(); + } + } + + /** + * Calls escapeHTML(String). + */ + public static String escapeXML(final String s) { + return escapeHTML(s); + } + + /** + * Escapes these characters: less than, bigger than, quote, ampersand. + */ + public static String escapeHTML(final String s) { + // this version is much faster than using s.replaceAll + final StringBuilder sb = new StringBuilder(); + final int n = s.length(); + for (int i = 0; i < n; i++) { + final char c = s.charAt(i); + switch (c) { + case '<': + sb.append("<"); + break; + case '>': + sb.append(">"); + break; + case '&': + sb.append("&"); + break; + case '"': + sb.append("""); + break; + + default: + sb.append(c); + break; + } + } + return sb.toString(); + } + + /** + * Get an XML representation of the given rule matches. + * + * @param text + * the original text that was checked, used to get the context of the + * matches + * @param contextSize + * the desired context size in characters + * @deprecated Use {@link #ruleMatchesToXML(List,String,int,XmlPrintMode)} instead + */ + public static String ruleMatchesToXML(final List<RuleMatch> ruleMatches, + final String text, final int contextSize) { + return ruleMatchesToXML(ruleMatches, text, contextSize, XmlPrintMode.NORMAL_XML); + } + + /** + * Get an XML representation of the given rule matches. 
+ * @param text + * the original text that was checked, used to get the context of the + * matches + * @param contextSize + * the desired context size in characters + * @param xmlMode how to print the XML + */ + public static String ruleMatchesToXML(final List<RuleMatch> ruleMatches, + final String text, final int contextSize, final XmlPrintMode xmlMode) { + // + // IMPORTANT: people rely on this format, don't change it! + // + final StringBuilder xml = new StringBuilder(); + + if (xmlMode == XmlPrintMode.NORMAL_XML || xmlMode == XmlPrintMode.START_XML) { + xml.append("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"); + xml.append("<matches>\n"); + } + + for (final RuleMatch match : ruleMatches) { + String subId = ""; + if (match.getRule() instanceof PatternRule) { + final PatternRule pRule = (PatternRule) match.getRule(); + if (pRule.getSubId() != null) { + subId = " subId=\"" + escapeXMLForAPIOutput(pRule.getSubId()) + "\" "; + } + } + xml.append("<error" + " fromy=\"" + match.getLine() + "\"" + " fromx=\"" + + (match.getColumn() - 1) + "\"" + " toy=\"" + match.getEndLine() + "\"" + + " tox=\"" + (match.getEndColumn() - 1) + "\"" + " ruleId=\"" + + match.getRule().getId() + "\""); + final String msg = match.getMessage().replaceAll("</?suggestion>", "'"); + xml.append(subId); + xml.append(" msg=\"" + escapeXMLForAPIOutput(msg) + "\""); + final String START_MARKER = "__languagetool_start_marker"; + String context = Tools.getContext(match.getFromPos(), match.getToPos(), + text, contextSize, START_MARKER, "", true); + xml.append(" replacements=\"" + + escapeXMLForAPIOutput(listToString( + match.getSuggestedReplacements(), "#")) + "\""); + // get position of error in context and remove artificial marker again: + final int contextOffset = context.indexOf(START_MARKER); + context = context.replaceFirst(START_MARKER, ""); + context = context.replaceAll("[\n\r]", " "); + xml.append(" context=\"" + context + "\""); + xml.append(" contextoffset=\"" + contextOffset + "\""); + 
xml.append(" errorlength=\"" + (match.getToPos() - match.getFromPos()) + + "\""); + xml.append("/>\n"); + } + if (xmlMode == XmlPrintMode.END_XML || xmlMode == XmlPrintMode.NORMAL_XML) { + xml.append("</matches>\n"); + } + return xml.toString(); + } + + private static String escapeXMLForAPIOutput(final String s) { + // this is simplified XML, i.e. put the "<error>" in one line: + return escapeXML(s).replaceAll("[\n\r]", " "); + } + + public static String listToString(final Collection<String> l, final String delimiter) { + final StringBuilder sb = new StringBuilder(); + for (final Iterator<String> iter = l.iterator(); iter.hasNext();) { + final String str = iter.next(); + sb.append(str); + if (iter.hasNext()) { + sb.append(delimiter); + } + } + return sb.toString(); + } + + public static String getContext(final int fromPos, final int toPos, + final String fileContents) { + return getContext(fromPos, toPos, fileContents, DEFAULT_CONTEXT_SIZE); + } + + public static String getContext(final int fromPos, final int toPos, + final String contents, final int contextSize) { + final String fileContents = contents.replace('\n', ' '); + // calculate context region: + int startContent = fromPos - contextSize; + String prefix = "..."; + String postfix = "..."; + String markerPrefix = " "; + if (startContent < 0) { + prefix = ""; + markerPrefix = ""; + startContent = 0; + } + int endContent = toPos + contextSize; + if (endContent > fileContents.length()) { + postfix = ""; + endContent = fileContents.length(); + } + // make "^" marker. 
inefficient but robust implementation: + final StringBuilder marker = new StringBuilder(); + for (int i = 0; i < fileContents.length() + prefix.length(); i++) { + if (i >= fromPos && i < toPos) { + marker.append('^'); + } else { + marker.append(' '); + } + } + // now build context string plus marker: + final StringBuilder sb = new StringBuilder(); + sb.append(prefix); + sb.append(fileContents.substring(startContent, endContent)); + sb.append(postfix); + sb.append('\n'); + sb.append(markerPrefix); + sb.append(marker.substring(startContent, endContent)); + return sb.toString(); + } + + /** + * Filters any whitespace characters. Useful for trimming the contents of + * token elements that cannot possibly contain any spaces. + * + * @param str + * String to be filtered. + * @return Filtered string. + */ + public static String trimWhitespace(final String str) { + final StringBuilder filter = new StringBuilder(); + for (int i = 0; i < str.length(); i++) { + final char c = str.charAt(i); + if (c != '\n' && c != ' ' && c != '\t') { + filter.append(c); + } + } + return filter.toString(); + } + + /** + * Adds spaces before words that are not punctuation. + * + * @param word + * Word to add the preceding space. + * @param language + * Language of the word (to check typography conventions). Currently + * French convention of not adding spaces only before '.' and ',' is + * implemented; other languages assume that before ,.;:!? no spaces + * should be added. + * @return String containing a space or an empty string. + */ + public static String addSpace(final String word, final Language language) { + String space = " "; + final int len = word.length(); + if (len == 1) { + final char c = word.charAt(0); + if (Language.FRENCH.equals(language)) { + if (c == '.' || c == ',') { + space = ""; + } + } else { + if (c == '.' || c == ',' || c == ';' || c == ':' || c == '?' 
+ || c == '!') { + space = ""; + } + } + } + return space; + } + + /** + * Returns translation of the UI element without the control character "&". To + * have "&" in the UI, use "&&". + * + * @param label + * Label to convert. + * @return String UI element string without mnemonics. + */ + public static String getLabel(final String label) { + return label.replaceAll("&([^&])", "$1"). + replaceAll("&&", "&"); + } + + /** + * Returns the UI element string with mnemonics encoded in OpenOffice.org + * convention (using "~"). + * + * @param label + * Label to convert + * @return String UI element with ~ replacing &. + */ + public static String getOOoLabel(final String label) { + return label.replaceAll("&([^&])", "~$1"). + replaceAll("&&", "&"); + } + + /** + * Returns mnemonic of a UI element. + * + * @param label + * String Label of the UI element + * @return @char Mnemonic of the UI element, or \u0000 in case of no mnemonic + * set. + */ + public static char getMnemonic(final String label) { + int mnemonicPos = label.indexOf('&'); + while (mnemonicPos != -1 && mnemonicPos == label.indexOf("&&") + && mnemonicPos < label.length()) { + mnemonicPos = label.indexOf('&', mnemonicPos + 2); + } + if (mnemonicPos == -1 || mnemonicPos == label.length()) { + return '\u0000'; + } + return label.charAt(mnemonicPos + 1); + } + + /** + * Checks if a string contains only whitespace, including all Unicode + * whitespace. + * + * @param str + * String to check + * @return true if the string is whitespace-only. + */ + public static boolean isWhitespace(final String str) { + if ("\u0002".equals(str) // unbreakable field, e.g. 
a footnote number in OOo + || "\u0001".equals(str)) { // breakable field in OOo + return false; + } + final String trimStr = str.trim(); + if (isEmpty(trimStr)) { + return true; + } + if (trimStr.length() == 1) { + return java.lang.Character.isWhitespace(trimStr.charAt(0)); + } + return false; + } + + /** + * + * @param ch + * Character to check + * @return True if the character is a positive number (decimal digit from 1 to + * 9). + */ + public static boolean isPositiveNumber(final char ch) { + return ch >= '1' && ch <= '9'; + } + + /** + * Helper method to replace calls to "".equals(). + * + * @param str + * String to check + * @return true if string is empty OR null + */ + public static boolean isEmpty(final String str) { + return str == null || str.length() == 0; + } + + /** + * Simple XML filtering routing + * @param str XML string to be filtered. + * @return Filtered string without XML tags. + */ + public static String filterXML(final String str) { + String s = str; + s = XML_COMMENT_PATTERN.matcher(s).replaceAll(" "); + s = XML_PATTERN.matcher(s).replaceAll(""); + return s; + } + + public static String asString(final CharSequence s) { + if (s == null) { + return null; + } + return s.toString(); + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/tools/SymbolLocator.java b/JLanguageTool/src/java/de/danielnaber/languagetool/tools/SymbolLocator.java new file mode 100644 index 0000000..6a6432a --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/tools/SymbolLocator.java @@ -0,0 +1,37 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2010 Marcin Miłkowski (http://www.languagetool.org) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ + +package de.danielnaber.languagetool.tools; + +/** + * Helper class for GenericUnpairedBracketsRule to identify + * symbols indexed with integers. + * + * @author Marcin Miłkowski + * + */ +public class SymbolLocator { + public String symbol; + public int index; + + public SymbolLocator(final String symbol, final int index) { + this.symbol = symbol; + this.index = index; + } +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/tools/Tools.java b/JLanguageTool/src/java/de/danielnaber/languagetool/tools/Tools.java new file mode 100644 index 0000000..c5d1984 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/tools/Tools.java @@ -0,0 +1,626 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2006 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.tools; + +import java.io.BufferedInputStream; +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.PrintStream; +import java.io.PrintWriter; +import java.io.StringWriter; +import java.lang.reflect.Constructor; +import java.net.MalformedURLException; +import java.net.URL; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Locale; +import java.util.ResourceBundle; + +import javax.xml.parsers.ParserConfigurationException; + +import org.xml.sax.SAXException; + +import de.danielnaber.languagetool.AnalyzedSentence; +import de.danielnaber.languagetool.JLanguageTool; +import de.danielnaber.languagetool.Language; +import de.danielnaber.languagetool.bitext.BitextReader; +import de.danielnaber.languagetool.bitext.StringPair; +import de.danielnaber.languagetool.rules.Rule; +import de.danielnaber.languagetool.rules.RuleMatch; +import de.danielnaber.languagetool.rules.bitext.BitextRule; +import de.danielnaber.languagetool.rules.patterns.PatternRule; +import de.danielnaber.languagetool.rules.patterns.bitext.BitextPatternRuleLoader; +import de.danielnaber.languagetool.rules.patterns.bitext.FalseFriendsAsBitextLoader; +import de.danielnaber.languagetool.tools.StringTools.XmlPrintMode; + +public final class Tools { + + private static final int DEFAULT_CONTEXT_SIZE = 45; + + private Tools() { + // cannot construct, static methods only + } + + /** + * Tags text using the LanguageTool tagger. + * + * @param contents + * Text to tag. 
+ * @param lt + * LanguageTool instance + * @throws IOException + */ + public static void tagText(final String contents, final JLanguageTool lt) + throws IOException { + AnalyzedSentence analyzedText; + final List<String> sentences = lt.sentenceTokenize(contents); + for (final String sentence : sentences) { + analyzedText = lt.getAnalyzedSentence(sentence); + System.out.println(analyzedText.toString()); + } + } + + public static int checkText(final String contents, final JLanguageTool lt) + throws IOException { + return checkText(contents, lt, false, -1, 0, 0, StringTools.XmlPrintMode.NORMAL_XML); + } + + public static int checkText(final String contents, final JLanguageTool lt, final int lineOffset) + throws IOException { + return checkText(contents, lt, false, -1, lineOffset, 0, StringTools.XmlPrintMode.NORMAL_XML); + } + + public static int checkText(final String contents, final JLanguageTool lt, + final boolean apiFormat, final int lineOffset) throws IOException { + return checkText(contents, lt, apiFormat, -1, lineOffset, 0, StringTools.XmlPrintMode.NORMAL_XML); + } + + /** + * Check the given text and print results to System.out. + * + * @param contents + * a text to check (may be more than one sentence) + * @param lt + * Initialized LanguageTool + * @param apiFormat + * whether to print the result in a simple XML format + * @param contextSize + * error text context size: -1 for default + * @param lineOffset + * line number offset to be added to line numbers in matches + * @param prevMatches + * number of previously matched rules + * @param xmlMode + * mode of xml printout for simple xml output + * @return + * Number of rule matches to the input text. 
+ * @throws IOException + */ + public static int checkText(final String contents, final JLanguageTool lt, + final boolean apiFormat, int contextSize, final int lineOffset, + final int prevMatches, final XmlPrintMode xmlMode) throws IOException { + if (contextSize == -1) { + contextSize = DEFAULT_CONTEXT_SIZE; + } + final long startTime = System.currentTimeMillis(); + final List<RuleMatch> ruleMatches = lt.check(contents); + // adjust line numbers + for (RuleMatch r : ruleMatches) { + r.setLine(r.getLine() + lineOffset); + r.setEndLine(r.getEndLine() + lineOffset); + } + if (apiFormat) { + final String xml = StringTools.ruleMatchesToXML(ruleMatches, contents, + contextSize, xmlMode); + PrintStream out = new PrintStream(System.out, true, "UTF-8"); + out.print(xml); + } else { + printMatches(ruleMatches, prevMatches, contents, contextSize); + } + + //display stats if it's not in a buffered mode + if (xmlMode == StringTools.XmlPrintMode.NORMAL_XML) { + displayTimeStats(startTime, lt.getSentenceCount(), apiFormat); + } + return ruleMatches.size(); + } + + private static void displayTimeStats(final long startTime, + final long sentCount, final boolean apiFormat) { + final long endTime = System.currentTimeMillis(); + final long time = endTime - startTime; + final float timeInSeconds = time / 1000.0f; + final float sentencesPerSecond = sentCount / timeInSeconds; + if (apiFormat) { + System.out.println("<!--"); + } + System.out.printf(Locale.ENGLISH, + "Time: %dms for %d sentences (%.1f sentences/sec)", time, + sentCount, sentencesPerSecond); + System.out.println(); + if (apiFormat) { + System.out.println("-->"); + } + } + + /** + * Displays matches in a simple text format. + * @param ruleMatches Matches from rules. + * @param prevMatches Number of previously found matches. + * @param contents The text that was checked. + * @param contextSize The size of contents displayed. 
+ * @since 1.0.1 + */ + private static void printMatches(final List<RuleMatch> ruleMatches, + final int prevMatches, final String contents, final int contextSize) { + int i = 1; + for (final RuleMatch match : ruleMatches) { + String output = i + prevMatches + ".) Line " + (match.getLine() + 1) + ", column " + + match.getColumn() + ", Rule ID: " + match.getRule().getId(); + if (match.getRule() instanceof PatternRule) { + final PatternRule pRule = (PatternRule) match.getRule(); + output += "[" + pRule.getSubId() + "]"; + } + System.out.println(output); + String msg = match.getMessage(); + msg = msg.replaceAll("<suggestion>", "'"); + msg = msg.replaceAll("</suggestion>", "'"); + System.out.println("Message: " + msg); + final List<String> replacements = match.getSuggestedReplacements(); + if (!replacements.isEmpty()) { + System.out.println("Suggestion: " + + StringTools.listToString(replacements, "; ")); + } + System.out.println(StringTools.getContext(match.getFromPos(), match + .getToPos(), contents, contextSize)); + if (i < ruleMatches.size()) { + System.out.println(); + } + i++; + } + } + + /** + * Checks the bilingual input (bitext) and displays the output (considering the target + * language) in API format or in the simple text format. + * + * NOTE: the positions returned by the rule matches are relative + * to the target string only, and always start at the first line + * and first column, no matter how many lines were checked before. + * To have multiple lines taken into account, use the checkBitext + * method that takes a BitextReader. + * + * @param src Source text. + * @param trg Target text. + * @param srcLt Source JLanguageTool (used to analyze the text). + * @param trgLt Target JLanguageTool (used to analyze the text). + * @param bRules Bilingual rules used in addition to target standard rules. + * @param apiFormat Whether API format should be used. + * @param xmlMode The mode of XML output display. + * @return The number of rules matched on the bitext. 
+ * @throws IOException + * @since 1.0.1 + */ + public static int checkBitext(final String src, final String trg, + final JLanguageTool srcLt, final JLanguageTool trgLt, + final List<BitextRule> bRules, + final boolean apiFormat, final XmlPrintMode xmlMode) throws IOException { + final long startTime = System.currentTimeMillis(); + final int contextSize = DEFAULT_CONTEXT_SIZE; + final List<RuleMatch> ruleMatches = + checkBitext(src, trg, srcLt, trgLt, bRules); + for (RuleMatch thisMatch : ruleMatches) { + thisMatch = + trgLt.adjustRuleMatchPos(thisMatch, + 0, 1, 1, trg); + } + if (apiFormat) { + final String xml = StringTools.ruleMatchesToXML(ruleMatches, trg, + contextSize, xmlMode); + PrintStream out = new PrintStream(System.out, true, "UTF-8"); + out.print(xml); + } else { + printMatches(ruleMatches, 0, trg, contextSize); + } + //display stats if it's not in a buffered mode + if (xmlMode == StringTools.XmlPrintMode.NORMAL_XML) { + displayTimeStats(startTime, srcLt.getSentenceCount(), apiFormat); + } + return ruleMatches.size(); + } + + /** + * Checks the bilingual input (bitext) and displays the output (considering the target + * language) in API format or in the simple text format. + * + * NOTE: the positions returned by the rule matches are adjusted + * according to the data returned by the reader. + * + * @param reader Reader of bitext strings. + * @param srcLt Source JLanguageTool (used to analyze the text). + * @param trgLt Target JLanguageTool (used to analyze the text). + * @param bRules Bilingual rules used in addition to target standard rules. + * @param apiFormat Whether API format should be used. + * @param xmlMode The mode of XML output display. + * @return The number of rules matched on the bitext. 
+ * @throws IOException + * @since 1.0.1 + */ + public static int checkBitext(final BitextReader reader, + final JLanguageTool srcLt, final JLanguageTool trgLt, + final List<BitextRule> bRules, + final boolean apiFormat) throws IOException { + final long startTime = System.currentTimeMillis(); + final int contextSize = DEFAULT_CONTEXT_SIZE; + XmlPrintMode xmlMode = StringTools.XmlPrintMode.START_XML; + final List<RuleMatch> ruleMatches = new ArrayList<RuleMatch>(); + int matchCount = 0; + int sentCount = 0; + for (StringPair srcAndTrg : reader) { + final List<RuleMatch> curMatches = checkBitext( + srcAndTrg.getSource(), srcAndTrg.getTarget(), + srcLt, trgLt, bRules); + final List<RuleMatch> fixedMatches = new ArrayList<RuleMatch>(); + for (RuleMatch thisMatch : curMatches) { + fixedMatches.add( + trgLt.adjustRuleMatchPos(thisMatch, + reader.getSentencePosition(), + reader.getColumnCount(), + reader.getLineCount(), + reader.getCurrentLine())); + } + ruleMatches.addAll(fixedMatches); + if (fixedMatches.size() > 0) { + if (apiFormat) { + final String xml = StringTools.ruleMatchesToXML(fixedMatches, + reader.getCurrentLine(), + contextSize, xmlMode); + if (xmlMode == StringTools.XmlPrintMode.START_XML) { + xmlMode = StringTools.XmlPrintMode.CONTINUE_XML; + } + PrintStream out = new PrintStream(System.out, true, "UTF-8"); + out.print(xml); + } else { + printMatches(fixedMatches, matchCount, reader.getCurrentLine(), contextSize); + matchCount += fixedMatches.size(); + } + } + sentCount++; + } + displayTimeStats(startTime, sentCount, apiFormat); + if (apiFormat) { + PrintStream out = new PrintStream(System.out, true, "UTF-8"); + out.print("</matches>"); + } + return ruleMatches.size(); + } + + /** + * Checks the bilingual input (bitext) and displays the output (considering the target + * language) in API format or in the simple text format. + * + * @param src Source text. + * @param trg Target text. + * @param srcLt Source JLanguageTool (used to analyze the text). 
+ * @param trgLt Target JLanguageTool (used to analyze the text). + * @param bRules Bilingual rules used in addition to target standard rules. + * @return The list of rule matches on the bitext. + * @throws IOException + * @since 1.0.1 + */ + public static List<RuleMatch> checkBitext(final String src, final String trg, + final JLanguageTool srcLt, final JLanguageTool trgLt, + final List<BitextRule> bRules) throws IOException { + final AnalyzedSentence srcText = srcLt.getAnalyzedSentence(src); + final AnalyzedSentence trgText = trgLt.getAnalyzedSentence(trg); + final List<RuleMatch> ruleMatches = trgLt.checkAnalyzedSentence + (JLanguageTool.paragraphHandling.NORMAL, + trgLt.getAllRules(), 0, 0, 1, trg, trgText); + for (BitextRule bRule : bRules) { + final RuleMatch[] curMatch = bRule.match(srcText, trgText); + if (curMatch != null) { + ruleMatches.addAll(Arrays.asList(curMatch)); + } + } + return ruleMatches; + } + + + /** + * Gets default bitext rules for a given pair of languages + * @param source Source language. + * @param target Target language. + * @return List of Bitext rules + * @throws IOException + * @throws ParserConfigurationException + * @throws SAXException + */ + public static List<BitextRule> getBitextRules(final Language source, + final Language target) throws IOException, ParserConfigurationException, SAXException { + final List<BitextRule> bRules = new ArrayList<BitextRule>(); + //try to load the bitext pattern rules for the language... + final BitextPatternRuleLoader ruleLoader = new BitextPatternRuleLoader(); + final String name = "/" + target.getShortName() + "/bitext.xml"; + final InputStream is = JLanguageTool.getDataBroker().getFromRulesDirAsStream(name); + if (is != null) { + bRules.addAll(ruleLoader.getRules(is, name)); + } + + //load the false friend rules in the bitext mode + final FalseFriendsAsBitextLoader fRuleLoader = new FalseFriendsAsBitextLoader(); + final String fName = "/false-friends.xml"; + bRules.addAll(fRuleLoader. 
+ getFalseFriendsAsBitext( + JLanguageTool.getDataBroker().getRulesDir() + fName, + source, target)); + + //load Java bitext rules + // TODO: get ResourceBundle for possible parameters for rules + bRules.addAll(getAllBuiltinBitextRules(source, null)); + return bRules; + } + + private static List<BitextRule> getAllBuiltinBitextRules(final Language language, + final ResourceBundle messages) { + // use reflection to get a list of all non-pattern rules under + // "de.danielnaber.languagetool.rules.bitext" + // generic rules first, then language-specific ones + // TODO: the order of loading classes is not guaranteed so we may want to + // implement rule + // precedence + + final List<BitextRule> rules = new ArrayList<BitextRule>(); + try { + // we pass ".*Rule$" regexp to improve efficiency, see javadoc + final Class[] classes = ReflectionUtils.findClasses(Rule.class + .getClassLoader(), Rule.class.getPackage().getName() + + ".bitext", ".*Rule$", 0, + Rule.class, null); + + for (final Class class1 : classes) { + final Constructor[] constructors = class1.getConstructors(); + for (final Constructor constructor : constructors) { + final Class[] paramTypes = constructor.getParameterTypes(); + if (paramTypes.length == 0) { + rules.add((BitextRule) constructor.newInstance()); + break; + } + if (paramTypes.length == 1 + && paramTypes[0].equals(ResourceBundle.class)) { + rules.add((BitextRule) constructor.newInstance(messages)); + break; + } + if (paramTypes.length == 2 + && paramTypes[0].equals(ResourceBundle.class) + && paramTypes[1].equals(Language.class)) { + rules.add((BitextRule) constructor.newInstance(messages, language)); + break; + } + throw new RuntimeException("Unknown constructor for rule class: " + + class1.getName()); + } + } + } catch (final Exception e) { + throw new RuntimeException("Failed to load rules: " + e.getMessage(), e); + } + // System.err.println("Loaded " + rules.size() + " rules"); + return rules; + } + + + /** + * Simple rule profiler - used to 
run LT on a corpus to see which + * rule takes most time. + * @param contents - text to check + * @param lt - instance of LanguageTool + * @return number of matches + * @throws IOException + */ + public static void profileRulesOnText(final String contents, + final JLanguageTool lt) throws IOException { + final long[] workTime = new long[10]; + int matchCount = 0; + final List<Rule> rules = lt.getAllRules(); + final int ruleCount = rules.size(); + System.out.printf("Testing %d rules\n", ruleCount); + System.out.println("Rule ID\tTime\tSentences\tMatches\tSentences per sec."); + final List<String> sentences = lt.sentenceTokenize(contents); + for (Rule rule : rules) { + matchCount = 0; + for (int k = 0; k < 10; k++) { + final long startTime = System.currentTimeMillis(); + for (String sentence : sentences) { + matchCount += rule.match + (lt.getAnalyzedSentence(sentence)).length; + } + final long endTime = System.currentTimeMillis(); + workTime[k] = endTime - startTime; + } + Arrays.sort(workTime); + final long time = median(workTime); + final float timeInSeconds = time / 1000.0f; + final float sentencesPerSecond = sentences.size() / timeInSeconds; + System.out.printf(Locale.ENGLISH, + "%s\t%d\t%d\t%d\t%.1f", rule.getId(), + time, sentences.size(), matchCount, sentencesPerSecond); + System.out.println(); + } + } + + public static int profileRulesOnLine(final String contents, + final JLanguageTool lt, final Rule rule) throws IOException { + int count = 0; + for (final String sentence : lt.sentenceTokenize(contents)) { + count += rule.match(lt.getAnalyzedSentence(sentence)).length ; + } + return count; + } + + public static long median(long[] m) { + final int middle = m.length / 2; // subscript of middle element + if (m.length % 2 == 1) { + // Odd number of elements -- return the middle one. + return m[middle]; + } + return (m[middle-1] + m[middle]) / 2; + } + + /** + * Automatically applies suggestions to the text. 
+ * Note: if there is more than one suggestion, always the first + * one is applied, and others ignored silently. + * + * @param + * contents - String to be corrected + * @param + * lt - Initialized LanguageTool object + * @return + * Corrected text as String. + */ + public static String correctText(final String contents, final JLanguageTool lt) throws IOException { + final List<RuleMatch> ruleMatches = lt.check(contents); + if (ruleMatches.isEmpty()) { + return contents; + } + return correctTextFromMatches(contents, ruleMatches); + } + + /** + * Automatically applies suggestions to the bilingual text. + * Note: if there is more than one suggestion, always the first + * one is applied, and others ignored silently. + * + * @param + * reader - a bitext file reader + * @param + * sourceLanguageTool Initialized source JLanguageTool object + * @param + * targetLanguageTool Initialized target JLanguageTool object + * @param + * bRules List of all BitextRules to use + */ + public static void correctBitext(final BitextReader reader, + final JLanguageTool srcLt, final JLanguageTool trgLt, + final List<BitextRule> bRules) throws IOException { + //TODO: implement a bitext writer for XML formats (like XLIFF) + final List<RuleMatch> ruleMatches = new ArrayList<RuleMatch>(); + for (StringPair srcAndTrg : reader) { + final List<RuleMatch> curMatches = checkBitext( + srcAndTrg.getSource(), srcAndTrg.getTarget(), + srcLt, trgLt, bRules); + final List<RuleMatch> fixedMatches = new ArrayList<RuleMatch>(); + for (RuleMatch thisMatch : curMatches) { + fixedMatches.add( + trgLt.adjustRuleMatchPos(thisMatch, + 0, //don't need to adjust at all, we have zero offset related to trg sentence + reader.getTargetColumnCount(), + reader.getLineCount(), + reader.getCurrentLine())); + } + ruleMatches.addAll(fixedMatches); + if (fixedMatches.size() > 0) { + System.out.println(correctTextFromMatches(srcAndTrg.getTarget(), + fixedMatches)); + } else { + System.out.println(srcAndTrg.getTarget()); + } + 
} + } + + private static String correctTextFromMatches( + final String contents, final List<RuleMatch> matches) { + final StringBuilder sb = new StringBuilder(contents); + //build error list: + final List<String> errors = new ArrayList<String>(); + for (RuleMatch rm : matches) { + final List<String> replacements = rm.getSuggestedReplacements(); + if (!replacements.isEmpty()) { + errors.add(sb.substring(rm.getFromPos(), rm.getToPos())); + } + } + int offset = 0; + int counter = 0; + for (RuleMatch rm : matches) { + final List<String> replacements = rm.getSuggestedReplacements(); + if (!replacements.isEmpty()) { + //make sure the error hasn't been already corrected: + if (errors.get(counter).equals(sb.substring(rm.getFromPos() - offset, rm.getToPos() - offset))) { + sb.replace(rm.getFromPos() - offset, + rm.getToPos() - offset, replacements.get(0)); + offset += (rm.getToPos() - rm.getFromPos()) + - replacements.get(0).length(); + } + counter++; + } + } + return sb.toString(); + } + + public static InputStream getInputStream(final String resourcePath) throws IOException { + try { + // try the URL first: + final URL url = new URL(resourcePath); + // success, load the resource. + return url.openStream(); + } catch (final MalformedURLException e) { + // no luck. Fallback to class loader paths. + } + // try file path: + final File f = new File(resourcePath); + if (f.exists() && f.isFile() && f.canRead()) { + return new BufferedInputStream(new FileInputStream(f)); + } + throw new IOException( + "Could not open input stream from URL/resource/file: " + + f.getAbsolutePath()); + } + + /** + * Get a stacktrace as a string. + */ + public static String getFullStackTrace(final Throwable e) { + final StringWriter sw = new StringWriter(); + final PrintWriter pw = new PrintWriter(sw); + e.printStackTrace(pw); + return sw.toString(); + } + + /** + * Load a file form the classpath using getResourceAsStream(). 
+ * + * @param filename + * @return the stream of the file + * @throws IOException + * if the file cannot be loaded + */ + public static InputStream getStream(final String filename) throws IOException { + // the other ways to load the stream like + // "Tools.class.getClass().getResourceAsStream(filename)" + // don't work in a web context (using Grails): + final InputStream is = Tools.class.getResourceAsStream(filename); + if (is == null) { + throw new IOException("Could not load file from classpath : " + filename); + } + return is; + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/tools/UnsyncStack.java b/JLanguageTool/src/java/de/danielnaber/languagetool/tools/UnsyncStack.java new file mode 100644 index 0000000..d7c2bfc --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/tools/UnsyncStack.java @@ -0,0 +1,127 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2009 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ + +package de.danielnaber.languagetool.tools; + +import java.util.ArrayList; +import java.util.EmptyStackException; + +/** + * Implements unsynchronized stack (contrary to default Java java.util.Stack, + * this one is based on ArrayList). 
Usage is the same as the java.util.Stack. + * + * @author Marcin Miłkowski. + * + */ + +public class UnsyncStack<E> extends ArrayList<E> { + /** + * Generated automatically. + */ + private static final long serialVersionUID = -4984830372178073605L; + + public UnsyncStack() { + } + + /** + * Pushes an item onto the top of this stack. This has exactly the same effect + * as: <blockquote> + * + * <pre> + * add(item) + * </pre> + * + * </blockquote> + * + * @param item + * the item to be pushed onto this stack. + * @return the <code>item</code> argument. + * @see java.util.ArrayList#add + */ + public E push(E item) { + add(item); + return item; + } + + /** + * Removes the object at the top of this stack and returns that object as the + * value of this function. + * + * @return The object at the top of this stack (the last item of the + * <tt>ArrayList</tt> object). + * @exception EmptyStackException + * if this stack is empty. + */ + public E pop() { + E obj; + int len = size(); + obj = peek(); + remove(len - 1); + return obj; + } + + /** + * Looks at the object at the top of this stack without removing it from the + * stack. + * + * @return the object at the top of this stack (the last item of the + * <tt>ArrayList</tt> object). + * @exception EmptyStackException + * if this stack is empty. + */ + public E peek() { + int len = size(); + if (len == 0) + throw new EmptyStackException(); + return get(len - 1); + } + + /** + * Tests if this stack is empty. + * + * @return <code>true</code> if and only if this stack contains no items; + * <code>false</code> otherwise. + */ + public boolean empty() { + return size() == 0; + } + + /** + * Returns the 1-based position where an object is on this stack. If the + * object <tt>o</tt> occurs as an item in this stack, this method returns the + * distance from the top of the stack of the occurrence nearest the top of the + * stack; the topmost item on the stack is considered to be at distance + * <tt>1</tt>. 
The <tt>equals</tt> method is used to compare <tt>o</tt> to the + * items in this stack. + * + * @param o + * the desired object. + * @return the 1-based position from the top of the stack where the object is + * located; the return value <code>-1</code> indicates that the object + * is not on the stack. + */ + public int search(Object o) { + int i = lastIndexOf(o); + if (i >= 0) { + return size() - i; + } + return -1; + } + +} |