diff options
Diffstat (limited to 'JLanguageTool/src/java/de/danielnaber/languagetool/rules/bitext')
4 files changed, 351 insertions, 0 deletions
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/bitext/BitextRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/bitext/BitextRule.java new file mode 100644 index 0000000..d508ae5 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/bitext/BitextRule.java @@ -0,0 +1,106 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2010 Marcin Miłkowski (www.languagetool.org) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ + +package de.danielnaber.languagetool.rules.bitext; + +import java.io.IOException; +import java.util.List; + +import de.danielnaber.languagetool.AnalyzedSentence; +import de.danielnaber.languagetool.bitext.StringPair; +import de.danielnaber.languagetool.rules.Rule; +import de.danielnaber.languagetool.rules.RuleMatch; +import de.danielnaber.languagetool.Language; + +/** + * Abstract bitext rule class. A BitextRule describes a language error and + * can test whether a given pre-analyzed pair of source and target text + * contains that error using the {@link #match(AnalyzedSentence, AnalyzedSentence)} method. 
+ * + * @author Marcin Miłkowski + */ + +public abstract class BitextRule extends Rule { + + private List<StringPair> correctExamples; + private List<IncorrectBitextExample> incorrectExamples; + + private Language sourceLanguage; + + @Override + public abstract String getDescription(); + + public abstract String getMessage(); + + @Override + public abstract String getId(); + + @Override + public abstract RuleMatch[] match(AnalyzedSentence text) throws IOException; + + public abstract RuleMatch[] match(AnalyzedSentence sourceText, + AnalyzedSentence targetText) throws IOException; + + @Override + public abstract void reset(); + + /** + * Set the source language. If the language is not supported + * by LT, you need to use the default tokenizers etc. + * @param lang - Source Language + */ + public final void setSourceLang(final Language lang) { + sourceLanguage = lang; + } + + public final Language getSourceLang() { + return sourceLanguage; + } + + /** + * Set the examples that are correct and thus do not trigger the rule. + */ + public final void setCorrectBitextExamples(final List<StringPair> correctExamples) { + this.correctExamples = correctExamples; + } + + /** + * Get example sentences that are correct and thus will not match this rule. + */ + public final List<StringPair> getCorrectBitextExamples() { + return correctExamples; + } + + /** + * Set the examples that are incorrect and thus do trigger the rule. + */ + public final void setIncorrectBitextExamples( + final List<IncorrectBitextExample> incorrectExamples) { + this.incorrectExamples = incorrectExamples; + } + + /** + * Get example sentences that are incorrect and thus will match this rule. 
+ */ + public final List<IncorrectBitextExample> getIncorrectBitextExamples() { + return incorrectExamples; + } + + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/bitext/DifferentLengthRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/bitext/DifferentLengthRule.java new file mode 100644 index 0000000..995772c --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/bitext/DifferentLengthRule.java @@ -0,0 +1,93 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2010 Marcin Miłkowski (www.languagetool.org) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.rules.bitext; + +import java.io.IOException; + +import de.danielnaber.languagetool.AnalyzedSentence; +import de.danielnaber.languagetool.AnalyzedTokenReadings; +import de.danielnaber.languagetool.rules.RuleMatch; + +/** + * Checks if the translation has a really different length than the source + * (source length below 30% or above 250% of the target length). 
+ * + * @author Marcin Miłkowski + * + */ +public class DifferentLengthRule extends BitextRule { + + static final String MSG = "Source and target translation lengths are very different!"; + + @Override + public String getDescription() { + return "Check if translation length is similar to source length"; + } + + @Override + public String getId() { + return "TRANSLATION_LENGTH"; + } + + public String getMessage() { + return MSG; + } + + /** + * This method makes no sense for bitext, so it always returns null. + */ + @Override + public RuleMatch[] match(AnalyzedSentence text) throws IOException { + // Not applicable to bitext: use match(sourceText, targetText) instead + return null; + } + + @Override + public RuleMatch[] match(AnalyzedSentence sourceText, + AnalyzedSentence targetText) throws IOException { + + if (isLengthDifferent( + getPureText(sourceText), getPureText(targetText))) { + final RuleMatch[] rm = new RuleMatch[1]; + final AnalyzedTokenReadings[] tokens = targetText.getTokens(); + final int len = tokens[tokens.length - 1].getStartPos() + tokens[tokens.length - 1].getToken().length(); + rm[0] = new RuleMatch(this, 1, len, + MSG); + return rm; + } + return new RuleMatch[0]; + } + + static boolean isLengthDifferent(final String src, final String trg) { + final double skew = (((double) src.length() / (double) trg.length()) * 100.00); + return (skew > 250 || skew < 30); + } + + private static String getPureText(AnalyzedSentence text) { + final StringBuilder sb = new StringBuilder(); + for (AnalyzedTokenReadings token : text.getTokens()) { + sb.append(token.getToken()); + } + return sb.toString(); + } + + public void reset() { + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/bitext/IncorrectBitextExample.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/bitext/IncorrectBitextExample.java new file mode 100644 index 0000000..e877826 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/bitext/IncorrectBitextExample.java @@ -0,0 +1,64 @@ +/* 
LanguageTool, a natural language style checker + * Copyright (C) 2010 Marcin Miłkowski (http://www.languagetool.org) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.rules.bitext; + +import java.util.Arrays; +import java.util.List; + +import de.danielnaber.languagetool.bitext.StringPair; + +/** + * A text, typically a pair of sentences that contains an error. + * + * @since 1.0.1 + * @author Marcin Miłkowski + */ +public class IncorrectBitextExample { + + private StringPair example; + private List<String> corrections; + + public IncorrectBitextExample(final StringPair example) { + this.example = example; + } + + public IncorrectBitextExample(final StringPair example, final String[] corrections) { + this(example); + this.corrections = Arrays.asList(corrections); + } + + /** + * Return the example that contains the error. + */ + public StringPair getExample() { + return example; + } + + /** + * Return the possible corrections. May be null. 
+ */ + public List<String> getCorrections() { + return corrections; + } + + public String toString() { + return example.getSource() + "/ " + example.getTarget() + " " + corrections; + } + +} diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/rules/bitext/SameTranslationRule.java b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/bitext/SameTranslationRule.java new file mode 100644 index 0000000..c9e1ace --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/rules/bitext/SameTranslationRule.java @@ -0,0 +1,88 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2010 Marcin Miłkowski (www.languagetool.org) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.rules.bitext; + +import java.io.IOException; + +import de.danielnaber.languagetool.AnalyzedSentence; +import de.danielnaber.languagetool.AnalyzedTokenReadings; +import de.danielnaber.languagetool.rules.RuleMatch; + +/** + * Checks if the translation for segments that have more than three + * non-whitespace tokens is different from the source. 
+ * + * @author Marcin Miłkowski + * + */ +public class SameTranslationRule extends BitextRule { + + static final String MSG = "Source and target translation are the same!"; + + @Override + public String getDescription() { + return "Check if translation is the same as source"; + } + + @Override + public String getId() { + return "SAME_TRANSLATION"; + } + + public String getMessage() { + return MSG; + } + + /** + * This method makes no sense for bitext, so it always returns null. + */ + @Override + public RuleMatch[] match(AnalyzedSentence text) throws IOException { + // Not applicable to bitext: use match(sourceText, targetText) instead + return null; + } + + @Override + public RuleMatch[] match(AnalyzedSentence sourceText, + AnalyzedSentence targetText) throws IOException { + + // This is just a heuristic: only segments with more than 3 non-whitespace tokens are checked + if (sourceText.getTokensWithoutWhitespace().length > 3 + && getPureText(sourceText).equals(getPureText(targetText))) { + final RuleMatch[] rm = new RuleMatch[1]; + final AnalyzedTokenReadings[] tokens = targetText.getTokens(); + final int len = tokens[tokens.length - 1].getStartPos() + tokens[tokens.length - 1].getToken().length(); + rm[0] = new RuleMatch(this, 1, len, MSG); + return rm; + } + return new RuleMatch[0]; + } + + private static String getPureText(AnalyzedSentence text) { + final StringBuilder sb = new StringBuilder(); + for (AnalyzedTokenReadings token : text.getTokens()) { + sb.append(token.getToken()); + } + return sb.toString(); + } + + public void reset() { + } + +} |