13 files changed, 1123 insertions, 0 deletions
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/tokenizers/SRXSentenceTokenizer.java b/JLanguageTool/src/java/de/danielnaber/languagetool/tokenizers/SRXSentenceTokenizer.java
new file mode 100644
index 0000000..dc11420
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/tokenizers/SRXSentenceTokenizer.java
@@ -0,0 +1,99 @@
+/* LanguageTool, a natural language style checker 
+ * Copyright (C) 2009 Daniel Naber (http://www.danielnaber.de)
+ * 
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
+ * USA
+ */
+package de.danielnaber.languagetool.tokenizers;
+
+import java.io.BufferedReader;
+import java.io.InputStreamReader;
+import java.util.ArrayList;
+import java.util.List;
+
+import net.sourceforge.segment.TextIterator;
+import net.sourceforge.segment.srx.SrxDocument;
+import net.sourceforge.segment.srx.SrxParser;
+import net.sourceforge.segment.srx.SrxTextIterator;
+import net.sourceforge.segment.srx.io.Srx2Parser;
+import de.danielnaber.languagetool.JLanguageTool;
+
+/**
+ * Class to tokenize sentences using an SRX file.
+ * 
+ * @author Marcin Miłkowski
+ * 
+ */
+public class SRXSentenceTokenizer extends SentenceTokenizer {
+
+  private BufferedReader srxReader; // reader over the SRX rules file; closed in finalize()
+  private final SrxDocument document; // parsed SRX rules, built once in the constructor
+  private final String language; // passed to SrxTextIterator as language + parCode
+  private String parCode; // "_one" or "_two", see setSingleLineBreaksMarksParagraph()
+
+  static final String RULES = "/segment.srx"; // rules file, looked up via the data broker
+
+  public SRXSentenceTokenizer(final String language) {
+    this.language = language;
+    try {
+      srxReader = new BufferedReader(new InputStreamReader(
+          JLanguageTool.getDataBroker().getFromResourceDirAsStream(RULES), "utf-8"));
+    } catch (Exception e) {
+      throw new RuntimeException("Could not load rules " + RULES + " from resource dir "
+         + JLanguageTool.getDataBroker().getResourceDir(), e); // keep the cause for diagnosis
+    }
+    final SrxParser srxParser = new Srx2Parser();
+    document = srxParser.parse(srxReader);
+    setSingleLineBreaksMarksParagraph(false); // default: only two or more line breaks end a paragraph
+  }
+
+  @Override
+  public final List<String> tokenize(final String text) { // returns the segments in iteration order
+    final List<String> segments = new ArrayList<String>();
+    final TextIterator textIterator = new SrxTextIterator(document, language
+        + parCode, text);
+    while (textIterator.hasNext()) {
+      segments.add(textIterator.next());
+    }
+    return segments;
+  }
+
+  public final boolean singleLineBreaksMarksPara() { // true iff the "_one" rule variant is active
+    return "_one".equals(parCode);
+  }
+
+  /**
+   * @param lineBreakParagraphs
+   *          if <code>true</code>, single lines breaks are assumed to end a
+   *          paragraph, with <code>false</code>, only two or more consecutive
+   *          line breaks end a paragraph
+   */
+  public final void setSingleLineBreaksMarksParagraph(
+      final boolean lineBreakParagraphs) {
+    if (lineBreakParagraphs) {
+      parCode = "_one";
+    } else {
+      parCode = "_two";
+    }
+  }
+
+  protected final void finalize() throws Throwable { // NOTE(review): finalizers may never run; consider closing the reader after parsing
+    if (srxReader != null) {
+      srxReader.close();
+    }
+    super.finalize();
+  }
+
+}
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/tokenizers/SentenceTokenizer.java b/JLanguageTool/src/java/de/danielnaber/languagetool/tokenizers/SentenceTokenizer.java
new file mode 100644
index 0000000..55d1ec6
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/tokenizers/SentenceTokenizer.java
@@ -0,0 +1,250 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
+ * USA
+ */
+package de.danielnaber.languagetool.tokenizers;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+import java.util.StringTokenizer;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Tokenizes text into sentences by looking for typical end-of-sentence markers,
+ * but considering exceptions (e.g. abbreviations).
+ *
+ * @author Daniel Naber
+ */
+public class SentenceTokenizer implements Tokenizer {
+
+  // end of sentence marker:
+  protected static final String EOS = "\0";
+  //private final static String EOS = "#"; // for testing only
+  protected static final String P = "[\\.!?…]"; // PUNCTUATION
+  protected static final String AP = "(?:'|«|\"|”|\\)|\\]|\\})?"; // AFTER PUNCTUATION
+  protected static final String PAP = P + AP;
+  protected static final String PARENS = "[\\(\\)\\[\\]]"; // parentheses
+
+  // Check out the private methods for comments and examples about these
+  // regular expressions:
+
+  private Pattern paragraph; // active paragraph pattern, set by setSingleLineBreaksMarksParagraph()
+  private static final Pattern paragraphByTwoLineBreaks = Pattern.compile("([\\n\\r]\\s*[\\n\\r])");
+  private static final Pattern paragraphByLineBreak = Pattern.compile("([\\n\\r])");
+
+  // add unbreakable field, for example footnote, if it's at the end of the sentence
+  private static final Pattern punctWhitespace = Pattern.compile("(" + PAP + "(\u0002)?\\s)");
+  // \p{Lu} = uppercase, with obeying Unicode (\p{Upper} is just US-ASCII!):
+  private static final Pattern punctUpperLower = Pattern.compile("(" + PAP
+      + ")([\\p{Lu}][^\\p{Lu}.])");
+  private static final Pattern letterPunct = Pattern.compile("(\\s[\\wüöäÜÖÄß]" + P + ")");
+  private static final Pattern abbrev1 = Pattern.compile("([^-\\wüöäÜÖÄß][\\wüöäÜÖÄß]" + PAP + "\\s)" + EOS);
+  private static final Pattern abbrev2 = Pattern.compile("([^-\\wüöäÜÖÄß][\\wüöäÜÖÄß]" + P + ")" + EOS);
+  private static final Pattern abbrev3 = Pattern.compile("(\\s[\\wüöäÜÖÄß]\\.\\s+)" + EOS);
+  private static final Pattern abbrev4 = Pattern.compile("(\\.\\.\\. )" + EOS + "([\\p{Ll}])");
+  private static final Pattern abbrev5 = Pattern.compile("(['\"]" + P + "['\"]\\s+)" + EOS);
+  private static final Pattern abbrev6 = Pattern.compile("([\"']\\s*)" + EOS + "(\\s*[\\p{Ll}])");
+  private static final Pattern abbrev7 = Pattern.compile("(\\s" + PAP + "\\s)" + EOS);
+  // e.g. 3.10. (in a date):
+  private static final Pattern abbrev8 = Pattern.compile("(\\d{1,2}\\.\\d{1,2}\\.\\s+)" + EOS);
+  private static final Pattern repair1 = Pattern.compile("('[\\wüöäÜÖÄß]" + P + ")(\\s)");
+  private static final Pattern repair2 = Pattern.compile("(\\sno\\.)(\\s+)(?!\\d)");
+  private static final Pattern repair3 = Pattern.compile("([ap]\\.m\\.\\s+)([\\p{Lu}])");
+
+  private static final Pattern repair10 = Pattern.compile("([\\(\\[])([!?]+)([\\]\\)]) " + EOS);
+  private static final Pattern repair11 = Pattern.compile("([!?]+)([\\)\\]]) " + EOS);
+  private static final Pattern repair12 = Pattern.compile("(" + PARENS + ") " + EOS);
+
+  // some abbreviations:
+  private static final String[] ABBREV_LIST = {
+      // English -- but these work globally for all languages:
+      "Mr", "Mrs", "No", "pp", "St", "no",
+      "Sr", "Jr", "Bros", "etc", "vs", "esp", "Fig", "fig", "Jan", "Feb", "Mar", "Apr", "Jun", "Jul",
+      "Aug", "Sep", "Sept", "Oct", "Okt", "Nov", "Dec", "Ph.D", "PhD",
+      "al",  // in "et al."
+      "cf", "Inc", "Ms", "Gen", "Sen", "Prof", "Corp", "Co"
+  };
+
+  private final Set<Pattern> abbreviationPatterns = new HashSet<Pattern>(); // one pattern per abbreviation, filled by the constructor
+
+  /**
+   * Month names like "Dezember" that should not be considered a sentence
+   * boundary in string like "13. Dezember". May also contain other
+   * words that indicate there's no sentence boundary when preceded
+   * by a number and a dot.
+   */
+  protected String[] monthNames;
+
+  /**
+   * Create a sentence tokenizer that uses the built-in abbreviations.
+   */
+  public SentenceTokenizer() {
+    this(new String[]{});
+  }
+
+  /**
+   * Create a sentence tokenizer with the given list of abbreviations,
+   * additionally to the built-in ones.
+   */
+  public SentenceTokenizer(final String[] abbrevList) {
+    final List<String> allAbbreviations = new ArrayList<String>();
+    allAbbreviations.addAll(Arrays.asList(abbrevList));
+    allAbbreviations.addAll(Arrays.asList(ABBREV_LIST));
+    for (String element : allAbbreviations) {
+      final Pattern pattern = Pattern.compile("(\\b" + element + PAP + "\\s)" + EOS); // element is used as a regex, not quoted
+      abbreviationPatterns.add(pattern);
+    }
+    setSingleLineBreaksMarksParagraph(false); // overridable call: subclass overrides run before their own constructors
+  }
+
+  /**
+   * @param lineBreakParagraphs if <code>true</code>, single lines breaks are assumed to end a paragraph,
+   *  with <code>false</code>, only two or more consecutive line breaks end a paragraph
+   */
+  public void setSingleLineBreaksMarksParagraph(final boolean lineBreakParagraphs) {
+    if (lineBreakParagraphs) {
+      paragraph = paragraphByLineBreak;
+    } else {
+      paragraph = paragraphByTwoLineBreaks;
+    }
+  }
+
+  public boolean singleLineBreaksMarksPara() {
+    return paragraph == paragraphByLineBreak;
+  }
+
+  /**
+   * Tokenize the given string to sentences.
+   */
+  public List<String> tokenize(String s) {
+    s = firstSentenceSplitting(s);
+    s = removeFalseEndOfSentence(s);
+    s = splitUnsplitStuff(s);
+    final StringTokenizer stringTokenizer =
+      new StringTokenizer(s, EOS);
+    final List<String> l = new ArrayList<String>();
+    while (stringTokenizer.hasMoreTokens()) {
+      final String sentence = stringTokenizer.nextToken();
+      l.add(sentence);
+    }
+    return l;
+  }
+
+  /**
+   * Add a special break character at all places with typical sentence delimiters.
+   */
+  private String firstSentenceSplitting(String s) {
+    // A paragraph break (as configured via setSingleLineBreaksMarksParagraph) means a new sentence:
+    s = paragraph.matcher(s).replaceAll("$1" + EOS);
+    // Punctuation followed by whitespace means a new sentence:
+    s = punctWhitespace.matcher(s).replaceAll("$1" + EOS);
+    // New (compared to the perl module): Punctuation followed by uppercase followed
+    // by non-uppercase character (except dot) means a new sentence:
+    s = punctUpperLower.matcher(s).replaceAll("$1" + EOS + "$2");
+    // Break also when single letter comes before punctuation:
+    s = letterPunct.matcher(s).replaceAll("$1" + EOS);
+    return s;
+  }
+
+  /**
+   * Repair some positions that don't require a split, i.e. remove the special break character at
+   * those positions.
+   */
+  protected String removeFalseEndOfSentence(String s) {
+    // Don't split at e.g. "U. S. A.":
+    s = abbrev1.matcher(s).replaceAll("$1");
+    // Don't split at e.g. "U.S.A.":
+    s = abbrev2.matcher(s).replaceAll("$1");
+    // Don't split after a white-space followed by a single letter followed
+    // by a dot followed by another whitespace.
+    // e.g. " p. "
+    s = abbrev3.matcher(s).replaceAll("$1");
+    // Don't split at "bla bla... yada yada" (TODO: use \.\.\.\s+ instead?)
+    s = abbrev4.matcher(s).replaceAll("$1$2");
+    // Don't split [.?!] when they're quoted:
+    s = abbrev5.matcher(s).replaceAll("$1");
+
+    // Don't split at abbreviations:
+    for (final Pattern abbrevPattern : abbreviationPatterns) {
+      final Matcher matcher = abbrevPattern.matcher(s);
+      s = matcher.replaceAll("$1");
+    }
+    // Don't break after quote unless there's a capital letter:
+    // e.g.: "That's right!" he said.
+    s = abbrev6.matcher(s).replaceAll("$1$2");
+
+    // fixme? not sure where this should occur, leaving it commented out:
+    // don't break: text . . some more text.
+    // text=~s/(\s\.\s)$EOS(\s*)/$1$2/sg;
+
+    // e.g. "Das ist . so." -> assume one sentence
+    s = abbrev7.matcher(s).replaceAll("$1");
+
+    // e.g. "3.10. " (day.month. in a date) -> no sentence boundary
+    s = abbrev8.matcher(s).replaceAll("$1");
+
+    // extension by dnaber --commented out, doesn't help:
+    // text = re.compile("(:\s+)%s(\s*[%s])" % (self.EOS, string.lowercase),
+    // re.DOTALL).sub("\\1\\2", text)
+
+    // "13. Dezember" etc. -> no sentence boundary:
+    if (monthNames != null) {
+      for (String element : monthNames) {
+        s = s.replaceAll("(\\d+\\.) " + EOS + "(" + element + ")", "$1 $2");
+      }
+    }
+
+    // e.g. "Das hier ist ein(!) Satz."
+    s = repair10.matcher(s).replaceAll("$1$2$3 ");
+
+    // e.g. "Das hier ist (genau!) ein Satz."
+    s = repair11.matcher(s).replaceAll("$1$2 ");
+
+    // e.g. "bla (...) blubb" -> no end of sentence
+    s = repair12.matcher(s).replaceAll("$1 ");
+
+    return s;
+  }
+
+  /**
+   * Treat some more special cases that make up a sentence boundary. Insert the special break
+   * character at these positions.
+   */
+  private String splitUnsplitStuff(String s) {
+    // e.g. "x5. bla..." -- not sure, leaving commented out:
+    // text = re.compile("(\D\d+)(%s)(\s+)" % self.P, re.DOTALL).sub("\\1\\2%s\\3" % self.EOS, text)
+    // Not sure about this one, leaving out for now:
+    // text = re.compile("(%s\s)(\s*\()" % self.PAP, re.DOTALL).sub("\\1%s\\2" % self.EOS, text)
+    // Split e.g.: He won't. #Really.
+    s = repair1.matcher(s).replaceAll("$1" + EOS + "$2");
+    // Split e.g.: He won't say no. Not really.
+    s = repair2.matcher(s).replaceAll("$1" + EOS + "$2");
+    // Split at "a.m." or "p.m." followed by a capital letter.
+    s = repair3.matcher(s).replaceAll("$1" + EOS + "$2");
+    return s;
+  }
+
+  /*public static void main(final String[] args) {
+    final SentenceTokenizer st = new GermanSentenceTokenizer();
+    st.tokenize("Er sagte (...) und");
+  }*/
+
+}
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/tokenizers/Tokenizer.java b/JLanguageTool/src/java/de/danielnaber/languagetool/tokenizers/Tokenizer.java
new file mode 100644
index 0000000..9a49fbe
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/tokenizers/Tokenizer.java
@@ -0,0 +1,32 @@
+/* LanguageTool, a natural language style checker 
+ * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
+ * 
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
+ * USA
+ */
+package de.danielnaber.languagetool.tokenizers;
+
+import java.util.List;
+
+/**
+ * Interface for classes that tokenize text into smaller units.
+ * 
+ * @author Daniel Naber
+ */
+public interface Tokenizer {
+
+  public abstract List<String> tokenize(String text); // splits the given text into a list of token strings
+  
+}
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/tokenizers/WordTokenizer.java b/JLanguageTool/src/java/de/danielnaber/languagetool/tokenizers/WordTokenizer.java
new file mode 100644
index 0000000..6764c34
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/tokenizers/WordTokenizer.java
@@ -0,0 +1,59 @@
+/* LanguageTool, a natural language style checker 
+ * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
+ * 
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
+ * USA
+ */
+package de.danielnaber.languagetool.tokenizers;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.StringTokenizer;
+
+/**
+ * Tokenizes a sentence into words.
+ * Punctuation and whitespace gets its own token.
+ * 
+ * @author Daniel Naber
+ */
+public class WordTokenizer implements Tokenizer {
+
+  public WordTokenizer() {
+  }
+
+  public List<String> tokenize(final String text) { // returns words and every delimiter character as separate tokens
+    final List<String> l = new ArrayList<String>();
+    final StringTokenizer st = new StringTokenizer(text, 
+        "\u0020\u00A0\u115f\u1160\u1680" 
+        + "\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007" 
+        + "\u2008\u2009\u200A\u200B\u200c\u200d\u200e\u200f"
+        + "\u2028\u2029\u202a\u202b\u202c\u202d\u202e\u202f"
+        + "\u205F\u2060\u2061\u2062\u2063\u206A\u206b\u206c\u206d"
+        + "\u206E\u206F\u3000\u3164\ufeff\uffa0\ufff9\ufffa\ufffb" 
+        + ",.;()[]{}<>!?:/\\\"'«»„”“‘`’…¿¡\t\n\r", true); // true: delimiters are returned as tokens too
+    while (st.hasMoreElements()) {
+      l.add(st.nextToken());
+    }
+    return l;
+  }
+  
+}
+
+ 
+
+ 
+ 
+ 
+ 
+\ No newline at end of file
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/tokenizers/cs/CzechSentenceTokenizer.java b/JLanguageTool/src/java/de/danielnaber/languagetool/tokenizers/cs/CzechSentenceTokenizer.java
new file mode 100644
index 0000000..2f0a4f4
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/tokenizers/cs/CzechSentenceTokenizer.java
@@ -0,0 +1,228 @@
+/* LanguageTool, a natural language style checker 
+ * Copyright (C) 2007 Daniel Naber (http://www.danielnaber.de)
+ * 
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
+ * USA
+ */
+
+/*
+ * CzechSentenceTokenizer.java
+ *
+ * Created on 25.1.2007, 11:45
+ */
+
+package de.danielnaber.languagetool.tokenizers.cs;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.StringTokenizer;
+import java.util.regex.Pattern;
+
+import de.danielnaber.languagetool.tokenizers.SentenceTokenizer;
+
+/**
+ *
+ * @author Jozef Licko
+ */
+public class CzechSentenceTokenizer extends SentenceTokenizer {
+
+  // End of sentence marker.
+  private static final String EOS = "\0";
+
+  // private final static String EOS = "#"; // for testing only
+
+  // Punctuation.
+  private static final String P = "[\\.!?…]";
+
+  // After punctuation.
+  private static final String AP = "(?:'|«|\"|”|\\)|\\]|\\})?";
+
+  private static final String PAP = P + AP;
+
+  // Check out the private methods for comments and examples about these
+  // regular expressions:
+
+  private static final Pattern paragraphByTwoLineBreaks = Pattern.compile("(\\n\\s*\\n)");
+
+  private static final Pattern paragraphByLineBreak = Pattern.compile("(\\n)");
+
+  // add unbreakable field, for example footnote, if it's at the end of the sentence
+  private static final Pattern punctWhitespace = Pattern.compile("(" + PAP + "(\u0002)?\\s)");
+
+  // \p{Lu} = uppercase, with obeying Unicode (\p{Upper} is just US-ASCII!):
+  private static final Pattern punctUpperLower = Pattern.compile("(" + PAP
+      + ")([\\p{Lu}][^\\p{Lu}.])");
+
+  private static final Pattern letterPunct = Pattern
+      .compile("(\\s[\\wáčďéěíňóřšťúůýžÁČĎÉĚÍŇÓŘŠŤÚŮÝŽ]" + P + ")");
+
+  private static final Pattern abbrev1 = Pattern
+      .compile("([^-\\wáčďéěíňóřšťúůýžÁČĎÉĚÍŇÓŘŠŤÚŮÝŽ][\\wáčďéěíňóřšťúůýžÁČĎÉĚÍŇÓŘŠŤÚŮÝŽ]" + PAP
+          + "\\s)" + EOS);
+
+  private static final Pattern abbrev2 = Pattern
+      .compile("([^-\\wáčďéěíňóřšťúůýžÁČĎÉĚÍŇÓŘŠŤÚŮÝŽ][\\wáčďéěíňóřšťúůýžÁČĎÉĚÍŇÓŘŠŤÚŮÝŽ]" + P
+          + ")" + EOS);
+
+  private static final Pattern abbrev3 = Pattern
+      .compile("(\\s[\\wáčďéěíňóřšťúůýžÁČĎÉĚÍŇÓŘŠŤÚŮÝŽ]\\.\\s+)" + EOS);
+
+  private static final Pattern abbrev4 = Pattern.compile("(\\.\\.\\. )" + EOS + "([\\p{Ll}])");
+  private static final Pattern abbrev5 = Pattern.compile("(['\"]" + P + "['\"]\\s+)" + EOS);
+  private static final Pattern abbrev6 = Pattern.compile("([\"']\\s*)" + EOS + "(\\s*[\\p{Ll}])");
+  private static final Pattern abbrev7 = Pattern.compile("(\\s" + PAP + "\\s)" + EOS);
+  // e.g. 3.10. (in a date):
+  private static final Pattern abbrev8 = Pattern.compile("(\\d{1,2}\\.\\d{1,2}\\.\\s+)" + EOS);
+  private static final Pattern repair1 = Pattern.compile("('[\\wáčďéěíňóřšťúůýžÁČĎÉĚÍŇÓŘŠŤÚŮÝŽ]"
+      + P + ")(\\s)");
+  private static final Pattern repair2 = Pattern.compile("(\\sno\\.)(\\s+)(?!\\d)");
+
+  //  Czech abbreviations (ver. 0.2)
+
+  // various titles
+  private static final String TITLES = "Bc|BcA|Ing|Ing.arch|MUDr|MVDr|MgA|Mgr|JUDr|PhDr|" +
+      "RNDr|PharmDr|ThLic|ThDr|Ph.D|Th.D|prof|doc|CSc|DrSc|dr. h. c|PaedDr|Dr|PhMr|DiS";
+
+  // as a single regexp:
+  private static final String ABBREVIATIONS = "abt|ad|a.i|aj|angl|anon|apod|atd|atp|aut|bd|biogr|" +
+      "b.m|b.p|b.r|cca|cit|cizojaz|c.k|col|čes|čín|čj|ed|facs|fasc|fol|fot|franc|h.c|hist|hl|" +
+      "hrsg|ibid|il|ind|inv.č|jap|jhdt|jv|koed|kol|korej|kl|krit|lat|lit|m.a|maď|mj|mp|násl|" +
+      "např|nepubl|něm|no|nr|n.s|okr|odd|odp|obr|opr|orig|phil|pl|pokrač|pol|port|pozn|př.kr|" +
+      "př.n.l|přel|přeprac|příl|pseud|pt|red|repr|resp|revid|rkp|roč|roz|rozš|samost|sect|" +
+      "sest|seš|sign|sl|srv|stol|sv|šk|šk.ro|špan|tab|t.č|tis|tj|tř|tzv|univ|uspoř|vol|" +
+      "vl.jm|vs|vyd|vyobr|zal|zejm|zkr|zprac|zvl|n.p"
+      + "|" + TITLES;
+  private static final Pattern abbreviationsPattern = Pattern.compile("(?u)(\\b(" + ABBREVIATIONS + ")" + PAP + "\\s)" + EOS);
+
+  private Pattern paragraph; // active paragraph pattern of this class (the superclass keeps its own)
+
+  /**
+   * Create a sentence tokenizer.
+   */
+  public CzechSentenceTokenizer() {
+    setSingleLineBreaksMarksParagraph(false);
+  }
+
+  /**
+   * @param lineBreakParagraphs if <code>true</code>, single lines breaks are assumed to end a paragraph,
+   *  with <code>false</code>, only two or more consecutive line breaks end a paragraph
+   */
+  @Override
+  public final void setSingleLineBreaksMarksParagraph(final boolean lineBreakParagraphs) {
+    // Keep the superclass state in sync so the inherited singleLineBreaksMarksPara() reports the mode.
+    super.setSingleLineBreaksMarksParagraph(lineBreakParagraphs);
+    paragraph = lineBreakParagraphs ? paragraphByLineBreak : paragraphByTwoLineBreaks;
+  }
+
+  public final List<String> tokenize(String s) {
+    s = firstSentenceSplitting(s);
+    s = removeFalseEndOfSentence(s);
+    s = splitUnsplitStuff(s);
+    final StringTokenizer stringTokenizer = 
+      new StringTokenizer(s, EOS);
+    List<String> l = new ArrayList<String>();
+    while (stringTokenizer.hasMoreTokens()) {
+      String sentence = stringTokenizer.nextToken();
+      l.add(sentence);
+    }
+    return l;
+  }
+
+  /**
+   * Add a special break character at all places with typical sentence delimiters.
+   */
+  private String firstSentenceSplitting(String s) {
+    // A paragraph break (as configured via setSingleLineBreaksMarksParagraph) means a new sentence:
+    s = paragraph.matcher(s).replaceAll("$1" + EOS);
+    // Punctuation followed by whitespace means a new sentence:
+    s = punctWhitespace.matcher(s).replaceAll("$1" + EOS);
+    // New (compared to the perl module): Punctuation followed by uppercase followed
+    // by non-uppercase character (except dot) means a new sentence:
+    s = punctUpperLower.matcher(s).replaceAll("$1" + EOS + "$2");
+    // Break also when single letter comes before punctuation:
+    s = letterPunct.matcher(s).replaceAll("$1" + EOS);
+    return s;
+  }
+
+  /**
+   * Repair some positions that don't require a split, i.e. remove the special break character at
+   * those positions.
+   */
+  protected String removeFalseEndOfSentence(String s) {
+    // Don't split at e.g. "U. S. A.":
+    s = abbrev1.matcher(s).replaceAll("$1");
+    // Don't split at e.g. "U.S.A.":
+    s = abbrev2.matcher(s).replaceAll("$1");
+    // Don't split after a white-space followed by a single letter followed
+    // by a dot followed by another whitespace.
+    // e.g. " p. "
+    s = abbrev3.matcher(s).replaceAll("$1");
+    // Don't split at "bla bla... yada yada" (TODO: use \.\.\.\s+ instead?)
+    s = abbrev4.matcher(s).replaceAll("$1$2");
+    // Don't split [.?!] when they're quoted:
+    s = abbrev5.matcher(s).replaceAll("$1");
+
+    // Don't split at abbreviations. NOTE(review): meant to be case insensitive, but the pattern sets only (?u), not (?iu) -- confirm
+    //TODO: don't split at some abbreviations followed by uppercase
+    //E.g., "Wojna rozpoczęła się w 1918 r. To była krwawa jatka"
+    //should be split at "r."... But
+    //"Ks. Jankowski jest analfabetą" shouldn't be split...
+    //this requires a special list of abbrevs used before names etc.
+
+    //one precompiled regexp instead of a loop over the abbreviations - this is definitely much, much faster
+    s = abbreviationsPattern.matcher(s).replaceAll("$1");
+
+    // Don't break after quote unless there's a capital letter:
+    // e.g.: "That's right!" he said.
+    s = abbrev6.matcher(s).replaceAll("$1$2");
+
+    // fixme? not sure where this should occur, leaving it commented out:
+    // don't break: text . . some more text.
+    // text=~s/(\s\.\s)$EOS(\s*)/$1$2/sg;
+
+    // e.g. "Das ist . so." -> assume one sentence
+    s = abbrev7.matcher(s).replaceAll("$1");
+
+    // e.g. "3.10. " (day.month. in a date) -> no sentence boundary
+    s = abbrev8.matcher(s).replaceAll("$1");
+
+    // extension by dnaber --commented out, doesn't help:
+    // text = re.compile("(:\s+)%s(\s*[%s])" % (self.EOS, string.lowercase),
+    // re.DOTALL).sub("\\1\\2", text)
+
+    s = s.replaceAll("(\\d+\\.) " + EOS + "([\\p{L}&&[^\\p{Lu}]]+)", "$1 $2"); // number + dot before non-uppercase letters: no boundary
+
+    // e.g. "Das hier ist ein(!) Satz."
+    s = s.replaceAll("\\(([!?]+)\\) " + EOS, "($1) ");
+    return s;
+  }
+
+  /**
+   * Treat some more special cases that make up a sentence boundary. Insert the special break
+   * character at these positions.
+   */
+  private String splitUnsplitStuff(String s) {
+    // e.g. "x5. bla..." -- not sure, leaving commented out:
+    // text = re.compile("(\D\d+)(%s)(\s+)" % self.P, re.DOTALL).sub("\\1\\2%s\\3" % self.EOS, text)
+    // Not sure about this one, leaving out for now:
+    // text = re.compile("(%s\s)(\s*\()" % self.PAP, re.DOTALL).sub("\\1%s\\2" % self.EOS, text)
+    // Split e.g.: He won't. #Really.
+    s = repair1.matcher(s).replaceAll("$1" + EOS + "$2");
+    // Split e.g.: He won't say no. Not really.
+    s = repair2.matcher(s).replaceAll("$1" + EOS + "$2");
+    return s;
+  }
+
+}
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/tokenizers/da/DanishSentenceTokenizer.java b/JLanguageTool/src/java/de/danielnaber/languagetool/tokenizers/da/DanishSentenceTokenizer.java
new file mode 100644
index 0000000..32db6d5
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/tokenizers/da/DanishSentenceTokenizer.java
@@ -0,0 +1,43 @@
+/* LanguageTool, a natural language style checker 
+ * Copyright (C) 2006 Daniel Naber (http://www.danielnaber.de)
+ * 
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
+ * USA
+ */
+package de.danielnaber.languagetool.tokenizers.da;
+
+import de.danielnaber.languagetool.tokenizers.SentenceTokenizer;
+
+/**
+ * @deprecated use {@code new SRXSentenceTokenizer("da")} instead
+ * @author Daniel Naber
+ */
+public class DanishSentenceTokenizer extends SentenceTokenizer {
+
+  private static final String[] ABBREV_LIST = {
+"abs", "abstr", "adj", "adm", "adr", "adv", "afd", "afg", "afl", "afs", "afvig", "agro", "akad", "akk", "allr", "alm", "amer", "anat", "ang", "anm", "anv", "apot", "appos", "apr", "arab", "arkais", "arkæol", "arp", "arr", "art", "ass", "astr", "att", "attrib", "aud", "aug", "aut", "bag", "barb", "barnespr", "bd", "bdt", "beg", "besl", "best", "bet", "bhk", "biavl", "bibet", "bibl", "bibliot", "billard", "billedl", "biol", "bjergv", "bk", "bl", "bogb", "bogh", "bogtr", "bornh", "bot", "br", "bryg", "bto", "bygn", "bødk", "ca", "cand", "Chr", "cirk", "cit", "co", "d", "da", "dagl", "dans", "dat", "dec", "def", "demonstr", "dep", "dial", "diam", "dim", "disp", "distr", "distrib", "dobb", "dr", "dvs", "e", "egl", "eks", "eksam", "ekskl", "eksp", "ekspl", "el", "ell", "ellipt", "emb", "endv", "eng", "enk", "ent", "etc", "etnogr", "eufem", "eur", "event", "evt", "f", "fagl", "fakt", "farv", "feb", "ff", "fhv", "fig", "filos", "fisk", "fk", "fl", "flg", "flt", "flyv", "fmd", "fon", "foragt", "forb", "foreg", "forf", "forsikr", "forsk", "forst", "foræld", "fot", "fr", "fre", "fris", "frk", "fsv", "fuldm", "fx", "fys", "fysiol", "fægt", "gart", "gartn", "garv", "gdr", "gen", "genopt", "geogr", "geol", "geom", "germ", "gl", "glarm", "glda", "gldgs", "glholl", "glno", "gns", "got", "gr", "gradbøjn", "gram", "gross", "grundbet", "græc", "guldsm", "gym", "h", "hat", "hd", "hebr", "henh", "hensobj", "herald", "hhv", "hist", "hj", "holl", "hovedbet", "hr", "hty", "højtid", "haandarb", "haandv", "i", "if", "ifm", "ift", "iht", "imp", "indb", "indik", "inf", "ing", "Inkl", "inkl", "insp", "instr", "interj", "intk", "intr", "iron", "isl", "ital", "jan", "jarg", "jf", "jnr", "jr", "jul", "jun", "jur", "jy", "jæg", "jærnb", "jød", "Kbh", "kbh", "kem", "kgl", "kirk", "kl", "kld", "knsp", "kog", "koll", "komm", "komp", "konj", "konkr", "kons", "Kr", "kr", "kurv", "kvt", "køkkenspr", "l", "landbr", "landmaaling", "lat", "lb", "lic", "lign", "litt", "Ll", "log", "Loll", "loll", "lrs", 
"lør", "m", "maj", "maks", "mal", "man", "mar", "mat", "mdl", "mdr", "med", "medl", "meng", "merc", "meteorol", "meton", "metr", "mf", "mfl", "mht", "mia", "min", "mineral", "mio", "ml", "mlat", "mm", "mnt", "mods", "modsætn", "modt", "mr", "mrk", "mur", "mv", "mvh", "mytol", "møl", "mønt", "n", "naturv", "ndf", "Ndr", "nedsæt", "nht", "no", "nom", "nov", "nr", "nt", "num", "nyda", "nydann", "nylat", "naal", "obj", "obl", "oblik", "obs", "odont", "oecon", "oeng", "ofl", "ogs", "oht", "okt", "oldfr", "oldfris", "oldn", "olgn", "omg", "omkr", "omtr", "ons", "opr", "ordspr", "org", "osax", "osv", "ovenst", "overf", "overs", "ovf", "p", "pag", "part", "pass", "pct", "perf", "pga", "ph", "pharm", "phil", "pk", "pkt", "pl", "plur", "poet", "polit", "port", "poss", "post", "pott", "pr", "pron", "propr", "prov", "præd", "præp", "præs", "præt", "psych", "pt", "pæd", "paavirkn", "reb", "ref", "refl", "regn", "relat", "relig", "resp", "retor", "rid", "rigsspr", "run", "russ", "s", "sa", "sanskr", "scient", "sdjy", "sdr", "sek", "sen", "sep", "sept", "shetl", "sj", "sjæll", "skibsbygn", "sko", "skol", "skr", "skriftspr", "skræd", "Skt", "slagt", "slutn", "smed", "sml", "smsat", "smst", "snedk", "soldat", "sp", "spec", "sport", "spot", "spr", "sprogv", "spøg", "ssg", "ssgr", "st", "stk", "str", "stud", "subj", "subst", "superl", "sv", "sætn", "søn", "talem", "talespr", "tandl", "td", "tdl", "teat", "techn", "telef", "telegr", "teol", "th", "theol", "tir", "tirs", "tlf", "told", "tor", "tors", "trans", "tsk", "ty", "tyrk", "tøm", "u", "ubesl", "ubest", "udd", "udenl", "udg", "udtr", "uegl", "ugtl", "ult", "underbet", "undt", "univ", "upers", "ur", "urnord", "v", "var", "vbs", "vedk", "vedl", "vedr", "vejl", "verb", "vet", "vha", "vol", "vs", "vsa", "vulg", "væv", "zool", "æ", "æda", "ænht", "ænyd", "æstet", "ø", "å", "årg", "årh"
+  };
+
+  // Month names like "januar" that should not be considered a sentence
+  // boundary in strings like "13. januar"; handed to the superclass via monthNames.
+  private static final String[] MONTH_NAMES = { "januar", "februar", "marts", "april", "maj",
+      "juni", "juli", "august", "september", "oktober", "november", "december" };
+
+  public DanishSentenceTokenizer() {
+    super(ABBREV_LIST); // Danish abbreviations are used in addition to the built-in ones
+    super.monthNames = MONTH_NAMES; // no sentence boundary between a number with dot and a month name
+  }
+ 
+}
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/tokenizers/de/GermanCompoundTokenizer.java b/JLanguageTool/src/java/de/danielnaber/languagetool/tokenizers/de/GermanCompoundTokenizer.java
new file mode 100644
index 0000000..fb141a0
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/tokenizers/de/GermanCompoundTokenizer.java
@@ -0,0 +1,47 @@
+/* LanguageTool, a natural language style checker 
+ * Copyright (C) 2007 Daniel Naber (http://www.danielnaber.de)
+ * 
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
+ * USA
+ */
+package de.danielnaber.languagetool.tokenizers.de;
+
+import java.io.IOException;
+import java.util.List;
+
+import de.abelssoft.wordtools.jWordSplitter.impl.GermanWordSplitter;
+import de.danielnaber.languagetool.tokenizers.Tokenizer;
+
+/**
+ * Split German nouns using the jWordSplitter library.
+ * 
+ * @author Daniel Naber
+ */
+public class GermanCompoundTokenizer implements Tokenizer {
+  private final GermanWordSplitter wordSplitter;
+
+  /** Builds the splitter: strict mode on, minimum word length 3. */
+  public GermanCompoundTokenizer() throws IOException {
+    final GermanWordSplitter splitter = new GermanWordSplitter(false);
+    splitter.setStrictMode(true); // required for now to make minimum length work
+    splitter.setMinimumWordLength(3);
+    this.wordSplitter = splitter;
+  }
+
+  /** Returns whatever parts the word splitter produces for the given word. */
+  public List<String> tokenize(String word) {
+    return (List<String>) this.wordSplitter.splitWord(word);
+  }
+}
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/tokenizers/de/GermanSentenceTokenizer.java b/JLanguageTool/src/java/de/danielnaber/languagetool/tokenizers/de/GermanSentenceTokenizer.java
new file mode 100644
index 0000000..31dab43
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/tokenizers/de/GermanSentenceTokenizer.java
@@ -0,0 +1,95 @@
+/* LanguageTool, a natural language style checker 
+ * Copyright (C) 2006 Daniel Naber (http://www.danielnaber.de)
+ * 
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
+ * USA
+ */
+package de.danielnaber.languagetool.tokenizers.de;
+
+import java.util.regex.Pattern;
+
+import de.danielnaber.languagetool.tokenizers.SentenceTokenizer;
+import org.apache.commons.lang.ArrayUtils;
+
+/**
+ * @deprecated use {@code new SRXSentenceTokenizer("de")} instead
+ */
+@Deprecated
+public class GermanSentenceTokenizer extends SentenceTokenizer {
+  /** Built-in German abbreviations, passed to the superclass constructor. */
+  private static final String[] ABBREV_LIST = {
+    "d", "Übers", "usw", "bzw", "Abh", "Abk", "Abt", "ahd", "Akk",
+    "allg", "alltagsspr", "altdt", "alttest", "amerikan", "Anh",
+    "Ank", "Anm", "Art", "Az", "Bat", "bayr", "Bd", "Bde", "Bed",
+    "Bem", "bes", "bez", "Bez", "Bhf", "bspw", "btto", "bw", "bzw",
+    "cts", "ct", "Ca", "ca", "chem", "chin", "Chr", "cresc", "dat", "Dat",
+    "desgl", "ders", "dgl", "Di", "Dipl", "Dir", "Do", "Doz", "Dr",
+    "dt", "ebd", "Ed", "eigtl", "engl", "Erg", "al", "etc", "etw",
+    "ev", "evtl", "Evtl", "exkl", "Expl", "Exz", "ff", "Fa", "fachspr", "fam",
+    "fem", "Fem", "Fr", "fr", "franz", "frz", "frdl", "Frl",
+    "Fut", "Gd", "geb", "gebr", "Gebr", "geh", "geh", "geleg", "gen",
+    "Gen", "germ", "gesch", "ges", "get", "ggf", "Ggs", "ggT",
+    "griech", "hebr", "hg", "Hrsg", "Hg", "hist", "hochd", "hochspr",
+    "Hptst", "Hr", "Allg", "ill", "inkl", "incl", "Ind", "Inf", "Ing",
+    "ital", "Tr", "Jb", "Jg", "Jh", "Jhd", "jmd", "jmdm", "jmdn", "jmds",
+    "jur", "Kap", "kart", "kath", "kfm", "kaufm", "Kfm", "kgl",
+    "Kl", "Konj", "Krs", "Kr", "Kto", "lat", "lfd", "Lit", "lt",
+    "Lz", "Mask", "mask", "Mrd", "mdal", "med", "met", "mhd", "Mi",
+    "Mio", "min", "Mo", "mod", "nachm", "nördlBr", "neutr",
+    "Nhd", "Nom", "Nr", "Nrn", "Num", "Obj", "od", "dgl", "offz",
+    "Part", "Perf", "Pers", "Pfd", "Pl", "Plur",
+    "pl", "Plusq", "Pos", "pp", "Präp", "Präs", "Prät", "Prov", "Prof",
+    "rd", "reg", "resp", "Rhld", "rit", "Sa", "südl", "Br",
+    "sel", "sen", "Sept", "Sing", "sign", "So", "sog", "Sp", "St",
+    "St", "St", "Std", "stacc", "Str", "stud", "Subst", "sva", "svw",
+    "sZ", "Tel", "Temp", "trans", "Tsd", "übertr", "übl", "ff", "ugs", "univ",
+    "urspr", "usw", "vgl", "Vol", "vorm", "vorm", "Vp", "Vs",
+    "vs", "wg", "Hd", "Ztr", "zus", "Zus", "zzt", "zz", "Zz", "Zt",
+    "Min", "bzgl"};
+
+  // Some German month names before which a number may appear without a
+  // sentence boundary being detected (e.g. "am 13. Dezember" -> no sentence boundary).
+  private static final String[] MONTH_NAMES = { "Januar", "Februar", "März", "April", "Mai",
+      "Juni", "Juli", "August", "September", "Oktober", "November", "Dezember",
+      // similar cases that are not month names:
+      "Jh", "Jhd", "Jahrhundert", "Jahrhunderts", "Geburtstag", "Geburtstags", "Platz", "Platzes"};
+
+  /** don't split at cases like "Friedrich II. wird auch..." */
+  private static final Pattern REPAIR_NAME_PATTERN = Pattern.compile("( [IVX]+\\.) " + EOS + "([^A-ZÖÄÜ]+)");
+
+  /** don't split at cases like "im 13. oder 14. Jahrhundert" */
+  private static final Pattern REPAIR_NUMBER_PATTERN = Pattern.compile("(\\d+\\.)(\\s+)" + EOS + "(und|oder|bis) ");
+
+  /** Creates a tokenizer that uses only the built-in abbreviations. */
+  public GermanSentenceTokenizer() {
+    super(ABBREV_LIST);
+    super.monthNames = MONTH_NAMES;
+  }
+
+  /** Creates a tokenizer with the given abbreviations in addition to the built-in ones. */
+  public GermanSentenceTokenizer(final String[] abbrevList) {
+    super((String[]) ArrayUtils.addAll(ABBREV_LIST, abbrevList));
+    super.monthNames = MONTH_NAMES;
+  }
+
+  /** Runs the inherited repair first, then applies the two German-specific repair patterns. */
+  @Override
+  protected String removeFalseEndOfSentence(String s) {
+    s = super.removeFalseEndOfSentence(s);
+    s = REPAIR_NAME_PATTERN.matcher(s).replaceAll("$1 $2"); // drop EOS after " II. " and the like
+    s = REPAIR_NUMBER_PATTERN.matcher(s).replaceAll("$1$2$3 "); // drop EOS between "13. " and und/oder/bis
+    return s;
+  }
+}
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/tokenizers/en/EnglishWordTokenizer.java b/JLanguageTool/src/java/de/danielnaber/languagetool/tokenizers/en/EnglishWordTokenizer.java
new file mode 100644
index 0000000..5b29d18
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/tokenizers/en/EnglishWordTokenizer.java
@@ -0,0 +1,53 @@
+/* LanguageTool, a natural language style checker 
+ * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
+ * 
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
+ * USA
+ */
+package de.danielnaber.languagetool.tokenizers.en;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.StringTokenizer;
+
+import de.danielnaber.languagetool.tokenizers.Tokenizer;
+
+/**
+ * Tokenizes a sentence into words. Punctuation and whitespace characters each get their own token.
+ * 
+ * @author Daniel Naber
+ */
+public class EnglishWordTokenizer implements Tokenizer {
+  /** Characters that end a token; each one is also returned as a token of its own. */
+  private static final String DELIMITERS = "\u0020\u00A0\u115f\u1160\u1680"
+      + "\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007"
+      + "\u2008\u2009\u200A\u200B\u200c\u200d\u200e\u200f"
+      + "\u2028\u2029\u202a\u202b\u202c\u202d\u202e\u202f"
+      + "\u205F\u2060\u2061\u2062\u2063\u206A\u206b\u206c\u206d"
+      + "\u206E\u206F\u3000\u3164\ufeff\uffa0\ufff9\ufffa\ufffb"
+      + ",.;()[]{}!?:\"'’‘„“”…\\/\t\n";
+
+  public EnglishWordTokenizer() { }
+
+  /** Splits the text at every delimiter character, keeping the delimiters as tokens. */
+  public List<String> tokenize(final String text) {
+    final List<String> result = new ArrayList<String>();
+    final StringTokenizer splitter = new StringTokenizer(text, DELIMITERS, true);
+    while (splitter.hasMoreTokens()) {
+      result.add(splitter.nextToken());
+    }
+    return result;
+  }
+}
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/tokenizers/gl/GalicianWordTokenizer.java b/JLanguageTool/src/java/de/danielnaber/languagetool/tokenizers/gl/GalicianWordTokenizer.java
new file mode 100644
index 0000000..6a1919e
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/tokenizers/gl/GalicianWordTokenizer.java
@@ -0,0 +1,53 @@
+/* LanguageTool, a natural language style checker 
+ * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
+ * 
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
+ * USA
+ */
+package de.danielnaber.languagetool.tokenizers.gl;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.StringTokenizer;
+
+import de.danielnaber.languagetool.tokenizers.Tokenizer;
+
+/**
+ * Tokenizes a sentence into words. Punctuation and whitespace characters each get their own token.
+ * 
+ * @author Daniel Naber
+ */
+public class GalicianWordTokenizer implements Tokenizer {
+  /** Characters that end a token; each one is also returned as a token of its own. */
+  private static final String DELIMITERS = "\u0020\u00A0\u115f\u1160\u1680"
+      + "\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007"
+      + "\u2008\u2009\u200A\u200B\u200c\u200d\u200e\u200f"
+      + "\u2028\u2029\u202a\u202b\u202c\u202d\u202e\u202f"
+      + "\u205F\u2060\u2061\u2062\u2063\u206A\u206b\u206c\u206d"
+      + "\u206E\u206F\u3000\u3164\ufeff\uffa0\ufff9\ufffa\ufffb"
+      + ",.;()[]{}¿¡!?:\"'’‘„“”…\\/\t\n";
+
+  public GalicianWordTokenizer() { }
+
+  /** Splits the text at every delimiter character, keeping the delimiters as tokens. */
+  public List<String> tokenize(final String text) {
+    final List<String> result = new ArrayList<String>();
+    final StringTokenizer splitter = new StringTokenizer(text, DELIMITERS, true);
+    while (splitter.hasMoreTokens()) {
+      result.add(splitter.nextToken());
+    }
+    return result;
+  }
+}
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/tokenizers/ml/MalayalamWordTokenizer.java b/JLanguageTool/src/java/de/danielnaber/languagetool/tokenizers/ml/MalayalamWordTokenizer.java
new file mode 100644
index 0000000..ebd7ce3
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/tokenizers/ml/MalayalamWordTokenizer.java
@@ -0,0 +1,55 @@
+/* LanguageTool, a natural language style checker 
+ * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
+ * 
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
+ * USA
+ */
+package de.danielnaber.languagetool.tokenizers.ml;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.StringTokenizer;
+
+import de.danielnaber.languagetool.tokenizers.Tokenizer;
+
+/**
+ * Tokenizes a sentence into words. Punctuation and whitespace characters each get their own token.
+ * 
+ * @author Daniel Naber
+ */
+public class MalayalamWordTokenizer implements Tokenizer {
+  /** Characters that end a token; each one is also returned as a token of its own. */
+  private static final String DELIMITERS = "\u0020\u00A0\u115f\u1160\u1680"
+      /* Switched off for Malayalam; presumably U+200C/U+200D occur inside words - TODO confirm.
+      + "\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007"
+      + "\u2008\u2009\u200A\u200B\u200c\u200d\u200e\u200f"
+      + "\u2028\u2029\u202a\u202b\u202c\u202d\u202e\u202f"
+      + "\u205F\u2060\u2061\u2062\u2063\u206A\u206b\u206c\u206d"
+      + "\u206E\u206F\u3000\u3164\ufeff\uffa0\ufff9\ufffa\ufffb"
+      */
+      + ",.;()[]{}!?:\"'’‘„“”…\\/\t\n";
+
+  public MalayalamWordTokenizer() { }
+
+  /** Splits the text at every delimiter character, keeping the delimiters as tokens. */
+  public List<String> tokenize(final String text) {
+    final List<String> result = new ArrayList<String>();
+    final StringTokenizer splitter = new StringTokenizer(text, DELIMITERS, true);
+    while (splitter.hasMoreTokens()) {
+      result.add(splitter.nextToken());
+    }
+    return result;
+  }
+}
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/tokenizers/nl/DutchWordTokenizer.java b/JLanguageTool/src/java/de/danielnaber/languagetool/tokenizers/nl/DutchWordTokenizer.java
new file mode 100644
index 0000000..7b12536
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/tokenizers/nl/DutchWordTokenizer.java
@@ -0,0 +1,53 @@
+/* LanguageTool, a natural language style checker 
+ * Copyright (C) 2008 Daniel Naber (http://www.danielnaber.de)
+ * 
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
+ * USA
+ */
+
+package de.danielnaber.languagetool.tokenizers.nl;
+
+import java.util.Arrays;
+import java.util.List;
+
+import de.danielnaber.languagetool.tokenizers.WordTokenizer;
+
+public class DutchWordTokenizer extends WordTokenizer {
+  private static final String APOS_MARKER = "##NL_APOS##"; // hides in-word apostrophes from super.tokenize
+  /** Apostrophe directly between two letters; lookarounds also catch both in "a'b'c". */
+  private static final java.util.regex.Pattern INNER_APOS =
+      java.util.regex.Pattern.compile("(?<=\\p{L})'(?=\\p{L})");
+
+  public DutchWordTokenizer() { }
+
+  /**
+   * Tokenizes just like WordTokenizer, except that an apostrophe between two
+   * letters, as in "oma's", is hidden from the base tokenizer and restored afterwards.
+   * Note: the special string ##NL_APOS## replaces such apostrophes during
+   * tokenizing, so any occurrence of it in the input comes back as an apostrophe.
+   * @param text text to tokenize
+   * @return a new, modifiable list of tokens
+   */
+  @Override
+  public List<String> tokenize(final String text) {
+    // TODO: find a cleaner implementation, this is a hack
+    final List<String> masked = super.tokenize(INNER_APOS.matcher(text).replaceAll(APOS_MARKER));
+    final List<String> tokens = new java.util.ArrayList<String>(masked.size());
+    for (final String token : masked) {
+      tokens.add(token.replace(APOS_MARKER, "'"));
+    }
+    return tokens;
+  }
+}
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/tokenizers/ro/RomanianWordTokenizer.java b/JLanguageTool/src/java/de/danielnaber/languagetool/tokenizers/ro/RomanianWordTokenizer.java
new file mode 100644
index 0000000..42fa76e
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/tokenizers/ro/RomanianWordTokenizer.java
@@ -0,0 +1,56 @@
+/* LanguageTool, a natural language style checker 
+ * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
+ * 
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
+ * USA
+ */
+package de.danielnaber.languagetool.tokenizers.ro;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.StringTokenizer;
+
+import de.danielnaber.languagetool.tokenizers.Tokenizer;
+
+/**
+ * Tokenizes a sentence into words. Punctuation and whitespace characters each
+ * get their own token. Like EnglishWordTokenizer except for some characters, e.g. "-'
+ * 
+ * @author Ionuț Păduraru
+ * @since 20.02.2009 19:53:50
+ */
+public class RomanianWordTokenizer implements Tokenizer {
+
+	/** Characters that end a token; each one is also returned as a token of its own. */
+	private static final String DELIMITERS = "\u0020\u00A0\u115f\u1160\u1680"
+			+ "\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007"
+			+ "\u2008\u2009\u200A\u200B\u200c\u200d\u200e\u200f"
+			+ "\u2028\u2029\u202a\u202b\u202c\u202d\u202e\u202f"
+			+ "\u205F\u2060\u2061\u2062\u2063\u206A\u206b\u206c\u206d"
+			+ "\u206E\u206F\u3000\u3164\ufeff\uffa0\ufff9\ufffa\ufffb"
+			+ ",.;()[]{}!?:\"'’‘„“”…\\/\t\n\r«»<>%°" + "-|=";
+
+	public RomanianWordTokenizer() { }
+
+	/** Splits the text at every delimiter character, keeping the delimiters as tokens. */
+	public List<String> tokenize(final String text) {
+		final List<String> result = new ArrayList<String>();
+		final StringTokenizer splitter = new StringTokenizer(text, DELIMITERS, true);
+		while (splitter.hasMoreTokens()) {
+			result.add(splitter.nextToken());
+		}
+		return result;
+	}
+}