summaryrefslogtreecommitdiffstats
path: root/JLanguageTool/src/java/de/danielnaber/languagetool/tokenizers/SRXSentenceTokenizer.java
diff options
context:
space:
mode:
Diffstat (limited to 'JLanguageTool/src/java/de/danielnaber/languagetool/tokenizers/SRXSentenceTokenizer.java')
-rw-r--r--JLanguageTool/src/java/de/danielnaber/languagetool/tokenizers/SRXSentenceTokenizer.java99
1 files changed, 99 insertions, 0 deletions
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/tokenizers/SRXSentenceTokenizer.java b/JLanguageTool/src/java/de/danielnaber/languagetool/tokenizers/SRXSentenceTokenizer.java
new file mode 100644
index 0000000..dc11420
--- /dev/null
+++ b/JLanguageTool/src/java/de/danielnaber/languagetool/tokenizers/SRXSentenceTokenizer.java
@@ -0,0 +1,99 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2009 Daniel Naber (http://www.danielnaber.de)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+package de.danielnaber.languagetool.tokenizers;
+
+import java.io.BufferedReader;
+import java.io.InputStreamReader;
+import java.util.ArrayList;
+import java.util.List;
+
+import net.sourceforge.segment.TextIterator;
+import net.sourceforge.segment.srx.SrxDocument;
+import net.sourceforge.segment.srx.SrxParser;
+import net.sourceforge.segment.srx.SrxTextIterator;
+import net.sourceforge.segment.srx.io.Srx2Parser;
+import de.danielnaber.languagetool.JLanguageTool;
+
+/**
+ * Class to tokenize sentences using an SRX file.
+ *
+ * @author Marcin MiƂkowski
+ *
+ */
+public class SRXSentenceTokenizer extends SentenceTokenizer {
+
+ private BufferedReader srxReader;
+ private final SrxDocument document;
+ private final String language;
+ private String parCode;
+
+ static final String RULES = "/segment.srx";
+
+ public SRXSentenceTokenizer(final String language) {
+ this.language = language;
+ try {
+ srxReader = new BufferedReader(new InputStreamReader(
+ JLanguageTool.getDataBroker().getFromResourceDirAsStream(RULES), "utf-8"));
+ } catch (Exception e) {
+ throw new RuntimeException("Could not load rules " + RULES + " from resource dir "
+ + JLanguageTool.getDataBroker().getResourceDir());
+ }
+ final SrxParser srxParser = new Srx2Parser();
+ document = srxParser.parse(srxReader);
+ setSingleLineBreaksMarksParagraph(false);
+ }
+
+ @Override
+ public final List<String> tokenize(final String text) {
+ final List<String> segments = new ArrayList<String>();
+ final TextIterator textIterator = new SrxTextIterator(document, language
+ + parCode, text);
+ while (textIterator.hasNext()) {
+ segments.add(textIterator.next());
+ }
+ return segments;
+ }
+
+ public final boolean singleLineBreaksMarksPara() {
+ return "_one".equals(parCode);
+ }
+
+ /**
+ * @param lineBreakParagraphs
+ * if <code>true</code>, single lines breaks are assumed to end a
+ * paragraph, with <code>false</code>, only two ore more consecutive
+ * line breaks end a paragraph
+ */
+ public final void setSingleLineBreaksMarksParagraph(
+ final boolean lineBreakParagraphs) {
+ if (lineBreakParagraphs) {
+ parCode = "_one";
+ } else {
+ parCode = "_two";
+ }
+ }
+
+ protected final void finalize() throws Throwable {
+ if (srxReader != null) {
+ srxReader.close();
+ }
+ super.finalize();
+ }
+
+}