diff options
Diffstat (limited to 'JLanguageTool/src/java/de/danielnaber/languagetool/tokenizers/SentenceTokenizer.java')
-rw-r--r-- | JLanguageTool/src/java/de/danielnaber/languagetool/tokenizers/SentenceTokenizer.java | 250 |
1 files changed, 250 insertions, 0 deletions
diff --git a/JLanguageTool/src/java/de/danielnaber/languagetool/tokenizers/SentenceTokenizer.java b/JLanguageTool/src/java/de/danielnaber/languagetool/tokenizers/SentenceTokenizer.java new file mode 100644 index 0000000..55d1ec6 --- /dev/null +++ b/JLanguageTool/src/java/de/danielnaber/languagetool/tokenizers/SentenceTokenizer.java @@ -0,0 +1,250 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.tokenizers; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashSet; +import java.util.List; +import java.util.Set; +import java.util.StringTokenizer; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/** + * Tokenizes text into sentences by looking for typical end-of-sentence markers, + * but considering exceptions (e.g. abbreviations). + * + * @author Daniel Naber + */ +public class SentenceTokenizer implements Tokenizer { + + // end of sentence marker: + protected static final String EOS = "\0"; + //private final static String EOS = "#"; // for testing only + protected static final String P = "[\\.!?…]"; // PUNCTUATION + protected static final String AP = "(?:'|«|\"||\\)|\\]|\\})?"; // AFTER PUNCTUATION + protected static final String PAP = P + AP; + protected static final String PARENS = "[\\(\\)\\[\\]]"; // parentheses + + // Check out the private methods for comments and examples about these + // regular expressions: + + private Pattern paragraph; + private static final Pattern paragraphByTwoLineBreaks = Pattern.compile("([\\n\\r]\\s*[\\n\\r])"); + private static final Pattern paragraphByLineBreak = Pattern.compile("([\\n\\r])"); + + // add unbreakable field, for example footnote, if it's at the end of the sentence + private static final Pattern punctWhitespace = Pattern.compile("(" + PAP + "(\u0002)?\\s)"); + // \p{Lu} = uppercase, with obeying Unicode (\p{Upper} is just US-ASCII!): + private static final Pattern punctUpperLower = Pattern.compile("(" + PAP + + ")([\\p{Lu}][^\\p{Lu}.])"); + private static final Pattern letterPunct = Pattern.compile("(\\s[\\wüöäÜÖÄß]" + P + ")"); + private static final Pattern abbrev1 = Pattern.compile("([^-\\wüöäÜÖÄß][\\wüöäÜÖÄß]" + PAP + "\\s)" + EOS); + private static final Pattern abbrev2 = Pattern.compile("([^-\\wüöäÜÖÄß][\\wüöäÜÖÄß]" + P + ")" + EOS); + private static final Pattern abbrev3 = Pattern.compile("(\\s[\\wüöäÜÖÄß]\\.\\s+)" + EOS); + private static final Pattern abbrev4 = Pattern.compile("(\\.\\.\\. )" + EOS + "([\\p{Ll}])"); + private static final Pattern abbrev5 = Pattern.compile("(['\"]" + P + "['\"]\\s+)" + EOS); + private static final Pattern abbrev6 = Pattern.compile("([\"']\\s*)" + EOS + "(\\s*[\\p{Ll}])"); + private static final Pattern abbrev7 = Pattern.compile("(\\s" + PAP + "\\s)" + EOS); + // z.b. 3.10. (im Datum): + private static final Pattern abbrev8 = Pattern.compile("(\\d{1,2}\\.\\d{1,2}\\.\\s+)" + EOS); + private static final Pattern repair1 = Pattern.compile("('[\\wüöäÜÖÄß]" + P + ")(\\s)"); + private static final Pattern repair2 = Pattern.compile("(\\sno\\.)(\\s+)(?!\\d)"); + private static final Pattern repair3 = Pattern.compile("([ap]\\.m\\.\\s+)([\\p{Lu}])"); + + private static final Pattern repair10 = Pattern.compile("([\\(\\[])([!?]+)([\\]\\)]) " + EOS); + private static final Pattern repair11 = Pattern.compile("([!?]+)([\\)\\]]) " + EOS); + private static final Pattern repair12 = Pattern.compile("(" + PARENS + ") " + EOS); + + // some abbreviations: + private static final String[] ABBREV_LIST = { + // English -- but these work globally for all languages: + "Mr", "Mrs", "No", "pp", "St", "no", + "Sr", "Jr", "Bros", "etc", "vs", "esp", "Fig", "fig", "Jan", "Feb", "Mar", "Apr", "Jun", "Jul", + "Aug", "Sep", "Sept", "Oct", "Okt", "Nov", "Dec", "Ph.D", "PhD", + "al", // in "et al." + "cf", "Inc", "Ms", "Gen", "Sen", "Prof", "Corp", "Co" + }; + + private final Set<Pattern> abbreviationPatterns = new HashSet<Pattern>(); + + /** + * Month names like "Dezember" that should not be considered a sentence + * boundary in string like "13. Dezember". May also contain other + * words that indicate there's no sentence boundary when preceded + * by a number and a dot. + */ + protected String[] monthNames; + + /** + * Create a sentence tokenizer that uses the built-in abbreviations. + */ + public SentenceTokenizer() { + this(new String[]{}); + } + + /** + * Create a sentence tokenizer with the given list of abbreviations, + * additionally to the built-in ones. + */ + public SentenceTokenizer(final String[] abbrevList) { + final List<String> allAbbreviations = new ArrayList<String>(); + allAbbreviations.addAll(Arrays.asList(abbrevList)); + allAbbreviations.addAll(Arrays.asList(ABBREV_LIST)); + for (String element : allAbbreviations) { + final Pattern pattern = Pattern.compile("(\\b" + element + PAP + "\\s)" + EOS); + abbreviationPatterns.add(pattern); + } + setSingleLineBreaksMarksParagraph(false); + } + + /** + * @param lineBreakParagraphs if <code>true</code>, single lines breaks are assumed to end a paragraph, + * with <code>false</code>, only two ore more consecutive line breaks end a paragraph + */ + public void setSingleLineBreaksMarksParagraph(final boolean lineBreakParagraphs) { + if (lineBreakParagraphs) { + paragraph = paragraphByLineBreak; + } else { + paragraph = paragraphByTwoLineBreaks; + } + } + + public boolean singleLineBreaksMarksPara() { + return paragraph == paragraphByLineBreak; + } + + /** + * Tokenize the given string to sentences. + */ + public List<String> tokenize(String s) { + s = firstSentenceSplitting(s); + s = removeFalseEndOfSentence(s); + s = splitUnsplitStuff(s); + final StringTokenizer stringTokenizer = + new StringTokenizer(s, EOS); + final List<String> l = new ArrayList<String>(); + while (stringTokenizer.hasMoreTokens()) { + final String sentence = stringTokenizer.nextToken(); + l.add(sentence); + } + return l; + } + + /** + * Add a special break character at all places with typical sentence delimiters. + */ + private String firstSentenceSplitting(String s) { + // Double new-line means a new sentence: + s = paragraph.matcher(s).replaceAll("$1" + EOS); + // Punctuation followed by whitespace means a new sentence: + s = punctWhitespace.matcher(s).replaceAll("$1" + EOS); + // New (compared to the perl module): Punctuation followed by uppercase followed + // by non-uppercase character (except dot) means a new sentence: + s = punctUpperLower.matcher(s).replaceAll("$1" + EOS + "$2"); + // Break also when single letter comes before punctuation: + s = letterPunct.matcher(s).replaceAll("$1" + EOS); + return s; + } + + /** + * Repair some positions that don't require a split, i.e. remove the special break character at + * those positions. + */ + protected String removeFalseEndOfSentence(String s) { + // Don't split at e.g. "U. S. A.": + s = abbrev1.matcher(s).replaceAll("$1"); + // Don't split at e.g. "U.S.A.": + s = abbrev2.matcher(s).replaceAll("$1"); + // Don't split after a white-space followed by a single letter followed + // by a dot followed by another whitespace. + // e.g. " p. " + s = abbrev3.matcher(s).replaceAll("$1"); + // Don't split at "bla bla... yada yada" (TODO: use \.\.\.\s+ instead?) + s = abbrev4.matcher(s).replaceAll("$1$2"); + // Don't split [.?!] when the're quoted: + s = abbrev5.matcher(s).replaceAll("$1"); + + // Don't split at abbreviations: + for (final Pattern abbrevPattern : abbreviationPatterns) { + final Matcher matcher = abbrevPattern.matcher(s); + s = matcher.replaceAll("$1"); + } + // Don't break after quote unless there's a capital letter: + // e.g.: "That's right!" he said. + s = abbrev6.matcher(s).replaceAll("$1$2"); + + // fixme? not sure where this should occur, leaving it commented out: + // don't break: text . . some more text. + // text=~s/(\s\.\s)$EOS(\s*)/$1$2/sg; + + // e.g. "Das ist . so." -> assume one sentence + s = abbrev7.matcher(s).replaceAll("$1"); + + // e.g. "Das ist . so." -> assume one sentence + s = abbrev8.matcher(s).replaceAll("$1"); + + // extension by dnaber --commented out, doesn't help: + // text = re.compile("(:\s+)%s(\s*[%s])" % (self.EOS, string.lowercase), + // re.DOTALL).sub("\\1\\2", text) + + // "13. Dezember" etc. -> keine Satzgrenze: + if (monthNames != null) { + for (String element : monthNames) { + s = s.replaceAll("(\\d+\\.) " + EOS + "(" + element + ")", "$1 $2"); + } + } + + // z.B. "Das hier ist ein(!) Satz." + s = repair10.matcher(s).replaceAll("$1$2$3 "); + + // z.B. "Das hier ist (genau!) ein Satz." + s = repair11.matcher(s).replaceAll("$1$2 "); + + // z.B. "bla (...) blubb" -> kein Satzende + s = repair12.matcher(s).replaceAll("$1 "); + + return s; + } + + /** + * Treat some more special cases that make up a sentence boundary. Insert the special break + * character at these positions. + */ + private String splitUnsplitStuff(String s) { + // e.g. "x5. bla..." -- not sure, leaving commented out: + // text = re.compile("(\D\d+)(%s)(\s+)" % self.P, re.DOTALL).sub("\\1\\2%s\\3" % self.EOS, text) + // Not sure about this one, leaving out four now: + // text = re.compile("(%s\s)(\s*\()" % self.PAP, re.DOTALL).sub("\\1%s\\2" % self.EOS, text) + // Split e.g.: He won't. #Really. + s = repair1.matcher(s).replaceAll("$1" + EOS + "$2"); + // Split e.g.: He won't say no. Not really. + s = repair2.matcher(s).replaceAll("$1" + EOS + "$2"); + // Split at "a.m." or "p.m." followed by a capital letter. + s = repair3.matcher(s).replaceAll("$1" + EOS + "$2"); + return s; + } + + /*public static void main(final String[] args) { + final SentenceTokenizer st = new GermanSentenceTokenizer(); + st.tokenize("Er sagte (...) und"); + }*/ + +} |