diff options
Diffstat (limited to 'JLanguageTool/src/dev/de/danielnaber/languagetool/dev/Indexer.java')
-rw-r--r-- | JLanguageTool/src/dev/de/danielnaber/languagetool/dev/Indexer.java | 100 |
1 files changed, 100 insertions, 0 deletions
diff --git a/JLanguageTool/src/dev/de/danielnaber/languagetool/dev/Indexer.java b/JLanguageTool/src/dev/de/danielnaber/languagetool/dev/Indexer.java new file mode 100644 index 0000000..fc3392b --- /dev/null +++ b/JLanguageTool/src/dev/de/danielnaber/languagetool/dev/Indexer.java @@ -0,0 +1,100 @@ +/* LanguageTool, a natural language style checker + * Copyright (C) 2006 Daniel Naber (http://www.danielnaber.de) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ +package de.danielnaber.languagetool.dev; + +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.index.IndexWriter; + +import de.danielnaber.languagetool.Language; +import de.danielnaber.languagetool.tools.StringTools; + +/** + * POS tag and index text files using Lucene. Required for ContextFinder.java. + * TODO: hard-coded to index a specific kind of XML. + * + * @author Daniel Naber + */ +public class Indexer { + + static final String BODY_FIELD = "body"; + + private Indexer() {} + + public static void main(String[] args) throws IOException { + final Indexer prg = new Indexer(); + if (args.length != 2) { + System.err.println("Usage: Indexer <dataDir> <indexDir>"); + System.exit(1); + } + // FIXME: make this an option: + final Language lang = Language.GERMAN; + prg.run(args[0], args[1], lang); + } + + private void run(String dataDir, String indexDir, Language lang) throws IOException { + final IndexWriter iw = new IndexWriter(indexDir, new POSTagAnalyzer(lang.getTagger()), true); + iw.setMaxBufferedDocs(100); + index(iw, new File(dataDir), 1); + System.out.println("Optimizing index..."); + iw.optimize(); + iw.close(); + System.out.println("Done."); + } + + private void index(IndexWriter iw, File dir, int count) throws IOException { + if (dir.isDirectory()) { + final File[] files = dir.listFiles(); + for (File file : files) { + index(iw, file, ++count); + } + } else { + final Document doc = new Document(); + if (count % 50 == 0) + System.out.println("Indexing file #" + count); + String s = StringTools.readFile(new FileInputStream(dir.getAbsolutePath()), "iso-8859-1"); + // XML data: + s = getParagraphs(s); + //s = s.replaceAll("(\\w)([.,?!])", "$1 $2"); + //s = s.replaceAll("<.*?>", ""); + //System.err.println(">"+s); + doc.add(new Field(BODY_FIELD, s, Field.Store.YES, Field.Index.TOKENIZED)); + iw.addDocument(doc); + } + } + + private String getParagraphs(String xml) { + final StringBuilder sb = new StringBuilder(); + final Pattern pattern = Pattern.compile("<p>(.*?)</p>", Pattern.DOTALL); + final Matcher matcher = pattern.matcher(xml); + int pos = 0; + while (matcher.find(pos)) { + sb.append(matcher.group(1)); + pos = matcher.end(); + } + return sb.toString(); + } + +} |