#!/usr/bin/python # -*- coding: iso-8859-1 -*- # # LanguageTool -- A Rule-Based Style and Grammar Checker # Copyright (C) 2002,2003,2004 Daniel Naber # # This library is free software; you can redistribute it and/or # modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. # # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA import codecs import getopt import locale import os import re import socket import string import sys import time import xml.dom.minidom import profile sys.path.append(os.path.join(sys.path[0], "src")) import Entities import Tagger import Chunker import Rules import SentenceSplitter import Tools import ConfigParser class TextChecker: """A rule-based style and grammar checker.""" context = 15 # display this many character to the right and left for error context def __init__(self, grammar, falsefriends, words, \ builtin, textlanguage, mothertongue, max_sentence_length, debug_mode): # Which rules are activated (a list of IDs): self.grammar = grammar self.falsefriends = falsefriends self.words = words self.builtin = builtin self.textlanguage = textlanguage self.mothertongue = mothertongue self.max_sentence_length = max_sentence_length self.debug_mode = debug_mode config = ConfigParser.ConfigParser() config.readfp(open('TextChecker.ini')) Tagger.dicFile = config.get(textlanguage, 'dicFile'); Tagger.affFile = config.get(textlanguage, 'affFile'); if self.max_sentence_length == None: self.max_sentence_length = config.get(textlanguage, 'maxSentenceLength'); Rules.grammarFile = config.get(textlanguage, 'grammarFile'); self.tagger = Tagger.Tagger(textlanguage) self.chunker = Chunker.Chunker() rules = Chunker.Rules() self.chunker.setRules(rules) self.tagger.bindData() self.rules = Rules.Rules(self.max_sentence_length, self.grammar,\ self.words, self.builtin, self.falsefriends, \ textlanguage, mothertongue) self.bnc_paras = 0 self.bnc_sentences = 0 self.xml_output = 0 # default to non-XML output # anything but 'C' seems to be okay to make the sentence splitter work # for languages with special characters: locale.setlocale(locale.LC_CTYPE, 'en_US.iso-8859-1') return def setXMLOutput(self, xml_output): self.xml_output = xml_output return def setInputEncoding(self, input_encoding): self.input_encoding = input_encoding return def checkFile(self, filename): """Check a text file and return the results as an XML formatted list of possible errors.""" text = "" f = codecs.open(filename, "r", self.input_encoding) text = f.read() f.close() (rule_matches, result, tagged_words) = self.check(text) return (rule_matches, result, tagged_words) def check(self, text): """Check a text string and return the results as an XML formatted list of possible errors.""" splitter = SentenceSplitter.SentenceSplitter() sentences = splitter.split(text) rule_matches = [] char_counter = 0 all_tagged_words = [] line_counter = 1 column_counter = 0 prev_sentence = "" for sentence in sentences: #print "S='%s'" % (sentence) tagged_words = self.tagger.tagText(sentence) if self.debug_mode: print "Tw:", for tagged_word in tagged_words: if tagged_word[2]: print "%s/%s" % (tagged_word[0], tagged_word[2]), chunks = self.chunker.chunk(tagged_words) tagged_words.insert(0, ('', None, 'SENT_START')) tagged_words.append(('', None, 'SENT_END')) all_tagged_words.extend(tagged_words) if prev_sentence.endswith("\n") or sentence.startswith("\n"): column_counter = 0 for rule in self.rules.rules: matches = rule.match(tagged_words, chunks, char_counter, line_counter, column_counter) rule_matches.extend(matches) for triple in sentence: char_counter = char_counter + len(triple[0]) line_counter = line_counter + Tools.Tools.countLinebreaks(sentence) if Tools.Tools.countLinebreaks(sentence): column_counter = 0 column_counter = column_counter + len(sentence) prev_sentence = sentence if not self.builtin or "WHITESPACE" in self.builtin: whitespace_rule = Rules.WhitespaceRule() rule_matches.extend(whitespace_rule.match(all_tagged_words)) rule_match_list = [] for rule_match in rule_matches: if self.xml_output: rule_match_list.append(rule_match.toXML()) rule_match_list.append("\n") else: rule_match_list.append(rule_match.__str__()) from_pos = max(rule_match.from_pos-self.context, 0) to_pos = min(rule_match.to_pos+self.context, len(text)) summary = text[from_pos:to_pos] summary = re.compile("[\n\r]").sub(" ", summary) rule_match_list.append("\n\t...%s..." % summary) rule_match_list.append("\n") # TODO: use "^" to mark the *exact* position of the error: #rule_match_list.append("\n\t %s^\n" % (" " * (context-1))) result = string.join(rule_match_list, "") if self.xml_output: result = "\n%s" % result # TODO: optionally return tagged text? return (rule_matches, result, all_tagged_words) def checkBNCFiles(self, directory, checker): """Recursively load all files from a directory, extract all paragraphs and feed them to the style and grammar checker one by one.""" para_regex = re.compile("

(.*?)

", re.DOTALL) sentence_regex = re.compile("", re.DOTALL) xml_regex = re.compile("<.*?>", re.DOTALL) whitespace_regex = re.compile("\s+", re.DOTALL) files = [] filemode = 0 if os.path.isfile(directory): # call with a filename is okay files = [directory] filemode = 1 else: files = os.listdir(directory) for file in files: filename = None if filemode: filename = file else: filename = os.path.join(directory, file) if os.path.isdir(filename): self.checkBNCFiles(filename, checker) elif os.path.isfile(filename) and filename.find(".") != -1: print >> sys.stderr, "Ignoring %s" % filename elif os.path.isfile(filename): print >> sys.stderr, "FILE=%s" % filename f = open(filename, 'r') s = f.read() f.close() s = unicode(s, 'iso-8859-1') s_matches = sentence_regex.findall(s) self.bnc_sentences = self.bnc_sentences + len(s_matches) matches = para_regex.findall(s) for match in matches: self.bnc_paras = self.bnc_paras + 1 s = xml_regex.sub("", match) s = whitespace_regex.sub(" ", s) s = Entities.Entities.cleanEntities(s) s = s.strip() (rule_matches, result, tagged_words) = checker.check(s) if len(rule_matches) == 0: pass else: for rule_match in rule_matches: s_mark = "%s***%s" % (s[:rule_match.from_pos], s[rule_match.from_pos:]) print "%s:\n\n%s" % (rule_match.id, filename, s_mark.encode('utf8'), result.encode('utf8')) return def usage(): print "Usage: TextChecker.py [OPTION] " print " -h, --help Show this help" print " -l, --lang=... The text's language (de, en, or hu)" print " -g, --grammar=... Use only these grammar rules" print " -f, --falsefriends=... Use only these false friend rules" print " -w, --words=... Use only these style/word rules" print " -b, --builtin=... Use only these builtin rules (currently only WHITESPACE)" print " -m, --mothertongue=... Your native language, used with false friend checking" print " -s, --sentencelength=... Warn if a sentence is longer than this (default: never warn)" #print " -c, --check Check directory with BNC files in SGML format" print " -e, --encoding Input file's encoding/charset (e.g. latin1 or utf8)" print " -x, --xml Print out result as XML" print " -d, --debug Print out tagged words" return def main(): options = None rest = None try: (options, rest) = getopt.getopt(sys.argv[1:], 'hcxdg:f:w:b:m:l:s:e:', \ ['help', 'check', 'xml', 'debug', 'grammar=', 'falsefriends=', 'words=', \ 'builtin=', 'mothertongue=', 'lang=', 'sentencelength=', 'encoding=']) except getopt.GetoptError,e : print >> sys.stderr, "Error: ", e usage() sys.exit(2) # Define the variables with the default values: grammar = None falsefriends = None words = None builtin = None textlanguage = mothertongue = None max_sentence_length = None textlanguage = 'en' xml_output = 0 debug_mode = 0 input_encoding = 'latin1' for o, a in options: if o in ("-g", "--grammar"): grammar = a.split(",") elif o in ("-f", "--falsefriends"): falsefriends = a.split(",") elif o in ("-w", "--words"): words = a.split(",") elif o in ("-b", "--builtin"): builtin = a.split(",") elif o in ("-m", "--mothertongue"): mothertongue = a elif o in ("-l", "--lang"): textlanguage = a elif o in ("-s", "--sentencelength"): max_sentence_length = a elif o in ("-e", "--encoding"): input_encoding = a elif o in ("-x", "--xml"): xml_output = 1 elif o in ("-d", "--debug"): debug_mode = 1 for o, a in options: if o in ("-h", "--help"): usage() sys.exit(0) elif o in ("-c", "--check"): checker = TextChecker(grammar, falsefriends, words, \ builtin, textlanguage, mothertongue, max_sentence_length, debug_mode) for filename in rest: checker.checkBNCFiles(filename, checker) print >> sys.stderr, "Checked %d sentences in %d paragraphs." % \ (checker.bnc_sentences, checker.bnc_paras) sys.exit(0) if len(rest) == 1: filename = rest[0] if not xml_output: display_name = Tools.Tools.getLanguageName(textlanguage) if not display_name: print >> sys.stderr, "Unknown language code '%s'" % textlanguage print >> sys.stderr, "Supported languages are en, de, and hu" sys.exit(2) print "Checking '%s', file encoding %s, language %s:" % (filename, \ input_encoding, display_name) checker = TextChecker(grammar, falsefriends, words, builtin, \ textlanguage, mothertongue, max_sentence_length, debug_mode) checker.setXMLOutput(xml_output) checker.setInputEncoding(input_encoding) (rule_matches, result, tagged_words) = checker.checkFile(filename) if not result: print >> sys.stderr, "No errors found." else: print result.encode('latin1') else: usage() sys.exit(1) return if __name__ == "__main__": main() #profile.run('main()', 'prof')