diff options
author | Robin Houston <robin@lenny.robin> | 2011-06-20 23:34:44 +0100 |
---|---|---|
committer | Robin Houston <robin@lenny.robin> | 2011-06-20 23:34:44 +0100 |
commit | 770ceda38d780c2e8d3f0260a0f8912923d82b62 (patch) | |
tree | ea9e4972af2e933cd0bbd86d12e49b4aea06cc7b /app/models/incoming_message.rb | |
parent | db1136c70e1a580efd9307dce2efe421aaa63762 (diff) |
When external converters are used to extract text from attachments
for Xapian, suppress the stderr output of these external programs
unless they actually fail.
It is possible this will not significantly reduce the noise from
converters, because they may actually have been failing. At least
with this change we’ll be able to tell which it is.
Closes #52.
Diffstat (limited to 'app/models/incoming_message.rb')
-rw-r--r-- | app/models/incoming_message.rb | 62 |
1 file changed, 39 insertions, 23 deletions
```diff
diff --git a/app/models/incoming_message.rb b/app/models/incoming_message.rb
index c808dc6a1..ca5676c67 100644
--- a/app/models/incoming_message.rb
+++ b/app/models/incoming_message.rb
@@ -28,6 +28,7 @@
 # Move some of the (e.g. quoting) functions here into rblib, as they feel
 # general not specific to IncomingMessage.
 
+require 'external_command'
 require 'htmlentities'
 require 'rexml/document'
 require 'zip/zip'
@@ -163,6 +164,33 @@ def normalise_content_type(content_type)
     return content_type
 end
 
+def external_command(program_name, *args)
+    # Run an external program, and return its output.
+    # Standard error is suppressed unless the program
+    # fails (i.e. returns a non-zero exit status).
+    opts = {}
+    if !args.empty? && args[-1].is_a?(Hash)
+        opts = args.pop
+    end
+
+    xc = ExternalCommand.new(program_name, *args)
+    if opts.has_key? :append_to
+        xc.out = opts[:append_to]
+    end
+    xc.run()
+    if xc.status != 0
+        # Error
+        $stderr.print(xc.err)
+        return nil
+    else
+        if opts.has_key? :append_to
+            opts[:append_to] << "\n\n"
+        else
+            return xc.out
+        end
+    end
+end
+
 # List of DSN codes taken from RFC 3463
 # http://tools.ietf.org/html/rfc3463
 $dsn_to_message = {
@@ -1241,51 +1269,39 @@ class IncomingMessage < ActiveRecord::Base
                 system("/usr/bin/wvText " + tempfile.path + " " + tempfile.path + ".txt")
                 # Try catdoc if we get into trouble (e.g. for InfoRequestEvent 2701)
                 if not File.exists?(tempfile.path + ".txt")
-                    IO.popen("/usr/bin/catdoc " + tempfile.path, "r") do |child|
-                        text += child.read() + "\n\n"
-                    end
+                    external_command("/usr/bin/catdoc", tempfile.path, :append_to => text)
                 else
                     text += File.read(tempfile.path + ".txt") + "\n\n"
                     File.unlink(tempfile.path + ".txt")
                 end
             elsif content_type == 'application/rtf'
                 # catdoc on RTF prodcues less comments and extra bumf than --text option to unrtf
-                IO.popen("/usr/bin/catdoc " + tempfile.path, "r") do |child|
-                    text += child.read() + "\n\n"
-                end
+                external_command("/usr/bin/catdoc", tempfile.path, :append_to => text)
             elsif content_type == 'text/html'
                 # lynx wordwraps links in its output, which then don't get formatted properly
                 # by WhatDoTheyKnow. We use elinks instead, which doesn't do that.
-                IO.popen("/usr/bin/elinks -dump-charset utf-8 -force-html -dump " + tempfile.path, "r") do |child|
-                    text += child.read() + "\n\n"
-                end
+                external_command("/usr/bin/elinks", "-dump-charset", "utf-8", "-force-html", "-dump",
+                    tempfile.path, :append_to => text)
             elsif content_type == 'application/vnd.ms-excel'
                 # Bit crazy using /usr/bin/strings - but xls2csv, xlhtml and
                 # py_xls2txt only extract text from cells, not from floating
                 # notes. catdoc may be fooled by weird character sets, but will
                 # probably do for UK FOI requests.
-                IO.popen("/usr/bin/strings " + tempfile.path, "r") do |child|
-                    text += child.read() + "\n\n"
-                end
+                external_command("/usr/bin/strings", tempfile.path, :append_to => text)
             elsif content_type == 'application/vnd.ms-powerpoint'
                 # ppthtml seems to catch more text, but only outputs HTML when
                 # we want text, so just use catppt for now
-                IO.popen("/usr/bin/catppt " + tempfile.path, "r") do |child|
-                    text += child.read() + "\n\n"
-                end
+                external_command("/usr/bin/catppt", tempfile.path, :append_to => text)
             elsif content_type == 'application/pdf'
-                IO.popen("/usr/bin/pdftotext " + tempfile.path + " -", "r") do |child|
-                    text += child.read() + "\n\n"
-                end
+                external_command("/usr/bin/pdftotext", tempfile.path, "-", :append_to => text)
             elsif content_type == 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
                 # This is Microsoft's XML office document format.
                 # Just pull out the main XML file, and strip it of text.
-                xml = ''
-                IO.popen("/usr/bin/unzip -qq -c " + tempfile.path + " word/document.xml", "r") do |child|
-                    xml += child.read() + "\n\n"
+                xml = external_command("/usr/bin/unzip", "-qq", "-c", tempfile.path, "word/document.xml")
+                if !xml.nil?
+                    doc = REXML::Document.new(xml)
+                    text += doc.each_element( './/text()' ){}.join(" ")
                 end
-                doc = REXML::Document.new(xml)
-                text += doc.each_element( './/text()' ){}.join(" ")
             elsif content_type == 'application/zip'
                 # recurse into zip files
                 zip_file = Zip::ZipFile.open(tempfile.path)
```