diff options
author | Robin Houston <robin@lenny.robin> | 2011-06-20 23:34:44 +0100 |
---|---|---|
committer | Robin Houston <robin@lenny.robin> | 2011-06-20 23:34:44 +0100 |
commit | 770ceda38d780c2e8d3f0260a0f8912923d82b62 (patch) | |
tree | ea9e4972af2e933cd0bbd86d12e49b4aea06cc7b | |
parent | db1136c70e1a580efd9307dce2efe421aaa63762 (diff) |
When external converters are used to extract text from attachments
for Xapian, suppress the stderr output of these external programs
unless they actually fail.
It is possible this will not significantly reduce the noise from
converters, because they may actually have been failing. At least
with this change we’ll be able to tell which it is.
Closes #52.
-rw-r--r-- | app/models/incoming_message.rb | 62 | ||||
-rw-r--r-- | lib/external_command.rb | 131 |
2 files changed, 170 insertions, 23 deletions
diff --git a/app/models/incoming_message.rb b/app/models/incoming_message.rb index c808dc6a1..ca5676c67 100644 --- a/app/models/incoming_message.rb +++ b/app/models/incoming_message.rb @@ -28,6 +28,7 @@ # Move some of the (e.g. quoting) functions here into rblib, as they feel # general not specific to IncomingMessage. +require 'external_command' require 'htmlentities' require 'rexml/document' require 'zip/zip' @@ -163,6 +164,33 @@ def normalise_content_type(content_type) return content_type end +def external_command(program_name, *args) + # Run an external program, and return its output. + # Standard error is suppressed unless the program + # fails (i.e. returns a non-zero exit status). + opts = {} + if !args.empty? && args[-1].is_a?(Hash) + opts = args.pop + end + + xc = ExternalCommand.new(program_name, *args) + if opts.has_key? :append_to + xc.out = opts[:append_to] + end + xc.run() + if xc.status != 0 + # Error + $stderr.print(xc.err) + return nil + else + if opts.has_key? :append_to + opts[:append_to] << "\n\n" + else + return xc.out + end + end +end + # List of DSN codes taken from RFC 3463 # http://tools.ietf.org/html/rfc3463 $dsn_to_message = { @@ -1241,51 +1269,39 @@ class IncomingMessage < ActiveRecord::Base system("/usr/bin/wvText " + tempfile.path + " " + tempfile.path + ".txt") # Try catdoc if we get into trouble (e.g. for InfoRequestEvent 2701) if not File.exists?(tempfile.path + ".txt") - IO.popen("/usr/bin/catdoc " + tempfile.path, "r") do |child| - text += child.read() + "\n\n" - end + external_command("/usr/bin/catdoc", tempfile.path, :append_to => text) else text += File.read(tempfile.path + ".txt") + "\n\n" File.unlink(tempfile.path + ".txt") end elsif content_type == 'application/rtf' # catdoc on RTF prodcues less comments and extra bumf than --text option to unrtf - IO.popen("/usr/bin/catdoc " + tempfile.path, "r") do |child| - text += child.read() + "\n\n" - end + external_command("/usr/bin/catdoc", tempfile.path, :append_to => text) elsif content_type == 'text/html' # lynx wordwraps links in its output, which then don't get formatted properly # by WhatDoTheyKnow. We use elinks instead, which doesn't do that. - IO.popen("/usr/bin/elinks -dump-charset utf-8 -force-html -dump " + tempfile.path, "r") do |child| - text += child.read() + "\n\n" - end + external_command("/usr/bin/elinks", "-dump-charset", "utf-8", "-force-html", "-dump", + tempfile.path, :append_to => text) elsif content_type == 'application/vnd.ms-excel' # Bit crazy using /usr/bin/strings - but xls2csv, xlhtml and # py_xls2txt only extract text from cells, not from floating # notes. catdoc may be fooled by weird character sets, but will # probably do for UK FOI requests. - IO.popen("/usr/bin/strings " + tempfile.path, "r") do |child| - text += child.read() + "\n\n" - end + external_command("/usr/bin/strings", tempfile.path, :append_to => text) elsif content_type == 'application/vnd.ms-powerpoint' # ppthtml seems to catch more text, but only outputs HTML when # we want text, so just use catppt for now - IO.popen("/usr/bin/catppt " + tempfile.path, "r") do |child| - text += child.read() + "\n\n" - end + external_command("/usr/bin/catppt", tempfile.path, :append_to => text) elsif content_type == 'application/pdf' - IO.popen("/usr/bin/pdftotext " + tempfile.path + " -", "r") do |child| - text += child.read() + "\n\n" - end + external_command("/usr/bin/pdftotext", tempfile.path, "-", :append_to => text) elsif content_type == 'application/vnd.openxmlformats-officedocument.wordprocessingml.document' # This is Microsoft's XML office document format. # Just pull out the main XML file, and strip it of text. - xml = '' - IO.popen("/usr/bin/unzip -qq -c " + tempfile.path + " word/document.xml", "r") do |child| - xml += child.read() + "\n\n" + xml = external_command("/usr/bin/unzip", "-qq", "-c", tempfile.path, "word/document.xml") + if !xml.nil? + doc = REXML::Document.new(xml) + text += doc.each_element( './/text()' ){}.join(" ") end - doc = REXML::Document.new(xml) - text += doc.each_element( './/text()' ){}.join(" ") elsif content_type == 'application/zip' # recurse into zip files zip_file = Zip::ZipFile.open(tempfile.path) diff --git a/lib/external_command.rb b/lib/external_command.rb new file mode 100644 index 000000000..de8bfc11e --- /dev/null +++ b/lib/external_command.rb @@ -0,0 +1,131 @@ +# Run an external command, capturing its stdout and stderr +# streams into variables. +# +# So it’s rather like the `backtick` built-in, except that: +# - The command is run as-is, rather than being parsed by the shell; +# - Standard error is also captured. +# +# After the run() method has been called, the instance variables +# out, err and status contain the contents of the process’s stdout, +# the contents of its stderr, and the exit status. +# +# Example usage: +# require 'external_command' +# xc = ExternalCommand("ls", "-l").run() +# puts "Ran ls -l with exit status #{xc.status}" +# puts "===STDOUT===\n#{xc.out}" +# puts "===STDERR===\n#{xc.err}" +# +# The out and err attributes are writeable. If you assign +# a string, after calling the constructor and before calling +# run(), then the subprocess output/error will be appended +# to this string. + +# <rant author="robin"> +# In any sane language, this would be implemented with a +# single child process. The parent process would block on +# select(), and when the child process terminated, the +# select call would be interrupted by a CHLD signal +# and return EINTR. Unfortunately Ruby goes out of its +# way to prevent this from working, automatically restarting +# the select call if EINTR is returned. Therefore we +# use a parent-child-grandchild arrangement, where the +# parent blocks on select() and the child blocks on +# waitpid(). When the child detects that the grandchild +# has finished, it writes to a pipe that’s included in +# the parent’s select() for this purpose. +# </rant> + +class ExternalCommand + attr_accessor :out, :err + attr_reader :status + + def initialize(cmd, *args) + @cmd = cmd + @args = args + + # Strings to collect stdout and stderr from the child process + # These may be replaced by the caller, to append to existing strings. + @out = "" + @err = "" + @fin = "" + end + + def run() + # Pipes for parent-child communication + @out_read, @out_write = IO::pipe + @err_read, @err_write = IO::pipe + @fin_read, @fin_write = IO::pipe + + @pid = fork do + # Here we’re in the child process. + child_process + end + + # Here we’re in the parent process. + parent_process + + return self + end + + private + + def child_process() + # Reopen stdout and stderr to point at the pipes + STDOUT.reopen(@out_write) + STDERR.reopen(@err_write) + + # Close all the filehandles other than the ones we intend to use. + ObjectSpace.each_object(IO) do |fh| + fh.close unless ( + [STDOUT, STDERR, @fin_write].include?(fh) || fh.closed?) + end + + Process::waitpid(fork { grandchild_process }) + @fin_write.puts($?.exitstatus.to_s) + + exit! 0 + end + + def grandchild_process() + exec(@cmd, *@args) + + # This is only reached if the exec fails + @err_write.print("Failed to exec: #{[@cmd, *@args].join(' ')}") + exit! 99 + end + + def parent_process() + # Close the writing ends of the pipes + @out_write.close + @err_write.close + @fin_write.close + + @fhs = {@out_read => @out, @err_read => @err, @fin_read => @fin} + + while @fin.empty? + r = read_data + if r.nil? + raise "select() timed out even with a nil (infinite) timeout" + end + end + + Process::waitpid(@pid) + @status = @fin.to_i + @out_read.close + @err_read.close + end + + def read_data() + ready_array = IO.select(@fhs.keys, [], [], nil) + return false if ready_array.nil? + ready_array[0].each do |fh| + begin + @fhs[fh] << fh.readpartial(8192) + rescue EOFError + @fhs.delete fh + end + end + return true + end +end |