diff options
-rw-r--r-- | app/models/incoming_message.rb | 62 | ||||
-rw-r--r-- | lib/external_command.rb | 131 |
2 files changed, 170 insertions, 23 deletions
diff --git a/app/models/incoming_message.rb b/app/models/incoming_message.rb index c808dc6a1..ca5676c67 100644 --- a/app/models/incoming_message.rb +++ b/app/models/incoming_message.rb @@ -28,6 +28,7 @@ # Move some of the (e.g. quoting) functions here into rblib, as they feel # general not specific to IncomingMessage. +require 'external_command' require 'htmlentities' require 'rexml/document' require 'zip/zip' @@ -163,6 +164,33 @@ def normalise_content_type(content_type) return content_type end +def external_command(program_name, *args) + # Run an external program, and return its output. + # Standard error is suppressed unless the program + # fails (i.e. returns a non-zero exit status). + opts = {} + if !args.empty? && args[-1].is_a?(Hash) + opts = args.pop + end + + xc = ExternalCommand.new(program_name, *args) + if opts.has_key? :append_to + xc.out = opts[:append_to] + end + xc.run() + if xc.status != 0 + # Error + $stderr.print(xc.err) + return nil + else + if opts.has_key? :append_to + opts[:append_to] << "\n\n" + else + return xc.out + end + end +end + # List of DSN codes taken from RFC 3463 # http://tools.ietf.org/html/rfc3463 $dsn_to_message = { @@ -1241,51 +1269,39 @@ class IncomingMessage < ActiveRecord::Base system("/usr/bin/wvText " + tempfile.path + " " + tempfile.path + ".txt") # Try catdoc if we get into trouble (e.g. for InfoRequestEvent 2701) if not File.exists?(tempfile.path + ".txt") - IO.popen("/usr/bin/catdoc " + tempfile.path, "r") do |child| - text += child.read() + "\n\n" - end + external_command("/usr/bin/catdoc", tempfile.path, :append_to => text) else text += File.read(tempfile.path + ".txt") + "\n\n" File.unlink(tempfile.path + ".txt") end elsif content_type == 'application/rtf' # catdoc on RTF prodcues less comments and extra bumf than --text option to unrtf - IO.popen("/usr/bin/catdoc " + tempfile.path, "r") do |child| - text += child.read() + "\n\n" - end + external_command("/usr/bin/catdoc", tempfile.path, :append_to => text) elsif content_type == 'text/html' # lynx wordwraps links in its output, which then don't get formatted properly # by WhatDoTheyKnow. We use elinks instead, which doesn't do that. - IO.popen("/usr/bin/elinks -dump-charset utf-8 -force-html -dump " + tempfile.path, "r") do |child| - text += child.read() + "\n\n" - end + external_command("/usr/bin/elinks", "-dump-charset", "utf-8", "-force-html", "-dump", + tempfile.path, :append_to => text) elsif content_type == 'application/vnd.ms-excel' # Bit crazy using /usr/bin/strings - but xls2csv, xlhtml and # py_xls2txt only extract text from cells, not from floating # notes. catdoc may be fooled by weird character sets, but will # probably do for UK FOI requests. - IO.popen("/usr/bin/strings " + tempfile.path, "r") do |child| - text += child.read() + "\n\n" - end + external_command("/usr/bin/strings", tempfile.path, :append_to => text) elsif content_type == 'application/vnd.ms-powerpoint' # ppthtml seems to catch more text, but only outputs HTML when # we want text, so just use catppt for now - IO.popen("/usr/bin/catppt " + tempfile.path, "r") do |child| - text += child.read() + "\n\n" - end + external_command("/usr/bin/catppt", tempfile.path, :append_to => text) elsif content_type == 'application/pdf' - IO.popen("/usr/bin/pdftotext " + tempfile.path + " -", "r") do |child| - text += child.read() + "\n\n" - end + external_command("/usr/bin/pdftotext", tempfile.path, "-", :append_to => text) elsif content_type == 'application/vnd.openxmlformats-officedocument.wordprocessingml.document' # This is Microsoft's XML office document format. # Just pull out the main XML file, and strip it of text. - xml = '' - IO.popen("/usr/bin/unzip -qq -c " + tempfile.path + " word/document.xml", "r") do |child| - xml += child.read() + "\n\n" + xml = external_command("/usr/bin/unzip", "-qq", "-c", tempfile.path, "word/document.xml") + if !xml.nil? + doc = REXML::Document.new(xml) + text += doc.each_element( './/text()' ){}.join(" ") end - doc = REXML::Document.new(xml) - text += doc.each_element( './/text()' ){}.join(" ") elsif content_type == 'application/zip' # recurse into zip files zip_file = Zip::ZipFile.open(tempfile.path) diff --git a/lib/external_command.rb b/lib/external_command.rb new file mode 100644 index 000000000..de8bfc11e --- /dev/null +++ b/lib/external_command.rb @@ -0,0 +1,131 @@ +# Run an external command, capturing its stdout and stderr +# streams into variables. +# +# So it’s rather like the `backtick` built-in, except that: +# - The command is run as-is, rather than being parsed by the shell; +# - Standard error is also captured. +# +# After the run() method has been called, the instance variables +# out, err and status contain the contents of the process’s stdout, +# the contents of its stderr, and the exit status. +# +# Example usage: +# require 'external_command' +# xc = ExternalCommand("ls", "-l").run() +# puts "Ran ls -l with exit status #{xc.status}" +# puts "===STDOUT===\n#{xc.out}" +# puts "===STDERR===\n#{xc.err}" +# +# The out and err attributes are writeable. If you assign +# a string, after calling the constructor and before calling +# run(), then the subprocess output/error will be appended +# to this string. + +# <rant author="robin"> +# In any sane language, this would be implemented with a +# single child process. The parent process would block on +# select(), and when the child process terminated, the +# select call would be interrupted by a CHLD signal +# and return EINTR. Unfortunately Ruby goes out of its +# way to prevent this from working, automatically restarting +# the select call if EINTR is returned. Therefore we +# use a parent-child-grandchild arrangement, where the +# parent blocks on select() and the child blocks on +# waitpid(). When the child detects that the grandchild +# has finished, it writes to a pipe that’s included in +# the parent’s select() for this purpose. +# </rant> + +class ExternalCommand + attr_accessor :out, :err + attr_reader :status + + def initialize(cmd, *args) + @cmd = cmd + @args = args + + # Strings to collect stdout and stderr from the child process + # These may be replaced by the caller, to append to existing strings. + @out = "" + @err = "" + @fin = "" + end + + def run() + # Pipes for parent-child communication + @out_read, @out_write = IO::pipe + @err_read, @err_write = IO::pipe + @fin_read, @fin_write = IO::pipe + + @pid = fork do + # Here we’re in the child process. + child_process + end + + # Here we’re in the parent process. + parent_process + + return self + end + + private + + def child_process() + # Reopen stdout and stderr to point at the pipes + STDOUT.reopen(@out_write) + STDERR.reopen(@err_write) + + # Close all the filehandles other than the ones we intend to use. + ObjectSpace.each_object(IO) do |fh| + fh.close unless ( + [STDOUT, STDERR, @fin_write].include?(fh) || fh.closed?) + end + + Process::waitpid(fork { grandchild_process }) + @fin_write.puts($?.exitstatus.to_s) + + exit! 0 + end + + def grandchild_process() + exec(@cmd, *@args) + + # This is only reached if the exec fails + @err_write.print("Failed to exec: #{[@cmd, *@args].join(' ')}") + exit! 99 + end + + def parent_process() + # Close the writing ends of the pipes + @out_write.close + @err_write.close + @fin_write.close + + @fhs = {@out_read => @out, @err_read => @err, @fin_read => @fin} + + while @fin.empty? + r = read_data + if r.nil? + raise "select() timed out even with a nil (infinite) timeout" + end + end + + Process::waitpid(@pid) + @status = @fin.to_i + @out_read.close + @err_read.close + end + + def read_data() + ready_array = IO.select(@fhs.keys, [], [], nil) + return false if ready_array.nil? + ready_array[0].each do |fh| + begin + @fhs[fh] << fh.readpartial(8192) + rescue EOFError + @fhs.delete fh + end + end + return true + end +end |