aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--app/models/incoming_message.rb62
-rw-r--r--lib/external_command.rb131
2 files changed, 170 insertions, 23 deletions
diff --git a/app/models/incoming_message.rb b/app/models/incoming_message.rb
index c808dc6a1..ca5676c67 100644
--- a/app/models/incoming_message.rb
+++ b/app/models/incoming_message.rb
@@ -28,6 +28,7 @@
# Move some of the (e.g. quoting) functions here into rblib, as they feel
# general not specific to IncomingMessage.
+require 'external_command'
require 'htmlentities'
require 'rexml/document'
require 'zip/zip'
@@ -163,6 +164,33 @@ def normalise_content_type(content_type)
return content_type
end
+def external_command(program_name, *args)
+ # Run an external program, and return its output.
+ # Standard error is suppressed unless the program
+ # fails (i.e. returns a non-zero exit status).
+ opts = {}
+ if !args.empty? && args[-1].is_a?(Hash)
+ opts = args.pop
+ end
+
+ xc = ExternalCommand.new(program_name, *args)
+ if opts.has_key? :append_to
+ xc.out = opts[:append_to]
+ end
+ xc.run()
+ if xc.status != 0
+ # Error
+ $stderr.print(xc.err)
+ return nil
+ else
+ if opts.has_key? :append_to
+ opts[:append_to] << "\n\n"
+ else
+ return xc.out
+ end
+ end
+end
+
# List of DSN codes taken from RFC 3463
# http://tools.ietf.org/html/rfc3463
$dsn_to_message = {
@@ -1241,51 +1269,39 @@ class IncomingMessage < ActiveRecord::Base
system("/usr/bin/wvText " + tempfile.path + " " + tempfile.path + ".txt")
# Try catdoc if we get into trouble (e.g. for InfoRequestEvent 2701)
if not File.exists?(tempfile.path + ".txt")
- IO.popen("/usr/bin/catdoc " + tempfile.path, "r") do |child|
- text += child.read() + "\n\n"
- end
+ external_command("/usr/bin/catdoc", tempfile.path, :append_to => text)
else
text += File.read(tempfile.path + ".txt") + "\n\n"
File.unlink(tempfile.path + ".txt")
end
elsif content_type == 'application/rtf'
# catdoc on RTF prodcues less comments and extra bumf than --text option to unrtf
- IO.popen("/usr/bin/catdoc " + tempfile.path, "r") do |child|
- text += child.read() + "\n\n"
- end
+ external_command("/usr/bin/catdoc", tempfile.path, :append_to => text)
elsif content_type == 'text/html'
# lynx wordwraps links in its output, which then don't get formatted properly
# by WhatDoTheyKnow. We use elinks instead, which doesn't do that.
- IO.popen("/usr/bin/elinks -dump-charset utf-8 -force-html -dump " + tempfile.path, "r") do |child|
- text += child.read() + "\n\n"
- end
+ external_command("/usr/bin/elinks", "-dump-charset", "utf-8", "-force-html", "-dump",
+ tempfile.path, :append_to => text)
elsif content_type == 'application/vnd.ms-excel'
# Bit crazy using /usr/bin/strings - but xls2csv, xlhtml and
# py_xls2txt only extract text from cells, not from floating
# notes. catdoc may be fooled by weird character sets, but will
# probably do for UK FOI requests.
- IO.popen("/usr/bin/strings " + tempfile.path, "r") do |child|
- text += child.read() + "\n\n"
- end
+ external_command("/usr/bin/strings", tempfile.path, :append_to => text)
elsif content_type == 'application/vnd.ms-powerpoint'
# ppthtml seems to catch more text, but only outputs HTML when
# we want text, so just use catppt for now
- IO.popen("/usr/bin/catppt " + tempfile.path, "r") do |child|
- text += child.read() + "\n\n"
- end
+ external_command("/usr/bin/catppt", tempfile.path, :append_to => text)
elsif content_type == 'application/pdf'
- IO.popen("/usr/bin/pdftotext " + tempfile.path + " -", "r") do |child|
- text += child.read() + "\n\n"
- end
+ external_command("/usr/bin/pdftotext", tempfile.path, "-", :append_to => text)
elsif content_type == 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
# This is Microsoft's XML office document format.
# Just pull out the main XML file, and strip it of text.
- xml = ''
- IO.popen("/usr/bin/unzip -qq -c " + tempfile.path + " word/document.xml", "r") do |child|
- xml += child.read() + "\n\n"
+ xml = external_command("/usr/bin/unzip", "-qq", "-c", tempfile.path, "word/document.xml")
+ if !xml.nil?
+ doc = REXML::Document.new(xml)
+ text += doc.each_element( './/text()' ){}.join(" ")
end
- doc = REXML::Document.new(xml)
- text += doc.each_element( './/text()' ){}.join(" ")
elsif content_type == 'application/zip'
# recurse into zip files
zip_file = Zip::ZipFile.open(tempfile.path)
diff --git a/lib/external_command.rb b/lib/external_command.rb
new file mode 100644
index 000000000..de8bfc11e
--- /dev/null
+++ b/lib/external_command.rb
@@ -0,0 +1,131 @@
+# Run an external command, capturing its stdout and stderr
+# streams into variables.
+#
+# So it’s rather like the `backtick` built-in, except that:
+# - The command is run as-is, rather than being parsed by the shell;
+# - Standard error is also captured.
+#
+# After the run() method has been called, the instance variables
+# out, err and status contain the contents of the process’s stdout,
+# the contents of its stderr, and the exit status.
+#
+# Example usage:
+# require 'external_command'
+# xc = ExternalCommand("ls", "-l").run()
+# puts "Ran ls -l with exit status #{xc.status}"
+# puts "===STDOUT===\n#{xc.out}"
+# puts "===STDERR===\n#{xc.err}"
+#
+# The out and err attributes are writeable. If you assign
+# a string, after calling the constructor and before calling
+# run(), then the subprocess output/error will be appended
+# to this string.
+
+# <rant author="robin">
+# In any sane language, this would be implemented with a
+# single child process. The parent process would block on
+# select(), and when the child process terminated, the
+# select call would be interrupted by a CHLD signal
+# and return EINTR. Unfortunately Ruby goes out of its
+# way to prevent this from working, automatically restarting
+# the select call if EINTR is returned. Therefore we
+# use a parent-child-grandchild arrangement, where the
+# parent blocks on select() and the child blocks on
+# waitpid(). When the child detects that the grandchild
+# has finished, it writes to a pipe that’s included in
+# the parent’s select() for this purpose.
+# </rant>
+
+class ExternalCommand
+ attr_accessor :out, :err
+ attr_reader :status
+
+ def initialize(cmd, *args)
+ @cmd = cmd
+ @args = args
+
+ # Strings to collect stdout and stderr from the child process
+ # These may be replaced by the caller, to append to existing strings.
+ @out = ""
+ @err = ""
+ @fin = ""
+ end
+
+ def run()
+ # Pipes for parent-child communication
+ @out_read, @out_write = IO::pipe
+ @err_read, @err_write = IO::pipe
+ @fin_read, @fin_write = IO::pipe
+
+ @pid = fork do
+ # Here we’re in the child process.
+ child_process
+ end
+
+ # Here we’re in the parent process.
+ parent_process
+
+ return self
+ end
+
+ private
+
+ def child_process()
+ # Reopen stdout and stderr to point at the pipes
+ STDOUT.reopen(@out_write)
+ STDERR.reopen(@err_write)
+
+ # Close all the filehandles other than the ones we intend to use.
+ ObjectSpace.each_object(IO) do |fh|
+ fh.close unless (
+ [STDOUT, STDERR, @fin_write].include?(fh) || fh.closed?)
+ end
+
+ Process::waitpid(fork { grandchild_process })
+ @fin_write.puts($?.exitstatus.to_s)
+
+ exit! 0
+ end
+
+ def grandchild_process()
+ exec(@cmd, *@args)
+
+ # This is only reached if the exec fails
+ @err_write.print("Failed to exec: #{[@cmd, *@args].join(' ')}")
+ exit! 99
+ end
+
+ def parent_process()
+ # Close the writing ends of the pipes
+ @out_write.close
+ @err_write.close
+ @fin_write.close
+
+ @fhs = {@out_read => @out, @err_read => @err, @fin_read => @fin}
+
+ while @fin.empty?
+ r = read_data
+ if r.nil?
+ raise "select() timed out even with a nil (infinite) timeout"
+ end
+ end
+
+ Process::waitpid(@pid)
+ @status = @fin.to_i
+ @out_read.close
+ @err_read.close
+ end
+
+ def read_data()
+ ready_array = IO.select(@fhs.keys, [], [], nil)
+ return false if ready_array.nil?
+ ready_array[0].each do |fh|
+ begin
+ @fhs[fh] << fh.readpartial(8192)
+ rescue EOFError
+ @fhs.delete fh
+ end
+ end
+ return true
+ end
+end