aboutsummaryrefslogtreecommitdiffstats
path: root/lib/mail_handler/mail_handler.rb
diff options
context:
space:
mode:
Diffstat (limited to 'lib/mail_handler/mail_handler.rb')
-rw-r--r--lib/mail_handler/mail_handler.rb179
1 files changed, 179 insertions, 0 deletions
diff --git a/lib/mail_handler/mail_handler.rb b/lib/mail_handler/mail_handler.rb
new file mode 100644
index 000000000..8b227b9ca
--- /dev/null
+++ b/lib/mail_handler/mail_handler.rb
@@ -0,0 +1,179 @@
+# Handles the parsing of email
+require 'tmpdir'
+
+module MailHandler
+
+ if RUBY_VERSION.to_f >= 1.9
+ require 'mail'
+ require 'backends/mail_extensions'
+ require 'backends/mail_backend'
+ include Backends::MailBackend
+ else
+ require 'action_mailer'
+ require 'backends/tmail_extensions'
+ require 'backends/tmail_backend'
+ include Backends::TmailBackend
+ end
+
+ # Returns a set of attachments from the given TNEF contents
+ # The TNEF contents also contains the message body, but in general this is the
+ # same as the message body in the message proper.
+ def tnef_attachments(content)
+ attachments = []
+ Dir.mktmpdir do |dir|
+ IO.popen("#{`which tnef`.chomp} -K -C #{dir}", "wb") do |f|
+ f.write(content)
+ f.close
+ if $?.signaled?
+ raise IOError, "tnef exited with signal #{$?.termsig}"
+ end
+ if $?.exited? && $?.exitstatus != 0
+ raise IOError, "tnef exited with status #{$?.exitstatus}"
+ end
+ end
+ found = 0
+ Dir.new(dir).sort.each do |file| # sort for deterministic behaviour
+ if file != "." && file != ".."
+ file_content = File.open("#{dir}/#{file}", "rb").read
+ attachments << { :content => file_content,
+ :filename => file }
+ found += 1
+ end
+ end
+ if found == 0
+ raise IOError, "tnef produced no attachments"
+ end
+ end
+ attachments
+ end
+
+ def normalise_content_type(content_type)
+ # e.g. http://www.whatdotheyknow.com/request/93/response/250
+ if content_type == 'application/excel' or content_type == 'application/msexcel' or content_type == 'application/x-ms-excel'
+ content_type = 'application/vnd.ms-excel'
+ end
+ if content_type == 'application/mspowerpoint' or content_type == 'application/x-ms-powerpoint'
+ content_type = 'application/vnd.ms-powerpoint'
+ end
+ if content_type == 'application/msword' or content_type == 'application/x-ms-word'
+ content_type = 'application/vnd.ms-word'
+ end
+ if content_type == 'application/x-zip-compressed'
+ content_type = 'application/zip'
+ end
+
+ # e.g. http://www.whatdotheyknow.com/request/copy_of_current_swessex_scr_opt#incoming-9928
+ if content_type == 'application/acrobat'
+ content_type = 'application/pdf'
+ end
+
+ return content_type
+ end
+
+ def get_attachment_text_one_file(content_type, body, charset = 'utf-8')
+ # note re. charset: TMail always tries to convert email bodies
+ # to UTF8 by default, so normally it should already be that.
+ text = ''
+ # XXX - tell all these command line tools to return utf-8
+ if content_type == 'text/plain'
+ text += body + "\n\n"
+ else
+ tempfile = Tempfile.new('foiextract')
+ tempfile.binmode
+ tempfile.print body
+ tempfile.flush
+ default_params = { :append_to => text, :binary_output => false }
+ if content_type == 'application/vnd.ms-word'
+ AlaveteliExternalCommand.run("wvText", tempfile.path, tempfile.path + ".txt")
+ # Try catdoc if we get into trouble (e.g. for InfoRequestEvent 2701)
+ if not File.exists?(tempfile.path + ".txt")
+ AlaveteliExternalCommand.run("catdoc", tempfile.path, default_params)
+ else
+ text += File.read(tempfile.path + ".txt") + "\n\n"
+ File.unlink(tempfile.path + ".txt")
+ end
+ elsif content_type == 'application/rtf'
+ # catdoc on RTF prodcues less comments and extra bumf than --text option to unrtf
+ AlaveteliExternalCommand.run("catdoc", tempfile.path, default_params)
+ elsif content_type == 'text/html'
+ # lynx wordwraps links in its output, which then don't
+ # get formatted properly by Alaveteli. We use elinks
+ # instead, which doesn't do that.
+ AlaveteliExternalCommand.run("elinks", "-eval", "set document.codepage.assume = \"#{charset}\"",
+ "-eval", "set document.codepage.force_assumed = 1",
+ "-dump-charset", "utf-8",
+ "-force-html", "-dump",
+ tempfile.path,
+ default_params.merge(:env => {"LANG" => "C"}))
+ elsif content_type == 'application/vnd.ms-excel'
+ # Bit crazy using /usr/bin/strings - but xls2csv, xlhtml and
+ # py_xls2txt only extract text from cells, not from floating
+ # notes. catdoc may be fooled by weird character sets, but will
+ # probably do for UK FOI requests.
+ AlaveteliExternalCommand.run("/usr/bin/strings", tempfile.path, default_params)
+ elsif content_type == 'application/vnd.ms-powerpoint'
+ # ppthtml seems to catch more text, but only outputs HTML when
+ # we want text, so just use catppt for now
+ AlaveteliExternalCommand.run("catppt", tempfile.path, default_params)
+ elsif content_type == 'application/pdf'
+ AlaveteliExternalCommand.run("pdftotext", tempfile.path, "-", default_params)
+ elsif content_type == 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
+ # This is Microsoft's XML office document format.
+ # Just pull out the main XML file, and strip it of text.
+ xml = AlaveteliExternalCommand.run("/usr/bin/unzip", "-qq",
+ "-c",
+ tempfile.path,
+ "word/document.xml",
+ {:binary_output => false})
+ if !xml.nil?
+ doc = REXML::Document.new(xml)
+ text += doc.each_element( './/text()' ){}.join(" ")
+ end
+ elsif content_type == 'application/zip'
+ # recurse into zip files
+ begin
+ zip_file = Zip::ZipFile.open(tempfile.path)
+ text += get_attachment_text_from_zip_file(zip_file)
+ zip_file.close()
+ rescue
+ $stderr.puts("Error processing zip file: #{$!.inspect}")
+ end
+ end
+ tempfile.close
+ end
+
+ return text
+ end
+ def get_attachment_text_from_zip_file(zip_file)
+
+ text = ""
+ for entry in zip_file
+ if entry.file?
+ filename = entry.to_s
+ begin
+ body = entry.get_input_stream.read
+ rescue
+ # move to next attachment silently if there were problems
+ # XXX really should reduce this to specific exceptions?
+ # e.g. password protected
+ next
+ end
+ calc_mime = AlaveteliFileTypes.filename_to_mimetype(filename)
+ if calc_mime
+ content_type = calc_mime
+ else
+ content_type = 'application/octet-stream'
+ end
+
+ text += get_attachment_text_one_file(content_type, body)
+
+ end
+ end
+ return text
+ end
+
+ # Turn instance methods into class methods
+ extend self
+
+end
+