From 8cae0445839920c64240e66478e43674c8713177 Mon Sep 17 00:00:00 2001 From: Louise Crow Date: Wed, 5 Dec 2012 15:37:08 +0000 Subject: Move content type normalization to mail handler. --- lib/mail_handler/mail_handler.rb | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'lib/mail_handler/mail_handler.rb') diff --git a/lib/mail_handler/mail_handler.rb b/lib/mail_handler/mail_handler.rb index 7b0f6e7f2..a49ec8e8c 100644 --- a/lib/mail_handler/mail_handler.rb +++ b/lib/mail_handler/mail_handler.rb @@ -46,6 +46,30 @@ module MailHandler attachments end + def normalise_content_type(content_type) + # e.g. http://www.whatdotheyknow.com/request/93/response/250 + if content_type == 'application/excel' or content_type == 'application/msexcel' or content_type == 'application/x-ms-excel' + content_type = 'application/vnd.ms-excel' + end + if content_type == 'application/mspowerpoint' or content_type == 'application/x-ms-powerpoint' + content_type = 'application/vnd.ms-powerpoint' + end + if content_type == 'application/msword' or content_type == 'application/x-ms-word' + content_type = 'application/vnd.ms-word' + end + if content_type == 'application/x-zip-compressed' + content_type = 'application/zip' + end + + # e.g. http://www.whatdotheyknow.com/request/copy_of_current_swessex_scr_opt#incoming-9928 + if content_type == 'application/acrobat' + content_type = 'application/pdf' + end + + return content_type + end + + # Turn instance methods into class methods extend self -- cgit v1.2.3 From 0b8c40a57aa6f4a1da2e83c276ca523499d517e4 Mon Sep 17 00:00:00 2001 From: Louise Crow Date: Wed, 5 Dec 2012 18:29:43 +0000 Subject: Read and write to the file in binary mode during a TNEF conversion operation. 
--- lib/mail_handler/mail_handler.rb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'lib/mail_handler/mail_handler.rb') diff --git a/lib/mail_handler/mail_handler.rb b/lib/mail_handler/mail_handler.rb index a49ec8e8c..2909d873b 100644 --- a/lib/mail_handler/mail_handler.rb +++ b/lib/mail_handler/mail_handler.rb @@ -20,7 +20,7 @@ module MailHandler def tnef_attachments(content) attachments = [] Dir.mktmpdir do |dir| - IO.popen("#{`which tnef`.chomp} -K -C #{dir}", "w") do |f| + IO.popen("#{`which tnef`.chomp} -K -C #{dir}", "wb") do |f| f.write(content) f.close if $?.signaled? @@ -33,7 +33,7 @@ module MailHandler found = 0 Dir.new(dir).sort.each do |file| # sort for deterministic behaviour if file != "." && file != ".." - file_content = File.open("#{dir}/#{file}", "r").read + file_content = File.open("#{dir}/#{file}", "rb").read attachments << { :content => file_content, :filename => file } found += 1 -- cgit v1.2.3 From 14bbd1d75840add4fd5f8c440b58ac465c306fb6 Mon Sep 17 00:00:00 2001 From: Louise Crow Date: Thu, 6 Dec 2012 10:23:37 +0000 Subject: Move methods for getting the text out of attachments to the mail handler module. --- lib/mail_handler/mail_handler.rb | 101 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 101 insertions(+) (limited to 'lib/mail_handler/mail_handler.rb') diff --git a/lib/mail_handler/mail_handler.rb b/lib/mail_handler/mail_handler.rb index 2909d873b..a74a9876a 100644 --- a/lib/mail_handler/mail_handler.rb +++ b/lib/mail_handler/mail_handler.rb @@ -69,6 +69,107 @@ module MailHandler return content_type end + def _get_attachment_text_internal_one_file(content_type, body, charset = 'utf-8') + # note re. charset: TMail always tries to convert email bodies + # to UTF8 by default, so normally it should already be that. 
+ text = '' + # XXX - tell all these command line tools to return utf-8 + if content_type == 'text/plain' + text += body + "\n\n" + else + tempfile = Tempfile.new('foiextract') + tempfile.binmode + tempfile.print body + tempfile.flush + default_params = { :append_to => text, :binary_output => false } + if content_type == 'application/vnd.ms-word' + AlaveteliExternalCommand.run("wvText", tempfile.path, tempfile.path + ".txt") + # Try catdoc if we get into trouble (e.g. for InfoRequestEvent 2701) + if not File.exists?(tempfile.path + ".txt") + AlaveteliExternalCommand.run("catdoc", tempfile.path, default_params) + else + text += File.read(tempfile.path + ".txt") + "\n\n" + File.unlink(tempfile.path + ".txt") + end + elsif content_type == 'application/rtf' + # catdoc on RTF produces less comments and extra bumf than --text option to unrtf + AlaveteliExternalCommand.run("catdoc", tempfile.path, default_params) + elsif content_type == 'text/html' + # lynx wordwraps links in its output, which then don't + # get formatted properly by Alaveteli. We use elinks + # instead, which doesn't do that. + AlaveteliExternalCommand.run("elinks", "-eval", "set document.codepage.assume = \"#{charset}\"", + "-eval", "set document.codepage.force_assumed = 1", + "-dump-charset", "utf-8", + "-force-html", "-dump", + tempfile.path, + default_params.merge(:env => {"LANG" => "C"})) + elsif content_type == 'application/vnd.ms-excel' + # Bit crazy using /usr/bin/strings - but xls2csv, xlhtml and + # py_xls2txt only extract text from cells, not from floating + # notes. catdoc may be fooled by weird character sets, but will + # probably do for UK FOI requests. 
+ AlaveteliExternalCommand.run("/usr/bin/strings", tempfile.path, default_params) + elsif content_type == 'application/vnd.ms-powerpoint' + # ppthtml seems to catch more text, but only outputs HTML when + # we want text, so just use catppt for now + AlaveteliExternalCommand.run("catppt", tempfile.path, default_params) + elsif content_type == 'application/pdf' + AlaveteliExternalCommand.run("pdftotext", tempfile.path, "-", default_params) + elsif content_type == 'application/vnd.openxmlformats-officedocument.wordprocessingml.document' + # This is Microsoft's XML office document format. + # Just pull out the main XML file, and strip it of text. + xml = AlaveteliExternalCommand.run("/usr/bin/unzip", "-qq", + "-c", + tempfile.path, + "word/document.xml", + {:binary_output => false}) + if !xml.nil? + doc = REXML::Document.new(xml) + text += doc.each_element( './/text()' ){}.join(" ") + end + elsif content_type == 'application/zip' + # recurse into zip files + begin + zip_file = Zip::ZipFile.open(tempfile.path) + text += _get_attachment_text_from_zip_file(zip_file) + zip_file.close() + rescue + $stderr.puts("Error processing zip file: #{$!.inspect}") + end + end + tempfile.close + end + + return text + end + def _get_attachment_text_from_zip_file(zip_file) + + text = "" + for entry in zip_file + if entry.file? + filename = entry.to_s + begin + body = entry.get_input_stream.read + rescue + # move to next attachment silently if there were problems + # XXX really should reduce this to specific exceptions? + # e.g. 
password protected + next + end + calc_mime = AlaveteliFileTypes.filename_to_mimetype(filename) + if calc_mime + content_type = calc_mime + else + content_type = 'application/octet-stream' + end + + text += _get_attachment_text_internal_one_file(content_type, body) + + end + end + return text + end # Turn instance methods into class methods extend self -- cgit v1.2.3 From 12b9d57ea9bbe69e5e195a90085b66056f4116a4 Mon Sep 17 00:00:00 2001 From: Louise Crow Date: Thu, 6 Dec 2012 10:38:17 +0000 Subject: No real need for this to be an internal function. --- lib/mail_handler/mail_handler.rb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'lib/mail_handler/mail_handler.rb') diff --git a/lib/mail_handler/mail_handler.rb b/lib/mail_handler/mail_handler.rb index a74a9876a..7d80753c2 100644 --- a/lib/mail_handler/mail_handler.rb +++ b/lib/mail_handler/mail_handler.rb @@ -132,7 +132,7 @@ module MailHandler # recurse into zip files begin zip_file = Zip::ZipFile.open(tempfile.path) - text += _get_attachment_text_from_zip_file(zip_file) + text += get_attachment_text_from_zip_file(zip_file) zip_file.close() rescue $stderr.puts("Error processing zip file: #{$!.inspect}") @@ -143,7 +143,7 @@ module MailHandler return text end - def _get_attachment_text_from_zip_file(zip_file) + def get_attachment_text_from_zip_file(zip_file) text = "" for entry in zip_file -- cgit v1.2.3 From 1d4ef88e60bcecc5c413cf3b6e6cb76f4bb6eaa1 Mon Sep 17 00:00:00 2001 From: Louise Crow Date: Thu, 6 Dec 2012 10:43:07 +0000 Subject: Rename _get_attachment_text_internal_one_file to get_attachment_text_one_file as it is now an externally-accessed method of the mail handler module. 
--- lib/mail_handler/mail_handler.rb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'lib/mail_handler/mail_handler.rb') diff --git a/lib/mail_handler/mail_handler.rb b/lib/mail_handler/mail_handler.rb index 7d80753c2..4b16fd046 100644 --- a/lib/mail_handler/mail_handler.rb +++ b/lib/mail_handler/mail_handler.rb @@ -69,7 +69,7 @@ module MailHandler return content_type end - def _get_attachment_text_internal_one_file(content_type, body, charset = 'utf-8') + def get_attachment_text_one_file(content_type, body, charset = 'utf-8') # note re. charset: TMail always tries to convert email bodies # to UTF8 by default, so normally it should already be that. text = '' @@ -164,7 +164,7 @@ module MailHandler content_type = 'application/octet-stream' end - text += _get_attachment_text_internal_one_file(content_type, body) + text += get_attachment_text_one_file(content_type, body) end end -- cgit v1.2.3 From c01b40eac15193434a2805f57b68bf7cfc583225 Mon Sep 17 00:00:00 2001 From: Louise Crow Date: Thu, 6 Dec 2012 14:49:42 +0000 Subject: Explicitly require the mail gem when using it as a backend. --- lib/mail_handler/mail_handler.rb | 1 + 1 file changed, 1 insertion(+) (limited to 'lib/mail_handler/mail_handler.rb') diff --git a/lib/mail_handler/mail_handler.rb b/lib/mail_handler/mail_handler.rb index 4b16fd046..8b227b9ca 100644 --- a/lib/mail_handler/mail_handler.rb +++ b/lib/mail_handler/mail_handler.rb @@ -4,6 +4,7 @@ require 'tmpdir' module MailHandler if RUBY_VERSION.to_f >= 1.9 + require 'mail' require 'backends/mail_extensions' require 'backends/mail_backend' include Backends::MailBackend -- cgit v1.2.3