From be5e69a7dccaa6c76408f9b7883980bd79bdba28 Mon Sep 17 00:00:00 2001 From: Louise Crow Date: Thu, 15 Nov 2012 11:00:36 +0000 Subject: First skeletal version of separate mail handling library. --- lib/mail_handler/mail_handler.rb | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 lib/mail_handler/mail_handler.rb (limited to 'lib/mail_handler/mail_handler.rb') diff --git a/lib/mail_handler/mail_handler.rb b/lib/mail_handler/mail_handler.rb new file mode 100644 index 000000000..5db17ae77 --- /dev/null +++ b/lib/mail_handler/mail_handler.rb @@ -0,0 +1,16 @@ +# Handles the parsing of email +module MailHandler + + if RUBY_VERSION.to_f >= 1.9 + require 'backends/mail_backend' + include Backends::MailBackend + else + require 'backends/tmail_backend' + include Backends::TmailBackend + end + + # Turn instance methods into class methods + extend self + +end + -- cgit v1.2.3 From 4bdab94e9d4f0a64647e5f8534c1fea8b4ba2809 Mon Sep 17 00:00:00 2001 From: Louise Crow Date: Thu, 15 Nov 2012 14:04:55 +0000 Subject: Move TMail extensions to mail handler. --- lib/mail_handler/mail_handler.rb | 1 + 1 file changed, 1 insertion(+) (limited to 'lib/mail_handler/mail_handler.rb') diff --git a/lib/mail_handler/mail_handler.rb b/lib/mail_handler/mail_handler.rb index 5db17ae77..f0c75670a 100644 --- a/lib/mail_handler/mail_handler.rb +++ b/lib/mail_handler/mail_handler.rb @@ -5,6 +5,7 @@ module MailHandler require 'backends/mail_backend' include Backends::MailBackend else + require 'backends/tmail_extensions' require 'backends/tmail_backend' include Backends::TmailBackend end -- cgit v1.2.3 From 388c75bfbd18fcaf273d95c21dc132ad19f0cefe Mon Sep 17 00:00:00 2001 From: Louise Crow Date: Thu, 15 Nov 2012 16:12:23 +0000 Subject: Move handling of TNEF mail attachments to mail handler --- lib/mail_handler/mail_handler.rb | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) (limited to 'lib/mail_handler/mail_handler.rb') diff --git a/lib/mail_handler/mail_handler.rb b/lib/mail_handler/mail_handler.rb index f0c75670a..0bd9a82f0 100644 --- a/lib/mail_handler/mail_handler.rb +++ b/lib/mail_handler/mail_handler.rb @@ -1,4 +1,6 @@ # Handles the parsing of email +require 'tmpdir' + module MailHandler if RUBY_VERSION.to_f >= 1.9 @@ -10,6 +12,38 @@ module MailHandler include Backends::TmailBackend end + # Returns a set of attachments from the given TNEF contents + # The TNEF contents also contains the message body, but in general this is the + # same as the message body in the message proper. + def tnef_attachments(content) + attachments = [] + Dir.mktmpdir do |dir| + IO.popen("#{`which tnef`.chomp} -K -C #{dir}", "w") do |f| + f.write(content) + f.close + if $?.signaled? + raise IOError, "tnef exited with signal #{$?.termsig}" + end + if $?.exited? && $?.exitstatus != 0 + raise IOError, "tnef exited with status #{$?.exitstatus}" + end + end + found = 0 + Dir.new(dir).sort.each do |file| # sort for deterministic behaviour + if file != "." && file != ".." + file_content = File.open("#{dir}/#{file}", "r").read + attachments << { :content => file_content, + :filename => file } + found += 1 + end + end + if found == 0 + raise IOError, "tnef produced no attachments" + end + end + attachments + end + # Turn instance methods into class methods extend self -- cgit v1.2.3 From 9ef3f43fca535ffb52d2420bcfd8f18e5213b943 Mon Sep 17 00:00:00 2001 From: Louise Crow Date: Thu, 15 Nov 2012 16:21:38 +0000 Subject: Add some extra accessors to Mail::Message for now --- lib/mail_handler/mail_handler.rb | 1 + 1 file changed, 1 insertion(+) (limited to 'lib/mail_handler/mail_handler.rb') diff --git a/lib/mail_handler/mail_handler.rb b/lib/mail_handler/mail_handler.rb index 0bd9a82f0..24d14b5c8 100644 --- a/lib/mail_handler/mail_handler.rb +++ b/lib/mail_handler/mail_handler.rb @@ -4,6 +4,7 @@ require 'tmpdir' module MailHandler if RUBY_VERSION.to_f >= 1.9 + require 'backends/mail_extensions' require 'backends/mail_backend' include Backends::MailBackend else -- cgit v1.2.3 From 4a4f085e05aca055378613afdf424e3a0adf7f7c Mon Sep 17 00:00:00 2001 From: Louise Crow Date: Tue, 4 Dec 2012 18:11:25 +0000 Subject: Move the action mailer dependency to the mail handler so that it's only active in Ruby 1.8. Use mail handler methods in handle-mail-replies. --- lib/mail_handler/mail_handler.rb | 1 + 1 file changed, 1 insertion(+) (limited to 'lib/mail_handler/mail_handler.rb') diff --git a/lib/mail_handler/mail_handler.rb b/lib/mail_handler/mail_handler.rb index 24d14b5c8..7b0f6e7f2 100644 --- a/lib/mail_handler/mail_handler.rb +++ b/lib/mail_handler/mail_handler.rb @@ -8,6 +8,7 @@ module MailHandler require 'backends/mail_backend' include Backends::MailBackend else + require 'action_mailer' require 'backends/tmail_extensions' require 'backends/tmail_backend' include Backends::TmailBackend -- cgit v1.2.3 From 8cae0445839920c64240e66478e43674c8713177 Mon Sep 17 00:00:00 2001 From: Louise Crow Date: Wed, 5 Dec 2012 15:37:08 +0000 Subject: Move content type normalization to mail handler. --- lib/mail_handler/mail_handler.rb | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'lib/mail_handler/mail_handler.rb') diff --git a/lib/mail_handler/mail_handler.rb b/lib/mail_handler/mail_handler.rb index 7b0f6e7f2..a49ec8e8c 100644 --- a/lib/mail_handler/mail_handler.rb +++ b/lib/mail_handler/mail_handler.rb @@ -46,6 +46,30 @@ module MailHandler attachments end + def normalise_content_type(content_type) + # e.g. http://www.whatdotheyknow.com/request/93/response/250 + if content_type == 'application/excel' or content_type == 'application/msexcel' or content_type == 'application/x-ms-excel' + content_type = 'application/vnd.ms-excel' + end + if content_type == 'application/mspowerpoint' or content_type == 'application/x-ms-powerpoint' + content_type = 'application/vnd.ms-powerpoint' + end + if content_type == 'application/msword' or content_type == 'application/x-ms-word' + content_type = 'application/vnd.ms-word' + end + if content_type == 'application/x-zip-compressed' + content_type = 'application/zip' + end + + # e.g. http://www.whatdotheyknow.com/request/copy_of_current_swessex_scr_opt#incoming-9928 + if content_type == 'application/acrobat' + content_type = 'application/pdf' + end + + return content_type + end + + # Turn instance methods into class methods extend self -- cgit v1.2.3 From 0b8c40a57aa6f4a1da2e83c276ca523499d517e4 Mon Sep 17 00:00:00 2001 From: Louise Crow Date: Wed, 5 Dec 2012 18:29:43 +0000 Subject: Read and write to the file in binary mode during a TNEF conversion operation. --- lib/mail_handler/mail_handler.rb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'lib/mail_handler/mail_handler.rb') diff --git a/lib/mail_handler/mail_handler.rb b/lib/mail_handler/mail_handler.rb index a49ec8e8c..2909d873b 100644 --- a/lib/mail_handler/mail_handler.rb +++ b/lib/mail_handler/mail_handler.rb @@ -20,7 +20,7 @@ module MailHandler def tnef_attachments(content) attachments = [] Dir.mktmpdir do |dir| - IO.popen("#{`which tnef`.chomp} -K -C #{dir}", "w") do |f| + IO.popen("#{`which tnef`.chomp} -K -C #{dir}", "wb") do |f| f.write(content) f.close if $?.signaled? @@ -33,7 +33,7 @@ module MailHandler found = 0 Dir.new(dir).sort.each do |file| # sort for deterministic behaviour if file != "." && file != ".." - file_content = File.open("#{dir}/#{file}", "r").read + file_content = File.open("#{dir}/#{file}", "rb").read attachments << { :content => file_content, :filename => file } found += 1 -- cgit v1.2.3 From 14bbd1d75840add4fd5f8c440b58ac465c306fb6 Mon Sep 17 00:00:00 2001 From: Louise Crow Date: Thu, 6 Dec 2012 10:23:37 +0000 Subject: Move methods for getting the text out of attachments to the mail handler module. --- lib/mail_handler/mail_handler.rb | 101 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 101 insertions(+) (limited to 'lib/mail_handler/mail_handler.rb') diff --git a/lib/mail_handler/mail_handler.rb b/lib/mail_handler/mail_handler.rb index 2909d873b..a74a9876a 100644 --- a/lib/mail_handler/mail_handler.rb +++ b/lib/mail_handler/mail_handler.rb @@ -69,6 +69,107 @@ module MailHandler return content_type end + def _get_attachment_text_internal_one_file(content_type, body, charset = 'utf-8') + # note re. charset: TMail always tries to convert email bodies + # to UTF8 by default, so normally it should already be that. + text = '' + # XXX - tell all these command line tools to return utf-8 + if content_type == 'text/plain' + text += body + "\n\n" + else + tempfile = Tempfile.new('foiextract') + tempfile.binmode + tempfile.print body + tempfile.flush + default_params = { :append_to => text, :binary_output => false } + if content_type == 'application/vnd.ms-word' + AlaveteliExternalCommand.run("wvText", tempfile.path, tempfile.path + ".txt") + # Try catdoc if we get into trouble (e.g. for InfoRequestEvent 2701) + if not File.exists?(tempfile.path + ".txt") + AlaveteliExternalCommand.run("catdoc", tempfile.path, default_params) + else + text += File.read(tempfile.path + ".txt") + "\n\n" + File.unlink(tempfile.path + ".txt") + end + elsif content_type == 'application/rtf' + # catdoc on RTF prodcues less comments and extra bumf than --text option to unrtf + AlaveteliExternalCommand.run("catdoc", tempfile.path, default_params) + elsif content_type == 'text/html' + # lynx wordwraps links in its output, which then don't + # get formatted properly by Alaveteli. We use elinks + # instead, which doesn't do that. + AlaveteliExternalCommand.run("elinks", "-eval", "set document.codepage.assume = \"#{charset}\"", + "-eval", "set document.codepage.force_assumed = 1", + "-dump-charset", "utf-8", + "-force-html", "-dump", + tempfile.path, + default_params.merge(:env => {"LANG" => "C"})) + elsif content_type == 'application/vnd.ms-excel' + # Bit crazy using /usr/bin/strings - but xls2csv, xlhtml and + # py_xls2txt only extract text from cells, not from floating + # notes. catdoc may be fooled by weird character sets, but will + # probably do for UK FOI requests. + AlaveteliExternalCommand.run("/usr/bin/strings", tempfile.path, default_params) + elsif content_type == 'application/vnd.ms-powerpoint' + # ppthtml seems to catch more text, but only outputs HTML when + # we want text, so just use catppt for now + AlaveteliExternalCommand.run("catppt", tempfile.path, default_params) + elsif content_type == 'application/pdf' + AlaveteliExternalCommand.run("pdftotext", tempfile.path, "-", default_params) + elsif content_type == 'application/vnd.openxmlformats-officedocument.wordprocessingml.document' + # This is Microsoft's XML office document format. + # Just pull out the main XML file, and strip it of text. + xml = AlaveteliExternalCommand.run("/usr/bin/unzip", "-qq", + "-c", + tempfile.path, + "word/document.xml", + {:binary_output => false}) + if !xml.nil? + doc = REXML::Document.new(xml) + text += doc.each_element( './/text()' ){}.join(" ") + end + elsif content_type == 'application/zip' + # recurse into zip files + begin + zip_file = Zip::ZipFile.open(tempfile.path) + text += _get_attachment_text_from_zip_file(zip_file) + zip_file.close() + rescue + $stderr.puts("Error processing zip file: #{$!.inspect}") + end + end + tempfile.close + end + + return text + end + def _get_attachment_text_from_zip_file(zip_file) + + text = "" + for entry in zip_file + if entry.file? + filename = entry.to_s + begin + body = entry.get_input_stream.read + rescue + # move to next attachment silently if there were problems + # XXX really should reduce this to specific exceptions? + # e.g. password protected + next + end + calc_mime = AlaveteliFileTypes.filename_to_mimetype(filename) + if calc_mime + content_type = calc_mime + else + content_type = 'application/octet-stream' + end + + text += _get_attachment_text_internal_one_file(content_type, body) + + end + end + return text + end # Turn instance methods into class methods extend self -- cgit v1.2.3 From 12b9d57ea9bbe69e5e195a90085b66056f4116a4 Mon Sep 17 00:00:00 2001 From: Louise Crow Date: Thu, 6 Dec 2012 10:38:17 +0000 Subject: No real need for this to be an internal function. --- lib/mail_handler/mail_handler.rb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'lib/mail_handler/mail_handler.rb') diff --git a/lib/mail_handler/mail_handler.rb b/lib/mail_handler/mail_handler.rb index a74a9876a..7d80753c2 100644 --- a/lib/mail_handler/mail_handler.rb +++ b/lib/mail_handler/mail_handler.rb @@ -132,7 +132,7 @@ module MailHandler # recurse into zip files begin zip_file = Zip::ZipFile.open(tempfile.path) - text += _get_attachment_text_from_zip_file(zip_file) + text += get_attachment_text_from_zip_file(zip_file) zip_file.close() rescue $stderr.puts("Error processing zip file: #{$!.inspect}") @@ -143,7 +143,7 @@ module MailHandler return text end - def _get_attachment_text_from_zip_file(zip_file) + def get_attachment_text_from_zip_file(zip_file) text = "" for entry in zip_file -- cgit v1.2.3 From 1d4ef88e60bcecc5c413cf3b6e6cb76f4bb6eaa1 Mon Sep 17 00:00:00 2001 From: Louise Crow Date: Thu, 6 Dec 2012 10:43:07 +0000 Subject: Rename _get_attachment_text_internal_one_file to get_attachment_text_one_file as it is now an externally-accessed method of the mail handler module. --- lib/mail_handler/mail_handler.rb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'lib/mail_handler/mail_handler.rb') diff --git a/lib/mail_handler/mail_handler.rb b/lib/mail_handler/mail_handler.rb index 7d80753c2..4b16fd046 100644 --- a/lib/mail_handler/mail_handler.rb +++ b/lib/mail_handler/mail_handler.rb @@ -69,7 +69,7 @@ module MailHandler return content_type end - def _get_attachment_text_internal_one_file(content_type, body, charset = 'utf-8') + def get_attachment_text_one_file(content_type, body, charset = 'utf-8') # note re. charset: TMail always tries to convert email bodies # to UTF8 by default, so normally it should already be that. text = '' @@ -164,7 +164,7 @@ module MailHandler content_type = 'application/octet-stream' end - text += _get_attachment_text_internal_one_file(content_type, body) + text += get_attachment_text_one_file(content_type, body) end end -- cgit v1.2.3 From c01b40eac15193434a2805f57b68bf7cfc583225 Mon Sep 17 00:00:00 2001 From: Louise Crow Date: Thu, 6 Dec 2012 14:49:42 +0000 Subject: Explicitly require the mail gem when using it as a backend. --- lib/mail_handler/mail_handler.rb | 1 + 1 file changed, 1 insertion(+) (limited to 'lib/mail_handler/mail_handler.rb') diff --git a/lib/mail_handler/mail_handler.rb b/lib/mail_handler/mail_handler.rb index 4b16fd046..8b227b9ca 100644 --- a/lib/mail_handler/mail_handler.rb +++ b/lib/mail_handler/mail_handler.rb @@ -4,6 +4,7 @@ require 'tmpdir' module MailHandler if RUBY_VERSION.to_f >= 1.9 + require 'mail' require 'backends/mail_extensions' require 'backends/mail_backend' include Backends::MailBackend -- cgit v1.2.3