diff options
-rw-r--r-- | app/models/incoming_message.rb | 102 | ||||
-rw-r--r-- | lib/mail_handler/mail_handler.rb | 101 | ||||
-rw-r--r-- | spec/lib/mail_handler/mail_handler_spec.rb | 22 | ||||
-rw-r--r-- | spec/models/incoming_message_spec.rb | 27 |
4 files changed, 127 insertions, 125 deletions
diff --git a/app/models/incoming_message.rb b/app/models/incoming_message.rb index 809e8c04d..e1702689c 100644 --- a/app/models/incoming_message.rb +++ b/app/models/incoming_message.rb @@ -475,7 +475,7 @@ class IncomingMessage < ActiveRecord::Base rescue Iconv::IllegalSequence use_charset = source_charset end - text = self.class._get_attachment_text_internal_one_file(part.content_type, text, use_charset) + text = MailHandler._get_attachment_text_internal_one_file(part.content_type, text, use_charset) end end @@ -702,111 +702,13 @@ class IncomingMessage < ActiveRecord::Base return self.cached_attachment_text_clipped end - def IncomingMessage._get_attachment_text_internal_one_file(content_type, body, charset = 'utf-8') - # note re. charset: TMail always tries to convert email bodies - # to UTF8 by default, so normally it should already be that. - text = '' - # XXX - tell all these command line tools to return utf-8 - if content_type == 'text/plain' - text += body + "\n\n" - else - tempfile = Tempfile.new('foiextract') - tempfile.binmode - tempfile.print body - tempfile.flush - default_params = { :append_to => text, :binary_output => false } - if content_type == 'application/vnd.ms-word' - AlaveteliExternalCommand.run("wvText", tempfile.path, tempfile.path + ".txt") - # Try catdoc if we get into trouble (e.g. for InfoRequestEvent 2701) - if not File.exists?(tempfile.path + ".txt") - AlaveteliExternalCommand.run("catdoc", tempfile.path, default_params) - else - text += File.read(tempfile.path + ".txt") + "\n\n" - File.unlink(tempfile.path + ".txt") - end - elsif content_type == 'application/rtf' - # catdoc on RTF prodcues less comments and extra bumf than --text option to unrtf - AlaveteliExternalCommand.run("catdoc", tempfile.path, default_params) - elsif content_type == 'text/html' - # lynx wordwraps links in its output, which then don't - # get formatted properly by Alaveteli. We use elinks - # instead, which doesn't do that. - AlaveteliExternalCommand.run("elinks", "-eval", "set document.codepage.assume = \"#{charset}\"", - "-eval", "set document.codepage.force_assumed = 1", - "-dump-charset", "utf-8", - "-force-html", "-dump", - tempfile.path, - default_params.merge(:env => {"LANG" => "C"})) - elsif content_type == 'application/vnd.ms-excel' - # Bit crazy using /usr/bin/strings - but xls2csv, xlhtml and - # py_xls2txt only extract text from cells, not from floating - # notes. catdoc may be fooled by weird character sets, but will - # probably do for UK FOI requests. - AlaveteliExternalCommand.run("/usr/bin/strings", tempfile.path, default_params) - elsif content_type == 'application/vnd.ms-powerpoint' - # ppthtml seems to catch more text, but only outputs HTML when - # we want text, so just use catppt for now - AlaveteliExternalCommand.run("catppt", tempfile.path, default_params) - elsif content_type == 'application/pdf' - AlaveteliExternalCommand.run("pdftotext", tempfile.path, "-", default_params) - elsif content_type == 'application/vnd.openxmlformats-officedocument.wordprocessingml.document' - # This is Microsoft's XML office document format. - # Just pull out the main XML file, and strip it of text. - xml = AlaveteliExternalCommand.run("/usr/bin/unzip", "-qq", - "-c", - tempfile.path, - "word/document.xml", - {:binary_output => false}) - if !xml.nil? - doc = REXML::Document.new(xml) - text += doc.each_element( './/text()' ){}.join(" ") - end - elsif content_type == 'application/zip' - # recurse into zip files - begin - zip_file = Zip::ZipFile.open(tempfile.path) - text += IncomingMessage._get_attachment_text_from_zip_file(zip_file) - zip_file.close() - rescue - $stderr.puts("Error processing zip file: #{$!.inspect}") - end - end - tempfile.close - end - return text - end - def IncomingMessage._get_attachment_text_from_zip_file(zip_file) - text = "" - for entry in zip_file - if entry.file? - filename = entry.to_s - begin - body = entry.get_input_stream.read - rescue - # move to next attachment silently if there were problems - # XXX really should reduce this to specific exceptions? - # e.g. password protected - next - end - calc_mime = AlaveteliFileTypes.filename_to_mimetype(filename) - if calc_mime - content_type = calc_mime - else - content_type = 'application/octet-stream' - end - - text += _get_attachment_text_internal_one_file(content_type, body) - end - end - return text - end def _get_attachment_text_internal # Extract text from each attachment text = '' attachments = self.get_attachments_for_display for attachment in attachments - text += IncomingMessage._get_attachment_text_internal_one_file(attachment.content_type, attachment.body, attachment.charset) + text += MailHandler._get_attachment_text_internal_one_file(attachment.content_type, attachment.body, attachment.charset) end # Remove any bad characters text = Iconv.conv('utf-8//IGNORE', 'utf-8', text) diff --git a/lib/mail_handler/mail_handler.rb b/lib/mail_handler/mail_handler.rb index 2909d873b..a74a9876a 100644 --- a/lib/mail_handler/mail_handler.rb +++ b/lib/mail_handler/mail_handler.rb @@ -69,6 +69,107 @@ module MailHandler return content_type end + def _get_attachment_text_internal_one_file(content_type, body, charset = 'utf-8') + # note re. charset: TMail always tries to convert email bodies + # to UTF8 by default, so normally it should already be that. + text = '' + # XXX - tell all these command line tools to return utf-8 + if content_type == 'text/plain' + text += body + "\n\n" + else + tempfile = Tempfile.new('foiextract') + tempfile.binmode + tempfile.print body + tempfile.flush + default_params = { :append_to => text, :binary_output => false } + if content_type == 'application/vnd.ms-word' + AlaveteliExternalCommand.run("wvText", tempfile.path, tempfile.path + ".txt") + # Try catdoc if we get into trouble (e.g. for InfoRequestEvent 2701) + if not File.exists?(tempfile.path + ".txt") + AlaveteliExternalCommand.run("catdoc", tempfile.path, default_params) + else + text += File.read(tempfile.path + ".txt") + "\n\n" + File.unlink(tempfile.path + ".txt") + end + elsif content_type == 'application/rtf' + # catdoc on RTF prodcues less comments and extra bumf than --text option to unrtf + AlaveteliExternalCommand.run("catdoc", tempfile.path, default_params) + elsif content_type == 'text/html' + # lynx wordwraps links in its output, which then don't + # get formatted properly by Alaveteli. We use elinks + # instead, which doesn't do that. + AlaveteliExternalCommand.run("elinks", "-eval", "set document.codepage.assume = \"#{charset}\"", + "-eval", "set document.codepage.force_assumed = 1", + "-dump-charset", "utf-8", + "-force-html", "-dump", + tempfile.path, + default_params.merge(:env => {"LANG" => "C"})) + elsif content_type == 'application/vnd.ms-excel' + # Bit crazy using /usr/bin/strings - but xls2csv, xlhtml and + # py_xls2txt only extract text from cells, not from floating + # notes. catdoc may be fooled by weird character sets, but will + # probably do for UK FOI requests. + AlaveteliExternalCommand.run("/usr/bin/strings", tempfile.path, default_params) + elsif content_type == 'application/vnd.ms-powerpoint' + # ppthtml seems to catch more text, but only outputs HTML when + # we want text, so just use catppt for now + AlaveteliExternalCommand.run("catppt", tempfile.path, default_params) + elsif content_type == 'application/pdf' + AlaveteliExternalCommand.run("pdftotext", tempfile.path, "-", default_params) + elsif content_type == 'application/vnd.openxmlformats-officedocument.wordprocessingml.document' + # This is Microsoft's XML office document format. + # Just pull out the main XML file, and strip it of text. + xml = AlaveteliExternalCommand.run("/usr/bin/unzip", "-qq", + "-c", + tempfile.path, + "word/document.xml", + {:binary_output => false}) + if !xml.nil? + doc = REXML::Document.new(xml) + text += doc.each_element( './/text()' ){}.join(" ") + end + elsif content_type == 'application/zip' + # recurse into zip files + begin + zip_file = Zip::ZipFile.open(tempfile.path) + text += _get_attachment_text_from_zip_file(zip_file) + zip_file.close() + rescue + $stderr.puts("Error processing zip file: #{$!.inspect}") + end + end + tempfile.close + end + + return text + end + def _get_attachment_text_from_zip_file(zip_file) + + text = "" + for entry in zip_file + if entry.file? + filename = entry.to_s + begin + body = entry.get_input_stream.read + rescue + # move to next attachment silently if there were problems + # XXX really should reduce this to specific exceptions? + # e.g. password protected + next + end + calc_mime = AlaveteliFileTypes.filename_to_mimetype(filename) + if calc_mime + content_type = calc_mime + else + content_type = 'application/octet-stream' + end + + text += _get_attachment_text_internal_one_file(content_type, body) + + end + end + return text + end # Turn instance methods into class methods extend self diff --git a/spec/lib/mail_handler/mail_handler_spec.rb b/spec/lib/mail_handler/mail_handler_spec.rb index 7eeba47e0..efa89b6e0 100644 --- a/spec/lib/mail_handler/mail_handler_spec.rb +++ b/spec/lib/mail_handler/mail_handler_spec.rb @@ -250,4 +250,24 @@ describe 'when getting header strings' do '9; Autoresponder') end -end
\ No newline at end of file +end + +describe "when parsing HTML mail" do + it "should display UTF-8 characters in the plain text version correctly" do + html = "<html><b>foo</b> është" + plain_text = MailHandler._get_attachment_text_internal_one_file('text/html', html) + plain_text.should match(/është/) + end + +end + +describe "when getting the attachment text" do + + it "should not raise an error if the expansion of a zip file raises an error" do + mock_entry = mock('ZipFile entry', :file? => true) + mock_entry.stub!(:get_input_stream).and_raise("invalid distance too far back") + Zip::ZipFile.stub!(:open).and_return([mock_entry]) + MailHandler._get_attachment_text_internal_one_file('application/zip', "some string") + end + +end diff --git a/spec/models/incoming_message_spec.rb b/spec/models/incoming_message_spec.rb index 1278535f8..3cfb3d5dd 100644 --- a/spec/models/incoming_message_spec.rb +++ b/spec/models/incoming_message_spec.rb @@ -102,27 +102,6 @@ describe IncomingMessage, " when dealing with incoming mail" do end -describe IncomingMessage, "when parsing HTML mail" do - it "should display UTF-8 characters in the plain text version correctly" do - html = "<html><b>foo</b> është" - plain_text = IncomingMessage._get_attachment_text_internal_one_file('text/html', html) - plain_text.should match(/është/) - end - -end - -describe IncomingMessage, "when getting the attachment text" do - - it "should not raise an error if the expansion of a zip file raises an error" do - mock_entry = mock('ZipFile entry', :file? => true) - mock_entry.stub!(:get_input_stream).and_raise("invalid distance too far back") - Zip::ZipFile.stub!(:open).and_return([mock_entry]) - IncomingMessage._get_attachment_text_internal_one_file('application/zip', "some string") - end - -end - - describe IncomingMessage, " display attachments" do it "should not show slashes in filenames" do @@ -138,7 +117,7 @@ describe IncomingMessage, " display attachments" do # http://www.whatdotheyknow.com/request/post_commercial_manager_librarie#incoming-17233 foi_attachment.within_rfc822_subject = "FOI/09/066 RESPONSE TO FOI REQUEST RECEIVED 21st JANUARY 2009" foi_attachment.content_type = 'text/plain' - foi_attachment.ensure_filename! + foi_attachment.ensure_filename! expected_display_filename = foi_attachment.within_rfc822_subject.gsub(/\//, " ") + ".txt" foi_attachment.display_filename.should == expected_display_filename end @@ -326,12 +305,12 @@ describe IncomingMessage, " when censoring data" do orig_pdf = load_file_fixture('tfl.pdf') pdf = orig_pdf.dup - orig_text = IncomingMessage._get_attachment_text_internal_one_file('application/pdf', pdf) + orig_text = MailHandler._get_attachment_text_internal_one_file('application/pdf', pdf) orig_text.should match(/foi@tfl.gov.uk/) @im.binary_mask_stuff!(pdf, "application/pdf") - masked_text = IncomingMessage._get_attachment_text_internal_one_file('application/pdf', pdf) + masked_text = MailHandler._get_attachment_text_internal_one_file('application/pdf', pdf) masked_text.should_not match(/foi@tfl.gov.uk/) masked_text.should match(/xxx@xxx.xxx.xx/) config['USE_GHOSTSCRIPT_COMPRESSION'] = previous |