about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
-rw-r--r-- app/models/incoming_message.rb 102
-rw-r--r-- lib/mail_handler/mail_handler.rb 101
-rw-r--r-- spec/lib/mail_handler/mail_handler_spec.rb 22
-rw-r--r-- spec/models/incoming_message_spec.rb 27
4 files changed, 127 insertions, 125 deletions
diff --git a/app/models/incoming_message.rb b/app/models/incoming_message.rb
index 809e8c04d..e1702689c 100644
--- a/app/models/incoming_message.rb
+++ b/app/models/incoming_message.rb
@@ -475,7 +475,7 @@ class IncomingMessage < ActiveRecord::Base
rescue Iconv::IllegalSequence
use_charset = source_charset
end
- text = self.class._get_attachment_text_internal_one_file(part.content_type, text, use_charset)
+ text = MailHandler._get_attachment_text_internal_one_file(part.content_type, text, use_charset)
end
end
@@ -702,111 +702,13 @@ class IncomingMessage < ActiveRecord::Base
return self.cached_attachment_text_clipped
end
- def IncomingMessage._get_attachment_text_internal_one_file(content_type, body, charset = 'utf-8')
- # note re. charset: TMail always tries to convert email bodies
- # to UTF8 by default, so normally it should already be that.
- text = ''
- # XXX - tell all these command line tools to return utf-8
- if content_type == 'text/plain'
- text += body + "\n\n"
- else
- tempfile = Tempfile.new('foiextract')
- tempfile.binmode
- tempfile.print body
- tempfile.flush
- default_params = { :append_to => text, :binary_output => false }
- if content_type == 'application/vnd.ms-word'
- AlaveteliExternalCommand.run("wvText", tempfile.path, tempfile.path + ".txt")
- # Try catdoc if we get into trouble (e.g. for InfoRequestEvent 2701)
- if not File.exists?(tempfile.path + ".txt")
- AlaveteliExternalCommand.run("catdoc", tempfile.path, default_params)
- else
- text += File.read(tempfile.path + ".txt") + "\n\n"
- File.unlink(tempfile.path + ".txt")
- end
- elsif content_type == 'application/rtf'
- # catdoc on RTF prodcues less comments and extra bumf than --text option to unrtf
- AlaveteliExternalCommand.run("catdoc", tempfile.path, default_params)
- elsif content_type == 'text/html'
- # lynx wordwraps links in its output, which then don't
- # get formatted properly by Alaveteli. We use elinks
- # instead, which doesn't do that.
- AlaveteliExternalCommand.run("elinks", "-eval", "set document.codepage.assume = \"#{charset}\"",
- "-eval", "set document.codepage.force_assumed = 1",
- "-dump-charset", "utf-8",
- "-force-html", "-dump",
- tempfile.path,
- default_params.merge(:env => {"LANG" => "C"}))
- elsif content_type == 'application/vnd.ms-excel'
- # Bit crazy using /usr/bin/strings - but xls2csv, xlhtml and
- # py_xls2txt only extract text from cells, not from floating
- # notes. catdoc may be fooled by weird character sets, but will
- # probably do for UK FOI requests.
- AlaveteliExternalCommand.run("/usr/bin/strings", tempfile.path, default_params)
- elsif content_type == 'application/vnd.ms-powerpoint'
- # ppthtml seems to catch more text, but only outputs HTML when
- # we want text, so just use catppt for now
- AlaveteliExternalCommand.run("catppt", tempfile.path, default_params)
- elsif content_type == 'application/pdf'
- AlaveteliExternalCommand.run("pdftotext", tempfile.path, "-", default_params)
- elsif content_type == 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
- # This is Microsoft's XML office document format.
- # Just pull out the main XML file, and strip it of text.
- xml = AlaveteliExternalCommand.run("/usr/bin/unzip", "-qq",
- "-c",
- tempfile.path,
- "word/document.xml",
- {:binary_output => false})
- if !xml.nil?
- doc = REXML::Document.new(xml)
- text += doc.each_element( './/text()' ){}.join(" ")
- end
- elsif content_type == 'application/zip'
- # recurse into zip files
- begin
- zip_file = Zip::ZipFile.open(tempfile.path)
- text += IncomingMessage._get_attachment_text_from_zip_file(zip_file)
- zip_file.close()
- rescue
- $stderr.puts("Error processing zip file: #{$!.inspect}")
- end
- end
- tempfile.close
- end
- return text
- end
- def IncomingMessage._get_attachment_text_from_zip_file(zip_file)
- text = ""
- for entry in zip_file
- if entry.file?
- filename = entry.to_s
- begin
- body = entry.get_input_stream.read
- rescue
- # move to next attachment silently if there were problems
- # XXX really should reduce this to specific exceptions?
- # e.g. password protected
- next
- end
- calc_mime = AlaveteliFileTypes.filename_to_mimetype(filename)
- if calc_mime
- content_type = calc_mime
- else
- content_type = 'application/octet-stream'
- end
-
- text += _get_attachment_text_internal_one_file(content_type, body)
- end
- end
- return text
- end
def _get_attachment_text_internal
# Extract text from each attachment
text = ''
attachments = self.get_attachments_for_display
for attachment in attachments
- text += IncomingMessage._get_attachment_text_internal_one_file(attachment.content_type, attachment.body, attachment.charset)
+ text += MailHandler._get_attachment_text_internal_one_file(attachment.content_type, attachment.body, attachment.charset)
end
# Remove any bad characters
text = Iconv.conv('utf-8//IGNORE', 'utf-8', text)
diff --git a/lib/mail_handler/mail_handler.rb b/lib/mail_handler/mail_handler.rb
index 2909d873b..a74a9876a 100644
--- a/lib/mail_handler/mail_handler.rb
+++ b/lib/mail_handler/mail_handler.rb
@@ -69,6 +69,107 @@ module MailHandler
return content_type
end
+ def _get_attachment_text_internal_one_file(content_type, body, charset = 'utf-8')
+ # note re. charset: TMail always tries to convert email bodies
+ # to UTF8 by default, so normally it should already be that.
+ text = ''
+ # XXX - tell all these command line tools to return utf-8
+ if content_type == 'text/plain'
+ text += body + "\n\n"
+ else
+ tempfile = Tempfile.new('foiextract')
+ tempfile.binmode
+ tempfile.print body
+ tempfile.flush
+ default_params = { :append_to => text, :binary_output => false }
+ if content_type == 'application/vnd.ms-word'
+ AlaveteliExternalCommand.run("wvText", tempfile.path, tempfile.path + ".txt")
+ # Try catdoc if we get into trouble (e.g. for InfoRequestEvent 2701)
+ if not File.exists?(tempfile.path + ".txt")
+ AlaveteliExternalCommand.run("catdoc", tempfile.path, default_params)
+ else
+ text += File.read(tempfile.path + ".txt") + "\n\n"
+ File.unlink(tempfile.path + ".txt")
+ end
+ elsif content_type == 'application/rtf'
+ # catdoc on RTF produces fewer comments and less extra bumf than the --text option to unrtf
+ AlaveteliExternalCommand.run("catdoc", tempfile.path, default_params)
+ elsif content_type == 'text/html'
+ # lynx wordwraps links in its output, which then don't
+ # get formatted properly by Alaveteli. We use elinks
+ # instead, which doesn't do that.
+ AlaveteliExternalCommand.run("elinks", "-eval", "set document.codepage.assume = \"#{charset}\"",
+ "-eval", "set document.codepage.force_assumed = 1",
+ "-dump-charset", "utf-8",
+ "-force-html", "-dump",
+ tempfile.path,
+ default_params.merge(:env => {"LANG" => "C"}))
+ elsif content_type == 'application/vnd.ms-excel'
+ # Bit crazy using /usr/bin/strings - but xls2csv, xlhtml and
+ # py_xls2txt only extract text from cells, not from floating
+ # notes. catdoc may be fooled by weird character sets, but will
+ # probably do for UK FOI requests.
+ AlaveteliExternalCommand.run("/usr/bin/strings", tempfile.path, default_params)
+ elsif content_type == 'application/vnd.ms-powerpoint'
+ # ppthtml seems to catch more text, but only outputs HTML when
+ # we want text, so just use catppt for now
+ AlaveteliExternalCommand.run("catppt", tempfile.path, default_params)
+ elsif content_type == 'application/pdf'
+ AlaveteliExternalCommand.run("pdftotext", tempfile.path, "-", default_params)
+ elsif content_type == 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
+ # This is Microsoft's XML office document format.
+ # Just pull out the main XML file, and extract the text from it.
+ xml = AlaveteliExternalCommand.run("/usr/bin/unzip", "-qq",
+ "-c",
+ tempfile.path,
+ "word/document.xml",
+ {:binary_output => false})
+ if !xml.nil?
+ doc = REXML::Document.new(xml)
+ text += doc.each_element( './/text()' ){}.join(" ")
+ end
+ elsif content_type == 'application/zip'
+ # recurse into zip files
+ begin
+ zip_file = Zip::ZipFile.open(tempfile.path)
+ text += _get_attachment_text_from_zip_file(zip_file)
+ zip_file.close()
+ rescue
+ $stderr.puts("Error processing zip file: #{$!.inspect}")
+ end
+ end
+ tempfile.close
+ end
+
+ return text
+ end
+ def _get_attachment_text_from_zip_file(zip_file)
+
+ text = ""
+ for entry in zip_file
+ if entry.file?
+ filename = entry.to_s
+ begin
+ body = entry.get_input_stream.read
+ rescue
+ # move to next attachment silently if there were problems
+ # XXX really should reduce this to specific exceptions?
+ # e.g. password protected
+ next
+ end
+ calc_mime = AlaveteliFileTypes.filename_to_mimetype(filename)
+ if calc_mime
+ content_type = calc_mime
+ else
+ content_type = 'application/octet-stream'
+ end
+
+ text += _get_attachment_text_internal_one_file(content_type, body)
+
+ end
+ end
+ return text
+ end
# Turn instance methods into class methods
extend self
diff --git a/spec/lib/mail_handler/mail_handler_spec.rb b/spec/lib/mail_handler/mail_handler_spec.rb
index 7eeba47e0..efa89b6e0 100644
--- a/spec/lib/mail_handler/mail_handler_spec.rb
+++ b/spec/lib/mail_handler/mail_handler_spec.rb
@@ -250,4 +250,24 @@ describe 'when getting header strings' do
'9; Autoresponder')
end
-end
\ No newline at end of file
+end
+
+describe "when parsing HTML mail" do
+ it "should display UTF-8 characters in the plain text version correctly" do
+ html = "<html><b>foo</b> është"
+ plain_text = MailHandler._get_attachment_text_internal_one_file('text/html', html)
+ plain_text.should match(/është/)
+ end
+
+end
+
+describe "when getting the attachment text" do
+
+ it "should not raise an error if the expansion of a zip file raises an error" do
+ mock_entry = mock('ZipFile entry', :file? => true)
+ mock_entry.stub!(:get_input_stream).and_raise("invalid distance too far back")
+ Zip::ZipFile.stub!(:open).and_return([mock_entry])
+ MailHandler._get_attachment_text_internal_one_file('application/zip', "some string")
+ end
+
+end
diff --git a/spec/models/incoming_message_spec.rb b/spec/models/incoming_message_spec.rb
index 1278535f8..3cfb3d5dd 100644
--- a/spec/models/incoming_message_spec.rb
+++ b/spec/models/incoming_message_spec.rb
@@ -102,27 +102,6 @@ describe IncomingMessage, " when dealing with incoming mail" do
end
-describe IncomingMessage, "when parsing HTML mail" do
- it "should display UTF-8 characters in the plain text version correctly" do
- html = "<html><b>foo</b> është"
- plain_text = IncomingMessage._get_attachment_text_internal_one_file('text/html', html)
- plain_text.should match(/është/)
- end
-
-end
-
-describe IncomingMessage, "when getting the attachment text" do
-
- it "should not raise an error if the expansion of a zip file raises an error" do
- mock_entry = mock('ZipFile entry', :file? => true)
- mock_entry.stub!(:get_input_stream).and_raise("invalid distance too far back")
- Zip::ZipFile.stub!(:open).and_return([mock_entry])
- IncomingMessage._get_attachment_text_internal_one_file('application/zip', "some string")
- end
-
-end
-
-
describe IncomingMessage, " display attachments" do
it "should not show slashes in filenames" do
@@ -138,7 +117,7 @@ describe IncomingMessage, " display attachments" do
# http://www.whatdotheyknow.com/request/post_commercial_manager_librarie#incoming-17233
foi_attachment.within_rfc822_subject = "FOI/09/066 RESPONSE TO FOI REQUEST RECEIVED 21st JANUARY 2009"
foi_attachment.content_type = 'text/plain'
- foi_attachment.ensure_filename!
+ foi_attachment.ensure_filename!
expected_display_filename = foi_attachment.within_rfc822_subject.gsub(/\//, " ") + ".txt"
foi_attachment.display_filename.should == expected_display_filename
end
@@ -326,12 +305,12 @@ describe IncomingMessage, " when censoring data" do
orig_pdf = load_file_fixture('tfl.pdf')
pdf = orig_pdf.dup
- orig_text = IncomingMessage._get_attachment_text_internal_one_file('application/pdf', pdf)
+ orig_text = MailHandler._get_attachment_text_internal_one_file('application/pdf', pdf)
orig_text.should match(/foi@tfl.gov.uk/)
@im.binary_mask_stuff!(pdf, "application/pdf")
- masked_text = IncomingMessage._get_attachment_text_internal_one_file('application/pdf', pdf)
+ masked_text = MailHandler._get_attachment_text_internal_one_file('application/pdf', pdf)
masked_text.should_not match(/foi@tfl.gov.uk/)
masked_text.should match(/xxx@xxx.xxx.xx/)
config['USE_GHOSTSCRIPT_COMPRESSION'] = previous