4 files changed, 127 insertions, 125 deletions
diff --git a/app/models/incoming_message.rb b/app/models/incoming_message.rb
index 809e8c04d..e1702689c 100644
--- a/app/models/incoming_message.rb
+++ b/app/models/incoming_message.rb
@@ -475,7 +475,7 @@ class IncomingMessage < ActiveRecord::Base
                 rescue Iconv::IllegalSequence
                     use_charset = source_charset
                 end
-                text = self.class._get_attachment_text_internal_one_file(part.content_type, text, use_charset)
+                text = MailHandler._get_attachment_text_internal_one_file(part.content_type, text, use_charset)
             end
         end
 
@@ -702,111 +702,13 @@ class IncomingMessage < ActiveRecord::Base
 
         return self.cached_attachment_text_clipped
     end
-    def IncomingMessage._get_attachment_text_internal_one_file(content_type, body, charset = 'utf-8')
-        # note re. charset: TMail always tries to convert email bodies
-        # to UTF8 by default, so normally it should already be that.
-        text = ''
-        # XXX - tell all these command line tools to return utf-8
-        if content_type == 'text/plain'
-            text += body + "\n\n"
-        else
-            tempfile = Tempfile.new('foiextract')
-            tempfile.binmode
-            tempfile.print body
-            tempfile.flush
-            default_params = { :append_to => text, :binary_output => false }
-            if content_type == 'application/vnd.ms-word'
-                AlaveteliExternalCommand.run("wvText", tempfile.path, tempfile.path + ".txt")
-                # Try catdoc if we get into trouble (e.g. for InfoRequestEvent 2701)
-                if not File.exists?(tempfile.path + ".txt")
-                    AlaveteliExternalCommand.run("catdoc", tempfile.path, default_params)
-                else
-                    text += File.read(tempfile.path + ".txt") + "\n\n"
-                    File.unlink(tempfile.path + ".txt")
-                end
-            elsif content_type == 'application/rtf'
-                # catdoc on RTF prodcues less comments and extra bumf than --text option to unrtf
-                AlaveteliExternalCommand.run("catdoc", tempfile.path, default_params)
-            elsif content_type == 'text/html'
-                # lynx wordwraps links in its output, which then don't
-                # get formatted properly by Alaveteli. We use elinks
-                # instead, which doesn't do that.
-                AlaveteliExternalCommand.run("elinks", "-eval", "set document.codepage.assume = \"#{charset}\"",
-                                                       "-eval", "set document.codepage.force_assumed = 1",
-                                                       "-dump-charset", "utf-8",
-                                                       "-force-html", "-dump",
-                                                       tempfile.path,
-                                                       default_params.merge(:env => {"LANG" => "C"}))
-            elsif content_type == 'application/vnd.ms-excel'
-                # Bit crazy using /usr/bin/strings - but xls2csv, xlhtml and
-                # py_xls2txt only extract text from cells, not from floating
-                # notes. catdoc may be fooled by weird character sets, but will
-                # probably do for UK FOI requests.
-                AlaveteliExternalCommand.run("/usr/bin/strings", tempfile.path, default_params)
-            elsif content_type == 'application/vnd.ms-powerpoint'
-                # ppthtml seems to catch more text, but only outputs HTML when
-                # we want text, so just use catppt for now
-                AlaveteliExternalCommand.run("catppt", tempfile.path, default_params)
-            elsif content_type == 'application/pdf'
-                AlaveteliExternalCommand.run("pdftotext", tempfile.path, "-", default_params)
-            elsif content_type == 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
-                # This is Microsoft's XML office document format.
-                # Just pull out the main XML file, and strip it of text.
-                xml = AlaveteliExternalCommand.run("/usr/bin/unzip", "-qq",
-                                                                     "-c",
-                                                                     tempfile.path,
-                                                                     "word/document.xml",
-                                                                     {:binary_output => false})
-                if !xml.nil?
-                    doc = REXML::Document.new(xml)
-                    text += doc.each_element( './/text()' ){}.join(" ")
-                end
-            elsif content_type == 'application/zip'
-                # recurse into zip files
-                begin
-                    zip_file = Zip::ZipFile.open(tempfile.path)
-                    text += IncomingMessage._get_attachment_text_from_zip_file(zip_file)
-                    zip_file.close()
-                rescue
-                    $stderr.puts("Error processing zip file: #{$!.inspect}")
-                end
-            end
-            tempfile.close
-        end
 
-        return text
-    end
-    def IncomingMessage._get_attachment_text_from_zip_file(zip_file)
-        text = ""
-        for entry in zip_file
-            if entry.file?
-                filename = entry.to_s
-                begin
-                    body = entry.get_input_stream.read
-                rescue
-                    # move to next attachment silently if there were problems
-                    # XXX really should reduce this to specific exceptions?
-                    # e.g. password protected
-                    next
-                end
-                calc_mime = AlaveteliFileTypes.filename_to_mimetype(filename)
-                if calc_mime
-                    content_type = calc_mime
-                else
-                    content_type = 'application/octet-stream'
-                end
-
-                text += _get_attachment_text_internal_one_file(content_type, body)
-            end
-        end
-        return text
-    end
     def _get_attachment_text_internal
         # Extract text from each attachment
         text = ''
         attachments = self.get_attachments_for_display
         for attachment in attachments
-            text += IncomingMessage._get_attachment_text_internal_one_file(attachment.content_type, attachment.body, attachment.charset)
+            text += MailHandler._get_attachment_text_internal_one_file(attachment.content_type, attachment.body, attachment.charset)
         end
         # Remove any bad characters
         text = Iconv.conv('utf-8//IGNORE', 'utf-8', text)
diff --git a/lib/mail_handler/mail_handler.rb b/lib/mail_handler/mail_handler.rb
index 2909d873b..a74a9876a 100644
--- a/lib/mail_handler/mail_handler.rb
+++ b/lib/mail_handler/mail_handler.rb
@@ -69,6 +69,107 @@ module MailHandler
         return content_type
     end
 
+    def _get_attachment_text_internal_one_file(content_type, body, charset = 'utf-8')
+        # Note re. charset: TMail always tries to convert email bodies
+        # to UTF-8 by default, so normally the body should already be UTF-8.
+        text = ''
+        # XXX - tell all these command line tools to return utf-8
+        if content_type == 'text/plain'
+            text += body + "\n\n"
+        else
+            tempfile = Tempfile.new('foiextract')
+            tempfile.binmode
+            tempfile.print body
+            tempfile.flush
+            default_params = { :append_to => text, :binary_output => false }
+            if content_type == 'application/vnd.ms-word'
+                AlaveteliExternalCommand.run("wvText", tempfile.path, tempfile.path + ".txt")
+                # Try catdoc if we get into trouble (e.g. for InfoRequestEvent 2701)
+                if not File.exists?(tempfile.path + ".txt")
+                    AlaveteliExternalCommand.run("catdoc", tempfile.path, default_params)
+                else
+                    text += File.read(tempfile.path + ".txt") + "\n\n"
+                    File.unlink(tempfile.path + ".txt")
+                end
+            elsif content_type == 'application/rtf'
+                # catdoc on RTF produces fewer comments and less extra bumf than the --text option to unrtf
+                AlaveteliExternalCommand.run("catdoc", tempfile.path, default_params)
+            elsif content_type == 'text/html'
+                # lynx wordwraps links in its output, which then don't
+                # get formatted properly by Alaveteli. We use elinks
+                # instead, which doesn't do that.
+                AlaveteliExternalCommand.run("elinks", "-eval", "set document.codepage.assume = \"#{charset}\"",
+                                                       "-eval", "set document.codepage.force_assumed = 1",
+                                                       "-dump-charset", "utf-8",
+                                                       "-force-html", "-dump",
+                                                       tempfile.path,
+                                                       default_params.merge(:env => {"LANG" => "C"}))
+            elsif content_type == 'application/vnd.ms-excel'
+                # Bit crazy using /usr/bin/strings - but xls2csv, xlhtml and
+                # py_xls2txt only extract text from cells, not from floating
+                # notes. catdoc may be fooled by weird character sets, but will
+                # probably do for UK FOI requests.
+                AlaveteliExternalCommand.run("/usr/bin/strings", tempfile.path, default_params)
+            elsif content_type == 'application/vnd.ms-powerpoint'
+                # ppthtml seems to catch more text, but only outputs HTML when
+                # we want text, so just use catppt for now
+                AlaveteliExternalCommand.run("catppt", tempfile.path, default_params)
+            elsif content_type == 'application/pdf'
+                AlaveteliExternalCommand.run("pdftotext", tempfile.path, "-", default_params)
+            elsif content_type == 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
+                # This is Microsoft's XML office document format.
+                # Just pull out the main XML file, and strip it of text.
+                xml = AlaveteliExternalCommand.run("/usr/bin/unzip", "-qq",
+                                                                     "-c",
+                                                                     tempfile.path,
+                                                                     "word/document.xml",
+                                                                     {:binary_output => false})
+                if !xml.nil?
+                    doc = REXML::Document.new(xml)
+                    text += doc.each_element( './/text()' ){}.join(" ")
+                end
+            elsif content_type == 'application/zip'
+                # recurse into zip files
+                begin
+                    zip_file = Zip::ZipFile.open(tempfile.path)
+                    text += _get_attachment_text_from_zip_file(zip_file)
+                    zip_file.close()
+                rescue
+                    $stderr.puts("Error processing zip file: #{$!.inspect}")
+                end
+            end
+            tempfile.close
+        end
+
+        return text
+    end
+    def _get_attachment_text_from_zip_file(zip_file)
+
+        text = ""
+        for entry in zip_file
+            if entry.file?
+                filename = entry.to_s
+                begin
+                    body = entry.get_input_stream.read
+                rescue
+                    # move to next attachment silently if there were problems
+                    # XXX really should reduce this to specific exceptions?
+                    # e.g. password protected
+                    next
+                end
+                calc_mime = AlaveteliFileTypes.filename_to_mimetype(filename)
+                if calc_mime
+                    content_type = calc_mime
+                else
+                    content_type = 'application/octet-stream'
+                end
+
+                text += _get_attachment_text_internal_one_file(content_type, body)
+
+            end
+        end
+        return text
+    end
 
     # Turn instance methods into class methods
     extend self
diff --git a/spec/lib/mail_handler/mail_handler_spec.rb b/spec/lib/mail_handler/mail_handler_spec.rb
index 7eeba47e0..efa89b6e0 100644
--- a/spec/lib/mail_handler/mail_handler_spec.rb
+++ b/spec/lib/mail_handler/mail_handler_spec.rb
@@ -250,4 +250,24 @@ describe 'when getting header strings' do
                              '9; Autoresponder')
     end
 
-end
-\ No newline at end of file
+end
+
+describe "when parsing HTML mail" do
+    it "should display UTF-8 characters in the plain text version correctly" do
+        html = "<html><b>foo</b> është"
+        plain_text = MailHandler._get_attachment_text_internal_one_file('text/html', html)
+        plain_text.should match(/është/)
+    end
+
+end
+
+describe "when getting the attachment text" do
+
+  it "should not raise an error if the expansion of a zip file raises an error" do
+    mock_entry = mock('ZipFile entry', :file? => true)
+    mock_entry.stub!(:get_input_stream).and_raise("invalid distance too far back")
+    Zip::ZipFile.stub!(:open).and_return([mock_entry])
+    MailHandler._get_attachment_text_internal_one_file('application/zip', "some string")
+  end
+
+end
diff --git a/spec/models/incoming_message_spec.rb b/spec/models/incoming_message_spec.rb
index 1278535f8..3cfb3d5dd 100644
--- a/spec/models/incoming_message_spec.rb
+++ b/spec/models/incoming_message_spec.rb
@@ -102,27 +102,6 @@ describe IncomingMessage, " when dealing with incoming mail" do
 
 end
 
-describe IncomingMessage, "when parsing HTML mail" do
-    it "should display UTF-8 characters in the plain text version correctly" do
-        html = "<html><b>foo</b> është"
-        plain_text = IncomingMessage._get_attachment_text_internal_one_file('text/html', html)
-        plain_text.should match(/është/)
-    end
-
-end
-
-describe IncomingMessage, "when getting the attachment text" do
-
-  it "should not raise an error if the expansion of a zip file raises an error" do
-    mock_entry = mock('ZipFile entry', :file? => true)
-    mock_entry.stub!(:get_input_stream).and_raise("invalid distance too far back")
-    Zip::ZipFile.stub!(:open).and_return([mock_entry])
-    IncomingMessage._get_attachment_text_internal_one_file('application/zip', "some string")
-  end
-
-end
-
-
 describe IncomingMessage, " display attachments" do
 
     it "should not show slashes in filenames" do
@@ -138,7 +117,7 @@ describe IncomingMessage, " display attachments" do
         # http://www.whatdotheyknow.com/request/post_commercial_manager_librarie#incoming-17233
         foi_attachment.within_rfc822_subject = "FOI/09/066 RESPONSE TO FOI REQUEST RECEIVED 21st JANUARY 2009"
         foi_attachment.content_type = 'text/plain'
-        foi_attachment.ensure_filename!
+            foi_attachment.ensure_filename!
         expected_display_filename = foi_attachment.within_rfc822_subject.gsub(/\//, " ") + ".txt"
         foi_attachment.display_filename.should == expected_display_filename
     end
@@ -326,12 +305,12 @@ describe IncomingMessage, " when censoring data" do
         orig_pdf = load_file_fixture('tfl.pdf')
         pdf = orig_pdf.dup
 
-        orig_text = IncomingMessage._get_attachment_text_internal_one_file('application/pdf', pdf)
+        orig_text = MailHandler._get_attachment_text_internal_one_file('application/pdf', pdf)
         orig_text.should match(/foi@tfl.gov.uk/)
 
         @im.binary_mask_stuff!(pdf, "application/pdf")
 
-        masked_text = IncomingMessage._get_attachment_text_internal_one_file('application/pdf', pdf)
+        masked_text = MailHandler._get_attachment_text_internal_one_file('application/pdf', pdf)
         masked_text.should_not match(/foi@tfl.gov.uk/)
         masked_text.should match(/xxx@xxx.xxx.xx/)
         config['USE_GHOSTSCRIPT_COMPRESSION'] = previous