4 files changed, 303 insertions, 12 deletions
diff --git a/lib/mail_handler/backends/mail_backend.rb b/lib/mail_handler/backends/mail_backend.rb
index f7893a60d..03d78e0a3 100644
--- a/lib/mail_handler/backends/mail_backend.rb
+++ b/lib/mail_handler/backends/mail_backend.rb
@@ -1,4 +1,35 @@
 require 'mail'
+require 'mapi/msg'
+require 'mapi/convert'
+
+module Mail
+    class Message
+
+        # The behaviour of the 'to' and 'cc' methods has changed
+        # between TMail and Mail; this monkey-patching restores the
+        # TMail behaviour.  The key difference is that when there's an
+        # invalid address, e.g. '<foo@example.org', Mail returns the
+        # string as an ActiveSupport::Multibyte::Chars, whereas
+        # previously TMail would return nil.
+
+        alias_method :old_to, :to # Mail's own 'to' stays reachable as old_to
+        alias_method :old_cc, :cc # Mail's own 'cc' stays reachable as old_cc
+        # Call the aliased method; any result that is not an AddressContainer becomes nil.
+        def clean_addresses(old_method, val)
+            old_result = self.send(old_method, val)
+            old_result.class == Mail::AddressContainer ? old_result : nil
+        end
+        # Replacement 'to': nil unless the original returned a Mail::AddressContainer.
+        def to(val = nil)
+            self.clean_addresses :old_to, val
+        end
+        # Replacement 'cc': nil unless the original returned a Mail::AddressContainer.
+        def cc(val = nil)
+            self.clean_addresses :old_cc, val
+        end
+
+    end
+end
 
 module MailHandler
     module Backends
@@ -38,7 +69,11 @@ module MailHandler
 
             # Get the body of a mail part
             def get_part_body(part)
-                part.body.decoded
+                decoded = part.body.decoded
+                if part.content_type =~ /^text\//
+                    decoded = convert_string_to_utf8_or_binary decoded, part.charset
+                end
+                decoded
             end
 
             # Return the first from field if any
@@ -141,9 +176,14 @@ module MailHandler
                     end
                 elsif get_content_type(part) == 'application/ms-tnef'
                     # A set of attachments in a TNEF file
-                    part.rfc822_attachment = mail_from_tnef(part.body.decoded)
-                    if part.rfc822_attachment.nil?
-                        # Attached mail didn't parse, so treat as binary
+                    begin
+                        part.rfc822_attachment = mail_from_tnef(part.body.decoded)
+                        if part.rfc822_attachment.nil?
+                            # Attached mail didn't parse, so treat as binary
+                            part.content_type = 'application/octet-stream'
+                        end
+                    rescue TNEFParsingError
+                        part.rfc822_attachment = nil
                         part.content_type = 'application/octet-stream'
                     end
                 end
@@ -160,8 +200,11 @@ module MailHandler
                   part.parts.each{ |sub_part| expand_and_normalize_parts(sub_part, parent_mail) }
                 else
                   part_filename = get_part_file_name(part)
-                  charset = part.charset # save this, because overwriting content_type also resets charset
-
+                  if part.has_charset?
+                      original_charset = part.charset # save this, because overwriting content_type also resets charset
+                  else
+                      original_charset = nil
+                  end
                   # Don't allow nil content_types
                   if get_content_type(part).nil?
                       part.content_type = 'application/octet-stream'
@@ -180,7 +223,9 @@ module MailHandler
                   # Use standard content types for Word documents etc.
                   part.content_type = normalise_content_type(get_content_type(part))
                   decode_attached_part(part, parent_mail)
-                  part.charset = charset
+                  if original_charset
+                      part.charset = original_charset
+                  end
                 end
             end
 
@@ -228,8 +273,15 @@ module MailHandler
             def _get_attachment_leaves_recursive(part, within_rfc822_attachment, parent_mail)
                 leaves_found = []
                 if part.multipart?
-                    raise "no parts on multipart mail" if part.parts.size == 0
-                    if part.sub_type == 'alternative'
+                    if part.parts.size == 0
+                        # This is typically caused by a missing final
+                        # MIME boundary, in which case the text of the
+                        # message (including the opening MIME
+                        # boundary) is in part.body, so just add this
+                        # part as a leaf and treat it as text/plain:
+                        part.content_type = "text/plain"
+                        leaves_found += [part]
+                    elsif part.sub_type == 'alternative'
                         best_part = choose_best_alternative(part)
                         leaves_found += _get_attachment_leaves_recursive(best_part,
                                                                          within_rfc822_attachment,
diff --git a/lib/mail_handler/mail_handler.rb b/lib/mail_handler/mail_handler.rb
index 22ba26b97..9c955cccd 100644
--- a/lib/mail_handler/mail_handler.rb
+++ b/lib/mail_handler/mail_handler.rb
@@ -8,20 +8,23 @@ module MailHandler
     require 'backends/mail_backend'
     include Backends::MailBackend
 
+    class TNEFParsingError < StandardError # raised by tnef_attachments on a non-zero tnef exit or when no attachments result
+    end
+
     # Returns a set of attachments from the given TNEF contents
     # The TNEF contents also contains the message body, but in general this is the
     # same as the message body in the message proper.
     def tnef_attachments(content)
         attachments = []
         Dir.mktmpdir do |dir|
-            IO.popen("#{`which tnef`.chomp} -K -C #{dir}", "wb") do |f|
+            IO.popen("tnef -K -C #{dir} 2> /dev/null", "wb") do |f|
                 f.write(content)
                 f.close
                 if $?.signaled?
                     raise IOError, "tnef exited with signal #{$?.termsig}"
                 end
                 if $?.exited? && $?.exitstatus != 0
-                    raise IOError, "tnef exited with status #{$?.exitstatus}"
+                    raise TNEFParsingError, "tnef exited with status #{$?.exitstatus}"
                 end
             end
             found = 0
@@ -34,7 +37,7 @@ module MailHandler
                 end
             end
             if found == 0
-                raise IOError, "tnef produced no attachments"
+                raise TNEFParsingError, "tnef produced no attachments"
             end
         end
         attachments
diff --git a/lib/normalize_string.rb b/lib/normalize_string.rb
new file mode 100644
index 000000000..f02b18ee0
--- /dev/null
+++ b/lib/normalize_string.rb
@@ -0,0 +1,86 @@
+require 'iconv' unless RUBY_VERSION.to_f >= 1.9
+require 'charlock_holmes'
+
+class EncodingNormalizationError < StandardError # raised by normalize_string_to_utf8 when no candidate encoding works
+end
+
+def normalize_string_to_utf8(s, suggested_character_encoding=nil)
+    # Returns a UTF-8 copy of s, trying candidate source encodings in
+    # turn; raises EncodingNormalizationError if none of them fits.
+    # NB: on Ruby >= 1.9 this re-tags s itself via force_encoding.
+    to_try = []
+
+    guessed_encoding = CharlockHolmes::EncodingDetector.detect(s)[:encoding]
+    guessed_encoding ||= ''
+
+    # It's reasonably common for windows-1252 text to be mislabelled
+    # as ISO-8859-1, so try that first if charlock_holmes guessed
+    # that.  However, it can also easily misidentify UTF-8 strings as
+    # ISO-8859-1 so we don't want to go with the guess by default...
+    to_try.push guessed_encoding if guessed_encoding.downcase == 'windows-1252'
+
+    to_try.push suggested_character_encoding if suggested_character_encoding
+    to_try.push 'UTF-8'
+    to_try.push guessed_encoding
+
+    to_try.each do |from_encoding|
+        if RUBY_VERSION.to_f >= 1.9
+            begin
+                s.force_encoding from_encoding
+                return s.encode('UTF-8') if s.valid_encoding?
+            rescue ArgumentError, EncodingError
+                # ArgumentError: from_encoding is not an encoding
+                # name Ruby knows.  EncodingError: encode('UTF-8')
+                # could not convert the bytes (e.g. a byte that is
+                # undefined in from_encoding); move onto the next one...
+            end
+        else
+            begin
+                return Iconv.conv('UTF-8', from_encoding, s)
+            rescue Iconv::Failure
+                # We get this if there are invalid bytes when
+                # interpreted as from_encoding at the point of
+                # the Iconv.conv; move onto the next one...
+            end
+        end
+    end
+    raise EncodingNormalizationError, "Couldn't find a valid character encoding for the string"
+
+end
+
+def convert_string_to_utf8_or_binary(s, suggested_character_encoding=nil)
+    # This function exists to help to keep consistent with the
+    # behaviour of earlier versions of Alaveteli: in the code as it
+    # is, there are situations where it's expected that we generally
+    # have a UTF-8 encoded string, but if the source data was
+    # uninterpretable under any character encoding, the string may be
+    # binary data (i.e. invalid UTF-8).  Such a string would then be
+    # mangled into valid UTF-8 by _sanitize_text for the purposes of
+    # display.
+
+    # This seems unsatisfactory to me - two better alternatives would
+    # be either: (a) to mangle the data into valid UTF-8 in this
+    # method or (b) to treat the 'text/*' attachment as
+    # 'application/octet-stream' instead.  However, for the purposes
+    # of the transition to Ruby 1.9 and/or Rails 3 we just want the
+    # behaviour to be as similar as possible.
+    # On failure, s itself is returned, tagged as binary on Ruby >= 1.9:
+    begin
+        result = normalize_string_to_utf8 s, suggested_character_encoding
+    rescue EncodingNormalizationError
+        result = s
+        s.force_encoding 'ASCII-8BIT' if RUBY_VERSION.to_f >= 1.9
+    end
+    result
+end
+
+def log_text_details(message, text)
+    # Debugging aid: describes text on STDERR and dumps it to a file in /var/tmp.
+    require 'digest/md5' # loaded here because this file has no top-level require for it
+    details = "#{message}, we have text: #{text}, of class #{text.class}"
+    details += " and encoding #{text.encoding}" if RUBY_VERSION.to_f >= 1.9
+    STDERR.puts details
+    filename = "/var/tmp/#{Digest::MD5.hexdigest(text)}.txt"
+    File.open(filename, "wb") { |f| f.write text }
+    STDERR.puts "#{message}, the filename is: #{filename}"
+end
diff --git a/lib/tasks/temp.rake b/lib/tasks/temp.rake
index e49a84ecb..f0085b5e1 100644
--- a/lib/tasks/temp.rake
+++ b/lib/tasks/temp.rake
@@ -50,4 +50,154 @@ namespace :temp do
         end
     end
 
+    desc 'Create a CSV file of a random selection of raw emails, for comparing hexdigests'
+    task :random_attachments_hexdigests => :environment do
+
+        # The idea is to run this under the Rails 2 codebase, where
+        # TMail was used to extract the attachments, and the task
+        # will output all of those file paths in a CSV file, and a
+        # list of the raw email files in another.  The latter file is
+        # useful so that one can easily tar up the emails with:
+        #
+        #   tar cvz -T raw-email-files -f raw_emails.tar.gz
+        #
+        # Then you can switch to the Rails 3 codebase, where
+        # attachment parsing is done via
+        # recompute_attachments_hexdigests
+
+        require 'csv'
+        # Both output files are written relative to the current working directory:
+        File.open('raw-email-files', 'w') do |f|
+            CSV.open('attachment-hexdigests.csv', 'w') do |csv|
+                csv << ['filepath', 'i', 'url_part_number', 'hexdigest']
+                IncomingMessage.all(:order => 'RANDOM()', :limit => 1000).each do |incoming_message|
+                    # raw_email.filepath fails unless the
+                    # incoming_message has an associated request
+                    next unless incoming_message.info_request
+                    raw_email = incoming_message.raw_email
+                    f.puts raw_email.filepath
+                    incoming_message.foi_attachments.each_with_index do |attachment, i|
+                        csv << [raw_email.filepath, i, attachment.url_part_number, attachment.hexdigest]
+                    end
+                end
+            end
+        end
+
+    end
+
+
+    desc 'Check the hexdigests of attachments in emails on disk'
+    task :recompute_attachments_hexdigests => :environment do
+        # Re-parse every email listed in attachment-hexdigests.csv and report differences.
+        require 'csv'
+        require 'digest/md5'
+
+        OldAttachment = Struct.new :filename, :attachment_index, :url_part_number, :hexdigest
+
+        filename_to_attachments = Hash.new {|h,k| h[k] = []}
+
+        header_line = true
+        CSV.foreach('attachment-hexdigests.csv') do |filename, attachment_index, url_part_number, hexdigest|
+            if header_line
+                header_line = false
+            else
+                filename_to_attachments[filename].push OldAttachment.new filename, attachment_index, url_part_number, hexdigest
+            end
+        end
+
+        total_attachments = 0
+        attachments_with_different_hexdigest = 0
+        files_with_different_numbers_of_attachments = 0
+        no_tnef_attachments = 0
+        no_parts_in_multipart = 0
+
+        multipart_error = "no parts on multipart mail"
+        tnef_error = "tnef produced no attachments"
+
+        # Now check each file:
+        filename_to_attachments.each do |filename, old_attachments|
+
+            # Currently it doesn't seem to be possible to reuse the
+            # attachment parsing code in Alaveteli without saving
+            # objects to the database, so reproduce what it does:
+
+            raw_email = nil
+            File.open(filename) do |f|
+                raw_email = f.read
+            end
+            mail = MailHandler.mail_from_raw_email(raw_email)
+            # Known parse failures are counted and skipped; anything else is re-raised.
+            begin
+                attachment_attributes = MailHandler.get_attachment_attributes(mail)
+            rescue IOError => e
+                if e.message == tnef_error
+                    puts "#{filename} #{tnef_error}"
+                    no_tnef_attachments += 1
+                    next
+                else
+                    raise
+                end
+            rescue StandardError => e
+                if e.message == multipart_error
+                    puts "#{filename} #{multipart_error}"
+                    no_parts_in_multipart += 1
+                    next
+                else
+                    raise
+                end
+            end
+
+            if attachment_attributes.length != old_attachments.length
+                puts "#{filename} the number of old attachments #{old_attachments.length} didn't match the number of new attachments #{attachment_attributes.length}"
+                files_with_different_numbers_of_attachments += 1
+            else
+                old_attachments.each_with_index do |old_attachment, i|
+                    total_attachments += 1
+                    attrs = attachment_attributes[i]
+                    old_hexdigest = old_attachment.hexdigest
+                    new_hexdigest = attrs[:hexdigest]
+                    new_content_type = attrs[:content_type]
+                    old_url_part_number = old_attachment.url_part_number.to_i
+                    new_url_part_number = attrs[:url_part_number]
+                    if old_url_part_number != new_url_part_number
+                        puts "#{i} #{filename} old_url_part_number #{old_url_part_number}, new_url_part_number #{new_url_part_number}"
+                    end
+                    if old_hexdigest != new_hexdigest
+                        body = attrs[:body]
+                        # First, if the content type is one of
+                        # text/plain, text/html or application/rtf try
+                        # changing CRLF to LF and calculating a new
+                        # digest - we generally don't worry about
+                        # these changes:
+                        new_converted_hexdigest = nil
+                        if ["text/plain", "text/html", "application/rtf"].include? new_content_type
+                            converted_body = body.gsub(/\r\n/, "\n")
+                            new_converted_hexdigest = Digest::MD5.hexdigest converted_body
+                            puts "new_converted_hexdigest is #{new_converted_hexdigest}"
+                        end
+                        if (! new_converted_hexdigest) || (old_hexdigest != new_converted_hexdigest)
+                            puts "#{i} #{filename} old_hexdigest #{old_hexdigest} wasn't the same as new_hexdigest #{new_hexdigest}"
+                            puts "  body was of length #{body.length}"
+                            puts "  content type was: #{new_content_type}"
+                            path = "/tmp/#{new_hexdigest}"
+                            File.open(path, "w") do |f| # block form closes the file even if the write raises
+                                f.write body
+                            end
+                            puts "  wrote body to #{path}"
+                            attachments_with_different_hexdigest += 1
+                        end
+                    end
+                end
+            end
+
+        end
+
+        puts "total_attachments: #{total_attachments}"
+        puts "attachments_with_different_hexdigest: #{attachments_with_different_hexdigest}"
+        puts "files_with_different_numbers_of_attachments: #{files_with_different_numbers_of_attachments}"
+        puts "no_tnef_attachments: #{no_tnef_attachments}"
+        puts "no_parts_in_multipart: #{no_parts_in_multipart}"
+
+    end
+
 end