Add rake tasks for checking the parsing of a random sample of emails

For the upgrade from Rails 2 to Rails 3, we want to check that existing emails are parsed correctly. One of these tasks (temp:random_attachments_hexdigests) is for dumping a CSV file of details about each attachment, including its hexdigest, and a list of the randomly selected raw emails. (It's intended that you run this on an old Rails 2 install of Alaveteli.) The other task (temp:recompute_attachments_hexdigests) is intended to be run on a Rails 3 install of Alaveteli, with the previous files as input, to check that the same results are obtained.
author: Mark Longair <mhl@pobox.com> 2013-04-26 14:07:00 +0100
committer: Mark Longair <mhl@pobox.com> 2013-05-16 09:06:27 +0100
commit: cf7d6e92ca2daa1a96517a7b62fb6446679ad898 (patch)
tree: 3a61475d99ee4107980152455eef1b77e84da848
parent: 16ca4662692125290f8df336d71dc098af4b937b (diff)
1 files changed, 150 insertions, 0 deletions
diff --git a/lib/tasks/temp.rake b/lib/tasks/temp.rake
index e49a84ecb..f0085b5e1 100644
--- a/lib/tasks/temp.rake
+++ b/lib/tasks/temp.rake
@@ -50,4 +50,154 @@ namespace :temp do
         end
     end
 
+    desc 'Create a CSV file of a random selection of raw emails, for comparing hexdigests'
+    task :random_attachments_hexdigests => :environment do
+
+        # The idea is to run this under the Rail 2 codebase, where
+        # Tmail was used to extract the attachements, and the task
+        # will output all of those file paths in a CSV file, and a
+        # list of the raw email files in another.  The latter file is
+        # useful so that one can easily tar up the emails with:
+        #
+        #   tar cvz -T raw-email-files -f raw_emails.tar.gz
+        #
+        # Then you can switch to the Rails 3 codebase, where
+        # attachment parsing is done via
+        # recompute_attachments_hexdigests
+
+        require 'csv'
+
+        File.open('raw-email-files', 'w') do |f|
+            CSV.open('attachment-hexdigests.csv', 'w') do |csv|
+                csv << ['filepath', 'i', 'url_part_number', 'hexdigest']
+                IncomingMessage.all(:order => 'RANDOM()', :limit => 1000).each do |incoming_message|
+                    # raw_email.filepath fails unless the
+                    # incoming_message has an associated request
+                    next unless incoming_message.info_request
+                    raw_email = incoming_message.raw_email
+                    f.puts raw_email.filepath
+                    incoming_message.foi_attachments.each_with_index do |attachment, i|
+                        csv << [raw_email.filepath, i, attachment.url_part_number, attachment.hexdigest]
+                    end
+                end
+            end
+        end
+
+    end
+
+
+    desc 'Check the hexdigests of attachments in emails on disk'
+    task :recompute_attachments_hexdigests => :environment do
+
+        require 'csv'
+        require 'digest/md5'
+
+        OldAttachment = Struct.new :filename, :attachment_index, :url_part_number, :hexdigest
+
+        filename_to_attachments = Hash.new {|h,k| h[k] = []}
+
+        header_line = true
+        CSV.foreach('attachment-hexdigests.csv') do |filename, attachment_index, url_part_number, hexdigest|
+            if header_line
+                header_line = false
+            else
+                filename_to_attachments[filename].push OldAttachment.new filename, attachment_index, url_part_number, hexdigest
+            end
+        end
+
+        total_attachments = 0
+        attachments_with_different_hexdigest = 0
+        files_with_different_numbers_of_attachments = 0
+        no_tnef_attachments = 0
+        no_parts_in_multipart = 0
+
+        multipart_error = "no parts on multipart mail"
+        tnef_error = "tnef produced no attachments"
+
+        # Now check each file:
+        filename_to_attachments.each do |filename, old_attachments|
+
+            # Currently it doesn't seem to be possible to reuse the
+            # attachment parsing code in Alaveteli without saving
+            # objects to the database, so reproduce what it does:
+
+            raw_email = nil
+            File.open(filename) do |f|
+                raw_email = f.read
+            end
+            mail = MailHandler.mail_from_raw_email(raw_email)
+
+            begin
+                attachment_attributes = MailHandler.get_attachment_attributes(mail)
+            rescue IOError => e
+                if e.message == tnef_error
+                    puts "#{filename} #{tnef_error}"
+                    no_tnef_attachments += 1
+                    next
+                else
+                    raise
+                end
+            rescue Exception => e
+                if e.message == multipart_error
+                    puts "#{filename} #{multipart_error}"
+                    no_parts_in_multipart += 1
+                    next
+                else
+                    raise
+                end
+            end
+
+            if attachment_attributes.length != old_attachments.length
+                puts "#{filename} the number of old attachments #{old_attachments.length} didn't match the number of new attachments #{attachment_attributes.length}"
+                files_with_different_numbers_of_attachments += 1
+            else
+                old_attachments.each_with_index do |old_attachment, i|
+                    total_attachments += 1
+                    attrs = attachment_attributes[i]
+                    old_hexdigest = old_attachment.hexdigest
+                    new_hexdigest = attrs[:hexdigest]
+                    new_content_type = attrs[:content_type]
+                    old_url_part_number = old_attachment.url_part_number.to_i
+                    new_url_part_number = attrs[:url_part_number]
+                    if old_url_part_number != new_url_part_number
+                        puts "#{i} #{filename} old_url_part_number #{old_url_part_number}, new_url_part_number #{new_url_part_number}"
+                    end
+                    if old_hexdigest != new_hexdigest
+                        body = attrs[:body]
+                        # First, if the content type is one of
+                        # text/plain, text/html or application/rtf try
+                        # changing CRLF to LF and calculating a new
+                        # digest - we generally don't worry about
+                        # these changes:
+                        new_converted_hexdigest = nil
+                        if ["text/plain", "text/html", "application/rtf"].include? new_content_type
+                            converted_body = body.gsub /\r\n/, "\n"
+                            new_converted_hexdigest = Digest::MD5.hexdigest converted_body
+                            puts "new_converted_hexdigest is #{new_converted_hexdigest}"
+                        end
+                        if (! new_converted_hexdigest) || (old_hexdigest != new_converted_hexdigest)
+                            puts "#{i} #{filename} old_hexdigest #{old_hexdigest} wasn't the same as new_hexdigest #{new_hexdigest}"
+                            puts "  body was of length #{body.length}"
+                            puts "  content type was: #{new_content_type}"
+                            path = "/tmp/#{new_hexdigest}"
+                            f = File.new path, "w"
+                            f.write body
+                            f.close
+                            puts "  wrote body to #{path}"
+                            attachments_with_different_hexdigest += 1
+                        end
+                    end
+                end
+            end
+
+        end
+
+        puts "total_attachments: #{total_attachments}"
+        puts "attachments_with_different_hexdigest: #{attachments_with_different_hexdigest}"
+        puts "files_with_different_numbers_of_attachments: #{files_with_different_numbers_of_attachments}"
+        puts "no_tnef_attachments: #{no_tnef_attachments}"
+        puts "no_parts_in_multipart: #{no_parts_in_multipart}"
+
+    end
+
 end
author	Mark Longair <mhl@pobox.com>	2013-04-26 14:07:00 +0100
committer	Mark Longair <mhl@pobox.com>	2013-05-16 09:06:27 +0100
commit	cf7d6e92ca2daa1a96517a7b62fb6446679ad898 (patch)
tree	3a61475d99ee4107980152455eef1b77e84da848
parent	16ca4662692125290f8df336d71dc098af4b937b (diff)