aboutsummaryrefslogtreecommitdiffstats
path: root/lib/tasks
diff options
context:
space:
mode:
authorMark Longair <mhl@pobox.com>2013-04-26 14:07:00 +0100
committerMark Longair <mhl@pobox.com>2013-05-16 09:06:27 +0100
commitcf7d6e92ca2daa1a96517a7b62fb6446679ad898 (patch)
tree3a61475d99ee4107980152455eef1b77e84da848 /lib/tasks
parent16ca4662692125290f8df336d71dc098af4b937b (diff)
Add rake tasks for checking the parsing of a random sample of emails
For the upgrade from Rails 2 to Rails 3, we want to check that existing emails are parsed correctly. One of these tasks (temp:random_attachments_hexdigests) is for dumping a CVS file of details about each attachment, and its hexdigest, and a list of the randomly selected raw emails. (It's intended that you run this on an old Rails 2 install of Alaveteli. The other task (temp:recompute_attachments_hexdigests) is intended to be run on a Rails 3 install of Alaveteli, with the previous files as input, to check that the same results are obtained.
Diffstat (limited to 'lib/tasks')
-rw-r--r--lib/tasks/temp.rake150
1 files changed, 150 insertions, 0 deletions
diff --git a/lib/tasks/temp.rake b/lib/tasks/temp.rake
index e49a84ecb..f0085b5e1 100644
--- a/lib/tasks/temp.rake
+++ b/lib/tasks/temp.rake
@@ -50,4 +50,154 @@ namespace :temp do
end
end
+ desc 'Create a CSV file of a random selection of raw emails, for comparing hexdigests'
+ task :random_attachments_hexdigests => :environment do
+
+ # The idea is to run this under the Rail 2 codebase, where
+ # Tmail was used to extract the attachements, and the task
+ # will output all of those file paths in a CSV file, and a
+ # list of the raw email files in another. The latter file is
+ # useful so that one can easily tar up the emails with:
+ #
+ # tar cvz -T raw-email-files -f raw_emails.tar.gz
+ #
+ # Then you can switch to the Rails 3 codebase, where
+ # attachment parsing is done via
+ # recompute_attachments_hexdigests
+
+ require 'csv'
+
+ File.open('raw-email-files', 'w') do |f|
+ CSV.open('attachment-hexdigests.csv', 'w') do |csv|
+ csv << ['filepath', 'i', 'url_part_number', 'hexdigest']
+ IncomingMessage.all(:order => 'RANDOM()', :limit => 1000).each do |incoming_message|
+ # raw_email.filepath fails unless the
+ # incoming_message has an associated request
+ next unless incoming_message.info_request
+ raw_email = incoming_message.raw_email
+ f.puts raw_email.filepath
+ incoming_message.foi_attachments.each_with_index do |attachment, i|
+ csv << [raw_email.filepath, i, attachment.url_part_number, attachment.hexdigest]
+ end
+ end
+ end
+ end
+
+ end
+
+
+ desc 'Check the hexdigests of attachments in emails on disk'
+ task :recompute_attachments_hexdigests => :environment do
+
+ require 'csv'
+ require 'digest/md5'
+
+ OldAttachment = Struct.new :filename, :attachment_index, :url_part_number, :hexdigest
+
+ filename_to_attachments = Hash.new {|h,k| h[k] = []}
+
+ header_line = true
+ CSV.foreach('attachment-hexdigests.csv') do |filename, attachment_index, url_part_number, hexdigest|
+ if header_line
+ header_line = false
+ else
+ filename_to_attachments[filename].push OldAttachment.new filename, attachment_index, url_part_number, hexdigest
+ end
+ end
+
+ total_attachments = 0
+ attachments_with_different_hexdigest = 0
+ files_with_different_numbers_of_attachments = 0
+ no_tnef_attachments = 0
+ no_parts_in_multipart = 0
+
+ multipart_error = "no parts on multipart mail"
+ tnef_error = "tnef produced no attachments"
+
+ # Now check each file:
+ filename_to_attachments.each do |filename, old_attachments|
+
+ # Currently it doesn't seem to be possible to reuse the
+ # attachment parsing code in Alaveteli without saving
+ # objects to the database, so reproduce what it does:
+
+ raw_email = nil
+ File.open(filename) do |f|
+ raw_email = f.read
+ end
+ mail = MailHandler.mail_from_raw_email(raw_email)
+
+ begin
+ attachment_attributes = MailHandler.get_attachment_attributes(mail)
+ rescue IOError => e
+ if e.message == tnef_error
+ puts "#{filename} #{tnef_error}"
+ no_tnef_attachments += 1
+ next
+ else
+ raise
+ end
+ rescue Exception => e
+ if e.message == multipart_error
+ puts "#{filename} #{multipart_error}"
+ no_parts_in_multipart += 1
+ next
+ else
+ raise
+ end
+ end
+
+ if attachment_attributes.length != old_attachments.length
+ puts "#{filename} the number of old attachments #{old_attachments.length} didn't match the number of new attachments #{attachment_attributes.length}"
+ files_with_different_numbers_of_attachments += 1
+ else
+ old_attachments.each_with_index do |old_attachment, i|
+ total_attachments += 1
+ attrs = attachment_attributes[i]
+ old_hexdigest = old_attachment.hexdigest
+ new_hexdigest = attrs[:hexdigest]
+ new_content_type = attrs[:content_type]
+ old_url_part_number = old_attachment.url_part_number.to_i
+ new_url_part_number = attrs[:url_part_number]
+ if old_url_part_number != new_url_part_number
+ puts "#{i} #{filename} old_url_part_number #{old_url_part_number}, new_url_part_number #{new_url_part_number}"
+ end
+ if old_hexdigest != new_hexdigest
+ body = attrs[:body]
+ # First, if the content type is one of
+ # text/plain, text/html or application/rtf try
+ # changing CRLF to LF and calculating a new
+ # digest - we generally don't worry about
+ # these changes:
+ new_converted_hexdigest = nil
+ if ["text/plain", "text/html", "application/rtf"].include? new_content_type
+ converted_body = body.gsub /\r\n/, "\n"
+ new_converted_hexdigest = Digest::MD5.hexdigest converted_body
+ puts "new_converted_hexdigest is #{new_converted_hexdigest}"
+ end
+ if (! new_converted_hexdigest) || (old_hexdigest != new_converted_hexdigest)
+ puts "#{i} #{filename} old_hexdigest #{old_hexdigest} wasn't the same as new_hexdigest #{new_hexdigest}"
+ puts " body was of length #{body.length}"
+ puts " content type was: #{new_content_type}"
+ path = "/tmp/#{new_hexdigest}"
+ f = File.new path, "w"
+ f.write body
+ f.close
+ puts " wrote body to #{path}"
+ attachments_with_different_hexdigest += 1
+ end
+ end
+ end
+ end
+
+ end
+
+ puts "total_attachments: #{total_attachments}"
+ puts "attachments_with_different_hexdigest: #{attachments_with_different_hexdigest}"
+ puts "files_with_different_numbers_of_attachments: #{files_with_different_numbers_of_attachments}"
+ puts "no_tnef_attachments: #{no_tnef_attachments}"
+ puts "no_parts_in_multipart: #{no_parts_in_multipart}"
+
+ end
+
end