aboutsummaryrefslogtreecommitdiffstats
path: root/lib
diff options
context:
space:
mode:
Diffstat (limited to 'lib')
-rw-r--r-- lib/mail_handler/backends/mail_backend.rb 70
-rw-r--r-- lib/mail_handler/mail_handler.rb 9
-rw-r--r-- lib/normalize_string.rb 86
-rw-r--r-- lib/tasks/temp.rake 150
4 files changed, 303 insertions, 12 deletions
diff --git a/lib/mail_handler/backends/mail_backend.rb b/lib/mail_handler/backends/mail_backend.rb
index f7893a60d..03d78e0a3 100644
--- a/lib/mail_handler/backends/mail_backend.rb
+++ b/lib/mail_handler/backends/mail_backend.rb
@@ -1,4 +1,35 @@
require 'mail'
+require 'mapi/msg'
+require 'mapi/convert'
+
+module Mail
+ class Message
+
+ # The behaviour of the 'to' and 'cc' methods has changed
+ # between TMail and Mail; this monkey-patching restores the
+ # TMail behaviour. The key difference is that when there's an
+ # invalid address, e.g. '<foo@example.org', Mail returns the
+ # string as an ActiveSupport::Multibyte::Chars, whereas
+ # previously TMail would return nil.
+
+ alias_method :old_to, :to
+ alias_method :old_cc, :cc
+ # Calls the aliased accessor; any result that is not exactly a Mail::AddressContainer becomes nil.
+ def clean_addresses(old_method, val)
+ old_result = self.send(old_method, val)
+ old_result.class == Mail::AddressContainer ? old_result : nil
+ end
+ # Replacement for 'to': delegates to the original (kept as old_to) through clean_addresses.
+ def to(val = nil)
+ self.clean_addresses :old_to, val
+ end
+ # Replacement for 'cc': delegates to the original (kept as old_cc) through clean_addresses.
+ def cc(val = nil)
+ self.clean_addresses :old_cc, val
+ end
+
+ end
+end
module MailHandler
module Backends
@@ -38,7 +69,11 @@ module MailHandler
# Get the body of a mail part
def get_part_body(part)
- part.body.decoded
+ decoded = part.body.decoded
+ if part.content_type =~ /^text\//
+ decoded = convert_string_to_utf8_or_binary decoded, part.charset
+ end
+ decoded
end
# Return the first from field if any
@@ -141,9 +176,14 @@ module MailHandler
end
elsif get_content_type(part) == 'application/ms-tnef'
# A set of attachments in a TNEF file
- part.rfc822_attachment = mail_from_tnef(part.body.decoded)
- if part.rfc822_attachment.nil?
- # Attached mail didn't parse, so treat as binary
+ begin
+ part.rfc822_attachment = mail_from_tnef(part.body.decoded)
+ if part.rfc822_attachment.nil?
+ # Attached mail didn't parse, so treat as binary
+ part.content_type = 'application/octet-stream'
+ end
+ rescue TNEFParsingError
+ part.rfc822_attachment = nil
part.content_type = 'application/octet-stream'
end
end
@@ -160,8 +200,11 @@ module MailHandler
part.parts.each{ |sub_part| expand_and_normalize_parts(sub_part, parent_mail) }
else
part_filename = get_part_file_name(part)
- charset = part.charset # save this, because overwriting content_type also resets charset
-
+ if part.has_charset?
+ original_charset = part.charset # save this, because overwriting content_type also resets charset
+ else
+ original_charset = nil
+ end
# Don't allow nil content_types
if get_content_type(part).nil?
part.content_type = 'application/octet-stream'
@@ -180,7 +223,9 @@ module MailHandler
# Use standard content types for Word documents etc.
part.content_type = normalise_content_type(get_content_type(part))
decode_attached_part(part, parent_mail)
- part.charset = charset
+ if original_charset
+ part.charset = original_charset
+ end
end
end
@@ -228,8 +273,15 @@ module MailHandler
def _get_attachment_leaves_recursive(part, within_rfc822_attachment, parent_mail)
leaves_found = []
if part.multipart?
- raise "no parts on multipart mail" if part.parts.size == 0
- if part.sub_type == 'alternative'
+ if part.parts.size == 0
+ # This is typically caused by a missing final
+ # MIME boundary, in which case the text of the
+ # message (including the opening MIME
+ # boundary) is in part.body, so just add this
+ # part as a leaf and treat it as text/plain:
+ part.content_type = "text/plain"
+ leaves_found += [part]
+ elsif part.sub_type == 'alternative'
best_part = choose_best_alternative(part)
leaves_found += _get_attachment_leaves_recursive(best_part,
within_rfc822_attachment,
diff --git a/lib/mail_handler/mail_handler.rb b/lib/mail_handler/mail_handler.rb
index 22ba26b97..9c955cccd 100644
--- a/lib/mail_handler/mail_handler.rb
+++ b/lib/mail_handler/mail_handler.rb
@@ -8,20 +8,23 @@ module MailHandler
require 'backends/mail_backend'
include Backends::MailBackend
+ # Raised when the tnef tool exits with a non-zero status or produces no attachments.
+ class TNEFParsingError < StandardError; end
+
# Returns a set of attachments from the given TNEF contents
# The TNEF contents also contains the message body, but in general this is the
# same as the message body in the message proper.
def tnef_attachments(content)
attachments = []
Dir.mktmpdir do |dir|
- IO.popen("#{`which tnef`.chomp} -K -C #{dir}", "wb") do |f|
+ IO.popen("tnef -K -C #{dir} 2> /dev/null", "wb") do |f|
f.write(content)
f.close
if $?.signaled?
raise IOError, "tnef exited with signal #{$?.termsig}"
end
if $?.exited? && $?.exitstatus != 0
- raise IOError, "tnef exited with status #{$?.exitstatus}"
+ raise TNEFParsingError, "tnef exited with status #{$?.exitstatus}"
end
end
found = 0
@@ -34,7 +37,7 @@ module MailHandler
end
end
if found == 0
- raise IOError, "tnef produced no attachments"
+ raise TNEFParsingError, "tnef produced no attachments"
end
end
attachments
diff --git a/lib/normalize_string.rb b/lib/normalize_string.rb
new file mode 100644
index 000000000..f02b18ee0
--- /dev/null
+++ b/lib/normalize_string.rb
@@ -0,0 +1,86 @@
+require 'iconv' unless RUBY_VERSION.to_f >= 1.9
+require 'charlock_holmes'
+
+# Raised when no candidate character encoding could be applied to a string.
+class EncodingNormalizationError < StandardError; end
+
+def normalize_string_to_utf8(s, suggested_character_encoding=nil)
+
+ # Make a list of encodings to try:
+ to_try = []
+
+ guessed_encoding = CharlockHolmes::EncodingDetector.detect(s)[:encoding]
+ guessed_encoding ||= ''
+
+ # It's reasonably common for windows-1252 text to be mislabelled
+ # as ISO-8859-1, so try that first if charlock_holmes guessed
+ # that. However, it can also easily misidentify UTF-8 strings as
+ # ISO-8859-1 so we don't want to go with the guess by default...
+ to_try.push guessed_encoding if guessed_encoding.downcase == 'windows-1252'
+
+ to_try.push suggested_character_encoding if suggested_character_encoding
+ to_try.push 'UTF-8'
+ to_try.push guessed_encoding
+
+ to_try.each do |from_encoding|
+ if RUBY_VERSION.to_f >= 1.9
+ begin
+ s.force_encoding from_encoding
+ return s.encode('UTF-8') if s.valid_encoding?
+ rescue ArgumentError
+ # We get this is there are invalid bytes when
+ # interpreted as from_encoding at the point of
+ # the encode('UTF-8'); move onto the next one...
+ end
+ else
+ to_encoding = 'UTF-8'
+ begin
+ converted = Iconv.conv 'UTF-8', from_encoding, s
+ return converted
+ rescue Iconv::Failure
+ # We get this is there are invalid bytes when
+ # interpreted as from_encoding at the point of
+ # the Iconv.iconv; move onto the next one...
+ end
+ end
+ end
+ raise EncodingNormalizationError, "Couldn't find a valid character encoding for the string"
+
+end
+
+def convert_string_to_utf8_or_binary(s, suggested_character_encoding=nil)
+ # This function exists to help to keep consistent with the
+ # behaviour of earlier versions of Alaveteli: in the code as it
+ # is, there are situations where it's expected that we generally
+ # have a UTF-8 encoded string, but if the source data was
+ # unintepretable under any character encoding, the string may be
+ # binary data (i.e. invalid UTF-8). Such a string would then be
+ # mangled into valid UTF-8 by _sanitize_text for the purposes of
+ # display.
+
+ # This seems unsatisfactory to me - two better alternatives would
+ # be either: (a) to mangle the data into valid UTF-8 in this
+ # method or (b) to treat the 'text/*' attachment as
+ # 'application/octet-stream' instead. However, for the purposes
+ # of the transition to Ruby 1.9 and/or Rails 3 we just want the
+ # behaviour to be as similar as possible.
+
+ begin
+ result = normalize_string_to_utf8 s, suggested_character_encoding
+ rescue EncodingNormalizationError
+ result = s
+ s.force_encoding 'ASCII-8BIT' if RUBY_VERSION.to_f >= 1.9
+ end
+ result
+end
+
+def log_text_details(message, text)
+ if RUBY_VERSION.to_f >= 1.9
+ STDERR.puts "#{message}, we have text: #{text}, of class #{text.class} and encoding #{text.encoding}"
+ else
+ STDERR.puts "#{message}, we have text: #{text}, of class #{text.class}"
+ end
+ filename = "/var/tmp/#{Digest::MD5.hexdigest(text)}.txt"
+ File.open(filename, "wb") { |f| f.write text }
+ STDERR.puts "#{message}, the filename is: #{filename}"
+end
diff --git a/lib/tasks/temp.rake b/lib/tasks/temp.rake
index e49a84ecb..f0085b5e1 100644
--- a/lib/tasks/temp.rake
+++ b/lib/tasks/temp.rake
@@ -50,4 +50,154 @@ namespace :temp do
end
end
+ desc 'Create a CSV file of a random selection of raw emails, for comparing hexdigests'
+ task :random_attachments_hexdigests => :environment do
+
+ # The idea is to run this under the Rails 2 codebase, where
+ # TMail was used to extract the attachments, and the task
+ # will output all of those file paths in a CSV file, and a
+ # list of the raw email files in another. The latter file is
+ # useful so that one can easily tar up the emails with:
+ #
+ # tar cvz -T raw-email-files -f raw_emails.tar.gz
+ #
+ # Then you can switch to the Rails 3 codebase and compare
+ # the results of its attachment parsing by running the
+ # recompute_attachments_hexdigests task.
+
+ require 'csv'
+ # raw-email-files gets one path per message; the CSV gets one row per attachment:
+ File.open('raw-email-files', 'w') do |f|
+ CSV.open('attachment-hexdigests.csv', 'w') do |csv|
+ csv << ['filepath', 'i', 'url_part_number', 'hexdigest']
+ IncomingMessage.all(:order => 'RANDOM()', :limit => 1000).each do |incoming_message|
+ # raw_email.filepath fails unless the
+ # incoming_message has an associated request
+ next unless incoming_message.info_request
+ raw_email = incoming_message.raw_email
+ f.puts raw_email.filepath
+ incoming_message.foi_attachments.each_with_index do |attachment, i|
+ csv << [raw_email.filepath, i, attachment.url_part_number, attachment.hexdigest]
+ end
+ end
+ end
+ end
+
+ end
+
+
+ desc 'Check the hexdigests of attachments in emails on disk'
+ task :recompute_attachments_hexdigests => :environment do
+
+ require 'csv'
+ require 'digest/md5'
+
+ OldAttachment = Struct.new :filename, :attachment_index, :url_part_number, :hexdigest
+
+ filename_to_attachments = Hash.new {|h,k| h[k] = []}
+
+ header_line = true
+ CSV.foreach('attachment-hexdigests.csv') do |filename, attachment_index, url_part_number, hexdigest|
+ if header_line
+ header_line = false
+ else
+ filename_to_attachments[filename].push OldAttachment.new filename, attachment_index, url_part_number, hexdigest
+ end
+ end
+
+ total_attachments = 0
+ attachments_with_different_hexdigest = 0
+ files_with_different_numbers_of_attachments = 0
+ no_tnef_attachments = 0
+ no_parts_in_multipart = 0
+
+ multipart_error = "no parts on multipart mail"
+ tnef_error = "tnef produced no attachments"
+
+ # Now check each file:
+ filename_to_attachments.each do |filename, old_attachments|
+
+ # Currently it doesn't seem to be possible to reuse the
+ # attachment parsing code in Alaveteli without saving
+ # objects to the database, so reproduce what it does:
+
+ raw_email = nil
+ File.open(filename) do |f|
+ raw_email = f.read
+ end
+ mail = MailHandler.mail_from_raw_email(raw_email)
+
+ begin
+ attachment_attributes = MailHandler.get_attachment_attributes(mail)
+ rescue IOError => e
+ if e.message == tnef_error
+ puts "#{filename} #{tnef_error}"
+ no_tnef_attachments += 1
+ next
+ else
+ raise
+ end
+ rescue Exception => e
+ if e.message == multipart_error
+ puts "#{filename} #{multipart_error}"
+ no_parts_in_multipart += 1
+ next
+ else
+ raise
+ end
+ end
+
+ if attachment_attributes.length != old_attachments.length
+ puts "#{filename} the number of old attachments #{old_attachments.length} didn't match the number of new attachments #{attachment_attributes.length}"
+ files_with_different_numbers_of_attachments += 1
+ else
+ old_attachments.each_with_index do |old_attachment, i|
+ total_attachments += 1
+ attrs = attachment_attributes[i]
+ old_hexdigest = old_attachment.hexdigest
+ new_hexdigest = attrs[:hexdigest]
+ new_content_type = attrs[:content_type]
+ old_url_part_number = old_attachment.url_part_number.to_i
+ new_url_part_number = attrs[:url_part_number]
+ if old_url_part_number != new_url_part_number
+ puts "#{i} #{filename} old_url_part_number #{old_url_part_number}, new_url_part_number #{new_url_part_number}"
+ end
+ if old_hexdigest != new_hexdigest
+ body = attrs[:body]
+ # First, if the content type is one of
+ # text/plain, text/html or application/rtf try
+ # changing CRLF to LF and calculating a new
+ # digest - we generally don't worry about
+ # these changes:
+ new_converted_hexdigest = nil
+ if ["text/plain", "text/html", "application/rtf"].include? new_content_type
+ converted_body = body.gsub /\r\n/, "\n"
+ new_converted_hexdigest = Digest::MD5.hexdigest converted_body
+ puts "new_converted_hexdigest is #{new_converted_hexdigest}"
+ end
+ if (! new_converted_hexdigest) || (old_hexdigest != new_converted_hexdigest)
+ puts "#{i} #{filename} old_hexdigest #{old_hexdigest} wasn't the same as new_hexdigest #{new_hexdigest}"
+ puts " body was of length #{body.length}"
+ puts " content type was: #{new_content_type}"
+ path = "/tmp/#{new_hexdigest}"
+ f = File.new path, "w"
+ f.write body
+ f.close
+ puts " wrote body to #{path}"
+ attachments_with_different_hexdigest += 1
+ end
+ end
+ end
+ end
+
+ end
+
+ puts "total_attachments: #{total_attachments}"
+ puts "attachments_with_different_hexdigest: #{attachments_with_different_hexdigest}"
+ puts "files_with_different_numbers_of_attachments: #{files_with_different_numbers_of_attachments}"
+ puts "no_tnef_attachments: #{no_tnef_attachments}"
+ puts "no_parts_in_multipart: #{no_parts_in_multipart}"
+
+ end
+
end