diff options
Diffstat (limited to 'lib')
-rw-r--r-- | lib/mail_handler/backends/mail_backend.rb | 74 | ||||
-rw-r--r-- | lib/mail_handler/backends/mail_extensions.rb | 30 | ||||
-rw-r--r-- | lib/mail_handler/mail_handler.rb | 9 | ||||
-rw-r--r-- | lib/no_constraint_disabling.rb | 110 | ||||
-rw-r--r-- | lib/normalize_string.rb | 86 | ||||
-rw-r--r-- | lib/tasks/temp.rake | 150 | ||||
-rw-r--r-- | lib/tasks/translation.rake | 12 | ||||
-rw-r--r-- | lib/willpaginate_extension.rb | 59 |
8 files changed, 449 insertions, 81 deletions
diff --git a/lib/mail_handler/backends/mail_backend.rb b/lib/mail_handler/backends/mail_backend.rb index f7893a60d..561946980 100644 --- a/lib/mail_handler/backends/mail_backend.rb +++ b/lib/mail_handler/backends/mail_backend.rb @@ -1,4 +1,35 @@ require 'mail' +require 'mapi/msg' +require 'mapi/convert' + +module Mail + class Message + + # The behaviour of the 'to' and 'cc' methods have changed + # between TMail and Mail; this monkey-patching restores the + # TMail behaviour. The key difference is that when there's an + # invalid address, e.g. '<foo@example.org', Mail returns the + # string as an ActiveSupport::Multibyte::Chars, whereas + # previously TMail would return nil. + + alias_method :old_to, :to + alias_method :old_cc, :cc + + def clean_addresses(old_method, val) + old_result = self.send(old_method, val) + old_result.class == Mail::AddressContainer ? old_result : nil + end + + def to(val = nil) + self.clean_addresses :old_to, val + end + + def cc(val = nil) + self.clean_addresses :old_cc, val + end + + end +end module MailHandler module Backends @@ -38,7 +69,11 @@ module MailHandler # Get the body of a mail part def get_part_body(part) - part.body.decoded + decoded = part.body.decoded + if part.content_type =~ /^text\// + decoded = convert_string_to_utf8_or_binary decoded, part.charset + end + decoded end # Return the first from field if any @@ -141,9 +176,14 @@ module MailHandler end elsif get_content_type(part) == 'application/ms-tnef' # A set of attachments in a TNEF file - part.rfc822_attachment = mail_from_tnef(part.body.decoded) - if part.rfc822_attachment.nil? - # Attached mail didn't parse, so treat as binary + begin + part.rfc822_attachment = mail_from_tnef(part.body.decoded) + if part.rfc822_attachment.nil? + # Attached mail didn't parse, so treat as binary + part.content_type = 'application/octet-stream' + end + rescue TNEFParsingError + part.rfc822_attachment = nil part.content_type = 'application/octet-stream' end end @@ -160,8 +200,11 @@ module MailHandler part.parts.each{ |sub_part| expand_and_normalize_parts(sub_part, parent_mail) } else part_filename = get_part_file_name(part) - charset = part.charset # save this, because overwriting content_type also resets charset - + if part.has_charset? + original_charset = part.charset # save this, because overwriting content_type also resets charset + else + original_charset = nil + end # Don't allow nil content_types if get_content_type(part).nil? part.content_type = 'application/octet-stream' @@ -180,7 +223,9 @@ module MailHandler # Use standard content types for Word documents etc. part.content_type = normalise_content_type(get_content_type(part)) decode_attached_part(part, parent_mail) - part.charset = charset + if original_charset + part.charset = original_charset + end end end @@ -228,8 +273,15 @@ module MailHandler def _get_attachment_leaves_recursive(part, within_rfc822_attachment, parent_mail) leaves_found = [] if part.multipart? - raise "no parts on multipart mail" if part.parts.size == 0 - if part.sub_type == 'alternative' + if part.parts.size == 0 + # This is typically caused by a missing final + # MIME boundary, in which case the text of the + # message (including the opening MIME + # boundary) is in part.body, so just add this + # part as a leaf and treat it as text/plain: + part.content_type = "text/plain" + leaves_found += [part] + elsif part.sub_type == 'alternative' best_part = choose_best_alternative(part) leaves_found += _get_attachment_leaves_recursive(best_part, within_rfc822_attachment, @@ -315,7 +367,9 @@ module MailHandler end def address_from_string(string) - Mail::Address.new(string).address + mail = Mail.new + mail.from = string + mail.from[0] end end end diff --git a/lib/mail_handler/backends/mail_extensions.rb b/lib/mail_handler/backends/mail_extensions.rb index d25012e39..322c49bb5 100644 --- a/lib/mail_handler/backends/mail_extensions.rb +++ b/lib/mail_handler/backends/mail_extensions.rb @@ -73,7 +73,12 @@ module Mail if match encoding = match[1] str = Ruby18.decode_base64(match[2]) - str = Iconv.conv('UTF-8//IGNORE', fix_encoding(encoding), str) + # Adding and removing trailing spaces is a workaround + # for Iconv.conv throwing an exception if it finds an + # invalid character at the end of the string, even + # with UTF-8//IGNORE: + # http://po-ru.com/diary/fixing-invalid-utf-8-in-ruby-revisited/ + str = Iconv.conv('UTF-8//IGNORE', fix_encoding(encoding), str + " ")[0...-4] end str end @@ -86,7 +91,12 @@ module Mail # Remove trailing = if it exists in a Q encoding string = string.sub(/\=$/, '') str = Encodings::QuotedPrintable.decode(string) - str = Iconv.conv('UTF-8//IGNORE', fix_encoding(encoding), str) + # Adding and removing trailing spaces is a workaround + # for Iconv.conv throwing an exception if it finds an + # invalid character at the end of the string, even + # with UTF-8//IGNORE: + # http://po-ru.com/diary/fixing-invalid-utf-8-in-ruby-revisited/ + str = Iconv.conv('UTF-8//IGNORE', fix_encoding(encoding), str + " ")[0...-4] end str end @@ -102,4 +112,20 @@ module Mail end end end + class Ruby19 + + def Ruby19.q_value_decode(str) + match = str.match(/\=\?(.+)?\?[Qq]\?(.+)?\?\=/m) + if match + encoding = match[1] + str = Encodings::QuotedPrintable.decode(match[2].gsub(/_/, '=20')) + # Backport line from mail 2.5 to strip a trailing = character + # Remove trailing = if it exists in a Q encoding + str = str.sub(/\=$/, '') + str.force_encoding(fix_encoding(encoding)) + end + decoded = str.encode("utf-8", :invalid => :replace, :replace => "") + decoded.valid_encoding? ? decoded : decoded.encode("utf-16le", :invalid => :replace, :replace => "").encode("utf-8") + end + end end diff --git a/lib/mail_handler/mail_handler.rb b/lib/mail_handler/mail_handler.rb index 22ba26b97..9c955cccd 100644 --- a/lib/mail_handler/mail_handler.rb +++ b/lib/mail_handler/mail_handler.rb @@ -8,20 +8,23 @@ module MailHandler require 'backends/mail_backend' include Backends::MailBackend + class TNEFParsingError < StandardError + end + # Returns a set of attachments from the given TNEF contents # The TNEF contents also contains the message body, but in general this is the # same as the message body in the message proper. def tnef_attachments(content) attachments = [] Dir.mktmpdir do |dir| - IO.popen("#{`which tnef`.chomp} -K -C #{dir}", "wb") do |f| + IO.popen("tnef -K -C #{dir} 2> /dev/null", "wb") do |f| f.write(content) f.close if $?.signaled? raise IOError, "tnef exited with signal #{$?.termsig}" end if $?.exited? && $?.exitstatus != 0 - raise IOError, "tnef exited with status #{$?.exitstatus}" + raise TNEFParsingError, "tnef exited with status #{$?.exitstatus}" end end found = 0 @@ -34,7 +37,7 @@ module MailHandler end end if found == 0 - raise IOError, "tnef produced no attachments" + raise TNEFParsingError, "tnef produced no attachments" end end attachments diff --git a/lib/no_constraint_disabling.rb b/lib/no_constraint_disabling.rb new file mode 100644 index 000000000..d515a959a --- /dev/null +++ b/lib/no_constraint_disabling.rb @@ -0,0 +1,110 @@ +# In order to work around the problem of the database use not having +# the permission to disable referential integrity when loading fixtures, +# we redefine disable_referential_integrity so that it doesn't try to +# disable foreign key constraints, and redefine the +# ActiveRecord::Fixtures.create_fixtures method to pay attention to the order +# which fixture tables are passed so that foreign key constraints won't be +# violated. The only lines that are changed from the initial definition +# are those between the "***" comments +require 'active_record/fixtures' +require 'active_record/connection_adapters/postgresql_adapter' +module ActiveRecord + module ConnectionAdapters + class PostgreSQLAdapter < AbstractAdapter + def disable_referential_integrity(&block) + transaction { + yield + } + end + end + end +end + +module ActiveRecord + class Fixtures + + def self.create_fixtures(fixtures_directory, table_names, class_names = {}) + table_names = [table_names].flatten.map { |n| n.to_s } + table_names.each { |n| + class_names[n.tr('/', '_').to_sym] = n.classify if n.include?('/') + } + + # FIXME: Apparently JK uses this. + connection = block_given? ? yield : ActiveRecord::Base.connection + + files_to_read = table_names.reject { |table_name| + fixture_is_cached?(connection, table_name) + } + + unless files_to_read.empty? + connection.disable_referential_integrity do + fixtures_map = {} + + fixture_files = files_to_read.map do |path| + table_name = path.tr '/', '_' + + fixtures_map[path] = ActiveRecord::Fixtures.new( + connection, + table_name, + class_names[table_name.to_sym] || table_name.classify, + File.join(fixtures_directory, path)) + end + + all_loaded_fixtures.update(fixtures_map) + + connection.transaction(:requires_new => true) do + # Patch - replace this... + # *** + # fixture_files.each do |ff| + # conn = ff.model_class.respond_to?(:connection) ? ff.model_class.connection : connection + # table_rows = ff.table_rows + # + # table_rows.keys.each do |table| + # conn.delete "DELETE FROM #{conn.quote_table_name(table)}", 'Fixture Delete' + # end + # + # table_rows.each do |table_name,rows| + # rows.each do |row| + # conn.insert_fixture(row, table_name) + # end + # end + # end + # *** + # ... with this + fixture_files.reverse.each do |ff| + conn = ff.model_class.respond_to?(:connection) ? ff.model_class.connection : connection + table_rows = ff.table_rows + + table_rows.keys.each do |table| + conn.delete "DELETE FROM #{conn.quote_table_name(table)}", 'Fixture Delete' + end + end + + fixture_files.each do |ff| + conn = ff.model_class.respond_to?(:connection) ? ff.model_class.connection : connection + table_rows = ff.table_rows + table_rows.each do |table_name,rows| + rows.each do |row| + conn.insert_fixture(row, table_name) + end + end + end + # *** + + # Cap primary key sequences to max(pk). + if connection.respond_to?(:reset_pk_sequence!) + table_names.each do |table_name| + connection.reset_pk_sequence!(table_name.tr('/', '_')) + end + end + end + + cache_fixtures(connection, fixtures_map) + end + end + cached_fixtures(connection, table_names) + end + + end + +end diff --git a/lib/normalize_string.rb b/lib/normalize_string.rb new file mode 100644 index 000000000..f02b18ee0 --- /dev/null +++ b/lib/normalize_string.rb @@ -0,0 +1,86 @@ +require 'iconv' unless RUBY_VERSION.to_f >= 1.9 +require 'charlock_holmes' + +class EncodingNormalizationError < StandardError +end + +def normalize_string_to_utf8(s, suggested_character_encoding=nil) + + # Make a list of encodings to try: + to_try = [] + + guessed_encoding = CharlockHolmes::EncodingDetector.detect(s)[:encoding] + guessed_encoding ||= '' + + # It's reasonably common for windows-1252 text to be mislabelled + # as ISO-8859-1, so try that first if charlock_holmes guessed + # that. However, it can also easily misidentify UTF-8 strings as + # ISO-8859-1 so we don't want to go with the guess by default... + to_try.push guessed_encoding if guessed_encoding.downcase == 'windows-1252' + + to_try.push suggested_character_encoding if suggested_character_encoding + to_try.push 'UTF-8' + to_try.push guessed_encoding + + to_try.each do |from_encoding| + if RUBY_VERSION.to_f >= 1.9 + begin + s.force_encoding from_encoding + return s.encode('UTF-8') if s.valid_encoding? + rescue ArgumentError + # We get this is there are invalid bytes when + # interpreted as from_encoding at the point of + # the encode('UTF-8'); move onto the next one... + end + else + to_encoding = 'UTF-8' + begin + converted = Iconv.conv 'UTF-8', from_encoding, s + return converted + rescue Iconv::Failure + # We get this is there are invalid bytes when + # interpreted as from_encoding at the point of + # the Iconv.iconv; move onto the next one... + end + end + end + raise EncodingNormalizationError, "Couldn't find a valid character encoding for the string" + +end + +def convert_string_to_utf8_or_binary(s, suggested_character_encoding=nil) + # This function exists to help to keep consistent with the + # behaviour of earlier versions of Alaveteli: in the code as it + # is, there are situations where it's expected that we generally + # have a UTF-8 encoded string, but if the source data was + # unintepretable under any character encoding, the string may be + # binary data (i.e. invalid UTF-8). Such a string would then be + # mangled into valid UTF-8 by _sanitize_text for the purposes of + # display. + + # This seems unsatisfactory to me - two better alternatives would + # be either: (a) to mangle the data into valid UTF-8 in this + # method or (b) to treat the 'text/*' attachment as + # 'application/octet-stream' instead. However, for the purposes + # of the transition to Ruby 1.9 and/or Rails 3 we just want the + # behaviour to be as similar as possible. + + begin + result = normalize_string_to_utf8 s, suggested_character_encoding + rescue EncodingNormalizationError + result = s + s.force_encoding 'ASCII-8BIT' if RUBY_VERSION.to_f >= 1.9 + end + result +end + +def log_text_details(message, text) + if RUBY_VERSION.to_f >= 1.9 + STDERR.puts "#{message}, we have text: #{text}, of class #{text.class} and encoding #{text.encoding}" + else + STDERR.puts "#{message}, we have text: #{text}, of class #{text.class}" + end + filename = "/var/tmp/#{Digest::MD5.hexdigest(text)}.txt" + File.open(filename, "wb") { |f| f.write text } + STDERR.puts "#{message}, the filename is: #{filename}" +end diff --git a/lib/tasks/temp.rake b/lib/tasks/temp.rake index e49a84ecb..f0085b5e1 100644 --- a/lib/tasks/temp.rake +++ b/lib/tasks/temp.rake @@ -50,4 +50,154 @@ namespace :temp do end end + desc 'Create a CSV file of a random selection of raw emails, for comparing hexdigests' + task :random_attachments_hexdigests => :environment do + + # The idea is to run this under the Rail 2 codebase, where + # Tmail was used to extract the attachements, and the task + # will output all of those file paths in a CSV file, and a + # list of the raw email files in another. The latter file is + # useful so that one can easily tar up the emails with: + # + # tar cvz -T raw-email-files -f raw_emails.tar.gz + # + # Then you can switch to the Rails 3 codebase, where + # attachment parsing is done via + # recompute_attachments_hexdigests + + require 'csv' + + File.open('raw-email-files', 'w') do |f| + CSV.open('attachment-hexdigests.csv', 'w') do |csv| + csv << ['filepath', 'i', 'url_part_number', 'hexdigest'] + IncomingMessage.all(:order => 'RANDOM()', :limit => 1000).each do |incoming_message| + # raw_email.filepath fails unless the + # incoming_message has an associated request + next unless incoming_message.info_request + raw_email = incoming_message.raw_email + f.puts raw_email.filepath + incoming_message.foi_attachments.each_with_index do |attachment, i| + csv << [raw_email.filepath, i, attachment.url_part_number, attachment.hexdigest] + end + end + end + end + + end + + + desc 'Check the hexdigests of attachments in emails on disk' + task :recompute_attachments_hexdigests => :environment do + + require 'csv' + require 'digest/md5' + + OldAttachment = Struct.new :filename, :attachment_index, :url_part_number, :hexdigest + + filename_to_attachments = Hash.new {|h,k| h[k] = []} + + header_line = true + CSV.foreach('attachment-hexdigests.csv') do |filename, attachment_index, url_part_number, hexdigest| + if header_line + header_line = false + else + filename_to_attachments[filename].push OldAttachment.new filename, attachment_index, url_part_number, hexdigest + end + end + + total_attachments = 0 + attachments_with_different_hexdigest = 0 + files_with_different_numbers_of_attachments = 0 + no_tnef_attachments = 0 + no_parts_in_multipart = 0 + + multipart_error = "no parts on multipart mail" + tnef_error = "tnef produced no attachments" + + # Now check each file: + filename_to_attachments.each do |filename, old_attachments| + + # Currently it doesn't seem to be possible to reuse the + # attachment parsing code in Alaveteli without saving + # objects to the database, so reproduce what it does: + + raw_email = nil + File.open(filename) do |f| + raw_email = f.read + end + mail = MailHandler.mail_from_raw_email(raw_email) + + begin + attachment_attributes = MailHandler.get_attachment_attributes(mail) + rescue IOError => e + if e.message == tnef_error + puts "#{filename} #{tnef_error}" + no_tnef_attachments += 1 + next + else + raise + end + rescue Exception => e + if e.message == multipart_error + puts "#{filename} #{multipart_error}" + no_parts_in_multipart += 1 + next + else + raise + end + end + + if attachment_attributes.length != old_attachments.length + puts "#{filename} the number of old attachments #{old_attachments.length} didn't match the number of new attachments #{attachment_attributes.length}" + files_with_different_numbers_of_attachments += 1 + else + old_attachments.each_with_index do |old_attachment, i| + total_attachments += 1 + attrs = attachment_attributes[i] + old_hexdigest = old_attachment.hexdigest + new_hexdigest = attrs[:hexdigest] + new_content_type = attrs[:content_type] + old_url_part_number = old_attachment.url_part_number.to_i + new_url_part_number = attrs[:url_part_number] + if old_url_part_number != new_url_part_number + puts "#{i} #{filename} old_url_part_number #{old_url_part_number}, new_url_part_number #{new_url_part_number}" + end + if old_hexdigest != new_hexdigest + body = attrs[:body] + # First, if the content type is one of + # text/plain, text/html or application/rtf try + # changing CRLF to LF and calculating a new + # digest - we generally don't worry about + # these changes: + new_converted_hexdigest = nil + if ["text/plain", "text/html", "application/rtf"].include? new_content_type + converted_body = body.gsub /\r\n/, "\n" + new_converted_hexdigest = Digest::MD5.hexdigest converted_body + puts "new_converted_hexdigest is #{new_converted_hexdigest}" + end + if (! new_converted_hexdigest) || (old_hexdigest != new_converted_hexdigest) + puts "#{i} #{filename} old_hexdigest #{old_hexdigest} wasn't the same as new_hexdigest #{new_hexdigest}" + puts " body was of length #{body.length}" + puts " content type was: #{new_content_type}" + path = "/tmp/#{new_hexdigest}" + f = File.new path, "w" + f.write body + f.close + puts " wrote body to #{path}" + attachments_with_different_hexdigest += 1 + end + end + end + end + + end + + puts "total_attachments: #{total_attachments}" + puts "attachments_with_different_hexdigest: #{attachments_with_different_hexdigest}" + puts "files_with_different_numbers_of_attachments: #{files_with_different_numbers_of_attachments}" + puts "no_tnef_attachments: #{no_tnef_attachments}" + puts "no_parts_in_multipart: #{no_parts_in_multipart}" + + end + end diff --git a/lib/tasks/translation.rake b/lib/tasks/translation.rake index 351faef2c..6458d9268 100644 --- a/lib/tasks/translation.rake +++ b/lib/tasks/translation.rake @@ -142,13 +142,11 @@ namespace :translation do output_file) # track mailer - xapian_object = InfoRequest.full_search([InfoRequestEvent], - track_thing.track_query, - 'described_at', - true, - nil, - 100, - 1) + xapian_object = ActsAsXapian::Search.new([InfoRequestEvent], track_thing.track_query, + :sort_by_prefix => 'described_at', + :sort_by_ascending => true, + :collapse_by_prefix => nil, + :limit => 100) event_digest_email = TrackMailer.event_digest(info_request.user, [[track_thing, xapian_object.results, diff --git a/lib/willpaginate_extension.rb b/lib/willpaginate_extension.rb deleted file mode 100644 index fa58bd9f0..000000000 --- a/lib/willpaginate_extension.rb +++ /dev/null @@ -1,59 +0,0 @@ -# this extension is loaded in environment.rb -module WillPaginateExtension - class LinkRenderer < WillPaginate::ActionView::LinkRenderer - def page_link(page, text, attributes = {}) - # Hack for admin pages, when proxied via https on mySociety servers, they - # need a relative URL. - url = url_for(page) - if url.match(/\/admin.*(\?.*)/) - url = $1 - end - # Hack around our type-ahead search magic - if url.match(/\/body\/search_ahead/) - url.sub!("/body/search_ahead", "/select_authority") - end - @template.link_to text, url, attributes - end - - # Returns URL params for +page_link_or_span+, taking the current GET params - # and <tt>:params</tt> option into account. - def url_for(page) - page_one = page == 1 - unless @url_string and !page_one - @url_params = {} - # page links should preserve GET parameters - stringified_merge @url_params, @template.params if @template.request.get? - stringified_merge @url_params, @options[:params] if @options[:params] - if complex = param_name.index(/[^\w-]/) - page_param = parse_query_parameters("#{param_name}=#{page}") - - stringified_merge @url_params, page_param - else - @url_params[param_name] = page_one ? 1 : 2 - end - # the following line makes pagination work on our specially munged search page - combined = @template.request.path_parameters["combined"] - @url_params["combined"] = combined if !combined.nil? - url = @template.url_for(@url_params) - return url if page_one - - if complex - @url_string = url.sub(%r!((?:\?|&)#{CGI.escape param_name}=)#{page}!, "\\1\0") - return url - else - @url_string = url - @url_params[param_name] = 3 - @template.url_for(@url_params).split(//).each_with_index do |char, i| - if char == '3' and url[i, 1] == '2' - @url_string[i] = "\0" - break - end - end - end - end - # finally! - @url_string.sub "\0", page.to_s - end - - end -end |