about summary refs log tree commit diff stats
path: root/lib
diff options
context:
space:
mode:
Diffstat (limited to 'lib')
-rw-r--r-- lib/mail_handler/backends/mail_backend.rb 74
-rw-r--r-- lib/mail_handler/backends/mail_extensions.rb 30
-rw-r--r-- lib/mail_handler/mail_handler.rb 9
-rw-r--r-- lib/no_constraint_disabling.rb 110
-rw-r--r-- lib/normalize_string.rb 86
-rw-r--r-- lib/tasks/temp.rake 150
-rw-r--r-- lib/tasks/translation.rake 12
-rw-r--r-- lib/willpaginate_extension.rb 59
8 files changed, 449 insertions, 81 deletions
diff --git a/lib/mail_handler/backends/mail_backend.rb b/lib/mail_handler/backends/mail_backend.rb
index f7893a60d..561946980 100644
--- a/lib/mail_handler/backends/mail_backend.rb
+++ b/lib/mail_handler/backends/mail_backend.rb
@@ -1,4 +1,35 @@
require 'mail'
+require 'mapi/msg'
+require 'mapi/convert'
+
+module Mail
+ class Message
+
+ # The behaviour of the 'to' and 'cc' methods has changed
+ # between TMail and Mail; this monkey-patching restores the
+ # TMail behaviour. The key difference is that when there's an
+ # invalid address, e.g. '<foo@example.org', Mail returns the
+ # string as an ActiveSupport::Multibyte::Chars, whereas
+ # previously TMail would return nil.
+
+ alias_method :old_to, :to
+ alias_method :old_cc, :cc
+
+ def clean_addresses(old_method, val)
+ old_result = self.send(old_method, val)
+ old_result.class == Mail::AddressContainer ? old_result : nil
+ end
+
+ def to(val = nil)
+ self.clean_addresses :old_to, val
+ end
+
+ def cc(val = nil)
+ self.clean_addresses :old_cc, val
+ end
+
+ end
+end
module MailHandler
module Backends
@@ -38,7 +69,11 @@ module MailHandler
# Get the body of a mail part
def get_part_body(part)
- part.body.decoded
+ decoded = part.body.decoded
+ if part.content_type =~ /^text\//
+ decoded = convert_string_to_utf8_or_binary decoded, part.charset
+ end
+ decoded
end
# Return the first from field if any
@@ -141,9 +176,14 @@ module MailHandler
end
elsif get_content_type(part) == 'application/ms-tnef'
# A set of attachments in a TNEF file
- part.rfc822_attachment = mail_from_tnef(part.body.decoded)
- if part.rfc822_attachment.nil?
- # Attached mail didn't parse, so treat as binary
+ begin
+ part.rfc822_attachment = mail_from_tnef(part.body.decoded)
+ if part.rfc822_attachment.nil?
+ # Attached mail didn't parse, so treat as binary
+ part.content_type = 'application/octet-stream'
+ end
+ rescue TNEFParsingError
+ part.rfc822_attachment = nil
part.content_type = 'application/octet-stream'
end
end
@@ -160,8 +200,11 @@ module MailHandler
part.parts.each{ |sub_part| expand_and_normalize_parts(sub_part, parent_mail) }
else
part_filename = get_part_file_name(part)
- charset = part.charset # save this, because overwriting content_type also resets charset
-
+ if part.has_charset?
+ original_charset = part.charset # save this, because overwriting content_type also resets charset
+ else
+ original_charset = nil
+ end
# Don't allow nil content_types
if get_content_type(part).nil?
part.content_type = 'application/octet-stream'
@@ -180,7 +223,9 @@ module MailHandler
# Use standard content types for Word documents etc.
part.content_type = normalise_content_type(get_content_type(part))
decode_attached_part(part, parent_mail)
- part.charset = charset
+ if original_charset
+ part.charset = original_charset
+ end
end
end
@@ -228,8 +273,15 @@ module MailHandler
def _get_attachment_leaves_recursive(part, within_rfc822_attachment, parent_mail)
leaves_found = []
if part.multipart?
- raise "no parts on multipart mail" if part.parts.size == 0
- if part.sub_type == 'alternative'
+ if part.parts.size == 0
+ # This is typically caused by a missing final
+ # MIME boundary, in which case the text of the
+ # message (including the opening MIME
+ # boundary) is in part.body, so just add this
+ # part as a leaf and treat it as text/plain:
+ part.content_type = "text/plain"
+ leaves_found += [part]
+ elsif part.sub_type == 'alternative'
best_part = choose_best_alternative(part)
leaves_found += _get_attachment_leaves_recursive(best_part,
within_rfc822_attachment,
@@ -315,7 +367,9 @@ module MailHandler
end
def address_from_string(string)
- Mail::Address.new(string).address
+ mail = Mail.new
+ mail.from = string
+ mail.from[0]
end
end
end
diff --git a/lib/mail_handler/backends/mail_extensions.rb b/lib/mail_handler/backends/mail_extensions.rb
index d25012e39..322c49bb5 100644
--- a/lib/mail_handler/backends/mail_extensions.rb
+++ b/lib/mail_handler/backends/mail_extensions.rb
@@ -73,7 +73,12 @@ module Mail
if match
encoding = match[1]
str = Ruby18.decode_base64(match[2])
- str = Iconv.conv('UTF-8//IGNORE', fix_encoding(encoding), str)
+ # Adding and removing trailing spaces is a workaround
+ # for Iconv.conv throwing an exception if it finds an
+ # invalid character at the end of the string, even
+ # with UTF-8//IGNORE:
+ # http://po-ru.com/diary/fixing-invalid-utf-8-in-ruby-revisited/
+ str = Iconv.conv('UTF-8//IGNORE', fix_encoding(encoding), str + " ")[0...-4]
end
str
end
@@ -86,7 +91,12 @@ module Mail
# Remove trailing = if it exists in a Q encoding
string = string.sub(/\=$/, '')
str = Encodings::QuotedPrintable.decode(string)
- str = Iconv.conv('UTF-8//IGNORE', fix_encoding(encoding), str)
+ # Adding and removing trailing spaces is a workaround
+ # for Iconv.conv throwing an exception if it finds an
+ # invalid character at the end of the string, even
+ # with UTF-8//IGNORE:
+ # http://po-ru.com/diary/fixing-invalid-utf-8-in-ruby-revisited/
+ str = Iconv.conv('UTF-8//IGNORE', fix_encoding(encoding), str + " ")[0...-4]
end
str
end
@@ -102,4 +112,20 @@ module Mail
end
end
end
+ class Ruby19
+
+ def Ruby19.q_value_decode(str)
+ match = str.match(/\=\?(.+)?\?[Qq]\?(.+)?\?\=/m)
+ if match
+ encoding = match[1]
+ str = Encodings::QuotedPrintable.decode(match[2].gsub(/_/, '=20'))
+ # Backport line from mail 2.5 to strip a trailing = character
+ # Remove trailing = if it exists in a Q encoding
+ str = str.sub(/\=$/, '')
+ str.force_encoding(fix_encoding(encoding))
+ end
+ decoded = str.encode("utf-8", :invalid => :replace, :replace => "")
+ decoded.valid_encoding? ? decoded : decoded.encode("utf-16le", :invalid => :replace, :replace => "").encode("utf-8")
+ end
+ end
end
diff --git a/lib/mail_handler/mail_handler.rb b/lib/mail_handler/mail_handler.rb
index 22ba26b97..9c955cccd 100644
--- a/lib/mail_handler/mail_handler.rb
+++ b/lib/mail_handler/mail_handler.rb
@@ -8,20 +8,23 @@ module MailHandler
require 'backends/mail_backend'
include Backends::MailBackend
+ class TNEFParsingError < StandardError
+ end
+
# Returns a set of attachments from the given TNEF contents
# The TNEF contents also contains the message body, but in general this is the
# same as the message body in the message proper.
def tnef_attachments(content)
attachments = []
Dir.mktmpdir do |dir|
- IO.popen("#{`which tnef`.chomp} -K -C #{dir}", "wb") do |f|
+ IO.popen("tnef -K -C #{dir} 2> /dev/null", "wb") do |f|
f.write(content)
f.close
if $?.signaled?
raise IOError, "tnef exited with signal #{$?.termsig}"
end
if $?.exited? && $?.exitstatus != 0
- raise IOError, "tnef exited with status #{$?.exitstatus}"
+ raise TNEFParsingError, "tnef exited with status #{$?.exitstatus}"
end
end
found = 0
@@ -34,7 +37,7 @@ module MailHandler
end
end
if found == 0
- raise IOError, "tnef produced no attachments"
+ raise TNEFParsingError, "tnef produced no attachments"
end
end
attachments
diff --git a/lib/no_constraint_disabling.rb b/lib/no_constraint_disabling.rb
new file mode 100644
index 000000000..d515a959a
--- /dev/null
+++ b/lib/no_constraint_disabling.rb
@@ -0,0 +1,110 @@
+# In order to work around the problem of the database user not having
+# the permission to disable referential integrity when loading fixtures,
+# we redefine disable_referential_integrity so that it doesn't try to
+# disable foreign key constraints, and redefine the
+# ActiveRecord::Fixtures.create_fixtures method to pay attention to the order
+# in which fixture tables are passed so that foreign key constraints won't be
+# violated. The only lines that are changed from the initial definition
+# are those between the "***" comments.
+require 'active_record/fixtures'
+require 'active_record/connection_adapters/postgresql_adapter'
+module ActiveRecord
+ module ConnectionAdapters
+ class PostgreSQLAdapter < AbstractAdapter
+ def disable_referential_integrity(&block)
+ transaction {
+ yield
+ }
+ end
+ end
+ end
+end
+
+module ActiveRecord
+ class Fixtures
+
+ def self.create_fixtures(fixtures_directory, table_names, class_names = {})
+ table_names = [table_names].flatten.map { |n| n.to_s }
+ table_names.each { |n|
+ class_names[n.tr('/', '_').to_sym] = n.classify if n.include?('/')
+ }
+
+ # FIXME: Apparently JK uses this.
+ connection = block_given? ? yield : ActiveRecord::Base.connection
+
+ files_to_read = table_names.reject { |table_name|
+ fixture_is_cached?(connection, table_name)
+ }
+
+ unless files_to_read.empty?
+ connection.disable_referential_integrity do
+ fixtures_map = {}
+
+ fixture_files = files_to_read.map do |path|
+ table_name = path.tr '/', '_'
+
+ fixtures_map[path] = ActiveRecord::Fixtures.new(
+ connection,
+ table_name,
+ class_names[table_name.to_sym] || table_name.classify,
+ File.join(fixtures_directory, path))
+ end
+
+ all_loaded_fixtures.update(fixtures_map)
+
+ connection.transaction(:requires_new => true) do
+ # Patch - replace this...
+ # ***
+ # fixture_files.each do |ff|
+ # conn = ff.model_class.respond_to?(:connection) ? ff.model_class.connection : connection
+ # table_rows = ff.table_rows
+ #
+ # table_rows.keys.each do |table|
+ # conn.delete "DELETE FROM #{conn.quote_table_name(table)}", 'Fixture Delete'
+ # end
+ #
+ # table_rows.each do |table_name,rows|
+ # rows.each do |row|
+ # conn.insert_fixture(row, table_name)
+ # end
+ # end
+ # end
+ # ***
+ # ... with this
+ fixture_files.reverse.each do |ff|
+ conn = ff.model_class.respond_to?(:connection) ? ff.model_class.connection : connection
+ table_rows = ff.table_rows
+
+ table_rows.keys.each do |table|
+ conn.delete "DELETE FROM #{conn.quote_table_name(table)}", 'Fixture Delete'
+ end
+ end
+
+ fixture_files.each do |ff|
+ conn = ff.model_class.respond_to?(:connection) ? ff.model_class.connection : connection
+ table_rows = ff.table_rows
+ table_rows.each do |table_name,rows|
+ rows.each do |row|
+ conn.insert_fixture(row, table_name)
+ end
+ end
+ end
+ # ***
+
+ # Cap primary key sequences to max(pk).
+ if connection.respond_to?(:reset_pk_sequence!)
+ table_names.each do |table_name|
+ connection.reset_pk_sequence!(table_name.tr('/', '_'))
+ end
+ end
+ end
+
+ cache_fixtures(connection, fixtures_map)
+ end
+ end
+ cached_fixtures(connection, table_names)
+ end
+
+ end
+
+end
diff --git a/lib/normalize_string.rb b/lib/normalize_string.rb
new file mode 100644
index 000000000..f02b18ee0
--- /dev/null
+++ b/lib/normalize_string.rb
@@ -0,0 +1,86 @@
+require 'iconv' unless RUBY_VERSION.to_f >= 1.9
+require 'charlock_holmes'
+
+class EncodingNormalizationError < StandardError
+end
+
+def normalize_string_to_utf8(s, suggested_character_encoding=nil)
+
+ # Make a list of encodings to try:
+ to_try = []
+
+ guessed_encoding = CharlockHolmes::EncodingDetector.detect(s)[:encoding]
+ guessed_encoding ||= ''
+
+ # It's reasonably common for windows-1252 text to be mislabelled
+ # as ISO-8859-1, so try that first if charlock_holmes guessed
+ # that. However, it can also easily misidentify UTF-8 strings as
+ # ISO-8859-1 so we don't want to go with the guess by default...
+ to_try.push guessed_encoding if guessed_encoding.downcase == 'windows-1252'
+
+ to_try.push suggested_character_encoding if suggested_character_encoding
+ to_try.push 'UTF-8'
+ to_try.push guessed_encoding
+
+ to_try.each do |from_encoding|
+ if RUBY_VERSION.to_f >= 1.9
+ begin
+ s.force_encoding from_encoding
+ return s.encode('UTF-8') if s.valid_encoding?
+ rescue ArgumentError
+ # We get this if there are invalid bytes when
+ # interpreted as from_encoding at the point of
+ # the encode('UTF-8'); move on to the next one...
+ end
+ else
+ to_encoding = 'UTF-8'
+ begin
+ converted = Iconv.conv 'UTF-8', from_encoding, s
+ return converted
+ rescue Iconv::Failure
+ # We get this if there are invalid bytes when
+ # interpreted as from_encoding at the point of
+ # the Iconv.conv; move on to the next one...
+ end
+ end
+ end
+ raise EncodingNormalizationError, "Couldn't find a valid character encoding for the string"
+
+end
+
+def convert_string_to_utf8_or_binary(s, suggested_character_encoding=nil)
+ # This function exists to help to keep consistent with the
+ # behaviour of earlier versions of Alaveteli: in the code as it
+ # is, there are situations where it's expected that we generally
+ # have a UTF-8 encoded string, but if the source data was
+ # uninterpretable under any character encoding, the string may be
+ # binary data (i.e. invalid UTF-8). Such a string would then be
+ # mangled into valid UTF-8 by _sanitize_text for the purposes of
+ # display.
+
+ # This seems unsatisfactory to me - two better alternatives would
+ # be either: (a) to mangle the data into valid UTF-8 in this
+ # method or (b) to treat the 'text/*' attachment as
+ # 'application/octet-stream' instead. However, for the purposes
+ # of the transition to Ruby 1.9 and/or Rails 3 we just want the
+ # behaviour to be as similar as possible.
+
+ begin
+ result = normalize_string_to_utf8 s, suggested_character_encoding
+ rescue EncodingNormalizationError
+ result = s
+ s.force_encoding 'ASCII-8BIT' if RUBY_VERSION.to_f >= 1.9
+ end
+ result
+end
+
+def log_text_details(message, text)
+ if RUBY_VERSION.to_f >= 1.9
+ STDERR.puts "#{message}, we have text: #{text}, of class #{text.class} and encoding #{text.encoding}"
+ else
+ STDERR.puts "#{message}, we have text: #{text}, of class #{text.class}"
+ end
+ filename = "/var/tmp/#{Digest::MD5.hexdigest(text)}.txt"
+ File.open(filename, "wb") { |f| f.write text }
+ STDERR.puts "#{message}, the filename is: #{filename}"
+end
diff --git a/lib/tasks/temp.rake b/lib/tasks/temp.rake
index e49a84ecb..f0085b5e1 100644
--- a/lib/tasks/temp.rake
+++ b/lib/tasks/temp.rake
@@ -50,4 +50,154 @@ namespace :temp do
end
end
+ desc 'Create a CSV file of a random selection of raw emails, for comparing hexdigests'
+ task :random_attachments_hexdigests => :environment do
+
+ # The idea is to run this under the Rails 2 codebase, where
+ # TMail was used to extract the attachments, and the task
+ # will output all of those file paths in a CSV file, and a
+ # list of the raw email files in another. The latter file is
+ # useful so that one can easily tar up the emails with:
+ #
+ # tar cvz -T raw-email-files -f raw_emails.tar.gz
+ #
+ # Then you can switch to the Rails 3 codebase, where
+ # attachment parsing is done via
+ # recompute_attachments_hexdigests
+
+ require 'csv'
+
+ File.open('raw-email-files', 'w') do |f|
+ CSV.open('attachment-hexdigests.csv', 'w') do |csv|
+ csv << ['filepath', 'i', 'url_part_number', 'hexdigest']
+ IncomingMessage.all(:order => 'RANDOM()', :limit => 1000).each do |incoming_message|
+ # raw_email.filepath fails unless the
+ # incoming_message has an associated request
+ next unless incoming_message.info_request
+ raw_email = incoming_message.raw_email
+ f.puts raw_email.filepath
+ incoming_message.foi_attachments.each_with_index do |attachment, i|
+ csv << [raw_email.filepath, i, attachment.url_part_number, attachment.hexdigest]
+ end
+ end
+ end
+ end
+
+ end
+
+
+ desc 'Check the hexdigests of attachments in emails on disk'
+ task :recompute_attachments_hexdigests => :environment do
+
+ require 'csv'
+ require 'digest/md5'
+
+ OldAttachment = Struct.new :filename, :attachment_index, :url_part_number, :hexdigest
+
+ filename_to_attachments = Hash.new {|h,k| h[k] = []}
+
+ header_line = true
+ CSV.foreach('attachment-hexdigests.csv') do |filename, attachment_index, url_part_number, hexdigest|
+ if header_line
+ header_line = false
+ else
+ filename_to_attachments[filename].push OldAttachment.new filename, attachment_index, url_part_number, hexdigest
+ end
+ end
+
+ total_attachments = 0
+ attachments_with_different_hexdigest = 0
+ files_with_different_numbers_of_attachments = 0
+ no_tnef_attachments = 0
+ no_parts_in_multipart = 0
+
+ multipart_error = "no parts on multipart mail"
+ tnef_error = "tnef produced no attachments"
+
+ # Now check each file:
+ filename_to_attachments.each do |filename, old_attachments|
+
+ # Currently it doesn't seem to be possible to reuse the
+ # attachment parsing code in Alaveteli without saving
+ # objects to the database, so reproduce what it does:
+
+ raw_email = nil
+ File.open(filename) do |f|
+ raw_email = f.read
+ end
+ mail = MailHandler.mail_from_raw_email(raw_email)
+
+ begin
+ attachment_attributes = MailHandler.get_attachment_attributes(mail)
+ rescue IOError => e
+ if e.message == tnef_error
+ puts "#{filename} #{tnef_error}"
+ no_tnef_attachments += 1
+ next
+ else
+ raise
+ end
+ rescue Exception => e
+ if e.message == multipart_error
+ puts "#{filename} #{multipart_error}"
+ no_parts_in_multipart += 1
+ next
+ else
+ raise
+ end
+ end
+
+ if attachment_attributes.length != old_attachments.length
+ puts "#{filename} the number of old attachments #{old_attachments.length} didn't match the number of new attachments #{attachment_attributes.length}"
+ files_with_different_numbers_of_attachments += 1
+ else
+ old_attachments.each_with_index do |old_attachment, i|
+ total_attachments += 1
+ attrs = attachment_attributes[i]
+ old_hexdigest = old_attachment.hexdigest
+ new_hexdigest = attrs[:hexdigest]
+ new_content_type = attrs[:content_type]
+ old_url_part_number = old_attachment.url_part_number.to_i
+ new_url_part_number = attrs[:url_part_number]
+ if old_url_part_number != new_url_part_number
+ puts "#{i} #{filename} old_url_part_number #{old_url_part_number}, new_url_part_number #{new_url_part_number}"
+ end
+ if old_hexdigest != new_hexdigest
+ body = attrs[:body]
+ # First, if the content type is one of
+ # text/plain, text/html or application/rtf try
+ # changing CRLF to LF and calculating a new
+ # digest - we generally don't worry about
+ # these changes:
+ new_converted_hexdigest = nil
+ if ["text/plain", "text/html", "application/rtf"].include? new_content_type
+ converted_body = body.gsub /\r\n/, "\n"
+ new_converted_hexdigest = Digest::MD5.hexdigest converted_body
+ puts "new_converted_hexdigest is #{new_converted_hexdigest}"
+ end
+ if (! new_converted_hexdigest) || (old_hexdigest != new_converted_hexdigest)
+ puts "#{i} #{filename} old_hexdigest #{old_hexdigest} wasn't the same as new_hexdigest #{new_hexdigest}"
+ puts " body was of length #{body.length}"
+ puts " content type was: #{new_content_type}"
+ path = "/tmp/#{new_hexdigest}"
+ f = File.new path, "w"
+ f.write body
+ f.close
+ puts " wrote body to #{path}"
+ attachments_with_different_hexdigest += 1
+ end
+ end
+ end
+ end
+
+ end
+
+ puts "total_attachments: #{total_attachments}"
+ puts "attachments_with_different_hexdigest: #{attachments_with_different_hexdigest}"
+ puts "files_with_different_numbers_of_attachments: #{files_with_different_numbers_of_attachments}"
+ puts "no_tnef_attachments: #{no_tnef_attachments}"
+ puts "no_parts_in_multipart: #{no_parts_in_multipart}"
+
+ end
+
end
diff --git a/lib/tasks/translation.rake b/lib/tasks/translation.rake
index 351faef2c..6458d9268 100644
--- a/lib/tasks/translation.rake
+++ b/lib/tasks/translation.rake
@@ -142,13 +142,11 @@ namespace :translation do
output_file)
# track mailer
- xapian_object = InfoRequest.full_search([InfoRequestEvent],
- track_thing.track_query,
- 'described_at',
- true,
- nil,
- 100,
- 1)
+ xapian_object = ActsAsXapian::Search.new([InfoRequestEvent], track_thing.track_query,
+ :sort_by_prefix => 'described_at',
+ :sort_by_ascending => true,
+ :collapse_by_prefix => nil,
+ :limit => 100)
event_digest_email = TrackMailer.event_digest(info_request.user,
[[track_thing,
xapian_object.results,
diff --git a/lib/willpaginate_extension.rb b/lib/willpaginate_extension.rb
deleted file mode 100644
index fa58bd9f0..000000000
--- a/lib/willpaginate_extension.rb
+++ /dev/null
@@ -1,59 +0,0 @@
-# this extension is loaded in environment.rb
-module WillPaginateExtension
- class LinkRenderer < WillPaginate::ActionView::LinkRenderer
- def page_link(page, text, attributes = {})
- # Hack for admin pages, when proxied via https on mySociety servers, they
- # need a relative URL.
- url = url_for(page)
- if url.match(/\/admin.*(\?.*)/)
- url = $1
- end
- # Hack around our type-ahead search magic
- if url.match(/\/body\/search_ahead/)
- url.sub!("/body/search_ahead", "/select_authority")
- end
- @template.link_to text, url, attributes
- end
-
- # Returns URL params for +page_link_or_span+, taking the current GET params
- # and <tt>:params</tt> option into account.
- def url_for(page)
- page_one = page == 1
- unless @url_string and !page_one
- @url_params = {}
- # page links should preserve GET parameters
- stringified_merge @url_params, @template.params if @template.request.get?
- stringified_merge @url_params, @options[:params] if @options[:params]
- if complex = param_name.index(/[^\w-]/)
- page_param = parse_query_parameters("#{param_name}=#{page}")
-
- stringified_merge @url_params, page_param
- else
- @url_params[param_name] = page_one ? 1 : 2
- end
- # the following line makes pagination work on our specially munged search page
- combined = @template.request.path_parameters["combined"]
- @url_params["combined"] = combined if !combined.nil?
- url = @template.url_for(@url_params)
- return url if page_one
-
- if complex
- @url_string = url.sub(%r!((?:\?|&amp;)#{CGI.escape param_name}=)#{page}!, "\\1\0")
- return url
- else
- @url_string = url
- @url_params[param_name] = 3
- @template.url_for(@url_params).split(//).each_with_index do |char, i|
- if char == '3' and url[i, 1] == '2'
- @url_string[i] = "\0"
- break
- end
- end
- end
- end
- # finally!
- @url_string.sub "\0", page.to_s
- end
-
- end
-end