diff options
29 files changed, 1153 insertions, 79 deletions
@@ -10,6 +10,7 @@ source 'https://rubygems.org' gem 'rails', '3.1.12' gem 'pg' +gem 'charlock_holmes' gem 'fastercsv', '>=1.5.5' gem 'json' gem 'mahoro' diff --git a/Gemfile.lock b/Gemfile.lock index 3dc08590d..a9c2e7278 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -58,6 +58,7 @@ GEM net-sftp (>= 2.0.0) net-ssh (>= 2.0.14) net-ssh-gateway (>= 1.1.0) + charlock_holmes (0.6.9.4) chunky_png (1.2.6) colorize (0.5.8) columnize (0.3.6) @@ -242,6 +243,7 @@ DEPENDENCIES annotate bootstrap-sass capistrano + charlock_holmes compass coveralls debugger diff --git a/app/controllers/request_controller.rb b/app/controllers/request_controller.rb index a0f88096e..b8ccdf926 100644 --- a/app/controllers/request_controller.rb +++ b/app/controllers/request_controller.rb @@ -5,7 +5,6 @@ # Copyright (c) 2007 UK Citizens Online Democracy. All rights reserved. # Email: hello@mysociety.org; WWW: http://www.mysociety.org/ -require 'alaveteli_file_types' require 'zip/zip' require 'open-uri' @@ -722,7 +721,7 @@ class RequestController < ApplicationController yield - if params[:skip_cache].nil? + if params[:skip_cache].nil? && response.status == 200 # write it to the fileystem ourselves, so is just a plain file. (The # various fragment cache functions using Ruby Marshall to write the file # which adds a header, so isnt compatible with images that have been @@ -737,6 +736,7 @@ class RequestController < ApplicationController def get_attachment get_attachment_internal(false) + return unless @attachment # Prevent spam to magic request address. Note that the binary # subsitution method used depends on the content type @@ -756,6 +756,7 @@ class RequestController < ApplicationController raise ActiveRecord::RecordNotFound.new("Attachment HTML not found.") end get_attachment_internal(true) + return unless @attachment # images made during conversion (e.g. images in PDF files) are put in the cache directory, so # the same cache code in cache_attachments above will display them. 
@@ -802,8 +803,11 @@ class RequestController < ApplicationController # check permissions raise "internal error, pre-auth filter should have caught this" if !@info_request.user_can_view?(authenticated_user) - @attachment = IncomingMessage.get_attachment_by_url_part_number(@incoming_message.get_attachments_for_display, @part_number) - raise ActiveRecord::RecordNotFound.new("attachment not found part number " + @part_number.to_s + " incoming_message " + @incoming_message.id.to_s) if @attachment.nil? + @attachment = IncomingMessage.get_attachment_by_url_part_number_and_filename(@incoming_message.get_attachments_for_display, @part_number, @original_filename) + # If we can't find the right attachment, redirect to the incoming message: + unless @attachment + return redirect_to incoming_message_url(@incoming_message), :status => 303 + end # check filename in URL matches that in database (use a censor rule if you want to change a filename) raise ActiveRecord::RecordNotFound.new("please use same filename as original file has, display: '" + @attachment.display_filename + "' old_display: '" + @attachment.old_display_filename + "' original: '" + @original_filename + "'") if @attachment.display_filename != @original_filename && @attachment.old_display_filename != @original_filename diff --git a/app/mailers/request_mailer.rb b/app/mailers/request_mailer.rb index 3eb89c660..4dbce6738 100644 --- a/app/mailers/request_mailer.rb +++ b/app/mailers/request_mailer.rb @@ -4,8 +4,6 @@ # Copyright (c) 2007 UK Citizens Online Democracy. All rights reserved. 
# Email: hello@mysociety.org; WWW: http://www.mysociety.org/ -require 'alaveteli_file_types' - class RequestMailer < ApplicationMailer # Used when an FOI officer uploads a response from their web browser - this is # the "fake" email used to store in the same format in the database as if they diff --git a/app/models/foi_attachment.rb b/app/models/foi_attachment.rb index fcde379e0..0340f2b83 100644 --- a/app/models/foi_attachment.rb +++ b/app/models/foi_attachment.rb @@ -71,7 +71,12 @@ class FoiAttachment < ActiveRecord::Base tries = 0 delay = 1 begin - @cached_body = File.open(self.filepath, "rb" ).read + binary_data = File.open(self.filepath, "rb" ).read + if self.content_type =~ /^text/ + @cached_body = convert_string_to_utf8_or_binary(binary_data, 'UTF-8') + else + @cached_body = binary_data + end rescue Errno::ENOENT # we've lost our cached attachments for some reason. Reparse them. if tries > BODY_MAX_TRIES diff --git a/app/models/incoming_message.rb b/app/models/incoming_message.rb index c914edb7e..252f81bb7 100644 --- a/app/models/incoming_message.rb +++ b/app/models/incoming_message.rb @@ -31,12 +31,9 @@ # Move some of the (e.g. quoting) functions here into rblib, as they feel # general not specific to IncomingMessage. -require 'alaveteli_file_types' require 'htmlentities' require 'rexml/document' require 'zip/zip' -require 'mapi/msg' -require 'mapi/convert' require 'iconv' unless RUBY_VERSION >= '1.9' class IncomingMessage < ActiveRecord::Base @@ -132,6 +129,7 @@ class IncomingMessage < ActiveRecord::Base end self.valid_to_reply_to = self._calculate_valid_to_reply_to self.last_parsed = Time.now + self.foi_attachments reload=true self.save! 
end end @@ -173,15 +171,29 @@ class IncomingMessage < ActiveRecord::Base super end - # And look up by URL part number to get an attachment + # And look up by URL part number and display filename to get an attachment # XXX relies on extract_attachments calling MailHandler.ensure_parts_counted - def self.get_attachment_by_url_part_number(attachments, found_url_part_number) - attachments.each do |a| - if a.url_part_number == found_url_part_number - return a + # The filename here is passed from the URL parameter, so it's the + # display_filename rather than the real filename. + def self.get_attachment_by_url_part_number_and_filename(attachments, found_url_part_number, display_filename) + attachment_by_part_number = attachments.detect { |a| a.url_part_number == found_url_part_number } + if attachment_by_part_number && attachment_by_part_number.display_filename == display_filename + # Then the filename matches, which is fine: + attachment_by_part_number + else + # Otherwise if the URL part number and filename don't + # match - this is probably due to a reparsing of the + # email. In that case, try to find a unique matching + # filename from any attachment. 
+ attachments_by_filename = attachments.select { |a| + a.display_filename == display_filename + } + if attachments_by_filename.length == 1 + attachments_by_filename[0] + else + nil end end - return nil end # Converts email addresses we know about into textual descriptions of them diff --git a/config/initializers/alaveteli.rb b/config/initializers/alaveteli.rb index 4acc126d0..a9e9d498d 100644 --- a/config/initializers/alaveteli.rb +++ b/config/initializers/alaveteli.rb @@ -59,6 +59,8 @@ require 'quiet_opener.rb' require 'mail_handler' require 'public_body_categories' require 'ability' +require 'normalize_string' +require 'alaveteli_file_types' # Allow tests to be run under a non-superuser database account if required if Rails.env == 'test' and ActiveRecord::Base.configurations['test']['constraint_disabling'] == false diff --git a/config/packages b/config/packages index db51e5bdd..fc67cda6b 100644 --- a/config/packages +++ b/config/packages @@ -36,4 +36,5 @@ rake (>= 0.9.2.2) build-essential bundler sqlite3 -libsqlite3-dev
\ No newline at end of file +libsqlite3-dev +libicu-dev diff --git a/lib/mail_handler/backends/mail_backend.rb b/lib/mail_handler/backends/mail_backend.rb index f7893a60d..03d78e0a3 100644 --- a/lib/mail_handler/backends/mail_backend.rb +++ b/lib/mail_handler/backends/mail_backend.rb @@ -1,4 +1,35 @@ require 'mail' +require 'mapi/msg' +require 'mapi/convert' + +module Mail + class Message + + # The behaviour of the 'to' and 'cc' methods have changed + # between TMail and Mail; this monkey-patching restores the + # TMail behaviour. The key difference is that when there's an + # invalid address, e.g. '<foo@example.org', Mail returns the + # string as an ActiveSupport::Multibyte::Chars, whereas + # previously TMail would return nil. + + alias_method :old_to, :to + alias_method :old_cc, :cc + + def clean_addresses(old_method, val) + old_result = self.send(old_method, val) + old_result.class == Mail::AddressContainer ? old_result : nil + end + + def to(val = nil) + self.clean_addresses :old_to, val + end + + def cc(val = nil) + self.clean_addresses :old_cc, val + end + + end +end module MailHandler module Backends @@ -38,7 +69,11 @@ module MailHandler # Get the body of a mail part def get_part_body(part) - part.body.decoded + decoded = part.body.decoded + if part.content_type =~ /^text\// + decoded = convert_string_to_utf8_or_binary decoded, part.charset + end + decoded end # Return the first from field if any @@ -141,9 +176,14 @@ module MailHandler end elsif get_content_type(part) == 'application/ms-tnef' # A set of attachments in a TNEF file - part.rfc822_attachment = mail_from_tnef(part.body.decoded) - if part.rfc822_attachment.nil? - # Attached mail didn't parse, so treat as binary + begin + part.rfc822_attachment = mail_from_tnef(part.body.decoded) + if part.rfc822_attachment.nil? 
+ # Attached mail didn't parse, so treat as binary + part.content_type = 'application/octet-stream' + end + rescue TNEFParsingError + part.rfc822_attachment = nil part.content_type = 'application/octet-stream' end end @@ -160,8 +200,11 @@ module MailHandler part.parts.each{ |sub_part| expand_and_normalize_parts(sub_part, parent_mail) } else part_filename = get_part_file_name(part) - charset = part.charset # save this, because overwriting content_type also resets charset - + if part.has_charset? + original_charset = part.charset # save this, because overwriting content_type also resets charset + else + original_charset = nil + end # Don't allow nil content_types if get_content_type(part).nil? part.content_type = 'application/octet-stream' @@ -180,7 +223,9 @@ module MailHandler # Use standard content types for Word documents etc. part.content_type = normalise_content_type(get_content_type(part)) decode_attached_part(part, parent_mail) - part.charset = charset + if original_charset + part.charset = original_charset + end end end @@ -228,8 +273,15 @@ module MailHandler def _get_attachment_leaves_recursive(part, within_rfc822_attachment, parent_mail) leaves_found = [] if part.multipart? 
- raise "no parts on multipart mail" if part.parts.size == 0 - if part.sub_type == 'alternative' + if part.parts.size == 0 + # This is typically caused by a missing final + # MIME boundary, in which case the text of the + # message (including the opening MIME + # boundary) is in part.body, so just add this + # part as a leaf and treat it as text/plain: + part.content_type = "text/plain" + leaves_found += [part] + elsif part.sub_type == 'alternative' best_part = choose_best_alternative(part) leaves_found += _get_attachment_leaves_recursive(best_part, within_rfc822_attachment, diff --git a/lib/mail_handler/mail_handler.rb b/lib/mail_handler/mail_handler.rb index 22ba26b97..9c955cccd 100644 --- a/lib/mail_handler/mail_handler.rb +++ b/lib/mail_handler/mail_handler.rb @@ -8,20 +8,23 @@ module MailHandler require 'backends/mail_backend' include Backends::MailBackend + class TNEFParsingError < StandardError + end + # Returns a set of attachments from the given TNEF contents # The TNEF contents also contains the message body, but in general this is the # same as the message body in the message proper. def tnef_attachments(content) attachments = [] Dir.mktmpdir do |dir| - IO.popen("#{`which tnef`.chomp} -K -C #{dir}", "wb") do |f| + IO.popen("tnef -K -C #{dir} 2> /dev/null", "wb") do |f| f.write(content) f.close if $?.signaled? raise IOError, "tnef exited with signal #{$?.termsig}" end if $?.exited? 
&& $?.exitstatus != 0 - raise IOError, "tnef exited with status #{$?.exitstatus}" + raise TNEFParsingError, "tnef exited with status #{$?.exitstatus}" end end found = 0 @@ -34,7 +37,7 @@ module MailHandler end end if found == 0 - raise IOError, "tnef produced no attachments" + raise TNEFParsingError, "tnef produced no attachments" end end attachments diff --git a/lib/normalize_string.rb b/lib/normalize_string.rb new file mode 100644 index 000000000..f02b18ee0 --- /dev/null +++ b/lib/normalize_string.rb @@ -0,0 +1,86 @@ +require 'iconv' unless RUBY_VERSION.to_f >= 1.9 +require 'charlock_holmes' + +class EncodingNormalizationError < StandardError +end + +def normalize_string_to_utf8(s, suggested_character_encoding=nil) + + # Make a list of encodings to try: + to_try = [] + + guessed_encoding = CharlockHolmes::EncodingDetector.detect(s)[:encoding] + guessed_encoding ||= '' + + # It's reasonably common for windows-1252 text to be mislabelled + # as ISO-8859-1, so try that first if charlock_holmes guessed + # that. However, it can also easily misidentify UTF-8 strings as + # ISO-8859-1 so we don't want to go with the guess by default... + to_try.push guessed_encoding if guessed_encoding.downcase == 'windows-1252' + + to_try.push suggested_character_encoding if suggested_character_encoding + to_try.push 'UTF-8' + to_try.push guessed_encoding + + to_try.each do |from_encoding| + if RUBY_VERSION.to_f >= 1.9 + begin + s.force_encoding from_encoding + return s.encode('UTF-8') if s.valid_encoding? + rescue ArgumentError + # We get this is there are invalid bytes when + # interpreted as from_encoding at the point of + # the encode('UTF-8'); move onto the next one... + end + else + to_encoding = 'UTF-8' + begin + converted = Iconv.conv 'UTF-8', from_encoding, s + return converted + rescue Iconv::Failure + # We get this is there are invalid bytes when + # interpreted as from_encoding at the point of + # the Iconv.iconv; move onto the next one... 
+ end + end + end + raise EncodingNormalizationError, "Couldn't find a valid character encoding for the string" + +end + +def convert_string_to_utf8_or_binary(s, suggested_character_encoding=nil) + # This function exists to help to keep consistent with the + # behaviour of earlier versions of Alaveteli: in the code as it + # is, there are situations where it's expected that we generally + # have a UTF-8 encoded string, but if the source data was + # unintepretable under any character encoding, the string may be + # binary data (i.e. invalid UTF-8). Such a string would then be + # mangled into valid UTF-8 by _sanitize_text for the purposes of + # display. + + # This seems unsatisfactory to me - two better alternatives would + # be either: (a) to mangle the data into valid UTF-8 in this + # method or (b) to treat the 'text/*' attachment as + # 'application/octet-stream' instead. However, for the purposes + # of the transition to Ruby 1.9 and/or Rails 3 we just want the + # behaviour to be as similar as possible. 
+ + begin + result = normalize_string_to_utf8 s, suggested_character_encoding + rescue EncodingNormalizationError + result = s + s.force_encoding 'ASCII-8BIT' if RUBY_VERSION.to_f >= 1.9 + end + result +end + +def log_text_details(message, text) + if RUBY_VERSION.to_f >= 1.9 + STDERR.puts "#{message}, we have text: #{text}, of class #{text.class} and encoding #{text.encoding}" + else + STDERR.puts "#{message}, we have text: #{text}, of class #{text.class}" + end + filename = "/var/tmp/#{Digest::MD5.hexdigest(text)}.txt" + File.open(filename, "wb") { |f| f.write text } + STDERR.puts "#{message}, the filename is: #{filename}" +end diff --git a/lib/tasks/temp.rake b/lib/tasks/temp.rake index e49a84ecb..f0085b5e1 100644 --- a/lib/tasks/temp.rake +++ b/lib/tasks/temp.rake @@ -50,4 +50,154 @@ namespace :temp do end end + desc 'Create a CSV file of a random selection of raw emails, for comparing hexdigests' + task :random_attachments_hexdigests => :environment do + + # The idea is to run this under the Rail 2 codebase, where + # Tmail was used to extract the attachements, and the task + # will output all of those file paths in a CSV file, and a + # list of the raw email files in another. 
The latter file is + # useful so that one can easily tar up the emails with: + # + # tar cvz -T raw-email-files -f raw_emails.tar.gz + # + # Then you can switch to the Rails 3 codebase, where + # attachment parsing is done via + # recompute_attachments_hexdigests + + require 'csv' + + File.open('raw-email-files', 'w') do |f| + CSV.open('attachment-hexdigests.csv', 'w') do |csv| + csv << ['filepath', 'i', 'url_part_number', 'hexdigest'] + IncomingMessage.all(:order => 'RANDOM()', :limit => 1000).each do |incoming_message| + # raw_email.filepath fails unless the + # incoming_message has an associated request + next unless incoming_message.info_request + raw_email = incoming_message.raw_email + f.puts raw_email.filepath + incoming_message.foi_attachments.each_with_index do |attachment, i| + csv << [raw_email.filepath, i, attachment.url_part_number, attachment.hexdigest] + end + end + end + end + + end + + + desc 'Check the hexdigests of attachments in emails on disk' + task :recompute_attachments_hexdigests => :environment do + + require 'csv' + require 'digest/md5' + + OldAttachment = Struct.new :filename, :attachment_index, :url_part_number, :hexdigest + + filename_to_attachments = Hash.new {|h,k| h[k] = []} + + header_line = true + CSV.foreach('attachment-hexdigests.csv') do |filename, attachment_index, url_part_number, hexdigest| + if header_line + header_line = false + else + filename_to_attachments[filename].push OldAttachment.new filename, attachment_index, url_part_number, hexdigest + end + end + + total_attachments = 0 + attachments_with_different_hexdigest = 0 + files_with_different_numbers_of_attachments = 0 + no_tnef_attachments = 0 + no_parts_in_multipart = 0 + + multipart_error = "no parts on multipart mail" + tnef_error = "tnef produced no attachments" + + # Now check each file: + filename_to_attachments.each do |filename, old_attachments| + + # Currently it doesn't seem to be possible to reuse the + # attachment parsing code in Alaveteli without saving 
+ # objects to the database, so reproduce what it does: + + raw_email = nil + File.open(filename) do |f| + raw_email = f.read + end + mail = MailHandler.mail_from_raw_email(raw_email) + + begin + attachment_attributes = MailHandler.get_attachment_attributes(mail) + rescue IOError => e + if e.message == tnef_error + puts "#{filename} #{tnef_error}" + no_tnef_attachments += 1 + next + else + raise + end + rescue Exception => e + if e.message == multipart_error + puts "#{filename} #{multipart_error}" + no_parts_in_multipart += 1 + next + else + raise + end + end + + if attachment_attributes.length != old_attachments.length + puts "#{filename} the number of old attachments #{old_attachments.length} didn't match the number of new attachments #{attachment_attributes.length}" + files_with_different_numbers_of_attachments += 1 + else + old_attachments.each_with_index do |old_attachment, i| + total_attachments += 1 + attrs = attachment_attributes[i] + old_hexdigest = old_attachment.hexdigest + new_hexdigest = attrs[:hexdigest] + new_content_type = attrs[:content_type] + old_url_part_number = old_attachment.url_part_number.to_i + new_url_part_number = attrs[:url_part_number] + if old_url_part_number != new_url_part_number + puts "#{i} #{filename} old_url_part_number #{old_url_part_number}, new_url_part_number #{new_url_part_number}" + end + if old_hexdigest != new_hexdigest + body = attrs[:body] + # First, if the content type is one of + # text/plain, text/html or application/rtf try + # changing CRLF to LF and calculating a new + # digest - we generally don't worry about + # these changes: + new_converted_hexdigest = nil + if ["text/plain", "text/html", "application/rtf"].include? new_content_type + converted_body = body.gsub /\r\n/, "\n" + new_converted_hexdigest = Digest::MD5.hexdigest converted_body + puts "new_converted_hexdigest is #{new_converted_hexdigest}" + end + if (! 
new_converted_hexdigest) || (old_hexdigest != new_converted_hexdigest) + puts "#{i} #{filename} old_hexdigest #{old_hexdigest} wasn't the same as new_hexdigest #{new_hexdigest}" + puts " body was of length #{body.length}" + puts " content type was: #{new_content_type}" + path = "/tmp/#{new_hexdigest}" + f = File.new path, "w" + f.write body + f.close + puts " wrote body to #{path}" + attachments_with_different_hexdigest += 1 + end + end + end + end + + end + + puts "total_attachments: #{total_attachments}" + puts "attachments_with_different_hexdigest: #{attachments_with_different_hexdigest}" + puts "files_with_different_numbers_of_attachments: #{files_with_different_numbers_of_attachments}" + puts "no_tnef_attachments: #{no_tnef_attachments}" + puts "no_parts_in_multipart: #{no_parts_in_multipart}" + + end + end diff --git a/spec/controllers/api_controller_spec.rb b/spec/controllers/api_controller_spec.rb index 749be9f85..66b8e33f0 100644 --- a/spec/controllers/api_controller_spec.rb +++ b/spec/controllers/api_controller_spec.rb @@ -259,7 +259,7 @@ describe ApiController, "when using the API" do attachments.size.should == 1 attachment = attachments[0] attachment.filename.should == "tfl.pdf" - attachment.body.should == load_file_fixture("tfl.pdf", as_binary=true) + attachment.body.should == load_file_fixture("tfl.pdf") end it "should show information about a request" do diff --git a/spec/controllers/request_controller_spec.rb b/spec/controllers/request_controller_spec.rb index 657837c72..9cc60a103 100644 --- a/spec/controllers/request_controller_spec.rb +++ b/spec/controllers/request_controller_spec.rb @@ -477,11 +477,11 @@ describe RequestController, "when showing one request" do (assigns[:info_request_events].size - size_before).should == 1 ir.reload - get :get_attachment, :incoming_message_id => ir.incoming_messages[1].id, :id => ir.id, :part => 2, :file_name => 'hello.txt', :skip_cache => 1 + get :get_attachment, :incoming_message_id => 
ir.incoming_messages[1].id, :id => ir.id, :part => 2, :file_name => 'hello world.txt', :skip_cache => 1 response.content_type.should == "text/plain" response.should contain "Second hello" - get :get_attachment, :incoming_message_id => ir.incoming_messages[1].id, :id => ir.id, :part => 3, :file_name => 'hello.txt', :skip_cache => 1 + get :get_attachment, :incoming_message_id => ir.incoming_messages[1].id, :id => ir.id, :part => 3, :file_name => 'hello world.txt', :skip_cache => 1 response.content_type.should == "text/plain" response.should contain "First hello" end @@ -494,7 +494,7 @@ describe RequestController, "when showing one request" do get :get_attachment, :incoming_message_id => ir.incoming_messages[1].id, :id => ir.id, :part => 2, - :file_name => 'hello.txt' + :file_name => 'hello world.txt' end it "should convert message body to UTF8" do @@ -508,7 +508,7 @@ describe RequestController, "when showing one request" do ir = info_requests(:fancy_dog_request) receive_incoming_mail('incoming-request-two-same-name.email', ir.incoming_email) ir.reload - get :get_attachment_as_html, :incoming_message_id => ir.incoming_messages[1].id, :id => ir.id, :part => 2, :file_name => 'hello.txt.html', :skip_cache => 1 + get :get_attachment_as_html, :incoming_message_id => ir.incoming_messages[1].id, :id => ir.id, :part => 2, :file_name => 'hello world.txt.html', :skip_cache => 1 response.content_type.should == "text/html" response.should contain "Second hello" end @@ -529,11 +529,11 @@ describe RequestController, "when showing one request" do ir.reload ugly_id = "55195" lambda { - get :get_attachment, :incoming_message_id => ir.incoming_messages[1].id, :id => ugly_id, :part => 2, :file_name => 'hello.txt.html', :skip_cache => 1 + get :get_attachment, :incoming_message_id => ir.incoming_messages[1].id, :id => ugly_id, :part => 2, :file_name => 'hello world.txt.html', :skip_cache => 1 }.should raise_error(ActiveRecord::RecordNotFound) lambda { - get :get_attachment_as_html, 
:incoming_message_id => ir.incoming_messages[1].id, :id => ugly_id, :part => 2, :file_name => 'hello.txt', :skip_cache => 1 + get :get_attachment_as_html, :incoming_message_id => ir.incoming_messages[1].id, :id => ugly_id, :part => 2, :file_name => 'hello world.txt', :skip_cache => 1 }.should raise_error(ActiveRecord::RecordNotFound) end it "should return 404 when incoming message and request ids don't match" do @@ -542,7 +542,7 @@ describe RequestController, "when showing one request" do receive_incoming_mail('incoming-request-two-same-name.email', ir.incoming_email) ir.reload lambda { - get :get_attachment_as_html, :incoming_message_id => ir.incoming_messages[1].id, :id => wrong_id, :part => 2, :file_name => 'hello.txt.html', :skip_cache => 1 + get :get_attachment_as_html, :incoming_message_id => ir.incoming_messages[1].id, :id => wrong_id, :part => 2, :file_name => 'hello world.txt.html', :skip_cache => 1 }.should raise_error(ActiveRecord::RecordNotFound) end it "should return 404 for ugly URLs contain a request id that isn't an integer, even if the integer prefix refers to an actual request" do @@ -552,11 +552,11 @@ describe RequestController, "when showing one request" do ugly_id = "%d95" % [info_requests(:naughty_chicken_request).id] lambda { - get :get_attachment, :incoming_message_id => ir.incoming_messages[1].id, :id => ugly_id, :part => 2, :file_name => 'hello.txt.html', :skip_cache => 1 + get :get_attachment, :incoming_message_id => ir.incoming_messages[1].id, :id => ugly_id, :part => 2, :file_name => 'hello world.txt.html', :skip_cache => 1 }.should raise_error(ActiveRecord::RecordNotFound) lambda { - get :get_attachment_as_html, :incoming_message_id => ir.incoming_messages[1].id, :id => ugly_id, :part => 2, :file_name => 'hello.txt', :skip_cache => 1 + get :get_attachment_as_html, :incoming_message_id => ir.incoming_messages[1].id, :id => ugly_id, :part => 2, :file_name => 'hello world.txt', :skip_cache => 1 }.should 
raise_error(ActiveRecord::RecordNotFound) end it "should return 404 when incoming message and request ids don't match" do @@ -565,7 +565,7 @@ describe RequestController, "when showing one request" do receive_incoming_mail('incoming-request-two-same-name.email', ir.incoming_email) ir.reload lambda { - get :get_attachment_as_html, :incoming_message_id => ir.incoming_messages[1].id, :id => wrong_id, :part => 2, :file_name => 'hello.txt.html', :skip_cache => 1 + get :get_attachment_as_html, :incoming_message_id => ir.incoming_messages[1].id, :id => wrong_id, :part => 2, :file_name => 'hello world.txt.html', :skip_cache => 1 }.should raise_error(ActiveRecord::RecordNotFound) end @@ -573,44 +573,66 @@ describe RequestController, "when showing one request" do ir = info_requests(:fancy_dog_request) receive_incoming_mail('incoming-request-pdf-attachment.email', ir.incoming_email) ir.reload - get :get_attachment_as_html, :incoming_message_id => ir.incoming_messages[1].id, :id => ir.id, :part => 2, :file_name => 'fs_50379341.pdf.html', :skip_cache => 1 + get :get_attachment_as_html, :incoming_message_id => ir.incoming_messages[1].id, :id => ir.id, :part => 2, :file_name => 'fs 50379341.pdf.html', :skip_cache => 1 response.content_type.should == "text/html" response.should contain "Walberswick Parish Council" end - it "should not cause a reparsing of the raw email, even when the result would be a 404" do + it "should not cause a reparsing of the raw email, even when the attachment can't be found" do ir = info_requests(:fancy_dog_request) receive_incoming_mail('incoming-request-two-same-name.email', ir.incoming_email) ir.reload - attachment = IncomingMessage.get_attachment_by_url_part_number(ir.incoming_messages[1].get_attachments_for_display, 2) + attachment = IncomingMessage.get_attachment_by_url_part_number_and_filename(ir.incoming_messages[1].get_attachments_for_display, 2, 'hello world.txt') attachment.body.should contain "Second hello" # change the raw_email associated 
with the message; this only be reparsed when explicitly asked for ir.incoming_messages[1].raw_email.data = ir.incoming_messages[1].raw_email.data.sub("Second", "Third") - # asking for an attachment by the wrong filename results - # in a 404 for browsing users. This shouldn't cause a - # re-parse... - lambda { - get :get_attachment_as_html, :incoming_message_id => ir.incoming_messages[1].id, :id => ir.id, :part => 2, :file_name => 'hello.txt.baz.html', :skip_cache => 1 - }.should raise_error(ActiveRecord::RecordNotFound) + # asking for an attachment by the wrong filename should result in redirecting + # back to the incoming message, but shouldn't cause a reparse: + get :get_attachment_as_html, :incoming_message_id => ir.incoming_messages[1].id, :id => ir.id, :part => 2, :file_name => 'hello world.txt.baz.html', :skip_cache => 1 + response.status.should == 303 - attachment = IncomingMessage.get_attachment_by_url_part_number(ir.incoming_messages[1].get_attachments_for_display, 2) + attachment = IncomingMessage.get_attachment_by_url_part_number_and_filename(ir.incoming_messages[1].get_attachments_for_display, 2, 'hello world.txt') attachment.body.should contain "Second hello" # ...nor should asking for it by its correct filename... 
- get :get_attachment_as_html, :incoming_message_id => ir.incoming_messages[1].id, :id => ir.id, :part => 2, :file_name => 'hello.txt.html', :skip_cache => 1 + get :get_attachment_as_html, :incoming_message_id => ir.incoming_messages[1].id, :id => ir.id, :part => 2, :file_name => 'hello world.txt.html', :skip_cache => 1 response.should_not contain "Third hello" # ...but if we explicitly ask for attachments to be extracted, then they should be force = true ir.incoming_messages[1].parse_raw_email!(force) ir.reload - attachment = IncomingMessage.get_attachment_by_url_part_number(ir.incoming_messages[1].get_attachments_for_display, 2) + attachment = IncomingMessage.get_attachment_by_url_part_number_and_filename(ir.incoming_messages[1].get_attachments_for_display, 2, 'hello world.txt') attachment.body.should contain "Third hello" - get :get_attachment_as_html, :incoming_message_id => ir.incoming_messages[1].id, :id => ir.id, :part => 2, :file_name => 'hello.txt.html', :skip_cache => 1 + get :get_attachment_as_html, :incoming_message_id => ir.incoming_messages[1].id, :id => ir.id, :part => 2, :file_name => 'hello world.txt.html', :skip_cache => 1 response.should contain "Third hello" end + it "should redirect to the incoming message if there's a wrong part number and an ambiguous filename" do + ir = info_requests(:fancy_dog_request) + receive_incoming_mail('incoming-request-two-same-name.email', ir.incoming_email) + ir.reload + + im = ir.incoming_messages[1] + + attachment = IncomingMessage.get_attachment_by_url_part_number_and_filename(im.get_attachments_for_display, 5, 'hello world.txt') + attachment.should be_nil + + get :get_attachment_as_html, :incoming_message_id => im.id, :id => ir.id, :part => 5, :file_name => 'hello world.txt', :skip_cache => 1 + response.status.should == 303 + new_location = response.header['Location'] + new_location.should match(/request\/#{ir.url_title}#incoming-#{im.id}/) + end + + it "should find a uniquely named filename even if the URL 
part number was wrong" do + ir = info_requests(:fancy_dog_request) + receive_incoming_mail('incoming-request-pdf-attachment.email', ir.incoming_email) + ir.reload + get :get_attachment, :incoming_message_id => ir.incoming_messages[1].id, :id => ir.id, :part => 5, :file_name => 'fs 50379341.pdf', :skip_cache => 1 + response.content_type.should == "application/pdf" + end + it "should treat attachments with unknown extensions as binary" do ir = info_requests(:fancy_dog_request) receive_incoming_mail('incoming-request-attachment-unknown-extension.email', ir.incoming_email) @@ -625,10 +647,8 @@ describe RequestController, "when showing one request" do ir = info_requests(:fancy_dog_request) receive_incoming_mail('incoming-request-two-same-name.email', ir.incoming_email) - lambda { - get :get_attachment, :incoming_message_id => ir.incoming_messages[1].id, :id => ir.id, :part => 2, - :file_name => 'http://trying.to.hack' - }.should raise_error(ActiveRecord::RecordNotFound) + get :get_attachment, :incoming_message_id => ir.incoming_messages[1].id, :id => ir.id, :part => 2, :file_name => 'http://trying.to.hack' + response.status.should == 303 end it "should censor attachments downloaded as binary" do @@ -644,7 +664,7 @@ describe RequestController, "when showing one request" do begin receive_incoming_mail('incoming-request-two-same-name.email', ir.incoming_email) - get :get_attachment, :incoming_message_id => ir.incoming_messages[1].id, :id => ir.id, :part => 2, :file_name => 'hello.txt', :skip_cache => 1 + get :get_attachment, :incoming_message_id => ir.incoming_messages[1].id, :id => ir.id, :part => 2, :file_name => 'hello world.txt', :skip_cache => 1 response.content_type.should == "text/plain" response.should contain "xxxxxx hello" ensure @@ -666,7 +686,7 @@ describe RequestController, "when showing one request" do receive_incoming_mail('incoming-request-two-same-name.email', ir.incoming_email) ir.reload - get :get_attachment, :incoming_message_id => 
ir.incoming_messages[1].id, :id => ir.id, :part => 2, :file_name => 'hello.txt', :skip_cache => 1 + get :get_attachment, :incoming_message_id => ir.incoming_messages[1].id, :id => ir.id, :part => 2, :file_name => 'hello world.txt', :skip_cache => 1 response.content_type.should == "text/plain" response.should contain "xxxxxx hello" ensure @@ -695,11 +715,13 @@ describe RequestController, "when showing one request" do # so at this point, assigns[:info_request].incoming_messages[1].get_attachments_for_display is returning stuff, but the equivalent thing in the template isn't. # but something odd is that the above is return a whole load of attachments which aren't there in the controller response.body.should have_selector("p.attachment strong") do |s| - s.should contain /hello.txt/m + s.should contain /hello world.txt/m end censor_rule = CensorRule.new() - censor_rule.text = "hello.txt" + # Note that the censor rule applies to the original filename, + # not the display_filename: + censor_rule.text = "hello-world.txt" censor_rule.replacement = "goodbye.txt" censor_rule.last_edit_editor = "unknown" censor_rule.last_edit_comment = "none" @@ -743,7 +765,7 @@ describe RequestController, "when showing one request" do old_path = assigns[:url_path] response.location.should contain /#{assigns[:url_path]}$/ zipfile = Zip::ZipFile.open(File.join(File.dirname(__FILE__), "../../cache/zips", old_path)) { |zipfile| - zipfile.count.should == 3 # the message plus two "hello.txt" files + zipfile.count.should == 3 # the message plus two "hello-world.txt" files } # The path of the zip file is based on the hash of the timestamp of the last request @@ -756,7 +778,7 @@ describe RequestController, "when showing one request" do assigns[:url_path].should_not == old_path response.location.should contain assigns[:url_path] zipfile = Zip::ZipFile.open(File.join(File.dirname(__FILE__), "../../cache/zips", assigns[:url_path])) { |zipfile| - zipfile.count.should == 4 # the message, two hello.txt plus 
the unknown attachment + zipfile.count.should == 4 # the message, two hello-world.txt plus the unknown attachment } end @@ -875,7 +897,7 @@ describe RequestController, "when changing prominence of a request" do get :get_attachment_as_html, :incoming_message_id => ir.incoming_messages[1].id, :id => ir.id, :part => 2, - :file_name => 'hello.txt' + :file_name => 'hello world.txt' end.should raise_error(ActiveRecord::RecordNotFound) end @@ -890,7 +912,7 @@ describe RequestController, "when changing prominence of a request" do get :get_attachment_as_html, :incoming_message_id => ir.incoming_messages[1].id, :id => ir.id, :part => 2, - :file_name => 'hello.txt' + :file_name => 'hello world.txt' end.should raise_error(ActiveRecord::RecordNotFound) end @@ -2394,7 +2416,7 @@ describe RequestController, "when caching fragments" do attachment = mock(FoiAttachment, :display_filename => long_name, :body_as_html => ['some text', 'wrapper']) IncomingMessage.stub!(:find).with("44").and_return(incoming_message) - IncomingMessage.stub!(:get_attachment_by_url_part_number).and_return(attachment) + IncomingMessage.stub!(:get_attachment_by_url_part_number_and_filename).and_return(attachment) InfoRequest.stub!(:find).with("132").and_return(info_request) params = { :file_name => long_name, :controller => "request", diff --git a/spec/fixtures/files/incoming-request-two-same-name.email b/spec/fixtures/files/incoming-request-two-same-name.email index f1024d607..ecd322fe4 100644 --- a/spec/fixtures/files/incoming-request-two-same-name.email +++ b/spec/fixtures/files/incoming-request-two-same-name.email @@ -13,13 +13,13 @@ Content-Disposition: inline --Q68bSM7Ycu6FN28Q Content-Type: text/plain; charset=us-ascii -Content-Disposition: attachment; filename="hello.txt" +Content-Disposition: attachment; filename="hello-world.txt" Second hello --Q68bSM7Ycu6FN28Q Content-Type: text/plain; charset=us-ascii -Content-Disposition: attachment; filename="hello.txt" +Content-Disposition: attachment; 
filename="hello-world.txt" First hello diff --git a/spec/fixtures/files/inline-uuencode.email b/spec/fixtures/files/inline-uuencode.email new file mode 100644 index 000000000..3134ba3ad --- /dev/null +++ b/spec/fixtures/files/inline-uuencode.email @@ -0,0 +1,27 @@ +From foo@bar Mon Jun 01 17:14:44 2009 +Return-path: <foo@bar> +Envelope-to: foi@quux +Delivery-date: Mon, 01 Jun 2009 17:14:44 +0100 +From: <foo@bar> +To: <request-whatever@quux> +Subject: something or other +Date: Mon, 1 Jun 2009 17:14:37 +0100 +X-MimeOLE: Produced By Microsoft MimeOLE V6.00.3790.181 +Message-ID: <baz@xyzzy> + +Thanks for your email - here's a truncated attachment +for you: + +********************************************************************** + +begin 666 ResponseT7363 9.doc +MT,\1X*&Q&N$`````````````````````/@`#`/[_"0`&```````````````" +M````) ``````````$ ``+@````$```#^____`````",```!L````________ +M```````````````````````````````````````````````````````````` +M```````````````````````````````````````````````````````````` +#```` +` +end + +The original of this email was scanned for viruses or something +like that. 
diff --git a/spec/fixtures/files/malformed-to-and-cc.email b/spec/fixtures/files/malformed-to-and-cc.email new file mode 100644 index 000000000..4fbb6e21e --- /dev/null +++ b/spec/fixtures/files/malformed-to-and-cc.email @@ -0,0 +1,11 @@ +From foo@bar Wed Mar 12 14:58:26 2008 +Return-path: <foo@bar> +Subject: example email +To: <bar@example.org +Cc: baz@example.org> +From: quux@example.org +Date: Mon, 7 May 2012 12:47:06 +0100 +Mime-Version: 1.0 +Content-Type: text/plain; charset=utf-8 + +A very basic email, but with malformed To: and Cc: lines diff --git a/spec/fixtures/files/mislabelled-as-iso-8859-1.email b/spec/fixtures/files/mislabelled-as-iso-8859-1.email new file mode 100644 index 000000000..6c8e6109e --- /dev/null +++ b/spec/fixtures/files/mislabelled-as-iso-8859-1.email @@ -0,0 +1,20 @@ +From foo@bar Thu Mar 01 15:02:33 2012 +Return-path: <foo@bar> +Envelope-to: foi@quux +Delivery-date: Thu, 01 Mar 2012 15:02:33 +0000 +Date: Thu, 01 Mar 2012 15:01:58 +0000 +Subject: some FOI request +To: foi@quux +From: foo@bar +MIME-Version: 1.0 +Content-Type: text/plain; charset="iso-8859-1" +Content-Transfer-Encoding: 7bit +Message-Id: <2468@bar.local> + +Dear Whoever, + +THERE'S A DASH NEXT REQUEST FOR INFORMATION + +Best regards, +Other Person + diff --git a/spec/fixtures/files/multipart-no-final-boundary.email b/spec/fixtures/files/multipart-no-final-boundary.email new file mode 100644 index 000000000..9c16dad52 --- /dev/null +++ b/spec/fixtures/files/multipart-no-final-boundary.email @@ -0,0 +1,21 @@ +From foo@bar Thu Sep 13 10:34:44 2012 +Return-path: <foo@bar> +Envelope-to: foi@example.org +Delivery-date: Thu, 13 Sep 2012 10:34:44 +0100 +From: foo@bar +To: foi@example.org +Subject: an acknowledgement email +Date: Thu, 13 Sep 2012 10:08:03 +0100 +Message-ID: <987654@foo.local> +Content-Type: multipart/mixed; boundary="-----7D81B75CCC90D2974F7A1CBD" + +This is a multi-part message in MIME format. 
+-------7D81B75CCC90D2974F7A1CBD +Content-Type: text/html + +<div> + <p> + This is an acknowledgement of your email, that irritatingly + leaves out the final MIME boundary. + </p> +<div> diff --git a/spec/fixtures/files/nested-attachments-premature-end.email b/spec/fixtures/files/nested-attachments-premature-end.email new file mode 100644 index 000000000..6b13808dc --- /dev/null +++ b/spec/fixtures/files/nested-attachments-premature-end.email @@ -0,0 +1,110 @@ +From someone@example.org Mon May 15 13:10:29 2012 +Return-path: <someone@example.org> +Envelope-to: foi@example.org +Delivery-date: Mon, 15 May 2012 13:10:29 +0100 +Message-Id: <abcde@baz.local> +Date: Mon, 15 May 2012 09:48:48 +0100 +From: "Example Person" <someone@example.org> +To: <request@example.org> +Subject: some FOI request or other +Mime-Version: 1.0 +Content-Type: multipart/mixed; boundary="=__outer__=" + +This is a MIME message. If you are reading this text, you may want to +consider changing to a mail reader or gateway that understands how to +properly handle MIME multipart messages. + +--=__outer__= +Content-Type: multipart/alternative; boundary="=__inner__=" + +--=__inner__= +Content-Type: text/plain; charset="utf-8" +Content-Transfer-Encoding: quoted-printable +X-MIME-Autoconverted: from 8bit to quoted-printable by something + +Hello +=20 +Please find some information attached. 
+=20 + +--=__inner__= +Content-Description: HTML +Content-Type: text/html; charset="utf-8" +Content-Transfer-Encoding: quoted-printable + +<html> + <head> + <title>some title text</title> + </head> + <body> + <p>blah blah blah</p> + </body> +</html> + +--=__inner__=-- + +--=__outer__= +Content-Type: message/rfc822 + +Return-path: <foo@bar> +Date: Mon, 7 May 2012 12:47:06 +0100 +From: someone-else@example.org +To: foi@example.org +Message-Id: <56789@quux.local> +Subject: a freedom of information requests +Mime-Version: 1.0 +Content-Type: text/plain; charset=utf-8 + + Dear Whoever, + + Please could you let me know, um, whatever ... + + Yours faithfully, + + Whoever I Am + +--=__outer__= +Content-Type: text/plain; charset=US-ASCII +Content-Disposition: inline +Content-Transfer-Encoding: quoted-printable + + Dear Whowever, + =20 + Please could you let me know, um, whatever ... + =20 + Yours faithfully, + =20 + Whoever I Am + =20 + +--=__outer__=-- + +--=__outer__= +Content-Type: application/png; name="maroon-square.png" +Content-Transfer-Encoding: base64 +Content-Disposition: attachment; filename="maroon-square.png" + +iVBORw0KGgoAAAANSUhEUgAAAEEAAABCCAYAAAAIY7vrAAAABmJLR0QA/wD/AP+g +vaeTAAAACXBIWXMAAAsTAAALEwEAmpwYAAAAB3RJTUUH3QQeDSEx8qultwAAABl0 +RVh0Q29tbWVudABDcmVhdGVkIHdpdGggR0lNUFeBDhcAAAMzSURBVHja7VtL2psw +DNS4rPv1Gj1Kt71Az9ZT9F7dN9MFGGThB/YfKDX2Kp8DRBpLowcKvn/5ShERiAgl +srh8aT93tJzWdae8XR0CEICwUx59K54H4QFKp0Eg5alrAwEYIDx5DRAGCAOEAcIA +QaUFfDoIHJawpEbOPd0dRPjJDWIUiEwt933+8es2Ovz++a3dCkREXmwD4ZbsVln6 +cLkef14duAMqAGCkY0A+jBNgXGFZU/eKa3fhZjlQqLhHKF9oFbpulE2Z/oFrXTd+ +nlOWkn1dMHXrAiWguq0iG9uk/REjBggPtgQOED781my4wwBhgDBAmPmUAwR0X0UO +dxggnA8CO5xocU8HoAoEDwA6nOyCH+ZMKQ4zy+QbNBoUirquMPBJcgPyJkOi+c7S +ohhn6ZctzDIrcFalIspYILG1et9WABUtt6WztLq+/0Amp9sCnsCBUhfvK4FLiRCA +QwC7JABGTngrIIPnIjf6R5We0uxz3j+FbCvdy2nlY/IgcfrMRQuFHIC9Sap3AW8n +2gZ+cZYCVn4LzBxxnykNgJpWN8lt7yw+QCMxan2s8lQXcNlDlpAW7YmIXMszTgoH +rU91+8OFYXN9ikz/LyLgExSCDlaO+cdGsIEQkyUAIgFMKRTEn3vDjFFHwWSIzEQC 
+cmN4IHVNGG2PQXhhsuRl3jihwQyB6H1274gV1BhKLKNt4ZEpkygeeoC+xytdK1cr +oX0EACphnTZXbbLMmL/YBGo9lSU1OmBONMnTlQUqTa4y1VgAddg0hdTR04lyT0Xq +8RYAyHVyBX6ET/9wTBD6TWVCMH5Qo3yhXju3bNY/BBMdsoLYBMmnzQdOP56O36s5 +40r1D7UWYV5dNT2nbxVBAHb43Y36CdbXfTii6isU/U7ZXLQ4w/V/wotFoilVF2kl +w7YCDrIPkj4/G9fao7q0rYSSJdgeSqmQrCU+r/j8rOv/gpuKPm5Lffen5eN+ljeo +rcfW0Om2Enm9KwDZAgrG98txX9cMe6X2E5SGU29VTE17lFAUkMybsXclndu31BGX +hcgWv8oxonYtkf/jhc10WPGgm2IZncKlu+sg8vLm7hDSwk3f2/wFEzN3v6aAXQ0A +AAAASUVORK5CYII= + +--=__outer__=-- + diff --git a/spec/fixtures/files/no-part-charset-random-data.email b/spec/fixtures/files/no-part-charset-random-data.email new file mode 100644 index 000000000..d51fd3f38 --- /dev/null +++ b/spec/fixtures/files/no-part-charset-random-data.email @@ -0,0 +1,30 @@ +From xxxx@yahoo.cn Mon Oct 08 14:01:34 2012 +Return-path: <xxxx@yahoo.cn> +Envelope-to: foi@atlas.ukcod.org.uk +Delivery-date: Mon, 08 Oct 2012 14:01:34 +0100 +Received: (qmail 63864 invoked from network); 8 Oct 2012 13:01:12 -0000 +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=yahoo.cn; s=s1024; t=1349701272; bh=T/mtlIYvhB/L5RO+CvTazeAdGf1n1zsGXBoA8EKGT9M=; h=Message-ID:X-Yahoo-Newman-Property:X-YMail-OSG:X-Yahoo-SMTP:Received:X-mailer:From:Subject:To:Content-Transfer-Encoding:Content-Type:Date; b=LYI/PXvA7DA746bmyprChUg7N8YDvN9XE/bhfTt5MW7siOmxHHzn1w+s5X33PvLI0x0UfJLo+MCkTnGPKnG5BYY38US8PkocJYyphrvF/eaUl3ALf8UvxHBOJX1iIi89Xp2NnfbS8lz9kZAWifb9GOnOA5/kLDcL5/WJXliit2k= +Message-ID: <xxxx@xxxx.yahoo.com> +X-Yahoo-Newman-Property: ymail-5 +X-YMail-OSG: nPs5jgsVM1myUoKjeEPTxxalz4BM6BZMEUYu.E8NPMPQyo_ + Yej8T2WCTurn767NOwhuDIqNxC2QGZINqfjmKcdyW7a1P_Zxqr9GsjgxODci + ihwr7qYAGDDbcsrB.PX4epnJZHl3yAwoGW.1ReEZnXQANFcNep7.zNEbZ_2k + RU1IhI9aHYvxPxt5RWugwOoFRh9P8Ym35A88IMazNtVaBiBEXF6Vk8Aqr9XP + 3Vh9xOT9Pn6X8qOUjNXkdb3xB4S5AAIRSE9mqhL1KzHBwdVQs25IoM_2FV2b + gPsQGgL4_mwBH0WcEMhdj7Kn6Nfb44L.50E_V3DH.8P7KzDK8zNVXSbAqohX + Qi6MzUK2frr8IyZyYzHb.ekff7kAcJgUoHvhnyPar8tRYxhQT3_xsUTzsx8N + 
oWckVPh_i3OT7U4ObgekqgtteMoYqPH2eF1SZXamGBAs- +X-Yahoo-SMTP: YUQHwRWswBDjbw_M.D6EP4KpT9khlJErDRBQi4ySZQ-- +X-mailer: MIME::Lite 3.027 (F2.74; T1.31; A2.07; B3.13; Q3.13) +From: =?GB2312?B?zsJKaWFu?= Bing <xxxx@yahoo.cn> +Subject: =?GB2312?B?yM7A1svJ?= +To: FOI Person <EMAIL_TO> +Content-Transfer-Encoding: base64 +Content-Type: text/plain +Date: Tue, 9 Oct 2012 20:53:06 +0800 + +HPBSqsndNBX+ER4hyBoPhhnclcWKVFgbevdD5cJvfI/ARbxRYqA28hZ49Pf6A/ks +NdVh4N5VPgRs/7SHYPfw5625pZJYTLj6nVdYk76sxnjiiAmwCJWGjPoWvO7nHUBv +fuLXtNVq5HmD0bWWjAbSk2n74PW7v5izbNO2fjHyiyX2CIof0rriXDmOldJqoebO +ejybrjG+Tahpu3FF1Mw98HfswzkdB46u/izLCzdUQVM= + diff --git a/spec/fixtures/files/part-without-charset-in-content-type.email b/spec/fixtures/files/part-without-charset-in-content-type.email new file mode 100644 index 000000000..439d52cc3 --- /dev/null +++ b/spec/fixtures/files/part-without-charset-in-content-type.email @@ -0,0 +1,38 @@ +From example@example.com Wed Sep 15 17:55:40 2010 +Return-path: <example@example.com> +Envelope-to: example@example.com +Delivery-date: Wed, 15 Sep 2010 17:55:40 +0100 +From: <example@example.com> +To: <request-xxxxx@whatdotheyknow.com> +Date: Wed, 15 Sep 2010 17:56:03 +0100 +Subject: FOI Internal Review response +Thread-Topic: FOI Internal Review response +Thread-Index: xxxxx +Message-ID: <xxxxxx> +Accept-Language: en-US, en-GB +Content-Language: en-US +X-MS-Has-Attach: yes +X-MS-TNEF-Correlator: +acceptlanguage: en-US, en-GB +Content-Type: multipart/mixed; + boundary="_002_E6527350F565F54A88C36C23F6C2B86702618AD0DF95SDCCPMSXMB5_" +MIME-Version: 1.0 + +--_002_E6527350F565F54A88C36C23F6C2B86702618AD0DF95SDCCPMSXMB5_ +Content-Type: text/plain; charset="utf-8" +Content-Transfer-Encoding: base64 + +someencodedtext= + +--_002_E6527350F565F54A88C36C23F6C2B86702618AD0DF95SDCCPMSXMB5_ +Content-Type: document/pdf; name="document.pdf" +Content-Description: document.pdf +Content-Disposition: attachment; filename="document.pdf"; + size=62103; creation-date="Wed, 15 Sep 2010 
17:54:27 GMT"; + modification-date="Wed, 15 Sep 2010 17:54:27 GMT" +Content-Transfer-Encoding: base64 + +somemoreencodedtext= + +--_002_E6527350F565F54A88C36C23F6C2B86702618AD0DF95SDCCPMSXMB5_-- + diff --git a/spec/fixtures/files/tnef-attachment-empty.email b/spec/fixtures/files/tnef-attachment-empty.email new file mode 100644 index 000000000..7967aa95b --- /dev/null +++ b/spec/fixtures/files/tnef-attachment-empty.email @@ -0,0 +1,196 @@ +From hello@blah.local Fri Feb 21 16:23:14 2013 +Return-path: <bar@example.org> +Envelope-to: foo@example.org +Delivery-date: Fri, 21 Feb 2013 16:23:14 +0000 +Content-Type: multipart/mixed; + boundary="_000_553468B23EE29B4F8836CBD0E1B2A15A275C3AA855POLNIEXMBV2po_" +From: <bar@example.org> +To: <foo@example.org> +Sender: <hello@blah.local> +Date: Fri, 21 Feb 2013 16:23:04 +0000 +Subject: here's a useless email +Message-ID: <12345@blah.local> +Accept-Language: en-US, en-GB +Content-Language: en-US +X-MS-Has-Attach: +X-MS-TNEF-Correlator: <12345@blah.local> +acceptlanguage: en-US, en-GB +MIME-Version: 1.0 + +--_000_553468B23EE29B4F8836CBD0E1B2A15A275C3AA855POLNIEXMBV2po_ +Content-Type: text/plain; charset="us-ascii" +Content-Transfer-Encoding: quoted-printable + +This attachment just has a body from one of the tests +in the tnef package in Debian. 
+ +--_000_553468B23EE29B4F8836CBD0E1B2A15A275C3AA855POLNIEXMBV2po_ +Content-Disposition: attachment; filename="winmail.dat" +Content-Transfer-Encoding: base64 +Content-Type: application/ms-tnef; name="winmail.dat" + +eJ8+IiURAQaQCAAEAAAAAAABAAEAAQeQBgAIAAAA5AQAAAAAAADoAAENgAQAAgAA +AAIAAgABBYADAA4AAADVBwQAGQAKAA8AIwABADYBASCAAwAOAAAA1QcEABkACgAP +ACQAAQA3AQEJgAEAIQAAADBEREEwRkNCQ0MwN0MxNDE5MkVFODZGQzQyRDE1Qjk1 +AGYHAQSQBgBkAgAAAQAAAA8AAAAfAAEwAQAAABAAAAAzAGsAdQBzAGUAcgAyAAAA +HwACMAEAAAAGAAAARQBYAAAAAAAfAAMwAQAAAI4AAAAvAE8APQBCAFIALQBFAFgA +QwBIAC0AVABFAFMAVAAvAE8AVQA9AEYASQBSAFMAVAAgAEEARABNAEkATgBJAFMA +VABSAEEAVABJAFYARQAgAEcAUgBPAFUAUAAvAEMATgA9AFIARQBDAEkAUABJAEUA +TgBUAFMALwBDAE4APQAzAGsAdQBzAGUAcgAyAAAAAAADAAAwAAAAAAMA/18AAAAA +AwAVDAEAAAACAQswAQAAAEoAAABFWDovTz1CUi1FWENILVRFU1QvT1U9RklSU1Qg +QURNSU5JU1RSQVRJVkUgR1JPVVAvQ049UkVDSVBJRU5UUy9DTj0zS1VTRVIyAAAA +HwAgOgEAAAAQAAAAMwBrAHUAcwBlAHIAMgAAAAMA/V8BAAAACwBAOgAA+T8CAfdf +AQAAAGMAAAAAAAAA3KdAyMBCEBq0uQgAKy/hggEAAAAAAAAAL289QlItRVhDSC1U +RVNUL291PUZpcnN0IEFkbWluaXN0cmF0aXZlIEdyb3VwL2NuPVJlY2lwaWVudHMv +Y249M2t1c2VyMgAAAwAAOQAAAAAfAP45AQAAAEoAAAAzAGsAdQBzAGUAcgAyAEAA +YgByAGUAeABjAGgAYQBuAGcAZQAuAGQAbwBsAHAAaABpAG4AcwBlAGEAcgBjAGgA +LgBjAG8AbQAAAAAAAwBxOgAAAAAfAPZfAQAAABAAAAAzAGsAdQBzAGUAcgAyAAAA +m2sBA5AGAEwbAAAzAAAACwACAAEAAAAfABoAAQAAABIAAABJAFAATQAuAE4AbwB0 +AGUAAAAAAAMAJgAAAAAAAwA2AAAAAAAfADcAAQAAAB4AAABCAGkAbABsACAAbwBm +ACAAUgBpAGcAaAB0AHMAAAAAAEAAOQBgQvtkuknFAR8APQABAAAAAgAAAAAAAAAC +AUcAAQAAADgAAABjPXVzO2E9IDtwPUJSLUVYQ0gtVEVTVDtsPUJSLUVYQ0gtREVW +MS0wNTA0MjUxNzE1MzZaLTE0AB8AcAABAAAAHgAAAEIAaQBsAGwAIABvAGYAIABS +AGkAZwBoAHQAcwAAAAAAAgFxAAEAAAAWAAAAAcVJumT7yarjal9+TnmqsNvwaipi +/QAAHwAaDAEAAAAQAAAAMwBrAHIAZQBsAGEAeQAAAB8AHQ4BAAAAHgAAAEIAaQBs +AGwAIABvAGYAIABSAGkAZwBoAHQAcwAAAAAAAgETEAEAAADuFAAAPCFET0NUWVBF +IEhUTUwgUFVCTElDICItLy9XM0MvL0RURCBIVE1MIDQuMCBUcmFuc2l0aW9uYWwv +L0VOIj4NCjxIVE1MPjxIRUFEPg0KPE1FVEEgaHR0cC1lcXVpdj1Db250ZW50LVR5 +cGUgY29udGVudD0idGV4dC9odG1sOyBjaGFyc2V0PXVzLWFzY2lpIj4NCjxNRVRB 
+IGNvbnRlbnQ9Ik1TSFRNTCA2LjAwLjM3OTAuMTgzMCIgbmFtZT1HRU5FUkFUT1I+ +PC9IRUFEPg0KPEJPRFk+DQo8RElWPg0KPERJVj48Rk9OVCBmYWNlPUFyaWFsIHNp +emU9Mj5USEUgQklMTCBPRiBSSUdIVFM8QlI+QW1lbmRtZW50cyAxLTEwIG9mIHRo +ZSANCkNvbnN0aXR1dGlvbjwvRk9OVD48L0RJVj4NCjxESVY+Jm5ic3A7PC9ESVY+ +DQo8RElWPjxGT05UIGZhY2U9QXJpYWwgc2l6ZT0yPlRoZSBDb252ZW50aW9ucyBv +ZiBhIG51bWJlciBvZiB0aGUgU3RhdGVzIGhhdmluZywgDQphdCB0aGUgdGltZSBv +ZiBhZG9wdGluZyB0aGUgQ29uc3RpdHV0aW9uLCBleHByZXNzZWQgYSBkZXNpcmUs +IGluIG9yZGVyIHRvIA0KcHJldmVudCBtaXNjb25zdHJ1Y3Rpb24gb3IgYWJ1c2Ug +b2YgaXRzIHBvd2VycywgdGhhdCBmdXJ0aGVyIGRlY2xhcmF0b3J5IGFuZCANCnJl +c3RyaWN0aXZlIGNsYXVzZXMgc2hvdWxkIGJlIGFkZGVkLCBhbmQgYXMgZXh0ZW5k +aW5nIHRoZSBncm91bmQgb2YgcHVibGljIA0KY29uZmlkZW5jZSBpbiB0aGUgR292 +ZXJubWVudCB3aWxsIGJlc3QgaW5zdXJlIHRoZSBiZW5lZmljZW50IGVuZHMgb2Yg +aXRzIA0KaW5zdGl0dXRpb247IDxCUj5SZXNvbHZlZCwgYnkgdGhlIFNlbmF0ZSBh +bmQgSG91c2Ugb2YgUmVwcmVzZW50YXRpdmVzIG9mIHRoZSANClVuaXRlZCBTdGF0 +ZXMgb2YgQW1lcmljYSwgaW4gQ29uZ3Jlc3MgYXNzZW1ibGVkLCB0d28tdGhpcmRz +IG9mIGJvdGggSG91c2VzIA0KY29uY3VycmluZywgdGhhdCB0aGUgZm9sbG93aW5n +IGFydGljbGVzIGJlIHByb3Bvc2VkIHRvIHRoZSBMZWdpc2xhdHVyZXMgb2YgdGhl +IA0Kc2V2ZXJhbCBTdGF0ZXMsIGFzIGFtZW5kbWVudHMgdG8gdGhlIENvbnN0aXR1 +dGlvbiBvZiB0aGUgVW5pdGVkIFN0YXRlczsgYWxsIG9yIA0KYW55IG9mIHdoaWNo +IGFydGljbGVzLCB3aGVuIHJhdGlmaWVkIGJ5IHRocmVlLWZvdXJ0aHMgb2YgdGhl +IHNhaWQgTGVnaXNsYXR1cmVzLCANCnRvIGJlIHZhbGlkIHRvIGFsbCBpbnRlbnRz +IGFuZCBwdXJwb3NlcyBhcyBwYXJ0IG9mIHRoZSBzYWlkIENvbnN0aXR1dGlvbiwg +DQpuYW1lbHk6IDwvRk9OVD48L0RJVj4NCjxESVY+Jm5ic3A7PC9ESVY+DQo8RElW +PjxGT05UIGZhY2U9QXJpYWwgc2l6ZT0yPkFtZW5kbWVudCBJPC9GT05UPjwvRElW +Pg0KPERJVj4mbmJzcDs8L0RJVj4NCjxESVY+PEZPTlQgZmFjZT1BcmlhbCBzaXpl +PTI+Q29uZ3Jlc3Mgc2hhbGwgbWFrZSBubyBsYXcgcmVzcGVjdGluZyBhbiANCmVz +dGFibGlzaG1lbnQgb2YgcmVsaWdpb24sIG9yIHByb2hpYml0aW5nIHRoZSBmcmVl +IGV4ZXJjaXNlIHRoZXJlb2Y7IG9yIA0KYWJyaWRnaW5nIHRoZSBmcmVlZG9tIG9m +IHNwZWVjaCwgb3Igb2YgdGhlIHByZXNzOyBvciB0aGUgcmlnaHQgb2YgdGhlIHBl +b3BsZSANCnBlYWNlYWJseSB0byBhc3NlbWJsZSwgYW5kIHRvIHBldGl0aW9uIHRo 
+ZSBnb3Zlcm5tZW50IGZvciBhIHJlZHJlc3Mgb2YgDQpncmlldmFuY2VzLiA8L0ZP +TlQ+PC9ESVY+DQo8RElWPiZuYnNwOzwvRElWPg0KPERJVj48Rk9OVCBmYWNlPUFy +aWFsIHNpemU9Mj5BbWVuZG1lbnQgSUk8L0ZPTlQ+PC9ESVY+DQo8RElWPiZuYnNw +OzwvRElWPg0KPERJVj48Rk9OVCBmYWNlPUFyaWFsIHNpemU9Mj5BIHdlbGwgcmVn +dWxhdGVkIG1pbGl0aWEsIGJlaW5nIG5lY2Vzc2FyeSB0byB0aGUgDQpzZWN1cml0 +eSBvZiBhIGZyZWUgc3RhdGUsIHRoZSByaWdodCBvZiB0aGUgcGVvcGxlIHRvIGtl +ZXAgYW5kIGJlYXIgYXJtcywgc2hhbGwgDQpub3QgYmUgaW5mcmluZ2VkLiA8L0ZP +TlQ+PC9ESVY+DQo8RElWPiZuYnNwOzwvRElWPg0KPERJVj48Rk9OVCBmYWNlPUFy +aWFsIHNpemU9Mj5BbWVuZG1lbnQgSUlJPC9GT05UPjwvRElWPg0KPERJVj4mbmJz +cDs8L0RJVj4NCjxESVY+PEZPTlQgZmFjZT1BcmlhbCBzaXplPTI+Tm8gc29sZGll +ciBzaGFsbCwgaW4gdGltZSBvZiBwZWFjZSBiZSBxdWFydGVyZWQgaW4gDQphbnkg +aG91c2UsIHdpdGhvdXQgdGhlIGNvbnNlbnQgb2YgdGhlIG93bmVyLCBub3IgaW4g +dGltZSBvZiB3YXIsIGJ1dCBpbiBhIG1hbm5lciANCnRvIGJlIHByZXNjcmliZWQg +YnkgbGF3LiA8L0ZPTlQ+PC9ESVY+DQo8RElWPiZuYnNwOzwvRElWPg0KPERJVj48 +Rk9OVCBmYWNlPUFyaWFsIHNpemU9Mj5BbWVuZG1lbnQgSVY8L0ZPTlQ+PC9ESVY+ +DQo8RElWPiZuYnNwOzwvRElWPg0KPERJVj48Rk9OVCBmYWNlPUFyaWFsIHNpemU9 +Mj5UaGUgcmlnaHQgb2YgdGhlIHBlb3BsZSB0byBiZSBzZWN1cmUgaW4gdGhlaXIg +DQpwZXJzb25zLCBob3VzZXMsIHBhcGVycywgYW5kIGVmZmVjdHMsIGFnYWluc3Qg +dW5yZWFzb25hYmxlIHNlYXJjaGVzIGFuZCANCnNlaXp1cmVzLCBzaGFsbCBub3Qg +YmUgdmlvbGF0ZWQsIGFuZCBubyB3YXJyYW50cyBzaGFsbCBpc3N1ZSwgYnV0IHVw +b24gcHJvYmFibGUgDQpjYXVzZSwgc3VwcG9ydGVkIGJ5IG9hdGggb3IgYWZmaXJt +YXRpb24sIGFuZCBwYXJ0aWN1bGFybHkgZGVzY3JpYmluZyB0aGUgcGxhY2UgDQp0 +byBiZSBzZWFyY2hlZCwgYW5kIHRoZSBwZXJzb25zIG9yIHRoaW5ncyB0byBiZSBz +ZWl6ZWQuIDwvRk9OVD48L0RJVj4NCjxESVY+Jm5ic3A7PC9ESVY+DQo8RElWPjxG +T05UIGZhY2U9QXJpYWwgc2l6ZT0yPkFtZW5kbWVudCBWPC9GT05UPjwvRElWPg0K +PERJVj4mbmJzcDs8L0RJVj4NCjxESVY+PEZPTlQgZmFjZT1BcmlhbCBzaXplPTI+ +Tm8gcGVyc29uIHNoYWxsIGJlIGhlbGQgdG8gYW5zd2VyIGZvciBhIGNhcGl0YWws +IG9yIA0Kb3RoZXJ3aXNlIGluZmFtb3VzIGNyaW1lLCB1bmxlc3Mgb24gYSBwcmVz +ZW50bWVudCBvciBpbmRpY3RtZW50IG9mIGEgZ3JhbmQganVyeSwgDQpleGNlcHQg +aW4gY2FzZXMgYXJpc2luZyBpbiB0aGUgbGFuZCBvciBuYXZhbCBmb3JjZXMsIG9y 
+IGluIHRoZSBtaWxpdGlhLCB3aGVuIGluIA0KYWN0dWFsIHNlcnZpY2UgaW4gdGlt +ZSBvZiB3YXIgb3IgcHVibGljIGRhbmdlcjsgbm9yIHNoYWxsIGFueSBwZXJzb24g +YmUgc3ViamVjdCANCmZvciB0aGUgc2FtZSBvZmZlbnNlIHRvIGJlIHR3aWNlIHB1 +dCBpbiBqZW9wYXJkeSBvZiBsaWZlIG9yIGxpbWI7IG5vciBzaGFsbCBiZSANCmNv +bXBlbGxlZCBpbiBhbnkgY3JpbWluYWwgY2FzZSB0byBiZSBhIHdpdG5lc3MgYWdh +aW5zdCBoaW1zZWxmLCBub3IgYmUgZGVwcml2ZWQgDQpvZiBsaWZlLCBsaWJlcnR5 +LCBvciBwcm9wZXJ0eSwgd2l0aG91dCBkdWUgcHJvY2VzcyBvZiBsYXc7IG5vciBz +aGFsbCBwcml2YXRlIA0KcHJvcGVydHkgYmUgdGFrZW4gZm9yIHB1YmxpYyB1c2Us +IHdpdGhvdXQganVzdCBjb21wZW5zYXRpb24uIDwvRk9OVD48L0RJVj4NCjxESVY+ +Jm5ic3A7PC9ESVY+DQo8RElWPjxGT05UIGZhY2U9QXJpYWwgc2l6ZT0yPkFtZW5k +bWVudCBWSTwvRk9OVD48L0RJVj4NCjxESVY+Jm5ic3A7PC9ESVY+DQo8RElWPjxG +T05UIGZhY2U9QXJpYWwgc2l6ZT0yPkluIGFsbCBjcmltaW5hbCBwcm9zZWN1dGlv +bnMsIHRoZSBhY2N1c2VkIHNoYWxsIA0KZW5qb3kgdGhlIHJpZ2h0IHRvIGEgc3Bl +ZWR5IGFuZCBwdWJsaWMgdHJpYWwsIGJ5IGFuIGltcGFydGlhbCBqdXJ5IG9mIHRo +ZSBzdGF0ZSANCmFuZCBkaXN0cmljdCB3aGVyZWluIHRoZSBjcmltZSBzaGFsbCBo +YXZlIGJlZW4gY29tbWl0dGVkLCB3aGljaCBkaXN0cmljdCBzaGFsbCANCmhhdmUg +YmVlbiBwcmV2aW91c2x5IGFzY2VydGFpbmVkIGJ5IGxhdywgYW5kIHRvIGJlIGlu +Zm9ybWVkIG9mIHRoZSBuYXR1cmUgYW5kIA0KY2F1c2Ugb2YgdGhlIGFjY3VzYXRp +b247IHRvIGJlIGNvbmZyb250ZWQgd2l0aCB0aGUgd2l0bmVzc2VzIGFnYWluc3Qg +aGltOyB0byANCmhhdmUgY29tcHVsc29yeSBwcm9jZXNzIGZvciBvYnRhaW5pbmcg +d2l0bmVzc2VzIGluIGhpcyBmYXZvciwgYW5kIHRvIGhhdmUgdGhlIA0KYXNzaXN0 +YW5jZSBvZiBjb3Vuc2VsIGZvciBoaXMgZGVmZW5zZS4gPC9GT05UPjwvRElWPg0K +PERJVj4mbmJzcDs8L0RJVj4NCjxESVY+PEZPTlQgZmFjZT1BcmlhbCBzaXplPTI+ +QW1lbmRtZW50IFZJSTwvRk9OVD48L0RJVj4NCjxESVY+Jm5ic3A7PC9ESVY+DQo8 +RElWPjxGT05UIGZhY2U9QXJpYWwgc2l6ZT0yPkluIHN1aXRzIGF0IGNvbW1vbiBs +YXcsIHdoZXJlIHRoZSB2YWx1ZSBpbiANCmNvbnRyb3ZlcnN5IHNoYWxsIGV4Y2Vl +ZCB0d2VudHkgZG9sbGFycywgdGhlIHJpZ2h0IG9mIHRyaWFsIGJ5IGp1cnkgc2hh +bGwgYmUgDQpwcmVzZXJ2ZWQsIGFuZCBubyBmYWN0IHRyaWVkIGJ5IGEganVyeSwg +c2hhbGwgYmUgb3RoZXJ3aXNlIHJlZXhhbWluZWQgaW4gYW55IA0KY291cnQgb2Yg +dGhlIFVuaXRlZCBTdGF0ZXMsIHRoYW4gYWNjb3JkaW5nIHRvIHRoZSBydWxlcyBv 
+ZiB0aGUgY29tbW9uIGxhdy4gDQo8L0ZPTlQ+PC9ESVY+DQo8RElWPiZuYnNwOzwv +RElWPg0KPERJVj48Rk9OVCBmYWNlPUFyaWFsIHNpemU9Mj5BbWVuZG1lbnQgVklJ +STwvRk9OVD48L0RJVj4NCjxESVY+Jm5ic3A7PC9ESVY+DQo8RElWPjxGT05UIGZh +Y2U9QXJpYWwgc2l6ZT0yPkV4Y2Vzc2l2ZSBiYWlsIHNoYWxsIG5vdCBiZSByZXF1 +aXJlZCwgbm9yIGV4Y2Vzc2l2ZSANCmZpbmVzIGltcG9zZWQsIG5vciBjcnVlbCBh +bmQgdW51c3VhbCBwdW5pc2htZW50cyBpbmZsaWN0ZWQuIDwvRk9OVD48L0RJVj4N +CjxESVY+Jm5ic3A7PC9ESVY+DQo8RElWPjxGT05UIGZhY2U9QXJpYWwgc2l6ZT0y +PkFtZW5kbWVudCBJWDwvRk9OVD48L0RJVj4NCjxESVY+Jm5ic3A7PC9ESVY+DQo8 +RElWPjxGT05UIGZhY2U9QXJpYWwgc2l6ZT0yPlRoZSBlbnVtZXJhdGlvbiBpbiB0 +aGUgQ29uc3RpdHV0aW9uLCBvZiBjZXJ0YWluIA0KcmlnaHRzLCBzaGFsbCBub3Qg +YmUgY29uc3RydWVkIHRvIGRlbnkgb3IgZGlzcGFyYWdlIG90aGVycyByZXRhaW5l +ZCBieSB0aGUgDQpwZW9wbGUuIDwvRk9OVD48L0RJVj4NCjxESVY+Jm5ic3A7PC9E +SVY+DQo8RElWPjxGT05UIGZhY2U9QXJpYWwgc2l6ZT0yPkFtZW5kbWVudCBYPC9G +T05UPjwvRElWPg0KPERJVj4mbmJzcDs8L0RJVj4NCjxESVY+PEZPTlQgZmFjZT1B +cmlhbCBzaXplPTI+VGhlIHBvd2VycyBub3QgZGVsZWdhdGVkIHRvIHRoZSBVbml0 +ZWQgU3RhdGVzIGJ5IA0KdGhlIENvbnN0aXR1dGlvbiwgbm9yIHByb2hpYml0ZWQg +YnkgaXQgdG8gdGhlIHN0YXRlcywgYXJlIHJlc2VydmVkIHRvIHRoZSBzdGF0ZXMg +DQpyZXNwZWN0aXZlbHksIG9yIHRvIHRoZSBwZW9wbGUuIDwvRk9OVD48L0RJVj48 +L0RJVj48L0JPRFk+PC9IVE1MPg0KAAAfADUQAQAAAKIAAAA8ADQANQAyADAARgA2 +ADEANQAxAEQAQQBGADIAQQA0ADQAQgBBADgANwA4AEIARgAyAEYAMwA4ADAAMwA0 +ADgARQAyADYARQA1AEAAYgByAC0AZQB4AGMAaAAtAGQAZQB2ADEALgBiAHIAZQB4 +AGMAaABhAG4AZwBlAC4AZABvAGwAcABoAGkAbgBzAGUAYQByAGMAaAAuAGMAbwBt +AD4AAAAAAAMAgBD/////HwDzEAEAAAAmAAAAQgBpAGwAbAAgAG8AZgAgAFIAaQBn +AGgAdABzAC4ARQBNAEwAAAAAAAsA9BAAAAAACwD1EAAAAAALAPYQAAAAAEAABzBR +lpFluknFAUAACDBRlpFluknFAQMA3j+fTgAAAwDxPwkEAAAfAPg/AQAAABAAAAAz +AGsAcgBlAGwAYQB5AAAAAgH5PwEAAABjAAAAAAAAANynQMjAQhAatLkIACsv4YIB +AAAAAAAAAC9PPUJSLUVYQ0gtVEVTVC9PVT1GSVJTVCBBRE1JTklTVFJBVElWRSBH +Uk9VUC9DTj1SRUNJUElFTlRTL0NOPTNLUkVMQVkAAB8A+j8BAAAAEAAAADMAawBy +AGUAbABhAHkAAAACAfs/AQAAAGMAAAAAAAAA3KdAyMBCEBq0uQgAKy/hggEAAAAA +AAAAL089QlItRVhDSC1URVNUL09VPUZJUlNUIEFETUlOSVNUUkFUSVZFIEdST1VQ 
+L0NOPVJFQ0lQSUVOVFMvQ049M0tSRUxBWQAAAwD9P+QEAAADABlAAAAAAAMAGkAA +AAAAHwAwQAEAAAAQAAAAMwBLAFIARQBMAEEAWQAAAB8AMUABAAAAEAAAADMASwBS +AEUATABBAFkAAAAfADhAAQAAABAAAAAzAEsAUgBFAEwAQQBZAAAAHwA5QAEAAAAQ +AAAAMwBLAFIARQBMAEEAWQAAAAMAdkD/////AwACWQAAFgADAAlZAgAAAAsAhYEI +IAYAAAAAAMAAAAAAAABGAAAAAA6FAAAAAAAAAwCdgQggBgAAAAAAwAAAAAAAAEYA +AAAAUoUAAJjDAQAfAJ6BCCAGAAAAAADAAAAAAAAARgAAAABUhQAAAQAAAAoAAAAx +ADEALgAwAAAAAAADAOmBCCAGAAAAAADAAAAAAAAARgAAAAABhQAAAAAAAAsA7oEI +IAYAAAAAAMAAAAAAAABGAAAAAAOFAAAAAAAAAwD4gQggBgAAAAAAwAAAAAAAAEYA +AAAAEIUAAAAAAAADAP+BCCAGAAAAAADAAAAAAAAARgAAAAAYhQAAAAAAAAsAIIII +IAYAAAAAAMAAAAAAAABGAAAAAAaFAAAAAAAACwAkggggBgAAAAAAwAAAAAAAAEYA +AAAAgoUAAAAAAAAfACaCCCAGAAAAAADAAAAAAAAARgAAAACDhQAAAQAAACYAAAA0 +ADAANQAxADMAMQA1ADEANwAtADIANQAwADQAMgAwADAANQAAAAAAAwBxggggBgAA +AAAAwAAAAAAAAEYAAAAAk4UAAAAAAAALACkAAAAAAAsAIwAAAAAAAgF/AAEAAABR +AAAAPDQ1MjBGNjE1MURBRjJBNDRCQTg3OEJGMkYzODAzNDhFMjZFNUBici1leGNo +LWRldjEuYnJleGNoYW5nZS5kb2xwaGluc2VhcmNoLmNvbT4AAAAAC/o= + +--_000_553468B23EE29B4F8836CBD0E1B2A15A275C3AA855POLNIEXMBV2po_-- + diff --git a/spec/fixtures/files/tnef-attachment-truncated.email b/spec/fixtures/files/tnef-attachment-truncated.email new file mode 100644 index 000000000..365a5a442 --- /dev/null +++ b/spec/fixtures/files/tnef-attachment-truncated.email @@ -0,0 +1,34 @@ +From hello@blah.local Fri Feb 21 16:23:14 2013 +Return-path: <bar@example.org> +Envelope-to: foo@example.org +Delivery-date: Fri, 21 Feb 2013 16:23:14 +0000 +Content-Type: multipart/mixed; + boundary="_000_553468B23EE29B4F8836CBD0E1B2A15A275C3AA855POLNIEXMBV2po_" +From: <bar@example.org> +To: <foo@example.org> +Sender: <hello@blah.local> +Date: Fri, 21 Feb 2013 16:23:04 +0000 +Subject: here's a useless email +Message-ID: <12345@blah.local> +Accept-Language: en-US, en-GB +Content-Language: en-US +X-MS-Has-Attach: +X-MS-TNEF-Correlator: <12345@blah.local> +acceptlanguage: en-US, en-GB +MIME-Version: 1.0 + +--_000_553468B23EE29B4F8836CBD0E1B2A15A275C3AA855POLNIEXMBV2po_ 
+Content-Type: text/plain; charset="us-ascii" +Content-Transfer-Encoding: quoted-printable + +Some introductory text here, before the malformed TNEF attachment. + +--_000_553468B23EE29B4F8836CBD0E1B2A15A275C3AA855POLNIEXMBV2po_ +Content-Disposition: attachment; filename="winmail.dat" +Content-Transfer-Encoding: base64 +Content-Type: application/ms-tnef; name="winmail.dat" + +eJ8+IkV9AQaQCAAEAAAAAAABAAEAAQeQBgAIAAAA5AQAAAAAAADoAAEJgAEAIQAAAEMyRUUzRUYx + +--_000_553468B23EE29B4F8836CBD0E1B2A15A275C3AA855POLNIEXMBV2po_-- + diff --git a/spec/lib/basic_encoding_tests.rb b/spec/lib/basic_encoding_tests.rb new file mode 100644 index 000000000..35d35fd4a --- /dev/null +++ b/spec/lib/basic_encoding_tests.rb @@ -0,0 +1,157 @@ +# -*- coding: utf-8 -*- +require File.expand_path(File.dirname(__FILE__) + '/../spec_helper') + +def bytes_to_binary_string( bytes, claimed_encoding = nil ) + claimed_encoding ||= 'ASCII-8BIT' + bytes_string = bytes.pack('c*') + if RUBY_VERSION.to_f >= 1.9 + bytes_string.force_encoding
claimed_encoding + end + bytes_string +end + +random_string = bytes_to_binary_string [ 0x0f, 0x58, 0x1c, 0x8f, 0xa4, 0xcf, + 0xf6, 0x8c, 0x9d, 0xa7, 0x06, 0xd9, + 0xf7, 0x90, 0x6c, 0x6f] + +windows_1252_string = bytes_to_binary_string [ 0x44, 0x41, 0x53, 0x48, 0x20, + 0x96, 0x20, 0x44, 0x41, 0x53, + 0x48 ] + +# It's a shame this example is so long, but if we don't take enough it +# gets misinterpreted as Shift_JIS + +gb_18030_bytes = [ 0xb9, 0xf3, 0xb9, 0xab, 0xcb, 0xbe, 0xb8, 0xba, 0xd4, 0xf0, + 0xc8, 0xcb, 0x28, 0xbe, 0xad, 0xc0, 0xed, 0x2f, 0xb2, 0xc6, + 0xce, 0xf1, 0x29, 0xc4, 0xfa, 0xba, 0xc3, 0xa3, 0xba, 0x0d, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0xb1, 0xbe, 0xb9, 0xab, 0xcb, 0xbe, 0xd4, + 0xda, 0x31, 0x39, 0x39, 0x37, 0xc4, 0xea, 0xb3, 0xc9, 0xc1, + 0xa2, 0xb9, 0xfa, 0xbc, 0xd2, 0xb9, 0xa4, 0xc9, 0xcc, 0xd7, + 0xa2, 0xb2, 0xe1, 0x2e, 0xca, 0xb5, 0xc1, 0xa6, 0xd0, 0xdb, + 0xba, 0xf1, 0xa1, 0xa3, 0xd3, 0xd0, 0xb6, 0xc0, 0xc1, 0xa2, + 0xcb, 0xb0, 0xce, 0xf1, 0x0d, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0xd7, 0xa8, 0xd2, 0xb5, + 0xc8, 0xcb, 0xd4, 0xb1, 0x3b, 0xd4, 0xda, 0xc8, 0xab, 0xb9, + 0xfa, 0xb8, 0xf7, 0xb3, 0xc7, 0xca, 0xd0, 0xc9, 0xe8, 0xc1, + 0xa2, 0xb7, 0xd6, 0xb9, 0xab, 0xcb, 0xbe, 0xa3, 0xa8, 0xd5, + 0xe3, 0xbd, 0xad, 0xa1, 0xa2, 0xc9, 0xcf, 0xba, 0xa3, 0xa1, + 0xa2, 0xb9, 0xe3, 0xd6, 0xdd, 0xa1, 0xa2, 0xbd, 0xad, 0xcb, + 0xd5, 0xb5, 0xc8, 0x0d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0xb5, 0xd8, 0xb7, 0xbd, 0xa3, + 0xa9, 0xd2, 0xf2, 0xbd, 0xf8, 0xcf, 0xee, 0xbd, 0xcf, 0xb6, + 0xe0, 0xcf, 0xd6, 0xcd, 0xea, 0xb3, 0xc9, 0xb2, 0xbb, 0xc1, + 0xcb, 0xc3, 0xbf, 0xd4, 0xc2, 0xcf, 0xfa, 0xca, 0xdb, 0xb6, + 0xee, 0xb6, 0xc8, 0xa1, 0xa3, 0xc3, 0xbf, 0xd4, 0xc2, 0xd3, + 0xd0, 0xd2, 0xbb, 0xb2, 0xbf, 0xb7, 0xd6, 0x0d, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0xd4, + 0xf6, 0xd6, 0xb5, 0xb6, 0x90, 0xa3, 0xa8, 0x36, 0x2d, 0x37, + 0x25, 
0xd7, 0xf3, 0xd3, 0xd2, 0x29, 0xba, 0xcd, 0xc6, 0xd5, + 0xc6, 0xb1, 0xa3, 0xa8, 0x30, 0x2e, 0x35, 0x25, 0x2d, 0x32, + 0x25, 0x20, 0xd7, 0xf3, 0xd3, 0xd2, 0xa3, 0xa9, 0xd3, 0xc5, + 0xbb, 0xdd, 0xb4, 0xfa, 0xbf, 0xaa, 0xbb, 0xf2, 0xba, 0xcf, + 0xd7, 0xf7, 0xa3, 0xac, 0x0d, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0xb5, 0xe3, 0xca, 0xfd, + 0xbd, 0xcf, 0xb5, 0xcd, 0xa1, 0xa3, 0xb4, 0xfa, 0xc0, 0xed, + 0xb7, 0xb6, 0xce, 0xa7, 0xc8, 0xe7, 0xcf, 0xc2, 0xa3, 0xba, + 0x0d, 0x0a ] + +gb_18030_spam_string = bytes_to_binary_string gb_18030_bytes + +describe "normalize_string_to_utf8" do + + describe "when passed uninterpretable character data" do + + it "should reject it as invalid" do + + expect { + normalize_string_to_utf8 random_string + }.to raise_error(EncodingNormalizationError) + + expect { + normalize_string_to_utf8 random_string, 'UTF-8' + }.to raise_error(EncodingNormalizationError) + + end + end + + describe "when passed unlabelled Windows 1252 data" do + + it "should correctly convert it to UTF-8" do + + normalized = normalize_string_to_utf8 windows_1252_string + + normalized.should == "DASH – DASH" + + end + + end + + describe "when passed GB 18030 data" do + + it "should correctly convert it to UTF-8 if unlabelled" do + + normalized = normalize_string_to_utf8 gb_18030_spam_string + + normalized.should start_with("贵公司负责人") + + end + + end + +end + +describe "convert_string_to_utf8_or_binary" do + + describe "when passed uninterpretable character data" do + + it "should return it as a binary string" do + + converted = convert_string_to_utf8_or_binary random_string + converted.should == random_string + + if RUBY_VERSION.to_f >= 1.9 + converted.encoding.should == Encoding::ASCII_8BIT + end + + converted = convert_string_to_utf8_or_binary random_string,'UTF-8' + converted.should == random_string + + if RUBY_VERSION.to_f >= 1.9 + converted.encoding.should == Encoding::ASCII_8BIT + end + + end + end + + describe "when passed unlabelled Windows 1252 data" do + +
it "should correctly convert it to UTF-8" do + + converted = convert_string_to_utf8_or_binary windows_1252_string + + converted.should == "DASH – DASH" + + if RUBY_VERSION.to_f >= 1.9 + converted.encoding.should == 'UTF-8' + end + end + + end + + describe "when passed GB 18030 data" do + + it "should correctly convert it to UTF-8 if unlabelled" do + + converted = convert_string_to_utf8_or_binary gb_18030_spam_string + + converted.should start_with("贵公司负责人") + + if RUBY_VERSION.to_f >= 1.9 + converted.encoding.should == 'UTF-8' + end + end + + end + +end diff --git a/spec/lib/mail_handler/mail_handler_spec.rb b/spec/lib/mail_handler/mail_handler_spec.rb index 79b779687..01bf179f8 100644 --- a/spec/lib/mail_handler/mail_handler_spec.rb +++ b/spec/lib/mail_handler/mail_handler_spec.rb @@ -20,12 +20,33 @@ describe 'when creating a mail object from raw data' do mail.to.should == ["request-66666-caa77777@whatdotheyknow.com", "foi@example.com"] end + it 'should return nil for malformed To: and Cc: lines' do + mail = get_fixture_mail('malformed-to-and-cc.email') + mail.to.should == nil + mail.cc.should == nil + end + it 'should convert an iso8859 email to utf8' do mail = get_fixture_mail('iso8859_2_raw_email.email') mail.subject.should match /gjatë/u MailHandler.get_part_body(mail).is_utf8?.should == true end + it 'should convert a Windows-1252 body mislabelled as ISO-8859-1 to UTF-8' do + mail = get_fixture_mail('mislabelled-as-iso-8859-1.email') + body = MailHandler.get_part_body(mail) + body.is_utf8?.should == true + # This email is broken in at least these two ways: + # 1. It contains a top bit set character (0x96) despite the + # "Content-Transfer-Encoding: 7bit" + # 2. The charset in the Content-Type header is "iso-8859-1" + # but 0x96 is actually a Windows-1252 en dash, which would + # be Unicode codepoint 2013. It should be possible to + # spot the mislabelling, since 0x96 isn't a valid + # ISO-8859-1 character. 
+ body.should match(/ \xe2\x80\x93 /) + end + end describe 'when asked for the from name' do @@ -275,6 +296,12 @@ end describe 'when getting attachment attributes' do + it 'should handle a mail with a non-multipart part with no charset in the Content-Type header' do + mail = get_fixture_mail('part-without-charset-in-content-type.email') + attributes = MailHandler.get_attachment_attributes(mail) + attributes.size.should == 2 + end + it 'should get two attachment parts from a multipart mail with text and html alternatives and an image' do mail = get_fixture_mail('quoted-subject-iso8859-1.email') @@ -282,6 +309,13 @@ describe 'when getting attachment attributes' do attributes.size.should == 2 end + it 'should get one attachment from a multipart mail with text and HTML alternatives, which should be UTF-8' do + mail = get_fixture_mail('iso8859_2_raw_email.email') + attributes = MailHandler.get_attachment_attributes(mail) + attributes.length.should == 1 + attributes[0][:body].is_utf8?.should == true + end + it 'should expand a mail attached as text' do # Note that this spec will only pass using Tmail in the timezone set as datetime headers # are rendered out in the local time - using the Mail gem this is not necessary @@ -304,6 +338,52 @@ describe 'when getting attachment attributes' do attributes = MailHandler.get_attachment_attributes(mail) end + it 'should ignore truncated TNEF attachment' do + mail = get_fixture_mail('tnef-attachment-truncated.email') + attributes = MailHandler.get_attachment_attributes(mail) + attributes.length.should == 2 + end + + it 'should ignore anything beyond the final MIME boundary' do + pending do + # This example raw email has a premature closing boundary for + # the outer multipart/mixed - my reading of RFC 1521 is that + # the "epilogue" beyond that should be ignored. + # See https://github.com/mysociety/alaveteli/issues/922 for + # more discussion. 
+ mail = get_fixture_mail('nested-attachments-premature-end.email') + attributes = MailHandler.get_attachment_attributes(mail) + attributes.length.should == 3 + end + end + + it 'should cope with a missing final MIME boundary' do + mail = get_fixture_mail('multipart-no-final-boundary.email') + attributes = MailHandler.get_attachment_attributes(mail) + attributes.length.should == 1 + attributes[0][:body].should match(/This is an acknowledgement of your email/) + attributes[0][:content_type].should == "text/plain" + attributes[0][:url_part_number].should == 1 + end + + it 'should ignore a TNEF attachment with no usable contents' do + # FIXME: "no usable contents" is slightly misleading. The + # attachment in this example email does have usable content in + # the body of the TNEF attachment, but the invocation of tnef + # historically used to unpack these attachments doesn't add + # the --save-body parameter, so that they have been ignored so + # far. We probably should include the body from such + # attachments, but, at the moment, with the pending upgrade to + # Rails 3, we just want to check that the behaviour is the + # same as before. 
+ mail = get_fixture_mail('tnef-attachment-empty.email') + attributes = MailHandler.get_attachment_attributes(mail) + attributes.length.should == 2 + # This is the size of the TNEF-encoded attachment; currently, + # we expect the code just to return this without decoding: + attributes[1][:body].length.should == 7769 + end + it 'should produce a consistent set of url_part_numbers, content_types, within_rfc822_subjects and filenames from an example mail with lots of attachments' do mail = get_fixture_mail('many-attachments-date-header.email') diff --git a/spec/models/incoming_message_spec.rb b/spec/models/incoming_message_spec.rb index e22235298..1d86c26ad 100644 --- a/spec/models/incoming_message_spec.rb +++ b/spec/models/incoming_message_spec.rb @@ -59,12 +59,19 @@ describe IncomingMessage, " when dealing with incoming mail" do message.subject.should == "Câmara Responde: Banco de ideias" end - it 'should not error on display of a message which has no charset set on the body part and - is not good utf-8' do + it 'should deal with GB18030 text even if the charset is missing' do ir = info_requests(:fancy_dog_request) receive_incoming_mail('no-part-charset-bad-utf8.email', ir.incoming_email) message = ir.incoming_messages[1] message.parse_raw_email! + message.get_main_body_text_internal.should include("贵公司负责人") + end + + it 'should not error on display of a message which has no charset set on the body part and is not good UTF-8' do + ir = info_requests(:fancy_dog_request) + receive_incoming_mail('no-part-charset-random-data.email', ir.incoming_email) + message = ir.incoming_messages[1] + message.parse_raw_email! 
message.get_main_body_text_internal.should include("The above text was badly encoded") end @@ -412,6 +419,17 @@ describe IncomingMessage, " when uudecoding bad messages" do im.get_attachments_for_display.size.should == 1 end + it "should still work when parsed from the raw email" do + raw_email = load_file_fixture 'inline-uuencode.email' + mail = MailHandler.mail_from_raw_email(raw_email) + im = incoming_messages :useless_incoming_message + im.stub!(:raw_email).and_return(raw_email) + im.stub!(:mail).and_return(mail) + im.parse_raw_email! + attachments = im.foi_attachments + attachments.size.should == 2 + end + it "should apply censor rules" do mail = get_fixture_mail('incoming-request-bad-uuencoding.email') diff --git a/spec/support/email_helpers.rb b/spec/support/email_helpers.rb index 7e98c39f6..252b1f137 100644 --- a/spec/support/email_helpers.rb +++ b/spec/support/email_helpers.rb @@ -8,7 +8,7 @@ end def receive_incoming_mail(email_name, email_to, email_from = 'geraldinequango@localhost') email_name = file_fixture_name(email_name) - content = File.read(email_name) + content = File.open(email_name, 'rb') { |f| f.read } content.gsub!('EMAIL_TO', email_to) content.gsub!('EMAIL_FROM', email_from) RequestMailer.receive(content) diff --git a/spec/support/load_file_fixtures.rb b/spec/support/load_file_fixtures.rb index 08079f654..a54505e99 100644 --- a/spec/support/load_file_fixtures.rb +++ b/spec/support/load_file_fixtures.rb @@ -2,13 +2,7 @@ def file_fixture_name(file_name) return File.join(RSpec.configuration.fixture_path, "files", file_name) end -def load_file_fixture(file_name, as_binary=false) +def load_file_fixture(file_name) file_name = file_fixture_name(file_name) - content = File.open(file_name, 'r') do |file| - if as_binary - file.set_encoding(Encoding::BINARY) if file.respond_to?(:set_encoding) - end - file.read - end - return content + return File.open(file_name, 'rb') { |f| f.read } end |