aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Gemfile1
-rw-r--r--Gemfile.lock2
-rw-r--r--app/controllers/request_controller.rb12
-rw-r--r--app/mailers/request_mailer.rb2
-rw-r--r--app/models/foi_attachment.rb7
-rw-r--r--app/models/incoming_message.rb30
-rw-r--r--config/initializers/alaveteli.rb2
-rw-r--r--config/packages3
-rw-r--r--doc/THEMES-UPGRADE.md101
-rw-r--r--lib/mail_handler/backends/mail_backend.rb70
-rw-r--r--lib/mail_handler/mail_handler.rb9
-rw-r--r--lib/normalize_string.rb86
-rw-r--r--lib/tasks/temp.rake150
-rw-r--r--spec/controllers/api_controller_spec.rb2
-rw-r--r--spec/controllers/request_controller_spec.rb94
-rw-r--r--spec/fixtures/files/incoming-request-two-same-name.email4
-rw-r--r--spec/fixtures/files/inline-uuencode.email27
-rw-r--r--spec/fixtures/files/malformed-to-and-cc.email11
-rw-r--r--spec/fixtures/files/mislabelled-as-iso-8859-1.email20
-rw-r--r--spec/fixtures/files/multipart-no-final-boundary.email21
-rw-r--r--spec/fixtures/files/nested-attachments-premature-end.email110
-rw-r--r--spec/fixtures/files/no-part-charset-random-data.email30
-rw-r--r--spec/fixtures/files/part-without-charset-in-content-type.email38
-rw-r--r--spec/fixtures/files/tnef-attachment-empty.email196
-rw-r--r--spec/fixtures/files/tnef-attachment-truncated.email34
-rw-r--r--spec/lib/basic_encoding_tests.rb157
-rw-r--r--spec/lib/mail_handler/mail_handler_spec.rb80
-rw-r--r--spec/models/incoming_message_spec.rb22
-rw-r--r--spec/support/email_helpers.rb2
-rw-r--r--spec/support/load_file_fixtures.rb10
30 files changed, 1254 insertions, 79 deletions
diff --git a/Gemfile b/Gemfile
index 5e4c60ea0..18c2aec0d 100644
--- a/Gemfile
+++ b/Gemfile
@@ -10,6 +10,7 @@ source 'https://rubygems.org'
gem 'rails', '3.1.12'
gem 'pg'
+gem 'charlock_holmes'
gem 'fastercsv', '>=1.5.5'
gem 'json'
gem 'mahoro'
diff --git a/Gemfile.lock b/Gemfile.lock
index 1864dd4fd..24e4dd5e3 100644
--- a/Gemfile.lock
+++ b/Gemfile.lock
@@ -58,6 +58,7 @@ GEM
net-sftp (>= 2.0.0)
net-ssh (>= 2.0.14)
net-ssh-gateway (>= 1.1.0)
+ charlock_holmes (0.6.9.4)
chunky_png (1.2.8)
colorize (0.5.8)
columnize (0.3.6)
@@ -243,6 +244,7 @@ DEPENDENCIES
annotate
bootstrap-sass
capistrano
+ charlock_holmes
compass
coveralls
debugger
diff --git a/app/controllers/request_controller.rb b/app/controllers/request_controller.rb
index a0f88096e..b8ccdf926 100644
--- a/app/controllers/request_controller.rb
+++ b/app/controllers/request_controller.rb
@@ -5,7 +5,6 @@
# Copyright (c) 2007 UK Citizens Online Democracy. All rights reserved.
# Email: hello@mysociety.org; WWW: http://www.mysociety.org/
-require 'alaveteli_file_types'
require 'zip/zip'
require 'open-uri'
@@ -722,7 +721,7 @@ class RequestController < ApplicationController
yield
- if params[:skip_cache].nil?
+ if params[:skip_cache].nil? && response.status == 200
# write it to the fileystem ourselves, so is just a plain file. (The
# various fragment cache functions using Ruby Marshall to write the file
# which adds a header, so isnt compatible with images that have been
@@ -737,6 +736,7 @@ class RequestController < ApplicationController
def get_attachment
get_attachment_internal(false)
+ return unless @attachment
# Prevent spam to magic request address. Note that the binary
# subsitution method used depends on the content type
@@ -756,6 +756,7 @@ class RequestController < ApplicationController
raise ActiveRecord::RecordNotFound.new("Attachment HTML not found.")
end
get_attachment_internal(true)
+ return unless @attachment
# images made during conversion (e.g. images in PDF files) are put in the cache directory, so
# the same cache code in cache_attachments above will display them.
@@ -802,8 +803,11 @@ class RequestController < ApplicationController
# check permissions
raise "internal error, pre-auth filter should have caught this" if !@info_request.user_can_view?(authenticated_user)
- @attachment = IncomingMessage.get_attachment_by_url_part_number(@incoming_message.get_attachments_for_display, @part_number)
- raise ActiveRecord::RecordNotFound.new("attachment not found part number " + @part_number.to_s + " incoming_message " + @incoming_message.id.to_s) if @attachment.nil?
+ @attachment = IncomingMessage.get_attachment_by_url_part_number_and_filename(@incoming_message.get_attachments_for_display, @part_number, @original_filename)
+ # If we can't find the right attachment, redirect to the incoming message:
+ unless @attachment
+ return redirect_to incoming_message_url(@incoming_message), :status => 303
+ end
# check filename in URL matches that in database (use a censor rule if you want to change a filename)
raise ActiveRecord::RecordNotFound.new("please use same filename as original file has, display: '" + @attachment.display_filename + "' old_display: '" + @attachment.old_display_filename + "' original: '" + @original_filename + "'") if @attachment.display_filename != @original_filename && @attachment.old_display_filename != @original_filename
diff --git a/app/mailers/request_mailer.rb b/app/mailers/request_mailer.rb
index 3eb89c660..4dbce6738 100644
--- a/app/mailers/request_mailer.rb
+++ b/app/mailers/request_mailer.rb
@@ -4,8 +4,6 @@
# Copyright (c) 2007 UK Citizens Online Democracy. All rights reserved.
# Email: hello@mysociety.org; WWW: http://www.mysociety.org/
-require 'alaveteli_file_types'
-
class RequestMailer < ApplicationMailer
# Used when an FOI officer uploads a response from their web browser - this is
# the "fake" email used to store in the same format in the database as if they
diff --git a/app/models/foi_attachment.rb b/app/models/foi_attachment.rb
index fcde379e0..0340f2b83 100644
--- a/app/models/foi_attachment.rb
+++ b/app/models/foi_attachment.rb
@@ -71,7 +71,12 @@ class FoiAttachment < ActiveRecord::Base
tries = 0
delay = 1
begin
- @cached_body = File.open(self.filepath, "rb" ).read
+ binary_data = File.open(self.filepath, "rb" ).read
+ if self.content_type =~ /^text/
+ @cached_body = convert_string_to_utf8_or_binary(binary_data, 'UTF-8')
+ else
+ @cached_body = binary_data
+ end
rescue Errno::ENOENT
# we've lost our cached attachments for some reason. Reparse them.
if tries > BODY_MAX_TRIES
diff --git a/app/models/incoming_message.rb b/app/models/incoming_message.rb
index c914edb7e..252f81bb7 100644
--- a/app/models/incoming_message.rb
+++ b/app/models/incoming_message.rb
@@ -31,12 +31,9 @@
# Move some of the (e.g. quoting) functions here into rblib, as they feel
# general not specific to IncomingMessage.
-require 'alaveteli_file_types'
require 'htmlentities'
require 'rexml/document'
require 'zip/zip'
-require 'mapi/msg'
-require 'mapi/convert'
require 'iconv' unless RUBY_VERSION >= '1.9'
class IncomingMessage < ActiveRecord::Base
@@ -132,6 +129,7 @@ class IncomingMessage < ActiveRecord::Base
end
self.valid_to_reply_to = self._calculate_valid_to_reply_to
self.last_parsed = Time.now
+ self.foi_attachments reload=true
self.save!
end
end
@@ -173,15 +171,29 @@ class IncomingMessage < ActiveRecord::Base
super
end
- # And look up by URL part number to get an attachment
+ # And look up by URL part number and display filename to get an attachment
# XXX relies on extract_attachments calling MailHandler.ensure_parts_counted
- def self.get_attachment_by_url_part_number(attachments, found_url_part_number)
- attachments.each do |a|
- if a.url_part_number == found_url_part_number
- return a
+ # The filename here is passed from the URL parameter, so it's the
+ # display_filename rather than the real filename.
+ def self.get_attachment_by_url_part_number_and_filename(attachments, found_url_part_number, display_filename)
+ attachment_by_part_number = attachments.detect { |a| a.url_part_number == found_url_part_number }
+ if attachment_by_part_number && attachment_by_part_number.display_filename == display_filename
+ # Then the filename matches, which is fine:
+ attachment_by_part_number
+ else
+ # Otherwise if the URL part number and filename don't
+ # match - this is probably due to a reparsing of the
+ # email. In that case, try to find a unique matching
+ # filename from any attachment.
+ attachments_by_filename = attachments.select { |a|
+ a.display_filename == display_filename
+ }
+ if attachments_by_filename.length == 1
+ attachments_by_filename[0]
+ else
+ nil
end
end
- return nil
end
# Converts email addresses we know about into textual descriptions of them
diff --git a/config/initializers/alaveteli.rb b/config/initializers/alaveteli.rb
index 4acc126d0..a9e9d498d 100644
--- a/config/initializers/alaveteli.rb
+++ b/config/initializers/alaveteli.rb
@@ -59,6 +59,8 @@ require 'quiet_opener.rb'
require 'mail_handler'
require 'public_body_categories'
require 'ability'
+require 'normalize_string'
+require 'alaveteli_file_types'
# Allow tests to be run under a non-superuser database account if required
if Rails.env == 'test' and ActiveRecord::Base.configurations['test']['constraint_disabling'] == false
diff --git a/config/packages b/config/packages
index db51e5bdd..fc67cda6b 100644
--- a/config/packages
+++ b/config/packages
@@ -36,4 +36,5 @@ rake (>= 0.9.2.2)
build-essential
bundler
sqlite3
-libsqlite3-dev \ No newline at end of file
+libsqlite3-dev
+libicu-dev
diff --git a/doc/THEMES-UPGRADE.md b/doc/THEMES-UPGRADE.md
new file mode 100644
index 000000000..457274d7a
--- /dev/null
+++ b/doc/THEMES-UPGRADE.md
@@ -0,0 +1,101 @@
+This file contains some notes on changing your Alaveteli theme for the
+upgrade to Rails 3, in version 0.11 of Alaveteli. These were written
+by Henare Degan, with some additions by Mark Longair.
+
+# Alaveteli Theme Upgrade Checks
+
+## RAILS_ROOT/RAILS_ENV
+
+[Example](https://github.com/henare/adminbootstraptheme/commit/857e33c9b0bc577024b476404aec4f9749f65a0b)
+
+Check your theme for instances of:
+
+* `RAILS_ROOT` and replace it with `Rails.root`
+* `RAILS_ENV` and replace it with `Rails.env`
+
+Note that `Rails.root` is a `Pathname`, so you can replace, for
+example:
+
+ File.join(RAILS_ROOT, 'public', 'alavetelitheme')
+
+... with:
+
+ Rails.root.join('public', 'alavetelitheme')
+
+## Dispatcher
+
+[Example](https://github.com/henare/adminbootstraptheme/commit/fba2d6b7dfdc26a25fdc1596bfe120270dd4cd0d)
+
+This...
+
+```ruby
+require 'dispatcher'
+Dispatcher.to_prepare do
+```
+
+should be replaced with this...
+
+```ruby
+Rails.configuration.to_prepare do
+````
+
+## Routes
+
+[Example](https://github.com/henare/adminbootstraptheme/commit/87f1991dafb09401f9b17f642a94382d5a47a713)
+
+You need to upgrade your custom routes to the new Rails syntax.
+
+## list_public_bodies_default removed
+
+[Example](https://github.com/openaustralia/alavetelitheme/commit/5927877af996a1afb1a23a950f0d012b52c36f83)
+
+The list_public_bodies_default helper has been removed from Alaveteli
+
+## Patching mailer templates has changed
+
+[Example](https://github.com/openaustralia/alavetelitheme/commit/ffb5242973a0b2acc4981c25659fcb752b92eb97)
+
+In `lib/patch_mailer_paths.rb` change `ActionMailer::Base.view_paths.unshift File.join(File.dirname(__FILE__), "views")` to `ActionMailer::Base.prepend_view_path File.join(File.dirname(__FILE__), "views")`
+
+There's also `ActionMailer::Base.append_view_path` for replacing `ActionMailer::Base.view_paths <<`.
+
+## Rename view templates
+
+[Example](https://github.com/henare/adminbootstraptheme/commit/b616b636c283ae6cf696a6af1fa481f371baf2b6)
+
+Rename view templates from `filename.rhtml` to `filename.html.erb`.
+
+Run this in the root of your theme directory:
+
+ for r in $(find lib/views -name '*.rhtml'); do echo git mv $r ${r%.rhtml}.html.erb; done
+
+[GOTCHA!](https://github.com/openaustralia/alavetelitheme/commit/65e775488822367d981bb15ab2cbcf1fce842cc2)
+One exception is mailer templates, these should be renamed to
+`filename.text.erb` as we only use text emails.
+
+## The Configuration class has been renamed
+
+[Example](https://github.com/openaustralia/alavetelitheme/commit/db6cca4650216c6f85acffaea380727344f0f740)
+
+Due to a naming conflict, `Configuration` has been renamed to `AlaveteliConfiguration`.
+
+You may have this in your theme for things like `Configuration::site_name`, just change it to `AlaveteliConfiguration::site_name`
+
+## request.request_uri is deprecated
+
+[Example](https://github.com/openaustralia/alavetelitheme/commit/d670eeebfb049e1dc83fdb36a628f7722d2ad419)
+
+Replace instances of `request.request_uri` with `request.fullpath`
+
+## content-inserting <% %> block helpers are deprecated
+
+[Example](https://github.com/openaustralia/alavetelitheme/commit/a4b13bbd76249b3a28e2a755cede20dd9db30140)
+
+The Rails 3 releases notes are [irritatingly
+imprecise](http://edgeguides.rubyonrails.org/3_0_release_notes.html#helpers-with-blocks)
+about which such helpers have changed. You can find some candidates
+with this `git grep` command:
+
+ git grep -E '<%[^=].*(_for|_tag|link_to)\b'
+
+(Ignore `content_for` in those results.)
diff --git a/lib/mail_handler/backends/mail_backend.rb b/lib/mail_handler/backends/mail_backend.rb
index f7893a60d..03d78e0a3 100644
--- a/lib/mail_handler/backends/mail_backend.rb
+++ b/lib/mail_handler/backends/mail_backend.rb
@@ -1,4 +1,35 @@
require 'mail'
+require 'mapi/msg'
+require 'mapi/convert'
+
+module Mail
+ class Message
+
+ # The behaviour of the 'to' and 'cc' methods have changed
+ # between TMail and Mail; this monkey-patching restores the
+ # TMail behaviour. The key difference is that when there's an
+ # invalid address, e.g. '<foo@example.org', Mail returns the
+ # string as an ActiveSupport::Multibyte::Chars, whereas
+ # previously TMail would return nil.
+
+ alias_method :old_to, :to
+ alias_method :old_cc, :cc
+
+ def clean_addresses(old_method, val)
+ old_result = self.send(old_method, val)
+ old_result.class == Mail::AddressContainer ? old_result : nil
+ end
+
+ def to(val = nil)
+ self.clean_addresses :old_to, val
+ end
+
+ def cc(val = nil)
+ self.clean_addresses :old_cc, val
+ end
+
+ end
+end
module MailHandler
module Backends
@@ -38,7 +69,11 @@ module MailHandler
# Get the body of a mail part
def get_part_body(part)
- part.body.decoded
+ decoded = part.body.decoded
+ if part.content_type =~ /^text\//
+ decoded = convert_string_to_utf8_or_binary decoded, part.charset
+ end
+ decoded
end
# Return the first from field if any
@@ -141,9 +176,14 @@ module MailHandler
end
elsif get_content_type(part) == 'application/ms-tnef'
# A set of attachments in a TNEF file
- part.rfc822_attachment = mail_from_tnef(part.body.decoded)
- if part.rfc822_attachment.nil?
- # Attached mail didn't parse, so treat as binary
+ begin
+ part.rfc822_attachment = mail_from_tnef(part.body.decoded)
+ if part.rfc822_attachment.nil?
+ # Attached mail didn't parse, so treat as binary
+ part.content_type = 'application/octet-stream'
+ end
+ rescue TNEFParsingError
+ part.rfc822_attachment = nil
part.content_type = 'application/octet-stream'
end
end
@@ -160,8 +200,11 @@ module MailHandler
part.parts.each{ |sub_part| expand_and_normalize_parts(sub_part, parent_mail) }
else
part_filename = get_part_file_name(part)
- charset = part.charset # save this, because overwriting content_type also resets charset
-
+ if part.has_charset?
+ original_charset = part.charset # save this, because overwriting content_type also resets charset
+ else
+ original_charset = nil
+ end
# Don't allow nil content_types
if get_content_type(part).nil?
part.content_type = 'application/octet-stream'
@@ -180,7 +223,9 @@ module MailHandler
# Use standard content types for Word documents etc.
part.content_type = normalise_content_type(get_content_type(part))
decode_attached_part(part, parent_mail)
- part.charset = charset
+ if original_charset
+ part.charset = original_charset
+ end
end
end
@@ -228,8 +273,15 @@ module MailHandler
def _get_attachment_leaves_recursive(part, within_rfc822_attachment, parent_mail)
leaves_found = []
if part.multipart?
- raise "no parts on multipart mail" if part.parts.size == 0
- if part.sub_type == 'alternative'
+ if part.parts.size == 0
+ # This is typically caused by a missing final
+ # MIME boundary, in which case the text of the
+ # message (including the opening MIME
+ # boundary) is in part.body, so just add this
+ # part as a leaf and treat it as text/plain:
+ part.content_type = "text/plain"
+ leaves_found += [part]
+ elsif part.sub_type == 'alternative'
best_part = choose_best_alternative(part)
leaves_found += _get_attachment_leaves_recursive(best_part,
within_rfc822_attachment,
diff --git a/lib/mail_handler/mail_handler.rb b/lib/mail_handler/mail_handler.rb
index 22ba26b97..9c955cccd 100644
--- a/lib/mail_handler/mail_handler.rb
+++ b/lib/mail_handler/mail_handler.rb
@@ -8,20 +8,23 @@ module MailHandler
require 'backends/mail_backend'
include Backends::MailBackend
+ class TNEFParsingError < StandardError
+ end
+
# Returns a set of attachments from the given TNEF contents
# The TNEF contents also contains the message body, but in general this is the
# same as the message body in the message proper.
def tnef_attachments(content)
attachments = []
Dir.mktmpdir do |dir|
- IO.popen("#{`which tnef`.chomp} -K -C #{dir}", "wb") do |f|
+ IO.popen("tnef -K -C #{dir} 2> /dev/null", "wb") do |f|
f.write(content)
f.close
if $?.signaled?
raise IOError, "tnef exited with signal #{$?.termsig}"
end
if $?.exited? && $?.exitstatus != 0
- raise IOError, "tnef exited with status #{$?.exitstatus}"
+ raise TNEFParsingError, "tnef exited with status #{$?.exitstatus}"
end
end
found = 0
@@ -34,7 +37,7 @@ module MailHandler
end
end
if found == 0
- raise IOError, "tnef produced no attachments"
+ raise TNEFParsingError, "tnef produced no attachments"
end
end
attachments
diff --git a/lib/normalize_string.rb b/lib/normalize_string.rb
new file mode 100644
index 000000000..f02b18ee0
--- /dev/null
+++ b/lib/normalize_string.rb
@@ -0,0 +1,86 @@
+require 'iconv' unless RUBY_VERSION.to_f >= 1.9
+require 'charlock_holmes'
+
+class EncodingNormalizationError < StandardError
+end
+
+def normalize_string_to_utf8(s, suggested_character_encoding=nil)
+
+ # Make a list of encodings to try:
+ to_try = []
+
+ guessed_encoding = CharlockHolmes::EncodingDetector.detect(s)[:encoding]
+ guessed_encoding ||= ''
+
+ # It's reasonably common for windows-1252 text to be mislabelled
+ # as ISO-8859-1, so try that first if charlock_holmes guessed
+ # that. However, it can also easily misidentify UTF-8 strings as
+ # ISO-8859-1 so we don't want to go with the guess by default...
+ to_try.push guessed_encoding if guessed_encoding.downcase == 'windows-1252'
+
+ to_try.push suggested_character_encoding if suggested_character_encoding
+ to_try.push 'UTF-8'
+ to_try.push guessed_encoding
+
+ to_try.each do |from_encoding|
+ if RUBY_VERSION.to_f >= 1.9
+ begin
+ s.force_encoding from_encoding
+ return s.encode('UTF-8') if s.valid_encoding?
+ rescue ArgumentError
+ # We get this is there are invalid bytes when
+ # interpreted as from_encoding at the point of
+ # the encode('UTF-8'); move onto the next one...
+ end
+ else
+ to_encoding = 'UTF-8'
+ begin
+ converted = Iconv.conv 'UTF-8', from_encoding, s
+ return converted
+ rescue Iconv::Failure
+ # We get this is there are invalid bytes when
+ # interpreted as from_encoding at the point of
+ # the Iconv.iconv; move onto the next one...
+ end
+ end
+ end
+ raise EncodingNormalizationError, "Couldn't find a valid character encoding for the string"
+
+end
+
+def convert_string_to_utf8_or_binary(s, suggested_character_encoding=nil)
+ # This function exists to help to keep consistent with the
+ # behaviour of earlier versions of Alaveteli: in the code as it
+ # is, there are situations where it's expected that we generally
+ # have a UTF-8 encoded string, but if the source data was
+ # unintepretable under any character encoding, the string may be
+ # binary data (i.e. invalid UTF-8). Such a string would then be
+ # mangled into valid UTF-8 by _sanitize_text for the purposes of
+ # display.
+
+ # This seems unsatisfactory to me - two better alternatives would
+ # be either: (a) to mangle the data into valid UTF-8 in this
+ # method or (b) to treat the 'text/*' attachment as
+ # 'application/octet-stream' instead. However, for the purposes
+ # of the transition to Ruby 1.9 and/or Rails 3 we just want the
+ # behaviour to be as similar as possible.
+
+ begin
+ result = normalize_string_to_utf8 s, suggested_character_encoding
+ rescue EncodingNormalizationError
+ result = s
+ s.force_encoding 'ASCII-8BIT' if RUBY_VERSION.to_f >= 1.9
+ end
+ result
+end
+
+def log_text_details(message, text)
+ if RUBY_VERSION.to_f >= 1.9
+ STDERR.puts "#{message}, we have text: #{text}, of class #{text.class} and encoding #{text.encoding}"
+ else
+ STDERR.puts "#{message}, we have text: #{text}, of class #{text.class}"
+ end
+ filename = "/var/tmp/#{Digest::MD5.hexdigest(text)}.txt"
+ File.open(filename, "wb") { |f| f.write text }
+ STDERR.puts "#{message}, the filename is: #{filename}"
+end
diff --git a/lib/tasks/temp.rake b/lib/tasks/temp.rake
index e49a84ecb..f0085b5e1 100644
--- a/lib/tasks/temp.rake
+++ b/lib/tasks/temp.rake
@@ -50,4 +50,154 @@ namespace :temp do
end
end
+ desc 'Create a CSV file of a random selection of raw emails, for comparing hexdigests'
+ task :random_attachments_hexdigests => :environment do
+
+ # The idea is to run this under the Rail 2 codebase, where
+ # Tmail was used to extract the attachements, and the task
+ # will output all of those file paths in a CSV file, and a
+ # list of the raw email files in another. The latter file is
+ # useful so that one can easily tar up the emails with:
+ #
+ # tar cvz -T raw-email-files -f raw_emails.tar.gz
+ #
+ # Then you can switch to the Rails 3 codebase, where
+ # attachment parsing is done via
+ # recompute_attachments_hexdigests
+
+ require 'csv'
+
+ File.open('raw-email-files', 'w') do |f|
+ CSV.open('attachment-hexdigests.csv', 'w') do |csv|
+ csv << ['filepath', 'i', 'url_part_number', 'hexdigest']
+ IncomingMessage.all(:order => 'RANDOM()', :limit => 1000).each do |incoming_message|
+ # raw_email.filepath fails unless the
+ # incoming_message has an associated request
+ next unless incoming_message.info_request
+ raw_email = incoming_message.raw_email
+ f.puts raw_email.filepath
+ incoming_message.foi_attachments.each_with_index do |attachment, i|
+ csv << [raw_email.filepath, i, attachment.url_part_number, attachment.hexdigest]
+ end
+ end
+ end
+ end
+
+ end
+
+
+ desc 'Check the hexdigests of attachments in emails on disk'
+ task :recompute_attachments_hexdigests => :environment do
+
+ require 'csv'
+ require 'digest/md5'
+
+ OldAttachment = Struct.new :filename, :attachment_index, :url_part_number, :hexdigest
+
+ filename_to_attachments = Hash.new {|h,k| h[k] = []}
+
+ header_line = true
+ CSV.foreach('attachment-hexdigests.csv') do |filename, attachment_index, url_part_number, hexdigest|
+ if header_line
+ header_line = false
+ else
+ filename_to_attachments[filename].push OldAttachment.new filename, attachment_index, url_part_number, hexdigest
+ end
+ end
+
+ total_attachments = 0
+ attachments_with_different_hexdigest = 0
+ files_with_different_numbers_of_attachments = 0
+ no_tnef_attachments = 0
+ no_parts_in_multipart = 0
+
+ multipart_error = "no parts on multipart mail"
+ tnef_error = "tnef produced no attachments"
+
+ # Now check each file:
+ filename_to_attachments.each do |filename, old_attachments|
+
+ # Currently it doesn't seem to be possible to reuse the
+ # attachment parsing code in Alaveteli without saving
+ # objects to the database, so reproduce what it does:
+
+ raw_email = nil
+ File.open(filename) do |f|
+ raw_email = f.read
+ end
+ mail = MailHandler.mail_from_raw_email(raw_email)
+
+ begin
+ attachment_attributes = MailHandler.get_attachment_attributes(mail)
+ rescue IOError => e
+ if e.message == tnef_error
+ puts "#{filename} #{tnef_error}"
+ no_tnef_attachments += 1
+ next
+ else
+ raise
+ end
+ rescue Exception => e
+ if e.message == multipart_error
+ puts "#{filename} #{multipart_error}"
+ no_parts_in_multipart += 1
+ next
+ else
+ raise
+ end
+ end
+
+ if attachment_attributes.length != old_attachments.length
+ puts "#{filename} the number of old attachments #{old_attachments.length} didn't match the number of new attachments #{attachment_attributes.length}"
+ files_with_different_numbers_of_attachments += 1
+ else
+ old_attachments.each_with_index do |old_attachment, i|
+ total_attachments += 1
+ attrs = attachment_attributes[i]
+ old_hexdigest = old_attachment.hexdigest
+ new_hexdigest = attrs[:hexdigest]
+ new_content_type = attrs[:content_type]
+ old_url_part_number = old_attachment.url_part_number.to_i
+ new_url_part_number = attrs[:url_part_number]
+ if old_url_part_number != new_url_part_number
+ puts "#{i} #{filename} old_url_part_number #{old_url_part_number}, new_url_part_number #{new_url_part_number}"
+ end
+ if old_hexdigest != new_hexdigest
+ body = attrs[:body]
+ # First, if the content type is one of
+ # text/plain, text/html or application/rtf try
+ # changing CRLF to LF and calculating a new
+ # digest - we generally don't worry about
+ # these changes:
+ new_converted_hexdigest = nil
+ if ["text/plain", "text/html", "application/rtf"].include? new_content_type
+ converted_body = body.gsub /\r\n/, "\n"
+ new_converted_hexdigest = Digest::MD5.hexdigest converted_body
+ puts "new_converted_hexdigest is #{new_converted_hexdigest}"
+ end
+ if (! new_converted_hexdigest) || (old_hexdigest != new_converted_hexdigest)
+ puts "#{i} #{filename} old_hexdigest #{old_hexdigest} wasn't the same as new_hexdigest #{new_hexdigest}"
+ puts " body was of length #{body.length}"
+ puts " content type was: #{new_content_type}"
+ path = "/tmp/#{new_hexdigest}"
+ f = File.new path, "w"
+ f.write body
+ f.close
+ puts " wrote body to #{path}"
+ attachments_with_different_hexdigest += 1
+ end
+ end
+ end
+ end
+
+ end
+
+ puts "total_attachments: #{total_attachments}"
+ puts "attachments_with_different_hexdigest: #{attachments_with_different_hexdigest}"
+ puts "files_with_different_numbers_of_attachments: #{files_with_different_numbers_of_attachments}"
+ puts "no_tnef_attachments: #{no_tnef_attachments}"
+ puts "no_parts_in_multipart: #{no_parts_in_multipart}"
+
+ end
+
end
diff --git a/spec/controllers/api_controller_spec.rb b/spec/controllers/api_controller_spec.rb
index 749be9f85..66b8e33f0 100644
--- a/spec/controllers/api_controller_spec.rb
+++ b/spec/controllers/api_controller_spec.rb
@@ -259,7 +259,7 @@ describe ApiController, "when using the API" do
attachments.size.should == 1
attachment = attachments[0]
attachment.filename.should == "tfl.pdf"
- attachment.body.should == load_file_fixture("tfl.pdf", as_binary=true)
+ attachment.body.should == load_file_fixture("tfl.pdf")
end
it "should show information about a request" do
diff --git a/spec/controllers/request_controller_spec.rb b/spec/controllers/request_controller_spec.rb
index 657837c72..9cc60a103 100644
--- a/spec/controllers/request_controller_spec.rb
+++ b/spec/controllers/request_controller_spec.rb
@@ -477,11 +477,11 @@ describe RequestController, "when showing one request" do
(assigns[:info_request_events].size - size_before).should == 1
ir.reload
- get :get_attachment, :incoming_message_id => ir.incoming_messages[1].id, :id => ir.id, :part => 2, :file_name => 'hello.txt', :skip_cache => 1
+ get :get_attachment, :incoming_message_id => ir.incoming_messages[1].id, :id => ir.id, :part => 2, :file_name => 'hello world.txt', :skip_cache => 1
response.content_type.should == "text/plain"
response.should contain "Second hello"
- get :get_attachment, :incoming_message_id => ir.incoming_messages[1].id, :id => ir.id, :part => 3, :file_name => 'hello.txt', :skip_cache => 1
+ get :get_attachment, :incoming_message_id => ir.incoming_messages[1].id, :id => ir.id, :part => 3, :file_name => 'hello world.txt', :skip_cache => 1
response.content_type.should == "text/plain"
response.should contain "First hello"
end
@@ -494,7 +494,7 @@ describe RequestController, "when showing one request" do
get :get_attachment, :incoming_message_id => ir.incoming_messages[1].id,
:id => ir.id,
:part => 2,
- :file_name => 'hello.txt'
+ :file_name => 'hello world.txt'
end
it "should convert message body to UTF8" do
@@ -508,7 +508,7 @@ describe RequestController, "when showing one request" do
ir = info_requests(:fancy_dog_request)
receive_incoming_mail('incoming-request-two-same-name.email', ir.incoming_email)
ir.reload
- get :get_attachment_as_html, :incoming_message_id => ir.incoming_messages[1].id, :id => ir.id, :part => 2, :file_name => 'hello.txt.html', :skip_cache => 1
+ get :get_attachment_as_html, :incoming_message_id => ir.incoming_messages[1].id, :id => ir.id, :part => 2, :file_name => 'hello world.txt.html', :skip_cache => 1
response.content_type.should == "text/html"
response.should contain "Second hello"
end
@@ -529,11 +529,11 @@ describe RequestController, "when showing one request" do
ir.reload
ugly_id = "55195"
lambda {
- get :get_attachment, :incoming_message_id => ir.incoming_messages[1].id, :id => ugly_id, :part => 2, :file_name => 'hello.txt.html', :skip_cache => 1
+ get :get_attachment, :incoming_message_id => ir.incoming_messages[1].id, :id => ugly_id, :part => 2, :file_name => 'hello world.txt.html', :skip_cache => 1
}.should raise_error(ActiveRecord::RecordNotFound)
lambda {
- get :get_attachment_as_html, :incoming_message_id => ir.incoming_messages[1].id, :id => ugly_id, :part => 2, :file_name => 'hello.txt', :skip_cache => 1
+ get :get_attachment_as_html, :incoming_message_id => ir.incoming_messages[1].id, :id => ugly_id, :part => 2, :file_name => 'hello world.txt', :skip_cache => 1
}.should raise_error(ActiveRecord::RecordNotFound)
end
it "should return 404 when incoming message and request ids don't match" do
@@ -542,7 +542,7 @@ describe RequestController, "when showing one request" do
receive_incoming_mail('incoming-request-two-same-name.email', ir.incoming_email)
ir.reload
lambda {
- get :get_attachment_as_html, :incoming_message_id => ir.incoming_messages[1].id, :id => wrong_id, :part => 2, :file_name => 'hello.txt.html', :skip_cache => 1
+ get :get_attachment_as_html, :incoming_message_id => ir.incoming_messages[1].id, :id => wrong_id, :part => 2, :file_name => 'hello world.txt.html', :skip_cache => 1
}.should raise_error(ActiveRecord::RecordNotFound)
end
it "should return 404 for ugly URLs contain a request id that isn't an integer, even if the integer prefix refers to an actual request" do
@@ -552,11 +552,11 @@ describe RequestController, "when showing one request" do
ugly_id = "%d95" % [info_requests(:naughty_chicken_request).id]
lambda {
- get :get_attachment, :incoming_message_id => ir.incoming_messages[1].id, :id => ugly_id, :part => 2, :file_name => 'hello.txt.html', :skip_cache => 1
+ get :get_attachment, :incoming_message_id => ir.incoming_messages[1].id, :id => ugly_id, :part => 2, :file_name => 'hello world.txt.html', :skip_cache => 1
}.should raise_error(ActiveRecord::RecordNotFound)
lambda {
- get :get_attachment_as_html, :incoming_message_id => ir.incoming_messages[1].id, :id => ugly_id, :part => 2, :file_name => 'hello.txt', :skip_cache => 1
+ get :get_attachment_as_html, :incoming_message_id => ir.incoming_messages[1].id, :id => ugly_id, :part => 2, :file_name => 'hello world.txt', :skip_cache => 1
}.should raise_error(ActiveRecord::RecordNotFound)
end
it "should return 404 when incoming message and request ids don't match" do
@@ -565,7 +565,7 @@ describe RequestController, "when showing one request" do
receive_incoming_mail('incoming-request-two-same-name.email', ir.incoming_email)
ir.reload
lambda {
- get :get_attachment_as_html, :incoming_message_id => ir.incoming_messages[1].id, :id => wrong_id, :part => 2, :file_name => 'hello.txt.html', :skip_cache => 1
+ get :get_attachment_as_html, :incoming_message_id => ir.incoming_messages[1].id, :id => wrong_id, :part => 2, :file_name => 'hello world.txt.html', :skip_cache => 1
}.should raise_error(ActiveRecord::RecordNotFound)
end
@@ -573,44 +573,66 @@ describe RequestController, "when showing one request" do
ir = info_requests(:fancy_dog_request)
receive_incoming_mail('incoming-request-pdf-attachment.email', ir.incoming_email)
ir.reload
- get :get_attachment_as_html, :incoming_message_id => ir.incoming_messages[1].id, :id => ir.id, :part => 2, :file_name => 'fs_50379341.pdf.html', :skip_cache => 1
+ get :get_attachment_as_html, :incoming_message_id => ir.incoming_messages[1].id, :id => ir.id, :part => 2, :file_name => 'fs 50379341.pdf.html', :skip_cache => 1
response.content_type.should == "text/html"
response.should contain "Walberswick Parish Council"
end
- it "should not cause a reparsing of the raw email, even when the result would be a 404" do
+ it "should not cause a reparsing of the raw email, even when the attachment can't be found" do
ir = info_requests(:fancy_dog_request)
receive_incoming_mail('incoming-request-two-same-name.email', ir.incoming_email)
ir.reload
- attachment = IncomingMessage.get_attachment_by_url_part_number(ir.incoming_messages[1].get_attachments_for_display, 2)
+ attachment = IncomingMessage.get_attachment_by_url_part_number_and_filename(ir.incoming_messages[1].get_attachments_for_display, 2, 'hello world.txt')
attachment.body.should contain "Second hello"
# change the raw_email associated with the message; this only be reparsed when explicitly asked for
ir.incoming_messages[1].raw_email.data = ir.incoming_messages[1].raw_email.data.sub("Second", "Third")
- # asking for an attachment by the wrong filename results
- # in a 404 for browsing users. This shouldn't cause a
- # re-parse...
- lambda {
- get :get_attachment_as_html, :incoming_message_id => ir.incoming_messages[1].id, :id => ir.id, :part => 2, :file_name => 'hello.txt.baz.html', :skip_cache => 1
- }.should raise_error(ActiveRecord::RecordNotFound)
+ # asking for an attachment by the wrong filename should result in redirecting
+ # back to the incoming message, but shouldn't cause a reparse:
+ get :get_attachment_as_html, :incoming_message_id => ir.incoming_messages[1].id, :id => ir.id, :part => 2, :file_name => 'hello world.txt.baz.html', :skip_cache => 1
+ response.status.should == 303
- attachment = IncomingMessage.get_attachment_by_url_part_number(ir.incoming_messages[1].get_attachments_for_display, 2)
+ attachment = IncomingMessage.get_attachment_by_url_part_number_and_filename(ir.incoming_messages[1].get_attachments_for_display, 2, 'hello world.txt')
attachment.body.should contain "Second hello"
# ...nor should asking for it by its correct filename...
- get :get_attachment_as_html, :incoming_message_id => ir.incoming_messages[1].id, :id => ir.id, :part => 2, :file_name => 'hello.txt.html', :skip_cache => 1
+ get :get_attachment_as_html, :incoming_message_id => ir.incoming_messages[1].id, :id => ir.id, :part => 2, :file_name => 'hello world.txt.html', :skip_cache => 1
response.should_not contain "Third hello"
# ...but if we explicitly ask for attachments to be extracted, then they should be
force = true
ir.incoming_messages[1].parse_raw_email!(force)
ir.reload
- attachment = IncomingMessage.get_attachment_by_url_part_number(ir.incoming_messages[1].get_attachments_for_display, 2)
+ attachment = IncomingMessage.get_attachment_by_url_part_number_and_filename(ir.incoming_messages[1].get_attachments_for_display, 2, 'hello world.txt')
attachment.body.should contain "Third hello"
- get :get_attachment_as_html, :incoming_message_id => ir.incoming_messages[1].id, :id => ir.id, :part => 2, :file_name => 'hello.txt.html', :skip_cache => 1
+ get :get_attachment_as_html, :incoming_message_id => ir.incoming_messages[1].id, :id => ir.id, :part => 2, :file_name => 'hello world.txt.html', :skip_cache => 1
response.should contain "Third hello"
end
+ it "should redirect to the incoming message if there's a wrong part number and an ambiguous filename" do
+ ir = info_requests(:fancy_dog_request)
+ receive_incoming_mail('incoming-request-two-same-name.email', ir.incoming_email)
+ ir.reload
+
+ im = ir.incoming_messages[1]
+
+ attachment = IncomingMessage.get_attachment_by_url_part_number_and_filename(im.get_attachments_for_display, 5, 'hello world.txt')
+ attachment.should be_nil
+
+ get :get_attachment_as_html, :incoming_message_id => im.id, :id => ir.id, :part => 5, :file_name => 'hello world.txt', :skip_cache => 1
+ response.status.should == 303
+ new_location = response.header['Location']
+ new_location.should match(/request\/#{ir.url_title}#incoming-#{im.id}/)
+ end
+
+ it "should find a uniquely named filename even if the URL part number was wrong" do
+ ir = info_requests(:fancy_dog_request)
+ receive_incoming_mail('incoming-request-pdf-attachment.email', ir.incoming_email)
+ ir.reload
+ get :get_attachment, :incoming_message_id => ir.incoming_messages[1].id, :id => ir.id, :part => 5, :file_name => 'fs 50379341.pdf', :skip_cache => 1
+ response.content_type.should == "application/pdf"
+ end
+
it "should treat attachments with unknown extensions as binary" do
ir = info_requests(:fancy_dog_request)
receive_incoming_mail('incoming-request-attachment-unknown-extension.email', ir.incoming_email)
@@ -625,10 +647,8 @@ describe RequestController, "when showing one request" do
ir = info_requests(:fancy_dog_request)
receive_incoming_mail('incoming-request-two-same-name.email', ir.incoming_email)
- lambda {
- get :get_attachment, :incoming_message_id => ir.incoming_messages[1].id, :id => ir.id, :part => 2,
- :file_name => 'http://trying.to.hack'
- }.should raise_error(ActiveRecord::RecordNotFound)
+ get :get_attachment, :incoming_message_id => ir.incoming_messages[1].id, :id => ir.id, :part => 2, :file_name => 'http://trying.to.hack'
+ response.status.should == 303
end
it "should censor attachments downloaded as binary" do
@@ -644,7 +664,7 @@ describe RequestController, "when showing one request" do
begin
receive_incoming_mail('incoming-request-two-same-name.email', ir.incoming_email)
- get :get_attachment, :incoming_message_id => ir.incoming_messages[1].id, :id => ir.id, :part => 2, :file_name => 'hello.txt', :skip_cache => 1
+ get :get_attachment, :incoming_message_id => ir.incoming_messages[1].id, :id => ir.id, :part => 2, :file_name => 'hello world.txt', :skip_cache => 1
response.content_type.should == "text/plain"
response.should contain "xxxxxx hello"
ensure
@@ -666,7 +686,7 @@ describe RequestController, "when showing one request" do
receive_incoming_mail('incoming-request-two-same-name.email', ir.incoming_email)
ir.reload
- get :get_attachment, :incoming_message_id => ir.incoming_messages[1].id, :id => ir.id, :part => 2, :file_name => 'hello.txt', :skip_cache => 1
+ get :get_attachment, :incoming_message_id => ir.incoming_messages[1].id, :id => ir.id, :part => 2, :file_name => 'hello world.txt', :skip_cache => 1
response.content_type.should == "text/plain"
response.should contain "xxxxxx hello"
ensure
@@ -695,11 +715,13 @@ describe RequestController, "when showing one request" do
# so at this point, assigns[:info_request].incoming_messages[1].get_attachments_for_display is returning stuff, but the equivalent thing in the template isn't.
# but something odd is that the above is return a whole load of attachments which aren't there in the controller
response.body.should have_selector("p.attachment strong") do |s|
- s.should contain /hello.txt/m
+ s.should contain /hello world.txt/m
end
censor_rule = CensorRule.new()
- censor_rule.text = "hello.txt"
+ # Note that the censor rule applies to the original filename,
+ # not the display_filename:
+ censor_rule.text = "hello-world.txt"
censor_rule.replacement = "goodbye.txt"
censor_rule.last_edit_editor = "unknown"
censor_rule.last_edit_comment = "none"
@@ -743,7 +765,7 @@ describe RequestController, "when showing one request" do
old_path = assigns[:url_path]
response.location.should contain /#{assigns[:url_path]}$/
zipfile = Zip::ZipFile.open(File.join(File.dirname(__FILE__), "../../cache/zips", old_path)) { |zipfile|
- zipfile.count.should == 3 # the message plus two "hello.txt" files
+ zipfile.count.should == 3 # the message plus two "hello-world.txt" files
}
# The path of the zip file is based on the hash of the timestamp of the last request
@@ -756,7 +778,7 @@ describe RequestController, "when showing one request" do
assigns[:url_path].should_not == old_path
response.location.should contain assigns[:url_path]
zipfile = Zip::ZipFile.open(File.join(File.dirname(__FILE__), "../../cache/zips", assigns[:url_path])) { |zipfile|
- zipfile.count.should == 4 # the message, two hello.txt plus the unknown attachment
+ zipfile.count.should == 4 # the message, two hello-world.txt plus the unknown attachment
}
end
@@ -875,7 +897,7 @@ describe RequestController, "when changing prominence of a request" do
get :get_attachment_as_html, :incoming_message_id => ir.incoming_messages[1].id,
:id => ir.id,
:part => 2,
- :file_name => 'hello.txt'
+ :file_name => 'hello world.txt'
end.should raise_error(ActiveRecord::RecordNotFound)
end
@@ -890,7 +912,7 @@ describe RequestController, "when changing prominence of a request" do
get :get_attachment_as_html, :incoming_message_id => ir.incoming_messages[1].id,
:id => ir.id,
:part => 2,
- :file_name => 'hello.txt'
+ :file_name => 'hello world.txt'
end.should raise_error(ActiveRecord::RecordNotFound)
end
@@ -2394,7 +2416,7 @@ describe RequestController, "when caching fragments" do
attachment = mock(FoiAttachment, :display_filename => long_name,
:body_as_html => ['some text', 'wrapper'])
IncomingMessage.stub!(:find).with("44").and_return(incoming_message)
- IncomingMessage.stub!(:get_attachment_by_url_part_number).and_return(attachment)
+ IncomingMessage.stub!(:get_attachment_by_url_part_number_and_filename).and_return(attachment)
InfoRequest.stub!(:find).with("132").and_return(info_request)
params = { :file_name => long_name,
:controller => "request",
diff --git a/spec/fixtures/files/incoming-request-two-same-name.email b/spec/fixtures/files/incoming-request-two-same-name.email
index f1024d607..ecd322fe4 100644
--- a/spec/fixtures/files/incoming-request-two-same-name.email
+++ b/spec/fixtures/files/incoming-request-two-same-name.email
@@ -13,13 +13,13 @@ Content-Disposition: inline
--Q68bSM7Ycu6FN28Q
Content-Type: text/plain; charset=us-ascii
-Content-Disposition: attachment; filename="hello.txt"
+Content-Disposition: attachment; filename="hello-world.txt"
Second hello
--Q68bSM7Ycu6FN28Q
Content-Type: text/plain; charset=us-ascii
-Content-Disposition: attachment; filename="hello.txt"
+Content-Disposition: attachment; filename="hello-world.txt"
First hello
diff --git a/spec/fixtures/files/inline-uuencode.email b/spec/fixtures/files/inline-uuencode.email
new file mode 100644
index 000000000..3134ba3ad
--- /dev/null
+++ b/spec/fixtures/files/inline-uuencode.email
@@ -0,0 +1,27 @@
+From foo@bar Mon Jun 01 17:14:44 2009
+Return-path: <foo@bar>
+Envelope-to: foi@quux
+Delivery-date: Mon, 01 Jun 2009 17:14:44 +0100
+From: <foo@bar>
+To: <request-whatever@quux>
+Subject: something or other
+Date: Mon, 1 Jun 2009 17:14:37 +0100
+X-MimeOLE: Produced By Microsoft MimeOLE V6.00.3790.181
+Message-ID: <baz@xyzzy>
+
+Thanks for your email - here's a truncated attachment
+for you:
+
+**********************************************************************
+
+begin 666 ResponseT7363 9.doc
+MT,\1X*&Q&N$`````````````````````/@`#`/[_"0`&```````````````"
+M````) ``````````$ ``+@````$```#^____`````",```!L````________
+M````````````````````````````````````````````````````````````
+M````````````````````````````````````````````````````````````
+#````
+`
+end
+
+The original of this email was scanned for viruses or something
+like that.
diff --git a/spec/fixtures/files/malformed-to-and-cc.email b/spec/fixtures/files/malformed-to-and-cc.email
new file mode 100644
index 000000000..4fbb6e21e
--- /dev/null
+++ b/spec/fixtures/files/malformed-to-and-cc.email
@@ -0,0 +1,11 @@
+From foo@bar Wed Mar 12 14:58:26 2008
+Return-path: <foo@bar>
+Subject: example email
+To: <bar@example.org
+Cc: baz@example.org>
+From: quux@example.org
+Date: Mon, 7 May 2012 12:47:06 +0100
+Mime-Version: 1.0
+Content-Type: text/plain; charset=utf-8
+
+A very basic email, but with malformed To: and Cc: lines
diff --git a/spec/fixtures/files/mislabelled-as-iso-8859-1.email b/spec/fixtures/files/mislabelled-as-iso-8859-1.email
new file mode 100644
index 000000000..6c8e6109e
--- /dev/null
+++ b/spec/fixtures/files/mislabelled-as-iso-8859-1.email
@@ -0,0 +1,20 @@
+From foo@bar Thu Mar 01 15:02:33 2012
+Return-path: <foo@bar>
+Envelope-to: foi@quux
+Delivery-date: Thu, 01 Mar 2012 15:02:33 +0000
+Date: Thu, 01 Mar 2012 15:01:58 +0000
+Subject: some FOI request
+To: foi@quux
+From: foo@bar
+MIME-Version: 1.0
+Content-Type: text/plain; charset="iso-8859-1"
+Content-Transfer-Encoding: 7bit
+Message-Id: <2468@bar.local>
+
+Dear Whoever,
+
+THERE'S A DASH NEXT REQUEST FOR INFORMATION
+
+Best regards,
+Other Person
+
diff --git a/spec/fixtures/files/multipart-no-final-boundary.email b/spec/fixtures/files/multipart-no-final-boundary.email
new file mode 100644
index 000000000..9c16dad52
--- /dev/null
+++ b/spec/fixtures/files/multipart-no-final-boundary.email
@@ -0,0 +1,21 @@
+From foo@bar Thu Sep 13 10:34:44 2012
+Return-path: <foo@bar>
+Envelope-to: foi@example.org
+Delivery-date: Thu, 13 Sep 2012 10:34:44 +0100
+From: foo@bar
+To: foi@example.org
+Subject: an acknowledgement email
+Date: Thu, 13 Sep 2012 10:08:03 +0100
+Message-ID: <987654@foo.local>
+Content-Type: multipart/mixed; boundary="-----7D81B75CCC90D2974F7A1CBD"
+
+This is a multi-part message in MIME format.
+-------7D81B75CCC90D2974F7A1CBD
+Content-Type: text/html
+
+<div>
+ <p>
+ This is an acknowledgement of your email, that irritatingly
+ leaves out the final MIME boundary.
+ </p>
+<div>
diff --git a/spec/fixtures/files/nested-attachments-premature-end.email b/spec/fixtures/files/nested-attachments-premature-end.email
new file mode 100644
index 000000000..6b13808dc
--- /dev/null
+++ b/spec/fixtures/files/nested-attachments-premature-end.email
@@ -0,0 +1,110 @@
+From someone@example.org Mon May 15 13:10:29 2012
+Return-path: <someone@example.org>
+Envelope-to: foi@example.org
+Delivery-date: Mon, 15 May 2012 13:10:29 +0100
+Message-Id: <abcde@baz.local>
+Date: Mon, 15 May 2012 09:48:48 +0100
+From: "Example Person" <someone@example.org>
+To: <request@example.org>
+Subject: some FOI request or other
+Mime-Version: 1.0
+Content-Type: multipart/mixed; boundary="=__outer__="
+
+This is a MIME message. If you are reading this text, you may want to
+consider changing to a mail reader or gateway that understands how to
+properly handle MIME multipart messages.
+
+--=__outer__=
+Content-Type: multipart/alternative; boundary="=__inner__="
+
+--=__inner__=
+Content-Type: text/plain; charset="utf-8"
+Content-Transfer-Encoding: quoted-printable
+X-MIME-Autoconverted: from 8bit to quoted-printable by something
+
+Hello
+=20
+Please find some information attached.
+=20
+
+--=__inner__=
+Content-Description: HTML
+Content-Type: text/html; charset="utf-8"
+Content-Transfer-Encoding: quoted-printable
+
+<html>
+ <head>
+ <title>some title text</title>
+ </head>
+ <body>
+ <p>blah blah blah</p>
+ </body>
+</html>
+
+--=__inner__=--
+
+--=__outer__=
+Content-Type: message/rfc822
+
+Return-path: <foo@bar>
+Date: Mon, 7 May 2012 12:47:06 +0100
+From: someone-else@example.org
+To: foi@example.org
+Message-Id: <56789@quux.local>
+Subject: a freedom of information requests
+Mime-Version: 1.0
+Content-Type: text/plain; charset=utf-8
+
+ Dear Whoever,
+
+ Please could you let me know, um, whatever ...
+
+ Yours faithfully,
+
+ Whoever I Am
+
+--=__outer__=
+Content-Type: text/plain; charset=US-ASCII
+Content-Disposition: inline
+Content-Transfer-Encoding: quoted-printable
+
+ Dear Whowever,
+ =20
+ Please could you let me know, um, whatever ...
+ =20
+ Yours faithfully,
+ =20
+ Whoever I Am
+ =20
+
+--=__outer__=--
+
+--=__outer__=
+Content-Type: application/png; name="maroon-square.png"
+Content-Transfer-Encoding: base64
+Content-Disposition: attachment; filename="maroon-square.png"
+
+iVBORw0KGgoAAAANSUhEUgAAAEEAAABCCAYAAAAIY7vrAAAABmJLR0QA/wD/AP+g
+vaeTAAAACXBIWXMAAAsTAAALEwEAmpwYAAAAB3RJTUUH3QQeDSEx8qultwAAABl0
+RVh0Q29tbWVudABDcmVhdGVkIHdpdGggR0lNUFeBDhcAAAMzSURBVHja7VtL2psw
+DNS4rPv1Gj1Kt71Az9ZT9F7dN9MFGGThB/YfKDX2Kp8DRBpLowcKvn/5ShERiAgl
+srh8aT93tJzWdae8XR0CEICwUx59K54H4QFKp0Eg5alrAwEYIDx5DRAGCAOEAcIA
+QaUFfDoIHJawpEbOPd0dRPjJDWIUiEwt933+8es2Ovz++a3dCkREXmwD4ZbsVln6
+cLkef14duAMqAGCkY0A+jBNgXGFZU/eKa3fhZjlQqLhHKF9oFbpulE2Z/oFrXTd+
+nlOWkn1dMHXrAiWguq0iG9uk/REjBggPtgQOED781my4wwBhgDBAmPmUAwR0X0UO
+dxggnA8CO5xocU8HoAoEDwA6nOyCH+ZMKQ4zy+QbNBoUirquMPBJcgPyJkOi+c7S
+ohhn6ZctzDIrcFalIspYILG1et9WABUtt6WztLq+/0Amp9sCnsCBUhfvK4FLiRCA
+QwC7JABGTngrIIPnIjf6R5We0uxz3j+FbCvdy2nlY/IgcfrMRQuFHIC9Sap3AW8n
+2gZ+cZYCVn4LzBxxnykNgJpWN8lt7yw+QCMxan2s8lQXcNlDlpAW7YmIXMszTgoH
+rU91+8OFYXN9ikz/LyLgExSCDlaO+cdGsIEQkyUAIgFMKRTEn3vDjFFHwWSIzEQC
+cmN4IHVNGG2PQXhhsuRl3jihwQyB6H1274gV1BhKLKNt4ZEpkygeeoC+xytdK1cr
+oX0EACphnTZXbbLMmL/YBGo9lSU1OmBONMnTlQUqTa4y1VgAddg0hdTR04lyT0Xq
+8RYAyHVyBX6ET/9wTBD6TWVCMH5Qo3yhXju3bNY/BBMdsoLYBMmnzQdOP56O36s5
+40r1D7UWYV5dNT2nbxVBAHb43Y36CdbXfTii6isU/U7ZXLQ4w/V/wotFoilVF2kl
+w7YCDrIPkj4/G9fao7q0rYSSJdgeSqmQrCU+r/j8rOv/gpuKPm5Lffen5eN+ljeo
+rcfW0Om2Enm9KwDZAgrG98txX9cMe6X2E5SGU29VTE17lFAUkMybsXclndu31BGX
+hcgWv8oxonYtkf/jhc10WPGgm2IZncKlu+sg8vLm7hDSwk3f2/wFEzN3v6aAXQ0A
+AAAASUVORK5CYII=
+
+--=__outer__=--
+
diff --git a/spec/fixtures/files/no-part-charset-random-data.email b/spec/fixtures/files/no-part-charset-random-data.email
new file mode 100644
index 000000000..d51fd3f38
--- /dev/null
+++ b/spec/fixtures/files/no-part-charset-random-data.email
@@ -0,0 +1,30 @@
+From xxxx@yahoo.cn Mon Oct 08 14:01:34 2012
+Return-path: <xxxx@yahoo.cn>
+Envelope-to: foi@atlas.ukcod.org.uk
+Delivery-date: Mon, 08 Oct 2012 14:01:34 +0100
+Received: (qmail 63864 invoked from network); 8 Oct 2012 13:01:12 -0000
+DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=yahoo.cn; s=s1024; t=1349701272; bh=T/mtlIYvhB/L5RO+CvTazeAdGf1n1zsGXBoA8EKGT9M=; h=Message-ID:X-Yahoo-Newman-Property:X-YMail-OSG:X-Yahoo-SMTP:Received:X-mailer:From:Subject:To:Content-Transfer-Encoding:Content-Type:Date; b=LYI/PXvA7DA746bmyprChUg7N8YDvN9XE/bhfTt5MW7siOmxHHzn1w+s5X33PvLI0x0UfJLo+MCkTnGPKnG5BYY38US8PkocJYyphrvF/eaUl3ALf8UvxHBOJX1iIi89Xp2NnfbS8lz9kZAWifb9GOnOA5/kLDcL5/WJXliit2k=
+Message-ID: <xxxx@xxxx.yahoo.com>
+X-Yahoo-Newman-Property: ymail-5
+X-YMail-OSG: nPs5jgsVM1myUoKjeEPTxxalz4BM6BZMEUYu.E8NPMPQyo_
+ Yej8T2WCTurn767NOwhuDIqNxC2QGZINqfjmKcdyW7a1P_Zxqr9GsjgxODci
+ ihwr7qYAGDDbcsrB.PX4epnJZHl3yAwoGW.1ReEZnXQANFcNep7.zNEbZ_2k
+ RU1IhI9aHYvxPxt5RWugwOoFRh9P8Ym35A88IMazNtVaBiBEXF6Vk8Aqr9XP
+ 3Vh9xOT9Pn6X8qOUjNXkdb3xB4S5AAIRSE9mqhL1KzHBwdVQs25IoM_2FV2b
+ gPsQGgL4_mwBH0WcEMhdj7Kn6Nfb44L.50E_V3DH.8P7KzDK8zNVXSbAqohX
+ Qi6MzUK2frr8IyZyYzHb.ekff7kAcJgUoHvhnyPar8tRYxhQT3_xsUTzsx8N
+ oWckVPh_i3OT7U4ObgekqgtteMoYqPH2eF1SZXamGBAs-
+X-Yahoo-SMTP: YUQHwRWswBDjbw_M.D6EP4KpT9khlJErDRBQi4ySZQ--
+X-mailer: MIME::Lite 3.027 (F2.74; T1.31; A2.07; B3.13; Q3.13)
+From: =?GB2312?B?zsJKaWFu?= Bing <xxxx@yahoo.cn>
+Subject: =?GB2312?B?yM7A1svJ?=
+To: FOI Person <EMAIL_TO>
+Content-Transfer-Encoding: base64
+Content-Type: text/plain
+Date: Tue, 9 Oct 2012 20:53:06 +0800
+
+HPBSqsndNBX+ER4hyBoPhhnclcWKVFgbevdD5cJvfI/ARbxRYqA28hZ49Pf6A/ks
+NdVh4N5VPgRs/7SHYPfw5625pZJYTLj6nVdYk76sxnjiiAmwCJWGjPoWvO7nHUBv
+fuLXtNVq5HmD0bWWjAbSk2n74PW7v5izbNO2fjHyiyX2CIof0rriXDmOldJqoebO
+ejybrjG+Tahpu3FF1Mw98HfswzkdB46u/izLCzdUQVM=
+
diff --git a/spec/fixtures/files/part-without-charset-in-content-type.email b/spec/fixtures/files/part-without-charset-in-content-type.email
new file mode 100644
index 000000000..439d52cc3
--- /dev/null
+++ b/spec/fixtures/files/part-without-charset-in-content-type.email
@@ -0,0 +1,38 @@
+From example@example.com Wed Sep 15 17:55:40 2010
+Return-path: <example@example.com>
+Envelope-to: example@example.com
+Delivery-date: Wed, 15 Sep 2010 17:55:40 +0100
+From: <example@example.com>
+To: <request-xxxxx@whatdotheyknow.com>
+Date: Wed, 15 Sep 2010 17:56:03 +0100
+Subject: FOI Internal Review response
+Thread-Topic: FOI Internal Review response
+Thread-Index: xxxxx
+Message-ID: <xxxxxx>
+Accept-Language: en-US, en-GB
+Content-Language: en-US
+X-MS-Has-Attach: yes
+X-MS-TNEF-Correlator:
+acceptlanguage: en-US, en-GB
+Content-Type: multipart/mixed;
+ boundary="_002_E6527350F565F54A88C36C23F6C2B86702618AD0DF95SDCCPMSXMB5_"
+MIME-Version: 1.0
+
+--_002_E6527350F565F54A88C36C23F6C2B86702618AD0DF95SDCCPMSXMB5_
+Content-Type: text/plain; charset="utf-8"
+Content-Transfer-Encoding: base64
+
+someencodedtext=
+
+--_002_E6527350F565F54A88C36C23F6C2B86702618AD0DF95SDCCPMSXMB5_
+Content-Type: document/pdf; name="document.pdf"
+Content-Description: document.pdf
+Content-Disposition: attachment; filename="document.pdf";
+ size=62103; creation-date="Wed, 15 Sep 2010 17:54:27 GMT";
+ modification-date="Wed, 15 Sep 2010 17:54:27 GMT"
+Content-Transfer-Encoding: base64
+
+somemoreencodedtext=
+
+--_002_E6527350F565F54A88C36C23F6C2B86702618AD0DF95SDCCPMSXMB5_--
+
diff --git a/spec/fixtures/files/tnef-attachment-empty.email b/spec/fixtures/files/tnef-attachment-empty.email
new file mode 100644
index 000000000..7967aa95b
--- /dev/null
+++ b/spec/fixtures/files/tnef-attachment-empty.email
@@ -0,0 +1,196 @@
+From hello@blah.local Fri Feb 21 16:23:14 2013
+Return-path: <bar@example.org>
+Envelope-to: foo@example.org
+Delivery-date: Fri, 21 Feb 2013 16:23:14 +0000
+Content-Type: multipart/mixed;
+ boundary="_000_553468B23EE29B4F8836CBD0E1B2A15A275C3AA855POLNIEXMBV2po_"
+From: <bar@example.org>
+To: <foo@example.org>
+Sender: <hello@blah.local>
+Date: Fri, 21 Feb 2013 16:23:04 +0000
+Subject: here's a useless email
+Message-ID: <12345@blah.local>
+Accept-Language: en-US, en-GB
+Content-Language: en-US
+X-MS-Has-Attach:
+X-MS-TNEF-Correlator: <12345@blah.local>
+acceptlanguage: en-US, en-GB
+MIME-Version: 1.0
+
+--_000_553468B23EE29B4F8836CBD0E1B2A15A275C3AA855POLNIEXMBV2po_
+Content-Type: text/plain; charset="us-ascii"
+Content-Transfer-Encoding: quoted-printable
+
+This attachment just has a body from one of the tests
+in the tnef package in Debian.
+
+--_000_553468B23EE29B4F8836CBD0E1B2A15A275C3AA855POLNIEXMBV2po_
+Content-Disposition: attachment; filename="winmail.dat"
+Content-Transfer-Encoding: base64
+Content-Type: application/ms-tnef; name="winmail.dat"
+
+eJ8+IiURAQaQCAAEAAAAAAABAAEAAQeQBgAIAAAA5AQAAAAAAADoAAENgAQAAgAA
+AAIAAgABBYADAA4AAADVBwQAGQAKAA8AIwABADYBASCAAwAOAAAA1QcEABkACgAP
+ACQAAQA3AQEJgAEAIQAAADBEREEwRkNCQ0MwN0MxNDE5MkVFODZGQzQyRDE1Qjk1
+AGYHAQSQBgBkAgAAAQAAAA8AAAAfAAEwAQAAABAAAAAzAGsAdQBzAGUAcgAyAAAA
+HwACMAEAAAAGAAAARQBYAAAAAAAfAAMwAQAAAI4AAAAvAE8APQBCAFIALQBFAFgA
+QwBIAC0AVABFAFMAVAAvAE8AVQA9AEYASQBSAFMAVAAgAEEARABNAEkATgBJAFMA
+VABSAEEAVABJAFYARQAgAEcAUgBPAFUAUAAvAEMATgA9AFIARQBDAEkAUABJAEUA
+TgBUAFMALwBDAE4APQAzAGsAdQBzAGUAcgAyAAAAAAADAAAwAAAAAAMA/18AAAAA
+AwAVDAEAAAACAQswAQAAAEoAAABFWDovTz1CUi1FWENILVRFU1QvT1U9RklSU1Qg
+QURNSU5JU1RSQVRJVkUgR1JPVVAvQ049UkVDSVBJRU5UUy9DTj0zS1VTRVIyAAAA
+HwAgOgEAAAAQAAAAMwBrAHUAcwBlAHIAMgAAAAMA/V8BAAAACwBAOgAA+T8CAfdf
+AQAAAGMAAAAAAAAA3KdAyMBCEBq0uQgAKy/hggEAAAAAAAAAL289QlItRVhDSC1U
+RVNUL291PUZpcnN0IEFkbWluaXN0cmF0aXZlIEdyb3VwL2NuPVJlY2lwaWVudHMv
+Y249M2t1c2VyMgAAAwAAOQAAAAAfAP45AQAAAEoAAAAzAGsAdQBzAGUAcgAyAEAA
+YgByAGUAeABjAGgAYQBuAGcAZQAuAGQAbwBsAHAAaABpAG4AcwBlAGEAcgBjAGgA
+LgBjAG8AbQAAAAAAAwBxOgAAAAAfAPZfAQAAABAAAAAzAGsAdQBzAGUAcgAyAAAA
+m2sBA5AGAEwbAAAzAAAACwACAAEAAAAfABoAAQAAABIAAABJAFAATQAuAE4AbwB0
+AGUAAAAAAAMAJgAAAAAAAwA2AAAAAAAfADcAAQAAAB4AAABCAGkAbABsACAAbwBm
+ACAAUgBpAGcAaAB0AHMAAAAAAEAAOQBgQvtkuknFAR8APQABAAAAAgAAAAAAAAAC
+AUcAAQAAADgAAABjPXVzO2E9IDtwPUJSLUVYQ0gtVEVTVDtsPUJSLUVYQ0gtREVW
+MS0wNTA0MjUxNzE1MzZaLTE0AB8AcAABAAAAHgAAAEIAaQBsAGwAIABvAGYAIABS
+AGkAZwBoAHQAcwAAAAAAAgFxAAEAAAAWAAAAAcVJumT7yarjal9+TnmqsNvwaipi
+/QAAHwAaDAEAAAAQAAAAMwBrAHIAZQBsAGEAeQAAAB8AHQ4BAAAAHgAAAEIAaQBs
+AGwAIABvAGYAIABSAGkAZwBoAHQAcwAAAAAAAgETEAEAAADuFAAAPCFET0NUWVBF
+IEhUTUwgUFVCTElDICItLy9XM0MvL0RURCBIVE1MIDQuMCBUcmFuc2l0aW9uYWwv
+L0VOIj4NCjxIVE1MPjxIRUFEPg0KPE1FVEEgaHR0cC1lcXVpdj1Db250ZW50LVR5
+cGUgY29udGVudD0idGV4dC9odG1sOyBjaGFyc2V0PXVzLWFzY2lpIj4NCjxNRVRB
+IGNvbnRlbnQ9Ik1TSFRNTCA2LjAwLjM3OTAuMTgzMCIgbmFtZT1HRU5FUkFUT1I+
+PC9IRUFEPg0KPEJPRFk+DQo8RElWPg0KPERJVj48Rk9OVCBmYWNlPUFyaWFsIHNp
+emU9Mj5USEUgQklMTCBPRiBSSUdIVFM8QlI+QW1lbmRtZW50cyAxLTEwIG9mIHRo
+ZSANCkNvbnN0aXR1dGlvbjwvRk9OVD48L0RJVj4NCjxESVY+Jm5ic3A7PC9ESVY+
+DQo8RElWPjxGT05UIGZhY2U9QXJpYWwgc2l6ZT0yPlRoZSBDb252ZW50aW9ucyBv
+ZiBhIG51bWJlciBvZiB0aGUgU3RhdGVzIGhhdmluZywgDQphdCB0aGUgdGltZSBv
+ZiBhZG9wdGluZyB0aGUgQ29uc3RpdHV0aW9uLCBleHByZXNzZWQgYSBkZXNpcmUs
+IGluIG9yZGVyIHRvIA0KcHJldmVudCBtaXNjb25zdHJ1Y3Rpb24gb3IgYWJ1c2Ug
+b2YgaXRzIHBvd2VycywgdGhhdCBmdXJ0aGVyIGRlY2xhcmF0b3J5IGFuZCANCnJl
+c3RyaWN0aXZlIGNsYXVzZXMgc2hvdWxkIGJlIGFkZGVkLCBhbmQgYXMgZXh0ZW5k
+aW5nIHRoZSBncm91bmQgb2YgcHVibGljIA0KY29uZmlkZW5jZSBpbiB0aGUgR292
+ZXJubWVudCB3aWxsIGJlc3QgaW5zdXJlIHRoZSBiZW5lZmljZW50IGVuZHMgb2Yg
+aXRzIA0KaW5zdGl0dXRpb247IDxCUj5SZXNvbHZlZCwgYnkgdGhlIFNlbmF0ZSBh
+bmQgSG91c2Ugb2YgUmVwcmVzZW50YXRpdmVzIG9mIHRoZSANClVuaXRlZCBTdGF0
+ZXMgb2YgQW1lcmljYSwgaW4gQ29uZ3Jlc3MgYXNzZW1ibGVkLCB0d28tdGhpcmRz
+IG9mIGJvdGggSG91c2VzIA0KY29uY3VycmluZywgdGhhdCB0aGUgZm9sbG93aW5n
+IGFydGljbGVzIGJlIHByb3Bvc2VkIHRvIHRoZSBMZWdpc2xhdHVyZXMgb2YgdGhl
+IA0Kc2V2ZXJhbCBTdGF0ZXMsIGFzIGFtZW5kbWVudHMgdG8gdGhlIENvbnN0aXR1
+dGlvbiBvZiB0aGUgVW5pdGVkIFN0YXRlczsgYWxsIG9yIA0KYW55IG9mIHdoaWNo
+IGFydGljbGVzLCB3aGVuIHJhdGlmaWVkIGJ5IHRocmVlLWZvdXJ0aHMgb2YgdGhl
+IHNhaWQgTGVnaXNsYXR1cmVzLCANCnRvIGJlIHZhbGlkIHRvIGFsbCBpbnRlbnRz
+IGFuZCBwdXJwb3NlcyBhcyBwYXJ0IG9mIHRoZSBzYWlkIENvbnN0aXR1dGlvbiwg
+DQpuYW1lbHk6IDwvRk9OVD48L0RJVj4NCjxESVY+Jm5ic3A7PC9ESVY+DQo8RElW
+PjxGT05UIGZhY2U9QXJpYWwgc2l6ZT0yPkFtZW5kbWVudCBJPC9GT05UPjwvRElW
+Pg0KPERJVj4mbmJzcDs8L0RJVj4NCjxESVY+PEZPTlQgZmFjZT1BcmlhbCBzaXpl
+PTI+Q29uZ3Jlc3Mgc2hhbGwgbWFrZSBubyBsYXcgcmVzcGVjdGluZyBhbiANCmVz
+dGFibGlzaG1lbnQgb2YgcmVsaWdpb24sIG9yIHByb2hpYml0aW5nIHRoZSBmcmVl
+IGV4ZXJjaXNlIHRoZXJlb2Y7IG9yIA0KYWJyaWRnaW5nIHRoZSBmcmVlZG9tIG9m
+IHNwZWVjaCwgb3Igb2YgdGhlIHByZXNzOyBvciB0aGUgcmlnaHQgb2YgdGhlIHBl
+b3BsZSANCnBlYWNlYWJseSB0byBhc3NlbWJsZSwgYW5kIHRvIHBldGl0aW9uIHRo
+ZSBnb3Zlcm5tZW50IGZvciBhIHJlZHJlc3Mgb2YgDQpncmlldmFuY2VzLiA8L0ZP
+TlQ+PC9ESVY+DQo8RElWPiZuYnNwOzwvRElWPg0KPERJVj48Rk9OVCBmYWNlPUFy
+aWFsIHNpemU9Mj5BbWVuZG1lbnQgSUk8L0ZPTlQ+PC9ESVY+DQo8RElWPiZuYnNw
+OzwvRElWPg0KPERJVj48Rk9OVCBmYWNlPUFyaWFsIHNpemU9Mj5BIHdlbGwgcmVn
+dWxhdGVkIG1pbGl0aWEsIGJlaW5nIG5lY2Vzc2FyeSB0byB0aGUgDQpzZWN1cml0
+eSBvZiBhIGZyZWUgc3RhdGUsIHRoZSByaWdodCBvZiB0aGUgcGVvcGxlIHRvIGtl
+ZXAgYW5kIGJlYXIgYXJtcywgc2hhbGwgDQpub3QgYmUgaW5mcmluZ2VkLiA8L0ZP
+TlQ+PC9ESVY+DQo8RElWPiZuYnNwOzwvRElWPg0KPERJVj48Rk9OVCBmYWNlPUFy
+aWFsIHNpemU9Mj5BbWVuZG1lbnQgSUlJPC9GT05UPjwvRElWPg0KPERJVj4mbmJz
+cDs8L0RJVj4NCjxESVY+PEZPTlQgZmFjZT1BcmlhbCBzaXplPTI+Tm8gc29sZGll
+ciBzaGFsbCwgaW4gdGltZSBvZiBwZWFjZSBiZSBxdWFydGVyZWQgaW4gDQphbnkg
+aG91c2UsIHdpdGhvdXQgdGhlIGNvbnNlbnQgb2YgdGhlIG93bmVyLCBub3IgaW4g
+dGltZSBvZiB3YXIsIGJ1dCBpbiBhIG1hbm5lciANCnRvIGJlIHByZXNjcmliZWQg
+YnkgbGF3LiA8L0ZPTlQ+PC9ESVY+DQo8RElWPiZuYnNwOzwvRElWPg0KPERJVj48
+Rk9OVCBmYWNlPUFyaWFsIHNpemU9Mj5BbWVuZG1lbnQgSVY8L0ZPTlQ+PC9ESVY+
+DQo8RElWPiZuYnNwOzwvRElWPg0KPERJVj48Rk9OVCBmYWNlPUFyaWFsIHNpemU9
+Mj5UaGUgcmlnaHQgb2YgdGhlIHBlb3BsZSB0byBiZSBzZWN1cmUgaW4gdGhlaXIg
+DQpwZXJzb25zLCBob3VzZXMsIHBhcGVycywgYW5kIGVmZmVjdHMsIGFnYWluc3Qg
+dW5yZWFzb25hYmxlIHNlYXJjaGVzIGFuZCANCnNlaXp1cmVzLCBzaGFsbCBub3Qg
+YmUgdmlvbGF0ZWQsIGFuZCBubyB3YXJyYW50cyBzaGFsbCBpc3N1ZSwgYnV0IHVw
+b24gcHJvYmFibGUgDQpjYXVzZSwgc3VwcG9ydGVkIGJ5IG9hdGggb3IgYWZmaXJt
+YXRpb24sIGFuZCBwYXJ0aWN1bGFybHkgZGVzY3JpYmluZyB0aGUgcGxhY2UgDQp0
+byBiZSBzZWFyY2hlZCwgYW5kIHRoZSBwZXJzb25zIG9yIHRoaW5ncyB0byBiZSBz
+ZWl6ZWQuIDwvRk9OVD48L0RJVj4NCjxESVY+Jm5ic3A7PC9ESVY+DQo8RElWPjxG
+T05UIGZhY2U9QXJpYWwgc2l6ZT0yPkFtZW5kbWVudCBWPC9GT05UPjwvRElWPg0K
+PERJVj4mbmJzcDs8L0RJVj4NCjxESVY+PEZPTlQgZmFjZT1BcmlhbCBzaXplPTI+
+Tm8gcGVyc29uIHNoYWxsIGJlIGhlbGQgdG8gYW5zd2VyIGZvciBhIGNhcGl0YWws
+IG9yIA0Kb3RoZXJ3aXNlIGluZmFtb3VzIGNyaW1lLCB1bmxlc3Mgb24gYSBwcmVz
+ZW50bWVudCBvciBpbmRpY3RtZW50IG9mIGEgZ3JhbmQganVyeSwgDQpleGNlcHQg
+aW4gY2FzZXMgYXJpc2luZyBpbiB0aGUgbGFuZCBvciBuYXZhbCBmb3JjZXMsIG9y
+IGluIHRoZSBtaWxpdGlhLCB3aGVuIGluIA0KYWN0dWFsIHNlcnZpY2UgaW4gdGlt
+ZSBvZiB3YXIgb3IgcHVibGljIGRhbmdlcjsgbm9yIHNoYWxsIGFueSBwZXJzb24g
+YmUgc3ViamVjdCANCmZvciB0aGUgc2FtZSBvZmZlbnNlIHRvIGJlIHR3aWNlIHB1
+dCBpbiBqZW9wYXJkeSBvZiBsaWZlIG9yIGxpbWI7IG5vciBzaGFsbCBiZSANCmNv
+bXBlbGxlZCBpbiBhbnkgY3JpbWluYWwgY2FzZSB0byBiZSBhIHdpdG5lc3MgYWdh
+aW5zdCBoaW1zZWxmLCBub3IgYmUgZGVwcml2ZWQgDQpvZiBsaWZlLCBsaWJlcnR5
+LCBvciBwcm9wZXJ0eSwgd2l0aG91dCBkdWUgcHJvY2VzcyBvZiBsYXc7IG5vciBz
+aGFsbCBwcml2YXRlIA0KcHJvcGVydHkgYmUgdGFrZW4gZm9yIHB1YmxpYyB1c2Us
+IHdpdGhvdXQganVzdCBjb21wZW5zYXRpb24uIDwvRk9OVD48L0RJVj4NCjxESVY+
+Jm5ic3A7PC9ESVY+DQo8RElWPjxGT05UIGZhY2U9QXJpYWwgc2l6ZT0yPkFtZW5k
+bWVudCBWSTwvRk9OVD48L0RJVj4NCjxESVY+Jm5ic3A7PC9ESVY+DQo8RElWPjxG
+T05UIGZhY2U9QXJpYWwgc2l6ZT0yPkluIGFsbCBjcmltaW5hbCBwcm9zZWN1dGlv
+bnMsIHRoZSBhY2N1c2VkIHNoYWxsIA0KZW5qb3kgdGhlIHJpZ2h0IHRvIGEgc3Bl
+ZWR5IGFuZCBwdWJsaWMgdHJpYWwsIGJ5IGFuIGltcGFydGlhbCBqdXJ5IG9mIHRo
+ZSBzdGF0ZSANCmFuZCBkaXN0cmljdCB3aGVyZWluIHRoZSBjcmltZSBzaGFsbCBo
+YXZlIGJlZW4gY29tbWl0dGVkLCB3aGljaCBkaXN0cmljdCBzaGFsbCANCmhhdmUg
+YmVlbiBwcmV2aW91c2x5IGFzY2VydGFpbmVkIGJ5IGxhdywgYW5kIHRvIGJlIGlu
+Zm9ybWVkIG9mIHRoZSBuYXR1cmUgYW5kIA0KY2F1c2Ugb2YgdGhlIGFjY3VzYXRp
+b247IHRvIGJlIGNvbmZyb250ZWQgd2l0aCB0aGUgd2l0bmVzc2VzIGFnYWluc3Qg
+aGltOyB0byANCmhhdmUgY29tcHVsc29yeSBwcm9jZXNzIGZvciBvYnRhaW5pbmcg
+d2l0bmVzc2VzIGluIGhpcyBmYXZvciwgYW5kIHRvIGhhdmUgdGhlIA0KYXNzaXN0
+YW5jZSBvZiBjb3Vuc2VsIGZvciBoaXMgZGVmZW5zZS4gPC9GT05UPjwvRElWPg0K
+PERJVj4mbmJzcDs8L0RJVj4NCjxESVY+PEZPTlQgZmFjZT1BcmlhbCBzaXplPTI+
+QW1lbmRtZW50IFZJSTwvRk9OVD48L0RJVj4NCjxESVY+Jm5ic3A7PC9ESVY+DQo8
+RElWPjxGT05UIGZhY2U9QXJpYWwgc2l6ZT0yPkluIHN1aXRzIGF0IGNvbW1vbiBs
+YXcsIHdoZXJlIHRoZSB2YWx1ZSBpbiANCmNvbnRyb3ZlcnN5IHNoYWxsIGV4Y2Vl
+ZCB0d2VudHkgZG9sbGFycywgdGhlIHJpZ2h0IG9mIHRyaWFsIGJ5IGp1cnkgc2hh
+bGwgYmUgDQpwcmVzZXJ2ZWQsIGFuZCBubyBmYWN0IHRyaWVkIGJ5IGEganVyeSwg
+c2hhbGwgYmUgb3RoZXJ3aXNlIHJlZXhhbWluZWQgaW4gYW55IA0KY291cnQgb2Yg
+dGhlIFVuaXRlZCBTdGF0ZXMsIHRoYW4gYWNjb3JkaW5nIHRvIHRoZSBydWxlcyBv
+ZiB0aGUgY29tbW9uIGxhdy4gDQo8L0ZPTlQ+PC9ESVY+DQo8RElWPiZuYnNwOzwv
+RElWPg0KPERJVj48Rk9OVCBmYWNlPUFyaWFsIHNpemU9Mj5BbWVuZG1lbnQgVklJ
+STwvRk9OVD48L0RJVj4NCjxESVY+Jm5ic3A7PC9ESVY+DQo8RElWPjxGT05UIGZh
+Y2U9QXJpYWwgc2l6ZT0yPkV4Y2Vzc2l2ZSBiYWlsIHNoYWxsIG5vdCBiZSByZXF1
+aXJlZCwgbm9yIGV4Y2Vzc2l2ZSANCmZpbmVzIGltcG9zZWQsIG5vciBjcnVlbCBh
+bmQgdW51c3VhbCBwdW5pc2htZW50cyBpbmZsaWN0ZWQuIDwvRk9OVD48L0RJVj4N
+CjxESVY+Jm5ic3A7PC9ESVY+DQo8RElWPjxGT05UIGZhY2U9QXJpYWwgc2l6ZT0y
+PkFtZW5kbWVudCBJWDwvRk9OVD48L0RJVj4NCjxESVY+Jm5ic3A7PC9ESVY+DQo8
+RElWPjxGT05UIGZhY2U9QXJpYWwgc2l6ZT0yPlRoZSBlbnVtZXJhdGlvbiBpbiB0
+aGUgQ29uc3RpdHV0aW9uLCBvZiBjZXJ0YWluIA0KcmlnaHRzLCBzaGFsbCBub3Qg
+YmUgY29uc3RydWVkIHRvIGRlbnkgb3IgZGlzcGFyYWdlIG90aGVycyByZXRhaW5l
+ZCBieSB0aGUgDQpwZW9wbGUuIDwvRk9OVD48L0RJVj4NCjxESVY+Jm5ic3A7PC9E
+SVY+DQo8RElWPjxGT05UIGZhY2U9QXJpYWwgc2l6ZT0yPkFtZW5kbWVudCBYPC9G
+T05UPjwvRElWPg0KPERJVj4mbmJzcDs8L0RJVj4NCjxESVY+PEZPTlQgZmFjZT1B
+cmlhbCBzaXplPTI+VGhlIHBvd2VycyBub3QgZGVsZWdhdGVkIHRvIHRoZSBVbml0
+ZWQgU3RhdGVzIGJ5IA0KdGhlIENvbnN0aXR1dGlvbiwgbm9yIHByb2hpYml0ZWQg
+YnkgaXQgdG8gdGhlIHN0YXRlcywgYXJlIHJlc2VydmVkIHRvIHRoZSBzdGF0ZXMg
+DQpyZXNwZWN0aXZlbHksIG9yIHRvIHRoZSBwZW9wbGUuIDwvRk9OVD48L0RJVj48
+L0RJVj48L0JPRFk+PC9IVE1MPg0KAAAfADUQAQAAAKIAAAA8ADQANQAyADAARgA2
+ADEANQAxAEQAQQBGADIAQQA0ADQAQgBBADgANwA4AEIARgAyAEYAMwA4ADAAMwA0
+ADgARQAyADYARQA1AEAAYgByAC0AZQB4AGMAaAAtAGQAZQB2ADEALgBiAHIAZQB4
+AGMAaABhAG4AZwBlAC4AZABvAGwAcABoAGkAbgBzAGUAYQByAGMAaAAuAGMAbwBt
+AD4AAAAAAAMAgBD/////HwDzEAEAAAAmAAAAQgBpAGwAbAAgAG8AZgAgAFIAaQBn
+AGgAdABzAC4ARQBNAEwAAAAAAAsA9BAAAAAACwD1EAAAAAALAPYQAAAAAEAABzBR
+lpFluknFAUAACDBRlpFluknFAQMA3j+fTgAAAwDxPwkEAAAfAPg/AQAAABAAAAAz
+AGsAcgBlAGwAYQB5AAAAAgH5PwEAAABjAAAAAAAAANynQMjAQhAatLkIACsv4YIB
+AAAAAAAAAC9PPUJSLUVYQ0gtVEVTVC9PVT1GSVJTVCBBRE1JTklTVFJBVElWRSBH
+Uk9VUC9DTj1SRUNJUElFTlRTL0NOPTNLUkVMQVkAAB8A+j8BAAAAEAAAADMAawBy
+AGUAbABhAHkAAAACAfs/AQAAAGMAAAAAAAAA3KdAyMBCEBq0uQgAKy/hggEAAAAA
+AAAAL089QlItRVhDSC1URVNUL09VPUZJUlNUIEFETUlOSVNUUkFUSVZFIEdST1VQ
+L0NOPVJFQ0lQSUVOVFMvQ049M0tSRUxBWQAAAwD9P+QEAAADABlAAAAAAAMAGkAA
+AAAAHwAwQAEAAAAQAAAAMwBLAFIARQBMAEEAWQAAAB8AMUABAAAAEAAAADMASwBS
+AEUATABBAFkAAAAfADhAAQAAABAAAAAzAEsAUgBFAEwAQQBZAAAAHwA5QAEAAAAQ
+AAAAMwBLAFIARQBMAEEAWQAAAAMAdkD/////AwACWQAAFgADAAlZAgAAAAsAhYEI
+IAYAAAAAAMAAAAAAAABGAAAAAA6FAAAAAAAAAwCdgQggBgAAAAAAwAAAAAAAAEYA
+AAAAUoUAAJjDAQAfAJ6BCCAGAAAAAADAAAAAAAAARgAAAABUhQAAAQAAAAoAAAAx
+ADEALgAwAAAAAAADAOmBCCAGAAAAAADAAAAAAAAARgAAAAABhQAAAAAAAAsA7oEI
+IAYAAAAAAMAAAAAAAABGAAAAAAOFAAAAAAAAAwD4gQggBgAAAAAAwAAAAAAAAEYA
+AAAAEIUAAAAAAAADAP+BCCAGAAAAAADAAAAAAAAARgAAAAAYhQAAAAAAAAsAIIII
+IAYAAAAAAMAAAAAAAABGAAAAAAaFAAAAAAAACwAkggggBgAAAAAAwAAAAAAAAEYA
+AAAAgoUAAAAAAAAfACaCCCAGAAAAAADAAAAAAAAARgAAAACDhQAAAQAAACYAAAA0
+ADAANQAxADMAMQA1ADEANwAtADIANQAwADQAMgAwADAANQAAAAAAAwBxggggBgAA
+AAAAwAAAAAAAAEYAAAAAk4UAAAAAAAALACkAAAAAAAsAIwAAAAAAAgF/AAEAAABR
+AAAAPDQ1MjBGNjE1MURBRjJBNDRCQTg3OEJGMkYzODAzNDhFMjZFNUBici1leGNo
+LWRldjEuYnJleGNoYW5nZS5kb2xwaGluc2VhcmNoLmNvbT4AAAAAC/o=
+
+--_000_553468B23EE29B4F8836CBD0E1B2A15A275C3AA855POLNIEXMBV2po_--
+
diff --git a/spec/fixtures/files/tnef-attachment-truncated.email b/spec/fixtures/files/tnef-attachment-truncated.email
new file mode 100644
index 000000000..365a5a442
--- /dev/null
+++ b/spec/fixtures/files/tnef-attachment-truncated.email
@@ -0,0 +1,34 @@
+From hello@blah.local Fri Feb 21 16:23:14 2013
+Return-path: <bar@example.org>
+Envelope-to: foo@example.org
+Delivery-date: Fri, 21 Feb 2013 16:23:14 +0000
+Content-Type: multipart/mixed;
+ boundary="_000_553468B23EE29B4F8836CBD0E1B2A15A275C3AA855POLNIEXMBV2po_"
+From: <bar@example.org>
+To: <foo@example.org>
+Sender: <hello@blah.local>
+Date: Fri, 21 Feb 2013 16:23:04 +0000
+Subject: here's a useless email
+Message-ID: <12345@blah.local>
+Accept-Language: en-US, en-GB
+Content-Language: en-US
+X-MS-Has-Attach:
+X-MS-TNEF-Correlator: <12345@blah.local>
+acceptlanguage: en-US, en-GB
+MIME-Version: 1.0
+
+--_000_553468B23EE29B4F8836CBD0E1B2A15A275C3AA855POLNIEXMBV2po_
+Content-Type: text/plain; charset="us-ascii"
+Content-Transfer-Encoding: quoted-printable
+
+Some introductory text here, before the malformed TNEF attachment.
+
+--_000_553468B23EE29B4F8836CBD0E1B2A15A275C3AA855POLNIEXMBV2po_
+Content-Disposition: attachment; filename="winmail.dat"
+Content-Transfer-Encoding: base64
+Content-Type: application/ms-tnef; name="winmail.dat"
+
+eJ8+IkV9AQaQCAAEAAAAAAABAAEAAQeQBgAIAAAA5AQAAAAAAADoAAEJgAEAIQAAAEMyRUUzRUYx
+
+--_000_553468B23EE29B4F8836CBD0E1B2A15A275C3AA855POLNIEXMBV2po_--
+
diff --git a/spec/lib/basic_encoding_tests.rb b/spec/lib/basic_encoding_tests.rb
new file mode 100644
index 000000000..35d35fd4a
--- /dev/null
+++ b/spec/lib/basic_encoding_tests.rb
@@ -0,0 +1,157 @@
+# -*- coding: utf-8 -*-
+require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
+
+def bytes_to_binary_string( bytes, claimed_encoding = nil )
+ claimed_encoding ||= 'ASCII-8BIT'
+ bytes_string = bytes.pack('c*')
+ if RUBY_VERSION.to_f >= 1.9
+ bytes_string.force_encoding! claimed_encoding
+ end
+ bytes_string
+end
+
+random_string = bytes_to_binary_string [ 0x0f, 0x58, 0x1c, 0x8f, 0xa4, 0xcf,
+ 0xf6, 0x8c, 0x9d, 0xa7, 0x06, 0xd9,
+ 0xf7, 0x90, 0x6c, 0x6f]
+
+windows_1252_string = bytes_to_binary_string [ 0x44, 0x41, 0x53, 0x48, 0x20,
+ 0x96, 0x20, 0x44, 0x41, 0x53,
+ 0x48 ]
+
+# It's a shame this example is so long, but if we don't take enough it
+# gets misinterpreted as Shift_JIS
+
+gb_18030_bytes = [ 0xb9, 0xf3, 0xb9, 0xab, 0xcb, 0xbe, 0xb8, 0xba, 0xd4, 0xf0,
+ 0xc8, 0xcb, 0x28, 0xbe, 0xad, 0xc0, 0xed, 0x2f, 0xb2, 0xc6,
+ 0xce, 0xf1, 0x29, 0xc4, 0xfa, 0xba, 0xc3, 0xa3, 0xba, 0x0d,
+ 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+ 0x20, 0x20, 0x20, 0xb1, 0xbe, 0xb9, 0xab, 0xcb, 0xbe, 0xd4,
+ 0xda, 0x31, 0x39, 0x39, 0x37, 0xc4, 0xea, 0xb3, 0xc9, 0xc1,
+ 0xa2, 0xb9, 0xfa, 0xbc, 0xd2, 0xb9, 0xa4, 0xc9, 0xcc, 0xd7,
+ 0xa2, 0xb2, 0xe1, 0x2e, 0xca, 0xb5, 0xc1, 0xa6, 0xd0, 0xdb,
+ 0xba, 0xf1, 0xa1, 0xa3, 0xd3, 0xd0, 0xb6, 0xc0, 0xc1, 0xa2,
+ 0xcb, 0xb0, 0xce, 0xf1, 0x0d, 0x0a, 0x20, 0x20, 0x20, 0x20,
+ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0xd7, 0xa8, 0xd2, 0xb5,
+ 0xc8, 0xcb, 0xd4, 0xb1, 0x3b, 0xd4, 0xda, 0xc8, 0xab, 0xb9,
+ 0xfa, 0xb8, 0xf7, 0xb3, 0xc7, 0xca, 0xd0, 0xc9, 0xe8, 0xc1,
+ 0xa2, 0xb7, 0xd6, 0xb9, 0xab, 0xcb, 0xbe, 0xa3, 0xa8, 0xd5,
+ 0xe3, 0xbd, 0xad, 0xa1, 0xa2, 0xc9, 0xcf, 0xba, 0xa3, 0xa1,
+ 0xa2, 0xb9, 0xe3, 0xd6, 0xdd, 0xa1, 0xa2, 0xbd, 0xad, 0xcb,
+ 0xd5, 0xb5, 0xc8, 0x0d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
+ 0x20, 0x20, 0x20, 0x20, 0x20, 0xb5, 0xd8, 0xb7, 0xbd, 0xa3,
+ 0xa9, 0xd2, 0xf2, 0xbd, 0xf8, 0xcf, 0xee, 0xbd, 0xcf, 0xb6,
+ 0xe0, 0xcf, 0xd6, 0xcd, 0xea, 0xb3, 0xc9, 0xb2, 0xbb, 0xc1,
+ 0xcb, 0xc3, 0xbf, 0xd4, 0xc2, 0xcf, 0xfa, 0xca, 0xdb, 0xb6,
+ 0xee, 0xb6, 0xc8, 0xa1, 0xa3, 0xc3, 0xbf, 0xd4, 0xc2, 0xd3,
+ 0xd0, 0xd2, 0xbb, 0xb2, 0xbf, 0xb7, 0xd6, 0x0d, 0x0a, 0x20,
+ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0xd4,
+ 0xf6, 0xd6, 0xb5, 0xb6, 0x90, 0xa3, 0xa8, 0x36, 0x2d, 0x37,
+ 0x25, 0xd7, 0xf3, 0xd3, 0xd2, 0x29, 0xba, 0xcd, 0xc6, 0xd5,
+ 0xc6, 0xb1, 0xa3, 0xa8, 0x30, 0x2e, 0x35, 0x25, 0x2d, 0x32,
+ 0x25, 0x20, 0xd7, 0xf3, 0xd3, 0xd2, 0xa3, 0xa9, 0xd3, 0xc5,
+ 0xbb, 0xdd, 0xb4, 0xfa, 0xbf, 0xaa, 0xbb, 0xf2, 0xba, 0xcf,
+ 0xd7, 0xf7, 0xa3, 0xac, 0x0d, 0x0a, 0x20, 0x20, 0x20, 0x20,
+ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0xb5, 0xe3, 0xca, 0xfd,
+ 0xbd, 0xcf, 0xb5, 0xcd, 0xa1, 0xa3, 0xb4, 0xfa, 0xc0, 0xed,
+ 0xb7, 0xb6, 0xce, 0xa7, 0xc8, 0xe7, 0xcf, 0xc2, 0xa3, 0xba,
+ 0x0d, 0x0a ]
+
+gb_18030_spam_string = bytes_to_binary_string gb_18030_bytes
+
+describe "normalize_string_to_utf8" do
+
+ describe "when passed uniterpretable character data" do
+
+ it "should reject it as invalid" do
+
+ expect {
+ normalize_string_to_utf8 random_string
+ }.to raise_error(EncodingNormalizationError)
+
+ expect {
+ normalize_string_to_utf8 random_string, 'UTF-8'
+ }.to raise_error(EncodingNormalizationError)
+
+ end
+ end
+
+ describe "when passed unlabelled Windows 1252 data" do
+
+ it "should correctly convert it to UTF-8" do
+
+ normalized = normalize_string_to_utf8 windows_1252_string
+
+ normalized.should == "DASH – DASH"
+
+ end
+
+ end
+
+ describe "when passed GB 18030 data" do
+
+ it "should correctly convert it to UTF-8 if unlabelled" do
+
+ normalized = normalize_string_to_utf8 gb_18030_spam_string
+
+ normalized.should start_with("贵公司负责人")
+
+ end
+
+ end
+
+end
+
+describe "convert_string_to_utf8_or_binary" do
+
+ describe "when passed uniterpretable character data" do
+
+ it "should return it as a binary string" do
+
+ converted = convert_string_to_utf8_or_binary random_string
+ converted.should == random_string
+
+ if RUBY_VERSION.to_f >= 1.9
+ converted.encoding.should == 'ASCII-8BIT'
+ end
+
+ converted = convert_string_to_utf8_or_binary random_string,'UTF-8'
+ converted.should == random_string
+
+ if RUBY_VERSION.to_f >= 1.9
+ converted.encoding.should == 'ASCII-8BIT'
+ end
+
+ end
+ end
+
+ describe "when passed unlabelled Windows 1252 data" do
+
+ it "should correctly convert it to UTF-8" do
+
+ converted = convert_string_to_utf8_or_binary windows_1252_string
+
+ converted.should == "DASH – DASH"
+
+ if RUBY_VERSION.to_f >= 1.9
+ converted.encoding.should == 'UTF-8'
+ end
+ end
+
+ end
+
+ describe "when passed GB 18030 data" do
+
+ it "should correctly convert it to UTF-8 if unlabelled" do
+
+ converted = convert_string_to_utf8_or_binary gb_18030_spam_string
+
+ converted.should start_with("贵公司负责人")
+
+ if RUBY_VERSION.to_f >= 1.9
+ converted.encoding.should == 'UTF-8'
+ end
+ end
+
+ end
+
+end
diff --git a/spec/lib/mail_handler/mail_handler_spec.rb b/spec/lib/mail_handler/mail_handler_spec.rb
index 79b779687..01bf179f8 100644
--- a/spec/lib/mail_handler/mail_handler_spec.rb
+++ b/spec/lib/mail_handler/mail_handler_spec.rb
@@ -20,12 +20,33 @@ describe 'when creating a mail object from raw data' do
mail.to.should == ["request-66666-caa77777@whatdotheyknow.com", "foi@example.com"]
end
+ it 'should return nil for malformed To: and Cc: lines' do
+ mail = get_fixture_mail('malformed-to-and-cc.email')
+ mail.to.should == nil
+ mail.cc.should == nil
+ end
+
it 'should convert an iso8859 email to utf8' do
mail = get_fixture_mail('iso8859_2_raw_email.email')
mail.subject.should match /gjatë/u
MailHandler.get_part_body(mail).is_utf8?.should == true
end
+ it 'should convert a Windows-1252 body mislabelled as ISO-8859-1 to UTF-8' do
+ mail = get_fixture_mail('mislabelled-as-iso-8859-1.email')
+ body = MailHandler.get_part_body(mail)
+ body.is_utf8?.should == true
+ # This email is broken in at least these two ways:
+ # 1. It contains a top bit set character (0x96) despite the
+ # "Content-Transfer-Encoding: 7bit"
+ # 2. The charset in the Content-Type header is "iso-8859-1"
+ # but 0x96 is actually a Windows-1252 en dash, which would
+ # be Unicode codepoint 2013. It should be possible to
+ # spot the mislabelling, since 0x96 isn't a valid
+ # ISO-8859-1 character.
+ body.should match(/ \xe2\x80\x93 /)
+ end
+
end
describe 'when asked for the from name' do
@@ -275,6 +296,12 @@ end
describe 'when getting attachment attributes' do
+ it 'should handle a mail with a non-multipart part with no charset in the Content-Type header' do
+ mail = get_fixture_mail('part-without-charset-in-content-type.email')
+ attributes = MailHandler.get_attachment_attributes(mail)
+ attributes.size.should == 2
+ end
+
it 'should get two attachment parts from a multipart mail with text and html alternatives
and an image' do
mail = get_fixture_mail('quoted-subject-iso8859-1.email')
@@ -282,6 +309,13 @@ describe 'when getting attachment attributes' do
attributes.size.should == 2
end
+ it 'should get one attachment from a multipart mail with text and HTML alternatives, which should be UTF-8' do
+ mail = get_fixture_mail('iso8859_2_raw_email.email')
+ attributes = MailHandler.get_attachment_attributes(mail)
+ attributes.length.should == 1
+ attributes[0][:body].is_utf8?.should == true
+ end
+
it 'should expand a mail attached as text' do
# Note that this spec will only pass using Tmail in the timezone set as datetime headers
# are rendered out in the local time - using the Mail gem this is not necessary
@@ -304,6 +338,52 @@ describe 'when getting attachment attributes' do
attributes = MailHandler.get_attachment_attributes(mail)
end
+ it 'should ignore truncated TNEF attachment' do
+ mail = get_fixture_mail('tnef-attachment-truncated.email')
+ attributes = MailHandler.get_attachment_attributes(mail)
+ attributes.length.should == 2
+ end
+
+ it 'should ignore anything beyond the final MIME boundary' do
+ pending do
+ # This example raw email has a premature closing boundary for
+ # the outer multipart/mixed - my reading of RFC 1521 is that
+ # the "epilogue" beyond that should be ignored.
+ # See https://github.com/mysociety/alaveteli/issues/922 for
+ # more discussion.
+ mail = get_fixture_mail('nested-attachments-premature-end.email')
+ attributes = MailHandler.get_attachment_attributes(mail)
+ attributes.length.should == 3
+ end
+ end
+
+ it 'should cope with a missing final MIME boundary' do
+ mail = get_fixture_mail('multipart-no-final-boundary.email')
+ attributes = MailHandler.get_attachment_attributes(mail)
+ attributes.length.should == 1
+ attributes[0][:body].should match(/This is an acknowledgement of your email/)
+ attributes[0][:content_type].should == "text/plain"
+ attributes[0][:url_part_number].should == 1
+ end
+
+ it 'should ignore a TNEF attachment with no usable contents' do
+ # FIXME: "no usable contents" is slightly misleading. The
+ # attachment in this example email does have usable content in
+ # the body of the TNEF attachment, but the invocation of tnef
+ # historically used to unpack these attachments doesn't add
+ # the --save-body parameter, so that they have been ignored so
+ # far. We probably should include the body from such
+ # attachments, but, at the moment, with the pending upgrade to
+ # Rails 3, we just want to check that the behaviour is the
+ # same as before.
+ mail = get_fixture_mail('tnef-attachment-empty.email')
+ attributes = MailHandler.get_attachment_attributes(mail)
+ attributes.length.should == 2
+ # This is the size of the TNEF-encoded attachment; currently,
+ # we expect the code just to return this without decoding:
+ attributes[1][:body].length.should == 7769
+ end
+
it 'should produce a consistent set of url_part_numbers, content_types, within_rfc822_subjects
and filenames from an example mail with lots of attachments' do
mail = get_fixture_mail('many-attachments-date-header.email')
diff --git a/spec/models/incoming_message_spec.rb b/spec/models/incoming_message_spec.rb
index e22235298..1d86c26ad 100644
--- a/spec/models/incoming_message_spec.rb
+++ b/spec/models/incoming_message_spec.rb
@@ -59,12 +59,19 @@ describe IncomingMessage, " when dealing with incoming mail" do
message.subject.should == "Câmara Responde: Banco de ideias"
end
- it 'should not error on display of a message which has no charset set on the body part and
- is not good utf-8' do
+ it 'should deal with GB18030 text even if the charset is missing' do
ir = info_requests(:fancy_dog_request)
receive_incoming_mail('no-part-charset-bad-utf8.email', ir.incoming_email)
message = ir.incoming_messages[1]
message.parse_raw_email!
+ message.get_main_body_text_internal.should include("贵公司负责人")
+ end
+
+ it 'should not error on display of a message which has no charset set on the body part and is not good UTF-8' do
+ ir = info_requests(:fancy_dog_request)
+ receive_incoming_mail('no-part-charset-random-data.email', ir.incoming_email)
+ message = ir.incoming_messages[1]
+ message.parse_raw_email!
message.get_main_body_text_internal.should include("The above text was badly encoded")
end
@@ -412,6 +419,17 @@ describe IncomingMessage, " when uudecoding bad messages" do
im.get_attachments_for_display.size.should == 1
end
+ it "should still work when parsed from the raw email" do
+ raw_email = load_file_fixture 'inline-uuencode.email'
+ mail = MailHandler.mail_from_raw_email(raw_email)
+ im = incoming_messages :useless_incoming_message
+ im.stub!(:raw_email).and_return(raw_email)
+ im.stub!(:mail).and_return(mail)
+ im.parse_raw_email!
+ attachments = im.foi_attachments
+ attachments.size.should == 2
+ end
+
it "should apply censor rules" do
mail = get_fixture_mail('incoming-request-bad-uuencoding.email')
diff --git a/spec/support/email_helpers.rb b/spec/support/email_helpers.rb
index 7e98c39f6..252b1f137 100644
--- a/spec/support/email_helpers.rb
+++ b/spec/support/email_helpers.rb
@@ -8,7 +8,7 @@ end
def receive_incoming_mail(email_name, email_to, email_from = 'geraldinequango@localhost')
email_name = file_fixture_name(email_name)
- content = File.read(email_name)
+ content = File.open(email_name, 'rb') { |f| f.read }
content.gsub!('EMAIL_TO', email_to)
content.gsub!('EMAIL_FROM', email_from)
RequestMailer.receive(content)
diff --git a/spec/support/load_file_fixtures.rb b/spec/support/load_file_fixtures.rb
index 08079f654..a54505e99 100644
--- a/spec/support/load_file_fixtures.rb
+++ b/spec/support/load_file_fixtures.rb
@@ -2,13 +2,7 @@ def file_fixture_name(file_name)
return File.join(RSpec.configuration.fixture_path, "files", file_name)
end
-def load_file_fixture(file_name, as_binary=false)
+def load_file_fixture(file_name)
file_name = file_fixture_name(file_name)
- content = File.open(file_name, 'r') do |file|
- if as_binary
- file.set_encoding(Encoding::BINARY) if file.respond_to?(:set_encoding)
- end
- file.read
- end
- return content
+ return File.open(file_name, 'rb') { |f| f.read }
end