11 files changed, 723 insertions, 204 deletions
diff --git a/lib/alaveteli_external_command.rb b/lib/alaveteli_external_command.rb
index 3bfc34e3a..24b4b1aa8 100644
--- a/lib/alaveteli_external_command.rb
+++ b/lib/alaveteli_external_command.rb
@@ -2,6 +2,12 @@ require 'external_command'
 
 module AlaveteliExternalCommand
     class << self
+        # Final argument can be a hash of options.
+        # Valid options are:
+        # :append_to - string to append the output of the process to
+        # :stdin_string - stdin string to pass to the process
+        # :binary_output - boolean flag for treating the output as binary or text (only significant
+        #                  in ruby 1.9 and above)
         def run(program_name, *args)
             # Run an external program, and return its output.
             # Standard error is suppressed unless the program
@@ -10,7 +16,7 @@ module AlaveteliExternalCommand
             if !args.empty? && args[-1].is_a?(Hash)
                 opts = args.pop
             end
-            
+
             if program_name =~ %r(^/)
                 program_path = program_name
             else
@@ -24,12 +30,16 @@ module AlaveteliExternalCommand
                 end
                  raise "Could not find #{program_name} in any of #{Configuration::utility_search_path.join(', ')}" if !found
             end
-            
+
             xc = ExternalCommand.new(program_path, *args)
             if opts.has_key? :append_to
                 xc.out = opts[:append_to]
             end
+            if opts.has_key? :binary_output
+                xc.binary_mode = opts[:binary_output]
+            end
             xc.run(opts[:stdin_string] || "", opts[:env] || {})
+
             if xc.status != 0
                 # Error
                 $stderr.puts("Error from #{program_name} #{args.join(' ')}:")
diff --git a/lib/configuration.rb b/lib/configuration.rb
index abd0f5cdc..9c369b2e7 100644
--- a/lib/configuration.rb
+++ b/lib/configuration.rb
@@ -1,3 +1,13 @@
+require File.dirname(__FILE__) + '/../commonlib/rblib/config'
+
+# Load initial mySociety config
+if ENV["RAILS_ENV"] == "test"
+    MySociety::Config.set_file(File.join(File.dirname(__FILE__), 'test'), true)
+else
+    MySociety::Config.set_file(File.join(File.dirname(__FILE__), 'general'), true)
+end
+MySociety::Config.load_default
+
 # Configuration values with defaults
 
 # TODO: Make this return different values depending on the current rails environment
@@ -25,6 +35,7 @@ module Configuration
     :GA_CODE => '',
     :GAZE_URL => '',
     :HTML_TO_PDF_COMMAND => '',
+    :INCLUDE_DEFAULT_LOCALE_IN_URLS => true,
     :INCOMING_EMAIL_DOMAIN => 'localhost',
     :INCOMING_EMAIL_PREFIX => '',
     :INCOMING_EMAIL_SECRET => 'dummysecret',
diff --git a/lib/i18n_fixes.rb b/lib/i18n_fixes.rb
index f75b969c4..bb339fc55 100644
--- a/lib/i18n_fixes.rb
+++ b/lib/i18n_fixes.rb
@@ -6,10 +6,12 @@
 # so that we can interpolate our translation strings nicely
 
 def _(key, options = {})
-  # Assume the result of doing any translation is html_safe.
-  # In other words, we trust the translators.
-  translation = FastGettext._(key).html_safe || key
-  gettext_interpolate(translation, options)
+  # HACK: We should be going via GettextI18nRails instead of FastGettext below
+  # so that #translations_are_html_safe is respected but calling it directly
+  # doesn't work for me. I'm just marking the resulting string as html_safe.
+  # This whole hacky file should be removed
+  translation = FastGettext._(key) || key
+  gettext_interpolate(translation, options).html_safe
 end
 
 INTERPOLATION_RESERVED_KEYS = %w(scope default)
diff --git a/lib/mail_handler/backends/mail_backend.rb b/lib/mail_handler/backends/mail_backend.rb
index 0e198adf0..b75e6ed63 100644
--- a/lib/mail_handler/backends/mail_backend.rb
+++ b/lib/mail_handler/backends/mail_backend.rb
@@ -23,19 +23,291 @@ module MailHandler
                 main
             end
 
+            # Returns an outlook message as a Mail object, or nil if it can't be converted
+            def mail_from_outlook(content)
+                msg = Mapi::Msg.open(StringIO.new(content))
+                mail_from_raw_email(msg.to_mime.to_s).tap { |mail| mail.ready_to_send! }
+            rescue StandardError
+                nil
+            end
+
             # Return a copy of the file name for the mail part
-            def get_part_file_name(mail_part)
-                part_file_name = mail_part.filename
+            def get_part_file_name(part)
+                part_file_name = part.filename
                 part_file_name.nil? ? nil : part_file_name.dup
             end
 
+            # Get the body of a mail part
+            def get_part_body(part)
+                part.body.decoded
+            end
+
+            # Return the first from field if any
+            def first_from(mail)
+                if mail[:from]
+                    begin
+                        mail[:from].addrs[0]
+                        mail[:from].decoded
+                        return mail[:from].addrs[0]
+                    rescue
+                        return mail[:from].value
+                    end
+                else
+                    nil
+                end
+            end
+
+            # Return the first from address if any
+            def get_from_address(mail)
+                first_from = first_from(mail)
+                if first_from
+                    if first_from.is_a?(String)
+                        return nil
+                    else
+                        return first_from.address
+                    end
+                else
+                    return nil
+                end
+            end
+
+            # Return the first from name if any. Backslash escapes in the display name
+            # (e.g. \" inside a quoted name) are undone with a plain substitution: the name
+            # comes from an untrusted message header, so it must never be passed to eval.
+            def get_from_name(mail)
+                from = first_from(mail)
+                # first_from gives back a raw String when reading the parsed address failed
+                return nil if from.nil? || from.is_a?(String)
+                display_name = from.display_name
+                return nil unless display_name
+                # A backslash followed by any character stands for that character.
+                # NOTE(review): octal/hex escapes are not expanded - confirm none are expected
+                display_name.gsub(/\\(.)/m, '\1')
+            end
+
+            def get_all_addresses(mail)
+                envelope_to = mail['envelope-to'] ? [mail['envelope-to'].value] : []
+                ((mail.to || []) +
+                (mail.cc || []) +
+                (envelope_to || [])).uniq
+            end
+
+            def empty_return_path?(mail)
+                return false if mail['return-path'].nil?
+                return true if mail['return-path'].value.blank?
+                return false
+            end
+
+            def get_auto_submitted(mail)
+                mail['auto-submitted'] ? mail['auto-submitted'].value : nil
+            end
+
+            def get_content_type(part)
+                part.content_type ? part.content_type.split(';')[0] : nil
+            end
+
+            def get_header_string(header, mail)
+                mail.header[header] ? mail.header[header].to_s : nil
+            end
+
+            # Detects whether a mail part is an Outlook email
+            def is_outlook?(part)
+                filename = get_part_file_name(part)
+                return true if get_content_type(part) == 'application/vnd.ms-outlook'
+                if filename && AlaveteliFileTypes.filename_to_mimetype(filename) == 'application/vnd.ms-outlook'
+                    return true
+                end
+                return false
+            end
+
+            # Convert a mail part which is an attached mail in one of
+            # several formats into a mail object and set it as the
+            # rfc822_attachment on the part. If the mail part can't be
+            # converted, the content type on the part is updated to
+            # 'text/plain' for an RFC822 attachment, and 'application/octet-stream'
+            # for other types
+            def decode_attached_part(part, parent_mail)
+                if get_content_type(part) == 'message/rfc822'
+                    # An email attached as text
+                    part.rfc822_attachment = mail_from_raw_email(part.body)
+                    if part.rfc822_attachment.nil?
+                        # Attached mail didn't parse, so treat as text
+                        part.content_type = 'text/plain'
+                    end
+                elsif is_outlook?(part)
+                    part.rfc822_attachment = mail_from_outlook(part.body.decoded)
+                    if part.rfc822_attachment.nil?
+                         # Attached mail didn't parse, so treat as binary
+                         part.content_type = 'application/octet-stream'
+                    end
+                elsif get_content_type(part) == 'application/ms-tnef'
+                    # A set of attachments in a TNEF file
+                    part.rfc822_attachment = mail_from_tnef(part.body.decoded)
+                    if part.rfc822_attachment.nil?
+                        # Attached mail didn't parse, so treat as binary
+                        part.content_type = 'application/octet-stream'
+                    end
+                end
+                if part.rfc822_attachment
+                    expand_and_normalize_parts(part.rfc822_attachment, parent_mail)
+                end
+            end
+
+            # Expand and normalize a mail part recursively. Decodes attached messages into
+            # Mail objects wherever possible. Sets a default content type if none is
+            # set. Tries to set a more specific content type for binary content types.
+            def expand_and_normalize_parts(part, parent_mail)
+                if part.multipart?
+                  part.parts.each{ |sub_part| expand_and_normalize_parts(sub_part, parent_mail) }
+                else
+                  part_filename = get_part_file_name(part)
+                  charset = part.charset # save this, because overwriting content_type also resets charset
+
+                  # Don't allow nil content_types
+                  if get_content_type(part).nil?
+                      part.content_type = 'application/octet-stream'
+                  end
+
+                  # PDFs often come with this mime type, fix it up for view code
+                  if get_content_type(part) == 'application/octet-stream'
+                      part_body = get_part_body(part)
+                      calc_mime = AlaveteliFileTypes.filename_and_content_to_mimetype(part_filename,
+                                                                                      part_body)
+                      if calc_mime
+                          part.content_type = calc_mime
+                      end
+                  end
+
+                  # Use standard content types for Word documents etc.
+                  part.content_type = normalise_content_type(get_content_type(part))
+                  decode_attached_part(part, parent_mail)
+                  part.charset = charset
+                end
+            end
+
+            # Count the parts in a mail part recursively, including any attached messages.
+            # Set the count on the parent mail, and set a url_part_number on the part itself.
+            # Set the count for the first uudecoded part on the parent mail also.
+            def count_parts(part, parent_mail)
+                if part.multipart?
+                    part.parts.each { |p| count_parts(p, parent_mail) }
+                else
+                    if part.rfc822_attachment
+                        count_parts(part.rfc822_attachment, parent_mail)
+                    else
+                        parent_mail.count_parts_count += 1
+                        part.url_part_number = parent_mail.count_parts_count
+                    end
+                end
+                parent_mail.count_first_uudecode_count = parent_mail.count_parts_count
+            end
+
+            # Choose the best part from alternatives
+            def choose_best_alternative(mail)
+                if mail.html_part
+                    return mail.html_part
+                elsif mail.text_part
+                    return mail.text_part
+                else
+                    return mail.parts.first
+                end
+            end
+
+            # Expand and normalize the parts of a mail, select the best part
+            # wherever there is an alternative, and then count the returned
+            # leaves and assign url_part values to them
+            def get_attachment_leaves(mail)
+                expand_and_normalize_parts(mail, mail)
+                leaves = _get_attachment_leaves_recursive(mail, nil, mail)
+                mail.count_parts_count = 0
+                count_parts(mail, mail)
+                return leaves
+            end
+
+            # Recurse through a mail part, selecting the best part wherever there is
+            # an alternative
+            def _get_attachment_leaves_recursive(part, within_rfc822_attachment, parent_mail)
+                leaves_found = []
+                if part.multipart?
+                    raise "no parts on multipart mail" if part.parts.size == 0
+                    if part.sub_type == 'alternative'
+                        best_part = choose_best_alternative(part)
+                        leaves_found += _get_attachment_leaves_recursive(best_part,
+                                                                         within_rfc822_attachment,
+                                                                         parent_mail)
+                    else
+                        # Add all parts
+                        part.parts.each do |sub_part|
+                            leaves_found += _get_attachment_leaves_recursive(sub_part,
+                                                                             within_rfc822_attachment,
+                                                                             parent_mail)
+                        end
+                    end
+                else
+                    # Add all the parts of a decoded attached message
+                    if part.rfc822_attachment
+                        leaves_found += _get_attachment_leaves_recursive(part.rfc822_attachment,
+                                                                         part.rfc822_attachment,
+                                                                         parent_mail)
+                    else
+                        # Store leaf
+                        part.within_rfc822_attachment = within_rfc822_attachment
+                        leaves_found += [part]
+                    end
+                end
+                return leaves_found
+            end
+
+            # Add selected useful headers from an attached message to its body
+            def extract_attached_message_headers(leaf)
+                body = get_part_body(leaf)
+                # Test to see if we are in the first part of the attached
+                # RFC822 message and it is text, if so add headers.
+                if leaf.within_rfc822_attachment == leaf && get_content_type(leaf) == 'text/plain'
+                    headers = ""
+                    [ 'Date', 'Subject', 'From', 'To', 'Cc' ].each do |header|
+                        if header_value = get_header_string(header, leaf.within_rfc822_attachment)
+                            if !header_value.blank?
+                                headers = headers + header + ": " + header_value.to_s + "\n"
+                            end
+                        end
+                    end
+                    # XXX call _convert_part_body_to_text here, but need to get charset somehow
+                    # e.g. http://www.whatdotheyknow.com/request/1593/response/3088/attach/4/Freedom%20of%20Information%20request%20-%20car%20oval%20sticker:%20Article%2020,%20Convention%20on%20Road%20Traffic%201949.txt
+                    body = headers + "\n" + body
+                end
+                body
+            end
+
+            # Generate a hash of the attributes associated with each significant part of a Mail object.
+            # Returns an array with one hash per attachment leaf, in leaf order.
+            def get_attachment_attributes(mail)
+                get_attachment_leaves(mail).map do |leaf|
+                    # Reset for every leaf so that a subject found for an earlier leaf is
+                    # never carried over to a leaf that is not inside an attached message.
+                    within_rfc822_subject = nil
+                    body = get_part_body(leaf)
+                    if leaf.within_rfc822_attachment
+                        within_rfc822_subject = leaf.within_rfc822_attachment.subject
+                        body = extract_attached_message_headers(leaf)
+                    end
+                    { :url_part_number => leaf.url_part_number,
+                      :content_type => get_content_type(leaf),
+                      :filename => get_part_file_name(leaf),
+                      :charset => leaf.charset,
+                      :within_rfc822_subject => within_rfc822_subject,
+                      :body => body,
+                      :hexdigest => Digest::MD5.hexdigest(body) }
+                end
+            end
+
             # Format
             def address_from_name_and_email(name, email)
                 if !MySociety::Validate.is_valid_email(email)
                     raise "invalid email " + email + " passed to address_from_name_and_email"
                 end
                 if name.nil?
-                    return Mail::Address.new(email)
+                    return Mail::Address.new(email).to_s
                 end
                 address = Mail::Address.new
                 address.display_name = name
diff --git a/lib/mail_handler/backends/mail_extensions.rb b/lib/mail_handler/backends/mail_extensions.rb
index cbe0491ed..f756abd1a 100644
--- a/lib/mail_handler/backends/mail_extensions.rb
+++ b/lib/mail_handler/backends/mail_extensions.rb
@@ -1,7 +1,67 @@
+require 'mail/message'
+require 'mail/fields/common/parameter_hash'
 module Mail
     class Message
         attr_accessor :url_part_number
         attr_accessor :rfc822_attachment # when a whole email message is attached as text
         attr_accessor :within_rfc822_attachment # for parts within a message attached as text (for getting subject mainly)
+        attr_accessor :count_parts_count
+        attr_accessor :count_first_uudecode_count
+
+        # A patched version of the message initializer to work around a bug where stripping the original
+        # input removes meaningful spaces - e.g. in the case of uuencoded bodies.
+        def initialize(*args, &block)
+            @body = nil
+            @body_raw = nil
+            @separate_parts = false
+            @text_part = nil
+            @html_part = nil
+            @errors = nil
+            @header = nil
+            @charset = 'UTF-8'
+            @defaulted_charset = true
+
+            @perform_deliveries = true
+            @raise_delivery_errors = true
+
+            @delivery_handler = nil
+
+            @delivery_method = Mail.delivery_method.dup
+
+            @transport_encoding = Mail::Encodings.get_encoding('7bit')
+
+            @mark_for_delete = false
+
+            if args.flatten.first.respond_to?(:each_pair)
+                init_with_hash(args.flatten.first)
+            else
+                # The replacement of this commented out line is the change.
+                # init_with_string(args.flatten[0].to_s.strip)
+                init_with_string(args.flatten[0].to_s)
+            end
+
+            if block_given?
+                instance_eval(&block)
+            end
+
+            self
+        end
+    end
+
+    # A patched version of the parameter hash that handles nil values without throwing
+    # an error.
+    class ParameterHash < IndifferentHash
+
+        def encoded
+          map.sort { |a,b| a.first.to_s <=> b.first.to_s }.map do |key_name, value|
+            # The replacement of this commented out line is the change
+            # unless value.ascii_only?
+            unless value.nil? || value.ascii_only?
+              value = Mail::Encodings.param_encode(value)
+              key_name = "#{key_name}*"
+            end
+            %Q{#{key_name}=#{quote_token(value)}}
+          end.join(";\r\n\s")
+        end
     end
 end
 \ No newline at end of file
diff --git a/lib/mail_handler/backends/tmail_backend.rb b/lib/mail_handler/backends/tmail_backend.rb
index 87aba73d7..02124cdb1 100644
--- a/lib/mail_handler/backends/tmail_backend.rb
+++ b/lib/mail_handler/backends/tmail_backend.rb
@@ -41,6 +41,234 @@ module MailHandler
                 return part_file_name
             end
 
+            # Get the body of a mail part
+            def get_part_body(mail_part)
+                mail_part.body
+            end
+
+            # Return the first from address if any
+            def get_from_address(mail)
+                if mail.from_addrs.nil? || mail.from_addrs.size == 0
+                    return nil
+                end
+                mail.from_addrs[0].spec
+            end
+
+            # Return the first from name if any
+            def get_from_name(mail)
+                mail.from_name_if_present
+            end
+
+            def get_all_addresses(mail)
+                ((mail.to || []) +
+                (mail.cc || []) +
+                (mail.envelope_to || [])).uniq
+            end
+
+            def empty_return_path?(mail)
+                return false if mail['return-path'].nil?
+                return true if mail['return-path'].addr.to_s == '<>'
+                return false
+            end
+
+            def get_auto_submitted(mail)
+                mail['auto-submitted'] ? mail['auto-submitted'].body : nil
+            end
+
+            def get_content_type(part)
+                part.content_type
+            end
+
+            def get_header_string(header, mail)
+                mail.header_string(header)
+            end
+
+            # Number the attachments in depth first tree order, for use in URLs.
+            # XXX This fills in part.rfc822_attachment and part.url_part_number within
+            # all the parts of the email (see monkeypatches in lib/mail_handler/backends/tmail_extensions
+            # and lib/mail_handler/backends/mail_extensions for how these attributes are added).
+            # ensure_parts_counted must be called before using the attributes.
+            def ensure_parts_counted(mail)
+                mail.count_parts_count = 0
+                _count_parts_recursive(mail, mail)
+                # we carry on using these numeric ids for attachments uudecoded from within text parts
+                mail.count_first_uudecode_count = mail.count_parts_count
+            end
+            def _count_parts_recursive(part, mail)
+                if part.multipart?
+                    part.parts.each do |p|
+                        _count_parts_recursive(p, mail)
+                    end
+                else
+                    part_filename = get_part_file_name(part)
+                    begin
+                        if part.content_type == 'message/rfc822'
+                            # An email attached as text
+                            # e.g. http://www.whatdotheyknow.com/request/64/response/102
+                            part.rfc822_attachment = mail_from_raw_email(part.body, decode=false)
+                        elsif part.content_type == 'application/vnd.ms-outlook' || part_filename && AlaveteliFileTypes.filename_to_mimetype(part_filename) == 'application/vnd.ms-outlook'
+                            # An email attached as an Outlook file
+                            # e.g. http://www.whatdotheyknow.com/request/chinese_names_for_british_politi
+                            msg = Mapi::Msg.open(StringIO.new(part.body))
+                            part.rfc822_attachment = mail_from_raw_email(msg.to_mime.to_s, decode=false)
+                        elsif part.content_type == 'application/ms-tnef'
+                            # A set of attachments in a TNEF file
+                            part.rfc822_attachment = mail_from_tnef(part.body)
+                        end
+                    rescue
+                        # If attached mail doesn't parse, treat it as text part
+                        part.rfc822_attachment = nil
+                    else
+                        unless part.rfc822_attachment.nil?
+                            _count_parts_recursive(part.rfc822_attachment, mail)
+                        end
+                    end
+                    if part.rfc822_attachment.nil?
+                        mail.count_parts_count += 1
+                        part.url_part_number = mail.count_parts_count
+                    end
+                end
+            end
+
+            # Generate a hash of the attributes associated with each significant part of a mail object
+            def get_attachment_attributes(mail)
+                leaves = get_attachment_leaves(mail)
+                # XXX we have to call ensure_parts_counted after get_attachment_leaves, which is really messy.
+                ensure_parts_counted(mail)
+                attachment_attributes = []
+                for leaf in leaves
+                    # "for" shares its locals across iterations, so reset the subject for every leaf
+                    within_rfc822_subject = nil
+                    body = get_part_body(leaf)
+                    # As leaf.body causes MIME decoding which uses lots of RAM, do garbage collection here
+                    # to prevent excess memory use. XXX not really sure if this helps reduce
+                    # peak RAM use overall. Anyway, maybe there is something better to do than this.
+                    GC.start
+                    if leaf.within_rfc822_attachment
+                        within_rfc822_subject = leaf.within_rfc822_attachment.subject
+                        # Test to see if we are in the first part of the attached
+                        # RFC822 message and it is text, if so add headers.
+                        # XXX should probably use hunting algorithm to find main text part, rather than
+                        # just expect it to be first. This will do for now though.
+                        if leaf.within_rfc822_attachment == leaf && leaf.content_type == 'text/plain'
+                            headers = ""
+                            for header in [ 'Date', 'Subject', 'From', 'To', 'Cc' ]
+                                if leaf.within_rfc822_attachment.header.include?(header.downcase)
+                                    header_value = leaf.within_rfc822_attachment.header[header.downcase]
+                                    if !header_value.blank?
+                                        headers = headers + header + ": " + header_value.to_s + "\n"
+                                    end
+                                end
+                            end
+                            # XXX call _convert_part_body_to_text here, but need to get charset somehow
+                            # e.g. http://www.whatdotheyknow.com/request/1593/response/3088/attach/4/Freedom%20of%20Information%20request%20-%20car%20oval%20sticker:%20Article%2020,%20Convention%20on%20Road%20Traffic%201949.txt
+                            body = headers + "\n" + body
+                            # This is a quick way of getting all headers, but instead we only add some a) to make it
+                            # more usable, b) as at least one authority accidentally leaked security information into a header.
+                            #attachment.body = leaf.within_rfc822_attachment.port.to_s
+                        end
+                    end
+                    attachment_attributes << {:url_part_number => leaf.url_part_number,
+                                              :content_type => get_content_type(leaf),
+                                              :filename => get_part_file_name(leaf),
+                                              :charset => leaf.charset,
+                                              :within_rfc822_subject => within_rfc822_subject,
+                                              :body => body,
+                                              :hexdigest => Digest::MD5.hexdigest(body) }
+                end
+                attachment_attributes
+            end
+
+            # (This risks losing info if the unchosen alternative is the only one to contain
+            # useful info, but let's worry about that another time)
+            def get_attachment_leaves(mail)
+                return _get_attachment_leaves_recursive(mail, mail)
+            end
+            def _get_attachment_leaves_recursive(curr_mail, parent_mail, within_rfc822_attachment = nil)
+                leaves_found = []
+                if curr_mail.multipart?
+                    if curr_mail.parts.size == 0
+                        raise "no parts on multipart mail"
+                    end
+
+                    if curr_mail.sub_type == 'alternative'
+                        # Choose best part from alternatives
+                        best_part = nil
+                        # Take the last text/plain one, or else the first one
+                        curr_mail.parts.each do |m|
+                            if not best_part
+                                best_part = m
+                            elsif m.content_type == 'text/plain'
+                                best_part = m
+                            end
+                        end
+                        # Take an HTML one as even higher priority. (They tend
+                        # to render better than text/plain, e.g. don't wrap links here:
+                        # http://www.whatdotheyknow.com/request/amount_and_cost_of_freedom_of_in#incoming-72238 )
+                        curr_mail.parts.each do |m|
+                            if m.content_type == 'text/html'
+                                best_part = m
+                            end
+                        end
+                        leaves_found += _get_attachment_leaves_recursive(best_part, parent_mail, within_rfc822_attachment)
+                    else
+                        # Add all parts
+                        curr_mail.parts.each do |m|
+                            leaves_found += _get_attachment_leaves_recursive(m, parent_mail, within_rfc822_attachment)
+                        end
+                    end
+                else
+                    # XXX Yuck. this section alters various content_types. That puts
+                    # it into conflict with ensure_parts_counted which it has to be
+                    # called both before and after.  It will fail with cases of
+                    # attachments of attachments etc.
+                    charset = curr_mail.charset # save this, because overwriting content_type also resets charset
+                    # Don't allow nil content_types
+                    if curr_mail.content_type.nil?
+                        curr_mail.content_type = 'application/octet-stream'
+                    end
+                    # PDFs often come with this mime type, fix it up for view code
+                    if curr_mail.content_type == 'application/octet-stream'
+                        part_file_name = get_part_file_name(curr_mail)
+                        part_body = get_part_body(curr_mail)
+                        calc_mime = AlaveteliFileTypes.filename_and_content_to_mimetype(part_file_name, part_body)
+                        if calc_mime
+                            curr_mail.content_type = calc_mime
+                        end
+                    end
+
+                    # Use standard content types for Word documents etc.
+                    curr_mail.content_type = normalise_content_type(curr_mail.content_type)
+                    if curr_mail.content_type == 'message/rfc822'
+                        ensure_parts_counted(parent_mail) # fills in rfc822_attachment variable
+                        if curr_mail.rfc822_attachment.nil?
+                            # Attached mail didn't parse, so treat as text
+                            curr_mail.content_type = 'text/plain'
+                        end
+                    end
+                    if curr_mail.content_type == 'application/vnd.ms-outlook' || curr_mail.content_type == 'application/ms-tnef'
+                        ensure_parts_counted(parent_mail) # fills in rfc822_attachment variable
+                        if curr_mail.rfc822_attachment.nil?
+                            # Attached mail didn't parse, so treat as binary
+                            curr_mail.content_type = 'application/octet-stream'
+                        end
+                    end
+                    # If the part is an attachment of email
+                    if curr_mail.content_type == 'message/rfc822' || curr_mail.content_type == 'application/vnd.ms-outlook' || curr_mail.content_type == 'application/ms-tnef'
+                        ensure_parts_counted(parent_mail) # fills in rfc822_attachment variable
+                        leaves_found += _get_attachment_leaves_recursive(curr_mail.rfc822_attachment, parent_mail, curr_mail.rfc822_attachment)
+                    else
+                        # Store leaf
+                        curr_mail.within_rfc822_attachment = within_rfc822_attachment
+                        leaves_found += [curr_mail]
+                    end
+                    # restore original charset
+                    curr_mail.charset = charset
+                end
+                return leaves_found
+            end
+
+
             def address_from_name_and_email(name, email)
                 if !MySociety::Validate.is_valid_email(email)
                     raise "invalid email " + email + " passed to address_from_name_and_email"
diff --git a/lib/mail_handler/backends/tmail_extensions.rb b/lib/mail_handler/backends/tmail_extensions.rb
index 9359dfeea..3576a8eca 100644
--- a/lib/mail_handler/backends/tmail_extensions.rb
+++ b/lib/mail_handler/backends/tmail_extensions.rb
@@ -20,6 +20,8 @@ module TMail
         attr_accessor :url_part_number
         attr_accessor :rfc822_attachment # when a whole email message is attached as text
         attr_accessor :within_rfc822_attachment # for parts within a message attached as text (for getting subject mainly)
+        attr_accessor :count_parts_count
+        attr_accessor :count_first_uudecode_count
 
         # Monkeypatch! (check to see if this becomes a standard function in
         # TMail::Mail, then use that, whatever it is called)
diff --git a/lib/mail_handler/mail_handler.rb b/lib/mail_handler/mail_handler.rb
index 24d14b5c8..8b227b9ca 100644
--- a/lib/mail_handler/mail_handler.rb
+++ b/lib/mail_handler/mail_handler.rb
@@ -4,10 +4,12 @@ require 'tmpdir'
 module MailHandler
 
     if RUBY_VERSION.to_f >= 1.9
+        require 'mail'
         require 'backends/mail_extensions'
         require 'backends/mail_backend'
         include Backends::MailBackend
     else
+        require 'action_mailer'
         require 'backends/tmail_extensions'
         require 'backends/tmail_backend'
         include Backends::TmailBackend
@@ -19,7 +21,7 @@ module MailHandler
     def tnef_attachments(content)
         attachments = []
         Dir.mktmpdir do |dir|
-            IO.popen("#{`which tnef`.chomp} -K -C #{dir}", "w") do |f|
+            IO.popen("#{`which tnef`.chomp} -K -C #{dir}", "wb") do |f|
                 f.write(content)
                 f.close
                 if $?.signaled?
@@ -32,7 +34,7 @@ module MailHandler
             found = 0
             Dir.new(dir).sort.each do |file| # sort for deterministic behaviour
                 if file != "." && file != ".."
-                    file_content = File.open("#{dir}/#{file}", "r").read
+                    file_content = File.open("#{dir}/#{file}", "rb").read
                     attachments << { :content => file_content,
                                      :filename => file }
                     found += 1
@@ -45,6 +47,131 @@ module MailHandler
         attachments
     end
 
+    def normalise_content_type(content_type)
+        # Collapse alternative spellings of a MIME type into one canonical name; other types pass through unchanged. E.g. http://www.whatdotheyknow.com/request/93/response/250
+        if content_type == 'application/excel' or content_type == 'application/msexcel' or content_type == 'application/x-ms-excel'
+            content_type = 'application/vnd.ms-excel'
+        end
+        if content_type == 'application/mspowerpoint' or content_type == 'application/x-ms-powerpoint'
+            content_type = 'application/vnd.ms-powerpoint'
+        end
+        if content_type == 'application/msword' or content_type == 'application/x-ms-word'
+            content_type = 'application/vnd.ms-word'
+        end
+        if content_type == 'application/x-zip-compressed'
+            content_type = 'application/zip'
+        end
+
+        # 'application/acrobat' is treated as PDF, e.g. http://www.whatdotheyknow.com/request/copy_of_current_swessex_scr_opt#incoming-9928
+        if content_type == 'application/acrobat'
+            content_type = 'application/pdf'
+        end
+
+        return content_type
+    end
+
+    def get_attachment_text_one_file(content_type, body, charset = 'utf-8')
+        # Returns the plain text extracted from body according to content_type ('' when no branch below handles the type).
+        # charset is only handed to elinks for text/html; TMail tries to convert email bodies to UTF-8 by default, so normally it should already be that.
+        text = ''
+        # XXX - tell all these command line tools to return utf-8 (their output is appended to text via :append_to)
+        if content_type == 'text/plain'
+            text += body + "\n\n"
+        else
+            tempfile = Tempfile.new('foiextract')
+            tempfile.binmode
+            tempfile.print body
+            tempfile.flush
+            default_params = { :append_to => text, :binary_output => false }
+            if content_type == 'application/vnd.ms-word'
+                AlaveteliExternalCommand.run("wvText", tempfile.path, tempfile.path + ".txt")
+                # If wvText left no .txt output file, fall back to catdoc (e.g. for InfoRequestEvent 2701)
+                if not File.exists?(tempfile.path + ".txt")
+                    AlaveteliExternalCommand.run("catdoc", tempfile.path, default_params)
+                else
+                    text += File.read(tempfile.path + ".txt") + "\n\n"
+                    File.unlink(tempfile.path + ".txt")
+                end
+            elsif content_type == 'application/rtf'
+                # catdoc on RTF produces fewer comments and less extra bumf than the --text option to unrtf
+                AlaveteliExternalCommand.run("catdoc", tempfile.path, default_params)
+            elsif content_type == 'text/html'
+                # lynx wordwraps links in its output, which then don't
+                # get formatted properly by Alaveteli. We use elinks
+                # instead, which doesn't do that.
+                AlaveteliExternalCommand.run("elinks", "-eval", "set document.codepage.assume = \"#{charset}\"",
+                                                       "-eval", "set document.codepage.force_assumed = 1",
+                                                       "-dump-charset", "utf-8",
+                                                       "-force-html", "-dump",
+                                                       tempfile.path,
+                                                       default_params.merge(:env => {"LANG" => "C"}))
+            elsif content_type == 'application/vnd.ms-excel'
+                # Bit crazy using /usr/bin/strings - but xls2csv, xlhtml and
+                # py_xls2txt only extract text from cells, not from floating
+                # notes. catdoc may be fooled by weird character sets, but will
+                # probably do for UK FOI requests.
+                AlaveteliExternalCommand.run("/usr/bin/strings", tempfile.path, default_params)
+            elsif content_type == 'application/vnd.ms-powerpoint'
+                # ppthtml seems to catch more text, but only outputs HTML when
+                # we want text, so just use catppt for now
+                AlaveteliExternalCommand.run("catppt", tempfile.path, default_params)
+            elsif content_type == 'application/pdf'
+                AlaveteliExternalCommand.run("pdftotext", tempfile.path, "-", default_params)
+            elsif content_type == 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
+                # This is Microsoft's XML office document format.
+                # Just pull out the main XML file (word/document.xml) and extract its text nodes.
+                xml = AlaveteliExternalCommand.run("/usr/bin/unzip", "-qq",
+                                                                     "-c",
+                                                                     tempfile.path,
+                                                                     "word/document.xml",
+                                                                     {:binary_output => false})
+                if !xml.nil?
+                    doc = REXML::Document.new(xml)
+                    text += doc.each_element( './/text()' ){}.join(" ")
+                end
+            elsif content_type == 'application/zip'
+                # recurse into zip files
+                begin
+                    zip_file = Zip::ZipFile.open(tempfile.path)
+                    text += get_attachment_text_from_zip_file(zip_file)
+                    zip_file.close()
+                rescue
+                    $stderr.puts("Error processing zip file: #{$!.inspect}")
+                end
+            end
+            tempfile.close
+        end
+
+        return text
+    end
+    def get_attachment_text_from_zip_file(zip_file)
+        # Returns the concatenated text extracted from every file entry in zip_file, choosing each entry's content type from its filename.
+        text = ""
+        for entry in zip_file
+            if entry.file?
+                filename = entry.to_s
+                begin
+                    body = entry.get_input_stream.read
+                rescue
+                    # move to next entry silently if there were problems reading this one
+                    # XXX really should reduce this to specific exceptions?
+                    # e.g. password protected
+                    next
+                end
+                calc_mime = AlaveteliFileTypes.filename_to_mimetype(filename)
+                if calc_mime
+                    content_type = calc_mime
+                else
+                    content_type = 'application/octet-stream'
+                end
+
+                text += get_attachment_text_one_file(content_type, body)
+
+            end
+        end
+        return text
+    end
+
     # Turn instance methods into class methods
     extend self
 
diff --git a/lib/old_rubygems_patch.rb b/lib/old_rubygems_patch.rb
deleted file mode 100644
index 3001a7381..000000000
--- a/lib/old_rubygems_patch.rb
+++ /dev/null
@@ -1,46 +0,0 @@
-if File.exist? File.join(File.dirname(__FILE__),'..','vendor','rails','railties','lib','rails','gem_dependency.rb')
-  require File.join(File.dirname(__FILE__),'..','vendor','rails','railties','lib','rails','gem_dependency.rb')
-else
-  require 'rails/gem_dependency'
-end
-
-module Rails
-  class GemDependency < Gem::Dependency
-  
-    # This definition of the requirement method is a patch
-    if !method_defined?(:requirement)
-      def requirement
-        req = version_requirements
-      end
-    end
-  
-    def add_load_paths
-      self.class.add_frozen_gem_path
-      return if @loaded || @load_paths_added
-      if framework_gem?
-        @load_paths_added = @loaded = @frozen = true
-        return
-      end
-
-      begin
-        dep = Gem::Dependency.new(name, requirement)
-        spec = Gem.source_index.find { |_,s| s.satisfies_requirement?(dep) }.last
-        spec.activate           # a way that exists
-      rescue
-        begin 
-          gem self.name, self.requirement # <  1.8 unhappy way
-        # This second rescue is a patch - fall back to passing Rails::GemDependency to gem
-        # for older rubygems
-        rescue ArgumentError
-          gem self
-        end
-      end
-
-      @spec = Gem.loaded_specs[name]
-      @frozen = @spec.loaded_from.include?(self.class.unpacked_path) if @spec
-      @load_paths_added = true
-    rescue Gem::LoadError
-    end
-  end
-  
-end
diff --git a/lib/tasks/.gitkeep b/lib/tasks/.gitkeep
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/lib/tasks/.gitkeep
diff --git a/lib/tasks/rspec.rake b/lib/tasks/rspec.rake
deleted file mode 100644
index 1eee74aee..000000000
--- a/lib/tasks/rspec.rake
+++ /dev/null
@@ -1,147 +0,0 @@
-rspec_gem_dir = nil
-Dir["#{Rails.root}/vendor/gems/*"].each do |subdir|
-  rspec_gem_dir = subdir if subdir.gsub("#{Rails.root}/vendor/gems/","") =~ /^(\w+-)?rspec-(\d+)/ && File.exist?("#{subdir}/lib/spec/rake/spectask.rb")
-end
-rspec_plugin_dir = File.expand_path(File.dirname(__FILE__) + '/../../vendor/plugins/rspec')
-
-if rspec_gem_dir && (test ?d, rspec_plugin_dir)
-  raise "\n#{'*'*50}\nYou have rspec installed in both vendor/gems and vendor/plugins\nPlease pick one and dispose of the other.\n#{'*'*50}\n\n"
-end
-
-if rspec_gem_dir
-  $LOAD_PATH.unshift("#{rspec_gem_dir}/lib")
-elsif File.exist?(rspec_plugin_dir)
-  $LOAD_PATH.unshift("#{rspec_plugin_dir}/lib")
-end
-
-# Don't load rspec if running "rake gems:*"
-unless ARGV.any? {|a| a =~ /^gems/}
-
-begin
-  require 'spec/rake/spectask'
-rescue MissingSourceFile
-  module Spec
-    module Rake
-      class SpecTask
-        def initialize(name)
-          task name do
-            # if rspec-rails is a configured gem, this will output helpful material and exit ...
-            require File.expand_path(File.join(File.dirname(__FILE__),"..","..","config","environment"))
-
-            # ... otherwise, do this:
-            raise <<-MSG
-
-#{"*" * 80}
-*  You are trying to run an rspec rake task defined in
-*  #{__FILE__},
-*  but rspec can not be found in vendor/gems, vendor/plugins or system gems.
-#{"*" * 80}
-MSG
-          end
-        end
-      end
-    end
-  end
-end
-
-Rake.application.instance_variable_get('@tasks').delete('default')
-
-spec_prereq = File.exist?(File.join(Rails.root, 'config', 'database.yml')) ? "db:test:prepare" : :noop
-task :noop do
-end
-
-task :default => :spec
-task :stats => "spec:statsetup"
-# XXX commonlib tests are not Ruby 1.9 compatible
-#task :spec => ['spec:commonlib']
-task :test => ['spec']
-task :cruise => ['spec']
-
-desc "Run all specs in spec directory (excluding plugin specs)"
-Spec::Rake::SpecTask.new(:spec => spec_prereq) do |t|
-  t.spec_opts = ['--options', "\"#{Rails.root}/spec/spec.opts\""]
-  t.spec_files = FileList['spec/**/*_spec.rb']
-end
-
-namespace :spec do
-  desc "Run all specs in spec directory with RCov (excluding plugin specs)"
-  Spec::Rake::SpecTask.new(:rcov) do |t|
-    t.spec_opts = ['--options', "\"#{Rails.root}/spec/spec.opts\""]
-    t.spec_files = FileList['spec/**/*_spec.rb']
-    t.rcov = true
-    t.rcov_opts = lambda do
-      IO.readlines("#{Rails.root}/spec/rcov.opts").map {|l| l.chomp.split " "}.flatten
-    end
-  end
-
-  desc "Print Specdoc for all specs (excluding plugin specs)"
-  Spec::Rake::SpecTask.new(:doc) do |t|
-    t.spec_opts = ["--format", "specdoc", "--dry-run"]
-    t.spec_files = FileList['spec/**/*_spec.rb']
-  end
-
-  desc "Print Specdoc for all plugin examples"
-  Spec::Rake::SpecTask.new(:plugin_doc) do |t|
-    t.spec_opts = ["--format", "specdoc", "--dry-run"]
-    t.spec_files = FileList['vendor/plugins/**/spec/**/*_spec.rb'].exclude('vendor/plugins/rspec/*')
-  end
-
-  [:models, :controllers, :views, :helpers, :lib, :integration].each do |sub|
-    desc "Run the code examples in spec/#{sub}"
-    Spec::Rake::SpecTask.new(sub => spec_prereq) do |t|
-      t.spec_opts = ['--options', "\"#{Rails.root}/spec/spec.opts\""]
-      t.spec_files = FileList["spec/#{sub}/**/*_spec.rb"]
-    end
-  end
-
-  desc "Run the code examples in vendor/plugins (except RSpec's own)"
-  Spec::Rake::SpecTask.new(:plugins => spec_prereq) do |t|
-    t.spec_opts = ['--options', "\"#{Rails.root}/spec/spec.opts\""]
-    t.spec_files = FileList['vendor/plugins/**/spec/**/*_spec.rb'].exclude('vendor/plugins/rspec/*').exclude("vendor/plugins/rspec-rails/*")
-  end
-
-  namespace :plugins do
-    desc "Runs the examples for rspec_on_rails"
-    Spec::Rake::SpecTask.new(:rspec_on_rails) do |t|
-      t.spec_opts = ['--options', "\"#{Rails.root}/spec/spec.opts\""]
-      t.spec_files = FileList['vendor/plugins/rspec-rails/spec/**/*_spec.rb']
-    end
-  end
-
-  # Setup specs for stats
-  task :statsetup do
-    require 'code_statistics'
-    ::STATS_DIRECTORIES << %w(Model\ specs spec/models) if File.exist?('spec/models')
-    ::STATS_DIRECTORIES << %w(View\ specs spec/views) if File.exist?('spec/views')
-    ::STATS_DIRECTORIES << %w(Controller\ specs spec/controllers) if File.exist?('spec/controllers')
-    ::STATS_DIRECTORIES << %w(Helper\ specs spec/helpers) if File.exist?('spec/helpers')
-    ::STATS_DIRECTORIES << %w(Library\ specs spec/lib) if File.exist?('spec/lib')
-    ::STATS_DIRECTORIES << %w(Routing\ specs spec/routing) if File.exist?('spec/routing')
-    ::STATS_DIRECTORIES << %w(Integration\ specs spec/integration) if File.exist?('spec/integration')
-    ::CodeStatistics::TEST_TYPES << "Model specs" if File.exist?('spec/models')
-    ::CodeStatistics::TEST_TYPES << "View specs" if File.exist?('spec/views')
-    ::CodeStatistics::TEST_TYPES << "Controller specs" if File.exist?('spec/controllers')
-    ::CodeStatistics::TEST_TYPES << "Helper specs" if File.exist?('spec/helpers')
-    ::CodeStatistics::TEST_TYPES << "Library specs" if File.exist?('spec/lib')
-    ::CodeStatistics::TEST_TYPES << "Routing specs" if File.exist?('spec/routing')
-    ::CodeStatistics::TEST_TYPES << "Integration specs" if File.exist?('spec/integration')
-  end
-
-  namespace :db do
-    namespace :fixtures do
-      desc "Load fixtures (from spec/fixtures) into the current environment's database.  Load specific fixtures using FIXTURES=x,y. Load from subdirectory in test/fixtures using FIXTURES_DIR=z."
-      task :load => :environment do
-        ActiveRecord::Base.establish_connection(Rails.env)
-        base_dir = File.join(Rails.root, 'spec', 'fixtures')
-        fixtures_dir = ENV['FIXTURES_DIR'] ? File.join(base_dir, ENV['FIXTURES_DIR']) : base_dir
-
-        require 'active_record/fixtures'
-        (ENV['FIXTURES'] ? ENV['FIXTURES'].split(/,/).map {|f| File.join(fixtures_dir, f) } : Dir.glob(File.join(fixtures_dir, '*.{yml,csv}'))).each do |fixture_file|
-          Fixtures.create_fixtures(File.dirname(fixture_file), File.basename(fixture_file, '.*'))
-        end
-      end
-    end
-  end
-end
-
-end