diff options
Diffstat (limited to 'lib')
| -rw-r--r-- | lib/alaveteli_external_command.rb | 14 | ||||
| -rw-r--r-- | lib/configuration.rb | 1 | ||||
| -rw-r--r-- | lib/mail_handler/backends/mail_backend.rb | 278 | ||||
| -rw-r--r-- | lib/mail_handler/backends/mail_extensions.rb | 60 | ||||
| -rw-r--r-- | lib/mail_handler/backends/tmail_backend.rb | 228 | ||||
| -rw-r--r-- | lib/mail_handler/backends/tmail_extensions.rb | 2 | ||||
| -rw-r--r-- | lib/mail_handler/mail_handler.rb | 131 | 
7 files changed, 707 insertions, 7 deletions
| diff --git a/lib/alaveteli_external_command.rb b/lib/alaveteli_external_command.rb index 3bfc34e3a..24b4b1aa8 100644 --- a/lib/alaveteli_external_command.rb +++ b/lib/alaveteli_external_command.rb @@ -2,6 +2,12 @@ require 'external_command'  module AlaveteliExternalCommand      class << self +        # Final argument can be a hash of options. +        # Valid options are: +        # :append_to - string to append the output of the process to +        # :stdin_string - stdin string to pass to the process +        # :binary_output - boolean flag for treating the output as binary or text (only significant +        #                  ruby 1.9 and above)          def run(program_name, *args)              # Run an external program, and return its output.              # Standard error is suppressed unless the program @@ -10,7 +16,7 @@ module AlaveteliExternalCommand              if !args.empty? && args[-1].is_a?(Hash)                  opts = args.pop              end -             +              if program_name =~ %r(^/)                  program_path = program_name              else @@ -24,12 +30,16 @@ module AlaveteliExternalCommand                  end                   raise "Could not find #{program_name} in any of #{Configuration::utility_search_path.join(', ')}" if !found              end -             +              xc = ExternalCommand.new(program_path, *args)              if opts.has_key? :append_to                  xc.out = opts[:append_to]              end +            if opts.has_key? 
:binary_output +                xc.binary_mode = opts[:binary_output] +            end              xc.run(opts[:stdin_string] || "", opts[:env] || {}) +              if xc.status != 0                  # Error                  $stderr.puts("Error from #{program_name} #{args.join(' ')}:") diff --git a/lib/configuration.rb b/lib/configuration.rb index abd0f5cdc..11fe1c56e 100644 --- a/lib/configuration.rb +++ b/lib/configuration.rb @@ -25,6 +25,7 @@ module Configuration      :GA_CODE => '',      :GAZE_URL => '',      :HTML_TO_PDF_COMMAND => '', +    :INCLUDE_DEFAULT_LOCALE_IN_URLS => true,      :INCOMING_EMAIL_DOMAIN => 'localhost',      :INCOMING_EMAIL_PREFIX => '',      :INCOMING_EMAIL_SECRET => 'dummysecret', diff --git a/lib/mail_handler/backends/mail_backend.rb b/lib/mail_handler/backends/mail_backend.rb index 0e198adf0..b75e6ed63 100644 --- a/lib/mail_handler/backends/mail_backend.rb +++ b/lib/mail_handler/backends/mail_backend.rb @@ -23,19 +23,291 @@ module MailHandler                  main              end +            # Returns an outlook message as a Mail object +            def mail_from_outlook(content) +                msg = Mapi::Msg.open(StringIO.new(content)) +                mail = mail_from_raw_email(msg.to_mime.to_s) +                mail.ready_to_send! +                mail +            end +              # Return a copy of the file name for the mail part -            def get_part_file_name(mail_part) -                part_file_name = mail_part.filename +            def get_part_file_name(part) +                part_file_name = part.filename                  part_file_name.nil? ? 
nil : part_file_name.dup              end +            # Get the body of a mail part +            def get_part_body(part) +                part.body.decoded +            end + +            # Return the first from field if any +            def first_from(mail) +                if mail[:from] +                    begin +                        mail[:from].addrs[0] +                        mail[:from].decoded +                        return mail[:from].addrs[0] +                    rescue +                        return mail[:from].value +                    end +                else +                    nil +                end +            end + +            # Return the first from address if any +            def get_from_address(mail) +                first_from = first_from(mail) +                if first_from +                    if first_from.is_a?(String) +                        return nil +                    else +                        return first_from.address +                    end +                else +                    return nil +                end +            end + +            # Return the first from name if any +            def get_from_name(mail) +                first_from = first_from(mail) +                if first_from +                    if first_from.is_a?(String) +                        return nil +                    else +                        return first_from.display_name ? eval(%Q{"#{first_from.display_name}"}) : nil +                    end +                else +                    return nil +                end +            end + +            def get_all_addresses(mail) +                envelope_to = mail['envelope-to'] ? [mail['envelope-to'].value] : [] +                ((mail.to || []) + +                (mail.cc || []) + +                (envelope_to || [])).uniq +            end + +            def empty_return_path?(mail) +                return false if mail['return-path'].nil? 
+                return true if mail['return-path'].value.blank? +                return false +            end + +            def get_auto_submitted(mail) +                mail['auto-submitted'] ? mail['auto-submitted'].value : nil +            end + +            def get_content_type(part) +                part.content_type ? part.content_type.split(';')[0] : nil +            end + +            def get_header_string(header, mail) +                mail.header[header] ? mail.header[header].to_s : nil +            end + +            # Detects whether a mail part is an Outlook email +            def is_outlook?(part) +                filename = get_part_file_name(part) +                return true if get_content_type(part) == 'application/vnd.ms-outlook' +                if filename && AlaveteliFileTypes.filename_to_mimetype(filename) == 'application/vnd.ms-outlook' +                    return true +                end +                return false +            end + +            # Convert a mail part which is an attached mail in one of +            # several formats into a mail object and set it as the +            # rfc822_attachment on the part. If the mail part can't be +            # converted, the content type on the part is updated to +            # 'text/plain' for an RFC822 attachment, and 'application/octet-stream' +            # for other types +            def decode_attached_part(part, parent_mail) +                if get_content_type(part) == 'message/rfc822' +                    # An email attached as text +                    part.rfc822_attachment = mail_from_raw_email(part.body) +                    if part.rfc822_attachment.nil? 
+                        # Attached mail didn't parse, so treat as text +                        part.content_type = 'text/plain' +                    end +                elsif is_outlook?(part) +                    part.rfc822_attachment = mail_from_outlook(part.body.decoded) +                    if part.rfc822_attachment.nil? +                         # Attached mail didn't parse, so treat as binary +                         part.content_type = 'application/octet-stream' +                    end +                elsif get_content_type(part) == 'application/ms-tnef' +                    # A set of attachments in a TNEF file +                    part.rfc822_attachment = mail_from_tnef(part.body.decoded) +                    if part.rfc822_attachment.nil? +                        # Attached mail didn't parse, so treat as binary +                        part.content_type = 'application/octet-stream' +                    end +                end +                if part.rfc822_attachment +                    expand_and_normalize_parts(part.rfc822_attachment, parent_mail) +                end +            end + +            # Expand and normalize a mail part recursively. Decodes attached messages into +            # Mail objects wherever possible. Sets a default content type if none is +            # set. Tries to set a more specific content type for binary content types. +            def expand_and_normalize_parts(part, parent_mail) +                if part.multipart? +                  part.parts.each{ |sub_part| expand_and_normalize_parts(sub_part, parent_mail) } +                else +                  part_filename = get_part_file_name(part) +                  charset = part.charset # save this, because overwriting content_type also resets charset + +                  # Don't allow nil content_types +                  if get_content_type(part).nil? 
+                      part.content_type = 'application/octet-stream' +                  end + +                  # PDFs often come with this mime type, fix it up for view code +                  if get_content_type(part) == 'application/octet-stream' +                      part_body = get_part_body(part) +                      calc_mime = AlaveteliFileTypes.filename_and_content_to_mimetype(part_filename, +                                                                                      part_body) +                      if calc_mime +                          part.content_type = calc_mime +                      end +                  end + +                  # Use standard content types for Word documents etc. +                  part.content_type = normalise_content_type(get_content_type(part)) +                  decode_attached_part(part, parent_mail) +                  part.charset = charset +                end +            end + +            # Count the parts in a mail part recursively, including any attached messages. +            # Set the count on the parent mail, and set a url_part_number on the part itself. +            # Set the count for the first uudecoded part on the parent mail also. +            def count_parts(part, parent_mail) +                if part.multipart? 
+                    part.parts.each { |p| count_parts(p, parent_mail) } +                else +                    if part.rfc822_attachment +                        count_parts(part.rfc822_attachment, parent_mail) +                    else +                        parent_mail.count_parts_count += 1 +                        part.url_part_number = parent_mail.count_parts_count +                    end +                end +                parent_mail.count_first_uudecode_count = parent_mail.count_parts_count +            end + +            # Choose the best part from alternatives +            def choose_best_alternative(mail) +                if mail.html_part +                    return mail.html_part +                elsif mail.text_part +                    return mail.text_part +                else +                    return mail.parts.first +                end +            end + +            # Expand and normalize the parts of a mail, select the best part +            # wherever there is an alternative, and then count the returned +            # leaves and assign url_part values to them +            def get_attachment_leaves(mail) +                expand_and_normalize_parts(mail, mail) +                leaves = _get_attachment_leaves_recursive(mail, nil, mail) +                mail.count_parts_count = 0 +                count_parts(mail, mail) +                return leaves +            end + +            # Recurse through a mail part, selecting the best part wherever there is +            # an alternative +            def _get_attachment_leaves_recursive(part, within_rfc822_attachment, parent_mail) +                leaves_found = [] +                if part.multipart? 
+                    raise "no parts on multipart mail" if part.parts.size == 0 +                    if part.sub_type == 'alternative' +                        best_part = choose_best_alternative(part) +                        leaves_found += _get_attachment_leaves_recursive(best_part, +                                                                         within_rfc822_attachment, +                                                                         parent_mail) +                    else +                        # Add all parts +                        part.parts.each do |sub_part| +                            leaves_found += _get_attachment_leaves_recursive(sub_part, +                                                                             within_rfc822_attachment, +                                                                             parent_mail) +                        end +                    end +                else +                    # Add all the parts of a decoded attached message +                    if part.rfc822_attachment +                        leaves_found += _get_attachment_leaves_recursive(part.rfc822_attachment, +                                                                         part.rfc822_attachment, +                                                                         parent_mail) +                    else +                        # Store leaf +                        part.within_rfc822_attachment = within_rfc822_attachment +                        leaves_found += [part] +                    end +                end +                return leaves_found +            end + +            # Add selected useful headers from an attached message to its body +            def extract_attached_message_headers(leaf) +                body = get_part_body(leaf) +                # Test to see if we are in the first part of the attached +                # RFC822 message and it is text, if so add headers. 
+                if leaf.within_rfc822_attachment == leaf && get_content_type(leaf) == 'text/plain' +                    headers = "" +                    [ 'Date', 'Subject', 'From', 'To', 'Cc' ].each do |header| +                        if header_value = get_header_string(header, leaf.within_rfc822_attachment) +                            if !header_value.blank? +                                headers = headers + header + ": " + header_value.to_s + "\n" +                            end +                        end +                    end +                    # XXX call _convert_part_body_to_text here, but need to get charset somehow +                    # e.g. http://www.whatdotheyknow.com/request/1593/response/3088/attach/4/Freedom%20of%20Information%20request%20-%20car%20oval%20sticker:%20Article%2020,%20Convention%20on%20Road%20Traffic%201949.txt +                    body = headers + "\n" + body +                end +                body +            end + +            # Generate a hash of the attributes associated with each significant part of a Mail object +            def get_attachment_attributes(mail) +                leaves = get_attachment_leaves(mail) +                attachments = [] +                for leaf in leaves +                    body = get_part_body(leaf) +                    if leaf.within_rfc822_attachment +                        within_rfc822_subject = leaf.within_rfc822_attachment.subject +                        body = extract_attached_message_headers(leaf) +                    end +                    leaf_attributes = { :url_part_number => leaf.url_part_number, +                                        :content_type => get_content_type(leaf), +                                        :filename => get_part_file_name(leaf), +                                        :charset => leaf.charset, +                                        :within_rfc822_subject => within_rfc822_subject, +                                        :body => body, + 
                                       :hexdigest => Digest::MD5.hexdigest(body) } +                    attachments << leaf_attributes +                end +                return attachments +            end +              # Format              def address_from_name_and_email(name, email)                  if !MySociety::Validate.is_valid_email(email)                      raise "invalid email " + email + " passed to address_from_name_and_email"                  end                  if name.nil? -                    return Mail::Address.new(email) +                    return Mail::Address.new(email).to_s                  end                  address = Mail::Address.new                  address.display_name = name diff --git a/lib/mail_handler/backends/mail_extensions.rb b/lib/mail_handler/backends/mail_extensions.rb index cbe0491ed..f756abd1a 100644 --- a/lib/mail_handler/backends/mail_extensions.rb +++ b/lib/mail_handler/backends/mail_extensions.rb @@ -1,7 +1,67 @@ +require 'mail/message' +require 'mail/fields/common/parameter_hash'  module Mail      class Message          attr_accessor :url_part_number          attr_accessor :rfc822_attachment # when a whole email message is attached as text          attr_accessor :within_rfc822_attachment # for parts within a message attached as text (for getting subject mainly) +        attr_accessor :count_parts_count +        attr_accessor :count_first_uudecode_count + +        # A patched version of the message initializer to work around a bug where stripping the original +        # input removes meaningful spaces - e.g. in the case of uuencoded bodies. 
+        def initialize(*args, &block) +            @body = nil +            @body_raw = nil +            @separate_parts = false +            @text_part = nil +            @html_part = nil +            @errors = nil +            @header = nil +            @charset = 'UTF-8' +            @defaulted_charset = true + +            @perform_deliveries = true +            @raise_delivery_errors = true + +            @delivery_handler = nil + +            @delivery_method = Mail.delivery_method.dup + +            @transport_encoding = Mail::Encodings.get_encoding('7bit') + +            @mark_for_delete = false + +            if args.flatten.first.respond_to?(:each_pair) +                init_with_hash(args.flatten.first) +            else +                # The replacement of this commented out line is the change. +                # init_with_string(args.flatten[0].to_s.strip) +                init_with_string(args.flatten[0].to_s) +            end + +            if block_given? +                instance_eval(&block) +            end + +            self +        end +    end + +    # A patched version of the parameter hash that handles nil values without throwing +    # an error. +    class ParameterHash < IndifferentHash + +        def encoded +          map.sort { |a,b| a.first.to_s <=> b.first.to_s }.map do |key_name, value| +            # The replacement of this commented out line is the change +            # unless value.ascii_only? +            unless value.nil? || value.ascii_only? +              value = Mail::Encodings.param_encode(value) +              key_name = "#{key_name}*" +            end +            %Q{#{key_name}=#{quote_token(value)}} +          end.join(";\r\n\s") +        end      end  end
\ No newline at end of file diff --git a/lib/mail_handler/backends/tmail_backend.rb b/lib/mail_handler/backends/tmail_backend.rb index 87aba73d7..02124cdb1 100644 --- a/lib/mail_handler/backends/tmail_backend.rb +++ b/lib/mail_handler/backends/tmail_backend.rb @@ -41,6 +41,234 @@ module MailHandler                  return part_file_name              end +            # Get the body of a mail part +            def get_part_body(mail_part) +                mail_part.body +            end + +            # Return the first from address if any +            def get_from_address(mail) +                if mail.from_addrs.nil? || mail.from_addrs.size == 0 +                    return nil +                end +                mail.from_addrs[0].spec +            end + +            # Return the first from name if any +            def get_from_name(mail) +                mail.from_name_if_present +            end + +            def get_all_addresses(mail) +                ((mail.to || []) + +                (mail.cc || []) + +                (mail.envelope_to || [])).uniq +            end + +            def empty_return_path?(mail) +                return false if mail['return-path'].nil? +                return true if mail['return-path'].addr.to_s == '<>' +                return false +            end + +            def get_auto_submitted(mail) +                mail['auto-submitted'] ? mail['auto-submitted'].body : nil +            end + +            def get_content_type(part) +                part.content_type +            end + +            def get_header_string(header, mail) +                mail.header_string(header) +            end + +            # Number the attachments in depth first tree order, for use in URLs. 
+            # XXX This fills in part.rfc822_attachment and part.url_part_number within +            # all the parts of the email (see monkeypatches in lib/mail_handler/tmail_extensions and +            # lib/mail_handler/mail_extensions for how these attributes are added). ensure_parts_counted +            # must be called before using the attributes. +            def ensure_parts_counted(mail) +                mail.count_parts_count = 0 +                _count_parts_recursive(mail, mail) +                # we carry on using these numeric ids for attachments uudecoded from within text parts +                mail.count_first_uudecode_count = mail.count_parts_count +            end +            def _count_parts_recursive(part, mail) +                if part.multipart? +                    part.parts.each do |p| +                        _count_parts_recursive(p, mail) +                    end +                else +                    part_filename = get_part_file_name(part) +                    begin +                        if part.content_type == 'message/rfc822' +                            # An email attached as text +                            # e.g. http://www.whatdotheyknow.com/request/64/response/102 +                            part.rfc822_attachment = mail_from_raw_email(part.body, decode=false) +                        elsif part.content_type == 'application/vnd.ms-outlook' || part_filename && AlaveteliFileTypes.filename_to_mimetype(part_filename) == 'application/vnd.ms-outlook' +                            # An email attached as an Outlook file +                            # e.g. 
http://www.whatdotheyknow.com/request/chinese_names_for_british_politi +                            msg = Mapi::Msg.open(StringIO.new(part.body)) +                            part.rfc822_attachment = mail_from_raw_email(msg.to_mime.to_s, decode=false) +                        elsif part.content_type == 'application/ms-tnef' +                            # A set of attachments in a TNEF file +                            part.rfc822_attachment = mail_from_tnef(part.body) +                        end +                    rescue +                        # If attached mail doesn't parse, treat it as text part +                        part.rfc822_attachment = nil +                    else +                        unless part.rfc822_attachment.nil? +                            _count_parts_recursive(part.rfc822_attachment, mail) +                        end +                    end +                    if part.rfc822_attachment.nil? +                        mail.count_parts_count += 1 +                        part.url_part_number = mail.count_parts_count +                    end +                end +            end + +            def get_attachment_attributes(mail) +                leaves = get_attachment_leaves(mail) +                # XXX we have to call ensure_parts_counted after get_attachment_leaves +                # which is really messy. +                ensure_parts_counted(mail) +                attachment_attributes = [] +                for leaf in leaves +                    body = get_part_body(leaf) +                    # As leaf.body causes MIME decoding which uses lots of RAM, do garbage collection here +                    # to prevent excess memory use. XXX not really sure if this helps reduce +                    # peak RAM use overall. Anyway, maybe there is something better to do than this. 
+                    GC.start +                    if leaf.within_rfc822_attachment +                        within_rfc822_subject = leaf.within_rfc822_attachment.subject +                        # Test to see if we are in the first part of the attached +                        # RFC822 message and it is text, if so add headers. +                        # XXX should probably use hunting algorithm to find main text part, rather than +                        # just expect it to be first. This will do for now though. +                        if leaf.within_rfc822_attachment == leaf && leaf.content_type == 'text/plain' +                            headers = "" +                            for header in [ 'Date', 'Subject', 'From', 'To', 'Cc' ] +                                if leaf.within_rfc822_attachment.header.include?(header.downcase) +                                    header_value = leaf.within_rfc822_attachment.header[header.downcase] +                                     if !header_value.blank? +                                        headers = headers + header + ": " + header_value.to_s + "\n" +                                    end +                                end +                            end +                            # XXX call _convert_part_body_to_text here, but need to get charset somehow +                            # e.g. http://www.whatdotheyknow.com/request/1593/response/3088/attach/4/Freedom%20of%20Information%20request%20-%20car%20oval%20sticker:%20Article%2020,%20Convention%20on%20Road%20Traffic%201949.txt +                            body = headers + "\n" + body + +                            # This is quick way of getting all headers, but instead we only add some a) to +                            # make it more usable, b) as at least one authority accidentally leaked security +                            # information into a header. 
+                            #attachment.body = leaf.within_rfc822_attachment.port.to_s +                        end +                    end +                    attachment_attributes << {:url_part_number => leaf.url_part_number, +                                              :content_type => get_content_type(leaf), +                                              :filename => get_part_file_name(leaf), +                                              :charset => leaf.charset, +                                              :within_rfc822_subject => within_rfc822_subject, +                                              :body => body, +                                              :hexdigest => Digest::MD5.hexdigest(body) } +                end +                attachment_attributes +            end + +            # (This risks losing info if the unchosen alternative is the only one to contain +            # useful info, but let's worry about that another time) +            def get_attachment_leaves(mail) +                return _get_attachment_leaves_recursive(mail, mail) +            end +            def _get_attachment_leaves_recursive(curr_mail, parent_mail, within_rfc822_attachment = nil) +                leaves_found = [] +                if curr_mail.multipart? 
+                    if curr_mail.parts.size == 0 +                        raise "no parts on multipart mail" +                    end + +                    if curr_mail.sub_type == 'alternative' +                        # Choose best part from alternatives +                        best_part = nil +                        # Take the last text/plain one, or else the first one +                        curr_mail.parts.each do |m| +                            if not best_part +                                best_part = m +                            elsif m.content_type == 'text/plain' +                                best_part = m +                            end +                        end +                        # Take an HTML one as even higher priority. (They tend +                        # to render better than text/plain, e.g. don't wrap links here: +                        # http://www.whatdotheyknow.com/request/amount_and_cost_of_freedom_of_in#incoming-72238 ) +                        curr_mail.parts.each do |m| +                            if m.content_type == 'text/html' +                                best_part = m +                            end +                        end +                        leaves_found += _get_attachment_leaves_recursive(best_part, parent_mail, within_rfc822_attachment) +                    else +                        # Add all parts +                        curr_mail.parts.each do |m| +                            leaves_found += _get_attachment_leaves_recursive(m, parent_mail, within_rfc822_attachment) +                        end +                    end +                else +                    # XXX Yuck. this section alters various content_types. That puts +                    # it into conflict with ensure_parts_counted which it has to be +                    # called both before and after.  It will fail with cases of +                    # attachments of attachments etc. 
+                    charset = curr_mail.charset # save this, because overwriting content_type also resets charset +                    # Don't allow nil content_types +                    if curr_mail.content_type.nil? +                        curr_mail.content_type = 'application/octet-stream' +                    end +                    # PDFs often come with this mime type, fix it up for view code +                    if curr_mail.content_type == 'application/octet-stream' +                        part_file_name = get_part_file_name(curr_mail) +                        part_body = get_part_body(curr_mail) +                        calc_mime = AlaveteliFileTypes.filename_and_content_to_mimetype(part_file_name, part_body) +                        if calc_mime +                            curr_mail.content_type = calc_mime +                        end +                    end + +                    # Use standard content types for Word documents etc. +                    curr_mail.content_type = normalise_content_type(curr_mail.content_type) +                    if curr_mail.content_type == 'message/rfc822' +                        ensure_parts_counted(parent_mail) # fills in rfc822_attachment variable +                        if curr_mail.rfc822_attachment.nil? +                            # Attached mail didn't parse, so treat as text +                            curr_mail.content_type = 'text/plain' +                        end +                    end +                    if curr_mail.content_type == 'application/vnd.ms-outlook' || curr_mail.content_type == 'application/ms-tnef' +                        ensure_parts_counted(parent_mail) # fills in rfc822_attachment variable +                        if curr_mail.rfc822_attachment.nil? 
+                            # Attached mail didn't parse, so treat as binary +                            curr_mail.content_type = 'application/octet-stream' +                        end +                    end +                    # If the part is an attachment of email +                    if curr_mail.content_type == 'message/rfc822' || curr_mail.content_type == 'application/vnd.ms-outlook' || curr_mail.content_type == 'application/ms-tnef' +                        ensure_parts_counted(parent_mail) # fills in rfc822_attachment variable +                        leaves_found += _get_attachment_leaves_recursive(curr_mail.rfc822_attachment, parent_mail, curr_mail.rfc822_attachment) +                    else +                        # Store leaf +                        curr_mail.within_rfc822_attachment = within_rfc822_attachment +                        leaves_found += [curr_mail] +                    end +                    # restore original charset +                    curr_mail.charset = charset +                end +                return leaves_found +            end + +              def address_from_name_and_email(name, email)                  if !MySociety::Validate.is_valid_email(email)                      raise "invalid email " + email + " passed to address_from_name_and_email" diff --git a/lib/mail_handler/backends/tmail_extensions.rb b/lib/mail_handler/backends/tmail_extensions.rb index 9359dfeea..3576a8eca 100644 --- a/lib/mail_handler/backends/tmail_extensions.rb +++ b/lib/mail_handler/backends/tmail_extensions.rb @@ -20,6 +20,8 @@ module TMail          attr_accessor :url_part_number          attr_accessor :rfc822_attachment # when a whole email message is attached as text          attr_accessor :within_rfc822_attachment # for parts within a message attached as text (for getting subject mainly) +        attr_accessor :count_parts_count +        attr_accessor :count_first_uudecode_count          # Monkeypatch! 
module MailHandler
    # Non-standard MIME type aliases seen in real incoming mail, mapped to the
    # single canonical type that the attachment-handling code checks for.
    CONTENT_TYPE_ALIASES = {
        # e.g. http://www.whatdotheyknow.com/request/93/response/250
        'application/excel' => 'application/vnd.ms-excel',
        'application/msexcel' => 'application/vnd.ms-excel',
        'application/x-ms-excel' => 'application/vnd.ms-excel',
        'application/mspowerpoint' => 'application/vnd.ms-powerpoint',
        'application/x-ms-powerpoint' => 'application/vnd.ms-powerpoint',
        'application/msword' => 'application/vnd.ms-word',
        'application/x-ms-word' => 'application/vnd.ms-word',
        'application/x-zip-compressed' => 'application/zip',
        # e.g. http://www.whatdotheyknow.com/request/copy_of_current_swessex_scr_opt#incoming-9928
        'application/acrobat' => 'application/pdf'
    }.freeze

    # Characters permitted in a charset name before it is interpolated into an
    # elinks "-eval" configuration command.
    SAFE_CHARSET_PATTERN = /\A[\w.:+-]+\z/

    # Returns the canonical content type for a known alias (see
    # CONTENT_TYPE_ALIASES). Any other value, including nil, is returned
    # unchanged.
    def normalise_content_type(content_type)
        CONTENT_TYPE_ALIASES.fetch(content_type, content_type)
    end

    # Extracts plain text from a single attachment.
    #
    # content_type - normalised MIME type string selecting the extraction tool
    # body         - raw attachment content
    # charset      - charset elinks is told to assume for 'text/html' bodies.
    #                Values containing anything other than word characters and
    #                . : + - are replaced with 'utf-8', because the value is
    #                embedded in an elinks configuration command.
    #
    # Returns the extracted text; an empty string for content types that have
    # no extractor here. 'text/plain' bodies are returned followed by a blank
    # line. Errors raised by the external commands propagate to the caller;
    # errors while processing a zip file are reported on stderr and swallowed.
    #
    # note re. charset: TMail always tries to convert email bodies
    # to UTF8 by default, so normally it should already be that.
    def get_attachment_text_one_file(content_type, body, charset = 'utf-8')
        text = ''
        # XXX - tell all these command line tools to return utf-8
        if content_type == 'text/plain'
            text << (body + "\n\n")
            return text
        end

        tempfile = Tempfile.new('foiextract')
        begin
            tempfile.binmode
            tempfile.print body
            tempfile.flush
            # :append_to makes the command append its output to +text+ in place,
            # so +text+ must stay the same String object throughout.
            default_params = { :append_to => text, :binary_output => false }

            case content_type
            when 'application/vnd.ms-word'
                converted_path = tempfile.path + ".txt"
                AlaveteliExternalCommand.run("wvText", tempfile.path, converted_path)
                if File.exist?(converted_path)
                    begin
                        text << File.read(converted_path) << "\n\n"
                    ensure
                        # remove wvText's output file even if reading it failed
                        File.unlink(converted_path)
                    end
                else
                    # Try catdoc if we get into trouble (e.g. for InfoRequestEvent 2701)
                    AlaveteliExternalCommand.run("catdoc", tempfile.path, default_params)
                end
            when 'application/rtf'
                # catdoc on RTF produces less comments and extra bumf than --text option to unrtf
                AlaveteliExternalCommand.run("catdoc", tempfile.path, default_params)
            when 'text/html'
                # The charset comes from the email, so never let it break out of
                # the quoted value in the elinks configuration command.
                assumed_charset = charset.to_s
                assumed_charset = 'utf-8' unless assumed_charset =~ SAFE_CHARSET_PATTERN
                # lynx wordwraps links in its output, which then don't
                # get formatted properly by Alaveteli. We use elinks
                # instead, which doesn't do that.
                AlaveteliExternalCommand.run("elinks", "-eval", "set document.codepage.assume = \"#{assumed_charset}\"",
                                                       "-eval", "set document.codepage.force_assumed = 1",
                                                       "-dump-charset", "utf-8",
                                                       "-force-html", "-dump",
                                                       tempfile.path,
                                                       default_params.merge(:env => {"LANG" => "C"}))
            when 'application/vnd.ms-excel'
                # Bit crazy using /usr/bin/strings - but xls2csv, xlhtml and
                # py_xls2txt only extract text from cells, not from floating
                # notes. catdoc may be fooled by weird character sets, but will
                # probably do for UK FOI requests.
                AlaveteliExternalCommand.run("/usr/bin/strings", tempfile.path, default_params)
            when 'application/vnd.ms-powerpoint'
                # ppthtml seems to catch more text, but only outputs HTML when
                # we want text, so just use catppt for now
                AlaveteliExternalCommand.run("catppt", tempfile.path, default_params)
            when 'application/pdf'
                AlaveteliExternalCommand.run("pdftotext", tempfile.path, "-", default_params)
            when 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
                # This is Microsoft's XML office document format.
                # Just pull out the main XML file, and strip it of text.
                xml = AlaveteliExternalCommand.run("/usr/bin/unzip", "-qq",
                                                                     "-c",
                                                                     tempfile.path,
                                                                     "word/document.xml",
                                                                     {:binary_output => false})
                unless xml.nil?
                    doc = REXML::Document.new(xml)
                    text << (doc.each_element( './/text()' ){}.join(" "))
                end
            when 'application/zip'
                # recurse into zip files
                begin
                    zip_file = Zip::ZipFile.open(tempfile.path)
                    begin
                        text << get_attachment_text_from_zip_file(zip_file)
                    ensure
                        # close the archive even if extracting an entry raised
                        zip_file.close
                    end
                rescue => e
                    $stderr.puts("Error processing zip file: #{e.inspect}")
                end
            end
        ensure
            # Close and delete the temporary copy whether or not extraction
            # succeeded, rather than leaving it for the garbage collector.
            tempfile.close!
        end

        text
    end

    # Concatenates the text extracted from every regular file inside an opened
    # zip archive.
    #
    # zip_file - enumerable of zip entries; each entry must respond to file?,
    #            to_s (used as the file name) and get_input_stream (called
    #            with a block).
    #
    # Each entry's content type is looked up from its file name, falling back
    # to 'application/octet-stream', and the content is passed to
    # get_attachment_text_one_file. Entries that cannot be read are skipped.
    def get_attachment_text_from_zip_file(zip_file)
        text = ''
        zip_file.each do |entry|
            next unless entry.file?

            filename = entry.to_s
            begin
                # block form so that the entry's stream is closed after reading
                body = entry.get_input_stream { |stream| stream.read }
            rescue
                # move to next attachment silently if there were problems
                # XXX really should reduce this to specific exceptions?
                # e.g. password protected
                next
            end
            content_type = AlaveteliFileTypes.filename_to_mimetype(filename) || 'application/octet-stream'

            text << get_attachment_text_one_file(content_type, body)
        end
        text
    end

    # Turn instance methods into class methods
    extend self
end
