diff options
Diffstat (limited to 'lib')
| -rw-r--r-- | lib/alaveteli_external_command.rb | 14 | ||||
| -rw-r--r-- | lib/configuration.rb | 1 | ||||
| -rw-r--r-- | lib/mail_handler/backends/mail_backend.rb | 323 | ||||
| -rw-r--r-- | lib/mail_handler/backends/mail_extensions.rb | 67 | ||||
| -rw-r--r-- | lib/mail_handler/backends/tmail_backend.rb | 290 | ||||
| -rw-r--r-- | lib/mail_handler/backends/tmail_extensions.rb (renamed from lib/tmail_extensions.rb) | 28 | ||||
| -rw-r--r-- | lib/mail_handler/mail_handler.rb | 179 | ||||
| -rw-r--r-- | lib/tasks/translation.rake | 4 | ||||
| -rw-r--r-- | lib/tnef.rb | 40 | ||||
| -rw-r--r-- | lib/world_foi_websites.rb | 6 | 
10 files changed, 887 insertions, 65 deletions
| diff --git a/lib/alaveteli_external_command.rb b/lib/alaveteli_external_command.rb index 3bfc34e3a..24b4b1aa8 100644 --- a/lib/alaveteli_external_command.rb +++ b/lib/alaveteli_external_command.rb @@ -2,6 +2,12 @@ require 'external_command'  module AlaveteliExternalCommand      class << self +        # Final argument can be a hash of options. +        # Valid options are: +        # :append_to - string to append the output of the process to +        # :stdin_string - stdin string to pass to the process +        # :binary_output - boolean flag for treating the output as binary or text (only significant +        #                  ruby 1.9 and above)          def run(program_name, *args)              # Run an external program, and return its output.              # Standard error is suppressed unless the program @@ -10,7 +16,7 @@ module AlaveteliExternalCommand              if !args.empty? && args[-1].is_a?(Hash)                  opts = args.pop              end -             +              if program_name =~ %r(^/)                  program_path = program_name              else @@ -24,12 +30,16 @@ module AlaveteliExternalCommand                  end                   raise "Could not find #{program_name} in any of #{Configuration::utility_search_path.join(', ')}" if !found              end -             +              xc = ExternalCommand.new(program_path, *args)              if opts.has_key? :append_to                  xc.out = opts[:append_to]              end +            if opts.has_key? 
:binary_output +                xc.binary_mode = opts[:binary_output] +            end              xc.run(opts[:stdin_string] || "", opts[:env] || {}) +              if xc.status != 0                  # Error                  $stderr.puts("Error from #{program_name} #{args.join(' ')}:") diff --git a/lib/configuration.rb b/lib/configuration.rb index abd0f5cdc..11fe1c56e 100644 --- a/lib/configuration.rb +++ b/lib/configuration.rb @@ -25,6 +25,7 @@ module Configuration      :GA_CODE => '',      :GAZE_URL => '',      :HTML_TO_PDF_COMMAND => '', +    :INCLUDE_DEFAULT_LOCALE_IN_URLS => true,      :INCOMING_EMAIL_DOMAIN => 'localhost',      :INCOMING_EMAIL_PREFIX => '',      :INCOMING_EMAIL_SECRET => 'dummysecret', diff --git a/lib/mail_handler/backends/mail_backend.rb b/lib/mail_handler/backends/mail_backend.rb new file mode 100644 index 000000000..b75e6ed63 --- /dev/null +++ b/lib/mail_handler/backends/mail_backend.rb @@ -0,0 +1,323 @@ +require 'mail' + +module MailHandler +    module Backends +        module MailBackend + +            def backend() +                'Mail' +            end + +            # Note that the decode flag is not yet used +            def mail_from_raw_email(data, decode=true) +                Mail.new(data) +            end + +             # Extracts all attachments from the given TNEF file as a Mail object +            def mail_from_tnef(content) +                main = Mail.new +                tnef_attachments(content).each do |attachment| +                    main.add_file(attachment) +                end +                main.ready_to_send! +                main +            end + +            # Returns an outlook message as a Mail object +            def mail_from_outlook(content) +                msg = Mapi::Msg.open(StringIO.new(content)) +                mail = mail_from_raw_email(msg.to_mime.to_s) +                mail.ready_to_send! 
+                mail +            end + +            # Return a copy of the file name for the mail part +            def get_part_file_name(part) +                part_file_name = part.filename +                part_file_name.nil? ? nil : part_file_name.dup +            end + +            # Get the body of a mail part +            def get_part_body(part) +                part.body.decoded +            end + +            # Return the first from field if any +            def first_from(mail) +                if mail[:from] +                    begin +                        mail[:from].addrs[0] +                        mail[:from].decoded +                        return mail[:from].addrs[0] +                    rescue +                        return mail[:from].value +                    end +                else +                    nil +                end +            end + +            # Return the first from address if any +            def get_from_address(mail) +                first_from = first_from(mail) +                if first_from +                    if first_from.is_a?(String) +                        return nil +                    else +                        return first_from.address +                    end +                else +                    return nil +                end +            end + +            # Return the first from name if any +            def get_from_name(mail) +                first_from = first_from(mail) +                if first_from +                    if first_from.is_a?(String) +                        return nil +                    else +                        return first_from.display_name ? eval(%Q{"#{first_from.display_name}"}) : nil +                    end +                else +                    return nil +                end +            end + +            def get_all_addresses(mail) +                envelope_to = mail['envelope-to'] ? 
[mail['envelope-to'].value] : [] +                ((mail.to || []) + +                (mail.cc || []) + +                (envelope_to || [])).uniq +            end + +            def empty_return_path?(mail) +                return false if mail['return-path'].nil? +                return true if mail['return-path'].value.blank? +                return false +            end + +            def get_auto_submitted(mail) +                mail['auto-submitted'] ? mail['auto-submitted'].value : nil +            end + +            def get_content_type(part) +                part.content_type ? part.content_type.split(';')[0] : nil +            end + +            def get_header_string(header, mail) +                mail.header[header] ? mail.header[header].to_s : nil +            end + +            # Detects whether a mail part is an Outlook email +            def is_outlook?(part) +                filename = get_part_file_name(part) +                return true if get_content_type(part) == 'application/vnd.ms-outlook' +                if filename && AlaveteliFileTypes.filename_to_mimetype(filename) == 'application/vnd.ms-outlook' +                    return true +                end +                return false +            end + +            # Convert a mail part which is an attached mail in one of +            # several formats into a mail object and set it as the +            # rfc822_attachment on the part. If the mail part can't be +            # converted, the content type on the part is updated to +            # 'text/plain' for an RFC822 attachment, and 'application/octet-stream' +            # for other types +            def decode_attached_part(part, parent_mail) +                if get_content_type(part) == 'message/rfc822' +                    # An email attached as text +                    part.rfc822_attachment = mail_from_raw_email(part.body) +                    if part.rfc822_attachment.nil? 
+                        # Attached mail didn't parse, so treat as text +                        part.content_type = 'text/plain' +                    end +                elsif is_outlook?(part) +                    part.rfc822_attachment = mail_from_outlook(part.body.decoded) +                    if part.rfc822_attachment.nil? +                         # Attached mail didn't parse, so treat as binary +                         part.content_type = 'application/octet-stream' +                    end +                elsif get_content_type(part) == 'application/ms-tnef' +                    # A set of attachments in a TNEF file +                    part.rfc822_attachment = mail_from_tnef(part.body.decoded) +                    if part.rfc822_attachment.nil? +                        # Attached mail didn't parse, so treat as binary +                        part.content_type = 'application/octet-stream' +                    end +                end +                if part.rfc822_attachment +                    expand_and_normalize_parts(part.rfc822_attachment, parent_mail) +                end +            end + +            # Expand and normalize a mail part recursively. Decodes attached messages into +            # Mail objects wherever possible. Sets a default content type if none is +            # set. Tries to set a more specific content type for binary content types. +            def expand_and_normalize_parts(part, parent_mail) +                if part.multipart? +                  part.parts.each{ |sub_part| expand_and_normalize_parts(sub_part, parent_mail) } +                else +                  part_filename = get_part_file_name(part) +                  charset = part.charset # save this, because overwriting content_type also resets charset + +                  # Don't allow nil content_types +                  if get_content_type(part).nil? 
+                      part.content_type = 'application/octet-stream' +                  end + +                  # PDFs often come with this mime type, fix it up for view code +                  if get_content_type(part) == 'application/octet-stream' +                      part_body = get_part_body(part) +                      calc_mime = AlaveteliFileTypes.filename_and_content_to_mimetype(part_filename, +                                                                                      part_body) +                      if calc_mime +                          part.content_type = calc_mime +                      end +                  end + +                  # Use standard content types for Word documents etc. +                  part.content_type = normalise_content_type(get_content_type(part)) +                  decode_attached_part(part, parent_mail) +                  part.charset = charset +                end +            end + +            # Count the parts in a mail part recursively, including any attached messages. +            # Set the count on the parent mail, and set a url_part_number on the part itself. +            # Set the count for the first uudecoded part on the parent mail also. +            def count_parts(part, parent_mail) +                if part.multipart? 
+                    part.parts.each { |p| count_parts(p, parent_mail) } +                else +                    if part.rfc822_attachment +                        count_parts(part.rfc822_attachment, parent_mail) +                    else +                        parent_mail.count_parts_count += 1 +                        part.url_part_number = parent_mail.count_parts_count +                    end +                end +                parent_mail.count_first_uudecode_count = parent_mail.count_parts_count +            end + +            # Choose the best part from alternatives +            def choose_best_alternative(mail) +                if mail.html_part +                    return mail.html_part +                elsif mail.text_part +                    return mail.text_part +                else +                    return mail.parts.first +                end +            end + +            # Expand and normalize the parts of a mail, select the best part +            # wherever there is an alternative, and then count the returned +            # leaves and assign url_part values to them +            def get_attachment_leaves(mail) +                expand_and_normalize_parts(mail, mail) +                leaves = _get_attachment_leaves_recursive(mail, nil, mail) +                mail.count_parts_count = 0 +                count_parts(mail, mail) +                return leaves +            end + +            # Recurse through a mail part, selecting the best part wherever there is +            # an alternative +            def _get_attachment_leaves_recursive(part, within_rfc822_attachment, parent_mail) +                leaves_found = [] +                if part.multipart? 
+                    raise "no parts on multipart mail" if part.parts.size == 0 +                    if part.sub_type == 'alternative' +                        best_part = choose_best_alternative(part) +                        leaves_found += _get_attachment_leaves_recursive(best_part, +                                                                         within_rfc822_attachment, +                                                                         parent_mail) +                    else +                        # Add all parts +                        part.parts.each do |sub_part| +                            leaves_found += _get_attachment_leaves_recursive(sub_part, +                                                                             within_rfc822_attachment, +                                                                             parent_mail) +                        end +                    end +                else +                    # Add all the parts of a decoded attached message +                    if part.rfc822_attachment +                        leaves_found += _get_attachment_leaves_recursive(part.rfc822_attachment, +                                                                         part.rfc822_attachment, +                                                                         parent_mail) +                    else +                        # Store leaf +                        part.within_rfc822_attachment = within_rfc822_attachment +                        leaves_found += [part] +                    end +                end +                return leaves_found +            end + +            # Add selected useful headers from an attached message to its body +            def extract_attached_message_headers(leaf) +                body = get_part_body(leaf) +                # Test to see if we are in the first part of the attached +                # RFC822 message and it is text, if so add headers. 
+                if leaf.within_rfc822_attachment == leaf && get_content_type(leaf) == 'text/plain' +                    headers = "" +                    [ 'Date', 'Subject', 'From', 'To', 'Cc' ].each do |header| +                        if header_value = get_header_string(header, leaf.within_rfc822_attachment) +                            if !header_value.blank? +                                headers = headers + header + ": " + header_value.to_s + "\n" +                            end +                        end +                    end +                    # XXX call _convert_part_body_to_text here, but need to get charset somehow +                    # e.g. http://www.whatdotheyknow.com/request/1593/response/3088/attach/4/Freedom%20of%20Information%20request%20-%20car%20oval%20sticker:%20Article%2020,%20Convention%20on%20Road%20Traffic%201949.txt +                    body = headers + "\n" + body +                end +                body +            end + +            # Generate a hash of the attributes associated with each significant part of a Mail object +            def get_attachment_attributes(mail) +                leaves = get_attachment_leaves(mail) +                attachments = [] +                for leaf in leaves +                    body = get_part_body(leaf) +                    if leaf.within_rfc822_attachment +                        within_rfc822_subject = leaf.within_rfc822_attachment.subject +                        body = extract_attached_message_headers(leaf) +                    end +                    leaf_attributes = { :url_part_number => leaf.url_part_number, +                                        :content_type => get_content_type(leaf), +                                        :filename => get_part_file_name(leaf), +                                        :charset => leaf.charset, +                                        :within_rfc822_subject => within_rfc822_subject, +                                        :body => body, + 
                                       :hexdigest => Digest::MD5.hexdigest(body) } +                    attachments << leaf_attributes +                end +                return attachments +            end + +            # Format +            def address_from_name_and_email(name, email) +                if !MySociety::Validate.is_valid_email(email) +                    raise "invalid email " + email + " passed to address_from_name_and_email" +                end +                if name.nil? +                    return Mail::Address.new(email).to_s +                end +                address = Mail::Address.new +                address.display_name = name +                address.address = email +                address.to_s +            end + +            def address_from_string(string) +                Mail::Address.new(string).address +            end +        end +    end +end
\ No newline at end of file diff --git a/lib/mail_handler/backends/mail_extensions.rb b/lib/mail_handler/backends/mail_extensions.rb new file mode 100644 index 000000000..f756abd1a --- /dev/null +++ b/lib/mail_handler/backends/mail_extensions.rb @@ -0,0 +1,67 @@ +require 'mail/message' +require 'mail/fields/common/parameter_hash' +module Mail +    class Message +        attr_accessor :url_part_number +        attr_accessor :rfc822_attachment # when a whole email message is attached as text +        attr_accessor :within_rfc822_attachment # for parts within a message attached as text (for getting subject mainly) +        attr_accessor :count_parts_count +        attr_accessor :count_first_uudecode_count + +        # A patched version of the message initializer to work around a bug where stripping the original +        # input removes meaningful spaces - e.g. in the case of uuencoded bodies. +        def initialize(*args, &block) +            @body = nil +            @body_raw = nil +            @separate_parts = false +            @text_part = nil +            @html_part = nil +            @errors = nil +            @header = nil +            @charset = 'UTF-8' +            @defaulted_charset = true + +            @perform_deliveries = true +            @raise_delivery_errors = true + +            @delivery_handler = nil + +            @delivery_method = Mail.delivery_method.dup + +            @transport_encoding = Mail::Encodings.get_encoding('7bit') + +            @mark_for_delete = false + +            if args.flatten.first.respond_to?(:each_pair) +                init_with_hash(args.flatten.first) +            else +                # The replacement of this commented out line is the change. +                # init_with_string(args.flatten[0].to_s.strip) +                init_with_string(args.flatten[0].to_s) +            end + +            if block_given? 
+                instance_eval(&block) +            end + +            self +        end +    end + +    # A patched version of the parameter hash that handles nil values without throwing +    # an error. +    class ParameterHash < IndifferentHash + +        def encoded +          map.sort { |a,b| a.first.to_s <=> b.first.to_s }.map do |key_name, value| +            # The replacement of this commented out line is the change +            # unless value.ascii_only? +            unless value.nil? || value.ascii_only? +              value = Mail::Encodings.param_encode(value) +              key_name = "#{key_name}*" +            end +            %Q{#{key_name}=#{quote_token(value)}} +          end.join(";\r\n\s") +        end +    end +end
\ No newline at end of file diff --git a/lib/mail_handler/backends/tmail_backend.rb b/lib/mail_handler/backends/tmail_backend.rb new file mode 100644 index 000000000..02124cdb1 --- /dev/null +++ b/lib/mail_handler/backends/tmail_backend.rb @@ -0,0 +1,290 @@ +module MailHandler +    module Backends +        module TmailBackend + +            def backend() +                'TMail' +            end + +            # Turn raw data into a structured TMail::Mail object +            # Documentation at http://i.loveruby.net/en/projects/tmail/doc/ +            def mail_from_raw_email(data, decode=true) +                # Hack round bug in TMail's MIME decoding. +                # Report of TMail bug: +                # http://rubyforge.org/tracker/index.php?func=detail&aid=21810&group_id=4512&atid=17370 +                copy_of_raw_data = data.gsub(/; boundary=\s+"/im,'; boundary="') +                mail = TMail::Mail.parse(copy_of_raw_data) +                mail.base64_decode if decode +                mail +            end + +            # Extracts all attachments from the given TNEF file as a TMail::Mail object +            def mail_from_tnef(content) +                main = TMail::Mail.new +                main.set_content_type 'multipart', 'mixed', { 'boundary' => TMail.new_boundary } +                tnef_attachments(content).each do |attachment| +                    tmail_attachment = TMail::Mail.new +                    tmail_attachment['content-location'] = attachment[:filename] +                    tmail_attachment.body = attachment[:content] +                    main.parts << tmail_attachment +                end +                main +            end + +            # Return a copy of the file name for the mail part +            def get_part_file_name(mail_part) +                part_file_name = TMail::Mail.get_part_file_name(mail_part) +                if part_file_name.nil? 
+                    return nil +                end +                part_file_name = part_file_name.dup +                return part_file_name +            end + +            # Get the body of a mail part +            def get_part_body(mail_part) +                mail_part.body +            end + +            # Return the first from address if any +            def get_from_address(mail) +                if mail.from_addrs.nil? || mail.from_addrs.size == 0 +                    return nil +                end +                mail.from_addrs[0].spec +            end + +            # Return the first from name if any +            def get_from_name(mail) +                mail.from_name_if_present +            end + +            def get_all_addresses(mail) +                ((mail.to || []) + +                (mail.cc || []) + +                (mail.envelope_to || [])).uniq +            end + +            def empty_return_path?(mail) +                return false if mail['return-path'].nil? +                return true if mail['return-path'].addr.to_s == '<>' +                return false +            end + +            def get_auto_submitted(mail) +                mail['auto-submitted'] ? mail['auto-submitted'].body : nil +            end + +            def get_content_type(part) +                part.content_type +            end + +            def get_header_string(header, mail) +                mail.header_string(header) +            end + +            # Number the attachments in depth first tree order, for use in URLs. +            # XXX This fills in part.rfc822_attachment and part.url_part_number within +            # all the parts of the email (see monkeypatches in lib/mail_handler/tmail_extensions and +            # lib/mail_handler/mail_extensions for how these attributes are added). ensure_parts_counted +            # must be called before using the attributes. 
+            def ensure_parts_counted(mail) +                mail.count_parts_count = 0 +                _count_parts_recursive(mail, mail) +                # we carry on using these numeric ids for attachments uudecoded from within text parts +                mail.count_first_uudecode_count = mail.count_parts_count +            end +            def _count_parts_recursive(part, mail) +                if part.multipart? +                    part.parts.each do |p| +                        _count_parts_recursive(p, mail) +                    end +                else +                    part_filename = get_part_file_name(part) +                    begin +                        if part.content_type == 'message/rfc822' +                            # An email attached as text +                            # e.g. http://www.whatdotheyknow.com/request/64/response/102 +                            part.rfc822_attachment = mail_from_raw_email(part.body, decode=false) +                        elsif part.content_type == 'application/vnd.ms-outlook' || part_filename && AlaveteliFileTypes.filename_to_mimetype(part_filename) == 'application/vnd.ms-outlook' +                            # An email attached as an Outlook file +                            # e.g. 
http://www.whatdotheyknow.com/request/chinese_names_for_british_politi +                            msg = Mapi::Msg.open(StringIO.new(part.body)) +                            part.rfc822_attachment = mail_from_raw_email(msg.to_mime.to_s, decode=false) +                        elsif part.content_type == 'application/ms-tnef' +                            # A set of attachments in a TNEF file +                            part.rfc822_attachment = mail_from_tnef(part.body) +                        end +                    rescue +                        # If attached mail doesn't parse, treat it as text part +                        part.rfc822_attachment = nil +                    else +                        unless part.rfc822_attachment.nil? +                            _count_parts_recursive(part.rfc822_attachment, mail) +                        end +                    end +                    if part.rfc822_attachment.nil? +                        mail.count_parts_count += 1 +                        part.url_part_number = mail.count_parts_count +                    end +                end +            end + +            def get_attachment_attributes(mail) +                leaves = get_attachment_leaves(mail) +                # XXX we have to call ensure_parts_counted after get_attachment_leaves +                # which is really messy. +                ensure_parts_counted(mail) +                attachment_attributes = [] +                for leaf in leaves +                    body = get_part_body(leaf) +                    # As leaf.body causes MIME decoding which uses lots of RAM, do garbage collection here +                    # to prevent excess memory use. XXX not really sure if this helps reduce +                    # peak RAM use overall. Anyway, maybe there is something better to do than this. 
+                    GC.start +                    if leaf.within_rfc822_attachment +                        within_rfc822_subject = leaf.within_rfc822_attachment.subject +                        # Test to see if we are in the first part of the attached +                        # RFC822 message and it is text, if so add headers. +                        # XXX should probably use hunting algorithm to find main text part, rather than +                        # just expect it to be first. This will do for now though. +                        if leaf.within_rfc822_attachment == leaf && leaf.content_type == 'text/plain' +                            headers = "" +                            for header in [ 'Date', 'Subject', 'From', 'To', 'Cc' ] +                                if leaf.within_rfc822_attachment.header.include?(header.downcase) +                                    header_value = leaf.within_rfc822_attachment.header[header.downcase] +                                     if !header_value.blank? +                                        headers = headers + header + ": " + header_value.to_s + "\n" +                                    end +                                end +                            end +                            # XXX call _convert_part_body_to_text here, but need to get charset somehow +                            # e.g. http://www.whatdotheyknow.com/request/1593/response/3088/attach/4/Freedom%20of%20Information%20request%20-%20car%20oval%20sticker:%20Article%2020,%20Convention%20on%20Road%20Traffic%201949.txt +                            body = headers + "\n" + body + +                            # This is quick way of getting all headers, but instead we only add some a) to +                            # make it more usable, b) as at least one authority accidentally leaked security +                            # information into a header. 
+                            #attachment.body = leaf.within_rfc822_attachment.port.to_s +                        end +                    end +                    attachment_attributes << {:url_part_number => leaf.url_part_number, +                                              :content_type => get_content_type(leaf), +                                              :filename => get_part_file_name(leaf), +                                              :charset => leaf.charset, +                                              :within_rfc822_subject => within_rfc822_subject, +                                              :body => body, +                                              :hexdigest => Digest::MD5.hexdigest(body) } +                end +                attachment_attributes +            end + +            # (This risks losing info if the unchosen alternative is the only one to contain +            # useful info, but let's worry about that another time) +            def get_attachment_leaves(mail) +                return _get_attachment_leaves_recursive(mail, mail) +            end +            def _get_attachment_leaves_recursive(curr_mail, parent_mail, within_rfc822_attachment = nil) +                leaves_found = [] +                if curr_mail.multipart? 
+                    if curr_mail.parts.size == 0 +                        raise "no parts on multipart mail" +                    end + +                    if curr_mail.sub_type == 'alternative' +                        # Choose best part from alternatives +                        best_part = nil +                        # Take the last text/plain one, or else the first one +                        curr_mail.parts.each do |m| +                            if not best_part +                                best_part = m +                            elsif m.content_type == 'text/plain' +                                best_part = m +                            end +                        end +                        # Take an HTML one as even higher priority. (They tend +                        # to render better than text/plain, e.g. don't wrap links here: +                        # http://www.whatdotheyknow.com/request/amount_and_cost_of_freedom_of_in#incoming-72238 ) +                        curr_mail.parts.each do |m| +                            if m.content_type == 'text/html' +                                best_part = m +                            end +                        end +                        leaves_found += _get_attachment_leaves_recursive(best_part, parent_mail, within_rfc822_attachment) +                    else +                        # Add all parts +                        curr_mail.parts.each do |m| +                            leaves_found += _get_attachment_leaves_recursive(m, parent_mail, within_rfc822_attachment) +                        end +                    end +                else +                    # XXX Yuck. this section alters various content_types. That puts +                    # it into conflict with ensure_parts_counted which it has to be +                    # called both before and after.  It will fail with cases of +                    # attachments of attachments etc. 
+                    charset = curr_mail.charset # save this, because overwriting content_type also resets charset +                    # Don't allow nil content_types +                    if curr_mail.content_type.nil? +                        curr_mail.content_type = 'application/octet-stream' +                    end +                    # PDFs often come with this mime type, fix it up for view code +                    if curr_mail.content_type == 'application/octet-stream' +                        part_file_name = get_part_file_name(curr_mail) +                        part_body = get_part_body(curr_mail) +                        calc_mime = AlaveteliFileTypes.filename_and_content_to_mimetype(part_file_name, part_body) +                        if calc_mime +                            curr_mail.content_type = calc_mime +                        end +                    end + +                    # Use standard content types for Word documents etc. +                    curr_mail.content_type = normalise_content_type(curr_mail.content_type) +                    if curr_mail.content_type == 'message/rfc822' +                        ensure_parts_counted(parent_mail) # fills in rfc822_attachment variable +                        if curr_mail.rfc822_attachment.nil? +                            # Attached mail didn't parse, so treat as text +                            curr_mail.content_type = 'text/plain' +                        end +                    end +                    if curr_mail.content_type == 'application/vnd.ms-outlook' || curr_mail.content_type == 'application/ms-tnef' +                        ensure_parts_counted(parent_mail) # fills in rfc822_attachment variable +                        if curr_mail.rfc822_attachment.nil? 
+                            # Attached mail didn't parse, so treat as binary +                            curr_mail.content_type = 'application/octet-stream' +                        end +                    end +                    # If the part is an attachment of email +                    if curr_mail.content_type == 'message/rfc822' || curr_mail.content_type == 'application/vnd.ms-outlook' || curr_mail.content_type == 'application/ms-tnef' +                        ensure_parts_counted(parent_mail) # fills in rfc822_attachment variable +                        leaves_found += _get_attachment_leaves_recursive(curr_mail.rfc822_attachment, parent_mail, curr_mail.rfc822_attachment) +                    else +                        # Store leaf +                        curr_mail.within_rfc822_attachment = within_rfc822_attachment +                        leaves_found += [curr_mail] +                    end +                    # restore original charset +                    curr_mail.charset = charset +                end +                return leaves_found +            end + + +            def address_from_name_and_email(name, email) +                if !MySociety::Validate.is_valid_email(email) +                    raise "invalid email " + email + " passed to address_from_name_and_email" +                end +                if name.nil? +                    return TMail::Address.parse(email).to_s +                end +                # Botch an always quoted RFC address, then parse it +                name = name.gsub(/(["\\])/, "\\\\\\1") +                TMail::Address.parse('"' + name + '" <' + email + '>').to_s +            end + +            def address_from_string(string) +                TMail::Address.parse(string).address +            end + +        end +    end +end
\ No newline at end of file diff --git a/lib/tmail_extensions.rb b/lib/mail_handler/backends/tmail_extensions.rb index 6a533e658..3576a8eca 100644 --- a/lib/tmail_extensions.rb +++ b/lib/mail_handler/backends/tmail_extensions.rb @@ -15,6 +15,14 @@ require 'tmail/interface'  # These mainly used in app/models/incoming_message.rb  module TMail      class Mail +        # Monkeypatch! Adding some extra members to store extra info in. + +        attr_accessor :url_part_number +        attr_accessor :rfc822_attachment # when a whole email message is attached as text +        attr_accessor :within_rfc822_attachment # for parts within a message attached as text (for getting subject mainly) +        attr_accessor :count_parts_count +        attr_accessor :count_first_uudecode_count +          # Monkeypatch! (check to see if this becomes a standard function in          # TMail::Mail, then use that, whatever it is called)          def Mail.get_part_file_name(part) @@ -68,22 +76,6 @@ module TMail      end -    class Address -        # Monkeypatch! Constructor which makes a TMail::Address given -        # a name and an email -        def Address.address_from_name_and_email(name, email) -            if !MySociety::Validate.is_valid_email(email) -                raise "invalid email " + email + " passed to address_from_name_and_email" -            end -            if name.nil? -                return TMail::Address.parse(email) -            end -            # Botch an always quoted RFC address, then parse it -            name = name.gsub(/(["\\])/, "\\\\\\1") -            return TMail::Address.parse('"' + name + '" <' + email + '>') -        end -    end -      module TextUtils          # Monkeypatch! Much more aggressive list of characters to cause quoting          # than in normal TMail. e.g. Have found real cases where @ needs quoting. @@ -95,8 +87,8 @@ module TMail      end  end -# Monkeypatch! 
# Handles the parsing of email
require 'tmpdir'
require 'tempfile'

module MailHandler

    # The backend is chosen by Ruby version: the mail gem on 1.9 and above,
    # TMail (via action_mailer) below that.
    if RUBY_VERSION.to_f >= 1.9
        require 'mail'
        require 'backends/mail_extensions'
        require 'backends/mail_backend'
        include Backends::MailBackend
    else
        require 'action_mailer'
        require 'backends/tmail_extensions'
        require 'backends/tmail_backend'
        include Backends::TmailBackend
    end

    # Content types as they turn up in mail, mapped to the standard content
    # type used for them instead.
    CONTENT_TYPE_ALIASES = {
        # e.g. http://www.whatdotheyknow.com/request/93/response/250
        'application/excel' => 'application/vnd.ms-excel',
        'application/msexcel' => 'application/vnd.ms-excel',
        'application/x-ms-excel' => 'application/vnd.ms-excel',
        'application/mspowerpoint' => 'application/vnd.ms-powerpoint',
        'application/x-ms-powerpoint' => 'application/vnd.ms-powerpoint',
        'application/msword' => 'application/vnd.ms-word',
        'application/x-ms-word' => 'application/vnd.ms-word',
        'application/x-zip-compressed' => 'application/zip',
        # e.g. http://www.whatdotheyknow.com/request/copy_of_current_swessex_scr_opt#incoming-9928
        'application/acrobat' => 'application/pdf'
    }.freeze

    # Returns a set of attachments from the given TNEF contents
    # The TNEF contents also contains the message body, but in general this is the
    # same as the message body in the message proper.
    #
    # content - the raw TNEF data, written to the stdin of the external
    #           `tnef` program
    #
    # Returns an array of hashes with :content (the file's bytes) and
    # :filename, one per file found in the temporary directory handed to
    # `tnef`, in sorted filename order.
    # Raises IOError if `tnef` is killed by a signal, exits non-zero, or
    # leaves no files behind.
    def tnef_attachments(content)
        attachments = []
        Dir.mktmpdir do |dir|
            IO.popen("#{`which tnef`.chomp} -K -C #{dir}", "wb") do |f|
                f.write(content)
                f.close # closing the pipe sets $? to the exit status of tnef
                if $?.signaled?
                    raise IOError, "tnef exited with signal #{$?.termsig}"
                end
                if $?.exited? && $?.exitstatus != 0
                    raise IOError, "tnef exited with status #{$?.exitstatus}"
                end
            end
            Dir.entries(dir).sort.each do |file| # sort for deterministic behaviour
                next if file == "." || file == ".."
                # Block form so the handle is closed as soon as it has been read
                file_content = File.open("#{dir}/#{file}", "rb") { |io| io.read }
                attachments << { :content => file_content,
                                 :filename => file }
            end
            if attachments.empty?
                raise IOError, "tnef produced no attachments"
            end
        end
        attachments
    end

    # Maps the non-standard content types listed in CONTENT_TYPE_ALIASES to
    # their standard equivalent. Any other value, including nil, is returned
    # unchanged.
    def normalise_content_type(content_type)
        CONTENT_TYPE_ALIASES.fetch(content_type, content_type)
    end

    # Extracts plain text from one attachment body.
    #
    # content_type - selects the extractor; types not handled below yield ''
    # body         - the attachment content
    # charset      - only used for text/html, where it is passed to elinks as
    #                the assumed codepage of the document
    #
    # note re. charset: TMail always tries to convert email bodies
    # to UTF8 by default, so normally it should already be that.
    #
    # Returns the extracted text as a String.
    def get_attachment_text_one_file(content_type, body, charset = 'utf-8')
        text = ''
        # XXX - tell all these command line tools to return utf-8
        if content_type == 'text/plain'
            text += body + "\n\n"
        else
            tempfile = Tempfile.new('foiextract')
            begin
                tempfile.binmode
                tempfile.print body
                tempfile.flush
                # :append_to makes the command append its output to text in place
                default_params = { :append_to => text, :binary_output => false }
                case content_type
                when 'application/vnd.ms-word'
                    AlaveteliExternalCommand.run("wvText", tempfile.path, tempfile.path + ".txt")
                    # Try catdoc if we get into trouble (e.g. for InfoRequestEvent 2701)
                    if not File.exist?(tempfile.path + ".txt")
                        AlaveteliExternalCommand.run("catdoc", tempfile.path, default_params)
                    else
                        text += File.read(tempfile.path + ".txt") + "\n\n"
                        File.unlink(tempfile.path + ".txt")
                    end
                when 'application/rtf'
                    # catdoc on RTF produces less comments and extra bumf than --text option to unrtf
                    AlaveteliExternalCommand.run("catdoc", tempfile.path, default_params)
                when 'text/html'
                    # lynx wordwraps links in its output, which then don't
                    # get formatted properly by Alaveteli. We use elinks
                    # instead, which doesn't do that.
                    AlaveteliExternalCommand.run("elinks", "-eval", "set document.codepage.assume = \"#{charset}\"",
                                                           "-eval", "set document.codepage.force_assumed = 1",
                                                           "-dump-charset", "utf-8",
                                                           "-force-html", "-dump",
                                                           tempfile.path,
                                                           default_params.merge(:env => {"LANG" => "C"}))
                when 'application/vnd.ms-excel'
                    # Bit crazy using /usr/bin/strings - but xls2csv, xlhtml and
                    # py_xls2txt only extract text from cells, not from floating
                    # notes. catdoc may be fooled by weird character sets, but will
                    # probably do for UK FOI requests.
                    AlaveteliExternalCommand.run("/usr/bin/strings", tempfile.path, default_params)
                when 'application/vnd.ms-powerpoint'
                    # ppthtml seems to catch more text, but only outputs HTML when
                    # we want text, so just use catppt for now
                    AlaveteliExternalCommand.run("catppt", tempfile.path, default_params)
                when 'application/pdf'
                    AlaveteliExternalCommand.run("pdftotext", tempfile.path, "-", default_params)
                when 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
                    # This is Microsoft's XML office document format.
                    # Just pull out the main XML file, and strip it of text.
                    xml = AlaveteliExternalCommand.run("/usr/bin/unzip", "-qq",
                                                                         "-c",
                                                                         tempfile.path,
                                                                         "word/document.xml",
                                                                         {:binary_output => false})
                    if !xml.nil?
                        doc = REXML::Document.new(xml)
                        text += doc.each_element( './/text()' ){}.join(" ")
                    end
                when 'application/zip'
                    # recurse into zip files
                    begin
                        zip_file = Zip::ZipFile.open(tempfile.path)
                        begin
                            text += get_attachment_text_from_zip_file(zip_file)
                        ensure
                            # close the archive even if reading an entry failed
                            zip_file.close()
                        end
                    rescue
                        $stderr.puts("Error processing zip file: #{$!.inspect}")
                    end
                end
            ensure
                # Don't leave the descriptor open when an extractor raises
                tempfile.close
            end
        end

        text
    end

    # Concatenates the text extracted from every file entry of +zip_file+.
    # The content type of each entry is derived from its filename, falling
    # back to application/octet-stream. Entries that cannot be read are
    # skipped.
    def get_attachment_text_from_zip_file(zip_file)
        text = ""
        zip_file.each do |entry|
            next unless entry.file?

            filename = entry.to_s
            begin
                body = entry.get_input_stream.read
            rescue
                # move to next attachment silently if there were problems
                # XXX really should reduce this to specific exceptions?
                # e.g. password protected
                next
            end
            content_type = AlaveteliFileTypes.filename_to_mimetype(filename) || 'application/octet-stream'

            text += get_attachment_text_one_file(content_type, body)
        end
        text
    end

    # Turn instance methods into class methods
    extend self

end
in the message proper. -    def self.as_tmail(content) -        main = TMail::Mail.new -        main.set_content_type 'multipart', 'mixed', { 'boundary' => TMail.new_boundary } -        Dir.mktmpdir do |dir| -            IO.popen("#{`which tnef`.chomp} -K -C #{dir}", "w") do |f| -                f.write(content) -                f.close -                if $?.signaled? -                    raise IOError, "tnef exited with signal #{$?.termsig}" -                end -                if $?.exited? && $?.exitstatus != 0 -                    raise IOError, "tnef exited with status #{$?.exitstatus}" -                end -            end -            found = 0 -            Dir.new(dir).sort.each do |file| # sort for deterministic behaviour -                if file != "." && file != ".." -                    file_content = File.open("#{dir}/#{file}", "r").read -                    attachment = TMail::Mail.new -                    attachment['content-location'] = file -                    attachment.body = file_content -                    main.parts << attachment -                    found += 1 -                end -            end -            if found == 0 -                raise IOError, "tnef produced no attachments" -            end -        end -        main -    end - -end diff --git a/lib/world_foi_websites.rb b/lib/world_foi_websites.rb index 2ff924713..c3f3655df 100644 --- a/lib/world_foi_websites.rb +++ b/lib/world_foi_websites.rb @@ -46,10 +46,10 @@ class WorldFOIWebsites                                    :country_name => "Chile",                                    :country_iso_code => "CL",                                    :url => "http://accesointeligente.org"}, -                              {:country_name => "Australia", +                              {:name => "Right To Know", +                                  :country_name => "Australia",                                    :country_iso_code => "AU", -                                  # The Australian 
site is not yet live. So, not including name & url yet. -                                  }, +                                  :url => "http://www.righttoknow.org.au"},                                {:name => "Informace pro Vsechny",                                    :country_name => "Česká republika",                                    :country_iso_code => "CZ", | 
