aboutsummaryrefslogtreecommitdiffstats
path: root/lib
diff options
context:
space:
mode:
Diffstat (limited to 'lib')
-rw-r--r-- lib/alaveteli_external_command.rb 14
-rw-r--r-- lib/configuration.rb 1
-rw-r--r-- lib/mail_handler/backends/mail_backend.rb 323
-rw-r--r-- lib/mail_handler/backends/mail_extensions.rb 67
-rw-r--r-- lib/mail_handler/backends/tmail_backend.rb 290
-rw-r--r-- lib/mail_handler/backends/tmail_extensions.rb (renamed from lib/tmail_extensions.rb) 28
-rw-r--r-- lib/mail_handler/mail_handler.rb 179
-rw-r--r-- lib/tasks/translation.rake 4
-rw-r--r-- lib/tnef.rb 40
-rw-r--r-- lib/world_foi_websites.rb 6
10 files changed, 887 insertions, 65 deletions
diff --git a/lib/alaveteli_external_command.rb b/lib/alaveteli_external_command.rb
index 3bfc34e3a..24b4b1aa8 100644
--- a/lib/alaveteli_external_command.rb
+++ b/lib/alaveteli_external_command.rb
@@ -2,6 +2,12 @@ require 'external_command'
module AlaveteliExternalCommand
class << self
+ # Final argument can be a hash of options.
+ # Valid options are:
+ # :append_to - string to append the output of the process to
+ # :stdin_string - stdin string to pass to the process
+ # :binary_output - boolean flag for treating the output as binary or text (only significant
+ # in ruby 1.9 and above)
def run(program_name, *args)
# Run an external program, and return its output.
# Standard error is suppressed unless the program
@@ -10,7 +16,7 @@ module AlaveteliExternalCommand
if !args.empty? && args[-1].is_a?(Hash)
opts = args.pop
end
-
+
if program_name =~ %r(^/)
program_path = program_name
else
@@ -24,12 +30,16 @@ module AlaveteliExternalCommand
end
raise "Could not find #{program_name} in any of #{Configuration::utility_search_path.join(', ')}" if !found
end
-
+
xc = ExternalCommand.new(program_path, *args)
if opts.has_key? :append_to
xc.out = opts[:append_to]
end
+ if opts.has_key? :binary_output
+ xc.binary_mode = opts[:binary_output]
+ end
xc.run(opts[:stdin_string] || "", opts[:env] || {})
+
if xc.status != 0
# Error
$stderr.puts("Error from #{program_name} #{args.join(' ')}:")
diff --git a/lib/configuration.rb b/lib/configuration.rb
index abd0f5cdc..11fe1c56e 100644
--- a/lib/configuration.rb
+++ b/lib/configuration.rb
@@ -25,6 +25,7 @@ module Configuration
:GA_CODE => '',
:GAZE_URL => '',
:HTML_TO_PDF_COMMAND => '',
+ :INCLUDE_DEFAULT_LOCALE_IN_URLS => true,
:INCOMING_EMAIL_DOMAIN => 'localhost',
:INCOMING_EMAIL_PREFIX => '',
:INCOMING_EMAIL_SECRET => 'dummysecret',
diff --git a/lib/mail_handler/backends/mail_backend.rb b/lib/mail_handler/backends/mail_backend.rb
new file mode 100644
index 000000000..b75e6ed63
--- /dev/null
+++ b/lib/mail_handler/backends/mail_backend.rb
@@ -0,0 +1,323 @@
+require 'mail'
+
+module MailHandler
+ module Backends
+ module MailBackend
+
+ def backend()
+ 'Mail'
+ end
+
+ # Note that the decode flag is not yet used
+ def mail_from_raw_email(data, decode=true)
+ Mail.new(data)
+ end
+
+ # Extracts all attachments from the given TNEF file as a Mail object
+ def mail_from_tnef(content)
+ main = Mail.new
+ tnef_attachments(content).each do |attachment|
+ main.add_file(attachment)
+ end
+ main.ready_to_send!
+ main
+ end
+
+ # Returns an outlook message as a Mail object
+ def mail_from_outlook(content)
+ msg = Mapi::Msg.open(StringIO.new(content))
+ mail = mail_from_raw_email(msg.to_mime.to_s)
+ mail.ready_to_send!
+ mail
+ end
+
+ # Return a copy of the file name for the mail part
+ def get_part_file_name(part)
+ part_file_name = part.filename
+ part_file_name.nil? ? nil : part_file_name.dup
+ end
+
+ # Get the body of a mail part
+ def get_part_body(part)
+ part.body.decoded
+ end
+
+ # Return the first from field if any
+ def first_from(mail)
+ if mail[:from]
+ begin
+ mail[:from].addrs[0]
+ mail[:from].decoded
+ return mail[:from].addrs[0]
+ rescue
+ return mail[:from].value
+ end
+ else
+ nil
+ end
+ end
+
+ # Return the first from address if any
+ def get_from_address(mail)
+ first_from = first_from(mail)
+ if first_from
+ if first_from.is_a?(String)
+ return nil
+ else
+ return first_from.address
+ end
+ else
+ return nil
+ end
+ end
+
+ # Return the first from name if any
+ def get_from_name(mail)
+ first_from = first_from(mail)
+ if first_from
+ if first_from.is_a?(String)
+ return nil
+ else
+ return first_from.display_name ? eval(%Q{"#{first_from.display_name}"}) : nil
+ end
+ else
+ return nil
+ end
+ end
+
+ def get_all_addresses(mail)
+ envelope_to = mail['envelope-to'] ? [mail['envelope-to'].value] : []
+ ((mail.to || []) +
+ (mail.cc || []) +
+ (envelope_to || [])).uniq
+ end
+
+ def empty_return_path?(mail)
+ return false if mail['return-path'].nil?
+ return true if mail['return-path'].value.blank?
+ return false
+ end
+
+ def get_auto_submitted(mail)
+ mail['auto-submitted'] ? mail['auto-submitted'].value : nil
+ end
+
+ def get_content_type(part)
+ part.content_type ? part.content_type.split(';')[0] : nil
+ end
+
+ def get_header_string(header, mail)
+ mail.header[header] ? mail.header[header].to_s : nil
+ end
+
+ # Detects whether a mail part is an Outlook email
+ def is_outlook?(part)
+ filename = get_part_file_name(part)
+ return true if get_content_type(part) == 'application/vnd.ms-outlook'
+ if filename && AlaveteliFileTypes.filename_to_mimetype(filename) == 'application/vnd.ms-outlook'
+ return true
+ end
+ return false
+ end
+
+ # Convert a mail part which is an attached mail in one of
+ # several formats into a mail object and set it as the
+ # rfc822_attachment on the part. If the mail part can't be
+ # converted, the content type on the part is updated to
+ # 'text/plain' for an RFC822 attachment, and 'application/octet-stream'
+ # for other types
+ def decode_attached_part(part, parent_mail)
+ if get_content_type(part) == 'message/rfc822'
+ # An email attached as text
+ part.rfc822_attachment = mail_from_raw_email(part.body)
+ if part.rfc822_attachment.nil?
+ # Attached mail didn't parse, so treat as text
+ part.content_type = 'text/plain'
+ end
+ elsif is_outlook?(part)
+ part.rfc822_attachment = mail_from_outlook(part.body.decoded)
+ if part.rfc822_attachment.nil?
+ # Attached mail didn't parse, so treat as binary
+ part.content_type = 'application/octet-stream'
+ end
+ elsif get_content_type(part) == 'application/ms-tnef'
+ # A set of attachments in a TNEF file
+ part.rfc822_attachment = mail_from_tnef(part.body.decoded)
+ if part.rfc822_attachment.nil?
+ # Attached mail didn't parse, so treat as binary
+ part.content_type = 'application/octet-stream'
+ end
+ end
+ if part.rfc822_attachment
+ expand_and_normalize_parts(part.rfc822_attachment, parent_mail)
+ end
+ end
+
+ # Expand and normalize a mail part recursively. Decodes attached messages into
+ # Mail objects wherever possible. Sets a default content type if none is
+ # set. Tries to set a more specific content type for binary content types.
+ def expand_and_normalize_parts(part, parent_mail)
+ if part.multipart?
+ part.parts.each{ |sub_part| expand_and_normalize_parts(sub_part, parent_mail) }
+ else
+ part_filename = get_part_file_name(part)
+ charset = part.charset # save this, because overwriting content_type also resets charset
+
+ # Don't allow nil content_types
+ if get_content_type(part).nil?
+ part.content_type = 'application/octet-stream'
+ end
+
+ # PDFs often come with this mime type, fix it up for view code
+ if get_content_type(part) == 'application/octet-stream'
+ part_body = get_part_body(part)
+ calc_mime = AlaveteliFileTypes.filename_and_content_to_mimetype(part_filename,
+ part_body)
+ if calc_mime
+ part.content_type = calc_mime
+ end
+ end
+
+ # Use standard content types for Word documents etc.
+ part.content_type = normalise_content_type(get_content_type(part))
+ decode_attached_part(part, parent_mail)
+ part.charset = charset
+ end
+ end
+
+ # Count the parts in a mail part recursively, including any attached messages.
+ # Set the count on the parent mail, and set a url_part_number on the part itself.
+ # Set the count for the first uudecoded part on the parent mail also.
+ def count_parts(part, parent_mail)
+ if part.multipart?
+ part.parts.each { |p| count_parts(p, parent_mail) }
+ else
+ if part.rfc822_attachment
+ count_parts(part.rfc822_attachment, parent_mail)
+ else
+ parent_mail.count_parts_count += 1
+ part.url_part_number = parent_mail.count_parts_count
+ end
+ end
+ parent_mail.count_first_uudecode_count = parent_mail.count_parts_count
+ end
+
+ # Choose the best part from alternatives
+ def choose_best_alternative(mail)
+ if mail.html_part
+ return mail.html_part
+ elsif mail.text_part
+ return mail.text_part
+ else
+ return mail.parts.first
+ end
+ end
+
+ # Expand and normalize the parts of a mail, select the best part
+ # wherever there is an alternative, and then count the returned
+ # leaves and assign url_part values to them
+ def get_attachment_leaves(mail)
+ expand_and_normalize_parts(mail, mail)
+ leaves = _get_attachment_leaves_recursive(mail, nil, mail)
+ mail.count_parts_count = 0
+ count_parts(mail, mail)
+ return leaves
+ end
+
+ # Recurse through a mail part, selecting the best part wherever there is
+ # an alternative
+ def _get_attachment_leaves_recursive(part, within_rfc822_attachment, parent_mail)
+ leaves_found = []
+ if part.multipart?
+ raise "no parts on multipart mail" if part.parts.size == 0
+ if part.sub_type == 'alternative'
+ best_part = choose_best_alternative(part)
+ leaves_found += _get_attachment_leaves_recursive(best_part,
+ within_rfc822_attachment,
+ parent_mail)
+ else
+ # Add all parts
+ part.parts.each do |sub_part|
+ leaves_found += _get_attachment_leaves_recursive(sub_part,
+ within_rfc822_attachment,
+ parent_mail)
+ end
+ end
+ else
+ # Add all the parts of a decoded attached message
+ if part.rfc822_attachment
+ leaves_found += _get_attachment_leaves_recursive(part.rfc822_attachment,
+ part.rfc822_attachment,
+ parent_mail)
+ else
+ # Store leaf
+ part.within_rfc822_attachment = within_rfc822_attachment
+ leaves_found += [part]
+ end
+ end
+ return leaves_found
+ end
+
+ # Add selected useful headers from an attached message to its body
+ def extract_attached_message_headers(leaf)
+ body = get_part_body(leaf)
+ # Test to see if we are in the first part of the attached
+ # RFC822 message and it is text, if so add headers.
+ if leaf.within_rfc822_attachment == leaf && get_content_type(leaf) == 'text/plain'
+ headers = ""
+ [ 'Date', 'Subject', 'From', 'To', 'Cc' ].each do |header|
+ if header_value = get_header_string(header, leaf.within_rfc822_attachment)
+ if !header_value.blank?
+ headers = headers + header + ": " + header_value.to_s + "\n"
+ end
+ end
+ end
+ # XXX call _convert_part_body_to_text here, but need to get charset somehow
+ # e.g. http://www.whatdotheyknow.com/request/1593/response/3088/attach/4/Freedom%20of%20Information%20request%20-%20car%20oval%20sticker:%20Article%2020,%20Convention%20on%20Road%20Traffic%201949.txt
+ body = headers + "\n" + body
+ end
+ body
+ end
+
+ # Generate a hash of the attributes associated with each significant part of a Mail object
+ def get_attachment_attributes(mail)
+ leaves = get_attachment_leaves(mail)
+ attachments = []
+ for leaf in leaves
+ body = get_part_body(leaf)
+ if leaf.within_rfc822_attachment
+ within_rfc822_subject = leaf.within_rfc822_attachment.subject
+ body = extract_attached_message_headers(leaf)
+ end
+ leaf_attributes = { :url_part_number => leaf.url_part_number,
+ :content_type => get_content_type(leaf),
+ :filename => get_part_file_name(leaf),
+ :charset => leaf.charset,
+ :within_rfc822_subject => within_rfc822_subject,
+ :body => body,
+ :hexdigest => Digest::MD5.hexdigest(body) }
+ attachments << leaf_attributes
+ end
+ return attachments
+ end
+
+ # Format a display name and an email address as a single address string
+ def address_from_name_and_email(name, email)
+ if !MySociety::Validate.is_valid_email(email)
+ raise "invalid email " + email + " passed to address_from_name_and_email"
+ end
+ if name.nil?
+ return Mail::Address.new(email).to_s
+ end
+ address = Mail::Address.new
+ address.display_name = name
+ address.address = email
+ address.to_s
+ end
+
+ def address_from_string(string)
+ Mail::Address.new(string).address
+ end
+ end
+ end
+end \ No newline at end of file
diff --git a/lib/mail_handler/backends/mail_extensions.rb b/lib/mail_handler/backends/mail_extensions.rb
new file mode 100644
index 000000000..f756abd1a
--- /dev/null
+++ b/lib/mail_handler/backends/mail_extensions.rb
@@ -0,0 +1,67 @@
+require 'mail/message'
+require 'mail/fields/common/parameter_hash'
+module Mail
+ class Message
+ attr_accessor :url_part_number
+ attr_accessor :rfc822_attachment # when a whole email message is attached as text
+ attr_accessor :within_rfc822_attachment # for parts within a message attached as text (for getting subject mainly)
+ attr_accessor :count_parts_count
+ attr_accessor :count_first_uudecode_count
+
+ # A patched version of the message initializer to work around a bug where stripping the original
+ # input removes meaningful spaces - e.g. in the case of uuencoded bodies.
+ def initialize(*args, &block)
+ @body = nil
+ @body_raw = nil
+ @separate_parts = false
+ @text_part = nil
+ @html_part = nil
+ @errors = nil
+ @header = nil
+ @charset = 'UTF-8'
+ @defaulted_charset = true
+
+ @perform_deliveries = true
+ @raise_delivery_errors = true
+
+ @delivery_handler = nil
+
+ @delivery_method = Mail.delivery_method.dup
+
+ @transport_encoding = Mail::Encodings.get_encoding('7bit')
+
+ @mark_for_delete = false
+
+ if args.flatten.first.respond_to?(:each_pair)
+ init_with_hash(args.flatten.first)
+ else
+ # The replacement of this commented out line is the change.
+ # init_with_string(args.flatten[0].to_s.strip)
+ init_with_string(args.flatten[0].to_s)
+ end
+
+ if block_given?
+ instance_eval(&block)
+ end
+
+ self
+ end
+ end
+
+ # A patched version of the parameter hash that handles nil values without throwing
+ # an error.
+ class ParameterHash < IndifferentHash
+
+ def encoded
+ map.sort { |a,b| a.first.to_s <=> b.first.to_s }.map do |key_name, value|
+ # The replacement of this commented out line is the change
+ # unless value.ascii_only?
+ unless value.nil? || value.ascii_only?
+ value = Mail::Encodings.param_encode(value)
+ key_name = "#{key_name}*"
+ end
+ %Q{#{key_name}=#{quote_token(value)}}
+ end.join(";\r\n\s")
+ end
+ end
+end \ No newline at end of file
diff --git a/lib/mail_handler/backends/tmail_backend.rb b/lib/mail_handler/backends/tmail_backend.rb
new file mode 100644
index 000000000..02124cdb1
--- /dev/null
+++ b/lib/mail_handler/backends/tmail_backend.rb
@@ -0,0 +1,290 @@
+module MailHandler
+ module Backends
+ module TmailBackend
+
+ def backend()
+ 'TMail'
+ end
+
+ # Turn raw data into a structured TMail::Mail object
+ # Documentation at http://i.loveruby.net/en/projects/tmail/doc/
+ def mail_from_raw_email(data, decode=true)
+ # Hack round bug in TMail's MIME decoding.
+ # Report of TMail bug:
+ # http://rubyforge.org/tracker/index.php?func=detail&aid=21810&group_id=4512&atid=17370
+ copy_of_raw_data = data.gsub(/; boundary=\s+"/im,'; boundary="')
+ mail = TMail::Mail.parse(copy_of_raw_data)
+ mail.base64_decode if decode
+ mail
+ end
+
+ # Extracts all attachments from the given TNEF file as a TMail::Mail object
+ def mail_from_tnef(content)
+ main = TMail::Mail.new
+ main.set_content_type 'multipart', 'mixed', { 'boundary' => TMail.new_boundary }
+ tnef_attachments(content).each do |attachment|
+ tmail_attachment = TMail::Mail.new
+ tmail_attachment['content-location'] = attachment[:filename]
+ tmail_attachment.body = attachment[:content]
+ main.parts << tmail_attachment
+ end
+ main
+ end
+
+ # Return a copy of the file name for the mail part
+ def get_part_file_name(mail_part)
+ part_file_name = TMail::Mail.get_part_file_name(mail_part)
+ if part_file_name.nil?
+ return nil
+ end
+ part_file_name = part_file_name.dup
+ return part_file_name
+ end
+
+ # Get the body of a mail part
+ def get_part_body(mail_part)
+ mail_part.body
+ end
+
+ # Return the first from address if any
+ def get_from_address(mail)
+ if mail.from_addrs.nil? || mail.from_addrs.size == 0
+ return nil
+ end
+ mail.from_addrs[0].spec
+ end
+
+ # Return the first from name if any
+ def get_from_name(mail)
+ mail.from_name_if_present
+ end
+
+ def get_all_addresses(mail)
+ ((mail.to || []) +
+ (mail.cc || []) +
+ (mail.envelope_to || [])).uniq
+ end
+
+ def empty_return_path?(mail)
+ return false if mail['return-path'].nil?
+ return true if mail['return-path'].addr.to_s == '<>'
+ return false
+ end
+
+ def get_auto_submitted(mail)
+ mail['auto-submitted'] ? mail['auto-submitted'].body : nil
+ end
+
+ def get_content_type(part)
+ part.content_type
+ end
+
+ def get_header_string(header, mail)
+ mail.header_string(header)
+ end
+
+ # Number the attachments in depth first tree order, for use in URLs.
+ # XXX This fills in part.rfc822_attachment and part.url_part_number within
+ # all the parts of the email (see monkeypatches in lib/mail_handler/backends/tmail_extensions and
+ # lib/mail_handler/backends/mail_extensions for how these attributes are added). ensure_parts_counted
+ # must be called before using the attributes.
+ def ensure_parts_counted(mail)
+ mail.count_parts_count = 0
+ _count_parts_recursive(mail, mail)
+ # we carry on using these numeric ids for attachments uudecoded from within text parts
+ mail.count_first_uudecode_count = mail.count_parts_count
+ end
+ def _count_parts_recursive(part, mail)
+ if part.multipart?
+ part.parts.each do |p|
+ _count_parts_recursive(p, mail)
+ end
+ else
+ part_filename = get_part_file_name(part)
+ begin
+ if part.content_type == 'message/rfc822'
+ # An email attached as text
+ # e.g. http://www.whatdotheyknow.com/request/64/response/102
+ part.rfc822_attachment = mail_from_raw_email(part.body, decode=false)
+ elsif part.content_type == 'application/vnd.ms-outlook' || part_filename && AlaveteliFileTypes.filename_to_mimetype(part_filename) == 'application/vnd.ms-outlook'
+ # An email attached as an Outlook file
+ # e.g. http://www.whatdotheyknow.com/request/chinese_names_for_british_politi
+ msg = Mapi::Msg.open(StringIO.new(part.body))
+ part.rfc822_attachment = mail_from_raw_email(msg.to_mime.to_s, decode=false)
+ elsif part.content_type == 'application/ms-tnef'
+ # A set of attachments in a TNEF file
+ part.rfc822_attachment = mail_from_tnef(part.body)
+ end
+ rescue
+ # If attached mail doesn't parse, treat it as text part
+ part.rfc822_attachment = nil
+ else
+ unless part.rfc822_attachment.nil?
+ _count_parts_recursive(part.rfc822_attachment, mail)
+ end
+ end
+ if part.rfc822_attachment.nil?
+ mail.count_parts_count += 1
+ part.url_part_number = mail.count_parts_count
+ end
+ end
+ end
+
+ def get_attachment_attributes(mail)
+ leaves = get_attachment_leaves(mail)
+ # XXX we have to call ensure_parts_counted after get_attachment_leaves
+ # which is really messy.
+ ensure_parts_counted(mail)
+ attachment_attributes = []
+ for leaf in leaves
+ body = get_part_body(leaf)
+ # As leaf.body causes MIME decoding which uses lots of RAM, do garbage collection here
+ # to prevent excess memory use. XXX not really sure if this helps reduce
+ # peak RAM use overall. Anyway, maybe there is something better to do than this.
+ GC.start
+ if leaf.within_rfc822_attachment
+ within_rfc822_subject = leaf.within_rfc822_attachment.subject
+ # Test to see if we are in the first part of the attached
+ # RFC822 message and it is text, if so add headers.
+ # XXX should probably use hunting algorithm to find main text part, rather than
+ # just expect it to be first. This will do for now though.
+ if leaf.within_rfc822_attachment == leaf && leaf.content_type == 'text/plain'
+ headers = ""
+ for header in [ 'Date', 'Subject', 'From', 'To', 'Cc' ]
+ if leaf.within_rfc822_attachment.header.include?(header.downcase)
+ header_value = leaf.within_rfc822_attachment.header[header.downcase]
+ if !header_value.blank?
+ headers = headers + header + ": " + header_value.to_s + "\n"
+ end
+ end
+ end
+ # XXX call _convert_part_body_to_text here, but need to get charset somehow
+ # e.g. http://www.whatdotheyknow.com/request/1593/response/3088/attach/4/Freedom%20of%20Information%20request%20-%20car%20oval%20sticker:%20Article%2020,%20Convention%20on%20Road%20Traffic%201949.txt
+ body = headers + "\n" + body
+
+ # This is a quick way of getting all headers, but instead we only add some a) to
+ # make it more usable, b) as at least one authority accidentally leaked security
+ # information into a header.
+ #attachment.body = leaf.within_rfc822_attachment.port.to_s
+ end
+ end
+ attachment_attributes << {:url_part_number => leaf.url_part_number,
+ :content_type => get_content_type(leaf),
+ :filename => get_part_file_name(leaf),
+ :charset => leaf.charset,
+ :within_rfc822_subject => within_rfc822_subject,
+ :body => body,
+ :hexdigest => Digest::MD5.hexdigest(body) }
+ end
+ attachment_attributes
+ end
+
+ # (This risks losing info if the unchosen alternative is the only one to contain
+ # useful info, but let's worry about that another time)
+ def get_attachment_leaves(mail)
+ return _get_attachment_leaves_recursive(mail, mail)
+ end
+ def _get_attachment_leaves_recursive(curr_mail, parent_mail, within_rfc822_attachment = nil)
+ leaves_found = []
+ if curr_mail.multipart?
+ if curr_mail.parts.size == 0
+ raise "no parts on multipart mail"
+ end
+
+ if curr_mail.sub_type == 'alternative'
+ # Choose best part from alternatives
+ best_part = nil
+ # Take the last text/plain one, or else the first one
+ curr_mail.parts.each do |m|
+ if not best_part
+ best_part = m
+ elsif m.content_type == 'text/plain'
+ best_part = m
+ end
+ end
+ # Take an HTML one as even higher priority. (They tend
+ # to render better than text/plain, e.g. don't wrap links here:
+ # http://www.whatdotheyknow.com/request/amount_and_cost_of_freedom_of_in#incoming-72238 )
+ curr_mail.parts.each do |m|
+ if m.content_type == 'text/html'
+ best_part = m
+ end
+ end
+ leaves_found += _get_attachment_leaves_recursive(best_part, parent_mail, within_rfc822_attachment)
+ else
+ # Add all parts
+ curr_mail.parts.each do |m|
+ leaves_found += _get_attachment_leaves_recursive(m, parent_mail, within_rfc822_attachment)
+ end
+ end
+ else
+ # XXX Yuck. this section alters various content_types. That puts
+ # it into conflict with ensure_parts_counted which it has to be
+ # called both before and after. It will fail with cases of
+ # attachments of attachments etc.
+ charset = curr_mail.charset # save this, because overwriting content_type also resets charset
+ # Don't allow nil content_types
+ if curr_mail.content_type.nil?
+ curr_mail.content_type = 'application/octet-stream'
+ end
+ # PDFs often come with this mime type, fix it up for view code
+ if curr_mail.content_type == 'application/octet-stream'
+ part_file_name = get_part_file_name(curr_mail)
+ part_body = get_part_body(curr_mail)
+ calc_mime = AlaveteliFileTypes.filename_and_content_to_mimetype(part_file_name, part_body)
+ if calc_mime
+ curr_mail.content_type = calc_mime
+ end
+ end
+
+ # Use standard content types for Word documents etc.
+ curr_mail.content_type = normalise_content_type(curr_mail.content_type)
+ if curr_mail.content_type == 'message/rfc822'
+ ensure_parts_counted(parent_mail) # fills in rfc822_attachment variable
+ if curr_mail.rfc822_attachment.nil?
+ # Attached mail didn't parse, so treat as text
+ curr_mail.content_type = 'text/plain'
+ end
+ end
+ if curr_mail.content_type == 'application/vnd.ms-outlook' || curr_mail.content_type == 'application/ms-tnef'
+ ensure_parts_counted(parent_mail) # fills in rfc822_attachment variable
+ if curr_mail.rfc822_attachment.nil?
+ # Attached mail didn't parse, so treat as binary
+ curr_mail.content_type = 'application/octet-stream'
+ end
+ end
+ # If the part is an attachment of email
+ if curr_mail.content_type == 'message/rfc822' || curr_mail.content_type == 'application/vnd.ms-outlook' || curr_mail.content_type == 'application/ms-tnef'
+ ensure_parts_counted(parent_mail) # fills in rfc822_attachment variable
+ leaves_found += _get_attachment_leaves_recursive(curr_mail.rfc822_attachment, parent_mail, curr_mail.rfc822_attachment)
+ else
+ # Store leaf
+ curr_mail.within_rfc822_attachment = within_rfc822_attachment
+ leaves_found += [curr_mail]
+ end
+ # restore original charset
+ curr_mail.charset = charset
+ end
+ return leaves_found
+ end
+
+
+ def address_from_name_and_email(name, email)
+ if !MySociety::Validate.is_valid_email(email)
+ raise "invalid email " + email + " passed to address_from_name_and_email"
+ end
+ if name.nil?
+ return TMail::Address.parse(email).to_s
+ end
+ # Botch an always quoted RFC address, then parse it
+ name = name.gsub(/(["\\])/, "\\\\\\1")
+ TMail::Address.parse('"' + name + '" <' + email + '>').to_s
+ end
+
+ def address_from_string(string)
+ TMail::Address.parse(string).address
+ end
+
+ end
+ end
+end \ No newline at end of file
diff --git a/lib/tmail_extensions.rb b/lib/mail_handler/backends/tmail_extensions.rb
index 6a533e658..3576a8eca 100644
--- a/lib/tmail_extensions.rb
+++ b/lib/mail_handler/backends/tmail_extensions.rb
@@ -15,6 +15,14 @@ require 'tmail/interface'
# These mainly used in app/models/incoming_message.rb
module TMail
class Mail
+ # Monkeypatch! Adding some extra members to store extra info in.
+
+ attr_accessor :url_part_number
+ attr_accessor :rfc822_attachment # when a whole email message is attached as text
+ attr_accessor :within_rfc822_attachment # for parts within a message attached as text (for getting subject mainly)
+ attr_accessor :count_parts_count
+ attr_accessor :count_first_uudecode_count
+
# Monkeypatch! (check to see if this becomes a standard function in
# TMail::Mail, then use that, whatever it is called)
def Mail.get_part_file_name(part)
@@ -68,22 +76,6 @@ module TMail
end
- class Address
- # Monkeypatch! Constructor which makes a TMail::Address given
- # a name and an email
- def Address.address_from_name_and_email(name, email)
- if !MySociety::Validate.is_valid_email(email)
- raise "invalid email " + email + " passed to address_from_name_and_email"
- end
- if name.nil?
- return TMail::Address.parse(email)
- end
- # Botch an always quoted RFC address, then parse it
- name = name.gsub(/(["\\])/, "\\\\\\1")
- return TMail::Address.parse('"' + name + '" <' + email + '>')
- end
- end
-
module TextUtils
# Monkeypatch! Much more aggressive list of characters to cause quoting
# than in normal TMail. e.g. Have found real cases where @ needs quoting.
@@ -95,8 +87,8 @@ module TMail
end
end
-# Monkeypatch! TMail 1.2.7.1 will parse only one address out of a list of addresses with
-# unquoted display parts https://github.com/mikel/tmail/issues#issue/9 - this monkeypatch
+# Monkeypatch! TMail 1.2.7.1 will parse only one address out of a list of addresses with
+# unquoted display parts https://github.com/mikel/tmail/issues#issue/9 - this monkeypatch
# fixes this issue.
module TMail
diff --git a/lib/mail_handler/mail_handler.rb b/lib/mail_handler/mail_handler.rb
new file mode 100644
index 000000000..8b227b9ca
--- /dev/null
+++ b/lib/mail_handler/mail_handler.rb
@@ -0,0 +1,179 @@
+# Handles the parsing of email
+require 'tmpdir'
+
+module MailHandler
+
+ if RUBY_VERSION.to_f >= 1.9
+ require 'mail'
+ require 'backends/mail_extensions'
+ require 'backends/mail_backend'
+ include Backends::MailBackend
+ else
+ require 'action_mailer'
+ require 'backends/tmail_extensions'
+ require 'backends/tmail_backend'
+ include Backends::TmailBackend
+ end
+
+ # Returns a set of attachments from the given TNEF contents
+ # The TNEF contents also contains the message body, but in general this is the
+ # same as the message body in the message proper.
+ def tnef_attachments(content)
+ attachments = []
+ Dir.mktmpdir do |dir|
+ IO.popen("#{`which tnef`.chomp} -K -C #{dir}", "wb") do |f|
+ f.write(content)
+ f.close
+ if $?.signaled?
+ raise IOError, "tnef exited with signal #{$?.termsig}"
+ end
+ if $?.exited? && $?.exitstatus != 0
+ raise IOError, "tnef exited with status #{$?.exitstatus}"
+ end
+ end
+ found = 0
+ Dir.new(dir).sort.each do |file| # sort for deterministic behaviour
+ if file != "." && file != ".."
+ file_content = File.open("#{dir}/#{file}", "rb").read
+ attachments << { :content => file_content,
+ :filename => file }
+ found += 1
+ end
+ end
+ if found == 0
+ raise IOError, "tnef produced no attachments"
+ end
+ end
+ attachments
+ end
+
+ def normalise_content_type(content_type)
+ # e.g. http://www.whatdotheyknow.com/request/93/response/250
+ if content_type == 'application/excel' or content_type == 'application/msexcel' or content_type == 'application/x-ms-excel'
+ content_type = 'application/vnd.ms-excel'
+ end
+ if content_type == 'application/mspowerpoint' or content_type == 'application/x-ms-powerpoint'
+ content_type = 'application/vnd.ms-powerpoint'
+ end
+ if content_type == 'application/msword' or content_type == 'application/x-ms-word'
+ content_type = 'application/vnd.ms-word'
+ end
+ if content_type == 'application/x-zip-compressed'
+ content_type = 'application/zip'
+ end
+
+ # e.g. http://www.whatdotheyknow.com/request/copy_of_current_swessex_scr_opt#incoming-9928
+ if content_type == 'application/acrobat'
+ content_type = 'application/pdf'
+ end
+
+ return content_type
+ end
+
+ def get_attachment_text_one_file(content_type, body, charset = 'utf-8')
+ # note re. charset: TMail always tries to convert email bodies
+ # to UTF8 by default, so normally it should already be that.
+ text = ''
+ # XXX - tell all these command line tools to return utf-8
+ if content_type == 'text/plain'
+ text += body + "\n\n"
+ else
+ tempfile = Tempfile.new('foiextract')
+ tempfile.binmode
+ tempfile.print body
+ tempfile.flush
+ default_params = { :append_to => text, :binary_output => false }
+ if content_type == 'application/vnd.ms-word'
+ AlaveteliExternalCommand.run("wvText", tempfile.path, tempfile.path + ".txt")
+ # Try catdoc if we get into trouble (e.g. for InfoRequestEvent 2701)
+ if not File.exists?(tempfile.path + ".txt")
+ AlaveteliExternalCommand.run("catdoc", tempfile.path, default_params)
+ else
+ text += File.read(tempfile.path + ".txt") + "\n\n"
+ File.unlink(tempfile.path + ".txt")
+ end
+ elsif content_type == 'application/rtf'
+ # catdoc on RTF produces fewer comments and extra bumf than --text option to unrtf
+ AlaveteliExternalCommand.run("catdoc", tempfile.path, default_params)
+ elsif content_type == 'text/html'
+ # lynx wordwraps links in its output, which then don't
+ # get formatted properly by Alaveteli. We use elinks
+ # instead, which doesn't do that.
+ AlaveteliExternalCommand.run("elinks", "-eval", "set document.codepage.assume = \"#{charset}\"",
+ "-eval", "set document.codepage.force_assumed = 1",
+ "-dump-charset", "utf-8",
+ "-force-html", "-dump",
+ tempfile.path,
+ default_params.merge(:env => {"LANG" => "C"}))
+ elsif content_type == 'application/vnd.ms-excel'
+ # Bit crazy using /usr/bin/strings - but xls2csv, xlhtml and
+ # py_xls2txt only extract text from cells, not from floating
+ # notes. catdoc may be fooled by weird character sets, but will
+ # probably do for UK FOI requests.
+ AlaveteliExternalCommand.run("/usr/bin/strings", tempfile.path, default_params)
+ elsif content_type == 'application/vnd.ms-powerpoint'
+ # ppthtml seems to catch more text, but only outputs HTML when
+ # we want text, so just use catppt for now
+ AlaveteliExternalCommand.run("catppt", tempfile.path, default_params)
+ elsif content_type == 'application/pdf'
+ AlaveteliExternalCommand.run("pdftotext", tempfile.path, "-", default_params)
+ elsif content_type == 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
+ # This is Microsoft's XML office document format.
+ # Just pull out the main XML file, and strip it of text.
+ xml = AlaveteliExternalCommand.run("/usr/bin/unzip", "-qq",
+ "-c",
+ tempfile.path,
+ "word/document.xml",
+ {:binary_output => false})
+ if !xml.nil?
+ doc = REXML::Document.new(xml)
+ text += doc.each_element( './/text()' ){}.join(" ")
+ end
+ elsif content_type == 'application/zip'
+ # recurse into zip files
+ begin
+ zip_file = Zip::ZipFile.open(tempfile.path)
+ text += get_attachment_text_from_zip_file(zip_file)
+ zip_file.close()
+ rescue
+ $stderr.puts("Error processing zip file: #{$!.inspect}")
+ end
+ end
+ tempfile.close
+ end
+
+ return text
+ end
+ def get_attachment_text_from_zip_file(zip_file)
+
+ text = ""
+ for entry in zip_file
+ if entry.file?
+ filename = entry.to_s
+ begin
+ body = entry.get_input_stream.read
+ rescue
+ # move to next attachment silently if there were problems
+ # XXX really should reduce this to specific exceptions?
+ # e.g. password protected
+ next
+ end
+ calc_mime = AlaveteliFileTypes.filename_to_mimetype(filename)
+ if calc_mime
+ content_type = calc_mime
+ else
+ content_type = 'application/octet-stream'
+ end
+
+ text += get_attachment_text_one_file(content_type, body)
+
+ end
+ end
+ return text
+ end
+
+ # Turn instance methods into class methods
+ extend self
+
+end
+
diff --git a/lib/tasks/translation.rake b/lib/tasks/translation.rake
index f6611cc80..273c12bfa 100644
--- a/lib/tasks/translation.rake
+++ b/lib/tasks/translation.rake
@@ -4,7 +4,7 @@ namespace :translation do
include Usage
def write_email(email, email_description, output_file)
- mail_object = TMail::Mail.parse(email.to_s)
+ mail_object = MailHandler.mail_from_raw_email(email.to_s, decode=false)
output_file.write("\n")
output_file.write("Description of email: #{email_description}\n")
output_file.write("Subject line: #{mail_object.subject}\n")
@@ -86,7 +86,7 @@ namespace :translation do
'fixtures',
'files',
'incoming-request-plain.email'))
- response_mail = TMail::Mail.parse(content)
+ response_mail = MailHandler.mail_from_raw_email(content, decode=false)
response_mail.from = "authority@example.com"
stopped_responses_email = RequestMailer.create_stopped_responses(info_request,
diff --git a/lib/tnef.rb b/lib/tnef.rb
deleted file mode 100644
index 1c941f8b0..000000000
--- a/lib/tnef.rb
+++ /dev/null
@@ -1,40 +0,0 @@
-require 'tmpdir'
-
-class TNEF
-
- # Extracts all attachments from the given TNEF file as a TMail::Mail object
- # The TNEF file also contains the message body, but in general this is the
- # same as the message body in the message proper.
- def self.as_tmail(content)
- main = TMail::Mail.new
- main.set_content_type 'multipart', 'mixed', { 'boundary' => TMail.new_boundary }
- Dir.mktmpdir do |dir|
- IO.popen("#{`which tnef`.chomp} -K -C #{dir}", "w") do |f|
- f.write(content)
- f.close
- if $?.signaled?
- raise IOError, "tnef exited with signal #{$?.termsig}"
- end
- if $?.exited? && $?.exitstatus != 0
- raise IOError, "tnef exited with status #{$?.exitstatus}"
- end
- end
- found = 0
- Dir.new(dir).sort.each do |file| # sort for deterministic behaviour
- if file != "." && file != ".."
- file_content = File.open("#{dir}/#{file}", "r").read
- attachment = TMail::Mail.new
- attachment['content-location'] = file
- attachment.body = file_content
- main.parts << attachment
- found += 1
- end
- end
- if found == 0
- raise IOError, "tnef produced no attachments"
- end
- end
- main
- end
-
-end
diff --git a/lib/world_foi_websites.rb b/lib/world_foi_websites.rb
index 2ff924713..c3f3655df 100644
--- a/lib/world_foi_websites.rb
+++ b/lib/world_foi_websites.rb
@@ -46,10 +46,10 @@ class WorldFOIWebsites
:country_name => "Chile",
:country_iso_code => "CL",
:url => "http://accesointeligente.org"},
- {:country_name => "Australia",
+ {:name => "Right To Know",
+ :country_name => "Australia",
:country_iso_code => "AU",
- # The Australian site is not yet live. So, not including name & url yet.
- },
+ :url => "http://www.righttoknow.org.au"},
{:name => "Informace pro Vsechny",
:country_name => "Česká republika",
:country_iso_code => "CZ",