aboutsummaryrefslogtreecommitdiffstats
path: root/lib/mail_handler/backends/tmail_backend.rb
diff options
context:
space:
mode:
authorLouise Crow <louise.crow@gmail.com>2012-12-06 17:26:21 +0000
committerLouise Crow <louise.crow@gmail.com>2012-12-06 17:26:21 +0000
commit8aa8e4f23f72a70be3eb87b5b4c93e9bc70f411e (patch)
tree554308dd3fa334b288eba156f8ff3d8217d4fd59 /lib/mail_handler/backends/tmail_backend.rb
parent9735b7c11abe4cdcef473637fb5c92b04d6539fa (diff)
parent24648c4b8f0bfbcbb3cf0d192b28906a9b7e111c (diff)
Merge branch 'feature/rework-mail-handling' into develop
Diffstat (limited to 'lib/mail_handler/backends/tmail_backend.rb')
-rw-r--r--lib/mail_handler/backends/tmail_backend.rb186
1 files changed, 186 insertions, 0 deletions
diff --git a/lib/mail_handler/backends/tmail_backend.rb b/lib/mail_handler/backends/tmail_backend.rb
index 4b7291d00..02124cdb1 100644
--- a/lib/mail_handler/backends/tmail_backend.rb
+++ b/lib/mail_handler/backends/tmail_backend.rb
@@ -83,6 +83,192 @@ module MailHandler
mail.header_string(header)
end
+ # Number the attachments of +mail+ in depth first tree order, for use in URLs.
+ # XXX This fills in part.rfc822_attachment and part.url_part_number within
+ # all the parts of the email (see monkeypatches in lib/mail_handler/tmail_extensions and
+ # lib/mail_handler/mail_extensions for how these attributes are added). ensure_parts_counted
+ # must be called before using the attributes; every call resets the counter and renumbers from 1.
+ def ensure_parts_counted(mail)
+ mail.count_parts_count = 0 # restart the running counter kept on the top-level mail
+ _count_parts_recursive(mail, mail) # the mail is both the first part visited and the counter holder
+ # Record where the MIME numbering stopped: we carry on using these numeric ids for attachments uudecoded from within text parts
+ mail.count_first_uudecode_count = mail.count_parts_count
+ end
+ def _count_parts_recursive(part, mail)
+ if part.multipart?
+ part.parts.each do |p|
+ _count_parts_recursive(p, mail)
+ end
+ else
+ part_filename = get_part_file_name(part)
+ begin
+ if part.content_type == 'message/rfc822'
+ # An email attached as text
+ # e.g. http://www.whatdotheyknow.com/request/64/response/102
+ part.rfc822_attachment = mail_from_raw_email(part.body, decode=false)
+ elsif part.content_type == 'application/vnd.ms-outlook' || part_filename && AlaveteliFileTypes.filename_to_mimetype(part_filename) == 'application/vnd.ms-outlook'
+ # An email attached as an Outlook file
+ # e.g. http://www.whatdotheyknow.com/request/chinese_names_for_british_politi
+ msg = Mapi::Msg.open(StringIO.new(part.body))
+ part.rfc822_attachment = mail_from_raw_email(msg.to_mime.to_s, decode=false)
+ elsif part.content_type == 'application/ms-tnef'
+ # A set of attachments in a TNEF file
+ part.rfc822_attachment = mail_from_tnef(part.body)
+ end
+ rescue
+ # If attached mail doesn't parse, treat it as text part
+ part.rfc822_attachment = nil
+ else
+ unless part.rfc822_attachment.nil?
+ _count_parts_recursive(part.rfc822_attachment, mail)
+ end
+ end
+ if part.rfc822_attachment.nil?
+ mail.count_parts_count += 1
+ part.url_part_number = mail.count_parts_count
+ end
+ end
+ end
+
+ def get_attachment_attributes(mail)
+ leaves = get_attachment_leaves(mail)
+ # XXX we have to call ensure_parts_counted after get_attachment_leaves
+ # which is really messy.
+ ensure_parts_counted(mail)
+ attachment_attributes = []
+ for leaf in leaves
+ body = get_part_body(leaf)
+ # As leaf.body causes MIME decoding which uses lots of RAM, do garbage collection here
+ # to prevent excess memory use. XXX not really sure if this helps reduce
+ # peak RAM use overall. Anyway, maybe there is something better to do than this.
+ GC.start
+ if leaf.within_rfc822_attachment
+ within_rfc822_subject = leaf.within_rfc822_attachment.subject
+ # Test to see if we are in the first part of the attached
+ # RFC822 message and it is text, if so add headers.
+ # XXX should probably use hunting algorithm to find main text part, rather than
+ # just expect it to be first. This will do for now though.
+ if leaf.within_rfc822_attachment == leaf && leaf.content_type == 'text/plain'
+ headers = ""
+ for header in [ 'Date', 'Subject', 'From', 'To', 'Cc' ]
+ if leaf.within_rfc822_attachment.header.include?(header.downcase)
+ header_value = leaf.within_rfc822_attachment.header[header.downcase]
+ if !header_value.blank?
+ headers = headers + header + ": " + header_value.to_s + "\n"
+ end
+ end
+ end
+ # XXX call _convert_part_body_to_text here, but need to get charset somehow
+ # e.g. http://www.whatdotheyknow.com/request/1593/response/3088/attach/4/Freedom%20of%20Information%20request%20-%20car%20oval%20sticker:%20Article%2020,%20Convention%20on%20Road%20Traffic%201949.txt
+ body = headers + "\n" + body
+
+ # This is quick way of getting all headers, but instead we only add some a) to
+ # make it more usable, b) as at least one authority accidentally leaked security
+ # information into a header.
+ #attachment.body = leaf.within_rfc822_attachment.port.to_s
+ end
+ end
+ attachment_attributes << {:url_part_number => leaf.url_part_number,
+ :content_type => get_content_type(leaf),
+ :filename => get_part_file_name(leaf),
+ :charset => leaf.charset,
+ :within_rfc822_subject => within_rfc822_subject,
+ :body => body,
+ :hexdigest => Digest::MD5.hexdigest(body) }
+ end
+ attachment_attributes
+ end
+
+ # (This risks losing info if the unchosen alternative is the only one to contain
+ # useful info, but let's worry about that another time)
+ def get_attachment_leaves(mail)
+ return _get_attachment_leaves_recursive(mail, mail)
+ end
+ def _get_attachment_leaves_recursive(curr_mail, parent_mail, within_rfc822_attachment = nil)
+ leaves_found = []
+ if curr_mail.multipart?
+ if curr_mail.parts.size == 0
+ raise "no parts on multipart mail"
+ end
+
+ if curr_mail.sub_type == 'alternative'
+ # Choose best part from alternatives
+ best_part = nil
+ # Take the last text/plain one, or else the first one
+ curr_mail.parts.each do |m|
+ if not best_part
+ best_part = m
+ elsif m.content_type == 'text/plain'
+ best_part = m
+ end
+ end
+ # Take an HTML one as even higher priority. (They tend
+ # to render better than text/plain, e.g. don't wrap links here:
+ # http://www.whatdotheyknow.com/request/amount_and_cost_of_freedom_of_in#incoming-72238 )
+ curr_mail.parts.each do |m|
+ if m.content_type == 'text/html'
+ best_part = m
+ end
+ end
+ leaves_found += _get_attachment_leaves_recursive(best_part, parent_mail, within_rfc822_attachment)
+ else
+ # Add all parts
+ curr_mail.parts.each do |m|
+ leaves_found += _get_attachment_leaves_recursive(m, parent_mail, within_rfc822_attachment)
+ end
+ end
+ else
+ # XXX Yuck. this section alters various content_types. That puts
+ # it into conflict with ensure_parts_counted which it has to be
+ # called both before and after. It will fail with cases of
+ # attachments of attachments etc.
+ charset = curr_mail.charset # save this, because overwriting content_type also resets charset
+ # Don't allow nil content_types
+ if curr_mail.content_type.nil?
+ curr_mail.content_type = 'application/octet-stream'
+ end
+ # PDFs often come with this mime type, fix it up for view code
+ if curr_mail.content_type == 'application/octet-stream'
+ part_file_name = get_part_file_name(curr_mail)
+ part_body = get_part_body(curr_mail)
+ calc_mime = AlaveteliFileTypes.filename_and_content_to_mimetype(part_file_name, part_body)
+ if calc_mime
+ curr_mail.content_type = calc_mime
+ end
+ end
+
+ # Use standard content types for Word documents etc.
+ curr_mail.content_type = normalise_content_type(curr_mail.content_type)
+ if curr_mail.content_type == 'message/rfc822'
+ ensure_parts_counted(parent_mail) # fills in rfc822_attachment variable
+ if curr_mail.rfc822_attachment.nil?
+ # Attached mail didn't parse, so treat as text
+ curr_mail.content_type = 'text/plain'
+ end
+ end
+ if curr_mail.content_type == 'application/vnd.ms-outlook' || curr_mail.content_type == 'application/ms-tnef'
+ ensure_parts_counted(parent_mail) # fills in rfc822_attachment variable
+ if curr_mail.rfc822_attachment.nil?
+ # Attached mail didn't parse, so treat as binary
+ curr_mail.content_type = 'application/octet-stream'
+ end
+ end
+ # If the part is an attachment of email
+ if curr_mail.content_type == 'message/rfc822' || curr_mail.content_type == 'application/vnd.ms-outlook' || curr_mail.content_type == 'application/ms-tnef'
+ ensure_parts_counted(parent_mail) # fills in rfc822_attachment variable
+ leaves_found += _get_attachment_leaves_recursive(curr_mail.rfc822_attachment, parent_mail, curr_mail.rfc822_attachment)
+ else
+ # Store leaf
+ curr_mail.within_rfc822_attachment = within_rfc822_attachment
+ leaves_found += [curr_mail]
+ end
+ # restore original charset
+ curr_mail.charset = charset
+ end
+ return leaves_found
+ end
+
+
def address_from_name_and_email(name, email)
if !MySociety::Validate.is_valid_email(email)
raise "invalid email " + email + " passed to address_from_name_and_email"