diff options
Diffstat (limited to 'app/models/incoming_message.rb')
-rw-r--r-- | app/models/incoming_message.rb | 177 |
1 files changed, 132 insertions, 45 deletions
diff --git a/app/models/incoming_message.rb b/app/models/incoming_message.rb index 2348c17b5..316f2683a 100644 --- a/app/models/incoming_message.rb +++ b/app/models/incoming_message.rb @@ -29,6 +29,8 @@ require 'htmlentities' require 'rexml/document' require 'zip/zip' require 'mahoro' +require 'mapi/msg' +require 'mapi/convert' # Monkeypatch! Adding some extra members to store extra info in. module TMail @@ -51,6 +53,9 @@ $file_extension_to_mime_type = { "xlsx" => 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', "ppt" => 'application/vnd.ms-powerpoint', "pptx" => 'application/vnd.openxmlformats-officedocument.presentationml.presentation', + "oft" => 'application/vnd.ms-outlook', + "msg" => 'application/vnd.ms-outlook', + "tnef" => 'application/ms-tnef', "tif" => 'image/tiff', "gif" => 'image/gif', "jpg" => 'image/jpeg', # XXX add jpeg @@ -303,10 +308,54 @@ class FOIAttachment end end + # Whether this type has a "View as HTML" + def has_body_as_html? + if self.content_type == 'text/plain' + return true + elsif self.content_type == 'application/vnd.ms-word' + return true + elsif self.content_type == 'application/vnd.ms-excel' + return true + elsif self.content_type == 'application/pdf' + return true + elsif self.content_type == 'application/rtf' + return true + end + return false + end + + # Name of type of attachment type - only valid for things that has_body_as_html? + def name_of_content_type + if self.content_type == 'text/plain' + return "Text file" + elsif self.content_type == 'application/vnd.ms-word' + return "Word document" + elsif self.content_type == 'application/vnd.ms-excel' + return "Excel spreadsheet" + elsif self.content_type == 'application/pdf' + return "PDF file" + elsif self.content_type == 'application/rtf' + return "RTF file" + end + end + # For "View as HTML" of attachment def body_as_html(dir) html = nil + wrapper_id = "wrapper" + + # simple cases, can never fail + if self.content_type == 'text/plain' + text = self.body.strip + text = CGI.escapeHTML(text) + text = MySociety::Format.make_clickable(text) + html = text.gsub(/\n/, '<br>') + return "<html><head></head><body>" + html + "</body></html>", wrapper_id + end + # the extractions will also produce image files, which go in the + # current directory, so change to the directory the function caller + # wants everything in Dir.chdir(dir) do tempfile = Tempfile.new('foiextract', '.') tempfile.print self.body @@ -317,10 +366,22 @@ class FOIAttachment system("/usr/bin/wvHtml --charset=UTF-8 " + tempfile.path + " " + tempfile.path + ".html") html = File.read(tempfile.path + ".html") File.unlink(tempfile.path + ".html") + elsif self.content_type == 'application/vnd.ms-excel' + # Don't colorise, e.g. otherwise this one comes out with white + # text which is nasty: + # http://www.whatdotheyknow.com/request/30485/response/74705/attach/html/2/Empty%20premises%20Sefton.xls.html + IO.popen("/usr/bin/xlhtml -nc -a " + tempfile.path + "", "r") do |child| + html = child.read() + wrapper_id = "wrapper_xlhtml" + end elsif self.content_type == 'application/pdf' IO.popen("/usr/bin/pdftohtml -nodrm -zoom 1.0 -stdout -enc UTF-8 -noframes " + tempfile.path + "", "r") do |child| html = child.read() end + elsif self.content_type == 'application/rtf' + IO.popen("/usr/bin/unrtf --html " + tempfile.path + "", "r") do |child| + html = child.read() + end else raise "No HTML conversion available for type " + self.content_type end @@ -341,30 +402,12 @@ class FOIAttachment body_without_tags = body.gsub(/\s+/,"").gsub(/\<[^\>]*\>/, "") contains_images = html.match(/<img/mi) ? true : false if !$?.success? || html.size == 0 || (body_without_tags.size == 0 && !contains_images) - return "<html><head></head><body><p>Sorry, the conversion to HTML failed. Please use the download link at the top right.</p></body></html>" + return "<html><head></head><body><p>Sorry, the conversion to HTML failed. Please use the download link at the top right.</p></body></html>", wrapper_id end - return html - end - - # Whether this type has a "View as HTML" - def has_body_as_html? - if self.content_type == 'application/vnd.ms-word' - return true - elsif self.content_type == 'application/pdf' - return true - end - return false + return html, wrapper_id end - # Name of type of attachment type - only valid for things that has_body_as_html? - def name_of_content_type - if self.content_type == 'application/vnd.ms-word' - return "Word document" - elsif self.content_type == 'application/pdf' - return "PDF file" - end - end end class IncomingMessage < ActiveRecord::Base @@ -419,20 +462,30 @@ class IncomingMessage < ActiveRecord::Base _count_parts_recursive(p) end else - if part.content_type == 'message/rfc822' - # An email attached as text - # e.g. http://www.whatdotheyknow.com/request/64/response/102 - begin + part_filename = TMail::Mail.get_part_file_name(part) + begin + if part.content_type == 'message/rfc822' + # An email attached as text + # e.g. http://www.whatdotheyknow.com/request/64/response/102 part.rfc822_attachment = TMail::Mail.parse(part.body) - rescue - # If attached mail doesn't parse, treat it as text part - part.rfc822_attachment = nil - @count_parts_count += 1 - part.url_part_number = @count_parts_count - else - _count_parts_recursive(part.rfc822_attachment) + elsif part.content_type == 'application/vnd.ms-outlook' || part_filename && filename_to_mimetype(part_filename) == 'application/vnd.ms-outlook' + # An email attached as an Outlook file + # e.g. http://www.whatdotheyknow.com/request/chinese_names_for_british_politi + msg = Mapi::Msg.open(StringIO.new(part.body)) + part.rfc822_attachment = TMail::Mail.parse(msg.to_mime.to_s) + elsif part.content_type == 'application/ms-tnef' + # A set of attachments in a TNEF file + part.rfc822_attachment = TNEF.as_tmail(part.body) end + rescue + # If attached mail doesn't parse, treat it as text part + part.rfc822_attachment = nil else + unless part.rfc822_attachment.nil? + _count_parts_recursive(part.rfc822_attachment) + end + end + if part.rfc822_attachment.nil? @count_parts_count += 1 part.url_part_number = @count_parts_count end @@ -486,7 +539,7 @@ class IncomingMessage < ActiveRecord::Base uncompressed_text = child.read() end # if we managed to uncompress the PDF... - if !uncompressed_text.nil? + if !uncompressed_text.nil? && !uncompressed_text.empty? # then censor stuff (making a copy so can compare again in a bit) censored_uncompressed_text = uncompressed_text.dup self._binary_mask_stuff_internal!(censored_uncompressed_text) @@ -499,7 +552,7 @@ class IncomingMessage < ActiveRecord::Base child.close_write() recompressed_text = child.read() end - if !recompressed_text.nil? + if !recompressed_text.nil? && !recompressed_text.empty? text[0..-1] = recompressed_text # [0..-1] makes it change the 'text' string in place end end @@ -707,15 +760,22 @@ class IncomingMessage < ActiveRecord::Base if curr_mail.sub_type == 'alternative' # Choose best part from alternatives best_part = nil + # Take the last text/plain one, or else the first one curr_mail.parts.each do |m| - # Take the first one, or the last text/plain one - # XXX - could do better! if not best_part best_part = m elsif m.content_type == 'text/plain' best_part = m end end + # Take an HTML one as even higher priority. (They tend + # to render better than text/plain, e.g. don't wrap links here: + # http://www.whatdotheyknow.com/request/amount_and_cost_of_freedom_of_in#incoming-72238 ) + curr_mail.parts.each do |m| + if m.content_type == 'text/html' + best_part = m + end + end leaves_found += _get_attachment_leaves_recursive(best_part, within_rfc822_attachment) else # Add all parts @@ -724,6 +784,11 @@ class IncomingMessage < ActiveRecord::Base end end else + # XXX Yuck. this section alters various content_type's. That puts + # it into conflict with ensure_parts_counted which it has to be + # called both before and after. It will fail with cases of + # attachments of attachments etc. + # Don't allow nil content_types if curr_mail.content_type.nil? curr_mail.content_type = 'application/octet-stream' @@ -746,9 +811,16 @@ class IncomingMessage < ActiveRecord::Base curr_mail.content_type = 'text/plain' end end + if curr_mail.content_type == 'application/vnd.ms-outlook' || curr_mail.content_type == 'application/ms-tnef' + ensure_parts_counted # fills in rfc822_attachment variable + if curr_mail.rfc822_attachment.nil? + # Attached mail didn't parse, so treat as binary + curr_mail.content_type = 'application/octet-stream' + end + end - # If the part is an attachment of email in text form - if curr_mail.content_type == 'message/rfc822' + # If the part is an attachment of email + if curr_mail.content_type == 'message/rfc822' || curr_mail.content_type == 'application/vnd.ms-outlook' || curr_mail.content_type == 'application/ms-tnef' ensure_parts_counted # fills in rfc822_attachment variable leaves_found += _get_attachment_leaves_recursive(curr_mail.rfc822_attachment, curr_mail.rfc822_attachment) else @@ -863,9 +935,13 @@ class IncomingMessage < ActiveRecord::Base def get_main_body_text_part leaves = get_attachment_leaves - # Find first part which is text/plain + # Find first part which is text/plain or text/html + # (We have to include HTML, as increasingly there are mail clients that + # include no text alternative for the main part, and we don't want to + # instead use the first text attachment + # e.g. http://www.whatdotheyknow.com/request/list_of_public_authorties) leaves.each do |p| - if p.content_type == 'text/plain' + if p.content_type == 'text/plain' or p.content_type == 'text/html' return p end end @@ -898,7 +974,7 @@ class IncomingMessage < ActiveRecord::Base # e.g. for https://secure.mysociety.org/admin/foi/request/show_raw_email/24550 main_part = get_main_body_text_part if main_part.nil? - return + return [] end text = main_part.body @@ -936,10 +1012,13 @@ class IncomingMessage < ActiveRecord::Base # Returns all attachments for use in display code # XXX is this called multiple times and should be cached? def get_attachments_for_display - ensure_parts_counted - main_part = get_main_body_text_part leaves = get_attachment_leaves + + # XXX we have to call ensure_parts_counted after get_attachment_leaves + # which is really messy. + ensure_parts_counted + attachments = [] for leaf in leaves if leaf != main_part @@ -965,7 +1044,12 @@ class IncomingMessage < ActiveRecord::Base headers = "" for header in [ 'Date', 'Subject', 'From', 'To', 'Cc' ] if leaf.within_rfc822_attachment.header.include?(header.downcase) - headers = headers + header + ": " + leaf.within_rfc822_attachment.header[header.downcase].to_s + "\n" + header_value = leaf.within_rfc822_attachment.header[header.downcase] + # Example message which has a blank Date header: + # http://www.whatdotheyknow.com/request/30747/response/80253/attach/html/17/Common%20Purpose%20Advisory%20Group%20Meeting%20Tuesday%202nd%20March.txt.html + if !header_value.blank? + headers = headers + header + ": " + header_value.to_s + "\n" + end end end # XXX call _convert_part_body_to_text here, but need to get charset somehow @@ -1099,11 +1183,14 @@ class IncomingMessage < ActiveRecord::Base File.unlink(tempfile.path + ".txt") end elsif content_type == 'application/rtf' + # catdoc on RTF prodcues less comments and extra bumf than --text option to unrtf IO.popen("/usr/bin/catdoc " + tempfile.path, "r") do |child| text += child.read() + "\n\n" end elsif content_type == 'text/html' - IO.popen("/usr/bin/lynx -display_charset=UTF-8 -force_html -dump " + tempfile.path, "r") do |child| + # lynx wordwraps links in its output, which then don't get formatted properly + # by WhatDoTheyKnow. We use elinks instead, which doesn't do that. + IO.popen("/usr/bin/elinks -dump-charset utf-8 -force-html -dump " + tempfile.path, "r") do |child| text += child.read() + "\n\n" end elsif content_type == 'application/vnd.ms-excel' @@ -1273,7 +1360,7 @@ class IncomingMessage < ActiveRecord::Base prefix = email prefix =~ /^(.*)@/ prefix = $1 - if !prefix.nil? && prefix.downcase.match(/^(postmaster|mailer-daemon|auto_reply|donotreply)$/) + if !prefix.nil? && prefix.downcase.match(/^(postmaster|mailer-daemon|auto_reply|donotreply|no-reply)$/) return false end |