aboutsummaryrefslogtreecommitdiffstats
path: root/app/models/incoming_message.rb
diff options
context:
space:
mode:
Diffstat (limited to 'app/models/incoming_message.rb')
-rw-r--r--app/models/incoming_message.rb177
1 files changed, 132 insertions, 45 deletions
diff --git a/app/models/incoming_message.rb b/app/models/incoming_message.rb
index 2348c17b5..316f2683a 100644
--- a/app/models/incoming_message.rb
+++ b/app/models/incoming_message.rb
@@ -29,6 +29,8 @@ require 'htmlentities'
require 'rexml/document'
require 'zip/zip'
require 'mahoro'
+require 'mapi/msg'
+require 'mapi/convert'
# Monkeypatch! Adding some extra members to store extra info in.
module TMail
@@ -51,6 +53,9 @@ $file_extension_to_mime_type = {
"xlsx" => 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
"ppt" => 'application/vnd.ms-powerpoint',
"pptx" => 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
+ "oft" => 'application/vnd.ms-outlook',
+ "msg" => 'application/vnd.ms-outlook',
+ "tnef" => 'application/ms-tnef',
"tif" => 'image/tiff',
"gif" => 'image/gif',
"jpg" => 'image/jpeg', # XXX add jpeg
@@ -303,10 +308,54 @@ class FOIAttachment
end
end
+ # Whether this type has a "View as HTML"
+ def has_body_as_html?
+ if self.content_type == 'text/plain'
+ return true
+ elsif self.content_type == 'application/vnd.ms-word'
+ return true
+ elsif self.content_type == 'application/vnd.ms-excel'
+ return true
+ elsif self.content_type == 'application/pdf'
+ return true
+ elsif self.content_type == 'application/rtf'
+ return true
+ end
+ return false
+ end
+
+ # Name of type of attachment type - only valid for things that has_body_as_html?
+ def name_of_content_type
+ if self.content_type == 'text/plain'
+ return "Text file"
+ elsif self.content_type == 'application/vnd.ms-word'
+ return "Word document"
+ elsif self.content_type == 'application/vnd.ms-excel'
+ return "Excel spreadsheet"
+ elsif self.content_type == 'application/pdf'
+ return "PDF file"
+ elsif self.content_type == 'application/rtf'
+ return "RTF file"
+ end
+ end
+
# For "View as HTML" of attachment
def body_as_html(dir)
html = nil
+ wrapper_id = "wrapper"
+
+ # simple cases, can never fail
+ if self.content_type == 'text/plain'
+ text = self.body.strip
+ text = CGI.escapeHTML(text)
+ text = MySociety::Format.make_clickable(text)
+ html = text.gsub(/\n/, '<br>')
+ return "<html><head></head><body>" + html + "</body></html>", wrapper_id
+ end
+ # the extractions will also produce image files, which go in the
+ # current directory, so change to the directory the function caller
+ # wants everything in
Dir.chdir(dir) do
tempfile = Tempfile.new('foiextract', '.')
tempfile.print self.body
@@ -317,10 +366,22 @@ class FOIAttachment
system("/usr/bin/wvHtml --charset=UTF-8 " + tempfile.path + " " + tempfile.path + ".html")
html = File.read(tempfile.path + ".html")
File.unlink(tempfile.path + ".html")
+ elsif self.content_type == 'application/vnd.ms-excel'
+ # Don't colorise, e.g. otherwise this one comes out with white
+ # text which is nasty:
+ # http://www.whatdotheyknow.com/request/30485/response/74705/attach/html/2/Empty%20premises%20Sefton.xls.html
+ IO.popen("/usr/bin/xlhtml -nc -a " + tempfile.path + "", "r") do |child|
+ html = child.read()
+ wrapper_id = "wrapper_xlhtml"
+ end
elsif self.content_type == 'application/pdf'
IO.popen("/usr/bin/pdftohtml -nodrm -zoom 1.0 -stdout -enc UTF-8 -noframes " + tempfile.path + "", "r") do |child|
html = child.read()
end
+ elsif self.content_type == 'application/rtf'
+ IO.popen("/usr/bin/unrtf --html " + tempfile.path + "", "r") do |child|
+ html = child.read()
+ end
else
raise "No HTML conversion available for type " + self.content_type
end
@@ -341,30 +402,12 @@ class FOIAttachment
body_without_tags = body.gsub(/\s+/,"").gsub(/\<[^\>]*\>/, "")
contains_images = html.match(/<img/mi) ? true : false
if !$?.success? || html.size == 0 || (body_without_tags.size == 0 && !contains_images)
- return "<html><head></head><body><p>Sorry, the conversion to HTML failed. Please use the download link at the top right.</p></body></html>"
+ return "<html><head></head><body><p>Sorry, the conversion to HTML failed. Please use the download link at the top right.</p></body></html>", wrapper_id
end
- return html
- end
-
- # Whether this type has a "View as HTML"
- def has_body_as_html?
- if self.content_type == 'application/vnd.ms-word'
- return true
- elsif self.content_type == 'application/pdf'
- return true
- end
- return false
+ return html, wrapper_id
end
- # Name of type of attachment type - only valid for things that has_body_as_html?
- def name_of_content_type
- if self.content_type == 'application/vnd.ms-word'
- return "Word document"
- elsif self.content_type == 'application/pdf'
- return "PDF file"
- end
- end
end
class IncomingMessage < ActiveRecord::Base
@@ -419,20 +462,30 @@ class IncomingMessage < ActiveRecord::Base
_count_parts_recursive(p)
end
else
- if part.content_type == 'message/rfc822'
- # An email attached as text
- # e.g. http://www.whatdotheyknow.com/request/64/response/102
- begin
+ part_filename = TMail::Mail.get_part_file_name(part)
+ begin
+ if part.content_type == 'message/rfc822'
+ # An email attached as text
+ # e.g. http://www.whatdotheyknow.com/request/64/response/102
part.rfc822_attachment = TMail::Mail.parse(part.body)
- rescue
- # If attached mail doesn't parse, treat it as text part
- part.rfc822_attachment = nil
- @count_parts_count += 1
- part.url_part_number = @count_parts_count
- else
- _count_parts_recursive(part.rfc822_attachment)
+ elsif part.content_type == 'application/vnd.ms-outlook' || part_filename && filename_to_mimetype(part_filename) == 'application/vnd.ms-outlook'
+ # An email attached as an Outlook file
+ # e.g. http://www.whatdotheyknow.com/request/chinese_names_for_british_politi
+ msg = Mapi::Msg.open(StringIO.new(part.body))
+ part.rfc822_attachment = TMail::Mail.parse(msg.to_mime.to_s)
+ elsif part.content_type == 'application/ms-tnef'
+ # A set of attachments in a TNEF file
+ part.rfc822_attachment = TNEF.as_tmail(part.body)
end
+ rescue
+ # If attached mail doesn't parse, treat it as text part
+ part.rfc822_attachment = nil
else
+ unless part.rfc822_attachment.nil?
+ _count_parts_recursive(part.rfc822_attachment)
+ end
+ end
+ if part.rfc822_attachment.nil?
@count_parts_count += 1
part.url_part_number = @count_parts_count
end
@@ -486,7 +539,7 @@ class IncomingMessage < ActiveRecord::Base
uncompressed_text = child.read()
end
# if we managed to uncompress the PDF...
- if !uncompressed_text.nil?
+ if !uncompressed_text.nil? && !uncompressed_text.empty?
# then censor stuff (making a copy so can compare again in a bit)
censored_uncompressed_text = uncompressed_text.dup
self._binary_mask_stuff_internal!(censored_uncompressed_text)
@@ -499,7 +552,7 @@ class IncomingMessage < ActiveRecord::Base
child.close_write()
recompressed_text = child.read()
end
- if !recompressed_text.nil?
+ if !recompressed_text.nil? && !recompressed_text.empty?
text[0..-1] = recompressed_text # [0..-1] makes it change the 'text' string in place
end
end
@@ -707,15 +760,22 @@ class IncomingMessage < ActiveRecord::Base
if curr_mail.sub_type == 'alternative'
# Choose best part from alternatives
best_part = nil
+ # Take the last text/plain one, or else the first one
curr_mail.parts.each do |m|
- # Take the first one, or the last text/plain one
- # XXX - could do better!
if not best_part
best_part = m
elsif m.content_type == 'text/plain'
best_part = m
end
end
+ # Take an HTML one as even higher priority. (They tend
+ # to render better than text/plain, e.g. don't wrap links here:
+ # http://www.whatdotheyknow.com/request/amount_and_cost_of_freedom_of_in#incoming-72238 )
+ curr_mail.parts.each do |m|
+ if m.content_type == 'text/html'
+ best_part = m
+ end
+ end
leaves_found += _get_attachment_leaves_recursive(best_part, within_rfc822_attachment)
else
# Add all parts
@@ -724,6 +784,11 @@ class IncomingMessage < ActiveRecord::Base
end
end
else
+ # XXX Yuck. this section alters various content_type's. That puts
+ # it into conflict with ensure_parts_counted which it has to be
+ # called both before and after. It will fail with cases of
+ # attachments of attachments etc.
+
# Don't allow nil content_types
if curr_mail.content_type.nil?
curr_mail.content_type = 'application/octet-stream'
@@ -746,9 +811,16 @@ class IncomingMessage < ActiveRecord::Base
curr_mail.content_type = 'text/plain'
end
end
+ if curr_mail.content_type == 'application/vnd.ms-outlook' || curr_mail.content_type == 'application/ms-tnef'
+ ensure_parts_counted # fills in rfc822_attachment variable
+ if curr_mail.rfc822_attachment.nil?
+ # Attached mail didn't parse, so treat as binary
+ curr_mail.content_type = 'application/octet-stream'
+ end
+ end
- # If the part is an attachment of email in text form
- if curr_mail.content_type == 'message/rfc822'
+ # If the part is an attachment of email
+ if curr_mail.content_type == 'message/rfc822' || curr_mail.content_type == 'application/vnd.ms-outlook' || curr_mail.content_type == 'application/ms-tnef'
ensure_parts_counted # fills in rfc822_attachment variable
leaves_found += _get_attachment_leaves_recursive(curr_mail.rfc822_attachment, curr_mail.rfc822_attachment)
else
@@ -863,9 +935,13 @@ class IncomingMessage < ActiveRecord::Base
def get_main_body_text_part
leaves = get_attachment_leaves
- # Find first part which is text/plain
+ # Find first part which is text/plain or text/html
+ # (We have to include HTML, as increasingly there are mail clients that
+ # include no text alternative for the main part, and we don't want to
+ # instead use the first text attachment
+ # e.g. http://www.whatdotheyknow.com/request/list_of_public_authorties)
leaves.each do |p|
- if p.content_type == 'text/plain'
+ if p.content_type == 'text/plain' or p.content_type == 'text/html'
return p
end
end
@@ -898,7 +974,7 @@ class IncomingMessage < ActiveRecord::Base
# e.g. for https://secure.mysociety.org/admin/foi/request/show_raw_email/24550
main_part = get_main_body_text_part
if main_part.nil?
- return
+ return []
end
text = main_part.body
@@ -936,10 +1012,13 @@ class IncomingMessage < ActiveRecord::Base
# Returns all attachments for use in display code
# XXX is this called multiple times and should be cached?
def get_attachments_for_display
- ensure_parts_counted
-
main_part = get_main_body_text_part
leaves = get_attachment_leaves
+
+ # XXX we have to call ensure_parts_counted after get_attachment_leaves
+ # which is really messy.
+ ensure_parts_counted
+
attachments = []
for leaf in leaves
if leaf != main_part
@@ -965,7 +1044,12 @@ class IncomingMessage < ActiveRecord::Base
headers = ""
for header in [ 'Date', 'Subject', 'From', 'To', 'Cc' ]
if leaf.within_rfc822_attachment.header.include?(header.downcase)
- headers = headers + header + ": " + leaf.within_rfc822_attachment.header[header.downcase].to_s + "\n"
+ header_value = leaf.within_rfc822_attachment.header[header.downcase]
+ # Example message which has a blank Date header:
+ # http://www.whatdotheyknow.com/request/30747/response/80253/attach/html/17/Common%20Purpose%20Advisory%20Group%20Meeting%20Tuesday%202nd%20March.txt.html
+ if !header_value.blank?
+ headers = headers + header + ": " + header_value.to_s + "\n"
+ end
end
end
# XXX call _convert_part_body_to_text here, but need to get charset somehow
@@ -1099,11 +1183,14 @@ class IncomingMessage < ActiveRecord::Base
File.unlink(tempfile.path + ".txt")
end
elsif content_type == 'application/rtf'
+ # catdoc on RTF prodcues less comments and extra bumf than --text option to unrtf
IO.popen("/usr/bin/catdoc " + tempfile.path, "r") do |child|
text += child.read() + "\n\n"
end
elsif content_type == 'text/html'
- IO.popen("/usr/bin/lynx -display_charset=UTF-8 -force_html -dump " + tempfile.path, "r") do |child|
+ # lynx wordwraps links in its output, which then don't get formatted properly
+ # by WhatDoTheyKnow. We use elinks instead, which doesn't do that.
+ IO.popen("/usr/bin/elinks -dump-charset utf-8 -force-html -dump " + tempfile.path, "r") do |child|
text += child.read() + "\n\n"
end
elsif content_type == 'application/vnd.ms-excel'
@@ -1273,7 +1360,7 @@ class IncomingMessage < ActiveRecord::Base
prefix = email
prefix =~ /^(.*)@/
prefix = $1
- if !prefix.nil? && prefix.downcase.match(/^(postmaster|mailer-daemon|auto_reply|donotreply)$/)
+ if !prefix.nil? && prefix.downcase.match(/^(postmaster|mailer-daemon|auto_reply|donotreply|no-reply)$/)
return false
end