about | summary | refs | log | tree | commit | diff | stats
path: root/app/models/incoming_message.rb
diff options
context:
space:
mode:
Diffstat (limited to 'app/models/incoming_message.rb')
-rw-r--r-- app/models/incoming_message.rb 384
1 files changed, 146 insertions, 238 deletions
diff --git a/app/models/incoming_message.rb b/app/models/incoming_message.rb
index c808dc6a1..4d3c08df3 100644
--- a/app/models/incoming_message.rb
+++ b/app/models/incoming_message.rb
@@ -28,10 +28,11 @@
# Move some of the (e.g. quoting) functions here into rblib, as they feel
# general not specific to IncomingMessage.
+require 'alaveteli_file_types'
+require 'external_command'
require 'htmlentities'
require 'rexml/document'
require 'zip/zip'
-require 'mahoro'
require 'mapi/msg'
require 'mapi/convert'
@@ -44,128 +45,17 @@ module TMail
end
end
-# To add an image, create a file with appropriate name corresponding to the
-# mime type in public/images e.g. icon_image_tiff_large.png
-$file_extension_to_mime_type = {
- "txt" => 'text/plain',
- "pdf" => 'application/pdf',
- "rtf" => 'application/rtf',
- "doc" => 'application/vnd.ms-word',
- "docx" => 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
- "xls" => 'application/vnd.ms-excel',
- "xlsx" => 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
- "ppt" => 'application/vnd.ms-powerpoint',
- "pptx" => 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
- "oft" => 'application/vnd.ms-outlook',
- "msg" => 'application/vnd.ms-outlook',
- "tnef" => 'application/ms-tnef',
- "tif" => 'image/tiff',
- "gif" => 'image/gif',
- "jpg" => 'image/jpeg', # XXX add jpeg
- "png" => 'image/png',
- "bmp" => 'image/bmp',
- "html" => 'text/html', # XXX add htm
- "vcf" => 'text/x-vcard',
- "zip" => 'application/zip',
- "delivery-status" => 'message/delivery-status'
-}
-# XXX doesn't have way of choosing default for inverse map - might want to add
-# one when you need it
-$file_extension_to_mime_type_rev = $file_extension_to_mime_type.invert
-
-# See binary_mask_stuff function below. It just test for inclusion
-# in this hash, not the value of the right hand side.
-$do_not_binary_mask = {
- 'image/tiff' => 1,
- 'image/gif' => 1,
- 'image/jpeg' => 1,
- 'image/png' => 1,
- 'image/bmp' => 1,
- 'application/zip' => 1,
-}
-
-# Given file name and its content, return most likely type
-def filename_and_content_to_mimetype(filename, content)
- # Try filename
- ret = filename_to_mimetype(filename)
- if !ret.nil?
- return ret
- end
-
- # Otherwise look inside the file to work out the type.
- # Mahoro is a Ruby binding for libmagic.
- m = Mahoro.new(Mahoro::MIME)
- mahoro_type = m.buffer(content)
- mahoro_type.strip!
- #STDERR.puts("mahoro", mahoro_type, "xxxok")
- # XXX we shouldn't have to check empty? here, but Mahoro sometimes returns a blank line :(
- # e.g. for InfoRequestEvent 17930
- if mahoro_type.nil? || mahoro_type.empty?
- return nil
- end
- # text/plain types sometimes come with a charset
- mahoro_type.match(/^(.*);/)
- if $1
- mahoro_type = $1
- end
- # see if looks like a content type, or has something in it that does
- # and return that
- # mahoro returns junk "\012- application/msword" as mime type.
- mahoro_type.match(/([a-z0-9.-]+\/[a-z0-9.-]+)/)
- if $1
- return $1
- end
- # otherwise we got junk back from mahoro
- return nil
-end
-
-# XXX clearly this shouldn't be a global function, or the above global vars.
-def filename_to_mimetype(filename)
- if !filename
- return nil
- end
- if filename.match(/\.([^.]+)$/i)
- lext = $1.downcase
- if $file_extension_to_mime_type.include?(lext)
- return $file_extension_to_mime_type[lext]
- end
- end
- return nil
-end
-
-def mimetype_to_extension(mime)
- if $file_extension_to_mime_type_rev.include?(mime)
- return $file_extension_to_mime_type_rev[mime]
- end
- return nil
-end
-
-def normalise_content_type(content_type)
- # e.g. http://www.whatdotheyknow.com/request/93/response/250
- if content_type == 'application/excel' or content_type == 'application/msexcel' or content_type == 'application/x-ms-excel'
- content_type = 'application/vnd.ms-excel'
- end
- if content_type == 'application/mspowerpoint' or content_type == 'application/x-ms-powerpoint'
- content_type = 'application/vnd.ms-powerpoint'
- end
- if content_type == 'application/msword' or content_type == 'application/x-ms-word'
- content_type = 'application/vnd.ms-word'
- end
- if content_type == 'application/x-zip-compressed'
- content_type = 'application/zip'
- end
-
- # e.g. http://www.whatdotheyknow.com/request/copy_of_current_swessex_scr_opt#incoming-9928
- if content_type == 'application/acrobat'
- content_type = 'application/pdf'
- end
-
- return content_type
-end
+# This is the type which is used to send data about attachments to the view
+class FOIAttachment
+ attr_accessor :body
+ attr_accessor :content_type
+ attr_accessor :filename
+ attr_accessor :url_part_number
+ attr_accessor :within_rfc822_subject # we use the subject as the filename for email attachments
-# List of DSN codes taken from RFC 3463
-# http://tools.ietf.org/html/rfc3463
-$dsn_to_message = {
+ # List of DSN codes taken from RFC 3463
+ # http://tools.ietf.org/html/rfc3463
+ DsnToMessage = {
'X.1.0' => 'Other address status',
'X.1.1' => 'Bad destination mailbox address',
'X.1.2' => 'Bad destination system address',
@@ -213,15 +103,7 @@ $dsn_to_message = {
'X.7.5' => 'Cryptographic failure',
'X.7.6' => 'Cryptographic algorithm not supported',
'X.7.7' => 'Message integrity failure'
-}
-
-# This is the type which is used to send data about attachments to the view
-class FOIAttachment
- attr_accessor :body
- attr_accessor :content_type
- attr_accessor :filename
- attr_accessor :url_part_number
- attr_accessor :within_rfc822_subject # we use the subject as the filename for email attachments
+ }
# Returns HTML, of extra comment to put by attachment
def extra_note
@@ -235,8 +117,8 @@ class FOIAttachment
dsn_part = 'X.' + $2
dsn_message = ""
- if $dsn_to_message.include?(dsn_part)
- dsn_message = " (" + $dsn_to_message[dsn_part] + ")"
+ if DsnToMessage.include?(dsn_part)
+ dsn_message = " (" + DsnToMessage[dsn_part] + ")"
end
return "<br><em>DSN: " + dsn + dsn_message + "</em>"
@@ -279,7 +161,7 @@ class FOIAttachment
end
def _internal_display_filename
- calc_ext = mimetype_to_extension(@content_type)
+ calc_ext = AlaveteliFileTypes.mimetype_to_extension(@content_type)
if @filename
# Put right extension on if missing
@@ -312,65 +194,54 @@ class FOIAttachment
end
# Whether this type can be shown in the Google Docs Viewer.
- # PDF, PowerPoint and TIFF are listed on https://docs.google.com/viewer
- # .doc and .docx were added later http://gmailblog.blogspot.com/2010/06/view-doc-attachments-right-in-your.html
- # .xls appears to work fine too
+ # The full list of supported types can be found at
+ # https://docs.google.com/support/bin/answer.py?hl=en&answer=1189935
def has_google_docs_viewer?
- if self.content_type == 'application/vnd.ms-word'
- return true
- elsif self.content_type == 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
- return true
- elsif self.content_type == 'application/pdf'
- return true
- elsif self.content_type == 'image/tiff'
- return true
- elsif self.content_type == 'application/vnd.ms-powerpoint'
- return true
- elsif self.content_type == 'application/vnd.ms-excel'
- return true
- end
+ return !! {
+ "application/pdf" => true, # .pdf
+ "image/tiff" => true, # .tiff
+
+ "application/vnd.ms-word" => true, # .doc
+ "application/vnd.openxmlformats-officedocument.wordprocessingml.document" => true, # .docx
+
+ "application/vnd.ms-powerpoint" => true, # .ppt
+ "application/vnd.openxmlformats-officedocument.presentationml.presentation" => true, # .pptx
+
+ "application/vnd.ms-excel" => true, # .xls
+ "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" => true, # .xlsx
+
+ } [self.content_type]
end
# Whether this type has a "View as HTML"
def has_body_as_html?
- if self.content_type == 'text/plain'
- return true
- elsif self.content_type == 'application/vnd.ms-word'
- return true
- elsif self.content_type == 'application/vnd.ms-excel'
- return true
- elsif self.content_type == 'application/pdf'
- return true
- elsif self.content_type == 'application/rtf'
- return true
- end
- # We use the same "View as HTML" link to embed the Google Doc Viewer
- # (when it can't do a conversion locally)
- if self.has_google_docs_viewer?
- return true
- end
- return false
+ return (
+ !!{
+ "text/plain" => true,
+ "application/rtf" => true,
+ }[self.content_type] or
+ self.has_google_docs_viewer?
+ )
end
# Name of type of attachment type - only valid for things that has_body_as_html?
def name_of_content_type
- if self.content_type == 'text/plain'
- return "Text file"
- elsif self.content_type == 'application/vnd.ms-word'
- return "Word document"
- elsif self.content_type == 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
- return "Word document - XML"
- elsif self.content_type == 'application/vnd.ms-excel'
- return "Excel spreadsheet"
- elsif self.content_type == 'application/pdf'
- return "PDF file"
- elsif self.content_type == 'application/rtf'
- return "RTF file"
- elsif self.content_type == 'application/vnd.ms-powerpoint'
- return "PowerPoint presentation"
- elsif self.content_type == 'image/tiff'
- return "TIFF image"
- end
+ return {
+ "text/plain" => "Text file",
+ 'application/rtf' => "RTF file",
+
+ 'application/pdf' => "PDF file",
+ 'image/tiff' => "TIFF image",
+
+ 'application/vnd.ms-word' => "Word document",
+ 'application/vnd.openxmlformats-officedocument.wordprocessingml.document' => "Word document",
+
+ 'application/vnd.ms-powerpoint' => "PowerPoint presentation",
+ 'application/vnd.openxmlformats-officedocument.presentationml.presentation' => "PowerPoint presentation",
+
+ 'application/vnd.ms-excel' => "Excel spreadsheet",
+ 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet' => "Excel spreadsheet",
+ }[self.content_type]
end
# For "View as HTML" of attachment
@@ -395,20 +266,6 @@ class FOIAttachment
tempfile.print self.body
tempfile.flush
- # Use google docs for the view for these - hanging server
- # if self.content_type == 'application/vnd.ms-word'
- # # XXX do something with PNG files this spits out so they view too :)
- # system("/usr/bin/wvHtml --charset=UTF-8 " + tempfile.path + " " + tempfile.path + ".html")
- # html = File.read(tempfile.path + ".html")
- # File.unlink(tempfile.path + ".html")
-# elsif self.content_type == 'application/vnd.ms-excel'
-# # Don't colorise, e.g. otherwise this one comes out with white
-# # text which is nasty:
-# # http://www.whatdotheyknow.com/request/30485/response/74705/attach/html/2/Empty%20premises%20Sefton.xls.html
-# IO.popen("/usr/bin/xlhtml -nc -a " + tempfile.path + "", "r") do |child|
-# html = child.read()
-# wrapper_id = "wrapper_xlhtml"
-# end
if self.content_type == 'application/pdf'
IO.popen("/usr/bin/pdftohtml -nodrm -zoom 1.0 -stdout -enc UTF-8 -noframes " + tempfile.path + "", "r") do |child|
html = child.read()
@@ -455,6 +312,7 @@ class FOIAttachment
end
+
class IncomingMessage < ActiveRecord::Base
belongs_to :info_request
validates_presence_of :info_request
@@ -467,6 +325,17 @@ class IncomingMessage < ActiveRecord::Base
belongs_to :raw_email
+ # See binary_mask_stuff function below. It only tests for inclusion
+ # in this hash; the right-hand-side values are ignored.
+ DoNotBinaryMask = {
+ 'image/tiff' => 1,
+ 'image/gif' => 1,
+ 'image/jpeg' => 1,
+ 'image/png' => 1,
+ 'image/bmp' => 1,
+ 'application/zip' => 1,
+ }
+
# Return the structured TMail::Mail object
# Documentation at http://i.loveruby.net/en/projects/tmail/doc/
def mail
@@ -513,7 +382,7 @@ class IncomingMessage < ActiveRecord::Base
# An email attached as text
# e.g. http://www.whatdotheyknow.com/request/64/response/102
part.rfc822_attachment = TMail::Mail.parse(part.body)
- elsif part.content_type == 'application/vnd.ms-outlook' || part_filename && filename_to_mimetype(part_filename) == 'application/vnd.ms-outlook'
+ elsif part.content_type == 'application/vnd.ms-outlook' || part_filename && AlaveteliFileTypes.filename_to_mimetype(part_filename) == 'application/vnd.ms-outlook'
# An email attached as an Outlook file
# e.g. http://www.whatdotheyknow.com/request/chinese_names_for_british_politi
msg = Mapi::Msg.open(StringIO.new(part.body))
@@ -571,7 +440,7 @@ class IncomingMessage < ActiveRecord::Base
# See if content type is one that we mask - things like zip files and
# images may get broken if we try to. We err on the side of masking too
# much, as many unknown types will really be text.
- if $do_not_binary_mask.include?(content_type)
+ if DoNotBinaryMask.include?(content_type)
return
end
@@ -800,7 +669,6 @@ class IncomingMessage < ActiveRecord::Base
return _get_attachment_leaves_recursive(self.mail)
end
def _get_attachment_leaves_recursive(curr_mail, within_rfc822_attachment = nil)
- # STDERR.puts "_get_attachment_leaves_recursive", curr_mail.content_type, curr_mail.sub_type, curr_mail.multipart?, "\n"
leaves_found = []
if curr_mail.multipart?
if curr_mail.parts.size == 0
@@ -846,7 +714,7 @@ class IncomingMessage < ActiveRecord::Base
# PDFs often come with this mime type, fix it up for view code
if curr_mail.content_type == 'application/octet-stream'
part_file_name = self._get_censored_part_file_name(curr_mail)
- calc_mime = filename_and_content_to_mimetype(part_file_name, curr_mail.body)
+ calc_mime = AlaveteliFileTypes.filename_and_content_to_mimetype(part_file_name, curr_mail.body)
if calc_mime
curr_mail.content_type = calc_mime
end
@@ -949,7 +817,7 @@ class IncomingMessage < ActiveRecord::Base
# e.g. http://www.whatdotheyknow.com/request/35/response/177
# XXX This is a bit of a hack as it is calling a convert to text routine.
# Could instead call a sanitize HTML one.
- text = IncomingMessage._get_attachment_text_internal_one_file(part.content_type, text)
+ text = self.class._get_attachment_text_internal_one_file(part.content_type, text)
end
end
@@ -1027,7 +895,7 @@ class IncomingMessage < ActiveRecord::Base
# ... or if none, consider first part
p = leaves[0]
# if it is a known type then don't use it, return no body (nil)
- if mimetype_to_extension(p.content_type)
+ if AlaveteliFileTypes.mimetype_to_extension(p.content_type)
# this is guess of case where there are only attachments, no body text
# e.g. http://www.whatdotheyknow.com/request/cost_benefit_analysis_for_real_n
return nil
@@ -1067,7 +935,7 @@ class IncomingMessage < ActiveRecord::Base
attachment.body = content
attachment.filename = uu.match(/^begin\s+[0-9]+\s+(.*)$/)[1]
self.info_request.apply_censor_rules_to_text!(attachment.filename)
- calc_mime = filename_and_content_to_mimetype(attachment.filename, attachment.body)
+ calc_mime = AlaveteliFileTypes.filename_and_content_to_mimetype(attachment.filename, attachment.body)
if calc_mime
calc_mime = normalise_content_type(calc_mime)
attachment.content_type = calc_mime
@@ -1238,54 +1106,42 @@ class IncomingMessage < ActiveRecord::Base
tempfile.print body
tempfile.flush
if content_type == 'application/vnd.ms-word'
- system("/usr/bin/wvText " + tempfile.path + " " + tempfile.path + ".txt")
+ external_command("/usr/bin/wvText", tempfile.path, tempfile.path + ".txt")
# Try catdoc if we get into trouble (e.g. for InfoRequestEvent 2701)
if not File.exists?(tempfile.path + ".txt")
- IO.popen("/usr/bin/catdoc " + tempfile.path, "r") do |child|
- text += child.read() + "\n\n"
- end
+ external_command("/usr/bin/catdoc", tempfile.path, :append_to => text)
else
text += File.read(tempfile.path + ".txt") + "\n\n"
File.unlink(tempfile.path + ".txt")
end
elsif content_type == 'application/rtf'
# catdoc on RTF prodcues less comments and extra bumf than --text option to unrtf
- IO.popen("/usr/bin/catdoc " + tempfile.path, "r") do |child|
- text += child.read() + "\n\n"
- end
+ external_command("/usr/bin/catdoc", tempfile.path, :append_to => text)
elsif content_type == 'text/html'
# lynx wordwraps links in its output, which then don't get formatted properly
# by WhatDoTheyKnow. We use elinks instead, which doesn't do that.
- IO.popen("/usr/bin/elinks -dump-charset utf-8 -force-html -dump " + tempfile.path, "r") do |child|
- text += child.read() + "\n\n"
- end
+ external_command("/usr/bin/elinks", "-dump-charset", "utf-8", "-force-html", "-dump",
+ tempfile.path, :append_to => text)
elsif content_type == 'application/vnd.ms-excel'
# Bit crazy using /usr/bin/strings - but xls2csv, xlhtml and
# py_xls2txt only extract text from cells, not from floating
# notes. catdoc may be fooled by weird character sets, but will
# probably do for UK FOI requests.
- IO.popen("/usr/bin/strings " + tempfile.path, "r") do |child|
- text += child.read() + "\n\n"
- end
+ external_command("/usr/bin/strings", tempfile.path, :append_to => text)
elsif content_type == 'application/vnd.ms-powerpoint'
# ppthtml seems to catch more text, but only outputs HTML when
# we want text, so just use catppt for now
- IO.popen("/usr/bin/catppt " + tempfile.path, "r") do |child|
- text += child.read() + "\n\n"
- end
+ external_command("/usr/bin/catppt", tempfile.path, :append_to => text)
elsif content_type == 'application/pdf'
- IO.popen("/usr/bin/pdftotext " + tempfile.path + " -", "r") do |child|
- text += child.read() + "\n\n"
- end
+ external_command("/usr/bin/pdftotext", tempfile.path, "-", :append_to => text)
elsif content_type == 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
# This is Microsoft's XML office document format.
# Just pull out the main XML file, and strip it of text.
- xml = ''
- IO.popen("/usr/bin/unzip -qq -c " + tempfile.path + " word/document.xml", "r") do |child|
- xml += child.read() + "\n\n"
+ xml = external_command("/usr/bin/unzip", "-qq", "-c", tempfile.path, "word/document.xml")
+ if !xml.nil?
+ doc = REXML::Document.new(xml)
+ text += doc.each_element( './/text()' ){}.join(" ")
end
- doc = REXML::Document.new(xml)
- text += doc.each_element( './/text()' ){}.join(" ")
elsif content_type == 'application/zip'
# recurse into zip files
zip_file = Zip::ZipFile.open(tempfile.path)
@@ -1300,15 +1156,14 @@ class IncomingMessage < ActiveRecord::Base
# e.g. password protected
next
end
- calc_mime = filename_to_mimetype(filename)
+ calc_mime = AlaveteliFileTypes.filename_to_mimetype(filename)
if calc_mime
content_type = calc_mime
else
content_type = 'application/octet-stream'
end
- #STDERR.puts("doing file " + filename + " content type " + content_type)
- text += IncomingMessage._get_attachment_text_internal_one_file(content_type, body)
+ text += _get_attachment_text_internal_one_file(content_type, body)
end
end
end
@@ -1383,9 +1238,9 @@ class IncomingMessage < ActiveRecord::Base
for incoming_message in IncomingMessage.find(:all)
for attachment in incoming_message.get_attachments_for_display
raise "internal error incoming_message " + incoming_message.id.to_s if attachment.content_type.nil?
- if mimetype_to_extension(attachment.content_type).nil?
- STDERR.puts "Unknown type for /request/" + incoming_message.info_request.id.to_s + "#incoming-"+incoming_message.id.to_s
- STDERR.puts " " + attachment.filename.to_s + " " + attachment.content_type.to_s
+ if AlaveteliFileTypes.mimetype_to_extension(attachment.content_type).nil?
+ $stderr.puts "Unknown type for /request/" + incoming_message.info_request.id.to_s + "#incoming-"+incoming_message.id.to_s
+ $stderr.puts " " + attachment.filename.to_s + " " + attachment.content_type.to_s
end
end
end
@@ -1398,15 +1253,15 @@ class IncomingMessage < ActiveRecord::Base
def get_present_file_extensions
ret = {}
for attachment in self.get_attachments_for_display
- ext = mimetype_to_extension(attachment.content_type)
+ ext = AlaveteliFileTypes.mimetype_to_extension(attachment.content_type)
ext = File.extname(attachment.filename).gsub(/^[.]/, "") if ext.nil? && !attachment.filename.nil?
ret[ext] = 1 if !ext.nil?
end
return ret.keys.join(" ")
end
# Return space separated list of all file extensions known
- def IncomingMessage.get_all_file_extentions
- return $file_extension_to_mime_type.keys.join(" ")
+ def IncomingMessage.get_all_file_extensions
+ return AlaveteliFileTypes.all_extensions.join(" ")
end
# Return false if for some reason this is a message that we shouldn't let them reply to
@@ -1432,6 +1287,59 @@ class IncomingMessage < ActiveRecord::Base
return true
end
+
+ def normalise_content_type(content_type)
+ # Return content_type with known aliases replaced by the single type this model uses; any other value passes through unchanged. Excel example: http://www.whatdotheyknow.com/request/93/response/250
+ if content_type == 'application/excel' or content_type == 'application/msexcel' or content_type == 'application/x-ms-excel'
+ content_type = 'application/vnd.ms-excel'
+ end
+ if content_type == 'application/mspowerpoint' or content_type == 'application/x-ms-powerpoint'
+ content_type = 'application/vnd.ms-powerpoint'
+ end
+ if content_type == 'application/msword' or content_type == 'application/x-ms-word'
+ content_type = 'application/vnd.ms-word'
+ end
+ if content_type == 'application/x-zip-compressed'
+ content_type = 'application/zip'
+ end
+
+ # PDF sent under the alias 'application/acrobat', e.g. http://www.whatdotheyknow.com/request/copy_of_current_swessex_scr_opt#incoming-9928
+ if content_type == 'application/acrobat'
+ content_type = 'application/pdf'
+ end
+
+ return content_type
+ end
+ private :normalise_content_type
+
+ def self.external_command(program_name, *args)
+ # Run an external program and return its output, or nil if it exits with a non-zero status.
+ # A trailing Hash in args is taken as options, not passed to the program: with :append_to => str, str is assigned to xc.out
+ # (presumably filled in place by run - confirm in ExternalCommand) and "\n\n" is appended to it on success. Stderr is only echoed on failure.
+ opts = {}
+ if !args.empty? && args[-1].is_a?(Hash)
+ opts = args.pop
+ end
+
+ xc = ExternalCommand.new(program_name, *args)
+ if opts.has_key? :append_to
+ xc.out = opts[:append_to]
+ end
+ xc.run()
+ if xc.status != 0
+ # Non-zero exit status: report the command line and its captured stderr on $stderr, then signal failure with nil
+ $stderr.puts("Error from #{program_name} #{args.join(' ')}:")
+ $stderr.print(xc.err)
+ return nil
+ else
+ if opts.has_key? :append_to
+ opts[:append_to] << "\n\n"
+ else
+ return xc.out
+ end
+ end
+ end
+ private_class_method :external_command
end