diff options
Diffstat (limited to 'app/models/incoming_message.rb')
-rw-r--r-- | app/models/incoming_message.rb | 282 |
1 files changed, 92 insertions, 190 deletions
diff --git a/app/models/incoming_message.rb b/app/models/incoming_message.rb index a7341d77a..c2ba9c7de 100644 --- a/app/models/incoming_message.rb +++ b/app/models/incoming_message.rb @@ -28,11 +28,11 @@ # Move some of the (e.g. quoting) functions here into rblib, as they feel # general not specific to IncomingMessage. +require 'alaveteli_file_types' require 'external_command' require 'htmlentities' require 'rexml/document' require 'zip/zip' -require 'mahoro' require 'mapi/msg' require 'mapi/convert' @@ -45,156 +45,17 @@ module TMail end end -# To add an image, create a file with appropriate name corresponding to the -# mime type in public/images e.g. icon_image_tiff_large.png -$file_extension_to_mime_type = { - "txt" => 'text/plain', - "pdf" => 'application/pdf', - "rtf" => 'application/rtf', - "doc" => 'application/vnd.ms-word', - "docx" => 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', - "xls" => 'application/vnd.ms-excel', - "xlsx" => 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', - "ppt" => 'application/vnd.ms-powerpoint', - "pptx" => 'application/vnd.openxmlformats-officedocument.presentationml.presentation', - "oft" => 'application/vnd.ms-outlook', - "msg" => 'application/vnd.ms-outlook', - "tnef" => 'application/ms-tnef', - "tif" => 'image/tiff', - "gif" => 'image/gif', - "jpg" => 'image/jpeg', # XXX add jpeg - "png" => 'image/png', - "bmp" => 'image/bmp', - "html" => 'text/html', # XXX add htm - "vcf" => 'text/x-vcard', - "zip" => 'application/zip', - "delivery-status" => 'message/delivery-status' -} -# XXX doesn't have way of choosing default for inverse map - might want to add -# one when you need it -$file_extension_to_mime_type_rev = $file_extension_to_mime_type.invert - -# See binary_mask_stuff function below. It just test for inclusion -# in this hash, not the value of the right hand side. -$do_not_binary_mask = { - 'image/tiff' => 1, - 'image/gif' => 1, - 'image/jpeg' => 1, - 'image/png' => 1, - 'image/bmp' => 1, - 'application/zip' => 1, -} - -# Given file name and its content, return most likely type -def filename_and_content_to_mimetype(filename, content) - # Try filename - ret = filename_to_mimetype(filename) - if !ret.nil? - return ret - end - - # Otherwise look inside the file to work out the type. - # Mahoro is a Ruby binding for libmagic. - m = Mahoro.new(Mahoro::MIME) - mahoro_type = m.buffer(content) - mahoro_type.strip! - #STDERR.puts("mahoro", mahoro_type, "xxxok") - # XXX we shouldn't have to check empty? here, but Mahoro sometimes returns a blank line :( - # e.g. for InfoRequestEvent 17930 - if mahoro_type.nil? || mahoro_type.empty? - return nil - end - # text/plain types sometimes come with a charset - mahoro_type.match(/^(.*);/) - if $1 - mahoro_type = $1 - end - # see if looks like a content type, or has something in it that does - # and return that - # mahoro returns junk "\012- application/msword" as mime type. - mahoro_type.match(/([a-z0-9.-]+\/[a-z0-9.-]+)/) - if $1 - return $1 - end - # otherwise we got junk back from mahoro - return nil -end - -# XXX clearly this shouldn't be a global function, or the above global vars. -def filename_to_mimetype(filename) - if !filename - return nil - end - if filename.match(/\.([^.]+)$/i) - lext = $1.downcase - if $file_extension_to_mime_type.include?(lext) - return $file_extension_to_mime_type[lext] - end - end - return nil -end - -def mimetype_to_extension(mime) - if $file_extension_to_mime_type_rev.include?(mime) - return $file_extension_to_mime_type_rev[mime] - end - return nil -end - -def normalise_content_type(content_type) - # e.g. http://www.whatdotheyknow.com/request/93/response/250 - if content_type == 'application/excel' or content_type == 'application/msexcel' or content_type == 'application/x-ms-excel' - content_type = 'application/vnd.ms-excel' - end - if content_type == 'application/mspowerpoint' or content_type == 'application/x-ms-powerpoint' - content_type = 'application/vnd.ms-powerpoint' - end - if content_type == 'application/msword' or content_type == 'application/x-ms-word' - content_type = 'application/vnd.ms-word' - end - if content_type == 'application/x-zip-compressed' - content_type = 'application/zip' - end - - # e.g. http://www.whatdotheyknow.com/request/copy_of_current_swessex_scr_opt#incoming-9928 - if content_type == 'application/acrobat' - content_type = 'application/pdf' - end - - return content_type -end - -def external_command(program_name, *args) - # Run an external program, and return its output. - # Standard error is suppressed unless the program - # fails (i.e. returns a non-zero exit status). - opts = {} - if !args.empty? && args[-1].is_a?(Hash) - opts = args.pop - end - - xc = ExternalCommand.new(program_name, *args) - if opts.has_key? :append_to - xc.out = opts[:append_to] - end - xc.run() - if xc.status != 0 - # Error - $stderr.puts("Error from #{program_name} #{args.join(' ')}:") - $stderr.print(xc.err) - return nil - else - if opts.has_key? :append_to - opts[:append_to] << "\n\n" - else - return xc.out - end - end -end +# This is the type which is used to send data about attachments to the view +class FOIAttachment + attr_accessor :body + attr_accessor :content_type + attr_accessor :filename + attr_accessor :url_part_number + attr_accessor :within_rfc822_subject # we use the subject as the filename for email attachments -# List of DSN codes taken from RFC 3463 -# http://tools.ietf.org/html/rfc3463 -$dsn_to_message = { + # List of DSN codes taken from RFC 3463 + # http://tools.ietf.org/html/rfc3463 + DsnToMessage = { 'X.1.0' => 'Other address status', 'X.1.1' => 'Bad destination mailbox address', 'X.1.2' => 'Bad destination system address', @@ -242,15 +103,7 @@ $dsn_to_message = { 'X.7.5' => 'Cryptographic failure', 'X.7.6' => 'Cryptographic algorithm not supported', 'X.7.7' => 'Message integrity failure' -} - -# This is the type which is used to send data about attachments to the view -class FOIAttachment - attr_accessor :body - attr_accessor :content_type - attr_accessor :filename - attr_accessor :url_part_number - attr_accessor :within_rfc822_subject # we use the subject as the filename for email attachments + } # Returns HTML, of extra comment to put by attachment def extra_note @@ -264,8 +117,8 @@ class FOIAttachment dsn_part = 'X.' + $2 dsn_message = "" - if $dsn_to_message.include?(dsn_part) - dsn_message = " (" + $dsn_to_message[dsn_part] + ")" + if DsnToMessage.include?(dsn_part) + dsn_message = " (" + DsnToMessage[dsn_part] + ")" end return "<br><em>DSN: " + dsn + dsn_message + "</em>" @@ -308,7 +161,7 @@ class FOIAttachment end def _internal_display_filename - calc_ext = mimetype_to_extension(@content_type) + calc_ext = AlaveteliFileTypes.mimetype_to_extension(@content_type) if @filename # Put right extension on if missing @@ -424,20 +277,6 @@ class FOIAttachment tempfile.print self.body tempfile.flush - # Use google docs for the view for these - hanging server - # if self.content_type == 'application/vnd.ms-word' - # # XXX do something with PNG files this spits out so they view too :) - # system("/usr/bin/wvHtml --charset=UTF-8 " + tempfile.path + " " + tempfile.path + ".html") - # html = File.read(tempfile.path + ".html") - # File.unlink(tempfile.path + ".html") -# elsif self.content_type == 'application/vnd.ms-excel' -# # Don't colorise, e.g. otherwise this one comes out with white -# # text which is nasty: -# # http://www.whatdotheyknow.com/request/30485/response/74705/attach/html/2/Empty%20premises%20Sefton.xls.html -# IO.popen("/usr/bin/xlhtml -nc -a " + tempfile.path + "", "r") do |child| -# html = child.read() -# wrapper_id = "wrapper_xlhtml" -# end if self.content_type == 'application/pdf' IO.popen("/usr/bin/pdftohtml -nodrm -zoom 1.0 -stdout -enc UTF-8 -noframes " + tempfile.path + "", "r") do |child| html = child.read() @@ -484,6 +323,7 @@ class FOIAttachment end + class IncomingMessage < ActiveRecord::Base belongs_to :info_request validates_presence_of :info_request @@ -496,6 +336,17 @@ class IncomingMessage < ActiveRecord::Base belongs_to :raw_email + # See binary_mask_stuff function below. It just test for inclusion + # in this hash, not the value of the right hand side. + DoNotBinaryMask = { + 'image/tiff' => 1, + 'image/gif' => 1, + 'image/jpeg' => 1, + 'image/png' => 1, + 'image/bmp' => 1, + 'application/zip' => 1, + } + # Return the structured TMail::Mail object # Documentation at http://i.loveruby.net/en/projects/tmail/doc/ def mail @@ -542,7 +393,7 @@ class IncomingMessage < ActiveRecord::Base # An email attached as text # e.g. http://www.whatdotheyknow.com/request/64/response/102 part.rfc822_attachment = TMail::Mail.parse(part.body) - elsif part.content_type == 'application/vnd.ms-outlook' || part_filename && filename_to_mimetype(part_filename) == 'application/vnd.ms-outlook' + elsif part.content_type == 'application/vnd.ms-outlook' || part_filename && AlaveteliFileTypes.filename_to_mimetype(part_filename) == 'application/vnd.ms-outlook' # An email attached as an Outlook file # e.g. http://www.whatdotheyknow.com/request/chinese_names_for_british_politi msg = Mapi::Msg.open(StringIO.new(part.body)) @@ -600,7 +451,7 @@ class IncomingMessage < ActiveRecord::Base # See if content type is one that we mask - things like zip files and # images may get broken if we try to. We err on the side of masking too # much, as many unknown types will really be text. - if $do_not_binary_mask.include?(content_type) + if DoNotBinaryMask.include?(content_type) return end @@ -829,7 +680,6 @@ class IncomingMessage < ActiveRecord::Base return _get_attachment_leaves_recursive(self.mail) end def _get_attachment_leaves_recursive(curr_mail, within_rfc822_attachment = nil) - # STDERR.puts "_get_attachment_leaves_recursive", curr_mail.content_type, curr_mail.sub_type, curr_mail.multipart?, "\n" leaves_found = [] if curr_mail.multipart? if curr_mail.parts.size == 0 @@ -875,7 +725,7 @@ class IncomingMessage < ActiveRecord::Base # PDFs often come with this mime type, fix it up for view code if curr_mail.content_type == 'application/octet-stream' part_file_name = self._get_censored_part_file_name(curr_mail) - calc_mime = filename_and_content_to_mimetype(part_file_name, curr_mail.body) + calc_mime = AlaveteliFileTypes.filename_and_content_to_mimetype(part_file_name, curr_mail.body) if calc_mime curr_mail.content_type = calc_mime end @@ -978,7 +828,7 @@ class IncomingMessage < ActiveRecord::Base # e.g. http://www.whatdotheyknow.com/request/35/response/177 # XXX This is a bit of a hack as it is calling a convert to text routine. # Could instead call a sanitize HTML one. - text = IncomingMessage._get_attachment_text_internal_one_file(part.content_type, text) + text = _get_attachment_text_internal_one_file(part.content_type, text) end end @@ -1056,7 +906,7 @@ class IncomingMessage < ActiveRecord::Base # ... or if none, consider first part p = leaves[0] # if it is a known type then don't use it, return no body (nil) - if mimetype_to_extension(p.content_type) + if AlaveteliFileTypes.mimetype_to_extension(p.content_type) # this is guess of case where there are only attachments, no body text # e.g. http://www.whatdotheyknow.com/request/cost_benefit_analysis_for_real_n return nil @@ -1096,7 +946,7 @@ class IncomingMessage < ActiveRecord::Base attachment.body = content attachment.filename = uu.match(/^begin\s+[0-9]+\s+(.*)$/)[1] self.info_request.apply_censor_rules_to_text!(attachment.filename) - calc_mime = filename_and_content_to_mimetype(attachment.filename, attachment.body) + calc_mime = AlaveteliFileTypes.filename_and_content_to_mimetype(attachment.filename, attachment.body) if calc_mime calc_mime = normalise_content_type(calc_mime) attachment.content_type = calc_mime @@ -1317,15 +1167,14 @@ class IncomingMessage < ActiveRecord::Base # e.g. password protected next end - calc_mime = filename_to_mimetype(filename) + calc_mime = AlaveteliFileTypes.filename_to_mimetype(filename) if calc_mime content_type = calc_mime else content_type = 'application/octet-stream' end - #STDERR.puts("doing file " + filename + " content type " + content_type) - text += IncomingMessage._get_attachment_text_internal_one_file(content_type, body) + text += _get_attachment_text_internal_one_file(content_type, body) end end end @@ -1400,7 +1249,7 @@ class IncomingMessage < ActiveRecord::Base for incoming_message in IncomingMessage.find(:all) for attachment in incoming_message.get_attachments_for_display raise "internal error incoming_message " + incoming_message.id.to_s if attachment.content_type.nil? - if mimetype_to_extension(attachment.content_type).nil? + if AlaveteliFileTypes.mimetype_to_extension(attachment.content_type).nil? STDERR.puts "Unknown type for /request/" + incoming_message.info_request.id.to_s + "#incoming-"+incoming_message.id.to_s STDERR.puts " " + attachment.filename.to_s + " " + attachment.content_type.to_s end @@ -1415,15 +1264,15 @@ class IncomingMessage < ActiveRecord::Base def get_present_file_extensions ret = {} for attachment in self.get_attachments_for_display - ext = mimetype_to_extension(attachment.content_type) + ext = AlaveteliFileTypes.mimetype_to_extension(attachment.content_type) ext = File.extname(attachment.filename).gsub(/^[.]/, "") if ext.nil? && !attachment.filename.nil? ret[ext] = 1 if !ext.nil? end return ret.keys.join(" ") end # Return space separated list of all file extensions known - def IncomingMessage.get_all_file_extentions - return $file_extension_to_mime_type.keys.join(" ") + def IncomingMessage.get_all_file_extensions + return AlaveteliFileTypes.all_extensions.join(" ") end # Return false if for some reason this is a message that we shouldn't let them reply to @@ -1449,6 +1298,59 @@ class IncomingMessage < ActiveRecord::Base return true end + + def normalise_content_type(content_type) + # e.g. http://www.whatdotheyknow.com/request/93/response/250 + if content_type == 'application/excel' or content_type == 'application/msexcel' or content_type == 'application/x-ms-excel' + content_type = 'application/vnd.ms-excel' + end + if content_type == 'application/mspowerpoint' or content_type == 'application/x-ms-powerpoint' + content_type = 'application/vnd.ms-powerpoint' + end + if content_type == 'application/msword' or content_type == 'application/x-ms-word' + content_type = 'application/vnd.ms-word' + end + if content_type == 'application/x-zip-compressed' + content_type = 'application/zip' + end + + # e.g. http://www.whatdotheyknow.com/request/copy_of_current_swessex_scr_opt#incoming-9928 + if content_type == 'application/acrobat' + content_type = 'application/pdf' + end + + return content_type + end + private :normalise_content_type + + def self.external_command(program_name, *args) + # Run an external program, and return its output. + # Standard error is suppressed unless the program + # fails (i.e. returns a non-zero exit status). + opts = {} + if !args.empty? && args[-1].is_a?(Hash) + opts = args.pop + end + + xc = ExternalCommand.new(program_name, *args) + if opts.has_key? :append_to + xc.out = opts[:append_to] + end + xc.run() + if xc.status != 0 + # Error + $stderr.puts("Error from #{program_name} #{args.join(' ')}:") + $stderr.print(xc.err) + return nil + else + if opts.has_key? :append_to + opts[:append_to] << "\n\n" + else + return xc.out + end + end + end + private_class_method :external_command end |