# == Schema Information
# Schema version: 78
#
# Table name: incoming_messages
#
# id :integer not null, primary key
# info_request_id :integer not null
# created_at :datetime not null
# updated_at :datetime not null
# cached_attachment_text :text
# cached_main_body_text :text
# raw_email_id :integer not null
#
# models/incoming_message.rb:
# An (email) message from really anybody to be logged with a request. e.g. A
# response from the public body.
#
# Copyright (c) 2007 UK Citizens Online Democracy. All rights reserved.
# Email: francis@mysociety.org; WWW: http://www.mysociety.org/
#
# $Id: incoming_message.rb,v 1.220 2009-09-09 17:23:14 francis Exp $
# TODO
# Move some of the (e.g. quoting) functions here into rblib, as they feel
# general not specific to IncomingMessage.
require 'htmlentities'
require 'rexml/document'
require 'zip/zip'
require 'mahoro'
# Monkeypatch! Re-opens TMail::Mail to add some extra members to store extra
# info in. Each is a plain read/write attribute; nothing here sets them.
module TMail
  class Mail
    attr_accessor :url_part_number,          # number given to the part for use in URLs
                  :rfc822_attachment,        # when a whole email message is attached as text
                  :within_rfc822_attachment  # for parts within a message attached as text (for getting subject mainly)
  end
end
# To add an image, create a file with appropriate name corresponding to the
# mime type in public/images e.g. icon_image_tiff_large.png
#
# Canonical extension for each mime type. Every mime type appears exactly
# once in this literal, so the inverse map built from it is unambiguous.
$file_extension_to_mime_type = {
  "txt" => 'text/plain',
  "pdf" => 'application/pdf',
  "rtf" => 'application/rtf',
  "doc" => 'application/vnd.ms-word',
  "docx" => 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
  "xls" => 'application/vnd.ms-excel',
  "xlsx" => 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
  "ppt" => 'application/vnd.ms-powerpoint',
  "pptx" => 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
  "tif" => 'image/tiff',
  "gif" => 'image/gif',
  "jpg" => 'image/jpeg',
  "png" => 'image/png',
  "bmp" => 'image/bmp',
  "html" => 'text/html',
  "vcf" => 'text/x-vcard',
  "zip" => 'application/zip',
  "delivery-status" => 'message/delivery-status'
}
# The inverse map (mime type => extension) is taken from the canonical
# entries only, *before* any aliases are added, so the extension chosen for a
# mime type does not depend on hash ordering.
$file_extension_to_mime_type_rev = $file_extension_to_mime_type.invert
# Alternative spellings of extensions. These are recognised when mapping a
# file name to a mime type, but are never chosen as the extension for a type.
$file_extension_to_mime_type.update(
  "jpeg" => 'image/jpeg',
  "htm" => 'text/html'
)
# See binary_mask_stuff function below. It just tests for inclusion
# in this hash, not the value of the right hand side.
$do_not_binary_mask = {}
[
  'image/tiff',
  'image/gif',
  'image/jpeg',
  'image/png',
  'image/bmp',
  'application/zip'
].each { |mime_type| $do_not_binary_mask[mime_type] = 1 }
# Given file name and its content, return most likely type.
#
# filename - name of the file, passed straight to filename_to_mimetype
# content  - the file's data, handed to libmagic when the name is no help
#
# Returns the mime type as a string, or nil if neither the file name nor the
# content gives a usable answer.
def filename_and_content_to_mimetype(filename, content)
  # Try filename
  type_from_name = filename_to_mimetype(filename)
  return type_from_name unless type_from_name.nil?

  # Otherwise look inside the file to work out the type.
  # Mahoro is a Ruby binding for libmagic.
  mahoro_type = Mahoro.new(Mahoro::MIME).buffer(content)
  # Check for nil before calling any string method on the result
  # (stripping first would raise NoMethodError on nil).
  return nil if mahoro_type.nil?
  mahoro_type = mahoro_type.strip
  # XXX we shouldn't have to check empty? here, but Mahoro sometimes returns a blank line :(
  # e.g. for InfoRequestEvent 17930
  return nil if mahoro_type.empty?

  # text/plain types sometimes come with a charset
  without_charset = mahoro_type[/^(.*);/, 1]
  mahoro_type = without_charset if without_charset

  # see if looks like a content type, or has something in it that does
  # and return that
  # mahoro returns junk "\012- application/msword" as mime type.
  # (nil here means we got nothing but junk back from mahoro)
  mahoro_type[/[a-z0-9.-]+\/[a-z0-9.-]+/]
end
# XXX clearly this shouldn't be a global function, or the above global vars.
#
# Looks up the mime type for a file name from its extension (the text after
# the last dot, compared in lower case) in $file_extension_to_mime_type.
# Returns nil for a nil file name, a name without an extension, or an
# extension that is not in the table.
def filename_to_mimetype(filename)
  return nil unless filename
  extension = filename[/\.([^.]+)$/i, 1]
  return nil unless extension
  $file_extension_to_mime_type.fetch(extension.downcase, nil)
end
# Returns the file extension recorded for the given mime type in
# $file_extension_to_mime_type_rev, or nil if the type is not in that table.
def mimetype_to_extension(mime)
  $file_extension_to_mime_type_rev.fetch(mime, nil)
end
# Maps the non-standard content type names listed below onto the one name
# used elsewhere in this file for that kind of document. Any other value
# (including nil) is returned unchanged.
def normalise_content_type(content_type)
  case content_type
  when 'application/excel', 'application/msexcel', 'application/x-ms-excel'
    # e.g. http://www.whatdotheyknow.com/request/93/response/250
    'application/vnd.ms-excel'
  when 'application/mspowerpoint', 'application/x-ms-powerpoint'
    'application/vnd.ms-powerpoint'
  when 'application/msword', 'application/x-ms-word'
    'application/vnd.ms-word'
  when 'application/x-zip-compressed'
    'application/zip'
  when 'application/acrobat'
    # e.g. http://www.whatdotheyknow.com/request/copy_of_current_swessex_scr_opt#incoming-9928
    'application/pdf'
  else
    content_type
  end
end
# List of DSN codes taken from RFC 3463
# http://tools.ietf.org/html/rfc3463
#
# Keys are the subject and detail digits of a status code with the class
# digit replaced by "X" (e.g. a status of 5.1.1 is looked up as 'X.1.1').
$dsn_to_message = {
  'X.1.0' => 'Other address status',
  'X.1.1' => 'Bad destination mailbox address',
  'X.1.2' => 'Bad destination system address',
  'X.1.3' => 'Bad destination mailbox address syntax',
  'X.1.4' => 'Destination mailbox address ambiguous',
  'X.1.5' => 'Destination mailbox address valid',
  'X.1.6' => 'Mailbox has moved',
  'X.1.7' => 'Bad sender\'s mailbox address syntax',
  'X.1.8' => 'Bad sender\'s system address',
  'X.2.0' => 'Other or undefined mailbox status',
  'X.2.1' => 'Mailbox disabled, not accepting messages',
  'X.2.2' => 'Mailbox full',
  'X.2.3' => 'Message length exceeds administrative limit.',
  'X.2.4' => 'Mailing list expansion problem',
  'X.3.0' => 'Other or undefined mail system status',
  'X.3.1' => 'Mail system full',
  'X.3.2' => 'System not accepting network messages',
  'X.3.3' => 'System not capable of selected features',
  'X.3.4' => 'Message too big for system',
  # X.3.5 is defined by RFC 3463 but was absent from this table
  'X.3.5' => 'System incorrectly configured',
  'X.4.0' => 'Other or undefined network or routing status',
  'X.4.1' => 'No answer from host',
  'X.4.2' => 'Bad connection',
  'X.4.3' => 'Routing server failure',
  'X.4.4' => 'Unable to route',
  'X.4.5' => 'Network congestion',
  'X.4.6' => 'Routing loop detected',
  'X.4.7' => 'Delivery time expired',
  'X.5.0' => 'Other or undefined protocol status',
  'X.5.1' => 'Invalid command',
  'X.5.2' => 'Syntax error',
  'X.5.3' => 'Too many recipients',
  'X.5.4' => 'Invalid command arguments',
  'X.5.5' => 'Wrong protocol version',
  'X.6.0' => 'Other or undefined media error',
  'X.6.1' => 'Media not supported',
  'X.6.2' => 'Conversion required and prohibited',
  'X.6.3' => 'Conversion required but not supported',
  'X.6.4' => 'Conversion with loss performed',
  'X.6.5' => 'Conversion failed',
  'X.7.0' => 'Other or undefined security status',
  'X.7.1' => 'Delivery not authorized, message refused',
  'X.7.2' => 'Mailing list expansion prohibited',
  'X.7.3' => 'Security conversion required but not possible',
  'X.7.4' => 'Security features not supported',
  'X.7.5' => 'Cryptographic failure',
  'X.7.6' => 'Cryptographic algorithm not supported',
  'X.7.7' => 'Message integrity failure'
}
# This is the type which is used to send data about attachments to the view
class FOIAttachment
attr_accessor :body
attr_accessor :content_type
attr_accessor :filename
attr_accessor :url_part_number
attr_accessor :within_rfc822_subject # we use the subject as the filename for email attachments
# Returns HTML, of extra comment to put by attachment.
#
# Only delivery status notification attachments get a note: the status code
# is pulled out of the body and, if its last two parts are in the
# $dsn_to_message table, the table's description is appended in brackets.
# Returns "" for any other content type, or when no status line is found.
def extra_note
  # (Written without the Ruby 1.8-only "if ...:" form, which is a syntax
  # error from Ruby 1.9 onwards.)
  return "" unless @content_type == 'message/delivery-status'

  # For delivery status notification attachments, extract the status and
  # look up what it means in the DSN table.
  status = @body.match(/Status:\s+([0-9]+\.([0-9]+\.[0-9]+))\s+/)
  return "" if status.nil?

  dsn = status[1]
  # The table is keyed with the class digit replaced by X
  dsn_part = 'X.' + status[2]
  dsn_message = ""
  if $dsn_to_message.include?(dsn_part)
    dsn_message = " (" + $dsn_to_message[dsn_part] + ")"
  end
  "\nDSN: " + dsn + dsn_message
end
# Called by controller so old filenames still work.
# Takes the internal display filename, turns every whitespace character
# (e.g. \n) into a normal space, and turns slashes into dashes, as slashes
# mess with URLs.
def old_display_filename
  self._internal_display_filename.gsub(/\s/, " ").gsub(%r{/}, "-")
end
# XXX changing this will break existing URLs, so have a care - maybe
# make another old_display_filename see above
#
# Cleans up the internal display filename, in this order:
# * URL-unescape it. Sometimes filenames have e.g. %20 in - no point
#   butchering that (without unescaping it, the later steps would remove the
#   % and leave 20s in there)
# * turn runs of weird spaces into a single space
# * turn every character that is not a letter, digit or dot into a space
# * remove spaces near dots
# * compress adjacent spaces down to a single one, and trim the ends
def display_filename
  unescaped = CGI.unescape(self._internal_display_filename)
  cleaned = unescaped.gsub(/\s+/, " ").gsub(/[^A-Za-z0-9.]/, " ")
  cleaned.gsub(/\s*\.\s*/, ".").gsub(/\s+/, " ").strip
end
# Works out the raw filename to show for this attachment, before either of
# the display_filename variants tidies it up.
#
# With a filename: returns it, adding the extension for the content type if
# one is known and the name does not already end with it.
# Without a filename: returns the within_rfc822_subject if there is one,
# otherwise "attachment", followed by the extension for the content type
# ("bin" when none is known).
def _internal_display_filename
  extension = mimetype_to_extension(@content_type)

  if @filename
    # Put right extension on if missing
    lacks_extension = extension && !filename.match(/\.#{extension}$/)
    return lacks_extension ? filename + "." + extension : filename
  end

  extension ||= "bin"
  stem = @within_rfc822_subject || "attachment"
  stem + "." + extension
end
# Size to show next to the download link for the attachment.
# Bodies larger than 1024 * 1024 are shown with one decimal place and an 'M'
# suffix; anything else is shown as a whole number (size divided by 1024,
# rounded down) with a 'K' suffix.
def display_size
  size = self.body.size
  return (size / 1024).to_s + 'K' unless size > 1024 * 1024
  format("%.1f", size.to_f / 1024 / 1024) + 'M'
end
# For "View as HTML" of attachment
def body_as_html(dir)
html = nil
Dir.chdir(dir) do
tempfile = Tempfile.new('foiextract', '.')
tempfile.print self.body
tempfile.flush
if self.content_type == 'application/vnd.ms-word'
# XXX do something with PNG files this spits out so they view too :)
system("/usr/bin/wvHtml --charset=UTF-8 " + tempfile.path + " " + tempfile.path + ".html")
html = File.read(tempfile.path + ".html")
File.unlink(tempfile.path + ".html")
elsif self.content_type == 'application/pdf'
IO.popen("/usr/bin/pdftohtml -zoom 1.0 -stdout -enc UTF-8 -noframes " + tempfile.path + "", "r") do |child|
html = child.read()
end
# if pdftohtml failed (size zero is only way to detect this, as doesn't return error codes)
# try converting to postscript and back, to strip problems such as this error:
# "Error: Copying of text from this document is not allowed"
if html.size == 0
system("/usr/bin/pdf2ps " + tempfile.path + " " + tempfile.path + ".ps")
system("/usr/bin/ps2pdf " + tempfile.path + ".ps " + tempfile.path)
IO.popen("/usr/bin/pdftohtml -zoom 1.0 -stdout -enc UTF-8 -noframes " + tempfile.path + "", "r") do |child|
html = child.read()
end
end
else
raise "No HTML conversion available for type " + self.content_type
end
tempfile.close
tempfile.delete
end
# We need to look at:
# a) Any error code
# b) The output size, as pdftohtml does not return an error code upon error.
# c) For cases when there is no text in the body of the HTML, or
# images, so nothing will be rendered. This is to detect some bug in
# pdftohtml, which sometimes makes it return just
Sorry, the conversion to HTML failed. Please use the download link at the top right.
" end return html end # Whether this type has a "View as HTML" def has_body_as_html? if self.content_type == 'application/vnd.ms-word' return true elsif self.content_type == 'application/pdf' return true end return false end # Name of type of attachment type - only valid for things that has_body_as_html? def name_of_content_type if self.content_type == 'application/vnd.ms-word' return "Word document" elsif self.content_type == 'application/pdf' return "PDF file" end end end class IncomingMessage < ActiveRecord::Base belongs_to :info_request validates_presence_of :info_request validates_presence_of :raw_email has_many :outgoing_message_followups, :foreign_key => 'incoming_message_followup_id', :class_name => 'OutgoingMessage' has_many :info_request_events # never really has many, but could in theory belongs_to :raw_email # Return the structured TMail::Mail object # Documentation at http://i.loveruby.net/en/projects/tmail/doc/ def mail if @mail.nil? && !self.raw_email.nil? # Hack round bug in TMail's MIME decoding. Example request which provokes it: # http://www.whatdotheyknow.com/request/reviews_of_unduly_lenient_senten#incoming-4830 # Report of TMail bug: # http://rubyforge.org/tracker/index.php?func=detail&aid=21810&group_id=4512&atid=17370 copy_of_raw_data = self.raw_email.data.gsub(/; boundary=\s+"/ims,'; boundary="') @mail = TMail::Mail.parse(copy_of_raw_data) @mail.base64_decode end @mail end # Number the attachments in depth first tree order, for use in URLs. # XXX This fills in part.rfc822_attachment and part.url_part_number within # all the parts of the email (see TMail monkeypatch above for how these # attributes are added). ensure_parts_counted must be called before using # the attributes. This calculation is done only when required to avoid # having to load and parse the email unnecessarily. 
def after_initialize
  # Parts are only numbered on demand, see ensure_parts_counted
  @parts_counted = false
end

# Numbers the parts of the mail (once only) by walking it with
# count_parts_recursive, and remembers the last number used.
def ensure_parts_counted
  if not @parts_counted
    @count_parts_count = 0
    count_parts_recursive(self.mail)
    # we carry on using these numeric ids for attachments uudecoded from within text parts
    @count_first_uudecode_count = @count_parts_count
    @parts_counted = true
  end
end

# Walks the part tree depth first. Each leaf part is given the next number in
# url_part_number. A message/rfc822 part is parsed and stored in
# rfc822_attachment, and the parts of the parsed message are numbered instead
# of the part itself.
def count_parts_recursive(part)
  if part.multipart?
    part.parts.each do |p|
      count_parts_recursive(p)
    end
  else
    if part.content_type == 'message/rfc822'
      # An email attached as text
      # e.g. http://www.whatdotheyknow.com/request/64/response/102
      begin
        part.rfc822_attachment = TMail::Mail.parse(part.body)
      rescue
        # If attached mail doesn't parse, treat it as text part
        part.rfc822_attachment = nil
        @count_parts_count += 1
        part.url_part_number = @count_parts_count
      else
        count_parts_recursive(part.rfc822_attachment)
      end
    else
      @count_parts_count += 1
      part.url_part_number = @count_parts_count
    end
  end
end

# And look up by URL part number to get an attachment
# XXX relies on get_attachments_for_display calling ensure_parts_counted
# Returns the first attachment with that number, or nil if there is none.
def self.get_attachment_by_url_part_number(attachments, found_url_part_number)
  attachments.each do |a|
    if a.url_part_number == found_url_part_number
      return a
    end
  end
  return nil
end

# Return date mail was sent
def sent_at
  # Use date it arrived (created_at) if mail itself doesn't have Date: header
  self.mail.date || self.created_at
end

# Converts email addresses we know about into textual descriptions of them
def mask_special_emails(text)
  # XXX can later display some of these special emails as actual emails,
  # if they are public anyway. For now just be precautionary and only
  # put in descriptions of them in square brackets.
  if self.info_request.public_body.is_followupable?
    text = text.gsub(self.info_request.public_body.request_email, "[" + self.info_request.public_body.short_or_long_name + " request email]")
  end
  text = text.gsub(self.info_request.incoming_email, "[FOI #" + self.info_request.id.to_s + " email]")
  text = text.gsub(MySociety::Config.get("CONTACT_EMAIL", 'contact@localhost'), "[WhatDoTheyKnow contact email]")
  return text
end

# Replaces all email addresses in (possibly binary data) with equal length alternative ones.
# Also replaces censor items
def binary_mask_stuff(text, content_type)
  # See if content type is one that we mask - things like zip files and
  # images may get broken if we try to. We err on the side of masking too
  # much, as many unknown types will really be text.
  if $do_not_binary_mask.include?(content_type)
    return text
  end

  # Special cases for some content types
  if content_type == 'application/pdf'
    uncompressed_text = nil
    IO.popen("/usr/bin/pdftk - output - uncompress", "r+") do |child|
      child.write(text)
      child.close_write()
      uncompressed_text = child.read()
    end
    # if we managed to uncompress the PDF...
    if !uncompressed_text.nil?
      # then censor stuff (making a copy so can compare again in a bit)
      censored_uncompressed_text = self._binary_mask_stuff_internal(uncompressed_text.dup)
      # if the censor rule removed something...
      if censored_uncompressed_text != uncompressed_text
        # then use the altered file (recompressed)
        recompressed_text = nil
        IO.popen("/usr/bin/pdftk - output - compress", "r+") do |child|
          child.write(censored_uncompressed_text)
          child.close_write()
          recompressed_text = child.read()
        end
        if !recompressed_text.nil?
          text = recompressed_text
        end
      end
    end
    return text
  end

  return self._binary_mask_stuff_internal(text)
end

# Used by binary_mask_stuff - replace text in place
# Raises "internal error in binary_mask_stuff" if the size changes.
def _binary_mask_stuff_internal(text)
  # Keep original size, so can check haven't resized it
  orig_size = text.size

  # Replace ASCII email addresses...
  text.gsub!(MySociety::Validate.email_find_regexp) do |email|
    email.gsub(/[^@.]/, 'x')
  end

  # And replace UCS-2 ones (for Microsoft Office documents)...
  # Find emails, by finding them in parts of text that have ASCII
  # equivalents to the UCS-2
  ascii_chars = text.gsub(/\0/, "")
  emails = ascii_chars.scan(MySociety::Validate.email_find_regexp)
  # Convert back to UCS-2, making a mask at the same time
  emails.map! {|email| [
    Iconv.conv('ucs-2', 'ascii', email[0]),
    Iconv.conv('ucs-2', 'ascii', email[0].gsub(/[^@.]/, 'x'))
  ] }
  # Now search and replace the UCS-2 email with the UCS-2 mask
  for email, mask in emails
    text.gsub!(email, mask)
  end

  # Replace censor items
  text = self.info_request.apply_censor_rules_to_binary(text)

  raise "internal error in binary_mask_stuff" if text.size != orig_size

  return text
end

# Removes censored stuff from from HTML conversion of downloaded binaries
def html_mask_stuff(html)
  html = self.mask_special_emails(html)
  html = self.remove_privacy_sensitive_things(html)
  return html
end

# Lotus notes quoting yeuch!
# Works on a copy of text; each matched quoted section is replaced by two
# newlines followed by replacement.
# NOTE(review): the regexps below carry an "s" flag; in Ruby that selects a
# character encoding rather than changing what "." matches ("m" does that) -
# confirm it is intended. They are left exactly as they were.
def remove_lotus_quoting(text, replacement = "FOLDED_QUOTED_SECTION")
  text = text.dup
  name = Regexp.escape(self.info_request.user.name)

  # To end of message sections
  # http://www.whatdotheyknow.com/request/university_investment_in_the_arm
  text.gsub!(/^#{name}[^\n]+\nSent by:[^\n]+\n.*/ims, "\n\n" + replacement)

  # Some other sort of forwarding quoting
  # http://www.whatdotheyknow.com/request/224/response/326
  text.gsub!(/^#{name}[^\n]+\n[0-9\/:\s]+\s+To\s+FOI requests at.*/ims, "\n\n" + replacement)

  # http://www.whatdotheyknow.com/request/how_do_the_pct_deal_with_retirin_33#incoming-930
  # http://www.whatdotheyknow.com/request/229/response/809
  text.gsub!(/^From: [^\n]+\nSent: [^\n]+\nTo:\s+['"?]#{name}['"]?\nSubject:.*/ims, "\n\n" + replacement)

  return text
end

# Remove emails, mobile phones and other details FOI officers ask us to remove.
def remove_privacy_sensitive_things(text) text = text.dup # Remove any email addresses - we don't want bounce messages to leak out # either the requestor's email address or the request's response email # address out onto the internet text.gsub!(MySociety::Validate.email_find_regexp, "[email address]") # Mobile phone numbers # http://www.whatdotheyknow.com/request/failed_test_purchases_off_licenc#incoming-1013 # http://www.whatdotheyknow.com/request/selective_licensing_statistics_i#incoming-550 # http://www.whatdotheyknow.com/request/common_purpose_training_graduate#incoming-774 text.gsub!(/(Mobile|Mob)([\s\/]*(Fax|Tel))*\s*:?[\s\d]*\d/, "[mobile number]") # Specific removals # http://www.whatdotheyknow.com/request/total_number_of_objects_in_the_n_6 text.gsub!(/\*\*\*+\nPolly Tucker.*/ms, "") # http://www.whatdotheyknow.com/request/cctv_data_retention_and_use text.gsub!(/Andy 079.*/, "Andy [mobile number]") # http://www.whatdotheyknow.com/request/how_do_the_pct_deal_with_retirin_113 text.gsub!(/(Complaints and Corporate Affairs Officer)\s+Westminster Primary Care Trust.+/ms, "\\1") # Remove WhatDoTheyKnow signup links text.gsub!(/http:\/\/www.whatdotheyknow.com\/c\/[^\s]+/, "[WDTK login link]") # Remove Home Office survey links # e.g. 
http://www.whatdotheyknow.com/request/serious_crime_act_2007_section_7#incoming-12650 if self.info_request.public_body.url_name == 'home_office' text.gsub!(/Your password:-\s+[^\s]+/, '[password]') text.gsub!(/Password=[^\s]+/, '[password]') end # Remove things from censor rules text = self.info_request.apply_censor_rules_to_text(text) return text end # Remove quoted sections from emails (eventually the aim would be for this # to do as good a job as GMail does) XXX bet it needs a proper parser # XXX and this FOLDED_QUOTED_SECTION stuff is a mess def self.remove_quoted_sections(text, replacement = "FOLDED_QUOTED_SECTION") text = text.dup replacement = "\n" + replacement + "\n" # First do this peculiar form of quoting, as the > single line quoting # further below messes with it. Note the carriage return where it wraps - # this can happen anywhere according to length of the name/email. e.g. # >>> D K Elwell <[email address]> 17/03/2008 # 01:51:50 >>> # http://www.whatdotheyknow.com/request/71/response/108 # http://www.whatdotheyknow.com/request/police_powers_to_inform_car_insu # http://www.whatdotheyknow.com/request/secured_convictions_aided_by_cct multiline_original_message = '(' + '''>>>.* \d\d/\d\d/\d\d\d\d\s+\d\d:\d\d(?::\d\d)?\s*>>>''' + ')' text.gsub!(/^(#{multiline_original_message}\n.*)$/ms, replacement) # Single line sections text.gsub!(/^(>.*\n)/, replacement) text.gsub!(/^(On .+ (wrote|said):\n)/, replacement) # Multiple line sections # http://www.whatdotheyknow.com/request/identity_card_scheme_expenditure # http://www.whatdotheyknow.com/request/parliament_protest_actions # http://www.whatdotheyknow.com/request/64/response/102 # http://www.whatdotheyknow.com/request/47/response/283 # http://www.whatdotheyknow.com/request/30/response/166 # http://www.whatdotheyknow.com/request/52/response/238 # http://www.whatdotheyknow.com/request/224/response/328 # example with * * * * * # http://www.whatdotheyknow.com/request/297/response/506 ['-', '_', '*', '#'].each do 
|score| text.sub!(/(Disclaimer\s+)? # appears just before ( \s*(?:[#{score}]\s*){8,}\s*\n.*? # top line (disclaimer:\n|confidential|received\sthis\semail\sin\serror|virus|intended\s+recipient|monitored\s+centrally|intended\s+(for\s+|only\s+for\s+use\s+by\s+)the\s+addressee|routinely\s+monitored|MessageLabs|unauthorised\s+use) .*?((?:[#{score}]\s*){8,}\s*\n|\z) # bottom line OR end of whole string (for ones with no terminator XXX risky) ) /imx, replacement) end # Special paragraphs # http://www.whatdotheyknow.com/request/identity_card_scheme_expenditure text.gsub!(/^[^\n]+Government\s+Secure\s+Intranet\s+virus\s+scanning .*? virus\sfree\. /imx, replacement) text.gsub!(/^Communications\s+via\s+the\s+GSi\s+ .*? legal\spurposes\. /imx, replacement) # http://www.whatdotheyknow.com/request/net_promoter_value_scores_for_bb text.gsub!(/^http:\/\/www.bbc.co.uk .*? Further\s+communication\s+will\s+signify\s+your\s+consent\s+to\s+this\. /imx, replacement) # To end of message sections # http://www.whatdotheyknow.com/request/123/response/192 # http://www.whatdotheyknow.com/request/235/response/513 # http://www.whatdotheyknow.com/request/445/response/743 original_message = '(' + '''----* This is a copy of the message, including all the headers. ----*''' + '|' + '''----*\s*Original Message\s*----*''' + '|' + '''----*\s*Forwarded message.+----*''' + '|' + '''----*\s*Forwarded by.+----*''' + ')' # Could have a ^ at start here, but see messed up formatting here: # http://www.whatdotheyknow.com/request/refuse_and_recycling_collection#incoming-842 text.gsub!(/(#{original_message}\n.*)$/mi, replacement) # Some silly Microsoft XML gets into parts marked as plain text. # e.g. 
http://www.whatdotheyknow.com/request/are_traffic_wardens_paid_commiss#incoming-401 # Don't replace with "replacement" as it's pretty messy text.gsub!(/<\?xml:namespace[^>]*\/>/, " ") return text end # (This risks losing info if the unchosen alternative is the only one to contain # useful info, but let's worry about that another time) def get_attachment_leaves return get_attachment_leaves_recursive(self.mail) end def get_attachment_leaves_recursive(curr_mail, within_rfc822_attachment = nil) leaves_found = [] if curr_mail.multipart? if curr_mail.sub_type == 'alternative' # Choose best part from alternatives best_part = nil curr_mail.parts.each do |m| # Take the first one, or the last text/plain one # XXX - could do better! if not best_part best_part = m elsif m.content_type == 'text/plain' best_part = m end end leaves_found += get_attachment_leaves_recursive(best_part, within_rfc822_attachment) else # Add all parts curr_mail.parts.each do |m| leaves_found += get_attachment_leaves_recursive(m, within_rfc822_attachment) end end else # Don't allow nil content_types if curr_mail.content_type.nil? curr_mail.content_type = 'application/octet-stream' end # PDFs often come with this mime type, fix it up for view code if curr_mail.content_type == 'application/octet-stream' calc_mime = filename_and_content_to_mimetype(self.info_request.apply_censor_rules_to_text(TMail::Mail.get_part_file_name(curr_mail)), curr_mail.body) if calc_mime curr_mail.content_type = calc_mime end end # Use standard content types for Word documents etc. curr_mail.content_type = normalise_content_type(curr_mail.content_type) if curr_mail.content_type == 'message/rfc822' if curr_mail.rfc822_attachment.nil? 
# Attached mail didn't parse, so treat as text curr_mail.content_type = 'text/plain' end end # If the part is an attachment of email in text form if curr_mail.content_type == 'message/rfc822' ensure_parts_counted # fills in rfc822_attachment variable leaves_found += get_attachment_leaves_recursive(curr_mail.rfc822_attachment, curr_mail.rfc822_attachment) else # Store leaf curr_mail.within_rfc822_attachment = within_rfc822_attachment leaves_found += [curr_mail] end end return leaves_found end # Returns body text from main text part of email, converted to UTF-8, with uudecode removed def get_main_body_text # Cached as loading raw_email can be quite huge, and need this for just # search results if self.cached_main_body_text.nil? text = self.get_main_body_text_internal self.cached_main_body_text = text self.save! end text = self.cached_main_body_text # Strip the uudecode parts from main text text = text.split(/^begin.+^`\n^end\n/sm).join(" ") return text end # Returns body text from main text part of email, converted to UTF-8 def get_main_body_text_internal main_part = get_main_body_text_part return convert_part_body_to_text(main_part) end # Given a main text part, converts it to text def convert_part_body_to_text(part) if part.nil? text = "[ Email has no body, please see attachments ]" text_charset = "utf-8" else text = part.body text_charset = part.charset if part.content_type == 'text/html' # e.g. http://www.whatdotheyknow.com/request/35/response/177 # XXX This is a bit of a hack as it is calling a convert to text routine. # Could instead call a sanitize HTML one. text = IncomingMessage.get_attachment_text_internal_one_file(part.content_type, text) end end # Charset conversion, turn everything into UTF-8 if not text_charset.nil? 
begin # XXX specially convert unicode pound signs, was needed here # http://www.whatdotheyknow.com/request/88/response/352 text = text.gsub("£", Iconv.conv(text_charset, 'utf-8', '£')) # Try proper conversion text = Iconv.conv('utf-8', text_charset, text) rescue Iconv::IllegalSequence, Iconv::InvalidEncoding # Clearly specified charset was nonsense text_charset = nil end end if text_charset.nil? # No specified charset, so guess # Could use rchardet here, but it had trouble with # http://www.whatdotheyknow.com/request/107/response/144 # So I gave up - most likely in UK we'll only get windows-1252 anyway. begin # See if it is good UTF-8 anyway text = Iconv.conv('utf-8', 'utf-8', text) rescue Iconv::IllegalSequence begin # Or is it good windows-1252, most likely text = Iconv.conv('utf-8', 'windows-1252', text) rescue Iconv::IllegalSequence # Text looks like unlabelled nonsense, strip out anything that isn't UTF-8 text = Iconv.conv('utf-8//IGNORE', 'utf-8', text) + "\n\n[ WhatDoTheyKnow note: The above text was badly encoded, and has had strange characters removed. ]" end end end # An assertion that we have ended up with UTF-8 XXX can remove as this should # always be fine if code above is Iconv.conv('utf-8', 'utf-8', text) # Fix DOS style linefeeds to Unix style ones (or other later regexps won't work) # Needed for e.g. http://www.whatdotheyknow.com/request/60/response/98 text = text.gsub(/\r\n/, "\n") # Compress extra spaces down to save space, and to stop regular expressions # breaking in strange extreme cases. e.g. 
for # http://www.whatdotheyknow.com/request/spending_on_consultants text = text.gsub(/ +/, " ") return text end # Returns part which contains main body text, or nil if there isn't one def get_main_body_text_part leaves = get_attachment_leaves # Find first part which is text/plain leaves.each do |p| if p.content_type == 'text/plain' return p end end # Otherwise first part which is any sort of text leaves.each do |p| if p.main_type == 'text' return p end end # ... or if none, consider first part p = leaves[0] # if it is a known type then don't use it, return no body (nil) if mimetype_to_extension(p.content_type) # this is guess of case where there are only attachments, no body text # e.g. http://www.whatdotheyknow.com/request/cost_benefit_analysis_for_real_n return nil end # otherwise return it assuming it is text (sometimes you get things # like binary/octet-stream, or the like, which are really text - XXX if # you find an example, put URL here - perhaps we should be always returning # nil in this case) return p end # Returns attachments that are uuencoded in main body part def get_main_body_text_uudecode_attachments text = get_main_body_text_internal # Find any uudecoded things buried in it, yeuchly uus = text.scan(/^begin.+^`\n^end\n/sm) attachments = [] for uu in uus # Decode the string content = nil tempfile = Tempfile.new('foiuu') tempfile.print uu tempfile.flush IO.popen("/usr/bin/uudecode " + tempfile.path + " -o -", "r") do |child| content = child.read() end tempfile.close # Make attachment type from it, working out filename and mime type attachment = FOIAttachment.new() attachment.body = content attachment.filename = self.info_request.apply_censor_rules_to_text(uu.match(/^begin\s+[0-9]+\s+(.*)$/)[1]) calc_mime = filename_and_content_to_mimetype(attachment.filename, attachment.body) if calc_mime calc_mime = normalise_content_type(calc_mime) attachment.content_type = calc_mime else attachment.content_type = 'application/octet-stream' end attachments += 
[attachment] end return attachments end # Returns all attachments for use in display code def get_attachments_for_display ensure_parts_counted main_part = get_main_body_text_part leaves = get_attachment_leaves attachments = [] for leaf in leaves if leaf != main_part attachment = FOIAttachment.new attachment.body = leaf.body attachment.filename = self.info_request.apply_censor_rules_to_text(TMail::Mail.get_part_file_name(leaf)) if leaf.within_rfc822_attachment attachment.within_rfc822_subject = leaf.within_rfc822_attachment.subject # Test to see if we are in the first part of the attached # RFC822 message and it is text, if so add headers. # XXX should probably use hunting algorithm to find main text part, rather than # just expect it to be first. This will do for now though. # Example request that needs this: # http://www.whatdotheyknow.com/request/2923/response/7013/attach/2/Cycle%20Path%20Bank.txt if leaf.within_rfc822_attachment == leaf && leaf.content_type == 'text/plain' headers = "" for header in [ 'Date', 'Subject', 'From', 'To', 'Cc' ] if leaf.within_rfc822_attachment.header.include?(header.downcase) headers = headers + header + ": " + leaf.within_rfc822_attachment.header[header.downcase].to_s + "\n" end end # XXX call convert_part_body_to_text here, but need to get charset somehow # e.g. http://www.whatdotheyknow.com/request/1593/response/3088/attach/4/Freedom%20of%20Information%20request%20-%20car%20oval%20sticker:%20Article%2020,%20Convention%20on%20Road%20Traffic%201949.txt attachment.body = headers + "\n" + attachment.body # This is quick way of getting all headers, but instead we only add some a) to # make it more usable, b) as at least one authority accidentally leaked security # information into a header. 
#attachment.body = leaf.within_rfc822_attachment.port.to_s end end attachment.content_type = leaf.content_type attachment.url_part_number = leaf.url_part_number attachments += [attachment] end end uudecode_attachments = get_main_body_text_uudecode_attachments c = @count_first_uudecode_count for uudecode_attachment in uudecode_attachments c += 1 uudecode_attachment.url_part_number = c attachments += [uudecode_attachment] end return attachments end # Returns body text as HTML with quotes flattened, and emails removed. def get_body_for_html_display(collapse_quoted_sections = true) # Find the body text and remove emails for privacy/anti-spam reasons text = get_main_body_text text = self.mask_special_emails(text) text = self.remove_privacy_sensitive_things(text) # Remove quoted sections, adding HTML. XXX The FOLDED_QUOTED_SECTION is # a nasty hack so we can escape other HTML before adding the unfold # links, without escaping them. Rather than using some proper parser # making a tree structure (I don't know of one that is to hand, that # works well in this kind of situation, such as with regexps). folded_quoted_text = self.remove_lotus_quoting(text, 'FOLDED_QUOTED_SECTION') folded_quoted_text = IncomingMessage.remove_quoted_sections(folded_quoted_text, 'FOLDED_QUOTED_SECTION') if collapse_quoted_sections text = folded_quoted_text end text = MySociety::Format.simplify_angle_bracketed_urls(text) text = CGI.escapeHTML(text) text = MySociety::Format.make_clickable(text, :contract => 1) text.gsub!(/\[(email address|mobile number)\]/, '[\1]') if collapse_quoted_sections text = text.gsub(/(\s*FOLDED_QUOTED_SECTION\s*)+/m, "FOLDED_QUOTED_SECTION") text.strip! 
# if there is nothing but quoted stuff, then show the subject if text == "FOLDED_QUOTED_SECTION" text = "[Subject only] " + CGI.escapeHTML(self.mail.subject) + text end # and display link for quoted stuff text = text.gsub(/FOLDED_QUOTED_SECTION/, "\n\n" + 'show quoted sections' + "\n\n") else if folded_quoted_text.include?('FOLDED_QUOTED_SECTION') text = text + "\n\n" + 'hide quoted sections' end end text.strip! text = text.gsub(/\n/, '