# == Schema Information
# Schema version: 56
#
# Table name: incoming_messages
#
# id :integer not null, primary key
# info_request_id :integer not null
# raw_data :text not null
# created_at :datetime not null
# updated_at :datetime not null
# cached_attachment_text :text
#
# models/incoming_message.rb:
# An (email) message from really anybody to be logged with a request. e.g. A
# response from the public body.
#
# Copyright (c) 2007 UK Citizens Online Democracy. All rights reserved.
# Email: francis@mysociety.org; WWW: http://www.mysociety.org/
#
# $Id: incoming_message.rb,v 1.119 2008-07-10 13:32:01 francis Exp $
# TODO
# Move some of the (e.g. quoting) functions here into rblib, as they feel
# general not specific to IncomingMessage.
require 'htmlentities'
module TMail
  class Mail
    # Number given to this part when the parts of a message are counted, for use in URLs
    attr_accessor :url_part_number
    # when a whole email message is attached as text, the parsed form of that message
    attr_accessor :rfc822_attachment

    # Monkeypatch! (check to see if this becomes a standard function in
    # TMail::Mail, then use that, whatever it is called)
    #
    # Works out a file name for a mail part. Sources are tried in this order,
    # and the first one that yields a value wins:
    #   1. the body of the Content-Location header
    #   2. the "name" parameter of Content-Type
    #   3. the "filename" parameter of Content-Disposition
    # Returns whatever the last source gave (nil/false) when none has a value.
    def self.get_part_file_name(part)
      location = part['content-location']
      from_location = location && location.body
      return from_location if from_location

      part.sub_header("content-type", "name") ||
        part.sub_header("content-disposition", "filename")
    end
  end
end
# To add an image, create a file with appropriate name corresponding to the
# mime type in public/images e.g. icon_image_tiff_large.png
# Canonical file extension for each mime type we know about. Only these
# entries feed the inverse map below, so each mime type maps back to exactly
# one extension.
$file_extension_to_mime_type = {
  "txt" => 'text/plain',
  "pdf" => 'application/pdf',
  "rtf" => 'application/rtf',
  "doc" => 'application/vnd.ms-word',
  "xls" => 'application/vnd.ms-excel',
  "ppt" => 'application/vnd.ms-powerpoint',
  "tif" => 'image/tiff',
  "gif" => 'image/gif',
  "jpg" => 'image/jpeg',
  "html" => 'text/html',
}
# Inverse map (mime type => canonical extension). It is built before the
# aliases below are added, so the extension chosen for a mime type never
# depends on hash ordering.
$file_extension_to_mime_type_rev = $file_extension_to_mime_type.invert
# Alternative spellings of extensions already listed above. They are
# recognised when mapping a file name to a mime type, but are never produced
# when mapping a mime type back to an extension.
$file_extension_to_mime_type["jpeg"] = 'image/jpeg'
$file_extension_to_mime_type["htm"] = 'text/html'
# XXX clearly this shouldn't be a global function, or the above global vars.
# filename:: name of a file, or nil
#
# Looks up the text after the last "." of the name (lower-cased) in
# $file_extension_to_mime_type. Returns the mime type found there, or nil
# when there is no name, no extension, or the extension is not in the table.
def filename_to_mimetype(filename)
  return nil unless filename

  ext_match = filename.match(/\.([^.]+)$/i)
  return nil unless ext_match

  $file_extension_to_mime_type[ext_match[1].downcase]
end
# mime:: a mime type string such as 'application/pdf'
#
# Returns the file extension (without a dot) that
# $file_extension_to_mime_type_rev holds for the mime type, or nil when the
# mime type is not in that table.
def mimetype_to_extension(mime)
  $file_extension_to_mime_type_rev[mime]
end
# This is the type which is used to send data about attachments to the view
class FOIAttachment
  attr_accessor :body
  attr_accessor :content_type
  attr_accessor :filename
  attr_accessor :url_part_number

  # Name to show for the attachment: the stored filename when there is one,
  # otherwise "attachment." plus the extension that mimetype_to_extension
  # gives for the content type, falling back to "attachment.bin".
  def display_filename
    return @filename if @filename

    extension = mimetype_to_extension(@content_type)
    extension ? "attachment." + extension : "attachment.bin"
  end
end
class IncomingMessage < ActiveRecord::Base
# Each incoming message is logged against one request, which must be present
belongs_to :info_request
validates_presence_of :info_request
# The raw email text must be present; the mail method parses it
validates_presence_of :raw_data
# Outgoing messages (OutgoingMessage rows) whose incoming_message_followup_id
# refers to this message
has_many :outgoing_message_followups, :foreign_key => 'incoming_message_followup_id', :class_name => 'OutgoingMessage'
# Return the structured TMail::Mail object
# Documentation at http://i.loveruby.net/en/projects/tmail/doc/
# The message is parsed from raw_data the first time this is called and kept
# in @mail afterwards; base64_decode is called on it straight after parsing.
# Returns nil while there is no raw_data to parse.
def mail
  if @mail.nil?
    raw = self.raw_data
    unless raw.nil?
      @mail = TMail::Mail.parse(raw)
      @mail.base64_decode
    end
  end
  @mail
end
# Number the attachments in depth first tree order, for use in URLs.
# Does nothing when there is no parsed mail. Otherwise resets the part
# counter, numbers every part via count_parts_recursive, and remembers the
# final count in @count_first_uudecode_count.
def after_initialize
  parsed = self.mail
  return if parsed.nil?

  @count_parts_count = 0
  count_parts_recursive(parsed)
  # we carry on using these numeric ids for attachments uudecoded from within text parts
  @count_first_uudecode_count = @count_parts_count
end
# Walks the part tree depth first, giving each leaf part the next value of
# @count_parts_count as its url_part_number.
#   * multipart containers are not numbered, only their children
#   * message/rfc822 parts are parsed into rfc822_attachment and that
#     embedded message is walked instead of numbering the part itself
def count_parts_recursive(part)
  if part.multipart?
    part.parts.each { |child| count_parts_recursive(child) }
  elsif part.content_type == 'message/rfc822'
    # An email attached as text
    # e.g. http://www.whatdotheyknow.com/request/64/response/102
    embedded = TMail::Mail.parse(part.body)
    part.rfc822_attachment = embedded
    count_parts_recursive(embedded)
  else
    @count_parts_count += 1
    part.url_part_number = @count_parts_count
  end
end
# And look up by URL part number to get an attachment
# attachments:: collection of objects responding to url_part_number
# found_url_part_number:: the number to look for
#
# Returns the first attachment whose url_part_number equals
# found_url_part_number, or nil when none does. This is a pure lookup: it
# does not modify the attachments or any counter state.
def self.get_attachment_by_url_part_number(attachments, found_url_part_number)
  attachments.find { |a| a.url_part_number == found_url_part_number }
end
# Return date mail was sent
# Prefers the date held by the parsed mail; when that is missing, falls back
# to the time the record was created (i.e. when the mail arrived).
def sent_at
  header_date = self.mail.date
  return header_date if header_date

  self.created_at
end
# Converts email addresses we know about into textual descriptions of them
# text:: string to mask; it is not modified, a new string is returned
#
# Replaces, in this order:
#   * the public body's request email (only when the body is requestable)
#   * the request's own incoming email
#   * the site contact email (CONTACT_EMAIL from the config)
# with a description of the address in square brackets.
def mask_special_emails(text)
  # XXX can later display some of these special emails as actual emails,
  # if they are public anyway. For now just be precautionary and only
  # put in descriptions of them in square brackets.
  request = self.info_request
  authority = request.public_body
  if authority.is_requestable?
    text = text.gsub(authority.request_email, "[" + authority.short_or_long_name + " request email]")
  end
  text = text.gsub(request.incoming_email, "[FOI #" + request.id.to_s + " email]")
  contact_email = MySociety::Config.get("CONTACT_EMAIL", 'contact@localhost')
  text.gsub(contact_email, "[WhatDoTheyKnow contact email]")
end
# Replaces emails we know about in (possibly binary data) with equal length alternative ones.
# text:: data to mask (may be binary); a masked copy is returned
#
# Runs IncomingMessage.mask_string_multicharset over the text once for each
# address we know about: the public body's request email (only when the body
# is requestable), the request's incoming email, the site contact email, and
# one hard-coded address.
def binary_mask_special_emails(text)
  known_emails = []
  if info_request.public_body.is_requestable?
    known_emails << self.info_request.public_body.request_email
  end
  known_emails << self.info_request.incoming_email
  known_emails << MySociety::Config.get("CONTACT_EMAIL", 'contact@localhost')
  known_emails << "foi" + "@" + "sandwich.ukcod.org.uk" # gets in some due to temporary bug

  known_emails.each do |known_email|
    text = IncomingMessage.mask_string_multicharset(text, known_email)
  end
  text
end
# Helper for binary_mask_special_emails. Masks out an email from some
# (binary) text, replacing with something of similar size. Does it for
# common fixed-width multibyte character sets used in word documents etc.
# text:: the (possibly binary) data to search
# email:: the address to mask out
#
# Returns a copy of text in which each occurrence of the email (matched
# case-insensitively) is replaced by a mask of the same byte length. The mask
# keeps "@" and "." and turns every other character into "X". The search is
# done once per encoding listed below; encodings the email cannot be converted
# to are skipped. Raises a RuntimeError if an encoded mask would differ in
# size from the encoded email.
def self.mask_string_multicharset(text, email)
  mask_with = email.gsub(/[^@.]/, 'X')
  ['ascii', 'ucs-2'].each do |encoding|
    begin
      email_enc = Iconv.conv(encoding, 'ascii', email)
      mask_with_enc = Iconv.conv(encoding, 'ascii', mask_with)
      # we mustn't change size of the binary
      raise "email/mask size mismatch in binary email mask" if email_enc.size != mask_with_enc.size
      # Escape the address so it is matched literally: unescaped, "." would
      # match any byte and "+" would act as a quantifier.
      email_pattern = Regexp.new(Regexp.escape(email_enc), Regexp::IGNORECASE)
      text = text.gsub(email_pattern, mask_with_enc)
    rescue Iconv::IllegalSequence, Iconv::InvalidEncoding
      # just forget it, if not expressible in it
    end
  end
  return text
end
# Lotus notes quoting yeuch!
# text:: message text; it is not modified, a processed copy is returned
# replacement:: marker put in place of each removed quoted section
#
# Looks for three shapes of quoted header that mention the requesting user's
# name and replaces everything from that header to the end of the text with
# two newlines followed by the replacement marker.
def remove_lotus_quoting(text, replacement = "FOLDED_QUOTED_SECTION")
  text = text.dup
  # The name goes into regular expressions below, so escape it: a name with
  # characters such as "(", ")", "." or "+" must be matched literally and
  # must not produce an invalid pattern.
  name = Regexp.escape(self.info_request.user.name)

  # NOTE(review): the trailing "s" on these patterns is an encoding flag in
  # Ruby, not Perl's dot-matches-newline (that is "m") - left as it was,
  # confirm whether it is intended.

  # To end of message sections
  # http://www.whatdotheyknow.com/request/university_investment_in_the_arm
  text.gsub!(/^#{name}[^\n]+\nSent by:[^\n]+\n.*/ims, "\n\n" + replacement)

  # Some other sort of forwarding quoting
  # http://www.whatdotheyknow.com/request/224/response/326
  text.gsub!(/^#{name}[^\n]+\n[0-9\/:\s]+\s+To\s+FOI requests at.*/ims, "\n\n" + replacement)

  # http://www.whatdotheyknow.com/request/how_do_the_pct_deal_with_retirin_33#incoming-930
  # http://www.whatdotheyknow.com/request/229/response/809
  # NOTE(review): the class ['"?] before the name requires one of ' " ? to be
  # present; it may have been meant as an optional quote - confirm.
  text.gsub!(/^From: [^\n]+\nSent: [^\n]+\nTo:\s+['"?]#{name}['"]?\nSubject:.*/ims, "\n\n" + replacement)

  return text
end
# Remove emails, mobile phones and other details FOI officers ask us to remove.
# text:: message text; it is not modified, a scrubbed copy is returned
#
# Applies each [pattern, substitute] pair below to the copy, in order.
def self.remove_privacy_sensitive_things(text)
  scrubbed = text.dup

  removals = [
    # Remove any email addresses - we don't want bounce messages to leak out
    # either the requestor's email address or the request's response email
    # address out onto the internet
    [MySociety::Validate.email_find_regexp, "[email address]"],
    # Mobile phone numbers
    # http://www.whatdotheyknow.com/request/failed_test_purchases_off_licenc#incoming-1013
    # http://www.whatdotheyknow.com/request/selective_licensing_statistics_i#incoming-550
    # http://www.whatdotheyknow.com/request/common_purpose_training_graduate#incoming-774
    [/(Mobile|Mob)([\s\/]*(Fax|Tel))*\s*:?[\s\d]*\d/, "[mobile number]"],
    # Specific removals
    # http://www.whatdotheyknow.com/request/total_number_of_objects_in_the_n_6
    [/\*\*\*+\nPolly Tucker.*/ms, ""]
  ]

  removals.each do |pattern, substitute|
    scrubbed.gsub!(pattern, substitute)
  end

  scrubbed
end
# Remove quoted sections from emails (eventually the aim would be for this
# to do as good a job as GMail does) XXX bet it needs a proper parser
# XXX and this FOLDED_QUOTED_SECTION stuff is a mess
# text:: message text; it is not modified, all changes are made to a copy
# replacement:: marker put where a section is removed (a newline is added on
# each side of it before use)
#
# Returns the processed copy. The substitutions below run in sequence on the
# same string, so each one sees the result of those before it.
def self.remove_quoted_sections(text, replacement = "FOLDED_QUOTED_SECTION")
text = text.dup
replacement = "\n" + replacement + "\n"
# First do this peculiar form of quoting, as the > single line quoting
# further below messes with it. Note the carriage return where it wraps -
# this can happen anywhere according to length of the name/email. e.g.
# >>> D K Elwell <[email address]> 17/03/2008
# 01:51:50 >>>
# http://www.whatdotheyknow.com/request/71/response/108
# http://www.whatdotheyknow.com/request/police_powers_to_inform_car_insu
# http://www.whatdotheyknow.com/request/secured_convictions_aided_by_cct
# (''' is not a triple-quoted string in Ruby: it parses as adjacent
# single-quoted literals - empty, the pattern, empty - which are joined)
multiline_original_message = '(' + '''>>>.* \d\d/\d\d/\d\d\d\d\s+\d\d:\d\d(?::\d\d)?\s*>>>''' + ')'
# Everything from that header line to the end of the text is replaced
text.gsub!(/^(#{multiline_original_message}\n.*)$/ms, replacement)
# Single line sections
# Every line starting with > is replaced individually, as is each
# "On ... wrote:" / "On ... said:" line
text.gsub!(/^(>.*\n)/, replacement)
text.gsub!(/^(On .+ (wrote|said):\n)/, replacement)
# Multiple line sections
# http://www.whatdotheyknow.com/request/identity_card_scheme_expenditure
# http://www.whatdotheyknow.com/request/parliament_protest_actions
# http://www.whatdotheyknow.com/request/64/response/102
# http://www.whatdotheyknow.com/request/47/response/283
# http://www.whatdotheyknow.com/request/30/response/166
# http://www.whatdotheyknow.com/request/52/response/238
# http://www.whatdotheyknow.com/request/224/response/328 # example with * * * * *
# http://www.whatdotheyknow.com/request/297/response/506
# For each rule character: a run of at least eight of them, then one of the
# disclaimer phrases, then a closing rule or the end of the text. sub! is
# used, so only the first such section per rule character is replaced.
['-', '_', '*', '#'].each do |score|
text.sub!(/(Disclaimer\s+)? # appears just before
(
\s*(?:[#{score}]\s*){8,}\s*\n.*? # top line
(disclaimer:\n|confidential|received\sthis\semail\sin\serror|virus|intended\s+recipient|monitored\s+centrally|intended\s+(for\s+|only\s+for\s+use\s+by\s+)the\s+addressee|routinely\s+monitored|MessageLabs|unauthorised\s+use)
.*?((?:[#{score}]\s*){8,}\s*\n|\z) # bottom line OR end of whole string (for ones with no terminator XXX risky)
)
/imx, replacement)
end
# Special paragraphs
# http://www.whatdotheyknow.com/request/identity_card_scheme_expenditure
# Each of these removes from a known opening phrase at the start of a line
# through to a known closing phrase
text.gsub!(/^[^\n]+Government\s+Secure\s+Intranet\s+virus\s+scanning
.*?
virus\sfree\.
/imx, replacement)
text.gsub!(/^Communications\s+via\s+the\s+GSi\s+
.*?
legal\spurposes\.
/imx, replacement)
# http://www.whatdotheyknow.com/request/net_promoter_value_scores_for_bb
text.gsub!(/^http:\/\/www.bbc.co.uk
.*?
Further\s+communication\s+will\s+signify\s+your\s+consent\s+to\s+this\.
/imx, replacement)
# To end of message sections
# http://www.whatdotheyknow.com/request/123/response/192
# http://www.whatdotheyknow.com/request/235/response/513
# http://www.whatdotheyknow.com/request/445/response/743
# Alternatives for the line that introduces a copied, original or forwarded
# message; everything from such a line to the end of the text is replaced
original_message =
'(' + '''----* This is a copy of the message, including all the headers. ----*''' +
'|' + '''----*\s*Original Message\s*----*''' +
'|' + '''----*\s*Forwarded message.+----*''' +
'|' + '''----*\s*Forwarded by.+----*''' +
')'
# Could have a ^ at start here, but see messed up formatting here:
# http://www.whatdotheyknow.com/request/refuse_and_recycling_collection#incoming-842
text.gsub!(/(#{original_message}\n.*)$/mi, replacement)
# Some silly Microsoft XML gets into parts marked as plain text.
# e.g. http://www.whatdotheyknow.com/request/are_traffic_wardens_paid_commiss#incoming-401
# Don't replace with "replacement" as it's pretty messy
text.gsub!(/<\?xml:namespace[^>]*\/>/, " ")
return text
end
# Flattens all the attachments, picking only one part where there are alternatives.
# (This risks losing info if the unchosen alternative is the only one to contain
# useful info, but let's worry about that another time)
# Entry point for the recursive walk: starts at the parsed mail and returns
# what get_attachment_leaves_recursive returns for it.
def get_attachment_leaves
  root_part = self.mail
  get_attachment_leaves_recursive(root_part)
end
# curr_mail:: the mail or mail part to flatten
#
# Returns an array of the leaf parts under curr_mail, in depth first order.
#   * multipart/alternative: only one child is followed - the last
#     text/plain one, or the first child when none is text/plain. A container
#     with no children contributes no leaves.
#   * any other multipart: every child is followed.
#   * message/rfc822: the embedded message in rfc822_attachment is followed.
#   * anything else is a leaf.
# Side effect: the content_type of a leaf is rewritten in place when it is
# application/octet-stream with a recognised file name, or one of the legacy
# Microsoft Office types listed below.
def get_attachment_leaves_recursive(curr_mail)
  if curr_mail.multipart?
    if curr_mail.sub_type == 'alternative'
      # Choose best part from alternatives
      # Take the first one, or the last text/plain one
      # XXX - could do better!
      best_part = nil
      curr_mail.parts.each do |m|
        best_part = m if best_part.nil? || m.content_type == 'text/plain'
      end
      # A malformed message can have an alternative container without parts
      return [] if best_part.nil?
      return get_attachment_leaves_recursive(best_part)
    end

    # Add all parts
    leaves_found = []
    curr_mail.parts.each do |m|
      leaves_found += get_attachment_leaves_recursive(m)
    end
    return leaves_found
  end

  # PDFs often come with this mime type, fix it up for view code
  if curr_mail.content_type == 'application/octet-stream'
    calc_mime = filename_to_mimetype(TMail::Mail.get_part_file_name(curr_mail))
    curr_mail.content_type = calc_mime if calc_mime
  end

  # e.g. http://www.whatdotheyknow.com/request/93/response/250
  legacy_office_types = {
    'application/msexcel' => 'application/vnd.ms-excel',
    'application/mspowerpoint' => 'application/vnd.ms-powerpoint',
    'application/msword' => 'application/vnd.ms-word'
  }
  canonical_type = legacy_office_types[curr_mail.content_type]
  curr_mail.content_type = canonical_type if canonical_type

  # If the part is an attachment of email in text form
  if curr_mail.content_type == 'message/rfc822'
    # This has been expanded from text to an email in count_parts_recursive above
    return get_attachment_leaves_recursive(curr_mail.rfc822_attachment)
  end

  # Store leaf
  [curr_mail]
end
# Returns body text from main text part of email, converted to UTF-8, with uudecode removed
# Takes the text from get_main_body_text_internal, cuts it at every
# uuencoded block (a line starting "begin" through to the closing "`" and
# "end" lines) and joins the remaining pieces with a single space.
def get_main_body_text
  uuencoded_block = /^begin.+^`\n^end\n/sm
  # Strip the uudecode parts from main text
  pieces = get_main_body_text_internal.split(uuencoded_block)
  pieces.join(" ")
end
# Returns body text from main text part of email, converted to UTF-8
def get_main_body_text_internal
main_part = get_main_body_text_part
if main_part.nil?
text = "[ Email has no body, please see attachments ]"
text_charset = "utf-8"
else
text = main_part.body
text_charset = main_part.charset
if main_part.content_type == 'text/html'
# XXX could use better HTML to text conversion than this!
# (it only matters for emails without a text part, so not a massive deal
# e.g. http://www.whatdotheyknow.com/request/35/response/177 )
text.gsub!(/
]+>/, "\n")
text.gsub!(/
]+>/, "\n\n") text.gsub!(/