aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--app/models/incoming_message_new.rb.new870
-rw-r--r--app/models/request_mailer.rb4
2 files changed, 872 insertions, 2 deletions
diff --git a/app/models/incoming_message_new.rb.new b/app/models/incoming_message_new.rb.new
new file mode 100644
index 000000000..62908f7fb
--- /dev/null
+++ b/app/models/incoming_message_new.rb.new
@@ -0,0 +1,870 @@
+# coding: utf-8
+
+# == Schema Information
+# Schema version: 114
+#
+# Table name: incoming_messages
+#
+# id :integer not null, primary key
+# info_request_id :integer not null
+# created_at :datetime not null
+# updated_at :datetime not null
+# raw_email_id :integer not null
+# cached_attachment_text_clipped :text
+# cached_main_body_text_folded :text
+# cached_main_body_text_unfolded :text
+# subject :text
+# mail_from_domain :text
+# valid_to_reply_to :boolean
+# last_parsed :datetime
+# mail_from :text
+# sent_at :datetime
+
+# models/incoming_message.rb:
+# An (email) message from really anybody to be logged with a request. e.g. A
+# response from the public body.
+#
+# Copyright (c) 2007 UK Citizens Online Democracy. All rights reserved.
+# Email: francis@mysociety.org; WWW: http://www.mysociety.org/
+#
+# $Id: incoming_message.rb,v 1.228 2009-10-21 11:24:14 francis Exp $
+
+# TODO
+# Move some of the (e.g. quoting) functions here into rblib, as they feel
+# general not specific to IncomingMessage.
+
+require 'alaveteli_file_types'
+require 'htmlentities'
+require 'rexml/document'
+require 'zip/zip'
+require 'mapi/msg'
+require 'mapi/convert'
+
+class IncomingMessageNew < ActiveRecord::Base
+ belongs_to :info_request
+ validates_presence_of :info_request
+
+ validates_presence_of :raw_email
+
+ has_many :outgoing_message_followups, :foreign_key => 'incoming_message_followup_id', :class_name => 'OutgoingMessage'
+ has_many :foi_attachments, :order => 'id'
+ has_many :info_request_events # never really has many, but could in theory
+
+ belongs_to :raw_email
+
+ # Note that this may be via either the tmail or mail gems, depending on if we
+ # are running ruby 1.8 or 1.9
+ include MailParsing
+
+ # See binary_mask_stuff function below. It just tests for inclusion
+ # in this hash, not the value of the right hand side.
+ DoNotBinaryMask = {
+ 'image/tiff' => 1,
+ 'image/gif' => 1,
+ 'image/jpeg' => 1,
+ 'image/png' => 1,
+ 'image/bmp' => 1,
+ 'application/zip' => 1,
+ }
+
+ def mail(force = nil)
+ if (!force.nil? || @mail.nil?) && !self.raw_email.nil?
+ @mail = MailParsing.mail_from_raw_email(self.raw_email.data)
+ end
+ @mail
+ end
+
+ # Return false if for some reason this is a message that we shouldn't let them reply to
+ def _calculate_valid_to_reply_to
+
+ email = get_from_address()
+ return false if !email
+ return false if !MySociety::Validate.is_valid_email(email)
+
+ # reject postmaster - authorities seem to nearly always not respond to
+ # email to postmaster, and it tends to only happen after delivery failure.
+ # likewise Mailer-Daemon, Auto_Reply...
+ prefix = email
+ prefix =~ /^(.*)@/
+ prefix = $1
+ if !prefix.nil? && prefix.downcase.match(/^(postmaster|mailer-daemon|auto_reply|donotreply|no.reply)$/)
+ return false
+ end
+ if !self.mail['return-path'].nil? && self.mail['return-path'].addr == "<>"
+ return false
+ end
+ if !self.mail['auto-submitted'].nil?
+ return false
+ end
+ return true
+ end
+
+ # Re-extracts attachments and fills the cached mail-derived columns (sent_at,
+ # subject, mail_from, mail_from_domain, valid_to_reply_to) when force is non-nil
+ # or last_parsed is unset; cached so they can be regenerated after parsing bugs.
+ def parse_raw_email!(force = nil)
+ if self.raw_email.nil?
+ raise "Incoming message id=#{id} has no raw_email"
+ end
+ if (!force.nil? || self.last_parsed.nil?)
+ ActiveRecord::Base.transaction do
+ self.extract_attachments!
+ self.sent_at = self.mail.date || self.created_at
+ self.subject = self.mail.subject
+ self.mail_from = get_from_name
+ from_address = get_from_address
+ if from_address
+ self.mail_from_domain = PublicBody.extract_domain_from_email(from_address)
+ else
+ self.mail_from_domain = ""
+ end
+ self.valid_to_reply_to = self._calculate_valid_to_reply_to
+ self.last_parsed = Time.now
+ self.save!
+ end
+ end
+ end
+
+ def valid_to_reply_to?
+ return self.valid_to_reply_to
+ end
+
+ # The cached fields mentioned in the previous comment
+ # XXX there must be a nicer way to do this without all that
+ # repetition. I tried overriding method_missing but got some
+ # unpredictable results.
+ def valid_to_reply_to
+ parse_raw_email!
+ super
+ end
+ def sent_at
+ parse_raw_email!
+ super
+ end
+ def subject
+ parse_raw_email!
+ super
+ end
+
+ # Returns the name of the person the incoming message is from, or nil if
+ # there isn't one or if there is only an email address.
+ def mail_from
+ parse_raw_email!
+ super
+ end
+
+ def safe_mail_from
+ if !self.mail_from.nil?
+ mail_from = self.mail_from.dup
+ self.info_request.apply_censor_rules_to_text!(mail_from)
+ return mail_from
+ end
+ end
+ def mail_from_domain
+ parse_raw_email!
+ super
+ end
+
+ # And look up by URL part number to get an attachment
+ # XXX relies on extract_attachments calling ensure_parts_counted
+ def self.get_attachment_by_url_part_number(attachments, found_url_part_number)
+ attachments.each do |a|
+ if a.url_part_number == found_url_part_number
+ return a
+ end
+ end
+ return nil
+ end
+
+ # Converts email addresses we know about into textual descriptions of them
+ def mask_special_emails!(text)
+ # XXX can later display some of these special emails as actual emails,
+ # if they are public anyway. For now just be precautionary and only
+ # put in descriptions of them in square brackets.
+ if self.info_request.public_body.is_followupable?
+ text.gsub!(self.info_request.public_body.request_email, _("[{{public_body}} request email]", :public_body => self.info_request.public_body.short_or_long_name))
+ end
+ text.gsub!(self.info_request.incoming_email, _('[FOI #{{request}} email]', :request => self.info_request.id.to_s) )
+ text.gsub!(MySociety::Config.get("CONTACT_EMAIL", 'contact@localhost'), _("[{{site_name}} contact email]", :site_name => MySociety::Config.get('SITE_NAME', 'Alaveteli')) )
+ end
+
+ # Replaces all email addresses in (possibly binary data) with equal length alternative ones.
+ # Also replaces censor items
+ def binary_mask_stuff!(text, content_type)
+ # See if content type is one that we mask - things like zip files and
+ # images may get broken if we try to. We err on the side of masking too
+ # much, as many unknown types will really be text.
+ if DoNotBinaryMask.include?(content_type)
+ return
+ end
+ # Special cases for some content types
+ if content_type == 'application/pdf'
+ uncompressed_text = nil
+ uncompressed_text = AlaveteliExternalCommand.run("pdftk", "-", "output", "-", "uncompress", :stdin_string => text)
+
+ # if we managed to uncompress the PDF...
+ if !uncompressed_text.nil? && !uncompressed_text.empty?
+ # then censor stuff (making a copy so can compare again in a bit)
+ censored_uncompressed_text = uncompressed_text.dup
+ self._binary_mask_stuff_internal!(censored_uncompressed_text)
+ # if the censor rule removed something...
+ if censored_uncompressed_text != uncompressed_text
+ # then use the altered file (recompressed)
+ recompressed_text = nil
+ if MySociety::Config.get('USE_GHOSTSCRIPT_COMPRESSION') == true
+ command = ["gs", "-sDEVICE=pdfwrite", "-dCompatibilityLevel=1.4", "-dPDFSETTINGS=/screen", "-dNOPAUSE", "-dQUIET", "-dBATCH", "-sOutputFile=-", "-"]
+ else
+ command = ["pdftk", "-", "output", "-", "compress"]
+ end
+ recompressed_text = AlaveteliExternalCommand.run(*(command + [{:stdin_string=>censored_uncompressed_text}]))
+ if recompressed_text.nil? || recompressed_text.empty?
+ # buggy versions of pdftk sometimes fail on
+ # compression, I don't see it's a disaster in
+ # these cases to save an uncompressed version?
+ recompressed_text = censored_uncompressed_text
+ logger.warn "Unable to compress PDF; problem with your pdftk version?"
+ end
+ if !recompressed_text.nil? && !recompressed_text.empty?
+ text.replace recompressed_text
+ end
+ end
+ end
+ return
+ end
+ self._binary_mask_stuff_internal!(text)
+ end
+
+ # Used by binary_mask_stuff - replace text in place
+ def _binary_mask_stuff_internal!(text)
+ # Keep original size, so can check haven't resized it
+ orig_size = text.size
+ # Replace ASCII email addresses...
+ text.gsub!(MySociety::Validate.email_find_regexp) do |email|
+ email.gsub(/[^@.]/, 'x')
+ end
+ # And replace UCS-2 ones (for Microsoft Office documents)...
+ # Find emails, by finding them in parts of text that have ASCII
+ # equivalents to the UCS-2
+ ascii_chars = text.gsub(/\0/, "")
+ emails = ascii_chars.scan(MySociety::Validate.email_find_regexp)
+ # Convert back to UCS-2, making a mask at the same time
+ emails.map! {|email| [
+ Iconv.conv('ucs-2le', 'ascii', email[0]),
+ Iconv.conv('ucs-2le', 'ascii', email[0].gsub(/[^@.]/, 'x'))
+ ] }
+ # Now search and replace the UCS-2 email with the UCS-2 mask
+ for email, mask in emails
+ text.gsub!(email, mask)
+ end
+ # Replace censor items
+ self.info_request.apply_censor_rules_to_binary!(text)
+
+ raise "internal error in binary_mask_stuff" if text.size != orig_size
+ return text
+ end
+
+ # Removes censored stuff from HTML conversion of downloaded binaries
+ def html_mask_stuff!(html)
+ self.mask_special_emails!(html)
+ self.remove_privacy_sensitive_things!(html)
+ end
+
+ # Returns a copy of text with Lotus Notes style quoted sections folded into replacement
+ def remove_lotus_quoting(text, replacement = "FOLDED_QUOTED_SECTION")
+ text = text.dup
+ name = Regexp.escape(self.info_request.user_name)
+
+ # To end of message sections
+ # http://www.whatdotheyknow.com/request/university_investment_in_the_arm
+ text.gsub!(/^#{name}[^\n]+\nSent by:[^\n]+\n.*/im, "\n\n" + replacement)
+
+ # Some other sort of forwarding quoting
+ # http://www.whatdotheyknow.com/request/224/response/326
+ text.gsub!(/^#{name}[^\n]+\n[0-9\/:\s]+\s+To\s+FOI requests at.*/im, "\n\n" + replacement)
+
+ # Quoted From:/Sent:/To:/Subject: header block; quotes around the name are optional
+ # http://www.whatdotheyknow.com/request/how_do_the_pct_deal_with_retirin_33#incoming-930
+ # http://www.whatdotheyknow.com/request/229/response/809
+ text.gsub!(/^From: [^\n]+\nSent: [^\n]+\nTo:\s+['"]?#{name}['"]?\nSubject:.*/im, "\n\n" + replacement)
+
+ return text
+ end
+
+ # Remove emails, mobile phones and other details FOI officers ask us to remove.
+ def remove_privacy_sensitive_things!(text)
+ # Remove any email addresses - we don't want bounce messages to leak out
+ # either the requestor's email address or the request's response email
+ # address out onto the internet
+ text.gsub!(MySociety::Validate.email_find_regexp, "[email address]")
+
+ # Mobile phone numbers
+ # http://www.whatdotheyknow.com/request/failed_test_purchases_off_licenc#incoming-1013
+ # http://www.whatdotheyknow.com/request/selective_licensing_statistics_i#incoming-550
+ # http://www.whatdotheyknow.com/request/common_purpose_training_graduate#incoming-774
+ text.gsub!(/(Mobile|Mob)([\s\/]*(Fax|Tel))*\s*:?[\s\d]*\d/, "[mobile number]")
+
+ # Remove WhatDoTheyKnow signup links
+ domain = MySociety::Config.get('DOMAIN')
+ text.gsub!(/http:\/\/#{domain}\/c\/[^\s]+/, "[WDTK login link]")
+
+ # Remove things from censor rules
+ self.info_request.apply_censor_rules_to_text!(text)
+ end
+
+
+ # Remove quoted sections from emails (eventually the aim would be for this
+ # to do as good a job as GMail does) XXX bet it needs a proper parser
+ # XXX and this FOLDED_QUOTED_SECTION stuff is a mess
+ def self.remove_quoted_sections(text, replacement = "FOLDED_QUOTED_SECTION")
+ text = text.dup
+ replacement = "\n" + replacement + "\n"
+
+ # First do this peculiar form of quoting, as the > single line quoting
+ # further below messes with it. Note the carriage return where it wraps -
+ # this can happen anywhere according to length of the name/email. e.g.
+ # >>> D K Elwell <[email address]> 17/03/2008
+ # 01:51:50 >>>
+ # http://www.whatdotheyknow.com/request/71/response/108
+ # http://www.whatdotheyknow.com/request/police_powers_to_inform_car_insu
+ # http://www.whatdotheyknow.com/request/secured_convictions_aided_by_cct
+ multiline_original_message = '(' + '''>>>.* \d\d/\d\d/\d\d\d\d\s+\d\d:\d\d(?::\d\d)?\s*>>>''' + ')'
+ text.gsub!(/^(#{multiline_original_message}\n.*)$/m, replacement)
+
+ # Single line sections
+ text.gsub!(/^(>.*\n)/, replacement)
+ text.gsub!(/^(On .+ (wrote|said):\n)/, replacement)
+
+ ['-', '_', '*', '#'].each do |scorechar|
+ score = /(?:[#{scorechar}]\s*){8,}/
+ text.sub!(/(Disclaimer\s+)? # appears just before
+ (
+ \s*#{score}\n(?:(?!#{score}\n).)*? # top line
+ (disclaimer:\n|confidential|received\sthis\semail\sin\serror|virus|intended\s+recipient|monitored\s+centrally|intended\s+(for\s+|only\s+for\s+use\s+by\s+)the\s+addressee|routinely\s+monitored|MessageLabs|unauthorised\s+use)
+ .*?(?:#{score}|\z) # bottom line OR end of whole string (for ones with no terminator XXX risky)
+ )
+ /imx, replacement)
+ end
+
+ # Special paragraphs
+ # http://www.whatdotheyknow.com/request/identity_card_scheme_expenditure
+ text.gsub!(/^[^\n]+Government\s+Secure\s+Intranet\s+virus\s+scanning
+ .*?
+ virus\sfree\.
+ /imx, replacement)
+ text.gsub!(/^Communications\s+via\s+the\s+GSi\s+
+ .*?
+ legal\spurposes\.
+ /imx, replacement)
+ # http://www.whatdotheyknow.com/request/net_promoter_value_scores_for_bb
+ text.gsub!(/^http:\/\/www.bbc.co.uk
+ .*?
+ Further\s+communication\s+will\s+signify\s+your\s+consent\s+to\s+this\.
+ /imx, replacement)
+
+
+ # To end of message sections
+ # http://www.whatdotheyknow.com/request/123/response/192
+ # http://www.whatdotheyknow.com/request/235/response/513
+ # http://www.whatdotheyknow.com/request/445/response/743
+ original_message =
+ '(' + '''----* This is a copy of the message, including all the headers. ----*''' +
+ '|' + '''----*\s*Original Message\s*----*''' +
+ '|' + '''----*\s*Forwarded message.+----*''' +
+ '|' + '''----*\s*Forwarded by.+----*''' +
+ ')'
+ # Could have a ^ at start here, but see messed up formatting here:
+ # http://www.whatdotheyknow.com/request/refuse_and_recycling_collection#incoming-842
+ text.gsub!(/(#{original_message}\n.*)$/mi, replacement)
+
+
+ # Some silly Microsoft XML gets into parts marked as plain text.
+ # e.g. http://www.whatdotheyknow.com/request/are_traffic_wardens_paid_commiss#incoming-401
+ # Don't replace with "replacement" as it's pretty messy
+ text.gsub!(/<\?xml:namespace[^>]*\/>/, " ")
+
+ return text
+ end
+
+
+ # Removes anything cached about the object in the database, and saves
+ def clear_in_database_caches!
+ self.cached_attachment_text_clipped = nil
+ self.cached_main_body_text_unfolded = nil
+ self.cached_main_body_text_folded = nil
+ self.save!
+ end
+
+ # Internal function to compute and save both cached forms of the main body
+ # text (unfolded, and folded with FOLDED_QUOTED_SECTION markers). Cached as
+ # loading raw_email can be quite huge, and need this for just search results
+ def _cache_main_body_text
+ text = self.get_main_body_text_internal
+ # Strip the uudecode parts from main text
+ # - this also effectively does a .dup as well, so text mods don't alter original
+ text = text.split(/^begin.+^`\n^end\n/m).join(" ")
+
+ if text.size > 1000000 # 1 MB ish
+ raise "main body text more than 1 MB, need to implement clipping like for attachment text, or there is some other MIME decoding problem or similar"
+ end
+
+ # remove emails for privacy/anti-spam reasons
+ self.mask_special_emails!(text)
+ self.remove_privacy_sensitive_things!(text)
+
+ # Fold quoted sections: Lotus style first, then the general rules on that result
+ folded_quoted_text = self.remove_lotus_quoting(text, 'FOLDED_QUOTED_SECTION')
+ folded_quoted_text = IncomingMessage.remove_quoted_sections(folded_quoted_text, "FOLDED_QUOTED_SECTION")
+ self.cached_main_body_text_unfolded = text
+ self.cached_main_body_text_folded = folded_quoted_text
+ self.save!
+ end
+ # Returns body text from main text part of email, converted to UTF-8, with uudecode removed,
+ # emails and privacy sensitive things removed, censored, and folded to remove excess quoted text
+ # (marked with FOLDED_QUOTED_SECTION)
+ # XXX returns a .dup of the text, so calling functions can in place modify it
+ def get_main_body_text_folded
+ if self.cached_main_body_text_folded.nil?
+ self._cache_main_body_text
+ end
+ return self.cached_main_body_text_folded
+ end
+ def get_main_body_text_unfolded
+ if self.cached_main_body_text_unfolded.nil?
+ self._cache_main_body_text
+ end
+ return self.cached_main_body_text_unfolded
+ end
+ # Returns body text from main text part of email, converted to UTF-8
+ def get_main_body_text_internal
+ parse_raw_email!
+ main_part = get_main_body_text_part
+ return _convert_part_body_to_text(main_part)
+ end
+
+ def extract_attachments!
+ attachment_ids = _extract_attachments!
+ # now get rid of any attachments we no longer have
+ if !attachment_ids.empty?
+ FoiAttachment.destroy_all("id NOT IN (#{attachment_ids.join(',')}) AND incoming_message_id = #{self.id}")
+ end
+ end
+
+ # Given a main text part, converts it to text
+ def _convert_part_body_to_text(part)
+ if part.nil?
+ text = "[ Email has no body, please see attachments ]"
+ source_charset = "utf-8"
+ else
+ text = part.body # by default, TMail converts to UTF8 in this call
+ source_charset = part.charset
+ if part.content_type == 'text/html'
+ # e.g. http://www.whatdotheyknow.com/request/35/response/177
+ # XXX This is a bit of a hack as it is calling a
+ # convert to text routine. Could instead call a
+ # sanitize HTML one.
+
+ # If the text isn't UTF8, it means TMail had a problem
+ # converting it (invalid characters, etc), and we
+ # should instead tell elinks to respect the source
+ # charset
+ use_charset = "utf-8"
+ begin
+ text = Iconv.conv('utf-8', 'utf-8', text)
+ rescue Iconv::IllegalSequence
+ use_charset = source_charset
+ end
+ text = self.class._get_attachment_text_internal_one_file(part.content_type, text, use_charset)
+ end
+ end
+
+ # If TMail can't convert text, it just returns it, so we sanitise it.
+ begin
+ # Test if it's good UTF-8
+ text = Iconv.conv('utf-8', 'utf-8', text)
+ rescue Iconv::IllegalSequence
+ # Text looks like unlabelled nonsense,
+ # strip out anything that isn't UTF-8
+ begin
+ text = Iconv.conv('utf-8//IGNORE', source_charset, text) +
+ _("\n\n[ {{site_name}} note: The above text was badly encoded, and has had strange characters removed. ]",
+ :site_name => MySociety::Config.get('SITE_NAME', 'Alaveteli'))
+ rescue Iconv::InvalidEncoding, Iconv::IllegalSequence
+ if source_charset != "utf-8"
+ source_charset = "utf-8"
+ retry
+ end
+ end
+ end
+
+ # Fix DOS style linefeeds to Unix style ones (or other later regexps won't work)
+ # Needed for e.g. http://www.whatdotheyknow.com/request/60/response/98
+ text = text.gsub(/\r\n/, "\n")
+
+ # Compress extra spaces down to save space, and to stop regular expressions
+ # breaking in strange extreme cases. e.g. for
+ # http://www.whatdotheyknow.com/request/spending_on_consultants
+ text = text.gsub(/ +/, " ")
+
+ return text
+ end
+ # Returns part which contains main body text, or nil if there isn't one
+ def get_main_body_text_part
+ leaves = self.foi_attachments
+
+ # Find first part which is text/plain or text/html
+ # (We have to include HTML, as increasingly there are mail clients that
+ # include no text alternative for the main part, and we don't want to
+ # instead use the first text attachment
+ # e.g. http://www.whatdotheyknow.com/request/list_of_public_authorties)
+ leaves.each do |p|
+ if p.content_type == 'text/plain' or p.content_type == 'text/html'
+ return p
+ end
+ end
+
+ # Otherwise first part which is any sort of text
+ leaves.each do |p|
+ if p.content_type.match(/^text/)
+ return p
+ end
+ end
+
+ # ... or if none, consider first part
+ p = leaves[0]
+ # if it is a known type then don't use it, return no body (nil)
+ if !p.nil? && AlaveteliFileTypes.mimetype_to_extension(p.content_type)
+ # this is guess of case where there are only attachments, no body text
+ # e.g. http://www.whatdotheyknow.com/request/cost_benefit_analysis_for_real_n
+ return nil
+ end
+ # otherwise return it assuming it is text (sometimes you get things
+ # like binary/octet-stream, or the like, which are really text - XXX if
+ # you find an example, put URL here - perhaps we should be always returning
+ # nil in this case)
+ return p
+ end
+ # Returns attachments that are uuencoded in main body part
+ def _uudecode_and_save_attachments(text)
+ # Find any uuencoded things buried in it, yeuchly
+ uus = text.scan(/^begin.+^`\n^end\n/m)
+ attachments = []
+ for uu in uus
+ # Decode the string
+ content = nil
+ tempfile = Tempfile.new('foiuu')
+ tempfile.print uu
+ tempfile.flush
+ content = AlaveteliExternalCommand.run("uudecode", "-o", "/dev/stdout", tempfile.path)
+ tempfile.close
+ # Make attachment type from it, working out filename and mime type
+ filename = uu.match(/^begin\s+[0-9]+\s+(.*)$/)[1]
+ calc_mime = AlaveteliFileTypes.filename_and_content_to_mimetype(filename, content)
+ if calc_mime
+ calc_mime = normalise_content_type(calc_mime)
+ content_type = calc_mime
+ else
+ content_type = 'application/octet-stream'
+ end
+ hexdigest = Digest::MD5.hexdigest(content)
+ attachment = self.foi_attachments.find_or_create_by_hexdigest(:hexdigest => hexdigest)
+ attachment.update_attributes(:filename => filename,
+ :content_type => content_type,
+ :body => content,
+ :display_size => "0K")
+ attachment.save!
+ attachments << attachment
+ end
+ return attachments
+ end
+
+ def get_attachments_for_display
+ parse_raw_email!
+ # return what user would consider attachments, i.e. not the main body
+ main_part = get_main_body_text_part
+ attachments = []
+ for attachment in self.foi_attachments
+ attachments << attachment if attachment != main_part
+ end
+ return attachments
+ end
+
+ # Returns the main body text as escaped HTML with links made clickable and
+ # newlines turned into <br>. With collapse_quoted_sections true, quoted
+ # sections are replaced by a "show quoted sections" link; otherwise the
+ # unfolded text is shown, plus a "hide quoted sections" link if any folded.
+ def get_body_for_html_display(collapse_quoted_sections = true)
+ # Find the body text and remove emails for privacy/anti-spam reasons
+ text = get_main_body_text_unfolded
+ folded_quoted_text = get_main_body_text_folded
+
+ # XXX The FOLDED_QUOTED_SECTION marker is a nasty hack so we can escape
+ # other HTML before adding the unfold links, without escaping them.
+ if collapse_quoted_sections
+ text = folded_quoted_text
+ end
+ text = MySociety::Format.simplify_angle_bracketed_urls(text)
+ text = CGI.escapeHTML(text)
+ text = MySociety::Format.make_clickable(text, :contract => 1)
+ text.gsub!(/\[(email address|mobile number)\]/, '[<a href="/help/officers#mobiles">\1</a>]')
+ if collapse_quoted_sections
+ text = text.gsub(/(\s*FOLDED_QUOTED_SECTION\s*)+/m, "FOLDED_QUOTED_SECTION")
+ text.strip!
+ # if there is nothing but quoted stuff, then show the subject (to_s: it may be nil)
+ if text == "FOLDED_QUOTED_SECTION"
+ text = "[Subject only] " + CGI.escapeHTML(self.subject.to_s) + text
+ end
+ # and display link for quoted stuff
+ text = text.gsub(/FOLDED_QUOTED_SECTION/, "\n\n" + '<span class="unfold_link"><a href="?unfold=1#incoming-'+self.id.to_s+'">'+_("show quoted sections")+'</a></span>' + "\n\n")
+ else
+ if folded_quoted_text.include?('FOLDED_QUOTED_SECTION')
+ text = text + "\n\n" + '<span class="unfold_link"><a href="?#incoming-'+self.id.to_s+'">'+_("hide quoted sections")+'</a></span>'
+ end
+ end
+ text.strip!
+
+ text = text.gsub(/\n/, '<br>')
+ text = text.gsub(/(?:<br>\s*){2,}/, '<br><br>') # remove excess linebreaks that unnecessarily space it out
+ return text
+ end
+
+
+ # Returns text of email for using in quoted section when replying
+ def get_body_for_quoting
+ # Work on a copy: get_main_body_text_folded hands back the cached attribute
+ # itself, and its FOLDED_QUOTED_SECTION markers must survive for other callers
+ text = get_main_body_text_folded
+ raise "internal error" if text.nil?
+ text = text.gsub("FOLDED_QUOTED_SECTION", " ")
+ return text.strip
+ end
+
+ MAX_ATTACHMENT_TEXT_CLIPPED = 1000000 # 1Mb ish
+
+ # Returns text version of attachment text
+ def get_attachment_text_full
+ text = self._get_attachment_text_internal
+ self.mask_special_emails!(text)
+ self.remove_privacy_sensitive_things!(text)
+ # This can be useful for memory debugging
+ #STDOUT.puts 'xxx '+ MySociety::DebugHelpers::allocated_string_size_around_gc
+
+ # Save clipped version for snippets
+ if self.cached_attachment_text_clipped.nil?
+ self.cached_attachment_text_clipped = text[0..MAX_ATTACHMENT_TEXT_CLIPPED]
+ self.save!
+ end
+
+ return text
+ end
+ # Returns a version reduced to a sensible maximum size - this
+ # is for performance reasons when showing snippets in search results.
+ def get_attachment_text_clipped
+ if self.cached_attachment_text_clipped.nil?
+ # As side effect, get_attachment_text_full makes snippet text
+ attachment_text = self.get_attachment_text_full
+ raise "internal error" if self.cached_attachment_text_clipped.nil?
+ end
+
+ return self.cached_attachment_text_clipped
+ end
+ def IncomingMessage._get_attachment_text_internal_one_file(content_type, body, charset = 'utf-8')
+ # note re. charset: TMail always tries to convert email bodies
+ # to UTF8 by default, so normally it should already be that.
+ text = ''
+ # XXX - tell all these command line tools to return utf-8
+ if content_type == 'text/plain'
+ text += body + "\n\n"
+ else
+ tempfile = Tempfile.new('foiextract')
+ tempfile.print body
+ tempfile.flush
+ if content_type == 'application/vnd.ms-word'
+ AlaveteliExternalCommand.run("wvText", tempfile.path, tempfile.path + ".txt")
+ # Try catdoc if we get into trouble (e.g. for InfoRequestEvent 2701)
+ if not File.exists?(tempfile.path + ".txt")
+ AlaveteliExternalCommand.run("catdoc", tempfile.path, :append_to => text)
+ else
+ text += File.read(tempfile.path + ".txt") + "\n\n"
+ File.unlink(tempfile.path + ".txt")
+ end
+ elsif content_type == 'application/rtf'
+ # catdoc on RTF produces fewer comments and less extra bumf than --text option to unrtf
+ AlaveteliExternalCommand.run("catdoc", tempfile.path, :append_to => text)
+ elsif content_type == 'text/html'
+ # lynx wordwraps links in its output, which then don't
+ # get formatted properly by Alaveteli. We use elinks
+ # instead, which doesn't do that.
+ AlaveteliExternalCommand.run("elinks", "-eval", "set document.codepage.assume = \"#{charset}\"", "-eval", "set document.codepage.force_assumed = 1", "-dump-charset", "utf-8", "-force-html", "-dump",
+ tempfile.path, :append_to => text, :env => {"LANG" => "C"})
+ elsif content_type == 'application/vnd.ms-excel'
+ # Bit crazy using /usr/bin/strings - but xls2csv, xlhtml and
+ # py_xls2txt only extract text from cells, not from floating
+ # notes. catdoc may be fooled by weird character sets, but will
+ # probably do for UK FOI requests.
+ AlaveteliExternalCommand.run("/usr/bin/strings", tempfile.path, :append_to => text)
+ elsif content_type == 'application/vnd.ms-powerpoint'
+ # ppthtml seems to catch more text, but only outputs HTML when
+ # we want text, so just use catppt for now
+ AlaveteliExternalCommand.run("catppt", tempfile.path, :append_to => text)
+ elsif content_type == 'application/pdf'
+ AlaveteliExternalCommand.run("pdftotext", tempfile.path, "-", :append_to => text)
+ elsif content_type == 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
+ # This is Microsoft's XML office document format.
+ # Just pull out the main XML file, and strip it of text.
+ xml = AlaveteliExternalCommand.run("/usr/bin/unzip", "-qq", "-c", tempfile.path, "word/document.xml")
+ if !xml.nil?
+ doc = REXML::Document.new(xml)
+ text += doc.each_element( './/text()' ){}.join(" ")
+ end
+ elsif content_type == 'application/zip'
+ # recurse into zip files
+ begin
+ zip_file = Zip::ZipFile.open(tempfile.path)
+ text += IncomingMessage._get_attachment_text_from_zip_file(zip_file)
+ zip_file.close()
+ rescue
+ $stderr.puts("Error processing zip file: #{$!.inspect}")
+ end
+ end
+ tempfile.close
+ end
+
+ if text.respond_to? :force_encoding
+ text.force_encoding('utf-8')
+ else
+ text
+ end
+ end
+ def IncomingMessage._get_attachment_text_from_zip_file(zip_file)
+ text = ""
+ for entry in zip_file
+ if entry.file?
+ filename = entry.to_s
+ begin
+ body = entry.get_input_stream.read
+ rescue
+ # move to next attachment silently if there were problems
+ # XXX really should reduce this to specific exceptions?
+ # e.g. password protected
+ next
+ end
+ calc_mime = AlaveteliFileTypes.filename_to_mimetype(filename)
+ if calc_mime
+ content_type = calc_mime
+ else
+ content_type = 'application/octet-stream'
+ end
+
+ text += _get_attachment_text_internal_one_file(content_type, body)
+ end
+ end
+ return text
+ end
+ def _get_attachment_text_internal
+ # Extract text from each attachment
+ text = ''
+ attachments = self.get_attachments_for_display
+ for attachment in attachments
+ text += IncomingMessage._get_attachment_text_internal_one_file(attachment.content_type, attachment.body, attachment.charset)
+ end
+ # Remove any bad characters
+ text = Iconv.conv('utf-8//IGNORE', 'utf-8', text)
+ return text
+ end
+
+
+ # Returns text for indexing
+ def get_text_for_indexing_full
+ return get_body_for_quoting + "\n\n" + get_attachment_text_full
+ end
+ # Used for excerpts in search results, when loading full text would be too slow
+ def get_text_for_indexing_clipped
+ return get_body_for_quoting + "\n\n" + get_attachment_text_clipped
+ end
+
+ # Has message arrived "recently"?
+ def recently_arrived
+ (Time.now - self.created_at) <= 3.days
+ end
+
+ def fully_destroy
+ ActiveRecord::Base.transaction do
+ for o in self.outgoing_message_followups
+ o.incoming_message_followup = nil
+ o.save!
+ end
+ info_request_event = InfoRequestEvent.find_by_incoming_message_id(self.id)
+ info_request_event.track_things_sent_emails.each { |a| a.destroy }
+ info_request_event.user_info_request_sent_alerts.each { |a| a.destroy }
+ info_request_event.destroy
+ self.raw_email.destroy_file_representation!
+ self.destroy
+ end
+ end
+
+ # Search all incoming messages for attachments of unknown MIME type, reporting each on stderr
+ def IncomingMessage.find_all_unknown_mime_types
+ for incoming_message in IncomingMessage.find(:all)
+ for attachment in incoming_message.get_attachments_for_display
+ raise "internal error incoming_message " + incoming_message.id.to_s if attachment.content_type.nil?
+ if AlaveteliFileTypes.mimetype_to_extension(attachment.content_type).nil?
+ $stderr.puts "Unknown type for /request/" + incoming_message.info_request.id.to_s + "#incoming-"+incoming_message.id.to_s
+ $stderr.puts " " + attachment.filename.to_s + " " + attachment.content_type.to_s
+ end
+ end
+ end
+
+ return nil
+ end
+
+ # Returns space separated list of file extensions of attachments to this message. Uses the
+ # normal extension for a known mime type, otherwise the extension from the attachment's filename.
+ def get_present_file_extensions
+ ret = {}
+ for attachment in self.get_attachments_for_display
+ ext = AlaveteliFileTypes.mimetype_to_extension(attachment.content_type)
+ ext = File.extname(attachment.filename).gsub(/^[.]/, "") if ext.nil? && !attachment.filename.nil?
+ ret[ext] = 1 if !ext.nil?
+ end
+ return ret.keys.join(" ")
+ end
+ # Return space separated list of all file extensions known
+ def IncomingMessage.get_all_file_extensions
+ return AlaveteliFileTypes.all_extensions.join(" ")
+ end
+
+ def normalise_content_type(content_type)
+ # e.g. http://www.whatdotheyknow.com/request/93/response/250
+ if content_type == 'application/excel' or content_type == 'application/msexcel' or content_type == 'application/x-ms-excel'
+ content_type = 'application/vnd.ms-excel'
+ end
+ if content_type == 'application/mspowerpoint' or content_type == 'application/x-ms-powerpoint'
+ content_type = 'application/vnd.ms-powerpoint'
+ end
+ if content_type == 'application/msword' or content_type == 'application/x-ms-word'
+ content_type = 'application/vnd.ms-word'
+ end
+ if content_type == 'application/x-zip-compressed'
+ content_type = 'application/zip'
+ end
+
+ # e.g. http://www.whatdotheyknow.com/request/copy_of_current_swessex_scr_opt#incoming-9928
+ if content_type == 'application/acrobat'
+ content_type = 'application/pdf'
+ end
+
+ return content_type
+ end
+
+ def for_admin_column
+ self.class.content_columns.each do |column|
+ yield(column.human_name, self.send(column.name), column.type.to_s, column.name)
+ end
+ end
+
+ private :normalise_content_type
+
+end
+
+
+
diff --git a/app/models/request_mailer.rb b/app/models/request_mailer.rb
index ba9285fc6..6b38e3fa8 100644
--- a/app/models/request_mailer.rb
+++ b/app/models/request_mailer.rb
@@ -50,7 +50,7 @@ class RequestMailer < ApplicationMailer
headers 'Return-Path' => blackhole_email, 'Reply-To' => @from, # we don't care about bounces, likely from spammers
'Auto-Submitted' => 'auto-replied' # http://tools.ietf.org/html/rfc3834
@recipients = email.from_addrs[0].to_s
- @subject = "Your response to an FOI request was not delivered"
+ @subject = _("Your response to an FOI request was not delivered")
attachment :content_type => 'message/rfc822', :body => raw_email_data,
:filename => "original.eml", :transfer_encoding => '7bit', :content_disposition => 'inline'
@body = {
@@ -154,7 +154,7 @@ class RequestMailer < ApplicationMailer
'Auto-Submitted' => 'auto-generated', # http://tools.ietf.org/html/rfc3834
'X-Auto-Response-Suppress' => 'OOF'
@recipients = info_request.user.name_and_email
- @subject = "Someone has updated the status of your request"
+ @subject = _("Someone has updated the status of your request")
url = main_url(request_url(info_request))
@body = {:info_request => info_request, :url => url}
end