diff options
author | Louise Crow <louise.crow@gmail.com> | 2014-11-27 10:20:37 +0000 |
---|---|---|
committer | Louise Crow <louise.crow@gmail.com> | 2014-12-15 10:22:22 +0000 |
commit | 224725e202d581d956e8958c521abb00de9935b1 (patch) | |
tree | c53cfda3eed152070e8df62477dcbcd76f849e79 /app/models | |
parent | d76c2e82328ed2a00add7bdfb528ed4393e640b7 (diff) |
Refactor the application of masks and censor rules to messages.
Seems more logical to make this one method that figures out what to do
based on file type. Plus, incoming message does so many things, it
seemed like having these related methods be separate would make them
easier to read and understand. Also, email, mobile and login
substitution texts weren't being translated. Finally, I think passing
the censor rules and masks as arguments is a first step in some more
decoupling of models.
Diffstat (limited to 'app/models')
-rw-r--r-- | app/models/incoming_message.rb | 158 | ||||
-rw-r--r-- | app/models/info_request.rb | 16 |
2 files changed, 32 insertions, 142 deletions
diff --git a/app/models/incoming_message.rb b/app/models/incoming_message.rb index db6722976..658ee969a 100644 --- a/app/models/incoming_message.rb +++ b/app/models/incoming_message.rb @@ -52,17 +52,6 @@ class IncomingMessage < ActiveRecord::Base has_prominence - # See binary_mask_stuff function below. It just test for inclusion - # in this hash, not the value of the right hand side. - DoNotBinaryMask = { - 'image/tiff' => 1, - 'image/gif' => 1, - 'image/jpeg' => 1, - 'image/png' => 1, - 'image/bmp' => 1, - 'application/zip' => 1, - } - # Given that there are in theory many info request events, a convenience method for # getting the response event def response_event @@ -218,111 +207,10 @@ class IncomingMessage < ActiveRecord::Base end end - # Converts email addresses we know about into textual descriptions of them - def mask_special_emails!(text) - # TODO: can later display some of these special emails as actual emails, - # if they are public anyway. For now just be precautionary and only - # put in descriptions of them in square brackets. - if self.info_request.public_body.is_followupable? - text.gsub!(self.info_request.public_body.request_email, _("[{{public_body}} request email]", :public_body => self.info_request.public_body.short_or_long_name)) - end - text.gsub!(self.info_request.incoming_email, _('[FOI #{{request}} email]', :request => self.info_request.id.to_s) ) - text.gsub!(AlaveteliConfiguration::contact_email, _("[{{site_name}} contact email]", :site_name => AlaveteliConfiguration::site_name) ) - end - - # Replaces all email addresses in (possibly binary data) with equal length alternative ones. - # Also replaces censor items - def binary_mask_stuff!(text, content_type) - # See if content type is one that we mask - things like zip files and - # images may get broken if we try to. We err on the side of masking too - # much, as many unknown types will really be text. - if DoNotBinaryMask.include?(content_type) - return - end - - # Special cases for some content types - if content_type == 'application/pdf' - uncompressed_text = nil - uncompressed_text = AlaveteliExternalCommand.run("pdftk", "-", "output", "-", "uncompress", :stdin_string => text) - # if we managed to uncompress the PDF... - if !uncompressed_text.nil? && !uncompressed_text.empty? - # then censor stuff (making a copy so can compare again in a bit) - censored_uncompressed_text = uncompressed_text.dup - self._binary_mask_stuff_internal!(censored_uncompressed_text) - # if the censor rule removed something... - if censored_uncompressed_text != uncompressed_text - # then use the altered file (recompressed) - recompressed_text = nil - if AlaveteliConfiguration::use_ghostscript_compression == true - command = ["gs", "-sDEVICE=pdfwrite", "-dCompatibilityLevel=1.4", "-dPDFSETTINGS=/screen", "-dNOPAUSE", "-dQUIET", "-dBATCH", "-sOutputFile=-", "-"] - else - command = ["pdftk", "-", "output", "-", "compress"] - end - recompressed_text = AlaveteliExternalCommand.run(*(command + [{:stdin_string=>censored_uncompressed_text}])) - if recompressed_text.nil? || recompressed_text.empty? - # buggy versions of pdftk sometimes fail on - # compression, I don't see it's a disaster in - # these cases to save an uncompressed version? - recompressed_text = censored_uncompressed_text - logger.warn "Unable to compress PDF; problem with your pdftk version?" - end - if !recompressed_text.nil? && !recompressed_text.empty? - text.replace recompressed_text - end - end - end - return - end - - self._binary_mask_stuff_internal!(text) - end - - # Used by binary_mask_stuff - replace text in place - def _binary_mask_stuff_internal!(text) - # Keep original size, so can check haven't resized it - orig_size = text.mb_chars.size - - # Replace ASCII email addresses... - text.gsub!(MySociety::Validate.email_find_regexp) do |email| - email.gsub(/[^@.]/, 'x') - end - - # And replace UCS-2 ones (for Microsoft Office documents)... - # Find emails, by finding them in parts of text that have ASCII - # equivalents to the UCS-2 - ascii_chars = text.gsub(/\0/, "") - emails = ascii_chars.scan(MySociety::Validate.email_find_regexp) - - # Convert back to UCS-2, making a mask at the same time - if String.method_defined?(:encode) - emails.map! do |email| - # We want the ASCII representation of UCS-2 - [email[0].encode('UTF-16LE').force_encoding('US-ASCII'), - email[0].gsub(/[^@.]/, 'x').encode('UTF-16LE').force_encoding('US-ASCII')] - end - else - emails.map! {|email| [ - Iconv.conv('ucs-2le', 'ascii', email[0]), - Iconv.conv('ucs-2le', 'ascii', email[0].gsub(/[^@.]/, 'x')) - ] } - end - - # Now search and replace the UCS-2 email with the UCS-2 mask - for email, mask in emails - text.gsub!(email, mask) - end - - # Replace censor items - self.info_request.apply_censor_rules_to_binary!(text) - - raise "internal error in binary_mask_stuff" if text.mb_chars.size != orig_size - return text - end - - # Removes censored stuff from from HTML conversion of downloaded binaries - def html_mask_stuff!(html) - self.mask_special_emails!(html) - self.remove_privacy_sensitive_things!(html) + def apply_masks!(text, content_type) + mask_options = { :censor_rules => info_request.applicable_censor_rules, + :masks => info_request.masks } + AlaveteliTextMasker.apply_masks!(text, content_type, mask_options) end # Lotus notes quoting yeuch! @@ -346,26 +234,6 @@ class IncomingMessage < ActiveRecord::Base end - # Remove emails, mobile phones and other details FOI officers ask us to remove. - def remove_privacy_sensitive_things!(text) - # Remove any email addresses - we don't want bounce messages to leak out - # either the requestor's email address or the request's response email - # address out onto the internet - text.gsub!(MySociety::Validate.email_find_regexp, "[email address]") - - # Mobile phone numbers - # http://www.whatdotheyknow.com/request/failed_test_purchases_off_licenc#incoming-1013 - # http://www.whatdotheyknow.com/request/selective_licensing_statistics_i#incoming-550 - # http://www.whatdotheyknow.com/request/common_purpose_training_graduate#incoming-774 - text.gsub!(/(Mobile|Mob)([\s\/]*(Fax|Tel))*\s*:?[\s\d]*\d/, "[mobile number]") - - # Remove WhatDoTheyKnow signup links - text.gsub!(/http:\/\/#{AlaveteliConfiguration::domain}\/c\/[^\s]+/, "[WDTK login link]") - - # Remove things from censor rules - self.info_request.apply_censor_rules_to_text!(text) - end - # Remove quoted sections from emails (eventually the aim would be for this # to do as good a job as GMail does) TODO: bet it needs a proper parser @@ -465,9 +333,8 @@ class IncomingMessage < ActiveRecord::Base raise "main body text more than 1 MB, need to implement clipping like for attachment text, or there is some other MIME decoding problem or similar" end - # remove emails for privacy/anti-spam reasons - self.mask_special_emails!(text) - self.remove_privacy_sensitive_things!(text) + # apply masks for this message + apply_masks!(text, 'text/html') # Remove existing quoted sections folded_quoted_text = self.remove_lotus_quoting(text, 'FOLDED_QUOTED_SECTION') @@ -735,7 +602,14 @@ class IncomingMessage < ActiveRecord::Base text = MySociety::Format.simplify_angle_bracketed_urls(text) text = CGI.escapeHTML(text) text = MySociety::Format.make_clickable(text, :contract => 1) - text.gsub!(/\[(email address|mobile number)\]/, '[<a href="/help/officers#mobiles">\1</a>]') + + # add a helpful link to email addresses and mobile numbers removed + # by apply_masks! + email_pattern = Regexp.escape(_("email address")) + mobile_pattern = Regexp.escape(_("mobile number")) + text.gsub!(/\[(#{email_pattern}|#{mobile_pattern})\]/, + '[<a href="/help/officers#mobiles">\1</a>]') + if collapse_quoted_sections text = text.gsub(/(\s*FOLDED_QUOTED_SECTION\s*)+/m, "FOLDED_QUOTED_SECTION") text.strip! @@ -773,8 +647,8 @@ class IncomingMessage < ActiveRecord::Base # Returns text version of attachment text def get_attachment_text_full text = self._get_attachment_text_internal - self.mask_special_emails!(text) - self.remove_privacy_sensitive_things!(text) + apply_masks!(text, 'text/html') + # This can be useful for memory debugging #STDOUT.puts 'xxx '+ MySociety::DebugHelpers::allocated_string_size_around_gc diff --git a/app/models/info_request.rb b/app/models/info_request.rb index dcd16878b..20b7ef9af 100644 --- a/app/models/info_request.rb +++ b/app/models/info_request.rb @@ -1153,6 +1153,22 @@ public return binary end + # Masks we apply to text associated with this request convert email addresses + # we know about into textual descriptions of them + def masks + masks = [{ :to_replace => incoming_email, + :replacement => _('[FOI #{{request}} email]', + :request => id.to_s) }, + { :to_replace => AlaveteliConfiguration::contact_email, + :replacement => _("[{{site_name}} contact email]", + :site_name => AlaveteliConfiguration::site_name)} ] + if public_body.is_followupable? + masks << { :to_replace => public_body.request_email, + :replacement => _("[{{public_body}} request email]", + :public_body => public_body.short_or_long_name) } + end + end + def is_owning_user?(user) !user.nil? && (user.id == user_id || user.owns_every_request?) end |