diff options
author | Louise Crow <louise.crow@gmail.com> | 2014-11-27 10:20:37 +0000 |
---|---|---|
committer | Louise Crow <louise.crow@gmail.com> | 2014-12-15 10:22:22 +0000 |
commit | 224725e202d581d956e8958c521abb00de9935b1 (patch) | |
tree | c53cfda3eed152070e8df62477dcbcd76f849e79 | |
parent | d76c2e82328ed2a00add7bdfb528ed4393e640b7 (diff) |
Refactor the application of masks and censor rules to messages.
Seems more logical to make this one method that figures out what to do
based on file type. Plus, incoming message does so many things, it
seemed like having these related methods be separate would make them
easier to read and understand. Also, email, mobile and login
substitution texts weren't being translated. Finally, I think passing
the censor rules and masks as arguments is a first step in some more
decoupling of models.
-rw-r--r-- | app/controllers/request_controller.rb | 12 | ||||
-rw-r--r-- | app/models/incoming_message.rb | 158 | ||||
-rw-r--r-- | app/models/info_request.rb | 16 | ||||
-rw-r--r-- | config/initializers/alaveteli.rb | 1 | ||||
-rw-r--r-- | lib/alaveteli_text_masker.rb | 127 | ||||
-rw-r--r-- | spec/controllers/request_controller_spec.rb | 2 | ||||
-rw-r--r-- | spec/lib/alaveteli_text_masker_spec.rb | 146 | ||||
-rw-r--r-- | spec/models/incoming_message_spec.rb | 179 |
8 files changed, 371 insertions, 270 deletions
diff --git a/app/controllers/request_controller.rb b/app/controllers/request_controller.rb index 346aaf384..d529f8dbb 100644 --- a/app/controllers/request_controller.rb +++ b/app/controllers/request_controller.rb @@ -770,13 +770,14 @@ class RequestController < ApplicationController get_attachment_internal(false) return unless @attachment - # Prevent spam to magic request address. Note that the binary - # subsitution method used depends on the content type - @incoming_message.binary_mask_stuff!(@attachment.body, @attachment.content_type) # we don't use @attachment.content_type here, as we want same mime type when cached in cache_attachments above response.content_type = AlaveteliFileTypes.filename_to_mimetype(params[:file_name]) || 'application/octet-stream' + # Prevent spam to magic request address. Note that the binary + # subsitution method used depends on the content type + @incoming_message.apply_masks!(@attachment.body, @attachment.content_type) + render :text => @attachment.body end @@ -804,10 +805,9 @@ class RequestController < ApplicationController :body_prefix => render_to_string(:partial => "request/view_html_prefix") } ) - - @incoming_message.html_mask_stuff!(html) - response.content_type = 'text/html' + @incoming_message.apply_masks!(html, response.content_type) + render :text => html end diff --git a/app/models/incoming_message.rb b/app/models/incoming_message.rb index db6722976..658ee969a 100644 --- a/app/models/incoming_message.rb +++ b/app/models/incoming_message.rb @@ -52,17 +52,6 @@ class IncomingMessage < ActiveRecord::Base has_prominence - # See binary_mask_stuff function below. It just test for inclusion - # in this hash, not the value of the right hand side. - DoNotBinaryMask = { - 'image/tiff' => 1, - 'image/gif' => 1, - 'image/jpeg' => 1, - 'image/png' => 1, - 'image/bmp' => 1, - 'application/zip' => 1, - } - # Given that there are in theory many info request events, a convenience method for # getting the response event def response_event @@ -218,111 +207,10 @@ class IncomingMessage < ActiveRecord::Base end end - # Converts email addresses we know about into textual descriptions of them - def mask_special_emails!(text) - # TODO: can later display some of these special emails as actual emails, - # if they are public anyway. For now just be precautionary and only - # put in descriptions of them in square brackets. - if self.info_request.public_body.is_followupable? - text.gsub!(self.info_request.public_body.request_email, _("[{{public_body}} request email]", :public_body => self.info_request.public_body.short_or_long_name)) - end - text.gsub!(self.info_request.incoming_email, _('[FOI #{{request}} email]', :request => self.info_request.id.to_s) ) - text.gsub!(AlaveteliConfiguration::contact_email, _("[{{site_name}} contact email]", :site_name => AlaveteliConfiguration::site_name) ) - end - - # Replaces all email addresses in (possibly binary data) with equal length alternative ones. - # Also replaces censor items - def binary_mask_stuff!(text, content_type) - # See if content type is one that we mask - things like zip files and - # images may get broken if we try to. We err on the side of masking too - # much, as many unknown types will really be text. - if DoNotBinaryMask.include?(content_type) - return - end - - # Special cases for some content types - if content_type == 'application/pdf' - uncompressed_text = nil - uncompressed_text = AlaveteliExternalCommand.run("pdftk", "-", "output", "-", "uncompress", :stdin_string => text) - # if we managed to uncompress the PDF... - if !uncompressed_text.nil? && !uncompressed_text.empty? - # then censor stuff (making a copy so can compare again in a bit) - censored_uncompressed_text = uncompressed_text.dup - self._binary_mask_stuff_internal!(censored_uncompressed_text) - # if the censor rule removed something... - if censored_uncompressed_text != uncompressed_text - # then use the altered file (recompressed) - recompressed_text = nil - if AlaveteliConfiguration::use_ghostscript_compression == true - command = ["gs", "-sDEVICE=pdfwrite", "-dCompatibilityLevel=1.4", "-dPDFSETTINGS=/screen", "-dNOPAUSE", "-dQUIET", "-dBATCH", "-sOutputFile=-", "-"] - else - command = ["pdftk", "-", "output", "-", "compress"] - end - recompressed_text = AlaveteliExternalCommand.run(*(command + [{:stdin_string=>censored_uncompressed_text}])) - if recompressed_text.nil? || recompressed_text.empty? - # buggy versions of pdftk sometimes fail on - # compression, I don't see it's a disaster in - # these cases to save an uncompressed version? - recompressed_text = censored_uncompressed_text - logger.warn "Unable to compress PDF; problem with your pdftk version?" - end - if !recompressed_text.nil? && !recompressed_text.empty? - text.replace recompressed_text - end - end - end - return - end - - self._binary_mask_stuff_internal!(text) - end - - # Used by binary_mask_stuff - replace text in place - def _binary_mask_stuff_internal!(text) - # Keep original size, so can check haven't resized it - orig_size = text.mb_chars.size - - # Replace ASCII email addresses... - text.gsub!(MySociety::Validate.email_find_regexp) do |email| - email.gsub(/[^@.]/, 'x') - end - - # And replace UCS-2 ones (for Microsoft Office documents)... - # Find emails, by finding them in parts of text that have ASCII - # equivalents to the UCS-2 - ascii_chars = text.gsub(/\0/, "") - emails = ascii_chars.scan(MySociety::Validate.email_find_regexp) - - # Convert back to UCS-2, making a mask at the same time - if String.method_defined?(:encode) - emails.map! do |email| - # We want the ASCII representation of UCS-2 - [email[0].encode('UTF-16LE').force_encoding('US-ASCII'), - email[0].gsub(/[^@.]/, 'x').encode('UTF-16LE').force_encoding('US-ASCII')] - end - else - emails.map! {|email| [ - Iconv.conv('ucs-2le', 'ascii', email[0]), - Iconv.conv('ucs-2le', 'ascii', email[0].gsub(/[^@.]/, 'x')) - ] } - end - - # Now search and replace the UCS-2 email with the UCS-2 mask - for email, mask in emails - text.gsub!(email, mask) - end - - # Replace censor items - self.info_request.apply_censor_rules_to_binary!(text) - - raise "internal error in binary_mask_stuff" if text.mb_chars.size != orig_size - return text - end - - # Removes censored stuff from from HTML conversion of downloaded binaries - def html_mask_stuff!(html) - self.mask_special_emails!(html) - self.remove_privacy_sensitive_things!(html) + def apply_masks!(text, content_type) + mask_options = { :censor_rules => info_request.applicable_censor_rules, + :masks => info_request.masks } + AlaveteliTextMasker.apply_masks!(text, content_type, mask_options) end # Lotus notes quoting yeuch! @@ -346,26 +234,6 @@ class IncomingMessage < ActiveRecord::Base end - # Remove emails, mobile phones and other details FOI officers ask us to remove. - def remove_privacy_sensitive_things!(text) - # Remove any email addresses - we don't want bounce messages to leak out - # either the requestor's email address or the request's response email - # address out onto the internet - text.gsub!(MySociety::Validate.email_find_regexp, "[email address]") - - # Mobile phone numbers - # http://www.whatdotheyknow.com/request/failed_test_purchases_off_licenc#incoming-1013 - # http://www.whatdotheyknow.com/request/selective_licensing_statistics_i#incoming-550 - # http://www.whatdotheyknow.com/request/common_purpose_training_graduate#incoming-774 - text.gsub!(/(Mobile|Mob)([\s\/]*(Fax|Tel))*\s*:?[\s\d]*\d/, "[mobile number]") - - # Remove WhatDoTheyKnow signup links - text.gsub!(/http:\/\/#{AlaveteliConfiguration::domain}\/c\/[^\s]+/, "[WDTK login link]") - - # Remove things from censor rules - self.info_request.apply_censor_rules_to_text!(text) - end - # Remove quoted sections from emails (eventually the aim would be for this # to do as good a job as GMail does) TODO: bet it needs a proper parser @@ -465,9 +333,8 @@ class IncomingMessage < ActiveRecord::Base raise "main body text more than 1 MB, need to implement clipping like for attachment text, or there is some other MIME decoding problem or similar" end - # remove emails for privacy/anti-spam reasons - self.mask_special_emails!(text) - self.remove_privacy_sensitive_things!(text) + # apply masks for this message + apply_masks!(text, 'text/html') # Remove existing quoted sections folded_quoted_text = self.remove_lotus_quoting(text, 'FOLDED_QUOTED_SECTION') @@ -735,7 +602,14 @@ class IncomingMessage < ActiveRecord::Base text = MySociety::Format.simplify_angle_bracketed_urls(text) text = CGI.escapeHTML(text) text = MySociety::Format.make_clickable(text, :contract => 1) - text.gsub!(/\[(email address|mobile number)\]/, '[<a href="/help/officers#mobiles">\1</a>]') + + # add a helpful link to email addresses and mobile numbers removed + # by apply_masks! + email_pattern = Regexp.escape(_("email address")) + mobile_pattern = Regexp.escape(_("mobile number")) + text.gsub!(/\[(#{email_pattern}|#{mobile_pattern})\]/, + '[<a href="/help/officers#mobiles">\1</a>]') + if collapse_quoted_sections text = text.gsub(/(\s*FOLDED_QUOTED_SECTION\s*)+/m, "FOLDED_QUOTED_SECTION") text.strip! @@ -773,8 +647,8 @@ class IncomingMessage < ActiveRecord::Base # Returns text version of attachment text def get_attachment_text_full text = self._get_attachment_text_internal - self.mask_special_emails!(text) - self.remove_privacy_sensitive_things!(text) + apply_masks!(text, 'text/html') + # This can be useful for memory debugging #STDOUT.puts 'xxx '+ MySociety::DebugHelpers::allocated_string_size_around_gc diff --git a/app/models/info_request.rb b/app/models/info_request.rb index dcd16878b..20b7ef9af 100644 --- a/app/models/info_request.rb +++ b/app/models/info_request.rb @@ -1153,6 +1153,22 @@ public return binary end + # Masks we apply to text associated with this request convert email addresses + # we know about into textual descriptions of them + def masks + masks = [{ :to_replace => incoming_email, + :replacement => _('[FOI #{{request}} email]', + :request => id.to_s) }, + { :to_replace => AlaveteliConfiguration::contact_email, + :replacement => _("[{{site_name}} contact email]", + :site_name => AlaveteliConfiguration::site_name)} ] + if public_body.is_followupable? + masks << { :to_replace => public_body.request_email, + :replacement => _("[{{public_body}} request email]", + :public_body => public_body.short_or_long_name) } + end + end + def is_owning_user?(user) !user.nil? && (user.id == user_id || user.owns_every_request?) end diff --git a/config/initializers/alaveteli.rb b/config/initializers/alaveteli.rb index 2ca85579a..128f6bc5a 100644 --- a/config/initializers/alaveteli.rb +++ b/config/initializers/alaveteli.rb @@ -56,6 +56,7 @@ require 'public_body_csv' require 'category_and_heading_migrator' require 'public_body_categories' require 'routing_filters' +require 'alaveteli_text_masker' AlaveteliLocalization.set_locales(AlaveteliConfiguration::available_locales, AlaveteliConfiguration::default_locale) diff --git a/lib/alaveteli_text_masker.rb b/lib/alaveteli_text_masker.rb new file mode 100644 index 000000000..68ff0d318 --- /dev/null +++ b/lib/alaveteli_text_masker.rb @@ -0,0 +1,127 @@ +module AlaveteliTextMasker + extend self + DoNotBinaryMask = [ 'image/tiff', + 'image/gif', + 'image/jpeg', + 'image/png', + 'image/bmp', + 'application/zip' ] + + # Replaces all email addresses in (possibly binary) data + # Also applies custom masks and censor items + def apply_masks!(text, content_type, options = {}) + # See if content type is one that we mask - things like zip files and + # images may get broken if we try to. We err on the side of masking too + # much, as many unknown types will really be text. + + # Special cases for some content types + case content_type + when *DoNotBinaryMask + # do nothing + when 'text/html' + apply_text_masks!(text, options) + when 'application/pdf' + apply_pdf_masks!(text, options) + else + apply_binary_masks!(text, options) + end + end + + def apply_pdf_masks!(text, options = {}) + uncompressed_text = nil + uncompressed_text = AlaveteliExternalCommand.run("pdftk", "-", "output", "-", "uncompress", + :stdin_string => text) + # if we managed to uncompress the PDF... + if !uncompressed_text.blank? + # then censor stuff (making a copy so can compare again in a bit) + censored_uncompressed_text = uncompressed_text.dup + apply_binary_masks!(censored_uncompressed_text, options) + # if the censor rule removed something... + if censored_uncompressed_text != uncompressed_text + # then use the altered file (recompressed) + recompressed_text = nil + if AlaveteliConfiguration::use_ghostscript_compression == true + command = ["gs", "-sDEVICE=pdfwrite", "-dCompatibilityLevel=1.4", "-dPDFSETTINGS=/screen", "-dNOPAUSE", "-dQUIET", "-dBATCH", "-sOutputFile=-", "-"] + else + command = ["pdftk", "-", "output", "-", "compress"] + end + recompressed_text = AlaveteliExternalCommand.run(*(command + [{:stdin_string=>censored_uncompressed_text}])) + if recompressed_text.blank? + # buggy versions of pdftk sometimes fail on + # compression, I don't see it's a disaster in + # these cases to save an uncompressed version? + recompressed_text = censored_uncompressed_text + logger.warn "Unable to compress PDF; problem with your pdftk version?" + end + if !recompressed_text.blank? + text.replace recompressed_text + end + end + end + end + + private + + # Replace text in place + def apply_binary_masks!(text, options = {}) + # Keep original size, so can check haven't resized it + orig_size = text.mb_chars.size + + # Replace ASCII email addresses... + text.gsub!(MySociety::Validate.email_find_regexp) do |email| + email.gsub(/[^@.]/, 'x') + end + + # And replace UCS-2 ones (for Microsoft Office documents)... + # Find emails, by finding them in parts of text that have ASCII + # equivalents to the UCS-2 + ascii_chars = text.gsub(/\0/, "") + emails = ascii_chars.scan(MySociety::Validate.email_find_regexp) + + # Convert back to UCS-2, making a mask at the same time + if String.method_defined?(:encode) + emails.map! do |email| + # We want the ASCII representation of UCS-2 + [email[0].encode('UTF-16LE').force_encoding('US-ASCII'), + email[0].gsub(/[^@.]/, 'x').encode('UTF-16LE').force_encoding('US-ASCII')] + end + else + emails.map! {|email| [ + Iconv.conv('ucs-2le', 'ascii', email[0]), + Iconv.conv('ucs-2le', 'ascii', email[0].gsub(/[^@.]/, 'x')) + ] } + end + + # Now search and replace the UCS-2 email with the UCS-2 mask + for email, mask in emails + text.gsub!(email, mask) + end + + # Replace censor items + censor_rules = options[:censor_rules] || [] + censor_rules.each{ |censor_rule| censor_rule.apply_to_binary!(text) } + raise "internal error in apply_binary_masks!" if text.mb_chars.size != orig_size + return text + end + + # Remove any email addresses, login links and mobile phone numbers + def default_text_masks + [{ :to_replace => MySociety::Validate.email_find_regexp, + :replacement => "[#{_("email address")}]" }, + { :to_replace => /(Mobile|Mob)([\s\/]*(Fax|Tel))*\s*:?[\s\d]*\d/, + :replacement => "[#{_("mobile number")}]" }, + { :to_replace => /https?:\/\/#{AlaveteliConfiguration::domain}\/c\/[^\s]+/, + :replacement => "[#{_("{{site_name}} login link", + :site_name => AlaveteliConfiguration::site_name)}]" }] + end + + def apply_text_masks!(text, options = {}) + masks = options[:masks] || [] + masks += default_text_masks + censor_rules = options[:censor_rules] || [] + masks.each{ |mask| text.gsub!(mask[:to_replace], mask[:replacement]) } + censor_rules.each{ |censor_rule| censor_rule.apply_to_text!(text) } + text + end + +end diff --git a/spec/controllers/request_controller_spec.rb b/spec/controllers/request_controller_spec.rb index 4d0070470..ba558cc93 100644 --- a/spec/controllers/request_controller_spec.rb +++ b/spec/controllers/request_controller_spec.rb @@ -2447,7 +2447,7 @@ describe RequestController, "when caching fragments" do :info_request_id => 132, :id => 44, :get_attachments_for_display => nil, - :html_mask_stuff! => nil, + :apply_masks! => nil, :user_can_view? => true, :all_can_view? => true) attachment = FactoryGirl.build(:body_text, :filename => long_name) diff --git a/spec/lib/alaveteli_text_masker_spec.rb b/spec/lib/alaveteli_text_masker_spec.rb new file mode 100644 index 000000000..1a4782a83 --- /dev/null +++ b/spec/lib/alaveteli_text_masker_spec.rb @@ -0,0 +1,146 @@ +# -*- coding: utf-8 -*- +require File.expand_path(File.dirname(__FILE__) + '/../spec_helper') + +describe AlaveteliTextMasker do + include AlaveteliTextMasker + + describe :apply_masks! do + + describe 'when applying censor rules' do + + before do + @cheese_censor_rule = FactoryGirl.build(:censor_rule, :text => 'Stilton', + :replacement => 'Jarlsberg') + @colour_censor_rule = FactoryGirl.build(:censor_rule, :text => 'blue', + :replacement => 'yellow') + @regex_censor_rule = FactoryGirl.build(:censor_rule, :text => 'm[a-z][a-z][a-z]e', + :replacement => 'cat', + :regexp => true) + @censor_rules = [@cheese_censor_rule, @colour_censor_rule, @regex_censor_rule] + end + + it "should do nothing to a JPEG" do + data = "There was a mouse called Stilton, he wished that he was blue." + apply_masks!(data, "image/jpeg", :censor_rules => @censor_rules) + data.should == "There was a mouse called Stilton, he wished that he was blue." + end + + it "should replace censor text in Word documents" do + data = "There was a mouse called Stilton, he wished that he was blue." + apply_masks!(data, "application/vnd.ms-word", :censor_rules => @censor_rules) + data.should == "There was a xxxxx called xxxxxxx, he wished that he was xxxx." + end + + it 'should handle multibyte characters correctly' do + data = 'á mouse' + @regex_censor_rule.text = 'á' + apply_masks!(data, "application/octet-stream", :censor_rules => @censor_rules).should == 'x mouse' + end + + it "should apply censor rules to HTML files" do + data = "There was a mouse called Stilton, he wished that he was blue." + apply_masks!(data, 'text/html', :censor_rules => @censor_rules) + data.should == "There was a cat called Jarlsberg, he wished that he was yellow." + end + + end + + it "should replace ASCII email addresses in Word documents" do + data = "His email was foo@bar.com" + expected = "His email was xxx@xxx.xxx" + apply_masks!(data, "application/vnd.ms-word") + data.should == expected + end + + + it "should replace UCS-2 addresses in Word documents" do + data = "His email was f\000o\000o\000@\000b\000a\000r\000.\000c\000o\000m\000, indeed" + apply_masks!(data, "application/vnd.ms-word") + data.should == "His email was x\000x\000x\000@\000x\000x\000x\000.\000x\000x\000x\000, indeed" + end + + def pdf_replacement_test(use_ghostscript_compression) + config = MySociety::Config.load_default() + previous = config['USE_GHOSTSCRIPT_COMPRESSION'] + config['USE_GHOSTSCRIPT_COMPRESSION'] = use_ghostscript_compression + orig_pdf = load_file_fixture('tfl.pdf') + pdf = orig_pdf.dup + + orig_text = MailHandler.get_attachment_text_one_file('application/pdf', pdf) + orig_text.should match(/foi@tfl.gov.uk/) + + apply_masks!(pdf, "application/pdf") + + masked_text = MailHandler.get_attachment_text_one_file('application/pdf', pdf) + masked_text.should_not match(/foi@tfl.gov.uk/) + masked_text.should match(/xxx@xxx.xxx.xx/) + config['USE_GHOSTSCRIPT_COMPRESSION'] = previous + end + + it "should replace everything in PDF files using pdftk" do + pdf_replacement_test(false) + end + + it "should replace everything in PDF files using ghostscript" do + pdf_replacement_test(true) + end + + it "should not produce zero length output if pdftk silently fails" do + orig_pdf = load_file_fixture('psni.pdf') + pdf = orig_pdf.dup + apply_masks!(pdf, "application/pdf") + pdf.should_not == "" + end + + it "should apply hard-coded privacy rules to HTML files" do + data = "http://test.host/c/cheese" + apply_masks!(data, 'text/html') + data.should == "[Alaveteli login link]" + end + + it 'should replace a simple email address' do + expected = "the address is [email address]" + apply_masks!("the address is test@example.com", 'text/html', {}).should == expected + end + + it 'should replace a mobile phone number prefixed with "Mobile"' do + expected = "the mobile is [mobile number]" + apply_masks!("the mobile is Mobile 55555 555555", 'text/html', {}).should == expected + end + + it 'should replace a mobile phone number prefixed with "Mob Tel"' do + expected = "the mobile is [mobile number]" + apply_masks!("the mobile is Mob Tel: 55555 555 555", 'text/html', {}).should == expected + end + + it 'should replace a mobile phone number prefixed with "Mob/Fax:"' do + expected = "the mobile is [mobile number]" + apply_masks!("the mobile is Mob/Fax: 55555 555555", 'text/html', {}).should == expected + end + + it "should replace an Alaveteli login link" do + expected = "the login link is [Alaveteli login link]" + apply_masks!("the login link is http://test.host/c/ekfmsdfkm", 'text/html', {}).should == expected + end + + it "should replace a https Alaveteli login link" do + expected = "the login link is [Alaveteli login link]" + apply_masks!("the login link is https://test.host/c/ekfmsdfkm", 'text/html', {}).should == expected + end + + it "should apply censor rules to text" do + censor_rule = FactoryGirl.build(:censor_rule, :text => 'mouse', :replacement => 'cat') + expected = "here is a cat" + apply_masks!("here is a mouse", 'text/html', {:censor_rules => [ censor_rule ]}).should == expected + end + + it 'should apply extra masks to text' do + mask = {:to_replace => 'mouse', :replacement => 'cat'} + expected = "here is a cat" + apply_masks!("here is a mouse", 'text/html', {:masks => [ mask ]}).should == expected + end + + end + +end + diff --git a/spec/models/incoming_message_spec.rb b/spec/models/incoming_message_spec.rb index 3b6887f76..f6e524de3 100644 --- a/spec/models/incoming_message_spec.rb +++ b/spec/models/incoming_message_spec.rb @@ -423,127 +423,50 @@ describe IncomingMessage, " checking validity to reply to with real emails" do end -describe IncomingMessage, " when censoring data" do - - before(:each) do - @test_data = "There was a mouse called Stilton, he wished that he was blue." - - @im = incoming_messages(:useless_incoming_message) - - @censor_rule_1 = CensorRule.new() - @censor_rule_1.text = "Stilton" - @censor_rule_1.replacement = "Jarlsberg" - @censor_rule_1.last_edit_editor = "unknown" - @censor_rule_1.last_edit_comment = "none" - @im.info_request.censor_rules << @censor_rule_1 - - @censor_rule_2 = CensorRule.new() - @censor_rule_2.text = "blue" - @censor_rule_2.replacement = "yellow" - @censor_rule_2.last_edit_editor = "unknown" - @censor_rule_2.last_edit_comment = "none" - @im.info_request.censor_rules << @censor_rule_2 - - @regex_censor_rule = CensorRule.new() - @regex_censor_rule.text = 'm[a-z][a-z][a-z]e' - @regex_censor_rule.regexp = true - @regex_censor_rule.replacement = 'cat' - @regex_censor_rule.last_edit_editor = 'unknown' - @regex_censor_rule.last_edit_comment = 'none' - @im.info_request.censor_rules << @regex_censor_rule - load_raw_emails_data - end - - it "should do nothing to a JPEG" do - data = @test_data.dup - @im.binary_mask_stuff!(data, "image/jpeg") - data.should == @test_data - end - - it "should replace censor text in Word documents" do - data = @test_data.dup - @im.binary_mask_stuff!(data, "application/vnd.ms-word") - data.should == "There was a xxxxx called xxxxxxx, he wished that he was xxxx." - end - - it "should replace ASCII email addresses in Word documents" do - orig_data = "His email was foo@bar.com" - data = orig_data.dup - @im.binary_mask_stuff!(data, "application/vnd.ms-word") - data.should == "His email was xxx@xxx.xxx" - end - - it "should replace UCS-2 addresses in Word documents" do - orig_data = "His email was f\000o\000o\000@\000b\000a\000r\000.\000c\000o\000m\000, indeed" - data = orig_data.dup - @im.binary_mask_stuff!(data, "application/vnd.ms-word") - data.should == "His email was x\000x\000x\000@\000x\000x\000x\000.\000x\000x\000x\000, indeed" - end - - it 'should handle multibyte characters correctly' do - orig_data = 'á' - data = orig_data.dup - @regex_censor_rule = CensorRule.new() - @regex_censor_rule.text = 'á' - @regex_censor_rule.regexp = true - @regex_censor_rule.replacement = 'cat' - @regex_censor_rule.last_edit_editor = 'unknown' - @regex_censor_rule.last_edit_comment = 'none' - @im.info_request.censor_rules << @regex_censor_rule - lambda{ @im.binary_mask_stuff!(data, "text/plain") }.should_not raise_error - end - def pdf_replacement_test(use_ghostscript_compression) - config = MySociety::Config.load_default() - previous = config['USE_GHOSTSCRIPT_COMPRESSION'] - config['USE_GHOSTSCRIPT_COMPRESSION'] = use_ghostscript_compression - orig_pdf = load_file_fixture('tfl.pdf') - pdf = orig_pdf.dup - - orig_text = MailHandler.get_attachment_text_one_file('application/pdf', pdf) - orig_text.should match(/foi@tfl.gov.uk/) - - @im.binary_mask_stuff!(pdf, "application/pdf") - - masked_text = MailHandler.get_attachment_text_one_file('application/pdf', pdf) - masked_text.should_not match(/foi@tfl.gov.uk/) - masked_text.should match(/xxx@xxx.xxx.xx/) - config['USE_GHOSTSCRIPT_COMPRESSION'] = previous - end - - it "should replace everything in PDF files using pdftk" do - pdf_replacement_test(false) - end - - it "should replace everything in PDF files using ghostscript" do - pdf_replacement_test(true) - end - - it "should not produce zero length output if pdftk silently fails" do - orig_pdf = load_file_fixture('psni.pdf') - pdf = orig_pdf.dup - @im.binary_mask_stuff!(pdf, "application/pdf") - pdf.should_not == "" - end - - it "should apply censor rules to HTML files" do - data = @test_data.dup - @im.html_mask_stuff!(data) - data.should == "There was a cat called Jarlsberg, he wished that he was yellow." - end - - it "should apply hard-coded privacy rules to HTML files" do - data = "http://#{AlaveteliConfiguration::domain}/c/cheese" - @im.html_mask_stuff!(data) - data.should == "[WDTK login link]" - end +describe IncomingMessage, " when censoring data" do - it "should apply censor rules to From: addresses" do - @im.stub!(:mail_from).and_return("Stilton Mouse") - @im.stub!(:last_parsed).and_return(Time.now) - safe_mail_from = @im.safe_mail_from - safe_mail_from.should == "Jarlsberg Mouse" - end + before(:each) do + @test_data = "There was a mouse called Stilton, he wished that he was blue." + + @im = incoming_messages(:useless_incoming_message) + + @censor_rule_1 = CensorRule.new() + @censor_rule_1.text = "Stilton" + @censor_rule_1.replacement = "Jarlsberg" + @censor_rule_1.last_edit_editor = "unknown" + @censor_rule_1.last_edit_comment = "none" + @im.info_request.censor_rules << @censor_rule_1 + + @censor_rule_2 = CensorRule.new() + @censor_rule_2.text = "blue" + @censor_rule_2.replacement = "yellow" + @censor_rule_2.last_edit_editor = "unknown" + @censor_rule_2.last_edit_comment = "none" + @im.info_request.censor_rules << @censor_rule_2 + + @regex_censor_rule = CensorRule.new() + @regex_censor_rule.text = 'm[a-z][a-z][a-z]e' + @regex_censor_rule.regexp = true + @regex_censor_rule.replacement = 'cat' + @regex_censor_rule.last_edit_editor = 'unknown' + @regex_censor_rule.last_edit_comment = 'none' + @im.info_request.censor_rules << @regex_censor_rule + load_raw_emails_data + end + + it "should replace censor text" do + data = "There was a mouse called Stilton, he wished that he was blue." + @im.apply_masks!(data, "application/vnd.ms-word") + data.should == "There was a xxxxx called xxxxxxx, he wished that he was xxxx." + end + + it "should apply censor rules to From: addresses" do + @im.stub!(:mail_from).and_return("Stilton Mouse") + @im.stub!(:last_parsed).and_return(Time.now) + safe_mail_from = @im.safe_mail_from + safe_mail_from.should == "Jarlsberg Mouse" + end end @@ -565,15 +488,16 @@ describe IncomingMessage, " when censoring whole users" do it "should apply censor rules to HTML files" do data = @test_data.dup - @im.html_mask_stuff!(data) + @im.apply_masks!(data, 'text/html') data.should == "There was a mouse called Gorgonzola, he wished that he was blue." end it "should replace censor text to Word documents" do data = @test_data.dup - @im.binary_mask_stuff!(data, "application/vnd.ms-word") + @im.apply_masks!(data, "application/vnd.ms-word") data.should == "There was a mouse called xxxxxxx, he wished that he was blue." end + end @@ -770,3 +694,16 @@ describe IncomingMessage, "when extracting attachments" do end end + +describe IncomingMessage, 'when getting the body of a message for html display' do + + it 'should replace any masked email addresses with a link to the help page' do + incoming_message = IncomingMessage.new + body_text = 'there was an [email address] here' + incoming_message.stub!(:get_main_body_text_folded).and_return(body_text) + incoming_message.stub!(:get_main_body_text_unfolded).and_return(body_text) + expected = 'there was an [<a href="/help/officers#mobiles">email address</a>] here' + incoming_message.get_body_for_html_display.should == expected + end + +end |