diff options
-rw-r--r-- | app/controllers/request_controller.rb | 12 | ||||
-rw-r--r-- | app/models/incoming_message.rb | 158 | ||||
-rw-r--r-- | app/models/info_request.rb | 16 | ||||
-rw-r--r-- | config/initializers/alaveteli.rb | 1 | ||||
-rw-r--r-- | lib/alaveteli_text_masker.rb | 127 | ||||
-rw-r--r-- | spec/controllers/request_controller_spec.rb | 2 | ||||
-rw-r--r-- | spec/lib/alaveteli_text_masker_spec.rb | 146 | ||||
-rw-r--r-- | spec/models/incoming_message_spec.rb | 179 |
8 files changed, 371 insertions, 270 deletions
diff --git a/app/controllers/request_controller.rb b/app/controllers/request_controller.rb index 346aaf384..d529f8dbb 100644 --- a/app/controllers/request_controller.rb +++ b/app/controllers/request_controller.rb @@ -770,13 +770,14 @@ class RequestController < ApplicationController get_attachment_internal(false) return unless @attachment - # Prevent spam to magic request address. Note that the binary - # subsitution method used depends on the content type - @incoming_message.binary_mask_stuff!(@attachment.body, @attachment.content_type) # we don't use @attachment.content_type here, as we want same mime type when cached in cache_attachments above response.content_type = AlaveteliFileTypes.filename_to_mimetype(params[:file_name]) || 'application/octet-stream' + # Prevent spam to magic request address. Note that the binary + # subsitution method used depends on the content type + @incoming_message.apply_masks!(@attachment.body, @attachment.content_type) + render :text => @attachment.body end @@ -804,10 +805,9 @@ class RequestController < ApplicationController :body_prefix => render_to_string(:partial => "request/view_html_prefix") } ) - - @incoming_message.html_mask_stuff!(html) - response.content_type = 'text/html' + @incoming_message.apply_masks!(html, response.content_type) + render :text => html end diff --git a/app/models/incoming_message.rb b/app/models/incoming_message.rb index db6722976..658ee969a 100644 --- a/app/models/incoming_message.rb +++ b/app/models/incoming_message.rb @@ -52,17 +52,6 @@ class IncomingMessage < ActiveRecord::Base has_prominence - # See binary_mask_stuff function below. It just test for inclusion - # in this hash, not the value of the right hand side. - DoNotBinaryMask = { - 'image/tiff' => 1, - 'image/gif' => 1, - 'image/jpeg' => 1, - 'image/png' => 1, - 'image/bmp' => 1, - 'application/zip' => 1, - } - # Given that there are in theory many info request events, a convenience method for # getting the response event def response_event @@ -218,111 +207,10 @@ class IncomingMessage < ActiveRecord::Base end end - # Converts email addresses we know about into textual descriptions of them - def mask_special_emails!(text) - # TODO: can later display some of these special emails as actual emails, - # if they are public anyway. For now just be precautionary and only - # put in descriptions of them in square brackets. - if self.info_request.public_body.is_followupable? - text.gsub!(self.info_request.public_body.request_email, _("[{{public_body}} request email]", :public_body => self.info_request.public_body.short_or_long_name)) - end - text.gsub!(self.info_request.incoming_email, _('[FOI #{{request}} email]', :request => self.info_request.id.to_s) ) - text.gsub!(AlaveteliConfiguration::contact_email, _("[{{site_name}} contact email]", :site_name => AlaveteliConfiguration::site_name) ) - end - - # Replaces all email addresses in (possibly binary data) with equal length alternative ones. - # Also replaces censor items - def binary_mask_stuff!(text, content_type) - # See if content type is one that we mask - things like zip files and - # images may get broken if we try to. We err on the side of masking too - # much, as many unknown types will really be text. - if DoNotBinaryMask.include?(content_type) - return - end - - # Special cases for some content types - if content_type == 'application/pdf' - uncompressed_text = nil - uncompressed_text = AlaveteliExternalCommand.run("pdftk", "-", "output", "-", "uncompress", :stdin_string => text) - # if we managed to uncompress the PDF... - if !uncompressed_text.nil? && !uncompressed_text.empty? - # then censor stuff (making a copy so can compare again in a bit) - censored_uncompressed_text = uncompressed_text.dup - self._binary_mask_stuff_internal!(censored_uncompressed_text) - # if the censor rule removed something... - if censored_uncompressed_text != uncompressed_text - # then use the altered file (recompressed) - recompressed_text = nil - if AlaveteliConfiguration::use_ghostscript_compression == true - command = ["gs", "-sDEVICE=pdfwrite", "-dCompatibilityLevel=1.4", "-dPDFSETTINGS=/screen", "-dNOPAUSE", "-dQUIET", "-dBATCH", "-sOutputFile=-", "-"] - else - command = ["pdftk", "-", "output", "-", "compress"] - end - recompressed_text = AlaveteliExternalCommand.run(*(command + [{:stdin_string=>censored_uncompressed_text}])) - if recompressed_text.nil? || recompressed_text.empty? - # buggy versions of pdftk sometimes fail on - # compression, I don't see it's a disaster in - # these cases to save an uncompressed version? - recompressed_text = censored_uncompressed_text - logger.warn "Unable to compress PDF; problem with your pdftk version?" - end - if !recompressed_text.nil? && !recompressed_text.empty? - text.replace recompressed_text - end - end - end - return - end - - self._binary_mask_stuff_internal!(text) - end - - # Used by binary_mask_stuff - replace text in place - def _binary_mask_stuff_internal!(text) - # Keep original size, so can check haven't resized it - orig_size = text.mb_chars.size - - # Replace ASCII email addresses... - text.gsub!(MySociety::Validate.email_find_regexp) do |email| - email.gsub(/[^@.]/, 'x') - end - - # And replace UCS-2 ones (for Microsoft Office documents)... - # Find emails, by finding them in parts of text that have ASCII - # equivalents to the UCS-2 - ascii_chars = text.gsub(/\0/, "") - emails = ascii_chars.scan(MySociety::Validate.email_find_regexp) - - # Convert back to UCS-2, making a mask at the same time - if String.method_defined?(:encode) - emails.map! do |email| - # We want the ASCII representation of UCS-2 - [email[0].encode('UTF-16LE').force_encoding('US-ASCII'), - email[0].gsub(/[^@.]/, 'x').encode('UTF-16LE').force_encoding('US-ASCII')] - end - else - emails.map! {|email| [ - Iconv.conv('ucs-2le', 'ascii', email[0]), - Iconv.conv('ucs-2le', 'ascii', email[0].gsub(/[^@.]/, 'x')) - ] } - end - - # Now search and replace the UCS-2 email with the UCS-2 mask - for email, mask in emails - text.gsub!(email, mask) - end - - # Replace censor items - self.info_request.apply_censor_rules_to_binary!(text) - - raise "internal error in binary_mask_stuff" if text.mb_chars.size != orig_size - return text - end - - # Removes censored stuff from from HTML conversion of downloaded binaries - def html_mask_stuff!(html) - self.mask_special_emails!(html) - self.remove_privacy_sensitive_things!(html) + def apply_masks!(text, content_type) + mask_options = { :censor_rules => info_request.applicable_censor_rules, + :masks => info_request.masks } + AlaveteliTextMasker.apply_masks!(text, content_type, mask_options) end # Lotus notes quoting yeuch! @@ -346,26 +234,6 @@ class IncomingMessage < ActiveRecord::Base end - # Remove emails, mobile phones and other details FOI officers ask us to remove. - def remove_privacy_sensitive_things!(text) - # Remove any email addresses - we don't want bounce messages to leak out - # either the requestor's email address or the request's response email - # address out onto the internet - text.gsub!(MySociety::Validate.email_find_regexp, "[email address]") - - # Mobile phone numbers - # http://www.whatdotheyknow.com/request/failed_test_purchases_off_licenc#incoming-1013 - # http://www.whatdotheyknow.com/request/selective_licensing_statistics_i#incoming-550 - # http://www.whatdotheyknow.com/request/common_purpose_training_graduate#incoming-774 - text.gsub!(/(Mobile|Mob)([\s\/]*(Fax|Tel))*\s*:?[\s\d]*\d/, "[mobile number]") - - # Remove WhatDoTheyKnow signup links - text.gsub!(/http:\/\/#{AlaveteliConfiguration::domain}\/c\/[^\s]+/, "[WDTK login link]") - - # Remove things from censor rules - self.info_request.apply_censor_rules_to_text!(text) - end - # Remove quoted sections from emails (eventually the aim would be for this # to do as good a job as GMail does) TODO: bet it needs a proper parser @@ -465,9 +333,8 @@ class IncomingMessage < ActiveRecord::Base raise "main body text more than 1 MB, need to implement clipping like for attachment text, or there is some other MIME decoding problem or similar" end - # remove emails for privacy/anti-spam reasons - self.mask_special_emails!(text) - self.remove_privacy_sensitive_things!(text) + # apply masks for this message + apply_masks!(text, 'text/html') # Remove existing quoted sections folded_quoted_text = self.remove_lotus_quoting(text, 'FOLDED_QUOTED_SECTION') @@ -735,7 +602,14 @@ class IncomingMessage < ActiveRecord::Base text = MySociety::Format.simplify_angle_bracketed_urls(text) text = CGI.escapeHTML(text) text = MySociety::Format.make_clickable(text, :contract => 1) - text.gsub!(/\[(email address|mobile number)\]/, '[<a href="/help/officers#mobiles">\1</a>]') + + # add a helpful link to email addresses and mobile numbers removed + # by apply_masks! + email_pattern = Regexp.escape(_("email address")) + mobile_pattern = Regexp.escape(_("mobile number")) + text.gsub!(/\[(#{email_pattern}|#{mobile_pattern})\]/, + '[<a href="/help/officers#mobiles">\1</a>]') + if collapse_quoted_sections text = text.gsub(/(\s*FOLDED_QUOTED_SECTION\s*)+/m, "FOLDED_QUOTED_SECTION") text.strip! @@ -773,8 +647,8 @@ class IncomingMessage < ActiveRecord::Base # Returns text version of attachment text def get_attachment_text_full text = self._get_attachment_text_internal - self.mask_special_emails!(text) - self.remove_privacy_sensitive_things!(text) + apply_masks!(text, 'text/html') + # This can be useful for memory debugging #STDOUT.puts 'xxx '+ MySociety::DebugHelpers::allocated_string_size_around_gc diff --git a/app/models/info_request.rb b/app/models/info_request.rb index dcd16878b..20b7ef9af 100644 --- a/app/models/info_request.rb +++ b/app/models/info_request.rb @@ -1153,6 +1153,22 @@ public return binary end + # Masks we apply to text associated with this request convert email addresses + # we know about into textual descriptions of them + def masks + masks = [{ :to_replace => incoming_email, + :replacement => _('[FOI #{{request}} email]', + :request => id.to_s) }, + { :to_replace => AlaveteliConfiguration::contact_email, + :replacement => _("[{{site_name}} contact email]", + :site_name => AlaveteliConfiguration::site_name)} ] + if public_body.is_followupable? + masks << { :to_replace => public_body.request_email, + :replacement => _("[{{public_body}} request email]", + :public_body => public_body.short_or_long_name) } + end + end + def is_owning_user?(user) !user.nil? && (user.id == user_id || user.owns_every_request?) end diff --git a/config/initializers/alaveteli.rb b/config/initializers/alaveteli.rb index 2ca85579a..128f6bc5a 100644 --- a/config/initializers/alaveteli.rb +++ b/config/initializers/alaveteli.rb @@ -56,6 +56,7 @@ require 'public_body_csv' require 'category_and_heading_migrator' require 'public_body_categories' require 'routing_filters' +require 'alaveteli_text_masker' AlaveteliLocalization.set_locales(AlaveteliConfiguration::available_locales, AlaveteliConfiguration::default_locale) diff --git a/lib/alaveteli_text_masker.rb b/lib/alaveteli_text_masker.rb new file mode 100644 index 000000000..68ff0d318 --- /dev/null +++ b/lib/alaveteli_text_masker.rb @@ -0,0 +1,127 @@ +module AlaveteliTextMasker + extend self + DoNotBinaryMask = [ 'image/tiff', + 'image/gif', + 'image/jpeg', + 'image/png', + 'image/bmp', + 'application/zip' ] + + # Replaces all email addresses in (possibly binary) data + # Also applies custom masks and censor items + def apply_masks!(text, content_type, options = {}) + # See if content type is one that we mask - things like zip files and + # images may get broken if we try to. We err on the side of masking too + # much, as many unknown types will really be text. + + # Special cases for some content types + case content_type + when *DoNotBinaryMask + # do nothing + when 'text/html' + apply_text_masks!(text, options) + when 'application/pdf' + apply_pdf_masks!(text, options) + else + apply_binary_masks!(text, options) + end + end + + def apply_pdf_masks!(text, options = {}) + uncompressed_text = nil + uncompressed_text = AlaveteliExternalCommand.run("pdftk", "-", "output", "-", "uncompress", + :stdin_string => text) + # if we managed to uncompress the PDF... + if !uncompressed_text.blank? + # then censor stuff (making a copy so can compare again in a bit) + censored_uncompressed_text = uncompressed_text.dup + apply_binary_masks!(censored_uncompressed_text, options) + # if the censor rule removed something... + if censored_uncompressed_text != uncompressed_text + # then use the altered file (recompressed) + recompressed_text = nil + if AlaveteliConfiguration::use_ghostscript_compression == true + command = ["gs", "-sDEVICE=pdfwrite", "-dCompatibilityLevel=1.4", "-dPDFSETTINGS=/screen", "-dNOPAUSE", "-dQUIET", "-dBATCH", "-sOutputFile=-", "-"] + else + command = ["pdftk", "-", "output", "-", "compress"] + end + recompressed_text = AlaveteliExternalCommand.run(*(command + [{:stdin_string=>censored_uncompressed_text}])) + if recompressed_text.blank? + # buggy versions of pdftk sometimes fail on + # compression, I don't see it's a disaster in + # these cases to save an uncompressed version? + recompressed_text = censored_uncompressed_text + logger.warn "Unable to compress PDF; problem with your pdftk version?" + end + if !recompressed_text.blank? + text.replace recompressed_text + end + end + end + end + + private + + # Replace text in place + def apply_binary_masks!(text, options = {}) + # Keep original size, so can check haven't resized it + orig_size = text.mb_chars.size + + # Replace ASCII email addresses... + text.gsub!(MySociety::Validate.email_find_regexp) do |email| + email.gsub(/[^@.]/, 'x') + end + + # And replace UCS-2 ones (for Microsoft Office documents)... + # Find emails, by finding them in parts of text that have ASCII + # equivalents to the UCS-2 + ascii_chars = text.gsub(/\0/, "") + emails = ascii_chars.scan(MySociety::Validate.email_find_regexp) + + # Convert back to UCS-2, making a mask at the same time + if String.method_defined?(:encode) + emails.map! do |email| + # We want the ASCII representation of UCS-2 + [email[0].encode('UTF-16LE').force_encoding('US-ASCII'), + email[0].gsub(/[^@.]/, 'x').encode('UTF-16LE').force_encoding('US-ASCII')] + end + else + emails.map! {|email| [ + Iconv.conv('ucs-2le', 'ascii', email[0]), + Iconv.conv('ucs-2le', 'ascii', email[0].gsub(/[^@.]/, 'x')) + ] } + end + + # Now search and replace the UCS-2 email with the UCS-2 mask + for email, mask in emails + text.gsub!(email, mask) + end + + # Replace censor items + censor_rules = options[:censor_rules] || [] + censor_rules.each{ |censor_rule| censor_rule.apply_to_binary!(text) } + raise "internal error in apply_binary_masks!" if text.mb_chars.size != orig_size + return text + end + + # Remove any email addresses, login links and mobile phone numbers + def default_text_masks + [{ :to_replace => MySociety::Validate.email_find_regexp, + :replacement => "[#{_("email address")}]" }, + { :to_replace => /(Mobile|Mob)([\s\/]*(Fax|Tel))*\s*:?[\s\d]*\d/, + :replacement => "[#{_("mobile number")}]" }, + { :to_replace => /https?:\/\/#{AlaveteliConfiguration::domain}\/c\/[^\s]+/, + :replacement => "[#{_("{{site_name}} login link", + :site_name => AlaveteliConfiguration::site_name)}]" }] + end + + def apply_text_masks!(text, options = {}) + masks = options[:masks] || [] + masks += default_text_masks + censor_rules = options[:censor_rules] || [] + masks.each{ |mask| text.gsub!(mask[:to_replace], mask[:replacement]) } + censor_rules.each{ |censor_rule| censor_rule.apply_to_text!(text) } + text + end + +end diff --git a/spec/controllers/request_controller_spec.rb b/spec/controllers/request_controller_spec.rb index 4d0070470..ba558cc93 100644 --- a/spec/controllers/request_controller_spec.rb +++ b/spec/controllers/request_controller_spec.rb @@ -2447,7 +2447,7 @@ describe RequestController, "when caching fragments" do :info_request_id => 132, :id => 44, :get_attachments_for_display => nil, - :html_mask_stuff! => nil, + :apply_masks! => nil, :user_can_view? => true, :all_can_view? => true) attachment = FactoryGirl.build(:body_text, :filename => long_name) diff --git a/spec/lib/alaveteli_text_masker_spec.rb b/spec/lib/alaveteli_text_masker_spec.rb new file mode 100644 index 000000000..1a4782a83 --- /dev/null +++ b/spec/lib/alaveteli_text_masker_spec.rb @@ -0,0 +1,146 @@ +# -*- coding: utf-8 -*- +require File.expand_path(File.dirname(__FILE__) + '/../spec_helper') + +describe AlaveteliTextMasker do + include AlaveteliTextMasker + + describe :apply_masks! do + + describe 'when applying censor rules' do + + before do + @cheese_censor_rule = FactoryGirl.build(:censor_rule, :text => 'Stilton', + :replacement => 'Jarlsberg') + @colour_censor_rule = FactoryGirl.build(:censor_rule, :text => 'blue', + :replacement => 'yellow') + @regex_censor_rule = FactoryGirl.build(:censor_rule, :text => 'm[a-z][a-z][a-z]e', + :replacement => 'cat', + :regexp => true) + @censor_rules = [@cheese_censor_rule, @colour_censor_rule, @regex_censor_rule] + end + + it "should do nothing to a JPEG" do + data = "There was a mouse called Stilton, he wished that he was blue." + apply_masks!(data, "image/jpeg", :censor_rules => @censor_rules) + data.should == "There was a mouse called Stilton, he wished that he was blue." + end + + it "should replace censor text in Word documents" do + data = "There was a mouse called Stilton, he wished that he was blue." + apply_masks!(data, "application/vnd.ms-word", :censor_rules => @censor_rules) + data.should == "There was a xxxxx called xxxxxxx, he wished that he was xxxx." + end + + it 'should handle multibyte characters correctly' do + data = 'á mouse' + @regex_censor_rule.text = 'á' + apply_masks!(data, "application/octet-stream", :censor_rules => @censor_rules).should == 'x mouse' + end + + it "should apply censor rules to HTML files" do + data = "There was a mouse called Stilton, he wished that he was blue." + apply_masks!(data, 'text/html', :censor_rules => @censor_rules) + data.should == "There was a cat called Jarlsberg, he wished that he was yellow." + end + + end + + it "should replace ASCII email addresses in Word documents" do + data = "His email was foo@bar.com" + expected = "His email was xxx@xxx.xxx" + apply_masks!(data, "application/vnd.ms-word") + data.should == expected + end + + + it "should replace UCS-2 addresses in Word documents" do + data = "His email was f\000o\000o\000@\000b\000a\000r\000.\000c\000o\000m\000, indeed" + apply_masks!(data, "application/vnd.ms-word") + data.should == "His email was x\000x\000x\000@\000x\000x\000x\000.\000x\000x\000x\000, indeed" + end + + def pdf_replacement_test(use_ghostscript_compression) + config = MySociety::Config.load_default() + previous = config['USE_GHOSTSCRIPT_COMPRESSION'] + config['USE_GHOSTSCRIPT_COMPRESSION'] = use_ghostscript_compression + orig_pdf = load_file_fixture('tfl.pdf') + pdf = orig_pdf.dup + + orig_text = MailHandler.get_attachment_text_one_file('application/pdf', pdf) + orig_text.should match(/foi@tfl.gov.uk/) + + apply_masks!(pdf, "application/pdf") + + masked_text = MailHandler.get_attachment_text_one_file('application/pdf', pdf) + masked_text.should_not match(/foi@tfl.gov.uk/) + masked_text.should match(/xxx@xxx.xxx.xx/) + config['USE_GHOSTSCRIPT_COMPRESSION'] = previous + end + + it "should replace everything in PDF files using pdftk" do + pdf_replacement_test(false) + end + + it "should replace everything in PDF files using ghostscript" do + pdf_replacement_test(true) + end + + it "should not produce zero length output if pdftk silently fails" do + orig_pdf = load_file_fixture('psni.pdf') + pdf = orig_pdf.dup + apply_masks!(pdf, "application/pdf") + pdf.should_not == "" + end + + it "should apply hard-coded privacy rules to HTML files" do + data = "http://test.host/c/cheese" + apply_masks!(data, 'text/html') + data.should == "[Alaveteli login link]" + end + + it 'should replace a simple email address' do + expected = "the address is [email address]" + apply_masks!("the address is test@example.com", 'text/html', {}).should == expected + end + + it 'should replace a mobile phone number prefixed with "Mobile"' do + expected = "the mobile is [mobile number]" + apply_masks!("the mobile is Mobile 55555 555555", 'text/html', {}).should == expected + end + + it 'should replace a mobile phone number prefixed with "Mob Tel"' do + expected = "the mobile is [mobile number]" + apply_masks!("the mobile is Mob Tel: 55555 555 555", 'text/html', {}).should == expected + end + + it 'should replace a mobile phone number prefixed with "Mob/Fax:"' do + expected = "the mobile is [mobile number]" + apply_masks!("the mobile is Mob/Fax: 55555 555555", 'text/html', {}).should == expected + end + + it "should replace an Alaveteli login link" do + expected = "the login link is [Alaveteli login link]" + apply_masks!("the login link is http://test.host/c/ekfmsdfkm", 'text/html', {}).should == expected + end + + it "should replace a https Alaveteli login link" do + expected = "the login link is [Alaveteli login link]" + apply_masks!("the login link is https://test.host/c/ekfmsdfkm", 'text/html', {}).should == expected + end + + it "should apply censor rules to text" do + censor_rule = FactoryGirl.build(:censor_rule, :text => 'mouse', :replacement => 'cat') + expected = "here is a cat" + apply_masks!("here is a mouse", 'text/html', {:censor_rules => [ censor_rule ]}).should == expected + end + + it 'should apply extra masks to text' do + mask = {:to_replace => 'mouse', :replacement => 'cat'} + expected = "here is a cat" + apply_masks!("here is a mouse", 'text/html', {:masks => [ mask ]}).should == expected + end + + end + +end + diff --git a/spec/models/incoming_message_spec.rb b/spec/models/incoming_message_spec.rb index 3b6887f76..f6e524de3 100644 --- a/spec/models/incoming_message_spec.rb +++ b/spec/models/incoming_message_spec.rb @@ -423,127 +423,50 @@ describe IncomingMessage, " checking validity to reply to with real emails" do end -describe IncomingMessage, " when censoring data" do - - before(:each) do - @test_data = "There was a mouse called Stilton, he wished that he was blue." - - @im = incoming_messages(:useless_incoming_message) - - @censor_rule_1 = CensorRule.new() - @censor_rule_1.text = "Stilton" - @censor_rule_1.replacement = "Jarlsberg" - @censor_rule_1.last_edit_editor = "unknown" - @censor_rule_1.last_edit_comment = "none" - @im.info_request.censor_rules << @censor_rule_1 - - @censor_rule_2 = CensorRule.new() - @censor_rule_2.text = "blue" - @censor_rule_2.replacement = "yellow" - @censor_rule_2.last_edit_editor = "unknown" - @censor_rule_2.last_edit_comment = "none" - @im.info_request.censor_rules << @censor_rule_2 - - @regex_censor_rule = CensorRule.new() - @regex_censor_rule.text = 'm[a-z][a-z][a-z]e' - @regex_censor_rule.regexp = true - @regex_censor_rule.replacement = 'cat' - @regex_censor_rule.last_edit_editor = 'unknown' - @regex_censor_rule.last_edit_comment = 'none' - @im.info_request.censor_rules << @regex_censor_rule - load_raw_emails_data - end - - it "should do nothing to a JPEG" do - data = @test_data.dup - @im.binary_mask_stuff!(data, "image/jpeg") - data.should == @test_data - end - - it "should replace censor text in Word documents" do - data = @test_data.dup - @im.binary_mask_stuff!(data, "application/vnd.ms-word") - data.should == "There was a xxxxx called xxxxxxx, he wished that he was xxxx." - end - - it "should replace ASCII email addresses in Word documents" do - orig_data = "His email was foo@bar.com" - data = orig_data.dup - @im.binary_mask_stuff!(data, "application/vnd.ms-word") - data.should == "His email was xxx@xxx.xxx" - end - - it "should replace UCS-2 addresses in Word documents" do - orig_data = "His email was f\000o\000o\000@\000b\000a\000r\000.\000c\000o\000m\000, indeed" - data = orig_data.dup - @im.binary_mask_stuff!(data, "application/vnd.ms-word") - data.should == "His email was x\000x\000x\000@\000x\000x\000x\000.\000x\000x\000x\000, indeed" - end - - it 'should handle multibyte characters correctly' do - orig_data = 'á' - data = orig_data.dup - @regex_censor_rule = CensorRule.new() - @regex_censor_rule.text = 'á' - @regex_censor_rule.regexp = true - @regex_censor_rule.replacement = 'cat' - @regex_censor_rule.last_edit_editor = 'unknown' - @regex_censor_rule.last_edit_comment = 'none' - @im.info_request.censor_rules << @regex_censor_rule - lambda{ @im.binary_mask_stuff!(data, "text/plain") }.should_not raise_error - end - def pdf_replacement_test(use_ghostscript_compression) - config = MySociety::Config.load_default() - previous = config['USE_GHOSTSCRIPT_COMPRESSION'] - config['USE_GHOSTSCRIPT_COMPRESSION'] = use_ghostscript_compression - orig_pdf = load_file_fixture('tfl.pdf') - pdf = orig_pdf.dup - - orig_text = MailHandler.get_attachment_text_one_file('application/pdf', pdf) - orig_text.should match(/foi@tfl.gov.uk/) - - @im.binary_mask_stuff!(pdf, "application/pdf") - - masked_text = MailHandler.get_attachment_text_one_file('application/pdf', pdf) - masked_text.should_not match(/foi@tfl.gov.uk/) - masked_text.should match(/xxx@xxx.xxx.xx/) - config['USE_GHOSTSCRIPT_COMPRESSION'] = previous - end - - it "should replace everything in PDF files using pdftk" do - pdf_replacement_test(false) - end - - it "should replace everything in PDF files using ghostscript" do - pdf_replacement_test(true) - end - - it "should not produce zero length output if pdftk silently fails" do - orig_pdf = load_file_fixture('psni.pdf') - pdf = orig_pdf.dup - @im.binary_mask_stuff!(pdf, "application/pdf") - pdf.should_not == "" - end - - it "should apply censor rules to HTML files" do - data = @test_data.dup - @im.html_mask_stuff!(data) - data.should == "There was a cat called Jarlsberg, he wished that he was yellow." - end - - it "should apply hard-coded privacy rules to HTML files" do - data = "http://#{AlaveteliConfiguration::domain}/c/cheese" - @im.html_mask_stuff!(data) - data.should == "[WDTK login link]" - end +describe IncomingMessage, " when censoring data" do - it "should apply censor rules to From: addresses" do - @im.stub!(:mail_from).and_return("Stilton Mouse") - @im.stub!(:last_parsed).and_return(Time.now) - safe_mail_from = @im.safe_mail_from - safe_mail_from.should == "Jarlsberg Mouse" - end + before(:each) do + @test_data = "There was a mouse called Stilton, he wished that he was blue." + + @im = incoming_messages(:useless_incoming_message) + + @censor_rule_1 = CensorRule.new() + @censor_rule_1.text = "Stilton" + @censor_rule_1.replacement = "Jarlsberg" + @censor_rule_1.last_edit_editor = "unknown" + @censor_rule_1.last_edit_comment = "none" + @im.info_request.censor_rules << @censor_rule_1 + + @censor_rule_2 = CensorRule.new() + @censor_rule_2.text = "blue" + @censor_rule_2.replacement = "yellow" + @censor_rule_2.last_edit_editor = "unknown" + @censor_rule_2.last_edit_comment = "none" + @im.info_request.censor_rules << @censor_rule_2 + + @regex_censor_rule = CensorRule.new() + @regex_censor_rule.text = 'm[a-z][a-z][a-z]e' + @regex_censor_rule.regexp = true + @regex_censor_rule.replacement = 'cat' + @regex_censor_rule.last_edit_editor = 'unknown' + @regex_censor_rule.last_edit_comment = 'none' + @im.info_request.censor_rules << @regex_censor_rule + load_raw_emails_data + end + + it "should replace censor text" do + data = "There was a mouse called Stilton, he wished that he was blue." + @im.apply_masks!(data, "application/vnd.ms-word") + data.should == "There was a xxxxx called xxxxxxx, he wished that he was xxxx." + end + + it "should apply censor rules to From: addresses" do + @im.stub!(:mail_from).and_return("Stilton Mouse") + @im.stub!(:last_parsed).and_return(Time.now) + safe_mail_from = @im.safe_mail_from + safe_mail_from.should == "Jarlsberg Mouse" + end end @@ -565,15 +488,16 @@ describe IncomingMessage, " when censoring whole users" do it "should apply censor rules to HTML files" do data = @test_data.dup - @im.html_mask_stuff!(data) + @im.apply_masks!(data, 'text/html') data.should == "There was a mouse called Gorgonzola, he wished that he was blue." end it "should replace censor text to Word documents" do data = @test_data.dup - @im.binary_mask_stuff!(data, "application/vnd.ms-word") + @im.apply_masks!(data, "application/vnd.ms-word") data.should == "There was a mouse called xxxxxxx, he wished that he was blue." end + end @@ -770,3 +694,16 @@ describe IncomingMessage, "when extracting attachments" do end end + +describe IncomingMessage, 'when getting the body of a message for html display' do + + it 'should replace any masked email addresses with a link to the help page' do + incoming_message = IncomingMessage.new + body_text = 'there was an [email address] here' + incoming_message.stub!(:get_main_body_text_folded).and_return(body_text) + incoming_message.stub!(:get_main_body_text_unfolded).and_return(body_text) + expected = 'there was an [<a href="/help/officers#mobiles">email address</a>] here' + incoming_message.get_body_for_html_display.should == expected + end + +end |