diff options
author | Louise Crow <louise.crow@gmail.com> | 2015-06-04 17:56:44 +0100 |
---|---|---|
committer | Louise Crow <louise.crow@gmail.com> | 2015-06-22 17:43:09 +0100 |
commit | 910acfa8ae939f363a872123eb47a86e64a192c3 (patch) | |
tree | c0e7e0283cc191be7905ac1c5b5f58f4074842c4 | |
parent | 3efe2f333a9b143e88556c0aeedb534090eb41d3 (diff) |
Use ASCII-8BIT for replacement patterns when handling binary data.
Also be explicit about using UTF-8 when handling text data.
-rw-r--r-- | app/models/censor_rule.rb | 28 | ||||
-rw-r--r-- | lib/alaveteli_text_masker.rb | 4 | ||||
-rw-r--r-- | spec/lib/alaveteli_text_masker_spec.rb | 7 | ||||
-rw-r--r-- | spec/models/censor_rule_spec.rb | 58 |
4 files changed, 74 insertions, 23 deletions
diff --git a/app/models/censor_rule.rb b/app/models/censor_rule.rb index f1f1a0d70..aec8a87cc 100644 --- a/app/models/censor_rule.rb +++ b/app/models/censor_rule.rb @@ -46,17 +46,17 @@ class CensorRule < ActiveRecord::Base def apply_to_text(text_to_censor) return nil if text_to_censor.nil? - text_to_censor.gsub(to_replace, replacement) + text_to_censor.gsub(to_replace('UTF-8'), replacement) end def apply_to_text!(text_to_censor) return nil if text_to_censor.nil? - text_to_censor.gsub!(to_replace, replacement) + text_to_censor.gsub!(to_replace('UTF-8'), replacement) end def apply_to_binary!(binary_to_censor) return nil if binary_to_censor.nil? - binary_to_censor.gsub!(to_replace) { |match| match.gsub(/./, 'x') } + binary_to_censor.gsub!(to_replace('ASCII-8BIT')) { |match| match.gsub(single_char_regexp, 'x') } end def is_global? @@ -65,6 +65,14 @@ class CensorRule < ActiveRecord::Base private + def single_char_regexp + if String.method_defined?(:encode) + Regexp.new('.'.force_encoding('ASCII-8BIT')) + else + Regexp.new('.', nil, 'N') + end + end + def require_user_request_or_public_body if info_request.nil? && user.nil? && public_body.nil? [:info_request, :user, :public_body].each do |a| @@ -75,18 +83,22 @@ class CensorRule < ActiveRecord::Base def require_valid_regexp begin - make_regexp + make_regexp('UTF-8') rescue RegexpError => e errors.add(:text, e.message) end end - def make_regexp - Regexp.new(text, Regexp::MULTILINE) + def to_replace(encoding) + regexp? ? make_regexp(encoding) : encoded_text(encoding) + end + + def encoded_text(encoding) + String.method_defined?(:encode) ? text.dup.force_encoding(encoding) : text end - def to_replace - regexp? ? make_regexp : text + def make_regexp(encoding) + Regexp.new(encoded_text(encoding), Regexp::MULTILINE) end end diff --git a/lib/alaveteli_text_masker.rb b/lib/alaveteli_text_masker.rb index 3c2bcf825..5ec7aa95c 100644 --- a/lib/alaveteli_text_masker.rb +++ b/lib/alaveteli_text_masker.rb @@ -79,7 +79,7 @@ module AlaveteliTextMasker # Replace text in place def apply_binary_masks!(text, options = {}) # Keep original size, so can check haven't resized it - orig_size = text.mb_chars.size + orig_size = text.size # Replace ASCII email addresses... text.gsub!(MySociety::Validate.email_find_regexp) do |email| @@ -114,7 +114,7 @@ module AlaveteliTextMasker # Replace censor items censor_rules = options[:censor_rules] || [] censor_rules.each{ |censor_rule| censor_rule.apply_to_binary!(text) } - raise "internal error in apply_binary_masks!" if text.mb_chars.size != orig_size + raise "internal error in apply_binary_masks!" if text.size != orig_size return text end diff --git a/spec/lib/alaveteli_text_masker_spec.rb b/spec/lib/alaveteli_text_masker_spec.rb index f2d52c1cc..f8c22a849 100644 --- a/spec/lib/alaveteli_text_masker_spec.rb +++ b/spec/lib/alaveteli_text_masker_spec.rb @@ -31,10 +31,13 @@ describe AlaveteliTextMasker do data.should == "There was a xxxxx called xxxxxxx, he wished that he was xxxx." end - it 'should handle multibyte characters correctly' do + it 'should handle multibyte characters in binary file types as binary data' do data = 'á mouse' + if String.method_defined?(:encode) + data = data.force_encoding("ASCII-8BIT") + end @regex_censor_rule.text = 'á' - apply_masks!(data, "application/octet-stream", :censor_rules => @censor_rules).should == 'x mouse' + apply_masks!(data, "application/octet-stream", :censor_rules => @censor_rules).should == 'xx mouse' end it "should apply censor rules to HTML files" do diff --git a/spec/models/censor_rule_spec.rb b/spec/models/censor_rule_spec.rb index 314b060d2..d308ac1b9 100644 --- a/spec/models/censor_rule_spec.rb +++ b/spec/models/censor_rule_spec.rb @@ -64,19 +64,35 @@ describe CensorRule, "substituting things" do @censor_rule.replacement = "hello" end - it 'should do basic text substitution' do - body = "I don't know why you say goodbye" - @censor_rule.apply_to_text!(body) - body.should == "I don't know why you say hello" + describe :apply_to_text do + + it 'should do basic text substitution' do + body = "I don't know why you say goodbye" + @censor_rule.apply_to_text!(body) + body.should == "I don't know why you say hello" + end + end - it 'should keep size same for binary substitution' do - body = "I don't know why you say goodbye" - orig_body = body.dup - @censor_rule.apply_to_binary!(body) - body.size.should == orig_body.size - body.should == "I don't know why you say xxxxxxx" - body.should_not == orig_body # be sure duplicated as expected + describe :apply_to_binary do + + it 'should keep size same for binary substitution' do + body = "I don't know why you say goodbye" + orig_body = body.dup + @censor_rule.apply_to_binary!(body) + body.size.should == orig_body.size + body.should == "I don't know why you say xxxxxxx" + body.should_not == orig_body # be sure duplicated as expected + end + + it 'should handle a UTF-8 rule and ASCII-8BIT text' do + body = "I don't know why you say g‘oodbye" + body.force_encoding("ASCII-8BIT") if String.method_defined?(:encode) + @censor_rule.text = 'g‘oodbye' + @censor_rule.apply_to_binary!(body) + body.should == "I don't know why you say xxxxxxxxxx" + end + end end @@ -121,6 +137,26 @@ xxxxxxxxx BODY end + it "handles a UTF-8 rule with ASCII-8BIT text" do + @censor_rule.text = "--PRIVATE.*--P‘RIVATE" + @body = +<<BODY +Some public information +--PRIVATE +Some private information +--P‘RIVATE +BODY + @body.force_encoding('ASCII-8BIT') if String.method_defined?(:encode) + @censor_rule.apply_to_binary!(@body) + @body.should == +<<BODY +Some public information +xxxxxxxxx +xxxxxxxxxxxxxxxxxxxxxxxx +xxxxxxxxxxxx +BODY + end + end end |