about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author Louise Crow <louise.crow@gmail.com> 2015-06-04 17:56:44 +0100
committer Louise Crow <louise.crow@gmail.com> 2015-06-22 17:43:09 +0100
commit 910acfa8ae939f363a872123eb47a86e64a192c3 (patch)
tree c0e7e0283cc191be7905ac1c5b5f58f4074842c4
parent 3efe2f333a9b143e88556c0aeedb534090eb41d3 (diff)
Use ASCII-8BIT for replacement patterns when handling binary data.
Also be explicit about using UTF-8 when handling text data.
-rw-r--r-- app/models/censor_rule.rb 28
-rw-r--r-- lib/alaveteli_text_masker.rb 4
-rw-r--r-- spec/lib/alaveteli_text_masker_spec.rb 7
-rw-r--r-- spec/models/censor_rule_spec.rb 58
4 files changed, 74 insertions, 23 deletions
diff --git a/app/models/censor_rule.rb b/app/models/censor_rule.rb
index f1f1a0d70..aec8a87cc 100644
--- a/app/models/censor_rule.rb
+++ b/app/models/censor_rule.rb
@@ -46,17 +46,17 @@ class CensorRule < ActiveRecord::Base
def apply_to_text(text_to_censor)
return nil if text_to_censor.nil?
- text_to_censor.gsub(to_replace, replacement)
+ text_to_censor.gsub(to_replace('UTF-8'), replacement)
end
def apply_to_text!(text_to_censor)
return nil if text_to_censor.nil?
- text_to_censor.gsub!(to_replace, replacement)
+ text_to_censor.gsub!(to_replace('UTF-8'), replacement)
end
def apply_to_binary!(binary_to_censor)
return nil if binary_to_censor.nil?
- binary_to_censor.gsub!(to_replace) { |match| match.gsub(/./, 'x') }
+ binary_to_censor.gsub!(to_replace('ASCII-8BIT')) { |match| match.gsub(single_char_regexp, 'x') }
end
def is_global?
@@ -65,6 +65,14 @@ class CensorRule < ActiveRecord::Base
private
+ def single_char_regexp
+ if String.method_defined?(:encode)
+ Regexp.new('.'.force_encoding('ASCII-8BIT'))
+ else
+ Regexp.new('.', nil, 'N')
+ end
+ end
+
def require_user_request_or_public_body
if info_request.nil? && user.nil? && public_body.nil?
[:info_request, :user, :public_body].each do |a|
@@ -75,18 +83,22 @@ class CensorRule < ActiveRecord::Base
def require_valid_regexp
begin
- make_regexp
+ make_regexp('UTF-8')
rescue RegexpError => e
errors.add(:text, e.message)
end
end
- def make_regexp
- Regexp.new(text, Regexp::MULTILINE)
+ def to_replace(encoding)
+ regexp? ? make_regexp(encoding) : encoded_text(encoding)
+ end
+
+ def encoded_text(encoding)
+ String.method_defined?(:encode) ? text.dup.force_encoding(encoding) : text
end
- def to_replace
- regexp? ? make_regexp : text
+ def make_regexp(encoding)
+ Regexp.new(encoded_text(encoding), Regexp::MULTILINE)
end
end
diff --git a/lib/alaveteli_text_masker.rb b/lib/alaveteli_text_masker.rb
index 3c2bcf825..5ec7aa95c 100644
--- a/lib/alaveteli_text_masker.rb
+++ b/lib/alaveteli_text_masker.rb
@@ -79,7 +79,7 @@ module AlaveteliTextMasker
# Replace text in place
def apply_binary_masks!(text, options = {})
# Keep original size, so can check haven't resized it
- orig_size = text.mb_chars.size
+ orig_size = text.size
# Replace ASCII email addresses...
text.gsub!(MySociety::Validate.email_find_regexp) do |email|
@@ -114,7 +114,7 @@ module AlaveteliTextMasker
# Replace censor items
censor_rules = options[:censor_rules] || []
censor_rules.each{ |censor_rule| censor_rule.apply_to_binary!(text) }
- raise "internal error in apply_binary_masks!" if text.mb_chars.size != orig_size
+ raise "internal error in apply_binary_masks!" if text.size != orig_size
return text
end
diff --git a/spec/lib/alaveteli_text_masker_spec.rb b/spec/lib/alaveteli_text_masker_spec.rb
index f2d52c1cc..f8c22a849 100644
--- a/spec/lib/alaveteli_text_masker_spec.rb
+++ b/spec/lib/alaveteli_text_masker_spec.rb
@@ -31,10 +31,13 @@ describe AlaveteliTextMasker do
data.should == "There was a xxxxx called xxxxxxx, he wished that he was xxxx."
end
- it 'should handle multibyte characters correctly' do
+ it 'should handle multibyte characters in binary file types as binary data' do
data = 'á mouse'
+ if String.method_defined?(:encode)
+ data = data.force_encoding("ASCII-8BIT")
+ end
@regex_censor_rule.text = 'á'
- apply_masks!(data, "application/octet-stream", :censor_rules => @censor_rules).should == 'x mouse'
+ apply_masks!(data, "application/octet-stream", :censor_rules => @censor_rules).should == 'xx mouse'
end
it "should apply censor rules to HTML files" do
diff --git a/spec/models/censor_rule_spec.rb b/spec/models/censor_rule_spec.rb
index 314b060d2..d308ac1b9 100644
--- a/spec/models/censor_rule_spec.rb
+++ b/spec/models/censor_rule_spec.rb
@@ -64,19 +64,35 @@ describe CensorRule, "substituting things" do
@censor_rule.replacement = "hello"
end
- it 'should do basic text substitution' do
- body = "I don't know why you say goodbye"
- @censor_rule.apply_to_text!(body)
- body.should == "I don't know why you say hello"
+ describe :apply_to_text do
+
+ it 'should do basic text substitution' do
+ body = "I don't know why you say goodbye"
+ @censor_rule.apply_to_text!(body)
+ body.should == "I don't know why you say hello"
+ end
+
end
- it 'should keep size same for binary substitution' do
- body = "I don't know why you say goodbye"
- orig_body = body.dup
- @censor_rule.apply_to_binary!(body)
- body.size.should == orig_body.size
- body.should == "I don't know why you say xxxxxxx"
- body.should_not == orig_body # be sure duplicated as expected
+ describe :apply_to_binary do
+
+ it 'should keep size same for binary substitution' do
+ body = "I don't know why you say goodbye"
+ orig_body = body.dup
+ @censor_rule.apply_to_binary!(body)
+ body.size.should == orig_body.size
+ body.should == "I don't know why you say xxxxxxx"
+ body.should_not == orig_body # be sure duplicated as expected
+ end
+
+ it 'should handle a UTF-8 rule and ASCII-8BIT text' do
+ body = "I don't know why you say g‘oodbye"
+ body.force_encoding("ASCII-8BIT") if String.method_defined?(:encode)
+ @censor_rule.text = 'g‘oodbye'
+ @censor_rule.apply_to_binary!(body)
+ body.should == "I don't know why you say xxxxxxxxxx"
+ end
+
end
end
@@ -121,6 +137,26 @@ xxxxxxxxx
BODY
end
+ it "handles a UTF-8 rule with ASCII-8BIT text" do
+ @censor_rule.text = "--PRIVATE.*--P‘RIVATE"
+ @body =
+<<BODY
+Some public information
+--PRIVATE
+Some private information
+--P‘RIVATE
+BODY
+ @body.force_encoding('ASCII-8BIT') if String.method_defined?(:encode)
+ @censor_rule.apply_to_binary!(@body)
+ @body.should ==
+<<BODY
+Some public information
+xxxxxxxxx
+xxxxxxxxxxxxxxxxxxxxxxxx
+xxxxxxxxxxxx
+BODY
+ end
+
end
end