about | summary | refs | log | tree | commit | diff | stats
path: root/lib/normalize_string.rb
diff options
context:
space:
mode:
author Gareth Rees <gareth@mysociety.org> 2015-06-24 11:19:43 +0100
committer Gareth Rees <gareth@mysociety.org> 2015-06-24 11:19:43 +0100
commit 2cce1794a4d9d2c42b83bab8a693900e8ca23ebc (patch)
tree 7408a04d5ac0963ec2defbbf7d4955cff7cd62b5 /lib/normalize_string.rb
parent ed6b256539e0dcaa3764951d90e2dc599a8acddd (diff)
parent 54ba7a4fa232ad3b57310551b9a5e19d72060abe (diff)
Merge branch 'develop' into release-22-develop
Diffstat (limited to 'lib/normalize_string.rb')
-rw-r--r-- lib/normalize_string.rb 23
1 files changed, 16 insertions, 7 deletions
diff --git a/lib/normalize_string.rb b/lib/normalize_string.rb
index d850d7e05..69853fd6e 100644
--- a/lib/normalize_string.rb
+++ b/lib/normalize_string.rb
@@ -73,18 +73,27 @@ def convert_string_to_utf8_or_binary(s, suggested_character_encoding=nil)
result
end
+class StringConversionResult < Struct.new(:string, :scrubbed)
+ alias_method :scrubbed?, :scrubbed
+end
+
def convert_string_to_utf8(s, suggested_character_encoding=nil)
begin
result = normalize_string_to_utf8 s, suggested_character_encoding
+ StringConversionResult.new(result, false)
rescue EncodingNormalizationError
- result = s
- if String.method_defined?(:encode)
- result = s.force_encoding("utf-8").encode("utf-8", :invalid => :replace,
- :undef => :replace,
- :replace => "")
- end
+ result = scrub(s)
+ StringConversionResult.new(result, true)
+ end
+end
+
+def scrub(string)
+ if String.method_defined?(:encode)
+ string = string.force_encoding("utf-8")
+ string.valid_encoding? ? string : string.encode("utf-16le", :invalid => :replace, :replace => "").encode("utf-8")
+ else
+ Iconv.conv('UTF-8//IGNORE', 'UTF-8', string)
end
- result
end
def log_text_details(message, text)