diff options
author | Gareth Rees <gareth@mysociety.org> | 2015-06-24 11:19:43 +0100 |
---|---|---|
committer | Gareth Rees <gareth@mysociety.org> | 2015-06-24 11:19:43 +0100 |
commit | 2cce1794a4d9d2c42b83bab8a693900e8ca23ebc (patch) | |
tree | 7408a04d5ac0963ec2defbbf7d4955cff7cd62b5 /lib/normalize_string.rb | |
parent | ed6b256539e0dcaa3764951d90e2dc599a8acddd (diff) | |
parent | 54ba7a4fa232ad3b57310551b9a5e19d72060abe (diff) |
Merge branch 'develop' into release-22-develop
Diffstat (limited to 'lib/normalize_string.rb')
-rw-r--r-- | lib/normalize_string.rb | 23 |
1 file changed, 16 insertions, 7 deletions
```diff
diff --git a/lib/normalize_string.rb b/lib/normalize_string.rb
index d850d7e05..69853fd6e 100644
--- a/lib/normalize_string.rb
+++ b/lib/normalize_string.rb
@@ -73,18 +73,27 @@ def convert_string_to_utf8_or_binary(s, suggested_character_encoding=nil)
     result
 end
 
+class StringConversionResult < Struct.new(:string, :scrubbed)
+  alias_method :scrubbed?, :scrubbed
+end
+
 def convert_string_to_utf8(s, suggested_character_encoding=nil)
     begin
         result = normalize_string_to_utf8 s, suggested_character_encoding
+        StringConversionResult.new(result, false)
     rescue EncodingNormalizationError
-        result = s
-        if String.method_defined?(:encode)
-            result = s.force_encoding("utf-8").encode("utf-8", :invalid => :replace,
-                                                               :undef => :replace,
-                                                               :replace => "")
-        end
+        result = scrub(s)
+        StringConversionResult.new(result, true)
+    end
+end
+
+def scrub(string)
+    if String.method_defined?(:encode)
+        string = string.force_encoding("utf-8")
+        string.valid_encoding? ? string : string.encode("utf-16le", :invalid => :replace, :replace => "").encode("utf-8")
+    else
+        Iconv.conv('UTF-8//IGNORE', 'UTF-8', string)
     end
-    result
 end
 
 def log_text_details(message, text)
```