From 4440d11fb662c57428a2aba622209d6d1ddc0a59 Mon Sep 17 00:00:00 2001 From: Louise Crow Date: Tue, 9 Jun 2015 16:33:03 +0100 Subject: Round trip through utf-16 to clean utf-8 string As noted in the ruby docs (http://ruby-doc.org/core-1.9.3/String.html#method-i-encode), any conversion from an encoding to the same encoding is a no-op, so convert it first to utf-16. --- lib/normalize_string.rb | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'lib/normalize_string.rb') diff --git a/lib/normalize_string.rb b/lib/normalize_string.rb index d850d7e05..8b54c080c 100644 --- a/lib/normalize_string.rb +++ b/lib/normalize_string.rb @@ -77,16 +77,20 @@ def convert_string_to_utf8(s, suggested_character_encoding=nil) begin result = normalize_string_to_utf8 s, suggested_character_encoding rescue EncodingNormalizationError - result = s - if String.method_defined?(:encode) - result = s.force_encoding("utf-8").encode("utf-8", :invalid => :replace, - :undef => :replace, - :replace => "") - end + result = scrub(s) end result end +def scrub(string) + if String.method_defined?(:encode) + string = string.force_encoding("utf-8") + string.valid_encoding? ? string : string.encode("utf-16le", :invalid => :replace, :replace => "").encode("utf-8") + else + Iconv.conv('UTF-8//IGNORE', 'UTF-8', string) + end +end + def log_text_details(message, text) if String.method_defined?(:encode) STDERR.puts "#{message}, we have text: #{text}, of class #{text.class} and encoding #{text.encoding}" -- cgit v1.2.3 From 3d8f0cc6b70b55aad20ab4d86642f0e6d605c921 Mon Sep 17 00:00:00 2001 From: Louise Crow Date: Thu, 11 Jun 2015 17:11:26 +0100 Subject: convert_string_to_utf8 returns struct of string and scrubbing status. 
--- lib/normalize_string.rb | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'lib/normalize_string.rb') diff --git a/lib/normalize_string.rb b/lib/normalize_string.rb index 8b54c080c..69853fd6e 100644 --- a/lib/normalize_string.rb +++ b/lib/normalize_string.rb @@ -73,13 +73,18 @@ def convert_string_to_utf8_or_binary(s, suggested_character_encoding=nil) result end +class StringConversionResult < Struct.new(:string, :scrubbed) + alias_method :scrubbed?, :scrubbed +end + def convert_string_to_utf8(s, suggested_character_encoding=nil) begin result = normalize_string_to_utf8 s, suggested_character_encoding + StringConversionResult.new(result, false) rescue EncodingNormalizationError result = scrub(s) + StringConversionResult.new(result, true) end - result end def scrub(string) -- cgit v1.2.3