diff options
-rw-r--r-- | lib/normalize_string.rb | 16 | ||||
-rw-r--r-- | spec/lib/basic_encoding_spec.rb | 5 |
2 files changed, 12 insertions, 9 deletions
diff --git a/lib/normalize_string.rb b/lib/normalize_string.rb
index d850d7e05..8b54c080c 100644
--- a/lib/normalize_string.rb
+++ b/lib/normalize_string.rb
@@ -77,16 +77,20 @@ def convert_string_to_utf8(s, suggested_character_encoding=nil)
     begin
         result = normalize_string_to_utf8 s, suggested_character_encoding
     rescue EncodingNormalizationError
-        result = s
-        if String.method_defined?(:encode)
-            result = s.force_encoding("utf-8").encode("utf-8", :invalid => :replace,
-                                                               :undef => :replace,
-                                                               :replace => "")
-        end
+        result = scrub(s)
     end
     result
 end
 
+def scrub(string)
+    if String.method_defined?(:encode)
+        string = string.force_encoding("utf-8")
+        string.valid_encoding? ? string : string.encode("utf-16le", :invalid => :replace, :replace => "").encode("utf-8")
+    else
+        Iconv.conv('UTF-8//IGNORE', 'UTF-8', string)
+    end
+end
+
 def log_text_details(message, text)
     if String.method_defined?(:encode)
         STDERR.puts "#{message}, we have text: #{text}, of class #{text.class} and encoding #{text.encoding}"
diff --git a/spec/lib/basic_encoding_spec.rb b/spec/lib/basic_encoding_spec.rb
index d77465ad8..d802da892 100644
--- a/spec/lib/basic_encoding_spec.rb
+++ b/spec/lib/basic_encoding_spec.rb
@@ -160,17 +160,16 @@ describe "convert_string_to_utf8" do
 
     describe "when passed uninterpretable character data" do
 
-        it "should return it as a utf8 string" do
+        it "should return it as a valid utf8 string with non-utf8 characters removed" do
 
             converted = convert_string_to_utf8 random_string
-            converted.should == random_string
 
             if String.method_defined?(:encode)
                 converted.encoding.to_s.should == 'UTF-8'
+                converted.valid_encoding?.should == true
             end
 
             converted = convert_string_to_utf8 random_string,'UTF-8'
-            converted.should == random_string
 
             if String.method_defined?(:encode)
                 converted.encoding.to_s.should == 'UTF-8'