From 4440d11fb662c57428a2aba622209d6d1ddc0a59 Mon Sep 17 00:00:00 2001 From: Louise Crow Date: Tue, 9 Jun 2015 16:33:03 +0100 Subject: Round trip through utf-16 to clean utf-8 string As noted in the ruby docs (http://ruby-doc.org/core-1.9.3/String.html#method-i-encode), any conversion from an encoding to the same encoding is a no-op, so convert it first to utf-16. --- spec/lib/basic_encoding_spec.rb | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'spec/lib/basic_encoding_spec.rb') diff --git a/spec/lib/basic_encoding_spec.rb b/spec/lib/basic_encoding_spec.rb index d77465ad8..d802da892 100644 --- a/spec/lib/basic_encoding_spec.rb +++ b/spec/lib/basic_encoding_spec.rb @@ -160,17 +160,16 @@ describe "convert_string_to_utf8" do describe "when passed uninterpretable character data" do - it "should return it as a utf8 string" do + it "should return it as a valid utf8 string with non-utf8 characters removed" do converted = convert_string_to_utf8 random_string - converted.should == random_string if String.method_defined?(:encode) converted.encoding.to_s.should == 'UTF-8' + converted.valid_encoding?.should == true end converted = convert_string_to_utf8 random_string,'UTF-8' - converted.should == random_string if String.method_defined?(:encode) converted.encoding.to_s.should == 'UTF-8' -- cgit v1.2.3 From 3d8f0cc6b70b55aad20ab4d86642f0e6d605c921 Mon Sep 17 00:00:00 2001 From: Louise Crow Date: Thu, 11 Jun 2015 17:11:26 +0100 Subject: convert_string_to_utf8 returns struct of string and scrubbing status. 
--- spec/lib/basic_encoding_spec.rb | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) (limited to 'spec/lib/basic_encoding_spec.rb') diff --git a/spec/lib/basic_encoding_spec.rb b/spec/lib/basic_encoding_spec.rb index d802da892..6758d60a3 100644 --- a/spec/lib/basic_encoding_spec.rb +++ b/spec/lib/basic_encoding_spec.rb @@ -160,20 +160,24 @@ describe "convert_string_to_utf8" do describe "when passed uninterpretable character data" do - it "should return it as a valid utf8 string with non-utf8 characters removed" do + it "should return it as a valid utf8 string with non-utf8 characters removed + and mark it as scrubbed" do converted = convert_string_to_utf8 random_string if String.method_defined?(:encode) - converted.encoding.to_s.should == 'UTF-8' - converted.valid_encoding?.should == true + converted.string.encoding.to_s.should == 'UTF-8' + converted.string.valid_encoding?.should == true end + converted.scrubbed?.should == true converted = convert_string_to_utf8 random_string,'UTF-8' if String.method_defined?(:encode) - converted.encoding.to_s.should == 'UTF-8' + converted.string.encoding.to_s.should == 'UTF-8' + converted.string.valid_encoding?.should == true end + converted.scrubbed?.should == true end end @@ -184,11 +188,13 @@ describe "convert_string_to_utf8" do converted = convert_string_to_utf8 windows_1252_string - converted.should == "DASH – DASH" + converted.string.should == "DASH – DASH" if String.method_defined?(:encode) - converted.encoding.to_s.should == 'UTF-8' + converted.string.encoding.to_s.should == 'UTF-8' end + converted.scrubbed?.should == false + end end @@ -199,11 +205,12 @@ describe "convert_string_to_utf8" do converted = convert_string_to_utf8 gb_18030_spam_string - converted.should start_with("贵公司负责人") + converted.string.should start_with("贵公司负责人") if String.method_defined?(:encode) - converted.encoding.to_s.should == 'UTF-8' + converted.string.encoding.to_s.should == 'UTF-8' end + converted.scrubbed?.should == 
false end end -- cgit v1.2.3