diff options
-rw-r--r-- | lib/normalize_string.rb | 11 | ||||
-rw-r--r-- | spec/lib/basic_encoding_spec.rb | 55 |
2 files changed, 66 insertions, 0 deletions
# --- lib/normalize_string.rb ---

# Converts +s+ to a UTF-8 string.
#
# Delegates to normalize_string_to_utf8 first. If that raises
# EncodingNormalizationError, the string is relabelled as UTF-8 and then
# re-encoded with :invalid and :undef sequences replaced by the empty string.
#
# s                            - the String to convert.
# suggested_character_encoding - optional hint, passed straight through to
#                                normalize_string_to_utf8 (defaults to nil).
#
# Returns the converted String. When String has no #encode method, the
# fallback path returns +s+ unchanged.
def convert_string_to_utf8(s, suggested_character_encoding=nil)
  normalize_string_to_utf8 s, suggested_character_encoding
rescue EncodingNormalizationError
  # Without String#encode there is nothing to relabel or re-encode; hand the
  # input back rather than returning nil.
  return s unless String.method_defined?(:encode)

  # NOTE(review): force_encoding relabels +s+ itself, so the caller's string
  # ends up tagged as UTF-8 as a side effect - confirm callers expect this.
  s.force_encoding("utf-8").encode("utf-8", :invalid => :replace,
                                            :undef => :replace,
                                            :replace => "")
end

# --- spec/lib/basic_encoding_spec.rb ---

describe "convert_string_to_utf8" do

  describe "when passed uninterpretable character data" do

    it "should return it as a utf8 string" do

      converted = convert_string_to_utf8 random_string
      converted.should == random_string

      # Encoding can only be inspected when String is encoding-aware.
      if String.method_defined?(:encode)
        converted.encoding.to_s.should == 'UTF-8'
      end

      converted = convert_string_to_utf8 random_string,'UTF-8'
      converted.should == random_string

      if String.method_defined?(:encode)
        converted.encoding.to_s.should == 'UTF-8'
      end

    end
  end

  describe "when passed unlabelled Windows 1252 data" do

    it "should correctly convert it to UTF-8" do

      converted = convert_string_to_utf8 windows_1252_string

      converted.should == "DASH – DASH"

      if String.method_defined?(:encode)
        converted.encoding.to_s.should == 'UTF-8'
      end
    end

  end

  describe "when passed GB 18030 data" do

    it "should correctly convert it to UTF-8 if unlabelled" do

      converted = convert_string_to_utf8 gb_18030_spam_string

      converted.should start_with("贵公司负责人")

      if String.method_defined?(:encode)
        converted.encoding.to_s.should == 'UTF-8'
      end
    end

  end

end
\ No newline at end of file |