about summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- lib/normalize_string.rb 11
-rw-r--r-- spec/lib/basic_encoding_spec.rb 55
2 files changed, 66 insertions, 0 deletions
diff --git a/lib/normalize_string.rb b/lib/normalize_string.rb
index 3b6116970..de847cd16 100644
--- a/lib/normalize_string.rb
+++ b/lib/normalize_string.rb
@@ -72,6 +72,17 @@ def convert_string_to_utf8_or_binary(s, suggested_character_encoding=nil)
result
end
+# Convert +s+ to a UTF-8 string, dropping whatever cannot be interpreted.
+#
+# s                            - the String to convert.
+# suggested_character_encoding - optional encoding hint, passed straight
+#                                through to normalize_string_to_utf8.
+#
+# Returns whatever normalize_string_to_utf8 returns when it succeeds. If it
+# raises EncodingNormalizationError, returns +s+ relabelled as UTF-8 with
+# invalid and undefined sequences removed; on Rubies whose String has no
+# #encode, returns +s+ untouched rather than nil.
+def convert_string_to_utf8(s, suggested_character_encoding=nil)
+  normalize_string_to_utf8 s, suggested_character_encoding
+rescue EncodingNormalizationError
+  # Without String#encode the clean-up below is unavailable, so hand the
+  # input back instead of falling through with nil.
+  return s unless String.method_defined?(:encode)
+
+  # force_encoding relabels +s+ itself (it does not copy), so the caller's
+  # string ends up tagged UTF-8 as well; encode then builds the cleaned result.
+  # NOTE(review): Ruby releases before 2.1 document a same-encoding encode as
+  # a no-op, so invalid bytes may survive there -- confirm for the versions
+  # this project supports.
+  s.force_encoding("utf-8").encode("utf-8", :invalid => :replace,
+                                            :undef => :replace,
+                                            :replace => "")
+end
+
def log_text_details(message, text)
if String.method_defined?(:encode)
STDERR.puts "#{message}, we have text: #{text}, of class #{text.class} and encoding #{text.encoding}"
diff --git a/spec/lib/basic_encoding_spec.rb b/spec/lib/basic_encoding_spec.rb
index 43a65eab9..da5acf361 100644
--- a/spec/lib/basic_encoding_spec.rb
+++ b/spec/lib/basic_encoding_spec.rb
@@ -155,3 +155,58 @@ describe "convert_string_to_utf8_or_binary" do
end
end
+
+# Specs for convert_string_to_utf8. Every example converts a sample string
+# (the samples are defined outside this block) and checks the converted text
+# plus, where Ruby exposes per-string encodings, the UTF-8 label.
+describe "convert_string_to_utf8" do
+
+  # Asserts that +string+ is labelled UTF-8. The assertion only runs on Rubies
+  # that define String#encode; elsewhere it is skipped.
+  def check_utf8_label(string)
+    return unless String.method_defined?(:encode)
+    string.encoding.to_s.should == 'UTF-8'
+  end
+
+  describe "when passed uninterpretable character data" do
+
+    it "should return it as a utf8 string" do
+      # First with no encoding hint...
+      without_hint = convert_string_to_utf8 random_string
+      without_hint.should == random_string
+      check_utf8_label without_hint
+
+      # ...then with an explicit 'UTF-8' hint.
+      with_hint = convert_string_to_utf8 random_string, 'UTF-8'
+      with_hint.should == random_string
+      check_utf8_label with_hint
+    end
+
+  end
+
+  describe "when passed unlabelled Windows 1252 data" do
+
+    it "should correctly convert it to UTF-8" do
+      from_windows_1252 = convert_string_to_utf8 windows_1252_string
+
+      from_windows_1252.should == "DASH – DASH"
+      check_utf8_label from_windows_1252
+    end
+
+  end
+
+  describe "when passed GB 18030 data" do
+
+    it "should correctly convert it to UTF-8 if unlabelled" do
+      from_gb_18030 = convert_string_to_utf8 gb_18030_spam_string
+
+      from_gb_18030.should start_with("贵公司负责人")
+      check_utf8_label from_gb_18030
+    end
+
+  end
+
+end
\ No newline at end of file