diff options
Diffstat (limited to 'lib/normalize_string.rb')
-rw-r--r-- | lib/normalize_string.rb | 24 |
1 files changed, 24 insertions, 0 deletions
diff --git a/lib/normalize_string.rb b/lib/normalize_string.rb index 3b6116970..69853fd6e 100644 --- a/lib/normalize_string.rb +++ b/lib/normalize_string.rb @@ -1,3 +1,4 @@ +# -*- encoding : utf-8 -*- require 'iconv' unless String.method_defined?(:encode) require 'charlock_holmes' @@ -72,6 +73,29 @@ def convert_string_to_utf8_or_binary(s, suggested_character_encoding=nil) result end +class StringConversionResult < Struct.new(:string, :scrubbed) + alias_method :scrubbed?, :scrubbed +end + +def convert_string_to_utf8(s, suggested_character_encoding=nil) + begin + result = normalize_string_to_utf8 s, suggested_character_encoding + StringConversionResult.new(result, false) + rescue EncodingNormalizationError + result = scrub(s) + StringConversionResult.new(result, true) + end +end + +def scrub(string) + if String.method_defined?(:encode) + string = string.force_encoding("utf-8") + string.valid_encoding? ? string : string.encode("utf-16le", :invalid => :replace, :replace => "").encode("utf-8") + else + Iconv.conv('UTF-8//IGNORE', 'UTF-8', string) + end +end + def log_text_details(message, text) if String.method_defined?(:encode) STDERR.puts "#{message}, we have text: #{text}, of class #{text.class} and encoding #{text.encoding}" |