diff options
-rw-r--r-- | Gemfile | 1 | ||||
-rw-r--r-- | Gemfile.lock | 2 | ||||
-rw-r--r-- | config/initializers/alaveteli.rb | 1 | ||||
-rw-r--r-- | config/packages | 3 | ||||
-rw-r--r-- | lib/normalize_string.rb | 75 | ||||
-rw-r--r-- | spec/lib/basic_encoding_tests.rb | 157 |
6 files changed, 238 insertions, 1 deletions
@@ -10,6 +10,7 @@ source 'https://rubygems.org' gem 'rails', '3.1.12' gem 'pg' +gem 'charlock_holmes' gem 'fastercsv', '>=1.5.5' gem 'json' gem 'mahoro' diff --git a/Gemfile.lock b/Gemfile.lock index 3dc08590d..a9c2e7278 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -58,6 +58,7 @@ GEM net-sftp (>= 2.0.0) net-ssh (>= 2.0.14) net-ssh-gateway (>= 1.1.0) + charlock_holmes (0.6.9.4) chunky_png (1.2.6) colorize (0.5.8) columnize (0.3.6) @@ -242,6 +243,7 @@ DEPENDENCIES annotate bootstrap-sass capistrano + charlock_holmes compass coveralls debugger diff --git a/config/initializers/alaveteli.rb b/config/initializers/alaveteli.rb index 35d486837..455ff467d 100644 --- a/config/initializers/alaveteli.rb +++ b/config/initializers/alaveteli.rb @@ -59,3 +59,4 @@ require 'quiet_opener.rb' require 'mail_handler' require 'public_body_categories' require 'ability' +require 'normalize_string' diff --git a/config/packages b/config/packages index db51e5bdd..fc67cda6b 100644 --- a/config/packages +++ b/config/packages @@ -36,4 +36,5 @@ rake (>= 0.9.2.2) build-essential bundler sqlite3 -libsqlite3-dev
\ No newline at end of file
+libsqlite3-dev
+libicu-dev
diff --git a/lib/normalize_string.rb b/lib/normalize_string.rb
new file mode 100644
index 000000000..e708a8d96
--- /dev/null
+++ b/lib/normalize_string.rb
@@ -0,0 +1,75 @@
+require 'iconv' unless RUBY_VERSION.to_f >= 1.9
+require 'charlock_holmes'
+
+class EncodingNormalizationError < StandardError
+end
+
+# Return s converted to UTF-8, trying the caller's suggested encoding,
+# UTF-8 and the charlock_holmes guess; raises EncodingNormalizationError
+# if none fits.  On Ruby 1.9+ s itself is re-tagged by force_encoding.
+def normalize_string_to_utf8(s, suggested_character_encoding=nil)
+  # Make a list of encodings to try:
+  to_try = []
+
+  detected = CharlockHolmes::EncodingDetector.detect(s)
+  guessed_encoding = (detected && detected[:encoding]) || ''
+
+  # It's reasonably common for windows-1252 text to be mislabelled
+  # as ISO-8859-1, so try that first if charlock_holmes guessed
+  # that.  However, it can also easily misidentify UTF-8 strings as
+  # ISO-8859-1 so we don't want to go with the guess by default...
+  to_try.push guessed_encoding if guessed_encoding.downcase == 'windows-1252'
+
+  to_try.push suggested_character_encoding if suggested_character_encoding
+  to_try.push 'UTF-8'
+  to_try.push guessed_encoding
+
+  to_try.each do |from_encoding|
+    if RUBY_VERSION.to_f >= 1.9
+      begin
+        s.force_encoding from_encoding
+        return s.encode('UTF-8') if s.valid_encoding?
+      rescue ArgumentError, EncodingError
+        # ArgumentError: from_encoding is not an encoding name Ruby
+        # knows.  EncodingError: encode('UTF-8') could not transcode
+        # the bytes from from_encoding.  Move on to the next one...
+      end
+    else
+      begin
+        converted = Iconv.conv 'UTF-8', from_encoding, s
+        return converted
+      rescue Iconv::Failure
+        # We get this if there are invalid bytes when
+        # interpreted as from_encoding at the point of
+        # the Iconv.conv; move on to the next one...
+      end
+    end
+  end
+  raise EncodingNormalizationError, "Couldn't find a valid character encoding for the string"
+end
+
+def convert_string_to_utf8_or_binary(s, suggested_character_encoding=nil)
+  # This function exists to help to keep consistent with the
+  # behaviour of earlier versions of Alaveteli: in the code as it
+  # is, there are situations where it's expected that we generally
+  # have a UTF-8 encoded string, but if the source data was
+  # uninterpretable under any character encoding, the string may be
+  # binary data (i.e. invalid UTF-8).  Such a string would then be
+  # mangled into valid UTF-8 by _sanitize_text for the purposes of
+  # display.
+
+  # This seems unsatisfactory to me - two better alternatives would
+  # be either: (a) to mangle the data into valid UTF-8 in this
+  # method or (b) to treat the 'text/*' attachment as
+  # 'application/octet-stream' instead.  However, for the purposes
+  # of the transition to Ruby 1.9 and/or Rails 3 we just want the
+  # behaviour to be as similar as possible.
+
+  begin
+    result = normalize_string_to_utf8 s, suggested_character_encoding
+  rescue EncodingNormalizationError
+    result = s
+    s.force_encoding 'ASCII-8BIT' if RUBY_VERSION.to_f >= 1.9
+  end
+  result
+end
diff --git a/spec/lib/basic_encoding_tests.rb b/spec/lib/basic_encoding_tests.rb
new file mode 100644
index 000000000..35d35fd4a
--- /dev/null
+++ b/spec/lib/basic_encoding_tests.rb
@@ -0,0 +1,157 @@
+# -*- coding: utf-8 -*-
+require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
+
+# Pack an array of byte values (0-255) into a string; on Ruby 1.9+ the
+# string is tagged as claimed_encoding (default 'ASCII-8BIT').
+def bytes_to_binary_string(bytes, claimed_encoding = nil)
+  claimed_encoding ||= 'ASCII-8BIT'
+  bytes_string = bytes.pack('C*')
+  bytes_string.force_encoding claimed_encoding if RUBY_VERSION.to_f >= 1.9
+  bytes_string
+end
+
+random_string = bytes_to_binary_string [ 0x0f, 0x58, 0x1c, 0x8f, 0xa4, 0xcf,
+                                         0xf6, 0x8c, 0x9d, 0xa7, 0x06, 0xd9,
+                                         0xf7, 0x90, 0x6c, 0x6f]
+
+windows_1252_string = bytes_to_binary_string [ 0x44, 0x41, 0x53, 0x48, 0x20,
+                                               0x96, 0x20, 0x44, 0x41, 0x53,
+                                               0x48 ]
+
+# It's a shame this example is so long, but if we don't take enough it
+# gets misinterpreted as Shift_JIS
+
+gb_18030_bytes = [ 0xb9, 0xf3, 0xb9, 0xab, 0xcb, 0xbe, 0xb8, 0xba, 0xd4, 0xf0,
+                   0xc8, 0xcb, 0x28, 0xbe, 0xad, 0xc0, 0xed, 0x2f, 0xb2, 0xc6,
+                   0xce, 0xf1, 0x29, 0xc4, 0xfa, 0xba, 0xc3, 0xa3, 0xba, 0x0d,
+                   0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+                   0x20, 0x20, 0x20, 0xb1, 0xbe, 0xb9, 0xab, 0xcb, 0xbe, 0xd4,
+                   0xda, 0x31, 0x39, 0x39, 0x37, 0xc4, 0xea, 0xb3, 0xc9, 0xc1,
+                   0xa2, 0xb9, 0xfa, 0xbc, 0xd2, 0xb9, 0xa4, 0xc9, 0xcc, 0xd7,
+                   0xa2, 0xb2, 0xe1, 0x2e, 0xca, 0xb5, 0xc1, 0xa6, 0xd0, 0xdb,
+                   0xba, 0xf1, 0xa1, 0xa3, 0xd3, 0xd0, 0xb6, 0xc0, 0xc1, 0xa2,
+                   0xcb, 0xb0, 0xce, 0xf1, 0x0d, 0x0a, 0x20, 0x20, 0x20, 0x20,
+                   0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0xd7, 0xa8, 0xd2, 0xb5,
+                   0xc8, 0xcb, 0xd4, 0xb1, 0x3b, 0xd4, 0xda, 0xc8, 0xab, 0xb9,
+                   0xfa, 0xb8, 0xf7, 0xb3, 0xc7, 0xca, 0xd0, 0xc9, 0xe8, 0xc1,
+                   0xa2, 0xb7, 0xd6, 0xb9, 0xab, 0xcb, 0xbe, 0xa3, 0xa8, 0xd5,
+                   0xe3, 0xbd, 0xad, 0xa1, 0xa2, 0xc9, 0xcf, 0xba, 0xa3, 0xa1,
+                   0xa2, 0xb9, 0xe3, 0xd6, 0xdd, 0xa1, 0xa2, 0xbd, 0xad, 0xcb,
+                   0xd5, 0xb5, 0xc8, 0x0d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
+                   0x20, 0x20, 0x20, 0x20, 0x20, 0xb5, 0xd8, 0xb7, 0xbd, 0xa3,
+                   0xa9, 0xd2, 0xf2, 0xbd, 0xf8, 0xcf, 0xee, 0xbd, 0xcf, 0xb6,
+                   0xe0, 0xcf, 0xd6, 0xcd, 0xea, 0xb3, 0xc9, 0xb2, 0xbb, 0xc1,
+                   0xcb, 0xc3, 0xbf, 0xd4, 0xc2, 0xcf, 0xfa, 0xca, 0xdb, 0xb6,
+                   0xee, 0xb6, 0xc8, 0xa1, 0xa3, 0xc3, 0xbf, 0xd4, 0xc2, 0xd3,
+                   0xd0, 0xd2, 0xbb, 0xb2, 0xbf, 0xb7, 0xd6, 0x0d, 0x0a, 0x20,
+                   0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0xd4,
+                   0xf6, 0xd6, 0xb5, 0xb6, 0x90, 0xa3, 0xa8, 0x36, 0x2d, 0x37,
+                   0x25, 0xd7, 0xf3, 0xd3, 0xd2, 0x29, 0xba, 0xcd, 0xc6, 0xd5,
+                   0xc6, 0xb1, 0xa3, 0xa8, 0x30, 0x2e, 0x35, 0x25, 0x2d, 0x32,
+                   0x25, 0x20, 0xd7, 0xf3, 0xd3, 0xd2, 0xa3, 0xa9, 0xd3, 0xc5,
+                   0xbb, 0xdd, 0xb4, 0xfa, 0xbf, 0xaa, 0xbb, 0xf2, 0xba, 0xcf,
+                   0xd7, 0xf7, 0xa3, 0xac, 0x0d, 0x0a, 0x20, 0x20, 0x20, 0x20,
+                   0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0xb5, 0xe3, 0xca, 0xfd,
+                   0xbd, 0xcf, 0xb5, 0xcd, 0xa1, 0xa3, 0xb4, 0xfa, 0xc0, 0xed,
+                   0xb7, 0xb6, 0xce, 0xa7, 0xc8, 0xe7, 0xcf, 0xc2, 0xa3, 0xba,
+                   0x0d, 0x0a ]
+
+gb_18030_spam_string = bytes_to_binary_string gb_18030_bytes
+
+describe "normalize_string_to_utf8" do
+
+  describe "when passed uniterpretable character data" do
+
+    it "should reject it as invalid" do
+
+      expect {
+        normalize_string_to_utf8 random_string
+      }.to raise_error(EncodingNormalizationError)
+
+      expect {
+        normalize_string_to_utf8 random_string, 'UTF-8'
+      }.to raise_error(EncodingNormalizationError)
+
+    end
+  end
+
+  describe "when passed unlabelled Windows 1252 data" do
+
+    it "should correctly convert it to UTF-8" do
+
+      normalized = normalize_string_to_utf8 windows_1252_string
+
+      normalized.should == "DASH – DASH"
+
+    end
+
+  end
+
+  describe "when passed GB 18030 data" do
+
+    it "should correctly convert it to UTF-8 if unlabelled" do
+
+      normalized = normalize_string_to_utf8 gb_18030_spam_string
+
+      normalized.should start_with("贵公司负责人")
+
+    end
+
+  end
+
+end
+
+describe "convert_string_to_utf8_or_binary" do
+
+  describe "when passed uniterpretable character data" do
+
+    it "should return it as a binary string" do
+
+      converted = convert_string_to_utf8_or_binary random_string
+      converted.should == random_string
+
+      if RUBY_VERSION.to_f >= 1.9
+        converted.encoding.should == Encoding::ASCII_8BIT
+      end
+
+      converted = convert_string_to_utf8_or_binary random_string,'UTF-8'
+      converted.should == random_string
+
+      if RUBY_VERSION.to_f >= 1.9
+        converted.encoding.should == Encoding::ASCII_8BIT
+      end
+
+    end
+  end
+
+  describe "when passed unlabelled Windows 1252 data" do
+
+    it "should correctly convert it to UTF-8" do
+
+      converted = convert_string_to_utf8_or_binary windows_1252_string
+
+      converted.should == "DASH – DASH"
+
+      if RUBY_VERSION.to_f >= 1.9
+        converted.encoding.should == Encoding::UTF_8
+      end
+    end
+
+  end
+
+  describe "when passed GB 18030 data" do
+
+    it "should correctly convert it to UTF-8 if unlabelled" do
+
+      converted = convert_string_to_utf8_or_binary gb_18030_spam_string
+
+      converted.should start_with("贵公司负责人")
+
+      if RUBY_VERSION.to_f >= 1.9
+        converted.encoding.should == Encoding::UTF_8
+      end
+    end
+
+  end
+
+end