about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
-rw-r--r-- Gemfile 1
-rw-r--r-- Gemfile.lock 2
-rw-r--r-- config/initializers/alaveteli.rb 1
-rw-r--r-- config/packages 3
-rw-r--r-- lib/normalize_string.rb 75
-rw-r--r-- spec/lib/basic_encoding_tests.rb 157
6 files changed, 238 insertions, 1 deletions
diff --git a/Gemfile b/Gemfile
index 5e4c60ea0..18c2aec0d 100644
--- a/Gemfile
+++ b/Gemfile
@@ -10,6 +10,7 @@ source 'https://rubygems.org'
gem 'rails', '3.1.12'
gem 'pg'
+gem 'charlock_holmes'
gem 'fastercsv', '>=1.5.5'
gem 'json'
gem 'mahoro'
diff --git a/Gemfile.lock b/Gemfile.lock
index 3dc08590d..a9c2e7278 100644
--- a/Gemfile.lock
+++ b/Gemfile.lock
@@ -58,6 +58,7 @@ GEM
net-sftp (>= 2.0.0)
net-ssh (>= 2.0.14)
net-ssh-gateway (>= 1.1.0)
+ charlock_holmes (0.6.9.4)
chunky_png (1.2.6)
colorize (0.5.8)
columnize (0.3.6)
@@ -242,6 +243,7 @@ DEPENDENCIES
annotate
bootstrap-sass
capistrano
+ charlock_holmes
compass
coveralls
debugger
diff --git a/config/initializers/alaveteli.rb b/config/initializers/alaveteli.rb
index 35d486837..455ff467d 100644
--- a/config/initializers/alaveteli.rb
+++ b/config/initializers/alaveteli.rb
@@ -59,3 +59,4 @@ require 'quiet_opener.rb'
require 'mail_handler'
require 'public_body_categories'
require 'ability'
+require 'normalize_string'
diff --git a/config/packages b/config/packages
index db51e5bdd..fc67cda6b 100644
--- a/config/packages
+++ b/config/packages
@@ -36,4 +36,5 @@ rake (>= 0.9.2.2)
build-essential
bundler
sqlite3
-libsqlite3-dev
\ No newline at end of file
+libsqlite3-dev
+libicu-dev
diff --git a/lib/normalize_string.rb b/lib/normalize_string.rb
new file mode 100644
index 000000000..e708a8d96
--- /dev/null
+++ b/lib/normalize_string.rb
@@ -0,0 +1,75 @@
+require 'iconv' unless RUBY_VERSION.to_f >= 1.9
+require 'charlock_holmes'
+
+# Raised by normalize_string_to_utf8 when none of the character
+# encodings it tries yields a valid conversion to UTF-8.
+class EncodingNormalizationError < StandardError
+end
+
+# Convert the string s to UTF-8, trying a series of candidate source
+# encodings in turn and returning the result for the first one under
+# which the bytes are valid and can be converted.
+#
+# s - the string to convert. Note that on Ruby >= 1.9 it is re-tagged
+#     in place (via force_encoding) with each candidate encoding that
+#     is tried, so its encoding may have changed on return.
+# suggested_character_encoding - optional name of the encoding that
+#     the data is claimed to be in.
+#
+# Returns the UTF-8 version of the string.
+# Raises EncodingNormalizationError if no candidate encoding works.
+def normalize_string_to_utf8(s, suggested_character_encoding=nil)
+
+    # Be defensive in case the detector gives back no result at all,
+    # or a result without an :encoding entry:
+    detection = CharlockHolmes::EncodingDetector.detect(s)
+    guessed_encoding = (detection && detection[:encoding]) || ''
+
+    # Make a list of encodings to try:
+    to_try = []
+
+    # It's reasonably common for windows-1252 text to be mislabelled
+    # as ISO-8859-1, so try windows-1252 first if charlock_holmes
+    # guessed that. However, it can also easily misidentify UTF-8
+    # strings as ISO-8859-1 so we don't want to go with the guess by
+    # default...
+    to_try.push guessed_encoding if guessed_encoding.downcase == 'windows-1252'
+
+    to_try.push suggested_character_encoding if suggested_character_encoding
+    to_try.push 'UTF-8'
+    to_try.push guessed_encoding
+
+    to_try.each do |from_encoding|
+        if RUBY_VERSION.to_f >= 1.9
+            begin
+                s.force_encoding from_encoding
+                return s.encode('UTF-8') if s.valid_encoding?
+            rescue ArgumentError, EncodingError
+                # ArgumentError: from_encoding isn't an encoding
+                # name that Ruby recognizes.  EncodingError: the
+                # bytes couldn't be converted to UTF-8 when
+                # interpreted as from_encoding (for example,
+                # Encoding::UndefinedConversionError for a byte
+                # with no mapping).  Either way, move on to the
+                # next one...
+            end
+        else
+            begin
+                return Iconv.conv('UTF-8', from_encoding, s)
+            rescue Iconv::Failure
+                # We get this if there are invalid bytes when
+                # interpreted as from_encoding at the point of
+                # the Iconv.conv; move on to the next one...
+            end
+        end
+    end
+    raise EncodingNormalizationError, "Couldn't find a valid character encoding for the string"
+
+end
+
+# Return s converted to UTF-8 if a usable character encoding can be
+# found for it, and otherwise return s itself, tagged as binary data
+# (ASCII-8BIT) on Ruby >= 1.9.
+#
+# This exists to stay consistent with the behaviour of earlier
+# versions of Alaveteli: there are places in the code that generally
+# expect a UTF-8 encoded string, but where the string may be binary
+# data (i.e. invalid UTF-8) if the source data was uninterpretable
+# under any character encoding. Such a string would then be mangled
+# into valid UTF-8 by _sanitize_text for the purposes of display.
+#
+# That seems unsatisfactory - two better alternatives would be
+# either: (a) to mangle the data into valid UTF-8 in this method or
+# (b) to treat the 'text/*' attachment as 'application/octet-stream'
+# instead. However, for the purposes of the transition to Ruby 1.9
+# and/or Rails 3 we just want the behaviour to be as similar as
+# possible.
+def convert_string_to_utf8_or_binary(s, suggested_character_encoding=nil)
+    normalize_string_to_utf8(s, suggested_character_encoding)
+rescue EncodingNormalizationError
+    # No encoding worked, so hand back the original string object,
+    # marked as raw bytes where Ruby tracks string encodings:
+    s.force_encoding('ASCII-8BIT') if RUBY_VERSION.to_f >= 1.9
+    s
+end
diff --git a/spec/lib/basic_encoding_tests.rb b/spec/lib/basic_encoding_tests.rb
new file mode 100644
index 000000000..35d35fd4a
--- /dev/null
+++ b/spec/lib/basic_encoding_tests.rb
@@ -0,0 +1,157 @@
+# -*- coding: utf-8 -*-
+require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
+
+# Build a string from an array of byte values, for use as test input.
+#
+# bytes - array of integers, each of which is packed as one byte.
+# claimed_encoding - name of the encoding to tag the string with on
+#     Ruby >= 1.9; defaults to 'ASCII-8BIT'.
+#
+# Returns the packed string.
+def bytes_to_binary_string( bytes, claimed_encoding = nil )
+    claimed_encoding ||= 'ASCII-8BIT'
+    bytes_string = bytes.pack('c*')
+    if RUBY_VERSION.to_f >= 1.9
+        # String has no force_encoding! method; force_encoding
+        # itself already changes the receiver in place.
+        bytes_string.force_encoding claimed_encoding
+    end
+    bytes_string
+end
+
+# Sixteen bytes that the specs below expect to be rejected as not
+# interpretable under any character encoding
+random_string = bytes_to_binary_string [ 0x0f, 0x58, 0x1c, 0x8f, 0xa4, 0xcf,
+ 0xf6, 0x8c, 0x9d, 0xa7, 0x06, 0xd9,
+ 0xf7, 0x90, 0x6c, 0x6f]
+
+# The bytes for "DASH", a space, 0x96, a space and "DASH"; the specs
+# below expect this to come out as "DASH – DASH" in UTF-8
+windows_1252_string = bytes_to_binary_string [ 0x44, 0x41, 0x53, 0x48, 0x20,
+ 0x96, 0x20, 0x44, 0x41, 0x53,
+ 0x48 ]
+
+# It's a shame this example is so long, but if we don't take enough
+# of it, it gets misinterpreted as Shift_JIS (presumably by the
+# encoding detection used in normalize_string_to_utf8)
+
+gb_18030_bytes = [ 0xb9, 0xf3, 0xb9, 0xab, 0xcb, 0xbe, 0xb8, 0xba, 0xd4, 0xf0,
+ 0xc8, 0xcb, 0x28, 0xbe, 0xad, 0xc0, 0xed, 0x2f, 0xb2, 0xc6,
+ 0xce, 0xf1, 0x29, 0xc4, 0xfa, 0xba, 0xc3, 0xa3, 0xba, 0x0d,
+ 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+ 0x20, 0x20, 0x20, 0xb1, 0xbe, 0xb9, 0xab, 0xcb, 0xbe, 0xd4,
+ 0xda, 0x31, 0x39, 0x39, 0x37, 0xc4, 0xea, 0xb3, 0xc9, 0xc1,
+ 0xa2, 0xb9, 0xfa, 0xbc, 0xd2, 0xb9, 0xa4, 0xc9, 0xcc, 0xd7,
+ 0xa2, 0xb2, 0xe1, 0x2e, 0xca, 0xb5, 0xc1, 0xa6, 0xd0, 0xdb,
+ 0xba, 0xf1, 0xa1, 0xa3, 0xd3, 0xd0, 0xb6, 0xc0, 0xc1, 0xa2,
+ 0xcb, 0xb0, 0xce, 0xf1, 0x0d, 0x0a, 0x20, 0x20, 0x20, 0x20,
+ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0xd7, 0xa8, 0xd2, 0xb5,
+ 0xc8, 0xcb, 0xd4, 0xb1, 0x3b, 0xd4, 0xda, 0xc8, 0xab, 0xb9,
+ 0xfa, 0xb8, 0xf7, 0xb3, 0xc7, 0xca, 0xd0, 0xc9, 0xe8, 0xc1,
+ 0xa2, 0xb7, 0xd6, 0xb9, 0xab, 0xcb, 0xbe, 0xa3, 0xa8, 0xd5,
+ 0xe3, 0xbd, 0xad, 0xa1, 0xa2, 0xc9, 0xcf, 0xba, 0xa3, 0xa1,
+ 0xa2, 0xb9, 0xe3, 0xd6, 0xdd, 0xa1, 0xa2, 0xbd, 0xad, 0xcb,
+ 0xd5, 0xb5, 0xc8, 0x0d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
+ 0x20, 0x20, 0x20, 0x20, 0x20, 0xb5, 0xd8, 0xb7, 0xbd, 0xa3,
+ 0xa9, 0xd2, 0xf2, 0xbd, 0xf8, 0xcf, 0xee, 0xbd, 0xcf, 0xb6,
+ 0xe0, 0xcf, 0xd6, 0xcd, 0xea, 0xb3, 0xc9, 0xb2, 0xbb, 0xc1,
+ 0xcb, 0xc3, 0xbf, 0xd4, 0xc2, 0xcf, 0xfa, 0xca, 0xdb, 0xb6,
+ 0xee, 0xb6, 0xc8, 0xa1, 0xa3, 0xc3, 0xbf, 0xd4, 0xc2, 0xd3,
+ 0xd0, 0xd2, 0xbb, 0xb2, 0xbf, 0xb7, 0xd6, 0x0d, 0x0a, 0x20,
+ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0xd4,
+ 0xf6, 0xd6, 0xb5, 0xb6, 0x90, 0xa3, 0xa8, 0x36, 0x2d, 0x37,
+ 0x25, 0xd7, 0xf3, 0xd3, 0xd2, 0x29, 0xba, 0xcd, 0xc6, 0xd5,
+ 0xc6, 0xb1, 0xa3, 0xa8, 0x30, 0x2e, 0x35, 0x25, 0x2d, 0x32,
+ 0x25, 0x20, 0xd7, 0xf3, 0xd3, 0xd2, 0xa3, 0xa9, 0xd3, 0xc5,
+ 0xbb, 0xdd, 0xb4, 0xfa, 0xbf, 0xaa, 0xbb, 0xf2, 0xba, 0xcf,
+ 0xd7, 0xf7, 0xa3, 0xac, 0x0d, 0x0a, 0x20, 0x20, 0x20, 0x20,
+ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0xb5, 0xe3, 0xca, 0xfd,
+ 0xbd, 0xcf, 0xb5, 0xcd, 0xa1, 0xa3, 0xb4, 0xfa, 0xc0, 0xed,
+ 0xb7, 0xb6, 0xce, 0xa7, 0xc8, 0xe7, 0xcf, 0xc2, 0xa3, 0xba,
+ 0x0d, 0x0a ]
+
+# The GB 18030 sample as a string with no encoding claimed for it
+gb_18030_spam_string = bytes_to_binary_string gb_18030_bytes
+
+# Specs for normalize_string_to_utf8, which is expected either to
+# return UTF-8 text or to raise EncodingNormalizationError.
+describe "normalize_string_to_utf8" do
+
+    describe "when passed uniterpretable character data" do
+        it "should reject it as invalid" do
+            # First with no encoding suggested at all...
+            expect do
+                normalize_string_to_utf8(random_string)
+            end.to raise_error(EncodingNormalizationError)
+
+            # ... and then with the data claimed to be UTF-8
+            expect do
+                normalize_string_to_utf8(random_string, 'UTF-8')
+            end.to raise_error(EncodingNormalizationError)
+        end
+    end
+
+    describe "when passed unlabelled Windows 1252 data" do
+        it "should correctly convert it to UTF-8" do
+            utf8_text = normalize_string_to_utf8(windows_1252_string)
+            utf8_text.should == "DASH – DASH"
+        end
+    end
+
+    describe "when passed GB 18030 data" do
+        it "should correctly convert it to UTF-8 if unlabelled" do
+            utf8_text = normalize_string_to_utf8(gb_18030_spam_string)
+            utf8_text.should start_with("贵公司负责人")
+        end
+    end
+
+end
+
+# Specs for convert_string_to_utf8_or_binary, which is expected to
+# return UTF-8 text where the input can be interpreted, and the
+# original bytes tagged as binary where it can't.
+#
+# String#encoding returns an Encoding object, which is never equal to
+# a String, so the encoding checks below compare the encoding's name.
+describe "convert_string_to_utf8_or_binary" do
+
+    describe "when passed uniterpretable character data" do
+
+        it "should return it as a binary string" do
+
+            converted = convert_string_to_utf8_or_binary random_string
+            converted.should == random_string
+
+            if RUBY_VERSION.to_f >= 1.9
+                converted.encoding.name.should == 'ASCII-8BIT'
+            end
+
+            converted = convert_string_to_utf8_or_binary random_string,'UTF-8'
+            converted.should == random_string
+
+            if RUBY_VERSION.to_f >= 1.9
+                converted.encoding.name.should == 'ASCII-8BIT'
+            end
+
+        end
+    end
+
+    describe "when passed unlabelled Windows 1252 data" do
+
+        it "should correctly convert it to UTF-8" do
+
+            converted = convert_string_to_utf8_or_binary windows_1252_string
+
+            converted.should == "DASH – DASH"
+
+            if RUBY_VERSION.to_f >= 1.9
+                converted.encoding.name.should == 'UTF-8'
+            end
+        end
+
+    end
+
+    describe "when passed GB 18030 data" do
+
+        it "should correctly convert it to UTF-8 if unlabelled" do
+
+            converted = convert_string_to_utf8_or_binary gb_18030_spam_string
+
+            converted.should start_with("贵公司负责人")
+
+            if RUBY_VERSION.to_f >= 1.9
+                converted.encoding.name.should == 'UTF-8'
+            end
+        end
+
+    end
+
+end