From 95cf55aad1f0985d28c28beb61e122dc7465b039 Mon Sep 17 00:00:00 2001
From: Mark Longair <mhl@pobox.com>
Date: Wed, 15 May 2013 14:52:07 +0100
Subject: Add functions for converting from arbitrary text data to UTF-8

Throughout the codebase it would be simplest and most consistent
if we could assume that all text/* attachments are represented
by UTF-8 strings, and this was largely true with the TMail
backend which ensured that all returned text parts were in
UTF-8.  We have to change the replacement Mail backend to
similarly attempt to convert text parts to UTF-8.  This commit
introduces two functions which are useful for this.

The normalize_string_to_utf8 function will try various
encodings, either suggested or guessed (with charlock_holmes)
to convert the passed string to UTF-8, and if it can't find a
suitable encoding will throw an exception.

Unfortunately, the current behaviour of the site is that
uninterpretable text/* attachments are still passed around and
mangled to UTF-8 just before display.  To mimic this it's also
useful to have the convert_string_to_utf8_or_binary function,
which tries to convert the string to UTF-8 with
normalize_string_to_utf8, but if that's not possible just
returns the original string.  (In Ruby 1.9, encoding will be
set to UTF-8 or ASCII-8BIT appropriately.)
---
 lib/normalize_string.rb | 75 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 75 insertions(+)
 create mode 100644 lib/normalize_string.rb

(limited to 'lib/normalize_string.rb')

diff --git a/lib/normalize_string.rb b/lib/normalize_string.rb
new file mode 100644
index 000000000..e708a8d96
--- /dev/null
+++ b/lib/normalize_string.rb
@@ -0,0 +1,75 @@
+require 'iconv' unless RUBY_VERSION.to_f >= 1.9
+require 'charlock_holmes'
+
+class EncodingNormalizationError < StandardError # raised by normalize_string_to_utf8 when no encoding fits
+end
+
+def normalize_string_to_utf8(s, suggested_character_encoding=nil)
+
+    # Make a list of encodings to try:
+    to_try = []
+
+    guessed_encoding = CharlockHolmes::EncodingDetector.detect(s)[:encoding]
+    guessed_encoding ||= ''
+
+    # It's reasonably common for windows-1252 text to be mislabelled
+    # as ISO-8859-1, so try that first if charlock_holmes guessed
+    # that.  However, it can also easily misidentify UTF-8 strings as
+    # ISO-8859-1 so we don't want to go with the guess by default...
+    to_try.push guessed_encoding if guessed_encoding.downcase == 'windows-1252'
+
+    to_try.push suggested_character_encoding if suggested_character_encoding
+    to_try.push 'UTF-8'
+    to_try.push guessed_encoding
+
+    to_try.each do |from_encoding|
+        if RUBY_VERSION.to_f >= 1.9
+            begin
+                s.force_encoding from_encoding
+                return s.encode('UTF-8') if s.valid_encoding?
+            rescue ArgumentError
+                # We get this is there are invalid bytes when
+                # interpreted as from_encoding at the point of
+                # the encode('UTF-8'); move onto the next one...
+            end
+        else
+            to_encoding = 'UTF-8'
+            begin
+                converted = Iconv.conv 'UTF-8', from_encoding, s
+                return converted
+            rescue Iconv::Failure
+                # We get this is there are invalid bytes when
+                # interpreted as from_encoding at the point of
+                # the Iconv.iconv; move onto the next one...
+            end
+        end
+    end
+    raise EncodingNormalizationError, "Couldn't find a valid character encoding for the string"
+
+end
+
+def convert_string_to_utf8_or_binary(s, suggested_character_encoding=nil)
+    # Returns s converted to UTF-8 by normalize_string_to_utf8, or s
+    # itself if that raises EncodingNormalizationError (on Ruby 1.9+
+    # s is then tagged as ASCII-8BIT).  This keeps us consistent with
+    # earlier versions of Alaveteli: code generally expects a UTF-8
+    # encoded string, but if the source data was uninterpretable
+    # under any character encoding, the string may be binary data
+    # (i.e. invalid UTF-8).  Such a string would then be mangled into
+    # valid UTF-8 by _sanitize_text for the purposes of display.
+
+    # This seems unsatisfactory to me - two better alternatives would
+    # be either: (a) to mangle the data into valid UTF-8 in this
+    # method or (b) to treat the 'text/*' attachment as
+    # 'application/octet-stream' instead.  However, for the purposes
+    # of the transition to Ruby 1.9 and/or Rails 3 we just want the
+    # behaviour to be as similar as possible.
+
+    begin
+        result = normalize_string_to_utf8 s, suggested_character_encoding
+    rescue EncodingNormalizationError
+        result = s
+        s.force_encoding 'ASCII-8BIT' if RUBY_VERSION.to_f >= 1.9
+    end
+    result
+end
-- 
cgit v1.2.3


From 6b973b1d59b5c384a4ca5a5c0c2c53ad43159ad0 Mon Sep 17 00:00:00 2001
From: Mark Longair <mhl@pobox.com>
Date: Mon, 13 May 2013 17:50:25 +0100
Subject: Add a helper function for dumping text to disk

This function is useful for investigating problems with
handling of emails, attachments and the related character
encoding issues.  It can safely be removed later, but is
currently useful to have for debugging purposes.
---
 lib/normalize_string.rb | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'lib/normalize_string.rb')

diff --git a/lib/normalize_string.rb b/lib/normalize_string.rb
index e708a8d96..f02b18ee0 100644
--- a/lib/normalize_string.rb
+++ b/lib/normalize_string.rb
@@ -73,3 +73,14 @@ def convert_string_to_utf8_or_binary(s, suggested_character_encoding=nil)
     end
     result
 end
+
+def log_text_details(message, text)
+    if RUBY_VERSION.to_f >= 1.9
+        STDERR.puts "#{message}, we have text: #{text}, of class #{text.class} and encoding #{text.encoding}"
+    else
+        STDERR.puts "#{message}, we have text: #{text}, of class #{text.class}"
+    end
+    filename = "/var/tmp/#{Digest::MD5.hexdigest(text)}.txt"
+    File.open(filename, "wb") { |f| f.write text }
+    STDERR.puts "#{message}, the filename is: #{filename}"
+end
-- 
cgit v1.2.3