aboutsummaryrefslogtreecommitdiffstats
path: root/lib/normalize_string.rb
diff options
context:
space:
mode:
authorMark Longair <mhl@pobox.com>2013-05-28 14:39:09 +0100
committerMark Longair <mhl@pobox.com>2013-05-28 14:39:09 +0100
commitc248356a8e8a13513827381977b24f7406f96a8c (patch)
treea0b2210c5956d0da8ea534fe8b772cd776087460 /lib/normalize_string.rb
parented03c3ef55fd61b9be3578ee0c93767d2c218b53 (diff)
parent011e55bd4acf4f3c9de91c5ed4c646e855f19c24 (diff)
Merge branch 'tmail-to-mail-tests' into rails-3-develop
This merge brings in a number of tests and fixes for the handling of mail under Mail / Rails 3 instead of TMail / Rails 2. Conflicts: config/initializers/alaveteli.rb
Diffstat (limited to 'lib/normalize_string.rb')
-rw-r--r--lib/normalize_string.rb86
1 files changed, 86 insertions, 0 deletions
diff --git a/lib/normalize_string.rb b/lib/normalize_string.rb
new file mode 100644
index 000000000..f02b18ee0
--- /dev/null
+++ b/lib/normalize_string.rb
@@ -0,0 +1,86 @@
+require 'iconv' unless RUBY_VERSION.to_f >= 1.9
+require 'charlock_holmes'
+
+# Raised by normalize_string_to_utf8 when none of the character
+# encodings it tries gives a valid interpretation of the string.
+class EncodingNormalizationError < StandardError
+end
+
+def normalize_string_to_utf8(s, suggested_character_encoding=nil)
+
+ # Make a list of encodings to try:
+ to_try = []
+
+ guessed_encoding = CharlockHolmes::EncodingDetector.detect(s)[:encoding]
+ guessed_encoding ||= ''
+
+ # It's reasonably common for windows-1252 text to be mislabelled
+ # as ISO-8859-1, so try that first if charlock_holmes guessed
+ # that. However, it can also easily misidentify UTF-8 strings as
+ # ISO-8859-1 so we don't want to go with the guess by default...
+ to_try.push guessed_encoding if guessed_encoding.downcase == 'windows-1252'
+
+ to_try.push suggested_character_encoding if suggested_character_encoding
+ to_try.push 'UTF-8'
+ to_try.push guessed_encoding
+
+ to_try.each do |from_encoding|
+ if RUBY_VERSION.to_f >= 1.9
+ begin
+ s.force_encoding from_encoding
+ return s.encode('UTF-8') if s.valid_encoding?
+ rescue ArgumentError
+ # We get this is there are invalid bytes when
+ # interpreted as from_encoding at the point of
+ # the encode('UTF-8'); move onto the next one...
+ end
+ else
+ to_encoding = 'UTF-8'
+ begin
+ converted = Iconv.conv 'UTF-8', from_encoding, s
+ return converted
+ rescue Iconv::Failure
+ # We get this is there are invalid bytes when
+ # interpreted as from_encoding at the point of
+ # the Iconv.iconv; move onto the next one...
+ end
+ end
+ end
+ raise EncodingNormalizationError, "Couldn't find a valid character encoding for the string"
+
+end
+
+def convert_string_to_utf8_or_binary(s, suggested_character_encoding=nil)
+ # This function exists to help to keep consistent with the
+ # behaviour of earlier versions of Alaveteli: in the code as it
+ # is, there are situations where it's expected that we generally
+ # have a UTF-8 encoded string, but if the source data was
+ # unintepretable under any character encoding, the string may be
+ # binary data (i.e. invalid UTF-8). Such a string would then be
+ # mangled into valid UTF-8 by _sanitize_text for the purposes of
+ # display.
+
+ # This seems unsatisfactory to me - two better alternatives would
+ # be either: (a) to mangle the data into valid UTF-8 in this
+ # method or (b) to treat the 'text/*' attachment as
+ # 'application/octet-stream' instead. However, for the purposes
+ # of the transition to Ruby 1.9 and/or Rails 3 we just want the
+ # behaviour to be as similar as possible.
+
+ begin
+ result = normalize_string_to_utf8 s, suggested_character_encoding
+ rescue EncodingNormalizationError
+ result = s
+ s.force_encoding 'ASCII-8BIT' if RUBY_VERSION.to_f >= 1.9
+ end
+ result
+end
+
+def log_text_details(message, text)
+ if RUBY_VERSION.to_f >= 1.9
+ STDERR.puts "#{message}, we have text: #{text}, of class #{text.class} and encoding #{text.encoding}"
+ else
+ STDERR.puts "#{message}, we have text: #{text}, of class #{text.class}"
+ end
+ filename = "/var/tmp/#{Digest::MD5.hexdigest(text)}.txt"
+ File.open(filename, "wb") { |f| f.write text }
+ STDERR.puts "#{message}, the filename is: #{filename}"
+end