1 files changed, 44 insertions, 47 deletions
diff --git a/app/models/incoming_message.rb b/app/models/incoming_message.rb
index 14cccbe12..474315717 100644
--- a/app/models/incoming_message.rb
+++ b/app/models/incoming_message.rb
@@ -523,7 +523,7 @@ class IncomingMessage < ActiveRecord::Base
             # it into conflict with ensure_parts_counted which it has to be
             # called both before and after.  It will fail with cases of
             # attachments of attachments etc.
-
+            charset = curr_mail.charset # save this, because overwriting content_type also resets charset
             # Don't allow nil content_types
             if curr_mail.content_type.nil?
                 curr_mail.content_type = 'application/octet-stream'
@@ -553,7 +553,6 @@ class IncomingMessage < ActiveRecord::Base
                     curr_mail.content_type = 'application/octet-stream'
                 end
             end
-
             # If the part is an attachment of email
             if curr_mail.content_type == 'message/rfc822' || curr_mail.content_type == 'application/vnd.ms-outlook' || curr_mail.content_type == 'application/ms-tnef'
                 ensure_parts_counted # fills in rfc822_attachment variable
@@ -563,6 +562,8 @@ class IncomingMessage < ActiveRecord::Base
                 curr_mail.within_rfc822_attachment = within_rfc822_attachment
                 leaves_found += [curr_mail]
             end
+            # restore original charset
+            curr_mail.charset = charset
         end
         return leaves_found
     end
@@ -621,61 +622,54 @@ class IncomingMessage < ActiveRecord::Base
         main_part = get_main_body_text_part
         return _convert_part_body_to_text(main_part)
     end
+
     # Given a main text part, converts it to text
     def _convert_part_body_to_text(part)
         if part.nil?
             text = "[ Email has no body, please see attachments ]"
-            text_charset = "utf-8"
+            source_charset = "utf-8"
         else
-            text = part.body
-            text_charset = part.charset
+            text = part.body # by default, TMail converts to UTF-8 in this call
+            source_charset = part.charset
             if part.content_type == 'text/html'
                 # e.g. http://www.whatdotheyknow.com/request/35/response/177
-                # XXX This is a bit of a hack as it is calling a convert to text routine.
-                # Could instead call a sanitize HTML one.
-                text = self.class._get_attachment_text_internal_one_file(part.content_type, text)
-            end
-        end
-
-        # Charset conversion, turn everything into UTF-8
-        if not text_charset.nil?
-            begin
-                # XXX specially convert unicode pound signs, was needed here
-                # http://www.whatdotheyknow.com/request/88/response/352
-                text = text.gsub("£", Iconv.conv(text_charset, 'utf-8', '£')) 
-                # Try proper conversion
-                text = Iconv.conv('utf-8', text_charset, text)
-            rescue Iconv::IllegalSequence, Iconv::InvalidEncoding
-                # Clearly specified charset was nonsense
-                text_charset = nil
+                # XXX This is a bit of a hack as it is calling a
+                # convert to text routine.  Could instead call a
+                # sanitize HTML one.
+
+                # If the text isn't UTF8, it means TMail had a problem
+                # converting it (invalid characters, etc), and we
+                # should instead tell elinks to respect the source
+                # charset
+                use_charset = "utf-8"
+                begin
+                    text = Iconv.conv('utf-8', 'utf-8', text)
+                rescue Iconv::IllegalSequence
+                    use_charset = source_charset
+                end
+                text = self.class._get_attachment_text_internal_one_file(part.content_type, text, use_charset)
             end
         end
-        if text_charset.nil?
-            # No specified charset, so guess
-            
-            # Could use rchardet here, but it had trouble with 
-            #   http://www.whatdotheyknow.com/request/107/response/144
-            # So I gave up - most likely in UK we'll only get windows-1252 anyway.
 
+        # If TMail can't convert text, it just returns it, so we sanitise it.
+        begin
+            # Test if it's good UTF-8
+            text = Iconv.conv('utf-8', 'utf-8', text)
+        rescue Iconv::IllegalSequence
+            # Text is not valid UTF-8: drop whatever cannot be converted
+            # from the source charset (retrying as UTF-8 if that fails)
             begin
-                # See if it is good UTF-8 anyway
-                text = Iconv.conv('utf-8', 'utf-8', text)
-            rescue Iconv::IllegalSequence
-                begin
-                    # Or is it good windows-1252, most likely
-                    text = Iconv.conv('utf-8', 'windows-1252', text)
-                rescue Iconv::IllegalSequence
-                    # Text looks like unlabelled nonsense, strip out anything that isn't UTF-8
-                    text = Iconv.conv('utf-8//IGNORE', 'utf-8', text) + 
-                        _("\n\n[ {{site_name}} note: The above text was badly encoded, and has had strange characters removed. ]", 
-                        :site_name => MySociety::Config.get('SITE_NAME', 'Alaveteli'))
+                text = Iconv.conv('utf-8//IGNORE', source_charset, text) + 
+                    _("\n\n[ {{site_name}} note: The above text was badly encoded, and has had strange characters removed. ]", 
+                      :site_name => MySociety::Config.get('SITE_NAME', 'Alaveteli'))
+            rescue Iconv::InvalidEncoding, Iconv::IllegalSequence
+                if source_charset != "utf-8"
+                    source_charset = "utf-8"
+                    retry
                 end
             end
         end
         
-        # An assertion that we have ended up with UTF-8 XXX can remove as this should
-        # always be fine if code above is
-        Iconv.conv('utf-8', 'utf-8', text)
 
         # Fix DOS style linefeeds to Unix style ones (or other later regexps won't work)
         # Needed for e.g. http://www.whatdotheyknow.com/request/60/response/98
@@ -923,7 +917,9 @@ class IncomingMessage < ActiveRecord::Base
 
         return self.cached_attachment_text_clipped
     end
-    def IncomingMessage._get_attachment_text_internal_one_file(content_type, body)
+    def IncomingMessage._get_attachment_text_internal_one_file(content_type, body, charset = 'utf-8')
+        # Note on charset: TMail tries to convert email bodies to UTF-8 by
+        # default, so normally that is right; elinks is told to assume it.
         text = ''
         # XXX - tell all these command line tools to return utf-8
         if content_type == 'text/plain'
@@ -945,9 +941,10 @@ class IncomingMessage < ActiveRecord::Base
                 # catdoc on RTF prodcues less comments and extra bumf than --text option to unrtf
                 AlaveteliExternalCommand.run(`which catdoc`.chomp, tempfile.path, :append_to => text)
             elsif content_type == 'text/html'
-                # lynx wordwraps links in its output, which then don't get formatted properly
-                # by Alaveteli. We use elinks instead, which doesn't do that.
-                AlaveteliExternalCommand.run(`which elinks`.chomp, "-eval", "'set document.codepage.assume = \"utf-8\"'", "-dump-charset", "utf-8", "-force-html", "-dump",
+                # lynx wordwraps links in its output, which then don't
+                # get formatted properly by Alaveteli. We use elinks
+                # instead, which doesn't do that.
+                AlaveteliExternalCommand.run(`which elinks`.chomp, "-eval", "'set document.codepage.assume = \"#{charset}\"'", "-eval", "'set document.codepage.force_assumed = 1'", "-dump-charset", "utf-8", "-force-html", "-dump",
                     tempfile.path, :append_to => text)
             elsif content_type == 'application/vnd.ms-excel'
                 # Bit crazy using /usr/bin/strings - but xls2csv, xlhtml and
@@ -1014,7 +1011,7 @@ class IncomingMessage < ActiveRecord::Base
         text = ''
         attachments = self.get_attachments_for_display
         for attachment in attachments
-            text += IncomingMessage._get_attachment_text_internal_one_file(attachment.content_type, attachment.body)
+            text += IncomingMessage._get_attachment_text_internal_one_file(attachment.content_type, attachment.body, attachment.charset)
         end
         # Remove any bad characters
         text = Iconv.conv('utf-8//IGNORE', 'utf-8', text)