aboutsummaryrefslogtreecommitdiffstats
path: root/app/models
diff options
context:
space:
mode:
Diffstat (limited to 'app/models')
-rw-r--r--app/models/incoming_message.rb91
1 files changed, 44 insertions, 47 deletions
diff --git a/app/models/incoming_message.rb b/app/models/incoming_message.rb
index 14cccbe12..474315717 100644
--- a/app/models/incoming_message.rb
+++ b/app/models/incoming_message.rb
@@ -523,7 +523,7 @@ class IncomingMessage < ActiveRecord::Base
# it into conflict with ensure_parts_counted which it has to be
# called both before and after. It will fail with cases of
# attachments of attachments etc.
-
+ charset = curr_mail.charset # save this, because overwriting content_type also resets charset
# Don't allow nil content_types
if curr_mail.content_type.nil?
curr_mail.content_type = 'application/octet-stream'
@@ -553,7 +553,6 @@ class IncomingMessage < ActiveRecord::Base
curr_mail.content_type = 'application/octet-stream'
end
end
-
# If the part is an attachment of email
if curr_mail.content_type == 'message/rfc822' || curr_mail.content_type == 'application/vnd.ms-outlook' || curr_mail.content_type == 'application/ms-tnef'
ensure_parts_counted # fills in rfc822_attachment variable
@@ -563,6 +562,8 @@ class IncomingMessage < ActiveRecord::Base
curr_mail.within_rfc822_attachment = within_rfc822_attachment
leaves_found += [curr_mail]
end
+ # restore original charset
+ curr_mail.charset = charset
end
return leaves_found
end
@@ -621,61 +622,54 @@ class IncomingMessage < ActiveRecord::Base
main_part = get_main_body_text_part
return _convert_part_body_to_text(main_part)
end
+
# Given a main text part, converts it to text
def _convert_part_body_to_text(part)
if part.nil?
text = "[ Email has no body, please see attachments ]"
- text_charset = "utf-8"
+ source_charset = "utf-8"
else
- text = part.body
- text_charset = part.charset
+ text = part.body # by default, TMail converts to UT8 in this call
+ source_charset = part.charset
if part.content_type == 'text/html'
# e.g. http://www.whatdotheyknow.com/request/35/response/177
- # XXX This is a bit of a hack as it is calling a convert to text routine.
- # Could instead call a sanitize HTML one.
- text = self.class._get_attachment_text_internal_one_file(part.content_type, text)
- end
- end
-
- # Charset conversion, turn everything into UTF-8
- if not text_charset.nil?
- begin
- # XXX specially convert unicode pound signs, was needed here
- # http://www.whatdotheyknow.com/request/88/response/352
- text = text.gsub("£", Iconv.conv(text_charset, 'utf-8', '£'))
- # Try proper conversion
- text = Iconv.conv('utf-8', text_charset, text)
- rescue Iconv::IllegalSequence, Iconv::InvalidEncoding
- # Clearly specified charset was nonsense
- text_charset = nil
+ # XXX This is a bit of a hack as it is calling a
+ # convert to text routine. Could instead call a
+ # sanitize HTML one.
+
+ # If the text isn't UTF8, it means TMail had a problem
+ # converting it (invalid characters, etc), and we
+ # should instead tell elinks to respect the source
+ # charset
+ use_charset = "utf-8"
+ begin
+ text = Iconv.conv('utf-8', 'utf-8', text)
+ rescue Iconv::IllegalSequence
+ use_charset = source_charset
+ end
+ text = self.class._get_attachment_text_internal_one_file(part.content_type, text, use_charset)
end
end
- if text_charset.nil?
- # No specified charset, so guess
-
- # Could use rchardet here, but it had trouble with
- # http://www.whatdotheyknow.com/request/107/response/144
- # So I gave up - most likely in UK we'll only get windows-1252 anyway.
+ # If TMail can't convert text, it just returns it, so we sanitise it.
+ begin
+ # Test if it's good UTF-8
+ text = Iconv.conv('utf-8', 'utf-8', text)
+ rescue Iconv::IllegalSequence
+ # Text looks like unlabelled nonsense,
+ # strip out anything that isn't UTF-8
begin
- # See if it is good UTF-8 anyway
- text = Iconv.conv('utf-8', 'utf-8', text)
- rescue Iconv::IllegalSequence
- begin
- # Or is it good windows-1252, most likely
- text = Iconv.conv('utf-8', 'windows-1252', text)
- rescue Iconv::IllegalSequence
- # Text looks like unlabelled nonsense, strip out anything that isn't UTF-8
- text = Iconv.conv('utf-8//IGNORE', 'utf-8', text) +
- _("\n\n[ {{site_name}} note: The above text was badly encoded, and has had strange characters removed. ]",
- :site_name => MySociety::Config.get('SITE_NAME', 'Alaveteli'))
+ text = Iconv.conv('utf-8//IGNORE', source_charset, text) +
+ _("\n\n[ {{site_name}} note: The above text was badly encoded, and has had strange characters removed. ]",
+ :site_name => MySociety::Config.get('SITE_NAME', 'Alaveteli'))
+ rescue Iconv::InvalidEncoding, Iconv::IllegalSequence
+ if source_charset != "utf-8"
+ source_charset = "utf-8"
+ retry
end
end
end
- # An assertion that we have ended up with UTF-8 XXX can remove as this should
- # always be fine if code above is
- Iconv.conv('utf-8', 'utf-8', text)
# Fix DOS style linefeeds to Unix style ones (or other later regexps won't work)
# Needed for e.g. http://www.whatdotheyknow.com/request/60/response/98
@@ -923,7 +917,9 @@ class IncomingMessage < ActiveRecord::Base
return self.cached_attachment_text_clipped
end
- def IncomingMessage._get_attachment_text_internal_one_file(content_type, body)
+ def IncomingMessage._get_attachment_text_internal_one_file(content_type, body, charset = 'utf-8')
+ # note re. charset: TMail always tries to convert email bodies
+ # to UTF8 by default, so normally it should already be that.
text = ''
# XXX - tell all these command line tools to return utf-8
if content_type == 'text/plain'
@@ -945,9 +941,10 @@ class IncomingMessage < ActiveRecord::Base
# catdoc on RTF prodcues less comments and extra bumf than --text option to unrtf
AlaveteliExternalCommand.run(`which catdoc`.chomp, tempfile.path, :append_to => text)
elsif content_type == 'text/html'
- # lynx wordwraps links in its output, which then don't get formatted properly
- # by Alaveteli. We use elinks instead, which doesn't do that.
- AlaveteliExternalCommand.run(`which elinks`.chomp, "-eval", "'set document.codepage.assume = \"utf-8\"'", "-dump-charset", "utf-8", "-force-html", "-dump",
+ # lynx wordwraps links in its output, which then don't
+ # get formatted properly by Alaveteli. We use elinks
+ # instead, which doesn't do that.
+ AlaveteliExternalCommand.run(`which elinks`.chomp, "-eval", "'set document.codepage.assume = \"#{charset}\"'", "-eval", "'set document.codepage.force_assumed = 1'", "-dump-charset", "utf-8", "-force-html", "-dump",
tempfile.path, :append_to => text)
elsif content_type == 'application/vnd.ms-excel'
# Bit crazy using /usr/bin/strings - but xls2csv, xlhtml and
@@ -1014,7 +1011,7 @@ class IncomingMessage < ActiveRecord::Base
text = ''
attachments = self.get_attachments_for_display
for attachment in attachments
- text += IncomingMessage._get_attachment_text_internal_one_file(attachment.content_type, attachment.body)
+ text += IncomingMessage._get_attachment_text_internal_one_file(attachment.content_type, attachment.body, attachment.charset)
end
# Remove any bad characters
text = Iconv.conv('utf-8//IGNORE', 'utf-8', text)