aboutsummaryrefslogtreecommitdiffstats
path: root/app/models
diff options
context:
space:
mode:
Diffstat (limited to 'app/models')
-rw-r--r--app/models/incoming_message.rb365
-rw-r--r--app/models/info_request_event.rb1
2 files changed, 48 insertions, 318 deletions
diff --git a/app/models/incoming_message.rb b/app/models/incoming_message.rb
index a4519a17d..20989d641 100644
--- a/app/models/incoming_message.rb
+++ b/app/models/incoming_message.rb
@@ -44,275 +44,6 @@ module TMail
end
end
-# This is the type which is used to send data about attachments to the view
-class FOIAttachment
- attr_accessor :body
- attr_accessor :content_type
- attr_accessor :filename
- attr_accessor :url_part_number
- attr_accessor :within_rfc822_subject # we use the subject as the filename for email attachments
-
- # List of DSN codes taken from RFC 3463
- # http://tools.ietf.org/html/rfc3463
- DsnToMessage = {
- 'X.1.0' => 'Other address status',
- 'X.1.1' => 'Bad destination mailbox address',
- 'X.1.2' => 'Bad destination system address',
- 'X.1.3' => 'Bad destination mailbox address syntax',
- 'X.1.4' => 'Destination mailbox address ambiguous',
- 'X.1.5' => 'Destination mailbox address valid',
- 'X.1.6' => 'Mailbox has moved',
- 'X.1.7' => 'Bad sender\'s mailbox address syntax',
- 'X.1.8' => 'Bad sender\'s system address',
- 'X.2.0' => 'Other or undefined mailbox status',
- 'X.2.1' => 'Mailbox disabled, not accepting messages',
- 'X.2.2' => 'Mailbox full',
- 'X.2.3' => 'Message length exceeds administrative limit.',
- 'X.2.4' => 'Mailing list expansion problem',
- 'X.3.0' => 'Other or undefined mail system status',
- 'X.3.1' => 'Mail system full',
- 'X.3.2' => 'System not accepting network messages',
- 'X.3.3' => 'System not capable of selected features',
- 'X.3.4' => 'Message too big for system',
- 'X.4.0' => 'Other or undefined network or routing status',
- 'X.4.1' => 'No answer from host',
- 'X.4.2' => 'Bad connection',
- 'X.4.3' => 'Routing server failure',
- 'X.4.4' => 'Unable to route',
- 'X.4.5' => 'Network congestion',
- 'X.4.6' => 'Routing loop detected',
- 'X.4.7' => 'Delivery time expired',
- 'X.5.0' => 'Other or undefined protocol status',
- 'X.5.1' => 'Invalid command',
- 'X.5.2' => 'Syntax error',
- 'X.5.3' => 'Too many recipients',
- 'X.5.4' => 'Invalid command arguments',
- 'X.5.5' => 'Wrong protocol version',
- 'X.6.0' => 'Other or undefined media error',
- 'X.6.1' => 'Media not supported',
- 'X.6.2' => 'Conversion required and prohibited',
- 'X.6.3' => 'Conversion required but not supported',
- 'X.6.4' => 'Conversion with loss performed',
- 'X.6.5' => 'Conversion failed',
- 'X.7.0' => 'Other or undefined security status',
- 'X.7.1' => 'Delivery not authorized, message refused',
- 'X.7.2' => 'Mailing list expansion prohibited',
- 'X.7.3' => 'Security conversion required but not possible',
- 'X.7.4' => 'Security features not supported',
- 'X.7.5' => 'Cryptographic failure',
- 'X.7.6' => 'Cryptographic algorithm not supported',
- 'X.7.7' => 'Message integrity failure'
- }
-
- # Returns HTML, of extra comment to put by attachment
- def extra_note
- # For delivery status notification attachments, extract the status and
- # look up what it means in the DSN table.
- if @content_type == 'message/delivery-status'
- if !@body.match(/Status:\s+([0-9]+\.([0-9]+\.[0-9]+))\s+/)
- return ""
- end
- dsn = $1
- dsn_part = 'X.' + $2
-
- dsn_message = ""
- if DsnToMessage.include?(dsn_part)
- dsn_message = " (" + DsnToMessage[dsn_part] + ")"
- end
-
- return "<br><em>DSN: " + dsn + dsn_message + "</em>"
- end
- return ""
- end
-
- # Called by controller so old filenames still work
- def old_display_filename
- filename = self._internal_display_filename
-
- # Convert weird spaces (e.g. \n) to normal ones
- filename = filename.gsub(/\s/, " ")
- # Remove slashes, they mess with URLs
- filename = filename.gsub(/\//, "-")
-
- return filename
- end
-
- # XXX changing this will break existing URLs, so have a care - maybe
- # make another old_display_filename see above
- def display_filename
- filename = self._internal_display_filename
-
- # Sometimes filenames have e.g. %20 in - no point butchering that
- # (without unescaping it, this would remove the % and leave 20s in there)
- filename = CGI.unescape(filename)
-
- # Remove weird spaces
- filename = filename.gsub(/\s+/, " ")
- # Remove non-alphabetic characters
- filename = filename.gsub(/[^A-Za-z0-9.]/, " ")
- # Remove spaces near dots
- filename = filename.gsub(/\s*\.\s*/, ".")
- # Compress adjacent spaces down to a single one
- filename = filename.gsub(/\s+/, " ")
- filename = filename.strip
-
- return filename
- end
-
- def _internal_display_filename
- calc_ext = AlaveteliFileTypes.mimetype_to_extension(@content_type)
-
- if @filename
- # Put right extension on if missing
- if !filename.match(/\.#{calc_ext}$/) && calc_ext
- filename + "." + calc_ext
- else
- filename
- end
- else
- if !calc_ext
- calc_ext = "bin"
- end
- if @within_rfc822_subject
- @within_rfc822_subject + "." + calc_ext
- else
- "attachment." + calc_ext
- end
- end
- end
-
- # Size to show next to the download link for the attachment
- def display_size
- s = self.body.size
-
- if s > 1024 * 1024
- return sprintf("%.1f", s.to_f / 1024 / 1024) + 'M'
- else
- return (s / 1024).to_s + 'K'
- end
- end
-
- # Whether this type can be shown in the Google Docs Viewer.
- # The full list of supported types can be found at
- # https://docs.google.com/support/bin/answer.py?hl=en&answer=1189935
- def has_google_docs_viewer?
- return !! {
- "application/pdf" => true, # .pdf
- "image/tiff" => true, # .tiff
-
- "application/vnd.ms-word" => true, # .doc
- "application/vnd.openxmlformats-officedocument.wordprocessingml.document" => true, # .docx
-
- "application/vnd.ms-powerpoint" => true, # .ppt
- "application/vnd.openxmlformats-officedocument.presentationml.presentation" => true, # .pptx
-
- "application/vnd.ms-excel" => true, # .xls
- "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" => true, # .xlsx
-
- } [self.content_type]
- end
-
- # Whether this type has a "View as HTML"
- def has_body_as_html?
- return (
- !!{
- "text/plain" => true,
- "application/rtf" => true,
- }[self.content_type] or
- self.has_google_docs_viewer?
- )
- end
-
- # Name of type of attachment type - only valid for things that has_body_as_html?
- def name_of_content_type
- return {
- "text/plain" => "Text file",
- 'application/rtf' => "RTF file",
-
- 'application/pdf' => "PDF file",
- 'image/tiff' => "TIFF image",
-
- 'application/vnd.ms-word' => "Word document",
- 'application/vnd.openxmlformats-officedocument.wordprocessingml.document' => "Word document",
-
- 'application/vnd.ms-powerpoint' => "PowerPoint presentation",
- 'application/vnd.openxmlformats-officedocument.presentationml.presentation' => "PowerPoint presentation",
-
- 'application/vnd.ms-excel' => "Excel spreadsheet",
- 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet' => "Excel spreadsheet",
- }[self.content_type]
- end
-
- # For "View as HTML" of attachment
- def body_as_html(dir)
- html = nil
- wrapper_id = "wrapper"
-
- # simple cases, can never fail
- if self.content_type == 'text/plain'
- text = self.body.strip
- text = CGI.escapeHTML(text)
- text = MySociety::Format.make_clickable(text)
- html = text.gsub(/\n/, '<br>')
- return '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
- "http://www.w3.org/TR/html4/loose.dtd"><html><head><title></title></head><body>' + html + "</body></html>", wrapper_id
- end
-
- # the extractions will also produce image files, which go in the
- # current directory, so change to the directory the function caller
- # wants everything in
- Dir.chdir(dir) do
- tempfile = Tempfile.new('foiextract', '.')
- tempfile.print self.body
- tempfile.flush
-
- if self.content_type == 'application/pdf'
- IO.popen("#{`which pdftohtml`.chomp} -nodrm -zoom 1.0 -stdout -enc UTF-8 -noframes " + tempfile.path + "", "r") do |child|
- html = child.read()
- end
- elsif self.content_type == 'application/rtf'
- IO.popen("/usr/bin/unrtf --html " + tempfile.path + "", "r") do |child|
- html = child.read()
- end
- elsif self.has_google_docs_viewer?
- html = '' # force error and using Google docs viewer
- else
- raise "No HTML conversion available for type " + self.content_type
- end
-
- tempfile.close
- tempfile.delete
- end
-
- # We need to look at:
- # a) Any error code
- # b) The output size, as pdftohtml does not return an error code upon error.
- # c) For cases when there is no text in the body of the HTML, or
- # images, so nothing will be rendered. This is to detect some bug in
- # pdftohtml, which sometimes makes it return just <hr>s and no other
- # content.
- html.match(/(\<body[^>]*\>.*)/mi)
- body = $1.to_s
- body_without_tags = body.gsub(/\s+/,"").gsub(/\<[^\>]*\>/, "")
- contains_images = html.match(/<img/mi) ? true : false
- if !$?.success? || html.size == 0 || (body_without_tags.size == 0 && !contains_images)
- ret = "<html><head></head><body>";
- if self.has_google_docs_viewer?
- wrapper_id = "wrapper_google_embed"
- ret = ret + "<iframe src='http://docs.google.com/viewer?url=<attachment-url-here>&embedded=true' width='100%' height='100%' style='border: none;'></iframe>";
- else
- ret = ret + "<p>Sorry, we were unable to convert this file to HTML. Please use the download link at the top right.</p>"
- end
- ret = ret + "</body></html>"
- return ret, wrapper_id
- end
-
- return html, wrapper_id
- end
-
-end
-
-
class IncomingMessage < ActiveRecord::Base
belongs_to :info_request
validates_presence_of :info_request
@@ -380,7 +111,7 @@ class IncomingMessage < ActiveRecord::Base
if !self.mail['return-path'].nil? && self.mail['return-path'].addr == "<>"
return false
end
- if !self.mail['auto-submitted'].nil? && !self.mail['auto-submitted'].keys.empty?
+ if !self.mail['auto-submitted'].nil?
return false
end
return true
@@ -792,7 +523,7 @@ class IncomingMessage < ActiveRecord::Base
# it into conflict with ensure_parts_counted which it has to be
# called both before and after. It will fail with cases of
# attachments of attachments etc.
-
+ charset = curr_mail.charset # save this, because overwriting content_type also resets charset
# Don't allow nil content_types
if curr_mail.content_type.nil?
curr_mail.content_type = 'application/octet-stream'
@@ -822,7 +553,6 @@ class IncomingMessage < ActiveRecord::Base
curr_mail.content_type = 'application/octet-stream'
end
end
-
# If the part is an attachment of email
if curr_mail.content_type == 'message/rfc822' || curr_mail.content_type == 'application/vnd.ms-outlook' || curr_mail.content_type == 'application/ms-tnef'
ensure_parts_counted # fills in rfc822_attachment variable
@@ -832,6 +562,8 @@ class IncomingMessage < ActiveRecord::Base
curr_mail.within_rfc822_attachment = within_rfc822_attachment
leaves_found += [curr_mail]
end
+ # restore original charset
+ curr_mail.charset = charset
end
return leaves_found
end
@@ -887,64 +619,58 @@ class IncomingMessage < ActiveRecord::Base
end
# Returns body text from main text part of email, converted to UTF-8
def get_main_body_text_internal
+ parse_raw_email!
main_part = get_main_body_text_part
return _convert_part_body_to_text(main_part)
end
+
# Given a main text part, converts it to text
def _convert_part_body_to_text(part)
if part.nil?
text = "[ Email has no body, please see attachments ]"
- text_charset = "utf-8"
+ source_charset = "utf-8"
else
- text = part.body
- text_charset = part.charset
+ text = part.body # by default, TMail converts to UT8 in this call
+ source_charset = part.charset
if part.content_type == 'text/html'
# e.g. http://www.whatdotheyknow.com/request/35/response/177
- # XXX This is a bit of a hack as it is calling a convert to text routine.
- # Could instead call a sanitize HTML one.
- text = self.class._get_attachment_text_internal_one_file(part.content_type, text)
- end
- end
-
- # Charset conversion, turn everything into UTF-8
- if not text_charset.nil?
- begin
- # XXX specially convert unicode pound signs, was needed here
- # http://www.whatdotheyknow.com/request/88/response/352
- text = text.gsub("£", Iconv.conv(text_charset, 'utf-8', '£'))
- # Try proper conversion
- text = Iconv.conv('utf-8', text_charset, text)
- rescue Iconv::IllegalSequence, Iconv::InvalidEncoding
- # Clearly specified charset was nonsense
- text_charset = nil
+ # XXX This is a bit of a hack as it is calling a
+ # convert to text routine. Could instead call a
+ # sanitize HTML one.
+
+ # If the text isn't UTF8, it means TMail had a problem
+ # converting it (invalid characters, etc), and we
+ # should instead tell elinks to respect the source
+ # charset
+ use_charset = "utf-8"
+ begin
+ text = Iconv.conv('utf-8', 'utf-8', text)
+ rescue Iconv::IllegalSequence
+ use_charset = source_charset
+ end
+ text = self.class._get_attachment_text_internal_one_file(part.content_type, text, use_charset)
end
end
- if text_charset.nil?
- # No specified charset, so guess
-
- # Could use rchardet here, but it had trouble with
- # http://www.whatdotheyknow.com/request/107/response/144
- # So I gave up - most likely in UK we'll only get windows-1252 anyway.
+ # If TMail can't convert text, it just returns it, so we sanitise it.
+ begin
+ # Test if it's good UTF-8
+ text = Iconv.conv('utf-8', 'utf-8', text)
+ rescue Iconv::IllegalSequence
+ # Text looks like unlabelled nonsense,
+ # strip out anything that isn't UTF-8
begin
- # See if it is good UTF-8 anyway
- text = Iconv.conv('utf-8', 'utf-8', text)
- rescue Iconv::IllegalSequence
- begin
- # Or is it good windows-1252, most likely
- text = Iconv.conv('utf-8', 'windows-1252', text)
- rescue Iconv::IllegalSequence
- # Text looks like unlabelled nonsense, strip out anything that isn't UTF-8
- text = Iconv.conv('utf-8//IGNORE', 'utf-8', text) +
- _("\n\n[ {{site_name}} note: The above text was badly encoded, and has had strange characters removed. ]",
- :site_name => MySociety::Config.get('SITE_NAME', 'Alaveteli'))
+ text = Iconv.conv('utf-8//IGNORE', source_charset, text) +
+ _("\n\n[ {{site_name}} note: The above text was badly encoded, and has had strange characters removed. ]",
+ :site_name => MySociety::Config.get('SITE_NAME', 'Alaveteli'))
+ rescue Iconv::InvalidEncoding, Iconv::IllegalSequence
+ if source_charset != "utf-8"
+ source_charset = "utf-8"
+ retry
end
end
end
- # An assertion that we have ended up with UTF-8 XXX can remove as this should
- # always be fine if code above is
- Iconv.conv('utf-8', 'utf-8', text)
# Fix DOS style linefeeds to Unix style ones (or other later regexps won't work)
# Needed for e.g. http://www.whatdotheyknow.com/request/60/response/98
@@ -1192,7 +918,9 @@ class IncomingMessage < ActiveRecord::Base
return self.cached_attachment_text_clipped
end
- def IncomingMessage._get_attachment_text_internal_one_file(content_type, body)
+ def IncomingMessage._get_attachment_text_internal_one_file(content_type, body, charset = 'utf-8')
+ # note re. charset: TMail always tries to convert email bodies
+ # to UTF8 by default, so normally it should already be that.
text = ''
# XXX - tell all these command line tools to return utf-8
if content_type == 'text/plain'
@@ -1214,9 +942,10 @@ class IncomingMessage < ActiveRecord::Base
# catdoc on RTF prodcues less comments and extra bumf than --text option to unrtf
AlaveteliExternalCommand.run(`which catdoc`.chomp, tempfile.path, :append_to => text)
elsif content_type == 'text/html'
- # lynx wordwraps links in its output, which then don't get formatted properly
- # by Alaveteli. We use elinks instead, which doesn't do that.
- AlaveteliExternalCommand.run(`which elinks`.chomp, "-eval", "'set document.codepage.assume = \"utf-8\"'", "-dump-charset", "utf-8", "-force-html", "-dump",
+ # lynx wordwraps links in its output, which then don't
+ # get formatted properly by Alaveteli. We use elinks
+ # instead, which doesn't do that.
+ AlaveteliExternalCommand.run(`which elinks`.chomp, "-eval", "'set document.codepage.assume = \"#{charset}\"'", "-eval", "'set document.codepage.force_assumed = 1'", "-dump-charset", "utf-8", "-force-html", "-dump",
tempfile.path, :append_to => text)
elsif content_type == 'application/vnd.ms-excel'
# Bit crazy using /usr/bin/strings - but xls2csv, xlhtml and
@@ -1283,7 +1012,7 @@ class IncomingMessage < ActiveRecord::Base
text = ''
attachments = self.get_attachments_for_display
for attachment in attachments
- text += IncomingMessage._get_attachment_text_internal_one_file(attachment.content_type, attachment.body)
+ text += IncomingMessage._get_attachment_text_internal_one_file(attachment.content_type, attachment.body, attachment.charset)
end
# Remove any bad characters
text = Iconv.conv('utf-8//IGNORE', 'utf-8', text)
@@ -1376,7 +1105,7 @@ class IncomingMessage < ActiveRecord::Base
if !self.mail['return-path'].nil? && self.mail['return-path'].addr == "<>"
return false
end
- if !self.mail['auto-submitted'].nil? && !self.mail['auto-submitted'].keys.empty?
+ if !self.mail['auto-submitted'].nil?
return false
end
return true
diff --git a/app/models/info_request_event.rb b/app/models/info_request_event.rb
index 4ea89bf81..3514702da 100644
--- a/app/models/info_request_event.rb
+++ b/app/models/info_request_event.rb
@@ -147,6 +147,7 @@ class InfoRequestEvent < ActiveRecord::Base
return event.calculated_state
end
end
+ return
end
def waiting_classification