diff options
Diffstat (limited to 'app/models')
-rw-r--r-- | app/models/incoming_message.rb | 365 | ||||
-rw-r--r-- | app/models/info_request_event.rb | 1 |
2 files changed, 48 insertions, 318 deletions
diff --git a/app/models/incoming_message.rb b/app/models/incoming_message.rb index a4519a17d..20989d641 100644 --- a/app/models/incoming_message.rb +++ b/app/models/incoming_message.rb @@ -44,275 +44,6 @@ module TMail end end -# This is the type which is used to send data about attachments to the view -class FOIAttachment - attr_accessor :body - attr_accessor :content_type - attr_accessor :filename - attr_accessor :url_part_number - attr_accessor :within_rfc822_subject # we use the subject as the filename for email attachments - - # List of DSN codes taken from RFC 3463 - # http://tools.ietf.org/html/rfc3463 - DsnToMessage = { - 'X.1.0' => 'Other address status', - 'X.1.1' => 'Bad destination mailbox address', - 'X.1.2' => 'Bad destination system address', - 'X.1.3' => 'Bad destination mailbox address syntax', - 'X.1.4' => 'Destination mailbox address ambiguous', - 'X.1.5' => 'Destination mailbox address valid', - 'X.1.6' => 'Mailbox has moved', - 'X.1.7' => 'Bad sender\'s mailbox address syntax', - 'X.1.8' => 'Bad sender\'s system address', - 'X.2.0' => 'Other or undefined mailbox status', - 'X.2.1' => 'Mailbox disabled, not accepting messages', - 'X.2.2' => 'Mailbox full', - 'X.2.3' => 'Message length exceeds administrative limit.', - 'X.2.4' => 'Mailing list expansion problem', - 'X.3.0' => 'Other or undefined mail system status', - 'X.3.1' => 'Mail system full', - 'X.3.2' => 'System not accepting network messages', - 'X.3.3' => 'System not capable of selected features', - 'X.3.4' => 'Message too big for system', - 'X.4.0' => 'Other or undefined network or routing status', - 'X.4.1' => 'No answer from host', - 'X.4.2' => 'Bad connection', - 'X.4.3' => 'Routing server failure', - 'X.4.4' => 'Unable to route', - 'X.4.5' => 'Network congestion', - 'X.4.6' => 'Routing loop detected', - 'X.4.7' => 'Delivery time expired', - 'X.5.0' => 'Other or undefined protocol status', - 'X.5.1' => 'Invalid command', - 'X.5.2' => 'Syntax error', - 'X.5.3' => 'Too many recipients', - 'X.5.4' => 'Invalid command arguments', - 'X.5.5' => 'Wrong protocol version', - 'X.6.0' => 'Other or undefined media error', - 'X.6.1' => 'Media not supported', - 'X.6.2' => 'Conversion required and prohibited', - 'X.6.3' => 'Conversion required but not supported', - 'X.6.4' => 'Conversion with loss performed', - 'X.6.5' => 'Conversion failed', - 'X.7.0' => 'Other or undefined security status', - 'X.7.1' => 'Delivery not authorized, message refused', - 'X.7.2' => 'Mailing list expansion prohibited', - 'X.7.3' => 'Security conversion required but not possible', - 'X.7.4' => 'Security features not supported', - 'X.7.5' => 'Cryptographic failure', - 'X.7.6' => 'Cryptographic algorithm not supported', - 'X.7.7' => 'Message integrity failure' - } - - # Returns HTML, of extra comment to put by attachment - def extra_note - # For delivery status notification attachments, extract the status and - # look up what it means in the DSN table. - if @content_type == 'message/delivery-status' - if !@body.match(/Status:\s+([0-9]+\.([0-9]+\.[0-9]+))\s+/) - return "" - end - dsn = $1 - dsn_part = 'X.' + $2 - - dsn_message = "" - if DsnToMessage.include?(dsn_part) - dsn_message = " (" + DsnToMessage[dsn_part] + ")" - end - - return "<br><em>DSN: " + dsn + dsn_message + "</em>" - end - return "" - end - - # Called by controller so old filenames still work - def old_display_filename - filename = self._internal_display_filename - - # Convert weird spaces (e.g. \n) to normal ones - filename = filename.gsub(/\s/, " ") - # Remove slashes, they mess with URLs - filename = filename.gsub(/\//, "-") - - return filename - end - - # XXX changing this will break existing URLs, so have a care - maybe - # make another old_display_filename see above - def display_filename - filename = self._internal_display_filename - - # Sometimes filenames have e.g. %20 in - no point butchering that - # (without unescaping it, this would remove the % and leave 20s in there) - filename = CGI.unescape(filename) - - # Remove weird spaces - filename = filename.gsub(/\s+/, " ") - # Remove non-alphabetic characters - filename = filename.gsub(/[^A-Za-z0-9.]/, " ") - # Remove spaces near dots - filename = filename.gsub(/\s*\.\s*/, ".") - # Compress adjacent spaces down to a single one - filename = filename.gsub(/\s+/, " ") - filename = filename.strip - - return filename - end - - def _internal_display_filename - calc_ext = AlaveteliFileTypes.mimetype_to_extension(@content_type) - - if @filename - # Put right extension on if missing - if !filename.match(/\.#{calc_ext}$/) && calc_ext - filename + "." + calc_ext - else - filename - end - else - if !calc_ext - calc_ext = "bin" - end - if @within_rfc822_subject - @within_rfc822_subject + "." + calc_ext - else - "attachment." + calc_ext - end - end - end - - # Size to show next to the download link for the attachment - def display_size - s = self.body.size - - if s > 1024 * 1024 - return sprintf("%.1f", s.to_f / 1024 / 1024) + 'M' - else - return (s / 1024).to_s + 'K' - end - end - - # Whether this type can be shown in the Google Docs Viewer. - # The full list of supported types can be found at - # https://docs.google.com/support/bin/answer.py?hl=en&answer=1189935 - def has_google_docs_viewer? - return !! { - "application/pdf" => true, # .pdf - "image/tiff" => true, # .tiff - - "application/vnd.ms-word" => true, # .doc - "application/vnd.openxmlformats-officedocument.wordprocessingml.document" => true, # .docx - - "application/vnd.ms-powerpoint" => true, # .ppt - "application/vnd.openxmlformats-officedocument.presentationml.presentation" => true, # .pptx - - "application/vnd.ms-excel" => true, # .xls - "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" => true, # .xlsx - - } [self.content_type] - end - - # Whether this type has a "View as HTML" - def has_body_as_html? - return ( - !!{ - "text/plain" => true, - "application/rtf" => true, - }[self.content_type] or - self.has_google_docs_viewer? - ) - end - - # Name of type of attachment type - only valid for things that has_body_as_html? - def name_of_content_type - return { - "text/plain" => "Text file", - 'application/rtf' => "RTF file", - - 'application/pdf' => "PDF file", - 'image/tiff' => "TIFF image", - - 'application/vnd.ms-word' => "Word document", - 'application/vnd.openxmlformats-officedocument.wordprocessingml.document' => "Word document", - - 'application/vnd.ms-powerpoint' => "PowerPoint presentation", - 'application/vnd.openxmlformats-officedocument.presentationml.presentation' => "PowerPoint presentation", - - 'application/vnd.ms-excel' => "Excel spreadsheet", - 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet' => "Excel spreadsheet", - }[self.content_type] - end - - # For "View as HTML" of attachment - def body_as_html(dir) - html = nil - wrapper_id = "wrapper" - - # simple cases, can never fail - if self.content_type == 'text/plain' - text = self.body.strip - text = CGI.escapeHTML(text) - text = MySociety::Format.make_clickable(text) - html = text.gsub(/\n/, '<br>') - return '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" - "http://www.w3.org/TR/html4/loose.dtd"><html><head><title></title></head><body>' + html + "</body></html>", wrapper_id - end - - # the extractions will also produce image files, which go in the - # current directory, so change to the directory the function caller - # wants everything in - Dir.chdir(dir) do - tempfile = Tempfile.new('foiextract', '.') - tempfile.print self.body - tempfile.flush - - if self.content_type == 'application/pdf' - IO.popen("#{`which pdftohtml`.chomp} -nodrm -zoom 1.0 -stdout -enc UTF-8 -noframes " + tempfile.path + "", "r") do |child| - html = child.read() - end - elsif self.content_type == 'application/rtf' - IO.popen("/usr/bin/unrtf --html " + tempfile.path + "", "r") do |child| - html = child.read() - end - elsif self.has_google_docs_viewer? - html = '' # force error and using Google docs viewer - else - raise "No HTML conversion available for type " + self.content_type - end - - tempfile.close - tempfile.delete - end - - # We need to look at: - # a) Any error code - # b) The output size, as pdftohtml does not return an error code upon error. - # c) For cases when there is no text in the body of the HTML, or - # images, so nothing will be rendered. This is to detect some bug in - # pdftohtml, which sometimes makes it return just <hr>s and no other - # content. - html.match(/(\<body[^>]*\>.*)/mi) - body = $1.to_s - body_without_tags = body.gsub(/\s+/,"").gsub(/\<[^\>]*\>/, "") - contains_images = html.match(/<img/mi) ? true : false - if !$?.success? || html.size == 0 || (body_without_tags.size == 0 && !contains_images) - ret = "<html><head></head><body>"; - if self.has_google_docs_viewer? - wrapper_id = "wrapper_google_embed" - ret = ret + "<iframe src='http://docs.google.com/viewer?url=<attachment-url-here>&embedded=true' width='100%' height='100%' style='border: none;'></iframe>"; - else - ret = ret + "<p>Sorry, we were unable to convert this file to HTML. Please use the download link at the top right.</p>" - end - ret = ret + "</body></html>" - return ret, wrapper_id - end - - return html, wrapper_id - end - -end - - class IncomingMessage < ActiveRecord::Base belongs_to :info_request validates_presence_of :info_request @@ -380,7 +111,7 @@ class IncomingMessage < ActiveRecord::Base if !self.mail['return-path'].nil? && self.mail['return-path'].addr == "<>" return false end - if !self.mail['auto-submitted'].nil? && !self.mail['auto-submitted'].keys.empty? + if !self.mail['auto-submitted'].nil? return false end return true @@ -792,7 +523,7 @@ class IncomingMessage < ActiveRecord::Base # it into conflict with ensure_parts_counted which it has to be # called both before and after. It will fail with cases of # attachments of attachments etc. - + charset = curr_mail.charset # save this, because overwriting content_type also resets charset # Don't allow nil content_types if curr_mail.content_type.nil? curr_mail.content_type = 'application/octet-stream' @@ -822,7 +553,6 @@ class IncomingMessage < ActiveRecord::Base curr_mail.content_type = 'application/octet-stream' end end - # If the part is an attachment of email if curr_mail.content_type == 'message/rfc822' || curr_mail.content_type == 'application/vnd.ms-outlook' || curr_mail.content_type == 'application/ms-tnef' ensure_parts_counted # fills in rfc822_attachment variable @@ -832,6 +562,8 @@ class IncomingMessage < ActiveRecord::Base curr_mail.within_rfc822_attachment = within_rfc822_attachment leaves_found += [curr_mail] end + # restore original charset + curr_mail.charset = charset end return leaves_found end @@ -887,64 +619,58 @@ class IncomingMessage < ActiveRecord::Base end # Returns body text from main text part of email, converted to UTF-8 def get_main_body_text_internal + parse_raw_email! main_part = get_main_body_text_part return _convert_part_body_to_text(main_part) end + # Given a main text part, converts it to text def _convert_part_body_to_text(part) if part.nil? text = "[ Email has no body, please see attachments ]" - text_charset = "utf-8" + source_charset = "utf-8" else - text = part.body - text_charset = part.charset + text = part.body # by default, TMail converts to UT8 in this call + source_charset = part.charset if part.content_type == 'text/html' # e.g. http://www.whatdotheyknow.com/request/35/response/177 - # XXX This is a bit of a hack as it is calling a convert to text routine. - # Could instead call a sanitize HTML one. - text = self.class._get_attachment_text_internal_one_file(part.content_type, text) - end - end - - # Charset conversion, turn everything into UTF-8 - if not text_charset.nil? - begin - # XXX specially convert unicode pound signs, was needed here - # http://www.whatdotheyknow.com/request/88/response/352 - text = text.gsub("£", Iconv.conv(text_charset, 'utf-8', '£')) - # Try proper conversion - text = Iconv.conv('utf-8', text_charset, text) - rescue Iconv::IllegalSequence, Iconv::InvalidEncoding - # Clearly specified charset was nonsense - text_charset = nil + # XXX This is a bit of a hack as it is calling a + # convert to text routine. Could instead call a + # sanitize HTML one. + + # If the text isn't UTF8, it means TMail had a problem + # converting it (invalid characters, etc), and we + # should instead tell elinks to respect the source + # charset + use_charset = "utf-8" + begin + text = Iconv.conv('utf-8', 'utf-8', text) + rescue Iconv::IllegalSequence + use_charset = source_charset + end + text = self.class._get_attachment_text_internal_one_file(part.content_type, text, use_charset) end end - if text_charset.nil? - # No specified charset, so guess - - # Could use rchardet here, but it had trouble with - # http://www.whatdotheyknow.com/request/107/response/144 - # So I gave up - most likely in UK we'll only get windows-1252 anyway. + # If TMail can't convert text, it just returns it, so we sanitise it. + begin + # Test if it's good UTF-8 + text = Iconv.conv('utf-8', 'utf-8', text) + rescue Iconv::IllegalSequence + # Text looks like unlabelled nonsense, + # strip out anything that isn't UTF-8 begin - # See if it is good UTF-8 anyway - text = Iconv.conv('utf-8', 'utf-8', text) - rescue Iconv::IllegalSequence - begin - # Or is it good windows-1252, most likely - text = Iconv.conv('utf-8', 'windows-1252', text) - rescue Iconv::IllegalSequence - # Text looks like unlabelled nonsense, strip out anything that isn't UTF-8 - text = Iconv.conv('utf-8//IGNORE', 'utf-8', text) + - _("\n\n[ {{site_name}} note: The above text was badly encoded, and has had strange characters removed. ]", - :site_name => MySociety::Config.get('SITE_NAME', 'Alaveteli')) + text = Iconv.conv('utf-8//IGNORE', source_charset, text) + + _("\n\n[ {{site_name}} note: The above text was badly encoded, and has had strange characters removed. ]", + :site_name => MySociety::Config.get('SITE_NAME', 'Alaveteli')) + rescue Iconv::InvalidEncoding, Iconv::IllegalSequence + if source_charset != "utf-8" + source_charset = "utf-8" + retry end end end - # An assertion that we have ended up with UTF-8 XXX can remove as this should - # always be fine if code above is - Iconv.conv('utf-8', 'utf-8', text) # Fix DOS style linefeeds to Unix style ones (or other later regexps won't work) # Needed for e.g. http://www.whatdotheyknow.com/request/60/response/98 @@ -1192,7 +918,9 @@ class IncomingMessage < ActiveRecord::Base return self.cached_attachment_text_clipped end - def IncomingMessage._get_attachment_text_internal_one_file(content_type, body) + def IncomingMessage._get_attachment_text_internal_one_file(content_type, body, charset = 'utf-8') + # note re. charset: TMail always tries to convert email bodies + # to UTF8 by default, so normally it should already be that. text = '' # XXX - tell all these command line tools to return utf-8 if content_type == 'text/plain' @@ -1214,9 +942,10 @@ class IncomingMessage < ActiveRecord::Base # catdoc on RTF prodcues less comments and extra bumf than --text option to unrtf AlaveteliExternalCommand.run(`which catdoc`.chomp, tempfile.path, :append_to => text) elsif content_type == 'text/html' - # lynx wordwraps links in its output, which then don't get formatted properly - # by Alaveteli. We use elinks instead, which doesn't do that. - AlaveteliExternalCommand.run(`which elinks`.chomp, "-eval", "'set document.codepage.assume = \"utf-8\"'", "-dump-charset", "utf-8", "-force-html", "-dump", + # lynx wordwraps links in its output, which then don't + # get formatted properly by Alaveteli. We use elinks + # instead, which doesn't do that. + AlaveteliExternalCommand.run(`which elinks`.chomp, "-eval", "'set document.codepage.assume = \"#{charset}\"'", "-eval", "'set document.codepage.force_assumed = 1'", "-dump-charset", "utf-8", "-force-html", "-dump", tempfile.path, :append_to => text) elsif content_type == 'application/vnd.ms-excel' # Bit crazy using /usr/bin/strings - but xls2csv, xlhtml and @@ -1283,7 +1012,7 @@ class IncomingMessage < ActiveRecord::Base text = '' attachments = self.get_attachments_for_display for attachment in attachments - text += IncomingMessage._get_attachment_text_internal_one_file(attachment.content_type, attachment.body) + text += IncomingMessage._get_attachment_text_internal_one_file(attachment.content_type, attachment.body, attachment.charset) end # Remove any bad characters text = Iconv.conv('utf-8//IGNORE', 'utf-8', text) @@ -1376,7 +1105,7 @@ class IncomingMessage < ActiveRecord::Base if !self.mail['return-path'].nil? && self.mail['return-path'].addr == "<>" return false end - if !self.mail['auto-submitted'].nil? && !self.mail['auto-submitted'].keys.empty? + if !self.mail['auto-submitted'].nil? return false end return true diff --git a/app/models/info_request_event.rb b/app/models/info_request_event.rb index 4ea89bf81..3514702da 100644 --- a/app/models/info_request_event.rb +++ b/app/models/info_request_event.rb @@ -147,6 +147,7 @@ class InfoRequestEvent < ActiveRecord::Base return event.calculated_state end end + return end def waiting_classification |