diff options
22 files changed, 730 insertions, 187 deletions
diff --git a/app/controllers/request_controller.rb b/app/controllers/request_controller.rb index d91df3780..dad5e81cd 100644 --- a/app/controllers/request_controller.rb +++ b/app/controllers/request_controller.rb @@ -676,6 +676,7 @@ class RequestController < ApplicationController # Internal function def get_attachment_internal(html_conversion) @incoming_message = IncomingMessage.find(params[:incoming_message_id]) + @incoming_message.parse_raw_email! @info_request = @incoming_message.info_request if @incoming_message.info_request_id != params[:id].to_i raise sprintf("Incoming message %d does not belong to request %d", @incoming_message.info_request_id, params[:id]) @@ -690,7 +691,6 @@ class RequestController < ApplicationController # check permissions raise "internal error, pre-auth filter should have caught this" if !@info_request.user_can_view?(authenticated_user) - @attachment = IncomingMessage.get_attachment_by_url_part_number(@incoming_message.get_attachments_for_display, @part_number) raise ActiveRecord::RecordNotFound.new("attachment not found part number " + @part_number.to_s + " incoming_message " + @incoming_message.id.to_s) if @attachment.nil? @@ -713,6 +713,7 @@ class RequestController < ApplicationController :email => _("Then you can upload an FOI response. "), :email_subject => _("Confirm your account on {{site_name}}",:site_name=>site_name) } + if !authenticated?(@reason_params) return end diff --git a/app/models/foi_attachment.rb b/app/models/foi_attachment.rb new file mode 100644 index 000000000..057dcdb69 --- /dev/null +++ b/app/models/foi_attachment.rb @@ -0,0 +1,321 @@ +# encoding: UTF-8 + +# models/foi_attachment.rb: +# An attachment to an email (IncomingMessage) +# +# Copyright (c) 2007 UK Citizens Online Democracy. All rights reserved. +# Email: francis@mysociety.org; WWW: http://www.mysociety.org/ +# This is the type which is used to send data about attachments to the view + +require 'digest' + +class FoiAttachment < ActiveRecord::Base + belongs_to :incoming_message + validates_presence_of :content_type + validates_presence_of :filename + validates_presence_of :display_size + + before_validation :ensure_filename!, :only => [:filename] + before_destroy :delete_cached_file! + + def directory + base_dir = File.join("cache", "attachments_#{ENV['RAILS_ENV']}") + return File.join(base_dir, self.hexdigest[0..2]) + end + + def filepath + File.join(self.directory, self.hexdigest) + end + + def delete_cached_file! + begin + File.delete(self.filepath) + rescue + end + end + + def body=(d) + self.hexdigest = Digest::MD5.hexdigest(d) + if !File.exists?(self.directory) + FileUtils.mkdir_p self.directory + end + File.open(self.filepath, "wb") { |file| + file.write d + } + update_display_size! + end + + def body + if @cached_body.nil? + @cached_body = File.open(self.filepath, "rb" ).read + end + return @cached_body + end + + # List of DSN codes taken from RFC 3463 + # http://tools.ietf.org/html/rfc3463 + DsnToMessage = { + 'X.1.0' => 'Other address status', + 'X.1.1' => 'Bad destination mailbox address', + 'X.1.2' => 'Bad destination system address', + 'X.1.3' => 'Bad destination mailbox address syntax', + 'X.1.4' => 'Destination mailbox address ambiguous', + 'X.1.5' => 'Destination mailbox address valid', + 'X.1.6' => 'Mailbox has moved', + 'X.1.7' => 'Bad sender\'s mailbox address syntax', + 'X.1.8' => 'Bad sender\'s system address', + 'X.2.0' => 'Other or undefined mailbox status', + 'X.2.1' => 'Mailbox disabled, not accepting messages', + 'X.2.2' => 'Mailbox full', + 'X.2.3' => 'Message length exceeds administrative limit.', + 'X.2.4' => 'Mailing list expansion problem', + 'X.3.0' => 'Other or undefined mail system status', + 'X.3.1' => 'Mail system full', + 'X.3.2' => 'System not accepting network messages', + 'X.3.3' => 'System not capable of selected features', + 'X.3.4' => 'Message too big for system', + 'X.4.0' => 'Other or undefined network or routing status', + 'X.4.1' => 'No answer from host', + 'X.4.2' => 'Bad connection', + 'X.4.3' => 'Routing server failure', + 'X.4.4' => 'Unable to route', + 'X.4.5' => 'Network congestion', + 'X.4.6' => 'Routing loop detected', + 'X.4.7' => 'Delivery time expired', + 'X.5.0' => 'Other or undefined protocol status', + 'X.5.1' => 'Invalid command', + 'X.5.2' => 'Syntax error', + 'X.5.3' => 'Too many recipients', + 'X.5.4' => 'Invalid command arguments', + 'X.5.5' => 'Wrong protocol version', + 'X.6.0' => 'Other or undefined media error', + 'X.6.1' => 'Media not supported', + 'X.6.2' => 'Conversion required and prohibited', + 'X.6.3' => 'Conversion required but not supported', + 'X.6.4' => 'Conversion with loss performed', + 'X.6.5' => 'Conversion failed', + 'X.7.0' => 'Other or undefined security status', + 'X.7.1' => 'Delivery not authorized, message refused', + 'X.7.2' => 'Mailing list expansion prohibited', + 'X.7.3' => 'Security conversion required but not possible', + 'X.7.4' => 'Security features not supported', + 'X.7.5' => 'Cryptographic failure', + 'X.7.6' => 'Cryptographic algorithm not supported', + 'X.7.7' => 'Message integrity failure' + } + + # Returns HTML, of extra comment to put by attachment + def extra_note + # For delivery status notification attachments, extract the status and + # look up what it means in the DSN table. + if @content_type == 'message/delivery-status' + if !@body.match(/Status:\s+([0-9]+\.([0-9]+\.[0-9]+))\s+/) + return "" + end + dsn = $1 + dsn_part = 'X.' + $2 + + dsn_message = "" + if DsnToMessage.include?(dsn_part) + dsn_message = " (" + DsnToMessage[dsn_part] + ")" + end + + return "<br><em>DSN: " + dsn + dsn_message + "</em>" + end + return "" + end + + # Called by controller so old filenames still work + def old_display_filename + filename = self.filename + + # Convert weird spaces (e.g. \n) to normal ones + filename = filename.gsub(/\s/, " ") + # Remove slashes, they mess with URLs + filename = filename.gsub(/\//, "-") + + return filename + end + + # XXX changing this will break existing URLs, so have a care - maybe + # make another old_display_filename see above + def display_filename + filename = self.filename + if !self.incoming_message.nil? + self.incoming_message.info_request.apply_censor_rules_to_text!(filename) + end + # Sometimes filenames have e.g. %20 in - no point butchering that + # (without unescaping it, this would remove the % and leave 20s in there) + filename = CGI.unescape(filename) + + # Remove weird spaces + filename = filename.gsub(/\s+/, " ") + # Remove non-alphabetic characters + filename = filename.gsub(/[^A-Za-z0-9.]/, " ") + # Remove spaces near dots + filename = filename.gsub(/\s*\.\s*/, ".") + # Compress adjacent spaces down to a single one + filename = filename.gsub(/\s+/, " ") + filename = filename.strip + + return filename + end + + + def ensure_filename! + if self.filename.nil? + calc_ext = AlaveteliFileTypes.mimetype_to_extension(self.content_type) + if !calc_ext + calc_ext = "bin" + end + if !self.within_rfc822_subject.nil? + computed = self.within_rfc822_subject + "." + calc_ext + else + computed = "attachment." + calc_ext + end + self.filename = computed + end + end + + def filename=(filename) + calc_ext = AlaveteliFileTypes.mimetype_to_extension(self.content_type) + # Put right extension on if missing + if !filename.nil? && !filename.match(/\.#{calc_ext}$/) && calc_ext + computed = filename + "." + calc_ext + else + computed = filename + end + write_attribute('filename', computed) + end + + # Size to show next to the download link for the attachment + def update_display_size! + s = self.body.size + + if s > 1024 * 1024 + self.display_size = sprintf("%.1f", s.to_f / 1024 / 1024) + 'M' + else + self.display_size = (s / 1024).to_s + 'K' + end + end + + # Whether this type can be shown in the Google Docs Viewer. + # The full list of supported types can be found at + # https://docs.google.com/support/bin/answer.py?hl=en&answer=1189935 + def has_google_docs_viewer? + return !! { + "application/pdf" => true, # .pdf + "image/tiff" => true, # .tiff + + "application/vnd.ms-word" => true, # .doc + "application/vnd.openxmlformats-officedocument.wordprocessingml.document" => true, # .docx + + "application/vnd.ms-powerpoint" => true, # .ppt + "application/vnd.openxmlformats-officedocument.presentationml.presentation" => true, # .pptx + + "application/vnd.ms-excel" => true, # .xls + "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" => true, # .xlsx + + } [self.content_type] + end + + # Whether this type has a "View as HTML" + def has_body_as_html? + return ( + !!{ + "text/plain" => true, + "application/rtf" => true, + }[self.content_type] or + self.has_google_docs_viewer? + ) + end + + # Name of type of attachment type - only valid for things that has_body_as_html? + def name_of_content_type + return { + "text/plain" => "Text file", + 'application/rtf' => "RTF file", + + 'application/pdf' => "PDF file", + 'image/tiff' => "TIFF image", + + 'application/vnd.ms-word' => "Word document", + 'application/vnd.openxmlformats-officedocument.wordprocessingml.document' => "Word document", + + 'application/vnd.ms-powerpoint' => "PowerPoint presentation", + 'application/vnd.openxmlformats-officedocument.presentationml.presentation' => "PowerPoint presentation", + + 'application/vnd.ms-excel' => "Excel spreadsheet", + 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet' => "Excel spreadsheet", + }[self.content_type] + end + + # For "View as HTML" of attachment + def body_as_html(dir) + html = nil + wrapper_id = "wrapper" + + # simple cases, can never fail + if self.content_type == 'text/plain' + text = self.body.strip + text = CGI.escapeHTML(text) + text = MySociety::Format.make_clickable(text) + html = text.gsub(/\n/, '<br>') + return '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" + "http://www.w3.org/TR/html4/loose.dtd"><html><head><title></title></head><body>' + html + "</body></html>", wrapper_id + end + + # the extractions will also produce image files, which go in the + # current directory, so change to the directory the function caller + # wants everything in + Dir.chdir(dir) do + tempfile = Tempfile.new('foiextract', '.') + tempfile.print self.body + tempfile.flush + + if self.content_type == 'application/pdf' + IO.popen("/usr/bin/pdftohtml -nodrm -zoom 1.0 -stdout -enc UTF-8 -noframes " + tempfile.path + "", "r") do |child| + html = child.read() + end + elsif self.content_type == 'application/rtf' + IO.popen("/usr/bin/unrtf --html " + tempfile.path + "", "r") do |child| + html = child.read() + end + elsif self.has_google_docs_viewer? + html = '' # force error and using Google docs viewer + else + raise "No HTML conversion available for type " + self.content_type + end + + tempfile.close + tempfile.delete + end + + # We need to look at: + # a) Any error code + # b) The output size, as pdftohtml does not return an error code upon error. + # c) For cases when there is no text in the body of the HTML, or + # images, so nothing will be rendered. This is to detect some bug in + # pdftohtml, which sometimes makes it return just <hr>s and no other + # content. + html.match(/(\<body[^>]*\>.*)/mi) + body = $1.to_s + body_without_tags = body.gsub(/\s+/,"").gsub(/\<[^\>]*\>/, "") + contains_images = html.match(/<img/mi) ? true : false + if !$?.success? || html.size == 0 || (body_without_tags.size == 0 && !contains_images) + ret = "<html><head></head><body>"; + if self.has_google_docs_viewer? + wrapper_id = "wrapper_google_embed" + ret = ret + "<iframe src='http://docs.google.com/viewer?url=<attachment-url-here>&embedded=true' width='100%' height='100%' style='border: none;'></iframe>"; + else + ret = ret + "<p>Sorry, we were unable to convert this file to HTML. Please use the download link at the top right.</p>" + end + ret = ret + "</body></html>" + return ret, wrapper_id + end + + return html, wrapper_id + end + +end + diff --git a/app/models/incoming_message.rb b/app/models/incoming_message.rb index 0608d46d7..a4519a17d 100644 --- a/app/models/incoming_message.rb +++ b/app/models/incoming_message.rb @@ -320,7 +320,7 @@ class IncomingMessage < ActiveRecord::Base validates_presence_of :raw_email has_many :outgoing_message_followups, :foreign_key => 'incoming_message_followup_id', :class_name => 'OutgoingMessage' - + has_many :foi_attachments has_many :info_request_events # never really has many, but could in theory belongs_to :raw_email @@ -338,8 +338,8 @@ class IncomingMessage < ActiveRecord::Base # Return the structured TMail::Mail object # Documentation at http://i.loveruby.net/en/projects/tmail/doc/ - def mail - if @mail.nil? && !self.raw_email.nil? + def mail(force = nil) + if (!force.nil? || @mail.nil?) && !self.raw_email.nil? # Hack round bug in TMail's MIME decoding. Example request which provokes it: # http://www.whatdotheyknow.com/request/reviews_of_unduly_lenient_senten#incoming-4830 # Report of TMail bug: @@ -352,23 +352,109 @@ class IncomingMessage < ActiveRecord::Base @mail end + # Returns the name of the person the incoming message is from, or nil if + # there isn't one or if there is only an email address. XXX can probably + # remove from_name_if_present (which is a monkey patch) by just calling + # .from_addrs[0].name here instead? + + # Return false if for some reason this is a message that we shouldn't let them reply to + def _calculate_valid_to_reply_to + # check validity of email + if self.mail.from_addrs.nil? || self.mail.from_addrs.size == 0 + return false + end + email = self.mail.from_addrs[0].spec + if !MySociety::Validate.is_valid_email(email) + return false + end + + # reject postmaster - authorities seem to nearly always not respond to + # email to postmaster, and it tends to only happen after delivery failure. + # likewise Mailer-Daemon, Auto_Reply... + prefix = email + prefix =~ /^(.*)@/ + prefix = $1 + if !prefix.nil? && prefix.downcase.match(/^(postmaster|mailer-daemon|auto_reply|donotreply|no.reply)$/) + return false + end + if !self.mail['return-path'].nil? && self.mail['return-path'].addr == "<>" + return false + end + if !self.mail['auto-submitted'].nil? && !self.mail['auto-submitted'].keys.empty? + return false + end + return true + end + + def parse_raw_email!(force = nil) + # The following fields may be absent; we treat them as cached + # values in case we want to regenerate them (due to mail + # parsing bugs, etc). + if (!force.nil? || self.last_parsed.nil?) + self.extract_attachments! + self.sent_at = self.mail.date || self.created_at + self.subject = self.mail.subject + # XXX can probably remove from_name_if_present (which is a + # monkey patch) by just calling .from_addrs[0].name here + # instead? + self.mail_from = self.mail.from_name_if_present + begin + self.mail_from_domain = PublicBody.extract_domain_from_email(self.mail.from_addrs[0].spec) + rescue NoMethodError + self.mail_from_domain = "" + end + self.valid_to_reply_to = self._calculate_valid_to_reply_to + self.last_parsed = Time.now + self.save! + end + end + + def valid_to_reply_to? + return self.valid_to_reply_to + end + + # The cached fields mentioned in the previous comment + # XXX there must be a nicer way to do this without all that + # repetition. I tried overriding method_missing but got some + # unpredictable results. + def valid_to_reply_to + parse_raw_email! + super + end + def sent_at + parse_raw_email! + super + end + def subject + parse_raw_email! + super + end + def mail_from + parse_raw_email! + super + end + def safe_mail_from + if !self.mail_from.nil? + mail_from = self.mail_from.dup + self.info_request.apply_censor_rules_to_text!(mail_from) + return mail_from + end + end + def mail_from_domain + parse_raw_email! + super + end + # Number the attachments in depth first tree order, for use in URLs. # XXX This fills in part.rfc822_attachment and part.url_part_number within # all the parts of the email (see TMail monkeypatch above for how these # attributes are added). ensure_parts_counted must be called before using - # the attributes. This calculation is done only when required to avoid - # having to load and parse the email unnecessarily. - def after_initialize - @parts_counted = false - end + # the attributes. def ensure_parts_counted - if not @parts_counted - @count_parts_count = 0 - _count_parts_recursive(self.mail) - # we carry on using these numeric ids for attachments uudecoded from within text parts - @count_first_uudecode_count = @count_parts_count - @parts_counted = true - end + @count_parts_count = 0 + _count_parts_recursive(self.mail) + # we carry on using these numeric ids for attachments uudecoded from within text parts + @count_first_uudecode_count = @count_parts_count end def _count_parts_recursive(part) if part.multipart? @@ -406,7 +492,7 @@ class IncomingMessage < ActiveRecord::Base end end # And look up by URL part number to get an attachment - # XXX relies on get_attachments_for_display calling ensure_parts_counted + # XXX relies on extract_attachments calling ensure_parts_counted def self.get_attachment_by_url_part_number(attachments, found_url_part_number) attachments.each do |a| if a.url_part_number == found_url_part_number @@ -416,12 +502,6 @@ class IncomingMessage < ActiveRecord::Base return nil end - # Return date mail was sent - def sent_at - # Use date it arrived (created_at) if mail itself doesn't have Date: header - self.mail.date || self.created_at - end - # Converts email addresses we know about into textual descriptions of them def mask_special_emails!(text) # XXX can later display some of these special emails as actual emails, @@ -518,6 +598,7 @@ class IncomingMessage < ActiveRecord::Base self.info_request.apply_censor_rules_to_binary!(text) raise "internal error in binary_mask_stuff" if text.size != orig_size + return text end # Removes censored stuff from from HTML conversion of downloaded binaries @@ -658,20 +739,20 @@ class IncomingMessage < ActiveRecord::Base end # Internal function - def _get_censored_part_file_name(mail) + def _get_part_file_name(mail) part_file_name = TMail::Mail.get_part_file_name(mail) if part_file_name.nil? return nil end part_file_name = part_file_name.dup - self.info_request.apply_censor_rules_to_text!(part_file_name) return part_file_name end # (This risks losing info if the unchosen alternative is the only one to contain # useful info, but let's worry about that another time) def get_attachment_leaves - return _get_attachment_leaves_recursive(self.mail) + force = true + return _get_attachment_leaves_recursive(self.mail(force)) end def _get_attachment_leaves_recursive(curr_mail, within_rfc822_attachment = nil) leaves_found = [] @@ -718,7 +799,7 @@ class IncomingMessage < ActiveRecord::Base end # PDFs often come with this mime type, fix it up for view code if curr_mail.content_type == 'application/octet-stream' - part_file_name = self._get_censored_part_file_name(curr_mail) + part_file_name = self._get_part_file_name(curr_mail) calc_mime = AlaveteliFileTypes.filename_and_content_to_mimetype(part_file_name, curr_mail.body) if calc_mime curr_mail.content_type = calc_mime @@ -768,7 +849,6 @@ class IncomingMessage < ActiveRecord::Base # search results def _cache_main_body_text text = self.get_main_body_text_internal - # Strip the uudecode parts from main text # - this also effectively does a .dup as well, so text mods don't alter original text = text.split(/^begin.+^`\n^end\n/sm).join(" ") @@ -879,8 +959,8 @@ class IncomingMessage < ActiveRecord::Base end # Returns part which contains main body text, or nil if there isn't one def get_main_body_text_part - leaves = get_attachment_leaves - + leaves = self.foi_attachments + # Find first part which is text/plain or text/html # (We have to include HTML, as increasingly there are mail clients that # include no text alternative for the main part, and we don't want to @@ -894,7 +974,7 @@ class IncomingMessage < ActiveRecord::Base # Otherwise first part which is any sort of text leaves.each do |p| - if p.main_type == 'text' + if p.content_type.match(/^text/) return p end end @@ -902,7 +982,7 @@ class IncomingMessage < ActiveRecord::Base # ... or if none, consider first part p = leaves[0] # if it is a known type then don't use it, return no body (nil) - if AlaveteliFileTypes.mimetype_to_extension(p.content_type) + if !p.nil? && AlaveteliFileTypes.mimetype_to_extension(p.content_type) # this is guess of case where there are only attachments, no body text # e.g. http://www.whatdotheyknow.com/request/cost_benefit_analysis_for_real_n return nil @@ -914,16 +994,7 @@ class IncomingMessage < ActiveRecord::Base return p end # Returns attachments that are uuencoded in main body part - def get_main_body_text_uudecode_attachments - # we don't use get_main_body_text_internal, as we want to avoid charset - # conversions, since /usr/bin/uudecode needs to deal with those. - # e.g. for https://secure.mysociety.org/admin/foi/request/show_raw_email/24550 - main_part = get_main_body_text_part - if main_part.nil? - return [] - end - text = main_part.body - + def _uudecode_and_save_attachments(text) # Find any uudecoded things buried in it, yeuchly uus = text.scan(/^begin.+^`\n^end\n/sm) attachments = [] @@ -938,91 +1009,109 @@ class IncomingMessage < ActiveRecord::Base end tempfile.close # Make attachment type from it, working out filename and mime type - attachment = FOIAttachment.new() - attachment.body = content - attachment.filename = uu.match(/^begin\s+[0-9]+\s+(.*)$/)[1] - self.info_request.apply_censor_rules_to_text!(attachment.filename) - calc_mime = AlaveteliFileTypes.filename_and_content_to_mimetype(attachment.filename, attachment.body) + filename = uu.match(/^begin\s+[0-9]+\s+(.*)$/)[1] + calc_mime = AlaveteliFileTypes.filename_and_content_to_mimetype(filename, content) if calc_mime calc_mime = normalise_content_type(calc_mime) - attachment.content_type = calc_mime + content_type = calc_mime else - attachment.content_type = 'application/octet-stream' + content_type = 'application/octet-stream' end - attachments += [attachment] - end - + hexdigest = Digest::MD5.hexdigest(content) + attachment = self.foi_attachments.find_or_create_by_hexdigest(:hexdigest => hexdigest) + attachment.update_attributes(:filename => filename, + :content_type => content_type, + :body => content, + :display_size => "0K") + attachment.save! + attachments << attachment + end return attachments end - # Returns all attachments for use in display code - # XXX is this called multiple times and should be cached? def get_attachments_for_display + parse_raw_email! + # return what user would consider attachments, i.e. not the main body main_part = get_main_body_text_part - leaves = get_attachment_leaves + attachments = [] + for attachment in self.foi_attachments + attachments << attachment if attachment != main_part + end + return attachments + end + def extract_attachments! + leaves = get_attachment_leaves # XXX check where else this is called from # XXX we have to call ensure_parts_counted after get_attachment_leaves # which is really messy. ensure_parts_counted - attachments = [] - for leaf in leaves - if leaf != main_part - attachment = FOIAttachment.new - - attachment.body = leaf.body - # As leaf.body causes MIME decoding which uses lots of RAM, do garbage collection here - # to prevent excess memory use. XXX not really sure if this helps reduce - # peak RAM use overall. Anyway, maybe there is something better to do than this. - GC.start - - attachment.filename = _get_censored_part_file_name(leaf) - if leaf.within_rfc822_attachment - attachment.within_rfc822_subject = leaf.within_rfc822_attachment.subject - # Test to see if we are in the first part of the attached - # RFC822 message and it is text, if so add headers. - # XXX should probably use hunting algorithm to find main text part, rather than - # just expect it to be first. This will do for now though. - # Example request that needs this: - # http://www.whatdotheyknow.com/request/2923/response/7013/attach/2/Cycle%20Path%20Bank.txt - if leaf.within_rfc822_attachment == leaf && leaf.content_type == 'text/plain' - headers = "" - for header in [ 'Date', 'Subject', 'From', 'To', 'Cc' ] - if leaf.within_rfc822_attachment.header.include?(header.downcase) - header_value = leaf.within_rfc822_attachment.header[header.downcase] - # Example message which has a blank Date header: - # http://www.whatdotheyknow.com/request/30747/response/80253/attach/html/17/Common%20Purpose%20Advisory%20Group%20Meeting%20Tuesday%202nd%20March.txt.html - if !header_value.blank? - headers = headers + header + ": " + header_value.to_s + "\n" - end + for leaf in leaves + body = leaf.body + # As leaf.body causes MIME decoding which uses lots of RAM, do garbage collection here + # to prevent excess memory use. XXX not really sure if this helps reduce + # peak RAM use overall. Anyway, maybe there is something better to do than this. + GC.start + if leaf.within_rfc822_attachment + within_rfc822_subject = leaf.within_rfc822_attachment.subject + # Test to see if we are in the first part of the attached + # RFC822 message and it is text, if so add headers. + # XXX should probably use hunting algorithm to find main text part, rather than + # just expect it to be first. This will do for now though. + # Example request that needs this: + # http://www.whatdotheyknow.com/request/2923/response/7013/attach/2/Cycle%20Path%20Bank.txt + if leaf.within_rfc822_attachment == leaf && leaf.content_type == 'text/plain' + headers = "" + for header in [ 'Date', 'Subject', 'From', 'To', 'Cc' ] + if leaf.within_rfc822_attachment.header.include?(header.downcase) + header_value = leaf.within_rfc822_attachment.header[header.downcase] + # Example message which has a blank Date header: + # http://www.whatdotheyknow.com/request/30747/response/80253/attach/html/17/Common%20Purpose%20Advisory%20Group%20Meeting%20Tuesday%202nd%20March.txt.html + if !header_value.blank? + headers = headers + header + ": " + header_value.to_s + "\n" end end - # XXX call _convert_part_body_to_text here, but need to get charset somehow - # e.g. http://www.whatdotheyknow.com/request/1593/response/3088/attach/4/Freedom%20of%20Information%20request%20-%20car%20oval%20sticker:%20Article%2020,%20Convention%20on%20Road%20Traffic%201949.txt - attachment.body = headers + "\n" + attachment.body - - # This is quick way of getting all headers, but instead we only add some a) to - # make it more usable, b) as at least one authority accidentally leaked security - # information into a header. - #attachment.body = leaf.within_rfc822_attachment.port.to_s end + # XXX call _convert_part_body_to_text here, but need to get charset somehow + # e.g. http://www.whatdotheyknow.com/request/1593/response/3088/attach/4/Freedom%20of%20Information%20request%20-%20car%20oval%20sticker:%20Article%2020,%20Convention%20on%20Road%20Traffic%201949.txt + body = headers + "\n" + body + + # This is quick way of getting all headers, but instead we only add some a) to + # make it more usable, b) as at least one authority accidentally leaked security + # information into a header. + #attachment.body = leaf.within_rfc822_attachment.port.to_s end - attachment.content_type = leaf.content_type - attachment.url_part_number = leaf.url_part_number - attachments += [attachment] end + hexdigest = Digest::MD5.hexdigest(body) + attachment = self.foi_attachments.find_or_create_by_hexdigest(:hexdigest => hexdigest) + attachment.update_attributes(:url_part_number => leaf.url_part_number, + :content_type => leaf.content_type, + :filename => _get_part_file_name(leaf), + :charset => leaf.charset, + :within_rfc822_subject => within_rfc822_subject, + :display_size => "0K", + :body => body) + attachment.save! + attachments << attachment.id end - - uudecode_attachments = get_main_body_text_uudecode_attachments - c = @count_first_uudecode_count - for uudecode_attachment in uudecode_attachments - c += 1 - uudecode_attachment.url_part_number = c - attachments += [uudecode_attachment] + main_part = get_main_body_text_part + # we don't use get_main_body_text_internal, as we want to avoid charset + # conversions, since /usr/bin/uudecode needs to deal with those. + # e.g. for https://secure.mysociety.org/admin/foi/request/show_raw_email/24550 + if !main_part.nil? + uudecoded_attachments = _uudecode_and_save_attachments(main_part.body) + c = @count_first_uudecode_count + for uudecode_attachment in uudecoded_attachments + c += 1 + uudecode_attachment.url_part_number = c + uudecode_attachment.save! + attachments << uudecode_attachment.id + end end - return attachments - end + # now get rid of any attachments we no longer have + FoiAttachment.destroy_all("id NOT IN (#{attachments.join(',')}) AND incoming_message_id = #{self.id}") + end # Returns body text as HTML with quotes flattened, and emails removed. def get_body_for_html_display(collapse_quoted_sections = true) @@ -1047,7 +1136,7 @@ class IncomingMessage < ActiveRecord::Base text.strip! # if there is nothing but quoted stuff, then show the subject if text == "FOLDED_QUOTED_SECTION" - text = "[Subject only] " + CGI.escapeHTML(self.mail.subject) + text + text = "[Subject only] " + CGI.escapeHTML(self.subject) + text end # and display link for quoted stuff text = text.gsub(/FOLDED_QUOTED_SECTION/, "\n\n" + '<span class="unfold_link"><a href="?unfold=1#incoming-'+self.id.to_s+'">show quoted sections</a></span>' + "\n\n") @@ -1063,6 +1152,7 @@ class IncomingMessage < ActiveRecord::Base return text end + # Returns text of email for using in quoted section when replying def get_body_for_quoting # Get the body text with emails and quoted sections removed @@ -1200,6 +1290,7 @@ class IncomingMessage < ActiveRecord::Base return text end + # Returns text for indexing def get_text_for_indexing_full return get_body_for_quoting + "\n\n" + get_attachment_text_full @@ -1209,23 +1300,6 @@ class IncomingMessage < ActiveRecord::Base return get_body_for_quoting + "\n\n" + get_attachment_text_clipped end - # Returns the name of the person the incoming message is from, or nil if - # there isn't one or if there is only an email address. XXX can probably - # remove from_name_if_present (which is a monkey patch) by just calling - # .from_addrs[0].name here instead? - def safe_mail_from - name = self.mail.from_name_if_present - if name.nil? - return nil - end - name = name.dup - self.info_request.apply_censor_rules_to_text!(name) - return name - end - - def mail_from_domain - return PublicBody.extract_domain_from_email(self.mail.from_addrs[0].spec) - end # Has message arrived "recently"? diff --git a/app/models/info_request.rb b/app/models/info_request.rb index f482ca700..cfef6ebd8 100644 --- a/app/models/info_request.rb +++ b/app/models/info_request.rb @@ -1,3 +1,4 @@ + # == Schema Information # Schema version: 95 # @@ -249,10 +250,10 @@ public if incoming_message.nil? || !incoming_message.valid_to_reply_to? 'Re: ' + self.email_subject_request else - if incoming_message.mail.subject.match(/^Re:/i) - incoming_message.mail.subject + if incoming_message.subject.match(/^Re:/i) + incoming_message.subject else - 'Re: ' + incoming_message.mail.subject + 'Re: ' + incoming_message.subject end end end diff --git a/app/models/request_mailer.rb b/app/models/request_mailer.rb index 75dc58447..272f2ea83 100644 --- a/app/models/request_mailer.rb +++ b/app/models/request_mailer.rb @@ -10,6 +10,7 @@ require 'alaveteli_file_types' class RequestMailer < ApplicationMailer + # Used when an FOI officer uploads a response from their web browser - this is # the "fake" email used to store in the same format in the database as if they # had emailed it. diff --git a/app/views/request/upload_response.rhtml b/app/views/request/upload_response.rhtml index eaa6602a1..38c3db268 100644 --- a/app/views/request/upload_response.rhtml +++ b/app/views/request/upload_response.rhtml @@ -23,7 +23,7 @@ file too large for email, use the form below.')%> <p><%= _('Enter your response below. You may attach one file (use email, or <a href="%s">contact us</a> if you need more).')% [help_contact_path] %></p> -<% form_tag '', :html => { :id => 'upload_response_form' }, :multipart => true do %> +<% form_tag '', :id => 'upload_response_form', :multipart => true do %> <p> <label class="form_label" for="body"><% _('Response:')%></label> <%= text_area_tag :body, "", :rows => 10, :cols => 55 %> diff --git a/db/migrate/104_create_foi_attachments.rb b/db/migrate/104_create_foi_attachments.rb new file mode 100644 index 000000000..c53cd4f64 --- /dev/null +++ b/db/migrate/104_create_foi_attachments.rb @@ -0,0 +1,18 @@ + +class CreateFoiAttachments < ActiveRecord::Migration + def self.up + create_table :foi_attachments do |t| + t.column :content_type, :text + t.column :filename, :text + t.column :charset, :text + t.column :display_size, :text + t.column :url_part_number, :integer + t.column :within_rfc822_subject, :text + t.column :incoming_message_id, :integer + end + end + + def self.down + drop_table :foi_attachments + end +end diff --git a/db/migrate/105_extend_incoming_message.rb b/db/migrate/105_extend_incoming_message.rb new file mode 100644 index 000000000..9db8649a0 --- /dev/null +++ b/db/migrate/105_extend_incoming_message.rb @@ -0,0 +1,17 @@ +class ExtendIncomingMessage < ActiveRecord::Migration + def self.up + add_column :incoming_messages, :sent_at, :time + add_column :incoming_messages, :subject, :text + add_column :incoming_messages, :safe_mail_from, :text + add_column :incoming_messages, :mail_from_domain, :text + add_column :incoming_messages, :valid_to_reply_to, :boolean + end + + def self.down + remove_column :incoming_messages, :sent_at + remove_column :incoming_messages, :subject + remove_column :incoming_messages, :safe_mail_from + remove_column :incoming_messages, :mail_from_domain + remove_column :incoming_messages, :valid_to_reply_to + end +end diff --git a/db/migrate/106_add_hex_digest_to_foi_attachment.rb b/db/migrate/106_add_hex_digest_to_foi_attachment.rb new file mode 100644 index 000000000..d9520a934 --- /dev/null +++ b/db/migrate/106_add_hex_digest_to_foi_attachment.rb @@ -0,0 +1,9 @@ +class AddHexDigestToFoiAttachment < ActiveRecord::Migration + def self.up + add_column :foi_attachments, :hexdigest, :string, :limit => 32 + end + + def self.down + remove_column :foi_attachments, :hexdigest + end +end diff --git a/db/migrate/107_add_date_parsed_field_to_incoming_message.rb b/db/migrate/107_add_date_parsed_field_to_incoming_message.rb new file mode 100644 index 000000000..fbc017134 --- /dev/null +++ b/db/migrate/107_add_date_parsed_field_to_incoming_message.rb @@ -0,0 +1,9 @@ +class AddDateParsedFieldToIncomingMessage < ActiveRecord::Migration + def self.up + add_column :incoming_messages, :last_parsed, :datetime + end + + def self.down + remove_column :incoming_messages, :last_parsed + end +end diff --git a/db/migrate/108_change_safe_mail_from_to_mail_from.rb b/db/migrate/108_change_safe_mail_from_to_mail_from.rb new file mode 100644 index 000000000..57997f674 --- /dev/null +++ b/db/migrate/108_change_safe_mail_from_to_mail_from.rb @@ -0,0 +1,11 @@ +class ChangeSafeMailFromToMailFrom < ActiveRecord::Migration + def self.up + remove_column :incoming_messages, :safe_mail_from + add_column :incoming_messages, :mail_from, :text + end + + def self.down + remove_column :incoming_messages, :mail_from + add_column :incoming_messages, :safe_mail_from, :text + end +end diff --git a/doc/CHANGES.md b/doc/CHANGES.md index 9afdc9e7c..758e6b56e 100644 --- a/doc/CHANGES.md +++ b/doc/CHANGES.md @@ -5,6 +5,7 @@ * Base design refactored: CSS simplified and reduced, base design colours removed, now provided in example Alaveteli theme override * It is now possible to rebuild the xapian index for specific terms, rather than having to drop and rebuild the entire database every time (as previously). See rake xapian:rebuild_index for more info. * When listing authorities, show all authorities in default locale, rather than only those in the currently selected locale. +* Ensure incoming emails are only ever parsed once (should give a performance boost) ## Upgrade notes * **IMPORTANT! We now depend on Xapian 1.2**, which means you may need to install Xapian from backports. See [issue #159] for more info. diff --git a/script/cache-incoming-emails b/script/cache-incoming-emails new file mode 100644 index 000000000..a84a713d6 --- /dev/null +++ b/script/cache-incoming-emails @@ -0,0 +1,9 @@ +#!/bin/bash + +# Fill in all the database caches of text from body/attachments. +# Will take a while to run! Can use after clear-caches to refresh the database +# level caches if you like. + +LOC=`dirname $0` + +"$LOC/runner" 'IncomingMessage.find_each() { |im| print "info request " + im.info_request.id.to_s + ", incoming message " + im.id.to_s + ": " + im.extract_attachments!.count.to_s + " attachments extracted to " + im.foi_attachments[0].directory + "; main body folded: " + im.get_main_body_text_folded.size.to_s + " attachment clipped:" + im.get_attachment_text_clipped.size.to_s + "\n" }' diff --git a/script/clear-caches b/script/clear-caches index e9438f92d..2d91774ef 100755 --- a/script/clear-caches +++ b/script/clear-caches @@ -4,7 +4,7 @@ LOC=`dirname $0` -"$LOC/runner" "ActiveRecord::Base.connection.execute(\"update incoming_messages set cached_attachment_text_clipped = null, cached_main_body_text_unfolded = null, cached_main_body_text_folded = null\")" +"$LOC/runner" "ActiveRecord::Base.connection.execute(\"update incoming_messages set cached_attachment_text_clipped = null, cached_main_body_text_unfolded = null, cached_main_body_text_folded = null, sent_at = null, subject = null, safe_mail_from = null, mail_from_domain = null, valid_to_reply_to = null\")" # Remove page cache (do it in two stages so live site gets cache cleared faster) rm -fr $LOC/../old-cache diff --git a/script/fill-database-caches b/script/fill-database-caches deleted file mode 100755 index e6b525144..000000000 --- a/script/fill-database-caches +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/bash - -# Fill in all the database caches of text from body/attachments. -# Will take a while to run! Can use after clear-caches to refresh the database -# level caches if you like. - -LOC=`dirname $0` - -"$LOC/runner" 'IncomingMessage.find_each() { |im| print im.id.to_s + " id: main body folded:" + im.get_main_body_text_folded.size.to_s + " attachment clipped:" + im.get_attachment_text_clipped.size.to_s + "\n" }' - - diff --git a/spec/controllers/request_controller_spec.rb b/spec/controllers/request_controller_spec.rb index 054654611..4994c2a8f 100644 --- a/spec/controllers/request_controller_spec.rb +++ b/spec/controllers/request_controller_spec.rb @@ -105,10 +105,12 @@ describe RequestController, "when showing one request" do integrate_views it "should receive incoming messages, send email to creator, and show them" do + ir = info_requests(:fancy_dog_request) + ir.incoming_messages.each { |x| x.parse_raw_email! } + get :show, :url_title => 'why_do_you_have_such_a_fancy_dog' size_before = assigns[:info_request_events].size - ir = info_requests(:fancy_dog_request) receive_incoming_mail('incoming-request-plain.email', ir.incoming_email) deliveries = ActionMailer::Base.deliveries deliveries.size.should == 1 @@ -120,6 +122,8 @@ describe RequestController, "when showing one request" do end it "should download attachments" do + ir = info_requests(:fancy_dog_request) + ir.incoming_messages.each { |x| x.parse_raw_email! } get :show, :url_title => 'why_do_you_have_such_a_fancy_dog' response.content_type.should == "text/html" size_before = assigns[:info_request_events].size @@ -129,7 +133,7 @@ describe RequestController, "when showing one request" do get :show, :url_title => 'why_do_you_have_such_a_fancy_dog' (assigns[:info_request_events].size - size_before).should == 1 - + ir.reload get :get_attachment, :incoming_message_id => ir.incoming_messages[1].id, :id => ir.id, :part => 2, :file_name => ['hello.txt'] response.content_type.should == "text/plain" response.should have_text(/Second hello/) @@ -148,16 +152,50 @@ describe RequestController, "when showing one request" do it "should generate valid HTML verson of plain text attachments " do ir = info_requests(:fancy_dog_request) receive_incoming_mail('incoming-request-two-same-name.email', ir.incoming_email) + ir.reload get :get_attachment_as_html, :incoming_message_id => ir.incoming_messages[1].id, :id => ir.id, :part => 2, :file_name => ['hello.txt.html'], :skip_cache => 1 response.content_type.should == "text/html" response.should have_text(/Second hello/) end + it "should not cause a reparsing of the raw email, even when the result would be a 404 " do + ir = info_requests(:fancy_dog_request) + receive_incoming_mail('incoming-request-two-same-name.email', ir.incoming_email) + ir.reload + attachment = IncomingMessage.get_attachment_by_url_part_number(ir.incoming_messages[1].get_attachments_for_display, 2) + attachment.body.should have_text(/Second hello/) + + # change the raw_email associated with the message; this only be reparsed when explicitly asked for + ir.incoming_messages[1].raw_email.data = ir.incoming_messages[1].raw_email.data.sub("Second", "Third") + # asking for an attachment by the wrong filename results + # in a 404 for browsing users. This shouldn't cause a + # re-parse... + lambda { + get :get_attachment_as_html, :incoming_message_id => ir.incoming_messages[1].id, :id => ir.id, :part => 2, :file_name => ['hello.txt.baz.html'], :skip_cache => 1 + }.should raise_error(ActiveRecord::RecordNotFound) + + attachment = IncomingMessage.get_attachment_by_url_part_number(ir.incoming_messages[1].get_attachments_for_display, 2) + attachment.body.should have_text(/Second hello/) + + # ...nor should asking for it by its correct filename... + get :get_attachment_as_html, :incoming_message_id => ir.incoming_messages[1].id, :id => ir.id, :part => 2, :file_name => ['hello.txt.html'], :skip_cache => 1 + response.should_not have_text(/Third hello/) + + # ...but if we explicitly ask for attachments to be extracted, then they should be + force = true + ir.incoming_messages[1].parse_raw_email!(force) + attachment = IncomingMessage.get_attachment_by_url_part_number(ir.incoming_messages[1].get_attachments_for_display, 2) + attachment.body.should have_text(/Second hello/) + get :get_attachment_as_html, :incoming_message_id => ir.incoming_messages[1].id, :id => ir.id, :part => 2, :file_name => ['hello.txt.html'], :skip_cache => 1 + response.should have_text(/Third hello/) + end + it "should treat attachments with unknown extensions as binary" do ir = info_requests(:fancy_dog_request) receive_incoming_mail('incoming-request-attachment-unknown-extension.email', ir.incoming_email) + ir.reload - get :get_attachment, :incoming_message_id => ir.incoming_messages[1].id, :id => ir.id, :part => 2, :file_name => ['hello.qwglhm'] + get :get_attachment, :incoming_message_id => ir.incoming_messages[1].id, :id => ir.id, :part => 2, :file_name => ['hello.qwglhm'], :skip_cache => 1 response.content_type.should == "application/octet-stream" response.should have_text(/an unusual sort of file/) end @@ -200,8 +238,9 @@ describe RequestController, "when showing one request" do ir.user.censor_rules << censor_rule receive_incoming_mail('incoming-request-two-same-name.email', ir.incoming_email) + ir.reload - get :get_attachment, :incoming_message_id => ir.incoming_messages[1].id, :id => ir.id, :part => 2, :file_name => ['hello.txt'] + get :get_attachment, :incoming_message_id => ir.incoming_messages[1].id, :id => ir.id, :part => 2, :file_name => ['hello.txt'], :skip_cache => 1 response.content_type.should == "text/plain" response.should have_text(/xxxxxx hello/) end @@ -210,7 +249,22 @@ describe RequestController, "when showing one request" do ir = info_requests(:fancy_dog_request) receive_incoming_mail('incoming-request-two-same-name.email', ir.incoming_email) + # XXX this is horrid, but don't know a better way. If we + # don't do this, the info_request_event to which the + # info_request is attached still uses the unmodified + # version from the fixture. + #event = info_request_events(:useless_incoming_message_event) + ir.reload + assert ir.info_request_events[3].incoming_message.get_attachments_for_display.count == 2 + ir.save! + ir.incoming_messages.last.save! get :show, :url_title => 'why_do_you_have_such_a_fancy_dog' + assert assigns[:info_request].info_request_events[3].incoming_message.get_attachments_for_display.count == 2 + # the issue is that the info_request_events have got cached on them the old info_requests. + # where i'm at: trying to replace those fields that got re-read from the raw email. however tests are failing in very strange ways. currently I don't appear to be getting any attachments parsed in at all when in the template (see "*****" in _correspondence.rhtml) but do when I'm in the code. + + # so at this point, assigns[:info_request].incoming_messages[1].get_attachments_for_display is returning stuff, but the equivalent thing in the template isn't. + # but something odd is that the above is return a whole load of attachments which aren't there in the controller response.body.should have_tag("p.attachment strong", /hello.txt/m) censor_rule = CensorRule.new() @@ -225,10 +279,17 @@ describe RequestController, "when showing one request" do end it "should make a zipfile available, which has a different URL when it changes" do + title = 'why_do_you_have_such_a_fancy_dog' ir = info_requests(:fancy_dog_request) session[:user_id] = ir.user.id # bob_smith_user + get :download_entire_request, :url_title => title + assigns[:url_path].should have_text(/#{title}.zip$/) + old_path = assigns[:url_path] + response.location.should have_text(/#{assigns[:url_path]}$/) + zipfile = Zip::ZipFile.open(File.join(File.dirname(__FILE__), "../../cache/zips", old_path)) { |zipfile| + zipfile.count.should == 2 + } receive_incoming_mail('incoming-request-two-same-name.email', ir.incoming_email) - title = 'why_do_you_have_such_a_fancy_dog' get :download_entire_request, :url_title => title assigns[:url_path].should have_text(/#{title}.zip$/) old_path = assigns[:url_path] @@ -1286,9 +1347,10 @@ end describe RequestController, "authority uploads a response from the web interface" do + integrate_views fixtures :public_bodies, :public_body_translations, :public_body_versions, :users, :info_requests, :incoming_messages, :outgoing_messages, :comments, :info_request_events, :track_things - before(:all) do + before(:each) do # domain after the @ is used for authentication of FOI officers, so to test it # we need a user which isn't at localhost. @normal_user = User.new(:name => "Mr. Normal", :email => "normal-user@flourish.org", @@ -1342,7 +1404,7 @@ describe RequestController, "authority uploads a response from the web interface # How do I test a file upload in rails? # http://stackoverflow.com/questions/1178587/how-do-i-test-a-file-upload-in-rails - it "should let the requester upload a file" do + it "should let the authority upload a file" do @ir = info_requests(:fancy_dog_request) incoming_before = @ir.incoming_messages.size session[:user_id] = @foi_officer_user.id diff --git a/spec/fixtures/foi_attachments.yml b/spec/fixtures/foi_attachments.yml new file mode 100644 index 000000000..8b1378917 --- /dev/null +++ b/spec/fixtures/foi_attachments.yml @@ -0,0 +1 @@ + diff --git a/spec/models/incoming_message_spec.rb b/spec/models/incoming_message_spec.rb index 1a4baca0b..ed31b7c5c 100644 --- a/spec/models/incoming_message_spec.rb +++ b/spec/models/incoming_message_spec.rb @@ -64,7 +64,7 @@ end describe IncomingMessage, " display attachments" do it "should not show slashes in filenames" do - foi_attachment = FOIAttachment.new() + foi_attachment = FoiAttachment.new() # http://www.whatdotheyknow.com/request/post_commercial_manager_librarie#incoming-17233 foi_attachment.filename = "FOI/09/066 RESPONSE TO FOI REQUEST RECEIVED 21st JANUARY 2009.txt" expected_display_filename = foi_attachment.filename.gsub(/\//, " ") @@ -72,10 +72,11 @@ describe IncomingMessage, " display attachments" do end it "should not show slashes in subject generated filenames" do - foi_attachment = FOIAttachment.new() + foi_attachment = FoiAttachment.new() # http://www.whatdotheyknow.com/request/post_commercial_manager_librarie#incoming-17233 foi_attachment.within_rfc822_subject = "FOI/09/066 RESPONSE TO FOI REQUEST RECEIVED 21st JANUARY 2009" foi_attachment.content_type = 'text/plain' + foi_attachment.ensure_filename! expected_display_filename = foi_attachment.within_rfc822_subject.gsub(/\//, " ") + ".txt" foi_attachment.display_filename.should == expected_display_filename end @@ -118,8 +119,7 @@ describe IncomingMessage, " checking validity to reply to" do @incoming_message = IncomingMessage.new() @incoming_message.stub!(:mail).and_return(@mail) - - @incoming_message.valid_to_reply_to?.should == result + @incoming_message._calculate_valid_to_reply_to.should == result end it "says a valid email is fine" do @@ -284,10 +284,8 @@ describe IncomingMessage, " when censoring data" do end it "should apply censor rules to From: addresses" do - mock_mail = mock('Email object') - mock_mail.stub!(:from_name_if_present).and_return("Stilton Mouse") - @im.stub!(:mail).and_return(mock_mail) - + @im.stub!(:mail_from).and_return("Stilton Mouse") + @im.stub!(:last_parsed).and_return(Time.now) safe_mail_from = @im.safe_mail_from safe_mail_from.should == "Jarlsberg Mouse" end @@ -326,21 +324,23 @@ end describe IncomingMessage, " when uudecoding bad messages" do + fixtures :incoming_messages, :raw_emails, :public_bodies, :public_body_translations, :info_requests, :users, :foi_attachments + + before(:each) do + load_raw_emails_data(raw_emails) + end + it "should be able to do it at all" do mail_body = load_file_fixture('incoming-request-bad-uuencoding.email') mail = TMail::Mail.parse(mail_body) mail.base64_decode - - im = IncomingMessage.new + im = incoming_messages(:useless_incoming_message) im.stub!(:mail).and_return(mail) - ir = InfoRequest.new - im.info_request = ir - u = User.new - ir.user = u - - attachments = im.get_main_body_text_uudecode_attachments - attachments.size.should == 1 - attachments[0].filename.should == 'moo.txt' + im.extract_attachments! + attachments = im.foi_attachments + attachments.size.should == 2 + attachments[1].filename.should == 'moo.txt' + im.get_attachments_for_display.size.should == 1 end it "should apply censor rules" do @@ -348,12 +348,9 @@ describe IncomingMessage, " when uudecoding bad messages" do mail = TMail::Mail.parse(mail_body) mail.base64_decode - im = IncomingMessage.new + im = incoming_messages(:useless_incoming_message) im.stub!(:mail).and_return(mail) - ir = InfoRequest.new - im.info_request = ir - u = User.new - ir.user = u + ir = info_requests(:fancy_dog_request) @censor_rule = CensorRule.new() @censor_rule.text = "moo" @@ -361,26 +358,31 @@ describe IncomingMessage, " when uudecoding bad messages" do @censor_rule.last_edit_editor = "unknown" @censor_rule.last_edit_comment = "none" ir.censor_rules << @censor_rule + im.extract_attachments! - attachments = im.get_main_body_text_uudecode_attachments + attachments = im.get_attachments_for_display attachments.size.should == 1 - attachments[0].filename.should == 'bah.txt' + attachments[0].display_filename.should == 'bah.txt' end end describe IncomingMessage, "when messages are attached to messages" do + fixtures :incoming_messages, :raw_emails, :public_bodies, :public_body_translations, :info_requests, :users, :foi_attachments + + before(:each) do + load_raw_emails_data(raw_emails) + end + it "should flatten all the attachments out" do mail_body = load_file_fixture('incoming-request-attach-attachments.email') mail = TMail::Mail.parse(mail_body) mail.base64_decode - im = IncomingMessage.new + im = incoming_messages(:useless_incoming_message) im.stub!(:mail).and_return(mail) - ir = InfoRequest.new - im.info_request = ir - u = User.new - ir.user = u + + im.extract_attachments! attachments = im.get_attachments_for_display attachments.size.should == 3 @@ -391,17 +393,20 @@ describe IncomingMessage, "when messages are attached to messages" do end describe IncomingMessage, "when Outlook messages are attached to messages" do + fixtures :incoming_messages, :raw_emails, :public_bodies, :public_body_translations, :info_requests, :users, :foi_attachments + + before(:each) do + load_raw_emails_data(raw_emails) + end + it "should flatten all the attachments out" do mail_body = load_file_fixture('incoming-request-oft-attachments.email') mail = TMail::Mail.parse(mail_body) mail.base64_decode - im = IncomingMessage.new + im = incoming_messages(:useless_incoming_message) im.stub!(:mail).and_return(mail) - ir = InfoRequest.new - im.info_request = ir - u = User.new - ir.user = u + im.extract_attachments! attachments = im.get_attachments_for_display attachments.size.should == 2 @@ -411,17 +416,20 @@ describe IncomingMessage, "when Outlook messages are attached to messages" do end describe IncomingMessage, "when TNEF attachments are attached to messages" do + fixtures :incoming_messages, :raw_emails, :public_bodies, :public_body_translations, :info_requests, :users, :foi_attachments + + before(:each) do + load_raw_emails_data(raw_emails) + end + it "should flatten all the attachments out" do mail_body = load_file_fixture('incoming-request-tnef-attachments.email') mail = TMail::Mail.parse(mail_body) mail.base64_decode - im = IncomingMessage.new + im = incoming_messages(:useless_incoming_message) im.stub!(:mail).and_return(mail) - ir = InfoRequest.new - im.info_request = ir - u = User.new - ir.user = u + im.extract_attachments! attachments = im.get_attachments_for_display attachments.size.should == 2 diff --git a/spec/models/info_request_event_spec.rb b/spec/models/info_request_event_spec.rb index 055965c23..3229284cc 100644 --- a/spec/models/info_request_event_spec.rb +++ b/spec/models/info_request_event_spec.rb @@ -54,6 +54,11 @@ describe InfoRequestEvent do describe "doing search/index stuff" do fixtures :public_bodies, :public_body_translations, :public_body_versions, :users, :info_requests, :raw_emails, :incoming_messages, :outgoing_messages, :comments, :info_request_events, :track_things + before(:each) do + load_raw_emails_data(raw_emails) + parse_all_incoming_messages + end + it 'should get search text for outgoing messages' do event = info_request_events(:useless_outgoing_message_event) message = outgoing_messages(:useless_outgoing_message).body diff --git a/spec/models/info_request_spec.rb b/spec/models/info_request_spec.rb index 409d48ede..b1baa66a2 100644 --- a/spec/models/info_request_spec.rb +++ b/spec/models/info_request_spec.rb @@ -143,8 +143,8 @@ describe InfoRequest do end it "should cope with indexing after item is deleted" do + IncomingMessage.find(:all).each{|x| x.parse_raw_email!} rebuild_xapian_index - # delete event from underneath indexing; shouldn't cause error info_request_events(:useless_incoming_message_event).save! info_request_events(:useless_incoming_message_event).destroy diff --git a/spec/models/xapian_spec.rb b/spec/models/xapian_spec.rb index cf9ea5fbd..ec11c944b 100644 --- a/spec/models/xapian_spec.rb +++ b/spec/models/xapian_spec.rb @@ -4,6 +4,7 @@ describe User, " when indexing users with Xapian" do fixtures :public_bodies, :public_body_translations, :public_body_versions, :users, :info_requests, :raw_emails, :incoming_messages, :outgoing_messages, :comments, :info_request_events, :track_things it "should search by name" do + parse_all_incoming_messages rebuild_xapian_index # def InfoRequest.full_search(models, query, order, ascending, collapse, per_page, page) xapian_object = InfoRequest.full_search([User], "Silly", 'created_at', true, nil, 100, 1) diff --git a/spec/spec_helper.rb b/spec/spec_helper.rb index e5a42f1a9..5c5cd9a7f 100644 --- a/spec/spec_helper.rb +++ b/spec/spec_helper.rb @@ -78,6 +78,7 @@ def load_file_fixture(file_name) end def rebuild_xapian_index(terms = true, values = true, texts = true, dropfirst = true) + parse_all_incoming_messages if dropfirst begin ActsAsXapian.readable_init @@ -168,3 +169,7 @@ def load_raw_emails_data(raw_emails) end raw_email.data = load_file_fixture("useless_raw_email.email") end + +def parse_all_incoming_messages + IncomingMessage.find(:all).each{|x| x.parse_raw_email!} +end |