diff options
-rw-r--r-- | .travis.yml | 1 | ||||
-rw-r--r-- | app/models/foi_attachment.rb | 64 | ||||
-rw-r--r-- | app/models/incoming_message.rb | 416 | ||||
-rw-r--r-- | lib/mail_handler/backends/mail_backend.rb | 209 | ||||
-rw-r--r-- | lib/mail_handler/backends/mail_extensions.rb | 60 | ||||
-rw-r--r-- | lib/mail_handler/backends/tmail_backend.rb | 186 | ||||
-rw-r--r-- | lib/mail_handler/backends/tmail_extensions.rb | 2 | ||||
-rw-r--r-- | lib/mail_handler/mail_handler.rb | 130 | ||||
-rw-r--r-- | spec/fixtures/files/dos-linebreaks.email | 31 | ||||
-rw-r--r-- | spec/fixtures/files/many-attachments-date-header.email | 451 | ||||
-rw-r--r-- | spec/lib/mail_handler/mail_handler_spec.rb | 132 | ||||
-rw-r--r-- | spec/models/incoming_message_spec.rb | 35 |
12 files changed, 1331 insertions, 386 deletions
diff --git a/.travis.yml b/.travis.yml index c5e63aeb3..493663940 100644 --- a/.travis.yml +++ b/.travis.yml @@ -5,6 +5,7 @@ branches: - master rvm: - 1.8.7 + - 1.9.3 before_install: - gem install rubygems-update --version=1.6.2 - gem update --system 1.6.2 diff --git a/app/models/foi_attachment.rb b/app/models/foi_attachment.rb index 7c4c3226f..2f8a9ab04 100644 --- a/app/models/foi_attachment.rb +++ b/app/models/foi_attachment.rb @@ -67,9 +67,22 @@ class FoiAttachment < ActiveRecord::Base file.write d } update_display_size! + encode_cached_body! @cached_body = d end + # If the original mail part had a charset, it's some kind of string, so assume that + # it should be handled as a string in the stated charset, not a bytearray, and then + # convert it our default encoding. For ruby 1.8 this is a noop. + def encode_cached_body! + if RUBY_VERSION.to_f >= 1.9 + if charset + @cached_body.force_encoding(charset) + @cached_body = @cached_body.encode(Encoding.default_internal, charset) + end + end + end + def body if @cached_body.nil? tries = 0 @@ -90,6 +103,7 @@ class FoiAttachment < ActiveRecord::Base self.incoming_message.parse_raw_email!(force) retry end + encode_cached_body! end return @cached_body end @@ -310,32 +324,42 @@ class FoiAttachment < ActiveRecord::Base # the extractions will also produce image files, which go in the # current directory, so change to the directory the function caller # wants everything in - Dir.chdir(dir) do - tempfile = Tempfile.new('foiextract', '.') - tempfile.print self.body - tempfile.flush - - html = nil - if self.content_type == 'application/pdf' - # We set a timeout here, because pdftohtml can spiral out of control - # on some PDF files and we don’t want to crash the whole server. - html = AlaveteliExternalCommand.run("pdftohtml", "-nodrm", "-zoom", "1.0", "-stdout", "-enc", "UTF-8", "-noframes", tempfile.path, :timeout => 30) - elsif self.content_type == 'application/rtf' - html = AlaveteliExternalCommand.run("unrtf", "--html", tempfile.path, :timeout => 120) - end - if html.nil? - if self.has_google_docs_viewer? - html = '' # force error and using Google docs viewer + html = nil + if ['application/pdf', 'application/rtf'].include?(self.content_type) + text = self.body + Dir.chdir(dir) do + if RUBY_VERSION.to_f >= 1.9 + tempfile = Tempfile.new('foiextract', '.', :encoding => text.encoding) else - raise "No HTML conversion available for type " + self.content_type + tempfile = Tempfile.new('foiextract', '.') end - end + tempfile.print text + tempfile.flush - tempfile.close - tempfile.delete + + if self.content_type == 'application/pdf' + # We set a timeout here, because pdftohtml can spiral out of control + # on some PDF files and we don't want to crash the whole server. + html = AlaveteliExternalCommand.run("pdftohtml", "-nodrm", "-zoom", "1.0", "-stdout", "-enc", "UTF-8", "-noframes", tempfile.path, :timeout => 30) + elsif self.content_type == 'application/rtf' + html = AlaveteliExternalCommand.run("unrtf", "--html", tempfile.path, :timeout => 120) + end + + tempfile.close + tempfile.delete + end + end + if html.nil? + if self.has_google_docs_viewer? + html = '' # force error and using Google docs viewer + else + raise "No HTML conversion available for type " + self.content_type + end end + + # We need to look at: # a) Any error code # b) The output size, as pdftohtml does not return an error code upon error. diff --git a/app/models/incoming_message.rb b/app/models/incoming_message.rb index 464910d01..a02d2456a 100644 --- a/app/models/incoming_message.rb +++ b/app/models/incoming_message.rb @@ -125,9 +125,9 @@ class IncomingMessage < ActiveRecord::Base self.sent_at = self.mail.date || self.created_at self.subject = self.mail.subject self.mail_from = MailHandler.get_from_name(self.mail) - begin + if self.from_email self.mail_from_domain = PublicBody.extract_domain_from_email(self.from_email) - rescue NoMethodError + else self.mail_from_domain = "" end self.valid_to_reply_to = self._calculate_valid_to_reply_to @@ -173,54 +173,8 @@ class IncomingMessage < ActiveRecord::Base super end - # Number the attachments in depth first tree order, for use in URLs. - # XXX This fills in part.rfc822_attachment and part.url_part_number within - # all the parts of the email (see monkeypatches in lib/mail_handler/tmail_extensions and - # lib/mail_handler/mail_extensions for how these attributes are added). ensure_parts_counted - # must be called before using the attributes. - def ensure_parts_counted - @count_parts_count = 0 - _count_parts_recursive(self.mail) - # we carry on using these numeric ids for attachments uudecoded from within text parts - @count_first_uudecode_count = @count_parts_count - end - def _count_parts_recursive(part) - if part.multipart? - part.parts.each do |p| - _count_parts_recursive(p) - end - else - part_filename = MailHandler.get_part_file_name(part) - begin - if part.content_type == 'message/rfc822' - # An email attached as text - # e.g. http://www.whatdotheyknow.com/request/64/response/102 - part.rfc822_attachment = MailHandler.mail_from_raw_email(part.body, decode=false) - elsif part.content_type == 'application/vnd.ms-outlook' || part_filename && AlaveteliFileTypes.filename_to_mimetype(part_filename) == 'application/vnd.ms-outlook' - # An email attached as an Outlook file - # e.g. http://www.whatdotheyknow.com/request/chinese_names_for_british_politi - msg = Mapi::Msg.open(StringIO.new(part.body)) - part.rfc822_attachment = MailHandler.mail_from_raw_email(msg.to_mime.to_s, decode=false) - elsif part.content_type == 'application/ms-tnef' - # A set of attachments in a TNEF file - part.rfc822_attachment = MailHandler.mail_from_tnef(part.body) - end - rescue - # If attached mail doesn't parse, treat it as text part - part.rfc822_attachment = nil - else - unless part.rfc822_attachment.nil? - _count_parts_recursive(part.rfc822_attachment) - end - end - if part.rfc822_attachment.nil? - @count_parts_count += 1 - part.url_part_number = @count_parts_count - end - end - end # And look up by URL part number to get an attachment - # XXX relies on extract_attachments calling ensure_parts_counted + # XXX relies on extract_attachments calling MailHandler.ensure_parts_counted def self.get_attachment_by_url_part_number(attachments, found_url_part_number) attachments.each do |a| if a.url_part_number == found_url_part_number @@ -441,96 +395,6 @@ class IncomingMessage < ActiveRecord::Base return text end - # (This risks losing info if the unchosen alternative is the only one to contain - # useful info, but let's worry about that another time) - def get_attachment_leaves - force = true - return _get_attachment_leaves_recursive(self.mail(force)) - end - def _get_attachment_leaves_recursive(curr_mail, within_rfc822_attachment = nil) - leaves_found = [] - if curr_mail.multipart? - if curr_mail.parts.size == 0 - raise "no parts on multipart mail" - end - - if curr_mail.sub_type == 'alternative' - # Choose best part from alternatives - best_part = nil - # Take the last text/plain one, or else the first one - curr_mail.parts.each do |m| - if not best_part - best_part = m - elsif m.content_type == 'text/plain' - best_part = m - end - end - # Take an HTML one as even higher priority. (They tend - # to render better than text/plain, e.g. don't wrap links here: - # http://www.whatdotheyknow.com/request/amount_and_cost_of_freedom_of_in#incoming-72238 ) - curr_mail.parts.each do |m| - if m.content_type == 'text/html' - best_part = m - end - end - leaves_found += _get_attachment_leaves_recursive(best_part, within_rfc822_attachment) - else - # Add all parts - curr_mail.parts.each do |m| - leaves_found += _get_attachment_leaves_recursive(m, within_rfc822_attachment) - end - end - else - # XXX Yuck. this section alters various content_type's. That puts - # it into conflict with ensure_parts_counted which it has to be - # called both before and after. It will fail with cases of - # attachments of attachments etc. - charset = curr_mail.charset # save this, because overwriting content_type also resets charset - # Don't allow nil content_types - if curr_mail.content_type.nil? - curr_mail.content_type = 'application/octet-stream' - end - # PDFs often come with this mime type, fix it up for view code - if curr_mail.content_type == 'application/octet-stream' - part_file_name = MailHandler.get_part_file_name(curr_mail) - part_body = MailHandler.get_part_body(curr_mail) - calc_mime = AlaveteliFileTypes.filename_and_content_to_mimetype(part_file_name, part_body) - if calc_mime - curr_mail.content_type = calc_mime - end - end - - # Use standard content types for Word documents etc. - curr_mail.content_type = normalise_content_type(curr_mail.content_type) - if curr_mail.content_type == 'message/rfc822' - ensure_parts_counted # fills in rfc822_attachment variable - if curr_mail.rfc822_attachment.nil? - # Attached mail didn't parse, so treat as text - curr_mail.content_type = 'text/plain' - end - end - if curr_mail.content_type == 'application/vnd.ms-outlook' || curr_mail.content_type == 'application/ms-tnef' - ensure_parts_counted # fills in rfc822_attachment variable - if curr_mail.rfc822_attachment.nil? - # Attached mail didn't parse, so treat as binary - curr_mail.content_type = 'application/octet-stream' - end - end - # If the part is an attachment of email - if curr_mail.content_type == 'message/rfc822' || curr_mail.content_type == 'application/vnd.ms-outlook' || curr_mail.content_type == 'application/ms-tnef' - ensure_parts_counted # fills in rfc822_attachment variable - leaves_found += _get_attachment_leaves_recursive(curr_mail.rfc822_attachment, curr_mail.rfc822_attachment) - else - # Store leaf - curr_mail.within_rfc822_attachment = within_rfc822_attachment - leaves_found += [curr_mail] - end - # restore original charset - curr_mail.charset = charset - end - return leaves_found - end - # Removes anything cached about the object in the database, and saves def clear_in_database_caches! self.cached_attachment_text_clipped = nil @@ -593,7 +457,8 @@ class IncomingMessage < ActiveRecord::Base text = "[ Email has no body, please see attachments ]" source_charset = "utf-8" else - text = part.body # by default, TMail converts to UTF8 in this call + # by default, the body (coming from an foi_attachment) should have been converted to utf-8 + text = part.body source_charset = part.charset if part.content_type == 'text/html' # e.g. http://www.whatdotheyknow.com/request/35/response/177 @@ -601,42 +466,31 @@ class IncomingMessage < ActiveRecord::Base # convert to text routine. Could instead call a # sanitize HTML one. - # If the text isn't UTF8, it means TMail had a problem + # If the text isn't UTF8, it means we had a problem # converting it (invalid characters, etc), and we # should instead tell elinks to respect the source # charset use_charset = "utf-8" - begin - text = Iconv.conv('utf-8', 'utf-8', text) - rescue Iconv::IllegalSequence - use_charset = source_charset - end - text = self.class._get_attachment_text_internal_one_file(part.content_type, text, use_charset) - end - end - - # If TMail can't convert text, it just returns it, so we sanitise it. - begin - # Test if it's good UTF-8 - text = Iconv.conv('utf-8', 'utf-8', text) - rescue Iconv::IllegalSequence - # Text looks like unlabelled nonsense, - # strip out anything that isn't UTF-8 - begin - source_charset = 'utf-8' if source_charset.nil? - text = Iconv.conv('utf-8//IGNORE', source_charset, text) + - _("\n\n[ {{site_name}} note: The above text was badly encoded, and has had strange characters removed. ]", - :site_name => Configuration::site_name) - rescue Iconv::InvalidEncoding, Iconv::IllegalSequence - if source_charset != "utf-8" - source_charset = "utf-8" - retry + if RUBY_VERSION.to_f >= 1.9 + begin + text.encode('utf-8') + rescue Encoding::UndefinedConversionError, Encoding::InvalidByteSequenceError + use_charset = source_charset + end + else + begin + text = Iconv.conv('utf-8', 'utf-8', text) + rescue Iconv::IllegalSequence + use_charset = source_charset + end end + text = MailHandler.get_attachment_text_one_file(part.content_type, text, use_charset) end end + # If text hasn't been converted, we sanitise it. + text = _sanitize_text(text) # Fix DOS style linefeeds to Unix style ones (or other later regexps won't work) - # Needed for e.g. http://www.whatdotheyknow.com/request/60/response/98 text = text.gsub(/\r\n/, "\n") # Compress extra spaces down to save space, and to stop regular expressions @@ -646,6 +500,51 @@ class IncomingMessage < ActiveRecord::Base return text end + + def _sanitize_text(text) + if RUBY_VERSION.to_f >= 1.9 + begin + # Test if it's good UTF-8 + text.encode('utf-8') + rescue Encoding::UndefinedConversionError, Encoding::InvalidByteSequenceError + source_charset = 'utf-8' if source_charset.nil? + # strip out anything that isn't UTF-8 + begin + text = text.encode("utf-8", :invalid => :replace, + :undef => :replace, + :replace => "") + + _("\n\n[ {{site_name}} note: The above text was badly encoded, and has had strange characters removed. ]", + :site_name => MySociety::Config.get('SITE_NAME', 'Alaveteli')) + rescue Encoding::UndefinedConversionError, Encoding::InvalidByteSequenceError + if source_charset != "utf-8" + source_charset = "utf-8" + retry + end + end + end + else + begin + # Test if it's good UTF-8 + text = Iconv.conv('utf-8', 'utf-8', text) + rescue Iconv::IllegalSequence + # Text looks like unlabelled nonsense, + # strip out anything that isn't UTF-8 + begin + source_charset = 'utf-8' if source_charset.nil? + text = Iconv.conv('utf-8//IGNORE', source_charset, text) + + _("\n\n[ {{site_name}} note: The above text was badly encoded, and has had strange characters removed. ]", + :site_name => Configuration::site_name) + rescue Iconv::InvalidEncoding, Iconv::IllegalSequence + if source_charset != "utf-8" + source_charset = "utf-8" + retry + end + end + end + end + text + end + # Returns part which contains main body text, or nil if there isn't one def get_main_body_text_part leaves = self.foi_attachments @@ -699,7 +598,7 @@ class IncomingMessage < ActiveRecord::Base filename = uu.match(/^begin\s+[0-9]+\s+(.*)$/)[1] calc_mime = AlaveteliFileTypes.filename_and_content_to_mimetype(filename, content) if calc_mime - calc_mime = normalise_content_type(calc_mime) + calc_mime = MailHandler.normalise_content_type(calc_mime) content_type = calc_mime else content_type = 'application/octet-stream' @@ -728,55 +627,15 @@ class IncomingMessage < ActiveRecord::Base end def extract_attachments! - leaves = get_attachment_leaves # XXX check where else this is called from - # XXX we have to call ensure_parts_counted after get_attachment_leaves - # which is really messy. - ensure_parts_counted + force = true + attachment_attributes = MailHandler.get_attachment_attributes(self.mail(force)) attachments = [] - for leaf in leaves - body = MailHandler.get_part_body(leaf) - # As leaf.body causes MIME decoding which uses lots of RAM, do garbage collection here - # to prevent excess memory use. XXX not really sure if this helps reduce - # peak RAM use overall. Anyway, maybe there is something better to do than this. - GC.start - if leaf.within_rfc822_attachment - within_rfc822_subject = leaf.within_rfc822_attachment.subject - # Test to see if we are in the first part of the attached - # RFC822 message and it is text, if so add headers. - # XXX should probably use hunting algorithm to find main text part, rather than - # just expect it to be first. This will do for now though. - # Example request that needs this: - # http://www.whatdotheyknow.com/request/2923/response/7013/attach/2/Cycle%20Path%20Bank.txt - if leaf.within_rfc822_attachment == leaf && leaf.content_type == 'text/plain' - headers = "" - for header in [ 'Date', 'Subject', 'From', 'To', 'Cc' ] - if leaf.within_rfc822_attachment.header.include?(header.downcase) - header_value = leaf.within_rfc822_attachment.header[header.downcase] - # Example message which has a blank Date header: - # http://www.whatdotheyknow.com/request/30747/response/80253/attach/html/17/Common%20Purpose%20Advisory%20Group%20Meeting%20Tuesday%202nd%20March.txt.html - if !header_value.blank? - headers = headers + header + ": " + header_value.to_s + "\n" - end - end - end - # XXX call _convert_part_body_to_text here, but need to get charset somehow - # e.g. http://www.whatdotheyknow.com/request/1593/response/3088/attach/4/Freedom%20of%20Information%20request%20-%20car%20oval%20sticker:%20Article%2020,%20Convention%20on%20Road%20Traffic%201949.txt - body = headers + "\n" + body - - # This is quick way of getting all headers, but instead we only add some a) to - # make it more usable, b) as at least one authority accidentally leaked security - # information into a header. - #attachment.body = leaf.within_rfc822_attachment.port.to_s - end - end - hexdigest = Digest::MD5.hexdigest(body) - attachment = self.foi_attachments.find_or_create_by_hexdigest(:hexdigest => hexdigest) - attachment.update_attributes(:url_part_number => leaf.url_part_number, - :content_type => leaf.content_type, - :filename => MailHandler.get_part_file_name(leaf), - :charset => leaf.charset, - :within_rfc822_subject => within_rfc822_subject, - :body => body) + attachment_attributes.each do |attrs| + attachment = self.foi_attachments.find_or_create_by_hexdigest(:hexdigest => attrs[:hexdigest]) + body = attrs.delete(:body) + attachment.update_attributes(attrs) + # Set the body separately as its handling can depend on the value of charset + attachment.body = body attachment.save! attachments << attachment.id end @@ -786,7 +645,7 @@ class IncomingMessage < ActiveRecord::Base # e.g. for https://secure.mysociety.org/admin/foi/request/show_raw_email/24550 if !main_part.nil? uudecoded_attachments = _uudecode_and_save_attachments(main_part.body) - c = @count_first_uudecode_count + c = self.mail.count_first_uudecode_count for uudecode_attachment in uudecoded_attachments c += 1 uudecode_attachment.url_part_number = c @@ -878,101 +737,15 @@ class IncomingMessage < ActiveRecord::Base return self.cached_attachment_text_clipped end - def IncomingMessage._get_attachment_text_internal_one_file(content_type, body, charset = 'utf-8') - # note re. charset: TMail always tries to convert email bodies - # to UTF8 by default, so normally it should already be that. - text = '' - # XXX - tell all these command line tools to return utf-8 - if content_type == 'text/plain' - text += body + "\n\n" - else - tempfile = Tempfile.new('foiextract') - tempfile.print body - tempfile.flush - if content_type == 'application/vnd.ms-word' - AlaveteliExternalCommand.run("wvText", tempfile.path, tempfile.path + ".txt") - # Try catdoc if we get into trouble (e.g. for InfoRequestEvent 2701) - if not File.exists?(tempfile.path + ".txt") - AlaveteliExternalCommand.run("catdoc", tempfile.path, :append_to => text) - else - text += File.read(tempfile.path + ".txt") + "\n\n" - File.unlink(tempfile.path + ".txt") - end - elsif content_type == 'application/rtf' - # catdoc on RTF prodcues less comments and extra bumf than --text option to unrtf - AlaveteliExternalCommand.run("catdoc", tempfile.path, :append_to => text) - elsif content_type == 'text/html' - # lynx wordwraps links in its output, which then don't - # get formatted properly by Alaveteli. We use elinks - # instead, which doesn't do that. - AlaveteliExternalCommand.run("elinks", "-eval", "set document.codepage.assume = \"#{charset}\"", "-eval", "set document.codepage.force_assumed = 1", "-dump-charset", "utf-8", "-force-html", "-dump", - tempfile.path, :append_to => text, :env => {"LANG" => "C"}) - elsif content_type == 'application/vnd.ms-excel' - # Bit crazy using /usr/bin/strings - but xls2csv, xlhtml and - # py_xls2txt only extract text from cells, not from floating - # notes. catdoc may be fooled by weird character sets, but will - # probably do for UK FOI requests. - AlaveteliExternalCommand.run("/usr/bin/strings", tempfile.path, :append_to => text) - elsif content_type == 'application/vnd.ms-powerpoint' - # ppthtml seems to catch more text, but only outputs HTML when - # we want text, so just use catppt for now - AlaveteliExternalCommand.run("catppt", tempfile.path, :append_to => text) - elsif content_type == 'application/pdf' - AlaveteliExternalCommand.run("pdftotext", tempfile.path, "-", :append_to => text) - elsif content_type == 'application/vnd.openxmlformats-officedocument.wordprocessingml.document' - # This is Microsoft's XML office document format. - # Just pull out the main XML file, and strip it of text. - xml = AlaveteliExternalCommand.run("/usr/bin/unzip", "-qq", "-c", tempfile.path, "word/document.xml") - if !xml.nil? - doc = REXML::Document.new(xml) - text += doc.each_element( './/text()' ){}.join(" ") - end - elsif content_type == 'application/zip' - # recurse into zip files - begin - zip_file = Zip::ZipFile.open(tempfile.path) - text += IncomingMessage._get_attachment_text_from_zip_file(zip_file) - zip_file.close() - rescue - $stderr.puts("Error processing zip file: #{$!.inspect}") - end - end - tempfile.close - end - - return text - end - def IncomingMessage._get_attachment_text_from_zip_file(zip_file) - text = "" - for entry in zip_file - if entry.file? - filename = entry.to_s - begin - body = entry.get_input_stream.read - rescue - # move to next attachment silently if there were problems - # XXX really should reduce this to specific exceptions? - # e.g. password protected - next - end - calc_mime = AlaveteliFileTypes.filename_to_mimetype(filename) - if calc_mime - content_type = calc_mime - else - content_type = 'application/octet-stream' - end - text += _get_attachment_text_internal_one_file(content_type, body) - end - end - return text - end def _get_attachment_text_internal # Extract text from each attachment text = '' attachments = self.get_attachments_for_display for attachment in attachments - text += IncomingMessage._get_attachment_text_internal_one_file(attachment.content_type, attachment.body, attachment.charset) + text += MailHandler.get_attachment_text_one_file(attachment.content_type, + attachment.body, + attachment.charset) end # Remove any bad characters text = Iconv.conv('utf-8//IGNORE', 'utf-8', text) @@ -1040,37 +813,12 @@ class IncomingMessage < ActiveRecord::Base return AlaveteliFileTypes.all_extensions.join(" ") end - def normalise_content_type(content_type) - # e.g. http://www.whatdotheyknow.com/request/93/response/250 - if content_type == 'application/excel' or content_type == 'application/msexcel' or content_type == 'application/x-ms-excel' - content_type = 'application/vnd.ms-excel' - end - if content_type == 'application/mspowerpoint' or content_type == 'application/x-ms-powerpoint' - content_type = 'application/vnd.ms-powerpoint' - end - if content_type == 'application/msword' or content_type == 'application/x-ms-word' - content_type = 'application/vnd.ms-word' - end - if content_type == 'application/x-zip-compressed' - content_type = 'application/zip' - end - - # e.g. http://www.whatdotheyknow.com/request/copy_of_current_swessex_scr_opt#incoming-9928 - if content_type == 'application/acrobat' - content_type = 'application/pdf' + def for_admin_column + self.class.content_columns.each do |column| + yield(column.human_name, self.send(column.name), column.type.to_s, column.name) end - - return content_type end - def for_admin_column - self.class.content_columns.each do |column| - yield(column.human_name, self.send(column.name), column.type.to_s, column.name) - end - end - - private :normalise_content_type - end diff --git a/lib/mail_handler/backends/mail_backend.rb b/lib/mail_handler/backends/mail_backend.rb index 8dd2e6b48..b75e6ed63 100644 --- a/lib/mail_handler/backends/mail_backend.rb +++ b/lib/mail_handler/backends/mail_backend.rb @@ -23,15 +23,23 @@ module MailHandler main end + # Returns an outlook message as a Mail object + def mail_from_outlook(content) + msg = Mapi::Msg.open(StringIO.new(content)) + mail = mail_from_raw_email(msg.to_mime.to_s) + mail.ready_to_send! + mail + end + # Return a copy of the file name for the mail part - def get_part_file_name(mail_part) - part_file_name = mail_part.filename + def get_part_file_name(part) + part_file_name = part.filename part_file_name.nil? ? nil : part_file_name.dup end # Get the body of a mail part - def get_part_body(mail_part) - mail_part.body.decoded + def get_part_body(part) + part.body.decoded end # Return the first from field if any @@ -102,13 +110,204 @@ module MailHandler mail.header[header] ? mail.header[header].to_s : nil end + # Detects whether a mail part is an Outlook email + def is_outlook?(part) + filename = get_part_file_name(part) + return true if get_content_type(part) == 'application/vnd.ms-outlook' + if filename && AlaveteliFileTypes.filename_to_mimetype(filename) == 'application/vnd.ms-outlook' + return true + end + return false + end + + # Convert a mail part which is an attached mail in one of + # several formats into a mail object and set it as the + # rfc822_attachment on the part. If the mail part can't be + # converted, the content type on the part is updated to + # 'text/plain' for an RFC822 attachment, and 'application/octet-stream' + # for other types + def decode_attached_part(part, parent_mail) + if get_content_type(part) == 'message/rfc822' + # An email attached as text + part.rfc822_attachment = mail_from_raw_email(part.body) + if part.rfc822_attachment.nil? + # Attached mail didn't parse, so treat as text + part.content_type = 'text/plain' + end + elsif is_outlook?(part) + part.rfc822_attachment = mail_from_outlook(part.body.decoded) + if part.rfc822_attachment.nil? + # Attached mail didn't parse, so treat as binary + part.content_type = 'application/octet-stream' + end + elsif get_content_type(part) == 'application/ms-tnef' + # A set of attachments in a TNEF file + part.rfc822_attachment = mail_from_tnef(part.body.decoded) + if part.rfc822_attachment.nil? + # Attached mail didn't parse, so treat as binary + part.content_type = 'application/octet-stream' + end + end + if part.rfc822_attachment + expand_and_normalize_parts(part.rfc822_attachment, parent_mail) + end + end + + # Expand and normalize a mail part recursively. Decodes attached messages into + # Mail objects wherever possible. Sets a default content type if none is + # set. Tries to set a more specific content type for binary content types. + def expand_and_normalize_parts(part, parent_mail) + if part.multipart? + part.parts.each{ |sub_part| expand_and_normalize_parts(sub_part, parent_mail) } + else + part_filename = get_part_file_name(part) + charset = part.charset # save this, because overwriting content_type also resets charset + + # Don't allow nil content_types + if get_content_type(part).nil? + part.content_type = 'application/octet-stream' + end + + # PDFs often come with this mime type, fix it up for view code + if get_content_type(part) == 'application/octet-stream' + part_body = get_part_body(part) + calc_mime = AlaveteliFileTypes.filename_and_content_to_mimetype(part_filename, + part_body) + if calc_mime + part.content_type = calc_mime + end + end + + # Use standard content types for Word documents etc. + part.content_type = normalise_content_type(get_content_type(part)) + decode_attached_part(part, parent_mail) + part.charset = charset + end + end + + # Count the parts in a mail part recursively, including any attached messages. + # Set the count on the parent mail, and set a url_part_number on the part itself. + # Set the count for the first uudecoded part on the parent mail also. + def count_parts(part, parent_mail) + if part.multipart? + part.parts.each { |p| count_parts(p, parent_mail) } + else + if part.rfc822_attachment + count_parts(part.rfc822_attachment, parent_mail) + else + parent_mail.count_parts_count += 1 + part.url_part_number = parent_mail.count_parts_count + end + end + parent_mail.count_first_uudecode_count = parent_mail.count_parts_count + end + + # Choose the best part from alternatives + def choose_best_alternative(mail) + if mail.html_part + return mail.html_part + elsif mail.text_part + return mail.text_part + else + return mail.parts.first + end + end + + # Expand and normalize the parts of a mail, select the best part + # wherever there is an alternative, and then count the returned + # leaves and assign url_part values to them + def get_attachment_leaves(mail) + expand_and_normalize_parts(mail, mail) + leaves = _get_attachment_leaves_recursive(mail, nil, mail) + mail.count_parts_count = 0 + count_parts(mail, mail) + return leaves + end + + # Recurse through a mail part, selecting the best part wherever there is + # an alternative + def _get_attachment_leaves_recursive(part, within_rfc822_attachment, parent_mail) + leaves_found = [] + if part.multipart? + raise "no parts on multipart mail" if part.parts.size == 0 + if part.sub_type == 'alternative' + best_part = choose_best_alternative(part) + leaves_found += _get_attachment_leaves_recursive(best_part, + within_rfc822_attachment, + parent_mail) + else + # Add all parts + part.parts.each do |sub_part| + leaves_found += _get_attachment_leaves_recursive(sub_part, + within_rfc822_attachment, + parent_mail) + end + end + else + # Add all the parts of a decoded attached message + if part.rfc822_attachment + leaves_found += _get_attachment_leaves_recursive(part.rfc822_attachment, + part.rfc822_attachment, + parent_mail) + else + # Store leaf + part.within_rfc822_attachment = within_rfc822_attachment + leaves_found += [part] + end + end + return leaves_found + end + + # Add selected useful headers from an attached message to its body + def extract_attached_message_headers(leaf) + body = get_part_body(leaf) + # Test to see if we are in the first part of the attached + # RFC822 message and it is text, if so add headers. + if leaf.within_rfc822_attachment == leaf && get_content_type(leaf) == 'text/plain' + headers = "" + [ 'Date', 'Subject', 'From', 'To', 'Cc' ].each do |header| + if header_value = get_header_string(header, leaf.within_rfc822_attachment) + if !header_value.blank? + headers = headers + header + ": " + header_value.to_s + "\n" + end + end + end + # XXX call _convert_part_body_to_text here, but need to get charset somehow + # e.g. http://www.whatdotheyknow.com/request/1593/response/3088/attach/4/Freedom%20of%20Information%20request%20-%20car%20oval%20sticker:%20Article%2020,%20Convention%20on%20Road%20Traffic%201949.txt + body = headers + "\n" + body + end + body + end + + # Generate a hash of the attributes associated with each significant part of a Mail object + def get_attachment_attributes(mail) + leaves = get_attachment_leaves(mail) + attachments = [] + for leaf in leaves + body = get_part_body(leaf) + if leaf.within_rfc822_attachment + within_rfc822_subject = leaf.within_rfc822_attachment.subject + body = extract_attached_message_headers(leaf) + end + leaf_attributes = { :url_part_number => leaf.url_part_number, + :content_type => get_content_type(leaf), + :filename => get_part_file_name(leaf), + :charset => leaf.charset, + :within_rfc822_subject => within_rfc822_subject, + :body => body, + :hexdigest => Digest::MD5.hexdigest(body) } + attachments << leaf_attributes + end + return attachments + end + # Format def address_from_name_and_email(name, email) if !MySociety::Validate.is_valid_email(email) raise "invalid email " + email + " passed to address_from_name_and_email" end if name.nil? - return Mail::Address.new(email) + return Mail::Address.new(email).to_s end address = Mail::Address.new address.display_name = name diff --git a/lib/mail_handler/backends/mail_extensions.rb b/lib/mail_handler/backends/mail_extensions.rb index cbe0491ed..f756abd1a 100644 --- a/lib/mail_handler/backends/mail_extensions.rb +++ b/lib/mail_handler/backends/mail_extensions.rb @@ -1,7 +1,67 @@ +require 'mail/message' +require 'mail/fields/common/parameter_hash' module Mail class Message attr_accessor :url_part_number attr_accessor :rfc822_attachment # when a whole email message is attached as text attr_accessor :within_rfc822_attachment # for parts within a message attached as text (for getting subject mainly) + attr_accessor :count_parts_count + attr_accessor :count_first_uudecode_count + + # A patched version of the message initializer to work around a bug where stripping the original + # input removes meaningful spaces - e.g. in the case of uuencoded bodies. + def initialize(*args, &block) + @body = nil + @body_raw = nil + @separate_parts = false + @text_part = nil + @html_part = nil + @errors = nil + @header = nil + @charset = 'UTF-8' + @defaulted_charset = true + + @perform_deliveries = true + @raise_delivery_errors = true + + @delivery_handler = nil + + @delivery_method = Mail.delivery_method.dup + + @transport_encoding = Mail::Encodings.get_encoding('7bit') + + @mark_for_delete = false + + if args.flatten.first.respond_to?(:each_pair) + init_with_hash(args.flatten.first) + else + # The replacement of this commented out line is the change. + # init_with_string(args.flatten[0].to_s.strip) + init_with_string(args.flatten[0].to_s) + end + + if block_given? + instance_eval(&block) + end + + self + end + end + + # A patched version of the parameter hash that handles nil values without throwing + # an error. + class ParameterHash < IndifferentHash + + def encoded + map.sort { |a,b| a.first.to_s <=> b.first.to_s }.map do |key_name, value| + # The replacement of this commented out line is the change + # unless value.ascii_only? + unless value.nil? || value.ascii_only? + value = Mail::Encodings.param_encode(value) + key_name = "#{key_name}*" + end + %Q{#{key_name}=#{quote_token(value)}} + end.join(";\r\n\s") + end end end
\ No newline at end of file diff --git a/lib/mail_handler/backends/tmail_backend.rb b/lib/mail_handler/backends/tmail_backend.rb index 4b7291d00..02124cdb1 100644 --- a/lib/mail_handler/backends/tmail_backend.rb +++ b/lib/mail_handler/backends/tmail_backend.rb @@ -83,6 +83,192 @@ module MailHandler mail.header_string(header) end + # Number the attachments in depth first tree order, for use in URLs. + # XXX This fills in part.rfc822_attachment and part.url_part_number within + # all the parts of the email (see monkeypatches in lib/mail_handler/tmail_extensions and + # lib/mail_handler/mail_extensions for how these attributes are added). ensure_parts_counted + # must be called before using the attributes. + def ensure_parts_counted(mail) + mail.count_parts_count = 0 + _count_parts_recursive(mail, mail) + # we carry on using these numeric ids for attachments uudecoded from within text parts + mail.count_first_uudecode_count = mail.count_parts_count + end + def _count_parts_recursive(part, mail) + if part.multipart? + part.parts.each do |p| + _count_parts_recursive(p, mail) + end + else + part_filename = get_part_file_name(part) + begin + if part.content_type == 'message/rfc822' + # An email attached as text + # e.g. http://www.whatdotheyknow.com/request/64/response/102 + part.rfc822_attachment = mail_from_raw_email(part.body, decode=false) + elsif part.content_type == 'application/vnd.ms-outlook' || part_filename && AlaveteliFileTypes.filename_to_mimetype(part_filename) == 'application/vnd.ms-outlook' + # An email attached as an Outlook file + # e.g. http://www.whatdotheyknow.com/request/chinese_names_for_british_politi + msg = Mapi::Msg.open(StringIO.new(part.body)) + part.rfc822_attachment = mail_from_raw_email(msg.to_mime.to_s, decode=false) + elsif part.content_type == 'application/ms-tnef' + # A set of attachments in a TNEF file + part.rfc822_attachment = mail_from_tnef(part.body) + end + rescue + # If attached mail doesn't parse, treat it as text part + part.rfc822_attachment = nil + else + unless part.rfc822_attachment.nil? + _count_parts_recursive(part.rfc822_attachment, mail) + end + end + if part.rfc822_attachment.nil? + mail.count_parts_count += 1 + part.url_part_number = mail.count_parts_count + end + end + end + + def get_attachment_attributes(mail) + leaves = get_attachment_leaves(mail) + # XXX we have to call ensure_parts_counted after get_attachment_leaves + # which is really messy. + ensure_parts_counted(mail) + attachment_attributes = [] + for leaf in leaves + body = get_part_body(leaf) + # As leaf.body causes MIME decoding which uses lots of RAM, do garbage collection here + # to prevent excess memory use. XXX not really sure if this helps reduce + # peak RAM use overall. Anyway, maybe there is something better to do than this. + GC.start + if leaf.within_rfc822_attachment + within_rfc822_subject = leaf.within_rfc822_attachment.subject + # Test to see if we are in the first part of the attached + # RFC822 message and it is text, if so add headers. + # XXX should probably use hunting algorithm to find main text part, rather than + # just expect it to be first. This will do for now though. + if leaf.within_rfc822_attachment == leaf && leaf.content_type == 'text/plain' + headers = "" + for header in [ 'Date', 'Subject', 'From', 'To', 'Cc' ] + if leaf.within_rfc822_attachment.header.include?(header.downcase) + header_value = leaf.within_rfc822_attachment.header[header.downcase] + if !header_value.blank? + headers = headers + header + ": " + header_value.to_s + "\n" + end + end + end + # XXX call _convert_part_body_to_text here, but need to get charset somehow + # e.g. http://www.whatdotheyknow.com/request/1593/response/3088/attach/4/Freedom%20of%20Information%20request%20-%20car%20oval%20sticker:%20Article%2020,%20Convention%20on%20Road%20Traffic%201949.txt + body = headers + "\n" + body + + # This is quick way of getting all headers, but instead we only add some a) to + # make it more usable, b) as at least one authority accidentally leaked security + # information into a header. + #attachment.body = leaf.within_rfc822_attachment.port.to_s + end + end + attachment_attributes << {:url_part_number => leaf.url_part_number, + :content_type => get_content_type(leaf), + :filename => get_part_file_name(leaf), + :charset => leaf.charset, + :within_rfc822_subject => within_rfc822_subject, + :body => body, + :hexdigest => Digest::MD5.hexdigest(body) } + end + attachment_attributes + end + + # (This risks losing info if the unchosen alternative is the only one to contain + # useful info, but let's worry about that another time) + def get_attachment_leaves(mail) + return _get_attachment_leaves_recursive(mail, mail) + end + def _get_attachment_leaves_recursive(curr_mail, parent_mail, within_rfc822_attachment = nil) + leaves_found = [] + if curr_mail.multipart? + if curr_mail.parts.size == 0 + raise "no parts on multipart mail" + end + + if curr_mail.sub_type == 'alternative' + # Choose best part from alternatives + best_part = nil + # Take the last text/plain one, or else the first one + curr_mail.parts.each do |m| + if not best_part + best_part = m + elsif m.content_type == 'text/plain' + best_part = m + end + end + # Take an HTML one as even higher priority. (They tend + # to render better than text/plain, e.g. don't wrap links here: + # http://www.whatdotheyknow.com/request/amount_and_cost_of_freedom_of_in#incoming-72238 ) + curr_mail.parts.each do |m| + if m.content_type == 'text/html' + best_part = m + end + end + leaves_found += _get_attachment_leaves_recursive(best_part, parent_mail, within_rfc822_attachment) + else + # Add all parts + curr_mail.parts.each do |m| + leaves_found += _get_attachment_leaves_recursive(m, parent_mail, within_rfc822_attachment) + end + end + else + # XXX Yuck. this section alters various content_types. That puts + # it into conflict with ensure_parts_counted which it has to be + # called both before and after. It will fail with cases of + # attachments of attachments etc. + charset = curr_mail.charset # save this, because overwriting content_type also resets charset + # Don't allow nil content_types + if curr_mail.content_type.nil? + curr_mail.content_type = 'application/octet-stream' + end + # PDFs often come with this mime type, fix it up for view code + if curr_mail.content_type == 'application/octet-stream' + part_file_name = get_part_file_name(curr_mail) + part_body = get_part_body(curr_mail) + calc_mime = AlaveteliFileTypes.filename_and_content_to_mimetype(part_file_name, part_body) + if calc_mime + curr_mail.content_type = calc_mime + end + end + + # Use standard content types for Word documents etc. + curr_mail.content_type = normalise_content_type(curr_mail.content_type) + if curr_mail.content_type == 'message/rfc822' + ensure_parts_counted(parent_mail) # fills in rfc822_attachment variable + if curr_mail.rfc822_attachment.nil? + # Attached mail didn't parse, so treat as text + curr_mail.content_type = 'text/plain' + end + end + if curr_mail.content_type == 'application/vnd.ms-outlook' || curr_mail.content_type == 'application/ms-tnef' + ensure_parts_counted(parent_mail) # fills in rfc822_attachment variable + if curr_mail.rfc822_attachment.nil? + # Attached mail didn't parse, so treat as binary + curr_mail.content_type = 'application/octet-stream' + end + end + # If the part is an attachment of email + if curr_mail.content_type == 'message/rfc822' || curr_mail.content_type == 'application/vnd.ms-outlook' || curr_mail.content_type == 'application/ms-tnef' + ensure_parts_counted(parent_mail) # fills in rfc822_attachment variable + leaves_found += _get_attachment_leaves_recursive(curr_mail.rfc822_attachment, parent_mail, curr_mail.rfc822_attachment) + else + # Store leaf + curr_mail.within_rfc822_attachment = within_rfc822_attachment + leaves_found += [curr_mail] + end + # restore original charset + curr_mail.charset = charset + end + return leaves_found + end + + def address_from_name_and_email(name, email) if !MySociety::Validate.is_valid_email(email) raise "invalid email " + email + " passed to address_from_name_and_email" diff --git a/lib/mail_handler/backends/tmail_extensions.rb b/lib/mail_handler/backends/tmail_extensions.rb index 9359dfeea..3576a8eca 100644 --- a/lib/mail_handler/backends/tmail_extensions.rb +++ b/lib/mail_handler/backends/tmail_extensions.rb @@ -20,6 +20,8 @@ module TMail attr_accessor :url_part_number attr_accessor :rfc822_attachment # when a whole email message is attached as text attr_accessor :within_rfc822_attachment # for parts within a message attached as text (for getting subject mainly) + attr_accessor :count_parts_count + attr_accessor :count_first_uudecode_count # Monkeypatch! (check to see if this becomes a standard function in # TMail::Mail, then use that, whatever it is called) diff --git a/lib/mail_handler/mail_handler.rb b/lib/mail_handler/mail_handler.rb index 7b0f6e7f2..8b227b9ca 100644 --- a/lib/mail_handler/mail_handler.rb +++ b/lib/mail_handler/mail_handler.rb @@ -4,6 +4,7 @@ require 'tmpdir' module MailHandler if RUBY_VERSION.to_f >= 1.9 + require 'mail' require 'backends/mail_extensions' require 'backends/mail_backend' include Backends::MailBackend @@ -20,7 +21,7 @@ module MailHandler def tnef_attachments(content) attachments = [] Dir.mktmpdir do |dir| - IO.popen("#{`which tnef`.chomp} -K -C #{dir}", "w") do |f| + IO.popen("#{`which tnef`.chomp} -K -C #{dir}", "wb") do |f| f.write(content) f.close if $?.signaled? @@ -33,7 +34,7 @@ module MailHandler found = 0 Dir.new(dir).sort.each do |file| # sort for deterministic behaviour if file != "." && file != ".." - file_content = File.open("#{dir}/#{file}", "r").read + file_content = File.open("#{dir}/#{file}", "rb").read attachments << { :content => file_content, :filename => file } found += 1 @@ -46,6 +47,131 @@ module MailHandler attachments end + def normalise_content_type(content_type) + # e.g. http://www.whatdotheyknow.com/request/93/response/250 + if content_type == 'application/excel' or content_type == 'application/msexcel' or content_type == 'application/x-ms-excel' + content_type = 'application/vnd.ms-excel' + end + if content_type == 'application/mspowerpoint' or content_type == 'application/x-ms-powerpoint' + content_type = 'application/vnd.ms-powerpoint' + end + if content_type == 'application/msword' or content_type == 'application/x-ms-word' + content_type = 'application/vnd.ms-word' + end + if content_type == 'application/x-zip-compressed' + content_type = 'application/zip' + end + + # e.g. http://www.whatdotheyknow.com/request/copy_of_current_swessex_scr_opt#incoming-9928 + if content_type == 'application/acrobat' + content_type = 'application/pdf' + end + + return content_type + end + + def get_attachment_text_one_file(content_type, body, charset = 'utf-8') + # note re. charset: TMail always tries to convert email bodies + # to UTF8 by default, so normally it should already be that. + text = '' + # XXX - tell all these command line tools to return utf-8 + if content_type == 'text/plain' + text += body + "\n\n" + else + tempfile = Tempfile.new('foiextract') + tempfile.binmode + tempfile.print body + tempfile.flush + default_params = { :append_to => text, :binary_output => false } + if content_type == 'application/vnd.ms-word' + AlaveteliExternalCommand.run("wvText", tempfile.path, tempfile.path + ".txt") + # Try catdoc if we get into trouble (e.g. for InfoRequestEvent 2701) + if not File.exists?(tempfile.path + ".txt") + AlaveteliExternalCommand.run("catdoc", tempfile.path, default_params) + else + text += File.read(tempfile.path + ".txt") + "\n\n" + File.unlink(tempfile.path + ".txt") + end + elsif content_type == 'application/rtf' + # catdoc on RTF prodcues less comments and extra bumf than --text option to unrtf + AlaveteliExternalCommand.run("catdoc", tempfile.path, default_params) + elsif content_type == 'text/html' + # lynx wordwraps links in its output, which then don't + # get formatted properly by Alaveteli. We use elinks + # instead, which doesn't do that. + AlaveteliExternalCommand.run("elinks", "-eval", "set document.codepage.assume = \"#{charset}\"", + "-eval", "set document.codepage.force_assumed = 1", + "-dump-charset", "utf-8", + "-force-html", "-dump", + tempfile.path, + default_params.merge(:env => {"LANG" => "C"})) + elsif content_type == 'application/vnd.ms-excel' + # Bit crazy using /usr/bin/strings - but xls2csv, xlhtml and + # py_xls2txt only extract text from cells, not from floating + # notes. catdoc may be fooled by weird character sets, but will + # probably do for UK FOI requests. + AlaveteliExternalCommand.run("/usr/bin/strings", tempfile.path, default_params) + elsif content_type == 'application/vnd.ms-powerpoint' + # ppthtml seems to catch more text, but only outputs HTML when + # we want text, so just use catppt for now + AlaveteliExternalCommand.run("catppt", tempfile.path, default_params) + elsif content_type == 'application/pdf' + AlaveteliExternalCommand.run("pdftotext", tempfile.path, "-", default_params) + elsif content_type == 'application/vnd.openxmlformats-officedocument.wordprocessingml.document' + # This is Microsoft's XML office document format. + # Just pull out the main XML file, and strip it of text. + xml = AlaveteliExternalCommand.run("/usr/bin/unzip", "-qq", + "-c", + tempfile.path, + "word/document.xml", + {:binary_output => false}) + if !xml.nil? + doc = REXML::Document.new(xml) + text += doc.each_element( './/text()' ){}.join(" ") + end + elsif content_type == 'application/zip' + # recurse into zip files + begin + zip_file = Zip::ZipFile.open(tempfile.path) + text += get_attachment_text_from_zip_file(zip_file) + zip_file.close() + rescue + $stderr.puts("Error processing zip file: #{$!.inspect}") + end + end + tempfile.close + end + + return text + end + def get_attachment_text_from_zip_file(zip_file) + + text = "" + for entry in zip_file + if entry.file? + filename = entry.to_s + begin + body = entry.get_input_stream.read + rescue + # move to next attachment silently if there were problems + # XXX really should reduce this to specific exceptions? + # e.g. password protected + next + end + calc_mime = AlaveteliFileTypes.filename_to_mimetype(filename) + if calc_mime + content_type = calc_mime + else + content_type = 'application/octet-stream' + end + + text += get_attachment_text_one_file(content_type, body) + + end + end + return text + end + # Turn instance methods into class methods extend self diff --git a/spec/fixtures/files/dos-linebreaks.email b/spec/fixtures/files/dos-linebreaks.email new file mode 100644 index 000000000..1f5f1473f --- /dev/null +++ b/spec/fixtures/files/dos-linebreaks.email @@ -0,0 +1,31 @@ +From email@example.com Wed Mar 12 14:58:26 2008 +Return-path: email@example.com> +Envelope-to: request-xxx-xxxxxx@whatdotheyknow.com +Delivery-date: Wed, 12 Mar 2008 14:58:26 +0000 +Received: from example.com ([0.0.0.0]:1368 helo=example.com) + by tea.ukcod.org.uk with esmtp (Exim 4.50) + id 1JZSPS-0002yK-Rq + for request-60-3548031c@whatdotheyknow.com; Wed, 12 Mar 2008 14:58:26 +0000 +X-MimeOLE: Produced By Microsoft Exchange V0.0.0.0 +Content-class: urn:content-classes:message +MIME-Version: 1.0 +Content-Type: text/plain; + charset="us-ascii" +Content-Transfer-Encoding: quoted-printable +Disposition-Notification-To: "A Person" email@example.com> +Subject: RE: Freedom of Information request - Plans for the East Oxford Community Centre +Date: Wed, 12 Mar 2008 14:59:04 -0000 +Message-ID: <3D8BEC617D49EF45A9E6D103A83FD30331BF84@local> +X-MS-Has-Attach: +X-MS-TNEF-Correlator: +Thread-Topic: Freedom of Information request +Thread-Index: AciDziuIcYirFQ7GT36VyP2ABE14qgAg1c0w +From: "A Person" email@example.com> +To: FOI Person <EMAIL_TO> +X-OriginalArrivalTime: 12 Mar 2008 14:59:04.0368 (UTC) FILETIME=[9D245300:01C88451] +X-SEF-7853D99-ADF1-478E-8894-213D316B8FFA: 1 +X-SEF-Processed: 6_0_1_111__2008_03_12_14_59_05 + +Thank you for your Freedom of Information request. I have forwarded it=0D=0A= +to the relevant department for their reply.=0D=0A=0D=0A + diff --git a/spec/fixtures/files/many-attachments-date-header.email b/spec/fixtures/files/many-attachments-date-header.email new file mode 100644 index 000000000..a241e2456 --- /dev/null +++ b/spec/fixtures/files/many-attachments-date-header.email @@ -0,0 +1,451 @@ +From email@example.com Wed Apr 14 11:23:08 2010 +Return-path: <email@example.com> +Envelope-to: email@example.com +Delivery-date: Wed, 14 Apr 2010 11:23:08 +0100 +X-TM-IMSS-Message-ID:<email@example.com> +Received: from example.com ([0.0.0.0]) by example.com ([0.0.0.0]) with ESMTP (TREND IMSS SMTP Service 7.0) id 1ec0f7ac0002a77f ; Wed, 14 Apr 2010 11:22:52 +0100 +Received: from GWGATE-MTA by example.com + with Novell_GroupWise; Wed, 14 Apr 2010 11:22:53 +0100 +Message-Id: <email@example.com> +X-Mailer: Novell GroupWise Internet Agent 8.0.1 +Date: Wed, 14 Apr 2010 11:22:47 +0100 +From: "A Person" <email@example.com> +To: <email@example.com> +Cc: "FOI FOI" <email@example.com>, + "A Person" <email@example.com> +Subject: Fwd: Re: Freedom of Information request +References: <email@example.com> + <email@example.com> +Mime-Version: 1.0 +Content-Type: multipart/mixed; boundary="=__Part163C9567.0__=" + +This is a MIME message. If you are reading this text, you may want to +consider changing to a mail reader or gateway that understands how to +properly handle MIME multipart messages. + +--=__Part163C9567.0__= +Content-Type: text/plain; charset=US-ASCII +Content-Transfer-Encoding: quoted-printable +Content-Disposition: inline + +Some information + + +--=__Part163C9567.0__= +Content-Type: message/rfc822 + +Date: Wed, 10 Mar 2010 14:17:52 +0000 +From: "A Person" <email@example.com> +To: "A Person" <email@example.com> +Subject: Re: xxx +Mime-Version: 1.0 +Content-Type: text/plain; charset=US-ASCII +Content-Transfer-Encoding: quoted-printable +Content-Disposition: inline + +2 + +--=__Part163C9567.0__= +Content-Type: message/rfc822 + +Return-path: <email@example.com> +Received: from example.com ([0.0.0.0]) + by example.com with ESMTP; Tue, 24 Nov 2009 10:45:58 +0000 +X-TM-IMSS-Message-ID:<email@example.com> +Received: from example.com ([0.0.0.0]) by example.com ([0.0.0.0]) with SMTP (TREND IMSS SMTP Service 7.0) id 00660acd00000f42 ; Tue, 24 Nov 2009 10:45:55 +0000 +Received: from source ([0.0.0.0]) (using TLSv1) by eu1sys200aob115.postini.com ([0.0.0.0]) with SMTP + ID email@example.com Tue, 24 Nov 2009 10:45:56 UTC +Received: from example.com ([::1]) by + example.com ([::1]) with mapi; Tue, 24 Nov 2009 + 10:45:53 +0000 +From: A Person <email@example.com> +To: email@example.com <email@example.com> +Date: Tue, 24 Nov 2009 10:45:52 +0000 +Subject: example +Thread-Topic: example +Thread-Index: AcpnbI2i+XAmfHFVTFy0eGDpVJhXoQFhVeZw +Message-ID: <email@example.com> +Accept-Language: en-US, en-GB +Content-Language: en-US +X-MS-Has-Attach: yes +X-MS-TNEF-Correlator: +acceptlanguage: en-US, en-GB +Content-Type: multipart/mixed; + boundary="_006_B3BDF1D06801114FA040D0F1BEE7CF9C48FD3549evs02ukcommonpu_" +MIME-Version: 1.0 +X-TM-AS-Product-Ver: IMSS-0.0.0.0-0.0.0.0-17028.005 +X-TM-AS-Result: No--19.329-5.0-31-1 +X-imss-scan-details: No--19.329-5.0-31-1 +X-TM-AS-User-Approved-Sender: No +X-TM-AS-User-Blocked-Sender: No + + +--_006_B3BDF1D06801114FA040D0F1BEE7CF9C48FD3549evs02ukcommonpu_ +Content-Type: multipart/related; + boundary="_005_B3BDF1D06801114FA040D0F1BEE7CF9C48FD3549evs02ukcommonpu_"; + type="multipart/alternative" + +--_005_B3BDF1D06801114FA040D0F1BEE7CF9C48FD3549evs02ukcommonpu_ +Content-Type: multipart/alternative; + boundary="_000_B3BDF1D06801114FA040D0F1BEE7CF9C48FD3549evs02ukcommonpu_" + +--_000_B3BDF1D06801114FA040D0F1BEE7CF9C48FD3549evs02ukcommonpu_ +Content-Type: text/plain; charset="iso-8859-1" +Content-Transfer-Encoding: quoted-printable + + +3 + +--_000_B3BDF1D06801114FA040D0F1BEE7CF9C48FD3549evs02ukcommonpu_ +Content-Type: text/html; charset="iso-8859-1" +Content-Transfer-Encoding: quoted-printable + +4 + +--_000_B3BDF1D06801114FA040D0F1BEE7CF9C48FD3549evs02ukcommonpu_-- + +--_005_B3BDF1D06801114FA040D0F1BEE7CF9C48FD3549evs02ukcommonpu_ +Content-Type: image/gif; name="image001.gif" +Content-Description: image001.gif +Content-Disposition: inline; filename="image001.gif"; size=5445; + creation-date="Tue, 17 Nov 2009 09:58:46 GMT"; + modification-date="Tue, 17 Nov 2009 09:58:46 GMT" +Content-ID: <email@example.com> +Content-Transfer-Encoding: base64 + +5 +--_005_B3BDF1D06801114FA040D0F1BEE7CF9C48FD3549evs02ukcommonpu_-- + +--_006_B3BDF1D06801114FA040D0F1BEE7CF9C48FD3549evs02ukcommonpu_ +Content-Type: application/vnd.ms-excel; + name="particpant list.xls" +Content-Description: particpant list.xls +Content-Disposition: attachment; + filename="particpant list.xls"; size=21504; + creation-date="Mon, 02 Nov 2009 09:42:37 GMT"; + modification-date="Tue, 24 Nov 2009 10:45:52 GMT" +Content-Transfer-Encoding: base64 + +6 + +--_006_B3BDF1D06801114FA040D0F1BEE7CF9C48FD3549evs02ukcommonpu_-- +--=__Part163C9567.0__= +Content-Type: message/rfc822 + +Return-path: <email@example.com> +Received: from example.com ([0.0.0.0]) + by example.com with ESMTP; Thu, 03 Dec 2009 09:29:07 +0000 +X-TM-IMSS-Message-ID:<email@example.com> +Received: from eu1sys200aog116.obsmtp.com ([0.0.0.0]) by example.com ([0.0.0.0]) with SMTP (TREND IMSS SMTP Service 7.0) id 0ac1bf1b0001116e ; Thu, 3 Dec 2009 09:29:04 +0000 +Received: from source ([0.0.0.0]) (using TLSv1) by eu1sys200aob116.postini.com ([0.0.0.0]) with SMTP + ID email@example.com Thu, 03 Dec 2009 09:29:06 UTC +Received: from example.com ([::1]) by + example.com ([::1]) with mapi; Thu, 3 Dec 2009 09:29:03 + +0000 +From: A Person <email@example.com> +To: 'A Person' <email@example.com> +Date: Thu, 3 Dec 2009 09:29:03 +0000 +Subject: RE: example +Thread-Topic: example +Thread-Index: AcpuoEyRvzM8fXw+THuj/617pjnvCgFWqZdQ +Message-ID: <email@example.com> +References: <email@example.com> + <email@example.com> +In-Reply-To: <email@example.com> +Accept-Language: en-US, en-GB +Content-Language: en-US +X-MS-Has-Attach: +X-MS-TNEF-Correlator: +acceptlanguage: en-US, en-GB +Content-Type: text/plain; charset="utf-8" +Content-Transfer-Encoding: base64 +MIME-Version: 1.0 +X-TM-AS-Product-Ver: IMSS-0.0.0.0-0.0.0.0-17046.004 +X-TM-AS-Result: No--16.791-5.0-31-1 +X-imss-scan-details: No--16.791-5.0-31-1 +X-TM-AS-User-Approved-Sender: No +X-TM-AS-User-Blocked-Sender: No + + +7 +--=__Part163C9567.0__= +Content-Type: message/rfc822 + +Return-path: <email@example.com> +Received: from example.com ([0.0.0.0]) + by example.com with ESMTP; Wed, 25 Nov 2009 22:26:23 +0000 +X-TM-IMSS-Message-ID:<email@example.com> +Received: from eu1sys200aog105.obsmtp.com ([0.0.0.0]) by example.com ([0.0.0.0]) with SMTP (TREND IMSS SMTP Service 7.0) id 034354c900007016 ; Wed, 25 Nov 2009 22:26:19 +0000 +Received: from source ([0.0.0.0]) (using TLSv1) by eu1sys200aob105.postini.com ([0.0.0.0]) with SMTP + ID email@example.com Wed, 25 Nov 2009 22:26:21 UTC +Received: from example.com ([::1]) by + example.com ([::1]) with mapi; Wed, 25 Nov 2009 + 22:26:15 +0000 +From: A Person <email@example.com> +To: email@example.com <email@example.com> +CC: A Person <email@example.com> +Date: Wed, 25 Nov 2009 22:26:12 +0000 +Subject: As promised - Masterclass info (example) +Thread-Topic: As promised - Masterclass info (example) +Thread-Index: AcpuHcJ4yrR8PBHZTVCU/RLGzwqsDAAACGwQ +Message-ID: <email@example.com> +Accept-Language: en-US, en-GB +Content-Language: en-US +X-MS-Has-Attach: yes +X-MS-TNEF-Correlator: +acceptlanguage: en-US, en-GB +Content-Type: multipart/mixed; + boundary="_007_B3BDF1D06801114FA040D0F1BEE7CF9C48DC6D82evs02ukcommonpu_" +MIME-Version: 1.0 +X-TM-AS-Product-Ver: IMSS-0.0.0.0-0.0.0.0-17032.000 +X-TM-AS-Result: No--26.167-5.0-31-1 +X-imss-scan-details: No--26.167-5.0-31-1 +X-TM-AS-User-Approved-Sender: No +X-TM-AS-User-Blocked-Sender: No + + +--_007_B3BDF1D06801114FA040D0F1BEE7CF9C48DC6D82evs02ukcommonpu_ +Content-Type: multipart/related; + boundary="_006_B3BDF1D06801114FA040D0F1BEE7CF9C48DC6D82evs02ukcommonpu_"; + type="multipart/alternative" + +--_006_B3BDF1D06801114FA040D0F1BEE7CF9C48DC6D82evs02ukcommonpu_ +Content-Type: multipart/alternative; + boundary="_000_B3BDF1D06801114FA040D0F1BEE7CF9C48DC6D82evs02ukcommonpu_" + +--_000_B3BDF1D06801114FA040D0F1BEE7CF9C48DC6D82evs02ukcommonpu_ +Content-Type: text/plain; charset="utf-8" +Content-Transfer-Encoding: base64 + +8 +--_000_B3BDF1D06801114FA040D0F1BEE7CF9C48DC6D82evs02ukcommonpu_ +Content-Type: text/html; charset="utf-8" +Content-Transfer-Encoding: base64 + +9 +--_000_B3BDF1D06801114FA040D0F1BEE7CF9C48DC6D82evs02ukcommonpu_-- + +--_006_B3BDF1D06801114FA040D0F1BEE7CF9C48DC6D82evs02ukcommonpu_ +Content-Type: image/gif; name="image001.gif" +Content-Description: image001.gif +Content-Disposition: inline; filename="image001.gif"; size=5445; + creation-date="Wed, 25 Nov 2009 22:26:14 GMT"; + modification-date="Wed, 25 Nov 2009 22:26:14 GMT" +Content-ID: <email@example.com> +Content-Transfer-Encoding: base64 + + +10 + +--_006_B3BDF1D06801114FA040D0F1BEE7CF9C48DC6D82evs02ukcommonpu_-- + +--_007_B3BDF1D06801114FA040D0F1BEE7CF9C48DC6D82evs02ukcommonpu_ +Content-Type: application/msword; + name= "Participant List.doc" +Content-Description: Participant List.doc +Content-Disposition: attachment; + filename="Participant List.doc"; size=112640; + creation-date="Wed, 25 Nov 2009 22:17:24 GMT"; + modification-date="Wed, 25 Nov 2009 11:43:48 GMT" +Content-Transfer-Encoding: base64 + +11 +--_007_B3BDF1D06801114FA040D0F1BEE7CF9C48DC6D82evs02ukcommonpu_ +Content-Type: application/msword; name="Information & Booking Form.doc" +Content-Description: Information & Booking Form.doc +Content-Disposition: attachment; filename="Information & Booking Form.doc"; size=84480; + creation-date="Wed, 25 Nov 2009 22:17:40 GMT"; + modification-date="Wed, 04 Nov 2009 14:42:54 GMT" +Content-Transfer-Encoding: base64 + +12 + +--_007_B3BDF1D06801114FA040D0F1BEE7CF9C48DC6D82evs02ukcommonpu_-- +--=__Part163C9567.0__= +Content-Type: message/rfc822 + +Return-path: <email@example.com> +Received: from example.com ([0.0.0.0]) + by example.com with ESMTP; Fri, 04 Dec 2009 10:00:05 +0000 +X-TM-IMSS-Message-ID:<email@example.com> +Received: from eu1sys200aog109.obsmtp.com ([0.0.0.0]) by example.com ([0.0.0.0]) with SMTP (TREND IMSS SMTP Service 7.0) id 100473260001a476 ; Fri, 4 Dec 2009 10:00:01 +0000 +Received: from source ([0.0.0.0]) (using TLSv1) by eu1sys200aob109.postini.com ([0.0.0.0]) with SMTP + ID email@example.com Fri, 04 Dec 2009 10:00:04 UTC +Received: from example.com ([::1]) by + example.com ([::1]) with mapi; Fri, 4 Dec 2009 10:00:01 + +0000 +From: A Person <email@example.com> +To: email@example.com <email@example.com> +Date: Fri, 4 Dec 2009 10:00:01 +0000 +Subject: Re: As promised - info (example) +Thread-Topic: As promised - info (example) +Thread-Index: AcpzhLeBjBId8eZATYudOfBgN6CPXQBQ9Pok +Message-ID: <email@example.com> +Accept-Language: en-US, en-GB +Content-Language: en-US +X-MS-Has-Attach: +X-MS-TNEF-Correlator: +acceptlanguage: en-US, en-GB +Content-Type: text/plain; charset="utf-8" +Content-Transfer-Encoding: base64 +MIME-Version: 1.0 +X-TM-AS-Product-Ver: IMSS-0.0.0.0-0.0.0.0-17048.005 +X-TM-AS-Result: No--24.171-5.0-31-1 +X-imss-scan-details: No--24.171-5.0-31-1 +X-TM-AS-User-Approved-Sender: No +X-TM-AS-User-Blocked-Sender: No + +13 +--=__Part163C9567.0__= +Content-Type: message/rfc822 + +Return-path: <email@example.com> +Received: from example.com ([0.0.0.0]) + by example.com with ESMTP; Sun, 21 Mar 2010 21:53:38 +0000 +X-TM-IMSS-Message-ID:<email@example.com> +Received: from eu1sys200aog117.obsmtp.com ([0.0.0.0]) by example.com ([0.0.0.0]) with SMTP (TREND IMSS SMTP Service 7.0) id 1e3611c1000d37df ; Sun, 21 Mar 2010 21:53:32 +0000 +Received: from source ([0.0.0.0]) (using TLSv1) by eu1sys200aob117.postini.com ([0.0.0.0]) with SMTP + ID email@example.com Sun, 21 Mar 2010 21:53:37 UTC +Received: from example.com ([::1]) by exchhub01 + ([0.0.0.0]) with mapi; Sun, 21 Mar 2010 21:53:34 +0000 +From: A Person <email@example.com> +To: email@example.com <email@example.com> +CC: A Person <email@example.com> +Date: Sun, 21 Mar 2010 21:53:32 +0000 +Subject: Thank you from example +Thread-Topic: Thank you from example +Thread-Index: AcrJQPL4xb9zjXMHRJGTjAxo3X/kfA== +Message-ID: <email@example.com> +Accept-Language: en-US, en-GB +Content-Language: en-US +X-MS-Has-Attach: yes +X-MS-TNEF-Correlator: +acceptlanguage: en-US, en-GB +Content-Type: multipart/related; + boundary="_004_B3BDF1D06801114FA040D0F1BEE7CF9C5E14635Bevs02ukcommonpu_"; + type="multipart/alternative" +MIME-Version: 1.0 +X-TM-AS-Product-Ver: IMSS-0.0.0.0-0.0.0.0-17266.002 +X-TM-AS-Result: No--26.373-5.0-31-1 +X-imss-scan-details: No--26.373-5.0-31-1 +X-TM-AS-User-Approved-Sender: No +X-TM-AS-User-Blocked-Sender: No + + +--_004_B3BDF1D06801114FA040D0F1BEE7CF9C5E14635Bevs02ukcommonpu_ +Content-Type: multipart/alternative; + boundary="_000_B3BDF1D06801114FA040D0F1BEE7CF9C5E14635Bevs02ukcommonpu_" + +--_000_B3BDF1D06801114FA040D0F1BEE7CF9C5E14635Bevs02ukcommonpu_ +Content-Type: text/plain; charset="us-ascii" +Content-Transfer-Encoding: quoted-printable + +14 + +--_000_B3BDF1D06801114FA040D0F1BEE7CF9C5E14635Bevs02ukcommonpu_ +Content-Type: text/html; charset="us-ascii" +Content-Transfer-Encoding: quoted-printable + +15 + +--_000_B3BDF1D06801114FA040D0F1BEE7CF9C5E14635Bevs02ukcommonpu_-- + +--_004_B3BDF1D06801114FA040D0F1BEE7CF9C5E14635Bevs02ukcommonpu_ +Content-Type: image/gif; name="image001.gif" +Content-Description: image001.gif +Content-Disposition: inline; filename="image001.gif"; size=5445; + creation-date="Sun, 21 Mar 2010 21:53:33 GMT"; + modification-date="Sun, 21 Mar 2010 21:53:33 GMT" +Content-ID: <email@example.com> +Content-Transfer-Encoding: base64 + +16 +--_004_B3BDF1D06801114FA040D0F1BEE7CF9C5E14635Bevs02ukcommonpu_-- +--=__Part163C9567.0__= +Content-Type: message/rfc822 + +Return-path: <email@example.com> +Received: from example.com ([0.0.0.0]) + by example.com with ESMTP; Tue, 23 Feb 2010 15:33:48 +0000 +X-TM-IMSS-Message-ID:<email@example.com> +Received: from eu1sys200aog112.obsmtp.com ([0.0.0.0]) by example.com ([0.0.0.0]) with SMTP (TREND IMSS SMTP Service 7.0) id 96f54043000f2e72 ; Tue, 23 Feb 2010 15:33:48 +0000 +Received: from source ([0.0.0.0]) by eu1sys200aob112.postini.com ([0.0.0.0]) with SMTP + ID email@example.com Tue, 23 Feb 2010 15:33:47 UTC +Received: from gla-002561-lap ([0.0.0.0]) by example.com with Microsoft SMTPSVC(0.0.0.0); + Tue, 23 Feb 2010 15:33:46 +0000 +Reply-To: email@example.com +From: email@example.com +To: email@example.com +Subject: example - Meeting - Tuesday 2nd March +Date: 23 February 2010 15:33 +X-Mailer: Internet Professional v1.15 +Return-Path: email@example.com +Message-ID: <email@example.com> +X-OriginalArrivalTime: 23 Feb 2010 15:33:46.0648 (UTC) FILETIME=[96CEC980:01CAB49D] +X-TM-AS-Product-Ver: IMSS-0.0.0.0-0.0.0.0-17212.000 +X-TM-AS-Result: No--16.146-5.0-31-1 +X-imss-scan-details: No--16.146-5.0-31-1 +X-TM-AS-User-Approved-Sender: No +X-TM-AS-User-Blocked-Sender: No + +17 + +--=__Part163C9567.0__= +Content-Type: message/rfc822 + +Return-path: <email@example.com> +Received: from example.com ([0.0.0.0]) + by example.com with ESMTP; Mon, 08 Mar 2010 09:21:42 +0000 +X-TM-IMSS-Message-ID:<email@example.com> +Received: from eu1sys200aog117.obsmtp.com ([0.0.0.0]) by example.com ([0.0.0.0]) with SMTP (TREND IMSS SMTP Service 7.0) id d8931aff001580d6 ; Mon, 8 Mar 2010 09:21:40 +0000 +Received: from source ([0.0.0.0]) by eu1sys200aob117.postini.com ([0.0.0.0]) with SMTP + ID email@example.com Mon, 08 Mar 2010 09:21:39 UTC +Received: from gla-002561-lap ([0.0.0.0]) by example.com with Microsoft SMTPSVC(0.0.0.0); + Mon, 8 Mar 2010 09:21:36 +0000 +Reply-To: email@example.com +From: email@example.com +To: email@example.com +Subject: example - Help needed +Date: 08 March 2010 09:21 +X-Mailer: Internet Professional v1.15 +MIME-Version: 1.0 +Content-Type: multipart/mixed;boundary="_NextPart_00009D35-00000F3C-00271781-26DF" +Return-Path: email@example.com +Message-ID: <email@example.com> +X-OriginalArrivalTime: 08 Mar 2010 09:21:36.0283 (UTC) FILETIME=[C03E3EB0:01CABEA0] +X-TM-AS-Product-Ver: IMSS-0.0.0.0-0.0.0.0-17236.006 +X-TM-AS-Result: No--32.111-5.0-31-1 +X-imss-scan-details: No--32.111-5.0-31-1 +X-TM-AS-User-Approved-Sender: No +X-TM-AS-User-Blocked-Sender: No + +This message is in MIME format. Since your mail reader does not +understand this format, some or all of this message may not be legible. +--_NextPart_00009D35-00000F3C-00271781-26DF +Content-Type: text/plain +Content-Transfer-Encoding: 7bit + +18 + +--_NextPart_00009D35-00000F3C-00271781-26DF +Content-Type: application/octet-stream;name="Information Pack.pdf" +Content-Transfer-Encoding: base64 +Content-Disposition: attachment;filename="Information Pack.pdf";size=106688 + +19 +--_NextPart_00009D35-00000F3C-00271781-26DF-- +--=__Part163C9567.0__= +Content-Type: message/rfc822 + +Date: Wed, 02 Dec 2009 19:21:27 +0000 +From: "A Person" <email@example.com> +To: "A Person" <email@example.com> +Subject: Re: As promised - info (example) +Mime-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit +Content-Disposition: inline + +20 +--=__Part163C9567.0__=-- + diff --git a/spec/lib/mail_handler/mail_handler_spec.rb b/spec/lib/mail_handler/mail_handler_spec.rb index 7eeba47e0..ae65210f2 100644 --- a/spec/lib/mail_handler/mail_handler_spec.rb +++ b/spec/lib/mail_handler/mail_handler_spec.rb @@ -250,4 +250,134 @@ describe 'when getting header strings' do '9; Autoresponder') end -end
\ No newline at end of file +end + +describe "when parsing HTML mail" do + it "should display UTF-8 characters in the plain text version correctly" do + html = "<html><b>foo</b> është" + plain_text = MailHandler.get_attachment_text_one_file('text/html', html) + plain_text.should match(/është/) + end + +end + +describe "when getting the attachment text" do + it "should not raise an error if the expansion of a zip file raises an error" do + mock_entry = mock('ZipFile entry', :file? => true) + mock_entries = [mock_entry] + mock_entries.stub!(:close) + mock_entry.stub!(:get_input_stream).and_raise("invalid distance too far back") + Zip::ZipFile.stub!(:open).and_return(mock_entries) + MailHandler.get_attachment_text_one_file('application/zip', "some string") + end + +end + +describe 'when getting attachment attributes' do + + it 'should get two attachment parts from a multipart mail with text and html alternatives + and an image' do + mail = get_fixture_mail('quoted-subject-iso8859-1.email') + attributes = MailHandler.get_attachment_attributes(mail) + attributes.size.should == 2 + end + + it 'should expand a mail attached as text' do + mail = get_fixture_mail('rfc822-attachment.email') + attributes = MailHandler.get_attachment_attributes(mail) + attributes.size.should == 2 + rfc_attachment = attributes[1] + rfc_attachment[:within_rfc822_subject].should == 'Freedom of Information request' + headers = ['Date: Thu, 13 Mar 2008 16:57:33 +0000', + 'Subject: Freedom of Information request', + 'From: An FOI Officer <foi.officer@example.com>', + 'To: request-bounce-xx-xxxxx@whatdotheyno.com'] + rfc_attachment[:body].should == "#{headers.join("\n")}\n\nsome example text" + end + + it 'should handle a mail which causes Tmail to generate a blank header value' do + mail = get_fixture_mail('many-attachments-date-header.email') + attributes = MailHandler.get_attachment_attributes(mail) + end + + it 'should produce a consistent set of url_part_numbers, content_types, within_rfc822_subjects + and filenames from an example mail with lots of attachments' do + mail = get_fixture_mail('many-attachments-date-header.email') + attributes = MailHandler.get_attachment_attributes(mail) + + expected_attributes = [ { :content_type=>"text/plain", + :url_part_number=>1, + :within_rfc822_subject=>nil, + :filename=>nil}, + { :content_type=>"text/plain", + :url_part_number=>2, + :within_rfc822_subject=>"Re: xxx", + :filename=>nil}, + { :content_type=>"text/html", + :url_part_number=>4, + :within_rfc822_subject=>"example", + :filename=>nil}, + { :content_type=>"image/gif", :url_part_number=>5, + :within_rfc822_subject=>"example", + :filename=>"image001.gif"}, + { :content_type=>"application/vnd.ms-excel", + :url_part_number=>6, + :within_rfc822_subject=>"example", + :filename=>"particpant list.xls"}, + { :content_type=>"text/plain", + :url_part_number=>7, + :within_rfc822_subject=>"RE: example", + :filename=>nil}, + { :content_type=>"text/html", + :url_part_number=>9, + :within_rfc822_subject=>"As promised - Masterclass info (example)", + :filename=>nil}, + { :content_type=>"image/gif", + :url_part_number=>10, + :within_rfc822_subject=>"As promised - Masterclass info (example)", + :filename=>"image001.gif"}, + { :content_type=>"application/vnd.ms-word", + :url_part_number=>11, + :within_rfc822_subject=>"As promised - Masterclass info (example)", + :filename=>"Participant List.doc"}, + { :content_type=>"application/vnd.ms-word", + :url_part_number=>12, + :within_rfc822_subject=>"As promised - Masterclass info (example)", + :filename=>"Information & Booking Form.doc"}, + { :content_type=>"text/plain", + :url_part_number=>13, + :within_rfc822_subject=>"Re: As promised - info (example)", + :filename=>nil}, + { :content_type=>"text/html", + :url_part_number=>15, + :within_rfc822_subject=>"Thank you from example", + :filename=>nil}, + { :content_type=>"image/gif", + :url_part_number=>16, + :within_rfc822_subject=>"Thank you from example", + :filename=>"image001.gif"}, + { :content_type=>"text/plain", + :url_part_number=>17, + :within_rfc822_subject=>"example - Meeting - Tuesday 2nd March", + :filename=>nil}, + { :content_type=>"text/plain", + :url_part_number=>18, + :within_rfc822_subject=>"example - Help needed", + :filename=>nil}, + { :content_type=>"application/pdf", + :url_part_number=>19, + :within_rfc822_subject=>"example - Help needed", + :filename=>"Information Pack.pdf"}, + { :content_type=>"text/plain", + :url_part_number=>20, + :within_rfc822_subject=>"Re: As promised - info (example)", + :filename=>nil} ] + + attributes.each_with_index do |attr, index| + attr.delete(:charset) + attr.delete(:body) + attr.delete(:hexdigest) + attr.should == expected_attributes[index] + end + end +end diff --git a/spec/models/incoming_message_spec.rb b/spec/models/incoming_message_spec.rb index 1278535f8..70b323e9f 100644 --- a/spec/models/incoming_message_spec.rb +++ b/spec/models/incoming_message_spec.rb @@ -68,6 +68,14 @@ describe IncomingMessage, " when dealing with incoming mail" do message.get_main_body_text_internal.should include("The above text was badly encoded") end + it 'should convert DOS-style linebreaks to Unix style' do + ir = info_requests(:fancy_dog_request) + receive_incoming_mail('dos-linebreaks.email', ir.incoming_email) + message = ir.incoming_messages[1] + message.parse_raw_email! + message.get_main_body_text_internal.should_not match(/\r\n/) + end + it "should fold multiline sections" do { "foo\n--------\nconfidential" => "foo\nFOLDED_QUOTED_SECTION\n", # basic test @@ -102,27 +110,6 @@ describe IncomingMessage, " when dealing with incoming mail" do end -describe IncomingMessage, "when parsing HTML mail" do - it "should display UTF-8 characters in the plain text version correctly" do - html = "<html><b>foo</b> është" - plain_text = IncomingMessage._get_attachment_text_internal_one_file('text/html', html) - plain_text.should match(/është/) - end - -end - -describe IncomingMessage, "when getting the attachment text" do - - it "should not raise an error if the expansion of a zip file raises an error" do - mock_entry = mock('ZipFile entry', :file? => true) - mock_entry.stub!(:get_input_stream).and_raise("invalid distance too far back") - Zip::ZipFile.stub!(:open).and_return([mock_entry]) - IncomingMessage._get_attachment_text_internal_one_file('application/zip', "some string") - end - -end - - describe IncomingMessage, " display attachments" do it "should not show slashes in filenames" do @@ -138,7 +125,7 @@ describe IncomingMessage, " display attachments" do # http://www.whatdotheyknow.com/request/post_commercial_manager_librarie#incoming-17233 foi_attachment.within_rfc822_subject = "FOI/09/066 RESPONSE TO FOI REQUEST RECEIVED 21st JANUARY 2009" foi_attachment.content_type = 'text/plain' - foi_attachment.ensure_filename! + foi_attachment.ensure_filename! expected_display_filename = foi_attachment.within_rfc822_subject.gsub(/\//, " ") + ".txt" foi_attachment.display_filename.should == expected_display_filename end @@ -326,12 +313,12 @@ describe IncomingMessage, " when censoring data" do orig_pdf = load_file_fixture('tfl.pdf') pdf = orig_pdf.dup - orig_text = IncomingMessage._get_attachment_text_internal_one_file('application/pdf', pdf) + orig_text = MailHandler.get_attachment_text_one_file('application/pdf', pdf) orig_text.should match(/foi@tfl.gov.uk/) @im.binary_mask_stuff!(pdf, "application/pdf") - masked_text = IncomingMessage._get_attachment_text_internal_one_file('application/pdf', pdf) + masked_text = MailHandler.get_attachment_text_one_file('application/pdf', pdf) masked_text.should_not match(/foi@tfl.gov.uk/) masked_text.should match(/xxx@xxx.xxx.xx/) config['USE_GHOSTSCRIPT_COMPRESSION'] = previous |