aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--.travis.yml1
-rw-r--r--app/models/foi_attachment.rb64
-rw-r--r--app/models/incoming_message.rb416
-rw-r--r--lib/mail_handler/backends/mail_backend.rb209
-rw-r--r--lib/mail_handler/backends/mail_extensions.rb60
-rw-r--r--lib/mail_handler/backends/tmail_backend.rb186
-rw-r--r--lib/mail_handler/backends/tmail_extensions.rb2
-rw-r--r--lib/mail_handler/mail_handler.rb130
-rw-r--r--spec/fixtures/files/dos-linebreaks.email31
-rw-r--r--spec/fixtures/files/many-attachments-date-header.email451
-rw-r--r--spec/lib/mail_handler/mail_handler_spec.rb132
-rw-r--r--spec/models/incoming_message_spec.rb35
12 files changed, 1331 insertions, 386 deletions
diff --git a/.travis.yml b/.travis.yml
index c5e63aeb3..493663940 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -5,6 +5,7 @@ branches:
- master
rvm:
- 1.8.7
+ - 1.9.3
before_install:
- gem install rubygems-update --version=1.6.2
- gem update --system 1.6.2
diff --git a/app/models/foi_attachment.rb b/app/models/foi_attachment.rb
index 7c4c3226f..2f8a9ab04 100644
--- a/app/models/foi_attachment.rb
+++ b/app/models/foi_attachment.rb
@@ -67,9 +67,22 @@ class FoiAttachment < ActiveRecord::Base
file.write d
}
update_display_size!
+ encode_cached_body!
@cached_body = d
end
+ # If the original mail part had a charset, it's some kind of string, so assume that
+ # it should be handled as a string in the stated charset, not a bytearray, and then
+ # convert it to our default encoding. For ruby 1.8 this is a noop.
+ def encode_cached_body!
+ if RUBY_VERSION.to_f >= 1.9
+ if charset
+ @cached_body.force_encoding(charset)
+ @cached_body = @cached_body.encode(Encoding.default_internal, charset)
+ end
+ end
+ end
+
def body
if @cached_body.nil?
tries = 0
@@ -90,6 +103,7 @@ class FoiAttachment < ActiveRecord::Base
self.incoming_message.parse_raw_email!(force)
retry
end
+ encode_cached_body!
end
return @cached_body
end
@@ -310,32 +324,42 @@ class FoiAttachment < ActiveRecord::Base
# the extractions will also produce image files, which go in the
# current directory, so change to the directory the function caller
# wants everything in
- Dir.chdir(dir) do
- tempfile = Tempfile.new('foiextract', '.')
- tempfile.print self.body
- tempfile.flush
-
- html = nil
- if self.content_type == 'application/pdf'
- # We set a timeout here, because pdftohtml can spiral out of control
- # on some PDF files and we don’t want to crash the whole server.
- html = AlaveteliExternalCommand.run("pdftohtml", "-nodrm", "-zoom", "1.0", "-stdout", "-enc", "UTF-8", "-noframes", tempfile.path, :timeout => 30)
- elsif self.content_type == 'application/rtf'
- html = AlaveteliExternalCommand.run("unrtf", "--html", tempfile.path, :timeout => 120)
- end
- if html.nil?
- if self.has_google_docs_viewer?
- html = '' # force error and using Google docs viewer
+ html = nil
+ if ['application/pdf', 'application/rtf'].include?(self.content_type)
+ text = self.body
+ Dir.chdir(dir) do
+ if RUBY_VERSION.to_f >= 1.9
+ tempfile = Tempfile.new('foiextract', '.', :encoding => text.encoding)
else
- raise "No HTML conversion available for type " + self.content_type
+ tempfile = Tempfile.new('foiextract', '.')
end
- end
+ tempfile.print text
+ tempfile.flush
- tempfile.close
- tempfile.delete
+
+ if self.content_type == 'application/pdf'
+ # We set a timeout here, because pdftohtml can spiral out of control
+ # on some PDF files and we don't want to crash the whole server.
+ html = AlaveteliExternalCommand.run("pdftohtml", "-nodrm", "-zoom", "1.0", "-stdout", "-enc", "UTF-8", "-noframes", tempfile.path, :timeout => 30)
+ elsif self.content_type == 'application/rtf'
+ html = AlaveteliExternalCommand.run("unrtf", "--html", tempfile.path, :timeout => 120)
+ end
+
+ tempfile.close
+ tempfile.delete
+ end
+ end
+ if html.nil?
+ if self.has_google_docs_viewer?
+ html = '' # force error and using Google docs viewer
+ else
+ raise "No HTML conversion available for type " + self.content_type
+ end
end
+
+
# We need to look at:
# a) Any error code
# b) The output size, as pdftohtml does not return an error code upon error.
diff --git a/app/models/incoming_message.rb b/app/models/incoming_message.rb
index 464910d01..a02d2456a 100644
--- a/app/models/incoming_message.rb
+++ b/app/models/incoming_message.rb
@@ -125,9 +125,9 @@ class IncomingMessage < ActiveRecord::Base
self.sent_at = self.mail.date || self.created_at
self.subject = self.mail.subject
self.mail_from = MailHandler.get_from_name(self.mail)
- begin
+ if self.from_email
self.mail_from_domain = PublicBody.extract_domain_from_email(self.from_email)
- rescue NoMethodError
+ else
self.mail_from_domain = ""
end
self.valid_to_reply_to = self._calculate_valid_to_reply_to
@@ -173,54 +173,8 @@ class IncomingMessage < ActiveRecord::Base
super
end
- # Number the attachments in depth first tree order, for use in URLs.
- # XXX This fills in part.rfc822_attachment and part.url_part_number within
- # all the parts of the email (see monkeypatches in lib/mail_handler/tmail_extensions and
- # lib/mail_handler/mail_extensions for how these attributes are added). ensure_parts_counted
- # must be called before using the attributes.
- def ensure_parts_counted
- @count_parts_count = 0
- _count_parts_recursive(self.mail)
- # we carry on using these numeric ids for attachments uudecoded from within text parts
- @count_first_uudecode_count = @count_parts_count
- end
- def _count_parts_recursive(part)
- if part.multipart?
- part.parts.each do |p|
- _count_parts_recursive(p)
- end
- else
- part_filename = MailHandler.get_part_file_name(part)
- begin
- if part.content_type == 'message/rfc822'
- # An email attached as text
- # e.g. http://www.whatdotheyknow.com/request/64/response/102
- part.rfc822_attachment = MailHandler.mail_from_raw_email(part.body, decode=false)
- elsif part.content_type == 'application/vnd.ms-outlook' || part_filename && AlaveteliFileTypes.filename_to_mimetype(part_filename) == 'application/vnd.ms-outlook'
- # An email attached as an Outlook file
- # e.g. http://www.whatdotheyknow.com/request/chinese_names_for_british_politi
- msg = Mapi::Msg.open(StringIO.new(part.body))
- part.rfc822_attachment = MailHandler.mail_from_raw_email(msg.to_mime.to_s, decode=false)
- elsif part.content_type == 'application/ms-tnef'
- # A set of attachments in a TNEF file
- part.rfc822_attachment = MailHandler.mail_from_tnef(part.body)
- end
- rescue
- # If attached mail doesn't parse, treat it as text part
- part.rfc822_attachment = nil
- else
- unless part.rfc822_attachment.nil?
- _count_parts_recursive(part.rfc822_attachment)
- end
- end
- if part.rfc822_attachment.nil?
- @count_parts_count += 1
- part.url_part_number = @count_parts_count
- end
- end
- end
# And look up by URL part number to get an attachment
- # XXX relies on extract_attachments calling ensure_parts_counted
+ # XXX relies on extract_attachments calling MailHandler.ensure_parts_counted
def self.get_attachment_by_url_part_number(attachments, found_url_part_number)
attachments.each do |a|
if a.url_part_number == found_url_part_number
@@ -441,96 +395,6 @@ class IncomingMessage < ActiveRecord::Base
return text
end
- # (This risks losing info if the unchosen alternative is the only one to contain
- # useful info, but let's worry about that another time)
- def get_attachment_leaves
- force = true
- return _get_attachment_leaves_recursive(self.mail(force))
- end
- def _get_attachment_leaves_recursive(curr_mail, within_rfc822_attachment = nil)
- leaves_found = []
- if curr_mail.multipart?
- if curr_mail.parts.size == 0
- raise "no parts on multipart mail"
- end
-
- if curr_mail.sub_type == 'alternative'
- # Choose best part from alternatives
- best_part = nil
- # Take the last text/plain one, or else the first one
- curr_mail.parts.each do |m|
- if not best_part
- best_part = m
- elsif m.content_type == 'text/plain'
- best_part = m
- end
- end
- # Take an HTML one as even higher priority. (They tend
- # to render better than text/plain, e.g. don't wrap links here:
- # http://www.whatdotheyknow.com/request/amount_and_cost_of_freedom_of_in#incoming-72238 )
- curr_mail.parts.each do |m|
- if m.content_type == 'text/html'
- best_part = m
- end
- end
- leaves_found += _get_attachment_leaves_recursive(best_part, within_rfc822_attachment)
- else
- # Add all parts
- curr_mail.parts.each do |m|
- leaves_found += _get_attachment_leaves_recursive(m, within_rfc822_attachment)
- end
- end
- else
- # XXX Yuck. this section alters various content_type's. That puts
- # it into conflict with ensure_parts_counted which it has to be
- # called both before and after. It will fail with cases of
- # attachments of attachments etc.
- charset = curr_mail.charset # save this, because overwriting content_type also resets charset
- # Don't allow nil content_types
- if curr_mail.content_type.nil?
- curr_mail.content_type = 'application/octet-stream'
- end
- # PDFs often come with this mime type, fix it up for view code
- if curr_mail.content_type == 'application/octet-stream'
- part_file_name = MailHandler.get_part_file_name(curr_mail)
- part_body = MailHandler.get_part_body(curr_mail)
- calc_mime = AlaveteliFileTypes.filename_and_content_to_mimetype(part_file_name, part_body)
- if calc_mime
- curr_mail.content_type = calc_mime
- end
- end
-
- # Use standard content types for Word documents etc.
- curr_mail.content_type = normalise_content_type(curr_mail.content_type)
- if curr_mail.content_type == 'message/rfc822'
- ensure_parts_counted # fills in rfc822_attachment variable
- if curr_mail.rfc822_attachment.nil?
- # Attached mail didn't parse, so treat as text
- curr_mail.content_type = 'text/plain'
- end
- end
- if curr_mail.content_type == 'application/vnd.ms-outlook' || curr_mail.content_type == 'application/ms-tnef'
- ensure_parts_counted # fills in rfc822_attachment variable
- if curr_mail.rfc822_attachment.nil?
- # Attached mail didn't parse, so treat as binary
- curr_mail.content_type = 'application/octet-stream'
- end
- end
- # If the part is an attachment of email
- if curr_mail.content_type == 'message/rfc822' || curr_mail.content_type == 'application/vnd.ms-outlook' || curr_mail.content_type == 'application/ms-tnef'
- ensure_parts_counted # fills in rfc822_attachment variable
- leaves_found += _get_attachment_leaves_recursive(curr_mail.rfc822_attachment, curr_mail.rfc822_attachment)
- else
- # Store leaf
- curr_mail.within_rfc822_attachment = within_rfc822_attachment
- leaves_found += [curr_mail]
- end
- # restore original charset
- curr_mail.charset = charset
- end
- return leaves_found
- end
-
# Removes anything cached about the object in the database, and saves
def clear_in_database_caches!
self.cached_attachment_text_clipped = nil
@@ -593,7 +457,8 @@ class IncomingMessage < ActiveRecord::Base
text = "[ Email has no body, please see attachments ]"
source_charset = "utf-8"
else
- text = part.body # by default, TMail converts to UTF8 in this call
+ # by default, the body (coming from an foi_attachment) should have been converted to utf-8
+ text = part.body
source_charset = part.charset
if part.content_type == 'text/html'
# e.g. http://www.whatdotheyknow.com/request/35/response/177
@@ -601,42 +466,31 @@ class IncomingMessage < ActiveRecord::Base
# convert to text routine. Could instead call a
# sanitize HTML one.
- # If the text isn't UTF8, it means TMail had a problem
+ # If the text isn't UTF8, it means we had a problem
# converting it (invalid characters, etc), and we
# should instead tell elinks to respect the source
# charset
use_charset = "utf-8"
- begin
- text = Iconv.conv('utf-8', 'utf-8', text)
- rescue Iconv::IllegalSequence
- use_charset = source_charset
- end
- text = self.class._get_attachment_text_internal_one_file(part.content_type, text, use_charset)
- end
- end
-
- # If TMail can't convert text, it just returns it, so we sanitise it.
- begin
- # Test if it's good UTF-8
- text = Iconv.conv('utf-8', 'utf-8', text)
- rescue Iconv::IllegalSequence
- # Text looks like unlabelled nonsense,
- # strip out anything that isn't UTF-8
- begin
- source_charset = 'utf-8' if source_charset.nil?
- text = Iconv.conv('utf-8//IGNORE', source_charset, text) +
- _("\n\n[ {{site_name}} note: The above text was badly encoded, and has had strange characters removed. ]",
- :site_name => Configuration::site_name)
- rescue Iconv::InvalidEncoding, Iconv::IllegalSequence
- if source_charset != "utf-8"
- source_charset = "utf-8"
- retry
+ if RUBY_VERSION.to_f >= 1.9
+ begin
+ text.encode('utf-8')
+ rescue Encoding::UndefinedConversionError, Encoding::InvalidByteSequenceError
+ use_charset = source_charset
+ end
+ else
+ begin
+ text = Iconv.conv('utf-8', 'utf-8', text)
+ rescue Iconv::IllegalSequence
+ use_charset = source_charset
+ end
end
+ text = MailHandler.get_attachment_text_one_file(part.content_type, text, use_charset)
end
end
+ # If text hasn't been converted, we sanitise it.
+ text = _sanitize_text(text)
# Fix DOS style linefeeds to Unix style ones (or other later regexps won't work)
- # Needed for e.g. http://www.whatdotheyknow.com/request/60/response/98
text = text.gsub(/\r\n/, "\n")
# Compress extra spaces down to save space, and to stop regular expressions
@@ -646,6 +500,51 @@ class IncomingMessage < ActiveRecord::Base
return text
end
+
+ def _sanitize_text(text)
+ if RUBY_VERSION.to_f >= 1.9
+ begin
+ # Test if it's good UTF-8
+ text.encode('utf-8')
+ rescue Encoding::UndefinedConversionError, Encoding::InvalidByteSequenceError
+ source_charset = 'utf-8' if source_charset.nil?
+ # strip out anything that isn't UTF-8
+ begin
+ text = text.encode("utf-8", :invalid => :replace,
+ :undef => :replace,
+ :replace => "") +
+ _("\n\n[ {{site_name}} note: The above text was badly encoded, and has had strange characters removed. ]",
+ :site_name => MySociety::Config.get('SITE_NAME', 'Alaveteli'))
+ rescue Encoding::UndefinedConversionError, Encoding::InvalidByteSequenceError
+ if source_charset != "utf-8"
+ source_charset = "utf-8"
+ retry
+ end
+ end
+ end
+ else
+ begin
+ # Test if it's good UTF-8
+ text = Iconv.conv('utf-8', 'utf-8', text)
+ rescue Iconv::IllegalSequence
+ # Text looks like unlabelled nonsense,
+ # strip out anything that isn't UTF-8
+ begin
+ source_charset = 'utf-8' if source_charset.nil?
+ text = Iconv.conv('utf-8//IGNORE', source_charset, text) +
+ _("\n\n[ {{site_name}} note: The above text was badly encoded, and has had strange characters removed. ]",
+ :site_name => Configuration::site_name)
+ rescue Iconv::InvalidEncoding, Iconv::IllegalSequence
+ if source_charset != "utf-8"
+ source_charset = "utf-8"
+ retry
+ end
+ end
+ end
+ end
+ text
+ end
+
# Returns part which contains main body text, or nil if there isn't one
def get_main_body_text_part
leaves = self.foi_attachments
@@ -699,7 +598,7 @@ class IncomingMessage < ActiveRecord::Base
filename = uu.match(/^begin\s+[0-9]+\s+(.*)$/)[1]
calc_mime = AlaveteliFileTypes.filename_and_content_to_mimetype(filename, content)
if calc_mime
- calc_mime = normalise_content_type(calc_mime)
+ calc_mime = MailHandler.normalise_content_type(calc_mime)
content_type = calc_mime
else
content_type = 'application/octet-stream'
@@ -728,55 +627,15 @@ class IncomingMessage < ActiveRecord::Base
end
def extract_attachments!
- leaves = get_attachment_leaves # XXX check where else this is called from
- # XXX we have to call ensure_parts_counted after get_attachment_leaves
- # which is really messy.
- ensure_parts_counted
+ force = true
+ attachment_attributes = MailHandler.get_attachment_attributes(self.mail(force))
attachments = []
- for leaf in leaves
- body = MailHandler.get_part_body(leaf)
- # As leaf.body causes MIME decoding which uses lots of RAM, do garbage collection here
- # to prevent excess memory use. XXX not really sure if this helps reduce
- # peak RAM use overall. Anyway, maybe there is something better to do than this.
- GC.start
- if leaf.within_rfc822_attachment
- within_rfc822_subject = leaf.within_rfc822_attachment.subject
- # Test to see if we are in the first part of the attached
- # RFC822 message and it is text, if so add headers.
- # XXX should probably use hunting algorithm to find main text part, rather than
- # just expect it to be first. This will do for now though.
- # Example request that needs this:
- # http://www.whatdotheyknow.com/request/2923/response/7013/attach/2/Cycle%20Path%20Bank.txt
- if leaf.within_rfc822_attachment == leaf && leaf.content_type == 'text/plain'
- headers = ""
- for header in [ 'Date', 'Subject', 'From', 'To', 'Cc' ]
- if leaf.within_rfc822_attachment.header.include?(header.downcase)
- header_value = leaf.within_rfc822_attachment.header[header.downcase]
- # Example message which has a blank Date header:
- # http://www.whatdotheyknow.com/request/30747/response/80253/attach/html/17/Common%20Purpose%20Advisory%20Group%20Meeting%20Tuesday%202nd%20March.txt.html
- if !header_value.blank?
- headers = headers + header + ": " + header_value.to_s + "\n"
- end
- end
- end
- # XXX call _convert_part_body_to_text here, but need to get charset somehow
- # e.g. http://www.whatdotheyknow.com/request/1593/response/3088/attach/4/Freedom%20of%20Information%20request%20-%20car%20oval%20sticker:%20Article%2020,%20Convention%20on%20Road%20Traffic%201949.txt
- body = headers + "\n" + body
-
- # This is quick way of getting all headers, but instead we only add some a) to
- # make it more usable, b) as at least one authority accidentally leaked security
- # information into a header.
- #attachment.body = leaf.within_rfc822_attachment.port.to_s
- end
- end
- hexdigest = Digest::MD5.hexdigest(body)
- attachment = self.foi_attachments.find_or_create_by_hexdigest(:hexdigest => hexdigest)
- attachment.update_attributes(:url_part_number => leaf.url_part_number,
- :content_type => leaf.content_type,
- :filename => MailHandler.get_part_file_name(leaf),
- :charset => leaf.charset,
- :within_rfc822_subject => within_rfc822_subject,
- :body => body)
+ attachment_attributes.each do |attrs|
+ attachment = self.foi_attachments.find_or_create_by_hexdigest(:hexdigest => attrs[:hexdigest])
+ body = attrs.delete(:body)
+ attachment.update_attributes(attrs)
+ # Set the body separately as its handling can depend on the value of charset
+ attachment.body = body
attachment.save!
attachments << attachment.id
end
@@ -786,7 +645,7 @@ class IncomingMessage < ActiveRecord::Base
# e.g. for https://secure.mysociety.org/admin/foi/request/show_raw_email/24550
if !main_part.nil?
uudecoded_attachments = _uudecode_and_save_attachments(main_part.body)
- c = @count_first_uudecode_count
+ c = self.mail.count_first_uudecode_count
for uudecode_attachment in uudecoded_attachments
c += 1
uudecode_attachment.url_part_number = c
@@ -878,101 +737,15 @@ class IncomingMessage < ActiveRecord::Base
return self.cached_attachment_text_clipped
end
- def IncomingMessage._get_attachment_text_internal_one_file(content_type, body, charset = 'utf-8')
- # note re. charset: TMail always tries to convert email bodies
- # to UTF8 by default, so normally it should already be that.
- text = ''
- # XXX - tell all these command line tools to return utf-8
- if content_type == 'text/plain'
- text += body + "\n\n"
- else
- tempfile = Tempfile.new('foiextract')
- tempfile.print body
- tempfile.flush
- if content_type == 'application/vnd.ms-word'
- AlaveteliExternalCommand.run("wvText", tempfile.path, tempfile.path + ".txt")
- # Try catdoc if we get into trouble (e.g. for InfoRequestEvent 2701)
- if not File.exists?(tempfile.path + ".txt")
- AlaveteliExternalCommand.run("catdoc", tempfile.path, :append_to => text)
- else
- text += File.read(tempfile.path + ".txt") + "\n\n"
- File.unlink(tempfile.path + ".txt")
- end
- elsif content_type == 'application/rtf'
- # catdoc on RTF prodcues less comments and extra bumf than --text option to unrtf
- AlaveteliExternalCommand.run("catdoc", tempfile.path, :append_to => text)
- elsif content_type == 'text/html'
- # lynx wordwraps links in its output, which then don't
- # get formatted properly by Alaveteli. We use elinks
- # instead, which doesn't do that.
- AlaveteliExternalCommand.run("elinks", "-eval", "set document.codepage.assume = \"#{charset}\"", "-eval", "set document.codepage.force_assumed = 1", "-dump-charset", "utf-8", "-force-html", "-dump",
- tempfile.path, :append_to => text, :env => {"LANG" => "C"})
- elsif content_type == 'application/vnd.ms-excel'
- # Bit crazy using /usr/bin/strings - but xls2csv, xlhtml and
- # py_xls2txt only extract text from cells, not from floating
- # notes. catdoc may be fooled by weird character sets, but will
- # probably do for UK FOI requests.
- AlaveteliExternalCommand.run("/usr/bin/strings", tempfile.path, :append_to => text)
- elsif content_type == 'application/vnd.ms-powerpoint'
- # ppthtml seems to catch more text, but only outputs HTML when
- # we want text, so just use catppt for now
- AlaveteliExternalCommand.run("catppt", tempfile.path, :append_to => text)
- elsif content_type == 'application/pdf'
- AlaveteliExternalCommand.run("pdftotext", tempfile.path, "-", :append_to => text)
- elsif content_type == 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
- # This is Microsoft's XML office document format.
- # Just pull out the main XML file, and strip it of text.
- xml = AlaveteliExternalCommand.run("/usr/bin/unzip", "-qq", "-c", tempfile.path, "word/document.xml")
- if !xml.nil?
- doc = REXML::Document.new(xml)
- text += doc.each_element( './/text()' ){}.join(" ")
- end
- elsif content_type == 'application/zip'
- # recurse into zip files
- begin
- zip_file = Zip::ZipFile.open(tempfile.path)
- text += IncomingMessage._get_attachment_text_from_zip_file(zip_file)
- zip_file.close()
- rescue
- $stderr.puts("Error processing zip file: #{$!.inspect}")
- end
- end
- tempfile.close
- end
-
- return text
- end
- def IncomingMessage._get_attachment_text_from_zip_file(zip_file)
- text = ""
- for entry in zip_file
- if entry.file?
- filename = entry.to_s
- begin
- body = entry.get_input_stream.read
- rescue
- # move to next attachment silently if there were problems
- # XXX really should reduce this to specific exceptions?
- # e.g. password protected
- next
- end
- calc_mime = AlaveteliFileTypes.filename_to_mimetype(filename)
- if calc_mime
- content_type = calc_mime
- else
- content_type = 'application/octet-stream'
- end
- text += _get_attachment_text_internal_one_file(content_type, body)
- end
- end
- return text
- end
def _get_attachment_text_internal
# Extract text from each attachment
text = ''
attachments = self.get_attachments_for_display
for attachment in attachments
- text += IncomingMessage._get_attachment_text_internal_one_file(attachment.content_type, attachment.body, attachment.charset)
+ text += MailHandler.get_attachment_text_one_file(attachment.content_type,
+ attachment.body,
+ attachment.charset)
end
# Remove any bad characters
text = Iconv.conv('utf-8//IGNORE', 'utf-8', text)
@@ -1040,37 +813,12 @@ class IncomingMessage < ActiveRecord::Base
return AlaveteliFileTypes.all_extensions.join(" ")
end
- def normalise_content_type(content_type)
- # e.g. http://www.whatdotheyknow.com/request/93/response/250
- if content_type == 'application/excel' or content_type == 'application/msexcel' or content_type == 'application/x-ms-excel'
- content_type = 'application/vnd.ms-excel'
- end
- if content_type == 'application/mspowerpoint' or content_type == 'application/x-ms-powerpoint'
- content_type = 'application/vnd.ms-powerpoint'
- end
- if content_type == 'application/msword' or content_type == 'application/x-ms-word'
- content_type = 'application/vnd.ms-word'
- end
- if content_type == 'application/x-zip-compressed'
- content_type = 'application/zip'
- end
-
- # e.g. http://www.whatdotheyknow.com/request/copy_of_current_swessex_scr_opt#incoming-9928
- if content_type == 'application/acrobat'
- content_type = 'application/pdf'
+ def for_admin_column
+ self.class.content_columns.each do |column|
+ yield(column.human_name, self.send(column.name), column.type.to_s, column.name)
end
-
- return content_type
end
- def for_admin_column
- self.class.content_columns.each do |column|
- yield(column.human_name, self.send(column.name), column.type.to_s, column.name)
- end
- end
-
- private :normalise_content_type
-
end
diff --git a/lib/mail_handler/backends/mail_backend.rb b/lib/mail_handler/backends/mail_backend.rb
index 8dd2e6b48..b75e6ed63 100644
--- a/lib/mail_handler/backends/mail_backend.rb
+++ b/lib/mail_handler/backends/mail_backend.rb
@@ -23,15 +23,23 @@ module MailHandler
main
end
+ # Returns an outlook message as a Mail object
+ def mail_from_outlook(content)
+ msg = Mapi::Msg.open(StringIO.new(content))
+ mail = mail_from_raw_email(msg.to_mime.to_s)
+ mail.ready_to_send!
+ mail
+ end
+
# Return a copy of the file name for the mail part
- def get_part_file_name(mail_part)
- part_file_name = mail_part.filename
+ def get_part_file_name(part)
+ part_file_name = part.filename
part_file_name.nil? ? nil : part_file_name.dup
end
# Get the body of a mail part
- def get_part_body(mail_part)
- mail_part.body.decoded
+ def get_part_body(part)
+ part.body.decoded
end
# Return the first from field if any
@@ -102,13 +110,204 @@ module MailHandler
mail.header[header] ? mail.header[header].to_s : nil
end
+ # Detects whether a mail part is an Outlook email
+ def is_outlook?(part)
+ filename = get_part_file_name(part)
+ return true if get_content_type(part) == 'application/vnd.ms-outlook'
+ if filename && AlaveteliFileTypes.filename_to_mimetype(filename) == 'application/vnd.ms-outlook'
+ return true
+ end
+ return false
+ end
+
+ # Convert a mail part which is an attached mail in one of
+ # several formats into a mail object and set it as the
+ # rfc822_attachment on the part. If the mail part can't be
+ # converted, the content type on the part is updated to
+ # 'text/plain' for an RFC822 attachment, and 'application/octet-stream'
+ # for other types
+ def decode_attached_part(part, parent_mail)
+ if get_content_type(part) == 'message/rfc822'
+ # An email attached as text
+ part.rfc822_attachment = mail_from_raw_email(part.body)
+ if part.rfc822_attachment.nil?
+ # Attached mail didn't parse, so treat as text
+ part.content_type = 'text/plain'
+ end
+ elsif is_outlook?(part)
+ part.rfc822_attachment = mail_from_outlook(part.body.decoded)
+ if part.rfc822_attachment.nil?
+ # Attached mail didn't parse, so treat as binary
+ part.content_type = 'application/octet-stream'
+ end
+ elsif get_content_type(part) == 'application/ms-tnef'
+ # A set of attachments in a TNEF file
+ part.rfc822_attachment = mail_from_tnef(part.body.decoded)
+ if part.rfc822_attachment.nil?
+ # Attached mail didn't parse, so treat as binary
+ part.content_type = 'application/octet-stream'
+ end
+ end
+ if part.rfc822_attachment
+ expand_and_normalize_parts(part.rfc822_attachment, parent_mail)
+ end
+ end
+
+ # Expand and normalize a mail part recursively. Decodes attached messages into
+ # Mail objects wherever possible. Sets a default content type if none is
+ # set. Tries to set a more specific content type for binary content types.
+ def expand_and_normalize_parts(part, parent_mail)
+ if part.multipart?
+ part.parts.each{ |sub_part| expand_and_normalize_parts(sub_part, parent_mail) }
+ else
+ part_filename = get_part_file_name(part)
+ charset = part.charset # save this, because overwriting content_type also resets charset
+
+ # Don't allow nil content_types
+ if get_content_type(part).nil?
+ part.content_type = 'application/octet-stream'
+ end
+
+ # PDFs often come with this mime type, fix it up for view code
+ if get_content_type(part) == 'application/octet-stream'
+ part_body = get_part_body(part)
+ calc_mime = AlaveteliFileTypes.filename_and_content_to_mimetype(part_filename,
+ part_body)
+ if calc_mime
+ part.content_type = calc_mime
+ end
+ end
+
+ # Use standard content types for Word documents etc.
+ part.content_type = normalise_content_type(get_content_type(part))
+ decode_attached_part(part, parent_mail)
+ part.charset = charset
+ end
+ end
+
+ # Count the parts in a mail part recursively, including any attached messages.
+ # Set the count on the parent mail, and set a url_part_number on the part itself.
+ # Set the count for the first uudecoded part on the parent mail also.
+ def count_parts(part, parent_mail)
+ if part.multipart?
+ part.parts.each { |p| count_parts(p, parent_mail) }
+ else
+ if part.rfc822_attachment
+ count_parts(part.rfc822_attachment, parent_mail)
+ else
+ parent_mail.count_parts_count += 1
+ part.url_part_number = parent_mail.count_parts_count
+ end
+ end
+ parent_mail.count_first_uudecode_count = parent_mail.count_parts_count
+ end
+
+ # Choose the best part from alternatives
+ def choose_best_alternative(mail)
+ if mail.html_part
+ return mail.html_part
+ elsif mail.text_part
+ return mail.text_part
+ else
+ return mail.parts.first
+ end
+ end
+
+ # Expand and normalize the parts of a mail, select the best part
+ # wherever there is an alternative, and then count the returned
+ # leaves and assign url_part values to them
+ def get_attachment_leaves(mail)
+ expand_and_normalize_parts(mail, mail)
+ leaves = _get_attachment_leaves_recursive(mail, nil, mail)
+ mail.count_parts_count = 0
+ count_parts(mail, mail)
+ return leaves
+ end
+
+ # Recurse through a mail part, selecting the best part wherever there is
+ # an alternative
+ def _get_attachment_leaves_recursive(part, within_rfc822_attachment, parent_mail)
+ leaves_found = []
+ if part.multipart?
+ raise "no parts on multipart mail" if part.parts.size == 0
+ if part.sub_type == 'alternative'
+ best_part = choose_best_alternative(part)
+ leaves_found += _get_attachment_leaves_recursive(best_part,
+ within_rfc822_attachment,
+ parent_mail)
+ else
+ # Add all parts
+ part.parts.each do |sub_part|
+ leaves_found += _get_attachment_leaves_recursive(sub_part,
+ within_rfc822_attachment,
+ parent_mail)
+ end
+ end
+ else
+ # Add all the parts of a decoded attached message
+ if part.rfc822_attachment
+ leaves_found += _get_attachment_leaves_recursive(part.rfc822_attachment,
+ part.rfc822_attachment,
+ parent_mail)
+ else
+ # Store leaf
+ part.within_rfc822_attachment = within_rfc822_attachment
+ leaves_found += [part]
+ end
+ end
+ return leaves_found
+ end
+
+ # Add selected useful headers from an attached message to its body
+ def extract_attached_message_headers(leaf)
+ body = get_part_body(leaf)
+ # Test to see if we are in the first part of the attached
+ # RFC822 message and it is text, if so add headers.
+ if leaf.within_rfc822_attachment == leaf && get_content_type(leaf) == 'text/plain'
+ headers = ""
+ [ 'Date', 'Subject', 'From', 'To', 'Cc' ].each do |header|
+ if header_value = get_header_string(header, leaf.within_rfc822_attachment)
+ if !header_value.blank?
+ headers = headers + header + ": " + header_value.to_s + "\n"
+ end
+ end
+ end
+ # XXX call _convert_part_body_to_text here, but need to get charset somehow
+ # e.g. http://www.whatdotheyknow.com/request/1593/response/3088/attach/4/Freedom%20of%20Information%20request%20-%20car%20oval%20sticker:%20Article%2020,%20Convention%20on%20Road%20Traffic%201949.txt
+ body = headers + "\n" + body
+ end
+ body
+ end
+
+ # Generate a list of attribute hashes, one for each significant part of a Mail object
+ def get_attachment_attributes(mail)
+ leaves = get_attachment_leaves(mail)
+ attachments = []
+ for leaf in leaves
+ body = get_part_body(leaf)
+ if leaf.within_rfc822_attachment
+ within_rfc822_subject = leaf.within_rfc822_attachment.subject
+ body = extract_attached_message_headers(leaf)
+ end
+ leaf_attributes = { :url_part_number => leaf.url_part_number,
+ :content_type => get_content_type(leaf),
+ :filename => get_part_file_name(leaf),
+ :charset => leaf.charset,
+ :within_rfc822_subject => within_rfc822_subject,
+ :body => body,
+ :hexdigest => Digest::MD5.hexdigest(body) }
+ attachments << leaf_attributes
+ end
+ return attachments
+ end
+
# Format
def address_from_name_and_email(name, email)
if !MySociety::Validate.is_valid_email(email)
raise "invalid email " + email + " passed to address_from_name_and_email"
end
if name.nil?
- return Mail::Address.new(email)
+ return Mail::Address.new(email).to_s
end
address = Mail::Address.new
address.display_name = name
diff --git a/lib/mail_handler/backends/mail_extensions.rb b/lib/mail_handler/backends/mail_extensions.rb
index cbe0491ed..f756abd1a 100644
--- a/lib/mail_handler/backends/mail_extensions.rb
+++ b/lib/mail_handler/backends/mail_extensions.rb
@@ -1,7 +1,67 @@
+require 'mail/message'
+require 'mail/fields/common/parameter_hash'
module Mail
class Message
attr_accessor :url_part_number
attr_accessor :rfc822_attachment # when a whole email message is attached as text
attr_accessor :within_rfc822_attachment # for parts within a message attached as text (for getting subject mainly)
+ attr_accessor :count_parts_count # counter of numbered leaf parts; presumably maintained by the backend's ensure_parts_counted, as in the TMail backend
+ attr_accessor :count_first_uudecode_count # presumably the value of count_parts_count once MIME parts are numbered, as in the TMail backend
+
+ # A patched copy of the message initializer. It works around a bug where stripping the raw
+ # input removes meaningful spaces - e.g. in the case of uuencoded bodies.
+ def initialize(*args, &block)
+ @body = nil
+ @body_raw = nil
+ @separate_parts = false
+ @text_part = nil
+ @html_part = nil
+ @errors = nil
+ @header = nil
+ @charset = 'UTF-8'
+ @defaulted_charset = true
+
+ @perform_deliveries = true
+ @raise_delivery_errors = true
+
+ @delivery_handler = nil
+
+ @delivery_method = Mail.delivery_method.dup
+
+ @transport_encoding = Mail::Encodings.get_encoding('7bit')
+
+ @mark_for_delete = false
+
+ if args.flatten.first.respond_to?(:each_pair) # hash-like first argument: initialise from it
+ init_with_hash(args.flatten.first)
+ else
+ # The only change from the commented out line below: the raw string is no longer stripped.
+ # init_with_string(args.flatten[0].to_s.strip)
+ init_with_string(args.flatten[0].to_s)
+ end
+
+ if block_given?
+ instance_eval(&block) # the block is evaluated in the context of the new message
+ end
+
+ self
+ end
+ end
+
+ # A patched version of the parameter hash whose encoded method handles nil values without
+ # throwing an error.
+ class ParameterHash < IndifferentHash
+
+ def encoded
+ map.sort { |a,b| a.first.to_s <=> b.first.to_s }.map do |key_name, value|
+ # The only change from the commented out line below: nil values skip the ascii_only? check.
+ # unless value.ascii_only?
+ unless value.nil? || value.ascii_only?
+ value = Mail::Encodings.param_encode(value)
+ key_name = "#{key_name}*"
+ end
+ %Q{#{key_name}=#{quote_token(value)}}
+ end.join(";\r\n\s")
+ end
end
end \ No newline at end of file
diff --git a/lib/mail_handler/backends/tmail_backend.rb b/lib/mail_handler/backends/tmail_backend.rb
index 4b7291d00..02124cdb1 100644
--- a/lib/mail_handler/backends/tmail_backend.rb
+++ b/lib/mail_handler/backends/tmail_backend.rb
@@ -83,6 +83,192 @@ module MailHandler
mail.header_string(header)
end
+ # Number the attachments in depth first tree order, for use in URLs.
+ # XXX This fills in part.rfc822_attachment and part.url_part_number within
+ # all the parts of the email (see monkeypatches in lib/mail_handler/backends/tmail_extensions and
+ # lib/mail_handler/backends/mail_extensions for how these attributes are added). ensure_parts_counted
+ # must be called before using the attributes.
+ def ensure_parts_counted(mail)
+ mail.count_parts_count = 0 # restart the numbering; the recursive walk adds one per numbered leaf
+ _count_parts_recursive(mail, mail)
+ # we carry on using these numeric ids for attachments uudecoded from within text parts, so record where the numbering stopped
+ mail.count_first_uudecode_count = mail.count_parts_count
+ end
+ def _count_parts_recursive(part, mail) # walks +part+ depth first; the running count is kept on +mail+
+ if part.multipart?
+ part.parts.each do |p|
+ _count_parts_recursive(p, mail)
+ end
+ else
+ part_filename = get_part_file_name(part)
+ begin
+ if part.content_type == 'message/rfc822'
+ # An email attached as text
+ # e.g. http://www.whatdotheyknow.com/request/64/response/102
+ part.rfc822_attachment = mail_from_raw_email(part.body, decode=false) # NB: decode=false is a local assignment passed positionally, not a keyword argument
+ elsif part.content_type == 'application/vnd.ms-outlook' || part_filename && AlaveteliFileTypes.filename_to_mimetype(part_filename) == 'application/vnd.ms-outlook'
+ # An email attached as an Outlook file (detected by content type or by filename)
+ # e.g. http://www.whatdotheyknow.com/request/chinese_names_for_british_politi
+ msg = Mapi::Msg.open(StringIO.new(part.body))
+ part.rfc822_attachment = mail_from_raw_email(msg.to_mime.to_s, decode=false)
+ elsif part.content_type == 'application/ms-tnef'
+ # A set of attachments in a TNEF file
+ part.rfc822_attachment = mail_from_tnef(part.body)
+ end
+ rescue
+ # If attached mail doesn't parse, leave it unexpanded so it is numbered as an ordinary leaf below
+ part.rfc822_attachment = nil
+ else # runs only when nothing was raised above
+ unless part.rfc822_attachment.nil?
+ _count_parts_recursive(part.rfc822_attachment, mail)
+ end
+ end
+ if part.rfc822_attachment.nil? # only parts that were not expanded into an attached message get a number
+ mail.count_parts_count += 1
+ part.url_part_number = mail.count_parts_count
+ end
+ end
+ end
+
+ def get_attachment_attributes(mail)
+ leaves = get_attachment_leaves(mail)
+ # XXX we have to call ensure_parts_counted after get_attachment_leaves
+ # which is really messy.
+ ensure_parts_counted(mail)
+ attachment_attributes = []
+ for leaf in leaves
+ body = get_part_body(leaf)
+ # As leaf.body causes MIME decoding which uses lots of RAM, do garbage collection here
+ # to prevent excess memory use. XXX not really sure if this helps reduce
+ # peak RAM use overall. Anyway, maybe there is something better to do than this.
+ GC.start
+ if leaf.within_rfc822_attachment
+ within_rfc822_subject = leaf.within_rfc822_attachment.subject
+ # Test to see if we are in the first part of the attached
+ # RFC822 message and it is text, if so add headers.
+ # XXX should probably use hunting algorithm to find main text part, rather than
+ # just expect it to be first. This will do for now though.
+ if leaf.within_rfc822_attachment == leaf && leaf.content_type == 'text/plain'
+ headers = ""
+ for header in [ 'Date', 'Subject', 'From', 'To', 'Cc' ]
+ if leaf.within_rfc822_attachment.header.include?(header.downcase)
+ header_value = leaf.within_rfc822_attachment.header[header.downcase]
+ if !header_value.blank?
+ headers = headers + header + ": " + header_value.to_s + "\n"
+ end
+ end
+ end
+ # XXX call _convert_part_body_to_text here, but need to get charset somehow
+ # e.g. http://www.whatdotheyknow.com/request/1593/response/3088/attach/4/Freedom%20of%20Information%20request%20-%20car%20oval%20sticker:%20Article%2020,%20Convention%20on%20Road%20Traffic%201949.txt
+ body = headers + "\n" + body
+
+ # This is quick way of getting all headers, but instead we only add some a) to
+ # make it more usable, b) as at least one authority accidentally leaked security
+ # information into a header.
+ #attachment.body = leaf.within_rfc822_attachment.port.to_s
+ end
+ end
+ attachment_attributes << {:url_part_number => leaf.url_part_number,
+ :content_type => get_content_type(leaf),
+ :filename => get_part_file_name(leaf),
+ :charset => leaf.charset,
+ :within_rfc822_subject => within_rfc822_subject,
+ :body => body,
+ :hexdigest => Digest::MD5.hexdigest(body) }
+ end
+ attachment_attributes
+ end
+
+ # Return the leaf parts of +mail+, keeping a single part from each multipart/alternative. (This risks losing
+ # info if the unchosen alternative is the only one to contain useful info, but let's worry about that another time)
+ def get_attachment_leaves(mail)
+ return _get_attachment_leaves_recursive(mail, mail)
+ end
+ def _get_attachment_leaves_recursive(curr_mail, parent_mail, within_rfc822_attachment = nil) # returns the leaves below +curr_mail+; +within_rfc822_attachment+ is the attached message they belong to, if any
+ leaves_found = []
+ if curr_mail.multipart?
+ if curr_mail.parts.size == 0
+ raise "no parts on multipart mail"
+ end
+
+ if curr_mail.sub_type == 'alternative'
+ # Choose best part from alternatives
+ best_part = nil
+ # Take the last text/plain one, or else the first one
+ curr_mail.parts.each do |m|
+ if not best_part
+ best_part = m
+ elsif m.content_type == 'text/plain'
+ best_part = m
+ end
+ end
+ # Take an HTML one as even higher priority; the last text/html part wins. (They tend
+ # to render better than text/plain, e.g. don't wrap links here:
+ # http://www.whatdotheyknow.com/request/amount_and_cost_of_freedom_of_in#incoming-72238 )
+ curr_mail.parts.each do |m|
+ if m.content_type == 'text/html'
+ best_part = m
+ end
+ end
+ leaves_found += _get_attachment_leaves_recursive(best_part, parent_mail, within_rfc822_attachment)
+ else
+ # Add all parts
+ curr_mail.parts.each do |m|
+ leaves_found += _get_attachment_leaves_recursive(m, parent_mail, within_rfc822_attachment)
+ end
+ end
+ else
+ # XXX Yuck. this section alters various content_types. That puts
+ # it into conflict with ensure_parts_counted which it has to be
+ # called both before and after. It will fail with cases of
+ # attachments of attachments etc.
+ charset = curr_mail.charset # save this, because overwriting content_type also resets charset
+ # Don't allow nil content_types
+ if curr_mail.content_type.nil?
+ curr_mail.content_type = 'application/octet-stream'
+ end
+ # PDFs often come with this mime type, fix it up for view code
+ if curr_mail.content_type == 'application/octet-stream'
+ part_file_name = get_part_file_name(curr_mail)
+ part_body = get_part_body(curr_mail)
+ calc_mime = AlaveteliFileTypes.filename_and_content_to_mimetype(part_file_name, part_body)
+ if calc_mime
+ curr_mail.content_type = calc_mime
+ end
+ end
+
+ # Use standard content types for Word documents etc.
+ curr_mail.content_type = normalise_content_type(curr_mail.content_type)
+ if curr_mail.content_type == 'message/rfc822'
+ ensure_parts_counted(parent_mail) # fills in rfc822_attachment variable
+ if curr_mail.rfc822_attachment.nil?
+ # Attached mail didn't parse, so treat as text
+ curr_mail.content_type = 'text/plain'
+ end
+ end
+ if curr_mail.content_type == 'application/vnd.ms-outlook' || curr_mail.content_type == 'application/ms-tnef'
+ ensure_parts_counted(parent_mail) # fills in rfc822_attachment variable
+ if curr_mail.rfc822_attachment.nil?
+ # Attached mail didn't parse, so treat as binary
+ curr_mail.content_type = 'application/octet-stream'
+ end
+ end
+ # If the part is an attachment of email
+ if curr_mail.content_type == 'message/rfc822' || curr_mail.content_type == 'application/vnd.ms-outlook' || curr_mail.content_type == 'application/ms-tnef'
+ ensure_parts_counted(parent_mail) # fills in rfc822_attachment variable
+ leaves_found += _get_attachment_leaves_recursive(curr_mail.rfc822_attachment, parent_mail, curr_mail.rfc822_attachment) # leaves found inside are tagged with the attached message itself
+ else
+ # Store leaf, recording which attached message (if any) it came from
+ curr_mail.within_rfc822_attachment = within_rfc822_attachment
+ leaves_found += [curr_mail]
+ end
+ # restore original charset
+ curr_mail.charset = charset
+ end
+ return leaves_found
+ end
+
+
def address_from_name_and_email(name, email)
if !MySociety::Validate.is_valid_email(email)
raise "invalid email " + email + " passed to address_from_name_and_email"
diff --git a/lib/mail_handler/backends/tmail_extensions.rb b/lib/mail_handler/backends/tmail_extensions.rb
index 9359dfeea..3576a8eca 100644
--- a/lib/mail_handler/backends/tmail_extensions.rb
+++ b/lib/mail_handler/backends/tmail_extensions.rb
@@ -20,6 +20,8 @@ module TMail
attr_accessor :url_part_number
attr_accessor :rfc822_attachment # when a whole email message is attached as text
attr_accessor :within_rfc822_attachment # for parts within a message attached as text (for getting subject mainly)
+ attr_accessor :count_parts_count
+ attr_accessor :count_first_uudecode_count
# Monkeypatch! (check to see if this becomes a standard function in
# TMail::Mail, then use that, whatever it is called)
diff --git a/lib/mail_handler/mail_handler.rb b/lib/mail_handler/mail_handler.rb
index 7b0f6e7f2..8b227b9ca 100644
--- a/lib/mail_handler/mail_handler.rb
+++ b/lib/mail_handler/mail_handler.rb
@@ -4,6 +4,7 @@ require 'tmpdir'
module MailHandler
if RUBY_VERSION.to_f >= 1.9
+ require 'mail'
require 'backends/mail_extensions'
require 'backends/mail_backend'
include Backends::MailBackend
@@ -20,7 +21,7 @@ module MailHandler
def tnef_attachments(content)
attachments = []
Dir.mktmpdir do |dir|
- IO.popen("#{`which tnef`.chomp} -K -C #{dir}", "w") do |f|
+ IO.popen("#{`which tnef`.chomp} -K -C #{dir}", "wb") do |f|
f.write(content)
f.close
if $?.signaled?
@@ -33,7 +34,7 @@ module MailHandler
found = 0
Dir.new(dir).sort.each do |file| # sort for deterministic behaviour
if file != "." && file != ".."
- file_content = File.open("#{dir}/#{file}", "r").read
+ file_content = File.open("#{dir}/#{file}", "rb").read
attachments << { :content => file_content,
:filename => file }
found += 1
@@ -46,6 +47,131 @@ module MailHandler
attachments
end
+ def normalise_content_type(content_type)
+ # e.g. http://www.whatdotheyknow.com/request/93/response/250
+ if content_type == 'application/excel' or content_type == 'application/msexcel' or content_type == 'application/x-ms-excel'
+ content_type = 'application/vnd.ms-excel'
+ end
+ if content_type == 'application/mspowerpoint' or content_type == 'application/x-ms-powerpoint'
+ content_type = 'application/vnd.ms-powerpoint'
+ end
+ if content_type == 'application/msword' or content_type == 'application/x-ms-word'
+ content_type = 'application/vnd.ms-word'
+ end
+ if content_type == 'application/x-zip-compressed'
+ content_type = 'application/zip'
+ end
+
+ # e.g. http://www.whatdotheyknow.com/request/copy_of_current_swessex_scr_opt#incoming-9928
+ if content_type == 'application/acrobat'
+ content_type = 'application/pdf'
+ end
+
+ return content_type
+ end
+
+ def get_attachment_text_one_file(content_type, body, charset = 'utf-8')
+ # note re. charset: TMail always tries to convert email bodies
+ # to UTF8 by default, so normally it should already be that.
+ text = ''
+ # XXX - tell all these command line tools to return utf-8
+ if content_type == 'text/plain'
+ text += body + "\n\n"
+ else
+ tempfile = Tempfile.new('foiextract')
+ tempfile.binmode
+ tempfile.print body
+ tempfile.flush
+ default_params = { :append_to => text, :binary_output => false }
+ if content_type == 'application/vnd.ms-word'
+ AlaveteliExternalCommand.run("wvText", tempfile.path, tempfile.path + ".txt")
+ # Try catdoc if we get into trouble (e.g. for InfoRequestEvent 2701)
+ if not File.exists?(tempfile.path + ".txt")
+ AlaveteliExternalCommand.run("catdoc", tempfile.path, default_params)
+ else
+ text += File.read(tempfile.path + ".txt") + "\n\n"
+ File.unlink(tempfile.path + ".txt")
+ end
+ elsif content_type == 'application/rtf'
+ # catdoc on RTF prodcues less comments and extra bumf than --text option to unrtf
+ AlaveteliExternalCommand.run("catdoc", tempfile.path, default_params)
+ elsif content_type == 'text/html'
+ # lynx wordwraps links in its output, which then don't
+ # get formatted properly by Alaveteli. We use elinks
+ # instead, which doesn't do that.
+ AlaveteliExternalCommand.run("elinks", "-eval", "set document.codepage.assume = \"#{charset}\"",
+ "-eval", "set document.codepage.force_assumed = 1",
+ "-dump-charset", "utf-8",
+ "-force-html", "-dump",
+ tempfile.path,
+ default_params.merge(:env => {"LANG" => "C"}))
+ elsif content_type == 'application/vnd.ms-excel'
+ # Bit crazy using /usr/bin/strings - but xls2csv, xlhtml and
+ # py_xls2txt only extract text from cells, not from floating
+ # notes. catdoc may be fooled by weird character sets, but will
+ # probably do for UK FOI requests.
+ AlaveteliExternalCommand.run("/usr/bin/strings", tempfile.path, default_params)
+ elsif content_type == 'application/vnd.ms-powerpoint'
+ # ppthtml seems to catch more text, but only outputs HTML when
+ # we want text, so just use catppt for now
+ AlaveteliExternalCommand.run("catppt", tempfile.path, default_params)
+ elsif content_type == 'application/pdf'
+ AlaveteliExternalCommand.run("pdftotext", tempfile.path, "-", default_params)
+ elsif content_type == 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
+ # This is Microsoft's XML office document format.
+ # Just pull out the main XML file, and strip it of text.
+ xml = AlaveteliExternalCommand.run("/usr/bin/unzip", "-qq",
+ "-c",
+ tempfile.path,
+ "word/document.xml",
+ {:binary_output => false})
+ if !xml.nil?
+ doc = REXML::Document.new(xml)
+ text += doc.each_element( './/text()' ){}.join(" ")
+ end
+ elsif content_type == 'application/zip'
+ # recurse into zip files
+ begin
+ zip_file = Zip::ZipFile.open(tempfile.path)
+ text += get_attachment_text_from_zip_file(zip_file)
+ zip_file.close()
+ rescue
+ $stderr.puts("Error processing zip file: #{$!.inspect}")
+ end
+ end
+ tempfile.close
+ end
+
+ return text
+ end
+ def get_attachment_text_from_zip_file(zip_file)
+
+ text = ""
+ for entry in zip_file
+ if entry.file?
+ filename = entry.to_s
+ begin
+ body = entry.get_input_stream.read
+ rescue
+ # move to next attachment silently if there were problems
+ # XXX really should reduce this to specific exceptions?
+ # e.g. password protected
+ next
+ end
+ calc_mime = AlaveteliFileTypes.filename_to_mimetype(filename)
+ if calc_mime
+ content_type = calc_mime
+ else
+ content_type = 'application/octet-stream'
+ end
+
+ text += get_attachment_text_one_file(content_type, body)
+
+ end
+ end
+ return text
+ end
+
# Turn instance methods into class methods
extend self
diff --git a/spec/fixtures/files/dos-linebreaks.email b/spec/fixtures/files/dos-linebreaks.email
new file mode 100644
index 000000000..1f5f1473f
--- /dev/null
+++ b/spec/fixtures/files/dos-linebreaks.email
@@ -0,0 +1,31 @@
+From email@example.com Wed Mar 12 14:58:26 2008
+Return-path: email@example.com>
+Envelope-to: request-xxx-xxxxxx@whatdotheyknow.com
+Delivery-date: Wed, 12 Mar 2008 14:58:26 +0000
+Received: from example.com ([0.0.0.0]:1368 helo=example.com)
+ by tea.ukcod.org.uk with esmtp (Exim 4.50)
+ id 1JZSPS-0002yK-Rq
+ for request-60-3548031c@whatdotheyknow.com; Wed, 12 Mar 2008 14:58:26 +0000
+X-MimeOLE: Produced By Microsoft Exchange V0.0.0.0
+Content-class: urn:content-classes:message
+MIME-Version: 1.0
+Content-Type: text/plain;
+ charset="us-ascii"
+Content-Transfer-Encoding: quoted-printable
+Disposition-Notification-To: "A Person" email@example.com>
+Subject: RE: Freedom of Information request - Plans for the East Oxford Community Centre
+Date: Wed, 12 Mar 2008 14:59:04 -0000
+Message-ID: <3D8BEC617D49EF45A9E6D103A83FD30331BF84@local>
+X-MS-Has-Attach:
+X-MS-TNEF-Correlator:
+Thread-Topic: Freedom of Information request
+Thread-Index: AciDziuIcYirFQ7GT36VyP2ABE14qgAg1c0w
+From: "A Person" email@example.com>
+To: FOI Person <EMAIL_TO>
+X-OriginalArrivalTime: 12 Mar 2008 14:59:04.0368 (UTC) FILETIME=[9D245300:01C88451]
+X-SEF-7853D99-ADF1-478E-8894-213D316B8FFA: 1
+X-SEF-Processed: 6_0_1_111__2008_03_12_14_59_05
+
+Thank you for your Freedom of Information request. I have forwarded it=0D=0A=
+to the relevant department for their reply.=0D=0A=0D=0A
+
diff --git a/spec/fixtures/files/many-attachments-date-header.email b/spec/fixtures/files/many-attachments-date-header.email
new file mode 100644
index 000000000..a241e2456
--- /dev/null
+++ b/spec/fixtures/files/many-attachments-date-header.email
@@ -0,0 +1,451 @@
+From email@example.com Wed Apr 14 11:23:08 2010
+Return-path: <email@example.com>
+Envelope-to: email@example.com
+Delivery-date: Wed, 14 Apr 2010 11:23:08 +0100
+X-TM-IMSS-Message-ID:<email@example.com>
+Received: from example.com ([0.0.0.0]) by example.com ([0.0.0.0]) with ESMTP (TREND IMSS SMTP Service 7.0) id 1ec0f7ac0002a77f ; Wed, 14 Apr 2010 11:22:52 +0100
+Received: from GWGATE-MTA by example.com
+ with Novell_GroupWise; Wed, 14 Apr 2010 11:22:53 +0100
+Message-Id: <email@example.com>
+X-Mailer: Novell GroupWise Internet Agent 8.0.1
+Date: Wed, 14 Apr 2010 11:22:47 +0100
+From: "A Person" <email@example.com>
+To: <email@example.com>
+Cc: "FOI FOI" <email@example.com>,
+ "A Person" <email@example.com>
+Subject: Fwd: Re: Freedom of Information request
+References: <email@example.com>
+ <email@example.com>
+Mime-Version: 1.0
+Content-Type: multipart/mixed; boundary="=__Part163C9567.0__="
+
+This is a MIME message. If you are reading this text, you may want to
+consider changing to a mail reader or gateway that understands how to
+properly handle MIME multipart messages.
+
+--=__Part163C9567.0__=
+Content-Type: text/plain; charset=US-ASCII
+Content-Transfer-Encoding: quoted-printable
+Content-Disposition: inline
+
+Some information
+
+
+--=__Part163C9567.0__=
+Content-Type: message/rfc822
+
+Date: Wed, 10 Mar 2010 14:17:52 +0000
+From: "A Person" <email@example.com>
+To: "A Person" <email@example.com>
+Subject: Re: xxx
+Mime-Version: 1.0
+Content-Type: text/plain; charset=US-ASCII
+Content-Transfer-Encoding: quoted-printable
+Content-Disposition: inline
+
+2
+
+--=__Part163C9567.0__=
+Content-Type: message/rfc822
+
+Return-path: <email@example.com>
+Received: from example.com ([0.0.0.0])
+ by example.com with ESMTP; Tue, 24 Nov 2009 10:45:58 +0000
+X-TM-IMSS-Message-ID:<email@example.com>
+Received: from example.com ([0.0.0.0]) by example.com ([0.0.0.0]) with SMTP (TREND IMSS SMTP Service 7.0) id 00660acd00000f42 ; Tue, 24 Nov 2009 10:45:55 +0000
+Received: from source ([0.0.0.0]) (using TLSv1) by eu1sys200aob115.postini.com ([0.0.0.0]) with SMTP
+ ID email@example.com Tue, 24 Nov 2009 10:45:56 UTC
+Received: from example.com ([::1]) by
+ example.com ([::1]) with mapi; Tue, 24 Nov 2009
+ 10:45:53 +0000
+From: A Person <email@example.com>
+To: email@example.com <email@example.com>
+Date: Tue, 24 Nov 2009 10:45:52 +0000
+Subject: example
+Thread-Topic: example
+Thread-Index: AcpnbI2i+XAmfHFVTFy0eGDpVJhXoQFhVeZw
+Message-ID: <email@example.com>
+Accept-Language: en-US, en-GB
+Content-Language: en-US
+X-MS-Has-Attach: yes
+X-MS-TNEF-Correlator:
+acceptlanguage: en-US, en-GB
+Content-Type: multipart/mixed;
+ boundary="_006_B3BDF1D06801114FA040D0F1BEE7CF9C48FD3549evs02ukcommonpu_"
+MIME-Version: 1.0
+X-TM-AS-Product-Ver: IMSS-0.0.0.0-0.0.0.0-17028.005
+X-TM-AS-Result: No--19.329-5.0-31-1
+X-imss-scan-details: No--19.329-5.0-31-1
+X-TM-AS-User-Approved-Sender: No
+X-TM-AS-User-Blocked-Sender: No
+
+
+--_006_B3BDF1D06801114FA040D0F1BEE7CF9C48FD3549evs02ukcommonpu_
+Content-Type: multipart/related;
+ boundary="_005_B3BDF1D06801114FA040D0F1BEE7CF9C48FD3549evs02ukcommonpu_";
+ type="multipart/alternative"
+
+--_005_B3BDF1D06801114FA040D0F1BEE7CF9C48FD3549evs02ukcommonpu_
+Content-Type: multipart/alternative;
+ boundary="_000_B3BDF1D06801114FA040D0F1BEE7CF9C48FD3549evs02ukcommonpu_"
+
+--_000_B3BDF1D06801114FA040D0F1BEE7CF9C48FD3549evs02ukcommonpu_
+Content-Type: text/plain; charset="iso-8859-1"
+Content-Transfer-Encoding: quoted-printable
+
+
+3
+
+--_000_B3BDF1D06801114FA040D0F1BEE7CF9C48FD3549evs02ukcommonpu_
+Content-Type: text/html; charset="iso-8859-1"
+Content-Transfer-Encoding: quoted-printable
+
+4
+
+--_000_B3BDF1D06801114FA040D0F1BEE7CF9C48FD3549evs02ukcommonpu_--
+
+--_005_B3BDF1D06801114FA040D0F1BEE7CF9C48FD3549evs02ukcommonpu_
+Content-Type: image/gif; name="image001.gif"
+Content-Description: image001.gif
+Content-Disposition: inline; filename="image001.gif"; size=5445;
+ creation-date="Tue, 17 Nov 2009 09:58:46 GMT";
+ modification-date="Tue, 17 Nov 2009 09:58:46 GMT"
+Content-ID: <email@example.com>
+Content-Transfer-Encoding: base64
+
+5
+--_005_B3BDF1D06801114FA040D0F1BEE7CF9C48FD3549evs02ukcommonpu_--
+
+--_006_B3BDF1D06801114FA040D0F1BEE7CF9C48FD3549evs02ukcommonpu_
+Content-Type: application/vnd.ms-excel;
+ name="particpant list.xls"
+Content-Description: particpant list.xls
+Content-Disposition: attachment;
+ filename="particpant list.xls"; size=21504;
+ creation-date="Mon, 02 Nov 2009 09:42:37 GMT";
+ modification-date="Tue, 24 Nov 2009 10:45:52 GMT"
+Content-Transfer-Encoding: base64
+
+6
+
+--_006_B3BDF1D06801114FA040D0F1BEE7CF9C48FD3549evs02ukcommonpu_--
+--=__Part163C9567.0__=
+Content-Type: message/rfc822
+
+Return-path: <email@example.com>
+Received: from example.com ([0.0.0.0])
+ by example.com with ESMTP; Thu, 03 Dec 2009 09:29:07 +0000
+X-TM-IMSS-Message-ID:<email@example.com>
+Received: from eu1sys200aog116.obsmtp.com ([0.0.0.0]) by example.com ([0.0.0.0]) with SMTP (TREND IMSS SMTP Service 7.0) id 0ac1bf1b0001116e ; Thu, 3 Dec 2009 09:29:04 +0000
+Received: from source ([0.0.0.0]) (using TLSv1) by eu1sys200aob116.postini.com ([0.0.0.0]) with SMTP
+ ID email@example.com Thu, 03 Dec 2009 09:29:06 UTC
+Received: from example.com ([::1]) by
+ example.com ([::1]) with mapi; Thu, 3 Dec 2009 09:29:03
+ +0000
+From: A Person <email@example.com>
+To: 'A Person' <email@example.com>
+Date: Thu, 3 Dec 2009 09:29:03 +0000
+Subject: RE: example
+Thread-Topic: example
+Thread-Index: AcpuoEyRvzM8fXw+THuj/617pjnvCgFWqZdQ
+Message-ID: <email@example.com>
+References: <email@example.com>
+ <email@example.com>
+In-Reply-To: <email@example.com>
+Accept-Language: en-US, en-GB
+Content-Language: en-US
+X-MS-Has-Attach:
+X-MS-TNEF-Correlator:
+acceptlanguage: en-US, en-GB
+Content-Type: text/plain; charset="utf-8"
+Content-Transfer-Encoding: base64
+MIME-Version: 1.0
+X-TM-AS-Product-Ver: IMSS-0.0.0.0-0.0.0.0-17046.004
+X-TM-AS-Result: No--16.791-5.0-31-1
+X-imss-scan-details: No--16.791-5.0-31-1
+X-TM-AS-User-Approved-Sender: No
+X-TM-AS-User-Blocked-Sender: No
+
+
+7
+--=__Part163C9567.0__=
+Content-Type: message/rfc822
+
+Return-path: <email@example.com>
+Received: from example.com ([0.0.0.0])
+ by example.com with ESMTP; Wed, 25 Nov 2009 22:26:23 +0000
+X-TM-IMSS-Message-ID:<email@example.com>
+Received: from eu1sys200aog105.obsmtp.com ([0.0.0.0]) by example.com ([0.0.0.0]) with SMTP (TREND IMSS SMTP Service 7.0) id 034354c900007016 ; Wed, 25 Nov 2009 22:26:19 +0000
+Received: from source ([0.0.0.0]) (using TLSv1) by eu1sys200aob105.postini.com ([0.0.0.0]) with SMTP
+ ID email@example.com Wed, 25 Nov 2009 22:26:21 UTC
+Received: from example.com ([::1]) by
+ example.com ([::1]) with mapi; Wed, 25 Nov 2009
+ 22:26:15 +0000
+From: A Person <email@example.com>
+To: email@example.com <email@example.com>
+CC: A Person <email@example.com>
+Date: Wed, 25 Nov 2009 22:26:12 +0000
+Subject: As promised - Masterclass info (example)
+Thread-Topic: As promised - Masterclass info (example)
+Thread-Index: AcpuHcJ4yrR8PBHZTVCU/RLGzwqsDAAACGwQ
+Message-ID: <email@example.com>
+Accept-Language: en-US, en-GB
+Content-Language: en-US
+X-MS-Has-Attach: yes
+X-MS-TNEF-Correlator:
+acceptlanguage: en-US, en-GB
+Content-Type: multipart/mixed;
+ boundary="_007_B3BDF1D06801114FA040D0F1BEE7CF9C48DC6D82evs02ukcommonpu_"
+MIME-Version: 1.0
+X-TM-AS-Product-Ver: IMSS-0.0.0.0-0.0.0.0-17032.000
+X-TM-AS-Result: No--26.167-5.0-31-1
+X-imss-scan-details: No--26.167-5.0-31-1
+X-TM-AS-User-Approved-Sender: No
+X-TM-AS-User-Blocked-Sender: No
+
+
+--_007_B3BDF1D06801114FA040D0F1BEE7CF9C48DC6D82evs02ukcommonpu_
+Content-Type: multipart/related;
+ boundary="_006_B3BDF1D06801114FA040D0F1BEE7CF9C48DC6D82evs02ukcommonpu_";
+ type="multipart/alternative"
+
+--_006_B3BDF1D06801114FA040D0F1BEE7CF9C48DC6D82evs02ukcommonpu_
+Content-Type: multipart/alternative;
+ boundary="_000_B3BDF1D06801114FA040D0F1BEE7CF9C48DC6D82evs02ukcommonpu_"
+
+--_000_B3BDF1D06801114FA040D0F1BEE7CF9C48DC6D82evs02ukcommonpu_
+Content-Type: text/plain; charset="utf-8"
+Content-Transfer-Encoding: base64
+
+8
+--_000_B3BDF1D06801114FA040D0F1BEE7CF9C48DC6D82evs02ukcommonpu_
+Content-Type: text/html; charset="utf-8"
+Content-Transfer-Encoding: base64
+
+9
+--_000_B3BDF1D06801114FA040D0F1BEE7CF9C48DC6D82evs02ukcommonpu_--
+
+--_006_B3BDF1D06801114FA040D0F1BEE7CF9C48DC6D82evs02ukcommonpu_
+Content-Type: image/gif; name="image001.gif"
+Content-Description: image001.gif
+Content-Disposition: inline; filename="image001.gif"; size=5445;
+ creation-date="Wed, 25 Nov 2009 22:26:14 GMT";
+ modification-date="Wed, 25 Nov 2009 22:26:14 GMT"
+Content-ID: <email@example.com>
+Content-Transfer-Encoding: base64
+
+
+10
+
+--_006_B3BDF1D06801114FA040D0F1BEE7CF9C48DC6D82evs02ukcommonpu_--
+
+--_007_B3BDF1D06801114FA040D0F1BEE7CF9C48DC6D82evs02ukcommonpu_
+Content-Type: application/msword;
+ name= "Participant List.doc"
+Content-Description: Participant List.doc
+Content-Disposition: attachment;
+ filename="Participant List.doc"; size=112640;
+ creation-date="Wed, 25 Nov 2009 22:17:24 GMT";
+ modification-date="Wed, 25 Nov 2009 11:43:48 GMT"
+Content-Transfer-Encoding: base64
+
+11
+--_007_B3BDF1D06801114FA040D0F1BEE7CF9C48DC6D82evs02ukcommonpu_
+Content-Type: application/msword; name="Information & Booking Form.doc"
+Content-Description: Information & Booking Form.doc
+Content-Disposition: attachment; filename="Information & Booking Form.doc"; size=84480;
+ creation-date="Wed, 25 Nov 2009 22:17:40 GMT";
+ modification-date="Wed, 04 Nov 2009 14:42:54 GMT"
+Content-Transfer-Encoding: base64
+
+12
+
+--_007_B3BDF1D06801114FA040D0F1BEE7CF9C48DC6D82evs02ukcommonpu_--
+--=__Part163C9567.0__=
+Content-Type: message/rfc822
+
+Return-path: <email@example.com>
+Received: from example.com ([0.0.0.0])
+ by example.com with ESMTP; Fri, 04 Dec 2009 10:00:05 +0000
+X-TM-IMSS-Message-ID:<email@example.com>
+Received: from eu1sys200aog109.obsmtp.com ([0.0.0.0]) by example.com ([0.0.0.0]) with SMTP (TREND IMSS SMTP Service 7.0) id 100473260001a476 ; Fri, 4 Dec 2009 10:00:01 +0000
+Received: from source ([0.0.0.0]) (using TLSv1) by eu1sys200aob109.postini.com ([0.0.0.0]) with SMTP
+ ID email@example.com Fri, 04 Dec 2009 10:00:04 UTC
+Received: from example.com ([::1]) by
+ example.com ([::1]) with mapi; Fri, 4 Dec 2009 10:00:01
+ +0000
+From: A Person <email@example.com>
+To: email@example.com <email@example.com>
+Date: Fri, 4 Dec 2009 10:00:01 +0000
+Subject: Re: As promised - info (example)
+Thread-Topic: As promised - info (example)
+Thread-Index: AcpzhLeBjBId8eZATYudOfBgN6CPXQBQ9Pok
+Message-ID: <email@example.com>
+Accept-Language: en-US, en-GB
+Content-Language: en-US
+X-MS-Has-Attach:
+X-MS-TNEF-Correlator:
+acceptlanguage: en-US, en-GB
+Content-Type: text/plain; charset="utf-8"
+Content-Transfer-Encoding: base64
+MIME-Version: 1.0
+X-TM-AS-Product-Ver: IMSS-0.0.0.0-0.0.0.0-17048.005
+X-TM-AS-Result: No--24.171-5.0-31-1
+X-imss-scan-details: No--24.171-5.0-31-1
+X-TM-AS-User-Approved-Sender: No
+X-TM-AS-User-Blocked-Sender: No
+
+13
+--=__Part163C9567.0__=
+Content-Type: message/rfc822
+
+Return-path: <email@example.com>
+Received: from example.com ([0.0.0.0])
+ by example.com with ESMTP; Sun, 21 Mar 2010 21:53:38 +0000
+X-TM-IMSS-Message-ID:<email@example.com>
+Received: from eu1sys200aog117.obsmtp.com ([0.0.0.0]) by example.com ([0.0.0.0]) with SMTP (TREND IMSS SMTP Service 7.0) id 1e3611c1000d37df ; Sun, 21 Mar 2010 21:53:32 +0000
+Received: from source ([0.0.0.0]) (using TLSv1) by eu1sys200aob117.postini.com ([0.0.0.0]) with SMTP
+ ID email@example.com Sun, 21 Mar 2010 21:53:37 UTC
+Received: from example.com ([::1]) by exchhub01
+ ([0.0.0.0]) with mapi; Sun, 21 Mar 2010 21:53:34 +0000
+From: A Person <email@example.com>
+To: email@example.com <email@example.com>
+CC: A Person <email@example.com>
+Date: Sun, 21 Mar 2010 21:53:32 +0000
+Subject: Thank you from example
+Thread-Topic: Thank you from example
+Thread-Index: AcrJQPL4xb9zjXMHRJGTjAxo3X/kfA==
+Message-ID: <email@example.com>
+Accept-Language: en-US, en-GB
+Content-Language: en-US
+X-MS-Has-Attach: yes
+X-MS-TNEF-Correlator:
+acceptlanguage: en-US, en-GB
+Content-Type: multipart/related;
+ boundary="_004_B3BDF1D06801114FA040D0F1BEE7CF9C5E14635Bevs02ukcommonpu_";
+ type="multipart/alternative"
+MIME-Version: 1.0
+X-TM-AS-Product-Ver: IMSS-0.0.0.0-0.0.0.0-17266.002
+X-TM-AS-Result: No--26.373-5.0-31-1
+X-imss-scan-details: No--26.373-5.0-31-1
+X-TM-AS-User-Approved-Sender: No
+X-TM-AS-User-Blocked-Sender: No
+
+
+--_004_B3BDF1D06801114FA040D0F1BEE7CF9C5E14635Bevs02ukcommonpu_
+Content-Type: multipart/alternative;
+ boundary="_000_B3BDF1D06801114FA040D0F1BEE7CF9C5E14635Bevs02ukcommonpu_"
+
+--_000_B3BDF1D06801114FA040D0F1BEE7CF9C5E14635Bevs02ukcommonpu_
+Content-Type: text/plain; charset="us-ascii"
+Content-Transfer-Encoding: quoted-printable
+
+14
+
+--_000_B3BDF1D06801114FA040D0F1BEE7CF9C5E14635Bevs02ukcommonpu_
+Content-Type: text/html; charset="us-ascii"
+Content-Transfer-Encoding: quoted-printable
+
+15
+
+--_000_B3BDF1D06801114FA040D0F1BEE7CF9C5E14635Bevs02ukcommonpu_--
+
+--_004_B3BDF1D06801114FA040D0F1BEE7CF9C5E14635Bevs02ukcommonpu_
+Content-Type: image/gif; name="image001.gif"
+Content-Description: image001.gif
+Content-Disposition: inline; filename="image001.gif"; size=5445;
+ creation-date="Sun, 21 Mar 2010 21:53:33 GMT";
+ modification-date="Sun, 21 Mar 2010 21:53:33 GMT"
+Content-ID: <email@example.com>
+Content-Transfer-Encoding: base64
+
+16
+--_004_B3BDF1D06801114FA040D0F1BEE7CF9C5E14635Bevs02ukcommonpu_--
+--=__Part163C9567.0__=
+Content-Type: message/rfc822
+
+Return-path: <email@example.com>
+Received: from example.com ([0.0.0.0])
+ by example.com with ESMTP; Tue, 23 Feb 2010 15:33:48 +0000
+X-TM-IMSS-Message-ID:<email@example.com>
+Received: from eu1sys200aog112.obsmtp.com ([0.0.0.0]) by example.com ([0.0.0.0]) with SMTP (TREND IMSS SMTP Service 7.0) id 96f54043000f2e72 ; Tue, 23 Feb 2010 15:33:48 +0000
+Received: from source ([0.0.0.0]) by eu1sys200aob112.postini.com ([0.0.0.0]) with SMTP
+ ID email@example.com Tue, 23 Feb 2010 15:33:47 UTC
+Received: from gla-002561-lap ([0.0.0.0]) by example.com with Microsoft SMTPSVC(0.0.0.0);
+ Tue, 23 Feb 2010 15:33:46 +0000
+Reply-To: email@example.com
+From: email@example.com
+To: email@example.com
+Subject: example - Meeting - Tuesday 2nd March
+Date: 23 February 2010 15:33
+X-Mailer: Internet Professional v1.15
+Return-Path: email@example.com
+Message-ID: <email@example.com>
+X-OriginalArrivalTime: 23 Feb 2010 15:33:46.0648 (UTC) FILETIME=[96CEC980:01CAB49D]
+X-TM-AS-Product-Ver: IMSS-0.0.0.0-0.0.0.0-17212.000
+X-TM-AS-Result: No--16.146-5.0-31-1
+X-imss-scan-details: No--16.146-5.0-31-1
+X-TM-AS-User-Approved-Sender: No
+X-TM-AS-User-Blocked-Sender: No
+
+17
+
+--=__Part163C9567.0__=
+Content-Type: message/rfc822
+
+Return-path: <email@example.com>
+Received: from example.com ([0.0.0.0])
+ by example.com with ESMTP; Mon, 08 Mar 2010 09:21:42 +0000
+X-TM-IMSS-Message-ID:<email@example.com>
+Received: from eu1sys200aog117.obsmtp.com ([0.0.0.0]) by example.com ([0.0.0.0]) with SMTP (TREND IMSS SMTP Service 7.0) id d8931aff001580d6 ; Mon, 8 Mar 2010 09:21:40 +0000
+Received: from source ([0.0.0.0]) by eu1sys200aob117.postini.com ([0.0.0.0]) with SMTP
+ ID email@example.com Mon, 08 Mar 2010 09:21:39 UTC
+Received: from gla-002561-lap ([0.0.0.0]) by example.com with Microsoft SMTPSVC(0.0.0.0);
+ Mon, 8 Mar 2010 09:21:36 +0000
+Reply-To: email@example.com
+From: email@example.com
+To: email@example.com
+Subject: example - Help needed
+Date: 08 March 2010 09:21
+X-Mailer: Internet Professional v1.15
+MIME-Version: 1.0
+Content-Type: multipart/mixed;boundary="_NextPart_00009D35-00000F3C-00271781-26DF"
+Return-Path: email@example.com
+Message-ID: <email@example.com>
+X-OriginalArrivalTime: 08 Mar 2010 09:21:36.0283 (UTC) FILETIME=[C03E3EB0:01CABEA0]
+X-TM-AS-Product-Ver: IMSS-0.0.0.0-0.0.0.0-17236.006
+X-TM-AS-Result: No--32.111-5.0-31-1
+X-imss-scan-details: No--32.111-5.0-31-1
+X-TM-AS-User-Approved-Sender: No
+X-TM-AS-User-Blocked-Sender: No
+
+This message is in MIME format. Since your mail reader does not
+understand this format, some or all of this message may not be legible.
+--_NextPart_00009D35-00000F3C-00271781-26DF
+Content-Type: text/plain
+Content-Transfer-Encoding: 7bit
+
+18
+
+--_NextPart_00009D35-00000F3C-00271781-26DF
+Content-Type: application/octet-stream;name="Information Pack.pdf"
+Content-Transfer-Encoding: base64
+Content-Disposition: attachment;filename="Information Pack.pdf";size=106688
+
+19
+--_NextPart_00009D35-00000F3C-00271781-26DF--
+--=__Part163C9567.0__=
+Content-Type: message/rfc822
+
+Date: Wed, 02 Dec 2009 19:21:27 +0000
+From: "A Person" <email@example.com>
+To: "A Person" <email@example.com>
+Subject: Re: As promised - info (example)
+Mime-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+Content-Disposition: inline
+
+20
+--=__Part163C9567.0__=--
+
diff --git a/spec/lib/mail_handler/mail_handler_spec.rb b/spec/lib/mail_handler/mail_handler_spec.rb
index 7eeba47e0..ae65210f2 100644
--- a/spec/lib/mail_handler/mail_handler_spec.rb
+++ b/spec/lib/mail_handler/mail_handler_spec.rb
@@ -250,4 +250,134 @@ describe 'when getting header strings' do
'9; Autoresponder')
end
-end \ No newline at end of file
+end
+
+describe "when parsing HTML mail" do
+ it "should display UTF-8 characters in the plain text version correctly" do
+ html = "<html><b>foo</b> është"
+ plain_text = MailHandler.get_attachment_text_one_file('text/html', html)
+ plain_text.should match(/është/)
+ end
+
+end
+
+describe "when getting the attachment text" do
+ it "should not raise an error if the expansion of a zip file raises an error" do
+ mock_entry = mock('ZipFile entry', :file? => true)
+ mock_entries = [mock_entry]
+ mock_entries.stub!(:close)
+ mock_entry.stub!(:get_input_stream).and_raise("invalid distance too far back")
+ Zip::ZipFile.stub!(:open).and_return(mock_entries)
+ MailHandler.get_attachment_text_one_file('application/zip', "some string")
+ end
+
+end
+
+describe 'when getting attachment attributes' do
+
+ it 'should get two attachment parts from a multipart mail with text and html alternatives
+ and an image' do
+ mail = get_fixture_mail('quoted-subject-iso8859-1.email')
+ attributes = MailHandler.get_attachment_attributes(mail)
+ attributes.size.should == 2
+ end
+
+ it 'should expand a mail attached as text' do
+ mail = get_fixture_mail('rfc822-attachment.email')
+ attributes = MailHandler.get_attachment_attributes(mail)
+ attributes.size.should == 2
+ rfc_attachment = attributes[1]
+ rfc_attachment[:within_rfc822_subject].should == 'Freedom of Information request'
+ headers = ['Date: Thu, 13 Mar 2008 16:57:33 +0000',
+ 'Subject: Freedom of Information request',
+ 'From: An FOI Officer <foi.officer@example.com>',
+ 'To: request-bounce-xx-xxxxx@whatdotheyno.com']
+ rfc_attachment[:body].should == "#{headers.join("\n")}\n\nsome example text"
+ end
+
+ it 'should handle a mail which causes Tmail to generate a blank header value' do
+ mail = get_fixture_mail('many-attachments-date-header.email')
+ attributes = MailHandler.get_attachment_attributes(mail)
+ end
+
+ it 'should produce a consistent set of url_part_numbers, content_types, within_rfc822_subjects
+ and filenames from an example mail with lots of attachments' do
+ mail = get_fixture_mail('many-attachments-date-header.email')
+ attributes = MailHandler.get_attachment_attributes(mail)
+
+ expected_attributes = [ { :content_type=>"text/plain",
+ :url_part_number=>1,
+ :within_rfc822_subject=>nil,
+ :filename=>nil},
+ { :content_type=>"text/plain",
+ :url_part_number=>2,
+ :within_rfc822_subject=>"Re: xxx",
+ :filename=>nil},
+ { :content_type=>"text/html",
+ :url_part_number=>4,
+ :within_rfc822_subject=>"example",
+ :filename=>nil},
+ { :content_type=>"image/gif", :url_part_number=>5,
+ :within_rfc822_subject=>"example",
+ :filename=>"image001.gif"},
+ { :content_type=>"application/vnd.ms-excel",
+ :url_part_number=>6,
+ :within_rfc822_subject=>"example",
+ :filename=>"particpant list.xls"},
+ { :content_type=>"text/plain",
+ :url_part_number=>7,
+ :within_rfc822_subject=>"RE: example",
+ :filename=>nil},
+ { :content_type=>"text/html",
+ :url_part_number=>9,
+ :within_rfc822_subject=>"As promised - Masterclass info (example)",
+ :filename=>nil},
+ { :content_type=>"image/gif",
+ :url_part_number=>10,
+ :within_rfc822_subject=>"As promised - Masterclass info (example)",
+ :filename=>"image001.gif"},
+ { :content_type=>"application/vnd.ms-word",
+ :url_part_number=>11,
+ :within_rfc822_subject=>"As promised - Masterclass info (example)",
+ :filename=>"Participant List.doc"},
+ { :content_type=>"application/vnd.ms-word",
+ :url_part_number=>12,
+ :within_rfc822_subject=>"As promised - Masterclass info (example)",
+ :filename=>"Information & Booking Form.doc"},
+ { :content_type=>"text/plain",
+ :url_part_number=>13,
+ :within_rfc822_subject=>"Re: As promised - info (example)",
+ :filename=>nil},
+ { :content_type=>"text/html",
+ :url_part_number=>15,
+ :within_rfc822_subject=>"Thank you from example",
+ :filename=>nil},
+ { :content_type=>"image/gif",
+ :url_part_number=>16,
+ :within_rfc822_subject=>"Thank you from example",
+ :filename=>"image001.gif"},
+ { :content_type=>"text/plain",
+ :url_part_number=>17,
+ :within_rfc822_subject=>"example - Meeting - Tuesday 2nd March",
+ :filename=>nil},
+ { :content_type=>"text/plain",
+ :url_part_number=>18,
+ :within_rfc822_subject=>"example - Help needed",
+ :filename=>nil},
+ { :content_type=>"application/pdf",
+ :url_part_number=>19,
+ :within_rfc822_subject=>"example - Help needed",
+ :filename=>"Information Pack.pdf"},
+ { :content_type=>"text/plain",
+ :url_part_number=>20,
+ :within_rfc822_subject=>"Re: As promised - info (example)",
+ :filename=>nil} ]
+
+ attributes.each_with_index do |attr, index|
+ attr.delete(:charset)
+ attr.delete(:body)
+ attr.delete(:hexdigest)
+ attr.should == expected_attributes[index]
+ end
+ end
+end
diff --git a/spec/models/incoming_message_spec.rb b/spec/models/incoming_message_spec.rb
index 1278535f8..70b323e9f 100644
--- a/spec/models/incoming_message_spec.rb
+++ b/spec/models/incoming_message_spec.rb
@@ -68,6 +68,14 @@ describe IncomingMessage, " when dealing with incoming mail" do
message.get_main_body_text_internal.should include("The above text was badly encoded")
end
+ it 'should convert DOS-style linebreaks to Unix style' do
+ ir = info_requests(:fancy_dog_request)
+ receive_incoming_mail('dos-linebreaks.email', ir.incoming_email)
+ message = ir.incoming_messages[1]
+ message.parse_raw_email!
+ message.get_main_body_text_internal.should_not match(/\r\n/)
+ end
+
it "should fold multiline sections" do
{
"foo\n--------\nconfidential" => "foo\nFOLDED_QUOTED_SECTION\n", # basic test
@@ -102,27 +110,6 @@ describe IncomingMessage, " when dealing with incoming mail" do
end
-describe IncomingMessage, "when parsing HTML mail" do
- it "should display UTF-8 characters in the plain text version correctly" do
- html = "<html><b>foo</b> është"
- plain_text = IncomingMessage._get_attachment_text_internal_one_file('text/html', html)
- plain_text.should match(/është/)
- end
-
-end
-
-describe IncomingMessage, "when getting the attachment text" do
-
- it "should not raise an error if the expansion of a zip file raises an error" do
- mock_entry = mock('ZipFile entry', :file? => true)
- mock_entry.stub!(:get_input_stream).and_raise("invalid distance too far back")
- Zip::ZipFile.stub!(:open).and_return([mock_entry])
- IncomingMessage._get_attachment_text_internal_one_file('application/zip', "some string")
- end
-
-end
-
-
describe IncomingMessage, " display attachments" do
it "should not show slashes in filenames" do
@@ -138,7 +125,7 @@ describe IncomingMessage, " display attachments" do
# http://www.whatdotheyknow.com/request/post_commercial_manager_librarie#incoming-17233
foi_attachment.within_rfc822_subject = "FOI/09/066 RESPONSE TO FOI REQUEST RECEIVED 21st JANUARY 2009"
foi_attachment.content_type = 'text/plain'
- foi_attachment.ensure_filename!
+ foi_attachment.ensure_filename!
expected_display_filename = foi_attachment.within_rfc822_subject.gsub(/\//, " ") + ".txt"
foi_attachment.display_filename.should == expected_display_filename
end
@@ -326,12 +313,12 @@ describe IncomingMessage, " when censoring data" do
orig_pdf = load_file_fixture('tfl.pdf')
pdf = orig_pdf.dup
- orig_text = IncomingMessage._get_attachment_text_internal_one_file('application/pdf', pdf)
+ orig_text = MailHandler.get_attachment_text_one_file('application/pdf', pdf)
orig_text.should match(/foi@tfl.gov.uk/)
@im.binary_mask_stuff!(pdf, "application/pdf")
- masked_text = IncomingMessage._get_attachment_text_internal_one_file('application/pdf', pdf)
+ masked_text = MailHandler.get_attachment_text_one_file('application/pdf', pdf)
masked_text.should_not match(/foi@tfl.gov.uk/)
masked_text.should match(/xxx@xxx.xxx.xx/)
config['USE_GHOSTSCRIPT_COMPRESSION'] = previous