diff options
-rw-r--r-- | app/models/incoming_message.rb | 134 | ||||
-rw-r--r-- | config/packages | 1 | ||||
-rw-r--r-- | public/images/icon_application_zip_large.png | bin | 0 -> 2863 bytes | |||
-rw-r--r-- | todo.txt | 12 |
4 files changed, 85 insertions, 62 deletions
diff --git a/app/models/incoming_message.rb b/app/models/incoming_message.rb index b5e461e81..062f43742 100644 --- a/app/models/incoming_message.rb +++ b/app/models/incoming_message.rb @@ -19,7 +19,7 @@ # Copyright (c) 2007 UK Citizens Online Democracy. All rights reserved. # Email: francis@mysociety.org; WWW: http://www.mysociety.org/ # -# $Id: incoming_message.rb,v 1.130 2008-08-15 00:50:33 francis Exp $ +# $Id: incoming_message.rb,v 1.131 2008-08-21 02:08:17 francis Exp $ # TODO # Move some of the (e.g. quoting) functions here into rblib, as they feel @@ -27,6 +27,7 @@ require 'htmlentities' require 'rexml/document' +require 'zip/zip' module TMail class Mail @@ -73,6 +74,7 @@ $file_extension_to_mime_type = { "jpg" => 'image/jpeg', # XXX add jpeg "png" => 'image/png', "html" => 'text/html', # XXX add htm + "zip" => 'application/zip' } # XXX doesn't have way of choosing default for inverse map - might want to add # one when you need it @@ -415,6 +417,9 @@ class IncomingMessage < ActiveRecord::Base if curr_mail.content_type == 'application/msword' curr_mail.content_type = 'application/vnd.ms-word' end + if curr_mail.content_type == 'application/x-zip-compressed' + curr_mail.content_type = 'application/zip' + end # If the part is an attachment of email in text form if curr_mail.content_type == 'message/rfc822' ensure_parts_counted # fills in rfc822_attachment variable @@ -681,66 +686,87 @@ class IncomingMessage < ActiveRecord::Base text = IncomingMessage.remove_privacy_sensitive_things(text) return text end - def get_attachment_text_internal - # XXX - tell all these command line tools to return utf-8 + def IncomingMessage.get_attachment_text_internal_one_file(content_type, body) text = '' - attachments = self.get_attachments_for_display - for attachment in attachments - if attachment.content_type == 'text/plain' - text += attachment.body + "\n\n" - else - tempfile = Tempfile.new('foiextract') - tempfile.print attachment.body - tempfile.flush - if attachment.content_type == 'application/vnd.ms-word' - system("/usr/bin/wvText " + tempfile.path + " " + tempfile.path + ".txt") - # Try catdoc if we get into trouble (e.g. for InfoRequestEvent 2701) - if not File.exists?(tempfile.path + ".txt") - IO.popen("/usr/bin/catdoc " + tempfile.path, "r") do |child| - text += child.read() + "\n\n" - end - else - text += File.read(tempfile.path + ".txt") + "\n\n" - File.unlink(tempfile.path + ".txt") - end - elsif attachment.content_type == 'application/rtf' + # XXX - tell all these command line tools to return utf-8 + if content_type == 'text/plain' + text += body + "\n\n" + else + tempfile = Tempfile.new('foiextract') + tempfile.print body + tempfile.flush + if content_type == 'application/vnd.ms-word' + system("/usr/bin/wvText " + tempfile.path + " " + tempfile.path + ".txt") + # Try catdoc if we get into trouble (e.g. for InfoRequestEvent 2701) + if not File.exists?(tempfile.path + ".txt") IO.popen("/usr/bin/catdoc " + tempfile.path, "r") do |child| text += child.read() + "\n\n" end - elsif attachment.content_type == 'text/html' - IO.popen("/usr/bin/lynx -force_html -dump " + tempfile.path, "r") do |child| - text += child.read() + "\n\n" - end - elsif attachment.content_type == 'application/vnd.ms-excel' - # Bit crazy using strings - but xls2csv, xlhtml and py_xls2txt - # only extract text from cells, not from floating notes. catdoc - # may be fooled by weird character sets, but will probably do for - # UK FOI requests. - IO.popen("/usr/bin/strings " + tempfile.path, "r") do |child| - text += child.read() + "\n\n" - end - elsif attachment.content_type == 'application/vnd.ms-powerpoint' - # ppthtml seems to catch more text, but only outputs HTML when - # we want text, so just use catppt for now - IO.popen("/usr/bin/catppt " + tempfile.path, "r") do |child| - text += child.read() + "\n\n" - end - elsif attachment.content_type == 'application/pdf' - IO.popen("/usr/bin/pdftotext " + tempfile.path + " -", "r") do |child| - text += child.read() + "\n\n" - end - elsif attachment.content_type == 'application/vnd.openxmlformats-officedocument.wordprocessingml.document' - # This is Microsoft's XML office document format. - # Just pull out the main XML file, and strip it of text. - xml = '' - IO.popen("/usr/bin/unzip -qq -c " + tempfile.path + " word/document.xml", "r") do |child| - xml += child.read() + "\n\n" + else + text += File.read(tempfile.path + ".txt") + "\n\n" + File.unlink(tempfile.path + ".txt") + end + elsif content_type == 'application/rtf' + IO.popen("/usr/bin/catdoc " + tempfile.path, "r") do |child| + text += child.read() + "\n\n" + end + elsif content_type == 'text/html' + IO.popen("/usr/bin/lynx -force_html -dump " + tempfile.path, "r") do |child| + text += child.read() + "\n\n" + end + elsif content_type == 'application/vnd.ms-excel' + # Bit crazy using strings - but xls2csv, xlhtml and py_xls2txt + # only extract text from cells, not from floating notes. catdoc + # may be fooled by weird character sets, but will probably do for + # UK FOI requests. + IO.popen("/usr/bin/strings " + tempfile.path, "r") do |child| + text += child.read() + "\n\n" + end + elsif content_type == 'application/vnd.ms-powerpoint' + # ppthtml seems to catch more text, but only outputs HTML when + # we want text, so just use catppt for now + IO.popen("/usr/bin/catppt " + tempfile.path, "r") do |child| + text += child.read() + "\n\n" + end + elsif content_type == 'application/pdf' + IO.popen("/usr/bin/pdftotext " + tempfile.path + " -", "r") do |child| + text += child.read() + "\n\n" + end + elsif content_type == 'application/vnd.openxmlformats-officedocument.wordprocessingml.document' + # This is Microsoft's XML office document format. + # Just pull out the main XML file, and strip it of text. + xml = '' + IO.popen("/usr/bin/unzip -qq -c " + tempfile.path + " word/document.xml", "r") do |child| + xml += child.read() + "\n\n" + end + doc = REXML::Document.new(xml) + text += doc.each_element( './/text()' ){}.join(" ") + elsif content_type == 'application/zip' + # recurse into zip files + zip_file = Zip::ZipFile.open(tempfile.path) + for entry in zip_file + if entry.file? + filename = entry.to_s + body = entry.get_input_stream.read + calc_mime = filename_to_mimetype(filename) + content_type = calc_mime or 'application/octet-stream' + + STDERR.puts("doing file " + filename + " content type " + content_type) + text += IncomingMessage.get_attachment_text_internal_one_file(content_type, body) end - doc = REXML::Document.new(xml) - text += doc.each_element( './/text()' ){}.join(" ") end - tempfile.close end + tempfile.close + end + + return text + end + def get_attachment_text_internal + # Extract text from each attachment + text = '' + attachments = self.get_attachments_for_display + for attachment in attachments + text += IncomingMessage.get_attachment_text_internal_one_file(attachment.content_type, attachment.body) end # Remove any bad characters text = Iconv.conv('utf-8//IGNORE', 'utf-8', text) diff --git a/config/packages b/config/packages index 0a944b286..c7b2c3964 100644 --- a/config/packages +++ b/config/packages @@ -16,3 +16,4 @@ ttf-bitstream-vera rubygems sharutils unzip +libzip-ruby1.8 diff --git a/public/images/icon_application_zip_large.png b/public/images/icon_application_zip_large.png Binary files differnew file mode 100644 index 000000000..0a14e978e --- /dev/null +++ b/public/images/icon_application_zip_large.png @@ -24,14 +24,6 @@ http://www.whatdotheyknow.com/request/forthcoming_tidal_and_flood_defe#incoming- .zip file extensions http://www.whatdotheyknow.com/request/open_source_schools_tender -Green box when you are classifying request is a bit too big / prominent -Solicit people to tell us requests are misclassified? - -Search for user.name, esp. ./contact_mailer/user_message.rhtml - -Sort list of email subscriptions by type, and probably alphabetically / -chronologically, and compress them a bit more somehow in vertical space. - "Then you will be emailed whenever 'Martin Stabe' requests something or gets a response" doesn't word wrap in confirmation email, and no full stop. @@ -84,6 +76,10 @@ http://www.whatdotheyknow.com/request/unusual_markings_in_the_uk_skies Clear out all the need admin attention requests Clear out all the need classifying requests +Admin: +Have internal links to different parts of request page +Somehow fold up the enormous pages on many admin pages +Make it easy to go from pages to admin page Later ===== |