about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
-rw-r--r-- app/models/incoming_message.rb 134
-rw-r--r-- config/packages 1
-rw-r--r-- public/images/icon_application_zip_large.png bin 0 -> 2863 bytes
-rw-r--r-- todo.txt 12
4 files changed, 85 insertions, 62 deletions
diff --git a/app/models/incoming_message.rb b/app/models/incoming_message.rb
index b5e461e81..062f43742 100644
--- a/app/models/incoming_message.rb
+++ b/app/models/incoming_message.rb
@@ -19,7 +19,7 @@
# Copyright (c) 2007 UK Citizens Online Democracy. All rights reserved.
# Email: francis@mysociety.org; WWW: http://www.mysociety.org/
#
-# $Id: incoming_message.rb,v 1.130 2008-08-15 00:50:33 francis Exp $
+# $Id: incoming_message.rb,v 1.131 2008-08-21 02:08:17 francis Exp $
# TODO
# Move some of the (e.g. quoting) functions here into rblib, as they feel
@@ -27,6 +27,7 @@
require 'htmlentities'
require 'rexml/document'
+require 'zip/zip'
module TMail
class Mail
@@ -73,6 +74,7 @@ $file_extension_to_mime_type = {
"jpg" => 'image/jpeg', # XXX add jpeg
"png" => 'image/png',
"html" => 'text/html', # XXX add htm
+ "zip" => 'application/zip'
}
# XXX doesn't have way of choosing default for inverse map - might want to add
# one when you need it
@@ -415,6 +417,9 @@ class IncomingMessage < ActiveRecord::Base
if curr_mail.content_type == 'application/msword'
curr_mail.content_type = 'application/vnd.ms-word'
end
+ if curr_mail.content_type == 'application/x-zip-compressed'
+ curr_mail.content_type = 'application/zip'
+ end
# If the part is an attachment of email in text form
if curr_mail.content_type == 'message/rfc822'
ensure_parts_counted # fills in rfc822_attachment variable
@@ -681,66 +686,87 @@ class IncomingMessage < ActiveRecord::Base
text = IncomingMessage.remove_privacy_sensitive_things(text)
return text
end
- def get_attachment_text_internal
- # XXX - tell all these command line tools to return utf-8
+ def IncomingMessage.get_attachment_text_internal_one_file(content_type, body)
text = ''
- attachments = self.get_attachments_for_display
- for attachment in attachments
- if attachment.content_type == 'text/plain'
- text += attachment.body + "\n\n"
- else
- tempfile = Tempfile.new('foiextract')
- tempfile.print attachment.body
- tempfile.flush
- if attachment.content_type == 'application/vnd.ms-word'
- system("/usr/bin/wvText " + tempfile.path + " " + tempfile.path + ".txt")
- # Try catdoc if we get into trouble (e.g. for InfoRequestEvent 2701)
- if not File.exists?(tempfile.path + ".txt")
- IO.popen("/usr/bin/catdoc " + tempfile.path, "r") do |child|
- text += child.read() + "\n\n"
- end
- else
- text += File.read(tempfile.path + ".txt") + "\n\n"
- File.unlink(tempfile.path + ".txt")
- end
- elsif attachment.content_type == 'application/rtf'
+ # XXX - tell all these command line tools to return utf-8
+ if content_type == 'text/plain'
+ text += body + "\n\n"
+ else
+ tempfile = Tempfile.new('foiextract')
+ tempfile.print body
+ tempfile.flush
+ if content_type == 'application/vnd.ms-word'
+ system("/usr/bin/wvText " + tempfile.path + " " + tempfile.path + ".txt")
+ # Try catdoc if we get into trouble (e.g. for InfoRequestEvent 2701)
+ if not File.exists?(tempfile.path + ".txt")
IO.popen("/usr/bin/catdoc " + tempfile.path, "r") do |child|
text += child.read() + "\n\n"
end
- elsif attachment.content_type == 'text/html'
- IO.popen("/usr/bin/lynx -force_html -dump " + tempfile.path, "r") do |child|
- text += child.read() + "\n\n"
- end
- elsif attachment.content_type == 'application/vnd.ms-excel'
- # Bit crazy using strings - but xls2csv, xlhtml and py_xls2txt
- # only extract text from cells, not from floating notes. catdoc
- # may be fooled by weird character sets, but will probably do for
- # UK FOI requests.
- IO.popen("/usr/bin/strings " + tempfile.path, "r") do |child|
- text += child.read() + "\n\n"
- end
- elsif attachment.content_type == 'application/vnd.ms-powerpoint'
- # ppthtml seems to catch more text, but only outputs HTML when
- # we want text, so just use catppt for now
- IO.popen("/usr/bin/catppt " + tempfile.path, "r") do |child|
- text += child.read() + "\n\n"
- end
- elsif attachment.content_type == 'application/pdf'
- IO.popen("/usr/bin/pdftotext " + tempfile.path + " -", "r") do |child|
- text += child.read() + "\n\n"
- end
- elsif attachment.content_type == 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
- # This is Microsoft's XML office document format.
- # Just pull out the main XML file, and strip it of text.
- xml = ''
- IO.popen("/usr/bin/unzip -qq -c " + tempfile.path + " word/document.xml", "r") do |child|
- xml += child.read() + "\n\n"
+ else
+ text += File.read(tempfile.path + ".txt") + "\n\n"
+ File.unlink(tempfile.path + ".txt")
+ end
+ elsif content_type == 'application/rtf'
+ IO.popen("/usr/bin/catdoc " + tempfile.path, "r") do |child|
+ text += child.read() + "\n\n"
+ end
+ elsif content_type == 'text/html'
+ IO.popen("/usr/bin/lynx -force_html -dump " + tempfile.path, "r") do |child|
+ text += child.read() + "\n\n"
+ end
+ elsif content_type == 'application/vnd.ms-excel'
+ # Bit crazy using strings - but xls2csv, xlhtml and py_xls2txt
+ # only extract text from cells, not from floating notes. catdoc
+ # may be fooled by weird character sets, but will probably do for
+ # UK FOI requests.
+ IO.popen("/usr/bin/strings " + tempfile.path, "r") do |child|
+ text += child.read() + "\n\n"
+ end
+ elsif content_type == 'application/vnd.ms-powerpoint'
+ # ppthtml seems to catch more text, but only outputs HTML when
+ # we want text, so just use catppt for now
+ IO.popen("/usr/bin/catppt " + tempfile.path, "r") do |child|
+ text += child.read() + "\n\n"
+ end
+ elsif content_type == 'application/pdf'
+ IO.popen("/usr/bin/pdftotext " + tempfile.path + " -", "r") do |child|
+ text += child.read() + "\n\n"
+ end
+ elsif content_type == 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
+ # This is Microsoft's XML office document format.
+ # Just pull out the main XML file, and strip it of text.
+ xml = ''
+ IO.popen("/usr/bin/unzip -qq -c " + tempfile.path + " word/document.xml", "r") do |child|
+ xml += child.read() + "\n\n"
+ end
+ doc = REXML::Document.new(xml)
+ text += doc.each_element( './/text()' ){}.join(" ")
+ elsif content_type == 'application/zip'
+ # recurse into zip files
+ zip_file = Zip::ZipFile.open(tempfile.path)
+ for entry in zip_file
+ if entry.file?
+ filename = entry.to_s
+ body = entry.get_input_stream.read
+ calc_mime = filename_to_mimetype(filename)
+ content_type = calc_mime or 'application/octet-stream'
+
+ STDERR.puts("doing file " + filename + " content type " + content_type)
+ text += IncomingMessage.get_attachment_text_internal_one_file(content_type, body)
end
- doc = REXML::Document.new(xml)
- text += doc.each_element( './/text()' ){}.join(" ")
end
- tempfile.close
end
+ tempfile.close
+ end
+
+ return text
+ end
+ def get_attachment_text_internal
+ # Extract text from each attachment
+ text = ''
+ attachments = self.get_attachments_for_display
+ for attachment in attachments
+ text += IncomingMessage.get_attachment_text_internal_one_file(attachment.content_type, attachment.body)
end
# Remove any bad characters
text = Iconv.conv('utf-8//IGNORE', 'utf-8', text)
diff --git a/config/packages b/config/packages
index 0a944b286..c7b2c3964 100644
--- a/config/packages
+++ b/config/packages
@@ -16,3 +16,4 @@ ttf-bitstream-vera
rubygems
sharutils
unzip
+libzip-ruby1.8
diff --git a/public/images/icon_application_zip_large.png b/public/images/icon_application_zip_large.png
new file mode 100644
index 000000000..0a14e978e
--- /dev/null
+++ b/public/images/icon_application_zip_large.png
Binary files differ
diff --git a/todo.txt b/todo.txt
index c20b107d3..47082620c 100644
--- a/todo.txt
+++ b/todo.txt
@@ -24,14 +24,6 @@ http://www.whatdotheyknow.com/request/forthcoming_tidal_and_flood_defe#incoming-
.zip file extensions
http://www.whatdotheyknow.com/request/open_source_schools_tender
-Green box when you are classifying request is a bit too big / prominent
-Solicit people to tell us requests are misclassified?
-
-Search for user.name, esp. ./contact_mailer/user_message.rhtml
-
-Sort list of email subscriptions by type, and probably alphabetically /
-chronologically, and compress them a bit more somehow in vertical space.
-
"Then you will be emailed whenever 'Martin Stabe' requests something or gets a
response" doesn't word wrap in confirmation email, and no full stop.
@@ -84,6 +76,10 @@ http://www.whatdotheyknow.com/request/unusual_markings_in_the_uk_skies
Clear out all the need admin attention requests
Clear out all the need classifying requests
+Admin:
+Have internal links to different parts of request page
+Somehow fold up the enormous pages on many admin pages
+Make it easy to go from pages to admin page
Later
=====