aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author francis <francis> 2008-07-23 23:27:53 +0000
committer francis <francis> 2008-07-23 23:27:53 +0000
commit c3da447a2635d4a0f08b54a3e6abf1d166e53bd8 (patch)
tree b47ba56885fee411e0969b6a6df46facc55adf5b
parent e2fd169704615db30cb1aaf5810872fb8323a345 (diff)
Cope with .docx files
-rw-r--r-- app/models/incoming_message.rb 14
-rw-r--r-- config/packages 3
-rw-r--r-- todo.txt 20
3 files changed, 35 insertions, 2 deletions
diff --git a/app/models/incoming_message.rb b/app/models/incoming_message.rb
index 0d5aca843..450ea0416 100644
--- a/app/models/incoming_message.rb
+++ b/app/models/incoming_message.rb
@@ -19,13 +19,14 @@
# Copyright (c) 2007 UK Citizens Online Democracy. All rights reserved.
# Email: francis@mysociety.org; WWW: http://www.mysociety.org/
#
-# $Id: incoming_message.rb,v 1.123 2008-07-17 11:39:46 francis Exp $
+# $Id: incoming_message.rb,v 1.124 2008-07-23 23:27:53 francis Exp $
# TODO
# Move some of the (e.g. quoting) functions here into rblib, as they feel
# general not specific to IncomingMessage.
require 'htmlentities'
+require 'rexml/document'
module TMail
class Mail
@@ -50,8 +51,11 @@ $file_extension_to_mime_type = {
"pdf" => 'application/pdf',
"rtf" => 'application/rtf',
"doc" => 'application/vnd.ms-word',
+ "docx" => 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
"xls" => 'application/vnd.ms-excel',
+ "xlsx" => 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
"ppt" => 'application/vnd.ms-powerpoint',
+ "pptx" => 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
"tif" => 'image/tiff',
"gif" => 'image/gif',
"jpg" => 'image/jpeg', # XXX add jpeg
@@ -694,6 +698,14 @@ class IncomingMessage < ActiveRecord::Base
IO.popen("/usr/bin/pdftotext " + tempfile.path + " -", "r") do |child|
text += child.read() + "\n\n"
end
+ elsif attachment.content_type == 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
+ # just pull out the main XML file, and extract its text (dropping the markup)
+ xml = ''
+ IO.popen("/usr/bin/unzip -qq -c " + tempfile.path + " word/document.xml", "r") do |child|
+ xml += child.read() + "\n\n"
+ end
+ doc = REXML::Document.new(xml)
+ text += doc.each_element( './/text()' ){}.join(" ")
end
tempfile.close
end
diff --git a/config/packages b/config/packages
index 410a7f7be..0a944b286 100644
--- a/config/packages
+++ b/config/packages
@@ -14,4 +14,5 @@ libxapian-ruby1.8
gnuplot-nox
ttf-bitstream-vera
rubygems
-sharutils
\ No newline at end of file
+sharutils
+unzip
diff --git a/todo.txt b/todo.txt
index 2f5edb45b..e777eb750 100644
--- a/todo.txt
+++ b/todo.txt
@@ -34,6 +34,9 @@ pages.
Next
====
+.docx (hooray!) -- maybe add view as HTML / text link?
+http://www.whatdotheyknow.com/request/presentations_made_at_climate_ch#incoming-2136
+
Clear out all the need admin attention requests
Clear out all the need classifying requests
@@ -55,9 +58,20 @@ user/show.rhtml sidebar vs. generic sidebar? (ask Tommy)
Needs a tagline on each page that very quickly explains what the site is
There is grey on grey text in header? bad idea?
+Make it clearer that people should ask for documents
+http://www.whatdotheyknow.com/request/unusual_markings_in_the_uk_skies
+
Later
=====
+When described state is edited in admin interface, automatically reset the flag
+for needs classification.
+
+Multi-page .tif files are hard for people to view; consider automatically
+splitting the pages out into separate links (to .png files or whatever)
+ http://www.whatdotheyknow.com/request/windsor_maidenhead_council_commo#incoming-1910
+Heck, may as well give thumbnails of all images, indeed all docs while you're at it :)
+
In sidebar of request
Share this request on Facebook, by email etc.
Email icon here: http://www.guardian.co.uk/news/video/2008/apr/03/mugabe
@@ -162,6 +176,7 @@ Quoting fixing TODO:
http://www.whatdotheyknow.com/request/123/response/184 # nasty nasty formatted quoting
http://www.whatdotheyknow.com/request/155/response/552 # nasty nasty formatted quoting
http://www.whatdotheyknow.com/request/51/response/93 # tough quoting with <
+ http://www.whatdotheyknow.com/request/how_do_the_pct_deal_with_retirin_87#incoming-1847
http://www.whatdotheyknow.com/request/265/response/688 # word wrapping of <
http://www.whatdotheyknow.com/request/224/response/589 # have knackered the apostrophes here
@@ -174,9 +189,14 @@ Quoting fixing TODO:
http://www.whatdotheyknow.com/request/231/response/338
http://www.whatdotheyknow.com/request/930/response/1609
http://www.whatdotheyknow.com/request/1102/response/2067
+ http://www.whatdotheyknow.com/request/list_of_public_space_cctv_instal#incoming-2164
+ http://www.whatdotheyknow.com/request/errors_in_list_of_postbox_locati#incoming-2272
+
+ http://www.whatdotheyknow.com/request/public_inspection_periods_for_lo_2#outgoing-1707 # square bracket in link
http://www.whatdotheyknow.com/request/415/response/1041/attach/3/CONF%20FOI%209508%20Ian%20Holton.doc
+
Larger new features
-------------------