aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorfrancis <francis>2008-03-07 10:13:57 +0000
committerfrancis <francis>2008-03-07 10:13:57 +0000
commit2d601445941b51ad14d6dc851ead1c0fbde3a8bf (patch)
treee427d3bb8d39d0ca058e7b11d47f7581919d2f2d
parentf31b32227a94aa2c8c3ecfca93d1c9ada7b6b21a (diff)
Index word docs and PDFs
-rw-r--r--app/models/incoming_message.rb32
-rw-r--r--todo.txt7
2 files changed, 37 insertions, 2 deletions
diff --git a/app/models/incoming_message.rb b/app/models/incoming_message.rb
index 49b24b694..a22385347 100644
--- a/app/models/incoming_message.rb
+++ b/app/models/incoming_message.rb
@@ -18,7 +18,7 @@
# Copyright (c) 2007 UK Citizens Online Democracy. All rights reserved.
# Email: francis@mysociety.org; WWW: http://www.mysociety.org/
#
-# $Id: incoming_message.rb,v 1.54 2008-03-06 23:17:28 francis Exp $
+# $Id: incoming_message.rb,v 1.55 2008-03-07 10:13:57 francis Exp $
# TODO
@@ -314,9 +314,37 @@ class IncomingMessage < ActiveRecord::Base
text = IncomingMessage.remove_quoted_sections(text, "")
end
+ # Returns text version of attachment text
+ def get_attachment_text
+ text = ''
+ attachments = self.get_attachments_for_display
+ for attachment in attachments
+ if attachment.content_type == 'text/plain'
+ text += attachment.body
+ elsif attachment.content_type == 'application/msword'
+ tempfile = Tempfile.new('foipdf')
+ tempfile.print attachment.body
+ tempfile.flush
+ system("/usr/bin/wvText " + tempfile.path + " " + tempfile.path + ".txt")
+ text += File.read(tempfile.path + ".txt")
+ File.unlink(tempfile.path + ".txt")
+ tempfile.close
+ elsif attachment.content_type == 'application/pdf'
+ tempfile = Tempfile.new('foipdf')
+ tempfile.print attachment.body
+ tempfile.flush
+ IO.popen("/usr/bin/pdftotext " + tempfile.path + " -", "r") do |child|
+ text += child.read()
+ end
+ tempfile.close
+ end
+ end
+ return text
+ end
+
# Returns the text used for indexing this message: the result of
# get_body_for_quoting followed by the result of get_attachment_text
def get_text_for_indexing
- return get_body_for_quoting()
+ return get_body_for_quoting + get_attachment_text
end
# Returns the name of the person the incoming message is from, or nil if there isn't one
diff --git a/todo.txt b/todo.txt
index acc31b894..3bb7ea53a 100644
--- a/todo.txt
+++ b/todo.txt
@@ -1,8 +1,11 @@
Search:
+Don't show same request so many times
Add indexing of PDFs and DOCs etc.
Date ranges and types and stuff
+Search for users
+Search for public bodies
FOI requests to use to test it
==============================
@@ -57,6 +60,10 @@ eived from server "/data/vhost/foi.mysociety.org/docs/dispatch.fcgi"
"Government" in about page
+Send email to remind people to classify
+Send email to tell admins something isn't classified
+Send email to remind people to clarify
+
Later
=====