aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--app/controllers/request_controller.rb4
-rw-r--r--app/models/incoming_message.rb34
-rw-r--r--app/views/admin_censor_rule/_form.rhtml10
-rw-r--r--config/packages1
-rw-r--r--todo.txt4
5 files changed, 41 insertions, 12 deletions
diff --git a/app/controllers/request_controller.rb b/app/controllers/request_controller.rb
index ec5115f99..98c4c5be4 100644
--- a/app/controllers/request_controller.rb
+++ b/app/controllers/request_controller.rb
@@ -4,7 +4,7 @@
# Copyright (c) 2007 UK Citizens Online Democracy. All rights reserved.
# Email: francis@mysociety.org; WWW: http://www.mysociety.org/
#
-# $Id: request_controller.rb,v 1.169 2009-08-18 20:51:25 francis Exp $
+# $Id: request_controller.rb,v 1.170 2009-08-20 11:05:24 francis Exp $
class RequestController < ApplicationController
@@ -578,7 +578,7 @@ class RequestController < ApplicationController
@attachment = IncomingMessage.get_attachment_by_url_part_number(@incoming_message.get_attachments_for_display, @part_number)
# Prevent spam to magic request address.
- # XXX Bit dodgy modifying a binary like this but hey. Maybe only do for some mime types?
+ # It's a bit dodgy modifying a binary like this but hey. Some mime types are excluded for that reason.
@attachment.body = @incoming_message.binary_mask_stuff(@attachment.body, @attachment.content_type)
@attachment_url = get_attachment_url(:id => @incoming_message.info_request_id,
diff --git a/app/models/incoming_message.rb b/app/models/incoming_message.rb
index d8aaeabe7..eae6542ef 100644
--- a/app/models/incoming_message.rb
+++ b/app/models/incoming_message.rb
@@ -19,7 +19,7 @@
# Copyright (c) 2007 UK Citizens Online Democracy. All rights reserved.
# Email: francis@mysociety.org; WWW: http://www.mysociety.org/
#
-# $Id: incoming_message.rb,v 1.210 2009-07-17 14:04:34 francis Exp $
+# $Id: incoming_message.rb,v 1.211 2009-08-20 11:05:27 francis Exp $
# TODO
# Move some of the (e.g. quoting) functions here into rblib, as they feel
@@ -68,7 +68,6 @@ $file_extension_to_mime_type_rev = $file_extension_to_mime_type.invert
# See binary_mask_stuff function below. It just tests for inclusion
# in this hash, not the value of the right hand side.
$do_not_binary_mask = {
- 'application/pdf' => 1,
'image/tiff' => 1,
'image/gif' => 1,
'image/jpeg' => 1,
@@ -446,6 +445,37 @@ class IncomingMessage < ActiveRecord::Base
if $do_not_binary_mask.include?(content_type)
return text
end
+
+ # Special cases for some content types
+ if content_type == 'application/pdf'
+ # XXX currently just applies censor rules - change this if we apply email rules too
+ if self.info_request.censor_rules.count > 0
+ uncompressed_text = nil
+ IO.popen("/usr/bin/pdftk - output - uncompress", "r+") do |child|
+ child.write(text)
+ child.close_write()
+ uncompressed_text = child.read()
+ end
+ # if we managed to uncompress the PDF...
+ if !uncompressed_text.nil?
+ censored_uncompressed_text = self.info_request.apply_censor_rules_to_binary(uncompressed_text)
+ # and the censor rule removed something...
+ if censored_uncompressed_text != uncompressed_text
+ # then use the altered file (recompressed)
+ recompressed_text = nil
+ IO.popen("/usr/bin/pdftk - output - compress", "r+") do |child|
+ child.write(censored_uncompressed_text)
+ child.close_write()
+ recompressed_text = child.read()
+ end
+ if !recompressed_text.nil?
+ text = recompressed_text
+ end
+ end
+ end
+ end
+ return text
+ end
# Keep original size, so can check haven't resized it
orig_size = text.size
diff --git a/app/views/admin_censor_rule/_form.rhtml b/app/views/admin_censor_rule/_form.rhtml
index 98518d657..4f37ddc4c 100644
--- a/app/views/admin_censor_rule/_form.rhtml
+++ b/app/views/admin_censor_rule/_form.rhtml
@@ -13,11 +13,11 @@
<%= hidden_field 'censor_rule', 'info_request_id', { :value => info_request.id } %>
</p>
-<p><strong>Warning and notes:</strong> This does replace text in binary files, but only
-in a naive way. It doesn't even do UCS-2 (unicode sometimes used in Word). It
-should work for cases that aren't links and are in Word documents, but probably
-won't work for much else. Please <strong>carefully check</strong> all
-attachments have changed in the way you expect.
+<p><strong>Warning and notes:</strong> This does replace text in binary files, but for
+most formats only in a naive way. It works well on surprisingly many Word documents. Notably
+it doesn't even do UCS-2 (unicode sometimes used in Word). There is also special code
+which works on some PDFs. Please <strong>carefully check</strong> all attachments have
+changed in the way you expect, and haven't become corrupted.
</p>
<p>You may need to manually rebuild the search index afterwards. If you need to
diff --git a/config/packages b/config/packages
index 36701f613..a8a2f80a1 100644
--- a/config/packages
+++ b/config/packages
@@ -6,6 +6,7 @@ rake
irb
wv
poppler-utils
+pdftk
gs-gpl
catdoc
links
diff --git a/todo.txt b/todo.txt
index c60e2ade1..d9aa5e806 100644
--- a/todo.txt
+++ b/todo.txt
@@ -269,10 +269,8 @@ Edits to outgoing/incoming/title won't be reindexed in Xapian (maybe just reinde
This does it all:
$ ./script/clear-incoming-text-cache ; ./script/rebuild-xapian-index
-Remove request email address from PDFs (we already do from docs)
+Remove request email address from PDFs (we already do so for docs, and we now run censor rules on PDFs)
http://www.whatdotheyknow.com/request/cost_of_policing_the_oxford_unio_3
- - maybe if text contains email, refuse to show full PDF just show conversion
- to text/HTML?
http://www.whatdotheyknow.com/request/5353/response/11911/attach/html/2/Freedom%20of%20Information%20-%20Letter%20Accepting%20Request%20-%2072057594037995214.pdf.html
Take care here: spammers sometimes find email addresses via Google's OCR of
images in PDFs