diff options
-rw-r--r-- | app/models/incoming_message.rb | 14 | ||||
-rw-r--r-- | config/general.yml-example | 9 | ||||
-rw-r--r-- | spec/models/incoming_message_spec.rb | 19 |
3 files changed, 37 insertions, 5 deletions
diff --git a/app/models/incoming_message.rb b/app/models/incoming_message.rb index 7d9cfbfa1..90ab84a8f 100644 --- a/app/models/incoming_message.rb +++ b/app/models/incoming_message.rb @@ -461,11 +461,23 @@ class IncomingMessage < ActiveRecord::Base if censored_uncompressed_text != uncompressed_text # then use the altered file (recompressed) recompressed_text = nil - IO.popen("/usr/bin/pdftk - output - compress", "r+") do |child| + if MySociety::Config.get('USE_GHOSTSCRIPT_COMPRESSION') == true + command = "gs -sDEVICE=pdfwrite -dCompatibilityLevel=1.4 -dPDFSETTINGS=/screen -dNOPAUSE -dQUIET -dBATCH -sOutputFile=- -" + else + command = "/usr/bin/pdftk - output - compress" + end + IO.popen(command, "r+") do |child| child.write(censored_uncompressed_text) child.close_write() recompressed_text = child.read() end + if recompressed_text.nil? || recompressed_text.empty? + # buggy versions of pdftk sometimes fail on + # compression, I don't see it's a disaster in + # these cases to save an uncompressed version? + recompressed_text = censored_uncompressed_text + logger.warn "Unable to compress PDF; problem with your pdftk version?" + end if !recompressed_text.nil? && !recompressed_text.empty? text[0..-1] = recompressed_text # [0..-1] makes it change the 'text' string in place end diff --git a/config/general.yml-example b/config/general.yml-example index 729278a8d..c832111c7 100644 --- a/config/general.yml-example +++ b/config/general.yml-example @@ -94,3 +94,12 @@ RECAPTCHA_PRIVATE_KEY: 'x' # existing process previously served a larger request, this won't # show any consumption for the later request. DEBUG_RECORD_MEMORY: false + +# Currently we default to using pdftk to compress PDFs. You can +# optionally try Ghostscript, which should do a better job of +# compression. Some versions of pdftk are buggy with respect to +# compression, in which case Alaveteli doesn't recompress the PDFs at +# all and logs a warning message "Unable to compress PDF"; which would +# be another reason to try this setting. +USE_GHOSTSCRIPT_COMPRESSION: true + diff --git a/spec/models/incoming_message_spec.rb b/spec/models/incoming_message_spec.rb index 70d92356f..5fcc534ca 100644 --- a/spec/models/incoming_message_spec.rb +++ b/spec/models/incoming_message_spec.rb @@ -160,10 +160,12 @@ describe IncomingMessage, " when censoring data" do data.should == "His email was x\000x\000x\000@\000x\000x\000x\000.\000x\000x\000x\000, indeed" end - # As at March 9th 2010: This test fails with pdftk 1.41+dfsg-1 installed - # which is in Ubuntu Karmic. It works again for the lasest version - # 1.41+dfsg-7 in Debian unstable. And it works for Debian stable. - it "should replace everything in PDF files" do + + + def pdf_replacement_test(use_ghostscript_compression) + config = MySociety::Config.load_default() + previous = config['USE_GHOSTSCRIPT_COMPRESSION'] + config['USE_GHOSTSCRIPT_COMPRESSION'] = use_ghostscript_compression orig_pdf = load_file_fixture('tfl.pdf') pdf = orig_pdf.dup @@ -175,6 +177,15 @@ describe IncomingMessage, " when censoring data" do masked_text = IncomingMessage._get_attachment_text_internal_one_file('application/pdf', pdf) masked_text.should_not match(/foi@tfl.gov.uk/) masked_text.should match(/xxx@xxx.xxx.xx/) + config['USE_GHOSTSCRIPT_COMPRESSION'] = previous + end + + it "should replace everything in PDF files using pdftk" do + pdf_replacement_test(false) + end + + it "should replace everything in PDF files using ghostscript" do + pdf_replacement_test(true) end it "should not produce zero length output if pdftk silently fails" do |