From b4339df4caa93f44abe0cd8d9d4b8c5888662421 Mon Sep 17 00:00:00 2001
From: Gareth Rees <gareth@mysociety.org>
Date: Fri, 25 Apr 2014 16:59:34 +0100
Subject: Work around bug#77932 in pdftohtml

Sometimes pdftohtml will generate thousands of images when converting an
image embedded in a PDF. This causes a request spike when a user tries
to view the converted PDF as HTML.

See https://bugs.freedesktop.org/show_bug.cgi?id=77932 for the bug
report.
---
 lib/attachment_to_html/adapters/pdf.rb | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'lib/attachment_to_html')

diff --git a/lib/attachment_to_html/adapters/pdf.rb b/lib/attachment_to_html/adapters/pdf.rb
index 1fca2f201..b91958c52 100644
--- a/lib/attachment_to_html/adapters/pdf.rb
+++ b/lib/attachment_to_html/adapters/pdf.rb
@@ -2,6 +2,7 @@ module AttachmentToHTML
     module Adapters
         # Convert application/pdf documents in to HTML
         class PDF
+            TOO_MANY_IMAGES = 51
 
             attr_reader :attachment, :tmpdir
 
@@ -34,6 +35,7 @@ module AttachmentToHTML
             #
             # Returns a Boolean
             def success?
+                return false if contains_too_many_images?
                 has_content? || contains_images?
             end
 
@@ -52,6 +54,15 @@ module AttachmentToHTML
                 body.match(/<img[^>]*>/mi) ? true : false
             end
 
+            # Works around a pdftohtml bug where one embedded image is converted into thousands of images: https://bugs.freedesktop.org/show_bug.cgi?id=77932
+            def contains_too_many_images?
+                number_of_images_in_body >= TOO_MANY_IMAGES
+            end
+
+            def number_of_images_in_body
+                body.scan(/<img[^>]*>/i).size
+            end
+
             def convert
                 # Get the attachment body outside of the chdir call as getting
                 # the body may require opening files too
-- 
cgit v1.2.3