aboutsummaryrefslogtreecommitdiffstats
path: root/lib/attachment_to_html
diff options
context:
space:
mode:
author Gareth Rees <gareth@mysociety.org> 2014-04-25 16:59:34 +0100
committer Gareth Rees <gareth@mysociety.org> 2014-05-06 13:53:52 +0100
commit b4339df4caa93f44abe0cd8d9d4b8c5888662421 (patch)
tree d4a65008c35eb8e65bdc610bb82923f681ecb913 /lib/attachment_to_html
parent f4692c8a40369f22a5abe2b7a52d65c0b7702d29 (diff)
Work around bug #77932 in pdftohtml
Sometimes pdftohtml will generate thousands of images when converting an image embedded in a PDF. This causes a request spike when a user tries to view the converted PDF as HTML. As a workaround, a conversion whose HTML output contains 51 or more <img> tags is now treated as unsuccessful. See https://bugs.freedesktop.org/show_bug.cgi?id=77932 for the bug report.
Diffstat (limited to 'lib/attachment_to_html')
-rw-r--r-- lib/attachment_to_html/adapters/pdf.rb 11
1 file changed, 11 insertions, 0 deletions
diff --git a/lib/attachment_to_html/adapters/pdf.rb b/lib/attachment_to_html/adapters/pdf.rb
index 1fca2f201..b91958c52 100644
--- a/lib/attachment_to_html/adapters/pdf.rb
+++ b/lib/attachment_to_html/adapters/pdf.rb
@@ -2,6 +2,7 @@ module AttachmentToHTML
module Adapters
# Convert application/pdf documents in to HTML
class PDF
+ TOO_MANY_IMAGES = 51
attr_reader :attachment, :tmpdir
@@ -34,6 +35,7 @@ module AttachmentToHTML
#
# Returns a Boolean
def success?
+ return false if contains_too_many_images?
has_content? || contains_images?
end
@@ -52,6 +54,15 @@ module AttachmentToHTML
body.match(/<img[^>]*>/mi) ? true : false
end
+ # Works around https://bugs.freedesktop.org/show_bug.cgi?id=77932 in pdftohtml
+ def contains_too_many_images?
+ number_of_images_in_body >= TOO_MANY_IMAGES
+ end
+
+ def number_of_images_in_body
+ body.scan(/<img[^>]*>/i).size
+ end
+
def convert
# Get the attachment body outside of the chdir call as getting
# the body may require opening files too