blob: afc8fbcb023754256a2e35a1073e1f49b7f678f3 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
|
# -*- encoding : utf-8 -*-
module AttachmentToHTML
module Adapters
# Convert application/pdf documents into HTML
class PDF < Adapter
  # A converted body containing this many <img> tags (or more) is treated
  # as a failed conversion; see #contains_too_many_images?.
  TOO_MANY_IMAGES = 51

  attr_reader :tmpdir

  # Public: Initialize a PDF converter
  #
  # attachment - the FoiAttachment to convert to HTML
  # opts       - a Hash of options (default: {}):
  #              :tmpdir - String name of directory to store the
  #                        converted document
  def initialize(attachment, opts = {})
    super
    @tmpdir = opts.fetch(:tmpdir, ::Rails.root.join('tmp'))
  end

  # Public: Was the document conversion successful?
  #
  # A conversion that produced too many images is always a failure;
  # otherwise it succeeds when there is content or at least one image.
  #
  # Returns a Boolean
  def success?
    return false if contains_too_many_images?
    has_content? || contains_images?
  end

  private

  # Extract the inner HTML of the <body> element from the converted
  # document.
  #
  # Returns the String between the <body> tags, or an empty String when
  # the conversion produced nothing or no <body> element was found.
  def parse_body
    conversion = convert
    match = conversion ? conversion.match(/<body[^>]*>(.*?)<\/body>/mi) : nil
    match ? match[1] : ''
  end

  # Works around https://bugs.freedesktop.org/show_bug.cgi?id=77932 in pdftohtml
  #
  # Returns a Boolean
  def contains_too_many_images?
    number_of_images_in_body >= TOO_MANY_IMAGES
  end

  # Count the <img> tags (case-insensitive) in the parsed body.
  #
  # Returns an Integer
  def number_of_images_in_body
    body.scan(/<img[^>]*>/i).size
  end

  # Run pdftohtml over the attachment and memoize the resulting HTML.
  #
  # A falsy result is not cached, so a later call will retry the
  # conversion (same behaviour as memoizing with `||=`).
  #
  # Returns whatever AlaveteliExternalCommand.run returns for the
  # pdftohtml invocation.
  def convert
    # Skip fetching the attachment body again once a result is cached.
    return @converted if @converted

    # Get the attachment body outside of the chdir call as getting
    # the body may require opening files too
    text = attachment_body

    @converted = Dir.chdir(tmpdir) do
      tempfile = nil
      begin
        tempfile = create_tempfile(text)
        AlaveteliExternalCommand.run("pdftohtml",
          "-nodrm", "-zoom", "1.0", "-stdout", "-enc", "UTF-8",
          "-noframes", "./#{File.basename(tempfile.path)}",
          :timeout => 30, :binary_output => false
        )
      ensure
        # Always remove the temporary file, even if the external command
        # raises; tempfile is still nil if create_tempfile itself failed.
        cleanup_tempfile(tempfile) if tempfile
      end
    end
  end
end
end
end
|