1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
|
require 'erb'

module AttachmentToHTML
  module Adapters
    # Convert application/rtf documents in to HTML.
    #
    # The attachment body is written to a temporary file, run through the
    # external `unrtf --html` command, and the <body> of the resulting
    # document is re-embedded in a small HTML template of our own.
    class RTF
      attr_reader :attachment, :wrapper, :tmpdir

      # Public: Initialize a RTF converter
      #
      # attachment - the FoiAttachment to convert to HTML
      # opts       - a Hash of options (default: {}):
      #              :wrapper - String id of the div that wraps the
      #                         attachment body (default: 'wrapper')
      #              :tmpdir  - String name of directory to store the
      #                         converted document (default: tmp/ under
      #                         Rails.root)
      def initialize(attachment, opts = {})
        @attachment = attachment
        @wrapper = opts.fetch(:wrapper, 'wrapper')
        # Block form so that the Rails default is only evaluated when the
        # caller did not supply a :tmpdir.
        @tmpdir = opts.fetch(:tmpdir) { ::Rails.root.join('tmp') }
      end

      # Public: Convert the attachment to HTML
      #
      # The result is memoized, so the conversion runs at most once per
      # instance.
      #
      # Returns a String
      def to_html
        @html ||= generate_html
      end

      # Public: Was the document conversion successful?
      #
      # The conversion counts as successful when the converted <body> has
      # any text or contains at least one <img> element.
      #
      # Returns a Boolean
      def success?
        has_content? || contains_images?
      end

      private

      # Build the full HTML page around the converted body.
      #
      # The title and wrapper id are HTML-escaped because they are
      # interpolated into markup; the body is inserted as-is because it is
      # already HTML.
      #
      # Returns a String
      def generate_html
        [
          "<!DOCTYPE html>",
          "<html>",
          "<head>",
          "<title>#{ ERB::Util.html_escape(title) }</title>",
          "</head>",
          "<body>",
          "<div id=\"#{ ERB::Util.html_escape(wrapper) }\">",
          "<div id=\"view-html-content\">",
          body,
          "</div>",
          "</div>",
          "</body>",
          "</html>"
        ].join
      end

      # The page title: the attachment's display filename (memoized).
      def title
        @title ||= attachment.display_filename
      end

      # The HTML fragment placed inside the content div.
      def body
        parsed_body
      end

      # Parse the output of the converted attachment so that we can pluck
      # the parts we need and insert in to our own sensible template
      #
      # Returns a Nokogiri::HTML::Document
      def parsed
        @parsed ||= Nokogiri::HTML.parse(convert)
      end

      # The inner HTML of the converted document's <body>.
      #
      # Returns a String
      def parsed_body
        parsed.css('body').inner_html
      end

      # Does the body element have any content, excluding HTML tags?
      #
      # Returns a Boolean
      def has_content?
        !parsed.css('body').inner_text.empty?
      end

      # Does the body element contain any <img> elements?
      #
      # Returns a Boolean
      def contains_images?
        parsed.css('body img').any?
      end

      # Run the attachment through `unrtf --html` and memoize the output.
      #
      # Returns the sanitized output of the external command
      def convert
        return @converted if @converted

        # Get the attachment body outside of the chdir call as getting
        # the body may require opening files too
        text = attachment_body

        @converted = Dir.chdir(tmpdir) do
          tempfile = create_tempfile(text)

          html = begin
            # :timeout is passed straight through to AlaveteliExternalCommand
            # (units presumably seconds - confirm against that helper).
            AlaveteliExternalCommand.run("unrtf", "--html",
              tempfile.path, :timeout => 120
            )
          ensure
            # Always remove the tempfile, even if the command raised.
            cleanup_tempfile(tempfile)
          end

          sanitize_converted(html)
        end
      end

      # Works around http://savannah.gnu.org/bugs/?42015 in unrtf ~> 0.21,
      # which emits a DOCTYPE whose public identifier is not quoted.
      #
      # html - the String output of unrtf (may be nil)
      #
      # Returns a new String with the DOCTYPE repaired; the argument is not
      # modified. A nil argument yields an empty String.
      def sanitize_converted(html)
        # NOTE(review): assumes the external command can hand back nil when
        # unrtf fails - confirm. An empty string keeps the parsing path
        # working so that success? can report the failure.
        return '' if html.nil?

        invalid = %Q(<!DOCTYPE html PUBLIC -//W3C//DTD HTML 4.01 Transitional//EN>)
        valid   = %Q(<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">)
        html.sub(invalid, valid)
      end

      # Write text to a new tempfile in the current directory.
      #
      # The caller is expected to have chdir'd in to tmpdir already, and to
      # remove the file with cleanup_tempfile.
      #
      # Returns the flushed, still-open Tempfile
      def create_tempfile(text)
        tempfile = if RUBY_VERSION.to_f >= 1.9
          # Open the file in the text's own encoding so that printing it
          # does not transcode.
          Tempfile.new('foiextract', '.',
            :encoding => text.encoding)
        else
          Tempfile.new('foiextract', '.')
        end
        tempfile.print(text)
        # Flush so the external command sees the full contents on disk.
        tempfile.flush
        tempfile
      end

      # Close and remove a tempfile made by create_tempfile.
      def cleanup_tempfile(tempfile)
        tempfile.close
        tempfile.delete
      end

      # The raw attachment body (memoized).
      def attachment_body
        @attachment_body ||= attachment.body
      end
    end
  end
end
|