aboutsummaryrefslogtreecommitdiffstats
path: root/lib/attachment_to_html/adapters/rtf.rb
blob: 871ca2c60624f580bd1d3b2878b30b3c06b330fe (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
require 'erb'
require 'tempfile'

module AttachmentToHTML
    module Adapters
        # Convert application/rtf documents in to HTML
        #
        # The attachment body is written to a temporary file, passed to the
        # external `unrtf --html` command, and the <body> of the resulting
        # markup is re-wrapped in our own minimal HTML template.
        class RTF

            # DOCTYPE as emitted by unrtf ~> 0.21: the public identifier is
            # missing its quotes (http://savannah.gnu.org/bugs/?42015)
            INVALID_DOCTYPE =
              %Q(<!DOCTYPE html PUBLIC -//W3C//DTD HTML 4.01 Transitional//EN>).freeze

            # The same DOCTYPE with the public identifier correctly quoted
            VALID_DOCTYPE =
              %Q(<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">).freeze

            attr_reader :attachment, :wrapper, :tmpdir

            # Public: Initialize a RTF converter
            #
            # attachment - the FoiAttachment to convert to HTML
            # opts       - a Hash of options (default: {}):
            #              :wrapper - String id of the div that wraps the
            #                         attachment body
            #              :tmpdir  - String name of directory to store the
            #                         converted document
            def initialize(attachment, opts = {})
                @attachment = attachment
                @wrapper = opts.fetch(:wrapper, 'wrapper')
                # Block form so the Rails default is only looked up when the
                # caller did not supply a :tmpdir
                @tmpdir = opts.fetch(:tmpdir) { ::Rails.root.join('tmp') }
            end

            # Public: Convert the attachment to HTML
            #
            # The generated document is memoized, so the conversion only
            # runs once per instance.
            #
            # Returns a String
            def to_html
                @html ||= generate_html
            end

            # Public: Was the document conversion successful?
            #
            # The conversion counts as successful when the converted body
            # has some text or at least one image.
            #
            # Returns a Boolean
            def success?
                has_content? || contains_images?
            end

            private

            # Build the complete HTML document around the converted body.
            #
            # The title and wrapper id are HTML-escaped because they are
            # interpolated as text / attribute values; the body is already
            # markup and is inserted verbatim.
            #
            # Returns a String
            def generate_html
                [
                    "<!DOCTYPE html>",
                    "<html>",
                    "<head>",
                    "<title>#{ escape_html(title) }</title>",
                    "</head>",
                    "<body>",
                    "<div id=\"#{ escape_html(wrapper) }\">",
                    "<div id=\"view-html-content\">",
                    body,
                    "</div>",
                    "</div>",
                    "</body>",
                    "</html>"
                ].join
            end

            # Escape a value for safe inclusion in HTML text or a
            # double-quoted attribute.
            #
            # Returns a String
            def escape_html(value)
                ERB::Util.html_escape(value)
            end

            # The document title: the attachment's display filename
            def title
                @title ||= attachment.display_filename
            end

            # The markup placed inside the content div
            def body
                parsed_body
            end

            # Parse the output of the converted attachment so that we can pluck
            # the parts we need and insert in to our own sensible template
            #
            # Returns a Nokogiri::HTML::Document
            def parsed
                @parsed ||= Nokogiri::HTML.parse(convert)
            end

            # The inner HTML of the converted document's body element
            def parsed_body
                parsed.css('body').inner_html
            end

            # Does the body element have any content, excluding HTML tags?
            #
            # Returns a Boolean
            def has_content?
                !parsed.css('body').inner_text.empty?
            end

            # Does the body element contain at least one img element?
            #
            # Returns a Boolean
            def contains_images?
                parsed.css('body img').any?
            end

            # Run the attachment body through unrtf and return the
            # (sanitized) HTML it produced. The result is memoized.
            #
            # Returns a String
            def convert
                return @converted if @converted

                # Get the attachment body outside of the chdir call as getting
                # the body may require opening files too
                text = attachment_body

                # The tempfile is created relative to '.', so inside this
                # block it lives in tmpdir
                @converted = Dir.chdir(tmpdir) do
                    tempfile = create_tempfile(text)

                    begin
                        html = AlaveteliExternalCommand.run("unrtf", "--html",
                          tempfile.path, :timeout => 120
                        )
                    ensure
                        # Always remove the tempfile, even when the external
                        # command raises
                        cleanup_tempfile(tempfile)
                    end

                    sanitize_converted(html)
                end
            end

            # Works around http://savannah.gnu.org/bugs/?42015 in unrtf ~> 0.21
            #
            # Replaces the first unquoted DOCTYPE with a correctly quoted
            # one. The argument is not modified; a new String is returned.
            #
            # Returns a String
            def sanitize_converted(html)
                # NOTE(review): to_s tolerates a nil result, on the assumption
                # that the external command can yield nil when it fails or
                # times out - confirm against AlaveteliExternalCommand.run
                html.to_s.sub(INVALID_DOCTYPE, VALID_DOCTYPE)
            end

            # Write text to a new tempfile in the current directory.
            #
            # The file is flushed but left open so that its path can be
            # handed to the external command; callers must clean it up.
            #
            # Returns a Tempfile
            def create_tempfile(text)
                # Strings only know their encoding on Ruby >= 1.9
                tempfile = if text.respond_to?(:encoding)
                               Tempfile.new('foiextract', '.',
                                            :encoding => text.encoding)
                           else
                               Tempfile.new('foiextract', '.')
                           end
                tempfile.print(text)
                tempfile.flush
                tempfile
            end

            # Close the tempfile and remove it from disk
            def cleanup_tempfile(tempfile)
                tempfile.close!
            end

            # The raw attachment body, memoized
            def attachment_body
                @attachment_body ||= attachment.body
            end

        end
    end
end