aboutsummaryrefslogtreecommitdiffstats
path: root/app/models/foi_attachment.rb
blob: 768e8d1d37a2df9c1e91e6f43fb6e3e37d84d696 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
# encoding: UTF-8

# == Schema Information
# Schema version: 114
#
# Table name: foi_attachments
#
#  id                    :integer         not null, primary key
#  content_type          :text
#  filename              :text
#  charset               :text
#  display_size          :text
#  url_part_number       :integer
#  within_rfc822_subject :text
#  incoming_message_id   :integer
#  hexdigest             :string(32)
#

# models/foi_attachment.rb:
# An attachment to an email (IncomingMessage)
#
# Copyright (c) 2007 UK Citizens Online Democracy. All rights reserved.
# Email: francis@mysociety.org; WWW: http://www.mysociety.org/
# This is the type which is used to send data about attachments to the view

require 'digest'

class FoiAttachment < ActiveRecord::Base
    belongs_to :incoming_message
    validates_presence_of :content_type
    validates_presence_of :filename
    validates_presence_of :display_size

    before_validation :ensure_filename!, :only => [:filename]
    before_destroy :delete_cached_file!

    BODY_MAX_TRIES = 3
    BODY_MAX_DELAY = 5

    def directory
        base_dir = File.expand_path(File.join(File.dirname(__FILE__), "../../cache", "attachments_#{Rails.env}"))
        return File.join(base_dir, self.hexdigest[0..2])
    end

    def filepath
        File.join(self.directory, self.hexdigest)
    end

    def delete_cached_file!
        begin
            @cached_body = nil
            File.delete(self.filepath)
        rescue
        end
    end

    def body=(d)
        self.hexdigest = Digest::MD5.hexdigest(d)
        if !File.exists?(self.directory)
            FileUtils.mkdir_p self.directory
        end
        File.open(self.filepath, "wb") { |file|
            file.write d
        }
        update_display_size!
        encode_cached_body!
        @cached_body = d
    end

    # If the original mail part had a charset, it's some kind of string, so assume that
    # it should be handled as a string in the stated charset, not a bytearray, and then
    # convert it our default encoding. For ruby 1.8 this is a noop.
    def encode_cached_body!
        if RUBY_VERSION.to_f >= 1.9
            if charset
                @cached_body.force_encoding(charset)
                @cached_body = @cached_body.encode(Encoding.default_internal, charset)
            end
        end
    end

    def body
        if @cached_body.nil?
            tries = 0
            delay = 1
            begin
                @cached_body = File.open(self.filepath, "rb" ).read
            rescue Errno::ENOENT
                # we've lost our cached attachments for some reason.  Reparse them.
                if tries > BODY_MAX_TRIES
                    raise
                else
                    sleep delay
                end
                tries += 1
                delay *= 2
                delay = BODY_MAX_DELAY if delay > BODY_MAX_DELAY
                force = true
                self.incoming_message.parse_raw_email!(force)
                retry
            end
            encode_cached_body!
        end
        return @cached_body
    end

    # List of DSN codes taken from RFC 3463
    # http://tools.ietf.org/html/rfc3463
    DsnToMessage = {
         'X.1.0' => 'Other address status',
         'X.1.1' => 'Bad destination mailbox address',
         'X.1.2' => 'Bad destination system address',
         'X.1.3' => 'Bad destination mailbox address syntax',
         'X.1.4' => 'Destination mailbox address ambiguous',
         'X.1.5' => 'Destination mailbox address valid',
         'X.1.6' => 'Mailbox has moved',
         'X.1.7' => 'Bad sender\'s mailbox address syntax',
         'X.1.8' => 'Bad sender\'s system address',
         'X.2.0' => 'Other or undefined mailbox status',
         'X.2.1' => 'Mailbox disabled, not accepting messages',
         'X.2.2' => 'Mailbox full',
         'X.2.3' => 'Message length exceeds administrative limit.',
         'X.2.4' => 'Mailing list expansion problem',
         'X.3.0' => 'Other or undefined mail system status',
         'X.3.1' => 'Mail system full',
         'X.3.2' => 'System not accepting network messages',
         'X.3.3' => 'System not capable of selected features',
         'X.3.4' => 'Message too big for system',
         'X.4.0' => 'Other or undefined network or routing status',
         'X.4.1' => 'No answer from host',
         'X.4.2' => 'Bad connection',
         'X.4.3' => 'Routing server failure',
         'X.4.4' => 'Unable to route',
         'X.4.5' => 'Network congestion',
         'X.4.6' => 'Routing loop detected',
         'X.4.7' => 'Delivery time expired',
         'X.5.0' => 'Other or undefined protocol status',
         'X.5.1' => 'Invalid command',
         'X.5.2' => 'Syntax error',
         'X.5.3' => 'Too many recipients',
         'X.5.4' => 'Invalid command arguments',
         'X.5.5' => 'Wrong protocol version',
         'X.6.0' => 'Other or undefined media error',
         'X.6.1' => 'Media not supported',
         'X.6.2' => 'Conversion required and prohibited',
         'X.6.3' => 'Conversion required but not supported',
         'X.6.4' => 'Conversion with loss performed',
         'X.6.5' => 'Conversion failed',
         'X.7.0' => 'Other or undefined security status',
         'X.7.1' => 'Delivery not authorized, message refused',
         'X.7.2' => 'Mailing list expansion prohibited',
         'X.7.3' => 'Security conversion required but not possible',
         'X.7.4' => 'Security features not supported',
         'X.7.5' => 'Cryptographic failure',
         'X.7.6' => 'Cryptographic algorithm not supported',
         'X.7.7' => 'Message integrity failure'
     }

    # Returns HTML, of extra comment to put by attachment
    def extra_note
        # For delivery status notification attachments, extract the status and
        # look up what it means in the DSN table.
        if @content_type == 'message/delivery-status'
            if !@body.match(/Status:\s+([0-9]+\.([0-9]+\.[0-9]+))\s+/)
                return ""
            end
            dsn = $1
            dsn_part = 'X.' + $2

            dsn_message = ""
            if DsnToMessage.include?(dsn_part)
                dsn_message = " (" + DsnToMessage[dsn_part] + ")"
            end

            return "<br><em>DSN: " + dsn + dsn_message + "</em>"
        end
        return ""
    end

    # Called by controller so old filenames still work
    def old_display_filename
        filename = self.filename

        # Convert weird spaces (e.g. \n) to normal ones
        filename = filename.gsub(/\s/, " ")
        # Remove slashes, they mess with URLs
        filename = filename.gsub(/\//, "-")

        return filename
    end

    # XXX changing this will break existing URLs, so have a care - maybe
    # make another old_display_filename see above
    def display_filename
        filename = self.filename
        if !self.incoming_message.nil?
            self.incoming_message.info_request.apply_censor_rules_to_text!(filename)
        end
        # Sometimes filenames have e.g. %20 in - no point butchering that
        # (without unescaping it, this would remove the % and leave 20s in there)
        filename = CGI.unescape(filename)

        # Remove weird spaces
        filename = filename.gsub(/\s+/, " ")
        # Remove non-alphabetic characters
        filename = filename.gsub(/[^A-Za-z0-9.]/, " ")
        # Remove spaces near dots
        filename = filename.gsub(/\s*\.\s*/, ".")
        # Compress adjacent spaces down to a single one
        filename = filename.gsub(/\s+/, " ")
        filename = filename.strip

        return filename
    end


    def ensure_filename!
        if self.filename.blank?
            calc_ext = AlaveteliFileTypes.mimetype_to_extension(self.content_type)
            if !calc_ext
                calc_ext = "bin"
            end
            if !self.within_rfc822_subject.nil?
                computed = self.within_rfc822_subject + "." + calc_ext
            else
                computed = "attachment." + calc_ext
            end
            self.filename = computed
        end
    end

    def filename=(filename)
        calc_ext = AlaveteliFileTypes.mimetype_to_extension(self.content_type)
        # Put right extension on if missing
        if !filename.nil? && !filename.match(/\.#{calc_ext}$/) && calc_ext
            computed = filename + "." + calc_ext
        else
            computed = filename
        end
        write_attribute('filename', computed)
    end

    # Size to show next to the download link for the attachment
    def update_display_size!
        s = self.body.size

        if s > 1024 * 1024
            self.display_size = sprintf("%.1f", s.to_f / 1024 / 1024) + 'M'
        else
            self.display_size = (s / 1024).to_s + 'K'
        end
    end

    # Whether this type can be shown in the Google Docs Viewer.
    # The full list of supported types can be found at
    #   https://docs.google.com/support/bin/answer.py?hl=en&answer=1189935
    def has_google_docs_viewer?
        return !! {
            "application/pdf" => true, # .pdf
            "image/tiff" => true, # .tiff

            "application/vnd.ms-word" => true, # .doc
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document" => true, # .docx

            "application/vnd.ms-powerpoint" => true, # .ppt
            "application/vnd.openxmlformats-officedocument.presentationml.presentation" => true, # .pptx

            "application/vnd.ms-excel" => true, # .xls
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" => true, # .xlsx

        } [self.content_type]
    end

    # Whether this type has a "View as HTML"
    def has_body_as_html?
        return (
            !!{
                "text/plain" => true,
                "application/rtf" => true,
            }[self.content_type] or
            self.has_google_docs_viewer?
        )
    end

    # Name of type of attachment type - only valid for things that has_body_as_html?
    def name_of_content_type
        return {
            "text/plain" => "Text file",
            'application/rtf' => "RTF file",

            'application/pdf' => "PDF file",
            'image/tiff' => "TIFF image",

            'application/vnd.ms-word' => "Word document",
            'application/vnd.openxmlformats-officedocument.wordprocessingml.document' => "Word document",

            'application/vnd.ms-powerpoint' => "PowerPoint presentation",
            'application/vnd.openxmlformats-officedocument.presentationml.presentation' => "PowerPoint presentation",

            'application/vnd.ms-excel' => "Excel spreadsheet",
            'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet' => "Excel spreadsheet",
        }[self.content_type]
    end

    # For "View as HTML" of attachment
    def body_as_html(dir)
        html = nil
        wrapper_id = "wrapper"

        # simple cases, can never fail
        if self.content_type == 'text/plain'
            text = self.body.strip
            text = CGI.escapeHTML(text)
            text = MySociety::Format.make_clickable(text)
            html = text.gsub(/\n/, '<br>')
            return '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
   "http://www.w3.org/TR/html4/loose.dtd"><html><head><title></title></head><body>' + html + "</body></html>", wrapper_id
        end

        # the extractions will also produce image files, which go in the
        # current directory, so change to the directory the function caller
        # wants everything in

        html = nil
        if ['application/pdf', 'application/rtf'].include?(self.content_type)
            text = self.body
            Dir.chdir(dir) do
                if RUBY_VERSION.to_f >= 1.9
                    tempfile = Tempfile.new('foiextract', '.',  :encoding => text.encoding)
                else
                    tempfile = Tempfile.new('foiextract', '.')
                end
                tempfile.print text
                tempfile.flush


                if self.content_type == 'application/pdf'
                    # We set a timeout here, because pdftohtml can spiral out of control
                    # on some PDF files and we don't want to crash the whole server.
                    html = AlaveteliExternalCommand.run("pdftohtml", "-nodrm", "-zoom", "1.0", "-stdout", "-enc", "UTF-8", "-noframes", tempfile.path, :timeout => 30)
                elsif self.content_type == 'application/rtf'
                    html = AlaveteliExternalCommand.run("unrtf", "--html", tempfile.path, :timeout => 120)
                end

                tempfile.close
                tempfile.delete
            end
        end
        if html.nil?
            if self.has_google_docs_viewer?
                html = '' # force error and using Google docs viewer
            else
                raise "No HTML conversion available for type " + self.content_type
            end
        end



        # We need to look at:
        # a) Any error code
        # b) The output size, as pdftohtml does not return an error code upon error.
        # c) For cases when there is no text in the body of the HTML, or
        # images, so nothing will be rendered. This is to detect some bug in
        # pdftohtml, which sometimes makes it return just <hr>s and no other
        # content.
        html.match(/(\<body[^>]*\>.*)/mi)
        body = $1.to_s
        body_without_tags = body.gsub(/\s+/,"").gsub(/\<[^\>]*\>/, "")
        contains_images = html.match(/<img/mi) ? true : false
        if html.size == 0 || !$?.success? || (body_without_tags.size == 0 && !contains_images)
            ret = "<html><head></head><body>";
            if self.has_google_docs_viewer?
                wrapper_id = "wrapper_google_embed"
                ret = ret + "<iframe src='http://docs.google.com/viewer?url=<attachment-url-here>&embedded=true' width='100%' height='100%' style='border: none;'></iframe>";
            else
                ret = ret + "<p>Sorry, we were unable to convert this file to HTML. Please use the download link at the top right.</p>"
            end
            ret = ret + "</body></html>"
            return ret, wrapper_id
        end

        return html, wrapper_id
    end

end