aboutsummaryrefslogtreecommitdiffstats
path: root/lib/mail_handler/backends/tmail_backend.rb
blob: 02124cdb1d8ac8e4b484736b6b7dc68844035af2 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
module MailHandler
    module Backends
        module TmailBackend

            # Identify which mail library this backend is built on.
            #
            # Returns the String 'TMail'.
            def backend
                'TMail'
            end

            # Parse a raw email string into a structured TMail::Mail object.
            # TMail documentation: http://i.loveruby.net/en/projects/tmail/doc/
            #
            # data   - the raw email text; it is not modified
            # decode - when true (the default) base64_decode is called on the
            #          parsed mail before it is returned
            #
            # Returns the parsed TMail::Mail.
            def mail_from_raw_email(data, decode=true)
                # Work around a bug in TMail's MIME decoding by removing any
                # whitespace between "boundary=" and the opening quote in a copy
                # of the data. Report of the TMail bug:
                # http://rubyforge.org/tracker/index.php?func=detail&aid=21810&group_id=4512&atid=17370
                boundary_with_gap = /; boundary=\s+"/im
                parsed = TMail::Mail.parse(data.gsub(boundary_with_gap, '; boundary="'))
                if decode
                    parsed.base64_decode
                end
                parsed
            end

            # Build a multipart/mixed TMail::Mail whose parts are the attachments
            # that tnef_attachments extracts from the given TNEF data.
            #
            # content - the TNEF file contents handed to tnef_attachments
            #
            # Each extracted attachment becomes one part: its :filename goes into
            # the part's content-location header and its :content becomes the body.
            #
            # Returns the containing TMail::Mail.
            def mail_from_tnef(content)
                container = TMail::Mail.new
                container.set_content_type('multipart', 'mixed', { 'boundary' => TMail.new_boundary })
                tnef_attachments(content).each do |extracted|
                    wrapped = TMail::Mail.new
                    wrapped['content-location'] = extracted[:filename]
                    wrapped.body = extracted[:content]
                    container.parts << wrapped
                end
                container
            end

            # Look up the file name TMail reports for a mail part and hand back a
            # duplicate of it, or nil when TMail reports no file name.
            def get_part_file_name(mail_part)
                reported_name = TMail::Mail.get_part_file_name(mail_part)
                reported_name.nil? ? nil : reported_name.dup
            end

            # Fetch the body of a single mail part.
            #
            # mail_part - the part to read; only its #body method is used
            #
            # Returns whatever mail_part.body returns.
            def get_part_body(mail_part)
                mail_part.body
            end

            # Return the address spec of the mail's first From address, or nil
            # when the mail has no From addresses at all.
            def get_from_address(mail)
                senders = mail.from_addrs
                return nil if senders.nil? || senders.empty?
                senders.first.spec
            end

            # Return the sender's name for the mail, as reported by the mail's
            # from_name_if_present method.
            def get_from_name(mail)
                mail.from_name_if_present
            end

            # Collect every address from the mail's To, Cc and envelope-to lists
            # into one array without duplicates. A missing list counts as empty.
            def get_all_addresses(mail)
                to_list = mail.to || []
                cc_list = mail.cc || []
                envelope_list = mail.envelope_to || []
                (to_list + cc_list + envelope_list).uniq
            end

            # Tell whether the mail carries a Return-Path header whose address
            # renders as '<>'. A mail without the header yields false.
            def empty_return_path?(mail)
                return_path = mail['return-path']
                return false if return_path.nil?
                return_path.addr.to_s == '<>'
            end

            # Return the body of the mail's Auto-Submitted header, or nil when the
            # mail has no such header.
            def get_auto_submitted(mail)
                header = mail['auto-submitted']
                header.body if header
            end

            # Return the content type of a mail part, exactly as the part's
            # content_type method reports it.
            def get_content_type(part)
                part.content_type
            end

            # Look up a header on the mail by name.
            #
            # header - the header name, passed straight to mail.header_string
            # mail   - the mail to query
            #
            # Returns whatever mail.header_string returns for that name.
            def get_header_string(header, mail)
                mail.header_string(header)
            end

            # Number the attachments in depth first tree order, for use in URLs.
            #
            # Resets mail.count_parts_count to 0, walks the mail with
            # _count_parts_recursive and then records the resulting total in
            # mail.count_first_uudecode_count.
            #
            # XXX The walk fills in part.rfc822_attachment and part.url_part_number
            # on the parts of the email (see the monkeypatches in
            # lib/mail_handler/tmail_extensions and lib/mail_handler/mail_extensions
            # for how these attributes are added), so this must be called before
            # either attribute is used.
            def ensure_parts_counted(mail)
                mail.count_parts_count = 0
                _count_parts_recursive(mail, mail)
                # Attachments uudecoded from within text parts carry on using
                # these numeric ids, so remember where the numbering stopped.
                parts_total = mail.count_parts_count
                mail.count_first_uudecode_count = parts_total
            end
            # Walk one part of a mail depth first, numbering its leaf parts.
            #
            # part - the part currently being visited
            # mail - the top-level mail, whose count_parts_count is the running counter
            #
            # A multipart part is only a container: each child is visited in order.
            # For any other part this tries to expand an attached email (a
            # message/rfc822 part, an Outlook file or a TNEF file) into
            # part.rfc822_attachment and, when that succeeds, numbers the parts
            # of the attached email instead of the part itself. A part left with
            # no rfc822_attachment gets the next number in part.url_part_number.
            def _count_parts_recursive(part, mail)
                if part.multipart?
                    return part.parts.each { |child| _count_parts_recursive(child, mail) }
                end

                filename = get_part_file_name(part)
                begin
                    if part.content_type == 'message/rfc822'
                        # An email attached as text
                        # e.g. http://www.whatdotheyknow.com/request/64/response/102
                        part.rfc822_attachment = mail_from_raw_email(part.body, false)
                    elsif part.content_type == 'application/vnd.ms-outlook' ||
                          (filename && AlaveteliFileTypes.filename_to_mimetype(filename) == 'application/vnd.ms-outlook')
                        # An email attached as an Outlook file
                        # e.g. http://www.whatdotheyknow.com/request/chinese_names_for_british_politi
                        outlook_msg = Mapi::Msg.open(StringIO.new(part.body))
                        part.rfc822_attachment = mail_from_raw_email(outlook_msg.to_mime.to_s, false)
                    elsif part.content_type == 'application/ms-tnef'
                        # A set of attachments in a TNEF file
                        part.rfc822_attachment = mail_from_tnef(part.body)
                    end
                rescue
                    # If attached mail doesn't parse, treat it as text part
                    part.rfc822_attachment = nil
                else
                    # Only reached when nothing above raised; errors from the
                    # nested walk itself are deliberately not rescued here.
                    nested_mail = part.rfc822_attachment
                    _count_parts_recursive(nested_mail, mail) unless nested_mail.nil?
                end

                return unless part.rfc822_attachment.nil?
                mail.count_parts_count += 1
                part.url_part_number = mail.count_parts_count
            end

            # Build one attribute hash per attachment leaf of the mail.
            #
            # mail - the parsed mail whose attachments are wanted
            #
            # Returns an Array of Hashes, one per leaf from get_attachment_leaves,
            # with the keys :url_part_number, :content_type, :filename, :charset,
            # :within_rfc822_subject, :body and :hexdigest (MD5 hex digest of :body).
            #
            # :within_rfc822_subject is the subject of the attached email the leaf
            # sits inside, and nil for a leaf that is not inside an attached email.
            def get_attachment_attributes(mail)
                leaves = get_attachment_leaves(mail)
                # XXX we have to call ensure_parts_counted after get_attachment_leaves
                # which is really messy.
                ensure_parts_counted(mail)
                leaves.map do |leaf|
                    body = get_part_body(leaf)
                    # As leaf.body causes MIME decoding which uses lots of RAM, do garbage collection here
                    # to prevent excess memory use. XXX not really sure if this helps reduce
                    # peak RAM use overall. Anyway, maybe there is something better to do than this.
                    GC.start

                    # Start every leaf from nil, so that a subject picked up for an
                    # earlier leaf is never reported for a later leaf that is not
                    # inside an attached email.
                    within_rfc822_subject = nil
                    attached_mail = leaf.within_rfc822_attachment
                    if attached_mail
                        within_rfc822_subject = attached_mail.subject
                        # Test to see if we are in the first part of the attached
                        # RFC822 message and it is text, if so add headers.
                        # XXX should probably use hunting algorithm to find main text part, rather than
                        # just expect it to be first. This will do for now though.
                        if attached_mail == leaf && leaf.content_type == 'text/plain'
                            # Only a handful of headers are copied rather than all of
                            # them: a) to make it more usable, b) as at least one
                            # authority accidentally leaked security information
                            # into a header.
                            headers = ""
                            [ 'Date', 'Subject', 'From', 'To', 'Cc' ].each do |header|
                                next unless attached_mail.header.include?(header.downcase)
                                header_value = attached_mail.header[header.downcase]
                                next if header_value.blank?
                                headers += "#{header}: #{header_value}\n"
                            end
                            # XXX call _convert_part_body_to_text here, but need to get charset somehow
                            # e.g. http://www.whatdotheyknow.com/request/1593/response/3088/attach/4/Freedom%20of%20Information%20request%20-%20car%20oval%20sticker:%20Article%2020,%20Convention%20on%20Road%20Traffic%201949.txt
                            body = headers + "\n" + body
                        end
                    end

                    { :url_part_number => leaf.url_part_number,
                      :content_type => get_content_type(leaf),
                      :filename => get_part_file_name(leaf),
                      :charset => leaf.charset,
                      :within_rfc822_subject => within_rfc822_subject,
                      :body => body,
                      :hexdigest => Digest::MD5.hexdigest(body) }
                end
            end

            # Collect the attachment leaves of a mail, starting the recursive
            # walk at the mail itself (the mail is both the current part and the
            # parent mail).
            #
            # (Where the walk picks one of several alternatives, this risks losing
            # info if the unchosen alternative is the only one to contain useful
            # info, but let's worry about that another time)
            def get_attachment_leaves(mail)
                _get_attachment_leaves_recursive(mail, mail)
            end
            # Recursively gather the leaf parts of a mail that should be presented
            # as attachments.
            #
            # curr_mail                - the part being examined
            # parent_mail              - the top-level mail; passed to
            #                            ensure_parts_counted whenever a part's
            #                            rfc822_attachment is needed
            # within_rfc822_attachment - the attached email the current part lies
            #                            inside, or nil; stored on each leaf found
            #
            # Returns an Array of leaf parts.
            # Raises RuntimeError ("no parts on multipart mail") for a multipart
            # part that has no parts.
            #
            # Side effects: a non-multipart part may have its content_type
            # rewritten, and every returned leaf has within_rfc822_attachment set.
            def _get_attachment_leaves_recursive(curr_mail, parent_mail, within_rfc822_attachment = nil)
                leaves_found = []
                if curr_mail.multipart?
                    if curr_mail.parts.size == 0
                        raise "no parts on multipart mail"
                    end

                    if curr_mail.sub_type == 'alternative'
                        # Choose best part from alternatives
                        best_part = nil
                        # Take the last text/plain one, or else the first one
                        # (the first part is the starting choice; any later
                        # text/plain part replaces it)
                        curr_mail.parts.each do |m|
                            if not best_part
                                best_part = m
                            elsif m.content_type == 'text/plain'
                                best_part = m
                            end
                        end
                        # Take an HTML one as even higher priority. (They tend
                        # to render better than text/plain, e.g. don't wrap links here:
                        # http://www.whatdotheyknow.com/request/amount_and_cost_of_freedom_of_in#incoming-72238 )
                        # With several text/html parts the last one wins.
                        curr_mail.parts.each do |m|
                            if m.content_type == 'text/html'
                                best_part = m
                            end
                        end
                        # Only the chosen alternative is descended into
                        leaves_found += _get_attachment_leaves_recursive(best_part, parent_mail, within_rfc822_attachment)
                    else
                        # Add all parts
                        curr_mail.parts.each do |m|
                            leaves_found += _get_attachment_leaves_recursive(m, parent_mail, within_rfc822_attachment)
                        end
                    end
                else
                    # XXX Yuck. this section alters various content_types. That puts
                    # it into conflict with ensure_parts_counted which it has to be
                    # called both before and after. It will fail with cases of
                    # attachments of attachments etc.
                    charset = curr_mail.charset # save this, because overwriting content_type also resets charset
                    # Don't allow nil content_types
                    if curr_mail.content_type.nil?
                        curr_mail.content_type = 'application/octet-stream'
                    end
                    # PDFs often come with this mime type, fix it up for view code
                    if curr_mail.content_type == 'application/octet-stream'
                        part_file_name = get_part_file_name(curr_mail)
                        part_body = get_part_body(curr_mail)
                        # Ask AlaveteliFileTypes for a type based on the file name
                        # and the body; keep octet-stream when it has no answer
                        calc_mime = AlaveteliFileTypes.filename_and_content_to_mimetype(part_file_name, part_body)
                        if calc_mime
                            curr_mail.content_type = calc_mime
                        end
                    end

                    # Use standard content types for Word documents etc.
                    curr_mail.content_type = normalise_content_type(curr_mail.content_type)
                    if curr_mail.content_type == 'message/rfc822'
                        ensure_parts_counted(parent_mail) # fills in rfc822_attachment variable
                        if curr_mail.rfc822_attachment.nil?
                            # Attached mail didn't parse, so treat as text
                            curr_mail.content_type = 'text/plain'
                        end
                    end
                    if curr_mail.content_type == 'application/vnd.ms-outlook' || curr_mail.content_type == 'application/ms-tnef'
                        ensure_parts_counted(parent_mail) # fills in rfc822_attachment variable
                        if curr_mail.rfc822_attachment.nil?
                            # Attached mail didn't parse, so treat as binary
                            curr_mail.content_type = 'application/octet-stream'
                        end
                    end
                    # If the part is an attachment of email
                    # (i.e. it still has one of the three container types after
                    # the downgrades above)
                    if curr_mail.content_type == 'message/rfc822' || curr_mail.content_type == 'application/vnd.ms-outlook' || curr_mail.content_type == 'application/ms-tnef'
                        ensure_parts_counted(parent_mail) # fills in rfc822_attachment variable
                        # Descend into the attached email; from here on it is
                        # the within_rfc822_attachment of every leaf found
                        leaves_found += _get_attachment_leaves_recursive(curr_mail.rfc822_attachment, parent_mail, curr_mail.rfc822_attachment)
                    else
                        # Store leaf
                        curr_mail.within_rfc822_attachment = within_rfc822_attachment
                        leaves_found += [curr_mail]
                    end
                    # restore original charset
                    curr_mail.charset = charset
                end
                return leaves_found
            end


            # Format a display name and an email address as one address string.
            #
            # name  - the display name, or nil to format the bare email address
            # email - the email address; must pass MySociety::Validate.is_valid_email
            #
            # Returns the String form of the TMail::Address parsed from the input.
            # Raises RuntimeError when the validator rejects the email, including
            # when the email is not a String at all (e.g. nil).
            def address_from_name_and_email(name, email)
                unless MySociety::Validate.is_valid_email(email)
                    # Interpolate rather than concatenate, so that a rejected
                    # non-String email still produces this error and not a
                    # TypeError from String#+.
                    raise "invalid email #{email} passed to address_from_name_and_email"
                end
                return TMail::Address.parse(email).to_s if name.nil?

                # Botch an always quoted RFC address, then parse it: every double
                # quote and backslash in the name is backslash-escaped before the
                # name is wrapped in double quotes.
                quoted_name = name.gsub(/(["\\])/, "\\\\\\1")
                TMail::Address.parse('"' + quoted_name + '" <' + email + '>').to_s
            end

            # Parse an address string with TMail::Address and return the parsed
            # object's #address value.
            def address_from_string(string)
                TMail::Address.parse(string).address
            end

        end
    end
end