1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
|
module MailHandler
module Backends
module TmailBackend
# Name of the mail library this backend is built on.
def backend
  'TMail'
end
# Parse a raw message string into a structured TMail::Mail object.
# Documentation at http://i.loveruby.net/en/projects/tmail/doc/
#
# data   - the raw message text.
# decode - when true (the default) base64_decode is called on the parsed
#          mail before it is returned.
def mail_from_raw_email(data, decode=true)
  # Work around a bug in TMail's MIME decoding by removing any whitespace
  # between "boundary=" and the opening quote of its value. Bug report:
  # http://rubyforge.org/tracker/index.php?func=detail&aid=21810&group_id=4512&atid=17370
  parsed = TMail::Mail.parse(data.gsub(/; boundary=\s+"/im,'; boundary="'))
  return parsed unless decode
  parsed.base64_decode
  parsed
end
# Build a multipart/mixed TMail::Mail whose parts are the attachments
# extracted from the given TNEF content. Each part carries the attachment's
# filename in its content-location header and the attachment's content as
# its body.
def mail_from_tnef(content)
  container = TMail::Mail.new
  container.set_content_type('multipart', 'mixed', { 'boundary' => TMail.new_boundary })
  tnef_attachments(content).each do |extracted|
    part = TMail::Mail.new
    part['content-location'] = extracted[:filename]
    part.body = extracted[:content]
    container.parts << part
  end
  container
end
# Look up the file name TMail reports for the mail part and hand back a
# duplicate of it, or nil when the part has no file name.
def get_part_file_name(mail_part)
  reported_name = TMail::Mail.get_part_file_name(mail_part)
  reported_name.nil? ? nil : reported_name.dup
end
# Return whatever the mail part reports as its body.
def get_part_body(mail_part)
  mail_part.body
end
# Return the spec of the first From address on the mail, or nil when the
# mail has no From addresses at all.
def get_from_address(mail)
  senders = mail.from_addrs
  return nil if senders.nil? || senders.size == 0
  senders[0].spec
end
# Return the sender's display name as given by the mail's
# from_name_if_present.
def get_from_name(mail)
  mail.from_name_if_present
end
# Combine the mail's to, cc and envelope_to lists (in that order) into one
# list with duplicates removed. A list that is nil counts as empty.
def get_all_addresses(mail)
  recipient_lists = [mail.to, mail.cc, mail.envelope_to].map do |addresses|
    addresses || []
  end
  recipient_lists.inject { |combined, addresses| combined + addresses }.uniq
end
# True when the mail has a Return-Path header whose address renders as the
# string '<>'; false when the header is absent or holds anything else.
def empty_return_path?(mail)
  return_path = mail['return-path']
  !return_path.nil? && return_path.addr.to_s == '<>'
end
# Return the body of the mail's Auto-Submitted header, or nil when the mail
# does not carry that header.
def get_auto_submitted(mail)
  header = mail['auto-submitted']
  return nil unless header
  header.body
end
# Return the content type reported by the given part.
def get_content_type(part)
  part.content_type
end
# Ask the mail for the string form of the named header.
# Note the argument order: header name first, then the mail.
def get_header_string(header, mail)
  mail.header_string(header)
end
# Number the attachments in depth first tree order, for use in URLs.
# XXX This fills in part.rfc822_attachment and part.url_part_number within
# all the parts of the email (see monkeypatches in
# lib/mail_handler/tmail_extensions and lib/mail_handler/mail_extensions for
# how these attributes are added). ensure_parts_counted must be called
# before using the attributes.
#
# The counter on the mail is reset to zero each time, so every call
# renumbers from scratch.
def ensure_parts_counted(mail)
  mail.count_parts_count = 0
  _count_parts_recursive(mail, mail)
  # Attachments uudecoded from within text parts carry on from this number.
  total_parts = mail.count_parts_count
  mail.count_first_uudecode_count = total_parts
end
# Walk the tree of parts below +part+ depth first and give each leaf the next
# sequential url_part_number, keeping the running total in
# mail.count_parts_count.
#
# part - the part being visited on this call.
# mail - the mail that owns the counter (the top-level mail when called from
#        ensure_parts_counted).
#
# A leaf that is an attached email (message/rfc822), an Outlook message
# (by content type or by file name) or a TNEF file is parsed into
# part.rfc822_attachment, and the parts of that parsed mail are numbered
# instead of the leaf itself. If the parsing raises, rfc822_attachment is set
# to nil and the leaf is numbered like any other part.
def _count_parts_recursive(part, mail)
  if part.multipart?
    part.parts.each do |p|
      _count_parts_recursive(p, mail)
    end
  else
    part_filename = get_part_file_name(part)
    begin
      if part.content_type == 'message/rfc822'
        # An email attached as text
        # e.g. http://www.whatdotheyknow.com/request/64/response/102
        # The second argument is the positional +decode+ flag; Ruby of this
        # vintage has no keyword arguments, so writing "decode=false" here
        # would only assign an unused local.
        part.rfc822_attachment = mail_from_raw_email(part.body, false)
      elsif part.content_type == 'application/vnd.ms-outlook' ||
            (part_filename && AlaveteliFileTypes.filename_to_mimetype(part_filename) == 'application/vnd.ms-outlook')
        # An email attached as an Outlook file
        # e.g. http://www.whatdotheyknow.com/request/chinese_names_for_british_politi
        msg = Mapi::Msg.open(StringIO.new(part.body))
        part.rfc822_attachment = mail_from_raw_email(msg.to_mime.to_s, false)
      elsif part.content_type == 'application/ms-tnef'
        # A set of attachments in a TNEF file
        part.rfc822_attachment = mail_from_tnef(part.body)
      end
    rescue
      # If attached mail doesn't parse, treat it as text part
      part.rfc822_attachment = nil
    else
      # Only reached when nothing above raised, so errors from numbering the
      # attached mail's own parts are not swallowed by the rescue.
      unless part.rfc822_attachment.nil?
        _count_parts_recursive(part.rfc822_attachment, mail)
      end
    end
    if part.rfc822_attachment.nil?
      mail.count_parts_count += 1
      part.url_part_number = mail.count_parts_count
    end
  end
end
# Build one hash of attributes per attachment leaf of the mail.
#
# Returns an Array of Hashes with the keys :url_part_number, :content_type,
# :filename, :charset, :within_rfc822_subject, :body and :hexdigest (the MD5
# hex digest of :body). :within_rfc822_subject is the subject of the attached
# email the leaf sits inside, or nil when the leaf is not inside one.
def get_attachment_attributes(mail)
  leaves = get_attachment_leaves(mail)
  # XXX we have to call ensure_parts_counted after get_attachment_leaves
  # which is really messy.
  ensure_parts_counted(mail)
  leaves.map do |leaf|
    body = get_part_body(leaf)
    # As leaf.body causes MIME decoding which uses lots of RAM, do garbage collection here
    # to prevent excess memory use. XXX not really sure if this helps reduce
    # peak RAM use overall. Anyway, maybe there is something better to do than this.
    GC.start
    # Must start out nil for every leaf: otherwise a leaf that is not inside
    # an attached email would inherit the subject of an earlier leaf that was.
    within_rfc822_subject = nil
    enclosing_mail = leaf.within_rfc822_attachment
    if enclosing_mail
      within_rfc822_subject = enclosing_mail.subject
      # Test to see if we are in the first part of the attached
      # RFC822 message and it is text, if so add headers.
      # XXX should probably use hunting algorithm to find main text part, rather than
      # just expect it to be first. This will do for now though.
      if enclosing_mail == leaf && leaf.content_type == 'text/plain'
        headers = ""
        [ 'Date', 'Subject', 'From', 'To', 'Cc' ].each do |header|
          if enclosing_mail.header.include?(header.downcase)
            header_value = enclosing_mail.header[header.downcase]
            if !header_value.blank?
              headers = headers + header + ": " + header_value.to_s + "\n"
            end
          end
        end
        # XXX call _convert_part_body_to_text here, but need to get charset somehow
        # e.g. http://www.whatdotheyknow.com/request/1593/response/3088/attach/4/Freedom%20of%20Information%20request%20-%20car%20oval%20sticker:%20Article%2020,%20Convention%20on%20Road%20Traffic%201949.txt
        body = headers + "\n" + body
        # This is quick way of getting all headers, but instead we only add some a) to
        # make it more usable, b) as at least one authority accidentally leaked security
        # information into a header.
        #attachment.body = leaf.within_rfc822_attachment.port.to_s
      end
    end
    {:url_part_number => leaf.url_part_number,
     :content_type => get_content_type(leaf),
     :filename => get_part_file_name(leaf),
     :charset => leaf.charset,
     :within_rfc822_subject => within_rfc822_subject,
     :body => body,
     :hexdigest => Digest::MD5.hexdigest(body) }
  end
end
# Return the flat list of leaf parts of the mail that are to be treated as
# attachments, starting the recursive search at the mail itself.
# (Where parts are alternatives only one is chosen, which risks losing info
# if the unchosen alternative is the only one to contain useful info, but
# let's worry about that another time)
def get_attachment_leaves(mail)
  _get_attachment_leaves_recursive(mail, mail)
end
# Collect, in order, the leaf parts at or below curr_mail that are to be
# treated as attachments.
#
# curr_mail                - the part examined on this call.
# parent_mail              - passed unchanged to every recursive call and used
#                            as the argument to ensure_parts_counted.
# within_rfc822_attachment - the parsed attached email the current part lives
#                            inside, or nil; recorded on each leaf returned.
#
# Returns an Array of parts. Raises a RuntimeError ("no parts on multipart
# mail") when a multipart part has no parts.
#
# Side effects: may overwrite content_type on the parts it visits, sets
# within_rfc822_attachment on each stored leaf, and calls
# ensure_parts_counted(parent_mail) for attached-email style parts.
def _get_attachment_leaves_recursive(curr_mail, parent_mail, within_rfc822_attachment = nil)
leaves_found = []
if curr_mail.multipart?
if curr_mail.parts.size == 0
raise "no parts on multipart mail"
end
if curr_mail.sub_type == 'alternative'
# Choose best part from alternatives
best_part = nil
# Take the last text/plain one, or else the first one
curr_mail.parts.each do |m|
if not best_part
best_part = m
elsif m.content_type == 'text/plain'
best_part = m
end
end
# Take an HTML one as even higher priority. (They tend
# to render better than text/plain, e.g. don't wrap links here:
# http://www.whatdotheyknow.com/request/amount_and_cost_of_freedom_of_in#incoming-72238 )
# If there are several text/html parts, the last one wins.
curr_mail.parts.each do |m|
if m.content_type == 'text/html'
best_part = m
end
end
# Only the chosen alternative is descended into; the others are dropped.
leaves_found += _get_attachment_leaves_recursive(best_part, parent_mail, within_rfc822_attachment)
else
# Add all parts
curr_mail.parts.each do |m|
leaves_found += _get_attachment_leaves_recursive(m, parent_mail, within_rfc822_attachment)
end
end
else
# XXX Yuck. this section alters various content_types. That puts
# it into conflict with ensure_parts_counted which it has to be
# called both before and after. It will fail with cases of
# attachments of attachments etc.
charset = curr_mail.charset # save this, because overwriting content_type also resets charset
# Don't allow nil content_types
if curr_mail.content_type.nil?
curr_mail.content_type = 'application/octet-stream'
end
# PDFs often come with this mime type, fix it up for view code
if curr_mail.content_type == 'application/octet-stream'
part_file_name = get_part_file_name(curr_mail)
part_body = get_part_body(curr_mail)
calc_mime = AlaveteliFileTypes.filename_and_content_to_mimetype(part_file_name, part_body)
# Keep application/octet-stream when no better type could be worked out.
if calc_mime
curr_mail.content_type = calc_mime
end
end
# Use standard content types for Word documents etc.
curr_mail.content_type = normalise_content_type(curr_mail.content_type)
if curr_mail.content_type == 'message/rfc822'
ensure_parts_counted(parent_mail) # fills in rfc822_attachment variable
if curr_mail.rfc822_attachment.nil?
# Attached mail didn't parse, so treat as text
curr_mail.content_type = 'text/plain'
end
end
if curr_mail.content_type == 'application/vnd.ms-outlook' || curr_mail.content_type == 'application/ms-tnef'
ensure_parts_counted(parent_mail) # fills in rfc822_attachment variable
if curr_mail.rfc822_attachment.nil?
# Attached mail didn't parse, so treat as binary
curr_mail.content_type = 'application/octet-stream'
end
end
# If the part is an attachment of email
# (the content type still names an attached-email format here only if the
# fallbacks above did not rewrite it, i.e. rfc822_attachment was not nil)
if curr_mail.content_type == 'message/rfc822' || curr_mail.content_type == 'application/vnd.ms-outlook' || curr_mail.content_type == 'application/ms-tnef'
# NOTE(review): ensure_parts_counted has already run for this part in one
# of the two checks above; this further call looks redundant - confirm
# before removing.
ensure_parts_counted(parent_mail) # fills in rfc822_attachment variable
# Descend into the parsed attached mail, which becomes the enclosing
# attachment for every leaf found inside it.
leaves_found += _get_attachment_leaves_recursive(curr_mail.rfc822_attachment, parent_mail, curr_mail.rfc822_attachment)
else
# Store leaf
curr_mail.within_rfc822_attachment = within_rfc822_attachment
leaves_found += [curr_mail]
end
# restore original charset
curr_mail.charset = charset
end
return leaves_found
end
# Build an address string from a display name and an email address.
#
# name  - the display name, or nil for an address without a name.
# email - the email address; checked with MySociety::Validate.is_valid_email.
#
# Returns the string form of the TMail::Address parsed from either the bare
# email (when name is nil) or '"name" <email>'.
# Raises a RuntimeError when the email is rejected by the validator.
def address_from_name_and_email(name, email)
  unless MySociety::Validate.is_valid_email(email)
    # Interpolate rather than concatenate, so that a nil email still produces
    # this error instead of a TypeError from String#+.
    raise "invalid email #{email} passed to address_from_name_and_email"
  end
  return TMail::Address.parse(email).to_s if name.nil?
  # Botch an always quoted RFC address, then parse it: each double quote and
  # backslash in the name is prefixed with a backslash.
  quoted_name = name.gsub(/(["\\])/, "\\\\\\1")
  TMail::Address.parse('"' + quoted_name + '" <' + email + '>').to_s
end
# Parse the string with TMail::Address and return the parsed object's
# address.
def address_from_string(string)
  TMail::Address.parse(string).address
end
end
end
end
|