1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
|
# Handles the parsing of email
require 'tmpdir'
module MailHandler
require 'mail'
require 'backends/mail_extensions'
require 'backends/mail_backend'
include Backends::MailBackend
# Raised by tnef_attachments when the tnef tool exits with a non-zero
# status or produces no attachment files.
class TNEFParsingError < StandardError; end
# Returns a set of attachments from the given TNEF contents.
# The TNEF contents also contains the message body, but in general this is the
# same as the message body in the message proper.
#
# content - the raw TNEF data, written to the stdin of the `tnef` tool.
#
# Returns an Array of Hashes with :content (the file's bytes, read in binary
# mode) and :filename (the entry name inside the extraction directory),
# ordered by filename.
#
# Raises IOError if tnef was terminated by a signal, and TNEFParsingError if
# tnef exits with a non-zero status or extracts no files.
def tnef_attachments(content)
  attachments = []
  # The temporary directory is removed when the block exits.
  Dir.mktmpdir do |dir|
    IO.popen("tnef -K -C #{dir} 2> /dev/null", "wb") do |f|
      f.write(content)
      # Closing a popen'd pipe sets $?, which is inspected below.
      f.close
      if $?.signaled?
        raise IOError, "tnef exited with signal #{$?.termsig}"
      end
      if $?.exited? && $?.exitstatus != 0
        raise TNEFParsingError, "tnef exited with status #{$?.exitstatus}"
      end
    end
    # sort for deterministic behaviour
    Dir.entries(dir).sort.each do |file|
      next if file == "." || file == ".."
      # Block form so the file handle is closed as soon as it has been read.
      file_content = File.open(File.join(dir, file), "rb") { |io| io.read }
      attachments << { :content => file_content,
                       :filename => file }
    end
    if attachments.empty?
      raise TNEFParsingError, "tnef produced no attachments"
    end
  end
  attachments
end
# Maps alternative spellings of some MIME types onto the single name used
# for each by the rest of this module. Any value not listed here (including
# nil) is returned unchanged.
def normalise_content_type(content_type)
  case content_type
  when 'application/excel', 'application/msexcel', 'application/x-ms-excel'
    # e.g. http://www.whatdotheyknow.com/request/93/response/250
    'application/vnd.ms-excel'
  when 'application/mspowerpoint', 'application/x-ms-powerpoint'
    'application/vnd.ms-powerpoint'
  when 'application/msword', 'application/x-ms-word'
    'application/vnd.ms-word'
  when 'application/x-zip-compressed'
    'application/zip'
  when 'application/acrobat', 'document/pdf'
    # e.g. http://www.whatdotheyknow.com/request/copy_of_current_swessex_scr_opt#incoming-9928
    'application/pdf'
  else
    content_type
  end
end
# Extracts plain text from the body of a single attachment.
#
# content_type - MIME type string selecting the extraction strategy. The
#                branches below match the canonical names returned by
#                normalise_content_type (e.g. 'application/vnd.ms-word').
# body         - the attachment's raw content.
# charset      - character set the HTML extractor is told to assume
#                (default 'utf-8'); only used for 'text/html'.
#
# Returns the extracted text as a String; content types with no branch
# below yield an empty string.
def get_attachment_text_one_file(content_type, body, charset = 'utf-8')
  # note re. charset: TMail always tries to convert email bodies
  # to UTF8 by default, so normally it should already be that.
  text = ''
  # TODO: - tell all these command line tools to return utf-8
  if content_type == 'text/plain'
    text += body + "\n\n"
  else
    tempfile = Tempfile.new('foiextract')
    begin
      tempfile.binmode
      tempfile.print body
      tempfile.flush
      # NOTE(review): :append_to presumably makes the external command's
      # output get appended in place to `text` - confirm against
      # AlaveteliExternalCommand.run.
      default_params = { :append_to => text,
                         :binary_output => false,
                         :timeout => 1200 }
      case content_type
      when 'application/vnd.ms-word'
        converted_path = tempfile.path + ".txt"
        AlaveteliExternalCommand.run("wvText", tempfile.path, converted_path,
                                     { :memory_limit => 536870912, :timeout => 120 } )
        # Try catdoc if we get into trouble (e.g. for InfoRequestEvent 2701)
        # File.exist? is used because File.exists? was removed in Ruby 3.2.
        if File.exist?(converted_path)
          text += File.read(converted_path) + "\n\n"
          File.unlink(converted_path)
        else
          AlaveteliExternalCommand.run("catdoc", tempfile.path, default_params)
        end
      when 'application/rtf'
        # catdoc on RTF produces fewer comments and less extra bumf than the
        # --text option to unrtf
        AlaveteliExternalCommand.run("catdoc", tempfile.path, default_params)
      when 'text/html'
        # lynx wordwraps links in its output, which then don't
        # get formatted properly by Alaveteli. We use elinks
        # instead, which doesn't do that.
        # NOTE(review): charset is interpolated into an elinks -eval
        # expression unescaped - confirm callers only pass a validated value.
        AlaveteliExternalCommand.run("elinks", "-eval", "set document.codepage.assume = \"#{charset}\"",
                                     "-eval", "set document.codepage.force_assumed = 1",
                                     "-dump-charset", "utf-8",
                                     "-force-html", "-dump",
                                     tempfile.path,
                                     default_params.merge(:env => {"LANG" => "C"}))
      when 'application/vnd.ms-excel'
        # Bit crazy using /usr/bin/strings - but xls2csv, xlhtml and
        # py_xls2txt only extract text from cells, not from floating
        # notes. catdoc may be fooled by weird character sets, but will
        # probably do for UK FOI requests.
        AlaveteliExternalCommand.run("/usr/bin/strings", tempfile.path, default_params)
      when 'application/vnd.ms-powerpoint'
        # ppthtml seems to catch more text, but only outputs HTML when
        # we want text, so just use catppt for now
        AlaveteliExternalCommand.run("catppt", tempfile.path, default_params)
      when 'application/pdf'
        AlaveteliExternalCommand.run("pdftotext", tempfile.path, "-", default_params)
      when 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
        # This is Microsoft's XML office document format.
        # Just pull out the main XML file, and extract its text nodes.
        xml = AlaveteliExternalCommand.run("/usr/bin/unzip", "-qq",
                                           "-c",
                                           tempfile.path,
                                           "word/document.xml",
                                           {:binary_output => false})
        unless xml.nil?
          doc = REXML::Document.new(xml)
          # The empty block is deliberate: only the value returned by
          # each_element is used, joined with spaces.
          text += doc.each_element( './/text()' ){}.join(" ")
        end
      when 'application/zip'
        # recurse into zip files
        begin
          zip_file = Zip::ZipFile.open(tempfile.path)
          text += get_attachment_text_from_zip_file(zip_file)
          zip_file.close()
        rescue
          # Best effort: report the problem and carry on with whatever
          # text has been gathered so far.
          $stderr.puts("Error processing zip file: #{$!.inspect}")
        end
      end
    ensure
      # Close and delete the temporary file even if an extractor raised.
      tempfile.close!
    end
  end
  text
end
# Concatenates the text extracted from every regular file in an opened zip
# archive. Each entry's MIME type is looked up from its filename, falling
# back to 'application/octet-stream', and the entry is handed to
# get_attachment_text_one_file. Entries that cannot be read are skipped.
def get_attachment_text_from_zip_file(zip_file)
  combined = ""
  zip_file.each do |entry|
    next unless entry.file?

    entry_name = entry.to_s
    begin
      contents = entry.get_input_stream.read
    rescue
      # move to next attachment silently if there were problems
      # TODO: really should reduce this to specific exceptions?
      # e.g. password protected
      next
    end
    mime_type = AlaveteliFileTypes.filename_to_mimetype(entry_name) || 'application/octet-stream'
    combined += get_attachment_text_one_file(mime_type, contents)
  end
  combined
end
# Turn instance methods into class methods
extend self
end
|