# lib/tasks/temp.rake

namespace :temp do

    desc 'Populate the request_classifications table from info_request_events'
    task :populate_request_classifications => :environment do
        # Creates one RequestClassification per 'status_update' event,
        # copying the event's timestamp, the user id stored in its
        # params, and the event's own id.
        status_update_conditions = ["event_type = 'status_update'"]
        InfoRequestEvent.find_each(:conditions => status_update_conditions) do |event|
            attributes = {
                :created_at => event.created_at,
                :user_id => event.params[:user_id],
                :info_request_event_id => event.id
            }
            RequestClassification.create!(attributes)
        end
    end

    desc "Remove plaintext passwords from post_redirect params"
    task :remove_post_redirect_passwords => :environment do
        # Walks every PostRedirect that has stored params and strips
        # the :password entry from its :signchangeemail section, if
        # there is one.  Records without such an entry are not saved.
        PostRedirect.find_each(:conditions => ['post_params_yaml is not null']) do |post_redirect|
            stored_params = post_redirect.post_params
            next unless stored_params

            email_change = stored_params[:signchangeemail]
            next unless email_change && email_change[:password]

            email_change.delete(:password)
            post_redirect.post_params = stored_params
            post_redirect.save!
        end
    end

    desc 'Remove file caches for requests that are not publicly visible or have been destroyed'
    task :remove_obsolete_info_request_caches => :environment do
        # Both switches default to on; only an explicit '0' in the
        # environment turns them off (DRYRUN=0 actually deletes,
        # VERBOSE=0 silences the progress messages).
        dryrun = ENV['DRYRUN'] != '0'
        verbose = ENV['VERBOSE'] != '0'
        puts "Running in dryrun mode" if dryrun

        # The last path component of each match is used as the
        # InfoRequest id.
        cache_glob = File.join(Rails.root, 'cache', 'views', 'request', '*', '*')
        Dir.glob(cache_glob) do |cache_dir|
            request_id = File.basename(cache_dir)
            puts "Looking for InfoRequest with id #{request_id}" if verbose
            begin
                request = InfoRequest.find(request_id)
                puts "Got InfoRequest #{request_id}" if verbose
                # Publicly visible requests keep their cache
                next if request.all_can_view?
                puts "Deleting cache at #{cache_dir} for hidden/requester_only InfoRequest #{request_id}"
            rescue ActiveRecord::RecordNotFound
                puts "Deleting cache at #{cache_dir} for deleted InfoRequest #{request_id}"
            end
            # Only reached for caches reported above
            FileUtils.rm_rf(cache_dir) unless dryrun
        end
    end

    desc 'Create a CSV file of a random selection of raw emails, for comparing hexdigests'
    task :random_attachments_hexdigests => :environment do

        # This is meant to be run under the Rails 2 codebase, where
        # TMail was used to extract the attachments.  It writes two
        # files into the current directory:
        #
        #   attachment-hexdigests.csv - one row per attachment, with
        #                               the raw email's file path
        #   raw-email-files           - one raw email file path per
        #                               line
        #
        # The second file makes it easy to tar up the emails with:
        #
        #   tar cvz -T raw-email-files -f raw_emails.tar.gz
        #
        # After switching to the Rails 3 codebase, the results can be
        # compared using recompute_attachments_hexdigests.

        require 'csv'

        File.open('raw-email-files', 'w') do |email_list|
            CSV.open('attachment-hexdigests.csv', 'w') do |digest_csv|
                digest_csv << ['filepath', 'i', 'url_part_number', 'hexdigest']
                sampled_messages = IncomingMessage.all(:order => 'RANDOM()', :limit => 1000)
                sampled_messages.each do |message|
                    # raw_email.filepath fails for a message without
                    # an associated request, so skip those
                    next unless message.info_request
                    email_path = message.raw_email.filepath
                    email_list.puts email_path
                    message.foi_attachments.each_with_index do |attachment, index|
                        digest_csv << [email_path, index, attachment.url_part_number, attachment.hexdigest]
                    end
                end
            end
        end

    end


    desc 'Check the hexdigests of attachments in emails on disk'
    task :recompute_attachments_hexdigests => :environment do

        # Reads attachment-hexdigests.csv from the current directory
        # (columns: filepath, i, url_part_number, hexdigest - the
        # layout written by random_attachments_hexdigests), re-parses
        # each listed raw email with MailHandler, and reports any
        # attachment whose count, url_part_number or hexdigest
        # differs from the recorded value.  Bodies whose digest
        # differs are written under /tmp for inspection, and summary
        # counters are printed at the end.

        require 'csv'
        require 'digest/md5'

        # Held in a local rather than a constant: assigning a
        # constant here would define a global OldAttachment whenever
        # the task runs, and warn if it ran a second time.
        recorded_attachment = Struct.new(:filename, :attachment_index, :url_part_number, :hexdigest)

        # Maps each raw email path to its recorded attachments, in
        # the order they appear in the CSV.
        filename_to_attachments = Hash.new { |h, k| h[k] = [] }

        header_line = true
        CSV.foreach('attachment-hexdigests.csv') do |filename, attachment_index, url_part_number, hexdigest|
            if header_line
                # The first row holds the column names, not data
                header_line = false
                next
            end
            filename_to_attachments[filename].push(
                recorded_attachment.new(filename, attachment_index, url_part_number, hexdigest))
        end

        total_attachments = 0
        attachments_with_different_hexdigest = 0
        files_with_different_numbers_of_attachments = 0
        no_tnef_attachments = 0
        no_parts_in_multipart = 0

        # Error messages that are counted and skipped rather than
        # treated as fatal.
        multipart_error = "no parts on multipart mail"
        tnef_error = "tnef produced no attachments"

        # Now check each file:
        filename_to_attachments.each do |filename, old_attachments|

            # Currently it doesn't seem to be possible to reuse the
            # attachment parsing code in Alaveteli without saving
            # objects to the database, so reproduce what it does:

            raw_email = File.read(filename)
            mail = MailHandler.mail_from_raw_email(raw_email)

            begin
                attachment_attributes = MailHandler.get_attachment_attributes(mail)
            rescue IOError => e
                raise unless e.message == tnef_error
                puts "#{filename} #{tnef_error}"
                no_tnef_attachments += 1
                next
            rescue Exception => e
                # NOTE(review): the class raised for this message is
                # not visible from here, so the broad rescue is kept;
                # anything with a different message is re-raised
                # untouched.  Narrow to StandardError once confirmed.
                raise unless e.message == multipart_error
                puts "#{filename} #{multipart_error}"
                no_parts_in_multipart += 1
                next
            end

            if attachment_attributes.length != old_attachments.length
                puts "#{filename} the number of old attachments #{old_attachments.length} didn't match the number of new attachments #{attachment_attributes.length}"
                files_with_different_numbers_of_attachments += 1
                next
            end

            # Attachments are compared pairwise by position
            old_attachments.each_with_index do |old_attachment, i|
                total_attachments += 1
                attrs = attachment_attributes[i]
                old_hexdigest = old_attachment.hexdigest
                new_hexdigest = attrs[:hexdigest]
                new_content_type = attrs[:content_type]
                # Values read from the CSV are strings
                old_url_part_number = old_attachment.url_part_number.to_i
                new_url_part_number = attrs[:url_part_number]
                if old_url_part_number != new_url_part_number
                    puts "#{i} #{filename} old_url_part_number #{old_url_part_number}, new_url_part_number #{new_url_part_number}"
                end

                next if old_hexdigest == new_hexdigest

                body = attrs[:body]
                # First, if the content type is one of text/plain,
                # text/html or application/rtf try changing CRLF to
                # LF and calculating a new digest - we generally
                # don't worry about these changes:
                new_converted_hexdigest = nil
                if ["text/plain", "text/html", "application/rtf"].include?(new_content_type)
                    converted_body = body.gsub(/\r\n/, "\n")
                    new_converted_hexdigest = Digest::MD5.hexdigest(converted_body)
                    puts "new_converted_hexdigest is #{new_converted_hexdigest}"
                end

                # A match after line-ending conversion is not counted
                # as a difference
                next if new_converted_hexdigest && old_hexdigest == new_converted_hexdigest

                puts "#{i} #{filename} old_hexdigest #{old_hexdigest} wasn't the same as new_hexdigest #{new_hexdigest}"
                puts "  body was of length #{body.length}"
                puts "  content type was: #{new_content_type}"
                path = "/tmp/#{new_hexdigest}"
                # Binary mode so the bytes are written untranscoded;
                # the block form closes the handle even if the write
                # raises.
                File.open(path, "wb") { |f| f.write(body) }
                puts "  wrote body to #{path}"
                attachments_with_different_hexdigest += 1
            end

        end

        puts "total_attachments: #{total_attachments}"
        puts "attachments_with_different_hexdigest: #{attachments_with_different_hexdigest}"
        puts "files_with_different_numbers_of_attachments: #{files_with_different_numbers_of_attachments}"
        puts "no_tnef_attachments: #{no_tnef_attachments}"
        puts "no_parts_in_multipart: #{no_parts_in_multipart}"

    end

end