diff options
Diffstat (limited to 'lib/tasks/temp.rake')
-rw-r--r-- | lib/tasks/temp.rake | 272 |
1 files changed, 237 insertions, 35 deletions
diff --git a/lib/tasks/temp.rake b/lib/tasks/temp.rake index e49a84ecb..fcabb23de 100644 --- a/lib/tasks/temp.rake +++ b/lib/tasks/temp.rake @@ -1,53 +1,255 @@ namespace :temp do - desc 'Populate the request_classifications table from info_request_events' - task :populate_request_classifications => :environment do - InfoRequestEvent.find_each(:conditions => ["event_type = 'status_update'"]) do |classification| - RequestClassification.create!(:created_at => classification.created_at, - :user_id => classification.params[:user_id], - :info_request_event_id => classification.id) - end + def disable_duplicate_account(user, count, dryrun) + dupe_email = "duplicateemail#{count}@example.com" + puts "Updating #{user.email} to #{dupe_email} for user #{user.id}" + user.email = dupe_email + user.save! unless dryrun end - desc "Remove plaintext passwords from post_redirect params" - task :remove_post_redirect_passwords => :environment do - PostRedirect.find_each(:conditions => ['post_params_yaml is not null']) do |post_redirect| - if post_redirect.post_params && post_redirect.post_params[:signchangeemail] && post_redirect.post_params[:signchangeemail][:password] - params = post_redirect.post_params - params[:signchangeemail].delete(:password) - post_redirect.post_params = params - post_redirect.save! - end + desc "Re-extract any missing cached attachments" + task :reextract_missing_attachments, [:commit] => :environment do |t, args| + dry_run = args.commit.nil? || args.commit.empty? + total_messages = 0 + messages_to_reparse = 0 + IncomingMessage.find_each :include => :foi_attachments do |im| + reparse = im.foi_attachments.any? { |fa| ! File.exists? fa.filepath } + total_messages += 1 + messages_to_reparse += 1 if reparse + if total_messages % 1000 == 0 + puts "Considered #{total_messages} received emails." + end + unless dry_run + im.parse_raw_email! true if reparse + sleep 2 + end end + message = dry_run ? "Would reparse" : "Reparsed" + message += " #{messages_to_reparse} out of #{total_messages} received emails." + puts message end - desc 'Remove file caches for requests that are not publicly visible or have been destroyed' - task :remove_obsolete_info_request_caches => :environment do + desc 'Cleanup accounts with a space in the email address' + task :clean_up_emails_with_spaces => :environment do dryrun = ENV['DRYRUN'] == '0' ? false : true - verbose = ENV['VERBOSE'] == '0' ? false : true if dryrun - puts "Running in dryrun mode" + puts "This is a dryrun" end - request_cache_path = File.join(Rails.root, 'cache', 'views', 'request', '*', '*') - Dir.glob(request_cache_path) do |request_subdir| - info_request_id = File.basename(request_subdir) - puts "Looking for InfoRequest with id #{info_request_id}" if verbose - begin - info_request = InfoRequest.find(info_request_id) - puts "Got InfoRequest #{info_request_id}" if verbose - if ! info_request.all_can_view? - puts "Deleting cache at #{request_subdir} for hidden/requester_only InfoRequest #{info_request_id}" - if ! dryrun - FileUtils.rm_rf(request_subdir) + count = 0 + User.find_each do |user| + if / /.match(user.email) + + email_without_spaces = user.email.gsub(' ', '') + existing = User.find_user_by_email(email_without_spaces) + # Another account exists with the canonical address + if existing + if user.info_requests.count == 0 and user.comments.count == 0 and user.track_things.count == 0 + count += 1 + disable_duplicate_account(user, count, dryrun) + elsif existing.info_requests.count == 0 and existing.comments.count == 0 and existing.track_things.count == 0 + count += 1 + disable_duplicate_account(existing, count, dryrun) + user.email = email_without_spaces + puts "Updating #{user.email} to #{email_without_spaces} for user #{user.id}" + user.save! unless dryrun + else + user.info_requests.each do |info_request| + info_request.user = existing + info_request.save! unless dryrun + puts "Moved request #{info_request.id} from user #{user.id} to #{existing.id}" + end + + user.comments.each do |comment| + comment.user = existing + comment.save! unless dryrun + puts "Moved comment #{comment.id} from user #{user.id} to #{existing.id}" + end + + user.track_things.each do |track_thing| + track_thing.tracking_user = existing + track_thing.save! unless dryrun + puts "Moved track thing #{track_thing.id} from user #{user.id} to #{existing.id}" + end + + TrackThingsSentEmail.find_each(:conditions => ['user_id = ?', user]) do |sent_email| + sent_email.user = existing + sent_email.save! unless dryrun + puts "Moved track thing sent email #{sent_email.id} from user #{user.id} to #{existing.id}" + + end + + user.censor_rules.each do |censor_rule| + censor_rule.user = existing + censor_rule.save! unless dryrun + puts "Moved censor rule #{censor_rule.id} from user #{user.id} to #{existing.id}" + end + + user.user_info_request_sent_alerts.each do |sent_alert| + sent_alert.user = existing + sent_alert.save! unless dryrun + puts "Moved sent alert #{sent_alert.id} from user #{user.id} to #{existing.id}" + end + + count += 1 + disable_duplicate_account(user, count, dryrun) end + else + puts "Updating #{user.email} to #{email_without_spaces} for user #{user.id}" + user.email = email_without_spaces + user.save! unless dryrun + end + end + end + end + + desc 'Create a CSV file of a random selection of raw emails, for comparing hexdigests' + task :random_attachments_hexdigests => :environment do + + # The idea is to run this under the Rail 2 codebase, where + # Tmail was used to extract the attachements, and the task + # will output all of those file paths in a CSV file, and a + # list of the raw email files in another. The latter file is + # useful so that one can easily tar up the emails with: + # + # tar cvz -T raw-email-files -f raw_emails.tar.gz + # + # Then you can switch to the Rails 3 codebase, where + # attachment parsing is done via + # recompute_attachments_hexdigests + + require 'csv' + + File.open('raw-email-files', 'w') do |f| + CSV.open('attachment-hexdigests.csv', 'w') do |csv| + csv << ['filepath', 'i', 'url_part_number', 'hexdigest'] + IncomingMessage.all(:order => 'RANDOM()', :limit => 1000).each do |incoming_message| + # raw_email.filepath fails unless the + # incoming_message has an associated request + next unless incoming_message.info_request + raw_email = incoming_message.raw_email + f.puts raw_email.filepath + incoming_message.foi_attachments.each_with_index do |attachment, i| + csv << [raw_email.filepath, i, attachment.url_part_number, attachment.hexdigest] + end + end + end + end + + end + + + desc 'Check the hexdigests of attachments in emails on disk' + task :recompute_attachments_hexdigests => :environment do + + require 'csv' + require 'digest/md5' + + OldAttachment = Struct.new :filename, :attachment_index, :url_part_number, :hexdigest + + filename_to_attachments = Hash.new {|h,k| h[k] = []} + + header_line = true + CSV.foreach('attachment-hexdigests.csv') do |filename, attachment_index, url_part_number, hexdigest| + if header_line + header_line = false + else + filename_to_attachments[filename].push OldAttachment.new filename, attachment_index, url_part_number, hexdigest + end + end + + total_attachments = 0 + attachments_with_different_hexdigest = 0 + files_with_different_numbers_of_attachments = 0 + no_tnef_attachments = 0 + no_parts_in_multipart = 0 + + multipart_error = "no parts on multipart mail" + tnef_error = "tnef produced no attachments" + + # Now check each file: + filename_to_attachments.each do |filename, old_attachments| + + # Currently it doesn't seem to be possible to reuse the + # attachment parsing code in Alaveteli without saving + # objects to the database, so reproduce what it does: + + raw_email = nil + File.open(filename) do |f| + raw_email = f.read + end + mail = MailHandler.mail_from_raw_email(raw_email) + + begin + attachment_attributes = MailHandler.get_attachment_attributes(mail) + rescue IOError => e + if e.message == tnef_error + puts "#{filename} #{tnef_error}" + no_tnef_attachments += 1 + next + else + raise + end + rescue Exception => e + if e.message == multipart_error + puts "#{filename} #{multipart_error}" + no_parts_in_multipart += 1 + next + else + raise end - rescue ActiveRecord::RecordNotFound - puts "Deleting cache at #{request_subdir} for deleted InfoRequest #{info_request_id}" - if ! dryrun - FileUtils.rm_rf(request_subdir) + end + + if attachment_attributes.length != old_attachments.length + puts "#{filename} the number of old attachments #{old_attachments.length} didn't match the number of new attachments #{attachment_attributes.length}" + files_with_different_numbers_of_attachments += 1 + else + old_attachments.each_with_index do |old_attachment, i| + total_attachments += 1 + attrs = attachment_attributes[i] + old_hexdigest = old_attachment.hexdigest + new_hexdigest = attrs[:hexdigest] + new_content_type = attrs[:content_type] + old_url_part_number = old_attachment.url_part_number.to_i + new_url_part_number = attrs[:url_part_number] + if old_url_part_number != new_url_part_number + puts "#{i} #{filename} old_url_part_number #{old_url_part_number}, new_url_part_number #{new_url_part_number}" + end + if old_hexdigest != new_hexdigest + body = attrs[:body] + # First, if the content type is one of + # text/plain, text/html or application/rtf try + # changing CRLF to LF and calculating a new + # digest - we generally don't worry about + # these changes: + new_converted_hexdigest = nil + if ["text/plain", "text/html", "application/rtf"].include? new_content_type + converted_body = body.gsub /\r\n/, "\n" + new_converted_hexdigest = Digest::MD5.hexdigest converted_body + puts "new_converted_hexdigest is #{new_converted_hexdigest}" + end + if (! new_converted_hexdigest) || (old_hexdigest != new_converted_hexdigest) + puts "#{i} #{filename} old_hexdigest #{old_hexdigest} wasn't the same as new_hexdigest #{new_hexdigest}" + puts " body was of length #{body.length}" + puts " content type was: #{new_content_type}" + path = "/tmp/#{new_hexdigest}" + f = File.new path, "w" + f.write body + f.close + puts " wrote body to #{path}" + attachments_with_different_hexdigest += 1 + end + end end end + end + + puts "total_attachments: #{total_attachments}" + puts "attachments_with_different_hexdigest: #{attachments_with_different_hexdigest}" + puts "files_with_different_numbers_of_attachments: #{files_with_different_numbers_of_attachments}" + puts "no_tnef_attachments: #{no_tnef_attachments}" + puts "no_parts_in_multipart: #{no_parts_in_multipart}" + end end |