author     Mark Longair <mhl@pobox.com>  2013-10-08 15:53:21 +0100
committer  Mark Longair <mhl@pobox.com>  2013-10-15 10:46:15 +0100
commit     4c695f76fa62aeca14694f0af75e86f465f7efac (patch)
tree       7bfa580d4abc323cda5ff99330fa80b8eeac919b
parent     48ede95299e286c03a45966b00990b544b727206 (diff)
Add a rake task to import public bodies from a CSV file
For importing a very large number of public bodies, it's most likely
less frustrating to import them from a CSV file using this rake task
instead of using the form in the admin interface.
Fixes #1132
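
As a rough usage sketch (the bodies.csv file name and the sample rows are
invented, and the usual "bundle exec rake" invocation is assumed), the task
added below would be run along these lines:

    # Example input: the task's header check strips a leading '#' from column
    # names, and PublicBody.import_csv reads the 'name' and 'request_email'
    # columns.
    $ cat bodies.csv
    #name,request_email
    Example Borough Council,foi@example-borough.example.org
    Example Water Authority,requests@example-water.example.org

    # Dry run (the default unless DRYRUN is set to '0'): checks for duplicate
    # names and slugs and reports progress, without creating any public bodies.
    $ bundle exec rake import:import_csv < bodies.csv

    # Set DRYRUN=0 to actually create the public bodies.
    $ DRYRUN=0 bundle exec rake import:import_csv < bodies.csv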
-rw-r--r--  app/models/public_body.rb |  2
-rw-r--r--  lib/tasks/import.rake     | 86
2 files changed, 88 insertions, 0 deletions
diff --git a/app/models/public_body.rb b/app/models/public_body.rb
index 485a794b0..9adcdc4a0 100644
--- a/app/models/public_body.rb
+++ b/app/models/public_body.rb
@@ -407,6 +407,8 @@ class PublicBody < ActiveRecord::Base
             fields = {}
             field_names.each{|name, i| fields[name] = row[i]}
 
+            yield line, fields if block_given?
+
             name = row[field_names['name']]
             email = row[field_names['request_email']]
             next if name.nil?
diff --git a/lib/tasks/import.rake b/lib/tasks/import.rake
new file mode 100644
index 000000000..f6a1f3e38
--- /dev/null
+++ b/lib/tasks/import.rake
@@ -0,0 +1,86 @@
+require 'csv'
+require 'tempfile'
+
+require 'action_view'
+require 'action_view/helpers'
+
+namespace :import do
+
+    include ActionView::Helpers::DateHelper
+
+    desc 'Import public bodies from CSV provided on standard input'
+    task :import_csv => :environment do
+        dryrun = ENV['DRYRUN'] != '0'
+        if dryrun
+            STDERR.puts "Only a dry run; public bodies will not be created"
+        end
+
+        tmp_csv = Tempfile.new 'alaveteli'
+        tmp_csv.write STDIN.read
+
+        number_of_rows = 0
+
+        STDERR.puts "Preliminary check for ambiguous names or slugs..."
+
+        # Check that the name and slugified version of the name are
+        # unique:
+        url_part_count = Hash.new { 0 }
+        name_count = Hash.new { 0 }
+        reader = CSV.open tmp_csv.path, 'r'
+        header_line = reader.shift
+        headers = header_line.collect { |h| h.gsub /^#/, ''}
+
+        reader.each do |row_array|
+            row = Hash[headers.zip row_array]
+            name = row['name']
+            url_part = MySociety::Format::simplify_url_part name, "body"
+            name_count[name] += 1
+            url_part_count[url_part] += 1
+            number_of_rows += 1
+        end
+
+        non_unique_error = false
+
+        [[name_count, 'name'],
+         [url_part_count, 'url_part']].each do |counter, field|
+            counter.sort.map do |name, count|
+                if count > 1
+                    non_unique_error = true
+                    STDERR.puts "The #{field} #{name} was found #{count} times."
+                end
+            end
+        end
+
+        next if non_unique_error
+
+        STDERR.puts "Now importing the public bodies..."
+
+        start = Time.now.to_f
+
+        # Now it's (probably) safe to try to import:
+        errors, notes = PublicBody.import_csv(tmp_csv.path,
+                                              tag='',
+                                              tag_behaviour='replace',
+                                              dryrun,
+                                              editor="#{ENV['USER']} (Unix user)",
+                                              I18n.available_locales) do |row_number, fields|
+            now = Time.now.to_f
+            percent_complete = (100 * row_number.to_f / number_of_rows).to_i
+            expected_end = number_of_rows * (now - start) / row_number.to_f + start
+            time_left = distance_of_time_in_words now, expected_end
+            STDERR.print "#{row_number} out of #{number_of_rows} "
+            STDERR.print "(#{percent_complete}% complete) "
+            STDERR.puts "#{time_left} remaining"
+        end
+
+        if errors.length > 0
+            STDERR.puts "Import failed, with the following errors:"
+            errors.each do |error|
+                STDERR.puts " #{error}"
+            end
+        else
+            STDERR.puts "Done."
+        end
+
+    end
+end