From 4c695f76fa62aeca14694f0af75e86f465f7efac Mon Sep 17 00:00:00 2001 From: Mark Longair Date: Tue, 8 Oct 2013 15:53:21 +0100 Subject: Add a rake task to import public bodies from a CSV file For importing a very large number of public bodies, it's most likely less frustrating to import them from the CSV file using this rake task instead of using the form in the admin interface. Fixes #1132 --- lib/tasks/import.rake | 86 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 86 insertions(+) create mode 100644 lib/tasks/import.rake (limited to 'lib/tasks/import.rake') diff --git a/lib/tasks/import.rake b/lib/tasks/import.rake new file mode 100644 index 000000000..f6a1f3e38 --- /dev/null +++ b/lib/tasks/import.rake @@ -0,0 +1,86 @@ +require 'csv' +require 'tempfile' + +require 'action_view' +require 'action_view/helpers' + +namespace :import do + + include ActionView::Helpers::DateHelper + + desc 'Import public bodies from CSV provided on standard input' + task :import_csv => :environment do + dryrun = ENV['DRYRUN'] != '0' + if dryrun + STDERR.puts "Only a dry run; public bodies will not be created" + end + + tmp_csv = Tempfile.new 'alaveteli' + tmp_csv.write STDIN.read + + number_of_rows = 0 + + STDERR.puts "Preliminary check for ambiguous names or slugs..." 
+ + # Check that the name and slugified version of the name are + # unique: + url_part_count = Hash.new { 0 } + name_count = Hash.new { 0 } + reader = CSV.open tmp_csv.path, 'r' + header_line = reader.shift + headers = header_line.collect { |h| h.gsub /^#/, ''} + + reader.each do |row_array| + row = Hash[headers.zip row_array] + name = row['name'] + url_part = MySociety::Format::simplify_url_part name, "body" + name_count[name] += 1 + url_part_count[url_part] += 1 + number_of_rows += 1 + end + + non_unique_error = false + + [[name_count, 'name'], + [url_part_count, 'url_part']].each do |counter, field| + counter.sort.map do |name, count| + if count > 1 + non_unique_error = true + STDERR.puts "The #{field} #{name} was found #{count} times." + end + end + end + + next if non_unique_error + + STDERR.puts "Now importing the public bodies..." + + start = Time.now.to_f + + # Now it's (probably) safe to try to import: + errors, notes = PublicBody.import_csv(tmp_csv.path, + tag='', + tag_behaviour='replace', + dryrun, + editor="#{ENV['USER']} (Unix user)", + I18n.available_locales) do |row_number, fields| + now = Time.now.to_f + percent_complete = (100 * row_number.to_f / number_of_rows).to_i + expected_end = number_of_rows * (now - start) / row_number.to_f + start + time_left = distance_of_time_in_words now, expected_end + STDERR.print "#{row_number} out of #{number_of_rows} " + STDERR.print "(#{percent_complete}% complete) " + STDERR.puts "#{time_left} remaining" + end + + if errors.length > 0 + STDERR.puts "Import failed, with the following errors:" + errors.each do |error| + STDERR.puts " #{error}" + end + else + STDERR.puts "Done." + end + + end +end -- cgit v1.2.3 From 37c91fceb8a2624e8bfb7a5dd644a36c81938b97 Mon Sep 17 00:00:00 2001 From: Mark Longair Date: Tue, 29 Oct 2013 17:43:59 +0000 Subject: Make sure that the temporary file is closed before reading from it Thanks to Louise Crow for pointing out this problem. 
--- lib/tasks/import.rake | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'lib/tasks/import.rake') diff --git a/lib/tasks/import.rake b/lib/tasks/import.rake index f6a1f3e38..015331a3a 100644 --- a/lib/tasks/import.rake +++ b/lib/tasks/import.rake @@ -15,8 +15,11 @@ namespace :import do STDERR.puts "Only a dry run; public bodies will not be created" end - tmp_csv = Tempfile.new 'alaveteli' - tmp_csv.write STDIN.read + tmp_csv = nil + Tempfile.open('alaveteli') do |f| + f.write STDIN.read + tmp_csv = f + end number_of_rows = 0 -- cgit v1.2.3 From d41314abf22b55c3215d6d012e573ea76391eeb3 Mon Sep 17 00:00:00 2001 From: Mark Longair Date: Tue, 29 Oct 2013 17:49:16 +0000 Subject: Remove the confusing "time remaining" message The time estimates were confusing on two counts: - The messages mixed locales - The estimates were hugely inaccurate, since the import slows down as time goes on --- lib/tasks/import.rake | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) (limited to 'lib/tasks/import.rake') diff --git a/lib/tasks/import.rake b/lib/tasks/import.rake index 015331a3a..0e8397fde 100644 --- a/lib/tasks/import.rake +++ b/lib/tasks/import.rake @@ -1,13 +1,8 @@ require 'csv' require 'tempfile' -require 'action_view' -require 'action_view/helpers' - namespace :import do - include ActionView::Helpers::DateHelper - desc 'Import public bodies from CSV provided on standard input' task :import_csv => :environment do dryrun = ENV['DRYRUN'] != '0' @@ -58,8 +53,6 @@ namespace :import do STDERR.puts "Now importing the public bodies..." 
- start = Time.now.to_f - # Now it's (probably) safe to try to import: errors, notes = PublicBody.import_csv(tmp_csv.path, tag='', @@ -67,13 +60,9 @@ namespace :import do dryrun, editor="#{ENV['USER']} (Unix user)", I18n.available_locales) do |row_number, fields| - now = Time.now.to_f percent_complete = (100 * row_number.to_f / number_of_rows).to_i - expected_end = number_of_rows * (now - start) / row_number.to_f + start - time_left = distance_of_time_in_words now, expected_end STDERR.print "#{row_number} out of #{number_of_rows} " - STDERR.print "(#{percent_complete}% complete) " - STDERR.puts "#{time_left} remaining" + STDERR.puts "(#{percent_complete}% complete)" end if errors.length > 0 -- cgit v1.2.3