diff options
author | Seb Bacon <seb.bacon@gmail.com> | 2011-12-05 15:14:37 +0000 |
---|---|---|
committer | Seb Bacon <seb.bacon@gmail.com> | 2011-12-05 15:14:37 +0000 |
commit | 67c582b362e32ab2c3e4d56aef9145b30a46119f (patch) | |
tree | ed614e332e660fba04072bfecaef185813a3713d | |
parent | 27870ffb6d9f87aef670d632556f5ede031bb744 (diff) |
Add ability to rebuild specific terms rather than all indices -- useful for migrations where new terms have been added (see the help for the `rake xapian:rebuild_index` task)
-rw-r--r-- | spec/models/info_request_event_spec.rb | 20 | ||||
-rw-r--r-- | spec/models/xapian_spec.rb | 63 | ||||
-rw-r--r-- | spec/spec_helper.rb | 12 | ||||
-rw-r--r-- | vendor/plugins/acts_as_xapian/lib/acts_as_xapian.rb | 111 | ||||
-rw-r--r-- | vendor/plugins/acts_as_xapian/lib/tasks/xapian.rake | 23 |
5 files changed, 192 insertions, 37 deletions
diff --git a/spec/models/info_request_event_spec.rb b/spec/models/info_request_event_spec.rb index 666f5cb1a..055965c23 100644 --- a/spec/models/info_request_event_spec.rb +++ b/spec/models/info_request_event_spec.rb @@ -50,5 +50,25 @@ describe InfoRequestEvent do end end + + describe "doing search/index stuff" do + fixtures :public_bodies, :public_body_translations, :public_body_versions, :users, :info_requests, :raw_emails, :incoming_messages, :outgoing_messages, :comments, :info_request_events, :track_things + + it 'should get search text for outgoing messages' do + event = info_request_events(:useless_outgoing_message_event) + message = outgoing_messages(:useless_outgoing_message).body + event.search_text_main.should == message + "\n\n" + end + + it 'should get search text for incoming messages' do + event = info_request_events(:useless_incoming_message_event) + event.search_text_main.strip.should == "No way! I'm not going to tell you that in a month of Thursdays.\n\nThe Geraldine Quango" + end + + + end + + + end diff --git a/spec/models/xapian_spec.rb b/spec/models/xapian_spec.rb index 932966dfb..cf9ea5fbd 100644 --- a/spec/models/xapian_spec.rb +++ b/spec/models/xapian_spec.rb @@ -12,9 +12,10 @@ describe User, " when indexing users with Xapian" do end it "should search by 'about me' text" do + rebuild_xapian_index user = users(:bob_smith_user) - # def InfoRequest.full_search(models, query, order, ascending, collapse, per_page, page) + # def InfoRequest.full_search(models, query, order, ascending, collapse, per_page, page) xapian_object = InfoRequest.full_search([User], "stuff", 'created_at', true, nil, 100, 1) xapian_object.results.size.should == 1 xapian_object.results[0][:model].should == user @@ -332,6 +333,66 @@ describe PublicBody, " when indexing authorities by tag" do end end +describe PublicBody, " when only indexing selected things on a rebuild" do + fixtures :public_bodies, :public_body_translations, :public_body_versions, :users, :info_requests, 
:raw_emails, :incoming_messages, :outgoing_messages, :comments, :info_request_events, :track_things + before(:each) do + load_raw_emails_data(raw_emails) + end + + it "should only index what we ask it to" do + rebuild_xapian_index + body = public_bodies(:geraldine_public_body) + body.tag_string = 'mice:3' + body.name = 'frobzn' + body.save! + # only reindex 'variety' term + dropfirst = true + terms = "V" + values = false + texts = false + rebuild_xapian_index(terms, values, texts, dropfirst) + xapian_object = InfoRequest.full_search([PublicBody], "tag:mice", 'created_at', true, nil, 100, 1) + xapian_object.results.size.should == 0 + xapian_object = InfoRequest.full_search([PublicBody], "frobzn", 'created_at', true, nil, 100, 1) + xapian_object.results.size.should == 0 + xapian_object = InfoRequest.full_search([PublicBody], "variety:authority", 'created_at', true, nil, 100, 1) + xapian_object.results.size.should == 2 + # only reindex 'tag' and text + dropfirst = true + terms = "U" + values = false + texts = true + rebuild_xapian_index(terms, values, texts, dropfirst) + xapian_object = InfoRequest.full_search([PublicBody], "tag:mice", 'created_at', true, nil, 100, 1) + xapian_object.results.size.should == 1 + xapian_object = InfoRequest.full_search([PublicBody], "frobzn", 'created_at', true, nil, 100, 1) + xapian_object.results.size.should == 1 + xapian_object = InfoRequest.full_search([PublicBody], "variety:authority", 'created_at', true, nil, 100, 1) + xapian_object.results.size.should == 0 + # only reindex 'variety' term, but keeping the existing data in-place + dropfirst = false + terms = "V" + texts = false + rebuild_xapian_index(terms, values, texts, dropfirst) + xapian_object = InfoRequest.full_search([PublicBody], "tag:mice", 'created_at', true, nil, 100, 1) + xapian_object.results.size.should == 1 + xapian_object = InfoRequest.full_search([PublicBody], "frobzn", 'created_at', true, nil, 100, 1) + xapian_object.results.size.should == 1 + xapian_object = 
InfoRequest.full_search([PublicBody], "variety:authority", 'created_at', true, nil, 100, 1) + xapian_object.results.size.should == 2 + # only reindex 'variety' term, blowing away existing data + dropfirst = true + rebuild_xapian_index(terms, values, texts, dropfirst) + xapian_object = InfoRequest.full_search([PublicBody], "tag:mice", 'created_at', true, nil, 100, 1) + xapian_object.results.size.should == 0 + xapian_object = InfoRequest.full_search([PublicBody], "frobzn", 'created_at', true, nil, 100, 1) + xapian_object.results.size.should == 0 + xapian_object = InfoRequest.full_search([PublicBody], "variety:authority", 'created_at', true, nil, 100, 1) + xapian_object.results.size.should == 2 + end +end + + diff --git a/spec/spec_helper.rb b/spec/spec_helper.rb index ecb67a3b4..e5a42f1a9 100644 --- a/spec/spec_helper.rb +++ b/spec/spec_helper.rb @@ -77,12 +77,20 @@ def load_file_fixture(file_name) return content end -def rebuild_xapian_index +def rebuild_xapian_index(terms = true, values = true, texts = true, dropfirst = true) + if dropfirst + begin + ActsAsXapian.readable_init + FileUtils.rm_r(ActsAsXapian.db_path) + rescue RuntimeError + end + ActsAsXapian.writable_init + end verbose = false # safe_rebuild=true, which involves forking to avoid memory leaks, doesn't work well with rspec. # unsafe is significantly faster, and we can afford possible memory leaks while testing. 
safe_rebuild = false - ActsAsXapian.rebuild_index(["PublicBody", "User", "InfoRequestEvent"].map{|m| m.constantize}, verbose, safe_rebuild) + ActsAsXapian.rebuild_index(["PublicBody", "User", "InfoRequestEvent"].map{|m| m.constantize}, verbose, terms, values, texts, safe_rebuild) end def update_xapian_index diff --git a/vendor/plugins/acts_as_xapian/lib/acts_as_xapian.rb b/vendor/plugins/acts_as_xapian/lib/acts_as_xapian.rb index 0af49dffd..fb6a08979 100644 --- a/vendor/plugins/acts_as_xapian/lib/acts_as_xapian.rb +++ b/vendor/plugins/acts_as_xapian/lib/acts_as_xapian.rb @@ -589,7 +589,7 @@ module ActsAsXapian # Incremental update_index calls above are suspended while this rebuild # happens (i.e. while the .new database is there) - any index update jobs # are left in the database, and will run after the rebuild has finished. - def ActsAsXapian.rebuild_index(model_classes, verbose = false, safe_rebuild = true) + def ActsAsXapian.rebuild_index(model_classes, verbose = false, terms = true, values = true, texts = true, safe_rebuild = true) #raise "when rebuilding all, please call as first and only thing done in process / task" if not ActsAsXapian.writable_db.nil? 
prepare_environment @@ -603,7 +603,7 @@ module ActsAsXapian # Index everything if safe_rebuild - _rebuild_index_safely(model_classes, verbose) + _rebuild_index_safely(model_classes, verbose, terms, values, texts) else # Save time by running the indexing in one go and in-process ActsAsXapian.writable_init(".new") @@ -611,7 +611,7 @@ module ActsAsXapian STDOUT.puts("ActsAsXapian.rebuild_index: Rebuilding #{model_class.to_s}") if verbose model_class.find(:all).each do |model| STDOUT.puts("ActsAsXapian.rebuild_index #{model_class} #{model.id}") if verbose - model.xapian_index + model.xapian_index(terms, values, texts) end end # make sure everything is written and close @@ -641,7 +641,7 @@ module ActsAsXapian # so they get the new db end - def ActsAsXapian._rebuild_index_safely(model_classes, verbose) + def ActsAsXapian._rebuild_index_safely(model_classes, verbose, terms, values, texts) batch_size = 1000 for model_class in model_classes model_class_count = model_class.count @@ -664,7 +664,7 @@ module ActsAsXapian STDOUT.puts("ActsAsXapian.rebuild_index: New batch. 
#{model_class.to_s} from #{i} to #{i + batch_size} of #{model_class_count} pid #{Process.pid.to_s}") if verbose model_class.find(:all, :limit => batch_size, :offset => i, :order => :id).each do |model| STDOUT.puts("ActsAsXapian.rebuild_index #{model_class} #{model.id}") if verbose - model.xapian_index + model.xapian_index(terms, values, texts) end # make sure everything is written ActsAsXapian.writable_db.flush @@ -738,7 +738,7 @@ module ActsAsXapian end # Store record in the Xapian database - def xapian_index + def xapian_index(terms = true, values = true, texts = true) # if we have a conditional function for indexing, call it and destory object if failed if self.class.xapian_options.include?(:if) if_value = xapian_value(self.class.xapian_options[:if], :boolean) @@ -748,37 +748,90 @@ module ActsAsXapian end end + if self.class.to_s == "PublicBody" and self.url_name == "tgq" + +#require 'ruby-debug' +#debugger + end # otherwise (re)write the Xapian record for the object - doc = Xapian::Document.new - ActsAsXapian.term_generator.document = doc + ActsAsXapian.readable_init + existing_query = Xapian::Query.new("I" + self.xapian_document_term) + ActsAsXapian.enquire.query = existing_query + match = ActsAsXapian.enquire.mset(0,1,1).matches[0] - doc.data = self.xapian_document_term + if !match.nil? + doc = match.document + else + doc = Xapian::Document.new + doc.data = self.xapian_document_term + doc.add_term("M" + self.class.to_s) + doc.add_term("I" + doc.data) + end + ActsAsXapian.term_generator.document = doc + # work out what to index. XXX for now, this is only selective on "terms". 
+ terms_to_index = [] + drop_all_terms = false + if terms and self.xapian_options[:terms] + terms_to_index = self.xapian_options[:terms].dup + if terms.is_a?(String) + terms_to_index.reject!{|term| !terms.include?(term[1])} + if terms_to_index.length == self.xapian_options[:terms].length + drop_all_terms = true + end + else + drop_all_terms = true + end + end + texts_to_index = [] + if texts and self.xapian_options[:texts] + texts_to_index = self.xapian_options[:texts] + end + values_to_index = [] + if values and self.xapian_options[:values] + values_to_index = self.xapian_options[:values] + end - doc.add_term("M" + self.class.to_s) - doc.add_term("I" + doc.data) - if self.xapian_options[:terms] - for term in self.xapian_options[:terms] - value = xapian_value(term[0]) - if value.kind_of?(Array) + # clear any existing values that we might want to replace + if drop_all_terms && texts + # as an optimisation, if we're reindexing all of both, we remove everything + doc.clear_terms + doc.add_term("M" + self.class.to_s) + doc.add_term("I" + doc.data) + else + term_prefixes_to_index = terms_to_index.map {|x| x[1]} + for existing_term in doc.terms + first_letter = existing_term.term[0...1] + if !"MI".include?(first_letter) + if first_letter.match("^[A-Z]+") && terms_to_index.include?(first_letter) + doc.remove_term(existing_term.term) + elsif texts + doc.remove_term(existing_term.term) + end + end + end + end + # for now, we always clear values + doc.clear_values + + for term in terms_to_index + value = xapian_value(term[0]) + if value.kind_of?(Array) for v in value - doc.add_term(term[1] + v) + doc.add_term(term[1] + v) end - else + else doc.add_term(term[1] + value) - end - end + end end - if self.xapian_options[:values] - for value in self.xapian_options[:values] - doc.add_value(value[1], xapian_value(value[0], value[3])) - end + # values + for value in values_to_index + doc.add_value(value[1], xapian_value(value[0], value[3])) end - if self.xapian_options[:texts] - for 
text in self.xapian_options[:texts] - ActsAsXapian.term_generator.increase_termpos # stop phrases spanning different text fields - # XXX the "1" here is a weight that could be varied for a boost function - ActsAsXapian.term_generator.index_text(xapian_value(text, nil, true), 1) - end + # texts + for text in texts_to_index + ActsAsXapian.term_generator.increase_termpos # stop phrases spanning different text fields + # XXX the "1" here is a weight that could be varied for a boost function + ActsAsXapian.term_generator.index_text(xapian_value(text, nil, true), 1) end ActsAsXapian.writable_db.replace_document("I" + doc.data, doc) diff --git a/vendor/plugins/acts_as_xapian/lib/tasks/xapian.rake b/vendor/plugins/acts_as_xapian/lib/tasks/xapian.rake index 7168895f9..d18cd07d5 100644 --- a/vendor/plugins/acts_as_xapian/lib/tasks/xapian.rake +++ b/vendor/plugins/acts_as_xapian/lib/tasks/xapian.rake @@ -15,14 +15,27 @@ namespace :xapian do # Parameters - specify 'models="PublicBody User"' to say which models # you index with Xapian. - # This totally rebuilds the database, so you will want to restart any - # web server afterwards to make sure it gets the changes, rather than - # still pointing to the old deleted database. Specify "verbose=true" to - # print model name as it is run. + + # This totally rebuilds the database, so you will want to restart + # any web server afterwards to make sure it gets the changes, + # rather than still pointing to the old deleted database. Specify + # "verbose=true" to print model name as it is run. By default, + # all of the terms, values and texts are reindexed. You can + # suppress any of these by specifying, for example, "texts=false". + # You can specify that only certain terms should be updated by + # specifying their prefix(es) as a string, e.g. 
"terms=IV" will + # index the two terms I and V (and "terms=false" will index none, + # and "terms=true", the default, will index all) + + desc 'Completely rebuilds Xapian search index (must specify all models)' task :rebuild_index => :environment do raise "specify ALL your models with models=\"ModelName1 ModelName2\" as parameter" if ENV['models'].nil? - ActsAsXapian.rebuild_index(ENV['models'].split(" ").map{|m| m.constantize}, ENV['verbose'] ? true : false) + ActsAsXapian.rebuild_index(ENV['models'].split(" ").map{|m| m.constantize}, + ENV['verbose'] ? true : false, + ENV['terms'] == "false" ? false : ENV['terms'], + ENV['values'] == "false" ? false : ENV['values'], + ENV['texts'] == "false" ? false : true) end # Parameters - are models, query, offset, limit, sort_by_prefix, |