diff options
Diffstat (limited to 'lib/acts_as_xapian')
-rw-r--r-- | lib/acts_as_xapian/acts_as_xapian.rb | 229 | ||||
-rw-r--r-- | lib/acts_as_xapian/tasks/xapian.rake | 2 |
2 files changed, 119 insertions, 112 deletions
diff --git a/lib/acts_as_xapian/acts_as_xapian.rb b/lib/acts_as_xapian/acts_as_xapian.rb index 9194e8cc4..e0c7c6ae7 100644 --- a/lib/acts_as_xapian/acts_as_xapian.rb +++ b/lib/acts_as_xapian/acts_as_xapian.rb @@ -1,4 +1,4 @@ -# encoding: utf-8 +# -*- encoding : utf-8 -*- # acts_as_xapian/lib/acts_as_xapian.rb: # Xapian full text search in Ruby on Rails. # @@ -39,7 +39,7 @@ module ActsAsXapian ###################################################################### # Module level variables # TODO: must be some kind of cattr_accessor that can do this better - def ActsAsXapian.bindings_available + def self.bindings_available $acts_as_xapian_bindings_available end class NoXapianRubyBindingsError < StandardError @@ -58,40 +58,40 @@ module ActsAsXapian $acts_as_xapian_class_var_init = true end - def ActsAsXapian.db + def self.db @@db end - def ActsAsXapian.db_path=(db_path) + def self.db_path=(db_path) @@db_path = db_path end - def ActsAsXapian.db_path + def self.db_path @@db_path end - def ActsAsXapian.writable_db + def self.writable_db @@writable_db end - def ActsAsXapian.stemmer + def self.stemmer @@stemmer end - def ActsAsXapian.term_generator + def self.term_generator @@term_generator end - def ActsAsXapian.enquire + def self.enquire @@enquire end - def ActsAsXapian.query_parser + def self.query_parser @@query_parser end - def ActsAsXapian.values_by_prefix + def self.values_by_prefix @@values_by_prefix end - def ActsAsXapian.config + def self.config @@config end ###################################################################### # Initialisation - def ActsAsXapian.init(classname = nil, options = nil) + def self.init(classname = nil, options = nil) if not classname.nil? # store class and options for use later, when we open the db in readable_init @@init_values.push([classname,options]) @@ -99,7 +99,7 @@ module ActsAsXapian end # Reads the config file (if any) and sets up the path to the database we'll be using - def ActsAsXapian.prepare_environment + def self.prepare_environment return unless @@db_path.nil? # barf if we can't figure out the environment @@ -130,17 +130,17 @@ module ActsAsXapian # Opens / reopens the db for reading # TODO: we perhaps don't need to rebuild database and enquire and queryparser - # but db.reopen wasn't enough by itself, so just do everything it's easier. - def ActsAsXapian.readable_init + def self.readable_init raise NoXapianRubyBindingsError.new("Xapian Ruby bindings not installed") unless ActsAsXapian.bindings_available raise "acts_as_xapian hasn't been called in any models" if @@init_values.empty? prepare_environment # We need to reopen the database each time, so Xapian gets changes to it. - # Calling reopen() does not always pick up changes for reasons that I can + # Calling reopen does not always pick up changes for reasons that I can # only speculate about at the moment. (It is easy to reproduce this by - # changing the code below to use reopen() rather than open() followed by - # close(), and running rake spec.) + # changing the code below to use reopen rather than open followed by + # close, and running rake spec.) if !@@db.nil? @@db.close end @@ -157,23 +157,20 @@ module ActsAsXapian end # Make a new query parser - def ActsAsXapian.init_query_parser + def self.init_query_parser # for queries @@query_parser = Xapian::QueryParser.new @@query_parser.stemmer = @@stemmer @@query_parser.stemming_strategy = Xapian::QueryParser::STEM_SOME @@query_parser.database = @@db @@query_parser.default_op = Xapian::Query::OP_AND - begin - @@query_parser.set_max_wildcard_expansion(1000) - rescue NoMethodError - # The set_max_wildcard_expansion method was introduced in Xapian 1.2.7, - # so may legitimately not be available. - # - # Large installations of Alaveteli should consider - # upgrading, because uncontrolled wildcard expansion - # can crash the whole server: see http://trac.xapian.org/ticket/350 - end + # The set_max_wildcard_expansion method was introduced in Xapian 1.2.7, + # so may legitimately not be available. + # + # Large installations of Alaveteli should consider + # upgrading, because uncontrolled wildcard expansion + # can crash the whole server: see http://trac.xapian.org/ticket/350 + @@query_parser.set_max_wildcard_expansion(1000) if @@query_parser.respond_to? :set_max_wildcard_expansion @@stopper = Xapian::SimpleStopper.new @@stopper.add("and") @@ -186,61 +183,68 @@ module ActsAsXapian @@values_by_prefix = {} @@value_ranges_store = [] - for init_value_pair in @@init_values - classname = init_value_pair[0] - options = init_value_pair[1] - + @@init_values.each do |classname, options| # go through the various field types, and tell query parser about them, # and error check them - i.e. check for consistency between models @@query_parser.add_boolean_prefix("model", "M") @@query_parser.add_boolean_prefix("modelid", "I") - if options[:terms] - for term in options[:terms] - raise "Use a single capital letter for term code" if not term[1].match(/^[A-Z]$/) - raise "M and I are reserved for use as the model/id term" if term[1] == "M" or term[1] == "I" - raise "model and modelid are reserved for use as the model/id prefixes" if term[2] == "model" or term[2] == "modelid" - raise "Z is reserved for stemming terms" if term[1] == "Z" - raise "Already have code '" + term[1] + "' in another model but with different prefix '" + @@terms_by_capital[term[1]] + "'" if @@terms_by_capital.include?(term[1]) && @@terms_by_capital[term[1]] != term[2] - @@terms_by_capital[term[1]] = term[2] - # TODO: use boolean here so doesn't stem our URL names in WhatDoTheyKnow - # If making acts_as_xapian generic, would really need to make the :terms have - # another option that lets people choose non-boolean for terms that need it - # (i.e. searching explicitly within a free text field) - @@query_parser.add_boolean_prefix(term[2], term[1]) - end + init_terms(options[:terms]) if options[:terms] + init_values(options[:values]) if options[:values] + end + end + + def self.init_values(values) + values.each do |method, index, prefix, value_type| + raise "Value index '#{index}' must be an Integer, is #{index.class}" unless index.is_a? Integer + if @@values_by_number.include?(index) && @@values_by_number[index] != prefix + raise "Already have value index '#{index}' in another model " \ + "but with different prefix '#{@@values_by_number[index]}'" end - if options[:values] - for value in options[:values] - raise "Value index '"+value[1].to_s+"' must be an integer, is " + value[1].class.to_s if value[1].class != 1.class - raise "Already have value index '" + value[1].to_s + "' in another model but with different prefix '" + @@values_by_number[value[1]].to_s + "'" if @@values_by_number.include?(value[1]) && @@values_by_number[value[1]] != value[2] - - # date types are special, mark them so the first model they're seen for - if !@@values_by_number.include?(value[1]) - if value[3] == :date - value_range = Xapian::DateValueRangeProcessor.new(value[1]) - elsif value[3] == :string - value_range = Xapian::StringValueRangeProcessor.new(value[1]) - elsif value[3] == :number - value_range = Xapian::NumberValueRangeProcessor.new(value[1]) - else - raise "Unknown value type '" + value[3].to_s + "'" - end - - @@query_parser.add_valuerangeprocessor(value_range) - - # stop it being garbage collected, as - # add_valuerangeprocessor ref is outside Ruby's GC - @@value_ranges_store.push(value_range) - end + # date types are special, mark them so the first model they're seen for + unless @@values_by_number.include?(index) + case value_type + when :date + value_range = Xapian::DateValueRangeProcessor.new(index) + when :string + value_range = Xapian::StringValueRangeProcessor.new(index) + when :number + value_range = Xapian::NumberValueRangeProcessor.new(index) + else + raise "Unknown value type '#{value_type}'" + end - @@values_by_number[value[1]] = value[2] - @@values_by_prefix[value[2]] = value[1] - end + @@query_parser.add_valuerangeprocessor(value_range) + + # stop it being garbage collected, as + # add_valuerangeprocessor ref is outside Ruby's GC + @@value_ranges_store.push(value_range) + end + + @@values_by_number[index] = prefix + @@values_by_prefix[prefix] = index + end + end + + def self.init_terms(terms) + terms.each do |method, term_code, prefix| + raise "Use a single capital letter for term code" if not term_code.match(/^[A-Z]$/) + raise "M and I are reserved for use as the model/id term" if term_code == "M" || term_code == "I" + raise "model and modelid are reserved for use as the model/id prefixes" if prefix == "model" || prefix == "modelid" + raise "Z is reserved for stemming terms" if term_code == "Z" + if @@terms_by_capital.include?(term_code) && @@terms_by_capital[term_code] != prefix + raise "Already have code '#{term_code}' in another model but with different prefix " \ + "'#{@@terms_by_capital[term_code]}'" end + @@terms_by_capital[term_code] = prefix + # TODO: use boolean here so doesn't stem our URL names in WhatDoTheyKnow + # If making acts_as_xapian generic, would really need to make the :terms have + # another option that lets people choose non-boolean for terms that need it + # (i.e. searching explicitly within a free text field) + @@query_parser.add_boolean_prefix(prefix, term_code) end end - def ActsAsXapian.writable_init(suffix = "") + def self.writable_init(suffix = "") raise NoXapianRubyBindingsError.new("Xapian Ruby bindings not installed") unless ActsAsXapian.bindings_available raise "acts_as_xapian hasn't been called in any models" if @@init_values.empty? @@ -256,7 +260,7 @@ module ActsAsXapian # for indexing @@writable_db = Xapian::WritableDatabase.new(full_path, Xapian::DB_CREATE_OR_OPEN) @@enquire = Xapian::Enquire.new(@@writable_db) - @@term_generator = Xapian::TermGenerator.new() + @@term_generator = Xapian::TermGenerator.new @@term_generator.set_flags(Xapian::TermGenerator::FLAG_SPELLING, 0) @@term_generator.database = @@writable_db @@term_generator.stemmer = @@stemmer @@ -332,7 +336,7 @@ module ActsAsXapian delay *= 2 delay = MSET_MAX_DELAY if delay > MSET_MAX_DELAY - ActsAsXapian.db.reopen() + ActsAsXapian.db.reopen retry else raise @@ -558,7 +562,7 @@ module ActsAsXapian matches = ActsAsXapian.enquire.mset(0, 100, 100) # TODO: so this whole method will only work with 100 docs # Get set of relevant terms for those documents - selection = Xapian::RSet.new() + selection = Xapian::RSet.new iter = matches._begin while not iter.equals(matches._end) selection.add_document(iter) @@ -610,30 +614,29 @@ module ActsAsXapian # flush your changes. Specifying flush will reduce performance, but make # sure that each index update is definitely saved to disk before # logging in the database that it has been. - def ActsAsXapian.update_index(flush = false, verbose = false) + def self.update_index(flush = false, verbose = false) # STDOUT.puts("start of ActsAsXapian.update_index") if verbose # Before calling writable_init we have to make sure every model class has been initialized. # i.e. has had its class code loaded, so acts_as_xapian has been called inside it, and # we have the info from acts_as_xapian. - model_classes = ActsAsXapianJob.find_by_sql("select model from acts_as_xapian_jobs group by model").map {|a| a.model.constantize} + model_classes = ActsAsXapianJob.pluck("DISTINCT model").map { |a| a.constantize } # If there are no models in the queue, then nothing to do - return if model_classes.size == 0 + return if model_classes.empty? ActsAsXapian.writable_init # Abort if full rebuild is going on new_path = ActsAsXapian.db_path + ".new" if File.exist?(new_path) - raise "aborting incremental index update while full index rebuild happens; found existing " + new_path + raise "aborting incremental index update while full index rebuild happens; found existing #{new_path}" end - ids_to_refresh = ActsAsXapianJob.find(:all).map() { |i| i.id } - for id in ids_to_refresh + ActsAsXapianJob.pluck(:id).each do |id| job = nil begin ActiveRecord::Base.transaction do begin - job = ActsAsXapianJob.find(id, :lock =>true) + job = ActsAsXapianJob.find(id, :lock => true) rescue ActiveRecord::RecordNotFound => e # This could happen if while we are working the model # was updated a second time by another process. In that case @@ -642,30 +645,7 @@ module ActsAsXapian #STDERR.puts("job with #{id} vanished under foot") if verbose next end - STDOUT.puts("ActsAsXapian.update_index #{job.action} #{job.model} #{job.model_id.to_s} #{Time.now.to_s}") if verbose - - begin - if job.action == 'update' - # TODO: Index functions may reference other models, so we could eager load here too? - model = job.model.constantize.find(job.model_id) # :include => cls.constantize.xapian_options[:include] - model.xapian_index - elsif job.action == 'destroy' - # Make dummy model with right id, just for destruction - model = job.model.constantize.new - model.id = job.model_id - model.xapian_destroy - else - raise "unknown ActsAsXapianJob action '" + job.action + "'" - end - rescue ActiveRecord::RecordNotFound => e - # this can happen if the record was hand deleted in the database - job.action = 'destroy' - retry - end - if flush - ActsAsXapian.writable_db.flush - end - job.destroy + run_job(job, flush, verbose) end rescue => detail # print any error, and carry on so other things are indexed @@ -678,7 +658,34 @@ module ActsAsXapian ActsAsXapian.writable_db.close end - def ActsAsXapian._is_xapian_db(path) + def self.run_job(job, flush, verbose) + STDOUT.puts("ActsAsXapian.update_index #{job.action} #{job.model} #{job.model_id.to_s} #{Time.now.to_s}") if verbose + + begin + if job.action == 'update' + # TODO: Index functions may reference other models, so we could eager load here too? + model = job.model.constantize.find(job.model_id) # :include => cls.constantize.xapian_options[:include] + model.xapian_index + elsif job.action == 'destroy' + # Make dummy model with right id, just for destruction + model = job.model.constantize.new + model.id = job.model_id + model.xapian_destroy + else + raise "unknown ActsAsXapianJob action '#{job.action}'" + end + rescue ActiveRecord::RecordNotFound => e + # this can happen if the record was hand deleted in the database + job.action = 'destroy' + retry + end + if flush + ActsAsXapian.writable_db.flush + end + job.destroy + end + + def self._is_xapian_db(path) is_db = File.exist?(File.join(path, "iamflint")) || File.exist?(File.join(path, "iamchert")) return is_db end @@ -690,7 +697,7 @@ module ActsAsXapian # happens (i.e. while the .new database is there) - any index update jobs # are left in the database, and will run after the rebuild has finished. - def ActsAsXapian.rebuild_index(model_classes, verbose = false, terms = true, values = true, texts = true, safe_rebuild = true) + def self.rebuild_index(model_classes, verbose = false, terms = true, values = true, texts = true, safe_rebuild = true) #raise "when rebuilding all, please call as first and only thing done in process / task" if not ActsAsXapian.writable_db.nil? prepare_environment @@ -751,7 +758,7 @@ module ActsAsXapian @@db_path = old_path end - def ActsAsXapian._rebuild_index_safely(model_classes, verbose, terms, values, texts) + def self._rebuild_index_safely(model_classes, verbose, terms, values, texts) batch_size = 1000 for model_class in model_classes model_class_count = model_class.count diff --git a/lib/acts_as_xapian/tasks/xapian.rake b/lib/acts_as_xapian/tasks/xapian.rake index c1986ce1e..52d94011a 100644 --- a/lib/acts_as_xapian/tasks/xapian.rake +++ b/lib/acts_as_xapian/tasks/xapian.rake @@ -9,7 +9,7 @@ namespace :xapian do # "verbose=true" to print model name as it is run. desc 'Updates Xapian search index with changes to models since last call' task :update_index => :environment do - ActsAsXapian.update_index(ENV['flush'] ? true : false, ENV['verbose'] ? true : false) + ActsAsXapian.update_index(ENV['flush'], ENV['verbose']) end # Parameters - specify 'models="PublicBody User"' to say which models |