aboutsummaryrefslogtreecommitdiffstats
path: root/lib/acts_as_xapian/acts_as_xapian.rb
diff options
context:
space:
mode:
Diffstat (limited to 'lib/acts_as_xapian/acts_as_xapian.rb')
-rw-r--r--lib/acts_as_xapian/acts_as_xapian.rb80
1 files changed, 59 insertions, 21 deletions
diff --git a/lib/acts_as_xapian/acts_as_xapian.rb b/lib/acts_as_xapian/acts_as_xapian.rb
index b30bb4d10..6520a20a4 100644
--- a/lib/acts_as_xapian/acts_as_xapian.rb
+++ b/lib/acts_as_xapian/acts_as_xapian.rb
@@ -21,10 +21,24 @@ rescue LoadError
$acts_as_xapian_bindings_available = false
end
+module Xapian
+  class QueryParser
+    # Gathers the terms found between unstem_begin(term) and
+    # unstem_end(term) into a plain Ruby Array.
+    #
+    # term - the term handed to unstem_begin / unstem_end (callers in this
+    #        file pass a Z-prefixed stem).
+    #
+    # Returns an Array holding the +term+ value of every item yielded by
+    # the iteration, in iteration order.
+    def unstem(term)
+      originals = []
+      first = unstem_begin(term)
+      last = unstem_end(term)
+
+      Xapian._safelyIterate(first, last) { |entry| originals.push(entry.term) }
+
+      originals
+    end
+  end
+end
+
module ActsAsXapian
######################################################################
# Module level variables
- # XXX must be some kind of cattr_accessor that can do this better
+ # TODO: must be some kind of cattr_accessor that can do this better
def ActsAsXapian.bindings_available
$acts_as_xapian_bindings_available
end
@@ -109,12 +123,12 @@ module ActsAsXapian
@@db_path = File.join(db_parent_path, environment)
# make some things that don't depend on the db
- # XXX this gets made once for each acts_as_xapian. Oh well.
+ # TODO: this gets made once for each acts_as_xapian. Oh well.
@@stemmer = Xapian::Stem.new('english')
end
# Opens / reopens the db for reading
- # XXX we perhaps don't need to rebuild database and enquire and queryparser -
+ # TODO: we perhaps don't need to rebuild database and enquire and queryparser -
# but db.reopen wasn't enough by itself, so just do everything it's easier.
def ActsAsXapian.readable_init
raise NoXapianRubyBindingsError.new("Xapian Ruby bindings not installed") unless ActsAsXapian.bindings_available
@@ -188,7 +202,7 @@ module ActsAsXapian
raise "Z is reserved for stemming terms" if term[1] == "Z"
raise "Already have code '" + term[1] + "' in another model but with different prefix '" + @@terms_by_capital[term[1]] + "'" if @@terms_by_capital.include?(term[1]) && @@terms_by_capital[term[1]] != term[2]
@@terms_by_capital[term[1]] = term[2]
- # XXX use boolean here so doesn't stem our URL names in WhatDoTheyKnow
+ # TODO: use boolean here so doesn't stem our URL names in WhatDoTheyKnow
# If making acts_as_xapian generic, would really need to make the :terms have
# another option that lets people choose non-boolean for terms that need it
# (i.e. searching explicitly within a free text field)
@@ -231,7 +245,7 @@ module ActsAsXapian
raise "acts_as_xapian hasn't been called in any models" if @@init_values.empty?
# if DB is not nil, then we're already initialised, so don't do it
- # again XXX reopen it each time, xapian_spec.rb needs this so database
+ # again TODO: reopen it each time, xapian_spec.rb needs this so database
# gets written twice correctly.
# return unless @@writable_db.nil?
@@ -472,16 +486,42 @@ module ActsAsXapian
# Return just normal words in the query i.e. Not operators, ones in
# date ranges or similar. Use this for cheap highlighting with
# TextHelper::highlight, and excerpt.
- def words_to_highlight
- # TODO: In Ruby 1.9 we can do matching of any unicode letter with \p{L}
- # But we still need to support ruby 1.8 for the time being so...
- query_nopunc = self.query_string.gsub(/[^ёЁа-яА-Яa-zA-Zà-üÀ-Ü0-9:\.\/_]/iu, " ")
- query_nopunc = query_nopunc.gsub(/\s+/, " ")
- words = query_nopunc.split(" ")
- # Remove anything with a :, . or / in it
- words = words.find_all {|o| !o.match(/(:|\.|\/)/) }
- words = words.find_all {|o| !o.match(/^(AND|NOT|OR|XOR)$/) }
- return words
+ # Builds the list of things to highlight from the terms of the parsed
+ # query rather than from the raw query string.
+ #
+ # opts - optional Hash of settings:
+ #        :include_original - when true, also include the words returned
+ #                            by the query parser's unstem for each stem
+ #                            (default: false).
+ #        :regex            - when true, return Regexps instead of
+ #                            Strings: stems match any word starting with
+ #                            the stem, other words match whole words only
+ #                            (default: false).
+ #
+ # Returns an Array with the stems first, followed by the other words.
+ def words_to_highlight(opts = {})
+   opts = { :include_original => false, :regex => false }.merge(opts)
+
+   # Read each term string once, then reject all prefixes other than Z,
+   # which we know is reserved for stems
+   terms = query.terms.map { |t| t.term }.reject { |term| term =~ /\A[A-Y]/ }
+
+   # Split into stems (still carrying the Z prefix) and non-stem terms
+   raw_stems, words = terms.partition { |term| term.start_with?('Z') }
+   raw_stems = raw_stems.uniq.sort
+   words = words.sort
+   # Chop the Z prefix off; the order of raw_stems is preserved
+   stems = raw_stems.map { |raw_stem| raw_stem[1..-1] }
+
+   # Add the unstemmed words from the original query
+   # Sometimes stems can be unhelpful with the :regex option, for example
+   # stemming 'boring' results in us trying to highlight 'bore'.
+   if opts[:include_original]
+     originals = raw_stems.map { |raw_stem| ActsAsXapian.query_parser.unstem(raw_stem) }
+     words = (words + originals).flatten.uniq
+   end
+
+   # Tag the strings as UTF-8 *before* they are interpolated into the /u
+   # regexps below. Ruby 1.8 strings have no force_encoding, so they are
+   # left as they are.
+   stems, words = [stems, words].map do |list|
+     list.map { |term| term.respond_to?(:force_encoding) ? term.force_encoding('UTF-8') : term }
+   end
+
+   if opts[:regex]
+     # Escape the terms so that regex metacharacters in a query term are
+     # matched literally instead of changing (or breaking) the pattern
+     stems = stems.map { |w| /\b(#{ Regexp.escape(w) })\w*\b/iu }
+     words = words.map { |w| /\b(#{ Regexp.escape(w) })\b/iu }
+   end
+
+   stems + words
+ end
# Text for lines in log file
@@ -510,7 +550,7 @@ module ActsAsXapian
# Find the documents by their unique term
input_models_query = Xapian::Query.new(Xapian::Query::OP_OR, query_models.map{|m| "I" + m.xapian_document_term})
ActsAsXapian.enquire.query = input_models_query
- matches = ActsAsXapian.enquire.mset(0, 100, 100) # XXX so this whole method will only work with 100 docs
+ matches = ActsAsXapian.enquire.mset(0, 100, 100) # TODO: so this whole method will only work with 100 docs
# Get set of relevant terms for those documents
selection = Xapian::RSet.new()
@@ -601,7 +641,7 @@ module ActsAsXapian
begin
if job.action == 'update'
- # XXX Index functions may reference other models, so we could eager load here too?
+ # TODO: Index functions may reference other models, so we could eager load here too?
model = job.model.constantize.find(job.model_id) # :include => cls.constantize.xapian_options[:include]
model.xapian_index
elsif job.action == 'destroy'
@@ -717,7 +757,7 @@ module ActsAsXapian
ActiveRecord::Base.connection.disconnect!
- pid = Process.fork # XXX this will only work on Unix, tough
+ pid = Process.fork # TODO: this will only work on Unix, tough
if pid
Process.waitpid(pid)
if not $?.success?
@@ -898,7 +938,7 @@ module ActsAsXapian
ActsAsXapian.term_generator.document = doc
for text in texts_to_index
ActsAsXapian.term_generator.increase_termpos # stop phrases spanning different text fields
- # XXX the "1" here is a weight that could be varied for a boost function
+ # TODO: the "1" here is a weight that could be varied for a boost function
ActsAsXapian.term_generator.index_text(xapian_value(text, nil, true), 1)
end
end
@@ -975,5 +1015,3 @@ end
# Reopen ActiveRecord and include the acts_as_xapian method
ActiveRecord::Base.extend ActsAsXapian::ActsMethods
-
-