diff options
author | Gareth Rees <gareth@mysociety.org> | 2014-06-24 16:08:32 +0100 |
---|---|---|
committer | Gareth Rees <gareth@mysociety.org> | 2014-06-25 10:40:38 +0100 |
commit | f23b89f3474847cdd14ba892c5a7259964e18148 (patch) | |
tree | 2cf1440df2e9e8aeecd47ac2b25ec4f683bf0d6d /lib | |
parent | e490c4a7ec7157e794d849c962371e298d8342d9 (diff) |
Handle unhelpful stemming
Stemming 'boring' returns 'bore' as the word to highlight, which can't be
matched in the original phrase.
Also removes duplicate stems and words from the highlight results.
Diffstat (limited to 'lib')
-rw-r--r-- | lib/acts_as_xapian/acts_as_xapian.rb | 37 |
1 files changed, 32 insertions, 5 deletions
diff --git a/lib/acts_as_xapian/acts_as_xapian.rb b/lib/acts_as_xapian/acts_as_xapian.rb index d21ce4594..6520a20a4 100644 --- a/lib/acts_as_xapian/acts_as_xapian.rb +++ b/lib/acts_as_xapian/acts_as_xapian.rb @@ -21,6 +21,20 @@ rescue LoadError $acts_as_xapian_bindings_available = false end +module Xapian + class QueryParser + def unstem(term) + words = [] + + Xapian._safelyIterate(unstem_begin(term), unstem_end(term)) do |item| + words << item.term + end + + words + end + end +end + module ActsAsXapian ###################################################################### # Module level variables @@ -472,15 +486,30 @@ module ActsAsXapian # Return just normal words in the query i.e. Not operators, ones in # date ranges or similar. Use this for cheap highlighting with # TextHelper::highlight, and excerpt. - def words_to_highlight(opts = { :regex => false } ) + def words_to_highlight(opts = {}) + default_opts = { :include_original => false, :regex => false } + opts = default_opts.merge(opts) + # Reject all prefixes other than Z, which we know is reserved for stems terms = query.terms.reject { |t| t.term.first.match(/^[A-Y]$/) } - + # Collect the stems including the Z prefix + raw_stems = terms.map { |t| t.term if t.term.start_with?('Z') }.compact.uniq.sort # Collect stems, chopping the Z prefix off - stems = terms.map { |t| t.term[1..-1] if t.term.start_with?('Z') }.compact.sort + stems = raw_stems.map { |t| t[1..-1] }.compact.sort # Collect the non-stem terms words = terms.map { |t| t.term unless t.term.start_with?('Z') }.compact.sort + # Add the unstemmed words from the original query + # Sometimes stems can be unhelpful with the :regex option, for example + # stemming 'boring' results in us trying to highlight 'bore'. + if opts[:include_original] + raw_stems.each do |raw_stem| + words << ActsAsXapian.query_parser.unstem(raw_stem).uniq + end + + words = words.any? ? words.flatten.uniq : [] + end + if opts[:regex] stems.map! { |w| /\b(#{ w })\w*\b/iu } words.map! 
{ |w| /\b(#{ w })\b/iu } @@ -986,5 +1015,3 @@ end # Reopen ActiveRecord and include the acts_as_xapian method ActiveRecord::Base.extend ActsAsXapian::ActsMethods - - |