diff options
-rw-r--r-- | app/controllers/general_controller.rb | 2 | ||||
-rw-r--r-- | app/controllers/track_controller.rb | 10 | ||||
-rw-r--r-- | lib/acts_as_xapian/acts_as_xapian.rb | 37 | ||||
-rw-r--r-- | spec/controllers/general_controller_spec.rb | 2 | ||||
-rw-r--r-- | spec/integration/xapian_search_highlighting_spec.rb | 10 | ||||
-rw-r--r-- | spec/models/xapian_spec.rb | 10 |
6 files changed, 63 insertions, 8 deletions
diff --git a/app/controllers/general_controller.rb b/app/controllers/general_controller.rb index 759e80af9..158492eb2 100644 --- a/app/controllers/general_controller.rb +++ b/app/controllers/general_controller.rb @@ -159,7 +159,7 @@ class GeneralController < ApplicationController end # Spelling and highight words are same for all three queries - @highlight_words = @request_for_spelling.words_to_highlight(:regex => true) + @highlight_words = @request_for_spelling.words_to_highlight(:regex => true, :include_original => true) if !(@request_for_spelling.spelling_correction =~ /[a-z]+:/) @spelling_correction = @request_for_spelling.spelling_correction end diff --git a/app/controllers/track_controller.rb b/app/controllers/track_controller.rb index 551d9e72e..83700a55b 100644 --- a/app/controllers/track_controller.rb +++ b/app/controllers/track_controller.rb @@ -154,7 +154,15 @@ class TrackController < ApplicationController request.format = 'xml' unless params[:format] respond_to do |format| format.json { render :json => @xapian_object.results.map { |r| r[:model].json_for_api(true, - lambda { |t| view_context.highlight_and_excerpt(t, @xapian_object.words_to_highlight(:regex => true), 150) } + lambda do |t| + view_context.highlight_and_excerpt( + t, + @xapian_object.words_to_highlight( + :regex => true, + :include_original => true), + 150 + ) + end ) } } format.any { render :template => 'track/atom_feed', :formats => ['atom'], diff --git a/lib/acts_as_xapian/acts_as_xapian.rb b/lib/acts_as_xapian/acts_as_xapian.rb index d21ce4594..6520a20a4 100644 --- a/lib/acts_as_xapian/acts_as_xapian.rb +++ b/lib/acts_as_xapian/acts_as_xapian.rb @@ -21,6 +21,20 @@ rescue LoadError $acts_as_xapian_bindings_available = false end +module Xapian + class QueryParser + def unstem(term) + words = [] + + Xapian._safelyIterate(unstem_begin(term), unstem_end(term)) do |item| + words << item.term + end + + words + end + end +end + module ActsAsXapian ###################################################################### # Module level variables @@ -472,15 +486,30 @@ module ActsAsXapian # Return just normal words in the query i.e. Not operators, ones in # date ranges or similar. Use this for cheap highlighting with # TextHelper::highlight, and excerpt. - def words_to_highlight(opts = { :regex => false } ) + def words_to_highlight(opts = {}) + default_opts = { :include_original => false, :regex => false } + opts = default_opts.merge(opts) + # Reject all prefixes other than Z, which we know is reserved for stems terms = query.terms.reject { |t| t.term.first.match(/^[A-Y]$/) } - + # Collect the stems including the Z prefix + raw_stems = terms.map { |t| t.term if t.term.start_with?('Z') }.compact.uniq.sort # Collect stems, chopping the Z prefix off - stems = terms.map { |t| t.term[1..-1] if t.term.start_with?('Z') }.compact.sort + stems = raw_stems.map { |t| t[1..-1] }.compact.sort # Collect the non-stem terms words = terms.map { |t| t.term unless t.term.start_with?('Z') }.compact.sort + # Add the unstemmed words from the original query + # Sometimes stems can be unhelpful with the :regex option, for example + # stemming 'boring' results in us trying to highlight 'bore'. + if opts[:include_original] + raw_stems.each do |raw_stem| + words << ActsAsXapian.query_parser.unstem(raw_stem).uniq + end + + words = words.any? ? words.flatten.uniq : [] + end + if opts[:regex] stems.map! { |w| /\b(#{ w })\w*\b/iu } words.map! { |w| /\b(#{ w })\b/iu } @@ -986,5 +1015,3 @@ end # Reopen ActiveRecord and include the acts_as_xapian method ActiveRecord::Base.extend ActsAsXapian::ActsMethods - - diff --git a/spec/controllers/general_controller_spec.rb b/spec/controllers/general_controller_spec.rb index f9c307913..c0a9d57d3 100644 --- a/spec/controllers/general_controller_spec.rb +++ b/spec/controllers/general_controller_spec.rb @@ -188,7 +188,7 @@ describe GeneralController, 'when using xapian search' do it 'should highlight words for a user-only request' do get :search, :combined => "bob/users" - assigns[:highlight_words].should == [/\b(bob)\w*\b/iu] + assigns[:highlight_words].should == [/\b(bob)\w*\b/iu, /\b(bob)\b/iu] end it 'should show spelling corrections for a user-only request' do diff --git a/spec/integration/xapian_search_highlighting_spec.rb b/spec/integration/xapian_search_highlighting_spec.rb index 7bd64c995..65a34cf91 100644 --- a/spec/integration/xapian_search_highlighting_spec.rb +++ b/spec/integration/xapian_search_highlighting_spec.rb @@ -26,4 +26,14 @@ describe 'highlighting search results' do highlight_matches(phrase, matches).should == '<mark>department</mark>' end + it 'highlights stemmed words even if the stem is unhelpful' do + # Stemming returns 'bore' as the word to highlight which can't be + # matched in the original phrase. + phrase = 'boring' + search = ActsAsXapian::Search.new([PublicBody], phrase, :limit => 1) + matches = search.words_to_highlight(:regex => true, :include_original => true) + + highlight_matches(phrase, matches).should == '<mark>boring</mark>' + end + end diff --git a/spec/models/xapian_spec.rb b/spec/models/xapian_spec.rb index 4230a63cd..678e3a2dc 100644 --- a/spec/models/xapian_spec.rb +++ b/spec/models/xapian_spec.rb @@ -413,6 +413,16 @@ describe ActsAsXapian::Search, "#words_to_highlight" do s.words_to_highlight.should == ["depart", "humpadinking"] end + it "includes the original search terms if requested" do + s = ActsAsXapian::Search.new([PublicBody], 'boring', :limit => 1) + s.words_to_highlight(:include_original => true).should == ['bore', 'boring'] + end + + it "does not return duplicate terms" do + s = ActsAsXapian::Search.new([PublicBody], 'boring boring', :limit => 1) + s.words_to_highlight.should == ['bore'] + end + context 'the :regex option' do it 'wraps each words in a regex that matches the full word' do |