aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorGareth Rees <gareth@mysociety.org>2014-06-24 16:08:32 +0100
committerGareth Rees <gareth@mysociety.org>2014-06-25 10:40:38 +0100
commitf23b89f3474847cdd14ba892c5a7259964e18148 (patch)
tree2cf1440df2e9e8aeecd47ac2b25ec4f683bf0d6d
parente490c4a7ec7157e794d849c962371e298d8342d9 (diff)
Handle unhelpful stemming
Stemming a word such as 'boring' returns 'bore' as the word to highlight, which can't be matched in the original phrase. Also removes duplicates from the results.
-rw-r--r--app/controllers/general_controller.rb2
-rw-r--r--app/controllers/track_controller.rb10
-rw-r--r--lib/acts_as_xapian/acts_as_xapian.rb37
-rw-r--r--spec/controllers/general_controller_spec.rb2
-rw-r--r--spec/integration/xapian_search_highlighting_spec.rb10
-rw-r--r--spec/models/xapian_spec.rb10
6 files changed, 63 insertions, 8 deletions
diff --git a/app/controllers/general_controller.rb b/app/controllers/general_controller.rb
index 759e80af9..158492eb2 100644
--- a/app/controllers/general_controller.rb
+++ b/app/controllers/general_controller.rb
@@ -159,7 +159,7 @@ class GeneralController < ApplicationController
end
# Spelling and highight words are same for all three queries
- @highlight_words = @request_for_spelling.words_to_highlight(:regex => true)
+ @highlight_words = @request_for_spelling.words_to_highlight(:regex => true, :include_original => true)
if !(@request_for_spelling.spelling_correction =~ /[a-z]+:/)
@spelling_correction = @request_for_spelling.spelling_correction
end
diff --git a/app/controllers/track_controller.rb b/app/controllers/track_controller.rb
index 551d9e72e..83700a55b 100644
--- a/app/controllers/track_controller.rb
+++ b/app/controllers/track_controller.rb
@@ -154,7 +154,15 @@ class TrackController < ApplicationController
request.format = 'xml' unless params[:format]
respond_to do |format|
format.json { render :json => @xapian_object.results.map { |r| r[:model].json_for_api(true,
- lambda { |t| view_context.highlight_and_excerpt(t, @xapian_object.words_to_highlight(:regex => true), 150) }
+ lambda do |t|
+ view_context.highlight_and_excerpt(
+ t,
+ @xapian_object.words_to_highlight(
+ :regex => true,
+ :include_original => true),
+ 150
+ )
+ end
) } }
format.any { render :template => 'track/atom_feed',
:formats => ['atom'],
diff --git a/lib/acts_as_xapian/acts_as_xapian.rb b/lib/acts_as_xapian/acts_as_xapian.rb
index d21ce4594..6520a20a4 100644
--- a/lib/acts_as_xapian/acts_as_xapian.rb
+++ b/lib/acts_as_xapian/acts_as_xapian.rb
@@ -21,6 +21,20 @@ rescue LoadError
$acts_as_xapian_bindings_available = false
end
+module Xapian
+ class QueryParser
+ def unstem(term)
+ words = []
+
+ Xapian._safelyIterate(unstem_begin(term), unstem_end(term)) do |item|
+ words << item.term
+ end
+
+ words
+ end
+ end
+end
+
module ActsAsXapian
######################################################################
# Module level variables
@@ -472,15 +486,30 @@ module ActsAsXapian
# Return just normal words in the query i.e. Not operators, ones in
# date ranges or similar. Use this for cheap highlighting with
# TextHelper::highlight, and excerpt.
- def words_to_highlight(opts = { :regex => false } )
+ def words_to_highlight(opts = {})
+ default_opts = { :include_original => false, :regex => false }
+ opts = default_opts.merge(opts)
+
# Reject all prefixes other than Z, which we know is reserved for stems
terms = query.terms.reject { |t| t.term.first.match(/^[A-Y]$/) }
-
+ # Collect the stems including the Z prefix
+ raw_stems = terms.map { |t| t.term if t.term.start_with?('Z') }.compact.uniq.sort
# Collect stems, chopping the Z prefix off
- stems = terms.map { |t| t.term[1..-1] if t.term.start_with?('Z') }.compact.sort
+ stems = raw_stems.map { |t| t[1..-1] }.compact.sort
# Collect the non-stem terms
words = terms.map { |t| t.term unless t.term.start_with?('Z') }.compact.sort
+ # Add the unstemmed words from the original query
+ # Sometimes stems can be unhelpful with the :regex option, for example
+ # stemming 'boring' results in us trying to highlight 'bore'.
+ if opts[:include_original]
+ raw_stems.each do |raw_stem|
+ words << ActsAsXapian.query_parser.unstem(raw_stem).uniq
+ end
+
+ words = words.any? ? words.flatten.uniq : []
+ end
+
if opts[:regex]
stems.map! { |w| /\b(#{ w })\w*\b/iu }
words.map! { |w| /\b(#{ w })\b/iu }
@@ -986,5 +1015,3 @@ end
# Reopen ActiveRecord and include the acts_as_xapian method
ActiveRecord::Base.extend ActsAsXapian::ActsMethods
-
-
diff --git a/spec/controllers/general_controller_spec.rb b/spec/controllers/general_controller_spec.rb
index f9c307913..c0a9d57d3 100644
--- a/spec/controllers/general_controller_spec.rb
+++ b/spec/controllers/general_controller_spec.rb
@@ -188,7 +188,7 @@ describe GeneralController, 'when using xapian search' do
it 'should highlight words for a user-only request' do
get :search, :combined => "bob/users"
- assigns[:highlight_words].should == [/\b(bob)\w*\b/iu]
+ assigns[:highlight_words].should == [/\b(bob)\w*\b/iu, /\b(bob)\b/iu]
end
it 'should show spelling corrections for a user-only request' do
diff --git a/spec/integration/xapian_search_highlighting_spec.rb b/spec/integration/xapian_search_highlighting_spec.rb
index 7bd64c995..65a34cf91 100644
--- a/spec/integration/xapian_search_highlighting_spec.rb
+++ b/spec/integration/xapian_search_highlighting_spec.rb
@@ -26,4 +26,14 @@ describe 'highlighting search results' do
highlight_matches(phrase, matches).should == '<mark>department</mark>'
end
+ it 'highlights stemmed words even if the stem is unhelpful' do
+ # Stemming returns 'bore' as the word to highlight which can't be
+ # matched in the original phrase.
+ phrase = 'boring'
+ search = ActsAsXapian::Search.new([PublicBody], phrase, :limit => 1)
+ matches = search.words_to_highlight(:regex => true, :include_original => true)
+
+ highlight_matches(phrase, matches).should == '<mark>boring</mark>'
+ end
+
end
diff --git a/spec/models/xapian_spec.rb b/spec/models/xapian_spec.rb
index 4230a63cd..678e3a2dc 100644
--- a/spec/models/xapian_spec.rb
+++ b/spec/models/xapian_spec.rb
@@ -413,6 +413,16 @@ describe ActsAsXapian::Search, "#words_to_highlight" do
s.words_to_highlight.should == ["depart", "humpadinking"]
end
+ it "includes the original search terms if requested" do
+ s = ActsAsXapian::Search.new([PublicBody], 'boring', :limit => 1)
+ s.words_to_highlight(:include_original => true).should == ['bore', 'boring']
+ end
+
+ it "does not return duplicate terms" do
+ s = ActsAsXapian::Search.new([PublicBody], 'boring boring', :limit => 1)
+ s.words_to_highlight.should == ['bore']
+ end
+
context 'the :regex option' do
it 'wraps each words in a regex that matches the full word' do