about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
-rw-r--r-- lib/acts_as_xapian/acts_as_xapian.rb | 31
-rw-r--r-- spec/models/xapian_spec.rb | 40
2 files changed, 56 insertions, 15 deletions
diff --git a/lib/acts_as_xapian/acts_as_xapian.rb b/lib/acts_as_xapian/acts_as_xapian.rb
index 168d2eec3..d21ce4594 100644
--- a/lib/acts_as_xapian/acts_as_xapian.rb
+++ b/lib/acts_as_xapian/acts_as_xapian.rb
@@ -472,16 +472,27 @@ module ActsAsXapian
# Return just normal words in the query i.e. Not operators, ones in
# date ranges or similar. Use this for cheap highlighting with
# TextHelper::highlight, and excerpt.
- def words_to_highlight
- # TODO: In Ruby 1.9 we can do matching of any unicode letter with \p{L}
- # But we still need to support ruby 1.8 for the time being so...
- query_nopunc = self.query_string.gsub(/[^ёЁа-яА-Яa-zA-Zà-üÀ-Ü0-9:\.\/_]/iu, " ")
- query_nopunc = query_nopunc.gsub(/\s+/, " ")
- words = query_nopunc.split(" ")
- # Remove anything with a :, . or / in it
- words = words.find_all {|o| !o.match(/(:|\.|\/)/) }
- words = words.find_all {|o| !o.match(/^(AND|NOT|OR|XOR)$/) }
- return words
+ def words_to_highlight(opts = { :regex => false } )
+ # Reject all prefixes other than Z, which we know is reserved for stems
+ terms = query.terms.reject { |t| t.term.first.match(/^[A-Y]$/) }
+
+ # Collect stems, chopping the Z prefix off
+ stems = terms.map { |t| t.term[1..-1] if t.term.start_with?('Z') }.compact.sort
+ # Collect the non-stem terms
+ words = terms.map { |t| t.term unless t.term.start_with?('Z') }.compact.sort
+
+ if opts[:regex]
+ stems.map! { |w| /\b(#{ w })\w*\b/iu }
+ words.map! { |w| /\b(#{ w })\b/iu }
+ end
+
+ if RUBY_VERSION.to_f >= 1.9
+ (stems + words).map! do |term|
+ term.is_a?(String) ? term.force_encoding('UTF-8') : term
+ end
+ else
+ stems + words
+ end
end
# Text for lines in log file
diff --git a/spec/models/xapian_spec.rb b/spec/models/xapian_spec.rb
index a1e060d8e..4230a63cd 100644
--- a/spec/models/xapian_spec.rb
+++ b/spec/models/xapian_spec.rb
@@ -380,23 +380,53 @@ describe ActsAsXapian::Search, "#words_to_highlight" do
it "should return a list of words used in the search" do
s = ActsAsXapian::Search.new([PublicBody], "albatross words", :limit => 100)
- s.words_to_highlight.should == ["albatross", "words"]
+ s.words_to_highlight.should == ["albatross", "word"]
end
it "should remove any operators" do
s = ActsAsXapian::Search.new([PublicBody], "albatross words tag:mice", :limit => 100)
- s.words_to_highlight.should == ["albatross", "words"]
+ s.words_to_highlight.should == ["albatross", "word"]
end
- # This is the current behaviour but it seems a little simplistic to me
it "should separate punctuation" do
s = ActsAsXapian::Search.new([PublicBody], "The doctor's patient", :limit => 100)
- s.words_to_highlight.should == ["The", "doctor", "s", "patient"]
+ s.words_to_highlight.should == ["the", "doctor", "patient"].sort
end
it "should handle non-ascii characters" do
s = ActsAsXapian::Search.new([PublicBody], "adatigénylés words tag:mice", :limit => 100)
- s.words_to_highlight.should == ["adatigénylés", "words"]
+ s.words_to_highlight.should == ["adatigénylé", "word"]
+ end
+
+ it "should ignore stopwords" do
+ s = ActsAsXapian::Search.new([PublicBody], "department of humpadinking", :limit => 100)
+ s.words_to_highlight.should_not include('of')
+ end
+
+ it "uses stemming" do
+ s = ActsAsXapian::Search.new([PublicBody], 'department of humpadinking', :limit => 100)
+ s.words_to_highlight.should == ["depart", "humpadink"]
+ end
+
+ it "doesn't stem proper nouns" do
+ s = ActsAsXapian::Search.new([PublicBody], 'department of Humpadinking', :limit => 1)
+ s.words_to_highlight.should == ["depart", "humpadinking"]
+ end
+
+ context 'the :regex option' do
+
+ it 'wraps each words in a regex that matches the full word' do
+ expected = [/\b(albatross)\b/iu]
+ s = ActsAsXapian::Search.new([PublicBody], 'Albatross', :limit => 1)
+ s.words_to_highlight(:regex => true).should == expected
+ end
+
+ it 'wraps each stem in a regex' do
+ expected = [/\b(depart)\w*\b/iu]
+ s = ActsAsXapian::Search.new([PublicBody], 'department', :limit => 1)
+ s.words_to_highlight(:regex => true).should == expected
+ end
+
end
end