about summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- app/controllers/general_controller.rb 2
-rw-r--r-- app/controllers/track_controller.rb 10
-rw-r--r-- app/helpers/application_helper.rb 22
-rw-r--r-- app/helpers/highlight_helper.rb 98
-rw-r--r-- app/views/track_mailer/event_digest.text.erb 2
-rw-r--r-- lib/acts_as_xapian/acts_as_xapian.rb 62
-rw-r--r-- spec/controllers/general_controller_spec.rb 2
-rw-r--r-- spec/controllers/request_controller_spec.rb 1
-rw-r--r-- spec/helpers/highlight_helper_spec.rb 247
-rw-r--r-- spec/integration/xapian_search_highlighting_spec.rb 39
-rw-r--r-- spec/models/xapian_spec.rb 50
11 files changed, 494 insertions, 41 deletions
diff --git a/app/controllers/general_controller.rb b/app/controllers/general_controller.rb
index 28055ddbf..158492eb2 100644
--- a/app/controllers/general_controller.rb
+++ b/app/controllers/general_controller.rb
@@ -159,7 +159,7 @@ class GeneralController < ApplicationController
end
# Spelling and highlight words are the same for all three queries
- @highlight_words = @request_for_spelling.words_to_highlight
+ @highlight_words = @request_for_spelling.words_to_highlight(:regex => true, :include_original => true)
if !(@request_for_spelling.spelling_correction =~ /[a-z]+:/)
@spelling_correction = @request_for_spelling.spelling_correction
end
diff --git a/app/controllers/track_controller.rb b/app/controllers/track_controller.rb
index c15fb573d..83700a55b 100644
--- a/app/controllers/track_controller.rb
+++ b/app/controllers/track_controller.rb
@@ -154,7 +154,15 @@ class TrackController < ApplicationController
request.format = 'xml' unless params[:format]
respond_to do |format|
format.json { render :json => @xapian_object.results.map { |r| r[:model].json_for_api(true,
- lambda { |t| view_context.highlight_and_excerpt(t, @xapian_object.words_to_highlight, 150) }
+ lambda do |t|
+ view_context.highlight_and_excerpt(
+ t,
+ @xapian_object.words_to_highlight(
+ :regex => true,
+ :include_original => true),
+ 150
+ )
+ end
) } }
format.any { render :template => 'track/atom_feed',
:formats => ['atom'],
diff --git a/app/helpers/application_helper.rb b/app/helpers/application_helper.rb
index 45b042354..49ce94951 100644
--- a/app/helpers/application_helper.rb
+++ b/app/helpers/application_helper.rb
@@ -22,6 +22,9 @@ module ApplicationHelper
# Useful for sending emails
include MailerHelper
+ # Extra highlight helpers
+ include HighlightHelper
+
# Copied from error_messages_for in active_record_helper.rb
def foi_error_messages_for(*params)
options = params.last.is_a?(Hash) ? params.pop.symbolize_keys : {}
@@ -54,25 +57,6 @@ module ApplicationHelper
end
end
- # Highlight words, also escapes HTML (other than spans that we add)
- def highlight_words(t, words, html = true)
- if html
- highlight(h(t), words, :highlighter => '<span class="highlight">\1</span>').html_safe
- else
- highlight(t, words, :highlighter => '*\1*')
- end
- end
-
- def highlight_and_excerpt(t, words, excount, html = true)
- newt = excerpt(t, words[0], :radius => excount)
- if not newt
- newt = excerpt(t, '', :radius => excount)
- end
- t = newt
- t = highlight_words(t, words, html)
- return t
- end
-
def locale_name(locale)
return LanguageNames::get_language_name(locale)
end
diff --git a/app/helpers/highlight_helper.rb b/app/helpers/highlight_helper.rb
new file mode 100644
index 000000000..a98f6f320
--- /dev/null
+++ b/app/helpers/highlight_helper.rb
@@ -0,0 +1,98 @@
+module HighlightHelper
+ include ERB::Util
+
+ # Implementation of rails' highlight that allows regex to be passed to
+ # the phrases parameter.
+ # https://github.com/rails/rails/pull/11793
+ def highlight_matches(text, phrases, options = {})
+ text = ActionController::Base.helpers.sanitize(text).try(:html_safe) if options.fetch(:sanitize, true)
+
+ if text.blank? || phrases.blank?
+ text
+ else
+ match = Array(phrases).map do |p|
+ Regexp === p ? p.to_s : Regexp.escape(p)
+ end.join('|')
+
+ if block_given?
+ text.gsub(/(#{match})(?![^<]*?>)/i) { |found| yield found }
+ else
+ highlighter = options.fetch(:highlighter, '<mark>\1</mark>')
+ text.gsub(/(#{match})(?![^<]*?>)/i, highlighter)
+ end
+ end.html_safe
+ end
+
+ # Highlight words, also escapes HTML (other than spans that we add)
+ def highlight_words(t, words, html = true)
+ if html
+ highlight_matches(h(t), words, :highlighter => '<span class="highlight">\1</span>').html_safe
+ else
+ highlight_matches(t, words, :highlighter => '*\1*')
+ end
+ end
+
+ def highlight_and_excerpt(t, words, excount, html = true)
+ newt = excerpt(t, words[0], :radius => excount)
+ if not newt
+ newt = excerpt(t, '', :radius => excount)
+ end
+ t = newt
+ t = highlight_words(t, words, html)
+ return t
+ end
+
+ def excerpt(text, phrase, options = {})
+ return unless text && phrase
+
+ separator = options.fetch(:separator, nil) || ""
+ case phrase
+ when Regexp
+ regex = phrase
+ else
+ regex = /#{Regexp.escape(phrase)}/i
+ end
+
+ return unless matches = text.match(regex)
+ phrase = matches[0]
+
+ unless separator.empty?
+ text.split(separator).each do |value|
+ if value.match(regex)
+ regex = phrase = value
+ break
+ end
+ end
+ end
+
+ first_part, second_part = text.split(phrase, 2)
+
+ prefix, first_part = cut_excerpt_part(:first, first_part, separator, options)
+ postfix, second_part = cut_excerpt_part(:second, second_part, separator, options)
+
+ affix = [first_part, separator, phrase, separator, second_part].join.strip
+ [prefix, affix, postfix].join
+ end
+
+ private
+
+ def cut_excerpt_part(part_position, part, separator, options)
+ return "", "" unless part
+
+ radius = options.fetch(:radius, 100)
+ omission = options.fetch(:omission, "...")
+
+ part = part.split(separator)
+ part.delete("")
+ affix = part.size > radius ? omission : ""
+
+ part = if part_position == :first
+ drop_index = [part.length - radius, 0].max
+ part.drop(drop_index)
+ else
+ part.first(radius)
+ end
+
+ return affix, part.join(separator)
+ end
+end
diff --git a/app/views/track_mailer/event_digest.text.erb b/app/views/track_mailer/event_digest.text.erb
index a154f430f..f6e699e41 100644
--- a/app/views/track_mailer/event_digest.text.erb
+++ b/app/views/track_mailer/event_digest.text.erb
@@ -4,7 +4,7 @@
for track_thing, alert_results, xapian_object in @email_about_things
main_text += track_thing.params[:title_in_email] + "\n"
main_text += ("=" * track_thing.params[:title_in_email].size) + "\n\n"
- @highlight_words = xapian_object.words_to_highlight
+ @highlight_words = xapian_object.words_to_highlight(:regex => true)
for result in alert_results.reverse
if result[:model].class.to_s == "InfoRequestEvent"
event = result[:model]
diff --git a/lib/acts_as_xapian/acts_as_xapian.rb b/lib/acts_as_xapian/acts_as_xapian.rb
index 168d2eec3..6520a20a4 100644
--- a/lib/acts_as_xapian/acts_as_xapian.rb
+++ b/lib/acts_as_xapian/acts_as_xapian.rb
@@ -21,6 +21,20 @@ rescue LoadError
$acts_as_xapian_bindings_available = false
end
+module Xapian
+ class QueryParser
+ def unstem(term)
+ words = []
+
+ Xapian._safelyIterate(unstem_begin(term), unstem_end(term)) do |item|
+ words << item.term
+ end
+
+ words
+ end
+ end
+end
+
module ActsAsXapian
######################################################################
# Module level variables
@@ -472,16 +486,42 @@ module ActsAsXapian
# Return just the normal words in the query, i.e. not operators, terms in
# date ranges or similar. Use this for cheap highlighting with
# TextHelper::highlight, and excerpt.
- def words_to_highlight
- # TODO: In Ruby 1.9 we can do matching of any unicode letter with \p{L}
- # But we still need to support ruby 1.8 for the time being so...
- query_nopunc = self.query_string.gsub(/[^ёЁа-яА-Яa-zA-Zà-üÀ-Ü0-9:\.\/_]/iu, " ")
- query_nopunc = query_nopunc.gsub(/\s+/, " ")
- words = query_nopunc.split(" ")
- # Remove anything with a :, . or / in it
- words = words.find_all {|o| !o.match(/(:|\.|\/)/) }
- words = words.find_all {|o| !o.match(/^(AND|NOT|OR|XOR)$/) }
- return words
+ def words_to_highlight(opts = {})
+ default_opts = { :include_original => false, :regex => false }
+ opts = default_opts.merge(opts)
+
+ # Reject all prefixes other than Z, which we know is reserved for stems
+ terms = query.terms.reject { |t| t.term.first.match(/^[A-Y]$/) }
+ # Collect the stems including the Z prefix
+ raw_stems = terms.map { |t| t.term if t.term.start_with?('Z') }.compact.uniq.sort
+ # Collect stems, chopping the Z prefix off
+ stems = raw_stems.map { |t| t[1..-1] }.compact.sort
+ # Collect the non-stem terms
+ words = terms.map { |t| t.term unless t.term.start_with?('Z') }.compact.sort
+
+ # Add the unstemmed words from the original query.
+ # Stems alone can be unhelpful with the :regex option: for example,
+ # stemming 'boring' results in us trying to highlight 'bore'.
+ if opts[:include_original]
+ raw_stems.each do |raw_stem|
+ words << ActsAsXapian.query_parser.unstem(raw_stem).uniq
+ end
+
+ words = words.any? ? words.flatten.uniq : []
+ end
+
+ if opts[:regex]
+ stems.map! { |w| /\b(#{ w })\w*\b/iu }
+ words.map! { |w| /\b(#{ w })\b/iu }
+ end
+
+ if RUBY_VERSION.to_f >= 1.9
+ (stems + words).map! do |term|
+ term.is_a?(String) ? term.force_encoding('UTF-8') : term
+ end
+ else
+ stems + words
+ end
end
# Text for lines in log file
@@ -975,5 +1015,3 @@ end
# Reopen ActiveRecord and include the acts_as_xapian method
ActiveRecord::Base.extend ActsAsXapian::ActsMethods
-
-
diff --git a/spec/controllers/general_controller_spec.rb b/spec/controllers/general_controller_spec.rb
index 7590a5b42..c0a9d57d3 100644
--- a/spec/controllers/general_controller_spec.rb
+++ b/spec/controllers/general_controller_spec.rb
@@ -188,7 +188,7 @@ describe GeneralController, 'when using xapian search' do
it 'should highlight words for a user-only request' do
get :search, :combined => "bob/users"
- assigns[:highlight_words].should == ['bob']
+ assigns[:highlight_words].should == [/\b(bob)\w*\b/iu, /\b(bob)\b/iu]
end
it 'should show spelling corrections for a user-only request' do
diff --git a/spec/controllers/request_controller_spec.rb b/spec/controllers/request_controller_spec.rb
index 48f37a45c..f7c935af3 100644
--- a/spec/controllers/request_controller_spec.rb
+++ b/spec/controllers/request_controller_spec.rb
@@ -923,7 +923,6 @@ describe RequestController, "when searching for an authority" do
end
it "should return matching bodies" do
-
session[:user_id] = @user.id
get :select_authority, :query => "Quango"
diff --git a/spec/helpers/highlight_helper_spec.rb b/spec/helpers/highlight_helper_spec.rb
new file mode 100644
index 000000000..e1be7e153
--- /dev/null
+++ b/spec/helpers/highlight_helper_spec.rb
@@ -0,0 +1,247 @@
+require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
+
+describe HighlightHelper do
+ include HighlightHelper
+
+ describe :highlight_and_excerpt do
+
+ it 'excerpts text and highlights phrases' do
+ text = "Quentin Nobble-Boston, Permanent Under-Secretary, Department for Humpadinking"
+ phrases = ['humpadinking']
+ expected = '...Department for <span class="highlight">Humpadinking</span>'
+ highlight_and_excerpt(text, phrases, 15).should == expected
+ end
+
+ it 'excerpts text and highlights matches' do
+ text = "Quentin Nobble-Boston, Permanent Under-Secretary, Department for Humpadinking"
+ matches = [/\bhumpadink\w*\b/iu]
+ expected = '...Department for <span class="highlight">Humpadinking</span>'
+ highlight_and_excerpt(text, matches, 15).should == expected
+ end
+
+ context 'multiple matches' do
+
+ it 'highlights multiple matches' do
+ text = <<-EOF
+Quentin Nobble-Boston, Permanent Under-Secretary, Department for Humpadinking
+decided to visit Humpadink so that he could be with the Humpadinks
+EOF
+
+ expected = <<-EOF
+Quentin Nobble-Boston, Permanent Under-Secretary, Department for <span class="highlight">Humpadinking</span>
+decided to visit <span class="highlight">Humpadink</span> so that he could be with the <span class="highlight">Humpadinks</span>
+EOF
+ text.chomp!
+ expected.chomp!
+ matches = [/\b(humpadink\w*)\b/iu]
+ highlight_and_excerpt(text, matches, 1000).should == expected
+ end
+
+ it 'bases the split on the first match' do
+ text = "Quentin Nobble-Boston, Permanent Under-Secretary," \
+ "Department for Humpadinking decided to visit Humpadink" \
+ "so that he could be with the Humpadinks"
+
+ expected = "...Department for <span class=\"highlight\">" \
+ "Humpadinking</span> decided to vis..."
+
+ matches = [/\b(humpadink\w*)\b/iu]
+ highlight_and_excerpt(text, matches, 15).should == expected
+ end
+
+ end
+
+ end
+
+ describe :highlight_matches do
+
+ it 'highlights' do
+ assert_equal(
+ "This is a <mark>beautiful</mark> morning",
+ highlight_matches("This is a beautiful morning", "beautiful")
+ )
+
+ assert_equal(
+ "This is a <mark>beautiful</mark> morning, but also a <mark>beautiful</mark> day",
+ highlight_matches("This is a beautiful morning, but also a beautiful day", "beautiful")
+ )
+
+ assert_equal(
+ "This is a <b>beautiful</b> morning, but also a <b>beautiful</b> day",
+ highlight_matches("This is a beautiful morning, but also a beautiful day", "beautiful", :highlighter => '<b>\1</b>')
+ )
+
+ assert_equal(
+ "This text is not changed because we supplied an empty phrase",
+ highlight_matches("This text is not changed because we supplied an empty phrase", nil)
+ )
+
+ assert_equal ' ', highlight_matches(' ', 'blank text is returned verbatim')
+ end
+
+ it 'sanitizes input' do
+ assert_equal(
+ "This is a <mark>beautiful</mark> morning",
+ highlight_matches("This is a beautiful morning<script>code!</script>", "beautiful")
+ )
+ end
+
+ it 'doesnt sanitize when the sanitize option is false' do
+ assert_equal(
+ "This is a <mark>beautiful</mark> morning<script>code!</script>",
+ highlight_matches("This is a beautiful morning<script>code!</script>", "beautiful", :sanitize => false)
+ )
+ end
+
+ it 'highlights using regexp' do
+ assert_equal(
+ "This is a <mark>beautiful!</mark> morning",
+ highlight_matches("This is a beautiful! morning", "beautiful!")
+ )
+
+ assert_equal(
+ "This is a <mark>beautiful! morning</mark>",
+ highlight_matches("This is a beautiful! morning", "beautiful! morning")
+ )
+
+ assert_equal(
+ "This is a <mark>beautiful? morning</mark>",
+ highlight_matches("This is a beautiful? morning", "beautiful? morning")
+ )
+ end
+
+ it 'accepts regex' do
+ assert_equal("This day was challenging for judge <mark>Allen</mark> and his colleagues.",
+ highlight_matches("This day was challenging for judge Allen and his colleagues.", /\ballen\b/i))
+ end
+
+ it 'highlights multiple phrases in one pass' do
+ assert_equal %(<em>wow</em> <em>em</em>), highlight_matches('wow em', %w(wow em), :highlighter => '<em>\1</em>')
+ end
+
+ it 'highlights with html' do
+ assert_equal(
+ "<p>This is a <mark>beautiful</mark> morning, but also a <mark>beautiful</mark> day</p>",
+ highlight_matches("<p>This is a beautiful morning, but also a beautiful day</p>", "beautiful")
+ )
+ assert_equal(
+ "<p>This is a <em><mark>beautiful</mark></em> morning, but also a <mark>beautiful</mark> day</p>",
+ highlight_matches("<p>This is a <em>beautiful</em> morning, but also a beautiful day</p>", "beautiful")
+ )
+ assert_equal(
+ "<p>This is a <em class=\"error\"><mark>beautiful</mark></em> morning, but also a <mark>beautiful</mark> <span class=\"last\">day</span></p>",
+ highlight_matches("<p>This is a <em class=\"error\">beautiful</em> morning, but also a beautiful <span class=\"last\">day</span></p>", "beautiful")
+ )
+ assert_equal(
+ "<p class=\"beautiful\">This is a <mark>beautiful</mark> morning, but also a <mark>beautiful</mark> day</p>",
+ highlight_matches("<p class=\"beautiful\">This is a beautiful morning, but also a beautiful day</p>", "beautiful")
+ )
+ assert_equal(
+ "<p>This is a <mark>beautiful</mark> <a href=\"http://example.com/beautiful#top?what=beautiful%20morning&amp;when=now+then\">morning</a>, but also a <mark>beautiful</mark> day</p>",
+ highlight_matches("<p>This is a beautiful <a href=\"http://example.com/beautiful\#top?what=beautiful%20morning&when=now+then\">morning</a>, but also a beautiful day</p>", "beautiful")
+ )
+ assert_equal(
+ "<div>abc <b>div</b></div>",
+ highlight_matches("<div>abc div</div>", "div", :highlighter => '<b>\1</b>')
+ )
+ end
+
+ it 'doesnt modify the options hash' do
+ options = { :highlighter => '<b>\1</b>', :sanitize => false }
+ passed_options = options.dup
+ highlight_matches("<div>abc div</div>", "div", passed_options)
+ assert_equal options, passed_options
+ end
+
+ it 'highlights with a block' do
+ assert_equal(
+ "<b>one</b> <b>two</b> <b>three</b>",
+ highlight_matches("one two three", ["one", "two", "three"]) { |word| "<b>#{word}</b>" }
+ )
+ end
+
+ end
+
+ describe :excerpt do
+
+ it 'excerpts' do
+ assert_equal("...is a beautiful morn...", excerpt("This is a beautiful morning", "beautiful", :radius => 5))
+ assert_equal("This is a...", excerpt("This is a beautiful morning", "this", :radius => 5))
+ assert_equal("...iful morning", excerpt("This is a beautiful morning", "morning", :radius => 5))
+ assert_nil excerpt("This is a beautiful morning", "day")
+ end
+
+ it 'is not html safe' do
+ assert !excerpt('This is a beautiful! morning', 'beautiful', :radius => 5).html_safe?
+ end
+
+ it 'excerpts borderline cases' do
+ assert_equal("", excerpt("", "", :radius => 0))
+ assert_equal("a", excerpt("a", "a", :radius => 0))
+ assert_equal("...b...", excerpt("abc", "b", :radius => 0))
+ assert_equal("abc", excerpt("abc", "b", :radius => 1))
+ assert_equal("abc...", excerpt("abcd", "b", :radius => 1))
+ assert_equal("...abc", excerpt("zabc", "b", :radius => 1))
+ assert_equal("...abc...", excerpt("zabcd", "b", :radius => 1))
+ assert_equal("zabcd", excerpt("zabcd", "b", :radius => 2))
+
+ # excerpt strips the resulting string before ap-/prepending excerpt_string.
+ # whether this behavior is meaningful when excerpt_string is not to be
+ # appended is questionable.
+ assert_equal("zabcd", excerpt(" zabcd ", "b", :radius => 4))
+ assert_equal("...abc...", excerpt("z abc d", "b", :radius => 1))
+ end
+
+ it 'excerpts with regex' do
+ assert_equal('...is a beautiful! mor...', excerpt('This is a beautiful! morning', 'beautiful', :radius => 5))
+ assert_equal('...is a beautiful? mor...', excerpt('This is a beautiful? morning', 'beautiful', :radius => 5))
+ assert_equal('...is a beautiful? mor...', excerpt('This is a beautiful? morning', /\bbeau\w*\b/i, :radius => 5))
+ assert_equal('...is a beautiful? mor...', excerpt('This is a beautiful? morning', /\b(beau\w*)\b/i, :radius => 5))
+ assert_equal("...udge Allen and...", excerpt("This day was challenging for judge Allen and his colleagues.", /\ballen\b/i, :radius => 5))
+ assert_equal("...judge Allen and...", excerpt("This day was challenging for judge Allen and his colleagues.", /\ballen\b/i, :radius => 1, :separator => ' '))
+ assert_equal("...was challenging for...", excerpt("This day was challenging for judge Allen and his colleagues.", /\b(\w*allen\w*)\b/i, :radius => 5))
+ end
+
+ it 'excerpts with omission' do
+ assert_equal("[...]is a beautiful morn[...]", excerpt("This is a beautiful morning", "beautiful", :omission => "[...]",:radius => 5))
+ assert_equal(
+ "This is the ultimate supercalifragilisticexpialidoceous very looooooooooooooooooong looooooooooooong beautiful morning with amazing sunshine and awesome tempera[...]",
+ excerpt("This is the ultimate supercalifragilisticexpialidoceous very looooooooooooooooooong looooooooooooong beautiful morning with amazing sunshine and awesome temperatures. So what are you gonna do about it?", "very",
+ :omission => "[...]")
+ )
+ end
+
+ it 'excerpts with utf8' do
+ if RUBY_VERSION.to_f >= 1.9
+ assert_equal("...\357\254\203ciency could not be...".force_encoding(Encoding::UTF_8), excerpt("That's why e\357\254\203ciency could not be helped".force_encoding(Encoding::UTF_8), 'could', :radius => 8))
+ else
+ assert_equal("...\357\254\203ciency could not be...", excerpt("That's why e\357\254\203ciency could not be helped", 'could', :radius => 8))
+ end
+ end
+
+ it 'doesnt modify the options hash' do
+ options = { :omission => "[...]",:radius => 5 }
+ passed_options = options.dup
+ excerpt("This is a beautiful morning", "beautiful", passed_options)
+ assert_equal options, passed_options
+ end
+
+ it 'excerpts with separator' do
+ options = { :separator => ' ', :radius => 1 }
+ assert_equal('...a very beautiful...', excerpt('This is a very beautiful morning', 'very', options))
+ assert_equal('This is...', excerpt('This is a very beautiful morning', 'this', options))
+ assert_equal('...beautiful morning', excerpt('This is a very beautiful morning', 'morning', options))
+
+ options = { :separator => "\n", :radius => 0 }
+ assert_equal("...very long...", excerpt("my very\nvery\nvery long\nstring", 'long', options))
+
+ options = { :separator => "\n", :radius => 1 }
+ assert_equal("...very\nvery long\nstring", excerpt("my very\nvery\nvery long\nstring", 'long', options))
+
+ assert_equal excerpt('This is a beautiful morning', 'a'),
+ excerpt('This is a beautiful morning', 'a', :separator => nil)
+ end
+
+ end
+
+end
diff --git a/spec/integration/xapian_search_highlighting_spec.rb b/spec/integration/xapian_search_highlighting_spec.rb
new file mode 100644
index 000000000..65a34cf91
--- /dev/null
+++ b/spec/integration/xapian_search_highlighting_spec.rb
@@ -0,0 +1,39 @@
+require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
+
+describe 'highlighting search results' do
+ include HighlightHelper
+
+ it 'ignores stopwords' do
+ phrase = 'department of humpadinking'
+ search = ActsAsXapian::Search.new([PublicBody], phrase, :limit => 1)
+ matches = search.words_to_highlight(:regex => true)
+ highlight_matches(phrase, matches).should == '<mark>department</mark> of <mark>humpadinking</mark>'
+ end
+
+ it 'ignores case' do
+ search_phrase = 'department of humpadinking'
+ search = ActsAsXapian::Search.new([PublicBody], search_phrase, :limit => 1)
+ matches = search.words_to_highlight(:regex => true)
+ highlight_matches('Department of Humpadinking', matches).should == '<mark>Department</mark> of <mark>Humpadinking</mark>'
+ end
+
+ it 'highlights stemmed words' do
+ phrase = 'department'
+ search = ActsAsXapian::Search.new([PublicBody], phrase, :limit => 1)
+ matches = search.words_to_highlight(:regex => true)
+
+ search.words_to_highlight(:regex => false).should == ['depart']
+ highlight_matches(phrase, matches).should == '<mark>department</mark>'
+ end
+
+ it 'highlights stemmed words even if the stem is unhelpful' do
+ # Stemming returns 'bore' as the word to highlight which can't be
+ # matched in the original phrase.
+ phrase = 'boring'
+ search = ActsAsXapian::Search.new([PublicBody], phrase, :limit => 1)
+ matches = search.words_to_highlight(:regex => true, :include_original => true)
+
+ highlight_matches(phrase, matches).should == '<mark>boring</mark>'
+ end
+
+end
diff --git a/spec/models/xapian_spec.rb b/spec/models/xapian_spec.rb
index a1e060d8e..678e3a2dc 100644
--- a/spec/models/xapian_spec.rb
+++ b/spec/models/xapian_spec.rb
@@ -380,23 +380,63 @@ describe ActsAsXapian::Search, "#words_to_highlight" do
it "should return a list of words used in the search" do
s = ActsAsXapian::Search.new([PublicBody], "albatross words", :limit => 100)
- s.words_to_highlight.should == ["albatross", "words"]
+ s.words_to_highlight.should == ["albatross", "word"]
end
it "should remove any operators" do
s = ActsAsXapian::Search.new([PublicBody], "albatross words tag:mice", :limit => 100)
- s.words_to_highlight.should == ["albatross", "words"]
+ s.words_to_highlight.should == ["albatross", "word"]
end
- # This is the current behaviour but it seems a little simplistic to me
it "should separate punctuation" do
s = ActsAsXapian::Search.new([PublicBody], "The doctor's patient", :limit => 100)
- s.words_to_highlight.should == ["The", "doctor", "s", "patient"]
+ s.words_to_highlight.should == ["the", "doctor", "patient"].sort
end
it "should handle non-ascii characters" do
s = ActsAsXapian::Search.new([PublicBody], "adatigénylés words tag:mice", :limit => 100)
- s.words_to_highlight.should == ["adatigénylés", "words"]
+ s.words_to_highlight.should == ["adatigénylé", "word"]
+ end
+
+ it "should ignore stopwords" do
+ s = ActsAsXapian::Search.new([PublicBody], "department of humpadinking", :limit => 100)
+ s.words_to_highlight.should_not include('of')
+ end
+
+ it "uses stemming" do
+ s = ActsAsXapian::Search.new([PublicBody], 'department of humpadinking', :limit => 100)
+ s.words_to_highlight.should == ["depart", "humpadink"]
+ end
+
+ it "doesn't stem proper nouns" do
+ s = ActsAsXapian::Search.new([PublicBody], 'department of Humpadinking', :limit => 1)
+ s.words_to_highlight.should == ["depart", "humpadinking"]
+ end
+
+ it "includes the original search terms if requested" do
+ s = ActsAsXapian::Search.new([PublicBody], 'boring', :limit => 1)
+ s.words_to_highlight(:include_original => true).should == ['bore', 'boring']
+ end
+
+ it "does not return duplicate terms" do
+ s = ActsAsXapian::Search.new([PublicBody], 'boring boring', :limit => 1)
+ s.words_to_highlight.should == ['bore']
+ end
+
+ context 'the :regex option' do
+
+ it 'wraps each words in a regex that matches the full word' do
+ expected = [/\b(albatross)\b/iu]
+ s = ActsAsXapian::Search.new([PublicBody], 'Albatross', :limit => 1)
+ s.words_to_highlight(:regex => true).should == expected
+ end
+
+ it 'wraps each stem in a regex' do
+ expected = [/\b(depart)\w*\b/iu]
+ s = ActsAsXapian::Search.new([PublicBody], 'department', :limit => 1)
+ s.words_to_highlight(:regex => true).should == expected
+ end
+
end
end