diff options
-rw-r--r-- | app/controllers/general_controller.rb | 2 | ||||
-rw-r--r-- | app/controllers/track_controller.rb | 10 | ||||
-rw-r--r-- | app/helpers/application_helper.rb | 22 | ||||
-rw-r--r-- | app/helpers/highlight_helper.rb | 98 | ||||
-rw-r--r-- | app/views/track_mailer/event_digest.text.erb | 2 | ||||
-rw-r--r-- | lib/acts_as_xapian/acts_as_xapian.rb | 62 | ||||
-rw-r--r-- | spec/controllers/general_controller_spec.rb | 2 | ||||
-rw-r--r-- | spec/controllers/request_controller_spec.rb | 1 | ||||
-rw-r--r-- | spec/helpers/highlight_helper_spec.rb | 247 | ||||
-rw-r--r-- | spec/integration/xapian_search_highlighting_spec.rb | 39 | ||||
-rw-r--r-- | spec/models/xapian_spec.rb | 50 |
11 files changed, 494 insertions, 41 deletions
diff --git a/app/controllers/general_controller.rb b/app/controllers/general_controller.rb index 28055ddbf..158492eb2 100644 --- a/app/controllers/general_controller.rb +++ b/app/controllers/general_controller.rb @@ -159,7 +159,7 @@ class GeneralController < ApplicationController end # Spelling and highight words are same for all three queries - @highlight_words = @request_for_spelling.words_to_highlight + @highlight_words = @request_for_spelling.words_to_highlight(:regex => true, :include_original => true) if !(@request_for_spelling.spelling_correction =~ /[a-z]+:/) @spelling_correction = @request_for_spelling.spelling_correction end diff --git a/app/controllers/track_controller.rb b/app/controllers/track_controller.rb index c15fb573d..83700a55b 100644 --- a/app/controllers/track_controller.rb +++ b/app/controllers/track_controller.rb @@ -154,7 +154,15 @@ class TrackController < ApplicationController request.format = 'xml' unless params[:format] respond_to do |format| format.json { render :json => @xapian_object.results.map { |r| r[:model].json_for_api(true, - lambda { |t| view_context.highlight_and_excerpt(t, @xapian_object.words_to_highlight, 150) } + lambda do |t| + view_context.highlight_and_excerpt( + t, + @xapian_object.words_to_highlight( + :regex => true, + :include_original => true), + 150 + ) + end ) } } format.any { render :template => 'track/atom_feed', :formats => ['atom'], diff --git a/app/helpers/application_helper.rb b/app/helpers/application_helper.rb index 45b042354..49ce94951 100644 --- a/app/helpers/application_helper.rb +++ b/app/helpers/application_helper.rb @@ -22,6 +22,9 @@ module ApplicationHelper # Useful for sending emails include MailerHelper + # Extra highlight helpers + include HighlightHelper + # Copied from error_messages_for in active_record_helper.rb def foi_error_messages_for(*params) options = params.last.is_a?(Hash) ? params.pop.symbolize_keys : {} @@ -54,25 +57,6 @@ module ApplicationHelper end end - # Highlight words, also escapes HTML (other than spans that we add) - def highlight_words(t, words, html = true) - if html - highlight(h(t), words, :highlighter => '<span class="highlight">\1</span>').html_safe - else - highlight(t, words, :highlighter => '*\1*') - end - end - - def highlight_and_excerpt(t, words, excount, html = true) - newt = excerpt(t, words[0], :radius => excount) - if not newt - newt = excerpt(t, '', :radius => excount) - end - t = newt - t = highlight_words(t, words, html) - return t - end - def locale_name(locale) return LanguageNames::get_language_name(locale) end diff --git a/app/helpers/highlight_helper.rb b/app/helpers/highlight_helper.rb new file mode 100644 index 000000000..a98f6f320 --- /dev/null +++ b/app/helpers/highlight_helper.rb @@ -0,0 +1,98 @@ +module HighlightHelper + include ERB::Util + + # Implementation of rails' highlight that allows regex to be passed to + # the phrases parameter. + # https://github.com/rails/rails/pull/11793 + def highlight_matches(text, phrases, options = {}) + text = ActionController::Base.helpers.sanitize(text).try(:html_safe) if options.fetch(:sanitize, true) + + if text.blank? || phrases.blank? + text + else + match = Array(phrases).map do |p| + Regexp === p ? p.to_s : Regexp.escape(p) + end.join('|') + + if block_given? + text.gsub(/(#{match})(?![^<]*?>)/i) { |found| yield found } + else + highlighter = options.fetch(:highlighter, '<mark>\1</mark>') + text.gsub(/(#{match})(?![^<]*?>)/i, highlighter) + end + end.html_safe + end + + # Highlight words, also escapes HTML (other than spans that we add) + def highlight_words(t, words, html = true) + if html + highlight_matches(h(t), words, :highlighter => '<span class="highlight">\1</span>').html_safe + else + highlight_matches(t, words, :highlighter => '*\1*') + end + end + + def highlight_and_excerpt(t, words, excount, html = true) + newt = excerpt(t, words[0], :radius => excount) + if not newt + newt = excerpt(t, '', :radius => excount) + end + t = newt + t = highlight_words(t, words, html) + return t + end + + def excerpt(text, phrase, options = {}) + return unless text && phrase + + separator = options.fetch(:separator, nil) || "" + case phrase + when Regexp + regex = phrase + else + regex = /#{Regexp.escape(phrase)}/i + end + + return unless matches = text.match(regex) + phrase = matches[0] + + unless separator.empty? + text.split(separator).each do |value| + if value.match(regex) + regex = phrase = value + break + end + end + end + + first_part, second_part = text.split(phrase, 2) + + prefix, first_part = cut_excerpt_part(:first, first_part, separator, options) + postfix, second_part = cut_excerpt_part(:second, second_part, separator, options) + + affix = [first_part, separator, phrase, separator, second_part].join.strip + [prefix, affix, postfix].join + end + + private + + def cut_excerpt_part(part_position, part, separator, options) + return "", "" unless part + + radius = options.fetch(:radius, 100) + omission = options.fetch(:omission, "...") + + part = part.split(separator) + part.delete("") + affix = part.size > radius ? omission : "" + + part = if part_position == :first + drop_index = [part.length - radius, 0].max + part.drop(drop_index) + else + part.first(radius) + end + + return affix, part.join(separator) + end +end diff --git a/app/views/track_mailer/event_digest.text.erb b/app/views/track_mailer/event_digest.text.erb index a154f430f..f6e699e41 100644 --- a/app/views/track_mailer/event_digest.text.erb +++ b/app/views/track_mailer/event_digest.text.erb @@ -4,7 +4,7 @@ for track_thing, alert_results, xapian_object in @email_about_things main_text += track_thing.params[:title_in_email] + "\n" main_text += ("=" * track_thing.params[:title_in_email].size) + "\n\n" - @highlight_words = xapian_object.words_to_highlight + @highlight_words = xapian_object.words_to_highlight(:regex => true) for result in alert_results.reverse if result[:model].class.to_s == "InfoRequestEvent" event = result[:model] diff --git a/lib/acts_as_xapian/acts_as_xapian.rb b/lib/acts_as_xapian/acts_as_xapian.rb index 168d2eec3..6520a20a4 100644 --- a/lib/acts_as_xapian/acts_as_xapian.rb +++ b/lib/acts_as_xapian/acts_as_xapian.rb @@ -21,6 +21,20 @@ rescue LoadError $acts_as_xapian_bindings_available = false end +module Xapian + class QueryParser + def unstem(term) + words = [] + + Xapian._safelyIterate(unstem_begin(term), unstem_end(term)) do |item| + words << item.term + end + + words + end + end +end + module ActsAsXapian ###################################################################### # Module level variables @@ -472,16 +486,42 @@ module ActsAsXapian # Return just normal words in the query i.e. Not operators, ones in # date ranges or similar. Use this for cheap highlighting with # TextHelper::highlight, and excerpt. - def words_to_highlight - # TODO: In Ruby 1.9 we can do matching of any unicode letter with \p{L} - # But we still need to support ruby 1.8 for the time being so... - query_nopunc = self.query_string.gsub(/[^ёЁа-яА-Яa-zA-Zà-üÀ-Ü0-9:\.\/_]/iu, " ") - query_nopunc = query_nopunc.gsub(/\s+/, " ") - words = query_nopunc.split(" ") - # Remove anything with a :, . or / in it - words = words.find_all {|o| !o.match(/(:|\.|\/)/) } - words = words.find_all {|o| !o.match(/^(AND|NOT|OR|XOR)$/) } - return words + def words_to_highlight(opts = {}) + default_opts = { :include_original => false, :regex => false } + opts = default_opts.merge(opts) + + # Reject all prefixes other than Z, which we know is reserved for stems + terms = query.terms.reject { |t| t.term.first.match(/^[A-Y]$/) } + # Collect the stems including the Z prefix + raw_stems = terms.map { |t| t.term if t.term.start_with?('Z') }.compact.uniq.sort + # Collect stems, chopping the Z prefix off + stems = raw_stems.map { |t| t[1..-1] }.compact.sort + # Collect the non-stem terms + words = terms.map { |t| t.term unless t.term.start_with?('Z') }.compact.sort + + # Add the unstemmed words from the original query + # Sometimes stems can be unhelpful with the :regex option, for example + # stemming 'boring' results in us trying to highlight 'bore'. + if opts[:include_original] + raw_stems.each do |raw_stem| + words << ActsAsXapian.query_parser.unstem(raw_stem).uniq + end + + words = words.any? ? words.flatten.uniq : [] + end + + if opts[:regex] + stems.map! { |w| /\b(#{ w })\w*\b/iu } + words.map! { |w| /\b(#{ w })\b/iu } + end + + if RUBY_VERSION.to_f >= 1.9 + (stems + words).map! do |term| + term.is_a?(String) ? term.force_encoding('UTF-8') : term + end + else + stems + words + end end # Text for lines in log file @@ -975,5 +1015,3 @@ end # Reopen ActiveRecord and include the acts_as_xapian method ActiveRecord::Base.extend ActsAsXapian::ActsMethods - - diff --git a/spec/controllers/general_controller_spec.rb b/spec/controllers/general_controller_spec.rb index 7590a5b42..c0a9d57d3 100644 --- a/spec/controllers/general_controller_spec.rb +++ b/spec/controllers/general_controller_spec.rb @@ -188,7 +188,7 @@ describe GeneralController, 'when using xapian search' do it 'should highlight words for a user-only request' do get :search, :combined => "bob/users" - assigns[:highlight_words].should == ['bob'] + assigns[:highlight_words].should == [/\b(bob)\w*\b/iu, /\b(bob)\b/iu] end it 'should show spelling corrections for a user-only request' do diff --git a/spec/controllers/request_controller_spec.rb b/spec/controllers/request_controller_spec.rb index 48f37a45c..f7c935af3 100644 --- a/spec/controllers/request_controller_spec.rb +++ b/spec/controllers/request_controller_spec.rb @@ -923,7 +923,6 @@ describe RequestController, "when searching for an authority" do end it "should return matching bodies" do - session[:user_id] = @user.id get :select_authority, :query => "Quango" diff --git a/spec/helpers/highlight_helper_spec.rb b/spec/helpers/highlight_helper_spec.rb new file mode 100644 index 000000000..e1be7e153 --- /dev/null +++ b/spec/helpers/highlight_helper_spec.rb @@ -0,0 +1,247 @@ +require File.expand_path(File.dirname(__FILE__) + '/../spec_helper') + +describe HighlightHelper do + include HighlightHelper + + describe :highlight_and_excerpt do + + it 'excerpts text and highlights phrases' do + text = "Quentin Nobble-Boston, Permanent Under-Secretary, Department for Humpadinking" + phrases = ['humpadinking'] + expected = '...Department for <span class="highlight">Humpadinking</span>' + highlight_and_excerpt(text, phrases, 15).should == expected + end + + it 'excerpts text and highlights matches' do + text = "Quentin Nobble-Boston, Permanent Under-Secretary, Department for Humpadinking" + matches = [/\bhumpadink\w*\b/iu] + expected = '...Department for <span class="highlight">Humpadinking</span>' + highlight_and_excerpt(text, matches, 15).should == expected + end + + context 'multiple matches' do + + it 'highlights multiple matches' do + text = <<-EOF +Quentin Nobble-Boston, Permanent Under-Secretary, Department for Humpadinking +decided to visit Humpadink so that he could be with the Humpadinks +EOF + + expected = <<-EOF +Quentin Nobble-Boston, Permanent Under-Secretary, Department for <span class="highlight">Humpadinking</span> +decided to visit <span class="highlight">Humpadink</span> so that he could be with the <span class="highlight">Humpadinks</span> +EOF + text.chomp! + expected.chomp! + matches = [/\b(humpadink\w*)\b/iu] + highlight_and_excerpt(text, matches, 1000).should == expected + end + + it 'bases the split on the first match' do + text = "Quentin Nobble-Boston, Permanent Under-Secretary," \ + "Department for Humpadinking decided to visit Humpadink" \ + "so that he could be with the Humpadinks" + + expected = "...Department for <span class=\"highlight\">" \ + "Humpadinking</span> decided to vis..." + + matches = [/\b(humpadink\w*)\b/iu] + highlight_and_excerpt(text, matches, 15).should == expected + end + + end + + end + + describe :highlight_matches do + + it 'highlights' do + assert_equal( + "This is a <mark>beautiful</mark> morning", + highlight_matches("This is a beautiful morning", "beautiful") + ) + + assert_equal( + "This is a <mark>beautiful</mark> morning, but also a <mark>beautiful</mark> day", + highlight_matches("This is a beautiful morning, but also a beautiful day", "beautiful") + ) + + assert_equal( + "This is a <b>beautiful</b> morning, but also a <b>beautiful</b> day", + highlight_matches("This is a beautiful morning, but also a beautiful day", "beautiful", :highlighter => '<b>\1</b>') + ) + + assert_equal( + "This text is not changed because we supplied an empty phrase", + highlight_matches("This text is not changed because we supplied an empty phrase", nil) + ) + + assert_equal ' ', highlight_matches(' ', 'blank text is returned verbatim') + end + + it 'sanitizes input' do + assert_equal( + "This is a <mark>beautiful</mark> morning", + highlight_matches("This is a beautiful morning<script>code!</script>", "beautiful") + ) + end + + it 'doesnt sanitize when the sanitize option is false' do + assert_equal( + "This is a <mark>beautiful</mark> morning<script>code!</script>", + highlight_matches("This is a beautiful morning<script>code!</script>", "beautiful", :sanitize => false) + ) + end + + it 'highlights using regexp' do + assert_equal( + "This is a <mark>beautiful!</mark> morning", + highlight_matches("This is a beautiful! morning", "beautiful!") + ) + + assert_equal( + "This is a <mark>beautiful! morning</mark>", + highlight_matches("This is a beautiful! morning", "beautiful! morning") + ) + + assert_equal( + "This is a <mark>beautiful? morning</mark>", + highlight_matches("This is a beautiful? morning", "beautiful? morning") + ) + end + + it 'accepts regex' do + assert_equal("This day was challenging for judge <mark>Allen</mark> and his colleagues.", + highlight_matches("This day was challenging for judge Allen and his colleagues.", /\ballen\b/i)) + end + + it 'highlights multiple phrases in one pass' do + assert_equal %(<em>wow</em> <em>em</em>), highlight_matches('wow em', %w(wow em), :highlighter => '<em>\1</em>') + end + + it 'highlights with html' do + assert_equal( + "<p>This is a <mark>beautiful</mark> morning, but also a <mark>beautiful</mark> day</p>", + highlight_matches("<p>This is a beautiful morning, but also a beautiful day</p>", "beautiful") + ) + assert_equal( + "<p>This is a <em><mark>beautiful</mark></em> morning, but also a <mark>beautiful</mark> day</p>", + highlight_matches("<p>This is a <em>beautiful</em> morning, but also a beautiful day</p>", "beautiful") + ) + assert_equal( + "<p>This is a <em class=\"error\"><mark>beautiful</mark></em> morning, but also a <mark>beautiful</mark> <span class=\"last\">day</span></p>", + highlight_matches("<p>This is a <em class=\"error\">beautiful</em> morning, but also a beautiful <span class=\"last\">day</span></p>", "beautiful") + ) + assert_equal( + "<p class=\"beautiful\">This is a <mark>beautiful</mark> morning, but also a <mark>beautiful</mark> day</p>", + highlight_matches("<p class=\"beautiful\">This is a beautiful morning, but also a beautiful day</p>", "beautiful") + ) + assert_equal( + "<p>This is a <mark>beautiful</mark> <a href=\"http://example.com/beautiful#top?what=beautiful%20morning&when=now+then\">morning</a>, but also a <mark>beautiful</mark> day</p>", + highlight_matches("<p>This is a beautiful <a href=\"http://example.com/beautiful\#top?what=beautiful%20morning&when=now+then\">morning</a>, but also a beautiful day</p>", "beautiful") + ) + assert_equal( + "<div>abc <b>div</b></div>", + highlight_matches("<div>abc div</div>", "div", :highlighter => '<b>\1</b>') + ) + end + + it 'doesnt modify the options hash' do + options = { :highlighter => '<b>\1</b>', :sanitize => false } + passed_options = options.dup + highlight_matches("<div>abc div</div>", "div", passed_options) + assert_equal options, passed_options + end + + it 'highlights with a block' do + assert_equal( + "<b>one</b> <b>two</b> <b>three</b>", + highlight_matches("one two three", ["one", "two", "three"]) { |word| "<b>#{word}</b>" } + ) + end + + end + + describe :excerpt do + + it 'excerpts' do + assert_equal("...is a beautiful morn...", excerpt("This is a beautiful morning", "beautiful", :radius => 5)) + assert_equal("This is a...", excerpt("This is a beautiful morning", "this", :radius => 5)) + assert_equal("...iful morning", excerpt("This is a beautiful morning", "morning", :radius => 5)) + assert_nil excerpt("This is a beautiful morning", "day") + end + + it 'is not html safe' do + assert !excerpt('This is a beautiful! morning', 'beautiful', :radius => 5).html_safe? + end + + it 'excerpts borderline cases' do + assert_equal("", excerpt("", "", :radius => 0)) + assert_equal("a", excerpt("a", "a", :radius => 0)) + assert_equal("...b...", excerpt("abc", "b", :radius => 0)) + assert_equal("abc", excerpt("abc", "b", :radius => 1)) + assert_equal("abc...", excerpt("abcd", "b", :radius => 1)) + assert_equal("...abc", excerpt("zabc", "b", :radius => 1)) + assert_equal("...abc...", excerpt("zabcd", "b", :radius => 1)) + assert_equal("zabcd", excerpt("zabcd", "b", :radius => 2)) + + # excerpt strips the resulting string before ap-/prepending excerpt_string. + # whether this behavior is meaningful when excerpt_string is not to be + # appended is questionable. + assert_equal("zabcd", excerpt(" zabcd ", "b", :radius => 4)) + assert_equal("...abc...", excerpt("z abc d", "b", :radius => 1)) + end + + it 'excerpts with regex' do + assert_equal('...is a beautiful! mor...', excerpt('This is a beautiful! morning', 'beautiful', :radius => 5)) + assert_equal('...is a beautiful? mor...', excerpt('This is a beautiful? morning', 'beautiful', :radius => 5)) + assert_equal('...is a beautiful? mor...', excerpt('This is a beautiful? morning', /\bbeau\w*\b/i, :radius => 5)) + assert_equal('...is a beautiful? mor...', excerpt('This is a beautiful? morning', /\b(beau\w*)\b/i, :radius => 5)) + assert_equal("...udge Allen and...", excerpt("This day was challenging for judge Allen and his colleagues.", /\ballen\b/i, :radius => 5)) + assert_equal("...judge Allen and...", excerpt("This day was challenging for judge Allen and his colleagues.", /\ballen\b/i, :radius => 1, :separator => ' ')) + assert_equal("...was challenging for...", excerpt("This day was challenging for judge Allen and his colleagues.", /\b(\w*allen\w*)\b/i, :radius => 5)) + end + + it 'excerpts with omission' do + assert_equal("[...]is a beautiful morn[...]", excerpt("This is a beautiful morning", "beautiful", :omission => "[...]",:radius => 5)) + assert_equal( + "This is the ultimate supercalifragilisticexpialidoceous very looooooooooooooooooong looooooooooooong beautiful morning with amazing sunshine and awesome tempera[...]", + excerpt("This is the ultimate supercalifragilisticexpialidoceous very looooooooooooooooooong looooooooooooong beautiful morning with amazing sunshine and awesome temperatures. So what are you gonna do about it?", "very", + :omission => "[...]") + ) + end + + it 'excerpts with utf8' do + if RUBY_VERSION.to_f >= 1.9 + assert_equal("...\357\254\203ciency could not be...".force_encoding(Encoding::UTF_8), excerpt("That's why e\357\254\203ciency could not be helped".force_encoding(Encoding::UTF_8), 'could', :radius => 8)) + else + assert_equal("...\357\254\203ciency could not be...", excerpt("That's why e\357\254\203ciency could not be helped", 'could', :radius => 8)) + end + end + + it 'doesnt modify the options hash' do + options = { :omission => "[...]",:radius => 5 } + passed_options = options.dup + excerpt("This is a beautiful morning", "beautiful", passed_options) + assert_equal options, passed_options + end + + it 'excerpts with separator' do + options = { :separator => ' ', :radius => 1 } + assert_equal('...a very beautiful...', excerpt('This is a very beautiful morning', 'very', options)) + assert_equal('This is...', excerpt('This is a very beautiful morning', 'this', options)) + assert_equal('...beautiful morning', excerpt('This is a very beautiful morning', 'morning', options)) + + options = { :separator => "\n", :radius => 0 } + assert_equal("...very long...", excerpt("my very\nvery\nvery long\nstring", 'long', options)) + + options = { :separator => "\n", :radius => 1 } + assert_equal("...very\nvery long\nstring", excerpt("my very\nvery\nvery long\nstring", 'long', options)) + + assert_equal excerpt('This is a beautiful morning', 'a'), + excerpt('This is a beautiful morning', 'a', :separator => nil) + end + + end + +end diff --git a/spec/integration/xapian_search_highlighting_spec.rb b/spec/integration/xapian_search_highlighting_spec.rb new file mode 100644 index 000000000..65a34cf91 --- /dev/null +++ b/spec/integration/xapian_search_highlighting_spec.rb @@ -0,0 +1,39 @@ +require File.expand_path(File.dirname(__FILE__) + '/../spec_helper') + +describe 'highlighting search results' do + include HighlightHelper + + it 'ignores stopwords' do + phrase = 'department of humpadinking' + search = ActsAsXapian::Search.new([PublicBody], phrase, :limit => 1) + matches = search.words_to_highlight(:regex => true) + highlight_matches(phrase, matches).should == '<mark>department</mark> of <mark>humpadinking</mark>' + end + + it 'ignores case' do + search_phrase = 'department of humpadinking' + search = ActsAsXapian::Search.new([PublicBody], search_phrase, :limit => 1) + matches = search.words_to_highlight(:regex => true) + highlight_matches('Department of Humpadinking', matches).should == '<mark>Department</mark> of <mark>Humpadinking</mark>' + end + + it 'highlights stemmed words' do + phrase = 'department' + search = ActsAsXapian::Search.new([PublicBody], phrase, :limit => 1) + matches = search.words_to_highlight(:regex => true) + + search.words_to_highlight(:regex => false).should == ['depart'] + highlight_matches(phrase, matches).should == '<mark>department</mark>' + end + + it 'highlights stemmed words even if the stem is unhelpful' do + # Stemming returns 'bore' as the word to highlight which can't be + # matched in the original phrase. + phrase = 'boring' + search = ActsAsXapian::Search.new([PublicBody], phrase, :limit => 1) + matches = search.words_to_highlight(:regex => true, :include_original => true) + + highlight_matches(phrase, matches).should == '<mark>boring</mark>' + end + +end diff --git a/spec/models/xapian_spec.rb b/spec/models/xapian_spec.rb index a1e060d8e..678e3a2dc 100644 --- a/spec/models/xapian_spec.rb +++ b/spec/models/xapian_spec.rb @@ -380,23 +380,63 @@ describe ActsAsXapian::Search, "#words_to_highlight" do it "should return a list of words used in the search" do s = ActsAsXapian::Search.new([PublicBody], "albatross words", :limit => 100) - s.words_to_highlight.should == ["albatross", "words"] + s.words_to_highlight.should == ["albatross", "word"] end it "should remove any operators" do s = ActsAsXapian::Search.new([PublicBody], "albatross words tag:mice", :limit => 100) - s.words_to_highlight.should == ["albatross", "words"] + s.words_to_highlight.should == ["albatross", "word"] end - # This is the current behaviour but it seems a little simplistic to me it "should separate punctuation" do s = ActsAsXapian::Search.new([PublicBody], "The doctor's patient", :limit => 100) - s.words_to_highlight.should == ["The", "doctor", "s", "patient"] + s.words_to_highlight.should == ["the", "doctor", "patient"].sort end it "should handle non-ascii characters" do s = ActsAsXapian::Search.new([PublicBody], "adatigénylés words tag:mice", :limit => 100) - s.words_to_highlight.should == ["adatigénylés", "words"] + s.words_to_highlight.should == ["adatigénylé", "word"] + end + + it "should ignore stopwords" do + s = ActsAsXapian::Search.new([PublicBody], "department of humpadinking", :limit => 100) + s.words_to_highlight.should_not include('of') + end + + it "uses stemming" do + s = ActsAsXapian::Search.new([PublicBody], 'department of humpadinking', :limit => 100) + s.words_to_highlight.should == ["depart", "humpadink"] + end + + it "doesn't stem proper nouns" do + s = ActsAsXapian::Search.new([PublicBody], 'department of Humpadinking', :limit => 1) + s.words_to_highlight.should == ["depart", "humpadinking"] + end + + it "includes the original search terms if requested" do + s = ActsAsXapian::Search.new([PublicBody], 'boring', :limit => 1) + s.words_to_highlight(:include_original => true).should == ['bore', 'boring'] + end + + it "does not return duplicate terms" do + s = ActsAsXapian::Search.new([PublicBody], 'boring boring', :limit => 1) + s.words_to_highlight.should == ['bore'] + end + + context 'the :regex option' do + + it 'wraps each words in a regex that matches the full word' do + expected = [/\b(albatross)\b/iu] + s = ActsAsXapian::Search.new([PublicBody], 'Albatross', :limit => 1) + s.words_to_highlight(:regex => true).should == expected + end + + it 'wraps each stem in a regex' do + expected = [/\b(depart)\w*\b/iu] + s = ActsAsXapian::Search.new([PublicBody], 'department', :limit => 1) + s.words_to_highlight(:regex => true).should == expected + end + end end |