aboutsummaryrefslogtreecommitdiffstats
path: root/spec
diff options
context:
space:
mode:
Diffstat (limited to 'spec')
-rw-r--r--spec/controllers/general_controller_spec.rb2
-rw-r--r--spec/controllers/request_controller_spec.rb1
-rw-r--r--spec/helpers/highlight_helper_spec.rb247
-rw-r--r--spec/integration/xapian_search_highlighting_spec.rb39
-rw-r--r--spec/models/xapian_spec.rb50
5 files changed, 332 insertions, 7 deletions
diff --git a/spec/controllers/general_controller_spec.rb b/spec/controllers/general_controller_spec.rb
index 7590a5b42..c0a9d57d3 100644
--- a/spec/controllers/general_controller_spec.rb
+++ b/spec/controllers/general_controller_spec.rb
@@ -188,7 +188,7 @@ describe GeneralController, 'when using xapian search' do
it 'should highlight words for a user-only request' do
get :search, :combined => "bob/users"
- assigns[:highlight_words].should == ['bob']
+ assigns[:highlight_words].should == [/\b(bob)\w*\b/iu, /\b(bob)\b/iu]
end
it 'should show spelling corrections for a user-only request' do
diff --git a/spec/controllers/request_controller_spec.rb b/spec/controllers/request_controller_spec.rb
index 48f37a45c..f7c935af3 100644
--- a/spec/controllers/request_controller_spec.rb
+++ b/spec/controllers/request_controller_spec.rb
@@ -923,7 +923,6 @@ describe RequestController, "when searching for an authority" do
end
it "should return matching bodies" do
-
session[:user_id] = @user.id
get :select_authority, :query => "Quango"
diff --git a/spec/helpers/highlight_helper_spec.rb b/spec/helpers/highlight_helper_spec.rb
new file mode 100644
index 000000000..e1be7e153
--- /dev/null
+++ b/spec/helpers/highlight_helper_spec.rb
@@ -0,0 +1,247 @@
+require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
+
+describe HighlightHelper do
+ include HighlightHelper
+
+ describe :highlight_and_excerpt do
+
+ it 'excerpts text and highlights phrases' do
+ text = "Quentin Nobble-Boston, Permanent Under-Secretary, Department for Humpadinking"
+ phrases = ['humpadinking']
+ expected = '...Department for <span class="highlight">Humpadinking</span>'
+ highlight_and_excerpt(text, phrases, 15).should == expected
+ end
+
+ it 'excerpts text and highlights matches' do
+ text = "Quentin Nobble-Boston, Permanent Under-Secretary, Department for Humpadinking"
+ matches = [/\bhumpadink\w*\b/iu]
+ expected = '...Department for <span class="highlight">Humpadinking</span>'
+ highlight_and_excerpt(text, matches, 15).should == expected
+ end
+
+ context 'multiple matches' do
+
+ it 'highlights multiple matches' do
+ text = <<-EOF
+Quentin Nobble-Boston, Permanent Under-Secretary, Department for Humpadinking
+decided to visit Humpadink so that he could be with the Humpadinks
+EOF
+
+ expected = <<-EOF
+Quentin Nobble-Boston, Permanent Under-Secretary, Department for <span class="highlight">Humpadinking</span>
+decided to visit <span class="highlight">Humpadink</span> so that he could be with the <span class="highlight">Humpadinks</span>
+EOF
+ text.chomp!
+ expected.chomp!
+ matches = [/\b(humpadink\w*)\b/iu]
+ highlight_and_excerpt(text, matches, 1000).should == expected
+ end
+
+ it 'bases the split on the first match' do
+ text = "Quentin Nobble-Boston, Permanent Under-Secretary," \
+ "Department for Humpadinking decided to visit Humpadink" \
+ "so that he could be with the Humpadinks"
+
+ expected = "...Department for <span class=\"highlight\">" \
+ "Humpadinking</span> decided to vis..."
+
+ matches = [/\b(humpadink\w*)\b/iu]
+ highlight_and_excerpt(text, matches, 15).should == expected
+ end
+
+ end
+
+ end
+
+ describe :highlight_matches do
+
+ it 'highlights' do
+ assert_equal(
+ "This is a <mark>beautiful</mark> morning",
+ highlight_matches("This is a beautiful morning", "beautiful")
+ )
+
+ assert_equal(
+ "This is a <mark>beautiful</mark> morning, but also a <mark>beautiful</mark> day",
+ highlight_matches("This is a beautiful morning, but also a beautiful day", "beautiful")
+ )
+
+ assert_equal(
+ "This is a <b>beautiful</b> morning, but also a <b>beautiful</b> day",
+ highlight_matches("This is a beautiful morning, but also a beautiful day", "beautiful", :highlighter => '<b>\1</b>')
+ )
+
+ assert_equal(
+ "This text is not changed because we supplied an empty phrase",
+ highlight_matches("This text is not changed because we supplied an empty phrase", nil)
+ )
+
+ assert_equal ' ', highlight_matches(' ', 'blank text is returned verbatim')
+ end
+
+ it 'sanitizes input' do
+ assert_equal(
+ "This is a <mark>beautiful</mark> morning",
+ highlight_matches("This is a beautiful morning<script>code!</script>", "beautiful")
+ )
+ end
+
+ it 'doesnt sanitize when the sanitize option is false' do
+ assert_equal(
+ "This is a <mark>beautiful</mark> morning<script>code!</script>",
+ highlight_matches("This is a beautiful morning<script>code!</script>", "beautiful", :sanitize => false)
+ )
+ end
+
+ it 'highlights using regexp' do
+ assert_equal(
+ "This is a <mark>beautiful!</mark> morning",
+ highlight_matches("This is a beautiful! morning", "beautiful!")
+ )
+
+ assert_equal(
+ "This is a <mark>beautiful! morning</mark>",
+ highlight_matches("This is a beautiful! morning", "beautiful! morning")
+ )
+
+ assert_equal(
+ "This is a <mark>beautiful? morning</mark>",
+ highlight_matches("This is a beautiful? morning", "beautiful? morning")
+ )
+ end
+
+ it 'accepts regex' do
+ assert_equal("This day was challenging for judge <mark>Allen</mark> and his colleagues.",
+ highlight_matches("This day was challenging for judge Allen and his colleagues.", /\ballen\b/i))
+ end
+
+ it 'highlights multiple phrases in one pass' do
+ assert_equal %(<em>wow</em> <em>em</em>), highlight_matches('wow em', %w(wow em), :highlighter => '<em>\1</em>')
+ end
+
+ it 'highlights with html' do
+ assert_equal(
+ "<p>This is a <mark>beautiful</mark> morning, but also a <mark>beautiful</mark> day</p>",
+ highlight_matches("<p>This is a beautiful morning, but also a beautiful day</p>", "beautiful")
+ )
+ assert_equal(
+ "<p>This is a <em><mark>beautiful</mark></em> morning, but also a <mark>beautiful</mark> day</p>",
+ highlight_matches("<p>This is a <em>beautiful</em> morning, but also a beautiful day</p>", "beautiful")
+ )
+ assert_equal(
+ "<p>This is a <em class=\"error\"><mark>beautiful</mark></em> morning, but also a <mark>beautiful</mark> <span class=\"last\">day</span></p>",
+ highlight_matches("<p>This is a <em class=\"error\">beautiful</em> morning, but also a beautiful <span class=\"last\">day</span></p>", "beautiful")
+ )
+ assert_equal(
+ "<p class=\"beautiful\">This is a <mark>beautiful</mark> morning, but also a <mark>beautiful</mark> day</p>",
+ highlight_matches("<p class=\"beautiful\">This is a beautiful morning, but also a beautiful day</p>", "beautiful")
+ )
+ assert_equal(
+ "<p>This is a <mark>beautiful</mark> <a href=\"http://example.com/beautiful#top?what=beautiful%20morning&amp;when=now+then\">morning</a>, but also a <mark>beautiful</mark> day</p>",
+ highlight_matches("<p>This is a beautiful <a href=\"http://example.com/beautiful\#top?what=beautiful%20morning&when=now+then\">morning</a>, but also a beautiful day</p>", "beautiful")
+ )
+ assert_equal(
+ "<div>abc <b>div</b></div>",
+ highlight_matches("<div>abc div</div>", "div", :highlighter => '<b>\1</b>')
+ )
+ end
+
+ it 'doesnt modify the options hash' do
+ options = { :highlighter => '<b>\1</b>', :sanitize => false }
+ passed_options = options.dup
+ highlight_matches("<div>abc div</div>", "div", passed_options)
+ assert_equal options, passed_options
+ end
+
+ it 'highlights with a block' do
+ assert_equal(
+ "<b>one</b> <b>two</b> <b>three</b>",
+ highlight_matches("one two three", ["one", "two", "three"]) { |word| "<b>#{word}</b>" }
+ )
+ end
+
+ end
+
+ describe :excerpt do
+
+ it 'excerpts' do
+ assert_equal("...is a beautiful morn...", excerpt("This is a beautiful morning", "beautiful", :radius => 5))
+ assert_equal("This is a...", excerpt("This is a beautiful morning", "this", :radius => 5))
+ assert_equal("...iful morning", excerpt("This is a beautiful morning", "morning", :radius => 5))
+ assert_nil excerpt("This is a beautiful morning", "day")
+ end
+
+ it 'is not html safe' do
+ assert !excerpt('This is a beautiful! morning', 'beautiful', :radius => 5).html_safe?
+ end
+
+ it 'excerpts borderline cases' do
+ assert_equal("", excerpt("", "", :radius => 0))
+ assert_equal("a", excerpt("a", "a", :radius => 0))
+ assert_equal("...b...", excerpt("abc", "b", :radius => 0))
+ assert_equal("abc", excerpt("abc", "b", :radius => 1))
+ assert_equal("abc...", excerpt("abcd", "b", :radius => 1))
+ assert_equal("...abc", excerpt("zabc", "b", :radius => 1))
+ assert_equal("...abc...", excerpt("zabcd", "b", :radius => 1))
+ assert_equal("zabcd", excerpt("zabcd", "b", :radius => 2))
+
+ # excerpt strips the resulting string before ap-/prepending excerpt_string.
+ # whether this behavior is meaningful when excerpt_string is not to be
+ # appended is questionable.
+ assert_equal("zabcd", excerpt(" zabcd ", "b", :radius => 4))
+ assert_equal("...abc...", excerpt("z abc d", "b", :radius => 1))
+ end
+
+ it 'excerpts with regex' do
+ assert_equal('...is a beautiful! mor...', excerpt('This is a beautiful! morning', 'beautiful', :radius => 5))
+ assert_equal('...is a beautiful? mor...', excerpt('This is a beautiful? morning', 'beautiful', :radius => 5))
+ assert_equal('...is a beautiful? mor...', excerpt('This is a beautiful? morning', /\bbeau\w*\b/i, :radius => 5))
+ assert_equal('...is a beautiful? mor...', excerpt('This is a beautiful? morning', /\b(beau\w*)\b/i, :radius => 5))
+ assert_equal("...udge Allen and...", excerpt("This day was challenging for judge Allen and his colleagues.", /\ballen\b/i, :radius => 5))
+ assert_equal("...judge Allen and...", excerpt("This day was challenging for judge Allen and his colleagues.", /\ballen\b/i, :radius => 1, :separator => ' '))
+ assert_equal("...was challenging for...", excerpt("This day was challenging for judge Allen and his colleagues.", /\b(\w*allen\w*)\b/i, :radius => 5))
+ end
+
+ it 'excerpts with omission' do
+ assert_equal("[...]is a beautiful morn[...]", excerpt("This is a beautiful morning", "beautiful", :omission => "[...]",:radius => 5))
+ assert_equal(
+ "This is the ultimate supercalifragilisticexpialidoceous very looooooooooooooooooong looooooooooooong beautiful morning with amazing sunshine and awesome tempera[...]",
+ excerpt("This is the ultimate supercalifragilisticexpialidoceous very looooooooooooooooooong looooooooooooong beautiful morning with amazing sunshine and awesome temperatures. So what are you gonna do about it?", "very",
+ :omission => "[...]")
+ )
+ end
+
+ it 'excerpts with utf8' do
+ if RUBY_VERSION.to_f >= 1.9
+ assert_equal("...\357\254\203ciency could not be...".force_encoding(Encoding::UTF_8), excerpt("That's why e\357\254\203ciency could not be helped".force_encoding(Encoding::UTF_8), 'could', :radius => 8))
+ else
+ assert_equal("...\357\254\203ciency could not be...", excerpt("That's why e\357\254\203ciency could not be helped", 'could', :radius => 8))
+ end
+ end
+
+ it 'doesnt modify the options hash' do
+ options = { :omission => "[...]",:radius => 5 }
+ passed_options = options.dup
+ excerpt("This is a beautiful morning", "beautiful", passed_options)
+ assert_equal options, passed_options
+ end
+
+ it 'excerpts with separator' do
+ options = { :separator => ' ', :radius => 1 }
+ assert_equal('...a very beautiful...', excerpt('This is a very beautiful morning', 'very', options))
+ assert_equal('This is...', excerpt('This is a very beautiful morning', 'this', options))
+ assert_equal('...beautiful morning', excerpt('This is a very beautiful morning', 'morning', options))
+
+ options = { :separator => "\n", :radius => 0 }
+ assert_equal("...very long...", excerpt("my very\nvery\nvery long\nstring", 'long', options))
+
+ options = { :separator => "\n", :radius => 1 }
+ assert_equal("...very\nvery long\nstring", excerpt("my very\nvery\nvery long\nstring", 'long', options))
+
+ assert_equal excerpt('This is a beautiful morning', 'a'),
+ excerpt('This is a beautiful morning', 'a', :separator => nil)
+ end
+
+ end
+
+end
diff --git a/spec/integration/xapian_search_highlighting_spec.rb b/spec/integration/xapian_search_highlighting_spec.rb
new file mode 100644
index 000000000..65a34cf91
--- /dev/null
+++ b/spec/integration/xapian_search_highlighting_spec.rb
@@ -0,0 +1,39 @@
+require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
+
+describe 'highlighting search results' do
+ include HighlightHelper
+
+ it 'ignores stopwords' do
+ phrase = 'department of humpadinking'
+ search = ActsAsXapian::Search.new([PublicBody], phrase, :limit => 1)
+ matches = search.words_to_highlight(:regex => true)
+ highlight_matches(phrase, matches).should == '<mark>department</mark> of <mark>humpadinking</mark>'
+ end
+
+ it 'ignores case' do
+ search_phrase = 'department of humpadinking'
+ search = ActsAsXapian::Search.new([PublicBody], search_phrase, :limit => 1)
+ matches = search.words_to_highlight(:regex => true)
+ highlight_matches('Department of Humpadinking', matches).should == '<mark>Department</mark> of <mark>Humpadinking</mark>'
+ end
+
+ it 'highlights stemmed words' do
+ phrase = 'department'
+ search = ActsAsXapian::Search.new([PublicBody], phrase, :limit => 1)
+ matches = search.words_to_highlight(:regex => true)
+
+ search.words_to_highlight(:regex => false).should == ['depart']
+ highlight_matches(phrase, matches).should == '<mark>department</mark>'
+ end
+
+ it 'highlights stemmed words even if the stem is unhelpful' do
+ # Stemming returns 'bore' as the word to highlight which can't be
+ # matched in the original phrase.
+ phrase = 'boring'
+ search = ActsAsXapian::Search.new([PublicBody], phrase, :limit => 1)
+ matches = search.words_to_highlight(:regex => true, :include_original => true)
+
+ highlight_matches(phrase, matches).should == '<mark>boring</mark>'
+ end
+
+end
diff --git a/spec/models/xapian_spec.rb b/spec/models/xapian_spec.rb
index a1e060d8e..678e3a2dc 100644
--- a/spec/models/xapian_spec.rb
+++ b/spec/models/xapian_spec.rb
@@ -380,23 +380,63 @@ describe ActsAsXapian::Search, "#words_to_highlight" do
it "should return a list of words used in the search" do
s = ActsAsXapian::Search.new([PublicBody], "albatross words", :limit => 100)
- s.words_to_highlight.should == ["albatross", "words"]
+ s.words_to_highlight.should == ["albatross", "word"]
end
it "should remove any operators" do
s = ActsAsXapian::Search.new([PublicBody], "albatross words tag:mice", :limit => 100)
- s.words_to_highlight.should == ["albatross", "words"]
+ s.words_to_highlight.should == ["albatross", "word"]
end
- # This is the current behaviour but it seems a little simplistic to me
it "should separate punctuation" do
s = ActsAsXapian::Search.new([PublicBody], "The doctor's patient", :limit => 100)
- s.words_to_highlight.should == ["The", "doctor", "s", "patient"]
+ s.words_to_highlight.should == ["the", "doctor", "patient"].sort
end
it "should handle non-ascii characters" do
s = ActsAsXapian::Search.new([PublicBody], "adatigénylés words tag:mice", :limit => 100)
- s.words_to_highlight.should == ["adatigénylés", "words"]
+ s.words_to_highlight.should == ["adatigénylé", "word"]
+ end
+
+ it "should ignore stopwords" do
+ s = ActsAsXapian::Search.new([PublicBody], "department of humpadinking", :limit => 100)
+ s.words_to_highlight.should_not include('of')
+ end
+
+ it "uses stemming" do
+ s = ActsAsXapian::Search.new([PublicBody], 'department of humpadinking', :limit => 100)
+ s.words_to_highlight.should == ["depart", "humpadink"]
+ end
+
+ it "doesn't stem proper nouns" do
+ s = ActsAsXapian::Search.new([PublicBody], 'department of Humpadinking', :limit => 1)
+ s.words_to_highlight.should == ["depart", "humpadinking"]
+ end
+
+ it "includes the original search terms if requested" do
+ s = ActsAsXapian::Search.new([PublicBody], 'boring', :limit => 1)
+ s.words_to_highlight(:include_original => true).should == ['bore', 'boring']
+ end
+
+ it "does not return duplicate terms" do
+ s = ActsAsXapian::Search.new([PublicBody], 'boring boring', :limit => 1)
+ s.words_to_highlight.should == ['bore']
+ end
+
+ context 'the :regex option' do
+
+ it 'wraps each words in a regex that matches the full word' do
+ expected = [/\b(albatross)\b/iu]
+ s = ActsAsXapian::Search.new([PublicBody], 'Albatross', :limit => 1)
+ s.words_to_highlight(:regex => true).should == expected
+ end
+
+ it 'wraps each stem in a regex' do
+ expected = [/\b(depart)\w*\b/iu]
+ s = ActsAsXapian::Search.new([PublicBody], 'department', :limit => 1)
+ s.words_to_highlight(:regex => true).should == expected
+ end
+
end
end