11 files changed, 494 insertions, 41 deletions
diff --git a/app/controllers/general_controller.rb b/app/controllers/general_controller.rb
index 28055ddbf..158492eb2 100644
--- a/app/controllers/general_controller.rb
+++ b/app/controllers/general_controller.rb
@@ -159,7 +159,7 @@ class GeneralController < ApplicationController
         end
 
         # Spelling and highight words are same for all three queries
-        @highlight_words = @request_for_spelling.words_to_highlight
+        @highlight_words = @request_for_spelling.words_to_highlight(:regex => true, :include_original => true)
         if !(@request_for_spelling.spelling_correction =~ /[a-z]+:/)
             @spelling_correction = @request_for_spelling.spelling_correction
         end
diff --git a/app/controllers/track_controller.rb b/app/controllers/track_controller.rb
index c15fb573d..83700a55b 100644
--- a/app/controllers/track_controller.rb
+++ b/app/controllers/track_controller.rb
@@ -154,7 +154,15 @@ class TrackController < ApplicationController
         request.format = 'xml' unless params[:format]
         respond_to do |format|
             format.json { render :json => @xapian_object.results.map { |r| r[:model].json_for_api(true,
-                    lambda { |t| view_context.highlight_and_excerpt(t, @xapian_object.words_to_highlight, 150) }
+                    lambda do |t|
+                        view_context.highlight_and_excerpt(
+                            t,
+                            @xapian_object.words_to_highlight(
+                                :regex => true,
+                                :include_original => true),
+                            150
+                        )
+                    end
                 ) } }
             format.any { render :template => 'track/atom_feed',
                                 :formats => ['atom'],
diff --git a/app/helpers/application_helper.rb b/app/helpers/application_helper.rb
index 45b042354..49ce94951 100644
--- a/app/helpers/application_helper.rb
+++ b/app/helpers/application_helper.rb
@@ -22,6 +22,9 @@ module ApplicationHelper
     # Useful for sending emails
     include MailerHelper
 
+    # Extra highlight helpers
+    include HighlightHelper
+
     # Copied from error_messages_for in active_record_helper.rb
     def foi_error_messages_for(*params)
         options = params.last.is_a?(Hash) ? params.pop.symbolize_keys : {}
@@ -54,25 +57,6 @@ module ApplicationHelper
         end
     end
 
-    # Highlight words, also escapes HTML (other than spans that we add)
-    def highlight_words(t, words, html = true)
-        if html
-            highlight(h(t), words, :highlighter => '<span class="highlight">\1</span>').html_safe
-        else
-            highlight(t, words, :highlighter => '*\1*')
-        end
-    end
-
-    def highlight_and_excerpt(t, words, excount, html = true)
-        newt = excerpt(t, words[0], :radius => excount)
-        if not newt
-            newt = excerpt(t, '', :radius => excount)
-        end
-        t = newt
-        t = highlight_words(t, words, html)
-        return t
-    end
-
     def locale_name(locale)
         return LanguageNames::get_language_name(locale)
     end
diff --git a/app/helpers/highlight_helper.rb b/app/helpers/highlight_helper.rb
new file mode 100644
index 000000000..a98f6f320
--- /dev/null
+++ b/app/helpers/highlight_helper.rb
@@ -0,0 +1,98 @@
+module HighlightHelper
+    include ERB::Util
+
+    # Implementation of rails' highlight that allows regex to be passed to
+    # the phrases parameter.
+    # https://github.com/rails/rails/pull/11793
+    # text    - String to search; sanitized first unless :sanitize => false
+    # phrases - a String, a Regexp, or an Array of either
+    # options - :highlighter (replacement, default '<mark>\1</mark>'), :sanitize;
+    #           a block, if given, is yielded each match to build the replacement
+    # Returns an html_safe String. Matches inside HTML tags are left alone.
+    def highlight_matches(text, phrases, options = {})
+        text = ActionController::Base.helpers.sanitize(text).try(:html_safe) if options.fetch(:sanitize, true)
+
+        # nil has no html_safe method, so a nil text comes back as ''
+        return (text || '').html_safe if text.blank? || phrases.blank?
+
+        alternatives = Array(phrases).map do |phrase|
+            Regexp === phrase ? phrase.to_s : Regexp.escape(phrase)
+        end
+        # The lookahead skips matches that sit inside an HTML tag
+        pattern = /(#{alternatives.join('|')})(?![^<]*?>)/i
+
+        highlighted = if block_given?
+            text.gsub(pattern) { |found| yield found }
+        else
+            text.gsub(pattern, options.fetch(:highlighter, '<mark>\1</mark>'))
+        end
+        highlighted.html_safe
+    end
+
+    # Highlight words, also escapes HTML (other than spans that we add)
+    # When html is false the matches are wrapped in asterisks instead.
+    def highlight_words(t, words, html = true)
+        if html
+            highlight_matches(h(t), words, :highlighter => '<span class="highlight">\1</span>')
+        else
+            highlight_matches(t, words, :highlighter => '*\1*')
+        end
+    end
+
+    # Excerpt t around the first of words (or, when that does not occur in t,
+    # from the start of t), then highlight every one of words in the excerpt.
+    def highlight_and_excerpt(t, words, excount, html = true)
+        excerpted = excerpt(t, words[0], :radius => excount) ||
+                    excerpt(t, '', :radius => excount)
+        highlight_words(excerpted, words, html)
+    end
+
+    # Extract the text surrounding the first match of phrase (a Regexp, or a
+    # String matched case-insensitively). Returns nil when nothing matches.
+    # options - :radius (default 100), :omission (default "...") and :separator
+    #           (when given, the radius counts separator-delimited chunks)
+    def excerpt(text, phrase, options = {})
+        return unless text && phrase
+
+        separator = options.fetch(:separator, nil) || ""
+        regex = phrase.is_a?(Regexp) ? phrase : /#{Regexp.escape(phrase)}/i
+
+        matches = text.match(regex)
+        return unless matches
+        phrase = matches[0]
+
+        # With a separator, centre on the whole chunk containing the match
+        unless separator.empty?
+            chunk = text.split(separator).find { |value| value.match(regex) }
+            phrase = chunk if chunk
+        end
+
+        first_part, second_part = text.split(phrase, 2)
+
+        prefix, first_part   = cut_excerpt_part(:first, first_part, separator, options)
+        postfix, second_part = cut_excerpt_part(:second, second_part, separator, options)
+
+        affix = [first_part, separator, phrase, separator, second_part].join.strip
+        [prefix, affix, postfix].join
+    end
+
+    private
+
+    # Trim one side of an excerpt to at most :radius pieces. Returns the
+    # omission marker ("" if nothing was trimmed) and the remaining text.
+    def cut_excerpt_part(part_position, part, separator, options)
+        return "", "" unless part
+
+        radius   = options.fetch(:radius, 100)
+        omission = options.fetch(:omission, "...")
+
+        pieces = part.split(separator)
+        pieces.delete("")
+        affix = pieces.size > radius ? omission : ""
+
+        # Keep the pieces nearest the phrase (tail before it, head after it)
+        kept = part_position == :first ? pieces.last(radius) : pieces.first(radius)
+
+        return affix, kept.join(separator)
+    end
+end
diff --git a/app/views/track_mailer/event_digest.text.erb b/app/views/track_mailer/event_digest.text.erb
index a154f430f..f6e699e41 100644
--- a/app/views/track_mailer/event_digest.text.erb
+++ b/app/views/track_mailer/event_digest.text.erb
@@ -4,7 +4,7 @@
     for track_thing, alert_results, xapian_object in @email_about_things
         main_text += track_thing.params[:title_in_email] + "\n"
         main_text += ("=" * track_thing.params[:title_in_email].size) + "\n\n"
-        @highlight_words = xapian_object.words_to_highlight
+        @highlight_words = xapian_object.words_to_highlight(:regex => true)
         for result in alert_results.reverse
             if result[:model].class.to_s == "InfoRequestEvent"
                 event = result[:model]
diff --git a/lib/acts_as_xapian/acts_as_xapian.rb b/lib/acts_as_xapian/acts_as_xapian.rb
index 168d2eec3..6520a20a4 100644
--- a/lib/acts_as_xapian/acts_as_xapian.rb
+++ b/lib/acts_as_xapian/acts_as_xapian.rb
@@ -21,6 +21,20 @@ rescue LoadError
     $acts_as_xapian_bindings_available = false
 end
 
+module Xapian
+    class QueryParser
+        # Collects the term of every item between unstem_begin(term) and
+        # unstem_end(term) and returns them as an Array.
+        def unstem(term)
+            [].tap do |originals|
+                Xapian._safelyIterate(unstem_begin(term), unstem_end(term)) do |item|
+                    originals << item.term
+                end
+            end
+        end
+    end
+end
+
 module ActsAsXapian
     ######################################################################
     # Module level variables
@@ -472,16 +486,42 @@ module ActsAsXapian
         # Return just normal words in the query i.e. Not operators, ones in
         # date ranges or similar. Use this for cheap highlighting with
         # TextHelper::highlight, and excerpt.
-        def words_to_highlight
-            # TODO: In Ruby 1.9 we can do matching of any unicode letter with \p{L}
-            # But we still need to support ruby 1.8 for the time being so...
-            query_nopunc = self.query_string.gsub(/[^ёЁа-яА-Яa-zA-Zà-üÀ-Ü0-9:\.\/_]/iu, " ")
-            query_nopunc = query_nopunc.gsub(/\s+/, " ")
-            words = query_nopunc.split(" ")
-            # Remove anything with a :, . or / in it
-            words = words.find_all {|o| !o.match(/(:|\.|\/)/) }
-            words = words.find_all {|o| !o.match(/^(AND|NOT|OR|XOR)$/) }
-            return words
+        def words_to_highlight(opts = {})
+            # Supported options (both default to false):
+            #   :include_original - also return the unstemmed words of the query
+            #   :regex            - return case-insensitive Regexps, not Strings
+            opts = { :include_original => false, :regex => false }.merge(opts)
+
+            # Terms are tagged as UTF-8 *before* being interpolated into the /u
+            # patterns below, not only in the final result; force_encoding does
+            # not exist on Ruby 1.8, where the term is used untouched.
+            utf8 = lambda do |term|
+                term.respond_to?(:force_encoding) ? term.force_encoding('UTF-8') : term
+            end
+
+            # Reject all prefixes other than Z, which we know is reserved for stems
+            terms = query.terms.map { |t| t.term }.reject { |t| t =~ /\A[A-Y]/ }
+            # Separate the Z-prefixed stems (chopping the prefix off) from the rest
+            raw_stems, words = terms.partition { |t| t.start_with?('Z') }
+            raw_stems = raw_stems.uniq.sort
+            words = words.sort
+            stems = raw_stems.map { |t| t[1..-1] }
+
+            # Add the unstemmed words from the original query
+            # Sometimes stems can be unhelpful with the :regex option, for example
+            # stemming 'boring' results in us trying to highlight 'bore'.
+            if opts[:include_original]
+                raw_stems.each { |s| words.concat(ActsAsXapian.query_parser.unstem(s)) }
+                words.uniq!
+            end
+
+            # Terms are escaped so that any regex metacharacters match literally
+            if opts[:regex]
+                stems.map! { |w| /\b(#{ Regexp.escape(utf8.call(w)) })\w*\b/iu }
+                words.map! { |w| /\b(#{ Regexp.escape(utf8.call(w)) })\b/iu }
+            end
+
+            (stems + words).map { |t| t.is_a?(String) ? utf8.call(t) : t }
+        end
 
         # Text for lines in log file
@@ -975,5 +1015,3 @@ end
 
 # Reopen ActiveRecord and include the acts_as_xapian method
 ActiveRecord::Base.extend ActsAsXapian::ActsMethods
-
-
diff --git a/spec/controllers/general_controller_spec.rb b/spec/controllers/general_controller_spec.rb
index 7590a5b42..c0a9d57d3 100644
--- a/spec/controllers/general_controller_spec.rb
+++ b/spec/controllers/general_controller_spec.rb
@@ -188,7 +188,7 @@ describe GeneralController, 'when using xapian search' do
 
     it 'should highlight words for a user-only request' do
       get :search, :combined => "bob/users"
-      assigns[:highlight_words].should == ['bob']
+      assigns[:highlight_words].should == [/\b(bob)\w*\b/iu,  /\b(bob)\b/iu]
     end
 
     it 'should show spelling corrections for a user-only request' do
diff --git a/spec/controllers/request_controller_spec.rb b/spec/controllers/request_controller_spec.rb
index 48f37a45c..f7c935af3 100644
--- a/spec/controllers/request_controller_spec.rb
+++ b/spec/controllers/request_controller_spec.rb
@@ -923,7 +923,6 @@ describe RequestController, "when searching for an authority" do
     end
 
     it "should return matching bodies" do
-
         session[:user_id] = @user.id
         get :select_authority, :query => "Quango"
 
diff --git a/spec/helpers/highlight_helper_spec.rb b/spec/helpers/highlight_helper_spec.rb
new file mode 100644
index 000000000..e1be7e153
--- /dev/null
+++ b/spec/helpers/highlight_helper_spec.rb
@@ -0,0 +1,247 @@
+require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
+
+describe HighlightHelper do
+  include HighlightHelper
+
+  describe :highlight_and_excerpt do
+
+      it 'excerpts text and highlights phrases' do
+          text = "Quentin Nobble-Boston, Permanent Under-Secretary, Department for Humpadinking"
+          phrases = ['humpadinking']
+          expected = '...Department for <span class="highlight">Humpadinking</span>'
+          highlight_and_excerpt(text, phrases, 15).should == expected
+      end
+
+      it 'excerpts text and highlights matches' do
+          text = "Quentin Nobble-Boston, Permanent Under-Secretary, Department for Humpadinking"
+          matches = [/\bhumpadink\w*\b/iu]
+          expected = '...Department for <span class="highlight">Humpadinking</span>'
+          highlight_and_excerpt(text, matches, 15).should == expected
+      end
+
+      context 'multiple matches' do
+
+          it 'highlights multiple matches' do
+              text = <<-EOF
+Quentin Nobble-Boston, Permanent Under-Secretary, Department for Humpadinking
+decided to visit Humpadink so that he could be with the Humpadinks
+EOF
+
+              expected = <<-EOF
+Quentin Nobble-Boston, Permanent Under-Secretary, Department for <span class="highlight">Humpadinking</span>
+decided to visit <span class="highlight">Humpadink</span> so that he could be with the <span class="highlight">Humpadinks</span>
+EOF
+              text.chomp!
+              expected.chomp!
+              matches = [/\b(humpadink\w*)\b/iu]
+              highlight_and_excerpt(text, matches, 1000).should == expected
+          end
+
+          it 'bases the split on the first match' do
+              # Each fragment ends in a space so the joined text reads as prose
+              text = "Quentin Nobble-Boston, Permanent Under-Secretary, " \
+                     "Department for Humpadinking decided to visit Humpadink " \
+                     "so that he could be with the Humpadinks"
+
+              expected = "...Department for <span class=\"highlight\">" \
+                         "Humpadinking</span> decided to vis..."
+              matches = [/\b(humpadink\w*)\b/iu]
+              highlight_and_excerpt(text, matches, 15).should == expected
+          end
+
+      end
+
+  end
+
+  describe :highlight_matches do
+
+      it 'highlights' do
+          assert_equal(
+            "This is a <mark>beautiful</mark> morning",
+            highlight_matches("This is a beautiful morning", "beautiful")
+          )
+
+          assert_equal(
+            "This is a <mark>beautiful</mark> morning, but also a <mark>beautiful</mark> day",
+            highlight_matches("This is a beautiful morning, but also a beautiful day", "beautiful")
+          )
+
+          assert_equal(
+            "This is a <b>beautiful</b> morning, but also a <b>beautiful</b> day",
+            highlight_matches("This is a beautiful morning, but also a beautiful day", "beautiful", :highlighter => '<b>\1</b>')
+          )
+
+          assert_equal(
+            "This text is not changed because we supplied an empty phrase",
+            highlight_matches("This text is not changed because we supplied an empty phrase", nil)
+          )
+
+          assert_equal '   ', highlight_matches('   ', 'blank text is returned verbatim')
+        end
+
+        it 'sanitizes input' do
+          assert_equal(
+            "This is a <mark>beautiful</mark> morning",
+            highlight_matches("This is a beautiful morning<script>code!</script>", "beautiful")
+          )
+        end
+
+        it 'doesnt sanitize when the sanitize option is false' do
+          assert_equal(
+            "This is a <mark>beautiful</mark> morning<script>code!</script>",
+            highlight_matches("This is a beautiful morning<script>code!</script>", "beautiful", :sanitize => false)
+          )
+        end
+
+        it 'highlights using regexp' do
+          assert_equal(
+            "This is a <mark>beautiful!</mark> morning",
+            highlight_matches("This is a beautiful! morning", "beautiful!")
+          )
+
+          assert_equal(
+            "This is a <mark>beautiful! morning</mark>",
+            highlight_matches("This is a beautiful! morning", "beautiful! morning")
+          )
+
+          assert_equal(
+            "This is a <mark>beautiful? morning</mark>",
+            highlight_matches("This is a beautiful? morning", "beautiful? morning")
+          )
+        end
+
+        it 'accepts regex' do
+          assert_equal("This day was challenging for judge <mark>Allen</mark> and his colleagues.",
+                       highlight_matches("This day was challenging for judge Allen and his colleagues.", /\ballen\b/i))
+        end
+
+        it 'highlights multiple phrases in one pass' do
+          assert_equal %(<em>wow</em> <em>em</em>), highlight_matches('wow em', %w(wow em), :highlighter => '<em>\1</em>')
+        end
+
+        it 'highlights with html' do
+          assert_equal(
+            "<p>This is a <mark>beautiful</mark> morning, but also a <mark>beautiful</mark> day</p>",
+            highlight_matches("<p>This is a beautiful morning, but also a beautiful day</p>", "beautiful")
+          )
+          assert_equal(
+            "<p>This is a <em><mark>beautiful</mark></em> morning, but also a <mark>beautiful</mark> day</p>",
+            highlight_matches("<p>This is a <em>beautiful</em> morning, but also a beautiful day</p>", "beautiful")
+          )
+          assert_equal(
+            "<p>This is a <em class=\"error\"><mark>beautiful</mark></em> morning, but also a <mark>beautiful</mark> <span class=\"last\">day</span></p>",
+            highlight_matches("<p>This is a <em class=\"error\">beautiful</em> morning, but also a beautiful <span class=\"last\">day</span></p>", "beautiful")
+          )
+          assert_equal(
+            "<p class=\"beautiful\">This is a <mark>beautiful</mark> morning, but also a <mark>beautiful</mark> day</p>",
+            highlight_matches("<p class=\"beautiful\">This is a beautiful morning, but also a beautiful day</p>", "beautiful")
+          )
+          assert_equal(
+            "<p>This is a <mark>beautiful</mark> <a href=\"http://example.com/beautiful#top?what=beautiful%20morning&amp;when=now+then\">morning</a>, but also a <mark>beautiful</mark> day</p>",
+            highlight_matches("<p>This is a beautiful <a href=\"http://example.com/beautiful\#top?what=beautiful%20morning&when=now+then\">morning</a>, but also a beautiful day</p>", "beautiful")
+          )
+          assert_equal(
+            "<div>abc <b>div</b></div>",
+            highlight_matches("<div>abc div</div>", "div", :highlighter => '<b>\1</b>')
+          )
+        end
+
+        it 'doesnt modify the options hash' do
+            options = { :highlighter => '<b>\1</b>', :sanitize => false }
+            passed_options = options.dup
+            highlight_matches("<div>abc div</div>", "div", passed_options)
+            assert_equal options, passed_options
+        end
+
+        it 'highlights with a block' do
+            assert_equal(
+                "<b>one</b> <b>two</b> <b>three</b>",
+                highlight_matches("one two three", ["one", "two", "three"]) { |word| "<b>#{word}</b>" }
+            )
+        end
+
+    end
+
+    describe :excerpt do
+
+      it 'excerpts' do
+        assert_equal("...is a beautiful morn...", excerpt("This is a beautiful morning", "beautiful", :radius => 5))
+        assert_equal("This is a...", excerpt("This is a beautiful morning", "this", :radius => 5))
+        assert_equal("...iful morning", excerpt("This is a beautiful morning", "morning", :radius => 5))
+        assert_nil excerpt("This is a beautiful morning", "day")
+      end
+
+      it 'is not html safe' do
+        assert !excerpt('This is a beautiful! morning', 'beautiful', :radius => 5).html_safe?
+      end
+
+      it 'excerpts borderline cases' do
+        assert_equal("", excerpt("", "", :radius => 0))
+        assert_equal("a", excerpt("a", "a", :radius => 0))
+        assert_equal("...b...", excerpt("abc", "b", :radius => 0))
+        assert_equal("abc", excerpt("abc", "b", :radius => 1))
+        assert_equal("abc...", excerpt("abcd", "b", :radius => 1))
+        assert_equal("...abc", excerpt("zabc", "b", :radius => 1))
+        assert_equal("...abc...", excerpt("zabcd", "b", :radius => 1))
+        assert_equal("zabcd", excerpt("zabcd", "b", :radius => 2))
+
+        # excerpt strips the resulting string before ap-/prepending excerpt_string.
+        # whether this behavior is meaningful when excerpt_string is not to be
+        # appended is questionable.
+        assert_equal("zabcd", excerpt("  zabcd  ", "b", :radius => 4))
+        assert_equal("...abc...", excerpt("z  abc  d", "b", :radius => 1))
+      end
+
+      it 'excerpts with regex' do
+        assert_equal('...is a beautiful! mor...', excerpt('This is a beautiful! morning', 'beautiful', :radius => 5))
+        assert_equal('...is a beautiful? mor...', excerpt('This is a beautiful? morning', 'beautiful', :radius => 5))
+        assert_equal('...is a beautiful? mor...', excerpt('This is a beautiful? morning', /\bbeau\w*\b/i, :radius => 5))
+        assert_equal('...is a beautiful? mor...', excerpt('This is a beautiful? morning', /\b(beau\w*)\b/i, :radius => 5))
+        assert_equal("...udge Allen and...", excerpt("This day was challenging for judge Allen and his colleagues.", /\ballen\b/i, :radius => 5))
+        assert_equal("...judge Allen and...", excerpt("This day was challenging for judge Allen and his colleagues.", /\ballen\b/i, :radius => 1, :separator => ' '))
+        assert_equal("...was challenging for...", excerpt("This day was challenging for judge Allen and his colleagues.", /\b(\w*allen\w*)\b/i, :radius => 5))
+      end
+
+      it 'excerpts with omission' do
+        assert_equal("[...]is a beautiful morn[...]", excerpt("This is a beautiful morning", "beautiful", :omission => "[...]",:radius => 5))
+        assert_equal(
+          "This is the ultimate supercalifragilisticexpialidoceous very looooooooooooooooooong looooooooooooong beautiful morning with amazing sunshine and awesome tempera[...]",
+          excerpt("This is the ultimate supercalifragilisticexpialidoceous very looooooooooooooooooong looooooooooooong beautiful morning with amazing sunshine and awesome temperatures. So what are you gonna do about it?", "very",
+          :omission => "[...]")
+        )
+      end
+
+      it 'excerpts with utf8' do
+          if RUBY_VERSION.to_f >= 1.9
+              assert_equal("...\357\254\203ciency could not be...".force_encoding(Encoding::UTF_8), excerpt("That's why e\357\254\203ciency could not be helped".force_encoding(Encoding::UTF_8), 'could', :radius => 8))
+          else
+              assert_equal("...\357\254\203ciency could not be...", excerpt("That's why e\357\254\203ciency could not be helped", 'could', :radius => 8))
+          end
+      end
+
+      it 'doesnt modify the options hash' do
+        options = { :omission => "[...]",:radius => 5 }
+        passed_options = options.dup
+        excerpt("This is a beautiful morning", "beautiful", passed_options)
+        assert_equal options, passed_options
+      end
+
+      it 'excerpts with separator' do
+        options = { :separator => ' ', :radius => 1 }
+        assert_equal('...a very beautiful...', excerpt('This is a very beautiful morning', 'very', options))
+        assert_equal('This is...', excerpt('This is a very beautiful morning', 'this', options))
+        assert_equal('...beautiful morning', excerpt('This is a very beautiful morning', 'morning', options))
+
+        options = { :separator => "\n", :radius => 0 }
+        assert_equal("...very long...", excerpt("my very\nvery\nvery long\nstring", 'long', options))
+
+        options = { :separator => "\n", :radius => 1 }
+        assert_equal("...very\nvery long\nstring", excerpt("my very\nvery\nvery long\nstring", 'long', options))
+
+        assert_equal excerpt('This is a beautiful morning', 'a'),
+                     excerpt('This is a beautiful morning', 'a', :separator => nil)
+      end
+
+    end
+
+end
diff --git a/spec/integration/xapian_search_highlighting_spec.rb b/spec/integration/xapian_search_highlighting_spec.rb
new file mode 100644
index 000000000..65a34cf91
--- /dev/null
+++ b/spec/integration/xapian_search_highlighting_spec.rb
@@ -0,0 +1,39 @@
+require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
+
+describe 'highlighting search results' do
+    include HighlightHelper
+
+    it 'ignores stopwords' do
+        phrase = 'department of humpadinking'
+        search = ActsAsXapian::Search.new([PublicBody], phrase, :limit => 1)
+        matches = search.words_to_highlight(:regex => true)
+        highlight_matches(phrase, matches).should == '<mark>department</mark> of <mark>humpadinking</mark>'
+    end
+
+    it 'ignores case' do
+        search_phrase = 'department of humpadinking'
+        search = ActsAsXapian::Search.new([PublicBody], search_phrase, :limit => 1)
+        matches = search.words_to_highlight(:regex => true)
+        highlight_matches('Department of Humpadinking', matches).should == '<mark>Department</mark> of <mark>Humpadinking</mark>'
+    end
+
+    it 'highlights stemmed words' do
+        phrase = 'department'
+        search = ActsAsXapian::Search.new([PublicBody], phrase, :limit => 1)
+        matches = search.words_to_highlight(:regex => true)
+
+        search.words_to_highlight(:regex => false).should == ['depart']
+        highlight_matches(phrase, matches).should == '<mark>department</mark>'
+    end
+
+    it 'highlights stemmed words even if the stem is unhelpful' do
+        # Stemming returns 'bore' as the word to highlight which can't be
+        # matched in the original phrase.
+        phrase = 'boring'
+        search = ActsAsXapian::Search.new([PublicBody], phrase, :limit => 1)
+        matches = search.words_to_highlight(:regex => true, :include_original => true)
+
+        highlight_matches(phrase, matches).should == '<mark>boring</mark>'
+    end
+
+end
diff --git a/spec/models/xapian_spec.rb b/spec/models/xapian_spec.rb
index a1e060d8e..678e3a2dc 100644
--- a/spec/models/xapian_spec.rb
+++ b/spec/models/xapian_spec.rb
@@ -380,23 +380,63 @@ describe ActsAsXapian::Search, "#words_to_highlight" do
 
     it "should return a list of words used in the search" do
         s = ActsAsXapian::Search.new([PublicBody], "albatross words", :limit => 100)
-        s.words_to_highlight.should == ["albatross", "words"]
+        s.words_to_highlight.should == ["albatross", "word"]
     end
 
     it "should remove any operators" do
         s = ActsAsXapian::Search.new([PublicBody], "albatross words tag:mice", :limit => 100)
-        s.words_to_highlight.should == ["albatross", "words"]
+        s.words_to_highlight.should == ["albatross", "word"]
     end
 
-    # This is the current behaviour but it seems a little simplistic to me
     it "should separate punctuation" do
         s = ActsAsXapian::Search.new([PublicBody], "The doctor's patient", :limit => 100)
-        s.words_to_highlight.should == ["The", "doctor", "s", "patient"] 
+        s.words_to_highlight.should == ["the", "doctor", "patient"].sort
     end
 
     it "should handle non-ascii characters" do
         s = ActsAsXapian::Search.new([PublicBody], "adatigénylés words tag:mice", :limit => 100)
-        s.words_to_highlight.should == ["adatigénylés", "words"]
+        s.words_to_highlight.should == ["adatigénylé", "word"]
+    end
+
+    it "should ignore stopwords" do
+        s = ActsAsXapian::Search.new([PublicBody], "department of humpadinking", :limit => 100)
+        s.words_to_highlight.should_not include('of')
+    end
+
+    it "uses stemming" do
+        s = ActsAsXapian::Search.new([PublicBody], 'department of humpadinking', :limit => 100)
+        s.words_to_highlight.should == ["depart", "humpadink"]
+    end
+
+    it "doesn't stem proper nouns" do
+        s = ActsAsXapian::Search.new([PublicBody], 'department of Humpadinking', :limit => 1)
+        s.words_to_highlight.should == ["depart", "humpadinking"]
+    end
+
+    it "includes the original search terms if requested" do
+        s = ActsAsXapian::Search.new([PublicBody], 'boring', :limit => 1)
+        s.words_to_highlight(:include_original => true).should == ['bore', 'boring']
+    end
+
+    it "does not return duplicate terms" do
+        s = ActsAsXapian::Search.new([PublicBody], 'boring boring', :limit => 1)
+        s.words_to_highlight.should == ['bore']
+    end
+
+    context 'the :regex option' do
+
+        it 'wraps each words in a regex that matches the full word' do
+            expected = [/\b(albatross)\b/iu]
+            s = ActsAsXapian::Search.new([PublicBody], 'Albatross', :limit => 1)
+            s.words_to_highlight(:regex => true).should == expected
+        end
+
+        it 'wraps each stem in a regex' do
+            expected = [/\b(depart)\w*\b/iu]
+            s = ActsAsXapian::Search.new([PublicBody], 'department', :limit => 1)
+            s.words_to_highlight(:regex => true).should == expected
+        end
+
     end
 
 end