diff options
author | Gareth Rees <gareth@mysociety.org> | 2014-05-30 12:42:48 +0100 |
---|---|---|
committer | Gareth Rees <gareth@mysociety.org> | 2014-06-25 10:40:37 +0100 |
commit | e5a73815f580d296572e11b71b5f3ed320bbe912 (patch) | |
tree | a6afe88e876933f15a65e9f948e195dc5b79cf5e | |
parent | b429a1a2fc2a3b73322e6361fb27ed0a5c3ace7f (diff) |
Add helper to highlight and excerpt by regex
Backport of https://github.com/rails/rails/pull/11793/
Contains integration tests to check that it works
as expected with ActsAsXapian.
-rw-r--r-- | app/helpers/application_helper.rb | 22 |
-rw-r--r-- | app/helpers/highlight_helper.rb | 92 |
-rw-r--r-- | spec/helpers/highlight_helper_spec.rb | 189 |
-rw-r--r-- | spec/integration/xapian_search_highlighting_spec.rb | 29 |
4 files changed, 313 insertions, 19 deletions
diff --git a/app/helpers/application_helper.rb b/app/helpers/application_helper.rb index 45b042354..49ce94951 100644 --- a/app/helpers/application_helper.rb +++ b/app/helpers/application_helper.rb @@ -22,6 +22,9 @@ module ApplicationHelper # Useful for sending emails include MailerHelper + # Extra highlight helpers + include HighlightHelper + # Copied from error_messages_for in active_record_helper.rb def foi_error_messages_for(*params) options = params.last.is_a?(Hash) ? params.pop.symbolize_keys : {} @@ -54,25 +57,6 @@ module ApplicationHelper end end - # Highlight words, also escapes HTML (other than spans that we add) - def highlight_words(t, words, html = true) - if html - highlight(h(t), words, :highlighter => '<span class="highlight">\1</span>').html_safe - else - highlight(t, words, :highlighter => '*\1*') - end - end - - def highlight_and_excerpt(t, words, excount, html = true) - newt = excerpt(t, words[0], :radius => excount) - if not newt - newt = excerpt(t, '', :radius => excount) - end - t = newt - t = highlight_words(t, words, html) - return t - end - def locale_name(locale) return LanguageNames::get_language_name(locale) end diff --git a/app/helpers/highlight_helper.rb b/app/helpers/highlight_helper.rb new file mode 100644 index 000000000..63809aff5 --- /dev/null +++ b/app/helpers/highlight_helper.rb @@ -0,0 +1,92 @@ +module HighlightHelper + + # Implementation of rails' highlight that allows regex to be passed to + # the phrases parameter. + # https://github.com/rails/rails/pull/11793 + def highlight_matches(text, phrases, options = {}) + text = ActionController::Base.helpers.sanitize(text).try(:html_safe) if options.fetch(:sanitize, true) + + if text.blank? || phrases.blank? + text + else + highlighter = options.fetch(:highlighter, '<mark>\1</mark>') + match = Array(phrases).map do |p| + Regexp === p ? 
p.to_s : Regexp.escape(p) + end.join('|') + text.gsub(/(#{match})(?![^<]*?>)/i, highlighter) + end.html_safe + end + + # Highlight words, also escapes HTML (other than spans that we add) + def highlight_words(t, words, html = true) + if html + highlight_matches(h(t), words, :highlighter => '<span class="highlight">\1</span>').html_safe + else + highlight_matches(t, words, :highlighter => '*\1*') + end + end + + def highlight_and_excerpt(t, words, excount, html = true) + newt = excerpt(t, words[0], :radius => excount) + if not newt + newt = excerpt(t, '', :radius => excount) + end + t = newt + t = highlight_words(t, words, html) + return t + end + + def excerpt(text, phrase, options = {}) + return unless text && phrase + + separator = options.fetch(:separator, nil) || "" + if Regexp === phrase + regex = phrase + else + phrase = Regexp.escape(phrase) + regex = /#{phrase}/iu + end + + return unless matches = text.match(regex) + phrase = matches[0] + + unless separator.empty? + text.split(separator).each do |value| + if value.match(regex) + regex = phrase = value + break + end + end + end + + first_part, second_part = text.split(regex, 2) + + prefix, first_part = cut_excerpt_part(:first, first_part, separator, options) + postfix, second_part = cut_excerpt_part(:second, second_part, separator, options) + + affix = [first_part, separator, phrase, separator, second_part].join.strip + [prefix, affix, postfix].join + end + + private + + def cut_excerpt_part(part_position, part, separator, options) + return "", "" unless part + + radius = options.fetch(:radius, 100) + omission = options.fetch(:omission, "...") + + part = part.split(separator) + part.delete("") + affix = part.size > radius ? 
omission : "" + + part = if part_position == :first + drop_index = [part.length - radius, 0].max + part.drop(drop_index) + else + part.first(radius) + end + + return affix, part.join(separator) + end +end diff --git a/spec/helpers/highlight_helper_spec.rb b/spec/helpers/highlight_helper_spec.rb new file mode 100644 index 000000000..bd0c62226 --- /dev/null +++ b/spec/helpers/highlight_helper_spec.rb @@ -0,0 +1,189 @@ +require File.expand_path(File.dirname(__FILE__) + '/../spec_helper') + +describe HighlightHelper do + + include HighlightHelper + + + describe :highlight_matches do + + it 'highlights' do + assert_equal( + "This is a <mark>beautiful</mark> morning", + highlight_matches("This is a beautiful morning", "beautiful") + ) + + assert_equal( + "This is a <mark>beautiful</mark> morning, but also a <mark>beautiful</mark> day", + highlight_matches("This is a beautiful morning, but also a beautiful day", "beautiful") + ) + + assert_equal( + "This is a <b>beautiful</b> morning, but also a <b>beautiful</b> day", + highlight_matches("This is a beautiful morning, but also a beautiful day", "beautiful", :highlighter => '<b>\1</b>') + ) + + assert_equal( + "This text is not changed because we supplied an empty phrase", + highlight_matches("This text is not changed because we supplied an empty phrase", nil) + ) + + assert_equal ' ', highlight_matches(' ', 'blank text is returned verbatim') + end + + it 'sanitizes input' do + assert_equal( + "This is a <mark>beautiful</mark> morning", + highlight_matches("This is a beautiful morning<script>code!</script>", "beautiful") + ) + end + + it 'doesnt sanitize when the sanitize option is false' do + assert_equal( + "This is a <mark>beautiful</mark> morning<script>code!</script>", + highlight_matches("This is a beautiful morning<script>code!</script>", "beautiful", :sanitize => false) + ) + end + + it 'highlights using regexp' do + assert_equal( + "This is a <mark>beautiful!</mark> morning", + highlight_matches("This is a 
beautiful! morning", "beautiful!") + ) + + assert_equal( + "This is a <mark>beautiful! morning</mark>", + highlight_matches("This is a beautiful! morning", "beautiful! morning") + ) + + assert_equal( + "This is a <mark>beautiful? morning</mark>", + highlight_matches("This is a beautiful? morning", "beautiful? morning") + ) + end + + it 'accepts regex' do + assert_equal("This day was challenging for judge <mark>Allen</mark> and his colleagues.", + highlight_matches("This day was challenging for judge Allen and his colleagues.", /\ballen\b/i)) + end + + it 'highlights multiple phrases in one pass' do + assert_equal %(<em>wow</em> <em>em</em>), highlight_matches('wow em', %w(wow em), :highlighter => '<em>\1</em>') + end + + it 'highlights with html' do + assert_equal( + "<p>This is a <mark>beautiful</mark> morning, but also a <mark>beautiful</mark> day</p>", + highlight_matches("<p>This is a beautiful morning, but also a beautiful day</p>", "beautiful") + ) + assert_equal( + "<p>This is a <em><mark>beautiful</mark></em> morning, but also a <mark>beautiful</mark> day</p>", + highlight_matches("<p>This is a <em>beautiful</em> morning, but also a beautiful day</p>", "beautiful") + ) + assert_equal( + "<p>This is a <em class=\"error\"><mark>beautiful</mark></em> morning, but also a <mark>beautiful</mark> <span class=\"last\">day</span></p>", + highlight_matches("<p>This is a <em class=\"error\">beautiful</em> morning, but also a beautiful <span class=\"last\">day</span></p>", "beautiful") + ) + assert_equal( + "<p class=\"beautiful\">This is a <mark>beautiful</mark> morning, but also a <mark>beautiful</mark> day</p>", + highlight_matches("<p class=\"beautiful\">This is a beautiful morning, but also a beautiful day</p>", "beautiful") + ) + assert_equal( + "<p>This is a <mark>beautiful</mark> <a href=\"http://example.com/beautiful#top?what=beautiful%20morning&when=now+then\">morning</a>, but also a <mark>beautiful</mark> day</p>", + highlight_matches("<p>This is a beautiful 
<a href=\"http://example.com/beautiful\#top?what=beautiful%20morning&when=now+then\">morning</a>, but also a beautiful day</p>", "beautiful") + ) + assert_equal( + "<div>abc <b>div</b></div>", + highlight_matches("<div>abc div</div>", "div", :highlighter => '<b>\1</b>') + ) + end + + it 'doesnt modify the options hash' do + options = { :highlighter => '<b>\1</b>', :sanitize => false } + passed_options = options.dup + highlight_matches("<div>abc div</div>", "div", passed_options) + assert_equal options, passed_options + end + + end + + describe :excerpt do + + it 'excerpts' do + assert_equal("...is a beautiful morn...", excerpt("This is a beautiful morning", "beautiful", :radius => 5)) + assert_equal("This is a...", excerpt("This is a beautiful morning", "this", :radius => 5)) + assert_equal("...iful morning", excerpt("This is a beautiful morning", "morning", :radius => 5)) + assert_equal("...udge Allen and...", excerpt("This day was challenging for judge Allen and his colleagues.", /\ballen\b/i, :radius => 5)) + assert_equal("...judge Allen and...", excerpt("This day was challenging for judge Allen and his colleagues.", /\ballen\b/i, :radius => 1, :separator => ' ')) + assert_nil excerpt("This is a beautiful morning", "day") + end + + it 'is not html safe' do + assert !excerpt('This is a beautiful! morning', 'beautiful', :radius => 5).html_safe? + end + + it 'excerpts borderline cases' do + assert_equal("", excerpt("", "", :radius => 0)) + assert_equal("a", excerpt("a", "a", :radius => 0)) + assert_equal("...b...", excerpt("abc", "b", :radius => 0)) + assert_equal("abc", excerpt("abc", "b", :radius => 1)) + assert_equal("abc...", excerpt("abcd", "b", :radius => 1)) + assert_equal("...abc", excerpt("zabc", "b", :radius => 1)) + assert_equal("...abc...", excerpt("zabcd", "b", :radius => 1)) + assert_equal("zabcd", excerpt("zabcd", "b", :radius => 2)) + + # excerpt strips the resulting string before ap-/prepending excerpt_string. 
+ # whether this behavior is meaningful when excerpt_string is not to be + # appended is questionable. + assert_equal("zabcd", excerpt(" zabcd ", "b", :radius => 4)) + assert_equal("...abc...", excerpt("z abc d", "b", :radius => 1)) + end + + it 'excerpts with regex' do + assert_equal('...is a beautiful! mor...', excerpt('This is a beautiful! morning', 'beautiful', :radius => 5)) + assert_equal('...is a beautiful? mor...', excerpt('This is a beautiful? morning', 'beautiful', :radius => 5)) + end + + it 'excerpts with omission' do + assert_equal("[...]is a beautiful morn[...]", excerpt("This is a beautiful morning", "beautiful", :omission => "[...]",:radius => 5)) + assert_equal( + "This is the ultimate supercalifragilisticexpialidoceous very looooooooooooooooooong looooooooooooong beautiful morning with amazing sunshine and awesome tempera[...]", + excerpt("This is the ultimate supercalifragilisticexpialidoceous very looooooooooooooooooong looooooooooooong beautiful morning with amazing sunshine and awesome temperatures. 
So what are you gonna do about it?", "very", + :omission => "[...]") + ) + end + + it 'excerpts with utf8' do + if RUBY_VERSION.to_f >= 1.9 + assert_equal("...\357\254\203ciency could not be...".force_encoding(Encoding::UTF_8), excerpt("That's why e\357\254\203ciency could not be helped".force_encoding(Encoding::UTF_8), 'could', :radius => 8)) + else + assert_equal("...\357\254\203ciency could not be...", excerpt("That's why e\357\254\203ciency could not be helped", 'could', :radius => 8)) + end + end + + it 'doesnt modify the options hash' do + options = { :omission => "[...]",:radius => 5 } + passed_options = options.dup + excerpt("This is a beautiful morning", "beautiful", passed_options) + assert_equal options, passed_options + end + + it 'excerpts with separator' do + options = { :separator => ' ', :radius => 1 } + assert_equal('...a very beautiful...', excerpt('This is a very beautiful morning', 'very', options)) + assert_equal('This is...', excerpt('This is a very beautiful morning', 'this', options)) + assert_equal('...beautiful morning', excerpt('This is a very beautiful morning', 'morning', options)) + + options = { :separator => "\n", :radius => 0 } + assert_equal("...very long...", excerpt("my very\nvery\nvery long\nstring", 'long', options)) + + options = { :separator => "\n", :radius => 1 } + assert_equal("...very\nvery long\nstring", excerpt("my very\nvery\nvery long\nstring", 'long', options)) + + assert_equal excerpt('This is a beautiful morning', 'a'), + excerpt('This is a beautiful morning', 'a', :separator => nil) + end + + end + +end diff --git a/spec/integration/xapian_search_highlighting_spec.rb b/spec/integration/xapian_search_highlighting_spec.rb new file mode 100644 index 000000000..7bd64c995 --- /dev/null +++ b/spec/integration/xapian_search_highlighting_spec.rb @@ -0,0 +1,29 @@ +require File.expand_path(File.dirname(__FILE__) + '/../spec_helper') + +describe 'highlighting search results' do + include HighlightHelper + + it 'ignores 
stopwords' do + phrase = 'department of humpadinking' + search = ActsAsXapian::Search.new([PublicBody], phrase, :limit => 1) + matches = search.words_to_highlight(:regex => true) + highlight_matches(phrase, matches).should == '<mark>department</mark> of <mark>humpadinking</mark>' + end + + it 'ignores case' do + search_phrase = 'department of humpadinking' + search = ActsAsXapian::Search.new([PublicBody], search_phrase, :limit => 1) + matches = search.words_to_highlight(:regex => true) + highlight_matches('Department of Humpadinking', matches).should == '<mark>Department</mark> of <mark>Humpadinking</mark>' + end + + it 'highlights stemmed words' do + phrase = 'department' + search = ActsAsXapian::Search.new([PublicBody], phrase, :limit => 1) + matches = search.words_to_highlight(:regex => true) + + search.words_to_highlight(:regex => false).should == ['depart'] + highlight_matches(phrase, matches).should == '<mark>department</mark>' + end + +end |