diff options
-rw-r--r-- | lib/acts_as_xapian/acts_as_xapian.rb | 72 | ||||
-rw-r--r-- | spec/integration/xapian_search_highlighting_spec.rb | 11 |
2 files changed, 48 insertions, 35 deletions
diff --git a/lib/acts_as_xapian/acts_as_xapian.rb b/lib/acts_as_xapian/acts_as_xapian.rb index 48d0b0554..9194e8cc4 100644 --- a/lib/acts_as_xapian/acts_as_xapian.rb +++ b/lib/acts_as_xapian/acts_as_xapian.rb @@ -490,41 +490,37 @@ module ActsAsXapian # date ranges or similar. Use this for cheap highlighting with # TextHelper::highlight, and excerpt. def words_to_highlight(opts = {}) - default_opts = { :include_original => false, :regex => false } - opts = default_opts.merge(opts) - - # Reject all prefixes other than Z, which we know is reserved for stems - terms = query.terms.reject { |t| t.term.first.match(/^[A-Y]$/) } - # Collect the stems including the Z prefix - raw_stems = terms.map { |t| t.term if t.term.start_with?('Z') }.compact.uniq.sort - # Collect stems, chopping the Z prefix off - stems = raw_stems.map { |t| t[1..-1] }.compact.sort - # Collect the non-stem terms - words = terms.map { |t| t.term unless t.term.start_with?('Z') }.compact.sort - - # Add the unstemmed words from the original query - # Sometimes stems can be unhelpful with the :regex option, for example - # stemming 'boring' results in us trying to highlight 'bore'. - if opts[:include_original] - raw_stems.each do |raw_stem| - words << ActsAsXapian.query_parser.unstem(raw_stem).uniq - end - - words = words.any? ? words.flatten.uniq : [] - end - - if opts[:regex] - stems.map! { |w| /\b(#{ w })\w*\b/iu } - words.map! { |w| /\b(#{ w })\b/iu } - end - - if RUBY_VERSION.to_f >= 1.9 - (stems + words).map! do |term| - term.is_a?(String) ? term.force_encoding('UTF-8') : term - end - else - stems + words - end + default_opts = { :include_original => false, :regex => false } + opts = default_opts.merge(opts) + + # Reject all prefixes other than Z, which we know is reserved for stems + terms = query.terms.reject { |t| t.term.first.match(/^[A-Y]$/) } + # Collect the stems including the Z prefix + raw_stems = terms.map { |t| t.term if t.term.start_with?('Z') }.compact.uniq.sort + # Collect stems, chopping the Z prefix off + stems = raw_stems.map { |t| t[1..-1] }.compact.sort + # Collect the non-stem terms + words = terms.map { |t| t.term unless t.term.start_with?('Z') }.compact.sort + + # Add the unstemmed words from the original query + # Sometimes stems can be unhelpful with the :regex option, for example + # stemming 'boring' results in us trying to highlight 'bore'. + if opts[:include_original] + raw_stems.each do |raw_stem| + words << ActsAsXapian.query_parser.unstem(raw_stem).uniq + end + + words = words.any? ? words.flatten.uniq : [] + end + + if opts[:regex] + stems.map! { |w| /\b(#{ correctly_encode(w) })\w*\b/iu } + words.map! { |w| /\b(#{ correctly_encode(w) })\b/iu } + end + + (stems + words).map! do |term| + term.is_a?(String) ? correctly_encode(term) : term + end end # Text for lines in log file @@ -532,6 +528,12 @@ module ActsAsXapian "Search: " + self.query_string end + private + + def correctly_encode(w) + RUBY_VERSION.to_f >= 1.9 ? w.force_encoding('UTF-8') : w + end + end # Search for models which contain theimportant terms taken from a specified diff --git a/spec/integration/xapian_search_highlighting_spec.rb b/spec/integration/xapian_search_highlighting_spec.rb index 65a34cf91..c0834a2c1 100644 --- a/spec/integration/xapian_search_highlighting_spec.rb +++ b/spec/integration/xapian_search_highlighting_spec.rb @@ -1,3 +1,5 @@ +# encoding: utf-8 + require File.expand_path(File.dirname(__FILE__) + '/../spec_helper') describe 'highlighting search results' do @@ -36,4 +38,13 @@ describe 'highlighting search results' do highlight_matches(phrase, matches).should == '<mark>boring</mark>' end + it 'handles macrons correctly' do + phrase = 'Māori' + + search = ActsAsXapian::Search.new([PublicBody], phrase, :limit => 1) + matches = search.words_to_highlight(:regex => true, :include_original => true) + + highlight_matches(phrase, matches).should == '<mark>Māori</mark>' + end + end |