about summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- lib/acts_as_xapian/acts_as_xapian.rb 72
-rw-r--r-- spec/integration/xapian_search_highlighting_spec.rb 11
2 files changed, 48 insertions, 35 deletions
diff --git a/lib/acts_as_xapian/acts_as_xapian.rb b/lib/acts_as_xapian/acts_as_xapian.rb
index 48d0b0554..9194e8cc4 100644
--- a/lib/acts_as_xapian/acts_as_xapian.rb
+++ b/lib/acts_as_xapian/acts_as_xapian.rb
@@ -490,41 +490,37 @@ module ActsAsXapian
# date ranges or similar. Use this for cheap highlighting with
# TextHelper::highlight, and excerpt.
def words_to_highlight(opts = {})
- default_opts = { :include_original => false, :regex => false }
- opts = default_opts.merge(opts)
-
- # Reject all prefixes other than Z, which we know is reserved for stems
- terms = query.terms.reject { |t| t.term.first.match(/^[A-Y]$/) }
- # Collect the stems including the Z prefix
- raw_stems = terms.map { |t| t.term if t.term.start_with?('Z') }.compact.uniq.sort
- # Collect stems, chopping the Z prefix off
- stems = raw_stems.map { |t| t[1..-1] }.compact.sort
- # Collect the non-stem terms
- words = terms.map { |t| t.term unless t.term.start_with?('Z') }.compact.sort
-
- # Add the unstemmed words from the original query
- # Sometimes stems can be unhelpful with the :regex option, for example
- # stemming 'boring' results in us trying to highlight 'bore'.
- if opts[:include_original]
- raw_stems.each do |raw_stem|
- words << ActsAsXapian.query_parser.unstem(raw_stem).uniq
- end
-
- words = words.any? ? words.flatten.uniq : []
- end
-
- if opts[:regex]
- stems.map! { |w| /\b(#{ w })\w*\b/iu }
- words.map! { |w| /\b(#{ w })\b/iu }
- end
-
- if RUBY_VERSION.to_f >= 1.9
- (stems + words).map! do |term|
- term.is_a?(String) ? term.force_encoding('UTF-8') : term
- end
- else
- stems + words
- end
+ default_opts = { :include_original => false, :regex => false }
+ opts = default_opts.merge(opts)
+
+ # Reject all prefixes other than Z, which we know is reserved for stems
+ terms = query.terms.reject { |t| t.term.first.match(/^[A-Y]$/) }
+ # Collect the stems including the Z prefix
+ raw_stems = terms.map { |t| t.term if t.term.start_with?('Z') }.compact.uniq.sort
+ # Collect stems, chopping the Z prefix off
+ stems = raw_stems.map { |t| t[1..-1] }.compact.sort
+ # Collect the non-stem terms
+ words = terms.map { |t| t.term unless t.term.start_with?('Z') }.compact.sort
+
+ # Add the unstemmed words from the original query
+ # Sometimes stems can be unhelpful with the :regex option, for example
+ # stemming 'boring' results in us trying to highlight 'bore'.
+ if opts[:include_original]
+ raw_stems.each do |raw_stem|
+ words << ActsAsXapian.query_parser.unstem(raw_stem).uniq
+ end
+
+ words = words.any? ? words.flatten.uniq : []
+ end
+
+ if opts[:regex]
+ stems.map! { |w| /\b(#{ correctly_encode(w) })\w*\b/iu }
+ words.map! { |w| /\b(#{ correctly_encode(w) })\b/iu }
+ end
+
+ (stems + words).map! do |term|
+ term.is_a?(String) ? correctly_encode(term) : term
+ end
end
# Text for lines in log file
@@ -532,6 +528,12 @@ module ActsAsXapian
"Search: " + self.query_string
end
+ private
+
+ def correctly_encode(w)
+ RUBY_VERSION.to_f >= 1.9 ? w.force_encoding('UTF-8') : w
+ end
+
end
# Search for models which contain the important terms taken from a specified
diff --git a/spec/integration/xapian_search_highlighting_spec.rb b/spec/integration/xapian_search_highlighting_spec.rb
index 65a34cf91..c0834a2c1 100644
--- a/spec/integration/xapian_search_highlighting_spec.rb
+++ b/spec/integration/xapian_search_highlighting_spec.rb
@@ -1,3 +1,5 @@
+# encoding: utf-8
+
require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
describe 'highlighting search results' do
@@ -36,4 +38,13 @@ describe 'highlighting search results' do
highlight_matches(phrase, matches).should == '<mark>boring</mark>'
end
+ it 'handles macrons correctly' do
+ phrase = 'Māori'
+
+ search = ActsAsXapian::Search.new([PublicBody], phrase, :limit => 1)
+ matches = search.words_to_highlight(:regex => true, :include_original => true)
+
+ highlight_matches(phrase, matches).should == '<mark>Māori</mark>'
+ end
+
end