diff options
author | Gareth Rees <gareth@mysociety.org> | 2014-06-25 12:49:17 +0100 |
---|---|---|
committer | Gareth Rees <gareth@mysociety.org> | 2014-06-25 12:49:17 +0100 |
commit | fe5880e555f98350b4c08a4885c560e35692687f (patch) | |
tree | 31ae4a508ec50a1c4801a56d85f1329848358380 /lib | |
parent | c99c9ef86c9020939cf8d0930faca5bd206e111f (diff) | |
parent | f23b89f3474847cdd14ba892c5a7259964e18148 (diff) |
Merge branch 'issues/1434-email-highlighting' into rails-3-develop
Diffstat (limited to 'lib')
-rw-r--r-- | lib/acts_as_xapian/acts_as_xapian.rb | 62 |
1 files changed, 50 insertions, 12 deletions
```diff
diff --git a/lib/acts_as_xapian/acts_as_xapian.rb b/lib/acts_as_xapian/acts_as_xapian.rb
index 168d2eec3..6520a20a4 100644
--- a/lib/acts_as_xapian/acts_as_xapian.rb
+++ b/lib/acts_as_xapian/acts_as_xapian.rb
@@ -21,6 +21,20 @@ rescue LoadError
     $acts_as_xapian_bindings_available = false
 end
 
+module Xapian
+  class QueryParser
+    def unstem(term)
+      words = []
+
+      Xapian._safelyIterate(unstem_begin(term), unstem_end(term)) do |item|
+        words << item.term
+      end
+
+      words
+    end
+  end
+end
+
 module ActsAsXapian
     ######################################################################
     # Module level variables
@@ -472,16 +486,42 @@ module ActsAsXapian
         # Return just normal words in the query i.e. Not operators, ones in
         # date ranges or similar. Use this for cheap highlighting with
         # TextHelper::highlight, and excerpt.
-        def words_to_highlight
-            # TODO: In Ruby 1.9 we can do matching of any unicode letter with \p{L}
-            # But we still need to support ruby 1.8 for the time being so...
-            query_nopunc = self.query_string.gsub(/[^ёЁа-яА-Яa-zA-Zà-üÀ-Ü0-9:\.\/_]/iu, " ")
-            query_nopunc = query_nopunc.gsub(/\s+/, " ")
-            words = query_nopunc.split(" ")
-            # Remove anything with a :, . or / in it
-            words = words.find_all {|o| !o.match(/(:|\.|\/)/) }
-            words = words.find_all {|o| !o.match(/^(AND|NOT|OR|XOR)$/) }
-            return words
+        def words_to_highlight(opts = {})
+            default_opts = { :include_original => false, :regex => false }
+            opts = default_opts.merge(opts)
+
+            # Reject all prefixes other than Z, which we know is reserved for stems
+            terms = query.terms.reject { |t| t.term.first.match(/^[A-Y]$/) }
+            # Collect the stems including the Z prefix
+            raw_stems = terms.map { |t| t.term if t.term.start_with?('Z') }.compact.uniq.sort
+            # Collect stems, chopping the Z prefix off
+            stems = raw_stems.map { |t| t[1..-1] }.compact.sort
+            # Collect the non-stem terms
+            words = terms.map { |t| t.term unless t.term.start_with?('Z') }.compact.sort
+
+            # Add the unstemmed words from the original query
+            # Sometimes stems can be unhelpful with the :regex option, for example
+            # stemming 'boring' results in us trying to highlight 'bore'.
+            if opts[:include_original]
+                raw_stems.each do |raw_stem|
+                    words << ActsAsXapian.query_parser.unstem(raw_stem).uniq
+                end
+
+                words = words.any? ? words.flatten.uniq : []
+            end
+
+            if opts[:regex]
+                stems.map! { |w| /\b(#{ w })\w*\b/iu }
+                words.map! { |w| /\b(#{ w })\b/iu }
+            end
+
+            if RUBY_VERSION.to_f >= 1.9
+                (stems + words).map! do |term|
+                    term.is_a?(String) ? term.force_encoding('UTF-8') : term
+                end
+            else
+                stems + words
+            end
         end
 
         # Text for lines in log file
@@ -975,5 +1015,3 @@ end
 
 # Reopen ActiveRecord and include the acts_as_xapian method
 ActiveRecord::Base.extend ActsAsXapian::ActsMethods
-
-
```