diff options
-rw-r--r-- | app/models/raw_email.rb | 19 | ||||
-rw-r--r-- | app/views/admin_raw_email/show.html.erb | 5 | ||||
-rw-r--r-- | config/initializers/alaveteli.rb | 2 | ||||
-rw-r--r-- | lib/acts_as_xapian/acts_as_xapian.rb | 5 | ||||
-rw-r--r-- | spec/factories/raw_emails.rb | 2 | ||||
-rw-r--r-- | spec/lib/acts_as_xapian_spec.rb | 110 | ||||
-rw-r--r-- | spec/models/raw_email_spec.rb | 70 | ||||
-rw-r--r-- | spec/models/xapian_spec.rb | 71 |
8 files changed, 178 insertions, 106 deletions
diff --git a/app/models/raw_email.rb b/app/models/raw_email.rb index 2a52921f0..58ae29a3b 100644 --- a/app/models/raw_email.rb +++ b/app/models/raw_email.rb @@ -40,11 +40,26 @@ class RawEmail < ActiveRecord::Base def data=(d) FileUtils.mkdir_p(directory) unless File.exists?(directory) - File.atomic_write(filepath) { |file| file.write(d) } + File.atomic_write(filepath) do |file| + file.binmode + file.write(d) + end end def data - File.open(filepath, "r").read + File.open(filepath, "rb").read + end + + def data_as_text + text = data + if text.respond_to?(:encoding) + text = text.encode("UTF-8", :invalid => :replace, + :undef => :replace, + :replace => "") + else + text = Iconv.conv('UTF-8//IGNORE', 'UTF-8', text) + end + text end def destroy_file_representation! diff --git a/app/views/admin_raw_email/show.html.erb b/app/views/admin_raw_email/show.html.erb index f88b00ef0..1de719544 100644 --- a/app/views/admin_raw_email/show.html.erb +++ b/app/views/admin_raw_email/show.html.erb @@ -59,5 +59,8 @@ <p><%= link_to "Download", admin_raw_email_path(@raw_email, :format => 'txt') %></p> -<pre><%=h(@raw_email.data).gsub(/\n/, '<br>').html_safe %></pre> +<h2>Preview</h2> + +For an exact rendering of this email, use the "Download" link. +<pre><%=h(@raw_email.data_as_text).gsub(/\n/, '<br>').html_safe %></pre> diff --git a/config/initializers/alaveteli.rb b/config/initializers/alaveteli.rb index cda163a9b..7c3b76b43 100644 --- a/config/initializers/alaveteli.rb +++ b/config/initializers/alaveteli.rb @@ -11,7 +11,7 @@ load "debug_helpers.rb" load "util.rb" # Application version -ALAVETELI_VERSION = '0.21.0.32' +ALAVETELI_VERSION = '0.21.0.33' # Add new inflection rules using the following format # (all these examples are active by default): diff --git a/lib/acts_as_xapian/acts_as_xapian.rb b/lib/acts_as_xapian/acts_as_xapian.rb index 565212904..e0c7c6ae7 100644 --- a/lib/acts_as_xapian/acts_as_xapian.rb +++ b/lib/acts_as_xapian/acts_as_xapian.rb @@ -379,7 +379,10 @@ module ActsAsXapian if correction.empty? return nil end - return correction + if correction.respond_to?(:force_encoding) + correction = correction.force_encoding('UTF-8') + end + correction end # Return array of models found diff --git a/spec/factories/raw_emails.rb b/spec/factories/raw_emails.rb index a2b6496e8..a6e3c21ac 100644 --- a/spec/factories/raw_emails.rb +++ b/spec/factories/raw_emails.rb @@ -1,6 +1,4 @@ # -*- encoding : utf-8 -*- FactoryGirl.define do - factory :raw_email - end diff --git a/spec/lib/acts_as_xapian_spec.rb b/spec/lib/acts_as_xapian_spec.rb new file mode 100644 index 000000000..1d9256441 --- /dev/null +++ b/spec/lib/acts_as_xapian_spec.rb @@ -0,0 +1,110 @@ +# -*- encoding : utf-8 -*- +require File.expand_path(File.dirname(__FILE__) + '/../spec_helper') + +describe ActsAsXapian::Search do + + describe "#words_to_highlight" do + + before :all do + # make sure an index exists + @alice = FactoryGirl.create(:public_body, :name => 'alice') + ActsAsXapian.update_index + end + + after :all do + @alice.destroy + ActsAsXapian.update_index + end + + it "should return a list of words used in the search" do + s = ActsAsXapian::Search.new([PublicBody], "albatross words", :limit => 100) + s.words_to_highlight.should == ["albatross", "word"] + end + + it "should remove any operators" do + s = ActsAsXapian::Search.new([PublicBody], "albatross words tag:mice", :limit => 100) + s.words_to_highlight.should == ["albatross", "word"] + end + + it "should separate punctuation" do + s = ActsAsXapian::Search.new([PublicBody], "The doctor's patient", :limit => 100) + s.words_to_highlight.should == ["the", "doctor", "patient"].sort + end + + it "should handle non-ascii characters" do + s = ActsAsXapian::Search.new([PublicBody], "adatigénylés words tag:mice", :limit => 100) + s.words_to_highlight.should == ["adatigénylé", "word"] + end + + it "should ignore stopwords" do + s = ActsAsXapian::Search.new([PublicBody], "department of humpadinking", :limit => 100) + s.words_to_highlight.should_not include('of') + end + + it "uses stemming" do + s = ActsAsXapian::Search.new([PublicBody], 'department of humpadinking', :limit => 100) + s.words_to_highlight.should == ["depart", "humpadink"] + end + + it "doesn't stem proper nouns" do + s = ActsAsXapian::Search.new([PublicBody], 'department of Humpadinking', :limit => 1) + s.words_to_highlight.should == ["depart", "humpadinking"] + end + + it "includes the original search terms if requested" do + s = ActsAsXapian::Search.new([PublicBody], 'boring', :limit => 1) + s.words_to_highlight(:include_original => true).should == ['bore', 'boring'] + end + + it "does not return duplicate terms" do + s = ActsAsXapian::Search.new([PublicBody], 'boring boring', :limit => 1) + s.words_to_highlight.should == ['bore'] + end + + context 'the :regex option' do + + it 'wraps each words in a regex that matches the full word' do + expected = [/\b(albatross)\b/iu] + s = ActsAsXapian::Search.new([PublicBody], 'Albatross', :limit => 1) + s.words_to_highlight(:regex => true).should == expected + end + + it 'wraps each stem in a regex' do + expected = [/\b(depart)\w*\b/iu] + s = ActsAsXapian::Search.new([PublicBody], 'department', :limit => 1) + s.words_to_highlight(:regex => true).should == expected + end + + end + end + + describe :spelling_correction do + + before :all do + @alice = FactoryGirl.create(:public_body, :name => 'alice') + @bob = FactoryGirl.create(:public_body, :name => 'bôbby') + ActsAsXapian.update_index + end + + after :all do + @alice.destroy + @bob.destroy + ActsAsXapian.update_index + end + + it 'returns a UTF-8 encoded string' do + s = ActsAsXapian::Search.new([PublicBody], "alece", :limit => 100) + s.spelling_correction.should == "alice" + if s.spelling_correction.respond_to? :encoding + s.spelling_correction.encoding.to_s.should == 'UTF-8' + end + end + + it 'handles non-ASCII characters' do + s = ActsAsXapian::Search.new([PublicBody], "bobby", :limit => 100) + s.spelling_correction.should == "bôbby" + end + + end + +end
\ No newline at end of file diff --git a/spec/models/raw_email_spec.rb b/spec/models/raw_email_spec.rb index 044c89d3f..8e0d3b457 100644 --- a/spec/models/raw_email_spec.rb +++ b/spec/models/raw_email_spec.rb @@ -8,37 +8,51 @@ require File.expand_path(File.dirname(__FILE__) + '/../spec_helper') -describe User, "manipulating a raw email" do - before do - @raw_email = RawEmail.new - incoming_message = mock_model(IncomingMessage) - info_request = mock_model(InfoRequest) - incoming_message.stub!(:info_request).and_return(info_request) - @raw_email.stub!(:incoming_message).and_return(incoming_message) +describe RawEmail do + + def roundtrip_data(raw_email, data) + raw_email.data = data + raw_email.save! + raw_email.reload + raw_email.data end - it 'putting data in comes back out' do - @raw_email.data = "Hello, world!" - @raw_email.save! - @raw_email.reload - @raw_email.data.should == "Hello, world!" + describe :data do + + it 'roundtrips data unchanged' do + raw_email = FactoryGirl.create(:incoming_message).raw_email + data = roundtrip_data(raw_email, "Hello, world!") + data.should == "Hello, world!" + end + + it 'returns an unchanged binary string with a valid encoding if the data is non-ascii and non-utf-8' do + raw_email = FactoryGirl.create(:incoming_message).raw_email + data = roundtrip_data(raw_email, "\xA0") + + if data.respond_to?(:encoding) + data.encoding.to_s.should == 'ASCII-8BIT' + data.valid_encoding?.should be_true + data = data.force_encoding('UTF-8') + end + data.should == "\xA0" + end + end - # TODO: this test fails, hopefully will be fixed in later Rails. - # Doesn't matter too much for us for storing raw_emails, it would seem, - # but keep an eye out. - - # This is testing a bug in Rails PostgreSQL code - # http://blog.aradine.com/2009/09/rubys-marshal-and-activerecord-and.html - # https://rails.lighthouseapp.com/projects/8994/tickets/1063-binary-data-broken-with-postgresql-adapter -# it 'putting data in comes back out even if it has a backslash in it' do -# @raw_email.data = "This \\ that" -# @raw_email.save! -# @raw_email.reload -# $stderr.puts @raw_email.data -# $stderr.puts "This \\ that" -# @raw_email.data.should == "This \\ that" -# end + describe :data_as_text do + + it 'returns a utf-8 string with a valid encoding if the data is non-ascii and non-utf8' do + raw_email = FactoryGirl.create(:incoming_message).raw_email + roundtrip_data(raw_email, "\xA0ccc") + data_as_text = raw_email.data_as_text + data_as_text.should == "ccc" + if data_as_text.respond_to?(:encoding) + data_as_text.encoding.to_s.should == 'UTF-8' + data_as_text.valid_encoding?.should be_true + end + end + + end end - + diff --git a/spec/models/xapian_spec.rb b/spec/models/xapian_spec.rb index b3f2e2b3c..212a1cc7e 100644 --- a/spec/models/xapian_spec.rb +++ b/spec/models/xapian_spec.rb @@ -370,77 +370,6 @@ describe PublicBody, " when only indexing selected things on a rebuild" do end end -# I would expect ActsAsXapian to have some tests under lib/acts_as_xapian, but -# it looks like this is not the case. Putting a test here instead. -describe ActsAsXapian::Search, "#words_to_highlight" do - before(:each) do - load_raw_emails_data - get_fixtures_xapian_index - end - - it "should return a list of words used in the search" do - s = ActsAsXapian::Search.new([PublicBody], "albatross words", :limit => 100) - s.words_to_highlight.should == ["albatross", "word"] - end - - it "should remove any operators" do - s = ActsAsXapian::Search.new([PublicBody], "albatross words tag:mice", :limit => 100) - s.words_to_highlight.should == ["albatross", "word"] - end - - it "should separate punctuation" do - s = ActsAsXapian::Search.new([PublicBody], "The doctor's patient", :limit => 100) - s.words_to_highlight.should == ["the", "doctor", "patient"].sort - end - - it "should handle non-ascii characters" do - s = ActsAsXapian::Search.new([PublicBody], "adatigénylés words tag:mice", :limit => 100) - s.words_to_highlight.should == ["adatigénylé", "word"] - end - - it "should ignore stopwords" do - s = ActsAsXapian::Search.new([PublicBody], "department of humpadinking", :limit => 100) - s.words_to_highlight.should_not include('of') - end - - it "uses stemming" do - s = ActsAsXapian::Search.new([PublicBody], 'department of humpadinking', :limit => 100) - s.words_to_highlight.should == ["depart", "humpadink"] - end - - it "doesn't stem proper nouns" do - s = ActsAsXapian::Search.new([PublicBody], 'department of Humpadinking', :limit => 1) - s.words_to_highlight.should == ["depart", "humpadinking"] - end - - it "includes the original search terms if requested" do - s = ActsAsXapian::Search.new([PublicBody], 'boring', :limit => 1) - s.words_to_highlight(:include_original => true).should == ['bore', 'boring'] - end - - it "does not return duplicate terms" do - s = ActsAsXapian::Search.new([PublicBody], 'boring boring', :limit => 1) - s.words_to_highlight.should == ['bore'] - end - - context 'the :regex option' do - - it 'wraps each words in a regex that matches the full word' do - expected = [/\b(albatross)\b/iu] - s = ActsAsXapian::Search.new([PublicBody], 'Albatross', :limit => 1) - s.words_to_highlight(:regex => true).should == expected - end - - it 'wraps each stem in a regex' do - expected = [/\b(depart)\w*\b/iu] - s = ActsAsXapian::Search.new([PublicBody], 'department', :limit => 1) - s.words_to_highlight(:regex => true).should == expected - end - - end - -end - describe InfoRequestEvent, " when faced with a race condition during xapian_mark_needs_index" do before(:each) do |