diff options
-rw-r--r-- | Gemfile | 6 | ||||
-rw-r--r-- | README.md | 2 | ||||
-rw-r--r-- | app/controllers/request_controller.rb | 8 | ||||
-rw-r--r-- | app/models/censor_rule.rb | 28 | ||||
-rw-r--r-- | app/models/foi_attachment.rb | 28 | ||||
-rw-r--r-- | app/models/incoming_message.rb | 89 | ||||
-rw-r--r-- | config/application.rb | 2 | ||||
-rw-r--r-- | lib/alaveteli_text_masker.rb | 21 | ||||
-rw-r--r-- | lib/attachment_to_html/adapter.rb | 2 | ||||
-rw-r--r-- | lib/mail_handler/backends/mail_backend.rb | 2 | ||||
-rw-r--r-- | lib/normalize_string.rb | 23 | ||||
-rw-r--r-- | lib/tasks/temp.rake | 96 | ||||
-rwxr-xr-x | script/handle-mail-replies | 2 | ||||
-rwxr-xr-x | script/mailin | 2 | ||||
-rw-r--r-- | spec/controllers/request_controller_spec.rb | 6 | ||||
-rw-r--r-- | spec/integration/parameter_stripping_spec.rb | 2 | ||||
-rw-r--r-- | spec/lib/alaveteli_text_masker_spec.rb | 7 | ||||
-rw-r--r-- | spec/lib/basic_encoding_spec.rb | 24 | ||||
-rw-r--r-- | spec/models/censor_rule_spec.rb | 58 | ||||
-rw-r--r-- | spec/models/foi_attachment_spec.rb | 74 |
20 files changed, 339 insertions, 143 deletions
@@ -21,11 +21,10 @@ gem 'mahoro', '~> 0.4' gem 'memcache-client', '~> 1.8.5' gem 'net-http-local', '~> 0.1.2', :platforms => [:ruby_18, :ruby_19] gem 'net-purge', '~> 0.1.0' +gem 'nokogiri', '~> 1.5.9' gem 'open4', '~> 1.3.4' gem 'rack', '~> 1.4.6' -if RUBY_VERSION.to_f >= 1.9 - gem 'rack-utf8_sanitizer', '~> 1.3.0' -end +gem 'rack-utf8_sanitizer', '~> 1.3.0', :platforms => :ruby_19 gem 'rake', '0.9.2.2' gem 'rails-i18n', '~> 0.7.3' gem 'recaptcha', '~> 0.3.1', :require => 'recaptcha/rails' @@ -69,7 +68,6 @@ group :test do gem 'fakeweb', '~> 1.3.0' gem 'coveralls', :require => false gem 'webrat', '~> 0.7.3' - gem 'nokogiri', '~> 1.5.9' end group :test, :development do @@ -68,3 +68,5 @@ We have some more notes for developers [on the project site](http://alaveteli.or * [KiMitTud](http://kimittud.atlatszo.hu) * [Informace Pro Všechny](http://www.infoprovsechny.cz) * [fyi.org.nz](https://fyi.org.nz) + +See more at [alaveteli.org](http://alaveteli.org/deployments/). diff --git a/app/controllers/request_controller.rb b/app/controllers/request_controller.rb index 45229fd7e..26e3b350c 100644 --- a/app/controllers/request_controller.rb +++ b/app/controllers/request_controller.rb @@ -763,12 +763,12 @@ class RequestController < ApplicationController # Prevent spam to magic request address. Note that the binary # subsitution method used depends on the content type - @incoming_message.apply_masks!(@attachment.body, @attachment.content_type) + body = @attachment.default_body + @incoming_message.apply_masks!(body, @attachment.content_type) if response.content_type == 'text/html' - @attachment.body = ActionController::Base.helpers.sanitize(@attachment.body) + body = ActionController::Base.helpers.sanitize(body) end - - render :text => @attachment.body + render :text => body end def get_attachment_as_html diff --git a/app/models/censor_rule.rb b/app/models/censor_rule.rb index f1f1a0d70..aec8a87cc 100644 --- a/app/models/censor_rule.rb +++ b/app/models/censor_rule.rb @@ -46,17 +46,17 @@ class CensorRule < ActiveRecord::Base def apply_to_text(text_to_censor) return nil if text_to_censor.nil? - text_to_censor.gsub(to_replace, replacement) + text_to_censor.gsub(to_replace('UTF-8'), replacement) end def apply_to_text!(text_to_censor) return nil if text_to_censor.nil? - text_to_censor.gsub!(to_replace, replacement) + text_to_censor.gsub!(to_replace('UTF-8'), replacement) end def apply_to_binary!(binary_to_censor) return nil if binary_to_censor.nil? - binary_to_censor.gsub!(to_replace) { |match| match.gsub(/./, 'x') } + binary_to_censor.gsub!(to_replace('ASCII-8BIT')) { |match| match.gsub(single_char_regexp, 'x') } end def is_global? @@ -65,6 +65,14 @@ class CensorRule < ActiveRecord::Base private + def single_char_regexp + if String.method_defined?(:encode) + Regexp.new('.'.force_encoding('ASCII-8BIT')) + else + Regexp.new('.', nil, 'N') + end + end + def require_user_request_or_public_body if info_request.nil? && user.nil? && public_body.nil? [:info_request, :user, :public_body].each do |a| @@ -75,18 +83,22 @@ class CensorRule < ActiveRecord::Base def require_valid_regexp begin - make_regexp + make_regexp('UTF-8') rescue RegexpError => e errors.add(:text, e.message) end end - def make_regexp - Regexp.new(text, Regexp::MULTILINE) + def to_replace(encoding) + regexp? ? make_regexp(encoding) : encoded_text(encoding) + end + + def encoded_text(encoding) + String.method_defined?(:encode) ? text.dup.force_encoding(encoding) : text end - def to_replace - regexp? ? make_regexp : text + def make_regexp(encoding) + Regexp.new(encoded_text(encoding), Regexp::MULTILINE) end end diff --git a/app/models/foi_attachment.rb b/app/models/foi_attachment.rb index 0af47b26e..37a9c9827 100644 --- a/app/models/foi_attachment.rb +++ b/app/models/foi_attachment.rb @@ -62,19 +62,18 @@ class FoiAttachment < ActiveRecord::Base } update_display_size! @cached_body = d + if String.method_defined?(:encode) + @cached_body = @cached_body.force_encoding("ASCII-8BIT") + end end + # raw body, encoded as binary def body if @cached_body.nil? tries = 0 delay = 1 begin - binary_data = File.open(self.filepath, "rb" ){ |file| file.read } - if self.content_type =~ /^text/ - @cached_body = convert_string_to_utf8_or_binary(binary_data, 'UTF-8') - else - @cached_body = binary_data - end + @cached_body = File.open(filepath, "rb" ){ |file| file.read } rescue Errno::ENOENT # we've lost our cached attachments for some reason. Reparse them. if tries > BODY_MAX_TRIES @@ -93,6 +92,17 @@ class FoiAttachment < ActiveRecord::Base return @cached_body end + # body as UTF-8 text, with scrubbing of invalid chars if needed + def body_as_text + convert_string_to_utf8(body, 'UTF-8') + end + + # for text types, the scrubbed UTF-8 text. For all other types, the + # raw binary + def default_body + text_type? ? body_as_text.string : body + end + # List of DSN codes taken from RFC 3463 # http://tools.ietf.org/html/rfc3463 DsnToMessage = { @@ -294,5 +304,11 @@ class FoiAttachment < ActiveRecord::Base AttachmentToHTML.to_html(self, to_html_opts) end + private + + def text_type? + AlaveteliTextMasker::TextMask.include?(content_type) + end + end diff --git a/app/models/incoming_message.rb b/app/models/incoming_message.rb index f28cae0c6..71b081560 100644 --- a/app/models/incoming_message.rb +++ b/app/models/incoming_message.rb @@ -372,41 +372,23 @@ class IncomingMessage < ActiveRecord::Base def _convert_part_body_to_text(part) if part.nil? text = "[ Email has no body, please see attachments ]" - source_charset = "utf-8" else - # by default, the body (coming from an foi_attachment) should have been converted to utf-8 - text = part.body - source_charset = part.charset + # whatever kind of attachment it is, get the UTF-8 encoded text + text = part.body_as_text.string if part.content_type == 'text/html' # e.g. http://www.whatdotheyknow.com/request/35/response/177 # TODO: This is a bit of a hack as it is calling a # convert to text routine. Could instead call a # sanitize HTML one. - - # If the text isn't UTF8, it means we had a problem - # converting it (invalid characters, etc), and we - # should instead tell elinks to respect the source - # charset - use_charset = "utf-8" - if String.method_defined?(:encode) - begin - text.encode('utf-8') - rescue Encoding::UndefinedConversionError, Encoding::InvalidByteSequenceError - use_charset = source_charset - end - else - begin - text = Iconv.conv('utf-8', 'utf-8', text) - rescue Iconv::IllegalSequence - use_charset = source_charset - end - end - text = MailHandler.get_attachment_text_one_file(part.content_type, text, use_charset) + text = MailHandler.get_attachment_text_one_file(part.content_type, text, "UTF-8") end end - # If text hasn't been converted, we sanitise it. - text = _sanitize_text(text) + # Add an annotation if the text had to be scrubbed + if part.body_as_text.scrubbed? + text += _("\n\n[ {{site_name}} note: The above text was badly encoded, and has had strange characters removed. ]", + :site_name => MySociety::Config.get('SITE_NAME', 'Alaveteli')) + end # Fix DOS style linefeeds to Unix style ones (or other later regexps won't work) text = text.gsub(/\r\n/, "\n") @@ -418,50 +400,6 @@ class IncomingMessage < ActiveRecord::Base return text end - def _sanitize_text(text) - if String.method_defined?(:encode) - begin - # Test if it's good UTF-8 - text.encode('utf-8') - rescue Encoding::UndefinedConversionError, Encoding::InvalidByteSequenceError - source_charset = 'utf-8' if source_charset.nil? - # strip out anything that isn't UTF-8 - begin - text = text.encode("utf-8", :invalid => :replace, - :undef => :replace, - :replace => "") + - _("\n\n[ {{site_name}} note: The above text was badly encoded, and has had strange characters removed. ]", - :site_name => MySociety::Config.get('SITE_NAME', 'Alaveteli')) - rescue Encoding::UndefinedConversionError, Encoding::InvalidByteSequenceError - if source_charset != "utf-8" - source_charset = "utf-8" - retry - end - end - end - else - begin - # Test if it's good UTF-8 - text = Iconv.conv('utf-8', 'utf-8', text) - rescue Iconv::IllegalSequence - # Text looks like unlabelled nonsense, - # strip out anything that isn't UTF-8 - begin - source_charset = 'utf-8' if source_charset.nil? - text = Iconv.conv('utf-8//IGNORE', source_charset, text) + - _("\n\n[ {{site_name}} note: The above text was badly encoded, and has had strange characters removed. ]", - :site_name => AlaveteliConfiguration::site_name) - rescue Iconv::InvalidEncoding, Iconv::IllegalSequence, Iconv::InvalidCharacter - if source_charset != "utf-8" - source_charset = "utf-8" - retry - end - end - end - end - text - end - # Returns part which contains main body text, or nil if there isn't one, # from a set of foi_attachments. If the leaves parameter is empty or not # supplied, uses its own foi_attachments. @@ -677,16 +615,7 @@ class IncomingMessage < ActiveRecord::Base end def _get_attachment_text_internal - text = self._extract_text - - # Remove any bad characters - if String.method_defined?(:encode) - # handle "problematic" encoding - text.encode!('UTF-16', 'UTF-8', :invalid => :replace, :undef => :replace, :replace => '') - text.encode('UTF-8', 'UTF-16') - else - Iconv.conv('utf-8//IGNORE', 'utf-8', text) - end + convert_string_to_utf8(_extract_text, 'UTF-8').string end # Returns text for indexing diff --git a/config/application.rb b/config/application.rb index ed7488454..79784148f 100644 --- a/config/application.rb +++ b/config/application.rb @@ -85,7 +85,7 @@ module Alaveteli config.middleware.insert_before ::ActionDispatch::Cookies, WhatDoTheyKnow::StripEmptySessions, :key => '_wdtk_cookie_session', :path => "/", :httponly => true # Strip non-UTF-8 request parameters - if RUBY_VERSION.to_f >= 1.9 + if RUBY_VERSION == '1.9.3' config.middleware.insert 0, Rack::UTF8Sanitizer end diff --git a/lib/alaveteli_text_masker.rb b/lib/alaveteli_text_masker.rb index 3c2bcf825..49dd15ae5 100644 --- a/lib/alaveteli_text_masker.rb +++ b/lib/alaveteli_text_masker.rb @@ -8,6 +8,21 @@ module AlaveteliTextMasker 'image/bmp', 'application/zip' ] + TextMask = [ 'text/css', + 'text/csv', + 'text/html', + 'text/plain', + 'text/rfc822-headers', + 'text/rtf', + 'text/tab-separated-values', + 'text/x-c', + 'text/x-diff', + 'text/x-fortran', + 'text/x-mail', + 'text/xml', + 'text/x-pascal', + 'text/x-vcard' ] + # Replaces all email addresses in (possibly binary) data # Also applies custom masks and censor items def apply_masks!(text, content_type, options = {}) @@ -19,7 +34,7 @@ module AlaveteliTextMasker case content_type when *DoNotBinaryMask # do nothing - when 'text/html' + when *TextMask apply_text_masks!(text, options) when 'application/pdf' apply_pdf_masks!(text, options) @@ -79,7 +94,7 @@ module AlaveteliTextMasker # Replace text in place def apply_binary_masks!(text, options = {}) # Keep original size, so can check haven't resized it - orig_size = text.mb_chars.size + orig_size = text.size # Replace ASCII email addresses... text.gsub!(MySociety::Validate.email_find_regexp) do |email| @@ -114,7 +129,7 @@ module AlaveteliTextMasker # Replace censor items censor_rules = options[:censor_rules] || [] censor_rules.each{ |censor_rule| censor_rule.apply_to_binary!(text) } - raise "internal error in apply_binary_masks!" if text.mb_chars.size != orig_size + raise "internal error in apply_binary_masks!" if text.size != orig_size return text end diff --git a/lib/attachment_to_html/adapter.rb b/lib/attachment_to_html/adapter.rb index 058fb2a01..ac8a16411 100644 --- a/lib/attachment_to_html/adapter.rb +++ b/lib/attachment_to_html/adapter.rb @@ -61,7 +61,7 @@ module AttachmentToHTML end def attachment_body - @attachment_body ||= attachment.body + @attachment_body ||= attachment.default_body end end end diff --git a/lib/mail_handler/backends/mail_backend.rb b/lib/mail_handler/backends/mail_backend.rb index 34fbc91ab..19f502275 100644 --- a/lib/mail_handler/backends/mail_backend.rb +++ b/lib/mail_handler/backends/mail_backend.rb @@ -68,7 +68,7 @@ module MailHandler part_file_name = part_file_name.nil? ? nil : part_file_name.dup if part_file_name part_file_name = CGI.unescape(part_file_name) - part_file_name = convert_string_to_utf8(part_file_name, part.charset) + part_file_name = convert_string_to_utf8(part_file_name, part.charset).string end part_file_name end diff --git a/lib/normalize_string.rb b/lib/normalize_string.rb index d850d7e05..69853fd6e 100644 --- a/lib/normalize_string.rb +++ b/lib/normalize_string.rb @@ -73,18 +73,27 @@ def convert_string_to_utf8_or_binary(s, suggested_character_encoding=nil) result end +class StringConversionResult < Struct.new(:string, :scrubbed) + alias_method :scrubbed?, :scrubbed +end + def convert_string_to_utf8(s, suggested_character_encoding=nil) begin result = normalize_string_to_utf8 s, suggested_character_encoding + StringConversionResult.new(result, false) rescue EncodingNormalizationError - result = s - if String.method_defined?(:encode) - result = s.force_encoding("utf-8").encode("utf-8", :invalid => :replace, - :undef => :replace, - :replace => "") - end + result = scrub(s) + StringConversionResult.new(result, true) + end +end + +def scrub(string) + if String.method_defined?(:encode) + string = string.force_encoding("utf-8") + string.valid_encoding? ? string : string.encode("utf-16le", :invalid => :replace, :replace => "").encode("utf-8") + else + Iconv.conv('UTF-8//IGNORE', 'UTF-8', string) end - result end def log_text_details(message, text) diff --git a/lib/tasks/temp.rake b/lib/tasks/temp.rake index 67fa10174..d5f7e8b22 100644 --- a/lib/tasks/temp.rake +++ b/lib/tasks/temp.rake @@ -1,3 +1,4 @@ +# -*- coding: utf-8 -*- namespace :temp do @@ -37,4 +38,99 @@ namespace :temp do end + desc 'Look for and fix invalid UTF-8 text in various models. Should be run under ruby 1.9 or above' + task :fix_invalid_utf8 => :environment do + + dryrun = ENV['DRYRUN'] != '0' + if dryrun + $stderr.puts "This is a dryrun - nothing will be changed" + end + + + PublicBody.find_each do |public_body| + unless public_body.name.valid_encoding? + name = convert_string_to_utf8(public_body.name) + puts "Bad encoding in PublicBody name, id: #{public_body.id}, " \ + "old name: #{public_body.name.force_encoding('UTF-8')}, new name #{name}" + unless dryrun + public_body.name_will_change! + public_body.name = name + public_body.last_edit_editor = 'system' + public_body.last_edit_comment = 'Invalid utf-8 encoding fixed by temp:fix_invalid_utf8' + public_body.save! + end + end + + # Editing old versions of public bodies - we don't want to affect the timestamp + PublicBody::Version.record_timestamps = false + public_body.versions.each do |public_body_version| + unless public_body_version.name.valid_encoding? + name = convert_string_to_utf8(public_body_version.name).string + puts "Bad encoding in PublicBody::Version name, " \ + "id: #{public_body_version.id}, old name: #{public_body_version.name.force_encoding('UTF-8')}, " \ + "new name: #{name}" + unless dryrun + public_body_version.name_will_change! + public_body_version.name = name + public_body_version.save! + end + end + end + PublicBody::Version.record_timestamps = true + + end + + IncomingMessage.find_each do |incoming_message| + if (incoming_message.cached_attachment_text_clipped && + !incoming_message.cached_attachment_text_clipped.valid_encoding?) || + (incoming_message.cached_main_body_text_folded && + !incoming_message.cached_main_body_text_folded.valid_encoding?) || + (incoming_message.cached_main_body_text_unfolded && + !incoming_message.cached_main_body_text_unfolded.valid_encoding?) + puts "Bad encoding in IncomingMessage cached fields, :id #{incoming_message.id} " + unless dryrun + incoming_message.clear_in_database_caches! + end + end + end + + FoiAttachment.find_each do |foi_attachment| + unescaped_filename = CGI.unescape(foi_attachment.filename) + unless unescaped_filename.valid_encoding? + filename = convert_string_to_utf8(unescaped_filename).string + puts "Bad encoding in FoiAttachment filename, id: #{foi_attachment.id} " \ + "old filename #{unescaped_filename.force_encoding('UTF-8')}, new filename #{filename}" + unless dryrun + foi_attachment.filename = filename + foi_attachment.save! + end + end + end + + OutgoingMessage.find_each do |outgoing_message| + unless outgoing_message.raw_body.valid_encoding? + + raw_body = convert_string_to_utf8(outgoing_message.raw_body).string + puts "Bad encoding in OutgoingMessage raw_body, id: #{outgoing_message.id} " \ + "old raw_body: #{outgoing_message.raw_body.force_encoding('UTF-8')}, new raw_body: #{raw_body}" + unless dryrun + outgoing_message.body = raw_body + outgoing_message.save! + end + end + end + + User.find_each do |user| + unless user.name.valid_encoding? + name = convert_string_to_utf8(user.name).string + puts "Bad encoding in User name, id: #{user.id}, " \ + "old name: #{user.name.force_encoding('UTF-8')}, new name: #{name}" + unless dryrun + user.name = name + user.save! + end + end + end + + end end diff --git a/script/handle-mail-replies b/script/handle-mail-replies index 15454b311..267ad5632 100755 --- a/script/handle-mail-replies +++ b/script/handle-mail-replies @@ -1,4 +1,4 @@ #!/bin/bash -cd "`dirname "$0"`" +cd "`dirname "${BASH_SOURCE[0]}"`" exec bundle exec ./handle-mail-replies.rb "$@" diff --git a/script/mailin b/script/mailin index 5f2a9c243..48e236cfe 100755 --- a/script/mailin +++ b/script/mailin @@ -8,7 +8,7 @@ OUTPUT=$(mktemp -t foi-mailin-output-XXXXXXXX) # Read the email message from stdin, and write it to the file $INPUT cat >"$INPUT" -cd "$(dirname "$0")"/.. +cd "$(dirname "${BASH_SOURCE[0]}")"/.. source commonlib/shlib/deployfns read_conf config/general diff --git a/spec/controllers/request_controller_spec.rb b/spec/controllers/request_controller_spec.rb index a5534e9ff..9e2e1bff7 100644 --- a/spec/controllers/request_controller_spec.rb +++ b/spec/controllers/request_controller_spec.rb @@ -608,7 +608,7 @@ describe RequestController, "when showing one request" do response.body.should match('dull') end - it "should censor attachments downloaded as binary" do + it "should censor attachments downloaded directly" do ir = info_requests(:fancy_dog_request) censor_rule = CensorRule.new @@ -623,7 +623,7 @@ describe RequestController, "when showing one request" do get :get_attachment, :incoming_message_id => ir.incoming_messages[1].id, :id => ir.id, :part => 2, :file_name => 'hello world.txt', :skip_cache => 1 response.content_type.should == "text/plain" - response.should contain "xxxxxx hello" + response.should contain "Mouse hello" ensure ir.censor_rules.clear end @@ -645,7 +645,7 @@ describe RequestController, "when showing one request" do get :get_attachment, :incoming_message_id => ir.incoming_messages[1].id, :id => ir.id, :part => 2, :file_name => 'hello world.txt', :skip_cache => 1 response.content_type.should == "text/plain" - response.should contain "xxxxxx hello" + response.should contain "Mouse hello" ensure ir.user.censor_rules.clear end diff --git a/spec/integration/parameter_stripping_spec.rb b/spec/integration/parameter_stripping_spec.rb index b910062a9..7e3c0adc2 100644 --- a/spec/integration/parameter_stripping_spec.rb +++ b/spec/integration/parameter_stripping_spec.rb @@ -3,7 +3,7 @@ require File.expand_path(File.dirname(__FILE__) + '/../spec_helper') describe "When handling bad requests" do - if RUBY_VERSION.to_f >= 1.9 + if RUBY_VERSION == '1.9.3' it 'should return a 404 for GET requests to a malformed request URL' do get 'request/228%85' diff --git a/spec/lib/alaveteli_text_masker_spec.rb b/spec/lib/alaveteli_text_masker_spec.rb index f2d52c1cc..f8c22a849 100644 --- a/spec/lib/alaveteli_text_masker_spec.rb +++ b/spec/lib/alaveteli_text_masker_spec.rb @@ -31,10 +31,13 @@ describe AlaveteliTextMasker do data.should == "There was a xxxxx called xxxxxxx, he wished that he was xxxx." end - it 'should handle multibyte characters correctly' do + it 'should handle multibyte characters in binary file types as binary data' do data = 'á mouse' + if String.method_defined?(:encode) + data = data.force_encoding("ASCII-8BIT") + end @regex_censor_rule.text = 'á' - apply_masks!(data, "application/octet-stream", :censor_rules => @censor_rules).should == 'x mouse' + apply_masks!(data, "application/octet-stream", :censor_rules => @censor_rules).should == 'xx mouse' end it "should apply censor rules to HTML files" do diff --git a/spec/lib/basic_encoding_spec.rb b/spec/lib/basic_encoding_spec.rb index d77465ad8..6758d60a3 100644 --- a/spec/lib/basic_encoding_spec.rb +++ b/spec/lib/basic_encoding_spec.rb @@ -160,21 +160,24 @@ describe "convert_string_to_utf8" do describe "when passed uninterpretable character data" do - it "should return it as a utf8 string" do + it "should return it as a valid utf8 string with non-utf8 characters removed + and mark it as scrubbed" do converted = convert_string_to_utf8 random_string - converted.should == random_string if String.method_defined?(:encode) - converted.encoding.to_s.should == 'UTF-8' + converted.string.encoding.to_s.should == 'UTF-8' + converted.string.valid_encoding?.should == true end + converted.scrubbed?.should == true converted = convert_string_to_utf8 random_string,'UTF-8' - converted.should == random_string if String.method_defined?(:encode) - converted.encoding.to_s.should == 'UTF-8' + converted.string.encoding.to_s.should == 'UTF-8' + converted.string.valid_encoding?.should == true end + converted.scrubbed?.should == true end end @@ -185,11 +188,13 @@ describe "convert_string_to_utf8" do converted = convert_string_to_utf8 windows_1252_string - converted.should == "DASH – DASH" + converted.string.should == "DASH – DASH" if String.method_defined?(:encode) - converted.encoding.to_s.should == 'UTF-8' + converted.string.encoding.to_s.should == 'UTF-8' end + converted.scrubbed?.should == false + end end @@ -200,11 +205,12 @@ describe "convert_string_to_utf8" do converted = convert_string_to_utf8 gb_18030_spam_string - converted.should start_with("贵公司负责人") + converted.string.should start_with("贵公司负责人") if String.method_defined?(:encode) - converted.encoding.to_s.should == 'UTF-8' + converted.string.encoding.to_s.should == 'UTF-8' end + converted.scrubbed?.should == false end end diff --git a/spec/models/censor_rule_spec.rb b/spec/models/censor_rule_spec.rb index 314b060d2..d308ac1b9 100644 --- a/spec/models/censor_rule_spec.rb +++ b/spec/models/censor_rule_spec.rb @@ -64,19 +64,35 @@ describe CensorRule, "substituting things" do @censor_rule.replacement = "hello" end - it 'should do basic text substitution' do - body = "I don't know why you say goodbye" - @censor_rule.apply_to_text!(body) - body.should == "I don't know why you say hello" + describe :apply_to_text do + + it 'should do basic text substitution' do + body = "I don't know why you say goodbye" + @censor_rule.apply_to_text!(body) + body.should == "I don't know why you say hello" + end + end - it 'should keep size same for binary substitution' do - body = "I don't know why you say goodbye" - orig_body = body.dup - @censor_rule.apply_to_binary!(body) - body.size.should == orig_body.size - body.should == "I don't know why you say xxxxxxx" - body.should_not == orig_body # be sure duplicated as expected + describe :apply_to_binary do + + it 'should keep size same for binary substitution' do + body = "I don't know why you say goodbye" + orig_body = body.dup + @censor_rule.apply_to_binary!(body) + body.size.should == orig_body.size + body.should == "I don't know why you say xxxxxxx" + body.should_not == orig_body # be sure duplicated as expected + end + + it 'should handle a UTF-8 rule and ASCII-8BIT text' do + body = "I don't know why you say g‘oodbye" + body.force_encoding("ASCII-8BIT") if String.method_defined?(:encode) + @censor_rule.text = 'g‘oodbye' + @censor_rule.apply_to_binary!(body) + body.should == "I don't know why you say xxxxxxxxxx" + end + end end @@ -121,6 +137,26 @@ xxxxxxxxx BODY end + it "handles a UTF-8 rule with ASCII-8BIT text" do + @censor_rule.text = "--PRIVATE.*--P‘RIVATE" + @body = +<<BODY +Some public information +--PRIVATE +Some private information +--P‘RIVATE +BODY + @body.force_encoding('ASCII-8BIT') if String.method_defined?(:encode) + @censor_rule.apply_to_binary!(@body) + @body.should == +<<BODY +Some public information +xxxxxxxxx +xxxxxxxxxxxxxxxxxxxxxxxx +xxxxxxxxxxxx +BODY + end + end end diff --git a/spec/models/foi_attachment_spec.rb b/spec/models/foi_attachment_spec.rb index 9583f4c76..b383e5d09 100644 --- a/spec/models/foi_attachment_spec.rb +++ b/spec/models/foi_attachment_spec.rb @@ -50,6 +50,80 @@ describe FoiAttachment do end + describe :body do + + it 'returns a binary encoded string when newly created' do + foi_attachment = FactoryGirl.create(:body_text) + if String.method_defined?(:encode) + expect(foi_attachment.body.encoding.to_s).to eq('ASCII-8BIT') + end + end + + + it 'returns a binary encoded string when saved' do + foi_attachment = FactoryGirl.create(:body_text) + foi_attachment = FoiAttachment.find(foi_attachment) + if String.method_defined?(:encode) + expect(foi_attachment.body.encoding.to_s).to eq('ASCII-8BIT') + end + end + + end + + describe :body_as_text do + + it 'has a valid UTF-8 string when newly created' do + foi_attachment = FactoryGirl.create(:body_text) + if String.method_defined?(:encode) + expect(foi_attachment.body_as_text.string.encoding.to_s).to eq('UTF-8') + expect(foi_attachment.body_as_text.string.valid_encoding?).to be_true + end + end + + it 'has a valid UTF-8 string when saved' do + foi_attachment = FactoryGirl.create(:body_text) + foi_attachment = FoiAttachment.find(foi_attachment) + if String.method_defined?(:encode) + expect(foi_attachment.body_as_text.string.encoding.to_s).to eq('UTF-8') + expect(foi_attachment.body_as_text.string.valid_encoding?).to be_true + end + end + + + it 'has a true scrubbed? value if the body has been coerced to valid UTF-8' do + foi_attachment = FactoryGirl.create(:body_text) + foi_attachment.body = "\x0FX\x1C\x8F\xA4\xCF\xF6\x8C\x9D\xA7\x06\xD9\xF7\x90lo" + expect(foi_attachment.body_as_text.scrubbed?).to be_true + end + + it 'has a false scrubbed? value if the body has not been coerced to valid UTF-8' do + foi_attachment = FactoryGirl.create(:body_text) + foi_attachment.body = "κόσμε" + expect(foi_attachment.body_as_text.scrubbed?).to be_false + end + + end + + describe :default_body do + + it 'returns valid UTF-8 for a text attachment' do + foi_attachment = FactoryGirl.create(:body_text) + if String.method_defined?(:encode) + expect(foi_attachment.default_body.encoding.to_s).to eq('UTF-8') + expect(foi_attachment.default_body.valid_encoding?).to be_true + end + end + + it 'returns binary for a PDF attachment' do + foi_attachment = FactoryGirl.create(:pdf_attachment) + if String.method_defined?(:encode) + expect(foi_attachment.default_body.encoding.to_s).to eq('ASCII-8BIT') + end + end + + end + + describe :ensure_filename! do it 'should create a filename for an instance with a blank filename' do |