diff options
author | Gareth Rees <gareth@mysociety.org> | 2015-06-24 11:19:43 +0100 |
---|---|---|
committer | Gareth Rees <gareth@mysociety.org> | 2015-06-24 11:19:43 +0100 |
commit | 2cce1794a4d9d2c42b83bab8a693900e8ca23ebc (patch) | |
tree | 7408a04d5ac0963ec2defbbf7d4955cff7cd62b5 /lib | |
parent | ed6b256539e0dcaa3764951d90e2dc599a8acddd (diff) | |
parent | 54ba7a4fa232ad3b57310551b9a5e19d72060abe (diff) |
Merge branch 'develop' into release-22-develop
Diffstat (limited to 'lib')
-rw-r--r-- | lib/alaveteli_text_masker.rb | 21 | ||||
-rw-r--r-- | lib/attachment_to_html/adapter.rb | 2 | ||||
-rw-r--r-- | lib/mail_handler/backends/mail_backend.rb | 2 | ||||
-rw-r--r-- | lib/normalize_string.rb | 23 | ||||
-rw-r--r-- | lib/tasks/config_files.rake | 16 | ||||
-rw-r--r-- | lib/tasks/temp.rake | 96 |
6 files changed, 148 insertions, 12 deletions
diff --git a/lib/alaveteli_text_masker.rb b/lib/alaveteli_text_masker.rb index 3c2bcf825..49dd15ae5 100644 --- a/lib/alaveteli_text_masker.rb +++ b/lib/alaveteli_text_masker.rb @@ -8,6 +8,21 @@ module AlaveteliTextMasker 'image/bmp', 'application/zip' ] + TextMask = [ 'text/css', + 'text/csv', + 'text/html', + 'text/plain', + 'text/rfc822-headers', + 'text/rtf', + 'text/tab-separated-values', + 'text/x-c', + 'text/x-diff', + 'text/x-fortran', + 'text/x-mail', + 'text/xml', + 'text/x-pascal', + 'text/x-vcard' ] + # Replaces all email addresses in (possibly binary) data # Also applies custom masks and censor items def apply_masks!(text, content_type, options = {}) @@ -19,7 +34,7 @@ module AlaveteliTextMasker case content_type when *DoNotBinaryMask # do nothing - when 'text/html' + when *TextMask apply_text_masks!(text, options) when 'application/pdf' apply_pdf_masks!(text, options) @@ -79,7 +94,7 @@ module AlaveteliTextMasker # Replace text in place def apply_binary_masks!(text, options = {}) # Keep original size, so can check haven't resized it - orig_size = text.mb_chars.size + orig_size = text.size # Replace ASCII email addresses... text.gsub!(MySociety::Validate.email_find_regexp) do |email| @@ -114,7 +129,7 @@ module AlaveteliTextMasker # Replace censor items censor_rules = options[:censor_rules] || [] censor_rules.each{ |censor_rule| censor_rule.apply_to_binary!(text) } - raise "internal error in apply_binary_masks!" if text.mb_chars.size != orig_size + raise "internal error in apply_binary_masks!" if text.size != orig_size return text end diff --git a/lib/attachment_to_html/adapter.rb b/lib/attachment_to_html/adapter.rb index 058fb2a01..ac8a16411 100644 --- a/lib/attachment_to_html/adapter.rb +++ b/lib/attachment_to_html/adapter.rb @@ -61,7 +61,7 @@ module AttachmentToHTML end def attachment_body - @attachment_body ||= attachment.body + @attachment_body ||= attachment.default_body end end end diff --git a/lib/mail_handler/backends/mail_backend.rb b/lib/mail_handler/backends/mail_backend.rb index 34fbc91ab..19f502275 100644 --- a/lib/mail_handler/backends/mail_backend.rb +++ b/lib/mail_handler/backends/mail_backend.rb @@ -68,7 +68,7 @@ module MailHandler part_file_name = part_file_name.nil? ? nil : part_file_name.dup if part_file_name part_file_name = CGI.unescape(part_file_name) - part_file_name = convert_string_to_utf8(part_file_name, part.charset) + part_file_name = convert_string_to_utf8(part_file_name, part.charset).string end part_file_name end diff --git a/lib/normalize_string.rb b/lib/normalize_string.rb index d850d7e05..69853fd6e 100644 --- a/lib/normalize_string.rb +++ b/lib/normalize_string.rb @@ -73,18 +73,27 @@ def convert_string_to_utf8_or_binary(s, suggested_character_encoding=nil) result end +class StringConversionResult < Struct.new(:string, :scrubbed) + alias_method :scrubbed?, :scrubbed +end + def convert_string_to_utf8(s, suggested_character_encoding=nil) begin result = normalize_string_to_utf8 s, suggested_character_encoding + StringConversionResult.new(result, false) rescue EncodingNormalizationError - result = s - if String.method_defined?(:encode) - result = s.force_encoding("utf-8").encode("utf-8", :invalid => :replace, - :undef => :replace, - :replace => "") - end + result = scrub(s) + StringConversionResult.new(result, true) + end +end + +def scrub(string) + if String.method_defined?(:encode) + string = string.force_encoding("utf-8") + string.valid_encoding? ? string : string.encode("utf-16le", :invalid => :replace, :replace => "").encode("utf-8") + else + Iconv.conv('UTF-8//IGNORE', 'UTF-8', string) end - result end def log_text_details(message, text) diff --git a/lib/tasks/config_files.rake b/lib/tasks/config_files.rake index f6b25185e..d0dc8f594 100644 --- a/lib/tasks/config_files.rake +++ b/lib/tasks/config_files.rake @@ -21,6 +21,22 @@ namespace :config_files do converted_lines end + desc 'Convert wrapper example in config to a form suitable for running mail handling scripts with rbenv' + task :convert_wrapper => :environment do + example = 'rake config_files:convert_wrapper DEPLOY_USER=deploy SCRIPT_FILE=config/run-with-rbenv-path.example' + check_for_env_vars(['DEPLOY_USER', + 'SCRIPT_FILE'], example) + + replacements = { + :user => ENV['DEPLOY_USER'], + } + + # Generate the template for potential further processing + convert_ugly(ENV['SCRIPT_FILE'], replacements).each do |line| + puts line + end + end + desc 'Convert Debian example init script in config to a form suitable for installing in /etc/init.d' task :convert_init_script => :environment do example = 'rake config_files:convert_init_script DEPLOY_USER=deploy VHOST_DIR=/dir/above/alaveteli VCSPATH=alaveteli SITE=alaveteli SCRIPT_FILE=config/alert-tracks-debian.example' diff --git a/lib/tasks/temp.rake b/lib/tasks/temp.rake index 67fa10174..d5f7e8b22 100644 --- a/lib/tasks/temp.rake +++ b/lib/tasks/temp.rake @@ -1,3 +1,4 @@ +# -*- coding: utf-8 -*- namespace :temp do @@ -37,4 +38,99 @@ namespace :temp do end + desc 'Look for and fix invalid UTF-8 text in various models. Should be run under ruby 1.9 or above' + task :fix_invalid_utf8 => :environment do + + dryrun = ENV['DRYRUN'] != '0' + if dryrun + $stderr.puts "This is a dryrun - nothing will be changed" + end + + + PublicBody.find_each do |public_body| + unless public_body.name.valid_encoding? + name = convert_string_to_utf8(public_body.name) + puts "Bad encoding in PublicBody name, id: #{public_body.id}, " \ + "old name: #{public_body.name.force_encoding('UTF-8')}, new name #{name}" + unless dryrun + public_body.name_will_change! + public_body.name = name + public_body.last_edit_editor = 'system' + public_body.last_edit_comment = 'Invalid utf-8 encoding fixed by temp:fix_invalid_utf8' + public_body.save! + end + end + + # Editing old versions of public bodies - we don't want to affect the timestamp + PublicBody::Version.record_timestamps = false + public_body.versions.each do |public_body_version| + unless public_body_version.name.valid_encoding? + name = convert_string_to_utf8(public_body_version.name).string + puts "Bad encoding in PublicBody::Version name, " \ + "id: #{public_body_version.id}, old name: #{public_body_version.name.force_encoding('UTF-8')}, " \ + "new name: #{name}" + unless dryrun + public_body_version.name_will_change! + public_body_version.name = name + public_body_version.save! + end + end + end + PublicBody::Version.record_timestamps = true + + end + + IncomingMessage.find_each do |incoming_message| + if (incoming_message.cached_attachment_text_clipped && + !incoming_message.cached_attachment_text_clipped.valid_encoding?) || + (incoming_message.cached_main_body_text_folded && + !incoming_message.cached_main_body_text_folded.valid_encoding?) || + (incoming_message.cached_main_body_text_unfolded && + !incoming_message.cached_main_body_text_unfolded.valid_encoding?) + puts "Bad encoding in IncomingMessage cached fields, :id #{incoming_message.id} " + unless dryrun + incoming_message.clear_in_database_caches! + end + end + end + + FoiAttachment.find_each do |foi_attachment| + unescaped_filename = CGI.unescape(foi_attachment.filename) + unless unescaped_filename.valid_encoding? + filename = convert_string_to_utf8(unescaped_filename).string + puts "Bad encoding in FoiAttachment filename, id: #{foi_attachment.id} " \ + "old filename #{unescaped_filename.force_encoding('UTF-8')}, new filename #{filename}" + unless dryrun + foi_attachment.filename = filename + foi_attachment.save! + end + end + end + + OutgoingMessage.find_each do |outgoing_message| + unless outgoing_message.raw_body.valid_encoding? + + raw_body = convert_string_to_utf8(outgoing_message.raw_body).string + puts "Bad encoding in OutgoingMessage raw_body, id: #{outgoing_message.id} " \ + "old raw_body: #{outgoing_message.raw_body.force_encoding('UTF-8')}, new raw_body: #{raw_body}" + unless dryrun + outgoing_message.body = raw_body + outgoing_message.save! + end + end + end + + User.find_each do |user| + unless user.name.valid_encoding? + name = convert_string_to_utf8(user.name).string + puts "Bad encoding in User name, id: #{user.id}, " \ + "old name: #{user.name.force_encoding('UTF-8')}, new name: #{name}" + unless dryrun + user.name = name + user.save! + end + end + end + + end end |