about summary refs log tree commit diff stats
path: root/lib
diff options
context:
space:
mode:
author Gareth Rees <gareth@mysociety.org> 2015-06-24 11:19:43 +0100
committer Gareth Rees <gareth@mysociety.org> 2015-06-24 11:19:43 +0100
commit 2cce1794a4d9d2c42b83bab8a693900e8ca23ebc (patch)
tree 7408a04d5ac0963ec2defbbf7d4955cff7cd62b5 /lib
parent ed6b256539e0dcaa3764951d90e2dc599a8acddd (diff)
parent 54ba7a4fa232ad3b57310551b9a5e19d72060abe (diff)
Merge branch 'develop' into release-22-develop
Diffstat (limited to 'lib')
-rw-r--r-- lib/alaveteli_text_masker.rb 21
-rw-r--r-- lib/attachment_to_html/adapter.rb 2
-rw-r--r-- lib/mail_handler/backends/mail_backend.rb 2
-rw-r--r-- lib/normalize_string.rb 23
-rw-r--r-- lib/tasks/config_files.rake 16
-rw-r--r-- lib/tasks/temp.rake 96
6 files changed, 148 insertions, 12 deletions
diff --git a/lib/alaveteli_text_masker.rb b/lib/alaveteli_text_masker.rb
index 3c2bcf825..49dd15ae5 100644
--- a/lib/alaveteli_text_masker.rb
+++ b/lib/alaveteli_text_masker.rb
@@ -8,6 +8,21 @@ module AlaveteliTextMasker
'image/bmp',
'application/zip' ]
+ TextMask = [ 'text/css',
+ 'text/csv',
+ 'text/html',
+ 'text/plain',
+ 'text/rfc822-headers',
+ 'text/rtf',
+ 'text/tab-separated-values',
+ 'text/x-c',
+ 'text/x-diff',
+ 'text/x-fortran',
+ 'text/x-mail',
+ 'text/xml',
+ 'text/x-pascal',
+ 'text/x-vcard' ]
+
# Replaces all email addresses in (possibly binary) data
# Also applies custom masks and censor items
def apply_masks!(text, content_type, options = {})
@@ -19,7 +34,7 @@ module AlaveteliTextMasker
case content_type
when *DoNotBinaryMask
# do nothing
- when 'text/html'
+ when *TextMask
apply_text_masks!(text, options)
when 'application/pdf'
apply_pdf_masks!(text, options)
@@ -79,7 +94,7 @@ module AlaveteliTextMasker
# Replace text in place
def apply_binary_masks!(text, options = {})
# Keep original size, so can check haven't resized it
- orig_size = text.mb_chars.size
+ orig_size = text.size
# Replace ASCII email addresses...
text.gsub!(MySociety::Validate.email_find_regexp) do |email|
@@ -114,7 +129,7 @@ module AlaveteliTextMasker
# Replace censor items
censor_rules = options[:censor_rules] || []
censor_rules.each{ |censor_rule| censor_rule.apply_to_binary!(text) }
- raise "internal error in apply_binary_masks!" if text.mb_chars.size != orig_size
+ raise "internal error in apply_binary_masks!" if text.size != orig_size
return text
end
diff --git a/lib/attachment_to_html/adapter.rb b/lib/attachment_to_html/adapter.rb
index 058fb2a01..ac8a16411 100644
--- a/lib/attachment_to_html/adapter.rb
+++ b/lib/attachment_to_html/adapter.rb
@@ -61,7 +61,7 @@ module AttachmentToHTML
end
def attachment_body
- @attachment_body ||= attachment.body
+ @attachment_body ||= attachment.default_body
end
end
end
diff --git a/lib/mail_handler/backends/mail_backend.rb b/lib/mail_handler/backends/mail_backend.rb
index 34fbc91ab..19f502275 100644
--- a/lib/mail_handler/backends/mail_backend.rb
+++ b/lib/mail_handler/backends/mail_backend.rb
@@ -68,7 +68,7 @@ module MailHandler
part_file_name = part_file_name.nil? ? nil : part_file_name.dup
if part_file_name
part_file_name = CGI.unescape(part_file_name)
- part_file_name = convert_string_to_utf8(part_file_name, part.charset)
+ part_file_name = convert_string_to_utf8(part_file_name, part.charset).string
end
part_file_name
end
diff --git a/lib/normalize_string.rb b/lib/normalize_string.rb
index d850d7e05..69853fd6e 100644
--- a/lib/normalize_string.rb
+++ b/lib/normalize_string.rb
@@ -73,18 +73,27 @@ def convert_string_to_utf8_or_binary(s, suggested_character_encoding=nil)
result
end
+class StringConversionResult < Struct.new(:string, :scrubbed)
+ alias_method :scrubbed?, :scrubbed
+end
+
def convert_string_to_utf8(s, suggested_character_encoding=nil)
begin
result = normalize_string_to_utf8 s, suggested_character_encoding
+ StringConversionResult.new(result, false)
rescue EncodingNormalizationError
- result = s
- if String.method_defined?(:encode)
- result = s.force_encoding("utf-8").encode("utf-8", :invalid => :replace,
- :undef => :replace,
- :replace => "")
- end
+ result = scrub(s)
+ StringConversionResult.new(result, true)
+ end
+end
+
+def scrub(string)
+ if String.method_defined?(:encode)
+ string = string.force_encoding("utf-8")
+ string.valid_encoding? ? string : string.encode("utf-16le", :invalid => :replace, :replace => "").encode("utf-8")
+ else
+ Iconv.conv('UTF-8//IGNORE', 'UTF-8', string)
end
- result
end
def log_text_details(message, text)
diff --git a/lib/tasks/config_files.rake b/lib/tasks/config_files.rake
index f6b25185e..d0dc8f594 100644
--- a/lib/tasks/config_files.rake
+++ b/lib/tasks/config_files.rake
@@ -21,6 +21,22 @@ namespace :config_files do
converted_lines
end
+ desc 'Convert wrapper example in config to a form suitable for running mail handling scripts with rbenv'
+ task :convert_wrapper => :environment do
+ example = 'rake config_files:convert_wrapper DEPLOY_USER=deploy SCRIPT_FILE=config/run-with-rbenv-path.example'
+ check_for_env_vars(['DEPLOY_USER',
+ 'SCRIPT_FILE'], example)
+
+ replacements = {
+ :user => ENV['DEPLOY_USER'],
+ }
+
+ # Generate the template for potential further processing
+ convert_ugly(ENV['SCRIPT_FILE'], replacements).each do |line|
+ puts line
+ end
+ end
+
desc 'Convert Debian example init script in config to a form suitable for installing in /etc/init.d'
task :convert_init_script => :environment do
example = 'rake config_files:convert_init_script DEPLOY_USER=deploy VHOST_DIR=/dir/above/alaveteli VCSPATH=alaveteli SITE=alaveteli SCRIPT_FILE=config/alert-tracks-debian.example'
diff --git a/lib/tasks/temp.rake b/lib/tasks/temp.rake
index 67fa10174..d5f7e8b22 100644
--- a/lib/tasks/temp.rake
+++ b/lib/tasks/temp.rake
@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
namespace :temp do
@@ -37,4 +38,99 @@ namespace :temp do
end
+ desc 'Look for and fix invalid UTF-8 text in various models. Should be run under ruby 1.9 or above'
+ task :fix_invalid_utf8 => :environment do
+
+ dryrun = ENV['DRYRUN'] != '0'
+ if dryrun
+ $stderr.puts "This is a dryrun - nothing will be changed"
+ end
+
+
+ PublicBody.find_each do |public_body|
+ unless public_body.name.valid_encoding?
+ name = convert_string_to_utf8(public_body.name).string # unwrap the StringConversionResult; a String must be logged and saved
+ puts "Bad encoding in PublicBody name, id: #{public_body.id}, " \
+ "old name: #{public_body.name.force_encoding('UTF-8')}, new name #{name}"
+ unless dryrun
+ public_body.name_will_change!
+ public_body.name = name
+ public_body.last_edit_editor = 'system'
+ public_body.last_edit_comment = 'Invalid utf-8 encoding fixed by temp:fix_invalid_utf8'
+ public_body.save!
+ end
+ end
+
+ # Editing old versions of public bodies - we don't want to affect the timestamp
+ PublicBody::Version.record_timestamps = false
+ public_body.versions.each do |public_body_version|
+ unless public_body_version.name.valid_encoding?
+ name = convert_string_to_utf8(public_body_version.name).string
+ puts "Bad encoding in PublicBody::Version name, " \
+ "id: #{public_body_version.id}, old name: #{public_body_version.name.force_encoding('UTF-8')}, " \
+ "new name: #{name}"
+ unless dryrun
+ public_body_version.name_will_change!
+ public_body_version.name = name
+ public_body_version.save!
+ end
+ end
+ end
+ PublicBody::Version.record_timestamps = true
+
+ end
+
+ IncomingMessage.find_each do |incoming_message|
+ if (incoming_message.cached_attachment_text_clipped &&
+ !incoming_message.cached_attachment_text_clipped.valid_encoding?) ||
+ (incoming_message.cached_main_body_text_folded &&
+ !incoming_message.cached_main_body_text_folded.valid_encoding?) ||
+ (incoming_message.cached_main_body_text_unfolded &&
+ !incoming_message.cached_main_body_text_unfolded.valid_encoding?)
+ puts "Bad encoding in IncomingMessage cached fields, :id #{incoming_message.id} "
+ unless dryrun
+ incoming_message.clear_in_database_caches!
+ end
+ end
+ end
+
+ FoiAttachment.find_each do |foi_attachment|
+ unescaped_filename = CGI.unescape(foi_attachment.filename)
+ unless unescaped_filename.valid_encoding?
+ filename = convert_string_to_utf8(unescaped_filename).string
+ puts "Bad encoding in FoiAttachment filename, id: #{foi_attachment.id} " \
+ "old filename #{unescaped_filename.force_encoding('UTF-8')}, new filename #{filename}"
+ unless dryrun
+ foi_attachment.filename = filename
+ foi_attachment.save!
+ end
+ end
+ end
+
+ OutgoingMessage.find_each do |outgoing_message|
+ unless outgoing_message.raw_body.valid_encoding?
+
+ raw_body = convert_string_to_utf8(outgoing_message.raw_body).string
+ puts "Bad encoding in OutgoingMessage raw_body, id: #{outgoing_message.id} " \
+ "old raw_body: #{outgoing_message.raw_body.force_encoding('UTF-8')}, new raw_body: #{raw_body}"
+ unless dryrun
+ outgoing_message.body = raw_body
+ outgoing_message.save!
+ end
+ end
+ end
+
+ User.find_each do |user|
+ unless user.name.valid_encoding?
+ name = convert_string_to_utf8(user.name).string
+ puts "Bad encoding in User name, id: #{user.id}, " \
+ "old name: #{user.name.force_encoding('UTF-8')}, new name: #{name}"
+ unless dryrun
+ user.name = name
+ user.save!
+ end
+ end
+ end
+
+ end
end