about summary refs log tree commit diff stats
path: root/app/models
diff options
context:
space:
mode:
Diffstat (limited to 'app/models')
-rw-r--r-- app/models/censor_rule.rb 28
-rw-r--r-- app/models/foi_attachment.rb 28
-rw-r--r-- app/models/incoming_message.rb 89
3 files changed, 51 insertions, 94 deletions
diff --git a/app/models/censor_rule.rb b/app/models/censor_rule.rb
index f1f1a0d70..aec8a87cc 100644
--- a/app/models/censor_rule.rb
+++ b/app/models/censor_rule.rb
@@ -46,17 +46,17 @@ class CensorRule < ActiveRecord::Base
def apply_to_text(text_to_censor)
return nil if text_to_censor.nil?
- text_to_censor.gsub(to_replace, replacement)
+ text_to_censor.gsub(to_replace('UTF-8'), replacement)
end
def apply_to_text!(text_to_censor)
return nil if text_to_censor.nil?
- text_to_censor.gsub!(to_replace, replacement)
+ text_to_censor.gsub!(to_replace('UTF-8'), replacement)
end
def apply_to_binary!(binary_to_censor)
return nil if binary_to_censor.nil?
- binary_to_censor.gsub!(to_replace) { |match| match.gsub(/./, 'x') }
+ binary_to_censor.gsub!(to_replace('ASCII-8BIT')) { |match| match.gsub(single_char_regexp, 'x') }
end
def is_global?
@@ -65,6 +65,14 @@ class CensorRule < ActiveRecord::Base
private
+ def single_char_regexp
+ if String.method_defined?(:encode)
+ Regexp.new('.'.force_encoding('ASCII-8BIT'))
+ else
+ Regexp.new('.', nil, 'N')
+ end
+ end
+
def require_user_request_or_public_body
if info_request.nil? && user.nil? && public_body.nil?
[:info_request, :user, :public_body].each do |a|
@@ -75,18 +83,22 @@ class CensorRule < ActiveRecord::Base
def require_valid_regexp
begin
- make_regexp
+ make_regexp('UTF-8')
rescue RegexpError => e
errors.add(:text, e.message)
end
end
- def make_regexp
- Regexp.new(text, Regexp::MULTILINE)
+ def to_replace(encoding)
+ regexp? ? make_regexp(encoding) : encoded_text(encoding)
+ end
+
+ def encoded_text(encoding)
+ String.method_defined?(:encode) ? text.dup.force_encoding(encoding) : text
end
- def to_replace
- regexp? ? make_regexp : text
+ def make_regexp(encoding)
+ Regexp.new(encoded_text(encoding), Regexp::MULTILINE)
end
end
diff --git a/app/models/foi_attachment.rb b/app/models/foi_attachment.rb
index 0af47b26e..37a9c9827 100644
--- a/app/models/foi_attachment.rb
+++ b/app/models/foi_attachment.rb
@@ -62,19 +62,18 @@ class FoiAttachment < ActiveRecord::Base
}
update_display_size!
@cached_body = d
+ if String.method_defined?(:encode)
+ @cached_body = @cached_body.force_encoding("ASCII-8BIT")
+ end
end
+ # raw body, encoded as binary
def body
if @cached_body.nil?
tries = 0
delay = 1
begin
- binary_data = File.open(self.filepath, "rb" ){ |file| file.read }
- if self.content_type =~ /^text/
- @cached_body = convert_string_to_utf8_or_binary(binary_data, 'UTF-8')
- else
- @cached_body = binary_data
- end
+ @cached_body = File.open(filepath, "rb" ){ |file| file.read }
rescue Errno::ENOENT
# we've lost our cached attachments for some reason. Reparse them.
if tries > BODY_MAX_TRIES
@@ -93,6 +92,17 @@ class FoiAttachment < ActiveRecord::Base
return @cached_body
end
+ # body as UTF-8 text, with scrubbing of invalid chars if needed
+ def body_as_text
+ convert_string_to_utf8(body, 'UTF-8')
+ end
+
+ # for text types, the scrubbed UTF-8 text. For all other types, the
+ # raw binary
+ def default_body
+ text_type? ? body_as_text.string : body
+ end
+
# List of DSN codes taken from RFC 3463
# http://tools.ietf.org/html/rfc3463
DsnToMessage = {
@@ -294,5 +304,11 @@ class FoiAttachment < ActiveRecord::Base
AttachmentToHTML.to_html(self, to_html_opts)
end
+ private
+
+ def text_type?
+ AlaveteliTextMasker::TextMask.include?(content_type)
+ end
+
end
diff --git a/app/models/incoming_message.rb b/app/models/incoming_message.rb
index f28cae0c6..71b081560 100644
--- a/app/models/incoming_message.rb
+++ b/app/models/incoming_message.rb
@@ -372,41 +372,23 @@ class IncomingMessage < ActiveRecord::Base
def _convert_part_body_to_text(part)
if part.nil?
text = "[ Email has no body, please see attachments ]"
- source_charset = "utf-8"
else
- # by default, the body (coming from an foi_attachment) should have been converted to utf-8
- text = part.body
- source_charset = part.charset
+ # whatever kind of attachment it is, get the UTF-8 encoded text
+ text = part.body_as_text.string
if part.content_type == 'text/html'
# e.g. http://www.whatdotheyknow.com/request/35/response/177
# TODO: This is a bit of a hack as it is calling a
# convert to text routine. Could instead call a
# sanitize HTML one.
-
- # If the text isn't UTF8, it means we had a problem
- # converting it (invalid characters, etc), and we
- # should instead tell elinks to respect the source
- # charset
- use_charset = "utf-8"
- if String.method_defined?(:encode)
- begin
- text.encode('utf-8')
- rescue Encoding::UndefinedConversionError, Encoding::InvalidByteSequenceError
- use_charset = source_charset
- end
- else
- begin
- text = Iconv.conv('utf-8', 'utf-8', text)
- rescue Iconv::IllegalSequence
- use_charset = source_charset
- end
- end
- text = MailHandler.get_attachment_text_one_file(part.content_type, text, use_charset)
+ text = MailHandler.get_attachment_text_one_file(part.content_type, text, "UTF-8")
end
end
- # If text hasn't been converted, we sanitise it.
- text = _sanitize_text(text)
+ # Add an annotation if the text had to be scrubbed
+ if part.body_as_text.scrubbed?
+ text += _("\n\n[ {{site_name}} note: The above text was badly encoded, and has had strange characters removed. ]",
+ :site_name => MySociety::Config.get('SITE_NAME', 'Alaveteli'))
+ end
# Fix DOS style linefeeds to Unix style ones (or other later regexps won't work)
text = text.gsub(/\r\n/, "\n")
@@ -418,50 +400,6 @@ class IncomingMessage < ActiveRecord::Base
return text
end
- def _sanitize_text(text)
- if String.method_defined?(:encode)
- begin
- # Test if it's good UTF-8
- text.encode('utf-8')
- rescue Encoding::UndefinedConversionError, Encoding::InvalidByteSequenceError
- source_charset = 'utf-8' if source_charset.nil?
- # strip out anything that isn't UTF-8
- begin
- text = text.encode("utf-8", :invalid => :replace,
- :undef => :replace,
- :replace => "") +
- _("\n\n[ {{site_name}} note: The above text was badly encoded, and has had strange characters removed. ]",
- :site_name => MySociety::Config.get('SITE_NAME', 'Alaveteli'))
- rescue Encoding::UndefinedConversionError, Encoding::InvalidByteSequenceError
- if source_charset != "utf-8"
- source_charset = "utf-8"
- retry
- end
- end
- end
- else
- begin
- # Test if it's good UTF-8
- text = Iconv.conv('utf-8', 'utf-8', text)
- rescue Iconv::IllegalSequence
- # Text looks like unlabelled nonsense,
- # strip out anything that isn't UTF-8
- begin
- source_charset = 'utf-8' if source_charset.nil?
- text = Iconv.conv('utf-8//IGNORE', source_charset, text) +
- _("\n\n[ {{site_name}} note: The above text was badly encoded, and has had strange characters removed. ]",
- :site_name => AlaveteliConfiguration::site_name)
- rescue Iconv::InvalidEncoding, Iconv::IllegalSequence, Iconv::InvalidCharacter
- if source_charset != "utf-8"
- source_charset = "utf-8"
- retry
- end
- end
- end
- end
- text
- end
-
# Returns part which contains main body text, or nil if there isn't one,
# from a set of foi_attachments. If the leaves parameter is empty or not
# supplied, uses its own foi_attachments.
@@ -677,16 +615,7 @@ class IncomingMessage < ActiveRecord::Base
end
def _get_attachment_text_internal
- text = self._extract_text
-
- # Remove any bad characters
- if String.method_defined?(:encode)
- # handle "problematic" encoding
- text.encode!('UTF-16', 'UTF-8', :invalid => :replace, :undef => :replace, :replace => '')
- text.encode('UTF-8', 'UTF-16')
- else
- Iconv.conv('utf-8//IGNORE', 'utf-8', text)
- end
+ convert_string_to_utf8(_extract_text, 'UTF-8').string
end
# Returns text for indexing