diff options
Diffstat (limited to 'app')
-rw-r--r-- | app/controllers/request_controller.rb | 8 | ||||
-rw-r--r-- | app/models/censor_rule.rb | 28 | ||||
-rw-r--r-- | app/models/foi_attachment.rb | 28 | ||||
-rw-r--r-- | app/models/incoming_message.rb | 89 |
4 files changed, 55 insertions, 98 deletions
diff --git a/app/controllers/request_controller.rb b/app/controllers/request_controller.rb index 45229fd7e..26e3b350c 100644 --- a/app/controllers/request_controller.rb +++ b/app/controllers/request_controller.rb @@ -763,12 +763,12 @@ class RequestController < ApplicationController # Prevent spam to magic request address. Note that the binary # subsitution method used depends on the content type - @incoming_message.apply_masks!(@attachment.body, @attachment.content_type) + body = @attachment.default_body + @incoming_message.apply_masks!(body, @attachment.content_type) if response.content_type == 'text/html' - @attachment.body = ActionController::Base.helpers.sanitize(@attachment.body) + body = ActionController::Base.helpers.sanitize(body) end - - render :text => @attachment.body + render :text => body end def get_attachment_as_html diff --git a/app/models/censor_rule.rb b/app/models/censor_rule.rb index f1f1a0d70..aec8a87cc 100644 --- a/app/models/censor_rule.rb +++ b/app/models/censor_rule.rb @@ -46,17 +46,17 @@ class CensorRule < ActiveRecord::Base def apply_to_text(text_to_censor) return nil if text_to_censor.nil? - text_to_censor.gsub(to_replace, replacement) + text_to_censor.gsub(to_replace('UTF-8'), replacement) end def apply_to_text!(text_to_censor) return nil if text_to_censor.nil? - text_to_censor.gsub!(to_replace, replacement) + text_to_censor.gsub!(to_replace('UTF-8'), replacement) end def apply_to_binary!(binary_to_censor) return nil if binary_to_censor.nil? - binary_to_censor.gsub!(to_replace) { |match| match.gsub(/./, 'x') } + binary_to_censor.gsub!(to_replace('ASCII-8BIT')) { |match| match.gsub(single_char_regexp, 'x') } end def is_global? @@ -65,6 +65,14 @@ class CensorRule < ActiveRecord::Base private + def single_char_regexp + if String.method_defined?(:encode) + Regexp.new('.'.force_encoding('ASCII-8BIT')) + else + Regexp.new('.', nil, 'N') + end + end + def require_user_request_or_public_body if info_request.nil? && user.nil? && public_body.nil? [:info_request, :user, :public_body].each do |a| @@ -75,18 +83,22 @@ class CensorRule < ActiveRecord::Base def require_valid_regexp begin - make_regexp + make_regexp('UTF-8') rescue RegexpError => e errors.add(:text, e.message) end end - def make_regexp - Regexp.new(text, Regexp::MULTILINE) + def to_replace(encoding) + regexp? ? make_regexp(encoding) : encoded_text(encoding) + end + + def encoded_text(encoding) + String.method_defined?(:encode) ? text.dup.force_encoding(encoding) : text end - def to_replace - regexp? ? make_regexp : text + def make_regexp(encoding) + Regexp.new(encoded_text(encoding), Regexp::MULTILINE) end end diff --git a/app/models/foi_attachment.rb b/app/models/foi_attachment.rb index 0af47b26e..37a9c9827 100644 --- a/app/models/foi_attachment.rb +++ b/app/models/foi_attachment.rb @@ -62,19 +62,18 @@ class FoiAttachment < ActiveRecord::Base } update_display_size! @cached_body = d + if String.method_defined?(:encode) + @cached_body = @cached_body.force_encoding("ASCII-8BIT") + end end + # raw body, encoded as binary def body if @cached_body.nil? tries = 0 delay = 1 begin - binary_data = File.open(self.filepath, "rb" ){ |file| file.read } - if self.content_type =~ /^text/ - @cached_body = convert_string_to_utf8_or_binary(binary_data, 'UTF-8') - else - @cached_body = binary_data - end + @cached_body = File.open(filepath, "rb" ){ |file| file.read } rescue Errno::ENOENT # we've lost our cached attachments for some reason. Reparse them. if tries > BODY_MAX_TRIES @@ -93,6 +92,17 @@ class FoiAttachment < ActiveRecord::Base return @cached_body end + # body as UTF-8 text, with scrubbing of invalid chars if needed + def body_as_text + convert_string_to_utf8(body, 'UTF-8') + end + + # for text types, the scrubbed UTF-8 text. For all other types, the + # raw binary + def default_body + text_type? ? body_as_text.string : body + end + # List of DSN codes taken from RFC 3463 # http://tools.ietf.org/html/rfc3463 DsnToMessage = { @@ -294,5 +304,11 @@ class FoiAttachment < ActiveRecord::Base AttachmentToHTML.to_html(self, to_html_opts) end + private + + def text_type? + AlaveteliTextMasker::TextMask.include?(content_type) + end + end diff --git a/app/models/incoming_message.rb b/app/models/incoming_message.rb index f28cae0c6..71b081560 100644 --- a/app/models/incoming_message.rb +++ b/app/models/incoming_message.rb @@ -372,41 +372,23 @@ class IncomingMessage < ActiveRecord::Base def _convert_part_body_to_text(part) if part.nil? text = "[ Email has no body, please see attachments ]" - source_charset = "utf-8" else - # by default, the body (coming from an foi_attachment) should have been converted to utf-8 - text = part.body - source_charset = part.charset + # whatever kind of attachment it is, get the UTF-8 encoded text + text = part.body_as_text.string if part.content_type == 'text/html' # e.g. http://www.whatdotheyknow.com/request/35/response/177 # TODO: This is a bit of a hack as it is calling a # convert to text routine. Could instead call a # sanitize HTML one. - - # If the text isn't UTF8, it means we had a problem - # converting it (invalid characters, etc), and we - # should instead tell elinks to respect the source - # charset - use_charset = "utf-8" - if String.method_defined?(:encode) - begin - text.encode('utf-8') - rescue Encoding::UndefinedConversionError, Encoding::InvalidByteSequenceError - use_charset = source_charset - end - else - begin - text = Iconv.conv('utf-8', 'utf-8', text) - rescue Iconv::IllegalSequence - use_charset = source_charset - end - end - text = MailHandler.get_attachment_text_one_file(part.content_type, text, use_charset) + text = MailHandler.get_attachment_text_one_file(part.content_type, text, "UTF-8") end end - # If text hasn't been converted, we sanitise it. - text = _sanitize_text(text) + # Add an annotation if the text had to be scrubbed + if part.body_as_text.scrubbed? + text += _("\n\n[ {{site_name}} note: The above text was badly encoded, and has had strange characters removed. ]", + :site_name => MySociety::Config.get('SITE_NAME', 'Alaveteli')) + end # Fix DOS style linefeeds to Unix style ones (or other later regexps won't work) text = text.gsub(/\r\n/, "\n") @@ -418,50 +400,6 @@ class IncomingMessage < ActiveRecord::Base return text end - def _sanitize_text(text) - if String.method_defined?(:encode) - begin - # Test if it's good UTF-8 - text.encode('utf-8') - rescue Encoding::UndefinedConversionError, Encoding::InvalidByteSequenceError - source_charset = 'utf-8' if source_charset.nil? - # strip out anything that isn't UTF-8 - begin - text = text.encode("utf-8", :invalid => :replace, - :undef => :replace, - :replace => "") + - _("\n\n[ {{site_name}} note: The above text was badly encoded, and has had strange characters removed. ]", - :site_name => MySociety::Config.get('SITE_NAME', 'Alaveteli')) - rescue Encoding::UndefinedConversionError, Encoding::InvalidByteSequenceError - if source_charset != "utf-8" - source_charset = "utf-8" - retry - end - end - end - else - begin - # Test if it's good UTF-8 - text = Iconv.conv('utf-8', 'utf-8', text) - rescue Iconv::IllegalSequence - # Text looks like unlabelled nonsense, - # strip out anything that isn't UTF-8 - begin - source_charset = 'utf-8' if source_charset.nil? - text = Iconv.conv('utf-8//IGNORE', source_charset, text) + - _("\n\n[ {{site_name}} note: The above text was badly encoded, and has had strange characters removed. ]", - :site_name => AlaveteliConfiguration::site_name) - rescue Iconv::InvalidEncoding, Iconv::IllegalSequence, Iconv::InvalidCharacter - if source_charset != "utf-8" - source_charset = "utf-8" - retry - end - end - end - end - text - end - # Returns part which contains main body text, or nil if there isn't one, # from a set of foi_attachments. If the leaves parameter is empty or not # supplied, uses its own foi_attachments. @@ -677,16 +615,7 @@ class IncomingMessage < ActiveRecord::Base end def _get_attachment_text_internal - text = self._extract_text - - # Remove any bad characters - if String.method_defined?(:encode) - # handle "problematic" encoding - text.encode!('UTF-16', 'UTF-8', :invalid => :replace, :undef => :replace, :replace => '') - text.encode('UTF-8', 'UTF-16') - else - Iconv.conv('utf-8//IGNORE', 'utf-8', text) - end + convert_string_to_utf8(_extract_text, 'UTF-8').string end # Returns text for indexing |