diff options
Diffstat (limited to 'app/models')
-rw-r--r-- | app/models/censor_rule.rb | 28 | ||||
-rw-r--r-- | app/models/foi_attachment.rb | 28 | ||||
-rw-r--r-- | app/models/incoming_message.rb | 89 | ||||
-rw-r--r-- | app/models/public_body.rb | 390 |
4 files changed, 275 insertions, 260 deletions
diff --git a/app/models/censor_rule.rb b/app/models/censor_rule.rb index f1f1a0d70..aec8a87cc 100644 --- a/app/models/censor_rule.rb +++ b/app/models/censor_rule.rb @@ -46,17 +46,17 @@ class CensorRule < ActiveRecord::Base def apply_to_text(text_to_censor) return nil if text_to_censor.nil? - text_to_censor.gsub(to_replace, replacement) + text_to_censor.gsub(to_replace('UTF-8'), replacement) end def apply_to_text!(text_to_censor) return nil if text_to_censor.nil? - text_to_censor.gsub!(to_replace, replacement) + text_to_censor.gsub!(to_replace('UTF-8'), replacement) end def apply_to_binary!(binary_to_censor) return nil if binary_to_censor.nil? - binary_to_censor.gsub!(to_replace) { |match| match.gsub(/./, 'x') } + binary_to_censor.gsub!(to_replace('ASCII-8BIT')) { |match| match.gsub(single_char_regexp, 'x') } end def is_global? @@ -65,6 +65,14 @@ class CensorRule < ActiveRecord::Base private + def single_char_regexp + if String.method_defined?(:encode) + Regexp.new('.'.force_encoding('ASCII-8BIT')) + else + Regexp.new('.', nil, 'N') + end + end + def require_user_request_or_public_body if info_request.nil? && user.nil? && public_body.nil? [:info_request, :user, :public_body].each do |a| @@ -75,18 +83,22 @@ class CensorRule < ActiveRecord::Base def require_valid_regexp begin - make_regexp + make_regexp('UTF-8') rescue RegexpError => e errors.add(:text, e.message) end end - def make_regexp - Regexp.new(text, Regexp::MULTILINE) + def to_replace(encoding) + regexp? ? make_regexp(encoding) : encoded_text(encoding) + end + + def encoded_text(encoding) + String.method_defined?(:encode) ? text.dup.force_encoding(encoding) : text end - def to_replace - regexp? ? make_regexp : text + def make_regexp(encoding) + Regexp.new(encoded_text(encoding), Regexp::MULTILINE) end end diff --git a/app/models/foi_attachment.rb b/app/models/foi_attachment.rb index 0af47b26e..37a9c9827 100644 --- a/app/models/foi_attachment.rb +++ b/app/models/foi_attachment.rb @@ -62,19 +62,18 @@ class FoiAttachment < ActiveRecord::Base } update_display_size! @cached_body = d + if String.method_defined?(:encode) + @cached_body = @cached_body.force_encoding("ASCII-8BIT") + end end + # raw body, encoded as binary def body if @cached_body.nil? tries = 0 delay = 1 begin - binary_data = File.open(self.filepath, "rb" ){ |file| file.read } - if self.content_type =~ /^text/ - @cached_body = convert_string_to_utf8_or_binary(binary_data, 'UTF-8') - else - @cached_body = binary_data - end + @cached_body = File.open(filepath, "rb" ){ |file| file.read } rescue Errno::ENOENT # we've lost our cached attachments for some reason. Reparse them. if tries > BODY_MAX_TRIES @@ -93,6 +92,17 @@ class FoiAttachment < ActiveRecord::Base return @cached_body end + # body as UTF-8 text, with scrubbing of invalid chars if needed + def body_as_text + convert_string_to_utf8(body, 'UTF-8') + end + + # for text types, the scrubbed UTF-8 text. For all other types, the + # raw binary + def default_body + text_type? ? body_as_text.string : body + end + # List of DSN codes taken from RFC 3463 # http://tools.ietf.org/html/rfc3463 DsnToMessage = { @@ -294,5 +304,11 @@ class FoiAttachment < ActiveRecord::Base AttachmentToHTML.to_html(self, to_html_opts) end + private + + def text_type? + AlaveteliTextMasker::TextMask.include?(content_type) + end + end diff --git a/app/models/incoming_message.rb b/app/models/incoming_message.rb index f28cae0c6..71b081560 100644 --- a/app/models/incoming_message.rb +++ b/app/models/incoming_message.rb @@ -372,41 +372,23 @@ class IncomingMessage < ActiveRecord::Base def _convert_part_body_to_text(part) if part.nil? text = "[ Email has no body, please see attachments ]" - source_charset = "utf-8" else - # by default, the body (coming from an foi_attachment) should have been converted to utf-8 - text = part.body - source_charset = part.charset + # whatever kind of attachment it is, get the UTF-8 encoded text + text = part.body_as_text.string if part.content_type == 'text/html' # e.g. http://www.whatdotheyknow.com/request/35/response/177 # TODO: This is a bit of a hack as it is calling a # convert to text routine. Could instead call a # sanitize HTML one. - - # If the text isn't UTF8, it means we had a problem - # converting it (invalid characters, etc), and we - # should instead tell elinks to respect the source - # charset - use_charset = "utf-8" - if String.method_defined?(:encode) - begin - text.encode('utf-8') - rescue Encoding::UndefinedConversionError, Encoding::InvalidByteSequenceError - use_charset = source_charset - end - else - begin - text = Iconv.conv('utf-8', 'utf-8', text) - rescue Iconv::IllegalSequence - use_charset = source_charset - end - end - text = MailHandler.get_attachment_text_one_file(part.content_type, text, use_charset) + text = MailHandler.get_attachment_text_one_file(part.content_type, text, "UTF-8") end end - # If text hasn't been converted, we sanitise it. - text = _sanitize_text(text) + # Add an annotation if the text had to be scrubbed + if part.body_as_text.scrubbed? + text += _("\n\n[ {{site_name}} note: The above text was badly encoded, and has had strange characters removed. ]", + :site_name => MySociety::Config.get('SITE_NAME', 'Alaveteli')) + end # Fix DOS style linefeeds to Unix style ones (or other later regexps won't work) text = text.gsub(/\r\n/, "\n") @@ -418,50 +400,6 @@ class IncomingMessage < ActiveRecord::Base return text end - def _sanitize_text(text) - if String.method_defined?(:encode) - begin - # Test if it's good UTF-8 - text.encode('utf-8') - rescue Encoding::UndefinedConversionError, Encoding::InvalidByteSequenceError - source_charset = 'utf-8' if source_charset.nil? - # strip out anything that isn't UTF-8 - begin - text = text.encode("utf-8", :invalid => :replace, - :undef => :replace, - :replace => "") + - _("\n\n[ {{site_name}} note: The above text was badly encoded, and has had strange characters removed. ]", - :site_name => MySociety::Config.get('SITE_NAME', 'Alaveteli')) - rescue Encoding::UndefinedConversionError, Encoding::InvalidByteSequenceError - if source_charset != "utf-8" - source_charset = "utf-8" - retry - end - end - end - else - begin - # Test if it's good UTF-8 - text = Iconv.conv('utf-8', 'utf-8', text) - rescue Iconv::IllegalSequence - # Text looks like unlabelled nonsense, - # strip out anything that isn't UTF-8 - begin - source_charset = 'utf-8' if source_charset.nil? - text = Iconv.conv('utf-8//IGNORE', source_charset, text) + - _("\n\n[ {{site_name}} note: The above text was badly encoded, and has had strange characters removed. ]", - :site_name => AlaveteliConfiguration::site_name) - rescue Iconv::InvalidEncoding, Iconv::IllegalSequence, Iconv::InvalidCharacter - if source_charset != "utf-8" - source_charset = "utf-8" - retry - end - end - end - end - text - end - # Returns part which contains main body text, or nil if there isn't one, # from a set of foi_attachments. If the leaves parameter is empty or not # supplied, uses its own foi_attachments. @@ -677,16 +615,7 @@ class IncomingMessage < ActiveRecord::Base end def _get_attachment_text_internal - text = self._extract_text - - # Remove any bad characters - if String.method_defined?(:encode) - # handle "problematic" encoding - text.encode!('UTF-16', 'UTF-8', :invalid => :replace, :undef => :replace, :replace => '') - text.encode('UTF-8', 'UTF-16') - else - Iconv.conv('utf-8//IGNORE', 'utf-8', text) - end + convert_string_to_utf8(_extract_text, 'UTF-8').string end # Returns text for indexing diff --git a/app/models/public_body.rb b/app/models/public_body.rb index 5d6e51534..ac924a941 100644 --- a/app/models/public_body.rb +++ b/app/models/public_body.rb @@ -34,9 +34,30 @@ require 'set' class PublicBody < ActiveRecord::Base include AdminColumn + class ImportCSVDryRun < StandardError ; end + @non_admin_columns = %w(name last_edit_comment) - strip_attributes! + attr_accessor :no_xapian_reindex + + # Default fields available for importing from CSV, in the format + # [field_name, 'short description of field (basic html allowed)'] + cattr_accessor :csv_import_fields do + [ + ['name', '(i18n)<strong>Existing records cannot be renamed</strong>'], + ['short_name', '(i18n)'], + ['request_email', '(i18n)'], + ['notes', '(i18n)'], + ['publication_scheme', '(i18n)'], + ['disclosure_log', '(i18n)'], + ['home_page', ''], + ['tag_string', '(tags separated by spaces)'], + ] + end + + has_many :info_requests, :order => 'created_at desc' + has_many :track_things, :order => 'created_at desc' + has_many :censor_rules, :order => 'created_at desc' validates_presence_of :name, :message => N_("Name can't be blank") validates_presence_of :url_name, :message => N_("URL name can't be blank") @@ -47,15 +68,8 @@ class PublicBody < ActiveRecord::Base validate :request_email_if_requestable - has_many :info_requests, :order => 'created_at desc' - has_many :track_things, :order => 'created_at desc' - has_many :censor_rules, :order => 'created_at desc' - attr_accessor :no_xapian_reindex - - has_tag_string - - before_save :set_api_key, - :set_default_publication_scheme + before_save :set_api_key!, :unless => :api_key + before_save :set_default_publication_scheme after_save :purge_in_cache after_update :reindex_requested_from @@ -67,44 +81,82 @@ class PublicBody < ActiveRecord::Base } } + acts_as_versioned + acts_as_xapian :texts => [:name, :short_name, :notes], + :values => [ + # for sorting + [:created_at_numeric, 1, "created_at", :number] + ], + :terms => [ + [:variety, 'V', "variety"], + [:tag_array_for_search, 'U', "tag"] + ] + has_tag_string + strip_attributes! translates :name, :short_name, :request_email, :url_name, :notes, :first_letter, :publication_scheme + # Cannot be grouped at top as it depends on the `translates` macro + include Translatable + + # Cannot be grouped at top as it depends on the `translates` macro include PublicBodyDerivedFields + + # Cannot be grouped at top as it depends on the `translates` macro class Translation include PublicBodyDerivedFields end - # Default fields available for importing from CSV, in the format - # [field_name, 'short description of field (basic html allowed)'] - cattr_accessor :csv_import_fields do - [ - ['name', '(i18n)<strong>Existing records cannot be renamed</strong>'], - ['short_name', '(i18n)'], - ['request_email', '(i18n)'], - ['notes', '(i18n)'], - ['publication_scheme', '(i18n)'], - ['disclosure_log', '(i18n)'], - ['home_page', ''], - ['tag_string', '(tags separated by spaces)'], - ] - end - - acts_as_xapian :texts => [ :name, :short_name, :notes ], - :values => [ - [ :created_at_numeric, 1, "created_at", :number ] # for sorting - ], - :terms => [ [ :variety, 'V', "variety" ], - [ :tag_array_for_search, 'U', "tag" ] - ] - - acts_as_versioned self.non_versioned_columns << 'created_at' << 'updated_at' << 'first_letter' << 'api_key' self.non_versioned_columns << 'info_requests_count' << 'info_requests_successful_count' self.non_versioned_columns << 'info_requests_count' << 'info_requests_visible_classified_count' self.non_versioned_columns << 'info_requests_not_held_count' << 'info_requests_overdue' self.non_versioned_columns << 'info_requests_overdue_count' - include Translatable + # Cannot be defined directly under `include` statements as this is opening + # the PublicBody::Version class dynamically defined by the + # `acts_as_versioned` macro. + # + # TODO: acts_as_versioned accepts an extend parameter [1] so these methods + # could be extracted to a module: + # + # acts_as_versioned :extend => PublicBodyVersionExtensions + # + # This includes the module in both the parent class (PublicBody) and the + # Version class (PublicBody::Version), so the behaviour is slightly + # different to opening up PublicBody::Version. + # + # We could add an `extend_version_class` option pretty trivially by + # following the pattern for the existing `extend` option. + # + # [1] http://git.io/vIetK + class Version + def last_edit_comment_for_html_display + text = self.last_edit_comment.strip + text = CGI.escapeHTML(text) + text = MySociety::Format.make_clickable(text) + text = text.gsub(/\n/, '<br>') + return text + end + + def compare(previous = nil) + if previous.nil? + yield([]) + else + v = self + changes = self.class.content_columns.inject([]) {|memo, c| + unless %w(version last_edit_editor last_edit_comment updated_at).include?(c.name) + from = previous.send(c.name) + to = self.send(c.name) + memo << { :name => c.human_name, :from => from, :to => to } if from != to + end + memo + } + changes.each do |change| + yield(change) + end + end + end + end # Public: Search for Public Bodies whose name, short_name, request_email or # tags contain the given query @@ -134,33 +186,12 @@ class PublicBody < ActiveRecord::Base uniq end - # TODO: - Don't like repeating this! - def calculate_cached_fields(t) - PublicBody.set_first_letter(t) - short_long_name = t.name - short_long_name = t.short_name if t.short_name and !t.short_name.empty? - t.url_name = MySociety::Format.simplify_url_part(short_long_name, 'body') - end - - # Set the first letter on a public body or translation - def self.set_first_letter(instance) - unless instance.name.nil? or instance.name.empty? - # we use a regex to ensure it works with utf-8/multi-byte - first_letter = Unicode.upcase instance.name.scan(/^./mu)[0] - if first_letter != instance.first_letter - instance.first_letter = first_letter - end - end - end - - def set_default_publication_scheme - # Make sure publication_scheme gets the correct default value. - # (This would work automatically, were publication_scheme not a translated attribute) - self.publication_scheme = "" if self.publication_scheme.nil? + def set_api_key + set_api_key! if api_key.nil? end - def set_api_key - self.api_key = SecureRandom.base64(33) if self.api_key.nil? + def set_api_key! + self.api_key = SecureRandom.base64(33) end # like find_by_url_name but also search historic url_name if none found @@ -188,15 +219,29 @@ class PublicBody < ActiveRecord::Base PublicBody.find(old.first) end - - # If tagged "not_apply", then FOI/EIR no longer applies to authority at all def not_apply? - return self.has_tag?('not_apply') + has_tag?('not_apply') end + # If tagged "defunct", then the authority no longer exists at all def defunct? - return self.has_tag?('defunct') + has_tag?('defunct') + end + + # Are all requests to this body under the Environmental Information + # Regulations? + def eir_only? + has_tag?('eir_only') + end + + # Schools are allowed more time in holidays, so we change some wordings + def is_school? + has_tag?('school') + end + + def site_administration? + has_tag?('site_administration') end # Can an FOI (etc.) request be made to this body? @@ -215,76 +260,39 @@ class PublicBody < ActiveRecord::Base # Also used as not_followable_reason def not_requestable_reason - if self.defunct? - return 'defunct' - elsif self.not_apply? - return 'not_apply' + if defunct? + 'defunct' + elsif not_apply? + 'not_apply' elsif !has_request_email? - return 'bad_contact' + 'bad_contact' else raise "not_requestable_reason called with type that has no reason" end end def special_not_requestable_reason? - self.defunct? || self.not_apply? - end - - - class Version - - def last_edit_comment_for_html_display - text = self.last_edit_comment.strip - text = CGI.escapeHTML(text) - text = MySociety::Format.make_clickable(text) - text = text.gsub(/\n/, '<br>') - return text - end - - def compare(previous = nil) - if previous.nil? - yield([]) - else - v = self - changes = self.class.content_columns.inject([]) {|memo, c| - unless %w(version last_edit_editor last_edit_comment updated_at).include?(c.name) - from = previous.send(c.name) - to = self.send(c.name) - memo << { :name => c.human_name, :from => from, :to => to } if from != to - end - memo - } - changes.each do |change| - yield(change) - end - end - end + defunct? || not_apply? end def created_at_numeric # format it here as no datetime support in Xapian's value ranges - return self.created_at.strftime("%Y%m%d%H%M%S") + created_at.strftime("%Y%m%d%H%M%S") end + def variety - return "authority" + "authority" end - # if the URL name has changed, then all requested_from: queries - # will break unless we update index for every event for every - # request linked to it - def reindex_requested_from - if self.changes.include?('url_name') - for info_request in self.info_requests - - for info_request_event in info_request.info_request_events - info_request_event.xapian_mark_needs_index - end - end - end + def law_only_short + eir_only? ? 'EIR' : 'FOI' end # Guess home page from the request email, or use explicit override, or nil # if not known. + # + # TODO: PublicBody#calculated_home_page would be a good candidate to cache + # in an instance variable def calculated_home_page if home_page && !home_page.empty? home_page[URI::regexp(%w(http https))] ? home_page : "http://#{home_page}" @@ -293,24 +301,6 @@ class PublicBody < ActiveRecord::Base end end - # Are all requests to this body under the Environmental Information Regulations? - def eir_only? - has_tag?('eir_only') - end - - def law_only_short - eir_only? ? 'EIR' : 'FOI' - end - - # Schools are allowed more time in holidays, so we change some wordings - def is_school? - has_tag?('school') - end - - def site_administration? - has_tag?('site_administration') - end - # The "internal admin" is a special body for internal use. def self.internal_admin_body # Use find_by_sql to avoid the search being specific to a @@ -336,9 +326,6 @@ class PublicBody < ActiveRecord::Base end end - class ImportCSVDryRun < StandardError - end - # Import from a string in CSV format. # Just tests things and returns messages if dry_run is true. # Returns an array of [array of errors, array of notes]. If there @@ -518,16 +505,10 @@ class PublicBody < ActiveRecord::Base # Does this user have the power of FOI officer for this body? def is_foi_officer?(user) user_domain = user.email_domain - our_domain = self.request_email_domain + our_domain = request_email_domain - if user_domain.nil? or our_domain.nil? - return false - end - - return our_domain == user_domain - end - def foi_officer_domain_required - return self.request_email_domain + return false if user_domain.nil? or our_domain.nil? + our_domain == user_domain end def request_email @@ -540,11 +521,15 @@ class PublicBody < ActiveRecord::Base # Domain name of the request email def request_email_domain - return PublicBody.extract_domain_from_email(self.request_email) + PublicBody.extract_domain_from_email(request_email) end + alias_method :foi_officer_domain_required, :request_email_domain + # Return the domain part of an email address, canonicalised and with common # extra UK Government server name parts removed. + # + # TODO: Extract to library class def self.extract_domain_from_email(email) email =~ /@(.*)/ if $1.nil? @@ -562,45 +547,53 @@ class PublicBody < ActiveRecord::Base return ret end + # TODO: Could this be defined as `sorted_versions.reverse`? def reverse_sorted_versions - self.versions.sort { |a,b| b.version <=> a.version } + versions.sort { |a,b| b.version <=> a.version } end + def sorted_versions - self.versions.sort { |a,b| a.version <=> b.version } + versions.sort { |a,b| a.version <=> b.version } end def has_notes? - return !self.notes.nil? && self.notes != "" + !notes.nil? && notes != "" end + + # TODO: Deprecate this method. Its only used in a couple of views so easy to + # update to just call PublicBody#notes def notes_as_html - self.notes + notes end def notes_without_html - # assume notes are reasonably behaved HTML, so just use simple regexp on this - @notes_without_html ||= (self.notes.nil? ? '' : self.notes.gsub(/<\/?[^>]*>/, "")) + # assume notes are reasonably behaved HTML, so just use simple regexp + # on this + @notes_without_html ||= (notes.nil? ? '' : notes.gsub(/<\/?[^>]*>/, "")) end def json_for_api - return { - :id => self.id, - :url_name => self.url_name, - :name => self.name, - :short_name => self.short_name, - # :request_email # we hide this behind a captcha, to stop people doing bulk requests easily - :created_at => self.created_at, - :updated_at => self.updated_at, - # don't add the history as some edit comments contain sensitive information + { + :id => id, + :url_name => url_name, + :name => name, + :short_name => short_name, + # :request_email # we hide this behind a captcha, to stop people + # doing bulk requests easily + :created_at => created_at, + :updated_at => updated_at, + # don't add the history as some edit comments contain sensitive + # information # :version, :last_edit_editor, :last_edit_comment - :home_page => self.calculated_home_page, - :notes => self.notes, - :publication_scheme => self.publication_scheme, - :tags => self.tag_array, + :home_page => calculated_home_page, + :notes => notes, + :publication_scheme => publication_scheme, + :tags => tag_array, } end def purge_in_cache - self.info_requests.each {|x| x.purge_in_cache} + info_requests.each { |x| x.purge_in_cache } end def self.where_clause_for_stats(minimum_requests, total_column) @@ -673,6 +666,7 @@ class PublicBody < ActiveRecord::Base 'y_max' => 100, 'totals' => original_totals} end + def self.popular_bodies(locale) # get some example searches and public bodies to display # either from config, or based on a (slow!) query if not set @@ -699,6 +693,70 @@ class PublicBody < ActiveRecord::Base return bodies end + # Methods to privatise + # -------------------------------------------------------------------------- + + # TODO: This could be removed by updating the default value (to '') of the + # `publication_scheme` column in the `public_body_translations` table. + # + # TODO: Can't actually deprecate this because spec/script/mailin_spec.rb:28 + # fails due to the deprecation notice output + def set_default_publication_scheme + # warn %q([DEPRECATION] PublicBody#set_default_publication_scheme will + # become a private method in 0.23).squish + + # Make sure publication_scheme gets the correct default value. + # (This would work automatically, were publication_scheme not a + # translated attribute) + self.publication_scheme = "" if publication_scheme.nil? + end + + # if the URL name has changed, then all requested_from: queries + # will break unless we update index for every event for every + # request linked to it + # + # TODO: Can't actually deprecate this because spec/script/mailin_spec.rb:28 + # fails due to the deprecation notice output + def reindex_requested_from + # warn %q([DEPRECATION] PublicBody#reindex_requested_from will become a + # private method in 0.23).squish + + if changes.include?('url_name') + info_requests.each do |info_request| + info_request.info_request_events.each do |info_request_event| + info_request_event.xapian_mark_needs_index + end + end + end + end + + # Methods to remove + # -------------------------------------------------------------------------- + + # Set the first letter on a public body or translation + def self.set_first_letter(instance) + warn %q([DEPRECATION] PublicBody.set_first_letter will be removed + in 0.23).squish + + unless instance.name.nil? or instance.name.empty? + # we use a regex to ensure it works with utf-8/multi-byte + first_letter = Unicode.upcase instance.name.scan(/^./mu)[0] + if first_letter != instance.first_letter + instance.first_letter = first_letter + end + end + end + + def calculate_cached_fields(t) + warn %q([DEPRECATION] PublicBody#calculate_cached_fields will be removed + in 0.23).squish + + PublicBody.set_first_letter(t) + short_long_name = t.name + short_long_name = t.short_name if t.short_name and !t.short_name.empty? + t.url_name = MySociety::Format.simplify_url_part(short_long_name, 'body') + end + private # Read an attribute value (without using locale fallbacks if the attribute is translated) |