diff options
-rw-r--r-- | app/controllers/admin_controller.rb | 6 | ||||
-rw-r--r-- | app/controllers/application.rb | 10 | ||||
-rw-r--r-- | app/models/incoming_message.rb | 77 | ||||
-rw-r--r-- | app/models/info_request.rb | 8 | ||||
-rw-r--r-- | app/models/info_request_event.rb | 12 | ||||
-rw-r--r-- | app/views/request/_request_listing_via_event.rhtml | 6 | ||||
-rw-r--r-- | app/views/track_mailer/event_digest.rhtml | 2 | ||||
-rw-r--r-- | db/migrate/080_cache_only_clipped_attachment_text.rb | 10 | ||||
-rw-r--r-- | db/schema.rb | 12 | ||||
-rwxr-xr-x | script/clear-incoming-text-cache | 9 | ||||
-rw-r--r-- | todo.txt | 10 |
11 files changed, 103 insertions, 59 deletions
diff --git a/app/controllers/admin_controller.rb b/app/controllers/admin_controller.rb index 32882f8a4..7f8cfbd67 100644 --- a/app/controllers/admin_controller.rb +++ b/app/controllers/admin_controller.rb @@ -4,7 +4,7 @@ # Copyright (c) 2009 UK Citizens Online Democracy. All rights reserved. # Email: francis@mysociety.org; WWW: http://www.mysociety.org/ # -# $Id: admin_controller.rb,v 1.28 2009-09-15 20:46:35 francis Exp $ +# $Id: admin_controller.rb,v 1.29 2009-09-17 10:24:35 francis Exp $ require 'fileutils' @@ -34,6 +34,10 @@ class AdminController < ApplicationController cache_subpath = File.join(self.cache_store.cache_path, "views/request/#{info_request.id}") FileUtils.rm_rf(cache_subpath) + # Remove the database caches of body / attachment text (the attachment text + # one is after privacy rules are applied) + info_request.clear_in_database_caches! + # also force a search reindexing (so changed text reflected in search) info_request.reindex_request_events end diff --git a/app/controllers/application.rb b/app/controllers/application.rb index 8c4410b65..0d3664d4d 100644 --- a/app/controllers/application.rb +++ b/app/controllers/application.rb @@ -6,15 +6,21 @@ # Copyright (c) 2007 UK Citizens Online Democracy. All rights reserved. # Email: francis@mysociety.org; WWW: http://www.mysociety.org/ # -# $Id: application.rb,v 1.57 2009-09-15 11:47:20 francis Exp $ +# $Id: application.rb,v 1.58 2009-09-17 10:24:35 francis Exp $ class ApplicationController < ActionController::Base # Standard headers, footers and navigation for whole site layout "default" - # Help work out which request causes RAM spike + # Help work out which request causes RAM spike. # http://www.codeweblog.com/rails-to-monitor-the-process-of-memory-leaks-skills/ + # This shows the memory use increase of the Ruby process due to the request. + # Since Ruby never returns memory to the OS, if the existing process previously + # served a larger request, this won't show any consumption for the later request. + # Ruby also grabs memory from the OS in variously sized jumps, so the extra + # consumption of a request shown by this function will only appear in such + # jumps. around_filter :record_memory def record_memory File.read("/proc/#{Process.pid}/status").match(/VmRSS:\s+(\d+)/) diff --git a/app/models/incoming_message.rb b/app/models/incoming_message.rb index 17a8b6ba1..ca2d07263 100644 --- a/app/models/incoming_message.rb +++ b/app/models/incoming_message.rb @@ -19,7 +19,7 @@ # Copyright (c) 2007 UK Citizens Online Democracy. All rights reserved. # Email: francis@mysociety.org; WWW: http://www.mysociety.org/ # -# $Id: incoming_message.rb,v 1.223 2009-09-15 21:30:39 francis Exp $ +# $Id: incoming_message.rb,v 1.224 2009-09-17 10:24:35 francis Exp $ # TODO # Move some of the (e.g. quoting) functions here into rblib, as they feel @@ -770,6 +770,13 @@ class IncomingMessage < ActiveRecord::Base return leaves_found end + # Removes anything cached about the object in the database, and saves + def clear_in_database_caches! + self.cached_attachment_text_clipped = nil + self.cached_main_body_text = nil + self.save! + end + # Returns body text from main text part of email, converted to UTF-8, with uudecode removed # XXX returns a .dup of the text, so calling functions can in place modify it def get_main_body_text @@ -791,10 +798,10 @@ class IncomingMessage < ActiveRecord::Base # Returns body text from main text part of email, converted to UTF-8 def get_main_body_text_internal main_part = get_main_body_text_part - return convert_part_body_to_text(main_part) + return _convert_part_body_to_text(main_part) end # Given a main text part, converts it to text - def convert_part_body_to_text(part) + def _convert_part_body_to_text(part) if part.nil? text = "[ Email has no body, please see attachments ]" text_charset = "utf-8" @@ -805,7 +812,7 @@ class IncomingMessage < ActiveRecord::Base # e.g. http://www.whatdotheyknow.com/request/35/response/177 # XXX This is a bit of a hack as it is calling a convert to text routine. # Could instead call a sanitize HTML one. - text = IncomingMessage.get_attachment_text_internal_one_file(part.content_type, text) + text = IncomingMessage._get_attachment_text_internal_one_file(part.content_type, text) end end @@ -960,7 +967,7 @@ class IncomingMessage < ActiveRecord::Base headers = headers + header + ": " + leaf.within_rfc822_attachment.header[header.downcase].to_s + "\n" end end - # XXX call convert_part_body_to_text here, but need to get charset somehow + # XXX call _convert_part_body_to_text here, but need to get charset somehow # e.g. http://www.whatdotheyknow.com/request/1593/response/3088/attach/4/Freedom%20of%20Information%20request%20-%20car%20oval%20sticker:%20Article%2020,%20Convention%20on%20Road%20Traffic%201949.txt attachment.body = headers + "\n" + attachment.body @@ -1041,30 +1048,36 @@ class IncomingMessage < ActiveRecord::Base text = IncomingMessage.remove_quoted_sections(text, "") end - # Returns text version of attachment text - def get_attachment_text - #STDOUT.puts 'start ' + MySociety::DebugHelpers::allocated_string_size_around_gc - if self.cached_attachment_text.nil? - attachment_text = self.get_attachment_text_internal - #STDOUT.puts 'after get_attachment_text_internal ' + MySociety::DebugHelpers::allocated_string_size_around_gc - self.cached_attachment_text = attachment_text - #STDOUT.puts 'after assign to cached_attachment_text ' + MySociety::DebugHelpers::allocated_string_size_around_gc - self.save! - #STDOUT.puts 'after save!' + MySociety::DebugHelpers::allocated_string_size_around_gc - end - #STDOUT.puts 'after cache ' + MySociety::DebugHelpers::allocated_string_size_around_gc + MAX_ATTACHMENT_TEXT_CLIPPED = 1000000 # 1Mb ish - # Remove any privacy things - #STDOUT.puts 'before dup ' + MySociety::DebugHelpers::allocated_string_size_around_gc - text = self.cached_attachment_text.dup - #STDOUT.puts 'before mask_special_emails ' + MySociety::DebugHelpers::allocated_string_size_around_gc + # Returns text version of attachment text + def get_attachment_text_full + text = self._get_attachment_text_internal self.mask_special_emails!(text) - #STDOUT.puts 'after mask_special_emails ' + MySociety::DebugHelpers::allocated_string_size_around_gc self.remove_privacy_sensitive_things!(text) - #STDOUT.puts 'after remove_privacy_sensitive_things ' + MySociety::DebugHelpers::allocated_string_size_around_gc + # This can be useful for memory debugging + #STDOUT.puts 'xxx '+ MySociety::DebugHelpers::allocated_string_size_around_gc + + # Save clipped version for snippets + if self.cached_attachment_text_clipped.nil? + self.cached_attachment_text_clipped = text[0..MAX_ATTACHMENT_TEXT_CLIPPED] + self.save! + end + return text end - def IncomingMessage.get_attachment_text_internal_one_file(content_type, body) + # Returns a version reduced to a sensible maximum size - this + # is for performance reasons when showing snippets in search results. + def get_attachment_text_clipped + if self.cached_attachment_text_clipped.nil? + # As side effect, get_attachment_text_full makes snippet text + attachment_text = self.get_attachment_text_full + raise "internal error" if self.cached_attachment_text_clipped.nil? + end + + return self.cached_attachment_text_clipped + end + def IncomingMessage._get_attachment_text_internal_one_file(content_type, body) text = '' # XXX - tell all these command line tools to return utf-8 if content_type == 'text/plain' @@ -1141,7 +1154,7 @@ class IncomingMessage < ActiveRecord::Base end #STDERR.puts("doing file " + filename + " content type " + content_type) - text += IncomingMessage.get_attachment_text_internal_one_file(content_type, body) + text += IncomingMessage._get_attachment_text_internal_one_file(content_type, body) end end end @@ -1150,12 +1163,12 @@ class IncomingMessage < ActiveRecord::Base return text end - def get_attachment_text_internal + def _get_attachment_text_internal # Extract text from each attachment text = '' attachments = self.get_attachments_for_display for attachment in attachments - text += IncomingMessage.get_attachment_text_internal_one_file(attachment.content_type, attachment.body) + text += IncomingMessage._get_attachment_text_internal_one_file(attachment.content_type, attachment.body) end # Remove any bad characters text = Iconv.conv('utf-8//IGNORE', 'utf-8', text) @@ -1163,12 +1176,12 @@ class IncomingMessage < ActiveRecord::Base end # Returns text for indexing - def get_text_for_indexing - return get_body_for_quoting + "\n\n" + get_attachment_text + def get_text_for_indexing_full + return get_body_for_quoting + "\n\n" + get_attachment_text_full end - # Used when there are no highlight words, so full attachment text not required - def get_text_for_indexing_quick - return get_body_for_quoting + # Used for excerpts in search results, when loading full text would be too slow + def get_text_for_indexing_clipped + return get_body_for_quoting + "\n\n" + get_attachment_text_clipped end # Returns the name of the person the incoming message is from, or nil if diff --git a/app/models/info_request.rb b/app/models/info_request.rb index e7033addc..16a010f1c 100644 --- a/app/models/info_request.rb +++ b/app/models/info_request.rb @@ -24,7 +24,7 @@ # Copyright (c) 2007 UK Citizens Online Democracy. All rights reserved. # Email: francis@mysociety.org; WWW: http://www.mysociety.org/ # -# $Id: info_request.rb,v 1.205 2009-09-15 17:45:51 francis Exp $ +# $Id: info_request.rb,v 1.206 2009-09-17 10:24:35 francis Exp $ require 'digest/sha1' require File.join(File.dirname(__FILE__),'../../vendor/plugins/acts_as_xapian/lib/acts_as_xapian') @@ -135,6 +135,12 @@ class InfoRequest < ActiveRecord::Base end end + # Removes anything cached about the object in the database, and saves + def clear_in_database_caches! + for incoming_message in self.incoming_messages + incoming_message.clear_in_database_caches! + end + end # For debugging def InfoRequest.profile_search(query) diff --git a/app/models/info_request_event.rb b/app/models/info_request_event.rb index 0f20dbc42..922507a5f 100644 --- a/app/models/info_request_event.rb +++ b/app/models/info_request_event.rb @@ -21,7 +21,7 @@ # Copyright (c) 2007 UK Citizens Online Democracy. All rights reserved. # Email: francis@mysociety.org; WWW: http://www.mysociety.org/ # -# $Id: info_request_event.rb,v 1.90 2009-09-15 21:30:39 francis Exp $ +# $Id: info_request_event.rb,v 1.91 2009-09-17 10:24:35 francis Exp $ class InfoRequestEvent < ActiveRecord::Base belongs_to :info_request @@ -126,17 +126,19 @@ class InfoRequestEvent < ActiveRecord::Base # format it here as no datetime support in Xapian's value ranges return self.created_at.strftime("%Y%m%d%H%M%S") end - def search_text_main(quick = false) + # clipped = true - means return shorter text. It is used for snippets for + # performance reasons. Xapian will take the full text. + def search_text_main(clipped = false) text = '' if self.event_type == 'sent' text = text + self.outgoing_message.get_text_for_indexing + "\n\n" elsif self.event_type == 'followup_sent' text = text + self.outgoing_message.get_text_for_indexing + "\n\n" elsif self.event_type == 'response' - if quick - text = text + self.incoming_message.get_text_for_indexing_quick + "\n\n" + if clipped + text = text + self.incoming_message.get_text_for_indexing_clipped + "\n\n" else - text = text + self.incoming_message.get_text_for_indexing + "\n\n" + text = text + self.incoming_message.get_text_for_indexing_full + "\n\n" end elsif self.event_type == 'comment' text = text + self.comment.body + "\n\n" diff --git a/app/views/request/_request_listing_via_event.rhtml b/app/views/request/_request_listing_via_event.rhtml index 839d52cfd..831ab5836 100644 --- a/app/views/request/_request_listing_via_event.rhtml +++ b/app/views/request/_request_listing_via_event.rhtml @@ -15,11 +15,7 @@ end %> <% end %> </span> <span class="desc"> - <% if @highlight_words == [] %> - <%= highlight_and_excerpt(event.search_text_main(quick = true), @highlight_words, 150) %> - <% else %> - <%= highlight_and_excerpt(event.search_text_main, @highlight_words, 150) %> - <% end %> + <%= highlight_and_excerpt(event.search_text_main(true), @highlight_words, 150) %> </span> <span class="bottomline icon_<%= info_request.calculate_status %>"> diff --git a/app/views/track_mailer/event_digest.rhtml b/app/views/track_mailer/event_digest.rhtml index cd28b8840..8084f778d 100644 --- a/app/views/track_mailer/event_digest.rhtml +++ b/app/views/track_mailer/event_digest.rhtml @@ -38,7 +38,7 @@ if event.is_outgoing_message? extract = highlight_and_excerpt(event.outgoing_message.get_text_for_indexing, @highlight_words, 150, false) elsif event.is_incoming_message? - extract = highlight_and_excerpt(event.incoming_message.get_text_for_indexing, @highlight_words, 150, false) + extract = highlight_and_excerpt(event.incoming_message.get_text_for_indexing_clipped, @highlight_words, 150, false) elsif event.is_comment? extract = highlight_and_excerpt(event.comment.body, @highlight_words, 150, false) else diff --git a/db/migrate/080_cache_only_clipped_attachment_text.rb b/db/migrate/080_cache_only_clipped_attachment_text.rb new file mode 100644 index 000000000..0cd835e49 --- /dev/null +++ b/db/migrate/080_cache_only_clipped_attachment_text.rb @@ -0,0 +1,10 @@ +class CacheOnlyClippedAttachmentText < ActiveRecord::Migration + def self.up + remove_column :incoming_messages, :cached_attachment_text + add_column :incoming_messages, :cached_attachment_text_clipped, :text + end + + def self.down + raise "safer not to have reverse migration scripts, and we never use them" + end +end diff --git a/db/schema.rb b/db/schema.rb index 78f0ed992..59d779cd8 100644 --- a/db/schema.rb +++ b/db/schema.rb @@ -9,7 +9,7 @@ # # It's strongly recommended to check this file into your version control system. -ActiveRecord::Schema.define(:version => 79) do +ActiveRecord::Schema.define(:version => 80) do create_table "acts_as_xapian_jobs", :force => true do |t| t.string "model", :null => false @@ -69,12 +69,12 @@ ActiveRecord::Schema.define(:version => 79) do add_index "holidays", ["day"], :name => "index_holidays_on_day" create_table "incoming_messages", :force => true do |t| - t.integer "info_request_id", :null => false - t.datetime "created_at", :null => false - t.datetime "updated_at", :null => false - t.text "cached_attachment_text" + t.integer "info_request_id", :null => false + t.datetime "created_at", :null => false + t.datetime "updated_at", :null => false t.text "cached_main_body_text" - t.integer "raw_email_id", :null => false + t.integer "raw_email_id", :null => false + t.text "cached_attachment_text_clipped" end create_table "info_request_events", :force => true do |t| diff --git a/script/clear-incoming-text-cache b/script/clear-incoming-text-cache index d03991341..01ea95edc 100755 --- a/script/clear-incoming-text-cache +++ b/script/clear-incoming-text-cache @@ -4,9 +4,10 @@ LOC=`dirname $0` -$LOC/runner "ActiveRecord::Base.connection.execute(\"update incoming_messages set cached_attachment_text = null, cached_main_body_text = null\")" +$LOC/runner "ActiveRecord::Base.connection.execute(\"update incoming_messages set cached_attachment_text_clipped = null, cached_main_body_text = null\")" -# Remove page caches -rm -fr $LOC/../public/request -rm -fr $LOC/../cache +# Remove page cache (do it in two stages so live site gets cache cleared faster) +rm -fr $LOC/../old-cache +mv $LOC/../cache $LOC/../old-cache +rm -fr $LOC/../old-cache @@ -1,3 +1,4 @@ + test if get_attachments_for_display called multiple times in one request? (MyModel.column_names - ['column_to_exclude']).join(', ') @@ -87,6 +88,9 @@ and resend messages to them Later ===== +cached_main_body_text could store the privacy censored versions now, since +cached_attachment_text_clipped does (and clears it when censor rules are edited) + Things to make bots not crawl somehow: /request/13683/response?internal_review=1 /request/febrile_neutropenia_154?unfold=1 @@ -275,7 +279,8 @@ Edits to outgoing/incoming/title won't be reindexed in Xapian (maybe just reinde might matter with software updates, or code changes) This does it all: $ ./script/clear-incoming-text-cache ; ./script/rebuild-xapian-index -Well, except nowday has to clear the disk file cache in mysociety/foi/cache also +(clear-incoming-text-cache needs renaming to make it clearer it does clear the disk cache too, +and that code testing) Renaming public authorities will break alerts on them. For basic alerts the structured info is there so this should just be fixed. For searches, perhaps @@ -307,7 +312,8 @@ To decode Outlook .msg (.oft?) files, e.g. Search for other file extensions that we have now and look for ones we could and should be indexing - (call IncomingMessage.find_all_unknown_mime_types to find them) + (call IncomingMessage.find_all_unknown_mime_types to find them - needs + updating to do it in clumps as all requests won't load in RAM now ) Make tables prettier in view as HTMl, just normal thick borders. http://www.whatdotheyknow.com/request/1610/response/8093/attach/html/3/2008.10.29%20Reply.doc.html |