aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorfrancis <francis>2009-09-17 10:24:34 +0000
committerfrancis <francis>2009-09-17 10:24:34 +0000
commit6c251bd414c4a6ea10cc74f25db1d9c5f2fbb15c (patch)
tree7e21693480ce2ba4192249b3dc81a8bac7c24c9c
parent7220c3a29466b507e49828af34bde5dd5edc10c4 (diff)
Store only clipped attachment text in database.
-rw-r--r--app/controllers/admin_controller.rb6
-rw-r--r--app/controllers/application.rb10
-rw-r--r--app/models/incoming_message.rb77
-rw-r--r--app/models/info_request.rb8
-rw-r--r--app/models/info_request_event.rb12
-rw-r--r--app/views/request/_request_listing_via_event.rhtml6
-rw-r--r--app/views/track_mailer/event_digest.rhtml2
-rw-r--r--db/migrate/080_cache_only_clipped_attachment_text.rb10
-rw-r--r--db/schema.rb12
-rwxr-xr-xscript/clear-incoming-text-cache9
-rw-r--r--todo.txt10
11 files changed, 103 insertions, 59 deletions
diff --git a/app/controllers/admin_controller.rb b/app/controllers/admin_controller.rb
index 32882f8a4..7f8cfbd67 100644
--- a/app/controllers/admin_controller.rb
+++ b/app/controllers/admin_controller.rb
@@ -4,7 +4,7 @@
# Copyright (c) 2009 UK Citizens Online Democracy. All rights reserved.
# Email: francis@mysociety.org; WWW: http://www.mysociety.org/
#
-# $Id: admin_controller.rb,v 1.28 2009-09-15 20:46:35 francis Exp $
+# $Id: admin_controller.rb,v 1.29 2009-09-17 10:24:35 francis Exp $
require 'fileutils'
@@ -34,6 +34,10 @@ class AdminController < ApplicationController
cache_subpath = File.join(self.cache_store.cache_path, "views/request/#{info_request.id}")
FileUtils.rm_rf(cache_subpath)
+ # Remove the database caches of body / attachment text (the attachment text
+ # one is after privacy rules are applied)
+ info_request.clear_in_database_caches!
+
# also force a search reindexing (so changed text reflected in search)
info_request.reindex_request_events
end
diff --git a/app/controllers/application.rb b/app/controllers/application.rb
index 8c4410b65..0d3664d4d 100644
--- a/app/controllers/application.rb
+++ b/app/controllers/application.rb
@@ -6,15 +6,21 @@
# Copyright (c) 2007 UK Citizens Online Democracy. All rights reserved.
# Email: francis@mysociety.org; WWW: http://www.mysociety.org/
#
-# $Id: application.rb,v 1.57 2009-09-15 11:47:20 francis Exp $
+# $Id: application.rb,v 1.58 2009-09-17 10:24:35 francis Exp $
class ApplicationController < ActionController::Base
# Standard headers, footers and navigation for whole site
layout "default"
- # Help work out which request causes RAM spike
+ # Help work out which request causes RAM spike.
# http://www.codeweblog.com/rails-to-monitor-the-process-of-memory-leaks-skills/
+ # This shows the memory use increase of the Ruby process due to the request.
+ # Since Ruby never returns memory to the OS, if the existing process previously
+ # served a larger request, this won't show any consumption for the later request.
+ # Ruby also grabs memory from the OS in variously sized jumps, so the extra
+ # consumption of a request shown by this function will only appear in such
+ # jumps.
around_filter :record_memory
def record_memory
File.read("/proc/#{Process.pid}/status").match(/VmRSS:\s+(\d+)/)
diff --git a/app/models/incoming_message.rb b/app/models/incoming_message.rb
index 17a8b6ba1..ca2d07263 100644
--- a/app/models/incoming_message.rb
+++ b/app/models/incoming_message.rb
@@ -19,7 +19,7 @@
# Copyright (c) 2007 UK Citizens Online Democracy. All rights reserved.
# Email: francis@mysociety.org; WWW: http://www.mysociety.org/
#
-# $Id: incoming_message.rb,v 1.223 2009-09-15 21:30:39 francis Exp $
+# $Id: incoming_message.rb,v 1.224 2009-09-17 10:24:35 francis Exp $
# TODO
# Move some of the (e.g. quoting) functions here into rblib, as they feel
@@ -770,6 +770,13 @@ class IncomingMessage < ActiveRecord::Base
return leaves_found
end
+ # Removes anything cached about the object in the database, and saves
+ def clear_in_database_caches!
+ self.cached_attachment_text_clipped = nil
+ self.cached_main_body_text = nil
+ self.save!
+ end
+
# Returns body text from main text part of email, converted to UTF-8, with uudecode removed
# XXX returns a .dup of the text, so calling functions can in place modify it
def get_main_body_text
@@ -791,10 +798,10 @@ class IncomingMessage < ActiveRecord::Base
# Returns body text from main text part of email, converted to UTF-8
def get_main_body_text_internal
main_part = get_main_body_text_part
- return convert_part_body_to_text(main_part)
+ return _convert_part_body_to_text(main_part)
end
# Given a main text part, converts it to text
- def convert_part_body_to_text(part)
+ def _convert_part_body_to_text(part)
if part.nil?
text = "[ Email has no body, please see attachments ]"
text_charset = "utf-8"
@@ -805,7 +812,7 @@ class IncomingMessage < ActiveRecord::Base
# e.g. http://www.whatdotheyknow.com/request/35/response/177
# XXX This is a bit of a hack as it is calling a convert to text routine.
# Could instead call a sanitize HTML one.
- text = IncomingMessage.get_attachment_text_internal_one_file(part.content_type, text)
+ text = IncomingMessage._get_attachment_text_internal_one_file(part.content_type, text)
end
end
@@ -960,7 +967,7 @@ class IncomingMessage < ActiveRecord::Base
headers = headers + header + ": " + leaf.within_rfc822_attachment.header[header.downcase].to_s + "\n"
end
end
- # XXX call convert_part_body_to_text here, but need to get charset somehow
+ # XXX call _convert_part_body_to_text here, but need to get charset somehow
# e.g. http://www.whatdotheyknow.com/request/1593/response/3088/attach/4/Freedom%20of%20Information%20request%20-%20car%20oval%20sticker:%20Article%2020,%20Convention%20on%20Road%20Traffic%201949.txt
attachment.body = headers + "\n" + attachment.body
@@ -1041,30 +1048,36 @@ class IncomingMessage < ActiveRecord::Base
text = IncomingMessage.remove_quoted_sections(text, "")
end
- # Returns text version of attachment text
- def get_attachment_text
- #STDOUT.puts 'start ' + MySociety::DebugHelpers::allocated_string_size_around_gc
- if self.cached_attachment_text.nil?
- attachment_text = self.get_attachment_text_internal
- #STDOUT.puts 'after get_attachment_text_internal ' + MySociety::DebugHelpers::allocated_string_size_around_gc
- self.cached_attachment_text = attachment_text
- #STDOUT.puts 'after assign to cached_attachment_text ' + MySociety::DebugHelpers::allocated_string_size_around_gc
- self.save!
- #STDOUT.puts 'after save!' + MySociety::DebugHelpers::allocated_string_size_around_gc
- end
- #STDOUT.puts 'after cache ' + MySociety::DebugHelpers::allocated_string_size_around_gc
+ MAX_ATTACHMENT_TEXT_CLIPPED = 1000000 # 1Mb ish
- # Remove any privacy things
- #STDOUT.puts 'before dup ' + MySociety::DebugHelpers::allocated_string_size_around_gc
- text = self.cached_attachment_text.dup
- #STDOUT.puts 'before mask_special_emails ' + MySociety::DebugHelpers::allocated_string_size_around_gc
+ # Returns text version of attachment text
+ def get_attachment_text_full
+ text = self._get_attachment_text_internal
self.mask_special_emails!(text)
- #STDOUT.puts 'after mask_special_emails ' + MySociety::DebugHelpers::allocated_string_size_around_gc
self.remove_privacy_sensitive_things!(text)
- #STDOUT.puts 'after remove_privacy_sensitive_things ' + MySociety::DebugHelpers::allocated_string_size_around_gc
+ # This can be useful for memory debugging
+ #STDOUT.puts 'xxx '+ MySociety::DebugHelpers::allocated_string_size_around_gc
+
+ # Save clipped version for snippets
+ if self.cached_attachment_text_clipped.nil?
+ self.cached_attachment_text_clipped = text[0..MAX_ATTACHMENT_TEXT_CLIPPED]
+ self.save!
+ end
+
return text
end
- def IncomingMessage.get_attachment_text_internal_one_file(content_type, body)
+ # Returns a version reduced to a sensible maximum size - this
+ # is for performance reasons when showing snippets in search results.
+ def get_attachment_text_clipped
+ if self.cached_attachment_text_clipped.nil?
+ # As side effect, get_attachment_text_full makes snippet text
+ attachment_text = self.get_attachment_text_full
+ raise "internal error" if self.cached_attachment_text_clipped.nil?
+ end
+
+ return self.cached_attachment_text_clipped
+ end
+ def IncomingMessage._get_attachment_text_internal_one_file(content_type, body)
text = ''
# XXX - tell all these command line tools to return utf-8
if content_type == 'text/plain'
@@ -1141,7 +1154,7 @@ class IncomingMessage < ActiveRecord::Base
end
#STDERR.puts("doing file " + filename + " content type " + content_type)
- text += IncomingMessage.get_attachment_text_internal_one_file(content_type, body)
+ text += IncomingMessage._get_attachment_text_internal_one_file(content_type, body)
end
end
end
@@ -1150,12 +1163,12 @@ class IncomingMessage < ActiveRecord::Base
return text
end
- def get_attachment_text_internal
+ def _get_attachment_text_internal
# Extract text from each attachment
text = ''
attachments = self.get_attachments_for_display
for attachment in attachments
- text += IncomingMessage.get_attachment_text_internal_one_file(attachment.content_type, attachment.body)
+ text += IncomingMessage._get_attachment_text_internal_one_file(attachment.content_type, attachment.body)
end
# Remove any bad characters
text = Iconv.conv('utf-8//IGNORE', 'utf-8', text)
@@ -1163,12 +1176,12 @@ class IncomingMessage < ActiveRecord::Base
end
# Returns text for indexing
- def get_text_for_indexing
- return get_body_for_quoting + "\n\n" + get_attachment_text
+ def get_text_for_indexing_full
+ return get_body_for_quoting + "\n\n" + get_attachment_text_full
end
- # Used when there are no highlight words, so full attachment text not required
- def get_text_for_indexing_quick
- return get_body_for_quoting
+ # Used for excerpts in search results, when loading full text would be too slow
+ def get_text_for_indexing_clipped
+ return get_body_for_quoting + "\n\n" + get_attachment_text_clipped
end
# Returns the name of the person the incoming message is from, or nil if
diff --git a/app/models/info_request.rb b/app/models/info_request.rb
index e7033addc..16a010f1c 100644
--- a/app/models/info_request.rb
+++ b/app/models/info_request.rb
@@ -24,7 +24,7 @@
# Copyright (c) 2007 UK Citizens Online Democracy. All rights reserved.
# Email: francis@mysociety.org; WWW: http://www.mysociety.org/
#
-# $Id: info_request.rb,v 1.205 2009-09-15 17:45:51 francis Exp $
+# $Id: info_request.rb,v 1.206 2009-09-17 10:24:35 francis Exp $
require 'digest/sha1'
require File.join(File.dirname(__FILE__),'../../vendor/plugins/acts_as_xapian/lib/acts_as_xapian')
@@ -135,6 +135,12 @@ class InfoRequest < ActiveRecord::Base
end
end
+ # Removes anything cached about the object in the database, and saves
+ def clear_in_database_caches!
+ for incoming_message in self.incoming_messages
+ incoming_message.clear_in_database_caches!
+ end
+ end
# For debugging
def InfoRequest.profile_search(query)
diff --git a/app/models/info_request_event.rb b/app/models/info_request_event.rb
index 0f20dbc42..922507a5f 100644
--- a/app/models/info_request_event.rb
+++ b/app/models/info_request_event.rb
@@ -21,7 +21,7 @@
# Copyright (c) 2007 UK Citizens Online Democracy. All rights reserved.
# Email: francis@mysociety.org; WWW: http://www.mysociety.org/
#
-# $Id: info_request_event.rb,v 1.90 2009-09-15 21:30:39 francis Exp $
+# $Id: info_request_event.rb,v 1.91 2009-09-17 10:24:35 francis Exp $
class InfoRequestEvent < ActiveRecord::Base
belongs_to :info_request
@@ -126,17 +126,19 @@ class InfoRequestEvent < ActiveRecord::Base
# format it here as no datetime support in Xapian's value ranges
return self.created_at.strftime("%Y%m%d%H%M%S")
end
- def search_text_main(quick = false)
+ # clipped = true - means return shorter text. It is used for snippets for
+ # performance reasons. Xapian will take the full text.
+ def search_text_main(clipped = false)
text = ''
if self.event_type == 'sent'
text = text + self.outgoing_message.get_text_for_indexing + "\n\n"
elsif self.event_type == 'followup_sent'
text = text + self.outgoing_message.get_text_for_indexing + "\n\n"
elsif self.event_type == 'response'
- if quick
- text = text + self.incoming_message.get_text_for_indexing_quick + "\n\n"
+ if clipped
+ text = text + self.incoming_message.get_text_for_indexing_clipped + "\n\n"
else
- text = text + self.incoming_message.get_text_for_indexing + "\n\n"
+ text = text + self.incoming_message.get_text_for_indexing_full + "\n\n"
end
elsif self.event_type == 'comment'
text = text + self.comment.body + "\n\n"
diff --git a/app/views/request/_request_listing_via_event.rhtml b/app/views/request/_request_listing_via_event.rhtml
index 839d52cfd..831ab5836 100644
--- a/app/views/request/_request_listing_via_event.rhtml
+++ b/app/views/request/_request_listing_via_event.rhtml
@@ -15,11 +15,7 @@ end %>
<% end %>
</span>
<span class="desc">
- <% if @highlight_words == [] %>
- <%= highlight_and_excerpt(event.search_text_main(quick = true), @highlight_words, 150) %>
- <% else %>
- <%= highlight_and_excerpt(event.search_text_main, @highlight_words, 150) %>
- <% end %>
+ <%= highlight_and_excerpt(event.search_text_main(true), @highlight_words, 150) %>
</span>
<span class="bottomline icon_<%= info_request.calculate_status %>">
diff --git a/app/views/track_mailer/event_digest.rhtml b/app/views/track_mailer/event_digest.rhtml
index cd28b8840..8084f778d 100644
--- a/app/views/track_mailer/event_digest.rhtml
+++ b/app/views/track_mailer/event_digest.rhtml
@@ -38,7 +38,7 @@
if event.is_outgoing_message?
extract = highlight_and_excerpt(event.outgoing_message.get_text_for_indexing, @highlight_words, 150, false)
elsif event.is_incoming_message?
- extract = highlight_and_excerpt(event.incoming_message.get_text_for_indexing, @highlight_words, 150, false)
+ extract = highlight_and_excerpt(event.incoming_message.get_text_for_indexing_clipped, @highlight_words, 150, false)
elsif event.is_comment?
extract = highlight_and_excerpt(event.comment.body, @highlight_words, 150, false)
else
diff --git a/db/migrate/080_cache_only_clipped_attachment_text.rb b/db/migrate/080_cache_only_clipped_attachment_text.rb
new file mode 100644
index 000000000..0cd835e49
--- /dev/null
+++ b/db/migrate/080_cache_only_clipped_attachment_text.rb
@@ -0,0 +1,10 @@
+class CacheOnlyClippedAttachmentText < ActiveRecord::Migration
+ def self.up
+ remove_column :incoming_messages, :cached_attachment_text
+ add_column :incoming_messages, :cached_attachment_text_clipped, :text
+ end
+
+ def self.down
+ raise "safer not to have reverse migration scripts, and we never use them"
+ end
+end
diff --git a/db/schema.rb b/db/schema.rb
index 78f0ed992..59d779cd8 100644
--- a/db/schema.rb
+++ b/db/schema.rb
@@ -9,7 +9,7 @@
#
# It's strongly recommended to check this file into your version control system.
-ActiveRecord::Schema.define(:version => 79) do
+ActiveRecord::Schema.define(:version => 80) do
create_table "acts_as_xapian_jobs", :force => true do |t|
t.string "model", :null => false
@@ -69,12 +69,12 @@ ActiveRecord::Schema.define(:version => 79) do
add_index "holidays", ["day"], :name => "index_holidays_on_day"
create_table "incoming_messages", :force => true do |t|
- t.integer "info_request_id", :null => false
- t.datetime "created_at", :null => false
- t.datetime "updated_at", :null => false
- t.text "cached_attachment_text"
+ t.integer "info_request_id", :null => false
+ t.datetime "created_at", :null => false
+ t.datetime "updated_at", :null => false
t.text "cached_main_body_text"
- t.integer "raw_email_id", :null => false
+ t.integer "raw_email_id", :null => false
+ t.text "cached_attachment_text_clipped"
end
create_table "info_request_events", :force => true do |t|
diff --git a/script/clear-incoming-text-cache b/script/clear-incoming-text-cache
index d03991341..01ea95edc 100755
--- a/script/clear-incoming-text-cache
+++ b/script/clear-incoming-text-cache
@@ -4,9 +4,10 @@
LOC=`dirname $0`
-$LOC/runner "ActiveRecord::Base.connection.execute(\"update incoming_messages set cached_attachment_text = null, cached_main_body_text = null\")"
+$LOC/runner "ActiveRecord::Base.connection.execute(\"update incoming_messages set cached_attachment_text_clipped = null, cached_main_body_text = null\")"
-# Remove page caches
-rm -fr $LOC/../public/request
-rm -fr $LOC/../cache
+# Remove page cache (do it in two stages so live site gets cache cleared faster)
+rm -fr $LOC/../old-cache
+mv $LOC/../cache $LOC/../old-cache
+rm -fr $LOC/../old-cache
diff --git a/todo.txt b/todo.txt
index 30d3da6c7..2a2a5bff4 100644
--- a/todo.txt
+++ b/todo.txt
@@ -1,3 +1,4 @@
+
test if get_attachments_for_display called multiple times in one request?
(MyModel.column_names - ['column_to_exclude']).join(', ')
@@ -87,6 +88,9 @@ and resend messages to them
Later
=====
+cached_main_body_text could store the privacy censored versions now, since
+cached_attachment_text_clipped does (and clears it when censor rules are edited)
+
Things to make bots not crawl somehow:
/request/13683/response?internal_review=1
/request/febrile_neutropenia_154?unfold=1
@@ -275,7 +279,8 @@ Edits to outgoing/incoming/title won't be reindexed in Xapian (maybe just reinde
might matter with software updates, or code changes)
This does it all:
$ ./script/clear-incoming-text-cache ; ./script/rebuild-xapian-index
-Well, except nowday has to clear the disk file cache in mysociety/foi/cache also
+(clear-incoming-text-cache needs renaming to make it clearer it does clear the disk cache too,
+and that code testing)
Renaming public authorities will break alerts on them. For basic alerts the
structured info is there so this should just be fixed. For searches, perhaps
@@ -307,7 +312,8 @@ To decode Outlook .msg (.oft?) files, e.g.
Search for other file extensions that we have now and look for ones we could
and should be indexing
- (call IncomingMessage.find_all_unknown_mime_types to find them)
+ (call IncomingMessage.find_all_unknown_mime_types to find them - needs
+ updating to do it in clumps as all requests won't load in RAM now )
Make tables prettier in view as HTMl, just normal thick borders.
http://www.whatdotheyknow.com/request/1610/response/8093/attach/html/3/2008.10.29%20Reply.doc.html