diff options
32 files changed, 1010 insertions, 257 deletions
diff --git a/app/controllers/public_body_controller.rb b/app/controllers/public_body_controller.rb index ff9fbadb3..62229a441 100644 --- a/app/controllers/public_body_controller.rb +++ b/app/controllers/public_body_controller.rb @@ -185,11 +185,14 @@ class PublicBodyController < ApplicationController def search_typeahead # Since acts_as_xapian doesn't support the Partial match flag, we work around it # by making the last work a wildcard, which is quite the same - query = params[:q] + '*' - - query = query.split(' ').join(' OR ') # XXX: HACK for OR instead of default AND! - @xapian_requests = perform_search([PublicBody], query, 'relevant', nil, 5) - + query = params[:q] + query = query.split(' ') + if query.last.nil? || query.last.strip.length < 3 + @xapian_requests = nil + else + query = query.join(' OR ') # XXX: HACK for OR instead of default AND! + @xapian_requests = perform_search([PublicBody], query, 'relevant', nil, 5) + end render :partial => "public_body/search_ahead" end end diff --git a/app/controllers/request_controller.rb b/app/controllers/request_controller.rb index 1801648ca..dad5e81cd 100644 --- a/app/controllers/request_controller.rb +++ b/app/controllers/request_controller.rb @@ -676,6 +676,7 @@ class RequestController < ApplicationController # Internal function def get_attachment_internal(html_conversion) @incoming_message = IncomingMessage.find(params[:incoming_message_id]) + @incoming_message.parse_raw_email! @info_request = @incoming_message.info_request if @incoming_message.info_request_id != params[:id].to_i raise sprintf("Incoming message %d does not belong to request %d", @incoming_message.info_request_id, params[:id]) @@ -690,7 +691,6 @@ class RequestController < ApplicationController # check permissions raise "internal error, pre-auth filter should have caught this" if !@info_request.user_can_view?(authenticated_user) - @attachment = IncomingMessage.get_attachment_by_url_part_number(@incoming_message.get_attachments_for_display, @part_number) raise ActiveRecord::RecordNotFound.new("attachment not found part number " + @part_number.to_s + " incoming_message " + @incoming_message.id.to_s) if @attachment.nil? @@ -713,6 +713,7 @@ class RequestController < ApplicationController :email => _("Then you can upload an FOI response. "), :email_subject => _("Confirm your account on {{site_name}}",:site_name=>site_name) } + if !authenticated?(@reason_params) return end @@ -754,11 +755,14 @@ class RequestController < ApplicationController def search_typeahead # Since acts_as_xapian doesn't support the Partial match flag, we work around it # by making the last work a wildcard, which is quite the same - query = params[:q] + '*' - - query = query.split(' ').join(' OR ') # XXX: HACK for OR instead of default AND! - @xapian_requests = perform_search([InfoRequestEvent], query, 'relevant', 'request_collapse', 5) - + query = params[:q] + query = query.split(' ') + if query.last.nil? || query.last.strip.length < 3 + @xapian_requests = nil + else + query = query.join(' OR ') # XXX: HACK for OR instead of default AND! + @xapian_requests = perform_search([InfoRequestEvent], query, 'relevant', 'request_collapse', 5) + end render :partial => "request/search_ahead.rhtml" end diff --git a/app/helpers/link_to_helper.rb b/app/helpers/link_to_helper.rb index 54b8d69d0..5866c31f0 100755 --- a/app/helpers/link_to_helper.rb +++ b/app/helpers/link_to_helper.rb @@ -185,9 +185,16 @@ module LinkToHelper end - def main_url(relative_path) + def main_url(relative_path, append = nil) url_prefix = "http://" + MySociety::Config.get("DOMAIN", '127.0.0.1:3000') - return url_prefix + relative_path + url = url_prefix + relative_path + if !append.nil? + env = Rack::MockRequest.env_for(url) + req = Rack::Request.new(env) + req.path_info += append + url = req.url + end + return url end # Basic date format diff --git a/app/models/foi_attachment.rb b/app/models/foi_attachment.rb new file mode 100644 index 000000000..057dcdb69 --- /dev/null +++ b/app/models/foi_attachment.rb @@ -0,0 +1,321 @@ +# encoding: UTF-8 + +# models/foi_attachment.rb: +# An attachment to an email (IncomingMessage) +# +# Copyright (c) 2007 UK Citizens Online Democracy. All rights reserved. +# Email: francis@mysociety.org; WWW: http://www.mysociety.org/ +# This is the type which is used to send data about attachments to the view + +require 'digest' + +class FoiAttachment < ActiveRecord::Base + belongs_to :incoming_message + validates_presence_of :content_type + validates_presence_of :filename + validates_presence_of :display_size + + before_validation :ensure_filename!, :only => [:filename] + before_destroy :delete_cached_file! + + def directory + base_dir = File.join("cache", "attachments_#{ENV['RAILS_ENV']}") + return File.join(base_dir, self.hexdigest[0..2]) + end + + def filepath + File.join(self.directory, self.hexdigest) + end + + def delete_cached_file! + begin + File.delete(self.filepath) + rescue + end + end + + def body=(d) + self.hexdigest = Digest::MD5.hexdigest(d) + if !File.exists?(self.directory) + FileUtils.mkdir_p self.directory + end + File.open(self.filepath, "wb") { |file| + file.write d + } + update_display_size! + end + + def body + if @cached_body.nil? + @cached_body = File.open(self.filepath, "rb" ).read + end + return @cached_body + end + + # List of DSN codes taken from RFC 3463 + # http://tools.ietf.org/html/rfc3463 + DsnToMessage = { + 'X.1.0' => 'Other address status', + 'X.1.1' => 'Bad destination mailbox address', + 'X.1.2' => 'Bad destination system address', + 'X.1.3' => 'Bad destination mailbox address syntax', + 'X.1.4' => 'Destination mailbox address ambiguous', + 'X.1.5' => 'Destination mailbox address valid', + 'X.1.6' => 'Mailbox has moved', + 'X.1.7' => 'Bad sender\'s mailbox address syntax', + 'X.1.8' => 'Bad sender\'s system address', + 'X.2.0' => 'Other or undefined mailbox status', + 'X.2.1' => 'Mailbox disabled, not accepting messages', + 'X.2.2' => 'Mailbox full', + 'X.2.3' => 'Message length exceeds administrative limit.', + 'X.2.4' => 'Mailing list expansion problem', + 'X.3.0' => 'Other or undefined mail system status', + 'X.3.1' => 'Mail system full', + 'X.3.2' => 'System not accepting network messages', + 'X.3.3' => 'System not capable of selected features', + 'X.3.4' => 'Message too big for system', + 'X.4.0' => 'Other or undefined network or routing status', + 'X.4.1' => 'No answer from host', + 'X.4.2' => 'Bad connection', + 'X.4.3' => 'Routing server failure', + 'X.4.4' => 'Unable to route', + 'X.4.5' => 'Network congestion', + 'X.4.6' => 'Routing loop detected', + 'X.4.7' => 'Delivery time expired', + 'X.5.0' => 'Other or undefined protocol status', + 'X.5.1' => 'Invalid command', + 'X.5.2' => 'Syntax error', + 'X.5.3' => 'Too many recipients', + 'X.5.4' => 'Invalid command arguments', + 'X.5.5' => 'Wrong protocol version', + 'X.6.0' => 'Other or undefined media error', + 'X.6.1' => 'Media not supported', + 'X.6.2' => 'Conversion required and prohibited', + 'X.6.3' => 'Conversion required but not supported', + 'X.6.4' => 'Conversion with loss performed', + 'X.6.5' => 'Conversion failed', + 'X.7.0' => 'Other or undefined security status', + 'X.7.1' => 'Delivery not authorized, message refused', + 'X.7.2' => 'Mailing list expansion prohibited', + 'X.7.3' => 'Security conversion required but not possible', + 'X.7.4' => 'Security features not supported', + 'X.7.5' => 'Cryptographic failure', + 'X.7.6' => 'Cryptographic algorithm not supported', + 'X.7.7' => 'Message integrity failure' + } + + # Returns HTML, of extra comment to put by attachment + def extra_note + # For delivery status notification attachments, extract the status and + # look up what it means in the DSN table. + if @content_type == 'message/delivery-status' + if !@body.match(/Status:\s+([0-9]+\.([0-9]+\.[0-9]+))\s+/) + return "" + end + dsn = $1 + dsn_part = 'X.' + $2 + + dsn_message = "" + if DsnToMessage.include?(dsn_part) + dsn_message = " (" + DsnToMessage[dsn_part] + ")" + end + + return "<br><em>DSN: " + dsn + dsn_message + "</em>" + end + return "" + end + + # Called by controller so old filenames still work + def old_display_filename + filename = self.filename + + # Convert weird spaces (e.g. \n) to normal ones + filename = filename.gsub(/\s/, " ") + # Remove slashes, they mess with URLs + filename = filename.gsub(/\//, "-") + + return filename + end + + # XXX changing this will break existing URLs, so have a care - maybe + # make another old_display_filename see above + def display_filename + filename = self.filename + if !self.incoming_message.nil? + self.incoming_message.info_request.apply_censor_rules_to_text!(filename) + end + # Sometimes filenames have e.g. %20 in - no point butchering that + # (without unescaping it, this would remove the % and leave 20s in there) + filename = CGI.unescape(filename) + + # Remove weird spaces + filename = filename.gsub(/\s+/, " ") + # Remove non-alphabetic characters + filename = filename.gsub(/[^A-Za-z0-9.]/, " ") + # Remove spaces near dots + filename = filename.gsub(/\s*\.\s*/, ".") + # Compress adjacent spaces down to a single one + filename = filename.gsub(/\s+/, " ") + filename = filename.strip + + return filename + end + + + def ensure_filename! + if self.filename.nil? + calc_ext = AlaveteliFileTypes.mimetype_to_extension(self.content_type) + if !calc_ext + calc_ext = "bin" + end + if !self.within_rfc822_subject.nil? + computed = self.within_rfc822_subject + "." + calc_ext + else + computed = "attachment." + calc_ext + end + self.filename = computed + end + end + + def filename=(filename) + calc_ext = AlaveteliFileTypes.mimetype_to_extension(self.content_type) + # Put right extension on if missing + if !filename.nil? && !filename.match(/\.#{calc_ext}$/) && calc_ext + computed = filename + "." + calc_ext + else + computed = filename + end + write_attribute('filename', computed) + end + + # Size to show next to the download link for the attachment + def update_display_size! + s = self.body.size + + if s > 1024 * 1024 + self.display_size = sprintf("%.1f", s.to_f / 1024 / 1024) + 'M' + else + self.display_size = (s / 1024).to_s + 'K' + end + end + + # Whether this type can be shown in the Google Docs Viewer. + # The full list of supported types can be found at + # https://docs.google.com/support/bin/answer.py?hl=en&answer=1189935 + def has_google_docs_viewer? + return !! { + "application/pdf" => true, # .pdf + "image/tiff" => true, # .tiff + + "application/vnd.ms-word" => true, # .doc + "application/vnd.openxmlformats-officedocument.wordprocessingml.document" => true, # .docx + + "application/vnd.ms-powerpoint" => true, # .ppt + "application/vnd.openxmlformats-officedocument.presentationml.presentation" => true, # .pptx + + "application/vnd.ms-excel" => true, # .xls + "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" => true, # .xlsx + + } [self.content_type] + end + + # Whether this type has a "View as HTML" + def has_body_as_html? + return ( + !!{ + "text/plain" => true, + "application/rtf" => true, + }[self.content_type] or + self.has_google_docs_viewer? + ) + end + + # Name of type of attachment type - only valid for things that has_body_as_html? + def name_of_content_type + return { + "text/plain" => "Text file", + 'application/rtf' => "RTF file", + + 'application/pdf' => "PDF file", + 'image/tiff' => "TIFF image", + + 'application/vnd.ms-word' => "Word document", + 'application/vnd.openxmlformats-officedocument.wordprocessingml.document' => "Word document", + + 'application/vnd.ms-powerpoint' => "PowerPoint presentation", + 'application/vnd.openxmlformats-officedocument.presentationml.presentation' => "PowerPoint presentation", + + 'application/vnd.ms-excel' => "Excel spreadsheet", + 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet' => "Excel spreadsheet", + }[self.content_type] + end + + # For "View as HTML" of attachment + def body_as_html(dir) + html = nil + wrapper_id = "wrapper" + + # simple cases, can never fail + if self.content_type == 'text/plain' + text = self.body.strip + text = CGI.escapeHTML(text) + text = MySociety::Format.make_clickable(text) + html = text.gsub(/\n/, '<br>') + return '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" + "http://www.w3.org/TR/html4/loose.dtd"><html><head><title></title></head><body>' + html + "</body></html>", wrapper_id + end + + # the extractions will also produce image files, which go in the + # current directory, so change to the directory the function caller + # wants everything in + Dir.chdir(dir) do + tempfile = Tempfile.new('foiextract', '.') + tempfile.print self.body + tempfile.flush + + if self.content_type == 'application/pdf' + IO.popen("/usr/bin/pdftohtml -nodrm -zoom 1.0 -stdout -enc UTF-8 -noframes " + tempfile.path + "", "r") do |child| + html = child.read() + end + elsif self.content_type == 'application/rtf' + IO.popen("/usr/bin/unrtf --html " + tempfile.path + "", "r") do |child| + html = child.read() + end + elsif self.has_google_docs_viewer? + html = '' # force error and using Google docs viewer + else + raise "No HTML conversion available for type " + self.content_type + end + + tempfile.close + tempfile.delete + end + + # We need to look at: + # a) Any error code + # b) The output size, as pdftohtml does not return an error code upon error. + # c) For cases when there is no text in the body of the HTML, or + # images, so nothing will be rendered. This is to detect some bug in + # pdftohtml, which sometimes makes it return just <hr>s and no other + # content. + html.match(/(\<body[^>]*\>.*)/mi) + body = $1.to_s + body_without_tags = body.gsub(/\s+/,"").gsub(/\<[^\>]*\>/, "") + contains_images = html.match(/<img/mi) ? true : false + if !$?.success? || html.size == 0 || (body_without_tags.size == 0 && !contains_images) + ret = "<html><head></head><body>"; + if self.has_google_docs_viewer? + wrapper_id = "wrapper_google_embed" + ret = ret + "<iframe src='http://docs.google.com/viewer?url=<attachment-url-here>&embedded=true' width='100%' height='100%' style='border: none;'></iframe>"; + else + ret = ret + "<p>Sorry, we were unable to convert this file to HTML. Please use the download link at the top right.</p>" + end + ret = ret + "</body></html>" + return ret, wrapper_id + end + + return html, wrapper_id + end + +end + diff --git a/app/models/incoming_message.rb b/app/models/incoming_message.rb index 0608d46d7..a4519a17d 100644 --- a/app/models/incoming_message.rb +++ b/app/models/incoming_message.rb @@ -320,7 +320,7 @@ class IncomingMessage < ActiveRecord::Base validates_presence_of :raw_email has_many :outgoing_message_followups, :foreign_key => 'incoming_message_followup_id', :class_name => 'OutgoingMessage' - + has_many :foi_attachments has_many :info_request_events # never really has many, but could in theory belongs_to :raw_email @@ -338,8 +338,8 @@ class IncomingMessage < ActiveRecord::Base # Return the structured TMail::Mail object # Documentation at http://i.loveruby.net/en/projects/tmail/doc/ - def mail - if @mail.nil? && !self.raw_email.nil? + def mail(force = nil) + if (!force.nil? || @mail.nil?) && !self.raw_email.nil? # Hack round bug in TMail's MIME decoding. Example request which provokes it: # http://www.whatdotheyknow.com/request/reviews_of_unduly_lenient_senten#incoming-4830 # Report of TMail bug: @@ -352,23 +352,109 @@ class IncomingMessage < ActiveRecord::Base @mail end + # Returns the name of the person the incoming message is from, or nil if + # there isn't one or if there is only an email address. XXX can probably + # remove from_name_if_present (which is a monkey patch) by just calling + # .from_addrs[0].name here instead? + + # Return false if for some reason this is a message that we shouldn't let them reply to + def _calculate_valid_to_reply_to + # check validity of email + if self.mail.from_addrs.nil? || self.mail.from_addrs.size == 0 + return false + end + email = self.mail.from_addrs[0].spec + if !MySociety::Validate.is_valid_email(email) + return false + end + + # reject postmaster - authorities seem to nearly always not respond to + # email to postmaster, and it tends to only happen after delivery failure. + # likewise Mailer-Daemon, Auto_Reply... + prefix = email + prefix =~ /^(.*)@/ + prefix = $1 + if !prefix.nil? && prefix.downcase.match(/^(postmaster|mailer-daemon|auto_reply|donotreply|no.reply)$/) + return false + end + if !self.mail['return-path'].nil? && self.mail['return-path'].addr == "<>" + return false + end + if !self.mail['auto-submitted'].nil? && !self.mail['auto-submitted'].keys.empty? + return false + end + return true + end + + def parse_raw_email!(force = nil) + # The following fields may be absent; we treat them as cached + # values in case we want to regenerate them (due to mail + # parsing bugs, etc). + if (!force.nil? || self.last_parsed.nil?) + self.extract_attachments! + self.sent_at = self.mail.date || self.created_at + self.subject = self.mail.subject + # XXX can probably remove from_name_if_present (which is a + # monkey patch) by just calling .from_addrs[0].name here + # instead? + self.mail_from = self.mail.from_name_if_present + begin + self.mail_from_domain = PublicBody.extract_domain_from_email(self.mail.from_addrs[0].spec) + rescue NoMethodError + self.mail_from_domain = "" + end + self.valid_to_reply_to = self._calculate_valid_to_reply_to + self.last_parsed = Time.now + self.save! + end + end + + def valid_to_reply_to? + return self.valid_to_reply_to + end + + # The cached fields mentioned in the previous comment + # XXX there must be a nicer way to do this without all that + # repetition. I tried overriding method_missing but got some + # unpredictable results. + def valid_to_reply_to + parse_raw_email! + super + end + def sent_at + parse_raw_email! + super + end + def subject + parse_raw_email! + super + end + def mail_from + parse_raw_email! + super + end + def safe_mail_from + if !self.mail_from.nil? + mail_from = self.mail_from.dup + self.info_request.apply_censor_rules_to_text!(mail_from) + return mail_from + end + end + def mail_from_domain + parse_raw_email! + super + end + # Number the attachments in depth first tree order, for use in URLs. # XXX This fills in part.rfc822_attachment and part.url_part_number within # all the parts of the email (see TMail monkeypatch above for how these # attributes are added). ensure_parts_counted must be called before using - # the attributes. This calculation is done only when required to avoid - # having to load and parse the email unnecessarily. - def after_initialize - @parts_counted = false - end + # the attributes. def ensure_parts_counted - if not @parts_counted - @count_parts_count = 0 - _count_parts_recursive(self.mail) - # we carry on using these numeric ids for attachments uudecoded from within text parts - @count_first_uudecode_count = @count_parts_count - @parts_counted = true - end + @count_parts_count = 0 + _count_parts_recursive(self.mail) + # we carry on using these numeric ids for attachments uudecoded from within text parts + @count_first_uudecode_count = @count_parts_count end def _count_parts_recursive(part) if part.multipart? @@ -406,7 +492,7 @@ class IncomingMessage < ActiveRecord::Base end end # And look up by URL part number to get an attachment - # XXX relies on get_attachments_for_display calling ensure_parts_counted + # XXX relies on extract_attachments calling ensure_parts_counted def self.get_attachment_by_url_part_number(attachments, found_url_part_number) attachments.each do |a| if a.url_part_number == found_url_part_number @@ -416,12 +502,6 @@ class IncomingMessage < ActiveRecord::Base return nil end - # Return date mail was sent - def sent_at - # Use date it arrived (created_at) if mail itself doesn't have Date: header - self.mail.date || self.created_at - end - # Converts email addresses we know about into textual descriptions of them def mask_special_emails!(text) # XXX can later display some of these special emails as actual emails, @@ -518,6 +598,7 @@ class IncomingMessage < ActiveRecord::Base self.info_request.apply_censor_rules_to_binary!(text) raise "internal error in binary_mask_stuff" if text.size != orig_size + return text end # Removes censored stuff from from HTML conversion of downloaded binaries @@ -658,20 +739,20 @@ class IncomingMessage < ActiveRecord::Base end # Internal function - def _get_censored_part_file_name(mail) + def _get_part_file_name(mail) part_file_name = TMail::Mail.get_part_file_name(mail) if part_file_name.nil? return nil end part_file_name = part_file_name.dup - self.info_request.apply_censor_rules_to_text!(part_file_name) return part_file_name end # (This risks losing info if the unchosen alternative is the only one to contain # useful info, but let's worry about that another time) def get_attachment_leaves - return _get_attachment_leaves_recursive(self.mail) + force = true + return _get_attachment_leaves_recursive(self.mail(force)) end def _get_attachment_leaves_recursive(curr_mail, within_rfc822_attachment = nil) leaves_found = [] @@ -718,7 +799,7 @@ class IncomingMessage < ActiveRecord::Base end # PDFs often come with this mime type, fix it up for view code if curr_mail.content_type == 'application/octet-stream' - part_file_name = self._get_censored_part_file_name(curr_mail) + part_file_name = self._get_part_file_name(curr_mail) calc_mime = AlaveteliFileTypes.filename_and_content_to_mimetype(part_file_name, curr_mail.body) if calc_mime curr_mail.content_type = calc_mime @@ -768,7 +849,6 @@ class IncomingMessage < ActiveRecord::Base # search results def _cache_main_body_text text = self.get_main_body_text_internal - # Strip the uudecode parts from main text # - this also effectively does a .dup as well, so text mods don't alter original text = text.split(/^begin.+^`\n^end\n/sm).join(" ") @@ -879,8 +959,8 @@ class IncomingMessage < ActiveRecord::Base end # Returns part which contains main body text, or nil if there isn't one def get_main_body_text_part - leaves = get_attachment_leaves - + leaves = self.foi_attachments + # Find first part which is text/plain or text/html # (We have to include HTML, as increasingly there are mail clients that # include no text alternative for the main part, and we don't want to @@ -894,7 +974,7 @@ class IncomingMessage < ActiveRecord::Base # Otherwise first part which is any sort of text leaves.each do |p| - if p.main_type == 'text' + if p.content_type.match(/^text/) return p end end @@ -902,7 +982,7 @@ class IncomingMessage < ActiveRecord::Base # ... or if none, consider first part p = leaves[0] # if it is a known type then don't use it, return no body (nil) - if AlaveteliFileTypes.mimetype_to_extension(p.content_type) + if !p.nil? && AlaveteliFileTypes.mimetype_to_extension(p.content_type) # this is guess of case where there are only attachments, no body text # e.g. http://www.whatdotheyknow.com/request/cost_benefit_analysis_for_real_n return nil @@ -914,16 +994,7 @@ class IncomingMessage < ActiveRecord::Base return p end # Returns attachments that are uuencoded in main body part - def get_main_body_text_uudecode_attachments - # we don't use get_main_body_text_internal, as we want to avoid charset - # conversions, since /usr/bin/uudecode needs to deal with those. - # e.g. for https://secure.mysociety.org/admin/foi/request/show_raw_email/24550 - main_part = get_main_body_text_part - if main_part.nil? - return [] - end - text = main_part.body - + def _uudecode_and_save_attachments(text) # Find any uudecoded things buried in it, yeuchly uus = text.scan(/^begin.+^`\n^end\n/sm) attachments = [] @@ -938,91 +1009,109 @@ class IncomingMessage < ActiveRecord::Base end tempfile.close # Make attachment type from it, working out filename and mime type - attachment = FOIAttachment.new() - attachment.body = content - attachment.filename = uu.match(/^begin\s+[0-9]+\s+(.*)$/)[1] - self.info_request.apply_censor_rules_to_text!(attachment.filename) - calc_mime = AlaveteliFileTypes.filename_and_content_to_mimetype(attachment.filename, attachment.body) + filename = uu.match(/^begin\s+[0-9]+\s+(.*)$/)[1] + calc_mime = AlaveteliFileTypes.filename_and_content_to_mimetype(filename, content) if calc_mime calc_mime = normalise_content_type(calc_mime) - attachment.content_type = calc_mime + content_type = calc_mime else - attachment.content_type = 'application/octet-stream' + content_type = 'application/octet-stream' end - attachments += [attachment] - end - + hexdigest = Digest::MD5.hexdigest(content) + attachment = self.foi_attachments.find_or_create_by_hexdigest(:hexdigest => hexdigest) + attachment.update_attributes(:filename => filename, + :content_type => content_type, + :body => content, + :display_size => "0K") + attachment.save! + attachments << attachment + end return attachments end - # Returns all attachments for use in display code - # XXX is this called multiple times and should be cached? def get_attachments_for_display + parse_raw_email! + # return what user would consider attachments, i.e. not the main body main_part = get_main_body_text_part - leaves = get_attachment_leaves + attachments = [] + for attachment in self.foi_attachments + attachments << attachment if attachment != main_part + end + return attachments + end + def extract_attachments! + leaves = get_attachment_leaves # XXX check where else this is called from # XXX we have to call ensure_parts_counted after get_attachment_leaves # which is really messy. ensure_parts_counted - attachments = [] - for leaf in leaves - if leaf != main_part - attachment = FOIAttachment.new - - attachment.body = leaf.body - # As leaf.body causes MIME decoding which uses lots of RAM, do garbage collection here - # to prevent excess memory use. XXX not really sure if this helps reduce - # peak RAM use overall. Anyway, maybe there is something better to do than this. - GC.start - - attachment.filename = _get_censored_part_file_name(leaf) - if leaf.within_rfc822_attachment - attachment.within_rfc822_subject = leaf.within_rfc822_attachment.subject - # Test to see if we are in the first part of the attached - # RFC822 message and it is text, if so add headers. - # XXX should probably use hunting algorithm to find main text part, rather than - # just expect it to be first. This will do for now though. - # Example request that needs this: - # http://www.whatdotheyknow.com/request/2923/response/7013/attach/2/Cycle%20Path%20Bank.txt - if leaf.within_rfc822_attachment == leaf && leaf.content_type == 'text/plain' - headers = "" - for header in [ 'Date', 'Subject', 'From', 'To', 'Cc' ] - if leaf.within_rfc822_attachment.header.include?(header.downcase) - header_value = leaf.within_rfc822_attachment.header[header.downcase] - # Example message which has a blank Date header: - # http://www.whatdotheyknow.com/request/30747/response/80253/attach/html/17/Common%20Purpose%20Advisory%20Group%20Meeting%20Tuesday%202nd%20March.txt.html - if !header_value.blank? - headers = headers + header + ": " + header_value.to_s + "\n" - end + for leaf in leaves + body = leaf.body + # As leaf.body causes MIME decoding which uses lots of RAM, do garbage collection here + # to prevent excess memory use. XXX not really sure if this helps reduce + # peak RAM use overall. Anyway, maybe there is something better to do than this. + GC.start + if leaf.within_rfc822_attachment + within_rfc822_subject = leaf.within_rfc822_attachment.subject + # Test to see if we are in the first part of the attached + # RFC822 message and it is text, if so add headers. + # XXX should probably use hunting algorithm to find main text part, rather than + # just expect it to be first. This will do for now though. + # Example request that needs this: + # http://www.whatdotheyknow.com/request/2923/response/7013/attach/2/Cycle%20Path%20Bank.txt + if leaf.within_rfc822_attachment == leaf && leaf.content_type == 'text/plain' + headers = "" + for header in [ 'Date', 'Subject', 'From', 'To', 'Cc' ] + if leaf.within_rfc822_attachment.header.include?(header.downcase) + header_value = leaf.within_rfc822_attachment.header[header.downcase] + # Example message which has a blank Date header: + # http://www.whatdotheyknow.com/request/30747/response/80253/attach/html/17/Common%20Purpose%20Advisory%20Group%20Meeting%20Tuesday%202nd%20March.txt.html + if !header_value.blank? + headers = headers + header + ": " + header_value.to_s + "\n" end end - # XXX call _convert_part_body_to_text here, but need to get charset somehow - # e.g. http://www.whatdotheyknow.com/request/1593/response/3088/attach/4/Freedom%20of%20Information%20request%20-%20car%20oval%20sticker:%20Article%2020,%20Convention%20on%20Road%20Traffic%201949.txt - attachment.body = headers + "\n" + attachment.body - - # This is quick way of getting all headers, but instead we only add some a) to - # make it more usable, b) as at least one authority accidentally leaked security - # information into a header. - #attachment.body = leaf.within_rfc822_attachment.port.to_s end + # XXX call _convert_part_body_to_text here, but need to get charset somehow + # e.g. http://www.whatdotheyknow.com/request/1593/response/3088/attach/4/Freedom%20of%20Information%20request%20-%20car%20oval%20sticker:%20Article%2020,%20Convention%20on%20Road%20Traffic%201949.txt + body = headers + "\n" + body + + # This is quick way of getting all headers, but instead we only add some a) to + # make it more usable, b) as at least one authority accidentally leaked security + # information into a header. + #attachment.body = leaf.within_rfc822_attachment.port.to_s end - attachment.content_type = leaf.content_type - attachment.url_part_number = leaf.url_part_number - attachments += [attachment] end + hexdigest = Digest::MD5.hexdigest(body) + attachment = self.foi_attachments.find_or_create_by_hexdigest(:hexdigest => hexdigest) + attachment.update_attributes(:url_part_number => leaf.url_part_number, + :content_type => leaf.content_type, + :filename => _get_part_file_name(leaf), + :charset => leaf.charset, + :within_rfc822_subject => within_rfc822_subject, + :display_size => "0K", + :body => body) + attachment.save! + attachments << attachment.id end - - uudecode_attachments = get_main_body_text_uudecode_attachments - c = @count_first_uudecode_count - for uudecode_attachment in uudecode_attachments - c += 1 - uudecode_attachment.url_part_number = c - attachments += [uudecode_attachment] + main_part = get_main_body_text_part + # we don't use get_main_body_text_internal, as we want to avoid charset + # conversions, since /usr/bin/uudecode needs to deal with those. + # e.g. for https://secure.mysociety.org/admin/foi/request/show_raw_email/24550 + if !main_part.nil? + uudecoded_attachments = _uudecode_and_save_attachments(main_part.body) + c = @count_first_uudecode_count + for uudecode_attachment in uudecoded_attachments + c += 1 + uudecode_attachment.url_part_number = c + uudecode_attachment.save! + attachments << uudecode_attachment.id + end end - return attachments - end + # now get rid of any attachments we no longer have + FoiAttachment.destroy_all("id NOT IN (#{attachments.join(',')}) AND incoming_message_id = #{self.id}") + end # Returns body text as HTML with quotes flattened, and emails removed. def get_body_for_html_display(collapse_quoted_sections = true) @@ -1047,7 +1136,7 @@ class IncomingMessage < ActiveRecord::Base text.strip! # if there is nothing but quoted stuff, then show the subject if text == "FOLDED_QUOTED_SECTION" - text = "[Subject only] " + CGI.escapeHTML(self.mail.subject) + text + text = "[Subject only] " + CGI.escapeHTML(self.subject) + text end # and display link for quoted stuff text = text.gsub(/FOLDED_QUOTED_SECTION/, "\n\n" + '<span class="unfold_link"><a href="?unfold=1#incoming-'+self.id.to_s+'">show quoted sections</a></span>' + "\n\n") @@ -1063,6 +1152,7 @@ class IncomingMessage < ActiveRecord::Base return text end + # Returns text of email for using in quoted section when replying def get_body_for_quoting # Get the body text with emails and quoted sections removed @@ -1200,6 +1290,7 @@ class IncomingMessage < ActiveRecord::Base return text end + # Returns text for indexing def get_text_for_indexing_full return get_body_for_quoting + "\n\n" + get_attachment_text_full @@ -1209,23 +1300,6 @@ class IncomingMessage < ActiveRecord::Base return get_body_for_quoting + "\n\n" + get_attachment_text_clipped end - # Returns the name of the person the incoming message is from, or nil if - # there isn't one or if there is only an email address. XXX can probably - # remove from_name_if_present (which is a monkey patch) by just calling - # .from_addrs[0].name here instead? - def safe_mail_from - name = self.mail.from_name_if_present - if name.nil? - return nil - end - name = name.dup - self.info_request.apply_censor_rules_to_text!(name) - return name - end - - def mail_from_domain - return PublicBody.extract_domain_from_email(self.mail.from_addrs[0].spec) - end # Has message arrived "recently"? diff --git a/app/models/info_request.rb b/app/models/info_request.rb index f482ca700..cfef6ebd8 100644 --- a/app/models/info_request.rb +++ b/app/models/info_request.rb @@ -1,3 +1,4 @@ + # == Schema Information # Schema version: 95 # @@ -249,10 +250,10 @@ public if incoming_message.nil? || !incoming_message.valid_to_reply_to? 'Re: ' + self.email_subject_request else - if incoming_message.mail.subject.match(/^Re:/i) - incoming_message.mail.subject + if incoming_message.subject.match(/^Re:/i) + incoming_message.subject else - 'Re: ' + incoming_message.mail.subject + 'Re: ' + incoming_message.subject end end end diff --git a/app/models/request_mailer.rb b/app/models/request_mailer.rb index 75dc58447..272f2ea83 100644 --- a/app/models/request_mailer.rb +++ b/app/models/request_mailer.rb @@ -10,6 +10,7 @@ require 'alaveteli_file_types' class RequestMailer < ApplicationMailer + # Used when an FOI officer uploads a response from their web browser - this is # the "fake" email used to store in the same format in the database as if they # had emailed it. diff --git a/app/views/layouts/default.rhtml b/app/views/layouts/default.rhtml index 675bb38de..2f8e0bf36 100644 --- a/app/views/layouts/default.rhtml +++ b/app/views/layouts/default.rhtml @@ -49,7 +49,7 @@ <% end %> <% end %> <% if @has_json %> - <link rel="alternate" type="application/json" title="JSON version of this page" href="<%=h main_url(request.request_uri) %>.json"> + <link rel="alternate" type="application/json" title="JSON version of this page" href="<%=h main_url(request.request_uri, '.json') %>"> <% end %> <% if @no_crawl %> diff --git a/app/views/public_body/_search_ahead.rhtml b/app/views/public_body/_search_ahead.rhtml index 19c7eb4e8..436471544 100644 --- a/app/views/public_body/_search_ahead.rhtml +++ b/app/views/public_body/_search_ahead.rhtml @@ -1,4 +1,5 @@ <p> + <% if !@xapian_requests.nil? %> <% if @xapian_requests.results.size > 0 %> <h3><%= _('Top search results:') %></h3> <p> @@ -12,6 +13,7 @@ <%= render :partial => 'body_listing_single', :locals => { :public_body => result[:model] } %> <% end %> </div> + <% end %> </p> diff --git a/app/views/request/_search_ahead.rhtml b/app/views/request/_search_ahead.rhtml index 9c49680c3..d0b19de7d 100644 --- a/app/views/request/_search_ahead.rhtml +++ b/app/views/request/_search_ahead.rhtml @@ -1,4 +1,5 @@ <div id="request_search_ahead_results"> + <% if !@xapian_requests.nil? %> <% if @xapian_requests.results.size > 0 %> <h3><%= _("Possibly related requests:") %></h3> <% end %> @@ -9,4 +10,5 @@ <p> <a id="body-site-search-link" target="_blank"><%= _("Or search in their website for this information.") %></a> </p> + <% end %> </div> diff --git a/app/views/request/upload_response.rhtml b/app/views/request/upload_response.rhtml index eaa6602a1..38c3db268 100644 --- a/app/views/request/upload_response.rhtml +++ b/app/views/request/upload_response.rhtml @@ -23,7 +23,7 @@ file too large for email, use the form below.')%> <p><%= _('Enter your response below. You may attach one file (use email, or <a href="%s">contact us</a> if you need more).')% [help_contact_path] %></p> -<% form_tag '', :html => { :id => 'upload_response_form' }, :multipart => true do %> +<% form_tag '', :id => 'upload_response_form', :multipart => true do %> <p> <label class="form_label" for="body"><% _('Response:')%></label> <%= text_area_tag :body, "", :rows => 10, :cols => 55 %> diff --git a/db/migrate/104_create_foi_attachments.rb b/db/migrate/104_create_foi_attachments.rb new file mode 100644 index 000000000..c53cd4f64 --- /dev/null +++ b/db/migrate/104_create_foi_attachments.rb @@ -0,0 +1,18 @@ + +class CreateFoiAttachments < ActiveRecord::Migration + def self.up + create_table :foi_attachments do |t| + t.column :content_type, :text + t.column :filename, :text + t.column :charset, :text + t.column :display_size, :text + t.column :url_part_number, :integer + t.column :within_rfc822_subject, :text + t.column :incoming_message_id, :integer + end + end + + def self.down + drop_table :foi_attachments + end +end diff --git a/db/migrate/105_extend_incoming_message.rb b/db/migrate/105_extend_incoming_message.rb new file mode 100644 index 000000000..9db8649a0 --- /dev/null +++ b/db/migrate/105_extend_incoming_message.rb @@ -0,0 +1,17 @@ +class ExtendIncomingMessage < ActiveRecord::Migration + def self.up + add_column :incoming_messages, :sent_at, :time + add_column :incoming_messages, :subject, :text + add_column :incoming_messages, :safe_mail_from, :text + add_column :incoming_messages, :mail_from_domain, :text + add_column :incoming_messages, :valid_to_reply_to, :boolean + end + + def self.down + remove_column :incoming_messages, :sent_at + remove_column :incoming_messages, :subject + remove_column :incoming_messages, :safe_mail_from + remove_column :incoming_messages, :mail_from_domain + remove_column :incoming_messages, :valid_to_reply_to + end +end diff --git a/db/migrate/106_add_hex_digest_to_foi_attachment.rb b/db/migrate/106_add_hex_digest_to_foi_attachment.rb new file mode 100644 index 000000000..d9520a934 --- /dev/null +++ b/db/migrate/106_add_hex_digest_to_foi_attachment.rb @@ -0,0 +1,9 @@ +class AddHexDigestToFoiAttachment < ActiveRecord::Migration + def self.up + add_column :foi_attachments, :hexdigest, :string, :limit => 32 + end + + def self.down + remove_column :foi_attachments, :hexdigest + end +end diff --git a/db/migrate/107_add_date_parsed_field_to_incoming_message.rb b/db/migrate/107_add_date_parsed_field_to_incoming_message.rb new file mode 100644 index 000000000..fbc017134 --- /dev/null +++ b/db/migrate/107_add_date_parsed_field_to_incoming_message.rb @@ -0,0 +1,9 @@ +class AddDateParsedFieldToIncomingMessage < ActiveRecord::Migration + def self.up + add_column :incoming_messages, :last_parsed, :datetime + end + + def self.down + remove_column :incoming_messages, :last_parsed + end +end diff --git a/db/migrate/108_change_safe_mail_from_to_mail_from.rb b/db/migrate/108_change_safe_mail_from_to_mail_from.rb new file mode 100644 index 000000000..57997f674 --- /dev/null +++ b/db/migrate/108_change_safe_mail_from_to_mail_from.rb @@ -0,0 +1,11 @@ +class ChangeSafeMailFromToMailFrom < ActiveRecord::Migration + def self.up + remove_column :incoming_messages, :safe_mail_from + add_column :incoming_messages, :mail_from, :text + end + + def self.down + remove_column :incoming_messages, :mail_from + add_column :incoming_messages, :safe_mail_from, :text + end +end diff --git a/doc/CHANGES.md b/doc/CHANGES.md index ea09de187..758e6b56e 100644 --- a/doc/CHANGES.md +++ b/doc/CHANGES.md @@ -3,6 +3,9 @@ ## Highlighted features * It should now be possible to develop the software on OSX * Base design refactored: CSS simplified and reduced, base design colours removed, now provided in example Alaveteli theme override +* It is now possible to rebuild the xapian index for specific terms, rather than having to drop and rebuild the entire database every time (as previously). See rake xapian:rebuild_index for more info. +* When listing authorities, show all authorities in default locale, rather than only those in the currently selected locale. +* Ensure incoming emails are only ever parsed once (should give a performance boost) ## Upgrade notes * **IMPORTANT! We now depend on Xapian 1.2**, which means you may need to install Xapian from backports. See [issue #159] for more info. diff --git a/public/stylesheets/print.css b/public/stylesheets/print.css index d5401ead0..43d7a1807 100644 --- a/public/stylesheets/print.css +++ b/public/stylesheets/print.css @@ -35,4 +35,8 @@ p#request_status { } div.correspondence { page-break-before: avoid; +} + +#other-country-notice { + display: none; }
\ No newline at end of file diff --git a/script/cache-incoming-emails b/script/cache-incoming-emails new file mode 100644 index 000000000..a84a713d6 --- /dev/null +++ b/script/cache-incoming-emails @@ -0,0 +1,9 @@ +#!/bin/bash + +# Fill in all the database caches of text from body/attachments. +# Will take a while to run! Can use after clear-caches to refresh the database +# level caches if you like. + +LOC=`dirname $0` + +"$LOC/runner" 'IncomingMessage.find_each() { |im| print "info request " + im.info_request.id.to_s + ", incoming message " + im.id.to_s + ": " + im.extract_attachments!.count.to_s + " attachments extracted to " + im.foi_attachments[0].directory + "; main body folded: " + im.get_main_body_text_folded.size.to_s + " attachment clipped:" + im.get_attachment_text_clipped.size.to_s + "\n" }' diff --git a/script/clear-caches b/script/clear-caches index e9438f92d..2d91774ef 100755 --- a/script/clear-caches +++ b/script/clear-caches @@ -4,7 +4,7 @@ LOC=`dirname $0` -"$LOC/runner" "ActiveRecord::Base.connection.execute(\"update incoming_messages set cached_attachment_text_clipped = null, cached_main_body_text_unfolded = null, cached_main_body_text_folded = null\")" +"$LOC/runner" "ActiveRecord::Base.connection.execute(\"update incoming_messages set cached_attachment_text_clipped = null, cached_main_body_text_unfolded = null, cached_main_body_text_folded = null, sent_at = null, subject = null, safe_mail_from = null, mail_from_domain = null, valid_to_reply_to = null\")" # Remove page cache (do it in two stages so live site gets cache cleared faster) rm -fr $LOC/../old-cache diff --git a/script/fill-database-caches b/script/fill-database-caches deleted file mode 100755 index e6b525144..000000000 --- a/script/fill-database-caches +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/bash - -# Fill in all the database caches of text from body/attachments. -# Will take a while to run! Can use after clear-caches to refresh the database -# level caches if you like. - -LOC=`dirname $0` - -"$LOC/runner" 'IncomingMessage.find_each() { |im| print im.id.to_s + " id: main body folded:" + im.get_main_body_text_folded.size.to_s + " attachment clipped:" + im.get_attachment_text_clipped.size.to_s + "\n" }' - - diff --git a/spec/controllers/public_body_controller_spec.rb b/spec/controllers/public_body_controller_spec.rb index 53e6c169a..8182e1331 100644 --- a/spec/controllers/public_body_controller_spec.rb +++ b/spec/controllers/public_body_controller_spec.rb @@ -68,17 +68,19 @@ describe PublicBodyController, "when listing bodies" do response.should be_success end - it "should list all bodies even when there are no translations for selected locale" do + it "should list all bodies from default locale, even when there are no translations for selected locale" do + PublicBody.with_locale(:en) do + english_only = PublicBody.new(:name => 'English only', + :short_name => 'EO', + :request_email => 'english@flourish.org', + :last_edit_editor => 'test', + :last_edit_comment => '') + english_only.save + end PublicBody.with_locale(:es) do - - spanish_only = PublicBody.new(:name => 'Spanish only', - :short_name => 'SO', - :request_email => 'spanish@flourish.org', - :last_edit_editor => 'test', - :last_edit_comment => '') - end - get :list - assigns[:public_bodies].length.should == 3 + get :list + assigns[:public_bodies].length.should == 3 + end end it "should list bodies in alphabetical order" do @@ -175,7 +177,7 @@ describe PublicBodyController, "when doing type ahead searches" do it "should return nothing for the empty query string" do get :search_typeahead, :q => "" response.should render_template('public_body/_search_ahead') - assigns[:xapian_requests].results.size.should == 0 + assigns[:xapian_requests].should be_nil end it "should return a body matching the given keyword, but not users with a matching description" do @@ -200,10 +202,9 @@ describe PublicBodyController, "when doing type ahead searches" do assigns[:xapian_requests].results[0][:model].name.should == public_bodies(:humpadink_public_body).name end - it "should return partial matches" do - get :search_typeahead, :q => "geral" # 'geral' for 'Geraldine' + it "should not return matches for short words" do + get :search_typeahead, :q => "b" response.should render_template('public_body/_search_ahead') - assigns[:xapian_requests].results.size.should == 1 - assigns[:xapian_requests].results[0][:model].name.should == public_bodies(:geraldine_public_body).name + assigns[:xapian_requests].should be_nil end end diff --git a/spec/controllers/request_controller_spec.rb b/spec/controllers/request_controller_spec.rb index 3420d212e..4994c2a8f 100644 --- a/spec/controllers/request_controller_spec.rb +++ b/spec/controllers/request_controller_spec.rb @@ -105,10 +105,12 @@ describe RequestController, "when showing one request" do integrate_views it "should receive incoming messages, send email to creator, and show them" do + ir = info_requests(:fancy_dog_request) + ir.incoming_messages.each { |x| x.parse_raw_email! } + get :show, :url_title => 'why_do_you_have_such_a_fancy_dog' size_before = assigns[:info_request_events].size - ir = info_requests(:fancy_dog_request) receive_incoming_mail('incoming-request-plain.email', ir.incoming_email) deliveries = ActionMailer::Base.deliveries deliveries.size.should == 1 @@ -120,6 +122,8 @@ describe RequestController, "when showing one request" do end it "should download attachments" do + ir = info_requests(:fancy_dog_request) + ir.incoming_messages.each { |x| x.parse_raw_email! } get :show, :url_title => 'why_do_you_have_such_a_fancy_dog' response.content_type.should == "text/html" size_before = assigns[:info_request_events].size @@ -129,7 +133,7 @@ describe RequestController, "when showing one request" do get :show, :url_title => 'why_do_you_have_such_a_fancy_dog' (assigns[:info_request_events].size - size_before).should == 1 - + ir.reload get :get_attachment, :incoming_message_id => ir.incoming_messages[1].id, :id => ir.id, :part => 2, :file_name => ['hello.txt'] response.content_type.should == "text/plain" response.should have_text(/Second hello/) @@ -148,16 +152,50 @@ describe RequestController, "when showing one request" do it "should generate valid HTML verson of plain text attachments " do ir = info_requests(:fancy_dog_request) receive_incoming_mail('incoming-request-two-same-name.email', ir.incoming_email) + ir.reload get :get_attachment_as_html, :incoming_message_id => ir.incoming_messages[1].id, :id => ir.id, :part => 2, :file_name => ['hello.txt.html'], :skip_cache => 1 response.content_type.should == "text/html" response.should have_text(/Second hello/) end + it "should not cause a reparsing of the raw email, even when the result would be a 404 " do + ir = info_requests(:fancy_dog_request) + receive_incoming_mail('incoming-request-two-same-name.email', ir.incoming_email) + ir.reload + attachment = IncomingMessage.get_attachment_by_url_part_number(ir.incoming_messages[1].get_attachments_for_display, 2) + attachment.body.should have_text(/Second hello/) + + # change the raw_email associated with the message; this only be reparsed when explicitly asked for + ir.incoming_messages[1].raw_email.data = ir.incoming_messages[1].raw_email.data.sub("Second", "Third") + # asking for an attachment by the wrong filename results + # in a 404 for browsing users. This shouldn't cause a + # re-parse... + lambda { + get :get_attachment_as_html, :incoming_message_id => ir.incoming_messages[1].id, :id => ir.id, :part => 2, :file_name => ['hello.txt.baz.html'], :skip_cache => 1 + }.should raise_error(ActiveRecord::RecordNotFound) + + attachment = IncomingMessage.get_attachment_by_url_part_number(ir.incoming_messages[1].get_attachments_for_display, 2) + attachment.body.should have_text(/Second hello/) + + # ...nor should asking for it by its correct filename... + get :get_attachment_as_html, :incoming_message_id => ir.incoming_messages[1].id, :id => ir.id, :part => 2, :file_name => ['hello.txt.html'], :skip_cache => 1 + response.should_not have_text(/Third hello/) + + # ...but if we explicitly ask for attachments to be extracted, then they should be + force = true + ir.incoming_messages[1].parse_raw_email!(force) + attachment = IncomingMessage.get_attachment_by_url_part_number(ir.incoming_messages[1].get_attachments_for_display, 2) + attachment.body.should have_text(/Second hello/) + get :get_attachment_as_html, :incoming_message_id => ir.incoming_messages[1].id, :id => ir.id, :part => 2, :file_name => ['hello.txt.html'], :skip_cache => 1 + response.should have_text(/Third hello/) + end + it "should treat attachments with unknown extensions as binary" do ir = info_requests(:fancy_dog_request) receive_incoming_mail('incoming-request-attachment-unknown-extension.email', ir.incoming_email) + ir.reload - get :get_attachment, :incoming_message_id => ir.incoming_messages[1].id, :id => ir.id, :part => 2, :file_name => ['hello.qwglhm'] + get :get_attachment, :incoming_message_id => ir.incoming_messages[1].id, :id => ir.id, :part => 2, :file_name => ['hello.qwglhm'], :skip_cache => 1 response.content_type.should == "application/octet-stream" response.should have_text(/an unusual sort of file/) end @@ -200,8 +238,9 @@ describe RequestController, "when showing one request" do ir.user.censor_rules << censor_rule receive_incoming_mail('incoming-request-two-same-name.email', ir.incoming_email) + ir.reload - get :get_attachment, :incoming_message_id => ir.incoming_messages[1].id, :id => ir.id, :part => 2, :file_name => ['hello.txt'] + get :get_attachment, :incoming_message_id => ir.incoming_messages[1].id, :id => ir.id, :part => 2, :file_name => ['hello.txt'], :skip_cache => 1 response.content_type.should == "text/plain" response.should have_text(/xxxxxx hello/) end @@ -210,7 +249,22 @@ describe RequestController, "when showing one request" do ir = info_requests(:fancy_dog_request) receive_incoming_mail('incoming-request-two-same-name.email', ir.incoming_email) + # XXX this is horrid, but don't know a better way. If we + # don't do this, the info_request_event to which the + # info_request is attached still uses the unmodified + # version from the fixture. + #event = info_request_events(:useless_incoming_message_event) + ir.reload + assert ir.info_request_events[3].incoming_message.get_attachments_for_display.count == 2 + ir.save! + ir.incoming_messages.last.save! get :show, :url_title => 'why_do_you_have_such_a_fancy_dog' + assert assigns[:info_request].info_request_events[3].incoming_message.get_attachments_for_display.count == 2 + # the issue is that the info_request_events have got cached on them the old info_requests. + # where i'm at: trying to replace those fields that got re-read from the raw email. however tests are failing in very strange ways. currently I don't appear to be getting any attachments parsed in at all when in the template (see "*****" in _correspondence.rhtml) but do when I'm in the code. + + # so at this point, assigns[:info_request].incoming_messages[1].get_attachments_for_display is returning stuff, but the equivalent thing in the template isn't. + # but something odd is that the above is return a whole load of attachments which aren't there in the controller response.body.should have_tag("p.attachment strong", /hello.txt/m) censor_rule = CensorRule.new() @@ -225,10 +279,17 @@ describe RequestController, "when showing one request" do end it "should make a zipfile available, which has a different URL when it changes" do + title = 'why_do_you_have_such_a_fancy_dog' ir = info_requests(:fancy_dog_request) session[:user_id] = ir.user.id # bob_smith_user + get :download_entire_request, :url_title => title + assigns[:url_path].should have_text(/#{title}.zip$/) + old_path = assigns[:url_path] + response.location.should have_text(/#{assigns[:url_path]}$/) + zipfile = Zip::ZipFile.open(File.join(File.dirname(__FILE__), "../../cache/zips", old_path)) { |zipfile| + zipfile.count.should == 2 + } receive_incoming_mail('incoming-request-two-same-name.email', ir.incoming_email) - title = 'why_do_you_have_such_a_fancy_dog' get :download_entire_request, :url_title => title assigns[:url_path].should have_text(/#{title}.zip$/) old_path = assigns[:url_path] @@ -1286,9 +1347,10 @@ end describe RequestController, "authority uploads a response from the web interface" do + integrate_views fixtures :public_bodies, :public_body_translations, :public_body_versions, :users, :info_requests, :incoming_messages, :outgoing_messages, :comments, :info_request_events, :track_things - before(:all) do + before(:each) do # domain after the @ is used for authentication of FOI officers, so to test it # we need a user which isn't at localhost. @normal_user = User.new(:name => "Mr. Normal", :email => "normal-user@flourish.org", @@ -1342,7 +1404,7 @@ describe RequestController, "authority uploads a response from the web interface # How do I test a file upload in rails? # http://stackoverflow.com/questions/1178587/how-do-i-test-a-file-upload-in-rails - it "should let the requester upload a file" do + it "should let the authority upload a file" do @ir = info_requests(:fancy_dog_request) incoming_before = @ir.incoming_messages.size session[:user_id] = @foi_officer_user.id @@ -1398,7 +1460,7 @@ describe RequestController, "when doing type ahead searches" do it "should return nothing for the empty query string" do get :search_typeahead, :q => "" response.should render_template('request/_search_ahead.rhtml') - assigns[:xapian_requests].results.size.should == 0 + assigns[:xapian_requests].should be_nil end it "should return a request matching the given keyword, but not users with a matching description" do @@ -1416,11 +1478,10 @@ describe RequestController, "when doing type ahead searches" do assigns[:xapian_requests].results[1][:model].title.should == info_requests(:naughty_chicken_request).title end - it "should return partial matches" do - get :search_typeahead, :q => "chick" # 'chick' for 'chicken' + it "should not return matches for short words" do + get :search_typeahead, :q => "a" response.should render_template('request/_search_ahead.rhtml') - assigns[:xapian_requests].results.size.should == 1 - assigns[:xapian_requests].results[0][:model].title.should == info_requests(:naughty_chicken_request).title + assigns[:xapian_requests].should be_nil end end diff --git a/spec/fixtures/foi_attachments.yml b/spec/fixtures/foi_attachments.yml new file mode 100644 index 000000000..8b1378917 --- /dev/null +++ b/spec/fixtures/foi_attachments.yml @@ -0,0 +1 @@ + diff --git a/spec/integration/view_request_spec.rb b/spec/integration/view_request_spec.rb new file mode 100644 index 000000000..cf1e4ca6c --- /dev/null +++ b/spec/integration/view_request_spec.rb @@ -0,0 +1,32 @@ +require File.expand_path(File.dirname(__FILE__) + '/../spec_helper') + +describe "When viewing requests" do + + fixtures [ + :users, + :public_bodies, + :public_body_translations, + :public_body_versions, + :info_requests, + :raw_emails, + :outgoing_messages, + :incoming_messages, + :comments, + :info_request_events, + :track_things, + ] + + before(:each) do + emails = raw_emails.clone + load_raw_emails_data(emails) + end + + it "should not make endlessly recursive JSON <link>s" do + @dog_request = info_requests(:fancy_dog_request) + get "request/#{@dog_request.url_title}?unfold=1" + response.body.should_not include("dog?unfold=1.json") + response.body.should include("dog.json?unfold=1") + end + +end + diff --git a/spec/models/incoming_message_spec.rb b/spec/models/incoming_message_spec.rb index 1a4baca0b..ed31b7c5c 100644 --- a/spec/models/incoming_message_spec.rb +++ b/spec/models/incoming_message_spec.rb @@ -64,7 +64,7 @@ end describe IncomingMessage, " display attachments" do it "should not show slashes in filenames" do - foi_attachment = FOIAttachment.new() + foi_attachment = FoiAttachment.new() # http://www.whatdotheyknow.com/request/post_commercial_manager_librarie#incoming-17233 foi_attachment.filename = "FOI/09/066 RESPONSE TO FOI REQUEST RECEIVED 21st JANUARY 2009.txt" expected_display_filename = foi_attachment.filename.gsub(/\//, " ") @@ -72,10 +72,11 @@ describe IncomingMessage, " display attachments" do end it "should not show slashes in subject generated filenames" do - foi_attachment = FOIAttachment.new() + foi_attachment = FoiAttachment.new() # http://www.whatdotheyknow.com/request/post_commercial_manager_librarie#incoming-17233 foi_attachment.within_rfc822_subject = "FOI/09/066 RESPONSE TO FOI REQUEST RECEIVED 21st JANUARY 2009" foi_attachment.content_type = 'text/plain' + foi_attachment.ensure_filename! expected_display_filename = foi_attachment.within_rfc822_subject.gsub(/\//, " ") + ".txt" foi_attachment.display_filename.should == expected_display_filename end @@ -118,8 +119,7 @@ describe IncomingMessage, " checking validity to reply to" do @incoming_message = IncomingMessage.new() @incoming_message.stub!(:mail).and_return(@mail) - - @incoming_message.valid_to_reply_to?.should == result + @incoming_message._calculate_valid_to_reply_to.should == result end it "says a valid email is fine" do @@ -284,10 +284,8 @@ describe IncomingMessage, " when censoring data" do end it "should apply censor rules to From: addresses" do - mock_mail = mock('Email object') - mock_mail.stub!(:from_name_if_present).and_return("Stilton Mouse") - @im.stub!(:mail).and_return(mock_mail) - + @im.stub!(:mail_from).and_return("Stilton Mouse") + @im.stub!(:last_parsed).and_return(Time.now) safe_mail_from = @im.safe_mail_from safe_mail_from.should == "Jarlsberg Mouse" end @@ -326,21 +324,23 @@ end describe IncomingMessage, " when uudecoding bad messages" do + fixtures :incoming_messages, :raw_emails, :public_bodies, :public_body_translations, :info_requests, :users, :foi_attachments + + before(:each) do + load_raw_emails_data(raw_emails) + end + it "should be able to do it at all" do mail_body = load_file_fixture('incoming-request-bad-uuencoding.email') mail = TMail::Mail.parse(mail_body) mail.base64_decode - - im = IncomingMessage.new + im = incoming_messages(:useless_incoming_message) im.stub!(:mail).and_return(mail) - ir = InfoRequest.new - im.info_request = ir - u = User.new - ir.user = u - - attachments = im.get_main_body_text_uudecode_attachments - attachments.size.should == 1 - attachments[0].filename.should == 'moo.txt' + im.extract_attachments! + attachments = im.foi_attachments + attachments.size.should == 2 + attachments[1].filename.should == 'moo.txt' + im.get_attachments_for_display.size.should == 1 end it "should apply censor rules" do @@ -348,12 +348,9 @@ describe IncomingMessage, " when uudecoding bad messages" do mail = TMail::Mail.parse(mail_body) mail.base64_decode - im = IncomingMessage.new + im = incoming_messages(:useless_incoming_message) im.stub!(:mail).and_return(mail) - ir = InfoRequest.new - im.info_request = ir - u = User.new - ir.user = u + ir = info_requests(:fancy_dog_request) @censor_rule = CensorRule.new() @censor_rule.text = "moo" @@ -361,26 +358,31 @@ describe IncomingMessage, " when uudecoding bad messages" do @censor_rule.last_edit_editor = "unknown" @censor_rule.last_edit_comment = "none" ir.censor_rules << @censor_rule + im.extract_attachments! - attachments = im.get_main_body_text_uudecode_attachments + attachments = im.get_attachments_for_display attachments.size.should == 1 - attachments[0].filename.should == 'bah.txt' + attachments[0].display_filename.should == 'bah.txt' end end describe IncomingMessage, "when messages are attached to messages" do + fixtures :incoming_messages, :raw_emails, :public_bodies, :public_body_translations, :info_requests, :users, :foi_attachments + + before(:each) do + load_raw_emails_data(raw_emails) + end + it "should flatten all the attachments out" do mail_body = load_file_fixture('incoming-request-attach-attachments.email') mail = TMail::Mail.parse(mail_body) mail.base64_decode - im = IncomingMessage.new + im = incoming_messages(:useless_incoming_message) im.stub!(:mail).and_return(mail) - ir = InfoRequest.new - im.info_request = ir - u = User.new - ir.user = u + + im.extract_attachments! attachments = im.get_attachments_for_display attachments.size.should == 3 @@ -391,17 +393,20 @@ describe IncomingMessage, "when messages are attached to messages" do end describe IncomingMessage, "when Outlook messages are attached to messages" do + fixtures :incoming_messages, :raw_emails, :public_bodies, :public_body_translations, :info_requests, :users, :foi_attachments + + before(:each) do + load_raw_emails_data(raw_emails) + end + it "should flatten all the attachments out" do mail_body = load_file_fixture('incoming-request-oft-attachments.email') mail = TMail::Mail.parse(mail_body) mail.base64_decode - im = IncomingMessage.new + im = incoming_messages(:useless_incoming_message) im.stub!(:mail).and_return(mail) - ir = InfoRequest.new - im.info_request = ir - u = User.new - ir.user = u + im.extract_attachments! attachments = im.get_attachments_for_display attachments.size.should == 2 @@ -411,17 +416,20 @@ describe IncomingMessage, "when Outlook messages are attached to messages" do end describe IncomingMessage, "when TNEF attachments are attached to messages" do + fixtures :incoming_messages, :raw_emails, :public_bodies, :public_body_translations, :info_requests, :users, :foi_attachments + + before(:each) do + load_raw_emails_data(raw_emails) + end + it "should flatten all the attachments out" do mail_body = load_file_fixture('incoming-request-tnef-attachments.email') mail = TMail::Mail.parse(mail_body) mail.base64_decode - im = IncomingMessage.new + im = incoming_messages(:useless_incoming_message) im.stub!(:mail).and_return(mail) - ir = InfoRequest.new - im.info_request = ir - u = User.new - ir.user = u + im.extract_attachments! attachments = im.get_attachments_for_display attachments.size.should == 2 diff --git a/spec/models/info_request_event_spec.rb b/spec/models/info_request_event_spec.rb index 666f5cb1a..3229284cc 100644 --- a/spec/models/info_request_event_spec.rb +++ b/spec/models/info_request_event_spec.rb @@ -50,5 +50,30 @@ describe InfoRequestEvent do end end + + describe "doing search/index stuff" do + fixtures :public_bodies, :public_body_translations, :public_body_versions, :users, :info_requests, :raw_emails, :incoming_messages, :outgoing_messages, :comments, :info_request_events, :track_things + + before(:each) do + load_raw_emails_data(raw_emails) + parse_all_incoming_messages + end + + it 'should get search text for outgoing messages' do + event = info_request_events(:useless_outgoing_message_event) + message = outgoing_messages(:useless_outgoing_message).body + event.search_text_main.should == message + "\n\n" + end + + it 'should get search text for incoming messages' do + event = info_request_events(:useless_incoming_message_event) + event.search_text_main.strip.should == "No way! I'm not going to tell you that in a month of Thursdays.\n\nThe Geraldine Quango" + end + + + end + + + end diff --git a/spec/models/info_request_spec.rb b/spec/models/info_request_spec.rb index 409d48ede..b1baa66a2 100644 --- a/spec/models/info_request_spec.rb +++ b/spec/models/info_request_spec.rb @@ -143,8 +143,8 @@ describe InfoRequest do end it "should cope with indexing after item is deleted" do + IncomingMessage.find(:all).each{|x| x.parse_raw_email!} rebuild_xapian_index - # delete event from underneath indexing; shouldn't cause error info_request_events(:useless_incoming_message_event).save! info_request_events(:useless_incoming_message_event).destroy diff --git a/spec/models/xapian_spec.rb b/spec/models/xapian_spec.rb index 932966dfb..ec11c944b 100644 --- a/spec/models/xapian_spec.rb +++ b/spec/models/xapian_spec.rb @@ -4,6 +4,7 @@ describe User, " when indexing users with Xapian" do fixtures :public_bodies, :public_body_translations, :public_body_versions, :users, :info_requests, :raw_emails, :incoming_messages, :outgoing_messages, :comments, :info_request_events, :track_things it "should search by name" do + parse_all_incoming_messages rebuild_xapian_index # def InfoRequest.full_search(models, query, order, ascending, collapse, per_page, page) xapian_object = InfoRequest.full_search([User], "Silly", 'created_at', true, nil, 100, 1) @@ -12,9 +13,10 @@ describe User, " when indexing users with Xapian" do end it "should search by 'about me' text" do + rebuild_xapian_index user = users(:bob_smith_user) - # def InfoRequest.full_search(models, query, order, ascending, collapse, per_page, page) + # def InfoRequest.full_search(models, query, order, ascending, collapse, per_page, page) xapian_object = InfoRequest.full_search([User], "stuff", 'created_at', true, nil, 100, 1) xapian_object.results.size.should == 1 xapian_object.results[0][:model].should == user @@ -332,6 +334,66 @@ describe PublicBody, " when indexing authorities by tag" do end end +describe PublicBody, " when only indexing selected things on a rebuild" do + fixtures :public_bodies, :public_body_translations, :public_body_versions, :users, :info_requests, :raw_emails, :incoming_messages, :outgoing_messages, :comments, :info_request_events, :track_things + before(:each) do + load_raw_emails_data(raw_emails) + end + + it "should only index what we ask it to" do + rebuild_xapian_index + body = public_bodies(:geraldine_public_body) + body.tag_string = 'mice:3' + body.name = 'frobzn' + body.save! + # only reindex 'variety' term + dropfirst = true + terms = "V" + values = false + texts = false + rebuild_xapian_index(terms, values, texts, dropfirst) + xapian_object = InfoRequest.full_search([PublicBody], "tag:mice", 'created_at', true, nil, 100, 1) + xapian_object.results.size.should == 0 + xapian_object = InfoRequest.full_search([PublicBody], "frobzn", 'created_at', true, nil, 100, 1) + xapian_object.results.size.should == 0 + xapian_object = InfoRequest.full_search([PublicBody], "variety:authority", 'created_at', true, nil, 100, 1) + xapian_object.results.size.should == 2 + # only reindex 'tag' and text + dropfirst = true + terms = "U" + values = false + texts = true + rebuild_xapian_index(terms, values, texts, dropfirst) + xapian_object = InfoRequest.full_search([PublicBody], "tag:mice", 'created_at', true, nil, 100, 1) + xapian_object.results.size.should == 1 + xapian_object = InfoRequest.full_search([PublicBody], "frobzn", 'created_at', true, nil, 100, 1) + xapian_object.results.size.should == 1 + xapian_object = InfoRequest.full_search([PublicBody], "variety:authority", 'created_at', true, nil, 100, 1) + xapian_object.results.size.should == 0 + # only reindex 'variety' term, but keeping the existing data in-place + dropfirst = false + terms = "V" + texts = false + rebuild_xapian_index(terms, values, texts, dropfirst) + xapian_object = InfoRequest.full_search([PublicBody], "tag:mice", 'created_at', true, nil, 100, 1) + xapian_object.results.size.should == 1 + xapian_object = InfoRequest.full_search([PublicBody], "frobzn", 'created_at', true, nil, 100, 1) + xapian_object.results.size.should == 1 + xapian_object = InfoRequest.full_search([PublicBody], "variety:authority", 'created_at', true, nil, 100, 1) + xapian_object.results.size.should == 2 + # only reindex 'variety' term, blowing away existing data + dropfirst = true + rebuild_xapian_index(terms, values, texts, dropfirst) + xapian_object = InfoRequest.full_search([PublicBody], "tag:mice", 'created_at', true, nil, 100, 1) + xapian_object.results.size.should == 0 + xapian_object = InfoRequest.full_search([PublicBody], "frobzn", 'created_at', true, nil, 100, 1) + xapian_object.results.size.should == 0 + xapian_object = InfoRequest.full_search([PublicBody], "variety:authority", 'created_at', true, nil, 100, 1) + xapian_object.results.size.should == 2 + end +end + + diff --git a/spec/spec_helper.rb b/spec/spec_helper.rb index ecb67a3b4..5c5cd9a7f 100644 --- a/spec/spec_helper.rb +++ b/spec/spec_helper.rb @@ -77,12 +77,21 @@ def load_file_fixture(file_name) return content end -def rebuild_xapian_index +def rebuild_xapian_index(terms = true, values = true, texts = true, dropfirst = true) + parse_all_incoming_messages + if dropfirst + begin + ActsAsXapian.readable_init + FileUtils.rm_r(ActsAsXapian.db_path) + rescue RuntimeError + end + ActsAsXapian.writable_init + end verbose = false # safe_rebuild=true, which involves forking to avoid memory leaks, doesn't work well with rspec. # unsafe is significantly faster, and we can afford possible memory leaks while testing. safe_rebuild = false - ActsAsXapian.rebuild_index(["PublicBody", "User", "InfoRequestEvent"].map{|m| m.constantize}, verbose, safe_rebuild) + ActsAsXapian.rebuild_index(["PublicBody", "User", "InfoRequestEvent"].map{|m| m.constantize}, verbose, terms, values, texts, safe_rebuild) end def update_xapian_index @@ -160,3 +169,7 @@ def load_raw_emails_data(raw_emails) end raw_email.data = load_file_fixture("useless_raw_email.email") end + +def parse_all_incoming_messages + IncomingMessage.find(:all).each{|x| x.parse_raw_email!} +end diff --git a/vendor/plugins/acts_as_xapian/lib/acts_as_xapian.rb b/vendor/plugins/acts_as_xapian/lib/acts_as_xapian.rb index 0af49dffd..fb6a08979 100644 --- a/vendor/plugins/acts_as_xapian/lib/acts_as_xapian.rb +++ b/vendor/plugins/acts_as_xapian/lib/acts_as_xapian.rb @@ -589,7 +589,7 @@ module ActsAsXapian # Incremental update_index calls above are suspended while this rebuild # happens (i.e. while the .new database is there) - any index update jobs # are left in the database, and will run after the rebuild has finished. - def ActsAsXapian.rebuild_index(model_classes, verbose = false, safe_rebuild = true) + def ActsAsXapian.rebuild_index(model_classes, verbose = false, terms = true, values = true, texts = true, safe_rebuild = true) #raise "when rebuilding all, please call as first and only thing done in process / task" if not ActsAsXapian.writable_db.nil? prepare_environment @@ -603,7 +603,7 @@ module ActsAsXapian # Index everything if safe_rebuild - _rebuild_index_safely(model_classes, verbose) + _rebuild_index_safely(model_classes, verbose, terms, values, texts) else # Save time by running the indexing in one go and in-process ActsAsXapian.writable_init(".new") @@ -611,7 +611,7 @@ module ActsAsXapian STDOUT.puts("ActsAsXapian.rebuild_index: Rebuilding #{model_class.to_s}") if verbose model_class.find(:all).each do |model| STDOUT.puts("ActsAsXapian.rebuild_index #{model_class} #{model.id}") if verbose - model.xapian_index + model.xapian_index(terms, values, texts) end end # make sure everything is written and close @@ -641,7 +641,7 @@ module ActsAsXapian # so they get the new db end - def ActsAsXapian._rebuild_index_safely(model_classes, verbose) + def ActsAsXapian._rebuild_index_safely(model_classes, verbose, terms, values, texts) batch_size = 1000 for model_class in model_classes model_class_count = model_class.count @@ -664,7 +664,7 @@ module ActsAsXapian STDOUT.puts("ActsAsXapian.rebuild_index: New batch. #{model_class.to_s} from #{i} to #{i + batch_size} of #{model_class_count} pid #{Process.pid.to_s}") if verbose model_class.find(:all, :limit => batch_size, :offset => i, :order => :id).each do |model| STDOUT.puts("ActsAsXapian.rebuild_index #{model_class} #{model.id}") if verbose - model.xapian_index + model.xapian_index(terms, values, texts) end # make sure everything is written ActsAsXapian.writable_db.flush @@ -738,7 +738,7 @@ module ActsAsXapian end # Store record in the Xapian database - def xapian_index + def xapian_index(terms = true, values = true, texts = true) # if we have a conditional function for indexing, call it and destory object if failed if self.class.xapian_options.include?(:if) if_value = xapian_value(self.class.xapian_options[:if], :boolean) @@ -748,37 +748,90 @@ module ActsAsXapian end end + if self.class.to_s == "PublicBody" and self.url_name == "tgq" + +#require 'ruby-debug' +#debugger + end # otherwise (re)write the Xapian record for the object - doc = Xapian::Document.new - ActsAsXapian.term_generator.document = doc + ActsAsXapian.readable_init + existing_query = Xapian::Query.new("I" + self.xapian_document_term) + ActsAsXapian.enquire.query = existing_query + match = ActsAsXapian.enquire.mset(0,1,1).matches[0] - doc.data = self.xapian_document_term + if !match.nil? + doc = match.document + else + doc = Xapian::Document.new + doc.data = self.xapian_document_term + doc.add_term("M" + self.class.to_s) + doc.add_term("I" + doc.data) + end + ActsAsXapian.term_generator.document = doc + # work out what to index. XXX for now, this is only selective on "terms". + terms_to_index = [] + drop_all_terms = false + if terms and self.xapian_options[:terms] + terms_to_index = self.xapian_options[:terms].dup + if terms.is_a?(String) + terms_to_index.reject!{|term| !terms.include?(term[1])} + if terms_to_index.length == self.xapian_options[:terms].length + drop_all_terms = true + end + else + drop_all_terms = true + end + end + texts_to_index = [] + if texts and self.xapian_options[:texts] + texts_to_index = self.xapian_options[:texts] + end + values_to_index = [] + if values and self.xapian_options[:values] + values_to_index = self.xapian_options[:values] + end - doc.add_term("M" + self.class.to_s) - doc.add_term("I" + doc.data) - if self.xapian_options[:terms] - for term in self.xapian_options[:terms] - value = xapian_value(term[0]) - if value.kind_of?(Array) + # clear any existing values that we might want to replace + if drop_all_terms && texts + # as an optimisation, if we're reindexing all of both, we remove everything + doc.clear_terms + doc.add_term("M" + self.class.to_s) + doc.add_term("I" + doc.data) + else + term_prefixes_to_index = terms_to_index.map {|x| x[1]} + for existing_term in doc.terms + first_letter = existing_term.term[0...1] + if !"MI".include?(first_letter) + if first_letter.match("^[A-Z]+") && terms_to_index.include?(first_letter) + doc.remove_term(existing_term.term) + elsif texts + doc.remove_term(existing_term.term) + end + end + end + end + # for now, we always clear values + doc.clear_values + + for term in terms_to_index + value = xapian_value(term[0]) + if value.kind_of?(Array) for v in value - doc.add_term(term[1] + v) + doc.add_term(term[1] + v) end - else + else doc.add_term(term[1] + value) - end - end + end end - if self.xapian_options[:values] - for value in self.xapian_options[:values] - doc.add_value(value[1], xapian_value(value[0], value[3])) - end + # values + for value in values_to_index + doc.add_value(value[1], xapian_value(value[0], value[3])) end - if self.xapian_options[:texts] - for text in self.xapian_options[:texts] - ActsAsXapian.term_generator.increase_termpos # stop phrases spanning different text fields - # XXX the "1" here is a weight that could be varied for a boost function - ActsAsXapian.term_generator.index_text(xapian_value(text, nil, true), 1) - end + # texts + for text in texts_to_index + ActsAsXapian.term_generator.increase_termpos # stop phrases spanning different text fields + # XXX the "1" here is a weight that could be varied for a boost function + ActsAsXapian.term_generator.index_text(xapian_value(text, nil, true), 1) end ActsAsXapian.writable_db.replace_document("I" + doc.data, doc) diff --git a/vendor/plugins/acts_as_xapian/lib/tasks/xapian.rake b/vendor/plugins/acts_as_xapian/lib/tasks/xapian.rake index 7168895f9..d18cd07d5 100644 --- a/vendor/plugins/acts_as_xapian/lib/tasks/xapian.rake +++ b/vendor/plugins/acts_as_xapian/lib/tasks/xapian.rake @@ -15,14 +15,27 @@ namespace :xapian do # Parameters - specify 'models="PublicBody User"' to say which models # you index with Xapian. - # This totally rebuilds the database, so you will want to restart any - # web server afterwards to make sure it gets the changes, rather than - # still pointing to the old deleted database. Specify "verbose=true" to - # print model name as it is run. + + # This totally rebuilds the database, so you will want to restart + # any web server afterwards to make sure it gets the changes, + # rather than still pointing to the old deleted database. Specify + # "verbose=true" to print model name as it is run. By default, + # all of the terms, values and texts are reindexed. You can + # suppress any of these by specifying, for example, "texts=false". + # You can specify that only certain terms should be updated by + # specifying their prefix(es) as a string, e.g. "terms=IV" will + # index the two terms I and V (and "terms=false" will index none, + # and "terms=true", the default, will index all) + + desc 'Completely rebuilds Xapian search index (must specify all models)' task :rebuild_index => :environment do raise "specify ALL your models with models=\"ModelName1 ModelName2\" as parameter" if ENV['models'].nil? - ActsAsXapian.rebuild_index(ENV['models'].split(" ").map{|m| m.constantize}, ENV['verbose'] ? true : false) + ActsAsXapian.rebuild_index(ENV['models'].split(" ").map{|m| m.constantize}, + ENV['verbose'] ? true : false, + ENV['terms'] == "false" ? false : ENV['terms'], + ENV['values'] == "false" ? false : ENV['values'], + ENV['texts'] == "false" ? false : true) end # Parameters - are models, query, offset, limit, sort_by_prefix, |