diff options
Diffstat (limited to 'lib')
-rw-r--r-- | lib/attachment_to_html/adapters/could_not_convert.rb | 49 | ||||
-rw-r--r-- | lib/attachment_to_html/adapters/google_docs_viewer.rb | 56 | ||||
-rw-r--r-- | lib/attachment_to_html/adapters/pdf.rb | 108 | ||||
-rw-r--r-- | lib/attachment_to_html/adapters/rtf.rb | 107 | ||||
-rw-r--r-- | lib/attachment_to_html/adapters/text.rb | 61 | ||||
-rw-r--r-- | lib/attachment_to_html/attachment_to_html.rb | 46 | ||||
-rw-r--r-- | lib/attachment_to_html/template.html.erb | 16 | ||||
-rw-r--r-- | lib/attachment_to_html/view.rb | 39 | ||||
-rw-r--r-- | lib/configuration.rb | 1 | ||||
-rw-r--r-- | lib/date_quarter.rb | 22 | ||||
-rw-r--r-- | lib/normalize_string.rb | 12 | ||||
-rw-r--r-- | lib/tasks/stats.rake | 55 |
12 files changed, 565 insertions, 7 deletions
# lib/attachment_to_html/adapters/could_not_convert.rb
module AttachmentToHTML
  module Adapters
    # Fallback adapter: performs no conversion and renders a fixed
    # "unable to convert" notice in place of the attachment contents.
    class CouldNotConvert

      attr_reader :attachment

      # Public: Initialize a CouldNotConvert fallback adapter
      #
      # attachment - the FoiAttachment that could not be converted to HTML
      # opts       - a Hash of options (default: {}):
      #              No options currently accepted
      def initialize(attachment, opts = {})
        @attachment = attachment
      end

      # Public: The title to use in the <title> tag
      #
      # Returns the attachment's display_filename
      def title
        @title ||= attachment.display_filename
      end

      # Public: The contents to place inside the html <body> tag
      #
      # Returns a String (the same memoized object on every call)
      def body
        @body ||= parse_body
      end

      # Public: Was the document conversion successful?
      # As this is a fallback option and not doing anything dynamic
      # we're assuming this is successful whatever the case
      #
      # Returns true
      def success?
        true
      end

      private

      # The fixed notice shown to the user; nothing is read from the
      # attachment itself.
      def parse_body
        "<p>Sorry, we were unable to convert this file to HTML. " \
        "Please use the download link at the top right.</p>"
      end

    end
  end
end
\ No newline at end of file diff --git a/lib/attachment_to_html/adapters/google_docs_viewer.rb b/lib/attachment_to_html/adapters/google_docs_viewer.rb new file mode 100644 index 000000000..991fbb757 --- /dev/null +++ b/lib/attachment_to_html/adapters/google_docs_viewer.rb @@ -0,0 +1,56 @@ +module AttachmentToHTML + module Adapters + # Renders the attachment in a Google Docs Viewer + class GoogleDocsViewer + + attr_reader :attachment, :attachment_url + + # Public: Initialize a GoogleDocsViewer converter + # + # attachment - the FoiAttachment to convert to HTML + # opts - a Hash of options (default: {}): + # :attachment_url - a String url to the attachment for + # Google to render (default: nil) + def initialize(attachment, opts = {}) + @attachment = attachment + @attachment_url = opts.fetch(:attachment_url, nil) + end + + # Public: The title to use in the <title> tag + # + # Returns a String + def title + @title ||= attachment.display_filename + end + + # Public: The contents of the extracted html <body> tag + # + # Returns a String + def body + @body ||= parse_body + end + + # Public: Was the document conversion successful? + # We can't really tell whether the document conversion has been + # successful as such; We're assuming that given a correctly + # constructed iframe (which is tested) that Google will make this + # Just Work. + # + # Returns true + def success? + true + end + + private + + # Builds the iframe markup that embeds the Google Docs Viewer. + # NOTE(review): attachment_url is interpolated into the query string + # as-is - confirm callers pass an already URL-encoded value. + def parse_body + %Q(<iframe src="#{ protocol }://docs.google.com/viewer?url=#{ attachment_url }&embedded=true" width="100%" height="100%" style="border: none;"></iframe>) + end + + def protocol + AlaveteliConfiguration.force_ssl ? 
'https' : 'http' + end + + end + end +end diff --git a/lib/attachment_to_html/adapters/pdf.rb b/lib/attachment_to_html/adapters/pdf.rb new file mode 100644 index 000000000..b91958c52 --- /dev/null +++ b/lib/attachment_to_html/adapters/pdf.rb @@ -0,0 +1,108 @@ +module AttachmentToHTML + module Adapters + # Convert application/pdf documents into HTML + class PDF + # Image count at or above which success? reports failure + TOO_MANY_IMAGES = 51 + + attr_reader :attachment, :tmpdir + + # Public: Initialize a PDF converter + # + # attachment - the FoiAttachment to convert to HTML + # opts - a Hash of options (default: {}): + # :tmpdir - String name of directory to store the + # converted document + # (default: Rails.root.join('tmp')) + def initialize(attachment, opts = {}) + @attachment = attachment + @tmpdir = opts.fetch(:tmpdir, ::Rails.root.join('tmp')) + end + + # Public: The title to use in the <title> tag + # + # Returns a String + def title + @title ||= attachment.display_filename + end + + # Public: The contents of the extracted html <body> tag + # + # Returns a String + def body + @body ||= parse_body + end + + # Public: Was the document conversion successful? + # + # Returns a Boolean + def success? + return false if contains_too_many_images? + has_content? || contains_images? + end + + private + + # Returns the contents of the first <body> element of the converted + # output, or '' when there is none + # NOTE(review): raises if convert returns nil - confirm + # AlaveteliExternalCommand.run always returns a String + def parse_body + match = convert.match(/<body[^>]*>(.*?)<\/body>/mi) + match ? match[1] : '' + end + + # True when body still has text once whitespace and tags are removed + def has_content? + !body.gsub(/\s+/,"").gsub(/\<[^\>]*\>/, "").empty? + end + + # True when body has at least one <img> tag + def contains_images? + body.match(/<img[^>]*>/mi) ? true : false + end + + # Works around https://bugs.freedesktop.org/show_bug.cgi?id=77932 in pdftohtml + def contains_too_many_images? 
+ number_of_images_in_body >= TOO_MANY_IMAGES + end + + def number_of_images_in_body + body.scan(/<img[^>]*>/i).size + end + + def convert + # Get the attachment body outside of the chdir call as getting + # the body may require opening files too + # NOTE(review): cleanup_tempfile below is not inside an ensure, so the + # tempfile is left behind if the command raises - confirm acceptable + text = attachment_body + + @converted ||= Dir.chdir(tmpdir) do + tempfile = create_tempfile(text) + + html = AlaveteliExternalCommand.run("pdftohtml", + "-nodrm", "-zoom", "1.0", "-stdout", "-enc", "UTF-8", + "-noframes", tempfile.path, :timeout => 30 + ) + + cleanup_tempfile(tempfile) + + html + end + end + + # Writes text to a new Tempfile created in the current directory (the + # caller has already chdir'd into tmpdir) and returns it, flushed but + # still open + def create_tempfile(text) + tempfile = if RUBY_VERSION.to_f >= 1.9 + Tempfile.new('foiextract', '.', + :encoding => text.encoding) + else + Tempfile.new('foiextract', '.') + end + tempfile.print(text) + tempfile.flush + tempfile + end + + def cleanup_tempfile(tempfile) + tempfile.close + tempfile.delete + end + + def attachment_body + @attachment_body ||= attachment.body + end + + end + end +end diff --git a/lib/attachment_to_html/adapters/rtf.rb b/lib/attachment_to_html/adapters/rtf.rb new file mode 100644 index 000000000..859c0e541 --- /dev/null +++ b/lib/attachment_to_html/adapters/rtf.rb @@ -0,0 +1,107 @@ +module AttachmentToHTML + module Adapters + # Convert application/rtf documents into HTML + class RTF + + attr_reader :attachment, :tmpdir + + # Public: Initialize a RTF converter + # + # attachment - the FoiAttachment to convert to HTML + # opts - a Hash of options (default: {}): + # :tmpdir - String name of directory to store the + # converted document + # (default: Rails.root.join('tmp')) + def initialize(attachment, opts = {}) + @attachment = attachment + @tmpdir = opts.fetch(:tmpdir, ::Rails.root.join('tmp')) + end + + # Public: The title to use in the <title> tag + # + # Returns a String + def title + @title ||= attachment.display_filename + end + + # Public: The contents of the extracted html <body> tag + # + # Returns a String + def body + @body ||= parse_body + end + + # Public: Was the document conversion successful? 
+ # + # Returns a Boolean + def success? + has_content? || contains_images? + end + + private + + # Returns the contents of the first <body> element of the converted + # output, or '' when there is none + def parse_body + match = convert.match(/<body[^>]*>(.*?)<\/body>/mi) + match ? match[1] : '' + end + + # True when body still has text once whitespace and tags are removed + def has_content? + !body.gsub(/\s+/,"").gsub(/\<[^\>]*\>/, "").empty? + end + + # True when body has at least one <img> tag + def contains_images? + body.match(/<img[^>]*>/mi) ? true : false + end + + def convert + # Get the attachment body outside of the chdir call as getting + # the body may require opening files too + # NOTE(review): cleanup_tempfile below is not inside an ensure, so the + # tempfile is left behind if the command raises - confirm acceptable + text = attachment_body + + @converted ||= Dir.chdir(tmpdir) do + tempfile = create_tempfile(text) + + html = AlaveteliExternalCommand.run("unrtf", "--html", + tempfile.path, :timeout => 120 + ) + + cleanup_tempfile(tempfile) + + sanitize_converted(html) + end + + end + + # Works around http://savannah.gnu.org/bugs/?42015 in unrtf ~> 0.21 + # Swaps the unquoted DOCTYPE emitted by unrtf for a quoted one, in place. + # NOTE(review): in `valid` the closing double quote sits after the '>' + # rather than before it - looks unintended; confirm against the bug report + # NOTE(review): html.include? raises if the command returned nil - confirm + def sanitize_converted(html) + invalid = %Q(<!DOCTYPE html PUBLIC -//W3C//DTD HTML 4.01 Transitional//EN>) + valid = %Q(<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN>") + if html.include?(invalid) + html.sub!(invalid, valid) + end + html + end + + # Writes text to a new Tempfile created in the current directory (the + # caller has already chdir'd into tmpdir) and returns it, flushed but + # still open + def create_tempfile(text) + tempfile = if RUBY_VERSION.to_f >= 1.9 + Tempfile.new('foiextract', '.', + :encoding => text.encoding) + else + Tempfile.new('foiextract', '.') + end + tempfile.print(text) + tempfile.flush + tempfile + end + + def cleanup_tempfile(tempfile) + tempfile.close + tempfile.delete + end + + def attachment_body + @attachment_body ||= attachment.body + end + + end + end +end diff --git a/lib/attachment_to_html/adapters/text.rb b/lib/attachment_to_html/adapters/text.rb new file mode 100644 index 000000000..e99183f0e --- /dev/null +++ b/lib/attachment_to_html/adapters/text.rb @@ -0,0 +1,61 @@ +module AttachmentToHTML + module Adapters + # Convert text/plain documents into HTML + class Text + + attr_reader :attachment + + # Public: Initialize a Text converter + # + # attachment - the FoiAttachment to convert to HTML + # opts - a Hash of options (default: {}): + # No options currently 
accepted + def initialize(attachment, opts = {}) + @attachment = attachment + end + + # Public: The title to use in the <title> tag + # + # Returns a String + def title + @title ||= attachment.display_filename + end + + # Public: The contents of the extracted html <body> tag + # + # Returns a String + def body + @body ||= parse_body + end + + # Public: Was the document conversion successful? + # + # Returns a Boolean + def success? + has_content? || contains_images? + end + + private + + # Strips the attachment body, HTML-escapes it, passes it through + # MySociety::Format.make_clickable and turns newlines into <br> + def convert + text = attachment.body.strip + text = CGI.escapeHTML(text) + text = MySociety::Format.make_clickable(text) + text = text.gsub(/\n/, '<br>') + end + + def parse_body + convert + end + + # True when body still has text once whitespace and tags are removed + def has_content? + !body.gsub(/\s+/,"").gsub(/\<[^\>]*\>/, "").empty? + end + + # True when body has at least one <img> tag + def contains_images? + body.match(/<img[^>]*>/mi) ? true : false + end + + end + end +end diff --git a/lib/attachment_to_html/attachment_to_html.rb b/lib/attachment_to_html/attachment_to_html.rb new file mode 100644 index 000000000..2f7c08264 --- /dev/null +++ b/lib/attachment_to_html/attachment_to_html.rb @@ -0,0 +1,46 @@ +require 'view' + +Dir[File.dirname(__FILE__) + '/adapters/*.rb'].each do |file| + require file +end + +module AttachmentToHTML + extend self + + # Public: Render an attachment as an HTML page. Picks an adapter by + # content type and switches to the fallback adapter when the first one + # reports failure. + # + # attachment - the attachment to convert + # opts - a Hash of options handed to the adapter (default: {}); + # each :content_for pair is injected into the view + # + # Returns the result of rendering the view template + def to_html(attachment, opts = {}) + adapter = adapter_for(attachment).new(attachment, opts) + + unless adapter.success? + adapter = fallback_adapter_for(attachment).new(attachment, opts) + end + + view = View.new(adapter) + view.wrapper = 'wrapper_google_embed' if adapter.is_a?(Adapters::GoogleDocsViewer) + + view.render do + opts.fetch(:content_for, []).each do |k,v| + inject_content(k) { v } + end + end + end + + private + + # Maps the attachment's content_type to an adapter class; any other type + # goes straight to the fallback adapter + def adapter_for(attachment) + case attachment.content_type + when 'text/plain' then Adapters::Text + when 'application/pdf' then Adapters::PDF + when 'application/rtf' then Adapters::RTF + else + fallback_adapter_for(attachment) + end + end + + # GoogleDocsViewer when the attachment reports has_google_docs_viewer?, + # otherwise CouldNotConvert + def fallback_adapter_for(attachment) + if attachment.has_google_docs_viewer? 
+ Adapters::GoogleDocsViewer + else + Adapters::CouldNotConvert + end + end +end diff --git a/lib/attachment_to_html/template.html.erb b/lib/attachment_to_html/template.html.erb new file mode 100644 index 000000000..38286a5f9 --- /dev/null +++ b/lib/attachment_to_html/template.html.erb @@ -0,0 +1,16 @@ +<!DOCTYPE html> +<html> +<head> + <title><%= title %></title> + <%= content_for(:head_suffix) %> +</head> +<body> + <%= content_for(:body_prefix) %> + <div id="<%= wrapper %>"> + <div id="view-html-content"> + <%= body %> + </div> + </div> + <%= content_for(:body_suffix) %> +</body> +</html> diff --git a/lib/attachment_to_html/view.rb b/lib/attachment_to_html/view.rb new file mode 100644 index 000000000..e6991d44e --- /dev/null +++ b/lib/attachment_to_html/view.rb @@ -0,0 +1,39 @@ +module AttachmentToHTML + class View < ERB + + def self.template + @template || "#{ File.dirname(__FILE__) }/template.html.erb" + end + + def self.template=(path) + @template = path + end + + attr_accessor :title, :body, :template, :wrapper + + def initialize(adapter, opts = {}) + self.title = adapter.title + self.body = adapter.body + self.template = opts.fetch(:template, self.class.template) + self.wrapper = opts.fetch(:wrapper, 'wrapper') + super(File.read(template)) + end + + def render(&block) + instance_eval(&block) if block_given? 
+ result(binding) + end + + def content_for(area) + send(area) if respond_to?(area) + end + + private + + def inject_content(area, &block) + instance_variable_set("@#{ area }".to_sym, block.call) + self.class.send(:attr_accessor, area) + end + + end +end diff --git a/lib/configuration.rb b/lib/configuration.rb index bd705b777..d525bf712 100644 --- a/lib/configuration.rb +++ b/lib/configuration.rb @@ -58,6 +58,7 @@ module AlaveteliConfiguration :RECAPTCHA_PUBLIC_KEY => 'x', :REPLY_LATE_AFTER_DAYS => 20, :REPLY_VERY_LATE_AFTER_DAYS => 40, + :RESPONSIVE_STYLING => false, :SITE_NAME => 'Alaveteli', :SKIP_ADMIN_AUTH => false, :SPECIAL_REPLY_VERY_LATE_AFTER_DAYS => 60, diff --git a/lib/date_quarter.rb b/lib/date_quarter.rb new file mode 100644 index 000000000..ac159b420 --- /dev/null +++ b/lib/date_quarter.rb @@ -0,0 +1,22 @@ +module DateQuarter + extend self + + # Returns an Array of [quarter_start, quarter_end] pairs, one per quarter, + # from the quarter containing start_at through the one containing finish_at + def quarters_between(start_at, finish_at) + results = [] + + quarter_start = start_at.beginning_of_quarter + quarter_end = start_at.end_of_quarter + + while quarter_end <= finish_at.end_of_quarter do + # Record the [start, end] pair for the current quarter + results << [quarter_start, quarter_end] + + # Advance to the quarter that starts one second after this one ends + quarter_start = quarter_end + 1.second + quarter_end = quarter_start.end_of_quarter + end + + results + end + +end diff --git a/lib/normalize_string.rb b/lib/normalize_string.rb index f02b18ee0..3b6116970 100644 --- a/lib/normalize_string.rb +++ b/lib/normalize_string.rb @@ -1,4 +1,4 @@ -require 'iconv' unless RUBY_VERSION.to_f >= 1.9 +require 'iconv' unless String.method_defined?(:encode) require 'charlock_holmes' class EncodingNormalizationError < StandardError @@ -23,17 +23,16 @@ def normalize_string_to_utf8(s, suggested_character_encoding=nil) to_try.push guessed_encoding to_try.each do |from_encoding| - if RUBY_VERSION.to_f >= 1.9 + if String.method_defined?(:encode) begin s.force_encoding from_encoding return s.encode('UTF-8') if s.valid_encoding? 
+ rescue ArgumentError, Encoding::UndefinedConversionError # We get this if there are invalid bytes when # interpreted as from_encoding at the point of # the encode('UTF-8'); move on to the next one... end else - to_encoding = 'UTF-8' begin converted = Iconv.conv 'UTF-8', from_encoding, s return converted @@ -45,7 +44,6 @@ def normalize_string_to_utf8(s, suggested_character_encoding=nil) end end raise EncodingNormalizationError, "Couldn't find a valid character encoding for the string" - end def convert_string_to_utf8_or_binary(s, suggested_character_encoding=nil) @@ -69,13 +67,13 @@ def convert_string_to_utf8_or_binary(s, suggested_character_encoding=nil) result = normalize_string_to_utf8 s, suggested_character_encoding rescue EncodingNormalizationError result = s - s.force_encoding 'ASCII-8BIT' if RUBY_VERSION.to_f >= 1.9 + s.force_encoding 'ASCII-8BIT' if String.method_defined?(:encode) end result end def log_text_details(message, text) - if RUBY_VERSION.to_f >= 1.9 + if String.method_defined?(:encode) STDERR.puts "#{message}, we have text: #{text}, of class #{text.class} and encoding #{text.encoding}" else STDERR.puts "#{message}, we have text: #{text}, of class #{text.class}" diff --git a/lib/tasks/stats.rake b/lib/tasks/stats.rake index 38eb15996..f09594529 100644 --- a/lib/tasks/stats.rake +++ b/lib/tasks/stats.rake @@ -97,6 +97,61 @@ namespace :stats do end end + desc <<-DESC +Prints the per-quarter number of created FOI Requests made to each Public Body found by the query. 
+Specify the search query as QUERY='london school' +DESC + task :number_of_requests_created => :environment do + query = ENV['QUERY'] + start_at = PublicBody.minimum(:created_at) + finish_at = PublicBody.maximum(:created_at) + public_bodies = PublicBody.search(query) + quarters = DateQuarter.quarters_between(start_at, finish_at) + + # Header row: 'Body' followed by one 'start~end' column per quarter + headers = ['Body'] + quarters.map { |date_tuple| date_tuple.join('~') } + puts headers.join(",") + + # One comma-joined row per body: its name, then a count per quarter. + # NOTE(review): the quarter range comes from PublicBody created_at values, + # not from the requests being counted - confirm this is intended. + # NOTE(review): quarter[1] is the end_of_quarter instant, so the strict '<' + # leaves out a request stamped exactly at that instant - confirm. + public_bodies.each do |body| + stats = quarters.map do |quarter| + conditions = ['created_at >= ? AND created_at < ?', quarter[0], quarter[1]] + count = body.info_requests.count(:conditions => conditions) + count ? count : 0 + end + + row = [body.name] + stats + puts row.join(",") + end + end + + desc <<-DESC +Prints the per-quarter number of successful FOI Requests made to each Public Body found by the query. +Specify the search query as QUERY='london school' +DESC + task :number_of_requests_successful => :environment do + query = ENV['QUERY'] + start_at = PublicBody.minimum(:created_at) + finish_at = PublicBody.maximum(:created_at) + public_bodies = PublicBody.search(query) + quarters = DateQuarter.quarters_between(start_at, finish_at) + + # Header row: 'Body' followed by one 'start~end' column per quarter + headers = ['Body'] + quarters.map { |date_tuple| date_tuple.join('~') } + puts headers.join(",") + + # Same row layout as the task above, restricted to requests whose + # described_state is 'successful' + public_bodies.each do |body| + stats = quarters.map do |quarter| + conditions = ['created_at >= ? AND created_at < ? AND described_state = ?', + quarter[0], quarter[1], 'successful'] + count = body.info_requests.count(:conditions => conditions) + count ? count : 0 + end + + row = [body.name] + stats + puts row.join(",") + end + end + desc 'Update statistics in the public_bodies table' task :update_public_bodies_stats => :environment do verbose = ENV['VERBOSE'] == '1' |