From d57ca2a22579df4c634d554989c0ee9e4ebb5165 Mon Sep 17 00:00:00 2001 From: Gareth Rees Date: Mon, 17 Mar 2014 11:15:40 +0000 Subject: Add AttachmentToHTML library Extracts the attachment processing from FoiAttachment#body_to_html AttachmentToHTML contains adapters which convert - text/plain - application/pdf - application/rtf Results are returned as an AttachmentHTML::HTML instance which contains the raw HTML and other metadata about the conversion. --- lib/attachment_to_html/adapters/rtf.rb | 120 +++++++++++++++++++++++++++++++++ 1 file changed, 120 insertions(+) create mode 100644 lib/attachment_to_html/adapters/rtf.rb (limited to 'lib/attachment_to_html/adapters/rtf.rb') diff --git a/lib/attachment_to_html/adapters/rtf.rb b/lib/attachment_to_html/adapters/rtf.rb new file mode 100644 index 000000000..f38e5e381 --- /dev/null +++ b/lib/attachment_to_html/adapters/rtf.rb @@ -0,0 +1,120 @@ +module AttachmentToHTML + module Adapters + # Convert application/rtf documents in to HTML + class RTF + + attr_reader :attachment, :wrapper, :tmpdir + + # Public: Initialize a RTF converter + # + # attachment - the FoiAttachment to convert to HTML + # opts - a Hash of options (default: {}): + # :wrapper - String id of the div that wraps the + # attachment body + # :tmpdir - String name of directory to store the + # converted document + def initialize(attachment, opts = {}) + @attachment = attachment + @wrapper = opts.fetch(:wrapper, 'wrapper') + @tmpdir = opts.fetch(:tmpdir, ::Rails.root.join('tmp')) + end + + # Public: Convert the attachment to HTML + # + # Returns a String + def to_html + @html ||= generate_html + end + + # Public: Was the document conversion successful? + # + # Returns a Boolean + def success? + has_content? || contains_images? + end + + private + + def generate_html + html = "" + html += "" + html += "" + html += "#{ title }" + html += "" + html += "" + html += "
" + html += "
" + html += body + html += "
" + html += "
" + html += "" + html += "" + end + + def title + @title ||= attachment.display_filename + end + + def body + parsed_body + end + + # Parse the output of the converted attachment so that we can pluck + # the parts we need and insert in to our own sensible template + # + # Returns a Nokogiri::HTML::Document + def parsed + @parsed ||= Nokogiri::HTML.parse(convert) + end + + def parsed_body + parsed.css('body').inner_html + end + + # Does the body element have any content, excluding HTML tags? + # + # Returns a Boolean + def has_content? + !parsed.css('body').inner_text.empty? + end + + def contains_images? + parsed.css('body img').any? + end + + def convert + @converted ||= Dir.chdir(tmpdir) do + tempfile = create_tempfile + write_attachment_body_to_tempfile(tempfile) + + html = AlaveteliExternalCommand.run("unrtf", "--html", + tempfile.path, :timeout => 120 + ) + + cleanup_tempfile(tempfile) + + html + end + end + + def create_tempfile + if RUBY_VERSION.to_f >= 1.9 + Tempfile.new('foiextract', '.', :encoding => attachment.body.encoding) + else + Tempfile.new('foiextract', '.') + end + end + + def write_attachment_body_to_tempfile(tempfile) + tempfile.print(attachment.body) + tempfile.flush + end + + def cleanup_tempfile(tempfile) + tempfile.close + tempfile.delete + end + + end + end +end -- cgit v1.2.3 From 82c69083609ad14b127c0037ecc8c4df959654ac Mon Sep 17 00:00:00 2001 From: Gareth Rees Date: Tue, 1 Apr 2014 11:34:30 +0100 Subject: Get attachment body outside of chdir --- lib/attachment_to_html/adapters/rtf.rb | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) (limited to 'lib/attachment_to_html/adapters/rtf.rb') diff --git a/lib/attachment_to_html/adapters/rtf.rb b/lib/attachment_to_html/adapters/rtf.rb index f38e5e381..24987a975 100644 --- a/lib/attachment_to_html/adapters/rtf.rb +++ b/lib/attachment_to_html/adapters/rtf.rb @@ -83,9 +83,12 @@ module AttachmentToHTML end def convert + # Get the attachment body outside of the chdir call as getting + # the body may require opening files too + text = attachment_body + @converted ||= Dir.chdir(tmpdir) do - tempfile = create_tempfile - write_attachment_body_to_tempfile(tempfile) + tempfile = create_tempfile(text) html = AlaveteliExternalCommand.run("unrtf", "--html", tempfile.path, :timeout => 120 @@ -97,17 +100,16 @@ module AttachmentToHTML end end - def create_tempfile - if RUBY_VERSION.to_f >= 1.9 - Tempfile.new('foiextract', '.', :encoding => attachment.body.encoding) - else - Tempfile.new('foiextract', '.') - end - end - - def write_attachment_body_to_tempfile(tempfile) - tempfile.print(attachment.body) + def create_tempfile(text) + tempfile = if RUBY_VERSION.to_f >= 1.9 + Tempfile.new('foiextract', '.', + :encoding => text.encoding) + else + Tempfile.new('foiextract', '.') + end + tempfile.print(text) tempfile.flush + tempfile end def cleanup_tempfile(tempfile) @@ -115,6 +117,10 @@ module AttachmentToHTML tempfile.delete end + def attachment_body + @attachment_body ||= attachment.body + end + end end end -- cgit v1.2.3 From 08572fe8d0ad97c01ecc5c0f0ee39e610de383a3 Mon Sep 17 00:00:00 2001 From: Gareth Rees Date: Tue, 1 Apr 2014 11:59:26 +0100 Subject: Work around a bug in unrtf --- lib/attachment_to_html/adapters/rtf.rb | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'lib/attachment_to_html/adapters/rtf.rb') diff --git a/lib/attachment_to_html/adapters/rtf.rb b/lib/attachment_to_html/adapters/rtf.rb index 24987a975..871ca2c60 100644 --- a/lib/attachment_to_html/adapters/rtf.rb +++ b/lib/attachment_to_html/adapters/rtf.rb @@ -96,8 +96,19 @@ module AttachmentToHTML cleanup_tempfile(tempfile) - html + sanitize_converted(html) end + + end + + # Works around http://savannah.gnu.org/bugs/?42015 in unrtf ~> 0.21 + def sanitize_converted(html) + invalid = %Q() + valid = %Q(") + if html.include?(invalid) + html.sub!(invalid, valid) + end + html end def create_tempfile(text) -- cgit v1.2.3 From 0532eeee63f06e796f0e967f39dfa5f23d4821f7 Mon Sep 17 00:00:00 2001 From: Gareth Rees Date: Wed, 2 Apr 2014 12:11:56 +0100 Subject: Simpler AttachmentToHTML::Adapters::RTF interface TODO: We really should be testing the full output of RTF#body, but we currently want to remain consistent with Adapters::PDF as many methods are shared between the Adapters. A more correct spec might be: expected = %Q(thisisthebody) adapter.body.should == expected --- lib/attachment_to_html/adapters/rtf.rb | 62 +++++++++------------------------- 1 file changed, 16 insertions(+), 46 deletions(-) (limited to 'lib/attachment_to_html/adapters/rtf.rb') diff --git a/lib/attachment_to_html/adapters/rtf.rb b/lib/attachment_to_html/adapters/rtf.rb index 871ca2c60..859c0e541 100644 --- a/lib/attachment_to_html/adapters/rtf.rb +++ b/lib/attachment_to_html/adapters/rtf.rb @@ -3,27 +3,31 @@ module AttachmentToHTML # Convert application/rtf documents in to HTML class RTF - attr_reader :attachment, :wrapper, :tmpdir + attr_reader :attachment, :tmpdir # Public: Initialize a RTF converter # # attachment - the FoiAttachment to convert to HTML # opts - a Hash of options (default: {}): - # :wrapper - String id of the div that wraps the - # attachment body # :tmpdir - String name of directory to store the # converted document def initialize(attachment, opts = {}) @attachment = attachment - @wrapper = opts.fetch(:wrapper, 'wrapper') @tmpdir = opts.fetch(:tmpdir, ::Rails.root.join('tmp')) end - # Public: Convert the attachment to HTML + # Public: The title to use in the tag # # Returns a String - def to_html - @html ||= generate_html + def title + @title ||= attachment.display_filename + end + + # Public: The contents of the extracted html <body> tag + # + # Returns a String + def body + @body ||= parse_body end # Public: Was the document conversion successful? @@ -35,51 +39,17 @@ module AttachmentToHTML private - def generate_html - html = "<!DOCTYPE html>" - html += "<html>" - html += "<head>" - html += "<title>#{ title }" - html += "" - html += "" - html += "
" - html += "
" - html += body - html += "
" - html += "
" - html += "" - html += "" - end - - def title - @title ||= attachment.display_filename + def parse_body + match = convert.match(/]*>(.*?)<\/body>/mi) + match ? match[1] : '' end - def body - parsed_body - end - - # Parse the output of the converted attachment so that we can pluck - # the parts we need and insert in to our own sensible template - # - # Returns a Nokogiri::HTML::Document - def parsed - @parsed ||= Nokogiri::HTML.parse(convert) - end - - def parsed_body - parsed.css('body').inner_html - end - - # Does the body element have any content, excluding HTML tags? - # - # Returns a Boolean def has_content? - !parsed.css('body').inner_text.empty? + !body.gsub(/\s+/,"").gsub(/\<[^\>]*\>/, "").empty? end def contains_images? - parsed.css('body img').any? + body.match(/]*>/mi) ? true : false end def convert -- cgit v1.2.3