Simpler AttachmentToHTML::Adapters::PDF interface

TODO: We really should be testing the full output of PDF#body, but inconsistencies between versions of pdftohtml prevent any sensible means of doing this. For example, adapter.body.should == %Q(\n<A name=1></a>thisisthebody<br/>\n<hr>\n) fails because some versions (correctly!) use lower case tag names.
author: Gareth Rees <gareth@mysociety.org> 2014-04-02 12:11:32 +0100
committer: Gareth Rees <gareth@mysociety.org> 2014-04-07 17:09:30 +0100
commit: e7d0f9a8b350ffe3c17451d6bb18051c7230ca61 (patch)
tree: 03f5ad3335ed677e49135d6617ccd7fc09d6877a
parent: 44eff43ee8024a03fe4c327638ac0dbc1b47f4fd (diff)
2 files changed, 37 insertions, 102 deletions
diff --git a/lib/attachment_to_html/adapters/pdf.rb b/lib/attachment_to_html/adapters/pdf.rb
index cc1bf06bc..1fca2f201 100644
--- a/lib/attachment_to_html/adapters/pdf.rb
+++ b/lib/attachment_to_html/adapters/pdf.rb
@@ -3,27 +3,31 @@ module AttachmentToHTML
         # Convert application/pdf documents in to HTML
         class PDF
 
-            attr_reader :attachment, :wrapper, :tmpdir
+            attr_reader :attachment, :tmpdir
 
             # Public: Initialize a PDF converter
             #
             # attachment - the FoiAttachment to convert to HTML
             # opts       - a Hash of options (default: {}):
-            #              :wrapper - String id of the div that wraps the
-            #                         attachment body
             #              :tmpdir  - String name of directory to store the
             #                         converted document
             def initialize(attachment, opts = {})
                 @attachment = attachment
-                @wrapper = opts.fetch(:wrapper, 'wrapper')
                 @tmpdir = opts.fetch(:tmpdir, ::Rails.root.join('tmp'))
             end
 
-            # Public: Convert the attachment to HTML
+            # Public: The title to use in the <title> tag
             #
             # Returns a String
-            def to_html
-                @html ||= generate_html
+            def title
+                @title ||= attachment.display_filename
+            end
+
+            # Public: The contents of the extracted html <body> tag
+            #
+            # Returns a String
+            def body
+                @body ||= parse_body
             end
 
             # Public: Was the document conversion successful?
@@ -35,51 +39,17 @@ module AttachmentToHTML
 
             private
 
-            def generate_html
-                html =  "<!DOCTYPE html>"
-                html += "<html>"
-                html += "<head>"
-                html += "<title>#{ title }</title>"
-                html += "</head>"
-                html += "<body>"
-                html += "<div id=\"#{ wrapper }\">"
-                html += "<div id=\"view-html-content\">"
-                html += body
-                html += "</div>"
-                html += "</div>"
-                html += "</body>"
-                html += "</html>"
-            end
-
-            def title
-                @title ||= attachment.display_filename
+            def parse_body
+                match = convert.match(/<body[^>]*>(.*?)<\/body>/mi)
+                match ? match[1] : ''
             end
 
-            def body
-                parsed_body
-            end
-
-            # Parse the output of the converted attachment so that we can pluck
-            # the parts we need and insert in to our own sensible template
-            #
-            # Returns a Nokogiri::HTML::Document
-            def parsed
-                @parsed ||= Nokogiri::HTML.parse(convert)
-            end
-
-            def parsed_body
-                parsed.css('body').inner_html
-            end
-
-            # Does the body element have any content, excluding HTML tags?
-            #
-            # Returns a Boolean
             def has_content?
-                !parsed.css('body').inner_text.empty?
+                !body.gsub(/\s+/,"").gsub(/\<[^\>]*\>/, "").empty?
             end
 
             def contains_images?
-                parsed.css('body img').any?
+                body.match(/<img[^>]*>/mi) ? true : false
             end
 
             def convert
diff --git a/spec/lib/attachment_to_html/adapters/pdf_spec.rb b/spec/lib/attachment_to_html/adapters/pdf_spec.rb
index 65c376043..c02b157e4 100644
--- a/spec/lib/attachment_to_html/adapters/pdf_spec.rb
+++ b/spec/lib/attachment_to_html/adapters/pdf_spec.rb
@@ -3,94 +3,59 @@ require File.expand_path(File.dirname(__FILE__) + '/../../../spec_helper')
 describe AttachmentToHTML::Adapters::PDF do
 
     let(:attachment) { FactoryGirl.build(:pdf_attachment) }
-    let(:pdf_adapter) { AttachmentToHTML::Adapters::PDF.new(attachment) }
-
-    describe :wrapper do
-
-        it 'defaults to wrapper' do
-           pdf_adapter.wrapper.should == 'wrapper'
-        end
-
-        it 'accepts a wrapper option' do
-            pdf_adapter = AttachmentToHTML::Adapters::PDF.new(attachment, :wrapper => 'wrap')
-            pdf_adapter.wrapper.should == 'wrap'
-        end
- 
-    end
+    let(:adapter) { AttachmentToHTML::Adapters::PDF.new(attachment) }
 
     describe :tmpdir do
 
         it 'defaults to the rails tmp directory' do
-           pdf_adapter.tmpdir.should == Rails.root.join('tmp') 
+           adapter.tmpdir.should == Rails.root.join('tmp')
         end
 
         it 'allows a tmpdir to be specified to store the converted document' do
-            pdf_adapter = AttachmentToHTML::Adapters::PDF.new(attachment, :tmpdir => '/tmp')
-            pdf_adapter.tmpdir.should == '/tmp'
+            adapter = AttachmentToHTML::Adapters::PDF.new(attachment, :tmpdir => '/tmp')
+            adapter.tmpdir.should == '/tmp'
         end
   
     end
 
-    describe :to_html do
+    describe :title do
 
-        it 'should be a valid html document' do
-            parsed = Nokogiri::HTML.parse(pdf_adapter.to_html) do |config|
-               config.strict
-            end
-            parsed.errors.any?.should be_false
-        end
-
-        it 'contains the attachment filename in the title tag' do
-            parsed = Nokogiri::HTML.parse(pdf_adapter.to_html) do |config|
-               config.strict
-            end
-            parsed.css('title').inner_html.should == attachment.display_filename
+        it 'uses the attachment filename for the title' do
+            adapter.title.should == attachment.display_filename
         end
+ 
+    end
 
-        it 'contains the wrapper div in the body tag' do
-            pdf_adapter = AttachmentToHTML::Adapters::PDF.new(attachment, :wrapper => 'wrap')
-            parsed = Nokogiri::HTML.parse(pdf_adapter.to_html) do |config|
-               config.strict
-            end
-            parsed.css('body div').first.attributes['id'].value.should == 'wrap'
-        end
+    describe :body do
 
-        it 'contains the attachment body in the wrapper div' do
-            pdf_adapter = AttachmentToHTML::Adapters::PDF.new(attachment, :wrapper => 'wrap')
-            parsed = Nokogiri::HTML.parse(pdf_adapter.to_html) do |config|
-               config.strict
-            end
-            parsed.css('div#wrap div#view-html-content').inner_html.should include('thisisthebody')
+        it 'extracts the body from the document' do
+            adapter.body.should include('thisisthebody')
         end
 
         it 'operates in the context of the supplied tmpdir' do
-            pdf_adapter = AttachmentToHTML::Adapters::PDF.new(attachment, :tmpdir => '/tmp')
+            adapter = AttachmentToHTML::Adapters::PDF.new(attachment, :tmpdir => '/tmp')
             Dir.should_receive(:chdir).with('/tmp').and_call_original
-            pdf_adapter.to_html
+            adapter.body
         end
 
     end
 
+
     describe :success? do
 
         it 'is successful if the body has content excluding the tags' do
-            pdf_adapter.to_html
-            pdf_adapter.success?.should be_true
+            adapter.stub(:body).and_return('<p>some content</p>')
+            adapter.success?.should be_true
         end
 
         it 'is successful if the body contains images' do
-            mocked_return = %Q(<!DOCTYPE html><html><head></head><body><img src="logo.png" /></body></html>)
-            pdf_adapter = AttachmentToHTML::Adapters::PDF.new(attachment)
-            pdf_adapter.stub(:to_html).and_return(mocked_return)
-            pdf_adapter.success?.should be_true
+            adapter.stub(:body).and_return(%Q(<img src="logo.png" />))
+            adapter.success?.should be_true
         end
 
         it 'is not successful if the body has no content other than tags' do
-            # TODO: Add and use spec/fixtures/files/empty.pdf
-            attachment = FactoryGirl.build(:body_text, :body => '')
-            pdf_adapter = AttachmentToHTML::Adapters::PDF.new(attachment)
-            pdf_adapter.to_html
-            pdf_adapter.success?.should be_false
+            adapter.stub(:body).and_return('<p></p>')
+            adapter.success?.should be_false
         end
 
     end
author	Gareth Rees <gareth@mysociety.org>	2014-04-02 12:11:32 +0100
committer	Gareth Rees <gareth@mysociety.org>	2014-04-07 17:09:30 +0100
commit	e7d0f9a8b350ffe3c17451d6bb18051c7230ca61 (patch)
tree	03f5ad3335ed677e49135d6617ccd7fc09d6877a
parent	44eff43ee8024a03fe4c327638ac0dbc1b47f4fd (diff)