5 files changed, 372 insertions, 0 deletions
diff --git a/lib/attachment_to_html/adapters/could_not_convert.rb b/lib/attachment_to_html/adapters/could_not_convert.rb
new file mode 100644
index 000000000..8e4bf39dc
--- /dev/null
+++ b/lib/attachment_to_html/adapters/could_not_convert.rb
@@ -0,0 +1,49 @@
+module AttachmentToHTML
+    module Adapters
+        class CouldNotConvert
+
+            attr_reader :attachment
+
+            # Public: Initialize the fallback converter
+            #
+            # attachment - the FoiAttachment this fallback stands in for
+            # opts       - a Hash of options (default: {}):
+            #              accepted but currently ignored
+            def initialize(attachment, opts = {})
+                @attachment = attachment
+            end
+
+            # Public: The title to use in the <title> tag, read from the
+            # attachment's display_filename and cached after the first call
+            #
+            # Returns the attachment's display_filename
+            def title
+                @title ||= @attachment.display_filename
+            end
+
+            # Public: The fixed apology markup used as the <body> contents
+            #
+            # Returns a String
+            def body
+                @body ||= parse_body
+            end
+
+            # Public: Was the document conversion successful?
+            # Always true: this fallback performs no conversion that could fail
+            #
+            # Returns true
+            def success?
+                true
+            end
+
+            private
+
+            # Internal: Build the apology paragraph
+            def parse_body
+                apology = "<p>Sorry, we were unable to convert this file to HTML. "
+                apology + "Please use the download link at the top right.</p>"
+            end
+
+        end
+    end
+end
+\ No newline at end of file
diff --git a/lib/attachment_to_html/adapters/google_docs_viewer.rb b/lib/attachment_to_html/adapters/google_docs_viewer.rb
new file mode 100644
index 000000000..991fbb757
--- /dev/null
+++ b/lib/attachment_to_html/adapters/google_docs_viewer.rb
@@ -0,0 +1,56 @@
+module AttachmentToHTML
+    module Adapters
+        # Embeds the attachment in a Google Docs Viewer iframe
+        class GoogleDocsViewer
+
+            attr_reader :attachment, :attachment_url
+
+            # Public: Initialize a GoogleDocsViewer converter
+            #
+            # attachment - the FoiAttachment to display
+            # opts       - a Hash of options (default: {}):
+            #              :attachment_url - the url of the attachment that
+            #                                the viewer is pointed at (default: nil)
+            def initialize(attachment, opts = {})
+                @attachment = attachment
+                @attachment_url = opts.fetch(:attachment_url) { nil }
+            end
+
+            # Public: The title to use in the <title> tag
+            #
+            # Returns the attachment's display_filename
+            def title
+                @title ||= @attachment.display_filename
+            end
+
+            # Public: The viewer iframe markup used as the <body> (a String)
+            def body
+                @body ||= parse_body
+            end
+
+            # Public: Was the document conversion successful?
+            # Always true: nothing is converted here, the iframe defers to Google
+            #
+            # Returns true
+            def success?
+                true
+            end
+
+            private
+
+            # NOTE(review): attachment_url is interpolated without escaping;
+            # presumably callers pass a trusted, encoded url - confirm
+            def parse_body
+                viewer_src = "#{ protocol }://docs.google.com/viewer" \
+                             "?url=#{ attachment_url }&amp;embedded=true"
+                %Q(<iframe src="#{ viewer_src }" width="100%" height="100%" style="border: none;"></iframe>)
+            end
+
+            def protocol
+                return 'https' if AlaveteliConfiguration.force_ssl
+                'http'
+            end
+
+        end
+    end
+end
diff --git a/lib/attachment_to_html/adapters/pdf.rb b/lib/attachment_to_html/adapters/pdf.rb
new file mode 100644
index 000000000..1fca2f201
--- /dev/null
+++ b/lib/attachment_to_html/adapters/pdf.rb
@@ -0,0 +1,97 @@
+module AttachmentToHTML
+    module Adapters
+        # Convert application/pdf documents in to HTML
+        class PDF
+
+            attr_reader :attachment, :tmpdir
+
+            # Public: Initialize a PDF converter
+            #
+            # attachment - the FoiAttachment to convert to HTML
+            # opts       - a Hash of options (default: {}):
+            #              :tmpdir  - String name of directory to store the
+            #                         converted document (default: Rails tmp)
+            def initialize(attachment, opts = {})
+                @attachment = attachment
+                # Block form: Rails.root is only consulted without :tmpdir
+                @tmpdir = opts.fetch(:tmpdir) { ::Rails.root.join('tmp') }
+            end
+
+            # Public: The title to use in the <title> tag
+            #
+            # Returns the attachment's display_filename
+            def title
+                @title ||= attachment.display_filename
+            end
+
+            # Public: The contents of the extracted html <body> tag
+            #
+            # Returns a String ('' when no <body> could be extracted)
+            def body
+                @body ||= parse_body
+            end
+
+            # Public: Was the document conversion successful?
+            #
+            # Returns true when the body has text or at least one <img> tag
+            def success?
+                has_content? || contains_images?
+            end
+
+            private
+
+            # to_s: tolerate a nil result from the external command
+            def parse_body
+                match = convert.to_s.match(/<body[^>]*>(.*?)<\/body>/mi)
+                match ? match[1] : ''
+            end
+
+            def has_content?
+                !body.gsub(/\s+/,"").gsub(/\<[^\>]*\>/, "").empty?
+            end
+
+            def contains_images?
+                body.match(/<img[^>]*>/mi) ? true : false
+            end
+
+            def convert
+                # Get the attachment body outside of the chdir call as getting
+                # the body may require opening files too
+                text = attachment_body
+                @converted ||= Dir.chdir(tmpdir) do
+                    tempfile = create_tempfile(text)
+                    begin
+                        AlaveteliExternalCommand.run("pdftohtml",
+                          "-nodrm", "-zoom", "1.0", "-stdout", "-enc", "UTF-8",
+                          "-noframes", tempfile.path, :timeout => 30)
+                    ensure
+                        # Remove the tempfile even if the conversion raises
+                        cleanup_tempfile(tempfile)
+                    end
+                end
+            end
+
+            def create_tempfile(text)
+                # String#encoding only exists on Ruby >= 1.9
+                tempfile = if RUBY_VERSION.to_f >= 1.9
+                    Tempfile.new('foiextract', '.', :encoding => text.encoding)
+                else
+                    Tempfile.new('foiextract', '.')
+                end
+                tempfile.print(text)
+                tempfile.flush
+                tempfile
+            end
+
+            def cleanup_tempfile(tempfile)
+                tempfile.close
+                tempfile.delete
+            end
+
+            def attachment_body
+                @attachment_body ||= attachment.body
+            end
+
+        end
+    end
+end
diff --git a/lib/attachment_to_html/adapters/rtf.rb b/lib/attachment_to_html/adapters/rtf.rb
new file mode 100644
index 000000000..859c0e541
--- /dev/null
+++ b/lib/attachment_to_html/adapters/rtf.rb
@@ -0,0 +1,107 @@
+module AttachmentToHTML
+    module Adapters
+        # Convert application/rtf documents in to HTML
+        class RTF
+
+            attr_reader :attachment, :tmpdir
+
+            # Public: Initialize a RTF converter
+            #
+            # attachment - the FoiAttachment to convert to HTML
+            # opts       - a Hash of options (default: {}):
+            #              :tmpdir  - String name of directory to store the
+            #                         converted document (default: Rails tmp)
+            def initialize(attachment, opts = {})
+                @attachment = attachment
+                # Block form: Rails.root is only consulted without :tmpdir
+                @tmpdir = opts.fetch(:tmpdir) { ::Rails.root.join('tmp') }
+            end
+
+            # Public: The title to use in the <title> tag
+            #
+            # Returns the attachment's display_filename
+            def title
+                @title ||= attachment.display_filename
+            end
+
+            # Public: The contents of the extracted html <body> tag
+            #
+            # Returns a String ('' when no <body> could be extracted)
+            def body
+                @body ||= parse_body
+            end
+
+            # Public: Was the document conversion successful?
+            #
+            # Returns true when the body has text or at least one <img> tag
+            def success?
+                has_content? || contains_images?
+            end
+
+            private
+
+            # to_s: tolerate a nil result from the conversion step
+            def parse_body
+                match = convert.to_s.match(/<body[^>]*>(.*?)<\/body>/mi)
+                match ? match[1] : ''
+            end
+
+            def has_content?
+                !body.gsub(/\s+/,"").gsub(/\<[^\>]*\>/, "").empty?
+            end
+
+            def contains_images?
+                body.match(/<img[^>]*>/mi) ? true : false
+            end
+
+            def convert
+                # Get the attachment body outside of the chdir call as getting
+                # the body may require opening files too
+                text = attachment_body
+                @converted ||= Dir.chdir(tmpdir) do
+                    tempfile = create_tempfile(text)
+                    begin
+                        html = AlaveteliExternalCommand.run("unrtf", "--html",
+                          tempfile.path, :timeout => 120)
+                        sanitize_converted(html)
+                    ensure
+                        # Remove the tempfile even if the conversion raises
+                        cleanup_tempfile(tempfile)
+                    end
+                end
+            end
+
+            # Works around http://savannah.gnu.org/bugs/?42015 in unrtf ~> 0.21 by
+            # quoting the DOCTYPE public identifier; a nil html becomes ''
+            #
+            # Returns a String
+            def sanitize_converted(html)
+                invalid = %Q(<!DOCTYPE html PUBLIC -//W3C//DTD HTML 4.01 Transitional//EN>)
+                valid   = %Q(<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">)
+                html.to_s.sub(invalid, valid)
+            end
+
+            def create_tempfile(text)
+                # String#encoding only exists on Ruby >= 1.9
+                tempfile = if RUBY_VERSION.to_f >= 1.9
+                    Tempfile.new('foiextract', '.', :encoding => text.encoding)
+                else
+                    Tempfile.new('foiextract', '.')
+                end
+                tempfile.print(text)
+                tempfile.flush
+                tempfile
+            end
+
+            def cleanup_tempfile(tempfile)
+                tempfile.close
+                tempfile.delete
+            end
+
+            def attachment_body
+                @attachment_body ||= attachment.body
+            end
+
+        end
+    end
+end
diff --git a/lib/attachment_to_html/adapters/text.rb b/lib/attachment_to_html/adapters/text.rb
new file mode 100644
index 000000000..b431ada5e
--- /dev/null
+++ b/lib/attachment_to_html/adapters/text.rb
@@ -0,0 +1,63 @@
+require 'nokogiri'
+
+module AttachmentToHTML
+    module Adapters
+        # Convert text/plain documents in to HTML
+        class Text
+
+            attr_reader :attachment
+
+            # Public: Initialize a Text converter
+            #
+            # attachment - the FoiAttachment to convert to HTML
+            # opts       - a Hash of options (default: {}); currently ignored
+            def initialize(attachment, opts = {})
+                @attachment = attachment
+            end
+
+            # Public: The title to use in the <title> tag
+            #
+            # Returns the attachment's display_filename
+            def title
+                @title ||= @attachment.display_filename
+            end
+
+            # Public: The attachment text rendered as an html fragment
+            #
+            # Returns a String
+            def body
+                @body ||= parse_body
+            end
+
+            # Public: Was the document conversion successful?
+            # Returns true when the body has text or an <img> tag
+            def success?
+                return true if has_content?
+                contains_images?
+            end
+
+            private
+
+            # Escape the stripped text, run make_clickable, then swap \n for <br>
+            def convert
+                escaped = CGI.escapeHTML(attachment.body.strip)
+                linked = MySociety::Format.make_clickable(escaped)
+                linked.gsub(/\n/, '<br>')
+            end
+
+            def parse_body
+                convert
+            end
+
+            def has_content?
+                remainder = body.gsub(/\s+/,"").gsub(/\<[^\>]*\>/, "")
+                !remainder.empty?
+            end
+
+            def contains_images?
+                !body.match(/<img[^>]*>/mi).nil?
+            end
+
+        end
+    end
+end