# Handles the parsing of email
require 'tmpdir'

module MailHandler

    require 'mail'
    require 'backends/mail_extensions'
    require 'backends/mail_backend'
    include Backends::MailBackend

    class TNEFParsingError < StandardError
    end

    # Returns a set of attachments from the given TNEF contents
    # The TNEF contents also contains the message body, but in general this is the
    # same as the message body in the message proper.
    def tnef_attachments(content)
        attachments = []
        Dir.mktmpdir do |dir|
            IO.popen("tnef -K -C #{dir} 2> /dev/null", "wb") do |f|
                f.write(content)
                f.close
                if $?.signaled?
                    raise IOError, "tnef exited with signal #{$?.termsig}"
                end
                if $?.exited? && $?.exitstatus != 0
                    raise TNEFParsingError, "tnef exited with status #{$?.exitstatus}"
                end
            end
            found = 0
            Dir.new(dir).sort.each do |file| # sort for deterministic behaviour
                if file != "." && file != ".."
                    file_content = File.open("#{dir}/#{file}", "rb").read
                    attachments << { :content => file_content,
                                     :filename => file }
                    found += 1
                end
            end
            if found == 0
                raise TNEFParsingError, "tnef produced no attachments"
            end
        end
        attachments
    end

    def normalise_content_type(content_type)
        # e.g. http://www.whatdotheyknow.com/request/93/response/250
        if content_type == 'application/excel' or content_type == 'application/msexcel' or content_type == 'application/x-ms-excel'
            content_type = 'application/vnd.ms-excel'
        end
        if content_type == 'application/mspowerpoint' or content_type == 'application/x-ms-powerpoint'
            content_type = 'application/vnd.ms-powerpoint'
        end
        if content_type == 'application/msword' or content_type == 'application/x-ms-word'
            content_type = 'application/vnd.ms-word'
        end
        if content_type == 'application/x-zip-compressed'
            content_type = 'application/zip'
        end

        # e.g. http://www.whatdotheyknow.com/request/copy_of_current_swessex_scr_opt#incoming-9928
        if content_type == 'application/acrobat' or content_type == 'document/pdf'
            content_type = 'application/pdf'
        end

        return content_type
    end

    def get_attachment_text_one_file(content_type, body, charset = 'utf-8')
        # note re. charset: TMail always tries to convert email bodies
        # to UTF8 by default, so normally it should already be that.
        text = ''
        # TODO: - tell all these command line tools to return utf-8
        if content_type == 'text/plain'
            text += body + "\n\n"
        else
            tempfile = Tempfile.new('foiextract')
            tempfile.binmode
            tempfile.print body
            tempfile.flush
            default_params = { :append_to => text,
                               :binary_output => false,
                               :timeout => 1200 }
            if content_type == 'application/vnd.ms-word'
                AlaveteliExternalCommand.run("wvText", tempfile.path, tempfile.path + ".txt",
                                             { :memory_limit => 536870912,  :timeout => 120 } )
                # Try catdoc if we get into trouble (e.g. for InfoRequestEvent 2701)
                if not File.exists?(tempfile.path + ".txt")
                    AlaveteliExternalCommand.run("catdoc", tempfile.path, default_params)
                else
                    text += File.read(tempfile.path + ".txt") + "\n\n"
                    File.unlink(tempfile.path + ".txt")
                end
            elsif content_type == 'application/rtf'
                # catdoc on RTF prodcues less comments and extra bumf than --text option to unrtf
                AlaveteliExternalCommand.run("catdoc", tempfile.path, default_params)
            elsif content_type == 'text/html'
                # lynx wordwraps links in its output, which then don't
                # get formatted properly by Alaveteli. We use elinks
                # instead, which doesn't do that.
                AlaveteliExternalCommand.run("elinks", "-eval", "set document.codepage.assume = \"#{charset}\"",
                                                       "-eval", "set document.codepage.force_assumed = 1",
                                                       "-dump-charset", "utf-8",
                                                       "-force-html", "-dump",
                                                       tempfile.path,
                                                       default_params.merge(:env => {"LANG" => "C"}))
            elsif content_type == 'application/vnd.ms-excel'
                # Bit crazy using /usr/bin/strings - but xls2csv, xlhtml and
                # py_xls2txt only extract text from cells, not from floating
                # notes. catdoc may be fooled by weird character sets, but will
                # probably do for UK FOI requests.
                AlaveteliExternalCommand.run("/usr/bin/strings", tempfile.path, default_params)
            elsif content_type == 'application/vnd.ms-powerpoint'
                # ppthtml seems to catch more text, but only outputs HTML when
                # we want text, so just use catppt for now
                AlaveteliExternalCommand.run("catppt", tempfile.path, default_params)
            elsif content_type == 'application/pdf'
                AlaveteliExternalCommand.run("pdftotext", tempfile.path, "-", default_params)
            elsif content_type == 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
                # This is Microsoft's XML office document format.
                # Just pull out the main XML file, and strip it of text.
                xml = AlaveteliExternalCommand.run("/usr/bin/unzip", "-qq",
                                                                     "-c",
                                                                     tempfile.path,
                                                                     "word/document.xml",
                                                                     {:binary_output => false})
                if !xml.nil?
                    doc = REXML::Document.new(xml)
                    text += doc.each_element( './/text()' ){}.join(" ")
                end
            elsif content_type == 'application/zip'
                # recurse into zip files
                begin
                    zip_file = Zip::ZipFile.open(tempfile.path)
                    text += get_attachment_text_from_zip_file(zip_file)
                    zip_file.close()
                rescue
                    $stderr.puts("Error processing zip file: #{$!.inspect}")
                end
            end
            tempfile.close
        end

        return text
    end
    def get_attachment_text_from_zip_file(zip_file)

        text = ""
        for entry in zip_file
            if entry.file?
                filename = entry.to_s
                begin
                    body = entry.get_input_stream.read
                rescue
                    # move to next attachment silently if there were problems
                    # TODO: really should reduce this to specific exceptions?
                    # e.g. password protected
                    next
                end
                calc_mime = AlaveteliFileTypes.filename_to_mimetype(filename)
                if calc_mime
                    content_type = calc_mime
                else
                    content_type = 'application/octet-stream'
                end

                text += get_attachment_text_one_file(content_type, body)

            end
        end
        return text
    end

    # Turn instance methods into class methods
    extend self

end