#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# Copy Norwegian public post journal entries from local ScraperWiki
# sqlite dumps (data/<scraper>.sqlite) into the PostgreSQL table
# "journal" on sqldb.nuug.no, transferring only entries newer than
# those already present.

import psycopg2
import urllib
import urllib2
import json
import string
import sys
import os.path
import time
import sqlalchemy

dbname = "postjournal"
dbtable = "journal"

f = open(os.path.expanduser('~/.nuug-fpj-pwd'), 'r')
password = f.readline().strip()
f.close()

# Define our connection string
conn_string = "host='sqldb.nuug.no' dbname='%s' user='pere' password='%s' sslmode='require'" \
    % (dbname, password)

columns = {
    'agency'         : 'text NOT NULL',
    'docdate'        : 'date',
    'docdesc'        : 'text NOT NULL',
    'doctype'        : 'char(1) NOT NULL',
    'casedesc'       : 'text',
    'casedocseq'     : 'integer NOT NULL',
    'recipient'      : 'text',
    'sender'         : 'text',
    'caseid'         : 'text',
    'caseseqnr'      : 'integer NOT NULL',
    'caseyear'       : 'integer NOT NULL',
    'exemption'      : 'text',
    'journalid'      : 'text',
    'journalseqnr'   : 'integer',
    'journalyear'    : 'integer',
    'recorddate'     : 'date',
    'scraper'        : 'text',
    'scrapedurl'     : 'text',
    'scrapestamputc' : 'timestamp without time zone NOT NULL',
}

def db_connect():
    # Print the connection string we would use to connect:
    # print "Connecting to database\n ->%s" % (conn_string)
    try:
        # Get a connection; if a connection cannot be made an exception
        # will be raised here.
        conn = psycopg2.connect(conn_string)
        # conn.cursor will return a cursor object, you can use this
        # cursor to perform queries.
        cursor = conn.cursor()
        print "Connected!\n"
    except:
        # Get the most recent exception.
        exceptionType, exceptionValue, exceptionTraceback = sys.exc_info()
        # Exit the script and print an error telling what happened.
        sys.exit("Database connection failed!\n ->%s" % (exceptionValue))
    return (conn, cursor)

def sqliteline(lastscrapestamputc, limit):
    # Build the sqlite query fetching up to 'limit' entries scraped at
    # or after the given timestamp, oldest first.
    if lastscrapestamputc is None or '' == lastscrapestamputc:
        lastscrapestamputc = '1970-01-01'
    sql = "select * from 'swdata' where " \
        "datetime(scrapestamputc) >= datetime('%s') " \
        "order by scrapestamputc limit %d" % (lastscrapestamputc, limit)
    # print sql
    return sql

def jsonurl(scraper, sql):
    # Use urllib.quote_plus(), as urllib.urlencode() expects a mapping,
    # not a plain query string.
    urlsql = urllib.quote_plus(sql)
    return "https://api.scraperwiki.com/" + \
        "api/1.0/datastore/sqlite?format=json&name=" + scraper + \
        "&query=%s" % urlsql

def create_table(dbconn, dbcursor):
    print "Remove old %s table" % dbtable
    dbcursor.execute("DROP TABLE IF EXISTS " + dbtable)
    print "Create new table"
    s = map((lambda colname: colname + " " + columns[colname]), columns)
    dbcursor.execute("CREATE TABLE " + dbtable + " (" + string.join(s, ",") + ")")
    dbconn.commit()
    print "Add unique index"
    dbcursor.execute("CREATE UNIQUE INDEX " + dbtable + "_unique " +
                     "ON " + dbtable +
                     " (agency, caseyear, caseseqnr, casedocseq, journalyear, journalseqnr)")
    dbconn.commit()

def insert_entry(dbcursor, entry):
    # Columns that can not be NULL
    mustcols = {
        'agency'  : 1,
        'docdesc' : 1,
        'doctype' : 1,
    }
    uniquecols1 = ['agency', 'caseyear', 'caseseqnr', 'casedocseq']
    uniquecols2 = ['agency', 'journalyear', 'journalseqnr']
    cols = []
    args = []
    for colname in columns.keys():
        if colname in entry:
            if 'casedocseq' == colname and -1 == entry['casedocseq']:
                entry['casedocseq'] = None
            if '' != entry[colname] or colname in mustcols:
                cols.append(colname)
                args.append(entry[colname])
    inssql = "INSERT INTO " + dbtable + " ( " + string.join(cols, ",") + " ) " + \
        "VALUES ( " + string.join(map((lambda colname: "%s"), args), ",") + " )"

    # Make sure replacing works (as INSERT OR REPLACE is not available
    # in PostgreSQL) by removing the old entry if it exists.
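    # Note: the delete-then-insert pair below is not atomic.  On
    # PostgreSQL 9.5 or newer (an assumption; the server this script
    # targeted may have been older) the same effect could be had in one
    # statement along the lines of
    #   INSERT INTO journal (...) VALUES (...)
    #     ON CONFLICT (agency, caseyear, caseseqnr, casedocseq,
    #                  journalyear, journalseqnr) DO UPDATE SET ...
    # The two-statement version is kept for compatibility.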
    if 'casedocseq' in entry and -1 != entry['casedocseq']:
        delsql = "DELETE FROM " + dbtable + " WHERE " + \
            string.join(map((lambda colname: colname + " = %s"), uniquecols1), " and ")
        uniquecols = uniquecols1
    else:
        delsql = "DELETE FROM " + dbtable + " WHERE " + \
            string.join(map((lambda colname: colname + " = %s"), uniquecols2), " and ")
        uniquecols = uniquecols2
    delargs = []
    for colname in uniquecols:
        delargs.append(entry[colname])
    try:
        dbcursor.execute(delsql, delargs)
        dbcursor.execute(inssql, args)
    except:
        exceptionType, exceptionValue, exceptionTraceback = sys.exc_info()
        print entry
        print inssql
        print exceptionType, exceptionValue, exceptionTraceback
        sys.exit(1)
#    except psycopg2.IntegrityError, e:
#    except psycopg2.DataError, e:
#        print entry
#        print sql
#        print e

def populate_from_scraper_real(dbcursor, scraper, limit):
    lastscrapestamputc = ''
    sys.stdout.write(scraper + ": ")
    sys.stdout.flush()
    try:
        # Find the newest entry already transferred for this scraper.
        sql = "SELECT MAX(scrapestamputc) FROM %s WHERE scraper = %%s" % dbtable
        # print sql
        dbcursor.execute(sql, (scraper,))
        res = dbcursor.fetchone()[0]
        if res is not None:
            status = "new"
            lastscrapestamputc = res.isoformat()
            # print "Before: %s, after %s" % (res, lastscrapestamputc)
        else:
            status = "initial"
    except psycopg2.DataError, e:
        print "Failed: " + str(e)
        sys.exit(1)
    sql = sqliteline(lastscrapestamputc, limit)
    if False: # Old ScraperWiki API
        url = jsonurl(scraper, sql)
        # print "Using " + url
        data = json.load(urllib2.urlopen(url))
        try:
            if data['error']:
                print "Error fetching data from " + scraper
                return
        except:
            pass
    else:
        filename = "sqlite:///data/%s.sqlite" % scraper
        create = sqlalchemy.create_engine
        # print "opening %s" % filename
        engine = create(filename, echo=False, connect_args={'timeout': 300})
        connection = engine.connect()
        result = connection.execute(sql)
        data = []
        for row in result:
            data.append(dict(row.items()))
        connection.close()
    sys.stdout.write("Adding/updating " + str(len(data)) + " " + status +
                     " entries (" + lastscrapestamputc + ")")
    sys.stdout.flush()
    skipped = 0
    for entry in data:
        # Handle OEP scraper 2012-06-16
        if not 'caseyear' in entry or entry['caseyear'] is None or \
           not 'caseseqnr' in entry or entry['caseseqnr'] is None:
            if entry['caseid'] is None:
                # print "Strange entry, skipping: ", entry
                skipped = skipped + 1
                continue
            entry['caseyear'], entry['caseseqnr'] = entry['caseid'].split("/")
        entry['scraper'] = scraper
        # print "scrapestamputc: %s" % entry['scrapestamputc']
        entry['scrapestamputc'] = entry['scrapestamputc'] + '+0000'
        # print entry
        # Workaround for postliste-stortinget failing on some PDFs
        if entry['doctype'] == u'Avs./mot:':
            continue
        try:
            insert_entry(dbcursor, entry)
        except:
            print entry
            raise
    print "done"
    return len(data) - skipped

def populate_from_scraper(dbcursor, scraper, limit):
    # Retry once after a short pause if the first attempt returned
    # nothing.
    ret = populate_from_scraper_real(dbcursor, scraper, limit)
    if ret is None:
        time.sleep(10)
        ret = populate_from_scraper_real(dbcursor, scraper, limit)
    return ret
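# Design note: sqliteline() selects entries with scrapestamputc >= the
# newest timestamp already in PostgreSQL, so entries sharing that exact
# timestamp are fetched again on every run.  This is why insert_entry()
# deletes any matching row before inserting: re-transferred entries
# replace, rather than duplicate, the earlier copy.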
def verify_all_data_is_transfered(dbcursor, scraper):
    # Compare the PostgreSQL row count for this scraper with the row
    # count in the source, and warn if they differ.
    sql = "SELECT COUNT(*) FROM %s WHERE scraper = %%s" % dbtable
    dbcursor.execute(sql, (scraper,))
    res = dbcursor.fetchone()[0]
    if res is not None:
        sqlcount = res
    try:
        if False: # Old ScraperWiki API
            url = "https://api.scraperwiki.com/api/1.0/scraper/getinfo?format=jsondict&name=%s&version=-1" % scraper
            jsondata = urllib2.urlopen(url)
            data = json.load(jsondata)
            swcount = data[0]['datasummary']['tables']['swdata']['count']
        else:
            filename = "sqlite:///data/%s.sqlite" % scraper
            create = sqlalchemy.create_engine
            # print "opening %s" % filename
            engine = create(filename, echo=False, connect_args={'timeout': 300})
            connection = engine.connect()
            result = connection.execute("SELECT COUNT(*) FROM 'swdata'")
            if result is not None:
                for row in result:
                    # print row.items()
                    swcount = row.items()[0][1]
            else:
                print "No count from SQL?"
            connection.close()
        # print "swcount = %d" % swcount
        if swcount != sqlcount:
            print "warning: %d records in SQL table do not match %d records in source (diff %d)" \
                % (sqlcount, swcount, swcount - sqlcount)
    except Exception, e:
        print "warning: Unable to verify the data transferred: %s" % str(e)

def main():
    dbconn, dbcursor = db_connect()
    if False:
        create_table(dbconn, dbcursor)
    scrapers = [
        'postliste-oep',
        'postliste-midsund-kommune',
        'postliste-hvaler',
        'postliste-bergen-kommune',
        'postliste-universitetet-i-tromso',
        'postliste-nordreisa-kommune',
        'postliste-lunner-kommune',
        'postliste-mattilsynet',
#        'postliste-arendal', # Missing caseid, casedesc etc.
#        'postliste-lindesnes', # Missing caseid, casedesc etc.
#        'postliste-hvaler', # kommune # parse error
#        'postliste-hole', # Missing casedocseq
        'postliste-lenvik',
        'postlist-ssb',
        'postliste-ballangen', # kommune # Inconsistent dataset before 2006?
        'postliste-difi',
        'postliste-fredrikstad', # kommune
        'postliste-hadsel', # kommune
        'postliste-halden', # kommune
        'postliste-hoegskolen-i-finnmark',
        'postliste-hoegskolen-i-gjoevik',
#        'postliste-hoegskolen-i-hamar',
        'postliste-hoegskolen-i-hedmark', # replaces -i-hamar
        'postliste-hoegskolen-i-buskerud',
        'postliste-hoegskolen-i-lillehammer',
        'postliste-hoegskolen-i-nord-troendelag',
        'postliste-hoegskolen-i-soer-troendelag',
        'postliste-hoegskolen-i-telemark',
        'postliste-hoegskolen-i-vestfold',
        'postliste-hoegskolen-i-volda',
        'postliste-kafjord-kommune',
        'postliste-kristiansund', # kommune
        'postliste-lier', # kommune
        'postliste-loppa-kommune',
        'postliste-luftambulanse',
        'postliste-met',
        'postliste-mrfylke',
        'postliste-naroy', # kommune
        'postliste-nih',
        'postliste-npolar',
        'postliste-ntnu',
        'postliste-oslo-bydel-ullern', # kommune
        'postliste-oslo-gravferdsetaten', # kommune
        'postliste-oslo-havn', # kommune
        'postliste-ruter',
        'postliste-saltdal', # kommune
        'postliste-sivilombudsmannen',
        'postliste-skogoglandskap',
        'postliste-sogne', # kommune
        'postliste-stavanger-universitetssjukehus',
        'postliste-storfjord', # kommune
        'postliste-stortinget',
        'postliste-universitetet-i-agder',
        'postliste-universitetet-i-oslo',
        'postliste-universitetet-i-stavanger',
        'postliste-universitetssykehuset-nord-norge',
    ]
    for scraper in scrapers:
        print
        # print "Moving data from " + scraper
        # Transfer in batches, committing after each one, until a batch
        # comes back less than half full.
        limit = 100000
        while limit/2 < populate_from_scraper(dbcursor, scraper, limit):
            dbconn.commit()
        dbconn.commit()
        verify_all_data_is_transfered(dbcursor, scraper)

if __name__ == '__main__':
    main()
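# A minimal interactive sketch (assumption: run from the same directory,
# with the password file and data/ dumps in place) for transferring a
# single scraper by hand instead of the full list in main():
#
#   dbconn, dbcursor = db_connect()
#   populate_from_scraper(dbcursor, 'postliste-oep', 100000)
#   dbconn.commit()
#   verify_all_data_is_transfered(dbcursor, 'postliste-oep')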