#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# Copy Norwegian public post journal entries from local ScraperWiki
# sqlite dumps (data/<scraper>.sqlite) into the PostgreSQL table
# "journal" on sqldb.nuug.no, transferring only entries newer than
# those already present.

import psycopg2
import urllib
import urllib2
import json
import string
import sys
import os.path
import time
import sqlalchemy

dbname = "postjournal"
dbtable = "journal"

f = open(os.path.expanduser('~/.nuug-fpj-pwd'), 'r')
password = f.readline().strip()
f.close()

# Define our connection string
conn_string = "host='sqldb.nuug.no' dbname='%s' user='pere' password='%s' sslmode='require'" \
    % (dbname, password)

columns = {
    'agency'         : 'text NOT NULL',
    'docdate'        : 'date',
    'docdesc'        : 'text NOT NULL',
    'doctype'        : 'char(1) NOT NULL',
    'casedesc'       : 'text',
    'casedocseq'     : 'integer NOT NULL',
    'recipient'      : 'text',
    'sender'         : 'text',
    'caseid'         : 'text',
    'caseseqnr'      : 'integer NOT NULL',
    'caseyear'       : 'integer NOT NULL',
    'exemption'      : 'text',
    'journalid'      : 'text',
    'journalseqnr'   : 'integer',
    'journalyear'    : 'integer',
    'recorddate'     : 'date',
    'scraper'        : 'text',
    'scrapedurl'     : 'text',
    'scrapestamputc' : 'timestamp without time zone NOT NULL',
}

def db_connect():
    # Print the connection string we would use to connect:
    # print "Connecting to database\n ->%s" % (conn_string)
    try:
        # Get a connection; if a connection cannot be made an exception
        # will be raised here.
        conn = psycopg2.connect(conn_string)
        # conn.cursor will return a cursor object, you can use this
        # cursor to perform queries.
        cursor = conn.cursor()
        print "Connected!\n"
    except:
        # Get the most recent exception.
        exceptionType, exceptionValue, exceptionTraceback = sys.exc_info()
        # Exit the script and print an error telling what happened.
        sys.exit("Database connection failed!\n ->%s" % (exceptionValue))
    return (conn, cursor)

def sqliteline(lastscrapestamputc, limit):
    # Build the sqlite query fetching up to 'limit' entries scraped at
    # or after the given timestamp, oldest first.
    if lastscrapestamputc is None or '' == lastscrapestamputc:
        lastscrapestamputc = '1970-01-01'
    sql = "select * from 'swdata' where " \
        "datetime(scrapestamputc) >= datetime('%s') " \
        "order by scrapestamputc limit %d" % (lastscrapestamputc, limit)
    # print sql
    return sql

def jsonurl(scraper, sql):
    # Use urllib.quote_plus(), as urllib.urlencode() expects a mapping,
    # not a plain query string.
    urlsql = urllib.quote_plus(sql)
    return "https://api.scraperwiki.com/" + \
        "api/1.0/datastore/sqlite?format=json&name=" + scraper + \
        "&query=%s" % urlsql

def create_table(dbconn, dbcursor):
    print "Remove old %s table" % dbtable
    dbcursor.execute("DROP TABLE IF EXISTS " + dbtable)
    print "Create new table"
    s = map((lambda colname: colname + " " + columns[colname]), columns)
    dbcursor.execute("CREATE TABLE " + dbtable + " (" + string.join(s, ",") + ")")
    dbconn.commit()
    print "Add unique index"
    dbcursor.execute("CREATE UNIQUE INDEX " + dbtable + "_unique " +
                     "ON " + dbtable +
                     " (agency, caseyear, caseseqnr, casedocseq, journalyear, journalseqnr)")
    dbconn.commit()

def insert_entry(dbcursor, entry):
    # Columns that can not be NULL
    mustcols = {
        'agency'  : 1,
        'docdesc' : 1,
        'doctype' : 1,
    }
    uniquecols1 = ['agency', 'caseyear', 'caseseqnr', 'casedocseq']
    uniquecols2 = ['agency', 'journalyear', 'journalseqnr']
    cols = []
    args = []
    for colname in columns.keys():
        if colname in entry:
            if 'casedocseq' == colname and -1 == entry['casedocseq']:
                entry['casedocseq'] = None
            if '' != entry[colname] or colname in mustcols:
                cols.append(colname)
                args.append(entry[colname])
    inssql = "INSERT INTO " + dbtable + " ( " + string.join(cols, ",") + " ) " + \
        "VALUES ( " + string.join(map((lambda colname: "%s"), args), ",") + " )"

    # Make sure replacing works (as INSERT OR REPLACE is not available
    # in PostgreSQL) by removing the old entry if it exists.
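    # Note: the delete-then-insert pair below is not atomic.  On
    # PostgreSQL 9.5 or newer (an assumption; the server this script
    # targeted may have been older) the same effect could be had in one
    # statement along the lines of
    #   INSERT INTO journal (...) VALUES (...)
    #     ON CONFLICT (agency, caseyear, caseseqnr, casedocseq,
    #                  journalyear, journalseqnr) DO UPDATE SET ...
    # The two-statement version is kept for compatibility.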
    if 'casedocseq' in entry and -1 != entry['casedocseq']:
        delsql = "DELETE FROM " + dbtable + " WHERE " + \
            string.join(map((lambda colname: colname + " = %s"), uniquecols1), " and ")
        uniquecols = uniquecols1
    else:
        delsql = "DELETE FROM " + dbtable + " WHERE " + \
            string.join(map((lambda colname: colname + " = %s"), uniquecols2), " and ")
        uniquecols = uniquecols2
    delargs = []
    for colname in uniquecols:
        delargs.append(entry[colname])
    try:
        dbcursor.execute(delsql, delargs)
        dbcursor.execute(inssql, args)
    except:
        exceptionType, exceptionValue, exceptionTraceback = sys.exc_info()
        print entry
        print inssql
        print exceptionType, exceptionValue, exceptionTraceback
        sys.exit(1)
#    except psycopg2.IntegrityError, e:
#    except psycopg2.DataError, e:
#        print entry
#        print sql
#        print e

def populate_from_scraper_real(dbcursor, scraper, limit):
    lastscrapestamputc = ''
    sys.stdout.write(scraper + ": ")
    sys.stdout.flush()
    try:
        # Find the newest entry already transferred for this scraper.
        sql = "SELECT MAX(scrapestamputc) FROM %s WHERE scraper = %%s" % dbtable
        # print sql
        dbcursor.execute(sql, (scraper,))
        res = dbcursor.fetchone()[0]
        if res is not None:
            status = "new"
            lastscrapestamputc = res.isoformat()
            # print "Before: %s, after %s" % (res, lastscrapestamputc)
        else:
            status = "initial"
    except psycopg2.DataError, e:
        print "Failed: " + str(e)
        sys.exit(1)
    sql = sqliteline(lastscrapestamputc, limit)
    if False: # Old ScraperWiki API
        url = jsonurl(scraper, sql)
        # print "Using " + url
        data = json.load(urllib2.urlopen(url))
        try:
            if data['error']:
                print "Error fetching data from " + scraper
                return
        except:
            pass
    else:
        filename = "sqlite:///data/%s.sqlite" % scraper
        create = sqlalchemy.create_engine
        # print "opening %s" % filename
        engine = create(filename, echo=False, connect_args={'timeout': 300})
        connection = engine.connect()
        result = connection.execute(sql)
        data = []
        for row in result:
            data.append(dict(row.items()))
        connection.close()
    sys.stdout.write("Adding/updating " + str(len(data)) + " " + status +
                     " entries (" + lastscrapestamputc + ")")
    sys.stdout.flush()
    skipped = 0
    for entry in data:
        # Handle OEP scraper 2012-06-16
        if not 'caseyear' in entry or entry['caseyear'] is None or \
           not 'caseseqnr' in entry or entry['caseseqnr'] is None:
            if entry['caseid'] is None:
                # print "Strange entry, skipping: ", entry
                skipped = skipped + 1
                continue
            entry['caseyear'], entry['caseseqnr'] = entry['caseid'].split("/")
        entry['scraper'] = scraper
        # print "scrapestamputc: %s" % entry['scrapestamputc']
        entry['scrapestamputc'] = entry['scrapestamputc'] + '+0000'
        # print entry
        # Workaround for postliste-stortinget failing on some PDFs
        if entry['doctype'] == u'Avs./mot:':
            continue
        try:
            insert_entry(dbcursor, entry)
        except:
            print entry
            raise
    print "done"
    return len(data) - skipped

def populate_from_scraper(dbcursor, scraper, limit):
    # Retry once after a short pause if the first attempt returned
    # nothing.
    ret = populate_from_scraper_real(dbcursor, scraper, limit)
    if ret is None:
        time.sleep(10)
        ret = populate_from_scraper_real(dbcursor, scraper, limit)
    return ret
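# Design note: sqliteline() selects entries with scrapestamputc >= the
# newest timestamp already in PostgreSQL, so entries sharing that exact
# timestamp are fetched again on every run.  This is why insert_entry()
# deletes any matching row before inserting: re-transferred entries
# replace, rather than duplicate, the earlier copy.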
def verify_all_data_is_transfered(dbcursor, scraper):
    # Compare the PostgreSQL row count for this scraper with the row
    # count in the source, and warn if they differ.
    sql = "SELECT COUNT(*) FROM %s WHERE scraper = %%s" % dbtable
    dbcursor.execute(sql, (scraper,))
    res = dbcursor.fetchone()[0]
    if res is not None:
        sqlcount = res
    try:
        if False: # Old ScraperWiki API
            url = "https://api.scraperwiki.com/api/1.0/scraper/getinfo?format=jsondict&name=%s&version=-1" % scraper
            jsondata = urllib2.urlopen(url)
            data = json.load(jsondata)
            swcount = data[0]['datasummary']['tables']['swdata']['count']
        else:
            filename = "sqlite:///data/%s.sqlite" % scraper
            create = sqlalchemy.create_engine
            # print "opening %s" % filename
            engine = create(filename, echo=False, connect_args={'timeout': 300})
            connection = engine.connect()
            result = connection.execute("SELECT COUNT(*) FROM 'swdata'")
            if result is not None:
                for row in result:
                    # print row.items()
                    swcount = row.items()[0][1]
            else:
                print "No count from SQL?"
            connection.close()
        # print "swcount = %d" % swcount
        if swcount != sqlcount:
            print "warning: %d records in SQL table do not match %d records in source (diff %d)" \
                % (sqlcount, swcount, swcount - sqlcount)
    except Exception, e:
        print "warning: Unable to verify the data transferred: %s" % str(e)

def main():
    dbconn, dbcursor = db_connect()
    if False:
        create_table(dbconn, dbcursor)
    scrapers = [
        'postliste-oep',
        'postliste-midsund-kommune',
        'postliste-hvaler',
        'postliste-bergen-kommune',
        'postliste-universitetet-i-tromso',
        'postliste-nordreisa-kommune',
        'postliste-lunner-kommune',
        'postliste-mattilsynet',
#        'postliste-arendal', # Missing caseid, casedesc etc.
#        'postliste-lindesnes', # Missing caseid, casedesc etc.
#        'postliste-hvaler', # kommune # parse error
#        'postliste-hole', # Missing casedocseq
        'postliste-lenvik',
        'postlist-ssb',
        'postliste-ballangen', # kommune # Inconsistent dataset before 2006?
        'postliste-difi',
        'postliste-fredrikstad', # kommune
        'postliste-hadsel', # kommune
        'postliste-halden', # kommune
        'postliste-hoegskolen-i-finnmark',
        'postliste-hoegskolen-i-gjoevik',
#        'postliste-hoegskolen-i-hamar',
        'postliste-hoegskolen-i-hedmark', # replaces -i-hamar
        'postliste-hoegskolen-i-buskerud',
        'postliste-hoegskolen-i-lillehammer',
        'postliste-hoegskolen-i-nord-troendelag',
        'postliste-hoegskolen-i-soer-troendelag',
        'postliste-hoegskolen-i-telemark',
        'postliste-hoegskolen-i-vestfold',
        'postliste-hoegskolen-i-volda',
        'postliste-kafjord-kommune',
        'postliste-kristiansund', # kommune
        'postliste-lier', # kommune
        'postliste-loppa-kommune',
        'postliste-luftambulanse',
        'postliste-met',
        'postliste-mrfylke',
        'postliste-naroy', # kommune
        'postliste-nih',
        'postliste-npolar',
        'postliste-ntnu',
        'postliste-oslo-bydel-ullern', # kommune
        'postliste-oslo-gravferdsetaten', # kommune
        'postliste-oslo-havn', # kommune
        'postliste-ruter',
        'postliste-saltdal', # kommune
        'postliste-sivilombudsmannen',
        'postliste-skogoglandskap',
        'postliste-sogne', # kommune
        'postliste-stavanger-universitetssjukehus',
        'postliste-storfjord', # kommune
        'postliste-stortinget',
        'postliste-universitetet-i-agder',
        'postliste-universitetet-i-oslo',
        'postliste-universitetet-i-stavanger',
        'postliste-universitetssykehuset-nord-norge',
    ]
    for scraper in scrapers:
        print
        # print "Moving data from " + scraper
        # Transfer in batches, committing after each one, until a batch
        # comes back less than half full.
        limit = 100000
        while limit/2 < populate_from_scraper(dbcursor, scraper, limit):
            dbconn.commit()
        dbconn.commit()
        verify_all_data_is_transfered(dbcursor, scraper)

if __name__ == '__main__':
    main()
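# A minimal interactive sketch (assumption: run from the same directory,
# with the password file and data/ dumps in place) for transferring a
# single scraper by hand instead of the full list in main():
#
#   dbconn, dbcursor = db_connect()
#   populate_from_scraper(dbcursor, 'postliste-oep', 100000)
#   dbconn.commit()
#   verify_all_data_is_transfered(dbcursor, 'postliste-oep')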