#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# Copy postal journal entries from the ScraperWiki scrapers listed in
# main() into the PostgreSQL journal table on sqldb.nuug.no.

import psycopg2
import urllib2
import json
import string
import sys
import os.path

dbname = "postjournal"
dbtable = "journal"

# The database password is read from a file in the home directory.
f = open(os.path.expanduser('~/.nuug-fpj-pwd'), 'r')
password = f.readline().strip()
f.close()

# Define our connection string
conn_string = "host='sqldb.nuug.no' dbname='%s' user='pere' password='%s' sslmode='require'" % (dbname, password)

# Columns in the journal table and their PostgreSQL types
columns = {
    'agency'         : 'text NOT NULL',
    'docdate'        : 'date',
    'docdesc'        : 'text NOT NULL',
    'doctype'        : 'char(1) NOT NULL',
    'casedesc'       : 'text',
    'casedocseq'     : 'integer NOT NULL',
    'recipient'      : 'text',
    'sender'         : 'text',
    'caseid'         : 'text',
    'caseseqnr'      : 'integer NOT NULL',
    'caseyear'       : 'integer NOT NULL',
    'exemption'      : 'text',
    'journalid'      : 'text',
    'journalseqnr'   : 'integer',
    'journalyear'    : 'integer',
    'recorddate'     : 'date',
    'scraper'        : 'text',
    'scrapedurl'     : 'text',
    'scrapestamputc' : 'timestamp without time zone NOT NULL',
    }

# Connect to the database, printing the connection string we use
def db_connect():
    print "Connecting to database\n ->%s" % (conn_string)
    try:
        # Get a connection; if a connection cannot be made an
        # exception will be raised here.
        conn = psycopg2.connect(conn_string)
        # conn.cursor will return a cursor object we can use to
        # perform queries.
        cursor = conn.cursor()
        print "Connected!\n"
    except:
        # Get the most recent exception
        exceptionType, exceptionValue, exceptionTraceback = sys.exc_info()
        # Exit the script and print an error telling what happened.
        sys.exit("Database connection failed!\n ->%s" % (exceptionValue))
    return (conn, cursor)

# Build the ScraperWiki API URL fetching entries newer than the given
# scrape timestamp.
def jsonurl(scraper, lastscrapestamputc):
    limit = 10000
    limit = 350 # Lower batch size currently in use, overriding the value above
    return "https://api.scraperwiki.com/"+\
        "api/1.0/datastore/sqlite?format=json&name=" + scraper +\
        "&query=select+*+from+%60swdata%60+where+scrapestamputc+>+"+\
        "'%s'+order+by+scrapestamputc+limit+%d" % (lastscrapestamputc, limit)

def create_table(dbconn, dbcursor):
    print "Remove old %s table" % dbtable
    dbcursor.execute("DROP TABLE IF EXISTS " + dbtable)
    print "Create new table"
    s = map((lambda colname: colname + " " + columns[colname]), columns)
    dbcursor.execute("CREATE TABLE " + dbtable + " (" + string.join(s, ",") + ")")
    dbconn.commit()
    print "Add unique index"
    dbcursor.execute("CREATE UNIQUE INDEX " + dbtable + "_unique " + \
                     "ON " + dbtable + " (agency, caseyear, caseseqnr, " + \
                     "casedocseq, journalyear, journalseqnr)")
    dbconn.commit()

def insert_entry(dbcursor, entry):
    # Columns that can not be NULL
    mustcols = {
        'agency'  : 1,
        'docdesc' : 1,
        'doctype' : 1,
        }
    uniquecols1 = ['agency', 'caseyear', 'caseseqnr', 'casedocseq']
    uniquecols2 = ['agency', 'journalyear', 'journalseqnr']
    cols = []
    args = []
    for colname in columns.keys():
        if colname in entry:
            if 'casedocseq' == colname and -1 == entry['casedocseq']:
                entry['casedocseq'] = None
            if '' != entry[colname] or colname in mustcols:
                cols.append(colname)
                args.append(entry[colname])
    inssql = "INSERT INTO " + dbtable + " ( " + string.join(cols, ",") + " ) " + \
        "VALUES ( " + \
        string.join(map((lambda arg: "%s"), args), ",") + " )"

    # Make sure replacing works (as INSERT OR REPLACE is not available
    # in PostgreSQL) by removing the old entry if it exists.
    if 'casedocseq' in entry and -1 != entry['casedocseq']:
        delsql = "DELETE FROM " + dbtable + " WHERE " + \
            string.join(map((lambda colname: colname + " = %s"), uniquecols1),
                        " and ")
        uniquecols = uniquecols1
    else:
        delsql = "DELETE FROM " + dbtable + " WHERE " + \
            string.join(map((lambda colname: colname + " = %s"), uniquecols2),
                        " and ")
        uniquecols = uniquecols2
    delargs = []
    for colname in uniquecols:
        delargs.append(entry[colname])
    try:
        dbcursor.execute(delsql, delargs)
        dbcursor.execute(inssql, args)
    except:
        exceptionType, exceptionValue, exceptionTraceback = sys.exc_info()
        print entry
        print inssql
        print exceptionType, exceptionValue, exceptionTraceback
        sys.exit(1)
#    except psycopg2.IntegrityError, e:
#    except psycopg2.DataError, e:
#        print entry
#        print sql
#        print e

# Fetch entries for the given scraper from ScraperWiki and insert them
# into the database.  Return the number of entries received.
def populate_from_scraper(dbcursor, scraper):
    lastscrapestamputc = ''
    try:
        # Find the newest entry already copied from this scraper, so
        # only entries added since then are fetched.
        sql = "SELECT MAX(scrapestamputc) FROM " + dbtable + \
            " WHERE scraper = %s"
#        print sql
        dbcursor.execute(sql, (scraper,))
        res = dbcursor.fetchone()[0]
        if res is not None:
            status = "new"
            lastscrapestamputc = \
                res.isoformat().replace("+02:00", "+00:00").replace(" ", "T")
        else:
            status = "initial"
    except psycopg2.DataError, e:
        print "Failed: " + str(e)
        sys.exit(1)
    url = jsonurl(scraper, lastscrapestamputc)
#    print "Using " + url
    data = json.load(urllib2.urlopen(url))
    try:
        if data['error']:
            print "Error fetching data from " + scraper
            return
    except:
        pass
    print "Adding/updating " + str(len(data)) + " " + status + " entries"
    for entry in data:
        # Handle OEP scraper 2012-06-16
        if not 'caseyear' in entry or entry['caseyear'] is None or \
                not 'caseseqnr' in entry or entry['caseseqnr'] is None:
            entry['caseyear'], entry['caseseqnr'] = entry['caseid'].split("/")
        entry['scraper'] = scraper
#        print entry
        # Workaround for postliste-stortinget failing on some PDFs
        if entry['doctype'] == u'Avs./mot:':
            continue
        try:
            insert_entry(dbcursor, entry)
        except:
            print entry
            raise
    return len(data)

# Compare the number of rows in the SQL table with the number of rows
# reported by the ScraperWiki datastore, and warn if they differ.
def verify_all_data_is_transfered(dbcursor, scraper):
    sql = "SELECT COUNT(*) FROM " + dbtable + " WHERE scraper = %s"
    dbcursor.execute(sql, (scraper,))
    res = dbcursor.fetchone()[0]
    if res is not None:
        sqlcount = res
    url = "https://api.scraperwiki.com/api/1.0/scraper/getinfo?format=jsondict&name=%s&version=-1" % scraper
    jsondata = urllib2.urlopen(url)
    data = json.load(jsondata)
    swcount = data[0]['datasummary']['tables']['swdata']['count']
    if swcount != sqlcount:
        print "warning: %d records in SQL table do not match %d records in source (diff %d)" % (sqlcount, swcount, swcount - sqlcount)

def main():
    dbconn, dbcursor = db_connect()
    if False: # Enable to recreate the table from scratch
        create_table(dbconn, dbcursor)
    scrapers = [
#        'postliste-arendal',   # Missing caseid, casedesc etc.
#        'postliste-lindesnes', # Missing caseid, casedesc etc.
#        'postliste-hvaler',    # municipality # parse errors
#        'postliste-hole',      # Missing casedocseq
        'postlist-ssb',
        'postliste-ballangen', # municipality # Inconsistent dataset before 2006?
        'postliste-difi',
        'postliste-fredrikstad', # municipality
        'postliste-hadsel', # municipality
        'postliste-halden', # municipality
        'postliste-hoegskolen-i-finnmark',
        'postliste-hoegskolen-i-gjoevik',
        'postliste-hoegskolen-i-hamar',
#        'postliste-hoegskolen-i-hedmark', # replaces -i-hamar
        'postliste-hoegskolen-i-buskerud',
        'postliste-hoegskolen-i-lillehammer',
        'postliste-hoegskolen-i-nord-troendelag',
        'postliste-hoegskolen-i-soer-troendelag',
        'postliste-hoegskolen-i-telemark',
        'postliste-hoegskolen-i-vestfold',
        'postliste-hoegskolen-i-vestfold',
        'postliste-hoegskolen-i-volda',
        'postliste-kafjord-kommune',
        'postliste-kristiansund', # municipality
        'postliste-lier', # municipality
        'postliste-loppa-kommune',
        'postliste-luftambulanse',
        'postliste-met',
        'postliste-mrfylke',
        'postliste-naroy', # municipality
        'postliste-nih',
        'postliste-npolar',
        'postliste-ntnu',
        'postliste-oep',
        'postliste-oslo-bydel-ullern', # municipality
        'postliste-oslo-gravferdsetaten', # municipality
        'postliste-oslo-havn', # municipality
        'postliste-ruter',
        'postliste-saltdal', # municipality
        'postliste-sivilombudsmannen',
        'postliste-skogoglandskap',
        'postliste-sogne', # municipality
        'postliste-stavanger-universitetssjukehus',
        'postliste-storfjord', # municipality
        'postliste-stortinget',
        'postliste-universitetet-i-agder',
        'postliste-universitetet-i-oslo',
        'postliste-universitetet-i-stavanger',
        'postliste-universitetssykehuset-nord-norge',
        ]
    for scraper in scrapers:
        print "Moving data from " + scraper
        # Keep fetching batches until a fetch returns 100 entries or
        # fewer, committing after each batch.
        while 100 < populate_from_scraper(dbcursor, scraper):
            dbconn.commit()
        dbconn.commit()
        verify_all_data_is_transfered(dbcursor, scraper)

if __name__ == '__main__':
    main()