author     Petter Reinholdtsen <pere@hungry.com>  2012-07-13 12:45:45 +0200
committer  Petter Reinholdtsen <pere@hungry.com>  2012-07-13 12:45:45 +0200
commit     b023d46935617dcdefbf80bb751c4dbc9987d53a
tree       22819ad8e17b483d646ac9008481581067f06971
parent     c99e7bfda6e025314b6a7c6683a1bc3c5818621c
Add two scripts: move-postjournal, which copies journal entries from ScraperWiki into the postjournal PostgreSQL database, and postliste-keysummary, which reports which column keys each scraper provides.
-rwxr-xr-x  move-postjournal      262
-rwxr-xr-x  postliste-keysummary   57
2 files changed, 319 insertions(+), 0 deletions(-)
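Both scripts read from the ScraperWiki JSON API over HTTPS: move-postjournal pulls the rows themselves from the datastore endpoint (built in jsonurl() below), while postliste-keysummary only fetches per-scraper metadata from getinfo. Here is a minimal sketch of the datastore call; the endpoint, query syntax, and 350-row limit come from the script itself, while the scraper name and timestamp are hypothetical example values:

#!/usr/bin/python
# Sketch: fetch rows newer than a given scrape stamp, as jsonurl() does.
import json
import urllib2

scraper = 'postliste-oep'          # example scraper name from the list below
since = '2012-07-01T00:00:00'      # hypothetical "last seen" timestamp
url = "https://api.scraperwiki.com/api/1.0/datastore/sqlite" + \
      "?format=json&name=" + scraper + \
      "&query=select+*+from+%60swdata%60+where+scrapestamputc+>+'" + \
      since + "'+order+by+scrapestamputc+limit+350"
rows = json.load(urllib2.urlopen(url))   # a JSON list of row dictionaries
print "fetched %d rows from %s" % (len(rows), scraper)

Each response is a JSON list of row dictionaries, which move-postjournal feeds one at a time into insert_entry().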
diff --git a/move-postjournal b/move-postjournal
new file mode 100755
index 0000000..af79ebc
--- /dev/null
+++ b/move-postjournal
@@ -0,0 +1,262 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+
+import psycopg2
+import urllib2
+import json
+import string
+import sys
+import os.path
+
+dbname = "postjournal"
+f = open(os.path.expanduser('~/.nuug-fpj-pwd'), 'r')
+password = f.readline().strip()
+f.close()
+
+# Define our connection string
+conn_string = "host='sqldb.nuug.no' dbname='%s' user='pere' password='%s' sslmode='require'" % (dbname, password)
+
+columns = {
+    'agency' : 'text NOT NULL',
+    'docdate' : 'date',
+    'docdesc' : 'text NOT NULL',
+    'doctype' : 'char(1) NOT NULL',
+    'casedesc' : 'text',
+    'casedocseq' : 'integer NOT NULL',
+    'recipient' : 'text',
+    'sender' : 'text',
+    'caseid' : 'text',
+    'caseseqnr' : 'integer NOT NULL',
+    'caseyear' : 'integer NOT NULL',
+    'exemption' : 'text',
+    'journalid' : 'text',
+    'journalseqnr' : 'integer',
+    'journalyear' : 'integer',
+    'recorddate' : 'date',
+    'scraper' : 'text',
+    'scrapedurl' : 'text',
+    'scrapestamputc' : 'timestamp without time zone NOT NULL',
+}
+
+# Connect to the database, printing the connection string we use.
+def db_connect():
+    print "Connecting to database\n ->%s" % (conn_string)
+
+    try:
+        # get a connection; if a connection cannot be made, an
+        # exception will be raised here
+        conn = psycopg2.connect(conn_string)
+        # conn.cursor will return a cursor object, you can use this
+        # cursor to perform queries
+        cursor = conn.cursor()
+        print "Connected!\n"
+    except:
+        # Get the most recent exception
+        exceptionType, exceptionValue, exceptionTraceback = sys.exc_info()
+        # Exit the script and print an error telling what happened.
+        sys.exit("Database connection failed!\n ->%s" % (exceptionValue))
+    return (conn, cursor)
+
+def jsonurl(scraper, lastscrapestamputc):
+    limit = 10000
+    limit = 350
+    return "https://api.scraperwiki.com/"+\
+        "api/1.0/datastore/sqlite?format=json&name=" + scraper +\
+        "&query=select+*+from+%60swdata%60+where+scrapestamputc+>+"+\
+        "'%s'+order+by+scrapestamputc+limit+%d" % (lastscrapestamputc, limit)
+
+def create_table(dbconn, dbcursor):
+    table = "journal"
+    print "Remove old %s table" % table
+    dbcursor.execute("DROP TABLE " + table)
+    print "Create new table"
+    s = map((lambda colname: colname + " " + columns[colname]), columns)
+    dbcursor.execute("CREATE TABLE "+table+" (" + string.join(s,",") + ")")
+    dbconn.commit()
+    print "Add unique index"
+    dbcursor.execute("CREATE UNIQUE INDEX " + table + "_unique " + \
+        "ON " + table + " (agency, caseyear, caseseqnr, casedocseq)")
+    dbconn.commit()
+
+def insert_entry(dbcursor, entry):
+    # Columns that can not be NULL
+    mustcols = {
+        'agency' : 1,
+        'docdesc' : 1,
+        'doctype' : 1,
+    }
+    uniquecols1 = ['agency', 'caseyear', 'caseseqnr', 'casedocseq']
+    uniquecols2 = ['agency', 'journalyear', 'journalseqnr']
+
+    cols = []
+    args = []
+    for colname in columns.keys():
+        if colname in entry:
+            if 'casedocseq' == colname and -1 == entry['casedocseq']:
+                entry['casedocseq'] = None
+            if '' != entry[colname] or colname in mustcols:
+                cols.append(colname)
+                args.append(entry[colname])
+    inssql = "INSERT INTO journal ( " + string.join(cols, ",") + " ) " + \
+        "VALUES ( " + \
+        string.join(map((lambda colname: "%s"), args), ",") + " )"
+
+    # Make sure replacing works (as INSERT OR REPLACE does not work
+    # with PostgreSQL) by removing the old entry if it exists.
+    if 'casedocseq' in entry and -1 != entry['casedocseq']:
+        delsql = "DELETE FROM journal WHERE " + \
+            string.join(map((lambda colname: colname + " = %s"), uniquecols1),
+                        " and ")
+        uniquecols = uniquecols1
+    else:
+        delsql = "DELETE FROM journal WHERE " + \
+            string.join(map((lambda colname: colname + " = %s"), uniquecols2),
+                        " and ")
+        uniquecols = uniquecols2
+
+    delargs = []
+    for colname in uniquecols:
+        delargs.append(entry[colname])
+    try:
+        dbcursor.execute(delsql, delargs)
+        dbcursor.execute(inssql, args)
+    except:
+        exceptionType, exceptionValue, exceptionTraceback = sys.exc_info()
+        print entry
+        print inssql
+        print exceptionType, exceptionValue, exceptionTraceback
+        exit(1)
+
+#    except psycopg2.IntegrityError, e:
+#    except psycopg2.DataError, e:
+#        print entry
+#        print sql
+#        print e
+
+def populate_from_scraper(dbcursor, scraper):
+    lastscrapestamputc = ''
+    if True:
+        try:
+            sql = "SELECT MAX(scrapestamputc) FROM journal WHERE scraper = %s"
+#            print sql
+            dbcursor.execute(sql, (scraper,))
+            res = dbcursor.fetchone()[0]
+            if res is not None:
+                status = "new"
+                lastscrapestamputc = res
+                lastscrapestamputc = lastscrapestamputc.isoformat().replace("+02:00", "+00:00").replace(" ", "T")
+            else:
+                status = "initial"
+        except psycopg2.DataError, e:
+            print "Failed: " + str(e)
+            exit(0)
+
+    url = jsonurl(scraper, lastscrapestamputc)
+    #print "Using " + url
+    data = json.load(urllib2.urlopen(url))
+    try:
+        if data['error']:
+            print "Error fetching data from " + scraper
+            return
+    except:
+        pass
+
+    print "Adding/updating " + str(len(data)) + " " + status + " entries"
+    for entry in data:
+
+        # Handle OEP scraper 2012-06-16
+        if not 'caseyear' in entry or entry['caseyear'] is None or \
+           not 'caseseqnr' in entry or entry['caseseqnr'] is None:
+            entry['caseyear'], entry['caseseqnr'] = entry['caseid'].split("/")
+
+        entry['scraper'] = scraper
+#        print entry
+        # Workaround for postliste-stortinget failing on some PDFs
+        if entry['doctype'] == u'Avs./mot:':
+            continue
+        try:
+            insert_entry(dbcursor, entry)
+        except:
+            print entry
+            raise
+    return len(data)
+
+def verify_all_data_is_transfered(dbcursor, scraper):
+    sql = "SELECT COUNT(*) FROM journal WHERE scraper = %s"
+    dbcursor.execute(sql, (scraper,))
+    res = dbcursor.fetchone()[0]
+    sqlcount = res if res is not None else 0
+    url = "https://api.scraperwiki.com/api/1.0/scraper/getinfo?format=jsondict&name=%s&version=-1" % scraper
+    jsondata = urllib2.urlopen(url)
+    data = json.load(jsondata)
+    swcount = data[0]['datasummary']['tables']['swdata']['count']
+    if swcount != sqlcount:
+        print "warning: %d records in SQL table do not match %d records in source (diff %d)" % (sqlcount, swcount, swcount - sqlcount)
+
+def main():
+    dbconn, dbcursor = db_connect()
+    if False:
+        create_table(dbconn, dbcursor)
+
+    scrapers = [
+#        'postliste-arendal',   # Missing caseid, casedesc etc.
+#        'postliste-lindesnes', # Missing caseid, casedesc etc.
+#        'postliste-hvaler',    # municipality # parse error
+
+#        'postliste-hole',      # Missing casedocseq
+        'postlist-ssb',
+        'postliste-ballangen', # municipality # Inconsistent dataset before 2006?
+        'postliste-difi',
+        'postliste-fredrikstad', # municipality
+        'postliste-hadsel', # municipality
+        'postliste-halden', # municipality
+        'postliste-hoegskolen-i-finnmark',
+        'postliste-hoegskolen-i-gjoevik',
+        'postliste-hoegskolen-i-hamar',
+#        'postliste-hoegskolen-i-hedmark', # replaces -i-hamar
+        'postliste-hoegskolen-i-buskerud',
+        'postliste-hoegskolen-i-lillehammer',
+        'postliste-hoegskolen-i-nord-troendelag',
+        'postliste-hoegskolen-i-soer-troendelag',
+        'postliste-hoegskolen-i-telemark',
+        'postliste-hoegskolen-i-vestfold',
+        'postliste-hoegskolen-i-volda',
+        'postliste-kafjord-kommune',
+        'postliste-kristiansund', # municipality
+        'postliste-lier', # municipality
+        'postliste-loppa-kommune',
+        'postliste-luftambulanse',
+        'postliste-met',
+        'postliste-mrfylke',
+        'postliste-naroy', # municipality
+        'postliste-nih',
+        'postliste-npolar',
+        'postliste-ntnu',
+        'postliste-oep',
+        'postliste-oslo-bydel-ullern', # municipality
+        'postliste-oslo-gravferdsetaten', # municipality
+        'postliste-oslo-havn', # municipality
+        'postliste-ruter',
+        'postliste-saltdal', # municipality
+        'postliste-sivilombudsmannen',
+        'postliste-skogoglandskap',
+        'postliste-sogne', # municipality
+        'postliste-stavanger-universitetssjukehus',
+        'postliste-storfjord', # municipality
+        'postliste-stortinget',
+        'postliste-universitetet-i-agder',
+        'postliste-universitetet-i-oslo',
+        'postliste-universitetet-i-stavanger',
+        'postliste-universitetssykehuset-nord-norge',
+        ]
+
+    for scraper in scrapers:
+        print "Moving data from " + scraper
+        while 100 < populate_from_scraper(dbcursor, scraper):
+            dbconn.commit()
+        dbconn.commit()
+        verify_all_data_is_transfered(dbcursor, scraper)
+
+main()
diff --git a/postliste-keysummary b/postliste-keysummary
new file mode 100755
index 0000000..0041b73
--- /dev/null
+++ b/postliste-keysummary
@@ -0,0 +1,57 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import sys
+import urllib2
+import json
+
+scrapers = [
+    'postliste-arendal',
+    'postliste-ballangen',
+    'postliste-fredrikstad',
+    'postliste-hadsel',
+    'postliste-halden',
+    'postliste-kafjord',
+    'postliste-kristiansund',
+    'postliste-lier',
+    'postliste-lindesnes',
+    'postliste-naroy',
+    'postliste-nrk',
+    'postliste-oep',
+    'postliste-oslo-bydel-ullern',
+    'postliste-oslo-gravferdsetaten',
+    'postliste-oslo-havn',
+    'postliste-risør-kommune',
+    'postliste-ruter',
+    'postliste-saltdal',
+    'postliste-sivilombudsmannen',
+    'postliste-skogoglandskap',
+    'postliste-sogne',
+    'postliste-stavanger-universitetssjukehus',
+    'postliste-storfjord',
+    'postliste-stortinget',
+    'postliste-universitetet-i-oslo',
+    ]
+
+keys = {}
+
+for scraper in scrapers:
+    print >> sys.stderr, "Loading " + scraper
+    url = 'https://api.scraperwiki.com/api/1.0/scraper/getinfo?format=jsondict&name=' + scraper + '&version=-1'
+    response = urllib2.urlopen(url)
+    html = response.read()
+    data = json.loads(html)
+    try:
+        if 'swdata' in data[0]['datasummary']['tables']:
+            # Record which scrapers provide each (lower-cased) column key.
+            for key in data[0]['datasummary']['tables']['swdata']['keys']:
+                key = key.lower()
+                if key in keys:
+                    keys[key].append(scraper)
+                else:
+                    keys[key] = [scraper]
+    except:
+        print >> sys.stderr, \
+            "error: unable to find data from scraper " + scraper
+for key in keys:
+    print len(keys[key]), key, str(keys[key])
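The heart of move-postjournal is the delete-then-insert emulation of an upsert in insert_entry(): PostgreSQL at the time offered no equivalent of SQLite's INSERT OR REPLACE (INSERT ... ON CONFLICT only arrived in PostgreSQL 9.5), so the script deletes any row matching the unique key before inserting the new copy. Below is a condensed, self-contained sketch of the same pattern; the connection string and sample entry values are assumed for illustration, not taken from the commit:

#!/usr/bin/python
# Sketch: the delete-then-insert "upsert" used by insert_entry() above.
import psycopg2

def upsert_journal_entry(cursor, entry):
    key = ('agency', 'caseyear', 'caseseqnr', 'casedocseq')
    # Remove any existing row carrying the same unique key ...
    cursor.execute("DELETE FROM journal WHERE " +
                   " and ".join([col + " = %s" for col in key]),
                   [entry[col] for col in key])
    # ... then insert the fresh copy of the record.
    cols = sorted(entry.keys())
    cursor.execute("INSERT INTO journal ( " + ",".join(cols) + " ) " +
                   "VALUES ( " + ",".join(["%s"] * len(cols)) + " )",
                   [entry[col] for col in cols])

conn = psycopg2.connect("dbname=postjournal")  # connection details assumed
cursor = conn.cursor()
upsert_journal_entry(cursor, {'agency': 'example-agency', 'caseyear': 2012,
                              'caseseqnr': 1, 'casedocseq': 1,
                              'docdesc': 'Example document', 'doctype': 'I'})
conn.commit()  # DELETE and INSERT land in the same transaction

Because psycopg2 keeps both statements in one transaction until commit(), a concurrent reader never sees the record vanish mid-replace; the pattern is still unsafe against concurrent writers, which is tolerable as long as only one mover process runs at a time.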