author    Petter Reinholdtsen <pere@hungry.com>  2012-07-13 12:45:45 +0200
committer Petter Reinholdtsen <pere@hungry.com>  2012-07-13 12:45:45 +0200
commit    b023d46935617dcdefbf80bb751c4dbc9987d53a (patch)
tree      22819ad8e17b483d646ac9008481581067f06971
parent    c99e7bfda6e025314b6a7c6683a1bc3c5818621c (diff)
Add two scripts: move-postjournal copies journal entries from ScraperWiki
scrapers into the PostgreSQL database, and postliste-keysummary summarises
which data keys each scraper exposes.
-rwxr-xr-x  move-postjournal      262
-rwxr-xr-x  postliste-keysummary   57
2 files changed, 319 insertions(+), 0 deletions(-)
diff --git a/move-postjournal b/move-postjournal
new file mode 100755
index 0000000..af79ebc
--- /dev/null
+++ b/move-postjournal
@@ -0,0 +1,262 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
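+#
+# Move postjournal entries from a set of ScraperWiki scrapers into the
+# PostgreSQL database on sqldb.nuug.no, fetching new rows incrementally
+# in batches.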
+
+import psycopg2
+import urllib2
+import json
+import string
+import sys
+import os.path
+
+dbname = "postjournal"
+with open(os.path.expanduser('~/.nuug-fpj-pwd'), 'r') as f:
+    password = f.readline().strip()
+
+# Define our connection string
+conn_string = "host='sqldb.nuug.no' dbname='%s' user='pere' password='%s' sslmode='require'" % (dbname, password)
+
+columns = {
+ 'agency' : 'text NOT NULL',
+ 'docdate' : 'date',
+ 'docdesc' : 'text NOT NULL',
+ 'doctype' : 'char(1) NOT NULL',
+ 'casedesc' : 'text',
+ 'casedocseq' : 'integer NOT NULL',
+ 'recipient' : 'text',
+ 'sender' : 'text',
+ 'caseid' : 'text',
+ 'caseseqnr' : 'integer NOT NULL',
+ 'caseyear' : 'integer NOT NULL',
+ 'exemption' : 'text',
+ 'journalid' : 'text',
+ 'journalseqnr' : 'integer',
+ 'journalyear' : 'integer',
+ 'recorddate' : 'date',
+ 'scraper' : 'text',
+ 'scrapedurl' : 'text',
+ 'scrapestamputc' : 'timestamp without time zone NOT NULL',
+}
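+# create_table() below turns the dict above into column definitions,
+# roughly like this (column order is not guaranteed, as Python 2 dicts
+# are unordered):
+#   CREATE TABLE journal (agency text NOT NULL,docdate date,...)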
+
+def db_connect():
+    # Print the connection string we will use to connect, masking the
+    # password so it does not end up in terminal output or logs.
+    print "Connecting to database\n ->%s" % (conn_string.replace(password, '********'))
+
+ try:
+        # Get a connection; if one cannot be made, an exception is
+        # raised here.
+        conn = psycopg2.connect(conn_string)
+        # conn.cursor() returns a cursor object used to run queries.
+        cursor = conn.cursor()
+        print "Connected!\n"
+ except:
+ # Get the most recent exception
+ exceptionType, exceptionValue, exceptionTraceback = sys.exc_info()
+        # Exit the script and print an error describing what happened.
+ sys.exit("Database connection failed!\n ->%s" % (exceptionValue))
+ return (conn, cursor)
+
+def jsonurl(scraper, lastscrapestamputc):
+    # Fetch at most one modest batch per call; the caller keeps calling
+    # until a batch comes back with 100 entries or fewer.
+    limit = 350
+ return "https://api.scraperwiki.com/"+\
+ "api/1.0/datastore/sqlite?format=json&name=" + scraper +\
+ "&query=select+*+from+%60swdata%60+where+scrapestamputc+>+"+\
+ "'%s'+order+by+scrapestamputc+limit+%d" % (lastscrapestamputc, limit)
+
+def create_table(dbconn, dbcursor):
+ table = "journal"
+ print "Remove old %s table" % table
+ dbcursor.execute("DROP TABLE " + table)
+ print "Create new table"
+ s = map((lambda colname: colname + " " + columns[colname]), columns)
+ dbcursor.execute("CREATE TABLE "+table+" (" + string.join(s,",") + ")")
+ dbconn.commit()
+ print "Add unique index"
+ dbcursor.execute("CREATE UNIQUE INDEX " + table + "_unique " + \
+ "ON " + table + " (agency, caseyear, caseseqnr, casedocseq)")
+ dbconn.commit()
+
+def insert_entry(dbcursor, entry):
+    # Columns that cannot be NULL
+ mustcols = {
+ 'agency' : 1,
+ 'docdesc' : 1,
+ 'doctype' : 1,
+ }
+ uniquecols1 = ['agency', 'caseyear', 'caseseqnr', 'casedocseq']
+ uniquecols2 = ['agency', 'journalyear', 'journalseqnr']
+
+ cols = []
+ args = []
+ for colname in columns.keys():
+ if colname in entry:
+ if 'casedocseq' == colname and -1 == entry['casedocseq']:
+ entry['casedocseq'] = None
+ if '' != entry[colname] or colname in mustcols:
+ cols.append(colname)
+ args.append(entry[colname])
+ inssql = "INSERT INTO journal ( " + string.join(cols, ",") + " ) " + \
+ "VALUES ( " + \
+ string.join(map((lambda colname: "%s"), args), ",") + " )"
+
+    # Make sure replacing works (PostgreSQL has no INSERT OR REPLACE) by
+    # removing the old entry if it exists.
+ if 'casedocseq' in entry and -1 != entry['casedocseq']:
+ delsql = "DELETE FROM journal WHERE " + \
+ string.join(map((lambda colname: colname + " = %s"), uniquecols1),
+ " and ")
+ uniquecols = uniquecols1
+ else:
+ delsql = "DELETE FROM journal WHERE " + \
+ string.join(map((lambda colname: colname + " = %s"), uniquecols2),
+ " and ")
+ uniquecols = uniquecols2
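+    # An illustration with hypothetical values: for an entry with
+    # agency='Ruter', caseyear=2012, caseseqnr=17 and casedocseq=3 the
+    # statements are built as
+    #   DELETE FROM journal WHERE agency = %s and caseyear = %s and caseseqnr = %s and casedocseq = %s
+    #   INSERT INTO journal ( agency,caseyear,caseseqnr,casedocseq,... ) VALUES ( %s,%s,%s,%s,... )
+    # with the values passed separately, so psycopg2 handles the quoting.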
+
+ delargs = []
+ for colname in uniquecols:
+ delargs.append(entry[colname])
+ try:
+ dbcursor.execute(delsql, delargs)
+ dbcursor.execute(inssql, args)
+ except:
+ exceptionType, exceptionValue, exceptionTraceback = sys.exc_info()
+ print entry
+ print inssql
+ print exceptionType, exceptionValue, exceptionTraceback
+ exit(1)
+
+# TODO: catch psycopg2.IntegrityError and psycopg2.DataError separately
+# in insert_entry() instead of using the bare except above.
+
+def populate_from_scraper(dbcursor, scraper):
+    lastscrapestamputc = ''
+    try:
+        # Resume from the newest entry already transferred for this
+        # scraper, to avoid re-fetching everything on every run.
+        sql = "SELECT MAX(scrapestamputc) FROM journal WHERE scraper = %s"
+        dbcursor.execute(sql, (scraper,))
+        res = dbcursor.fetchone()[0]
+        if res is not None:
+            status = "new"
+            lastscrapestamputc = \
+                res.isoformat().replace("+02:00", "+00:00").replace(" ", "T")
+        else:
+            status = "initial"
+    except psycopg2.DataError, e:
+        print "Failed: " + str(e)
+        exit(1)
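+    # lastscrapestamputc is now either '' (initial run, fetch everything)
+    # or an ISO timestamp such as '2012-07-01T12:00:00' (hypothetical)
+    # marking where the previous run stopped.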
+
+ url = jsonurl(scraper, lastscrapestamputc)
+ #print "Using " + url
+ data = json.load(urllib2.urlopen(url))
+    # On failure the ScraperWiki API returns a dict with an 'error' key
+    # instead of a list of rows.
+    if isinstance(data, dict) and data.get('error'):
+        print "Error fetching data from " + scraper
+        return 0
+
+ print "Adding/updating " + str(len(data)) + " " + status + " entries"
+ for entry in data:
+
+        # Workaround for the OEP scraper (2012-06-16): derive caseyear
+        # and caseseqnr from the caseid string when they are missing.
+        if 'caseyear' not in entry or entry['caseyear'] is None or \
+           'caseseqnr' not in entry or entry['caseseqnr'] is None:
+            entry['caseyear'], entry['caseseqnr'] = entry['caseid'].split("/")
+
+ entry['scraper'] = scraper
+# print entry
+ # Workaround for postliste-stortinget failing on some PDFs
+ if entry['doctype'] == u'Avs./mot:':
+ continue
+ try:
+ insert_entry(dbcursor, entry)
+ except:
+ print entry
+ raise
+ return len(data)
+
+def verify_all_data_is_transferred(dbcursor, scraper):
+    # COUNT(*) never returns NULL, so the result can be used directly.
+    sql = "SELECT COUNT(*) FROM journal WHERE scraper = %s"
+    dbcursor.execute(sql, (scraper,))
+    sqlcount = dbcursor.fetchone()[0]
+    url = "https://api.scraperwiki.com/api/1.0/scraper/getinfo?format=jsondict&name=%s&version=-1" % scraper
+    jsondata = urllib2.urlopen(url)
+    data = json.load(jsondata)
+    swcount = data[0]['datasummary']['tables']['swdata']['count']
+    if swcount != sqlcount:
+        print "warning: %d records in SQL table do not match %d records in source (diff %d)" % (sqlcount, swcount, swcount - sqlcount)
+
+def main():
+ dbconn, dbcursor = db_connect()
+    # Flip to True to drop and recreate the journal table from scratch.
+    if False:
+        create_table(dbconn, dbcursor)
+
+    scrapers = [
+# 'postliste-arendal', # Missing caseid, casedesc etc.
+# 'postliste-lindesnes', # Missing caseid, casedesc etc.
+# 'postliste-hvaler', # municipality # parse errors
+
+# 'postliste-hole', # Missing casedocseq
+        'postlist-ssb',
+        'postliste-ballangen', # municipality # Inconsistent dataset before 2006?
+        'postliste-difi',
+        'postliste-fredrikstad', # municipality
+        'postliste-hadsel', # municipality
+        'postliste-halden', # municipality
+        'postliste-hoegskolen-i-finnmark',
+        'postliste-hoegskolen-i-gjoevik',
+        'postliste-hoegskolen-i-hamar',
+# 'postliste-hoegskolen-i-hedmark', # replaces -i-hamar
+        'postliste-hoegskolen-i-buskerud',
+        'postliste-hoegskolen-i-lillehammer',
+        'postliste-hoegskolen-i-nord-troendelag',
+        'postliste-hoegskolen-i-soer-troendelag',
+        'postliste-hoegskolen-i-telemark',
+        'postliste-hoegskolen-i-vestfold',
+        'postliste-hoegskolen-i-volda',
+        'postliste-kafjord-kommune',
+        'postliste-kristiansund', # municipality
+        'postliste-lier', # municipality
+        'postliste-loppa-kommune',
+        'postliste-luftambulanse',
+        'postliste-met',
+        'postliste-mrfylke',
+        'postliste-naroy', # municipality
+        'postliste-nih',
+        'postliste-npolar',
+        'postliste-ntnu',
+        'postliste-oep',
+        'postliste-oslo-bydel-ullern', # municipality
+        'postliste-oslo-gravferdsetaten', # municipality
+        'postliste-oslo-havn', # municipality
+        'postliste-ruter',
+        'postliste-saltdal', # municipality
+        'postliste-sivilombudsmannen',
+        'postliste-skogoglandskap',
+        'postliste-sogne', # municipality
+        'postliste-stavanger-universitetssjukehus',
+        'postliste-storfjord', # municipality
+        'postliste-stortinget',
+        'postliste-universitetet-i-agder',
+        'postliste-universitetet-i-oslo',
+        'postliste-universitetet-i-stavanger',
+        'postliste-universitetssykehuset-nord-norge',
+    ]
+
+ for scraper in scrapers:
+ print "Moving data from " + scraper
+ while 100 < populate_from_scraper(dbcursor, scraper):
+ dbconn.commit()
+ dbconn.commit()
+        verify_all_data_is_transferred(dbcursor, scraper)
+
+if __name__ == '__main__':
+    main()
diff --git a/postliste-keysummary b/postliste-keysummary
new file mode 100755
index 0000000..0041b73
--- /dev/null
+++ b/postliste-keysummary
@@ -0,0 +1,57 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
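+#
+# Summarise which data keys (columns) the postliste scrapers on
+# ScraperWiki expose, and which scrapers share each key.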
+
+import sys
+import urllib2
+import json
+
+scrapers = [
+ 'postliste-arendal',
+ 'postliste-ballangen',
+ 'postliste-fredrikstad',
+ 'postliste-hadsel',
+ 'postliste-halden',
+ 'postliste-kafjord',
+ 'postliste-kristiansund',
+ 'postliste-lier',
+ 'postliste-lindesnes',
+ 'postliste-naroy',
+ 'postliste-nrk',
+ 'postliste-oep',
+ 'postliste-oslo-bydel-ullern',
+ 'postliste-oslo-gravferdsetaten',
+ 'postliste-oslo-havn',
+ 'postliste-risør-kommune',
+ 'postliste-ruter',
+ 'postliste-saltdal',
+ 'postliste-sivilombudsmannen',
+ 'postliste-skogoglandskap',
+ 'postliste-sogne',
+ 'postliste-stavanger-universitetssjukehus',
+ 'postliste-storfjord',
+ 'postliste-stortinget',
+ 'postliste-universitetet-i-oslo',
+ ]
+
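+# Maps each lower-cased key name to the list of scrapers that expose it,
+# e.g. keys['caseid'] == ['postliste-oep', 'postliste-ruter'] (hypothetical
+# contents).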
+keys = {}
+
+for scraper in scrapers:
+    print >> sys.stderr, "Loading " + scraper
+    url = 'https://api.scraperwiki.com/api/1.0/scraper/getinfo?format=jsondict&name=' + scraper + '&version=-1'
+    response = urllib2.urlopen(url)
+    data = json.loads(response.read())
+ try:
+ if 'swdata' in data[0]['datasummary']['tables']:
+ for key in data[0]['datasummary']['tables']['swdata']['keys']:
+ key = key.lower()
+ if key in keys:
+ keys[key].append(scraper)
+ else:
+ keys[key] = [scraper]
+    except (KeyError, IndexError, TypeError):
+        print >> sys.stderr, "error: unable to find data from scraper " + scraper
+for key in sorted(keys):
+    print len(keys[key]), key, str(keys[key])