author     Petter Reinholdtsen <pere@hungry.com>  2012-07-13 12:45:45 +0200
committer  Petter Reinholdtsen <pere@hungry.com>  2012-07-13 12:45:45 +0200
commit     b023d46935617dcdefbf80bb751c4dbc9987d53a
tree       22819ad8e17b483d646ac9008481581067f06971
parent     c99e7bfda6e025314b6a7c6683a1bc3c5818621c
Add two scripts: move-postjournal, which copies journal entries from ScraperWiki into the postjournal PostgreSQL database, and postliste-keysummary, which reports which column keys each scraper provides.
-rwxr-xr-x  move-postjournal      262
-rwxr-xr-x  postliste-keysummary   57
2 files changed, 319 insertions(+), 0 deletions(-)
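Both scripts read from the ScraperWiki JSON API over HTTPS: move-postjournal pulls the rows themselves from the datastore endpoint (built in jsonurl() below), while postliste-keysummary only fetches per-scraper metadata from getinfo. Here is a minimal sketch of the datastore call; the endpoint, query syntax, and 350-row limit come from the script itself, while the scraper name and timestamp are hypothetical example values:

#!/usr/bin/python
# Sketch: fetch rows newer than a given scrape stamp, as jsonurl() does.
import json
import urllib2

scraper = 'postliste-oep'          # example scraper name from the list below
since = '2012-07-01T00:00:00'      # hypothetical "last seen" timestamp
url = "https://api.scraperwiki.com/api/1.0/datastore/sqlite" + \
      "?format=json&name=" + scraper + \
      "&query=select+*+from+%60swdata%60+where+scrapestamputc+>+'" + \
      since + "'+order+by+scrapestamputc+limit+350"
rows = json.load(urllib2.urlopen(url))   # a JSON list of row dictionaries
print "fetched %d rows from %s" % (len(rows), scraper)

Each response is a JSON list of row dictionaries, which move-postjournal feeds one at a time into insert_entry().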
diff --git a/move-postjournal b/move-postjournal
new file mode 100755
index 0000000..af79ebc
--- /dev/null
+++ b/move-postjournal
@@ -0,0 +1,262 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+
+import psycopg2
+import urllib2
+import json
+import string
+import sys
+import os.path
+
+dbname = "postjournal"
+f = open(os.path.expanduser('~/.nuug-fpj-pwd'), 'r')
+password = f.readline().strip()
+f.close()
+
+# Define our connection string
+conn_string = "host='sqldb.nuug.no' dbname='%s' user='pere' password='%s' sslmode='require'" % (dbname, password)
+
+columns = {
+    'agency' : 'text NOT NULL',
+    'docdate' : 'date',
+    'docdesc' : 'text NOT NULL',
+    'doctype' : 'char(1) NOT NULL',
+    'casedesc' : 'text',
+    'casedocseq' : 'integer NOT NULL',
+    'recipient' : 'text',
+    'sender' : 'text',
+    'caseid' : 'text',
+    'caseseqnr' : 'integer NOT NULL',
+    'caseyear' : 'integer NOT NULL',
+    'exemption' : 'text',
+    'journalid' : 'text',
+    'journalseqnr' : 'integer',
+    'journalyear' : 'integer',
+    'recorddate' : 'date',
+    'scraper' : 'text',
+    'scrapedurl' : 'text',
+    'scrapestamputc' : 'timestamp without time zone NOT NULL',
+}
+
+# Connect to the database, printing the connection string we use.
+def db_connect():
+    print "Connecting to database\n ->%s" % (conn_string)
+
+    try:
+        # get a connection; if a connection cannot be made, an
+        # exception will be raised here
+        conn = psycopg2.connect(conn_string)
+        # conn.cursor will return a cursor object, you can use this
+        # cursor to perform queries
+        cursor = conn.cursor()
+        print "Connected!\n"
+    except:
+        # Get the most recent exception
+        exceptionType, exceptionValue, exceptionTraceback = sys.exc_info()
+        # Exit the script and print an error telling what happened.
+        sys.exit("Database connection failed!\n ->%s" % (exceptionValue))
+    return (conn, cursor)
+
+def jsonurl(scraper, lastscrapestamputc):
+    limit = 10000
+    limit = 350
+    return "https://api.scraperwiki.com/"+\
+        "api/1.0/datastore/sqlite?format=json&name=" + scraper +\
+        "&query=select+*+from+%60swdata%60+where+scrapestamputc+>+"+\
+        "'%s'+order+by+scrapestamputc+limit+%d" % (lastscrapestamputc, limit)
+
+def create_table(dbconn, dbcursor):
+    table = "journal"
+    print "Remove old %s table" % table
+    dbcursor.execute("DROP TABLE " + table)
+    print "Create new table"
+    s = map((lambda colname: colname + " " + columns[colname]), columns)
+    dbcursor.execute("CREATE TABLE "+table+" (" + string.join(s,",") + ")")
+    dbconn.commit()
+    print "Add unique index"
+    dbcursor.execute("CREATE UNIQUE INDEX " + table + "_unique " + \
+        "ON " + table + " (agency, caseyear, caseseqnr, casedocseq)")
+    dbconn.commit()
+
+def insert_entry(dbcursor, entry):
+    # Columns that can not be NULL
+    mustcols = {
+        'agency' : 1,
+        'docdesc' : 1,
+        'doctype' : 1,
+    }
+    uniquecols1 = ['agency', 'caseyear', 'caseseqnr', 'casedocseq']
+    uniquecols2 = ['agency', 'journalyear', 'journalseqnr']
+
+    cols = []
+    args = []
+    for colname in columns.keys():
+        if colname in entry:
+            if 'casedocseq' == colname and -1 == entry['casedocseq']:
+                entry['casedocseq'] = None
+            if '' != entry[colname] or colname in mustcols:
+                cols.append(colname)
+                args.append(entry[colname])
+    inssql = "INSERT INTO journal ( " + string.join(cols, ",") + " ) " + \
+        "VALUES ( " + \
+        string.join(map((lambda colname: "%s"), args), ",") + " )"
+
+    # Make sure replacing works (as INSERT OR REPLACE does not work
+    # with PostgreSQL) by removing the old entry if it exists.
+    if 'casedocseq' in entry and -1 != entry['casedocseq']:
+        delsql = "DELETE FROM journal WHERE " + \
+            string.join(map((lambda colname: colname + " = %s"), uniquecols1),
+                        " and ")
+        uniquecols = uniquecols1
+    else:
+        delsql = "DELETE FROM journal WHERE " + \
+            string.join(map((lambda colname: colname + " = %s"), uniquecols2),
+                        " and ")
+        uniquecols = uniquecols2
+
+    delargs = []
+    for colname in uniquecols:
+        delargs.append(entry[colname])
+    try:
+        dbcursor.execute(delsql, delargs)
+        dbcursor.execute(inssql, args)
+    except:
+        exceptionType, exceptionValue, exceptionTraceback = sys.exc_info()
+        print entry
+        print inssql
+        print exceptionType, exceptionValue, exceptionTraceback
+        exit(1)
+
+#    except psycopg2.IntegrityError, e:
+#    except psycopg2.DataError, e:
+#        print entry
+#        print sql
+#        print e
+
+def populate_from_scraper(dbcursor, scraper):
+    lastscrapestamputc = ''
+    if True:
+        try:
+            sql = "SELECT MAX(scrapestamputc) FROM journal WHERE scraper = %s"
+#            print sql
+            dbcursor.execute(sql, (scraper,))
+            res = dbcursor.fetchone()[0]
+            if res is not None:
+                status = "new"
+                lastscrapestamputc = res
+                lastscrapestamputc = lastscrapestamputc.isoformat().replace("+02:00", "+00:00").replace(" ", "T")
+            else:
+                status = "initial"
+        except psycopg2.DataError, e:
+            print "Failed: " + str(e)
+            exit(0)
+
+    url = jsonurl(scraper, lastscrapestamputc)
+    #print "Using " + url
+    data = json.load(urllib2.urlopen(url))
+    try:
+        if data['error']:
+            print "Error fetching data from " + scraper
+            return
+    except:
+        pass
+
+    print "Adding/updating " + str(len(data)) + " " + status + " entries"
+    for entry in data:
+
+        # Handle OEP scraper 2012-06-16
+        if not 'caseyear' in entry or entry['caseyear'] is None or \
+           not 'caseseqnr' in entry or entry['caseseqnr'] is None:
+            entry['caseyear'], entry['caseseqnr'] = entry['caseid'].split("/")
+
+        entry['scraper'] = scraper
+#        print entry
+        # Workaround for postliste-stortinget failing on some PDFs
+        if entry['doctype'] == u'Avs./mot:':
+            continue
+        try:
+            insert_entry(dbcursor, entry)
+        except:
+            print entry
+            raise
+    return len(data)
+
+def verify_all_data_is_transfered(dbcursor, scraper):
+    sql = "SELECT COUNT(*) FROM journal WHERE scraper = %s"
+    dbcursor.execute(sql, (scraper,))
+    res = dbcursor.fetchone()[0]
+    sqlcount = res if res is not None else 0
+    url = "https://api.scraperwiki.com/api/1.0/scraper/getinfo?format=jsondict&name=%s&version=-1" % scraper
+    jsondata = urllib2.urlopen(url)
+    data = json.load(jsondata)
+    swcount = data[0]['datasummary']['tables']['swdata']['count']
+    if swcount != sqlcount:
+        print "warning: %d records in SQL table do not match %d records in source (diff %d)" % (sqlcount, swcount, swcount - sqlcount)
+
+def main():
+    dbconn, dbcursor = db_connect()
+    if False:
+        create_table(dbconn, dbcursor)
+
+    scrapers = [
+#        'postliste-arendal',   # Missing caseid, casedesc etc.
+#        'postliste-lindesnes', # Missing caseid, casedesc etc.
+#        'postliste-hvaler',    # municipality # parse error
+
+#        'postliste-hole',      # Missing casedocseq
+        'postlist-ssb',
+        'postliste-ballangen', # municipality # Inconsistent dataset before 2006?
+        'postliste-difi',
+        'postliste-fredrikstad', # municipality
+        'postliste-hadsel', # municipality
+        'postliste-halden', # municipality
+        'postliste-hoegskolen-i-finnmark',
+        'postliste-hoegskolen-i-gjoevik',
+        'postliste-hoegskolen-i-hamar',
+#        'postliste-hoegskolen-i-hedmark', # replaces -i-hamar
+        'postliste-hoegskolen-i-buskerud',
+        'postliste-hoegskolen-i-lillehammer',
+        'postliste-hoegskolen-i-nord-troendelag',
+        'postliste-hoegskolen-i-soer-troendelag',
+        'postliste-hoegskolen-i-telemark',
+        'postliste-hoegskolen-i-vestfold',
+        'postliste-hoegskolen-i-volda',
+        'postliste-kafjord-kommune',
+        'postliste-kristiansund', # municipality
+        'postliste-lier', # municipality
+        'postliste-loppa-kommune',
+        'postliste-luftambulanse',
+        'postliste-met',
+        'postliste-mrfylke',
+        'postliste-naroy', # municipality
+        'postliste-nih',
+        'postliste-npolar',
+        'postliste-ntnu',
+        'postliste-oep',
+        'postliste-oslo-bydel-ullern', # municipality
+        'postliste-oslo-gravferdsetaten', # municipality
+        'postliste-oslo-havn', # municipality
+        'postliste-ruter',
+        'postliste-saltdal', # municipality
+        'postliste-sivilombudsmannen',
+        'postliste-skogoglandskap',
+        'postliste-sogne', # municipality
+        'postliste-stavanger-universitetssjukehus',
+        'postliste-storfjord', # municipality
+        'postliste-stortinget',
+        'postliste-universitetet-i-agder',
+        'postliste-universitetet-i-oslo',
+        'postliste-universitetet-i-stavanger',
+        'postliste-universitetssykehuset-nord-norge',
+        ]
+
+    for scraper in scrapers:
+        print "Moving data from " + scraper
+        while 100 < populate_from_scraper(dbcursor, scraper):
+            dbconn.commit()
+        dbconn.commit()
+        verify_all_data_is_transfered(dbcursor, scraper)
+
+main()
diff --git a/postliste-keysummary b/postliste-keysummary
new file mode 100755
index 0000000..0041b73
--- /dev/null
+++ b/postliste-keysummary
@@ -0,0 +1,57 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import sys
+import urllib2
+import json
+
+scrapers = [
+    'postliste-arendal',
+    'postliste-ballangen',
+    'postliste-fredrikstad',
+    'postliste-hadsel',
+    'postliste-halden',
+    'postliste-kafjord',
+    'postliste-kristiansund',
+    'postliste-lier',
+    'postliste-lindesnes',
+    'postliste-naroy',
+    'postliste-nrk',
+    'postliste-oep',
+    'postliste-oslo-bydel-ullern',
+    'postliste-oslo-gravferdsetaten',
+    'postliste-oslo-havn',
+    'postliste-risør-kommune',
+    'postliste-ruter',
+    'postliste-saltdal',
+    'postliste-sivilombudsmannen',
+    'postliste-skogoglandskap',
+    'postliste-sogne',
+    'postliste-stavanger-universitetssjukehus',
+    'postliste-storfjord',
+    'postliste-stortinget',
+    'postliste-universitetet-i-oslo',
+    ]
+
+keys = {}
+
+for scraper in scrapers:
+    print >> sys.stderr, "Loading " + scraper
+    url = 'https://api.scraperwiki.com/api/1.0/scraper/getinfo?format=jsondict&name=' + scraper + '&version=-1'
+    response = urllib2.urlopen(url)
+    html = response.read()
+    data = json.loads(html)
+    try:
+        if 'swdata' in data[0]['datasummary']['tables']:
+            # Record which scrapers provide each (lower-cased) column key.
+            for key in data[0]['datasummary']['tables']['swdata']['keys']:
+                key = key.lower()
+                if key in keys:
+                    keys[key].append(scraper)
+                else:
+                    keys[key] = [scraper]
+    except:
+        print >> sys.stderr, \
+            "error: unable to find data from scraper " + scraper
+for key in keys:
+    print len(keys[key]), key, str(keys[key])
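The heart of move-postjournal is the delete-then-insert emulation of an upsert in insert_entry(): PostgreSQL at the time offered no equivalent of SQLite's INSERT OR REPLACE (INSERT ... ON CONFLICT only arrived in PostgreSQL 9.5), so the script deletes any row matching the unique key before inserting the new copy. Below is a condensed, self-contained sketch of the same pattern; the connection string and sample entry values are assumed for illustration, not taken from the commit:

#!/usr/bin/python
# Sketch: the delete-then-insert "upsert" used by insert_entry() above.
import psycopg2

def upsert_journal_entry(cursor, entry):
    key = ('agency', 'caseyear', 'caseseqnr', 'casedocseq')
    # Remove any existing row carrying the same unique key ...
    cursor.execute("DELETE FROM journal WHERE " +
                   " and ".join([col + " = %s" for col in key]),
                   [entry[col] for col in key])
    # ... then insert the fresh copy of the record.
    cols = sorted(entry.keys())
    cursor.execute("INSERT INTO journal ( " + ",".join(cols) + " ) " +
                   "VALUES ( " + ",".join(["%s"] * len(cols)) + " )",
                   [entry[col] for col in cols])

conn = psycopg2.connect("dbname=postjournal")  # connection details assumed
cursor = conn.cursor()
upsert_journal_entry(cursor, {'agency': 'example-agency', 'caseyear': 2012,
                              'caseseqnr': 1, 'casedocseq': 1,
                              'docdesc': 'Example document', 'doctype': 'I'})
conn.commit()  # DELETE and INSERT land in the same transaction

Because psycopg2 keeps both statements in one transaction until commit(), a concurrent reader never sees the record vanish mid-replace; the pattern is still unsafe against concurrent writers, which is tolerable as long as only one mover process runs at a time.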