diff options
author | Petter Reinholdtsen <pere@hungry.com> | 2014-02-27 14:50:56 +0100 |
---|---|---|
committer | Petter Reinholdtsen <pere@hungry.com> | 2014-02-27 14:51:44 +0100 |
commit | e718f0bca2fb52c37ef1c35cfe0345d4fb83cac2 (patch) | |
tree | 9ade061ecc283196a07882a20608526ff6976e69 /move-postjournal | |
parent | f3485a50f39f7fc50ae0f79ca11a45e9ea67856e (diff) |
Update to current version, make it more robust.
Diffstat (limited to 'move-postjournal')
-rwxr-xr-x | move-postjournal | 34 |
1 file changed, 25 insertions, 9 deletions
diff --git a/move-postjournal b/move-postjournal index fce7e64..144ccf4 100755 --- a/move-postjournal +++ b/move-postjournal @@ -7,6 +7,7 @@ import json import string import sys import os.path +import time dbname = "postjournal" dbtable = "journal" @@ -133,7 +134,7 @@ def insert_entry(dbcursor, entry): # print sql # print e -def populate_from_scraper(dbcursor, scraper): +def populate_from_scraper_real(dbcursor, scraper): lastscrapestamputc = '' if True: try: @@ -167,6 +168,9 @@ def populate_from_scraper(dbcursor, scraper): # Handle OEP scraper 2012-06-16 if not 'caseyear' in entry or entry['caseyear'] is None or \ not 'caseseqnr' in entry or entry['caseseqnr'] is None: +# if entry['caseid'] is None: +# print "Strange entry, skipping: ", entry +# continue entry['caseyear'], entry['caseseqnr'] = entry['caseid'].split("/") entry['scraper'] = scraper @@ -181,6 +185,13 @@ def populate_from_scraper(dbcursor, scraper): raise return len(data) +def populate_from_scraper(dbcursor, scraper): + ret = populate_from_scraper_real(dbcursor, scraper) + if ret is None: + time.sleep(10) + ret = populate_from_scraper_real(dbcursor, scraper) + return ret + def verify_all_data_is_transfered(dbcursor, scraper): sql = "SELECT COUNT(*) FROM %s WHERE scraper = '%s'" % (dbtable, scraper) dbcursor.execute(sql, (scraper,)) @@ -188,11 +199,14 @@ def verify_all_data_is_transfered(dbcursor, scraper): if res is not None: sqlcount = res url="https://api.scraperwiki.com/api/1.0/scraper/getinfo?format=jsondict&name=%s&version=-1" % scraper - jsondata = urllib2.urlopen(url) - data = json.load(jsondata) - swcount = data[0]['datasummary']['tables']['swdata']['count'] - if swcount != sqlcount: - print "warning: %d records in SQL table do not match %d records in source (diff %d)" % (sqlcount, swcount, swcount - sqlcount) + try: + jsondata = urllib2.urlopen(url) + data = json.load(jsondata) + swcount = data[0]['datasummary']['tables']['swdata']['count'] + if swcount != sqlcount: + print "warning: %d records in SQL table do not match %d records in source (diff %d)" % (sqlcount, swcount, swcount - sqlcount) + except: + print "warning: Unable to verify the data transfered" def main(): dbconn, dbcursor = db_connect() @@ -200,11 +214,13 @@ def main(): create_table(dbconn, dbcursor) scrapers = [ + 'postliste-mattilsynet', # 'postliste-arendal', # Missing caseid, casedesc etc. # 'postliste-lindesnes', # Missing caseid, casedesc etc. # 'postliste-hvaler', # kommune # parsefeil # 'postliste-hole', # Missing casedocseq + 'postliste-lenvik', 'postlist-ssb', 'postliste-ballangen', # kommune # Inconsistent dataset before 2006? 'postliste-difi', @@ -213,8 +229,8 @@ def main(): 'postliste-halden', # kommune 'postliste-hoegskolen-i-finnmark', 'postliste-hoegskolen-i-gjoevik', - 'postliste-hoegskolen-i-hamar', -# 'postliste-hoegskolen-i-hedmark', # replaces -i-hamar +# 'postliste-hoegskolen-i-hamar', + 'postliste-hoegskolen-i-hedmark', # replaces -i-hamar 'postliste-hoegskolen-i-buskerud', 'postliste-hoegskolen-i-lillehammer', 'postliste-hoegskolen-i-nord-troendelag', @@ -234,7 +250,6 @@ def main(): 'postliste-nih', 'postliste-npolar', 'postliste-ntnu', - 'postliste-oep', 'postliste-oslo-bydel-ullern', # kommune 'postliste-oslo-gravferdsetaten', # kommune 'postliste-oslo-havn', # kommune @@ -250,6 +265,7 @@ def main(): 'postliste-universitetet-i-oslo', 'postliste-universitetet-i-stavanger', 'postliste-universitetssykehuset-nord-norge', + 'postliste-oep', ] for scraper in scrapers: |