| author    | Petter Reinholdtsen <pere@hungry.com>            | 2014-12-13 10:55:13 +0100 |
|-----------|--------------------------------------------------|---------------------------|
| committer | Petter Reinholdtsen <pere@hungry.com>            | 2014-12-13 10:55:13 +0100 |
| commit    | cead925114ac5e8396402303ac339a766fdcd9d0 (patch)  |                           |
| tree      | b65944adb400cc08c82760136db5408827a29ec2          |                           |
| parent    | a6b05e08fd045b7d50e46a469528120cc28dbde2 (diff)   |                           |
Get script working with local SQLite files.
-rwxr-xr-x | move-postjournal | 105 |
1 files changed, 74 insertions, 31 deletions
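
The substance of the change is the new read path: instead of fetching JSON from the ScraperWiki API, the script builds a plain SQL query with the new `sqliteline()` helper and runs it against the scraper's local dump at `data/<scraper>.sqlite` through SQLAlchemy. A minimal sketch of that path, pulled out of the diff below (the wrapper `fetch_new_rows()` is only illustrative and not part of the script; the code keeps the script's Python 2 / legacy-SQLAlchemy style):

```python
import sqlalchemy

def sqliteline(lastscrapestamputc, limit):
    # With no previous scrape stamp recorded, start from the epoch.
    if lastscrapestamputc is None or '' == lastscrapestamputc:
        lastscrapestamputc = '1970-01-01'
    return "select * from 'swdata' where " \
        "datetime(scrapestamputc) >= datetime('%s') " \
        "order by scrapestamputc limit %d" % (lastscrapestamputc, limit)

def fetch_new_rows(scraper, lastscrapestamputc, limit):
    # Illustrative wrapper: open the scraper's local SQLite dump instead of
    # calling the remote API, and run the time-windowed query built above.
    engine = sqlalchemy.create_engine("sqlite:///data/%s.sqlite" % scraper,
                                      echo=False,
                                      connect_args={'timeout': 300})
    connection = engine.connect()
    result = connection.execute(sqliteline(lastscrapestamputc, limit))
    # Turn each row into a plain dict, matching what the JSON API used to return.
    data = [dict(row.items()) for row in result]
    connection.close()
    return data
```

The old `jsonurl()` URL builder is kept, but only behind an `if False: # old API` guard, so the API path can be revived if the local files are ever unavailable.
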
```diff
diff --git a/move-postjournal b/move-postjournal
index 6ae548e..045cad0 100755
--- a/move-postjournal
+++ b/move-postjournal
@@ -8,6 +8,7 @@ import string
 import sys
 import os.path
 import time
+import sqlalchemy
 
 dbname = "postjournal"
 dbtable = "journal"
@@ -59,13 +60,19 @@ def db_connect():
         sys.exit("Database connection failed!\n ->%s" % (exceptionValue))
     return (conn, cursor)
 
-def jsonurl(scraper, lastscrapestamputc):
-    limit = 10000
-    limit = 400
-    return "https://api.scraperwiki.com/"+\
-        "api/1.0/datastore/sqlite?format=json&name=" + scraper +\
-        "&query=select+*+from+%60swdata%60+where+scrapestamputc+>+"+\
-        "'%s'+order+by+scrapestamputc+limit+%d" % (lastscrapestamputc, limit)
+def sqliteline(lastscrapestamputc, limit):
+    if lastscrapestamputc is None or '' == lastscrapestamputc:
+        lastscrapestamputc = '1970-01-01'
+    sql = "select * from 'swdata' where " \
+        "datetime(scrapestamputc) >= datetime('%s') "\
+        "order by scrapestamputc limit %d" % (lastscrapestamputc, limit)
+#    print sql
+    return sql
+
+def jsonurl(scraper, sql):
+    urlsql = urllib.urlencode(sql)
+    return "https://api.scraperwiki.com/" + \
+        "api/1.0/datastore/sqlite?format=json&name=" + scraper + "&query=%s" % urlsql
 
 def create_table(dbconn, dbcursor):
     print "Remove old %s table" % dbtable
@@ -134,7 +141,7 @@ def insert_entry(dbcursor, entry):
 #        print sql
 #        print e
 
-def populate_from_scraper_real(dbcursor, scraper):
+def populate_from_scraper_real(dbcursor, scraper, limit):
     lastscrapestamputc = ''
     if True:
         try:
@@ -145,22 +152,37 @@ def populate_from_scraper_real(dbcursor, scraper):
             if res is not None:
                 status = "new"
                 lastscrapestamputc = res
-                lastscrapestamputc = lastscrapestamputc.isoformat().replace("+02:00", "+00:00").replace(" ", "T")
+                lastscrapestamputc = lastscrapestamputc.isoformat()
+#                print "Before: %s, after %s" % (res, lastscrapestamputc)
             else:
                 status = "initial"
         except psycopg2.DataError, e:
             print "Failed" + e
             exit(0)
-
-    url = jsonurl(scraper, lastscrapestamputc)
-    #print "Using " + url
-    data = json.load(urllib2.urlopen(url))
-    try:
-        if data['error']:
-            print "Error fetching data from " + scraper
-            return
-    except:
-        pass
+
+    sql = sqliteline(lastscrapestamputc, limit)
+    if False: # old API
+        url = jsonurl(scraper, sql)
+        #print "Using " + url
+        data = json.load(urllib2.urlopen(url))
+        try:
+            if data['error']:
+                print "Error fetching data from " + scraper
+                return
+        except:
+            pass
+    else:
+        filename = "sqlite:///data/%s.sqlite" % scraper
+        create = sqlalchemy.create_engine
+#        print "opening %s" % filename
+        engine = create(filename, echo=False, connect_args={'timeout': 300})
+        connection = engine.connect()
+        result = connection.execute(sql)
+
+        data = []
+        for row in result:
+            data.append(dict(row.items()))
+        connection.close()
 
     print "Adding/updating " + str(len(data)) + " " + status + " entries (" + lastscrapestamputc + ")"
     skipped = 0
@@ -177,6 +199,8 @@ def populate_from_scraper_real(dbcursor, scraper):
             entry['caseyear'], entry['caseseqnr'] = entry['caseid'].split("/")
 
         entry['scraper'] = scraper
+#        print "scrapestamputc: %s" % entry['scrapestamputc']
+        entry['scrapestamputc'] = entry['scrapestamputc'] + '+0000'
 #        print entry
         # Workaround for postliste-stortinget failing on some PDFs
         if entry['doctype'] == u'Avs./mot:':
@@ -186,14 +210,14 @@
         except:
             print entry
             raise
-    print "Added/upded " + str(len(data)-skipped) + " " + status + " entries"
+    print "Added/updated " + str(len(data)-skipped) + " " + status + " entries"
     return len(data) - skipped
 
-def populate_from_scraper(dbcursor, scraper):
-    ret = populate_from_scraper_real(dbcursor, scraper)
+def populate_from_scraper(dbcursor, scraper, limit):
+    ret = populate_from_scraper_real(dbcursor, scraper, limit)
     if ret is None:
         time.sleep(10)
-        ret = populate_from_scraper_real(dbcursor, scraper)
+        ret = populate_from_scraper_real(dbcursor, scraper, limit)
     return ret
 
 def verify_all_data_is_transfered(dbcursor, scraper):
@@ -202,15 +226,32 @@ def verify_all_data_is_transfered(dbcursor, scraper):
     res = dbcursor.fetchone()[0]
     if res is not None:
         sqlcount = res
-    url="https://api.scraperwiki.com/api/1.0/scraper/getinfo?format=jsondict&name=%s&version=-1" % scraper
     try:
-        jsondata = urllib2.urlopen(url)
-        data = json.load(jsondata)
-        swcount = data[0]['datasummary']['tables']['swdata']['count']
+        if False: # old API
+            url="https://api.scraperwiki.com/api/1.0/scraper/getinfo?format=jsondict&name=%s&version=-1" % scraper
+            jsondata = urllib2.urlopen(url)
+            data = json.load(jsondata)
+            swcount = data[0]['datasummary']['tables']['swdata']['count']
+        else:
+            filename = "sqlite:///data/%s.sqlite" % scraper
+            create = sqlalchemy.create_engine
+#            print "opening %s" % filename
+            engine = create(filename, echo=False, connect_args={'timeout': 300})
+            connection = engine.connect()
+            result = connection.execute("SELECT COUNT(*) FROM 'swdata'")
+            if result is not None:
+                for row in result:
+#                    print row.items()
+                    swcount = row.items()[0][1]
+            else:
+                print "No count from SQL?"
+            connection.close()
+#        print "swcount = %d" % swcount
+
         if swcount != sqlcount:
             print "warning: %d records in SQL table do not match %d records in source (diff %d)" % (sqlcount, swcount, swcount - sqlcount)
-    except:
-        print "warning: Unable to verify the data transfered"
+    except Exception, e:
+        print "warning: Unable to verify the data transfered: %s" % str(e)
 
 def main():
     dbconn, dbcursor = db_connect()
@@ -218,6 +259,7 @@
         create_table(dbconn, dbcursor)
 
     scrapers = [
+        'postliste-oep',
         'postliste-lunner-kommune',
         'postliste-mattilsynet',
 #        'postliste-arendal', # Missing caseid, casedesc etc.
@@ -270,12 +312,13 @@
         'postliste-universitetet-i-oslo',
         'postliste-universitetet-i-stavanger',
         'postliste-universitetssykehuset-nord-norge',
-        'postliste-oep',
         ]
 
     for scraper in scrapers:
+        print
         print "Moving data from " + scraper
-        while 100 < populate_from_scraper(dbcursor, scraper):
+        limit = 100000;
+        while limit/2 < populate_from_scraper(dbcursor, scraper, limit):
             dbconn.commit()
         dbconn.commit()
         verify_all_data_is_transfered(dbcursor, scraper)
```
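
The per-scraper driver loop in `main()` changes along with the read path: the old loop stopped as soon as a batch of API results held 100 rows or fewer, while the new one asks the local file for up to 100000 rows at a time and keeps looping while a batch comes back more than half full, committing between batches. Wrapped up as a function for illustration (`move_scraper()` is not in the script; `populate_from_scraper()` and `verify_all_data_is_transfered()` are the functions from the diff above):

```python
def move_scraper(dbconn, dbcursor, scraper):
    # Pull batches of up to `limit` rows from the local SQLite file.  A batch
    # that comes back more than half full suggests there is still backlog
    # left, so commit it and go around again.
    limit = 100000
    while limit / 2 < populate_from_scraper(dbcursor, scraper, limit):
        dbconn.commit()
    dbconn.commit()  # commit the final, smaller batch
    # Cross-check the row count in PostgreSQL against the local SQLite file.
    verify_all_data_is_transfered(dbcursor, scraper)
```
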