author    Petter Reinholdtsen <pere@hungry.com>  2014-12-13 10:55:13 +0100
committer Petter Reinholdtsen <pere@hungry.com>  2014-12-13 10:55:13 +0100
commit    cead925114ac5e8396402303ac339a766fdcd9d0
tree      b65944adb400cc08c82760136db5408827a29ec2
parent    a6b05e08fd045b7d50e46a469528120cc28dbde2
Get script working with local SQLite files.
-rwxr-xr-x  move-postjournal | 105
1 file changed, 74 insertions(+), 31 deletions(-)
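The substance of the change: instead of pulling rows over the remote ScraperWiki JSON API, the script now opens each scraper's SQLite dump directly from data/<scraper>.sqlite through SQLAlchemy. A minimal sketch of that core pattern, matching the Python 2 / pre-1.4 SQLAlchemy API the diff below uses (the helper name is mine, not the script's):

    import sqlalchemy

    def read_swdata(scraper):
        # One local SQLite dump per scraper, e.g. data/postliste-oep.sqlite.
        engine = sqlalchemy.create_engine("sqlite:///data/%s.sqlite" % scraper,
                                          echo=False, connect_args={'timeout': 300})
        connection = engine.connect()
        # SQLAlchemy before 1.4 accepts a plain SQL string here.
        result = connection.execute("select * from 'swdata'")
        rows = [dict(row.items()) for row in result]  # RowProxy -> plain dict
        connection.close()
        return rows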
diff --git a/move-postjournal b/move-postjournal
index 6ae548e..045cad0 100755
--- a/move-postjournal
+++ b/move-postjournal
@@ -8,6 +8,7 @@ import string
 import sys
 import os.path
 import time
+import sqlalchemy
 
 dbname = "postjournal"
 dbtable = "journal"
@@ -59,13 +60,19 @@ def db_connect():
         sys.exit("Database connection failed!\n ->%s" % (exceptionValue))
     return (conn, cursor)
 
-def jsonurl(scraper, lastscrapestamputc):
-    limit = 10000
-    limit = 400
-    return "https://api.scraperwiki.com/"+\
-        "api/1.0/datastore/sqlite?format=json&name=" + scraper +\
-        "&query=select+*+from+%60swdata%60+where+scrapestamputc+>+"+\
-        "'%s'+order+by+scrapestamputc+limit+%d" % (lastscrapestamputc, limit)
+def sqliteline(lastscrapestamputc, limit):
+    if lastscrapestamputc is None or '' == lastscrapestamputc:
+        lastscrapestamputc = '1970-01-01'
+    sql = "select * from 'swdata' where " \
+        "datetime(scrapestamputc) >= datetime('%s') "\
+        "order by scrapestamputc limit %d" % (lastscrapestamputc, limit)
+# print sql
+    return sql
+
+def jsonurl(scraper, sql):
+    urlsql = urllib.quote_plus(sql)
+    return "https://api.scraperwiki.com/" + \
+        "api/1.0/datastore/sqlite?format=json&name=" + scraper + "&query=%s" % urlsql
 
 def create_table(dbconn, dbcursor):
     print "Remove old %s table" % dbtable
@@ -134,7 +141,7 @@ def insert_entry(dbcursor, entry):
 # print sql
 # print e
 
-def populate_from_scraper_real(dbcursor, scraper):
+def populate_from_scraper_real(dbcursor, scraper, limit):
     lastscrapestamputc = ''
     if True:
         try:
@@ -145,22 +152,37 @@ def populate_from_scraper_real(dbcursor, scraper):
             if res is not None:
                 status = "new"
                 lastscrapestamputc = res
-                lastscrapestamputc = lastscrapestamputc.isoformat().replace("+02:00", "+00:00").replace(" ", "T")
+                lastscrapestamputc = lastscrapestamputc.isoformat()
+# print "Before: %s, after %s" % (res, lastscrapestamputc)
             else:
                 status = "initial"
         except psycopg2.DataError, e:
             print "Failed" + e
             exit(0)
-
-    url = jsonurl(scraper, lastscrapestamputc)
-    #print "Using " + url
-    data = json.load(urllib2.urlopen(url))
-    try:
-        if data['error']:
-            print "Error fetching data from " + scraper
-            return
-    except:
-        pass
+
+    sql = sqliteline(lastscrapestamputc, limit)
+    if False: # old API
+        url = jsonurl(scraper, sql)
+        #print "Using " + url
+        data = json.load(urllib2.urlopen(url))
+        try:
+            if data['error']:
+                print "Error fetching data from " + scraper
+                return
+        except:
+            pass
+    else:
+        filename = "sqlite:///data/%s.sqlite" % scraper
+        create = sqlalchemy.create_engine
+# print "opening %s" % filename
+        engine = create(filename, echo=False, connect_args={'timeout': 300})
+        connection = engine.connect()
+        result = connection.execute(sql)
+
+        data = []
+        for row in result:
+            data.append(dict(row.items()))
+        connection.close()
 
     print "Adding/updating " + str(len(data)) + " " + status + " entries (" + lastscrapestamputc + ")"
     skipped = 0
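A porting note for later readers: SQLAlchemy 1.4 and newer no longer accept a bare SQL string in Connection.execute(), and rows expose ._mapping rather than .items(). The equivalent of the block above on a modern stack would look roughly like this (a sketch, not what this commit targets):

    from sqlalchemy import create_engine, text

    engine = create_engine("sqlite:///data/%s.sqlite" % scraper,
                           connect_args={'timeout': 300})
    with engine.connect() as connection:
        result = connection.execute(text(sql))
        data = [dict(row._mapping) for row in result]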
@@ -177,6 +199,8 @@ def populate_from_scraper_real(dbcursor, scraper):
             entry['caseyear'], entry['caseseqnr'] = entry['caseid'].split("/")
         entry['scraper'] = scraper
+# print "scrapestamputc: %s" % entry['scrapestamputc']
+        entry['scrapestamputc'] = entry['scrapestamputc'] + '+0000'
 
 # print entry
         # Workaround for postliste-stortinget failing on some PDFs
         if entry['doctype'] == u'Avs./mot:':
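The new '+0000' suffix matters because scrapestamputc comes out of the SQLite dump as a naive string: appending an explicit zero UTC offset lets PostgreSQL store the value as the UTC instant it really is, instead of interpreting it in the session timezone. For example:

    '2014-12-13T09:55:13'        -- naive: interpreted in the session timezone
    '2014-12-13T09:55:13+0000'   -- explicit UTC offset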
@@ -186,14 +210,14 @@ def populate_from_scraper_real(dbcursor, scraper):
         except:
             print entry
             raise
-    print "Added/upded " + str(len(data)-skipped) + " " + status + " entries"
+    print "Added/updated " + str(len(data)-skipped) + " " + status + " entries"
     return len(data) - skipped
 
-def populate_from_scraper(dbcursor, scraper):
-    ret = populate_from_scraper_real(dbcursor, scraper)
+def populate_from_scraper(dbcursor, scraper, limit):
+    ret = populate_from_scraper_real(dbcursor, scraper, limit)
     if ret is None:
         time.sleep(10)
-        ret = populate_from_scraper_real(dbcursor, scraper)
+        ret = populate_from_scraper_real(dbcursor, scraper, limit)
     return ret
 
 def verify_all_data_is_transfered(dbcursor, scraper):
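populate_from_scraper() is a thin retry shim around the real worker: a None return (a failed attempt) earns a ten-second pause and exactly one more try before giving up. The same idea, pulled out as a generic helper (a sketch, not part of the script):

    import time

    def retry_once(func, args, delay=10):
        ret = func(*args)
        if ret is None:          # treat None as a transient failure worth retrying
            time.sleep(delay)
            ret = func(*args)
        return ret

    # e.g. retry_once(populate_from_scraper_real, (dbcursor, scraper, limit))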
@@ -202,15 +226,32 @@ def verify_all_data_is_transfered(dbcursor, scraper):
     res = dbcursor.fetchone()[0]
     if res is not None:
         sqlcount = res
-    url="https://api.scraperwiki.com/api/1.0/scraper/getinfo?format=jsondict&name=%s&version=-1" % scraper
     try:
-        jsondata = urllib2.urlopen(url)
-        data = json.load(jsondata)
-        swcount = data[0]['datasummary']['tables']['swdata']['count']
+        if False: # old API
+            url="https://api.scraperwiki.com/api/1.0/scraper/getinfo?format=jsondict&name=%s&version=-1" % scraper
+            jsondata = urllib2.urlopen(url)
+            data = json.load(jsondata)
+            swcount = data[0]['datasummary']['tables']['swdata']['count']
+        else:
+            filename = "sqlite:///data/%s.sqlite" % scraper
+            create = sqlalchemy.create_engine
+# print "opening %s" % filename
+            engine = create(filename, echo=False, connect_args={'timeout': 300})
+            connection = engine.connect()
+            result = connection.execute("SELECT COUNT(*) FROM 'swdata'")
+            if result is not None:
+                for row in result:
+# print row.items()
+                    swcount = row.items()[0][1]
+            else:
+                print "No count from SQL?"
+            connection.close()
+# print "swcount = %d" % swcount
+
         if swcount != sqlcount:
             print "warning: %d records in SQL table do not match %d records in source (diff %d)" % (sqlcount, swcount, swcount - sqlcount)
-    except:
-        print "warning: Unable to verify the data transfered"
+    except Exception, e:
+        print "warning: Unable to verify the data transfered: %s" % str(e)
 
 def main():
     dbconn, dbcursor = db_connect()
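The row.items()[0][1] dance pulls the count out of the first (column, value) pair of the single result row. The ResultProxy of this SQLAlchemy generation already has a shortcut for exactly that; a sketch of the tighter form:

    result = connection.execute("SELECT COUNT(*) FROM 'swdata'")
    swcount = result.scalar()    # first column of the first row, None if empty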
@@ -218,6 +259,7 @@ def main():
     create_table(dbconn, dbcursor)
 
     scrapers = [
+        'postliste-oep',
         'postliste-lunner-kommune',
         'postliste-mattilsynet',
 # 'postliste-arendal', # Missing caseid, casedesc etc.
@@ -270,12 +312,13 @@ def main():
         'postliste-universitetet-i-oslo',
         'postliste-universitetet-i-stavanger',
         'postliste-universitetssykehuset-nord-norge',
-        'postliste-oep',
         ]
 
     for scraper in scrapers:
+        print
         print "Moving data from " + scraper
-        while 100 < populate_from_scraper(dbcursor, scraper):
+        limit = 100000;
+        while limit/2 < populate_from_scraper(dbcursor, scraper, limit):
             dbconn.commit()
         dbconn.commit()
         verify_all_data_is_transfered(dbcursor, scraper)
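The reworked main loop drains each scraper in batches: ask for up to limit rows, commit, and go again as long as more than half the limit actually arrived, on the theory that a near-full batch means more rows are waiting. Reduced to its control flow (a sketch):

    limit = 100000
    while True:
        moved = populate_from_scraper(dbcursor, scraper, limit)
        dbconn.commit()              # bound each transaction to one batch
        if moved is None or moved <= limit / 2:
            break                    # a half-empty batch means we have caught up
    verify_all_data_is_transfered(dbcursor, scraper)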