author     Petter Reinholdtsen <pere@hungry.com>   2014-12-09 11:46:16 +0100
committer  Petter Reinholdtsen <pere@hungry.com>   2014-12-09 11:46:16 +0100
commit     a51e734b0ba2e335f697900cbf21cd5405ab625b
tree       8c4ecb87fec3a5b448bf8db1973849b7636cbef1
parent     e720e189eb9724dcf20430a4a79d49f111f6916e
Get the scraper working in the new environment and make it more robust.
-rw-r--r--   scrapersources/postliste-oep   25
1 file changed, 19 insertions, 6 deletions
diff --git a/scrapersources/postliste-oep b/scrapersources/postliste-oep
index 95e0a2f..6ad338a 100644
--- a/scrapersources/postliste-oep
+++ b/scrapersources/postliste-oep
@@ -19,6 +19,11 @@ import urllib2
 
 # Try several times as the database get bigger
 writetries = 8
 
+# Try several times if there is a problem with the service
+readtries = 3
+
+read_backwards = False
+
 # http://www.oep.no/search/resultSingle.html?journalPostId=1000000
 # http://www.oep.no/search/resultSingle.html?journalPostId=3889259
@@ -106,8 +111,11 @@ def cpu_spent():
     usage = resource.getrusage(resource.RUSAGE_SELF)
     return getattr(usage, 'ru_utime') + getattr(usage, 'ru_stime')
 
+def cpu_available():
+    return resource.getrlimit(resource.RLIMIT_CPU)[0]/2 - 1
+
 def url_from_id(id):
-    return "http://www.oep.no/search/resultSingle.html?journalPostId=" + str(id)
+    return "https://www.oep.no/search/resultSingle.html?journalPostId=" + str(id)
 
 def save(data):
     problem = False
@@ -164,6 +172,7 @@ doctypemap = {
 
 def fetch_oep_entry(id, datastorage):
     oepurl = url_from_id(id)
+#    print "Fetching %s" % oepurl
     html = scraperwiki.scrape(oepurl)
     root = lxml.html.fromstring(html.decode('utf-8'))
     data = { 'journalPostId' : id }
@@ -194,6 +203,7 @@ def fetch_oep_entry(id, datastorage):
     return 0
 
 def fetch_range(datastorage, first, last, step):
+    global readtries
     myskiplimit = skiplimit
     skipped = 0
     fetched = 0
@@ -202,7 +212,7 @@ def fetch_range(datastorage, first, last, step):
         if id < 0:
             break
         try:
-            tries = 3
+            tries = readtries
             while 0 < tries:
                 tries = tries - 1
                 try:
@@ -223,12 +233,14 @@ def fetch_range(datastorage, first, last, step):
                         myskiplimit = skiplimit
                     break
                 except urllib2.HTTPError, e: # Because HTTPError lack reason due to bug
-                    print "URLError triggered for url " + url_from_id(id) + ", trying again: " + str(e.msg)
+                    print "HTTPError triggered for url " + url_from_id(id) + ", trying again: " + str(e.msg)
                 except urllib2.URLError, e:
                     print "URLError triggered for url " + url_from_id(id) + ", trying again: " + str(e.reason)
                 except httplib.BadStatusLine, e: # e.msg do not exist. trying .reason 2012-06-25
                     print "BadStatusLine triggered for url " + url_from_id(id) + ", trying again: " + str(e.reason)
+            if 0 == tries:
+                raise urllib2.URLError("unable to talk to service, giving up.")
 
         if skipped >= myskiplimit:
             print "Reached end of list, exiting at " + str(id)
@@ -243,7 +255,7 @@ def fetch_range(datastorage, first, last, step):
                 min_id = id
                 # print "Updating min_id to " + str(min_id)
                 save_var('min_tested_id', min_id)
-        if cpu_spent() > 79:
+        if cpu_spent() > cpu_available():
            print "Running short on CPU time, exiting at " + str(id)
            break
        time.sleep(0.2)
@@ -357,9 +369,10 @@ try:
 except scraperwiki.sqlite.SqliteError:
     pass
 
-fetched = fetch_range(datastorage, max + 1, max + count, 1)
+fetched = 0
+fetched = fetched + fetch_range(datastorage, max + 1, max + count, 1)
 print "Fetched " + str(fetched) + " new journal entries, cpu spent: " + str(cpu_spent())
-if min >= 0:
+if min >= 0 and read_backwards:
    fetched = fetch_range(datastorage, min, min - count, -1)
    print "Fetched " + str(fetched) + " old journal entries, cpu spent: " + str(cpu_spent())
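The retry change above replaces the hardcoded tries = 3 with the shared readtries setting and, once the budget is exhausted, raises instead of silently moving on to the next id. A minimal standalone sketch of the same bounded-retry pattern, in the scraper's own Python 2 idiom (fetch_with_retries and the fetch callable are hypothetical names introduced here, not part of the scraper):

    import time
    import urllib2

    def fetch_with_retries(url, fetch, tries=3, delay=0.2):
        # Try the fetch a bounded number of times, sleeping briefly
        # between attempts, then give up loudly instead of looping on.
        while 0 < tries:
            tries = tries - 1
            try:
                return fetch(url)
            except urllib2.HTTPError, e:
                # HTTPError lacks .reason in old urllib2, so use .msg
                print "HTTPError for " + url + ", trying again: " + str(e.msg)
            except urllib2.URLError, e:
                print "URLError for " + url + ", trying again: " + str(e.reason)
            time.sleep(delay)
        raise urllib2.URLError("unable to talk to service, giving up.")

Note that HTTPError must be caught before URLError, since it is a subclass; the diff's except clauses are ordered the same way.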
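The other robustness change swaps the fixed 79-second CPU budget for one derived from the process's own rlimit, so the scraper can exit cleanly before the platform kills it. A sketch of how the two helpers fit together; the RLIM_INFINITY guard is an assumption added here for safety, not something the scraper itself does:

    import resource

    def cpu_spent():
        # user + system CPU seconds consumed by this process so far
        usage = resource.getrusage(resource.RUSAGE_SELF)
        return usage.ru_utime + usage.ru_stime

    def cpu_available():
        # The soft RLIMIT_CPU is the CPU-seconds budget enforced by the
        # kernel; stop at half of it, minus a margin, to leave headroom.
        soft, hard = resource.getrlimit(resource.RLIMIT_CPU)
        if soft == resource.RLIM_INFINITY:
            return 79  # assumed fallback: the old fixed budget
        return soft / 2 - 1

In the main loop the guard then becomes "if cpu_spent() > cpu_available(): break", exactly as in the hunk at line 255.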