author     Petter Reinholdtsen <pere@hungry.com>    2014-12-09 11:46:16 +0100
committer  Petter Reinholdtsen <pere@hungry.com>    2014-12-09 11:46:16 +0100
commit     a51e734b0ba2e335f697900cbf21cd5405ab625b (patch)
tree       8c4ecb87fec3a5b448bf8db1973849b7636cbef1
parent     e720e189eb9724dcf20430a4a79d49f111f6916e (diff)
Get working in new environment and make more robust.
-rw-r--r--  scrapersources/postliste-oep  |  25  +++++++++++++++++++------
1 file changed, 19 insertions(+), 6 deletions(-)
diff --git a/scrapersources/postliste-oep b/scrapersources/postliste-oep
index 95e0a2f..6ad338a 100644
--- a/scrapersources/postliste-oep
+++ b/scrapersources/postliste-oep
@@ -19,6 +19,11 @@ import urllib2
# Try several times as the database gets bigger
writetries = 8
+# Try several times if there is a problem with the service
+readtries = 3
+
+read_backwards = False
+
# http://www.oep.no/search/resultSingle.html?journalPostId=1000000
# http://www.oep.no/search/resultSingle.html?journalPostId=3889259
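
The first hunk introduces two module-level tunables: readtries caps how many times a single journal entry is re-requested when the service misbehaves, and read_backwards (off by default) gates the downward scan enabled at the end of the script. A minimal sketch of the retry idea in isolation, using a hypothetical fetch_with_retries helper rather than the scraper's own loop:

    import urllib2

    readtries = 3           # attempts per entry before giving up
    read_backwards = False  # also scan below the lowest known id?

    def fetch_with_retries(url):
        # Hypothetical helper (not in the patch) showing how readtries
        # is consumed: retry transient failures, then give up loudly.
        for attempt in range(readtries):
            try:
                return urllib2.urlopen(url).read()
            except urllib2.URLError, e:
                print "fetch of %s failed, trying again: %s" % (url, e)
        raise urllib2.URLError("unable to talk to service, giving up.")
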
@@ -106,8 +111,11 @@ def cpu_spent():
usage = resource.getrusage(resource.RUSAGE_SELF)
return getattr(usage, 'ru_utime') + getattr(usage, 'ru_stime')
+def cpu_available():
+ return resource.getrlimit(resource.RLIMIT_CPU)[0]/2 - 1
+
def url_from_id(id):
- return "http://www.oep.no/search/resultSingle.html?journalPostId=" + str(id)
+ return "https://www.oep.no/search/resultSingle.html?journalPostId=" + str(id)
def save(data):
problem = False
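
The new cpu_available() replaces a hard-coded CPU budget with one derived from the process rlimit: half of the soft RLIMIT_CPU, minus a one-second safety margin. A minimal sketch of the same idea, with an extra guard (my assumption, not in the patch) for environments where no limit is set and getrlimit() returns RLIM_INFINITY:

    import resource

    def cpu_available():
        soft, hard = resource.getrlimit(resource.RLIMIT_CPU)
        if soft == resource.RLIM_INFINITY:
            return 60 * 60  # hypothetical fallback budget when unlimited
        return soft / 2 - 1  # half the soft limit, minus a safety margin
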
@@ -164,6 +172,7 @@ doctypemap = {
def fetch_oep_entry(id, datastorage):
oepurl = url_from_id(id)
+# print "Fetching %s" % oepurl
html = scraperwiki.scrape(oepurl)
root = lxml.html.fromstring(html.decode('utf-8'))
data = { 'journalPostId' : id }
@@ -194,6 +203,7 @@ def fetch_oep_entry(id, datastorage):
return 0
def fetch_range(datastorage, first, last, step):
+ global readtries
myskiplimit = skiplimit
skipped = 0
fetched = 0
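
A side note on the global readtries added here: Python only requires a global statement when a function rebinds a module-level name; plain reads resolve to module scope anyway, so in fetch_range the declaration mostly documents intent. A small illustration:

    readtries = 3

    def read_only():
        return readtries       # reading a module-level name needs no global

    def rebind():
        global readtries       # required because the name is reassigned
        readtries = readtries - 1
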
@@ -202,7 +212,7 @@ def fetch_range(datastorage, first, last, step):
if id < 0:
break
try:
- tries = 3
+ tries = readtries
while 0 < tries:
tries = tries - 1
try:
@@ -223,12 +233,14 @@ def fetch_range(datastorage, first, last, step):
myskiplimit = skiplimit
break
except urllib2.HTTPError, e: # Because HTTPError lacks .reason due to a bug
- print "URLError triggered for url " + url_from_id(id) + ", trying again: " + str(e.msg)
+ print "HTTPError triggered for url " + url_from_id(id) + ", trying again: " + str(e.msg)
except urllib2.URLError, e:
print "URLError triggered for url " + url_from_id(id) + ", trying again: " + str(e.reason)
except httplib.BadStatusLine, e:
# e.msg does not exist; trying .reason instead (2012-06-25)
print "BadStatusLine triggered for url " + url_from_id(id) + ", trying again: " + str(e.reason)
+ if 0 == tries:
+ raise urllib2.URLError("unable to talk to service, giving up.")
if skipped >= myskiplimit:
print "Reached end of list, exiting at " + str(id)
@@ -243,7 +255,7 @@ def fetch_range(datastorage, first, last, step):
min_id = id
# print "Updating min_id to " + str(min_id)
save_var('min_tested_id', min_id)
- if cpu_spent() > 79:
+ if cpu_spent() > cpu_available():
print "Running short on CPU time, exiting at " + str(id)
break
time.sleep(0.2)
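
Swapping the literal 79-second threshold for cpu_available() makes the exit point track whatever limit the environment actually imposes. The pattern generalises to any resource-bounded loop; a compact sketch, assuming the two helpers shown above (work_queue and process are hypothetical placeholders):

    for id in work_queue:        # hypothetical iterable of pending ids
        process(id)              # hypothetical per-entry work
        if cpu_spent() > cpu_available():
            print "Running short on CPU time, exiting at " + str(id)
            break
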
@@ -357,9 +369,10 @@ try:
except scraperwiki.sqlite.SqliteError:
pass
-fetched = fetch_range(datastorage, max + 1, max + count, 1)
+fetched = 0
+fetched = fetched + fetch_range(datastorage, max + 1, max + count, 1)
print "Fetched " + str(fetched) + " new journal entries, cpu spent: " + str(cpu_spent())
-if min >= 0:
+if min >= 0 and read_backwards:
fetched = fetch_range(datastorage, min, min - count, -1)
print "Fetched " + str(fetched) + " old journal entries, cpu spent: " + str(cpu_spent())