author	Petter Reinholdtsen <pere@hungry.com>	2016-10-02 11:40:48 +0200
committer	Petter Reinholdtsen <pere@hungry.com>	2016-10-02 11:40:48 +0200
commit	f903a2e46be2fdca5fcf05426b357906537d8eb0 (patch)
tree	4188e3aaf524a41134225aa95d1b9af6c9177dc6
parent	191ad0a3d8adec41269a1fa120239d8cd202c579 (diff)
Add CPU limit and report the number of records fetched.
-rw-r--r--	scrapersources/postliste-oslo-kommune-byraadsavdelingene	31
1 file changed, 28 insertions(+), 3 deletions(-)
diff --git a/scrapersources/postliste-oslo-kommune-byraadsavdelingene b/scrapersources/postliste-oslo-kommune-byraadsavdelingene
index b0c0d30..f2601e6 100644
--- a/scrapersources/postliste-oslo-kommune-byraadsavdelingene
+++ b/scrapersources/postliste-oslo-kommune-byraadsavdelingene
@@ -15,6 +15,8 @@ import urllib
 import urllib2
 import lxml.html
 import re
+import resource
+import sys
 import dateutil.parser
 import datetime
 from dateutil.relativedelta import relativedelta
@@ -53,6 +55,18 @@ fieldmap = {
 class NoDataEntries(LookupError):
     pass
 
+def cpu_spent():
+    usage = resource.getrusage(resource.RUSAGE_SELF)
+    return usage.ru_utime + usage.ru_stime
+
+def cpu_available():
+    available = resource.getrlimit(resource.RLIMIT_CPU)[0]
+    # If no limit is set, assume 20 CPU seconds as the limit to avoid
+    # running for more than a few minutes every time.
+    if 0 > available:
+        available = 20
+    return available
+
 def parse_day_html(parser, datastore, dayurl, html):
     root = lxml.html.fromstring(html)
     count = 0
@@ -108,6 +122,7 @@ def parse_day_html(parser, datastore, dayurl, html):
 def fetch_day(parser, day):
     datastore = []
     daystr = day.strftime('%d.%m.%Y')
+    totalcount = 0
     try:
         offset = 0
         offsetstep = 10
@@ -116,10 +131,11 @@ def fetch_day(parser, day):
             html = postlistelib.fetch_url_harder(dayurl).decode('utf-8')
             # print html
             count = parse_day_html(parser, datastore, dayurl, html)
+            totalcount = totalcount + count
             # print count, dayurl
             if 0 == count:
                 # print "Ending day at offset %d" % offset
-                return
+                return totalcount
             scraperwiki.sqlite.save(unique_keys=['arkivsaksref'], data=datastore)
             datastore = []
             offset = offset + offsetstep
@@ -144,12 +160,21 @@ if not newest:
 skiplimit = 10
+totalcount = 0
 # Look forward one week to at least get past the weekends
 for n in xrange(skiplimit):
-    fetch_day(parser, newest + aday * n)
+    totalcount = totalcount + fetch_day(parser, newest + aday * n)
+    if cpu_spent() > cpu_available():
+        print "Running short on CPU time, exiting"
+        sys.exit(0)
 
 for n in xrange(skiplimit):
-    fetch_day(parser, oldest - aday * n)
+    totalcount = totalcount + fetch_day(parser, oldest - aday * n)
+    if cpu_spent() > cpu_available():
+        print "Running short on CPU time, exiting"
+        sys.exit(0)
+
+print "Fetched %d journal entries" % totalcount
 
 # FIXME should rescan after a while to make sure we get all the
 # entries when moving forward
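
For reference, a minimal standalone sketch of the CPU-budget pattern this
commit introduces (Python 2, to match the scraper; cpu_spent and
cpu_available mirror the diff above, while do_some_work is a hypothetical
stand-in for the fetch-and-parse loop):

    import resource
    import sys

    def cpu_spent():
        # User plus system CPU time consumed by this process so far.
        usage = resource.getrusage(resource.RUSAGE_SELF)
        return usage.ru_utime + usage.ru_stime

    def cpu_available():
        # Soft RLIMIT_CPU limit; getrlimit() returns -1 (RLIM_INFINITY)
        # when no limit is set, so fall back to a 20 CPU-second budget.
        available = resource.getrlimit(resource.RLIMIT_CPU)[0]
        if 0 > available:
            available = 20
        return available

    def do_some_work(n):
        # Hypothetical stand-in for fetching and parsing one day's records.
        sum(i * i for i in xrange(100000))

    for n in xrange(1000):
        do_some_work(n)
        if cpu_spent() > cpu_available():
            print "Running short on CPU time, exiting"
            sys.exit(0)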
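
The limit that cpu_available() reads can also be imposed from inside the
process; a sketch assuming a POSIX system, with an arbitrary 30-second
figure:

    import resource

    # Set a 30 CPU-second soft and hard limit on the current process;
    # cpu_available() will then return 30 instead of the 20-second fallback.
    resource.setrlimit(resource.RLIMIT_CPU, (30, 30))
    print resource.getrlimit(resource.RLIMIT_CPU)  # (30, 30)

Exiting via sys.exit() before the budget is exhausted matters because once
the soft RLIMIT_CPU is exceeded the kernel sends SIGXCPU, which would kill
the scraper mid-save rather than letting it finish the current batch
cleanly.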