author	Petter Reinholdtsen <pere@hungry.com>	2016-10-02 11:40:48 +0200
committer	Petter Reinholdtsen <pere@hungry.com>	2016-10-02 11:40:48 +0200
commit	f903a2e46be2fdca5fcf05426b357906537d8eb0 (patch)
tree	4188e3aaf524a41134225aa95d1b9af6c9177dc6
parent	191ad0a3d8adec41269a1fa120239d8cd202c579 (diff)
Add CPU limit and report the number of records fetched.
-rw-r--r--	scrapersources/postliste-oslo-kommune-byraadsavdelingene	31
1 file changed, 28 insertions(+), 3 deletions(-)
diff --git a/scrapersources/postliste-oslo-kommune-byraadsavdelingene b/scrapersources/postliste-oslo-kommune-byraadsavdelingene
index b0c0d30..f2601e6 100644
--- a/scrapersources/postliste-oslo-kommune-byraadsavdelingene
+++ b/scrapersources/postliste-oslo-kommune-byraadsavdelingene
@@ -15,6 +15,8 @@ import urllib
 import urllib2
 import lxml.html
 import re
+import resource
+import sys
 import dateutil.parser
 import datetime
 from dateutil.relativedelta import relativedelta
@@ -53,6 +55,18 @@ fieldmap = {
 class NoDataEntries(LookupError):
     pass
 
+def cpu_spent():
+    usage = resource.getrusage(resource.RUSAGE_SELF)
+    return usage.ru_utime + usage.ru_stime
+
+def cpu_available():
+    available = resource.getrlimit(resource.RLIMIT_CPU)[0]
+    # If no limit is set, assume 20 CPU seconds as the limit to avoid
+    # running for more than a few minutes every time.
+    if 0 > available:
+        available = 20
+    return available
+
 def parse_day_html(parser, datastore, dayurl, html):
     root = lxml.html.fromstring(html)
     count = 0
@@ -108,6 +122,7 @@ def parse_day_html(parser, datastore, dayurl, html):
 def fetch_day(parser, day):
     datastore = []
     daystr = day.strftime('%d.%m.%Y')
+    totalcount = 0
     try:
         offset = 0
         offsetstep = 10
@@ -116,10 +131,11 @@ def fetch_day(parser, day):
             html = postlistelib.fetch_url_harder(dayurl).decode('utf-8')
             # print html
             count = parse_day_html(parser, datastore, dayurl, html)
+            totalcount = totalcount + count
             # print count, dayurl
             if 0 == count:
                 # print "Ending day at offset %d" % offset
-                return
+                return totalcount
             scraperwiki.sqlite.save(unique_keys=['arkivsaksref'], data=datastore)
             datastore = []
             offset = offset + offsetstep
@@ -144,12 +160,21 @@ if not newest:
 skiplimit = 10
+totalcount = 0
 # Look forward one week to at least get past the weekends
 for n in xrange(skiplimit):
-    fetch_day(parser, newest + aday * n)
+    totalcount = totalcount + fetch_day(parser, newest + aday * n)
+    if cpu_spent() > cpu_available():
+        print "Running short on CPU time, exiting"
+        sys.exit(0)
 
 for n in xrange(skiplimit):
-    fetch_day(parser, oldest - aday * n)
+    totalcount = totalcount + fetch_day(parser, oldest - aday * n)
+    if cpu_spent() > cpu_available():
+        print "Running short on CPU time, exiting"
+        sys.exit(0)
+
+print "Fetched %d journal entries" % totalcount
 
 # FIXME should rescan after a while to make sure we get all the
 # entries when moving forward
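
For reference, a minimal standalone sketch of the CPU-budget pattern this
commit introduces (Python 2, to match the scraper; cpu_spent and
cpu_available mirror the diff above, while do_some_work is a hypothetical
stand-in for the fetch-and-parse loop):

    import resource
    import sys

    def cpu_spent():
        # User plus system CPU time consumed by this process so far.
        usage = resource.getrusage(resource.RUSAGE_SELF)
        return usage.ru_utime + usage.ru_stime

    def cpu_available():
        # Soft RLIMIT_CPU limit; getrlimit() returns -1 (RLIM_INFINITY)
        # when no limit is set, so fall back to a 20 CPU-second budget.
        available = resource.getrlimit(resource.RLIMIT_CPU)[0]
        if 0 > available:
            available = 20
        return available

    def do_some_work(n):
        # Hypothetical stand-in for fetching and parsing one day's records.
        sum(i * i for i in xrange(100000))

    for n in xrange(1000):
        do_some_work(n)
        if cpu_spent() > cpu_available():
            print "Running short on CPU time, exiting"
            sys.exit(0)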
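
The limit that cpu_available() reads can also be imposed from inside the
process; a sketch assuming a POSIX system, with an arbitrary 30-second
figure:

    import resource

    # Set a 30 CPU-second soft and hard limit on the current process;
    # cpu_available() will then return 30 instead of the 20-second fallback.
    resource.setrlimit(resource.RLIMIT_CPU, (30, 30))
    print resource.getrlimit(resource.RLIMIT_CPU)  # (30, 30)

Exiting via sys.exit() before the budget is exhausted matters because once
the soft RLIMIT_CPU is exceeded the kernel sends SIGXCPU, which would kill
the scraper mid-save rather than letting it finish the current batch
cleanly.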