Correct day calculations and handle running out of CPU time better.

author: Petter Reinholdtsen <pere@hungry.com> 2016-10-02 22:05:15 +0200
committer: Petter Reinholdtsen <pere@hungry.com> 2016-10-02 22:05:15 +0200
commit: 6ded9f706717d494d312df1a63dfa50f9c520714 (patch)
tree: 4d5d6078dc2278b53909e0e3e0d73592a7195463
parent: f903a2e46be2fdca5fcf05426b357906537d8eb0 (diff)
1 files changed, 21 insertions, 9 deletions
diff --git a/scrapersources/postliste-oslo-kommune-byraadsavdelingene b/scrapersources/postliste-oslo-kommune-byraadsavdelingene
index f2601e6..8523e8b 100644
--- a/scrapersources/postliste-oslo-kommune-byraadsavdelingene
+++ b/scrapersources/postliste-oslo-kommune-byraadsavdelingene
@@ -135,11 +135,14 @@ def fetch_day(parser, day):
             if 0 == count:
 #                print "Ending day at offset %d" % offset
                 return totalcount
-            scraperwiki.sqlite.save(unique_keys=['arkivsaksref'], data=datastore)
-            datastore = []
             offset = offset + offsetstep
+        scraperwiki.sqlite.save(unique_keys=['arkivsaksref'], data=datastore)
+        datastore = []
+    except scraperwiki.CPUTimeExceededError, e:
+        print "error: Ran out of time, abort scraping"
+        pass
     except Exception, e:
-        print html
+#        print html
         print e
         raise
 
@@ -157,19 +160,28 @@ if not newest:
     newest = datetime.datetime.today() - aday * 30
     oldest = newest
 
+#print oldest, newest
+
 skiplimit = 10
 
 totalcount = 0
-# Look forward one week to at least get past the weekends
-for n in xrange(skiplimit):
-    totalcount = totalcount + fetch_day(parser, newest + aday * n)
-    if cpu_spent() > cpu_available():
+
+# Look forward skiplimit days to at least get past the weekends, and
+# rescan the newest day in case new records showed up in the meantime.
+for n in xrange(skiplimit+1):
+    day = newest + aday * n
+#    print day
+    totalcount = totalcount + fetch_day(parser, day)
+    if cpu_spent() > cpu_available() + 5:
         print "Running short on CPU time, exiting"
         os.exit(0)
 
+# Scan backwards, starting one day before the oldest entry in the database
 for n in xrange(skiplimit):
-    totalcount = totalcount + fetch_day(parser, oldest - aday * n)
-    if cpu_spent() > cpu_available():
+    day = oldest - aday * (n+1)
+#    print day
+    totalcount = totalcount + fetch_day(parser, day)
+    if cpu_spent() > cpu_available() + 5:
         print "Running short on CPU time, exiting"
         os.exit(0)
author	Petter Reinholdtsen <pere@hungry.com>	2016-10-02 22:05:15 +0200
committer	Petter Reinholdtsen <pere@hungry.com>	2016-10-02 22:05:15 +0200
commit	6ded9f706717d494d312df1a63dfa50f9c520714 (patch)
tree	4d5d6078dc2278b53909e0e3e0d73592a7195463
parent	f903a2e46be2fdca5fcf05426b357906537d8eb0 (diff)