aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorPetter Reinholdtsen <pere@hungry.com>2016-04-08 08:54:50 +0200
committerPetter Reinholdtsen <pere@hungry.com>2016-04-08 08:54:50 +0200
commitd2a04f43eee94ab715fabd7ef175b5177eab709a (patch)
tree3a2f590d2bd29e817299fd33bbdf6ffd5917e902
parente37a8fde9b4625621d9bf4c209d78c9fbbe9d13b (diff)
Make rescanning optional.
-rw-r--r--scrapersources/postliste-oep32
1 files changed, 22 insertions, 10 deletions
diff --git a/scrapersources/postliste-oep b/scrapersources/postliste-oep
index b305f53..06a7b3a 100644
--- a/scrapersources/postliste-oep
+++ b/scrapersources/postliste-oep
@@ -22,8 +22,16 @@ writetries = 8
# Try several times if there is a problem with the service
readtries = 3
+# Set to False to not look for earlier entries before the earliest in
+# the database.
read_backwards = True
+# Set to False to not rescan entries with a latency to discover
+# updates and entries introduced after we checked the ID the first
+# time.
+rescan_after_a_while = True
+
+
# http://www.oep.no/search/resultSingle.html?journalPostId=1000000
# http://www.oep.no/search/resultSingle.html?journalPostId=3889259
@@ -418,13 +426,17 @@ if min >= 0 and read_backwards:
fetched = fetch_range(datastorage, min, min - count, -1)
print "Fetched " + str(fetched) + " old journal entries, cpu spent: " + str(cpu_spent())
-# Rescan to see if we missed something, and to get the latest version
-rescan_min = scraperwiki.sqlite.get_var('min_rescan_id')
-if rescan_min is None:
- rescan_min = 0
-rescan_count = 8000
-if rescan_min + rescan_count < max - 100000:
- end = rescan_min + rescan_count
- fetched = fetch_range(datastorage, rescan_min, end, 1)
- save_var('min_rescan_id', end - 1)
- print "Fetched %d rescanned journal entries (%d-%d), cpu spent: %f" % (fetched, rescan_min, end, cpu_spent())
+if rescan_after_a_while:
+ rescan_count = 8000
+ rescan_latency = 100000
+
+ # Rescan to see if we missed something, and to get the latest version
+ rescan_min = scraperwiki.sqlite.get_var('min_rescan_id')
+ if rescan_min is None:
+ rescan_min = 0
+ if rescan_min + rescan_count < max - rescan_latency:
+ end = rescan_min + rescan_count
+ fetched = fetch_range(datastorage, rescan_min, end, 1)
+ save_var('min_rescan_id', end - 1)
+ print "Fetched %d rescanned journal entries (%d-%d), cpu spent: %f" \
+ % (fetched, rescan_min, end, cpu_spent())