diff options
author | Petter Reinholdtsen <pere@hungry.com> | 2016-04-08 08:54:50 +0200 |
---|---|---|
committer | Petter Reinholdtsen <pere@hungry.com> | 2016-04-08 08:54:50 +0200 |
commit | d2a04f43eee94ab715fabd7ef175b5177eab709a (patch) | |
tree | 3a2f590d2bd29e817299fd33bbdf6ffd5917e902 | |
parent | e37a8fde9b4625621d9bf4c209d78c9fbbe9d13b (diff) |
Make rescanning optional.
-rw-r--r-- | scrapersources/postliste-oep | 32 |
1 files changed, 22 insertions, 10 deletions
```diff
diff --git a/scrapersources/postliste-oep b/scrapersources/postliste-oep
index b305f53..06a7b3a 100644
--- a/scrapersources/postliste-oep
+++ b/scrapersources/postliste-oep
@@ -22,8 +22,16 @@ writetries = 8
 # Try several times if there is a problem with the service
 readtries = 3
 
+# Set to false to not look for earlier entries before the earliest in
+# the database.
 read_backwards = True
 
+# Set to False not not rescan entries with a latency to discover
+# updates and entries introduced after we checked the ID the first
+# time.
+rescan_after_a_while = True
+
+
 # http://www.oep.no/search/resultSingle.html?journalPostId=1000000
 # http://www.oep.no/search/resultSingle.html?journalPostId=3889259
 
@@ -418,13 +426,17 @@ if min >= 0 and read_backwards:
     fetched = fetch_range(datastorage, min, min - count, -1)
     print "Fetched " + str(fetched) + " old journal entries, cpu spent: " + str(cpu_spent())
 
-# Rescan to see if we missed something, and to get the latest version
-rescan_min = scraperwiki.sqlite.get_var('min_rescan_id')
-if rescan_min is None:
-    rescan_min = 0
-rescan_count = 8000
-if rescan_min + rescan_count < max - 100000:
-    end = rescan_min + rescan_count
-    fetched = fetch_range(datastorage, rescan_min, end, 1)
-    save_var('min_rescan_id', end - 1)
-    print "Fetched %d rescanned journal entries (%d-%d), cpu spent: %f" % (fetched, rescan_min, end, cpu_spent())
+if rescan_after_a_while:
+    rescan_count = 8000
+    rescan_latency = 100000
+
+    # Rescan to see if we missed something, and to get the latest version
+    rescan_min = scraperwiki.sqlite.get_var('min_rescan_id')
+    if rescan_min is None:
+        rescan_min = 0
+    if rescan_min + rescan_count < max - rescan_latency:
+        end = rescan_min + rescan_count
+        fetched = fetch_range(datastorage, rescan_min, end, 1)
+        save_var('min_rescan_id', end - 1)
+        print "Fetched %d rescanned journal entries (%d-%d), cpu spent: %f" \
+            % (fetched, rescan_min, end, cpu_spent())
```