| author | Petter Reinholdtsen <pere@hungry.com> | 2014-12-10 20:08:00 +0100 |
|---|---|---|
| committer | Petter Reinholdtsen <pere@hungry.com> | 2014-12-10 20:08:00 +0100 |
| commit | 6445f4f308efdd9f6d6ff78ded62d4045b4af86a (patch) | |
| tree | bc5ac4f3ff8f48c55661a3659dfe1ceb32c20618 | |
| parent | 5ed5a0bd4551c8c853ad631cd094311f86136a28 (diff) | |
Add code to try again to load some broken entries in the database.
Increase the amount fetched in the rescan code from 3000 to 5000.
-rw-r--r-- | scrapersources/postliste-oep | 22 |
1 file changed, 21 insertions, 1 deletion
```diff
diff --git a/scrapersources/postliste-oep b/scrapersources/postliste-oep
index 86ea66b..807a11e 100644
--- a/scrapersources/postliste-oep
+++ b/scrapersources/postliste-oep
@@ -327,6 +327,24 @@ def remove_original():
     scraperwiki.sqlite.commit()
     exit(0)
 
+# Fetch again some crap entries that ended up in the database when the
+# script was slightly broken and filled in non-existing entries in the
+# SQL database.
+def reparse_strange_entries(datastorage):
+    for idref in scraperwiki.sqlite.select("journalPostId FROM swdata WHERE caseid IS NULL"):
+        id = idref['journalPostId']
+        if -1 == fetch_oep_entry(id, datastorage):
+            print "Refetching %d failed, flush ID" % id
+            scraperwiki.sqlite.execute("DELETE from swdata where journalPostId = %d" % id)
+        else:
+            print "Refetching %d" % id
+        if 50 <= len(datastorage):
+            save(data=datastorage)
+            datastorage = []
+    if 0 < len(datastorage):
+        save(data=datastorage)
+        datastorage = []
+
 #update_caseyear()
 
 #create_indexes()
@@ -341,6 +359,8 @@ print "Starting to fetch journal entries " + str(datetime.datetime.now())
 scraperwiki.scrape("http://www.oep.no/")
 
 datastorage = []
+reparse_strange_entries(datastorage)
+
 # Update entries to handle <URL: https://rt.nuug.no:443/Ticket/Display.html?id=6342 >.
 # Used 2012-09-17
 #scraperwiki.sqlite.execute("DELETE from swdata where journalPostId = 638167")
@@ -380,7 +400,7 @@ if min >= 0 and read_backwards:
 rescan_min = scraperwiki.sqlite.get_var('min_rescan_id')
 if rescan_min is None:
     rescan_min = 0
-rescan_count = 3000
+rescan_count = 5000
 if rescan_min < max:
     end = rescan_min + rescan_count
     fetched = fetch_range(datastorage, rescan_min, end, 1)
```
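For readers who want to follow the logic outside the scraper, below is a minimal, self-contained sketch of the retry-and-flush pattern that the new reparse_strange_entries() implements: select the rows whose caseid is NULL, try to refetch each one, delete the row when the refetch fails, and save accumulated rows in batches of 50. It uses the standard sqlite3 module instead of the scraperwiki wrapper, and fetch_entry() and save_rows() are hypothetical stand-ins for the script's fetch_oep_entry() and save() helpers, so treat it as an illustration rather than a drop-in replacement.

```python
import sqlite3


def fetch_entry(journal_post_id, datastorage):
    """Placeholder for the script's fetch_oep_entry(); return -1 on failure,
    otherwise append the refetched row to datastorage and return 0."""
    return -1


def save_rows(rows):
    """Placeholder for the script's save() helper, which stores rows in the
    database (scraperwiki.sqlite.save in the real scraper)."""
    pass


def reparse_strange_entries(conn, datastorage, batchsize=50):
    # Rows with a NULL caseid are the broken entries left behind by the
    # earlier bug; try to fetch each of them again.
    cur = conn.execute("SELECT journalPostId FROM swdata WHERE caseid IS NULL")
    for (journal_post_id,) in cur.fetchall():
        if fetch_entry(journal_post_id, datastorage) == -1:
            # Refetching failed, so drop the broken row instead of keeping it.
            conn.execute("DELETE FROM swdata WHERE journalPostId = ?",
                         (journal_post_id,))
        if len(datastorage) >= batchsize:
            save_rows(datastorage)   # flush a full batch
            del datastorage[:]
    if datastorage:
        save_rows(datastorage)       # flush whatever is left
        del datastorage[:]
    conn.commit()
```

One small difference from the committed code: the sketch clears the shared list in place with del datastorage[:], so the caller's list is also emptied after each flush, whereas the committed code rebinds the local name datastorage to a new list.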