aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorPetter Reinholdtsen <pere@hungry.com>2014-12-10 20:08:00 +0100
committerPetter Reinholdtsen <pere@hungry.com>2014-12-10 20:08:00 +0100
commit6445f4f308efdd9f6d6ff78ded62d4045b4af86a (patch)
treebc5ac4f3ff8f48c55661a3659dfe1ceb32c20618
parent5ed5a0bd4551c8c853ad631cd094311f86136a28 (diff)
Add code to try again to load some broken entries in the database.
Increase the amount fetched in the rescan code from 3000 to 5000.
-rw-r--r--scrapersources/postliste-oep22
1 files changed, 21 insertions, 1 deletions
diff --git a/scrapersources/postliste-oep b/scrapersources/postliste-oep
index 86ea66b..807a11e 100644
--- a/scrapersources/postliste-oep
+++ b/scrapersources/postliste-oep
@@ -327,6 +327,24 @@ def remove_original():
scraperwiki.sqlite.commit()
exit(0)
+# Fetch again some crap entries that ended up in the database when the
+# script was slightly broken and filled in non-existing entries in the
+# SQL database.
+def reparse_strange_entries(datastorage):
+ for idref in scraperwiki.sqlite.select("journalPostId FROM swdata WHERE caseid IS NULL"):
+ id = idref['journalPostId']
+ if -1 == fetch_oep_entry(id, datastorage):
+ print "Refetching %d failed, flush ID" % id
+ scraperwiki.sqlite.execute("DELETE from swdata where journalPostId = %d" % id)
+ else:
+ print "Refetching %d" % id
+ if 50 <= len(datastorage):
+ save(data=datastorage)
+ datastorage = []
+ if 0 < len(datastorage):
+ save(data=datastorage)
+ datastorage = []
+
#update_caseyear()
#create_indexes()
@@ -341,6 +359,8 @@ print "Starting to fetch journal entries " + str(datetime.datetime.now())
scraperwiki.scrape("http://www.oep.no/")
datastorage = []
+reparse_strange_entries(datastorage)
+
# Update entries to handle <URL: https://rt.nuug.no:443/Ticket/Display.html?id=6342 >.
# Used 2012-09-17
#scraperwiki.sqlite.execute("DELETE from swdata where journalPostId = 638167")
@@ -380,7 +400,7 @@ if min >= 0 and read_backwards:
rescan_min = scraperwiki.sqlite.get_var('min_rescan_id')
if rescan_min is None:
rescan_min = 0
-rescan_count = 3000
+rescan_count = 5000
if rescan_min < max:
end = rescan_min + rescan_count
fetched = fetch_range(datastorage, rescan_min, end, 1)