# -*- coding: utf-8 -*-
# YAML-tagger:
#  Type: statlig
#  Status: finished
#  Name: Offentlig Elektronisk postjournal
#  Format: HTML
#  Datatype:
#  Vendor: DIFI
#  Run: hourly

import scraperwiki
import lxml.html
import datetime
import time
import resource
import httplib
import urllib2

# Try several times as the database gets bigger
writetries = 8

# Try several times if there is a problem with the service
readtries = 3

# Set to False to not look for earlier entries before the earliest in
# the database.
read_backwards = True

# Set to False to not rescan entries with a latency, to discover
# updates and entries introduced after we checked the ID the first
# time.
rescan_after_a_while = True

# http://www.oep.no/search/resultSingle.html?journalPostId=1000000
# http://www.oep.no/search/resultSingle.html?journalPostId=3889259
#
# Example of the data shown for a single journal entry:
# Agency: Kulturdepartementet
# Case: DNT Oslo og Omegn - rehabilitering og utvidelse av turisthytta Snøheim på Dovre - spillemidler til anlegg for friluftsliv i fjellet 2011
# Document title: DNT Oslo og Omegn - turisthytta Snøheim på Dovre - eventuelt navnebytte
# Case number: 2010/04027
# Document number: 4
# Document type: Outgoing
# Recipient: Den Norske Turistforening
# Document date: 2010-12-13
# Record entry date: 2010-12-14
# Published in OEP: 2011-01-03
# Grounds for exemption, document:
# Archive code:
# Contact point:
#   Tel.: 22 24 90 90
#   Email: postmottak@kud.dep.no
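
# The rows in the snapshot above are mapped to swdata columns via fieldmap
# below; e.g. "Case number: 2010/04027" is stored as caseid = '2010/04027'
# with caseyear = 2010 and caseseqnr = 4027.  A minimal, hedged test sketch
# (the ID is taken from the example URLs above, and the values returned
# depend on what oep.no serves at the time) could look like this:
#
#testentries = []
#if 0 == fetch_oep_entry(3889259, testentries):
#    # testentries[0] is a dict keyed by journalPostId, agency, caseid,
#    # caseyear, caseseqnr, docdesc, doctype, scrapestamputc, ...
#    save(data=testentries)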
def cpu_spent():
    usage = resource.getrusage(resource.RUSAGE_SELF)
    return getattr(usage, 'ru_utime') + getattr(usage, 'ru_stime')

def cpu_available():
    available = resource.getrlimit(resource.RLIMIT_CPU)[0]
    # If no limit is set, assume 20 CPU seconds as the limit to avoid
    # running for more than a few minutes every time.
    if 0 > available:
        available = 20
    return available

def url_from_id(id):
    return "https://www.oep.no/search/resultSingle.html?journalPostId=" + str(id)

def save(data):
    problem = False
    for run in range(0, writetries):
        try:
            scraperwiki.sqlite.save(unique_keys=['journalPostId'], data=data)
            if problem:
                print "Sqlite write succeeded"
            return
        except scraperwiki.sqlite.SqliteError, e:
            print "Sqlite write error, trying again: " + str(e)
            time.sleep(22)
            problem = True
    raise scraperwiki.sqlite.SqliteError("Unable to write to database, tried " + str(writetries) + " times")

def save_var(var, data):
    problem = False
    for run in range(0, writetries):
        try:
            scraperwiki.sqlite.save_var(var, data)
            if problem:
                print "Sqlite write succeeded"
            return
        except scraperwiki.sqlite.SqliteError, e:
            print "Sqlite write error, trying again: " + str(e)
            time.sleep(22)
            problem = True
    raise scraperwiki.sqlite.SqliteError("Unable to write variable " + var + " to database, tried " + str(writetries) + " times")

fieldmap = {
    'Agency' : 'agency',
    'Record entry date' : 'recorddate',
    'Case' : 'casedesc',
    'Case number' : 'caseid',
    'Document number' : 'casedocseq',
    'Document date' : 'docdate',
    'Document title' : 'docdesc',
    'Document type' : 'doctype',
    'Grounds for exemption document' : 'exemption',
    'Recipient' : 'recipient',
    'Sender' : 'sender',
    'Published in OEP' : 'recordpublishdate',
#    'Archive code',
#    'Contact point',
#    'journalPostId',
#    'scrapestamputc',
}

doctypemap = {
    'Incoming' : 'I',
    'Outgoing' : 'U',
    'internal' : 'X',
}

def fetch_oep_entry(id, datastorage):
    oepurl = url_from_id(id)
#    print "Fetching %s" % oepurl
    try:
        html = scraperwiki.scrape(oepurl)
    except urllib2.HTTPError, e:
        return -1
    root = lxml.html.fromstring(html.decode('utf-8'))
    data = { 'journalPostId' : id }
    for tr in root.cssselect("table.defaultTable tr"):
        vtype = tr.cssselect("th")[0].text_content().strip().replace(":", "").replace(",", "")
        value = tr.cssselect("td")[0].text_content().strip()
#        print '"' + vtype + '"', '"' + value + '"'
        if (vtype == 'Record entry date' and value == 'Not stated.') or \
           (vtype == 'Document type' and value == '-') or \
           (vtype == 'Case number' and value == ''):
            return -1
        if vtype in fieldmap:
            vtype = fieldmap[vtype]
        if 'doctype' == vtype:
            value = doctypemap[value]
        if 'exemption' == vtype and '-' == value:
            value = None
        if 'caseid' == vtype:
            caseyear, caseseqnr = value.split("/")
            data['caseyear'] = int(caseyear)
            data['caseseqnr'] = int(caseseqnr)
        data[vtype] = value
#    print str(id) + ": " + str(data)
    data['scrapestamputc'] = datetime.datetime.now()
#    print data['scrapestamputc']
#    exit()
    datastorage.append(data)
#    scraperwiki.sqlite.save(unique_keys=['journalPostId'], data=data)
    return 0

def fetch_range(datastorage, first, last, step):
    global readtries
    myskiplimit = skiplimit
    skipped = 0
    fetched = 0
    min_id = first
    for id in range(first, last, step):
        if id < 0:
            break
        try:
            tries = readtries
            while 0 < tries:
                tries = tries - 1
                try:
                    if -1 == fetch_oep_entry(id, datastorage):
                        skipped = skipped + 1
                        if skipped == myskiplimit and myskiplimit == skiplimit:
                            tmp = []
                            for limit in [250, 500, 800, 1000, 1200, 1500, 1700, 2000, 3000, 5000, 7000]:
                                testid = id + limit * step
                                if -1 != fetch_oep_entry(testid, tmp):
                                    print "Looking " + str(limit) + " ahead, found " + url_from_id(testid)
                                    myskiplimit = skiplimit + limit + 1
                                    break
                        break
                    else:
                        fetched = fetched + 1
                        skipped = 0
                        myskiplimit = skiplimit
                        break
                except urllib2.HTTPError, e:
                    # Because HTTPError lacks .reason due to a bug
                    print "HTTPError triggered for url " + url_from_id(id) + ", trying again: " + str(e.msg)
                except urllib2.URLError, e:
                    print "URLError triggered for url " + url_from_id(id) + ", trying again: " + str(e.reason)
                except httplib.BadStatusLine, e:
                    # e.msg does not exist, trying .reason 2012-06-25
                    print "BadStatusLine triggered for url " + url_from_id(id) + ", trying again: " + str(e.reason)
            if 0 == tries:
                raise urllib2.URLError("unable to talk to service, giving up.")
            if skipped >= myskiplimit:
                print "Reached end of list, exiting at " + str(id)
                break
            if 50 <= len(datastorage):
                save(data=datastorage)
                datastorage = []
            # Only do this for every 50 IDs tested, to avoid spending too
            # many CPU seconds updating the sqlite file
            if 0 == (id % 50):
                if id < min_id:
                    min_id = id
#                    print "Updating min_id to " + str(min_id)
                    save_var('min_tested_id', min_id)
            if cpu_spent() > cpu_available():
                print "Running short on CPU time, exiting at " + str(id)
                break
            time.sleep(0.2)
        except scraperwiki.CPUTimeExceededError:
            if 0 < len(datastorage):
                save(data=datastorage)
                datastorage = []
            print "CPU exception caught"
            raise
        except:
            print "Error, unexpected exception"
            raise
    if 0 < len(datastorage):
        save(data=datastorage)
        datastorage = []
    return fetched

def rename_sql_columns():
    print "Dropping temp table"
    scraperwiki.sqlite.execute("DROP TABLE IF EXISTS swdatanew")
    print "Creating table"
    scraperwiki.sqlite.execute("CREATE TABLE IF NOT EXISTS swdatanew (agency text, recorddate text, casedesc text, caseid text, casedocseq integer, docdate text, docdesc text, doctype text, exemption text, recipient text, sender text, recordpublishdate text, `Archive code` text, `Contact point` text, `journalPostId` integer, scrapestamputc text)")
    print "Copying table"
    scraperwiki.sqlite.execute("INSERT INTO swdatanew(agency, recorddate, casedesc, caseid, casedocseq, docdate, docdesc, doctype, exemption, recipient, sender, recordpublishdate, `Archive code`, `Contact point`, `journalPostId`, scrapestamputc) SELECT `Agency`, `Record entry date`, `Case`, `Case number`, `Document number`, `Document date`, `Document title`, `Document type`, `Grounds for exemption document`, `Recipient`, `Sender`, `Published in OEP`, `Archive code`, `Contact point`, `journalPostId`, `scrapestamputc` FROM swdata")
    scraperwiki.sqlite.execute("ALTER TABLE swdata RENAME TO swdataold")
    scraperwiki.sqlite.execute("ALTER TABLE swdatanew RENAME TO swdata")
    scraperwiki.sqlite.commit()
    exit(0)

def create_indexes():
    for field in ['doctype', 'agency', 'recorddate', 'caseid']:
        print "Creating %s index" % field
        scraperwiki.sqlite.execute("CREATE INDEX IF NOT EXISTS swdata_%s_index ON swdata (%s)" % (field, field))
    scraperwiki.sqlite.commit()

def update_doctypes():
    print "Updating doctype"
    agencies = []
    for agencyref in scraperwiki.sqlite.select("distinct agency from swdata"):
        agencies.append(agencyref['agency'])
    # Updating individual agencies to try to avoid SQL timeout
    for agency in agencies:
        print "Updating doctype for " + agency
        scraperwiki.sqlite.execute("UPDATE swdata set doctype = 'I' where agency = ? and doctype = 'Incoming'", (agency,))
        scraperwiki.sqlite.execute("UPDATE swdata set doctype = 'U' where agency = ? and doctype = 'Outgoing'", (agency,))
        scraperwiki.sqlite.execute("UPDATE swdata set doctype = 'X' where agency = ? and doctype = 'internal'", (agency,))
        scraperwiki.sqlite.commit()
    exit(0)
and doctype = 'internal'", (agency)) scraperwiki.sqlite.commit() exit(0) def update_caseyear(): print "Updating caseyear and caseseqnr" agencies = [] for agencyref in scraperwiki.sqlite.select("distinct agency from swdata WHERE caseyear is NULL"): agencies.append(agencyref['agency']) # Updating individual agencies to try to avoid SQL timeout for agency in agencies: print "Updating caseyear for " + agency res = scraperwiki.sqlite.execute("select journalPostId, substr(caseid, 1, 4), substr(caseid, 6) from swdata where agency = ? and caseyear is NULL limit 2", (agency)) print res scraperwiki.sqlite.execute("UPDATE swdata set caseyear = substr(caseid, 1, 4), caseseqnr = substr(caseid, 6) where agency = ? AND caseyear is NULL", (agency)) scraperwiki.sqlite.commit() exit(0) def remove_original(): scraperwiki.sqlite.execute("DROP TABLE IF EXISTS swdataold") scraperwiki.sqlite.commit() exit(0) # Fetch again some crap entries that ended up in the database when the # script was slightly broken and filled in non-existing entries in the # SQL database. def reparse_strange_entries(datastorage): try: strange = "journalPostId FROM swdata WHERE caseid IS NULL OR scrapestamputc IS NULL OR agency IS NULL order by journalPostId" for idref in scraperwiki.sqlite.select(strange): id = idref['journalPostId'] if -1 == fetch_oep_entry(id, datastorage): print "Refetching %d failed, flush ID" % id scraperwiki.sqlite.execute("DELETE from swdata where journalPostId = %d" % id) if 0 < len(datastorage): save(data=datastorage) datastorage = [] else: print "Refetching %d" % id if 50 <= len(datastorage): save(data=datastorage) datastorage = [] time.sleep(0.2) if 0 < len(datastorage): save(data=datastorage) datastorage = [] except scraperwiki.sqlite.SqliteError, e: # Most likely no table, keep going pass #update_caseyear() #create_indexes() #rename_sql_columns() #remove_original() # This one give me SQL timeout #update_doctypes() print "Starting to fetch journal entries " + str(datetime.datetime.now()) scraperwiki.scrape("http://www.oep.no/") datastorage = [] reparse_strange_entries(datastorage) # Update entries to handle . 
# Used 2012-09-17 #scraperwiki.sqlite.execute("DELETE from swdata where journalPostId = 638167") #fetch_oep_entry(638167, datastorage) #scraperwiki.sqlite.execute("DELETE from swdata where journalPostId = 638104") #fetch_oep_entry(638104, datastorage) #scraperwiki.sqlite.commit() # Missing entry, should -1 #print fetch_oep_entry(16629772, datastorage) # Exist, should return 0 #print fetch_oep_entry(16629773, datastorage) count = 10000 skiplimit = 500 # Random value fairly close to the most recent ID when this project started 2016-04-07 max = min = startid = 16682410 try: max = scraperwiki.sqlite.select("max(journalPostId) as max from swdata")[0]["max"] if 0 < scraperwiki.sqlite.get_var('min_tested_id'): saved_min = scraperwiki.sqlite.get_var('min_tested_id') else: saved_min = min + 1 sql_min = scraperwiki.sqlite.select("min(journalPostId) as min from swdata")[0]["min"] print "Saved min: " + str(saved_min) + ", sql min: " + str(sql_min) if sql_min < saved_min: min = sql_min else: min = saved_min print "Scraping " + str(count) + " IDs below " + str(min) + " and above " + str(max) except scraperwiki.sqlite.SqliteError: pass fetched = 0 fetched = fetched + fetch_range(datastorage, max + 1, max + count, 1) print "Fetched " + str(fetched) + " new journal entries, cpu spent: " + str(cpu_spent()) if min >= 0 and read_backwards: fetched = fetch_range(datastorage, min, min - count, -1) print "Fetched " + str(fetched) + " old journal entries, cpu spent: " + str(cpu_spent()) if rescan_after_a_while: rescan_count = 8000 rescan_latency = 100000 # Rescan to see if we missed something, and to get the latest version rescan_min = scraperwiki.sqlite.get_var('min_rescan_id') if rescan_min is None: rescan_min = 0 if rescan_min + rescan_count < max - rescan_latency: end = rescan_min + rescan_count fetched = fetch_range(datastorage, rescan_min, end, 1) save_var('min_rescan_id', end - 1) print "Fetched %d rescanned journal entries (%d-%d), cpu spent: %f" \ % (fetched, rescan_min, end, cpu_spent())
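
# A hedged usage sketch, not run as part of the scrape: once swdata is
# populated (and rename_sql_columns()/create_indexes() above have been run
# so the short column names and indexes exist), the documents of a single
# case can be listed with the same select() helper used above, for example:
#
#for row in scraperwiki.sqlite.select("* FROM swdata WHERE agency = 'Kulturdepartementet' AND caseid = '2010/04027' ORDER BY casedocseq"):
#    print row['casedocseq'], row['doctype'], row['docdate'], row['docdesc']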