# -*- coding: utf-8 -*-
# YAML-tagger:
#  Type: kommune
#  Status: finished
#  Name: Oslo kommunes byrådsavdelinger og Rådhusets forvaltningstjeneste
#  Format: HTML
#  Datatype:
#  Vendor:
#  Run: daily
#  Missingfields: journalseqnr, journalyear, journalid
#  Publish duration: x months

import scraperwiki
import urllib
import urllib2
import lxml.html
import re
import resource
import dateutil.parser
import datetime
import sys
from dateutil.relativedelta import relativedelta

# Some example URLs
# http://byr-journal.cloudapp.net/Journal/SearchRelated?caseYear=2016&sequenceNumber=451
# http://byr-journal.cloudapp.net/Journal/Search?searchStringAdv=FromDate%3D01.09.2016,department%3DAll
# http://byr-journal.cloudapp.net/Journal/Search?searchStringAdv=FromDate%3D01.09.2016,ToDate%3D01.09.2016,department%3DAll
# http://byr-journal.cloudapp.net/Journal/Search/?querytype=and&offset=10
# http://byr-journal.cloudapp.net/Journal/Search/?querytype=and&FromDate=23.08.2016&ToDate=23.08.2016&offset=20
# http://byr-journal.cloudapp.net/Journal/Search?searchStringAdv=FromDate%3D23.08.2016,ToDate%3D23.08.2016,department%3DAll

scraperwiki.scrape("https://www.oslo.kommune.no/postjournal/")
postlistelib = scraperwiki.swimport('postliste-python-lib')

agency = u'Oslo kommune, Byrådsavdelingene'
baseurl = "http://www.oslo.kommune.no"

print "Fetching public journal for %s!" % agency

parser = postlistelib.JournalParser(agency=agency)

fieldmap = {
    'Dokumentdato'  : 'docdate',
    'Dokumenttype'  : 'doctype',
    'Sak'           : 'casedesc',
    'Journaldato'   : 'recorddate',
    'Dato'          : None, # Duplicate of recorddate
    'Saksansvarlig' : 'saksansvarligenhet',
    'Tilgangskode'  : 'exemption',
    'Fra'           : 'sender',
    'Til'           : 'recipient',
    'Til / Fra'     : None, # Internal note, field empty
}

class NoDataEntries(LookupError):
    pass

def cpu_spent():
    usage = resource.getrusage(resource.RUSAGE_SELF)
    return getattr(usage, 'ru_utime') + getattr(usage, 'ru_stime')

def cpu_available():
    available = resource.getrlimit(resource.RLIMIT_CPU)[0]
    # If no limit is set, assume 20 CPU seconds as the limit to avoid
    # running for more than a few minutes every time.
    if 0 > available:
        available = 20
    return available
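# The parser below relies on the following page structure, reconstructed from
# the CSS selectors it uses (an illustrative sketch only, not captured from
# the live site): each journal entry sits in a <div class="document-rows">,
# with the case reference and title in an <h3> under div.data-column, e.g.
# "2016/451-3 Some document title", and the remaining fields as <dl><dt>/<dd>
# pairs inside div.journal-recipients and div.journal-details.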
def parse_day_html(parser, datastore, dayurl, html):
    root = lxml.html.fromstring(html)
    count = 0
    for row in root.cssselect("div.document-rows"):
        count = count + 1
        data = {
            'agency' : parser.agency,
            'scrapedurl' : dayurl,
            'scrapestamputc' : datetime.datetime.now()
        }
        head = row.cssselect("div.data-column div h3")[0].text_content().strip()
        (arkivsaksref, docdesc) = head.split(" ", 1)
        data['docdesc'] = docdesc
        caseyear = 0
        caseseqnr = 0
        casedocseq = 0
        caseid = 'unknown'
        matchObj = re.match(r'(\d+)/(\d+)\s*-\s*(\d+)$', arkivsaksref, re.M|re.I)
        if matchObj:
            caseyear = int(matchObj.group(1))
            data['caseseqnr'] = int(matchObj.group(2))
            data['casedocseq'] = int(matchObj.group(3))
            data['caseyear'] = caseyear
            data['caseid'] = str(data['caseyear']) + "/" + str(data['caseseqnr'])
            data['arkivsaksref'] = arkivsaksref
        else:
            print "error: really broken Arkivsaksnr: %s" % arkivsaksref
            raise Exception("unable to parse url %s" % dayurl)
        for tagclass in ['journal-recipients', 'journal-details']:
            for d in row.cssselect("div.%s > dl" % tagclass):
                field = d.cssselect("dt")[0].text_content().strip()
                value = d.cssselect("dd")[0].text_content().strip()
                if field in fieldmap:
                    if fieldmap[field] is not None: # Ignore duplicates
                        field = fieldmap[field]
                else:
                    raise Exception("unknown field %s in %s" % (field, dayurl))
                if value and '' != value:
                    data[field] = value
        for field in ['docdate', 'recorddate']:
            if field in data:
                data[field] = dateutil.parser.parse(data[field], dayfirst=True).date()
        parser.verify_entry(data)
        datastore.append(data)
#        print data
    return count

def fetch_day(parser, day):
    datastore = []
    daystr = day.strftime('%d.%m.%Y')
    totalcount = 0
    try:
        offset = 0
        offsetstep = 10
        while True:
            dayurl = "http://byr-journal.cloudapp.net/Journal/Search/?querytype=and&FromDate=%s&ToDate=%s&offset=%d" % (daystr, daystr, offset)
            html = postlistelib.fetch_url_harder(dayurl).decode('utf-8')
#            print html
            count = parse_day_html(parser, datastore, dayurl, html)
            totalcount = totalcount + count
#            print count, dayurl
            if 0 == count:
#                print "Ending day at offset %d" % offset
                break
            offset = offset + offsetstep
        scraperwiki.sqlite.save(unique_keys=['arkivsaksref'], data=datastore)
        datastore = []
        return totalcount
    except scraperwiki.CPUTimeExceededError, e:
        print "error: Ran out of time, abort scraping"
        # Not saving, to avoid saving partial day.  Better to scrape
        # the entire day the next run.
        return 0
    except Exception, e:
#        print html
        print e
        raise

aday = datetime.timedelta(1) # one day delta
newest = None
try:
    newest = dateutil.parser.parse(scraperwiki.sqlite.select("max(recorddate) as max from swdata")[0]["max"], dayfirst=False).date()
    oldest = dateutil.parser.parse(scraperwiki.sqlite.select("min(recorddate) as min from swdata")[0]["min"], dayfirst=False).date()
except scraperwiki.sqlite.SqliteError:
    # Table not created yet, ignore the error
    pass

if not newest:
    # Bootstrap a month ago
    newest = datetime.datetime.today() - aday * 30
    oldest = newest
#print oldest, newest

skiplimit = 10
totalcount = 0
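# Worked example of the scan windows below (dates are illustrative only):
# with an empty database on 2016-09-23, the bootstrap above sets newest and
# oldest to 2016-08-24.  The forward loop then fetches 2016-08-24 through
# 2016-09-02 and the backward loop 2016-08-23 back through 2016-08-15, so
# each run extends the covered date range in both directions.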
# Look forward one week to at least get past the weekends, rescan the
# last day in case new records showed up in the meantime.  Next, scan
# backwards, one day before the oldest entry in the database.
for n in range(0, skiplimit, 1):
    day = newest + aday * n
#    print day
    totalcount = totalcount + fetch_day(parser, day)
    if cpu_spent() > (cpu_available() - 3):
        print "Running short on CPU time, exiting"
        sys.exit(0)

for n in range(-1, -skiplimit, -1):
    day = oldest + aday * n
#    print day
    totalcount = totalcount + fetch_day(parser, day)
    if cpu_spent() > (cpu_available() - 3):
        print "Running short on CPU time, exiting"
        sys.exit(0)

print "Fetched %d journal entries" % totalcount

# Need to rescan after a while to make sure we get the entries that
# take a while to show up when moving forward.  Idea: Revisit all days
# where the scrape date is less than 30 days after the record date,
# allowing records to change for 30 days until we stop rescraping
# them.  But wait 15 days before scraping again, to avoid scraping the
# same day over and over.
totalcount = 0
for drec in scraperwiki.sqlite.select("DISTINCT(recorddate) as d FROM swdata WHERE JULIANDAY(scrapestamputc) - JULIANDAY(recorddate) < 30 AND JULIANDAY('now') - JULIANDAY(scrapestamputc) > 15"):
    day = dateutil.parser.parse(drec['d'], dayfirst=False).date()
    print day
    totalcount = totalcount + fetch_day(parser, day)
    if cpu_spent() > (cpu_available() - 3):
        print "Running short on CPU time, exiting"
        sys.exit(0)

print "Rescanned %d journal entries" % totalcount
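# Worked example of the rescan window above (dates are illustrative only):
# an entry with recorddate 2016-08-20 first scraped on 2016-09-01 was scraped
# 12 days after its record date, so it stays eligible for rescanning.  It is
# left alone until 2016-09-16 (15 days after the last scrape) and is then
# revisited on later runs, until the gap between its scrape timestamp and its
# record date reaches 30 days and it drops out of the query.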