# -*- coding: utf-8 -*-
# YAML-tagger:
#  Type: kommune
#  Status: finished
#  Name: Lenvik kommune
#  Format: HTML
#  Run: daily
#  Missingfields: casedesc

import scraperwiki
import urllib2
import lxml.html
import re
import dateutil.parser
from dateutil.relativedelta import relativedelta
import datetime
import urlparse

agency = "Lenvik kommune"

# Point scraperwiki GUI to the start page
scraperwiki.scrape("http://webway.lenvik.kommune.no/postjournal")

postlistelib = scraperwiki.swimport('postliste-python-lib')

parser = postlistelib.JournalParser(agency=agency)

def saver(unique_keys, data):
#    return
#    print "Not saving data"
    scraperwiki.sqlite.save(unique_keys, data)

def expand_year(year):
    year = int(year)
    if year > 50:
        year = year + 1900
    else:
        year = year + 2000
    return year
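# A quick, illustrative sanity check of expand_year(); the input values
# below are examples only, not taken from the journal itself.
assert expand_year("13") == 2013
assert expand_year("98") == 1998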
# Example of the table rows being parsed, one <tr class="yang"> per
# journal entry (tag layout reconstructed from the parser below):
#
#   <tr class="yang">
#     <td>13/00563-001</td>
#     <td>04.03.2013</td>
#     <td>I</td>
#     <td>Flytting av VPN-tunell</td>
#     <td>EVRY AS</td>
#     <td>Jan-Eirik Nordahl</td>
#     <td>Bestill</td>
#   </tr>

def fetch_postjournal_day(parser, url, html, saver):
    root = lxml.html.fromstring(html.decode('utf-8'))

    recorddate = None
    for div in root.cssselect('div'):
        divcontent = div.text_content()
        if 0 == divcontent.find("Offentlig postjournal for "):
            recorddate = dateutil.parser.parse(
                divcontent.replace("Offentlig postjournal for ", ""),
                dayfirst=True)
    print recorddate

    # Make sure we save the entire URL or nothing at all
    datastore = []
    for tr in root.cssselect('tr.yang'):
        tds = tr.cssselect("td")
        docidstr = tds[0].text_content().strip()
        docdate = tds[1].text_content().strip()
        doctype = tds[2].text_content().strip()
        docdesc = tds[3].text_content().strip()
        fratil = tds[4].text_content().strip()
        saksbehandler = tds[5].text_content().strip()
        if -1 != tds[6].text_content().find("Bestill"):
            exemption = None
        else:
            exemption = tds[6].text_content().strip()

        docdate = dateutil.parser.parse(docdate, dayfirst=True)
#        print doctype, docdesc
        if not parser.is_valid_doctype(doctype):
            doctype = {
                '' : '?',
                }[doctype]
        if parser.is_sender_doctype(doctype):
            fratilfield = 'sender'
        elif parser.is_recipient_doctype(doctype):
            fratilfield = 'recipient'

        caseyear, caseseqnr = docidstr.split("/")
        caseyear = expand_year(caseyear)
        caseseqnr, casedocseq = caseseqnr.split("-")
        caseid = "%d/%d" % (int(caseyear), int(caseseqnr))

        data = {
            'agency' : parser.agency,
            'recorddate' : recorddate.date(),
            'docdate' : docdate.date(),
            'docdesc' : docdesc,
            'casedesc' : docdesc, # FIXME fake value
            'caseyear' : int(caseyear),
            'caseseqnr' : int(caseseqnr),
            'casedocseq' : int(casedocseq),
            'caseid' : caseid,
            'doctype' : doctype,
#            'journalseqnr' : int(journalseqnr),
#            'journalyear' : int(journalyear),
#            'journalid' : journalid,
            fratilfield : fratil,
            'saksbehandler' : saksbehandler,
#            'saksansvarlig' : saksansvarlig.strip(),
#            'saksansvarligenhet' : saksansvarligenhet.strip(),
            'docidstr' : docidstr,
#            'laapenr' : laapenr,
            'exemption' : exemption,
            'scrapedurl' : url,
            'scrapestamputc' : datetime.datetime.now()
            }
#        print data
        parser.verify_entry(data)
        datastore.append(data)
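    # Worked example of the id handling in the loop above, using the sample
    # row from the comment near the top of this file: docidstr "13/00563-001"
    # splits into caseyear 2013 (via expand_year), caseseqnr 563 and
    # casedocseq 1, giving caseid "2013/563".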
    seenurl = {}
    # Find next URL.  There are two on each page.
    for ahref in root.cssselect('a.next_page'):
        if 0 == ahref.text_content().find('Neste'):
            nexturl = urlparse.urljoin(url, ahref.attrib['href'])
            if nexturl not in seenurl:
                seenurl[nexturl] = True
                print 'Fetching ' + nexturl
                html = postlistelib.fetch_url_harder(nexturl)
                mysaver = lambda unique_keys, data: datastore.extend(data)
                fetch_postjournal_day(parser=parser, url=nexturl,
                                      html=html, saver=mysaver)

    saver(unique_keys=['docidstr'], data=datastore)

def date2url(date):
    return 'http://webway.lenvik.kommune.no/?date=%s' % date

def gen_date_urls(urllist, startdate, step, count):
    d = dateutil.parser.parse(startdate, dayfirst=False)
    for n in xrange(1, step*(count+1), step):
        next = (d + relativedelta(days=n)).strftime("%Y-%m-%d")
        urllist.append(date2url(next))

urllist = []
today = datetime.date.today()

try:
    first = scraperwiki.sqlite.select("min(recorddate) as min from swdata")[0]['min']
    last = scraperwiki.sqlite.select("max(recorddate) as max from swdata")[0]['max']
except:
    last = (today + relativedelta(days=-14)).strftime("%Y-%m-%d")
    first = None

print first, last

# Parse forward in time
if last is not None:
    gen_date_urls(urllist, last, 1, 14)

# Parse back in time
if first is not None:
    gen_date_urls(urllist, first, -1, 5)

for dayurl in urllist:
    print 'Fetching ' + dayurl
    html = postlistelib.fetch_url_harder(dayurl)
    fetch_postjournal_day(parser=parser, url=dayurl, html=html, saver=saver)
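# Illustration of the URL generation above (the start date is an arbitrary
# example): gen_date_urls(l, "2013-03-04", 1, 3) appends the day URLs
#   http://webway.lenvik.kommune.no/?date=2013-03-05
#   http://webway.lenvik.kommune.no/?date=2013-03-06
#   http://webway.lenvik.kommune.no/?date=2013-03-07
# i.e. it steps forward from, but does not include, the start date.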