# -*- coding: utf-8 -*-
# YAML-tagger:
#  Type: kommune
#  Status: finished
#  Name: Ballangen kommune
#  Format: HTML
#  Datatype:
#  Vendor:
#  Run: daily
#  Missingfields: docdate

import scraperwiki
import urllib2
import lxml.html
import re
import dateutil.parser
from collections import deque
import datetime
from dateutil.relativedelta import relativedelta

scraperwiki.scrape("http://www.ballangen.kommune.no/artikler/postlister")

postlistelib = scraperwiki.swimport('postliste-python-lib')

# Example of one journal entry as it appears in the HTML (one table cell per line):
#
#   SØKER KULTURMIDLER FOR BALLANGEN FRIIDRETT
#   Sakstittel: KULTURMIDLER 2012
#   Arkivsaksnr.: 12/00093 - 032 I    Løpenr.:002255/12
#   Fra/Til: Eirin Sørslett
#   Saksbehandler: Oddbjørn Dalsbø (RÅD/KVO)
#   Datert: 02.04.2012
#   Ikon   Bestill journal

def saver(unique_keys, data):
#    return
#    print "Not saving data"
    scraperwiki.sqlite.save(unique_keys, data)

def expand_year(year):
    year = int(year)
    if year > 50:
        year = year + 1900
    else:
        year = year + 2000
    return year

def fetch_postjournal_day(parser, url, html, saver):
    root = lxml.html.fromstring(html)

    listdate = dateutil.parser.parse(root.cssselect("h2")[0].text_content().replace("Postlister for ", ""), dayfirst=True)
    print listdate.date()

    entries = []
    for tr in root.cssselect("table.ui-corner-all tr"):
        tds = tr.cssselect("td")
        line = tds[0].text_content()
        entries.append(line)

    # 9 or 12 lines per entry
    queue = deque(entries)

    datastore = []
    while queue:
        docdesc = (queue.popleft() + queue.popleft()).strip()
        casedesc = (queue.popleft() + queue.popleft()).replace("Sakstittel:", "").strip()
        ref = queue.popleft().strip()
        arkivsaksref = re.sub(r"L.penr.:.+$", "", ref).replace("Arkivsaksnr.:", "").strip()
        caseyear = 0
        caseseqnr = 0
        casedocseq = 0
        doctype = '?'
        caseid = 'unknown'
        matchObj = re.match(r'(\d+)/(\d+)\s*-\s*(\d+) (.+)$', arkivsaksref, re.M|re.I)
        if matchObj:
            caseyear = matchObj.group(1)
            caseseqnr = matchObj.group(2)
            casedocseq = matchObj.group(3)
            doctype = matchObj.group(4)
            caseyear = expand_year(caseyear)
            caseid = str(caseyear) + "/" + str(caseseqnr)
        else:
            print "error: invalid Arkivsaksnr: " + arkivsaksref
            matchObj = re.match(r'(\d+)/(\d+)\s*-', arkivsaksref, re.M|re.I)
            if matchObj:
                caseyear = expand_year(matchObj.group(1))
                caseseqnr = matchObj.group(2)
                caseid = str(caseyear) + "/" + str(caseseqnr)
            else:
                matchObj = re.match(r'/\s*-\s*(\S+)', arkivsaksref, re.M|re.I)
                if matchObj:
                    doctype = matchObj.group(1)
                else:
                    print "error: really broken Arkivsaksnr: %s" % arkivsaksref

        laapenr = re.sub(r"^.+L.penr.:", "", ref)
        journalseqnr = 0
        journalyear = 0
        journalid = 'unknown'
        if -1 != laapenr.find('/') and "/" != laapenr:  # Avoid broken/empty values
            journalseqnr, journalyear = laapenr.split("/")
            journalyear = expand_year(journalyear)
            journalid = str(journalyear) + "/" + str(journalseqnr)
        else:
            print u"error: invalid Løpenr: " + laapenr

        if not parser.is_valid_doctype(doctype):
            doctype = {
                'S'   : 'N',
                'PLN' : 'N',
                'Z'   : 'N',
                'UB'  : 'U',
                }[doctype]

        fratil = queue.popleft().replace("Fra/Til:", "").strip()
        if parser.is_sender_doctype(doctype):
            fratilfield = 'sender'
        elif parser.is_recipient_doctype(doctype):
            fratilfield = 'recipient'

        saksbehandler = queue.popleft().replace("Saksbehandler:", "").strip()
        saksansvarlig, bar = saksbehandler.split("(")
        saksansvarlig = saksansvarlig.strip()
        saksansvarligenhet, foo = bar.split(")")
#        print saksansvarligenhet

        recorddate = dateutil.parser.parse(queue.popleft().replace("Datert:", "").strip(), dayfirst=True)
        requesturl = queue.popleft().strip()
        exemption = ""
        if -1 != requesturl.find("Gradering"):
            exemption = requesturl.replace("Gradering:", "").strip()
            requesturl = queue.popleft()
            fratil = ""

        data = {
            'agency' : parser.agency,
            'recorddate' : recorddate.date(),
            'docdesc' : docdesc,
            'casedesc' : casedesc,

            'caseyear' : int(caseyear),
            'caseseqnr' : int(caseseqnr),
            'casedocseq' : int(casedocseq),
            'caseid' : caseid,
            'doctype' : doctype,

            'journalseqnr' : int(journalseqnr),
            'journalyear' : int(journalyear),
            'journalid' : journalid,

            fratilfield : fratil,

            'saksbehandler' : saksbehandler,
            'saksansvarlig' : saksansvarlig.strip(),
            'saksansvarligenhet' : saksansvarligenhet.strip(),

            'arkivsaksref' : arkivsaksref,
            'laapenr' : laapenr,
            'exemption' : exemption,

            'scrapedurl' : url,
            'scrapestamputc' : datetime.datetime.now()
            }
#        print data
        parser.verify_entry(data)
        datastore.append(data)
    saver(unique_keys=['arkivsaksref'], data=datastore)

def fetch_postjournal_monthlist(baseurl, html):
    root = lxml.html.fromstring(html)
    subset = root.cssselect("div table")
    urls = subset[0].cssselect("td a")
    urllist = []
    for ahref in urls:
        href = ahref.attrib['href']
        if -1 != href.find("day="):
#            print href
            urllist.append(baseurl + href)
    return urllist

# http://www.offentlighet.no/
agency = "Ballangen kommune"
baseurl = "http://www.ballangen.kommune.no"

monthurls = []

def addyear(monthurls, year):
    for m in [12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1]:
        monthstr = "%02d%d" % (m, year)
        url = "http://www.ballangen.kommune.no/artikler/postlister?month=" + monthstr
        monthurls.append(url)

#addyear(monthurls, 2003)
#addyear(monthurls, 2004) # Consistency problems in http://www.ballangen.kommune.no/artikler/postlister?month=012004&day=06 (bad Arkivsaksnr. and lacking Løpenr.)
#addyear(monthurls, 2005)
#addyear(monthurls, 2006)
#addyear(monthurls, 2007)
#addyear(monthurls, 2008)
#addyear(monthurls, 2009)
#addyear(monthurls, 2010)
#addyear(monthurls, 2011)
#addyear(monthurls, 2012)

parsemonths = 5
today = datetime.date.today()
i = 1
while i <= parsemonths:
    i = i + 1
#    parsemonths = parsemonths - 1
    monthtoparse = today - relativedelta(months=(parsemonths - i))
    monthstr = monthtoparse.strftime("%m%Y")
    url = "http://www.ballangen.kommune.no/artikler/postlister?month=" + monthstr
    monthurls.append(url)

#url = "http://www.ballangen.kommune.no/artikler/postlister?month=032012&day=19"

def reload_error_entries():
    monthurls = []
    problems = scraperwiki.sqlite.select("distinct scrapedurl from swdata where caseid = 'unknown'")
    for n in problems:
        monthurls.append(n['scrapedurl'])

print "Fetching public journal!"

parser = postlistelib.JournalParser(agency=agency)

urllist = []
for monthurl in monthurls:
    print "Fetching month list from " + monthurl
    html = postlistelib.fetch_url_harder(monthurl)
    urllist.extend(fetch_postjournal_monthlist(baseurl=baseurl, html=html))

for dayurl in urllist:
    res = scraperwiki.sqlite.select("scrapedurl from swdata where scrapedurl = '" + dayurl + "' and scrapestamputc > '2012-06-23T15:12:40' limit 1")
    if 0 < len(res):
        continue
    print "Fetching from " + dayurl
    html = postlistelib.fetch_url_harder(dayurl)
#    print html
    fetch_postjournal_day(parser=parser, url=dayurl, html=html, saver=saver)

# Workaround for missing day links discovered 2015-01-15
def hardparse():
    for m in ['09', '10', '11', '12']:
        for d in range(1, 32):
            dayurl = "http://www.ballangen.kommune.no/artikler/postlister?month=%s2014&day=%d" % (m, d)
            try:
                html = postlistelib.fetch_url_harder(dayurl)
                fetch_postjournal_day(parser=parser, url=dayurl, html=html, saver=saver)
            except ValueError:
                pass
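
# Illustrative sketch (not part of the original scraper): how the sample entry
# shown in the comment block at the top is expected to parse, using the same
# regexes and expand_year() as fetch_postjournal_day().  The helper name
# _demo_parse_ref is hypothetical and only here for illustration; the call is
# left commented out so it never runs as part of the daily scrape.
def _demo_parse_ref():
    ref = u"Arkivsaksnr.: 12/00093 - 032 I    Løpenr.:002255/12"
    arkivsaksref = re.sub(r"L.penr.:.+$", "", ref).replace("Arkivsaksnr.:", "").strip()
    laapenr = re.sub(r"^.+L.penr.:", "", ref)
    matchObj = re.match(r'(\d+)/(\d+)\s*-\s*(\d+) (.+)$', arkivsaksref)
    if matchObj:
        caseid = str(expand_year(matchObj.group(1))) + "/" + matchObj.group(2)
        journalseqnr, journalyear = laapenr.split("/")
        journalid = str(expand_year(journalyear)) + "/" + journalseqnr
        # Expected output: 2012/00093 32 I 2012/002255
        print caseid, int(matchObj.group(3)), matchObj.group(4), journalid
#_demo_parse_ref()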