| author | Petter Reinholdtsen <pere@hungry.com> | 2014-12-07 21:17:44 +0100 |
|---|---|---|
| committer | Petter Reinholdtsen <pere@hungry.com> | 2014-12-07 21:18:16 +0100 |
| commit | 508e9cbd8241ea0bfa500e96253d53ffd78c7a6c (patch) | |
| tree | 9158c9b256918bc207a73f5773f52e4e171554aa | |
| parent | 86e44b3f761841374b11e9f9efde0a2ce55c4999 (diff) | |
New scraper for Lunner kommune.
| -rw-r--r-- | scrapersources/postliste-lunner-kommune | 179 |

1 file changed, 179 insertions, 0 deletions
diff --git a/scrapersources/postliste-lunner-kommune b/scrapersources/postliste-lunner-kommune
new file mode 100644
index 0000000..e58731b
--- /dev/null
+++ b/scrapersources/postliste-lunner-kommune
@@ -0,0 +1,179 @@
+# -*- coding: utf-8 -*-
+
+import scraperwiki
+import urllib2
+import urlparse
+import lxml.html
+import re
+import dateutil.parser
+import datetime
+from dateutil.relativedelta import relativedelta
+
+agency = "Lunner kommune"
+starturl = "http://www.lunner.kommune.no/postlister.298000.no.html"
+
+scraperwiki.scrape(starturl)
+postlistelib=scraperwiki.swimport('postliste-python-lib')
+
+def saver(unique_keys, data):
+#    return
+    #print "Not saving data"
+    scraperwiki.sqlite.save(unique_keys, data)
+
+def expand_year(year):
+    year = int(year)
+    if year > 50:
+        year = year + 1900
+    else:
+        year = year + 2000
+    return year
+
+def fetch_postjournal_day(parser, url, html, saver):
+    root = lxml.html.fromstring(html)
+#    print html
+    listdate = dateutil.parser.parse(root.cssselect("h2")[0].text_content().replace("Postliste for ",""), dayfirst=True)
+#    print listdate.date()
+
+    entryqueue = []
+    for div in root.cssselect("div.innerbody div.postlistedokument div"):
+        tds = div.cssselect("div")
+        line = tds[0].text_content().strip(' \n\t\r')
+#        print "L: \"" + line + "\""
+        entryqueue.append(line)
+
+    entry = {}
+    entries = []
+    i = 0
+    while i < len(entryqueue) - 1:
+        if 'Datert:' == entryqueue[i]:
+            if 'Datert:' in entry:
+                entries.append(entry)
+                entry = {}
+        entry[entryqueue[i]] = entryqueue[i+1]
+        i = i + 1
+    entries.append(entry)
+
+    datastore = []
+    for entry in entries:
+#        print entry
+        recorddate = dateutil.parser.parse(entry['Regdato:'].strip(), dayfirst=True)
+        docdate = dateutil.parser.parse(entry['Datert:'].strip(), dayfirst=True)
+        docdesc = entry['Dokbesk:'].strip()
+        # FIXME sakstittel mangler i webjournal
+        casedesc = '[sakstittel mangler]'
+        doctype = entry['Doktype:']
+        doctype = {
+            u'Utgående' : 'U',
+            u'Innkommende' : 'I',
+            u'Internt' : 'N',
+            }[doctype]
+        docdesc = entry['Dokbesk:']
+        saksbehandler = entry['Saksbeh:'].strip()
+
+        arkivsaksref = entry['Saksnr:']
+        caseyear = 0
+        caseseqnr = 0
+        casedocseq = 0
+        caseid = 'unknown'
+        matchObj = re.match( r'(\d+)/(\d+)-(\d+)$', arkivsaksref, re.M|re.I)
+        if matchObj:
+            caseyear = matchObj.group(1)
+            caseseqnr = matchObj.group(2)
+            casedocseq = matchObj.group(3)
+            caseyear = expand_year(caseyear)
+            caseid = str(caseyear) + "/" + str(caseseqnr)
+        else:
+            print "error: invalid Arkivsaksnr: " + arkivsaksref
+            raise Exception("Unable to parse %s" % url)
+
+        if parser.is_sender_doctype(doctype) and doctype != 'N':
+            fratilfield = 'sender'
+            fratil = entry['Avsender:']
+        elif parser.is_recipient_doctype(doctype):
+            fratilfield = 'recipient'
+            fratil = entry['Mottaker:']
+        else:
+            fratilfield = 'intern'
+            fratil = None
+
+        exemption = None
+        if 'Gradering:' in entry:
+            exemption = entry['Gradering:']
+            fratil = ""
+
+        data = {
+            'agency' : parser.agency,
+            'recorddate' : recorddate.date(),
+            'docdate' : docdate.date(),
+            'docdesc' : docdesc,
+            'casedesc' : casedesc,
+
+            'caseyear' : int(caseyear),
+            'caseseqnr' : int(caseseqnr),
+            'casedocseq' : int(casedocseq),
+            'caseid' : caseid,
+            'doctype' : doctype,
+
+#            'journalseqnr' : int(journalseqnr),
+#            'journalyear' : int(journalyear),
+#            'journalid' : journalid,
+
+            'saksbehandler' : saksbehandler,
+#            'saksansvarlig' : saksansvarlig.strip(),
+#            'saksansvarligenhet' : saksansvarligenhet.strip(),
+
+            'arkivsaksref' : arkivsaksref,
+#            'laapenr' : laapenr,
+
+            'scrapedurl' : url,
+            'scrapestamputc' : datetime.datetime.now()
+            }
+        if fratil is not None:
+            data[fratilfield] = fratil
+        if exemption is not None:
+            data['exemption'] = exemption
+
+        print data
+        parser.verify_entry(data)
+        datastore.append(data)
+    saver(unique_keys=['arkivsaksref'], data=datastore)
+
+def parse_day_urls(starturl, urllist):
+    html = scraperwiki.scrape(starturl)
+    root = lxml.html.fromstring(html)
+    for ahref in root.cssselect("a.headlinelink"):
+        href = ahref.attrib['href']
+        url = urlparse.urljoin(starturl, href)
+        urllist.append(url)
+
+    nexturls = root.cssselect("a.next")
+    for ahref in nexturls:
+        href = ahref.attrib['href']
+        if -1 != href.find("cat="):
+            print href
+            parse_day_urls(urlparse.urljoin(starturl, href), urllist)
+    return urllist
+
+print "Fetching public journal!"
+
+parser = postlistelib.JournalParser(agency=agency)
+
+urllist = []
+parse_day_urls(starturl, urllist)
+
+for dayurl in urllist:
+
+    # Only parse once
+    try:
+        res = scraperwiki.sqlite.select("scrapedurl from swdata where scrapedurl = '"+dayurl+"' limit 1")
+        if 0 < len(res):
+            continue
+    except Exception, e: # Probably no table yet
+        pass
+
+    print
+    print "Fetching from " + dayurl
+    print
+    html = postlistelib.fetch_url_harder(dayurl)
+#    print html
+    fetch_postjournal_day(parser=parser, url=dayurl, html=html, saver=saver)