author    | Petter Reinholdtsen <pere@hungry.com> | 2014-02-27 14:49:41 +0100
committer | Petter Reinholdtsen <pere@hungry.com> | 2014-02-27 14:51:44 +0100
commit    | f3485a50f39f7fc50ae0f79ca11a45e9ea67856e (patch)
tree      | e8276c8149b6e076c12d1fcb568224527602a31b
parent    | 77654fc493379b0680038b0da3e5d55592d3af4d (diff)
New scrapers.
-rw-r--r-- | scrapersources/postliste-mattilsynet | 200
1 file changed, 200 insertions, 0 deletions
diff --git a/scrapersources/postliste-mattilsynet b/scrapersources/postliste-mattilsynet
new file mode 100644
index 0000000..9dea5e4
--- /dev/null
+++ b/scrapersources/postliste-mattilsynet
@@ -0,0 +1,200 @@
+# -*- coding: UTF-8 -*-
+
+import scraperwiki
+import json
+from BeautifulSoup import BeautifulSoup
+import datetime
+import dateutil.parser
+import lxml.html
+import resource
+import sys
+import urlparse
+import re
+import string
+import HTMLParser
+
+# Make sure Scraperwiki believes this is the source for this database
+baseurl = "http://www.mattilsynet.no/om_mattilsynet/offentlig_journal_og_innsyn/"
+roothtml = scraperwiki.scrape(baseurl)
+
+lazycache = scraperwiki.swimport('lazycache')
+postlistelib = scraperwiki.swimport('postliste-python-lib')
+
+agency = 'Mattilsynet'
+
+def report_errors(errors):
+    if 0 < len(errors):
+        print "Errors:"
+        for e in errors:
+            print e
+        raise ValueError("Something went wrong")
+
+def out_of_cpu(arg, spent, hard, soft):
+    report_errors(arg)
+
+# Split the children of parent into one <entry> element per <hr>.
+# Based on http://stackoverflow.com/questions/13122353/parsing-html-using-lxml-html
+def entry_by_hr(parent):
+    paralist = []
+    para = lxml.html.etree.Element('entry')
+    if parent.text:
+        para.text = parent.text
+    for item in parent:
+        if item.tag == 'hr':
+            paralist.append(para)
+            para = lxml.html.etree.Element('entry')
+            if item.tail:
+                para.text = item.tail
+        else:
+            para.append(item)
+    return paralist
+
+def process_list(parser, url):
+    html = postlistelib.fetch_url_harder(url)
+    #print html
+    hp = HTMLParser.HTMLParser()
+    root = lxml.html.fromstring(hp.unescape(html.decode('utf-8')))
+
+    period = root.cssselect("head title")
+    print period[0].text_content()
+    # Title examples:
+    #   Periode 23.09.2013 - 29.09.2013 Oslo
+    #   Periode 29.07.2013 – 04.08.2013
+    matchObj = re.match(r'.*Periode (\d{2}.\d{2}.\d{4})\s*\D*\s* (\d{2}.\d{2}.\d{4}) .*',
+                        period[0].text_content(), re.M|re.I)
+    if matchObj: # Fake recorddate, use the end of the period
+        recorddate = dateutil.parser.parse(matchObj.group(2), dayfirst=True)
+        print "match", recorddate
+        subset = root.cssselect("div#articleContent")
+        #print subset[0].text_content()
+        savelist = []
+
+        for entry in entry_by_hr(subset[0]):
+            estr = lxml.html.etree.tostring(entry)
+            if -1 != estr.find("Offentlig journal"):
+                continue
+            #print estr
+            lines = estr.split("<br/>")
+            #print lines
+            if '' == lines[1]:
+                del lines[1]
+            meta = lines[1].split(" ")
+            #print meta
+            docdate = dateutil.parser.parse(meta[5], dayfirst=True)
+            doctype = meta[1]
+
+            matchObj = re.match(r'(\d+)/(\d+)\s*-\s*(\d+)$', meta[0], re.M|re.I)
+            if matchObj:
+                caseyear = matchObj.group(1)
+                caseseqnr = matchObj.group(2)
+                casedocseq = matchObj.group(3)
+                caseid = str(caseyear) + "/" + str(caseseqnr)
+            else:
+                print "error: invalid saksnr: " + meta[0]
+            arkivnr = meta[9]
+            saksbehandler = meta[13].strip()
+            saksansvarligenhet, saksansvarlig = saksbehandler.split('/')
+            exemption = None
+            for row in lines[2:-1]:
+                #print "R: ", row
+                rowtype, rest = row.split(":", 1)
+                if 'Til' == rowtype or 'Fra' == rowtype:
+                    fratil = hp.unescape(string.join(row.split(" ")[1:], " "))
+                    fratilfield = {
+                        'Til' : 'recipient',
+                        'Fra' : 'sender',
+                    }[rowtype]
+                elif 'Dok' == rowtype:
+                    docdesc = hp.unescape(rest.strip())
+                elif 'Sak' == rowtype:
+                    casedesc = hp.unescape(rest.strip())
+                elif 'U.off' == rowtype:
+                    if -1 != row.find('Grad: UO'):
+                        gradert = hp.unescape(row)
+                        exemption = gradert.split(':')[1].strip()
+                        #print gradert, exemption
+                elif 'Lnr' == rowtype:
+                    #print rest
+                    laapenr = rest.strip().split(" ")[0].strip()
+                    #print laapenr
+                    journalseqnr, journalyear = laapenr.split("/")
+                    journalid = str(journalyear) + "/" + str(journalseqnr)
+                else:
+                    raise Exception("unhandled type")
+
+            data = {
+                'agency' : parser.agency,
+                'recorddate' : recorddate.date(),
+                'docdate' : docdate.date(),
+                'docdesc' : docdesc,
+                'casedesc' : casedesc,
+
+                'caseyear' : int(caseyear),
+                'caseseqnr' : int(caseseqnr),
+                'casedocseq' : int(casedocseq),
+                'caseid' : caseid,
+                'doctype' : doctype,
+
+                'journalseqnr' : int(journalseqnr),
+                'journalyear' : int(journalyear),
+                'journalid' : journalid,
+                fratilfield : fratil,
+
+                'saksbehandler' : saksbehandler,
+                'saksansvarlig' : saksansvarlig.strip(),
+                'saksansvarligenhet' : int(saksansvarligenhet.strip()),
+
+                'arkivnr' : arkivnr,
+                'laapenr' : laapenr,
+                'exemption' : exemption,
+
+                'scrapedurl' : url,
+                'scrapestamputc' : datetime.datetime.now()
+            }
+
+            #print data
+            parser.verify_entry(data)
+            savelist.append(data)
+            #return # debug
+        scraperwiki.sqlite.save(data=savelist, unique_keys=['caseyear', 'caseseqnr', 'casedocseq'])
+    return
+
+def fetch_urls_list(parser, baseurl, roothtml):
+    root = lxml.html.fromstring(roothtml)
+    subset = root.cssselect("ul.listContainer")
+    #print subset[0].text_content()
+    urllist = []
+    for ul in subset:
+        urls = ul.cssselect("li a")
+        for ahref in urls:
+            href = ahref.attrib['href']
+            #print href
+            newurl = urlparse.urljoin(baseurl, href)
+            urllist.append(newurl)
+    return urllist
+
+def test(parser):
+    url = "http://www.mattilsynet.no/om_mattilsynet/offentlig_journal_og_innsyn/bvt/periode_29072013__04082013_buskerud_vestfold_og_telemark"
+    process_list(parser, url)
+
+errors = []
+parser = postlistelib.JournalParser(agency=agency)
+
+if False:
+    test(parser)
+    exit(0)
+
+urls = fetch_urls_list(parser, baseurl, roothtml)
+for url in urls:
+    try:
+        # Skip URLs that are already in the database
+        res = scraperwiki.sqlite.select("scrapedurl from swdata where scrapedurl = '"+url+"' limit 1")
+        if 0 < len(res):
+            continue
+    except Exception, e: # Ignore it if the table is missing
+        pass
+    print "Processing ", url
+    try:
+        process_list(parser, url)
+    except Exception, e:
+        print "Unable to process ", url, e
+        pass
report_errors(errors)
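
The <hr>-splitting trick in entry_by_hr() is the heart of the record extraction, so here is a minimal, self-contained sketch (Python 2, lxml) of the same idea. The HTML snippet and record text are invented for the demo; only the technique mirrors the scraper.

    import lxml.html

    html = ("<div>Offentlig journal<hr/>Record one<br/>Lnr: 1/2014 x"
            "<hr/>Record two<br/>Lnr: 2/2014 y</div>")
    div = lxml.html.fromstring(html)

    chunks = []
    current = [div.text or '']
    for child in div:
        if child.tag == 'hr':
            chunks.append(''.join(current))    # an <hr> closes the current record
            current = [child.tail or '']       # text after the <hr> starts the next
        else:
            current.append(lxml.html.tostring(child))  # includes the child's tail text
    chunks.append(''.join(current))

    for chunk in chunks:
        print chunk
    # Offentlig journal
    # Record one<br>Lnr: 1/2014 x
    # Record two<br>Lnr: 2/2014 y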
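Most of the field extraction then rides on two regular expressions. A minimal sketch of what they match, using sample strings modelled on the "Periode" comments in the code; the strings are illustrative, not scraped data.

    import re

    title = "Offentlig journal - Periode 23.09.2013 - 29.09.2013 Oslo"
    m = re.match(r'.*Periode (\d{2}.\d{2}.\d{4})\s*\D*\s* (\d{2}.\d{2}.\d{4}) .*',
                 title, re.M | re.I)
    print m.group(1), m.group(2)  # 23.09.2013 29.09.2013; group(2) becomes recorddate

    saksnr = "2013/123456 - 2"    # caseyear/caseseqnr - casedocseq
    m = re.match(r'(\d+)/(\d+)\s*-\s*(\d+)$', saksnr, re.M | re.I)
    print m.group(1), m.group(2), m.group(3)  # 2013 123456 2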
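Each record is further split on "<br/>" and dispatched on the prefix before ":". A condensed sketch of that dispatch, with an invented record but the scraper's own field names:

    record = ["",                                  # header part before the first <br/>
              "2013/123456 - 2 I dok 27.02.2014",  # meta line (illustrative layout)
              "Fra: Example Sender AS",
              "Dok: Example document title",
              "Sak: Example case title",
              "Lnr: 9876/2013 something"]
    fields = {}
    for row in record[2:]:
        rowtype, rest = row.split(":", 1)
        if rowtype in ('Til', 'Fra'):
            fields[{'Til': 'recipient', 'Fra': 'sender'}[rowtype]] = rest.strip()
        elif 'Dok' == rowtype:
            fields['docdesc'] = rest.strip()
        elif 'Sak' == rowtype:
            fields['casedesc'] = rest.strip()
        elif 'Lnr' == rowtype:
            journalseqnr, journalyear = rest.strip().split(" ")[0].split("/")
            fields['journalid'] = journalyear + "/" + journalseqnr
    print fields
    # {'sender': 'Example Sender AS', 'docdesc': 'Example document title',
    #  'casedesc': 'Example case title', 'journalid': '2013/9876'}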
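Finally, the main loop skips already-seen URLs by concatenating the URL into a select, which breaks if a URL ever contains a quote. A sketch of the same check with a bound parameter, using only the sqlite3 standard library; the in-memory database and the already_scraped() helper are stand-ins for illustration, not the ScraperWiki datastore API.

    import sqlite3

    db = sqlite3.connect(":memory:")
    db.execute("create table swdata (scrapedurl text)")
    db.execute("insert into swdata values (?)", ("http://example.com/page1",))

    def already_scraped(url):
        # The bound "?" parameter escapes the URL safely
        cur = db.execute("select 1 from swdata where scrapedurl = ? limit 1", (url,))
        return cur.fetchone() is not None

    print already_scraped("http://example.com/page1")  # True
    print already_scraped("http://example.com/page2")  # False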