author    Petter Reinholdtsen <pere@hungry.com>    2014-02-27 14:49:41 +0100
committer Petter Reinholdtsen <pere@hungry.com>    2014-02-27 14:51:44 +0100
commit    f3485a50f39f7fc50ae0f79ca11a45e9ea67856e (patch)
tree      e8276c8149b6e076c12d1fcb568224527602a31b
parent    77654fc493379b0680038b0da3e5d55592d3af4d (diff)
New scrapers.
-rw-r--r--  scrapersources/postliste-mattilsynet  200
1 file changed, 200 insertions(+), 0 deletions(-)
diff --git a/scrapersources/postliste-mattilsynet b/scrapersources/postliste-mattilsynet
new file mode 100644
index 0000000..9dea5e4
--- /dev/null
+++ b/scrapersources/postliste-mattilsynet
@@ -0,0 +1,200 @@
+# -*- coding: UTF-8 -*-
+
+import scraperwiki
+import json
+from BeautifulSoup import BeautifulSoup
+import datetime
+import dateutil.parser
+import lxml.html
+import resource
+import sys
+import urlparse
+import re
+import string
+import HTMLParser
+
+# Make sure ScraperWiki believes this is the source URL for this scraper
+baseurl = "http://www.mattilsynet.no/om_mattilsynet/offentlig_journal_og_innsyn/"
+roothtml = scraperwiki.scrape(baseurl)
+
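+# swimport pulls in helper code published as other scrapers on ScraperWiki;
+# postliste-python-lib provides the JournalParser and fetch_url_harder used below.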
+lazycache = scraperwiki.swimport('lazycache')
+postlistelib = scraperwiki.swimport('postliste-python-lib')
+
+agency = 'Mattilsynet'
+
+def report_errors(errors):
+ if 0 < len(errors):
+ print "Errors:"
+ for e in errors:
+ print e
+ raise ValueError("Something went wrong")
+
+def out_of_cpu(arg, spent, hard, soft):
+ report_errors(arg)
+
+# Based on http://stackoverflow.com/questions/13122353/parsing-html-using-lxml-html
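+# Roughly: given a parent element such as
+#   <div>Offentlig journal <hr/> first entry ... <hr/> second entry ... <hr/></div>
+# it returns one <entry> element per <hr/>-separated block. Note that any
+# content after the final <hr/> is never appended to the returned list.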
+def entry_by_hr(parent):
+ paralist = []
+ para = lxml.html.etree.Element('entry')
+ if parent.text:
+ para.text = parent.text
+ for item in parent:
+        if item.tag == 'hr':
+ paralist.append(para)
+ para = lxml.html.etree.Element('entry')
+ if item.tail:
+ para.text = item.tail
+ else:
+ para.append(item)
+ return paralist
+
+def process_list(parser, url):
+ html = postlistelib.fetch_url_harder(url)
+ #print html
+ hp = HTMLParser.HTMLParser()
+ root = lxml.html.fromstring(hp.unescape(html.decode('utf-8')))
+
+ period = root.cssselect("head title")
+ print period[0].text_content()
+    matchObj = re.match(r'.*Periode (\d{2}\.\d{2}\.\d{4})\s*\D*\s*(\d{2}\.\d{2}\.\d{4}) .*', period[0].text_content(), re.M|re.I)
+#Periode 23.09.2013 - 29.09.2013 Oslo
+#Periode 29.07.2013 – 04.08.2013
+ if matchObj: # Fake recorddate
+ recorddate = dateutil.parser.parse(matchObj.group(2), dayfirst=True)
+ print "match", recorddate
+ subset = root.cssselect("div#articleContent")
+ #print subset[0].text_content()
+ savelist = []
+
+ for entry in entry_by_hr(subset[0]):
+ estr = lxml.html.etree.tostring(entry)
+ if -1 != estr.find("Offentlig journal"):
+ continue
+# print estr
+ lines = estr.split("<br/>")
+# print lines
+ if '' == lines[1]:
+ del lines[1]
+ meta = lines[1].split(" ")
+# print meta
+ docdate = dateutil.parser.parse(meta[5], dayfirst=True)
+ doctype = meta[1]
+
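+        # The case number ("saksnr") looks like e.g. "2013/12345-6":
+        # case year / case sequence number - document sequence within the case.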
+ matchObj = re.match( r'(\d+)/(\d+)\s*-\s*(\d+)$', meta[0], re.M|re.I)
+ if matchObj:
+ caseyear = matchObj.group(1)
+ caseseqnr = matchObj.group(2)
+ casedocseq = matchObj.group(3)
+ caseid = str(caseyear) + "/" + str(caseseqnr)
+        else:
+            print "error: invalid saksnr: " + meta[0]
+            continue # skip this entry rather than reuse case numbers from the previous one
+ arkivnr = meta[9]
+ saksbehandler = meta[13].strip()
+ saksansvarligenhet, saksansvarlig = saksbehandler.split('/')
+ exemption = None
+ for row in lines[2:-1]:
+# print "R: ", row
+ rowtype, rest = row.split(":", 1)
+ if 'Til' == rowtype or 'Fra' == rowtype:
+ fratil = hp.unescape(string.join(row.split(" ")[1:], " "))
+ fratilfield = {
+ 'Til' : 'recipient',
+ 'Fra' : 'sender',
+ }[rowtype]
+ elif 'Dok' == rowtype:
+ docdesc = hp.unescape(rest.strip())
+ elif 'Sak' == rowtype:
+ casedesc = hp.unescape(rest.strip())
+ elif 'U.off' == rowtype:
+ if -1 != row.find('Grad: UO'):
+ gradert = hp.unescape(row)
+ exemption = gradert.split(':')[1].strip()
+# print gradert, exemption
+ elif 'Lnr' == rowtype:
+# print rest
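+                # løpenr, e.g. "1234/2013": journal sequence number / journal year.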
+ laapenr = rest.strip().split(" ")[0].strip()
+# print laapenr
+ journalseqnr, journalyear = laapenr.split("/")
+ journalid = str(journalyear) + "/" + str(journalseqnr)
+ else:
+                raise Exception("unhandled row type: " + rowtype)
+
+ data = {
+ 'agency' : parser.agency,
+ 'recorddate' : recorddate.date(),
+ 'docdate' : docdate.date(),
+ 'docdesc' : docdesc,
+ 'casedesc' : casedesc,
+
+ 'caseyear' : int(caseyear),
+ 'caseseqnr' : int(caseseqnr),
+ 'casedocseq' : int(casedocseq),
+ 'caseid' : caseid,
+ 'doctype' : doctype,
+
+ 'journalseqnr' : int(journalseqnr),
+ 'journalyear' : int(journalyear),
+ 'journalid' : journalid,
+ fratilfield : fratil,
+
+ 'saksbehandler' : saksbehandler,
+ 'saksansvarlig' : saksansvarlig.strip(),
+ 'saksansvarligenhet' : int(saksansvarligenhet.strip()),
+
+ 'arkivnr' : arkivnr,
+ 'laapenr' : laapenr,
+ 'exemption' : exemption,
+
+ 'scrapedurl' : url,
+            'scrapestamputc' : datetime.datetime.utcnow()
+ }
+
+# print data
+ parser.verify_entry(data)
+ savelist.append(data)
+# return # debug
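+    # unique_keys make re-runs idempotent: a row with the same
+    # (caseyear, caseseqnr, casedocseq) triple replaces the old one.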
+ scraperwiki.sqlite.save(data=savelist, unique_keys=['caseyear', 'caseseqnr', 'casedocseq'])
+ return
+
+def fetch_urls_list(parser, baseurl, roothtml):
+ root = lxml.html.fromstring(roothtml)
+ subset = root.cssselect("ul.listContainer")
+# print subset[0].text_content()
+ urllist = []
+    for container in subset:
+        urls = container.cssselect("li a")
+ for ahref in urls:
+ href = ahref.attrib['href']
+ #print href
+ newurl = urlparse.urljoin(baseurl, href)
+ urllist.append(newurl)
+ return urllist
+
+def test(parser):
+ url = "http://www.mattilsynet.no/om_mattilsynet/offentlig_journal_og_innsyn/bvt/periode_29072013__04082013_buskerud_vestfold_og_telemark"
+ process_list(parser, url)
+
+errors = []
+parser = postlistelib.JournalParser(agency=agency)
+
+if False: # set to True to scrape only the single test page
+ test(parser)
+ exit(0)
+
+urls = fetch_urls_list(parser, baseurl, roothtml)
+for url in urls:
+ try:
+        res = scraperwiki.sqlite.select("scrapedurl from swdata where scrapedurl = ? limit 1", (url,))
+ if 0 < len(res):
+ continue
+ except Exception, e: # Ignore it if the table is missing
+ pass
+ print "Processing ", url
+ try:
+ process_list(parser, url)
+    except Exception, e:
+        print "Unable to process ", url, e
+        errors.append("%s: %s" % (url, e))
+report_errors(errors)
+