# -*- coding: utf-8 -*-
# YAML-tagger:
#  Type: statlig
#  Status: disabled
#  Name: Statistisk sentralbyrå
#  Run: obsolete, moved to OEP

import scraperwiki
import urllib2
import lxml.html
import datetime
import time
import dateutil.parser
import pickle
import re
from datetime import date
from datetime import timedelta
from time import strftime

# Make sure ScraperWiki believes this is the source for this database
scraperwiki.scrape("http://www.ssb.no/omssb/journal/")

postlistelib = scraperwiki.swimport('postliste-python-lib')

agency = "Statistisk sentralbyrå"

def daterange(start_date, end_date):
    # Yield every date from start_date up to, but not including, end_date
    for n in range((end_date - start_date).days):
        yield start_date + timedelta(n)

def expand_year(year):
    # Expand a two-digit year to four digits (51-99 -> 19xx, 00-50 -> 20xx)
    year = int(year)
    if year > 50:
        year = year + 1900
    else:
        year = year + 2000
    return year

def fetch_url(url):
    html = None
    for n in [1]:  # a single attempt; extend the list to retry more times
        try:
            html = scraperwiki.scrape(url)
            break
        except urllib2.URLError, e:
            print "URLError fetching " + url + ", trying again"
    return html

def save_date(parser, date, url, html):
    num_saved = 0
    root = lxml.html.fromstring(html)
    journal_date = dateutil.parser.parse(
        root.cssselect("p")[0].text_content().replace("Journaldato: ", ""),
        dayfirst=True)
    if date == journal_date.date():
        datastore = []
        for table in root.cssselect("table"):
            # Each journal entry is one HTML table; the fields sit in fixed
            # row/cell positions.
            docid = table.cssselect("tr")[0].cssselect("p")[1].text.strip()
            datedesc = table.cssselect("tr")[0].cssselect("td")[3].cssselect("p")[0].text.strip()
            exemption = table.cssselect("tr")[1].cssselect("td")[5].cssselect("p")[0].text.strip()
            fratil_indicator = table.cssselect("tr")[2].cssselect("td")[0].cssselect("p")[0].text.strip()
            doctype = ""
            if fratil_indicator.startswith("Til"):
                doctype = "U"
            elif fratil_indicator.startswith("Fra"):
                doctype = "I"
            elif fratil_indicator.startswith("Notat fra"):
                doctype = "N"
            else:
                raise ValueError("Fant ikke doctype %s" % fratil_indicator)
            fratil_agency = table.cssselect("tr")[2].cssselect("td")[1].cssselect("p")[0].text.strip()
            casedesc = table.cssselect("tr")[4].cssselect("td")[1].cssselect("p")[0].text.strip()
            docdesc = table.cssselect("tr")[5].cssselect("td")[1].cssselect("p")[0].text.strip()
            saksb = table.cssselect("tr")[0].cssselect("p")[5].text.strip()

            docdate = dateutil.parser.parse(datedesc.strip(), dayfirst=True)

            # Arkivsaksnr is normally on the form "yy/seqnr - docseq"
            matchObj = re.match(r'(\d+)/(\d+)\s*-\s*(\d+)$', docid, re.M | re.I)
            if matchObj:
                caseyear = matchObj.group(1)
                caseseqnr = matchObj.group(2)
                casedocseq = matchObj.group(3)
                caseyear = expand_year(caseyear)
                caseid = str(caseyear) + "/" + str(caseseqnr)
            else:
                print "error: invalid Arkivsaksnr: " + docid
                matchObj = re.match(r'(\d+)/(\d+)\s*-', docid, re.M | re.I)
                if matchObj:
                    caseyear = expand_year(matchObj.group(1))
                    caseseqnr = matchObj.group(2)
                    caseid = str(caseyear) + "/" + str(caseseqnr)

            if parser.is_sender_doctype(doctype):
                fratilfield = 'sender'
            elif parser.is_recipient_doctype(doctype):
                fratilfield = 'recipient'

            data = {
                'agency' : agency,
                'docdate' : docdate.date(),
                'recorddate' : journal_date.date(),
                'docdesc' : docdesc,
                'casedesc' : casedesc,

                'caseid' : caseid,
                'docid' : docid,
                'caseyear' : caseyear,
                'caseseqnr' : caseseqnr,
                'casedocseq' : casedocseq,

                fratilfield : fratil_agency,

                'doctype' : doctype,
                'saksbehandler' : saksb,
                'exemption' : exemption,

                'scrapedurl' : url,
                'scrapestamputc' : datetime.datetime.now()
                }
            parser.verify_entry(data)
            datastore.append(data)
            scraperwiki.sqlite.save(unique_keys=['docid'], data=datastore)
            num_saved += len(datastore)
            datastore = []
#            print "Saved %s" % data['caseid']
    else:
        # TODO: log error or exit?
        msg = "Tried to scrape %s but got %s" % (date, journal_date.date())
#        raise ValueError(msg)
        print msg
    return num_saved

def scrape_date(parser, date):
    url = base_url % (strftime("%d%m%y", date.timetuple()))
    html = fetch_url(url)
    if html:
        return save_date(parser, date, url, html)

base_url = 'http://www.ssb.no/omssb/journal/OJ%s.htm'

end_date = date.today()

# Resume from the last finished date if the scraper has run before
start_date_obj = scraperwiki.sqlite.get_var('last_finished_date')
if start_date_obj:
    start_date = pickle.loads(start_date_obj)
else:
    start_date = datetime.date(2011, 1, 3)
print "Start date %s" % start_date

parser = postlistelib.JournalParser(agency=agency)

for single_date in daterange(start_date, end_date):
    if single_date.weekday() < 5:  # skip Saturday and Sunday
        num_saved = scrape_date(parser, single_date)
        print "Scraped %s found %s" % (single_date, num_saved)
        if num_saved > 0:
            scraperwiki.sqlite.save_var('last_finished_date', pickle.dumps(single_date))
        if num_saved is None:
            # fetch_url() returned nothing, so no journal exists for this date yet
            print "No more new. Exit..."
            break