# -*- coding: utf-8 -*-
# Scraper for the public journal (postliste) of Statistisk sentralbyrå (SSB),
# published as daily HTML pages under http://www.ssb.no/omssb/journal/

import scraperwiki
import urllib2
import lxml.html
import datetime
import time
import dateutil.parser
import pickle
import re
from datetime import date
from datetime import timedelta
from time import strftime

# Make sure ScraperWiki registers this URL as the source of this dataset
scraperwiki.scrape("http://www.ssb.no/omssb/journal/")

postlistelib = scraperwiki.swimport('postliste-python-lib')

agency = "Statistisk sentralbyrå"

def daterange(start_date, end_date):
    """Yield every date from start_date up to, but not including, end_date."""
    for n in range((end_date - start_date).days):
        yield start_date + timedelta(n)

def expand_year(year):
    """Expand a two-digit year to four digits (51-99 -> 19xx, 00-50 -> 20xx)."""
    year = int(year)
    if year > 50:
        year = year + 1900
    else:
        year = year + 2000
    return year

def fetch_url(url):
    # Fetch the page, returning None if the single attempt fails.
    html = None
    for n in [1]:
        try:
            html = scraperwiki.scrape(url)
            break
        except urllib2.URLError, e:
            print "URLError fetching " + url + ", trying again"
    return html

def save_date(parser, date, url, html):
    num_saved = 0
    root = lxml.html.fromstring(html)
    journal_date = dateutil.parser.parse(
        root.cssselect("p")[0].text_content().replace("Journaldato: ", ""),
        dayfirst=True)
    if date == journal_date.date():
        datastore = []
        for table in root.cssselect("table"):
            docid = table.cssselect("tr")[0].cssselect("p")[1].text.strip()
            datedesc = table.cssselect("tr")[0].cssselect("td")[3].cssselect("p")[0].text.strip()
            exemption = table.cssselect("tr")[1].cssselect("td")[5].cssselect("p")[0].text.strip()
            fratil_indicator = table.cssselect("tr")[2].cssselect("td")[0].cssselect("p")[0].text.strip()
            doctype = ""
            if fratil_indicator.startswith("Til"):
                doctype = "U"
            elif fratil_indicator.startswith("Fra"):
                doctype = "I"
            elif fratil_indicator.startswith("Notat fra"):
                doctype = "N"
            else:
                raise ValueError("Unknown doctype %s" % fratil_indicator)
            fratil_agency = table.cssselect("tr")[2].cssselect("td")[1].cssselect("p")[0].text.strip()
            casedesc = table.cssselect("tr")[4].cssselect("td")[1].cssselect("p")[0].text.strip()
            docdesc = table.cssselect("tr")[5].cssselect("td")[1].cssselect("p")[0].text.strip()
            saksb = table.cssselect("tr")[0].cssselect("p")[5].text.strip()
            docdate = dateutil.parser.parse(datedesc.strip(), dayfirst=True)
            # Parse the Arkivsaksnr, e.g. "11/123 - 4" -> case 2011/123, document 4
            matchObj = re.match(r'(\d+)/(\d+)\s*-\s*(\d+)$', docid, re.M | re.I)
            if matchObj:
                caseyear = matchObj.group(1)
                caseseqnr = matchObj.group(2)
                casedocseq = matchObj.group(3)
                caseyear = expand_year(caseyear)
                caseid = str(caseyear) + "/" + str(caseseqnr)
            else:
                print "error: invalid Arkivsaksnr: " + docid
                matchObj = re.match(r'(\d+)/(\d+)\s*-', docid, re.M | re.I)
                if matchObj:
                    caseyear = expand_year(matchObj.group(1))
                    caseseqnr = matchObj.group(2)
                    caseid = str(caseyear) + "/" + str(caseseqnr)
                    casedocseq = None  # no document sequence number in this form
            if parser.is_sender_doctype(doctype):
                fratilfield = 'sender'
            elif parser.is_recipient_doctype(doctype):
                fratilfield = 'recipient'
            data = {
                'agency': agency,
                'docdate': docdate.date(),
                'recorddate': journal_date.date(),
                'docdesc': docdesc,
                'casedesc': casedesc,
                'caseid': caseid,
                'docid': docid,
                'caseyear': caseyear,
                'caseseqnr': caseseqnr,
                'casedocseq': casedocseq,
                fratilfield: fratil_agency,
                'doctype': doctype,
                'saksbehandler': saksb,
                'exemption': exemption,
                'scrapedurl': url,
                'scrapestamputc': datetime.datetime.now()
            }
            parser.verify_entry(data)
            datastore.append(data)
            scraperwiki.sqlite.save(unique_keys=['docid'], data=datastore)
            num_saved += len(datastore)
            datastore = []
            #print "Saved %s" % data['caseid']
    else:
        # TODO: log error or exit?
        msg = "Tried to scrape %s but got %s" % (date, journal_date.date())
        #raise ValueError(msg)
        print msg
    return num_saved

def scrape_date(parser, date):
    # The journal for a given day lives at OJ<ddmmyy>.htm
    url = base_url % (strftime("%d%m%y", date.timetuple()))
    html = fetch_url(url)
    if html:
        return save_date(parser, date, url, html)

base_url = 'http://www.ssb.no/omssb/journal/OJ%s.htm'
end_date = date.today()

# Resume from the last finished date if one is stored, otherwise start from
# the first journal published in 2011.
start_date_obj = scraperwiki.sqlite.get_var('last_finished_date')
if start_date_obj:
    start_date = pickle.loads(start_date_obj)
else:
    start_date = datetime.date(2011, 1, 3)
print "Start date %s" % start_date

parser = postlistelib.JournalParser(agency=agency)

# Journals are only published on weekdays.
for single_date in daterange(start_date, end_date):
    if single_date.weekday() < 5:
        num_saved = scrape_date(parser, single_date)
        print "Scraped %s found %s" % (single_date, num_saved)
        if num_saved > 0:
            scraperwiki.sqlite.save_var('last_finished_date', pickle.dumps(single_date))
        if num_saved is None:
            print "No more new. Exit..."
            break