# -*- coding: utf-8 -*-
# YAML-tagger:
#  Type: university
#  Status: unfinished
#  Name: Universitetet i Tromsø
#  Format: PDF/HTML
#  Datatype: ePhorte
#  Vendor: Ergo
#  Run: daily
#  Missingfields: docdate journalseqnr journalyear journalid
#
# The PDF/ePhorte scraper is done, but the new HTML format is not yet
# fully handled: the HTML entries are missing docdate, journalseqnr,
# journalyear and journalid.

import scraperwiki
import json
import datetime
import dateutil.parser
import lxml.html
import resource
import sys
import urlparse
import re

# Make sure Scraperwiki believes this is the source for this database
scraperwiki.scrape("http://uit.no/om/offjour")

lazycache = scraperwiki.swimport('lazycache')
postlistelib = scraperwiki.swimport('postliste-python-lib')

agency = u'Universitetet i Tromsø'

def report_errors(errors):
    if 0 < len(errors):
        print "Errors:"
        for e in errors:
            print e
        raise ValueError("Something went wrong")

def out_of_cpu(arg, spent, hard, soft):
    report_errors(arg)

def process_pdf(parser, pdfurl, errors):
    postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
    try:
        pdfcontent = scraperwiki.scrape(pdfurl)
        parser.preprocess(pdfurl, pdfcontent)
        pdfcontent = None
    except ValueError as e:
        errors.append(e)
    except IndexError as e:
        errors.append(e)

def process_page_queue(parser, errors):
    try:
        parser.process_pages()
        postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
    except scraperwiki.CPUTimeExceededError:
        errors.append("Processing pages interrupted")

def process_journal_pdfs(parser, listurl, errors):
#    print "Finding PDFs on " + listurl
    html = scraperwiki.scrape(listurl)
    root = lxml.html.fromstring(html)
    html = None
    for ahref in root.cssselect("div.nyhArtikkel a"):
        href = ahref.attrib['href']
        url = urlparse.urljoin(listurl, href)
        # Skip file:// URLs and URLs that do not point to a PDF
        if -1 != href.find("file://") or -1 == url.find(".pdf"):
            continue
        if parser.is_already_scraped(url):
            pass # print "Skipping already scraped " + url
        else:
#            print "Will process " + url
            process_pdf(parser, url, errors)

def newlinetocomma(text):
    return re.sub(r'\r\n\s*', ', ', text)

# Parse one HTML day page and store the journal entries found in it.
def fetch_postjournal_day(parser, url, html, saver):
    root = lxml.html.fromstring(html)
#    print html
    count = None
    for h2 in root.cssselect("h2"):
        header = h2.text_content().strip()
        matchObj = re.match(ur'Søkeresultat \((\d+)\)$', header, re.M|re.I)
        if matchObj:
            count = int(matchObj.group(1))
            break

    datastore = []
    for table in root.cssselect("table[summary='Offentlig journal dokumenter']"):
#        print table.text_content()
        # Walk the th/td cells as a sliding window, pairing each cell with
        # the next; only the known field labels (like 'Journaldato:') are
        # looked up below, so the extra value->label pairs are harmless.
        cells = table.cssselect("th,td")
        i = 0
        entrydata = {}
        while i < len(cells) - 1:
            field = cells[i].text_content().strip(' \n\t\r')
            value = cells[i+1].text_content().strip(' \n\t\r')
            entrydata[field] = value
            i = i + 1
#        print entrydata

        recorddate = dateutil.parser.parse(entrydata['Journaldato:'].strip(),
                                           dayfirst=True)
        docdesc = entrydata['Beskrivelse:'].strip()
        casedesc = entrydata['Sakstittel:']

        doctype = entrydata['Dokument type:']
        if not parser.is_valid_doctype(doctype):
            doctype = {
                'S' : 'X',
                }[doctype]

        saksbehandler = entrydata['Saksbehandler:'].strip()
        saksansvarlig = entrydata['Ansvarlig:'].strip()

        # The case reference looks like "caseyear/caseseqnr-casedocseq"
        arkivsaksref = entrydata['Journalpost:']
        caseyear = 0
        caseseqnr = 0
        casedocseq = 0
        caseid = 'unknown'
        matchObj = re.match(r'(\d+)/(\d+)-(\d+)$', arkivsaksref, re.M|re.I)
        if matchObj:
            caseyear = matchObj.group(1)
            caseseqnr = matchObj.group(2)
            casedocseq = matchObj.group(3)
            caseid = str(caseyear) + "/" + str(caseseqnr)
        else:
            print "error: invalid Arkivsaksnr: " + arkivsaksref
            raise Exception("Unable to parse %s" % url)

        exemption = None
        if 'Gradering:' in entrydata:
            exemption = entrydata['Gradering:']

        data = {
            'agency' : parser.agency,
            'recorddate' : recorddate.date(),
#            'docdate' : docdate.date(),
            'docdesc' : docdesc,
            'casedesc' : casedesc,

            'caseyear' : int(caseyear),
            'caseseqnr' : int(caseseqnr),
            'casedocseq' : int(casedocseq),
            'caseid' : caseid,
            'doctype' : doctype,

#            'journalseqnr' : int(journalseqnr),
#            'journalyear' : int(journalyear),
#            'journalid' : journalid,

            'saksbehandler' : saksbehandler,
            'saksansvarlig' : saksansvarlig.strip(),
#            'saksansvarligenhet' : saksansvarligenhet.strip(),

            'arkivsaksref' : arkivsaksref,
#            'laapenr' : laapenr,
            'scrapedurl' : url,
            'scrapestamputc' : datetime.datetime.now()
            }

        # Anonymised senders/recipients are shown as "**** **** **** ****"
        if 'Fra:' in entrydata:
            sender = newlinetocomma(entrydata['Fra:'])
            if sender != '**** **** **** ****':
                data['sender'] = sender
        if 'Til:' in entrydata:
            recipient = newlinetocomma(entrydata['Til:'])
            if recipient != '**** **** **** ****':
                data['recipient'] = recipient
        if exemption is not None:
            data['exemption'] = exemption

        print data
        parser.verify_entry(data)
        datastore.append(data)

    # If paging were done here and not in find_day_urls()
#    if count != len(datastore):
#        raise ValueError("Unable to find all entries on %s, found %d of %d" % (url, len(datastore), count))

    saver(unique_keys=['arkivsaksref'], data=datastore)

# Use the JSON count API to build the list of day pages with entries for
# the given year, skipping pages that are already scraped.
def find_day_urls(parser, year):
    urls = []
    for month in range(1, 12+1):
        url = "http://uit.no/samfunn/offjour/count?year=%d&month=%d" % (year, month)
        jsonres = scraperwiki.scrape(url)
        res = json.loads(jsonres)
#        print res
        for daystr in res['month']:
            count = int(res['month'][daystr])
            if count > 0:
                matchObj = re.match(r'day(\d+)$', daystr, re.M|re.I)
                if matchObj:
                    day = int(matchObj.group(1))
#                    print year, month, day, res['month'][daystr]
                    nordatestr = "%02d.%02d.%d" % (day, month, year)
                    htmlpagesize = 100
                    for page in range(int((count-1) / htmlpagesize)+1):
                        url = "http://uit.no/samfunn/offjour?elementsprpage=%d&pageindex=%d&uitgyldigfra=%s&uitgyldigtil=%s&searchtitle=&searchinnhold=" % (htmlpagesize, page+1, nordatestr, nordatestr)
                        if not parser.is_already_scraped(url):
                            urls.append(url)
                else:
                    raise ValueError("Unable to parse day string '%s'" % daystr)
    return urls

def saver(unique_keys, data):
#    print "Not saving data"
#    return
    scraperwiki.sqlite.save(unique_keys, data)

def test_small_pdfs(parser):
    # Test with some smaller PDFs
    errors = []
    process_pdf(parser, "http://uit.no/Content/382902/Januar%202011.pdf", errors)
    process_page_queue(parser, errors)
    report_errors(errors)
    sys.exit(0)

errors = []
parser = postlistelib.PDFJournalParser(agency=agency)

#test_small_pdfs(parser)

process_page_queue(parser, errors)
process_journal_pdfs(parser, "http://uit.no/om/enhet/artikkel?p_document_id=382893&p_dimension_id=88216", errors)

urls = []
urls.extend(find_day_urls(parser, 2013))
urls.extend(find_day_urls(parser, 2014))
urls.extend(find_day_urls(parser, 2015))
for url in urls:
    html = scraperwiki.scrape(url).decode('utf-8')
    fetch_postjournal_day(parser, url, html, saver=saver)

report_errors(errors)
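
# A minimal sketch of a test helper for the HTML path, mirroring
# test_small_pdfs() above.  The helper name, the default date and the page
# size are example values only (not part of the original scraper); the query
# parameters are the same ones built in find_day_urls().
def test_single_html_day(parser, nordatestr="17.01.2013"):
    url = "http://uit.no/samfunn/offjour?elementsprpage=100&pageindex=1&uitgyldigfra=%s&uitgyldigtil=%s&searchtitle=&searchinnhold=" % (nordatestr, nordatestr)
    html = scraperwiki.scrape(url).decode('utf-8')
    fetch_postjournal_day(parser, url, html, saver=saver)
#test_single_html_day(parser)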