-rw-r--r-- | scrapersources/postliste-universitetet-i-tromso | 145
1 file changed, 143 insertions, 2 deletions
diff --git a/scrapersources/postliste-universitetet-i-tromso b/scrapersources/postliste-universitetet-i-tromso
index c2553db..cef5815 100644
--- a/scrapersources/postliste-universitetet-i-tromso
+++ b/scrapersources/postliste-universitetet-i-tromso
@@ -9,11 +9,11 @@
 # Run: daily
 #
 # The PDF/ePhorte scraper is done, but the new HTML format is not yet
-# handled.
+# handled. The HTML version is missing docdate, journalseqnr,
+# journalyear, journalid.
 
 import scraperwiki
 import json
-from BeautifulSoup import BeautifulSoup
 import datetime
 import dateutil.parser
 import lxml.html
@@ -77,6 +77,140 @@ def process_journal_pdfs(parser, listurl, errors):
 #        print "Will process " + url
         process_pdf(parser, url, errors)
 
+def newlinetocomma(str):
+    return re.sub('\r\n\s*', ', ', str)
+
+def fetch_postjournal_day(parser, url, html, saver):
+    root = lxml.html.fromstring(html)
+#    print html
+
+    count = None
+    for h2 in root.cssselect("h2"):
+        header = h2.text_content().strip()
+        matchObj = re.match( r'Søkeresultat \((\d+)\)$', header, re.M|re.I)
+        if matchObj:
+            count = int(matchObj.group(1))
+            break
+
+    datastore = []
+    for table in root.cssselect("table[summary='Offentlig journal dokumenter']"):
+#        print table.text_content()
+        cells = table.cssselect("th,td")
+        i = 0
+        entrydata = {}
+        while i < len(cells) - 1:
+            field = cells[i].text_content().strip(' \n\t\r')
+            value = cells[i+1].text_content().strip(' \n\t\r')
+            entrydata[field] = value
+            i = i + 1
+#        print entrydata
+
+        recorddate = dateutil.parser.parse(entrydata['Journaldato:'].strip(), dayfirst=True)
+        docdesc = entrydata['Beskrivelse:'].strip()
+        casedesc = entrydata['Sakstittel:']
+        doctype = entrydata['Dokument type:']
+
+        saksbehandler = entrydata['Saksbehandler:'].strip()
+        saksansvarlig = entrydata['Ansvarlig:'].strip()
+
+        arkivsaksref = entrydata['Journalpost:']
+        caseyear = 0
+        caseseqnr = 0
+        casedocseq = 0
+        caseid = 'unknown'
+        matchObj = re.match( r'(\d+)/(\d+)-(\d+)$', arkivsaksref, re.M|re.I)
+        if matchObj:
+            caseyear = matchObj.group(1)
+            caseseqnr = matchObj.group(2)
+            casedocseq = matchObj.group(3)
+            caseid = str(caseyear) + "/" + str(caseseqnr)
+        else:
+            print "error: invalid Arkivsaksnr: " + arkivsaksref
+            raise Exception("Unable to parse %s" % url)
+
+        exemption = None
+        if 'Gradering:' in entrydata:
+            exemption = entrydata['Gradering:']
+
+        data = {
+            'agency' : parser.agency,
+            'recorddate' : recorddate.date(),
+#            'docdate' : docdate.date(),
+            'docdesc' : docdesc,
+            'casedesc' : casedesc,
+
+            'caseyear' : int(caseyear),
+            'caseseqnr' : int(caseseqnr),
+            'casedocseq' : int(casedocseq),
+            'caseid' : caseid,
+            'doctype' : doctype,
+
+#            'journalseqnr' : int(journalseqnr),
+#            'journalyear' : int(journalyear),
+#            'journalid' : journalid,
+
+            'saksbehandler' : saksbehandler,
+            'saksansvarlig' : saksansvarlig.strip(),
+#            'saksansvarligenhet' : saksansvarligenhet.strip(),
+
+            'arkivsaksref' : arkivsaksref,
+#            'laapenr' : laapenr,
+
+            'scrapedurl' : url,
+            'scrapestamputc' : datetime.datetime.now()
+            }
+
+        if 'Fra:' in entrydata:
+            sender = newlinetocomma(entrydata['Fra:'])
+            if sender != '**** **** **** ****':
+                data['sender'] = sender
+        if 'Til:' in entrydata:
+            recipient = newlinetocomma(entrydata['Til:'])
+            if recipient != '**** **** **** ****':
+                data['recipient'] = recipient
+
+        if exemption is not None:
+            data['exemption'] = exemption
+
+        print data
+        parser.verify_entry(data)
+        datastore.append(data)
+
+# If paging were done here and not in find_day_urls
+#    if count != len(datastore):
+#        raise ValueError("Unable to find all entries on %s, found %d of %d" % (url, len(datastore), count))
+
+    saver(unique_keys=['arkivsaksref'], data=datastore)
+
+def find_day_urls(parser, year):
+    urls=[]
+    for month in range(12+1):
+        url="http://uit.no/samfunn/offjour/count?year=%d&month=%d" % (year, month)
+        jsonres = scrape(url)
+        res = json.loads(jsonres)
+#        print res
+        for daystr in res['month']:
+            count = int(res['month'][daystr])
+            if count > 0:
+                matchObj = re.match( r'day(\d+)$', daystr, re.M|re.I)
+                if matchObj:
+                    day = int(matchObj.group(1))
+#                    print year, month, day, res['month'][daystr]
+                    nordatestr = "%02d.%02d.%d" % (day, month, year)
+                    htmlpagesize=100
+                    for page in range(int(count / htmlpagesize)+1):
+                        url="http://uit.no/samfunn/offjour?elementsprpage=%d&pageindex=%d&uitgyldigfra=%s&uitgyldigtil=%s&searchtitle=&searchinnhold=" % (htmlpagesize, page, nordatestr, nordatestr)
+                        if not parser.is_already_scraped(url):
+                            urls.append(url)
+                else:
+                    raise ValueError("Unable to parse day string '%s'" % daystr)
+    return urls
+
+def saver(unique_keys, data):
+    print "Not saving data"
+    return
+#    scraperwiki.sqlite.save(unique_keys, data)
+
 def test_small_pdfs(parser):
     # Test with some smaller PDFs
     errors = []
@@ -90,6 +224,13 @@ parser = postlistelib.PDFJournalParser(agency=agency)
 
 #test_small_pdfs(parser)
 
+urls = []
+urls.extend(find_day_urls(parser, 2014))
+urls.extend(find_day_urls(parser, 2015))
+for url in urls:
+    html = scrape(url)
+    fetch_postjournal_day(parser, url, html, saver=saver)
+
 process_journal_pdfs(parser, "http://uit.no/om/enhet/artikkel?p_document_id=382893&p_dimension_id=88216", errors)
 process_page_queue(parser, errors)
 report_errors(errors)
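
A note on the table parsing in fetch_postjournal_day above: each journal entry is rendered as a table of alternating th/td cells, and the scraper pairs neighbouring cells into a field/value dictionary. Below is a minimal standalone sketch of that pairing; the HTML fragment is made up for illustration, while the table summary attribute and the strip() arguments match the code in the diff.

# Sketch of the th/td pairing in fetch_postjournal_day.  The HTML
# fragment below is a made-up example for illustration only.
import lxml.html

html = """
<table summary='Offentlig journal dokumenter'>
  <tr><th>Journaldato:</th><td>03.02.2014</td></tr>
  <tr><th>Journalpost:</th><td>2014/1234-5</td></tr>
</table>
"""
root = lxml.html.fromstring(html)
for table in root.cssselect("table[summary='Offentlig journal dokumenter']"):
    entrydata = {}
    cells = table.cssselect("th,td")
    # The scraper walks the cells one at a time; stepping two at a time
    # gives the same field -> value mapping for the th keys.
    for i in range(0, len(cells) - 1, 2):
        field = cells[i].text_content().strip(' \n\t\r')
        value = cells[i+1].text_content().strip(' \n\t\r')
        entrydata[field] = value
    print(entrydata)
# e.g. {'Journaldato:': '03.02.2014', 'Journalpost:': '2014/1234-5'}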
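
The 'Journalpost:' reference is split with the regex r'(\d+)/(\d+)-(\d+)$' into case year, case sequence number and document sequence number, which together form the caseid. Here is a standalone sketch of that step; the sample reference "2014/1234-5" is invented for illustration.

# Sketch of the arkivsaksref parsing in fetch_postjournal_day; the
# reference "2014/1234-5" is a made-up example.
import re

def parse_arkivsaksref(arkivsaksref):
    matchObj = re.match(r'(\d+)/(\d+)-(\d+)$', arkivsaksref)
    if not matchObj:
        raise ValueError("invalid Arkivsaksnr: %s" % arkivsaksref)
    caseyear, caseseqnr, casedocseq = matchObj.groups()
    caseid = "%s/%s" % (caseyear, caseseqnr)
    return int(caseyear), int(caseseqnr), int(casedocseq), caseid

print(parse_arkivsaksref("2014/1234-5"))
# (2014, 1234, 5, '2014/1234')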
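
find_day_urls first asks the JSON count endpoint how many entries exist per day (the response appears to map keys like 'day5' to counts, judging from how the code indexes it), then builds one search URL per page of 100 entries. The sketch below shows only that paging arithmetic; the helper name day_page_urls and the count 231 are made up, while the URL template comes from the diff.

# Sketch of the per-day paging used by find_day_urls.  day_page_urls and
# the count value 231 are illustrative only; the real count comes from
# http://uit.no/samfunn/offjour/count?year=...&month=...
def day_page_urls(count, nordatestr, htmlpagesize=100):
    urls = []
    for page in range(int(count / htmlpagesize) + 1):
        urls.append("http://uit.no/samfunn/offjour?elementsprpage=%d&pageindex=%d"
                    "&uitgyldigfra=%s&uitgyldigtil=%s&searchtitle=&searchinnhold="
                    % (htmlpagesize, page, nordatestr, nordatestr))
    return urls

for u in day_page_urls(231, "03.02.2014"):
    print(u)
# 231 entries on 3 February 2014 give three pages: pageindex 0, 1 and 2.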