-rw-r--r--  scrapersources/postliste-universitetet-i-tromso  145
1 files changed, 143 insertions, 2 deletions
diff --git a/scrapersources/postliste-universitetet-i-tromso b/scrapersources/postliste-universitetet-i-tromso
index c2553db..cef5815 100644
--- a/scrapersources/postliste-universitetet-i-tromso
+++ b/scrapersources/postliste-universitetet-i-tromso
@@ -9,11 +9,11 @@
# Run: daily
#
# The PDF/ePhorte scraper is done, but the new HTML format is not yet
-# handled.
+# handled. The HTML version is missing docdate, journalseqnr,
+# journalyear, journalid.
import scraperwiki
import json
-from BeautifulSoup import BeautifulSoup
import datetime
import dateutil.parser
import lxml.html
@@ -77,6 +77,140 @@ def process_journal_pdfs(parser, listurl, errors):
# print "Will process " + url
process_pdf(parser, url, errors)
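+# Flatten the multi-line sender/recipient values in the HTML to a single comma separated string.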
+def newlinetocomma(text):
+    return re.sub(r'\r\n\s*', ', ', text)
+
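+# Parse one day's HTML search result page and hand every journal entry found to saver().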
+def fetch_postjournal_day(parser, url, html, saver):
+    root = lxml.html.fromstring(html)
+# print html
+
+    count = None
+    for h2 in root.cssselect("h2"):
+        header = h2.text_content().strip()
+        matchObj = re.match( r'Søkeresultat \((\d+)\)$', header, re.M|re.I)
+        if matchObj:
+            count = int(matchObj.group(1))
+            break
+
+    datastore = []
+    for table in root.cssselect("table[summary='Offentlig journal dokumenter']"):
+# print table.text_content()
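+        # Each entry is rendered as a table of field/value cells; collect them into a dict keyed on the Norwegian field label.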
+        cells = table.cssselect("th,td")
+        i = 0
+        entrydata = {}
+        while i < len(cells) - 1:
+            field = cells[i].text_content().strip(' \n\t\r')
+            value = cells[i+1].text_content().strip(' \n\t\r')
+            entrydata[field] = value
+            i = i + 1
+# print entrydata
+
+        recorddate = dateutil.parser.parse(entrydata['Journaldato:'].strip(), dayfirst=True)
+        docdesc = entrydata['Beskrivelse:'].strip()
+        casedesc = entrydata['Sakstittel:']
+        doctype = entrydata['Dokument type:']
+
+        saksbehandler = entrydata['Saksbehandler:'].strip()
+        saksansvarlig = entrydata['Ansvarlig:'].strip()
+
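+        # The case reference ("Journalpost:") is expected to look like 2012/1234-5: case year, case sequence number and document sequence number within the case.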
+        arkivsaksref = entrydata['Journalpost:']
+        caseyear = 0
+        caseseqnr = 0
+        casedocseq = 0
+        caseid = 'unknown'
+        matchObj = re.match( r'(\d+)/(\d+)-(\d+)$', arkivsaksref, re.M|re.I)
+        if matchObj:
+            caseyear = matchObj.group(1)
+            caseseqnr = matchObj.group(2)
+            casedocseq = matchObj.group(3)
+            caseid = str(caseyear) + "/" + str(caseseqnr)
+        else:
+            print "error: invalid Arkivsaksnr: " + arkivsaksref
+            raise Exception("Unable to parse %s" % url)
+
+        exemption = None
+        if 'Gradering:' in entrydata:
+            exemption = entrydata['Gradering:']
+
+        data = {
+            'agency' : parser.agency,
+            'recorddate' : recorddate.date(),
+# 'docdate' : docdate.date(),
+            'docdesc' : docdesc,
+            'casedesc' : casedesc,
+
+            'caseyear' : int(caseyear),
+            'caseseqnr' : int(caseseqnr),
+            'casedocseq' : int(casedocseq),
+            'caseid' : caseid,
+            'doctype' : doctype,
+
+# 'journalseqnr' : int(journalseqnr),
+# 'journalyear' : int(journalyear),
+# 'journalid' : journalid,
+
+            'saksbehandler' : saksbehandler,
+            'saksansvarlig' : saksansvarlig.strip(),
+# 'saksansvarligenhet' : saksansvarligenhet.strip(),
+
+            'arkivsaksref' : arkivsaksref,
+# 'laapenr' : laapenr,
+
+            'scrapedurl' : url,
+            'scrapestamputc' : datetime.datetime.now()
+        }
+
+        if 'Fra:' in entrydata:
+            sender = newlinetocomma(entrydata['Fra:'])
+            if sender != '**** **** **** ****':
+                data['sender'] = sender
+        if 'Til:' in entrydata:
+            recipient = newlinetocomma(entrydata['Til:'])
+            if recipient != '**** **** **** ****':
+                data['recipient'] = recipient
+
+        if exemption is not None:
+            data['exemption'] = exemption
+
+        print data
+        parser.verify_entry(data)
+        datastore.append(data)
+
+# If paging were done here and not in find_day_urls
+# if count != len(datastore):
+# raise ValueError("Unable to find all entries on %s, found %d of %d" % (url, len(datastore), count))
+
+    saver(unique_keys=['arkivsaksref'], data=datastore)
+
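+# Ask the uit.no count service which days of the given year have journal entries, and return the search result URLs (100 entries per page) for the days not already scraped.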
+def find_day_urls(parser, year):
+    urls = []
+    for month in range(1, 12+1):
+        url = "http://uit.no/samfunn/offjour/count?year=%d&month=%d" % (year, month)
+        jsonres = scrape(url)
+        res = json.loads(jsonres)
+# print res
+        for daystr in res['month']:
+            count = int(res['month'][daystr])
+            if count > 0:
+                matchObj = re.match( r'day(\d+)$', daystr, re.M|re.I)
+                if matchObj:
+                    day = int(matchObj.group(1))
+# print year, month, day, res['month'][daystr]
+                    nordatestr = "%02d.%02d.%d" % (day, month, year)
+                    htmlpagesize = 100
+                    for page in range(int(count / htmlpagesize)+1):
+                        url = "http://uit.no/samfunn/offjour?elementsprpage=%d&pageindex=%d&uitgyldigfra=%s&uitgyldigtil=%s&searchtitle=&searchinnhold=" % (htmlpagesize, page, nordatestr, nordatestr)
+                        if not parser.is_already_scraped(url):
+                            urls.append(url)
+                else:
+                    raise ValueError("Unable to parse day string '%s'" % daystr)
+    return urls
+
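+# Dummy saver used while the HTML parser is tested; switch to the commented out scraperwiki.sqlite.save() call to store the entries for real.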
+def saver(unique_keys, data):
+    print "Not saving data"
+    return
+# scraperwiki.sqlite.save(unique_keys, data)
+
def test_small_pdfs(parser):
# Test with some smaller PDFs
errors = []
@@ -90,6 +224,13 @@ parser = postlistelib.PDFJournalParser(agency=agency)
#test_small_pdfs(parser)
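+# Scrape the HTML journal day by day for 2014 and 2015 before processing the PDF journals as before.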
+urls = []
+urls.extend(find_day_urls(parser, 2014))
+urls.extend(find_day_urls(parser, 2015))
+for url in urls:
+    html = scrape(url)
+    fetch_postjournal_day(parser, url, html, saver=saver)
+
process_journal_pdfs(parser, "http://uit.no/om/enhet/artikkel?p_document_id=382893&p_dimension_id=88216", errors)
process_page_queue(parser, errors)
report_errors(errors)