# -*- coding: utf-8 -*-
# YAML-tagger:
# Type: kommune
# Status: unfinished
# Name: Hole kommune
# Format: HTML
# Datatype:
# Publish duration:
# Run: not yet
# Missingfields: casedocseq
import scraperwiki
import lxml.html
import datetime
import dateutil.parser
import urllib2
import urlparse
# Start page is the front page, to get it listed as the primary source
scraperwiki.scrape("http://www.hole.kommune.no/postjournaler.173497.no.html")
postlistelib=scraperwiki.swimport('postliste-python-lib')
agency = 'Hole kommune'
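# Overall flow: scraper() looks up the newest post list page to find the
# latest published journal date, get_journal_day() collects the detail page
# URLs for each day, fetch_postjournal() parses every detail page into an
# entry dict, and the entries are stored in the swdata sqlite table with
# scrapedurl as the unique key.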
def fetch_url(url):
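    # Fetch url with scraperwiki.scrape(), retrying up to three times on
    # urllib2.URLError.  Returns None if every attempt fails.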
    html = None
    for n in [1, 2, 3]:
        try:
            html = scraperwiki.scrape(url)
            break
        except urllib2.URLError, e:
            print "URLError fetching " + url + ", trying again"
    return html
def expand_id(value, fieldtype, entry):
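    # Split an id value like "12/1034" into a four-digit year and a sequence
    # number, store them as <fieldtype>year / <fieldtype>seqnr in entry, and
    # return the entry together with the normalised string (e.g. "2012/1034").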
    year, seqnr = value.split('/')
    year = int(year)
    seqnr = int(seqnr)
    if year < 50:
        year = year + 2000
    elif year < 100:
        year = year + 1900
    entry[fieldtype + 'year'] = year
    entry[fieldtype + 'seqnr'] = seqnr
    newvalue = str(year) + '/' + str(seqnr)
    return entry, newvalue
def fetch_postjournal(agency, url, datastore):
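    # Parse a single journalpost detail page: map the Norwegian field labels
    # to datastore field names via fieldmap, parse dates, expand case/journal
    # ids, and split the sender/recipient block into name, address and zip.
    # The finished entry is appended to datastore.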
# print "Scraping " + url
scrapestamputc = datetime.datetime.now()
html = fetch_url(url)
root = lxml.html.fromstring(html)
entry = {
'agency' : agency,
'scrapestamputc' : scrapestamputc,
'scrapedurl' : url,
}
fieldmap = {
u'Tittel på saken' : 'casedesc',
u'Tittel på dokumentet' : 'docdesc',
'Dokumentansvarlig' : 'saksansvarlig',
'Hjemmel' : 'exemption',
'DokumentID' : 'journalid',
'ArkivsakID' : 'caseid',
'Journaldato' : 'recorddate',
'Brevdato' : 'docdate',
#'Journalpostkategori' :
}
doctypemap = { # Valid codes are I, U, X, N, S
u'Innkommende dokument' : 'I',
u'Innkommende dokument (Gradert)' : 'I',
u'Utgående dokument' : 'U',
u'Utgående dokument (Gradert)' : 'U',
u'Utgående dokument (Ikke publisert)' : 'X',
u'Innkommende dokument (Ikke publisert)' : 'X',
u'Internt notat (Gradert)' : 'N',
u'Internt notat' : 'N',
}
for span in root.cssselect("div.innsyn-content"):
#print span.text_content()
doctype = span.cssselect("h1.header-head")[0].text_content().strip()
print doctype
entry['doctype'] = doctypemap[doctype]
trs = span.cssselect("div.nobox tr")
for tr in trs:
field = tr.cssselect("th.header-cell")[0].text_content().strip().replace(":","")
value = tr.cssselect("td.content-cell")[0].text_content().strip()
#print "'" + field + "' = " + value
if field in fieldmap:
field = fieldmap[field]
#print "hit"
if field in ['docdate','recorddate']:
value = dateutil.parser.parse(value, dayfirst=True).date()
if field == 'saksansvarlig' and -1 != value.find(','):
#print value
names = value.split(",", 1)
value = names[1].strip() + " " + names[0].strip()
if field == 'caseid':
entry, value = expand_id(value, 'case', entry)
if field == 'journalid':
entry, value = expand_id(value, 'journal', entry)
entry[field] = value
sendinfo = span.cssselect("div.dokmottakere")
if 0 < len(sendinfo):
if 'doctype' in entry and entry['doctype'] in ['U', 'X', 'N']:
field = 'recipient'
else:
field = 'sender'
# Value is "Margrethe Ingeland
Gravfossveien
3360 GEITHUS", should be split in person, addr and zip
            entry[field] = sendinfo[0].text
            brs = sendinfo[0].cssselect("br")
            if 3 == len(brs):
                addr = brs[0].tail + ", " + brs[1].tail
                zip = brs[2].tail
                entry[field + 'addr'] = addr
                entry[field + 'zip'] = zip
            elif 2 == len(brs):
                addr = brs[0].tail
                zip = brs[1].tail
                entry[field + 'addr'] = addr
                entry[field + 'zip'] = zip
            elif 1 == len(brs):
                zip = brs[0].tail
                entry[field + 'zip'] = zip
            elif 0 == len(brs):
                pass # Ignore
            else:
                raise ValueError("Unexpected number of address lines")
    print entry
    if 'doctype' in entry:
        entry['casedocseq'] = 0 # Fake value, not sure how to extract the real value
        datastore.append(entry)
    return
def get_journal_day(agency, date, startrow, jurlqueue):
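    # Collect the journalpost detail page URLs for one day into jurlqueue.
    # The listing is paged using startrow; if a 'neste' (next) link is present
    # in the table footer, recurse with startrow increased by 10 to fetch the
    # next page.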
datestr = str(date) + "T00:00:00"
url = "http://innsyn.hole.kommune.no/wfinnsyn.ashx?response=journalpost_postliste&showresults=true&fradato=%s&startrow=%d" % (datestr, startrow)
print url
html = fetch_url(url)
root = lxml.html.fromstring(html)
ahrefs = root.cssselect("table.inner-max-width tbody tr a")
for a in ahrefs:
href = a.attrib['href']
if -1 != href.find("/wfinnsyn.ashx?response=journalpost_detaljer&journalpostid="):
jurl = urlparse.urljoin(url, href)
jurlqueue.append(jurl)
ahrefs = root.cssselect("table.inner-max-width tfoot tr a")
for a in ahrefs:
if 'neste' == a.text_content():
get_journal_day(agency, date, startrow+10, jurlqueue)
def is_already_scraped(url):
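    # Return True if a row with this scrapedurl already exists in the swdata
    # table.  Any sqlite error (for example before the table has been created
    # on the first run) is treated as "not scraped yet".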
for sql in ["scrapedurl from swdata where scrapedurl = '" + url + "' limit 1"]:
try:
result = scraperwiki.sqlite.select(sql)
#int sql, " : ", result
if 0 < len(result) and u'scrapedurl' in result[0]:
return True
except:
print "Exception"
pass
return False
def minmax_recorddate(minmax):
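    # Return the smallest or largest recorddate in swdata as a datetime.date,
    # or None if the table is empty or does not exist yet.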
for sql in ["%s(recorddate) as recorddate from swdata" % minmax]:
try:
result = scraperwiki.sqlite.select(sql)
date = dateutil.parser.parse(result[0]['recorddate']).date()
return date
except:
pass
return None
def scraper():
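    # Find the latest journal date published on the site, then scrape every
    # day from the day after the newest recorddate already stored up to and
    # including that date.  Backwards scraping of older entries is currently
    # disabled by the early return further down.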
html = fetch_url("http://innsyn.hole.kommune.no/wfinnsyn.ashx?response=journalpost_postliste&showresults=true")
root = lxml.html.fromstring(html)
table = root.cssselect("table.inner-max-width")
#print table[0].text_content()
lastdate = dateutil.parser.parse(table[0].cssselect("caption")[0].text_content().replace("Postliste den ", ""), dayfirst=True).date()
maxdate = minmax_recorddate("max")
if maxdate:
startdate = maxdate + datetime.timedelta(days=1)
start = 0
end = (lastdate-startdate).days + 1
print maxdate, startdate, start, end
else:
startdate = maxdate
start = 0
end = 0
    for old in range(start, end):
        date = startdate + datetime.timedelta(days=old)
        print date
        urlqueue = []
        get_journal_day(agency, date, 0, urlqueue)
        datastore = []
        for jurl in urlqueue:
            if not is_already_scraped(jurl):
                res = fetch_postjournal(agency, jurl, datastore)
            if 0 < len(datastore):
                print datastore
                scraperwiki.sqlite.save(unique_keys=['scrapedurl'], data=datastore)
                datastore = []
mindate = minmax_recorddate("min")
# Only three months back
return
if mindate:
startdate = mindate - datetime.timedelta(days=1)
start = 0
end = -60
print mindate, startdate, start, end
else:
return
for old in range(start, end, -1):
date = startdate + datetime.timedelta(days=old)
print date
urlqueue = []
get_journal_day(agency, date, 0, urlqueue)
datastore = []
for jurl in urlqueue:
if not is_already_scraped(jurl):
res = fetch_postjournal(agency, jurl, datastore)
if 0 < len(datastore):
print datastore
scraperwiki.sqlite.save(unique_keys=['scrapedurl'], data=datastore)
datastore = []
#GET http://innsyn.hole.kommune.no/wfinnsyn.ashx?response=journalpost_postliste&showresults=true&fradato=2012-06-15T00:00:00
#GET http://innsyn.hole.kommune.no/wfinnsyn.ashx?response=journalpost_detaljer&journalpostid=2012005569&
#GET http://innsyn.hole.kommune.no/wfinnsyn.ashx?response=arkivsak_detaljer&arkivsakid=2006002016&
scraper()