# -*- coding: utf-8 -*-
# YAML-tagger:
# Type: kommune
# Status: unfinished
# Name: Lindesnes kommune
# Format: HTML
# Datatype:
# Vendor:
# Run: not finished
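#
# Scraper for the public post journal (postjournal) of Lindesnes
# kommune, published using the ePhorte innsyn web interface.  Journal
# entries are fetched one at a time by ePhorte ID and stored in the
# ScraperWiki sqlite datastore with 'queryid' as the unique key.
#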
import scraperwiki
import lxml.html
import datetime
import dateutil.parser
import urllib2
# http://innsyn.lindesnes.kommune.no/Publikum/Modules/innsyn.aspx?mode=pl&SelPanel=0&ObjectType=ePhorteRegistryEntry&VariantType=Innsyn&ViewType=Table&Query=RecordDate%3a%28-14%29+AND+ResponsibleUnitID%3a%2811%29+AND+DocumentType%3a%28I%2cU%29
def fetch_url(url):
    # Fetch a URL, retrying up to three times, as the innsyn server
    # occasionally fails to respond.  Return None if all attempts fail.
    html = None
    for n in [1, 2, 3]:
        try:
            html = scraperwiki.scrape(url)
            break
        except urllib2.URLError:
            print "URLError fetching " + url + ", trying again"
    return html

def make_url(id):
    url = "http://innsyn.lindesnes.kommune.no/Publikum/Modules/innsyn.aspx?mode=pl&SelPanel=0&ObjectType=ePhorteRegistryEntry&VariantType=Innsyn&ViewType=Detail&Query=ID:" + str(id)
    return url

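# For example, make_url(71836) returns the detail page URL ending in
# "Query=ID:71836" for the journal entry with ePhorte ID 71836.
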
# Map from the Norwegian document type labels used on the web pages to
# the one-letter NOARK document type codes.
doctypemap = {
    u'Inngående brev' : 'I',
    u'Utgående brev' : 'U',
    u'Internt notat' : 'N',
    u'Internt notat uten oppfølging' : 'X',
    u'Saksframlegg/innstilling' : 'S',
    u'Dokumentpost i saksmappe' : 'Y', # Code not in NOARK, value based on http://img6.custompublish.com/getfile.php/1168825.136.pqftpqctyt/Ephorte-brukerveiledning_2.1.15.pdf?return=www.kafjord.kommune.no
}

def fetch_postjournal(agency, id, url, datastore):
#    print "Scraping " + url
    scrapestamputc = datetime.datetime.now()
    html = fetch_url(url)
    root = lxml.html.fromstring(html.decode('utf-8'))
    entry = {
        'agency' : agency,
        'scrapestamputc' : scrapestamputc,
        'scrapedurl' : url,
        'queryid' : id,
    }
    for span in root.cssselect("div.robots-content span.Element"):
#        print span.text_content()
        field = None
        value = None
        if span.cssselect("h3"):
            field = span.cssselect("h3")[0].text_content().strip()
            value = span.cssselect("span.Content span")[0].text_content().strip()
        elif span.cssselect("h2"):
            field = span.cssselect("h2")[0].text_content().strip()
            # FIXME
            value = ""
        elif span.cssselect("h1"):
            field = "docdesc"
            value = span.cssselect("h1")[0].text_content().strip()
#        else:
#            raise ValueError("Unexpected span")
#        print field + " = " + value

        # Translate the Norwegian field names to the English names used
        # in the datastore, and parse document types and dates.
        if 'Type' == field:
            field = 'doctype'
            value = doctypemap[value]
        elif 'Journaldato' == field:
            field = 'recorddate'
            value = dateutil.parser.parse(value, dayfirst=True)
        elif 'Dokumentdato' == field:
            field = 'docdate'
            value = dateutil.parser.parse(value, dayfirst=True)
        elif u'Tilhører sak' == field:
            field = 'casedesc'
        elif 'Avsender/Mottaker' == field:
            # For outgoing and internal documents this field holds the
            # recipient, for incoming documents the sender.
            if 'doctype' in entry and entry['doctype'] in ['U', 'X', 'N']:
                field = 'recipient'
            else:
                field = 'sender'
            td = span.cssselect("table td")
            if td:
                name = td[0].text_content().strip()
                addr = td[1].text_content().strip()
                zip = td[2].text_content().strip()
#                print "N: '",name, "' '", addr, "' '", zip, "'"
                entry[field] = name
                entry[field + 'addr'] = addr
                entry[field + 'zip'] = zip
                field = ''
#        elif 'Saksbehandlende enhet' == field:
#        elif 'Saksbehandler' == field:
        if field is not None and '' != field:
            entry[field] = value
    print entry
    if 'doctype' in entry:
        datastore.append(entry)

agency = 'Lindesnes kommune'

def scrape_range(start, end, step, agency):
    datastore = []
    for id in range(start, end, step):
        fetch_postjournal(agency, id, make_url(id), datastore)
        # Save to the sqlite datastore in batches of 10 entries.
        if 0 < len(datastore) and 0 == (len(datastore) % 10):
            #print datastore
            scraperwiki.sqlite.save(unique_keys=['queryid'], data=datastore)
            datastore = []
    if 0 < len(datastore):
        scraperwiki.sqlite.save(unique_keys=['queryid'], data=datastore)

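# Extend the datastore in both directions: forward from the highest
# ID seen so far, and backward from the lowest.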
def scraper():
    try:
        min = scraperwiki.sqlite.select("min(queryid) as min from swdata")[0]["min"]
        max = scraperwiki.sqlite.select("max(queryid) as max from swdata")[0]["max"]
    except:
        # Random number around 2012-05-15 (ie recent when I wrote this scraper)
        min = max = 71836
    scrape_range(max, max + 200, 1, agency)
    scrape_range(min - 1, min - 3000, -1, agency)

if __name__ == "scraper":
    scraper()
else:
    print "Not called as scraper"