# coding=utf-8
# YAML-tagger:
# Type: kommune
# Status: unfinished
# Name: Arendal kommune
# Format: HTML
# Datatype:
# Vendor:
# Run: not finished
# Missingfields: caseid, casedesc, etc
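#
# Scrapes the public postal journal (postliste) of Arendal kommune from the
# JSON web services exposed on www.arendal.kommune.no (the eDemokrati portal).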
import scraperwiki
import json
import httplib, urllib
import datetime
import dateutil.parser
import time
import re
agency = "Arendal kommune"
urlhost = "www.arendal.kommune.no"
fieldmap = {
    'AntallVedlegg' : '',
    'Arkivdel' : '',
    'AvsenderMottaker' : 'sender', # or recipient
    'Dokumentdato' : 'docdate',
    'Dokumentnummer' : 'casedocseq',
    'Dokumenttype' : 'doctype',
    'EkspedertDato' : '',
    'Hjemmel' : 'exemption',
    'Id' : 'id',
    'Innholdsbeskrivelse' : 'docdesc',
    'Mappetype' : '',
    'Offentlig' : 'ispublic',
    'PostlisteType' : 'doctype',
    'RegistrertDato' : 'recorddate',
    'SaksId' : '',
    'SaksNr' : 'caseid',
    'Sakstittel' : 'casedesc',
    #'SaksNr' : 'SA.SAAR + SA.SEKNR',
    'Saksansvarlig' : 'saksbehandler',
    'SaksansvarligEnhet' : '',
    'SaksansvarligEpost' : '',
    # 'scrapestamputc' : '',
    # 'scrapedurl' : '',
    # 'agency' : '',
}
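# Keys are the field names returned by the web service; values are the
# normalised names stored in the dataset.  An empty value means the original
# field name is kept unchanged (see fetch_journal_entry below).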
# Convert "/Date(1317808020000+0200)/" to a datetime object.
# FIXME Only the +0100 and +0200 offsets are handled; other timezone
# information is ignored.
def parse_datestr(datestr):
    match = re.split("[/()+]", datestr)
    # print match
    sinceepoch = float(match[2]) / 1000
    if match[3] == '0200':
        sinceepoch = sinceepoch + 2 * 60 * 60
    if match[3] == '0100':
        sinceepoch = sinceepoch + 1 * 60 * 60
    # print sinceepoch
    date = datetime.datetime.fromtimestamp(sinceepoch)
    # print date
    return date
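# Example: parse_datestr("/Date(1339538400000+0200)/") splits the string into
# ['', 'Date', '1339538400000', '0200', '', ''], so match[2] holds the
# millisecond timestamp and match[3] the offset; the result is
# datetime(2012, 6, 13, 0, 0) when the scraper runs in UTC.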

def reformat_caseid(caseid):
    # Input 12/13123, output 2012, 13123, "2012/13123"
    year, seqnr = caseid.split("/")
    year = int(year)
    if year < 100:
        year = year + 2000
    caseid = "%d/%s" % (year, seqnr)
    return year, int(seqnr), caseid
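# Example: reformat_caseid("12/8658") returns (2012, 8658, "2012/8658").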

def ws_post(url, urlhost, urlpath, params):
    jsonparams = json.dumps(params)
    headers = {"Content-type": "application/json; charset=utf-8",
               "Accept": "application/json"}
    conn = httplib.HTTPConnection(urlhost)
    #print jsonparams
    conn.request("POST", urlpath, jsonparams, headers)
    response = conn.getresponse()
    #print response.status, response.reason
    jsonres = response.read()
    res = json.loads(jsonres)
    #print res
    return res
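# The service wraps its JSON payload in a 'd' member, so callers read
# ws_post(...)['d'] (see the sample responses quoted in the comments below).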

def fetch_journal_entry(id):
    params = { "id" : str(id) }
    headers = {"Content-type": "application/json; charset=utf-8",
               "Accept": "application/json"}
    urlpath = "/Templates/eDemokrati/Services/eDemokratiService.svc/GetPostlisteObjekt"
    data = ws_post(None, urlhost, urlpath, params)['d']
    entry = None
    if data:
        del data['__type'] # This is useless, ignore
        print data
        entry = {}
        entry['agency'] = agency
        entry['scrapestamputc'] = datetime.datetime.now()
        entry['scrapedurl'] = "http://" + urlhost + urlpath
        # entry['scrapedurl'] = url
        for dfield in fieldmap.keys():
            if dfield in data and data[dfield]:
                # Use the mapped field name when one is defined, otherwise
                # keep the name used by the web service.
                if dfield in fieldmap and fieldmap[dfield] != "":
                    fieldname = fieldmap[dfield]
                else:
                    fieldname = dfield
                # Outgoing documents (Dokumenttype 'U') list a recipient,
                # not a sender.
                if 'sender' == fieldname:
                    if data['Dokumenttype'] == 'U':
                        fieldname = 'recipient'
                if dfield in ['RegistrertDato', 'Dokumentdato', 'EkspedertDato']:
                    entry[fieldname] = parse_datestr(data[dfield]).date()
                else:
                    entry[fieldname] = data[dfield]
            elif dfield in data:
                # Field present but empty, keep it under its original name.
                entry[dfield] = data[dfield]
        entry['caseyear'], entry['caseseqnr'], entry['caseid'] = reformat_caseid(entry['caseid'])
        # data["sourceurl"] = "http://" + server + path
        print entry
    return entry
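# Example: fetch_journal_entry(1507868) (an Id taken from the sample response
# quoted below) should return a dict ready for scraperwiki.sqlite.save(), or
# None if the service has nothing for that Id.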

def epoctime_to_datestr(epoctime):
    return "/Date(" + str(int(epoctime * 1000)) + ")/"

def get_last_entry_id():
    now = time.time()
    # Get the last week, as the most recent entry should be in this range
    fradato = epoctime_to_datestr(now - 7 * 24 * 60 * 60)
    tildato = epoctime_to_datestr(now)
    #print fradato
    maxid = 0
    # First look up the archive parts (arkivdeler) with entries in the period.
    urlpath = "/Templates/eDemokrati/Services/eDemokratiService.svc/GetPostlisteArkivdeler"
    params = {
        "dato": fradato,
        "tilDato": tildato,
        "søkestreng": ""}
    arkivdeler = ws_post(None, urlhost, urlpath, params)['d']
    # {u'd': [u'_', u'HVA-IFE-A', u'KAR-BR-A', u'KAR-BRUK-A', u'KAR-EIEN-A', u'KAR-ELBH-A', u'KAR-ELS-A', ...
    for arkivdel in arkivdeler:
        # Then the document types available in this archive part.
        urlpath = "/Templates/eDemokrati/Services/eDemokratiService.svc/GetPostlisteDokumenttyper"
        params = {
            "dato": fradato,
            "tilDato": tildato,
            "søkestreng": "",
            "arkivdel": arkivdel,
        }
        doctypes = ws_post(None, urlhost, urlpath, params)['d']
        #{"d":["I","N","S","U","X"]}
        # Finally search each archive part / document type combination and
        # keep track of the highest entry Id seen.
        urlpath = "/Templates/eDemokrati/Services/eDemokratiService.svc/GetPostlisteS%C3%B8k"
        for doctype in doctypes:
            params = {
                "fraDato": fradato,
                "tilDato": tildato,
                "søkestreng": "",
                "arkivdel": arkivdel,
                "dokumenttype": doctype,
            }
            entries = ws_post(None, urlhost, urlpath, params)['d']
            for entry in entries:
                #print entry['Id']
                id = int(entry['Id'])
                if id > maxid:
                    maxid = id
                # data = fetch_journal_entry(entry['Id'])
                # if data:
                #     scraperwiki.sqlite.save(unique_keys=['id'], data=data)
    return maxid
#{"d":[{"__type":"PostlisteObjekt:#SSP.NoarkServices","AntallVedlegg":1,"Dokumentnummer":2,"Dokumenttype":"I","EkspedertDato":null,"Hjemmel":null,"Id":1507868,"Innholdsbeskrivelse":"Tomtejustering - Lillebæk, eiendom 208\/1611","Offentlig":true,"RegistrertDato":"\/Date(1339538400000+0200)\/","SaksId":296971,"SaksNr":"12\/8658","Arkivdel":"KAR-EIEN-A","AvsenderMottaker":"Randi Wilberg","Dokumentdato":"\/Date(1339624800000+0200)\/","Mappetype":"DS","PostlisteType":"I","Saksansvarlig":null,"SaksansvarligEnhet":null,"SaksansvarligEpost":null,"Sakstittel":null},{"__type":"PostlisteObjekt:#SSP.NoarkServices","AntallVedlegg":4,"Dokumentnummer":1,"Dokumenttype":"I","EkspedertDato":null,"Hjemmel":null,"Id":1507865,"Innholdsbeskrivelse":"Søknkad om utvidelse av balkong - Kalleraveien 14","Offentlig":true,"RegistrertDato":"\/Date(1339538400000+0200)\/","SaksId":298804,"SaksNr":"12\/10480","Arkivdel":"KAR-EIEN-A","AvsenderMottaker":"Ole Henning Løken","Dokumentdato":"\/Date(1338847200000+0200)\/","Mappetype":"BS","PostlisteType":"I","Saksansvarlig":null,"SaksansvarligEnhet":null,"SaksansvarligEpost":null,"Sakstittel":null},...

def get_journal_entries_range(min, max, step):
    for id in range(min, max, step):
        data = fetch_journal_entry(id)
        #print data
        if data:
            scraperwiki.sqlite.save(unique_keys=['id'], data=data)

maxid = get_last_entry_id()
print "max id =", maxid

# Continue from the highest Id already scraped, or from a hard-coded starting
# Id on the first run.
try:
    start = scraperwiki.sqlite.select("max(id) as max from swdata")[0]['max'] + 1
except:
    start = 137459
print start, maxid
#if maxid > start + 20:
#    maxid = start + 10

# Scrape forward from the newest stored entry up to the current maximum ...
get_journal_entries_range(start, maxid, 1)

# ... and then backward from the oldest stored entry to pick up older entries.
start = scraperwiki.sqlite.select("min(id) as min from swdata")[0]['min'] - 1
end = start - 1000
print start, end
get_journal_entries_range(start, end, -1)