# -*- coding: utf-8 -*-
# YAML-tagger:
#  Type: statlig
#  Status: finished
#  Name: Offentlig Elektronisk postjournal
#  Format: HTML
#  Datatype:
#  Vendor: DIFI
#  Run: hourly

import scraperwiki
import lxml.html
import datetime
import time
import resource
import httplib
import urllib2

# Try several times, since writes can fail as the database gets bigger
writetries = 8

# Try several times if there is a problem with the service
readtries = 3

# Set to False to not look for entries earlier than the earliest one
# already in the database.
read_backwards = True

# Set to False to not rescan entries after a delay.  Rescanning
# discovers updates and entries introduced after we checked the ID the
# first time.
rescan_after_a_while = True


# http://www.oep.no/search/resultSingle.html?journalPostId=1000000
# http://www.oep.no/search/resultSingle.html?journalPostId=3889259

#                 <table class="defaultTable">
#                     <tr>
#                         <th class="noLeftBorder" style="width: 20%;">Agency:</th>
#                         <td class="noRightBorder" style="width: 80%;">Kulturdepartementet</td>
#                     </tr>
#                     <tr>
#                         <th class="noLeftBorder">Case:</th>
#                         <td class="noRightBorder">DNT Oslo og Omegn - rehabilitering og utvidelse av turisthytta Snøheim på Dovre - spillemidler til anlegg for friluftsliv i fjellet 2011</td>
#                     </tr>
#                     <tr>
#                         <th class="noLeftBorder">Document title:</th>
#                         <td class="noRightBorder">DNT Oslo og Omegn - turisthytta Snøheim på Dovre - eventuelt navnebytte</td>
#                     </tr>
#                     <tr>
#                         <th class="noLeftBorder">Case number:</th>
#                         <td class="noRightBorder">2010/04027</td>
#                     </tr>
#                     <tr>
#                         <th class="noLeftBorder">Document number:</th>
#                         <td class="noRightBorder">4</td>
#                     </tr>
#                     <tr>
#                         <th class="noLeftBorder">Document type:</th>
#                         <td class="noRightBorder">
#                             
#                                 
#                                 
#                                 Outgoing
#                                 
#                             
#                         </td>
#                     </tr>
#                     
#                     
#                         <tr>
#                             <th class="noLeftBorder">Recipient:</th>
#                             <td  class="noRightBorder">Den Norske Turistforening</td>
#                         </tr>
#                     
#                     <tr>
#                         <th class="noLeftBorder">Document date:</th>
#                         <td class="noRightBorder">2010-12-13</td>
#                     </tr>
#                     <tr>
#                         <th class="noLeftBorder">Record entry date:</th>
#                         <td class="noRightBorder">
#                             
#                                 
#                                 
#                                     2010-12-14
#                                 
#                             
#                         </td>
#                     </tr>
#                     <tr>
#                         <th class="noLeftBorder">Published in OEP</th>
#                         <td class="noRightBorder">2011-01-03</td>
#                     </tr>
#                     <tr>
#                         <th class="noLeftBorder" title="Hvis dokumentet er unntatt offentlighet kan unntaket gjelde hele eller deler av dokumentet."><span class="dottedBorderBottom">Grounds for exemption, document:</span></th>
#                         <td  class="noRightBorder">
#                             
#                         </td>
#                     </tr>
#                     <tr>
#                         <th class="noLeftBorder">Archive code:</th>
#                         <td  class="noRightBorder">
#                             
#                         </td>
#                     </tr>
#                     <tr>
#                         <th class="noLeftBorder">Contact point:</th>
#                         <td class="noRightBorder">
#                              <br />
#                             Tel.:&nbsp;22 24 90 90<br />
#                             Email:&nbsp;<a href="mailto:postmottak@kud.dep.no" title="Send email">postmottak@kud.dep.no</a>
#                         </td>
#                     </tr>
#                 </table>

def cpu_spent():
    # CPU seconds (user + system) consumed by this process so far.
    usage = resource.getrusage(resource.RUSAGE_SELF)
    return usage.ru_utime + usage.ru_stime

def cpu_available():
    available = resource.getrlimit(resource.RLIMIT_CPU)[0]
    # If no limit is set, assume 20 CPU seconds as the limit to avoid
    # running for more than a few minutes every time.
    if 0 > available:
        available = 20
    return available

def url_from_id(id):
    return "https://www.oep.no/search/resultSingle.html?journalPostId=" + str(id)

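# Write a batch of journal entries to the sqlite datastore, retrying a
# few times since writes occasionally fail while the database grows.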
def save(data):
    problem = False
    for run in range(0,writetries):
        try:
            scraperwiki.sqlite.save(unique_keys=['journalPostId'], data=data)
            if problem:
                print "Sqlite write succeeded"
            return
        except scraperwiki.sqlite.SqliteError, e:
            print "Sqlite write error, trying again: " + str(e)
            time.sleep(22)
            problem = True
    raise scraperwiki.sqlite.SqliteError("Unable to write to database, tried " + str(writetries) + " times")

def save_var(var, data):
    problem = False
    for run in range(0,writetries):
        try:
            scraperwiki.sqlite.save_var(var, data)
            if problem:
                print "Sqlite write succeeded"
            return
        except scraperwiki.sqlite.SqliteError, e:
            print "Sqlite write error, trying again: " + str(e)
            time.sleep(22)
            problem = True
    raise scraperwiki.sqlite.SqliteError("Unable to write variable " + var + " to database, tried " + str(writetries) + " times")

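# Map the English field labels on the OEP detail page to the column
# names used in the sqlite table.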
fieldmap = {
    'Agency'            : 'agency',
    'Record entry date' : 'recorddate',
    'Case'              : 'casedesc',
    'Case number'       : 'caseid',
    'Document number'   : 'casedocseq',
    'Document date'     : 'docdate',
    'Document title'    : 'docdesc',
    'Document type'     : 'doctype',
    'Grounds for exemption document' : 'exemption',
    'Recipient'         : 'recipient',
    'Sender'            : 'sender',
    'Published in OEP'  : 'recordpublishdate',
#    'Archive code',
#    'Contact point',
#    'journalPostId',
#    'scrapestamputc',
}

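# Map the document type labels used by OEP to one-letter codes,
# apparently following the I/U/X letters used by Norwegian journal
# systems (the Noark convention).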
doctypemap = {
    'Incoming' : 'I',
    'Outgoing' : 'U',
    'internal' : 'X',
}

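# Fetch and parse a single journal entry.  Append the parsed fields to
# datastorage and return 0 on success; return -1 if the entry is
# missing or lacks mandatory fields.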
def fetch_oep_entry(id, datastorage):
    oepurl = url_from_id(id)
#    print "Fetching %s" % oepurl
    try:
        html = scraperwiki.scrape(oepurl)
    except urllib2.HTTPError, e:
        return -1
    root = lxml.html.fromstring(html.decode('utf-8'))
    data = { 'journalPostId' : id }
    for tr in root.cssselect("table.defaultTable tr"):
        vtype = tr.cssselect("th")[0].text_content().strip().replace(":", "").replace(",", "")
        value = tr.cssselect("td")[0].text_content().strip()
        #print '"' + vtype + '"', '"'+value+'"'
        if (vtype == 'Record entry date' and value == 'Not stated.') or \
            (vtype == 'Document type' and value == '-') or \
            (vtype == 'Case number' and value == ''):
            return -1
        if vtype in fieldmap:
            vtype = fieldmap[vtype]
        if 'doctype' == vtype:
            value = doctypemap[value]
        if 'exemption' == vtype and '-' == value:
            value = None
        if 'caseid' == vtype:
            caseyear, caseseqnr = value.split("/")
            data['caseyear'] = int(caseyear)
            data['caseseqnr'] = int(caseseqnr)
        data[vtype] = value
#    print str(id) + ": " + str(data)
    data['scrapestamputc'] = datetime.datetime.now()
#    print data['scrapestamputc']
#    exit ()

    datastorage.append(data)
#    scraperwiki.sqlite.save(unique_keys=['journalPostId'], data=data)
    return 0

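# Walk the ID range from first to last in the given direction (step 1
# or -1).  A run of missing IDs normally means the end of the journal,
# but there are holes, so after skiplimit consecutive misses we probe
# further ahead before giving up.  Return the number of entries fetched.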
def fetch_range(datastorage, first, last, step):
    global readtries
    myskiplimit = skiplimit
    skipped = 0
    fetched = 0
    min_id = first
    for id in range(first, last, step):
        if id < 0:
            break
        try:
            tries = readtries
            while 0 < tries:
                tries = tries - 1
                try:
                    if -1 == fetch_oep_entry(id, datastorage):
                        skipped = skipped + 1
                        if skipped == myskiplimit and myskiplimit == skiplimit:
                            tmp = []
                            for limit in [250, 500, 800, 1000, 1200, 1500, 1700, 2000, 3000, 5000, 7000]:
                                testid = id + limit * step
                                if -1 != fetch_oep_entry(testid, tmp):
                                    print "Looking "+str(limit)+" ahead, found " + url_from_id(testid)
                                    myskiplimit = skiplimit + limit + 1
                                    break
                        break
                    else:
                        fetched = fetched + 1
                        skipped = 0
                        myskiplimit = skiplimit
                        break
                except urllib2.HTTPError, e: # HTTPError lacks .reason due to a bug
                    print "HTTPError triggered for url " + url_from_id(id) + ", trying again: " + str(e.msg)
                except urllib2.URLError, e:
                    print "URLError triggered for url " + url_from_id(id) + ", trying again: " + str(e.reason)
                except httplib.BadStatusLine, e:
                    # BadStatusLine has neither .msg nor .reason; str(e) gives the offending line
                    print "BadStatusLine triggered for url " + url_from_id(id) + ", trying again: " + str(e)
            else:
                # The while loop ended without a break: every attempt failed.
                raise urllib2.URLError("unable to talk to service, giving up.")

            if skipped >= myskiplimit:
                print "Reached end of list, exiting at " + str(id)
                break
            if 50 <= len(datastorage):
                save(data=datastorage)
                # Clear in place so the caller's list does not keep already-saved rows.
                del datastorage[:]

            # Only do this for every 50th ID tested, to avoid spending too many CPU seconds updating the sqlite file
            if 0 == (id % 50):
                if id < min_id:
                    min_id = id
#                    print "Updating min_id to " + str(min_id)
                    save_var('min_tested_id', min_id)
                if cpu_spent() > cpu_available():
                    print "Running short on CPU time, exiting at " + str(id)
                    break
            time.sleep(0.2)
        except scraperwiki.CPUTimeExceededError:
            if 0 < len(datastorage):
                save(data=datastorage)
                del datastorage[:]
            print "CPU exception caught"
            raise
        except:
            print "Error, unexpected exception"
            raise
    if 0 < len(datastorage):
        save(data=datastorage)
        del datastorage[:]
    return fetched

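# One-off maintenance routine used to rename the original column names
# to the short names in fieldmap (invoked manually, see the
# commented-out calls below).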
def rename_sql_columns():
    print "Dropping temp table"
    scraperwiki.sqlite.execute("DROP TABLE IF EXISTS swdatanew")
    print "Creating table"
    scraperwiki.sqlite.execute("CREATE TABLE IF NOT EXISTS swdatanew (agency text, recorddate text, casedesc text, caseid text, casedocseq integer, docdate text, docdesc text, doctype text, exemption text, recipient text, sender text, recordpublishdate text, `Archive code` text, `Contact point` text, `journalPostId` integer, scrapestamputc text)")
    print "Copying table"
    scraperwiki.sqlite.execute("INSERT INTO swdatanew(agency, recorddate, casedesc, caseid, casedocseq, docdate, docdesc, doctype, exemption, recipient, sender, recordpublishdate, `Archive code`, `Contact point`, `journalPostId`, scrapestamputc) SELECT `Agency`, `Record entry date`, `Case`, `Case number`, `Document number`, `Document date`, `Document title`, `Document type`, `Grounds for exemption document`, `Recipient`, `Sender`, `Published in OEP`, `Archive code`, `Contact point`, `journalPostId`, `scrapestamputc` FROM swdata")

    scraperwiki.sqlite.execute("ALTER TABLE swdata RENAME TO swdataold")
    scraperwiki.sqlite.execute("ALTER TABLE swdatanew RENAME TO swdata")
    scraperwiki.sqlite.commit()
    exit(0)

def create_indexes():
    for field in ['doctype', 'agency', 'recorddate', 'caseid']:
        print "Creating %s index" % field
        scraperwiki.sqlite.execute("CREATE INDEX IF NOT EXISTS swdata_%s_index ON swdata (%s)" % (field, field))
        scraperwiki.sqlite.commit()

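# One-off maintenance routine converting the stored doctype strings to
# the one-letter codes in doctypemap, one agency at a time.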
def update_doctypes():
    print "Updating doctype"
    agencies = []
    for agencyref in scraperwiki.sqlite.select("distinct agency from swdata"):
        agencies.append(agencyref['agency'])

    # Updating individual agencies to try to avoid SQL timeout
    for agency in agencies:
        print "Updating doctype for " + agency
        scraperwiki.sqlite.execute("UPDATE swdata set doctype = 'I' where agency = ? and doctype = 'Incoming'", (agency,))
        scraperwiki.sqlite.execute("UPDATE swdata set doctype = 'U' where agency = ? and doctype = 'Outgoing'", (agency,))
        scraperwiki.sqlite.execute("UPDATE swdata set doctype = 'X' where agency = ? and doctype = 'internal'", (agency,))
        scraperwiki.sqlite.commit()
    exit(0)

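# One-off maintenance routine back-filling the caseyear and caseseqnr
# columns from caseid (format year/seqnr) for rows scraped before the
# split was introduced.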
def update_caseyear():
    print "Updating caseyear and caseseqnr"
    agencies = []
    for agencyref in scraperwiki.sqlite.select("distinct agency from swdata WHERE caseyear is NULL"):
        agencies.append(agencyref['agency'])

    # Updating individual agencies to try to avoid SQL timeout
    for agency in agencies:
        print "Updating caseyear for " + agency
        res = scraperwiki.sqlite.execute("select journalPostId, substr(caseid, 1, 4), substr(caseid, 6) from swdata where agency = ? and caseyear is NULL limit 2", (agency,))
        print res
        scraperwiki.sqlite.execute("UPDATE swdata set caseyear = substr(caseid, 1, 4), caseseqnr = substr(caseid, 6) where agency = ? AND caseyear is NULL", (agency,))
        scraperwiki.sqlite.commit()
    exit(0)

def remove_original():
    scraperwiki.sqlite.execute("DROP TABLE IF EXISTS swdataold")
    scraperwiki.sqlite.commit()
    exit(0)

# Fetch again some bogus entries that ended up in the database while the
# script was slightly broken and recorded non-existing entries in the
# SQL database.
def reparse_strange_entries(datastorage):
    try:
        strange = "journalPostId FROM swdata WHERE caseid IS NULL OR scrapestamputc IS NULL OR agency IS NULL order by journalPostId"
        for idref in scraperwiki.sqlite.select(strange):
            id = idref['journalPostId']
            if -1 == fetch_oep_entry(id, datastorage):
                print "Refetching %d failed, flushing ID" % id
                scraperwiki.sqlite.execute("DELETE from swdata where journalPostId = %d" % id)
                if 0 < len(datastorage):
                    save(data=datastorage)
                    del datastorage[:]
            else:
                print "Refetching %d" % id
            if 50 <= len(datastorage):
                save(data=datastorage)
                del datastorage[:]
            time.sleep(0.2)
        if 0 < len(datastorage):
            save(data=datastorage)
            del datastorage[:]
    except scraperwiki.sqlite.SqliteError, e:
        # Most likely no table, keep going
        pass
#update_caseyear()

#create_indexes()

#rename_sql_columns()
#remove_original()

# This one gives me SQL timeouts
#update_doctypes()

print "Starting to fetch journal entries " + str(datetime.datetime.now())
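# Warm-up fetch of the front page, presumably to confirm the service is
# reachable before the ID scan starts.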
scraperwiki.scrape("http://www.oep.no/")
datastorage = []

reparse_strange_entries(datastorage)

# Update entries to handle <URL: https://rt.nuug.no:443/Ticket/Display.html?id=6342 >.
# Used 2012-09-17
#scraperwiki.sqlite.execute("DELETE from swdata where journalPostId = 638167")
#fetch_oep_entry(638167, datastorage)
#scraperwiki.sqlite.execute("DELETE from swdata where journalPostId = 638104")
#fetch_oep_entry(638104, datastorage)
#scraperwiki.sqlite.commit()
# Missing entry, should return -1
#print fetch_oep_entry(16629772, datastorage)
# Exists, should return 0
#print fetch_oep_entry(16629773, datastorage)

count = 10000
skiplimit = 500
# Arbitrary value fairly close to the most recent ID when this project started 2016-04-07
max = min = startid = 16682410
try:
    max = scraperwiki.sqlite.select("max(journalPostId) as max from swdata")[0]["max"]
    saved_min = scraperwiki.sqlite.get_var('min_tested_id')
    if saved_min is None or saved_min <= 0:
        saved_min = min + 1
    sql_min = scraperwiki.sqlite.select("min(journalPostId) as min from swdata")[0]["min"]
    print "Saved min: " + str(saved_min) + ", sql min: " + str(sql_min)
    if sql_min < saved_min:
        min = sql_min
    else:
        min = saved_min
    print "Scraping " + str(count) + " IDs below " + str(min) + " and above " + str(max)
except scraperwiki.sqlite.SqliteError:
    pass

fetched = 0
fetched = fetched + fetch_range(datastorage, max + 1, max + count, 1)
print "Fetched " + str(fetched) + " new journal entries, cpu spent: " + str(cpu_spent())
if min >= 0 and read_backwards:
    fetched = fetch_range(datastorage, min, min - count, -1)
    print "Fetched " + str(fetched) + " old journal entries, cpu spent: " + str(cpu_spent())

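# Rescan previously visited IDs in batches of rescan_count, staying
# rescan_latency IDs behind the newest known entry so late-published
# documents have had time to show up.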
if rescan_after_a_while:
    rescan_count = 8000
    rescan_latency = 100000

    # Rescan to see if we missed something, and to get the latest version
    rescan_min = scraperwiki.sqlite.get_var('min_rescan_id')
    if rescan_min is None:
        rescan_min = 0
    if rescan_min + rescan_count < max - rescan_latency:
        end = rescan_min + rescan_count
        fetched = fetch_range(datastorage, rescan_min, end, 1)
        save_var('min_rescan_id', end - 1)
        print "Fetched %d rescanned journal entries (%d-%d), cpu spent: %f" \
            % (fetched, rescan_min, end, cpu_spent())