#!/usr/bin/python
# -*- coding: utf-8 -*-
import string
import sys
import time
import sqlalchemy
from elasticsearch import Elasticsearch
from elasticsearch import helpers
def populate_from_scraper_real(scraper):
    """Copy every row of a scraper's sqlite ``swdata`` table into Elasticsearch.

    Opens ``data/<scraper>.sqlite``, normalises each journal row and
    bulk-indexes it into a per-agency index (``index-<agency>``), writing
    progress dots to stdout.

    Returns the number of documents indexed (skipped rows are not counted).
    Fix: the original returned ``len(data) - skipped`` where ``data`` was
    never populated, i.e. always ``-skipped``.
    """
    sys.stdout.write(scraper + ": ")
    sys.stdout.flush()
    es = Elasticsearch()
    sql = "select * from swdata"
    filename = "sqlite:///data/%s.sqlite" % scraper
    engine = sqlalchemy.create_engine(filename, echo=False,
                                      connect_args={'timeout': 300})
    connection = engine.connect()
    skipped = 0
    indexed = 0  # renamed from `id`, which shadowed the builtin
    actions = []
    try:
        result = connection.execute(sql)
        for row in result:
            entry = dict(row.items())
            # Handle OEP scraper 2012-06-16: derive caseyear/caseseqnr
            # from caseid when either is missing.
            if entry.get('caseyear') is None or entry.get('caseseqnr') is None:
                # .get() avoids a KeyError when the caseid column itself
                # is absent (the original indexed it unconditionally).
                if entry.get('caseid') is None:
                    # Neither the split fields nor caseid: unusable entry.
                    skipped += 1
                    continue
                entry['caseyear'], entry['caseseqnr'] = \
                    entry['caseid'].split("/")
            entry['scraper'] = scraper
            # Some OEP entries lack scrapestamputc; tag present ones as UTC.
            if entry.get('scrapestamputc') is not None:
                entry['scrapestamputc'] = entry['scrapestamputc'] + '+0000'
            # Workaround for postliste-stortinget failing on some PDFs.
            if entry['doctype'] == u'Avs./mot:':
                continue
            # Numeric fields arrive as text; the _id format needs ints.
            for field in ['caseyear', 'caseseqnr', 'casedocseq']:
                if field in entry:
                    entry[field] = int(entry[field])
            indexname = 'index-' + entry['agency']
            indexname = indexname.replace(' ', '_').replace(',', '').lower()
            actions.append({
                "_type": "publicjournal",
                "_index": indexname,
                "_id": "%d/%d-%d" % (entry['caseyear'],
                                     entry['caseseqnr'],
                                     entry['casedocseq']),
                "_source": entry,
            })
            indexed += 1
            # Flush in batches of 10000.  The original
            # `while len > 10000 and len % 10000 == 0 ... break` was an
            # obfuscated `if` whose strict `>` delayed the first flush
            # until 20000 actions.
            if len(actions) >= 10000:
                sys.stdout.write(".")
                sys.stdout.flush()
                helpers.bulk(es, actions)
                del actions[:]
        # Flush whatever is left over after the last full batch.
        if actions:
            helpers.bulk(es, actions)
            del actions[:]
    finally:
        # Close even if bulk indexing raises (original leaked on error).
        connection.close()
    sys.stdout.write("done\n")
    return indexed
def populate_from_scraper(scraper):
    """Populate Elasticsearch from one scraper, retrying once.

    If the first attempt yields None, wait ten seconds and try a second
    time; the second result is returned as-is.
    """
    first_attempt = populate_from_scraper_real(scraper)
    if first_attempt is not None:
        return first_attempt
    time.sleep(10)
    return populate_from_scraper_real(scraper)
def main():
    """Index each configured scraper's journal data into Elasticsearch."""
    scrapers = [
        'postliste-oep',
    ]
    for scraper in scrapers:
        # Blank line separating each scraper's progress output.
        # sys.stdout.write works on both Python 2 and 3; the original bare
        # `print` statement is Python-2-only syntax.
        sys.stdout.write("\n")
        populate_from_scraper(scraper)
main()