#!/usr/bin/python
# -*- coding: utf-8 -*-
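#
# Move public journal ("postjournal") records from per-scraper sqlite
# files (data/<scraper>.sqlite) into Elasticsearch, one index per agency.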

import sys
import time
import sqlalchemy
from elasticsearch import Elasticsearch
from elasticsearch import helpers

def populate_from_scraper_real(scraper):
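    """Index all rows of data/<scraper>.sqlite into Elasticsearch.

    Returns the number of entries indexed, or None if the database
    could not be opened (so the caller can retry).
    """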
    sys.stdout.write(scraper + ": ")
    sys.stdout.flush()

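    # Connect to Elasticsearch on the client default, localhost:9200.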
    es = Elasticsearch()

    sql = "select * from swdata"
    filename = "sqlite:///data/%s.sqlite" % scraper
    create = sqlalchemy.create_engine
#        print "opening %s" % filename
    engine = create(filename, echo=False, connect_args={'timeout': 300})
    connection = engine.connect()
    result = connection.execute(sql)
    
    skipped = 0
    indexed = 0
    actions = []
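    # Convert each row into a bulk-index action; flush in batches below.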
    for row in result:
        entry = dict(row.items())

        # Handle OEP scraper 2012-06-16: some entries lack caseyear
        # and caseseqnr, so derive them from caseid when possible.
        if entry.get('caseyear') is None or entry.get('caseseqnr') is None:
            if entry.get('caseid') is None:
                # Strange entry without any case reference, skip it.
                skipped += 1
                continue
            entry['caseyear'], entry['caseseqnr'] = entry['caseid'].split("/")

        entry['scraper'] = scraper

        # Mark the naive timestamp as UTC.  Some OEP entries are
        # missing the scrapestamputc field entirely.
        if entry.get('scrapestamputc') is not None:
            entry['scrapestamputc'] = entry['scrapestamputc'] + '+0000'

        # Workaround for postliste-stortinget failing on some PDFs
        if entry.get('doctype') == u'Avs./mot:':
            continue

        # Clean up numbers, guarding against missing values that
        # would make int() throw.
        for field in ['caseyear', 'caseseqnr', 'casedocseq']:
            if entry.get(field) is not None:
                entry[field] = int(entry[field])

        # One Elasticsearch index per agency, normalised to a safe name.
        indexname = 'index-' + entry['agency']
        indexname = indexname.replace(' ', '_').replace(',', '').lower()
        action = {
            "_type": "publicjournal",
            "_index": indexname,
            "_id": "%d/%d-%d" % (entry['caseyear'],
                                 entry['caseseqnr'],
                                 entry['casedocseq']),
            "_source": entry,
            }
        actions.append(action)
        indexed += 1

        # Flush to Elasticsearch in batches to bound memory use.
        if len(actions) >= 10000:
            sys.stdout.write(".")
            sys.stdout.flush()
            helpers.bulk(es, actions)
            del actions[:]
    connection.close()

    if actions:
        helpers.bulk(es, actions)

    print "done"
    return len(data) - skipped

def populate_from_scraper(scraper):
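    """Populate from one scraper, retrying once if the database was busy."""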
    ret = populate_from_scraper_real(scraper)
    if ret is None:
        time.sleep(10)
        ret = populate_from_scraper_real(scraper)
    return ret

def main():
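    # Scrapers to import; each must have a data/<name>.sqlite file.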
    scrapers = [
        'postliste-oep',
    ]

    for scraper in scrapers:
        print
        populate_from_scraper(scraper)

if __name__ == '__main__':
    main()