# -*- coding: utf-8 -*-
# YAML-tagger:
# Type: kommune
# Status: unfinished
# Name: Lindesnes kommune
# Format: HTML
# Datatype:
# Vendor:
# Run: not finished
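#
# Scraper for the public post journal (postjournal) of Lindesnes
# kommune, published using the ePhorte innsyn web interface.  Journal
# entries are fetched one at a time by ePhorte ID and stored in the
# ScraperWiki sqlite datastore with 'queryid' as the unique key.
#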
import scraperwiki
import lxml.html
import datetime
import dateutil.parser
import urllib2
# http://innsyn.lindesnes.kommune.no/Publikum/Modules/innsyn.aspx?mode=pl&SelPanel=0&ObjectType=ePhorteRegistryEntry&VariantType=Innsyn&ViewType=Table&Query=RecordDate%3a%28-14%29+AND+ResponsibleUnitID%3a%2811%29+AND+DocumentType%3a%28I%2cU%29
def fetch_url(url):
    # Fetch a URL, retrying up to three times, as the innsyn server
    # occasionally fails to respond.  Return None if all attempts fail.
    html = None
    for n in [1, 2, 3]:
        try:
            html = scraperwiki.scrape(url)
            break
        except urllib2.URLError:
            print "URLError fetching " + url + ", trying again"
    return html

def make_url(id):
    url = "http://innsyn.lindesnes.kommune.no/Publikum/Modules/innsyn.aspx?mode=pl&SelPanel=0&ObjectType=ePhorteRegistryEntry&VariantType=Innsyn&ViewType=Detail&Query=ID:" + str(id)
    return url

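# For example, make_url(71836) returns the detail page URL ending in
# "Query=ID:71836" for the journal entry with ePhorte ID 71836.
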
# Map from the Norwegian document type labels used on the web pages to
# the one-letter NOARK document type codes.
doctypemap = {
    u'Inngående brev' : 'I',
    u'Utgående brev' : 'U',
    u'Internt notat' : 'N',
    u'Internt notat uten oppfølging' : 'X',
    u'Saksframlegg/innstilling' : 'S',
    u'Dokumentpost i saksmappe' : 'Y', # Code not in NOARK, value based on http://img6.custompublish.com/getfile.php/1168825.136.pqftpqctyt/Ephorte-brukerveiledning_2.1.15.pdf?return=www.kafjord.kommune.no
}

def fetch_postjournal(agency, id, url, datastore):
#    print "Scraping " + url
    scrapestamputc = datetime.datetime.now()
    html = fetch_url(url)
    root = lxml.html.fromstring(html.decode('utf-8'))
    entry = {
        'agency' : agency,
        'scrapestamputc' : scrapestamputc,
        'scrapedurl' : url,
        'queryid' : id,
    }
    for span in root.cssselect("div.robots-content span.Element"):
#        print span.text_content()
        field = None
        value = None
        if span.cssselect("h3"):
            field = span.cssselect("h3")[0].text_content().strip()
            value = span.cssselect("span.Content span")[0].text_content().strip()
        elif span.cssselect("h2"):
            field = span.cssselect("h2")[0].text_content().strip()
            # FIXME
            value = ""
        elif span.cssselect("h1"):
            field = "docdesc"
            value = span.cssselect("h1")[0].text_content().strip()
#        else:
#            raise ValueError("Unexpected span")
#        print field + " = " + value

        # Translate the Norwegian field names to the English names used
        # in the datastore, and parse document types and dates.
        if 'Type' == field:
            field = 'doctype'
            value = doctypemap[value]
        elif 'Journaldato' == field:
            field = 'recorddate'
            value = dateutil.parser.parse(value, dayfirst=True)
        elif 'Dokumentdato' == field:
            field = 'docdate'
            value = dateutil.parser.parse(value, dayfirst=True)
        elif u'Tilhører sak' == field:
            field = 'casedesc'
        elif 'Avsender/Mottaker' == field:
            # For outgoing and internal documents this field holds the
            # recipient, for incoming documents the sender.
            if 'doctype' in entry and entry['doctype'] in ['U', 'X', 'N']:
                field = 'recipient'
            else:
                field = 'sender'
            td = span.cssselect("table td")
            if td:
                name = td[0].text_content().strip()
                addr = td[1].text_content().strip()
                zip = td[2].text_content().strip()
#                print "N: '",name, "' '", addr, "' '", zip, "'"
                entry[field] = name
                entry[field + 'addr'] = addr
                entry[field + 'zip'] = zip
                field = ''
#        elif 'Saksbehandlende enhet' == field:
#        elif 'Saksbehandler' == field:
        if field is not None and '' != field:
            entry[field] = value
    print entry
    if 'doctype' in entry:
        datastore.append(entry)

agency = 'Lindesnes kommune'

def scrape_range(start, end, step, agency):
    datastore = []
    for id in range(start, end, step):
        fetch_postjournal(agency, id, make_url(id), datastore)
        # Save to the sqlite datastore in batches of 10 entries.
        if 0 < len(datastore) and 0 == (len(datastore) % 10):
            #print datastore
            scraperwiki.sqlite.save(unique_keys=['queryid'], data=datastore)
            datastore = []
    if 0 < len(datastore):
        scraperwiki.sqlite.save(unique_keys=['queryid'], data=datastore)

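# Extend the datastore in both directions: forward from the highest
# ID seen so far, and backward from the lowest.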
def scraper():
    try:
        min = scraperwiki.sqlite.select("min(queryid) as min from swdata")[0]["min"]
        max = scraperwiki.sqlite.select("max(queryid) as max from swdata")[0]["max"]
    except:
        # Random number around 2012-05-15 (ie recent when I wrote this scraper)
        min = max = 71836
    scrape_range(max, max + 200, 1, agency)
    scrape_range(min - 1, min - 3000, -1, agency)

if __name__ == "scraper":
    scraper()
else:
    print "Not called as scraper"