scrapersources/postliste-oep-deliverydates


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45

# -*- coding: utf-8 -*-
# YAML-tagger:
#  Type: statlig
#  Status: finished
#  Name: Offentlig Elektronisk postjournal delivery dates
#  Format: HTML
#  Datatype:
#  Vendor: DIFI
#  Run: hourly

import scraperwiki
import lxml.html
import datetime
import resource
import dateutil.parser
import resource

def fetch_oep_deliverydates(url, datastorage):
    """Scrape the OEP delivery-date report at *url* into *datastorage*.

    Each row of ``table.defaulttable`` that has exactly three ``<td>``
    cells produces one dict, which is appended to *datastorage*, with the
    keys:

    * ``scrapedurl``     -- the *url* argument
    * ``agency``         -- stripped text of the first cell
    * ``deliverydate``   -- second cell parsed by dateutil (day first)
    * ``scrapestamputc`` -- UTC time at which the page was processed

    Rows whose second cell is empty or is the literal "Levert" are
    skipped, since neither is a date.

    Args:
        url: address of the report page to download.
        datastorage: list that receives the row dicts (mutated in place).

    Returns:
        Always 0.

    Raises:
        Any exception raised by ``scraperwiki.scrape`` or by
        ``dateutil.parser.parse`` for cell text it cannot interpret is
        propagated, so a change in page format is noticed rather than
        silently ignored.
    """
    html = scraperwiki.scrape(url)
    root = lxml.html.fromstring(html.decode('utf-8'))
    # The key is named "...utc", so record UTC rather than local time.
    # One stamp per call: every row comes from the same download.
    scrapestamp = datetime.datetime.utcnow()
    for tr in root.cssselect("table.defaulttable tr"):
        # Query the cells once per row instead of once per use.
        cells = tr.cssselect("td")
        if len(cells) != 3:
            continue
        agency = cells[0].text_content().strip()
        deliverydate = cells[1].text_content().strip()
        # "Levert" is presumably a heading/status text rather than a
        # date (TODO confirm against the live page); an empty cell has
        # no date to parse at all.
        if not deliverydate or deliverydate == "Levert":
            continue
        datastorage.append({
            'scrapedurl': url,
            'agency': agency,
            'deliverydate': dateutil.parser.parse(deliverydate,
                                                  dayfirst=True),
            'scrapestamputc': scrapestamp,
        })
    return 0

# Announce the run together with the current local time.
print("Starting to fetch journal delivery dates %s" % datetime.datetime.now())

# Rows collected by the fetch call below; saved with a single call afterwards.
datastorage = []
# Earlier report address, kept for reference:
#   http://www.oep.no/pub/faces/statistikk.jsp?reposId=3
# The address used below replaced it (original note: "New url before 2012-11-09").
fetch_oep_deliverydates("http://www.oep.no/pub/report.xhtml?reportId=3", datastorage)
# (agency, deliverydate) is passed as the unique key for the saved rows.
scraperwiki.sqlite.save(unique_keys=['agency', 'deliverydate'], data=datastorage)