blob: a2f9697ccfd80247193a013296e55d555f4654b9 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
|
# -*- coding: utf-8 -*-
# YAML-tagger:
# Type: statlig
# Status: finished
# Name: Offentlig Elektronisk postjournal delivery dates
# Format: HTML
# Datatype:
# Vendor: DIFI
# Run: hourly
import scraperwiki
import lxml.html
import datetime
import resource
import dateutil.parser
import resource
def fetch_oep_deliverydates(url, datastorage):
    """Scrape the OEP (Offentlig Elektronisk Postjournal) delivery-date
    report at `url` and append one record per pending agency delivery to
    `datastorage` (a list, mutated in place).

    Each record is a dict with keys: scrapedurl, agency, deliverydate
    (parsed datetime, day-first format), scrapestamputc (scrape time).
    Rows whose date column reads "Levert" (Norwegian: "delivered") are
    skipped, as they carry no pending delivery date.

    Returns 0 (historical convention of this scraper; callers ignore it).
    """
    html = scraperwiki.scrape(url)
    root = lxml.html.fromstring(html.decode('utf-8'))
    for tr in root.cssselect("table.defaulttable tr"):
        # Data rows have exactly 3 <td> cells; header/spacer rows do not.
        cells = tr.cssselect("td")
        if len(cells) != 3:
            continue
        agency = cells[0].text_content().strip()
        deliverydate = cells[1].text_content().strip()
        if deliverydate == "Levert":
            # Already delivered - nothing pending to record.
            continue
        data = {
            'scrapedurl': url,
            'agency': agency,
            # Dates are Norwegian style (day first), e.g. "09.11.2012".
            'deliverydate': dateutil.parser.parse(deliverydate, dayfirst=True),
            'scrapestamputc': datetime.datetime.now(),
        }
        datastorage.append(data)
    return 0
print "Starting to fetch journal delivery dates " + str(datetime.datetime.now())
datastorage = []
#fetch_oep_deliverydates("http://www.oep.no/pub/faces/statistikk.jsp?reposId=3", datastorage)
# New url before 2012-11-09
fetch_oep_deliverydates("http://www.oep.no/pub/report.xhtml?reportId=3", datastorage)
#print datastorage
scraperwiki.sqlite.save(unique_keys=['agency', 'deliverydate'], data=datastorage)
|