-rw-r--r--  scrapersources/postliste-difi                       |   8
-rw-r--r--  scrapersources/postliste-hoegskolen-i-gjoevik       |  14
-rw-r--r--  scrapersources/postliste-hoegskolen-i-lillehammer   |   4
-rw-r--r--  scrapersources/postliste-hoegskolen-i-volda         |   3
-rw-r--r--  scrapersources/postliste-lenvik                     | 173
-rw-r--r--  scrapersources/postliste-met                        |   4
-rw-r--r--  scrapersources/postliste-naroy                      |   7
-rw-r--r--  scrapersources/postliste-oep                        |  34
-rw-r--r--  scrapersources/postliste-oep-deliverydates          |   4
-rw-r--r--  scrapersources/postliste-ruter                      |   1
10 files changed, 232 insertions(+), 20 deletions(-)
diff --git a/scrapersources/postliste-difi b/scrapersources/postliste-difi
index dfc986f..459327b 100644
--- a/scrapersources/postliste-difi
+++ b/scrapersources/postliste-difi
@@ -54,17 +54,17 @@ def process_journal_pdfs(parser, listurl, errors):
html = scraperwiki.scrape(listurl)
root = lxml.html.fromstring(html)
html = None
- for ahref in root.cssselect("div.body a"):
+ for ahref in root.cssselect("div.sixcol a"):
href = ahref.attrib['href']
url = urlparse.urljoin(listurl, href)
if -1 != href.find("file://") or -1 == url.find(".pdf"):
-# print "Skipping non-http URL " + url
+ print "Skipping non-http URL " + url
continue
if parser.is_already_scraped(url):
True
-# print "Skipping already scraped " + url
+ print "Skipping already scraped " + url
else:
-# print "Will process " + url
+ #print "Will process " + url
process_pdf(parser, url, errors)
def test_small_pdfs(parser):
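
This hunk swaps the link selector from div.body to div.sixcol and re-enables the skip messages. The surrounding loop is the same link-harvesting pattern most of these scrapers share; a minimal sketch of it follows, in the scrapers' own Python 2 / ScraperWiki style. The selector string and the process_pdf callback are placeholders for whatever the individual scraper defines.

    # Sketch of the shared PDF-link discovery loop (Python 2 / ScraperWiki).
    # "selector" and "process_pdf" are placeholders for scraper-specific values.
    import urlparse
    import lxml.html
    import scraperwiki

    def find_journal_pdfs(parser, listurl, selector, process_pdf, errors):
        html = scraperwiki.scrape(listurl)
        root = lxml.html.fromstring(html)
        for ahref in root.cssselect(selector):            # e.g. "div.sixcol a"
            href = ahref.attrib['href']
            url = urlparse.urljoin(listurl, href)
            if -1 != href.find("file://") or -1 == url.find(".pdf"):
                print "Skipping non-http URL " + url      # not an http PDF link
                continue
            if parser.is_already_scraped(url):
                print "Skipping already scraped " + url
            else:
                process_pdf(parser, url, errors)
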
diff --git a/scrapersources/postliste-hoegskolen-i-gjoevik b/scrapersources/postliste-hoegskolen-i-gjoevik
index d4f7931..cdf007e 100644
--- a/scrapersources/postliste-hoegskolen-i-gjoevik
+++ b/scrapersources/postliste-hoegskolen-i-gjoevik
@@ -60,13 +60,15 @@ def process_journal_pdfs(parser, listurl, errors):
html = scraperwiki.scrape(listurl)
root = lxml.html.fromstring(html)
html = None
- for ahref in root.cssselect("div.spalte-inner a"):
+ for ahref in root.cssselect("section a"):
href = ahref.attrib['href']
url = urlparse.urljoin(listurl, href).replace(" ", "+")
+ #print url
if -1 != href.find("file://") or -1 == url.find(".pdf"):
# print "Skipping non-http URL " + url
continue
if parser.is_already_scraped(url):
+ #print "Scraped: %s" % url
True
# print "Skipping already scraped " + url
else:
@@ -98,6 +100,16 @@ endYear=datetime.datetime.now().year
for year in range(startYear, endYear+1): # range goes from startyear to endYear-1
process_journal_pdfs(parser, "http://www.hig.no/om_hig/offentleg_journal/%d" % year, errors)
+
process_page_queue(parser, errors)
report_errors(errors)
+warningQuery = "recorddate as lastupdate from 'swdata' order by recorddate DESC limit 1";
+result = scraperwiki.sqlite.select(warningQuery)
+now=datetime.datetime.today()
+then=datetime.datetime.strptime(result[0]['lastupdate'],"20%y-%m-%dT%H:%M:%S")
+
+if (now-then).days > 14:
+ print "warning"
+ warningURL = "http://hild1.no/~hildenae/files/dynamic/run.php?scraper=postliste-hoegskolen-i-gjoevik&reason=7days";
+ scraperwiki.scrape(warningURL)
\ No newline at end of file
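
Beyond the selector change, this diff adds a freshness check at the end of the run: it reads the newest recorddate from swdata and pings an external notification URL when the scraper has recorded nothing for more than 14 days. A sketch of the same check is below, assuming ScraperWiki's convention that sqlite.select() takes the query without the leading SELECT keyword; the notification URL is a placeholder, not the one used in the diff.

    # Sketch of the staleness alert added above (Python 2 / ScraperWiki style).
    # scraperwiki.sqlite.select() prepends "select" to the query itself;
    # the notification URL below is a placeholder.
    import datetime
    import scraperwiki

    result = scraperwiki.sqlite.select(
        "recorddate as lastupdate from swdata order by recorddate desc limit 1")
    now = datetime.datetime.today()
    then = datetime.datetime.strptime(result[0]['lastupdate'], "20%y-%m-%dT%H:%M:%S")

    if (now - then).days > 14:
        print "warning: no journal entries recorded for more than 14 days"
        scraperwiki.scrape("http://example.org/notify?scraper=postliste-hoegskolen-i-gjoevik")
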
diff --git a/scrapersources/postliste-hoegskolen-i-lillehammer b/scrapersources/postliste-hoegskolen-i-lillehammer
index 5337521..5687ece 100644
--- a/scrapersources/postliste-hoegskolen-i-lillehammer
+++ b/scrapersources/postliste-hoegskolen-i-lillehammer
@@ -64,9 +64,9 @@ def process_journal_pdfs(parser, listurl, errors):
continue
if parser.is_already_scraped(url):
True
-# print "Skipping already scraped " + url
+ print "Skipping already scraped " + url
else:
-# print "Will process " + url
+ print "Will process " + url
process_pdf(parser, url, errors)
def test_small_pdfs(parser):
diff --git a/scrapersources/postliste-hoegskolen-i-volda b/scrapersources/postliste-hoegskolen-i-volda
index 0106cb7..d8f3686 100644
--- a/scrapersources/postliste-hoegskolen-i-volda
+++ b/scrapersources/postliste-hoegskolen-i-volda
@@ -53,11 +53,12 @@ def process_journal_pdfs(parser, listurl, errors):
html = scraperwiki.scrape(listurl)
root = lxml.html.fromstring(html)
html = None
- for ahref in root.cssselect("div.inside a"):
+ for ahref in root.cssselect("div#maincontent a"):
if 'id' not in ahref.attrib or -1 == ahref.attrib['id'].find("archiveimage_"):
continue
href = ahref.attrib['href']
url = urlparse.urljoin(listurl, href)
+ #print "found url %s" %url
if -1 != href.find("file://"):
# print "Skipping non-http URL " + url
continue
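
The Volda hunk scans div#maincontent and keeps only anchors whose id attribute contains "archiveimage_"; anchors without an id are skipped first, since reading the attribute directly would raise a KeyError. A minimal standalone sketch of that guard, using a made-up HTML snippet rather than the real Volda page:

    # Sketch of the id-attribute filter used above (Python 2 / lxml).
    # The HTML is a made-up example, not the real page markup.
    import lxml.html

    html = ('<div id="maincontent">'
            '<a id="archiveimage_1" href="journal1.pdf">journal</a>'
            '<a href="other.html">other</a>'
            '</div>')
    root = lxml.html.fromstring(html)
    for ahref in root.cssselect("div#maincontent a"):
        if 'id' not in ahref.attrib or -1 == ahref.attrib['id'].find("archiveimage_"):
            continue
        print ahref.attrib['href']    # prints: journal1.pdf
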
diff --git a/scrapersources/postliste-lenvik b/scrapersources/postliste-lenvik
new file mode 100644
index 0000000..66a502d
--- /dev/null
+++ b/scrapersources/postliste-lenvik
@@ -0,0 +1,173 @@
+# -*- coding: utf-8 -*-
+
+import scraperwiki
+import urllib2
+import lxml.html
+import re
+import dateutil.parser
+from dateutil.relativedelta import relativedelta
+import datetime
+import urlparse
+
+agency = "Lenvik kommune"
+
+# Point scraperwiki GUI to the start page
+scraperwiki.scrape("http://webway.lenvik.kommune.no/postjournal")
+
+postlistelib=scraperwiki.swimport('postliste-python-lib')
+parser = postlistelib.JournalParser(agency=agency)
+
+def saver(unique_keys, data):
+# return
+ #print "Not saving data"
+ scraperwiki.sqlite.save(unique_keys, data)
+
+def expand_year(year):
+ year = int(year)
+ if year > 50:
+ year = year + 1900
+ else:
+ year = year + 2000
+ return year
+
+# <tr class=yang>
+# <td>13/00563-001</td>
+# <td>04.03.2013</td>
+# <td style="text-align:center;">
+# <div title="Inngående">I</div>
+# </td>
+# <td>Flytting av VPN-tunell </td>
+# <td>EVRY AS</td>
+# <td>Jan-Eirik Nordahl</td>
+# <td>
+# <a href="/dokumentbestilling?jpid=13003566" title="Klikk for å bestille innsyn">Bestill</a>
+# </td>
+# <td></td>
+#
+# </tr>
+#
+
+def fetch_postjournal_day(parser, url, html, saver):
+ root = lxml.html.fromstring(html.decode('utf-8'))
+
+ recorddate = None
+ for div in root.cssselect('div'):
+ divcontent = div.text_content()
+ if 0 == divcontent.find("Offentlig postjournal for "):
+ recorddate = dateutil.parser.parse(divcontent.replace("Offentlig postjournal for ",""), dayfirst=True)
+ print recorddate
+
+ # Make sure we save the entire URL or nothing at all
+ datastore = []
+ for tr in root.cssselect('tr.yang'):
+ tds = tr.cssselect("td")
+ docidstr = tds[0].text_content().strip()
+ docdate = tds[1].text_content().strip()
+ doctype = tds[2].text_content().strip()
+ docdesc = tds[3].text_content().strip()
+ fratil = tds[4].text_content().strip()
+ saksbehandler = tds[5].text_content().strip()
+ if -1 != tds[6].text_content().find("Bestill"):
+ exemption = None
+ else:
+ exemption = tds[6].text_content().strip()
+
+ docdate = dateutil.parser.parse(docdate, dayfirst=True)
+
+# print doctype, docdesc
+ if not parser.is_valid_doctype(doctype):
+ doctype = {
+ '' : '?',
+ }[doctype]
+ if parser.is_sender_doctype(doctype):
+ fratilfield = 'sender'
+ elif parser.is_recipient_doctype(doctype):
+ fratilfield = 'recipient'
+
+ caseyear, caseseqnr = docidstr.split("/")
+ caseyear = expand_year(caseyear)
+ caseseqnr, casedocseq = caseseqnr.split("-")
+ caseid = "%d/%d" % (int(caseyear), int(caseseqnr))
+
+ data = {
+ 'agency' : parser.agency,
+ 'recorddate' : recorddate.date(),
+ 'docdate' : docdate.date(),
+ 'docdesc' : docdesc,
+ 'casedesc' : docdesc, # FIXME fake value
+
+ 'caseyear' : int(caseyear),
+ 'caseseqnr' : int(caseseqnr),
+ 'casedocseq' : int(casedocseq),
+ 'caseid' : caseid,
+ 'doctype' : doctype,
+
+# 'journalseqnr' : int(journalseqnr),
+# 'journalyear' : int(journalyear),
+# 'journalid' : journalid,
+ fratilfield : fratil,
+
+ 'saksbehandler' : saksbehandler,
+# 'saksansvarlig' : saksansvarlig.strip(),
+# 'saksansvarligenhet' : saksansvarligenhet.strip(),
+
+ 'docidstr' : docidstr,
+# 'laapenr' : laapenr,
+ 'exemption' : exemption,
+
+ 'scrapedurl' : url,
+ 'scrapestamputc' : datetime.datetime.now()
+ }
+
+# print data
+ parser.verify_entry(data)
+ datastore.append(data)
+
+ seenurl = {}
+ # Find next URL. There are two on each page.
+ for ahref in root.cssselect('a.next_page'):
+ if 0 == ahref.text_content().find('Neste'):
+ nexturl = urlparse.urljoin(url, ahref.attrib['href'])
+ if nexturl not in seenurl:
+ seenurl[nexturl] = True;
+ print 'Fetching ' + nexturl
+ html = postlistelib.fetch_url_harder(nexturl)
+ mysaver = lambda unique_keys, data: datastore.extend(data)
+ fetch_postjournal_day(parser=parser, url=nexturl, html=html,
+ saver=mysaver)
+
+ saver(unique_keys=['docidstr'], data=datastore)
+
+def date2url(date):
+ return 'http://webway.lenvik.kommune.no/?date=%s' % date
+
+def gen_date_urls(urllist, startdate, step, count):
+ d = dateutil.parser.parse(startdate, dayfirst=False)
+ for n in xrange(1, step*(count+1), step):
+ next = (d + relativedelta(days=n)).strftime("%Y-%m-%d")
+ urllist.append(date2url(next))
+
+urllist = []
+today = datetime.date.today()
+try:
+ first = scraperwiki.sqlite.select("min(recorddate) as min from swdata")[0]['min']
+ last = scraperwiki.sqlite.select("max(recorddate) as max from swdata")[0]['max']
+except:
+ last = (today + relativedelta(days=-14)).strftime("%Y-%m-%d")
+ first = None
+
+print first, last
+
+# Parse back in time
+if first is not None:
+ gen_date_urls(urllist, first, -1, 100)
+
+# Parse forward in time
+if last is not None:
+ gen_date_urls(urllist, last, 1, 3)
+
+for dayurl in urllist:
+ print 'Fetching ' + dayurl
+ html = postlistelib.fetch_url_harder(dayurl)
+ fetch_postjournal_day(parser=parser, url=dayurl, html=html, saver=saver)
+
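The new Lenvik scraper derives its case identifiers from the document id string in the first table cell, for example "13/00563-001": the two-digit year is expanded, the middle part is the case sequence number, and the suffix is the document sequence number within the case. A short worked sketch of that derivation, reusing the expand_year() helper defined in the scraper:

    # Worked example of the docidstr parsing in the Lenvik scraper (Python 2).
    def expand_year(year):
        # Two-digit years above 50 are read as 19xx, the rest as 20xx.
        year = int(year)
        return year + 1900 if year > 50 else year + 2000

    docidstr = "13/00563-001"                       # sample value from the comment above
    caseyear, rest = docidstr.split("/")            # "13", "00563-001"
    caseyear = expand_year(caseyear)                # 2013
    caseseqnr, casedocseq = rest.split("-")         # "00563", "001"
    caseid = "%d/%d" % (caseyear, int(caseseqnr))   # "2013/563"
    print caseid, int(casedocseq)                   # prints: 2013/563 1
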
diff --git a/scrapersources/postliste-met b/scrapersources/postliste-met
index 02c53ca..d769e97 100644
--- a/scrapersources/postliste-met
+++ b/scrapersources/postliste-met
@@ -58,11 +58,11 @@ def process_journal_pdfs(parser, listurl, errors):
href = ahref.attrib['href']
url = urlparse.urljoin(listurl, href)
if -1 != href.find("file://") or -1 == url.find("=File.getFile;"):
-# print "Skipping non-http URL " + url
+ print "Skipping non-http URL " + url
continue
if parser.is_already_scraped(url):
True
-# print "Skipping already scraped " + url
+ print "Skipping already scraped " + url
else:
# print "Will process " + url
process_pdf(parser, url, errors)
diff --git a/scrapersources/postliste-naroy b/scrapersources/postliste-naroy
index b8fa33b..f47adb3 100644
--- a/scrapersources/postliste-naroy
+++ b/scrapersources/postliste-naroy
@@ -59,7 +59,12 @@ def process_journal_pdfs(parser, listurl, errors):
continue
# Special case, file indicating no journal entries this day
if "http://www.naroy.kommune.no/NK/Intern.nsf/FilA/CA6C83764E56DDCBC1257A02003F9025/$FILE/Postjournal+11.05.12.pdf" == url or \
- "http://www.naroy.kommune.no/NK/Intern.nsf/FilA/7FD82A18C1A1F137C12579F90029DEBD/$FILE/Postjournal+07.05.12.pdf" == url:
+ "http://www.naroy.kommune.no/NK/Intern.nsf/FilA/7FD82A18C1A1F137C12579F90029DEBD/$FILE/Postjournal+07.05.12.pdf" == url or \
+ "http://www.naroy.kommune.no/NK/Intern.nsf/FilA/777B497BB48936ACC1257A450033E1D4/$FILE/Postjournal+20.07.12.pdf" == url or \
+ "http://www.naroy.kommune.no/NK/Intern.nsf/FilA/1802A0FF57C08EFEC1257A4500337345/$FILE/Postjournal+16.07.12.pdf" == url or \
+ "http://www.naroy.kommune.no/NK/Intern.nsf/FilA/90373A38701C27E5C1257A45002F63FD/$FILE/Postjournal+12.07.12.pdf" == url or \
+ "http://www.naroy.kommune.no/NK/Intern.nsf/FilA/6B00A3BD92B3C2AEC1257A45002F4044/$FILE/Postjournal+10.07.12.pdf" == url or \
+ "http://www.naroy.kommune.no/NK/Intern.nsf/FilA/0141B5488D38B8FEC1257A44003756ED/$FILE/Postjournal+06.07.12.pdf" == url:
continue
if parser.is_already_scraped(url):
True
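
The Nærøy change extends a chain of string comparisons with five more "empty journal" PDFs. The scraper itself is unchanged beyond that, but the same behaviour could be expressed with a set lookup, which is easier to extend as more special cases turn up; a minimal sketch of that alternative is below (not what the scraper currently does, and the URLs are truncated placeholders):

    # Alternative to the chained comparisons above: keep the known "no journal
    # entries this day" PDFs in a set and test membership.
    # The URLs are truncated placeholders for readability.
    EMPTY_JOURNAL_PDFS = set([
        "http://www.naroy.kommune.no/NK/Intern.nsf/FilA/.../Postjournal+11.05.12.pdf",
        "http://www.naroy.kommune.no/NK/Intern.nsf/FilA/.../Postjournal+07.05.12.pdf",
    ])

    def is_empty_journal(url):
        return url in EMPTY_JOURNAL_PDFS

    print is_empty_journal(
        "http://www.naroy.kommune.no/NK/Intern.nsf/FilA/.../Postjournal+11.05.12.pdf")
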
diff --git a/scrapersources/postliste-oep b/scrapersources/postliste-oep
index 360ab91..bcfde1b 100644
--- a/scrapersources/postliste-oep
+++ b/scrapersources/postliste-oep
@@ -9,7 +9,7 @@ import httplib
import urllib2
# Try several times as the database get bigger
-writetries = 6
+writetries = 8
# http://www.oep.no/search/resultSingle.html?journalPostId=1000000
# http://www.oep.no/search/resultSingle.html?journalPostId=3889259
@@ -102,23 +102,31 @@ def url_from_id(id):
return "http://www.oep.no/search/resultSingle.html?journalPostId=" + str(id)
def save(data):
+ problem = False
for run in range(0,writetries):
try:
scraperwiki.sqlite.save(unique_keys=['journalPostId'], data=data)
+ if problem:
+ print "Sqlite write succeeded"
return
except scraperwiki.sqlite.SqliteError, e:
- print "Sqlite write error, trying again"
+ print "Sqlite write error, trying again: " + str(e)
time.sleep(22)
+ problem = True
raise scraperwiki.sqlite.SqliteError("Unable to write to database, tried " + str(writetries) + " times")
def save_var(var, data):
+ problem = False
for run in range(0,writetries):
try:
scraperwiki.sqlite.save_var(var, data)
+ if problem:
+ print "Sqlite write succeeded"
return
except scraperwiki.sqlite.SqliteError, e:
- print "Sqlite write error, trying again"
+ print "Sqlite write error, trying again: " + str(e)
time.sleep(22)
+ problem = True
raise scraperwiki.sqlite.SqliteError("Unable to write variable " + var + " to database, tried " + str(writetries) + " times")
fieldmap = {
@@ -177,9 +185,8 @@ def fetch_oep_entry(id, datastorage):
# scraperwiki.sqlite.save(unique_keys=['journalPostId'], data=data)
return 0
-def fetch_range(first, last, step):
+def fetch_range(datastorage, first, last, step):
myskiplimit = skiplimit
- datastorage = []
skipped = 0
fetched = 0
min_id = first
@@ -312,6 +319,16 @@ def remove_original():
print "Starting to fetch journal entries " + str(datetime.datetime.now())
scraperwiki.scrape("http://www.oep.no/")
+datastorage = []
+
+# Update entries to handle <URL: https://rt.nuug.no:443/Ticket/Display.html?id=6342 >.
+# Used 2012-09-17
+#scraperwiki.sqlite.execute("DELETE from swdata where journalPostId = 638167")
+#fetch_oep_entry(638167, datastorage)
+#scraperwiki.sqlite.execute("DELETE from swdata where journalPostId = 638104")
+#fetch_oep_entry(638104, datastorage)
+#scraperwiki.sqlite.commit()
+
count = 10000
skiplimit = 500
# Random value fairly close to the most recent ID when this project started 2012-05-03
@@ -320,20 +337,21 @@ try:
max = scraperwiki.sqlite.select("max(journalPostId) as max from swdata")[0]["max"]
if 0 < scraperwiki.sqlite.get_var('min_tested_id'):
saved_min = scraperwiki.sqlite.get_var('min_tested_id')
+ else:
+ saved_min = 0
sql_min = scraperwiki.sqlite.select("min(journalPostId) as min from swdata")[0]["min"]
print "Saved min: " + str(saved_min) + ", sql min: " + str(sql_min)
if sql_min < saved_min:
min = sql_min
else:
min = saved_min
-
print "Scraping " + str(count) + " IDs below " + str(min) + " and above " + str(max)
except scraperwiki.sqlite.SqliteError:
pass
-fetched = fetch_range(max + 1, max + count, 1)
+fetched = fetch_range(datastorage, max + 1, max + count, 1)
print "Fetched " + str(fetched) + " new journal entries, cpu spent: " + str(cpu_spent())
if min >= 0:
- fetched = fetch_range(min, min - count, -1)
+ fetched = fetch_range(datastorage, min, min - count, -1)
print "Fetched " + str(fetched) + " old journal entries, cpu spent: " + str(cpu_spent())
diff --git a/scrapersources/postliste-oep-deliverydates b/scrapersources/postliste-oep-deliverydates
index f04ce49..ebce253 100644
--- a/scrapersources/postliste-oep-deliverydates
+++ b/scrapersources/postliste-oep-deliverydates
@@ -30,7 +30,9 @@ def fetch_oep_deliverydates(url, datastorage):
return 0
datastorage = []
-fetch_oep_deliverydates("http://www.oep.no/pub/faces/statistikk.jsp?reposId=3", datastorage)
+#fetch_oep_deliverydates("http://www.oep.no/pub/faces/statistikk.jsp?reposId=3", datastorage)
+# New url before 2012-11-09
+fetch_oep_deliverydates("http://www.oep.no/pub/report.xhtml?reportId=3", datastorage)
print datastorage
scraperwiki.sqlite.save(unique_keys=['agency', 'deliverydate'], data=datastorage)
diff --git a/scrapersources/postliste-ruter b/scrapersources/postliste-ruter
index 757d6be..51a2776 100644
--- a/scrapersources/postliste-ruter
+++ b/scrapersources/postliste-ruter
@@ -10,6 +10,7 @@ import resource
import sys
import urlparse
import re
+scraperwiki.scrape('http://www2.ruter.no/verdt-a-vite/presse/offentlig-journal/')
lazycache=scraperwiki.swimport('lazycache')
postlistelib=scraperwiki.swimport('postliste-python-lib')