author     Petter Reinholdtsen <pere@hungry.com>  2012-07-13 12:28:13 +0200
committer  Petter Reinholdtsen <pere@hungry.com>  2012-07-13 12:28:13 +0200
commit     22bceaf65dd89df97529df0102149aefa2b54f54 (patch)
tree       24a6dd995d146b27d92d4c91593dc8d8fd952064
Store current scraperwiki sources.
-rwxr-xr-x  fetch-scraper-sources | 15
-rw-r--r--  scrapersources/nrks_offentlig_journal_pdf_text_positioning | 141
-rw-r--r--  scrapersources/oep-exemptions | 101
-rw-r--r--  scrapersources/oep-exemptions_1 | 101
-rw-r--r--  scrapersources/postlist-ssb | 164
-rw-r--r--  scrapersources/postliste-arendal | 188
-rw-r--r--  scrapersources/postliste-ballangen | 276
-rw-r--r--  scrapersources/postliste-ballangen-view | 124
-rw-r--r--  scrapersources/postliste-bioforsk | 81
-rw-r--r--  scrapersources/postliste-difi | 88
-rw-r--r--  scrapersources/postliste-fredrikstad | 196
-rw-r--r--  scrapersources/postliste-hadsel | 108
-rw-r--r--  scrapersources/postliste-halden | 93
-rw-r--r--  scrapersources/postliste-hoegskolen-i-gjoevik | 104
-rw-r--r--  scrapersources/postliste-hoegskolen-i-hamar | 103
-rw-r--r--  scrapersources/postliste-hoegskolen-i-lillehammer | 90
-rw-r--r--  scrapersources/postliste-hole | 237
-rw-r--r--  scrapersources/postliste-hvaler | 81
-rw-r--r--  scrapersources/postliste-kafjord | 81
-rw-r--r--  scrapersources/postliste-kristiansund | 87
-rw-r--r--  scrapersources/postliste-lier | 81
-rw-r--r--  scrapersources/postliste-lindesnes | 124
-rw-r--r--  scrapersources/postliste-luftambulanse | 91
-rw-r--r--  scrapersources/postliste-naroy | 89
-rw-r--r--  scrapersources/postliste-nih | 85
-rw-r--r--  scrapersources/postliste-npolar | 101
-rw-r--r--  scrapersources/postliste-nrk | 94
-rw-r--r--  scrapersources/postliste-ntnu | 87
-rw-r--r--  scrapersources/postliste-oep | 336
-rw-r--r--  scrapersources/postliste-oep-deliverydates | 37
-rw-r--r--  scrapersources/postliste-oslo-bydel-ullern | 85
-rw-r--r--  scrapersources/postliste-oslo-gravferdsetaten | 90
-rw-r--r--  scrapersources/postliste-oslo-havn | 86
-rw-r--r--  scrapersources/postliste-oslo-radhusets-forvaltningstjeneste | 231
-rw-r--r--  scrapersources/postliste-python-lib | 577
-rw-r--r--  scrapersources/postliste-risr-kommune | 126
-rw-r--r--  scrapersources/postliste-ruter | 81
-rw-r--r--  scrapersources/postliste-saltdal | 98
-rw-r--r--  scrapersources/postliste-sivilombudsmannen | 81
-rw-r--r--  scrapersources/postliste-sogne | 81
-rw-r--r--  scrapersources/postliste-storfjord | 82
-rw-r--r--  scrapersources/postliste-stortinget | 90
-rw-r--r--  scrapersources/postliste-universitetet-i-oslo | 125
-rw-r--r--  scrapersources/postliste-universitetet-i-stavanger | 89
-rw-r--r--  scrapersources/postliste-vikna | 89
-rw-r--r--  scrapersources/statens_vegvesen_offentlig_journal | 56
46 files changed, 5651 insertions(+), 0 deletions(-)
diff --git a/fetch-scraper-sources b/fetch-scraper-sources
new file mode 100755
index 0000000..6465ea3
--- /dev/null
+++ b/fetch-scraper-sources
@@ -0,0 +1,15 @@
+#!/bin/sh
+
+scrapers="postliste-python-lib postliste-ballangen"
+
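+# GET is the command line HTTP client from libwww-perl (lwp-request).  The
+# scraper names are taken from the lines mentioning "owner" on the ScraperWiki
+# postjournal tag pages: the third path component from the end of each link.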
+scrapers="$( (
+ GET https://scraperwiki.com/tags/postjournal
+ GET https://scraperwiki.com/tags/postjournal?page=2
+ GET https://scraperwiki.com/tags/postjournal?page=3
+) | grep owner | rev | cut -d/ -f3 | rev)"
+
+mkdir -p scrapersources
+for scraper in $scrapers; do
+ echo "Fetching source for $scraper"
+ GET "https://scraperwiki.com/editor/raw/$scraper" > "scrapersources/$scraper"
+done
diff --git a/scrapersources/nrks_offentlig_journal_pdf_text_positioning b/scrapersources/nrks_offentlig_journal_pdf_text_positioning
new file mode 100644
index 0000000..51cd746
--- /dev/null
+++ b/scrapersources/nrks_offentlig_journal_pdf_text_positioning
@@ -0,0 +1,141 @@
+import scraperwiki, urllib2, datetime, base64, time, re
+from bs4 import BeautifulSoup
+from collections import deque
+import scraperwiki
+lazycache = scraperwiki.swimport('lazycache')
+u = scraperwiki.swimport('hildenae_utils')
+
+def d(text):
+ if(False):
+ print "DEBUG:", text
+
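+# Fetch the PDF (cached via lazycache), convert it to XML with pdftoxml and
+# cache that result too, then walk the <text> elements of each page looking for
+# the "Innhold:" headers that mark the start of a journal entry.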
+def process_pdf(pdfurl):
+ pdfxml = u.findInCache(pdfurl,verbose=True) # look for html parse in cache
+ if pdfxml is None: # a html parse is not cached
+ pdfdata=lazycache.lazycache(pdfurl, verbose=True) # look for pdf document in cache, if not download
+ pdfxml = scraperwiki.pdftoxml(pdfdata, "-hidden") # parse pdf text to html
+ u.putInCache(pdfurl, pdfxml, verbose=True) # save cache of html parse
+
+ beautifulxml = BeautifulSoup(pdfxml) # convert html to BeautifulSoup(4) object
+
+ for page in beautifulxml.find_all('page'):
+ FIRSTPAGE = 6
+ LASTPAGE = 6
+ if int(page['number']) < FIRSTPAGE:
+ continue
+ if int(page['number']) == FIRSTPAGE:
+ print "*******************************************"
+ print "***** FIRSTPAGE #%d while developing ******" % (FIRSTPAGE)
+ print "*******************************************"
+ if int(page['number']) == LASTPAGE+1:
+ print "*******************************************"
+ print "****** LASTPAGE #%d while developing ******" % (LASTPAGE)
+ print "*******************************************"
+ break
+
+ print( "*******************************************")
+ print( "********** Working on page #%s **********" % page['number'])
+ print( "*******************************************")
+ elementList = deque(page.find_all('text')) # we want to be able to use popleft
+ d(elementList)
+ while True:
+ try:
+ currElement = elementList.popleft()
+ if "Innhold:" in currElement.text and currElement.b: # we found a "Innhold:"-header
+ entry = parseDocumentRecord(currElement, elementList)
+ print entry
+ scraperwiki.sqlite.save(unique_keys=["innhold", "sakstittel"], data=entry)
+ d( "back in process_pdf")
+ #else:
+ #print currElement.text
+ except IndexError, e:
+ d("No more text elements on page (%s)" % e)
+ break
+
+
+
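+# Consume text elements from the deque and build one journal entry: the text
+# after "Innhold:" up to "Sakstittel:", the case title up to "DokType", and the
+# sender/recipient after "mottaker:", stopping at the next "Innhold:" or "Side:".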
+def parseDocumentRecord(currElement, elementList):
+ # previous element in list is "Innhold:"
+ d ("starting parseDocumentRecord")
+ entry = {}
+ while(True):
+ try:
+ d(elementList)
+ if "Innhold:" in elementList[0].text: # look ahead, if next is "Innhold:" return to process_pdf
+ break
+
+ currElement = elementList.popleft() # first text in innhold
+ entry["innhold"] = ""
+ while(True):
+ if "Sakstittel:" in currElement.text: # we found sakstittel, go to next
+ break
+ entry["innhold"] += currElement.text
+ currElement = elementList.popleft()
+ entry["innhold"] = u.removeDoubleSpaces(entry["innhold"])
+
+ currElement = elementList.popleft() # first text in sakstittel
+ entry["sakstittel"] = ""
+ while(True):
+ if "DokType" in currElement.text: # we found DokType, go to next
+ break
+ entry["sakstittel"] += currElement.text
+ currElement = elementList.popleft()
+ entry["sakstittel"] = u.removeDoubleSpaces(entry["sakstittel"])
+
+ print("before spool to 'mottaker:'")
+
+            '''
+            Comment: It seems pdf2html sometimes fails to read DokType.  So far this
+            has only been observed when DokType is U (even though it does manage to
+            read some DokType U entries).  Confirmed for most of the 18 and 22 June
+            journals.
+            '''
+ print elementList
+
+
+
+ print("spool to 'mottaker:'")
+ currElement = elementList.popleft() # first text after DocType
+ while(True):
+ if re.search( r'[t].*[t].*[a].*[k].*[e].*[r].*[:]', currElement.text): # match "motta ker:" (some last pages - nooooot pretty)
+ d("found mottaker")
+ break
+ currElement = elementList.popleft()
+
+ d(elementList)
+
+ entry["avsender_mottager"] = ""
+ while(True):
+ if ("Innhold:" in elementList[0].text) or ("Side:" in elementList[0].text): # ***look ahead***, if next is "Innhold:" return to process_pdf
+ #print "next is innhold, cleanup"
+ entry["avsender_mottager"] = u.removeDoubleSpaces(entry["avsender_mottager"])
+ if re.match("^[*]+$", entry["avsender_mottager"]):
+ entry["avsender_mottager"] = None
+ #print elementList
+ #print entry
+ d("finished with record")
+ break
+ #print "Adding to avs_mot (%s)" % currElement.text
+ entry["avsender_mottager"] += currElement.text
+ currElement = elementList.popleft()
+
+ #print "lastBreak"
+ break # we are finished with this Innhold
+ except IndexError, e:
+ d("No more text elements on page (%s)" % e)
+ break
+ return entry
+
+process_pdf("http://www.nrk.no/contentfile/file/1.8221353!offentlig22062012.pdf") # 4 records on last page
+#process_pdf("http://www.nrk.no/contentfile/file/1.8217234!offentligjournal21062012.pdf") # 3 records on last page
+#process_pdf("http://www.nrk.no/contentfile/file/1.8214156!offentligjournal20062012.pdf")
+#process_pdf("http://www.nrk.no/contentfile/file/1.8212381!offentligjournal19062012.pdf")
+
+# https://views.scraperwiki.com/run/pdf_to_html_preview_4/?url=http%3A%2F%2Fwww.nrk.no%2Fcontentfile%2Ffile%2F1.8209505%21offentligjournal18062012.pdf&hidden=1
+#process_pdf("http://www.nrk.no/contentfile/file/1.8209505!offentligjournal18062012.pdf") # 1 record on last page
+
+
diff --git a/scrapersources/oep-exemptions b/scrapersources/oep-exemptions
new file mode 100644
index 0000000..23a1691
--- /dev/null
+++ b/scrapersources/oep-exemptions
@@ -0,0 +1,101 @@
+<!doctype html>
+<html lang="nb">
+<head>
+<meta charset="utf-8" />
+<title>Hvor mange dokumenter er journalført i hver etat og hvor mange er unntatt innsyn?</title>
+<script src="https://cdnjs.cloudflare.com/ajax/libs/jquery/1.7.2/jquery.min.js"></script>
+<script src="https://cdnjs.cloudflare.com/ajax/libs/highcharts/2.2.2/highcharts.js"></script>
+<!-- <script src="https://code.highcharts.com/modules/exporting.js"></script>-->
+<script src="https://cdnjs.cloudflare.com/ajax/libs/underscore.js/1.3.3/underscore-min.js"></script>
+<script>
+$(function()
+ {
+ var chart;
+ var query_url = "https://api.scraperwiki.com/api/1.0/datastore/sqlite?format=jsondict&name=postliste-oep&query=select%20Agency%2C%22Grounds%20for%20exemption%20document%22%20as%20ex%2C%20count(*)%20as%20num%20from%20%60swdata%60%20group%20by%20Agency%2Cex%20";
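+    // The query returns one row per (agency, grounds for exemption) pair with
+    // the number of journal entries; rows with an empty exemption ground are
+    // the documents that were not exempted.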
+
+ function get_chart_opts(agencies, series) {
+ return {
+ chart: { renderTo: 'container', type: 'bar' },
+ title: { text: 'Hvor mange dokumenter er journalført i hver etat og hvor mange er unntatt innsyn?' },
+ xAxis: { categories: agencies },
+ yAxis: {
+ min: 0,
+ title: { text: "Antall journalførte dokumenter" },
+ },
+ legend: {
+ backgroundColor: '#FFFFFF',
+ reversed: true
+ },
+ tooltip: {
+ formatter: function() {
+ return ''+ this.series.name + ': '+ this.y + ' ('+parseInt(this.percentage) + '%)';
+
+ }
+ },
+ plotOptions: {
+ series: {
+ stacking: 'normal'
+ }
+ },
+ series: series
+ };
+
+ }
+
+ function populate_chart(data) {
+ // TODO: Very naive iteration today. Should be optimized
+ var agencies = _.uniq( _.pluck(data, 'Agency') );
+ var totals = {};
+ var not_exemption = {};
+ var series = [];
+
+ // traverse and find data
+ _.each(data, function(entry) {
+ var agency_name = entry['Agency'];
+
+ if (agency_name) {
+ if (! totals[agency_name]) {
+ totals[agency_name] = 0;
+ }
+ totals[agency_name] += entry['num'];
+
+ if ("" == entry['ex']) {
+ not_exemption[agency_name] = entry['num'];
+ }
+ }
+ });
+
+
+ // make series
+ series.push({ name: 'Ingen merknader',
+ data: _.map(agencies, function(agency) {
+ return not_exemption[agency];
+ })
+ });
+
+
+ series.push({ name: 'Unntatt innsyn',
+ data: _.map(agencies, function(agency) {
+ return totals[agency] - not_exemption[agency];
+ })
+ });
+
+
+
+ chart = new Highcharts.Chart(get_chart_opts(agencies, series));
+ };
+
+
+ $(document).ready(function() {
+ $.ajax({ url: query_url, dataType: 'json', success: function(data){ populate_chart(data); } });
+ });
+}
+);
+
+</script>
+</head>
+<body>
+ <div id="container" style="height: 2000px;width: 100%;margin: 0 auto"></div>
+  <p>Alle dokumenter som har oppgitt en grunn for å unnlate offentliggjøring vil telles som "Unntatt innsyn".</p>
+</body>
+</html>
diff --git a/scrapersources/oep-exemptions_1 b/scrapersources/oep-exemptions_1
new file mode 100644
index 0000000..29c3a98
--- /dev/null
+++ b/scrapersources/oep-exemptions_1
@@ -0,0 +1,101 @@
+<!doctype html>
+<html lang="nb">
+<head>
+<meta charset="utf-8" />
+<title>Hvor mange dokumenter er journalført i hver etat og hvor mange er unntatt innsyn?</title>
+<script src="https://cdnjs.cloudflare.com/ajax/libs/jquery/1.7.2/jquery.min.js"></script>
+<script src="https://cdnjs.cloudflare.com/ajax/libs/highcharts/2.2.2/highcharts.js"></script>
+<!-- <script src="https://code.highcharts.com/modules/exporting.js"></script>-->
+<script src="https://cdnjs.cloudflare.com/ajax/libs/underscore.js/1.3.3/underscore-min.js"></script>
+<script>
+$(function()
+ {
+ var chart;
+ var query_url = "https://api.scraperwiki.com/api/1.0/datastore/sqlite?format=jsondict&name=postliste-oep&query=select%20Agency%2C%22Grounds%20for%20exemption%20document%22%20as%20ex%2C%20count(*)%20as%20num%20from%20%60swdata%60%20group%20by%20Agency%2Cex%20";
+
+ function get_chart_opts(agencies, series) {
+ return {
+ chart: { renderTo: 'container', type: 'bar' },
+ title: { text: 'Hvor mange dokumenter er journalført i hver etat og hvor mange er unntatt innsyn?' },
+ xAxis: { categories: agencies },
+ yAxis: {
+ min: 0,
+ title: { text: "Antall journalførte dokumenter" },
+ },
+ legend: {
+ backgroundColor: '#FFFFFF',
+ reversed: true
+ },
+ tooltip: {
+ formatter: function() {
+ return ''+ this.series.name + ': '+ this.y + ' ('+parseInt(this.percentage) + '%)';
+
+ }
+ },
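+        // Unlike oep-exemptions, this view stacks the series as percentages
+        // rather than absolute counts.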
+ plotOptions: {
+ series: {
+ stacking: 'percent'
+ }
+ },
+ series: series
+ };
+
+ }
+
+ function populate_chart(data) {
+ // TODO: Very naive iteration today. Should be optimized
+ var agencies = _.uniq( _.pluck(data, 'Agency') );
+ var totals = {};
+ var not_exemption = {};
+ var series = [];
+
+ // traverse and find data
+ _.each(data, function(entry) {
+ var agency_name = entry['Agency'];
+
+ if (agency_name) {
+ if (! totals[agency_name]) {
+ totals[agency_name] = 0;
+ }
+ totals[agency_name] += entry['num'];
+
+ if ("" == entry['ex']) {
+ not_exemption[agency_name] = entry['num'];
+ }
+ }
+ });
+
+
+ // make series
+ series.push({ name: 'Ingen merknader',
+ data: _.map(agencies, function(agency) {
+ return not_exemption[agency];
+ })
+ });
+
+
+ series.push({ name: 'Unntatt innsyn',
+ data: _.map(agencies, function(agency) {
+ return totals[agency] - not_exemption[agency];
+ })
+ });
+
+
+
+ chart = new Highcharts.Chart(get_chart_opts(agencies, series));
+ };
+
+
+ $(document).ready(function() {
+ $.ajax({ url: query_url, dataType: 'json', success: function(data){ populate_chart(data); } });
+ });
+}
+);
+
+</script>
+</head>
+<body>
+ <div id="container" style="height: 2000px;width: 100%;margin: 0 auto"></div>
+  <p>Alle dokumenter som har oppgitt en grunn for å unnlate offentliggjøring vil telles som "Unntatt innsyn".</p>
+</body>
+</html>
diff --git a/scrapersources/postlist-ssb b/scrapersources/postlist-ssb
new file mode 100644
index 0000000..de2a051
--- /dev/null
+++ b/scrapersources/postlist-ssb
@@ -0,0 +1,164 @@
+import scraperwiki
+import urllib2
+import lxml.html
+import datetime
+import time
+import dateutil.parser
+import pickle
+import re
+
+from datetime import date
+from datetime import timedelta
+from time import strftime
+
+# Make sure Scraperwiki believe this is the source from this database
+scraperwiki.scrape("http://www.ssb.no/omssb/journal/")
+
+postlistelib=scraperwiki.swimport('postliste-python-lib')
+
+agency = "Statistisk sentralbyrå"
+
+def daterange(start_date, end_date):
+ for n in range((end_date - start_date).days):
+ yield start_date + timedelta(n)
+
+def expand_year(year):
+ year = int(year)
+ if year > 50:
+ year = year + 1900
+ else:
+ year = year + 2000
+ return year
+
+def fetch_url(url):
+ html = None
+ for n in [1]:
+ try:
+ html = scraperwiki.scrape(url)
+ break
+ except urllib2.URLError, e:
+ print "URLError fetching " + url + ", trying again"
+ return html
+
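+# Parse one day's journal page.  The page header gives the journal date, each
+# <table> element holds one journal entry, and the fields are picked out of
+# fixed row/cell positions.  Entries are only saved when the page's journal
+# date matches the requested date.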
+def save_date(parser, date, url, html):
+ num_saved = 0
+ root = lxml.html.fromstring(html)
+ journal_date = dateutil.parser.parse(root.cssselect("p")[0].text_content().replace("Journaldato: ",""), dayfirst=True)
+ if date == journal_date.date():
+ datastore = []
+ for table in root.cssselect("table"):
+ docid = table.cssselect("tr")[0].cssselect("p")[1].text.strip()
+ datedesc = table.cssselect("tr")[0].cssselect("td")[3].cssselect("p")[0].text.strip()
+
+ exemption = table.cssselect("tr")[1].cssselect("td")[5].cssselect("p")[0].text.strip()
+
+ fratil_indicator = table.cssselect("tr")[2].cssselect("td")[0].cssselect("p")[0].text.strip()
+
+ doctype = ""
+ if fratil_indicator.startswith("Til"):
+ doctype = "U"
+ elif fratil_indicator.startswith("Fra"):
+ doctype = "I"
+ elif fratil_indicator.startswith("Notat fra"):
+ doctype = "N"
+ else:
+ raise ValueError("Fant ikke doctype %s" % fratil_indicator)
+
+ fratil_agency = table.cssselect("tr")[2].cssselect("td")[1].cssselect("p")[0].text.strip()
+
+ casedesc = table.cssselect("tr")[4].cssselect("td")[1].cssselect("p")[0].text.strip()
+
+ docdesc = table.cssselect("tr")[5].cssselect("td")[1].cssselect("p")[0].text.strip()
+ saksb = table.cssselect("tr")[0].cssselect("p")[5].text.strip()
+
+ docdate = dateutil.parser.parse(datedesc.strip(), dayfirst=True)
+
+ matchObj = re.match( r'(\d+)/(\d+)\s*-\s*(\d+)$', docid, re.M|re.I)
+ if matchObj:
+ caseyear = matchObj.group(1)
+ caseseqnr = matchObj.group(2)
+ casedocseq = matchObj.group(3)
+ caseyear = expand_year(caseyear)
+ caseid = str(caseyear) + "/" + str(caseseqnr)
+ else:
+ print "error: invalid Arkivsaksnr: " + docid
+ matchObj = re.match( r'(\d+)/(\d+)\s*-', docid, re.M|re.I)
+ if matchObj:
+ caseyear = expand_year(matchObj.group(1))
+ caseseqnr = matchObj.group(2)
+ caseid = str(caseyear) + "/" + str(caseseqnr)
+
+ if parser.is_sender_doctype(doctype):
+ fratilfield = 'sender'
+ elif parser.is_recipient_doctype(doctype):
+ fratilfield = 'recipient'
+
+ data = {
+ 'agency' : agency,
+ 'docdate' : docdate.date(),
+ 'recorddate' : journal_date.date(),
+ 'docdesc' : docdesc,
+ 'casedesc' : casedesc,
+ 'caseid' : caseid,
+ 'docid' : docid,
+
+ 'caseyear' : caseyear,
+ 'caseseqnr' : caseseqnr,
+ 'casedocseq' : casedocseq,
+
+ fratilfield : fratil_agency,
+ 'doctype' : doctype,
+
+ 'saksbehandler' : saksb,
+
+ 'exemption' : exemption,
+
+ 'scrapedurl' : url,
+ 'scrapestamputc' : datetime.datetime.now()
+ }
+ parser.verify_entry(data)
+ datastore.append(data)
+ scraperwiki.sqlite.save(unique_keys=['docid'], data=datastore)
+ num_saved += len(datastore)
+ datastore = []
+ #print "Saved %s" % data['caseid']
+ else:
+ # TODO: log error or exit?
+ msg = "Tried to scrape %s but got %s" % (date, journal_date.date())
+ #raise ValueError(msg)
+ print msg
+
+ return num_saved
+
+def scrape_date(parser, date):
+ url = base_url % (strftime("%d%m%y", date.timetuple()))
+ html = fetch_url(url)
+ if html:
+ return save_date(parser, date, url, html)
+
+base_url = 'http://www.ssb.no/omssb/journal/OJ%s.htm'
+end_date = date.today()
+
+#print res
+
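+# Resume from the last fully scraped date (stored as a pickled sqlite variable),
+# falling back to 2011-01-03 on the first run.  Weekends are skipped below.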
+start_date_obj = scraperwiki.sqlite.get_var('last_finished_date')
+
+if start_date_obj:
+ start_date = pickle.loads(start_date_obj)
+else:
+ start_date = datetime.date(2011, 1, 3)
+
+print "Start date %s" % start_date
+
+parser = postlistelib.JournalParser(agency=agency)
+
+for single_date in daterange(start_date, end_date):
+ if single_date.weekday() < 5:
+ num_saved = scrape_date(parser, single_date)
+ print "Scraped %s found %s" % (single_date, num_saved)
+ if num_saved > 0:
+ scraperwiki.sqlite.save_var('last_finished_date', pickle.dumps(single_date))
+
+ if num_saved == None:
+ print "No more new. Exit..."
+ break
diff --git a/scrapersources/postliste-arendal b/scrapersources/postliste-arendal
new file mode 100644
index 0000000..5960033
--- /dev/null
+++ b/scrapersources/postliste-arendal
@@ -0,0 +1,188 @@
+import scraperwiki
+
+import json
+import httplib, urllib
+import datetime
+import dateutil.parser
+import time
+import re
+
+agency = "Arendal kommune"
+urlhost = "www.arendal.kommune.no"
+
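+# Map the field names returned by the eDemokrati JSON service to the normalised
+# column names used by these journal scrapers.  An empty value keeps the
+# original field name.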
+fieldmap = {
+ 'AntallVedlegg' : '',
+ 'Arkivdel' : '',
+ 'AvsenderMottaker' : 'sender', # or recipient
+ 'Dokumentdato' : 'docdate',
+ 'Dokumentnummer' : 'casedocseq',
+ 'Dokumenttype' : 'doctype',
+ 'EkspedertDato' : '',
+ 'Hjemmel' : 'exemption',
+ 'Id' : 'id',
+ 'Innholdsbeskrivelse' : 'docdesc',
+ 'Mappetype' : '',
+ 'Offentlig' : 'ispublic',
+ 'PostlisteType' : 'doctype',
+ 'RegistrertDato' : 'recorddate',
+ 'SaksId' : '',
+ 'SaksNr' : 'caseid',
+ 'Sakstittel' : 'casedesc',
+ #'SaksNr' : 'SA.SAAR + SA.SEKNR',
+ 'Saksansvarlig' : 'saksbehandler',
+ 'SaksansvarligEnhet' : '',
+ 'SaksansvarligEpost' : '',
+
+# 'scrapestamputc' : '',
+# 'scrapedurl' : '',
+# 'agency' : '',
+}
+
+
+# Convert "/Date(1317808020000+0200)/" to a datetime object
+# FIXME Currently ignore the timezone information
+def parse_datestr(str):
+ match = re.split("[/()+]", str)
+# print match
+ sinceepoch = float(match[2]) / 1000
+ if match[3] == '0200':
+ sinceepoch = sinceepoch + 2 * 60 * 60
+ if match[3] == '0100':
+ sinceepoch = sinceepoch + 1 * 60 * 60
+# print sinceepoch
+ date = datetime.datetime.fromtimestamp(sinceepoch)
+# print date
+ return date
+
+def reformat_caseid(caseid):
+ # Input 12/13123, output 2012, 13123, "2012/13123"
+ year, seqnr = caseid.split("/")
+ year = int(year)
+ if year < 100:
+ year = year + 2000
+ caseid = "%d/%s" % (year, seqnr)
+ return year, int(seqnr), caseid
+
+def ws_post(url, urlhost, urlpath, params):
+ jsonparams = json.dumps(params)
+ headers = {"Content-type": "application/json; charset=utf-8",
+ "Accept": "application/json"}
+ conn = httplib.HTTPConnection(urlhost)
+ #print jsonparams
+ conn.request("POST", urlpath, jsonparams, headers)
+ response = conn.getresponse()
+ #print response.status, response.reason
+ jsonres = response.read()
+ res = json.loads(jsonres)
+ #print res
+ return res
+
+def fetch_journal_entry(id):
+ params = { "id" : str(id)}
+ headers = {"Content-type": "application/json; charset=utf-8",
+ "Accept": "application/json"}
+ urlpath = "/Templates/eDemokrati/Services/eDemokratiService.svc/GetPostlisteObjekt"
+ data = ws_post(None, urlhost, urlpath, params)['d']
+ entry = None
+ if data:
+ del data['__type'] # This is useless, ignore
+ print data
+ entry = {}
+ entry['agency'] = agency
+ entry['scrapestamputc'] = datetime.datetime.now()
+ entry['scrapedurl'] = "http://" + urlhost + urlpath
+# entry['scrapedurl'] = url
+ for dfield in fieldmap.keys():
+ if dfield in data and data[dfield]:
+ if dfield in fieldmap and fieldmap[dfield] != "":
+ fieldname = fieldmap[dfield]
+ else:
+ fieldname = dfield
+ if 'sender' == fieldname:
+ if data['Dokumenttype'] == 'U':
+ fieldname = 'recipient'
+ if dfield in ['RegistrertDato', 'Dokumentdato', 'EkspedertDato']:
+ entry[fieldname] = parse_datestr(data[dfield]).date()
+ else:
+ entry[fieldname] = data[dfield]
+ else:
+ entry[dfield] = data[dfield]
+ entry['caseyear'], entry['caseseqnr'], entry['caseid'] = reformat_caseid(entry['caseid'])
+# data["sourceurl"] = "http://" + server + path
+ print entry
+ return entry
+
+def epoctime_to_datestr(epoctime):
+ return "/Date("+str(int(epoctime * 1000) )+")/"
+
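+# Find the highest journal entry Id published during the last week by querying
+# every arkivdel (archive part) and document type the web service reports.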
+def get_last_entry_id():
+ now = time.time()
+ # Get the last week, as the most recent entry should be in this range
+ fradato = epoctime_to_datestr(now - 7 * 24 * 60 * 60)
+ tildato = epoctime_to_datestr(now)
+ #print fradato
+
+ maxid = 0
+
+ urlpath = "/Templates/eDemokrati/Services/eDemokratiService.svc/GetPostlisteArkivdeler"
+ params = {
+ "dato": fradato,
+ "tilDato": tildato,
+ "søkestreng":""}
+ arkivdeler = ws_post(None, urlhost, urlpath, params)['d']
+ # {u'd': [u'_', u'HVA-IFE-A', u'KAR-BR-A', u'KAR-BRUK-A', u'KAR-EIEN-A', u'KAR-ELBH-A', u'KAR-ELS-A', ...
+
+ urlpath = "/Templates/eDemokrati/Services/eDemokratiService.svc/GetPostlisteDokumenttyper"
+ for arkivdel in arkivdeler[0]:
+ params = {
+ "dato":fradato,
+ "tilDato":tildato,
+ "søkestreng":"",
+ "arkivdel":arkivdel,
+ }
+ doctypes = ws_post(None, urlhost, urlpath, params)['d']
+ #{"d":["I","N","S","U","X"]}
+ urlpath = "/Templates/eDemokrati/Services/eDemokratiService.svc/GetPostlisteS%C3%B8k"
+ for doctype in doctypes:
+ params = {
+ "fraDato":fradato,
+ "tilDato":tildato,
+ "søkestreng":"",
+ "arkivdel":arkivdel,
+ "dokumenttype":doctype,
+ }
+ entries = ws_post(None, urlhost, urlpath, params)['d']
+ for entry in entries:
+ #print entry['Id']
+ id = int(entry['Id'])
+ if id > maxid:
+ maxid = id
+# data = fetch_journal_entry(entry['Id'])
+# if data:
+# scraperwiki.sqlite.save(unique_keys=['id'], data=data)
+ return maxid
+
+#{"d":[{"__type":"PostlisteObjekt:#SSP.NoarkServices","AntallVedlegg":1,"Dokumentnummer":2,"Dokumenttype":"I","EkspedertDato":null,"Hjemmel":null,"Id":1507868,"Innholdsbeskrivelse":"Tomtejustering - Lillebæk, eiendom 208\/1611","Offentlig":true,"RegistrertDato":"\/Date(1339538400000+0200)\/","SaksId":296971,"SaksNr":"12\/8658","Arkivdel":"KAR-EIEN-A","AvsenderMottaker":"Randi Wilberg","Dokumentdato":"\/Date(1339624800000+0200)\/","Mappetype":"DS","PostlisteType":"I","Saksansvarlig":null,"SaksansvarligEnhet":null,"SaksansvarligEpost":null,"Sakstittel":null},{"__type":"PostlisteObjekt:#SSP.NoarkServices","AntallVedlegg":4,"Dokumentnummer":1,"Dokumenttype":"I","EkspedertDato":null,"Hjemmel":null,"Id":1507865,"Innholdsbeskrivelse":"Søknkad om utvidelse av balkong - Kalleraveien 14","Offentlig":true,"RegistrertDato":"\/Date(1339538400000+0200)\/","SaksId":298804,"SaksNr":"12\/10480","Arkivdel":"KAR-EIEN-A","AvsenderMottaker":"Ole Henning Løken","Dokumentdato":"\/Date(1338847200000+0200)\/","Mappetype":"BS","PostlisteType":"I","Saksansvarlig":null,"SaksansvarligEnhet":null,"SaksansvarligEpost":null,"Sakstittel":null},...
+
+def get_journal_enries_range(min, max, step):
+ for id in range(min, max, step):
+ data = fetch_journal_entry(id)
+ #print data
+ if data:
+ scraperwiki.sqlite.save(unique_keys=['id'], data=data)
+
+maxid = get_last_entry_id()
+print "max id =", maxid
+try:
+ start = scraperwiki.sqlite.select("max(id) as max from swdata")[0]['max'] + 1
+except:
+ start = 137459
+print start, maxid
+#if maxid > start + 20:
+# maxid = start + 10
+get_journal_enries_range(start, maxid, 1)
+
+start = scraperwiki.sqlite.select("min(id) as min from swdata")[0]['min'] - 1
+end = start - 1000
+print start, end
+get_journal_enries_range(start, end, -1)
diff --git a/scrapersources/postliste-ballangen b/scrapersources/postliste-ballangen
new file mode 100644
index 0000000..89e981f
--- /dev/null
+++ b/scrapersources/postliste-ballangen
@@ -0,0 +1,276 @@
+import scraperwiki
+import urllib2
+import lxml.html
+import re
+import dateutil.parser
+from collections import deque
+import datetime
+from dateutil.relativedelta import relativedelta
+
+scraperwiki.scrape("http://www.ballangen.kommune.no/artikler/postlister")
+postlistelib=scraperwiki.swimport('postliste-python-lib')
+
+# <!-- $BeginBlock postjournal_liste -->
+# <tr>
+# <td class="CommonBold">&nbsp;&nbsp;&nbsp;
+#   SØKER KULTURMIDLER FOR BALLANGEN FRIIDRETT
+# </td>
+# </tr>
+# <tr>
+# <td>&nbsp;
+# </td>
+# </tr>
+# <tr>
+# <td>
+# <b>Sakstittel: </b>KULTURMIDLER 2012
+#
+# </td>
+# </tr>
+# <tr>
+# <td>&nbsp;
+# </td>
+# </tr>
+# <tr>
+# <td>
+# <b>Arkivsaksnr.: </b>12/00093 - 032 I&nbsp;&nbsp;&nbsp;&nbsp;<b>Løpenr.:</b
+# >002255/12
+# </td>
+# </tr>
+# <tr>
+# <td><b>Fra/Til: </b>Eirin Sørslett
+# </td>
+# </tr>
+# <tr>
+# <td><b>Saksbehandler: </b>
+# Oddbjørn Dalsbø
+#   (RÅD/KVO)
+# </td>
+# </tr>
+# <tr>
+# <td><b>Datert: </b> 02.04.2012</td>
+# </tr>
+# <tr>
+# <td style="padding-bottom: 15px;">
+# &nbsp;<img src="/icons/vwsent.gif" border="0" align="top" alt="Ikon" />
+# <a href="mailto:post@ballangen.kommune.no?subject=Bestill postjournal med Ark
+# ivsaksnr 12/00093 - 032 I og løpenr 002255/12">Bestill journal</a>
+# </td>
+# </tr>
+
+def saver(unique_keys, data):
+# return
+ #print "Not saving data"
+ scraperwiki.sqlite.save(unique_keys, data)
+
+def expand_year(year):
+ year = int(year)
+ if year > 50:
+ year = year + 1900
+ else:
+ year = year + 2000
+ return year
+
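+# Parse one day's journal page.  Each entry is spread over several table rows
+# (see the sample HTML above); the rows are pushed onto a deque and consumed
+# field by field: title, case title, Arkivsaksnr/Løpenr, from/to, case handler,
+# date and the "Bestill journal" link.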
+def fetch_postjournal_day(parser, url, html, saver):
+ root = lxml.html.fromstring(html)
+
+ listdate = dateutil.parser.parse(root.cssselect("h2")[0].text_content().replace("Postlister for ",""), dayfirst=True)
+ print listdate.date()
+
+ entries = []
+ for tr in root.cssselect("table.ui-corner-all tr"):
+ tds = tr.cssselect("td")
+ line = tds[0].text_content()
+ entries.append(line)
+
+# 9 or 12 lines per entry
+ queue = deque(entries)
+ datastore = []
+ while queue:
+ docdesc = (queue.popleft() + queue.popleft()).strip()
+
+ casedesc = (queue.popleft() + queue.popleft()).replace("Sakstittel:", "").strip()
+
+ ref = queue.popleft().strip()
+ arkivsaksref = re.sub(r"L.penr.:.+$", "", ref).replace("Arkivsaksnr.:","").strip()
+
+ caseyear = 0
+ caseseqnr = 0
+ casedocseq = 0
+ doctype = '?'
+ caseid = 'unknown'
+ matchObj = re.match( r'(\d+)/(\d+)\s*-\s*(\d+) (.+)$', arkivsaksref, re.M|re.I)
+ if matchObj:
+ caseyear = matchObj.group(1)
+ caseseqnr = matchObj.group(2)
+ casedocseq = matchObj.group(3)
+ doctype = matchObj.group(4)
+ caseyear = expand_year(caseyear)
+ caseid = str(caseyear) + "/" + str(caseseqnr)
+ else:
+ print "error: invalid Arkivsaksnr: " + arkivsaksref
+ matchObj = re.match( r'(\d+)/(\d+)\s*-', arkivsaksref, re.M|re.I)
+ if matchObj:
+ caseyear = expand_year(matchObj.group(1))
+ caseseqnr = matchObj.group(2)
+ caseid = str(caseyear) + "/" + str(caseseqnr)
+
+ laapenr = re.sub(r"^.+L.penr.:", "", ref)
+ journalseqnr = 0
+ journalyear = 0
+ journalid = 'unknown'
+ if -1 != laapenr.find('/') and "/" != laapenr: # Avoid broken/empty values
+ journalseqnr, journalyear = laapenr.split("/")
+ journalyear = expand_year(journalyear)
+ journalid = str(journalyear) + "/" + str(journalseqnr)
+ else:
+ print u"error: invalid Løpenr: " + laapenr
+
+ if not parser.is_valid_doctype(doctype):
+ doctype = {
+ 'S' : 'N',
+ 'PLN' : 'N',
+ 'Z' : 'N',
+ }[doctype]
+
+ fratil = queue.popleft().replace("Fra/Til:", "").strip()
+ if parser.is_sender_doctype(doctype):
+ fratilfield = 'sender'
+ elif parser.is_recipient_doctype(doctype):
+ fratilfield = 'recipient'
+
+ saksbehandler = queue.popleft().replace("Saksbehandler:","").strip()
+ saksansvarlig, bar = saksbehandler.split(" (")
+ saksansvarligenhet, foo = bar.split(")")
+ #print saksansvarligenhet
+
+ recorddate = dateutil.parser.parse(queue.popleft().replace("Datert:","").strip(), dayfirst=True)
+
+ requesturl = queue.popleft().strip()
+
+ exemption = ""
+ if -1 != requesturl.find("Gradering"):
+ exemption = requesturl.replace("Gradering:", "").strip()
+ requesturl = queue.popleft()
+ fratil = ""
+
+ data = {
+ 'agency' : parser.agency,
+ 'recorddate' : recorddate.date(),
+ 'docdesc' : docdesc,
+ 'casedesc' : casedesc,
+
+ 'caseyear' : int(caseyear),
+ 'caseseqnr' : int(caseseqnr),
+ 'casedocseq' : int(casedocseq),
+ 'caseid' : caseid,
+ 'doctype' : doctype,
+
+ 'journalseqnr' : int(journalseqnr),
+ 'journalyear' : int(journalyear),
+ 'journalid' : journalid,
+ fratilfield : fratil,
+
+ 'saksbehandler' : saksbehandler,
+ 'saksansvarlig' : saksansvarlig.strip(),
+ 'saksansvarligenhet' : saksansvarligenhet.strip(),
+
+ 'arkivsaksref' : arkivsaksref,
+ 'laapenr' : laapenr,
+ 'exemption' : exemption,
+
+ 'scrapedurl' : url,
+ 'scrapestamputc' : datetime.datetime.now()
+ }
+
+# print data
+ parser.verify_entry(data)
+ datastore.append(data)
+ saver(unique_keys=['arkivsaksref'], data=datastore)
+
+def fetch_postjournal_monthlist(baseurl, html):
+ root = lxml.html.fromstring(html)
+ subset = root.cssselect("div table")
+ urls = subset[0].cssselect("td a")
+ urllist = []
+ for ahref in urls:
+ href = ahref.attrib['href']
+ if -1 != href.find("day="):
+# print href
+ urllist.append(baseurl + href)
+ return urllist
+
+# http://www.offentlighet.no/
+
+agency = "Ballangen kommune"
+baseurl = "http://www.ballangen.kommune.no"
+
+monthurls = []
+
+def addyear(monthurls, year):
+ for m in [12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1]:
+ monthstr = "%02d%d" % (m, year)
+ url = "http://www.ballangen.kommune.no/artikler/postlister?month=" + monthstr
+ monthurls.append(url)
+
+#addyear(monthurls, 2003)
+#addyear(monthurls, 2004) # Consistency problems in http://www.ballangen.kommune.no/artikler/postlister?month=012004&day=06 (bad Arkivsaksnr. and lacking Løpenr.)
+
+#addyear(monthurls, 2005)
+#addyear(monthurls, 2006)
+#addyear(monthurls, 2007)
+#addyear(monthurls, 2008)
+#addyear(monthurls, 2009)
+#addyear(monthurls, 2010)
+#addyear(monthurls, 2011)
+#addyear(monthurls, 2012)
+
+parsemonths = 2
+
+today = datetime.date.today()
+i = 1
+while i <= parsemonths:
+ i = i + 1
+# parsemonths = parsemonths - 1
+ monthtoparse = today + relativedelta(months=parsemonths - i)
+ monthstr = monthtoparse.strftime("%m%Y")
+ url = "http://www.ballangen.kommune.no/artikler/postlister?month=" + monthstr
+ monthurls.append(url)
+
+#url = "http://www.ballangen.kommune.no/artikler/postlister?month=032012&day=19"
+
+def reload_error_entries():
+ monthurls = []
+ problems = scraperwiki.sqlite.select("distinct scrapedurl from swdata where caseid = 'unknown'")
+ for n in problems:
+ monthurls.append(n['scrapedurl'])
+
+print "Fetching public journal!"
+
+parser = postlistelib.JournalParser(agency=agency)
+
+urllist = []
+
+def fetch_url(url):
+ html = None
+ for n in [1, 2, 3]:
+ try:
+ html = scraperwiki.scrape(url)
+ break
+ except urllib2.URLError, e:
+ print "URLError fetching " + url + ", trying again"
+ return html
+
+for monthurl in monthurls:
+ print "Fetching month list from " + monthurl
+ html = fetch_url(monthurl)
+ urllist.extend(fetch_postjournal_monthlist(baseurl = baseurl, html = html))
+
+for dayurl in urllist:
+ res = scraperwiki.sqlite.select("scrapedurl from swdata where scrapedurl = '"+dayurl+"' and scrapestamputc > '2012-06-23T15:12:40' limit 1")
+ if 0 < len(res):
+ continue
+ print "Fetching from " + dayurl
+ html = fetch_url(dayurl)
+# print html
+ fetch_postjournal_day(parser=parser, url=dayurl, html=html, saver=saver)
+
diff --git a/scrapersources/postliste-ballangen-view b/scrapersources/postliste-ballangen-view
new file mode 100644
index 0000000..73d96b4
--- /dev/null
+++ b/scrapersources/postliste-ballangen-view
@@ -0,0 +1,124 @@
+import scraperwiki
+import cgi, os
+import re
+
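+# ScraperWiki view: renders a searchable HTML table over the data scraped by
+# postliste-ballangen (or another scraper given as ?source=).  Filters on case
+# id, agency, case handler, sender/recipient and free text search come from the
+# query string.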
+paramdict = dict(cgi.parse_qsl(os.getenv("QUERY_STRING", "")))
+#print paramdict
+
+if 'source' in paramdict:
+ sourcescraper = paramdict['source']
+else:
+ sourcescraper = 'postliste-ballangen'
+
+scraperwiki.sqlite.attach(sourcescraper)
+
+def htc(m):
+ return chr(int(m.group(1),16))
+
+def urldecode(url):
+ rex=re.compile('%([0-9a-hA-H][0-9a-hA-H])',re.M)
+ return rex.sub(htc,url)
+
+def table_saksbehandler():
+ data = scraperwiki.sqlite.select(
+ '''saksbehandler,count(*) as antall from swdata group by saksbehandler order by antall desc'''
+ )
+ # print data
+
+ print "<table>"
+ print "<tr><th>Saksbehandler</th><th>Saker</th>"
+ for d in data:
+ print "<tr>"
+ print "<td>", d["saksbehandler"], "</td>"
+ print "<td>", d["antall"], "</td>"
+ print "</tr>"
+ print "</table>"
+
+# {'datert': datetime.date(2012, 1, 6), 'arkivsaksref': u'12/00008 - 008 U', 'tittel': u'INNKALLING TIL DR\xd8FTELSESM\xd8TE - 13.01.12', 'sakstittel': u'BEMANNINGSSITUASJON ETTER BUDSJETTVEDTAK 2012', 'laapenr': u'000183/12', 'kommune': 'Ballangen kommune', 'saksbehandler': u'Svenn Ole Wiik\n (R\xc5D/)', 'listdate': datetime.date(2012, 1, 6), 'gradering': '', 'fratil': u'Anne J\xf8rgensen'}
+
+sql = "select * from swdata"
+where = ""
+args = []
+if "caseid" in paramdict:
+ where = where + ' caseid = ?'
+ args.append(paramdict["caseid"])
+if "agency" in paramdict:
+ where = where + ' agency = ?'
+ args.append(urldecode(paramdict["agency"]))
+if "saksansvarlig" in paramdict:
+ where = where + ' saksansvarlig = ?'
+ saksansvarlig = urldecode(paramdict["saksansvarlig"])
+ print "S: '" + saksansvarlig + "'"
+ args.append(urldecode(paramdict["saksansvarlig"]))
+if "fratil" in paramdict:
+ where = where + ' sender = ? or recipient = ?'
+ fratil = urldecode(paramdict["fratil"])
+ args.extend([fratil, fratil])
+if "q" in paramdict:
+ q = urldecode(paramdict["q"])
+ qlike = '%' + q + '%'
+ where = where + ' docdesc like ? or casedesc like ? or sender like ? or recipient like ?'
+ args.extend([qlike, qlike, qlike, qlike])
+if where:
+ sql = sql + ' where ' + where
+sql = sql + " order by recorddate desc, casedocseq limit 200"
+print sql
+data = scraperwiki.sqlite.execute(sql, args)
+#print data
+
+print "<p>Søk i tittel, sakstittel, fra/til.</p>"
+print "<p><form>Enter search term: "
+print "<input name='q' length='60'>"
+print "<input name='source' type='hidden' value='" + sourcescraper + "'>"
+print "<INPUT type=\"submit\" value=\"Search\"> <INPUT type=\"reset\">"
+print "</form></p>"
+print "<table>"
+
+#print data
+
+i = 0
+key = {}
+print "<tr>"
+while i < len(data['keys']):
+ colname = data['keys'][i]
+ key[colname] = i
+ if colname in ["scrapedurl", "caseid", "scrapestamputc"]:
+ True # Skip, see below
+ else:
+ print "<th>" + colname + "</th>"
+ i = i + 1
+print "</tr>"
+
+#print data
+for d in data['data']:
+ print "<tr>"
+ i = 0
+ while i < len(data['keys']):
+ colname = data['keys'][i]
+ value = d[key[colname]]
+ if value is None:
+ value = ""
+ if "docdesc" == colname:
+ if 'scrapedurl' in key:
+ scrapedurl = d[key['scrapedurl']]
+ print "<td><a href='" + scrapedurl + "'>", value, "</a></td>"
+ else:
+ print "<td>", value, "</td>"
+ elif "saksansvarlig" == colname:
+ saksansvarlig = d[key['saksansvarlig']]
+ print "<td><a href='?saksansvarlig=" + saksansvarlig + "'>", value, "</a></td>"
+ elif "casedesc" == colname:
+ caseid = d[key['caseid']]
+ print "<td><a href='?caseid=" + caseid + "&source=" + sourcescraper + "'>", value, "</a></td>"
+ elif "sender" == colname or "recipient" == colname:
+ if "" != value:
+ print "<td><a href='?fratil=" + value + "&source=" + sourcescraper + "'>", value, "</a></td>"
+ else:
+ print "<td></td>"
+ elif colname in ["scrapedurl", "caseid", "scrapestamputc"]:
+ True # Skip these, as they are included as links
+ else:
+ print "<td>", value, "</td>"
+ i = i + 1
+ print "</tr>"
+print "</table>"
diff --git a/scrapersources/postliste-bioforsk b/scrapersources/postliste-bioforsk
new file mode 100644
index 0000000..b41b30f
--- /dev/null
+++ b/scrapersources/postliste-bioforsk
@@ -0,0 +1,81 @@
+# -*- coding: UTF-8 -*-
+
+import scraperwiki
+import json
+from BeautifulSoup import BeautifulSoup
+import datetime
+import dateutil.parser
+import lxml.html
+import resource
+import sys
+import urlparse
+import re
+lazycache=scraperwiki.swimport('lazycache')
+postlistelib=scraperwiki.swimport('postliste-python-lib')
+
+agency = 'Bioforsk AS'
+
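+# Common pattern for the PDF based journal scrapers: process_journal_pdfs()
+# finds PDF links on the listing page, process_pdf() downloads each new PDF and
+# queues its pages via PDFJournalParser.preprocess(), and process_page_queue()
+# turns the queued pages into journal entries, stopping cleanly before the
+# ScraperWiki CPU limit is reached.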
+def report_errors(errors):
+ if 0 < len(errors):
+ print "Errors:"
+ for e in errors:
+ print e
+ exit(1)
+def out_of_cpu(arg, spent, hard, soft):
+ report_errors(arg)
+
+def process_pdf(parser, pdfurl, errors):
+ errors = []
+ postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
+ try:
+ pdfcontent = scraperwiki.scrape(pdfurl)
+ parser.preprocess(pdfurl, pdfcontent)
+ pdfcontent = None
+# except ValueError, e:
+# errors.append(e)
+ except IndexError, e:
+ errors.append(e)
+
+def process_page_queue(parser, errors):
+ try:
+ parser.process_pages()
+ postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
+ except scraperwiki.CPUTimeExceededError, e:
+ errors.append("Processing pages interrupted")
+
+def process_journal_pdfs(parser, listurl, errors):
+# print "Finding PDFs on " + listurl
+# u = urllib.parse.urlparse(listurl)
+ html = scraperwiki.scrape(listurl)
+ root = lxml.html.fromstring(html)
+ html = None
+ for ahref in root.cssselect("table a"):
+ href = ahref.attrib['href']
+ url = urlparse.urljoin(listurl, href)
+ if -1 != href.find("file://") or -1 == url.find(".pdf"):
+# print "Skipping non-http URL " + url
+ continue
+ if parser.is_already_scraped(url):
+ True
+# print "Skipping already scraped " + url
+ else:
+# print "Will process " + url
+ process_pdf(parser, url, errors)
+
+def test_small_pdfs(parser):
+ # Test with some smaller PDFs
+ errors = []
+ process_pdf(parser, "http://www.bioforsk.no/ikbViewer/Content/97492/off_journal_uke17%202012.pdf", errors)
+ process_page_queue(parser, errors)
+ report_errors(errors)
+ exit(0)
+
+errors = []
+parser = postlistelib.PDFJournalParser(agency=agency)
+
+#test_small_pdfs(parser)
+
+process_journal_pdfs(parser, "http://www.bioforsk.no/ikbViewer/page/bioforsk/presse?p_dimension_id=21903", errors)
+process_page_queue(parser, errors)
+report_errors(errors)
+
diff --git a/scrapersources/postliste-difi b/scrapersources/postliste-difi
new file mode 100644
index 0000000..dfc986f
--- /dev/null
+++ b/scrapersources/postliste-difi
@@ -0,0 +1,88 @@
+# -*- coding: UTF-8 -*-
+
+import scraperwiki
+import json
+from BeautifulSoup import BeautifulSoup
+import datetime
+import dateutil.parser
+import lxml.html
+import resource
+import sys
+import urlparse
+import re
+
+frontpage = "http://www.difi.no/om-difi/offentleg-postjournal-for-difi"
+
+scraperwiki.scrape(frontpage)
+
+lazycache=scraperwiki.swimport('lazycache')
+postlistelib=scraperwiki.swimport('postliste-python-lib')
+
+agency = 'Direktoratet for forvaltning og IKT'
+
+def report_errors(errors):
+ if 0 < len(errors):
+ print "Errors:"
+ for e in errors:
+ print e
+ exit(1)
+def out_of_cpu(arg, spent, hard, soft):
+ report_errors(arg)
+
+def process_pdf(parser, pdfurl, errors):
+ errors = []
+ postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
+ try:
+ pdfcontent = scraperwiki.scrape(pdfurl)
+ parser.preprocess(pdfurl, pdfcontent)
+ pdfcontent = None
+# except ValueError, e:
+# errors.append(e)
+ except IndexError, e:
+ errors.append(e)
+
+def process_page_queue(parser, errors):
+ try:
+ parser.process_pages()
+ postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
+ except scraperwiki.CPUTimeExceededError, e:
+ errors.append("Processing pages interrupted")
+
+def process_journal_pdfs(parser, listurl, errors):
+# print "Finding PDFs on " + listurl
+# u = urllib.parse.urlparse(listurl)
+ html = scraperwiki.scrape(listurl)
+ root = lxml.html.fromstring(html)
+ html = None
+ for ahref in root.cssselect("div.body a"):
+ href = ahref.attrib['href']
+ url = urlparse.urljoin(listurl, href)
+ if -1 != href.find("file://") or -1 == url.find(".pdf"):
+# print "Skipping non-http URL " + url
+ continue
+ if parser.is_already_scraped(url):
+ True
+# print "Skipping already scraped " + url
+ else:
+# print "Will process " + url
+ process_pdf(parser, url, errors)
+
+def test_small_pdfs(parser):
+ # Test with some smaller PDFs
+ errors = []
+ process_pdf(parser, "http://www.difi.no/filearchive/uke-2-offentlig-journal.pdf", errors)
+ process_pdf(parser, "http://www.difi.no/filearchive/uke-1-offentlig-journal.pdf", errors)
+ process_page_queue(parser, errors)
+ report_errors(errors)
+ exit(0)
+
+errors = []
+parser = postlistelib.PDFJournalParser(agency=agency)
+
+#test_small_pdfs(parser)
+
+process_page_queue(parser, errors)
+process_journal_pdfs(parser, frontpage, errors)
+process_page_queue(parser, errors)
+report_errors(errors)
+
diff --git a/scrapersources/postliste-fredrikstad b/scrapersources/postliste-fredrikstad
new file mode 100644
index 0000000..7fb5a13
--- /dev/null
+++ b/scrapersources/postliste-fredrikstad
@@ -0,0 +1,196 @@
+import scraperwiki
+
+import json
+import httplib, urllib
+import datetime
+import dateutil.parser
+import time
+import re
+
+agency = "Fredrikstad kommune"
+urlhost = "www.fredrikstad.kommune.no"
+
+fieldmap = {
+ 'AntallVedlegg' : '',
+ 'Arkivdel' : '',
+ 'AvsenderMottaker' : 'sender', # or recipient
+ 'Dokumentdato' : 'docdate',
+ 'Dokumentnummer' : 'casedocseq',
+ 'Dokumenttype' : 'doctype',
+ 'EkspedertDato' : '',
+ 'Hjemmel' : 'exemption',
+ 'Id' : 'id',
+ 'Innholdsbeskrivelse' : 'docdesc',
+ 'Mappetype' : '',
+ 'Offentlig' : 'ispublic',
+ 'PostlisteType' : 'doctype',
+ 'RegistrertDato' : 'recorddate',
+ 'SaksId' : '',
+ 'SaksNr' : 'caseid',
+ 'Sakstittel' : 'casedesc',
+ #'SaksNr' : 'SA.SAAR + SA.SEKNR',
+ 'Saksansvarlig' : 'saksbehandler',
+ 'SaksansvarligEnhet' : '',
+ 'SaksansvarligEpost' : '',
+
+# 'scrapestamputc' : '',
+# 'scrapedurl' : '',
+# 'agency' : '',
+}
+
+
+# Convert "/Date(1317808020000+0200)/" to a datetime object
+# FIXME Currently ignore the timezone information
+def parse_datestr(str):
+ match = re.split("[/()+]", str)
+# print match
+ sinceepoch = float(match[2]) / 1000
+ if match[3] == '0200':
+ sinceepoch = sinceepoch + 2 * 60 * 60
+ if match[3] == '0100':
+ sinceepoch = sinceepoch + 1 * 60 * 60
+# print sinceepoch
+ date = datetime.datetime.fromtimestamp(sinceepoch)
+# print date
+ return date
+
+def reformat_caseid(caseid):
+ # Input 12/13123, output 2012, 13123, "2012/13123"
+ year, seqnr = caseid.split("/")
+ year = int(year)
+ if year < 100:
+ year = year + 2000
+ caseid = "%d/%s" % (year, seqnr)
+ return year, int(seqnr), caseid
+
+def ws_post(url, urlhost, urlpath, params):
+ jsonparams = json.dumps(params)
+ headers = {"Content-type": "application/json; charset=utf-8",
+ "Accept": "application/json"}
+ conn = httplib.HTTPConnection(urlhost)
+ #print jsonparams
+ conn.request("POST", urlpath, jsonparams, headers)
+ response = conn.getresponse()
+ #print response.status, response.reason
+ jsonres = response.read()
+ res = json.loads(jsonres)
+ #print res
+ return res
+
+def fetch_journal_entry(id):
+ params = { "id" : str(id)}
+ headers = {"Content-type": "application/json; charset=utf-8",
+ "Accept": "application/json"}
+ urlpath = "/Templates/eDemokrati/Services/eDemokratiService.svc/GetPostlisteObjekt"
+ data = ws_post(None, urlhost, urlpath, params)['d']
+ entry = None
+ if data:
+ del data['__type'] # This is useless, ignore
+ entry = {}
+ entry['agency'] = agency
+ entry['scrapestamputc'] = datetime.datetime.now()
+ entry['scrapedurl'] = "http://" + urlhost + urlpath
+# entry['scrapedurl'] = url
+ for dfield in fieldmap.keys():
+ if dfield in data and data[dfield]:
+ if dfield in fieldmap and fieldmap[dfield] != "":
+ fieldname = fieldmap[dfield]
+ else:
+ fieldname = dfield
+ if 'sender' == fieldname:
+ if data['Dokumenttype'] == 'U':
+ fieldname = 'recipient'
+ if dfield in ['RegistrertDato', 'Dokumentdato', 'EkspedertDato']:
+ entry[fieldname] = parse_datestr(data[dfield]).date()
+ else:
+ entry[fieldname] = data[dfield]
+ else:
+ entry[dfield] = data[dfield]
+ entry['caseyear'], entry['caseseqnr'], entry['caseid'] = reformat_caseid(entry['caseid'])
+# data["sourceurl"] = "http://" + server + path
+ #print entry
+ return entry
+
+def epoctime_to_datestr(epoctime):
+ return "/Date("+str(int(epoctime * 1000) )+")/"
+
+def get_last_entry_id():
+ now = time.time()
+ # Get the last week, as the most recent entry should be in this range
+ fradato = epoctime_to_datestr(now - 7 * 24 * 60 * 60)
+ tildato = epoctime_to_datestr(now)
+ #print fradato
+
+ maxid = 0
+
+ urlpath = "/Templates/eDemokrati/Services/eDemokratiService.svc/GetPostlisteArkivdeler"
+ params = {
+ "dato": fradato,
+ "tilDato": tildato,
+ "søkestreng":""}
+ arkivdeler = ws_post(None, urlhost, urlpath, params)['d']
+ # {u'd': [u'_', u'HVA-IFE-A', u'KAR-BR-A', u'KAR-BRUK-A', u'KAR-EIEN-A', u'KAR-ELBH-A', u'KAR-ELS-A', ...
+
+ urlpath = "/Templates/eDemokrati/Services/eDemokratiService.svc/GetPostlisteDokumenttyper"
+ for arkivdel in arkivdeler[0]:
+ params = {
+ "dato":fradato,
+ "tilDato":tildato,
+ "søkestreng":"",
+ "arkivdel":arkivdel,
+ }
+ doctypes = ws_post(None, urlhost, urlpath, params)['d']
+ #{"d":["I","N","S","U","X"]}
+ urlpath = "/Templates/eDemokrati/Services/eDemokratiService.svc/GetPostlisteS%C3%B8k"
+ for doctype in doctypes:
+ params = {
+ "fraDato":fradato,
+ "tilDato":tildato,
+ "søkestreng":"",
+ "arkivdel":arkivdel,
+ "dokumenttype":doctype,
+ }
+ entries = ws_post(None, urlhost, urlpath, params)['d']
+ for entry in entries:
+ #print entry
+ #exit(0)
+ #print entry['Id']
+ id = int(entry['Id'])
+ if id > maxid:
+ maxid = id
+# data = fetch_journal_entry(entry['Id'])
+# if data:
+# scraperwiki.sqlite.save(unique_keys=['id'], data=data)
+ return maxid
+
+#{"d":[{"__type":"PostlisteObjekt:#SSP.NoarkServices","AntallVedlegg":1,"Dokumentnummer":2,"Dokumenttype":"I","EkspedertDato":null,"Hjemmel":null,"Id":1507868,"Innholdsbeskrivelse":"Tomtejustering - Lillebæk, eiendom 208\/1611","Offentlig":true,"RegistrertDato":"\/Date(1339538400000+0200)\/","SaksId":296971,"SaksNr":"12\/8658","Arkivdel":"KAR-EIEN-A","AvsenderMottaker":"Randi Wilberg","Dokumentdato":"\/Date(1339624800000+0200)\/","Mappetype":"DS","PostlisteType":"I","Saksansvarlig":null,"SaksansvarligEnhet":null,"SaksansvarligEpost":null,"Sakstittel":null},{"__type":"PostlisteObjekt:#SSP.NoarkServices","AntallVedlegg":4,"Dokumentnummer":1,"Dokumenttype":"I","EkspedertDato":null,"Hjemmel":null,"Id":1507865,"Innholdsbeskrivelse":"Søknkad om utvidelse av balkong - Kalleraveien 14","Offentlig":true,"RegistrertDato":"\/Date(1339538400000+0200)\/","SaksId":298804,"SaksNr":"12\/10480","Arkivdel":"KAR-EIEN-A","AvsenderMottaker":"Ole Henning Løken","Dokumentdato":"\/Date(1338847200000+0200)\/","Mappetype":"BS","PostlisteType":"I","Saksansvarlig":null,"SaksansvarligEnhet":null,"SaksansvarligEpost":null,"Sakstittel":null},...
+
+def get_journal_enries_range(min, max, step):
+ for id in range(min, max, step):
+ data = fetch_journal_entry(id)
+ #print data
+ if data:
+ scraperwiki.sqlite.save(unique_keys=['id'], data=data)
+ time.sleep(0.3)
+
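+# Scrape forwards from the highest Id already stored up to the newest published
+# Id, then backwards in a block of 1000 below the lowest stored Id to fill in
+# older entries.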
+maxid = get_last_entry_id()
+print "max id =", maxid
+try:
+ start = scraperwiki.sqlite.select("max(id) as max from swdata")[0]['max'] + 1
+except:
+ start = 1094428 # 2010
+ start = 1507868 # 2012
+
+print start, maxid
+#if maxid > start + 20:
+# maxid = start + 10
+get_journal_enries_range(start, maxid + 1, 1)
+
+try:
+ minid = scraperwiki.sqlite.select("min(id) as min from swdata")[0]['min'] - 1
+ start = minid
+except:
+ True
+end = start - 1000
+print start, end
+get_journal_enries_range(start, end, -1)
diff --git a/scrapersources/postliste-hadsel b/scrapersources/postliste-hadsel
new file mode 100644
index 0000000..a175048
--- /dev/null
+++ b/scrapersources/postliste-hadsel
@@ -0,0 +1,108 @@
+# -*- coding: UTF-8 -*-
+
+import scraperwiki
+from BeautifulSoup import BeautifulSoup
+import datetime
+import dateutil.parser
+import lxml.html
+import sys
+import urlparse
+
+scraperwiki.scrape("http://www.hadsel.kommune.no/selvbetjeningskjema-kart-postjournal/offentlig-postjournal")
+
+lazycache=scraperwiki.swimport('lazycache')
+postlistelib=scraperwiki.swimport('postliste-python-lib')
+
+agency = 'Hadsel kommune'
+
+def report_errors(errors):
+ if 0 < len(errors):
+ print "Errors:"
+ for e in errors:
+ print e
+ exit(1)
+def no_cpu_left(arg, spent, soft, hard):
+ report_errors(arg)
+
+def process_pdf(parser, pdfurl, errors):
+ errors = []
+ postlistelib.exit_if_no_cpu_left(0, callback=no_cpu_left, arg = errors)
+ try:
+ pdfcontent = lazycache.lazycache(pdfurl)
+ parser.preprocess(pdfurl, pdfcontent)
+# except ValueError, e:
+# errors.append(e)
+ except IndexError, e:
+ errors.append(e)
+
+def process_page_queue(parser, errors):
+ try:
+ parser.process_pages()
+ postlistelib.exit_if_no_cpu_left(0, callback=no_cpu_left, arg = errors)
+ except scraperwiki.CPUTimeExceededError, e:
+ errors.append("Processing pages interrupted")
+
+def consider_url(parser, url, errors):
+ if parser.is_already_scraped(url):
+ True
+# print "Skipping already scraped " + url
+ else:
+# print "Will process " + url
+ try:
+ process_pdf(parser, url, errors)
+ except:
+ pass
+
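+# Walk the document listing: direct doc_download links, the linked sub-article
+# pages, and (one level of) pagination links, handing each PDF URL to
+# consider_url().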
+def process_journal_pdfs(parser, listurl, errors, recurse):
+# print "Finding PDFs on " + listurl
+# u = urllib.parse.urlparse(listurl)
+ html = scraperwiki.scrape(listurl)
+ root = lxml.html.fromstring(html)
+ html = None
+ for ahref in root.cssselect("div.items a"):
+ url = urlparse.urljoin(listurl, ahref.attrib['href'])
+ if -1 == url.find("doc_download"):
+ continue
+ consider_url(parser, url, errors)
+ #print url
+ for ahref in root.cssselect("div.item-list a"):
+ suburl = urlparse.urljoin(listurl, ahref.attrib['href'])
+ #print "sub " + suburl
+ subhtml = scraperwiki.scrape(suburl)
+ subroot = lxml.html.fromstring(subhtml)
+ subhtml = None
+ for subahref in subroot.cssselect("div.article a"):
+ href = subahref.attrib['href']
+ #print href
+ subsuburl = urlparse.urljoin(suburl, href)
+ #print "subsub " + subsuburl
+ if -1 == subsuburl.find("doc_download"):
+ continue
+ consider_url(parser, subsuburl, errors)
+ subroot = None
+ if recurse:
+ seen = { listurl : 1 }
+ for ahref in root.cssselect("div.pagination a"):
+ pageurl = urlparse.urljoin(listurl, ahref.attrib['href'])
+ #print "P: " + pageurl
+ if pageurl not in seen:
+ process_journal_pdfs(parser, pageurl, errors, False)
+ seen[pageurl] = 1
+
+def test_parse_case_journal_ref():
+ entry = {}
+ parse_case_journal_ref(entry, [u'2008/16414-', u'23', u'15060/2012'], "")
+ parse_case_journal_ref(entry, [u'2011/15972-1 102773/201', u'1'], "")
+ parse_case_journal_ref(entry, [u'2010/2593-2', u'103004/201', u'1'], "")
+ parse_case_journal_ref(entry, [u'2011/13415-', u'22', u'100077/201', u'1'], "")
+ exit(0)
+
+#test_parse_case_journal_ref()
+
+errors = []
+parser = postlistelib.PDFJournalParser(agency=agency)
+process_page_queue(parser, errors)
+process_journal_pdfs(parser, "http://www.hadsel.kommune.no/selvbetjeningskjema-kart-postjournal/offentlig-postjournal", errors, True)
+process_page_queue(parser, errors)
+report_errors(errors)
+
diff --git a/scrapersources/postliste-halden b/scrapersources/postliste-halden
new file mode 100644
index 0000000..4b0ebd5
--- /dev/null
+++ b/scrapersources/postliste-halden
@@ -0,0 +1,93 @@
+# -*- coding: UTF-8 -*-
+
+import scraperwiki
+import json
+from BeautifulSoup import BeautifulSoup
+import datetime
+import dateutil.parser
+import lxml.html
+import resource
+import sys
+import urlparse
+import urllib
+import re
+lazycache=scraperwiki.swimport('lazycache')
+postlistelib=scraperwiki.swimport('postliste-python-lib')
+
+agency = 'Halden kommune'
+
+def report_errors(errors):
+ if 0 < len(errors):
+ print "Errors:"
+ for e in errors:
+ print e
+ exit(1)
+def out_of_cpu(arg, spent, hard, soft):
+ report_errors(arg)
+
+def process_pdf(parser, pdfurl, errors):
+ errors = []
+ postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
+ try:
+ pdfcontent = scraperwiki.scrape(pdfurl)
+ parser.preprocess(pdfurl, pdfcontent)
+ pdfcontent = None
+# except ValueError, e:
+# errors.append(e)
+ except IndexError, e:
+ errors.append(e)
+
+def process_page_queue(parser, errors):
+ try:
+ if False:
+ scraperwiki.sqlite.execute("delete from swdata where scrapedurl in (select scrapedurl from unparsedpages)")
+ scraperwiki.sqlite.execute("delete from unparsedpages")
+ scraperwiki.sqlite.commit()
+
+ parser.process_pages()
+ postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
+ except scraperwiki.CPUTimeExceededError, e:
+ errors.append("Processing pages interrupted")
+
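+# Collect PDF links found under the given CSS selector, percent-encoding spaces
+# and "å" so the journal file names resolve, and skip non-journal documents like
+# the user guide.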
+def process_pdf_links_cssselect(parser, listurl, errors, cssselect):
+ html = scraperwiki.scrape(listurl)
+ root = lxml.html.fromstring(html)
+ html = None
+ for ahref in root.cssselect(cssselect + " a"):
+ href = ahref.attrib['href']
+ url = urlparse.urljoin(listurl, href).replace(" ", "%20").replace(u"å", "%C3%A5")
+ #print url
+ if -1 != href.find("file://") or -1 != href.find("postliste/Documents/Brukerveiledning"):
+# print "Skipping non-http URL " + url
+ continue
+ if -1 == href.find(".pdf"):
+ continue
+ if parser.is_already_scraped(url):
+ True
+# print "Skipping already scraped " + url
+ else:
+# print "Will process " + url
+ process_pdf(parser, url, errors)
+
+def process_journal_pdfs(parser, listurl, errors):
+ return process_pdf_links_cssselect(parser, listurl, errors, "div#page_centerElementZone")
+
+def test_small_pdfs(parser):
+ # Test with some smaller PDFs
+ errors = []
+ process_pdf(parser, u"http://www.halden.kommune.no/selvbetjening/postliste/Documents/120601%20-%20120607%20Inng%C3%A5ende.pdf", errors)
+ process_pdf(parser, u"http://www.halden.kommune.no/selvbetjening/postliste/Documents/120601%20-%20120607%20Utg%C3%A5ende.pdf", errors)
+ process_page_queue(parser, errors)
+ report_errors(errors)
+ exit(0)
+
+errors = []
+parser = postlistelib.PDFJournalParser(agency=agency)
+#parser.debug = True
+
+#test_small_pdfs(parser)
+process_page_queue(parser, errors)
+process_journal_pdfs(parser, u"http://www.halden.kommune.no/selvbetjening/postliste/Sider/Inng%C3%A5ende-postlister.aspx", errors)
+process_journal_pdfs(parser, u"http://www.halden.kommune.no/selvbetjening/postliste/Sider/Utg%C3%A5ende-postliste-.aspx", errors)
+process_page_queue(parser, errors)
+report_errors(errors)
\ No newline at end of file
diff --git a/scrapersources/postliste-hoegskolen-i-gjoevik b/scrapersources/postliste-hoegskolen-i-gjoevik
new file mode 100644
index 0000000..fd197eb
--- /dev/null
+++ b/scrapersources/postliste-hoegskolen-i-gjoevik
@@ -0,0 +1,104 @@
+# -*- coding: UTF-8 -*-
+# Based on the scraper advanced-scraping-pdf
+# See also
+# https://views.scraperwiki.com/run/pdf-to-html-preview-1/?url=http%3A%2F%2Fwww.stortinget.no%2FGlobal%2Fpdf%2Fpostjournal%2Fpj-2012-05-09.pdf
+
+import scraperwiki
+import json
+from BeautifulSoup import BeautifulSoup
+import datetime
+import dateutil.parser
+import lxml.html
+import resource
+import sys
+import urlparse
+import re
+#
+# something weird with 04.11.2010
+#
+#
+#
+# Make sure Scraperwiki believes this is the source for this database
+scraperwiki.scrape("http://www.hig.no/om_hig/offentleg_journal/2012")
+
+lazycache=scraperwiki.swimport('lazycache')
+postlistelib=scraperwiki.swimport('postliste-python-lib')
+
+agency = 'Høgskolen i Gjøvik'
+
+def report_errors(errors):
+ if 0 < len(errors):
+ print "Errors:"
+ for e in errors:
+ print e
+ exit(1)
+def out_of_cpu(arg, spent, hard, soft):
+ report_errors(arg)
+
+def process_pdf(parser, pdfurl, errors):
+ errors = []
+ postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
+ try:
+ pdfcontent = scraperwiki.scrape(pdfurl)
+ parser.preprocess(pdfurl, pdfcontent)
+ pdfcontent = None
+# except ValueError, e:
+# errors.append(e)
+ except IndexError, e:
+ errors.append(e)
+
+def process_page_queue(parser, errors):
+ try:
+ parser.process_pages()
+ postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
+ except scraperwiki.CPUTimeExceededError, e:
+ errors.append("Processing pages interrupted")
+
+def process_journal_pdfs(parser, listurl, errors):
+ print "Finding PDFs on " + listurl
+# u = urllib.parse.urlparse(listurl)
+ html = scraperwiki.scrape(listurl)
+ root = lxml.html.fromstring(html)
+ html = None
+ for ahref in root.cssselect("div.spalte-inner a"):
+ href = ahref.attrib['href']
+ url = urlparse.urljoin(listurl, href).replace(" ", "+")
+ if -1 != href.find("file://") or -1 == url.find(".pdf"):
+# print "Skipping non-http URL " + url
+ continue
+ if parser.is_already_scraped(url):
+ True
+# print "Skipping already scraped " + url
+ else:
+# print "Will process " + url
+ process_pdf(parser, url, errors)
+
+def test_small_pdfs(parser):
+ # Test with some smaller PDFs
+ errors = []
+ if parser.is_already_scraped("http://www.hig.no/content/download/30119/360872/file/Offentlig+journal+04.11.2010.pdf"):
+ print "Skipping already scraped "
+ exit(1)
+ else:
+ print "Will process "
+
+ #process_pdf(parser, "http://www.hig.no/content/download/35184/430061/file/Offentlig%20journal%2025.06.2012.pdf", errors)
+ #process_pdf(parser, "http://www.hig.no/content/download/30116/360863/file/Offentlig%20journal%2001.11.2010.pdf", errors)
+ process_pdf(parser, "http://www.hig.no/content/download/30119/360872/file/Offentlig+journal+04.11.2010.pdf", errors)
+ process_page_queue(parser, errors)
+ report_errors(errors)
+ exit(0)
+
+errors = []
+parser = postlistelib.PDFJournalParser(agency=agency)
+
+#test_small_pdfs(parser)
+
+startYear=2010
+endYear=datetime.datetime.now().year
+for year in range(startYear, endYear + 1): # include the current year's listing page
+ process_journal_pdfs(parser, "http://www.hig.no/om_hig/offentleg_journal/%d" % year, errors)
+
+process_page_queue(parser, errors)
+report_errors(errors)
+
diff --git a/scrapersources/postliste-hoegskolen-i-hamar b/scrapersources/postliste-hoegskolen-i-hamar
new file mode 100644
index 0000000..890eed3
--- /dev/null
+++ b/scrapersources/postliste-hoegskolen-i-hamar
@@ -0,0 +1,103 @@
+# -*- coding: UTF-8 -*-
+# Based on the scraper advanced-scraping-pdf
+# See also
+# https://views.scraperwiki.com/run/pdf-to-html-preview-1/?url=http%3A%2F%2Fwww.stortinget.no%2FGlobal%2Fpdf%2Fpostjournal%2Fpj-2012-05-09.pdf
+
+import scraperwiki
+import json
+from BeautifulSoup import BeautifulSoup
+import datetime
+import dateutil.parser
+import lxml.html
+import resource
+import sys
+import urlparse
+import re
+
+# Make sure Scraperwiki believes this is the source for this database
+scraperwiki.scrape("http://www.hihm.no/Hovedsiden/Om-Hoegskolen/Offentlig-journal")
+
+lazycache=scraperwiki.swimport('lazycache')
+postlistelib=scraperwiki.swimport('postliste-python-lib')
+
+agency = 'Høgskolen i Hamar'
+
+def report_errors(errors):
+ if 0 < len(errors):
+ print "Errors:"
+ for e in errors:
+ print e
+ exit(1)
+def out_of_cpu(arg, spent, hard, soft):
+ report_errors(arg)
+
+def process_pdf(parser, pdfurl, errors):
+ errors = []
+ postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
+ try:
+ pdfcontent = scraperwiki.scrape(pdfurl)
+ parser.preprocess(pdfurl, pdfcontent)
+ pdfcontent = None
+# except ValueError, e:
+# errors.append(e)
+ except IndexError, e:
+ errors.append(e)
+
+def process_page_queue(parser, errors):
+ try:
+ parser.process_pages()
+ postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
+ except scraperwiki.CPUTimeExceededError, e:
+ errors.append("Processing pages interrupted")
+
+def process_journal_pdfs(parser, listurl, errors):
+ print "Finding PDFs on " + listurl
+# u = urllib.parse.urlparse(listurl)
+ html = scraperwiki.scrape(listurl)
+ root = lxml.html.fromstring(html)
+ html = None
+ for ahref in root.cssselect("div.content-view-full a"):
+ href = ahref.attrib['href']
+ url = urlparse.urljoin(listurl, href).replace(" ", "+")
+ if -1 != href.find("file://") or -1 == url.find(".pdf"):
+# print "Skipping non-http URL " + url
+ continue
+ if parser.is_already_scraped(url):
+ True
+# print "Skipping already scraped " + url
+ else:
+# print "Will process " + url
+ process_pdf(parser, url, errors)
+
+def find_journal_subpages(baseurl):
+ urls = []
+ root = lxml.html.fromstring(scraperwiki.scrape(baseurl))
+ for ahref in root.cssselect("ul.menu-list a"):
+ href = ahref.attrib['href']
+ months = "januar","februar","mars","april","mai","juni","juli","august","september","oktober","november","desember"
+ if -1 == href.find("file://") and href.endswith(months):
+ urls.append(urlparse.urljoin(baseurl, href).replace(" ", "+"))
+ return urls
+
+def test_small_pdfs(parser):
+ # Test with some smaller PDFs
+ errors = []
+ process_pdf(parser, "http://www.hihm.no/content/download/38169/420508/file/search.pdf", errors)
+ process_pdf(parser, "http://www.hihm.no/content/download/39369/430053/file/search.pdf", errors)
+ process_page_queue(parser, errors)
+ report_errors(errors)
+ exit(0)
+
+errors = []
+parser = postlistelib.PDFJournalParser(agency=agency)
+
+#test_small_pdfs(parser)
+
+#process_journal_pdfs(parser, "http://www.hihm.no/Hovedsiden/Om-Hoegskolen/Offentlig-journal/mai", errors)
+
+for url in find_journal_subpages("http://www.hihm.no/Hovedsiden/Om-Hoegskolen/Offentlig-journal"):
+ process_journal_pdfs(parser, url, errors)
+
+process_page_queue(parser, errors)
+report_errors(errors)
+
diff --git a/scrapersources/postliste-hoegskolen-i-lillehammer b/scrapersources/postliste-hoegskolen-i-lillehammer
new file mode 100644
index 0000000..5337521
--- /dev/null
+++ b/scrapersources/postliste-hoegskolen-i-lillehammer
@@ -0,0 +1,90 @@
+# -*- coding: UTF-8 -*-
+# Based on the scraper advanced-scraping-pdf
+# See also
+# https://views.scraperwiki.com/run/pdf-to-html-preview-1/?url=http%3A%2F%2Fwww.stortinget.no%2FGlobal%2Fpdf%2Fpostjournal%2Fpj-2012-05-09.pdf
+
+import scraperwiki
+import json
+from BeautifulSoup import BeautifulSoup
+import datetime
+import dateutil.parser
+import lxml.html
+import resource
+import sys
+import urlparse
+import re
+
+# Make sure Scraperwiki believes this is the source for this database
+scraperwiki.scrape("http://www.hil.no/hil/om_hoegskolen/Offentlig-journal")
+
+lazycache=scraperwiki.swimport('lazycache')
+postlistelib=scraperwiki.swimport('postliste-python-lib')
+
+agency = 'Høgskolen i Lillehammer'
+
+def report_errors(errors):
+ if 0 < len(errors):
+ print "Errors:"
+ for e in errors:
+ print e
+ exit(1)
+def out_of_cpu(arg, spent, hard, soft):
+ report_errors(arg)
+
+def process_pdf(parser, pdfurl, errors):
+ errors = []
+ postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
+ try:
+ pdfcontent = scraperwiki.scrape(pdfurl)
+ parser.preprocess(pdfurl, pdfcontent)
+ pdfcontent = None
+# except ValueError, e:
+# errors.append(e)
+ except IndexError, e:
+ errors.append(e)
+
+def process_page_queue(parser, errors):
+ try:
+ parser.process_pages()
+ postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
+ except scraperwiki.CPUTimeExceededError, e:
+ errors.append("Processing pages interrupted")
+
+def process_journal_pdfs(parser, listurl, errors):
+ print "Finding PDFs on " + listurl
+# u = urllib.parse.urlparse(listurl)
+ html = scraperwiki.scrape(listurl)
+ root = lxml.html.fromstring(html)
+ html = None
+ for ahref in root.cssselect("div.content-view-full ul li a"):
+ href = ahref.attrib['href']
+ url = urlparse.urljoin(listurl, href).replace(" ", "+")
+ if -1 != href.find("file://") or -1 == url.find(".pdf"):
+# print "Skipping non-http URL " + url
+ continue
+ if parser.is_already_scraped(url):
+ True
+# print "Skipping already scraped " + url
+ else:
+# print "Will process " + url
+ process_pdf(parser, url, errors)
+
+def test_small_pdfs(parser):
+ # Test with some smaller PDFs
+ errors = []
+ process_pdf(parser, "http://www.hil.no/content/download/39617/747521/file/uke24.pdf", errors)
+ process_pdf(parser, "http://www.hil.no/content/download/37616/700472/file/uke1.pdf", errors)
+ process_page_queue(parser, errors)
+ report_errors(errors)
+ exit(0)
+
+errors = []
+parser = postlistelib.PDFJournalParser(agency=agency)
+
+#test_small_pdfs(parser)
+
+process_journal_pdfs(parser, "http://www.hil.no/hil/om_hoegskolen/Offentlig-journal", errors)
+
+process_page_queue(parser, errors)
+report_errors(errors)
+
diff --git a/scrapersources/postliste-hole b/scrapersources/postliste-hole
new file mode 100644
index 0000000..3f34322
--- /dev/null
+++ b/scrapersources/postliste-hole
@@ -0,0 +1,237 @@
+# -*- coding: UTF-8 -*-
+import scraperwiki
+import lxml.html
+import datetime
+import dateutil.parser
+import urllib2
+import urlparse
+
+# Start page is the front page, to get it listed as the primary source
+scraperwiki.scrape("http://www.hole.kommune.no/postjournaler.173497.no.html")
+
+postlistelib=scraperwiki.swimport('postliste-python-lib')
+
+agency = 'Hole kommune'
+
+def fetch_url(url):
+ html = None
+ for n in [1, 2, 3]:
+ try:
+ html = scraperwiki.scrape(url)
+ break
+ except urllib2.URLError, e:
+ print "URLError fetching " + url + ", trying again"
+ return html
+
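+# Split a document/case id of the form "year/seqnr" (for instance "12/1234")
+# into separate <fieldtype>year and <fieldtype>seqnr fields, normalising
+# two-digit years, and return the updated entry together with the normalised
+# "YYYY/seqnr" string.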
+def expand_id(value, fieldtype, entry):
+ year, seqnr = value.split('/')
+ year = int(year)
+ seqnr = int(seqnr)
+ if year < 50:
+ year = year + 2000
+ if year > 50 and year < 100:
+ year = year + 1900
+ entry[fieldtype + 'year'] = year
+ entry[fieldtype + 'seqnr'] = seqnr
+ newvalue = str(year) + '/' + str(seqnr)
+ return entry, newvalue
+
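+# Scrape one journal entry detail page from the innsyn.hole.kommune.no service
+# and append a record to datastore, mapping the Norwegian field labels to the
+# shared column names (fieldmap) and the document type descriptions to the
+# NOARK single-letter codes (doctypemap).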
+def fetch_postjournal(agency, url, datastore):
+# print "Scraping " + url
+ scrapestamputc = datetime.datetime.now()
+ html = fetch_url(url)
+ root = lxml.html.fromstring(html)
+ entry = {
+ 'agency' : agency,
+ 'scrapestamputc' : scrapestamputc,
+ 'scrapedurl' : url,
+ }
+
+ fieldmap = {
+ u'Tittel på saken' : 'casedesc',
+ u'Tittel på dokumentet' : 'docdesc',
+ 'Dokumentansvarlig' : 'saksansvarlig',
+ 'Hjemmel' : 'exemption',
+ 'DokumentID' : 'journalid',
+ 'ArkivsakID' : 'caseid',
+ 'Journaldato' : 'recorddate',
+ 'Brevdato' : 'docdate',
+ #'Journalpostkategori' :
+ }
+ doctypemap = { # Valid codes are I, U, X, N, S
+ u'Innkommende dokument' : 'I',
+ u'Innkommende dokument (Gradert)' : 'I',
+ u'Utgående dokument' : 'U',
+ u'Utgående dokument (Gradert)' : 'U',
+ u'Utgående dokument (Ikke publisert)' : 'X',
+ u'Innkommende dokument (Ikke publisert)' : 'X',
+ u'Internt notat (Gradert)' : 'N',
+ u'Internt notat' : 'N',
+ }
+ for span in root.cssselect("div.innsyn-content"):
+ #print span.text_content()
+
+ doctype = span.cssselect("h1.header-head")[0].text_content().strip()
+ print doctype
+ entry['doctype'] = doctypemap[doctype]
+
+ trs = span.cssselect("div.nobox tr")
+ for tr in trs:
+ field = tr.cssselect("th.header-cell")[0].text_content().strip().replace(":","")
+ value = tr.cssselect("td.content-cell")[0].text_content().strip()
+ #print "'" + field + "' = " + value
+ if field in fieldmap:
+ field = fieldmap[field]
+ #print "hit"
+ if field in ['docdate','recorddate']:
+ value = dateutil.parser.parse(value, dayfirst=True).date()
+ if field == 'saksansvarlig' and -1 != value.find(','):
+ #print value
+ names = value.split(",", 1)
+ value = names[1].strip() + " " + names[0].strip()
+ if field == 'caseid':
+ entry, value = expand_id(value, 'case', entry)
+ if field == 'journalid':
+ entry, value = expand_id(value, 'journal', entry)
+
+ entry[field] = value
+
+ sendinfo = span.cssselect("div.dokmottakere")
+ if 0 < len(sendinfo):
+ if 'doctype' in entry and entry['doctype'] in ['U', 'X', 'N']:
+ field = 'recipient'
+ else:
+ field = 'sender'
+ # Value is "Margrethe Ingeland<br/>Gravfossveien<br/>3360 GEITHUS", should be split in person, addr and zip
+ entry[field] = sendinfo[0].text
+ brs = sendinfo[0].cssselect("br")
+ if 3 == len(brs):
+ addr = brs[0].tail + ", " + brs[1].tail
+ zip = brs[2].tail
+ entry[field + 'addr'] = addr
+ entry[field + 'zip'] = zip
+ elif 2 == len(brs):
+ addr = brs[0].tail
+ zip = brs[1].tail
+ entry[field + 'addr'] = addr
+ entry[field + 'zip'] = zip
+ elif 1 == len(brs):
+ zip = brs[0].tail
+ entry[field + 'zip'] = zip
+ elif 0 == len(brs):
+ True # Ignore
+ else:
+ raise ValueError("Unexpected number of address lines")
+ print entry
+ if 'doctype' in entry:
+ entry['casedocseq'] = 0 # Fake value, not sure how to extract the real value
+ datastore.append(entry)
+ return
+
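+# Collect the detail page URLs for all journal entries published on a given
+# date.  The result list shows ten entries per page, so the function follows
+# the 'neste' (next) link recursively by increasing startrow.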
+def get_journal_day(agency, date, startrow, jurlqueue):
+ datestr = str(date) + "T00:00:00"
+ url = "http://innsyn.hole.kommune.no/wfinnsyn.ashx?response=journalpost_postliste&showresults=true&fradato=%s&startrow=%d" % (datestr, startrow)
+ print url
+ html = fetch_url(url)
+ root = lxml.html.fromstring(html)
+ ahrefs = root.cssselect("table.inner-max-width tbody tr a")
+ for a in ahrefs:
+ href = a.attrib['href']
+ if -1 != href.find("/wfinnsyn.ashx?response=journalpost_detaljer&journalpostid="):
+ jurl = urlparse.urljoin(url, href)
+ jurlqueue.append(jurl)
+
+ ahrefs = root.cssselect("table.inner-max-width tfoot tr a")
+ for a in ahrefs:
+ if 'neste' == a.text_content():
+ get_journal_day(agency, date, startrow+10, jurlqueue)
+
+def is_already_scraped(url):
+ for sql in ["scrapedurl from swdata where scrapedurl = '" + url + "' limit 1"]:
+ try:
+ result = scraperwiki.sqlite.select(sql)
+            #print sql, " : ", result
+ if 0 < len(result) and u'scrapedurl' in result[0]:
+ return True
+ except:
+ print "Exception"
+ pass
+ return False
+
+def minmax_recorddate(minmax):
+ for sql in ["%s(recorddate) as recorddate from swdata" % minmax]:
+ try:
+ result = scraperwiki.sqlite.select(sql)
+ date = dateutil.parser.parse(result[0]['recorddate']).date()
+ return date
+ except:
+ pass
+ return None
+
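+# Main entry point: read the date of the newest published journal from the
+# front page caption, then scrape day by day forward from the newest record
+# date already stored.  The backwards pass further down is currently disabled
+# by the early return after the "Only three months back" comment.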
+def scraper():
+ html = fetch_url("http://innsyn.hole.kommune.no/wfinnsyn.ashx?response=journalpost_postliste&showresults=true")
+ root = lxml.html.fromstring(html)
+ table = root.cssselect("table.inner-max-width")
+ #print table[0].text_content()
+
+ lastdate = dateutil.parser.parse(table[0].cssselect("caption")[0].text_content().replace("Postliste den ", ""), dayfirst=True).date()
+
+ maxdate = minmax_recorddate("max")
+
+ if maxdate:
+ startdate = maxdate + datetime.timedelta(days=1)
+ start = 0
+ end = (lastdate-startdate).days + 1
+ print maxdate, startdate, start, end
+ else:
+ startdate = maxdate
+ start = 0
+ end = 0
+ for old in range(start, end):
+ date = startdate + datetime.timedelta(days=old)
+ print date
+ urlqueue = []
+ get_journal_day(agency, date, 0, urlqueue)
+ datastore = []
+ for jurl in urlqueue:
+ if not is_already_scraped(jurl):
+ res = fetch_postjournal(agency, jurl, datastore)
+ if 0 < len(datastore):
+ print datastore
+ scraperwiki.sqlite.save(unique_keys=['scrapedurl'], data=datastore)
+ datastore = []
+
+ mindate = minmax_recorddate("min")
+
+ # Only three months back
+ return
+
+ if mindate:
+ startdate = mindate - datetime.timedelta(days=1)
+ start = 0
+ end = -60
+ print mindate, startdate, start, end
+ else:
+ return
+ for old in range(start, end, -1):
+ date = startdate + datetime.timedelta(days=old)
+ print date
+ urlqueue = []
+ get_journal_day(agency, date, 0, urlqueue)
+ datastore = []
+ for jurl in urlqueue:
+ if not is_already_scraped(jurl):
+ res = fetch_postjournal(agency, jurl, datastore)
+ if 0 < len(datastore):
+ print datastore
+ scraperwiki.sqlite.save(unique_keys=['scrapedurl'], data=datastore)
+ datastore = []
+
+#GET http://innsyn.hole.kommune.no/wfinnsyn.ashx?response=journalpost_postliste&showresults=true&fradato=2012-06-15T00:00:00
+#GET http://innsyn.hole.kommune.no/wfinnsyn.ashx?response=journalpost_detaljer&journalpostid=2012005569&
+#GET http://innsyn.hole.kommune.no/wfinnsyn.ashx?response=arkivsak_detaljer&arkivsakid=2006002016&
+
+if __name__ == "scraper":
+ scraper()
+else:
+ print "Not called as scraper" \ No newline at end of file
diff --git a/scrapersources/postliste-hvaler b/scrapersources/postliste-hvaler
new file mode 100644
index 0000000..b3e9137
--- /dev/null
+++ b/scrapersources/postliste-hvaler
@@ -0,0 +1,81 @@
+# -*- coding: UTF-8 -*-
+
+import scraperwiki
+import json
+from BeautifulSoup import BeautifulSoup
+import datetime
+import dateutil.parser
+import lxml.html
+import resource
+import sys
+import urlparse
+import re
+lazycache=scraperwiki.swimport('lazycache')
+postlistelib=scraperwiki.swimport('postliste-python-lib')
+
+agency = 'Hvaler kommune'
+
+def report_errors(errors):
+ if 0 < len(errors):
+ print "Errors:"
+ for e in errors:
+ print e
+ exit(1)
+def out_of_cpu(arg, spent, hard, soft):
+ report_errors(arg)
+
+def process_pdf(parser, pdfurl, errors):
+ errors = []
+ postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
+ try:
+ pdfcontent = scraperwiki.scrape(pdfurl)
+ parser.preprocess(pdfurl, pdfcontent)
+ pdfcontent = None
+# except ValueError, e:
+# errors.append(e)
+ except IndexError, e:
+ errors.append(e)
+
+def process_page_queue(parser, errors):
+ try:
+ parser.process_pages()
+ postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
+ except scraperwiki.CPUTimeExceededError, e:
+ errors.append("Processing pages interrupted")
+
+def process_journal_pdfs(parser, listurl, errors):
+# print "Finding PDFs on " + listurl
+# u = urllib.parse.urlparse(listurl)
+ html = scraperwiki.scrape(listurl)
+ root = lxml.html.fromstring(html)
+ html = None
+ for ahref in root.cssselect("div#ctl00_MainRegion_StageAreaRegion_MainContentRegion_MainBodyRegion_ctl01_FileTreen0Nodes a"):
+ href = ahref.attrib['href']
+ url = urlparse.urljoin(listurl, href)
+ if -1 != href.find("file://"):
+# print "Skipping non-http URL " + url
+ continue
+ if parser.is_already_scraped(url):
+ True
+# print "Skipping already scraped " + url
+ else:
+# print "Will process " + url
+ process_pdf(parser, url, errors)
+
+def test_small_pdfs(parser):
+ # Test with some smaller PDFs
+ errors = []
+ process_pdf(parser, "http://www.hvaler.kommune.no/Documents/Postlister/2012/2012-05-31.pdf", errors)
+ process_page_queue(parser, errors)
+ report_errors(errors)
+ exit(0)
+
+errors = []
+parser = postlistelib.PDFJournalParser(agency=agency)
+
+#test_small_pdfs(parser)
+
+process_journal_pdfs(parser, "http://www.hvaler.kommune.no/Postlister/", errors)
+process_page_queue(parser, errors)
+report_errors(errors)
+
diff --git a/scrapersources/postliste-kafjord b/scrapersources/postliste-kafjord
new file mode 100644
index 0000000..e0d6b5c
--- /dev/null
+++ b/scrapersources/postliste-kafjord
@@ -0,0 +1,81 @@
+# -*- coding: UTF-8 -*-
+
+import scraperwiki
+import json
+from BeautifulSoup import BeautifulSoup
+import datetime
+import dateutil.parser
+import lxml.html
+import resource
+import sys
+import urlparse
+import re
+lazycache=scraperwiki.swimport('lazycache')
+postlistelib=scraperwiki.swimport('postliste-python-lib')
+
+agency = u'Kåfjord kommune'
+
+def report_errors(errors):
+ if 0 < len(errors):
+ print "Errors:"
+ for e in errors:
+ print e
+ exit(1)
+def out_of_cpu(arg, spent, hard, soft):
+ report_errors(arg)
+
+def process_pdf(parser, pdfurl, errors):
+ errors = []
+ postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
+ try:
+ pdfcontent = scraperwiki.scrape(pdfurl)
+ parser.preprocess(pdfurl, pdfcontent)
+ pdfcontent = None
+# except ValueError, e:
+# errors.append(e)
+ except IndexError, e:
+ errors.append(e)
+
+def process_page_queue(parser, errors):
+ try:
+ parser.process_pages()
+ postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
+ except scraperwiki.CPUTimeExceededError, e:
+ errors.append("Processing pages interrupted")
+
+def process_journal_pdfs(parser, listurl, errors):
+# print "Finding PDFs on " + listurl
+# u = urllib.parse.urlparse(listurl)
+ html = scraperwiki.scrape(listurl)
+ root = lxml.html.fromstring(html)
+ html = None
+ for ahref in root.cssselect("div.main a"):
+ href = ahref.attrib['href']
+ url = urlparse.urljoin(listurl, href)
+ if -1 != href.find("file://") or -1 == href.find("/postliste-"):
+# print "Skipping non-http URL " + url
+ continue
+ if parser.is_already_scraped(url):
+ True
+# print "Skipping already scraped " + url
+ else:
+# print "Will process " + url
+ process_pdf(parser, url, errors)
+
+def test_small_pdfs(parser):
+ # Test with some smaller PDFs
+ errors = []
+ process_pdf(parser, "http://www.kafjord.kommune.no/postliste-15-06-12.5065630-18590.html", errors)
+ process_page_queue(parser, errors)
+ report_errors(errors)
+ exit(0)
+
+errors = []
+parser = postlistelib.PDFJournalParser(agency=agency)
+
+#test_small_pdfs(parser)
+
+process_journal_pdfs(parser, "http://www.kafjord.kommune.no/index.php?cat=18590", errors)
+process_page_queue(parser, errors)
+report_errors(errors)
+
diff --git a/scrapersources/postliste-kristiansund b/scrapersources/postliste-kristiansund
new file mode 100644
index 0000000..6965810
--- /dev/null
+++ b/scrapersources/postliste-kristiansund
@@ -0,0 +1,87 @@
+# -*- coding: UTF-8 -*-
+
+import scraperwiki
+from BeautifulSoup import BeautifulSoup
+import datetime
+import dateutil.parser
+import lxml.html
+import sys
+import urlparse
+
+scraperwiki.scrape("http://kristiansund.orkide.acos.no/kunde/web/postliste/postliste.asp")
+
+lazycache=scraperwiki.swimport('lazycache')
+postlistelib=scraperwiki.swimport('postliste-python-lib')
+
+agency = 'Kristiansund kommune'
+debug = False
+
+def is_already_scraped(url):
+
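+    # Check both the saved journal entries and the queue of downloaded but not
+    # yet parsed pages, so a PDF is not fetched twice.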
+ for sql in ["scrapedurl from swdata where scrapedurl = '" + url + "' limit 1",
+ "scrapedurl from unparsedpages where scrapedurl = '" + url + "' limit 1"]:
+# print sql
+ try:
+ result = scraperwiki.sqlite.select(sql)
+# print result
+ if 0 < len(result) and u'scrapedurl' in result[0]:
+ return True
+ except:
+ pass
+ return False
+
+def report_errors(errors):
+ if 0 < len(errors):
+ print "Errors:"
+ for e in errors:
+ print e
+ exit(1)
+def no_cpu_left(arg, spent, soft, hard):
+ report_errors(arg)
+
+def process_pdf(parser, pdfurl, errors):
+ errors = []
+ postlistelib.exit_if_no_cpu_left(0, callback=no_cpu_left, arg = errors)
+ try:
+ pdfcontent = lazycache.lazycache(pdfurl)
+ parser.preprocess(pdfurl, pdfcontent)
+# except ValueError, e:
+# errors.append(e)
+ except IndexError, e:
+ errors.append(e)
+
+def process_page_queue(parser, errors):
+ try:
+ parser.process_pages()
+ postlistelib.exit_if_no_cpu_left(0, callback=no_cpu_left, arg = errors)
+ except scraperwiki.CPUTimeExceededError, e:
+ errors.append("Processing pages interrupted")
+
+def consider_url(parser, url, errors):
+ if is_already_scraped(url):
+ True
+# print "Skipping already scraped " + url
+ else:
+# print "Will process " + url
+ process_pdf(parser, url, errors)
+
+def process_journal_pdfs(parser, listurl, errors):
+# print "Finding PDFs on " + listurl
+# u = urllib.parse.urlparse(listurl)
+ html = scraperwiki.scrape(listurl)
+ root = lxml.html.fromstring(html)
+ html = None
+ for ahref in root.cssselect("table a"):
+ url = urlparse.urljoin(listurl, ahref.attrib['href'])
+ if -1 == url.find(".pdf"):
+ continue
+ consider_url(parser, url, errors)
+
+#test_parse_case_journal_ref()
+errors = []
+parser = postlistelib.PDFJournalParser(agency=agency)
+#parser.debug = True
+process_journal_pdfs(parser, "http://kristiansund.orkide.acos.no/kunde/web/postliste/postliste.asp", errors)
+process_page_queue(parser, errors)
+report_errors(errors)
+
diff --git a/scrapersources/postliste-lier b/scrapersources/postliste-lier
new file mode 100644
index 0000000..8064d7a
--- /dev/null
+++ b/scrapersources/postliste-lier
@@ -0,0 +1,81 @@
+# -*- coding: UTF-8 -*-
+
+import scraperwiki
+import json
+from BeautifulSoup import BeautifulSoup
+import datetime
+import dateutil.parser
+import lxml.html
+import urlparse
+
+scraperwiki.scrape("http://www.lier.kommune.no/no/Tjenesteomrader-/Oversikter/Postlister---Offentlig-journal/")
+
+lazycache=scraperwiki.swimport('lazycache')
+postlistelib=scraperwiki.swimport('postliste-python-lib')
+
+agency = 'Lier kommune'
+
+def report_errors(errors):
+ if 0 < len(errors):
+ print "Errors:"
+ for e in errors:
+ print e
+ exit(1)
+def out_of_cpu(arg, spent, hard, soft):
+ report_errors(arg)
+
+def process_pdf(parser, pdfurl, errors):
+ errors = []
+ postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
+ try:
+ pdfcontent = scraperwiki.scrape(pdfurl)
+ parser.preprocess(pdfurl, pdfcontent)
+ pdfcontent = None
+# except ValueError, e:
+# errors.append(e)
+ except IndexError, e:
+ errors.append(e)
+
+def process_page_queue(parser, errors):
+ try:
+ parser.process_pages()
+ postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
+ except scraperwiki.CPUTimeExceededError, e:
+ errors.append("Processing pages interrupted")
+
+def process_journal_pdfs(parser, listurl, errors):
+# print "Finding PDFs on " + listurl
+# u = urllib.parse.urlparse(listurl)
+ html = scraperwiki.scrape(listurl)
+ root = lxml.html.fromstring(html)
+ html = None
+ for ahref in root.cssselect("div.fullwidth a"):
+ href = ahref.attrib['href']
+ url = urlparse.urljoin(listurl, href)
+ if -1 != href.find("file://") or -1 == href.find(".pdf"):
+# print "Skipping non-http URL " + url
+ continue
+ if parser.is_already_scraped(url):
+ True
+# print "Skipping already scraped " + url
+ else:
+# print "Will process " + url
+ process_pdf(parser, url, errors)
+
+def test_small_pdfs(parser):
+ # Test with some smaller PDFs
+ errors = []
+ process_pdf(parser, "http://www.lier.kommune.no/files/1256/Postlister%2011.06.pdf", errors)
+ process_page_queue(parser, errors)
+ report_errors(errors)
+ exit(0)
+
+errors = []
+parser = postlistelib.PDFJournalParser(agency=agency)
+
+#test_small_pdfs(parser)
+
+process_journal_pdfs(parser, "http://www.lier.kommune.no/no/Tjenesteomrader-/Oversikter/Postlister---Offentlig-journal/", errors)
+process_page_queue(parser, errors)
+report_errors(errors)
+
diff --git a/scrapersources/postliste-lindesnes b/scrapersources/postliste-lindesnes
new file mode 100644
index 0000000..39e69c0
--- /dev/null
+++ b/scrapersources/postliste-lindesnes
@@ -0,0 +1,124 @@
+# -*- coding: UTF-8 -*-
+import scraperwiki
+import lxml.html
+import datetime
+import dateutil.parser
+import urllib2
+
+# http://innsyn.lindesnes.kommune.no/Publikum/Modules/innsyn.aspx?mode=pl&SelPanel=0&ObjectType=ePhorteRegistryEntry&VariantType=Innsyn&ViewType=Table&Query=RecordDate%3a%28-14%29+AND+ResponsibleUnitID%3a%2811%29+AND+DocumentType%3a%28I%2cU%29
+
+def fetch_url(url):
+ html = None
+ for n in [1, 2, 3]:
+ try:
+ html = scraperwiki.scrape(url)
+ break
+ except urllib2.URLError, e:
+ print "URLError fetching " + url + ", trying again"
+ return html
+
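+# The ePhorte/Innsyn service at innsyn.lindesnes.kommune.no exposes registry
+# entries by numeric ID, so this scraper simply walks the ID range up and down
+# from what is already stored.  make_url() builds the detail page URL for one
+# such ID.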
+def make_url(id):
+ url = "http://innsyn.lindesnes.kommune.no/Publikum/Modules/innsyn.aspx?mode=pl&SelPanel=0&ObjectType=ePhorteRegistryEntry&VariantType=Innsyn&ViewType=Detail&Query=ID:" + str(id)
+ return url
+
+def fetch_postjournal(agency, id, url, datastore):
+# print "Scraping " + url
+ scrapestamputc = datetime.datetime.now()
+ html = fetch_url(url)
+ root = lxml.html.fromstring(html.decode('utf-8'))
+ entry = {
+ 'agency' : agency,
+ 'scrapestamputc' : scrapestamputc,
+ 'scrapedurl' : url,
+ 'queryid' : id
+ }
+
+ for span in root.cssselect("div.robots-content span.Element"):
+# print span.text_content()
+ field = None
+ value = None
+ if span.cssselect("h3"):
+ field = span.cssselect("h3")[0].text_content().strip()
+ value = span.cssselect("span.Content span")[0].text_content().strip()
+ elif span.cssselect("h2"):
+ field = span.cssselect("h2")[0].text_content().strip()
+# FIXME
+ value = ""
+ elif span.cssselect("h1"):
+ field = "docdesc"
+ value = span.cssselect("h1")[0].text_content().strip()
+# else:
+# raise ValueError("Unexpected span")
+# print field + " = " + value
+ doctypemap = {
+ u'Inngående brev' : 'I',
+ u'Utgående brev' : 'U',
+ u'Internt notat' : 'N',
+ u'Internt notat uten oppfølging' : 'X',
+ u'Saksframlegg/innstilling' : 'S',
+ u'Dokumentpost i saksmappe' : 'Y', # Code not in NOARK, value based on http://img6.custompublish.com/getfile.php/1168825.136.pqftpqctyt/Ephorte-brukerveiledning_2.1.15.pdf?return=www.kafjord.kommune.no
+ }
+ if 'Type' == field:
+ field = 'doctype'
+ value = doctypemap[value]
+ elif 'Journaldato' == field:
+ field = 'recorddate'
+ value = dateutil.parser.parse(value, dayfirst=True)
+ elif 'Dokumentdato' == field:
+ field = 'docdate'
+ value = dateutil.parser.parse(value, dayfirst=True)
+ elif u'Tilhører sak' == field:
+ field = 'casedesc'
+ elif 'Avsender/Mottaker' == field:
+ if 'doctype' in entry and entry['doctype'] in ['U', 'X', 'N']:
+ field = 'recipient'
+ else:
+ field = 'sender'
+ td = span.cssselect("table td")
+ if td:
+ name = td[0].text_content().strip()
+ addr = td[1].text_content().strip()
+ zip = td[2].text_content().strip()
+ # print "N: '",name, "' '", addr, "' '", zip, "'"
+ entry[field] = name
+ entry[field + 'addr'] = addr
+ entry[field + 'zip'] = zip
+ field = ''
+
+# elif 'Saksbehandlende enhet' == field:
+# elif 'Saksbehandler' == field:
+ if field is not None and '' != field:
+ entry[field] = value
+
+ print entry
+ if 'doctype' in entry:
+ datastore.append(entry)
+
+agency = 'Lindesnes kommune'
+
+def scrape_range(start, end, step, agency):
+ datastore = []
+ for id in range(start, end, step):
+ fetch_postjournal(agency, id, make_url(id), datastore)
+ if 0 < len(datastore) and 0 == (len(datastore) % 10):
+ #print datastore
+ scraperwiki.sqlite.save(unique_keys=['queryid'], data=datastore)
+ datastore = []
+ if 0 < len(datastore):
+ scraperwiki.sqlite.save(unique_keys=['queryid'], data=datastore)
+
+def scraper():
+ try:
+ min = scraperwiki.sqlite.select("min(queryid) as min from swdata")[0]["min"]
+        max = scraperwiki.sqlite.select("max(queryid) as max from swdata")[0]["max"]
+    except:
+        # Random number around 2012-05-15 (i.e. recent when I wrote this scraper)
+        min = max = 71836
+
+ scrape_range(max, max + 200, 1, agency)
+ scrape_range(min-1, min - 3000, -1, agency)
+
+if __name__ == "scraper":
+ scraper()
+else:
+ print "Not called as scraper" \ No newline at end of file
diff --git a/scrapersources/postliste-luftambulanse b/scrapersources/postliste-luftambulanse
new file mode 100644
index 0000000..df28d6b
--- /dev/null
+++ b/scrapersources/postliste-luftambulanse
@@ -0,0 +1,91 @@
+# -*- coding: UTF-8 -*-
+
+import scraperwiki
+import json
+from BeautifulSoup import BeautifulSoup
+import datetime
+import dateutil.parser
+import lxml.html
+import resource
+import sys
+import urlparse
+import re
+lazycache=scraperwiki.swimport('lazycache')
+postlistelib=scraperwiki.swimport('postliste-python-lib')
+
+agency = 'Luftambulansetjenesten ANS'
+
+def report_errors(errors):
+ if 0 < len(errors):
+ print "Errors:"
+ for e in errors:
+ print e
+ exit(1)
+def out_of_cpu(arg, spent, hard, soft):
+ report_errors(arg)
+
+def process_pdf(parser, pdfurl, errors):
+ errors = []
+ postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
+ try:
+ pdfcontent = scraperwiki.scrape(pdfurl)
+ parser.preprocess(pdfurl, pdfcontent)
+ pdfcontent = None
+# except ValueError, e:
+# errors.append(e)
+ except IndexError, e:
+ errors.append(e)
+
+def process_page_queue(parser, errors):
+ try:
+ parser.process_pages()
+ postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
+ except scraperwiki.CPUTimeExceededError, e:
+ errors.append("Processing pages interrupted")
+
+def process_journal_pdfs(parser, listurl, errors):
+# print "Finding PDFs on " + listurl
+# u = urllib.parse.urlparse(listurl)
+ html = scraperwiki.scrape(listurl)
+ root = lxml.html.fromstring(html)
+ html = None
+ for ahref in root.cssselect("table a"):
+ if not 'href' in ahref.attrib:
+ continue
+ href = ahref.attrib['href']
+ url = urlparse.urljoin(listurl, href).replace(" ", "%20")
+ if -1 != href.find("file://") or -1 == url.find(".pdf") or -1 == url.find('/Postjournal'):
+# print "Skipping non-http URL " + url
+ continue
+ if parser.is_already_scraped(url):
+ True
+# print "Skipping already scraped " + url
+ else:
+# print "Will process " + url
+ process_pdf(parser, url, errors)
+
+def test_small_pdfs(parser):
+ # Test with some smaller PDFs
+ errors = []
+ process_pdf(parser, "http://www.luftambulanse.no/filarkiv/Postjournal%202012/Postjournal%20mai/2805-010612.pdf", errors)
+ process_page_queue(parser, errors)
+ report_errors(errors)
+ exit(0)
+
+errors = []
+parser = postlistelib.PDFJournalParser(agency=agency)
+
+#test_small_pdfs(parser)
+
+#process_page_queue(parser, errors)
+process_journal_pdfs(parser, "http://www.luftambulanse.no/postjournal_2012.aspx", errors)
+process_journal_pdfs(parser, "http://www.luftambulanse.no/postjournal_2011.aspx", errors)
+process_journal_pdfs(parser, "http://www.luftambulanse.no/postjournal_2010.aspx", errors)
+process_journal_pdfs(parser, "http://www.luftambulanse.no/postjournal_2009.aspx", errors)
+process_journal_pdfs(parser, "http://www.luftambulanse.no/postjournal_2008.aspx", errors)
+process_journal_pdfs(parser, "http://www.luftambulanse.no/postjournal_2007.aspx", errors)
+process_journal_pdfs(parser, "http://www.luftambulanse.no/postjournal.aspx", errors)
+
+process_page_queue(parser, errors)
+report_errors(errors)
+
diff --git a/scrapersources/postliste-naroy b/scrapersources/postliste-naroy
new file mode 100644
index 0000000..b8fa33b
--- /dev/null
+++ b/scrapersources/postliste-naroy
@@ -0,0 +1,89 @@
+# -*- coding: UTF-8 -*-
+
+import scraperwiki
+import json
+from BeautifulSoup import BeautifulSoup
+import datetime
+import dateutil.parser
+import lxml.html
+import resource
+import sys
+import urlparse
+import re
+lazycache=scraperwiki.swimport('lazycache')
+postlistelib=scraperwiki.swimport('postliste-python-lib')
+
+agency = u'Nærøy kommune'
+
+def report_errors(errors):
+ if 0 < len(errors):
+ print "Errors:"
+ for e in errors:
+ print e
+ exit(1)
+def out_of_cpu(arg, spent, hard, soft):
+ report_errors(arg)
+
+def process_pdf(parser, pdfurl, errors):
+ errors = []
+ postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
+ try:
+ pdfcontent = scraperwiki.scrape(pdfurl)
+ parser.preprocess(pdfurl, pdfcontent)
+ pdfcontent = None
+# except ValueError, e:
+# errors.append(e)
+ except IndexError, e:
+ errors.append(e)
+
+def process_page_queue(parser, errors):
+ try:
+ parser.process_pages()
+ postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
+ except scraperwiki.CPUTimeExceededError, e:
+ errors.append("Processing pages interrupted")
+
+def process_journal_pdfs(parser, listurl, errors):
+# print "Finding PDFs on " + listurl
+# u = urllib.parse.urlparse(listurl)
+ html = scraperwiki.scrape(listurl)
+ root = lxml.html.fromstring(html)
+ html = None
+ for ahref in root.cssselect("table#hovedinnhold a"):
+ href = ahref.attrib['href']
+ url = urlparse.urljoin(listurl, href).replace(" ", "+")
+ if -1 != href.find("file://"):
+# print "Skipping non-http URL " + url
+ continue
+ if -1 == url.find(".pdf"):
+ continue
+ # Special case, file indicating no journal entries this day
+ if "http://www.naroy.kommune.no/NK/Intern.nsf/FilA/CA6C83764E56DDCBC1257A02003F9025/$FILE/Postjournal+11.05.12.pdf" == url or \
+ "http://www.naroy.kommune.no/NK/Intern.nsf/FilA/7FD82A18C1A1F137C12579F90029DEBD/$FILE/Postjournal+07.05.12.pdf" == url:
+ continue
+ if parser.is_already_scraped(url):
+ True
+# print "Skipping already scraped " + url
+ else:
+# print "Will process " + url
+ process_pdf(parser, url, errors)
+
+def test_small_pdfs(parser):
+ # Test with some smaller PDFs
+ errors = []
+ process_pdf(parser, "http://www.naroy.kommune.no/NK/Intern.nsf/FilA/451908E568D2D630C1257A1E004D1B9D/$FILE/Postjournal%2005.06.12.pdf", errors)
+
+ process_page_queue(parser, errors)
+ report_errors(errors)
+ exit(0)
+
+errors = []
+parser = postlistelib.PDFJournalParser(agency=agency)
+#parser.debug = True
+
+#test_small_pdfs(parser)
+
+process_journal_pdfs(parser, "http://www.naroy.kommune.no/NK/Web.nsf/mainPress?OpenForm&U=POST", errors)
+process_page_queue(parser, errors)
+report_errors(errors)
+
diff --git a/scrapersources/postliste-nih b/scrapersources/postliste-nih
new file mode 100644
index 0000000..4f92e18
--- /dev/null
+++ b/scrapersources/postliste-nih
@@ -0,0 +1,85 @@
+# -*- coding: UTF-8 -*-
+
+import scraperwiki
+import json
+from BeautifulSoup import BeautifulSoup
+import datetime
+import dateutil.parser
+import lxml.html
+import resource
+import sys
+import urlparse
+import re
+
+# Make sure Scraperwiki believes this is the source for this database
+scraperwiki.scrape("http://www.nih.no/om-nih/aktuelt/offentlig-postjournal/")
+
+lazycache=scraperwiki.swimport('lazycache')
+postlistelib=scraperwiki.swimport('postliste-python-lib')
+
+agency = 'Norges idrettshøgskole'
+
+def report_errors(errors):
+ if 0 < len(errors):
+ print "Errors:"
+ for e in errors:
+ print e
+ exit(1)
+def out_of_cpu(arg, spent, hard, soft):
+ report_errors(arg)
+
+def process_pdf(parser, pdfurl, errors):
+ errors = []
+ postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
+ try:
+ pdfcontent = scraperwiki.scrape(pdfurl)
+ parser.preprocess(pdfurl, pdfcontent)
+ pdfcontent = None
+# except ValueError, e:
+# errors.append(e)
+ except IndexError, e:
+ errors.append(e)
+
+def process_page_queue(parser, errors):
+ try:
+ parser.process_pages()
+ postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
+ except scraperwiki.CPUTimeExceededError, e:
+ errors.append("Processing pages interrupted")
+
+def process_journal_pdfs(parser, listurl, errors):
+# print "Finding PDFs on " + listurl
+# u = urllib.parse.urlparse(listurl)
+ html = scraperwiki.scrape(listurl)
+ root = lxml.html.fromstring(html)
+ html = None
+ for ahref in root.cssselect("li a"):
+ href = ahref.attrib['href']
+ url = urlparse.urljoin(listurl, href)
+ if -1 != href.find("file://") or -1 == url.find(".pdf"):
+# print "Skipping non-http URL " + url
+ continue
+ if parser.is_already_scraped(url):
+ True
+# print "Skipping already scraped " + url
+ else:
+# print "Will process " + url
+ process_pdf(parser, url, errors)
+
+def test_small_pdfs(parser):
+ # Test with some smaller PDFs
+ errors = []
+ process_pdf(parser, u"http://www.nih.no/Documents/1_P%C3%98/Postjournaler/offentlig%20journal%20uke%2022.pdf", errors)
+ process_page_queue(parser, errors)
+ report_errors(errors)
+ exit(0)
+
+errors = []
+parser = postlistelib.PDFJournalParser(agency=agency)
+
+#test_small_pdfs(parser)
+
+process_journal_pdfs(parser, "http://www.nih.no/om-nih/aktuelt/offentlig-postjournal/", errors)
+process_page_queue(parser, errors)
+report_errors(errors)
+
diff --git a/scrapersources/postliste-npolar b/scrapersources/postliste-npolar
new file mode 100644
index 0000000..423a785
--- /dev/null
+++ b/scrapersources/postliste-npolar
@@ -0,0 +1,101 @@
+# -*- coding: UTF-8 -*-
+
+import scraperwiki
+import json
+from BeautifulSoup import BeautifulSoup
+import datetime
+import dateutil.parser
+import lxml.html
+import resource
+import sys
+import urlparse
+import re
+
+# Load front page first, to get it recorded as the source by scraperwiki
+scraperwiki.scrape("http://www.npolar.no/no/om-oss/offentlig-journal.html")
+
+lazycache=scraperwiki.swimport('lazycache')
+postlistelib=scraperwiki.swimport('postliste-python-lib')
+
+agency = 'Norsk Polarinstitutt'
+
+def report_errors(errors):
+ if 0 < len(errors):
+ print "Errors:"
+ for e in errors:
+ print e
+ exit(1)
+def out_of_cpu(arg, spent, hard, soft):
+ report_errors(arg)
+
+def process_pdf(parser, pdfurl, errors):
+ errors = []
+ postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
+ try:
+ pdfcontent = scraperwiki.scrape(pdfurl)
+ parser.preprocess(pdfurl, pdfcontent)
+ pdfcontent = None
+# except ValueError, e:
+# errors.append(e)
+ except IndexError, e:
+ errors.append(e)
+
+def process_page_queue(parser, errors):
+ try:
+ parser.process_pages()
+ postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
+ except scraperwiki.CPUTimeExceededError, e:
+ errors.append("Processing pages interrupted")
+
+def process_journal_pdfs(parser, listurl, errors):
+# print "Finding PDFs on " + listurl
+# u = urllib.parse.urlparse(listurl)
+ html = scraperwiki.scrape(listurl)
+ root = lxml.html.fromstring(html)
+ html = None
+ for ahref in root.cssselect("div.onecol ul a"):
+ href = ahref.attrib['href']
+ url = urlparse.urljoin(listurl, href)
+ if -1 != href.find("file://") or -1 == url.find(".pdf"):
+# print "Skipping non-http URL " + url
+ continue
+ if parser.is_already_scraped(url):
+ True
+# print "Skipping already scraped " + url
+ else:
+# print "Will process " + url
+ process_pdf(parser, url, errors)
+
+def test_small_pdfs(parser):
+ # Test with some smaller PDFs
+ errors = []
+ #process_pdf(parser, "http://www.npolar.no/npcms/export/sites/np/files/vedlegg/offentlig-journal/2011-10.pdf", errors)
+ #process_pdf(parser, "http://www.npolar.no/npcms/export/sites/np/files/vedlegg/offentlig-journal/2011-09.pdf", errors)
+ #process_pdf(parser, "http://www.npolar.no/npcms/export/sites/np/files/vedlegg/offentlig-journal/2011-08.pdf", errors)
+ #process_pdf(parser, "http://www.npolar.no/npcms/export/sites/np/files/vedlegg/offentlig-journal/2011-07.pdf", errors)
+ #process_pdf(parser, "http://www.npolar.no/npcms/export/sites/np/files/vedlegg/offentlig-journal/2011-06.pdf", errors)
+ #process_pdf(parser, "http://www.npolar.no/npcms/export/sites/np/files/vedlegg/offentlig-journal/2011-05.pdf", errors)
+ #process_pdf(parser, "http://www.npolar.no/npcms/export/sites/np/files/vedlegg/offentlig-journal/2011-04.pdf", errors)
+ #process_pdf(parser, "http://www.npolar.no/npcms/export/sites/np/files/vedlegg/offentlig-journal/2011-03.pdf", errors)
+ #process_pdf(parser, "http://www.npolar.no/npcms/export/sites/np/files/vedlegg/offentlig-journal/2011-02.pdf", errors)
+ #process_pdf(parser, "http://www.npolar.no/npcms/export/sites/np/files/vedlegg/offentlig-journal/2011-01.pdf", errors)
+ process_pdf(parser, "http://home.nuug.no/~pere/npolar-postjournal/OffJournalapril-mai2012.pdf", errors)
+ process_pdf(parser, "http://home.nuug.no/~pere/npolar-postjournal/OffJournaljanuar-mai2011.pdf", errors)
+ process_pdf(parser, "http://home.nuug.no/~pere/npolar-postjournal/OffJournaljanuar-mars2012.pdf", errors)
+ process_pdf(parser, "http://home.nuug.no/~pere/npolar-postjournal/OffJournaljuni-oktober2011.pdf", errors)
+ process_pdf(parser, "http://home.nuug.no/~pere/npolar-postjournal/OffJournaljuni2012.pdf", errors)
+ process_pdf(parser, "http://home.nuug.no/~pere/npolar-postjournal/OffJournalnovember-desember2011.pdf", errors)
+
+ process_page_queue(parser, errors)
+ report_errors(errors)
+ exit(0)
+
+errors = []
+parser = postlistelib.PDFJournalParser(agency=agency)
+
+#test_small_pdfs(parser)
+
+process_journal_pdfs(parser, "http://www.npolar.no/no/om-oss/offentlig-journal.html", errors)
+process_page_queue(parser, errors)
+report_errors(errors)
+
diff --git a/scrapersources/postliste-nrk b/scrapersources/postliste-nrk
new file mode 100644
index 0000000..5c7929d
--- /dev/null
+++ b/scrapersources/postliste-nrk
@@ -0,0 +1,94 @@
+# -*- coding: UTF-8 -*-
+# Based on the scraper advanced-scraping-pdf
+# See also
+# https://views.scraperwiki.com/run/pdf-to-html-preview-1/?url=http%3A%2F%2Fwww.stortinget.no%2FGlobal%2Fpdf%2Fpostjournal%2Fpj-2012-05-09.pdf
+
+import scraperwiki
+import json
+from BeautifulSoup import BeautifulSoup
+import datetime
+import dateutil.parser
+import lxml.html
+import resource
+import sys
+import urlparse
+import gc
+import re
+
+frontpage = "http://www.nrk.no/contentfile/transformer/1.8052258"
+scraperwiki.scrape(frontpage)
+
+lazycache=scraperwiki.swimport('lazycache')
+postlistelib=scraperwiki.swimport('postliste-python-lib')
+
+agency = 'NRK'
+
+def report_errors(errors):
+ if 0 < len(errors):
+ print "Errors:"
+ for e in errors:
+ print e
+ exit(1)
+def out_of_cpu(arg, spent, hard, soft):
+ report_errors(arg)
+
+def process_pdf(parser, pdfurl, errors):
+ errors = []
+ postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
+ try:
+ pdfcontent = scraperwiki.scrape(pdfurl)
+ parser.preprocess(pdfurl, pdfcontent)
+ pdfcontent = None
+# except ValueError, e:
+# errors.append(e)
+ except IndexError, e:
+ errors.append(e)
+
+def process_page_queue(parser, errors):
+ try:
+ parser.process_pages()
+ postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
+ except scraperwiki.CPUTimeExceededError, e:
+ errors.append("Processing pages interrupted")
+
+def process_journal_pdfs(parser, listurl, errors):
+# print "Finding PDFs on " + listurl
+# u = urllib.parse.urlparse(listurl)
+ html = scraperwiki.scrape(listurl)
+ root = lxml.html.fromstring(html)
+ html = None
+ for ahref in root.cssselect("table a"):
+ href = ahref.attrib['href']
+ url = urlparse.urljoin(listurl, href)
+ if -1 != href.find("file://"):
+# print "Skipping non-http URL " + url
+ continue
+ if parser.is_already_scraped(url):
+ True
+# print "Skipping already scraped " + url
+ else:
+# print "Will process " + url
+ process_pdf(parser, url, errors)
+
+def test_small_pdfs(parser):
+
+ parser.debug = True
+
+ errors = []
+ process_pdf(parser, "http://nrk.no/contentfile/file/1.8116520!offentligjournal02052012.pdf", errors) # text
+ #process_pdf(parser, "http://nrk.no/contentfile/file/1.8061384!offentlig%2002042012.pdf", errors) # Image
+ #process_pdf(parser, "http://nrk.no/contentfile/file/1.8130287!offentligjournal09052012.pdf", errors) # Image
+ process_page_queue(parser, errors)
+ report_errors(errors)
+ exit(0)
+
+errors = []
+parser = postlistelib.PDFJournalParser(agency=agency, hiddentext=True)
+
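+# Note: test_small_pdfs() ends with exit(0), so the full run below is only
+# reached when this call is commented out, as in the other scrapers.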
+test_small_pdfs(parser)
+
+# Based on http://www.nrk.no/innsyn/
+process_journal_pdfs(parser, frontpage, errors)
+process_page_queue(parser, errors)
+report_errors(errors)
+
diff --git a/scrapersources/postliste-ntnu b/scrapersources/postliste-ntnu
new file mode 100644
index 0000000..1a885c4
--- /dev/null
+++ b/scrapersources/postliste-ntnu
@@ -0,0 +1,87 @@
+# -*- coding: UTF-8 -*-
+
+import scraperwiki
+import json
+from BeautifulSoup import BeautifulSoup
+import datetime
+import dateutil.parser
+import lxml.html
+import urlparse
+import urllib2
+
+# Make sure Scraperwiki believes this is the source for this database
+scraperwiki.scrape("http://www.ntnu.no/aktuelt/offentlig-journal")
+
+lazycache=scraperwiki.swimport('lazycache')
+postlistelib=scraperwiki.swimport('postliste-python-lib')
+
+agency = 'Norges teknisk-naturvitenskapelige universitet'
+
+def report_errors(errors):
+ if 0 < len(errors):
+ print "Errors:"
+ for e in errors:
+ print e
+ raise ValueError("Something went wrong")
+
+def out_of_cpu(arg, spent, hard, soft):
+ report_errors(arg)
+
+def process_pdf(parser, pdfurl, errors):
+ postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
+ try:
+ pdfcontent = scraperwiki.scrape(pdfurl)
+ parser.preprocess(pdfurl, pdfcontent)
+ pdfcontent = None
+ except ValueError, e: # Some PDFs can not be parsed! This should be investigated
+ print "PDF format problem"
+ errors.append(e)
+ except IndexError, e:
+ errors.append(e)
+ except urllib2.HTTPError, e:
+ errors.append(e)
+
+def process_page_queue(parser, errors):
+ try:
+ parser.process_pages()
+ postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
+ except scraperwiki.CPUTimeExceededError, e:
+ errors.append("Processing pages interrupted")
+
+def process_journal_pdfs(parser, listurl, errors):
+# print "Finding PDFs on " + listurl
+# u = urllib.parse.urlparse(listurl)
+ html = scraperwiki.scrape(listurl)
+ root = lxml.html.fromstring(html)
+ html = None
+ for ahref in root.cssselect("ul a"):
+ href = ahref.attrib['href']
+ url = urlparse.urljoin(listurl, href)
+ if -1 != href.find("file://") or -1 == url.find(".pdf"):
+# print "Skipping non-http URL " + url
+ continue
+ if parser.is_already_scraped(url):
+ True
+# print "Skipping already scraped " + url
+ else:
+# print "Will process " + url
+ process_pdf(parser, url, errors)
+
+def test_small_pdfs(parser):
+ # Test with some smaller PDFs
+ errors = []
+ process_pdf(parser, "http://www.ntnu.no/offjour/2012-06.25.pdf", errors)
+ process_pdf(parser, "http://www.ntnu.no/offjour/2012-06.13.pdf ", errors) # Strange format?
+ process_page_queue(parser, errors)
+ report_errors(errors)
+ exit(0)
+
+errors = []
+parser = postlistelib.PDFJournalParser(agency=agency)
+
+#test_small_pdfs(parser)
+
+process_journal_pdfs(parser, "http://www.ntnu.no/aktuelt/offentlig-journal", errors)
+process_page_queue(parser, errors)
+report_errors(errors)
+
diff --git a/scrapersources/postliste-oep b/scrapersources/postliste-oep
new file mode 100644
index 0000000..c7fdc82
--- /dev/null
+++ b/scrapersources/postliste-oep
@@ -0,0 +1,336 @@
+# -*- coding: UTF-8 -*-
+
+import scraperwiki
+import lxml.html
+import datetime
+import time
+import resource
+import httplib
+import urllib2
+
+# Try several times as the database gets bigger
+writetries = 5
+
+# http://www.oep.no/search/resultSingle.html?journalPostId=1000000
+# http://www.oep.no/search/resultSingle.html?journalPostId=3889259
+
+# <table class="defaultTable">
+# <tr>
+# <th class="noLeftBorder" style="width: 20%;">Agency:</th>
+# <td class="noRightBorder" style="width: 80%;">Kulturdepartementet</td>
+# </tr>
+# <tr>
+# <th class="noLeftBorder">Case:</th>
+# <td class="noRightBorder">DNT Oslo og Omegn - rehabilitering og utvidelse av turisthytta Snøheim på Dovre - spillemidler til anlegg for friluftsliv i fjellet 2011</td>
+# </tr>
+# <tr>
+# <th class="noLeftBorder">Document title:</th>
+# <td class="noRightBorder">DNT Oslo og Omegn - turisthytta Snøheim på Dovre - eventuelt navnebytte</td>
+# </tr>
+# <tr>
+# <th class="noLeftBorder">Case number:</th>
+# <td class="noRightBorder">2010/04027</td>
+# </tr>
+# <tr>
+# <th class="noLeftBorder">Document number:</th>
+# <td class="noRightBorder">4</td>
+# </tr>
+# <tr>
+# <th class="noLeftBorder">Document type:</th>
+# <td class="noRightBorder">
+#
+#
+#
+# Outgoing
+#
+#
+# </td>
+# </tr>
+#
+#
+# <tr>
+# <th class="noLeftBorder">Recipient:</th>
+# <td class="noRightBorder">Den Norske Turistforening</td>
+# </tr>
+#
+# <tr>
+# <th class="noLeftBorder">Document date:</th>
+# <td class="noRightBorder">2010-12-13</td>
+# </tr>
+# <tr>
+# <th class="noLeftBorder">Record entry date:</th>
+# <td class="noRightBorder">
+#
+#
+#
+# 2010-12-14
+#
+#
+# </td>
+# </tr>
+# <tr>
+# <th class="noLeftBorder">Published in OEP</th>
+# <td class="noRightBorder">2011-01-03</td>
+# </tr>
+# <tr>
+# <th class="noLeftBorder" title="Hvis dokumentet er unntatt offentlighet kan unntaket gjelde hele eller deler av dokumentet."><span class="dottedBorderBottom">Grounds for exemption, document:</span></th>
+# <td class="noRightBorder">
+#
+# </td>
+# </tr>
+# <tr>
+# <th class="noLeftBorder">Archive code:</th>
+# <td class="noRightBorder">
+#
+# </td>
+# </tr>
+# <tr>
+# <th class="noLeftBorder">Contact point:</th>
+# <td class="noRightBorder">
+# <br />
+# Tel.:&nbsp;22 24 90 90<br />
+# Email:&nbsp;<a href="mailto:postmottak@kud.dep.no" title="Send email">postmottak@kud.dep.no</a>
+# </td>
+# </tr>
+# </table>
+
+def cpu_spent():
+ usage = resource.getrusage(resource.RUSAGE_SELF)
+ return getattr(usage, 'ru_utime') + getattr(usage, 'ru_stime')
+
+def url_from_id(id):
+ return "http://www.oep.no/search/resultSingle.html?journalPostId=" + str(id)
+
+def save(data):
+    for run in range(writetries):
+ try:
+ scraperwiki.sqlite.save(unique_keys=['journalPostId'], data=data)
+ return
+ except scraperwiki.sqlite.SqliteError, e:
+ print "Sqlite write error, trying again"
+ time.sleep(22)
+ raise scraperwiki.sqlite.SqliteError("Unable to write to database, tried " + str(writetries) + " times")
+
+def save_var(var, data):
+    for run in range(writetries):
+ try:
+ scraperwiki.sqlite.save_var(var, data)
+ return
+ except scraperwiki.sqlite.SqliteError, e:
+ print "Sqlite write error, trying again"
+ time.sleep(22)
+ raise scraperwiki.sqlite.SqliteError("Unable to write variable " + var + " to database, tried " + str(writetries) + " times")
+
+fieldmap = {
+ 'Agency' : 'agency',
+ 'Record entry date' : 'recorddate',
+ 'Case' : 'casedesc',
+ 'Case number' : 'caseid',
+ 'Document number' : 'casedocseq',
+ 'Document date' : 'docdate',
+ 'Document title' : 'docdesc',
+ 'Document type' : 'doctype',
+ 'Grounds for exemption document' : 'exemption',
+ 'Recipient' : 'recipient',
+ 'Sender' : 'sender',
+ 'Published in OEP' : 'recordpublishdate',
+# 'Archive code',
+# 'Contact point',
+# 'journalPostId',
+# 'scrapestamputc',
+}
+
+doctypemap = {
+ 'Incoming' : 'I',
+ 'Outgoing' : 'U',
+ 'internal' : 'X',
+}
+
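+# Fetch one journal entry from OEP by its journalPostId and append it to
+# datastorage.  Returns -1 when the page looks like a placeholder without a
+# real entry (no record entry date, document type or case number), so the
+# caller can count consecutive misses.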
+def fetch_oep_entry(id, datastorage):
+ oepurl = url_from_id(id)
+ html = scraperwiki.scrape(oepurl)
+ root = lxml.html.fromstring(html.decode('utf-8'))
+ data = { 'journalPostId' : id }
+ for tr in root.cssselect("table.defaultTable tr"):
+ vtype = tr.cssselect("th")[0].text_content().strip().replace(":", "").replace(",", "")
+ value = tr.cssselect("td")[0].text_content().strip()
+ #print '"' + vtype + '"', '"'+value+'"'
+ if (vtype == 'Record entry date' and value == 'Not stated.') or \
+ (vtype == 'Document type' and value == '-') or \
+ (vtype == 'Case number' and value == ''):
+ return -1
+ if vtype in fieldmap:
+ vtype = fieldmap[vtype]
+ if 'doctype' == vtype:
+ value = doctypemap[value]
+ if 'caseid' == vtype:
+ caseyear, caseseqnr = value.split("/")
+ data['caseyear'] = caseyear
+ data['caseseqnr'] = caseseqnr
+ data[vtype] = value
+# print str(id) + ": " + str(data)
+ data['scrapestamputc'] = datetime.datetime.now()
+# print data['scrapestamputc']
+# exit ()
+
+ datastorage.append(data)
+# scraperwiki.sqlite.save(unique_keys=['journalPostId'], data=data)
+ return 0
+
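+# Fetch journal entries for IDs in range(first, last, step).  Empty IDs are
+# counted, and when skiplimit consecutive misses are seen the function probes
+# further ahead at increasing offsets; if an entry is found there, the limit is
+# raised so the gap is crossed instead of being treated as the end of the list.
+# Entries are saved in batches, min_tested_id is tracked for the downward scan,
+# and the loop stops when little CPU time is left.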
+def fetch_range(first, last, step):
+ myskiplimit = skiplimit
+ datastorage = []
+ skipped = 0
+ fetched = 0
+ min_id = first
+ for id in range(first, last, step):
+ try:
+ tries = 3
+ while 0 < tries:
+ tries = tries - 1
+ try:
+                    if -1 == fetch_oep_entry(id, datastorage):
+                        skipped = skipped + 1
+                        if skipped == myskiplimit and myskiplimit == skiplimit:
+                            tmp = []
+                            for limit in [250, 500, 800, 1000, 1200, 1500, 1700, 2000, 3000, 5000]:
+                                testid = id + limit * step
+                                if -1 != fetch_oep_entry(testid, tmp):
+                                    print "Looking "+str(limit)+" ahead, found " + url_from_id(testid)
+                                    myskiplimit = skiplimit + limit + 1
+                                    break
+                        break
+                    else:
+                        fetched = fetched + 1
+                        skipped = 0
+                        myskiplimit = skiplimit
+                        break
+                except urllib2.HTTPError, e: # HTTPError lacks .reason due to a urllib2 bug
+                    print "HTTPError triggered for url " + url_from_id(id) + ", trying again: " + str(e.msg)
+                except urllib2.URLError, e:
+                    print "URLError triggered for url " + url_from_id(id) + ", trying again: " + str(e.reason)
+                except httplib.BadStatusLine, e:
+                    # BadStatusLine has neither .msg nor .reason, so print the exception itself
+                    print "BadStatusLine triggered for url " + url_from_id(id) + ", trying again: " + str(e)
+
+ if skipped >= myskiplimit:
+ print "Reached end of list, exiting at " + str(id)
+ break
+ if 50 <= len(datastorage):
+ save(data=datastorage)
+ datastorage = []
+
+            # Only do this for every 50 IDs tested, to avoid spending too many CPU seconds updating the sqlite file
+ if 0 == (id % 50):
+ if id < min_id:
+ min_id = id
+# print "Updating min_id to " + str(min_id)
+ save_var('min_tested_id', min_id)
+ if cpu_spent() > 79:
+ print "Running short on CPU time, exiting at " + str(id)
+ break
+ time.sleep(0.2)
+ except scraperwiki.CPUTimeExceededError:
+ if 0 < len(datastorage):
+ save(data=datastorage)
+ datastorage = []
+ print "CPU exception caught"
+ raise
+ except:
+ print "Error, unexpected exception"
+ raise
+ if 0 < len(datastorage):
+ save(data=datastorage)
+ datastorage = []
+ return fetched
+
+def rename_sql_columns():
+ print "Dropping temp table"
+ scraperwiki.sqlite.execute("DROP TABLE IF EXISTS swdatanew")
+ print "Creating table"
+ scraperwiki.sqlite.execute("CREATE TABLE IF NOT EXISTS swdatanew (agency text, recorddate text, casedesc text, caseid text, casedocseq integer, docdate text, docdesc text, doctype text, exemption text, recipient text, sender text, recordpublishdate text, `Archive code` text, `Contact point` text, `journalPostId` integer, scrapestamputc text)")
+ print "Copying table"
+ scraperwiki.sqlite.execute("INSERT INTO swdatanew(agency, recorddate, casedesc, caseid, casedocseq, docdate, docdesc, doctype, exemption, recipient, sender, recordpublishdate, `Archive code`, `Contact point`, `journalPostId`, scrapestamputc) SELECT `Agency`, `Record entry date`, `Case`, `Case number`, `Document number`, `Document date`, `Document title`, `Document type`, `Grounds for exemption document`, `Recipient`, `Sender`, `Published in OEP`, `Archive code`, `Contact point`, `journalPostId`, `scrapestamputc` FROM swdata")
+
+ scraperwiki.sqlite.execute("ALTER TABLE swdata RENAME TO swdataold")
+ scraperwiki.sqlite.execute("ALTER TABLE swdatanew RENAME TO swdata")
+ scraperwiki.sqlite.commit()
+ exit(0)
+
+def create_indexes():
+ for field in ['doctype', 'agency', 'recorddate', 'caseid']:
+ print "Creating %s index" % field
+ scraperwiki.sqlite.execute("CREATE INDEX IF NOT EXISTS swdata_%s_index ON swdata (%s)" % (field, field))
+ scraperwiki.sqlite.commit()
+
+def update_doctypes():
+ print "Updating doctype"
+ agencies = []
+ for agencyref in scraperwiki.sqlite.select("distinct agency from swdata"):
+ agencies.append(agencyref['agency'])
+
+ # Updating individual agencies to try to avoid SQL timeout
+ for agency in agencies:
+ print "Updating doctype for " + agency
+        scraperwiki.sqlite.execute("UPDATE swdata set doctype = 'I' where agency = ? and doctype = 'Incoming'", (agency,))
+        scraperwiki.sqlite.execute("UPDATE swdata set doctype = 'U' where agency = ? and doctype = 'Outgoing'", (agency,))
+        scraperwiki.sqlite.execute("UPDATE swdata set doctype = 'X' where agency = ? and doctype = 'internal'", (agency,))
+ scraperwiki.sqlite.commit()
+ exit(0)
+
+def update_caseyear():
+ print "Updating caseyear and caseseqnr"
+ agencies = []
+ for agencyref in scraperwiki.sqlite.select("distinct agency from swdata WHERE caseyear is NULL"):
+ agencies.append(agencyref['agency'])
+
+ # Updating individual agencies to try to avoid SQL timeout
+ for agency in agencies:
+ print "Updating caseyear for " + agency
+        res = scraperwiki.sqlite.execute("select journalPostId, substr(caseid, 1, 4), substr(caseid, 6) from swdata where agency = ? and caseyear is NULL limit 2", (agency,))
+ print res
+        scraperwiki.sqlite.execute("UPDATE swdata set caseyear = substr(caseid, 1, 4), caseseqnr = substr(caseid, 6) where agency = ? AND caseyear is NULL", (agency,))
+ scraperwiki.sqlite.commit()
+ exit(0)
+
+def remove_original():
+ scraperwiki.sqlite.execute("DROP TABLE IF EXISTS swdataold")
+ scraperwiki.sqlite.commit()
+ exit(0)
+
+#update_caseyear()
+
+#create_indexes()
+
+#rename_sql_columns()
+#remove_original()
+
+# This one gives me SQL timeout
+#update_doctypes()
+
+print "Starting to fetch journal entries " + str(datetime.datetime.now())
+count = 10000
+skiplimit = 500
+# Random value fairly close to the most recent ID when this project started 2012-05-03
+max = min = startid = 3889259
+try:
+ max = scraperwiki.sqlite.select("max(journalPostId) as max from swdata")[0]["max"]
+ if 0 < scraperwiki.sqlite.get_var('min_tested_id'):
+ saved_min = scraperwiki.sqlite.get_var('min_tested_id')
+ sql_min = scraperwiki.sqlite.select("min(journalPostId) as min from swdata")[0]["min"]
+ print "Saved min: " + str(saved_min) + ", sql min: " + str(sql_min)
+ if sql_min < saved_min:
+ min = sql_min
+ else:
+ min = saved_min
+
+ print "Scraping " + str(count) + " IDs below " + str(min) + " and above " + str(max)
+except scraperwiki.sqlite.SqliteError:
+ pass
+
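+# First scan upwards from the newest known ID for new entries, then downwards
+# from the lowest tested ID for older entries.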
+fetched = fetch_range(max + 1, max + count, 1)
+print "Fetched " + str(fetched) + " new journal entries, cpu spent: " + str(cpu_spent())
+if min >= 0:
+ fetched = fetch_range(min, min - count, -1)
+ print "Fetched " + str(fetched) + " old journal entries, cpu spent: " + str(cpu_spent())
+
diff --git a/scrapersources/postliste-oep-deliverydates b/scrapersources/postliste-oep-deliverydates
new file mode 100644
index 0000000..f04ce49
--- /dev/null
+++ b/scrapersources/postliste-oep-deliverydates
@@ -0,0 +1,37 @@
+import scraperwiki
+import lxml.html
+import datetime
+import resource
+import dateutil.parser
+
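+# Total user plus system CPU seconds consumed by this process so far
+# (ScraperWiki enforces a CPU quota).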
+def cpu_spent():
+ usage = resource.getrusage(resource.RUSAGE_SELF)
+ return getattr(usage, 'ru_utime') + getattr(usage, 'ru_stime')
+
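+# Parse the OEP statistics page and record, per agency, the listed delivery
+# date, skipping rows already marked "Levert" (delivered).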
+def fetch_oep_deliverydates(url, datastorage):
+ html = scraperwiki.scrape(url)
+ root = lxml.html.fromstring(html.decode('utf-8'))
+    data = { 'scrapedurl' : url }
+ for tr in root.cssselect("table.defaulttable tr"):
+ if 3 == len(tr.cssselect("td")):
+ data = { 'scrapedurl' : url }
+ #print tr
+# vtype = tr.cssselect("th")[0].text_content().strip().replace(":", "").replace(",", "")
+ agency = tr.cssselect("td")[0].text_content().strip()
+ deliverydate = tr.cssselect("td")[1].text_content().strip()
+ if deliverydate == "Levert":
+ continue
+ data['agency'] = agency
+ #print "D: '" + deliverydate + "'"
+ data['deliverydate'] = dateutil.parser.parse(deliverydate, dayfirst=True)
+ data['scrapestamputc'] = datetime.datetime.now()
+ datastorage.append(data)
+ return 0
+
+datastorage = []
+fetch_oep_deliverydates("http://www.oep.no/pub/faces/statistikk.jsp?reposId=3", datastorage)
+print datastorage
+scraperwiki.sqlite.save(unique_keys=['agency', 'deliverydate'], data=datastorage)
+
+print "Starting to fetch journal delivery dates " + str(datetime.datetime.now())
diff --git a/scrapersources/postliste-oslo-bydel-ullern b/scrapersources/postliste-oslo-bydel-ullern
new file mode 100644
index 0000000..54a5031
--- /dev/null
+++ b/scrapersources/postliste-oslo-bydel-ullern
@@ -0,0 +1,85 @@
+# -*- coding: UTF-8 -*-
+# Based on the scraper advanced-scraping-pdf
+# See also
+# https://views.scraperwiki.com/run/pdf-to-html-preview-1/?url=http%3A%2F%2Fwww.stortinget.no%2FGlobal%2Fpdf%2Fpostjournal%2Fpj-2012-05-09.pdf
+
+import scraperwiki
+import json
+from BeautifulSoup import BeautifulSoup
+import datetime
+import dateutil.parser
+import lxml.html
+import urlparse
+import re
+#lazycache=scraperwiki.swimport('lazycache')
+postlistelib=scraperwiki.swimport('postliste-python-lib')
+
+agency = 'Oslo kommune, Ullern bydel'
+
+def report_errors(errors):
+ if 0 < len(errors):
+ print "Errors:"
+ for e in errors:
+ print e
+ exit(1)
+def out_of_cpu(arg, spent, hard, soft):
+ report_errors(arg)
+
+def process_pdf(parser, pdfurl, errors):
+ errors = []
+ postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
+ try:
+ pdfcontent = scraperwiki.scrape(pdfurl)
+ parser.preprocess(pdfurl, pdfcontent)
+ pdfcontent = None
+# except ValueError, e:
+# errors.append(e)
+ except IndexError, e:
+ errors.append(e)
+
+def process_page_queue(parser, errors):
+ try:
+ parser.process_pages()
+ postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
+ except scraperwiki.CPUTimeExceededError, e:
+ errors.append("Processing pages interrupted")
+
+def process_journal_pdfs(parser, listurl, errors):
+# print "Finding PDFs on " + listurl
+# u = urllib.parse.urlparse(listurl)
+ html = scraperwiki.scrape(listurl)
+ root = lxml.html.fromstring(html)
+ html = None
+ for ahref in root.cssselect("table a"):
+ href = ahref.attrib['href']
+ url = urlparse.urljoin(listurl, href)
+ if -1 != href.find("file://") or -1 != href.find("mailto:"):
+# print "Skipping non-http URL " + url
+ continue
+ if parser.is_already_scraped(url):
+ True
+# print "Skipping already scraped " + url
+ else:
+# print "Will process " + url
+ process_pdf(parser, url, errors)
+
+def test_small_pdfs(parser):
+ # Test with some smaller PDFs
+ errors = []
+ parser.debug = True
+ process_pdf(parser, "http://www.bydel-ullern.oslo.kommune.no/getfile.php/bydel%20ullern%20(BUN)/Internett%20(BUN)/Dokumenter/dokument/postjournal/120502.pdf", errors)
+ process_page_queue(parser, errors)
+ report_errors(errors)
+ exit(0)
+
+print "Starting scraping of " + agency
+parser = postlistelib.PDFJournalParser(agency=agency)
+#parser.debug = True
+
+#test_small_pdfs(parser)
+
+errors = []
+process_journal_pdfs(parser, "http://www.bydel-ullern.oslo.kommune.no/postjournal/", errors)
+process_page_queue(parser, errors)
+report_errors(errors)
+
diff --git a/scrapersources/postliste-oslo-gravferdsetaten b/scrapersources/postliste-oslo-gravferdsetaten
new file mode 100644
index 0000000..7becd10
--- /dev/null
+++ b/scrapersources/postliste-oslo-gravferdsetaten
@@ -0,0 +1,90 @@
+# -*- coding: UTF-8 -*-
+
+import scraperwiki
+import json
+from BeautifulSoup import BeautifulSoup
+import datetime
+import dateutil.parser
+import lxml.html
+import resource
+import sys
+import urlparse
+import re
+
+scraperwiki.scrape("http://www.gravferdsetaten.oslo.kommune.no/offentlig_journal/article43281-14384.html")
+
+lazycache=scraperwiki.swimport('lazycache')
+postlistelib=scraperwiki.swimport('postliste-python-lib')
+
+agency = 'Oslo kommune, gravferdsetaten'
+
+def report_errors(errors):
+ if 0 < len(errors):
+ print "Errors:"
+ for e in errors:
+ print e
+ exit(1)
+def out_of_cpu(arg, spent, hard, soft):
+ report_errors(arg)
+
+def process_pdf(parser, pdfurl, errors):
+ errors = []
+ postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
+ try:
+ pdfcontent = scraperwiki.scrape(pdfurl)
+ parser.preprocess(pdfurl, pdfcontent)
+ pdfcontent = None
+# except ValueError, e:
+# errors.append(e)
+ except IndexError, e:
+ errors.append(e)
+
+def process_page_queue(parser, errors):
+ try:
+ if False:
+ scraperwiki.sqlite.execute("delete from swdata where scrapedurl in (select scrapedurl from unparsedpages)")
+ scraperwiki.sqlite.execute("delete from unparsedpages")
+ scraperwiki.sqlite.commit()
+
+ parser.process_pages()
+ postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
+ except scraperwiki.CPUTimeExceededError, e:
+ errors.append("Processing pages interrupted")
+
+def process_journal_pdfs(parser, listurl, errors):
+# print "Finding PDFs on " + listurl
+# u = urllib.parse.urlparse(listurl)
+ html = scraperwiki.scrape(listurl)
+ root = lxml.html.fromstring(html)
+ html = None
+ for ahref in root.cssselect("table a"):
+ href = ahref.attrib['href']
+ url = urlparse.urljoin(listurl, href)
+ if -1 != href.find("file://") or -1 != href.find("mailto:"):
+# print "Skipping non-http URL " + url
+ continue
+ if parser.is_already_scraped(url):
+ True
+# print "Skipping already scraped " + url
+ else:
+# print "Will process " + url
+ process_pdf(parser, url, errors)
+
+def test_small_pdfs(parser):
+ # Test with some smaller PDFs
+ errors = []
+ process_pdf(parser, "http://www.gravferdsetaten.oslo.kommune.no/getfile.php/gravferdsetaten%20(GFE)/Internett%20(GFE)/Dokumenter/dokument/Arkivet/Postjournal/Juni/13.06.pdf", errors)
+ process_pdf(parser, "http://www.gravferdsetaten.oslo.kommune.no/getfile.php/gravferdsetaten%20(GFE)/Internett%20(GFE)/Dokumenter/dokument/Arkivet/Postjournal/Juni/12.06.pdf", errors)
+ process_page_queue(parser, errors)
+ report_errors(errors)
+ exit(0)
+
+errors = []
+parser = postlistelib.PDFJournalParser(agency=agency)
+
+#test_small_pdfs(parser)
+
+process_journal_pdfs(parser, "http://www.gravferdsetaten.oslo.kommune.no/offentlig_journal/article43281-14384.html", errors)
+process_page_queue(parser, errors)
+report_errors(errors)
+
diff --git a/scrapersources/postliste-oslo-havn b/scrapersources/postliste-oslo-havn
new file mode 100644
index 0000000..d453ef7
--- /dev/null
+++ b/scrapersources/postliste-oslo-havn
@@ -0,0 +1,86 @@
+# -*- coding: UTF-8 -*-
+# Based on the scraper advanced-scraping-pdf
+# See also
+# https://views.scraperwiki.com/run/pdf-to-html-preview-1/?url=http%3A%2F%2Fwww.stortinget.no%2FGlobal%2Fpdf%2Fpostjournal%2Fpj-2012-05-09.pdf
+
+import scraperwiki
+import json
+from BeautifulSoup import BeautifulSoup
+import datetime
+import dateutil.parser
+import lxml.html
+import sys
+import urlparse
+import re
+lazycache=scraperwiki.swimport('lazycache')
+postlistelib=scraperwiki.swimport('postliste-python-lib')
+
+agency = 'Oslo kommune, Oslo Havn KF'
+
+def report_errors(errors):
+ if 0 < len(errors):
+ print "Errors:"
+ for e in errors:
+ print e
+ exit(1)
+def out_of_cpu(arg, spent, hard, soft):
+ report_errors(arg)
+
+def process_pdf(parser, pdfurl, errors):
+ errors = []
+ postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
+ try:
+ parser.fetch_and_preprocess(pdfurl)
+# except ValueError, e:
+# errors.append(e)
+ except IndexError, e:
+ errors.append(e)
+
+def process_page_queue(parser, errors):
+ try:
+ parser.process_pages()
+ postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
+ except scraperwiki.CPUTimeExceededError, e:
+ errors.append("Processing pages interrupted, ran out of cpu")
+
+def process_journal_pdfs(parser, listurl, errors):
+# print "Finding PDFs on " + listurl
+# u = urllib.parse.urlparse(listurl)
+ html = scraperwiki.scrape(listurl)
+ root = lxml.html.fromstring(html)
+ html = None
+ for ahref in root.cssselect("table a"):
+ href = ahref.attrib['href']
+ url = urlparse.urljoin(listurl, href)
+ if -1 != href.find("file://"):
+# print "Skipping non-http URL " + url
+ continue
+ if parser.is_already_scraped(url):
+ True
+# print "Skipping already scraped " + url
+ else:
+# print "Will process " + url
+ process_pdf(parser, url, errors)
+
+def test_pdfs(parser):
+ parser.debug = True
+ # Test with some smaller PDFs
+ errors = []
+ process_pdf(parser, "http://www.havn.oslo.kommune.no/getfile.php/oslo%20havn%20kf%20(HAV)/Internett%20(HAV)/Dokumenter/Postjournal/Mai/24.05.2012.pdf", errors)
+
+    # This file has a problematic format; the text fragments appear in a
+    # different order than in most journal PDFs.
+ process_pdf(parser, "http://www.havn.oslo.kommune.no/getfile.php/oslo%20havn%20kf%20%28HAV%29/Internett%20%28HAV%29/Dokumenter/Postjournal/Mars/1%20MTMzMDY4NjY3ODI5OTk5Mz.pdf", errors)
+ process_page_queue(parser, errors)
+ report_errors(errors)
+ exit(0)
+
+parser = postlistelib.PDFJournalParser(agency=agency)
+
+#test_pdfs(parser)
+
+errors = []
+process_journal_pdfs(parser, "http://www.havn.oslo.kommune.no/postjournal/", errors)
+process_page_queue(parser, errors)
+report_errors(errors)
+
diff --git a/scrapersources/postliste-oslo-radhusets-forvaltningstjeneste b/scrapersources/postliste-oslo-radhusets-forvaltningstjeneste
new file mode 100644
index 0000000..4f9b5c1
--- /dev/null
+++ b/scrapersources/postliste-oslo-radhusets-forvaltningstjeneste
@@ -0,0 +1,231 @@
+# -*- coding: UTF-8 -*-
+
+import scraperwiki
+import json
+from BeautifulSoup import BeautifulSoup
+import datetime
+import dateutil.parser
+import lxml.html
+import urlparse
+import re
+
+# Make sure ScraperWiki believes this is the source URL for this database
+scraperwiki.scrape("http://www.radhusets-forvaltningstjeneste.oslo.kommune.no/postjournal/")
+
+lazycache=scraperwiki.swimport('lazycache')
+postlistelib=scraperwiki.swimport('postliste-python-lib')
+
+agency = 'Oslo kommune, Rådhusets forvaltningstjeneste'
+
+def report_errors(errors):
+ if 0 < len(errors):
+ print "Errors:"
+ for e in errors:
+ print e
+ exit(1)
+
+def out_of_cpu(arg, spent, hard, soft):
+ report_errors(arg)
+
+# Input YY/X-Z, return YYYY, X, Z
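+# e.g. split_docid("12/123-4") -> (2012, 123, 4)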
+def split_docid(docid):
+ caseyear, rest = docid.split('/')
+ caseseqnr, casedocseq = rest.split('-')
+ caseyear = int(caseyear)
+ caseseqnr = int(caseseqnr)
+    casedocseq = int(casedocseq)
+ if caseyear < 50:
+ caseyear = caseyear + 2000
+ if 50 <= caseyear and caseyear < 100:
+ caseyear = caseyear + 1900
+ return caseyear, caseseqnr, casedocseq
+
+# Input DDMMYYYY, output YYYY-MM-DD
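+# e.g. parse_date("24052012") -> datetime.date(2012, 5, 24); parse_date("Udatert") -> None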
+def parse_date(date):
+ if 'Udatert' == date:
+ return None
+ year = date[4:8]
+ month = date[2:4]
+ day = date[0:2]
+ isodate = year + "-" + month + "-" + day
+ #print date, isodate
+ return dateutil.parser.parse(isodate, dayfirst=True).date()
+
+def parse_entry(pdfurl, lines):
+ print lines
+ print "Entry lines " + str(len(lines))
+ entry = {
+ 'agency' : agency,
+ 'scrapedurl' : pdfurl,
+ }
+ cur = 0
+ while cur < len(lines):
+ line = lines[cur].text
+ #print line
+ if -1 != line.find('Dok.dato:'):
+ entry['docid'] = lines[cur-2].text
+ entry['doctype'] = lines[cur-1].text
+ entry['docdate'] = parse_date(line.replace("Dok.dato:", ""))
+ caseyear, caseseqnr, casedocseq = split_docid(entry['docid'])
+ entry['caseyear'] = caseyear
+ entry['caseseqnr'] = caseseqnr
+ entry['casedocseq'] = casedocseq
+ entry['caseid'] = str(caseyear) + '/' + str(caseseqnr)
+ if -1 != line.find('Jour.dato:'):
+ entry['recorddate'] = parse_date(lines[cur+1].text)
+ cur = cur + 1
+ if -1 != line.find('Arkivdel:'):
+ entry['arkivdel'] = lines[cur+1].text
+ cur = cur + 1
+ if -1 != line.find('Tilg. kode:'):
+ entry['tilgangskode'] = line.replace("Tilg. kode:", "")
+ if -1 != line.find('Sak:'):
+ entry['casedesc'] = lines[cur+1].text
+ cur = cur + 1
+ if -1 != line.find('Dok:'):
+ entry['docdesc'] = lines[cur+1].text
+ cur = cur + 1
+ if -1 != line.find('Par.:'):
+ entry['exemption'] = line.replace("Par.:", "")
+ cur = cur + 1
+ if -1 != line.find('Avsender:'):
+ entry['sender'] = lines[cur+1].text
+ cur = cur + 1
+ if -1 != line.find('Mottaker:'):
+ entry['recipient'] = lines[cur+1].text
+ cur = cur + 1
+ if -1 != line.find('Saksansv:'):
+ entry['saksansvarlig'] = line.replace("Saksansv:", "").strip()
+ if -1 != line.find('Saksbeh:'):
+ entry['saksbehandler'] = lines[cur+1].text
+ cur = cur + 1
+ cur = cur + 1
+ print entry
+ if 'docid' in entry:
+ scraperwiki.sqlite.save(unique_keys=['docid'], data=entry)
+ #return
+
+def parse_pdf(pdfurl, pdfcontent):
+ pdfxml = scraperwiki.pdftoxml(pdfcontent)
+ pages=re.findall('(<page .+?</page>)',pdfxml,flags=re.DOTALL)
+ for page in pages:
+ s = BeautifulSoup(page)
+ lines = s.findAll('text')
+ last = 0
+ cur = 0
+ while cur < len(lines):
+ #print cur, lines[cur]
+ if -1 != lines[cur].text.find('Dok.dato:'):
+ print last, cur-2
+ parse_entry(pdfurl, lines[last:cur-2])
+ last = cur - 2
+ cur = cur + 1
+ return
+ if False:
+ cur = 0
+ entry = { 'agency' : agency, 'scrapedurl' : pdfurl }
+ while cur < len(lines):
+ line = lines[cur].text
+ #print line
+ if -1 != line.find('Dok.dato:'):
+ entry['docid'] = lines[cur-2].text
+ entry['doctype'] = lines[cur-1].text
+ entry['docdate'] = parse_date(line.replace("Dok.dato:", ""))
+ caseyear, caseseqnr, casedocseq = split_docid(entry['docid'])
+ entry['caseyear'] = caseyear
+ entry['caseseqnr'] = caseseqnr
+ entry['casedocseq'] = casedocseq
+ entry['caseid'] = str(caseyear) + '/' + str(caseseqnr)
+ if -1 != line.find('Jour.dato:'):
+ entry['recorddate'] = parse_date(lines[cur+1].text)
+ cur = cur + 1
+ if -1 != line.find('Arkivdel:'):
+ entry['arkivdel'] = lines[cur+1].text
+ cur = cur + 1
+ if -1 != line.find('Tilg. kode:'):
+ entry['tilgangskode'] = line.replace("Tilg. kode:", "")
+ if -1 != line.find('Sak:'):
+ entry['casedesc'] = lines[cur+1].text
+ cur = cur + 1
+ if -1 != line.find('Dok:'):
+ entry['docdesc'] = lines[cur+1].text
+ cur = cur + 1
+ if -1 != line.find('Par.:'):
+ entry['exemption'] = line.replace("Par.:", "")
+ cur = cur + 1
+ if -1 != line.find('Avsender:'):
+ entry['sender'] = lines[cur+1].text
+ cur = cur + 1
+ if -1 != line.find('Mottaker:'):
+ entry['recipient'] = lines[cur+1].text
+ cur = cur + 1
+ if -1 != line.find('Saksansv:'):
+ entry['saksansvarlig'] = line.replace("Saksansv:", "").strip()
+ if -1 != line.find('Saksbeh:'):
+ entry['saksbehandler'] = lines[cur+1].text
+ cur = cur + 1
+ print entry
+ scraperwiki.sqlite.save(unique_keys=['docid'], data=entry)
+ entry = { 'agency' : agency, 'scrapedurl' : pdfurl }
+ cur = cur + 1
+ #return
+
+def process_pdf(parser, pdfurl, errors):
+ postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
+ try:
+ #if True:
+ pdfcontent = scraperwiki.scrape(pdfurl)
+ parse_pdf(pdfurl, pdfcontent)
+ #parser.preprocess(pdfurl, pdfcontent)
+ pdfcontent = None
+# except ValueError, e:
+# errors.append(e)
+ #except IndexError, e:
+ # errors.append(e)
+ except Exception, e:
+ print e
+
+def process_page_queue(parser, errors):
+ try:
+ parser.process_pages()
+ postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
+ except scraperwiki.CPUTimeExceededError, e:
+ errors.append("Processing pages interrupted")
+
+def process_journal_pdfs(parser, listurl, errors):
+# print "Finding PDFs on " + listurl
+# u = urllib.parse.urlparse(listurl)
+ html = scraperwiki.scrape(listurl)
+ root = lxml.html.fromstring(html)
+ html = None
+ for ahref in root.cssselect("table a"):
+ href = ahref.attrib['href']
+ url = urlparse.urljoin(listurl, href)
+ if -1 != href.find("file://") or -1 == url.find(".pdf"):
+# print "Skipping non-http URL " + url
+ continue
+ if parser.is_already_scraped(url):
+ True
+# print "Skipping already scraped " + url
+ else:
+# print "Will process " + url
+ process_pdf(parser, url, errors)
+
+def test_small_pdfs(parser):
+ # Test with some smaller PDFs
+ errors = []
+ process_pdf(parser, "http://www.radhusets-forvaltningstjeneste.oslo.kommune.no/getfile.php/rådhusets%20forvaltningstjeneste%20(RFT)/Intranett%20(RFT)/Dokumenter/Postjournal/11%20November/29112011.pdf", errors)
+ process_pdf(parser, "http://www.radhusets-forvaltningstjeneste.oslo.kommune.no/getfile.php/rådhusets%20forvaltningstjeneste%20(RFT)/Intranett%20(RFT)/Dokumenter/Postjournal/12%20Desember/02122011.pdf", errors)
+ process_page_queue(parser, errors)
+ report_errors(errors)
+ exit(0)
+
+errors = []
+parser = postlistelib.PDFJournalParser(agency=agency)
+
+#test_small_pdfs(parser)
+
+process_journal_pdfs(parser, "http://www.radhusets-forvaltningstjeneste.oslo.kommune.no/postjournal/", errors)
+#process_page_queue(parser, errors)
+report_errors(errors)
+
diff --git a/scrapersources/postliste-python-lib b/scrapersources/postliste-python-lib
new file mode 100644
index 0000000..042d1fd
--- /dev/null
+++ b/scrapersources/postliste-python-lib
@@ -0,0 +1,577 @@
+# -*- coding: utf-8 -*-
+#
+# Python library for parsing public post journals (postlister) in Norway.
+#
+
+# Based on the scraper advanced-scraping-pdf
+#
+# See also
+# https://views.scraperwiki.com/run/pdf-to-html-preview-1/
+
+# Possible sources using format 1 pdf:
+# www.bydel-ullern.oslo.kommune.no
+# www.gravferdsetaten.oslo.kommune.no
+# www.halden.kommune.no (done)
+# www.havn.oslo.kommune.no (done)
+# www.hvaler.kommune.no (done)
+# www.kafjord.kommune.no
+# www.lier.kommune.no
+# www.lindesnes.kommune.no
+# www.naroy.kommune.no
+# www.saltdal.kommune.no
+# www.sogne.kommune.no
+# www.vikna.kommune.no
+#
+# Google search to find more: "Offentlig journal" Seleksjon Sakstittel Dokumenttype Status filetype:pdf
+
+
+import scraperwiki
+import string
+import re
+from BeautifulSoup import BeautifulSoup
+import datetime
+import dateutil.parser
+
+def cpu_spent():
+ import resource
+ usage = resource.getrusage(resource.RUSAGE_SELF)
+ return getattr(usage, 'ru_utime') + getattr(usage, 'ru_stime')
+
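+# Exit (after calling the optional callback) once the CPU time spent exceeds
+# the soft RLIMIT_CPU limit.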
+def exit_if_no_cpu_left(retval, callback=None, arg = None):
+ import resource
+ soft, hard = resource.getrlimit(resource.RLIMIT_CPU)
+ spent = cpu_spent()
+ if soft < spent:
+ if callback is not None:
+ callback(arg, spent, hard, soft)
+ print "Running out of CPU, exiting."
+ exit(retval)
+
+def fetch_url_harder(url, scraper = None):
+ import urllib2
+ html = None
+ for n in [1, 2, 3]:
+ try:
+ if None == scraper:
+ scraper = scraperwiki.scrape
+ html = scraper(url)
+ break
+ except urllib2.URLError, e:
+ print "URLError fetching " + url + ", trying again"
+ return html
+
+class JournalParser:
+ agency = None
+ debug = False
+
+ validdoctypes = ['I', 'U', 'X', 'N']
+ senderdoctypes = ['I', 'X', 'N']
+ recipientdoctypes = ['U']
+ mustfields = {
+ 'agency' : 1,
+ 'docdesc' : 1,
+ 'doctype' : 1,
+ 'caseyear' : 1,
+ 'caseseqnr' : 1,
+ 'casedocseq' : 1,
+ }
+
+ def __init__(self, agency):
+ self.agency = agency
+
+ def is_valid_doctype(self, doctype):
+ return doctype in self.validdoctypes
+
+ def is_sender_doctype(self, doctype):
+ return doctype in self.senderdoctypes
+
+ def is_recipient_doctype(self, doctype):
+ return doctype in self.recipientdoctypes
+
+ def verify_entry(self, entry):
+
+ for field in self.mustfields:
+ if not field in entry:
+ raise ValueError("Missing required field " + field)
+
+ if not self.is_valid_doctype(entry['doctype']):
+            raise ValueError("Invalid doctype " + entry['doctype'])
+
+ if -1 != entry['caseid'].find('-'):
+ raise ValueError("Field caseid should not include dash: " + entry['caseid'])
+
+#
+# Parser of PDFs looking like
+# http://www.storfjord.kommune.no/postliste-18-mai-2012.5056067-105358.html (type 1)
+# http://www.hadsel.kommune.no/component/docman/doc_download/946-offentlig-postjournal-28032012 (type 2)
+# http://www.stortinget.no/Global/pdf/postjournal/pj-2011-06-23.pdf (type 2 variant)
+# Note sender/receiver is not yet parsed for type 2 PDFs
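+#
+# Typical use, as seen in the scrapers in this commit (sketch):
+#   parser = PDFJournalParser(agency='Some agency')
+#   parser.fetch_and_preprocess(pdfurl)  # split the PDF into pages stored in SQL
+#   parser.process_pages()               # parse the stored pages into swdata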
+class PDFJournalParser(JournalParser):
+ pagetable = "unparsedpages"
+ brokenpagetable = "brokenpages"
+ hiddentext = False
+ breakonfailure = True
+
+ def __init__(self, agency, hiddentext=False):
+ self.hiddentext = hiddentext
+ JournalParser.__init__(self, agency=agency)
+
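+    # Return True if the URL is already present in swdata or in the queue of
+    # unparsed pages.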
+ def is_already_scraped(self, url):
+        # Ignore entries where sender and recipient are the result of a broken parser (before 2012-05-25)
+ for sql in ["scrapedurl, sender, recipient from swdata where scrapedurl = '" + url + "' " +
+                    # FIXME Figure out why this does not work
+ #" and not (sender = 'parse error' or recipient != 'parse error') " +
+ "limit 1",
+ "scrapedurl from " + self.pagetable + " where scrapedurl = '" + url + "' limit 1"]:
+ try:
+ result = scraperwiki.sqlite.select(sql)
+                #print sql, " : ", result
+ if 0 < len(result) and u'scrapedurl' in result[0]:
+ return True
+ except Exception as e:
+ #if ('no such table: %s' % self.pagetable) not in str(e) and 'no such table: swdata' not in str(e):
+ # raise
+ print "Exception: %s" % e
+ return False
+
+ # Check if we recognize the page content, and throw if not
+ def is_valid_page(self, pdfurl, pagenum, pagecontent):
+ s = BeautifulSoup(pagecontent)
+ for t in s.findAll('text'):
+ if t.text != " ":
+                if 'Innhold:' == t.text: # type 1 or 2 (ePhorte)
+ s = None
+ return True
+                if 'Arkivdel:' == t.text: # type 3 (doculive)
+ s = None
+ return True
+ s = None
+ if self.debug:
+ print "Unrecognized page format for " + pdfurl
+ raise ValueError("Unrecognized page format for " + pdfurl)
+
+ #
+ # Split PDF content into pages and store in SQL table for later processing.
+    # The process is split in two to better handle large PDFs (like 600 pages)
+    # without running out of CPU time and losing track of what is left to
+ # parse.
+ def preprocess(self, pdfurl, pdfcontent):
+ print "Preprocessing PDF " + pdfurl
+ if not pdfcontent:
+ raise ValueError("No pdf content passed for " + pdfurl)
+ if self.hiddentext:
+ options = '-hidden'
+ else:
+ options = ''
+ xml=scraperwiki.pdftoxml(pdfcontent, options)
+ if self.debug:
+ print xml
+ pages=re.findall('(<page .+?</page>)',xml,flags=re.DOTALL)
+ xml=None
+# print pages[:1][:1000]
+ pagecount = 0
+ datastore = []
+ for page in pages:
+ pagecount = pagecount + 1
+ self.is_valid_page(pdfurl, pagecount, page)
+ data = {
+ 'scrapedurl' : pdfurl,
+ 'pagenum' : pagecount,
+ 'pagecontent' : page,
+ }
+ datastore.append(data)
+ if 0 < len(datastore):
+ scraperwiki.sqlite.save(unique_keys=['scrapedurl', 'pagenum'], data=datastore, table_name=self.pagetable)
+ else:
+ raise ValueError("Unable to find any pages in " + pdfurl)
+ pages = None
+
+ def fetch_and_preprocess(self, pdfurl):
+ pdfcontent = fetch_url_harder(pdfurl)
+ self.preprocess(pdfurl, pdfcontent)
+ pdfcontent = None
+
+ def print_entry(self, entrytext):
+ for i in range(0, len(entrytext)):
+ print str(i) + ": '" + entrytext[i] + "'"
+
+ # ePhorte PDF
+ def parse_entry_type1(self, entrytext, pdfurl):
+ scrapestamputc = datetime.datetime.now()
+ entry = {
+ 'agency' : self.agency,
+ 'scrapestamputc' : scrapestamputc,
+ 'scrapedurl' : pdfurl
+ }
+ i = 0
+ while i < len(entrytext):
+ #print "T: '" + entrytext[i] + "'"
+ if 'Innhold:' == entrytext[i]:
+ tittel = ""
+ # handle multi-line titles
+ while 'Sakstittel:' != entrytext[i+1]:
+ tittel = tittel + " " + entrytext[i+1]
+ i = i + 1
+ entry['docdesc'] = tittel
+ if 'Sakstittel:' == entrytext[i]:
+ sakstittel = ""
+ while 'DokType' != entrytext[i+1]:
+# print "'" + entrytext[i+1] + "'"
+ sakstittel = sakstittel + " " + entrytext[i+1]
+ i = i + 1
+ entry['casedesc'] = sakstittel
+ if 'DokType' == entrytext[i]: # Values I/U/N/X from NOARK 4 table 14.2.11
+ entry['doctype'] = entrytext[i+1]
+ # As seen on http://www.saltdal.kommune.no/images/module.files/2007-05-16.pdf, page 1
+ if entry['doctype'] == 'S':
+ entry['doctype'] = 'X'
+ i = i + 1
+ if 'Sak/dok nr:' == entrytext[i]:
+ # FIXME Split and handle combined sak/løpenr
+ # Use find('penr.:') to avoid non-ascii search string 'Løpenr.:'
+ caseid = None
+ lnr = None
+ if -1 != entrytext[i+4].find('penr.:'):
+ caseid = entrytext[i+1] + entrytext[i+2]
+ lnr = entrytext[i+3]
+ i = i + 4
+ elif -1 != entrytext[i+3].find('penr.:'):
+ caseid = entrytext[i+1]
+ lnr = entrytext[i+2]
+ i = i + 3
+ elif -1 != entrytext[i+2].find('penr.:'):
+ caseid, lnr = entrytext[i+1].split(" ")
+ i = i + 2
+
+ caseyear, caseseqnr = caseid.split("/")
+ entry['caseyear'] = int(caseyear)
+ caseseqnr, casedocseq = caseseqnr.split("-")
+ entry['caseseqnr'] = int(caseseqnr)
+ entry['casedocseq'] = int(casedocseq)
+ entry['caseid'] = caseyear + "/" + caseseqnr
+
+ journalseqnr, journalyear = lnr.split("/")
+ entry['journalid'] = journalyear + "/" + journalseqnr
+ entry['journalyear'] = int(journalyear)
+ entry['journalseqnr'] = int(journalseqnr)
+
+# if -1 != text[i].find('penr.:'): # Use find('penr.:') to avoid non-ascii search string 'Løpenr.:'
+# str = text[i-1]
+# print "S: '" + str + "'"
+# data['journalid'] = str
+# # FIXME handle combined sak/løpenr
+ if 'Journaldato:' == entrytext[i]:
+ entry['recorddate'] = dateutil.parser.parse(entrytext[i-1], dayfirst=True)
+ if 'Dok.dato:' == entrytext[i]:
+ entry['docdate'] = dateutil.parser.parse(entrytext[i-1], dayfirst=True)
+ if 'Tilg.kode Hjemmel:' == entrytext[i] and 'Avsender\mottaker:' != entrytext[i+1]:
+ entry['exemption'] = entrytext[i+1]
+ i = i + 1
+ if 'Tilg.kode' == entrytext[i]:
+ entry['accesscode'] = entrytext[i+1]
+ i = i + 1
+ if 'Hjemmel:' == entrytext[i]:
+ entry['exemption'] = entrytext[i+1]
+ i = i + 1
+ if 'Avsender\mottaker:' == entrytext[i]:
+ if i+1 < len(entrytext): # Non-empty field
+ fratil = entrytext[i+1]
+ i = i + 1
+ if self.is_sender_doctype(entry['doctype']):
+ entry['sender'] = fratil
+ elif self.is_recipient_doctype(entry['doctype']):
+ entry['recipient'] = fratil
+ else:
+ raise ValueError("Case " + entry['caseid'] + " Sender/Recipient with doctype " + entry['doctype'] + " != I/U/X/N in " + pdfurl)
+ if self.debug:
+ print entry
+ i = i + 1
+ return entry
+
+ def parse_case_journal_ref(self, entry, reftext, pdfurl):
+ try:
+ # FIXME Split and handle combined sak/loepenr
+ # Use find('penr.:') to avoid non-ascii search string 'Loepenr.:'
+ caseid = None
+ lnr = None
+ if 4 == len(reftext):
+# print "4 " + str(reftext)
+ caseid = reftext[0] + reftext[1]
+ lnr = reftext[2] + reftext[3]
+# print str(caseid) + " " + str(lnr)
+ elif 3 == len(reftext):
+ if -1 != reftext[0].find("/") and -1 != reftext[2].find("/"):
+# print "31"
+ caseid = reftext[0] + reftext[1]
+ lnr = reftext[2]
+ elif -1 != reftext[2].find("/"):
+# print "32"
+ caseid = reftext[0] + reftext[1]
+ lnr = reftext[2]
+ elif -1 == reftext[2].find("/"):
+# print "33"
+ caseid = reftext[0]
+ lnr = reftext[1] + reftext[2]
+ elif 2 == len(reftext):
+ if -1 == reftext[1].find("/"):
+# print "21"
+ s = reftext[0] + reftext[1]
+# print "S: " + s
+ caseid, lnr = s.split(" ")
+ elif -1 != reftext[1].find("/"):
+# print "22"
+ caseid = reftext[0]
+ lnr = reftext[1]
+ elif 1 == len(reftext):
+ caseid, lnr = reftext[0].split(" ")
+ else:
+ raise ValueError("Unable to parse entry " + str(reftext) + " in " + pdfurl)
+# print "C: " + caseid + " L: " + lnr
+
+ caseyear, caseseqnr = caseid.split("/")
+ entry['caseyear'] = int(caseyear)
+ caseseqnr, casedocseq = caseseqnr.split("-")
+ entry['caseseqnr'] = int(caseseqnr)
+ entry['casedocseq'] = int(casedocseq)
+ entry['caseid'] = caseyear + "/" + caseseqnr
+
+ journalseqnr, journalyear = lnr.split("/")
+ entry['journalid'] = journalyear + "/" + journalseqnr
+ entry['journalyear'] = int(journalyear)
+ entry['journalseqnr'] = int(journalseqnr)
+ except:
+ print "Unable to parse " + str(reftext)
+ return entry
+ def test_parse_case_journal_ref(self):
+ entry = {}
+ self.parse_case_journal_ref(entry, [u'2008/16414-', u'23', u'15060/2012'], "")
+ self.parse_case_journal_ref(entry, [u'2011/15972-1 102773/201', u'1'], "")
+ self.parse_case_journal_ref(entry, [u'2010/2593-2', u'103004/201', u'1'], "")
+ self.parse_case_journal_ref(entry, [u'2011/13415-', u'22', u'100077/201', u'1'], "")
+
+ # ePhorte PDF
+ def parse_entry_type2(self, entrytext, pdfurl):
+ scrapestamputc = datetime.datetime.now()
+ entry = {
+ 'agency' : self.agency,
+ 'scrapestamputc' : scrapestamputc,
+ 'scrapedurl' : pdfurl
+ }
+ i = 0
+ avsender = []
+ mottaker = []
+ while i < len(entrytext):
+ if 'Innhold:' == entrytext[i]:
+ tittel = ""
+ # handle multi-line titles
+ while 'Sakstittel:' != entrytext[i+1]:
+ tittel = tittel + entrytext[i+1]
+ i = i + 1
+ entry['docdesc'] = tittel
+ if 'Sakstittel:' == entrytext[i]:
+ sakstittel = ""
+                # Klassering belongs to a different document type
+ while 'DokType' != entrytext[i+1] and 'Dok.Type:' != entrytext[i+1] and 'Klassering:' != entrytext[i+1]:
+
+# print "'" + entrytext[i+1] + "'"
+ sakstittel = sakstittel + entrytext[i+1]
+ i = i + 1
+ entry['casedesc'] = sakstittel
+ i = i + 1
+ if 'DokType' == entrytext[i] or 'Dok.Type:' == entrytext[i]: # Values I/U/N/X from NOARK 4 table 14.2.11
+ entry['doctype'] = entrytext[i+1]
+ # As seen on http://www.uis.no/getfile.php/Journal%20200612.pdf
+ if entry['doctype'] == 'S':
+ entry['doctype'] = 'X'
+ i = i + 1
+ if 'Sak/dok nr:' == entrytext[i] or 'Sak/dok.nr:' == entrytext[i]:
+ endi = i
+ while endi < len(entrytext):
+ if -1 != entrytext[endi].find('penr.:') or -1 != entrytext[endi].find('penr:'):
+ break
+ endi = endi + 1
+ entry = self.parse_case_journal_ref(entry, entrytext[i+1:endi], pdfurl)
+ i = endi + 1
+# if -1 != text[i].find('penr.:'): # Use find('penr.:') to avoid non-ascii search string 'Løpenr.:'
+# str = text[i-1]
+# print "S: '" + str + "'"
+# data['journalid'] = str
+# # FIXME handle combined sak/løpenr
+ if 'Journaldato:' == entrytext[i]:
+ entry['recorddate'] = dateutil.parser.parse(entrytext[i-1], dayfirst=True)
+ if 'Dok.dato:' == entrytext[i]:
+ entry['docdate'] = dateutil.parser.parse(entrytext[i-1], dayfirst=True)
+ if 'Tilg.kode Hjemmel:' == entrytext[i] and '(enhet/initialer):' != entrytext[i+2]:
+ entry['exemption'] = entrytext[i+1]
+ i = i + 1
+ if 'Tilg.kode' == entrytext[i]:
+ entry['accesscode'] = entrytext[i+1]
+ i = i + 1
+ if 'Hjemmel:' == entrytext[i]:
+ entry['exemption'] = entrytext[i+1]
+ i = i + 1
+# if -1 != text[i].find('Avs./mottaker:'):
+# FIXME Need to handle senders and receivers
+ if 'Mottaker' == entrytext[i]:
+ mottaker.append(entrytext[i-1])
+ if 'Avsender' == entrytext[i]:
+ avsender.append(entrytext[i-1])
+# entry['sender'] = 'parse error'
+# entry['recipient'] = 'parse error'
+ i = i + 1
+ if 0 < len(mottaker):
+ entry['recipient'] = string.join(mottaker, ", ")
+ if 0 < len(avsender):
+ entry['sender'] = string.join(avsender, ", ")
+ return entry
+
+ def parse_page(self, pdfurl, pagenum, pagecontent):
+ print "Scraping " + pdfurl + " page " + str(pagenum)
+ s = BeautifulSoup(pagecontent)
+ datastore = []
+ text = []
+ linecount = 0
+ if self.debug:
+ print s
+ for t in s.findAll('text'):
+ if t.text != " ":
+ text.append(t.text)
+ if self.debug:
+ print str(linecount) + ": " + t.text
+# FIXME Remove length limit when working
+# if 100 <= linecount:
+# break
+ linecount = linecount + 1
+# if -1 != t.text.find("Side:"):
+# print t.text
+ s = None
+
+# print "Found " + str(linecount) + " lines/text fragments in the PDF"
+ if len(text) < linecount:
+            raise ValueError("Text array too short!")
+
+ # First count how many entries to expect on this page, to be able to
+ # verify that all of them were found.
+ entrycount = 0
+ i = 0
+ while i < len(text):
+            if 'Innhold:' == text[i] or \
+               'Arkivdel:' == text[i]: # 'Innhold:' = type 1/2 (ePhorte), 'Arkivdel:' = type 3 (doculive)
+ entrycount = entrycount + 1
+ i = i + 1
+
+ i = 0
+ while i < len(text):
+ if self.debug:
+ print "T: '" + text[i] + "'"
+ if self.debug and -1 != text[i].find("Side:"):
+ print text[i]
+ if 'Innhold:' == text[i]:
+ endi = i + 1
+ pdfparser = None
+ format = "unknown"
+ while endi < len(text):
+ if 'Klassering:' == text[endi]:
+ pdfparser = self.parse_entry_type2
+ format = "type2"
+ if 'Avsender\mottaker:' == text[endi]:
+ pdfparser = self.parse_entry_type1
+ format = "type1"
+ if 'Innhold:' == text[endi]:
+ break
+ endi = endi + 1
+ if self.debug:
+ print "Entry " + str(entrycount) + " from " + str(i) + " to " + str(endi) + " ie " + str(endi - i) + " lines"
+ try:
+ entry = pdfparser(text[i:endi], pdfurl)
+ if 'caseid' not in entry or entry['caseid'] is None or \
+ not self.is_valid_doctype(entry['doctype']):
+ raise ValueError("Unable to parse " + pdfurl + " as format " + format + "[" + str(entry) + "]")
+# print entry
+ datastore.append(entry)
+ i = endi - 2
+ except:
+ self.print_entry(text[i:endi])
+ raise
+ i = i + 1
+# print data
+# print "Found " + str(len(datastore)) + " of " + str(entrycount) + " entries"
+ if entrycount != len(datastore):
+# print text
+ raise ValueError("Unable to parse all entries in " + pdfurl)
+ if 0 == len(datastore):
+ print "Unable to find any entries in " + pdfurl
+ else:
+ scraperwiki.sqlite.save(unique_keys=['caseid', 'casedocseq'], data=datastore)
+ datastore = None
+ text = None
+
+ def process_pages(self):
+ try:
+ sqlselect = "* from " + self.pagetable + " limit 1"
+ pageref = scraperwiki.sqlite.select(sqlselect)
+ while pageref:
+ scrapedurl = pageref[0]['scrapedurl']
+ pagenum = pageref[0]['pagenum']
+ pagecontent = pageref[0]['pagecontent']
+# print "Found " + scrapedurl + " page " + str(pagenum) + " length " + str(len(pagecontent))
+ try:
+ sqldelete = "delete from " + self.pagetable + " where scrapedurl = '" + scrapedurl + "' and pagenum = " + str(pagenum)
+ self.parse_page(scrapedurl, pagenum, pagecontent)
+# print "Trying to: " + sqldelete
+ scraperwiki.sqlite.execute(sqldelete)
+ except ValueError, e:
+ brokenpage = {
+ 'scrapedurl' : scrapedurl,
+ 'pagenum' : pagenum,
+ 'pagecontent' : pagecontent,
+ }
+ print "Broken page %d from %s" % (pagenum, scrapedurl)
+ scraperwiki.sqlite.save(unique_keys=['scrapedurl', 'pagenum'], data=brokenpage, table_name=self.brokenpagetable)
+ scraperwiki.sqlite.execute(sqldelete)
+ scraperwiki.sqlite.commit()
+ pageref = scraperwiki.sqlite.select(sqlselect)
+ except scraperwiki.sqlite.SqliteError, e:
+ print str(e)
+ raise
+
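+# Summarise which column names are used by a selection of postliste scrapers,
+# via the ScraperWiki getinfo API, sorted by how many scrapers use each key.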
+def fieldlist():
+ import urllib2
+ import json
+
+ scrapers = [
+ 'postliste-universitetet-i-oslo',
+ 'postliste-lindesnes',
+ 'postliste-kristiansund',
+ 'postliste-stortinget',
+ 'postliste-arendal',
+ 'postliste-oep',
+ 'postliste-ballangen',
+ 'postliste-hadsel',
+ 'postliste-storfjord',
+ 'postliste-oslo-havn',
+ ]
+
+ keys = {}
+
+ for scraper in scrapers:
+ url = 'https://api.scraperwiki.com/api/1.0/scraper/getinfo?format=jsondict&name=' + scraper + '&version=-1'
+ response = urllib2.urlopen(url)
+ html = response.read()
+ data = json.loads(html)
+ if 'swdata' in data[0]['datasummary']['tables']:
+ for key in data[0]['datasummary']['tables']['swdata']['keys']:
+ key = key.lower()
+ if key in keys:
+ keys[key].append(scraper)
+ else:
+ keys[key] = [scraper]
+ def lensort(a, b):
+ return cmp(len(keys[b]), len(keys[a]))
+
+ for key in sorted(keys.keys(), lensort):
+ print len(keys[key]), key, str(keys[key])
+
+if __name__ == "scraper":
+ fieldlist()
+
diff --git a/scrapersources/postliste-risr-kommune b/scrapersources/postliste-risr-kommune
new file mode 100644
index 0000000..cb87bdb
--- /dev/null
+++ b/scrapersources/postliste-risr-kommune
@@ -0,0 +1,126 @@
+# -*- coding: UTF-8 -*-
+# Based on the scraper advanced-scraping-pdf
+# See also
+# https://views.scraperwiki.com/run/pdf-to-html-preview-1/?url=http%3A%2F%2Fwww.stortinget.no%2FGlobal%2Fpdf%2Fpostjournal%2Fpj-2012-05-09.pdf
+
+import scraperwiki
+import json
+from BeautifulSoup import BeautifulSoup
+import datetime
+import dateutil.parser
+import lxml.html
+#import resource
+import sys
+#import urlparse
+#import gc
+import re
+#lazycache=scraperwiki.swimport('lazycache')
+#postlistelib=scraperwiki.swimport('postliste-python-lib')
+
+agency = 'Risør kommune'
+
+import mechanize
+
+# ASPX pages are some of the hardest challenges because they use javascript and forms to navigate.
+# Almost always the links go through the function __doPostBack(eventTarget, eventArgument),
+# which you have to simulate with the mechanize form handling library.
+
+# This example shows how to follow the Next page link
+
+url = 'http://159.171.0.169/ris/Modules/innsyn.aspx?mode=pl&SelPanel=0&ObjectType=ePhorteRegistryEntry&VariantType=Innsyn&ViewType=List&Query=RecordDate%3a%28-7%29+AND+DocumentType%3a%28I%2cU%29'
+br = mechanize.Browser()
+
+# sometimes the server is sensitive to this information
+br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
+response = br.open(url)
+
+html = response.read()
+
+
+
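+# Walk up to six result pages by locating the "Neste" (next) postback link and
+# submitting the aspnetForm with the matching __EVENTTARGET.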
+for pagenum in range(6):
+ print "Page %d page length %d" % (pagenum, len(html))
+ #print html
+ #print "Clinicians found:", re.findall("PDetails.aspx\?ProviderId.*?>(.*?)</a>", html)
+
+
+ mnextlink = re.search("javascript:__doPostBack\('ctl00\$ctl00\$ctl00\$WebPartManager\$wp1243460126ViewPart\$ctl04',''\).>Neste", html)
+ #print mnextlink
+ if not mnextlink:
+ break
+
+ br.select_form(name='aspnetForm')
+ br.form.set_all_readonly(False)
+ br['__EVENTTARGET'] = 'ctl00$ctl00$ctl00$WebPartManager$wp1243460126ViewPart$ctl04' #'ProviderSearchResultsTable1$NextLinkButton'
+ br['__EVENTARGUMENT'] = ''
+ br.submit()
+
+ html = br.response().read()
+ #print len(html)
+
+
+
+
+# def report_errors(errors):
+# if 0 < len(errors):
+# print "Errors:"
+# for e in errors:
+# print e
+# exit(1)
+# def out_of_cpu(arg, spent, hard, soft):
+# report_errors(arg)
+#
+# def process_pdf(parser, pdfurl, errors):
+# errors = []
+# postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
+# try:
+# pdfcontent = scraperwiki.scrape(pdfurl)
+# parser.preprocess(pdfurl, pdfcontent)
+# pdfcontent = None
+# # except ValueError, e:
+# # errors.append(e)
+# except IndexError, e:
+# errors.append(e)
+#
+# def process_page_queue(parser, errors):
+# try:
+# parser.process_pages()
+# postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
+# except scraperwiki.CPUTimeExceededError, e:
+# errors.append("Processing pages interrupted")
+#
+# def process_journal_pdfs(parser, listurl, errors):
+# # print "Finding PDFs on " + listurl
+# # u = urllib.parse.urlparse(listurl)
+# html = scraperwiki.scrape(listurl)
+# root = lxml.html.fromstring(html)
+# html = None
+# for ahref in root.cssselect("table a"):
+# href = ahref.attrib['href']
+# url = urlparse.urljoin(listurl, href)
+# if -1 != href.find("file://"):
+# # print "Skipping non-http URL " + url
+# continue
+# if parser.is_already_scraped(url):
+# True
+# # print "Skipping already scraped " + url
+# else:
+# # print "Will process " + url
+# process_pdf(parser, url, errors)
+#
+# def test_small_pdfs():
+# # Test with some smaller PDFs
+# errors = []
+# process_pdf("http://home.nuug.no/~pere/uio-postjournal/2011-16.pdf", errors)
+# process_pdf("http://home.nuug.no/~pere/uio-postjournal/2011-52.pdf", errors)
+# process_page_queue(errors)
+# report_errors(errors)
+# exit(0)
+#
+# #test_small_pdfs()
+# errors = []
+# parser = postlistelib.PDFJournalParser(agency=agency)
+# process_journal_pdfs(parser, "http://www.havn.oslo.kommune.no/postjournal/", errors)
+# process_page_queue(parser, errors)
+# report_errors(errors)
+
diff --git a/scrapersources/postliste-ruter b/scrapersources/postliste-ruter
new file mode 100644
index 0000000..757d6be
--- /dev/null
+++ b/scrapersources/postliste-ruter
@@ -0,0 +1,81 @@
+# -*- coding: UTF-8 -*-
+
+import scraperwiki
+import json
+from BeautifulSoup import BeautifulSoup
+import datetime
+import dateutil.parser
+import lxml.html
+import resource
+import sys
+import urlparse
+import re
+lazycache=scraperwiki.swimport('lazycache')
+postlistelib=scraperwiki.swimport('postliste-python-lib')
+
+agency = 'Ruter AS'
+
+def report_errors(errors):
+ if 0 < len(errors):
+ print "Errors:"
+ for e in errors:
+ print e
+ exit(1)
+def out_of_cpu(arg, spent, hard, soft):
+ report_errors(arg)
+
+def process_pdf(parser, pdfurl, errors):
+ errors = []
+ postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
+ try:
+ pdfcontent = scraperwiki.scrape(pdfurl)
+ parser.preprocess(pdfurl, pdfcontent)
+ pdfcontent = None
+# except ValueError, e:
+# errors.append(e)
+ except IndexError, e:
+ errors.append(e)
+
+def process_page_queue(parser, errors):
+ try:
+ parser.process_pages()
+ postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
+ except scraperwiki.CPUTimeExceededError, e:
+ errors.append("Processing pages interrupted")
+
+def process_journal_pdfs(parser, listurl, errors):
+# print "Finding PDFs on " + listurl
+# u = urllib.parse.urlparse(listurl)
+ html = scraperwiki.scrape(listurl)
+ root = lxml.html.fromstring(html)
+ html = None
+ for ahref in root.cssselect("div.vedlegg a"):
+ href = ahref.attrib['href']
+ url = urlparse.urljoin(listurl, href)
+ if -1 != href.find("file://") or -1 == url.find(".pdf"):
+# print "Skipping non-http URL " + url
+ continue
+ if parser.is_already_scraped(url):
+ True
+# print "Skipping already scraped " + url
+ else:
+# print "Will process " + url
+ process_pdf(parser, url, errors)
+
+def test_small_pdfs(parser):
+ # Test with some smaller PDFs
+ errors = []
+ process_pdf(parser, "http://www2.ruter.no/Documents/Offentlig_journal/2012_Uke_24.pdf?epslanguage=no", errors)
+ process_page_queue(parser, errors)
+ report_errors(errors)
+ exit(0)
+
+errors = []
+parser = postlistelib.PDFJournalParser(agency=agency)
+
+#test_small_pdfs(parser)
+
+process_journal_pdfs(parser, "http://www2.ruter.no/verdt-a-vite/presse/offentlig-journal/", errors)
+process_page_queue(parser, errors)
+report_errors(errors)
+
diff --git a/scrapersources/postliste-saltdal b/scrapersources/postliste-saltdal
new file mode 100644
index 0000000..0650d6c
--- /dev/null
+++ b/scrapersources/postliste-saltdal
@@ -0,0 +1,98 @@
+# -*- coding: UTF-8 -*-
+
+import scraperwiki
+import json
+from BeautifulSoup import BeautifulSoup
+import datetime
+import dateutil.parser
+import lxml.html
+import resource
+import sys
+import urllib2
+import urlparse
+import re
+lazycache=scraperwiki.swimport('lazycache')
+postlistelib=scraperwiki.swimport('postliste-python-lib')
+
+agency = 'Saltdal kommune'
+
+def report_errors(errors):
+ if 0 < len(errors):
+ print "Errors:"
+ laste = None
+ for e in errors:
+ print e
+ laste = e
+ raise e
+
+def out_of_cpu(arg, spent, hard, soft):
+ report_errors(arg)
+
+def process_pdf(parser, pdfurl, errors):
+ errors = []
+ postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
+ try:
+ pdfcontent = scraperwiki.scrape(pdfurl)
+ parser.preprocess(pdfurl, pdfcontent)
+ pdfcontent = None
+# except ValueError, e:
+# errors.append(e)
+ except IndexError, e:
+ errors.append(e)
+ except ValueError, e:
+ errors.append(e)
+ except urllib2.HTTPError, e:
+ errors.append(e)
+
+def process_page_queue(parser, errors):
+ try:
+ parser.process_pages()
+ postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
+ except scraperwiki.CPUTimeExceededError, e:
+ errors.append(e)
+
+def process_journal_pdfs(parser, listurl, errors):
+# print "Finding PDFs on " + listurl
+# u = urllib.parse.urlparse(listurl)
+ html = scraperwiki.scrape(listurl)
+ root = lxml.html.fromstring(html)
+ html = None
+ for ahref in root.cssselect("table a"):
+ href = ahref.attrib['href']
+ url = urlparse.urljoin(listurl, href)
+ if -1 != href.find("file://") or -1 == url.find(".pdf"):
+# print "Skipping non-http URL " + url
+ continue
+ if parser.is_already_scraped(url):
+ True
+# print "Skipping already scraped " + url
+ else:
+# print "Will process " + url
+ process_pdf(parser, url, errors)
+
+def test_small_pdfs(parser):
+ # Test with some smaller PDFs
+ errors = []
+ #parser.debug = True
+ newurl = "http://www.saltdal.kommune.no/images/module.files/010612.pdf"
+ if not parser.is_already_scraped(newurl):
+ process_pdf(parser, newurl, errors) # New format
+ if parser.is_already_scraped(newurl):
+ print "Already parsed"
+ else:
+ raise ValueError("Failed to parse")
+# process_pdf(parser, "http://www.saltdal.kommune.no/images/module.files/2007-01-31.pdf", errors) # Old format
+ process_page_queue(parser, errors)
+ report_errors(errors)
+ exit(0)
+
+errors = []
+parser = postlistelib.PDFJournalParser(agency=agency)
+#parser.debug = True
+
+#test_small_pdfs(parser)
+
+process_journal_pdfs(parser, "http://www.saltdal.kommune.no/postlister.html", errors)
+process_page_queue(parser, errors)
+report_errors(errors)
+
diff --git a/scrapersources/postliste-sivilombudsmannen b/scrapersources/postliste-sivilombudsmannen
new file mode 100644
index 0000000..0bf5914
--- /dev/null
+++ b/scrapersources/postliste-sivilombudsmannen
@@ -0,0 +1,81 @@
+# -*- coding: UTF-8 -*-
+
+import scraperwiki
+import json
+from BeautifulSoup import BeautifulSoup
+import datetime
+import dateutil.parser
+import lxml.html
+import resource
+import sys
+import urlparse
+import re
+lazycache=scraperwiki.swimport('lazycache')
+postlistelib=scraperwiki.swimport('postliste-python-lib')
+
+agency = 'Sivilombudsmannen'
+
+def report_errors(errors):
+ if 0 < len(errors):
+ print "Errors:"
+ for e in errors:
+ print e
+ exit(1)
+def out_of_cpu(arg, spent, hard, soft):
+ report_errors(arg)
+
+def process_pdf(parser, pdfurl, errors):
+ errors = []
+ postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
+ try:
+ pdfcontent = scraperwiki.scrape(pdfurl)
+ parser.preprocess(pdfurl, pdfcontent)
+ pdfcontent = None
+# except ValueError, e:
+# errors.append(e)
+ except IndexError, e:
+ errors.append(e)
+
+def process_page_queue(parser, errors):
+ try:
+ parser.process_pages()
+ postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
+ except scraperwiki.CPUTimeExceededError, e:
+ errors.append("Processing pages interrupted")
+
+def process_journal_pdfs(parser, listurl, errors):
+# print "Finding PDFs on " + listurl
+# u = urllib.parse.urlparse(listurl)
+ html = scraperwiki.scrape(listurl)
+ root = lxml.html.fromstring(html)
+ html = None
+ for ahref in root.cssselect("div.rightColumn a"):
+ href = ahref.attrib['href']
+ url = urlparse.urljoin(listurl, href)
+ if -1 != href.find("file://") or -1 == url.find(".pdf"):
+# print "Skipping non-http URL " + url
+ continue
+ if parser.is_already_scraped(url):
+ True
+# print "Skipping already scraped " + url
+ else:
+# print "Will process " + url
+ process_pdf(parser, url, errors)
+
+def test_small_pdfs(parser):
+ # Test with some smaller PDFs
+ errors = []
+ process_pdf(parser, "http://www.sivilombudsmannen.no/getfile.php/Dokumenter/Journaler/11.06.2012.pdf", errors)
+ process_page_queue(parser, errors)
+ report_errors(errors)
+ exit(0)
+
+errors = []
+parser = postlistelib.PDFJournalParser(agency=agency)
+
+#test_small_pdfs(parser)
+
+process_journal_pdfs(parser, "http://www.sivilombudsmannen.no/offentlig-journal/", errors)
+process_page_queue(parser, errors)
+report_errors(errors)
+
diff --git a/scrapersources/postliste-sogne b/scrapersources/postliste-sogne
new file mode 100644
index 0000000..afa4fdf
--- /dev/null
+++ b/scrapersources/postliste-sogne
@@ -0,0 +1,81 @@
+# -*- coding: UTF-8 -*-
+
+import scraperwiki
+import json
+from BeautifulSoup import BeautifulSoup
+import datetime
+import dateutil.parser
+import lxml.html
+import resource
+import sys
+import urlparse
+import re
+lazycache=scraperwiki.swimport('lazycache')
+postlistelib=scraperwiki.swimport('postliste-python-lib')
+
+agency = 'Søgne kommune'
+
+def report_errors(errors):
+ if 0 < len(errors):
+ print "Errors:"
+ for e in errors:
+ print e
+ exit(1)
+def out_of_cpu(arg, spent, hard, soft):
+ report_errors(arg)
+
+def process_pdf(parser, pdfurl, errors):
+ errors = []
+ postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
+ try:
+ pdfcontent = scraperwiki.scrape(pdfurl)
+ parser.preprocess(pdfurl, pdfcontent)
+ pdfcontent = None
+# except ValueError, e:
+# errors.append(e)
+ except IndexError, e:
+ errors.append(e)
+
+def process_page_queue(parser, errors):
+ try:
+ parser.process_pages()
+ postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
+ except scraperwiki.CPUTimeExceededError, e:
+ errors.append("Processing pages interrupted")
+
+def process_journal_pdfs(parser, listurl, errors):
+# print "Finding PDFs on " + listurl
+# u = urllib.parse.urlparse(listurl)
+ html = scraperwiki.scrape(listurl)
+ root = lxml.html.fromstring(html)
+ html = None
+ for ahref in root.cssselect("div#ReadArea a"):
+ href = ahref.attrib['href']
+ url = urlparse.urljoin(listurl, href)
+ if -1 != href.find("file://") or -1 == url.find(".pdf"):
+# print "Skipping non-http URL " + url
+ continue
+ if parser.is_already_scraped(url):
+ True
+# print "Skipping already scraped " + url
+ else:
+# print "Will process " + url
+ process_pdf(parser, url, errors)
+
+def test_small_pdfs(parser):
+ # Test with some smaller PDFs
+ errors = []
+ process_pdf(parser, "http://www.sogne.kommune.no/Documents/Postlister/2012.06.18.pdf", errors)
+ process_page_queue(parser, errors)
+ report_errors(errors)
+ exit(0)
+
+errors = []
+parser = postlistelib.PDFJournalParser(agency=agency)
+
+#test_small_pdfs(parser)
+
+process_journal_pdfs(parser, "http://www.sogne.kommune.no/Organisasjon1/Administrasjonsavdelingen/Arkivet/Postlister/", errors)
+process_page_queue(parser, errors)
+report_errors(errors)
+
diff --git a/scrapersources/postliste-storfjord b/scrapersources/postliste-storfjord
new file mode 100644
index 0000000..4702f8d
--- /dev/null
+++ b/scrapersources/postliste-storfjord
@@ -0,0 +1,82 @@
+# -*- coding: UTF-8 -*-
+# Based on the scraper advanced-scraping-pdf
+# See also
+# https://views.scraperwiki.com/run/pdf-to-html-preview-1/?url=http%3A%2F%2Fwww.stortinget.no%2FGlobal%2Fpdf%2Fpostjournal%2Fpj-2012-05-09.pdf
+
+import scraperwiki
+import json
+from BeautifulSoup import BeautifulSoup
+import datetime
+import dateutil.parser
+import lxml.html
+import urlparse
+import re
+lazycache=scraperwiki.swimport('lazycache')
+postlistelib=scraperwiki.swimport('postliste-python-lib')
+
+agency = 'Storfjord kommune'
+
+def report_errors(errors):
+ if 0 < len(errors):
+ print "Errors:"
+ for e in errors:
+ print e
+ exit(1)
+
+def out_of_cpu(arg, spent, hard, soft):
+ report_errors(arg)
+
+def process_pdf(parser, pdfurl, errors):
+ errors = []
+ postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
+ try:
+ parser.fetch_and_preprocess(pdfurl)
+# except ValueError, e:
+# errors.append(e)
+ except IndexError, e:
+ errors.append(e)
+
+def process_page_queue(parser, errors):
+ try:
+ parser.process_pages()
+ postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
+ except scraperwiki.CPUTimeExceededError, e:
+ errors.append("Processing pages interrupted")
+
+def process_journal_pdfs(parser, listurl, errors):
+# print "Finding PDFs on " + listurl
+# u = urllib.parse.urlparse(listurl)
+ html = scraperwiki.scrape(listurl)
+ root = lxml.html.fromstring(html)
+ html = None
+ for ahref in root.cssselect("div.main a"):
+ href = ahref.attrib['href']
+ url = urlparse.urljoin(listurl, href)
+ if -1 == url.find("postliste-"):
+ continue
+ if parser.is_already_scraped(url):
+ True
+# print "Skipping already scraped " + url
+ else:
+# print "Will process " + url
+ process_pdf(parser, url, errors)
+
+def test_small_pdfs(parser):
+ # Test with some smaller PDFs
+ errors = []
+ process_pdf(parser, "http://www.storfjord.kommune.no/postliste-18-mai-2012.5056067-105358.html", errors)
+ process_pdf(parser, "http://www.storfjord.kommune.no/postliste-16-mai-2012.5056059-105358.html", errors)
+ process_page_queue(parser, errors)
+ report_errors(errors)
+ exit(0)
+
+parser = postlistelib.PDFJournalParser(agency=agency)
+#test_small_pdfs(parser)
+
+errors = []
+process_journal_pdfs(parser, "http://www.storfjord.kommune.no/postliste.105358.no.html", errors)
+for page in range(2,91):
+ process_journal_pdfs(parser, "http://www.storfjord.kommune.no/?cat=105358&apage=" + str(page), errors)
+process_page_queue(parser, errors)
+report_errors(errors)
+
diff --git a/scrapersources/postliste-stortinget b/scrapersources/postliste-stortinget
new file mode 100644
index 0000000..98fd7d6
--- /dev/null
+++ b/scrapersources/postliste-stortinget
@@ -0,0 +1,90 @@
+# Based on the scraper advanced-scraping-pdf
+# See also
+# https://views.scraperwiki.com/run/pdf-to-html-preview-1/?url=http%3A%2F%2Fwww.stortinget.no%2FGlobal%2Fpdf%2Fpostjournal%2Fpj-2012-05-09.pdf
+import scraperwiki
+from BeautifulSoup import BeautifulSoup
+import datetime
+import dateutil.parser
+import lxml.html
+import urlparse
+import resource
+import sys
+postlistelib=scraperwiki.swimport('postliste-python-lib')
+
+def find_journal_pdfs(parser, listurl):
+# print "Finding PDFs on " + listurl
+ html = postlistelib.fetch_url_harder(listurl)
+
+ root = lxml.html.fromstring(html)
+ pdfurls = []
+ for ahref in root.cssselect("div.mainbody a"):
+ href = ahref.attrib['href']
+ url = urlparse.urljoin(listurl, href)
+ if -1 != href.find("file://"):
+# print "Skipping non-http URL " + url
+ continue
+ if parser.is_already_scraped(url):
+            pass
+# print "Skipping already scraped " + url
+ else:
+# print "Will process " + url
+ pdfurls.append(url)
+ return pdfurls
+
+def fetch_and_preprocess(parser, pdfurl):
+ pdfcontent = postlistelib.fetch_url_harder(pdfurl)
+ parser.preprocess(pdfurl, pdfcontent)
+ pdfcontent = None
+
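+# Gather journal PDF URLs from the overview page of each parliamentary session.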
+def add_pdf_lists(parser, pdfurls):
+ for period in [
+ "",
+ "_2010-2011",
+ "-2009-2010",
+ "-2008-2009",
+ ]:
+ url = "http://www.stortinget.no/no/Stortinget-og-demokratiet/Administrasjonen/Dokumentoffentlighet/Stortingets-offentlige-postjournal" + period + "/"
+ pdfurls.extend(find_journal_pdfs(parser, url))
+
+
+def report_errors(errors):
+ if 0 < len(errors):
+ print "Errors:"
+ for e in errors:
+ print e
+        raise ValueError("Errors found while scraping")
+
+def no_cpu_left(arg, spent, soft, hard):
+ report_errors(arg)
+
+agency = 'Stortinget'
+parser = postlistelib.PDFJournalParser(agency=agency)
+#parser.debug = True
+
+if False:
+ pdfurl = "http://www.stortinget.no/Global/pdf/postjournal/pj-2010-06-04-05.pdf"
+    fetch_and_preprocess(parser, pdfurl)
+ exit(0)
+
+pdfurls = []
+add_pdf_lists(parser, pdfurls)
+
+# Fetch all journal PDFs
+errors = []
+for pdfurl in pdfurls:
+ postlistelib.exit_if_no_cpu_left(0, callback=no_cpu_left, arg = errors)
+ try:
+ parser.fetch_and_preprocess(pdfurl)
+ except ValueError, e:
+ errors.append(e)
+ except IndexError, e:
+ errors.append(e)
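+# Parse the queued pages into journal entries.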
+try:
+ parser.process_pages()
+except ValueError, e:
+ errors.append(e)
+except IndexError, e:
+ errors.append(e)
+
+report_errors(errors)
+
diff --git a/scrapersources/postliste-universitetet-i-oslo b/scrapersources/postliste-universitetet-i-oslo
new file mode 100644
index 0000000..be7b77b
--- /dev/null
+++ b/scrapersources/postliste-universitetet-i-oslo
@@ -0,0 +1,125 @@
+# -*- coding: UTF-8 -*-
+# Based on the scraper advanced-scraping-pdf
+# See also
+# https://views.scraperwiki.com/run/pdf-to-html-preview-1/?url=http%3A%2F%2Fwww.stortinget.no%2FGlobal%2Fpdf%2Fpostjournal%2Fpj-2012-05-09.pdf
+
+import scraperwiki
+import json
+from BeautifulSoup import BeautifulSoup
+import datetime
+import dateutil.parser
+import lxml.html
+import resource
+import sys
+import urlparse
+import re
+
+# Make sure ScraperWiki records this URL as the source for this dataset
+scraperwiki.scrape("http://www.uio.no/om/journal/")
+
+lazycache=scraperwiki.swimport('lazycache')
+postlistelib=scraperwiki.swimport('postliste-python-lib')
+
+agency = 'Universitetet i Oslo'
+
+def report_errors(errors):
+ if 0 < len(errors):
+ print "Errors:"
+ for e in errors:
+ print e
+ raise ValueError("Something went wrong")
+
+def out_of_cpu(arg, spent, hard, soft):
+ report_errors(arg)
+
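+# Download one PDF and hand it to the parser for preprocessing; ValueError and IndexError are collected in errors.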
+def process_pdf(parser, pdfurl, errors):
+ postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
+ try:
+ pdfcontent = scraperwiki.scrape(pdfurl)
+ parser.preprocess(pdfurl, pdfcontent)
+ pdfcontent = None
+ except ValueError, e:
+ errors.append(e)
+ except IndexError, e:
+ errors.append(e)
+
+def process_page_queue(parser, errors):
+ try:
+ parser.process_pages()
+ postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
+ except scraperwiki.CPUTimeExceededError, e:
+ errors.append("Processing pages interrupted")
+
+def process_journal_pdfs(parser, listurl, errors):
+# print "Finding PDFs on " + listurl
+# u = urllib.parse.urlparse(listurl)
+ html = scraperwiki.scrape(listurl)
+ root = lxml.html.fromstring(html)
+ html = None
+ for ahref in root.cssselect("table a"):
+ href = ahref.attrib['href']
+ url = urlparse.urljoin(listurl, href)
+ if -1 != href.find("file://") or -1 == url.find(".pdf"):
+# print "Skipping non-http URL " + url
+ continue
+ if parser.is_already_scraped(url):
+            pass
+# print "Skipping already scraped " + url
+ else:
+# print "Will process " + url
+ process_pdf(parser, url, errors)
+
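+# Walk a paginated per-year directory listing and process every PDF found there; not used in the main run below.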
+def process_journal_pdf_directory(parser, listurl, errors):
+ #html = scraperwiki.scrape(listurl)
+ html = lazycache.lazycache(listurl)
+ root = lxml.html.fromstring(html)
+ html = None
+
+ pdflisturls = []
+ for ahref in root.cssselect("span.vrtx-paging-wrapper a"):
+ href = ahref.attrib['href']
+ url = urlparse.urljoin(listurl, href)
+ pdflisturls.append(url)
+# print pdflisturls
+
+ for listurl in pdflisturls:
+ html = scraperwiki.scrape(listurl)
+ root = lxml.html.fromstring(html)
+ html = None
+ urlseen = {}
+ for ahref in root.cssselect("div.vrtx-resource a"):
+ href = ahref.attrib['href']
+ url = urlparse.urljoin(listurl, href)
+ if -1 == url.find(".pdf"):
+ continue
+ # Ignore duplicates with M: as part of the name
+ if -1 != url.find("/M%"):
+ continue
+ if url in urlseen or parser.is_already_scraped(url):
+                pass
+# print "Skipping already scraped " + url
+ else:
+# print "Will process " + url
+ process_pdf(parser, url, errors)
+ urlseen[url] = 1
+
+def test_small_pdfs(parser):
+ # Test with some smaller PDFs
+ errors = []
+ process_pdf(parser, "http://home.nuug.no/~pere/uio-postjournal/2011-16.pdf", errors)
+ process_pdf(parser, "http://home.nuug.no/~pere/uio-postjournal/2011-52.pdf", errors)
+ process_page_queue(parser, errors)
+ report_errors(errors)
+ exit(0)
+
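+# Main run: process the PDFs linked from the journal front page; the per-year directory walks stay commented out.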
+errors = []
+parser = postlistelib.PDFJournalParser(agency=agency)
+
+#test_small_pdfs(parser)
+
+process_journal_pdfs(parser, "http://www.uio.no/om/journal/", errors)
+#process_journal_pdf_directory(parser, "http://www.uio.no/om/journal/2012/", errors)
+#process_journal_pdf_directory(parser, "http://www.uio.no/om/journal/2011/", errors)
+process_page_queue(parser, errors)
+report_errors(errors)
+
diff --git a/scrapersources/postliste-universitetet-i-stavanger b/scrapersources/postliste-universitetet-i-stavanger
new file mode 100644
index 0000000..5852cb7
--- /dev/null
+++ b/scrapersources/postliste-universitetet-i-stavanger
@@ -0,0 +1,89 @@
+# -*- coding: UTF-8 -*-
+
+import scraperwiki
+import json
+from BeautifulSoup import BeautifulSoup
+import datetime
+import dateutil.parser
+import lxml.html
+import resource
+import sys
+import urlparse
+import re
+lazycache=scraperwiki.swimport('lazycache')
+postlistelib=scraperwiki.swimport('postliste-python-lib')
+
+agency = 'Universitetet i Stavanger'
+
+def report_errors(errors):
+ if 0 < len(errors):
+ print "Errors:"
+ for e in errors:
+ print e
+ exit(1)
+def out_of_cpu(arg, spent, hard, soft):
+ report_errors(arg)
+
+def process_pdf(parser, pdfurl, errors):
+ postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
+ try:
+ pdfcontent = scraperwiki.scrape(pdfurl)
+ parser.preprocess(pdfurl, pdfcontent)
+ pdfcontent = None
+# except ValueError, e:
+# errors.append(e)
+ except IndexError, e:
+ errors.append(e)
+
+def process_page_queue(parser, errors):
+ try:
+ parser.process_pages()
+ postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
+ except scraperwiki.CPUTimeExceededError, e:
+ errors.append("Processing pages interrupted")
+
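+# The UiS journal page links to one article per period; fetch each article and process the PDFs it links to.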
+def process_journal_pdfs(parser, listurl, errors):
+# print "Finding PDFs on " + listurl
+# u = urllib.parse.urlparse(listurl)
+ html = scraperwiki.scrape(listurl)
+ root = lxml.html.fromstring(html)
+ html = None
+ for ahref in root.cssselect("div#placeholder-content-main-left-column a"):
+ href = ahref.attrib['href']
+ url = urlparse.urljoin(listurl, href)
+ if -1 != href.find("file://") or -1 == url.find('/postjournal/article'):
+# print "Skipping non-http URL " + url
+ continue
+ subhtml = scraperwiki.scrape(url)
+ subroot = lxml.html.fromstring(subhtml)
+ subhtml = None
+ for subahref in subroot.cssselect("div.article-content a"):
+ subhref = subahref.attrib['href']
+ suburl = urlparse.urljoin(listurl, subhref)
+ if -1 == suburl.find(".pdf"):
+ continue
+ if parser.is_already_scraped(suburl):
+                pass
+# print "Skipping already scraped " + suburl
+ else:
+# print "Will process " + suburl
+ process_pdf(parser, suburl, errors)
+
+def test_small_pdfs(parser):
+ # Test with some smaller PDFs
+ errors = []
+ process_pdf(parser, "http://www.uis.no/getfile.php/Journal%20200612.pdf", errors)
+ process_page_queue(parser, errors)
+ report_errors(errors)
+ exit(0)
+
+errors = []
+parser = postlistelib.PDFJournalParser(agency=agency)
+
+#test_small_pdfs(parser)
+
+process_journal_pdfs(parser, "http://www.uis.no/nyheter/postjournal/", errors)
+process_page_queue(parser, errors)
+report_errors(errors)
+
diff --git a/scrapersources/postliste-vikna b/scrapersources/postliste-vikna
new file mode 100644
index 0000000..1279f9e
--- /dev/null
+++ b/scrapersources/postliste-vikna
@@ -0,0 +1,89 @@
+# -*- coding: UTF-8 -*-
+
+import scraperwiki
+import json
+from BeautifulSoup import BeautifulSoup
+import datetime
+import dateutil.parser
+import lxml.html
+import resource
+import sys
+import urlparse
+import re
+
+# Make sure ScraperWiki records this URL as the source for this dataset
+scraperwiki.scrape("http://www.vikna.kommune.no/Vikna/Web.nsf/mainPress?OpenForm&amp;U=POST")
+
+lazycache=scraperwiki.swimport('lazycache')
+postlistelib=scraperwiki.swimport('postliste-python-lib')
+
+agency = 'Vikna kommune'
+
+def report_errors(errors):
+ if 0 < len(errors):
+ print "Errors:"
+ for e in errors:
+ print e
+ exit(1)
+def out_of_cpu(arg, spent, hard, soft):
+ report_errors(arg)
+
+def process_pdf(parser, pdfurl, errors):
+ postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
+ try:
+ pdfcontent = scraperwiki.scrape(pdfurl)
+ parser.preprocess(pdfurl, pdfcontent)
+ pdfcontent = None
+# except ValueError, e:
+# errors.append(e)
+ except IndexError, e:
+ errors.append(e)
+
+def process_page_queue(parser, errors):
+ try:
+ parser.process_pages()
+ postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
+ except scraperwiki.CPUTimeExceededError, e:
+ errors.append("Processing pages interrupted")
+
+def process_journal_pdfs(parser, listurl, errors):
+# print "Finding PDFs on " + listurl
+# u = urllib.parse.urlparse(listurl)
+ html = scraperwiki.scrape(listurl)
+ root = lxml.html.fromstring(html)
+ html = None
+ for ahref in root.cssselect("table a"):
+ href = ahref.attrib['href']
+ url = urlparse.urljoin(listurl, href)
+ if -1 != href.find("file://") or -1 == url.find(".pdf"):
+# print "Skipping non-http URL " + url
+ continue
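+        # "Ingen postjournal" is Norwegian for "no journal"; skip these placeholder PDFs.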
+ if -1 != href.find("/Ingen postjournal.pdf"):
+ continue
+ if parser.is_already_scraped(url):
+            pass
+# print "Skipping already scraped " + url
+ else:
+# print "Will process " + url
+ process_pdf(parser, url, errors)
+
+def test_small_pdfs(parser):
+ # Test with some smaller PDFs
+ errors = []
+ process_pdf(parser, "http://www.vikna.kommune.no/Vikna/Intern.nsf/FilA/A715C0C6E0D8CC05C12578F70024857B/$FILE/PJ230811.pdf", errors)
+ process_page_queue(parser, errors)
+ report_errors(errors)
+ exit(0)
+
+errors = []
+parser = postlistelib.PDFJournalParser(agency=agency)
+
+#test_small_pdfs(parser)
+
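+# Process any pages left in the queue from an earlier run before fetching new PDFs.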
+process_page_queue(parser, errors)
+process_journal_pdfs(parser, "http://www.vikna.kommune.no/Vikna/Web.nsf/mainPress?OpenForm&amp;U=POST", errors)
+process_page_queue(parser, errors)
+report_errors(errors)
+
+# FIXME Need to handle recent journal entries too
\ No newline at end of file
diff --git a/scrapersources/statens_vegvesen_offentlig_journal b/scrapersources/statens_vegvesen_offentlig_journal
new file mode 100644
index 0000000..947da4e
--- /dev/null
+++ b/scrapersources/statens_vegvesen_offentlig_journal
@@ -0,0 +1,56 @@
+import scraperwiki
+import lxml.html
+import datetime
+
+# Uncomment to run for a selected time period
+#fromdate = "01.04.2011"
+#todate = "21.05.2011"
+
+#fromdate = datetime.datetime.strptime(fromdate, "%d.%m.%Y")
+#todate = datetime.datetime.strptime(todate, "%d.%m.%Y")
+#adday = datetime.timedelta(days=1)
+
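+# Fetch the public journal search results for a single date and save each entry, keyed on document number.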
+def scrapepage(mydate):
+
+ formatteddate = mydate.strftime("%d.%m.%Y")
+ #formatteddate = "10.05.2011"
+
+ url = "http://www.vegvesen.no/Om+Statens+vegvesen/Aktuelt/Offentlig+journal?dokumenttyper=&dato=%s&journalenhet=6&utforSok=S%%C3%%B8k&submitButton=S%%C3%%B8k" % formatteddate
+
+ root = lxml.html.parse(url).getroot()
+
+ divs = root.cssselect("div.treff")
+
+ for p in divs:
+
+ dateandtype = p.xpath("p/text()")[0].split(" ")
+ saksdetaljer = p.xpath("ul[@class='saksdetaljer']/li/text()")
+
+
+ record = {
+ "doknr": dateandtype[0],
+ "innut": dateandtype[2],
+ "tittel": p.xpath("h2/text()")[0],
+ "sak": p.xpath("span[@class='sak']")[0].text[6:],
+ "fratil": p.xpath("ul[@class='fraTil']/li/text()")[0][5:],
+ }
+
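+        # The remaining fields come as "key: value" strings in saksdetaljer; the two date fields are normalised below.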
+        record.update(dict([x.split(":", 1) for x in saksdetaljer]))
+
+ record['Dokumenttdato'] = datetime.datetime.strptime(record['Dokumenttdato'].strip(), "%d.%m.%Y").date()
+ record['Journaldato'] = datetime.datetime.strptime(record['Journaldato'].strip(), "%d.%m.%Y").date()
+
+ scraperwiki.sqlite.save(unique_keys=["doknr"], data=record)
+
+# Uncomment to run for a selected time period
+#thedate = fromdate
+#while thedate <= todate:
+# print thedate
+# thedate = thedate + adday
+# scrapepage(thedate)
+# Comment out the two lines below to run for a selected time period instead
+thedate = datetime.datetime.now()
+print thedate
+
+scrapepage(thedate)
+ 
\ No newline at end of file