| author | Petter Reinholdtsen <pere@hungry.com> | 2012-07-13 12:28:13 +0200 |
| --- | --- | --- |
| committer | Petter Reinholdtsen <pere@hungry.com> | 2012-07-13 12:28:13 +0200 |
| commit | 22bceaf65dd89df97529df0102149aefa2b54f54 (patch) | |
| tree | 24a6dd995d146b27d92d4c91593dc8d8fd952064 | |
Store current scraperwiki sources.
46 files changed, 5651 insertions, 0 deletions
diff --git a/fetch-scraper-sources b/fetch-scraper-sources new file mode 100755 index 0000000..6465ea3 --- /dev/null +++ b/fetch-scraper-sources @@ -0,0 +1,15 @@ +#!/bin/sh + +scrapers="postliste-python-lib postliste-ballangen" + +scrapers="$( ( + GET https://scraperwiki.com/tags/postjournal + GET https://scraperwiki.com/tags/postjournal?page=2 + GET https://scraperwiki.com/tags/postjournal?page=3 +) | grep owner | rev | cut -d/ -f3 | rev)" + +mkdir -p scrapersources +for scraper in $scrapers; do + echo "Fetching source for $scraper" + GET "https://scraperwiki.com/editor/raw/$scraper" > "scrapersources/$scraper" +done diff --git a/scrapersources/nrks_offentlig_journal_pdf_text_positioning b/scrapersources/nrks_offentlig_journal_pdf_text_positioning new file mode 100644 index 0000000..51cd746 --- /dev/null +++ b/scrapersources/nrks_offentlig_journal_pdf_text_positioning @@ -0,0 +1,141 @@ +import scraperwiki, urllib2, datetime, base64, time, re +from bs4 import BeautifulSoup +from collections import deque +import scraperwiki +lazycache = scraperwiki.swimport('lazycache') +u = scraperwiki.swimport('hildenae_utils') + +def d(text): + if(False): + print "DEBUG:", text + +def process_pdf(pdfurl): + pdfxml = u.findInCache(pdfurl,verbose=True) # look for html parse in cache + if pdfxml is None: # a html parse is not cached + pdfdata=lazycache.lazycache(pdfurl, verbose=True) # look for pdf document in cache, if not download + pdfxml = scraperwiki.pdftoxml(pdfdata, "-hidden") # parse pdf text to html + u.putInCache(pdfurl, pdfxml, verbose=True) # save cache of html parse + + beautifulxml = BeautifulSoup(pdfxml) # convert html to BeautifulSoup(4) object + + for page in beautifulxml.find_all('page'): + FIRSTPAGE = 6 + LASTPAGE = 6 + if int(page['number']) < FIRSTPAGE: + continue + if int(page['number']) == FIRSTPAGE: + print "*******************************************" + print "***** FIRSTPAGE #%d while developing ******" % (FIRSTPAGE) + print "*******************************************" + if int(page['number']) == LASTPAGE+1: + print "*******************************************" + print "****** LASTPAGE #%d while developing ******" % (LASTPAGE) + print "*******************************************" + break + + print( "*******************************************") + print( "********** Working on page #%s **********" % page['number']) + print( "*******************************************") + elementList = deque(page.find_all('text')) # we want to be able to use popleft + d(elementList) + while True: + try: + currElement = elementList.popleft() + if "Innhold:" in currElement.text and currElement.b: # we found a "Innhold:"-header + entry = parseDocumentRecord(currElement, elementList) + print entry + scraperwiki.sqlite.save(unique_keys=["innhold", "sakstittel"], data=entry) + d( "back in process_pdf") + #else: + #print currElement.text + except IndexError, e: + d("No more text elements on page (%s)" % e) + break + + + +def parseDocumentRecord(currElement, elementList): + # previous element in list is "Innhold:" + d ("starting parseDocumentRecord") + entry = {} + while(True): + try: + d(elementList) + if "Innhold:" in elementList[0].text: # look ahead, if next is "Innhold:" return to process_pdf + break + + currElement = elementList.popleft() # first text in innhold + entry["innhold"] = "" + while(True): + if "Sakstittel:" in currElement.text: # we found sakstittel, go to next + break + entry["innhold"] += currElement.text + currElement = elementList.popleft() + entry["innhold"] = 
u.removeDoubleSpaces(entry["innhold"]) + + currElement = elementList.popleft() # first text in sakstittel + entry["sakstittel"] = "" + while(True): + if "DokType" in currElement.text: # we found DokType, go to next + break + entry["sakstittel"] += currElement.text + currElement = elementList.popleft() + entry["sakstittel"] = u.removeDoubleSpaces(entry["sakstittel"]) + + print("before spool to 'mottaker:'") + + ''' + + + + Komments: Virker som om pdf2html noen ganger ikke klarer å lese DokType. Hittil er dette kun observert når + DokType er U (selv om den klarer å lese noen DokType U). Dette er bekreftet mesteparten av 18 og 22 i juni + + + + ''' + print elementList + + + + print("spool to 'mottaker:'") + currElement = elementList.popleft() # first text after DocType + while(True): + if re.search( r'[t].*[t].*[a].*[k].*[e].*[r].*[:]', currElement.text): # match "motta ker:" (some last pages - nooooot pretty) + d("found mottaker") + break + currElement = elementList.popleft() + + d(elementList) + + entry["avsender_mottager"] = "" + while(True): + if ("Innhold:" in elementList[0].text) or ("Side:" in elementList[0].text): # ***look ahead***, if next is "Innhold:" return to process_pdf + #print "next is innhold, cleanup" + entry["avsender_mottager"] = u.removeDoubleSpaces(entry["avsender_mottager"]) + if re.match("^[*]+$", entry["avsender_mottager"]): + entry["avsender_mottager"] = None + #print elementList + #print entry + d("finished with record") + break + #print "Adding to avs_mot (%s)" % currElement.text + entry["avsender_mottager"] += currElement.text + currElement = elementList.popleft() + + #print "lastBreak" + break # we are finished with this Innhold + except IndexError, e: + d("No more text elements on page (%s)" % e) + break + return entry + +process_pdf("http://www.nrk.no/contentfile/file/1.8221353!offentlig22062012.pdf") # 4 records on last page +#process_pdf("http://www.nrk.no/contentfile/file/1.8217234!offentligjournal21062012.pdf") # 3 records on last page +#process_pdf("http://www.nrk.no/contentfile/file/1.8214156!offentligjournal20062012.pdf") +#process_pdf("http://www.nrk.no/contentfile/file/1.8212381!offentligjournal19062012.pdf") + +# https://views.scraperwiki.com/run/pdf_to_html_preview_4/?url=http%3A%2F%2Fwww.nrk.no%2Fcontentfile%2Ffile%2F1.8209505%21offentligjournal18062012.pdf&hidden=1 +#process_pdf("http://www.nrk.no/contentfile/file/1.8209505!offentligjournal18062012.pdf") # 1 record on last page + + diff --git a/scrapersources/oep-exemptions b/scrapersources/oep-exemptions new file mode 100644 index 0000000..23a1691 --- /dev/null +++ b/scrapersources/oep-exemptions @@ -0,0 +1,101 @@ +<!doctype html> +<html lang="nb"> +<head> +<meta charset="utf-8" /> +<title>Hvor mange dokumenter er journalført i hver etat og hvor mange er unntatt innsyn?</title> +<script src="https://cdnjs.cloudflare.com/ajax/libs/jquery/1.7.2/jquery.min.js"></script> +<script src="https://cdnjs.cloudflare.com/ajax/libs/highcharts/2.2.2/highcharts.js"></script> +<!-- <script src="https://code.highcharts.com/modules/exporting.js"></script>--> +<script src="https://cdnjs.cloudflare.com/ajax/libs/underscore.js/1.3.3/underscore-min.js"></script> +<script> +$(function() + { + var chart; + var query_url = "https://api.scraperwiki.com/api/1.0/datastore/sqlite?format=jsondict&name=postliste-oep&query=select%20Agency%2C%22Grounds%20for%20exemption%20document%22%20as%20ex%2C%20count(*)%20as%20num%20from%20%60swdata%60%20group%20by%20Agency%2Cex%20"; + + function get_chart_opts(agencies, series) { + return { 
+ chart: { renderTo: 'container', type: 'bar' }, + title: { text: 'Hvor mange dokumenter er journalført i hver etat og hvor mange er unntatt innsyn?' }, + xAxis: { categories: agencies }, + yAxis: { + min: 0, + title: { text: "Antall journalførte dokumenter" }, + }, + legend: { + backgroundColor: '#FFFFFF', + reversed: true + }, + tooltip: { + formatter: function() { + return ''+ this.series.name + ': '+ this.y + ' ('+parseInt(this.percentage) + '%)'; + + } + }, + plotOptions: { + series: { + stacking: 'normal' + } + }, + series: series + }; + + } + + function populate_chart(data) { + // TODO: Very naive iteration today. Should be optimized + var agencies = _.uniq( _.pluck(data, 'Agency') ); + var totals = {}; + var not_exemption = {}; + var series = []; + + // traverse and find data + _.each(data, function(entry) { + var agency_name = entry['Agency']; + + if (agency_name) { + if (! totals[agency_name]) { + totals[agency_name] = 0; + } + totals[agency_name] += entry['num']; + + if ("" == entry['ex']) { + not_exemption[agency_name] = entry['num']; + } + } + }); + + + // make series + series.push({ name: 'Ingen merknader', + data: _.map(agencies, function(agency) { + return not_exemption[agency]; + }) + }); + + + series.push({ name: 'Unntatt innsyn', + data: _.map(agencies, function(agency) { + return totals[agency] - not_exemption[agency]; + }) + }); + + + + chart = new Highcharts.Chart(get_chart_opts(agencies, series)); + }; + + + $(document).ready(function() { + $.ajax({ url: query_url, dataType: 'json', success: function(data){ populate_chart(data); } }); + }); +} +); + +</script> +</head> +<body> + <div id="container" style="height: 2000px;width: 100%;margin: 0 auto"></div> + <p>Alle dokumenter som har oppgitt en grunn for å unnlate offentligjøring vil telles som "Unnatt innsyn".</p> +</body> +</html> diff --git a/scrapersources/oep-exemptions_1 b/scrapersources/oep-exemptions_1 new file mode 100644 index 0000000..29c3a98 --- /dev/null +++ b/scrapersources/oep-exemptions_1 @@ -0,0 +1,101 @@ +<!doctype html> +<html lang="nb"> +<head> +<meta charset="utf-8" /> +<title>Hvor mange dokumenter er journalført i hver etat og hvor mange er unntatt innsyn?</title> +<script src="https://cdnjs.cloudflare.com/ajax/libs/jquery/1.7.2/jquery.min.js"></script> +<script src="https://cdnjs.cloudflare.com/ajax/libs/highcharts/2.2.2/highcharts.js"></script> +<!-- <script src="https://code.highcharts.com/modules/exporting.js"></script>--> +<script src="https://cdnjs.cloudflare.com/ajax/libs/underscore.js/1.3.3/underscore-min.js"></script> +<script> +$(function() + { + var chart; + var query_url = "https://api.scraperwiki.com/api/1.0/datastore/sqlite?format=jsondict&name=postliste-oep&query=select%20Agency%2C%22Grounds%20for%20exemption%20document%22%20as%20ex%2C%20count(*)%20as%20num%20from%20%60swdata%60%20group%20by%20Agency%2Cex%20"; + + function get_chart_opts(agencies, series) { + return { + chart: { renderTo: 'container', type: 'bar' }, + title: { text: 'Hvor mange dokumenter er journalført i hver etat og hvor mange er unntatt innsyn?' 
}, + xAxis: { categories: agencies }, + yAxis: { + min: 0, + title: { text: "Antall journalførte dokumenter" }, + }, + legend: { + backgroundColor: '#FFFFFF', + reversed: true + }, + tooltip: { + formatter: function() { + return ''+ this.series.name + ': '+ this.y + ' ('+parseInt(this.percentage) + '%)'; + + } + }, + plotOptions: { + series: { + stacking: 'percent' + } + }, + series: series + }; + + } + + function populate_chart(data) { + // TODO: Very naive iteration today. Should be optimized + var agencies = _.uniq( _.pluck(data, 'Agency') ); + var totals = {}; + var not_exemption = {}; + var series = []; + + // traverse and find data + _.each(data, function(entry) { + var agency_name = entry['Agency']; + + if (agency_name) { + if (! totals[agency_name]) { + totals[agency_name] = 0; + } + totals[agency_name] += entry['num']; + + if ("" == entry['ex']) { + not_exemption[agency_name] = entry['num']; + } + } + }); + + + // make series + series.push({ name: 'Ingen merknader', + data: _.map(agencies, function(agency) { + return not_exemption[agency]; + }) + }); + + + series.push({ name: 'Unntatt innsyn', + data: _.map(agencies, function(agency) { + return totals[agency] - not_exemption[agency]; + }) + }); + + + + chart = new Highcharts.Chart(get_chart_opts(agencies, series)); + }; + + + $(document).ready(function() { + $.ajax({ url: query_url, dataType: 'json', success: function(data){ populate_chart(data); } }); + }); +} +); + +</script> +</head> +<body> + <div id="container" style="height: 2000px;width: 100%;margin: 0 auto"></div> + <p>Alle dokumenter som har oppgitt en grunn for å unnlate offentligjøring vil telles som "Unnatt innsyn".</p> +</body> +</html> diff --git a/scrapersources/postlist-ssb b/scrapersources/postlist-ssb new file mode 100644 index 0000000..de2a051 --- /dev/null +++ b/scrapersources/postlist-ssb @@ -0,0 +1,164 @@ +import scraperwiki +import urllib2 +import lxml.html +import datetime +import time +import dateutil.parser +import pickle +import re + +from datetime import date +from datetime import timedelta +from time import strftime + +# Make sure Scraperwiki believe this is the source from this database +scraperwiki.scrape("http://www.ssb.no/omssb/journal/") + +postlistelib=scraperwiki.swimport('postliste-python-lib') + +agency = "Statistisk sentralbyrå" + +def daterange(start_date, end_date): + for n in range((end_date - start_date).days): + yield start_date + timedelta(n) + +def expand_year(year): + year = int(year) + if year > 50: + year = year + 1900 + else: + year = year + 2000 + return year + +def fetch_url(url): + html = None + for n in [1]: + try: + html = scraperwiki.scrape(url) + break + except urllib2.URLError, e: + print "URLError fetching " + url + ", trying again" + return html + +def save_date(parser, date, url, html): + num_saved = 0 + root = lxml.html.fromstring(html) + journal_date = dateutil.parser.parse(root.cssselect("p")[0].text_content().replace("Journaldato: ",""), dayfirst=True) + if date == journal_date.date(): + datastore = [] + for table in root.cssselect("table"): + docid = table.cssselect("tr")[0].cssselect("p")[1].text.strip() + datedesc = table.cssselect("tr")[0].cssselect("td")[3].cssselect("p")[0].text.strip() + + exemption = table.cssselect("tr")[1].cssselect("td")[5].cssselect("p")[0].text.strip() + + fratil_indicator = table.cssselect("tr")[2].cssselect("td")[0].cssselect("p")[0].text.strip() + + doctype = "" + if fratil_indicator.startswith("Til"): + doctype = "U" + elif fratil_indicator.startswith("Fra"): + doctype = "I" + elif 
fratil_indicator.startswith("Notat fra"): + doctype = "N" + else: + raise ValueError("Fant ikke doctype %s" % fratil_indicator) + + fratil_agency = table.cssselect("tr")[2].cssselect("td")[1].cssselect("p")[0].text.strip() + + casedesc = table.cssselect("tr")[4].cssselect("td")[1].cssselect("p")[0].text.strip() + + docdesc = table.cssselect("tr")[5].cssselect("td")[1].cssselect("p")[0].text.strip() + saksb = table.cssselect("tr")[0].cssselect("p")[5].text.strip() + + docdate = dateutil.parser.parse(datedesc.strip(), dayfirst=True) + + matchObj = re.match( r'(\d+)/(\d+)\s*-\s*(\d+)$', docid, re.M|re.I) + if matchObj: + caseyear = matchObj.group(1) + caseseqnr = matchObj.group(2) + casedocseq = matchObj.group(3) + caseyear = expand_year(caseyear) + caseid = str(caseyear) + "/" + str(caseseqnr) + else: + print "error: invalid Arkivsaksnr: " + docid + matchObj = re.match( r'(\d+)/(\d+)\s*-', docid, re.M|re.I) + if matchObj: + caseyear = expand_year(matchObj.group(1)) + caseseqnr = matchObj.group(2) + caseid = str(caseyear) + "/" + str(caseseqnr) + + if parser.is_sender_doctype(doctype): + fratilfield = 'sender' + elif parser.is_recipient_doctype(doctype): + fratilfield = 'recipient' + + data = { + 'agency' : agency, + 'docdate' : docdate.date(), + 'recorddate' : journal_date.date(), + 'docdesc' : docdesc, + 'casedesc' : casedesc, + 'caseid' : caseid, + 'docid' : docid, + + 'caseyear' : caseyear, + 'caseseqnr' : caseseqnr, + 'casedocseq' : casedocseq, + + fratilfield : fratil_agency, + 'doctype' : doctype, + + 'saksbehandler' : saksb, + + 'exemption' : exemption, + + 'scrapedurl' : url, + 'scrapestamputc' : datetime.datetime.now() + } + parser.verify_entry(data) + datastore.append(data) + scraperwiki.sqlite.save(unique_keys=['docid'], data=datastore) + num_saved += len(datastore) + datastore = [] + #print "Saved %s" % data['caseid'] + else: + # TODO: log error or exit? + msg = "Tried to scrape %s but got %s" % (date, journal_date.date()) + #raise ValueError(msg) + print msg + + return num_saved + +def scrape_date(parser, date): + url = base_url % (strftime("%d%m%y", date.timetuple())) + html = fetch_url(url) + if html: + return save_date(parser, date, url, html) + +base_url = 'http://www.ssb.no/omssb/journal/OJ%s.htm' +end_date = date.today() + +#print res + +start_date_obj = scraperwiki.sqlite.get_var('last_finished_date') + +if start_date_obj: + start_date = pickle.loads(start_date_obj) +else: + start_date = datetime.date(2011, 1, 3) + +print "Start date %s" % start_date + +parser = postlistelib.JournalParser(agency=agency) + +for single_date in daterange(start_date, end_date): + if single_date.weekday() < 5: + num_saved = scrape_date(parser, single_date) + print "Scraped %s found %s" % (single_date, num_saved) + if num_saved > 0: + scraperwiki.sqlite.save_var('last_finished_date', pickle.dumps(single_date)) + + if num_saved == None: + print "No more new. Exit..." 
+ break diff --git a/scrapersources/postliste-arendal b/scrapersources/postliste-arendal new file mode 100644 index 0000000..5960033 --- /dev/null +++ b/scrapersources/postliste-arendal @@ -0,0 +1,188 @@ +import scraperwiki + +import json +import httplib, urllib +import datetime +import dateutil.parser +import time +import re + +agency = "Arendal kommune" +urlhost = "www.arendal.kommune.no" + +fieldmap = { + 'AntallVedlegg' : '', + 'Arkivdel' : '', + 'AvsenderMottaker' : 'sender', # or recipient + 'Dokumentdato' : 'docdate', + 'Dokumentnummer' : 'casedocseq', + 'Dokumenttype' : 'doctype', + 'EkspedertDato' : '', + 'Hjemmel' : 'exemption', + 'Id' : 'id', + 'Innholdsbeskrivelse' : 'docdesc', + 'Mappetype' : '', + 'Offentlig' : 'ispublic', + 'PostlisteType' : 'doctype', + 'RegistrertDato' : 'recorddate', + 'SaksId' : '', + 'SaksNr' : 'caseid', + 'Sakstittel' : 'casedesc', + #'SaksNr' : 'SA.SAAR + SA.SEKNR', + 'Saksansvarlig' : 'saksbehandler', + 'SaksansvarligEnhet' : '', + 'SaksansvarligEpost' : '', + +# 'scrapestamputc' : '', +# 'scrapedurl' : '', +# 'agency' : '', +} + + +# Convert "/Date(1317808020000+0200)/" to a datetime object +# FIXME Currently ignore the timezone information +def parse_datestr(str): + match = re.split("[/()+]", str) +# print match + sinceepoch = float(match[2]) / 1000 + if match[3] == '0200': + sinceepoch = sinceepoch + 2 * 60 * 60 + if match[3] == '0100': + sinceepoch = sinceepoch + 1 * 60 * 60 +# print sinceepoch + date = datetime.datetime.fromtimestamp(sinceepoch) +# print date + return date + +def reformat_caseid(caseid): + # Input 12/13123, output 2012, 13123, "2012/13123" + year, seqnr = caseid.split("/") + year = int(year) + if year < 100: + year = year + 2000 + caseid = "%d/%s" % (year, seqnr) + return year, int(seqnr), caseid + +def ws_post(url, urlhost, urlpath, params): + jsonparams = json.dumps(params) + headers = {"Content-type": "application/json; charset=utf-8", + "Accept": "application/json"} + conn = httplib.HTTPConnection(urlhost) + #print jsonparams + conn.request("POST", urlpath, jsonparams, headers) + response = conn.getresponse() + #print response.status, response.reason + jsonres = response.read() + res = json.loads(jsonres) + #print res + return res + +def fetch_journal_entry(id): + params = { "id" : str(id)} + headers = {"Content-type": "application/json; charset=utf-8", + "Accept": "application/json"} + urlpath = "/Templates/eDemokrati/Services/eDemokratiService.svc/GetPostlisteObjekt" + data = ws_post(None, urlhost, urlpath, params)['d'] + entry = None + if data: + del data['__type'] # This is useless, ignore + print data + entry = {} + entry['agency'] = agency + entry['scrapestamputc'] = datetime.datetime.now() + entry['scrapedurl'] = "http://" + urlhost + urlpath +# entry['scrapedurl'] = url + for dfield in fieldmap.keys(): + if dfield in data and data[dfield]: + if dfield in fieldmap and fieldmap[dfield] != "": + fieldname = fieldmap[dfield] + else: + fieldname = dfield + if 'sender' == fieldname: + if data['Dokumenttype'] == 'U': + fieldname = 'recipient' + if dfield in ['RegistrertDato', 'Dokumentdato', 'EkspedertDato']: + entry[fieldname] = parse_datestr(data[dfield]).date() + else: + entry[fieldname] = data[dfield] + else: + entry[dfield] = data[dfield] + entry['caseyear'], entry['caseseqnr'], entry['caseid'] = reformat_caseid(entry['caseid']) +# data["sourceurl"] = "http://" + server + path + print entry + return entry + +def epoctime_to_datestr(epoctime): + return "/Date("+str(int(epoctime * 1000) )+")/" + +def get_last_entry_id(): 
+ now = time.time() + # Get the last week, as the most recent entry should be in this range + fradato = epoctime_to_datestr(now - 7 * 24 * 60 * 60) + tildato = epoctime_to_datestr(now) + #print fradato + + maxid = 0 + + urlpath = "/Templates/eDemokrati/Services/eDemokratiService.svc/GetPostlisteArkivdeler" + params = { + "dato": fradato, + "tilDato": tildato, + "søkestreng":""} + arkivdeler = ws_post(None, urlhost, urlpath, params)['d'] + # {u'd': [u'_', u'HVA-IFE-A', u'KAR-BR-A', u'KAR-BRUK-A', u'KAR-EIEN-A', u'KAR-ELBH-A', u'KAR-ELS-A', ... + + urlpath = "/Templates/eDemokrati/Services/eDemokratiService.svc/GetPostlisteDokumenttyper" + for arkivdel in arkivdeler[0]: + params = { + "dato":fradato, + "tilDato":tildato, + "søkestreng":"", + "arkivdel":arkivdel, + } + doctypes = ws_post(None, urlhost, urlpath, params)['d'] + #{"d":["I","N","S","U","X"]} + urlpath = "/Templates/eDemokrati/Services/eDemokratiService.svc/GetPostlisteS%C3%B8k" + for doctype in doctypes: + params = { + "fraDato":fradato, + "tilDato":tildato, + "søkestreng":"", + "arkivdel":arkivdel, + "dokumenttype":doctype, + } + entries = ws_post(None, urlhost, urlpath, params)['d'] + for entry in entries: + #print entry['Id'] + id = int(entry['Id']) + if id > maxid: + maxid = id +# data = fetch_journal_entry(entry['Id']) +# if data: +# scraperwiki.sqlite.save(unique_keys=['id'], data=data) + return maxid + +#{"d":[{"__type":"PostlisteObjekt:#SSP.NoarkServices","AntallVedlegg":1,"Dokumentnummer":2,"Dokumenttype":"I","EkspedertDato":null,"Hjemmel":null,"Id":1507868,"Innholdsbeskrivelse":"Tomtejustering - Lillebæk, eiendom 208\/1611","Offentlig":true,"RegistrertDato":"\/Date(1339538400000+0200)\/","SaksId":296971,"SaksNr":"12\/8658","Arkivdel":"KAR-EIEN-A","AvsenderMottaker":"Randi Wilberg","Dokumentdato":"\/Date(1339624800000+0200)\/","Mappetype":"DS","PostlisteType":"I","Saksansvarlig":null,"SaksansvarligEnhet":null,"SaksansvarligEpost":null,"Sakstittel":null},{"__type":"PostlisteObjekt:#SSP.NoarkServices","AntallVedlegg":4,"Dokumentnummer":1,"Dokumenttype":"I","EkspedertDato":null,"Hjemmel":null,"Id":1507865,"Innholdsbeskrivelse":"Søknkad om utvidelse av balkong - Kalleraveien 14","Offentlig":true,"RegistrertDato":"\/Date(1339538400000+0200)\/","SaksId":298804,"SaksNr":"12\/10480","Arkivdel":"KAR-EIEN-A","AvsenderMottaker":"Ole Henning Løken","Dokumentdato":"\/Date(1338847200000+0200)\/","Mappetype":"BS","PostlisteType":"I","Saksansvarlig":null,"SaksansvarligEnhet":null,"SaksansvarligEpost":null,"Sakstittel":null},... 
+ +def get_journal_enries_range(min, max, step): + for id in range(min, max, step): + data = fetch_journal_entry(id) + #print data + if data: + scraperwiki.sqlite.save(unique_keys=['id'], data=data) + +maxid = get_last_entry_id() +print "max id =", maxid +try: + start = scraperwiki.sqlite.select("max(id) as max from swdata")[0]['max'] + 1 +except: + start = 137459 +print start, maxid +#if maxid > start + 20: +# maxid = start + 10 +get_journal_enries_range(start, maxid, 1) + +start = scraperwiki.sqlite.select("min(id) as min from swdata")[0]['min'] - 1 +end = start - 1000 +print start, end +get_journal_enries_range(start, end, -1) diff --git a/scrapersources/postliste-ballangen b/scrapersources/postliste-ballangen new file mode 100644 index 0000000..89e981f --- /dev/null +++ b/scrapersources/postliste-ballangen @@ -0,0 +1,276 @@ +import scraperwiki +import urllib2 +import lxml.html +import re +import dateutil.parser +from collections import deque +import datetime +from dateutil.relativedelta import relativedelta + +scraperwiki.scrape("http://www.ballangen.kommune.no/artikler/postlister") +postlistelib=scraperwiki.swimport('postliste-python-lib') + +# <!-- $BeginBlock postjournal_liste --> +# <tr> +# <td class="CommonBold"> +# SÃ<98>KER KULTURMIDLER FOR BALLANGEN FRIIDRETT +# </td> +# </tr> +# <tr> +# <td> +# </td> +# </tr> +# <tr> +# <td> +# <b>Sakstittel: </b>KULTURMIDLER 2012 +# +# </td> +# </tr> +# <tr> +# <td> +# </td> +# </tr> +# <tr> +# <td> +# <b>Arkivsaksnr.: </b>12/00093 - 032 I <b>Løpenr.:</b +# >002255/12 +# </td> +# </tr> +# <tr> +# <td><b>Fra/Til: </b>Eirin Sørslett +# </td> +# </tr> +# <tr> +# <td><b>Saksbehandler: </b> +# Oddbjørn Dalsbø +# (RÃ<85>D/KVO) +# </td> +# </tr> +# <tr> +# <td><b>Datert: </b> 02.04.2012</td> +# </tr> +# <tr> +# <td style="padding-bottom: 15px;"> +# <img src="/icons/vwsent.gif" border="0" align="top" alt="Ikon" /> +# <a href="mailto:post@ballangen.kommune.no?subject=Bestill postjournal med Ark +# ivsaksnr 12/00093 - 032 I og løpenr 002255/12">Bestill journal</a> +# </td> +# </tr> + +def saver(unique_keys, data): +# return + #print "Not saving data" + scraperwiki.sqlite.save(unique_keys, data) + +def expand_year(year): + year = int(year) + if year > 50: + year = year + 1900 + else: + year = year + 2000 + return year + +def fetch_postjournal_day(parser, url, html, saver): + root = lxml.html.fromstring(html) + + listdate = dateutil.parser.parse(root.cssselect("h2")[0].text_content().replace("Postlister for ",""), dayfirst=True) + print listdate.date() + + entries = [] + for tr in root.cssselect("table.ui-corner-all tr"): + tds = tr.cssselect("td") + line = tds[0].text_content() + entries.append(line) + +# 9 or 12 lines per entry + queue = deque(entries) + datastore = [] + while queue: + docdesc = (queue.popleft() + queue.popleft()).strip() + + casedesc = (queue.popleft() + queue.popleft()).replace("Sakstittel:", "").strip() + + ref = queue.popleft().strip() + arkivsaksref = re.sub(r"L.penr.:.+$", "", ref).replace("Arkivsaksnr.:","").strip() + + caseyear = 0 + caseseqnr = 0 + casedocseq = 0 + doctype = '?' 
+ caseid = 'unknown' + matchObj = re.match( r'(\d+)/(\d+)\s*-\s*(\d+) (.+)$', arkivsaksref, re.M|re.I) + if matchObj: + caseyear = matchObj.group(1) + caseseqnr = matchObj.group(2) + casedocseq = matchObj.group(3) + doctype = matchObj.group(4) + caseyear = expand_year(caseyear) + caseid = str(caseyear) + "/" + str(caseseqnr) + else: + print "error: invalid Arkivsaksnr: " + arkivsaksref + matchObj = re.match( r'(\d+)/(\d+)\s*-', arkivsaksref, re.M|re.I) + if matchObj: + caseyear = expand_year(matchObj.group(1)) + caseseqnr = matchObj.group(2) + caseid = str(caseyear) + "/" + str(caseseqnr) + + laapenr = re.sub(r"^.+L.penr.:", "", ref) + journalseqnr = 0 + journalyear = 0 + journalid = 'unknown' + if -1 != laapenr.find('/') and "/" != laapenr: # Avoid broken/empty values + journalseqnr, journalyear = laapenr.split("/") + journalyear = expand_year(journalyear) + journalid = str(journalyear) + "/" + str(journalseqnr) + else: + print u"error: invalid Løpenr: " + laapenr + + if not parser.is_valid_doctype(doctype): + doctype = { + 'S' : 'N', + 'PLN' : 'N', + 'Z' : 'N', + }[doctype] + + fratil = queue.popleft().replace("Fra/Til:", "").strip() + if parser.is_sender_doctype(doctype): + fratilfield = 'sender' + elif parser.is_recipient_doctype(doctype): + fratilfield = 'recipient' + + saksbehandler = queue.popleft().replace("Saksbehandler:","").strip() + saksansvarlig, bar = saksbehandler.split(" (") + saksansvarligenhet, foo = bar.split(")") + #print saksansvarligenhet + + recorddate = dateutil.parser.parse(queue.popleft().replace("Datert:","").strip(), dayfirst=True) + + requesturl = queue.popleft().strip() + + exemption = "" + if -1 != requesturl.find("Gradering"): + exemption = requesturl.replace("Gradering:", "").strip() + requesturl = queue.popleft() + fratil = "" + + data = { + 'agency' : parser.agency, + 'recorddate' : recorddate.date(), + 'docdesc' : docdesc, + 'casedesc' : casedesc, + + 'caseyear' : int(caseyear), + 'caseseqnr' : int(caseseqnr), + 'casedocseq' : int(casedocseq), + 'caseid' : caseid, + 'doctype' : doctype, + + 'journalseqnr' : int(journalseqnr), + 'journalyear' : int(journalyear), + 'journalid' : journalid, + fratilfield : fratil, + + 'saksbehandler' : saksbehandler, + 'saksansvarlig' : saksansvarlig.strip(), + 'saksansvarligenhet' : saksansvarligenhet.strip(), + + 'arkivsaksref' : arkivsaksref, + 'laapenr' : laapenr, + 'exemption' : exemption, + + 'scrapedurl' : url, + 'scrapestamputc' : datetime.datetime.now() + } + +# print data + parser.verify_entry(data) + datastore.append(data) + saver(unique_keys=['arkivsaksref'], data=datastore) + +def fetch_postjournal_monthlist(baseurl, html): + root = lxml.html.fromstring(html) + subset = root.cssselect("div table") + urls = subset[0].cssselect("td a") + urllist = [] + for ahref in urls: + href = ahref.attrib['href'] + if -1 != href.find("day="): +# print href + urllist.append(baseurl + href) + return urllist + +# http://www.offentlighet.no/ + +agency = "Ballangen kommune" +baseurl = "http://www.ballangen.kommune.no" + +monthurls = [] + +def addyear(monthurls, year): + for m in [12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1]: + monthstr = "%02d%d" % (m, year) + url = "http://www.ballangen.kommune.no/artikler/postlister?month=" + monthstr + monthurls.append(url) + +#addyear(monthurls, 2003) +#addyear(monthurls, 2004) # Consistency problems in http://www.ballangen.kommune.no/artikler/postlister?month=012004&day=06 (bad Arkivsaksnr. and lacking Løpenr.) 
+ +#addyear(monthurls, 2005) +#addyear(monthurls, 2006) +#addyear(monthurls, 2007) +#addyear(monthurls, 2008) +#addyear(monthurls, 2009) +#addyear(monthurls, 2010) +#addyear(monthurls, 2011) +#addyear(monthurls, 2012) + +parsemonths = 2 + +today = datetime.date.today() +i = 1 +while i <= parsemonths: + i = i + 1 +# parsemonths = parsemonths - 1 + monthtoparse = today + relativedelta(months=parsemonths - i) + monthstr = monthtoparse.strftime("%m%Y") + url = "http://www.ballangen.kommune.no/artikler/postlister?month=" + monthstr + monthurls.append(url) + +#url = "http://www.ballangen.kommune.no/artikler/postlister?month=032012&day=19" + +def reload_error_entries(): + monthurls = [] + problems = scraperwiki.sqlite.select("distinct scrapedurl from swdata where caseid = 'unknown'") + for n in problems: + monthurls.append(n['scrapedurl']) + +print "Fetching public journal!" + +parser = postlistelib.JournalParser(agency=agency) + +urllist = [] + +def fetch_url(url): + html = None + for n in [1, 2, 3]: + try: + html = scraperwiki.scrape(url) + break + except urllib2.URLError, e: + print "URLError fetching " + url + ", trying again" + return html + +for monthurl in monthurls: + print "Fetching month list from " + monthurl + html = fetch_url(monthurl) + urllist.extend(fetch_postjournal_monthlist(baseurl = baseurl, html = html)) + +for dayurl in urllist: + res = scraperwiki.sqlite.select("scrapedurl from swdata where scrapedurl = '"+dayurl+"' and scrapestamputc > '2012-06-23T15:12:40' limit 1") + if 0 < len(res): + continue + print "Fetching from " + dayurl + html = fetch_url(dayurl) +# print html + fetch_postjournal_day(parser=parser, url=dayurl, html=html, saver=saver) + diff --git a/scrapersources/postliste-ballangen-view b/scrapersources/postliste-ballangen-view new file mode 100644 index 0000000..73d96b4 --- /dev/null +++ b/scrapersources/postliste-ballangen-view @@ -0,0 +1,124 @@ +import scraperwiki +import cgi, os +import re + +paramdict = dict(cgi.parse_qsl(os.getenv("QUERY_STRING", ""))) +#print paramdict + +if 'source' in paramdict: + sourcescraper = paramdict['source'] +else: + sourcescraper = 'postliste-ballangen' + +scraperwiki.sqlite.attach(sourcescraper) + +def htc(m): + return chr(int(m.group(1),16)) + +def urldecode(url): + rex=re.compile('%([0-9a-hA-H][0-9a-hA-H])',re.M) + return rex.sub(htc,url) + +def table_saksbehandler(): + data = scraperwiki.sqlite.select( + '''saksbehandler,count(*) as antall from swdata group by saksbehandler order by antall desc''' + ) + # print data + + print "<table>" + print "<tr><th>Saksbehandler</th><th>Saker</th>" + for d in data: + print "<tr>" + print "<td>", d["saksbehandler"], "</td>" + print "<td>", d["antall"], "</td>" + print "</tr>" + print "</table>" + +# {'datert': datetime.date(2012, 1, 6), 'arkivsaksref': u'12/00008 - 008 U', 'tittel': u'INNKALLING TIL DR\xd8FTELSESM\xd8TE - 13.01.12', 'sakstittel': u'BEMANNINGSSITUASJON ETTER BUDSJETTVEDTAK 2012', 'laapenr': u'000183/12', 'kommune': 'Ballangen kommune', 'saksbehandler': u'Svenn Ole Wiik\n (R\xc5D/)', 'listdate': datetime.date(2012, 1, 6), 'gradering': '', 'fratil': u'Anne J\xf8rgensen'} + +sql = "select * from swdata" +where = "" +args = [] +if "caseid" in paramdict: + where = where + ' caseid = ?' + args.append(paramdict["caseid"]) +if "agency" in paramdict: + where = where + ' agency = ?' + args.append(urldecode(paramdict["agency"])) +if "saksansvarlig" in paramdict: + where = where + ' saksansvarlig = ?' 
+ saksansvarlig = urldecode(paramdict["saksansvarlig"]) + print "S: '" + saksansvarlig + "'" + args.append(urldecode(paramdict["saksansvarlig"])) +if "fratil" in paramdict: + where = where + ' sender = ? or recipient = ?' + fratil = urldecode(paramdict["fratil"]) + args.extend([fratil, fratil]) +if "q" in paramdict: + q = urldecode(paramdict["q"]) + qlike = '%' + q + '%' + where = where + ' docdesc like ? or casedesc like ? or sender like ? or recipient like ?' + args.extend([qlike, qlike, qlike, qlike]) +if where: + sql = sql + ' where ' + where +sql = sql + " order by recorddate desc, casedocseq limit 200" +print sql +data = scraperwiki.sqlite.execute(sql, args) +#print data + +print "<p>Søk i tittel, sakstittel, fra/til.</p>" +print "<p><form>Enter search term: " +print "<input name='q' length='60'>" +print "<input name='source' type='hidden' value='" + sourcescraper + "'>" +print "<INPUT type=\"submit\" value=\"Search\"> <INPUT type=\"reset\">" +print "</form></p>" +print "<table>" + +#print data + +i = 0 +key = {} +print "<tr>" +while i < len(data['keys']): + colname = data['keys'][i] + key[colname] = i + if colname in ["scrapedurl", "caseid", "scrapestamputc"]: + True # Skip, see below + else: + print "<th>" + colname + "</th>" + i = i + 1 +print "</tr>" + +#print data +for d in data['data']: + print "<tr>" + i = 0 + while i < len(data['keys']): + colname = data['keys'][i] + value = d[key[colname]] + if value is None: + value = "" + if "docdesc" == colname: + if 'scrapedurl' in key: + scrapedurl = d[key['scrapedurl']] + print "<td><a href='" + scrapedurl + "'>", value, "</a></td>" + else: + print "<td>", value, "</td>" + elif "saksansvarlig" == colname: + saksansvarlig = d[key['saksansvarlig']] + print "<td><a href='?saksansvarlig=" + saksansvarlig + "'>", value, "</a></td>" + elif "casedesc" == colname: + caseid = d[key['caseid']] + print "<td><a href='?caseid=" + caseid + "&source=" + sourcescraper + "'>", value, "</a></td>" + elif "sender" == colname or "recipient" == colname: + if "" != value: + print "<td><a href='?fratil=" + value + "&source=" + sourcescraper + "'>", value, "</a></td>" + else: + print "<td></td>" + elif colname in ["scrapedurl", "caseid", "scrapestamputc"]: + True # Skip these, as they are included as links + else: + print "<td>", value, "</td>" + i = i + 1 + print "</tr>" +print "</table>" diff --git a/scrapersources/postliste-bioforsk b/scrapersources/postliste-bioforsk new file mode 100644 index 0000000..b41b30f --- /dev/null +++ b/scrapersources/postliste-bioforsk @@ -0,0 +1,81 @@ +# -*- coding: UTF-8 -*- + +import scraperwiki +import json +from BeautifulSoup import BeautifulSoup +import datetime +import dateutil.parser +import lxml.html +import resource +import sys +import urlparse +import re +lazycache=scraperwiki.swimport('lazycache') +postlistelib=scraperwiki.swimport('postliste-python-lib') + +agency = 'Bioforsk AS' + +def report_errors(errors): + if 0 < len(errors): + print "Errors:" + for e in errors: + print e + exit(1) +def out_of_cpu(arg, spent, hard, soft): + report_errors(arg) + +def process_pdf(parser, pdfurl, errors): + errors = [] + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + try: + pdfcontent = scraperwiki.scrape(pdfurl) + parser.preprocess(pdfurl, pdfcontent) + pdfcontent = None +# except ValueError, e: +# errors.append(e) + except IndexError, e: + errors.append(e) + +def process_page_queue(parser, errors): + try: + parser.process_pages() + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + except 
scraperwiki.CPUTimeExceededError, e: + errors.append("Processing pages interrupted") + +def process_journal_pdfs(parser, listurl, errors): +# print "Finding PDFs on " + listurl +# u = urllib.parse.urlparse(listurl) + html = scraperwiki.scrape(listurl) + root = lxml.html.fromstring(html) + html = None + for ahref in root.cssselect("table a"): + href = ahref.attrib['href'] + url = urlparse.urljoin(listurl, href) + if -1 != href.find("file://") or -1 == url.find(".pdf"): +# print "Skipping non-http URL " + url + continue + if parser.is_already_scraped(url): + True +# print "Skipping already scraped " + url + else: +# print "Will process " + url + process_pdf(parser, url, errors) + +def test_small_pdfs(parser): + # Test with some smaller PDFs + errors = [] + process_pdf(parser, "http://www.bioforsk.no/ikbViewer/Content/97492/off_journal_uke17%202012.pdf", errors) + process_page_queue(parser, errors) + report_errors(errors) + exit(0) + +errors = [] +parser = postlistelib.PDFJournalParser(agency=agency) + +#test_small_pdfs(parser) + +process_journal_pdfs(parser, "http://www.bioforsk.no/ikbViewer/page/bioforsk/presse?p_dimension_id=21903", errors) +process_page_queue(parser, errors) +report_errors(errors) + diff --git a/scrapersources/postliste-difi b/scrapersources/postliste-difi new file mode 100644 index 0000000..dfc986f --- /dev/null +++ b/scrapersources/postliste-difi @@ -0,0 +1,88 @@ +# -*- coding: UTF-8 -*- + +import scraperwiki +import json +from BeautifulSoup import BeautifulSoup +import datetime +import dateutil.parser +import lxml.html +import resource +import sys +import urlparse +import re + +frontpage = "http://www.difi.no/om-difi/offentleg-postjournal-for-difi" + +scraperwiki.scrape(frontpage) + +lazycache=scraperwiki.swimport('lazycache') +postlistelib=scraperwiki.swimport('postliste-python-lib') + +agency = 'Direktoratet for forvaltning og IKT' + +def report_errors(errors): + if 0 < len(errors): + print "Errors:" + for e in errors: + print e + exit(1) +def out_of_cpu(arg, spent, hard, soft): + report_errors(arg) + +def process_pdf(parser, pdfurl, errors): + errors = [] + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + try: + pdfcontent = scraperwiki.scrape(pdfurl) + parser.preprocess(pdfurl, pdfcontent) + pdfcontent = None +# except ValueError, e: +# errors.append(e) + except IndexError, e: + errors.append(e) + +def process_page_queue(parser, errors): + try: + parser.process_pages() + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + except scraperwiki.CPUTimeExceededError, e: + errors.append("Processing pages interrupted") + +def process_journal_pdfs(parser, listurl, errors): +# print "Finding PDFs on " + listurl +# u = urllib.parse.urlparse(listurl) + html = scraperwiki.scrape(listurl) + root = lxml.html.fromstring(html) + html = None + for ahref in root.cssselect("div.body a"): + href = ahref.attrib['href'] + url = urlparse.urljoin(listurl, href) + if -1 != href.find("file://") or -1 == url.find(".pdf"): +# print "Skipping non-http URL " + url + continue + if parser.is_already_scraped(url): + True +# print "Skipping already scraped " + url + else: +# print "Will process " + url + process_pdf(parser, url, errors) + +def test_small_pdfs(parser): + # Test with some smaller PDFs + errors = [] + process_pdf(parser, "http://www.difi.no/filearchive/uke-2-offentlig-journal.pdf", errors) + process_pdf(parser, "http://www.difi.no/filearchive/uke-1-offentlig-journal.pdf", errors) + process_page_queue(parser, errors) + report_errors(errors) + exit(0) + +errors = [] 
+parser = postlistelib.PDFJournalParser(agency=agency) + +#test_small_pdfs(parser) + +process_page_queue(parser, errors) +process_journal_pdfs(parser, frontpage, errors) +process_page_queue(parser, errors) +report_errors(errors) + diff --git a/scrapersources/postliste-fredrikstad b/scrapersources/postliste-fredrikstad new file mode 100644 index 0000000..7fb5a13 --- /dev/null +++ b/scrapersources/postliste-fredrikstad @@ -0,0 +1,196 @@ +import scraperwiki + +import json +import httplib, urllib +import datetime +import dateutil.parser +import time +import re + +agency = "Fredrikstad kommune" +urlhost = "www.fredrikstad.kommune.no" + +fieldmap = { + 'AntallVedlegg' : '', + 'Arkivdel' : '', + 'AvsenderMottaker' : 'sender', # or recipient + 'Dokumentdato' : 'docdate', + 'Dokumentnummer' : 'casedocseq', + 'Dokumenttype' : 'doctype', + 'EkspedertDato' : '', + 'Hjemmel' : 'exemption', + 'Id' : 'id', + 'Innholdsbeskrivelse' : 'docdesc', + 'Mappetype' : '', + 'Offentlig' : 'ispublic', + 'PostlisteType' : 'doctype', + 'RegistrertDato' : 'recorddate', + 'SaksId' : '', + 'SaksNr' : 'caseid', + 'Sakstittel' : 'casedesc', + #'SaksNr' : 'SA.SAAR + SA.SEKNR', + 'Saksansvarlig' : 'saksbehandler', + 'SaksansvarligEnhet' : '', + 'SaksansvarligEpost' : '', + +# 'scrapestamputc' : '', +# 'scrapedurl' : '', +# 'agency' : '', +} + + +# Convert "/Date(1317808020000+0200)/" to a datetime object +# FIXME Currently ignore the timezone information +def parse_datestr(str): + match = re.split("[/()+]", str) +# print match + sinceepoch = float(match[2]) / 1000 + if match[3] == '0200': + sinceepoch = sinceepoch + 2 * 60 * 60 + if match[3] == '0100': + sinceepoch = sinceepoch + 1 * 60 * 60 +# print sinceepoch + date = datetime.datetime.fromtimestamp(sinceepoch) +# print date + return date + +def reformat_caseid(caseid): + # Input 12/13123, output 2012, 13123, "2012/13123" + year, seqnr = caseid.split("/") + year = int(year) + if year < 100: + year = year + 2000 + caseid = "%d/%s" % (year, seqnr) + return year, int(seqnr), caseid + +def ws_post(url, urlhost, urlpath, params): + jsonparams = json.dumps(params) + headers = {"Content-type": "application/json; charset=utf-8", + "Accept": "application/json"} + conn = httplib.HTTPConnection(urlhost) + #print jsonparams + conn.request("POST", urlpath, jsonparams, headers) + response = conn.getresponse() + #print response.status, response.reason + jsonres = response.read() + res = json.loads(jsonres) + #print res + return res + +def fetch_journal_entry(id): + params = { "id" : str(id)} + headers = {"Content-type": "application/json; charset=utf-8", + "Accept": "application/json"} + urlpath = "/Templates/eDemokrati/Services/eDemokratiService.svc/GetPostlisteObjekt" + data = ws_post(None, urlhost, urlpath, params)['d'] + entry = None + if data: + del data['__type'] # This is useless, ignore + entry = {} + entry['agency'] = agency + entry['scrapestamputc'] = datetime.datetime.now() + entry['scrapedurl'] = "http://" + urlhost + urlpath +# entry['scrapedurl'] = url + for dfield in fieldmap.keys(): + if dfield in data and data[dfield]: + if dfield in fieldmap and fieldmap[dfield] != "": + fieldname = fieldmap[dfield] + else: + fieldname = dfield + if 'sender' == fieldname: + if data['Dokumenttype'] == 'U': + fieldname = 'recipient' + if dfield in ['RegistrertDato', 'Dokumentdato', 'EkspedertDato']: + entry[fieldname] = parse_datestr(data[dfield]).date() + else: + entry[fieldname] = data[dfield] + else: + entry[dfield] = data[dfield] + entry['caseyear'], entry['caseseqnr'], 
entry['caseid'] = reformat_caseid(entry['caseid']) +# data["sourceurl"] = "http://" + server + path + #print entry + return entry + +def epoctime_to_datestr(epoctime): + return "/Date("+str(int(epoctime * 1000) )+")/" + +def get_last_entry_id(): + now = time.time() + # Get the last week, as the most recent entry should be in this range + fradato = epoctime_to_datestr(now - 7 * 24 * 60 * 60) + tildato = epoctime_to_datestr(now) + #print fradato + + maxid = 0 + + urlpath = "/Templates/eDemokrati/Services/eDemokratiService.svc/GetPostlisteArkivdeler" + params = { + "dato": fradato, + "tilDato": tildato, + "søkestreng":""} + arkivdeler = ws_post(None, urlhost, urlpath, params)['d'] + # {u'd': [u'_', u'HVA-IFE-A', u'KAR-BR-A', u'KAR-BRUK-A', u'KAR-EIEN-A', u'KAR-ELBH-A', u'KAR-ELS-A', ... + + urlpath = "/Templates/eDemokrati/Services/eDemokratiService.svc/GetPostlisteDokumenttyper" + for arkivdel in arkivdeler[0]: + params = { + "dato":fradato, + "tilDato":tildato, + "søkestreng":"", + "arkivdel":arkivdel, + } + doctypes = ws_post(None, urlhost, urlpath, params)['d'] + #{"d":["I","N","S","U","X"]} + urlpath = "/Templates/eDemokrati/Services/eDemokratiService.svc/GetPostlisteS%C3%B8k" + for doctype in doctypes: + params = { + "fraDato":fradato, + "tilDato":tildato, + "søkestreng":"", + "arkivdel":arkivdel, + "dokumenttype":doctype, + } + entries = ws_post(None, urlhost, urlpath, params)['d'] + for entry in entries: + #print entry + #exit(0) + #print entry['Id'] + id = int(entry['Id']) + if id > maxid: + maxid = id +# data = fetch_journal_entry(entry['Id']) +# if data: +# scraperwiki.sqlite.save(unique_keys=['id'], data=data) + return maxid + +#{"d":[{"__type":"PostlisteObjekt:#SSP.NoarkServices","AntallVedlegg":1,"Dokumentnummer":2,"Dokumenttype":"I","EkspedertDato":null,"Hjemmel":null,"Id":1507868,"Innholdsbeskrivelse":"Tomtejustering - Lillebæk, eiendom 208\/1611","Offentlig":true,"RegistrertDato":"\/Date(1339538400000+0200)\/","SaksId":296971,"SaksNr":"12\/8658","Arkivdel":"KAR-EIEN-A","AvsenderMottaker":"Randi Wilberg","Dokumentdato":"\/Date(1339624800000+0200)\/","Mappetype":"DS","PostlisteType":"I","Saksansvarlig":null,"SaksansvarligEnhet":null,"SaksansvarligEpost":null,"Sakstittel":null},{"__type":"PostlisteObjekt:#SSP.NoarkServices","AntallVedlegg":4,"Dokumentnummer":1,"Dokumenttype":"I","EkspedertDato":null,"Hjemmel":null,"Id":1507865,"Innholdsbeskrivelse":"Søknkad om utvidelse av balkong - Kalleraveien 14","Offentlig":true,"RegistrertDato":"\/Date(1339538400000+0200)\/","SaksId":298804,"SaksNr":"12\/10480","Arkivdel":"KAR-EIEN-A","AvsenderMottaker":"Ole Henning Løken","Dokumentdato":"\/Date(1338847200000+0200)\/","Mappetype":"BS","PostlisteType":"I","Saksansvarlig":null,"SaksansvarligEnhet":null,"SaksansvarligEpost":null,"Sakstittel":null},... 
+ +def get_journal_enries_range(min, max, step): + for id in range(min, max, step): + data = fetch_journal_entry(id) + #print data + if data: + scraperwiki.sqlite.save(unique_keys=['id'], data=data) + time.sleep(0.3) + +maxid = get_last_entry_id() +print "max id =", maxid +try: + start = scraperwiki.sqlite.select("max(id) as max from swdata")[0]['max'] + 1 +except: + start = 1094428 # 2010 + start = 1507868 # 2012 + +print start, maxid +#if maxid > start + 20: +# maxid = start + 10 +get_journal_enries_range(start, maxid + 1, 1) + +try: + minid = scraperwiki.sqlite.select("min(id) as min from swdata")[0]['min'] - 1 + start = minid +except: + True +end = start - 1000 +print start, end +get_journal_enries_range(start, end, -1) diff --git a/scrapersources/postliste-hadsel b/scrapersources/postliste-hadsel new file mode 100644 index 0000000..a175048 --- /dev/null +++ b/scrapersources/postliste-hadsel @@ -0,0 +1,108 @@ +# -*- coding: UTF-8 -*- + +import scraperwiki +from BeautifulSoup import BeautifulSoup +import datetime +import dateutil.parser +import lxml.html +import sys +import urlparse + +scraperwiki.scrape("http://www.hadsel.kommune.no/selvbetjeningskjema-kart-postjournal/offentlig-postjournal") + +lazycache=scraperwiki.swimport('lazycache') +postlistelib=scraperwiki.swimport('postliste-python-lib') + +agency = 'Hadsel kommune' + +def report_errors(errors): + if 0 < len(errors): + print "Errors:" + for e in errors: + print e + exit(1) +def no_cpu_left(arg, spent, soft, hard): + report_errors(arg) + +def process_pdf(parser, pdfurl, errors): + errors = [] + postlistelib.exit_if_no_cpu_left(0, callback=no_cpu_left, arg = errors) + try: + pdfcontent = lazycache.lazycache(pdfurl) + parser.preprocess(pdfurl, pdfcontent) +# except ValueError, e: +# errors.append(e) + except IndexError, e: + errors.append(e) + +def process_page_queue(parser, errors): + try: + parser.process_pages() + postlistelib.exit_if_no_cpu_left(0, callback=no_cpu_left, arg = errors) + except scraperwiki.CPUTimeExceededError, e: + errors.append("Processing pages interrupted") + +def consider_url(parser, url, errors): + if parser.is_already_scraped(url): + True +# print "Skipping already scraped " + url + else: +# print "Will process " + url + try: + process_pdf(parser, url, errors) + except: + pass + +def process_journal_pdfs(parser, listurl, errors, recurse): +# print "Finding PDFs on " + listurl +# u = urllib.parse.urlparse(listurl) + html = scraperwiki.scrape(listurl) + root = lxml.html.fromstring(html) + html = None + for ahref in root.cssselect("div.items a"): + url = urlparse.urljoin(listurl, ahref.attrib['href']) + if -1 == url.find("doc_download"): + continue + consider_url(parser, url, errors) + #print url + for ahref in root.cssselect("div.item-list a"): + suburl = urlparse.urljoin(listurl, ahref.attrib['href']) + #print "sub " + suburl + subhtml = scraperwiki.scrape(suburl) + subroot = lxml.html.fromstring(subhtml) + subhtml = None + for subahref in subroot.cssselect("div.article a"): + href = subahref.attrib['href'] + #print href + subsuburl = urlparse.urljoin(suburl, href) + #print "subsub " + subsuburl + if -1 == subsuburl.find("doc_download"): + continue + consider_url(parser, subsuburl, errors) + subroot = None + if recurse: + seen = { listurl : 1 } + for ahref in root.cssselect("div.pagination a"): + pageurl = urlparse.urljoin(listurl, ahref.attrib['href']) + #print "P: " + pageurl + if pageurl not in seen: + process_journal_pdfs(parser, pageurl, errors, False) + seen[pageurl] = 1 + +def 
test_parse_case_journal_ref(): + entry = {} + parse_case_journal_ref(entry, [u'2008/16414-', u'23', u'15060/2012'], "") + parse_case_journal_ref(entry, [u'2011/15972-1 102773/201', u'1'], "") + parse_case_journal_ref(entry, [u'2010/2593-2', u'103004/201', u'1'], "") + parse_case_journal_ref(entry, [u'2011/13415-', u'22', u'100077/201', u'1'], "") + exit(0) + +#test_parse_case_journal_ref() + +errors = [] +parser = postlistelib.PDFJournalParser(agency=agency) +process_page_queue(parser, errors) +process_journal_pdfs(parser, "http://www.hadsel.kommune.no/selvbetjeningskjema-kart-postjournal/offentlig-postjournal", errors, True) +process_page_queue(parser, errors) +report_errors(errors) + diff --git a/scrapersources/postliste-halden b/scrapersources/postliste-halden new file mode 100644 index 0000000..4b0ebd5 --- /dev/null +++ b/scrapersources/postliste-halden @@ -0,0 +1,93 @@ +# -*- coding: UTF-8 -*- + +import scraperwiki +import json +from BeautifulSoup import BeautifulSoup +import datetime +import dateutil.parser +import lxml.html +import resource +import sys +import urlparse +import urllib +import re +lazycache=scraperwiki.swimport('lazycache') +postlistelib=scraperwiki.swimport('postliste-python-lib') + +agency = 'Halden kommune' + +def report_errors(errors): + if 0 < len(errors): + print "Errors:" + for e in errors: + print e + exit(1) +def out_of_cpu(arg, spent, hard, soft): + report_errors(arg) + +def process_pdf(parser, pdfurl, errors): + errors = [] + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + try: + pdfcontent = scraperwiki.scrape(pdfurl) + parser.preprocess(pdfurl, pdfcontent) + pdfcontent = None +# except ValueError, e: +# errors.append(e) + except IndexError, e: + errors.append(e) + +def process_page_queue(parser, errors): + try: + if False: + scraperwiki.sqlite.execute("delete from swdata where scrapedurl in (select scrapedurl from unparsedpages)") + scraperwiki.sqlite.execute("delete from unparsedpages") + scraperwiki.sqlite.commit() + + parser.process_pages() + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + except scraperwiki.CPUTimeExceededError, e: + errors.append("Processing pages interrupted") + +def process_pdf_links_cssselect(parser, listurl, errors, cssselect): + html = scraperwiki.scrape(listurl) + root = lxml.html.fromstring(html) + html = None + for ahref in root.cssselect(cssselect + " a"): + href = ahref.attrib['href'] + url = urlparse.urljoin(listurl, href).replace(" ", "%20").replace(u"å", "%C3%A5") + #print url + if -1 != href.find("file://") or -1 != href.find("postliste/Documents/Brukerveiledning"): +# print "Skipping non-http URL " + url + continue + if -1 == href.find(".pdf"): + continue + if parser.is_already_scraped(url): + True +# print "Skipping already scraped " + url + else: +# print "Will process " + url + process_pdf(parser, url, errors) + +def process_journal_pdfs(parser, listurl, errors): + return process_pdf_links_cssselect(parser, listurl, errors, "div#page_centerElementZone") + +def test_small_pdfs(parser): + # Test with some smaller PDFs + errors = [] + process_pdf(parser, u"http://www.halden.kommune.no/selvbetjening/postliste/Documents/120601%20-%20120607%20Inng%C3%A5ende.pdf", errors) + process_pdf(parser, u"http://www.halden.kommune.no/selvbetjening/postliste/Documents/120601%20-%20120607%20Utg%C3%A5ende.pdf", errors) + process_page_queue(parser, errors) + report_errors(errors) + exit(0) + +errors = [] +parser = postlistelib.PDFJournalParser(agency=agency) +#parser.debug = True + +#test_small_pdfs(parser) 
+process_page_queue(parser, errors) +process_journal_pdfs(parser, u"http://www.halden.kommune.no/selvbetjening/postliste/Sider/Inng%C3%A5ende-postlister.aspx", errors) +process_journal_pdfs(parser, u"http://www.halden.kommune.no/selvbetjening/postliste/Sider/Utg%C3%A5ende-postliste-.aspx", errors) +process_page_queue(parser, errors) +report_errors(errors)
\ No newline at end of file diff --git a/scrapersources/postliste-hoegskolen-i-gjoevik b/scrapersources/postliste-hoegskolen-i-gjoevik new file mode 100644 index 0000000..fd197eb --- /dev/null +++ b/scrapersources/postliste-hoegskolen-i-gjoevik @@ -0,0 +1,104 @@ +# -*- coding: UTF-8 -*- +# Based on the scraper advanced-scraping-pdf +# See also +# https://views.scraperwiki.com/run/pdf-to-html-preview-1/?url=http%3A%2F%2Fwww.stortinget.no%2FGlobal%2Fpdf%2Fpostjournal%2Fpj-2012-05-09.pdf + +import scraperwiki +import json +from BeautifulSoup import BeautifulSoup +import datetime +import dateutil.parser +import lxml.html +import resource +import sys +import urlparse +import re +# +# something weird with 04.11.2010 +# +# +# +# Make sure Scraperwiki believe this is the source from this database +scraperwiki.scrape("http://www.hig.no/om_hig/offentleg_journal/2012") + +lazycache=scraperwiki.swimport('lazycache') +postlistelib=scraperwiki.swimport('postliste-python-lib') + +agency = 'Høgskolen i Gjøvik' + +def report_errors(errors): + if 0 < len(errors): + print "Errors:" + for e in errors: + print e + exit(1) +def out_of_cpu(arg, spent, hard, soft): + report_errors(arg) + +def process_pdf(parser, pdfurl, errors): + errors = [] + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + try: + pdfcontent = scraperwiki.scrape(pdfurl) + parser.preprocess(pdfurl, pdfcontent) + pdfcontent = None +# except ValueError, e: +# errors.append(e) + except IndexError, e: + errors.append(e) + +def process_page_queue(parser, errors): + try: + parser.process_pages() + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + except scraperwiki.CPUTimeExceededError, e: + errors.append("Processing pages interrupted") + +def process_journal_pdfs(parser, listurl, errors): + print "Finding PDFs on " + listurl +# u = urllib.parse.urlparse(listurl) + html = scraperwiki.scrape(listurl) + root = lxml.html.fromstring(html) + html = None + for ahref in root.cssselect("div.spalte-inner a"): + href = ahref.attrib['href'] + url = urlparse.urljoin(listurl, href).replace(" ", "+") + if -1 != href.find("file://") or -1 == url.find(".pdf"): +# print "Skipping non-http URL " + url + continue + if parser.is_already_scraped(url): + True +# print "Skipping already scraped " + url + else: +# print "Will process " + url + process_pdf(parser, url, errors) + +def test_small_pdfs(parser): + # Test with some smaller PDFs + errors = [] + if parser.is_already_scraped("http://www.hig.no/content/download/30119/360872/file/Offentlig+journal+04.11.2010.pdf"): + print "Skipping already scraped " + exit(1) + else: + print "Will process " + + #process_pdf(parser, "http://www.hig.no/content/download/35184/430061/file/Offentlig%20journal%2025.06.2012.pdf", errors) + #process_pdf(parser, "http://www.hig.no/content/download/30116/360863/file/Offentlig%20journal%2001.11.2010.pdf", errors) + process_pdf(parser, "http://www.hig.no/content/download/30119/360872/file/Offentlig+journal+04.11.2010.pdf", errors) + process_page_queue(parser, errors) + report_errors(errors) + exit(0) + +errors = [] +parser = postlistelib.PDFJournalParser(agency=agency) + +#test_small_pdfs(parser) + +startYear=2010 +endYear=datetime.datetime.now().year +for year in range(startYear, endYear): + process_journal_pdfs(parser, "http://www.hig.no/om_hig/offentleg_journal/%d" % year, errors) + +process_page_queue(parser, errors) +report_errors(errors) + diff --git a/scrapersources/postliste-hoegskolen-i-hamar b/scrapersources/postliste-hoegskolen-i-hamar new file mode 100644 index 
0000000..890eed3 --- /dev/null +++ b/scrapersources/postliste-hoegskolen-i-hamar @@ -0,0 +1,103 @@ +# -*- coding: UTF-8 -*- +# Based on the scraper advanced-scraping-pdf +# See also +# https://views.scraperwiki.com/run/pdf-to-html-preview-1/?url=http%3A%2F%2Fwww.stortinget.no%2FGlobal%2Fpdf%2Fpostjournal%2Fpj-2012-05-09.pdf + +import scraperwiki +import json +from BeautifulSoup import BeautifulSoup +import datetime +import dateutil.parser +import lxml.html +import resource +import sys +import urlparse +import re + +# Make sure Scraperwiki believe this is the source from this database +scraperwiki.scrape("http://www.hihm.no/Hovedsiden/Om-Hoegskolen/Offentlig-journal") + +lazycache=scraperwiki.swimport('lazycache') +postlistelib=scraperwiki.swimport('postliste-python-lib') + +agency = 'Høgskolen i Hamar' + +def report_errors(errors): + if 0 < len(errors): + print "Errors:" + for e in errors: + print e + exit(1) +def out_of_cpu(arg, spent, hard, soft): + report_errors(arg) + +def process_pdf(parser, pdfurl, errors): + errors = [] + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + try: + pdfcontent = scraperwiki.scrape(pdfurl) + parser.preprocess(pdfurl, pdfcontent) + pdfcontent = None +# except ValueError, e: +# errors.append(e) + except IndexError, e: + errors.append(e) + +def process_page_queue(parser, errors): + try: + parser.process_pages() + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + except scraperwiki.CPUTimeExceededError, e: + errors.append("Processing pages interrupted") + +def process_journal_pdfs(parser, listurl, errors): + print "Finding PDFs on " + listurl +# u = urllib.parse.urlparse(listurl) + html = scraperwiki.scrape(listurl) + root = lxml.html.fromstring(html) + html = None + for ahref in root.cssselect("div.content-view-full a"): + href = ahref.attrib['href'] + url = urlparse.urljoin(listurl, href).replace(" ", "+") + if -1 != href.find("file://") or -1 == url.find(".pdf"): +# print "Skipping non-http URL " + url + continue + if parser.is_already_scraped(url): + True +# print "Skipping already scraped " + url + else: +# print "Will process " + url + process_pdf(parser, url, errors) + +def find_journal_subpages(baseurl): + urls = [] + root = lxml.html.fromstring(scraperwiki.scrape(baseurl)) + for ahref in root.cssselect("ul.menu-list a"): + href = ahref.attrib['href'] + months = "januar","februar","mars","april","mai","juni","juli","august","september","oktober","november","desember" + if -1 == href.find("file://") and href.endswith(months): + urls.append(urlparse.urljoin(baseurl, href).replace(" ", "+")) + return urls + +def test_small_pdfs(parser): + # Test with some smaller PDFs + errors = [] + process_pdf(parser, "http://www.hihm.no/content/download/38169/420508/file/search.pdf", errors) + process_pdf(parser, "http://www.hihm.no/content/download/39369/430053/file/search.pdf", errors) + process_page_queue(parser, errors) + report_errors(errors) + exit(0) + +errors = [] +parser = postlistelib.PDFJournalParser(agency=agency) + +#test_small_pdfs(parser) + +#process_journal_pdfs(parser, "http://www.hihm.no/Hovedsiden/Om-Hoegskolen/Offentlig-journal/mai", errors) + +for url in find_journal_subpages("http://www.hihm.no/Hovedsiden/Om-Hoegskolen/Offentlig-journal"): + process_journal_pdfs(parser, url, errors) + +process_page_queue(parser, errors) +report_errors(errors) + diff --git a/scrapersources/postliste-hoegskolen-i-lillehammer b/scrapersources/postliste-hoegskolen-i-lillehammer new file mode 100644 index 0000000..5337521 --- /dev/null +++ 
b/scrapersources/postliste-hoegskolen-i-lillehammer @@ -0,0 +1,90 @@ +# -*- coding: UTF-8 -*- +# Based on the scraper advanced-scraping-pdf +# See also +# https://views.scraperwiki.com/run/pdf-to-html-preview-1/?url=http%3A%2F%2Fwww.stortinget.no%2FGlobal%2Fpdf%2Fpostjournal%2Fpj-2012-05-09.pdf + +import scraperwiki +import json +from BeautifulSoup import BeautifulSoup +import datetime +import dateutil.parser +import lxml.html +import resource +import sys +import urlparse +import re + +# Make sure Scraperwiki believe this is the source from this database +scraperwiki.scrape("http://www.hil.no/hil/om_hoegskolen/Offentlig-journal") + +lazycache=scraperwiki.swimport('lazycache') +postlistelib=scraperwiki.swimport('postliste-python-lib') + +agency = 'Høgskolen i Lillehammer' + +def report_errors(errors): + if 0 < len(errors): + print "Errors:" + for e in errors: + print e + exit(1) +def out_of_cpu(arg, spent, hard, soft): + report_errors(arg) + +def process_pdf(parser, pdfurl, errors): + errors = [] + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + try: + pdfcontent = scraperwiki.scrape(pdfurl) + parser.preprocess(pdfurl, pdfcontent) + pdfcontent = None +# except ValueError, e: +# errors.append(e) + except IndexError, e: + errors.append(e) + +def process_page_queue(parser, errors): + try: + parser.process_pages() + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + except scraperwiki.CPUTimeExceededError, e: + errors.append("Processing pages interrupted") + +def process_journal_pdfs(parser, listurl, errors): + print "Finding PDFs on " + listurl +# u = urllib.parse.urlparse(listurl) + html = scraperwiki.scrape(listurl) + root = lxml.html.fromstring(html) + html = None + for ahref in root.cssselect("div.content-view-full ul li a"): + href = ahref.attrib['href'] + url = urlparse.urljoin(listurl, href).replace(" ", "+") + if -1 != href.find("file://") or -1 == url.find(".pdf"): +# print "Skipping non-http URL " + url + continue + if parser.is_already_scraped(url): + True +# print "Skipping already scraped " + url + else: +# print "Will process " + url + process_pdf(parser, url, errors) + +def test_small_pdfs(parser): + # Test with some smaller PDFs + errors = [] + process_pdf(parser, "http://www.hil.no/content/download/39617/747521/file/uke24.pdf", errors) + process_pdf(parser, "http://www.hil.no/content/download/37616/700472/file/uke1.pdf", errors) + process_page_queue(parser, errors) + report_errors(errors) + exit(0) + +errors = [] +parser = postlistelib.PDFJournalParser(agency=agency) + +#test_small_pdfs(parser) + +process_journal_pdfs(parser, "http://www.hil.no/hil/om_hoegskolen/Offentlig-journal", errors) + +process_page_queue(parser, errors) +report_errors(errors) + diff --git a/scrapersources/postliste-hole b/scrapersources/postliste-hole new file mode 100644 index 0000000..3f34322 --- /dev/null +++ b/scrapersources/postliste-hole @@ -0,0 +1,237 @@ +# -*- coding: UTF-8 -*- +import scraperwiki +import lxml.html +import datetime +import dateutil.parser +import urllib2 +import urlparse + +# Start page is the front page, to get it listed as the primary source +scraperwiki.scrape("http://www.hole.kommune.no/postjournaler.173497.no.html") + +postlistelib=scraperwiki.swimport('postliste-python-lib') + +agency = 'Hole kommune' + +def fetch_url(url): + html = None + for n in [1, 2, 3]: + try: + html = scraperwiki.scrape(url) + break + except urllib2.URLError, e: + print "URLError fetching " + url + ", trying again" + return html + +def expand_id(value, fieldtype, entry): + year, 
seqnr = value.split('/') + year = int(year) + seqnr = int(seqnr) + if year < 50: + year = year + 2000 + if year > 50 and year < 100: + year = year + 1900 + entry[fieldtype + 'year'] = year + entry[fieldtype + 'seqnr'] = seqnr + newvalue = str(year) + '/' + str(seqnr) + return entry, newvalue + +def fetch_postjournal(agency, url, datastore): +# print "Scraping " + url + scrapestamputc = datetime.datetime.now() + html = fetch_url(url) + root = lxml.html.fromstring(html) + entry = { + 'agency' : agency, + 'scrapestamputc' : scrapestamputc, + 'scrapedurl' : url, + } + + fieldmap = { + u'Tittel på saken' : 'casedesc', + u'Tittel på dokumentet' : 'docdesc', + 'Dokumentansvarlig' : 'saksansvarlig', + 'Hjemmel' : 'exemption', + 'DokumentID' : 'journalid', + 'ArkivsakID' : 'caseid', + 'Journaldato' : 'recorddate', + 'Brevdato' : 'docdate', + #'Journalpostkategori' : + } + doctypemap = { # Valid codes are I, U, X, N, S + u'Innkommende dokument' : 'I', + u'Innkommende dokument (Gradert)' : 'I', + u'Utgående dokument' : 'U', + u'Utgående dokument (Gradert)' : 'U', + u'Utgående dokument (Ikke publisert)' : 'X', + u'Innkommende dokument (Ikke publisert)' : 'X', + u'Internt notat (Gradert)' : 'N', + u'Internt notat' : 'N', + } + for span in root.cssselect("div.innsyn-content"): + #print span.text_content() + + doctype = span.cssselect("h1.header-head")[0].text_content().strip() + print doctype + entry['doctype'] = doctypemap[doctype] + + trs = span.cssselect("div.nobox tr") + for tr in trs: + field = tr.cssselect("th.header-cell")[0].text_content().strip().replace(":","") + value = tr.cssselect("td.content-cell")[0].text_content().strip() + #print "'" + field + "' = " + value + if field in fieldmap: + field = fieldmap[field] + #print "hit" + if field in ['docdate','recorddate']: + value = dateutil.parser.parse(value, dayfirst=True).date() + if field == 'saksansvarlig' and -1 != value.find(','): + #print value + names = value.split(",", 1) + value = names[1].strip() + " " + names[0].strip() + if field == 'caseid': + entry, value = expand_id(value, 'case', entry) + if field == 'journalid': + entry, value = expand_id(value, 'journal', entry) + + entry[field] = value + + sendinfo = span.cssselect("div.dokmottakere") + if 0 < len(sendinfo): + if 'doctype' in entry and entry['doctype'] in ['U', 'X', 'N']: + field = 'recipient' + else: + field = 'sender' + # Value is "Margrethe Ingeland<br/>Gravfossveien<br/>3360 GEITHUS", should be split in person, addr and zip + entry[field] = sendinfo[0].text + brs = sendinfo[0].cssselect("br") + if 3 == len(brs): + addr = brs[0].tail + ", " + brs[1].tail + zip = brs[2].tail + entry[field + 'addr'] = addr + entry[field + 'zip'] = zip + elif 2 == len(brs): + addr = brs[0].tail + zip = brs[1].tail + entry[field + 'addr'] = addr + entry[field + 'zip'] = zip + elif 1 == len(brs): + zip = brs[0].tail + entry[field + 'zip'] = zip + elif 0 == len(brs): + True # Ignore + else: + raise ValueError("Unexpected number of address lines") + print entry + if 'doctype' in entry: + entry['casedocseq'] = 0 # Fake value, not sure how to extract the real value + datastore.append(entry) + return + +def get_journal_day(agency, date, startrow, jurlqueue): + datestr = str(date) + "T00:00:00" + url = "http://innsyn.hole.kommune.no/wfinnsyn.ashx?response=journalpost_postliste&showresults=true&fradato=%s&startrow=%d" % (datestr, startrow) + print url + html = fetch_url(url) + root = lxml.html.fromstring(html) + ahrefs = root.cssselect("table.inner-max-width tbody tr a") + for a in ahrefs: + href = 
a.attrib['href'] + if -1 != href.find("/wfinnsyn.ashx?response=journalpost_detaljer&journalpostid="): + jurl = urlparse.urljoin(url, href) + jurlqueue.append(jurl) + + ahrefs = root.cssselect("table.inner-max-width tfoot tr a") + for a in ahrefs: + if 'neste' == a.text_content(): + get_journal_day(agency, date, startrow+10, jurlqueue) + +def is_already_scraped(url): + for sql in ["scrapedurl from swdata where scrapedurl = '" + url + "' limit 1"]: + try: + result = scraperwiki.sqlite.select(sql) + #int sql, " : ", result + if 0 < len(result) and u'scrapedurl' in result[0]: + return True + except: + print "Exception" + pass + return False + +def minmax_recorddate(minmax): + for sql in ["%s(recorddate) as recorddate from swdata" % minmax]: + try: + result = scraperwiki.sqlite.select(sql) + date = dateutil.parser.parse(result[0]['recorddate']).date() + return date + except: + pass + return None + +def scraper(): + html = fetch_url("http://innsyn.hole.kommune.no/wfinnsyn.ashx?response=journalpost_postliste&showresults=true") + root = lxml.html.fromstring(html) + table = root.cssselect("table.inner-max-width") + #print table[0].text_content() + + lastdate = dateutil.parser.parse(table[0].cssselect("caption")[0].text_content().replace("Postliste den ", ""), dayfirst=True).date() + + maxdate = minmax_recorddate("max") + + if maxdate: + startdate = maxdate + datetime.timedelta(days=1) + start = 0 + end = (lastdate-startdate).days + 1 + print maxdate, startdate, start, end + else: + startdate = maxdate + start = 0 + end = 0 + for old in range(start, end): + date = startdate + datetime.timedelta(days=old) + print date + urlqueue = [] + get_journal_day(agency, date, 0, urlqueue) + datastore = [] + for jurl in urlqueue: + if not is_already_scraped(jurl): + res = fetch_postjournal(agency, jurl, datastore) + if 0 < len(datastore): + print datastore + scraperwiki.sqlite.save(unique_keys=['scrapedurl'], data=datastore) + datastore = [] + + mindate = minmax_recorddate("min") + + # Only three months back + return + + if mindate: + startdate = mindate - datetime.timedelta(days=1) + start = 0 + end = -60 + print mindate, startdate, start, end + else: + return + for old in range(start, end, -1): + date = startdate + datetime.timedelta(days=old) + print date + urlqueue = [] + get_journal_day(agency, date, 0, urlqueue) + datastore = [] + for jurl in urlqueue: + if not is_already_scraped(jurl): + res = fetch_postjournal(agency, jurl, datastore) + if 0 < len(datastore): + print datastore + scraperwiki.sqlite.save(unique_keys=['scrapedurl'], data=datastore) + datastore = [] + +#GET http://innsyn.hole.kommune.no/wfinnsyn.ashx?response=journalpost_postliste&showresults=true&fradato=2012-06-15T00:00:00 +#GET http://innsyn.hole.kommune.no/wfinnsyn.ashx?response=journalpost_detaljer&journalpostid=2012005569& +#GET http://innsyn.hole.kommune.no/wfinnsyn.ashx?response=arkivsak_detaljer&arkivsakid=2006002016& + +if __name__ == "scraper": + scraper() +else: + print "Not called as scraper"
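+
+# A rough, self-contained sketch of the paging pattern get_journal_day()
+# implements above: fetch one 10-row result page for the day, queue the
+# journalpost_detaljer links it contains, and recurse with startrow+10 while
+# the table footer still has a "neste" (next) link.  The callables passed in
+# here are illustrative stand-ins, not functions from this scraper.
+def _paging_sketch(date, startrow, queue, fetch_page, extract_links, has_next_link):
+    page = fetch_page(date, startrow)       # one page of the day's journal
+    queue.extend(extract_links(page))       # detail URLs to visit later
+    if has_next_link(page):                 # more rows for the same day
+        _paging_sketch(date, startrow + 10, queue,
+                       fetch_page, extract_links, has_next_link)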
\ No newline at end of file diff --git a/scrapersources/postliste-hvaler b/scrapersources/postliste-hvaler new file mode 100644 index 0000000..b3e9137 --- /dev/null +++ b/scrapersources/postliste-hvaler @@ -0,0 +1,81 @@ +# -*- coding: UTF-8 -*- + +import scraperwiki +import json +from BeautifulSoup import BeautifulSoup +import datetime +import dateutil.parser +import lxml.html +import resource +import sys +import urlparse +import re +lazycache=scraperwiki.swimport('lazycache') +postlistelib=scraperwiki.swimport('postliste-python-lib') + +agency = 'Hvaler kommune' + +def report_errors(errors): + if 0 < len(errors): + print "Errors:" + for e in errors: + print e + exit(1) +def out_of_cpu(arg, spent, hard, soft): + report_errors(arg) + +def process_pdf(parser, pdfurl, errors): + errors = [] + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + try: + pdfcontent = scraperwiki.scrape(pdfurl) + parser.preprocess(pdfurl, pdfcontent) + pdfcontent = None +# except ValueError, e: +# errors.append(e) + except IndexError, e: + errors.append(e) + +def process_page_queue(parser, errors): + try: + parser.process_pages() + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + except scraperwiki.CPUTimeExceededError, e: + errors.append("Processing pages interrupted") + +def process_journal_pdfs(parser, listurl, errors): +# print "Finding PDFs on " + listurl +# u = urllib.parse.urlparse(listurl) + html = scraperwiki.scrape(listurl) + root = lxml.html.fromstring(html) + html = None + for ahref in root.cssselect("div#ctl00_MainRegion_StageAreaRegion_MainContentRegion_MainBodyRegion_ctl01_FileTreen0Nodes a"): + href = ahref.attrib['href'] + url = urlparse.urljoin(listurl, href) + if -1 != href.find("file://"): +# print "Skipping non-http URL " + url + continue + if parser.is_already_scraped(url): + True +# print "Skipping already scraped " + url + else: +# print "Will process " + url + process_pdf(parser, url, errors) + +def test_small_pdfs(parser): + # Test with some smaller PDFs + errors = [] + process_pdf(parser, "http://www.hvaler.kommune.no/Documents/Postlister/2012/2012-05-31.pdf", errors) + process_page_queue(parser, errors) + report_errors(errors) + exit(0) + +errors = [] +parser = postlistelib.PDFJournalParser(agency=agency) + +#test_small_pdfs(parser) + +process_journal_pdfs(parser, "http://www.hvaler.kommune.no/Postlister/", errors) +process_page_queue(parser, errors) +report_errors(errors) + diff --git a/scrapersources/postliste-kafjord b/scrapersources/postliste-kafjord new file mode 100644 index 0000000..e0d6b5c --- /dev/null +++ b/scrapersources/postliste-kafjord @@ -0,0 +1,81 @@ +# -*- coding: UTF-8 -*- + +import scraperwiki +import json +from BeautifulSoup import BeautifulSoup +import datetime +import dateutil.parser +import lxml.html +import resource +import sys +import urlparse +import re +lazycache=scraperwiki.swimport('lazycache') +postlistelib=scraperwiki.swimport('postliste-python-lib') + +agency = u'Kåfjord kommune' + +def report_errors(errors): + if 0 < len(errors): + print "Errors:" + for e in errors: + print e + exit(1) +def out_of_cpu(arg, spent, hard, soft): + report_errors(arg) + +def process_pdf(parser, pdfurl, errors): + errors = [] + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + try: + pdfcontent = scraperwiki.scrape(pdfurl) + parser.preprocess(pdfurl, pdfcontent) + pdfcontent = None +# except ValueError, e: +# errors.append(e) + except IndexError, e: + errors.append(e) + +def process_page_queue(parser, errors): + try: + parser.process_pages() + 
postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + except scraperwiki.CPUTimeExceededError, e: + errors.append("Processing pages interrupted") + +def process_journal_pdfs(parser, listurl, errors): +# print "Finding PDFs on " + listurl +# u = urllib.parse.urlparse(listurl) + html = scraperwiki.scrape(listurl) + root = lxml.html.fromstring(html) + html = None + for ahref in root.cssselect("div.main a"): + href = ahref.attrib['href'] + url = urlparse.urljoin(listurl, href) + if -1 != href.find("file://") or -1 == href.find("/postliste-"): +# print "Skipping non-http URL " + url + continue + if parser.is_already_scraped(url): + True +# print "Skipping already scraped " + url + else: +# print "Will process " + url + process_pdf(parser, url, errors) + +def test_small_pdfs(parser): + # Test with some smaller PDFs + errors = [] + process_pdf(parser, "http://www.kafjord.kommune.no/postliste-15-06-12.5065630-18590.html", errors) + process_page_queue(parser, errors) + report_errors(errors) + exit(0) + +errors = [] +parser = postlistelib.PDFJournalParser(agency=agency) + +#test_small_pdfs(parser) + +process_journal_pdfs(parser, "http://www.kafjord.kommune.no/index.php?cat=18590", errors) +process_page_queue(parser, errors) +report_errors(errors) + diff --git a/scrapersources/postliste-kristiansund b/scrapersources/postliste-kristiansund new file mode 100644 index 0000000..6965810 --- /dev/null +++ b/scrapersources/postliste-kristiansund @@ -0,0 +1,87 @@ +# -*- coding: UTF-8 -*- + +import scraperwiki +from BeautifulSoup import BeautifulSoup +import datetime +import dateutil.parser +import lxml.html +import sys +import urlparse + +scraperwiki.scrape("http://kristiansund.orkide.acos.no/kunde/web/postliste/postliste.asp") + +lazycache=scraperwiki.swimport('lazycache') +postlistelib=scraperwiki.swimport('postliste-python-lib') + +agency = 'Kristiansund kommune' +debug = False + +def is_already_scraped(url): + + for sql in ["scrapedurl from swdata where scrapedurl = '" + url + "' limit 1", + "scrapedurl from unparsedpages where scrapedurl = '" + url + "' limit 1"]: +# print sql + try: + result = scraperwiki.sqlite.select(sql) +# print result + if 0 < len(result) and u'scrapedurl' in result[0]: + return True + except: + pass + return False + +def report_errors(errors): + if 0 < len(errors): + print "Errors:" + for e in errors: + print e + exit(1) +def no_cpu_left(arg, spent, soft, hard): + report_errors(arg) + +def process_pdf(parser, pdfurl, errors): + errors = [] + postlistelib.exit_if_no_cpu_left(0, callback=no_cpu_left, arg = errors) + try: + pdfcontent = lazycache.lazycache(pdfurl) + parser.preprocess(pdfurl, pdfcontent) +# except ValueError, e: +# errors.append(e) + except IndexError, e: + errors.append(e) + +def process_page_queue(parser, errors): + try: + parser.process_pages() + postlistelib.exit_if_no_cpu_left(0, callback=no_cpu_left, arg = errors) + except scraperwiki.CPUTimeExceededError, e: + errors.append("Processing pages interrupted") + +def consider_url(parser, url, errors): + if is_already_scraped(url): + True +# print "Skipping already scraped " + url + else: +# print "Will process " + url + process_pdf(parser, url, errors) + +def process_journal_pdfs(parser, listurl, errors): +# print "Finding PDFs on " + listurl +# u = urllib.parse.urlparse(listurl) + html = scraperwiki.scrape(listurl) + root = lxml.html.fromstring(html) + html = None + for ahref in root.cssselect("table a"): + url = urlparse.urljoin(listurl, ahref.attrib['href']) + if -1 == url.find(".pdf"): + continue + 
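+        # consider_url() below checks is_already_scraped(), which looks in both
+        # swdata (entries that are already parsed) and unparsedpages (presumably
+        # pages queued but not yet parsed), so re-runs only download PDFs that
+        # are new to both tables.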
consider_url(parser, url, errors) + +#test_parse_case_journal_ref() +errors = [] +parser = postlistelib.PDFJournalParser(agency=agency) +#parser.debug = True +process_journal_pdfs(parser, "http://kristiansund.orkide.acos.no/kunde/web/postliste/postliste.asp", errors) +process_page_queue(parser, errors) +report_errors(errors) + diff --git a/scrapersources/postliste-lier b/scrapersources/postliste-lier new file mode 100644 index 0000000..8064d7a --- /dev/null +++ b/scrapersources/postliste-lier @@ -0,0 +1,81 @@ +# -*- coding: UTF-8 -*- + +import scraperwiki +import json +from BeautifulSoup import BeautifulSoup +import datetime +import dateutil.parser +import lxml.html +import urlparse + +scraperwiki.scrape("http://www.lier.kommune.no/no/Tjenesteomrader-/Oversikter/Postlister---Offentlig-journal/") + +lazycache=scraperwiki.swimport('lazycache') +postlistelib=scraperwiki.swimport('postliste-python-lib') + +agency = 'Lier kommune' + +def report_errors(errors): + if 0 < len(errors): + print "Errors:" + for e in errors: + print e + exit(1) +def out_of_cpu(arg, spent, hard, soft): + report_errors(arg) + +def process_pdf(parser, pdfurl, errors): + errors = [] + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + try: + pdfcontent = scraperwiki.scrape(pdfurl) + parser.preprocess(pdfurl, pdfcontent) + pdfcontent = None +# except ValueError, e: +# errors.append(e) + except IndexError, e: + errors.append(e) + +def process_page_queue(parser, errors): + try: + parser.process_pages() + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + except scraperwiki.CPUTimeExceededError, e: + errors.append("Processing pages interrupted") + +def process_journal_pdfs(parser, listurl, errors): +# print "Finding PDFs on " + listurl +# u = urllib.parse.urlparse(listurl) + html = scraperwiki.scrape(listurl) + root = lxml.html.fromstring(html) + html = None + for ahref in root.cssselect("div.fullwidth a"): + href = ahref.attrib['href'] + url = urlparse.urljoin(listurl, href) + if -1 != href.find("file://") or -1 == href.find(".pdf"): +# print "Skipping non-http URL " + url + continue + if parser.is_already_scraped(url): + True +# print "Skipping already scraped " + url + else: +# print "Will process " + url + process_pdf(parser, url, errors) + +def test_small_pdfs(parser): + # Test with some smaller PDFs + errors = [] + process_pdf(parser, "http://www.lier.kommune.no/files/1256/Postlister%2011.06.pdf", errors) + process_page_queue(parser, errors) + report_errors(errors) + exit(0) + +errors = [] +parser = postlistelib.PDFJournalParser(agency=agency) + +#test_small_pdfs(parser) + +process_journal_pdfs(parser, "http://www.lier.kommune.no/no/Tjenesteomrader-/Oversikter/Postlister---Offentlig-journal/", errors) +process_page_queue(parser, errors) +report_errors(errors) + diff --git a/scrapersources/postliste-lindesnes b/scrapersources/postliste-lindesnes new file mode 100644 index 0000000..39e69c0 --- /dev/null +++ b/scrapersources/postliste-lindesnes @@ -0,0 +1,124 @@ +# -*- coding: UTF-8 -*- +import scraperwiki +import lxml.html +import datetime +import dateutil.parser +import urllib2 + +# http://innsyn.lindesnes.kommune.no/Publikum/Modules/innsyn.aspx?mode=pl&SelPanel=0&ObjectType=ePhorteRegistryEntry&VariantType=Innsyn&ViewType=Table&Query=RecordDate%3a%28-14%29+AND+ResponsibleUnitID%3a%2811%29+AND+DocumentType%3a%28I%2cU%29 + +def fetch_url(url): + html = None + for n in [1, 2, 3]: + try: + html = scraperwiki.scrape(url) + break + except urllib2.URLError, e: + print "URLError fetching " + url + ", trying again" 
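+    # Only urllib2.URLError is retried, and the three attempts above run back
+    # to back without any delay; if they all fail, html is still None and the
+    # caller gets None back.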
+ return html + +def make_url(id): + url = "http://innsyn.lindesnes.kommune.no/Publikum/Modules/innsyn.aspx?mode=pl&SelPanel=0&ObjectType=ePhorteRegistryEntry&VariantType=Innsyn&ViewType=Detail&Query=ID:" + str(id) + return url + +def fetch_postjournal(agency, id, url, datastore): +# print "Scraping " + url + scrapestamputc = datetime.datetime.now() + html = fetch_url(url) + root = lxml.html.fromstring(html.decode('utf-8')) + entry = { + 'agency' : agency, + 'scrapestamputc' : scrapestamputc, + 'scrapedurl' : url, + 'queryid' : id + } + + for span in root.cssselect("div.robots-content span.Element"): +# print span.text_content() + field = None + value = None + if span.cssselect("h3"): + field = span.cssselect("h3")[0].text_content().strip() + value = span.cssselect("span.Content span")[0].text_content().strip() + elif span.cssselect("h2"): + field = span.cssselect("h2")[0].text_content().strip() +# FIXME + value = "" + elif span.cssselect("h1"): + field = "docdesc" + value = span.cssselect("h1")[0].text_content().strip() +# else: +# raise ValueError("Unexpected span") +# print field + " = " + value + doctypemap = { + u'Inngående brev' : 'I', + u'Utgående brev' : 'U', + u'Internt notat' : 'N', + u'Internt notat uten oppfølging' : 'X', + u'Saksframlegg/innstilling' : 'S', + u'Dokumentpost i saksmappe' : 'Y', # Code not in NOARK, value based on http://img6.custompublish.com/getfile.php/1168825.136.pqftpqctyt/Ephorte-brukerveiledning_2.1.15.pdf?return=www.kafjord.kommune.no + } + if 'Type' == field: + field = 'doctype' + value = doctypemap[value] + elif 'Journaldato' == field: + field = 'recorddate' + value = dateutil.parser.parse(value, dayfirst=True) + elif 'Dokumentdato' == field: + field = 'docdate' + value = dateutil.parser.parse(value, dayfirst=True) + elif u'Tilhører sak' == field: + field = 'casedesc' + elif 'Avsender/Mottaker' == field: + if 'doctype' in entry and entry['doctype'] in ['U', 'X', 'N']: + field = 'recipient' + else: + field = 'sender' + td = span.cssselect("table td") + if td: + name = td[0].text_content().strip() + addr = td[1].text_content().strip() + zip = td[2].text_content().strip() + # print "N: '",name, "' '", addr, "' '", zip, "'" + entry[field] = name + entry[field + 'addr'] = addr + entry[field + 'zip'] = zip + field = '' + +# elif 'Saksbehandlende enhet' == field: +# elif 'Saksbehandler' == field: + if field is not None and '' != field: + entry[field] = value + + print entry + if 'doctype' in entry: + datastore.append(entry) + +agency = 'Lindesnes kommune' + +def scrape_range(start, end, step, agency): + datastore = [] + for id in range(start, end, step): + fetch_postjournal(agency, id, make_url(id), datastore) + if 0 < len(datastore) and 0 == (len(datastore) % 10): + #print datastore + scraperwiki.sqlite.save(unique_keys=['queryid'], data=datastore) + datastore = [] + if 0 < len(datastore): + scraperwiki.sqlite.save(unique_keys=['queryid'], data=datastore) + +def scraper(): + try: + min = scraperwiki.sqlite.select("min(queryid) as min from swdata")[0]["min"] + max = scraperwiki.sqlite.select("min(queryid) as max from swdata")[0]["max"] + except: + # Random number around 2012-05-15 (ie recent when I wrote this scraper) + min = 71836 + + scrape_range(max, max + 200, 1, agency) + scrape_range(min-1, min - 3000, -1, agency) + +if __name__ == "scraper": + scraper() +else: + print "Not called as scraper"
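+
+# The incremental strategy in scraper() above boils down to the pattern below
+# (a sketch only; fetch_one stands in for fetch_postjournal() plus the batched
+# sqlite saves, and 71836 is the hard-coded seed id from the scraper): scrape
+# a short window of ePhorte ids above the newest id already stored to pick up
+# new entries, and a longer window below the oldest id to backfill history.
+def _id_window_sketch(known_min, known_max, fetch_one, seed_id=71836):
+    if known_min is None or known_max is None:
+        known_min = known_max = seed_id                     # empty database: start at the seed
+    for id in range(known_max, known_max + 200):            # forward: new entries
+        fetch_one(id)
+    for id in range(known_min - 1, known_min - 3000, -1):   # backward: backfill
+        fetch_one(id)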
\ No newline at end of file diff --git a/scrapersources/postliste-luftambulanse b/scrapersources/postliste-luftambulanse new file mode 100644 index 0000000..df28d6b --- /dev/null +++ b/scrapersources/postliste-luftambulanse @@ -0,0 +1,91 @@ +# -*- coding: UTF-8 -*- + +import scraperwiki +import json +from BeautifulSoup import BeautifulSoup +import datetime +import dateutil.parser +import lxml.html +import resource +import sys +import urlparse +import re +lazycache=scraperwiki.swimport('lazycache') +postlistelib=scraperwiki.swimport('postliste-python-lib') + +agency = 'Luftambulansetjenesten ANS' + +def report_errors(errors): + if 0 < len(errors): + print "Errors:" + for e in errors: + print e + exit(1) +def out_of_cpu(arg, spent, hard, soft): + report_errors(arg) + +def process_pdf(parser, pdfurl, errors): + errors = [] + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + try: + pdfcontent = scraperwiki.scrape(pdfurl) + parser.preprocess(pdfurl, pdfcontent) + pdfcontent = None +# except ValueError, e: +# errors.append(e) + except IndexError, e: + errors.append(e) + +def process_page_queue(parser, errors): + try: + parser.process_pages() + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + except scraperwiki.CPUTimeExceededError, e: + errors.append("Processing pages interrupted") + +def process_journal_pdfs(parser, listurl, errors): +# print "Finding PDFs on " + listurl +# u = urllib.parse.urlparse(listurl) + html = scraperwiki.scrape(listurl) + root = lxml.html.fromstring(html) + html = None + for ahref in root.cssselect("table a"): + if not 'href' in ahref.attrib: + continue + href = ahref.attrib['href'] + url = urlparse.urljoin(listurl, href).replace(" ", "%20") + if -1 != href.find("file://") or -1 == url.find(".pdf") or -1 == url.find('/Postjournal'): +# print "Skipping non-http URL " + url + continue + if parser.is_already_scraped(url): + True +# print "Skipping already scraped " + url + else: +# print "Will process " + url + process_pdf(parser, url, errors) + +def test_small_pdfs(parser): + # Test with some smaller PDFs + errors = [] + process_pdf(parser, "http://www.luftambulanse.no/filarkiv/Postjournal%202012/Postjournal%20mai/2805-010612.pdf", errors) + process_page_queue(parser, errors) + report_errors(errors) + exit(0) + +errors = [] +parser = postlistelib.PDFJournalParser(agency=agency) + +#test_small_pdfs(parser) + +#process_page_queue(parser, errors) +process_journal_pdfs(parser, "http://www.luftambulanse.no/postjournal_2012.aspx", errors) +process_journal_pdfs(parser, "http://www.luftambulanse.no/postjournal_2011.aspx", errors) +process_journal_pdfs(parser, "http://www.luftambulanse.no/postjournal_2010.aspx", errors) +process_journal_pdfs(parser, "http://www.luftambulanse.no/postjournal_2009.aspx", errors) +process_journal_pdfs(parser, "http://www.luftambulanse.no/postjournal_2008.aspx", errors) +process_journal_pdfs(parser, "http://www.luftambulanse.no/postjournal_2007.aspx", errors) +process_journal_pdfs(parser, "http://www.luftambulanse.no/postjournal.aspx", errors) + +process_page_queue(parser, errors) +report_errors(errors) + diff --git a/scrapersources/postliste-naroy b/scrapersources/postliste-naroy new file mode 100644 index 0000000..b8fa33b --- /dev/null +++ b/scrapersources/postliste-naroy @@ -0,0 +1,89 @@ +# -*- coding: UTF-8 -*- + +import scraperwiki +import json +from BeautifulSoup import BeautifulSoup +import datetime +import dateutil.parser +import lxml.html +import resource +import sys +import urlparse +import re 
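+# scraperwiki.swimport() loads the source of another scraper on the platform
+# and exposes it as a module: 'postliste-python-lib' is the shared journal
+# parser library (PDFJournalParser etc.) reused by all of these scrapers, and
+# 'lazycache' offers cached downloads for the ones that use it.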
+lazycache=scraperwiki.swimport('lazycache') +postlistelib=scraperwiki.swimport('postliste-python-lib') + +agency = u'Nærøy kommune' + +def report_errors(errors): + if 0 < len(errors): + print "Errors:" + for e in errors: + print e + exit(1) +def out_of_cpu(arg, spent, hard, soft): + report_errors(arg) + +def process_pdf(parser, pdfurl, errors): + errors = [] + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + try: + pdfcontent = scraperwiki.scrape(pdfurl) + parser.preprocess(pdfurl, pdfcontent) + pdfcontent = None +# except ValueError, e: +# errors.append(e) + except IndexError, e: + errors.append(e) + +def process_page_queue(parser, errors): + try: + parser.process_pages() + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + except scraperwiki.CPUTimeExceededError, e: + errors.append("Processing pages interrupted") + +def process_journal_pdfs(parser, listurl, errors): +# print "Finding PDFs on " + listurl +# u = urllib.parse.urlparse(listurl) + html = scraperwiki.scrape(listurl) + root = lxml.html.fromstring(html) + html = None + for ahref in root.cssselect("table#hovedinnhold a"): + href = ahref.attrib['href'] + url = urlparse.urljoin(listurl, href).replace(" ", "+") + if -1 != href.find("file://"): +# print "Skipping non-http URL " + url + continue + if -1 == url.find(".pdf"): + continue + # Special case, file indicating no journal entries this day + if "http://www.naroy.kommune.no/NK/Intern.nsf/FilA/CA6C83764E56DDCBC1257A02003F9025/$FILE/Postjournal+11.05.12.pdf" == url or \ + "http://www.naroy.kommune.no/NK/Intern.nsf/FilA/7FD82A18C1A1F137C12579F90029DEBD/$FILE/Postjournal+07.05.12.pdf" == url: + continue + if parser.is_already_scraped(url): + True +# print "Skipping already scraped " + url + else: +# print "Will process " + url + process_pdf(parser, url, errors) + +def test_small_pdfs(parser): + # Test with some smaller PDFs + errors = [] + process_pdf(parser, "http://www.naroy.kommune.no/NK/Intern.nsf/FilA/451908E568D2D630C1257A1E004D1B9D/$FILE/Postjournal%2005.06.12.pdf", errors) + + process_page_queue(parser, errors) + report_errors(errors) + exit(0) + +errors = [] +parser = postlistelib.PDFJournalParser(agency=agency) +#parser.debug = True + +#test_small_pdfs(parser) + +process_journal_pdfs(parser, "http://www.naroy.kommune.no/NK/Web.nsf/mainPress?OpenForm&U=POST", errors) +process_page_queue(parser, errors) +report_errors(errors) + diff --git a/scrapersources/postliste-nih b/scrapersources/postliste-nih new file mode 100644 index 0000000..4f92e18 --- /dev/null +++ b/scrapersources/postliste-nih @@ -0,0 +1,85 @@ +# -*- coding: UTF-8 -*- + +import scraperwiki +import json +from BeautifulSoup import BeautifulSoup +import datetime +import dateutil.parser +import lxml.html +import resource +import sys +import urlparse +import re + +# Make sure Scraperwiki believe this is the source from this database +scraperwiki.scrape("http://www.nih.no/om-nih/aktuelt/offentlig-postjournal/") + +lazycache=scraperwiki.swimport('lazycache') +postlistelib=scraperwiki.swimport('postliste-python-lib') + +agency = 'Norges idrettshøgskole' + +def report_errors(errors): + if 0 < len(errors): + print "Errors:" + for e in errors: + print e + exit(1) +def out_of_cpu(arg, spent, hard, soft): + report_errors(arg) + +def process_pdf(parser, pdfurl, errors): + errors = [] + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + try: + pdfcontent = scraperwiki.scrape(pdfurl) + parser.preprocess(pdfurl, pdfcontent) + pdfcontent = None +# except ValueError, e: +# errors.append(e) + except 
IndexError, e: + errors.append(e) + +def process_page_queue(parser, errors): + try: + parser.process_pages() + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + except scraperwiki.CPUTimeExceededError, e: + errors.append("Processing pages interrupted") + +def process_journal_pdfs(parser, listurl, errors): +# print "Finding PDFs on " + listurl +# u = urllib.parse.urlparse(listurl) + html = scraperwiki.scrape(listurl) + root = lxml.html.fromstring(html) + html = None + for ahref in root.cssselect("li a"): + href = ahref.attrib['href'] + url = urlparse.urljoin(listurl, href) + if -1 != href.find("file://") or -1 == url.find(".pdf"): +# print "Skipping non-http URL " + url + continue + if parser.is_already_scraped(url): + True +# print "Skipping already scraped " + url + else: +# print "Will process " + url + process_pdf(parser, url, errors) + +def test_small_pdfs(parser): + # Test with some smaller PDFs + errors = [] + process_pdf(parser, u"http://www.nih.no/Documents/1_P%C3%98/Postjournaler/offentlig%20journal%20uke%2022.pdf", errors) + process_page_queue(parser, errors) + report_errors(errors) + exit(0) + +errors = [] +parser = postlistelib.PDFJournalParser(agency=agency) + +#test_small_pdfs(parser) + +process_journal_pdfs(parser, "http://www.nih.no/om-nih/aktuelt/offentlig-postjournal/", errors) +process_page_queue(parser, errors) +report_errors(errors) + diff --git a/scrapersources/postliste-npolar b/scrapersources/postliste-npolar new file mode 100644 index 0000000..423a785 --- /dev/null +++ b/scrapersources/postliste-npolar @@ -0,0 +1,101 @@ +# -*- coding: UTF-8 -*- + +import scraperwiki +import json +from BeautifulSoup import BeautifulSoup +import datetime +import dateutil.parser +import lxml.html +import resource +import sys +import urlparse +import re + +# Load front page first, to get it recorded as the source by scraperwiki +scraperwiki.scrape("http://www.npolar.no/no/om-oss/offentlig-journal.html") + +lazycache=scraperwiki.swimport('lazycache') +postlistelib=scraperwiki.swimport('postliste-python-lib') + +agency = 'Norsk Polarinstitutt' + +def report_errors(errors): + if 0 < len(errors): + print "Errors:" + for e in errors: + print e + exit(1) +def out_of_cpu(arg, spent, hard, soft): + report_errors(arg) + +def process_pdf(parser, pdfurl, errors): + errors = [] + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + try: + pdfcontent = scraperwiki.scrape(pdfurl) + parser.preprocess(pdfurl, pdfcontent) + pdfcontent = None +# except ValueError, e: +# errors.append(e) + except IndexError, e: + errors.append(e) + +def process_page_queue(parser, errors): + try: + parser.process_pages() + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + except scraperwiki.CPUTimeExceededError, e: + errors.append("Processing pages interrupted") + +def process_journal_pdfs(parser, listurl, errors): +# print "Finding PDFs on " + listurl +# u = urllib.parse.urlparse(listurl) + html = scraperwiki.scrape(listurl) + root = lxml.html.fromstring(html) + html = None + for ahref in root.cssselect("div.onecol ul a"): + href = ahref.attrib['href'] + url = urlparse.urljoin(listurl, href) + if -1 != href.find("file://") or -1 == url.find(".pdf"): +# print "Skipping non-http URL " + url + continue + if parser.is_already_scraped(url): + True +# print "Skipping already scraped " + url + else: +# print "Will process " + url + process_pdf(parser, url, errors) + +def test_small_pdfs(parser): + # Test with some smaller PDFs + errors = [] + #process_pdf(parser, 
"http://www.npolar.no/npcms/export/sites/np/files/vedlegg/offentlig-journal/2011-10.pdf", errors) + #process_pdf(parser, "http://www.npolar.no/npcms/export/sites/np/files/vedlegg/offentlig-journal/2011-09.pdf", errors) + #process_pdf(parser, "http://www.npolar.no/npcms/export/sites/np/files/vedlegg/offentlig-journal/2011-08.pdf", errors) + #process_pdf(parser, "http://www.npolar.no/npcms/export/sites/np/files/vedlegg/offentlig-journal/2011-07.pdf", errors) + #process_pdf(parser, "http://www.npolar.no/npcms/export/sites/np/files/vedlegg/offentlig-journal/2011-06.pdf", errors) + #process_pdf(parser, "http://www.npolar.no/npcms/export/sites/np/files/vedlegg/offentlig-journal/2011-05.pdf", errors) + #process_pdf(parser, "http://www.npolar.no/npcms/export/sites/np/files/vedlegg/offentlig-journal/2011-04.pdf", errors) + #process_pdf(parser, "http://www.npolar.no/npcms/export/sites/np/files/vedlegg/offentlig-journal/2011-03.pdf", errors) + #process_pdf(parser, "http://www.npolar.no/npcms/export/sites/np/files/vedlegg/offentlig-journal/2011-02.pdf", errors) + #process_pdf(parser, "http://www.npolar.no/npcms/export/sites/np/files/vedlegg/offentlig-journal/2011-01.pdf", errors) + process_pdf(parser, "http://home.nuug.no/~pere/npolar-postjournal/OffJournalapril-mai2012.pdf", errors) + process_pdf(parser, "http://home.nuug.no/~pere/npolar-postjournal/OffJournaljanuar-mai2011.pdf", errors) + process_pdf(parser, "http://home.nuug.no/~pere/npolar-postjournal/OffJournaljanuar-mars2012.pdf", errors) + process_pdf(parser, "http://home.nuug.no/~pere/npolar-postjournal/OffJournaljuni-oktober2011.pdf", errors) + process_pdf(parser, "http://home.nuug.no/~pere/npolar-postjournal/OffJournaljuni2012.pdf", errors) + process_pdf(parser, "http://home.nuug.no/~pere/npolar-postjournal/OffJournalnovember-desember2011.pdf", errors) + + process_page_queue(parser, errors) + report_errors(errors) + exit(0) + +errors = [] +parser = postlistelib.PDFJournalParser(agency=agency) + +#test_small_pdfs(parser) + +process_journal_pdfs(parser, "http://www.npolar.no/no/om-oss/offentlig-journal.html", errors) +process_page_queue(parser, errors) +report_errors(errors) + diff --git a/scrapersources/postliste-nrk b/scrapersources/postliste-nrk new file mode 100644 index 0000000..5c7929d --- /dev/null +++ b/scrapersources/postliste-nrk @@ -0,0 +1,94 @@ +# -*- coding: UTF-8 -*- +# Based on the scraper advanced-scraping-pdf +# See also +# https://views.scraperwiki.com/run/pdf-to-html-preview-1/?url=http%3A%2F%2Fwww.stortinget.no%2FGlobal%2Fpdf%2Fpostjournal%2Fpj-2012-05-09.pdf + +import scraperwiki +import json +from BeautifulSoup import BeautifulSoup +import datetime +import dateutil.parser +import lxml.html +import resource +import sys +import urlparse +import gc +import re + +frontpage = "http://www.nrk.no/contentfile/transformer/1.8052258" +scraperwiki.scrape(frontpage) + +lazycache=scraperwiki.swimport('lazycache') +postlistelib=scraperwiki.swimport('postliste-python-lib') + +agency = 'Universitetet i Oslo' + +def report_errors(errors): + if 0 < len(errors): + print "Errors:" + for e in errors: + print e + exit(1) +def out_of_cpu(arg, spent, hard, soft): + report_errors(arg) + +def process_pdf(parser, pdfurl, errors): + errors = [] + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + try: + pdfcontent = scraperwiki.scrape(pdfurl) + parser.preprocess(pdfurl, pdfcontent) + pdfcontent = None +# except ValueError, e: +# errors.append(e) + except IndexError, e: + errors.append(e) + +def process_page_queue(parser, errors): + try: + 
parser.process_pages() + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + except scraperwiki.CPUTimeExceededError, e: + errors.append("Processing pages interrupted") + +def process_journal_pdfs(parser, listurl, errors): +# print "Finding PDFs on " + listurl +# u = urllib.parse.urlparse(listurl) + html = scraperwiki.scrape(listurl) + root = lxml.html.fromstring(html) + html = None + for ahref in root.cssselect("table a"): + href = ahref.attrib['href'] + url = urlparse.urljoin(listurl, href) + if -1 != href.find("file://"): +# print "Skipping non-http URL " + url + continue + if parser.is_already_scraped(url): + True +# print "Skipping already scraped " + url + else: +# print "Will process " + url + process_pdf(parser, url, errors) + +def test_small_pdfs(parser): + + parser.debug = True + + errors = [] + process_pdf(parser, "http://nrk.no/contentfile/file/1.8116520!offentligjournal02052012.pdf", errors) # text + #process_pdf(parser, "http://nrk.no/contentfile/file/1.8061384!offentlig%2002042012.pdf", errors) # Image + #process_pdf(parser, "http://nrk.no/contentfile/file/1.8130287!offentligjournal09052012.pdf", errors) # Image + process_page_queue(parser, errors) + report_errors(errors) + exit(0) + +errors = [] +parser = postlistelib.PDFJournalParser(agency=agency, hiddentext=True) + +test_small_pdfs(parser) + +# Based on http://www.nrk.no/innsyn/ +process_journal_pdfs(parser, frontpage, errors) +process_page_queue(parser, errors) +report_errors(errors) + diff --git a/scrapersources/postliste-ntnu b/scrapersources/postliste-ntnu new file mode 100644 index 0000000..1a885c4 --- /dev/null +++ b/scrapersources/postliste-ntnu @@ -0,0 +1,87 @@ +# -*- coding: UTF-8 -*- + +import scraperwiki +import json +from BeautifulSoup import BeautifulSoup +import datetime +import dateutil.parser +import lxml.html +import urlparse +import urllib2 + +# Make sure Scraperwiki believe this is the source from this database +scraperwiki.scrape("http://www.ntnu.no/aktuelt/offentlig-journal") + +lazycache=scraperwiki.swimport('lazycache') +postlistelib=scraperwiki.swimport('postliste-python-lib') + +agency = 'Norges teknisk-naturvitenskapelige universitet' + +def report_errors(errors): + if 0 < len(errors): + print "Errors:" + for e in errors: + print e + raise ValueError("Something went wrong") + +def out_of_cpu(arg, spent, hard, soft): + report_errors(arg) + +def process_pdf(parser, pdfurl, errors): + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + try: + pdfcontent = scraperwiki.scrape(pdfurl) + parser.preprocess(pdfurl, pdfcontent) + pdfcontent = None + except ValueError, e: # Some PDFs can not be parsed! 
This should be investigated + print "PDF format problem" + errors.append(e) + except IndexError, e: + errors.append(e) + except urllib2.HTTPError, e: + errors.append(e) + +def process_page_queue(parser, errors): + try: + parser.process_pages() + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + except scraperwiki.CPUTimeExceededError, e: + errors.append("Processing pages interrupted") + +def process_journal_pdfs(parser, listurl, errors): +# print "Finding PDFs on " + listurl +# u = urllib.parse.urlparse(listurl) + html = scraperwiki.scrape(listurl) + root = lxml.html.fromstring(html) + html = None + for ahref in root.cssselect("ul a"): + href = ahref.attrib['href'] + url = urlparse.urljoin(listurl, href) + if -1 != href.find("file://") or -1 == url.find(".pdf"): +# print "Skipping non-http URL " + url + continue + if parser.is_already_scraped(url): + True +# print "Skipping already scraped " + url + else: +# print "Will process " + url + process_pdf(parser, url, errors) + +def test_small_pdfs(parser): + # Test with some smaller PDFs + errors = [] + process_pdf(parser, "http://www.ntnu.no/offjour/2012-06.25.pdf", errors) + process_pdf(parser, "http://www.ntnu.no/offjour/2012-06.13.pdf ", errors) # Strange format? + process_page_queue(parser, errors) + report_errors(errors) + exit(0) + +errors = [] +parser = postlistelib.PDFJournalParser(agency=agency) + +#test_small_pdfs(parser) + +process_journal_pdfs(parser, "http://www.ntnu.no/aktuelt/offentlig-journal", errors) +process_page_queue(parser, errors) +report_errors(errors) + diff --git a/scrapersources/postliste-oep b/scrapersources/postliste-oep new file mode 100644 index 0000000..c7fdc82 --- /dev/null +++ b/scrapersources/postliste-oep @@ -0,0 +1,336 @@ +# -*- coding: UTF-8 -*- + +import scraperwiki +import lxml.html +import datetime +import time +import resource +import httplib +import urllib2 + +# Try several times as the database get bigger +writetries = 5 + +# http://www.oep.no/search/resultSingle.html?journalPostId=1000000 +# http://www.oep.no/search/resultSingle.html?journalPostId=3889259 + +# <table class="defaultTable"> +# <tr> +# <th class="noLeftBorder" style="width: 20%;">Agency:</th> +# <td class="noRightBorder" style="width: 80%;">Kulturdepartementet</td> +# </tr> +# <tr> +# <th class="noLeftBorder">Case:</th> +# <td class="noRightBorder">DNT Oslo og Omegn - rehabilitering og utvidelse av turisthytta Snøheim pÃ¥ Dovre - spillemidler til anlegg for friluftsliv i fjellet 2011</td> +# </tr> +# <tr> +# <th class="noLeftBorder">Document title:</th> +# <td class="noRightBorder">DNT Oslo og Omegn - turisthytta Snøheim pÃ¥ Dovre - eventuelt navnebytte</td> +# </tr> +# <tr> +# <th class="noLeftBorder">Case number:</th> +# <td class="noRightBorder">2010/04027</td> +# </tr> +# <tr> +# <th class="noLeftBorder">Document number:</th> +# <td class="noRightBorder">4</td> +# </tr> +# <tr> +# <th class="noLeftBorder">Document type:</th> +# <td class="noRightBorder"> +# +# +# +# Outgoing +# +# +# </td> +# </tr> +# +# +# <tr> +# <th class="noLeftBorder">Recipient:</th> +# <td class="noRightBorder">Den Norske Turistforening</td> +# </tr> +# +# <tr> +# <th class="noLeftBorder">Document date:</th> +# <td class="noRightBorder">2010-12-13</td> +# </tr> +# <tr> +# <th class="noLeftBorder">Record entry date:</th> +# <td class="noRightBorder"> +# +# +# +# 2010-12-14 +# +# +# </td> +# </tr> +# <tr> +# <th class="noLeftBorder">Published in OEP</th> +# <td class="noRightBorder">2011-01-03</td> +# </tr> +# <tr> +# <th class="noLeftBorder" 
title="Hvis dokumentet er unntatt offentlighet kan unntaket gjelde hele eller deler av dokumentet."><span class="dottedBorderBottom">Grounds for exemption, document:</span></th> +# <td class="noRightBorder"> +# +# </td> +# </tr> +# <tr> +# <th class="noLeftBorder">Archive code:</th> +# <td class="noRightBorder"> +# +# </td> +# </tr> +# <tr> +# <th class="noLeftBorder">Contact point:</th> +# <td class="noRightBorder"> +# <br /> +# Tel.: 22 24 90 90<br /> +# Email: <a href="mailto:postmottak@kud.dep.no" title="Send email">postmottak@kud.dep.no</a> +# </td> +# </tr> +# </table> + +def cpu_spent(): + usage = resource.getrusage(resource.RUSAGE_SELF) + return getattr(usage, 'ru_utime') + getattr(usage, 'ru_stime') + +def url_from_id(id): + return "http://www.oep.no/search/resultSingle.html?journalPostId=" + str(id) + +def save(data): + for run in range(1,writetries): + try: + scraperwiki.sqlite.save(unique_keys=['journalPostId'], data=data) + return + except scraperwiki.sqlite.SqliteError, e: + print "Sqlite write error, trying again" + time.sleep(22) + raise scraperwiki.sqlite.SqliteError("Unable to write to database, tried " + str(writetries) + " times") + +def save_var(var, data): + for run in range(1,writetries): + try: + scraperwiki.sqlite.save_var(var, data) + return + except scraperwiki.sqlite.SqliteError, e: + print "Sqlite write error, trying again" + time.sleep(22) + raise scraperwiki.sqlite.SqliteError("Unable to write variable " + var + " to database, tried " + str(writetries) + " times") + +fieldmap = { + 'Agency' : 'agency', + 'Record entry date' : 'recorddate', + 'Case' : 'casedesc', + 'Case number' : 'caseid', + 'Document number' : 'casedocseq', + 'Document date' : 'docdate', + 'Document title' : 'docdesc', + 'Document type' : 'doctype', + 'Grounds for exemption document' : 'exemption', + 'Recipient' : 'recipient', + 'Sender' : 'sender', + 'Published in OEP' : 'recordpublishdate', +# 'Archive code', +# 'Contact point', +# 'journalPostId', +# 'scrapestamputc', +} + +doctypemap = { + 'Incoming' : 'I', + 'Outgoing' : 'U', + 'internal' : 'X', +} + +def fetch_oep_entry(id, datastorage): + oepurl = url_from_id(id) + html = scraperwiki.scrape(oepurl) + root = lxml.html.fromstring(html.decode('utf-8')) + data = { 'journalPostId' : id } + for tr in root.cssselect("table.defaultTable tr"): + vtype = tr.cssselect("th")[0].text_content().strip().replace(":", "").replace(",", "") + value = tr.cssselect("td")[0].text_content().strip() + #print '"' + vtype + '"', '"'+value+'"' + if (vtype == 'Record entry date' and value == 'Not stated.') or \ + (vtype == 'Document type' and value == '-') or \ + (vtype == 'Case number' and value == ''): + return -1 + if vtype in fieldmap: + vtype = fieldmap[vtype] + if 'doctype' == vtype: + value = doctypemap[value] + if 'caseid' == vtype: + caseyear, caseseqnr = value.split("/") + data['caseyear'] = caseyear + data['caseseqnr'] = caseseqnr + data[vtype] = value +# print str(id) + ": " + str(data) + data['scrapestamputc'] = datetime.datetime.now() +# print data['scrapestamputc'] +# exit () + + datastorage.append(data) +# scraperwiki.sqlite.save(unique_keys=['journalPostId'], data=data) + return 0 + +def fetch_range(first, last, step): + myskiplimit = skiplimit + datastorage = [] + skipped = 0 + fetched = 0 + min_id = first + for id in range(first, last, step): + try: + tries = 3 + while 0 < tries: + tries = tries - 1 + try: + if -1 == fetch_oep_entry(id, datastorage): + skipped = skipped + 1 + if skipped == myskiplimit and myskiplimit == skiplimit: + tmp = [] + 
for limit in [250, 500, 800, 1000, 1200, 1500, 1700, 2000, 3000, 5000]: + testid = id + limit * step + if -1 != fetch_oep_entry(testid, tmp): + print "Looking "+str(limit)+" ahead, found " + url_from_id(testid) + myskiplimit = skiplimit + limit + 1 + break + break + else: + fetched = fetched + 1 + skipped = 0 + myskiplimit = skiplimit + break + except urllib2.HTTPError, e: # Because HTTPError lack reason due to bug + print "URLError triggered for url " + url_from_id(id) + ", trying again: " + str(e.msg) + except urllib2.URLError, e: + print "URLError triggered for url " + url_from_id(id) + ", trying again: " + str(e.reason) + except httplib.BadStatusLine, e: + # e.msg do not exist. trying .reason 2012-06-25 + print "BadStatusLine triggered for url " + url_from_id(id) + ", trying again: " + str(e.reason) + + if skipped >= myskiplimit: + print "Reached end of list, exiting at " + str(id) + break + if 50 <= len(datastorage): + save(data=datastorage) + datastorage = [] + + # Only do this for every 50 ID tested, to avoid spending too much CPU seconds updating the sqlite file + if 0 == (id % 50): + if id < min_id: + min_id = id +# print "Updating min_id to " + str(min_id) + save_var('min_tested_id', min_id) + if cpu_spent() > 79: + print "Running short on CPU time, exiting at " + str(id) + break + time.sleep(0.2) + except scraperwiki.CPUTimeExceededError: + if 0 < len(datastorage): + save(data=datastorage) + datastorage = [] + print "CPU exception caught" + raise + except: + print "Error, unexpected exception" + raise + if 0 < len(datastorage): + save(data=datastorage) + datastorage = [] + return fetched + +def rename_sql_columns(): + print "Dropping temp table" + scraperwiki.sqlite.execute("DROP TABLE IF EXISTS swdatanew") + print "Creating table" + scraperwiki.sqlite.execute("CREATE TABLE IF NOT EXISTS swdatanew (agency text, recorddate text, casedesc text, caseid text, casedocseq integer, docdate text, docdesc text, doctype text, exemption text, recipient text, sender text, recordpublishdate text, `Archive code` text, `Contact point` text, `journalPostId` integer, scrapestamputc text)") + print "Copying table" + scraperwiki.sqlite.execute("INSERT INTO swdatanew(agency, recorddate, casedesc, caseid, casedocseq, docdate, docdesc, doctype, exemption, recipient, sender, recordpublishdate, `Archive code`, `Contact point`, `journalPostId`, scrapestamputc) SELECT `Agency`, `Record entry date`, `Case`, `Case number`, `Document number`, `Document date`, `Document title`, `Document type`, `Grounds for exemption document`, `Recipient`, `Sender`, `Published in OEP`, `Archive code`, `Contact point`, `journalPostId`, `scrapestamputc` FROM swdata") + + scraperwiki.sqlite.execute("ALTER TABLE swdata RENAME TO swdataold") + scraperwiki.sqlite.execute("ALTER TABLE swdatanew RENAME TO swdata") + scraperwiki.sqlite.commit() + exit(0) + +def create_indexes(): + for field in ['doctype', 'agency', 'recorddate', 'caseid']: + print "Creating %s index" % field + scraperwiki.sqlite.execute("CREATE INDEX IF NOT EXISTS swdata_%s_index ON swdata (%s)" % (field, field)) + scraperwiki.sqlite.commit() + +def update_doctypes(): + print "Updating doctype" + agencies = [] + for agencyref in scraperwiki.sqlite.select("distinct agency from swdata"): + agencies.append(agencyref['agency']) + + # Updating individual agencies to try to avoid SQL timeout + for agency in agencies: + print "Updating doctype for " + agency + scraperwiki.sqlite.execute("UPDATE swdata set doctype = 'I' where agency = ? 
and doctype = 'Incoming'", (agency)) + scraperwiki.sqlite.execute("UPDATE swdata set doctype = 'U' where agency = ? and doctype = 'Outgoing'", (agency)) + scraperwiki.sqlite.execute("UPDATE swdata set doctype = 'X' where agency = ? and doctype = 'internal'", (agency)) + scraperwiki.sqlite.commit() + exit(0) + +def update_caseyear(): + print "Updating caseyear and caseseqnr" + agencies = [] + for agencyref in scraperwiki.sqlite.select("distinct agency from swdata WHERE caseyear is NULL"): + agencies.append(agencyref['agency']) + + # Updating individual agencies to try to avoid SQL timeout + for agency in agencies: + print "Updating caseyear for " + agency + res = scraperwiki.sqlite.execute("select journalPostId, substr(caseid, 1, 4), substr(caseid, 6) from swdata where agency = ? and caseyear is NULL limit 2", (agency)) + print res + scraperwiki.sqlite.execute("UPDATE swdata set caseyear = substr(caseid, 1, 4), caseseqnr = substr(caseid, 6) where agency = ? AND caseyear is NULL", (agency)) + scraperwiki.sqlite.commit() + exit(0) + +def remove_original(): + scraperwiki.sqlite.execute("DROP TABLE IF EXISTS swdataold") + scraperwiki.sqlite.commit() + exit(0) + +#update_caseyear() + +#create_indexes() + +#rename_sql_columns() +#remove_original() + +# This one give me SQL timeout +#update_doctypes() + +print "Starting to fetch journal entries " + str(datetime.datetime.now()) +count = 10000 +skiplimit = 500 +# Random value fairly close to the most recent ID when this project started 2012-05-03 +max = min = startid = 3889259 +try: + max = scraperwiki.sqlite.select("max(journalPostId) as max from swdata")[0]["max"] + if 0 < scraperwiki.sqlite.get_var('min_tested_id'): + saved_min = scraperwiki.sqlite.get_var('min_tested_id') + sql_min = scraperwiki.sqlite.select("min(journalPostId) as min from swdata")[0]["min"] + print "Saved min: " + str(saved_min) + ", sql min: " + str(sql_min) + if sql_min < saved_min: + min = sql_min + else: + min = saved_min + + print "Scraping " + str(count) + " IDs below " + str(min) + " and above " + str(max) +except scraperwiki.sqlite.SqliteError: + pass + +fetched = fetch_range(max + 1, max + count, 1) +print "Fetched " + str(fetched) + " new journal entries, cpu spent: " + str(cpu_spent()) +if min >= 0: + fetched = fetch_range(min, min - count, -1) + print "Fetched " + str(fetched) + " old journal entries, cpu spent: " + str(cpu_spent()) + diff --git a/scrapersources/postliste-oep-deliverydates b/scrapersources/postliste-oep-deliverydates new file mode 100644 index 0000000..f04ce49 --- /dev/null +++ b/scrapersources/postliste-oep-deliverydates @@ -0,0 +1,37 @@ +import scraperwiki +import lxml.html +import datetime +import resource +import dateutil.parser +import resource + +def cpu_spent(): + usage = resource.getrusage(resource.RUSAGE_SELF) + return getattr(usage, 'ru_utime') + getattr(usage, 'ru_stime') + +def fetch_oep_deliverydates(url, datastorage): + html = scraperwiki.scrape(url) + root = lxml.html.fromstring(html.decode('utf-8')) + data = { 'scrapedurl' : id } + for tr in root.cssselect("table.defaulttable tr"): + if 3 == len(tr.cssselect("td")): + data = { 'scrapedurl' : url } + #print tr +# vtype = tr.cssselect("th")[0].text_content().strip().replace(":", "").replace(",", "") + agency = tr.cssselect("td")[0].text_content().strip() + deliverydate = tr.cssselect("td")[1].text_content().strip() + if deliverydate == "Levert": + continue + data['agency'] = agency + #print "D: '" + deliverydate + "'" + data['deliverydate'] = dateutil.parser.parse(deliverydate, 
dayfirst=True) + data['scrapestamputc'] = datetime.datetime.now() + datastorage.append(data) + return 0 + +datastorage = [] +fetch_oep_deliverydates("http://www.oep.no/pub/faces/statistikk.jsp?reposId=3", datastorage) +print datastorage +scraperwiki.sqlite.save(unique_keys=['agency', 'deliverydate'], data=datastorage) + +print "Starting to fetch journal delivery dates " + str(datetime.datetime.now()) diff --git a/scrapersources/postliste-oslo-bydel-ullern b/scrapersources/postliste-oslo-bydel-ullern new file mode 100644 index 0000000..54a5031 --- /dev/null +++ b/scrapersources/postliste-oslo-bydel-ullern @@ -0,0 +1,85 @@ +# -*- coding: UTF-8 -*- +# Based on the scraper advanced-scraping-pdf +# See also +# https://views.scraperwiki.com/run/pdf-to-html-preview-1/?url=http%3A%2F%2Fwww.stortinget.no%2FGlobal%2Fpdf%2Fpostjournal%2Fpj-2012-05-09.pdf + +import scraperwiki +import json +from BeautifulSoup import BeautifulSoup +import datetime +import dateutil.parser +import lxml.html +import urlparse +import re +#lazycache=scraperwiki.swimport('lazycache') +postlistelib=scraperwiki.swimport('postliste-python-lib') + +agency = 'Oslo kommune, Ullern bydel' + +def report_errors(errors): + if 0 < len(errors): + print "Errors:" + for e in errors: + print e + exit(1) +def out_of_cpu(arg, spent, hard, soft): + report_errors(arg) + +def process_pdf(parser, pdfurl, errors): + errors = [] + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + try: + pdfcontent = scraperwiki.scrape(pdfurl) + parser.preprocess(pdfurl, pdfcontent) + pdfcontent = None +# except ValueError, e: +# errors.append(e) + except IndexError, e: + errors.append(e) + +def process_page_queue(parser, errors): + try: + parser.process_pages() + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + except scraperwiki.CPUTimeExceededError, e: + errors.append("Processing pages interrupted") + +def process_journal_pdfs(parser, listurl, errors): +# print "Finding PDFs on " + listurl +# u = urllib.parse.urlparse(listurl) + html = scraperwiki.scrape(listurl) + root = lxml.html.fromstring(html) + html = None + for ahref in root.cssselect("table a"): + href = ahref.attrib['href'] + url = urlparse.urljoin(listurl, href) + if -1 != href.find("file://") or -1 != href.find("mailto:"): +# print "Skipping non-http URL " + url + continue + if parser.is_already_scraped(url): + True +# print "Skipping already scraped " + url + else: +# print "Will process " + url + process_pdf(parser, url, errors) + +def test_small_pdfs(parser): + # Test with some smaller PDFs + errors = [] + parser.debug = True + process_pdf(parser, "http://www.bydel-ullern.oslo.kommune.no/getfile.php/bydel%20ullern%20(BUN)/Internett%20(BUN)/Dokumenter/dokument/postjournal/120502.pdf", errors) + process_page_queue(parser, errors) + report_errors(errors) + exit(0) + +print "Starting scraping of " + agency +parser = postlistelib.PDFJournalParser(agency=agency) +#parser.debug = True + +#test_small_pdfs(parser) + +errors = [] +process_journal_pdfs(parser, "http://www.bydel-ullern.oslo.kommune.no/postjournal/", errors) +process_page_queue(parser, errors) +report_errors(errors) + diff --git a/scrapersources/postliste-oslo-gravferdsetaten b/scrapersources/postliste-oslo-gravferdsetaten new file mode 100644 index 0000000..7becd10 --- /dev/null +++ b/scrapersources/postliste-oslo-gravferdsetaten @@ -0,0 +1,90 @@ +# -*- coding: UTF-8 -*- + +import scraperwiki +import json +from BeautifulSoup import BeautifulSoup +import datetime +import dateutil.parser +import lxml.html +import resource 
+import sys +import urlparse +import re + +scraperwiki.scrape("http://www.gravferdsetaten.oslo.kommune.no/offentlig_journal/article43281-14384.html") + +lazycache=scraperwiki.swimport('lazycache') +postlistelib=scraperwiki.swimport('postliste-python-lib') + +agency = 'Oslo kommune, gravferdsetaten' + +def report_errors(errors): + if 0 < len(errors): + print "Errors:" + for e in errors: + print e + exit(1) +def out_of_cpu(arg, spent, hard, soft): + report_errors(arg) + +def process_pdf(parser, pdfurl, errors): + errors = [] + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + try: + pdfcontent = scraperwiki.scrape(pdfurl) + parser.preprocess(pdfurl, pdfcontent) + pdfcontent = None +# except ValueError, e: +# errors.append(e) + except IndexError, e: + errors.append(e) + +def process_page_queue(parser, errors): + try: + if False: + scraperwiki.sqlite.execute("delete from swdata where scrapedurl in (select scrapedurl from unparsedpages)") + scraperwiki.sqlite.execute("delete from unparsedpages") + scraperwiki.sqlite.commit() + + parser.process_pages() + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + except scraperwiki.CPUTimeExceededError, e: + errors.append("Processing pages interrupted") + +def process_journal_pdfs(parser, listurl, errors): +# print "Finding PDFs on " + listurl +# u = urllib.parse.urlparse(listurl) + html = scraperwiki.scrape(listurl) + root = lxml.html.fromstring(html) + html = None + for ahref in root.cssselect("table a"): + href = ahref.attrib['href'] + url = urlparse.urljoin(listurl, href) + if -1 != href.find("file://") or -1 != href.find("mailto:"): +# print "Skipping non-http URL " + url + continue + if parser.is_already_scraped(url): + True +# print "Skipping already scraped " + url + else: +# print "Will process " + url + process_pdf(parser, url, errors) + +def test_small_pdfs(parser): + # Test with some smaller PDFs + errors = [] + process_pdf(parser, "http://www.gravferdsetaten.oslo.kommune.no/getfile.php/gravferdsetaten%20(GFE)/Internett%20(GFE)/Dokumenter/dokument/Arkivet/Postjournal/Juni/13.06.pdf", errors) + process_pdf(parser, "http://www.gravferdsetaten.oslo.kommune.no/getfile.php/gravferdsetaten%20(GFE)/Internett%20(GFE)/Dokumenter/dokument/Arkivet/Postjournal/Juni/12.06.pdf", errors) + process_page_queue(parser, errors) + report_errors(errors) + exit(0) + +errors = [] +parser = postlistelib.PDFJournalParser(agency=agency) + +#test_small_pdfs(parser) + +process_journal_pdfs(parser, "http://www.gravferdsetaten.oslo.kommune.no/offentlig_journal/article43281-14384.html", errors) +process_page_queue(parser, errors) +report_errors(errors) + diff --git a/scrapersources/postliste-oslo-havn b/scrapersources/postliste-oslo-havn new file mode 100644 index 0000000..d453ef7 --- /dev/null +++ b/scrapersources/postliste-oslo-havn @@ -0,0 +1,86 @@ +# -*- coding: UTF-8 -*- +# Based on the scraper advanced-scraping-pdf +# See also +# https://views.scraperwiki.com/run/pdf-to-html-preview-1/?url=http%3A%2F%2Fwww.stortinget.no%2FGlobal%2Fpdf%2Fpostjournal%2Fpj-2012-05-09.pdf + +import scraperwiki +import json +from BeautifulSoup import BeautifulSoup +import datetime +import dateutil.parser +import lxml.html +import sys +import urlparse +import re +lazycache=scraperwiki.swimport('lazycache') +postlistelib=scraperwiki.swimport('postliste-python-lib') + +agency = 'Oslo kommune, Oslo Havn KF' + +def report_errors(errors): + if 0 < len(errors): + print "Errors:" + for e in errors: + print e + exit(1) +def out_of_cpu(arg, spent, hard, soft): + report_errors(arg) + 
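+# Shared error-handling pattern in these scrapers: process_pdf() below asks
+# exit_if_no_cpu_left() to check the remaining CPU budget before fetching each
+# PDF, and out_of_cpu() above is the callback that prints the collected errors
+# and exits, so the remaining pages can be picked up on the next run.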
+def process_pdf(parser, pdfurl, errors): + errors = [] + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + try: + parser.fetch_and_preprocess(pdfurl) +# except ValueError, e: +# errors.append(e) + except IndexError, e: + errors.append(e) + +def process_page_queue(parser, errors): + try: + parser.process_pages() + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + except scraperwiki.CPUTimeExceededError, e: + errors.append("Processing pages interrupted, ran out of cpu") + +def process_journal_pdfs(parser, listurl, errors): +# print "Finding PDFs on " + listurl +# u = urllib.parse.urlparse(listurl) + html = scraperwiki.scrape(listurl) + root = lxml.html.fromstring(html) + html = None + for ahref in root.cssselect("table a"): + href = ahref.attrib['href'] + url = urlparse.urljoin(listurl, href) + if -1 != href.find("file://"): +# print "Skipping non-http URL " + url + continue + if parser.is_already_scraped(url): + True +# print "Skipping already scraped " + url + else: +# print "Will process " + url + process_pdf(parser, url, errors) + +def test_pdfs(parser): + parser.debug = True + # Test with some smaller PDFs + errors = [] + process_pdf(parser, "http://www.havn.oslo.kommune.no/getfile.php/oslo%20havn%20kf%20(HAV)/Internett%20(HAV)/Dokumenter/Postjournal/Mai/24.05.2012.pdf", errors) + + # This file have a problematic format, the text fragments have a different order than most + # journal PDFs. + process_pdf(parser, "http://www.havn.oslo.kommune.no/getfile.php/oslo%20havn%20kf%20%28HAV%29/Internett%20%28HAV%29/Dokumenter/Postjournal/Mars/1%20MTMzMDY4NjY3ODI5OTk5Mz.pdf", errors) + process_page_queue(parser, errors) + report_errors(errors) + exit(0) + +parser = postlistelib.PDFJournalParser(agency=agency) + +#test_pdfs(parser) + +errors = [] +process_journal_pdfs(parser, "http://www.havn.oslo.kommune.no/postjournal/", errors) +process_page_queue(parser, errors) +report_errors(errors) + diff --git a/scrapersources/postliste-oslo-radhusets-forvaltningstjeneste b/scrapersources/postliste-oslo-radhusets-forvaltningstjeneste new file mode 100644 index 0000000..4f9b5c1 --- /dev/null +++ b/scrapersources/postliste-oslo-radhusets-forvaltningstjeneste @@ -0,0 +1,231 @@ +# -*- coding: UTF-8 -*- + +import scraperwiki +import json +from BeautifulSoup import BeautifulSoup +import datetime +import dateutil.parser +import lxml.html +import urlparse +import re + +# Make sure Scraperwiki believe this is the source from this database +scraperwiki.scrape("http://www.radhusets-forvaltningstjeneste.oslo.kommune.no/postjournal/") + +lazycache=scraperwiki.swimport('lazycache') +postlistelib=scraperwiki.swimport('postliste-python-lib') + +agency = 'Oslo kommune, Rådhusets forvaltningstjeneste' + +def report_errors(errors): + if 0 < len(errors): + print "Errors:" + for e in errors: + print e + exit(1) + +def out_of_cpu(arg, spent, hard, soft): + report_errors(arg) + +# Input YY/X-Z, return YYYY, X, Z +def split_docid(docid): + caseyear, rest = docid.split('/') + caseseqnr, casedocseq = rest.split('-') + caseyear = int(caseyear) + caseseqnr = int(caseseqnr) + casedocsec = int(casedocseq) + if caseyear < 50: + caseyear = caseyear + 2000 + if 50 <= caseyear and caseyear < 100: + caseyear = caseyear + 1900 + return caseyear, caseseqnr, casedocseq + +# Input DDMMYYYY, output YYYY-MM-DD +def parse_date(date): + if 'Udatert' == date: + return None + year = date[4:8] + month = date[2:4] + day = date[0:2] + isodate = year + "-" + month + "-" + day + #print date, isodate + return dateutil.parser.parse(isodate, 
dayfirst=True).date() + +def parse_entry(pdfurl, lines): + print lines + print "Entry lines " + str(len(lines)) + entry = { + 'agency' : agency, + 'scrapedurl' : pdfurl, + } + cur = 0 + while cur < len(lines): + line = lines[cur].text + #print line + if -1 != line.find('Dok.dato:'): + entry['docid'] = lines[cur-2].text + entry['doctype'] = lines[cur-1].text + entry['docdate'] = parse_date(line.replace("Dok.dato:", "")) + caseyear, caseseqnr, casedocseq = split_docid(entry['docid']) + entry['caseyear'] = caseyear + entry['caseseqnr'] = caseseqnr + entry['casedocseq'] = casedocseq + entry['caseid'] = str(caseyear) + '/' + str(caseseqnr) + if -1 != line.find('Jour.dato:'): + entry['recorddate'] = parse_date(lines[cur+1].text) + cur = cur + 1 + if -1 != line.find('Arkivdel:'): + entry['arkivdel'] = lines[cur+1].text + cur = cur + 1 + if -1 != line.find('Tilg. kode:'): + entry['tilgangskode'] = line.replace("Tilg. kode:", "") + if -1 != line.find('Sak:'): + entry['casedesc'] = lines[cur+1].text + cur = cur + 1 + if -1 != line.find('Dok:'): + entry['docdesc'] = lines[cur+1].text + cur = cur + 1 + if -1 != line.find('Par.:'): + entry['exemption'] = line.replace("Par.:", "") + cur = cur + 1 + if -1 != line.find('Avsender:'): + entry['sender'] = lines[cur+1].text + cur = cur + 1 + if -1 != line.find('Mottaker:'): + entry['recipient'] = lines[cur+1].text + cur = cur + 1 + if -1 != line.find('Saksansv:'): + entry['saksansvarlig'] = line.replace("Saksansv:", "").strip() + if -1 != line.find('Saksbeh:'): + entry['saksbehandler'] = lines[cur+1].text + cur = cur + 1 + cur = cur + 1 + print entry + if 'docid' in entry: + scraperwiki.sqlite.save(unique_keys=['docid'], data=entry) + #return + +def parse_pdf(pdfurl, pdfcontent): + pdfxml = scraperwiki.pdftoxml(pdfcontent) + pages=re.findall('(<page .+?</page>)',pdfxml,flags=re.DOTALL) + for page in pages: + s = BeautifulSoup(page) + lines = s.findAll('text') + last = 0 + cur = 0 + while cur < len(lines): + #print cur, lines[cur] + if -1 != lines[cur].text.find('Dok.dato:'): + print last, cur-2 + parse_entry(pdfurl, lines[last:cur-2]) + last = cur - 2 + cur = cur + 1 + return + if False: + cur = 0 + entry = { 'agency' : agency, 'scrapedurl' : pdfurl } + while cur < len(lines): + line = lines[cur].text + #print line + if -1 != line.find('Dok.dato:'): + entry['docid'] = lines[cur-2].text + entry['doctype'] = lines[cur-1].text + entry['docdate'] = parse_date(line.replace("Dok.dato:", "")) + caseyear, caseseqnr, casedocseq = split_docid(entry['docid']) + entry['caseyear'] = caseyear + entry['caseseqnr'] = caseseqnr + entry['casedocseq'] = casedocseq + entry['caseid'] = str(caseyear) + '/' + str(caseseqnr) + if -1 != line.find('Jour.dato:'): + entry['recorddate'] = parse_date(lines[cur+1].text) + cur = cur + 1 + if -1 != line.find('Arkivdel:'): + entry['arkivdel'] = lines[cur+1].text + cur = cur + 1 + if -1 != line.find('Tilg. kode:'): + entry['tilgangskode'] = line.replace("Tilg. 
kode:", "") + if -1 != line.find('Sak:'): + entry['casedesc'] = lines[cur+1].text + cur = cur + 1 + if -1 != line.find('Dok:'): + entry['docdesc'] = lines[cur+1].text + cur = cur + 1 + if -1 != line.find('Par.:'): + entry['exemption'] = line.replace("Par.:", "") + cur = cur + 1 + if -1 != line.find('Avsender:'): + entry['sender'] = lines[cur+1].text + cur = cur + 1 + if -1 != line.find('Mottaker:'): + entry['recipient'] = lines[cur+1].text + cur = cur + 1 + if -1 != line.find('Saksansv:'): + entry['saksansvarlig'] = line.replace("Saksansv:", "").strip() + if -1 != line.find('Saksbeh:'): + entry['saksbehandler'] = lines[cur+1].text + cur = cur + 1 + print entry + scraperwiki.sqlite.save(unique_keys=['docid'], data=entry) + entry = { 'agency' : agency, 'scrapedurl' : pdfurl } + cur = cur + 1 + #return + +def process_pdf(parser, pdfurl, errors): + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + try: + #if True: + pdfcontent = scraperwiki.scrape(pdfurl) + parse_pdf(pdfurl, pdfcontent) + #parser.preprocess(pdfurl, pdfcontent) + pdfcontent = None +# except ValueError, e: +# errors.append(e) + #except IndexError, e: + # errors.append(e) + except Exception, e: + print e + +def process_page_queue(parser, errors): + try: + parser.process_pages() + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + except scraperwiki.CPUTimeExceededError, e: + errors.append("Processing pages interrupted") + +def process_journal_pdfs(parser, listurl, errors): +# print "Finding PDFs on " + listurl +# u = urllib.parse.urlparse(listurl) + html = scraperwiki.scrape(listurl) + root = lxml.html.fromstring(html) + html = None + for ahref in root.cssselect("table a"): + href = ahref.attrib['href'] + url = urlparse.urljoin(listurl, href) + if -1 != href.find("file://") or -1 == url.find(".pdf"): +# print "Skipping non-http URL " + url + continue + if parser.is_already_scraped(url): + True +# print "Skipping already scraped " + url + else: +# print "Will process " + url + process_pdf(parser, url, errors) + +def test_small_pdfs(parser): + # Test with some smaller PDFs + errors = [] + process_pdf(parser, "http://www.radhusets-forvaltningstjeneste.oslo.kommune.no/getfile.php/rådhusets%20forvaltningstjeneste%20(RFT)/Intranett%20(RFT)/Dokumenter/Postjournal/11%20November/29112011.pdf", errors) + process_pdf(parser, "http://www.radhusets-forvaltningstjeneste.oslo.kommune.no/getfile.php/rådhusets%20forvaltningstjeneste%20(RFT)/Intranett%20(RFT)/Dokumenter/Postjournal/12%20Desember/02122011.pdf", errors) + process_page_queue(parser, errors) + report_errors(errors) + exit(0) + +errors = [] +parser = postlistelib.PDFJournalParser(agency=agency) + +#test_small_pdfs(parser) + +process_journal_pdfs(parser, "http://www.radhusets-forvaltningstjeneste.oslo.kommune.no/postjournal/", errors) +#process_page_queue(parser, errors) +report_errors(errors) + diff --git a/scrapersources/postliste-python-lib b/scrapersources/postliste-python-lib new file mode 100644 index 0000000..042d1fd --- /dev/null +++ b/scrapersources/postliste-python-lib @@ -0,0 +1,577 @@ +# -*- coding: utf-8 -*- +# +# Python library for parsing public post journals (postlister) in Norway. 
+# + +# Based on the scraper advanced-scraping-pdf +# +# See also +# https://views.scraperwiki.com/run/pdf-to-html-preview-1/ + +# Possible sources using format 1 pdf: +# www.bydel-ullern.oslo.kommune.no +# www.gravferdsetaten.oslo.kommune.no +# www.halden.kommune.no (done) +# www.havn.oslo.kommune.no (done) +# www.hvaler.kommune.no (done) +# www.kafjord.kommune.no +# www.lier.kommune.no +# www.lindesnes.kommune.no +# www.naroy.kommune.no +# www.saltdal.kommune.no +# www.sogne.kommune.no +# www.vikna.kommune.no +# +# Google search to find more: "Offentlig journal" Seleksjon Sakstittel Dokumenttype Status filetype:pdf + + +import scraperwiki +import string +import re +from BeautifulSoup import BeautifulSoup +import datetime +import dateutil.parser + +def cpu_spent(): + import resource + usage = resource.getrusage(resource.RUSAGE_SELF) + return getattr(usage, 'ru_utime') + getattr(usage, 'ru_stime') + +def exit_if_no_cpu_left(retval, callback=None, arg = None): + import resource + soft, hard = resource.getrlimit(resource.RLIMIT_CPU) + spent = cpu_spent() + if soft < spent: + if callback is not None: + callback(arg, spent, hard, soft) + print "Running out of CPU, exiting." + exit(retval) + +def fetch_url_harder(url, scraper = None): + import urllib2 + html = None + for n in [1, 2, 3]: + try: + if None == scraper: + scraper = scraperwiki.scrape + html = scraper(url) + break + except urllib2.URLError, e: + print "URLError fetching " + url + ", trying again" + return html + +class JournalParser: + agency = None + debug = False + + validdoctypes = ['I', 'U', 'X', 'N'] + senderdoctypes = ['I', 'X', 'N'] + recipientdoctypes = ['U'] + mustfields = { + 'agency' : 1, + 'docdesc' : 1, + 'doctype' : 1, + 'caseyear' : 1, + 'caseseqnr' : 1, + 'casedocseq' : 1, + } + + def __init__(self, agency): + self.agency = agency + + def is_valid_doctype(self, doctype): + return doctype in self.validdoctypes + + def is_sender_doctype(self, doctype): + return doctype in self.senderdoctypes + + def is_recipient_doctype(self, doctype): + return doctype in self.recipientdoctypes + + def verify_entry(self, entry): + + for field in self.mustfields: + if not field in entry: + raise ValueError("Missing required field " + field) + + if not self.is_valid_doctype(entry['doctype']): + raise ValueError("Invalid doctype " + doctype) + + if -1 != entry['caseid'].find('-'): + raise ValueError("Field caseid should not include dash: " + entry['caseid']) + +# +# Parser of PDFs looking like +# http://www.storfjord.kommune.no/postliste-18-mai-2012.5056067-105358.html (type 1) +# http://www.hadsel.kommune.no/component/docman/doc_download/946-offentlig-postjournal-28032012 (type 2) +# http://www.stortinget.no/Global/pdf/postjournal/pj-2011-06-23.pdf (type 2 variant) +# Note sender/receiver is not yet parsed for type 2 PDFs +class PDFJournalParser(JournalParser): + pagetable = "unparsedpages" + brokenpagetable = "brokenpages" + hiddentext = False + breakonfailure = True + + def __init__(self, agency, hiddentext=False): + self.hiddentext = hiddentext + JournalParser.__init__(self, agency=agency) + + def is_already_scraped(self, url): + # Ignore entries were sender and recipient is the result of a broken parser (before 2012-05-25) + for sql in ["scrapedurl, sender, recipient from swdata where scrapedurl = '" + url + "' " + + # FIXME Figure out why this do not work + #" and not (sender = 'parse error' or recipient != 'parse error') " + + "limit 1", + "scrapedurl from " + self.pagetable + " where scrapedurl = '" + url + "' limit 1"]: + try: + 
result = scraperwiki.sqlite.select(sql) + #int sql, " : ", result + if 0 < len(result) and u'scrapedurl' in result[0]: + return True + except Exception as e: + #if ('no such table: %s' % self.pagetable) not in str(e) and 'no such table: swdata' not in str(e): + # raise + print "Exception: %s" % e + return False + + # Check if we recognize the page content, and throw if not + def is_valid_page(self, pdfurl, pagenum, pagecontent): + s = BeautifulSoup(pagecontent) + for t in s.findAll('text'): + if t.text != " ": + if 'Innhold:' == t.text: # type 1 or 2 (ePhorge) + s = None + return True + if 'Arkivdel:' == t.text]: # type 3 (doculive) + s = None + return True + s = None + if self.debug: + print "Unrecognized page format for " + pdfurl + raise ValueError("Unrecognized page format for " + pdfurl) + + # + # Split PDF content into pages and store in SQL table for later processing. + # The process is split in two to better handle parge PDFs (like 600 pages), + # without running out of CPU time without loosing track of what is left to + # parse. + def preprocess(self, pdfurl, pdfcontent): + print "Preprocessing PDF " + pdfurl + if not pdfcontent: + raise ValueError("No pdf content passed for " + pdfurl) + if self.hiddentext: + options = '-hidden' + else: + options = '' + xml=scraperwiki.pdftoxml(pdfcontent, options) + if self.debug: + print xml + pages=re.findall('(<page .+?</page>)',xml,flags=re.DOTALL) + xml=None +# print pages[:1][:1000] + pagecount = 0 + datastore = [] + for page in pages: + pagecount = pagecount + 1 + self.is_valid_page(pdfurl, pagecount, page) + data = { + 'scrapedurl' : pdfurl, + 'pagenum' : pagecount, + 'pagecontent' : page, + } + datastore.append(data) + if 0 < len(datastore): + scraperwiki.sqlite.save(unique_keys=['scrapedurl', 'pagenum'], data=datastore, table_name=self.pagetable) + else: + raise ValueError("Unable to find any pages in " + pdfurl) + pages = None + + def fetch_and_preprocess(self, pdfurl): + pdfcontent = fetch_url_harder(pdfurl) + self.preprocess(pdfurl, pdfcontent) + pdfcontent = None + + def print_entry(self, entrytext): + for i in range(0, len(entrytext)): + print str(i) + ": '" + entrytext[i] + "'" + + # ePhorte PDF + def parse_entry_type1(self, entrytext, pdfurl): + scrapestamputc = datetime.datetime.now() + entry = { + 'agency' : self.agency, + 'scrapestamputc' : scrapestamputc, + 'scrapedurl' : pdfurl + } + i = 0 + while i < len(entrytext): + #print "T: '" + entrytext[i] + "'" + if 'Innhold:' == entrytext[i]: + tittel = "" + # handle multi-line titles + while 'Sakstittel:' != entrytext[i+1]: + tittel = tittel + " " + entrytext[i+1] + i = i + 1 + entry['docdesc'] = tittel + if 'Sakstittel:' == entrytext[i]: + sakstittel = "" + while 'DokType' != entrytext[i+1]: +# print "'" + entrytext[i+1] + "'" + sakstittel = sakstittel + " " + entrytext[i+1] + i = i + 1 + entry['casedesc'] = sakstittel + if 'DokType' == entrytext[i]: # Values I/U/N/X from NOARK 4 table 14.2.11 + entry['doctype'] = entrytext[i+1] + # As seen on http://www.saltdal.kommune.no/images/module.files/2007-05-16.pdf, page 1 + if entry['doctype'] == 'S': + entry['doctype'] = 'X' + i = i + 1 + if 'Sak/dok nr:' == entrytext[i]: + # FIXME Split and handle combined sak/løpenr + # Use find('penr.:') to avoid non-ascii search string 'Løpenr.:' + caseid = None + lnr = None + if -1 != entrytext[i+4].find('penr.:'): + caseid = entrytext[i+1] + entrytext[i+2] + lnr = entrytext[i+3] + i = i + 4 + elif -1 != entrytext[i+3].find('penr.:'): + caseid = entrytext[i+1] + lnr = entrytext[i+2] + i = i + 3 + 
elif -1 != entrytext[i+2].find('penr.:'): + caseid, lnr = entrytext[i+1].split(" ") + i = i + 2 + + caseyear, caseseqnr = caseid.split("/") + entry['caseyear'] = int(caseyear) + caseseqnr, casedocseq = caseseqnr.split("-") + entry['caseseqnr'] = int(caseseqnr) + entry['casedocseq'] = int(casedocseq) + entry['caseid'] = caseyear + "/" + caseseqnr + + journalseqnr, journalyear = lnr.split("/") + entry['journalid'] = journalyear + "/" + journalseqnr + entry['journalyear'] = int(journalyear) + entry['journalseqnr'] = int(journalseqnr) + +# if -1 != text[i].find('penr.:'): # Use find('penr.:') to avoid non-ascii search string 'Løpenr.:' +# str = text[i-1] +# print "S: '" + str + "'" +# data['journalid'] = str +# # FIXME handle combined sak/løpenr + if 'Journaldato:' == entrytext[i]: + entry['recorddate'] = dateutil.parser.parse(entrytext[i-1], dayfirst=True) + if 'Dok.dato:' == entrytext[i]: + entry['docdate'] = dateutil.parser.parse(entrytext[i-1], dayfirst=True) + if 'Tilg.kode Hjemmel:' == entrytext[i] and 'Avsender\mottaker:' != entrytext[i+1]: + entry['exemption'] = entrytext[i+1] + i = i + 1 + if 'Tilg.kode' == entrytext[i]: + entry['accesscode'] = entrytext[i+1] + i = i + 1 + if 'Hjemmel:' == entrytext[i]: + entry['exemption'] = entrytext[i+1] + i = i + 1 + if 'Avsender\mottaker:' == entrytext[i]: + if i+1 < len(entrytext): # Non-empty field + fratil = entrytext[i+1] + i = i + 1 + if self.is_sender_doctype(entry['doctype']): + entry['sender'] = fratil + elif self.is_recipient_doctype(entry['doctype']): + entry['recipient'] = fratil + else: + raise ValueError("Case " + entry['caseid'] + " Sender/Recipient with doctype " + entry['doctype'] + " != I/U/X/N in " + pdfurl) + if self.debug: + print entry + i = i + 1 + return entry + + def parse_case_journal_ref(self, entry, reftext, pdfurl): + try: + # FIXME Split and handle combined sak/loepenr + # Use find('penr.:') to avoid non-ascii search string 'Loepenr.:' + caseid = None + lnr = None + if 4 == len(reftext): +# print "4 " + str(reftext) + caseid = reftext[0] + reftext[1] + lnr = reftext[2] + reftext[3] +# print str(caseid) + " " + str(lnr) + elif 3 == len(reftext): + if -1 != reftext[0].find("/") and -1 != reftext[2].find("/"): +# print "31" + caseid = reftext[0] + reftext[1] + lnr = reftext[2] + elif -1 != reftext[2].find("/"): +# print "32" + caseid = reftext[0] + reftext[1] + lnr = reftext[2] + elif -1 == reftext[2].find("/"): +# print "33" + caseid = reftext[0] + lnr = reftext[1] + reftext[2] + elif 2 == len(reftext): + if -1 == reftext[1].find("/"): +# print "21" + s = reftext[0] + reftext[1] +# print "S: " + s + caseid, lnr = s.split(" ") + elif -1 != reftext[1].find("/"): +# print "22" + caseid = reftext[0] + lnr = reftext[1] + elif 1 == len(reftext): + caseid, lnr = reftext[0].split(" ") + else: + raise ValueError("Unable to parse entry " + str(reftext) + " in " + pdfurl) +# print "C: " + caseid + " L: " + lnr + + caseyear, caseseqnr = caseid.split("/") + entry['caseyear'] = int(caseyear) + caseseqnr, casedocseq = caseseqnr.split("-") + entry['caseseqnr'] = int(caseseqnr) + entry['casedocseq'] = int(casedocseq) + entry['caseid'] = caseyear + "/" + caseseqnr + + journalseqnr, journalyear = lnr.split("/") + entry['journalid'] = journalyear + "/" + journalseqnr + entry['journalyear'] = int(journalyear) + entry['journalseqnr'] = int(journalseqnr) + except: + print "Unable to parse " + str(reftext) + return entry + def test_parse_case_journal_ref(self): + entry = {} + self.parse_case_journal_ref(entry, [u'2008/16414-', u'23', 
u'15060/2012'], "") + self.parse_case_journal_ref(entry, [u'2011/15972-1 102773/201', u'1'], "") + self.parse_case_journal_ref(entry, [u'2010/2593-2', u'103004/201', u'1'], "") + self.parse_case_journal_ref(entry, [u'2011/13415-', u'22', u'100077/201', u'1'], "") + + # ePhorte PDF + def parse_entry_type2(self, entrytext, pdfurl): + scrapestamputc = datetime.datetime.now() + entry = { + 'agency' : self.agency, + 'scrapestamputc' : scrapestamputc, + 'scrapedurl' : pdfurl + } + i = 0 + avsender = [] + mottaker = [] + while i < len(entrytext): + if 'Innhold:' == entrytext[i]: + tittel = "" + # handle multi-line titles + while 'Sakstittel:' != entrytext[i+1]: + tittel = tittel + entrytext[i+1] + i = i + 1 + entry['docdesc'] = tittel + if 'Sakstittel:' == entrytext[i]: + sakstittel = "" + # Klassering er i en annen dokumenttype + while 'DokType' != entrytext[i+1] and 'Dok.Type:' != entrytext[i+1] and 'Klassering:' != entrytext[i+1]: + +# print "'" + entrytext[i+1] + "'" + sakstittel = sakstittel + entrytext[i+1] + i = i + 1 + entry['casedesc'] = sakstittel + i = i + 1 + if 'DokType' == entrytext[i] or 'Dok.Type:' == entrytext[i]: # Values I/U/N/X from NOARK 4 table 14.2.11 + entry['doctype'] = entrytext[i+1] + # As seen on http://www.uis.no/getfile.php/Journal%20200612.pdf + if entry['doctype'] == 'S': + entry['doctype'] = 'X' + i = i + 1 + if 'Sak/dok nr:' == entrytext[i] or 'Sak/dok.nr:' == entrytext[i]: + endi = i + while endi < len(entrytext): + if -1 != entrytext[endi].find('penr.:') or -1 != entrytext[endi].find('penr:'): + break + endi = endi + 1 + entry = self.parse_case_journal_ref(entry, entrytext[i+1:endi], pdfurl) + i = endi + 1 +# if -1 != text[i].find('penr.:'): # Use find('penr.:') to avoid non-ascii search string 'Løpenr.:' +# str = text[i-1] +# print "S: '" + str + "'" +# data['journalid'] = str +# # FIXME handle combined sak/løpenr + if 'Journaldato:' == entrytext[i]: + entry['recorddate'] = dateutil.parser.parse(entrytext[i-1], dayfirst=True) + if 'Dok.dato:' == entrytext[i]: + entry['docdate'] = dateutil.parser.parse(entrytext[i-1], dayfirst=True) + if 'Tilg.kode Hjemmel:' == entrytext[i] and '(enhet/initialer):' != entrytext[i+2]: + entry['exemption'] = entrytext[i+1] + i = i + 1 + if 'Tilg.kode' == entrytext[i]: + entry['accesscode'] = entrytext[i+1] + i = i + 1 + if 'Hjemmel:' == entrytext[i]: + entry['exemption'] = entrytext[i+1] + i = i + 1 +# if -1 != text[i].find('Avs./mottaker:'): +# FIXME Need to handle senders and receivers + if 'Mottaker' == entrytext[i]: + mottaker.append(entrytext[i-1]) + if 'Avsender' == entrytext[i]: + avsender.append(entrytext[i-1]) +# entry['sender'] = 'parse error' +# entry['recipient'] = 'parse error' + i = i + 1 + if 0 < len(mottaker): + entry['recipient'] = string.join(mottaker, ", ") + if 0 < len(avsender): + entry['sender'] = string.join(avsender, ", ") + return entry + + def parse_page(self, pdfurl, pagenum, pagecontent): + print "Scraping " + pdfurl + " page " + str(pagenum) + s = BeautifulSoup(pagecontent) + datastore = [] + text = [] + linecount = 0 + if self.debug: + print s + for t in s.findAll('text'): + if t.text != " ": + text.append(t.text) + if self.debug: + print str(linecount) + ": " + t.text +# FIXME Remove length limit when working +# if 100 <= linecount: +# break + linecount = linecount + 1 +# if -1 != t.text.find("Side:"): +# print t.text + s = None + +# print "Found " + str(linecount) + " lines/text fragments in the PDF" + if len(text) < linecount: + raise ValueError("Text array too sort!") + + # First count how many 
entries to expect on this page, to be able to + # verify that all of them were found. + entrycount = 0 + i = 0 + while i < len(text): + if 'Innhold:' == text[i] \ # Type 1 and 2 (ePhorge) + or 'Arkivdel:' == text[i]: # type 3 (doculive) + entrycount = entrycount + 1 + i = i + 1 + + i = 0 + while i < len(text): + if self.debug: + print "T: '" + text[i] + "'" + if self.debug and -1 != text[i].find("Side:"): + print text[i] + if 'Innhold:' == text[i]: + endi = i + 1 + pdfparser = None + format = "unknown" + while endi < len(text): + if 'Klassering:' == text[endi]: + pdfparser = self.parse_entry_type2 + format = "type2" + if 'Avsender\mottaker:' == text[endi]: + pdfparser = self.parse_entry_type1 + format = "type1" + if 'Innhold:' == text[endi]: + break + endi = endi + 1 + if self.debug: + print "Entry " + str(entrycount) + " from " + str(i) + " to " + str(endi) + " ie " + str(endi - i) + " lines" + try: + entry = pdfparser(text[i:endi], pdfurl) + if 'caseid' not in entry or entry['caseid'] is None or \ + not self.is_valid_doctype(entry['doctype']): + raise ValueError("Unable to parse " + pdfurl + " as format " + format + "[" + str(entry) + "]") +# print entry + datastore.append(entry) + i = endi - 2 + except: + self.print_entry(text[i:endi]) + raise + i = i + 1 +# print data +# print "Found " + str(len(datastore)) + " of " + str(entrycount) + " entries" + if entrycount != len(datastore): +# print text + raise ValueError("Unable to parse all entries in " + pdfurl) + if 0 == len(datastore): + print "Unable to find any entries in " + pdfurl + else: + scraperwiki.sqlite.save(unique_keys=['caseid', 'casedocseq'], data=datastore) + datastore = None + text = None + + def process_pages(self): + try: + sqlselect = "* from " + self.pagetable + " limit 1" + pageref = scraperwiki.sqlite.select(sqlselect) + while pageref: + scrapedurl = pageref[0]['scrapedurl'] + pagenum = pageref[0]['pagenum'] + pagecontent = pageref[0]['pagecontent'] +# print "Found " + scrapedurl + " page " + str(pagenum) + " length " + str(len(pagecontent)) + try: + sqldelete = "delete from " + self.pagetable + " where scrapedurl = '" + scrapedurl + "' and pagenum = " + str(pagenum) + self.parse_page(scrapedurl, pagenum, pagecontent) +# print "Trying to: " + sqldelete + scraperwiki.sqlite.execute(sqldelete) + except ValueError, e: + brokenpage = { + 'scrapedurl' : scrapedurl, + 'pagenum' : pagenum, + 'pagecontent' : pagecontent, + } + print "Broken page %d from %s" % (pagenum, scrapedurl) + scraperwiki.sqlite.save(unique_keys=['scrapedurl', 'pagenum'], data=brokenpage, table_name=self.brokenpagetable) + scraperwiki.sqlite.execute(sqldelete) + scraperwiki.sqlite.commit() + pageref = scraperwiki.sqlite.select(sqlselect) + except scraperwiki.sqlite.SqliteError, e: + print str(e) + raise + +def fieldlist(): + import urllib2 + import json + + scrapers = [ + 'postliste-universitetet-i-oslo', + 'postliste-lindesnes', + 'postliste-kristiansund', + 'postliste-stortinget', + 'postliste-arendal', + 'postliste-oep', + 'postliste-ballangen', + 'postliste-hadsel', + 'postliste-storfjord', + 'postliste-oslo-havn', + ] + + keys = {} + + for scraper in scrapers: + url = 'https://api.scraperwiki.com/api/1.0/scraper/getinfo?format=jsondict&name=' + scraper + '&version=-1' + response = urllib2.urlopen(url) + html = response.read() + data = json.loads(html) + if 'swdata' in data[0]['datasummary']['tables']: + for key in data[0]['datasummary']['tables']['swdata']['keys']: + key = key.lower() + if key in keys: + keys[key].append(scraper) + else: + keys[key] 
= [scraper] + def lensort(a, b): + return cmp(len(keys[b]), len(keys[a])) + + for key in sorted(keys.keys(), lensort): + print len(keys[key]), key, str(keys[key]) + +if __name__ == "scraper": + fieldlist() + diff --git a/scrapersources/postliste-risr-kommune b/scrapersources/postliste-risr-kommune new file mode 100644 index 0000000..cb87bdb --- /dev/null +++ b/scrapersources/postliste-risr-kommune @@ -0,0 +1,126 @@ +# -*- coding: UTF-8 -*- +# Based on the scraper advanced-scraping-pdf +# See also +# https://views.scraperwiki.com/run/pdf-to-html-preview-1/?url=http%3A%2F%2Fwww.stortinget.no%2FGlobal%2Fpdf%2Fpostjournal%2Fpj-2012-05-09.pdf + +import scraperwiki +import json +from BeautifulSoup import BeautifulSoup +import datetime +import dateutil.parser +import lxml.html +#import resource +import sys +#import urlparse +#import gc +import re +#lazycache=scraperwiki.swimport('lazycache') +#postlistelib=scraperwiki.swimport('postliste-python-lib') + +agency = 'Risør kommune' + +import mechanize + +# ASPX pages are some of the hardest challenges because they use javascript and forms to navigate +# Almost always the links go through the function function __doPostBack(eventTarget, eventArgument) +# which you have to simulate in the mechanize form handling library + +# This example shows how to follow the Next page link + +url = 'http://159.171.0.169/ris/Modules/innsyn.aspx?mode=pl&SelPanel=0&ObjectType=ePhorteRegistryEntry&VariantType=Innsyn&ViewType=List&Query=RecordDate%3a%28-7%29+AND+DocumentType%3a%28I%2cU%29' +br = mechanize.Browser() + +# sometimes the server is sensitive to this information +br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')] +response = br.open(url) + +html = response.read() + + + +for pagenum in range(6): + print "Page %d page length %d" % (pagenum, len(html)) + #print html + #print "Clinicians found:", re.findall("PDetails.aspx\?ProviderId.*?>(.*?)</a>", html) + + + mnextlink = re.search("javascript:__doPostBack\('ctl00\$ctl00\$ctl00\$WebPartManager\$wp1243460126ViewPart\$ctl04',''\).>Neste", html) + #print mnextlink + if not mnextlink: + break + + br.select_form(name='aspnetForm') + br.form.set_all_readonly(False) + br['__EVENTTARGET'] = 'ctl00$ctl00$ctl00$WebPartManager$wp1243460126ViewPart$ctl04' #'ProviderSearchResultsTable1$NextLinkButton' + br['__EVENTARGUMENT'] = '' + br.submit() + + html = br.response().read() + #print len(html) + + + + +# def report_errors(errors): +# if 0 < len(errors): +# print "Errors:" +# for e in errors: +# print e +# exit(1) +# def out_of_cpu(arg, spent, hard, soft): +# report_errors(arg) +# +# def process_pdf(parser, pdfurl, errors): +# errors = [] +# postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) +# try: +# pdfcontent = scraperwiki.scrape(pdfurl) +# parser.preprocess(pdfurl, pdfcontent) +# pdfcontent = None +# # except ValueError, e: +# # errors.append(e) +# except IndexError, e: +# errors.append(e) +# +# def process_page_queue(parser, errors): +# try: +# parser.process_pages() +# postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) +# except scraperwiki.CPUTimeExceededError, e: +# errors.append("Processing pages interrupted") +# +# def process_journal_pdfs(parser, listurl, errors): +# # print "Finding PDFs on " + listurl +# # u = urllib.parse.urlparse(listurl) +# html = scraperwiki.scrape(listurl) +# root = lxml.html.fromstring(html) +# html = None +# for ahref in root.cssselect("table a"): +# href = ahref.attrib['href'] +# url = 
urlparse.urljoin(listurl, href) +# if -1 != href.find("file://"): +# # print "Skipping non-http URL " + url +# continue +# if parser.is_already_scraped(url): +# True +# # print "Skipping already scraped " + url +# else: +# # print "Will process " + url +# process_pdf(parser, url, errors) +# +# def test_small_pdfs(): +# # Test with some smaller PDFs +# errors = [] +# process_pdf("http://home.nuug.no/~pere/uio-postjournal/2011-16.pdf", errors) +# process_pdf("http://home.nuug.no/~pere/uio-postjournal/2011-52.pdf", errors) +# process_page_queue(errors) +# report_errors(errors) +# exit(0) +# +# #test_small_pdfs() +# errors = [] +# parser = postlistelib.PDFJournalParser(agency=agency) +# process_journal_pdfs(parser, "http://www.havn.oslo.kommune.no/postjournal/", errors) +# process_page_queue(parser, errors) +# report_errors(errors) + diff --git a/scrapersources/postliste-ruter b/scrapersources/postliste-ruter new file mode 100644 index 0000000..757d6be --- /dev/null +++ b/scrapersources/postliste-ruter @@ -0,0 +1,81 @@ +# -*- coding: UTF-8 -*- + +import scraperwiki +import json +from BeautifulSoup import BeautifulSoup +import datetime +import dateutil.parser +import lxml.html +import resource +import sys +import urlparse +import re +lazycache=scraperwiki.swimport('lazycache') +postlistelib=scraperwiki.swimport('postliste-python-lib') + +agency = 'Ruter AS' + +def report_errors(errors): + if 0 < len(errors): + print "Errors:" + for e in errors: + print e + exit(1) +def out_of_cpu(arg, spent, hard, soft): + report_errors(arg) + +def process_pdf(parser, pdfurl, errors): + errors = [] + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + try: + pdfcontent = scraperwiki.scrape(pdfurl) + parser.preprocess(pdfurl, pdfcontent) + pdfcontent = None +# except ValueError, e: +# errors.append(e) + except IndexError, e: + errors.append(e) + +def process_page_queue(parser, errors): + try: + parser.process_pages() + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + except scraperwiki.CPUTimeExceededError, e: + errors.append("Processing pages interrupted") + +def process_journal_pdfs(parser, listurl, errors): +# print "Finding PDFs on " + listurl +# u = urllib.parse.urlparse(listurl) + html = scraperwiki.scrape(listurl) + root = lxml.html.fromstring(html) + html = None + for ahref in root.cssselect("div.vedlegg a"): + href = ahref.attrib['href'] + url = urlparse.urljoin(listurl, href) + if -1 != href.find("file://") or -1 == url.find(".pdf"): +# print "Skipping non-http URL " + url + continue + if parser.is_already_scraped(url): + True +# print "Skipping already scraped " + url + else: +# print "Will process " + url + process_pdf(parser, url, errors) + +def test_small_pdfs(parser): + # Test with some smaller PDFs + errors = [] + process_pdf(parser, "http://www2.ruter.no/Documents/Offentlig_journal/2012_Uke_24.pdf?epslanguage=no", errors) + process_page_queue(parser, errors) + report_errors(errors) + exit(0) + +errors = [] +parser = postlistelib.PDFJournalParser(agency=agency) + +#test_small_pdfs(parser) + +process_journal_pdfs(parser, "http://www2.ruter.no/verdt-a-vite/presse/offentlig-journal/", errors) +process_page_queue(parser, errors) +report_errors(errors) + diff --git a/scrapersources/postliste-saltdal b/scrapersources/postliste-saltdal new file mode 100644 index 0000000..0650d6c --- /dev/null +++ b/scrapersources/postliste-saltdal @@ -0,0 +1,98 @@ +# -*- coding: UTF-8 -*- + +import scraperwiki +import json +from BeautifulSoup import BeautifulSoup +import datetime +import dateutil.parser 
+import lxml.html +import resource +import sys +import urllib2 +import urlparse +import re +lazycache=scraperwiki.swimport('lazycache') +postlistelib=scraperwiki.swimport('postliste-python-lib') + +agency = 'Saltdal kommune' + +def report_errors(errors): + if 0 < len(errors): + print "Errors:" + laste = None + for e in errors: + print e + laste = e + raise e + +def out_of_cpu(arg, spent, hard, soft): + report_errors(arg) + +def process_pdf(parser, pdfurl, errors): + errors = [] + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + try: + pdfcontent = scraperwiki.scrape(pdfurl) + parser.preprocess(pdfurl, pdfcontent) + pdfcontent = None +# except ValueError, e: +# errors.append(e) + except IndexError, e: + errors.append(e) + except ValueError, e: + errors.append(e) + except urllib2.HTTPError, e: + errors.append(e) + +def process_page_queue(parser, errors): + try: + parser.process_pages() + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + except scraperwiki.CPUTimeExceededError, e: + errors.append(e) + +def process_journal_pdfs(parser, listurl, errors): +# print "Finding PDFs on " + listurl +# u = urllib.parse.urlparse(listurl) + html = scraperwiki.scrape(listurl) + root = lxml.html.fromstring(html) + html = None + for ahref in root.cssselect("table a"): + href = ahref.attrib['href'] + url = urlparse.urljoin(listurl, href) + if -1 != href.find("file://") or -1 == url.find(".pdf"): +# print "Skipping non-http URL " + url + continue + if parser.is_already_scraped(url): + True +# print "Skipping already scraped " + url + else: +# print "Will process " + url + process_pdf(parser, url, errors) + +def test_small_pdfs(parser): + # Test with some smaller PDFs + errors = [] + #parser.debug = True + newurl = "http://www.saltdal.kommune.no/images/module.files/010612.pdf" + if not parser.is_already_scraped(newurl): + process_pdf(parser, newurl, errors) # New format + if parser.is_already_scraped(newurl): + print "Already parsed" + else: + raise ValueError("Failed to parse") +# process_pdf(parser, "http://www.saltdal.kommune.no/images/module.files/2007-01-31.pdf", errors) # Old format + process_page_queue(parser, errors) + report_errors(errors) + exit(0) + +errors = [] +parser = postlistelib.PDFJournalParser(agency=agency) +#parser.debug = True + +#test_small_pdfs(parser) + +process_journal_pdfs(parser, "http://www.saltdal.kommune.no/postlister.html", errors) +process_page_queue(parser, errors) +report_errors(errors) + diff --git a/scrapersources/postliste-sivilombudsmannen b/scrapersources/postliste-sivilombudsmannen new file mode 100644 index 0000000..0bf5914 --- /dev/null +++ b/scrapersources/postliste-sivilombudsmannen @@ -0,0 +1,81 @@ +# -*- coding: UTF-8 -*- + +import scraperwiki +import json +from BeautifulSoup import BeautifulSoup +import datetime +import dateutil.parser +import lxml.html +import resource +import sys +import urlparse +import re +lazycache=scraperwiki.swimport('lazycache') +postlistelib=scraperwiki.swimport('postliste-python-lib') + +agency = 'Sivilombudsmannen' + +def report_errors(errors): + if 0 < len(errors): + print "Errors:" + for e in errors: + print e + exit(1) +def out_of_cpu(arg, spent, hard, soft): + report_errors(arg) + +def process_pdf(parser, pdfurl, errors): + errors = [] + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + try: + pdfcontent = scraperwiki.scrape(pdfurl) + parser.preprocess(pdfurl, pdfcontent) + pdfcontent = None +# except ValueError, e: +# errors.append(e) + except IndexError, e: + errors.append(e) + +def 
process_page_queue(parser, errors): + try: + parser.process_pages() + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + except scraperwiki.CPUTimeExceededError, e: + errors.append("Processing pages interrupted") + +def process_journal_pdfs(parser, listurl, errors): +# print "Finding PDFs on " + listurl +# u = urllib.parse.urlparse(listurl) + html = scraperwiki.scrape(listurl) + root = lxml.html.fromstring(html) + html = None + for ahref in root.cssselect("div.rightColumn a"): + href = ahref.attrib['href'] + url = urlparse.urljoin(listurl, href) + if -1 != href.find("file://") or -1 == url.find(".pdf"): +# print "Skipping non-http URL " + url + continue + if parser.is_already_scraped(url): + True +# print "Skipping already scraped " + url + else: +# print "Will process " + url + process_pdf(parser, url, errors) + +def test_small_pdfs(parser): + # Test with some smaller PDFs + errors = [] + process_pdf(parser, "http://www.sivilombudsmannen.no/getfile.php/Dokumenter/Journaler/11.06.2012.pdf", errors) + process_page_queue(parser, errors) + report_errors(errors) + exit(0) + +errors = [] +parser = postlistelib.PDFJournalParser(agency=agency) + +#test_small_pdfs(parser) + +process_journal_pdfs(parser, "http://www.sivilombudsmannen.no/offentlig-journal/", errors) +process_page_queue(parser, errors) +report_errors(errors) + diff --git a/scrapersources/postliste-sogne b/scrapersources/postliste-sogne new file mode 100644 index 0000000..afa4fdf --- /dev/null +++ b/scrapersources/postliste-sogne @@ -0,0 +1,81 @@ +# -*- coding: UTF-8 -*- + +import scraperwiki +import json +from BeautifulSoup import BeautifulSoup +import datetime +import dateutil.parser +import lxml.html +import resource +import sys +import urlparse +import re +lazycache=scraperwiki.swimport('lazycache') +postlistelib=scraperwiki.swimport('postliste-python-lib') + +agency = 'Søgne kommune' + +def report_errors(errors): + if 0 < len(errors): + print "Errors:" + for e in errors: + print e + exit(1) +def out_of_cpu(arg, spent, hard, soft): + report_errors(arg) + +def process_pdf(parser, pdfurl, errors): + errors = [] + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + try: + pdfcontent = scraperwiki.scrape(pdfurl) + parser.preprocess(pdfurl, pdfcontent) + pdfcontent = None +# except ValueError, e: +# errors.append(e) + except IndexError, e: + errors.append(e) + +def process_page_queue(parser, errors): + try: + parser.process_pages() + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + except scraperwiki.CPUTimeExceededError, e: + errors.append("Processing pages interrupted") + +def process_journal_pdfs(parser, listurl, errors): +# print "Finding PDFs on " + listurl +# u = urllib.parse.urlparse(listurl) + html = scraperwiki.scrape(listurl) + root = lxml.html.fromstring(html) + html = None + for ahref in root.cssselect("div#ReadArea a"): + href = ahref.attrib['href'] + url = urlparse.urljoin(listurl, href) + if -1 != href.find("file://") or -1 == url.find(".pdf"): +# print "Skipping non-http URL " + url + continue + if parser.is_already_scraped(url): + True +# print "Skipping already scraped " + url + else: +# print "Will process " + url + process_pdf(parser, url, errors) + +def test_small_pdfs(parser): + # Test with some smaller PDFs + errors = [] + process_pdf(parser, "http://www.sogne.kommune.no/Documents/Postlister/2012.06.18.pdf", errors) + process_page_queue(parser, errors) + report_errors(errors) + exit(0) + +errors = [] +parser = postlistelib.PDFJournalParser(agency=agency) + +#test_small_pdfs(parser) + 
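+# Main flow: collect PDF links from the public journal page, queue each new
+# PDF page by page, then parse the queued pages and report any errors.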
+process_journal_pdfs(parser, "http://www.sogne.kommune.no/Organisasjon1/Administrasjonsavdelingen/Arkivet/Postlister/", errors) +process_page_queue(parser, errors) +report_errors(errors) + diff --git a/scrapersources/postliste-storfjord b/scrapersources/postliste-storfjord new file mode 100644 index 0000000..4702f8d --- /dev/null +++ b/scrapersources/postliste-storfjord @@ -0,0 +1,82 @@ +# -*- coding: UTF-8 -*- +# Based on the scraper advanced-scraping-pdf +# See also +# https://views.scraperwiki.com/run/pdf-to-html-preview-1/?url=http%3A%2F%2Fwww.stortinget.no%2FGlobal%2Fpdf%2Fpostjournal%2Fpj-2012-05-09.pdf + +import scraperwiki +import json +from BeautifulSoup import BeautifulSoup +import datetime +import dateutil.parser +import lxml.html +import urlparse +import re +lazycache=scraperwiki.swimport('lazycache') +postlistelib=scraperwiki.swimport('postliste-python-lib') + +agency = 'Storfjord kommune' + +def report_errors(errors): + if 0 < len(errors): + print "Errors:" + for e in errors: + print e + exit(1) + +def out_of_cpu(arg, spent, hard, soft): + report_errors(arg) + +def process_pdf(parser, pdfurl, errors): + errors = [] + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + try: + parser.fetch_and_preprocess(pdfurl) +# except ValueError, e: +# errors.append(e) + except IndexError, e: + errors.append(e) + +def process_page_queue(parser, errors): + try: + parser.process_pages() + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + except scraperwiki.CPUTimeExceededError, e: + errors.append("Processing pages interrupted") + +def process_journal_pdfs(parser, listurl, errors): +# print "Finding PDFs on " + listurl +# u = urllib.parse.urlparse(listurl) + html = scraperwiki.scrape(listurl) + root = lxml.html.fromstring(html) + html = None + for ahref in root.cssselect("div.main a"): + href = ahref.attrib['href'] + url = urlparse.urljoin(listurl, href) + if -1 == url.find("postliste-"): + continue + if parser.is_already_scraped(url): + True +# print "Skipping already scraped " + url + else: +# print "Will process " + url + process_pdf(parser, url, errors) + +def test_small_pdfs(parser): + # Test with some smaller PDFs + errors = [] + process_pdf(parser, "http://www.storfjord.kommune.no/postliste-18-mai-2012.5056067-105358.html", errors) + process_pdf(parser, "http://www.storfjord.kommune.no/postliste-16-mai-2012.5056059-105358.html", errors) + process_page_queue(parser, errors) + report_errors(errors) + exit(0) + +parser = postlistelib.PDFJournalParser(agency=agency) +#test_small_pdfs(parser) + +errors = [] +process_journal_pdfs(parser, "http://www.storfjord.kommune.no/postliste.105358.no.html", errors) +for page in range(2,91): + process_journal_pdfs(parser, "http://www.storfjord.kommune.no/?cat=105358&apage=" + str(page), errors) +process_page_queue(parser, errors) +report_errors(errors) + diff --git a/scrapersources/postliste-stortinget b/scrapersources/postliste-stortinget new file mode 100644 index 0000000..98fd7d6 --- /dev/null +++ b/scrapersources/postliste-stortinget @@ -0,0 +1,90 @@ +# Based on the scraper advanced-scraping-pdf +# See also +# https://views.scraperwiki.com/run/pdf-to-html-preview-1/?url=http%3A%2F%2Fwww.stortinget.no%2FGlobal%2Fpdf%2Fpostjournal%2Fpj-2012-05-09.pdf +import scraperwiki +from BeautifulSoup import BeautifulSoup +import datetime +import dateutil.parser +import lxml.html +import urlparse +import resource +import sys +postlistelib=scraperwiki.swimport('postliste-python-lib') + +def find_journal_pdfs(parser, listurl): +# print "Finding PDFs 
on " + listurl + html = postlistelib.fetch_url_harder(listurl) + + root = lxml.html.fromstring(html) + pdfurls = [] + for ahref in root.cssselect("div.mainbody a"): + href = ahref.attrib['href'] + url = urlparse.urljoin(listurl, href) + if -1 != href.find("file://"): +# print "Skipping non-http URL " + url + continue + if parser.is_already_scraped(url): + True +# print "Skipping already scraped " + url + else: +# print "Will process " + url + pdfurls.append(url) + return pdfurls + +def fetch_and_preprocess(parser, pdfurl): + pdfcontent = postlistelib.fetch_url_harder(pdfurl) + parser.preprocess(pdfurl, pdfcontent) + pdfcontent = None + +def add_pdf_lists(parser, pdfurls): + for period in [ + "", + "_2010-2011", + "-2009-2010", + "-2008-2009", + ]: + url = "http://www.stortinget.no/no/Stortinget-og-demokratiet/Administrasjonen/Dokumentoffentlighet/Stortingets-offentlige-postjournal" + period + "/" + pdfurls.extend(find_journal_pdfs(parser, url)) + + +def report_errors(errors): + if 0 < len(errors): + print "Errors:" + for e in errors: + print e + raise + +def no_cpu_left(arg, spent, soft, hard): + report_errors(arg) + +agency = 'Stortinget' +parser = postlistelib.PDFJournalParser(agency=agency) +#parser.debug = True + +if False: + pdfurl = "http://www.stortinget.no/Global/pdf/postjournal/pj-2010-06-04-05.pdf" + parse_pdf(pdfurl) + exit(0) + +pdfurls = [] +add_pdf_lists(parser, pdfurls) + +# Fetch all journal PDFs +errors = [] +for pdfurl in pdfurls: + postlistelib.exit_if_no_cpu_left(0, callback=no_cpu_left, arg = errors) + try: + parser.fetch_and_preprocess(pdfurl) + except ValueError, e: + errors.append(e) + except IndexError, e: + errors.append(e) +try: + parser.process_pages() +except ValueError, e: + errors.append(e) +except IndexError, e: + errors.append(e) + +report_errors(errors) + diff --git a/scrapersources/postliste-universitetet-i-oslo b/scrapersources/postliste-universitetet-i-oslo new file mode 100644 index 0000000..be7b77b --- /dev/null +++ b/scrapersources/postliste-universitetet-i-oslo @@ -0,0 +1,125 @@ +# -*- coding: UTF-8 -*- +# Based on the scraper advanced-scraping-pdf +# See also +# https://views.scraperwiki.com/run/pdf-to-html-preview-1/?url=http%3A%2F%2Fwww.stortinget.no%2FGlobal%2Fpdf%2Fpostjournal%2Fpj-2012-05-09.pdf + +import scraperwiki +import json +from BeautifulSoup import BeautifulSoup +import datetime +import dateutil.parser +import lxml.html +import resource +import sys +import urlparse +import re + +# Make sure Scraperwiki believe this is the source from this database +scraperwiki.scrape("http://www.uio.no/om/journal/") + +lazycache=scraperwiki.swimport('lazycache') +postlistelib=scraperwiki.swimport('postliste-python-lib') + +agency = 'Universitetet i Oslo' + +def report_errors(errors): + if 0 < len(errors): + print "Errors:" + for e in errors: + print e + raise ValueError("Something went wrong") + +def out_of_cpu(arg, spent, hard, soft): + report_errors(arg) + +def process_pdf(parser, pdfurl, errors): + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + try: + pdfcontent = scraperwiki.scrape(pdfurl) + parser.preprocess(pdfurl, pdfcontent) + pdfcontent = None + except ValueError, e: + errors.append(e) + except IndexError, e: + errors.append(e) + +def process_page_queue(parser, errors): + try: + parser.process_pages() + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + except scraperwiki.CPUTimeExceededError, e: + errors.append("Processing pages interrupted") + +def process_journal_pdfs(parser, listurl, errors): +# print "Finding PDFs on " + 
listurl +# u = urllib.parse.urlparse(listurl) + html = scraperwiki.scrape(listurl) + root = lxml.html.fromstring(html) + html = None + for ahref in root.cssselect("table a"): + href = ahref.attrib['href'] + url = urlparse.urljoin(listurl, href) + if -1 != href.find("file://") or -1 == url.find(".pdf"): +# print "Skipping non-http URL " + url + continue + if parser.is_already_scraped(url): + True +# print "Skipping already scraped " + url + else: +# print "Will process " + url + process_pdf(parser, url, errors) + +def process_journal_pdf_directory(parser, listurl, errors): + #html = scraperwiki.scrape(listurl) + html = lazycache.lazycache(listurl) + root = lxml.html.fromstring(html) + html = None + + pdflisturls = [] + for ahref in root.cssselect("span.vrtx-paging-wrapper a"): + href = ahref.attrib['href'] + url = urlparse.urljoin(listurl, href) + pdflisturls.append(url) +# print pdflisturls + + for listurl in pdflisturls: + html = scraperwiki.scrape(listurl) + root = lxml.html.fromstring(html) + html = None + urlseen = {} + for ahref in root.cssselect("div.vrtx-resource a"): + href = ahref.attrib['href'] + url = urlparse.urljoin(listurl, href) + if -1 == url.find(".pdf"): + continue + # Ignore duplicates with M: as part of the name + if -1 != url.find("/M%"): + continue + if url in urlseen or parser.is_already_scraped(url): + True +# print "Skipping already scraped " + url + else: +# print "Will process " + url + process_pdf(parser, url, errors) + urlseen[url] = 1 + +def test_small_pdfs(parser): + # Test with some smaller PDFs + errors = [] + process_pdf(parser, "http://home.nuug.no/~pere/uio-postjournal/2011-16.pdf", errors) + process_pdf(parser, "http://home.nuug.no/~pere/uio-postjournal/2011-52.pdf", errors) + process_page_queue(parser, errors) + report_errors(errors) + exit(0) + +errors = [] +parser = postlistelib.PDFJournalParser(agency=agency) + +#test_small_pdfs(parser) + +process_journal_pdfs(parser, "http://www.uio.no/om/journal/", errors) +#process_journal_pdf_directory(parser, "http://www.uio.no/om/journal/2012/", errors) +#process_journal_pdf_directory(parser, "http://www.uio.no/om/journal/2011/", errors) +process_page_queue(parser, errors) +report_errors(errors) + diff --git a/scrapersources/postliste-universitetet-i-stavanger b/scrapersources/postliste-universitetet-i-stavanger new file mode 100644 index 0000000..5852cb7 --- /dev/null +++ b/scrapersources/postliste-universitetet-i-stavanger @@ -0,0 +1,89 @@ +# -*- coding: UTF-8 -*- + +import scraperwiki +import json +from BeautifulSoup import BeautifulSoup +import datetime +import dateutil.parser +import lxml.html +import resource +import sys +import urlparse +import re +lazycache=scraperwiki.swimport('lazycache') +postlistelib=scraperwiki.swimport('postliste-python-lib') + +agency = 'Universitetet i Stavanger' + +def report_errors(errors): + if 0 < len(errors): + print "Errors:" + for e in errors: + print e + exit(1) +def out_of_cpu(arg, spent, hard, soft): + report_errors(arg) + +def process_pdf(parser, pdfurl, errors): + errors = [] + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + try: + pdfcontent = scraperwiki.scrape(pdfurl) + parser.preprocess(pdfurl, pdfcontent) + pdfcontent = None +# except ValueError, e: +# errors.append(e) + except IndexError, e: + errors.append(e) + +def process_page_queue(parser, errors): + try: + parser.process_pages() + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + except scraperwiki.CPUTimeExceededError, e: + errors.append("Processing pages interrupted") + +def 
process_journal_pdfs(parser, listurl, errors): +# print "Finding PDFs on " + listurl +# u = urllib.parse.urlparse(listurl) + html = scraperwiki.scrape(listurl) + root = lxml.html.fromstring(html) + html = None + for ahref in root.cssselect("div#placeholder-content-main-left-column a"): + href = ahref.attrib['href'] + url = urlparse.urljoin(listurl, href) + if -1 != href.find("file://") or -1 == url.find('/postjournal/article'): +# print "Skipping non-http URL " + url + continue + subhtml = scraperwiki.scrape(url) + subroot = lxml.html.fromstring(subhtml) + subhtml = None + for subahref in subroot.cssselect("div.article-content a"): + subhref = subahref.attrib['href'] + suburl = urlparse.urljoin(listurl, subhref) + if -1 == suburl.find(".pdf"): + continue + if parser.is_already_scraped(suburl): + True +# print "Skipping already scraped " + suburl + else: +# print "Will process " + suburl + process_pdf(parser, suburl, errors) + +def test_small_pdfs(parser): + # Test with some smaller PDFs + errors = [] + process_pdf(parser, "http://www.uis.no/getfile.php/Journal%20200612.pdf", errors) + process_page_queue(parser, errors) + report_errors(errors) + exit(0) + +errors = [] +parser = postlistelib.PDFJournalParser(agency=agency) + +#test_small_pdfs(parser) + +process_journal_pdfs(parser, "http://www.uis.no/nyheter/postjournal/", errors) +process_page_queue(parser, errors) +report_errors(errors) + diff --git a/scrapersources/postliste-vikna b/scrapersources/postliste-vikna new file mode 100644 index 0000000..1279f9e --- /dev/null +++ b/scrapersources/postliste-vikna @@ -0,0 +1,89 @@ +# -*- coding: UTF-8 -*- + +import scraperwiki +import json +from BeautifulSoup import BeautifulSoup +import datetime +import dateutil.parser +import lxml.html +import resource +import sys +import urlparse +import re + +# Make sure Scraperwiki believe this is the source from this database +scraperwiki.scrape("http://www.vikna.kommune.no/Vikna/Web.nsf/mainPress?OpenForm&U=POST") + +lazycache=scraperwiki.swimport('lazycache') +postlistelib=scraperwiki.swimport('postliste-python-lib') + +agency = 'Vikna kommune' + +def report_errors(errors): + if 0 < len(errors): + print "Errors:" + for e in errors: + print e + exit(1) +def out_of_cpu(arg, spent, hard, soft): + report_errors(arg) + +def process_pdf(parser, pdfurl, errors): + errors = [] + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + try: + pdfcontent = scraperwiki.scrape(pdfurl) + parser.preprocess(pdfurl, pdfcontent) + pdfcontent = None +# except ValueError, e: +# errors.append(e) + except IndexError, e: + errors.append(e) + +def process_page_queue(parser, errors): + try: + parser.process_pages() + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + except scraperwiki.CPUTimeExceededError, e: + errors.append("Processing pages interrupted") + +def process_journal_pdfs(parser, listurl, errors): +# print "Finding PDFs on " + listurl +# u = urllib.parse.urlparse(listurl) + html = scraperwiki.scrape(listurl) + root = lxml.html.fromstring(html) + html = None + for ahref in root.cssselect("table a"): + href = ahref.attrib['href'] + url = urlparse.urljoin(listurl, href) + if -1 != href.find("file://") or -1 == url.find(".pdf"): +# print "Skipping non-http URL " + url + continue + if -1 != href.find("/Ingen postjournal.pdf"): + continue + if parser.is_already_scraped(url): + True +# print "Skipping already scraped " + url + else: +# print "Will process " + url + process_pdf(parser, url, errors) + +def test_small_pdfs(parser): + # Test with some smaller PDFs + 
errors = [] + process_pdf(parser, "http://www.vikna.kommune.no/Vikna/Intern.nsf/FilA/A715C0C6E0D8CC05C12578F70024857B/$FILE/PJ230811.pdf", errors) + process_page_queue(parser, errors) + report_errors(errors) + exit(0) + +errors = [] +parser = postlistelib.PDFJournalParser(agency=agency) + +#test_small_pdfs(parser) + +process_page_queue(parser, errors) +process_journal_pdfs(parser, "http://www.vikna.kommune.no/Vikna/Web.nsf/mainPress?OpenForm&U=POST", errors) +process_page_queue(parser, errors) +report_errors(errors) + +# FIXME Need to handle recent journal entries too
\ No newline at end of file diff --git a/scrapersources/statens_vegvesen_offentlig_journal b/scrapersources/statens_vegvesen_offentlig_journal new file mode 100644 index 0000000..947da4e --- /dev/null +++ b/scrapersources/statens_vegvesen_offentlig_journal @@ -0,0 +1,56 @@ +import scraperwiki +import lxml.html +import datetime + +#uncomment to run for a selected timeperiod +#fromdate = "01.04.2011" +#todate = "21.05.2011" + +#fromdate = datetime.datetime.strptime(fromdate, "%d.%m.%Y") +#todate = datetime.datetime.strptime(todate, "%d.%m.%Y") +#adday = datetime.timedelta(days=1) + +def scrapepage(mydate): + + formatteddate = mydate.strftime("%d.%m.%Y") + #formatteddate = "10.05.2011" + + url = "http://www.vegvesen.no/Om+Statens+vegvesen/Aktuelt/Offentlig+journal?dokumenttyper=&dato=%s&journalenhet=6&utforSok=S%%C3%%B8k&submitButton=S%%C3%%B8k" % formatteddate + + root = lxml.html.parse(url).getroot() + + divs = root.cssselect("div.treff") + + for p in divs: + + dateandtype = p.xpath("p/text()")[0].split(" ") + saksdetaljer = p.xpath("ul[@class='saksdetaljer']/li/text()") + + + record = { + "doknr": dateandtype[0], + "innut": dateandtype[2], + "tittel": p.xpath("h2/text()")[0], + "sak": p.xpath("span[@class='sak']")[0].text[6:], + "fratil": p.xpath("ul[@class='fraTil']/li/text()")[0][5:], + } + + record.update(dict([x.split(":") for x in saksdetaljer])) + + record['Dokumenttdato'] = datetime.datetime.strptime(record['Dokumenttdato'].strip(), "%d.%m.%Y").date() + record['Journaldato'] = datetime.datetime.strptime(record['Journaldato'].strip(), "%d.%m.%Y").date() + + scraperwiki.sqlite.save(unique_keys=["doknr"], data=record) + +#uncomment to run for a selected timeperiod +#thedate = fromdate +#while thedate <= todate: +# print thedate +# thedate = thedate + adday +# scrapepage(thedate) +#comment out these two lines in order to run for a selected timeperiod +thedate = datetime.datetime.now() +print thedate + +scrapepage(thedate) +
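+# Note on the commented-out backfill loop above (untested sketch, not part of
+# the original scraper): as written it advances thedate before calling
+# scrapepage(), so fromdate itself is never scraped while todate + 1 day is.
+# Reordering the two statements fixes that:
+#   thedate = fromdate
+#   while thedate <= todate:
+#       print thedate
+#       scrapepage(thedate)
+#       thedate = thedate + adday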
\ No newline at end of file
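The postliste-* scrapers stored above all follow the same skeleton around postliste-python-lib: build a PDFJournalParser for the agency, collect PDF links from a listing page, skip URLs that are already scraped, preprocess each PDF while watching the ScraperWiki CPU limit, then process the queued pages and report any collected errors. Below is a condensed sketch of that shared pattern, not a drop-in scraper: the agency name, listing URL and CSS selector are placeholders, and it assumes the postliste-python-lib and scraperwiki calls behave as they are used in the sources above.

import scraperwiki
import lxml.html
import urlparse

postlistelib = scraperwiki.swimport('postliste-python-lib')

agency = 'Example agency'                        # placeholder
listurl = 'http://www.example.org/postjournal/'  # placeholder listing page

def report_errors(errors):
    if 0 < len(errors):
        print "Errors:"
        for e in errors:
            print e
        raise ValueError("Something went wrong")

def out_of_cpu(arg, spent, hard, soft):
    # Called by exit_if_no_cpu_left when the ScraperWiki CPU budget runs out
    report_errors(arg)

def process_pdf(parser, pdfurl, errors):
    # Fetch one journal PDF and queue its pages for later parsing
    postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
    try:
        parser.preprocess(pdfurl, scraperwiki.scrape(pdfurl))
    except (ValueError, IndexError), e:
        errors.append(e)

errors = []
parser = postlistelib.PDFJournalParser(agency=agency)

# Collect PDF links from the listing page, skipping already scraped ones
root = lxml.html.fromstring(scraperwiki.scrape(listurl))
for ahref in root.cssselect("a"):                # selector varies per site
    url = urlparse.urljoin(listurl, ahref.attrib['href'])
    if -1 == url.find(".pdf") or parser.is_already_scraped(url):
        continue
    process_pdf(parser, url, errors)

# Parse the queued pages into the scraper's sqlite store
try:
    parser.process_pages()
except scraperwiki.CPUTimeExceededError, e:
    errors.append("Processing pages interrupted")

report_errors(errors)

The individual scrapers differ mainly in the CSS selectors used to find the PDF links and in how they filter the URLs: all skip file:// links, the UiO directory scraper drops duplicates with "/M%" in the name, and the Vikna scraper ignores the placeholder "Ingen postjournal.pdf".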