| author | Petter Reinholdtsen <pere@hungry.com> | 2012-07-13 12:28:13 +0200 |
| --- | --- | --- |
| committer | Petter Reinholdtsen <pere@hungry.com> | 2012-07-13 12:28:13 +0200 |
| commit | 22bceaf65dd89df97529df0102149aefa2b54f54 (patch) | |
| tree | 24a6dd995d146b27d92d4c91593dc8d8fd952064 | |
Store current scraperwiki sources.
46 files changed, 5651 insertions, 0 deletions
diff --git a/fetch-scraper-sources b/fetch-scraper-sources new file mode 100755 index 0000000..6465ea3 --- /dev/null +++ b/fetch-scraper-sources @@ -0,0 +1,15 @@ +#!/bin/sh + +scrapers="postliste-python-lib postliste-ballangen" + +scrapers="$( ( + GET https://scraperwiki.com/tags/postjournal + GET https://scraperwiki.com/tags/postjournal?page=2 + GET https://scraperwiki.com/tags/postjournal?page=3 +) | grep owner | rev | cut -d/ -f3 | rev)" + +mkdir -p scrapersources +for scraper in $scrapers; do + echo "Fetching source for $scraper" + GET "https://scraperwiki.com/editor/raw/$scraper" > "scrapersources/$scraper" +done diff --git a/scrapersources/nrks_offentlig_journal_pdf_text_positioning b/scrapersources/nrks_offentlig_journal_pdf_text_positioning new file mode 100644 index 0000000..51cd746 --- /dev/null +++ b/scrapersources/nrks_offentlig_journal_pdf_text_positioning @@ -0,0 +1,141 @@ +import scraperwiki, urllib2, datetime, base64, time, re +from bs4 import BeautifulSoup +from collections import deque +import scraperwiki +lazycache = scraperwiki.swimport('lazycache') +u = scraperwiki.swimport('hildenae_utils') + +def d(text): + if(False): + print "DEBUG:", text + +def process_pdf(pdfurl): + pdfxml = u.findInCache(pdfurl,verbose=True) # look for html parse in cache + if pdfxml is None: # a html parse is not cached + pdfdata=lazycache.lazycache(pdfurl, verbose=True) # look for pdf document in cache, if not download + pdfxml = scraperwiki.pdftoxml(pdfdata, "-hidden") # parse pdf text to html + u.putInCache(pdfurl, pdfxml, verbose=True) # save cache of html parse + + beautifulxml = BeautifulSoup(pdfxml) # convert html to BeautifulSoup(4) object + + for page in beautifulxml.find_all('page'): + FIRSTPAGE = 6 + LASTPAGE = 6 + if int(page['number']) < FIRSTPAGE: + continue + if int(page['number']) == FIRSTPAGE: + print "*******************************************" + print "***** FIRSTPAGE #%d while developing ******" % (FIRSTPAGE) + print "*******************************************" + if int(page['number']) == LASTPAGE+1: + print "*******************************************" + print "****** LASTPAGE #%d while developing ******" % (LASTPAGE) + print "*******************************************" + break + + print( "*******************************************") + print( "********** Working on page #%s **********" % page['number']) + print( "*******************************************") + elementList = deque(page.find_all('text')) # we want to be able to use popleft + d(elementList) + while True: + try: + currElement = elementList.popleft() + if "Innhold:" in currElement.text and currElement.b: # we found a "Innhold:"-header + entry = parseDocumentRecord(currElement, elementList) + print entry + scraperwiki.sqlite.save(unique_keys=["innhold", "sakstittel"], data=entry) + d( "back in process_pdf") + #else: + #print currElement.text + except IndexError, e: + d("No more text elements on page (%s)" % e) + break + + + +def parseDocumentRecord(currElement, elementList): + # previous element in list is "Innhold:" + d ("starting parseDocumentRecord") + entry = {} + while(True): + try: + d(elementList) + if "Innhold:" in elementList[0].text: # look ahead, if next is "Innhold:" return to process_pdf + break + + currElement = elementList.popleft() # first text in innhold + entry["innhold"] = "" + while(True): + if "Sakstittel:" in currElement.text: # we found sakstittel, go to next + break + entry["innhold"] += currElement.text + currElement = elementList.popleft() + entry["innhold"] = 
u.removeDoubleSpaces(entry["innhold"]) + + currElement = elementList.popleft() # first text in sakstittel + entry["sakstittel"] = "" + while(True): + if "DokType" in currElement.text: # we found DokType, go to next + break + entry["sakstittel"] += currElement.text + currElement = elementList.popleft() + entry["sakstittel"] = u.removeDoubleSpaces(entry["sakstittel"]) + + print("before spool to 'mottaker:'") + + ''' + + + + Komments: Virker som om pdf2html noen ganger ikke klarer å lese DokType. Hittil er dette kun observert når + DokType er U (selv om den klarer å lese noen DokType U). Dette er bekreftet mesteparten av 18 og 22 i juni + + + + ''' + print elementList + + + + print("spool to 'mottaker:'") + currElement = elementList.popleft() # first text after DocType + while(True): + if re.search( r'[t].*[t].*[a].*[k].*[e].*[r].*[:]', currElement.text): # match "motta ker:" (some last pages - nooooot pretty) + d("found mottaker") + break + currElement = elementList.popleft() + + d(elementList) + + entry["avsender_mottager"] = "" + while(True): + if ("Innhold:" in elementList[0].text) or ("Side:" in elementList[0].text): # ***look ahead***, if next is "Innhold:" return to process_pdf + #print "next is innhold, cleanup" + entry["avsender_mottager"] = u.removeDoubleSpaces(entry["avsender_mottager"]) + if re.match("^[*]+$", entry["avsender_mottager"]): + entry["avsender_mottager"] = None + #print elementList + #print entry + d("finished with record") + break + #print "Adding to avs_mot (%s)" % currElement.text + entry["avsender_mottager"] += currElement.text + currElement = elementList.popleft() + + #print "lastBreak" + break # we are finished with this Innhold + except IndexError, e: + d("No more text elements on page (%s)" % e) + break + return entry + +process_pdf("http://www.nrk.no/contentfile/file/1.8221353!offentlig22062012.pdf") # 4 records on last page +#process_pdf("http://www.nrk.no/contentfile/file/1.8217234!offentligjournal21062012.pdf") # 3 records on last page +#process_pdf("http://www.nrk.no/contentfile/file/1.8214156!offentligjournal20062012.pdf") +#process_pdf("http://www.nrk.no/contentfile/file/1.8212381!offentligjournal19062012.pdf") + +# https://views.scraperwiki.com/run/pdf_to_html_preview_4/?url=http%3A%2F%2Fwww.nrk.no%2Fcontentfile%2Ffile%2F1.8209505%21offentligjournal18062012.pdf&hidden=1 +#process_pdf("http://www.nrk.no/contentfile/file/1.8209505!offentligjournal18062012.pdf") # 1 record on last page + + diff --git a/scrapersources/oep-exemptions b/scrapersources/oep-exemptions new file mode 100644 index 0000000..23a1691 --- /dev/null +++ b/scrapersources/oep-exemptions @@ -0,0 +1,101 @@ +<!doctype html> +<html lang="nb"> +<head> +<meta charset="utf-8" /> +<title>Hvor mange dokumenter er journalført i hver etat og hvor mange er unntatt innsyn?</title> +<script src="https://cdnjs.cloudflare.com/ajax/libs/jquery/1.7.2/jquery.min.js"></script> +<script src="https://cdnjs.cloudflare.com/ajax/libs/highcharts/2.2.2/highcharts.js"></script> +<!-- <script src="https://code.highcharts.com/modules/exporting.js"></script>--> +<script src="https://cdnjs.cloudflare.com/ajax/libs/underscore.js/1.3.3/underscore-min.js"></script> +<script> +$(function() + { + var chart; + var query_url = "https://api.scraperwiki.com/api/1.0/datastore/sqlite?format=jsondict&name=postliste-oep&query=select%20Agency%2C%22Grounds%20for%20exemption%20document%22%20as%20ex%2C%20count(*)%20as%20num%20from%20%60swdata%60%20group%20by%20Agency%2Cex%20"; + + function get_chart_opts(agencies, series) { + return { 
+ chart: { renderTo: 'container', type: 'bar' }, + title: { text: 'Hvor mange dokumenter er journalført i hver etat og hvor mange er unntatt innsyn?' }, + xAxis: { categories: agencies }, + yAxis: { + min: 0, + title: { text: "Antall journalførte dokumenter" }, + }, + legend: { + backgroundColor: '#FFFFFF', + reversed: true + }, + tooltip: { + formatter: function() { + return ''+ this.series.name + ': '+ this.y + ' ('+parseInt(this.percentage) + '%)'; + + } + }, + plotOptions: { + series: { + stacking: 'normal' + } + }, + series: series + }; + + } + + function populate_chart(data) { + // TODO: Very naive iteration today. Should be optimized + var agencies = _.uniq( _.pluck(data, 'Agency') ); + var totals = {}; + var not_exemption = {}; + var series = []; + + // traverse and find data + _.each(data, function(entry) { + var agency_name = entry['Agency']; + + if (agency_name) { + if (! totals[agency_name]) { + totals[agency_name] = 0; + } + totals[agency_name] += entry['num']; + + if ("" == entry['ex']) { + not_exemption[agency_name] = entry['num']; + } + } + }); + + + // make series + series.push({ name: 'Ingen merknader', + data: _.map(agencies, function(agency) { + return not_exemption[agency]; + }) + }); + + + series.push({ name: 'Unntatt innsyn', + data: _.map(agencies, function(agency) { + return totals[agency] - not_exemption[agency]; + }) + }); + + + + chart = new Highcharts.Chart(get_chart_opts(agencies, series)); + }; + + + $(document).ready(function() { + $.ajax({ url: query_url, dataType: 'json', success: function(data){ populate_chart(data); } }); + }); +} +); + +</script> +</head> +<body> + <div id="container" style="height: 2000px;width: 100%;margin: 0 auto"></div> + <p>Alle dokumenter som har oppgitt en grunn for å unnlate offentligjøring vil telles som "Unnatt innsyn".</p> +</body> +</html> diff --git a/scrapersources/oep-exemptions_1 b/scrapersources/oep-exemptions_1 new file mode 100644 index 0000000..29c3a98 --- /dev/null +++ b/scrapersources/oep-exemptions_1 @@ -0,0 +1,101 @@ +<!doctype html> +<html lang="nb"> +<head> +<meta charset="utf-8" /> +<title>Hvor mange dokumenter er journalført i hver etat og hvor mange er unntatt innsyn?</title> +<script src="https://cdnjs.cloudflare.com/ajax/libs/jquery/1.7.2/jquery.min.js"></script> +<script src="https://cdnjs.cloudflare.com/ajax/libs/highcharts/2.2.2/highcharts.js"></script> +<!-- <script src="https://code.highcharts.com/modules/exporting.js"></script>--> +<script src="https://cdnjs.cloudflare.com/ajax/libs/underscore.js/1.3.3/underscore-min.js"></script> +<script> +$(function() + { + var chart; + var query_url = "https://api.scraperwiki.com/api/1.0/datastore/sqlite?format=jsondict&name=postliste-oep&query=select%20Agency%2C%22Grounds%20for%20exemption%20document%22%20as%20ex%2C%20count(*)%20as%20num%20from%20%60swdata%60%20group%20by%20Agency%2Cex%20"; + + function get_chart_opts(agencies, series) { + return { + chart: { renderTo: 'container', type: 'bar' }, + title: { text: 'Hvor mange dokumenter er journalført i hver etat og hvor mange er unntatt innsyn?' 
}, + xAxis: { categories: agencies }, + yAxis: { + min: 0, + title: { text: "Antall journalførte dokumenter" }, + }, + legend: { + backgroundColor: '#FFFFFF', + reversed: true + }, + tooltip: { + formatter: function() { + return ''+ this.series.name + ': '+ this.y + ' ('+parseInt(this.percentage) + '%)'; + + } + }, + plotOptions: { + series: { + stacking: 'percent' + } + }, + series: series + }; + + } + + function populate_chart(data) { + // TODO: Very naive iteration today. Should be optimized + var agencies = _.uniq( _.pluck(data, 'Agency') ); + var totals = {}; + var not_exemption = {}; + var series = []; + + // traverse and find data + _.each(data, function(entry) { + var agency_name = entry['Agency']; + + if (agency_name) { + if (! totals[agency_name]) { + totals[agency_name] = 0; + } + totals[agency_name] += entry['num']; + + if ("" == entry['ex']) { + not_exemption[agency_name] = entry['num']; + } + } + }); + + + // make series + series.push({ name: 'Ingen merknader', + data: _.map(agencies, function(agency) { + return not_exemption[agency]; + }) + }); + + + series.push({ name: 'Unntatt innsyn', + data: _.map(agencies, function(agency) { + return totals[agency] - not_exemption[agency]; + }) + }); + + + + chart = new Highcharts.Chart(get_chart_opts(agencies, series)); + }; + + + $(document).ready(function() { + $.ajax({ url: query_url, dataType: 'json', success: function(data){ populate_chart(data); } }); + }); +} +); + +</script> +</head> +<body> + <div id="container" style="height: 2000px;width: 100%;margin: 0 auto"></div> + <p>Alle dokumenter som har oppgitt en grunn for å unnlate offentligjøring vil telles som "Unnatt innsyn".</p> +</body> +</html> diff --git a/scrapersources/postlist-ssb b/scrapersources/postlist-ssb new file mode 100644 index 0000000..de2a051 --- /dev/null +++ b/scrapersources/postlist-ssb @@ -0,0 +1,164 @@ +import scraperwiki +import urllib2 +import lxml.html +import datetime +import time +import dateutil.parser +import pickle +import re + +from datetime import date +from datetime import timedelta +from time import strftime + +# Make sure Scraperwiki believe this is the source from this database +scraperwiki.scrape("http://www.ssb.no/omssb/journal/") + +postlistelib=scraperwiki.swimport('postliste-python-lib') + +agency = "Statistisk sentralbyrå" + +def daterange(start_date, end_date): + for n in range((end_date - start_date).days): + yield start_date + timedelta(n) + +def expand_year(year): + year = int(year) + if year > 50: + year = year + 1900 + else: + year = year + 2000 + return year + +def fetch_url(url): + html = None + for n in [1]: + try: + html = scraperwiki.scrape(url) + break + except urllib2.URLError, e: + print "URLError fetching " + url + ", trying again" + return html + +def save_date(parser, date, url, html): + num_saved = 0 + root = lxml.html.fromstring(html) + journal_date = dateutil.parser.parse(root.cssselect("p")[0].text_content().replace("Journaldato: ",""), dayfirst=True) + if date == journal_date.date(): + datastore = [] + for table in root.cssselect("table"): + docid = table.cssselect("tr")[0].cssselect("p")[1].text.strip() + datedesc = table.cssselect("tr")[0].cssselect("td")[3].cssselect("p")[0].text.strip() + + exemption = table.cssselect("tr")[1].cssselect("td")[5].cssselect("p")[0].text.strip() + + fratil_indicator = table.cssselect("tr")[2].cssselect("td")[0].cssselect("p")[0].text.strip() + + doctype = "" + if fratil_indicator.startswith("Til"): + doctype = "U" + elif fratil_indicator.startswith("Fra"): + doctype = "I" + elif 
fratil_indicator.startswith("Notat fra"): + doctype = "N" + else: + raise ValueError("Fant ikke doctype %s" % fratil_indicator) + + fratil_agency = table.cssselect("tr")[2].cssselect("td")[1].cssselect("p")[0].text.strip() + + casedesc = table.cssselect("tr")[4].cssselect("td")[1].cssselect("p")[0].text.strip() + + docdesc = table.cssselect("tr")[5].cssselect("td")[1].cssselect("p")[0].text.strip() + saksb = table.cssselect("tr")[0].cssselect("p")[5].text.strip() + + docdate = dateutil.parser.parse(datedesc.strip(), dayfirst=True) + + matchObj = re.match( r'(\d+)/(\d+)\s*-\s*(\d+)$', docid, re.M|re.I) + if matchObj: + caseyear = matchObj.group(1) + caseseqnr = matchObj.group(2) + casedocseq = matchObj.group(3) + caseyear = expand_year(caseyear) + caseid = str(caseyear) + "/" + str(caseseqnr) + else: + print "error: invalid Arkivsaksnr: " + docid + matchObj = re.match( r'(\d+)/(\d+)\s*-', docid, re.M|re.I) + if matchObj: + caseyear = expand_year(matchObj.group(1)) + caseseqnr = matchObj.group(2) + caseid = str(caseyear) + "/" + str(caseseqnr) + + if parser.is_sender_doctype(doctype): + fratilfield = 'sender' + elif parser.is_recipient_doctype(doctype): + fratilfield = 'recipient' + + data = { + 'agency' : agency, + 'docdate' : docdate.date(), + 'recorddate' : journal_date.date(), + 'docdesc' : docdesc, + 'casedesc' : casedesc, + 'caseid' : caseid, + 'docid' : docid, + + 'caseyear' : caseyear, + 'caseseqnr' : caseseqnr, + 'casedocseq' : casedocseq, + + fratilfield : fratil_agency, + 'doctype' : doctype, + + 'saksbehandler' : saksb, + + 'exemption' : exemption, + + 'scrapedurl' : url, + 'scrapestamputc' : datetime.datetime.now() + } + parser.verify_entry(data) + datastore.append(data) + scraperwiki.sqlite.save(unique_keys=['docid'], data=datastore) + num_saved += len(datastore) + datastore = [] + #print "Saved %s" % data['caseid'] + else: + # TODO: log error or exit? + msg = "Tried to scrape %s but got %s" % (date, journal_date.date()) + #raise ValueError(msg) + print msg + + return num_saved + +def scrape_date(parser, date): + url = base_url % (strftime("%d%m%y", date.timetuple())) + html = fetch_url(url) + if html: + return save_date(parser, date, url, html) + +base_url = 'http://www.ssb.no/omssb/journal/OJ%s.htm' +end_date = date.today() + +#print res + +start_date_obj = scraperwiki.sqlite.get_var('last_finished_date') + +if start_date_obj: + start_date = pickle.loads(start_date_obj) +else: + start_date = datetime.date(2011, 1, 3) + +print "Start date %s" % start_date + +parser = postlistelib.JournalParser(agency=agency) + +for single_date in daterange(start_date, end_date): + if single_date.weekday() < 5: + num_saved = scrape_date(parser, single_date) + print "Scraped %s found %s" % (single_date, num_saved) + if num_saved > 0: + scraperwiki.sqlite.save_var('last_finished_date', pickle.dumps(single_date)) + + if num_saved == None: + print "No more new. Exit..." 
+ break diff --git a/scrapersources/postliste-arendal b/scrapersources/postliste-arendal new file mode 100644 index 0000000..5960033 --- /dev/null +++ b/scrapersources/postliste-arendal @@ -0,0 +1,188 @@ +import scraperwiki + +import json +import httplib, urllib +import datetime +import dateutil.parser +import time +import re + +agency = "Arendal kommune" +urlhost = "www.arendal.kommune.no" + +fieldmap = { + 'AntallVedlegg' : '', + 'Arkivdel' : '', + 'AvsenderMottaker' : 'sender', # or recipient + 'Dokumentdato' : 'docdate', + 'Dokumentnummer' : 'casedocseq', + 'Dokumenttype' : 'doctype', + 'EkspedertDato' : '', + 'Hjemmel' : 'exemption', + 'Id' : 'id', + 'Innholdsbeskrivelse' : 'docdesc', + 'Mappetype' : '', + 'Offentlig' : 'ispublic', + 'PostlisteType' : 'doctype', + 'RegistrertDato' : 'recorddate', + 'SaksId' : '', + 'SaksNr' : 'caseid', + 'Sakstittel' : 'casedesc', + #'SaksNr' : 'SA.SAAR + SA.SEKNR', + 'Saksansvarlig' : 'saksbehandler', + 'SaksansvarligEnhet' : '', + 'SaksansvarligEpost' : '', + +# 'scrapestamputc' : '', +# 'scrapedurl' : '', +# 'agency' : '', +} + + +# Convert "/Date(1317808020000+0200)/" to a datetime object +# FIXME Currently ignore the timezone information +def parse_datestr(str): + match = re.split("[/()+]", str) +# print match + sinceepoch = float(match[2]) / 1000 + if match[3] == '0200': + sinceepoch = sinceepoch + 2 * 60 * 60 + if match[3] == '0100': + sinceepoch = sinceepoch + 1 * 60 * 60 +# print sinceepoch + date = datetime.datetime.fromtimestamp(sinceepoch) +# print date + return date + +def reformat_caseid(caseid): + # Input 12/13123, output 2012, 13123, "2012/13123" + year, seqnr = caseid.split("/") + year = int(year) + if year < 100: + year = year + 2000 + caseid = "%d/%s" % (year, seqnr) + return year, int(seqnr), caseid + +def ws_post(url, urlhost, urlpath, params): + jsonparams = json.dumps(params) + headers = {"Content-type": "application/json; charset=utf-8", + "Accept": "application/json"} + conn = httplib.HTTPConnection(urlhost) + #print jsonparams + conn.request("POST", urlpath, jsonparams, headers) + response = conn.getresponse() + #print response.status, response.reason + jsonres = response.read() + res = json.loads(jsonres) + #print res + return res + +def fetch_journal_entry(id): + params = { "id" : str(id)} + headers = {"Content-type": "application/json; charset=utf-8", + "Accept": "application/json"} + urlpath = "/Templates/eDemokrati/Services/eDemokratiService.svc/GetPostlisteObjekt" + data = ws_post(None, urlhost, urlpath, params)['d'] + entry = None + if data: + del data['__type'] # This is useless, ignore + print data + entry = {} + entry['agency'] = agency + entry['scrapestamputc'] = datetime.datetime.now() + entry['scrapedurl'] = "http://" + urlhost + urlpath +# entry['scrapedurl'] = url + for dfield in fieldmap.keys(): + if dfield in data and data[dfield]: + if dfield in fieldmap and fieldmap[dfield] != "": + fieldname = fieldmap[dfield] + else: + fieldname = dfield + if 'sender' == fieldname: + if data['Dokumenttype'] == 'U': + fieldname = 'recipient' + if dfield in ['RegistrertDato', 'Dokumentdato', 'EkspedertDato']: + entry[fieldname] = parse_datestr(data[dfield]).date() + else: + entry[fieldname] = data[dfield] + else: + entry[dfield] = data[dfield] + entry['caseyear'], entry['caseseqnr'], entry['caseid'] = reformat_caseid(entry['caseid']) +# data["sourceurl"] = "http://" + server + path + print entry + return entry + +def epoctime_to_datestr(epoctime): + return "/Date("+str(int(epoctime * 1000) )+")/" + +def get_last_entry_id(): 
+ now = time.time() + # Get the last week, as the most recent entry should be in this range + fradato = epoctime_to_datestr(now - 7 * 24 * 60 * 60) + tildato = epoctime_to_datestr(now) + #print fradato + + maxid = 0 + + urlpath = "/Templates/eDemokrati/Services/eDemokratiService.svc/GetPostlisteArkivdeler" + params = { + "dato": fradato, + "tilDato": tildato, + "søkestreng":""} + arkivdeler = ws_post(None, urlhost, urlpath, params)['d'] + # {u'd': [u'_', u'HVA-IFE-A', u'KAR-BR-A', u'KAR-BRUK-A', u'KAR-EIEN-A', u'KAR-ELBH-A', u'KAR-ELS-A', ... + + urlpath = "/Templates/eDemokrati/Services/eDemokratiService.svc/GetPostlisteDokumenttyper" + for arkivdel in arkivdeler[0]: + params = { + "dato":fradato, + "tilDato":tildato, + "søkestreng":"", + "arkivdel":arkivdel, + } + doctypes = ws_post(None, urlhost, urlpath, params)['d'] + #{"d":["I","N","S","U","X"]} + urlpath = "/Templates/eDemokrati/Services/eDemokratiService.svc/GetPostlisteS%C3%B8k" + for doctype in doctypes: + params = { + "fraDato":fradato, + "tilDato":tildato, + "søkestreng":"", + "arkivdel":arkivdel, + "dokumenttype":doctype, + } + entries = ws_post(None, urlhost, urlpath, params)['d'] + for entry in entries: + #print entry['Id'] + id = int(entry['Id']) + if id > maxid: + maxid = id +# data = fetch_journal_entry(entry['Id']) +# if data: +# scraperwiki.sqlite.save(unique_keys=['id'], data=data) + return maxid + +#{"d":[{"__type":"PostlisteObjekt:#SSP.NoarkServices","AntallVedlegg":1,"Dokumentnummer":2,"Dokumenttype":"I","EkspedertDato":null,"Hjemmel":null,"Id":1507868,"Innholdsbeskrivelse":"Tomtejustering - Lillebæk, eiendom 208\/1611","Offentlig":true,"RegistrertDato":"\/Date(1339538400000+0200)\/","SaksId":296971,"SaksNr":"12\/8658","Arkivdel":"KAR-EIEN-A","AvsenderMottaker":"Randi Wilberg","Dokumentdato":"\/Date(1339624800000+0200)\/","Mappetype":"DS","PostlisteType":"I","Saksansvarlig":null,"SaksansvarligEnhet":null,"SaksansvarligEpost":null,"Sakstittel":null},{"__type":"PostlisteObjekt:#SSP.NoarkServices","AntallVedlegg":4,"Dokumentnummer":1,"Dokumenttype":"I","EkspedertDato":null,"Hjemmel":null,"Id":1507865,"Innholdsbeskrivelse":"Søknkad om utvidelse av balkong - Kalleraveien 14","Offentlig":true,"RegistrertDato":"\/Date(1339538400000+0200)\/","SaksId":298804,"SaksNr":"12\/10480","Arkivdel":"KAR-EIEN-A","AvsenderMottaker":"Ole Henning Løken","Dokumentdato":"\/Date(1338847200000+0200)\/","Mappetype":"BS","PostlisteType":"I","Saksansvarlig":null,"SaksansvarligEnhet":null,"SaksansvarligEpost":null,"Sakstittel":null},... 
+ +def get_journal_enries_range(min, max, step): + for id in range(min, max, step): + data = fetch_journal_entry(id) + #print data + if data: + scraperwiki.sqlite.save(unique_keys=['id'], data=data) + +maxid = get_last_entry_id() +print "max id =", maxid +try: + start = scraperwiki.sqlite.select("max(id) as max from swdata")[0]['max'] + 1 +except: + start = 137459 +print start, maxid +#if maxid > start + 20: +# maxid = start + 10 +get_journal_enries_range(start, maxid, 1) + +start = scraperwiki.sqlite.select("min(id) as min from swdata")[0]['min'] - 1 +end = start - 1000 +print start, end +get_journal_enries_range(start, end, -1) diff --git a/scrapersources/postliste-ballangen b/scrapersources/postliste-ballangen new file mode 100644 index 0000000..89e981f --- /dev/null +++ b/scrapersources/postliste-ballangen @@ -0,0 +1,276 @@ +import scraperwiki +import urllib2 +import lxml.html +import re +import dateutil.parser +from collections import deque +import datetime +from dateutil.relativedelta import relativedelta + +scraperwiki.scrape("http://www.ballangen.kommune.no/artikler/postlister") +postlistelib=scraperwiki.swimport('postliste-python-lib') + +# <!-- $BeginBlock postjournal_liste --> +# <tr> +# <td class="CommonBold"> +# SÃ<98>KER KULTURMIDLER FOR BALLANGEN FRIIDRETT +# </td> +# </tr> +# <tr> +# <td> +# </td> +# </tr> +# <tr> +# <td> +# <b>Sakstittel: </b>KULTURMIDLER 2012 +# +# </td> +# </tr> +# <tr> +# <td> +# </td> +# </tr> +# <tr> +# <td> +# <b>Arkivsaksnr.: </b>12/00093 - 032 I <b>Løpenr.:</b +# >002255/12 +# </td> +# </tr> +# <tr> +# <td><b>Fra/Til: </b>Eirin Sørslett +# </td> +# </tr> +# <tr> +# <td><b>Saksbehandler: </b> +# Oddbjørn Dalsbø +# (RÃ<85>D/KVO) +# </td> +# </tr> +# <tr> +# <td><b>Datert: </b> 02.04.2012</td> +# </tr> +# <tr> +# <td style="padding-bottom: 15px;"> +# <img src="/icons/vwsent.gif" border="0" align="top" alt="Ikon" /> +# <a href="mailto:post@ballangen.kommune.no?subject=Bestill postjournal med Ark +# ivsaksnr 12/00093 - 032 I og løpenr 002255/12">Bestill journal</a> +# </td> +# </tr> + +def saver(unique_keys, data): +# return + #print "Not saving data" + scraperwiki.sqlite.save(unique_keys, data) + +def expand_year(year): + year = int(year) + if year > 50: + year = year + 1900 + else: + year = year + 2000 + return year + +def fetch_postjournal_day(parser, url, html, saver): + root = lxml.html.fromstring(html) + + listdate = dateutil.parser.parse(root.cssselect("h2")[0].text_content().replace("Postlister for ",""), dayfirst=True) + print listdate.date() + + entries = [] + for tr in root.cssselect("table.ui-corner-all tr"): + tds = tr.cssselect("td") + line = tds[0].text_content() + entries.append(line) + +# 9 or 12 lines per entry + queue = deque(entries) + datastore = [] + while queue: + docdesc = (queue.popleft() + queue.popleft()).strip() + + casedesc = (queue.popleft() + queue.popleft()).replace("Sakstittel:", "").strip() + + ref = queue.popleft().strip() + arkivsaksref = re.sub(r"L.penr.:.+$", "", ref).replace("Arkivsaksnr.:","").strip() + + caseyear = 0 + caseseqnr = 0 + casedocseq = 0 + doctype = '?' 
+ caseid = 'unknown' + matchObj = re.match( r'(\d+)/(\d+)\s*-\s*(\d+) (.+)$', arkivsaksref, re.M|re.I) + if matchObj: + caseyear = matchObj.group(1) + caseseqnr = matchObj.group(2) + casedocseq = matchObj.group(3) + doctype = matchObj.group(4) + caseyear = expand_year(caseyear) + caseid = str(caseyear) + "/" + str(caseseqnr) + else: + print "error: invalid Arkivsaksnr: " + arkivsaksref + matchObj = re.match( r'(\d+)/(\d+)\s*-', arkivsaksref, re.M|re.I) + if matchObj: + caseyear = expand_year(matchObj.group(1)) + caseseqnr = matchObj.group(2) + caseid = str(caseyear) + "/" + str(caseseqnr) + + laapenr = re.sub(r"^.+L.penr.:", "", ref) + journalseqnr = 0 + journalyear = 0 + journalid = 'unknown' + if -1 != laapenr.find('/') and "/" != laapenr: # Avoid broken/empty values + journalseqnr, journalyear = laapenr.split("/") + journalyear = expand_year(journalyear) + journalid = str(journalyear) + "/" + str(journalseqnr) + else: + print u"error: invalid Løpenr: " + laapenr + + if not parser.is_valid_doctype(doctype): + doctype = { + 'S' : 'N', + 'PLN' : 'N', + 'Z' : 'N', + }[doctype] + + fratil = queue.popleft().replace("Fra/Til:", "").strip() + if parser.is_sender_doctype(doctype): + fratilfield = 'sender' + elif parser.is_recipient_doctype(doctype): + fratilfield = 'recipient' + + saksbehandler = queue.popleft().replace("Saksbehandler:","").strip() + saksansvarlig, bar = saksbehandler.split(" (") + saksansvarligenhet, foo = bar.split(")") + #print saksansvarligenhet + + recorddate = dateutil.parser.parse(queue.popleft().replace("Datert:","").strip(), dayfirst=True) + + requesturl = queue.popleft().strip() + + exemption = "" + if -1 != requesturl.find("Gradering"): + exemption = requesturl.replace("Gradering:", "").strip() + requesturl = queue.popleft() + fratil = "" + + data = { + 'agency' : parser.agency, + 'recorddate' : recorddate.date(), + 'docdesc' : docdesc, + 'casedesc' : casedesc, + + 'caseyear' : int(caseyear), + 'caseseqnr' : int(caseseqnr), + 'casedocseq' : int(casedocseq), + 'caseid' : caseid, + 'doctype' : doctype, + + 'journalseqnr' : int(journalseqnr), + 'journalyear' : int(journalyear), + 'journalid' : journalid, + fratilfield : fratil, + + 'saksbehandler' : saksbehandler, + 'saksansvarlig' : saksansvarlig.strip(), + 'saksansvarligenhet' : saksansvarligenhet.strip(), + + 'arkivsaksref' : arkivsaksref, + 'laapenr' : laapenr, + 'exemption' : exemption, + + 'scrapedurl' : url, + 'scrapestamputc' : datetime.datetime.now() + } + +# print data + parser.verify_entry(data) + datastore.append(data) + saver(unique_keys=['arkivsaksref'], data=datastore) + +def fetch_postjournal_monthlist(baseurl, html): + root = lxml.html.fromstring(html) + subset = root.cssselect("div table") + urls = subset[0].cssselect("td a") + urllist = [] + for ahref in urls: + href = ahref.attrib['href'] + if -1 != href.find("day="): +# print href + urllist.append(baseurl + href) + return urllist + +# http://www.offentlighet.no/ + +agency = "Ballangen kommune" +baseurl = "http://www.ballangen.kommune.no" + +monthurls = [] + +def addyear(monthurls, year): + for m in [12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1]: + monthstr = "%02d%d" % (m, year) + url = "http://www.ballangen.kommune.no/artikler/postlister?month=" + monthstr + monthurls.append(url) + +#addyear(monthurls, 2003) +#addyear(monthurls, 2004) # Consistency problems in http://www.ballangen.kommune.no/artikler/postlister?month=012004&day=06 (bad Arkivsaksnr. and lacking Løpenr.) 
+ +#addyear(monthurls, 2005) +#addyear(monthurls, 2006) +#addyear(monthurls, 2007) +#addyear(monthurls, 2008) +#addyear(monthurls, 2009) +#addyear(monthurls, 2010) +#addyear(monthurls, 2011) +#addyear(monthurls, 2012) + +parsemonths = 2 + +today = datetime.date.today() +i = 1 +while i <= parsemonths: + i = i + 1 +# parsemonths = parsemonths - 1 + monthtoparse = today + relativedelta(months=parsemonths - i) + monthstr = monthtoparse.strftime("%m%Y") + url = "http://www.ballangen.kommune.no/artikler/postlister?month=" + monthstr + monthurls.append(url) + +#url = "http://www.ballangen.kommune.no/artikler/postlister?month=032012&day=19" + +def reload_error_entries(): + monthurls = [] + problems = scraperwiki.sqlite.select("distinct scrapedurl from swdata where caseid = 'unknown'") + for n in problems: + monthurls.append(n['scrapedurl']) + +print "Fetching public journal!" + +parser = postlistelib.JournalParser(agency=agency) + +urllist = [] + +def fetch_url(url): + html = None + for n in [1, 2, 3]: + try: + html = scraperwiki.scrape(url) + break + except urllib2.URLError, e: + print "URLError fetching " + url + ", trying again" + return html + +for monthurl in monthurls: + print "Fetching month list from " + monthurl + html = fetch_url(monthurl) + urllist.extend(fetch_postjournal_monthlist(baseurl = baseurl, html = html)) + +for dayurl in urllist: + res = scraperwiki.sqlite.select("scrapedurl from swdata where scrapedurl = '"+dayurl+"' and scrapestamputc > '2012-06-23T15:12:40' limit 1") + if 0 < len(res): + continue + print "Fetching from " + dayurl + html = fetch_url(dayurl) +# print html + fetch_postjournal_day(parser=parser, url=dayurl, html=html, saver=saver) + diff --git a/scrapersources/postliste-ballangen-view b/scrapersources/postliste-ballangen-view new file mode 100644 index 0000000..73d96b4 --- /dev/null +++ b/scrapersources/postliste-ballangen-view @@ -0,0 +1,124 @@ +import scraperwiki +import cgi, os +import re + +paramdict = dict(cgi.parse_qsl(os.getenv("QUERY_STRING", ""))) +#print paramdict + +if 'source' in paramdict: + sourcescraper = paramdict['source'] +else: + sourcescraper = 'postliste-ballangen' + +scraperwiki.sqlite.attach(sourcescraper) + +def htc(m): + return chr(int(m.group(1),16)) + +def urldecode(url): + rex=re.compile('%([0-9a-hA-H][0-9a-hA-H])',re.M) + return rex.sub(htc,url) + +def table_saksbehandler(): + data = scraperwiki.sqlite.select( + '''saksbehandler,count(*) as antall from swdata group by saksbehandler order by antall desc''' + ) + # print data + + print "<table>" + print "<tr><th>Saksbehandler</th><th>Saker</th>" + for d in data: + print "<tr>" + print "<td>", d["saksbehandler"], "</td>" + print "<td>", d["antall"], "</td>" + print "</tr>" + print "</table>" + +# {'datert': datetime.date(2012, 1, 6), 'arkivsaksref': u'12/00008 - 008 U', 'tittel': u'INNKALLING TIL DR\xd8FTELSESM\xd8TE - 13.01.12', 'sakstittel': u'BEMANNINGSSITUASJON ETTER BUDSJETTVEDTAK 2012', 'laapenr': u'000183/12', 'kommune': 'Ballangen kommune', 'saksbehandler': u'Svenn Ole Wiik\n (R\xc5D/)', 'listdate': datetime.date(2012, 1, 6), 'gradering': '', 'fratil': u'Anne J\xf8rgensen'} + +sql = "select * from swdata" +where = "" +args = [] +if "caseid" in paramdict: + where = where + ' caseid = ?' + args.append(paramdict["caseid"]) +if "agency" in paramdict: + where = where + ' agency = ?' + args.append(urldecode(paramdict["agency"])) +if "saksansvarlig" in paramdict: + where = where + ' saksansvarlig = ?' 
+ saksansvarlig = urldecode(paramdict["saksansvarlig"]) + print "S: '" + saksansvarlig + "'" + args.append(urldecode(paramdict["saksansvarlig"])) +if "fratil" in paramdict: + where = where + ' sender = ? or recipient = ?' + fratil = urldecode(paramdict["fratil"]) + args.extend([fratil, fratil]) +if "q" in paramdict: + q = urldecode(paramdict["q"]) + qlike = '%' + q + '%' + where = where + ' docdesc like ? or casedesc like ? or sender like ? or recipient like ?' + args.extend([qlike, qlike, qlike, qlike]) +if where: + sql = sql + ' where ' + where +sql = sql + " order by recorddate desc, casedocseq limit 200" +print sql +data = scraperwiki.sqlite.execute(sql, args) +#print data + +print "<p>Søk i tittel, sakstittel, fra/til.</p>" +print "<p><form>Enter search term: " +print "<input name='q' length='60'>" +print "<input name='source' type='hidden' value='" + sourcescraper + "'>" +print "<INPUT type=\"submit\" value=\"Search\"> <INPUT type=\"reset\">" +print "</form></p>" +print "<table>" + +#print data + +i = 0 +key = {} +print "<tr>" +while i < len(data['keys']): + colname = data['keys'][i] + key[colname] = i + if colname in ["scrapedurl", "caseid", "scrapestamputc"]: + True # Skip, see below + else: + print "<th>" + colname + "</th>" + i = i + 1 +print "</tr>" + +#print data +for d in data['data']: + print "<tr>" + i = 0 + while i < len(data['keys']): + colname = data['keys'][i] + value = d[key[colname]] + if value is None: + value = "" + if "docdesc" == colname: + if 'scrapedurl' in key: + scrapedurl = d[key['scrapedurl']] + print "<td><a href='" + scrapedurl + "'>", value, "</a></td>" + else: + print "<td>", value, "</td>" + elif "saksansvarlig" == colname: + saksansvarlig = d[key['saksansvarlig']] + print "<td><a href='?saksansvarlig=" + saksansvarlig + "'>", value, "</a></td>" + elif "casedesc" == colname: + caseid = d[key['caseid']] + print "<td><a href='?caseid=" + caseid + "&source=" + sourcescraper + "'>", value, "</a></td>" + elif "sender" == colname or "recipient" == colname: + if "" != value: + print "<td><a href='?fratil=" + value + "&source=" + sourcescraper + "'>", value, "</a></td>" + else: + print "<td></td>" + elif colname in ["scrapedurl", "caseid", "scrapestamputc"]: + True # Skip these, as they are included as links + else: + print "<td>", value, "</td>" + i = i + 1 + print "</tr>" +print "</table>" diff --git a/scrapersources/postliste-bioforsk b/scrapersources/postliste-bioforsk new file mode 100644 index 0000000..b41b30f --- /dev/null +++ b/scrapersources/postliste-bioforsk @@ -0,0 +1,81 @@ +# -*- coding: UTF-8 -*- + +import scraperwiki +import json +from BeautifulSoup import BeautifulSoup +import datetime +import dateutil.parser +import lxml.html +import resource +import sys +import urlparse +import re +lazycache=scraperwiki.swimport('lazycache') +postlistelib=scraperwiki.swimport('postliste-python-lib') + +agency = 'Bioforsk AS' + +def report_errors(errors): + if 0 < len(errors): + print "Errors:" + for e in errors: + print e + exit(1) +def out_of_cpu(arg, spent, hard, soft): + report_errors(arg) + +def process_pdf(parser, pdfurl, errors): + errors = [] + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + try: + pdfcontent = scraperwiki.scrape(pdfurl) + parser.preprocess(pdfurl, pdfcontent) + pdfcontent = None +# except ValueError, e: +# errors.append(e) + except IndexError, e: + errors.append(e) + +def process_page_queue(parser, errors): + try: + parser.process_pages() + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + except 
scraperwiki.CPUTimeExceededError, e: + errors.append("Processing pages interrupted") + +def process_journal_pdfs(parser, listurl, errors): +# print "Finding PDFs on " + listurl +# u = urllib.parse.urlparse(listurl) + html = scraperwiki.scrape(listurl) + root = lxml.html.fromstring(html) + html = None + for ahref in root.cssselect("table a"): + href = ahref.attrib['href'] + url = urlparse.urljoin(listurl, href) + if -1 != href.find("file://") or -1 == url.find(".pdf"): +# print "Skipping non-http URL " + url + continue + if parser.is_already_scraped(url): + True +# print "Skipping already scraped " + url + else: +# print "Will process " + url + process_pdf(parser, url, errors) + +def test_small_pdfs(parser): + # Test with some smaller PDFs + errors = [] + process_pdf(parser, "http://www.bioforsk.no/ikbViewer/Content/97492/off_journal_uke17%202012.pdf", errors) + process_page_queue(parser, errors) + report_errors(errors) + exit(0) + +errors = [] +parser = postlistelib.PDFJournalParser(agency=agency) + +#test_small_pdfs(parser) + +process_journal_pdfs(parser, "http://www.bioforsk.no/ikbViewer/page/bioforsk/presse?p_dimension_id=21903", errors) +process_page_queue(parser, errors) +report_errors(errors) + diff --git a/scrapersources/postliste-difi b/scrapersources/postliste-difi new file mode 100644 index 0000000..dfc986f --- /dev/null +++ b/scrapersources/postliste-difi @@ -0,0 +1,88 @@ +# -*- coding: UTF-8 -*- + +import scraperwiki +import json +from BeautifulSoup import BeautifulSoup +import datetime +import dateutil.parser +import lxml.html +import resource +import sys +import urlparse +import re + +frontpage = "http://www.difi.no/om-difi/offentleg-postjournal-for-difi" + +scraperwiki.scrape(frontpage) + +lazycache=scraperwiki.swimport('lazycache') +postlistelib=scraperwiki.swimport('postliste-python-lib') + +agency = 'Direktoratet for forvaltning og IKT' + +def report_errors(errors): + if 0 < len(errors): + print "Errors:" + for e in errors: + print e + exit(1) +def out_of_cpu(arg, spent, hard, soft): + report_errors(arg) + +def process_pdf(parser, pdfurl, errors): + errors = [] + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + try: + pdfcontent = scraperwiki.scrape(pdfurl) + parser.preprocess(pdfurl, pdfcontent) + pdfcontent = None +# except ValueError, e: +# errors.append(e) + except IndexError, e: + errors.append(e) + +def process_page_queue(parser, errors): + try: + parser.process_pages() + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + except scraperwiki.CPUTimeExceededError, e: + errors.append("Processing pages interrupted") + +def process_journal_pdfs(parser, listurl, errors): +# print "Finding PDFs on " + listurl +# u = urllib.parse.urlparse(listurl) + html = scraperwiki.scrape(listurl) + root = lxml.html.fromstring(html) + html = None + for ahref in root.cssselect("div.body a"): + href = ahref.attrib['href'] + url = urlparse.urljoin(listurl, href) + if -1 != href.find("file://") or -1 == url.find(".pdf"): +# print "Skipping non-http URL " + url + continue + if parser.is_already_scraped(url): + True +# print "Skipping already scraped " + url + else: +# print "Will process " + url + process_pdf(parser, url, errors) + +def test_small_pdfs(parser): + # Test with some smaller PDFs + errors = [] + process_pdf(parser, "http://www.difi.no/filearchive/uke-2-offentlig-journal.pdf", errors) + process_pdf(parser, "http://www.difi.no/filearchive/uke-1-offentlig-journal.pdf", errors) + process_page_queue(parser, errors) + report_errors(errors) + exit(0) + +errors = [] 
+parser = postlistelib.PDFJournalParser(agency=agency) + +#test_small_pdfs(parser) + +process_page_queue(parser, errors) +process_journal_pdfs(parser, frontpage, errors) +process_page_queue(parser, errors) +report_errors(errors) + diff --git a/scrapersources/postliste-fredrikstad b/scrapersources/postliste-fredrikstad new file mode 100644 index 0000000..7fb5a13 --- /dev/null +++ b/scrapersources/postliste-fredrikstad @@ -0,0 +1,196 @@ +import scraperwiki + +import json +import httplib, urllib +import datetime +import dateutil.parser +import time +import re + +agency = "Fredrikstad kommune" +urlhost = "www.fredrikstad.kommune.no" + +fieldmap = { + 'AntallVedlegg' : '', + 'Arkivdel' : '', + 'AvsenderMottaker' : 'sender', # or recipient + 'Dokumentdato' : 'docdate', + 'Dokumentnummer' : 'casedocseq', + 'Dokumenttype' : 'doctype', + 'EkspedertDato' : '', + 'Hjemmel' : 'exemption', + 'Id' : 'id', + 'Innholdsbeskrivelse' : 'docdesc', + 'Mappetype' : '', + 'Offentlig' : 'ispublic', + 'PostlisteType' : 'doctype', + 'RegistrertDato' : 'recorddate', + 'SaksId' : '', + 'SaksNr' : 'caseid', + 'Sakstittel' : 'casedesc', + #'SaksNr' : 'SA.SAAR + SA.SEKNR', + 'Saksansvarlig' : 'saksbehandler', + 'SaksansvarligEnhet' : '', + 'SaksansvarligEpost' : '', + +# 'scrapestamputc' : '', +# 'scrapedurl' : '', +# 'agency' : '', +} + + +# Convert "/Date(1317808020000+0200)/" to a datetime object +# FIXME Currently ignore the timezone information +def parse_datestr(str): + match = re.split("[/()+]", str) +# print match + sinceepoch = float(match[2]) / 1000 + if match[3] == '0200': + sinceepoch = sinceepoch + 2 * 60 * 60 + if match[3] == '0100': + sinceepoch = sinceepoch + 1 * 60 * 60 +# print sinceepoch + date = datetime.datetime.fromtimestamp(sinceepoch) +# print date + return date + +def reformat_caseid(caseid): + # Input 12/13123, output 2012, 13123, "2012/13123" + year, seqnr = caseid.split("/") + year = int(year) + if year < 100: + year = year + 2000 + caseid = "%d/%s" % (year, seqnr) + return year, int(seqnr), caseid + +def ws_post(url, urlhost, urlpath, params): + jsonparams = json.dumps(params) + headers = {"Content-type": "application/json; charset=utf-8", + "Accept": "application/json"} + conn = httplib.HTTPConnection(urlhost) + #print jsonparams + conn.request("POST", urlpath, jsonparams, headers) + response = conn.getresponse() + #print response.status, response.reason + jsonres = response.read() + res = json.loads(jsonres) + #print res + return res + +def fetch_journal_entry(id): + params = { "id" : str(id)} + headers = {"Content-type": "application/json; charset=utf-8", + "Accept": "application/json"} + urlpath = "/Templates/eDemokrati/Services/eDemokratiService.svc/GetPostlisteObjekt" + data = ws_post(None, urlhost, urlpath, params)['d'] + entry = None + if data: + del data['__type'] # This is useless, ignore + entry = {} + entry['agency'] = agency + entry['scrapestamputc'] = datetime.datetime.now() + entry['scrapedurl'] = "http://" + urlhost + urlpath +# entry['scrapedurl'] = url + for dfield in fieldmap.keys(): + if dfield in data and data[dfield]: + if dfield in fieldmap and fieldmap[dfield] != "": + fieldname = fieldmap[dfield] + else: + fieldname = dfield + if 'sender' == fieldname: + if data['Dokumenttype'] == 'U': + fieldname = 'recipient' + if dfield in ['RegistrertDato', 'Dokumentdato', 'EkspedertDato']: + entry[fieldname] = parse_datestr(data[dfield]).date() + else: + entry[fieldname] = data[dfield] + else: + entry[dfield] = data[dfield] + entry['caseyear'], entry['caseseqnr'], 
entry['caseid'] = reformat_caseid(entry['caseid']) +# data["sourceurl"] = "http://" + server + path + #print entry + return entry + +def epoctime_to_datestr(epoctime): + return "/Date("+str(int(epoctime * 1000) )+")/" + +def get_last_entry_id(): + now = time.time() + # Get the last week, as the most recent entry should be in this range + fradato = epoctime_to_datestr(now - 7 * 24 * 60 * 60) + tildato = epoctime_to_datestr(now) + #print fradato + + maxid = 0 + + urlpath = "/Templates/eDemokrati/Services/eDemokratiService.svc/GetPostlisteArkivdeler" + params = { + "dato": fradato, + "tilDato": tildato, + "søkestreng":""} + arkivdeler = ws_post(None, urlhost, urlpath, params)['d'] + # {u'd': [u'_', u'HVA-IFE-A', u'KAR-BR-A', u'KAR-BRUK-A', u'KAR-EIEN-A', u'KAR-ELBH-A', u'KAR-ELS-A', ... + + urlpath = "/Templates/eDemokrati/Services/eDemokratiService.svc/GetPostlisteDokumenttyper" + for arkivdel in arkivdeler[0]: + params = { + "dato":fradato, + "tilDato":tildato, + "søkestreng":"", + "arkivdel":arkivdel, + } + doctypes = ws_post(None, urlhost, urlpath, params)['d'] + #{"d":["I","N","S","U","X"]} + urlpath = "/Templates/eDemokrati/Services/eDemokratiService.svc/GetPostlisteS%C3%B8k" + for doctype in doctypes: + params = { + "fraDato":fradato, + "tilDato":tildato, + "søkestreng":"", + "arkivdel":arkivdel, + "dokumenttype":doctype, + } + entries = ws_post(None, urlhost, urlpath, params)['d'] + for entry in entries: + #print entry + #exit(0) + #print entry['Id'] + id = int(entry['Id']) + if id > maxid: + maxid = id +# data = fetch_journal_entry(entry['Id']) +# if data: +# scraperwiki.sqlite.save(unique_keys=['id'], data=data) + return maxid + +#{"d":[{"__type":"PostlisteObjekt:#SSP.NoarkServices","AntallVedlegg":1,"Dokumentnummer":2,"Dokumenttype":"I","EkspedertDato":null,"Hjemmel":null,"Id":1507868,"Innholdsbeskrivelse":"Tomtejustering - Lillebæk, eiendom 208\/1611","Offentlig":true,"RegistrertDato":"\/Date(1339538400000+0200)\/","SaksId":296971,"SaksNr":"12\/8658","Arkivdel":"KAR-EIEN-A","AvsenderMottaker":"Randi Wilberg","Dokumentdato":"\/Date(1339624800000+0200)\/","Mappetype":"DS","PostlisteType":"I","Saksansvarlig":null,"SaksansvarligEnhet":null,"SaksansvarligEpost":null,"Sakstittel":null},{"__type":"PostlisteObjekt:#SSP.NoarkServices","AntallVedlegg":4,"Dokumentnummer":1,"Dokumenttype":"I","EkspedertDato":null,"Hjemmel":null,"Id":1507865,"Innholdsbeskrivelse":"Søknkad om utvidelse av balkong - Kalleraveien 14","Offentlig":true,"RegistrertDato":"\/Date(1339538400000+0200)\/","SaksId":298804,"SaksNr":"12\/10480","Arkivdel":"KAR-EIEN-A","AvsenderMottaker":"Ole Henning Løken","Dokumentdato":"\/Date(1338847200000+0200)\/","Mappetype":"BS","PostlisteType":"I","Saksansvarlig":null,"SaksansvarligEnhet":null,"SaksansvarligEpost":null,"Sakstittel":null},... 
+ +def get_journal_enries_range(min, max, step): + for id in range(min, max, step): + data = fetch_journal_entry(id) + #print data + if data: + scraperwiki.sqlite.save(unique_keys=['id'], data=data) + time.sleep(0.3) + +maxid = get_last_entry_id() +print "max id =", maxid +try: + start = scraperwiki.sqlite.select("max(id) as max from swdata")[0]['max'] + 1 +except: + start = 1094428 # 2010 + start = 1507868 # 2012 + +print start, maxid +#if maxid > start + 20: +# maxid = start + 10 +get_journal_enries_range(start, maxid + 1, 1) + +try: + minid = scraperwiki.sqlite.select("min(id) as min from swdata")[0]['min'] - 1 + start = minid +except: + True +end = start - 1000 +print start, end +get_journal_enries_range(start, end, -1) diff --git a/scrapersources/postliste-hadsel b/scrapersources/postliste-hadsel new file mode 100644 index 0000000..a175048 --- /dev/null +++ b/scrapersources/postliste-hadsel @@ -0,0 +1,108 @@ +# -*- coding: UTF-8 -*- + +import scraperwiki +from BeautifulSoup import BeautifulSoup +import datetime +import dateutil.parser +import lxml.html +import sys +import urlparse + +scraperwiki.scrape("http://www.hadsel.kommune.no/selvbetjeningskjema-kart-postjournal/offentlig-postjournal") + +lazycache=scraperwiki.swimport('lazycache') +postlistelib=scraperwiki.swimport('postliste-python-lib') + +agency = 'Hadsel kommune' + +def report_errors(errors): + if 0 < len(errors): + print "Errors:" + for e in errors: + print e + exit(1) +def no_cpu_left(arg, spent, soft, hard): + report_errors(arg) + +def process_pdf(parser, pdfurl, errors): + errors = [] + postlistelib.exit_if_no_cpu_left(0, callback=no_cpu_left, arg = errors) + try: + pdfcontent = lazycache.lazycache(pdfurl) + parser.preprocess(pdfurl, pdfcontent) +# except ValueError, e: +# errors.append(e) + except IndexError, e: + errors.append(e) + +def process_page_queue(parser, errors): + try: + parser.process_pages() + postlistelib.exit_if_no_cpu_left(0, callback=no_cpu_left, arg = errors) + except scraperwiki.CPUTimeExceededError, e: + errors.append("Processing pages interrupted") + +def consider_url(parser, url, errors): + if parser.is_already_scraped(url): + True +# print "Skipping already scraped " + url + else: +# print "Will process " + url + try: + process_pdf(parser, url, errors) + except: + pass + +def process_journal_pdfs(parser, listurl, errors, recurse): +# print "Finding PDFs on " + listurl +# u = urllib.parse.urlparse(listurl) + html = scraperwiki.scrape(listurl) + root = lxml.html.fromstring(html) + html = None + for ahref in root.cssselect("div.items a"): + url = urlparse.urljoin(listurl, ahref.attrib['href']) + if -1 == url.find("doc_download"): + continue + consider_url(parser, url, errors) + #print url + for ahref in root.cssselect("div.item-list a"): + suburl = urlparse.urljoin(listurl, ahref.attrib['href']) + #print "sub " + suburl + subhtml = scraperwiki.scrape(suburl) + subroot = lxml.html.fromstring(subhtml) + subhtml = None + for subahref in subroot.cssselect("div.article a"): + href = subahref.attrib['href'] + #print href + subsuburl = urlparse.urljoin(suburl, href) + #print "subsub " + subsuburl + if -1 == subsuburl.find("doc_download"): + continue + consider_url(parser, subsuburl, errors) + subroot = None + if recurse: + seen = { listurl : 1 } + for ahref in root.cssselect("div.pagination a"): + pageurl = urlparse.urljoin(listurl, ahref.attrib['href']) + #print "P: " + pageurl + if pageurl not in seen: + process_journal_pdfs(parser, pageurl, errors, False) + seen[pageurl] = 1 + +def 
test_parse_case_journal_ref(): + entry = {} + parse_case_journal_ref(entry, [u'2008/16414-', u'23', u'15060/2012'], "") + parse_case_journal_ref(entry, [u'2011/15972-1 102773/201', u'1'], "") + parse_case_journal_ref(entry, [u'2010/2593-2', u'103004/201', u'1'], "") + parse_case_journal_ref(entry, [u'2011/13415-', u'22', u'100077/201', u'1'], "") + exit(0) + +#test_parse_case_journal_ref() + +errors = [] +parser = postlistelib.PDFJournalParser(agency=agency) +process_page_queue(parser, errors) +process_journal_pdfs(parser, "http://www.hadsel.kommune.no/selvbetjeningskjema-kart-postjournal/offentlig-postjournal", errors, True) +process_page_queue(parser, errors) +report_errors(errors) + diff --git a/scrapersources/postliste-halden b/scrapersources/postliste-halden new file mode 100644 index 0000000..4b0ebd5 --- /dev/null +++ b/scrapersources/postliste-halden @@ -0,0 +1,93 @@ +# -*- coding: UTF-8 -*- + +import scraperwiki +import json +from BeautifulSoup import BeautifulSoup +import datetime +import dateutil.parser +import lxml.html +import resource +import sys +import urlparse +import urllib +import re +lazycache=scraperwiki.swimport('lazycache') +postlistelib=scraperwiki.swimport('postliste-python-lib') + +agency = 'Halden kommune' + +def report_errors(errors): + if 0 < len(errors): + print "Errors:" + for e in errors: + print e + exit(1) +def out_of_cpu(arg, spent, hard, soft): + report_errors(arg) + +def process_pdf(parser, pdfurl, errors): + errors = [] + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + try: + pdfcontent = scraperwiki.scrape(pdfurl) + parser.preprocess(pdfurl, pdfcontent) + pdfcontent = None +# except ValueError, e: +# errors.append(e) + except IndexError, e: + errors.append(e) + +def process_page_queue(parser, errors): + try: + if False: + scraperwiki.sqlite.execute("delete from swdata where scrapedurl in (select scrapedurl from unparsedpages)") + scraperwiki.sqlite.execute("delete from unparsedpages") + scraperwiki.sqlite.commit() + + parser.process_pages() + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + except scraperwiki.CPUTimeExceededError, e: + errors.append("Processing pages interrupted") + +def process_pdf_links_cssselect(parser, listurl, errors, cssselect): + html = scraperwiki.scrape(listurl) + root = lxml.html.fromstring(html) + html = None + for ahref in root.cssselect(cssselect + " a"): + href = ahref.attrib['href'] + url = urlparse.urljoin(listurl, href).replace(" ", "%20").replace(u"å", "%C3%A5") + #print url + if -1 != href.find("file://") or -1 != href.find("postliste/Documents/Brukerveiledning"): +# print "Skipping non-http URL " + url + continue + if -1 == href.find(".pdf"): + continue + if parser.is_already_scraped(url): + True +# print "Skipping already scraped " + url + else: +# print "Will process " + url + process_pdf(parser, url, errors) + +def process_journal_pdfs(parser, listurl, errors): + return process_pdf_links_cssselect(parser, listurl, errors, "div#page_centerElementZone") + +def test_small_pdfs(parser): + # Test with some smaller PDFs + errors = [] + process_pdf(parser, u"http://www.halden.kommune.no/selvbetjening/postliste/Documents/120601%20-%20120607%20Inng%C3%A5ende.pdf", errors) + process_pdf(parser, u"http://www.halden.kommune.no/selvbetjening/postliste/Documents/120601%20-%20120607%20Utg%C3%A5ende.pdf", errors) + process_page_queue(parser, errors) + report_errors(errors) + exit(0) + +errors = [] +parser = postlistelib.PDFJournalParser(agency=agency) +#parser.debug = True + +#test_small_pdfs(parser) 
+process_page_queue(parser, errors) +process_journal_pdfs(parser, u"http://www.halden.kommune.no/selvbetjening/postliste/Sider/Inng%C3%A5ende-postlister.aspx", errors) +process_journal_pdfs(parser, u"http://www.halden.kommune.no/selvbetjening/postliste/Sider/Utg%C3%A5ende-postliste-.aspx", errors) +process_page_queue(parser, errors) +report_errors(errors)
\ No newline at end of file diff --git a/scrapersources/postliste-hoegskolen-i-gjoevik b/scrapersources/postliste-hoegskolen-i-gjoevik new file mode 100644 index 0000000..fd197eb --- /dev/null +++ b/scrapersources/postliste-hoegskolen-i-gjoevik @@ -0,0 +1,104 @@ +# -*- coding: UTF-8 -*- +# Based on the scraper advanced-scraping-pdf +# See also +# https://views.scraperwiki.com/run/pdf-to-html-preview-1/?url=http%3A%2F%2Fwww.stortinget.no%2FGlobal%2Fpdf%2Fpostjournal%2Fpj-2012-05-09.pdf + +import scraperwiki +import json +from BeautifulSoup import BeautifulSoup +import datetime +import dateutil.parser +import lxml.html +import resource +import sys +import urlparse +import re +# +# something weird with 04.11.2010 +# +# +# +# Make sure Scraperwiki believe this is the source from this database +scraperwiki.scrape("http://www.hig.no/om_hig/offentleg_journal/2012") + +lazycache=scraperwiki.swimport('lazycache') +postlistelib=scraperwiki.swimport('postliste-python-lib') + +agency = 'Høgskolen i Gjøvik' + +def report_errors(errors): + if 0 < len(errors): + print "Errors:" + for e in errors: + print e + exit(1) +def out_of_cpu(arg, spent, hard, soft): + report_errors(arg) + +def process_pdf(parser, pdfurl, errors): + errors = [] + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + try: + pdfcontent = scraperwiki.scrape(pdfurl) + parser.preprocess(pdfurl, pdfcontent) + pdfcontent = None +# except ValueError, e: +# errors.append(e) + except IndexError, e: + errors.append(e) + +def process_page_queue(parser, errors): + try: + parser.process_pages() + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + except scraperwiki.CPUTimeExceededError, e: + errors.append("Processing pages interrupted") + +def process_journal_pdfs(parser, listurl, errors): + print "Finding PDFs on " + listurl +# u = urllib.parse.urlparse(listurl) + html = scraperwiki.scrape(listurl) + root = lxml.html.fromstring(html) + html = None + for ahref in root.cssselect("div.spalte-inner a"): + href = ahref.attrib['href'] + url = urlparse.urljoin(listurl, href).replace(" ", "+") + if -1 != href.find("file://") or -1 == url.find(".pdf"): +# print "Skipping non-http URL " + url + continue + if parser.is_already_scraped(url): + True +# print "Skipping already scraped " + url + else: +# print "Will process " + url + process_pdf(parser, url, errors) + +def test_small_pdfs(parser): + # Test with some smaller PDFs + errors = [] + if parser.is_already_scraped("http://www.hig.no/content/download/30119/360872/file/Offentlig+journal+04.11.2010.pdf"): + print "Skipping already scraped " + exit(1) + else: + print "Will process " + + #process_pdf(parser, "http://www.hig.no/content/download/35184/430061/file/Offentlig%20journal%2025.06.2012.pdf", errors) + #process_pdf(parser, "http://www.hig.no/content/download/30116/360863/file/Offentlig%20journal%2001.11.2010.pdf", errors) + process_pdf(parser, "http://www.hig.no/content/download/30119/360872/file/Offentlig+journal+04.11.2010.pdf", errors) + process_page_queue(parser, errors) + report_errors(errors) + exit(0) + +errors = [] +parser = postlistelib.PDFJournalParser(agency=agency) + +#test_small_pdfs(parser) + +startYear=2010 +endYear=datetime.datetime.now().year +for year in range(startYear, endYear): + process_journal_pdfs(parser, "http://www.hig.no/om_hig/offentleg_journal/%d" % year, errors) + +process_page_queue(parser, errors) +report_errors(errors) + diff --git a/scrapersources/postliste-hoegskolen-i-hamar b/scrapersources/postliste-hoegskolen-i-hamar new file mode 100644 index 
0000000..890eed3 --- /dev/null +++ b/scrapersources/postliste-hoegskolen-i-hamar @@ -0,0 +1,103 @@ +# -*- coding: UTF-8 -*- +# Based on the scraper advanced-scraping-pdf +# See also +# https://views.scraperwiki.com/run/pdf-to-html-preview-1/?url=http%3A%2F%2Fwww.stortinget.no%2FGlobal%2Fpdf%2Fpostjournal%2Fpj-2012-05-09.pdf + +import scraperwiki +import json +from BeautifulSoup import BeautifulSoup +import datetime +import dateutil.parser +import lxml.html +import resource +import sys +import urlparse +import re + +# Make sure Scraperwiki believe this is the source from this database +scraperwiki.scrape("http://www.hihm.no/Hovedsiden/Om-Hoegskolen/Offentlig-journal") + +lazycache=scraperwiki.swimport('lazycache') +postlistelib=scraperwiki.swimport('postliste-python-lib') + +agency = 'Høgskolen i Hamar' + +def report_errors(errors): + if 0 < len(errors): + print "Errors:" + for e in errors: + print e + exit(1) +def out_of_cpu(arg, spent, hard, soft): + report_errors(arg) + +def process_pdf(parser, pdfurl, errors): + errors = [] + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + try: + pdfcontent = scraperwiki.scrape(pdfurl) + parser.preprocess(pdfurl, pdfcontent) + pdfcontent = None +# except ValueError, e: +# errors.append(e) + except IndexError, e: + errors.append(e) + +def process_page_queue(parser, errors): + try: + parser.process_pages() + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + except scraperwiki.CPUTimeExceededError, e: + errors.append("Processing pages interrupted") + +def process_journal_pdfs(parser, listurl, errors): + print "Finding PDFs on " + listurl +# u = urllib.parse.urlparse(listurl) + html = scraperwiki.scrape(listurl) + root = lxml.html.fromstring(html) + html = None + for ahref in root.cssselect("div.content-view-full a"): + href = ahref.attrib['href'] + url = urlparse.urljoin(listurl, href).replace(" ", "+") + if -1 != href.find("file://") or -1 == url.find(".pdf"): +# print "Skipping non-http URL " + url + continue + if parser.is_already_scraped(url): + True +# print "Skipping already scraped " + url + else: +# print "Will process " + url + process_pdf(parser, url, errors) + +def find_journal_subpages(baseurl): + urls = [] + root = lxml.html.fromstring(scraperwiki.scrape(baseurl)) + for ahref in root.cssselect("ul.menu-list a"): + href = ahref.attrib['href'] + months = "januar","februar","mars","april","mai","juni","juli","august","september","oktober","november","desember" + if -1 == href.find("file://") and href.endswith(months): + urls.append(urlparse.urljoin(baseurl, href).replace(" ", "+")) + return urls + +def test_small_pdfs(parser): + # Test with some smaller PDFs + errors = [] + process_pdf(parser, "http://www.hihm.no/content/download/38169/420508/file/search.pdf", errors) + process_pdf(parser, "http://www.hihm.no/content/download/39369/430053/file/search.pdf", errors) + process_page_queue(parser, errors) + report_errors(errors) + exit(0) + +errors = [] +parser = postlistelib.PDFJournalParser(agency=agency) + +#test_small_pdfs(parser) + +#process_journal_pdfs(parser, "http://www.hihm.no/Hovedsiden/Om-Hoegskolen/Offentlig-journal/mai", errors) + +for url in find_journal_subpages("http://www.hihm.no/Hovedsiden/Om-Hoegskolen/Offentlig-journal"): + process_journal_pdfs(parser, url, errors) + +process_page_queue(parser, errors) +report_errors(errors) + diff --git a/scrapersources/postliste-hoegskolen-i-lillehammer b/scrapersources/postliste-hoegskolen-i-lillehammer new file mode 100644 index 0000000..5337521 --- /dev/null +++ 
b/scrapersources/postliste-hoegskolen-i-lillehammer @@ -0,0 +1,90 @@ +# -*- coding: UTF-8 -*- +# Based on the scraper advanced-scraping-pdf +# See also +# https://views.scraperwiki.com/run/pdf-to-html-preview-1/?url=http%3A%2F%2Fwww.stortinget.no%2FGlobal%2Fpdf%2Fpostjournal%2Fpj-2012-05-09.pdf + +import scraperwiki +import json +from BeautifulSoup import BeautifulSoup +import datetime +import dateutil.parser +import lxml.html +import resource +import sys +import urlparse +import re + +# Make sure Scraperwiki believe this is the source from this database +scraperwiki.scrape("http://www.hil.no/hil/om_hoegskolen/Offentlig-journal") + +lazycache=scraperwiki.swimport('lazycache') +postlistelib=scraperwiki.swimport('postliste-python-lib') + +agency = 'Høgskolen i Lillehammer' + +def report_errors(errors): + if 0 < len(errors): + print "Errors:" + for e in errors: + print e + exit(1) +def out_of_cpu(arg, spent, hard, soft): + report_errors(arg) + +def process_pdf(parser, pdfurl, errors): + errors = [] + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + try: + pdfcontent = scraperwiki.scrape(pdfurl) + parser.preprocess(pdfurl, pdfcontent) + pdfcontent = None +# except ValueError, e: +# errors.append(e) + except IndexError, e: + errors.append(e) + +def process_page_queue(parser, errors): + try: + parser.process_pages() + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + except scraperwiki.CPUTimeExceededError, e: + errors.append("Processing pages interrupted") + +def process_journal_pdfs(parser, listurl, errors): + print "Finding PDFs on " + listurl +# u = urllib.parse.urlparse(listurl) + html = scraperwiki.scrape(listurl) + root = lxml.html.fromstring(html) + html = None + for ahref in root.cssselect("div.content-view-full ul li a"): + href = ahref.attrib['href'] + url = urlparse.urljoin(listurl, href).replace(" ", "+") + if -1 != href.find("file://") or -1 == url.find(".pdf"): +# print "Skipping non-http URL " + url + continue + if parser.is_already_scraped(url): + True +# print "Skipping already scraped " + url + else: +# print "Will process " + url + process_pdf(parser, url, errors) + +def test_small_pdfs(parser): + # Test with some smaller PDFs + errors = [] + process_pdf(parser, "http://www.hil.no/content/download/39617/747521/file/uke24.pdf", errors) + process_pdf(parser, "http://www.hil.no/content/download/37616/700472/file/uke1.pdf", errors) + process_page_queue(parser, errors) + report_errors(errors) + exit(0) + +errors = [] +parser = postlistelib.PDFJournalParser(agency=agency) + +#test_small_pdfs(parser) + +process_journal_pdfs(parser, "http://www.hil.no/hil/om_hoegskolen/Offentlig-journal", errors) + +process_page_queue(parser, errors) +report_errors(errors) + diff --git a/scrapersources/postliste-hole b/scrapersources/postliste-hole new file mode 100644 index 0000000..3f34322 --- /dev/null +++ b/scrapersources/postliste-hole @@ -0,0 +1,237 @@ +# -*- coding: UTF-8 -*- +import scraperwiki +import lxml.html +import datetime +import dateutil.parser +import urllib2 +import urlparse + +# Start page is the front page, to get it listed as the primary source +scraperwiki.scrape("http://www.hole.kommune.no/postjournaler.173497.no.html") + +postlistelib=scraperwiki.swimport('postliste-python-lib') + +agency = 'Hole kommune' + +def fetch_url(url): + html = None + for n in [1, 2, 3]: + try: + html = scraperwiki.scrape(url) + break + except urllib2.URLError, e: + print "URLError fetching " + url + ", trying again" + return html + +def expand_id(value, fieldtype, entry): + year, 
seqnr = value.split('/') + year = int(year) + seqnr = int(seqnr) + if year < 50: + year = year + 2000 + if year > 50 and year < 100: + year = year + 1900 + entry[fieldtype + 'year'] = year + entry[fieldtype + 'seqnr'] = seqnr + newvalue = str(year) + '/' + str(seqnr) + return entry, newvalue + +def fetch_postjournal(agency, url, datastore): +# print "Scraping " + url + scrapestamputc = datetime.datetime.now() + html = fetch_url(url) + root = lxml.html.fromstring(html) + entry = { + 'agency' : agency, + 'scrapestamputc' : scrapestamputc, + 'scrapedurl' : url, + } + + fieldmap = { + u'Tittel på saken' : 'casedesc', + u'Tittel på dokumentet' : 'docdesc', + 'Dokumentansvarlig' : 'saksansvarlig', + 'Hjemmel' : 'exemption', + 'DokumentID' : 'journalid', + 'ArkivsakID' : 'caseid', + 'Journaldato' : 'recorddate', + 'Brevdato' : 'docdate', + #'Journalpostkategori' : + } + doctypemap = { # Valid codes are I, U, X, N, S + u'Innkommende dokument' : 'I', + u'Innkommende dokument (Gradert)' : 'I', + u'Utgående dokument' : 'U', + u'Utgående dokument (Gradert)' : 'U', + u'Utgående dokument (Ikke publisert)' : 'X', + u'Innkommende dokument (Ikke publisert)' : 'X', + u'Internt notat (Gradert)' : 'N', + u'Internt notat' : 'N', + } + for span in root.cssselect("div.innsyn-content"): + #print span.text_content() + + doctype = span.cssselect("h1.header-head")[0].text_content().strip() + print doctype + entry['doctype'] = doctypemap[doctype] + + trs = span.cssselect("div.nobox tr") + for tr in trs: + field = tr.cssselect("th.header-cell")[0].text_content().strip().replace(":","") + value = tr.cssselect("td.content-cell")[0].text_content().strip() + #print "'" + field + "' = " + value + if field in fieldmap: + field = fieldmap[field] + #print "hit" + if field in ['docdate','recorddate']: + value = dateutil.parser.parse(value, dayfirst=True).date() + if field == 'saksansvarlig' and -1 != value.find(','): + #print value + names = value.split(",", 1) + value = names[1].strip() + " " + names[0].strip() + if field == 'caseid': + entry, value = expand_id(value, 'case', entry) + if field == 'journalid': + entry, value = expand_id(value, 'journal', entry) + + entry[field] = value + + sendinfo = span.cssselect("div.dokmottakere") + if 0 < len(sendinfo): + if 'doctype' in entry and entry['doctype'] in ['U', 'X', 'N']: + field = 'recipient' + else: + field = 'sender' + # Value is "Margrethe Ingeland<br/>Gravfossveien<br/>3360 GEITHUS", should be split in person, addr and zip + entry[field] = sendinfo[0].text + brs = sendinfo[0].cssselect("br") + if 3 == len(brs): + addr = brs[0].tail + ", " + brs[1].tail + zip = brs[2].tail + entry[field + 'addr'] = addr + entry[field + 'zip'] = zip + elif 2 == len(brs): + addr = brs[0].tail + zip = brs[1].tail + entry[field + 'addr'] = addr + entry[field + 'zip'] = zip + elif 1 == len(brs): + zip = brs[0].tail + entry[field + 'zip'] = zip + elif 0 == len(brs): + True # Ignore + else: + raise ValueError("Unexpected number of address lines") + print entry + if 'doctype' in entry: + entry['casedocseq'] = 0 # Fake value, not sure how to extract the real value + datastore.append(entry) + return + +def get_journal_day(agency, date, startrow, jurlqueue): + datestr = str(date) + "T00:00:00" + url = "http://innsyn.hole.kommune.no/wfinnsyn.ashx?response=journalpost_postliste&showresults=true&fradato=%s&startrow=%d" % (datestr, startrow) + print url + html = fetch_url(url) + root = lxml.html.fromstring(html) + ahrefs = root.cssselect("table.inner-max-width tbody tr a") + for a in ahrefs: + href = 
a.attrib['href'] + if -1 != href.find("/wfinnsyn.ashx?response=journalpost_detaljer&journalpostid="): + jurl = urlparse.urljoin(url, href) + jurlqueue.append(jurl) + + ahrefs = root.cssselect("table.inner-max-width tfoot tr a") + for a in ahrefs: + if 'neste' == a.text_content(): + get_journal_day(agency, date, startrow+10, jurlqueue) + +def is_already_scraped(url): + for sql in ["scrapedurl from swdata where scrapedurl = '" + url + "' limit 1"]: + try: + result = scraperwiki.sqlite.select(sql) + #int sql, " : ", result + if 0 < len(result) and u'scrapedurl' in result[0]: + return True + except: + print "Exception" + pass + return False + +def minmax_recorddate(minmax): + for sql in ["%s(recorddate) as recorddate from swdata" % minmax]: + try: + result = scraperwiki.sqlite.select(sql) + date = dateutil.parser.parse(result[0]['recorddate']).date() + return date + except: + pass + return None + +def scraper(): + html = fetch_url("http://innsyn.hole.kommune.no/wfinnsyn.ashx?response=journalpost_postliste&showresults=true") + root = lxml.html.fromstring(html) + table = root.cssselect("table.inner-max-width") + #print table[0].text_content() + + lastdate = dateutil.parser.parse(table[0].cssselect("caption")[0].text_content().replace("Postliste den ", ""), dayfirst=True).date() + + maxdate = minmax_recorddate("max") + + if maxdate: + startdate = maxdate + datetime.timedelta(days=1) + start = 0 + end = (lastdate-startdate).days + 1 + print maxdate, startdate, start, end + else: + startdate = maxdate + start = 0 + end = 0 + for old in range(start, end): + date = startdate + datetime.timedelta(days=old) + print date + urlqueue = [] + get_journal_day(agency, date, 0, urlqueue) + datastore = [] + for jurl in urlqueue: + if not is_already_scraped(jurl): + res = fetch_postjournal(agency, jurl, datastore) + if 0 < len(datastore): + print datastore + scraperwiki.sqlite.save(unique_keys=['scrapedurl'], data=datastore) + datastore = [] + + mindate = minmax_recorddate("min") + + # Only three months back + return + + if mindate: + startdate = mindate - datetime.timedelta(days=1) + start = 0 + end = -60 + print mindate, startdate, start, end + else: + return + for old in range(start, end, -1): + date = startdate + datetime.timedelta(days=old) + print date + urlqueue = [] + get_journal_day(agency, date, 0, urlqueue) + datastore = [] + for jurl in urlqueue: + if not is_already_scraped(jurl): + res = fetch_postjournal(agency, jurl, datastore) + if 0 < len(datastore): + print datastore + scraperwiki.sqlite.save(unique_keys=['scrapedurl'], data=datastore) + datastore = [] + +#GET http://innsyn.hole.kommune.no/wfinnsyn.ashx?response=journalpost_postliste&showresults=true&fradato=2012-06-15T00:00:00 +#GET http://innsyn.hole.kommune.no/wfinnsyn.ashx?response=journalpost_detaljer&journalpostid=2012005569& +#GET http://innsyn.hole.kommune.no/wfinnsyn.ashx?response=arkivsak_detaljer&arkivsakid=2006002016& + +if __name__ == "scraper": + scraper() +else: + print "Not called as scraper"
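+
+# A rough, self-contained sketch of the paging pattern get_journal_day()
+# implements above: fetch one 10-row result page for the day, queue the
+# journalpost_detaljer links it contains, and recurse with startrow+10 while
+# the table footer still has a "neste" (next) link.  The callables passed in
+# here are illustrative stand-ins, not functions from this scraper.
+def _paging_sketch(date, startrow, queue, fetch_page, extract_links, has_next_link):
+    page = fetch_page(date, startrow)       # one page of the day's journal
+    queue.extend(extract_links(page))       # detail URLs to visit later
+    if has_next_link(page):                 # more rows for the same day
+        _paging_sketch(date, startrow + 10, queue,
+                       fetch_page, extract_links, has_next_link)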
\ No newline at end of file diff --git a/scrapersources/postliste-hvaler b/scrapersources/postliste-hvaler new file mode 100644 index 0000000..b3e9137 --- /dev/null +++ b/scrapersources/postliste-hvaler @@ -0,0 +1,81 @@ +# -*- coding: UTF-8 -*- + +import scraperwiki +import json +from BeautifulSoup import BeautifulSoup +import datetime +import dateutil.parser +import lxml.html +import resource +import sys +import urlparse +import re +lazycache=scraperwiki.swimport('lazycache') +postlistelib=scraperwiki.swimport('postliste-python-lib') + +agency = 'Hvaler kommune' + +def report_errors(errors): + if 0 < len(errors): + print "Errors:" + for e in errors: + print e + exit(1) +def out_of_cpu(arg, spent, hard, soft): + report_errors(arg) + +def process_pdf(parser, pdfurl, errors): + errors = [] + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + try: + pdfcontent = scraperwiki.scrape(pdfurl) + parser.preprocess(pdfurl, pdfcontent) + pdfcontent = None +# except ValueError, e: +# errors.append(e) + except IndexError, e: + errors.append(e) + +def process_page_queue(parser, errors): + try: + parser.process_pages() + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + except scraperwiki.CPUTimeExceededError, e: + errors.append("Processing pages interrupted") + +def process_journal_pdfs(parser, listurl, errors): +# print "Finding PDFs on " + listurl +# u = urllib.parse.urlparse(listurl) + html = scraperwiki.scrape(listurl) + root = lxml.html.fromstring(html) + html = None + for ahref in root.cssselect("div#ctl00_MainRegion_StageAreaRegion_MainContentRegion_MainBodyRegion_ctl01_FileTreen0Nodes a"): + href = ahref.attrib['href'] + url = urlparse.urljoin(listurl, href) + if -1 != href.find("file://"): +# print "Skipping non-http URL " + url + continue + if parser.is_already_scraped(url): + True +# print "Skipping already scraped " + url + else: +# print "Will process " + url + process_pdf(parser, url, errors) + +def test_small_pdfs(parser): + # Test with some smaller PDFs + errors = [] + process_pdf(parser, "http://www.hvaler.kommune.no/Documents/Postlister/2012/2012-05-31.pdf", errors) + process_page_queue(parser, errors) + report_errors(errors) + exit(0) + +errors = [] +parser = postlistelib.PDFJournalParser(agency=agency) + +#test_small_pdfs(parser) + +process_journal_pdfs(parser, "http://www.hvaler.kommune.no/Postlister/", errors) +process_page_queue(parser, errors) +report_errors(errors) + diff --git a/scrapersources/postliste-kafjord b/scrapersources/postliste-kafjord new file mode 100644 index 0000000..e0d6b5c --- /dev/null +++ b/scrapersources/postliste-kafjord @@ -0,0 +1,81 @@ +# -*- coding: UTF-8 -*- + +import scraperwiki +import json +from BeautifulSoup import BeautifulSoup +import datetime +import dateutil.parser +import lxml.html +import resource +import sys +import urlparse +import re +lazycache=scraperwiki.swimport('lazycache') +postlistelib=scraperwiki.swimport('postliste-python-lib') + +agency = u'Kåfjord kommune' + +def report_errors(errors): + if 0 < len(errors): + print "Errors:" + for e in errors: + print e + exit(1) +def out_of_cpu(arg, spent, hard, soft): + report_errors(arg) + +def process_pdf(parser, pdfurl, errors): + errors = [] + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + try: + pdfcontent = scraperwiki.scrape(pdfurl) + parser.preprocess(pdfurl, pdfcontent) + pdfcontent = None +# except ValueError, e: +# errors.append(e) + except IndexError, e: + errors.append(e) + +def process_page_queue(parser, errors): + try: + parser.process_pages() + 
postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + except scraperwiki.CPUTimeExceededError, e: + errors.append("Processing pages interrupted") + +def process_journal_pdfs(parser, listurl, errors): +# print "Finding PDFs on " + listurl +# u = urllib.parse.urlparse(listurl) + html = scraperwiki.scrape(listurl) + root = lxml.html.fromstring(html) + html = None + for ahref in root.cssselect("div.main a"): + href = ahref.attrib['href'] + url = urlparse.urljoin(listurl, href) + if -1 != href.find("file://") or -1 == href.find("/postliste-"): +# print "Skipping non-http URL " + url + continue + if parser.is_already_scraped(url): + True +# print "Skipping already scraped " + url + else: +# print "Will process " + url + process_pdf(parser, url, errors) + +def test_small_pdfs(parser): + # Test with some smaller PDFs + errors = [] + process_pdf(parser, "http://www.kafjord.kommune.no/postliste-15-06-12.5065630-18590.html", errors) + process_page_queue(parser, errors) + report_errors(errors) + exit(0) + +errors = [] +parser = postlistelib.PDFJournalParser(agency=agency) + +#test_small_pdfs(parser) + +process_journal_pdfs(parser, "http://www.kafjord.kommune.no/index.php?cat=18590", errors) +process_page_queue(parser, errors) +report_errors(errors) + diff --git a/scrapersources/postliste-kristiansund b/scrapersources/postliste-kristiansund new file mode 100644 index 0000000..6965810 --- /dev/null +++ b/scrapersources/postliste-kristiansund @@ -0,0 +1,87 @@ +# -*- coding: UTF-8 -*- + +import scraperwiki +from BeautifulSoup import BeautifulSoup +import datetime +import dateutil.parser +import lxml.html +import sys +import urlparse + +scraperwiki.scrape("http://kristiansund.orkide.acos.no/kunde/web/postliste/postliste.asp") + +lazycache=scraperwiki.swimport('lazycache') +postlistelib=scraperwiki.swimport('postliste-python-lib') + +agency = 'Kristiansund kommune' +debug = False + +def is_already_scraped(url): + + for sql in ["scrapedurl from swdata where scrapedurl = '" + url + "' limit 1", + "scrapedurl from unparsedpages where scrapedurl = '" + url + "' limit 1"]: +# print sql + try: + result = scraperwiki.sqlite.select(sql) +# print result + if 0 < len(result) and u'scrapedurl' in result[0]: + return True + except: + pass + return False + +def report_errors(errors): + if 0 < len(errors): + print "Errors:" + for e in errors: + print e + exit(1) +def no_cpu_left(arg, spent, soft, hard): + report_errors(arg) + +def process_pdf(parser, pdfurl, errors): + errors = [] + postlistelib.exit_if_no_cpu_left(0, callback=no_cpu_left, arg = errors) + try: + pdfcontent = lazycache.lazycache(pdfurl) + parser.preprocess(pdfurl, pdfcontent) +# except ValueError, e: +# errors.append(e) + except IndexError, e: + errors.append(e) + +def process_page_queue(parser, errors): + try: + parser.process_pages() + postlistelib.exit_if_no_cpu_left(0, callback=no_cpu_left, arg = errors) + except scraperwiki.CPUTimeExceededError, e: + errors.append("Processing pages interrupted") + +def consider_url(parser, url, errors): + if is_already_scraped(url): + True +# print "Skipping already scraped " + url + else: +# print "Will process " + url + process_pdf(parser, url, errors) + +def process_journal_pdfs(parser, listurl, errors): +# print "Finding PDFs on " + listurl +# u = urllib.parse.urlparse(listurl) + html = scraperwiki.scrape(listurl) + root = lxml.html.fromstring(html) + html = None + for ahref in root.cssselect("table a"): + url = urlparse.urljoin(listurl, ahref.attrib['href']) + if -1 == url.find(".pdf"): + continue + 
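+        # consider_url() below checks is_already_scraped(), which looks in both
+        # swdata (entries that are already parsed) and unparsedpages (presumably
+        # pages queued but not yet parsed), so re-runs only download PDFs that
+        # are new to both tables.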
consider_url(parser, url, errors) + +#test_parse_case_journal_ref() +errors = [] +parser = postlistelib.PDFJournalParser(agency=agency) +#parser.debug = True +process_journal_pdfs(parser, "http://kristiansund.orkide.acos.no/kunde/web/postliste/postliste.asp", errors) +process_page_queue(parser, errors) +report_errors(errors) + diff --git a/scrapersources/postliste-lier b/scrapersources/postliste-lier new file mode 100644 index 0000000..8064d7a --- /dev/null +++ b/scrapersources/postliste-lier @@ -0,0 +1,81 @@ +# -*- coding: UTF-8 -*- + +import scraperwiki +import json +from BeautifulSoup import BeautifulSoup +import datetime +import dateutil.parser +import lxml.html +import urlparse + +scraperwiki.scrape("http://www.lier.kommune.no/no/Tjenesteomrader-/Oversikter/Postlister---Offentlig-journal/") + +lazycache=scraperwiki.swimport('lazycache') +postlistelib=scraperwiki.swimport('postliste-python-lib') + +agency = 'Lier kommune' + +def report_errors(errors): + if 0 < len(errors): + print "Errors:" + for e in errors: + print e + exit(1) +def out_of_cpu(arg, spent, hard, soft): + report_errors(arg) + +def process_pdf(parser, pdfurl, errors): + errors = [] + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + try: + pdfcontent = scraperwiki.scrape(pdfurl) + parser.preprocess(pdfurl, pdfcontent) + pdfcontent = None +# except ValueError, e: +# errors.append(e) + except IndexError, e: + errors.append(e) + +def process_page_queue(parser, errors): + try: + parser.process_pages() + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + except scraperwiki.CPUTimeExceededError, e: + errors.append("Processing pages interrupted") + +def process_journal_pdfs(parser, listurl, errors): +# print "Finding PDFs on " + listurl +# u = urllib.parse.urlparse(listurl) + html = scraperwiki.scrape(listurl) + root = lxml.html.fromstring(html) + html = None + for ahref in root.cssselect("div.fullwidth a"): + href = ahref.attrib['href'] + url = urlparse.urljoin(listurl, href) + if -1 != href.find("file://") or -1 == href.find(".pdf"): +# print "Skipping non-http URL " + url + continue + if parser.is_already_scraped(url): + True +# print "Skipping already scraped " + url + else: +# print "Will process " + url + process_pdf(parser, url, errors) + +def test_small_pdfs(parser): + # Test with some smaller PDFs + errors = [] + process_pdf(parser, "http://www.lier.kommune.no/files/1256/Postlister%2011.06.pdf", errors) + process_page_queue(parser, errors) + report_errors(errors) + exit(0) + +errors = [] +parser = postlistelib.PDFJournalParser(agency=agency) + +#test_small_pdfs(parser) + +process_journal_pdfs(parser, "http://www.lier.kommune.no/no/Tjenesteomrader-/Oversikter/Postlister---Offentlig-journal/", errors) +process_page_queue(parser, errors) +report_errors(errors) + diff --git a/scrapersources/postliste-lindesnes b/scrapersources/postliste-lindesnes new file mode 100644 index 0000000..39e69c0 --- /dev/null +++ b/scrapersources/postliste-lindesnes @@ -0,0 +1,124 @@ +# -*- coding: UTF-8 -*- +import scraperwiki +import lxml.html +import datetime +import dateutil.parser +import urllib2 + +# http://innsyn.lindesnes.kommune.no/Publikum/Modules/innsyn.aspx?mode=pl&SelPanel=0&ObjectType=ePhorteRegistryEntry&VariantType=Innsyn&ViewType=Table&Query=RecordDate%3a%28-14%29+AND+ResponsibleUnitID%3a%2811%29+AND+DocumentType%3a%28I%2cU%29 + +def fetch_url(url): + html = None + for n in [1, 2, 3]: + try: + html = scraperwiki.scrape(url) + break + except urllib2.URLError, e: + print "URLError fetching " + url + ", trying again" 
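+    # Only urllib2.URLError is retried, and the three attempts above run back
+    # to back without any delay; if they all fail, html is still None and the
+    # caller gets None back.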
+ return html + +def make_url(id): + url = "http://innsyn.lindesnes.kommune.no/Publikum/Modules/innsyn.aspx?mode=pl&SelPanel=0&ObjectType=ePhorteRegistryEntry&VariantType=Innsyn&ViewType=Detail&Query=ID:" + str(id) + return url + +def fetch_postjournal(agency, id, url, datastore): +# print "Scraping " + url + scrapestamputc = datetime.datetime.now() + html = fetch_url(url) + root = lxml.html.fromstring(html.decode('utf-8')) + entry = { + 'agency' : agency, + 'scrapestamputc' : scrapestamputc, + 'scrapedurl' : url, + 'queryid' : id + } + + for span in root.cssselect("div.robots-content span.Element"): +# print span.text_content() + field = None + value = None + if span.cssselect("h3"): + field = span.cssselect("h3")[0].text_content().strip() + value = span.cssselect("span.Content span")[0].text_content().strip() + elif span.cssselect("h2"): + field = span.cssselect("h2")[0].text_content().strip() +# FIXME + value = "" + elif span.cssselect("h1"): + field = "docdesc" + value = span.cssselect("h1")[0].text_content().strip() +# else: +# raise ValueError("Unexpected span") +# print field + " = " + value + doctypemap = { + u'Inngående brev' : 'I', + u'Utgående brev' : 'U', + u'Internt notat' : 'N', + u'Internt notat uten oppfølging' : 'X', + u'Saksframlegg/innstilling' : 'S', + u'Dokumentpost i saksmappe' : 'Y', # Code not in NOARK, value based on http://img6.custompublish.com/getfile.php/1168825.136.pqftpqctyt/Ephorte-brukerveiledning_2.1.15.pdf?return=www.kafjord.kommune.no + } + if 'Type' == field: + field = 'doctype' + value = doctypemap[value] + elif 'Journaldato' == field: + field = 'recorddate' + value = dateutil.parser.parse(value, dayfirst=True) + elif 'Dokumentdato' == field: + field = 'docdate' + value = dateutil.parser.parse(value, dayfirst=True) + elif u'Tilhører sak' == field: + field = 'casedesc' + elif 'Avsender/Mottaker' == field: + if 'doctype' in entry and entry['doctype'] in ['U', 'X', 'N']: + field = 'recipient' + else: + field = 'sender' + td = span.cssselect("table td") + if td: + name = td[0].text_content().strip() + addr = td[1].text_content().strip() + zip = td[2].text_content().strip() + # print "N: '",name, "' '", addr, "' '", zip, "'" + entry[field] = name + entry[field + 'addr'] = addr + entry[field + 'zip'] = zip + field = '' + +# elif 'Saksbehandlende enhet' == field: +# elif 'Saksbehandler' == field: + if field is not None and '' != field: + entry[field] = value + + print entry + if 'doctype' in entry: + datastore.append(entry) + +agency = 'Lindesnes kommune' + +def scrape_range(start, end, step, agency): + datastore = [] + for id in range(start, end, step): + fetch_postjournal(agency, id, make_url(id), datastore) + if 0 < len(datastore) and 0 == (len(datastore) % 10): + #print datastore + scraperwiki.sqlite.save(unique_keys=['queryid'], data=datastore) + datastore = [] + if 0 < len(datastore): + scraperwiki.sqlite.save(unique_keys=['queryid'], data=datastore) + +def scraper(): + try: + min = scraperwiki.sqlite.select("min(queryid) as min from swdata")[0]["min"] + max = scraperwiki.sqlite.select("min(queryid) as max from swdata")[0]["max"] + except: + # Random number around 2012-05-15 (ie recent when I wrote this scraper) + min = 71836 + + scrape_range(max, max + 200, 1, agency) + scrape_range(min-1, min - 3000, -1, agency) + +if __name__ == "scraper": + scraper() +else: + print "Not called as scraper"
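+
+# The incremental strategy in scraper() above boils down to the pattern below
+# (a sketch only; fetch_one stands in for fetch_postjournal() plus the batched
+# sqlite saves, and 71836 is the hard-coded seed id from the scraper): scrape
+# a short window of ePhorte ids above the newest id already stored to pick up
+# new entries, and a longer window below the oldest id to backfill history.
+def _id_window_sketch(known_min, known_max, fetch_one, seed_id=71836):
+    if known_min is None or known_max is None:
+        known_min = known_max = seed_id                     # empty database: start at the seed
+    for id in range(known_max, known_max + 200):            # forward: new entries
+        fetch_one(id)
+    for id in range(known_min - 1, known_min - 3000, -1):   # backward: backfill
+        fetch_one(id)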
\ No newline at end of file diff --git a/scrapersources/postliste-luftambulanse b/scrapersources/postliste-luftambulanse new file mode 100644 index 0000000..df28d6b --- /dev/null +++ b/scrapersources/postliste-luftambulanse @@ -0,0 +1,91 @@ +# -*- coding: UTF-8 -*- + +import scraperwiki +import json +from BeautifulSoup import BeautifulSoup +import datetime +import dateutil.parser +import lxml.html +import resource +import sys +import urlparse +import re +lazycache=scraperwiki.swimport('lazycache') +postlistelib=scraperwiki.swimport('postliste-python-lib') + +agency = 'Luftambulansetjenesten ANS' + +def report_errors(errors): + if 0 < len(errors): + print "Errors:" + for e in errors: + print e + exit(1) +def out_of_cpu(arg, spent, hard, soft): + report_errors(arg) + +def process_pdf(parser, pdfurl, errors): + errors = [] + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + try: + pdfcontent = scraperwiki.scrape(pdfurl) + parser.preprocess(pdfurl, pdfcontent) + pdfcontent = None +# except ValueError, e: +# errors.append(e) + except IndexError, e: + errors.append(e) + +def process_page_queue(parser, errors): + try: + parser.process_pages() + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + except scraperwiki.CPUTimeExceededError, e: + errors.append("Processing pages interrupted") + +def process_journal_pdfs(parser, listurl, errors): +# print "Finding PDFs on " + listurl +# u = urllib.parse.urlparse(listurl) + html = scraperwiki.scrape(listurl) + root = lxml.html.fromstring(html) + html = None + for ahref in root.cssselect("table a"): + if not 'href' in ahref.attrib: + continue + href = ahref.attrib['href'] + url = urlparse.urljoin(listurl, href).replace(" ", "%20") + if -1 != href.find("file://") or -1 == url.find(".pdf") or -1 == url.find('/Postjournal'): +# print "Skipping non-http URL " + url + continue + if parser.is_already_scraped(url): + True +# print "Skipping already scraped " + url + else: +# print "Will process " + url + process_pdf(parser, url, errors) + +def test_small_pdfs(parser): + # Test with some smaller PDFs + errors = [] + process_pdf(parser, "http://www.luftambulanse.no/filarkiv/Postjournal%202012/Postjournal%20mai/2805-010612.pdf", errors) + process_page_queue(parser, errors) + report_errors(errors) + exit(0) + +errors = [] +parser = postlistelib.PDFJournalParser(agency=agency) + +#test_small_pdfs(parser) + +#process_page_queue(parser, errors) +process_journal_pdfs(parser, "http://www.luftambulanse.no/postjournal_2012.aspx", errors) +process_journal_pdfs(parser, "http://www.luftambulanse.no/postjournal_2011.aspx", errors) +process_journal_pdfs(parser, "http://www.luftambulanse.no/postjournal_2010.aspx", errors) +process_journal_pdfs(parser, "http://www.luftambulanse.no/postjournal_2009.aspx", errors) +process_journal_pdfs(parser, "http://www.luftambulanse.no/postjournal_2008.aspx", errors) +process_journal_pdfs(parser, "http://www.luftambulanse.no/postjournal_2007.aspx", errors) +process_journal_pdfs(parser, "http://www.luftambulanse.no/postjournal.aspx", errors) + +process_page_queue(parser, errors) +report_errors(errors) + diff --git a/scrapersources/postliste-naroy b/scrapersources/postliste-naroy new file mode 100644 index 0000000..b8fa33b --- /dev/null +++ b/scrapersources/postliste-naroy @@ -0,0 +1,89 @@ +# -*- coding: UTF-8 -*- + +import scraperwiki +import json +from BeautifulSoup import BeautifulSoup +import datetime +import dateutil.parser +import lxml.html +import resource +import sys +import urlparse +import re 
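+# scraperwiki.swimport() loads the source of another scraper on the platform
+# and exposes it as a module: 'postliste-python-lib' is the shared journal
+# parser library (PDFJournalParser etc.) reused by all of these scrapers, and
+# 'lazycache' offers cached downloads for the ones that use it.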
+lazycache=scraperwiki.swimport('lazycache') +postlistelib=scraperwiki.swimport('postliste-python-lib') + +agency = u'Nærøy kommune' + +def report_errors(errors): + if 0 < len(errors): + print "Errors:" + for e in errors: + print e + exit(1) +def out_of_cpu(arg, spent, hard, soft): + report_errors(arg) + +def process_pdf(parser, pdfurl, errors): + errors = [] + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + try: + pdfcontent = scraperwiki.scrape(pdfurl) + parser.preprocess(pdfurl, pdfcontent) + pdfcontent = None +# except ValueError, e: +# errors.append(e) + except IndexError, e: + errors.append(e) + +def process_page_queue(parser, errors): + try: + parser.process_pages() + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + except scraperwiki.CPUTimeExceededError, e: + errors.append("Processing pages interrupted") + +def process_journal_pdfs(parser, listurl, errors): +# print "Finding PDFs on " + listurl +# u = urllib.parse.urlparse(listurl) + html = scraperwiki.scrape(listurl) + root = lxml.html.fromstring(html) + html = None + for ahref in root.cssselect("table#hovedinnhold a"): + href = ahref.attrib['href'] + url = urlparse.urljoin(listurl, href).replace(" ", "+") + if -1 != href.find("file://"): +# print "Skipping non-http URL " + url + continue + if -1 == url.find(".pdf"): + continue + # Special case, file indicating no journal entries this day + if "http://www.naroy.kommune.no/NK/Intern.nsf/FilA/CA6C83764E56DDCBC1257A02003F9025/$FILE/Postjournal+11.05.12.pdf" == url or \ + "http://www.naroy.kommune.no/NK/Intern.nsf/FilA/7FD82A18C1A1F137C12579F90029DEBD/$FILE/Postjournal+07.05.12.pdf" == url: + continue + if parser.is_already_scraped(url): + True +# print "Skipping already scraped " + url + else: +# print "Will process " + url + process_pdf(parser, url, errors) + +def test_small_pdfs(parser): + # Test with some smaller PDFs + errors = [] + process_pdf(parser, "http://www.naroy.kommune.no/NK/Intern.nsf/FilA/451908E568D2D630C1257A1E004D1B9D/$FILE/Postjournal%2005.06.12.pdf", errors) + + process_page_queue(parser, errors) + report_errors(errors) + exit(0) + +errors = [] +parser = postlistelib.PDFJournalParser(agency=agency) +#parser.debug = True + +#test_small_pdfs(parser) + +process_journal_pdfs(parser, "http://www.naroy.kommune.no/NK/Web.nsf/mainPress?OpenForm&U=POST", errors) +process_page_queue(parser, errors) +report_errors(errors) + diff --git a/scrapersources/postliste-nih b/scrapersources/postliste-nih new file mode 100644 index 0000000..4f92e18 --- /dev/null +++ b/scrapersources/postliste-nih @@ -0,0 +1,85 @@ +# -*- coding: UTF-8 -*- + +import scraperwiki +import json +from BeautifulSoup import BeautifulSoup +import datetime +import dateutil.parser +import lxml.html +import resource +import sys +import urlparse +import re + +# Make sure Scraperwiki believe this is the source from this database +scraperwiki.scrape("http://www.nih.no/om-nih/aktuelt/offentlig-postjournal/") + +lazycache=scraperwiki.swimport('lazycache') +postlistelib=scraperwiki.swimport('postliste-python-lib') + +agency = 'Norges idrettshøgskole' + +def report_errors(errors): + if 0 < len(errors): + print "Errors:" + for e in errors: + print e + exit(1) +def out_of_cpu(arg, spent, hard, soft): + report_errors(arg) + +def process_pdf(parser, pdfurl, errors): + errors = [] + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + try: + pdfcontent = scraperwiki.scrape(pdfurl) + parser.preprocess(pdfurl, pdfcontent) + pdfcontent = None +# except ValueError, e: +# errors.append(e) + except 
IndexError, e: + errors.append(e) + +def process_page_queue(parser, errors): + try: + parser.process_pages() + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + except scraperwiki.CPUTimeExceededError, e: + errors.append("Processing pages interrupted") + +def process_journal_pdfs(parser, listurl, errors): +# print "Finding PDFs on " + listurl +# u = urllib.parse.urlparse(listurl) + html = scraperwiki.scrape(listurl) + root = lxml.html.fromstring(html) + html = None + for ahref in root.cssselect("li a"): + href = ahref.attrib['href'] + url = urlparse.urljoin(listurl, href) + if -1 != href.find("file://") or -1 == url.find(".pdf"): +# print "Skipping non-http URL " + url + continue + if parser.is_already_scraped(url): + True +# print "Skipping already scraped " + url + else: +# print "Will process " + url + process_pdf(parser, url, errors) + +def test_small_pdfs(parser): + # Test with some smaller PDFs + errors = [] + process_pdf(parser, u"http://www.nih.no/Documents/1_P%C3%98/Postjournaler/offentlig%20journal%20uke%2022.pdf", errors) + process_page_queue(parser, errors) + report_errors(errors) + exit(0) + +errors = [] +parser = postlistelib.PDFJournalParser(agency=agency) + +#test_small_pdfs(parser) + +process_journal_pdfs(parser, "http://www.nih.no/om-nih/aktuelt/offentlig-postjournal/", errors) +process_page_queue(parser, errors) +report_errors(errors) + diff --git a/scrapersources/postliste-npolar b/scrapersources/postliste-npolar new file mode 100644 index 0000000..423a785 --- /dev/null +++ b/scrapersources/postliste-npolar @@ -0,0 +1,101 @@ +# -*- coding: UTF-8 -*- + +import scraperwiki +import json +from BeautifulSoup import BeautifulSoup +import datetime +import dateutil.parser +import lxml.html +import resource +import sys +import urlparse +import re + +# Load front page first, to get it recorded as the source by scraperwiki +scraperwiki.scrape("http://www.npolar.no/no/om-oss/offentlig-journal.html") + +lazycache=scraperwiki.swimport('lazycache') +postlistelib=scraperwiki.swimport('postliste-python-lib') + +agency = 'Norsk Polarinstitutt' + +def report_errors(errors): + if 0 < len(errors): + print "Errors:" + for e in errors: + print e + exit(1) +def out_of_cpu(arg, spent, hard, soft): + report_errors(arg) + +def process_pdf(parser, pdfurl, errors): + errors = [] + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + try: + pdfcontent = scraperwiki.scrape(pdfurl) + parser.preprocess(pdfurl, pdfcontent) + pdfcontent = None +# except ValueError, e: +# errors.append(e) + except IndexError, e: + errors.append(e) + +def process_page_queue(parser, errors): + try: + parser.process_pages() + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + except scraperwiki.CPUTimeExceededError, e: + errors.append("Processing pages interrupted") + +def process_journal_pdfs(parser, listurl, errors): +# print "Finding PDFs on " + listurl +# u = urllib.parse.urlparse(listurl) + html = scraperwiki.scrape(listurl) + root = lxml.html.fromstring(html) + html = None + for ahref in root.cssselect("div.onecol ul a"): + href = ahref.attrib['href'] + url = urlparse.urljoin(listurl, href) + if -1 != href.find("file://") or -1 == url.find(".pdf"): +# print "Skipping non-http URL " + url + continue + if parser.is_already_scraped(url): + True +# print "Skipping already scraped " + url + else: +# print "Will process " + url + process_pdf(parser, url, errors) + +def test_small_pdfs(parser): + # Test with some smaller PDFs + errors = [] + #process_pdf(parser, 
"http://www.npolar.no/npcms/export/sites/np/files/vedlegg/offentlig-journal/2011-10.pdf", errors) + #process_pdf(parser, "http://www.npolar.no/npcms/export/sites/np/files/vedlegg/offentlig-journal/2011-09.pdf", errors) + #process_pdf(parser, "http://www.npolar.no/npcms/export/sites/np/files/vedlegg/offentlig-journal/2011-08.pdf", errors) + #process_pdf(parser, "http://www.npolar.no/npcms/export/sites/np/files/vedlegg/offentlig-journal/2011-07.pdf", errors) + #process_pdf(parser, "http://www.npolar.no/npcms/export/sites/np/files/vedlegg/offentlig-journal/2011-06.pdf", errors) + #process_pdf(parser, "http://www.npolar.no/npcms/export/sites/np/files/vedlegg/offentlig-journal/2011-05.pdf", errors) + #process_pdf(parser, "http://www.npolar.no/npcms/export/sites/np/files/vedlegg/offentlig-journal/2011-04.pdf", errors) + #process_pdf(parser, "http://www.npolar.no/npcms/export/sites/np/files/vedlegg/offentlig-journal/2011-03.pdf", errors) + #process_pdf(parser, "http://www.npolar.no/npcms/export/sites/np/files/vedlegg/offentlig-journal/2011-02.pdf", errors) + #process_pdf(parser, "http://www.npolar.no/npcms/export/sites/np/files/vedlegg/offentlig-journal/2011-01.pdf", errors) + process_pdf(parser, "http://home.nuug.no/~pere/npolar-postjournal/OffJournalapril-mai2012.pdf", errors) + process_pdf(parser, "http://home.nuug.no/~pere/npolar-postjournal/OffJournaljanuar-mai2011.pdf", errors) + process_pdf(parser, "http://home.nuug.no/~pere/npolar-postjournal/OffJournaljanuar-mars2012.pdf", errors) + process_pdf(parser, "http://home.nuug.no/~pere/npolar-postjournal/OffJournaljuni-oktober2011.pdf", errors) + process_pdf(parser, "http://home.nuug.no/~pere/npolar-postjournal/OffJournaljuni2012.pdf", errors) + process_pdf(parser, "http://home.nuug.no/~pere/npolar-postjournal/OffJournalnovember-desember2011.pdf", errors) + + process_page_queue(parser, errors) + report_errors(errors) + exit(0) + +errors = [] +parser = postlistelib.PDFJournalParser(agency=agency) + +#test_small_pdfs(parser) + +process_journal_pdfs(parser, "http://www.npolar.no/no/om-oss/offentlig-journal.html", errors) +process_page_queue(parser, errors) +report_errors(errors) + diff --git a/scrapersources/postliste-nrk b/scrapersources/postliste-nrk new file mode 100644 index 0000000..5c7929d --- /dev/null +++ b/scrapersources/postliste-nrk @@ -0,0 +1,94 @@ +# -*- coding: UTF-8 -*- +# Based on the scraper advanced-scraping-pdf +# See also +# https://views.scraperwiki.com/run/pdf-to-html-preview-1/?url=http%3A%2F%2Fwww.stortinget.no%2FGlobal%2Fpdf%2Fpostjournal%2Fpj-2012-05-09.pdf + +import scraperwiki +import json +from BeautifulSoup import BeautifulSoup +import datetime +import dateutil.parser +import lxml.html +import resource +import sys +import urlparse +import gc +import re + +frontpage = "http://www.nrk.no/contentfile/transformer/1.8052258" +scraperwiki.scrape(frontpage) + +lazycache=scraperwiki.swimport('lazycache') +postlistelib=scraperwiki.swimport('postliste-python-lib') + +agency = 'Universitetet i Oslo' + +def report_errors(errors): + if 0 < len(errors): + print "Errors:" + for e in errors: + print e + exit(1) +def out_of_cpu(arg, spent, hard, soft): + report_errors(arg) + +def process_pdf(parser, pdfurl, errors): + errors = [] + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + try: + pdfcontent = scraperwiki.scrape(pdfurl) + parser.preprocess(pdfurl, pdfcontent) + pdfcontent = None +# except ValueError, e: +# errors.append(e) + except IndexError, e: + errors.append(e) + +def process_page_queue(parser, errors): + try: + 
parser.process_pages() + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + except scraperwiki.CPUTimeExceededError, e: + errors.append("Processing pages interrupted") + +def process_journal_pdfs(parser, listurl, errors): +# print "Finding PDFs on " + listurl +# u = urllib.parse.urlparse(listurl) + html = scraperwiki.scrape(listurl) + root = lxml.html.fromstring(html) + html = None + for ahref in root.cssselect("table a"): + href = ahref.attrib['href'] + url = urlparse.urljoin(listurl, href) + if -1 != href.find("file://"): +# print "Skipping non-http URL " + url + continue + if parser.is_already_scraped(url): + True +# print "Skipping already scraped " + url + else: +# print "Will process " + url + process_pdf(parser, url, errors) + +def test_small_pdfs(parser): + + parser.debug = True + + errors = [] + process_pdf(parser, "http://nrk.no/contentfile/file/1.8116520!offentligjournal02052012.pdf", errors) # text + #process_pdf(parser, "http://nrk.no/contentfile/file/1.8061384!offentlig%2002042012.pdf", errors) # Image + #process_pdf(parser, "http://nrk.no/contentfile/file/1.8130287!offentligjournal09052012.pdf", errors) # Image + process_page_queue(parser, errors) + report_errors(errors) + exit(0) + +errors = [] +parser = postlistelib.PDFJournalParser(agency=agency, hiddentext=True) + +test_small_pdfs(parser) + +# Based on http://www.nrk.no/innsyn/ +process_journal_pdfs(parser, frontpage, errors) +process_page_queue(parser, errors) +report_errors(errors) + diff --git a/scrapersources/postliste-ntnu b/scrapersources/postliste-ntnu new file mode 100644 index 0000000..1a885c4 --- /dev/null +++ b/scrapersources/postliste-ntnu @@ -0,0 +1,87 @@ +# -*- coding: UTF-8 -*- + +import scraperwiki +import json +from BeautifulSoup import BeautifulSoup +import datetime +import dateutil.parser +import lxml.html +import urlparse +import urllib2 + +# Make sure Scraperwiki believe this is the source from this database +scraperwiki.scrape("http://www.ntnu.no/aktuelt/offentlig-journal") + +lazycache=scraperwiki.swimport('lazycache') +postlistelib=scraperwiki.swimport('postliste-python-lib') + +agency = 'Norges teknisk-naturvitenskapelige universitet' + +def report_errors(errors): + if 0 < len(errors): + print "Errors:" + for e in errors: + print e + raise ValueError("Something went wrong") + +def out_of_cpu(arg, spent, hard, soft): + report_errors(arg) + +def process_pdf(parser, pdfurl, errors): + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + try: + pdfcontent = scraperwiki.scrape(pdfurl) + parser.preprocess(pdfurl, pdfcontent) + pdfcontent = None + except ValueError, e: # Some PDFs can not be parsed! 
This should be investigated + print "PDF format problem" + errors.append(e) + except IndexError, e: + errors.append(e) + except urllib2.HTTPError, e: + errors.append(e) + +def process_page_queue(parser, errors): + try: + parser.process_pages() + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + except scraperwiki.CPUTimeExceededError, e: + errors.append("Processing pages interrupted") + +def process_journal_pdfs(parser, listurl, errors): +# print "Finding PDFs on " + listurl +# u = urllib.parse.urlparse(listurl) + html = scraperwiki.scrape(listurl) + root = lxml.html.fromstring(html) + html = None + for ahref in root.cssselect("ul a"): + href = ahref.attrib['href'] + url = urlparse.urljoin(listurl, href) + if -1 != href.find("file://") or -1 == url.find(".pdf"): +# print "Skipping non-http URL " + url + continue + if parser.is_already_scraped(url): + True +# print "Skipping already scraped " + url + else: +# print "Will process " + url + process_pdf(parser, url, errors) + +def test_small_pdfs(parser): + # Test with some smaller PDFs + errors = [] + process_pdf(parser, "http://www.ntnu.no/offjour/2012-06.25.pdf", errors) + process_pdf(parser, "http://www.ntnu.no/offjour/2012-06.13.pdf ", errors) # Strange format? + process_page_queue(parser, errors) + report_errors(errors) + exit(0) + +errors = [] +parser = postlistelib.PDFJournalParser(agency=agency) + +#test_small_pdfs(parser) + +process_journal_pdfs(parser, "http://www.ntnu.no/aktuelt/offentlig-journal", errors) +process_page_queue(parser, errors) +report_errors(errors) + diff --git a/scrapersources/postliste-oep b/scrapersources/postliste-oep new file mode 100644 index 0000000..c7fdc82 --- /dev/null +++ b/scrapersources/postliste-oep @@ -0,0 +1,336 @@ +# -*- coding: UTF-8 -*- + +import scraperwiki +import lxml.html +import datetime +import time +import resource +import httplib +import urllib2 + +# Try several times as the database get bigger +writetries = 5 + +# http://www.oep.no/search/resultSingle.html?journalPostId=1000000 +# http://www.oep.no/search/resultSingle.html?journalPostId=3889259 + +# <table class="defaultTable"> +# <tr> +# <th class="noLeftBorder" style="width: 20%;">Agency:</th> +# <td class="noRightBorder" style="width: 80%;">Kulturdepartementet</td> +# </tr> +# <tr> +# <th class="noLeftBorder">Case:</th> +# <td class="noRightBorder">DNT Oslo og Omegn - rehabilitering og utvidelse av turisthytta Snøheim pÃ¥ Dovre - spillemidler til anlegg for friluftsliv i fjellet 2011</td> +# </tr> +# <tr> +# <th class="noLeftBorder">Document title:</th> +# <td class="noRightBorder">DNT Oslo og Omegn - turisthytta Snøheim pÃ¥ Dovre - eventuelt navnebytte</td> +# </tr> +# <tr> +# <th class="noLeftBorder">Case number:</th> +# <td class="noRightBorder">2010/04027</td> +# </tr> +# <tr> +# <th class="noLeftBorder">Document number:</th> +# <td class="noRightBorder">4</td> +# </tr> +# <tr> +# <th class="noLeftBorder">Document type:</th> +# <td class="noRightBorder"> +# +# +# +# Outgoing +# +# +# </td> +# </tr> +# +# +# <tr> +# <th class="noLeftBorder">Recipient:</th> +# <td class="noRightBorder">Den Norske Turistforening</td> +# </tr> +# +# <tr> +# <th class="noLeftBorder">Document date:</th> +# <td class="noRightBorder">2010-12-13</td> +# </tr> +# <tr> +# <th class="noLeftBorder">Record entry date:</th> +# <td class="noRightBorder"> +# +# +# +# 2010-12-14 +# +# +# </td> +# </tr> +# <tr> +# <th class="noLeftBorder">Published in OEP</th> +# <td class="noRightBorder">2011-01-03</td> +# </tr> +# <tr> +# <th class="noLeftBorder" 
title="Hvis dokumentet er unntatt offentlighet kan unntaket gjelde hele eller deler av dokumentet."><span class="dottedBorderBottom">Grounds for exemption, document:</span></th> +# <td class="noRightBorder"> +# +# </td> +# </tr> +# <tr> +# <th class="noLeftBorder">Archive code:</th> +# <td class="noRightBorder"> +# +# </td> +# </tr> +# <tr> +# <th class="noLeftBorder">Contact point:</th> +# <td class="noRightBorder"> +# <br /> +# Tel.: 22 24 90 90<br /> +# Email: <a href="mailto:postmottak@kud.dep.no" title="Send email">postmottak@kud.dep.no</a> +# </td> +# </tr> +# </table> + +def cpu_spent(): + usage = resource.getrusage(resource.RUSAGE_SELF) + return getattr(usage, 'ru_utime') + getattr(usage, 'ru_stime') + +def url_from_id(id): + return "http://www.oep.no/search/resultSingle.html?journalPostId=" + str(id) + +def save(data): + for run in range(1,writetries): + try: + scraperwiki.sqlite.save(unique_keys=['journalPostId'], data=data) + return + except scraperwiki.sqlite.SqliteError, e: + print "Sqlite write error, trying again" + time.sleep(22) + raise scraperwiki.sqlite.SqliteError("Unable to write to database, tried " + str(writetries) + " times") + +def save_var(var, data): + for run in range(1,writetries): + try: + scraperwiki.sqlite.save_var(var, data) + return + except scraperwiki.sqlite.SqliteError, e: + print "Sqlite write error, trying again" + time.sleep(22) + raise scraperwiki.sqlite.SqliteError("Unable to write variable " + var + " to database, tried " + str(writetries) + " times") + +fieldmap = { + 'Agency' : 'agency', + 'Record entry date' : 'recorddate', + 'Case' : 'casedesc', + 'Case number' : 'caseid', + 'Document number' : 'casedocseq', + 'Document date' : 'docdate', + 'Document title' : 'docdesc', + 'Document type' : 'doctype', + 'Grounds for exemption document' : 'exemption', + 'Recipient' : 'recipient', + 'Sender' : 'sender', + 'Published in OEP' : 'recordpublishdate', +# 'Archive code', +# 'Contact point', +# 'journalPostId', +# 'scrapestamputc', +} + +doctypemap = { + 'Incoming' : 'I', + 'Outgoing' : 'U', + 'internal' : 'X', +} + +def fetch_oep_entry(id, datastorage): + oepurl = url_from_id(id) + html = scraperwiki.scrape(oepurl) + root = lxml.html.fromstring(html.decode('utf-8')) + data = { 'journalPostId' : id } + for tr in root.cssselect("table.defaultTable tr"): + vtype = tr.cssselect("th")[0].text_content().strip().replace(":", "").replace(",", "") + value = tr.cssselect("td")[0].text_content().strip() + #print '"' + vtype + '"', '"'+value+'"' + if (vtype == 'Record entry date' and value == 'Not stated.') or \ + (vtype == 'Document type' and value == '-') or \ + (vtype == 'Case number' and value == ''): + return -1 + if vtype in fieldmap: + vtype = fieldmap[vtype] + if 'doctype' == vtype: + value = doctypemap[value] + if 'caseid' == vtype: + caseyear, caseseqnr = value.split("/") + data['caseyear'] = caseyear + data['caseseqnr'] = caseseqnr + data[vtype] = value +# print str(id) + ": " + str(data) + data['scrapestamputc'] = datetime.datetime.now() +# print data['scrapestamputc'] +# exit () + + datastorage.append(data) +# scraperwiki.sqlite.save(unique_keys=['journalPostId'], data=data) + return 0 + +def fetch_range(first, last, step): + myskiplimit = skiplimit + datastorage = [] + skipped = 0 + fetched = 0 + min_id = first + for id in range(first, last, step): + try: + tries = 3 + while 0 < tries: + tries = tries - 1 + try: + if -1 == fetch_oep_entry(id, datastorage): + skipped = skipped + 1 + if skipped == myskiplimit and myskiplimit == skiplimit: + tmp = [] + 
for limit in [250, 500, 800, 1000, 1200, 1500, 1700, 2000, 3000, 5000]: + testid = id + limit * step + if -1 != fetch_oep_entry(testid, tmp): + print "Looking "+str(limit)+" ahead, found " + url_from_id(testid) + myskiplimit = skiplimit + limit + 1 + break + break + else: + fetched = fetched + 1 + skipped = 0 + myskiplimit = skiplimit + break + except urllib2.HTTPError, e: # Because HTTPError lack reason due to bug + print "URLError triggered for url " + url_from_id(id) + ", trying again: " + str(e.msg) + except urllib2.URLError, e: + print "URLError triggered for url " + url_from_id(id) + ", trying again: " + str(e.reason) + except httplib.BadStatusLine, e: + # e.msg do not exist. trying .reason 2012-06-25 + print "BadStatusLine triggered for url " + url_from_id(id) + ", trying again: " + str(e.reason) + + if skipped >= myskiplimit: + print "Reached end of list, exiting at " + str(id) + break + if 50 <= len(datastorage): + save(data=datastorage) + datastorage = [] + + # Only do this for every 50 ID tested, to avoid spending too much CPU seconds updating the sqlite file + if 0 == (id % 50): + if id < min_id: + min_id = id +# print "Updating min_id to " + str(min_id) + save_var('min_tested_id', min_id) + if cpu_spent() > 79: + print "Running short on CPU time, exiting at " + str(id) + break + time.sleep(0.2) + except scraperwiki.CPUTimeExceededError: + if 0 < len(datastorage): + save(data=datastorage) + datastorage = [] + print "CPU exception caught" + raise + except: + print "Error, unexpected exception" + raise + if 0 < len(datastorage): + save(data=datastorage) + datastorage = [] + return fetched + +def rename_sql_columns(): + print "Dropping temp table" + scraperwiki.sqlite.execute("DROP TABLE IF EXISTS swdatanew") + print "Creating table" + scraperwiki.sqlite.execute("CREATE TABLE IF NOT EXISTS swdatanew (agency text, recorddate text, casedesc text, caseid text, casedocseq integer, docdate text, docdesc text, doctype text, exemption text, recipient text, sender text, recordpublishdate text, `Archive code` text, `Contact point` text, `journalPostId` integer, scrapestamputc text)") + print "Copying table" + scraperwiki.sqlite.execute("INSERT INTO swdatanew(agency, recorddate, casedesc, caseid, casedocseq, docdate, docdesc, doctype, exemption, recipient, sender, recordpublishdate, `Archive code`, `Contact point`, `journalPostId`, scrapestamputc) SELECT `Agency`, `Record entry date`, `Case`, `Case number`, `Document number`, `Document date`, `Document title`, `Document type`, `Grounds for exemption document`, `Recipient`, `Sender`, `Published in OEP`, `Archive code`, `Contact point`, `journalPostId`, `scrapestamputc` FROM swdata") + + scraperwiki.sqlite.execute("ALTER TABLE swdata RENAME TO swdataold") + scraperwiki.sqlite.execute("ALTER TABLE swdatanew RENAME TO swdata") + scraperwiki.sqlite.commit() + exit(0) + +def create_indexes(): + for field in ['doctype', 'agency', 'recorddate', 'caseid']: + print "Creating %s index" % field + scraperwiki.sqlite.execute("CREATE INDEX IF NOT EXISTS swdata_%s_index ON swdata (%s)" % (field, field)) + scraperwiki.sqlite.commit() + +def update_doctypes(): + print "Updating doctype" + agencies = [] + for agencyref in scraperwiki.sqlite.select("distinct agency from swdata"): + agencies.append(agencyref['agency']) + + # Updating individual agencies to try to avoid SQL timeout + for agency in agencies: + print "Updating doctype for " + agency + scraperwiki.sqlite.execute("UPDATE swdata set doctype = 'I' where agency = ? 
and doctype = 'Incoming'", (agency)) + scraperwiki.sqlite.execute("UPDATE swdata set doctype = 'U' where agency = ? and doctype = 'Outgoing'", (agency)) + scraperwiki.sqlite.execute("UPDATE swdata set doctype = 'X' where agency = ? and doctype = 'internal'", (agency)) + scraperwiki.sqlite.commit() + exit(0) + +def update_caseyear(): + print "Updating caseyear and caseseqnr" + agencies = [] + for agencyref in scraperwiki.sqlite.select("distinct agency from swdata WHERE caseyear is NULL"): + agencies.append(agencyref['agency']) + + # Updating individual agencies to try to avoid SQL timeout + for agency in agencies: + print "Updating caseyear for " + agency + res = scraperwiki.sqlite.execute("select journalPostId, substr(caseid, 1, 4), substr(caseid, 6) from swdata where agency = ? and caseyear is NULL limit 2", (agency)) + print res + scraperwiki.sqlite.execute("UPDATE swdata set caseyear = substr(caseid, 1, 4), caseseqnr = substr(caseid, 6) where agency = ? AND caseyear is NULL", (agency)) + scraperwiki.sqlite.commit() + exit(0) + +def remove_original(): + scraperwiki.sqlite.execute("DROP TABLE IF EXISTS swdataold") + scraperwiki.sqlite.commit() + exit(0) + +#update_caseyear() + +#create_indexes() + +#rename_sql_columns() +#remove_original() + +# This one give me SQL timeout +#update_doctypes() + +print "Starting to fetch journal entries " + str(datetime.datetime.now()) +count = 10000 +skiplimit = 500 +# Random value fairly close to the most recent ID when this project started 2012-05-03 +max = min = startid = 3889259 +try: + max = scraperwiki.sqlite.select("max(journalPostId) as max from swdata")[0]["max"] + if 0 < scraperwiki.sqlite.get_var('min_tested_id'): + saved_min = scraperwiki.sqlite.get_var('min_tested_id') + sql_min = scraperwiki.sqlite.select("min(journalPostId) as min from swdata")[0]["min"] + print "Saved min: " + str(saved_min) + ", sql min: " + str(sql_min) + if sql_min < saved_min: + min = sql_min + else: + min = saved_min + + print "Scraping " + str(count) + " IDs below " + str(min) + " and above " + str(max) +except scraperwiki.sqlite.SqliteError: + pass + +fetched = fetch_range(max + 1, max + count, 1) +print "Fetched " + str(fetched) + " new journal entries, cpu spent: " + str(cpu_spent()) +if min >= 0: + fetched = fetch_range(min, min - count, -1) + print "Fetched " + str(fetched) + " old journal entries, cpu spent: " + str(cpu_spent()) + diff --git a/scrapersources/postliste-oep-deliverydates b/scrapersources/postliste-oep-deliverydates new file mode 100644 index 0000000..f04ce49 --- /dev/null +++ b/scrapersources/postliste-oep-deliverydates @@ -0,0 +1,37 @@ +import scraperwiki +import lxml.html +import datetime +import resource +import dateutil.parser +import resource + +def cpu_spent(): + usage = resource.getrusage(resource.RUSAGE_SELF) + return getattr(usage, 'ru_utime') + getattr(usage, 'ru_stime') + +def fetch_oep_deliverydates(url, datastorage): + html = scraperwiki.scrape(url) + root = lxml.html.fromstring(html.decode('utf-8')) + data = { 'scrapedurl' : id } + for tr in root.cssselect("table.defaulttable tr"): + if 3 == len(tr.cssselect("td")): + data = { 'scrapedurl' : url } + #print tr +# vtype = tr.cssselect("th")[0].text_content().strip().replace(":", "").replace(",", "") + agency = tr.cssselect("td")[0].text_content().strip() + deliverydate = tr.cssselect("td")[1].text_content().strip() + if deliverydate == "Levert": + continue + data['agency'] = agency + #print "D: '" + deliverydate + "'" + data['deliverydate'] = dateutil.parser.parse(deliverydate, 
dayfirst=True) + data['scrapestamputc'] = datetime.datetime.now() + datastorage.append(data) + return 0 + +datastorage = [] +fetch_oep_deliverydates("http://www.oep.no/pub/faces/statistikk.jsp?reposId=3", datastorage) +print datastorage +scraperwiki.sqlite.save(unique_keys=['agency', 'deliverydate'], data=datastorage) + +print "Starting to fetch journal delivery dates " + str(datetime.datetime.now()) diff --git a/scrapersources/postliste-oslo-bydel-ullern b/scrapersources/postliste-oslo-bydel-ullern new file mode 100644 index 0000000..54a5031 --- /dev/null +++ b/scrapersources/postliste-oslo-bydel-ullern @@ -0,0 +1,85 @@ +# -*- coding: UTF-8 -*- +# Based on the scraper advanced-scraping-pdf +# See also +# https://views.scraperwiki.com/run/pdf-to-html-preview-1/?url=http%3A%2F%2Fwww.stortinget.no%2FGlobal%2Fpdf%2Fpostjournal%2Fpj-2012-05-09.pdf + +import scraperwiki +import json +from BeautifulSoup import BeautifulSoup +import datetime +import dateutil.parser +import lxml.html +import urlparse +import re +#lazycache=scraperwiki.swimport('lazycache') +postlistelib=scraperwiki.swimport('postliste-python-lib') + +agency = 'Oslo kommune, Ullern bydel' + +def report_errors(errors): + if 0 < len(errors): + print "Errors:" + for e in errors: + print e + exit(1) +def out_of_cpu(arg, spent, hard, soft): + report_errors(arg) + +def process_pdf(parser, pdfurl, errors): + errors = [] + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + try: + pdfcontent = scraperwiki.scrape(pdfurl) + parser.preprocess(pdfurl, pdfcontent) + pdfcontent = None +# except ValueError, e: +# errors.append(e) + except IndexError, e: + errors.append(e) + +def process_page_queue(parser, errors): + try: + parser.process_pages() + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + except scraperwiki.CPUTimeExceededError, e: + errors.append("Processing pages interrupted") + +def process_journal_pdfs(parser, listurl, errors): +# print "Finding PDFs on " + listurl +# u = urllib.parse.urlparse(listurl) + html = scraperwiki.scrape(listurl) + root = lxml.html.fromstring(html) + html = None + for ahref in root.cssselect("table a"): + href = ahref.attrib['href'] + url = urlparse.urljoin(listurl, href) + if -1 != href.find("file://") or -1 != href.find("mailto:"): +# print "Skipping non-http URL " + url + continue + if parser.is_already_scraped(url): + True +# print "Skipping already scraped " + url + else: +# print "Will process " + url + process_pdf(parser, url, errors) + +def test_small_pdfs(parser): + # Test with some smaller PDFs + errors = [] + parser.debug = True + process_pdf(parser, "http://www.bydel-ullern.oslo.kommune.no/getfile.php/bydel%20ullern%20(BUN)/Internett%20(BUN)/Dokumenter/dokument/postjournal/120502.pdf", errors) + process_page_queue(parser, errors) + report_errors(errors) + exit(0) + +print "Starting scraping of " + agency +parser = postlistelib.PDFJournalParser(agency=agency) +#parser.debug = True + +#test_small_pdfs(parser) + +errors = [] +process_journal_pdfs(parser, "http://www.bydel-ullern.oslo.kommune.no/postjournal/", errors) +process_page_queue(parser, errors) +report_errors(errors) + diff --git a/scrapersources/postliste-oslo-gravferdsetaten b/scrapersources/postliste-oslo-gravferdsetaten new file mode 100644 index 0000000..7becd10 --- /dev/null +++ b/scrapersources/postliste-oslo-gravferdsetaten @@ -0,0 +1,90 @@ +# -*- coding: UTF-8 -*- + +import scraperwiki +import json +from BeautifulSoup import BeautifulSoup +import datetime +import dateutil.parser +import lxml.html +import resource 
+import sys +import urlparse +import re + +scraperwiki.scrape("http://www.gravferdsetaten.oslo.kommune.no/offentlig_journal/article43281-14384.html") + +lazycache=scraperwiki.swimport('lazycache') +postlistelib=scraperwiki.swimport('postliste-python-lib') + +agency = 'Oslo kommune, gravferdsetaten' + +def report_errors(errors): + if 0 < len(errors): + print "Errors:" + for e in errors: + print e + exit(1) +def out_of_cpu(arg, spent, hard, soft): + report_errors(arg) + +def process_pdf(parser, pdfurl, errors): + errors = [] + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + try: + pdfcontent = scraperwiki.scrape(pdfurl) + parser.preprocess(pdfurl, pdfcontent) + pdfcontent = None +# except ValueError, e: +# errors.append(e) + except IndexError, e: + errors.append(e) + +def process_page_queue(parser, errors): + try: + if False: + scraperwiki.sqlite.execute("delete from swdata where scrapedurl in (select scrapedurl from unparsedpages)") + scraperwiki.sqlite.execute("delete from unparsedpages") + scraperwiki.sqlite.commit() + + parser.process_pages() + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + except scraperwiki.CPUTimeExceededError, e: + errors.append("Processing pages interrupted") + +def process_journal_pdfs(parser, listurl, errors): +# print "Finding PDFs on " + listurl +# u = urllib.parse.urlparse(listurl) + html = scraperwiki.scrape(listurl) + root = lxml.html.fromstring(html) + html = None + for ahref in root.cssselect("table a"): + href = ahref.attrib['href'] + url = urlparse.urljoin(listurl, href) + if -1 != href.find("file://") or -1 != href.find("mailto:"): +# print "Skipping non-http URL " + url + continue + if parser.is_already_scraped(url): + True +# print "Skipping already scraped " + url + else: +# print "Will process " + url + process_pdf(parser, url, errors) + +def test_small_pdfs(parser): + # Test with some smaller PDFs + errors = [] + process_pdf(parser, "http://www.gravferdsetaten.oslo.kommune.no/getfile.php/gravferdsetaten%20(GFE)/Internett%20(GFE)/Dokumenter/dokument/Arkivet/Postjournal/Juni/13.06.pdf", errors) + process_pdf(parser, "http://www.gravferdsetaten.oslo.kommune.no/getfile.php/gravferdsetaten%20(GFE)/Internett%20(GFE)/Dokumenter/dokument/Arkivet/Postjournal/Juni/12.06.pdf", errors) + process_page_queue(parser, errors) + report_errors(errors) + exit(0) + +errors = [] +parser = postlistelib.PDFJournalParser(agency=agency) + +#test_small_pdfs(parser) + +process_journal_pdfs(parser, "http://www.gravferdsetaten.oslo.kommune.no/offentlig_journal/article43281-14384.html", errors) +process_page_queue(parser, errors) +report_errors(errors) + diff --git a/scrapersources/postliste-oslo-havn b/scrapersources/postliste-oslo-havn new file mode 100644 index 0000000..d453ef7 --- /dev/null +++ b/scrapersources/postliste-oslo-havn @@ -0,0 +1,86 @@ +# -*- coding: UTF-8 -*- +# Based on the scraper advanced-scraping-pdf +# See also +# https://views.scraperwiki.com/run/pdf-to-html-preview-1/?url=http%3A%2F%2Fwww.stortinget.no%2FGlobal%2Fpdf%2Fpostjournal%2Fpj-2012-05-09.pdf + +import scraperwiki +import json +from BeautifulSoup import BeautifulSoup +import datetime +import dateutil.parser +import lxml.html +import sys +import urlparse +import re +lazycache=scraperwiki.swimport('lazycache') +postlistelib=scraperwiki.swimport('postliste-python-lib') + +agency = 'Oslo kommune, Oslo Havn KF' + +def report_errors(errors): + if 0 < len(errors): + print "Errors:" + for e in errors: + print e + exit(1) +def out_of_cpu(arg, spent, hard, soft): + report_errors(arg) + 
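+# Shared error-handling pattern in these scrapers: process_pdf() below asks
+# exit_if_no_cpu_left() to check the remaining CPU budget before fetching each
+# PDF, and out_of_cpu() above is the callback that prints the collected errors
+# and exits, so the remaining pages can be picked up on the next run.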
+def process_pdf(parser, pdfurl, errors): + errors = [] + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + try: + parser.fetch_and_preprocess(pdfurl) +# except ValueError, e: +# errors.append(e) + except IndexError, e: + errors.append(e) + +def process_page_queue(parser, errors): + try: + parser.process_pages() + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + except scraperwiki.CPUTimeExceededError, e: + errors.append("Processing pages interrupted, ran out of cpu") + +def process_journal_pdfs(parser, listurl, errors): +# print "Finding PDFs on " + listurl +# u = urllib.parse.urlparse(listurl) + html = scraperwiki.scrape(listurl) + root = lxml.html.fromstring(html) + html = None + for ahref in root.cssselect("table a"): + href = ahref.attrib['href'] + url = urlparse.urljoin(listurl, href) + if -1 != href.find("file://"): +# print "Skipping non-http URL " + url + continue + if parser.is_already_scraped(url): + True +# print "Skipping already scraped " + url + else: +# print "Will process " + url + process_pdf(parser, url, errors) + +def test_pdfs(parser): + parser.debug = True + # Test with some smaller PDFs + errors = [] + process_pdf(parser, "http://www.havn.oslo.kommune.no/getfile.php/oslo%20havn%20kf%20(HAV)/Internett%20(HAV)/Dokumenter/Postjournal/Mai/24.05.2012.pdf", errors) + + # This file have a problematic format, the text fragments have a different order than most + # journal PDFs. + process_pdf(parser, "http://www.havn.oslo.kommune.no/getfile.php/oslo%20havn%20kf%20%28HAV%29/Internett%20%28HAV%29/Dokumenter/Postjournal/Mars/1%20MTMzMDY4NjY3ODI5OTk5Mz.pdf", errors) + process_page_queue(parser, errors) + report_errors(errors) + exit(0) + +parser = postlistelib.PDFJournalParser(agency=agency) + +#test_pdfs(parser) + +errors = [] +process_journal_pdfs(parser, "http://www.havn.oslo.kommune.no/postjournal/", errors) +process_page_queue(parser, errors) +report_errors(errors) + diff --git a/scrapersources/postliste-oslo-radhusets-forvaltningstjeneste b/scrapersources/postliste-oslo-radhusets-forvaltningstjeneste new file mode 100644 index 0000000..4f9b5c1 --- /dev/null +++ b/scrapersources/postliste-oslo-radhusets-forvaltningstjeneste @@ -0,0 +1,231 @@ +# -*- coding: UTF-8 -*- + +import scraperwiki +import json +from BeautifulSoup import BeautifulSoup +import datetime +import dateutil.parser +import lxml.html +import urlparse +import re + +# Make sure Scraperwiki believe this is the source from this database +scraperwiki.scrape("http://www.radhusets-forvaltningstjeneste.oslo.kommune.no/postjournal/") + +lazycache=scraperwiki.swimport('lazycache') +postlistelib=scraperwiki.swimport('postliste-python-lib') + +agency = 'Oslo kommune, Rådhusets forvaltningstjeneste' + +def report_errors(errors): + if 0 < len(errors): + print "Errors:" + for e in errors: + print e + exit(1) + +def out_of_cpu(arg, spent, hard, soft): + report_errors(arg) + +# Input YY/X-Z, return YYYY, X, Z +def split_docid(docid): + caseyear, rest = docid.split('/') + caseseqnr, casedocseq = rest.split('-') + caseyear = int(caseyear) + caseseqnr = int(caseseqnr) + casedocsec = int(casedocseq) + if caseyear < 50: + caseyear = caseyear + 2000 + if 50 <= caseyear and caseyear < 100: + caseyear = caseyear + 1900 + return caseyear, caseseqnr, casedocseq + +# Input DDMMYYYY, output YYYY-MM-DD +def parse_date(date): + if 'Udatert' == date: + return None + year = date[4:8] + month = date[2:4] + day = date[0:2] + isodate = year + "-" + month + "-" + day + #print date, isodate + return dateutil.parser.parse(isodate, 
dayfirst=True).date() + +def parse_entry(pdfurl, lines): + print lines + print "Entry lines " + str(len(lines)) + entry = { + 'agency' : agency, + 'scrapedurl' : pdfurl, + } + cur = 0 + while cur < len(lines): + line = lines[cur].text + #print line + if -1 != line.find('Dok.dato:'): + entry['docid'] = lines[cur-2].text + entry['doctype'] = lines[cur-1].text + entry['docdate'] = parse_date(line.replace("Dok.dato:", "")) + caseyear, caseseqnr, casedocseq = split_docid(entry['docid']) + entry['caseyear'] = caseyear + entry['caseseqnr'] = caseseqnr + entry['casedocseq'] = casedocseq + entry['caseid'] = str(caseyear) + '/' + str(caseseqnr) + if -1 != line.find('Jour.dato:'): + entry['recorddate'] = parse_date(lines[cur+1].text) + cur = cur + 1 + if -1 != line.find('Arkivdel:'): + entry['arkivdel'] = lines[cur+1].text + cur = cur + 1 + if -1 != line.find('Tilg. kode:'): + entry['tilgangskode'] = line.replace("Tilg. kode:", "") + if -1 != line.find('Sak:'): + entry['casedesc'] = lines[cur+1].text + cur = cur + 1 + if -1 != line.find('Dok:'): + entry['docdesc'] = lines[cur+1].text + cur = cur + 1 + if -1 != line.find('Par.:'): + entry['exemption'] = line.replace("Par.:", "") + cur = cur + 1 + if -1 != line.find('Avsender:'): + entry['sender'] = lines[cur+1].text + cur = cur + 1 + if -1 != line.find('Mottaker:'): + entry['recipient'] = lines[cur+1].text + cur = cur + 1 + if -1 != line.find('Saksansv:'): + entry['saksansvarlig'] = line.replace("Saksansv:", "").strip() + if -1 != line.find('Saksbeh:'): + entry['saksbehandler'] = lines[cur+1].text + cur = cur + 1 + cur = cur + 1 + print entry + if 'docid' in entry: + scraperwiki.sqlite.save(unique_keys=['docid'], data=entry) + #return + +def parse_pdf(pdfurl, pdfcontent): + pdfxml = scraperwiki.pdftoxml(pdfcontent) + pages=re.findall('(<page .+?</page>)',pdfxml,flags=re.DOTALL) + for page in pages: + s = BeautifulSoup(page) + lines = s.findAll('text') + last = 0 + cur = 0 + while cur < len(lines): + #print cur, lines[cur] + if -1 != lines[cur].text.find('Dok.dato:'): + print last, cur-2 + parse_entry(pdfurl, lines[last:cur-2]) + last = cur - 2 + cur = cur + 1 + return + if False: + cur = 0 + entry = { 'agency' : agency, 'scrapedurl' : pdfurl } + while cur < len(lines): + line = lines[cur].text + #print line + if -1 != line.find('Dok.dato:'): + entry['docid'] = lines[cur-2].text + entry['doctype'] = lines[cur-1].text + entry['docdate'] = parse_date(line.replace("Dok.dato:", "")) + caseyear, caseseqnr, casedocseq = split_docid(entry['docid']) + entry['caseyear'] = caseyear + entry['caseseqnr'] = caseseqnr + entry['casedocseq'] = casedocseq + entry['caseid'] = str(caseyear) + '/' + str(caseseqnr) + if -1 != line.find('Jour.dato:'): + entry['recorddate'] = parse_date(lines[cur+1].text) + cur = cur + 1 + if -1 != line.find('Arkivdel:'): + entry['arkivdel'] = lines[cur+1].text + cur = cur + 1 + if -1 != line.find('Tilg. kode:'): + entry['tilgangskode'] = line.replace("Tilg. 
kode:", "") + if -1 != line.find('Sak:'): + entry['casedesc'] = lines[cur+1].text + cur = cur + 1 + if -1 != line.find('Dok:'): + entry['docdesc'] = lines[cur+1].text + cur = cur + 1 + if -1 != line.find('Par.:'): + entry['exemption'] = line.replace("Par.:", "") + cur = cur + 1 + if -1 != line.find('Avsender:'): + entry['sender'] = lines[cur+1].text + cur = cur + 1 + if -1 != line.find('Mottaker:'): + entry['recipient'] = lines[cur+1].text + cur = cur + 1 + if -1 != line.find('Saksansv:'): + entry['saksansvarlig'] = line.replace("Saksansv:", "").strip() + if -1 != line.find('Saksbeh:'): + entry['saksbehandler'] = lines[cur+1].text + cur = cur + 1 + print entry + scraperwiki.sqlite.save(unique_keys=['docid'], data=entry) + entry = { 'agency' : agency, 'scrapedurl' : pdfurl } + cur = cur + 1 + #return + +def process_pdf(parser, pdfurl, errors): + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + try: + #if True: + pdfcontent = scraperwiki.scrape(pdfurl) + parse_pdf(pdfurl, pdfcontent) + #parser.preprocess(pdfurl, pdfcontent) + pdfcontent = None +# except ValueError, e: +# errors.append(e) + #except IndexError, e: + # errors.append(e) + except Exception, e: + print e + +def process_page_queue(parser, errors): + try: + parser.process_pages() + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + except scraperwiki.CPUTimeExceededError, e: + errors.append("Processing pages interrupted") + +def process_journal_pdfs(parser, listurl, errors): +# print "Finding PDFs on " + listurl +# u = urllib.parse.urlparse(listurl) + html = scraperwiki.scrape(listurl) + root = lxml.html.fromstring(html) + html = None + for ahref in root.cssselect("table a"): + href = ahref.attrib['href'] + url = urlparse.urljoin(listurl, href) + if -1 != href.find("file://") or -1 == url.find(".pdf"): +# print "Skipping non-http URL " + url + continue + if parser.is_already_scraped(url): + True +# print "Skipping already scraped " + url + else: +# print "Will process " + url + process_pdf(parser, url, errors) + +def test_small_pdfs(parser): + # Test with some smaller PDFs + errors = [] + process_pdf(parser, "http://www.radhusets-forvaltningstjeneste.oslo.kommune.no/getfile.php/rådhusets%20forvaltningstjeneste%20(RFT)/Intranett%20(RFT)/Dokumenter/Postjournal/11%20November/29112011.pdf", errors) + process_pdf(parser, "http://www.radhusets-forvaltningstjeneste.oslo.kommune.no/getfile.php/rådhusets%20forvaltningstjeneste%20(RFT)/Intranett%20(RFT)/Dokumenter/Postjournal/12%20Desember/02122011.pdf", errors) + process_page_queue(parser, errors) + report_errors(errors) + exit(0) + +errors = [] +parser = postlistelib.PDFJournalParser(agency=agency) + +#test_small_pdfs(parser) + +process_journal_pdfs(parser, "http://www.radhusets-forvaltningstjeneste.oslo.kommune.no/postjournal/", errors) +#process_page_queue(parser, errors) +report_errors(errors) + diff --git a/scrapersources/postliste-python-lib b/scrapersources/postliste-python-lib new file mode 100644 index 0000000..042d1fd --- /dev/null +++ b/scrapersources/postliste-python-lib @@ -0,0 +1,577 @@ +# -*- coding: utf-8 -*- +# +# Python library for parsing public post journals (postlister) in Norway. 
+# + +# Based on the scraper advanced-scraping-pdf +# +# See also +# https://views.scraperwiki.com/run/pdf-to-html-preview-1/ + +# Possible sources using format 1 pdf: +# www.bydel-ullern.oslo.kommune.no +# www.gravferdsetaten.oslo.kommune.no +# www.halden.kommune.no (done) +# www.havn.oslo.kommune.no (done) +# www.hvaler.kommune.no (done) +# www.kafjord.kommune.no +# www.lier.kommune.no +# www.lindesnes.kommune.no +# www.naroy.kommune.no +# www.saltdal.kommune.no +# www.sogne.kommune.no +# www.vikna.kommune.no +# +# Google search to find more: "Offentlig journal" Seleksjon Sakstittel Dokumenttype Status filetype:pdf + + +import scraperwiki +import string +import re +from BeautifulSoup import BeautifulSoup +import datetime +import dateutil.parser + +def cpu_spent(): + import resource + usage = resource.getrusage(resource.RUSAGE_SELF) + return getattr(usage, 'ru_utime') + getattr(usage, 'ru_stime') + +def exit_if_no_cpu_left(retval, callback=None, arg = None): + import resource + soft, hard = resource.getrlimit(resource.RLIMIT_CPU) + spent = cpu_spent() + if soft < spent: + if callback is not None: + callback(arg, spent, hard, soft) + print "Running out of CPU, exiting." + exit(retval) + +def fetch_url_harder(url, scraper = None): + import urllib2 + html = None + for n in [1, 2, 3]: + try: + if None == scraper: + scraper = scraperwiki.scrape + html = scraper(url) + break + except urllib2.URLError, e: + print "URLError fetching " + url + ", trying again" + return html + +class JournalParser: + agency = None + debug = False + + validdoctypes = ['I', 'U', 'X', 'N'] + senderdoctypes = ['I', 'X', 'N'] + recipientdoctypes = ['U'] + mustfields = { + 'agency' : 1, + 'docdesc' : 1, + 'doctype' : 1, + 'caseyear' : 1, + 'caseseqnr' : 1, + 'casedocseq' : 1, + } + + def __init__(self, agency): + self.agency = agency + + def is_valid_doctype(self, doctype): + return doctype in self.validdoctypes + + def is_sender_doctype(self, doctype): + return doctype in self.senderdoctypes + + def is_recipient_doctype(self, doctype): + return doctype in self.recipientdoctypes + + def verify_entry(self, entry): + + for field in self.mustfields: + if not field in entry: + raise ValueError("Missing required field " + field) + + if not self.is_valid_doctype(entry['doctype']): + raise ValueError("Invalid doctype " + doctype) + + if -1 != entry['caseid'].find('-'): + raise ValueError("Field caseid should not include dash: " + entry['caseid']) + +# +# Parser of PDFs looking like +# http://www.storfjord.kommune.no/postliste-18-mai-2012.5056067-105358.html (type 1) +# http://www.hadsel.kommune.no/component/docman/doc_download/946-offentlig-postjournal-28032012 (type 2) +# http://www.stortinget.no/Global/pdf/postjournal/pj-2011-06-23.pdf (type 2 variant) +# Note sender/receiver is not yet parsed for type 2 PDFs +class PDFJournalParser(JournalParser): + pagetable = "unparsedpages" + brokenpagetable = "brokenpages" + hiddentext = False + breakonfailure = True + + def __init__(self, agency, hiddentext=False): + self.hiddentext = hiddentext + JournalParser.__init__(self, agency=agency) + + def is_already_scraped(self, url): + # Ignore entries were sender and recipient is the result of a broken parser (before 2012-05-25) + for sql in ["scrapedurl, sender, recipient from swdata where scrapedurl = '" + url + "' " + + # FIXME Figure out why this do not work + #" and not (sender = 'parse error' or recipient != 'parse error') " + + "limit 1", + "scrapedurl from " + self.pagetable + " where scrapedurl = '" + url + "' limit 1"]: + try: + 
result = scraperwiki.sqlite.select(sql) + #int sql, " : ", result + if 0 < len(result) and u'scrapedurl' in result[0]: + return True + except Exception as e: + #if ('no such table: %s' % self.pagetable) not in str(e) and 'no such table: swdata' not in str(e): + # raise + print "Exception: %s" % e + return False + + # Check if we recognize the page content, and throw if not + def is_valid_page(self, pdfurl, pagenum, pagecontent): + s = BeautifulSoup(pagecontent) + for t in s.findAll('text'): + if t.text != " ": + if 'Innhold:' == t.text: # type 1 or 2 (ePhorge) + s = None + return True + if 'Arkivdel:' == t.text]: # type 3 (doculive) + s = None + return True + s = None + if self.debug: + print "Unrecognized page format for " + pdfurl + raise ValueError("Unrecognized page format for " + pdfurl) + + # + # Split PDF content into pages and store in SQL table for later processing. + # The process is split in two to better handle parge PDFs (like 600 pages), + # without running out of CPU time without loosing track of what is left to + # parse. + def preprocess(self, pdfurl, pdfcontent): + print "Preprocessing PDF " + pdfurl + if not pdfcontent: + raise ValueError("No pdf content passed for " + pdfurl) + if self.hiddentext: + options = '-hidden' + else: + options = '' + xml=scraperwiki.pdftoxml(pdfcontent, options) + if self.debug: + print xml + pages=re.findall('(<page .+?</page>)',xml,flags=re.DOTALL) + xml=None +# print pages[:1][:1000] + pagecount = 0 + datastore = [] + for page in pages: + pagecount = pagecount + 1 + self.is_valid_page(pdfurl, pagecount, page) + data = { + 'scrapedurl' : pdfurl, + 'pagenum' : pagecount, + 'pagecontent' : page, + } + datastore.append(data) + if 0 < len(datastore): + scraperwiki.sqlite.save(unique_keys=['scrapedurl', 'pagenum'], data=datastore, table_name=self.pagetable) + else: + raise ValueError("Unable to find any pages in " + pdfurl) + pages = None + + def fetch_and_preprocess(self, pdfurl): + pdfcontent = fetch_url_harder(pdfurl) + self.preprocess(pdfurl, pdfcontent) + pdfcontent = None + + def print_entry(self, entrytext): + for i in range(0, len(entrytext)): + print str(i) + ": '" + entrytext[i] + "'" + + # ePhorte PDF + def parse_entry_type1(self, entrytext, pdfurl): + scrapestamputc = datetime.datetime.now() + entry = { + 'agency' : self.agency, + 'scrapestamputc' : scrapestamputc, + 'scrapedurl' : pdfurl + } + i = 0 + while i < len(entrytext): + #print "T: '" + entrytext[i] + "'" + if 'Innhold:' == entrytext[i]: + tittel = "" + # handle multi-line titles + while 'Sakstittel:' != entrytext[i+1]: + tittel = tittel + " " + entrytext[i+1] + i = i + 1 + entry['docdesc'] = tittel + if 'Sakstittel:' == entrytext[i]: + sakstittel = "" + while 'DokType' != entrytext[i+1]: +# print "'" + entrytext[i+1] + "'" + sakstittel = sakstittel + " " + entrytext[i+1] + i = i + 1 + entry['casedesc'] = sakstittel + if 'DokType' == entrytext[i]: # Values I/U/N/X from NOARK 4 table 14.2.11 + entry['doctype'] = entrytext[i+1] + # As seen on http://www.saltdal.kommune.no/images/module.files/2007-05-16.pdf, page 1 + if entry['doctype'] == 'S': + entry['doctype'] = 'X' + i = i + 1 + if 'Sak/dok nr:' == entrytext[i]: + # FIXME Split and handle combined sak/løpenr + # Use find('penr.:') to avoid non-ascii search string 'Løpenr.:' + caseid = None + lnr = None + if -1 != entrytext[i+4].find('penr.:'): + caseid = entrytext[i+1] + entrytext[i+2] + lnr = entrytext[i+3] + i = i + 4 + elif -1 != entrytext[i+3].find('penr.:'): + caseid = entrytext[i+1] + lnr = entrytext[i+2] + i = i + 3 + 
elif -1 != entrytext[i+2].find('penr.:'): + caseid, lnr = entrytext[i+1].split(" ") + i = i + 2 + + caseyear, caseseqnr = caseid.split("/") + entry['caseyear'] = int(caseyear) + caseseqnr, casedocseq = caseseqnr.split("-") + entry['caseseqnr'] = int(caseseqnr) + entry['casedocseq'] = int(casedocseq) + entry['caseid'] = caseyear + "/" + caseseqnr + + journalseqnr, journalyear = lnr.split("/") + entry['journalid'] = journalyear + "/" + journalseqnr + entry['journalyear'] = int(journalyear) + entry['journalseqnr'] = int(journalseqnr) + +# if -1 != text[i].find('penr.:'): # Use find('penr.:') to avoid non-ascii search string 'Løpenr.:' +# str = text[i-1] +# print "S: '" + str + "'" +# data['journalid'] = str +# # FIXME handle combined sak/løpenr + if 'Journaldato:' == entrytext[i]: + entry['recorddate'] = dateutil.parser.parse(entrytext[i-1], dayfirst=True) + if 'Dok.dato:' == entrytext[i]: + entry['docdate'] = dateutil.parser.parse(entrytext[i-1], dayfirst=True) + if 'Tilg.kode Hjemmel:' == entrytext[i] and 'Avsender\mottaker:' != entrytext[i+1]: + entry['exemption'] = entrytext[i+1] + i = i + 1 + if 'Tilg.kode' == entrytext[i]: + entry['accesscode'] = entrytext[i+1] + i = i + 1 + if 'Hjemmel:' == entrytext[i]: + entry['exemption'] = entrytext[i+1] + i = i + 1 + if 'Avsender\mottaker:' == entrytext[i]: + if i+1 < len(entrytext): # Non-empty field + fratil = entrytext[i+1] + i = i + 1 + if self.is_sender_doctype(entry['doctype']): + entry['sender'] = fratil + elif self.is_recipient_doctype(entry['doctype']): + entry['recipient'] = fratil + else: + raise ValueError("Case " + entry['caseid'] + " Sender/Recipient with doctype " + entry['doctype'] + " != I/U/X/N in " + pdfurl) + if self.debug: + print entry + i = i + 1 + return entry + + def parse_case_journal_ref(self, entry, reftext, pdfurl): + try: + # FIXME Split and handle combined sak/loepenr + # Use find('penr.:') to avoid non-ascii search string 'Loepenr.:' + caseid = None + lnr = None + if 4 == len(reftext): +# print "4 " + str(reftext) + caseid = reftext[0] + reftext[1] + lnr = reftext[2] + reftext[3] +# print str(caseid) + " " + str(lnr) + elif 3 == len(reftext): + if -1 != reftext[0].find("/") and -1 != reftext[2].find("/"): +# print "31" + caseid = reftext[0] + reftext[1] + lnr = reftext[2] + elif -1 != reftext[2].find("/"): +# print "32" + caseid = reftext[0] + reftext[1] + lnr = reftext[2] + elif -1 == reftext[2].find("/"): +# print "33" + caseid = reftext[0] + lnr = reftext[1] + reftext[2] + elif 2 == len(reftext): + if -1 == reftext[1].find("/"): +# print "21" + s = reftext[0] + reftext[1] +# print "S: " + s + caseid, lnr = s.split(" ") + elif -1 != reftext[1].find("/"): +# print "22" + caseid = reftext[0] + lnr = reftext[1] + elif 1 == len(reftext): + caseid, lnr = reftext[0].split(" ") + else: + raise ValueError("Unable to parse entry " + str(reftext) + " in " + pdfurl) +# print "C: " + caseid + " L: " + lnr + + caseyear, caseseqnr = caseid.split("/") + entry['caseyear'] = int(caseyear) + caseseqnr, casedocseq = caseseqnr.split("-") + entry['caseseqnr'] = int(caseseqnr) + entry['casedocseq'] = int(casedocseq) + entry['caseid'] = caseyear + "/" + caseseqnr + + journalseqnr, journalyear = lnr.split("/") + entry['journalid'] = journalyear + "/" + journalseqnr + entry['journalyear'] = int(journalyear) + entry['journalseqnr'] = int(journalseqnr) + except: + print "Unable to parse " + str(reftext) + return entry + def test_parse_case_journal_ref(self): + entry = {} + self.parse_case_journal_ref(entry, [u'2008/16414-', u'23', 
u'15060/2012'], "") + self.parse_case_journal_ref(entry, [u'2011/15972-1 102773/201', u'1'], "") + self.parse_case_journal_ref(entry, [u'2010/2593-2', u'103004/201', u'1'], "") + self.parse_case_journal_ref(entry, [u'2011/13415-', u'22', u'100077/201', u'1'], "") + + # ePhorte PDF + def parse_entry_type2(self, entrytext, pdfurl): + scrapestamputc = datetime.datetime.now() + entry = { + 'agency' : self.agency, + 'scrapestamputc' : scrapestamputc, + 'scrapedurl' : pdfurl + } + i = 0 + avsender = [] + mottaker = [] + while i < len(entrytext): + if 'Innhold:' == entrytext[i]: + tittel = "" + # handle multi-line titles + while 'Sakstittel:' != entrytext[i+1]: + tittel = tittel + entrytext[i+1] + i = i + 1 + entry['docdesc'] = tittel + if 'Sakstittel:' == entrytext[i]: + sakstittel = "" + # Klassering er i en annen dokumenttype + while 'DokType' != entrytext[i+1] and 'Dok.Type:' != entrytext[i+1] and 'Klassering:' != entrytext[i+1]: + +# print "'" + entrytext[i+1] + "'" + sakstittel = sakstittel + entrytext[i+1] + i = i + 1 + entry['casedesc'] = sakstittel + i = i + 1 + if 'DokType' == entrytext[i] or 'Dok.Type:' == entrytext[i]: # Values I/U/N/X from NOARK 4 table 14.2.11 + entry['doctype'] = entrytext[i+1] + # As seen on http://www.uis.no/getfile.php/Journal%20200612.pdf + if entry['doctype'] == 'S': + entry['doctype'] = 'X' + i = i + 1 + if 'Sak/dok nr:' == entrytext[i] or 'Sak/dok.nr:' == entrytext[i]: + endi = i + while endi < len(entrytext): + if -1 != entrytext[endi].find('penr.:') or -1 != entrytext[endi].find('penr:'): + break + endi = endi + 1 + entry = self.parse_case_journal_ref(entry, entrytext[i+1:endi], pdfurl) + i = endi + 1 +# if -1 != text[i].find('penr.:'): # Use find('penr.:') to avoid non-ascii search string 'Løpenr.:' +# str = text[i-1] +# print "S: '" + str + "'" +# data['journalid'] = str +# # FIXME handle combined sak/løpenr + if 'Journaldato:' == entrytext[i]: + entry['recorddate'] = dateutil.parser.parse(entrytext[i-1], dayfirst=True) + if 'Dok.dato:' == entrytext[i]: + entry['docdate'] = dateutil.parser.parse(entrytext[i-1], dayfirst=True) + if 'Tilg.kode Hjemmel:' == entrytext[i] and '(enhet/initialer):' != entrytext[i+2]: + entry['exemption'] = entrytext[i+1] + i = i + 1 + if 'Tilg.kode' == entrytext[i]: + entry['accesscode'] = entrytext[i+1] + i = i + 1 + if 'Hjemmel:' == entrytext[i]: + entry['exemption'] = entrytext[i+1] + i = i + 1 +# if -1 != text[i].find('Avs./mottaker:'): +# FIXME Need to handle senders and receivers + if 'Mottaker' == entrytext[i]: + mottaker.append(entrytext[i-1]) + if 'Avsender' == entrytext[i]: + avsender.append(entrytext[i-1]) +# entry['sender'] = 'parse error' +# entry['recipient'] = 'parse error' + i = i + 1 + if 0 < len(mottaker): + entry['recipient'] = string.join(mottaker, ", ") + if 0 < len(avsender): + entry['sender'] = string.join(avsender, ", ") + return entry + + def parse_page(self, pdfurl, pagenum, pagecontent): + print "Scraping " + pdfurl + " page " + str(pagenum) + s = BeautifulSoup(pagecontent) + datastore = [] + text = [] + linecount = 0 + if self.debug: + print s + for t in s.findAll('text'): + if t.text != " ": + text.append(t.text) + if self.debug: + print str(linecount) + ": " + t.text +# FIXME Remove length limit when working +# if 100 <= linecount: +# break + linecount = linecount + 1 +# if -1 != t.text.find("Side:"): +# print t.text + s = None + +# print "Found " + str(linecount) + " lines/text fragments in the PDF" + if len(text) < linecount: + raise ValueError("Text array too sort!") + + # First count how many 
entries to expect on this page, to be able to + # verify that all of them were found. + entrycount = 0 + i = 0 + while i < len(text): + if 'Innhold:' == text[i] \ # Type 1 and 2 (ePhorge) + or 'Arkivdel:' == text[i]: # type 3 (doculive) + entrycount = entrycount + 1 + i = i + 1 + + i = 0 + while i < len(text): + if self.debug: + print "T: '" + text[i] + "'" + if self.debug and -1 != text[i].find("Side:"): + print text[i] + if 'Innhold:' == text[i]: + endi = i + 1 + pdfparser = None + format = "unknown" + while endi < len(text): + if 'Klassering:' == text[endi]: + pdfparser = self.parse_entry_type2 + format = "type2" + if 'Avsender\mottaker:' == text[endi]: + pdfparser = self.parse_entry_type1 + format = "type1" + if 'Innhold:' == text[endi]: + break + endi = endi + 1 + if self.debug: + print "Entry " + str(entrycount) + " from " + str(i) + " to " + str(endi) + " ie " + str(endi - i) + " lines" + try: + entry = pdfparser(text[i:endi], pdfurl) + if 'caseid' not in entry or entry['caseid'] is None or \ + not self.is_valid_doctype(entry['doctype']): + raise ValueError("Unable to parse " + pdfurl + " as format " + format + "[" + str(entry) + "]") +# print entry + datastore.append(entry) + i = endi - 2 + except: + self.print_entry(text[i:endi]) + raise + i = i + 1 +# print data +# print "Found " + str(len(datastore)) + " of " + str(entrycount) + " entries" + if entrycount != len(datastore): +# print text + raise ValueError("Unable to parse all entries in " + pdfurl) + if 0 == len(datastore): + print "Unable to find any entries in " + pdfurl + else: + scraperwiki.sqlite.save(unique_keys=['caseid', 'casedocseq'], data=datastore) + datastore = None + text = None + + def process_pages(self): + try: + sqlselect = "* from " + self.pagetable + " limit 1" + pageref = scraperwiki.sqlite.select(sqlselect) + while pageref: + scrapedurl = pageref[0]['scrapedurl'] + pagenum = pageref[0]['pagenum'] + pagecontent = pageref[0]['pagecontent'] +# print "Found " + scrapedurl + " page " + str(pagenum) + " length " + str(len(pagecontent)) + try: + sqldelete = "delete from " + self.pagetable + " where scrapedurl = '" + scrapedurl + "' and pagenum = " + str(pagenum) + self.parse_page(scrapedurl, pagenum, pagecontent) +# print "Trying to: " + sqldelete + scraperwiki.sqlite.execute(sqldelete) + except ValueError, e: + brokenpage = { + 'scrapedurl' : scrapedurl, + 'pagenum' : pagenum, + 'pagecontent' : pagecontent, + } + print "Broken page %d from %s" % (pagenum, scrapedurl) + scraperwiki.sqlite.save(unique_keys=['scrapedurl', 'pagenum'], data=brokenpage, table_name=self.brokenpagetable) + scraperwiki.sqlite.execute(sqldelete) + scraperwiki.sqlite.commit() + pageref = scraperwiki.sqlite.select(sqlselect) + except scraperwiki.sqlite.SqliteError, e: + print str(e) + raise + +def fieldlist(): + import urllib2 + import json + + scrapers = [ + 'postliste-universitetet-i-oslo', + 'postliste-lindesnes', + 'postliste-kristiansund', + 'postliste-stortinget', + 'postliste-arendal', + 'postliste-oep', + 'postliste-ballangen', + 'postliste-hadsel', + 'postliste-storfjord', + 'postliste-oslo-havn', + ] + + keys = {} + + for scraper in scrapers: + url = 'https://api.scraperwiki.com/api/1.0/scraper/getinfo?format=jsondict&name=' + scraper + '&version=-1' + response = urllib2.urlopen(url) + html = response.read() + data = json.loads(html) + if 'swdata' in data[0]['datasummary']['tables']: + for key in data[0]['datasummary']['tables']['swdata']['keys']: + key = key.lower() + if key in keys: + keys[key].append(scraper) + else: + keys[key] 
= [scraper] + def lensort(a, b): + return cmp(len(keys[b]), len(keys[a])) + + for key in sorted(keys.keys(), lensort): + print len(keys[key]), key, str(keys[key]) + +if __name__ == "scraper": + fieldlist() + diff --git a/scrapersources/postliste-risr-kommune b/scrapersources/postliste-risr-kommune new file mode 100644 index 0000000..cb87bdb --- /dev/null +++ b/scrapersources/postliste-risr-kommune @@ -0,0 +1,126 @@ +# -*- coding: UTF-8 -*- +# Based on the scraper advanced-scraping-pdf +# See also +# https://views.scraperwiki.com/run/pdf-to-html-preview-1/?url=http%3A%2F%2Fwww.stortinget.no%2FGlobal%2Fpdf%2Fpostjournal%2Fpj-2012-05-09.pdf + +import scraperwiki +import json +from BeautifulSoup import BeautifulSoup +import datetime +import dateutil.parser +import lxml.html +#import resource +import sys +#import urlparse +#import gc +import re +#lazycache=scraperwiki.swimport('lazycache') +#postlistelib=scraperwiki.swimport('postliste-python-lib') + +agency = 'Risør kommune' + +import mechanize + +# ASPX pages are some of the hardest challenges because they use javascript and forms to navigate +# Almost always the links go through the function function __doPostBack(eventTarget, eventArgument) +# which you have to simulate in the mechanize form handling library + +# This example shows how to follow the Next page link + +url = 'http://159.171.0.169/ris/Modules/innsyn.aspx?mode=pl&SelPanel=0&ObjectType=ePhorteRegistryEntry&VariantType=Innsyn&ViewType=List&Query=RecordDate%3a%28-7%29+AND+DocumentType%3a%28I%2cU%29' +br = mechanize.Browser() + +# sometimes the server is sensitive to this information +br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')] +response = br.open(url) + +html = response.read() + + + +for pagenum in range(6): + print "Page %d page length %d" % (pagenum, len(html)) + #print html + #print "Clinicians found:", re.findall("PDetails.aspx\?ProviderId.*?>(.*?)</a>", html) + + + mnextlink = re.search("javascript:__doPostBack\('ctl00\$ctl00\$ctl00\$WebPartManager\$wp1243460126ViewPart\$ctl04',''\).>Neste", html) + #print mnextlink + if not mnextlink: + break + + br.select_form(name='aspnetForm') + br.form.set_all_readonly(False) + br['__EVENTTARGET'] = 'ctl00$ctl00$ctl00$WebPartManager$wp1243460126ViewPart$ctl04' #'ProviderSearchResultsTable1$NextLinkButton' + br['__EVENTARGUMENT'] = '' + br.submit() + + html = br.response().read() + #print len(html) + + + + +# def report_errors(errors): +# if 0 < len(errors): +# print "Errors:" +# for e in errors: +# print e +# exit(1) +# def out_of_cpu(arg, spent, hard, soft): +# report_errors(arg) +# +# def process_pdf(parser, pdfurl, errors): +# errors = [] +# postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) +# try: +# pdfcontent = scraperwiki.scrape(pdfurl) +# parser.preprocess(pdfurl, pdfcontent) +# pdfcontent = None +# # except ValueError, e: +# # errors.append(e) +# except IndexError, e: +# errors.append(e) +# +# def process_page_queue(parser, errors): +# try: +# parser.process_pages() +# postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) +# except scraperwiki.CPUTimeExceededError, e: +# errors.append("Processing pages interrupted") +# +# def process_journal_pdfs(parser, listurl, errors): +# # print "Finding PDFs on " + listurl +# # u = urllib.parse.urlparse(listurl) +# html = scraperwiki.scrape(listurl) +# root = lxml.html.fromstring(html) +# html = None +# for ahref in root.cssselect("table a"): +# href = ahref.attrib['href'] +# url = 
urlparse.urljoin(listurl, href) +# if -1 != href.find("file://"): +# # print "Skipping non-http URL " + url +# continue +# if parser.is_already_scraped(url): +# True +# # print "Skipping already scraped " + url +# else: +# # print "Will process " + url +# process_pdf(parser, url, errors) +# +# def test_small_pdfs(): +# # Test with some smaller PDFs +# errors = [] +# process_pdf("http://home.nuug.no/~pere/uio-postjournal/2011-16.pdf", errors) +# process_pdf("http://home.nuug.no/~pere/uio-postjournal/2011-52.pdf", errors) +# process_page_queue(errors) +# report_errors(errors) +# exit(0) +# +# #test_small_pdfs() +# errors = [] +# parser = postlistelib.PDFJournalParser(agency=agency) +# process_journal_pdfs(parser, "http://www.havn.oslo.kommune.no/postjournal/", errors) +# process_page_queue(parser, errors) +# report_errors(errors) + diff --git a/scrapersources/postliste-ruter b/scrapersources/postliste-ruter new file mode 100644 index 0000000..757d6be --- /dev/null +++ b/scrapersources/postliste-ruter @@ -0,0 +1,81 @@ +# -*- coding: UTF-8 -*- + +import scraperwiki +import json +from BeautifulSoup import BeautifulSoup +import datetime +import dateutil.parser +import lxml.html +import resource +import sys +import urlparse +import re +lazycache=scraperwiki.swimport('lazycache') +postlistelib=scraperwiki.swimport('postliste-python-lib') + +agency = 'Ruter AS' + +def report_errors(errors): + if 0 < len(errors): + print "Errors:" + for e in errors: + print e + exit(1) +def out_of_cpu(arg, spent, hard, soft): + report_errors(arg) + +def process_pdf(parser, pdfurl, errors): + errors = [] + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + try: + pdfcontent = scraperwiki.scrape(pdfurl) + parser.preprocess(pdfurl, pdfcontent) + pdfcontent = None +# except ValueError, e: +# errors.append(e) + except IndexError, e: + errors.append(e) + +def process_page_queue(parser, errors): + try: + parser.process_pages() + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + except scraperwiki.CPUTimeExceededError, e: + errors.append("Processing pages interrupted") + +def process_journal_pdfs(parser, listurl, errors): +# print "Finding PDFs on " + listurl +# u = urllib.parse.urlparse(listurl) + html = scraperwiki.scrape(listurl) + root = lxml.html.fromstring(html) + html = None + for ahref in root.cssselect("div.vedlegg a"): + href = ahref.attrib['href'] + url = urlparse.urljoin(listurl, href) + if -1 != href.find("file://") or -1 == url.find(".pdf"): +# print "Skipping non-http URL " + url + continue + if parser.is_already_scraped(url): + True +# print "Skipping already scraped " + url + else: +# print "Will process " + url + process_pdf(parser, url, errors) + +def test_small_pdfs(parser): + # Test with some smaller PDFs + errors = [] + process_pdf(parser, "http://www2.ruter.no/Documents/Offentlig_journal/2012_Uke_24.pdf?epslanguage=no", errors) + process_page_queue(parser, errors) + report_errors(errors) + exit(0) + +errors = [] +parser = postlistelib.PDFJournalParser(agency=agency) + +#test_small_pdfs(parser) + +process_journal_pdfs(parser, "http://www2.ruter.no/verdt-a-vite/presse/offentlig-journal/", errors) +process_page_queue(parser, errors) +report_errors(errors) + diff --git a/scrapersources/postliste-saltdal b/scrapersources/postliste-saltdal new file mode 100644 index 0000000..0650d6c --- /dev/null +++ b/scrapersources/postliste-saltdal @@ -0,0 +1,98 @@ +# -*- coding: UTF-8 -*- + +import scraperwiki +import json +from BeautifulSoup import BeautifulSoup +import datetime +import dateutil.parser 
+import lxml.html +import resource +import sys +import urllib2 +import urlparse +import re +lazycache=scraperwiki.swimport('lazycache') +postlistelib=scraperwiki.swimport('postliste-python-lib') + +agency = 'Saltdal kommune' + +def report_errors(errors): + if 0 < len(errors): + print "Errors:" + laste = None + for e in errors: + print e + laste = e + raise e + +def out_of_cpu(arg, spent, hard, soft): + report_errors(arg) + +def process_pdf(parser, pdfurl, errors): + errors = [] + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + try: + pdfcontent = scraperwiki.scrape(pdfurl) + parser.preprocess(pdfurl, pdfcontent) + pdfcontent = None +# except ValueError, e: +# errors.append(e) + except IndexError, e: + errors.append(e) + except ValueError, e: + errors.append(e) + except urllib2.HTTPError, e: + errors.append(e) + +def process_page_queue(parser, errors): + try: + parser.process_pages() + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + except scraperwiki.CPUTimeExceededError, e: + errors.append(e) + +def process_journal_pdfs(parser, listurl, errors): +# print "Finding PDFs on " + listurl +# u = urllib.parse.urlparse(listurl) + html = scraperwiki.scrape(listurl) + root = lxml.html.fromstring(html) + html = None + for ahref in root.cssselect("table a"): + href = ahref.attrib['href'] + url = urlparse.urljoin(listurl, href) + if -1 != href.find("file://") or -1 == url.find(".pdf"): +# print "Skipping non-http URL " + url + continue + if parser.is_already_scraped(url): + True +# print "Skipping already scraped " + url + else: +# print "Will process " + url + process_pdf(parser, url, errors) + +def test_small_pdfs(parser): + # Test with some smaller PDFs + errors = [] + #parser.debug = True + newurl = "http://www.saltdal.kommune.no/images/module.files/010612.pdf" + if not parser.is_already_scraped(newurl): + process_pdf(parser, newurl, errors) # New format + if parser.is_already_scraped(newurl): + print "Already parsed" + else: + raise ValueError("Failed to parse") +# process_pdf(parser, "http://www.saltdal.kommune.no/images/module.files/2007-01-31.pdf", errors) # Old format + process_page_queue(parser, errors) + report_errors(errors) + exit(0) + +errors = [] +parser = postlistelib.PDFJournalParser(agency=agency) +#parser.debug = True + +#test_small_pdfs(parser) + +process_journal_pdfs(parser, "http://www.saltdal.kommune.no/postlister.html", errors) +process_page_queue(parser, errors) +report_errors(errors) + diff --git a/scrapersources/postliste-sivilombudsmannen b/scrapersources/postliste-sivilombudsmannen new file mode 100644 index 0000000..0bf5914 --- /dev/null +++ b/scrapersources/postliste-sivilombudsmannen @@ -0,0 +1,81 @@ +# -*- coding: UTF-8 -*- + +import scraperwiki +import json +from BeautifulSoup import BeautifulSoup +import datetime +import dateutil.parser +import lxml.html +import resource +import sys +import urlparse +import re +lazycache=scraperwiki.swimport('lazycache') +postlistelib=scraperwiki.swimport('postliste-python-lib') + +agency = 'Sivilombudsmannen' + +def report_errors(errors): + if 0 < len(errors): + print "Errors:" + for e in errors: + print e + exit(1) +def out_of_cpu(arg, spent, hard, soft): + report_errors(arg) + +def process_pdf(parser, pdfurl, errors): + errors = [] + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + try: + pdfcontent = scraperwiki.scrape(pdfurl) + parser.preprocess(pdfurl, pdfcontent) + pdfcontent = None +# except ValueError, e: +# errors.append(e) + except IndexError, e: + errors.append(e) + +def 
process_page_queue(parser, errors): + try: + parser.process_pages() + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + except scraperwiki.CPUTimeExceededError, e: + errors.append("Processing pages interrupted") + +def process_journal_pdfs(parser, listurl, errors): +# print "Finding PDFs on " + listurl +# u = urllib.parse.urlparse(listurl) + html = scraperwiki.scrape(listurl) + root = lxml.html.fromstring(html) + html = None + for ahref in root.cssselect("div.rightColumn a"): + href = ahref.attrib['href'] + url = urlparse.urljoin(listurl, href) + if -1 != href.find("file://") or -1 == url.find(".pdf"): +# print "Skipping non-http URL " + url + continue + if parser.is_already_scraped(url): + True +# print "Skipping already scraped " + url + else: +# print "Will process " + url + process_pdf(parser, url, errors) + +def test_small_pdfs(parser): + # Test with some smaller PDFs + errors = [] + process_pdf(parser, "http://www.sivilombudsmannen.no/getfile.php/Dokumenter/Journaler/11.06.2012.pdf", errors) + process_page_queue(parser, errors) + report_errors(errors) + exit(0) + +errors = [] +parser = postlistelib.PDFJournalParser(agency=agency) + +#test_small_pdfs(parser) + +process_journal_pdfs(parser, "http://www.sivilombudsmannen.no/offentlig-journal/", errors) +process_page_queue(parser, errors) +report_errors(errors) + diff --git a/scrapersources/postliste-sogne b/scrapersources/postliste-sogne new file mode 100644 index 0000000..afa4fdf --- /dev/null +++ b/scrapersources/postliste-sogne @@ -0,0 +1,81 @@ +# -*- coding: UTF-8 -*- + +import scraperwiki +import json +from BeautifulSoup import BeautifulSoup +import datetime +import dateutil.parser +import lxml.html +import resource +import sys +import urlparse +import re +lazycache=scraperwiki.swimport('lazycache') +postlistelib=scraperwiki.swimport('postliste-python-lib') + +agency = 'Søgne kommune' + +def report_errors(errors): + if 0 < len(errors): + print "Errors:" + for e in errors: + print e + exit(1) +def out_of_cpu(arg, spent, hard, soft): + report_errors(arg) + +def process_pdf(parser, pdfurl, errors): + errors = [] + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + try: + pdfcontent = scraperwiki.scrape(pdfurl) + parser.preprocess(pdfurl, pdfcontent) + pdfcontent = None +# except ValueError, e: +# errors.append(e) + except IndexError, e: + errors.append(e) + +def process_page_queue(parser, errors): + try: + parser.process_pages() + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + except scraperwiki.CPUTimeExceededError, e: + errors.append("Processing pages interrupted") + +def process_journal_pdfs(parser, listurl, errors): +# print "Finding PDFs on " + listurl +# u = urllib.parse.urlparse(listurl) + html = scraperwiki.scrape(listurl) + root = lxml.html.fromstring(html) + html = None + for ahref in root.cssselect("div#ReadArea a"): + href = ahref.attrib['href'] + url = urlparse.urljoin(listurl, href) + if -1 != href.find("file://") or -1 == url.find(".pdf"): +# print "Skipping non-http URL " + url + continue + if parser.is_already_scraped(url): + True +# print "Skipping already scraped " + url + else: +# print "Will process " + url + process_pdf(parser, url, errors) + +def test_small_pdfs(parser): + # Test with some smaller PDFs + errors = [] + process_pdf(parser, "http://www.sogne.kommune.no/Documents/Postlister/2012.06.18.pdf", errors) + process_page_queue(parser, errors) + report_errors(errors) + exit(0) + +errors = [] +parser = postlistelib.PDFJournalParser(agency=agency) + +#test_small_pdfs(parser) + 
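+# Main flow: collect PDF links from the public journal page, queue each new
+# PDF page by page, then parse the queued pages and report any errors.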
+process_journal_pdfs(parser, "http://www.sogne.kommune.no/Organisasjon1/Administrasjonsavdelingen/Arkivet/Postlister/", errors) +process_page_queue(parser, errors) +report_errors(errors) + diff --git a/scrapersources/postliste-storfjord b/scrapersources/postliste-storfjord new file mode 100644 index 0000000..4702f8d --- /dev/null +++ b/scrapersources/postliste-storfjord @@ -0,0 +1,82 @@ +# -*- coding: UTF-8 -*- +# Based on the scraper advanced-scraping-pdf +# See also +# https://views.scraperwiki.com/run/pdf-to-html-preview-1/?url=http%3A%2F%2Fwww.stortinget.no%2FGlobal%2Fpdf%2Fpostjournal%2Fpj-2012-05-09.pdf + +import scraperwiki +import json +from BeautifulSoup import BeautifulSoup +import datetime +import dateutil.parser +import lxml.html +import urlparse +import re +lazycache=scraperwiki.swimport('lazycache') +postlistelib=scraperwiki.swimport('postliste-python-lib') + +agency = 'Storfjord kommune' + +def report_errors(errors): + if 0 < len(errors): + print "Errors:" + for e in errors: + print e + exit(1) + +def out_of_cpu(arg, spent, hard, soft): + report_errors(arg) + +def process_pdf(parser, pdfurl, errors): + errors = [] + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + try: + parser.fetch_and_preprocess(pdfurl) +# except ValueError, e: +# errors.append(e) + except IndexError, e: + errors.append(e) + +def process_page_queue(parser, errors): + try: + parser.process_pages() + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + except scraperwiki.CPUTimeExceededError, e: + errors.append("Processing pages interrupted") + +def process_journal_pdfs(parser, listurl, errors): +# print "Finding PDFs on " + listurl +# u = urllib.parse.urlparse(listurl) + html = scraperwiki.scrape(listurl) + root = lxml.html.fromstring(html) + html = None + for ahref in root.cssselect("div.main a"): + href = ahref.attrib['href'] + url = urlparse.urljoin(listurl, href) + if -1 == url.find("postliste-"): + continue + if parser.is_already_scraped(url): + True +# print "Skipping already scraped " + url + else: +# print "Will process " + url + process_pdf(parser, url, errors) + +def test_small_pdfs(parser): + # Test with some smaller PDFs + errors = [] + process_pdf(parser, "http://www.storfjord.kommune.no/postliste-18-mai-2012.5056067-105358.html", errors) + process_pdf(parser, "http://www.storfjord.kommune.no/postliste-16-mai-2012.5056059-105358.html", errors) + process_page_queue(parser, errors) + report_errors(errors) + exit(0) + +parser = postlistelib.PDFJournalParser(agency=agency) +#test_small_pdfs(parser) + +errors = [] +process_journal_pdfs(parser, "http://www.storfjord.kommune.no/postliste.105358.no.html", errors) +for page in range(2,91): + process_journal_pdfs(parser, "http://www.storfjord.kommune.no/?cat=105358&apage=" + str(page), errors) +process_page_queue(parser, errors) +report_errors(errors) + diff --git a/scrapersources/postliste-stortinget b/scrapersources/postliste-stortinget new file mode 100644 index 0000000..98fd7d6 --- /dev/null +++ b/scrapersources/postliste-stortinget @@ -0,0 +1,90 @@ +# Based on the scraper advanced-scraping-pdf +# See also +# https://views.scraperwiki.com/run/pdf-to-html-preview-1/?url=http%3A%2F%2Fwww.stortinget.no%2FGlobal%2Fpdf%2Fpostjournal%2Fpj-2012-05-09.pdf +import scraperwiki +from BeautifulSoup import BeautifulSoup +import datetime +import dateutil.parser +import lxml.html +import urlparse +import resource +import sys +postlistelib=scraperwiki.swimport('postliste-python-lib') + +def find_journal_pdfs(parser, listurl): +# print "Finding PDFs 
on " + listurl + html = postlistelib.fetch_url_harder(listurl) + + root = lxml.html.fromstring(html) + pdfurls = [] + for ahref in root.cssselect("div.mainbody a"): + href = ahref.attrib['href'] + url = urlparse.urljoin(listurl, href) + if -1 != href.find("file://"): +# print "Skipping non-http URL " + url + continue + if parser.is_already_scraped(url): + True +# print "Skipping already scraped " + url + else: +# print "Will process " + url + pdfurls.append(url) + return pdfurls + +def fetch_and_preprocess(parser, pdfurl): + pdfcontent = postlistelib.fetch_url_harder(pdfurl) + parser.preprocess(pdfurl, pdfcontent) + pdfcontent = None + +def add_pdf_lists(parser, pdfurls): + for period in [ + "", + "_2010-2011", + "-2009-2010", + "-2008-2009", + ]: + url = "http://www.stortinget.no/no/Stortinget-og-demokratiet/Administrasjonen/Dokumentoffentlighet/Stortingets-offentlige-postjournal" + period + "/" + pdfurls.extend(find_journal_pdfs(parser, url)) + + +def report_errors(errors): + if 0 < len(errors): + print "Errors:" + for e in errors: + print e + raise + +def no_cpu_left(arg, spent, soft, hard): + report_errors(arg) + +agency = 'Stortinget' +parser = postlistelib.PDFJournalParser(agency=agency) +#parser.debug = True + +if False: + pdfurl = "http://www.stortinget.no/Global/pdf/postjournal/pj-2010-06-04-05.pdf" + parse_pdf(pdfurl) + exit(0) + +pdfurls = [] +add_pdf_lists(parser, pdfurls) + +# Fetch all journal PDFs +errors = [] +for pdfurl in pdfurls: + postlistelib.exit_if_no_cpu_left(0, callback=no_cpu_left, arg = errors) + try: + parser.fetch_and_preprocess(pdfurl) + except ValueError, e: + errors.append(e) + except IndexError, e: + errors.append(e) +try: + parser.process_pages() +except ValueError, e: + errors.append(e) +except IndexError, e: + errors.append(e) + +report_errors(errors) + diff --git a/scrapersources/postliste-universitetet-i-oslo b/scrapersources/postliste-universitetet-i-oslo new file mode 100644 index 0000000..be7b77b --- /dev/null +++ b/scrapersources/postliste-universitetet-i-oslo @@ -0,0 +1,125 @@ +# -*- coding: UTF-8 -*- +# Based on the scraper advanced-scraping-pdf +# See also +# https://views.scraperwiki.com/run/pdf-to-html-preview-1/?url=http%3A%2F%2Fwww.stortinget.no%2FGlobal%2Fpdf%2Fpostjournal%2Fpj-2012-05-09.pdf + +import scraperwiki +import json +from BeautifulSoup import BeautifulSoup +import datetime +import dateutil.parser +import lxml.html +import resource +import sys +import urlparse +import re + +# Make sure Scraperwiki believe this is the source from this database +scraperwiki.scrape("http://www.uio.no/om/journal/") + +lazycache=scraperwiki.swimport('lazycache') +postlistelib=scraperwiki.swimport('postliste-python-lib') + +agency = 'Universitetet i Oslo' + +def report_errors(errors): + if 0 < len(errors): + print "Errors:" + for e in errors: + print e + raise ValueError("Something went wrong") + +def out_of_cpu(arg, spent, hard, soft): + report_errors(arg) + +def process_pdf(parser, pdfurl, errors): + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + try: + pdfcontent = scraperwiki.scrape(pdfurl) + parser.preprocess(pdfurl, pdfcontent) + pdfcontent = None + except ValueError, e: + errors.append(e) + except IndexError, e: + errors.append(e) + +def process_page_queue(parser, errors): + try: + parser.process_pages() + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + except scraperwiki.CPUTimeExceededError, e: + errors.append("Processing pages interrupted") + +def process_journal_pdfs(parser, listurl, errors): +# print "Finding PDFs on " + 
listurl +# u = urllib.parse.urlparse(listurl) + html = scraperwiki.scrape(listurl) + root = lxml.html.fromstring(html) + html = None + for ahref in root.cssselect("table a"): + href = ahref.attrib['href'] + url = urlparse.urljoin(listurl, href) + if -1 != href.find("file://") or -1 == url.find(".pdf"): +# print "Skipping non-http URL " + url + continue + if parser.is_already_scraped(url): + True +# print "Skipping already scraped " + url + else: +# print "Will process " + url + process_pdf(parser, url, errors) + +def process_journal_pdf_directory(parser, listurl, errors): + #html = scraperwiki.scrape(listurl) + html = lazycache.lazycache(listurl) + root = lxml.html.fromstring(html) + html = None + + pdflisturls = [] + for ahref in root.cssselect("span.vrtx-paging-wrapper a"): + href = ahref.attrib['href'] + url = urlparse.urljoin(listurl, href) + pdflisturls.append(url) +# print pdflisturls + + for listurl in pdflisturls: + html = scraperwiki.scrape(listurl) + root = lxml.html.fromstring(html) + html = None + urlseen = {} + for ahref in root.cssselect("div.vrtx-resource a"): + href = ahref.attrib['href'] + url = urlparse.urljoin(listurl, href) + if -1 == url.find(".pdf"): + continue + # Ignore duplicates with M: as part of the name + if -1 != url.find("/M%"): + continue + if url in urlseen or parser.is_already_scraped(url): + True +# print "Skipping already scraped " + url + else: +# print "Will process " + url + process_pdf(parser, url, errors) + urlseen[url] = 1 + +def test_small_pdfs(parser): + # Test with some smaller PDFs + errors = [] + process_pdf(parser, "http://home.nuug.no/~pere/uio-postjournal/2011-16.pdf", errors) + process_pdf(parser, "http://home.nuug.no/~pere/uio-postjournal/2011-52.pdf", errors) + process_page_queue(parser, errors) + report_errors(errors) + exit(0) + +errors = [] +parser = postlistelib.PDFJournalParser(agency=agency) + +#test_small_pdfs(parser) + +process_journal_pdfs(parser, "http://www.uio.no/om/journal/", errors) +#process_journal_pdf_directory(parser, "http://www.uio.no/om/journal/2012/", errors) +#process_journal_pdf_directory(parser, "http://www.uio.no/om/journal/2011/", errors) +process_page_queue(parser, errors) +report_errors(errors) + diff --git a/scrapersources/postliste-universitetet-i-stavanger b/scrapersources/postliste-universitetet-i-stavanger new file mode 100644 index 0000000..5852cb7 --- /dev/null +++ b/scrapersources/postliste-universitetet-i-stavanger @@ -0,0 +1,89 @@ +# -*- coding: UTF-8 -*- + +import scraperwiki +import json +from BeautifulSoup import BeautifulSoup +import datetime +import dateutil.parser +import lxml.html +import resource +import sys +import urlparse +import re +lazycache=scraperwiki.swimport('lazycache') +postlistelib=scraperwiki.swimport('postliste-python-lib') + +agency = 'Universitetet i Stavanger' + +def report_errors(errors): + if 0 < len(errors): + print "Errors:" + for e in errors: + print e + exit(1) +def out_of_cpu(arg, spent, hard, soft): + report_errors(arg) + +def process_pdf(parser, pdfurl, errors): + errors = [] + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + try: + pdfcontent = scraperwiki.scrape(pdfurl) + parser.preprocess(pdfurl, pdfcontent) + pdfcontent = None +# except ValueError, e: +# errors.append(e) + except IndexError, e: + errors.append(e) + +def process_page_queue(parser, errors): + try: + parser.process_pages() + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + except scraperwiki.CPUTimeExceededError, e: + errors.append("Processing pages interrupted") + +def 
process_journal_pdfs(parser, listurl, errors): +# print "Finding PDFs on " + listurl +# u = urllib.parse.urlparse(listurl) + html = scraperwiki.scrape(listurl) + root = lxml.html.fromstring(html) + html = None + for ahref in root.cssselect("div#placeholder-content-main-left-column a"): + href = ahref.attrib['href'] + url = urlparse.urljoin(listurl, href) + if -1 != href.find("file://") or -1 == url.find('/postjournal/article'): +# print "Skipping non-http URL " + url + continue + subhtml = scraperwiki.scrape(url) + subroot = lxml.html.fromstring(subhtml) + subhtml = None + for subahref in subroot.cssselect("div.article-content a"): + subhref = subahref.attrib['href'] + suburl = urlparse.urljoin(listurl, subhref) + if -1 == suburl.find(".pdf"): + continue + if parser.is_already_scraped(suburl): + True +# print "Skipping already scraped " + suburl + else: +# print "Will process " + suburl + process_pdf(parser, suburl, errors) + +def test_small_pdfs(parser): + # Test with some smaller PDFs + errors = [] + process_pdf(parser, "http://www.uis.no/getfile.php/Journal%20200612.pdf", errors) + process_page_queue(parser, errors) + report_errors(errors) + exit(0) + +errors = [] +parser = postlistelib.PDFJournalParser(agency=agency) + +#test_small_pdfs(parser) + +process_journal_pdfs(parser, "http://www.uis.no/nyheter/postjournal/", errors) +process_page_queue(parser, errors) +report_errors(errors) + diff --git a/scrapersources/postliste-vikna b/scrapersources/postliste-vikna new file mode 100644 index 0000000..1279f9e --- /dev/null +++ b/scrapersources/postliste-vikna @@ -0,0 +1,89 @@ +# -*- coding: UTF-8 -*- + +import scraperwiki +import json +from BeautifulSoup import BeautifulSoup +import datetime +import dateutil.parser +import lxml.html +import resource +import sys +import urlparse +import re + +# Make sure Scraperwiki believe this is the source from this database +scraperwiki.scrape("http://www.vikna.kommune.no/Vikna/Web.nsf/mainPress?OpenForm&U=POST") + +lazycache=scraperwiki.swimport('lazycache') +postlistelib=scraperwiki.swimport('postliste-python-lib') + +agency = 'Vikna kommune' + +def report_errors(errors): + if 0 < len(errors): + print "Errors:" + for e in errors: + print e + exit(1) +def out_of_cpu(arg, spent, hard, soft): + report_errors(arg) + +def process_pdf(parser, pdfurl, errors): + errors = [] + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + try: + pdfcontent = scraperwiki.scrape(pdfurl) + parser.preprocess(pdfurl, pdfcontent) + pdfcontent = None +# except ValueError, e: +# errors.append(e) + except IndexError, e: + errors.append(e) + +def process_page_queue(parser, errors): + try: + parser.process_pages() + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + except scraperwiki.CPUTimeExceededError, e: + errors.append("Processing pages interrupted") + +def process_journal_pdfs(parser, listurl, errors): +# print "Finding PDFs on " + listurl +# u = urllib.parse.urlparse(listurl) + html = scraperwiki.scrape(listurl) + root = lxml.html.fromstring(html) + html = None + for ahref in root.cssselect("table a"): + href = ahref.attrib['href'] + url = urlparse.urljoin(listurl, href) + if -1 != href.find("file://") or -1 == url.find(".pdf"): +# print "Skipping non-http URL " + url + continue + if -1 != href.find("/Ingen postjournal.pdf"): + continue + if parser.is_already_scraped(url): + True +# print "Skipping already scraped " + url + else: +# print "Will process " + url + process_pdf(parser, url, errors) + +def test_small_pdfs(parser): + # Test with some smaller PDFs + 
errors = [] + process_pdf(parser, "http://www.vikna.kommune.no/Vikna/Intern.nsf/FilA/A715C0C6E0D8CC05C12578F70024857B/$FILE/PJ230811.pdf", errors) + process_page_queue(parser, errors) + report_errors(errors) + exit(0) + +errors = [] +parser = postlistelib.PDFJournalParser(agency=agency) + +#test_small_pdfs(parser) + +process_page_queue(parser, errors) +process_journal_pdfs(parser, "http://www.vikna.kommune.no/Vikna/Web.nsf/mainPress?OpenForm&U=POST", errors) +process_page_queue(parser, errors) +report_errors(errors) + +# FIXME Need to handle recent journal entries too
\ No newline at end of file diff --git a/scrapersources/statens_vegvesen_offentlig_journal b/scrapersources/statens_vegvesen_offentlig_journal new file mode 100644 index 0000000..947da4e --- /dev/null +++ b/scrapersources/statens_vegvesen_offentlig_journal @@ -0,0 +1,56 @@ +import scraperwiki +import lxml.html +import datetime + +#uncomment to run for a selected timeperiod +#fromdate = "01.04.2011" +#todate = "21.05.2011" + +#fromdate = datetime.datetime.strptime(fromdate, "%d.%m.%Y") +#todate = datetime.datetime.strptime(todate, "%d.%m.%Y") +#adday = datetime.timedelta(days=1) + +def scrapepage(mydate): + + formatteddate = mydate.strftime("%d.%m.%Y") + #formatteddate = "10.05.2011" + + url = "http://www.vegvesen.no/Om+Statens+vegvesen/Aktuelt/Offentlig+journal?dokumenttyper=&dato=%s&journalenhet=6&utforSok=S%%C3%%B8k&submitButton=S%%C3%%B8k" % formatteddate + + root = lxml.html.parse(url).getroot() + + divs = root.cssselect("div.treff") + + for p in divs: + + dateandtype = p.xpath("p/text()")[0].split(" ") + saksdetaljer = p.xpath("ul[@class='saksdetaljer']/li/text()") + + + record = { + "doknr": dateandtype[0], + "innut": dateandtype[2], + "tittel": p.xpath("h2/text()")[0], + "sak": p.xpath("span[@class='sak']")[0].text[6:], + "fratil": p.xpath("ul[@class='fraTil']/li/text()")[0][5:], + } + + record.update(dict([x.split(":") for x in saksdetaljer])) + + record['Dokumenttdato'] = datetime.datetime.strptime(record['Dokumenttdato'].strip(), "%d.%m.%Y").date() + record['Journaldato'] = datetime.datetime.strptime(record['Journaldato'].strip(), "%d.%m.%Y").date() + + scraperwiki.sqlite.save(unique_keys=["doknr"], data=record) + +#uncomment to run for a selected timeperiod +#thedate = fromdate +#while thedate <= todate: +# print thedate +# thedate = thedate + adday +# scrapepage(thedate) +#comment out these two lines in order to run for a selected timeperiod +thedate = datetime.datetime.now() +print thedate + +scrapepage(thedate) +
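+# Note on the commented-out backfill loop above (untested sketch, not part of
+# the original scraper): as written it advances thedate before calling
+# scrapepage(), so fromdate itself is never scraped while todate + 1 day is.
+# Reordering the two statements fixes that:
+#   thedate = fromdate
+#   while thedate <= todate:
+#       print thedate
+#       scrapepage(thedate)
+#       thedate = thedate + adday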
\ No newline at end of file
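The postliste-* scrapers stored above all follow the same skeleton around postliste-python-lib: build a PDFJournalParser for the agency, collect PDF links from a listing page, skip URLs that are already scraped, preprocess each PDF while watching the ScraperWiki CPU limit, then process the queued pages and report any collected errors. Below is a condensed sketch of that shared pattern, not a drop-in scraper: the agency name, listing URL and CSS selector are placeholders, and it assumes the postliste-python-lib and scraperwiki calls behave as they are used in the sources above.

import scraperwiki
import lxml.html
import urlparse

postlistelib = scraperwiki.swimport('postliste-python-lib')

agency = 'Example agency'                        # placeholder
listurl = 'http://www.example.org/postjournal/'  # placeholder listing page

def report_errors(errors):
    if 0 < len(errors):
        print "Errors:"
        for e in errors:
            print e
        raise ValueError("Something went wrong")

def out_of_cpu(arg, spent, hard, soft):
    # Called by exit_if_no_cpu_left when the ScraperWiki CPU budget runs out
    report_errors(arg)

def process_pdf(parser, pdfurl, errors):
    # Fetch one journal PDF and queue its pages for later parsing
    postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
    try:
        parser.preprocess(pdfurl, scraperwiki.scrape(pdfurl))
    except (ValueError, IndexError), e:
        errors.append(e)

errors = []
parser = postlistelib.PDFJournalParser(agency=agency)

# Collect PDF links from the listing page, skipping already scraped ones
root = lxml.html.fromstring(scraperwiki.scrape(listurl))
for ahref in root.cssselect("a"):                # selector varies per site
    url = urlparse.urljoin(listurl, ahref.attrib['href'])
    if -1 == url.find(".pdf") or parser.is_already_scraped(url):
        continue
    process_pdf(parser, url, errors)

# Parse the queued pages into the scraper's sqlite store
try:
    parser.process_pages()
except scraperwiki.CPUTimeExceededError, e:
    errors.append("Processing pages interrupted")

report_errors(errors)

The individual scrapers differ mainly in the CSS selectors used to find the PDF links and in how they filter the URLs: all skip file:// links, the UiO directory scraper drops duplicates with "/M%" in the name, and the Vikna scraper ignores the placeholder "Ingen postjournal.pdf".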