author | Petter Reinholdtsen <pere@hungry.com> | 2012-07-13 12:34:28 +0200 |
---|---|---|
committer | Petter Reinholdtsen <pere@hungry.com> | 2012-07-13 12:34:28 +0200 |
commit | c99e7bfda6e025314b6a7c6683a1bc3c5818621c (patch) | |
tree | ef4230a7830973c37e9d83c9e773d41a56834658 | |
parent | 22bceaf65dd89df97529df0102149aefa2b54f54 (diff) | |
Updated from scraperwiki.
22 files changed, 1881 insertions, 80 deletions
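Most of the new files in this commit are ScraperWiki scrapers that share the same skeleton: register the source URL, collect PDF links from a listing page, queue each PDF with postlistelib.PDFJournalParser, and report accumulated errors. Below is a condensed sketch of that shared pattern (Python 2 as used on ScraperWiki; the agency name, listing URL and CSS selector are illustrative placeholders, and the CPU-limit handling via postlistelib.exit_if_no_cpu_left used in the real scrapers is left out):

```python
# Condensed sketch of the scraper skeleton repeated by the new scrapers in
# this commit.  Assumes the ScraperWiki Python 2 environment; 'Example
# kommune', the listing URL and the CSS selector are placeholders.
import scraperwiki
import lxml.html
import urlparse

postlistelib = scraperwiki.swimport('postliste-python-lib')

agency = 'Example kommune'                    # placeholder
listurl = 'http://www.example.no/postliste/'  # placeholder

def report_errors(errors):
    # Raise instead of exit(1) so a failed run is flagged as an error.
    if errors:
        print "Errors:"
        for e in errors:
            print e
        raise ValueError("%d errors detected" % len(errors))

def process_pdf(parser, pdfurl, errors):
    # Fetch one journal PDF and queue its pages for later parsing.
    try:
        parser.preprocess(pdfurl, scraperwiki.scrape(pdfurl))
    except (ValueError, IndexError), e:
        errors.append(e)

def process_journal_pdfs(parser, listurl, errors):
    # Collect PDF links from the listing page, skipping already scraped URLs.
    root = lxml.html.fromstring(scraperwiki.scrape(listurl))
    for ahref in root.cssselect("div.content a"):  # selector varies per site
        url = urlparse.urljoin(listurl, ahref.attrib['href'])
        if not parser.is_already_scraped(url):
            process_pdf(parser, url, errors)

errors = []
parser = postlistelib.PDFJournalParser(agency=agency)
process_journal_pdfs(parser, listurl, errors)
parser.process_pages()   # parse the queued PDF pages into journal entries
report_errors(errors)
```

The individual scrapers in the diff differ mainly in the CSS selector used to locate the journal PDFs and in the URL filter applied to each candidate link.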
diff --git a/scrapersources/list-nuug-postliste-scrapers b/scrapersources/list-nuug-postliste-scrapers
new file mode 100644
index 0000000..67c4158
--- /dev/null
+++ b/scrapersources/list-nuug-postliste-scrapers
@@ -0,0 +1,90 @@
+import os
+import urlparse
+urlquery = os.getenv('URLQUERY')
+
+if urlquery:
+    querydata = urlparse.parse_qsl(urlquery);
+    for pair in querydata:
+        if pair[0] == "js" and pair[1] == "jquery.js":
+            print 'js-sourcecode'
+            exit(0)
+
+import urllib2, json, re
+import yaml
+
+url = "https://api.scraperwiki.com/api/1.0/scraper/search?format=jsondict&maxrows=200&searchquery=nuug-postliste-endyaml"
+json_data = json.load(urllib2.urlopen(url))
+print '''<html>
+<head>
+<link rel="stylesheet" href="https://views.scraperwiki.com/run/jquery-tablesorter/?file=style-blue.css" type="text/css" />
+<script type="text/javascript" src="https://views.scraperwiki.com/run/jquery-tablesorter/?file=jquery-1-4-2-min.js"></script>
+<script type="text/javascript" src="https://views.scraperwiki.com/run/jquery-tablesorter/?file=jquery.tablesorter.2-0-5.min.js"></script>'''
+
+print '''</head><body>
+<p>This view lists scrapers with yaml-compatible comments (containing the string "nuug-postliste-endyaml") like the following in their description
+<pre>
+<!-- nuug-postliste-yaml -->
+YAML-tagger:<br>
+Type: kommune<br>
+Status: finished<br>
+Name: Lillesteinsmyr kommune<br>
+Format: PDF<br>
+Datatype: ePhorte<br>
+Run: daily<br>
+<!-- nuug-postliste-endyaml -->
+</pre></p>
+<table id="myTable" class="tablesorter">'''
+
+print '<thead><tr><th>Name</th><th>type</th><th>status</th><th>schedule</th><th>format</th><th>datatype</th><th>URL</th></tr></thead><tbody>'
+counter = {}
+for scraper in json_data:
+    print scraper
+    comment = re.findall(r'<!-- nuug-postliste-yaml -->(.*)<!-- nuug-postliste-endyaml -->',
+                         scraper['description'], re.DOTALL)
+    assert len(comment) == 1
+    data = yaml.load(comment[0].strip().replace('<br>',''))
+
+    if data['Type'] in counter:
+        counter[data['Type']] = counter[data['Type']] + 1
+    else:
+        counter[data['Type']] = 1
+
+    if 'Run' in data: Run = data['Run']
+    else: Run = 'unknown'
+
+    if 'Format' in data: Format = data['Format']
+    else: Format = 'unknown'
+
+    if 'Datatype' in data: Type = data['Datatype']
+    else: Type = 'unknown'
+
+
+    print '<tr><td>%s</td><td>%s</td><td>%s</td><td>%s</td><td>%s</td><td>%s</td><td><a href="https://scraperwiki.com/scrapers/%s/">URL</a></td></tr>' % \
+        (data['Name'],data['Type'],data['Status'], Run, Format, Type, scraper['short_name'])
+print '''</tbody></table><table id="myTable2" class="tablesorter"><thead><tr><th>type</th><th>count</th></tr></thead><tbody>'''
+
+for key in counter:
+    print '<tr><td>%s</td><td>%d</td></tr>' % (key, counter[key])
+print '</tbody></table>'
+
+num_kommune = float(429)
+num_fylke = float(19)
+print '<table class="tablesorter"><thead><tr><td>Type</td><td>Prosent</td></tr></thead><tbody>'
+try:
+    print "<tr><td>Kommune</td><td>%.2f%% (%d av %d)</td></tr>" % \
+        ((float(counter['kommune'])/float(num_kommune))*100, counter['kommune'], num_kommune)
+except KeyError: pass
+try:
+    print "<tr><td>Fylkeskommune</td><td>%.2f%% (%d av %d)</td></tr>" % \
+        ((float(counter['fylkeskommune'])/float(num_fylke))*100, counter['fylkeskommune'], num_fylke)
+except KeyError: pass
+print '''</tbody></table>
+<script type="text/javascript">
+  $(document).ready(function()
+    {
+        $("#myTable").tablesorter();
+        $("#myTable2").tablesorter();
+    }
+);
+</script>
+</body></html>'''
\ No newline at end of file diff --git a/scrapersources/postliste-halden b/scrapersources/postliste-halden index 4b0ebd5..e7c2d30 100644 --- a/scrapersources/postliste-halden +++ b/scrapersources/postliste-halden @@ -83,7 +83,7 @@ def test_small_pdfs(parser): errors = [] parser = postlistelib.PDFJournalParser(agency=agency) -#parser.debug = True +parser.debug = True #test_small_pdfs(parser) process_page_queue(parser, errors) diff --git a/scrapersources/postliste-hoegskolen-i-finnmark b/scrapersources/postliste-hoegskolen-i-finnmark new file mode 100644 index 0000000..2a4b972 --- /dev/null +++ b/scrapersources/postliste-hoegskolen-i-finnmark @@ -0,0 +1,86 @@ +# -*- coding: UTF-8 -*- + +import scraperwiki +import json +from BeautifulSoup import BeautifulSoup +import datetime +import dateutil.parser +import lxml.html +import resource +import sys +import urlparse +import re + +# Make sure Scraperwiki believe this is the source from this database +scraperwiki.scrape("http://www.hifm.no/nor/www_hifm_no/hogskolen-i-finnmark-_-startside/om-hogskolen/om-hogskolen/offentlig-journal-1") + +lazycache=scraperwiki.swimport('lazycache') +postlistelib=scraperwiki.swimport('postliste-python-lib') + +agency = 'Høgskolen i Finnmark' + +def report_errors(errors): + if 0 < len(errors): + print "Errors:" + for e in errors: + print e + raise ValueError("Something went wrong") + +def out_of_cpu(arg, spent, hard, soft): + report_errors(arg) + +def process_pdf(parser, pdfurl, errors): + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + try: + pdfcontent = scraperwiki.scrape(pdfurl) + parser.preprocess(pdfurl, pdfcontent) + pdfcontent = None + except ValueError, e: + errors.append(e) + except IndexError, e: + errors.append(e) + +def process_page_queue(parser, errors): + try: + parser.process_pages() + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + except scraperwiki.CPUTimeExceededError, e: + errors.append("Processing pages interrupted") + +def process_journal_pdfs(parser, listurl, errors): +# print "Finding PDFs on " + listurl +# u = urllib.parse.urlparse(listurl) + html = scraperwiki.scrape(listurl) + root = lxml.html.fromstring(html) + html = None + for ahref in root.cssselect("div.content-padding a"): + href = ahref.attrib['href'] + url = urlparse.urljoin(listurl, href) + if -1 != href.find("file://") or -1 == url.find("/download_journal.php"): +# print "Skipping non-http URL " + url + continue + if parser.is_already_scraped(url): + True +# print "Skipping already scraped " + url + else: +# print "Will process " + url + process_pdf(parser, url, errors) + +def test_small_pdfs(parser): + # Test with some smaller PDFs + errors = [] + process_pdf(parser, "http://www.hifm.no/neted/includes/hifm/download_journal.php?fn=120503", errors) + process_page_queue(parser, errors) + report_errors(errors) + exit(0) + +errors = [] +parser = postlistelib.PDFJournalParser(agency=agency) + +#test_small_pdfs(parser) + +process_page_queue(parser, errors) +process_journal_pdfs(parser, "http://www.hifm.no/nor/www_hifm_no/hogskolen-i-finnmark-_-startside/om-hogskolen/om-hogskolen/offentlig-journal-1/?&type=a", errors) +process_page_queue(parser, errors) +report_errors(errors) + diff --git a/scrapersources/postliste-hoegskolen-i-gjoevik b/scrapersources/postliste-hoegskolen-i-gjoevik index fd197eb..d4f7931 100644 --- a/scrapersources/postliste-hoegskolen-i-gjoevik +++ b/scrapersources/postliste-hoegskolen-i-gjoevik @@ -19,7 +19,7 @@ import re # # # Make sure Scraperwiki believe this is the source from this database 
-scraperwiki.scrape("http://www.hig.no/om_hig/offentleg_journal/2012") +scraperwiki.scrape("http://www.hig.no/om_hig/offentleg_journal") lazycache=scraperwiki.swimport('lazycache') postlistelib=scraperwiki.swimport('postliste-python-lib') @@ -31,19 +31,19 @@ def report_errors(errors): print "Errors:" for e in errors: print e - exit(1) + raise ValueError(str(len(errors)) + " errors detected") + def out_of_cpu(arg, spent, hard, soft): report_errors(arg) def process_pdf(parser, pdfurl, errors): - errors = [] postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) try: pdfcontent = scraperwiki.scrape(pdfurl) parser.preprocess(pdfurl, pdfcontent) pdfcontent = None -# except ValueError, e: -# errors.append(e) + except ValueError, e: + errors.append(e) except IndexError, e: errors.append(e) @@ -73,21 +73,20 @@ def process_journal_pdfs(parser, listurl, errors): # print "Will process " + url process_pdf(parser, url, errors) -def test_small_pdfs(parser): +#def test_small_pdfs(parser): # Test with some smaller PDFs - errors = [] - if parser.is_already_scraped("http://www.hig.no/content/download/30119/360872/file/Offentlig+journal+04.11.2010.pdf"): - print "Skipping already scraped " - exit(1) - else: - print "Will process " - +# errors = [] +# if parser.is_already_scraped("http://www.hig.no/content/download/30119/360872/file/Offentlig+journal+04.11.2010.pdf"): +# print "Skipping already scraped " +# exit(1) +# else: +# print "Will process " #process_pdf(parser, "http://www.hig.no/content/download/35184/430061/file/Offentlig%20journal%2025.06.2012.pdf", errors) #process_pdf(parser, "http://www.hig.no/content/download/30116/360863/file/Offentlig%20journal%2001.11.2010.pdf", errors) - process_pdf(parser, "http://www.hig.no/content/download/30119/360872/file/Offentlig+journal+04.11.2010.pdf", errors) - process_page_queue(parser, errors) - report_errors(errors) - exit(0) +# process_pdf(parser, "http://www.hig.no/content/download/30119/360872/file/Offentlig+journal+04.11.2010.pdf", errors) +# process_page_queue(parser, errors) +# report_errors(errors) +# exit(0) errors = [] parser = postlistelib.PDFJournalParser(agency=agency) @@ -96,9 +95,9 @@ parser = postlistelib.PDFJournalParser(agency=agency) startYear=2010 endYear=datetime.datetime.now().year -for year in range(startYear, endYear): - process_journal_pdfs(parser, "http://www.hig.no/om_hig/offentleg_journal/%d" % year, errors) +for year in range(startYear, endYear+1): # range goes from startyear to endYear-1 + process_journal_pdfs(parser, "http://www.hig.no/om_hig/offentleg_journal/%d" % year, errors) process_page_queue(parser, errors) report_errors(errors) diff --git a/scrapersources/postliste-hoegskolen-i-nord-troendelag b/scrapersources/postliste-hoegskolen-i-nord-troendelag new file mode 100644 index 0000000..3db177b --- /dev/null +++ b/scrapersources/postliste-hoegskolen-i-nord-troendelag @@ -0,0 +1,88 @@ +# -*- coding: UTF-8 -*- +# Based on the scraper advanced-scraping-pdf +# See also +# https://views.scraperwiki.com/run/pdf-to-html-preview-1/?url=http%3A%2F%2Fwww.stortinget.no%2FGlobal%2Fpdf%2Fpostjournal%2Fpj-2012-05-09.pdf + +import scraperwiki +import json +from BeautifulSoup import BeautifulSoup +import datetime +import dateutil.parser +import lxml.html +import resource +import sys +import urlparse +import re + +# Make sure Scraperwiki believe this is the source from this database +scraperwiki.scrape("http://www.hint.no/aktuelt/offentlig_postjournal") + +lazycache=scraperwiki.swimport('lazycache') 
+postlistelib=scraperwiki.swimport('postliste-python-lib') + +agency = 'Høgskolen i Nord-Trøndelag' + +def report_errors(errors): + if 0 < len(errors): + print "Errors:" + for e in errors: + print e + raise ValueError("Something went wrong") + +def out_of_cpu(arg, spent, hard, soft): + report_errors(arg) + +def process_pdf(parser, pdfurl, errors): + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + try: + pdfcontent = scraperwiki.scrape(pdfurl) + parser.preprocess(pdfurl, pdfcontent) + pdfcontent = None + except ValueError, e: + errors.append(e) + except IndexError, e: + errors.append(e) + +def process_page_queue(parser, errors): + try: + parser.process_pages() + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + except scraperwiki.CPUTimeExceededError, e: + errors.append("Processing pages interrupted") + +def process_journal_pdfs(parser, listurl, errors): +# print "Finding PDFs on " + listurl +# u = urllib.parse.urlparse(listurl) + html = scraperwiki.scrape(listurl) + root = lxml.html.fromstring(html) + html = None + for ahref in root.cssselect("div.mliste a"): + href = ahref.attrib['href'] + url = urlparse.urljoin(listurl, href) + if -1 != href.find("file://") or -1 == url.find(".pdf"): +# print "Skipping non-http URL " + url + continue + if parser.is_already_scraped(url): + True +# print "Skipping already scraped " + url + else: +# print "Will process " + url + process_pdf(parser, url, errors) + +def test_small_pdfs(parser): + # Test with some smaller PDFs + errors = [] + process_pdf(parser, "http://www.hint.no/content/download/60032/904325/version/1/file/Off.+journal+28.06.2012.pdf", errors) + process_page_queue(parser, errors) + report_errors(errors) + exit(0) + +errors = [] +parser = postlistelib.PDFJournalParser(agency=agency) + +#test_small_pdfs(parser) + +process_journal_pdfs(parser, "http://www.hint.no/aktuelt/offentlig_postjournal", errors) +process_page_queue(parser, errors) +report_errors(errors) + diff --git a/scrapersources/postliste-hoegskolen-i-telemark b/scrapersources/postliste-hoegskolen-i-telemark new file mode 100644 index 0000000..a41d014 --- /dev/null +++ b/scrapersources/postliste-hoegskolen-i-telemark @@ -0,0 +1,86 @@ +# -*- coding: UTF-8 -*- + +import scraperwiki +import json +from BeautifulSoup import BeautifulSoup +import datetime +import dateutil.parser +import lxml.html +import resource +import sys +import urlparse +import re + +# Make sure Scraperwiki believe this is the source from this database +scraperwiki.scrape("http://www.hit.no/nor/HiT/Om-HiT/Offentlig-journal-for-HiT") + +lazycache=scraperwiki.swimport('lazycache') +postlistelib=scraperwiki.swimport('postliste-python-lib') + +agency = 'Høgskolen i Telemark' + +def report_errors(errors): + if 0 < len(errors): + print "Errors:" + for e in errors: + print e + raise ValueError("Something went wrong") + +def out_of_cpu(arg, spent, hard, soft): + report_errors(arg) + +def process_pdf(parser, pdfurl, errors): + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + try: + pdfcontent = scraperwiki.scrape(pdfurl) + parser.preprocess(pdfurl, pdfcontent) + pdfcontent = None + except ValueError, e: + errors.append(e) + except IndexError, e: + errors.append(e) + +def process_page_queue(parser, errors): + try: + parser.process_pages() + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + except scraperwiki.CPUTimeExceededError, e: + errors.append("Processing pages interrupted") + +def process_journal_pdfs(parser, listurl, errors): +# print "Finding PDFs on " + listurl +# u = 
urllib.parse.urlparse(listurl) + html = scraperwiki.scrape(listurl) + root = lxml.html.fromstring(html) + html = None + for ahref in root.cssselect("table a"): + href = ahref.attrib['href'] + url = urlparse.urljoin(listurl, href) + if -1 != href.find("file://") or -1 == url.find(".pdf"): +# print "Skipping non-http URL " + url + continue + if parser.is_already_scraped(url): + True +# print "Skipping already scraped " + url + else: +# print "Will process " + url + process_pdf(parser, url, errors) + +def test_small_pdfs(parser): + # Test with some smaller PDFs + errors = [] + process_pdf(parser, "http://www.hit.no/nor/content/download/128467/1372770/file/Offentlig+journal+uke+1.pdf", errors) + process_page_queue(parser, errors) + report_errors(errors) + exit(0) + +errors = [] +parser = postlistelib.PDFJournalParser(agency=agency) + +#test_small_pdfs(parser) + +process_page_queue(parser, errors) +process_journal_pdfs(parser, "http://www.hit.no/nor/HiT/Om-HiT/Offentlig-journal-for-HiT", errors) +process_page_queue(parser, errors) +report_errors(errors) + diff --git a/scrapersources/postliste-hoegskolen-i-volda b/scrapersources/postliste-hoegskolen-i-volda new file mode 100644 index 0000000..0106cb7 --- /dev/null +++ b/scrapersources/postliste-hoegskolen-i-volda @@ -0,0 +1,88 @@ +# -*- coding: UTF-8 -*- + +import scraperwiki +import json +from BeautifulSoup import BeautifulSoup +import datetime +import dateutil.parser +import lxml.html +import resource +import sys +import urlparse +import re + +# Make sure Scraperwiki believe this is the source from this database +scraperwiki.scrape("http://www.hivolda.no/nyn/hivolda/om-hogskulen/administrasjon/dokumentsenteret") + +lazycache=scraperwiki.swimport('lazycache') +postlistelib=scraperwiki.swimport('postliste-python-lib') + +agency = 'Høgskolen i Volda' + +def report_errors(errors): + if 0 < len(errors): + print "Errors:" + for e in errors: + print e + raise ValueError("Something went wrong") + +def out_of_cpu(arg, spent, hard, soft): + report_errors(arg) + +def process_pdf(parser, pdfurl, errors): + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + try: + pdfcontent = scraperwiki.scrape(pdfurl) + parser.preprocess(pdfurl, pdfcontent) + pdfcontent = None + except ValueError, e: + errors.append(e) + except IndexError, e: + errors.append(e) + +def process_page_queue(parser, errors): + try: + parser.process_pages() + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + except scraperwiki.CPUTimeExceededError, e: + errors.append("Processing pages interrupted") + +def process_journal_pdfs(parser, listurl, errors): +# print "Finding PDFs on " + listurl +# u = urllib.parse.urlparse(listurl) + html = scraperwiki.scrape(listurl) + root = lxml.html.fromstring(html) + html = None + for ahref in root.cssselect("div.inside a"): + if 'id' not in ahref.attrib or -1 == ahref.attrib['id'].find("archiveimage_"): + continue + href = ahref.attrib['href'] + url = urlparse.urljoin(listurl, href) + if -1 != href.find("file://"): +# print "Skipping non-http URL " + url + continue + if parser.is_already_scraped(url): + True +# print "Skipping already scraped " + url + else: +# print "Will process " + url + process_pdf(parser, url, errors) + +def test_small_pdfs(parser): + # Test with some smaller PDFs + errors = [] + process_pdf(parser, "http://www.hivolda.no/neted/modules/archive/front/file.php?data=47449f5f5477b30f13f282759d5f08b1", errors) + process_page_queue(parser, errors) + report_errors(errors) + exit(0) + +errors = [] +parser = 
postlistelib.PDFJournalParser(agency=agency) +#parser.debug = True + +#test_small_pdfs(parser) + +process_journal_pdfs(parser, "http://www.hivolda.no/nyn/hivolda/om-hogskulen/administrasjon/dokumentsenteret", errors) +process_page_queue(parser, errors) +report_errors(errors) + diff --git a/scrapersources/postliste-hvaler b/scrapersources/postliste-hvaler index b3e9137..113b145 100644 --- a/scrapersources/postliste-hvaler +++ b/scrapersources/postliste-hvaler @@ -6,10 +6,11 @@ from BeautifulSoup import BeautifulSoup import datetime import dateutil.parser import lxml.html -import resource -import sys import urlparse import re + +scraperwiki.scrape("http://www.hvaler.kommune.no/Postlister/") + lazycache=scraperwiki.swimport('lazycache') postlistelib=scraperwiki.swimport('postliste-python-lib') @@ -20,12 +21,12 @@ def report_errors(errors): print "Errors:" for e in errors: print e - exit(1) + raise ValueError("Something went wrong, " + str(len(errors)) + " errors detected") + def out_of_cpu(arg, spent, hard, soft): report_errors(arg) def process_pdf(parser, pdfurl, errors): - errors = [] postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) try: pdfcontent = scraperwiki.scrape(pdfurl) @@ -35,6 +36,8 @@ def process_pdf(parser, pdfurl, errors): # errors.append(e) except IndexError, e: errors.append(e) + except ValueError, e: + errors.append(e) def process_page_queue(parser, errors): try: @@ -49,10 +52,11 @@ def process_journal_pdfs(parser, listurl, errors): html = scraperwiki.scrape(listurl) root = lxml.html.fromstring(html) html = None - for ahref in root.cssselect("div#ctl00_MainRegion_StageAreaRegion_MainContentRegion_MainBodyRegion_ctl01_FileTreen0Nodes a"): + for ahref in root.cssselect("table a"): href = ahref.attrib['href'] url = urlparse.urljoin(listurl, href) - if -1 != href.find("file://"): +# print url + if -1 != href.find("file://") or -1 != href.find("javascript:"): # print "Skipping non-http URL " + url continue if parser.is_already_scraped(url): @@ -72,9 +76,11 @@ def test_small_pdfs(parser): errors = [] parser = postlistelib.PDFJournalParser(agency=agency) +#parser.debug = True #test_small_pdfs(parser) +process_page_queue(parser, errors) process_journal_pdfs(parser, "http://www.hvaler.kommune.no/Postlister/", errors) process_page_queue(parser, errors) report_errors(errors) diff --git a/scrapersources/postliste-kafjord-kommune b/scrapersources/postliste-kafjord-kommune new file mode 100644 index 0000000..212a308 --- /dev/null +++ b/scrapersources/postliste-kafjord-kommune @@ -0,0 +1,93 @@ +# -*- coding: UTF-8 -*- + +import scraperwiki +import json +from BeautifulSoup import BeautifulSoup +import datetime +import dateutil.parser +import lxml.html +import resource +import sys +import urlparse +import re + +# Make sure Scraperwiki believe this is the source from this database +scraperwiki.scrape("http://www.kafjord.kommune.no/postlister.18590.no.html") + +lazycache=scraperwiki.swimport('lazycache') +postlistelib=scraperwiki.swimport('postliste-python-lib') + +agency = 'Kåfjord kommune' + +def report_errors(errors): + if 0 < len(errors): + print "Errors:" + for e in errors: + print e + raise ValueError("Something went wrong") + +def out_of_cpu(arg, spent, hard, soft): + report_errors(arg) + +def process_pdf(parser, pdfurl, errors): + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + try: + pdfcontent = scraperwiki.scrape(pdfurl) + parser.preprocess(pdfurl, pdfcontent) + pdfcontent = None + except ValueError, e: + errors.append(e) + except IndexError, e: + errors.append(e) + 
+def process_page_queue(parser, errors): + try: + parser.process_pages() + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + except scraperwiki.CPUTimeExceededError, e: + errors.append("Processing pages interrupted") + +def process_journal_pdfs(parser, listurl, errors): +# print "Finding PDFs on " + listurl +# u = urllib.parse.urlparse(listurl) + html = scraperwiki.scrape(listurl) + root = lxml.html.fromstring(html) + html = None + for ahref in root.cssselect("table a"): + href = ahref.attrib['href'] + url = urlparse.urljoin(listurl, href) + if -1 != href.find("file://") or -1 == url.find("/postliste-"): +# print "Skipping non-http URL " + url + continue + if parser.is_already_scraped(url): + True +# print "Skipping already scraped " + url + else: +# print "Will process " + url + process_pdf(parser, url, errors) + # Follow the "next page" link to the end + for ahref in root.cssselect("center a"): + href = ahref.attrib['href'] + url = urlparse.urljoin(listurl, href) + print ahref.text, url + if -1 != ahref.text.find("Neste side"): + process_journal_pdfs(parser, url, errors) + +def test_small_pdfs(parser): + # Test with some smaller PDFs + errors = [] + process_pdf(parser, "http://www.kafjord.kommune.no/postliste-03-07-12.5071007-18590.html", errors) + process_page_queue(parser, errors) + report_errors(errors) + exit(0) + +errors = [] +parser = postlistelib.PDFJournalParser(agency=agency) + +#test_small_pdfs(parser) + +process_page_queue(parser, errors) +process_journal_pdfs(parser, "http://www.kafjord.kommune.no/postlister.18590.no.html", errors) +process_page_queue(parser, errors) +report_errors(errors) + diff --git a/scrapersources/postliste-loppa-kommune b/scrapersources/postliste-loppa-kommune new file mode 100644 index 0000000..7c7ec72 --- /dev/null +++ b/scrapersources/postliste-loppa-kommune @@ -0,0 +1,88 @@ +# -*- coding: UTF-8 -*- +# Based on the scraper advanced-scraping-pdf +# See also +# https://views.scraperwiki.com/run/pdf-to-html-preview-1/?url=http%3A%2F%2Fwww.stortinget.no%2FGlobal%2Fpdf%2Fpostjournal%2Fpj-2012-05-09.pdf + +import scraperwiki +import json +from BeautifulSoup import BeautifulSoup +import datetime +import dateutil.parser +import lxml.html +import resource +import sys +import urlparse +import re + +# Make sure Scraperwiki believe this is the source from this database +scraperwiki.scrape("http://www.loppa.kommune.no/postjournal.113285.no.html") + +lazycache=scraperwiki.swimport('lazycache') +postlistelib=scraperwiki.swimport('postliste-python-lib') + +agency = 'Loppa kommune' + +def report_errors(errors): + if 0 < len(errors): + print "Errors:" + for e in errors: + print e + raise ValueError("Something went wrong") + +def out_of_cpu(arg, spent, hard, soft): + report_errors(arg) + +def process_pdf(parser, pdfurl, errors): + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + try: + pdfcontent = scraperwiki.scrape(pdfurl) + parser.preprocess(pdfurl, pdfcontent) + pdfcontent = None + except ValueError, e: + errors.append(e) + except IndexError, e: + errors.append(e) + +def process_page_queue(parser, errors): + try: + parser.process_pages() + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + except scraperwiki.CPUTimeExceededError, e: + errors.append("Processing pages interrupted") + +def process_journal_pdfs(parser, listurl, errors): +# print "Finding PDFs on " + listurl +# u = urllib.parse.urlparse(listurl) + html = scraperwiki.scrape(listurl) + root = lxml.html.fromstring(html) + html = None + for ahref in root.cssselect("div.body a"): + 
href = ahref.attrib['href'] + url = urlparse.urljoin(listurl, href) + if -1 != href.find("file://") or -1 == url.find(".pdf"): +# print "Skipping non-http URL " + url + continue + if parser.is_already_scraped(url): + True +# print "Skipping already scraped " + url + else: +# print "Will process " + url + process_pdf(parser, url, errors) + +def test_small_pdfs(parser): + # Test with some smaller PDFs + errors = [] + process_pdf(parser, "http://www.loppa.kommune.no/getfile.php/1983773.670.bbsaudxaex/25_2012.pdf", errors) + process_page_queue(parser, errors) + report_errors(errors) + exit(0) + +errors = [] +parser = postlistelib.PDFJournalParser(agency=agency) + +#test_small_pdfs(parser) + +process_journal_pdfs(parser, "http://www.loppa.kommune.no/postjournal.113285.no.html", errors) +process_page_queue(parser, errors) +report_errors(errors) + diff --git a/scrapersources/postliste-met b/scrapersources/postliste-met new file mode 100644 index 0000000..02c53ca --- /dev/null +++ b/scrapersources/postliste-met @@ -0,0 +1,91 @@ +# -*- coding: UTF-8 -*- + +import scraperwiki +import json +from BeautifulSoup import BeautifulSoup +import datetime +import dateutil.parser +import lxml.html +import urllib2 +import urlparse +import re + +# Make sure Scraperwiki believe this is the source from this database +scraperwiki.scrape("http://met.no/Om_oss/Offentlig_journal/") + +lazycache=scraperwiki.swimport('lazycache') +postlistelib=scraperwiki.swimport('postliste-python-lib') + +agency = 'Meteorologisk institutt' + +def report_errors(errors): + if 0 < len(errors): + print "Errors:" + for e in errors: + print e + raise ValueError("Something went wrong") + +def out_of_cpu(arg, spent, hard, soft): + report_errors(arg) + +def process_pdf(parser, pdfurl, errors): + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + try: + pdfcontent = scraperwiki.scrape(pdfurl) + parser.preprocess(pdfurl, pdfcontent) + pdfcontent = None + except ValueError, e: + errors.append(e) + except IndexError, e: + errors.append(e) + except urllib2.HTTPError, e: + errors.append(str(e) + " " + pdfurl) + +def process_page_queue(parser, errors): + try: + parser.process_pages() + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + except scraperwiki.CPUTimeExceededError, e: + errors.append("Processing pages interrupted") + +def process_journal_pdfs(parser, listurl, errors): +# print "Finding PDFs on " + listurl +# u = urllib.parse.urlparse(listurl) + html = scraperwiki.scrape(listurl) + root = lxml.html.fromstring(html) + html = None + for ahref in root.cssselect("div.article-content a"): + href = ahref.attrib['href'] + url = urlparse.urljoin(listurl, href) + if -1 != href.find("file://") or -1 == url.find("=File.getFile;"): +# print "Skipping non-http URL " + url + continue + if parser.is_already_scraped(url): + True +# print "Skipping already scraped " + url + else: +# print "Will process " + url + process_pdf(parser, url, errors) + +def test_small_pdfs(parser): + # Test with some smaller PDFs + errors = [] + process_pdf(parser, "http://met.no/Om_oss/Offentlig_journal/2012/?module=Files;action=File.getFile;ID=4570", errors) + process_page_queue(parser, errors) + report_errors(errors) + exit(0) + +errors = [] +parser = postlistelib.PDFJournalParser(agency=agency) + +#test_small_pdfs(parser) + +process_page_queue(parser, errors) +process_journal_pdfs(parser, "http://met.no/Om_oss/Offentlig_journal/2012/", errors) +process_journal_pdfs(parser, "http://met.no/Om_oss/Offentlig_journal/2011/", errors) +process_journal_pdfs(parser, 
"http://met.no/Om_oss/Offentlig_journal/2010/", errors) +process_journal_pdfs(parser, "http://met.no/Om_oss/Offentlig_journal/2009/", errors) +process_journal_pdfs(parser, "http://met.no/Om_oss/Offentlig_journal/2008/", errors) +process_page_queue(parser, errors) +report_errors(errors) + diff --git a/scrapersources/postliste-mrfylke b/scrapersources/postliste-mrfylke new file mode 100644 index 0000000..5c26ba3 --- /dev/null +++ b/scrapersources/postliste-mrfylke @@ -0,0 +1,82 @@ +# -*- coding: UTF-8 -*- + +import scraperwiki +import json +from BeautifulSoup import BeautifulSoup +import datetime +import dateutil.parser +import lxml.html +import resource +import sys +import urlparse +import re +lazycache=scraperwiki.swimport('lazycache') +postlistelib=scraperwiki.swimport('postliste-python-lib') + +agency = 'Møre og Romsdal fylkeskommune' + +def report_errors(errors): + if 0 < len(errors): + print "Errors:" + for e in errors: + print e + exit(1) +def out_of_cpu(arg, spent, hard, soft): + report_errors(arg) + +def process_pdf(parser, pdfurl, errors): + errors = [] + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + try: + pdfcontent = scraperwiki.scrape(pdfurl) + parser.preprocess(pdfurl, pdfcontent) + pdfcontent = None +# except ValueError, e: +# errors.append(e) + except IndexError, e: + errors.append(e) + +def process_page_queue(parser, errors): + try: + parser.process_pages() + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + except scraperwiki.CPUTimeExceededError, e: + errors.append("Processing pages interrupted") + +def process_journal_pdfs(parser, listurl, errors): +# print "Finding PDFs on " + listurl +# u = urllib.parse.urlparse(listurl) + html = scraperwiki.scrape(listurl) + root = lxml.html.fromstring(html) + html = None + for ahref in root.cssselect("div.attribute-long a"): + href = ahref.attrib['href'] + url = urlparse.urljoin(listurl, href) + if -1 != href.find("file://") or -1 != href.find("mailto:"): +# print "Skipping non-http URL " + url + continue + if parser.is_already_scraped(url): + True +# print "Skipping already scraped " + url + else: +# print "Will process " + url + process_pdf(parser, url, errors) + +def test_small_pdfs(parser): + # Test with some smaller PDFs + errors = [] + process_pdf(parser, "https://mrfylke.no/Media/Files/Filer-administrasjonsavdelinga/Dokumentsenteret/Oktober-2011/Offentleg-journal-03.10.11", errors) + process_page_queue(parser, errors) + report_errors(errors) + exit(0) + +errors = [] +parser = postlistelib.PDFJournalParser(agency=agency) + +#test_small_pdfs(parser) + +process_page_queue(parser, errors) +process_journal_pdfs(parser, "https://mrfylke.no/Organisasjon/Organisasjon/Administrasjonsavdelinga/Dokumentsenter/Offentleg-journal", errors) +process_page_queue(parser, errors) +report_errors(errors) + diff --git a/scrapersources/postliste-nrk b/scrapersources/postliste-nrk index 5c7929d..3379f31 100644 --- a/scrapersources/postliste-nrk +++ b/scrapersources/postliste-nrk @@ -1,7 +1,4 @@ # -*- coding: UTF-8 -*- -# Based on the scraper advanced-scraping-pdf -# See also -# https://views.scraperwiki.com/run/pdf-to-html-preview-1/?url=http%3A%2F%2Fwww.stortinget.no%2FGlobal%2Fpdf%2Fpostjournal%2Fpj-2012-05-09.pdf import scraperwiki import json @@ -9,39 +6,39 @@ from BeautifulSoup import BeautifulSoup import datetime import dateutil.parser import lxml.html -import resource -import sys import urlparse -import gc import re -frontpage = "http://www.nrk.no/contentfile/transformer/1.8052258" -scraperwiki.scrape(frontpage) 
+scraperwiki.scrape("http://www.nrk.no/innsyn/") lazycache=scraperwiki.swimport('lazycache') postlistelib=scraperwiki.swimport('postliste-python-lib') -agency = 'Universitetet i Oslo' +agency = 'Norsk Rikskringkasting AS' def report_errors(errors): if 0 < len(errors): print "Errors:" for e in errors: print e - exit(1) + raise ValueError(str(len(errors)) + " errors detected") + def out_of_cpu(arg, spent, hard, soft): report_errors(arg) def process_pdf(parser, pdfurl, errors): - errors = [] + if parser.is_already_scraped(pdfurl): + return postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) try: pdfcontent = scraperwiki.scrape(pdfurl) parser.preprocess(pdfurl, pdfcontent) pdfcontent = None -# except ValueError, e: -# errors.append(e) + except ValueError, e: + print e + errors.append(e) except IndexError, e: + print e errors.append(e) def process_page_queue(parser, errors): @@ -52,17 +49,14 @@ def process_page_queue(parser, errors): errors.append("Processing pages interrupted") def process_journal_pdfs(parser, listurl, errors): -# print "Finding PDFs on " + listurl + print "Finding PDFs on " + listurl # u = urllib.parse.urlparse(listurl) - html = scraperwiki.scrape(listurl) - root = lxml.html.fromstring(html) - html = None - for ahref in root.cssselect("table a"): - href = ahref.attrib['href'] - url = urlparse.urljoin(listurl, href) - if -1 != href.find("file://"): -# print "Skipping non-http URL " + url - continue + xml = scraperwiki.scrape(listurl) + root = lxml.html.fromstring(xml) + xml = None + for link in root.cssselect("hendelse link"): + url = lxml.html.tostring(link).replace("<link>", "").strip() + #print url if parser.is_already_scraped(url): True # print "Skipping already scraped " + url @@ -72,10 +66,50 @@ def process_journal_pdfs(parser, listurl, errors): def test_small_pdfs(parser): - parser.debug = True + #parser.debug = True errors = [] - process_pdf(parser, "http://nrk.no/contentfile/file/1.8116520!offentligjournal02052012.pdf", errors) # text + + # 2011: + if True: + process_pdf(parser, "http://home.nuug.no/~pere/nrk-postjournal/Offentlig%20journal%20NRK%200101_15012011.pdf", errors) + process_pdf(parser, "http://home.nuug.no/~pere/nrk-postjournal/Offentlig%20journal%20NRK%200102_10022011.pdf", errors) + process_pdf(parser, "http://home.nuug.no/~pere/nrk-postjournal/Offentlig%20journal%20NRK%200103_10032011.pdf", errors) + process_pdf(parser, "http://home.nuug.no/~pere/nrk-postjournal/Offentlig%20journal%20NRK%200104_07042011.pdf", errors) + process_pdf(parser, "http://home.nuug.no/~pere/nrk-postjournal/Offentlig%20journal%20NRK%200105_05052011.pdf", errors) + process_pdf(parser, "http://home.nuug.no/~pere/nrk-postjournal/Offentlig%20journal%20NRK%200106_18062011.pdf", errors) + process_pdf(parser, "http://home.nuug.no/~pere/nrk-postjournal/Offentlig%20journal%20NRK%200107_15072011.pdf", errors) + process_pdf(parser, "http://home.nuug.no/~pere/nrk-postjournal/Offentlig%20journal%20NRK%200108_15082011.pdf", errors) + process_pdf(parser, "http://home.nuug.no/~pere/nrk-postjournal/Offentlig%20journal%20NRK%200109_12092011.pdf", errors) + process_pdf(parser, "http://home.nuug.no/~pere/nrk-postjournal/Offentlig%20journal%20NRK%200110_12102011.pdf", errors) + process_pdf(parser, "http://home.nuug.no/~pere/nrk-postjournal/Offentlig%20journal%20NRK%200111_10112011.pdf", errors) + process_pdf(parser, "http://home.nuug.no/~pere/nrk-postjournal/Offentlig%20journal%20NRK%200112_10122011.pdf", errors) + process_pdf(parser, 
"http://home.nuug.no/~pere/nrk-postjournal/Offentlig%20journal%20NRK%200605_10052011.pdf", errors) + process_pdf(parser, "http://home.nuug.no/~pere/nrk-postjournal/Offentlig%20journal%20NRK%200804_15042011.pdf", errors) + process_pdf(parser, "http://home.nuug.no/~pere/nrk-postjournal/Offentlig%20journal%20NRK%201102_19022011.pdf", errors) + process_pdf(parser, "http://home.nuug.no/~pere/nrk-postjournal/Offentlig%20journal%20NRK%201103_17032011.pdf", errors) + process_pdf(parser, "http://home.nuug.no/~pere/nrk-postjournal/Offentlig%20journal%20NRK%201105_20052011.pdf", errors) + process_pdf(parser, "http://home.nuug.no/~pere/nrk-postjournal/Offentlig%20journal%20NRK%201111_20112011.pdf", errors) + process_pdf(parser, "http://home.nuug.no/~pere/nrk-postjournal/Offentlig%20journal%20NRK%201112_20122011.pdf", errors) + process_pdf(parser, "http://home.nuug.no/~pere/nrk-postjournal/Offentlig%20journal%20NRK%201309_25092011.pdf", errors) + process_pdf(parser, "http://home.nuug.no/~pere/nrk-postjournal/Offentlig%20journal%20NRK%201310_20102011.pdf", errors) + process_pdf(parser, "http://home.nuug.no/~pere/nrk-postjournal/Offentlig%20journal%20NRK%201601_31012011.pdf", errors) + process_pdf(parser, "http://home.nuug.no/~pere/nrk-postjournal/Offentlig%20journal%20NRK%201604_30042011.pdf", errors) + process_pdf(parser, "http://home.nuug.no/~pere/nrk-postjournal/Offentlig%20journal%20NRK%201607_31072011.pdf", errors) + process_pdf(parser, "http://home.nuug.no/~pere/nrk-postjournal/Offentlig%20journal%20NRK%201608_25082011.pdf", errors) + process_pdf(parser, "http://home.nuug.no/~pere/nrk-postjournal/Offentlig%20journal%20NRK%201803_26032011.pdf", errors) + process_pdf(parser, "http://home.nuug.no/~pere/nrk-postjournal/Offentlig%20journal%20NRK%201906_26062011.pdf", errors) + process_pdf(parser, "http://home.nuug.no/~pere/nrk-postjournal/Offentlig%20journal%20NRK%202105_31052011.pdf", errors) + process_pdf(parser, "http://home.nuug.no/~pere/nrk-postjournal/Offentlig%20journal%20NRK%202110_31102011.pdf", errors) + process_pdf(parser, "http://home.nuug.no/~pere/nrk-postjournal/Offentlig%20journal%20NRK%202111_30112011.pdf", errors) + process_pdf(parser, "http://home.nuug.no/~pere/nrk-postjournal/Offentlig%20journal%20NRK%202112_31122011.pdf", errors) + process_pdf(parser, "http://home.nuug.no/~pere/nrk-postjournal/Offentlig%20journal%20NRK%202502_28022011.pdf", errors) + process_pdf(parser, "http://home.nuug.no/~pere/nrk-postjournal/Offentlig%20journal%20NRK%202608_31082011.pdf", errors) + process_pdf(parser, "http://home.nuug.no/~pere/nrk-postjournal/Offentlig%20journal%20NRK%202609_30092011.pdf", errors) + process_pdf(parser, "http://home.nuug.no/~pere/nrk-postjournal/Offentlig%20journal%20NRK%202703_31032011.pdf", errors) + process_pdf(parser, "http://home.nuug.no/~pere/nrk-postjournal/Offentlig%20journal%20NRK%202706_30062011.pdf", errors) + + #process_pdf(parser, "http://nrk.no/contentfile/file/1.8116520!offentligjournal02052012.pdf", errors) # text #process_pdf(parser, "http://nrk.no/contentfile/file/1.8061384!offentlig%2002042012.pdf", errors) # Image #process_pdf(parser, "http://nrk.no/contentfile/file/1.8130287!offentligjournal09052012.pdf", errors) # Image process_page_queue(parser, errors) @@ -85,10 +119,9 @@ def test_small_pdfs(parser): errors = [] parser = postlistelib.PDFJournalParser(agency=agency, hiddentext=True) -test_small_pdfs(parser) +#test_small_pdfs(parser) # Based on http://www.nrk.no/innsyn/ -process_journal_pdfs(parser, frontpage, errors) +process_journal_pdfs(parser, 
"http://www.nrk.no/contentfile/transformer/1.8052258", errors) process_page_queue(parser, errors) -report_errors(errors) - +report_errors(errors)
\ No newline at end of file diff --git a/scrapersources/postliste-ntnu b/scrapersources/postliste-ntnu index 1a885c4..d6c6695 100644 --- a/scrapersources/postliste-ntnu +++ b/scrapersources/postliste-ntnu @@ -22,7 +22,7 @@ def report_errors(errors): print "Errors:" for e in errors: print e - raise ValueError("Something went wrong") + raise ValueError(str(len(errors)) + "errors detected") def out_of_cpu(arg, spent, hard, soft): report_errors(arg) @@ -39,7 +39,7 @@ def process_pdf(parser, pdfurl, errors): except IndexError, e: errors.append(e) except urllib2.HTTPError, e: - errors.append(e) + errors.append(str(e) + " " + pdfurl) def process_page_queue(parser, errors): try: diff --git a/scrapersources/postliste-oep b/scrapersources/postliste-oep index c7fdc82..735d0a7 100644 --- a/scrapersources/postliste-oep +++ b/scrapersources/postliste-oep @@ -9,7 +9,7 @@ import httplib import urllib2 # Try several times as the database get bigger -writetries = 5 +writetries = 6 # http://www.oep.no/search/resultSingle.html?journalPostId=1000000 # http://www.oep.no/search/resultSingle.html?journalPostId=3889259 @@ -102,7 +102,7 @@ def url_from_id(id): return "http://www.oep.no/search/resultSingle.html?journalPostId=" + str(id) def save(data): - for run in range(1,writetries): + for run in range(0,writetries): try: scraperwiki.sqlite.save(unique_keys=['journalPostId'], data=data) return @@ -112,7 +112,7 @@ def save(data): raise scraperwiki.sqlite.SqliteError("Unable to write to database, tried " + str(writetries) + " times") def save_var(var, data): - for run in range(1,writetries): + for run in range(0,writetries): try: scraperwiki.sqlite.save_var(var, data) return @@ -184,6 +184,8 @@ def fetch_range(first, last, step): fetched = 0 min_id = first for id in range(first, last, step): + if id < 0: + break try: tries = 3 while 0 < tries: @@ -309,6 +311,7 @@ def remove_original(): #update_doctypes() print "Starting to fetch journal entries " + str(datetime.datetime.now()) +scraperwiki.scrape("http://www.oep.no/") count = 10000 skiplimit = 500 # Random value fairly close to the most recent ID when this project started 2012-05-03 diff --git a/scrapersources/postliste-oslo-bydel-ullern b/scrapersources/postliste-oslo-bydel-ullern index 54a5031..614b12f 100644 --- a/scrapersources/postliste-oslo-bydel-ullern +++ b/scrapersources/postliste-oslo-bydel-ullern @@ -11,6 +11,9 @@ import dateutil.parser import lxml.html import urlparse import re + +scraperwiki.scrape("http://www.bydel-ullern.oslo.kommune.no/postjournal/") + #lazycache=scraperwiki.swimport('lazycache') postlistelib=scraperwiki.swimport('postliste-python-lib') @@ -21,19 +24,19 @@ def report_errors(errors): print "Errors:" for e in errors: print e - exit(1) + raise ValueError("Something went wrong") + def out_of_cpu(arg, spent, hard, soft): report_errors(arg) def process_pdf(parser, pdfurl, errors): - errors = [] postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) try: pdfcontent = scraperwiki.scrape(pdfurl) parser.preprocess(pdfurl, pdfcontent) pdfcontent = None -# except ValueError, e: -# errors.append(e) + except ValueError, e: + errors.append(e) except IndexError, e: errors.append(e) diff --git a/scrapersources/postliste-oslo-havn b/scrapersources/postliste-oslo-havn index d453ef7..1139b81 100644 --- a/scrapersources/postliste-oslo-havn +++ b/scrapersources/postliste-oslo-havn @@ -12,6 +12,9 @@ import lxml.html import sys import urlparse import re + +scraperwiki.scrape("http://www.havn.oslo.kommune.no/postjournal/") + 
lazycache=scraperwiki.swimport('lazycache') postlistelib=scraperwiki.swimport('postliste-python-lib') @@ -22,17 +25,17 @@ def report_errors(errors): print "Errors:" for e in errors: print e - exit(1) + raise ValueError(str(len(errors)) + " errors detected") + def out_of_cpu(arg, spent, hard, soft): report_errors(arg) def process_pdf(parser, pdfurl, errors): - errors = [] postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) try: parser.fetch_and_preprocess(pdfurl) -# except ValueError, e: -# errors.append(e) + except ValueError, e: + errors.append(e) except IndexError, e: errors.append(e) diff --git a/scrapersources/postliste-python-lib b/scrapersources/postliste-python-lib index 042d1fd..7176ae9 100644 --- a/scrapersources/postliste-python-lib +++ b/scrapersources/postliste-python-lib @@ -1,4 +1,4 @@ -# -*- coding: utf-8 -*- +# -*- coding: UTF-8 -*- # # Python library for parsing public post journals (postlister) in Norway. # @@ -100,6 +100,11 @@ class JournalParser: if -1 != entry['caseid'].find('-'): raise ValueError("Field caseid should not include dash: " + entry['caseid']) + + # Seen in http://home.nuug.no/~pere/nrk-postjournal/Offentlig%20journal%20NRK%200101_15012011.pdf + if 'sender' in entry and -1 != entry['sender'].find("Side: "): + raise ValueError("Field sender got page number, not real content") + # # Parser of PDFs looking like # http://www.storfjord.kommune.no/postliste-18-mai-2012.5056067-105358.html (type 1) @@ -122,6 +127,8 @@ class PDFJournalParser(JournalParser): # FIXME Figure out why this do not work #" and not (sender = 'parse error' or recipient != 'parse error') " + "limit 1", + + "scrapedurl from " + self.brokenpagetable + " where scrapedurl = '" + url + "' limit 1", "scrapedurl from " + self.pagetable + " where scrapedurl = '" + url + "' limit 1"]: try: result = scraperwiki.sqlite.select(sql) @@ -131,7 +138,8 @@ class PDFJournalParser(JournalParser): except Exception as e: #if ('no such table: %s' % self.pagetable) not in str(e) and 'no such table: swdata' not in str(e): # raise - print "Exception: %s" % e + #print "Ignoring exception: %s" % e + True return False # Check if we recognize the page content, and throw if not @@ -139,10 +147,7 @@ class PDFJournalParser(JournalParser): s = BeautifulSoup(pagecontent) for t in s.findAll('text'): if t.text != " ": - if 'Innhold:' == t.text: # type 1 or 2 (ePhorge) - s = None - return True - if 'Arkivdel:' == t.text]: # type 3 (doculive) + if 'Innhold:' == t.text: s = None return True s = None @@ -195,7 +200,6 @@ class PDFJournalParser(JournalParser): for i in range(0, len(entrytext)): print str(i) + ": '" + entrytext[i] + "'" - # ePhorte PDF def parse_entry_type1(self, entrytext, pdfurl): scrapestamputc = datetime.datetime.now() entry = { @@ -349,7 +353,6 @@ class PDFJournalParser(JournalParser): self.parse_case_journal_ref(entry, [u'2010/2593-2', u'103004/201', u'1'], "") self.parse_case_journal_ref(entry, [u'2011/13415-', u'22', u'100077/201', u'1'], "") - # ePhorte PDF def parse_entry_type2(self, entrytext, pdfurl): scrapestamputc = datetime.datetime.now() entry = { @@ -455,8 +458,7 @@ class PDFJournalParser(JournalParser): entrycount = 0 i = 0 while i < len(text): - if 'Innhold:' == text[i] \ # Type 1 and 2 (ePhorge) - or 'Arkivdel:' == text[i]: # type 3 (doculive) + if 'Innhold:' == text[i]: entrycount = entrycount + 1 i = i + 1 @@ -483,10 +485,12 @@ class PDFJournalParser(JournalParser): if self.debug: print "Entry " + str(entrycount) + " from " + str(i) + " to " + str(endi) + " ie " + str(endi - i) + " lines" 
try: + if pdfparser is None: + raise ValueError("Unrecognized page format in " + pdfurl) entry = pdfparser(text[i:endi], pdfurl) if 'caseid' not in entry or entry['caseid'] is None or \ not self.is_valid_doctype(entry['doctype']): - raise ValueError("Unable to parse " + pdfurl + " as format " + format + "[" + str(entry) + "]") + raise ValueError("Unable to parse " + pdfurl + " as format " + format + " [" + str(entry) + "]") # print entry datastore.append(entry) i = endi - 2 @@ -507,6 +511,7 @@ class PDFJournalParser(JournalParser): text = None def process_pages(self): + brokenpages = 0 try: sqlselect = "* from " + self.pagetable + " limit 1" pageref = scraperwiki.sqlite.select(sqlselect) @@ -525,15 +530,61 @@ class PDFJournalParser(JournalParser): 'scrapedurl' : scrapedurl, 'pagenum' : pagenum, 'pagecontent' : pagecontent, + 'failstamp' : datetime.datetime.now(), } - print "Broken page %d from %s" % (pagenum, scrapedurl) + print "Unsupported page %d from %s" % (pagenum, scrapedurl) + brokenpages = brokenpages + 1 scraperwiki.sqlite.save(unique_keys=['scrapedurl', 'pagenum'], data=brokenpage, table_name=self.brokenpagetable) scraperwiki.sqlite.execute(sqldelete) scraperwiki.sqlite.commit() pageref = scraperwiki.sqlite.select(sqlselect) + + # Last, try some of the broken pages again, in case we got support for handling them in the mean time + try: + # First, check if the table exist + scraperwiki.sqlite.execute("select * from " + self.brokenpagetable) + + newtrystamp = datetime.datetime.now() + sqlselect = "* from " + self.brokenpagetable + " where failstamp is NULL or failstamp < '" + str(newtrystamp) + "'" + " limit 1" + try: + pageref = scraperwiki.sqlite.select(sqlselect) + except scraperwiki.sqlite.SqliteError, e: + scraperwiki.sqlite.execute("ALTER TABLE " + self.brokenpagetable + " ADD COLUMN failstamp") + scraperwiki.sqlite.commit() + pageref = scraperwiki.sqlite.select(sqlselect) + + pagelimit = 10 + while pageref and 0 < pagelimit: + pagelimit = pagelimit - 1 + scrapedurl = pageref[0]['scrapedurl'] + pagenum = pageref[0]['pagenum'] + pagecontent = pageref[0]['pagecontent'] +# print "Found " + scrapedurl + " page " + str(pagenum) + " length " + str(len(pagecontent)) + try: + sqldelete = "delete from " + self.brokenpagetable + " where scrapedurl = '" + scrapedurl + "' and pagenum = " + str(pagenum) + self.parse_page(scrapedurl, pagenum, pagecontent) +# print "Trying to: " + sqldelete + scraperwiki.sqlite.execute(sqldelete) + except ValueError, e: + brokenpage = { + 'scrapedurl' : scrapedurl, + 'pagenum' : pagenum, + 'pagecontent' : pagecontent, + 'failstamp' : newtrystamp, + } + + print "Still unsupported page %d from %s" % (pagenum, scrapedurl) + brokenpages = brokenpages + 1 + scraperwiki.sqlite.save(unique_keys=['scrapedurl', 'pagenum'], data=brokenpage, table_name=self.brokenpagetable) + scraperwiki.sqlite.commit() + pageref = scraperwiki.sqlite.select(sqlselect) + except: + True # Ignore missing brokenpages table except scraperwiki.sqlite.SqliteError, e: print str(e) raise + if 0 < brokenpages: + raise ValueError("Found %d pages with unsupported format" % brokenpages) def fieldlist(): import urllib2 diff --git a/scrapersources/postliste-python-lib-doculive b/scrapersources/postliste-python-lib-doculive new file mode 100644 index 0000000..520c915 --- /dev/null +++ b/scrapersources/postliste-python-lib-doculive @@ -0,0 +1,649 @@ +# -*- coding: UTF-8 -*- +# +# Python library for parsing public post journals (postlister) in Norway. 
+# + +# Based on the scraper advanced-scraping-pdf +# +# See also +# https://views.scraperwiki.com/run/pdf-to-html-preview-1/ + +# Possible sources using format 1 pdf: +# www.bydel-ullern.oslo.kommune.no +# www.gravferdsetaten.oslo.kommune.no +# www.halden.kommune.no (done) +# www.havn.oslo.kommune.no (done) +# www.hvaler.kommune.no (done) +# www.kafjord.kommune.no +# www.lier.kommune.no +# www.lindesnes.kommune.no +# www.naroy.kommune.no +# www.saltdal.kommune.no +# www.sogne.kommune.no +# www.vikna.kommune.no +# +# Google search to find more: "Offentlig journal" Seleksjon Sakstittel Dokumenttype Status filetype:pdf + + +import scraperwiki +import string +import re +from BeautifulSoup import BeautifulSoup +import datetime +import dateutil.parser + +def cpu_spent(): + import resource + usage = resource.getrusage(resource.RUSAGE_SELF) + return getattr(usage, 'ru_utime') + getattr(usage, 'ru_stime') + +def exit_if_no_cpu_left(retval, callback=None, arg = None): + import resource + soft, hard = resource.getrlimit(resource.RLIMIT_CPU) + spent = cpu_spent() + if soft < spent: + if callback is not None: + callback(arg, spent, hard, soft) + print "Running out of CPU, exiting." + exit(retval) + +def fetch_url_harder(url, scraper = None): + import urllib2 + html = None + for n in [1, 2, 3]: + try: + if None == scraper: + scraper = scraperwiki.scrape + html = scraper(url) + break + except urllib2.URLError, e: + print "URLError fetching " + url + ", trying again" + return html + +class JournalParser: + agency = None + debug = False + + validdoctypes = ['I', 'U', 'X', 'N'] + senderdoctypes = ['I', 'X', 'N'] + recipientdoctypes = ['U'] + mustfields = { + 'agency' : 1, + 'docdesc' : 1, + 'doctype' : 1, + 'caseyear' : 1, + 'caseseqnr' : 1, + 'casedocseq' : 1, + } + + def __init__(self, agency): + self.agency = agency + + def is_valid_doctype(self, doctype): + return doctype in self.validdoctypes + + def is_sender_doctype(self, doctype): + return doctype in self.senderdoctypes + + def is_recipient_doctype(self, doctype): + return doctype in self.recipientdoctypes + + def verify_entry(self, entry): + + for field in self.mustfields: + if not field in entry: + raise ValueError("Missing required field " + field) + + if not self.is_valid_doctype(entry['doctype']): + raise ValueError("Invalid doctype " + doctype) + + if -1 != entry['caseid'].find('-'): + raise ValueError("Field caseid should not include dash: " + entry['caseid']) + +# +# Parser of PDFs looking like +# http://www.storfjord.kommune.no/postliste-18-mai-2012.5056067-105358.html (type 1) +# http://www.hadsel.kommune.no/component/docman/doc_download/946-offentlig-postjournal-28032012 (type 2) +# http://www.stortinget.no/Global/pdf/postjournal/pj-2011-06-23.pdf (type 2 variant) +# Note sender/receiver is not yet parsed for type 2 PDFs +class PDFJournalParser(JournalParser): + pagetable = "unparsedpages" + brokenpagetable = "brokenpages" + hiddentext = False + breakonfailure = True + + def __init__(self, agency, hiddentext=False): + self.hiddentext = hiddentext + JournalParser.__init__(self, agency=agency) + + def is_already_scraped(self, url): + # Ignore entries were sender and recipient is the result of a broken parser (before 2012-05-25) + for sql in ["scrapedurl, sender, recipient from swdata where scrapedurl = '" + url + "' " + + # FIXME Figure out why this do not work + #" and not (sender = 'parse error' or recipient != 'parse error') " + + "limit 1", + "scrapedurl from " + self.pagetable + " where scrapedurl = '" + url + "' limit 1"]: + try: + 
result = scraperwiki.sqlite.select(sql) + #int sql, " : ", result + if 0 < len(result) and u'scrapedurl' in result[0]: + return True + except Exception as e: + #if ('no such table: %s' % self.pagetable) not in str(e) and 'no such table: swdata' not in str(e): + # raise + print "Exception: %s" % e + return False + + # Check if we recognize the page content, and throw if not + def is_valid_page(self, pdfurl, pagenum, pagecontent): + s = BeautifulSoup(pagecontent) + for t in s.findAll('text'): + if t.text != " ": + if self.debug: + print t.text + if 'Innhold:' == t.text: # type 1 or 2 (ePhorge) + s = None + return True + if 'Arkivdel:' == t.text or 'Notater (X):' == t.text: # type 3 (doculive) + s = None + return True + s = None + if self.debug: + print "Unrecognized page format for " + pdfurl + raise ValueError("Unrecognized page format for " + pdfurl) + + # + # Split PDF content into pages and store in SQL table for later processing. + # The process is split in two to better handle parge PDFs (like 600 pages), + # without running out of CPU time without loosing track of what is left to + # parse. + def preprocess(self, pdfurl, pdfcontent): + print "Preprocessing PDF " + pdfurl + if not pdfcontent: + raise ValueError("No pdf content passed for " + pdfurl) + if self.hiddentext: + options = '-hidden' + else: + options = '' + xml=scraperwiki.pdftoxml(pdfcontent, options) + if self.debug: + print xml + pages=re.findall('(<page .+?</page>)',xml,flags=re.DOTALL) + xml=None +# print pages[:1][:1000] + pagecount = 0 + datastore = [] + for page in pages: + pagecount = pagecount + 1 + self.is_valid_page(pdfurl, pagecount, page) + data = { + 'scrapedurl' : pdfurl, + 'pagenum' : pagecount, + 'pagecontent' : page, + } + datastore.append(data) + if 0 < len(datastore): + scraperwiki.sqlite.save(unique_keys=['scrapedurl', 'pagenum'], data=datastore, table_name=self.pagetable) + else: + raise ValueError("Unable to find any pages in " + pdfurl) + pages = None + + def fetch_and_preprocess(self, pdfurl): + pdfcontent = fetch_url_harder(pdfurl) + self.preprocess(pdfurl, pdfcontent) + pdfcontent = None + + def print_entry(self, entrytext): + for i in range(0, len(entrytext)): + print str(i) + ": '" + entrytext[i] + "'" + + # ePhorte PDF + def parse_entry_type1(self, entrytext, pdfurl): + scrapestamputc = datetime.datetime.now() + entry = { + 'agency' : self.agency, + 'scrapestamputc' : scrapestamputc, + 'scrapedurl' : pdfurl + } + i = 0 + while i < len(entrytext): + #print "T: '" + entrytext[i] + "'" + if 'Innhold:' == entrytext[i]: + tittel = "" + # handle multi-line titles + while 'Sakstittel:' != entrytext[i+1]: + tittel = tittel + " " + entrytext[i+1] + i = i + 1 + entry['docdesc'] = tittel + if 'Sakstittel:' == entrytext[i]: + sakstittel = "" + while 'DokType' != entrytext[i+1]: +# print "'" + entrytext[i+1] + "'" + sakstittel = sakstittel + " " + entrytext[i+1] + i = i + 1 + entry['casedesc'] = sakstittel + if 'DokType' == entrytext[i]: # Values I/U/N/X from NOARK 4 table 14.2.11 + entry['doctype'] = entrytext[i+1] + # As seen on http://www.saltdal.kommune.no/images/module.files/2007-05-16.pdf, page 1 + if entry['doctype'] == 'S': + entry['doctype'] = 'X' + i = i + 1 + if 'Sak/dok nr:' == entrytext[i]: + # FIXME Split and handle combined sak/løpenr + # Use find('penr.:') to avoid non-ascii search string 'Løpenr.:' + caseid = None + lnr = None + if -1 != entrytext[i+4].find('penr.:'): + caseid = entrytext[i+1] + entrytext[i+2] + lnr = entrytext[i+3] + i = i + 4 + elif -1 != entrytext[i+3].find('penr.:'): + 
caseid = entrytext[i+1] + lnr = entrytext[i+2] + i = i + 3 + elif -1 != entrytext[i+2].find('penr.:'): + caseid, lnr = entrytext[i+1].split(" ") + i = i + 2 + + caseyear, caseseqnr = caseid.split("/") + entry['caseyear'] = int(caseyear) + caseseqnr, casedocseq = caseseqnr.split("-") + entry['caseseqnr'] = int(caseseqnr) + entry['casedocseq'] = int(casedocseq) + entry['caseid'] = caseyear + "/" + caseseqnr + + journalseqnr, journalyear = lnr.split("/") + entry['journalid'] = journalyear + "/" + journalseqnr + entry['journalyear'] = int(journalyear) + entry['journalseqnr'] = int(journalseqnr) + +# if -1 != text[i].find('penr.:'): # Use find('penr.:') to avoid non-ascii search string 'Løpenr.:' +# str = text[i-1] +# print "S: '" + str + "'" +# data['journalid'] = str +# # FIXME handle combined sak/løpenr + if 'Journaldato:' == entrytext[i]: + entry['recorddate'] = dateutil.parser.parse(entrytext[i-1], dayfirst=True) + if 'Dok.dato:' == entrytext[i]: + entry['docdate'] = dateutil.parser.parse(entrytext[i-1], dayfirst=True) + if 'Tilg.kode Hjemmel:' == entrytext[i] and 'Avsender\mottaker:' != entrytext[i+1]: + entry['exemption'] = entrytext[i+1] + i = i + 1 + if 'Tilg.kode' == entrytext[i]: + entry['accesscode'] = entrytext[i+1] + i = i + 1 + if 'Hjemmel:' == entrytext[i]: + entry['exemption'] = entrytext[i+1] + i = i + 1 + if 'Avsender\mottaker:' == entrytext[i]: + if i+1 < len(entrytext): # Non-empty field + fratil = entrytext[i+1] + i = i + 1 + if self.is_sender_doctype(entry['doctype']): + entry['sender'] = fratil + elif self.is_recipient_doctype(entry['doctype']): + entry['recipient'] = fratil + else: + raise ValueError("Case " + entry['caseid'] + " Sender/Recipient with doctype " + entry['doctype'] + " != I/U/X/N in " + pdfurl) + if self.debug: + print entry + i = i + 1 + return entry + + def parse_case_journal_ref(self, entry, reftext, pdfurl): + try: + # FIXME Split and handle combined sak/loepenr + # Use find('penr.:') to avoid non-ascii search string 'Loepenr.:' + caseid = None + lnr = None + if 4 == len(reftext): +# print "4 " + str(reftext) + caseid = reftext[0] + reftext[1] + lnr = reftext[2] + reftext[3] +# print str(caseid) + " " + str(lnr) + elif 3 == len(reftext): + if -1 != reftext[0].find("/") and -1 != reftext[2].find("/"): +# print "31" + caseid = reftext[0] + reftext[1] + lnr = reftext[2] + elif -1 != reftext[2].find("/"): +# print "32" + caseid = reftext[0] + reftext[1] + lnr = reftext[2] + elif -1 == reftext[2].find("/"): +# print "33" + caseid = reftext[0] + lnr = reftext[1] + reftext[2] + elif 2 == len(reftext): + if -1 == reftext[1].find("/"): +# print "21" + s = reftext[0] + reftext[1] +# print "S: " + s + caseid, lnr = s.split(" ") + elif -1 != reftext[1].find("/"): +# print "22" + caseid = reftext[0] + lnr = reftext[1] + elif 1 == len(reftext): + caseid, lnr = reftext[0].split(" ") + else: + raise ValueError("Unable to parse entry " + str(reftext) + " in " + pdfurl) +# print "C: " + caseid + " L: " + lnr + + caseyear, caseseqnr = caseid.split("/") + entry['caseyear'] = int(caseyear) + caseseqnr, casedocseq = caseseqnr.split("-") + entry['caseseqnr'] = int(caseseqnr) + entry['casedocseq'] = int(casedocseq) + entry['caseid'] = caseyear + "/" + caseseqnr + + journalseqnr, journalyear = lnr.split("/") + entry['journalid'] = journalyear + "/" + journalseqnr + entry['journalyear'] = int(journalyear) + entry['journalseqnr'] = int(journalseqnr) + except: + print "Unable to parse " + str(reftext) + return entry + def test_parse_case_journal_ref(self): + entry = {} + 
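+        # Regression samples: fragment layouts seen in real journals, where the
+        # case/journal reference is split over two, three or four text fragments.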
self.parse_case_journal_ref(entry, [u'2008/16414-', u'23', u'15060/2012'], "") + self.parse_case_journal_ref(entry, [u'2011/15972-1 102773/201', u'1'], "") + self.parse_case_journal_ref(entry, [u'2010/2593-2', u'103004/201', u'1'], "") + self.parse_case_journal_ref(entry, [u'2011/13415-', u'22', u'100077/201', u'1'], "") + + # ePhorte PDF + def parse_entry_type2(self, entrytext, pdfurl): + scrapestamputc = datetime.datetime.now() + entry = { + 'agency' : self.agency, + 'scrapestamputc' : scrapestamputc, + 'scrapedurl' : pdfurl + } + i = 0 + avsender = [] + mottaker = [] + while i < len(entrytext): + if 'Innhold:' == entrytext[i]: + tittel = "" + # handle multi-line titles + while 'Sakstittel:' != entrytext[i+1]: + tittel = tittel + entrytext[i+1] + i = i + 1 + entry['docdesc'] = tittel + if 'Sakstittel:' == entrytext[i]: + sakstittel = "" + # Klassering er i en annen dokumenttype + while 'DokType' != entrytext[i+1] and 'Dok.Type:' != entrytext[i+1] and 'Klassering:' != entrytext[i+1]: + +# print "'" + entrytext[i+1] + "'" + sakstittel = sakstittel + entrytext[i+1] + i = i + 1 + entry['casedesc'] = sakstittel + i = i + 1 + if 'DokType' == entrytext[i] or 'Dok.Type:' == entrytext[i]: # Values I/U/N/X from NOARK 4 table 14.2.11 + entry['doctype'] = entrytext[i+1] + # As seen on http://www.uis.no/getfile.php/Journal%20200612.pdf + if entry['doctype'] == 'S': + entry['doctype'] = 'X' + i = i + 1 + if 'Sak/dok nr:' == entrytext[i] or 'Sak/dok.nr:' == entrytext[i]: + endi = i + while endi < len(entrytext): + if -1 != entrytext[endi].find('penr.:') or -1 != entrytext[endi].find('penr:'): + break + endi = endi + 1 + entry = self.parse_case_journal_ref(entry, entrytext[i+1:endi], pdfurl) + i = endi + 1 +# if -1 != text[i].find('penr.:'): # Use find('penr.:') to avoid non-ascii search string 'Løpenr.:' +# str = text[i-1] +# print "S: '" + str + "'" +# data['journalid'] = str +# # FIXME handle combined sak/løpenr + if 'Journaldato:' == entrytext[i]: + entry['recorddate'] = dateutil.parser.parse(entrytext[i-1], dayfirst=True) + if 'Dok.dato:' == entrytext[i]: + entry['docdate'] = dateutil.parser.parse(entrytext[i-1], dayfirst=True) + if 'Tilg.kode Hjemmel:' == entrytext[i] and '(enhet/initialer):' != entrytext[i+2]: + entry['exemption'] = entrytext[i+1] + i = i + 1 + if 'Tilg.kode' == entrytext[i]: + entry['accesscode'] = entrytext[i+1] + i = i + 1 + if 'Hjemmel:' == entrytext[i]: + entry['exemption'] = entrytext[i+1] + i = i + 1 +# if -1 != text[i].find('Avs./mottaker:'): +# FIXME Need to handle senders and receivers + if 'Mottaker' == entrytext[i]: + mottaker.append(entrytext[i-1]) + if 'Avsender' == entrytext[i]: + avsender.append(entrytext[i-1]) +# entry['sender'] = 'parse error' +# entry['recipient'] = 'parse error' + i = i + 1 + if 0 < len(mottaker): + entry['recipient'] = string.join(mottaker, ", ") + if 0 < len(avsender): + entry['sender'] = string.join(avsender, ", ") + return entry + + def parse_entry_type3(self, entrytext, pdfurl): + scrapestamputc = datetime.datetime.now() + entry = { + 'agency' : self.agency, + 'scrapestamputc' : scrapestamputc, + 'scrapedurl' : pdfurl + } + cur = 0 + while cur < len(lines): + line = lines[cur].text + #print line + if -1 != line.find('Dok.dato:'): + entry['docid'] = lines[cur-2].text + entry['doctype'] = lines[cur-1].text + entry['docdate'] = parse_date(line.replace("Dok.dato:", "")) + caseyear, caseseqnr, casedocseq = split_docid(entry['docid']) + entry['caseyear'] = caseyear + entry['caseseqnr'] = caseseqnr + entry['casedocseq'] = casedocseq + 
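+                # NB: 'lines', 'parse_date()' and 'split_docid()' are not defined
+                # in this scraper, so this Doculive (type 3) parser looks
+                # unfinished and will fail if parse_page() ever selects it.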
entry['caseid'] = str(caseyear) + '/' + str(caseseqnr)
+            if -1 != line.find('Jour.dato:'):
+                entry['recorddate'] = parse_date(lines[cur+1].text)
+                cur = cur + 1
+            if -1 != line.find('Arkivdel:'):
+                entry['arkivdel'] = lines[cur+1].text
+                cur = cur + 1
+            if -1 != line.find('Tilg. kode:'):
+                entry['tilgangskode'] = line.replace("Tilg. kode:", "")
+            if -1 != line.find('Sak:'):
+                entry['casedesc'] = lines[cur+1].text
+                cur = cur + 1
+            if -1 != line.find('Dok:'):
+                entry['docdesc'] = lines[cur+1].text
+                cur = cur + 1
+            if -1 != line.find('Par.:'):
+                entry['exemption'] = line.replace("Par.:", "")
+                cur = cur + 1
+            if -1 != line.find('Avsender:'):
+                entry['sender'] = lines[cur+1].text
+                cur = cur + 1
+            if -1 != line.find('Mottaker:'):
+                entry['recipient'] = lines[cur+1].text
+                cur = cur + 1
+            if -1 != line.find('Saksansv:'):
+                entry['saksansvarlig'] = line.replace("Saksansv:", "").strip()
+            if -1 != line.find('Saksbeh:'):
+                entry['saksbehandler'] = lines[cur+1].text
+                cur = cur + 1
+            cur = cur + 1
+        print entry
+        return entry
+
+    def parse_page(self, pdfurl, pagenum, pagecontent):
+        print "Scraping " + pdfurl + " page " + str(pagenum)
+        s = BeautifulSoup(pagecontent)
+        datastore = []
+        text = []
+        linecount = 0
+        if self.debug:
+            print s
+        for t in s.findAll('text'):
+            if t.text != " ":
+                text.append(t.text)
+                if self.debug:
+                    print str(linecount) + ": " + t.text
+#                FIXME Remove length limit when working
+#                if 100 <= linecount:
+#                    break
+                linecount = linecount + 1
+#            if -1 != t.text.find("Side:"):
+#                print t.text
+        s = None
+
+#        print "Found " + str(linecount) + " lines/text fragments in the PDF"
+        if len(text) < linecount:
+            raise ValueError("Text array too short!")
+
+        # First count how many entries to expect on this page, to be able to
+        # verify that all of them were found.
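+        # Every ePhorte entry starts with an 'Innhold:' fragment and every
+        # Doculive entry with 'Arkivdel:', so counting those markers gives the
+        # number of entries this page should produce.  If fewer are parsed, the
+        # ValueError raised below makes process_pages() move the page to the
+        # brokenpages table instead of silently dropping entries.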
+ entrycount = 0 + i = 0 + while i < len(text): + # Type 1 and 2 (ePhorge) + if 'Innhold:' == text[i] or \ + 'Arkivdel:' == text[i]: # type 3 (doculive) + entrycount = entrycount + 1 + i = i + 1 + + i = 0 + while i < len(text): + if self.debug: + print "T: '" + text[i] + "'" + if self.debug and -1 != text[i].find("Side:"): + print text[i] + if 'Innhold:' == text[i]: + endi = i + 1 + pdfparser = None + format = "unknown" + while endi < len(text): + if 'Klassering:' == text[endi]: + print "Found ePhorte PDF (type 1)" + pdfparser = self.parse_entry_type2 + format = "type2" + if 'Avsender\mottaker:' == text[endi]: + print "Found ePhorge PDF (type 2)" + pdfparser = self.parse_entry_type1 + format = "type1" + if 'Arkivdel:' == text[endi]: + print "Found Doculive PDF" + pdfparser = self.parse_entry_type3 + format = "type3" + if 'Innhold:' == text[endi]: + break + endi = endi + 1 + if self.debug: + print "Entry " + str(entrycount) + " from " + str(i) + " to " + str(endi) + " ie " + str(endi - i) + " lines" + try: + entry = pdfparser(text[i:endi], pdfurl) + if 'caseid' not in entry or entry['caseid'] is None or \ + not self.is_valid_doctype(entry['doctype']): + raise ValueError("Unable to parse " + pdfurl + " as format " + format + "[" + str(entry) + "]") +# print entry + datastore.append(entry) + i = endi - 2 + except: + self.print_entry(text[i:endi]) + raise + i = i + 1 +# print data +# print "Found " + str(len(datastore)) + " of " + str(entrycount) + " entries" + if entrycount != len(datastore): +# print text + raise ValueError("Unable to parse all entries in " + pdfurl) + if 0 == len(datastore): + print "Unable to find any entries in " + pdfurl + else: + scraperwiki.sqlite.save(unique_keys=['caseid', 'casedocseq'], data=datastore) + datastore = None + text = None + + def process_pages(self): + try: + sqlselect = "* from " + self.pagetable + " limit 1" + pageref = scraperwiki.sqlite.select(sqlselect) + while pageref: + scrapedurl = pageref[0]['scrapedurl'] + pagenum = pageref[0]['pagenum'] + pagecontent = pageref[0]['pagecontent'] +# print "Found " + scrapedurl + " page " + str(pagenum) + " length " + str(len(pagecontent)) + try: + sqldelete = "delete from " + self.pagetable + " where scrapedurl = '" + scrapedurl + "' and pagenum = " + str(pagenum) + self.parse_page(scrapedurl, pagenum, pagecontent) +# print "Trying to: " + sqldelete + scraperwiki.sqlite.execute(sqldelete) + except ValueError, e: + brokenpage = { + 'scrapedurl' : scrapedurl, + 'pagenum' : pagenum, + 'pagecontent' : pagecontent, + } + print "Broken page %d from %s" % (pagenum, scrapedurl) + scraperwiki.sqlite.save(unique_keys=['scrapedurl', 'pagenum'], data=brokenpage, table_name=self.brokenpagetable) + print e + scraperwiki.sqlite.execute(sqldelete) + scraperwiki.sqlite.commit() + pageref = scraperwiki.sqlite.select(sqlselect) + except scraperwiki.sqlite.SqliteError, e: + print str(e) + raise + +def fieldlist(): + import urllib2 + import json + + scrapers = [ + 'postliste-universitetet-i-oslo', + 'postliste-lindesnes', + 'postliste-kristiansund', + 'postliste-stortinget', + 'postliste-arendal', + 'postliste-oep', + 'postliste-ballangen', + 'postliste-hadsel', + 'postliste-storfjord', + 'postliste-oslo-havn', + ] + + keys = {} + + for scraper in scrapers: + url = 'https://api.scraperwiki.com/api/1.0/scraper/getinfo?format=jsondict&name=' + scraper + '&version=-1' + response = urllib2.urlopen(url) + html = response.read() + data = json.loads(html) + if 'swdata' in data[0]['datasummary']['tables']: + for key in 
data[0]['datasummary']['tables']['swdata']['keys']: + key = key.lower() + if key in keys: + keys[key].append(scraper) + else: + keys[key] = [scraper] + def lensort(a, b): + return cmp(len(keys[b]), len(keys[a])) + + for key in sorted(keys.keys(), lensort): + print len(keys[key]), key, str(keys[key]) + +def test_parser(): + parser = PDFJournalParser(agency="Dummy agency") + parser.debug = True + for url in [ #"http://www.stortinget.no/Global/pdf/postjournal/pj-2011-06-23.pdf", + "http://www.radhusets-forvaltningstjeneste.oslo.kommune.no/getfile.php/rådhusets%20forvaltningstjeneste%20(RFT)/Intranett%20(RFT)/Dokumenter/Postjournal/11%20November/29112011.pdf"]: + pdfcontent = scraperwiki.scrape(url) + parser.preprocess(url,pdfcontent) + parser.process_pages() + + +if __name__ == "scraper": + test_parser() +# fieldlist() diff --git a/scrapersources/postliste-stavanger-universitetssjukehus b/scrapersources/postliste-stavanger-universitetssjukehus new file mode 100644 index 0000000..5a9dc08 --- /dev/null +++ b/scrapersources/postliste-stavanger-universitetssjukehus @@ -0,0 +1,81 @@ +# -*- coding: UTF-8 -*- + +import scraperwiki +import json +from BeautifulSoup import BeautifulSoup +import datetime +import dateutil.parser +import lxml.html +import resource +import sys +import urlparse +import re +lazycache=scraperwiki.swimport('lazycache') +postlistelib=scraperwiki.swimport('postliste-python-lib') + +agency = 'Stavanger Universitetssjukehus – Helse Stavanger HF' + +def report_errors(errors): + if 0 < len(errors): + print "Errors:" + for e in errors: + print e + exit(1) +def out_of_cpu(arg, spent, hard, soft): + report_errors(arg) + +def process_pdf(parser, pdfurl, errors): + errors = [] + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + try: + pdfcontent = scraperwiki.scrape(pdfurl) + parser.preprocess(pdfurl, pdfcontent) + pdfcontent = None +# except ValueError, e: +# errors.append(e) + except IndexError, e: + errors.append(e) + +def process_page_queue(parser, errors): + try: + parser.process_pages() + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + except scraperwiki.CPUTimeExceededError, e: + errors.append("Processing pages interrupted") + +def process_journal_pdfs(parser, listurl, errors): +# print "Finding PDFs on " + listurl +# u = urllib.parse.urlparse(listurl) + html = scraperwiki.scrape(listurl) + root = lxml.html.fromstring(html) + html = None + for ahref in root.cssselect("table a"): + href = ahref.attrib['href'] + url = urlparse.urljoin(listurl, href) + if -1 != href.find("file://") or -1 == url.find(".pdf"): +# print "Skipping non-http URL " + url + continue + if parser.is_already_scraped(url): + True +# print "Skipping already scraped " + url + else: +# print "Will process " + url + process_pdf(parser, url, errors) + +def test_small_pdfs(parser): + # Test with some smaller PDFs + errors = [] + process_pdf(parser, "http://www.sus.no/aktuelt/postjournal/Documents/2012/2012-06-18.pdf", errors) + process_page_queue(parser, errors) + report_errors(errors) + exit(0) + +errors = [] +parser = postlistelib.PDFJournalParser(agency=agency) + +#test_small_pdfs(parser) + +process_page_queue(parser, errors) +process_journal_pdfs(parser, "http://www.sus.no/aktuelt/postjournal/Sider/side.aspx", errors) +process_page_queue(parser, errors) +report_errors(errors)
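Note that process_pdf() in this scraper starts by rebinding errors to a fresh list, so exceptions collected while fetching a PDF never reach the list passed in at the bottom of the file and report_errors() will not see them. A minimal sketch of the variant used by the sibling scrapers in this commit (Universitetet i Agder, UNN), assuming the failures are meant to propagate:

def process_pdf(parser, pdfurl, errors):
    # Append into the caller's list instead of rebinding it, so
    # report_errors() at the end of the run sees the failures.
    postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
    try:
        pdfcontent = scraperwiki.scrape(pdfurl)
        parser.preprocess(pdfurl, pdfcontent)
        pdfcontent = None
    except ValueError, e:
        errors.append(e)
    except IndexError, e:
        errors.append(e)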
\ No newline at end of file diff --git a/scrapersources/postliste-universitetet-i-agder b/scrapersources/postliste-universitetet-i-agder new file mode 100644 index 0000000..cfdfddc --- /dev/null +++ b/scrapersources/postliste-universitetet-i-agder @@ -0,0 +1,85 @@ +# -*- coding: UTF-8 -*- + +import scraperwiki +import json +from BeautifulSoup import BeautifulSoup +import datetime +import dateutil.parser +import lxml.html +import resource +import sys +import urlparse +import re + +# Make sure Scraperwiki believe this is the source from this database +scraperwiki.scrape("http://www.uia.no/no/portaler/om_universitetet/offentlig_journal") + +lazycache=scraperwiki.swimport('lazycache') +postlistelib=scraperwiki.swimport('postliste-python-lib') + +agency = 'Universitetet i Agder' + +def report_errors(errors): + if 0 < len(errors): + print "Errors:" + for e in errors: + print e + raise ValueError("Something went wrong") + +def out_of_cpu(arg, spent, hard, soft): + report_errors(arg) + +def process_pdf(parser, pdfurl, errors): + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + try: + pdfcontent = scraperwiki.scrape(pdfurl) + parser.preprocess(pdfurl, pdfcontent) + pdfcontent = None + except ValueError, e: + errors.append(e) + except IndexError, e: + errors.append(e) + +def process_page_queue(parser, errors): + try: + parser.process_pages() + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + except scraperwiki.CPUTimeExceededError, e: + errors.append("Processing pages interrupted") + +def process_journal_pdfs(parser, listurl, errors): +# print "Finding PDFs on " + listurl +# u = urllib.parse.urlparse(listurl) + html = scraperwiki.scrape(listurl) + root = lxml.html.fromstring(html) + html = None + for ahref in root.cssselect("table a"): + href = ahref.attrib['href'] + url = urlparse.urljoin(listurl, href).replace(" ", "%20") + if -1 != href.find("file://") or -1 == url.find(".pdf"): +# print "Skipping non-http URL " + url + continue + if parser.is_already_scraped(url): + True +# print "Skipping already scraped " + url + else: +# print "Will process " + url + process_pdf(parser, url, errors) + +def test_small_pdfs(parser): + # Test with some smaller PDFs + errors = [] + process_pdf(parser, "http://www.uia.no/no/content/download/297514/5641673/file/Uke%2018.pdf", errors) + process_page_queue(parser, errors) + report_errors(errors) + exit(0) + +errors = [] +parser = postlistelib.PDFJournalParser(agency=agency) + +#test_small_pdfs(parser) + +process_journal_pdfs(parser, "http://www.uia.no/no/portaler/om_universitetet/offentlig_journal", errors) +process_page_queue(parser, errors) +report_errors(errors) + diff --git a/scrapersources/postliste-universitetssykehuset-nord-norge b/scrapersources/postliste-universitetssykehuset-nord-norge new file mode 100644 index 0000000..1b06793 --- /dev/null +++ b/scrapersources/postliste-universitetssykehuset-nord-norge @@ -0,0 +1,96 @@ +# -*- coding: UTF-8 -*- + + +import scraperwiki +import json +from BeautifulSoup import BeautifulSoup +import datetime +import dateutil.parser +import lxml.html +import urlparse +import re + +# Make sure Scraperwiki believe this is the source from this database +scraperwiki.scrape("http://www.unn.no/offentlig-postjournal/category8944.html") + +lazycache=scraperwiki.swimport('lazycache') +postlistelib=scraperwiki.swimport('postliste-python-lib') + +agency = 'Universitetssykehuset Nord-Norge' + +def report_errors(errors): + if 0 < len(errors): + print "Errors:" + for e in errors: + print e + raise ValueError("Something 
went wrong") + +def out_of_cpu(arg, spent, hard, soft): + report_errors(arg) + +def process_pdf(parser, pdfurl, errors): + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + try: + pdfcontent = scraperwiki.scrape(pdfurl) + parser.preprocess(pdfurl, pdfcontent) + pdfcontent = None + except ValueError, e: + errors.append(e) + except IndexError, e: + errors.append(e) + +def process_page_queue(parser, errors): + try: + parser.process_pages() + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + except scraperwiki.CPUTimeExceededError, e: + errors.append("Processing pages interrupted") + +def process_journal_pdfs(parser, listurl, errors): +# print "Finding PDFs on " + listurl +# u = urllib.parse.urlparse(listurl) + html = scraperwiki.scrape(listurl) + root = lxml.html.fromstring(html) + html = None + for ahref in root.cssselect("div.month-entry-title a"): + href = ahref.attrib['href'] + url = urlparse.urljoin(listurl, href) + print url + if -1 != href.find("file://"): +# print "Skipping non-http URL " + url + continue + subhtml = scraperwiki.scrape(url) + subroot = lxml.html.fromstring(subhtml) + subhtml = None + for subahref in subroot.cssselect("div.related-attachements a"): + subhref = subahref.attrib['href'] + suburl = urlparse.urljoin(url, subhref) + if -1 == suburl.find(".pdf"): + continue + if parser.is_already_scraped(suburl): + True +# print "Skipping already scraped " + url + else: +# print "Will process " + url + process_pdf(parser, suburl, errors) + +def test_small_pdfs(parser): + # Test with some smaller PDFs + errors = [] + process_pdf(parser, "http://www.unn.no/getfile.php/UNN-Internett/Media/Postjournal/UNN%20offentlig%20journal%202007/200807.pdf", errors) + process_page_queue(parser, errors) + report_errors(errors) + exit(0) + +errors = [] +parser = postlistelib.PDFJournalParser(agency=agency) + +#test_small_pdfs(parser) + +process_page_queue(parser, errors) +process_journal_pdfs(parser, "http://www.unn.no/offentlig-postjournal/category8944.html", errors) +for year in range(2011, 2007, -1): + process_journal_pdfs(parser, "http://www.unn.no/offentlig-postjournal/category8944.html?year=" + str(year), errors) +process_page_queue(parser, errors) +report_errors(errors) + |
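The three new scrapers share the same two-phase flow from postliste-python-lib: collect PDF links from a listing page, queue each unseen PDF page by page with preprocess(), then turn the queued pages into journal entries with process_pages() before the CPU allowance runs out. A condensed sketch of that flow follows; harvest_pdf_links(), AGENCY and LISTURL are illustrative names introduced here, while the library calls are the ones used above.

# -*- coding: UTF-8 -*-
# Condensed sketch of the harvesting pattern shared by the scrapers above.
import urlparse
import lxml.html
import scraperwiki

postlistelib = scraperwiki.swimport('postliste-python-lib')

AGENCY = 'Eksempel kommune'                       # illustrative only
LISTURL = 'http://www.example.org/postjournal/'   # illustrative only

def harvest_pdf_links(parser, listurl, errors, selector="table a"):
    # Phase 1: find PDF links on the listing page and queue the unseen ones.
    root = lxml.html.fromstring(scraperwiki.scrape(listurl))
    for ahref in root.cssselect(selector):
        url = urlparse.urljoin(listurl, ahref.attrib['href'])
        if -1 != url.find("file://") or -1 == url.find(".pdf"):
            continue  # not a fetchable PDF
        if parser.is_already_scraped(url):
            continue  # already parsed or already queued
        try:
            # Split the PDF into pages and store them in the
            # 'unparsedpages' table for later parsing.
            parser.preprocess(url, scraperwiki.scrape(url))
        except (ValueError, IndexError), e:
            errors.append(e)

errors = []
parser = postlistelib.PDFJournalParser(agency=AGENCY)
harvest_pdf_links(parser, LISTURL, errors)
# Phase 2: parse queued pages into journal entries; pages that fail to
# parse end up in the 'brokenpages' table so the run can continue.
parser.process_pages()

Splitting the fetch/queue step from the parsing step is what lets a run that hits the ScraperWiki CPU limit pick up where it left off: anything still in unparsedpages is simply parsed on the next scheduled run.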