author    Petter Reinholdtsen <pere@hungry.com>  2012-07-13 12:34:28 +0200
committer Petter Reinholdtsen <pere@hungry.com>  2012-07-13 12:34:28 +0200
commit    c99e7bfda6e025314b6a7c6683a1bc3c5818621c (patch)
tree      ef4230a7830973c37e9d83c9e773d41a56834658
parent    22bceaf65dd89df97529df0102149aefa2b54f54 (diff)
Updated from scraperwiki.
-rw-r--r--  scrapersources/list-nuug-postliste-scrapers  90
-rw-r--r--  scrapersources/postliste-halden  2
-rw-r--r--  scrapersources/postliste-hoegskolen-i-finnmark  86
-rw-r--r--  scrapersources/postliste-hoegskolen-i-gjoevik  37
-rw-r--r--  scrapersources/postliste-hoegskolen-i-nord-troendelag  88
-rw-r--r--  scrapersources/postliste-hoegskolen-i-telemark  86
-rw-r--r--  scrapersources/postliste-hoegskolen-i-volda  88
-rw-r--r--  scrapersources/postliste-hvaler  18
-rw-r--r--  scrapersources/postliste-kafjord-kommune  93
-rw-r--r--  scrapersources/postliste-loppa-kommune  88
-rw-r--r--  scrapersources/postliste-met  91
-rw-r--r--  scrapersources/postliste-mrfylke  82
-rw-r--r--  scrapersources/postliste-nrk  91
-rw-r--r--  scrapersources/postliste-ntnu  4
-rw-r--r--  scrapersources/postliste-oep  9
-rw-r--r--  scrapersources/postliste-oslo-bydel-ullern  11
-rw-r--r--  scrapersources/postliste-oslo-havn  11
-rw-r--r--  scrapersources/postliste-python-lib  75
-rw-r--r--  scrapersources/postliste-python-lib-doculive  649
-rw-r--r--  scrapersources/postliste-stavanger-universitetssjukehus  81
-rw-r--r--  scrapersources/postliste-universitetet-i-agder  85
-rw-r--r--  scrapersources/postliste-universitetssykehuset-nord-norge  96
22 files changed, 1881 insertions, 80 deletions
diff --git a/scrapersources/list-nuug-postliste-scrapers b/scrapersources/list-nuug-postliste-scrapers
new file mode 100644
index 0000000..67c4158
--- /dev/null
+++ b/scrapersources/list-nuug-postliste-scrapers
@@ -0,0 +1,90 @@
+import os
+import urlparse
+urlquery = os.getenv('URLQUERY')
+
+if urlquery:
+ querydata = urlparse.parse_qsl(urlquery);
+ for pair in querydata:
+ if pair[0] == "js" and pair[1] == "jquery.js":
+ print 'js-sourcecode'
+ exit(0)
+
+import urllib2, json, re
+import yaml
+
+url = "https://api.scraperwiki.com/api/1.0/scraper/search?format=jsondict&maxrows=200&searchquery=nuug-postliste-endyaml"
+json_data = json.load(urllib2.urlopen(url))
+print '''<html>
+<head>
+<link rel="stylesheet" href="https://views.scraperwiki.com/run/jquery-tablesorter/?file=style-blue.css" type="text/css" />
+<script type="text/javascript" src="https://views.scraperwiki.com/run/jquery-tablesorter/?file=jquery-1-4-2-min.js"></script>
+<script type="text/javascript" src="https://views.scraperwiki.com/run/jquery-tablesorter/?file=jquery.tablesorter.2-0-5.min.js"></script>'''
+
+print '''</head><body>
+<p>This view lists scrapers with YAML-compatible comments (containing the string "nuug-postliste-endyaml") like the following in their description:
+<pre>
+&lt;!-- nuug-postliste-yaml --&gt;
+YAML-tagger:&lt;br&gt;
+Type: kommune&lt;br&gt;
+Status: finished&lt;br&gt;
+Name: Lillesteinsmyr kommune&lt;br&gt;
+Format: PDF&lt;br&gt;
+Datatype: ePhorte&lt;br&gt;
+Run: daily&lt;br&gt;
+&lt;!-- nuug-postliste-endyaml --&gt;
+</pre></p>
+<table id="myTable" class="tablesorter">'''
+
+print '<thead><tr><th>Name</th><th>type</th><th>status</th><th>schedule</th><th>format</th><th>datatype</th><th>URL</th></tr></thead><tbody>'
+counter = {}
+for scraper in json_data:
+ print scraper
+ comment = re.findall(r'<!-- nuug-postliste-yaml -->(.*)<!-- nuug-postliste-endyaml -->',
+ scraper['description'], re.DOTALL)
+ assert len(comment) == 1
+ data = yaml.load(comment[0].strip().replace('<br>',''))
+
+ if data['Type'] in counter:
+ counter[data['Type']] = counter[data['Type']] + 1
+ else:
+ counter[data['Type']] = 1
+
+ if 'Run' in data: Run = data['Run']
+ else: Run = 'unknown'
+
+ if 'Format' in data: Format = data['Format']
+ else: Format = 'unknown'
+
+ if 'Datatype' in data: Type = data['Datatype']
+ else: Type = 'unknown'
+
+
+ print '<tr><td>%s</td><td>%s</td><td>%s</td><td>%s</td><td>%s</td><td>%s</td><td><a href="https://scraperwiki.com/scrapers/%s/">URL</a></td></tr>' % \
+ (data['Name'],data['Type'],data['Status'], Run, Format, Type, scraper['short_name'])
+print '''</tbody></table><table id="myTable2" class="tablesorter"><thead><tr><th>type</th><th>count</th></tr></thead><tbody>'''
+
+for key in counter:
+ print '<tr><td>%s</td><td>%d</td></tr>' % (key, counter[key])
+print '</tbody></table>'
+
+num_kommune = float(429)
+num_fylke = float(19)
+print '<table class="tablesorter"><thead><tr><td>Type</td><td>Prosent</td></tr></thead><tbody>'
+try:
+ print "<tr><td>Kommune</td><td>%.2f%% (%d av %d)</td></tr>" % \
+ ((float(counter['kommune'])/float(num_kommune))*100, counter['kommune'], num_kommune)
+except KeyError: pass
+try:
+ print "<tr><td>Fylkeskommune</td><td>%.2f%% (%d av %d)</td></tr>" % \
+ ((float(counter['fylkeskommune'])/float(num_fylke))*100, counter['fylkeskommune'], num_fylke)
+except KeyError: pass
+print '''</tbody></table>
+<script type="text/javascript">
+ $(document).ready(function()
+ {
+ $("#myTable").tablesorter();
+ $("#myTable2").tablesorter();
+ }
+ );
+</script>
+</body></html>'''
\ No newline at end of file
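
For readers unfamiliar with the convention described in the view's help text above: each scraper embeds a small YAML block between two HTML comments in its ScraperWiki description, and the view recovers it with a regular expression before handing it to yaml.load(). A minimal, self-contained sketch of that extraction step, using a made-up description string that follows the documented format:

# -*- coding: UTF-8 -*-
import re
import yaml

# Hypothetical description text, following the convention documented above.
description = """Some scraper description.
<!-- nuug-postliste-yaml -->
YAML-tagger:<br>
Type: kommune<br>
Status: finished<br>
Name: Lillesteinsmyr kommune<br>
Format: PDF<br>
Datatype: ePhorte<br>
Run: daily<br>
<!-- nuug-postliste-endyaml -->"""

# Grab everything between the two marker comments.
comment = re.findall(r'<!-- nuug-postliste-yaml -->(.*)<!-- nuug-postliste-endyaml -->',
                     description, re.DOTALL)
assert len(comment) == 1

# Strip the <br> tags and parse the remaining lines as YAML.
data = yaml.load(comment[0].strip().replace('<br>', ''))
print data['Name'], data['Type'], data.get('Run', 'unknown')
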
diff --git a/scrapersources/postliste-halden b/scrapersources/postliste-halden
index 4b0ebd5..e7c2d30 100644
--- a/scrapersources/postliste-halden
+++ b/scrapersources/postliste-halden
@@ -83,7 +83,7 @@ def test_small_pdfs(parser):
errors = []
parser = postlistelib.PDFJournalParser(agency=agency)
-#parser.debug = True
+parser.debug = True
#test_small_pdfs(parser)
process_page_queue(parser, errors)
diff --git a/scrapersources/postliste-hoegskolen-i-finnmark b/scrapersources/postliste-hoegskolen-i-finnmark
new file mode 100644
index 0000000..2a4b972
--- /dev/null
+++ b/scrapersources/postliste-hoegskolen-i-finnmark
@@ -0,0 +1,86 @@
+# -*- coding: UTF-8 -*-
+
+import scraperwiki
+import json
+from BeautifulSoup import BeautifulSoup
+import datetime
+import dateutil.parser
+import lxml.html
+import resource
+import sys
+import urlparse
+import re
+
+# Make sure ScraperWiki believes this is the source for this database
+scraperwiki.scrape("http://www.hifm.no/nor/www_hifm_no/hogskolen-i-finnmark-_-startside/om-hogskolen/om-hogskolen/offentlig-journal-1")
+
+lazycache=scraperwiki.swimport('lazycache')
+postlistelib=scraperwiki.swimport('postliste-python-lib')
+
+agency = 'Høgskolen i Finnmark'
+
+def report_errors(errors):
+ if 0 < len(errors):
+ print "Errors:"
+ for e in errors:
+ print e
+ raise ValueError("Something went wrong")
+
+def out_of_cpu(arg, spent, hard, soft):
+ report_errors(arg)
+
+def process_pdf(parser, pdfurl, errors):
+ postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
+ try:
+ pdfcontent = scraperwiki.scrape(pdfurl)
+ parser.preprocess(pdfurl, pdfcontent)
+ pdfcontent = None
+ except ValueError, e:
+ errors.append(e)
+ except IndexError, e:
+ errors.append(e)
+
+def process_page_queue(parser, errors):
+ try:
+ parser.process_pages()
+ postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
+ except scraperwiki.CPUTimeExceededError, e:
+ errors.append("Processing pages interrupted")
+
+def process_journal_pdfs(parser, listurl, errors):
+# print "Finding PDFs on " + listurl
+# u = urllib.parse.urlparse(listurl)
+ html = scraperwiki.scrape(listurl)
+ root = lxml.html.fromstring(html)
+ html = None
+ for ahref in root.cssselect("div.content-padding a"):
+ href = ahref.attrib['href']
+ url = urlparse.urljoin(listurl, href)
+ if -1 != href.find("file://") or -1 == url.find("/download_journal.php"):
+# print "Skipping non-http URL " + url
+ continue
+ if parser.is_already_scraped(url):
+ True
+# print "Skipping already scraped " + url
+ else:
+# print "Will process " + url
+ process_pdf(parser, url, errors)
+
+def test_small_pdfs(parser):
+ # Test with some smaller PDFs
+ errors = []
+ process_pdf(parser, "http://www.hifm.no/neted/includes/hifm/download_journal.php?fn=120503", errors)
+ process_page_queue(parser, errors)
+ report_errors(errors)
+ exit(0)
+
+errors = []
+parser = postlistelib.PDFJournalParser(agency=agency)
+
+#test_small_pdfs(parser)
+
+process_page_queue(parser, errors)
+process_journal_pdfs(parser, "http://www.hifm.no/nor/www_hifm_no/hogskolen-i-finnmark-_-startside/om-hogskolen/om-hogskolen/offentlig-journal-1/?&type=a", errors)
+process_page_queue(parser, errors)
+report_errors(errors)
+
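
The new institution scrapers in this commit (Høgskolen i Finnmark above, and the ones that follow) all share the same structure and differ mainly in source URL, CSS selector and URL filter. A condensed sketch of that shared flow, assuming the ScraperWiki runtime and the postliste-python-lib module used above; the agency, URL and selector here are placeholders:

# -*- coding: UTF-8 -*-
# Condensed sketch of the skeleton shared by the new journal scrapers.
# Assumes the ScraperWiki runtime, where swimport() loads shared scraper code.
import lxml.html
import urlparse
import scraperwiki

postlistelib = scraperwiki.swimport('postliste-python-lib')

agency = 'Example agency'        # placeholder, differs per scraper
listurl = 'http://example.org/'  # placeholder, differs per scraper
selector = 'div.content a'       # placeholder, differs per scraper

parser = postlistelib.PDFJournalParser(agency=agency)

# 1. Find candidate PDF links on the journal list page.
root = lxml.html.fromstring(scraperwiki.scrape(listurl))
for ahref in root.cssselect(selector):
    url = urlparse.urljoin(listurl, ahref.attrib['href'])
    if -1 != url.find("file://") or parser.is_already_scraped(url):
        continue
    # 2. Fetch each new PDF and queue its pages for later parsing.
    parser.preprocess(url, scraperwiki.scrape(url))

# 3. Parse the queued pages into journal entries.
parser.process_pages()
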
diff --git a/scrapersources/postliste-hoegskolen-i-gjoevik b/scrapersources/postliste-hoegskolen-i-gjoevik
index fd197eb..d4f7931 100644
--- a/scrapersources/postliste-hoegskolen-i-gjoevik
+++ b/scrapersources/postliste-hoegskolen-i-gjoevik
@@ -19,7 +19,7 @@ import re
#
#
# Make sure Scraperwiki believe this is the source from this database
-scraperwiki.scrape("http://www.hig.no/om_hig/offentleg_journal/2012")
+scraperwiki.scrape("http://www.hig.no/om_hig/offentleg_journal")
lazycache=scraperwiki.swimport('lazycache')
postlistelib=scraperwiki.swimport('postliste-python-lib')
@@ -31,19 +31,19 @@ def report_errors(errors):
print "Errors:"
for e in errors:
print e
- exit(1)
+ raise ValueError(str(len(errors)) + " errors detected")
+
def out_of_cpu(arg, spent, hard, soft):
report_errors(arg)
def process_pdf(parser, pdfurl, errors):
- errors = []
postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
try:
pdfcontent = scraperwiki.scrape(pdfurl)
parser.preprocess(pdfurl, pdfcontent)
pdfcontent = None
-# except ValueError, e:
-# errors.append(e)
+ except ValueError, e:
+ errors.append(e)
except IndexError, e:
errors.append(e)
@@ -73,21 +73,20 @@ def process_journal_pdfs(parser, listurl, errors):
# print "Will process " + url
process_pdf(parser, url, errors)
-def test_small_pdfs(parser):
+#def test_small_pdfs(parser):
# Test with some smaller PDFs
- errors = []
- if parser.is_already_scraped("http://www.hig.no/content/download/30119/360872/file/Offentlig+journal+04.11.2010.pdf"):
- print "Skipping already scraped "
- exit(1)
- else:
- print "Will process "
-
+# errors = []
+# if parser.is_already_scraped("http://www.hig.no/content/download/30119/360872/file/Offentlig+journal+04.11.2010.pdf"):
+# print "Skipping already scraped "
+# exit(1)
+# else:
+# print "Will process "
#process_pdf(parser, "http://www.hig.no/content/download/35184/430061/file/Offentlig%20journal%2025.06.2012.pdf", errors)
#process_pdf(parser, "http://www.hig.no/content/download/30116/360863/file/Offentlig%20journal%2001.11.2010.pdf", errors)
- process_pdf(parser, "http://www.hig.no/content/download/30119/360872/file/Offentlig+journal+04.11.2010.pdf", errors)
- process_page_queue(parser, errors)
- report_errors(errors)
- exit(0)
+# process_pdf(parser, "http://www.hig.no/content/download/30119/360872/file/Offentlig+journal+04.11.2010.pdf", errors)
+# process_page_queue(parser, errors)
+# report_errors(errors)
+# exit(0)
errors = []
parser = postlistelib.PDFJournalParser(agency=agency)
@@ -96,9 +95,9 @@ parser = postlistelib.PDFJournalParser(agency=agency)
startYear=2010
endYear=datetime.datetime.now().year
-for year in range(startYear, endYear):
- process_journal_pdfs(parser, "http://www.hig.no/om_hig/offentleg_journal/%d" % year, errors)
+for year in range(startYear, endYear+1): # range(a, b) stops at b-1, so +1 is needed to include endYear
+ process_journal_pdfs(parser, "http://www.hig.no/om_hig/offentleg_journal/%d" % year, errors)
process_page_queue(parser, errors)
report_errors(errors)
diff --git a/scrapersources/postliste-hoegskolen-i-nord-troendelag b/scrapersources/postliste-hoegskolen-i-nord-troendelag
new file mode 100644
index 0000000..3db177b
--- /dev/null
+++ b/scrapersources/postliste-hoegskolen-i-nord-troendelag
@@ -0,0 +1,88 @@
+# -*- coding: UTF-8 -*-
+# Based on the scraper advanced-scraping-pdf
+# See also
+# https://views.scraperwiki.com/run/pdf-to-html-preview-1/?url=http%3A%2F%2Fwww.stortinget.no%2FGlobal%2Fpdf%2Fpostjournal%2Fpj-2012-05-09.pdf
+
+import scraperwiki
+import json
+from BeautifulSoup import BeautifulSoup
+import datetime
+import dateutil.parser
+import lxml.html
+import resource
+import sys
+import urlparse
+import re
+
+# Make sure ScraperWiki believes this is the source for this database
+scraperwiki.scrape("http://www.hint.no/aktuelt/offentlig_postjournal")
+
+lazycache=scraperwiki.swimport('lazycache')
+postlistelib=scraperwiki.swimport('postliste-python-lib')
+
+agency = 'Høgskolen i Nord-Trøndelag'
+
+def report_errors(errors):
+ if 0 < len(errors):
+ print "Errors:"
+ for e in errors:
+ print e
+ raise ValueError("Something went wrong")
+
+def out_of_cpu(arg, spent, hard, soft):
+ report_errors(arg)
+
+def process_pdf(parser, pdfurl, errors):
+ postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
+ try:
+ pdfcontent = scraperwiki.scrape(pdfurl)
+ parser.preprocess(pdfurl, pdfcontent)
+ pdfcontent = None
+ except ValueError, e:
+ errors.append(e)
+ except IndexError, e:
+ errors.append(e)
+
+def process_page_queue(parser, errors):
+ try:
+ parser.process_pages()
+ postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
+ except scraperwiki.CPUTimeExceededError, e:
+ errors.append("Processing pages interrupted")
+
+def process_journal_pdfs(parser, listurl, errors):
+# print "Finding PDFs on " + listurl
+# u = urllib.parse.urlparse(listurl)
+ html = scraperwiki.scrape(listurl)
+ root = lxml.html.fromstring(html)
+ html = None
+ for ahref in root.cssselect("div.mliste a"):
+ href = ahref.attrib['href']
+ url = urlparse.urljoin(listurl, href)
+ if -1 != href.find("file://") or -1 == url.find(".pdf"):
+# print "Skipping non-http URL " + url
+ continue
+ if parser.is_already_scraped(url):
+ True
+# print "Skipping already scraped " + url
+ else:
+# print "Will process " + url
+ process_pdf(parser, url, errors)
+
+def test_small_pdfs(parser):
+ # Test with some smaller PDFs
+ errors = []
+ process_pdf(parser, "http://www.hint.no/content/download/60032/904325/version/1/file/Off.+journal+28.06.2012.pdf", errors)
+ process_page_queue(parser, errors)
+ report_errors(errors)
+ exit(0)
+
+errors = []
+parser = postlistelib.PDFJournalParser(agency=agency)
+
+#test_small_pdfs(parser)
+
+process_journal_pdfs(parser, "http://www.hint.no/aktuelt/offentlig_postjournal", errors)
+process_page_queue(parser, errors)
+report_errors(errors)
+
diff --git a/scrapersources/postliste-hoegskolen-i-telemark b/scrapersources/postliste-hoegskolen-i-telemark
new file mode 100644
index 0000000..a41d014
--- /dev/null
+++ b/scrapersources/postliste-hoegskolen-i-telemark
@@ -0,0 +1,86 @@
+# -*- coding: UTF-8 -*-
+
+import scraperwiki
+import json
+from BeautifulSoup import BeautifulSoup
+import datetime
+import dateutil.parser
+import lxml.html
+import resource
+import sys
+import urlparse
+import re
+
+# Make sure ScraperWiki believes this is the source for this database
+scraperwiki.scrape("http://www.hit.no/nor/HiT/Om-HiT/Offentlig-journal-for-HiT")
+
+lazycache=scraperwiki.swimport('lazycache')
+postlistelib=scraperwiki.swimport('postliste-python-lib')
+
+agency = 'Høgskolen i Telemark'
+
+def report_errors(errors):
+ if 0 < len(errors):
+ print "Errors:"
+ for e in errors:
+ print e
+ raise ValueError("Something went wrong")
+
+def out_of_cpu(arg, spent, hard, soft):
+ report_errors(arg)
+
+def process_pdf(parser, pdfurl, errors):
+ postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
+ try:
+ pdfcontent = scraperwiki.scrape(pdfurl)
+ parser.preprocess(pdfurl, pdfcontent)
+ pdfcontent = None
+ except ValueError, e:
+ errors.append(e)
+ except IndexError, e:
+ errors.append(e)
+
+def process_page_queue(parser, errors):
+ try:
+ parser.process_pages()
+ postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
+ except scraperwiki.CPUTimeExceededError, e:
+ errors.append("Processing pages interrupted")
+
+def process_journal_pdfs(parser, listurl, errors):
+# print "Finding PDFs on " + listurl
+# u = urllib.parse.urlparse(listurl)
+ html = scraperwiki.scrape(listurl)
+ root = lxml.html.fromstring(html)
+ html = None
+ for ahref in root.cssselect("table a"):
+ href = ahref.attrib['href']
+ url = urlparse.urljoin(listurl, href)
+ if -1 != href.find("file://") or -1 == url.find(".pdf"):
+# print "Skipping non-http URL " + url
+ continue
+ if parser.is_already_scraped(url):
+ True
+# print "Skipping already scraped " + url
+ else:
+# print "Will process " + url
+ process_pdf(parser, url, errors)
+
+def test_small_pdfs(parser):
+ # Test with some smaller PDFs
+ errors = []
+ process_pdf(parser, "http://www.hit.no/nor/content/download/128467/1372770/file/Offentlig+journal+uke+1.pdf", errors)
+ process_page_queue(parser, errors)
+ report_errors(errors)
+ exit(0)
+
+errors = []
+parser = postlistelib.PDFJournalParser(agency=agency)
+
+#test_small_pdfs(parser)
+
+process_page_queue(parser, errors)
+process_journal_pdfs(parser, "http://www.hit.no/nor/HiT/Om-HiT/Offentlig-journal-for-HiT", errors)
+process_page_queue(parser, errors)
+report_errors(errors)
+
diff --git a/scrapersources/postliste-hoegskolen-i-volda b/scrapersources/postliste-hoegskolen-i-volda
new file mode 100644
index 0000000..0106cb7
--- /dev/null
+++ b/scrapersources/postliste-hoegskolen-i-volda
@@ -0,0 +1,88 @@
+# -*- coding: UTF-8 -*-
+
+import scraperwiki
+import json
+from BeautifulSoup import BeautifulSoup
+import datetime
+import dateutil.parser
+import lxml.html
+import resource
+import sys
+import urlparse
+import re
+
+# Make sure ScraperWiki believes this is the source for this database
+scraperwiki.scrape("http://www.hivolda.no/nyn/hivolda/om-hogskulen/administrasjon/dokumentsenteret")
+
+lazycache=scraperwiki.swimport('lazycache')
+postlistelib=scraperwiki.swimport('postliste-python-lib')
+
+agency = 'Høgskolen i Volda'
+
+def report_errors(errors):
+ if 0 < len(errors):
+ print "Errors:"
+ for e in errors:
+ print e
+ raise ValueError("Something went wrong")
+
+def out_of_cpu(arg, spent, hard, soft):
+ report_errors(arg)
+
+def process_pdf(parser, pdfurl, errors):
+ postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
+ try:
+ pdfcontent = scraperwiki.scrape(pdfurl)
+ parser.preprocess(pdfurl, pdfcontent)
+ pdfcontent = None
+ except ValueError, e:
+ errors.append(e)
+ except IndexError, e:
+ errors.append(e)
+
+def process_page_queue(parser, errors):
+ try:
+ parser.process_pages()
+ postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
+ except scraperwiki.CPUTimeExceededError, e:
+ errors.append("Processing pages interrupted")
+
+def process_journal_pdfs(parser, listurl, errors):
+# print "Finding PDFs on " + listurl
+# u = urllib.parse.urlparse(listurl)
+ html = scraperwiki.scrape(listurl)
+ root = lxml.html.fromstring(html)
+ html = None
+ for ahref in root.cssselect("div.inside a"):
+ if 'id' not in ahref.attrib or -1 == ahref.attrib['id'].find("archiveimage_"):
+ continue
+ href = ahref.attrib['href']
+ url = urlparse.urljoin(listurl, href)
+ if -1 != href.find("file://"):
+# print "Skipping non-http URL " + url
+ continue
+ if parser.is_already_scraped(url):
+ True
+# print "Skipping already scraped " + url
+ else:
+# print "Will process " + url
+ process_pdf(parser, url, errors)
+
+def test_small_pdfs(parser):
+ # Test with some smaller PDFs
+ errors = []
+ process_pdf(parser, "http://www.hivolda.no/neted/modules/archive/front/file.php?data=47449f5f5477b30f13f282759d5f08b1", errors)
+ process_page_queue(parser, errors)
+ report_errors(errors)
+ exit(0)
+
+errors = []
+parser = postlistelib.PDFJournalParser(agency=agency)
+#parser.debug = True
+
+#test_small_pdfs(parser)
+
+process_journal_pdfs(parser, "http://www.hivolda.no/nyn/hivolda/om-hogskulen/administrasjon/dokumentsenteret", errors)
+process_page_queue(parser, errors)
+report_errors(errors)
+
diff --git a/scrapersources/postliste-hvaler b/scrapersources/postliste-hvaler
index b3e9137..113b145 100644
--- a/scrapersources/postliste-hvaler
+++ b/scrapersources/postliste-hvaler
@@ -6,10 +6,11 @@ from BeautifulSoup import BeautifulSoup
import datetime
import dateutil.parser
import lxml.html
-import resource
-import sys
import urlparse
import re
+
+scraperwiki.scrape("http://www.hvaler.kommune.no/Postlister/")
+
lazycache=scraperwiki.swimport('lazycache')
postlistelib=scraperwiki.swimport('postliste-python-lib')
@@ -20,12 +21,12 @@ def report_errors(errors):
print "Errors:"
for e in errors:
print e
- exit(1)
+ raise ValueError("Something went wrong, " + str(len(errors)) + " errors detected")
+
def out_of_cpu(arg, spent, hard, soft):
report_errors(arg)
def process_pdf(parser, pdfurl, errors):
- errors = []
postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
try:
pdfcontent = scraperwiki.scrape(pdfurl)
@@ -35,6 +36,8 @@ def process_pdf(parser, pdfurl, errors):
# errors.append(e)
except IndexError, e:
errors.append(e)
+ except ValueError, e:
+ errors.append(e)
def process_page_queue(parser, errors):
try:
@@ -49,10 +52,11 @@ def process_journal_pdfs(parser, listurl, errors):
html = scraperwiki.scrape(listurl)
root = lxml.html.fromstring(html)
html = None
- for ahref in root.cssselect("div#ctl00_MainRegion_StageAreaRegion_MainContentRegion_MainBodyRegion_ctl01_FileTreen0Nodes a"):
+ for ahref in root.cssselect("table a"):
href = ahref.attrib['href']
url = urlparse.urljoin(listurl, href)
- if -1 != href.find("file://"):
+# print url
+ if -1 != href.find("file://") or -1 != href.find("javascript:"):
# print "Skipping non-http URL " + url
continue
if parser.is_already_scraped(url):
@@ -72,9 +76,11 @@ def test_small_pdfs(parser):
errors = []
parser = postlistelib.PDFJournalParser(agency=agency)
+#parser.debug = True
#test_small_pdfs(parser)
+process_page_queue(parser, errors)
process_journal_pdfs(parser, "http://www.hvaler.kommune.no/Postlister/", errors)
process_page_queue(parser, errors)
report_errors(errors)
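
The looser "table a" selector above also matches the tree-control links on the Hvaler page, so the scraper now skips javascript: hrefs by substring matching as well. An alternative (not what the scraper does) would be to test the URL scheme directly; a small sketch with illustrative example URLs:

import urlparse

def is_fetchable(url):
    # Keep only links that can actually be downloaded over HTTP(S).
    return urlparse.urlsplit(url).scheme in ('http', 'https')

print is_fetchable("http://www.hvaler.kommune.no/Postlister/doc.pdf")  # True
print is_fetchable("javascript:__doPostBack('ctl00','')")              # False
print is_fetchable("file:///tmp/journal.pdf")                          # False
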
diff --git a/scrapersources/postliste-kafjord-kommune b/scrapersources/postliste-kafjord-kommune
new file mode 100644
index 0000000..212a308
--- /dev/null
+++ b/scrapersources/postliste-kafjord-kommune
@@ -0,0 +1,93 @@
+# -*- coding: UTF-8 -*-
+
+import scraperwiki
+import json
+from BeautifulSoup import BeautifulSoup
+import datetime
+import dateutil.parser
+import lxml.html
+import resource
+import sys
+import urlparse
+import re
+
+# Make sure ScraperWiki believes this is the source for this database
+scraperwiki.scrape("http://www.kafjord.kommune.no/postlister.18590.no.html")
+
+lazycache=scraperwiki.swimport('lazycache')
+postlistelib=scraperwiki.swimport('postliste-python-lib')
+
+agency = 'Kåfjord kommune'
+
+def report_errors(errors):
+ if 0 < len(errors):
+ print "Errors:"
+ for e in errors:
+ print e
+ raise ValueError("Something went wrong")
+
+def out_of_cpu(arg, spent, hard, soft):
+ report_errors(arg)
+
+def process_pdf(parser, pdfurl, errors):
+ postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
+ try:
+ pdfcontent = scraperwiki.scrape(pdfurl)
+ parser.preprocess(pdfurl, pdfcontent)
+ pdfcontent = None
+ except ValueError, e:
+ errors.append(e)
+ except IndexError, e:
+ errors.append(e)
+
+def process_page_queue(parser, errors):
+ try:
+ parser.process_pages()
+ postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
+ except scraperwiki.CPUTimeExceededError, e:
+ errors.append("Processing pages interrupted")
+
+def process_journal_pdfs(parser, listurl, errors):
+# print "Finding PDFs on " + listurl
+# u = urllib.parse.urlparse(listurl)
+ html = scraperwiki.scrape(listurl)
+ root = lxml.html.fromstring(html)
+ html = None
+ for ahref in root.cssselect("table a"):
+ href = ahref.attrib['href']
+ url = urlparse.urljoin(listurl, href)
+ if -1 != href.find("file://") or -1 == url.find("/postliste-"):
+# print "Skipping non-http URL " + url
+ continue
+ if parser.is_already_scraped(url):
+ True
+# print "Skipping already scraped " + url
+ else:
+# print "Will process " + url
+ process_pdf(parser, url, errors)
+ # Follow the "next page" link to the end
+ for ahref in root.cssselect("center a"):
+ href = ahref.attrib['href']
+ url = urlparse.urljoin(listurl, href)
+ print ahref.text, url
+ if -1 != ahref.text.find("Neste side"):
+ process_journal_pdfs(parser, url, errors)
+
+def test_small_pdfs(parser):
+ # Test with some smaller PDFs
+ errors = []
+ process_pdf(parser, "http://www.kafjord.kommune.no/postliste-03-07-12.5071007-18590.html", errors)
+ process_page_queue(parser, errors)
+ report_errors(errors)
+ exit(0)
+
+errors = []
+parser = postlistelib.PDFJournalParser(agency=agency)
+
+#test_small_pdfs(parser)
+
+process_page_queue(parser, errors)
+process_journal_pdfs(parser, "http://www.kafjord.kommune.no/postlister.18590.no.html", errors)
+process_page_queue(parser, errors)
+report_errors(errors)
+
diff --git a/scrapersources/postliste-loppa-kommune b/scrapersources/postliste-loppa-kommune
new file mode 100644
index 0000000..7c7ec72
--- /dev/null
+++ b/scrapersources/postliste-loppa-kommune
@@ -0,0 +1,88 @@
+# -*- coding: UTF-8 -*-
+# Based on the scraper advanced-scraping-pdf
+# See also
+# https://views.scraperwiki.com/run/pdf-to-html-preview-1/?url=http%3A%2F%2Fwww.stortinget.no%2FGlobal%2Fpdf%2Fpostjournal%2Fpj-2012-05-09.pdf
+
+import scraperwiki
+import json
+from BeautifulSoup import BeautifulSoup
+import datetime
+import dateutil.parser
+import lxml.html
+import resource
+import sys
+import urlparse
+import re
+
+# Make sure ScraperWiki believes this is the source for this database
+scraperwiki.scrape("http://www.loppa.kommune.no/postjournal.113285.no.html")
+
+lazycache=scraperwiki.swimport('lazycache')
+postlistelib=scraperwiki.swimport('postliste-python-lib')
+
+agency = 'Loppa kommune'
+
+def report_errors(errors):
+ if 0 < len(errors):
+ print "Errors:"
+ for e in errors:
+ print e
+ raise ValueError("Something went wrong")
+
+def out_of_cpu(arg, spent, hard, soft):
+ report_errors(arg)
+
+def process_pdf(parser, pdfurl, errors):
+ postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
+ try:
+ pdfcontent = scraperwiki.scrape(pdfurl)
+ parser.preprocess(pdfurl, pdfcontent)
+ pdfcontent = None
+ except ValueError, e:
+ errors.append(e)
+ except IndexError, e:
+ errors.append(e)
+
+def process_page_queue(parser, errors):
+ try:
+ parser.process_pages()
+ postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
+ except scraperwiki.CPUTimeExceededError, e:
+ errors.append("Processing pages interrupted")
+
+def process_journal_pdfs(parser, listurl, errors):
+# print "Finding PDFs on " + listurl
+# u = urllib.parse.urlparse(listurl)
+ html = scraperwiki.scrape(listurl)
+ root = lxml.html.fromstring(html)
+ html = None
+ for ahref in root.cssselect("div.body a"):
+ href = ahref.attrib['href']
+ url = urlparse.urljoin(listurl, href)
+ if -1 != href.find("file://") or -1 == url.find(".pdf"):
+# print "Skipping non-http URL " + url
+ continue
+ if parser.is_already_scraped(url):
+ True
+# print "Skipping already scraped " + url
+ else:
+# print "Will process " + url
+ process_pdf(parser, url, errors)
+
+def test_small_pdfs(parser):
+ # Test with some smaller PDFs
+ errors = []
+ process_pdf(parser, "http://www.loppa.kommune.no/getfile.php/1983773.670.bbsaudxaex/25_2012.pdf", errors)
+ process_page_queue(parser, errors)
+ report_errors(errors)
+ exit(0)
+
+errors = []
+parser = postlistelib.PDFJournalParser(agency=agency)
+
+#test_small_pdfs(parser)
+
+process_journal_pdfs(parser, "http://www.loppa.kommune.no/postjournal.113285.no.html", errors)
+process_page_queue(parser, errors)
+report_errors(errors)
+
diff --git a/scrapersources/postliste-met b/scrapersources/postliste-met
new file mode 100644
index 0000000..02c53ca
--- /dev/null
+++ b/scrapersources/postliste-met
@@ -0,0 +1,91 @@
+# -*- coding: UTF-8 -*-
+
+import scraperwiki
+import json
+from BeautifulSoup import BeautifulSoup
+import datetime
+import dateutil.parser
+import lxml.html
+import urllib2
+import urlparse
+import re
+
+# Make sure ScraperWiki believes this is the source for this database
+scraperwiki.scrape("http://met.no/Om_oss/Offentlig_journal/")
+
+lazycache=scraperwiki.swimport('lazycache')
+postlistelib=scraperwiki.swimport('postliste-python-lib')
+
+agency = 'Meteorologisk institutt'
+
+def report_errors(errors):
+ if 0 < len(errors):
+ print "Errors:"
+ for e in errors:
+ print e
+ raise ValueError("Something went wrong")
+
+def out_of_cpu(arg, spent, hard, soft):
+ report_errors(arg)
+
+def process_pdf(parser, pdfurl, errors):
+ postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
+ try:
+ pdfcontent = scraperwiki.scrape(pdfurl)
+ parser.preprocess(pdfurl, pdfcontent)
+ pdfcontent = None
+ except ValueError, e:
+ errors.append(e)
+ except IndexError, e:
+ errors.append(e)
+ except urllib2.HTTPError, e:
+ errors.append(str(e) + " " + pdfurl)
+
+def process_page_queue(parser, errors):
+ try:
+ parser.process_pages()
+ postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
+ except scraperwiki.CPUTimeExceededError, e:
+ errors.append("Processing pages interrupted")
+
+def process_journal_pdfs(parser, listurl, errors):
+# print "Finding PDFs on " + listurl
+# u = urllib.parse.urlparse(listurl)
+ html = scraperwiki.scrape(listurl)
+ root = lxml.html.fromstring(html)
+ html = None
+ for ahref in root.cssselect("div.article-content a"):
+ href = ahref.attrib['href']
+ url = urlparse.urljoin(listurl, href)
+ if -1 != href.find("file://") or -1 == url.find("=File.getFile;"):
+# print "Skipping non-http URL " + url
+ continue
+ if parser.is_already_scraped(url):
+ True
+# print "Skipping already scraped " + url
+ else:
+# print "Will process " + url
+ process_pdf(parser, url, errors)
+
+def test_small_pdfs(parser):
+ # Test with some smaller PDFs
+ errors = []
+ process_pdf(parser, "http://met.no/Om_oss/Offentlig_journal/2012/?module=Files;action=File.getFile;ID=4570", errors)
+ process_page_queue(parser, errors)
+ report_errors(errors)
+ exit(0)
+
+errors = []
+parser = postlistelib.PDFJournalParser(agency=agency)
+
+#test_small_pdfs(parser)
+
+process_page_queue(parser, errors)
+process_journal_pdfs(parser, "http://met.no/Om_oss/Offentlig_journal/2012/", errors)
+process_journal_pdfs(parser, "http://met.no/Om_oss/Offentlig_journal/2011/", errors)
+process_journal_pdfs(parser, "http://met.no/Om_oss/Offentlig_journal/2010/", errors)
+process_journal_pdfs(parser, "http://met.no/Om_oss/Offentlig_journal/2009/", errors)
+process_journal_pdfs(parser, "http://met.no/Om_oss/Offentlig_journal/2008/", errors)
+process_page_queue(parser, errors)
+report_errors(errors)
+
diff --git a/scrapersources/postliste-mrfylke b/scrapersources/postliste-mrfylke
new file mode 100644
index 0000000..5c26ba3
--- /dev/null
+++ b/scrapersources/postliste-mrfylke
@@ -0,0 +1,82 @@
+# -*- coding: UTF-8 -*-
+
+import scraperwiki
+import json
+from BeautifulSoup import BeautifulSoup
+import datetime
+import dateutil.parser
+import lxml.html
+import resource
+import sys
+import urlparse
+import re
+lazycache=scraperwiki.swimport('lazycache')
+postlistelib=scraperwiki.swimport('postliste-python-lib')
+
+agency = 'Møre og Romsdal fylkeskommune'
+
+def report_errors(errors):
+ if 0 < len(errors):
+ print "Errors:"
+ for e in errors:
+ print e
+ exit(1)
+def out_of_cpu(arg, spent, hard, soft):
+ report_errors(arg)
+
+def process_pdf(parser, pdfurl, errors):
+ errors = []
+ postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
+ try:
+ pdfcontent = scraperwiki.scrape(pdfurl)
+ parser.preprocess(pdfurl, pdfcontent)
+ pdfcontent = None
+# except ValueError, e:
+# errors.append(e)
+ except IndexError, e:
+ errors.append(e)
+
+def process_page_queue(parser, errors):
+ try:
+ parser.process_pages()
+ postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
+ except scraperwiki.CPUTimeExceededError, e:
+ errors.append("Processing pages interrupted")
+
+def process_journal_pdfs(parser, listurl, errors):
+# print "Finding PDFs on " + listurl
+# u = urllib.parse.urlparse(listurl)
+ html = scraperwiki.scrape(listurl)
+ root = lxml.html.fromstring(html)
+ html = None
+ for ahref in root.cssselect("div.attribute-long a"):
+ href = ahref.attrib['href']
+ url = urlparse.urljoin(listurl, href)
+ if -1 != href.find("file://") or -1 != href.find("mailto:"):
+# print "Skipping non-http URL " + url
+ continue
+ if parser.is_already_scraped(url):
+ True
+# print "Skipping already scraped " + url
+ else:
+# print "Will process " + url
+ process_pdf(parser, url, errors)
+
+def test_small_pdfs(parser):
+ # Test with some smaller PDFs
+ errors = []
+ process_pdf(parser, "https://mrfylke.no/Media/Files/Filer-administrasjonsavdelinga/Dokumentsenteret/Oktober-2011/Offentleg-journal-03.10.11", errors)
+ process_page_queue(parser, errors)
+ report_errors(errors)
+ exit(0)
+
+errors = []
+parser = postlistelib.PDFJournalParser(agency=agency)
+
+#test_small_pdfs(parser)
+
+process_page_queue(parser, errors)
+process_journal_pdfs(parser, "https://mrfylke.no/Organisasjon/Organisasjon/Administrasjonsavdelinga/Dokumentsenter/Offentleg-journal", errors)
+process_page_queue(parser, errors)
+report_errors(errors)
+
diff --git a/scrapersources/postliste-nrk b/scrapersources/postliste-nrk
index 5c7929d..3379f31 100644
--- a/scrapersources/postliste-nrk
+++ b/scrapersources/postliste-nrk
@@ -1,7 +1,4 @@
# -*- coding: UTF-8 -*-
-# Based on the scraper advanced-scraping-pdf
-# See also
-# https://views.scraperwiki.com/run/pdf-to-html-preview-1/?url=http%3A%2F%2Fwww.stortinget.no%2FGlobal%2Fpdf%2Fpostjournal%2Fpj-2012-05-09.pdf
import scraperwiki
import json
@@ -9,39 +6,39 @@ from BeautifulSoup import BeautifulSoup
import datetime
import dateutil.parser
import lxml.html
-import resource
-import sys
import urlparse
-import gc
import re
-frontpage = "http://www.nrk.no/contentfile/transformer/1.8052258"
-scraperwiki.scrape(frontpage)
+scraperwiki.scrape("http://www.nrk.no/innsyn/")
lazycache=scraperwiki.swimport('lazycache')
postlistelib=scraperwiki.swimport('postliste-python-lib')
-agency = 'Universitetet i Oslo'
+agency = 'Norsk Rikskringkasting AS'
def report_errors(errors):
if 0 < len(errors):
print "Errors:"
for e in errors:
print e
- exit(1)
+ raise ValueError(str(len(errors)) + " errors detected")
+
def out_of_cpu(arg, spent, hard, soft):
report_errors(arg)
def process_pdf(parser, pdfurl, errors):
- errors = []
+ if parser.is_already_scraped(pdfurl):
+ return
postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
try:
pdfcontent = scraperwiki.scrape(pdfurl)
parser.preprocess(pdfurl, pdfcontent)
pdfcontent = None
-# except ValueError, e:
-# errors.append(e)
+ except ValueError, e:
+ print e
+ errors.append(e)
except IndexError, e:
+ print e
errors.append(e)
def process_page_queue(parser, errors):
@@ -52,17 +49,14 @@ def process_page_queue(parser, errors):
errors.append("Processing pages interrupted")
def process_journal_pdfs(parser, listurl, errors):
-# print "Finding PDFs on " + listurl
+ print "Finding PDFs on " + listurl
# u = urllib.parse.urlparse(listurl)
- html = scraperwiki.scrape(listurl)
- root = lxml.html.fromstring(html)
- html = None
- for ahref in root.cssselect("table a"):
- href = ahref.attrib['href']
- url = urlparse.urljoin(listurl, href)
- if -1 != href.find("file://"):
-# print "Skipping non-http URL " + url
- continue
+ xml = scraperwiki.scrape(listurl)
+ root = lxml.html.fromstring(xml)
+ xml = None
+ for link in root.cssselect("hendelse link"):
+ url = lxml.html.tostring(link).replace("<link>", "").strip()
+ #print url
if parser.is_already_scraped(url):
True
# print "Skipping already scraped " + url
@@ -72,10 +66,50 @@ def process_journal_pdfs(parser, listurl, errors):
def test_small_pdfs(parser):
- parser.debug = True
+ #parser.debug = True
errors = []
- process_pdf(parser, "http://nrk.no/contentfile/file/1.8116520!offentligjournal02052012.pdf", errors) # text
+
+ # 2011:
+ if True:
+ process_pdf(parser, "http://home.nuug.no/~pere/nrk-postjournal/Offentlig%20journal%20NRK%200101_15012011.pdf", errors)
+ process_pdf(parser, "http://home.nuug.no/~pere/nrk-postjournal/Offentlig%20journal%20NRK%200102_10022011.pdf", errors)
+ process_pdf(parser, "http://home.nuug.no/~pere/nrk-postjournal/Offentlig%20journal%20NRK%200103_10032011.pdf", errors)
+ process_pdf(parser, "http://home.nuug.no/~pere/nrk-postjournal/Offentlig%20journal%20NRK%200104_07042011.pdf", errors)
+ process_pdf(parser, "http://home.nuug.no/~pere/nrk-postjournal/Offentlig%20journal%20NRK%200105_05052011.pdf", errors)
+ process_pdf(parser, "http://home.nuug.no/~pere/nrk-postjournal/Offentlig%20journal%20NRK%200106_18062011.pdf", errors)
+ process_pdf(parser, "http://home.nuug.no/~pere/nrk-postjournal/Offentlig%20journal%20NRK%200107_15072011.pdf", errors)
+ process_pdf(parser, "http://home.nuug.no/~pere/nrk-postjournal/Offentlig%20journal%20NRK%200108_15082011.pdf", errors)
+ process_pdf(parser, "http://home.nuug.no/~pere/nrk-postjournal/Offentlig%20journal%20NRK%200109_12092011.pdf", errors)
+ process_pdf(parser, "http://home.nuug.no/~pere/nrk-postjournal/Offentlig%20journal%20NRK%200110_12102011.pdf", errors)
+ process_pdf(parser, "http://home.nuug.no/~pere/nrk-postjournal/Offentlig%20journal%20NRK%200111_10112011.pdf", errors)
+ process_pdf(parser, "http://home.nuug.no/~pere/nrk-postjournal/Offentlig%20journal%20NRK%200112_10122011.pdf", errors)
+ process_pdf(parser, "http://home.nuug.no/~pere/nrk-postjournal/Offentlig%20journal%20NRK%200605_10052011.pdf", errors)
+ process_pdf(parser, "http://home.nuug.no/~pere/nrk-postjournal/Offentlig%20journal%20NRK%200804_15042011.pdf", errors)
+ process_pdf(parser, "http://home.nuug.no/~pere/nrk-postjournal/Offentlig%20journal%20NRK%201102_19022011.pdf", errors)
+ process_pdf(parser, "http://home.nuug.no/~pere/nrk-postjournal/Offentlig%20journal%20NRK%201103_17032011.pdf", errors)
+ process_pdf(parser, "http://home.nuug.no/~pere/nrk-postjournal/Offentlig%20journal%20NRK%201105_20052011.pdf", errors)
+ process_pdf(parser, "http://home.nuug.no/~pere/nrk-postjournal/Offentlig%20journal%20NRK%201111_20112011.pdf", errors)
+ process_pdf(parser, "http://home.nuug.no/~pere/nrk-postjournal/Offentlig%20journal%20NRK%201112_20122011.pdf", errors)
+ process_pdf(parser, "http://home.nuug.no/~pere/nrk-postjournal/Offentlig%20journal%20NRK%201309_25092011.pdf", errors)
+ process_pdf(parser, "http://home.nuug.no/~pere/nrk-postjournal/Offentlig%20journal%20NRK%201310_20102011.pdf", errors)
+ process_pdf(parser, "http://home.nuug.no/~pere/nrk-postjournal/Offentlig%20journal%20NRK%201601_31012011.pdf", errors)
+ process_pdf(parser, "http://home.nuug.no/~pere/nrk-postjournal/Offentlig%20journal%20NRK%201604_30042011.pdf", errors)
+ process_pdf(parser, "http://home.nuug.no/~pere/nrk-postjournal/Offentlig%20journal%20NRK%201607_31072011.pdf", errors)
+ process_pdf(parser, "http://home.nuug.no/~pere/nrk-postjournal/Offentlig%20journal%20NRK%201608_25082011.pdf", errors)
+ process_pdf(parser, "http://home.nuug.no/~pere/nrk-postjournal/Offentlig%20journal%20NRK%201803_26032011.pdf", errors)
+ process_pdf(parser, "http://home.nuug.no/~pere/nrk-postjournal/Offentlig%20journal%20NRK%201906_26062011.pdf", errors)
+ process_pdf(parser, "http://home.nuug.no/~pere/nrk-postjournal/Offentlig%20journal%20NRK%202105_31052011.pdf", errors)
+ process_pdf(parser, "http://home.nuug.no/~pere/nrk-postjournal/Offentlig%20journal%20NRK%202110_31102011.pdf", errors)
+ process_pdf(parser, "http://home.nuug.no/~pere/nrk-postjournal/Offentlig%20journal%20NRK%202111_30112011.pdf", errors)
+ process_pdf(parser, "http://home.nuug.no/~pere/nrk-postjournal/Offentlig%20journal%20NRK%202112_31122011.pdf", errors)
+ process_pdf(parser, "http://home.nuug.no/~pere/nrk-postjournal/Offentlig%20journal%20NRK%202502_28022011.pdf", errors)
+ process_pdf(parser, "http://home.nuug.no/~pere/nrk-postjournal/Offentlig%20journal%20NRK%202608_31082011.pdf", errors)
+ process_pdf(parser, "http://home.nuug.no/~pere/nrk-postjournal/Offentlig%20journal%20NRK%202609_30092011.pdf", errors)
+ process_pdf(parser, "http://home.nuug.no/~pere/nrk-postjournal/Offentlig%20journal%20NRK%202703_31032011.pdf", errors)
+ process_pdf(parser, "http://home.nuug.no/~pere/nrk-postjournal/Offentlig%20journal%20NRK%202706_30062011.pdf", errors)
+
+ #process_pdf(parser, "http://nrk.no/contentfile/file/1.8116520!offentligjournal02052012.pdf", errors) # text
#process_pdf(parser, "http://nrk.no/contentfile/file/1.8061384!offentlig%2002042012.pdf", errors) # Image
#process_pdf(parser, "http://nrk.no/contentfile/file/1.8130287!offentligjournal09052012.pdf", errors) # Image
process_page_queue(parser, errors)
@@ -85,10 +119,9 @@ def test_small_pdfs(parser):
errors = []
parser = postlistelib.PDFJournalParser(agency=agency, hiddentext=True)
-test_small_pdfs(parser)
+#test_small_pdfs(parser)
# Based on http://www.nrk.no/innsyn/
-process_journal_pdfs(parser, frontpage, errors)
+process_journal_pdfs(parser, "http://www.nrk.no/contentfile/transformer/1.8052258", errors)
process_page_queue(parser, errors)
-report_errors(errors)
-
+report_errors(errors)
\ No newline at end of file
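
The rewritten process_journal_pdfs() for NRK reads the XML feed behind http://www.nrk.no/innsyn/ instead of an HTML page. Because lxml's HTML parser treats <link> as an empty element, the URL text does not end up in link.text, which appears to be why the code serialises each element with lxml.html.tostring() and strips the '<link>' tag. Assuming the feed is well-formed XML, the same URLs can be read more directly with lxml.etree; a small sketch, with an illustrative inline snippet of the assumed feed layout:

# -*- coding: UTF-8 -*-
# Sketch: reading the <hendelse><link> URLs from the NRK feed with an XML
# parser instead of the HTML parser.  The inline feed snippet is illustrative;
# the real feed is assumed to match what cssselect("hendelse link") expects.
import lxml.etree

feedxml = """<innsyn>
  <hendelse>
    <tittel>Offentlig journal</tittel>
    <link>http://nrk.no/contentfile/file/1.8116520!offentligjournal02052012.pdf</link>
  </hendelse>
</innsyn>"""

root = lxml.etree.fromstring(feedxml)
for link in root.findall(".//hendelse/link"):
    print link.text.strip()
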
diff --git a/scrapersources/postliste-ntnu b/scrapersources/postliste-ntnu
index 1a885c4..d6c6695 100644
--- a/scrapersources/postliste-ntnu
+++ b/scrapersources/postliste-ntnu
@@ -22,7 +22,7 @@ def report_errors(errors):
print "Errors:"
for e in errors:
print e
- raise ValueError("Something went wrong")
+ raise ValueError(str(len(errors)) + "errors detected")
def out_of_cpu(arg, spent, hard, soft):
report_errors(arg)
@@ -39,7 +39,7 @@ def process_pdf(parser, pdfurl, errors):
except IndexError, e:
errors.append(e)
except urllib2.HTTPError, e:
- errors.append(e)
+ errors.append(str(e) + " " + pdfurl)
def process_page_queue(parser, errors):
try:
diff --git a/scrapersources/postliste-oep b/scrapersources/postliste-oep
index c7fdc82..735d0a7 100644
--- a/scrapersources/postliste-oep
+++ b/scrapersources/postliste-oep
@@ -9,7 +9,7 @@ import httplib
import urllib2
# Try several times as the database get bigger
-writetries = 5
+writetries = 6
# http://www.oep.no/search/resultSingle.html?journalPostId=1000000
# http://www.oep.no/search/resultSingle.html?journalPostId=3889259
@@ -102,7 +102,7 @@ def url_from_id(id):
return "http://www.oep.no/search/resultSingle.html?journalPostId=" + str(id)
def save(data):
- for run in range(1,writetries):
+ for run in range(0,writetries):
try:
scraperwiki.sqlite.save(unique_keys=['journalPostId'], data=data)
return
@@ -112,7 +112,7 @@ def save(data):
raise scraperwiki.sqlite.SqliteError("Unable to write to database, tried " + str(writetries) + " times")
def save_var(var, data):
- for run in range(1,writetries):
+ for run in range(0,writetries):
try:
scraperwiki.sqlite.save_var(var, data)
return
@@ -184,6 +184,8 @@ def fetch_range(first, last, step):
fetched = 0
min_id = first
for id in range(first, last, step):
+ if id < 0:
+ break
try:
tries = 3
while 0 < tries:
@@ -309,6 +311,7 @@ def remove_original():
#update_doctypes()
print "Starting to fetch journal entries " + str(datetime.datetime.now())
+scraperwiki.scrape("http://www.oep.no/")
count = 10000
skiplimit = 500
# Random value fairly close to the most recent ID when this project started 2012-05-03
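
The switch from range(1, writetries) to range(0, writetries) in save() and save_var() fixes an off-by-one: range(1, n) only yields n-1 values, so the old code retried one time less than writetries suggests. A small illustration of the pattern (a generic sketch; the real loop calls scraperwiki.sqlite.save() and catches scraperwiki.sqlite.SqliteError):

writetries = 6

print len(range(1, writetries))  # 5 attempts, one fewer than intended
print len(range(0, writetries))  # 6 attempts

def save_with_retries(save, data):
    # Generic sketch of the retry loop in save()/save_var().
    for run in range(0, writetries):
        try:
            save(data)
            return
        except Exception:
            pass
    raise RuntimeError("Unable to write to database, tried %d times" % writetries)
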
diff --git a/scrapersources/postliste-oslo-bydel-ullern b/scrapersources/postliste-oslo-bydel-ullern
index 54a5031..614b12f 100644
--- a/scrapersources/postliste-oslo-bydel-ullern
+++ b/scrapersources/postliste-oslo-bydel-ullern
@@ -11,6 +11,9 @@ import dateutil.parser
import lxml.html
import urlparse
import re
+
+scraperwiki.scrape("http://www.bydel-ullern.oslo.kommune.no/postjournal/")
+
#lazycache=scraperwiki.swimport('lazycache')
postlistelib=scraperwiki.swimport('postliste-python-lib')
@@ -21,19 +24,19 @@ def report_errors(errors):
print "Errors:"
for e in errors:
print e
- exit(1)
+ raise ValueError("Something went wrong")
+
def out_of_cpu(arg, spent, hard, soft):
report_errors(arg)
def process_pdf(parser, pdfurl, errors):
- errors = []
postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
try:
pdfcontent = scraperwiki.scrape(pdfurl)
parser.preprocess(pdfurl, pdfcontent)
pdfcontent = None
-# except ValueError, e:
-# errors.append(e)
+ except ValueError, e:
+ errors.append(e)
except IndexError, e:
errors.append(e)
diff --git a/scrapersources/postliste-oslo-havn b/scrapersources/postliste-oslo-havn
index d453ef7..1139b81 100644
--- a/scrapersources/postliste-oslo-havn
+++ b/scrapersources/postliste-oslo-havn
@@ -12,6 +12,9 @@ import lxml.html
import sys
import urlparse
import re
+
+scraperwiki.scrape("http://www.havn.oslo.kommune.no/postjournal/")
+
lazycache=scraperwiki.swimport('lazycache')
postlistelib=scraperwiki.swimport('postliste-python-lib')
@@ -22,17 +25,17 @@ def report_errors(errors):
print "Errors:"
for e in errors:
print e
- exit(1)
+ raise ValueError(str(len(errors)) + " errors detected")
+
def out_of_cpu(arg, spent, hard, soft):
report_errors(arg)
def process_pdf(parser, pdfurl, errors):
- errors = []
postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
try:
parser.fetch_and_preprocess(pdfurl)
-# except ValueError, e:
-# errors.append(e)
+ except ValueError, e:
+ errors.append(e)
except IndexError, e:
errors.append(e)
diff --git a/scrapersources/postliste-python-lib b/scrapersources/postliste-python-lib
index 042d1fd..7176ae9 100644
--- a/scrapersources/postliste-python-lib
+++ b/scrapersources/postliste-python-lib
@@ -1,4 +1,4 @@
-# -*- coding: utf-8 -*-
+# -*- coding: UTF-8 -*-
#
# Python library for parsing public post journals (postlister) in Norway.
#
@@ -100,6 +100,11 @@ class JournalParser:
if -1 != entry['caseid'].find('-'):
raise ValueError("Field caseid should not include dash: " + entry['caseid'])
+
+ # Seen in http://home.nuug.no/~pere/nrk-postjournal/Offentlig%20journal%20NRK%200101_15012011.pdf
+ if 'sender' in entry and -1 != entry['sender'].find("Side: "):
+ raise ValueError("Field sender got page number, not real content")
+
#
# Parser of PDFs looking like
# http://www.storfjord.kommune.no/postliste-18-mai-2012.5056067-105358.html (type 1)
@@ -122,6 +127,8 @@ class PDFJournalParser(JournalParser):
# FIXME Figure out why this do not work
#" and not (sender = 'parse error' or recipient != 'parse error') " +
"limit 1",
+
+ "scrapedurl from " + self.brokenpagetable + " where scrapedurl = '" + url + "' limit 1",
"scrapedurl from " + self.pagetable + " where scrapedurl = '" + url + "' limit 1"]:
try:
result = scraperwiki.sqlite.select(sql)
@@ -131,7 +138,8 @@ class PDFJournalParser(JournalParser):
except Exception as e:
#if ('no such table: %s' % self.pagetable) not in str(e) and 'no such table: swdata' not in str(e):
# raise
- print "Exception: %s" % e
+ #print "Ignoring exception: %s" % e
+ True
return False
# Check if we recognize the page content, and throw if not
@@ -139,10 +147,7 @@ class PDFJournalParser(JournalParser):
s = BeautifulSoup(pagecontent)
for t in s.findAll('text'):
if t.text != " ":
- if 'Innhold:' == t.text: # type 1 or 2 (ePhorge)
- s = None
- return True
- if 'Arkivdel:' == t.text]: # type 3 (doculive)
+ if 'Innhold:' == t.text:
s = None
return True
s = None
@@ -195,7 +200,6 @@ class PDFJournalParser(JournalParser):
for i in range(0, len(entrytext)):
print str(i) + ": '" + entrytext[i] + "'"
- # ePhorte PDF
def parse_entry_type1(self, entrytext, pdfurl):
scrapestamputc = datetime.datetime.now()
entry = {
@@ -349,7 +353,6 @@ class PDFJournalParser(JournalParser):
self.parse_case_journal_ref(entry, [u'2010/2593-2', u'103004/201', u'1'], "")
self.parse_case_journal_ref(entry, [u'2011/13415-', u'22', u'100077/201', u'1'], "")
- # ePhorte PDF
def parse_entry_type2(self, entrytext, pdfurl):
scrapestamputc = datetime.datetime.now()
entry = {
@@ -455,8 +458,7 @@ class PDFJournalParser(JournalParser):
entrycount = 0
i = 0
while i < len(text):
- if 'Innhold:' == text[i] \ # Type 1 and 2 (ePhorge)
- or 'Arkivdel:' == text[i]: # type 3 (doculive)
+ if 'Innhold:' == text[i]:
entrycount = entrycount + 1
i = i + 1
@@ -483,10 +485,12 @@ class PDFJournalParser(JournalParser):
if self.debug:
print "Entry " + str(entrycount) + " from " + str(i) + " to " + str(endi) + " ie " + str(endi - i) + " lines"
try:
+ if pdfparser is None:
+ raise ValueError("Unrecognized page format in " + pdfurl)
entry = pdfparser(text[i:endi], pdfurl)
if 'caseid' not in entry or entry['caseid'] is None or \
not self.is_valid_doctype(entry['doctype']):
- raise ValueError("Unable to parse " + pdfurl + " as format " + format + "[" + str(entry) + "]")
+ raise ValueError("Unable to parse " + pdfurl + " as format " + format + " [" + str(entry) + "]")
# print entry
datastore.append(entry)
i = endi - 2
@@ -507,6 +511,7 @@ class PDFJournalParser(JournalParser):
text = None
def process_pages(self):
+ brokenpages = 0
try:
sqlselect = "* from " + self.pagetable + " limit 1"
pageref = scraperwiki.sqlite.select(sqlselect)
@@ -525,15 +530,61 @@ class PDFJournalParser(JournalParser):
'scrapedurl' : scrapedurl,
'pagenum' : pagenum,
'pagecontent' : pagecontent,
+ 'failstamp' : datetime.datetime.now(),
}
- print "Broken page %d from %s" % (pagenum, scrapedurl)
+ print "Unsupported page %d from %s" % (pagenum, scrapedurl)
+ brokenpages = brokenpages + 1
scraperwiki.sqlite.save(unique_keys=['scrapedurl', 'pagenum'], data=brokenpage, table_name=self.brokenpagetable)
scraperwiki.sqlite.execute(sqldelete)
scraperwiki.sqlite.commit()
pageref = scraperwiki.sqlite.select(sqlselect)
+
+ # Last, try some of the broken pages again, in case we got support for handling them in the mean time
+ try:
+ # First, check if the table exist
+ scraperwiki.sqlite.execute("select * from " + self.brokenpagetable)
+
+ newtrystamp = datetime.datetime.now()
+ sqlselect = "* from " + self.brokenpagetable + " where failstamp is NULL or failstamp < '" + str(newtrystamp) + "'" + " limit 1"
+ try:
+ pageref = scraperwiki.sqlite.select(sqlselect)
+ except scraperwiki.sqlite.SqliteError, e:
+ scraperwiki.sqlite.execute("ALTER TABLE " + self.brokenpagetable + " ADD COLUMN failstamp")
+ scraperwiki.sqlite.commit()
+ pageref = scraperwiki.sqlite.select(sqlselect)
+
+ pagelimit = 10
+ while pageref and 0 < pagelimit:
+ pagelimit = pagelimit - 1
+ scrapedurl = pageref[0]['scrapedurl']
+ pagenum = pageref[0]['pagenum']
+ pagecontent = pageref[0]['pagecontent']
+# print "Found " + scrapedurl + " page " + str(pagenum) + " length " + str(len(pagecontent))
+ try:
+ sqldelete = "delete from " + self.brokenpagetable + " where scrapedurl = '" + scrapedurl + "' and pagenum = " + str(pagenum)
+ self.parse_page(scrapedurl, pagenum, pagecontent)
+# print "Trying to: " + sqldelete
+ scraperwiki.sqlite.execute(sqldelete)
+ except ValueError, e:
+ brokenpage = {
+ 'scrapedurl' : scrapedurl,
+ 'pagenum' : pagenum,
+ 'pagecontent' : pagecontent,
+ 'failstamp' : newtrystamp,
+ }
+
+ print "Still unsupported page %d from %s" % (pagenum, scrapedurl)
+ brokenpages = brokenpages + 1
+ scraperwiki.sqlite.save(unique_keys=['scrapedurl', 'pagenum'], data=brokenpage, table_name=self.brokenpagetable)
+ scraperwiki.sqlite.commit()
+ pageref = scraperwiki.sqlite.select(sqlselect)
+ except:
+ True # Ignore missing brokenpages table
except scraperwiki.sqlite.SqliteError, e:
print str(e)
raise
+ if 0 < brokenpages:
+ raise ValueError("Found %d pages with unsupported format" % brokenpages)
def fieldlist():
import urllib2
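
The main change to postliste-python-lib is in process_pages(): pages the parser cannot handle are moved to the brokenpages table with a failstamp, re-tried in small batches on later runs in case parser support has arrived in the meantime, and counted so the run can fail loudly at the end. A stripped-down, in-memory sketch of that flow, with plain lists and placeholder URLs standing in for the scraperwiki.sqlite tables:

# -*- coding: UTF-8 -*-
# In-memory sketch of the page-queue / broken-pages flow added to
# process_pages().  Plain lists stand in for the 'unparsedpages' and
# 'brokenpages' SQLite tables used by the real code.
import datetime

def parse_page(url, pagenum, content):
    # Placeholder for PDFJournalParser.parse_page(); like the real parser it
    # raises ValueError when it does not recognize the page layout.
    raise ValueError("Unrecognized page format for " + url)

# Pages queued by preprocess() in this run, plus one page that failed earlier.
unparsedpages = [{'scrapedurl': 'http://example.org/a.pdf', 'pagenum': 1,
                  'pagecontent': '<page>...</page>'}]
brokenpages = [{'scrapedurl': 'http://example.org/old.pdf', 'pagenum': 3,
                'pagecontent': '<page>...</page>',
                'failstamp': datetime.datetime(2012, 6, 1)}]
brokencount = 0

# First, parse the queued pages, moving failures to the broken list.
for page in list(unparsedpages):
    unparsedpages.remove(page)
    try:
        parse_page(page['scrapedurl'], page['pagenum'], page['pagecontent'])
    except ValueError:
        page['failstamp'] = datetime.datetime.now()
        brokenpages.append(page)
        brokencount = brokencount + 1

# Last, retry a limited batch of already broken pages, in case the parser has
# gained support for their format since they were stored.
newtrystamp = datetime.datetime.now()
for page in [p for p in brokenpages if p['failstamp'] < newtrystamp][:10]:
    try:
        parse_page(page['scrapedurl'], page['pagenum'], page['pagecontent'])
        brokenpages.remove(page)
    except ValueError:
        page['failstamp'] = newtrystamp
        brokencount = brokencount + 1

if 0 < brokencount:
    # The real code raises ValueError here so the scraper run is marked failed.
    print "Found %d pages with unsupported format" % brokencount
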
diff --git a/scrapersources/postliste-python-lib-doculive b/scrapersources/postliste-python-lib-doculive
new file mode 100644
index 0000000..520c915
--- /dev/null
+++ b/scrapersources/postliste-python-lib-doculive
@@ -0,0 +1,649 @@
+# -*- coding: UTF-8 -*-
+#
+# Python library for parsing public post journals (postlister) in Norway.
+#
+
+# Based on the scraper advanced-scraping-pdf
+#
+# See also
+# https://views.scraperwiki.com/run/pdf-to-html-preview-1/
+
+# Possible sources using format 1 pdf:
+# www.bydel-ullern.oslo.kommune.no
+# www.gravferdsetaten.oslo.kommune.no
+# www.halden.kommune.no (done)
+# www.havn.oslo.kommune.no (done)
+# www.hvaler.kommune.no (done)
+# www.kafjord.kommune.no
+# www.lier.kommune.no
+# www.lindesnes.kommune.no
+# www.naroy.kommune.no
+# www.saltdal.kommune.no
+# www.sogne.kommune.no
+# www.vikna.kommune.no
+#
+# Google search to find more: "Offentlig journal" Seleksjon Sakstittel Dokumenttype Status filetype:pdf
+
+
+import scraperwiki
+import string
+import re
+from BeautifulSoup import BeautifulSoup
+import datetime
+import dateutil.parser
+
+def cpu_spent():
+ import resource
+ usage = resource.getrusage(resource.RUSAGE_SELF)
+ return getattr(usage, 'ru_utime') + getattr(usage, 'ru_stime')
+
+def exit_if_no_cpu_left(retval, callback=None, arg = None):
+ import resource
+ soft, hard = resource.getrlimit(resource.RLIMIT_CPU)
+ spent = cpu_spent()
+ if soft < spent:
+ if callback is not None:
+ callback(arg, spent, hard, soft)
+ print "Running out of CPU, exiting."
+ exit(retval)
+
+def fetch_url_harder(url, scraper = None):
+ import urllib2
+ html = None
+ for n in [1, 2, 3]:
+ try:
+ if None == scraper:
+ scraper = scraperwiki.scrape
+ html = scraper(url)
+ break
+ except urllib2.URLError, e:
+ print "URLError fetching " + url + ", trying again"
+ return html
+
+class JournalParser:
+ agency = None
+ debug = False
+
+ validdoctypes = ['I', 'U', 'X', 'N']
+ senderdoctypes = ['I', 'X', 'N']
+ recipientdoctypes = ['U']
+ mustfields = {
+ 'agency' : 1,
+ 'docdesc' : 1,
+ 'doctype' : 1,
+ 'caseyear' : 1,
+ 'caseseqnr' : 1,
+ 'casedocseq' : 1,
+ }
+
+ def __init__(self, agency):
+ self.agency = agency
+
+ def is_valid_doctype(self, doctype):
+ return doctype in self.validdoctypes
+
+ def is_sender_doctype(self, doctype):
+ return doctype in self.senderdoctypes
+
+ def is_recipient_doctype(self, doctype):
+ return doctype in self.recipientdoctypes
+
+ def verify_entry(self, entry):
+
+ for field in self.mustfields:
+ if not field in entry:
+ raise ValueError("Missing required field " + field)
+
+ if not self.is_valid_doctype(entry['doctype']):
+ raise ValueError("Invalid doctype " + entry['doctype'])
+
+ if -1 != entry['caseid'].find('-'):
+ raise ValueError("Field caseid should not include dash: " + entry['caseid'])
+
+#
+# Parser of PDFs looking like
+# http://www.storfjord.kommune.no/postliste-18-mai-2012.5056067-105358.html (type 1)
+# http://www.hadsel.kommune.no/component/docman/doc_download/946-offentlig-postjournal-28032012 (type 2)
+# http://www.stortinget.no/Global/pdf/postjournal/pj-2011-06-23.pdf (type 2 variant)
+# Note sender/receiver is not yet parsed for type 2 PDFs
+class PDFJournalParser(JournalParser):
+ pagetable = "unparsedpages"
+ brokenpagetable = "brokenpages"
+ hiddentext = False
+ breakonfailure = True
+
+ def __init__(self, agency, hiddentext=False):
+ self.hiddentext = hiddentext
+ JournalParser.__init__(self, agency=agency)
+
+ def is_already_scraped(self, url):
+ # Ignore entries where the sender or recipient is the result of a broken parser (before 2012-05-25)
+ for sql in ["scrapedurl, sender, recipient from swdata where scrapedurl = '" + url + "' " +
+ # FIXME Figure out why this does not work
+ #" and not (sender = 'parse error' or recipient != 'parse error') " +
+ "limit 1",
+ "scrapedurl from " + self.pagetable + " where scrapedurl = '" + url + "' limit 1"]:
+ try:
+ result = scraperwiki.sqlite.select(sql)
+ #print sql, " : ", result
+ if 0 < len(result) and u'scrapedurl' in result[0]:
+ return True
+ except Exception as e:
+ #if ('no such table: %s' % self.pagetable) not in str(e) and 'no such table: swdata' not in str(e):
+ # raise
+ print "Exception: %s" % e
+ return False
+
+ # Check if we recognize the page content, and throw if not
+ def is_valid_page(self, pdfurl, pagenum, pagecontent):
+ s = BeautifulSoup(pagecontent)
+ for t in s.findAll('text'):
+ if t.text != " ":
+ if self.debug:
+ print t.text
+ if 'Innhold:' == t.text: # type 1 or 2 (ePhorte)
+ s = None
+ return True
+ if 'Arkivdel:' == t.text or 'Notater (X):' == t.text: # type 3 (doculive)
+ s = None
+ return True
+ s = None
+ if self.debug:
+ print "Unrecognized page format for " + pdfurl
+ raise ValueError("Unrecognized page format for " + pdfurl)
+
+ #
+ # Split PDF content into pages and store in SQL table for later processing.
+ # The process is split in two to better handle large PDFs (like 600 pages)
+ # without running out of CPU time and without losing track of what is left
+ # to parse.
+ def preprocess(self, pdfurl, pdfcontent):
+ print "Preprocessing PDF " + pdfurl
+ if not pdfcontent:
+ raise ValueError("No pdf content passed for " + pdfurl)
+ if self.hiddentext:
+ options = '-hidden'
+ else:
+ options = ''
+ xml=scraperwiki.pdftoxml(pdfcontent, options)
+ if self.debug:
+ print xml
+ pages=re.findall('(<page .+?</page>)',xml,flags=re.DOTALL)
+ xml=None
+# print pages[:1][:1000]
+ pagecount = 0
+ datastore = []
+ for page in pages:
+ pagecount = pagecount + 1
+ self.is_valid_page(pdfurl, pagecount, page)
+ data = {
+ 'scrapedurl' : pdfurl,
+ 'pagenum' : pagecount,
+ 'pagecontent' : page,
+ }
+ datastore.append(data)
+ if 0 < len(datastore):
+ scraperwiki.sqlite.save(unique_keys=['scrapedurl', 'pagenum'], data=datastore, table_name=self.pagetable)
+ else:
+ raise ValueError("Unable to find any pages in " + pdfurl)
+ pages = None
+
+ def fetch_and_preprocess(self, pdfurl):
+ pdfcontent = fetch_url_harder(pdfurl)
+ self.preprocess(pdfurl, pdfcontent)
+ pdfcontent = None
+
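+ # A minimal sketch of the two-phase flow described in the comment above
+ # preprocess(), as driven from a scraper that has swimport'ed this library
+ # as postlistelib.  The agency name and PDF URL are placeholders, not taken
+ # from any real scraper:
+ #
+ #   parser = postlistelib.PDFJournalParser(agency="Example agency")
+ #   parser.fetch_and_preprocess("http://www.example.org/postjournal.pdf")
+ #   parser.process_pages()  # parse queued pages, park failures in brokenpages
+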
+ def print_entry(self, entrytext):
+ for i in range(0, len(entrytext)):
+ print str(i) + ": '" + entrytext[i] + "'"
+
+ # ePhorte PDF (type 1)
+ def parse_entry_type1(self, entrytext, pdfurl):
+ scrapestamputc = datetime.datetime.now()
+ entry = {
+ 'agency' : self.agency,
+ 'scrapestamputc' : scrapestamputc,
+ 'scrapedurl' : pdfurl
+ }
+ i = 0
+ while i < len(entrytext):
+ #print "T: '" + entrytext[i] + "'"
+ if 'Innhold:' == entrytext[i]:
+ tittel = ""
+ # handle multi-line titles
+ while 'Sakstittel:' != entrytext[i+1]:
+ tittel = tittel + " " + entrytext[i+1]
+ i = i + 1
+ entry['docdesc'] = tittel
+ if 'Sakstittel:' == entrytext[i]:
+ sakstittel = ""
+ while 'DokType' != entrytext[i+1]:
+# print "'" + entrytext[i+1] + "'"
+ sakstittel = sakstittel + " " + entrytext[i+1]
+ i = i + 1
+ entry['casedesc'] = sakstittel
+ if 'DokType' == entrytext[i]: # Values I/U/N/X from NOARK 4 table 14.2.11
+ entry['doctype'] = entrytext[i+1]
+ # As seen on http://www.saltdal.kommune.no/images/module.files/2007-05-16.pdf, page 1
+ if entry['doctype'] == 'S':
+ entry['doctype'] = 'X'
+ i = i + 1
+ if 'Sak/dok nr:' == entrytext[i]:
+ # FIXME Split and handle combined sak/løpenr
+ # Use find('penr.:') to avoid non-ascii search string 'Løpenr.:'
+ caseid = None
+ lnr = None
+ if -1 != entrytext[i+4].find('penr.:'):
+ caseid = entrytext[i+1] + entrytext[i+2]
+ lnr = entrytext[i+3]
+ i = i + 4
+ elif -1 != entrytext[i+3].find('penr.:'):
+ caseid = entrytext[i+1]
+ lnr = entrytext[i+2]
+ i = i + 3
+ elif -1 != entrytext[i+2].find('penr.:'):
+ caseid, lnr = entrytext[i+1].split(" ")
+ i = i + 2
+
+ caseyear, caseseqnr = caseid.split("/")
+ entry['caseyear'] = int(caseyear)
+ caseseqnr, casedocseq = caseseqnr.split("-")
+ entry['caseseqnr'] = int(caseseqnr)
+ entry['casedocseq'] = int(casedocseq)
+ entry['caseid'] = caseyear + "/" + caseseqnr
+
+ journalseqnr, journalyear = lnr.split("/")
+ entry['journalid'] = journalyear + "/" + journalseqnr
+ entry['journalyear'] = int(journalyear)
+ entry['journalseqnr'] = int(journalseqnr)
+
+# if -1 != text[i].find('penr.:'): # Use find('penr.:') to avoid non-ascii search string 'Løpenr.:'
+# str = text[i-1]
+# print "S: '" + str + "'"
+# data['journalid'] = str
+# # FIXME handle combined sak/løpenr
+ if 'Journaldato:' == entrytext[i]:
+ entry['recorddate'] = dateutil.parser.parse(entrytext[i-1], dayfirst=True)
+ if 'Dok.dato:' == entrytext[i]:
+ entry['docdate'] = dateutil.parser.parse(entrytext[i-1], dayfirst=True)
+ if 'Tilg.kode Hjemmel:' == entrytext[i] and 'Avsender\mottaker:' != entrytext[i+1]:
+ entry['exemption'] = entrytext[i+1]
+ i = i + 1
+ if 'Tilg.kode' == entrytext[i]:
+ entry['accesscode'] = entrytext[i+1]
+ i = i + 1
+ if 'Hjemmel:' == entrytext[i]:
+ entry['exemption'] = entrytext[i+1]
+ i = i + 1
+ if 'Avsender\mottaker:' == entrytext[i]:
+ if i+1 < len(entrytext): # Non-empty field
+ fratil = entrytext[i+1]
+ i = i + 1
+ if self.is_sender_doctype(entry['doctype']):
+ entry['sender'] = fratil
+ elif self.is_recipient_doctype(entry['doctype']):
+ entry['recipient'] = fratil
+ else:
+ raise ValueError("Case " + entry['caseid'] + " Sender/Recipient with doctype " + entry['doctype'] + " != I/U/X/N in " + pdfurl)
+ if self.debug:
+ print entry
+ i = i + 1
+ return entry
+
+ def parse_case_journal_ref(self, entry, reftext, pdfurl):
+ try:
+ # FIXME Split and handle combined sak/loepenr
+ # Use find('penr.:') to avoid non-ascii search string 'Loepenr.:'
+ caseid = None
+ lnr = None
+ if 4 == len(reftext):
+# print "4 " + str(reftext)
+ caseid = reftext[0] + reftext[1]
+ lnr = reftext[2] + reftext[3]
+# print str(caseid) + " " + str(lnr)
+ elif 3 == len(reftext):
+ if -1 != reftext[0].find("/") and -1 != reftext[2].find("/"):
+# print "31"
+ caseid = reftext[0] + reftext[1]
+ lnr = reftext[2]
+ elif -1 != reftext[2].find("/"):
+# print "32"
+ caseid = reftext[0] + reftext[1]
+ lnr = reftext[2]
+ elif -1 == reftext[2].find("/"):
+# print "33"
+ caseid = reftext[0]
+ lnr = reftext[1] + reftext[2]
+ elif 2 == len(reftext):
+ if -1 == reftext[1].find("/"):
+# print "21"
+ s = reftext[0] + reftext[1]
+# print "S: " + s
+ caseid, lnr = s.split(" ")
+ elif -1 != reftext[1].find("/"):
+# print "22"
+ caseid = reftext[0]
+ lnr = reftext[1]
+ elif 1 == len(reftext):
+ caseid, lnr = reftext[0].split(" ")
+ else:
+ raise ValueError("Unable to parse entry " + str(reftext) + " in " + pdfurl)
+# print "C: " + caseid + " L: " + lnr
+
+ caseyear, caseseqnr = caseid.split("/")
+ entry['caseyear'] = int(caseyear)
+ caseseqnr, casedocseq = caseseqnr.split("-")
+ entry['caseseqnr'] = int(caseseqnr)
+ entry['casedocseq'] = int(casedocseq)
+ entry['caseid'] = caseyear + "/" + caseseqnr
+
+ journalseqnr, journalyear = lnr.split("/")
+ entry['journalid'] = journalyear + "/" + journalseqnr
+ entry['journalyear'] = int(journalyear)
+ entry['journalseqnr'] = int(journalseqnr)
+ except:
+ print "Unable to parse " + str(reftext)
+ return entry
+
+ def test_parse_case_journal_ref(self):
+ entry = {}
+ self.parse_case_journal_ref(entry, [u'2008/16414-', u'23', u'15060/2012'], "")
+ self.parse_case_journal_ref(entry, [u'2011/15972-1 102773/201', u'1'], "")
+ self.parse_case_journal_ref(entry, [u'2010/2593-2', u'103004/201', u'1'], "")
+ self.parse_case_journal_ref(entry, [u'2011/13415-', u'22', u'100077/201', u'1'], "")
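+ # For illustration (expected values derived from parse_case_journal_ref
+ # above, not from real data): the first call should leave entry with
+ # caseid '2008/16414', caseyear 2008, caseseqnr 16414, casedocseq 23,
+ # journalid '2012/15060', journalyear 2012 and journalseqnr 15060.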
+
+ # ePhorte PDF (type 2)
+ def parse_entry_type2(self, entrytext, pdfurl):
+ scrapestamputc = datetime.datetime.now()
+ entry = {
+ 'agency' : self.agency,
+ 'scrapestamputc' : scrapestamputc,
+ 'scrapedurl' : pdfurl
+ }
+ i = 0
+ avsender = []
+ mottaker = []
+ while i < len(entrytext):
+ if 'Innhold:' == entrytext[i]:
+ tittel = ""
+ # handle multi-line titles
+ while 'Sakstittel:' != entrytext[i+1]:
+ tittel = tittel + entrytext[i+1]
+ i = i + 1
+ entry['docdesc'] = tittel
+ if 'Sakstittel:' == entrytext[i]:
+ sakstittel = ""
+ # Klassering is in a different document type
+ while 'DokType' != entrytext[i+1] and 'Dok.Type:' != entrytext[i+1] and 'Klassering:' != entrytext[i+1]:
+# print "'" + entrytext[i+1] + "'"
+ sakstittel = sakstittel + entrytext[i+1]
+ i = i + 1
+ entry['casedesc'] = sakstittel
+ i = i + 1
+ if 'DokType' == entrytext[i] or 'Dok.Type:' == entrytext[i]: # Values I/U/N/X from NOARK 4 table 14.2.11
+ entry['doctype'] = entrytext[i+1]
+ # As seen on http://www.uis.no/getfile.php/Journal%20200612.pdf
+ if entry['doctype'] == 'S':
+ entry['doctype'] = 'X'
+ i = i + 1
+ if 'Sak/dok nr:' == entrytext[i] or 'Sak/dok.nr:' == entrytext[i]:
+ endi = i
+ while endi < len(entrytext):
+ if -1 != entrytext[endi].find('penr.:') or -1 != entrytext[endi].find('penr:'):
+ break
+ endi = endi + 1
+ entry = self.parse_case_journal_ref(entry, entrytext[i+1:endi], pdfurl)
+ i = endi + 1
+# if -1 != text[i].find('penr.:'): # Use find('penr.:') to avoid non-ascii search string 'Løpenr.:'
+# str = text[i-1]
+# print "S: '" + str + "'"
+# data['journalid'] = str
+# # FIXME handle combined sak/løpenr
+ if 'Journaldato:' == entrytext[i]:
+ entry['recorddate'] = dateutil.parser.parse(entrytext[i-1], dayfirst=True)
+ if 'Dok.dato:' == entrytext[i]:
+ entry['docdate'] = dateutil.parser.parse(entrytext[i-1], dayfirst=True)
+ if 'Tilg.kode Hjemmel:' == entrytext[i] and '(enhet/initialer):' != entrytext[i+2]:
+ entry['exemption'] = entrytext[i+1]
+ i = i + 1
+ if 'Tilg.kode' == entrytext[i]:
+ entry['accesscode'] = entrytext[i+1]
+ i = i + 1
+ if 'Hjemmel:' == entrytext[i]:
+ entry['exemption'] = entrytext[i+1]
+ i = i + 1
+# if -1 != text[i].find('Avs./mottaker:'):
+# FIXME Need to handle senders and receivers
+ if 'Mottaker' == entrytext[i]:
+ mottaker.append(entrytext[i-1])
+ if 'Avsender' == entrytext[i]:
+ avsender.append(entrytext[i-1])
+# entry['sender'] = 'parse error'
+# entry['recipient'] = 'parse error'
+ i = i + 1
+ if 0 < len(mottaker):
+ entry['recipient'] = string.join(mottaker, ", ")
+ if 0 < len(avsender):
+ entry['sender'] = string.join(avsender, ", ")
+ return entry
+
+ def parse_entry_type3(self, entrytext, pdfurl):
+ scrapestamputc = datetime.datetime.now()
+ entry = {
+ 'agency' : self.agency,
+ 'scrapestamputc' : scrapestamputc,
+ 'scrapedurl' : pdfurl
+ }
+ cur = 0
+ # entrytext is the list of text fragments collected by parse_page for this entry
+ while cur < len(entrytext):
+ line = entrytext[cur]
+ #print line
+ if -1 != line.find('Dok.dato:'):
+ entry['docid'] = entrytext[cur-2]
+ entry['doctype'] = entrytext[cur-1]
+ entry['docdate'] = parse_date(line.replace("Dok.dato:", ""))
+ caseyear, caseseqnr, casedocseq = split_docid(entry['docid'])
+ entry['caseyear'] = caseyear
+ entry['caseseqnr'] = caseseqnr
+ entry['casedocseq'] = casedocseq
+ entry['caseid'] = str(caseyear) + '/' + str(caseseqnr)
+ if -1 != line.find('Jour.dato:'):
+ entry['recorddate'] = parse_date(entrytext[cur+1])
+ cur = cur + 1
+ if -1 != line.find('Arkivdel:'):
+ entry['arkivdel'] = entrytext[cur+1]
+ cur = cur + 1
+ if -1 != line.find('Tilg. kode:'):
+ entry['tilgangskode'] = line.replace("Tilg. kode:", "")
+ if -1 != line.find('Sak:'):
+ entry['casedesc'] = entrytext[cur+1]
+ cur = cur + 1
+ if -1 != line.find('Dok:'):
+ entry['docdesc'] = entrytext[cur+1]
+ cur = cur + 1
+ if -1 != line.find('Par.:'):
+ entry['exemption'] = line.replace("Par.:", "")
+ cur = cur + 1
+ if -1 != line.find('Avsender:'):
+ entry['sender'] = entrytext[cur+1]
+ cur = cur + 1
+ if -1 != line.find('Mottaker:'):
+ entry['recipient'] = entrytext[cur+1]
+ cur = cur + 1
+ if -1 != line.find('Saksansv:'):
+ entry['saksansvarlig'] = line.replace("Saksansv:", "").strip()
+ if -1 != line.find('Saksbeh:'):
+ entry['saksbehandler'] = entrytext[cur+1]
+ cur = cur + 1
+ cur = cur + 1
+ print entry
+ return entry
+
+ def parse_page(self, pdfurl, pagenum, pagecontent):
+ print "Scraping " + pdfurl + " page " + str(pagenum)
+ s = BeautifulSoup(pagecontent)
+ datastore = []
+ text = []
+ linecount = 0
+ if self.debug:
+ print s
+ for t in s.findAll('text'):
+ if t.text != " ":
+ text.append(t.text)
+ if self.debug:
+ print str(linecount) + ": " + t.text
+# FIXME Remove length limit when working
+# if 100 <= linecount:
+# break
+ linecount = linecount + 1
+# if -1 != t.text.find("Side:"):
+# print t.text
+ s = None
+
+# print "Found " + str(linecount) + " lines/text fragments in the PDF"
+ if len(text) < linecount:
+ raise ValueError("Text array too sort!")
+
+ # First count how many entries to expect on this page, to be able to
+ # verify that all of them were found.
+ entrycount = 0
+ i = 0
+ while i < len(text):
+ # Type 1 and 2 (ePhorte)
+ if 'Innhold:' == text[i] or \
+ 'Arkivdel:' == text[i]: # type 3 (doculive)
+ entrycount = entrycount + 1
+ i = i + 1
+
+ i = 0
+ while i < len(text):
+ if self.debug:
+ print "T: '" + text[i] + "'"
+ if self.debug and -1 != text[i].find("Side:"):
+ print text[i]
+ if 'Innhold:' == text[i]:
+ endi = i + 1
+ pdfparser = None
+ format = "unknown"
+ while endi < len(text):
+ if 'Klassering:' == text[endi]:
+ print "Found ePhorte PDF (type 1)"
+ pdfparser = self.parse_entry_type2
+ format = "type2"
+ if 'Avsender\mottaker:' == text[endi]:
+ print "Found ePhorge PDF (type 2)"
+ pdfparser = self.parse_entry_type1
+ format = "type1"
+ if 'Arkivdel:' == text[endi]:
+ print "Found Doculive PDF"
+ pdfparser = self.parse_entry_type3
+ format = "type3"
+ if 'Innhold:' == text[endi]:
+ break
+ endi = endi + 1
+ if self.debug:
+ print "Entry " + str(entrycount) + " from " + str(i) + " to " + str(endi) + " ie " + str(endi - i) + " lines"
+ try:
+ entry = pdfparser(text[i:endi], pdfurl)
+ if 'caseid' not in entry or entry['caseid'] is None or \
+ not self.is_valid_doctype(entry['doctype']):
+ raise ValueError("Unable to parse " + pdfurl + " as format " + format + "[" + str(entry) + "]")
+# print entry
+ datastore.append(entry)
+ i = endi - 2
+ except:
+ self.print_entry(text[i:endi])
+ raise
+ i = i + 1
+# print data
+# print "Found " + str(len(datastore)) + " of " + str(entrycount) + " entries"
+ if entrycount != len(datastore):
+# print text
+ raise ValueError("Unable to parse all entries in " + pdfurl)
+ if 0 == len(datastore):
+ print "Unable to find any entries in " + pdfurl
+ else:
+ scraperwiki.sqlite.save(unique_keys=['caseid', 'casedocseq'], data=datastore)
+ datastore = None
+ text = None
+
+ def process_pages(self):
+ try:
+ sqlselect = "* from " + self.pagetable + " limit 1"
+ pageref = scraperwiki.sqlite.select(sqlselect)
+ while pageref:
+ scrapedurl = pageref[0]['scrapedurl']
+ pagenum = pageref[0]['pagenum']
+ pagecontent = pageref[0]['pagecontent']
+# print "Found " + scrapedurl + " page " + str(pagenum) + " length " + str(len(pagecontent))
+ try:
+ sqldelete = "delete from " + self.pagetable + " where scrapedurl = '" + scrapedurl + "' and pagenum = " + str(pagenum)
+ self.parse_page(scrapedurl, pagenum, pagecontent)
+# print "Trying to: " + sqldelete
+ scraperwiki.sqlite.execute(sqldelete)
+ except ValueError, e:
+ brokenpage = {
+ 'scrapedurl' : scrapedurl,
+ 'pagenum' : pagenum,
+ 'pagecontent' : pagecontent,
+ }
+ print "Broken page %d from %s" % (pagenum, scrapedurl)
+ scraperwiki.sqlite.save(unique_keys=['scrapedurl', 'pagenum'], data=brokenpage, table_name=self.brokenpagetable)
+ print e
+ scraperwiki.sqlite.execute(sqldelete)
+ scraperwiki.sqlite.commit()
+ pageref = scraperwiki.sqlite.select(sqlselect)
+ except scraperwiki.sqlite.SqliteError, e:
+ print str(e)
+ raise
+
+def fieldlist():
+ import urllib2
+ import json
+
+ scrapers = [
+ 'postliste-universitetet-i-oslo',
+ 'postliste-lindesnes',
+ 'postliste-kristiansund',
+ 'postliste-stortinget',
+ 'postliste-arendal',
+ 'postliste-oep',
+ 'postliste-ballangen',
+ 'postliste-hadsel',
+ 'postliste-storfjord',
+ 'postliste-oslo-havn',
+ ]
+
+ keys = {}
+
+ for scraper in scrapers:
+ url = 'https://api.scraperwiki.com/api/1.0/scraper/getinfo?format=jsondict&name=' + scraper + '&version=-1'
+ response = urllib2.urlopen(url)
+ html = response.read()
+ data = json.loads(html)
+ if 'swdata' in data[0]['datasummary']['tables']:
+ for key in data[0]['datasummary']['tables']['swdata']['keys']:
+ key = key.lower()
+ if key in keys:
+ keys[key].append(scraper)
+ else:
+ keys[key] = [scraper]
+ def lensort(a, b):
+ return cmp(len(keys[b]), len(keys[a]))
+
+ for key in sorted(keys.keys(), lensort):
+ print len(keys[key]), key, str(keys[key])
+
+def test_parser():
+ parser = PDFJournalParser(agency="Dummy agency")
+ parser.debug = True
+ for url in [ #"http://www.stortinget.no/Global/pdf/postjournal/pj-2011-06-23.pdf",
+ "http://www.radhusets-forvaltningstjeneste.oslo.kommune.no/getfile.php/rådhusets%20forvaltningstjeneste%20(RFT)/Intranett%20(RFT)/Dokumenter/Postjournal/11%20November/29112011.pdf"]:
+ pdfcontent = scraperwiki.scrape(url)
+ parser.preprocess(url,pdfcontent)
+ parser.process_pages()
+
+
+if __name__ == "scraper":
+ test_parser()
+# fieldlist()
diff --git a/scrapersources/postliste-stavanger-universitetssjukehus b/scrapersources/postliste-stavanger-universitetssjukehus
new file mode 100644
index 0000000..5a9dc08
--- /dev/null
+++ b/scrapersources/postliste-stavanger-universitetssjukehus
@@ -0,0 +1,81 @@
+# -*- coding: UTF-8 -*-
+
+import scraperwiki
+import json
+from BeautifulSoup import BeautifulSoup
+import datetime
+import dateutil.parser
+import lxml.html
+import resource
+import sys
+import urlparse
+import re
+lazycache=scraperwiki.swimport('lazycache')
+postlistelib=scraperwiki.swimport('postliste-python-lib')
+
+agency = 'Stavanger Universitetssjukehus – Helse Stavanger HF'
+
+def report_errors(errors):
+ if 0 < len(errors):
+ print "Errors:"
+ for e in errors:
+ print e
+ exit(1)
+
+def out_of_cpu(arg, spent, hard, soft):
+ report_errors(arg)
+
+def process_pdf(parser, pdfurl, errors):
+ postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
+ try:
+ pdfcontent = scraperwiki.scrape(pdfurl)
+ parser.preprocess(pdfurl, pdfcontent)
+ pdfcontent = None
+# except ValueError, e:
+# errors.append(e)
+ except IndexError, e:
+ errors.append(e)
+
+def process_page_queue(parser, errors):
+ try:
+ parser.process_pages()
+ postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
+ except scraperwiki.CPUTimeExceededError, e:
+ errors.append("Processing pages interrupted")
+
+def process_journal_pdfs(parser, listurl, errors):
+# print "Finding PDFs on " + listurl
+# u = urllib.parse.urlparse(listurl)
+ html = scraperwiki.scrape(listurl)
+ root = lxml.html.fromstring(html)
+ html = None
+ for ahref in root.cssselect("table a"):
+ href = ahref.attrib['href']
+ url = urlparse.urljoin(listurl, href)
+ if -1 != href.find("file://") or -1 == url.find(".pdf"):
+# print "Skipping non-http URL " + url
+ continue
+ if parser.is_already_scraped(url):
+ pass
+# print "Skipping already scraped " + url
+ else:
+# print "Will process " + url
+ process_pdf(parser, url, errors)
+
+def test_small_pdfs(parser):
+ # Test with some smaller PDFs
+ errors = []
+ process_pdf(parser, "http://www.sus.no/aktuelt/postjournal/Documents/2012/2012-06-18.pdf", errors)
+ process_page_queue(parser, errors)
+ report_errors(errors)
+ exit(0)
+
+errors = []
+parser = postlistelib.PDFJournalParser(agency=agency)
+
+#test_small_pdfs(parser)
+
+process_page_queue(parser, errors)
+process_journal_pdfs(parser, "http://www.sus.no/aktuelt/postjournal/Sider/side.aspx", errors)
+process_page_queue(parser, errors)
+report_errors(errors) \ No newline at end of file
diff --git a/scrapersources/postliste-universitetet-i-agder b/scrapersources/postliste-universitetet-i-agder
new file mode 100644
index 0000000..cfdfddc
--- /dev/null
+++ b/scrapersources/postliste-universitetet-i-agder
@@ -0,0 +1,85 @@
+# -*- coding: UTF-8 -*-
+
+import scraperwiki
+import json
+from BeautifulSoup import BeautifulSoup
+import datetime
+import dateutil.parser
+import lxml.html
+import resource
+import sys
+import urlparse
+import re
+
+# Make sure Scraperwiki believes this is the source for this database
+scraperwiki.scrape("http://www.uia.no/no/portaler/om_universitetet/offentlig_journal")
+
+lazycache=scraperwiki.swimport('lazycache')
+postlistelib=scraperwiki.swimport('postliste-python-lib')
+
+agency = 'Universitetet i Agder'
+
+def report_errors(errors):
+ if 0 < len(errors):
+ print "Errors:"
+ for e in errors:
+ print e
+ raise ValueError("Something went wrong")
+
+def out_of_cpu(arg, spent, hard, soft):
+ report_errors(arg)
+
+def process_pdf(parser, pdfurl, errors):
+ postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
+ try:
+ pdfcontent = scraperwiki.scrape(pdfurl)
+ parser.preprocess(pdfurl, pdfcontent)
+ pdfcontent = None
+ except ValueError, e:
+ errors.append(e)
+ except IndexError, e:
+ errors.append(e)
+
+def process_page_queue(parser, errors):
+ try:
+ parser.process_pages()
+ postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
+ except scraperwiki.CPUTimeExceededError, e:
+ errors.append("Processing pages interrupted")
+
+def process_journal_pdfs(parser, listurl, errors):
+# print "Finding PDFs on " + listurl
+# u = urllib.parse.urlparse(listurl)
+ html = scraperwiki.scrape(listurl)
+ root = lxml.html.fromstring(html)
+ html = None
+ for ahref in root.cssselect("table a"):
+ href = ahref.attrib['href']
+ url = urlparse.urljoin(listurl, href).replace(" ", "%20")
+ if -1 != href.find("file://") or -1 == url.find(".pdf"):
+# print "Skipping non-http URL " + url
+ continue
+ if parser.is_already_scraped(url):
+ pass
+# print "Skipping already scraped " + url
+ else:
+# print "Will process " + url
+ process_pdf(parser, url, errors)
+
+def test_small_pdfs(parser):
+ # Test with some smaller PDFs
+ errors = []
+ process_pdf(parser, "http://www.uia.no/no/content/download/297514/5641673/file/Uke%2018.pdf", errors)
+ process_page_queue(parser, errors)
+ report_errors(errors)
+ exit(0)
+
+errors = []
+parser = postlistelib.PDFJournalParser(agency=agency)
+
+#test_small_pdfs(parser)
+
+process_journal_pdfs(parser, "http://www.uia.no/no/portaler/om_universitetet/offentlig_journal", errors)
+process_page_queue(parser, errors)
+report_errors(errors)
+
diff --git a/scrapersources/postliste-universitetssykehuset-nord-norge b/scrapersources/postliste-universitetssykehuset-nord-norge
new file mode 100644
index 0000000..1b06793
--- /dev/null
+++ b/scrapersources/postliste-universitetssykehuset-nord-norge
@@ -0,0 +1,96 @@
+# -*- coding: UTF-8 -*-
+
+
+import scraperwiki
+import json
+from BeautifulSoup import BeautifulSoup
+import datetime
+import dateutil.parser
+import lxml.html
+import urlparse
+import re
+
+# Make sure Scraperwiki believes this is the source for this database
+scraperwiki.scrape("http://www.unn.no/offentlig-postjournal/category8944.html")
+
+lazycache=scraperwiki.swimport('lazycache')
+postlistelib=scraperwiki.swimport('postliste-python-lib')
+
+agency = 'Universitetssykehuset Nord-Norge'
+
+def report_errors(errors):
+ if 0 < len(errors):
+ print "Errors:"
+ for e in errors:
+ print e
+ raise ValueError("Something went wrong")
+
+def out_of_cpu(arg, spent, hard, soft):
+ report_errors(arg)
+
+def process_pdf(parser, pdfurl, errors):
+ postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
+ try:
+ pdfcontent = scraperwiki.scrape(pdfurl)
+ parser.preprocess(pdfurl, pdfcontent)
+ pdfcontent = None
+ except ValueError, e:
+ errors.append(e)
+ except IndexError, e:
+ errors.append(e)
+
+def process_page_queue(parser, errors):
+ try:
+ parser.process_pages()
+ postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
+ except scraperwiki.CPUTimeExceededError, e:
+ errors.append("Processing pages interrupted")
+
+def process_journal_pdfs(parser, listurl, errors):
+# print "Finding PDFs on " + listurl
+# u = urllib.parse.urlparse(listurl)
+ html = scraperwiki.scrape(listurl)
+ root = lxml.html.fromstring(html)
+ html = None
+ for ahref in root.cssselect("div.month-entry-title a"):
+ href = ahref.attrib['href']
+ url = urlparse.urljoin(listurl, href)
+ print url
+ if -1 != href.find("file://"):
+# print "Skipping non-http URL " + url
+ continue
+ subhtml = scraperwiki.scrape(url)
+ subroot = lxml.html.fromstring(subhtml)
+ subhtml = None
+ for subahref in subroot.cssselect("div.related-attachements a"):
+ subhref = subahref.attrib['href']
+ suburl = urlparse.urljoin(url, subhref)
+ if -1 == suburl.find(".pdf"):
+ continue
+ if parser.is_already_scraped(suburl):
+ pass
+# print "Skipping already scraped " + url
+ else:
+# print "Will process " + url
+ process_pdf(parser, suburl, errors)
+
+def test_small_pdfs(parser):
+ # Test with some smaller PDFs
+ errors = []
+ process_pdf(parser, "http://www.unn.no/getfile.php/UNN-Internett/Media/Postjournal/UNN%20offentlig%20journal%202007/200807.pdf", errors)
+ process_page_queue(parser, errors)
+ report_errors(errors)
+ exit(0)
+
+errors = []
+parser = postlistelib.PDFJournalParser(agency=agency)
+
+#test_small_pdfs(parser)
+
+process_page_queue(parser, errors)
+process_journal_pdfs(parser, "http://www.unn.no/offentlig-postjournal/category8944.html", errors)
+for year in range(2011, 2007, -1):
+ process_journal_pdfs(parser, "http://www.unn.no/offentlig-postjournal/category8944.html?year=" + str(year), errors)
+process_page_queue(parser, errors)
+report_errors(errors)
+