author | Petter Reinholdtsen <pere@hungry.com> | 2012-07-13 12:34:28 +0200 |
---|---|---|
committer | Petter Reinholdtsen <pere@hungry.com> | 2012-07-13 12:34:28 +0200 |
commit | c99e7bfda6e025314b6a7c6683a1bc3c5818621c (patch) | |
tree | ef4230a7830973c37e9d83c9e773d41a56834658 | |
parent | 22bceaf65dd89df97529df0102149aefa2b54f54 (diff) | |
Updated from scraperwiki.
22 files changed, 1881 insertions, 80 deletions
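Most of the new files in this commit are ScraperWiki scrapers that share the same skeleton: register the source URL, collect PDF links from a listing page, queue each PDF with postlistelib.PDFJournalParser, and report accumulated errors. Below is a condensed sketch of that shared pattern (Python 2 as used on ScraperWiki; the agency name, listing URL and CSS selector are illustrative placeholders, and the CPU-limit handling via postlistelib.exit_if_no_cpu_left used in the real scrapers is left out):

```python
# Condensed sketch of the scraper skeleton repeated by the new scrapers in
# this commit.  Assumes the ScraperWiki Python 2 environment; 'Example
# kommune', the listing URL and the CSS selector are placeholders.
import scraperwiki
import lxml.html
import urlparse

postlistelib = scraperwiki.swimport('postliste-python-lib')

agency = 'Example kommune'                    # placeholder
listurl = 'http://www.example.no/postliste/'  # placeholder

def report_errors(errors):
    # Raise instead of exit(1) so a failed run is flagged as an error.
    if errors:
        print "Errors:"
        for e in errors:
            print e
        raise ValueError("%d errors detected" % len(errors))

def process_pdf(parser, pdfurl, errors):
    # Fetch one journal PDF and queue its pages for later parsing.
    try:
        parser.preprocess(pdfurl, scraperwiki.scrape(pdfurl))
    except (ValueError, IndexError), e:
        errors.append(e)

def process_journal_pdfs(parser, listurl, errors):
    # Collect PDF links from the listing page, skipping already scraped URLs.
    root = lxml.html.fromstring(scraperwiki.scrape(listurl))
    for ahref in root.cssselect("div.content a"):  # selector varies per site
        url = urlparse.urljoin(listurl, ahref.attrib['href'])
        if not parser.is_already_scraped(url):
            process_pdf(parser, url, errors)

errors = []
parser = postlistelib.PDFJournalParser(agency=agency)
process_journal_pdfs(parser, listurl, errors)
parser.process_pages()   # parse the queued PDF pages into journal entries
report_errors(errors)
```

The individual scrapers in the diff differ mainly in the CSS selector used to locate the journal PDFs and in the URL filter applied to each candidate link.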
diff --git a/scrapersources/list-nuug-postliste-scrapers b/scrapersources/list-nuug-postliste-scrapers
new file mode 100644
index 0000000..67c4158
--- /dev/null
+++ b/scrapersources/list-nuug-postliste-scrapers
@@ -0,0 +1,90 @@
+import os
+import urlparse
+urlquery = os.getenv('URLQUERY')
+
+if urlquery:
+    querydata = urlparse.parse_qsl(urlquery);
+    for pair in querydata:
+        if pair[0] == "js" and pair[1] == "jquery.js":
+            print 'js-sourcecode'
+            exit(0)
+
+import urllib2, json, re
+import yaml
+
+url = "https://api.scraperwiki.com/api/1.0/scraper/search?format=jsondict&maxrows=200&searchquery=nuug-postliste-endyaml"
+json_data = json.load(urllib2.urlopen(url))
+print '''<html>
+<head>
+<link rel="stylesheet" href="https://views.scraperwiki.com/run/jquery-tablesorter/?file=style-blue.css" type="text/css" />
+<script type="text/javascript" src="https://views.scraperwiki.com/run/jquery-tablesorter/?file=jquery-1-4-2-min.js"></script>
+<script type="text/javascript" src="https://views.scraperwiki.com/run/jquery-tablesorter/?file=jquery.tablesorter.2-0-5.min.js"></script>'''
+
+print '''</head><body>
+<p>This view lists scrapers with yaml-compatible comments (containing the string "nuug-postliste-endyaml") like the following in their description
+<pre>
+<!-- nuug-postliste-yaml -->
+YAML-tagger:<br>
+Type: kommune<br>
+Status: finished<br>
+Name: Lillesteinsmyr kommune<br>
+Format: PDF<br>
+Datatype: ePhorte<br>
+Run: daily<br>
+<!-- nuug-postliste-endyaml -->
+</pre></p>
+<table id="myTable" class="tablesorter">'''
+
+print '<thead><tr><th>Name</th><th>type</th><th>status</th><th>schedule</th><th>format</th><th>datatype</th><th>URL</th></tr></thead><tbody>'
+counter = {}
+for scraper in json_data:
+    print scraper
+    comment = re.findall(r'<!-- nuug-postliste-yaml -->(.*)<!-- nuug-postliste-endyaml -->',
+                         scraper['description'], re.DOTALL)
+    assert len(comment) == 1
+    data = yaml.load(comment[0].strip().replace('<br>',''))
+
+    if data['Type'] in counter:
+        counter[data['Type']] = counter[data['Type']] + 1
+    else:
+        counter[data['Type']] = 1
+
+    if 'Run' in data: Run = data['Run']
+    else: Run = 'unknown'
+
+    if 'Format' in data: Format = data['Format']
+    else: Format = 'unknown'
+
+    if 'Datatype' in data: Type = data['Datatype']
+    else: Type = 'unknown'
+
+
+    print '<tr><td>%s</td><td>%s</td><td>%s</td><td>%s</td><td>%s</td><td>%s</td><td><a href="https://scraperwiki.com/scrapers/%s/">URL</a></td></tr>' % \
+        (data['Name'],data['Type'],data['Status'], Run, Format, Type, scraper['short_name'])
+print '''</tbody></table><table id="myTable2" class="tablesorter"><thead><tr><th>type</th><th>count</th></tr></thead><tbody>'''
+
+for key in counter:
+    print '<tr><td>%s</td><td>%d</td></tr>' % (key, counter[key])
+print '</tbody></table>'
+
+num_kommune = float(429)
+num_fylke = float(19)
+print '<table class="tablesorter"><thead><tr><td>Type</td><td>Prosent</td></tr></thead><tbody>'
+try:
+    print "<tr><td>Kommune</td><td>%.2f%% (%d av %d)</td></tr>" % \
+        ((float(counter['kommune'])/float(num_kommune))*100, counter['kommune'], num_kommune)
+except KeyError: pass
+try:
+    print "<tr><td>Fylkeskommune</td><td>%.2f%% (%d av %d)</td></tr>" % \
+        ((float(counter['fylkeskommune'])/float(num_fylke))*100, counter['fylkeskommune'], num_fylke)
+except KeyError: pass
+print '''</tbody></table>
+<script type="text/javascript">
+  $(document).ready(function()
+    {
+        $("#myTable").tablesorter();
+        $("#myTable2").tablesorter();
+    }
+);
+</script>
+</body></html>'''
\ No newline at end of file diff --git a/scrapersources/postliste-halden b/scrapersources/postliste-halden index 4b0ebd5..e7c2d30 100644 --- a/scrapersources/postliste-halden +++ b/scrapersources/postliste-halden @@ -83,7 +83,7 @@ def test_small_pdfs(parser): errors = [] parser = postlistelib.PDFJournalParser(agency=agency) -#parser.debug = True +parser.debug = True #test_small_pdfs(parser) process_page_queue(parser, errors) diff --git a/scrapersources/postliste-hoegskolen-i-finnmark b/scrapersources/postliste-hoegskolen-i-finnmark new file mode 100644 index 0000000..2a4b972 --- /dev/null +++ b/scrapersources/postliste-hoegskolen-i-finnmark @@ -0,0 +1,86 @@ +# -*- coding: UTF-8 -*- + +import scraperwiki +import json +from BeautifulSoup import BeautifulSoup +import datetime +import dateutil.parser +import lxml.html +import resource +import sys +import urlparse +import re + +# Make sure Scraperwiki believe this is the source from this database +scraperwiki.scrape("http://www.hifm.no/nor/www_hifm_no/hogskolen-i-finnmark-_-startside/om-hogskolen/om-hogskolen/offentlig-journal-1") + +lazycache=scraperwiki.swimport('lazycache') +postlistelib=scraperwiki.swimport('postliste-python-lib') + +agency = 'Høgskolen i Finnmark' + +def report_errors(errors): + if 0 < len(errors): + print "Errors:" + for e in errors: + print e + raise ValueError("Something went wrong") + +def out_of_cpu(arg, spent, hard, soft): + report_errors(arg) + +def process_pdf(parser, pdfurl, errors): + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + try: + pdfcontent = scraperwiki.scrape(pdfurl) + parser.preprocess(pdfurl, pdfcontent) + pdfcontent = None + except ValueError, e: + errors.append(e) + except IndexError, e: + errors.append(e) + +def process_page_queue(parser, errors): + try: + parser.process_pages() + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + except scraperwiki.CPUTimeExceededError, e: + errors.append("Processing pages interrupted") + +def process_journal_pdfs(parser, listurl, errors): +# print "Finding PDFs on " + listurl +# u = urllib.parse.urlparse(listurl) + html = scraperwiki.scrape(listurl) + root = lxml.html.fromstring(html) + html = None + for ahref in root.cssselect("div.content-padding a"): + href = ahref.attrib['href'] + url = urlparse.urljoin(listurl, href) + if -1 != href.find("file://") or -1 == url.find("/download_journal.php"): +# print "Skipping non-http URL " + url + continue + if parser.is_already_scraped(url): + True +# print "Skipping already scraped " + url + else: +# print "Will process " + url + process_pdf(parser, url, errors) + +def test_small_pdfs(parser): + # Test with some smaller PDFs + errors = [] + process_pdf(parser, "http://www.hifm.no/neted/includes/hifm/download_journal.php?fn=120503", errors) + process_page_queue(parser, errors) + report_errors(errors) + exit(0) + +errors = [] +parser = postlistelib.PDFJournalParser(agency=agency) + +#test_small_pdfs(parser) + +process_page_queue(parser, errors) +process_journal_pdfs(parser, "http://www.hifm.no/nor/www_hifm_no/hogskolen-i-finnmark-_-startside/om-hogskolen/om-hogskolen/offentlig-journal-1/?&type=a", errors) +process_page_queue(parser, errors) +report_errors(errors) + diff --git a/scrapersources/postliste-hoegskolen-i-gjoevik b/scrapersources/postliste-hoegskolen-i-gjoevik index fd197eb..d4f7931 100644 --- a/scrapersources/postliste-hoegskolen-i-gjoevik +++ b/scrapersources/postliste-hoegskolen-i-gjoevik @@ -19,7 +19,7 @@ import re # # # Make sure Scraperwiki believe this is the source from this database 
-scraperwiki.scrape("http://www.hig.no/om_hig/offentleg_journal/2012") +scraperwiki.scrape("http://www.hig.no/om_hig/offentleg_journal") lazycache=scraperwiki.swimport('lazycache') postlistelib=scraperwiki.swimport('postliste-python-lib') @@ -31,19 +31,19 @@ def report_errors(errors): print "Errors:" for e in errors: print e - exit(1) + raise ValueError(str(len(errors)) + " errors detected") + def out_of_cpu(arg, spent, hard, soft): report_errors(arg) def process_pdf(parser, pdfurl, errors): - errors = [] postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) try: pdfcontent = scraperwiki.scrape(pdfurl) parser.preprocess(pdfurl, pdfcontent) pdfcontent = None -# except ValueError, e: -# errors.append(e) + except ValueError, e: + errors.append(e) except IndexError, e: errors.append(e) @@ -73,21 +73,20 @@ def process_journal_pdfs(parser, listurl, errors): # print "Will process " + url process_pdf(parser, url, errors) -def test_small_pdfs(parser): +#def test_small_pdfs(parser): # Test with some smaller PDFs - errors = [] - if parser.is_already_scraped("http://www.hig.no/content/download/30119/360872/file/Offentlig+journal+04.11.2010.pdf"): - print "Skipping already scraped " - exit(1) - else: - print "Will process " - +# errors = [] +# if parser.is_already_scraped("http://www.hig.no/content/download/30119/360872/file/Offentlig+journal+04.11.2010.pdf"): +# print "Skipping already scraped " +# exit(1) +# else: +# print "Will process " #process_pdf(parser, "http://www.hig.no/content/download/35184/430061/file/Offentlig%20journal%2025.06.2012.pdf", errors) #process_pdf(parser, "http://www.hig.no/content/download/30116/360863/file/Offentlig%20journal%2001.11.2010.pdf", errors) - process_pdf(parser, "http://www.hig.no/content/download/30119/360872/file/Offentlig+journal+04.11.2010.pdf", errors) - process_page_queue(parser, errors) - report_errors(errors) - exit(0) +# process_pdf(parser, "http://www.hig.no/content/download/30119/360872/file/Offentlig+journal+04.11.2010.pdf", errors) +# process_page_queue(parser, errors) +# report_errors(errors) +# exit(0) errors = [] parser = postlistelib.PDFJournalParser(agency=agency) @@ -96,9 +95,9 @@ parser = postlistelib.PDFJournalParser(agency=agency) startYear=2010 endYear=datetime.datetime.now().year -for year in range(startYear, endYear): - process_journal_pdfs(parser, "http://www.hig.no/om_hig/offentleg_journal/%d" % year, errors) +for year in range(startYear, endYear+1): # range goes from startyear to endYear-1 + process_journal_pdfs(parser, "http://www.hig.no/om_hig/offentleg_journal/%d" % year, errors) process_page_queue(parser, errors) report_errors(errors) diff --git a/scrapersources/postliste-hoegskolen-i-nord-troendelag b/scrapersources/postliste-hoegskolen-i-nord-troendelag new file mode 100644 index 0000000..3db177b --- /dev/null +++ b/scrapersources/postliste-hoegskolen-i-nord-troendelag @@ -0,0 +1,88 @@ +# -*- coding: UTF-8 -*- +# Based on the scraper advanced-scraping-pdf +# See also +# https://views.scraperwiki.com/run/pdf-to-html-preview-1/?url=http%3A%2F%2Fwww.stortinget.no%2FGlobal%2Fpdf%2Fpostjournal%2Fpj-2012-05-09.pdf + +import scraperwiki +import json +from BeautifulSoup import BeautifulSoup +import datetime +import dateutil.parser +import lxml.html +import resource +import sys +import urlparse +import re + +# Make sure Scraperwiki believe this is the source from this database +scraperwiki.scrape("http://www.hint.no/aktuelt/offentlig_postjournal") + +lazycache=scraperwiki.swimport('lazycache') 
+postlistelib=scraperwiki.swimport('postliste-python-lib') + +agency = 'Høgskolen i Nord-Trøndelag' + +def report_errors(errors): + if 0 < len(errors): + print "Errors:" + for e in errors: + print e + raise ValueError("Something went wrong") + +def out_of_cpu(arg, spent, hard, soft): + report_errors(arg) + +def process_pdf(parser, pdfurl, errors): + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + try: + pdfcontent = scraperwiki.scrape(pdfurl) + parser.preprocess(pdfurl, pdfcontent) + pdfcontent = None + except ValueError, e: + errors.append(e) + except IndexError, e: + errors.append(e) + +def process_page_queue(parser, errors): + try: + parser.process_pages() + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + except scraperwiki.CPUTimeExceededError, e: + errors.append("Processing pages interrupted") + +def process_journal_pdfs(parser, listurl, errors): +# print "Finding PDFs on " + listurl +# u = urllib.parse.urlparse(listurl) + html = scraperwiki.scrape(listurl) + root = lxml.html.fromstring(html) + html = None + for ahref in root.cssselect("div.mliste a"): + href = ahref.attrib['href'] + url = urlparse.urljoin(listurl, href) + if -1 != href.find("file://") or -1 == url.find(".pdf"): +# print "Skipping non-http URL " + url + continue + if parser.is_already_scraped(url): + True +# print "Skipping already scraped " + url + else: +# print "Will process " + url + process_pdf(parser, url, errors) + +def test_small_pdfs(parser): + # Test with some smaller PDFs + errors = [] + process_pdf(parser, "http://www.hint.no/content/download/60032/904325/version/1/file/Off.+journal+28.06.2012.pdf", errors) + process_page_queue(parser, errors) + report_errors(errors) + exit(0) + +errors = [] +parser = postlistelib.PDFJournalParser(agency=agency) + +#test_small_pdfs(parser) + +process_journal_pdfs(parser, "http://www.hint.no/aktuelt/offentlig_postjournal", errors) +process_page_queue(parser, errors) +report_errors(errors) + diff --git a/scrapersources/postliste-hoegskolen-i-telemark b/scrapersources/postliste-hoegskolen-i-telemark new file mode 100644 index 0000000..a41d014 --- /dev/null +++ b/scrapersources/postliste-hoegskolen-i-telemark @@ -0,0 +1,86 @@ +# -*- coding: UTF-8 -*- + +import scraperwiki +import json +from BeautifulSoup import BeautifulSoup +import datetime +import dateutil.parser +import lxml.html +import resource +import sys +import urlparse +import re + +# Make sure Scraperwiki believe this is the source from this database +scraperwiki.scrape("http://www.hit.no/nor/HiT/Om-HiT/Offentlig-journal-for-HiT") + +lazycache=scraperwiki.swimport('lazycache') +postlistelib=scraperwiki.swimport('postliste-python-lib') + +agency = 'Høgskolen i Telemark' + +def report_errors(errors): + if 0 < len(errors): + print "Errors:" + for e in errors: + print e + raise ValueError("Something went wrong") + +def out_of_cpu(arg, spent, hard, soft): + report_errors(arg) + +def process_pdf(parser, pdfurl, errors): + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + try: + pdfcontent = scraperwiki.scrape(pdfurl) + parser.preprocess(pdfurl, pdfcontent) + pdfcontent = None + except ValueError, e: + errors.append(e) + except IndexError, e: + errors.append(e) + +def process_page_queue(parser, errors): + try: + parser.process_pages() + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + except scraperwiki.CPUTimeExceededError, e: + errors.append("Processing pages interrupted") + +def process_journal_pdfs(parser, listurl, errors): +# print "Finding PDFs on " + listurl +# u = 
urllib.parse.urlparse(listurl) + html = scraperwiki.scrape(listurl) + root = lxml.html.fromstring(html) + html = None + for ahref in root.cssselect("table a"): + href = ahref.attrib['href'] + url = urlparse.urljoin(listurl, href) + if -1 != href.find("file://") or -1 == url.find(".pdf"): +# print "Skipping non-http URL " + url + continue + if parser.is_already_scraped(url): + True +# print "Skipping already scraped " + url + else: +# print "Will process " + url + process_pdf(parser, url, errors) + +def test_small_pdfs(parser): + # Test with some smaller PDFs + errors = [] + process_pdf(parser, "http://www.hit.no/nor/content/download/128467/1372770/file/Offentlig+journal+uke+1.pdf", errors) + process_page_queue(parser, errors) + report_errors(errors) + exit(0) + +errors = [] +parser = postlistelib.PDFJournalParser(agency=agency) + +#test_small_pdfs(parser) + +process_page_queue(parser, errors) +process_journal_pdfs(parser, "http://www.hit.no/nor/HiT/Om-HiT/Offentlig-journal-for-HiT", errors) +process_page_queue(parser, errors) +report_errors(errors) + diff --git a/scrapersources/postliste-hoegskolen-i-volda b/scrapersources/postliste-hoegskolen-i-volda new file mode 100644 index 0000000..0106cb7 --- /dev/null +++ b/scrapersources/postliste-hoegskolen-i-volda @@ -0,0 +1,88 @@ +# -*- coding: UTF-8 -*- + +import scraperwiki +import json +from BeautifulSoup import BeautifulSoup +import datetime +import dateutil.parser +import lxml.html +import resource +import sys +import urlparse +import re + +# Make sure Scraperwiki believe this is the source from this database +scraperwiki.scrape("http://www.hivolda.no/nyn/hivolda/om-hogskulen/administrasjon/dokumentsenteret") + +lazycache=scraperwiki.swimport('lazycache') +postlistelib=scraperwiki.swimport('postliste-python-lib') + +agency = 'Høgskolen i Volda' + +def report_errors(errors): + if 0 < len(errors): + print "Errors:" + for e in errors: + print e + raise ValueError("Something went wrong") + +def out_of_cpu(arg, spent, hard, soft): + report_errors(arg) + +def process_pdf(parser, pdfurl, errors): + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + try: + pdfcontent = scraperwiki.scrape(pdfurl) + parser.preprocess(pdfurl, pdfcontent) + pdfcontent = None + except ValueError, e: + errors.append(e) + except IndexError, e: + errors.append(e) + +def process_page_queue(parser, errors): + try: + parser.process_pages() + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + except scraperwiki.CPUTimeExceededError, e: + errors.append("Processing pages interrupted") + +def process_journal_pdfs(parser, listurl, errors): +# print "Finding PDFs on " + listurl +# u = urllib.parse.urlparse(listurl) + html = scraperwiki.scrape(listurl) + root = lxml.html.fromstring(html) + html = None + for ahref in root.cssselect("div.inside a"): + if 'id' not in ahref.attrib or -1 == ahref.attrib['id'].find("archiveimage_"): + continue + href = ahref.attrib['href'] + url = urlparse.urljoin(listurl, href) + if -1 != href.find("file://"): +# print "Skipping non-http URL " + url + continue + if parser.is_already_scraped(url): + True +# print "Skipping already scraped " + url + else: +# print "Will process " + url + process_pdf(parser, url, errors) + +def test_small_pdfs(parser): + # Test with some smaller PDFs + errors = [] + process_pdf(parser, "http://www.hivolda.no/neted/modules/archive/front/file.php?data=47449f5f5477b30f13f282759d5f08b1", errors) + process_page_queue(parser, errors) + report_errors(errors) + exit(0) + +errors = [] +parser = 
postlistelib.PDFJournalParser(agency=agency) +#parser.debug = True + +#test_small_pdfs(parser) + +process_journal_pdfs(parser, "http://www.hivolda.no/nyn/hivolda/om-hogskulen/administrasjon/dokumentsenteret", errors) +process_page_queue(parser, errors) +report_errors(errors) + diff --git a/scrapersources/postliste-hvaler b/scrapersources/postliste-hvaler index b3e9137..113b145 100644 --- a/scrapersources/postliste-hvaler +++ b/scrapersources/postliste-hvaler @@ -6,10 +6,11 @@ from BeautifulSoup import BeautifulSoup import datetime import dateutil.parser import lxml.html -import resource -import sys import urlparse import re + +scraperwiki.scrape("http://www.hvaler.kommune.no/Postlister/") + lazycache=scraperwiki.swimport('lazycache') postlistelib=scraperwiki.swimport('postliste-python-lib') @@ -20,12 +21,12 @@ def report_errors(errors): print "Errors:" for e in errors: print e - exit(1) + raise ValueError("Something went wrong, " + str(len(errors)) + " errors detected") + def out_of_cpu(arg, spent, hard, soft): report_errors(arg) def process_pdf(parser, pdfurl, errors): - errors = [] postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) try: pdfcontent = scraperwiki.scrape(pdfurl) @@ -35,6 +36,8 @@ def process_pdf(parser, pdfurl, errors): # errors.append(e) except IndexError, e: errors.append(e) + except ValueError, e: + errors.append(e) def process_page_queue(parser, errors): try: @@ -49,10 +52,11 @@ def process_journal_pdfs(parser, listurl, errors): html = scraperwiki.scrape(listurl) root = lxml.html.fromstring(html) html = None - for ahref in root.cssselect("div#ctl00_MainRegion_StageAreaRegion_MainContentRegion_MainBodyRegion_ctl01_FileTreen0Nodes a"): + for ahref in root.cssselect("table a"): href = ahref.attrib['href'] url = urlparse.urljoin(listurl, href) - if -1 != href.find("file://"): +# print url + if -1 != href.find("file://") or -1 != href.find("javascript:"): # print "Skipping non-http URL " + url continue if parser.is_already_scraped(url): @@ -72,9 +76,11 @@ def test_small_pdfs(parser): errors = [] parser = postlistelib.PDFJournalParser(agency=agency) +#parser.debug = True #test_small_pdfs(parser) +process_page_queue(parser, errors) process_journal_pdfs(parser, "http://www.hvaler.kommune.no/Postlister/", errors) process_page_queue(parser, errors) report_errors(errors) diff --git a/scrapersources/postliste-kafjord-kommune b/scrapersources/postliste-kafjord-kommune new file mode 100644 index 0000000..212a308 --- /dev/null +++ b/scrapersources/postliste-kafjord-kommune @@ -0,0 +1,93 @@ +# -*- coding: UTF-8 -*- + +import scraperwiki +import json +from BeautifulSoup import BeautifulSoup +import datetime +import dateutil.parser +import lxml.html +import resource +import sys +import urlparse +import re + +# Make sure Scraperwiki believe this is the source from this database +scraperwiki.scrape("http://www.kafjord.kommune.no/postlister.18590.no.html") + +lazycache=scraperwiki.swimport('lazycache') +postlistelib=scraperwiki.swimport('postliste-python-lib') + +agency = 'Kåfjord kommune' + +def report_errors(errors): + if 0 < len(errors): + print "Errors:" + for e in errors: + print e + raise ValueError("Something went wrong") + +def out_of_cpu(arg, spent, hard, soft): + report_errors(arg) + +def process_pdf(parser, pdfurl, errors): + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + try: + pdfcontent = scraperwiki.scrape(pdfurl) + parser.preprocess(pdfurl, pdfcontent) + pdfcontent = None + except ValueError, e: + errors.append(e) + except IndexError, e: + errors.append(e) + 
+def process_page_queue(parser, errors): + try: + parser.process_pages() + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + except scraperwiki.CPUTimeExceededError, e: + errors.append("Processing pages interrupted") + +def process_journal_pdfs(parser, listurl, errors): +# print "Finding PDFs on " + listurl +# u = urllib.parse.urlparse(listurl) + html = scraperwiki.scrape(listurl) + root = lxml.html.fromstring(html) + html = None + for ahref in root.cssselect("table a"): + href = ahref.attrib['href'] + url = urlparse.urljoin(listurl, href) + if -1 != href.find("file://") or -1 == url.find("/postliste-"): +# print "Skipping non-http URL " + url + continue + if parser.is_already_scraped(url): + True +# print "Skipping already scraped " + url + else: +# print "Will process " + url + process_pdf(parser, url, errors) + # Follow the "next page" link to the end + for ahref in root.cssselect("center a"): + href = ahref.attrib['href'] + url = urlparse.urljoin(listurl, href) + print ahref.text, url + if -1 != ahref.text.find("Neste side"): + process_journal_pdfs(parser, url, errors) + +def test_small_pdfs(parser): + # Test with some smaller PDFs + errors = [] + process_pdf(parser, "http://www.kafjord.kommune.no/postliste-03-07-12.5071007-18590.html", errors) + process_page_queue(parser, errors) + report_errors(errors) + exit(0) + +errors = [] +parser = postlistelib.PDFJournalParser(agency=agency) + +#test_small_pdfs(parser) + +process_page_queue(parser, errors) +process_journal_pdfs(parser, "http://www.kafjord.kommune.no/postlister.18590.no.html", errors) +process_page_queue(parser, errors) +report_errors(errors) + diff --git a/scrapersources/postliste-loppa-kommune b/scrapersources/postliste-loppa-kommune new file mode 100644 index 0000000..7c7ec72 --- /dev/null +++ b/scrapersources/postliste-loppa-kommune @@ -0,0 +1,88 @@ +# -*- coding: UTF-8 -*- +# Based on the scraper advanced-scraping-pdf +# See also +# https://views.scraperwiki.com/run/pdf-to-html-preview-1/?url=http%3A%2F%2Fwww.stortinget.no%2FGlobal%2Fpdf%2Fpostjournal%2Fpj-2012-05-09.pdf + +import scraperwiki +import json +from BeautifulSoup import BeautifulSoup +import datetime +import dateutil.parser +import lxml.html +import resource +import sys +import urlparse +import re + +# Make sure Scraperwiki believe this is the source from this database +scraperwiki.scrape("http://www.loppa.kommune.no/postjournal.113285.no.html") + +lazycache=scraperwiki.swimport('lazycache') +postlistelib=scraperwiki.swimport('postliste-python-lib') + +agency = 'Loppa kommune' + +def report_errors(errors): + if 0 < len(errors): + print "Errors:" + for e in errors: + print e + raise ValueError("Something went wrong") + +def out_of_cpu(arg, spent, hard, soft): + report_errors(arg) + +def process_pdf(parser, pdfurl, errors): + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + try: + pdfcontent = scraperwiki.scrape(pdfurl) + parser.preprocess(pdfurl, pdfcontent) + pdfcontent = None + except ValueError, e: + errors.append(e) + except IndexError, e: + errors.append(e) + +def process_page_queue(parser, errors): + try: + parser.process_pages() + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + except scraperwiki.CPUTimeExceededError, e: + errors.append("Processing pages interrupted") + +def process_journal_pdfs(parser, listurl, errors): +# print "Finding PDFs on " + listurl +# u = urllib.parse.urlparse(listurl) + html = scraperwiki.scrape(listurl) + root = lxml.html.fromstring(html) + html = None + for ahref in root.cssselect("div.body a"): + 
href = ahref.attrib['href'] + url = urlparse.urljoin(listurl, href) + if -1 != href.find("file://") or -1 == url.find(".pdf"): +# print "Skipping non-http URL " + url + continue + if parser.is_already_scraped(url): + True +# print "Skipping already scraped " + url + else: +# print "Will process " + url + process_pdf(parser, url, errors) + +def test_small_pdfs(parser): + # Test with some smaller PDFs + errors = [] + process_pdf(parser, "http://www.loppa.kommune.no/getfile.php/1983773.670.bbsaudxaex/25_2012.pdf", errors) + process_page_queue(parser, errors) + report_errors(errors) + exit(0) + +errors = [] +parser = postlistelib.PDFJournalParser(agency=agency) + +#test_small_pdfs(parser) + +process_journal_pdfs(parser, "http://www.loppa.kommune.no/postjournal.113285.no.html", errors) +process_page_queue(parser, errors) +report_errors(errors) + diff --git a/scrapersources/postliste-met b/scrapersources/postliste-met new file mode 100644 index 0000000..02c53ca --- /dev/null +++ b/scrapersources/postliste-met @@ -0,0 +1,91 @@ +# -*- coding: UTF-8 -*- + +import scraperwiki +import json +from BeautifulSoup import BeautifulSoup +import datetime +import dateutil.parser +import lxml.html +import urllib2 +import urlparse +import re + +# Make sure Scraperwiki believe this is the source from this database +scraperwiki.scrape("http://met.no/Om_oss/Offentlig_journal/") + +lazycache=scraperwiki.swimport('lazycache') +postlistelib=scraperwiki.swimport('postliste-python-lib') + +agency = 'Meteorologisk institutt' + +def report_errors(errors): + if 0 < len(errors): + print "Errors:" + for e in errors: + print e + raise ValueError("Something went wrong") + +def out_of_cpu(arg, spent, hard, soft): + report_errors(arg) + +def process_pdf(parser, pdfurl, errors): + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + try: + pdfcontent = scraperwiki.scrape(pdfurl) + parser.preprocess(pdfurl, pdfcontent) + pdfcontent = None + except ValueError, e: + errors.append(e) + except IndexError, e: + errors.append(e) + except urllib2.HTTPError, e: + errors.append(str(e) + " " + pdfurl) + +def process_page_queue(parser, errors): + try: + parser.process_pages() + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + except scraperwiki.CPUTimeExceededError, e: + errors.append("Processing pages interrupted") + +def process_journal_pdfs(parser, listurl, errors): +# print "Finding PDFs on " + listurl +# u = urllib.parse.urlparse(listurl) + html = scraperwiki.scrape(listurl) + root = lxml.html.fromstring(html) + html = None + for ahref in root.cssselect("div.article-content a"): + href = ahref.attrib['href'] + url = urlparse.urljoin(listurl, href) + if -1 != href.find("file://") or -1 == url.find("=File.getFile;"): +# print "Skipping non-http URL " + url + continue + if parser.is_already_scraped(url): + True +# print "Skipping already scraped " + url + else: +# print "Will process " + url + process_pdf(parser, url, errors) + +def test_small_pdfs(parser): + # Test with some smaller PDFs + errors = [] + process_pdf(parser, "http://met.no/Om_oss/Offentlig_journal/2012/?module=Files;action=File.getFile;ID=4570", errors) + process_page_queue(parser, errors) + report_errors(errors) + exit(0) + +errors = [] +parser = postlistelib.PDFJournalParser(agency=agency) + +#test_small_pdfs(parser) + +process_page_queue(parser, errors) +process_journal_pdfs(parser, "http://met.no/Om_oss/Offentlig_journal/2012/", errors) +process_journal_pdfs(parser, "http://met.no/Om_oss/Offentlig_journal/2011/", errors) +process_journal_pdfs(parser, 
"http://met.no/Om_oss/Offentlig_journal/2010/", errors) +process_journal_pdfs(parser, "http://met.no/Om_oss/Offentlig_journal/2009/", errors) +process_journal_pdfs(parser, "http://met.no/Om_oss/Offentlig_journal/2008/", errors) +process_page_queue(parser, errors) +report_errors(errors) + diff --git a/scrapersources/postliste-mrfylke b/scrapersources/postliste-mrfylke new file mode 100644 index 0000000..5c26ba3 --- /dev/null +++ b/scrapersources/postliste-mrfylke @@ -0,0 +1,82 @@ +# -*- coding: UTF-8 -*- + +import scraperwiki +import json +from BeautifulSoup import BeautifulSoup +import datetime +import dateutil.parser +import lxml.html +import resource +import sys +import urlparse +import re +lazycache=scraperwiki.swimport('lazycache') +postlistelib=scraperwiki.swimport('postliste-python-lib') + +agency = 'Møre og Romsdal fylkeskommune' + +def report_errors(errors): + if 0 < len(errors): + print "Errors:" + for e in errors: + print e + exit(1) +def out_of_cpu(arg, spent, hard, soft): + report_errors(arg) + +def process_pdf(parser, pdfurl, errors): + errors = [] + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + try: + pdfcontent = scraperwiki.scrape(pdfurl) + parser.preprocess(pdfurl, pdfcontent) + pdfcontent = None +# except ValueError, e: +# errors.append(e) + except IndexError, e: + errors.append(e) + +def process_page_queue(parser, errors): + try: + parser.process_pages() + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + except scraperwiki.CPUTimeExceededError, e: + errors.append("Processing pages interrupted") + +def process_journal_pdfs(parser, listurl, errors): +# print "Finding PDFs on " + listurl +# u = urllib.parse.urlparse(listurl) + html = scraperwiki.scrape(listurl) + root = lxml.html.fromstring(html) + html = None + for ahref in root.cssselect("div.attribute-long a"): + href = ahref.attrib['href'] + url = urlparse.urljoin(listurl, href) + if -1 != href.find("file://") or -1 != href.find("mailto:"): +# print "Skipping non-http URL " + url + continue + if parser.is_already_scraped(url): + True +# print "Skipping already scraped " + url + else: +# print "Will process " + url + process_pdf(parser, url, errors) + +def test_small_pdfs(parser): + # Test with some smaller PDFs + errors = [] + process_pdf(parser, "https://mrfylke.no/Media/Files/Filer-administrasjonsavdelinga/Dokumentsenteret/Oktober-2011/Offentleg-journal-03.10.11", errors) + process_page_queue(parser, errors) + report_errors(errors) + exit(0) + +errors = [] +parser = postlistelib.PDFJournalParser(agency=agency) + +#test_small_pdfs(parser) + +process_page_queue(parser, errors) +process_journal_pdfs(parser, "https://mrfylke.no/Organisasjon/Organisasjon/Administrasjonsavdelinga/Dokumentsenter/Offentleg-journal", errors) +process_page_queue(parser, errors) +report_errors(errors) + diff --git a/scrapersources/postliste-nrk b/scrapersources/postliste-nrk index 5c7929d..3379f31 100644 --- a/scrapersources/postliste-nrk +++ b/scrapersources/postliste-nrk @@ -1,7 +1,4 @@ # -*- coding: UTF-8 -*- -# Based on the scraper advanced-scraping-pdf -# See also -# https://views.scraperwiki.com/run/pdf-to-html-preview-1/?url=http%3A%2F%2Fwww.stortinget.no%2FGlobal%2Fpdf%2Fpostjournal%2Fpj-2012-05-09.pdf import scraperwiki import json @@ -9,39 +6,39 @@ from BeautifulSoup import BeautifulSoup import datetime import dateutil.parser import lxml.html -import resource -import sys import urlparse -import gc import re -frontpage = "http://www.nrk.no/contentfile/transformer/1.8052258" -scraperwiki.scrape(frontpage) 
+scraperwiki.scrape("http://www.nrk.no/innsyn/") lazycache=scraperwiki.swimport('lazycache') postlistelib=scraperwiki.swimport('postliste-python-lib') -agency = 'Universitetet i Oslo' +agency = 'Norsk Rikskringkasting AS' def report_errors(errors): if 0 < len(errors): print "Errors:" for e in errors: print e - exit(1) + raise ValueError(str(len(errors)) + " errors detected") + def out_of_cpu(arg, spent, hard, soft): report_errors(arg) def process_pdf(parser, pdfurl, errors): - errors = [] + if parser.is_already_scraped(pdfurl): + return postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) try: pdfcontent = scraperwiki.scrape(pdfurl) parser.preprocess(pdfurl, pdfcontent) pdfcontent = None -# except ValueError, e: -# errors.append(e) + except ValueError, e: + print e + errors.append(e) except IndexError, e: + print e errors.append(e) def process_page_queue(parser, errors): @@ -52,17 +49,14 @@ def process_page_queue(parser, errors): errors.append("Processing pages interrupted") def process_journal_pdfs(parser, listurl, errors): -# print "Finding PDFs on " + listurl + print "Finding PDFs on " + listurl # u = urllib.parse.urlparse(listurl) - html = scraperwiki.scrape(listurl) - root = lxml.html.fromstring(html) - html = None - for ahref in root.cssselect("table a"): - href = ahref.attrib['href'] - url = urlparse.urljoin(listurl, href) - if -1 != href.find("file://"): -# print "Skipping non-http URL " + url - continue + xml = scraperwiki.scrape(listurl) + root = lxml.html.fromstring(xml) + xml = None + for link in root.cssselect("hendelse link"): + url = lxml.html.tostring(link).replace("<link>", "").strip() + #print url if parser.is_already_scraped(url): True # print "Skipping already scraped " + url @@ -72,10 +66,50 @@ def process_journal_pdfs(parser, listurl, errors): def test_small_pdfs(parser): - parser.debug = True + #parser.debug = True errors = [] - process_pdf(parser, "http://nrk.no/contentfile/file/1.8116520!offentligjournal02052012.pdf", errors) # text + + # 2011: + if True: + process_pdf(parser, "http://home.nuug.no/~pere/nrk-postjournal/Offentlig%20journal%20NRK%200101_15012011.pdf", errors) + process_pdf(parser, "http://home.nuug.no/~pere/nrk-postjournal/Offentlig%20journal%20NRK%200102_10022011.pdf", errors) + process_pdf(parser, "http://home.nuug.no/~pere/nrk-postjournal/Offentlig%20journal%20NRK%200103_10032011.pdf", errors) + process_pdf(parser, "http://home.nuug.no/~pere/nrk-postjournal/Offentlig%20journal%20NRK%200104_07042011.pdf", errors) + process_pdf(parser, "http://home.nuug.no/~pere/nrk-postjournal/Offentlig%20journal%20NRK%200105_05052011.pdf", errors) + process_pdf(parser, "http://home.nuug.no/~pere/nrk-postjournal/Offentlig%20journal%20NRK%200106_18062011.pdf", errors) + process_pdf(parser, "http://home.nuug.no/~pere/nrk-postjournal/Offentlig%20journal%20NRK%200107_15072011.pdf", errors) + process_pdf(parser, "http://home.nuug.no/~pere/nrk-postjournal/Offentlig%20journal%20NRK%200108_15082011.pdf", errors) + process_pdf(parser, "http://home.nuug.no/~pere/nrk-postjournal/Offentlig%20journal%20NRK%200109_12092011.pdf", errors) + process_pdf(parser, "http://home.nuug.no/~pere/nrk-postjournal/Offentlig%20journal%20NRK%200110_12102011.pdf", errors) + process_pdf(parser, "http://home.nuug.no/~pere/nrk-postjournal/Offentlig%20journal%20NRK%200111_10112011.pdf", errors) + process_pdf(parser, "http://home.nuug.no/~pere/nrk-postjournal/Offentlig%20journal%20NRK%200112_10122011.pdf", errors) + process_pdf(parser, 
"http://home.nuug.no/~pere/nrk-postjournal/Offentlig%20journal%20NRK%200605_10052011.pdf", errors) + process_pdf(parser, "http://home.nuug.no/~pere/nrk-postjournal/Offentlig%20journal%20NRK%200804_15042011.pdf", errors) + process_pdf(parser, "http://home.nuug.no/~pere/nrk-postjournal/Offentlig%20journal%20NRK%201102_19022011.pdf", errors) + process_pdf(parser, "http://home.nuug.no/~pere/nrk-postjournal/Offentlig%20journal%20NRK%201103_17032011.pdf", errors) + process_pdf(parser, "http://home.nuug.no/~pere/nrk-postjournal/Offentlig%20journal%20NRK%201105_20052011.pdf", errors) + process_pdf(parser, "http://home.nuug.no/~pere/nrk-postjournal/Offentlig%20journal%20NRK%201111_20112011.pdf", errors) + process_pdf(parser, "http://home.nuug.no/~pere/nrk-postjournal/Offentlig%20journal%20NRK%201112_20122011.pdf", errors) + process_pdf(parser, "http://home.nuug.no/~pere/nrk-postjournal/Offentlig%20journal%20NRK%201309_25092011.pdf", errors) + process_pdf(parser, "http://home.nuug.no/~pere/nrk-postjournal/Offentlig%20journal%20NRK%201310_20102011.pdf", errors) + process_pdf(parser, "http://home.nuug.no/~pere/nrk-postjournal/Offentlig%20journal%20NRK%201601_31012011.pdf", errors) + process_pdf(parser, "http://home.nuug.no/~pere/nrk-postjournal/Offentlig%20journal%20NRK%201604_30042011.pdf", errors) + process_pdf(parser, "http://home.nuug.no/~pere/nrk-postjournal/Offentlig%20journal%20NRK%201607_31072011.pdf", errors) + process_pdf(parser, "http://home.nuug.no/~pere/nrk-postjournal/Offentlig%20journal%20NRK%201608_25082011.pdf", errors) + process_pdf(parser, "http://home.nuug.no/~pere/nrk-postjournal/Offentlig%20journal%20NRK%201803_26032011.pdf", errors) + process_pdf(parser, "http://home.nuug.no/~pere/nrk-postjournal/Offentlig%20journal%20NRK%201906_26062011.pdf", errors) + process_pdf(parser, "http://home.nuug.no/~pere/nrk-postjournal/Offentlig%20journal%20NRK%202105_31052011.pdf", errors) + process_pdf(parser, "http://home.nuug.no/~pere/nrk-postjournal/Offentlig%20journal%20NRK%202110_31102011.pdf", errors) + process_pdf(parser, "http://home.nuug.no/~pere/nrk-postjournal/Offentlig%20journal%20NRK%202111_30112011.pdf", errors) + process_pdf(parser, "http://home.nuug.no/~pere/nrk-postjournal/Offentlig%20journal%20NRK%202112_31122011.pdf", errors) + process_pdf(parser, "http://home.nuug.no/~pere/nrk-postjournal/Offentlig%20journal%20NRK%202502_28022011.pdf", errors) + process_pdf(parser, "http://home.nuug.no/~pere/nrk-postjournal/Offentlig%20journal%20NRK%202608_31082011.pdf", errors) + process_pdf(parser, "http://home.nuug.no/~pere/nrk-postjournal/Offentlig%20journal%20NRK%202609_30092011.pdf", errors) + process_pdf(parser, "http://home.nuug.no/~pere/nrk-postjournal/Offentlig%20journal%20NRK%202703_31032011.pdf", errors) + process_pdf(parser, "http://home.nuug.no/~pere/nrk-postjournal/Offentlig%20journal%20NRK%202706_30062011.pdf", errors) + + #process_pdf(parser, "http://nrk.no/contentfile/file/1.8116520!offentligjournal02052012.pdf", errors) # text #process_pdf(parser, "http://nrk.no/contentfile/file/1.8061384!offentlig%2002042012.pdf", errors) # Image #process_pdf(parser, "http://nrk.no/contentfile/file/1.8130287!offentligjournal09052012.pdf", errors) # Image process_page_queue(parser, errors) @@ -85,10 +119,9 @@ def test_small_pdfs(parser): errors = [] parser = postlistelib.PDFJournalParser(agency=agency, hiddentext=True) -test_small_pdfs(parser) +#test_small_pdfs(parser) # Based on http://www.nrk.no/innsyn/ -process_journal_pdfs(parser, frontpage, errors) +process_journal_pdfs(parser, 
"http://www.nrk.no/contentfile/transformer/1.8052258", errors) process_page_queue(parser, errors) -report_errors(errors) - +report_errors(errors)
\ No newline at end of file diff --git a/scrapersources/postliste-ntnu b/scrapersources/postliste-ntnu index 1a885c4..d6c6695 100644 --- a/scrapersources/postliste-ntnu +++ b/scrapersources/postliste-ntnu @@ -22,7 +22,7 @@ def report_errors(errors): print "Errors:" for e in errors: print e - raise ValueError("Something went wrong") + raise ValueError(str(len(errors)) + "errors detected") def out_of_cpu(arg, spent, hard, soft): report_errors(arg) @@ -39,7 +39,7 @@ def process_pdf(parser, pdfurl, errors): except IndexError, e: errors.append(e) except urllib2.HTTPError, e: - errors.append(e) + errors.append(str(e) + " " + pdfurl) def process_page_queue(parser, errors): try: diff --git a/scrapersources/postliste-oep b/scrapersources/postliste-oep index c7fdc82..735d0a7 100644 --- a/scrapersources/postliste-oep +++ b/scrapersources/postliste-oep @@ -9,7 +9,7 @@ import httplib import urllib2 # Try several times as the database get bigger -writetries = 5 +writetries = 6 # http://www.oep.no/search/resultSingle.html?journalPostId=1000000 # http://www.oep.no/search/resultSingle.html?journalPostId=3889259 @@ -102,7 +102,7 @@ def url_from_id(id): return "http://www.oep.no/search/resultSingle.html?journalPostId=" + str(id) def save(data): - for run in range(1,writetries): + for run in range(0,writetries): try: scraperwiki.sqlite.save(unique_keys=['journalPostId'], data=data) return @@ -112,7 +112,7 @@ def save(data): raise scraperwiki.sqlite.SqliteError("Unable to write to database, tried " + str(writetries) + " times") def save_var(var, data): - for run in range(1,writetries): + for run in range(0,writetries): try: scraperwiki.sqlite.save_var(var, data) return @@ -184,6 +184,8 @@ def fetch_range(first, last, step): fetched = 0 min_id = first for id in range(first, last, step): + if id < 0: + break try: tries = 3 while 0 < tries: @@ -309,6 +311,7 @@ def remove_original(): #update_doctypes() print "Starting to fetch journal entries " + str(datetime.datetime.now()) +scraperwiki.scrape("http://www.oep.no/") count = 10000 skiplimit = 500 # Random value fairly close to the most recent ID when this project started 2012-05-03 diff --git a/scrapersources/postliste-oslo-bydel-ullern b/scrapersources/postliste-oslo-bydel-ullern index 54a5031..614b12f 100644 --- a/scrapersources/postliste-oslo-bydel-ullern +++ b/scrapersources/postliste-oslo-bydel-ullern @@ -11,6 +11,9 @@ import dateutil.parser import lxml.html import urlparse import re + +scraperwiki.scrape("http://www.bydel-ullern.oslo.kommune.no/postjournal/") + #lazycache=scraperwiki.swimport('lazycache') postlistelib=scraperwiki.swimport('postliste-python-lib') @@ -21,19 +24,19 @@ def report_errors(errors): print "Errors:" for e in errors: print e - exit(1) + raise ValueError("Something went wrong") + def out_of_cpu(arg, spent, hard, soft): report_errors(arg) def process_pdf(parser, pdfurl, errors): - errors = [] postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) try: pdfcontent = scraperwiki.scrape(pdfurl) parser.preprocess(pdfurl, pdfcontent) pdfcontent = None -# except ValueError, e: -# errors.append(e) + except ValueError, e: + errors.append(e) except IndexError, e: errors.append(e) diff --git a/scrapersources/postliste-oslo-havn b/scrapersources/postliste-oslo-havn index d453ef7..1139b81 100644 --- a/scrapersources/postliste-oslo-havn +++ b/scrapersources/postliste-oslo-havn @@ -12,6 +12,9 @@ import lxml.html import sys import urlparse import re + +scraperwiki.scrape("http://www.havn.oslo.kommune.no/postjournal/") + 
lazycache=scraperwiki.swimport('lazycache') postlistelib=scraperwiki.swimport('postliste-python-lib') @@ -22,17 +25,17 @@ def report_errors(errors): print "Errors:" for e in errors: print e - exit(1) + raise ValueError(str(len(errors)) + " errors detected") + def out_of_cpu(arg, spent, hard, soft): report_errors(arg) def process_pdf(parser, pdfurl, errors): - errors = [] postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) try: parser.fetch_and_preprocess(pdfurl) -# except ValueError, e: -# errors.append(e) + except ValueError, e: + errors.append(e) except IndexError, e: errors.append(e) diff --git a/scrapersources/postliste-python-lib b/scrapersources/postliste-python-lib index 042d1fd..7176ae9 100644 --- a/scrapersources/postliste-python-lib +++ b/scrapersources/postliste-python-lib @@ -1,4 +1,4 @@ -# -*- coding: utf-8 -*- +# -*- coding: UTF-8 -*- # # Python library for parsing public post journals (postlister) in Norway. # @@ -100,6 +100,11 @@ class JournalParser: if -1 != entry['caseid'].find('-'): raise ValueError("Field caseid should not include dash: " + entry['caseid']) + + # Seen in http://home.nuug.no/~pere/nrk-postjournal/Offentlig%20journal%20NRK%200101_15012011.pdf + if 'sender' in entry and -1 != entry['sender'].find("Side: "): + raise ValueError("Field sender got page number, not real content") + # # Parser of PDFs looking like # http://www.storfjord.kommune.no/postliste-18-mai-2012.5056067-105358.html (type 1) @@ -122,6 +127,8 @@ class PDFJournalParser(JournalParser): # FIXME Figure out why this do not work #" and not (sender = 'parse error' or recipient != 'parse error') " + "limit 1", + + "scrapedurl from " + self.brokenpagetable + " where scrapedurl = '" + url + "' limit 1", "scrapedurl from " + self.pagetable + " where scrapedurl = '" + url + "' limit 1"]: try: result = scraperwiki.sqlite.select(sql) @@ -131,7 +138,8 @@ class PDFJournalParser(JournalParser): except Exception as e: #if ('no such table: %s' % self.pagetable) not in str(e) and 'no such table: swdata' not in str(e): # raise - print "Exception: %s" % e + #print "Ignoring exception: %s" % e + True return False # Check if we recognize the page content, and throw if not @@ -139,10 +147,7 @@ class PDFJournalParser(JournalParser): s = BeautifulSoup(pagecontent) for t in s.findAll('text'): if t.text != " ": - if 'Innhold:' == t.text: # type 1 or 2 (ePhorge) - s = None - return True - if 'Arkivdel:' == t.text]: # type 3 (doculive) + if 'Innhold:' == t.text: s = None return True s = None @@ -195,7 +200,6 @@ class PDFJournalParser(JournalParser): for i in range(0, len(entrytext)): print str(i) + ": '" + entrytext[i] + "'" - # ePhorte PDF def parse_entry_type1(self, entrytext, pdfurl): scrapestamputc = datetime.datetime.now() entry = { @@ -349,7 +353,6 @@ class PDFJournalParser(JournalParser): self.parse_case_journal_ref(entry, [u'2010/2593-2', u'103004/201', u'1'], "") self.parse_case_journal_ref(entry, [u'2011/13415-', u'22', u'100077/201', u'1'], "") - # ePhorte PDF def parse_entry_type2(self, entrytext, pdfurl): scrapestamputc = datetime.datetime.now() entry = { @@ -455,8 +458,7 @@ class PDFJournalParser(JournalParser): entrycount = 0 i = 0 while i < len(text): - if 'Innhold:' == text[i] \ # Type 1 and 2 (ePhorge) - or 'Arkivdel:' == text[i]: # type 3 (doculive) + if 'Innhold:' == text[i]: entrycount = entrycount + 1 i = i + 1 @@ -483,10 +485,12 @@ class PDFJournalParser(JournalParser): if self.debug: print "Entry " + str(entrycount) + " from " + str(i) + " to " + str(endi) + " ie " + str(endi - i) + " lines" 
try: + if pdfparser is None: + raise ValueError("Unrecognized page format in " + pdfurl) entry = pdfparser(text[i:endi], pdfurl) if 'caseid' not in entry or entry['caseid'] is None or \ not self.is_valid_doctype(entry['doctype']): - raise ValueError("Unable to parse " + pdfurl + " as format " + format + "[" + str(entry) + "]") + raise ValueError("Unable to parse " + pdfurl + " as format " + format + " [" + str(entry) + "]") # print entry datastore.append(entry) i = endi - 2 @@ -507,6 +511,7 @@ class PDFJournalParser(JournalParser): text = None def process_pages(self): + brokenpages = 0 try: sqlselect = "* from " + self.pagetable + " limit 1" pageref = scraperwiki.sqlite.select(sqlselect) @@ -525,15 +530,61 @@ class PDFJournalParser(JournalParser): 'scrapedurl' : scrapedurl, 'pagenum' : pagenum, 'pagecontent' : pagecontent, + 'failstamp' : datetime.datetime.now(), } - print "Broken page %d from %s" % (pagenum, scrapedurl) + print "Unsupported page %d from %s" % (pagenum, scrapedurl) + brokenpages = brokenpages + 1 scraperwiki.sqlite.save(unique_keys=['scrapedurl', 'pagenum'], data=brokenpage, table_name=self.brokenpagetable) scraperwiki.sqlite.execute(sqldelete) scraperwiki.sqlite.commit() pageref = scraperwiki.sqlite.select(sqlselect) + + # Last, try some of the broken pages again, in case we got support for handling them in the mean time + try: + # First, check if the table exist + scraperwiki.sqlite.execute("select * from " + self.brokenpagetable) + + newtrystamp = datetime.datetime.now() + sqlselect = "* from " + self.brokenpagetable + " where failstamp is NULL or failstamp < '" + str(newtrystamp) + "'" + " limit 1" + try: + pageref = scraperwiki.sqlite.select(sqlselect) + except scraperwiki.sqlite.SqliteError, e: + scraperwiki.sqlite.execute("ALTER TABLE " + self.brokenpagetable + " ADD COLUMN failstamp") + scraperwiki.sqlite.commit() + pageref = scraperwiki.sqlite.select(sqlselect) + + pagelimit = 10 + while pageref and 0 < pagelimit: + pagelimit = pagelimit - 1 + scrapedurl = pageref[0]['scrapedurl'] + pagenum = pageref[0]['pagenum'] + pagecontent = pageref[0]['pagecontent'] +# print "Found " + scrapedurl + " page " + str(pagenum) + " length " + str(len(pagecontent)) + try: + sqldelete = "delete from " + self.brokenpagetable + " where scrapedurl = '" + scrapedurl + "' and pagenum = " + str(pagenum) + self.parse_page(scrapedurl, pagenum, pagecontent) +# print "Trying to: " + sqldelete + scraperwiki.sqlite.execute(sqldelete) + except ValueError, e: + brokenpage = { + 'scrapedurl' : scrapedurl, + 'pagenum' : pagenum, + 'pagecontent' : pagecontent, + 'failstamp' : newtrystamp, + } + + print "Still unsupported page %d from %s" % (pagenum, scrapedurl) + brokenpages = brokenpages + 1 + scraperwiki.sqlite.save(unique_keys=['scrapedurl', 'pagenum'], data=brokenpage, table_name=self.brokenpagetable) + scraperwiki.sqlite.commit() + pageref = scraperwiki.sqlite.select(sqlselect) + except: + True # Ignore missing brokenpages table except scraperwiki.sqlite.SqliteError, e: print str(e) raise + if 0 < brokenpages: + raise ValueError("Found %d pages with unsupported format" % brokenpages) def fieldlist(): import urllib2 diff --git a/scrapersources/postliste-python-lib-doculive b/scrapersources/postliste-python-lib-doculive new file mode 100644 index 0000000..520c915 --- /dev/null +++ b/scrapersources/postliste-python-lib-doculive @@ -0,0 +1,649 @@ +# -*- coding: UTF-8 -*- +# +# Python library for parsing public post journals (postlister) in Norway. 
+# + +# Based on the scraper advanced-scraping-pdf +# +# See also +# https://views.scraperwiki.com/run/pdf-to-html-preview-1/ + +# Possible sources using format 1 pdf: +# www.bydel-ullern.oslo.kommune.no +# www.gravferdsetaten.oslo.kommune.no +# www.halden.kommune.no (done) +# www.havn.oslo.kommune.no (done) +# www.hvaler.kommune.no (done) +# www.kafjord.kommune.no +# www.lier.kommune.no +# www.lindesnes.kommune.no +# www.naroy.kommune.no +# www.saltdal.kommune.no +# www.sogne.kommune.no +# www.vikna.kommune.no +# +# Google search to find more: "Offentlig journal" Seleksjon Sakstittel Dokumenttype Status filetype:pdf + + +import scraperwiki +import string +import re +from BeautifulSoup import BeautifulSoup +import datetime +import dateutil.parser + +def cpu_spent(): + import resource + usage = resource.getrusage(resource.RUSAGE_SELF) + return getattr(usage, 'ru_utime') + getattr(usage, 'ru_stime') + +def exit_if_no_cpu_left(retval, callback=None, arg = None): + import resource + soft, hard = resource.getrlimit(resource.RLIMIT_CPU) + spent = cpu_spent() + if soft < spent: + if callback is not None: + callback(arg, spent, hard, soft) + print "Running out of CPU, exiting." + exit(retval) + +def fetch_url_harder(url, scraper = None): + import urllib2 + html = None + for n in [1, 2, 3]: + try: + if None == scraper: + scraper = scraperwiki.scrape + html = scraper(url) + break + except urllib2.URLError, e: + print "URLError fetching " + url + ", trying again" + return html + +class JournalParser: + agency = None + debug = False + + validdoctypes = ['I', 'U', 'X', 'N'] + senderdoctypes = ['I', 'X', 'N'] + recipientdoctypes = ['U'] + mustfields = { + 'agency' : 1, + 'docdesc' : 1, + 'doctype' : 1, + 'caseyear' : 1, + 'caseseqnr' : 1, + 'casedocseq' : 1, + } + + def __init__(self, agency): + self.agency = agency + + def is_valid_doctype(self, doctype): + return doctype in self.validdoctypes + + def is_sender_doctype(self, doctype): + return doctype in self.senderdoctypes + + def is_recipient_doctype(self, doctype): + return doctype in self.recipientdoctypes + + def verify_entry(self, entry): + + for field in self.mustfields: + if not field in entry: + raise ValueError("Missing required field " + field) + + if not self.is_valid_doctype(entry['doctype']): + raise ValueError("Invalid doctype " + doctype) + + if -1 != entry['caseid'].find('-'): + raise ValueError("Field caseid should not include dash: " + entry['caseid']) + +# +# Parser of PDFs looking like +# http://www.storfjord.kommune.no/postliste-18-mai-2012.5056067-105358.html (type 1) +# http://www.hadsel.kommune.no/component/docman/doc_download/946-offentlig-postjournal-28032012 (type 2) +# http://www.stortinget.no/Global/pdf/postjournal/pj-2011-06-23.pdf (type 2 variant) +# Note sender/receiver is not yet parsed for type 2 PDFs +class PDFJournalParser(JournalParser): + pagetable = "unparsedpages" + brokenpagetable = "brokenpages" + hiddentext = False + breakonfailure = True + + def __init__(self, agency, hiddentext=False): + self.hiddentext = hiddentext + JournalParser.__init__(self, agency=agency) + + def is_already_scraped(self, url): + # Ignore entries were sender and recipient is the result of a broken parser (before 2012-05-25) + for sql in ["scrapedurl, sender, recipient from swdata where scrapedurl = '" + url + "' " + + # FIXME Figure out why this do not work + #" and not (sender = 'parse error' or recipient != 'parse error') " + + "limit 1", + "scrapedurl from " + self.pagetable + " where scrapedurl = '" + url + "' limit 1"]: + try: + 
result = scraperwiki.sqlite.select(sql) + #int sql, " : ", result + if 0 < len(result) and u'scrapedurl' in result[0]: + return True + except Exception as e: + #if ('no such table: %s' % self.pagetable) not in str(e) and 'no such table: swdata' not in str(e): + # raise + print "Exception: %s" % e + return False + + # Check if we recognize the page content, and throw if not + def is_valid_page(self, pdfurl, pagenum, pagecontent): + s = BeautifulSoup(pagecontent) + for t in s.findAll('text'): + if t.text != " ": + if self.debug: + print t.text + if 'Innhold:' == t.text: # type 1 or 2 (ePhorge) + s = None + return True + if 'Arkivdel:' == t.text or 'Notater (X):' == t.text: # type 3 (doculive) + s = None + return True + s = None + if self.debug: + print "Unrecognized page format for " + pdfurl + raise ValueError("Unrecognized page format for " + pdfurl) + + # + # Split PDF content into pages and store in SQL table for later processing. + # The process is split in two to better handle parge PDFs (like 600 pages), + # without running out of CPU time without loosing track of what is left to + # parse. + def preprocess(self, pdfurl, pdfcontent): + print "Preprocessing PDF " + pdfurl + if not pdfcontent: + raise ValueError("No pdf content passed for " + pdfurl) + if self.hiddentext: + options = '-hidden' + else: + options = '' + xml=scraperwiki.pdftoxml(pdfcontent, options) + if self.debug: + print xml + pages=re.findall('(<page .+?</page>)',xml,flags=re.DOTALL) + xml=None +# print pages[:1][:1000] + pagecount = 0 + datastore = [] + for page in pages: + pagecount = pagecount + 1 + self.is_valid_page(pdfurl, pagecount, page) + data = { + 'scrapedurl' : pdfurl, + 'pagenum' : pagecount, + 'pagecontent' : page, + } + datastore.append(data) + if 0 < len(datastore): + scraperwiki.sqlite.save(unique_keys=['scrapedurl', 'pagenum'], data=datastore, table_name=self.pagetable) + else: + raise ValueError("Unable to find any pages in " + pdfurl) + pages = None + + def fetch_and_preprocess(self, pdfurl): + pdfcontent = fetch_url_harder(pdfurl) + self.preprocess(pdfurl, pdfcontent) + pdfcontent = None + + def print_entry(self, entrytext): + for i in range(0, len(entrytext)): + print str(i) + ": '" + entrytext[i] + "'" + + # ePhorte PDF + def parse_entry_type1(self, entrytext, pdfurl): + scrapestamputc = datetime.datetime.now() + entry = { + 'agency' : self.agency, + 'scrapestamputc' : scrapestamputc, + 'scrapedurl' : pdfurl + } + i = 0 + while i < len(entrytext): + #print "T: '" + entrytext[i] + "'" + if 'Innhold:' == entrytext[i]: + tittel = "" + # handle multi-line titles + while 'Sakstittel:' != entrytext[i+1]: + tittel = tittel + " " + entrytext[i+1] + i = i + 1 + entry['docdesc'] = tittel + if 'Sakstittel:' == entrytext[i]: + sakstittel = "" + while 'DokType' != entrytext[i+1]: +# print "'" + entrytext[i+1] + "'" + sakstittel = sakstittel + " " + entrytext[i+1] + i = i + 1 + entry['casedesc'] = sakstittel + if 'DokType' == entrytext[i]: # Values I/U/N/X from NOARK 4 table 14.2.11 + entry['doctype'] = entrytext[i+1] + # As seen on http://www.saltdal.kommune.no/images/module.files/2007-05-16.pdf, page 1 + if entry['doctype'] == 'S': + entry['doctype'] = 'X' + i = i + 1 + if 'Sak/dok nr:' == entrytext[i]: + # FIXME Split and handle combined sak/løpenr + # Use find('penr.:') to avoid non-ascii search string 'Løpenr.:' + caseid = None + lnr = None + if -1 != entrytext[i+4].find('penr.:'): + caseid = entrytext[i+1] + entrytext[i+2] + lnr = entrytext[i+3] + i = i + 4 + elif -1 != entrytext[i+3].find('penr.:'): + 
caseid = entrytext[i+1] + lnr = entrytext[i+2] + i = i + 3 + elif -1 != entrytext[i+2].find('penr.:'): + caseid, lnr = entrytext[i+1].split(" ") + i = i + 2 + + caseyear, caseseqnr = caseid.split("/") + entry['caseyear'] = int(caseyear) + caseseqnr, casedocseq = caseseqnr.split("-") + entry['caseseqnr'] = int(caseseqnr) + entry['casedocseq'] = int(casedocseq) + entry['caseid'] = caseyear + "/" + caseseqnr + + journalseqnr, journalyear = lnr.split("/") + entry['journalid'] = journalyear + "/" + journalseqnr + entry['journalyear'] = int(journalyear) + entry['journalseqnr'] = int(journalseqnr) + +# if -1 != text[i].find('penr.:'): # Use find('penr.:') to avoid non-ascii search string 'Løpenr.:' +# str = text[i-1] +# print "S: '" + str + "'" +# data['journalid'] = str +# # FIXME handle combined sak/løpenr + if 'Journaldato:' == entrytext[i]: + entry['recorddate'] = dateutil.parser.parse(entrytext[i-1], dayfirst=True) + if 'Dok.dato:' == entrytext[i]: + entry['docdate'] = dateutil.parser.parse(entrytext[i-1], dayfirst=True) + if 'Tilg.kode Hjemmel:' == entrytext[i] and 'Avsender\mottaker:' != entrytext[i+1]: + entry['exemption'] = entrytext[i+1] + i = i + 1 + if 'Tilg.kode' == entrytext[i]: + entry['accesscode'] = entrytext[i+1] + i = i + 1 + if 'Hjemmel:' == entrytext[i]: + entry['exemption'] = entrytext[i+1] + i = i + 1 + if 'Avsender\mottaker:' == entrytext[i]: + if i+1 < len(entrytext): # Non-empty field + fratil = entrytext[i+1] + i = i + 1 + if self.is_sender_doctype(entry['doctype']): + entry['sender'] = fratil + elif self.is_recipient_doctype(entry['doctype']): + entry['recipient'] = fratil + else: + raise ValueError("Case " + entry['caseid'] + " Sender/Recipient with doctype " + entry['doctype'] + " != I/U/X/N in " + pdfurl) + if self.debug: + print entry + i = i + 1 + return entry + + def parse_case_journal_ref(self, entry, reftext, pdfurl): + try: + # FIXME Split and handle combined sak/loepenr + # Use find('penr.:') to avoid non-ascii search string 'Loepenr.:' + caseid = None + lnr = None + if 4 == len(reftext): +# print "4 " + str(reftext) + caseid = reftext[0] + reftext[1] + lnr = reftext[2] + reftext[3] +# print str(caseid) + " " + str(lnr) + elif 3 == len(reftext): + if -1 != reftext[0].find("/") and -1 != reftext[2].find("/"): +# print "31" + caseid = reftext[0] + reftext[1] + lnr = reftext[2] + elif -1 != reftext[2].find("/"): +# print "32" + caseid = reftext[0] + reftext[1] + lnr = reftext[2] + elif -1 == reftext[2].find("/"): +# print "33" + caseid = reftext[0] + lnr = reftext[1] + reftext[2] + elif 2 == len(reftext): + if -1 == reftext[1].find("/"): +# print "21" + s = reftext[0] + reftext[1] +# print "S: " + s + caseid, lnr = s.split(" ") + elif -1 != reftext[1].find("/"): +# print "22" + caseid = reftext[0] + lnr = reftext[1] + elif 1 == len(reftext): + caseid, lnr = reftext[0].split(" ") + else: + raise ValueError("Unable to parse entry " + str(reftext) + " in " + pdfurl) +# print "C: " + caseid + " L: " + lnr + + caseyear, caseseqnr = caseid.split("/") + entry['caseyear'] = int(caseyear) + caseseqnr, casedocseq = caseseqnr.split("-") + entry['caseseqnr'] = int(caseseqnr) + entry['casedocseq'] = int(casedocseq) + entry['caseid'] = caseyear + "/" + caseseqnr + + journalseqnr, journalyear = lnr.split("/") + entry['journalid'] = journalyear + "/" + journalseqnr + entry['journalyear'] = int(journalyear) + entry['journalseqnr'] = int(journalseqnr) + except: + print "Unable to parse " + str(reftext) + return entry + def test_parse_case_journal_ref(self): + entry = {} + 
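+        # Regression samples: fragment layouts seen in real journals, where the
+        # case/journal reference is split over two, three or four text fragments.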
self.parse_case_journal_ref(entry, [u'2008/16414-', u'23', u'15060/2012'], "") + self.parse_case_journal_ref(entry, [u'2011/15972-1 102773/201', u'1'], "") + self.parse_case_journal_ref(entry, [u'2010/2593-2', u'103004/201', u'1'], "") + self.parse_case_journal_ref(entry, [u'2011/13415-', u'22', u'100077/201', u'1'], "") + + # ePhorte PDF + def parse_entry_type2(self, entrytext, pdfurl): + scrapestamputc = datetime.datetime.now() + entry = { + 'agency' : self.agency, + 'scrapestamputc' : scrapestamputc, + 'scrapedurl' : pdfurl + } + i = 0 + avsender = [] + mottaker = [] + while i < len(entrytext): + if 'Innhold:' == entrytext[i]: + tittel = "" + # handle multi-line titles + while 'Sakstittel:' != entrytext[i+1]: + tittel = tittel + entrytext[i+1] + i = i + 1 + entry['docdesc'] = tittel + if 'Sakstittel:' == entrytext[i]: + sakstittel = "" + # Klassering er i en annen dokumenttype + while 'DokType' != entrytext[i+1] and 'Dok.Type:' != entrytext[i+1] and 'Klassering:' != entrytext[i+1]: + +# print "'" + entrytext[i+1] + "'" + sakstittel = sakstittel + entrytext[i+1] + i = i + 1 + entry['casedesc'] = sakstittel + i = i + 1 + if 'DokType' == entrytext[i] or 'Dok.Type:' == entrytext[i]: # Values I/U/N/X from NOARK 4 table 14.2.11 + entry['doctype'] = entrytext[i+1] + # As seen on http://www.uis.no/getfile.php/Journal%20200612.pdf + if entry['doctype'] == 'S': + entry['doctype'] = 'X' + i = i + 1 + if 'Sak/dok nr:' == entrytext[i] or 'Sak/dok.nr:' == entrytext[i]: + endi = i + while endi < len(entrytext): + if -1 != entrytext[endi].find('penr.:') or -1 != entrytext[endi].find('penr:'): + break + endi = endi + 1 + entry = self.parse_case_journal_ref(entry, entrytext[i+1:endi], pdfurl) + i = endi + 1 +# if -1 != text[i].find('penr.:'): # Use find('penr.:') to avoid non-ascii search string 'Løpenr.:' +# str = text[i-1] +# print "S: '" + str + "'" +# data['journalid'] = str +# # FIXME handle combined sak/løpenr + if 'Journaldato:' == entrytext[i]: + entry['recorddate'] = dateutil.parser.parse(entrytext[i-1], dayfirst=True) + if 'Dok.dato:' == entrytext[i]: + entry['docdate'] = dateutil.parser.parse(entrytext[i-1], dayfirst=True) + if 'Tilg.kode Hjemmel:' == entrytext[i] and '(enhet/initialer):' != entrytext[i+2]: + entry['exemption'] = entrytext[i+1] + i = i + 1 + if 'Tilg.kode' == entrytext[i]: + entry['accesscode'] = entrytext[i+1] + i = i + 1 + if 'Hjemmel:' == entrytext[i]: + entry['exemption'] = entrytext[i+1] + i = i + 1 +# if -1 != text[i].find('Avs./mottaker:'): +# FIXME Need to handle senders and receivers + if 'Mottaker' == entrytext[i]: + mottaker.append(entrytext[i-1]) + if 'Avsender' == entrytext[i]: + avsender.append(entrytext[i-1]) +# entry['sender'] = 'parse error' +# entry['recipient'] = 'parse error' + i = i + 1 + if 0 < len(mottaker): + entry['recipient'] = string.join(mottaker, ", ") + if 0 < len(avsender): + entry['sender'] = string.join(avsender, ", ") + return entry + + def parse_entry_type3(self, entrytext, pdfurl): + scrapestamputc = datetime.datetime.now() + entry = { + 'agency' : self.agency, + 'scrapestamputc' : scrapestamputc, + 'scrapedurl' : pdfurl + } + cur = 0 + while cur < len(lines): + line = lines[cur].text + #print line + if -1 != line.find('Dok.dato:'): + entry['docid'] = lines[cur-2].text + entry['doctype'] = lines[cur-1].text + entry['docdate'] = parse_date(line.replace("Dok.dato:", "")) + caseyear, caseseqnr, casedocseq = split_docid(entry['docid']) + entry['caseyear'] = caseyear + entry['caseseqnr'] = caseseqnr + entry['casedocseq'] = casedocseq + 
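+                # NB: 'lines', 'parse_date()' and 'split_docid()' are not defined
+                # in this scraper, so this Doculive (type 3) parser looks
+                # unfinished and will fail if parse_page() ever selects it.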
entry['caseid'] = str(caseyear) + '/' + str(caseseqnr)
+            if -1 != line.find('Jour.dato:'):
+                entry['recorddate'] = parse_date(lines[cur+1].text)
+                cur = cur + 1
+            if -1 != line.find('Arkivdel:'):
+                entry['arkivdel'] = lines[cur+1].text
+                cur = cur + 1
+            if -1 != line.find('Tilg. kode:'):
+                entry['tilgangskode'] = line.replace("Tilg. kode:", "")
+            if -1 != line.find('Sak:'):
+                entry['casedesc'] = lines[cur+1].text
+                cur = cur + 1
+            if -1 != line.find('Dok:'):
+                entry['docdesc'] = lines[cur+1].text
+                cur = cur + 1
+            if -1 != line.find('Par.:'):
+                entry['exemption'] = line.replace("Par.:", "")
+                cur = cur + 1
+            if -1 != line.find('Avsender:'):
+                entry['sender'] = lines[cur+1].text
+                cur = cur + 1
+            if -1 != line.find('Mottaker:'):
+                entry['recipient'] = lines[cur+1].text
+                cur = cur + 1
+            if -1 != line.find('Saksansv:'):
+                entry['saksansvarlig'] = line.replace("Saksansv:", "").strip()
+            if -1 != line.find('Saksbeh:'):
+                entry['saksbehandler'] = lines[cur+1].text
+                cur = cur + 1
+            cur = cur + 1
+        print entry
+        return entry
+
+    def parse_page(self, pdfurl, pagenum, pagecontent):
+        print "Scraping " + pdfurl + " page " + str(pagenum)
+        s = BeautifulSoup(pagecontent)
+        datastore = []
+        text = []
+        linecount = 0
+        if self.debug:
+            print s
+        for t in s.findAll('text'):
+            if t.text != " ":
+                text.append(t.text)
+                if self.debug:
+                    print str(linecount) + ": " + t.text
+#                FIXME Remove length limit when working
+#                if 100 <= linecount:
+#                    break
+                linecount = linecount + 1
+#            if -1 != t.text.find("Side:"):
+#                print t.text
+        s = None
+
+#        print "Found " + str(linecount) + " lines/text fragments in the PDF"
+        if len(text) < linecount:
+            raise ValueError("Text array too short!")
+
+        # First count how many entries to expect on this page, to be able to
+        # verify that all of them were found.
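+        # Every ePhorte entry starts with an 'Innhold:' fragment and every
+        # Doculive entry with 'Arkivdel:', so counting those markers gives the
+        # number of entries this page should produce.  If fewer are parsed, the
+        # ValueError raised below makes process_pages() move the page to the
+        # brokenpages table instead of silently dropping entries.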
+ entrycount = 0 + i = 0 + while i < len(text): + # Type 1 and 2 (ePhorge) + if 'Innhold:' == text[i] or \ + 'Arkivdel:' == text[i]: # type 3 (doculive) + entrycount = entrycount + 1 + i = i + 1 + + i = 0 + while i < len(text): + if self.debug: + print "T: '" + text[i] + "'" + if self.debug and -1 != text[i].find("Side:"): + print text[i] + if 'Innhold:' == text[i]: + endi = i + 1 + pdfparser = None + format = "unknown" + while endi < len(text): + if 'Klassering:' == text[endi]: + print "Found ePhorte PDF (type 1)" + pdfparser = self.parse_entry_type2 + format = "type2" + if 'Avsender\mottaker:' == text[endi]: + print "Found ePhorge PDF (type 2)" + pdfparser = self.parse_entry_type1 + format = "type1" + if 'Arkivdel:' == text[endi]: + print "Found Doculive PDF" + pdfparser = self.parse_entry_type3 + format = "type3" + if 'Innhold:' == text[endi]: + break + endi = endi + 1 + if self.debug: + print "Entry " + str(entrycount) + " from " + str(i) + " to " + str(endi) + " ie " + str(endi - i) + " lines" + try: + entry = pdfparser(text[i:endi], pdfurl) + if 'caseid' not in entry or entry['caseid'] is None or \ + not self.is_valid_doctype(entry['doctype']): + raise ValueError("Unable to parse " + pdfurl + " as format " + format + "[" + str(entry) + "]") +# print entry + datastore.append(entry) + i = endi - 2 + except: + self.print_entry(text[i:endi]) + raise + i = i + 1 +# print data +# print "Found " + str(len(datastore)) + " of " + str(entrycount) + " entries" + if entrycount != len(datastore): +# print text + raise ValueError("Unable to parse all entries in " + pdfurl) + if 0 == len(datastore): + print "Unable to find any entries in " + pdfurl + else: + scraperwiki.sqlite.save(unique_keys=['caseid', 'casedocseq'], data=datastore) + datastore = None + text = None + + def process_pages(self): + try: + sqlselect = "* from " + self.pagetable + " limit 1" + pageref = scraperwiki.sqlite.select(sqlselect) + while pageref: + scrapedurl = pageref[0]['scrapedurl'] + pagenum = pageref[0]['pagenum'] + pagecontent = pageref[0]['pagecontent'] +# print "Found " + scrapedurl + " page " + str(pagenum) + " length " + str(len(pagecontent)) + try: + sqldelete = "delete from " + self.pagetable + " where scrapedurl = '" + scrapedurl + "' and pagenum = " + str(pagenum) + self.parse_page(scrapedurl, pagenum, pagecontent) +# print "Trying to: " + sqldelete + scraperwiki.sqlite.execute(sqldelete) + except ValueError, e: + brokenpage = { + 'scrapedurl' : scrapedurl, + 'pagenum' : pagenum, + 'pagecontent' : pagecontent, + } + print "Broken page %d from %s" % (pagenum, scrapedurl) + scraperwiki.sqlite.save(unique_keys=['scrapedurl', 'pagenum'], data=brokenpage, table_name=self.brokenpagetable) + print e + scraperwiki.sqlite.execute(sqldelete) + scraperwiki.sqlite.commit() + pageref = scraperwiki.sqlite.select(sqlselect) + except scraperwiki.sqlite.SqliteError, e: + print str(e) + raise + +def fieldlist(): + import urllib2 + import json + + scrapers = [ + 'postliste-universitetet-i-oslo', + 'postliste-lindesnes', + 'postliste-kristiansund', + 'postliste-stortinget', + 'postliste-arendal', + 'postliste-oep', + 'postliste-ballangen', + 'postliste-hadsel', + 'postliste-storfjord', + 'postliste-oslo-havn', + ] + + keys = {} + + for scraper in scrapers: + url = 'https://api.scraperwiki.com/api/1.0/scraper/getinfo?format=jsondict&name=' + scraper + '&version=-1' + response = urllib2.urlopen(url) + html = response.read() + data = json.loads(html) + if 'swdata' in data[0]['datasummary']['tables']: + for key in 
data[0]['datasummary']['tables']['swdata']['keys']: + key = key.lower() + if key in keys: + keys[key].append(scraper) + else: + keys[key] = [scraper] + def lensort(a, b): + return cmp(len(keys[b]), len(keys[a])) + + for key in sorted(keys.keys(), lensort): + print len(keys[key]), key, str(keys[key]) + +def test_parser(): + parser = PDFJournalParser(agency="Dummy agency") + parser.debug = True + for url in [ #"http://www.stortinget.no/Global/pdf/postjournal/pj-2011-06-23.pdf", + "http://www.radhusets-forvaltningstjeneste.oslo.kommune.no/getfile.php/rådhusets%20forvaltningstjeneste%20(RFT)/Intranett%20(RFT)/Dokumenter/Postjournal/11%20November/29112011.pdf"]: + pdfcontent = scraperwiki.scrape(url) + parser.preprocess(url,pdfcontent) + parser.process_pages() + + +if __name__ == "scraper": + test_parser() +# fieldlist() diff --git a/scrapersources/postliste-stavanger-universitetssjukehus b/scrapersources/postliste-stavanger-universitetssjukehus new file mode 100644 index 0000000..5a9dc08 --- /dev/null +++ b/scrapersources/postliste-stavanger-universitetssjukehus @@ -0,0 +1,81 @@ +# -*- coding: UTF-8 -*- + +import scraperwiki +import json +from BeautifulSoup import BeautifulSoup +import datetime +import dateutil.parser +import lxml.html +import resource +import sys +import urlparse +import re +lazycache=scraperwiki.swimport('lazycache') +postlistelib=scraperwiki.swimport('postliste-python-lib') + +agency = 'Stavanger Universitetssjukehus – Helse Stavanger HF' + +def report_errors(errors): + if 0 < len(errors): + print "Errors:" + for e in errors: + print e + exit(1) +def out_of_cpu(arg, spent, hard, soft): + report_errors(arg) + +def process_pdf(parser, pdfurl, errors): + errors = [] + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + try: + pdfcontent = scraperwiki.scrape(pdfurl) + parser.preprocess(pdfurl, pdfcontent) + pdfcontent = None +# except ValueError, e: +# errors.append(e) + except IndexError, e: + errors.append(e) + +def process_page_queue(parser, errors): + try: + parser.process_pages() + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + except scraperwiki.CPUTimeExceededError, e: + errors.append("Processing pages interrupted") + +def process_journal_pdfs(parser, listurl, errors): +# print "Finding PDFs on " + listurl +# u = urllib.parse.urlparse(listurl) + html = scraperwiki.scrape(listurl) + root = lxml.html.fromstring(html) + html = None + for ahref in root.cssselect("table a"): + href = ahref.attrib['href'] + url = urlparse.urljoin(listurl, href) + if -1 != href.find("file://") or -1 == url.find(".pdf"): +# print "Skipping non-http URL " + url + continue + if parser.is_already_scraped(url): + True +# print "Skipping already scraped " + url + else: +# print "Will process " + url + process_pdf(parser, url, errors) + +def test_small_pdfs(parser): + # Test with some smaller PDFs + errors = [] + process_pdf(parser, "http://www.sus.no/aktuelt/postjournal/Documents/2012/2012-06-18.pdf", errors) + process_page_queue(parser, errors) + report_errors(errors) + exit(0) + +errors = [] +parser = postlistelib.PDFJournalParser(agency=agency) + +#test_small_pdfs(parser) + +process_page_queue(parser, errors) +process_journal_pdfs(parser, "http://www.sus.no/aktuelt/postjournal/Sider/side.aspx", errors) +process_page_queue(parser, errors) +report_errors(errors)
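Note that process_pdf() in this scraper starts by rebinding errors to a fresh list, so exceptions collected while fetching a PDF never reach the list passed in at the bottom of the file and report_errors() will not see them. A minimal sketch of the variant used by the sibling scrapers in this commit (Universitetet i Agder, UNN), assuming the failures are meant to propagate:

def process_pdf(parser, pdfurl, errors):
    # Append into the caller's list instead of rebinding it, so
    # report_errors() at the end of the run sees the failures.
    postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
    try:
        pdfcontent = scraperwiki.scrape(pdfurl)
        parser.preprocess(pdfurl, pdfcontent)
        pdfcontent = None
    except ValueError, e:
        errors.append(e)
    except IndexError, e:
        errors.append(e)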
\ No newline at end of file diff --git a/scrapersources/postliste-universitetet-i-agder b/scrapersources/postliste-universitetet-i-agder new file mode 100644 index 0000000..cfdfddc --- /dev/null +++ b/scrapersources/postliste-universitetet-i-agder @@ -0,0 +1,85 @@ +# -*- coding: UTF-8 -*- + +import scraperwiki +import json +from BeautifulSoup import BeautifulSoup +import datetime +import dateutil.parser +import lxml.html +import resource +import sys +import urlparse +import re + +# Make sure Scraperwiki believe this is the source from this database +scraperwiki.scrape("http://www.uia.no/no/portaler/om_universitetet/offentlig_journal") + +lazycache=scraperwiki.swimport('lazycache') +postlistelib=scraperwiki.swimport('postliste-python-lib') + +agency = 'Universitetet i Agder' + +def report_errors(errors): + if 0 < len(errors): + print "Errors:" + for e in errors: + print e + raise ValueError("Something went wrong") + +def out_of_cpu(arg, spent, hard, soft): + report_errors(arg) + +def process_pdf(parser, pdfurl, errors): + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + try: + pdfcontent = scraperwiki.scrape(pdfurl) + parser.preprocess(pdfurl, pdfcontent) + pdfcontent = None + except ValueError, e: + errors.append(e) + except IndexError, e: + errors.append(e) + +def process_page_queue(parser, errors): + try: + parser.process_pages() + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + except scraperwiki.CPUTimeExceededError, e: + errors.append("Processing pages interrupted") + +def process_journal_pdfs(parser, listurl, errors): +# print "Finding PDFs on " + listurl +# u = urllib.parse.urlparse(listurl) + html = scraperwiki.scrape(listurl) + root = lxml.html.fromstring(html) + html = None + for ahref in root.cssselect("table a"): + href = ahref.attrib['href'] + url = urlparse.urljoin(listurl, href).replace(" ", "%20") + if -1 != href.find("file://") or -1 == url.find(".pdf"): +# print "Skipping non-http URL " + url + continue + if parser.is_already_scraped(url): + True +# print "Skipping already scraped " + url + else: +# print "Will process " + url + process_pdf(parser, url, errors) + +def test_small_pdfs(parser): + # Test with some smaller PDFs + errors = [] + process_pdf(parser, "http://www.uia.no/no/content/download/297514/5641673/file/Uke%2018.pdf", errors) + process_page_queue(parser, errors) + report_errors(errors) + exit(0) + +errors = [] +parser = postlistelib.PDFJournalParser(agency=agency) + +#test_small_pdfs(parser) + +process_journal_pdfs(parser, "http://www.uia.no/no/portaler/om_universitetet/offentlig_journal", errors) +process_page_queue(parser, errors) +report_errors(errors) + diff --git a/scrapersources/postliste-universitetssykehuset-nord-norge b/scrapersources/postliste-universitetssykehuset-nord-norge new file mode 100644 index 0000000..1b06793 --- /dev/null +++ b/scrapersources/postliste-universitetssykehuset-nord-norge @@ -0,0 +1,96 @@ +# -*- coding: UTF-8 -*- + + +import scraperwiki +import json +from BeautifulSoup import BeautifulSoup +import datetime +import dateutil.parser +import lxml.html +import urlparse +import re + +# Make sure Scraperwiki believe this is the source from this database +scraperwiki.scrape("http://www.unn.no/offentlig-postjournal/category8944.html") + +lazycache=scraperwiki.swimport('lazycache') +postlistelib=scraperwiki.swimport('postliste-python-lib') + +agency = 'Universitetssykehuset Nord-Norge' + +def report_errors(errors): + if 0 < len(errors): + print "Errors:" + for e in errors: + print e + raise ValueError("Something 
went wrong") + +def out_of_cpu(arg, spent, hard, soft): + report_errors(arg) + +def process_pdf(parser, pdfurl, errors): + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + try: + pdfcontent = scraperwiki.scrape(pdfurl) + parser.preprocess(pdfurl, pdfcontent) + pdfcontent = None + except ValueError, e: + errors.append(e) + except IndexError, e: + errors.append(e) + +def process_page_queue(parser, errors): + try: + parser.process_pages() + postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors) + except scraperwiki.CPUTimeExceededError, e: + errors.append("Processing pages interrupted") + +def process_journal_pdfs(parser, listurl, errors): +# print "Finding PDFs on " + listurl +# u = urllib.parse.urlparse(listurl) + html = scraperwiki.scrape(listurl) + root = lxml.html.fromstring(html) + html = None + for ahref in root.cssselect("div.month-entry-title a"): + href = ahref.attrib['href'] + url = urlparse.urljoin(listurl, href) + print url + if -1 != href.find("file://"): +# print "Skipping non-http URL " + url + continue + subhtml = scraperwiki.scrape(url) + subroot = lxml.html.fromstring(subhtml) + subhtml = None + for subahref in subroot.cssselect("div.related-attachements a"): + subhref = subahref.attrib['href'] + suburl = urlparse.urljoin(url, subhref) + if -1 == suburl.find(".pdf"): + continue + if parser.is_already_scraped(suburl): + True +# print "Skipping already scraped " + url + else: +# print "Will process " + url + process_pdf(parser, suburl, errors) + +def test_small_pdfs(parser): + # Test with some smaller PDFs + errors = [] + process_pdf(parser, "http://www.unn.no/getfile.php/UNN-Internett/Media/Postjournal/UNN%20offentlig%20journal%202007/200807.pdf", errors) + process_page_queue(parser, errors) + report_errors(errors) + exit(0) + +errors = [] +parser = postlistelib.PDFJournalParser(agency=agency) + +#test_small_pdfs(parser) + +process_page_queue(parser, errors) +process_journal_pdfs(parser, "http://www.unn.no/offentlig-postjournal/category8944.html", errors) +for year in range(2011, 2007, -1): + process_journal_pdfs(parser, "http://www.unn.no/offentlig-postjournal/category8944.html?year=" + str(year), errors) +process_page_queue(parser, errors) +report_errors(errors) + |
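The three new scrapers share the same two-phase flow from postliste-python-lib: collect PDF links from a listing page, queue each unseen PDF page by page with preprocess(), then turn the queued pages into journal entries with process_pages() before the CPU allowance runs out. A condensed sketch of that flow follows; harvest_pdf_links(), AGENCY and LISTURL are illustrative names introduced here, while the library calls are the ones used above.

# -*- coding: UTF-8 -*-
# Condensed sketch of the harvesting pattern shared by the scrapers above.
import urlparse
import lxml.html
import scraperwiki

postlistelib = scraperwiki.swimport('postliste-python-lib')

AGENCY = 'Eksempel kommune'                       # illustrative only
LISTURL = 'http://www.example.org/postjournal/'   # illustrative only

def harvest_pdf_links(parser, listurl, errors, selector="table a"):
    # Phase 1: find PDF links on the listing page and queue the unseen ones.
    root = lxml.html.fromstring(scraperwiki.scrape(listurl))
    for ahref in root.cssselect(selector):
        url = urlparse.urljoin(listurl, ahref.attrib['href'])
        if -1 != url.find("file://") or -1 == url.find(".pdf"):
            continue  # not a fetchable PDF
        if parser.is_already_scraped(url):
            continue  # already parsed or already queued
        try:
            # Split the PDF into pages and store them in the
            # 'unparsedpages' table for later parsing.
            parser.preprocess(url, scraperwiki.scrape(url))
        except (ValueError, IndexError), e:
            errors.append(e)

errors = []
parser = postlistelib.PDFJournalParser(agency=AGENCY)
harvest_pdf_links(parser, LISTURL, errors)
# Phase 2: parse queued pages into journal entries; pages that fail to
# parse end up in the 'brokenpages' table so the run can continue.
parser.process_pages()

Splitting the fetch/queue step from the parsing step is what lets a run that hits the ScraperWiki CPU limit pick up where it left off: anything still in unparsedpages is simply parsed on the next scheduled run.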