author | Petter Reinholdtsen <pere@hungry.com> | 2015-01-04 10:57:41 +0100 |
---|---|---|
committer | Petter Reinholdtsen <pere@hungry.com> | 2015-01-04 10:57:41 +0100 |
commit | f3ff09a180210a14ef63529b1b1be7ac0302f195 | |
tree | be50ae5f10112a13d464aa5fb0fcffe9f0a611ec | |
parent | 39268f1aa28e673e15c2a48a52dd3249b99ae04f | |
New scraper for Nordreisa kommune.
-rw-r--r-- | scrapersources/postliste-nordreisa-kommune | 90 |
1 file changed, 90 insertions, 0 deletions
diff --git a/scrapersources/postliste-nordreisa-kommune b/scrapersources/postliste-nordreisa-kommune
new file mode 100644
index 0000000..602c745
--- /dev/null
+++ b/scrapersources/postliste-nordreisa-kommune
@@ -0,0 +1,90 @@
+# -*- coding: utf-8 -*-
+# YAML-tagger:
+# Type: kommune
+# Status: unfinished
+# Name: Nordreisa kommune
+# Format: PDF
+# Datatype: ePhorte
+# Vendor: Ergo
+# Publish duration: unlimited
+# Run: daily
+
+import scraperwiki
+import json
+from BeautifulSoup import BeautifulSoup
+import datetime
+import dateutil.parser
+import lxml.html
+import resource
+import sys
+import urlparse
+import re
+lazycache=scraperwiki.swimport('lazycache')
+postlistelib=scraperwiki.swimport('postliste-python-lib')
+
+agency = 'Nordreisa kommune'
+
+def report_errors(errors):
+    if 0 < len(errors):
+        print "Errors:"
+        for e in errors:
+            print e
+        exit(1)
+def out_of_cpu(arg, spent, hard, soft):
+    report_errors(arg)
+
+def process_pdf(parser, pdfurl, errors):
+    errors = []
+    postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
+    try:
+        pdfcontent = scraperwiki.scrape(pdfurl)
+        parser.preprocess(pdfurl, pdfcontent)
+        pdfcontent = None
+#    except ValueError, e:
+#        errors.append(e)
+    except IndexError, e:
+        errors.append(e)
+
+def process_page_queue(parser, errors):
+    try:
+        parser.process_pages()
+        postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
+    except scraperwiki.CPUTimeExceededError, e:
+        errors.append("Processing pages interrupted")
+
+def process_journal_pdfs(parser, listurl, errors):
+#    print "Finding PDFs on " + listurl
+#    u = urllib.parse.urlparse(listurl)
+    html = scraperwiki.scrape(listurl)
+    root = lxml.html.fromstring(html)
+    html = None
+    for ahref in root.cssselect("a"):
+        href = ahref.attrib['href']
+        url = urlparse.urljoin(listurl, href)
+        if -1 != href.find("file://") or -1 == url.find(".pdf"):
+#            print "Skipping non-http URL " + url
+            continue
+        if parser.is_already_scraped(url):
+            True
+            print "Skipping already scraped " + url
+        else:
+            print "Will process " + url
+            process_pdf(parser, url, errors)
+
+def test_small_pdfs(parser):
+    # Test with some smaller PDFs
+    errors = []
+    process_pdf(parser, "http://www.nordreisa.kommune.no/getfile.php/2801845.1386.tuedyspuey/Postliste+02.12.14.pdf", errors)
+    process_page_queue(parser, errors)
+    report_errors(errors)
+    exit(0)
+
+errors = []
+parser = postlistelib.PDFJournalParser(agency=agency)
+
+test_small_pdfs(parser)
+
+process_journal_pdfs(parser, "http://www.nordreisa.kommune.no/postlister-20122013-og-2014.4866638-137620.html", errors)
+process_page_queue(parser, errors)
+report_errors(errors)
+