author | Petter Reinholdtsen <pere@hungry.com> | 2015-01-19 23:41:36 +0100
---|---|---
committer | Petter Reinholdtsen <pere@hungry.com> | 2015-01-19 23:41:36 +0100
commit | 0701a3f4b260d29f98ff86a208fa8511fe7d3e72 (patch) |
tree | b13b431001bc363aa497d921f926bd7131a6266a |
parent | 58aa823071914debecfd9c5e5544b53d91b0c641 (diff) |
New county scraper.
-rw-r--r-- | scrapersources/postliste-midsund-kommune | 93 |
1 file changed, 93 insertions, 0 deletions
diff --git a/scrapersources/postliste-midsund-kommune b/scrapersources/postliste-midsund-kommune
new file mode 100644
index 0000000..08f4cd7
--- /dev/null
+++ b/scrapersources/postliste-midsund-kommune
@@ -0,0 +1,93 @@
+# -*- coding: utf-8 -*-
+# YAML-tagger:
+# Type: kommune
+# Status: unfinished
+# Name: Midsund kommune
+# Format: PDF
+# Datatype: ePhorte
+# Vendor: Ergo
+# Run: daily
+# Publish duration: 2 months
+
+import scraperwiki
+import json
+from BeautifulSoup import BeautifulSoup
+import datetime
+import dateutil.parser
+import lxml.html
+import resource
+import sys
+import urlparse
+import re
+lazycache=scraperwiki.swimport('lazycache')
+postlistelib=scraperwiki.swimport('postliste-python-lib')
+
+agency = 'Midsund kommune'
+
+def report_errors(errors):
+    if 0 < len(errors):
+        print "Errors:"
+        for e in errors:
+            print e
+        exit(1)
+def out_of_cpu(arg, spent, hard, soft):
+    report_errors(arg)
+
+def process_pdf(parser, pdfurl, errors):
+    errors = []
+    postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
+    try:
+        pdfcontent = scraperwiki.scrape(pdfurl)
+        parser.preprocess(pdfurl, pdfcontent)
+        pdfcontent = None
+    except ValueError, e:
+        errors.append(e)
+    except IndexError, e:
+        errors.append(e)
+
+def process_page_queue(parser, errors):
+    try:
+        parser.process_pages()
+        postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
+    except scraperwiki.CPUTimeExceededError, e:
+        errors.append("Processing pages interrupted")
+
+def process_journal_pdfs(parser, listurl, errors):
+    print "Finding PDFs on " + listurl
+    # u = urllib.parse.urlparse(listurl)
+    html = scraperwiki.scrape(listurl)
+    root = lxml.html.fromstring(html)
+    html = None
+    for ahref in root.cssselect("td.ingress a"):
+        if not 'href' in ahref.attrib:
+            continue
+        href = ahref.attrib['href']
+        url = urlparse.urljoin(listurl, href).replace(" ", "%20")
+        if -1 != href.find("file://") or -1 == url.find(".pdf"):
+            print "Skipping non-http URL " + url
+            continue
+        if parser.is_already_scraped(url):
+            True
+            print "Skipping already scraped " + url
+        else:
+            print "Will process " + url
+            process_pdf(parser, url, errors)
+
+def test_small_pdfs(parser):
+    # Test with some smaller PDFs
+    errors = []
+    process_pdf(parser, "http://www.midsund.kommune.no/getfile.php/2845832.706.cucapdfbsf/Postliste+3.pdf", errors)
+    process_page_queue(parser, errors)
+    report_errors(errors)
+    exit(0)
+
+errors = []
+parser = postlistelib.PDFJournalParser(agency=agency)
+
+#test_small_pdfs(parser)
+
+#process_page_queue(parser, errors)
+process_journal_pdfs(parser, "http://www.midsund.kommune.no/offentleg-postliste.22718.no.html", errors)
+
+process_page_queue(parser, errors)
+report_errors(errors)