diff options
author | Petter Reinholdtsen <pere@hungry.com> | 2015-01-04 12:56:01 +0100 |
---|---|---|
committer | Petter Reinholdtsen <pere@hungry.com> | 2015-01-04 12:57:36 +0100 |
commit | a16c2c6dade926a77d4748d69acbbbae87c61ce7 (patch) | |
tree | 2fec5ac18a36d2920482b1d7ba6ef9fc1f450322 | |
parent | a9188ef1258edb11699f2449eeda8caf9ae25d82 (diff) |
Make scraper more robust.
-rw-r--r-- | scrapersources/postliste-nordreisa-kommune | 14 |
1 file changed, 8 insertions, 6 deletions
```diff
diff --git a/scrapersources/postliste-nordreisa-kommune b/scrapersources/postliste-nordreisa-kommune
index 681fb78..13e8317 100644
--- a/scrapersources/postliste-nordreisa-kommune
+++ b/scrapersources/postliste-nordreisa-kommune
@@ -14,6 +14,7 @@ import json
 from BeautifulSoup import BeautifulSoup
 import datetime
 import dateutil.parser
+import urllib2
 import lxml.html
 import resource
 import sys
@@ -34,16 +35,17 @@ def out_of_cpu(arg, spent, hard, soft):
     report_errors(arg)
 
 def process_pdf(parser, pdfurl, errors):
-    errors = []
     postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
     try:
         pdfcontent = scraperwiki.scrape(pdfurl)
         parser.preprocess(pdfurl, pdfcontent)
         pdfcontent = None
-#    except ValueError, e:
-#        errors.append(e)
+    except ValueError, e:
+        errors.append(e)
     except IndexError, e:
         errors.append(e)
+    except urllib2.HTTPError, e:
+        errors.append(e)
 
 def process_page_queue(parser, errors):
     try:
@@ -60,7 +62,7 @@ def process_journal_pdfs(parser, listurl, errors):
     html = None
     for ahref in root.cssselect("a"):
         if 'href' not in ahref.attrib:
-            print "Skipping a without href"
+#            print "Skipping a without href"
             continue
         href = ahref.attrib['href']
         url = urlparse.urljoin(listurl, href)
@@ -68,10 +70,10 @@ def process_journal_pdfs(parser, listurl, errors):
 #            print "Skipping non-http URL " + url
             continue
         if parser.is_already_scraped(url):
+#            print "Skipping already scraped " + url
             True
-            print "Skipping already scraped " + url
         else:
-            print "Will process " + url
+#            print "Will process " + url
             process_pdf(parser, url, errors)
 
 def test_small_pdfs(parser):
```