From d2cf06dc0e204e3e6a01a54e645db57a2b50d238 Mon Sep 17 00:00:00 2001 From: Petter Reinholdtsen Date: Sun, 4 Jan 2015 11:00:27 +0100 Subject: Improve scraper. --- scrapersources/postliste-nordreisa-kommune | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/scrapersources/postliste-nordreisa-kommune b/scrapersources/postliste-nordreisa-kommune index 602c745..681fb78 100644 --- a/scrapersources/postliste-nordreisa-kommune +++ b/scrapersources/postliste-nordreisa-kommune @@ -59,6 +59,9 @@ def process_journal_pdfs(parser, listurl, errors): root = lxml.html.fromstring(html) html = None for ahref in root.cssselect("a"): + if 'href' not in ahref.attrib: + print "Skipping a without href" + continue href = ahref.attrib['href'] url = urlparse.urljoin(listurl, href) if -1 != href.find("file://") or -1 == url.find(".pdf"): @@ -82,7 +85,7 @@ def test_small_pdfs(parser): errors = [] parser = postlistelib.PDFJournalParser(agency=agency) -test_small_pdfs(parser) +#test_small_pdfs(parser) process_journal_pdfs(parser, "http://www.nordreisa.kommune.no/postlister-20122013-og-2014.4866638-137620.html", errors) process_page_queue(parser, errors) -- cgit v1.2.3