diff options
author | Petter Reinholdtsen <pere@hungry.com> | 2015-01-04 11:00:27 +0100 |
---|---|---|
committer | Petter Reinholdtsen <pere@hungry.com> | 2015-01-04 11:00:27 +0100 |
commit | d2cf06dc0e204e3e6a01a54e645db57a2b50d238 (patch) | |
tree | 55ddf51065f40892bdd208dd921e997b0543375a | |
parent | f3ff09a180210a14ef63529b1b1be7ac0302f195 (diff) |
Improve scraper: skip `a` elements that have no `href` attribute, and disable the `test_small_pdfs` run.
-rw-r--r-- | scrapersources/postliste-nordreisa-kommune | 5 |
1 file changed, 4 insertions, 1 deletion
diff --git a/scrapersources/postliste-nordreisa-kommune b/scrapersources/postliste-nordreisa-kommune index 602c745..681fb78 100644 --- a/scrapersources/postliste-nordreisa-kommune +++ b/scrapersources/postliste-nordreisa-kommune @@ -59,6 +59,9 @@ def process_journal_pdfs(parser, listurl, errors): root = lxml.html.fromstring(html) html = None for ahref in root.cssselect("a"): + if 'href' not in ahref.attrib: + print "Skipping a without href" + continue href = ahref.attrib['href'] url = urlparse.urljoin(listurl, href) if -1 != href.find("file://") or -1 == url.find(".pdf"): @@ -82,7 +85,7 @@ def test_small_pdfs(parser): errors = [] parser = postlistelib.PDFJournalParser(agency=agency) -test_small_pdfs(parser) +#test_small_pdfs(parser) process_journal_pdfs(parser, "http://www.nordreisa.kommune.no/postlister-20122013-og-2014.4866638-137620.html", errors) process_page_queue(parser, errors) |