about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Petter Reinholdtsen <pere@hungry.com> 2015-01-04 11:00:27 +0100
committer Petter Reinholdtsen <pere@hungry.com> 2015-01-04 11:00:27 +0100
commit d2cf06dc0e204e3e6a01a54e645db57a2b50d238 (patch)
tree 55ddf51065f40892bdd208dd921e997b0543375a
parent f3ff09a180210a14ef63529b1b1be7ac0302f195 (diff)
Improve scraper.
-rw-r--r-- scrapersources/postliste-nordreisa-kommune 5
1 files changed, 4 insertions, 1 deletions
diff --git a/scrapersources/postliste-nordreisa-kommune b/scrapersources/postliste-nordreisa-kommune
index 602c745..681fb78 100644
--- a/scrapersources/postliste-nordreisa-kommune
+++ b/scrapersources/postliste-nordreisa-kommune
@@ -59,6 +59,9 @@ def process_journal_pdfs(parser, listurl, errors):
root = lxml.html.fromstring(html)
html = None
for ahref in root.cssselect("a"):
+ if 'href' not in ahref.attrib:
+ print "Skipping a without href"
+ continue
href = ahref.attrib['href']
url = urlparse.urljoin(listurl, href)
if -1 != href.find("file://") or -1 == url.find(".pdf"):
@@ -82,7 +85,7 @@ def test_small_pdfs(parser):
errors = []
parser = postlistelib.PDFJournalParser(agency=agency)
-test_small_pdfs(parser)
+#test_small_pdfs(parser)
process_journal_pdfs(parser, "http://www.nordreisa.kommune.no/postlister-20122013-og-2014.4866638-137620.html", errors)
process_page_queue(parser, errors)