author     Petter Reinholdtsen <pere@hungry.com>  2014-12-13 10:51:48 +0100
committer  Petter Reinholdtsen <pere@hungry.com>  2014-12-13 10:53:38 +0100
commit     b69a3b9cb76c71b3d303811e1cae91b33362784f
tree       e6bf080d7884ea8dd840382f1083ffe67a00ddaa
parent     ee69cd43debba7f1724f6761f937808477f27c21
Make scraper more robust.
-rw-r--r--  scrapersources/postliste-hadsel | 14
1 file changed, 8 insertions(+), 6 deletions(-)
diff --git a/scrapersources/postliste-hadsel b/scrapersources/postliste-hadsel
index 1f27b48..940e165 100644
--- a/scrapersources/postliste-hadsel
+++ b/scrapersources/postliste-hadsel
@@ -7,6 +7,7 @@
 # Datatype: ePhorte
 # Vendor: Ergo
 # Run: daily
+# Publish duration: 3 months
 
 import scraperwiki
 from BeautifulSoup import BeautifulSoup
@@ -53,12 +54,13 @@ def process_page_queue(parser, errors):
 
 def consider_url(parser, url, errors):
     if parser.is_already_scraped(url):
         True
-# print "Skipping already scraped " + url
+# print "Skipping already scraped " + url
     else:
-# print "Will process " + url
+# print "Will process " + url
         try:
             process_pdf(parser, url, errors)
-        except:
+        except Exception, e:
+            print "Processing PDF on %s failed:" % url, e
             pass
 
@@ -67,12 +69,12 @@ def process_journal_pdfs(parser, listurl, errors, recurse):
     html = scraperwiki.scrape(listurl)
     root = lxml.html.fromstring(html)
     html = None
-    for ahref in root.cssselect("div.items a"):
+    for ahref in root.cssselect("div.items a.doclink"):
         url = urlparse.urljoin(listurl, ahref.attrib['href'])
-        if -1 == url.find("doc_download"):
+        if -1 == url.find("doc_download") or -1 != url.find("docman"):
             continue
+# print url
         consider_url(parser, url, errors)
-        #print url
     for ahref in root.cssselect("div.item-list a"):
         suburl = urlparse.urljoin(listurl, ahref.attrib['href'])
         #print "sub " + suburl
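
Taken together, the two changes follow a common scraper-hardening pattern: narrow the set of links the scraper even attempts (only a.doclink anchors whose URL contains doc_download and not docman), and contain failures per document instead of per run. Below is a minimal, self-contained sketch of that pattern in the scraper's Python 2 style; harvest_doc_links, the always-failing process_pdf stub, the sample HTML, and the example URL are illustrative assumptions, not code from the repository.

# Sketch of the commit's two robustness measures: a stricter link
# filter and per-document exception handling.  Python 2, like the
# scraper; the names and data here are stand-ins.

import urlparse

import lxml.html


def process_pdf(url):
    # Stub for the scraper's real PDF handler; always fails so the
    # example exercises the new error path.
    raise IOError("simulated download failure")


def harvest_doc_links(listurl, html):
    root = lxml.html.fromstring(html)
    # Follow only explicit document links, and skip URLs that lack
    # doc_download or that point into docman pages, mirroring the
    # commit's tightened filter.
    for ahref in root.cssselect("div.items a.doclink"):
        url = urlparse.urljoin(listurl, ahref.attrib['href'])
        if -1 == url.find("doc_download") or -1 != url.find("docman"):
            continue
        try:
            process_pdf(url)
        except Exception, e:
            # Log the failure and move on; one broken PDF no longer
            # aborts the entire scraper run.
            print "Processing PDF on %s failed:" % url, e


if __name__ == '__main__':
    html = ('<div class="items">'
            '<a class="doclink" href="/nedlasting/doc_download/'
            '42-postjournal">journal.pdf</a></div>')
    harvest_doc_links("http://www.hadsel.kommune.no/", html)

One detail worth noting: catching Exception, rather than keeping the bare except:, still lets KeyboardInterrupt and SystemExit propagate in Python 2.5+, so a stuck run can be stopped by hand while ordinary per-document failures are merely logged.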