author     Petter Reinholdtsen <pere@hungry.com>  2014-12-13 10:51:48 +0100
committer  Petter Reinholdtsen <pere@hungry.com>  2014-12-13 10:53:38 +0100
commit     b69a3b9cb76c71b3d303811e1cae91b33362784f
tree       e6bf080d7884ea8dd840382f1083ffe67a00ddaa
parent     ee69cd43debba7f1724f6761f937808477f27c21
Make scraper more robust.
-rw-r--r--  scrapersources/postliste-hadsel | 14
1 file changed, 8 insertions(+), 6 deletions(-)
diff --git a/scrapersources/postliste-hadsel b/scrapersources/postliste-hadsel
index 1f27b48..940e165 100644
--- a/scrapersources/postliste-hadsel
+++ b/scrapersources/postliste-hadsel
@@ -7,6 +7,7 @@
 # Datatype: ePhorte
 # Vendor: Ergo
 # Run: daily
+# Publish duration: 3 months
 
 import scraperwiki
 from BeautifulSoup import BeautifulSoup
@@ -53,12 +54,13 @@ def process_page_queue(parser, errors):
 
 def consider_url(parser, url, errors):
     if parser.is_already_scraped(url):
         True
-# print "Skipping already scraped " + url
+# print "Skipping already scraped " + url
     else:
-# print "Will process " + url
+# print "Will process " + url
         try:
             process_pdf(parser, url, errors)
-        except:
+        except Exception, e:
+            print "Processing PDF on %s failed:" % url, e
             pass
 
@@ -67,12 +69,12 @@ def process_journal_pdfs(parser, listurl, errors, recurse):
     html = scraperwiki.scrape(listurl)
     root = lxml.html.fromstring(html)
     html = None
-    for ahref in root.cssselect("div.items a"):
+    for ahref in root.cssselect("div.items a.doclink"):
         url = urlparse.urljoin(listurl, ahref.attrib['href'])
-        if -1 == url.find("doc_download"):
+        if -1 == url.find("doc_download") or -1 != url.find("docman"):
             continue
+# print url
         consider_url(parser, url, errors)
-        #print url
     for ahref in root.cssselect("div.item-list a"):
         suburl = urlparse.urljoin(listurl, ahref.attrib['href'])
         #print "sub " + suburl
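
Taken together, the two changes follow a common scraper-hardening pattern: narrow the set of links the scraper even attempts (only a.doclink anchors whose URL contains doc_download and not docman), and contain failures per document instead of per run. Below is a minimal, self-contained sketch of that pattern in the scraper's Python 2 style; harvest_doc_links, the always-failing process_pdf stub, the sample HTML, and the example URL are illustrative assumptions, not code from the repository.

# Sketch of the commit's two robustness measures: a stricter link
# filter and per-document exception handling.  Python 2, like the
# scraper; the names and data here are stand-ins.

import urlparse

import lxml.html


def process_pdf(url):
    # Stub for the scraper's real PDF handler; always fails so the
    # example exercises the new error path.
    raise IOError("simulated download failure")


def harvest_doc_links(listurl, html):
    root = lxml.html.fromstring(html)
    # Follow only explicit document links, and skip URLs that lack
    # doc_download or that point into docman pages, mirroring the
    # commit's tightened filter.
    for ahref in root.cssselect("div.items a.doclink"):
        url = urlparse.urljoin(listurl, ahref.attrib['href'])
        if -1 == url.find("doc_download") or -1 != url.find("docman"):
            continue
        try:
            process_pdf(url)
        except Exception, e:
            # Log the failure and move on; one broken PDF no longer
            # aborts the entire scraper run.
            print "Processing PDF on %s failed:" % url, e


if __name__ == '__main__':
    html = ('<div class="items">'
            '<a class="doclink" href="/nedlasting/doc_download/'
            '42-postjournal">journal.pdf</a></div>')
    harvest_doc_links("http://www.hadsel.kommune.no/", html)

One detail worth noting: catching Exception, rather than keeping the bare except:, still lets KeyboardInterrupt and SystemExit propagate in Python 2.5+, so a stuck run can be stopped by hand while ordinary per-document failures are merely logged.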