author     Petter Reinholdtsen <pere@hungry.com>   2014-12-14 09:22:07 +0100
committer  Petter Reinholdtsen <pere@hungry.com>   2014-12-14 09:22:07 +0100
commit     2b348d56f8d9518c114c38b9aa567e1b38c7f67a
tree       38507508fb1296faec4202257e657062bf199b97
parent     91ea93eb886e664007148144d96c2e5cfb6a88c3
Get scraper working again.
-rw-r--r--   scrapersources/postliste-luftambulanse   16
1 file changed, 8 insertions, 8 deletions
diff --git a/scrapersources/postliste-luftambulanse b/scrapersources/postliste-luftambulanse
index 3d7c3f2..db52f18 100644
--- a/scrapersources/postliste-luftambulanse
+++ b/scrapersources/postliste-luftambulanse
@@ -40,8 +40,8 @@ def process_pdf(parser, pdfurl, errors):
         pdfcontent = scraperwiki.scrape(pdfurl)
         parser.preprocess(pdfurl, pdfcontent)
         pdfcontent = None
-#    except ValueError, e:
-#        errors.append(e)
+    except ValueError, e:
+        errors.append(e)
     except IndexError, e:
         errors.append(e)
@@ -53,24 +53,24 @@ def process_page_queue(parser, errors):
         errors.append("Processing pages interrupted")
 
 def process_journal_pdfs(parser, listurl, errors):
-#    print "Finding PDFs on " + listurl
+    print "Finding PDFs on " + listurl
 #    u = urllib.parse.urlparse(listurl)
     html = scraperwiki.scrape(listurl)
     root = lxml.html.fromstring(html)
     html = None
-    for ahref in root.cssselect("table a"):
+    for ahref in root.cssselect("div.field-items a"):
         if not 'href' in ahref.attrib:
             continue
         href = ahref.attrib['href']
         url = urlparse.urljoin(listurl, href).replace(" ", "%20")
-        if -1 != href.find("file://") or -1 == url.find(".pdf") or -1 == url.find('/Postjournal'):
-#            print "Skipping non-http URL " + url
+        if -1 != href.find("file://") or -1 == url.find(".pdf"):
+            print "Skipping non-http URL " + url
             continue
         if parser.is_already_scraped(url):
             True
-#            print "Skipping already scraped " + url
+            print "Skipping already scraped " + url
         else:
-#            print "Will process " + url
+            print "Will process " + url
             process_pdf(parser, url, errors)
 
 def test_small_pdfs(parser):
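
For reference, a minimal standalone sketch (Python 2, in the scraper's own style) of the PDF-discovery step as it looks after this commit: links are harvested from "div.field-items a" instead of "table a", and the '/Postjournal' substring requirement on the URL is dropped. The find_journal_pdfs helper and the listing URL below are illustrative placeholders, not part of the scraper itself.

# -*- coding: utf-8 -*-
# Sketch of the link discovery after this commit.  Assumes the scraperwiki,
# lxml and cssselect modules already used by the scraper are available.
import urlparse
import lxml.html
import scraperwiki

def find_journal_pdfs(listurl):
    html = scraperwiki.scrape(listurl)
    root = lxml.html.fromstring(html)
    urls = []
    # PDF links now sit inside div.field-items rather than in a table.
    for ahref in root.cssselect("div.field-items a"):
        if 'href' not in ahref.attrib:
            continue
        href = ahref.attrib['href']
        url = urlparse.urljoin(listurl, href).replace(" ", "%20")
        # Skip file:// links and anything that does not look like a PDF.
        if -1 != href.find("file://") or -1 == url.find(".pdf"):
            continue
        urls.append(url)
    return urls

# Placeholder URL for illustration only.
for url in find_journal_pdfs("http://www.example.org/postjournal"):
    print "Will process " + url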