diff options
author | Petter Reinholdtsen <pere@hungry.com> | 2014-12-14 09:22:07 +0100 |
---|---|---|
committer | Petter Reinholdtsen <pere@hungry.com> | 2014-12-14 09:22:07 +0100 |
commit | 2b348d56f8d9518c114c38b9aa567e1b38c7f67a (patch) | |
tree | 38507508fb1296faec4202257e657062bf199b97 | |
parent | 91ea93eb886e664007148144d96c2e5cfb6a88c3 (diff) |
Get scraper working again.
-rw-r--r-- | scrapersources/postliste-luftambulanse | 16 |
1 file changed, 8 insertions, 8 deletions
diff --git a/scrapersources/postliste-luftambulanse b/scrapersources/postliste-luftambulanse
index 3d7c3f2..db52f18 100644
--- a/scrapersources/postliste-luftambulanse
+++ b/scrapersources/postliste-luftambulanse
@@ -40,8 +40,8 @@ def process_pdf(parser, pdfurl, errors):
         pdfcontent = scraperwiki.scrape(pdfurl)
         parser.preprocess(pdfurl, pdfcontent)
         pdfcontent = None
-#    except ValueError, e:
-#        errors.append(e)
+    except ValueError, e:
+        errors.append(e)
     except IndexError, e:
         errors.append(e)
@@ -53,24 +53,24 @@ def process_page_queue(parser, errors):
         errors.append("Processing pages interrupted")

 def process_journal_pdfs(parser, listurl, errors):
-#    print "Finding PDFs on " + listurl
+    print "Finding PDFs on " + listurl
 #    u = urllib.parse.urlparse(listurl)
     html = scraperwiki.scrape(listurl)
     root = lxml.html.fromstring(html)
     html = None
-    for ahref in root.cssselect("table a"):
+    for ahref in root.cssselect("div.field-items a"):
         if not 'href' in ahref.attrib:
             continue
         href = ahref.attrib['href']
         url = urlparse.urljoin(listurl, href).replace(" ", "%20")
-        if -1 != href.find("file://") or -1 == url.find(".pdf") or -1 == url.find('/Postjournal'):
-#            print "Skipping non-http URL " + url
+        if -1 != href.find("file://") or -1 == url.find(".pdf"):
+            print "Skipping non-http URL " + url
             continue
         if parser.is_already_scraped(url):
             True
-#            print "Skipping already scraped " + url
+            print "Skipping already scraped " + url
         else:
-#            print "Will process " + url
+            print "Will process " + url
             process_pdf(parser, url, errors)

 def test_small_pdfs(parser):