diff options
author | Petter Reinholdtsen <pere@hungry.com> | 2014-12-20 00:03:41 +0100 |
---|---|---|
committer | Petter Reinholdtsen <pere@hungry.com> | 2014-12-20 00:03:41 +0100 |
commit | e5e077498d19b426f1b849dd66e9e5555d494fde (patch) | |
tree | 459b887d012ff36840bfe52d8dfe804a26a0c9a1 | |
parent | 00dfe4c4c7b9db7a6b5ff0362b2155c281ae0ac4 (diff) |
Update and add meta info.
-rw-r--r-- | scrapersources/postliste-npolar | 14 |
1 files changed, 12 insertions, 2 deletions
```diff
diff --git a/scrapersources/postliste-npolar b/scrapersources/postliste-npolar
index 423a785..0fca7e2 100644
--- a/scrapersources/postliste-npolar
+++ b/scrapersources/postliste-npolar
@@ -1,4 +1,12 @@
-# -*- coding: UTF-8 -*-
+# -*- coding: utf-8 -*-
+# YAML-tagger:
+# Type: unknown
+# Status: finished
+# Name: Norsk Polarinstitutt
+# Format: PDF
+# Datatype: ePhorte
+# Vendor: Ergo
+# Run: daily
 
 import scraperwiki
 import json
@@ -53,7 +61,7 @@ def process_journal_pdfs(parser, listurl, errors):
     html = scraperwiki.scrape(listurl)
     root = lxml.html.fromstring(html)
     html = None
-    for ahref in root.cssselect("div.onecol ul a"):
+    for ahref in root.cssselect("div#rightside ul a"):
         href = ahref.attrib['href']
         url = urlparse.urljoin(listurl, href)
         if -1 != href.find("file://") or -1 == url.find(".pdf"):
@@ -69,6 +77,8 @@ def process_journal_pdfs(parser, listurl, errors):
 def test_small_pdfs(parser):
     # Test with some smaller PDFs
     errors = []
+    process_pdf(parser, "http://www.npolar.no/npcms/export/sites/np/files/vedlegg/offentlig-journal/2014-09.pdf", errors)
+    process_pdf(parser, "http://www.npolar.no/npcms/export/sites/np/files/vedlegg/offentlig-journal/2014-08.pdf", errors)
     #process_pdf(parser, "http://www.npolar.no/npcms/export/sites/np/files/vedlegg/offentlig-journal/2011-10.pdf", errors)
     #process_pdf(parser, "http://www.npolar.no/npcms/export/sites/np/files/vedlegg/offentlig-journal/2011-09.pdf", errors)
     #process_pdf(parser, "http://www.npolar.no/npcms/export/sites/np/files/vedlegg/offentlig-journal/2011-08.pdf", errors)
```