author     Petter Reinholdtsen <pere@hungry.com>    2016-10-11 15:54:02 +0200
committer  Petter Reinholdtsen <pere@hungry.com>    2016-10-11 15:54:02 +0200
commit     70468bc82d0ff305e9bf5d4560b04d573638ccaf
tree       bf1f98aa50e59f64837d6cc58a64eb978ccb9352
parent     ec60a70ab16681b0499918057a13179d1b90352b
-rw-r--r--  scrapersources/postliste-universitetssykehuset-nord-norge | 21 +++++++++++++--------
1 file changed, 13 insertions(+), 8 deletions(-)
diff --git a/scrapersources/postliste-universitetssykehuset-nord-norge b/scrapersources/postliste-universitetssykehuset-nord-norge
index e4584e6..6f5a325 100644
--- a/scrapersources/postliste-universitetssykehuset-nord-norge
+++ b/scrapersources/postliste-universitetssykehuset-nord-norge
@@ -16,10 +16,11 @@ import datetime
 import dateutil.parser
 import lxml.html
 import urlparse
+import urllib
 import re
 
 # Make sure Scraperwiki believe this is the source from this database
-scraperwiki.scrape("http://www.unn.no/offentlig-postjournal/category8944.html")
+scraperwiki.scrape("https://unn.no/om-oss/media/offentlig-journal-unn-hf")
 
 lazycache=scraperwiki.swimport('lazycache')
 postlistelib=scraperwiki.swimport('postliste-python-lib')
@@ -53,6 +54,8 @@ def process_page_queue(parser, errors):
         postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
     except scraperwiki.CPUTimeExceededError, e:
         errors.append("Processing pages interrupted")
+    except scraperwiki.sql.SqliteError, e:
+        pass
 
 def process_journal_pdfs(parser, listurl, errors):
 #    print "Finding PDFs on " + listurl
@@ -60,13 +63,15 @@ def process_journal_pdfs(parser, listurl, errors):
     html = scraperwiki.scrape(listurl)
     root = lxml.html.fromstring(html)
     html = None
-    for ahref in root.cssselect("div.month-entry-title a"):
+    for ahref in root.cssselect("div.row a"):
         href = ahref.attrib['href']
         url = urlparse.urljoin(listurl, href)
-#        print url
-        if -1 != href.find("file://"):
-#            print "Skipping non-http URL " + url
+        print url
+        if -1 != href.find("file://") or -1 != href.find("mailto:") :
+            print "Skipping non-http URL " + url
             continue
+        url = urllib.quote(url, safe=':/')
+        print url
         subhtml = scraperwiki.scrape(url)
         subroot = lxml.html.fromstring(subhtml)
         subhtml = None
@@ -96,9 +101,9 @@ parser = postlistelib.PDFJournalParser(agency=agency)
 #test_small_pdfs(parser)
 
 process_page_queue(parser, errors)
-process_journal_pdfs(parser, "http://www.unn.no/offentlig-postjournal/category8944.html", errors)
-for year in range(2013, 2007, -1):
-    process_journal_pdfs(parser, "http://www.unn.no/offentlig-postjournal/category8944.html?year=" + str(year), errors)
+process_journal_pdfs(parser, "https://unn.no/om-oss/media/offentlig-journal-unn-hf", errors)
+#for year in range(2013, 2007, -1):
+#    process_journal_pdfs(parser, "http://www.unn.no/offentlig-postjournal/category8944.html?year=" + str(year), errors)
 
 process_page_queue(parser, errors)
 report_errors(errors)
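
The one non-obvious change above is the new urllib.quote(url, safe=':/') call, presumably added because hrefs on the redesigned unn.no pages can contain spaces or other bytes that are not valid in a request URL. Quoting everything except ':' and '/' percent-encodes the path before scraperwiki.scrape() fetches it. A minimal sketch of that step in the scraper's own Python 2 dialect; the href value here is a made-up example, not one taken from the actual page:

# -*- coding: utf-8 -*-
# Sketch of the URL normalisation added in this commit.  The listurl is
# the real journal page; the href is hypothetical.
import urllib
import urlparse

listurl = "https://unn.no/om-oss/media/offentlig-journal-unn-hf"
href = "/seksjon/journal/Documents/offentlig journal uke 40.pdf"  # hypothetical

url = urlparse.urljoin(listurl, href)
# Percent-encode everything except ':' and '/', so the space in the
# path no longer produces an invalid request URL.
url = urllib.quote(url, safe=':/')
print url
# -> https://unn.no/seksjon/journal/Documents/offentlig%20journal%20uke%2040.pdf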