author    Petter Reinholdtsen <pere@hungry.com>    2016-10-11 15:54:02 +0200
committer Petter Reinholdtsen <pere@hungry.com>    2016-10-11 15:54:02 +0200
commit    70468bc82d0ff305e9bf5d4560b04d573638ccaf (patch)
tree      bf1f98aa50e59f64837d6cc58a64eb978ccb9352
parent    ec60a70ab16681b0499918057a13179d1b90352b (diff)
Correct scraper.
-rw-r--r--  scrapersources/postliste-universitetssykehuset-nord-norge | 21 +++++++++++++--------
1 file changed, 13 insertions(+), 8 deletions(-)
diff --git a/scrapersources/postliste-universitetssykehuset-nord-norge b/scrapersources/postliste-universitetssykehuset-nord-norge
index e4584e6..6f5a325 100644
--- a/scrapersources/postliste-universitetssykehuset-nord-norge
+++ b/scrapersources/postliste-universitetssykehuset-nord-norge
@@ -16,10 +16,11 @@ import datetime
 import dateutil.parser
 import lxml.html
 import urlparse
+import urllib
 import re
 
 # Make sure Scraperwiki believe this is the source from this database
-scraperwiki.scrape("http://www.unn.no/offentlig-postjournal/category8944.html")
+scraperwiki.scrape("https://unn.no/om-oss/media/offentlig-journal-unn-hf")
 
 lazycache=scraperwiki.swimport('lazycache')
 postlistelib=scraperwiki.swimport('postliste-python-lib')
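
The new urllib import is used further down to percent-encode document URLs before fetching them. A minimal Python 2 sketch of what urllib.quote(url, safe=':/') does, using a hypothetical PDF URL containing spaces:

    import urllib

    # ':' and '/' are kept as-is so the scheme and path separators survive;
    # other unsafe characters (here: spaces) are percent-encoded.
    url = "https://unn.no/om-oss/media/offentlig journal uke 40.pdf"
    print urllib.quote(url, safe=':/')
    # https://unn.no/om-oss/media/offentlig%20journal%20uke%2040.pdf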
@@ -53,6 +54,8 @@ def process_page_queue(parser, errors):
         postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
     except scraperwiki.CPUTimeExceededError, e:
         errors.append("Processing pages interrupted")
+    except scraperwiki.sql.SqliteError, e:
+        pass
 
 def process_journal_pdfs(parser, listurl, errors):
 #    print "Finding PDFs on " + listurl
@@ -60,13 +63,15 @@ def process_journal_pdfs(parser, listurl, errors):
     html = scraperwiki.scrape(listurl)
     root = lxml.html.fromstring(html)
     html = None
-    for ahref in root.cssselect("div.month-entry-title a"):
+    for ahref in root.cssselect("div.row a"):
         href = ahref.attrib['href']
         url = urlparse.urljoin(listurl, href)
-#        print url
-        if -1 != href.find("file://"):
-#            print "Skipping non-http URL " + url
+        print url
+        if -1 != href.find("file://") or -1 != href.find("mailto:") :
+            print "Skipping non-http URL " + url
             continue
+        url = urllib.quote(url, safe=':/')
+        print url
         subhtml = scraperwiki.scrape(url)
         subroot = lxml.html.fromstring(subhtml)
         subhtml = None
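
Three changes work together in this hunk: the link selector follows the new page layout (div.row instead of div.month-entry-title), mailto: links are now skipped along with file:// ones, and each URL is percent-encoded before being fetched, presumably because the new site publishes document names containing characters that need escaping (an assumption; the patch itself only shows the quoting). A self-contained Python 2 sketch of the harvesting loop as it stands after the patch:

    import urllib
    import urlparse

    import lxml.html
    import scraperwiki

    listurl = "https://unn.no/om-oss/media/offentlig-journal-unn-hf"
    root = lxml.html.fromstring(scraperwiki.scrape(listurl))
    for ahref in root.cssselect("div.row a"):
        href = ahref.attrib['href']
        url = urlparse.urljoin(listurl, href)
        if -1 != href.find("file://") or -1 != href.find("mailto:"):
            continue  # skip anchors that are not fetchable over HTTP(S)
        url = urllib.quote(url, safe=':/')
        print url  # debug output, as in the patch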
@@ -96,9 +101,9 @@ parser = postlistelib.PDFJournalParser(agency=agency)
 #test_small_pdfs(parser)
 
 process_page_queue(parser, errors)
-process_journal_pdfs(parser, "http://www.unn.no/offentlig-postjournal/category8944.html", errors)
-for year in range(2013, 2007, -1):
-    process_journal_pdfs(parser, "http://www.unn.no/offentlig-postjournal/category8944.html?year=" + str(year), errors)
+process_journal_pdfs(parser, "https://unn.no/om-oss/media/offentlig-journal-unn-hf", errors)
+#for year in range(2013, 2007, -1):
+#    process_journal_pdfs(parser, "http://www.unn.no/offentlig-postjournal/category8944.html?year=" + str(year), errors)
 process_page_queue(parser, errors)
 
 report_errors(errors)
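
The per-year loop is commented out rather than rewritten, presumably because the new page does not take the old ?year= query parameter. The resulting top-level flow, reconstructed from this hunk (agency, errors and report_errors are defined earlier in the scraper, outside this diff):

    parser = postlistelib.PDFJournalParser(agency=agency)

    process_page_queue(parser, errors)   # drain pages queued by earlier runs
    process_journal_pdfs(parser,
                         "https://unn.no/om-oss/media/offentlig-journal-unn-hf",
                         errors)
    process_page_queue(parser, errors)   # parse the PDFs fetched above
    report_errors(errors)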