diff options
Diffstat (limited to 'scrapersources')
| -rw-r--r-- | scrapersources/postliste-universitetssykehuset-nord-norge | 21 | 
1 file changed, 13 insertions, 8 deletions
| diff --git a/scrapersources/postliste-universitetssykehuset-nord-norge b/scrapersources/postliste-universitetssykehuset-nord-norge index e4584e6..6f5a325 100644 --- a/scrapersources/postliste-universitetssykehuset-nord-norge +++ b/scrapersources/postliste-universitetssykehuset-nord-norge @@ -16,10 +16,11 @@ import datetime  import dateutil.parser  import lxml.html  import urlparse +import urllib  import re  # Make sure Scraperwiki believe this is the source from this database -scraperwiki.scrape("http://www.unn.no/offentlig-postjournal/category8944.html") +scraperwiki.scrape("https://unn.no/om-oss/media/offentlig-journal-unn-hf")  lazycache=scraperwiki.swimport('lazycache')  postlistelib=scraperwiki.swimport('postliste-python-lib') @@ -53,6 +54,8 @@ def process_page_queue(parser, errors):          postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)      except scraperwiki.CPUTimeExceededError, e:          errors.append("Processing pages interrupted") +    except scraperwiki.sql.SqliteError, e: +	pass  def process_journal_pdfs(parser, listurl, errors):  #    print "Finding PDFs on " + listurl @@ -60,13 +63,15 @@ def process_journal_pdfs(parser, listurl, errors):      html = scraperwiki.scrape(listurl)      root = lxml.html.fromstring(html)      html = None -    for ahref in root.cssselect("div.month-entry-title a"): +    for ahref in root.cssselect("div.row a"):          href = ahref.attrib['href']          url = urlparse.urljoin(listurl, href) -#        print url -        if -1 != href.find("file://"): -#            print "Skipping non-http URL " + url +        print url +        if -1 != href.find("file://") or -1 != href.find("mailto:") : +            print "Skipping non-http URL " + url              continue +	url = urllib.quote(url, safe=':/') +	print url          subhtml = scraperwiki.scrape(url)          subroot = lxml.html.fromstring(subhtml)          subhtml = None @@ -96,9 +101,9 @@ parser = postlistelib.PDFJournalParser(agency=agency)  
#test_small_pdfs(parser)  process_page_queue(parser, errors) -process_journal_pdfs(parser, "http://www.unn.no/offentlig-postjournal/category8944.html", errors) -for year in range(2013, 2007, -1): -    process_journal_pdfs(parser, "http://www.unn.no/offentlig-postjournal/category8944.html?year=" + str(year), errors) +process_journal_pdfs(parser, "https://unn.no/om-oss/media/offentlig-journal-unn-hf", errors) +#for year in range(2013, 2007, -1): +#    process_journal_pdfs(parser, "http://www.unn.no/offentlig-postjournal/category8944.html?year=" + str(year), errors)  process_page_queue(parser, errors)  report_errors(errors) | 
