-rw-r--r-- | scrapersources/postliste-universitetet-i-tromso | 95
1 files changed, 95 insertions, 0 deletions
diff --git a/scrapersources/postliste-universitetet-i-tromso b/scrapersources/postliste-universitetet-i-tromso
new file mode 100644
index 0000000..c2553db
--- /dev/null
+++ b/scrapersources/postliste-universitetet-i-tromso
@@ -0,0 +1,95 @@
+# -*- coding: utf-8 -*-
+# YAML-tagger:
+#  Type: unknown
+#  Status: unfinished
+#  Name: Universitetet i Tromsø
+#  Format: PDF/HTML
+#  Datatype: ePhorte
+#  Vendor: Ergo
+#  Run: daily
+#
+# The PDF/ePhorte scraper is done, but the new HTML format is not yet
+# handled.
+
+import scraperwiki
+import json
+from BeautifulSoup import BeautifulSoup
+import datetime
+import dateutil.parser
+import lxml.html
+import resource
+import sys
+import urlparse
+import re
+
+# Make sure ScraperWiki believes this is the source URL for this scraper
+scraperwiki.scrape("http://uit.no/om/offjour")
+
+lazycache = scraperwiki.swimport('lazycache')
+postlistelib = scraperwiki.swimport('postliste-python-lib')
+
+agency = u'Universitetet i Tromsø'
+
+def report_errors(errors):
+    if errors:
+        print "Errors:"
+        for e in errors:
+            print e
+        raise ValueError("Something went wrong")
+
+def out_of_cpu(arg, spent, hard, soft):
+    report_errors(arg)
+
+def process_pdf(parser, pdfurl, errors):
+    postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
+    try:
+        pdfcontent = scraperwiki.scrape(pdfurl)
+        parser.preprocess(pdfurl, pdfcontent)
+        pdfcontent = None
+    except ValueError, e:
+        errors.append(e)
+    except IndexError, e:
+        errors.append(e)
+
+def process_page_queue(parser, errors):
+    try:
+        parser.process_pages()
+        postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
+    except scraperwiki.CPUTimeExceededError, e:
+        errors.append("Processing pages interrupted")
+
+def process_journal_pdfs(parser, listurl, errors):
+#    print "Finding PDFs on " + listurl
+#    u = urlparse.urlparse(listurl)
+    html = scraperwiki.scrape(listurl)
+    root = lxml.html.fromstring(html)
+    html = None
+    for ahref in root.cssselect("div.nyhArtikkel a"):
+        href = ahref.attrib['href']
+        url = urlparse.urljoin(listurl, href)
+        if "file://" in href or ".pdf" not in url:
+#            print "Skipping non-http URL " + url
+            continue
+        if parser.is_already_scraped(url):
+            # Already in the database, nothing to do
+#            print "Skipping already scraped " + url
+            continue
+#        print "Will process " + url
+        process_pdf(parser, url, errors)
+
+def test_small_pdfs(parser):
+    # Test with some smaller PDFs
+    errors = []
+    process_pdf(parser, "http://uit.no/Content/382902/Januar%202011.pdf", errors)
+    process_page_queue(parser, errors)
+    report_errors(errors)
+    sys.exit(0)
+
+errors = []
+parser = postlistelib.PDFJournalParser(agency=agency)
+
+#test_small_pdfs(parser)
+
+process_journal_pdfs(parser, "http://uit.no/om/enhet/artikkel?p_document_id=382893&p_dimension_id=88216", errors)
+process_page_queue(parser, errors)
+report_errors(errors)
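The file header notes that the new HTML journal format is not handled yet. For illustration only, here is a minimal sketch of what such a handler could look like, following the same scrape-parse pattern as process_journal_pdfs. It assumes the HTML journal is rendered as a table with date, description, case id and document type columns; the table.postjournal selector, the column order and the field names are assumptions about a page that is not shown in this commit, not the actual uit.no markup.

# Hypothetical handler for the unhandled HTML postliste format.
# The CSS selector, column layout and record fields are assumptions.
def process_html_journal(parser, listurl, errors):
    html = scraperwiki.scrape(listurl)
    root = lxml.html.fromstring(html)
    for row in root.cssselect("table.postjournal tr"):
        cells = [cell.text_content().strip() for cell in row.cssselect("td")]
        if len(cells) < 4:
            continue  # header or malformed row
        try:
            entry = {
                'agency': agency,
                'recorddate': dateutil.parser.parse(cells[0], dayfirst=True),
                'docdesc': cells[1],
                'caseid': cells[2],
                'doctype': cells[3],
                'scrapedurl': listurl,
            }
            scraperwiki.sqlite.save(unique_keys=['caseid'], data=entry)
        except ValueError, e:
            errors.append(e)

The real record schema expected by postliste-python-lib is likely richer than this; the sketch only shows the scrape, parse and save flow for a tabular HTML journal.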