author    Petter Reinholdtsen <pere@hungry.com>    2014-12-17 23:31:19 +0100
committer Petter Reinholdtsen <pere@hungry.com>    2014-12-17 23:31:19 +0100
commit    23033cd8b447c53d6f466760a59004bed3c7661b (patch)
tree      c6c933d799cf7cf0bf895ddf10166dc3073a31f3
parent    f0eb29b2e41ec148ebdcf55d04cd2ff4ed3a3ecd (diff)
New scraper for the University of Tromsø. Not yet complete.
-rw-r--r--    scrapersources/postliste-universitetet-i-tromso    95
1 file changed, 95 insertions, 0 deletions
diff --git a/scrapersources/postliste-universitetet-i-tromso b/scrapersources/postliste-universitetet-i-tromso
new file mode 100644
index 0000000..c2553db
--- /dev/null
+++ b/scrapersources/postliste-universitetet-i-tromso
@@ -0,0 +1,95 @@
+# -*- coding: utf-8 -*-
+# YAML-tagger:
+# Type: unknown
+# Status: unfinished
+# Name: Universitetet i Tromsø
+# Format: PDF/HTML
+# Datatype: ePhorte
+# Vendor: Ergo
+# Run: daily
+#
+# The PDF/ePhorte scraper is done, but the new HTML format is not yet
+# handled.
+
+import scraperwiki
+import json
+from BeautifulSoup import BeautifulSoup
+import datetime
+import dateutil.parser
+import lxml.html
+import resource
+import sys
+import urlparse
+import re
+
+# Make sure ScraperWiki believes this is the source for this database
+scraperwiki.scrape("http://uit.no/om/offjour")
+
+lazycache = scraperwiki.swimport('lazycache')
+postlistelib = scraperwiki.swimport('postliste-python-lib')
+
+agency = u'Universitetet i Tromsø'
+
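+# Print any collected exceptions and abort so the run is flagged as failed.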
+def report_errors(errors):
+ if 0 < len(errors):
+ print "Errors:"
+ for e in errors:
+ print e
+ raise ValueError("Something went wrong")
+
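+# Callback used by postlistelib when the CPU allowance is about to run out.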
+def out_of_cpu(arg, spent, hard, soft):
+ report_errors(arg)
+
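+# Download a single PDF and hand it to the parser, collecting any parse errors.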
+def process_pdf(parser, pdfurl, errors):
+ postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
+ try:
+ pdfcontent = scraperwiki.scrape(pdfurl)
+ parser.preprocess(pdfurl, pdfcontent)
+ pdfcontent = None
+ except ValueError, e:
+ errors.append(e)
+ except IndexError, e:
+ errors.append(e)
+
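+# Parse the queued PDF pages into journal entries, noting if CPU time runs out.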
+def process_page_queue(parser, errors):
+ try:
+ parser.process_pages()
+ postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
+ except scraperwiki.CPUTimeExceededError, e:
+ errors.append("Processing pages interrupted")
+
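+# Find PDF links on the journal list page and process those not already scraped.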
+def process_journal_pdfs(parser, listurl, errors):
+# print "Finding PDFs on " + listurl
+# u = urlparse.urlparse(listurl)
+ html = scraperwiki.scrape(listurl)
+ root = lxml.html.fromstring(html)
+ html = None
+ for ahref in root.cssselect("div.nyhArtikkel a"):
+ href = ahref.attrib['href']
+ url = urlparse.urljoin(listurl, href)
+        if "file://" in href or ".pdf" not in url:
+# print "Skipping non-http URL " + url
+ continue
+ if parser.is_already_scraped(url):
+            pass
+# print "Skipping already scraped " + url
+ else:
+# print "Will process " + url
+ process_pdf(parser, url, errors)
+
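+# Manual test helper: process a single small PDF, report the result and exit.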
+def test_small_pdfs(parser):
+ # Test with some smaller PDFs
+ errors = []
+ process_pdf(parser, "http://uit.no/Content/382902/Januar%202011.pdf", errors)
+ process_page_queue(parser, errors)
+ report_errors(errors)
+    sys.exit(0)
+
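+# Main program: set up the PDF journal parser and scrape the current list page.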
+errors = []
+parser = postlistelib.PDFJournalParser(agency=agency)
+
+#test_small_pdfs(parser)
+
+process_journal_pdfs(parser, "http://uit.no/om/enhet/artikkel?p_document_id=382893&p_dimension_id=88216", errors)
+process_page_queue(parser, errors)
+report_errors(errors)