author     Petter Reinholdtsen <pere@hungry.com>  2014-12-10 21:19:13 +0100
committer  Petter Reinholdtsen <pere@hungry.com>  2014-12-10 21:19:13 +0100
commit     edfec2f9d097268e4db0ceab4c36e7ea51f2bd28 (patch)
tree       cc204246437b62c8ce69a2d9b805f145aa5eaaa2
parent     316499d3b157f625cc8b3ecff9802e6064b82102 (diff)
Add forgotten scraper.
-rw-r--r--  scrapersources/postliste-hoegskolen-i-hedmark  111
1 files changed, 111 insertions, 0 deletions
diff --git a/scrapersources/postliste-hoegskolen-i-hedmark b/scrapersources/postliste-hoegskolen-i-hedmark
new file mode 100644
index 0000000..49887d2
--- /dev/null
+++ b/scrapersources/postliste-hoegskolen-i-hedmark
@@ -0,0 +1,111 @@
+# -*- coding: utf-8 -*-
+# YAML-tagger:
+# Type: høgskole
+# Status: finished
+# Name: Høgskolen i Hedmark
+# Format: PDF
+# Datatype: ePhorte
+# Run: daily
+# Publish duration: 3 months
+
+# Based on the scraper advanced-scraping-pdf
+# See also
+# https://views.scraperwiki.com/run/pdf-to-html-preview-1/?url=http%3A%2F%2Fwww.stortinget.no%2FGlobal%2Fpdf%2Fpostjournal%2Fpj-2012-05-09.pdf
+
+import scraperwiki
+import json
+from BeautifulSoup import BeautifulSoup
+import datetime
+import dateutil.parser
+import lxml.html
+import resource
+import sys
+import urlparse
+import re
+
+# Make sure ScraperWiki believes this is the source of this database.
+scraperwiki.scrape("http://www.hihm.no/Hovedsiden/Om-Hoegskolen/Offentlig-journal")
+
+lazycache = scraperwiki.swimport('lazycache')
+postlistelib = scraperwiki.swimport('postliste-python-lib')
+
+agency = u'Høgskolen i Hedmark'
+
+def report_errors(errors):
+    # Print collected errors and abort with a non-zero exit code.
+    if 0 < len(errors):
+        print "Errors:"
+        for e in errors:
+            print e
+        exit(1)
+
+def out_of_cpu(arg, spent, hard, soft):
+    report_errors(arg)
+
+def process_pdf(parser, pdfurl, errors):
+    # Append problems to the caller's error list instead of rebinding
+    # it, so report_errors() sees them at the end of the run.
+    postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
+    try:
+        pdfcontent = scraperwiki.scrape(pdfurl)
+        parser.preprocess(pdfurl, pdfcontent)
+        pdfcontent = None
+#    except ValueError, e:
+#        errors.append(e)
+    except IndexError, e:
+        errors.append(e)
+
+def process_page_queue(parser, errors):
+    try:
+        parser.process_pages()
+        postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
+    except scraperwiki.CPUTimeExceededError, e:
+        errors.append("Processing pages interrupted")
+
+def process_journal_pdfs(parser, listurl, errors):
+    print "Finding PDFs on " + listurl
+#    u = urllib.parse.urlparse(listurl)
+    html = scraperwiki.scrape(listurl)
+    root = lxml.html.fromstring(html)
+    html = None
+    for ahref in root.cssselect("div.content-view-full a"):
+        href = ahref.attrib['href']
+        url = urlparse.urljoin(listurl, href).replace(" ", "+")
+        # Skip file:// links and anything that does not look like a PDF.
+        if -1 != href.find("file://") or -1 == url.find(".pdf"):
+#            print "Skipping non-http URL " + url
+            continue
+        if parser.is_already_scraped(url):
+#            print "Skipping already scraped " + url
+            continue
+#        print "Will process " + url
+        process_pdf(parser, url, errors)
+
+def find_journal_subpages(baseurl):
+    # The journal index links to one subpage per month; keep the menu
+    # links whose URL ends with a Norwegian month name.
+    urls = []
+    root = lxml.html.fromstring(scraperwiki.scrape(baseurl))
+    months = ("januar", "februar", "mars", "april", "mai", "juni",
+              "juli", "august", "september", "oktober", "november",
+              "desember")
+    for ahref in root.cssselect("ul.menu-list a"):
+        href = ahref.attrib['href']
+        if -1 == href.find("file://") and href.endswith(months):
+            urls.append(urlparse.urljoin(baseurl, href).replace(" ", "+"))
+    return urls
+
+def test_small_pdfs(parser):
+    # Test with some smaller PDFs.
+    errors = []
+    process_pdf(parser, "http://www.hihm.no/content/download/38169/420508/file/search.pdf", errors)
+    process_pdf(parser, "http://www.hihm.no/content/download/39369/430053/file/search.pdf", errors)
+    process_page_queue(parser, errors)
+    report_errors(errors)
+    exit(0)
+
+errors = []
+parser = postlistelib.PDFJournalParser(agency=agency)
+
+#test_small_pdfs(parser)
+
+#process_journal_pdfs(parser, "http://www.hihm.no/Hovedsiden/Om-Hoegskolen/Offentlig-journal/mai", errors)
+
+for url in find_journal_subpages("http://www.hihm.no/Hovedsiden/Om-Hoegskolen/Offentlig-journal"):
+    process_journal_pdfs(parser, url, errors)
+
+process_page_queue(parser, errors)
+report_errors(errors)
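
The scraper depends on helpers from postliste-python-lib (pulled in via scraperwiki.swimport) whose source is not part of this diff, most visibly exit_if_no_cpu_left(), which is called before each PDF download and after page processing so that a run about to exhaust ScraperWiki's CPU budget can still report the errors gathered so far. As a rough illustration only, here is a minimal sketch of how such a guard could be built on the resource module the scraper already imports; the limit values and the meaning of the first parameter are assumptions, not the library's actual implementation:

    # Hypothetical sketch of a CPU-budget guard in the spirit of
    # postlistelib.exit_if_no_cpu_left(); not the real library code.
    import resource
    import sys

    def exit_if_no_cpu_left(headroom, callback, arg, soft=75.0, hard=80.0):
        # CPU seconds consumed so far by this process (user + system).
        usage = resource.getrusage(resource.RUSAGE_SELF)
        spent = usage.ru_utime + usage.ru_stime
        if soft - headroom < spent:
            # Give the caller a chance to report what it has, then stop.
            callback(arg, spent, hard, soft)
            sys.exit(1)

The sketch matches the call sites above, postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors): when the budget runs out, out_of_cpu(arg, spent, hard, soft) hands the error list to report_errors(), which prints it and exits.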