# -*- coding: utf-8 -*-
# YAML-tagger:
#  Type: university
#  Status: finished
#  Name: Universitetet i Oslo
#  Format: PDF
#  Datatype: ePhorte
#  Vendor: Ergo
#  Run: daily

# Based on the scraper advanced-scraping-pdf
# See also
# https://views.scraperwiki.com/run/pdf-to-html-preview-1/?url=http%3A%2F%2Fwww.stortinget.no%2FGlobal%2Fpdf%2Fpostjournal%2Fpj-2012-05-09.pdf

import scraperwiki
import json
from BeautifulSoup import BeautifulSoup
import datetime
import dateutil.parser
import lxml.html
import resource
import sys
import urlparse
import re

# Make sure ScraperWiki believes this URL is the source for this database
scraperwiki.scrape("http://www.uio.no/om/journal/")

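# Load helper libraries published as separate ScraperWiki scrapers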
lazycache = scraperwiki.swimport('lazycache')
postlistelib = scraperwiki.swimport('postliste-python-lib')

agency = 'Universitetet i Oslo'

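# Print any collected errors and raise an exception so the run is marked as failed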
def report_errors(errors):
    if errors:
        print "Errors:"
        for e in errors:
            print e
        raise ValueError("Something went wrong")

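# Callback invoked by postlistelib when the CPU quota is nearly exhausted;
# report any errors collected so far before the run is stopped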
def out_of_cpu(arg, spent, hard, soft):
    report_errors(arg)

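# Download a single journal PDF and hand it to the parser for preprocessing.
# Parse failures are collected in errors instead of aborting the whole run.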
def process_pdf(parser, pdfurl, errors):
    postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
    try:
        pdfcontent = scraperwiki.scrape(pdfurl)
        parser.preprocess(pdfurl, pdfcontent)
        pdfcontent = None
    except ValueError, e:
        errors.append(e)
    except IndexError, e:
        errors.append(e)

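# Parse all preprocessed PDF pages into journal entries; if the CPU quota
# runs out, the interruption is recorded as an error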
def process_page_queue(parser, errors):
    try:
        parser.process_pages()
        postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
    except scraperwiki.CPUTimeExceededError, e:
        errors.append("Processing pages interrupted")

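# Find PDF links in the table on the journal list page and process those
# that have not been scraped before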
def process_journal_pdfs(parser, listurl, errors):
#    print "Finding PDFs on " + listurl
#    u = urllib.parse.urlparse(listurl)
    html = scraperwiki.scrape(listurl)
    root = lxml.html.fromstring(html)
    html = None
    for ahref in root.cssselect("table a"):
        href = ahref.attrib['href']
        url = urlparse.urljoin(listurl, href)
        if -1 != href.find("file://") or -1 == url.find(".pdf"):
#            print "Skipping non-http URL " + url
            continue
        if parser.is_already_scraped(url):
            pass
#            print "Skipping already scraped " + url
        else:
#            print "Will process " + url
            process_pdf(parser, url, errors)

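# Walk the paginated year directory (via the Vortex paging links) and process
# every PDF found, skipping "M%" duplicates and entries already scraped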
def process_journal_pdf_directory(parser, listurl, errors):
    #html = scraperwiki.scrape(listurl)
    html = lazycache.lazycache(listurl)
    root = lxml.html.fromstring(html)
    html = None

    pdflisturls = []
    for ahref in root.cssselect("span.vrtx-paging-wrapper a"):
        href = ahref.attrib['href']
        url = urlparse.urljoin(listurl, href)
        pdflisturls.append(url)
#    print pdflisturls

    for listurl in pdflisturls:
        html = scraperwiki.scrape(listurl)
        root = lxml.html.fromstring(html)
        html = None
        urlseen = {}
        for ahref in root.cssselect("div.vrtx-resource a"):
            href = ahref.attrib['href']
            url = urlparse.urljoin(listurl, href)
            if -1 == url.find(".pdf"):
                continue
            # Ignore duplicates with M: as part of the name
            if -1 != url.find("/M%"):
                continue
            if url in urlseen or parser.is_already_scraped(url):
                pass
#                print "Skipping already scraped " + url
            else:
#                print "Will process " + url
                process_pdf(parser, url, errors)
            urlseen[url] = 1

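# Manual test run against two small sample PDFs, useful when debugging the parser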
def test_small_pdfs(parser):
    # Test with some smaller PDFs
    errors = []
    process_pdf(parser, "http://home.nuug.no/~pere/uio-postjournal/2011-16.pdf", errors)
    process_pdf(parser, "http://home.nuug.no/~pere/uio-postjournal/2011-52.pdf", errors)
    process_page_queue(parser, errors)
    report_errors(errors)
    sys.exit(0)

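# Main run: scrape the journal front page; the directory variants below can be
# enabled to backfill the 2011/2012 archives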
errors = []
parser = postlistelib.PDFJournalParser(agency=agency)

#test_small_pdfs(parser)

process_journal_pdfs(parser, "http://www.uio.no/om/journal/", errors)
#process_journal_pdf_directory(parser, "http://www.uio.no/om/journal/2012/", errors)
#process_journal_pdf_directory(parser, "http://www.uio.no/om/journal/2011/", errors)
process_page_queue(parser, errors)
report_errors(errors)