# -*- coding: utf-8 -*-
# YAML-tagger:
# Type: statlig
# Status: broken
# Name: Stortinget
# Format: PDF
# Datatype: ePhorte
# Vendor: Ergo
# Run: notyet
# The journal is no longer published as PDF, but as a web search
# service, see
# http://einnsyn.stortinget.no/einnsyn/RegistryEntry/DocumentDetails/74000
# for an example.
# Based on the scraper advanced-scraping-pdf
# See also
# https://views.scraperwiki.com/run/pdf-to-html-preview-1/?url=http%3A%2F%2Fwww.stortinget.no%2FGlobal%2Fpdf%2Fpostjournal%2Fpj-2012-05-09.pdf
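#
# Overall flow: collect the journal PDF URLs from the overview pages on
# www.stortinget.no, fetch and preprocess each PDF using the shared
# postliste-python-lib helpers, then parse the preprocessed pages into
# journal entries in the ScraperWiki datastore.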
import scraperwiki
from BeautifulSoup import BeautifulSoup
import datetime
import dateutil.parser
import lxml.html
import urlparse
import resource
import sys
# Shared helper code for the postliste scrapers, imported from ScraperWiki.
postlistelib = scraperwiki.swimport('postliste-python-lib')

# Find links to journal PDFs on one of the overview pages on
# www.stortinget.no, skipping URLs that have already been scraped.
def find_journal_pdfs(parser, listurl):
    # print "Finding PDFs on " + listurl
    html = postlistelib.fetch_url_harder(listurl)
    root = lxml.html.fromstring(html)
    pdfurls = []
    for ahref in root.cssselect("div.mainbody a"):
        href = ahref.attrib['href']
        url = urlparse.urljoin(listurl, href)
        if -1 != href.find("file://"):
            # print "Skipping non-http URL " + url
            continue
        if parser.is_already_scraped(url):
            pass
            # print "Skipping already scraped " + url
        else:
            # print "Will process " + url
            pdfurls.append(url)
    return pdfurls

# Fetch a single journal PDF and hand it to the parser for preprocessing.
def fetch_and_preprocess(parser, pdfurl):
    pdfcontent = postlistelib.fetch_url_harder(pdfurl)
    parser.preprocess(pdfurl, pdfcontent)
    pdfcontent = None

# Collect journal PDF URLs from the overview page of each parliamentary
# session period.
def add_pdf_lists(parser, pdfurls):
    for period in [
        "",
        "_2010-2011",
        "-2009-2010",
        "-2008-2009",
        ]:
        url = "http://www.stortinget.no/no/Stortinget-og-demokratiet/Administrasjonen/Dokumentoffentlighet/Stortingets-offentlige-postjournal" + period + "/"
        pdfurls.extend(find_journal_pdfs(parser, url))

# Print any collected errors and abort, so the run is flagged as failed.
def report_errors(errors):
    if 0 < len(errors):
        print "Errors:"
        for e in errors:
            print e
        raise ValueError("Errors found while parsing, giving up")

# Callback invoked when the ScraperWiki CPU allowance is about to run out.
def no_cpu_left(arg, spent, soft, hard):
    report_errors(arg)

agency = 'Stortinget'

parser = postlistelib.PDFJournalParser(agency=agency)
#parser.debug = True

# Disabled debug block for processing a single journal PDF by hand.
if False:
    pdfurl = "http://www.stortinget.no/Global/pdf/postjournal/pj-2010-06-04-05.pdf"
    parser.fetch_and_preprocess(pdfurl)
    exit(0)

pdfurls = []
add_pdf_lists(parser, pdfurls)

# Fetch all journal PDFs
errors = []
for pdfurl in pdfurls:
    postlistelib.exit_if_no_cpu_left(0, callback=no_cpu_left, arg=errors)
    try:
        parser.fetch_and_preprocess(pdfurl)
    except ValueError, e:
        errors.append(e)
    except IndexError, e:
        errors.append(e)

# Parse the preprocessed pages and store the journal entries in the
# ScraperWiki datastore.
try:
    parser.process_pages()
except ValueError, e:
    errors.append(e)
except IndexError, e:
    errors.append(e)

report_errors(errors)