# -*- coding: utf-8 -*-
# YAML-tagger:
# Type: kommune
# Status: finished
# Name: Hadsel kommune
# Format: PDF
# Datatype: ePhorte
# Vendor: Ergo
# Run: daily
# Publish duration: 3 months
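#
# This scraper fetches the public postal journal (offentlig postjournal) PDFs
# published by Hadsel kommune and parses the journal entries using the shared
# postliste-python-lib PDF parser on ScraperWiki.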
import scraperwiki
from BeautifulSoup import BeautifulSoup
import datetime
import dateutil.parser
import lxml.html
import sys
import urlparse
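# Fetch the journal front page once up front; the result is discarded here,
# the PDF links are collected below by process_journal_pdfs().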
scraperwiki.scrape("http://www.hadsel.kommune.no/selvbetjeningskjema-kart-postjournal/offentlig-postjournal")
lazycache=scraperwiki.swimport('lazycache')
postlistelib=scraperwiki.swimport('postliste-python-lib')
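# lazycache and postliste-python-lib are shared helper scrapers pulled in at
# run time with scraperwiki.swimport().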
agency = 'Hadsel kommune'
def report_errors(errors):
    # Print any collected errors and stop the run with a non-zero exit code.
    if 0 < len(errors):
        print "Errors:"
        for e in errors:
            print e
        exit(1)
def no_cpu_left(arg, spent, soft, hard):
    # Callback used by postlistelib when the CPU quota is about to run out.
    report_errors(arg)
def process_pdf(parser, pdfurl, errors):
    # Fetch a single journal PDF (cached) and queue its pages for parsing.
    postlistelib.exit_if_no_cpu_left(0, callback=no_cpu_left, arg=errors)
    try:
        pdfcontent = lazycache.lazycache(pdfurl)
        parser.preprocess(pdfurl, pdfcontent)
    #except ValueError, e:
    #    errors.append(e)
    except IndexError, e:
        errors.append(e)
def process_page_queue(parser, errors):
    # Parse any queued PDF pages into journal entries.
    try:
        parser.process_pages()
        postlistelib.exit_if_no_cpu_left(0, callback=no_cpu_left, arg=errors)
    except scraperwiki.CPUTimeExceededError, e:
        errors.append("Processing pages interrupted")
def consider_url(parser, url, errors):
    # Skip already scraped URLs, otherwise fetch and queue the PDF.
    if parser.is_already_scraped(url):
        pass
        # print "Skipping already scraped " + url
    else:
        # print "Will process " + url
        try:
            process_pdf(parser, url, errors)
        except Exception, e:
            print "Processing PDF on %s failed:" % url, e
def process_journal_pdfs(parser, listurl, errors, recurse):
    # print "Finding PDFs on " + listurl
    html = scraperwiki.scrape(listurl)
    root = lxml.html.fromstring(html)
    html = None
    # Direct document links on the journal list page.
    for ahref in root.cssselect("div.items a.doclink"):
        url = urlparse.urljoin(listurl, ahref.attrib['href'])
        if -1 == url.find("doc_download") or -1 != url.find("docman"):
            continue
        # print url
        consider_url(parser, url, errors)
    # Follow sub-pages linked from the list and pick up their document links too.
    for ahref in root.cssselect("div.item-list a"):
        suburl = urlparse.urljoin(listurl, ahref.attrib['href'])
        # print "sub " + suburl
        subhtml = scraperwiki.scrape(suburl)
        subroot = lxml.html.fromstring(subhtml)
        subhtml = None
        for subahref in subroot.cssselect("div.article a"):
            href = subahref.attrib['href']
            # print href
            subsuburl = urlparse.urljoin(suburl, href)
            # print "subsub " + subsuburl
            if -1 == subsuburl.find("doc_download"):
                continue
            consider_url(parser, subsuburl, errors)
        subroot = None
    if recurse:
        # Visit each pagination link once, without recursing further.
        seen = { listurl : 1 }
        for ahref in root.cssselect("div.pagination a"):
            pageurl = urlparse.urljoin(listurl, ahref.attrib['href'])
            # print "P: " + pageurl
            if pageurl not in seen:
                process_journal_pdfs(parser, pageurl, errors, False)
                seen[pageurl] = 1
def test_parse_case_journal_ref():
    # Exercise the case/journal reference parser on samples seen in the PDFs.
    # Note: parse_case_journal_ref() is not defined in this scraper, so the
    # call below is kept disabled.
    entry = {}
    parse_case_journal_ref(entry, [u'2008/16414-', u'23', u'15060/2012'], "")
    parse_case_journal_ref(entry, [u'2011/15972-1 102773/201', u'1'], "")
    parse_case_journal_ref(entry, [u'2010/2593-2', u'103004/201', u'1'], "")
    parse_case_journal_ref(entry, [u'2011/13415-', u'22', u'100077/201', u'1'], "")
    exit(0)
#test_parse_case_journal_ref()
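# Main run: finish any pages queued from an earlier run, crawl the journal
# list (including pagination) for new PDFs, then process and report.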
errors = []
parser = postlistelib.PDFJournalParser(agency=agency)
process_page_queue(parser, errors)
process_journal_pdfs(parser, "http://www.hadsel.kommune.no/selvbetjeningskjema-kart-postjournal/offentlig-postjournal", errors, True)
process_page_queue(parser, errors)
report_errors(errors)