# -*- coding: UTF-8 -*-
# Based on the scraper advanced-scraping-pdf
# See also
# https://views.scraperwiki.com/run/pdf-to-html-preview-1/?url=http%3A%2F%2Fwww.stortinget.no%2FGlobal%2Fpdf%2Fpostjournal%2Fpj-2012-05-09.pdf
import scraperwiki
import json
from BeautifulSoup import BeautifulSoup
import datetime
import dateutil.parser
import lxml.html
import mechanize
#import resource
import sys
#import urlparse
#import gc
import re
#lazycache=scraperwiki.swimport('lazycache')
#postlistelib=scraperwiki.swimport('postliste-python-lib')

agency = 'Risør kommune'
# ASPX pages are some of the hardest scraping challenges because they rely on
# JavaScript and form posts to navigate.  Almost always the links go through
# the JavaScript function __doPostBack(eventTarget, eventArgument), which has
# to be simulated with the mechanize form handling library.  This example
# shows how to follow the "Neste" (Next) page link.
# The query asks the ePhorte innsyn module for incoming and outgoing
# documents (DocumentType I and U) recorded during the last 7 days.
url = 'http://159.171.0.169/ris/Modules/innsyn.aspx?mode=pl&SelPanel=0&ObjectType=ePhorteRegistryEntry&VariantType=Innsyn&ViewType=List&Query=RecordDate%3a%28-7%29+AND+DocumentType%3a%28I%2cU%29'
br = mechanize.Browser()
# Some servers refuse requests that do not carry a browser-like User-agent
# header, so present the scraper as an ordinary Firefox
br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
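# Mechanize also honours robots.txt by default.  If the journal server ever
# serves a restrictive robots.txt, the standard mechanize opt-out below could
# be enabled; it is left disabled here as an assumption about a possible
# future need, not something this site currently requires.
#br.set_handle_robots(False)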
response = br.open(url)
html = response.read()
for pagenum in range(6):
    print "Page %d page length %d" % (pagenum, len(html))
    #print html
    #print "Clinicians found:", re.findall("PDetails.aspx\?ProviderId.*?>(.*?)</a>", html)
    # Look for the postback link behind the "Neste" (Next) button
    mnextlink = re.search("javascript:__doPostBack\('ctl00\$ctl00\$ctl00\$WebPartManager\$wp1243460126ViewPart\$ctl04',''\).>Neste", html)
    #print mnextlink
    if not mnextlink:
        break
    # Simulate the __doPostBack() call by filling in the hidden event fields
    # and submitting the page-wide ASP.NET form
    br.select_form(name='aspnetForm')
    br.form.set_all_readonly(False)
    br['__EVENTTARGET'] = 'ctl00$ctl00$ctl00$WebPartManager$wp1243460126ViewPart$ctl04' #'ProviderSearchResultsTable1$NextLinkButton'
    br['__EVENTARGUMENT'] = ''
    br.submit()
    html = br.response().read()
    #print len(html)
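# The loop above only walks the pagination; nothing is extracted yet.  Below
# is a minimal sketch of pulling entries out of one result page with the
# already imported lxml.html.  The table selector and the column order are
# assumptions that must be verified against the real ePhorte markup.
def parse_entries(html):
    root = lxml.html.fromstring(html)
    entries = []
    for row in root.cssselect("table tr"):
        cells = [cell.text_content().strip() for cell in row.cssselect("td")]
        if 2 <= len(cells):  # skip header and decoration rows
            entries.append({'agency': agency, 'recorddate': cells[0], 'title': cells[1]})
    return entries
# Calling parse_entries(html) inside the loop and passing the result to
# scraperwiki.sqlite.save(unique_keys=['recorddate', 'title'], data=entries)
# would persist each page of the journal to the ScraperWiki datastore.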
#print len(html)
# def report_errors(errors):
#     if 0 < len(errors):
#         print "Errors:"
#         for e in errors:
#             print e
#         exit(1)
# def out_of_cpu(arg, spent, hard, soft):
#     report_errors(arg)
#
# def process_pdf(parser, pdfurl, errors):
#     errors = []
#     postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
#     try:
#         pdfcontent = scraperwiki.scrape(pdfurl)
#         parser.preprocess(pdfurl, pdfcontent)
#         pdfcontent = None
# #    except ValueError, e:
# #        errors.append(e)
#     except IndexError, e:
#         errors.append(e)
#
# def process_page_queue(parser, errors):
#     try:
#         parser.process_pages()
#         postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
#     except scraperwiki.CPUTimeExceededError, e:
#         errors.append("Processing pages interrupted")
#
# def process_journal_pdfs(parser, listurl, errors):
# #    print "Finding PDFs on " + listurl
# #    u = urllib.parse.urlparse(listurl)
#     html = scraperwiki.scrape(listurl)
#     root = lxml.html.fromstring(html)
#     html = None
#     for ahref in root.cssselect("table a"):
#         href = ahref.attrib['href']
#         url = urlparse.urljoin(listurl, href)
#         if -1 != href.find("file://"):
# #            print "Skipping non-http URL " + url
#             continue
#         if parser.is_already_scraped(url):
#             True
# #            print "Skipping already scraped " + url
#         else:
# #            print "Will process " + url
#             process_pdf(parser, url, errors)
#
# def test_small_pdfs():
#     # Test with some smaller PDFs
#     errors = []
#     parser = postlistelib.PDFJournalParser(agency=agency)
#     process_pdf(parser, "http://home.nuug.no/~pere/uio-postjournal/2011-16.pdf", errors)
#     process_pdf(parser, "http://home.nuug.no/~pere/uio-postjournal/2011-52.pdf", errors)
#     process_page_queue(parser, errors)
#     report_errors(errors)
#     exit(0)
#
# #test_small_pdfs()
# errors = []
# parser = postlistelib.PDFJournalParser(agency=agency)
# process_journal_pdfs(parser, "http://www.havn.oslo.kommune.no/postjournal/", errors)
# process_page_queue(parser, errors)
# report_errors(errors)
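#
# If the parser based flow above is enabled, a run could end by recording a
# timestamp with the standard ScraperWiki key/value store; the variable name
# 'last_run' is an assumption used for illustration only.
#scraperwiki.sqlite.save_var('last_run', datetime.datetime.now().isoformat())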