-rw-r--r--  scrapersources/postliste-difi                       |   8
-rw-r--r--  scrapersources/postliste-hoegskolen-i-gjoevik       |  14
-rw-r--r--  scrapersources/postliste-hoegskolen-i-lillehammer   |   4
-rw-r--r--  scrapersources/postliste-hoegskolen-i-volda         |   3
-rw-r--r--  scrapersources/postliste-lenvik                     | 173
-rw-r--r--  scrapersources/postliste-met                        |   4
-rw-r--r--  scrapersources/postliste-naroy                      |   7
-rw-r--r--  scrapersources/postliste-oep                        |  34
-rw-r--r--  scrapersources/postliste-oep-deliverydates          |   4
-rw-r--r--  scrapersources/postliste-ruter                      |   1
10 files changed, 232 insertions(+), 20 deletions(-)
diff --git a/scrapersources/postliste-difi b/scrapersources/postliste-difi
index dfc986f..459327b 100644
--- a/scrapersources/postliste-difi
+++ b/scrapersources/postliste-difi
@@ -54,17 +54,17 @@ def process_journal_pdfs(parser, listurl, errors):
     html = scraperwiki.scrape(listurl)
     root = lxml.html.fromstring(html)
     html = None
-    for ahref in root.cssselect("div.body a"):
+    for ahref in root.cssselect("div.sixcol a"):
         href = ahref.attrib['href']
         url = urlparse.urljoin(listurl, href)
         if -1 != href.find("file://") or -1 == url.find(".pdf"):
-#            print "Skipping non-http URL " + url
+            print "Skipping non-http URL " + url
             continue
         if parser.is_already_scraped(url):
             True
-#            print "Skipping already scraped " + url
+            print "Skipping already scraped " + url
         else:
-#            print "Will process " + url
+            #print "Will process " + url
             process_pdf(parser, url, errors)
 
 def test_small_pdfs(parser):
diff --git a/scrapersources/postliste-hoegskolen-i-gjoevik b/scrapersources/postliste-hoegskolen-i-gjoevik
index d4f7931..cdf007e 100644
--- a/scrapersources/postliste-hoegskolen-i-gjoevik
+++ b/scrapersources/postliste-hoegskolen-i-gjoevik
@@ -60,13 +60,15 @@ def process_journal_pdfs(parser, listurl, errors):
     html = scraperwiki.scrape(listurl)
     root = lxml.html.fromstring(html)
     html = None
-    for ahref in root.cssselect("div.spalte-inner a"):
+    for ahref in root.cssselect("section a"):
         href = ahref.attrib['href']
         url = urlparse.urljoin(listurl, href).replace(" ", "+")
+        #print url
         if -1 != href.find("file://") or -1 == url.find(".pdf"):
 #            print "Skipping non-http URL " + url
             continue
         if parser.is_already_scraped(url):
+            #print "Scraped: %s" % url
             True
 #            print "Skipping already scraped " + url
         else:
@@ -98,6 +100,16 @@ endYear=datetime.datetime.now().year
 
 for year in range(startYear, endYear+1): # range goes from startyear to endYear-1
     process_journal_pdfs(parser, "http://www.hig.no/om_hig/offentleg_journal/%d" % year, errors)
+process_page_queue(parser, errors)
 
 report_errors(errors)
 
+warningQuery = "recorddate as lastupdate from 'swdata' order by recorddate DESC limit 1";
+result = scraperwiki.sqlite.select(warningQuery)
+now=datetime.datetime.today()
+then=datetime.datetime.strptime(result[0]['lastupdate'],"20%y-%m-%dT%H:%M:%S")
+
+if (now-then).days > 14:
+    print "warning"
+    warningURL = "http://hild1.no/~hildenae/files/dynamic/run.php?scraper=postliste-hoegskolen-i-gjoevik&reason=7days";
+    scraperwiki.scrape(warningURL)
\ No newline at end of file
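The tail added to postliste-hoegskolen-i-gjoevik above is a staleness alarm: read the newest recorddate from the scraper's own swdata table and ping a notification URL when nothing new has arrived for more than 14 days. Below is a minimal standalone sketch of the same idea, assuming the ScraperWiki classic environment, a 'swdata' table whose recorddate column holds ISO 8601 timestamps, and the notification endpoint hard-coded in the diff; the helper name warn_if_stale is made up for illustration.

# Sketch only -- mirrors the staleness check added in the diff above.
import datetime
import scraperwiki

def warn_if_stale(scrapername, maxdays=14):
    # Newest record date stored by this scraper (select() prepends "select ").
    rows = scraperwiki.sqlite.select(
        "recorddate as lastupdate from 'swdata' order by recorddate DESC limit 1")
    if not rows or not rows[0]['lastupdate']:
        return  # nothing scraped yet, nothing to warn about
    last = datetime.datetime.strptime(rows[0]['lastupdate'][:19], "%Y-%m-%dT%H:%M:%S")
    if (datetime.datetime.today() - last).days > maxdays:
        print "warning: no new journal entries for more than %d days" % maxdays
        # Hypothetical notification hook, same URL pattern as in the diff.
        scraperwiki.scrape("http://hild1.no/~hildenae/files/dynamic/run.php"
                           "?scraper=%s&reason=%ddays" % (scrapername, maxdays))

warn_if_stale("postliste-hoegskolen-i-gjoevik")

Unlike the code in the diff, this variant guards against an empty table and trims any fractional seconds before parsing the timestamp.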
diff --git a/scrapersources/postliste-hoegskolen-i-lillehammer b/scrapersources/postliste-hoegskolen-i-lillehammer
index 5337521..5687ece 100644
--- a/scrapersources/postliste-hoegskolen-i-lillehammer
+++ b/scrapersources/postliste-hoegskolen-i-lillehammer
@@ -64,9 +64,9 @@ def process_journal_pdfs(parser, listurl, errors):
             continue
         if parser.is_already_scraped(url):
             True
-#            print "Skipping already scraped " + url
+            print "Skipping already scraped " + url
         else:
-#            print "Will process " + url
+            print "Will process " + url
             process_pdf(parser, url, errors)
 
 def test_small_pdfs(parser):
diff --git a/scrapersources/postliste-hoegskolen-i-volda b/scrapersources/postliste-hoegskolen-i-volda
index 0106cb7..d8f3686 100644
--- a/scrapersources/postliste-hoegskolen-i-volda
+++ b/scrapersources/postliste-hoegskolen-i-volda
@@ -53,11 +53,12 @@ def process_journal_pdfs(parser, listurl, errors):
     html = scraperwiki.scrape(listurl)
     root = lxml.html.fromstring(html)
     html = None
-    for ahref in root.cssselect("div.inside a"):
+    for ahref in root.cssselect("div#maincontent a"):
         if 'id' not in ahref.attrib or -1 == ahref.attrib['id'].find("archiveimage_"):
             continue
         href = ahref.attrib['href']
         url = urlparse.urljoin(listurl, href)
+        #print "found url %s" %url
         if -1 != href.find("file://"):
 #            print "Skipping non-http URL " + url
             continue
diff --git a/scrapersources/postliste-lenvik b/scrapersources/postliste-lenvik
new file mode 100644
index 0000000..66a502d
--- /dev/null
+++ b/scrapersources/postliste-lenvik
@@ -0,0 +1,173 @@
+# -*- coding: utf-8 -*-
+
+import scraperwiki
+import urllib2
+import lxml.html
+import re
+import dateutil.parser
+from dateutil.relativedelta import relativedelta
+import datetime
+import urlparse
+
+agency = "Lenvik kommune"
+
+# Point scraperwiki GUI to the start page
+scraperwiki.scrape("http://webway.lenvik.kommune.no/postjournal")
+
+postlistelib=scraperwiki.swimport('postliste-python-lib')
+parser = postlistelib.JournalParser(agency=agency)
+
+def saver(unique_keys, data):
+#    return
+    #print "Not saving data"
+    scraperwiki.sqlite.save(unique_keys, data)
+
+def expand_year(year):
+    year = int(year)
+    if year > 50:
+        year = year + 1900
+    else:
+        year = year + 2000
+    return year
+
+# <tr class=yang>
+#   <td>13/00563-001</td>
+#   <td>04.03.2013</td>
+#   <td style="text-align:center;">
+#     <div title="Inngående">I</div>
+#   </td>
+#   <td>Flytting av VPN-tunell </td>
+#   <td>EVRY AS</td>
+#   <td>Jan-Eirik Nordahl</td>
+#   <td>
+#     <a href="/dokumentbestilling?jpid=13003566" title="Klikk for å bestille innsyn">Bestill</a>
+#   </td>
+#   <td></td>
+#
+# </tr>
+#
+
+def fetch_postjournal_day(parser, url, html, saver):
+    root = lxml.html.fromstring(html.decode('utf-8'))
+
+    recorddate = None
+    for div in root.cssselect('div'):
+        divcontent = div.text_content()
+        if 0 == divcontent.find("Offentlig postjournal for "):
+            recorddate = dateutil.parser.parse(divcontent.replace("Offentlig postjournal for ",""), dayfirst=True)
+            print recorddate
+
+    # Make sure we save the entire URL or nothing at all
+    datastore = []
+    for tr in root.cssselect('tr.yang'):
+        tds = tr.cssselect("td")
+        docidstr = tds[0].text_content().strip()
+        docdate = tds[1].text_content().strip()
+        doctype = tds[2].text_content().strip()
+        docdesc = tds[3].text_content().strip()
+        fratil = tds[4].text_content().strip()
+        saksbehandler = tds[5].text_content().strip()
+        if -1 != tds[6].text_content().find("Bestill"):
+            exemption = None
+        else:
+            exemption = tds[6].text_content().strip()
+
+        docdate = dateutil.parser.parse(docdate, dayfirst=True)
+
+#        print doctype, docdesc
+        if not parser.is_valid_doctype(doctype):
+            doctype = {
+                '' : '?',
+            }[doctype]
+        if parser.is_sender_doctype(doctype):
+            fratilfield = 'sender'
+        elif parser.is_recipient_doctype(doctype):
+            fratilfield = 'recipient'
+
+        caseyear, caseseqnr = docidstr.split("/")
+        caseyear = expand_year(caseyear)
+        caseseqnr, casedocseq = caseseqnr.split("-")
+        caseid = "%d/%d" % (int(caseyear), int(caseseqnr))
+
+        data = {
+            'agency' : parser.agency,
+            'recorddate' : recorddate.date(),
+            'docdate' : docdate.date(),
+            'docdesc' : docdesc,
+            'casedesc' : docdesc, # FIXME fake value
+
+            'caseyear' : int(caseyear),
+            'caseseqnr' : int(caseseqnr),
+            'casedocseq' : int(casedocseq),
+            'caseid' : caseid,
+            'doctype' : doctype,
+
+#            'journalseqnr' : int(journalseqnr),
+#            'journalyear' : int(journalyear),
+#            'journalid' : journalid,
+            fratilfield : fratil,
+
+            'saksbehandler' : saksbehandler,
+#            'saksansvarlig' : saksansvarlig.strip(),
+#            'saksansvarligenhet' : saksansvarligenhet.strip(),
+
+            'docidstr' : docidstr,
+#            'laapenr' : laapenr,
+            'exemption' : exemption,
+
+            'scrapedurl' : url,
+            'scrapestamputc' : datetime.datetime.now()
+            }
+
+#        print data
+        parser.verify_entry(data)
+        datastore.append(data)
+
+    seenurl = {}
+    # Find next URL. There are two on each page.
+    for ahref in root.cssselect('a.next_page'):
+        if 0 == ahref.text_content().find('Neste'):
+            nexturl = urlparse.urljoin(url, ahref.attrib['href'])
+            if nexturl not in seenurl:
+                seenurl[nexturl] = True;
+                print 'Fetching ' + nexturl
+                html = postlistelib.fetch_url_harder(nexturl)
+                mysaver = lambda unique_keys, data: datastore.extend(data)
+                fetch_postjournal_day(parser=parser, url=nexturl, html=html,
+                                      saver=mysaver)
+
+    saver(unique_keys=['docidstr'], data=datastore)
+
+def date2url(date):
+    return 'http://webway.lenvik.kommune.no/?date=%s' % date
+
+def gen_date_urls(urllist, startdate, step, count):
+    d = dateutil.parser.parse(startdate, dayfirst=False)
+    for n in xrange(1, step*(count+1), step):
+        next = (d + relativedelta(days=n)).strftime("%Y-%m-%d")
+        urllist.append(date2url(next))
+
+urllist = []
+today = datetime.date.today()
+try:
+    first = scraperwiki.sqlite.select("min(recorddate) as min from swdata")[0]['min']
+    last = scraperwiki.sqlite.select("max(recorddate) as max from swdata")[0]['max']
+except:
+    last = (today + relativedelta(days=-14)).strftime("%Y-%m-%d")
+    first = None
+
+print first, last
+
+# Parse back in time
+if first is not None:
+    gen_date_urls(urllist, first, -1, 100)
+
+# Parse forward in time
+if last is not None:
+    gen_date_urls(urllist, last, 1, 3)
+
+for dayurl in urllist:
+    print 'Fetching ' + dayurl
+    html = postlistelib.fetch_url_harder(dayurl)
+    fetch_postjournal_day(parser=parser, url=dayurl, html=html, saver=saver)
+
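The new postliste-lenvik scraper above visits one day page per date: it looks up the oldest and newest recorddate already in swdata, then queues 100 days further back and 3 days forward. Below is a compact sketch of that windowing logic in isolation; date2url copies the URL pattern from the scraper, but unlike the original gen_date_urls this version returns the list instead of appending to one passed in.

# Sketch only -- the date-window crawl used by postliste-lenvik above.
import dateutil.parser
from dateutil.relativedelta import relativedelta

def date2url(date):
    return 'http://webway.lenvik.kommune.no/?date=%s' % date

def gen_date_urls(startdate, step, count):
    # step=-1 walks back in time, step=1 walks forward.
    d = dateutil.parser.parse(startdate, dayfirst=False)
    return [date2url((d + relativedelta(days=n)).strftime("%Y-%m-%d"))
            for n in xrange(1, step * (count + 1), step)]

# Example: the database already covers 2012-08-01 .. 2012-09-01.
first, last = "2012-08-01", "2012-09-01"
urls = gen_date_urls(first, -1, 100) + gen_date_urls(last, 1, 3)
print urls[0]   # http://webway.lenvik.kommune.no/?date=2012-08-02
print urls[-1]  # http://webway.lenvik.kommune.no/?date=2012-09-04

Note that xrange starts at 1, so the backward walk first visits the day after `first` (n=1) and then counts down past it; that matches the behaviour of the scraper above.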
diff --git a/scrapersources/postliste-met b/scrapersources/postliste-met
index 02c53ca..d769e97 100644
--- a/scrapersources/postliste-met
+++ b/scrapersources/postliste-met
@@ -58,11 +58,11 @@ def process_journal_pdfs(parser, listurl, errors):
         href = ahref.attrib['href']
         url = urlparse.urljoin(listurl, href)
         if -1 != href.find("file://") or -1 == url.find("=File.getFile;"):
-#            print "Skipping non-http URL " + url
+            print "Skipping non-http URL " + url
             continue
         if parser.is_already_scraped(url):
             True
-#            print "Skipping already scraped " + url
+            print "Skipping already scraped " + url
         else:
 #            print "Will process " + url
             process_pdf(parser, url, errors)
diff --git a/scrapersources/postliste-naroy b/scrapersources/postliste-naroy
index b8fa33b..f47adb3 100644
--- a/scrapersources/postliste-naroy
+++ b/scrapersources/postliste-naroy
@@ -59,7 +59,12 @@ def process_journal_pdfs(parser, listurl, errors):
             continue
         # Special case, file indicating no journal entries this day
         if "http://www.naroy.kommune.no/NK/Intern.nsf/FilA/CA6C83764E56DDCBC1257A02003F9025/$FILE/Postjournal+11.05.12.pdf" == url or \
-           "http://www.naroy.kommune.no/NK/Intern.nsf/FilA/7FD82A18C1A1F137C12579F90029DEBD/$FILE/Postjournal+07.05.12.pdf" == url:
+           "http://www.naroy.kommune.no/NK/Intern.nsf/FilA/7FD82A18C1A1F137C12579F90029DEBD/$FILE/Postjournal+07.05.12.pdf" == url or \
+           "http://www.naroy.kommune.no/NK/Intern.nsf/FilA/777B497BB48936ACC1257A450033E1D4/$FILE/Postjournal+20.07.12.pdf" == url or \
+           "http://www.naroy.kommune.no/NK/Intern.nsf/FilA/1802A0FF57C08EFEC1257A4500337345/$FILE/Postjournal+16.07.12.pdf" == url or \
+           "http://www.naroy.kommune.no/NK/Intern.nsf/FilA/90373A38701C27E5C1257A45002F63FD/$FILE/Postjournal+12.07.12.pdf" == url or \
+           "http://www.naroy.kommune.no/NK/Intern.nsf/FilA/6B00A3BD92B3C2AEC1257A45002F4044/$FILE/Postjournal+10.07.12.pdf" == url or \
+           "http://www.naroy.kommune.no/NK/Intern.nsf/FilA/0141B5488D38B8FEC1257A44003756ED/$FILE/Postjournal+06.07.12.pdf" == url:
             continue
         if parser.is_already_scraped(url):
             True
diff --git a/scrapersources/postliste-oep b/scrapersources/postliste-oep
index 360ab91..bcfde1b 100644
--- a/scrapersources/postliste-oep
+++ b/scrapersources/postliste-oep
@@ -9,7 +9,7 @@ import httplib
 import urllib2
 
 # Try several times as the database get bigger
-writetries = 6
+writetries = 8
 
 # http://www.oep.no/search/resultSingle.html?journalPostId=1000000
 # http://www.oep.no/search/resultSingle.html?journalPostId=3889259
@@ -102,23 +102,31 @@ def url_from_id(id):
     return "http://www.oep.no/search/resultSingle.html?journalPostId=" + str(id)
 
 def save(data):
+    problem = False
     for run in range(0,writetries):
         try:
             scraperwiki.sqlite.save(unique_keys=['journalPostId'], data=data)
+            if problem:
+                print "Sqlite write succeeded"
             return
         except scraperwiki.sqlite.SqliteError, e:
-            print "Sqlite write error, trying again"
+            print "Sqlite write error, trying again: " + str(e)
             time.sleep(22)
+            problem = True
     raise scraperwiki.sqlite.SqliteError("Unable to write to database, tried " + str(writetries) + " times")
 
 def save_var(var, data):
+    problem = False
     for run in range(0,writetries):
         try:
             scraperwiki.sqlite.save_var(var, data)
+            if problem:
+                print "Sqlite write succeeded"
             return
         except scraperwiki.sqlite.SqliteError, e:
-            print "Sqlite write error, trying again"
+            print "Sqlite write error, trying again: " + str(e)
             time.sleep(22)
+            problem = True
     raise scraperwiki.sqlite.SqliteError("Unable to write variable " + var + " to database, tried " + str(writetries) + " times")
 
 fieldmap = {
@@ -177,9 +185,8 @@ def fetch_oep_entry(id, datastorage):
 #    scraperwiki.sqlite.save(unique_keys=['journalPostId'], data=data)
     return 0
 
-def fetch_range(first, last, step):
+def fetch_range(datastorage, first, last, step):
     myskiplimit = skiplimit
-    datastorage = []
     skipped = 0
     fetched = 0
     min_id = first
@@ -312,6 +319,16 @@ def remove_original():
 print "Starting to fetch journal entries " + str(datetime.datetime.now())
 scraperwiki.scrape("http://www.oep.no/")
 
+datastorage = []
+
+# Update entries to handle <URL: https://rt.nuug.no:443/Ticket/Display.html?id=6342 >.
+# Used 2012-09-17
+#scraperwiki.sqlite.execute("DELETE from swdata where journalPostId = 638167")
+#fetch_oep_entry(638167, datastorage)
+#scraperwiki.sqlite.execute("DELETE from swdata where journalPostId = 638104")
+#fetch_oep_entry(638104, datastorage)
+#scraperwiki.sqlite.commit()
+
 count = 10000
 skiplimit = 500
 # Random value fairly close to the most recent ID when this project started 2012-05-03
@@ -320,20 +337,21 @@ try:
     max = scraperwiki.sqlite.select("max(journalPostId) as max from swdata")[0]["max"]
     if 0 < scraperwiki.sqlite.get_var('min_tested_id'):
         saved_min = scraperwiki.sqlite.get_var('min_tested_id')
+    else:
+        saved_min = 0
     sql_min = scraperwiki.sqlite.select("min(journalPostId) as min from swdata")[0]["min"]
     print "Saved min: " + str(saved_min) + ", sql min: " + str(sql_min)
     if sql_min < saved_min:
         min = sql_min
     else:
         min = saved_min
-    print "Scraping " + str(count) + " IDs below " + str(min) + " and above " + str(max)
 except scraperwiki.sqlite.SqliteError:
     pass
 
-fetched = fetch_range(max + 1, max + count, 1)
+fetched = fetch_range(datastorage, max + 1, max + count, 1)
 print "Fetched " + str(fetched) + " new journal entries, cpu spent: " + str(cpu_spent())
 
 if min >= 0:
-    fetched = fetch_range(min, min - count, -1)
+    fetched = fetch_range(datastorage, min, min - count, -1)
     print "Fetched " + str(fetched) + " old journal entries, cpu spent: " + str(cpu_spent())
 
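The postliste-oep changes above raise writetries to 8 and make both retry loops report the SqliteError text and announce when a delayed write finally goes through. Since save() and save_var() now carry two copies of the same loop, one possible refactor is a shared helper; this is only a sketch of that idea, keeping the retry count and 22-second sleep from the diff (retry_sqlite is a made-up name, not part of the scraper).

# Sketch only -- a shared retry helper for the duplicated loops in save()/save_var().
import time
import scraperwiki

writetries = 8

def retry_sqlite(action, description):
    # Run action() up to writetries times, sleeping between SqliteErrors.
    problem = False
    for run in range(0, writetries):
        try:
            action()
            if problem:
                print "Sqlite write succeeded"
            return
        except scraperwiki.sqlite.SqliteError, e:
            print "Sqlite write error, trying again: " + str(e)
            problem = True
            time.sleep(22)
    raise scraperwiki.sqlite.SqliteError("Unable to " + description + ", tried "
                                         + str(writetries) + " times")

def save(data):
    retry_sqlite(lambda: scraperwiki.sqlite.save(unique_keys=['journalPostId'], data=data),
                 "write to database")

def save_var(var, data):
    retry_sqlite(lambda: scraperwiki.sqlite.save_var(var, data),
                 "write variable " + var + " to database")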
diff --git a/scrapersources/postliste-oep-deliverydates b/scrapersources/postliste-oep-deliverydates
index f04ce49..ebce253 100644
--- a/scrapersources/postliste-oep-deliverydates
+++ b/scrapersources/postliste-oep-deliverydates
@@ -30,7 +30,9 @@ def fetch_oep_deliverydates(url, datastorage):
     return 0
 
 datastorage = []
-fetch_oep_deliverydates("http://www.oep.no/pub/faces/statistikk.jsp?reposId=3", datastorage)
+#fetch_oep_deliverydates("http://www.oep.no/pub/faces/statistikk.jsp?reposId=3", datastorage)
+# New url before 2012-11-09
+fetch_oep_deliverydates("http://www.oep.no/pub/report.xhtml?reportId=3", datastorage)
 
 print datastorage
 scraperwiki.sqlite.save(unique_keys=['agency', 'deliverydate'], data=datastorage)
diff --git a/scrapersources/postliste-ruter b/scrapersources/postliste-ruter
index 757d6be..51a2776 100644
--- a/scrapersources/postliste-ruter
+++ b/scrapersources/postliste-ruter
@@ -10,6 +10,7 @@ import resource
 import sys
 import urlparse
 import re
+scraperwiki.scrape('http://www2.ruter.no/verdt-a-vite/presse/offentlig-journal/')
 
 lazycache=scraperwiki.swimport('lazycache')
 postlistelib=scraperwiki.swimport('postliste-python-lib')
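The postliste-oep-deliverydates change above swaps the statistics URL to the new report.xhtml endpoint and keeps the old statistikk.jsp one as a comment. If the page moves again, the call site could try each known URL in turn instead; the following is only a sketch, reusing the scraper's own fetch_oep_deliverydates() helper and assuming a dead endpoint surfaces as urllib2.HTTPError from scraperwiki.scrape().

# Sketch only -- try the current delivery-date report URL, then fall back to the old one.
import urllib2
import scraperwiki

candidate_urls = [
    "http://www.oep.no/pub/report.xhtml?reportId=3",         # current URL (per the change above)
    "http://www.oep.no/pub/faces/statistikk.jsp?reposId=3",  # pre-change URL, kept as a fallback
]

datastorage = []
for url in candidate_urls:
    try:
        fetch_oep_deliverydates(url, datastorage)  # helper defined in the scraper above
        break  # first URL that answers wins
    except urllib2.HTTPError, e:
        print "Delivery date URL failed (%s): %s" % (url, e)

print datastorage
scraperwiki.sqlite.save(unique_keys=['agency', 'deliverydate'], data=datastorage)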