author | Petter Reinholdtsen <pere@hungry.com> | 2014-12-08 16:00:10 +0100
---|---|---
committer | Petter Reinholdtsen <pere@hungry.com> | 2014-12-08 16:00:10 +0100
commit | c222b7f8575a9c684825ef30cb32b830cab27175 (patch) |
tree | e76970093a980fde97d679c67ef9cc937ece6c47 |
parent | 86c2ff0a3e17887ecb6e8c074a327ce79e7ceac5 (diff)
Add a few extra scrapers.
-rw-r--r-- | scrapersources/postliste-skogoglandskap | 90
-rw-r--r-- | scrapersources/postliste-ssb | 170 |
2 files changed, 260 insertions, 0 deletions
diff --git a/scrapersources/postliste-skogoglandskap b/scrapersources/postliste-skogoglandskap
new file mode 100644
index 0000000..1510c97
--- /dev/null
+++ b/scrapersources/postliste-skogoglandskap
@@ -0,0 +1,90 @@
+# -*- coding: utf-8 -*-
+# YAML-tagger:
+#  Type: unknown
+#  Status: unfinished
+#  Name: Norsk institutt for skog og landskap
+#  Format:
+#  Datatype: ePhorte
+#  Vendor: Ergo
+#  Run: not yet
+
+import scraperwiki
+import json
+from BeautifulSoup import BeautifulSoup
+import datetime
+import dateutil.parser
+import lxml.html
+import resource
+import sys
+import urlparse
+import re
+lazycache = scraperwiki.swimport('lazycache')
+postlistelib = scraperwiki.swimport('postliste-python-lib')
+
+agency = 'Norsk institutt for skog og landskap'
+
+def report_errors(errors):
+    if 0 < len(errors):
+        print "Errors:"
+        for e in errors:
+            print e
+        exit(1)
+
+def out_of_cpu(arg, spent, hard, soft):
+    report_errors(arg)
+
+def process_pdf(parser, pdfurl, errors):
+    # Append failures to the caller-supplied errors list; rebinding it
+    # here would hide them from report_errors().
+    postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
+    try:
+        pdfcontent = scraperwiki.scrape(pdfurl)
+        parser.preprocess(pdfurl, pdfcontent)
+        pdfcontent = None
+#    except ValueError, e:
+#        errors.append(e)
+    except IndexError, e:
+        errors.append(e)
+
+def process_page_queue(parser, errors):
+    try:
+        parser.process_pages()
+        postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
+    except scraperwiki.CPUTimeExceededError, e:
+        errors.append("Processing pages interrupted")
+
+def process_journal_pdfs(parser, listurl, errors):
+#    print "Finding PDFs on " + listurl
+#    u = urllib.parse.urlparse(listurl)
+    html = scraperwiki.scrape(listurl)
+    root = lxml.html.fromstring(html)
+    html = None
+    for ahref in root.cssselect("ul#attachment a"):
+        href = ahref.attrib['href']
+        url = urlparse.urljoin(listurl, href)
+        if -1 != href.find("file://") or -1 == url.find(".pdf"):
+#            print "Skipping non-http URL " + url
+            continue
+        if parser.is_already_scraped(url):
+            pass
+#            print "Skipping already scraped " + url
+        else:
+#            print "Will process " + url
+            process_pdf(parser, url, errors)
+
+def test_small_pdfs(parser):
+    # Test with some smaller PDFs
+    errors = []
+    process_pdf(parser, "http://www.skogoglandskap.no/filearchive/offentlig_journal_09012012_15012012.pdf", errors)
+    process_page_queue(parser, errors)
+    report_errors(errors)
+    exit(0)
+
+errors = []
+parser = postlistelib.PDFJournalParser(agency=agency)
+
+#test_small_pdfs(parser)
+
+process_journal_pdfs(parser, "http://www.skogoglandskap.no/seksjoner/offentlig_journal", errors)
+process_journal_pdfs(parser, "http://www.skogoglandskap.no/artikler/2011/postliste_2010", errors)
+process_journal_pdfs(parser, "http://www.skogoglandskap.no/artikler/2012/postliste_2011", errors)
+process_page_queue(parser, errors)
+report_errors(errors)
diff --git a/scrapersources/postliste-ssb b/scrapersources/postliste-ssb
new file mode 100644
index 0000000..cd854eb
--- /dev/null
+++ b/scrapersources/postliste-ssb
@@ -0,0 +1,170 @@
+# -*- coding: utf-8 -*-
+# YAML-tagger:
+#  Type: movedtoOEP
+#  Status: disabled
+#  Name: Statistisk sentralbyrå
+
+import scraperwiki
+import urllib2
+import lxml.html
+import datetime
+import time
+import dateutil.parser
+import pickle
+import re
+
+from datetime import date
+from datetime import timedelta
+from time import strftime
+
+# Make sure ScraperWiki registers this URL as the source of this database
+scraperwiki.scrape("http://www.ssb.no/omssb/journal/")
+
+postlistelib = scraperwiki.swimport('postliste-python-lib')
+
+agency = "Statistisk sentralbyrå"
+
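+# daterange() yields each date from start_date up to, but not including,
+# end_date.  expand_year() widens the journal's two-digit years, mapping
+# 51-99 to 19xx and 00-50 to 20xx.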
+def daterange(start_date, end_date):
+    for n in range((end_date - start_date).days):
+        yield start_date + timedelta(n)
+
+def expand_year(year):
+    year = int(year)
+    if year > 50:
+        year = year + 1900
+    else:
+        year = year + 2000
+    return year
+
+def fetch_url(url):
+    html = None
+    # Attempt the fetch a few times; the scrape can fail transiently.
+    for n in [1, 2, 3]:
+        try:
+            html = scraperwiki.scrape(url)
+            break
+        except urllib2.URLError, e:
+            print "URLError fetching " + url + ", trying again"
+    return html
+
+def save_date(parser, date, url, html):
+    num_saved = 0
+    root = lxml.html.fromstring(html)
+    journal_date = dateutil.parser.parse(root.cssselect("p")[0].text_content().replace("Journaldato: ", ""), dayfirst=True)
+    if date == journal_date.date():
+        datastore = []
+        for table in root.cssselect("table"):
+            docid = table.cssselect("tr")[0].cssselect("p")[1].text.strip()
+            datedesc = table.cssselect("tr")[0].cssselect("td")[3].cssselect("p")[0].text.strip()
+            exemption = table.cssselect("tr")[1].cssselect("td")[5].cssselect("p")[0].text.strip()
+            fratil_indicator = table.cssselect("tr")[2].cssselect("td")[0].cssselect("p")[0].text.strip()
+
+            doctype = ""
+            if fratil_indicator.startswith("Til"):
+                doctype = "U"
+            elif fratil_indicator.startswith("Fra"):
+                doctype = "I"
+            elif fratil_indicator.startswith("Notat fra"):
+                doctype = "N"
+            else:
+                raise ValueError("Fant ikke doctype %s" % fratil_indicator)
+
+            fratil_agency = table.cssselect("tr")[2].cssselect("td")[1].cssselect("p")[0].text.strip()
+            casedesc = table.cssselect("tr")[4].cssselect("td")[1].cssselect("p")[0].text.strip()
+            docdesc = table.cssselect("tr")[5].cssselect("td")[1].cssselect("p")[0].text.strip()
+            saksb = table.cssselect("tr")[0].cssselect("p")[5].text.strip()
+
+            docdate = dateutil.parser.parse(datedesc.strip(), dayfirst=True)
+
+            matchObj = re.match(r'(\d+)/(\d+)\s*-\s*(\d+)$', docid, re.M|re.I)
+            if matchObj:
+                caseyear = matchObj.group(1)
+                caseseqnr = matchObj.group(2)
+                casedocseq = matchObj.group(3)
+                caseyear = expand_year(caseyear)
+                caseid = str(caseyear) + "/" + str(caseseqnr)
+            else:
+                print "error: invalid Arkivsaksnr: " + docid
+                matchObj = re.match(r'(\d+)/(\d+)\s*-', docid, re.M|re.I)
+                if matchObj:
+                    caseyear = expand_year(matchObj.group(1))
+                    caseseqnr = matchObj.group(2)
+                    casedocseq = None  # no document sequence number in this docid variant
+                    caseid = str(caseyear) + "/" + str(caseseqnr)
+
+            if parser.is_sender_doctype(doctype):
+                fratilfield = 'sender'
+            elif parser.is_recipient_doctype(doctype):
+                fratilfield = 'recipient'
+
+            data = {
+                'agency' : agency,
+                'docdate' : docdate.date(),
+                'recorddate' : journal_date.date(),
+                'docdesc' : docdesc,
+                'casedesc' : casedesc,
+                'caseid' : caseid,
+                'docid' : docid,
+
+                'caseyear' : caseyear,
+                'caseseqnr' : caseseqnr,
+                'casedocseq' : casedocseq,
+
+                fratilfield : fratil_agency,
+                'doctype' : doctype,
+
+                'saksbehandler' : saksb,
+
+                'exemption' : exemption,
+
+                'scrapedurl' : url,
+                'scrapestamputc' : datetime.datetime.now()
+            }
+            parser.verify_entry(data)
+            datastore.append(data)
+            scraperwiki.sqlite.save(unique_keys=['docid'], data=datastore)
+            num_saved += len(datastore)
+            datastore = []
+            #print "Saved %s" % data['caseid']
+    else:
+        # TODO: log error or exit?
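+        # The server returned a journal page for a different date than
+        # the one requested; report the mismatch and save nothing.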
+        msg = "Tried to scrape %s but got %s" % (date, journal_date.date())
+        #raise ValueError(msg)
+        print msg
+
+    return num_saved
+
+def scrape_date(parser, date):
+    url = base_url % (strftime("%d%m%y", date.timetuple()))
+    html = fetch_url(url)
+    if html:
+        return save_date(parser, date, url, html)
+
+base_url = 'http://www.ssb.no/omssb/journal/OJ%s.htm'
+end_date = date.today()
+
+start_date_obj = scraperwiki.sqlite.get_var('last_finished_date')
+if start_date_obj:
+    start_date = pickle.loads(start_date_obj)
+else:
+    start_date = datetime.date(2011, 1, 3)
+
+print "Start date %s" % start_date
+
+parser = postlistelib.JournalParser(agency=agency)
+
+for single_date in daterange(start_date, end_date):
+    if single_date.weekday() < 5:
+        num_saved = scrape_date(parser, single_date)
+        print "Scraped %s found %s" % (single_date, num_saved)
+        if num_saved > 0:
+            scraperwiki.sqlite.save_var('last_finished_date', pickle.dumps(single_date))
+
+        if num_saved is None:
+            print "No more new. Exit..."
+            break