author     Petter Reinholdtsen <pere@hungry.com>    2016-09-26 18:30:43 +0200
committer  Petter Reinholdtsen <pere@hungry.com>    2016-09-26 18:30:43 +0200
commit     0168d7df91e4a43139119cacd6874c19cc29a29e (patch)
tree       e19abede4f17ba4e8754701965484e73a1d8742a
parent     8a96ae99c228480a67b9aad044a6b1847b79317a (diff)
Complete scraper for Ås kommune.
-rw-r--r--    scrapersources/postliste-aas-kommune    77
1 files changed, 54 insertions, 23 deletions
diff --git a/scrapersources/postliste-aas-kommune b/scrapersources/postliste-aas-kommune
index 742f6a0..f8ee9e2 100644
--- a/scrapersources/postliste-aas-kommune
+++ b/scrapersources/postliste-aas-kommune
@@ -1,13 +1,14 @@
 # -*- coding: utf-8 -*-
 # YAML-tagger:
 #  Type: kommune
-#  Status: unfinished/draft
+#  Status: finished
 #  Name: Ås kommune
 #  Format: HTML
 #  Datatype: ?
 #  Vendor: ?
 #  Run: daily
 #  Missingfields: journalseqnr, journalyear, journalid
+#  Publish duration: 3 months
 
 import scraperwiki
 import urllib2
@@ -24,7 +25,7 @@
 postlistelib=scraperwiki.swimport('postliste-python-lib')
 
 agency = u'Ås kommune'
 baseurl = "http://www.as.kommune.no"
-print "Fetching public journal!"
+print "Fetching public journal for %s!" % agency
 
 parser = postlistelib.JournalParser(agency=agency)
@@ -40,12 +41,9 @@
 typemap = {
     u'Inngående dokument (I)' : 'I',
     u'Utgående dokument (U)' : 'U',
 }
-
-def saver(unique_keys, data):
-#    return
-    #print "Not saving data"
-    scraperwiki.sqlite.save(unique_keys, data)
+class NoDataEntries(LookupError):
+    pass
 
 def expand_year(year):
     year = int(year)
@@ -55,24 +53,25 @@
         year = year + 2000
     return year
 
-def fetch_day(parser, datastore, day):
-    dayurl = 'http://www.as.kommune.no/offentlig-journal-og-innsyn-i-arkiv.352152.no.html?pjdate=%s&pjfind=&pjdoktype=&cat=352152' % day
-
-    html = postlistelib.fetch_url_harder(dayurl)
-#    print html
-    root = lxml.html.fromstring(html.decode('utf-8'))
-    count = 0
+def parse_day_html(parser, datastore, dayurl, html):
+    root = lxml.html.fromstring(html)
+#    count = 0
     for tr in root.cssselect("table.postjournal > tr"):
         data = {
             'agency' : parser.agency,
             'scrapedurl' : dayurl,
             'scrapestamputc' : datetime.datetime.now()
         }
-        count = count + 1
+#        count = count + 1
 #        print "=========== %d =============" % count
 #        print tr.text_content()
-
-        arkivsaksref = tr.cssselect("td div.doknr")[0].text_content().strip()
+        doknrroot = tr.cssselect("td div.doknr")
+        if not doknrroot:
+            # No records found, just return
+            msg = "No entries found in %s" % dayurl
+            print msg
+            raise NoDataEntries(msg)
+        arkivsaksref = doknrroot[0].text_content().strip()
         caseyear = 0
         caseseqnr = 0
         casedocseq = 0
@@ -120,12 +119,44 @@
             fratilfield = 'recipient'
         data[fratilfield] = fratil
-        print data
+#        print data
         parser.verify_entry(data)
         datastore.append(data)
 
-datastore = []
-#fetch_day(parser, datastore, '03.09.2016')
-fetch_day(parser, datastore, '02.09.2016')
-#fetch_day(parser, datastore, '01.09.2016')
-saver(unique_keys=['arkivsaksref'], data=datastore)
+def fetch_day(parser, day):
+    datastore = []
+    dayurl = 'http://www.as.kommune.no/offentlig-journal-og-innsyn-i-arkiv.352152.no.html?pjdate=%s&pjfind=&pjdoktype=&cat=352152' % day.strftime('%d.%m.%Y')
+    html = postlistelib.fetch_url_harder(dayurl).decode('utf-8')
+#    print html
+    try:
+        parse_day_html(parser, datastore, dayurl, html)
+        scraperwiki.sqlite.save(unique_keys=['arkivsaksref'], data=datastore)
+    except NoDataEntries, e:
+        return
+    except Exception, e:
+        print html
+        raise
+
+aday = datetime.timedelta(1) # one day delta
+newest = None
+try:
+    newest = dateutil.parser.parse(scraperwiki.sqlite.select("max(recorddate) as max from swdata")[0]["max"], dayfirst=False).date()
+    oldest = dateutil.parser.parse(scraperwiki.sqlite.select("min(recorddate) as min from swdata")[0]["min"], dayfirst=False).date()
+except scraperwiki.sqlite.SqliteError:
+    # Table not created yet, ignore the error
+    pass
+
+if not newest:
+    # Bootstrap a month ago
+    newest = datetime.datetime.today() - aday * 30
+    oldest = newest
+
+skiplimit = 10
+
+# Look forward one week to at least get past the weekends
+for n in xrange(skiplimit):
+    fetch_day(parser, newest + aday * n)
+
+for n in xrange(skiplimit):
+    print n
+    fetch_day(parser, oldest - aday * n)
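
The new NoDataEntries handling is the core of parse_day_html(): a day page without journal rows raises, and the caller decides that an empty day is not an error. Below is a minimal, self-contained sketch of that pattern in the same Python 2 dialect as the scraper; the table.postjournal and div.doknr selectors mirror the scraper's markup, but the HTML snippet itself is invented for illustration, not real output from www.as.kommune.no.

# -*- coding: utf-8 -*-
# Sketch of the "raise on empty day" pattern this commit introduces.
# The HTML below is a stand-in for a day page with no journal records.
import lxml.html

class NoDataEntries(LookupError):
    pass

def parse_day_html(dayurl, html):
    entries = []
    root = lxml.html.fromstring(html)
    for tr in root.cssselect("table.postjournal > tr"):
        doknrroot = tr.cssselect("td div.doknr")
        if not doknrroot:
            # A row without a document number means the page reported
            # no records for this day.
            raise NoDataEntries("No entries found in %s" % dayurl)
        entries.append(doknrroot[0].text_content().strip())
    return entries

empty_day = "<table class='postjournal'><tr><td>Ingen journalposter</td></tr></table>"
try:
    parse_day_html('http://example.org/?pjdate=01.09.2016', empty_day)
except NoDataEntries, e:
    print "Empty day, skipping: %s" % e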
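The block at the bottom of the diff replaces the hard-coded test dates and is what makes the scraper self-scheduling: each daily run extends the crawl window forward from the newest recorddate already in the database and backward from the oldest. A rough standalone sketch of that strategy, again Python 2, with the sqlite lookups and the real fetch stubbed out (the fetch_day() here is a placeholder, not the scraper's function):

# Sketch of the incremental crawl window set up at the end of the diff.
# newest/oldest would come from SELECT max/min(recorddate) on swdata;
# here they are left unset so the bootstrap branch runs.
import datetime

def fetch_day(day):
    # Placeholder fetch: weekends stand in for days where the journal
    # publishes nothing and the real code hits NoDataEntries.
    if day.weekday() >= 5:
        print "no entries on %s, skipping" % day.strftime('%d.%m.%Y')
        return
    print "scraped %s" % day.strftime('%d.%m.%Y')

aday = datetime.timedelta(1)  # one day delta
newest = None                 # max(recorddate) in the real scraper
oldest = None                 # min(recorddate) in the real scraper

if not newest:
    # First run: bootstrap the window a month back, like the scraper.
    newest = datetime.date.today() - aday * 30
    oldest = newest

skiplimit = 10  # stride past weekends and holidays in one run

for n in xrange(skiplimit):
    fetch_day(newest + aday * n)
for n in xrange(skiplimit):
    fetch_day(oldest - aday * n)

Run daily with skiplimit = 10, the forward loop keeps up with new publications even across long weekends, while the backward loop steadily backfills the archive ten days per run.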