-rw-r--r--  scrapersources/postliste-difi                       |   8
-rw-r--r--  scrapersources/postliste-hoegskolen-i-gjoevik       |  14
-rw-r--r--  scrapersources/postliste-hoegskolen-i-lillehammer   |   4
-rw-r--r--  scrapersources/postliste-hoegskolen-i-volda         |   3
-rw-r--r--  scrapersources/postliste-lenvik                     | 173
-rw-r--r--  scrapersources/postliste-met                        |   4
-rw-r--r--  scrapersources/postliste-naroy                      |   7
-rw-r--r--  scrapersources/postliste-oep                        |  34
-rw-r--r--  scrapersources/postliste-oep-deliverydates          |   4
-rw-r--r--  scrapersources/postliste-ruter                      |   1
10 files changed, 232 insertions(+), 20 deletions(-)
diff --git a/scrapersources/postliste-difi b/scrapersources/postliste-difi
index dfc986f..459327b 100644
--- a/scrapersources/postliste-difi
+++ b/scrapersources/postliste-difi
@@ -54,17 +54,17 @@ def process_journal_pdfs(parser, listurl, errors):
     html = scraperwiki.scrape(listurl)
     root = lxml.html.fromstring(html)
     html = None
-    for ahref in root.cssselect("div.body a"):
+    for ahref in root.cssselect("div.sixcol a"):
         href = ahref.attrib['href']
         url = urlparse.urljoin(listurl, href)
         if -1 != href.find("file://") or -1 == url.find(".pdf"):
-#            print "Skipping non-http URL " + url
+            print "Skipping non-http URL " + url
             continue
         if parser.is_already_scraped(url):
             True
-#            print "Skipping already scraped " + url
+            print "Skipping already scraped " + url
         else:
-#            print "Will process " + url
+            #print "Will process " + url
             process_pdf(parser, url, errors)
 
 def test_small_pdfs(parser):
diff --git a/scrapersources/postliste-hoegskolen-i-gjoevik b/scrapersources/postliste-hoegskolen-i-gjoevik
index d4f7931..cdf007e 100644
--- a/scrapersources/postliste-hoegskolen-i-gjoevik
+++ b/scrapersources/postliste-hoegskolen-i-gjoevik
@@ -60,13 +60,15 @@ def process_journal_pdfs(parser, listurl, errors):
     html = scraperwiki.scrape(listurl)
     root = lxml.html.fromstring(html)
     html = None
-    for ahref in root.cssselect("div.spalte-inner a"):
+    for ahref in root.cssselect("section a"):
         href = ahref.attrib['href']
         url = urlparse.urljoin(listurl, href).replace(" ", "+")
+        #print url
         if -1 != href.find("file://") or -1 == url.find(".pdf"):
 #            print "Skipping non-http URL " + url
             continue
         if parser.is_already_scraped(url):
+            #print "Scraped: %s" % url
             True
 #            print "Skipping already scraped " + url
         else:
@@ -98,6 +100,16 @@ endYear=datetime.datetime.now().year
 
 for year in range(startYear, endYear+1): # range goes from startyear to endYear-1
     process_journal_pdfs(parser, "http://www.hig.no/om_hig/offentleg_journal/%d" % year, errors)
+process_page_queue(parser, errors)
 
 report_errors(errors)
 
+warningQuery = "recorddate as lastupdate from 'swdata' order by recorddate DESC limit 1";
+result = scraperwiki.sqlite.select(warningQuery)
+now=datetime.datetime.today()
+then=datetime.datetime.strptime(result[0]['lastupdate'],"20%y-%m-%dT%H:%M:%S")
+
+if (now-then).days > 14:
+    print "warning"
+    warningURL = "http://hild1.no/~hildenae/files/dynamic/run.php?scraper=postliste-hoegskolen-i-gjoevik&reason=7days";
+    scraperwiki.scrape(warningURL)
\ No newline at end of file
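The tail added to postliste-hoegskolen-i-gjoevik above is a staleness alarm: read the newest recorddate from the scraper's own swdata table and ping a notification URL when nothing new has arrived for more than 14 days. Below is a minimal standalone sketch of the same idea, assuming the ScraperWiki classic environment, a 'swdata' table whose recorddate column holds ISO 8601 timestamps, and the notification endpoint hard-coded in the diff; the helper name warn_if_stale is made up for illustration.

# Sketch only -- mirrors the staleness check added in the diff above.
import datetime
import scraperwiki

def warn_if_stale(scrapername, maxdays=14):
    # Newest record date stored by this scraper (select() prepends "select ").
    rows = scraperwiki.sqlite.select(
        "recorddate as lastupdate from 'swdata' order by recorddate DESC limit 1")
    if not rows or not rows[0]['lastupdate']:
        return  # nothing scraped yet, nothing to warn about
    last = datetime.datetime.strptime(rows[0]['lastupdate'][:19], "%Y-%m-%dT%H:%M:%S")
    if (datetime.datetime.today() - last).days > maxdays:
        print "warning: no new journal entries for more than %d days" % maxdays
        # Hypothetical notification hook, same URL pattern as in the diff.
        scraperwiki.scrape("http://hild1.no/~hildenae/files/dynamic/run.php"
                           "?scraper=%s&reason=%ddays" % (scrapername, maxdays))

warn_if_stale("postliste-hoegskolen-i-gjoevik")

Unlike the code in the diff, this variant guards against an empty table and trims any fractional seconds before parsing the timestamp.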
diff --git a/scrapersources/postliste-hoegskolen-i-lillehammer b/scrapersources/postliste-hoegskolen-i-lillehammer
index 5337521..5687ece 100644
--- a/scrapersources/postliste-hoegskolen-i-lillehammer
+++ b/scrapersources/postliste-hoegskolen-i-lillehammer
@@ -64,9 +64,9 @@ def process_journal_pdfs(parser, listurl, errors):
             continue
         if parser.is_already_scraped(url):
             True
-#            print "Skipping already scraped " + url
+            print "Skipping already scraped " + url
         else:
-#            print "Will process " + url
+            print "Will process " + url
             process_pdf(parser, url, errors)
 
 def test_small_pdfs(parser):
diff --git a/scrapersources/postliste-hoegskolen-i-volda b/scrapersources/postliste-hoegskolen-i-volda
index 0106cb7..d8f3686 100644
--- a/scrapersources/postliste-hoegskolen-i-volda
+++ b/scrapersources/postliste-hoegskolen-i-volda
@@ -53,11 +53,12 @@ def process_journal_pdfs(parser, listurl, errors):
     html = scraperwiki.scrape(listurl)
     root = lxml.html.fromstring(html)
     html = None
-    for ahref in root.cssselect("div.inside a"):
+    for ahref in root.cssselect("div#maincontent a"):
         if 'id' not in ahref.attrib or -1 == ahref.attrib['id'].find("archiveimage_"):
             continue
         href = ahref.attrib['href']
         url = urlparse.urljoin(listurl, href)
+        #print "found url %s" %url
         if -1 != href.find("file://"):
 #            print "Skipping non-http URL " + url
             continue
diff --git a/scrapersources/postliste-lenvik b/scrapersources/postliste-lenvik
new file mode 100644
index 0000000..66a502d
--- /dev/null
+++ b/scrapersources/postliste-lenvik
@@ -0,0 +1,173 @@
+# -*- coding: utf-8 -*-
+
+import scraperwiki
+import urllib2
+import lxml.html
+import re
+import dateutil.parser
+from dateutil.relativedelta import relativedelta
+import datetime
+import urlparse
+
+agency = "Lenvik kommune"
+
+# Point scraperwiki GUI to the start page
+scraperwiki.scrape("http://webway.lenvik.kommune.no/postjournal")
+
+postlistelib=scraperwiki.swimport('postliste-python-lib')
+parser = postlistelib.JournalParser(agency=agency)
+
+def saver(unique_keys, data):
+#    return
+    #print "Not saving data"
+    scraperwiki.sqlite.save(unique_keys, data)
+
+def expand_year(year):
+    year = int(year)
+    if year > 50:
+        year = year + 1900
+    else:
+        year = year + 2000
+    return year
+
+# <tr class=yang>
+#   <td>13/00563-001</td>
+#   <td>04.03.2013</td>
+#   <td style="text-align:center;">
+#     <div title="Inngående">I</div>
+#   </td>
+#   <td>Flytting av VPN-tunell </td>
+#   <td>EVRY AS</td>
+#   <td>Jan-Eirik Nordahl</td>
+#   <td>
+#     <a href="/dokumentbestilling?jpid=13003566" title="Klikk for å bestille innsyn">Bestill</a>
+#   </td>
+#   <td></td>
+#
+# </tr>
+#
+
+def fetch_postjournal_day(parser, url, html, saver):
+    root = lxml.html.fromstring(html.decode('utf-8'))
+
+    recorddate = None
+    for div in root.cssselect('div'):
+        divcontent = div.text_content()
+        if 0 == divcontent.find("Offentlig postjournal for "):
+            recorddate = dateutil.parser.parse(divcontent.replace("Offentlig postjournal for ",""), dayfirst=True)
+            print recorddate
+
+    # Make sure we save the entire URL or nothing at all
+    datastore = []
+    for tr in root.cssselect('tr.yang'):
+        tds = tr.cssselect("td")
+        docidstr = tds[0].text_content().strip()
+        docdate = tds[1].text_content().strip()
+        doctype = tds[2].text_content().strip()
+        docdesc = tds[3].text_content().strip()
+        fratil = tds[4].text_content().strip()
+        saksbehandler = tds[5].text_content().strip()
+        if -1 != tds[6].text_content().find("Bestill"):
+            exemption = None
+        else:
+            exemption = tds[6].text_content().strip()
+
+        docdate = dateutil.parser.parse(docdate, dayfirst=True)
+
+#        print doctype, docdesc
+        if not parser.is_valid_doctype(doctype):
+            doctype = {
+                '' : '?',
+            }[doctype]
+        if parser.is_sender_doctype(doctype):
+            fratilfield = 'sender'
+        elif parser.is_recipient_doctype(doctype):
+            fratilfield = 'recipient'
+
+        caseyear, caseseqnr = docidstr.split("/")
+        caseyear = expand_year(caseyear)
+        caseseqnr, casedocseq = caseseqnr.split("-")
+        caseid = "%d/%d" % (int(caseyear), int(caseseqnr))
+
+        data = {
+            'agency' : parser.agency,
+            'recorddate' : recorddate.date(),
+            'docdate' : docdate.date(),
+            'docdesc' : docdesc,
+            'casedesc' : docdesc, # FIXME fake value
+
+            'caseyear' : int(caseyear),
+            'caseseqnr' : int(caseseqnr),
+            'casedocseq' : int(casedocseq),
+            'caseid' : caseid,
+            'doctype' : doctype,
+
+#            'journalseqnr' : int(journalseqnr),
+#            'journalyear' : int(journalyear),
+#            'journalid' : journalid,
+            fratilfield : fratil,
+
+            'saksbehandler' : saksbehandler,
+#            'saksansvarlig' : saksansvarlig.strip(),
+#            'saksansvarligenhet' : saksansvarligenhet.strip(),
+
+            'docidstr' : docidstr,
+#            'laapenr' : laapenr,
+            'exemption' : exemption,
+
+            'scrapedurl' : url,
+            'scrapestamputc' : datetime.datetime.now()
+            }
+
+#        print data
+        parser.verify_entry(data)
+        datastore.append(data)
+
+    seenurl = {}
+    # Find next URL. There are two on each page.
+    for ahref in root.cssselect('a.next_page'):
+        if 0 == ahref.text_content().find('Neste'):
+            nexturl = urlparse.urljoin(url, ahref.attrib['href'])
+            if nexturl not in seenurl:
+                seenurl[nexturl] = True;
+                print 'Fetching ' + nexturl
+                html = postlistelib.fetch_url_harder(nexturl)
+                mysaver = lambda unique_keys, data: datastore.extend(data)
+                fetch_postjournal_day(parser=parser, url=nexturl, html=html,
+                                      saver=mysaver)
+
+    saver(unique_keys=['docidstr'], data=datastore)
+
+def date2url(date):
+    return 'http://webway.lenvik.kommune.no/?date=%s' % date
+
+def gen_date_urls(urllist, startdate, step, count):
+    d = dateutil.parser.parse(startdate, dayfirst=False)
+    for n in xrange(1, step*(count+1), step):
+        next = (d + relativedelta(days=n)).strftime("%Y-%m-%d")
+        urllist.append(date2url(next))
+
+urllist = []
+today = datetime.date.today()
+try:
+    first = scraperwiki.sqlite.select("min(recorddate) as min from swdata")[0]['min']
+    last = scraperwiki.sqlite.select("max(recorddate) as max from swdata")[0]['max']
+except:
+    last = (today + relativedelta(days=-14)).strftime("%Y-%m-%d")
+    first = None
+
+print first, last
+
+# Parse back in time
+if first is not None:
+    gen_date_urls(urllist, first, -1, 100)
+
+# Parse forward in time
+if last is not None:
+    gen_date_urls(urllist, last, 1, 3)
+
+for dayurl in urllist:
+    print 'Fetching ' + dayurl
+    html = postlistelib.fetch_url_harder(dayurl)
+    fetch_postjournal_day(parser=parser, url=dayurl, html=html, saver=saver)
+
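The new postliste-lenvik scraper above visits one day page per date: it looks up the oldest and newest recorddate already in swdata, then queues 100 days further back and 3 days forward. Below is a compact sketch of that windowing logic in isolation; date2url copies the URL pattern from the scraper, but unlike the original gen_date_urls this version returns the list instead of appending to one passed in.

# Sketch only -- the date-window crawl used by postliste-lenvik above.
import dateutil.parser
from dateutil.relativedelta import relativedelta

def date2url(date):
    return 'http://webway.lenvik.kommune.no/?date=%s' % date

def gen_date_urls(startdate, step, count):
    # step=-1 walks back in time, step=1 walks forward.
    d = dateutil.parser.parse(startdate, dayfirst=False)
    return [date2url((d + relativedelta(days=n)).strftime("%Y-%m-%d"))
            for n in xrange(1, step * (count + 1), step)]

# Example: the database already covers 2012-08-01 .. 2012-09-01.
first, last = "2012-08-01", "2012-09-01"
urls = gen_date_urls(first, -1, 100) + gen_date_urls(last, 1, 3)
print urls[0]   # http://webway.lenvik.kommune.no/?date=2012-08-02
print urls[-1]  # http://webway.lenvik.kommune.no/?date=2012-09-04

Note that xrange starts at 1, so the backward walk first visits the day after `first` (n=1) and then counts down past it; that matches the behaviour of the scraper above.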
diff --git a/scrapersources/postliste-met b/scrapersources/postliste-met
index 02c53ca..d769e97 100644
--- a/scrapersources/postliste-met
+++ b/scrapersources/postliste-met
@@ -58,11 +58,11 @@ def process_journal_pdfs(parser, listurl, errors):
         href = ahref.attrib['href']
         url = urlparse.urljoin(listurl, href)
         if -1 != href.find("file://") or -1 == url.find("=File.getFile;"):
-#            print "Skipping non-http URL " + url
+            print "Skipping non-http URL " + url
             continue
         if parser.is_already_scraped(url):
             True
-#            print "Skipping already scraped " + url
+            print "Skipping already scraped " + url
         else:
 #            print "Will process " + url
             process_pdf(parser, url, errors)
diff --git a/scrapersources/postliste-naroy b/scrapersources/postliste-naroy
index b8fa33b..f47adb3 100644
--- a/scrapersources/postliste-naroy
+++ b/scrapersources/postliste-naroy
@@ -59,7 +59,12 @@ def process_journal_pdfs(parser, listurl, errors):
             continue
         # Special case, file indicating no journal entries this day
         if "http://www.naroy.kommune.no/NK/Intern.nsf/FilA/CA6C83764E56DDCBC1257A02003F9025/$FILE/Postjournal+11.05.12.pdf" == url or \
-           "http://www.naroy.kommune.no/NK/Intern.nsf/FilA/7FD82A18C1A1F137C12579F90029DEBD/$FILE/Postjournal+07.05.12.pdf" == url:
+           "http://www.naroy.kommune.no/NK/Intern.nsf/FilA/7FD82A18C1A1F137C12579F90029DEBD/$FILE/Postjournal+07.05.12.pdf" == url or \
+           "http://www.naroy.kommune.no/NK/Intern.nsf/FilA/777B497BB48936ACC1257A450033E1D4/$FILE/Postjournal+20.07.12.pdf" == url or \
+           "http://www.naroy.kommune.no/NK/Intern.nsf/FilA/1802A0FF57C08EFEC1257A4500337345/$FILE/Postjournal+16.07.12.pdf" == url or \
+           "http://www.naroy.kommune.no/NK/Intern.nsf/FilA/90373A38701C27E5C1257A45002F63FD/$FILE/Postjournal+12.07.12.pdf" == url or \
+           "http://www.naroy.kommune.no/NK/Intern.nsf/FilA/6B00A3BD92B3C2AEC1257A45002F4044/$FILE/Postjournal+10.07.12.pdf" == url or \
+           "http://www.naroy.kommune.no/NK/Intern.nsf/FilA/0141B5488D38B8FEC1257A44003756ED/$FILE/Postjournal+06.07.12.pdf" == url:
             continue
         if parser.is_already_scraped(url):
             True
diff --git a/scrapersources/postliste-oep b/scrapersources/postliste-oep
index 360ab91..bcfde1b 100644
--- a/scrapersources/postliste-oep
+++ b/scrapersources/postliste-oep
@@ -9,7 +9,7 @@ import httplib
 import urllib2
 
 # Try several times as the database get bigger
-writetries = 6
+writetries = 8
 
 # http://www.oep.no/search/resultSingle.html?journalPostId=1000000
 # http://www.oep.no/search/resultSingle.html?journalPostId=3889259
@@ -102,23 +102,31 @@ def url_from_id(id):
     return "http://www.oep.no/search/resultSingle.html?journalPostId=" + str(id)
 
 def save(data):
+    problem = False
     for run in range(0,writetries):
         try:
             scraperwiki.sqlite.save(unique_keys=['journalPostId'], data=data)
+            if problem:
+                print "Sqlite write succeeded"
             return
         except scraperwiki.sqlite.SqliteError, e:
-            print "Sqlite write error, trying again"
+            print "Sqlite write error, trying again: " + str(e)
             time.sleep(22)
+            problem = True
     raise scraperwiki.sqlite.SqliteError("Unable to write to database, tried " + str(writetries) + " times")
 
 def save_var(var, data):
+    problem = False
     for run in range(0,writetries):
         try:
             scraperwiki.sqlite.save_var(var, data)
+            if problem:
+                print "Sqlite write succeeded"
             return
         except scraperwiki.sqlite.SqliteError, e:
-            print "Sqlite write error, trying again"
+            print "Sqlite write error, trying again: " + str(e)
             time.sleep(22)
+            problem = True
     raise scraperwiki.sqlite.SqliteError("Unable to write variable " + var + " to database, tried " + str(writetries) + " times")
 
 fieldmap = {
@@ -177,9 +185,8 @@ def fetch_oep_entry(id, datastorage):
 #    scraperwiki.sqlite.save(unique_keys=['journalPostId'], data=data)
     return 0
 
-def fetch_range(first, last, step):
+def fetch_range(datastorage, first, last, step):
     myskiplimit = skiplimit
-    datastorage = []
     skipped = 0
     fetched = 0
     min_id = first
@@ -312,6 +319,16 @@ def remove_original():
 print "Starting to fetch journal entries " + str(datetime.datetime.now())
 scraperwiki.scrape("http://www.oep.no/")
 
+datastorage = []
+
+# Update entries to handle <URL: https://rt.nuug.no:443/Ticket/Display.html?id=6342 >.
+# Used 2012-09-17
+#scraperwiki.sqlite.execute("DELETE from swdata where journalPostId = 638167")
+#fetch_oep_entry(638167, datastorage)
+#scraperwiki.sqlite.execute("DELETE from swdata where journalPostId = 638104")
+#fetch_oep_entry(638104, datastorage)
+#scraperwiki.sqlite.commit()
+
 count = 10000
 skiplimit = 500
 # Random value fairly close to the most recent ID when this project started 2012-05-03
@@ -320,20 +337,21 @@ try:
     max = scraperwiki.sqlite.select("max(journalPostId) as max from swdata")[0]["max"]
     if 0 < scraperwiki.sqlite.get_var('min_tested_id'):
         saved_min = scraperwiki.sqlite.get_var('min_tested_id')
+    else:
+        saved_min = 0
     sql_min = scraperwiki.sqlite.select("min(journalPostId) as min from swdata")[0]["min"]
     print "Saved min: " + str(saved_min) + ", sql min: " + str(sql_min)
     if sql_min < saved_min:
         min = sql_min
     else:
         min = saved_min
-    print "Scraping " + str(count) + " IDs below " + str(min) + " and above " + str(max)
 except scraperwiki.sqlite.SqliteError:
     pass
 
-fetched = fetch_range(max + 1, max + count, 1)
+fetched = fetch_range(datastorage, max + 1, max + count, 1)
 print "Fetched " + str(fetched) + " new journal entries, cpu spent: " + str(cpu_spent())
 
 if min >= 0:
-    fetched = fetch_range(min, min - count, -1)
+    fetched = fetch_range(datastorage, min, min - count, -1)
     print "Fetched " + str(fetched) + " old journal entries, cpu spent: " + str(cpu_spent())
 
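The postliste-oep changes above raise writetries to 8 and make both retry loops report the SqliteError text and announce when a delayed write finally goes through. Since save() and save_var() now carry two copies of the same loop, one possible refactor is a shared helper; this is only a sketch of that idea, keeping the retry count and 22-second sleep from the diff (retry_sqlite is a made-up name, not part of the scraper).

# Sketch only -- a shared retry helper for the duplicated loops in save()/save_var().
import time
import scraperwiki

writetries = 8

def retry_sqlite(action, description):
    # Run action() up to writetries times, sleeping between SqliteErrors.
    problem = False
    for run in range(0, writetries):
        try:
            action()
            if problem:
                print "Sqlite write succeeded"
            return
        except scraperwiki.sqlite.SqliteError, e:
            print "Sqlite write error, trying again: " + str(e)
            problem = True
            time.sleep(22)
    raise scraperwiki.sqlite.SqliteError("Unable to " + description + ", tried "
                                         + str(writetries) + " times")

def save(data):
    retry_sqlite(lambda: scraperwiki.sqlite.save(unique_keys=['journalPostId'], data=data),
                 "write to database")

def save_var(var, data):
    retry_sqlite(lambda: scraperwiki.sqlite.save_var(var, data),
                 "write variable " + var + " to database")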
diff --git a/scrapersources/postliste-oep-deliverydates b/scrapersources/postliste-oep-deliverydates
index f04ce49..ebce253 100644
--- a/scrapersources/postliste-oep-deliverydates
+++ b/scrapersources/postliste-oep-deliverydates
@@ -30,7 +30,9 @@ def fetch_oep_deliverydates(url, datastorage):
     return 0
 
 datastorage = []
-fetch_oep_deliverydates("http://www.oep.no/pub/faces/statistikk.jsp?reposId=3", datastorage)
+#fetch_oep_deliverydates("http://www.oep.no/pub/faces/statistikk.jsp?reposId=3", datastorage)
+# New url before 2012-11-09
+fetch_oep_deliverydates("http://www.oep.no/pub/report.xhtml?reportId=3", datastorage)
 
 print datastorage
 scraperwiki.sqlite.save(unique_keys=['agency', 'deliverydate'], data=datastorage)
diff --git a/scrapersources/postliste-ruter b/scrapersources/postliste-ruter
index 757d6be..51a2776 100644
--- a/scrapersources/postliste-ruter
+++ b/scrapersources/postliste-ruter
@@ -10,6 +10,7 @@ import resource
 import sys
 import urlparse
 import re
+scraperwiki.scrape('http://www2.ruter.no/verdt-a-vite/presse/offentlig-journal/')
 
 lazycache=scraperwiki.swimport('lazycache')
 postlistelib=scraperwiki.swimport('postliste-python-lib')
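The postliste-oep-deliverydates change above swaps the statistics URL to the new report.xhtml endpoint and keeps the old statistikk.jsp one as a comment. If the page moves again, the call site could try each known URL in turn instead; the following is only a sketch, reusing the scraper's own fetch_oep_deliverydates() helper and assuming a dead endpoint surfaces as urllib2.HTTPError from scraperwiki.scrape().

# Sketch only -- try the current delivery-date report URL, then fall back to the old one.
import urllib2
import scraperwiki

candidate_urls = [
    "http://www.oep.no/pub/report.xhtml?reportId=3",         # current URL (per the change above)
    "http://www.oep.no/pub/faces/statistikk.jsp?reposId=3",  # pre-change URL, kept as a fallback
]

datastorage = []
for url in candidate_urls:
    try:
        fetch_oep_deliverydates(url, datastorage)  # helper defined in the scraper above
        break  # first URL that answers wins
    except urllib2.HTTPError, e:
        print "Delivery date URL failed (%s): %s" % (url, e)

print datastorage
scraperwiki.sqlite.save(unique_keys=['agency', 'deliverydate'], data=datastorage)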