# -*- coding: utf-8 -*-
# YAML-tagger:
#  Type: kommune
#  Status: finished
#  Name: Oslo kommunes byrådsavdelinger og Rådhusets forvaltningstjeneste
#  Format: HTML
#  Datatype:
#  Vendor:
#  Run: daily
#  Missingfields: journalseqnr, journalyear, journalid
#  Publish duration: x months

import scraperwiki
import urllib
import urllib2
import lxml.html
import re
import resource
import dateutil.parser
import datetime
import sys
from dateutil.relativedelta import relativedelta

# Some example URLs
# http://byr-journal.cloudapp.net/Journal/SearchRelated?caseYear=2016&sequenceNumber=451
# http://byr-journal.cloudapp.net/Journal/Search?searchStringAdv=FromDate%3D01.09.2016,department%3DAll
# http://byr-journal.cloudapp.net/Journal/Search?searchStringAdv=FromDate%3D01.09.2016,ToDate%3D01.09.2016,department%3DAll
# http://byr-journal.cloudapp.net/Journal/Search/?querytype=and&offset=10
# http://byr-journal.cloudapp.net/Journal/Search/?querytype=and&FromDate=23.08.2016&ToDate=23.08.2016&offset=20
# http://byr-journal.cloudapp.net/Journal/Search?searchStringAdv=FromDate%3D23.08.2016,ToDate%3D23.08.2016,department%3DAll

scraperwiki.scrape("https://www.oslo.kommune.no/postjournal/")
postlistelib = scraperwiki.swimport('postliste-python-lib')

agency = u'Oslo kommune, Byrådsavdelingene'
baseurl = "http://www.oslo.kommune.no"

print "Fetching public journal for %s!" % agency

parser = postlistelib.JournalParser(agency=agency)

fieldmap = {
    'Dokumentdato'  : 'docdate',
    'Dokumenttype'  : 'doctype',
    'Sak'           : 'casedesc',
    'Journaldato'   : 'recorddate',
    'Dato'          : None, # Duplicate of recorddate
    'Saksansvarlig' : 'saksansvarligenhet',
    'Tilgangskode'  : 'exemption',
    'Fra'           : 'sender',
    'Til'           : 'recipient',
    'Til / Fra'     : None, # Internal note, field empty
}

class NoDataEntries(LookupError):
    pass

def cpu_spent():
    usage = resource.getrusage(resource.RUSAGE_SELF)
    return getattr(usage, 'ru_utime') + getattr(usage, 'ru_stime')

def cpu_available():
    available = resource.getrlimit(resource.RLIMIT_CPU)[0]
    # If no limit is set, assume 20 CPU seconds as the limit to avoid
    # running for more than a few minutes every time.
    if 0 > available:
        available = 20
    return available
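# The parser below relies on the following page structure, reconstructed from
# the CSS selectors it uses (an illustrative sketch only, not captured from
# the live site): each journal entry sits in a <div class="document-rows">,
# with the case reference and title in an <h3> under div.data-column, e.g.
# "2016/451-3 Some document title", and the remaining fields as <dl><dt>/<dd>
# pairs inside div.journal-recipients and div.journal-details.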
def parse_day_html(parser, datastore, dayurl, html):
    root = lxml.html.fromstring(html)
    count = 0
    for row in root.cssselect("div.document-rows"):
        count = count + 1
        data = {
            'agency' : parser.agency,
            'scrapedurl' : dayurl,
            'scrapestamputc' : datetime.datetime.now()
        }
        head = row.cssselect("div.data-column div h3")[0].text_content().strip()
        (arkivsaksref, docdesc) = head.split(" ", 1)
        data['docdesc'] = docdesc
        caseyear = 0
        caseseqnr = 0
        casedocseq = 0
        caseid = 'unknown'
        matchObj = re.match(r'(\d+)/(\d+)\s*-\s*(\d+)$', arkivsaksref, re.M|re.I)
        if matchObj:
            caseyear = int(matchObj.group(1))
            data['caseseqnr'] = int(matchObj.group(2))
            data['casedocseq'] = int(matchObj.group(3))
            data['caseyear'] = caseyear
            data['caseid'] = str(data['caseyear']) + "/" + str(data['caseseqnr'])
            data['arkivsaksref'] = arkivsaksref
        else:
            print "error: really broken Arkivsaksnr: %s" % arkivsaksref
            raise Exception("unable to parse url %s" % dayurl)
        for tagclass in ['journal-recipients', 'journal-details']:
            for d in row.cssselect("div.%s > dl" % tagclass):
                field = d.cssselect("dt")[0].text_content().strip()
                value = d.cssselect("dd")[0].text_content().strip()
                if field in fieldmap:
                    if fieldmap[field] is not None: # Ignore duplicates
                        field = fieldmap[field]
                else:
                    raise Exception("unknown field %s in %s" % (field, dayurl))
                if value and '' != value:
                    data[field] = value
        for field in ['docdate', 'recorddate']:
            if field in data:
                data[field] = dateutil.parser.parse(data[field], dayfirst=True).date()
        parser.verify_entry(data)
        datastore.append(data)
#        print data
    return count

def fetch_day(parser, day):
    datastore = []
    daystr = day.strftime('%d.%m.%Y')
    totalcount = 0
    try:
        offset = 0
        offsetstep = 10
        while True:
            dayurl = "http://byr-journal.cloudapp.net/Journal/Search/?querytype=and&FromDate=%s&ToDate=%s&offset=%d" % (daystr, daystr, offset)
            html = postlistelib.fetch_url_harder(dayurl).decode('utf-8')
#            print html
            count = parse_day_html(parser, datastore, dayurl, html)
            totalcount = totalcount + count
#            print count, dayurl
            if 0 == count:
#                print "Ending day at offset %d" % offset
                break
            offset = offset + offsetstep
        scraperwiki.sqlite.save(unique_keys=['arkivsaksref'], data=datastore)
        datastore = []
        return totalcount
    except scraperwiki.CPUTimeExceededError, e:
        print "error: Ran out of time, abort scraping"
        # Not saving, to avoid saving partial day.  Better to scrape
        # the entire day the next run.
        return 0
    except Exception, e:
#        print html
        print e
        raise

aday = datetime.timedelta(1) # one day delta
newest = None
try:
    newest = dateutil.parser.parse(scraperwiki.sqlite.select("max(recorddate) as max from swdata")[0]["max"], dayfirst=False).date()
    oldest = dateutil.parser.parse(scraperwiki.sqlite.select("min(recorddate) as min from swdata")[0]["min"], dayfirst=False).date()
except scraperwiki.sqlite.SqliteError:
    # Table not created yet, ignore the error
    pass

if not newest:
    # Bootstrap a month ago
    newest = datetime.datetime.today() - aday * 30
    oldest = newest
#print oldest, newest

skiplimit = 10
totalcount = 0
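# Worked example of the scan windows below (dates are illustrative only):
# with an empty database on 2016-09-23, the bootstrap above sets newest and
# oldest to 2016-08-24.  The forward loop then fetches 2016-08-24 through
# 2016-09-02 and the backward loop 2016-08-23 back through 2016-08-15, so
# each run extends the covered date range in both directions.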
# Look forward one week to at least get past the weekends, rescan the
# last day in case new records showed up in the meantime.  Next, scan
# backwards, one day before the oldest entry in the database.
for n in range(0, skiplimit, 1):
    day = newest + aday * n
#    print day
    totalcount = totalcount + fetch_day(parser, day)
    if cpu_spent() > (cpu_available() - 3):
        print "Running short on CPU time, exiting"
        sys.exit(0)

for n in range(-1, -skiplimit, -1):
    day = oldest + aday * n
#    print day
    totalcount = totalcount + fetch_day(parser, day)
    if cpu_spent() > (cpu_available() - 3):
        print "Running short on CPU time, exiting"
        sys.exit(0)

print "Fetched %d journal entries" % totalcount

# Need to rescan after a while to make sure we get the entries that
# take a while to show up when moving forward.  Idea: Revisit all days
# where the scrape date is less than 30 days after the record date,
# allowing records to change for 30 days until we stop rescraping
# them.  But wait 15 days before scraping again, to avoid scraping the
# same day over and over.
totalcount = 0
for drec in scraperwiki.sqlite.select("DISTINCT(recorddate) as d FROM swdata WHERE JULIANDAY(scrapestamputc) - JULIANDAY(recorddate) < 30 AND JULIANDAY('now') - JULIANDAY(scrapestamputc) > 15"):
    day = dateutil.parser.parse(drec['d'], dayfirst=False).date()
    print day
    totalcount = totalcount + fetch_day(parser, day)
    if cpu_spent() > (cpu_available() - 3):
        print "Running short on CPU time, exiting"
        sys.exit(0)

print "Rescanned %d journal entries" % totalcount
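# Worked example of the rescan window above (dates are illustrative only):
# an entry with recorddate 2016-08-20 first scraped on 2016-09-01 was scraped
# 12 days after its record date, so it stays eligible for rescanning.  It is
# left alone until 2016-09-16 (15 days after the last scrape) and is then
# revisited on later runs, until the gap between its scrape timestamp and its
# record date reaches 30 days and it drops out of the query.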