diff options
author | Petter Reinholdtsen <pere@hungry.com> | 2016-04-07 10:08:41 +0200 |
---|---|---|
committer | Petter Reinholdtsen <pere@hungry.com> | 2016-04-07 10:08:41 +0200 |
commit | b76474ec73f40d0e3b365a8be817610d9fb43d4e (patch) | |
tree | 6a47f33b2b69fdd3cb6f49e53b2a75e21b022b9a | |
parent | ce219f7d12170a80e9efa7acc2a21af3f26a3636 (diff) |
Add a copy of the old lazycache library.
-rwxr-xr-x | run-scraper | 2 | ||||
-rw-r--r-- | testlib/lazycache/lazycache.py | 46 |
2 files changed, 47 insertions, 1 deletion
diff --git a/run-scraper b/run-scraper index 45ed793..bed4eab 100755 --- a/run-scraper +++ b/run-scraper @@ -7,7 +7,7 @@ file=$1 topdir=$(cd $(dirname $0); pwd) -export PYTHONPATH=`pwd`/testlib/scraperwiki-python:`pwd`/testlib/dumptruck +export PYTHONPATH=`pwd`/testlib/scraperwiki-python:`pwd`/testlib/lazycache SCRAPERWIKI_DATABASE_NAME=sqlite:///$file.sqlite export SCRAPERWIKI_DATABASE_NAME diff --git a/testlib/lazycache/lazycache.py b/testlib/lazycache/lazycache.py new file mode 100644 index 0000000..1e52f3c --- /dev/null +++ b/testlib/lazycache/lazycache.py @@ -0,0 +1,46 @@ +import scraperwiki, urllib2, datetime, base64,time, logging + +# TODO: Implement time-based cache removal (some stubs in, 'age' and 'date' + +def lazycache(url,age=datetime.timedelta(1), verbose=False, delay=0): + # html is the data from the webpage; it might not be html. + + def scrapesave(url): + if verbose: print "Downloading %s"%url + try: + html=urllib2.urlopen(url).read() + except: + html=urllib2.urlopen(url).read() + scraperwiki.sqlite.save(table_name='__lazycache', data={'url':url,'payload':base64.b64encode(html),'date':datetime.datetime.now()}, unique_keys=['url'], verbose=0) + return html + + if ' ' in url: + logging.warn('URL "%s" contains spaces.'%url) + url=url.replace(' ','%20') + logging.warn('Using "%s"'%url) + try: + r=scraperwiki.sqlite.select("* from __lazycache where url=?", url, verbose=0) # attempt grab from database. 
+ except scraperwiki.sqlite.SqliteError, e: # if table doesn't exist, don't bother checking + if 'no such table: __lazycache' not in str(e): + raise + return scrapesave(url) + if len(r)>0: + if verbose: print "Cache hit for %s"%url + return base64.b64decode(r[0]['payload']) + else: + if verbose: print "No cache held for %s"%url + if delay: + if verbose: print "sleeping" + time.sleep(delay) + return scrapesave(url) + + + + +#scraperwiki.sqlite.execute("drop table __lazycache;") # reset to new condition +#html=lazycache('http://placekitten.com/g/200/300') +#html=lazycache('http://www.everything2.net') +#print html +#html=lazycache('http://www.everything2.net') +#print html + |