aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rwxr-xr-xrun-scraper2
-rw-r--r--testlib/lazycache/lazycache.py46
2 files changed, 47 insertions, 1 deletions
diff --git a/run-scraper b/run-scraper
index 45ed793..bed4eab 100755
--- a/run-scraper
+++ b/run-scraper
@@ -7,7 +7,7 @@ file=$1
topdir=$(cd $(dirname $0); pwd)
-export PYTHONPATH=`pwd`/testlib/scraperwiki-python:`pwd`/testlib/dumptruck
+export PYTHONPATH=`pwd`/testlib/scraperwiki-python:`pwd`/testlib/lazycache
SCRAPERWIKI_DATABASE_NAME=sqlite:///$file.sqlite
export SCRAPERWIKI_DATABASE_NAME
diff --git a/testlib/lazycache/lazycache.py b/testlib/lazycache/lazycache.py
new file mode 100644
index 0000000..1e52f3c
--- /dev/null
+++ b/testlib/lazycache/lazycache.py
@@ -0,0 +1,46 @@
+import scraperwiki, urllib2, datetime, base64,time, logging
+
+# TODO: Implement time-based cache removal (some stubs in, 'age' and 'date'
+
+def lazycache(url,age=datetime.timedelta(1), verbose=False, delay=0):
+ # html is the data from the webpage; it might not be html.
+
+ def scrapesave(url):
+ if verbose: print "Downloading %s"%url
+ try:
+ html=urllib2.urlopen(url).read()
+ except:
+ html=urllib2.urlopen(url).read()
+ scraperwiki.sqlite.save(table_name='__lazycache', data={'url':url,'payload':base64.b64encode(html),'date':datetime.datetime.now()}, unique_keys=['url'], verbose=0)
+ return html
+
+ if ' ' in url:
+ logging.warn('URL "%s" contains spaces.'%url)
+ url=url.replace(' ','%20')
+ logging.warn('Using "%s"'%url)
+ try:
+ r=scraperwiki.sqlite.select("* from __lazycache where url=?", url, verbose=0) # attempt grab from database.
+ except scraperwiki.sqlite.SqliteError, e: # if table doesn't exist, don't bother checking
+ if 'no such table: __lazycache' not in str(e):
+ raise
+ return scrapesave(url)
+ if len(r)>0:
+ if verbose: print "Cache hit for %s"%url
+ return base64.b64decode(r[0]['payload'])
+ else:
+ if verbose: print "No cache held for %s"%url
+ if delay:
+ if verbose: print "sleeping"
+ time.sleep(delay)
+ return scrapesave(url)
+
+
+
+
+#scraperwiki.sqlite.execute("drop table __lazycache;") # reset to new condition
+#html=lazycache('http://placekitten.com/g/200/300')
+#html=lazycache('http://www.everything2.net')
+#print html
+#html=lazycache('http://www.everything2.net')
+#print html
+