aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorAnders Einar Hilden <hildenae@gmail.com>2015-01-16 23:38:18 +0100
committerAnders Einar Hilden <hildenae@gmail.com>2015-01-16 23:38:50 +0100
commit8ac5d9ec8290e281fbcb6f7c7f7790d5c1317feb (patch)
treeb06b2a93d63f4d089930a409baa807a93a160b9b
parent2817125d916678acdedf2bd6822eefedcb719137 (diff)
Add scraper library for DMS2002 - Software Innovation. Currently a separate library, but might be merged with postliste-python-lib in the future
-rw-r--r--scrapersources/postliste-python-lib-pdf-dms2002-brokenpages.py41
1 file changed, 41 insertions, 0 deletions
diff --git a/scrapersources/postliste-python-lib-pdf-dms2002-brokenpages.py b/scrapersources/postliste-python-lib-pdf-dms2002-brokenpages.py
new file mode 100644
index 0000000..2d551e4
--- /dev/null
+++ b/scrapersources/postliste-python-lib-pdf-dms2002-brokenpages.py
@@ -0,0 +1,41 @@
+ # Last, try some of the broken pages again, in case we got support for handling them in the meantime
+ try:
+ # First, check if the table exist
+ scraperwiki.sqlite.execute("select * from " + self.brokenpagetable)
+
+ newtrystamp = datetime.datetime.now()
+ sqlselect = "* from " + self.brokenpagetable + " where failstamp is NULL or failstamp < '" + str(newtrystamp) + "'" + " limit 1"
+ try:
+ pageref = scraperwiki.sqlite.select(sqlselect)
+ except scraperwiki.sqlite.SqliteError, e:
+ scraperwiki.sqlite.execute("ALTER TABLE " + self.brokenpagetable + " ADD COLUMN failstamp")
+ scraperwiki.sqlite.commit()
+ pageref = scraperwiki.sqlite.select(sqlselect)
+
+ pagelimit = 10
+ while pageref and 0 < pagelimit:
+ pagelimit = pagelimit - 1
+ scrapedurl = pageref[0]['scrapedurl']
+ pagenum = pageref[0]['pagenum']
+ pagecontent = pageref[0]['pagecontent']
+# print "Found " + scrapedurl + " page " + str(pagenum) + " length " + str(len(pagecontent))
+ try:
+ sqldelete = "delete from " + self.brokenpagetable + " where scrapedurl = '" + scrapedurl + "' and pagenum = " + str(pagenum)
+ self.parse_page(scrapedurl, pagenum, pagecontent)
+# print "Trying to: " + sqldelete
+ scraperwiki.sqlite.execute(sqldelete)
+ except ValueError, e:
+ brokenpage = {
+ 'scrapedurl' : scrapedurl,
+ 'pagenum' : pagenum,
+ 'pagecontent' : pagecontent,
+ 'failstamp' : newtrystamp,
+ }
+
+ print "Still unsupported page %d from %s" % (pagenum, scrapedurl)
+ brokenpages = brokenpages + 1
+ scraperwiki.sqlite.save(unique_keys=['scrapedurl', 'pagenum'], data=brokenpage, table_name=self.brokenpagetable)
+ scraperwiki.sqlite.commit()
+ pageref = scraperwiki.sqlite.select(sqlselect)
+ except:
+ True # Ignore missing brokenpages table