 scrapersources/postliste-python-lib-pdf-dms2002-brokenpages.py | 41 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 41 insertions(+), 0 deletions(-)
diff --git a/scrapersources/postliste-python-lib-pdf-dms2002-brokenpages.py b/scrapersources/postliste-python-lib-pdf-dms2002-brokenpages.py
new file mode 100644
index 0000000..2d551e4
--- /dev/null
+++ b/scrapersources/postliste-python-lib-pdf-dms2002-brokenpages.py
@@ -0,0 +1,41 @@
+    # Last, try some of the broken pages again, in case we got support for handling them in the mean time
+    try:
+        # First, check if the table exist
+        scraperwiki.sqlite.execute("select * from " + self.brokenpagetable)
+
+        newtrystamp = datetime.datetime.now()
+        sqlselect = "* from " + self.brokenpagetable + " where failstamp is NULL or failstamp < '" + str(newtrystamp) + "'" + " limit 1"
+        try:
+            pageref = scraperwiki.sqlite.select(sqlselect)
+        except scraperwiki.sqlite.SqliteError, e:
+            scraperwiki.sqlite.execute("ALTER TABLE " + self.brokenpagetable + " ADD COLUMN failstamp")
+            scraperwiki.sqlite.commit()
+            pageref = scraperwiki.sqlite.select(sqlselect)
+
+        pagelimit = 10
+        while pageref and 0 < pagelimit:
+            pagelimit = pagelimit - 1
+            scrapedurl = pageref[0]['scrapedurl']
+            pagenum = pageref[0]['pagenum']
+            pagecontent = pageref[0]['pagecontent']
+#            print "Found " + scrapedurl + " page " + str(pagenum) + " length " + str(len(pagecontent))
+            try:
+                sqldelete = "delete from " + self.brokenpagetable + " where scrapedurl = '" + scrapedurl + "' and pagenum = " + str(pagenum)
+                self.parse_page(scrapedurl, pagenum, pagecontent)
+#                print "Trying to: " + sqldelete
+                scraperwiki.sqlite.execute(sqldelete)
+            except ValueError, e:
+                brokenpage = {
+                    'scrapedurl' : scrapedurl,
+                    'pagenum' : pagenum,
+                    'pagecontent' : pagecontent,
+                    'failstamp' : newtrystamp,
+                    }
+
+                print "Still unsupported page %d from %s" % (pagenum, scrapedurl)
+                brokenpages = brokenpages + 1
+                scraperwiki.sqlite.save(unique_keys=['scrapedurl', 'pagenum'], data=brokenpage, table_name=self.brokenpagetable)
+                scraperwiki.sqlite.commit()
+                pageref = scraperwiki.sqlite.select(sqlselect)
+    except:
+        True # Ignore missing brokenpages table
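
The loop above interpolates scrapedurl and the timestamp directly into SQL strings, which breaks as soon as a URL contains a single quote. Below is a minimal standalone sketch of the same retry pattern written against Python's standard sqlite3 module with parameterized queries; the table layout (scrapedurl, pagenum, pagecontent, failstamp) mirrors the diff, while the function name retry_broken_pages and the parse_page callback are assumptions made for illustration, not part of the ScraperWiki API.

    import datetime
    import sqlite3

    def retry_broken_pages(conn, parse_page, table="brokenpages", limit=10):
        # Re-parse up to `limit` previously failed pages.  Rows that now
        # parse are deleted; rows that still fail get a fresh failstamp so
        # they are skipped for the rest of this run.
        # Note: a table name cannot be bound as a parameter, so `table`
        # must come from trusted code, never from scraped input.
        newtrystamp = datetime.datetime.now().isoformat()
        conn.row_factory = sqlite3.Row
        for _ in range(limit):
            row = conn.execute(
                "SELECT scrapedurl, pagenum, pagecontent FROM " + table +
                " WHERE failstamp IS NULL OR failstamp < ? LIMIT 1",
                (newtrystamp,)).fetchone()
            if row is None:
                break  # no more broken pages eligible for a retry
            try:
                parse_page(row["scrapedurl"], row["pagenum"], row["pagecontent"])
            except ValueError:
                # Still unsupported: record the attempt instead of deleting.
                conn.execute(
                    "UPDATE " + table + " SET failstamp = ?"
                    " WHERE scrapedurl = ? AND pagenum = ?",
                    (newtrystamp, row["scrapedurl"], row["pagenum"]))
            else:
                conn.execute(
                    "DELETE FROM " + table +
                    " WHERE scrapedurl = ? AND pagenum = ?",
                    (row["scrapedurl"], row["pagenum"]))
            conn.commit()

Stamping failures with the timestamp captured before the loop means an updated row no longer matches the failstamp < ? filter, so the selection naturally advances to the next broken page; this is the same trick the original code uses with its newtrystamp variable.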