author     Anders Einar Hilden <hildenae@gmail.com>    2015-01-16 23:38:18 +0100
committer  Anders Einar Hilden <hildenae@gmail.com>    2015-01-16 23:38:50 +0100
commit     8ac5d9ec8290e281fbcb6f7c7f7790d5c1317feb (patch)
tree       b06b2a93d63f4d089930a409baa807a93a160b9b
parent     2817125d916678acdedf2bd6822eefedcb719137 (diff)
Add scraper library for DMS2002 - Software Innovation. Currently a separate library, but might be merged with postliste-python-lib in the future
-rw-r--r--  scrapersources/postliste-python-lib-pdf-dms2002-brokenpages.py  41
1 file changed, 41 insertions(+), 0 deletions(-)
diff --git a/scrapersources/postliste-python-lib-pdf-dms2002-brokenpages.py b/scrapersources/postliste-python-lib-pdf-dms2002-brokenpages.py
new file mode 100644
index 0000000..2d551e4
--- /dev/null
+++ b/scrapersources/postliste-python-lib-pdf-dms2002-brokenpages.py
@@ -0,0 +1,41 @@
+# Last, try some of the broken pages again, in case we got support for handling them in the mean time
+try:
+    # First, check if the table exist
+    scraperwiki.sqlite.execute("select * from " + self.brokenpagetable)
+
+    newtrystamp = datetime.datetime.now()
+    sqlselect = "* from " + self.brokenpagetable + " where failstamp is NULL or failstamp < '" + str(newtrystamp) + "'" + " limit 1"
+    try:
+        pageref = scraperwiki.sqlite.select(sqlselect)
+    except scraperwiki.sqlite.SqliteError, e:
+        scraperwiki.sqlite.execute("ALTER TABLE " + self.brokenpagetable + " ADD COLUMN failstamp")
+        scraperwiki.sqlite.commit()
+        pageref = scraperwiki.sqlite.select(sqlselect)
+
+    pagelimit = 10
+    while pageref and 0 < pagelimit:
+        pagelimit = pagelimit - 1
+        scrapedurl = pageref[0]['scrapedurl']
+        pagenum = pageref[0]['pagenum']
+        pagecontent = pageref[0]['pagecontent']
+#        print "Found " + scrapedurl + " page " + str(pagenum) + " length " + str(len(pagecontent))
+        try:
+            sqldelete = "delete from " + self.brokenpagetable + " where scrapedurl = '" + scrapedurl + "' and pagenum = " + str(pagenum)
+            self.parse_page(scrapedurl, pagenum, pagecontent)
+#            print "Trying to: " + sqldelete
+            scraperwiki.sqlite.execute(sqldelete)
+        except ValueError, e:
+            brokenpage = {
+                'scrapedurl' : scrapedurl,
+                'pagenum' : pagenum,
+                'pagecontent' : pagecontent,
+                'failstamp' : newtrystamp,
+            }
+
+            print "Still unsupported page %d from %s" % (pagenum, scrapedurl)
+            brokenpages = brokenpages + 1
+            scraperwiki.sqlite.save(unique_keys=['scrapedurl', 'pagenum'], data=brokenpage, table_name=self.brokenpagetable)
+            scraperwiki.sqlite.commit()
+        pageref = scraperwiki.sqlite.select(sqlselect)
+except:
+    True # Ignore missing brokenpages table
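
The pattern in this commit, in brief: pages that fail to parse are parked in a brokenpages table, and each run re-selects rows whose failstamp is NULL or older than the current attempt, deleting rows that now parse and re-stamping the ones that still fail. Below is a minimal, self-contained Python 3 sketch of that retry loop. It is an illustration under stated assumptions, not the library's actual code: it swaps the ScraperWiki sqlite wrapper for the standard-library sqlite3 module with parameterized queries, and parse_page() here is a hypothetical stand-in that raises ValueError on unsupported pages. The table and column names (brokenpages, scrapedurl, pagenum, pagecontent, failstamp) come from the diff above.

import datetime
import sqlite3

def parse_page(scrapedurl, pagenum, pagecontent):
    # Hypothetical stand-in for the real parser; raises ValueError on
    # page layouts it does not support yet.
    raise ValueError("unsupported page layout")

def retry_broken_pages(conn, table="brokenpages", pagelimit=10):
    # Retry up to pagelimit previously broken pages, in case support
    # for their layout has been added in the meantime.
    try:
        # Bail out quietly if the broken-pages table was never created.
        conn.execute("SELECT 1 FROM %s LIMIT 1" % table)
    except sqlite3.OperationalError:
        return
    try:
        # Older table versions may lack the failstamp column; add it.
        conn.execute("ALTER TABLE %s ADD COLUMN failstamp" % table)
    except sqlite3.OperationalError:
        pass  # Column already exists.
    newtrystamp = datetime.datetime.now().isoformat()
    for _ in range(pagelimit):
        # ISO timestamps compare correctly as strings, so this picks only
        # rows not yet retried during this run.
        row = conn.execute(
            "SELECT scrapedurl, pagenum, pagecontent FROM %s"
            " WHERE failstamp IS NULL OR failstamp < ? LIMIT 1" % table,
            (newtrystamp,)).fetchone()
        if row is None:
            break
        scrapedurl, pagenum, pagecontent = row
        try:
            parse_page(scrapedurl, pagenum, pagecontent)
            # The page parses now; forget it.
            conn.execute(
                "DELETE FROM %s WHERE scrapedurl = ? AND pagenum = ?" % table,
                (scrapedurl, pagenum))
        except ValueError:
            # Still unsupported; stamp it so it is skipped until the next run.
            print("Still unsupported page %d from %s" % (pagenum, scrapedurl))
            conn.execute(
                "UPDATE %s SET failstamp = ? WHERE scrapedurl = ? AND pagenum = ?" % table,
                (newtrystamp, scrapedurl, pagenum))
        conn.commit()

Stamping failures with the run's own timestamp, rather than deleting or permanently skipping them, is what keeps the loop finite: the SELECT only matches rows whose failstamp predates the current attempt, so each broken page is retried at most once per run.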