diff options
author | Petter Reinholdtsen <pere@hungry.com> | 2016-04-06 15:57:10 +0200 |
---|---|---|
committer | Petter Reinholdtsen <pere@hungry.com> | 2016-04-06 15:57:10 +0200 |
commit | 731b44075054279cd352c48181cd378f5f69d596 (patch) | |
tree | d5ffe32d324e4b99bb39f0303cef50786b00d4b8 | |
parent | 87ea4374450fbf3ead4c9cbb946d4be5801931e2 (diff) |
Remove draft replaced by scraperwiki-python.
-rw-r--r-- | testlib/scraperwiki.py | 39 |
1 files changed, 0 insertions, 39 deletions
```diff
diff --git a/testlib/scraperwiki.py b/testlib/scraperwiki.py
deleted file mode 100644
index 0d7466e..0000000
--- a/testlib/scraperwiki.py
+++ /dev/null
@@ -1,39 +0,0 @@
-# The real version is available from
-# https://bitbucket.org/ScraperWiki/scraperwiki/src/85cbc82c32f2/scraperlibs/python/scraperwiki/
-
-import tempfile
-import urllib2
-import os
-
-def scrape(url):
-    print "Scraping %s" % url
-    if -1 != url.find("file://"):
-        f = open(url.replace("file://", ""), "r")
-        content = f.read()
-        f.close()
-        return content
-    else:
-        response = urllib2.urlopen(url)
-        html = response.read()
-        return html
-
-def pdftoxml(pdfdata, options=''):
-    """converts pdf file to xml file"""
-    pdffout = tempfile.NamedTemporaryFile(suffix='.pdf')
-    pdffout.write(pdfdata)
-    pdffout.flush()
-
-    xmlin = tempfile.NamedTemporaryFile(mode='r', suffix='.xml')
-    tmpxml = xmlin.name # "temph.xml"
-    cmd = '/usr/bin/pdftohtml -xml -nodrm -zoom 1.5 -enc UTF-8 -noframes %s "%s" "%s"' % (options, pdffout.name, os.path.splitext(tmpxml)[0])
-    cmd = cmd + " >/dev/null 2>&1" # can't turn off output, so throw away even stderr yeuch
-    os.system(cmd)
-
-    pdffout.close()
-    #xmlfin = open(tmpxml)
-    xmldata = xmlin.read()
-    xmlin.close()
-    return xmldata
-
-def swimport(scrapername):
-    return None
```