diff options
author | Petter Reinholdtsen <pere@hungry.com> | 2012-07-13 17:45:44 +0200 |
---|---|---|
committer | Petter Reinholdtsen <pere@hungry.com> | 2012-07-13 17:45:44 +0200 |
commit | 7a3c32bc67692241dc1ff4cbe5fa627966b0e19c (patch) | |
tree | 2757e766dc564c2205ea7fdcef4ce46568d31a49 /testlib/scraperwiki.py | |
parent | d6b3d17b5489827cecd628545b7a416026290968 (diff) |
Get pdftoxml working.
Diffstat (limited to 'testlib/scraperwiki.py')
-rw-r--r-- | testlib/scraperwiki.py | 24 |
1 files changed, 22 insertions, 2 deletions
diff --git a/testlib/scraperwiki.py b/testlib/scraperwiki.py
index 901acec..0d7466e 100644
--- a/testlib/scraperwiki.py
+++ b/testlib/scraperwiki.py
@@ -1,4 +1,9 @@
+# The real version is available from
+# https://bitbucket.org/ScraperWiki/scraperwiki/src/85cbc82c32f2/scraperlibs/python/scraperwiki/
+
+import tempfile
 import urllib2
+import os
 
 def scrape(url):
     print "Scraping %s" % url
@@ -12,8 +17,23 @@ def scrape(url):
     html = response.read()
     return html
 
-def pdftoxml(pdfcontent, options):
-    return pdfcontent
+def pdftoxml(pdfdata, options=''):
+    """converts pdf file to xml file"""
+    pdffout = tempfile.NamedTemporaryFile(suffix='.pdf')
+    pdffout.write(pdfdata)
+    pdffout.flush()
+
+    xmlin = tempfile.NamedTemporaryFile(mode='r', suffix='.xml')
+    tmpxml = xmlin.name # "temph.xml"
+    cmd = '/usr/bin/pdftohtml -xml -nodrm -zoom 1.5 -enc UTF-8 -noframes %s "%s" "%s"' % (options, pdffout.name, os.path.splitext(tmpxml)[0])
+    cmd = cmd + " >/dev/null 2>&1" # can't turn off output, so throw away even stderr yeuch
+    os.system(cmd)
+
+    pdffout.close()
+    #xmlfin = open(tmpxml)
+    xmldata = xmlin.read()
+    xmlin.close()
+    return xmldata
 
 def swimport(scrapername):
     return None