aboutsummaryrefslogtreecommitdiffstats
path: root/testlib/scraperwiki.py
diff options
context:
space:
mode:
authorPetter Reinholdtsen <pere@hungry.com>2012-07-13 17:45:44 +0200
committerPetter Reinholdtsen <pere@hungry.com>2012-07-13 17:45:44 +0200
commit7a3c32bc67692241dc1ff4cbe5fa627966b0e19c (patch)
tree2757e766dc564c2205ea7fdcef4ce46568d31a49 /testlib/scraperwiki.py
parentd6b3d17b5489827cecd628545b7a416026290968 (diff)
Get pdftoxml working.
Diffstat (limited to 'testlib/scraperwiki.py')
-rw-r--r--testlib/scraperwiki.py24
1 files changed, 22 insertions, 2 deletions
diff --git a/testlib/scraperwiki.py b/testlib/scraperwiki.py
index 901acec..0d7466e 100644
--- a/testlib/scraperwiki.py
+++ b/testlib/scraperwiki.py
@@ -1,4 +1,9 @@
+# The real version is available from
+# https://bitbucket.org/ScraperWiki/scraperwiki/src/85cbc82c32f2/scraperlibs/python/scraperwiki/
+
+import tempfile
import urllib2
+import os
def scrape(url):
print "Scraping %s" % url
@@ -12,8 +17,23 @@ def scrape(url):
html = response.read()
return html
-def pdftoxml(pdfcontent, options):
- return pdfcontent
+def pdftoxml(pdfdata, options=''): # runs /usr/bin/pdftohtml on pdfdata; returns what is read back from the .xml temp file
+ """converts pdf file to xml file"""
+ pdffout = tempfile.NamedTemporaryFile(suffix='.pdf') # temp file that holds the PDF input for the external command
+ pdffout.write(pdfdata)
+ pdffout.flush() # push buffered data out to the file; the command opens it by name
+ # NOTE(review): options is interpolated unquoted into a shell command below -- only pass trusted values
+ xmlin = tempfile.NamedTemporaryFile(mode='r', suffix='.xml') # opened up front to reserve the output path; read after the command has run
+ tmpxml = xmlin.name # "temph.xml"
+ cmd = '/usr/bin/pdftohtml -xml -nodrm -zoom 1.5 -enc UTF-8 -noframes %s "%s" "%s"' % (options, pdffout.name, os.path.splitext(tmpxml)[0]) # output argument is tmpxml minus its extension; presumably pdftohtml re-adds ".xml" -- confirm
+ cmd = cmd + " >/dev/null 2>&1" # output can't be turned off, so discard stdout and stderr alike
+ os.system(cmd) # exit status is ignored, so a failed conversion is not reported here
+
+ pdffout.close() # closing a NamedTemporaryFile removes it (delete defaults to True)
+ #xmlfin = open(tmpxml)
+ xmldata = xmlin.read() # presumably the XML the command wrote to tmpxml
+ xmlin.close() # also removes the .xml temp file
+ return xmldata
def swimport(scrapername): # placeholder: scrapername is unused
return None # always None; nothing is imported