about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author Petter Reinholdtsen <pere@hungry.com> 2012-07-21 23:49:50 +0200
committer Petter Reinholdtsen <pere@hungry.com> 2012-07-21 23:49:50 +0200
commit 0543e2d0e68d0742052371d74dfd6600816b7c46 (patch)
tree b208c92c6520931b06188c08ad3d5ca6ef3a1946
parent 7778a476effe04f6e7251f20665acf0f848a8f5a (diff)
Updated from scraperwiki.
-rw-r--r-- scrapersources/list-nuug-postliste-scrapers 35
-rw-r--r-- scrapersources/postliste-oep 2
-rw-r--r-- scrapersources/postliste-python-lib-doculive 10
3 files changed, 33 insertions, 14 deletions
diff --git a/scrapersources/list-nuug-postliste-scrapers b/scrapersources/list-nuug-postliste-scrapers
index 67c4158..709a220 100644
--- a/scrapersources/list-nuug-postliste-scrapers
+++ b/scrapersources/list-nuug-postliste-scrapers
@@ -1,5 +1,4 @@
-import os
-import urlparse
+import os, urlparse, cgi
urlquery = os.getenv('URLQUERY')
if urlquery:
@@ -18,7 +17,8 @@ print '''<html>
<head>
<link rel="stylesheet" href="https://views.scraperwiki.com/run/jquery-tablesorter/?file=style-blue.css" type="text/css" />
<script type="text/javascript" src="https://views.scraperwiki.com/run/jquery-tablesorter/?file=jquery-1-4-2-min.js"></script>
-<script type="text/javascript" src="https://views.scraperwiki.com/run/jquery-tablesorter/?file=jquery.tablesorter.2-0-5.min.js"></script>'''
+<script type="text/javascript" src="https://views.scraperwiki.com/run/jquery-tablesorter/?file=jquery.tablesorter.2-0-5.min.js"></script>
+'''
print '''</head><body>
<p>This view lists scrapers with yaml-combatible comments (containing the string "nuug-postliste-endyaml" like the following in their description
@@ -35,10 +35,10 @@ Run: daily&lt;br&gt;
</pre></p>
<table id="myTable" class="tablesorter">'''
-print '<thead><tr><th>Name</th><th>type</th><th>status</th><th>schedule</th><th>format</th><th>datatype</th><th>URL</th></tr></thead><tbody>'
+print '<thead><tr><th>Name</th><th>type</th><th>status</th><th>schedule</th><th>format</th><th>datatype</th><th>created</th><th>URL</th></tr></thead><tbody>'
counter = {}
for scraper in json_data:
- print scraper
+ #print "<!-- %s -->" % cgi.escape("%s" % scraper)
comment = re.findall(r'<!-- nuug-postliste-yaml -->(.*)<!-- nuug-postliste-endyaml -->',
scraper['description'], re.DOTALL)
assert len(comment) == 1
@@ -59,8 +59,8 @@ for scraper in json_data:
else: Type = 'unknown'
- print '<tr><td>%s</td><td>%s</td><td>%s</td><td>%s</td><td>%s</td><td>%s</td><td><a href="https://scraperwiki.com/scrapers/%s/">URL</a></td></tr>' % \
- (data['Name'],data['Type'],data['Status'], Run, Format, Type, scraper['short_name'])
+ print '<tr><td>%s</td><td>%s</td><td>%s</td><td>%s</td><td>%s</td><td>%s</td><td>%s</td><td><a href="https://scraperwiki.com/scrapers/%s/">URL</a></td></tr>' % \
+ (data['Name'],data['Type'],data['Status'], Run, Format, Type, scraper['created'], scraper['short_name'])
print '''</tbody></table><table id="myTable2" class="tablesorter"><thead><tr><th>type</th><th>count</th></tr></thead><tbody>'''
for key in counter:
@@ -78,13 +78,30 @@ try:
print "<tr><td>Fylkeskommune</td><td>%.2f%% (%d av %d)</td></tr>" % \
((float(counter['fylkeskommune'])/float(num_fylke))*100, counter['fylkeskommune'], num_fylke)
except KeyError: pass
+#http://stackoverflow.com/questions/7561026/jquery-tablesorter-parser-for-datetime-in-mm-dd-yyyy-hhmi-am-format
+#http://stackoverflow.com/questions/1707840/date-sorting-problem-with-jquery-tablesorter
print '''</tbody></table>
<script type="text/javascript">
$(document).ready(function()
{
- $("#myTable").tablesorter();
- $("#myTable2").tablesorter();
+ $("#myTable").tablesorter(
+ {
+ debug: true,
+ headers:
+ {
+ 6 : { sorter: "text" },
+ 7: {sorter: false}
+ }
+ }
+ );
+ //$("#myTable2").tablesorter();
}
);
+
+$(function() {
+
+});
+
+
</script>
</body></html>'''
\ No newline at end of file
diff --git a/scrapersources/postliste-oep b/scrapersources/postliste-oep
index 735d0a7..360ab91 100644
--- a/scrapersources/postliste-oep
+++ b/scrapersources/postliste-oep
@@ -195,7 +195,7 @@ def fetch_range(first, last, step):
skipped = skipped + 1
if skipped == myskiplimit and myskiplimit == skiplimit:
tmp = []
- for limit in [250, 500, 800, 1000, 1200, 1500, 1700, 2000, 3000, 5000]:
+ for limit in [250, 500, 800, 1000, 1200, 1500, 1700, 2000, 3000, 5000, 7000]:
testid = id + limit * step
if -1 != fetch_oep_entry(testid, tmp):
print "Looking "+str(limit)+" ahead, found " + url_from_id(testid)
diff --git a/scrapersources/postliste-python-lib-doculive b/scrapersources/postliste-python-lib-doculive
index 4907b48..102d3f1 100644
--- a/scrapersources/postliste-python-lib-doculive
+++ b/scrapersources/postliste-python-lib-doculive
@@ -147,12 +147,13 @@ class PDFJournalParser(JournalParser):
for t in s.findAll('text'):
if t.text != " ":
# if self.debug:
-# print t.text
+# print "'%s'" % t.text
if 'Innhold:' == t.text: # type 1 or 2 (ePhorge)
s = None
return True
if 'Arkivdel:' == t.text or 'Notater (X):' == t.text: # type 3 (doculive)
s = None
+ print "Found doculive (type 3)"
return True
s = None
if self.debug:
@@ -173,8 +174,8 @@ class PDFJournalParser(JournalParser):
else:
options = ''
xml=scraperwiki.pdftoxml(pdfcontent, options)
- if self.debug:
- print xml
+# if self.debug:
+# print xml
pages=re.findall('(<page .+?</page>)',xml,flags=re.DOTALL)
xml=None
# print pages[:1][:1000]
@@ -694,7 +695,7 @@ def test_parser():
parser.debug = True
for url in [ #"http://www.stortinget.no/Global/pdf/postjournal/pj-2011-06-23.pdf",
"http://www.radhusets-forvaltningstjeneste.oslo.kommune.no/getfile.php/rÄdhusets%20forvaltningstjeneste%20(RFT)/Intranett%20(RFT)/Dokumenter/Postjournal/11%20November/29112011.pdf",
- ]:
+ ]:
pdfcontent = scraperwiki.scrape(url)
parser.preprocess(url,pdfcontent)
parser.process_pages()
@@ -702,3 +703,4 @@ def test_parser():
if __name__ == "scraper" or __name__ == '__main__':
test_parser()
# fieldlist()
+