diff options
-rw-r--r-- | scrapersources/list-nuug-postliste-scrapers | 48 |
1 files changed, 22 insertions, 26 deletions
diff --git a/scrapersources/list-nuug-postliste-scrapers b/scrapersources/list-nuug-postliste-scrapers index 709a220..462422e 100644 --- a/scrapersources/list-nuug-postliste-scrapers +++ b/scrapersources/list-nuug-postliste-scrapers @@ -1,26 +1,15 @@ -import os, urlparse, cgi -urlquery = os.getenv('URLQUERY') - -if urlquery: - querydata = urlparse.parse_qsl(urlquery); - for pair in querydata: - if pair[0] == "js" and pair[1] == "jquery.js": - print 'js-sourcecode' - exit(0) - -import urllib2, json, re +import json +import re import yaml +import glob + -url = "https://api.scraperwiki.com/api/1.0/scraper/search?format=jsondict&maxrows=200&searchquery=nuug-postliste-endyaml" -json_data = json.load(urllib2.urlopen(url)) print '''<html> <head> <link rel="stylesheet" href="https://views.scraperwiki.com/run/jquery-tablesorter/?file=style-blue.css" type="text/css" /> <script type="text/javascript" src="https://views.scraperwiki.com/run/jquery-tablesorter/?file=jquery-1-4-2-min.js"></script> <script type="text/javascript" src="https://views.scraperwiki.com/run/jquery-tablesorter/?file=jquery.tablesorter.2-0-5.min.js"></script> -''' - -print '''</head><body> +</head><body> <p>This view lists scrapers with yaml-combatible comments (containing the string "nuug-postliste-endyaml" like the following in their description <pre> <!-- nuug-postliste-yaml --> @@ -33,16 +22,23 @@ Datatype: ePhorte<br> Run: daily<br> <!-- nuug-postliste-endyaml --> </pre></p> -<table id="myTable" class="tablesorter">''' +<table id="myTable" class="tablesorter"> +<thead><tr><th>Name</th><th>type</th><th>status</th><th>schedule</th><th>format</th><th>datatype</th><th>created</th><th>URL</th></tr></thead><tbody> +''' -print '<thead><tr><th>Name</th><th>type</th><th>status</th><th>schedule</th><th>format</th><th>datatype</th><th>created</th><th>URL</th></tr></thead><tbody>' counter = {} -for scraper in json_data: - #print "<!-- %s -->" % cgi.escape("%s" % scraper) - comment = re.findall(r'<!-- nuug-postliste-yaml -->(.*)<!-- nuug-postliste-endyaml -->', - scraper['description'], re.DOTALL) - assert len(comment) == 1 - data = yaml.load(comment[0].strip().replace('<br>','')) +for scrapername in glob.glob("scrapersources/postlist*"): +# print scrapername + with open(scrapername, 'r') as scraperfile: + data = scraperfile.read() + if -1 == data.find("YAML-tagger:"): + continue + data = re.sub(r"\n\n.*", "", data, flags=re.DOTALL) + data = re.sub("^.*YAML-tagger:\n", "", data, flags=re.DOTALL) + data = data.replace("# ", "") +# print data + + data = yaml.load(data) if data['Type'] in counter: counter[data['Type']] = counter[data['Type']] + 1 @@ -60,7 +56,7 @@ for scraper in json_data: print '<tr><td>%s</td><td>%s</td><td>%s</td><td>%s</td><td>%s</td><td>%s</td><td>%s</td><td><a href="https://scraperwiki.com/scrapers/%s/">URL</a></td></tr>' % \ - (data['Name'],data['Type'],data['Status'], Run, Format, Type, scraper['created'], scraper['short_name']) + (data['Name'],data['Type'],data['Status'], Run, Format, Type, "unknown", scrapername) print '''</tbody></table><table id="myTable2" class="tablesorter"><thead><tr><th>type</th><th>count</th></tr></thead><tbody>''' for key in counter: @@ -104,4 +100,4 @@ $(function() { </script> -</body></html>'''
\ No newline at end of file +</body></html>''' |