From 94168caddb231774892d3d707b3313685d0d2463 Mon Sep 17 00:00:00 2001
From: Petter Reinholdtsen
Date: Thu, 6 Oct 2016 21:03:37 +0200
Subject: Get script working on local files instead of using scraperwiki.

---
 scrapersources/list-nuug-postliste-scrapers | 48 +++++++++++++----------------
 1 file changed, 22 insertions(+), 26 deletions(-)

diff --git a/scrapersources/list-nuug-postliste-scrapers b/scrapersources/list-nuug-postliste-scrapers
index 709a220..462422e 100644
--- a/scrapersources/list-nuug-postliste-scrapers
+++ b/scrapersources/list-nuug-postliste-scrapers
@@ -1,26 +1,15 @@
-import os, urlparse, cgi
-urlquery = os.getenv('URLQUERY')
-
-if urlquery:
-    querydata = urlparse.parse_qsl(urlquery);
-    for pair in querydata:
-        if pair[0] == "js" and pair[1] == "jquery.js":
-            print 'js-sourcecode'
-            exit(0)
-
-import urllib2, json, re
+import json
+import re
 import yaml
+import glob
+
 
-url = "https://api.scraperwiki.com/api/1.0/scraper/search?format=jsondict&maxrows=200&searchquery=nuug-postliste-endyaml"
-json_data = json.load(urllib2.urlopen(url))
 print '''
-'''
-
-print '''
+
 
 This view lists scrapers with yaml-combatible comments (containing the string "nuug-postliste-endyaml" like the following in their description
 
  <!-- nuug-postliste-yaml -->
@@ -33,16 +22,23 @@ Datatype: ePhorte<br>
 Run: daily<br>
 <!-- nuug-postliste-endyaml -->
 
-'''
+
+<table><tr><th>Name</th><th>type</th><th>status</th><th>schedule</th><th>format</th><th>datatype</th><th>created</th><th>URL</th></tr>
+'''
 
-print '<table>'
 counter = {}
-for scraper in json_data:
-    #print "" % cgi.escape("%s" % scraper)
-    comment = re.findall(r'<!-- nuug-postliste-yaml -->(.*)<!-- nuug-postliste-endyaml -->',
-                         scraper['description'], re.DOTALL)
-    assert len(comment) == 1
-    data = yaml.load(comment[0].strip().replace('<br>',''))
+for scrapername in glob.glob("scrapersources/postlist*"):
+#    print scrapername
+    with open(scrapername, 'r') as scraperfile:
+        data = scraperfile.read()
+    if -1 == data.find("YAML-tagger:"):
+        continue
+    data = re.sub(r"\n\n.*", "", data, flags=re.DOTALL)
+    data = re.sub("^.*YAML-tagger:\n", "", data, flags=re.DOTALL)
+    data = data.replace("# ", "")
+#    print data
+
+    data = yaml.load(data)
 
     if data['Type'] in counter:
         counter[data['Type']] = counter[data['Type']] + 1
@@ -60,7 +56,7 @@ for scraper in json_data:
     print '<tr><td>%s</td><td>%s</td><td>%s</td><td>%s</td><td>%s</td><td>%s</td><td>%s</td><td><a href="%s">URL</a></td></tr>' % \
-        (data['Name'],data['Type'],data['Status'], Run, Format, Type, scraper['created'], scraper['short_name'])
+        (data['Name'],data['Type'],data['Status'], Run, Format, Type, "unknown", scrapername)
 print '''</table>
 <table><tr><th>type</th><th>count</th></tr>'''
 for key in counter:
@@ -104,4 +100,4 @@ $(function() {
-'''
\ No newline at end of file
+'''
--
cgit v1.2.3
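
For reference, the tag-extraction steps added in the second hunk can be tried on their own. The sketch below is a loose Python 3 rendition and is not part of the patch: it uses yaml.safe_load where the script uses yaml.load, the sample source text and its field values are invented, and the helper name extract_yaml_tag is made up; only the "YAML-tagger:" marker, the regular expressions, the "# " stripping and the key names (Name, Type and Status from the loop, Format, Datatype and Run from the example block) are taken from the patch.

import re
import yaml

# Hypothetical scraper source; only the leading comment block carrying the
# YAML-tagger marker matters for the extraction.
SAMPLE_SOURCE = """\
# Made-up scraper header, values below are for illustration only.
# YAML-tagger:
# Name: Example kommune
# Type: ePhorte
# Datatype: ePhorte
# Format: PDF
# Run: daily
# Status: finished

print("rest of the scraper source would follow here")
"""

def extract_yaml_tag(source):
    """Return the parsed YAML-tagger block of a scraper source, or None."""
    if -1 == source.find("YAML-tagger:"):
        return None  # untagged scraper, skip it
    # Keep only the leading comment block (everything before the first blank line).
    block = re.sub(r"\n\n.*", "", source, flags=re.DOTALL)
    # Drop everything up to and including the YAML-tagger marker line.
    block = re.sub(r"^.*YAML-tagger:\n", "", block, flags=re.DOTALL)
    # Strip the comment prefixes so plain YAML remains.
    block = block.replace("# ", "")
    return yaml.safe_load(block)

if __name__ == "__main__":
    tag = extract_yaml_tag(SAMPLE_SOURCE)
    print(tag['Name'], tag['Type'], tag['Status'])

In the patched view the dictionaries produced this way are counted per data['Type'] and printed as rows of the HTML table, with "unknown" in the created column and the local file name in place of the old scraperwiki short_name.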