diff options
author | Petter Reinholdtsen <pere@hungry.com> | 2012-07-21 23:49:50 +0200 |
---|---|---|
committer | Petter Reinholdtsen <pere@hungry.com> | 2012-07-21 23:49:50 +0200 |
commit | 0543e2d0e68d0742052371d74dfd6600816b7c46 (patch) | |
tree | b208c92c6520931b06188c08ad3d5ca6ef3a1946 | |
parent | 7778a476effe04f6e7251f20665acf0f848a8f5a (diff) |
Updated from scraperwiki.
-rw-r--r-- | scrapersources/list-nuug-postliste-scrapers | 35 |
-rw-r--r-- | scrapersources/postliste-oep | 2 |
-rw-r--r-- | scrapersources/postliste-python-lib-doculive | 10 |
3 files changed, 33 insertions, 14 deletions
diff --git a/scrapersources/list-nuug-postliste-scrapers b/scrapersources/list-nuug-postliste-scrapers index 67c4158..709a220 100644 --- a/scrapersources/list-nuug-postliste-scrapers +++ b/scrapersources/list-nuug-postliste-scrapers @@ -1,5 +1,4 @@ -import os -import urlparse +import os, urlparse, cgi urlquery = os.getenv('URLQUERY') if urlquery: @@ -18,7 +17,8 @@ print '''<html> <head> <link rel="stylesheet" href="https://views.scraperwiki.com/run/jquery-tablesorter/?file=style-blue.css" type="text/css" /> <script type="text/javascript" src="https://views.scraperwiki.com/run/jquery-tablesorter/?file=jquery-1-4-2-min.js"></script> -<script type="text/javascript" src="https://views.scraperwiki.com/run/jquery-tablesorter/?file=jquery.tablesorter.2-0-5.min.js"></script>''' +<script type="text/javascript" src="https://views.scraperwiki.com/run/jquery-tablesorter/?file=jquery.tablesorter.2-0-5.min.js"></script> +''' print '''</head><body> <p>This view lists scrapers with yaml-combatible comments (containing the string "nuug-postliste-endyaml" like the following in their description @@ -35,10 +35,10 @@ Run: daily<br> </pre></p> <table id="myTable" class="tablesorter">''' -print '<thead><tr><th>Name</th><th>type</th><th>status</th><th>schedule</th><th>format</th><th>datatype</th><th>URL</th></tr></thead><tbody>' +print '<thead><tr><th>Name</th><th>type</th><th>status</th><th>schedule</th><th>format</th><th>datatype</th><th>created</th><th>URL</th></tr></thead><tbody>' counter = {} for scraper in json_data: - print scraper + #print "<!-- %s -->" % cgi.escape("%s" % scraper) comment = re.findall(r'<!-- nuug-postliste-yaml -->(.*)<!-- nuug-postliste-endyaml -->', scraper['description'], re.DOTALL) assert len(comment) == 1 @@ -59,8 +59,8 @@ for scraper in json_data: else: Type = 'unknown' - print '<tr><td>%s</td><td>%s</td><td>%s</td><td>%s</td><td>%s</td><td>%s</td><td><a href="https://scraperwiki.com/scrapers/%s/">URL</a></td></tr>' % \ - 
(data['Name'],data['Type'],data['Status'], Run, Format, Type, scraper['short_name']) + print '<tr><td>%s</td><td>%s</td><td>%s</td><td>%s</td><td>%s</td><td>%s</td><td>%s</td><td><a href="https://scraperwiki.com/scrapers/%s/">URL</a></td></tr>' % \ + (data['Name'],data['Type'],data['Status'], Run, Format, Type, scraper['created'], scraper['short_name']) print '''</tbody></table><table id="myTable2" class="tablesorter"><thead><tr><th>type</th><th>count</th></tr></thead><tbody>''' for key in counter: @@ -78,13 +78,30 @@ try: print "<tr><td>Fylkeskommune</td><td>%.2f%% (%d av %d)</td></tr>" % \ ((float(counter['fylkeskommune'])/float(num_fylke))*100, counter['fylkeskommune'], num_fylke) except KeyError: pass +#http://stackoverflow.com/questions/7561026/jquery-tablesorter-parser-for-datetime-in-mm-dd-yyyy-hhmi-am-format +#http://stackoverflow.com/questions/1707840/date-sorting-problem-with-jquery-tablesorter print '''</tbody></table> <script type="text/javascript"> $(document).ready(function() { - $("#myTable").tablesorter(); - $("#myTable2").tablesorter(); + $("#myTable").tablesorter( + { + debug: true, + headers: + { + 6 : { sorter: "text" }, + 7: {sorter: false} + } + } + ); + //$("#myTable2").tablesorter(); } ); + +$(function() { + +}); + + </script> </body></html>'''
\ No newline at end of file diff --git a/scrapersources/postliste-oep b/scrapersources/postliste-oep index 735d0a7..360ab91 100644 --- a/scrapersources/postliste-oep +++ b/scrapersources/postliste-oep @@ -195,7 +195,7 @@ def fetch_range(first, last, step): skipped = skipped + 1 if skipped == myskiplimit and myskiplimit == skiplimit: tmp = [] - for limit in [250, 500, 800, 1000, 1200, 1500, 1700, 2000, 3000, 5000]: + for limit in [250, 500, 800, 1000, 1200, 1500, 1700, 2000, 3000, 5000, 7000]: testid = id + limit * step if -1 != fetch_oep_entry(testid, tmp): print "Looking "+str(limit)+" ahead, found " + url_from_id(testid) diff --git a/scrapersources/postliste-python-lib-doculive b/scrapersources/postliste-python-lib-doculive index 4907b48..102d3f1 100644 --- a/scrapersources/postliste-python-lib-doculive +++ b/scrapersources/postliste-python-lib-doculive @@ -147,12 +147,13 @@ class PDFJournalParser(JournalParser): for t in s.findAll('text'): if t.text != " ": # if self.debug: -# print t.text +# print "'%s'" % t.text if 'Innhold:' == t.text: # type 1 or 2 (ePhorge) s = None return True if 'Arkivdel:' == t.text or 'Notater (X):' == t.text: # type 3 (doculive) s = None + print "Found doculive (type 3)" return True s = None if self.debug: @@ -173,8 +174,8 @@ class PDFJournalParser(JournalParser): else: options = '' xml=scraperwiki.pdftoxml(pdfcontent, options) - if self.debug: - print xml +# if self.debug: +# print xml pages=re.findall('(<page .+?</page>)',xml,flags=re.DOTALL) xml=None # print pages[:1][:1000] @@ -694,7 +695,7 @@ def test_parser(): parser.debug = True for url in [ #"http://www.stortinget.no/Global/pdf/postjournal/pj-2011-06-23.pdf", "http://www.radhusets-forvaltningstjeneste.oslo.kommune.no/getfile.php/rÄdhusets%20forvaltningstjeneste%20(RFT)/Intranett%20(RFT)/Dokumenter/Postjournal/11%20November/29112011.pdf", - ]: + ]: pdfcontent = scraperwiki.scrape(url) parser.preprocess(url,pdfcontent) parser.process_pages() @@ -702,3 +703,4 @@ def test_parser(): 
if __name__ == "scraper" or __name__ == '__main__': test_parser() # fieldlist() + |