Updated from scraperwiki.

author: Petter Reinholdtsen <pere@hungry.com> 2012-07-21 23:49:50 +0200
committer: Petter Reinholdtsen <pere@hungry.com> 2012-07-21 23:49:50 +0200
commit: 0543e2d0e68d0742052371d74dfd6600816b7c46 (patch)
tree: b208c92c6520931b06188c08ad3d5ca6ef3a1946
parent: 7778a476effe04f6e7251f20665acf0f848a8f5a (diff)
3 files changed, 33 insertions, 14 deletions
diff --git a/scrapersources/list-nuug-postliste-scrapers b/scrapersources/list-nuug-postliste-scrapers
index 67c4158..709a220 100644
--- a/scrapersources/list-nuug-postliste-scrapers
+++ b/scrapersources/list-nuug-postliste-scrapers
@@ -1,5 +1,4 @@
-import os
-import urlparse
+import os, urlparse, cgi
 urlquery = os.getenv('URLQUERY')
 
 if urlquery:
@@ -18,7 +17,8 @@ print '''<html>
 <head>
 <link rel="stylesheet" href="https://views.scraperwiki.com/run/jquery-tablesorter/?file=style-blue.css" type="text/css" />
 <script type="text/javascript" src="https://views.scraperwiki.com/run/jquery-tablesorter/?file=jquery-1-4-2-min.js"></script>
-<script type="text/javascript" src="https://views.scraperwiki.com/run/jquery-tablesorter/?file=jquery.tablesorter.2-0-5.min.js"></script>'''
+<script type="text/javascript" src="https://views.scraperwiki.com/run/jquery-tablesorter/?file=jquery.tablesorter.2-0-5.min.js"></script>
+'''
 
 print '''</head><body>
 <p>This view lists scrapers with yaml-combatible comments (containing the string "nuug-postliste-endyaml" like the following in their description
@@ -35,10 +35,10 @@ Run: daily&lt;br&gt;
 </pre></p>
 <table id="myTable" class="tablesorter">'''
 
-print '<thead><tr><th>Name</th><th>type</th><th>status</th><th>schedule</th><th>format</th><th>datatype</th><th>URL</th></tr></thead><tbody>'
+print '<thead><tr><th>Name</th><th>type</th><th>status</th><th>schedule</th><th>format</th><th>datatype</th><th>created</th><th>URL</th></tr></thead><tbody>'
 counter = {}
 for scraper in json_data:
-    print scraper
+    #print "<!-- %s -->" % cgi.escape("%s" % scraper)
     comment = re.findall(r'<!-- nuug-postliste-yaml -->(.*)<!-- nuug-postliste-endyaml -->', 
                     scraper['description'], re.DOTALL)
     assert len(comment) == 1
@@ -59,8 +59,8 @@ for scraper in json_data:
     else: Type = 'unknown'
 
 
-    print '<tr><td>%s</td><td>%s</td><td>%s</td><td>%s</td><td>%s</td><td>%s</td><td><a href="https://scraperwiki.com/scrapers/%s/">URL</a></td></tr>' % \
-    (data['Name'],data['Type'],data['Status'], Run, Format, Type, scraper['short_name'])
+    print '<tr><td>%s</td><td>%s</td><td>%s</td><td>%s</td><td>%s</td><td>%s</td><td>%s</td><td><a href="https://scraperwiki.com/scrapers/%s/">URL</a></td></tr>' % \
+    (data['Name'],data['Type'],data['Status'], Run, Format, Type, scraper['created'], scraper['short_name'])
 print '''</tbody></table><table id="myTable2" class="tablesorter"><thead><tr><th>type</th><th>count</th></tr></thead><tbody>'''
 
 for key in counter:
@@ -78,13 +78,30 @@ try:
     print "<tr><td>Fylkeskommune</td><td>%.2f%% (%d av %d)</td></tr>" % \
     ((float(counter['fylkeskommune'])/float(num_fylke))*100, counter['fylkeskommune'], num_fylke)
 except KeyError: pass
+#http://stackoverflow.com/questions/7561026/jquery-tablesorter-parser-for-datetime-in-mm-dd-yyyy-hhmi-am-format
+#http://stackoverflow.com/questions/1707840/date-sorting-problem-with-jquery-tablesorter
 print '''</tbody></table>
 <script type="text/javascript">
     $(document).ready(function() 
         { 
-            $("#myTable").tablesorter(); 
-            $("#myTable2").tablesorter(); 
+            $("#myTable").tablesorter(
+                {
+                    debug: true,
+                    headers:
+                    {  
+                        6 : { sorter: "text"  },
+                        7: {sorter: false}
+                    }
+                }
+            );
+            //$("#myTable2").tablesorter(); 
         } 
     );
+
+$(function() {
+
+});
+
+
 </script>
 </body></html>'''
 \ No newline at end of file
diff --git a/scrapersources/postliste-oep b/scrapersources/postliste-oep
index 735d0a7..360ab91 100644
--- a/scrapersources/postliste-oep
+++ b/scrapersources/postliste-oep
@@ -195,7 +195,7 @@ def fetch_range(first, last, step):
                         skipped = skipped + 1
                         if skipped == myskiplimit and myskiplimit == skiplimit:
                             tmp = []
-                            for limit  in [250, 500, 800, 1000, 1200, 1500, 1700, 2000, 3000, 5000]:
+                            for limit  in [250, 500, 800, 1000, 1200, 1500, 1700, 2000, 3000, 5000, 7000]:
                                 testid = id + limit * step
                                 if -1 != fetch_oep_entry(testid, tmp):
                                     print "Looking "+str(limit)+" ahead, found " + url_from_id(testid)
diff --git a/scrapersources/postliste-python-lib-doculive b/scrapersources/postliste-python-lib-doculive
index 4907b48..102d3f1 100644
--- a/scrapersources/postliste-python-lib-doculive
+++ b/scrapersources/postliste-python-lib-doculive
@@ -147,12 +147,13 @@ class PDFJournalParser(JournalParser):
         for t in s.findAll('text'):
             if t.text != " ":
 #                if self.debug:
-#                    print t.text
+#                    print "'%s'" % t.text
                 if 'Innhold:' == t.text: # type 1 or 2 (ePhorge)
                     s = None
                     return True
                 if 'Arkivdel:' == t.text or 'Notater (X):' == t.text: # type 3 (doculive)
                     s = None
+                    print "Found doculive (type 3)"
                     return True
         s = None
         if self.debug:
@@ -173,8 +174,8 @@ class PDFJournalParser(JournalParser):
         else:
             options = ''
         xml=scraperwiki.pdftoxml(pdfcontent, options)
-        if self.debug:
-            print xml
+#        if self.debug:
+#            print xml
         pages=re.findall('(<page .+?</page>)',xml,flags=re.DOTALL)
         xml=None
 #    print pages[:1][:1000]
@@ -694,7 +695,7 @@ def test_parser():
     parser.debug = True
     for url in [ #"http://www.stortinget.no/Global/pdf/postjournal/pj-2011-06-23.pdf",
                 "http://www.radhusets-forvaltningstjeneste.oslo.kommune.no/getfile.php/rådhusets%20forvaltningstjeneste%20(RFT)/Intranett%20(RFT)/Dokumenter/Postjournal/11%20November/29112011.pdf",
-		]:
+        ]:
         pdfcontent = scraperwiki.scrape(url)
         parser.preprocess(url,pdfcontent)
         parser.process_pages()
@@ -702,3 +703,4 @@ def test_parser():
 if __name__ == "scraper" or __name__ == '__main__':
     test_parser()
 #    fieldlist()
+
author	Petter Reinholdtsen <pere@hungry.com>	2012-07-21 23:49:50 +0200
committer	Petter Reinholdtsen <pere@hungry.com>	2012-07-21 23:49:50 +0200
commit	0543e2d0e68d0742052371d74dfd6600816b7c46 (patch)
tree	b208c92c6520931b06188c08ad3d5ca6ef3a1946
parent	7778a476effe04f6e7251f20665acf0f848a8f5a (diff)