diff options
-rw-r--r-- | scrapersources/postliste-sioa | 30 |
1 files changed, 30 insertions, 0 deletions
# -*- coding: utf-8 -*-
"""Collect the 'Postliste SiO' links published on the SiO reports page.

Fetches ``frontpage``, selects every ``a.readmore`` anchor and stores the
absolute URL of each anchor whose text contains 'Postliste SiO' in
``urllist``.  Each matching href is also printed as it is found.

The script runs its work at module level (there is no main guard) and is
written for Python 2 (``urllib2`` / ``urlparse``).
"""

# NOTE: the ScraperWiki hooks in this file were commented out in the
# original and are kept as-is.
#import scraperwiki
import datetime
import json
import re
import urllib2
import urlparse

from BeautifulSoup import BeautifulSoup
import dateutil.parser
import lxml.html

frontpage = 'https://sio.no/snarveier/om-sio/rapporter-og-referater'
#scraperwiki.scrape(frontpage)

#postlistelib=scraperwiki.swimport('postliste-python-lib')

agency = 'Samskipnaden i Oslo og Akershus'

baseurl = 'https://sio.no'

# Read the whole page, then release the connection even if read() fails.
response = urllib2.urlopen(frontpage)
try:
    html = response.read()
finally:
    response.close()

root = lxml.html.fromstring(html)
urls = root.cssselect("a.readmore")
urllist = []
for ahref in urls:
    if 'Postliste SiO' not in ahref.text_content():
        continue
    # An anchor may lack an href attribute; skip it instead of raising
    # KeyError.
    href = ahref.get('href')
    if href is None:
        continue
    # Single-argument call form prints the same text under Python 2 and
    # stays valid syntax under Python 3.
    print(href)
    # urljoin leaves absolute hrefs untouched and inserts the separating
    # '/' for relative ones, where plain concatenation would produce a
    # malformed URL.
    urllist.append(urlparse.urljoin(baseurl, href))