diff options
author | Petter Reinholdtsen <pere@hungry.com> | 2015-11-23 07:40:00 +0100 |
---|---|---|
committer | Petter Reinholdtsen <pere@hungry.com> | 2015-11-23 07:40:00 +0100 |
commit | d05e9377d1168c24d4d9096011a548059148b614 (patch) | |
tree | 222903718ac44be1055fc024352e62f18f7392b4 | |
parent | 0a8885848b1cf2f57e84a5440855300875c1718d (diff) |
Start on new scraper for sio.no.
-rw-r--r-- | scrapersources/postliste-sioa | 30 |
1 file changed, 30 insertions, 0 deletions
# -*- coding: utf-8 -*-
"""Collect links to the 'Postliste SiO' documents listed on sio.no.

Work in progress: the script fetches the reports page (``frontpage``),
selects every ``a.readmore`` link whose text mentions 'Postliste SiO' and
stores the absolute URL of each one in ``urllist``.  Nothing is downloaded
from those URLs or stored yet.
"""

#import scraperwiki
import json
from BeautifulSoup import BeautifulSoup
from contextlib import closing
import datetime
import dateutil.parser
import lxml.html
import urlparse
import re
import urllib2

frontpage = 'https://sio.no/snarveier/om-sio/rapporter-og-referater'
#scraperwiki.scrape(frontpage)

#postlistelib=scraperwiki.swimport('postliste-python-lib')

agency = 'Samskipnaden i Oslo og Akershus'

baseurl = 'https://sio.no'

# closing() releases the HTTP connection even if read() raises; urllib2
# response objects are not context managers on their own.
with closing(urllib2.urlopen(frontpage)) as response:
    html = response.read()

root = lxml.html.fromstring(html)
urls = root.cssselect("a.readmore")
urllist = []
for ahref in urls:
    linktext = ahref.text_content()
    if 'Postliste SiO' not in linktext:
        continue
    href = ahref.get('href')
    if not href:
        # An anchor without a target gives us nothing to fetch.
        continue
    print(href)
    # Resolve against the page the link was found on.  Unlike plain string
    # concatenation with baseurl, this also yields a valid URL when the href
    # is already absolute or is relative to the current page.
    urllist.append(urlparse.urljoin(frontpage, href))