authorAnders Einar Hilden <hildenae@gmail.com>2015-01-17 01:39:55 +0100
committerAnders Einar Hilden <hildenae@gmail.com>2015-01-17 01:39:55 +0100
commitf5072f480d1db635e3612eead00a66395c5df2be (patch)
tree77ef5ced6d37444d78089b502d5344430806b869
parent8ac5d9ec8290e281fbcb6f7c7f7790d5c1317feb (diff)
Add the correct libraryfile for dms2002
-rw-r--r-- scrapersources/postliste-python-lib-pdf-dms2002.py | 217
1 file changed, 217 insertions, 0 deletions
diff --git a/scrapersources/postliste-python-lib-pdf-dms2002.py b/scrapersources/postliste-python-lib-pdf-dms2002.py
new file mode 100644
index 0000000..57b2c04
--- /dev/null
+++ b/scrapersources/postliste-python-lib-pdf-dms2002.py
@@ -0,0 +1,217 @@
+# -*- coding: utf-8 -*-
+#
+# Python library for parsing public post journals (postlister) in Norway
+#
+# This parser is for the format currently known as
+# "DMS2002 - Software Innovation"
+#
+# Based on the scraper advanced-scraping-pdf and postliste-python-lib
+#
+# Possible sources using format:
+# http://www.hig.no/om_hig/offentleg_journal (week 34 2014 and onwards)
+# khib.no
+# hbv.no
+# www.bystyret.oslo.kommune.no
+# www.spesialenheten.no
+# www.frogn.kommune.no
+
+# Google search to find more: "Offentlig journal" "Ansvarlig enhet" Arkivdel "Dok. dato" Avskrevet filetype:pdf
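+#
+# Rough pipeline (based on the class below): preprocess() splits a PDF into
+# pages and stores them in the 'unparsedpages' SQLite table; process_pages()
+# then parses each stored page, deleting it on success and moving it to the
+# 'brokenpages' table on failure.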
+
+
+import scraperwiki
+import string
+import re
+from BeautifulSoup import BeautifulSoup
+import datetime
+import dateutil.parser
+import sys
+jp=scraperwiki.swimport('postliste-python-lib')
+
+class PDFJournalParser(jp.JournalParser):
+ pagetable = "unparsedpages"
+ brokenpagetable = "brokenpages"
+ hiddentext = False
+ breakonfailure = True
+ debug = False
+ def __init__(self, agency, hiddentext=False, debug=False):
+ self.hiddentext = hiddentext
+ self.debug = debug
+ jp.JournalParser.__init__(self, agency=agency)
+
+ def sync(self):
+ sys.stdout.flush()
+ sys.stderr.flush()
+
+ def dprint(self, msg):
+ if self.debug:
+ print(msg)
+ self.sync()
+
+ def parse_page(self, pdfurl, pagenum, pagecontent):
+ self.sync()
+ print "Scraping " + pdfurl + " page " + str(pagenum)
+ s = BeautifulSoup(pagecontent)
+ datastore = []
+ text = []
+ linecount = 0
+ #dprint(s)
+ # Find all text-blobs and number them
+ for t in s.findAll('text'):
+ if t.text != " ":
+ text.append(t.text)
+ #self.dprint(str(linecount) + ": " + t.text)
+ #self.dprint(str(linecount) + ": " + ":".join("{:02x}".format(ord(c)) for c in t.text))
+ linecount = linecount + 1
+
+ #self.dprint("Found " + str(linecount) + " lines/text fragments in the PDF")
+ if len(text) < linecount:
+ raise ValueError("[ERROR] Found %s interresting lines, but only saved %s?" % (linecount, len(text)))
+
+ # Count how many entries to expect on this page, to be able to
+ # verify that all of them were found.
+ entrycount = 0
+ i = 0
+ while i < len(text):
+ if 'Avskrevet:' == text[i]:
+ entrycount = entrycount + 1
+ i = i + 1
+ self.dprint("We found %s entries on page %s ('Avskrevet:')" % (entrycount, pagenum))
+
+ if(entrycount > 6):
+ self.dprint("[WARNING] We found %s entries on page %s, more that 6 is not normal" % (entrycount, pagenum))
+
+ if(entrycount < 1):
+ raise ValueError("[ERROR] No entries found on page %s" % (pagenum))
+
+ i = 0
+ found_entries = 0
+        entry_start = -1
+        entry_end = -1
+ while i < len(text):
+ if 'Avsender:' == text[i] or 'Mottaker:' == text[i]:
+ entry_start = i - 1
+ if (entry_start < 0):
+ entry_start = 0
+ #self.dprint("ESTART")
+
+ if 'Arkivdel:' == text[i]:
+ #self.dprint("EEND")
+ if(entry_start == -1):
+ self.dprint("[ERROR] Found end of entry (line %s) before start of entry on page %s" % (i, pagenum))
+ raise ValueError("[ERROR] Found end of entry before start of entry on page %s" % (pagenum))
+ entry_end = i + 2
+ if (entry_end > len(text)):
+ entry_end = len(text)
+ found_entries = found_entries + 1
+ entry = self.pdfparser(text[entry_start:entry_end], pdfurl, pagenum, found_entries)
+                entry_start = -1
+                entry_end = -1
+ i = i + 1
+        if (found_entries != entrycount):
+            self.dprint("[ERROR] We expected %s but found %s entries on page %s" % (entrycount, found_entries, pagenum))
+            raise ValueError("[ERROR] We expected %s but found %s entries on page %s" % (entrycount, found_entries, pagenum))
+ self.dprint("We found %s of %s expected entries on page %s" %(found_entries, entrycount, pagenum))
+ s = None
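+        # NOTE: extraction of the individual field values is not implemented
+        # yet, so every page is flagged as unparsed for now and ends up in the
+        # broken-pages table via process_pages().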
+ raise ValueError("parse_page not implemented")
+
+ def pdfparser(self, entrytext, pdfurl, pagenum, num_entry):
+ FIELDS_IN_ENTRY = 10
+ field_order = {'Arkivdel:': 10, 'Arkivkode:': 7, 'Sak:': 2, 'Dok.:': 3, 'Tilg. kode:': 5, 'Dok. dato:': 9, 'Avskrevet:': 6, 'Avsender:': 1, 'Mottaker:': 1, 'Journaldato:': 4, 'Saksbehandler:': 8}
+ fields = {'Avsender:', 'Mottaker:', 'Sak:', 'Dok.:', 'Journaldato:', 'Tilg. kode:', 'Avskrevet:', 'Arkivkode:', 'Saksbehandler:', 'Dok. dato:', 'Arkivdel:' }
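+        # Expected label order within a single entry, as encoded by the
+        # field_order values above: Avsender:/Mottaker: (1), Sak: (2),
+        # Dok.: (3), Journaldato: (4), Tilg. kode: (5), Avskrevet: (6),
+        # Arkivkode: (7), Saksbehandler: (8), Dok. dato: (9), Arkivdel: (10).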
+ num_fields_found = 0
+ for text in entrytext:
+ if text in field_order:
+ num_fields_found = num_fields_found + 1
+ if (field_order[text] != num_fields_found): # Sanity check
+ self.dprint("[ERROR] Field '%s' is normally field #%s, but was #%s on page %s, entry %s" % (text, field_order[text], num_fields_found, pagenum, num_entry))
+ raise ValueError("[ERROR] Field '%s' is normally field #%s, but was #%s on page %s, entry %s" % (text, field_order[text], num_fields_found, pagenum, num_entry))
+ self.dprint("All fields appeared in the expected order")
+ if (num_fields_found != FIELDS_IN_ENTRY): # Sanity check
+ self.dprint("[ERROR] Found %s fields, expected %s on page %s, field %s!" % (num_fields_found, FIELDS_IN_ENTRY, pagenum, num_entry))
+ raise ValueError("[ERROR] Found %s fields, expected %s on page %s, field %s!" % (num_fields_found, FIELDS_IN_ENTRY, pagenum, num_entry))
+ else:
+ self.dprint("Found %s/10 fields in entry %s on page %s" % (num_fields_found, num_entry, pagenum))
+ #print field_order
+
+ def process_pages(self):
+ brokenpages = 0
+ try:
+ sqlselect = "* from " + self.pagetable + " limit 1"
+ pageref = scraperwiki.sqlite.select(sqlselect)
+ while pageref:
+ scrapedurl = pageref[0]['scrapedurl']
+ pagenum = pageref[0]['pagenum']
+ pagecontent = pageref[0]['pagecontent']
+ try:
+ sqldelete = "delete from " + self.pagetable + " where scrapedurl = '" + scrapedurl + "' and pagenum = " + str(pagenum)
+ self.parse_page(scrapedurl, pagenum, pagecontent)
+ sys.stdout.flush()
+ sys.stderr.flush()
+ scraperwiki.sqlite.execute(sqldelete)
+ except ValueError, e:
+ brokenpage = {
+ 'scrapedurl' : scrapedurl,
+ 'pagenum' : pagenum,
+ 'pagecontent' : pagecontent,
+ 'failstamp' : datetime.datetime.now(),
+ }
+ #print "Unsupported page %d from %s" % (pagenum, scrapedurl)
+ brokenpages = brokenpages + 1
+ scraperwiki.sqlite.save(unique_keys=['scrapedurl', 'pagenum'], data=brokenpage, table_name=self.brokenpagetable)
+ scraperwiki.sqlite.execute(sqldelete)
+ #scraperwiki.sqlite.commit()
+ #exit(0)
+ pageref = scraperwiki.sqlite.select(sqlselect)
+ except scraperwiki.sqlite.SqliteError, e:
+ print str(e)
+ raise
+ if 0 < brokenpages:
+ raise ValueError("Found %d pages with unsupported format" % brokenpages)
+
+    # Check if we recognize the page content.  Returns True for recognised
+    # pages; unrecognised pages are currently only logged (the exception is
+    # commented out below).
+    def is_valid_page(self, pdfurl, pagenum, pagecontent):
+ s = BeautifulSoup(pagecontent)
+ for t in s.findAll('text'):
+ if t.text != " ":
+ if 'Dok.:' == t.text:
+ s = None
+ return True
+ s = None
+ self.dprint("Unrecognized page format for " + pdfurl)
+ #raise ValueError("Unrecognized page format for " + pdfurl)
+
+    # Split PDF content into pages and store them in a SQL table for later
+    # processing.  The process is split in two to better handle large PDFs
+    # (like 600 pages) without running out of CPU time and without losing
+    # track of what is left to parse.
+ def preprocess(self, pdfurl, pdfcontent):
+ print "Preprocessing PDF " + pdfurl
+ if not pdfcontent:
+ raise ValueError("No pdf content passed for " + pdfurl)
+ if self.hiddentext:
+ options = '-hidden'
+ else:
+ options = ''
+ xml=scraperwiki.pdftoxml(pdfcontent, options)
+ #self.dprint("The XMLK:")
+ #self.dprint(xml)
+ pages=re.findall('(<page .+?</page>)',xml,flags=re.DOTALL)
+ xml=None
+
+ pagecount = 0
+ datastore = []
+ for page in pages:
+ pagecount = pagecount + 1
+ self.is_valid_page(pdfurl, pagecount, page)
+ data = {
+ 'scrapedurl' : pdfurl,
+ 'pagenum' : pagecount,
+ 'pagecontent' : page,
+ }
+ datastore.append(data)
+ self.dprint("Found %s pages, %s added to database" % (pagecount, len(datastore)))
+ if 0 < len(datastore):
+ scraperwiki.sqlite.save(unique_keys=['scrapedurl', 'pagenum'], data=datastore, table_name=self.pagetable)
+ else:
+ raise ValueError("Unable to find any pages in " + pdfurl)
+ pages = None
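+
+# --- Usage sketch (not part of the library above) ---
+# A minimal, hedged example of how a scraper might drive this parser: fetch a
+# journal PDF, split it into pages with preprocess(), then parse the stored
+# pages with process_pages().  The agency name and PDF URL are hypothetical
+# placeholders, and parse_page() is still a stub, so process_pages() will
+# currently flag every page as broken.
+if __name__ == '__main__':
+    pdfurl = "http://www.example.no/offentlig-journal/uke-34-2014.pdf"  # placeholder
+    parser = PDFJournalParser(agency="Example agency", debug=True)
+    pdfcontent = scraperwiki.scrape(pdfurl)   # download the PDF
+    parser.preprocess(pdfurl, pdfcontent)     # store pages in 'unparsedpages'
+    parser.process_pages()                    # parse stored pages, move failures to 'brokenpages'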