diff options
author | Anders Einar Hilden <hildenae@gmail.com> | 2015-01-17 21:50:38 +0100 |
---|---|---|
committer | Anders Einar Hilden <hildenae@gmail.com> | 2015-01-18 00:12:41 +0100 |
commit | 8a075f42f219eda50eb3694f0957727d7f9deded (patch) | |
tree | 92226facf6ac6b27fe60a3081611bde11c546cf1 | |
parent | 8c2f45465f26ecf7bdf85cd09a14ec2b9fa5b8ce (diff) |
Starting to rewrite datafinder-dode
-rw-r--r-- | scrapersources/postliste-python-lib-pdf-dms2002.py | 95 |
1 files changed, 65 insertions, 30 deletions
diff --git a/scrapersources/postliste-python-lib-pdf-dms2002.py b/scrapersources/postliste-python-lib-pdf-dms2002.py index 57b2c04..4d9d216 100644 --- a/scrapersources/postliste-python-lib-pdf-dms2002.py +++ b/scrapersources/postliste-python-lib-pdf-dms2002.py @@ -58,7 +58,9 @@ class PDFJournalParser(jp.JournalParser): # Find all text-blobs and number them for t in s.findAll('text'): if t.text != " ": - text.append(t.text) + #print(t['top']) + #print(t.text) + text.append(t) #self.dprint(str(linecount) + ": " + t.text) #self.dprint(str(linecount) + ": " + ":".join("{:02x}".format(ord(c)) for c in t.text)) linecount = linecount + 1 @@ -72,7 +74,7 @@ class PDFJournalParser(jp.JournalParser): entrycount = 0 i = 0 while i < len(text): - if 'Avskrevet:' == text[i]: + if 'Avskrevet:' == text[i].text: entrycount = entrycount + 1 i = i + 1 self.dprint("We found %s entries on page %s ('Avskrevet:')" % (entrycount, pagenum)) @@ -87,26 +89,43 @@ class PDFJournalParser(jp.JournalParser): found_entries = 0 entry_start = -1 entry_stop = -1 + prev_entry_end = -1 + tops = [] + FIELD_ORDER= {'Arkivdel:': 10, 'Arkivkode:': 7, 'Sak:': 2, 'Dok.:': 3, 'Tilg. kode:': 5, 'Dok. dato:': 9, + 'Avskrevet:': 6, 'Avsender:': 1, 'Mottaker:': 1, 'Journaldato:': 4, 'Saksbehandler:': 8} while i < len(text): - if 'Avsender:' == text[i] or 'Mottaker:' == text[i]: - entry_start = i - 1 - if (entry_start < 0): - entry_start = 0 - #self.dprint("ESTART") - - if 'Arkivdel:' == text[i]: - #self.dprint("EEND") - if(entry_start == -1): - self.dprint("[ERROR] Found end of entry (line %s) before start of entry on page %s" % (i, pagenum)) - raise ValueError("[ERROR] Found end of entry before start of entry on page %s" % (pagenum)) - entry_end = i + 2 - if (entry_end > len(text)): - entry_end = len(text) + if 'Avsender:' == text[i].text or 'Mottaker:' == text[i].text: found_entries = found_entries + 1 - entry = self.pdfparser(text[entry_start:entry_end], pdfurl, pagenum, found_entries) - entry_start = -1 - entry_stop = -1 + + + + i = i + 1 + # while i < len(text): + # if 'Avsender:' == text[i].text or 'Mottaker:' == text[i].text: + # entry_start = i - 1 + # if (entry_start < 0): + # entry_start = 0 + # #self.dprint("ESTART") + # + # if 'Arkivdel:' == text[i].text: + # #self.dprint("EEND") + # if(entry_start == -1): + # self.dprint("[ERROR] Found end of entry (line %s) before start of entry on page %s" % (i, pagenum)) + # raise ValueError("[ERROR] Found end of entry before start of entry on page %s" % (pagenum)) + # entry_end = i + 2 + # if (entry_end > len(text)): + # entry_end = len(text) + # found_entries = found_entries + 1 + # self.verify_entry(text[entry_start:entry_end], pagenum, found_entries) + # #entry = self.parse_entry(text[entry_start:entry_end], pdfurl, pagenum, found_entries) + # #print(entry) + # if(found_entries == 2): + # exit(0) + # entry_start = -1 + # prev_entry_end = entry_stop + # entry_stop = -1 + # i = i + 1 if (found_entries != entrycount): self.dprint("[ERROR] We expected %s but found %s entries on page %s" % (found_entries, entrycount, pagenum)) raise ValueError("[ERROR] We expected %s but found %s entries on page %s" % (found_entries, entrycount, pagenum)) @@ -114,24 +133,40 @@ class PDFJournalParser(jp.JournalParser): s = None raise ValueError("parse_page not implemented") - def pdfparser(self, entrytext, pdfurl, pagenum, num_entry): + def verify_entry(self, entrytext, pagenum, num_entry): + pass + + def verify_entry2(self, entrytext, pagenum, num_entry): FIELDS_IN_ENTRY = 10 - field_order = {'Arkivdel:': 10, 'Arkivkode:': 7, 'Sak:': 2, 'Dok.:': 3, 'Tilg. kode:': 5, 'Dok. dato:': 9, 'Avskrevet:': 6, 'Avsender:': 1, 'Mottaker:': 1, 'Journaldato:': 4, 'Saksbehandler:': 8} - fields = {'Avsender:', 'Mottaker:', 'Sak:', 'Dok.:', 'Journaldato:', 'Tilg. kode:', 'Avskrevet:', 'Arkivkode:', 'Saksbehandler:', 'Dok. dato:', 'Arkivdel:' } + + FIELD_ORDER= {'Arkivdel:': 10, 'Arkivkode:': 7, 'Sak:': 2, 'Dok.:': 3, 'Tilg. kode:': 5, 'Dok. dato:': 9, + 'Avskrevet:': 6, 'Avsender:': 1, 'Mottaker:': 1, 'Journaldato:': 4, 'Saksbehandler:': 8} + fields_checked = 0 num_fields_found = 0 for text in entrytext: - if text in field_order: + fields_checked = fields_checked + 1 + if text.text in FIELD_ORDER: num_fields_found = num_fields_found + 1 - if (field_order[text] != num_fields_found): # Sanity check - self.dprint("[ERROR] Field '%s' is normally field #%s, but was #%s on page %s, entry %s" % (text, field_order[text], num_fields_found, pagenum, num_entry)) - raise ValueError("[ERROR] Field '%s' is normally field #%s, but was #%s on page %s, entry %s" % (text, field_order[text], num_fields_found, pagenum, num_entry)) + if (FIELD_ORDER[text.text] != num_fields_found): # Sanity check + error = "[ERROR] Field '%s' is normally field #%s, but was #%s on page %s, entry %s" % \ + (text, FIELD_ORDER[text], num_fields_found, pagenum, num_entry) + self.dprint(error) + raise ValueError(error) + print("%s: %s" % (fields_checked, text.text)) + else: + print("%s: %s" % (fields_checked, text.text)) + # 1 = Mottager, 3 = Sak, 6 = Dok., 7 = Tilg. kode, 9 = Journaldato/Dok. dato + # 13 = Journaldato/Dok. dato, 15 = Arkivdel, 18 = Saksbehandler + self.dprint("All fields appeared in the expected order") + if (num_fields_found != FIELDS_IN_ENTRY): # Sanity check - self.dprint("[ERROR] Found %s fields, expected %s on page %s, field %s!" % (num_fields_found, FIELDS_IN_ENTRY, pagenum, num_entry)) - raise ValueError("[ERROR] Found %s fields, expected %s on page %s, field %s!" % (num_fields_found, FIELDS_IN_ENTRY, pagenum, num_entry)) + error = "[ERROR] Found %s fields, expected %s on page %s, field %s!" % \ + (num_fields_found, FIELDS_IN_ENTRY, pagenum, num_entry) + self.dprint(error) + raise ValueError(error) else: self.dprint("Found %s/10 fields in entry %s on page %s" % (num_fields_found, num_entry, pagenum)) - #print field_order def process_pages(self): brokenpages = 0 @@ -194,7 +229,7 @@ class PDFJournalParser(jp.JournalParser): options = '' xml=scraperwiki.pdftoxml(pdfcontent, options) #self.dprint("The XMLK:") - #self.dprint(xml) + self.dprint(xml) pages=re.findall('(<page .+?</page>)',xml,flags=re.DOTALL) xml=None |