# # = Introduction # # This file is mostly an attempt to port libpst to ruby, and simplify it in the process. It # will leverage much of the existing MAPI => MIME conversion developed for Msg files, and as # such is purely concerned with the file structure details. # # = TODO # # 1. solve recipient table problem (test4). # this is done. turns out it was due to id2 clashes. find better solution # 2. check parse consistency. an initial conversion of a 30M file to pst, shows # a number of messages conveting badly. compare with libpst too. # 3. xattribs # 4. generalise the Mapi stuff better # 5. refactor index load # 6. msg serialization? # =begin quick plan for cleanup. have working tests for 97 and 03 file formats, so safe. want to fix up: 64 bit unpacks scattered around. its ugly. not sure how best to handle it, but am slightly tempted to override String#unpack to support a 64 bit little endian unpack (like L vs N/V, for Q). one way or another need to fix it. Could really slow everything else down if its parsing the unpack strings twice, once in ruby, for every single unpack i do :/ the index loading process, and the lack of shared code between normal vs 64 bit variants, and Index vs Desc. should be able to reduce code by factor of 4. also think I should move load code into the class too. then maybe have something like: class Header def index_class version_2003 ? Index64 : Index end end def load_idx header.index_class.load_index end OR def initialize @header = ... extend @header.index_class::Load load_idx end need to think about the role of the mapi code, and Pst::Item etc, but that layer can come later. =end require 'mapi' require 'enumerator' require 'ostruct' require 'ole/ranges_io' module Mapi class Pst class FormatError < StandardError end # unfortunately there is no Q analogue which is little endian only. # this translates T as an unsigned quad word, little endian byte order, to # not pollute the rest of the code. 
# Pst.unpack - String#unpack, extended with a 'T' directive: an unsigned
# 64 bit (quad word) value in little endian byte order, which plain unpack
# lacks (there is no little-endian-only analogue of 'Q').
# didn't want to override String#unpack, cause its too hacky, and incomplete.
#
# Each 'T' in the spec is rewritten to a pair of 'V' (32 bit LE) directives,
# and the resulting low/high halves are recombined afterwards. The rewritten
# spec is memoized in @unpack_cache so the translation cost is paid once per
# distinct spec, not once per unpack.
def self.unpack str, unpack_spec
	return str.unpack(unpack_spec) unless unpack_spec['T']
	@unpack_cache ||= {}
	t_offsets, new_spec = @unpack_cache[unpack_spec]
	unless t_offsets
		t_offsets = []
		offset = 0
		new_spec = ''
		unpack_spec.scan(/([^\d])_?(\*|\d+)?/o) do
			# 'a'/'A' directives consume many bytes but produce one element
			num_elems = $1.downcase == 'a' ? 1 : ($2 || 1).to_i
			if $1 == 'T'
				# each T produces one output element here; they are later
				# collapsed pairwise, so recording "offset + i" stays correct
				# even for runs like 'T3'.
				num_elems.times { |i| t_offsets << offset + i }
				new_spec << "V#{num_elems * 2}"
			else
				new_spec << $~[0]
			end
			offset += num_elems
		end
		@unpack_cache[unpack_spec] = [t_offsets, new_spec]
	end
	a = str.unpack(new_spec)
	# collapse each (low, high) pair back into a single 64 bit value. nil if
	# the string was too short to supply both halves.
	t_offsets.each do |offset|
		low, high = a[offset, 2]
		a[offset, 2] = low && high ? low + (high << 32) : nil
	end
	a
end

#
# this is the header and encryption encapsulation code
# ----------------------------------------------------------------------------
#

# class which encapsulates the pst header - the first 512 bytes of the file,
# giving the format version (1997 vs 2003), encryption mode, file size, and
# the offsets/counts of the two b-tree indexes (index1 = id index,
# index2 = descriptor index).
class Header
	SIZE = 512
	MAGIC = 0x2142444e

	# these are the constants defined in libpst.c, that
	# are referenced in pst_open()
	INDEX_TYPE_OFFSET = 0x0A
	FILE_SIZE_POINTER = 0xA8
	FILE_SIZE_POINTER_64 = 0xB8
	SECOND_POINTER = 0xBC
	INDEX_POINTER = 0xC4
	SECOND_POINTER_64 = 0xE0
	INDEX_POINTER_64 = 0xF0
	ENC_OFFSET = 0x1CD

	attr_reader :magic, :index_type, :encrypt_type, :size
	attr_reader :index1_count, :index1, :index2_count, :index2
	attr_reader :version

	# data - the first Header::SIZE bytes of the pst file.
	# raises FormatError (via #validate!) on a bad magic / index type /
	# encryption type.
	def initialize data
		@magic = data.unpack('N')[0]
		# .ord: on ruby 1.8 String#[] returns a Fixnum (and Integer#ord is a
		# no-op); on 1.9+ it returns a 1-char String. without .ord the 1.9+
		# value never matched 0x0e/0x17 and construction always failed.
		@index_type = data[INDEX_TYPE_OFFSET].ord
		@version = {0x0e => 1997, 0x17 => 2003}[@index_type]

		if version_2003?
			# the 2003 layout isn't fully understood. the offsets below were
			# guessed by diffing pairs of 2003 psts; the encryption flag's
			# location is unknown, so assume compressible encryption (1).
			# note the counts here are probably really 64 bit too.
			@encrypt_type = 1

			@index2_count, @index2 = data[SECOND_POINTER_64 - 4, 8].unpack('V2')
			@index1_count, @index1 = data[INDEX_POINTER_64 - 4, 8].unpack('V2')

			@size = data[FILE_SIZE_POINTER_64, 4].unpack('V')[0]
		else
			@encrypt_type = data[ENC_OFFSET].ord

			@index2_count, @index2 = data[SECOND_POINTER - 4, 8].unpack('V2')
			@index1_count, @index1 = data[INDEX_POINTER - 4, 8].unpack('V2')

			@size = data[FILE_SIZE_POINTER, 4].unpack('V')[0]
		end

		validate!
	end

	def version_2003?
		version == 2003
	end

	def encrypted?
		encrypt_type != 0
	end

	# sanity check the fields parsed above. FormatError on failure.
	def validate!
		raise FormatError, "bad signature on pst file (#{'0x%x' % magic})" unless magic == MAGIC
		raise FormatError, "only index types 0x0e and 0x17 are handled (#{'0x%x' % index_type})" unless [0x0e, 0x17].include?(index_type)
		raise FormatError, "only encryption types 0 and 1 are handled (#{encrypt_type.inspect})" unless [0, 1].include?(encrypt_type)
	end
end

# compressible encryption! :D
#
# simple substitution. see libpst.c
# maybe test switch to using a String#tr!
# "Compressible encryption" - a fixed byte-for-byte substitution cipher
# (see libpst.c). Not real security; just an obfuscation of block data.
class CompressibleEncryption
	# maps an encrypted byte value (index) to the plaintext byte value.
	DECRYPT_TABLE = [
		0x47, 0xf1, 0xb4, 0xe6, 0x0b, 0x6a, 0x72, 0x48, 0x85, 0x4e, 0x9e, 0xeb, 0xe2, 0xf8, 0x94, 0x53, # 0x0f
		0xe0, 0xbb, 0xa0, 0x02, 0xe8, 0x5a, 0x09, 0xab, 0xdb, 0xe3, 0xba, 0xc6, 0x7c, 0xc3, 0x10, 0xdd, # 0x1f
		0x39, 0x05, 0x96, 0x30, 0xf5, 0x37, 0x60, 0x82, 0x8c, 0xc9, 0x13, 0x4a, 0x6b, 0x1d, 0xf3, 0xfb, # 0x2f
		0x8f, 0x26, 0x97, 0xca, 0x91, 0x17, 0x01, 0xc4, 0x32, 0x2d, 0x6e, 0x31, 0x95, 0xff, 0xd9, 0x23, # 0x3f
		0xd1, 0x00, 0x5e, 0x79, 0xdc, 0x44, 0x3b, 0x1a, 0x28, 0xc5, 0x61, 0x57, 0x20, 0x90, 0x3d, 0x83, # 0x4f
		0xb9, 0x43, 0xbe, 0x67, 0xd2, 0x46, 0x42, 0x76, 0xc0, 0x6d, 0x5b, 0x7e, 0xb2, 0x0f, 0x16, 0x29, # 0x5f
		0x3c, 0xa9, 0x03, 0x54, 0x0d, 0xda, 0x5d, 0xdf, 0xf6, 0xb7, 0xc7, 0x62, 0xcd, 0x8d, 0x06, 0xd3, # 0x6f
		0x69, 0x5c, 0x86, 0xd6, 0x14, 0xf7, 0xa5, 0x66, 0x75, 0xac, 0xb1, 0xe9, 0x45, 0x21, 0x70, 0x0c, # 0x7f
		0x87, 0x9f, 0x74, 0xa4, 0x22, 0x4c, 0x6f, 0xbf, 0x1f, 0x56, 0xaa, 0x2e, 0xb3, 0x78, 0x33, 0x50, # 0x8f
		0xb0, 0xa3, 0x92, 0xbc, 0xcf, 0x19, 0x1c, 0xa7, 0x63, 0xcb, 0x1e, 0x4d, 0x3e, 0x4b, 0x1b, 0x9b, # 0x9f
		0x4f, 0xe7, 0xf0, 0xee, 0xad, 0x3a, 0xb5, 0x59, 0x04, 0xea, 0x40, 0x55, 0x25, 0x51, 0xe5, 0x7a, # 0xaf
		0x89, 0x38, 0x68, 0x52, 0x7b, 0xfc, 0x27, 0xae, 0xd7, 0xbd, 0xfa, 0x07, 0xf4, 0xcc, 0x8e, 0x5f, # 0xbf
		0xef, 0x35, 0x9c, 0x84, 0x2b, 0x15, 0xd5, 0x77, 0x34, 0x49, 0xb6, 0x12, 0x0a, 0x7f, 0x71, 0x88, # 0xcf
		0xfd, 0x9d, 0x18, 0x41, 0x7d, 0x93, 0xd8, 0x58, 0x2c, 0xce, 0xfe, 0x24, 0xaf, 0xde, 0xb8, 0x36, # 0xdf
		0xc8, 0xa1, 0x80, 0xa6, 0x99, 0x98, 0xa8, 0x2f, 0x0e, 0x81, 0x65, 0x73, 0xe4, 0xc2, 0xa2, 0x8a, # 0xef
		0xd4, 0xe1, 0x11, 0xd0, 0x08, 0x8b, 0x2a, 0xf2, 0xed, 0x9a, 0x64, 0x3f, 0xc1, 0x6c, 0xf9, 0xec  # 0xff
	]

	# inverse of DECRYPT_TABLE - maps a plaintext byte back to its encrypted form.
	ENCRYPT_TABLE = [nil] * 256
	DECRYPT_TABLE.each_with_index { |i, j| ENCRYPT_TABLE[i] = j }

	# byte-at-a-time reference implementations.
	# NOTE(review): these index the string with Integer (1.8 String#[]
	# semantics) - on ruby 1.9+ they would need .ord / byte access. the #tr
	# based versions below are the ones actually used.
	def self.decrypt_alt encrypted
		decrypted = ''
		encrypted.length.times { |i| decrypted << DECRYPT_TABLE[encrypted[i]] }
		decrypted
	end

	def self.encrypt_alt decrypted
		encrypted = ''
		decrypted.length.times { |i| encrypted << ENCRYPT_TABLE[decrypted[i]] }
		encrypted
	end

	# an alternate implementation that is possibly faster....
	# TODO - bench
	# the gsub escapes ^, - and \ which are special inside String#tr ranges.
	DECRYPT_STR, ENCRYPT_STR = [DECRYPT_TABLE, (0...256)].map do |values|
		values.map { |i| i.chr }.join.gsub(/([\^\-\\])/, "\\\\\\1")
	end

	def self.decrypt encrypted
		encrypted.tr ENCRYPT_STR, DECRYPT_STR
	end

	def self.encrypt decrypted
		decrypted.tr DECRYPT_STR, ENCRYPT_STR
	end
end

# a RangesIO that can transparently decrypt what it reads, for psts using
# compressible encryption.
class RangesIOEncryptable < RangesIO
	def initialize io, mode='r', params={}
		# allow (io, params) calling convention - mode defaults to 'r'
		mode, params = 'r', mode if Hash === mode
		@decrypt = !!params[:decrypt]
		super
	end

	def encrypted?
		@decrypt
	end

	def read limit=nil
		buf = super
		buf = CompressibleEncryption.decrypt(buf) if encrypted?
		buf
	end
end

attr_reader :io, :header, :idx, :desc, :special_folder_ids

# corresponds to
# * pst_open
# * pst_load_index
def initialize io
	@io = io
	io.pos = 0
	@header = Header.new io.read(Header::SIZE)

	# would prefer this to be in Header#validate, but it doesn't have the io size.
	# should perhaps downgrade this to just be a warning...
	# (FIX: message previously had an unbalanced brace - "}}" instead of "})")
	raise FormatError, "header size field invalid (#{header.size} != #{io.size})" unless header.size == io.size

	load_idx
	load_desc
	load_xattrib

	@special_folder_ids = {}
end

def encrypted?
	@header.encrypted?
end

# until i properly fix logging...
def warn s
	Mapi::Log.warn s
end

#
# this is the index and desc record loading code
# ----------------------------------------------------------------------------
#

ToTree = Module.new

# sketches for a factored-out index implementation (see the cleanup plan at
# the top of the file) - not used yet.
module Index2
	BLOCK_SIZE = 512
	module RecursiveLoad
		def load_chain
			#...
		end
	end

	module Base
		def read
			#...
		end
	end

	class Version1997 < Struct.new(:a)#...)
		SIZE = 12

		include RecursiveLoad
		include Base
	end

	class Version2003 < Struct.new(:a)#...)
		SIZE = 24

		include RecursiveLoad
		include Base
	end
end

module Desc2
	module Base
		def desc
			#...
		end
	end

	class Version1997 < Struct.new(:a)#...)
		#include Index::RecursiveLoad
		include Base
	end

	class Version2003 < Struct.new(:a)#...)
		#include Index::RecursiveLoad
		include Base
	end
end

# more constants from libpst.c
# these relate to the index block
ITEM_COUNT_OFFSET = 0x1f0 # count byte
LEVEL_INDICATOR_OFFSET = 0x1f3 # node or leaf
BACKLINK_OFFSET = 0x1f8 # backlink u1 value

# these 3 classes are used to hold various file records

# pst_index - an entry in the id b-tree: maps an id to a (offset, size)
# range in the file.
class Index < Struct.new(:id, :offset, :size, :u1)
	UNPACK_STR = 'VVvv'
	SIZE = 12
	BLOCK_SIZE = 512 # index blocks was 516 but bogus
	COUNT_MAX = 41 # max active items (ITEM_COUNT_OFFSET / Index::SIZE = 41)

	attr_accessor :pst

	def initialize data
		data = Pst.unpack data, UNPACK_STR if String === data
		super(*data)
	end

	# classify the block this index entry points at, by peeking at its first
	# two bytes. plain data blocks have bit 1 of the id clear.
	def type
		@type ||= begin
			if id & 0x2 == 0
				:data
			else
				first_byte, second_byte = read.unpack('CC')
				if first_byte == 1
					raise second_byte unless second_byte == 1
					:data_chain_header
				elsif first_byte == 2
					raise second_byte unless second_byte == 0
					:id2_assoc
				else
					raise FormatError, 'unknown first byte for block - %p' % first_byte
				end
			end
		end
	end

	def data?
		(id & 0x2) == 0
	end

	def read decrypt=true
		# only data blocks are ever encrypted
		decrypt = false unless data?
		pst.pst_read_block_size offset, size, decrypt
	end

	# show all numbers in hex
	def inspect
		super.gsub(/=(\d+)/) { '=0x%x' % $1.to_i }.sub(/Index /, "Index type=#{type.inspect}, ")
	end
end

# mostly guesses.
ITEM_COUNT_OFFSET_64 = 0x1e8
LEVEL_INDICATOR_OFFSET_64 = 0x1eb # diff of 3 between these 2 as above...

# will maybe inherit from Index64, in order to get the same #type function.
# 64 bit (2003 format) variant of Index.
class Index64 < Index
	UNPACK_STR = 'TTvvV'
	SIZE = 24
	BLOCK_SIZE = 512
	COUNT_MAX = 20 # bit of a guess really. 512 / 24 = 21, but doesn't leave enough header room

	# this is the extra item on the end of the UNPACK_STR above
	attr_accessor :u2

	def initialize data
		data = Pst.unpack data, UNPACK_STR if String === data
		@u2 = data.pop
		super data
	end

	def inspect
		super.sub(/>$/, ', u2=%p>' % u2)
	end

	def self.load_chain io, header
		load_idx_rec io, header.index1, 0, 0
	end

	# almost identical to load code for Index, just different offsets and unpack strings.
	# can probably merge them, or write a generic load_tree function or something.
	# NOTE(review): buf[ITEM_COUNT_OFFSET_64] / buf[LEVEL_INDICATOR_OFFSET_64]
	# assume ruby 1.8 String#[] (returns Fixnum) - would need .ord on 1.9+.
	def self.load_idx_rec io, offset, linku1, start_val
		io.seek offset
		buf = io.read BLOCK_SIZE

		idxs = []

		item_count = buf[ITEM_COUNT_OFFSET_64]
		raise "have too many active items in index (#{item_count})" if item_count > COUNT_MAX

		#idx = Index.new buf[BACKLINK_OFFSET, Index::SIZE]
		#raise 'blah 1' unless idx.id == linku1

		if buf[LEVEL_INDICATOR_OFFSET_64] == 0
			# leaf pointers
			# split the data into item_count index objects
			buf[0, SIZE * item_count].scan(/.{#{SIZE}}/mo).each_with_index do |data, i|
				idx = new data
				# first entry
				raise 'blah 3' if i == 0 and start_val != 0 and idx.id != start_val
				#idx.pst = self
				break if idx.id == 0
				idxs << idx
			end
		else
			# node pointers
			# split the data into item_count table pointers
			buf[0, SIZE * item_count].scan(/.{#{SIZE}}/mo).each_with_index do |data, i|
				start, u1, offset = Pst.unpack data, 'T3'
				# for the first value, we expect the start to be equal
				raise 'blah 3' if i == 0 and start_val != 0 and start != start_val
				break if start == 0
				idxs += load_idx_rec io, offset, u1, start
			end
		end

		idxs
	end
end

# pst_desc - 64 bit (2003 format) descriptor record.
class Desc64 < Struct.new(:desc_id, :idx_id, :idx2_id, :parent_desc_id, :u2)
	UNPACK_STR = 'T3VV'
	SIZE = 32
	BLOCK_SIZE = 512 # descriptor blocks was 520 but bogus
	COUNT_MAX = 15 # guess as per Index64

	include RecursivelyEnumerable

	attr_accessor :pst
	attr_reader :children

	def initialize data
		super(*Pst.unpack(data, UNPACK_STR))
		@children = []
	end

	# the idx record giving this desc's primary data stream
	def desc
		pst.idx_from_id idx_id
	end

	# the idx record for this desc's id2 association list
	def list_index
		pst.idx_from_id idx2_id
	end

	def self.load_chain io, header
		load_desc_rec io, header.index2, 0, 0x21
	end

	# NOTE(review): same ruby 1.8 String#[] assumption as Index64.load_idx_rec.
	def self.load_desc_rec io, offset, linku1, start_val
		io.seek offset
		buf = io.read BLOCK_SIZE

		descs = []
		item_count = buf[ITEM_COUNT_OFFSET_64]

		# not real desc
		#desc = Desc.new buf[BACKLINK_OFFSET, 4]
		#raise 'blah 1' unless desc.desc_id == linku1

		if buf[LEVEL_INDICATOR_OFFSET_64] == 0
			# leaf pointers
			raise "have too many active items in index (#{item_count})" if item_count > COUNT_MAX
			# split the data into item_count desc objects
			buf[0, SIZE * item_count].scan(/.{#{SIZE}}/mo).each_with_index do |data, i|
				desc = new data
				# first entry
				raise 'blah 3' if i == 0 and start_val != 0 and desc.desc_id != start_val
				break if desc.desc_id == 0
				descs << desc
			end
		else
			# node pointers
			raise "have too many active items in index (#{item_count})" if item_count > Index64::COUNT_MAX
			# split the data into item_count table pointers
			buf[0, Index64::SIZE * item_count].scan(/.{#{Index64::SIZE}}/mo).each_with_index do |data, i|
				start, u1, offset = Pst.unpack data, 'T3'
				# for the first value, we expect the start to be equal note that ids -1, so even for the
				# first we expect it to be equal. thats the 0x21 (dec 33) desc record. this means we assert
				# that the first desc record is always 33...
				# thats because 0x21 is the pst root itself...
				raise 'blah 3' if i == 0 and start_val != -1 and start != start_val
				# this shouldn't really happen i'd imagine
				break if start == 0
				descs += load_desc_rec io, offset, u1, start
			end
		end

		descs
	end

	def each_child(&block)
		@children.each(&block)
	end
end

# _pst_table_ptr_struct - an interior node pointer in either b-tree.
class TablePtr < Struct.new(:start, :u1, :offset)
	UNPACK_STR = 'V3'
	SIZE = 12

	def initialize data
		data = data.unpack(UNPACK_STR) if String === data
		super(*data)
	end
end

# pst_desc
# idx_id is a pointer to an idx record which gets the primary data stream for the Desc record.
# idx2_id gets you an idx record, that when read gives you an ID2 association list, which just maps # another set of ids to index values class Desc < Struct.new(:desc_id, :idx_id, :idx2_id, :parent_desc_id) UNPACK_STR = 'V4' SIZE = 16 BLOCK_SIZE = 512 # descriptor blocks was 520 but bogus COUNT_MAX = 31 # max active desc records (ITEM_COUNT_OFFSET / Desc::SIZE = 31) include ToTree attr_accessor :pst attr_reader :children def initialize data super(*data.unpack(UNPACK_STR)) @children = [] end def desc pst.idx_from_id idx_id end def list_index pst.idx_from_id idx2_id end # show all numbers in hex def inspect super.gsub(/=(\d+)/) { '=0x%x' % $1.to_i } end end # corresponds to # * _pst_build_id_ptr def load_idx @idx = [] @idx_offsets = [] if header.version_2003? @idx = Index64.load_chain io, header @idx.each { |idx| idx.pst = self } else load_idx_rec header.index1, header.index1_count, 0 end # we'll typically be accessing by id, so create a hash as a lookup cache @idx_from_id = {} @idx.each do |idx| warn "there are duplicate idx records with id #{idx.id}" if @idx_from_id[idx.id] @idx_from_id[idx.id] = idx end end # load the flat idx table, which maps ids to file ranges. 
# this is the recursive helper for load_idx.
#
# corresponds to
# * _pst_build_id_ptr
#
# NOTE(review): buf[ITEM_COUNT_OFFSET] / buf[LEVEL_INDICATOR_OFFSET] assume
# ruby 1.8 String#[] semantics (Fixnum result) - would need .ord on 1.9+.
def load_idx_rec offset, linku1, start_val
	@idx_offsets << offset

	#_pst_read_block_size(pf, offset, BLOCK_SIZE, &buf, 0, 0) < BLOCK_SIZE)
	buf = pst_read_block_size offset, Index::BLOCK_SIZE, false

	item_count = buf[ITEM_COUNT_OFFSET]
	raise "have too many active items in index (#{item_count})" if item_count > Index::COUNT_MAX

	idx = Index.new buf[BACKLINK_OFFSET, Index::SIZE]
	raise 'blah 1' unless idx.id == linku1

	if buf[LEVEL_INDICATOR_OFFSET] == 0
		# leaf pointers
		# split the data into item_count index objects
		buf[0, Index::SIZE * item_count].scan(/.{#{Index::SIZE}}/mo).each_with_index do |data, i|
			idx = Index.new data
			# first entry
			raise 'blah 3' if i == 0 and start_val != 0 and idx.id != start_val
			idx.pst = self
			# this shouldn't really happen i'd imagine
			break if idx.id == 0
			@idx << idx
		end
	else
		# node pointers
		# split the data into item_count table pointers
		buf[0, TablePtr::SIZE * item_count].scan(/.{#{TablePtr::SIZE}}/mo).each_with_index do |data, i|
			table = TablePtr.new data
			# for the first value, we expect the start to be equal
			raise 'blah 3' if i == 0 and start_val != 0 and table.start != start_val
			# this shouldn't really happen i'd imagine
			break if table.start == 0
			load_idx_rec table.offset, table.u1, table.start
		end
	end
end

# most access to idx objects will use this function
#
# corresponds to
# * _pst_getID
def idx_from_id id
	@idx_from_id[id]
end

# load the flat @desc list from the file's second b-tree, build the
# @desc_from_id lookup, then link each desc into its parent's children
# (collecting parentless records into @orphans).
#
# corresponds to
# * _pst_build_desc_ptr
# * record_descriptor
def load_desc
	@desc = []
	@desc_offsets = []
	if header.version_2003?
		@desc = Desc64.load_chain io, header
		@desc.each { |desc| desc.pst = self }
	else
		load_desc_rec header.index2, header.index2_count, 0x21
	end

	# first create a lookup cache
	@desc_from_id = {}
	@desc.each do |desc|
		desc.pst = self
		warn "there are duplicate desc records with id #{desc.desc_id}" if @desc_from_id[desc.desc_id]
		@desc_from_id[desc.desc_id] = desc
	end

	# now turn the flat list of loaded desc records into a tree

	# well, they have no parent, so they're more like, the toplevel descs.
	@orphans = []
	# now assign each node to the parents child array, putting the orphans in the above
	@desc.each do |desc|
		parent = @desc_from_id[desc.parent_desc_id]
		# note, besides this, its possible to create other circular structures.
		if parent == desc
			# this actually happens usually, for the root_item it appears.
			# (falls through to @orphans below)
			#warn "desc record's parent is itself (#{desc.inspect})"
		# maybe add some more checks in here for circular structures
		elsif parent
			parent.children << desc
			next
		end
		@orphans << desc
	end

	# maybe change this to some sort of sane-ness check. orphans are expected
	# warn "have #{@orphans.length} orphan desc record(s)." unless @orphans.empty?
end

# load the flat list of desc records recursively - helper for load_desc.
#
# corresponds to
# * _pst_build_desc_ptr
# * record_descriptor
def load_desc_rec offset, linku1, start_val
	@desc_offsets << offset

	buf = pst_read_block_size offset, Desc::BLOCK_SIZE, false
	item_count = buf[ITEM_COUNT_OFFSET]

	# not real desc
	desc = Desc.new buf[BACKLINK_OFFSET, 4]
	raise 'blah 1' unless desc.desc_id == linku1

	if buf[LEVEL_INDICATOR_OFFSET] == 0
		# leaf pointers
		raise "have too many active items in index (#{item_count})" if item_count > Desc::COUNT_MAX
		# split the data into item_count desc objects
		buf[0, Desc::SIZE * item_count].scan(/.{#{Desc::SIZE}}/mo).each_with_index do |data, i|
			desc = Desc.new data
			# first entry
			raise 'blah 3' if i == 0 and start_val != 0 and desc.desc_id != start_val
			# this shouldn't really happen i'd imagine
			break if desc.desc_id == 0
			@desc << desc
		end
	else
		# node pointers
		raise "have too many active items in index (#{item_count})" if item_count > Index::COUNT_MAX
		# split the data into item_count table pointers
		buf[0, TablePtr::SIZE * item_count].scan(/.{#{TablePtr::SIZE}}/mo).each_with_index do |data, i|
			table = TablePtr.new data
			# for the first value, we expect the start to be equal note that ids -1, so even for the
			# first we expect it to be equal. thats the 0x21 (dec 33) desc record. this means we assert
			# that the first desc record is always 33...
			raise 'blah 3' if i == 0 and start_val != -1 and table.start != start_val
			# this shouldn't really happen i'd imagine
			break if table.start == 0
			load_desc_rec table.offset, table.u1, table.start
		end
	end
end

# as for idx
#
# corresponds to:
# * _pst_getDptr
def desc_from_id id
	@desc_from_id[id]
end

# corresponds to
# * pst_load_extended_attributes
def load_xattrib
	unless desc = desc_from_id(0x61)
		warn "no extended attributes desc record found"
		return
	end
	unless desc.desc
		warn "no desc idx for extended attributes"
		return
	end
	if desc.list_index
	end
	#warn "skipping loading xattribs"
	# FIXME implement loading xattribs
end

# read a raw block of +size+ bytes at +offset+, decrypting it if the file is
# encrypted (and +decrypt+ allows it).
#
# corresponds to:
# * _pst_read_block_size
# * _pst_read_block ??
# * _pst_ff_getIDblock_dec ??
# * _pst_ff_getIDblock ??
def pst_read_block_size offset, size, decrypt=true
	io.seek offset
	buf = io.read size
	warn "tried to read #{size} bytes but only got #{buf.length}" if buf.length != size
	encrypted? && decrypt ? CompressibleEncryption.decrypt(buf) : buf
end

#
# id2
# ----------------------------------------------------------------------------
#

# a single id2 -> id association record (1997 format).
class ID2Assoc < Struct.new(:id2, :id, :table2)
	UNPACK_STR = 'V3'
	SIZE = 12

	def initialize data
		data = data.unpack(UNPACK_STR) if String === data
		super(*data)
	end
end

# 64 bit (2003 format) variant of ID2Assoc.
class ID2Assoc64 < Struct.new(:id2, :u1, :id, :table2)
	UNPACK_STR = 'VVT2'
	SIZE = 24

	def initialize data
		if String === data
			data = Pst.unpack data, UNPACK_STR
		end
		super(*data)
	end

	# read an id2 association block (type 0x0002) from +idx+, recursing into
	# any chained blocks referenced via table2.
	def self.load_chain idx
		buf = idx.read
		type, count = buf.unpack 'v2'
		unless type == 0x0002
			raise 'unknown id2 type 0x%04x' % type
			#return
		end
		id2 = []
		count.times do |i|
			assoc = new buf[8 + SIZE * i, SIZE]
			id2 << assoc
			if assoc.table2 != 0
				id2 += load_chain idx.pst.idx_from_id(assoc.table2)
			end
		end
		id2
	end
end

# wraps a list of ID2Assoc records with an id2 -> idx lookup.
class ID2Mapping
	attr_reader :list

	def initialize pst, list
		@pst = pst
		@list = list
		# create a lookup.
		@id_from_id2 = {}
		@list.each do |id2|
			# this "fixes" test4-o1997.pst for the time being.
			# NOTE(review): the original comment claimed the *last* value seen
			# wins, but the `next` below actually keeps the *first* value for
			# duplicate id2 keys.
			warn "there are duplicate id2 records with id #{id2.id2}" if @id_from_id2[id2.id2]
			next if @id_from_id2[id2.id2]
			@id_from_id2[id2.id2] = id2.id
		end
	end

	# TODO: fix logging
	def warn s
		Mapi::Log.warn s
	end

	# corresponds to:
	# * _pst_getID2
	def [] id
		#id2 = @list.find { |x| x.id2 == id }
		id = @id_from_id2[id]
		id and @pst.idx_from_id(id)
	end
end

# build the ID2Mapping for +idx+ (the desc's list_index idx record).
def load_idx2 idx
	if header.version_2003?
		id2 = ID2Assoc64.load_chain idx
	else
		id2 = load_idx2_rec idx
	end
	ID2Mapping.new self, id2
end

# corresponds to
# * _pst_build_id2
def load_idx2_rec idx
	# i should perhaps use a idx chain style read here?
	buf = pst_read_block_size idx.offset, idx.size, false
	type, count = buf.unpack 'v2'
	unless type == 0x0002
		raise 'unknown id2 type 0x%04x' % type
		#return
	end
	id2 = []
	count.times do |i|
		assoc = ID2Assoc.new buf[4 + ID2Assoc::SIZE * i, ID2Assoc::SIZE]
		id2 << assoc
		if assoc.table2 != 0
			id2 += load_idx2_rec idx_from_id(assoc.table2)
		end
	end
	id2
end

# IO over the file ranges of an idx chain, decrypting uniformly if needed.
class RangesIOIdxChain < RangesIOEncryptable
	def initialize pst, idx_head
		@idxs = pst.id2_block_idx_chain idx_head
		# whether or not a given idx needs encrypting
		decrypts = @idxs.map do |idx|
			decrypt = (idx.id & 2) != 0 ? false : pst.encrypted?
		end.uniq
		raise NotImplementedError, 'partial encryption in RangesIOID2' if decrypts.length > 1
		decrypt = decrypts.first
		# convert idxs to ranges
		ranges = @idxs.map { |idx| [idx.offset, idx.size] }
		super pst.io, :ranges => ranges, :decrypt => decrypt
	end
end

# convenience constructor - resolves an id2 value through the idx2 mapping.
class RangesIOID2 < RangesIOIdxChain
	def self.new pst, id2, idx2
		RangesIOIdxChain.new pst, idx2[id2]
	end
end

# expand +idx+ into the flat list of plain data-block idx records it chains to.
#
# corresponds to:
# * _pst_ff_getID2block
# * _pst_ff_getID2data
# * _pst_ff_compile_ID
def id2_block_idx_chain idx
	if (idx.id & 0x2) == 0
		[idx]
	else
		buf = idx.read
		type, fdepth, count = buf[0, 4].unpack 'CCv'
		unless type == 1 # libpst.c:3958
			warn 'Error in idx_chain - %p, %p, %p - attempting to ignore' % [type, fdepth, count]
			return [idx]
		end
		# there are 4 unaccounted for bytes here, 4...8
		if header.version_2003?
			# NOTE(review): this passes the custom 'T' directive to plain
			# String#unpack rather than Pst.unpack - looks suspect; modern
			# rubies raise on unknown directives. confirm against a 2003 pst.
			ids = buf[8, count * 8].unpack("T#{count}")
		else
			ids = buf[8, count * 4].unpack('V*')
		end
		if fdepth == 1
			ids.map { |id| idx_from_id id }
		else
			ids.map { |id| id2_block_idx_chain idx_from_id(id) }.flatten
		end
	end
end

#
# main block parsing code. gets raw properties
# ----------------------------------------------------------------------------
#

# the job of this class, is to take a desc record, and be able to enumerate through the
# mapi properties of the associated thing.
#
# corresponds to
# * _pst_parse_block
# * _pst_process (in some ways. although perhaps thats more the Item::Properties#add_property)
class BlockParser
	include Mapi::Types::Constants

	TYPES = {
		0xbcec => 1,
		0x7cec => 2,
		# type 3 is removed. an artifact of not handling the indirect blocks properly in libpst.
	}

	PR_SUBJECT = PropertySet::TAGS.find { |num, (name, type)| name == 'PR_SUBJECT' }.first.hex
	PR_BODY_HTML = PropertySet::TAGS.find { |num, (name, type)| name == 'PR_BODY_HTML' }.first.hex

	# this stuff could maybe be moved to Ole::Types? or leverage it somehow?
	# whether or not a type is immediate is more a property of the pst encoding though i expect.
	# what i probably can add is a generic concept of whether a type is of variadic length or not.

	# these lists are very incomplete. think they are largely copied from libpst
	IMMEDIATE_TYPES = [
		PT_SHORT, PT_LONG, PT_BOOLEAN
	]

	INDIRECT_TYPES = [
		PT_DOUBLE, PT_OBJECT,
		0x0014, # whats this? probably something like PT_LONGLONG, given the correspondence with the
		        # ole variant types. (= VT_I8)
		PT_STRING8, PT_UNICODE, # unicode isn't in libpst, but added here for outlook 2003 down the track
		PT_SYSTIME,
		0x0048, # another unknown
		0x0102, # this is PT_BINARY vs PT_CLSID
		#0x1003, # these are vector types, but they're commented out for now because i'd expect that
		#0x1014, # there's extra decoding needed that i'm not doing. (probably just need a simple
		        # PT_* => unpack string mapping for the immediate types, and just do unpack('V*') etc
		#0x101e,
		#0x1102
	]

	# the attachment and recipient arrays appear to be always stored with these fixed
	# id2 values. seems strange. are there other extra streams? can find out by making higher
	# level IO wrapper, which has the id2 value, and doing the diff of available id2 values versus
	# used id2 values in properties of an item.
	ID2_ATTACHMENTS = 0x671
	ID2_RECIPIENTS = 0x692

	attr_reader :desc, :data, :data_chunks, :offset_tables

	# desc - the Desc (or Desc64 / fake OpenStruct desc) whose data stream is
	# to be parsed. reads all chained data blocks and builds the per-chunk
	# offset tables used by get_data_indirect_io.
	def initialize desc
		raise FormatError, "unable to get associated index record for #{desc.inspect}" unless desc.desc
		@desc = desc
		#@data = desc.desc.read
		if Pst::Index === desc.desc
			#@data = RangesIOIdxChain.new(desc.pst, desc.desc).read
			idxs = desc.pst.id2_block_idx_chain desc.desc # this gets me the plain index chain.
		else # fake desc
			#@data = desc.desc.read
			idxs = [desc.desc]
		end

		@data_chunks = idxs.map { |idx| idx.read }
		@data = @data_chunks.first

		load_header

		# first chunk's index offset comes from load_header; subsequent chunks
		# store theirs in their first 2 bytes.
		@index_offsets = [@index_offset] + @data_chunks[1..-1].map { |chunk| chunk.unpack('v')[0] }
		@offset_tables = []
		@ignored = []
		@data_chunks.zip(@index_offsets).each do |chunk, offset|
			ignore = chunk[offset, 2].unpack('v')[0]
			@ignored << ignore
			# p ignore
			@offset_tables.push offset_table = []
			# maybe its ok if there aren't to be any values ?
			raise FormatError if offset == 0
			offsets = chunk[offset + 2..-1].unpack('v*')
			#p offsets
			offsets[0, ignore + 2].each_cons 2 do |from, to|
				#next if to == 0
				raise FormatError, [from, to].inspect if from > to
				offset_table << [from, to]
			end
		end

		@offset_table = @offset_tables.first
		@idxs = idxs

		# now, we may have multiple different blocks
	end

	# a given desc record may or may not have associated idx2 data. we lazily load it here, so it will never
	# actually be requested unless get_data_indirect actually needs to use it.
	def idx2
		return @idx2 if @idx2
		raise FormatError, 'idx2 requested but no idx2 available' unless desc.list_index
		# should check this can't return nil
		@idx2 = desc.pst.load_idx2 desc.list_index
	end

	# parse the common 8 byte block header and classify the block via TYPES.
	def load_header
		@index_offset, type, @offset1 = data.unpack 'vvV'
		raise FormatError, 'unknown block type signature 0x%04x' % type unless TYPES[type]
		@type = TYPES[type]
	end

	# based on the value of offset, return either some data from buf, or some data from the
	# id2 chain id2, where offset is some key into a lookup table that is stored as the id2
	# chain. i think i may need to create a BlockParser class that wraps up all this mess.
	#
	# corresponds to:
	# * _pst_getBlockOffsetPointer
	# * _pst_getBlockOffset
	def get_data_indirect offset
		# delegates entirely to the io based version; everything below the
		# return is the superseded single-chunk implementation (dead code).
		return get_data_indirect_io(offset).read

		if offset == 0
			nil
		elsif (offset & 0xf) == 0xf
			RangesIOID2.new(desc.pst, offset, idx2).read
		else
			low, high = offset & 0xf, offset >> 4
			raise FormatError if low != 0 or (high & 0x1) != 0 or (high / 2) > @offset_table.length
			from, to = @offset_table[high / 2]
			data[from...to]
		end
	end

	# as get_data_indirect, but returns an IO (StringIO / RangesIOID2) rather
	# than a string. offsets ending in 0xf are id2 references; otherwise the
	# offset encodes (chunk, slot) into the offset tables built in initialize.
	def get_data_indirect_io offset
		if offset == 0
			nil
		elsif (offset & 0xf) == 0xf
			if idx2[offset]
				RangesIOID2.new desc.pst, offset, idx2
			else
				warn "tried to get idx2 record for #{offset} but failed"
				return StringIO.new('')
			end
		else
			low, high = offset & 0xf, offset >> 4
			if low != 0 or (high & 0x1) != 0
				# raise FormatError,
				warn "bad - #{low} #{high} (1)"
				return StringIO.new('')
			end
			# lets see which block it should come from.
			block_idx, i = high.divmod 4096
			unless block_idx < @data_chunks.length
				warn "bad - block_idx to high (not #{block_idx} < #{@data_chunks.length})"
				return StringIO.new('')
			end
			data_chunk, offset_table = @data_chunks[block_idx], @offset_tables[block_idx]
			if i / 2 >= offset_table.length
				warn "bad - #{low} #{high} - #{i / 2} >= #{offset_table.length} (2)"
				return StringIO.new('')
			end
			#warn "ok - #{low} #{high} #{offset_table.length}"
			from, to = offset_table[i / 2]
			StringIO.new data_chunk[from...to]
		end
	end

	# decode one raw (key, type, value) property tuple: resolve indirect
	# values through get_data_indirect_io, convert strings/booleans/vectors,
	# and apply the special-case handling for subject, html body, and
	# embedded objects. returns the decoded [key, type, value].
	def handle_indirect_values key, type, value
		case type
		when PT_BOOLEAN
			value = value != 0
		when *IMMEDIATE_TYPES # not including PT_BOOLEAN which we just did above
			# no processing current applied (needed?).
		when *INDIRECT_TYPES
			# the value is a pointer
			if String === value # ie, value size > 4 above
				value = StringIO.new value
			else
				value = get_data_indirect_io(value)
			end
			# keep strings as immediate values for now, for compatability with how i set up
			# Msg::Properties::ENCODINGS
			if value
				if type == PT_STRING8
					value = value.read
				elsif type == PT_UNICODE
					value = Ole::Types::FROM_UTF16.iconv value.read
				end
			end
			# special subject handling
			if key == PR_BODY_HTML and value
				# to keep the msg code happy, which thinks body_html will be an io
				# although, in 2003 version, they are 0102 already
				value = StringIO.new value unless value.respond_to?(:read)
			end
			if key == PR_SUBJECT and value
				ignore, offset = value.unpack 'C2'
				offset = (offset == 1 ? nil : offset - 3)
				value = value[2..-1]
=begin
				index = value =~ /^[A-Z]*:/ ? $~[0].length - 1 : nil
				unless ignore == 1 and offset == index
					warn 'something wrong with subject hack'
					$x = [ignore, offset, value]
					require 'irb'
					IRB.start
					exit
				end
=end
=begin
new idea:

making sense of the \001\00[156] i've seen prefixing subject. i think its to do with the
placement of the ':', or the ' '. And perhaps an optimization to do with thread topic, and
ignoring the prefixes added by mailers. thread topic is equal to subject with all that crap
removed.

can test by creating some mails with bizarre subjects.

subject="\001\005RE: blah blah"
subject="\001\001blah blah"
subject="\001\032Out of Office AutoReply: blah blah"
subject="\001\020Undeliverable: blah blah"

looks like it
=end
				# now what i think, is that perhaps, value[offset..-1] ...
				# or something like that should be stored as a special tag. ie, do a double yield
				# for this case. probably PR_CONVERSATION_TOPIC, in which case i'd write instead:
				# yield [PR_SUBJECT, ref_type, value]
				# yield [PR_CONVERSATION_TOPIC, ref_type, value[offset..-1]
				# next # to skip the yield.
			end

			# special handling for embedded objects
			# used for attach_data for attached messages. in which case attach_method should == 5,
			# for embedded object.
			if type == PT_OBJECT and value
				value = value.read if value.respond_to?(:read)
				id2, unknown = value.unpack 'V2'
				io = RangesIOID2.new desc.pst, id2, idx2

				# hacky
				desc2 = OpenStruct.new(:desc => io, :pst => desc.pst, :list_index => desc.list_index, :children => [])
				# put nil instead of desc.list_index, otherwise the attachment is attached to itself ad infinitum.
				# should try and fix that FIXME
				# this shouldn't be done always. for an attached message, yes, but for an attached
				# meta file, for example, it shouldn't. difference between embedded_ole vs embedded_msg
				# really.
				# note that in the case where its a embedded ole, you actually get a regular serialized ole
				# object, so i need to create an ole storage object on a rangesioidxchain!
				# eg:
=begin
att.props.display_name # => "Picture (Metafile)"
io = att.props.attach_data
io.read(32).unpack('H*') # => ["d0cf11e0a1b11ae100000... note the docfile signature.
# plug some missing rangesio holes:
def io.rewind; seek 0; end
def io.flush; raise IOError; end
ole = Ole::Storage.open io
puts ole.root.to_tree
# (tree output elided)
=end
				# until properly fixed, i have disabled this code here, so this will break
				# nested messages temporarily.
				#value = Item.new desc2, RawPropertyStore.new(desc2).to_a
				#desc2.list_index = nil
				value = io
			end
		# this is PT_MV_STRING8, i guess.
		# should probably have the 0x1000 flag, and do the or-ring.
		# example of 0x1102 is PR_OUTLOOK_2003_ENTRYIDS. less sure about that one.
		when 0x101e, 0x1102
			# example data:
			# 0x802b "\003\000\000\000\020\000\000\000\030\000\000\000#\000\000\000BusinessCompetitionFavorites"
			# this 0x802b would be an extended attribute for categories / keywords.
			value = get_data_indirect_io(value).read unless String === value
			num = value.unpack('V')[0]
			offsets = value[4, 4 * num].unpack("V#{num}")
			value = (offsets + [value.length]).to_enum(:each_cons, 2).map { |from, to| value[from...to] }
			value.map! { |str| StringIO.new str } if type == 0x1102
		else
			name = Mapi::Types::DATA[type].first rescue nil
			warn '0x%04x %p' % [key, get_data_indirect_io(value).read]
			raise NotImplementedError, 'unsupported mapi property type - 0x%04x (%p)' % [type, name]
		end
		[key, type, value]
	end
end

=begin
* recipients:

  affects: ["0x200764", "0x2011c4", "0x201b24", "0x201b44", "0x201ba4", "0x201c24", "0x201cc4", "0x202504"]

after adding the rawpropertystoretable fix, all except the second parse properly, and satisfy:

  item.props.display_to == item.recipients.map { |r| r.props.display_name if r.props.recipient_type == 1 }.compact * '; '

only the second still has a problem

#[#] think this is related to a multi block #data3. ie, when you use @x * rec_size, and it goes > 8190,
or there abouts, then it stuffs up. probably there is header gunk, or something, similar to when #data
is multi block. same problem affects the attachment table in test4.

fixed that issue. round data3 ranges to rec_size.

fix other issue with attached objects.

all recipients and attachments in test2 are fine.

only remaining issue is test4 recipients of 200044. strange.
=end

# RawPropertyStore is used to iterate through the properties of an item, or the auxiliary
# data for an attachment.
its just a parser for the way the properties are serialized, when the # properties don't have to conform to a column structure. # # structure of this chunk of data is often # header, property keys, data values, and then indexes. # the property keys has value in it. value can be the actual value if its a short type, # otherwise you lookup the value in the indicies, where you get the offsets to use in the # main data body. due to the indirect thing though, any of these parts could actually come # from a separate stream. class RawPropertyStore < BlockParser include Enumerable attr_reader :length def initialize desc super raise FormatError, "expected type 1 - got #{@type}" unless @type == 1 # the way that offset works, data1 may be a subset of buf, or something from id2. if its from buf, # it will be offset based on index_offset and offset. so it could be some random chunk of data anywhere # in the thing. header_data = get_data_indirect @offset1 raise FormatError if header_data.length < 8 signature, offset2 = header_data.unpack 'V2' #p [@type, signature] raise FormatError, 'unhandled block signature 0x%08x' % @type if signature != 0x000602b5 # this is actually a big chunk of tag tuples. @index_data = get_data_indirect offset2 @length = @index_data.length / 8 end # iterate through the property tuples def each length.times do |i| key, type, value = handle_indirect_values(*@index_data[8 * i, 8].unpack('vvV')) yield key, type, value end end end # RawPropertyStoreTable is kind of like a database table. # it has a fixed set of columns. # #[] is kind of like getting a row from the table. # those rows are currently encapsulated by Row, which has #each like # RawPropertyStore. # only used for the recipients array, and the attachments array. completely lazy, doesn't # load any of the properties upon creation. 
class RawPropertyStoreTable < BlockParser class Column < Struct.new(:ref_type, :type, :ind2_off, :size, :slot) def initialize data super(*data.unpack('v3CC')) end def nice_type_name Mapi::Types::DATA[ref_type].first[/_(.*)/, 1].downcase rescue '0x%04x' % ref_type end def nice_prop_name Mapi::PropertyStore::TAGS['%04x' % type].first[/_(.*)/, 1].downcase rescue '0x%04x' % type end def inspect "#<#{self.class} name=#{nice_prop_name.inspect}, type=#{nice_type_name.inspect}>" end end include Enumerable attr_reader :length, :index_data, :data2, :data3, :rec_size def initialize desc super raise FormatError, "expected type 2 - got #{@type}" unless @type == 2 header_data = get_data_indirect @offset1 # seven_c_blk # often: u1 == u2 and u3 == u2 + 2, then rec_size == u3 + 4. wtf seven_c, @num_list, u1, u2, u3, @rec_size, b_five_offset, ind2_offset, u7, u8 = header_data[0, 22].unpack('CCv4V2v2') @index_data = header_data[22..-1] raise FormatError if @num_list != schema.length or seven_c != 0x7c # another check min_size = schema.inject(0) { |total, col| total + col.size } # seem to have at max, 8 padding bytes on the end of the record. not sure if it means # anything. maybe its just space that hasn't been reclaimed due to columns being # removed or something. probably should just check lower bound. range = (min_size..min_size + 8) warn "rec_size seems wrong (#{range} !=== #{rec_size})" unless range === rec_size header_data2 = get_data_indirect b_five_offset raise FormatError if header_data2.length < 8 signature, offset2 = header_data2.unpack 'V2' # ??? seems a bit iffy # there's probably more to the differences than this, and the data2 difference below expect = desc.pst.header.version_2003? ? 0x000404b5 : 0x000204b5 raise FormatError, 'unhandled block signature 0x%08x' % signature if signature != expect # this holds all the row data # handle multiple block issue. 
@data3_io = get_data_indirect_io ind2_offset if RangesIOIdxChain === @data3_io @data3_idxs = # modify ranges ranges = @data3_io.ranges.map { |offset, size| [offset, size / @rec_size * @rec_size] } @data3_io.instance_variable_set :@ranges, ranges end @data3 = @data3_io.read # there must be something to the data in data2. i think data2 is the array of objects essentially. # currently its only used to imply a length # actually, at size 6, its just some auxiliary data. i'm thinking either Vv/vV, for 97, and something # wider for 03. the second value is just the index (0...length), and the first value is # some kind of offset i expect. actually, they were all id2 values, in another case. # so maybe they're get_data_indirect values too? # actually, it turned out they were identical to the PR_ATTACHMENT_ID2 values... # id2_values = ie, data2.unpack('v*').to_enum(:each_slice, 3).transpose[0] # table[i].assoc(PR_ATTACHMENT_ID2).last == id2_values[i], for all i. @data2 = get_data_indirect(offset2) rescue nil #if data2 # @length = (data2.length / 6.0).ceil #else # the above / 6, may have been ok for 97 files, but the new 0x0004 style block must have # different size records... just use this instead: # hmmm, actually, we can still figure it out: @length = @data3.length / @rec_size #end # lets try and at least use data2 for a warning for now if data2 data2_rec_size = desc.pst.header.version_2003? ? 
8 : 6 warn 'somthing seems wrong with data3' unless @length == (data2.length / data2_rec_size) end end def schema @schema ||= index_data.scan(/.{8}/m).map { |data| Column.new data } end def [] idx # handle funky rounding Row.new self, idx * @rec_size end def each length.times { |i| yield self[i] } end class Row include Enumerable def initialize array_parser, x @array_parser, @x = array_parser, x end # iterate through the property tuples def each (@array_parser.index_data.length / 8).times do |i| ref_type, type, ind2_off, size, slot = @array_parser.index_data[8 * i, 8].unpack 'v3CC' # check this rescue too value = @array_parser.data3[@x + ind2_off, size] # if INDIRECT_TYPES.include? ref_type if size <= 4 value = value.unpack('V')[0] end #p ['0x%04x' % ref_type, '0x%04x' % type, (Msg::Properties::MAPITAGS['%04x' % type].first[/^.._(.*)/, 1].downcase rescue nil), # value_orig, value, (get_data_indirect(value_orig.unpack('V')[0]) rescue nil), size, ind2_off, slot] key, type, value = @array_parser.handle_indirect_values type, ref_type, value yield key, type, value end end end end class AttachmentTable < BlockParser # a "fake" MAPI property name for this constant. if you get a mapi property with # this value, it is the id2 value to use to get attachment data. PR_ATTACHMENT_ID2 = 0x67f2 attr_reader :desc, :table def initialize desc @desc = desc # no super, we only actually want BlockParser2#idx2 @table = nil return unless desc.list_index return unless idx = idx2[ID2_ATTACHMENTS] # FIXME make a fake desc. 
@desc2 = OpenStruct.new :desc => idx, :pst => desc.pst, :list_index => desc.list_index @table = RawPropertyStoreTable.new @desc2 end def to_a return [] if !table table.map do |attachment| attachment = attachment.to_a #p attachment # potentially merge with yet more properties # this still seems pretty broken - especially the property overlap if attachment_id2 = attachment.assoc(PR_ATTACHMENT_ID2) #p attachment_id2.last #p idx2[attachment_id2.last] @desc2.desc = idx2[attachment_id2.last] RawPropertyStore.new(@desc2).each do |a, b, c| record = attachment.assoc a attachment << record = [] unless record record.replace [a, b, c] end end attachment end end end # there is no equivalent to this in libpst. ID2_RECIPIENTS was just guessed given the above # AttachmentTable. class RecipientTable < BlockParser attr_reader :desc, :table def initialize desc @desc = desc # no super, we only actually want BlockParser2#idx2 @table = nil return unless desc.list_index return unless idx = idx2[ID2_RECIPIENTS] # FIXME make a fake desc. desc2 = OpenStruct.new :desc => idx, :pst => desc.pst, :list_index => desc.list_index @table = RawPropertyStoreTable.new desc2 end def to_a return [] if !table table.map { |x| x.to_a } end end # # higher level item code. wraps up the raw properties above, and gives nice # objects to work with. handles item relationships too. 
# ---------------------------------------------------------------------------- # def self.make_property_set property_list hash = property_list.inject({}) do |hash, (key, type, value)| hash.update PropertySet::Key.new(key) => value end PropertySet.new hash end class Attachment < Mapi::Attachment def initialize list super Pst.make_property_set(list) @embedded_msg = props.attach_data if Item === props.attach_data end end class Recipient < Mapi::Recipient def initialize list super Pst.make_property_set(list) end end class Item < Mapi::Message class EntryID < Struct.new(:u1, :entry_id, :id) UNPACK_STR = 'VA16V' def initialize data data = data.unpack(UNPACK_STR) if String === data super(*data) end end include RecursivelyEnumerable attr_accessor :type, :parent def initialize desc, list, type=nil @desc = desc super Pst.make_property_set(list) # this is kind of weird, but the ids of the special folders are stored in a hash # when the root item is loaded if ipm_wastebasket_entryid desc.pst.special_folder_ids[ipm_wastebasket_entryid] = :wastebasket end if finder_entryid desc.pst.special_folder_ids[finder_entryid] = :finder end # and then here, those are used, along with a crappy heuristic to determine if we are an # item =begin i think the low bits of the desc_id can give some info on the type. it seems that 0x4 is for regular messages (and maybe contacts etc) 0x2 is for folders, and 0x8 is for special things like rules etc, that aren't visible. =end unless type type = props.valid_folder_mask || ipm_subtree_entryid || props.content_count || props.subfolders ? :folder : :message if type == :folder type = desc.pst.special_folder_ids[desc.desc_id] || type end end @type = type end def each_child id = ipm_subtree_entryid if id root = @desc.pst.desc_from_id id raise "couldn't find root" unless root raise 'both kinds of children' unless @desc.children.empty? children = root.children # lets look up the other ids we have. 
# typically the wastebasket one "deleted items" is in the children already, but # the search folder isn't. extras = [ipm_wastebasket_entryid, finder_entryid].compact.map do |id| root = @desc.pst.desc_from_id id warn "couldn't find root for id #{id}" unless root root end.compact # i do this instead of union, so as not to mess with the order of the # existing children. children += (extras - children) children else @desc.children end.each do |desc| item = @desc.pst.pst_parse_item(desc) item.parent = self yield item end end def path parents, item = [], self parents.unshift item while item = item.parent # remove root parents.shift parents.map { |item| item.props.display_name or raise 'unable to construct path' } * '/' end def children to_enum(:each_child).to_a end # these are still around because they do different stuff # Top of Personal Folder Record def ipm_subtree_entryid @ipm_subtree_entryid ||= EntryID.new(props.ipm_subtree_entryid.read).id rescue nil end # Deleted Items Folder Record def ipm_wastebasket_entryid @ipm_wastebasket_entryid ||= EntryID.new(props.ipm_wastebasket_entryid.read).id rescue nil end # Search Root Record def finder_entryid @finder_entryid ||= EntryID.new(props.finder_entryid.read).id rescue nil end # all these have been replaced with the method_missing below =begin # States which folders are valid for this message store #def valid_folder_mask # props[0x35df] #end # Number of emails stored in a folder def content_count props[0x3602] end # Has children def subfolders props[0x360a] end =end # i think i will change these, so they can inherit the lazyness from RawPropertyStoreTable. # so if you want the last attachment, you can get it without creating the others perhaps. # it just has to handle the no table at all case a bit more gracefully. 
def attachments @attachments ||= AttachmentTable.new(@desc).to_a.map { |list| Attachment.new list } end def recipients #[] @recipients ||= RecipientTable.new(@desc).to_a.map { |list| Recipient.new list } end def each_recursive(&block) #p :self => self children.each do |child| #p :child => child block[child] child.each_recursive(&block) end end def inspect attrs = %w[display_name subject sender_name subfolders] # attrs = %w[display_name valid_folder_mask ipm_wastebasket_entryid finder_entryid content_count subfolders] str = attrs.map { |a| b = props.send a; " #{a}=#{b.inspect}" if b }.compact * ',' type_s = type == :message ? 'Message' : type == :folder ? 'Folder' : type.to_s.capitalize + 'Folder' str2 = 'desc_id=0x%x' % @desc.desc_id !str.empty? ? "#" : "#" #\n" + props.transport_message_headers + ">" end end # corresponds to # * _pst_parse_item def pst_parse_item desc Item.new desc, RawPropertyStore.new(desc).to_a end # # other random code # ---------------------------------------------------------------------------- # def dump_debug_info puts "* pst header" p header =begin Looking at the output of this, for blank-o1997.pst, i see this part: ... - (26624,516) desc block data (overlap of 4 bytes) - (27136,516) desc block data (gap of 508 bytes) - (28160,516) desc block data (gap of 2620 bytes) ... which confirms my belief that the block size for idx and desc is more likely 512 =end if 0 + 0 == 0 puts '* file range usage' file_ranges = # these 3 things, should account for most of the data in the file. 
[[0, Header::SIZE, 'pst file header']] + @idx_offsets.map { |offset| [offset, Index::BLOCK_SIZE, 'idx block data'] } + @desc_offsets.map { |offset| [offset, Desc::BLOCK_SIZE, 'desc block data'] } + @idx.map { |idx| [idx.offset, idx.size, 'idx id=0x%x (%s)' % [idx.id, idx.type]] } (file_ranges.sort_by { |idx| idx.first } + [nil]).to_enum(:each_cons, 2).each do |(offset, size, name), next_record| # i think there is a padding of the size out to 64 bytes # which is equivalent to padding out the final offset, because i think the offset is # similarly oriented pad_amount = 64 warn 'i am wrong about the offset padding' if offset % pad_amount != 0 # so, assuming i'm not wrong about that, then we can calculate how much padding is needed. pad = pad_amount - (size % pad_amount) pad = 0 if pad == pad_amount gap = next_record ? next_record.first - (offset + size + pad) : 0 extra = case gap <=> 0 when -1; ["overlap of #{gap.abs} bytes)"] when 0; [] when +1; ["gap of #{gap} bytes"] end # how about we check that padding @io.pos = offset + size pad_bytes = @io.read(pad) extra += ["padding not all zero"] unless pad_bytes == 0.chr * pad puts "- #{offset}:#{size}+#{pad} #{name.inspect}" + (extra.empty? ? '' : ' [' + extra * ', ' + ']') end end # i think the idea of the idx, and indeed the idx2, is just to be able to # refer to data indirectly, which means it can get moved around, and you just update # the idx table. it is simply a list of file offsets and sizes. # not sure i get how id2 plays into it though.... # the sizes seem to be all even. is that a co-incidence? and the ids are all even. that # seems to be related to something else (see the (id & 2) == 1 stuff) puts '* idx entries' @idx.each { |idx| puts "- #{idx.inspect}" } # if you look at the desc tree, you notice a few things: # 1. there is a desc that seems to be the parent of all the folders, messages etc. # it is the one whose parent is itself. 
# one of its children is referenced as the subtree_entryid of the first desc item, # the root. # 2. typically only 2 types of desc records have idx2_id != 0. messages themselves, # and the desc with id = 0x61 - the xattrib container. everything else uses the # regular ids to find its data. i think it should be reframed as small blocks and # big blocks, but i'll look into it more. # # idx_id and idx2_id are for getting to the data. desc_id and parent_desc_id just define # the parent <-> child relationship, and the desc_ids are how the items are referred to in # entryids. # note that these aren't unique! eg for 0, 4 etc. i expect these'd never change, as the ids # are stored in entryids. whereas the idx and idx2 could be a bit more volatile. puts '* desc tree' # make a dummy root hold everything just for convenience root = Desc.new '' def root.inspect; "#"; end root.children.replace @orphans # this still loads the whole thing as a string for gsub. should use directo output io # version. puts root.to_tree.gsub(/, (parent_desc_id|idx2_id)=0x0(?!\d)/, '') # this is fairly easy to understand, its just an attempt to display the pst items in a tree form # which resembles what you'd see in outlook. puts '* item tree' # now streams directly root_item.to_tree STDOUT end def root_desc @desc.first end def root_item item = pst_parse_item root_desc item.type = :root item end def root root_item end # depth first search of all items include Enumerable def each(&block) root = self.root block[root] root.each_recursive(&block) end def name @name ||= root_item.props.display_name end def inspect "#" end end end