diff options
Diffstat (limited to 'vendor/ruby-msg/lib/mapi/pst.rb')
-rw-r--r-- | vendor/ruby-msg/lib/mapi/pst.rb | 1806 |
1 files changed, 1806 insertions, 0 deletions
diff --git a/vendor/ruby-msg/lib/mapi/pst.rb b/vendor/ruby-msg/lib/mapi/pst.rb new file mode 100644 index 000000000..9ac64b097 --- /dev/null +++ b/vendor/ruby-msg/lib/mapi/pst.rb @@ -0,0 +1,1806 @@ +# +# = Introduction +# +# This file is mostly an attempt to port libpst to ruby, and simplify it in the process. It +# will leverage much of the existing MAPI => MIME conversion developed for Msg files, and as +# such is purely concerned with the file structure details. +# +# = TODO +# +# 1. solve recipient table problem (test4). +# this is done. turns out it was due to id2 clashes. find better solution +# 2. check parse consistency. an initial conversion of a 30M file to pst, shows +# a number of messages conveting badly. compare with libpst too. +# 3. xattribs +# 4. generalise the Mapi stuff better +# 5. refactor index load +# 6. msg serialization? +# + +=begin + +quick plan for cleanup. + +have working tests for 97 and 03 file formats, so safe. + +want to fix up: + +64 bit unpacks scattered around. its ugly. not sure how best to handle it, but am slightly tempted +to override String#unpack to support a 64 bit little endian unpack (like L vs N/V, for Q). one way or +another need to fix it. Could really slow everything else down if its parsing the unpack strings twice, +once in ruby, for every single unpack i do :/ + +the index loading process, and the lack of shared code between normal vs 64 bit variants, and Index vs Desc. +should be able to reduce code by factor of 4. also think I should move load code into the class too. then +maybe have something like: + +class Header + def index_class + version_2003 ? Index64 : Index + end +end + +def load_idx + header.index_class.load_index +end + +OR + +def initialize + @header = ... + extend @header.index_class::Load + load_idx +end + +need to think about the role of the mapi code, and Pst::Item etc, but that layer can come later. 
+ +=end + +require 'mapi' +require 'enumerator' +require 'ostruct' +require 'ole/ranges_io' + +module Mapi +class Pst + class FormatError < StandardError + end + + # unfortunately there is no Q analogue which is little endian only. + # this translates T as an unsigned quad word, little endian byte order, to + # not pollute the rest of the code. + # + # didn't want to override String#unpack, cause its too hacky, and incomplete. + def self.unpack str, unpack_spec + return str.unpack(unpack_spec) unless unpack_spec['T'] + @unpack_cache ||= {} + t_offsets, new_spec = @unpack_cache[unpack_spec] + unless t_offsets + t_offsets = [] + offset = 0 + new_spec = '' + unpack_spec.scan(/([^\d])_?(\*|\d+)?/o) do + num_elems = $1.downcase == 'a' ? 1 : ($2 || 1).to_i + if $1 == 'T' + num_elems.times { |i| t_offsets << offset + i } + new_spec << "V#{num_elems * 2}" + else + new_spec << $~[0] + end + offset += num_elems + end + @unpack_cache[unpack_spec] = [t_offsets, new_spec] + end + a = str.unpack(new_spec) + t_offsets.each do |offset| + low, high = a[offset, 2] + a[offset, 2] = low && high ? low + (high << 32) : nil + end + a + end + + # + # this is the header and encryption encapsulation code + # ---------------------------------------------------------------------------- + # + + # class which encapsulates the pst header + class Header + SIZE = 512 + MAGIC = 0x2142444e + + # these are the constants defined in libpst.c, that + # are referenced in pst_open() + INDEX_TYPE_OFFSET = 0x0A + FILE_SIZE_POINTER = 0xA8 + FILE_SIZE_POINTER_64 = 0xB8 + SECOND_POINTER = 0xBC + INDEX_POINTER = 0xC4 + SECOND_POINTER_64 = 0xE0 + INDEX_POINTER_64 = 0xF0 + ENC_OFFSET = 0x1CD + + attr_reader :magic, :index_type, :encrypt_type, :size + attr_reader :index1_count, :index1, :index2_count, :index2 + attr_reader :version + def initialize data + @magic = data.unpack('N')[0] + @index_type = data[INDEX_TYPE_OFFSET] + @version = {0x0e => 1997, 0x17 => 2003}[@index_type] + + if version_2003? + # don't know? 
+ # >> data1.unpack('V*').zip(data2.unpack('V*')).enum_with_index.select { |(c, d), i| c != d and not [46, 56, 60].include?(i) }.select { |(a, b), i| b == 0 }.map { |(a, b), i| [a / 256, i] } + # [8, 76], [32768, 84], [128, 89] + # >> data1.unpack('C*').zip(data2.unpack('C*')).enum_with_index.select { |(c, d), i| c != d and not [184..187, 224..227, 240..243].any? { |r| r === i } }.select { |(a, b), i| b == 0 and ((Math.log(a) / Math.log(2)) % 1) < 0.0001 } + # [[[2, 0], 61], [[2, 0], 76], [[2, 0], 195], [[2, 0], 257], [[8, 0], 305], [[128, 0], 338], [[128, 0], 357]] + # i have only 2 psts to base this guess on, so i can't really come up with anything that looks reasonable yet. not sure what the offset is. unfortunately there is so much in the header + # that isn't understood... + @encrypt_type = 1 + + @index2_count, @index2 = data[SECOND_POINTER_64 - 4, 8].unpack('V2') + @index1_count, @index1 = data[INDEX_POINTER_64 - 4, 8].unpack('V2') + + @size = data[FILE_SIZE_POINTER_64, 4].unpack('V')[0] + else + @encrypt_type = data[ENC_OFFSET] + + @index2_count, @index2 = data[SECOND_POINTER - 4, 8].unpack('V2') + @index1_count, @index1 = data[INDEX_POINTER - 4, 8].unpack('V2') + + @size = data[FILE_SIZE_POINTER, 4].unpack('V')[0] + end + + validate! + end + + def version_2003? + version == 2003 + end + + def encrypted? + encrypt_type != 0 + end + + def validate! + raise FormatError, "bad signature on pst file (#{'0x%x' % magic})" unless magic == MAGIC + raise FormatError, "only index types 0x0e and 0x17 are handled (#{'0x%x' % index_type})" unless [0x0e, 0x17].include?(index_type) + raise FormatError, "only encrytion types 0 and 1 are handled (#{encrypt_type.inspect})" unless [0, 1].include?(encrypt_type) + end + end + + # compressible encryption! :D + # + # simple substitution. see libpst.c + # maybe test switch to using a String#tr! 
+ class CompressibleEncryption + DECRYPT_TABLE = [ + 0x47, 0xf1, 0xb4, 0xe6, 0x0b, 0x6a, 0x72, 0x48, + 0x85, 0x4e, 0x9e, 0xeb, 0xe2, 0xf8, 0x94, 0x53, # 0x0f + 0xe0, 0xbb, 0xa0, 0x02, 0xe8, 0x5a, 0x09, 0xab, + 0xdb, 0xe3, 0xba, 0xc6, 0x7c, 0xc3, 0x10, 0xdd, # 0x1f + 0x39, 0x05, 0x96, 0x30, 0xf5, 0x37, 0x60, 0x82, + 0x8c, 0xc9, 0x13, 0x4a, 0x6b, 0x1d, 0xf3, 0xfb, # 0x2f + 0x8f, 0x26, 0x97, 0xca, 0x91, 0x17, 0x01, 0xc4, + 0x32, 0x2d, 0x6e, 0x31, 0x95, 0xff, 0xd9, 0x23, # 0x3f + 0xd1, 0x00, 0x5e, 0x79, 0xdc, 0x44, 0x3b, 0x1a, + 0x28, 0xc5, 0x61, 0x57, 0x20, 0x90, 0x3d, 0x83, # 0x4f + 0xb9, 0x43, 0xbe, 0x67, 0xd2, 0x46, 0x42, 0x76, + 0xc0, 0x6d, 0x5b, 0x7e, 0xb2, 0x0f, 0x16, 0x29, # 0x5f + 0x3c, 0xa9, 0x03, 0x54, 0x0d, 0xda, 0x5d, 0xdf, + 0xf6, 0xb7, 0xc7, 0x62, 0xcd, 0x8d, 0x06, 0xd3, # 0x6f + 0x69, 0x5c, 0x86, 0xd6, 0x14, 0xf7, 0xa5, 0x66, + 0x75, 0xac, 0xb1, 0xe9, 0x45, 0x21, 0x70, 0x0c, # 0x7f + 0x87, 0x9f, 0x74, 0xa4, 0x22, 0x4c, 0x6f, 0xbf, + 0x1f, 0x56, 0xaa, 0x2e, 0xb3, 0x78, 0x33, 0x50, # 0x8f + 0xb0, 0xa3, 0x92, 0xbc, 0xcf, 0x19, 0x1c, 0xa7, + 0x63, 0xcb, 0x1e, 0x4d, 0x3e, 0x4b, 0x1b, 0x9b, # 0x9f + 0x4f, 0xe7, 0xf0, 0xee, 0xad, 0x3a, 0xb5, 0x59, + 0x04, 0xea, 0x40, 0x55, 0x25, 0x51, 0xe5, 0x7a, # 0xaf + 0x89, 0x38, 0x68, 0x52, 0x7b, 0xfc, 0x27, 0xae, + 0xd7, 0xbd, 0xfa, 0x07, 0xf4, 0xcc, 0x8e, 0x5f, # 0xbf + 0xef, 0x35, 0x9c, 0x84, 0x2b, 0x15, 0xd5, 0x77, + 0x34, 0x49, 0xb6, 0x12, 0x0a, 0x7f, 0x71, 0x88, # 0xcf + 0xfd, 0x9d, 0x18, 0x41, 0x7d, 0x93, 0xd8, 0x58, + 0x2c, 0xce, 0xfe, 0x24, 0xaf, 0xde, 0xb8, 0x36, # 0xdf + 0xc8, 0xa1, 0x80, 0xa6, 0x99, 0x98, 0xa8, 0x2f, + 0x0e, 0x81, 0x65, 0x73, 0xe4, 0xc2, 0xa2, 0x8a, # 0xef + 0xd4, 0xe1, 0x11, 0xd0, 0x08, 0x8b, 0x2a, 0xf2, + 0xed, 0x9a, 0x64, 0x3f, 0xc1, 0x6c, 0xf9, 0xec # 0xff + ] + + ENCRYPT_TABLE = [nil] * 256 + DECRYPT_TABLE.each_with_index { |i, j| ENCRYPT_TABLE[i] = j } + + def self.decrypt_alt encrypted + decrypted = '' + encrypted.length.times { |i| decrypted << DECRYPT_TABLE[encrypted[i]] } + decrypted 
+ end + + def self.encrypt_alt decrypted + encrypted = '' + decrypted.length.times { |i| encrypted << ENCRYPT_TABLE[decrypted[i]] } + encrypted + end + + # an alternate implementation that is possibly faster.... + # TODO - bench + DECRYPT_STR, ENCRYPT_STR = [DECRYPT_TABLE, (0...256)].map do |values| + values.map { |i| i.chr }.join.gsub(/([\^\-\\])/, "\\\\\\1") + end + + def self.decrypt encrypted + encrypted.tr ENCRYPT_STR, DECRYPT_STR + end + + def self.encrypt decrypted + decrypted.tr DECRYPT_STR, ENCRYPT_STR + end + end + + class RangesIOEncryptable < RangesIO + def initialize io, mode='r', params={} + mode, params = 'r', mode if Hash === mode + @decrypt = !!params[:decrypt] + super + end + + def encrypted? + @decrypt + end + + def read limit=nil + buf = super + buf = CompressibleEncryption.decrypt(buf) if encrypted? + buf + end + end + + attr_reader :io, :header, :idx, :desc, :special_folder_ids + + # corresponds to + # * pst_open + # * pst_load_index + def initialize io + @io = io + io.pos = 0 + @header = Header.new io.read(Header::SIZE) + + # would prefer this to be in Header#validate, but it doesn't have the io size. + # should perhaps downgrade this to just be a warning... + raise FormatError, "header size field invalid (#{header.size} != #{io.size}}" unless header.size == io.size + + load_idx + load_desc + load_xattrib + + @special_folder_ids = {} + end + + def encrypted? + @header.encrypted? + end + + # until i properly fix logging... + def warn s + Mapi::Log.warn s + end + + # + # this is the index and desc record loading code + # ---------------------------------------------------------------------------- + # + + ToTree = Module.new + + module Index2 + BLOCK_SIZE = 512 + module RecursiveLoad + def load_chain + #... + end + end + + module Base + def read + #... + end + end + + class Version1997 < Struct.new(:a)#...) + SIZE = 12 + + include RecursiveLoad + include Base + end + + class Version2003 < Struct.new(:a)#...) 
+ SIZE = 24 + + include RecursiveLoad + include Base + end + end + + module Desc2 + module Base + def desc + #... + end + end + + class Version1997 < Struct.new(:a)#...) + #include Index::RecursiveLoad + include Base + end + + class Version2003 < Struct.new(:a)#...) + #include Index::RecursiveLoad + include Base + end + end + + # more constants from libpst.c + # these relate to the index block + ITEM_COUNT_OFFSET = 0x1f0 # count byte + LEVEL_INDICATOR_OFFSET = 0x1f3 # node or leaf + BACKLINK_OFFSET = 0x1f8 # backlink u1 value + + # these 3 classes are used to hold various file records + + # pst_index + class Index < Struct.new(:id, :offset, :size, :u1) + UNPACK_STR = 'VVvv' + SIZE = 12 + BLOCK_SIZE = 512 # index blocks was 516 but bogus + COUNT_MAX = 41 # max active items (ITEM_COUNT_OFFSET / Index::SIZE = 41) + + attr_accessor :pst + def initialize data + data = Pst.unpack data, UNPACK_STR if String === data + super(*data) + end + + def type + @type ||= begin + if id & 0x2 == 0 + :data + else + first_byte, second_byte = read.unpack('CC') + if first_byte == 1 + raise second_byte unless second_byte == 1 + :data_chain_header + elsif first_byte == 2 + raise second_byte unless second_byte == 0 + :id2_assoc + else + raise FormatError, 'unknown first byte for block - %p' % first_byte + end + end + end + end + + def data? + (id & 0x2) == 0 + end + + def read decrypt=true + # only data blocks are every encrypted + decrypt = false unless data? + pst.pst_read_block_size offset, size, decrypt + end + + # show all numbers in hex + def inspect + super.gsub(/=(\d+)/) { '=0x%x' % $1.to_i }.sub(/Index /, "Index type=#{type.inspect}, ") + end + end + + # mostly guesses. + ITEM_COUNT_OFFSET_64 = 0x1e8 + LEVEL_INDICATOR_OFFSET_64 = 0x1eb # diff of 3 between these 2 as above... + + # will maybe inherit from Index64, in order to get the same #type function. + class Index64 < Index + UNPACK_STR = 'TTvvV' + SIZE = 24 + BLOCK_SIZE = 512 + COUNT_MAX = 20 # bit of a guess really. 
512 / 24 = 21, but doesn't leave enough header room + + # this is the extra item on the end of the UNPACK_STR above + attr_accessor :u2 + + def initialize data + data = Pst.unpack data, UNPACK_STR if String === data + @u2 = data.pop + super data + end + + def inspect + super.sub(/>$/, ', u2=%p>' % u2) + end + + def self.load_chain io, header + load_idx_rec io, header.index1, 0, 0 + end + + # almost identical to load code for Index, just different offsets and unpack strings. + # can probably merge them, or write a generic load_tree function or something. + def self.load_idx_rec io, offset, linku1, start_val + io.seek offset + buf = io.read BLOCK_SIZE + idxs = [] + + item_count = buf[ITEM_COUNT_OFFSET_64] + raise "have too many active items in index (#{item_count})" if item_count > COUNT_MAX + + #idx = Index.new buf[BACKLINK_OFFSET, Index::SIZE] + #raise 'blah 1' unless idx.id == linku1 + + if buf[LEVEL_INDICATOR_OFFSET_64] == 0 + # leaf pointers + # split the data into item_count index objects + buf[0, SIZE * item_count].scan(/.{#{SIZE}}/mo).each_with_index do |data, i| + idx = new data + # first entry + raise 'blah 3' if i == 0 and start_val != 0 and idx.id != start_val + #idx.pst = self + break if idx.id == 0 + idxs << idx + end + else + # node pointers + # split the data into item_count table pointers + buf[0, SIZE * item_count].scan(/.{#{SIZE}}/mo).each_with_index do |data, i| + start, u1, offset = Pst.unpack data, 'T3' + # for the first value, we expect the start to be equal + raise 'blah 3' if i == 0 and start_val != 0 and start != start_val + break if start == 0 + idxs += load_idx_rec io, offset, u1, start + end + end + + idxs + end + end + + # pst_desc + class Desc64 < Struct.new(:desc_id, :idx_id, :idx2_id, :parent_desc_id, :u2) + UNPACK_STR = 'T3VV' + SIZE = 32 + BLOCK_SIZE = 512 # descriptor blocks was 520 but bogus + COUNT_MAX = 15 # guess as per Index64 + + include RecursivelyEnumerable + + attr_accessor :pst + attr_reader :children + def initialize 
data + super(*Pst.unpack(data, UNPACK_STR)) + @children = [] + end + + def desc + pst.idx_from_id idx_id + end + + def list_index + pst.idx_from_id idx2_id + end + + def self.load_chain io, header + load_desc_rec io, header.index2, 0, 0x21 + end + + def self.load_desc_rec io, offset, linku1, start_val + io.seek offset + buf = io.read BLOCK_SIZE + descs = [] + item_count = buf[ITEM_COUNT_OFFSET_64] + + # not real desc + #desc = Desc.new buf[BACKLINK_OFFSET, 4] + #raise 'blah 1' unless desc.desc_id == linku1 + + if buf[LEVEL_INDICATOR_OFFSET_64] == 0 + # leaf pointers + raise "have too many active items in index (#{item_count})" if item_count > COUNT_MAX + # split the data into item_count desc objects + buf[0, SIZE * item_count].scan(/.{#{SIZE}}/mo).each_with_index do |data, i| + desc = new data + # first entry + raise 'blah 3' if i == 0 and start_val != 0 and desc.desc_id != start_val + break if desc.desc_id == 0 + descs << desc + end + else + # node pointers + raise "have too many active items in index (#{item_count})" if item_count > Index64::COUNT_MAX + # split the data into item_count table pointers + buf[0, Index64::SIZE * item_count].scan(/.{#{Index64::SIZE}}/mo).each_with_index do |data, i| + start, u1, offset = Pst.unpack data, 'T3' + # for the first value, we expect the start to be equal note that ids -1, so even for the + # first we expect it to be equal. thats the 0x21 (dec 33) desc record. this means we assert + # that the first desc record is always 33... + # thats because 0x21 is the pst root itself... 
+ raise 'blah 3' if i == 0 and start_val != -1 and start != start_val + # this shouldn't really happen i'd imagine + break if start == 0 + descs += load_desc_rec io, offset, u1, start + end + end + + descs + end + + def each_child(&block) + @children.each(&block) + end + end + + # _pst_table_ptr_struct + class TablePtr < Struct.new(:start, :u1, :offset) + UNPACK_STR = 'V3' + SIZE = 12 + + def initialize data + data = data.unpack(UNPACK_STR) if String === data + super(*data) + end + end + + # pst_desc + # idx_id is a pointer to an idx record which gets the primary data stream for the Desc record. + # idx2_id gets you an idx record, that when read gives you an ID2 association list, which just maps + # another set of ids to index values + class Desc < Struct.new(:desc_id, :idx_id, :idx2_id, :parent_desc_id) + UNPACK_STR = 'V4' + SIZE = 16 + BLOCK_SIZE = 512 # descriptor blocks was 520 but bogus + COUNT_MAX = 31 # max active desc records (ITEM_COUNT_OFFSET / Desc::SIZE = 31) + + include ToTree + + attr_accessor :pst + attr_reader :children + def initialize data + super(*data.unpack(UNPACK_STR)) + @children = [] + end + + def desc + pst.idx_from_id idx_id + end + + def list_index + pst.idx_from_id idx2_id + end + + # show all numbers in hex + def inspect + super.gsub(/=(\d+)/) { '=0x%x' % $1.to_i } + end + end + + # corresponds to + # * _pst_build_id_ptr + def load_idx + @idx = [] + @idx_offsets = [] + if header.version_2003? + @idx = Index64.load_chain io, header + @idx.each { |idx| idx.pst = self } + else + load_idx_rec header.index1, header.index1_count, 0 + end + + # we'll typically be accessing by id, so create a hash as a lookup cache + @idx_from_id = {} + @idx.each do |idx| + warn "there are duplicate idx records with id #{idx.id}" if @idx_from_id[idx.id] + @idx_from_id[idx.id] = idx + end + end + + # load the flat idx table, which maps ids to file ranges. 
this is the recursive helper + # + # corresponds to + # * _pst_build_id_ptr + def load_idx_rec offset, linku1, start_val + @idx_offsets << offset + + #_pst_read_block_size(pf, offset, BLOCK_SIZE, &buf, 0, 0) < BLOCK_SIZE) + buf = pst_read_block_size offset, Index::BLOCK_SIZE, false + + item_count = buf[ITEM_COUNT_OFFSET] + raise "have too many active items in index (#{item_count})" if item_count > Index::COUNT_MAX + + idx = Index.new buf[BACKLINK_OFFSET, Index::SIZE] + raise 'blah 1' unless idx.id == linku1 + + if buf[LEVEL_INDICATOR_OFFSET] == 0 + # leaf pointers + # split the data into item_count index objects + buf[0, Index::SIZE * item_count].scan(/.{#{Index::SIZE}}/mo).each_with_index do |data, i| + idx = Index.new data + # first entry + raise 'blah 3' if i == 0 and start_val != 0 and idx.id != start_val + idx.pst = self + # this shouldn't really happen i'd imagine + break if idx.id == 0 + @idx << idx + end + else + # node pointers + # split the data into item_count table pointers + buf[0, TablePtr::SIZE * item_count].scan(/.{#{TablePtr::SIZE}}/mo).each_with_index do |data, i| + table = TablePtr.new data + # for the first value, we expect the start to be equal + raise 'blah 3' if i == 0 and start_val != 0 and table.start != start_val + # this shouldn't really happen i'd imagine + break if table.start == 0 + load_idx_rec table.offset, table.u1, table.start + end + end + end + + # most access to idx objects will use this function + # + # corresponds to + # * _pst_getID + def idx_from_id id + @idx_from_id[id] + end + + # corresponds to + # * _pst_build_desc_ptr + # * record_descriptor + def load_desc + @desc = [] + @desc_offsets = [] + if header.version_2003? 
+ @desc = Desc64.load_chain io, header + @desc.each { |desc| desc.pst = self } + else + load_desc_rec header.index2, header.index2_count, 0x21 + end + + # first create a lookup cache + @desc_from_id = {} + @desc.each do |desc| + desc.pst = self + warn "there are duplicate desc records with id #{desc.desc_id}" if @desc_from_id[desc.desc_id] + @desc_from_id[desc.desc_id] = desc + end + + # now turn the flat list of loaded desc records into a tree + + # well, they have no parent, so they're more like, the toplevel descs. + @orphans = [] + # now assign each node to the parents child array, putting the orphans in the above + @desc.each do |desc| + parent = @desc_from_id[desc.parent_desc_id] + # note, besides this, its possible to create other circular structures. + if parent == desc + # this actually happens usually, for the root_item it appears. + #warn "desc record's parent is itself (#{desc.inspect})" + # maybe add some more checks in here for circular structures + elsif parent + parent.children << desc + next + end + @orphans << desc + end + + # maybe change this to some sort of sane-ness check. orphans are expected +# warn "have #{@orphans.length} orphan desc record(s)." unless @orphans.empty? 
+ end + + # load the flat list of desc records recursively + # + # corresponds to + # * _pst_build_desc_ptr + # * record_descriptor + def load_desc_rec offset, linku1, start_val + @desc_offsets << offset + + buf = pst_read_block_size offset, Desc::BLOCK_SIZE, false + item_count = buf[ITEM_COUNT_OFFSET] + + # not real desc + desc = Desc.new buf[BACKLINK_OFFSET, 4] + raise 'blah 1' unless desc.desc_id == linku1 + + if buf[LEVEL_INDICATOR_OFFSET] == 0 + # leaf pointers + raise "have too many active items in index (#{item_count})" if item_count > Desc::COUNT_MAX + # split the data into item_count desc objects + buf[0, Desc::SIZE * item_count].scan(/.{#{Desc::SIZE}}/mo).each_with_index do |data, i| + desc = Desc.new data + # first entry + raise 'blah 3' if i == 0 and start_val != 0 and desc.desc_id != start_val + # this shouldn't really happen i'd imagine + break if desc.desc_id == 0 + @desc << desc + end + else + # node pointers + raise "have too many active items in index (#{item_count})" if item_count > Index::COUNT_MAX + # split the data into item_count table pointers + buf[0, TablePtr::SIZE * item_count].scan(/.{#{TablePtr::SIZE}}/mo).each_with_index do |data, i| + table = TablePtr.new data + # for the first value, we expect the start to be equal note that ids -1, so even for the + # first we expect it to be equal. thats the 0x21 (dec 33) desc record. this means we assert + # that the first desc record is always 33... 
+ raise 'blah 3' if i == 0 and start_val != -1 and table.start != start_val + # this shouldn't really happen i'd imagine + break if table.start == 0 + load_desc_rec table.offset, table.u1, table.start + end + end + end + + # as for idx + # + # corresponds to: + # * _pst_getDptr + def desc_from_id id + @desc_from_id[id] + end + + # corresponds to + # * pst_load_extended_attributes + def load_xattrib + unless desc = desc_from_id(0x61) + warn "no extended attributes desc record found" + return + end + unless desc.desc + warn "no desc idx for extended attributes" + return + end + if desc.list_index + end + #warn "skipping loading xattribs" + # FIXME implement loading xattribs + end + + # corresponds to: + # * _pst_read_block_size + # * _pst_read_block ?? + # * _pst_ff_getIDblock_dec ?? + # * _pst_ff_getIDblock ?? + def pst_read_block_size offset, size, decrypt=true + io.seek offset + buf = io.read size + warn "tried to read #{size} bytes but only got #{buf.length}" if buf.length != size + encrypted? && decrypt ? 
CompressibleEncryption.decrypt(buf) : buf + end + + # + # id2 + # ---------------------------------------------------------------------------- + # + + class ID2Assoc < Struct.new(:id2, :id, :table2) + UNPACK_STR = 'V3' + SIZE = 12 + + def initialize data + data = data.unpack(UNPACK_STR) if String === data + super(*data) + end + end + + class ID2Assoc64 < Struct.new(:id2, :u1, :id, :table2) + UNPACK_STR = 'VVT2' + SIZE = 24 + + def initialize data + if String === data + data = Pst.unpack data, UNPACK_STR + end + super(*data) + end + + def self.load_chain idx + buf = idx.read + type, count = buf.unpack 'v2' + unless type == 0x0002 + raise 'unknown id2 type 0x%04x' % type + #return + end + id2 = [] + count.times do |i| + assoc = new buf[8 + SIZE * i, SIZE] + id2 << assoc + if assoc.table2 != 0 + id2 += load_chain idx.pst.idx_from_id(assoc.table2) + end + end + id2 + end + end + + class ID2Mapping + attr_reader :list + def initialize pst, list + @pst = pst + @list = list + # create a lookup. + @id_from_id2 = {} + @list.each do |id2| + # NOTE we take the last value seen value if there are duplicates. this "fixes" + # test4-o1997.pst for the time being. + warn "there are duplicate id2 records with id #{id2.id2}" if @id_from_id2[id2.id2] + next if @id_from_id2[id2.id2] + @id_from_id2[id2.id2] = id2.id + end + end + + # TODO: fix logging + def warn s + Mapi::Log.warn s + end + + # corresponds to: + # * _pst_getID2 + def [] id + #id2 = @list.find { |x| x.id2 == id } + id = @id_from_id2[id] + id and @pst.idx_from_id(id) + end + end + + def load_idx2 idx + if header.version_2003? + id2 = ID2Assoc64.load_chain idx + else + id2 = load_idx2_rec idx + end + ID2Mapping.new self, id2 + end + + # corresponds to + # * _pst_build_id2 + def load_idx2_rec idx + # i should perhaps use a idx chain style read here? 
+ buf = pst_read_block_size idx.offset, idx.size, false + type, count = buf.unpack 'v2' + unless type == 0x0002 + raise 'unknown id2 type 0x%04x' % type + #return + end + id2 = [] + count.times do |i| + assoc = ID2Assoc.new buf[4 + ID2Assoc::SIZE * i, ID2Assoc::SIZE] + id2 << assoc + if assoc.table2 != 0 + id2 += load_idx2_rec idx_from_id(assoc.table2) + end + end + id2 + end + + class RangesIOIdxChain < RangesIOEncryptable + def initialize pst, idx_head + @idxs = pst.id2_block_idx_chain idx_head + # whether or not a given idx needs encrypting + decrypts = @idxs.map do |idx| + decrypt = (idx.id & 2) != 0 ? false : pst.encrypted? + end.uniq + raise NotImplementedError, 'partial encryption in RangesIOID2' if decrypts.length > 1 + decrypt = decrypts.first + # convert idxs to ranges + ranges = @idxs.map { |idx| [idx.offset, idx.size] } + super pst.io, :ranges => ranges, :decrypt => decrypt + end + end + + class RangesIOID2 < RangesIOIdxChain + def self.new pst, id2, idx2 + RangesIOIdxChain.new pst, idx2[id2] + end + end + + # corresponds to: + # * _pst_ff_getID2block + # * _pst_ff_getID2data + # * _pst_ff_compile_ID + def id2_block_idx_chain idx + if (idx.id & 0x2) == 0 + [idx] + else + buf = idx.read + type, fdepth, count = buf[0, 4].unpack 'CCv' + unless type == 1 # libpst.c:3958 + warn 'Error in idx_chain - %p, %p, %p - attempting to ignore' % [type, fdepth, count] + return [idx] + end + # there are 4 unaccounted for bytes here, 4...8 + if header.version_2003? + ids = buf[8, count * 8].unpack("T#{count}") + else + ids = buf[8, count * 4].unpack('V*') + end + if fdepth == 1 + ids.map { |id| idx_from_id id } + else + ids.map { |id| id2_block_idx_chain idx_from_id(id) }.flatten + end + end + end + + # + # main block parsing code. gets raw properties + # ---------------------------------------------------------------------------- + # + + # the job of this class, is to take a desc record, and be able to enumerate through the + # mapi properties of the associated thing. 
+ # + # corresponds to + # * _pst_parse_block + # * _pst_process (in some ways. although perhaps thats more the Item::Properties#add_property) + class BlockParser + include Mapi::Types::Constants + + TYPES = { + 0xbcec => 1, + 0x7cec => 2, + # type 3 is removed. an artifact of not handling the indirect blocks properly in libpst. + } + + PR_SUBJECT = PropertySet::TAGS.find { |num, (name, type)| name == 'PR_SUBJECT' }.first.hex + PR_BODY_HTML = PropertySet::TAGS.find { |num, (name, type)| name == 'PR_BODY_HTML' }.first.hex + + # this stuff could maybe be moved to Ole::Types? or leverage it somehow? + # whether or not a type is immeidate is more a property of the pst encoding though i expect. + # what i probably can add is a generic concept of whether a type is of variadic length or not. + + # these lists are very incomplete. think they are largely copied from libpst + + IMMEDIATE_TYPES = [ + PT_SHORT, PT_LONG, PT_BOOLEAN + ] + + INDIRECT_TYPES = [ + PT_DOUBLE, PT_OBJECT, + 0x0014, # whats this? probably something like PT_LONGLONG, given the correspondence with the + # ole variant types. (= VT_I8) + PT_STRING8, PT_UNICODE, # unicode isn't in libpst, but added here for outlook 2003 down the track + PT_SYSTIME, + 0x0048, # another unknown + 0x0102, # this is PT_BINARY vs PT_CLSID + #0x1003, # these are vector types, but they're commented out for now because i'd expect that + #0x1014, # there's extra decoding needed that i'm not doing. (probably just need a simple + # # PT_* => unpack string mapping for the immediate types, and just do unpack('V*') etc + #0x101e, + #0x1102 + ] + + # the attachment and recipient arrays appear to be always stored with these fixed + # id2 values. seems strange. are there other extra streams? can find out by making higher + # level IO wrapper, which has the id2 value, and doing the diff of available id2 values versus + # used id2 values in properties of an item. 
+ ID2_ATTACHMENTS = 0x671 + ID2_RECIPIENTS = 0x692 + + attr_reader :desc, :data, :data_chunks, :offset_tables + def initialize desc + raise FormatError, "unable to get associated index record for #{desc.inspect}" unless desc.desc + @desc = desc + #@data = desc.desc.read + if Pst::Index === desc.desc + #@data = RangesIOIdxChain.new(desc.pst, desc.desc).read + idxs = desc.pst.id2_block_idx_chain desc.desc + # this gets me the plain index chain. + else + # fake desc + #@data = desc.desc.read + idxs = [desc.desc] + end + + @data_chunks = idxs.map { |idx| idx.read } + @data = @data_chunks.first + + load_header + + @index_offsets = [@index_offset] + @data_chunks[1..-1].map { |chunk| chunk.unpack('v')[0] } + @offset_tables = [] + @ignored = [] + @data_chunks.zip(@index_offsets).each do |chunk, offset| + ignore = chunk[offset, 2].unpack('v')[0] + @ignored << ignore +# p ignore + @offset_tables.push offset_table = [] + # maybe its ok if there aren't to be any values ? + raise FormatError if offset == 0 + offsets = chunk[offset + 2..-1].unpack('v*') + #p offsets + offsets[0, ignore + 2].each_cons 2 do |from, to| + #next if to == 0 + raise FormatError, [from, to].inspect if from > to + offset_table << [from, to] + end + end + + @offset_table = @offset_tables.first + @idxs = idxs + + # now, we may have multiple different blocks + end + + # a given desc record may or may not have associated idx2 data. we lazily load it here, so it will never + # actually be requested unless get_data_indirect actually needs to use it. 
+ def idx2 + return @idx2 if @idx2 + raise FormatError, 'idx2 requested but no idx2 available' unless desc.list_index + # should check this can't return nil + @idx2 = desc.pst.load_idx2 desc.list_index + end + + def load_header + @index_offset, type, @offset1 = data.unpack 'vvV' + raise FormatError, 'unknown block type signature 0x%04x' % type unless TYPES[type] + @type = TYPES[type] + end + + # based on the value of offset, return either some data from buf, or some data from the + # id2 chain id2, where offset is some key into a lookup table that is stored as the id2 + # chain. i think i may need to create a BlockParser class that wraps up all this mess. + # + # corresponds to: + # * _pst_getBlockOffsetPointer + # * _pst_getBlockOffset + def get_data_indirect offset + return get_data_indirect_io(offset).read + + if offset == 0 + nil + elsif (offset & 0xf) == 0xf + RangesIOID2.new(desc.pst, offset, idx2).read + else + low, high = offset & 0xf, offset >> 4 + raise FormatError if low != 0 or (high & 0x1) != 0 or (high / 2) > @offset_table.length + from, to = @offset_table[high / 2] + data[from...to] + end + end + + def get_data_indirect_io offset + if offset == 0 + nil + elsif (offset & 0xf) == 0xf + if idx2[offset] + RangesIOID2.new desc.pst, offset, idx2 + else + warn "tried to get idx2 record for #{offset} but failed" + return StringIO.new('') + end + else + low, high = offset & 0xf, offset >> 4 + if low != 0 or (high & 0x1) != 0 +# raise FormatError, + warn "bad - #{low} #{high} (1)" + return StringIO.new('') + end + # lets see which block it should come from. 
+ block_idx, i = high.divmod 4096 + unless block_idx < @data_chunks.length + warn "bad - block_idx to high (not #{block_idx} < #{@data_chunks.length})" + return StringIO.new('') + end + data_chunk, offset_table = @data_chunks[block_idx], @offset_tables[block_idx] + if i / 2 >= offset_table.length + warn "bad - #{low} #{high} - #{i / 2} >= #{offset_table.length} (2)" + return StringIO.new('') + end + #warn "ok - #{low} #{high} #{offset_table.length}" + from, to = offset_table[i / 2] + StringIO.new data_chunk[from...to] + end + end + + def handle_indirect_values key, type, value + case type + when PT_BOOLEAN + value = value != 0 + when *IMMEDIATE_TYPES # not including PT_BOOLEAN which we just did above + # no processing current applied (needed?). + when *INDIRECT_TYPES + # the value is a pointer + if String === value # ie, value size > 4 above + value = StringIO.new value + else + value = get_data_indirect_io(value) + end + # keep strings as immediate values for now, for compatability with how i set up + # Msg::Properties::ENCODINGS + if value + if type == PT_STRING8 + value = value.read + elsif type == PT_UNICODE + value = Ole::Types::FROM_UTF16.iconv value.read + end + end + # special subject handling + if key == PR_BODY_HTML and value + # to keep the msg code happy, which thinks body_html will be an io + # although, in 2003 version, they are 0102 already + value = StringIO.new value unless value.respond_to?(:read) + end + if key == PR_SUBJECT and value + ignore, offset = value.unpack 'C2' + offset = (offset == 1 ? nil : offset - 3) + value = value[2..-1] +=begin + index = value =~ /^[A-Z]*:/ ? $~[0].length - 1 : nil + unless ignore == 1 and offset == index + warn 'something wrong with subject hack' + $x = [ignore, offset, value] + require 'irb' + IRB.start + exit + end +=end +=begin +new idea: + +making sense of the \001\00[156] i've seen prefixing subject. i think its to do with the placement +of the ':', or the ' '. 
And perhaps an optimization to do with thread topic, and ignoring the prefixes +added by mailers. thread topic is equal to subject with all that crap removed. + +can test by creating some mails with bizarre subjects. + +subject="\001\005RE: blah blah" +subject="\001\001blah blah" +subject="\001\032Out of Office AutoReply: blah blah" +subject="\001\020Undeliverable: blah blah" + +looks like it + +=end + + # now what i think, is that perhaps, value[offset..-1] ... + # or something like that should be stored as a special tag. ie, do a double yield + # for this case. probably PR_CONVERSATION_TOPIC, in which case i'd write instead: + # yield [PR_SUBJECT, ref_type, value] + # yield [PR_CONVERSATION_TOPIC, ref_type, value[offset..-1] + # next # to skip the yield. + end + + # special handling for embedded objects + # used for attach_data for attached messages. in which case attach_method should == 5, + # for embedded object. + if type == PT_OBJECT and value + value = value.read if value.respond_to?(:read) + id2, unknown = value.unpack 'V2' + io = RangesIOID2.new desc.pst, id2, idx2 + + # hacky + desc2 = OpenStruct.new(:desc => io, :pst => desc.pst, :list_index => desc.list_index, :children => []) + # put nil instead of desc.list_index, otherwise the attachment is attached to itself ad infinitum. + # should try and fix that FIXME + # this shouldn't be done always. for an attached message, yes, but for an attached + # meta file, for example, it shouldn't. difference between embedded_ole vs embedded_msg + # really. + # note that in the case where its a embedded ole, you actually get a regular serialized ole + # object, so i need to create an ole storage object on a rangesioidxchain! + # eg: +=begin +att.props.display_name # => "Picture (Metafile)" +io = att.props.attach_data +io.read(32).unpack('H*') # => ["d0cf11e0a1b11ae100000.... note the docfile signature. 
+# plug some missing rangesio holes: +def io.rewind; seek 0; end +def io.flush; raise IOError; end +ole = Ole::Storage.open io +puts ole.root.to_tree + +- #<Dirent:"Root Entry"> + |- #<Dirent:"\001Ole" size=20 data="\001\000\000\002\000..."> + |- #<Dirent:"CONTENTS" size=65696 data="\327\315\306\232\000..."> + \- #<Dirent:"\003MailStream" size=12 data="\001\000\000\000[..."> +=end + # until properly fixed, i have disabled this code here, so this will break + # nested messages temporarily. + #value = Item.new desc2, RawPropertyStore.new(desc2).to_a + #desc2.list_index = nil + value = io + end + # this is PT_MV_STRING8, i guess. + # should probably have the 0x1000 flag, and do the or-ring. + # example of 0x1102 is PR_OUTLOOK_2003_ENTRYIDS. less sure about that one. + when 0x101e, 0x1102 + # example data: + # 0x802b "\003\000\000\000\020\000\000\000\030\000\000\000#\000\000\000BusinessCompetitionFavorites" + # this 0x802b would be an extended attribute for categories / keywords. + value = get_data_indirect_io(value).read unless String === value + num = value.unpack('V')[0] + offsets = value[4, 4 * num].unpack("V#{num}") + value = (offsets + [value.length]).to_enum(:each_cons, 2).map { |from, to| value[from...to] } + value.map! 
{ |str| StringIO.new str } if type == 0x1102 + else + name = Mapi::Types::DATA[type].first rescue nil + warn '0x%04x %p' % [key, get_data_indirect_io(value).read] + raise NotImplementedError, 'unsupported mapi property type - 0x%04x (%p)' % [type, name] + end + [key, type, value] + end + end + +=begin +* recipients: + + affects: ["0x200764", "0x2011c4", "0x201b24", "0x201b44", "0x201ba4", "0x201c24", "0x201cc4", "0x202504"] + +after adding the rawpropertystoretable fix, all except the second parse properly, and satisfy: + + item.props.display_to == item.recipients.map { |r| r.props.display_name if r.props.recipient_type == 1 }.compact * '; ' + +only the second still has a problem + +#[#<struct Pst::Desc desc_id=0x2011c4, idx_id=0x397c, idx2_id=0x398a, parent_desc_id=0x8082>] + +think this is related to a multi block #data3. ie, when you use @x * rec_size, and it +goes > 8190, or there abouts, then it stuffs up. probably there is header gunk, or something, +similar to when #data is multi block. + +same problem affects the attachment table in test4. + +fixed that issue. round data3 ranges to rec_size. + +fix other issue with attached objects. + +all recipients and attachments in test2 are fine. + +only remaining issue is test4 recipients of 200044. strange. + +=end + + # RawPropertyStore is used to iterate through the properties of an item, or the auxiliary + # data for an attachment. its just a parser for the way the properties are serialized, when the + # properties don't have to conform to a column structure. + # + # structure of this chunk of data is often + # header, property keys, data values, and then indexes. + # the property keys has value in it. value can be the actual value if its a short type, + # otherwise you lookup the value in the indicies, where you get the offsets to use in the + # main data body. due to the indirect thing though, any of these parts could actually come + # from a separate stream. 
# Parser for a free-form property list (block type 1). Enumerates the
# properties as [key, type, value] tuples, resolving each value through
# #handle_indirect_values.
class RawPropertyStore < BlockParser
  include Enumerable

  # number of property tuples in the store
  attr_reader :length

  # desc - the descriptor to parse; handed to the superclass constructor.
  #
  # Raises FormatError when the block is not of type 1, when its header is
  # missing or shorter than 8 bytes, or when the header signature is not
  # 0x000602b5.
  def initialize desc
    super
    raise FormatError, "expected type 1 - got #{@type}" unless @type == 1

    # the way that offset works, data1 may be a subset of buf, or something from id2. if its from buf,
    # it will be offset based on index_offset and offset. so it could be some random chunk of data anywhere
    # in the thing.
    # a null reference gives no io at all; treat that like a truncated header.
    header_io = get_data_indirect_io @offset1
    header_data = header_io ? header_io.read.to_s : ''
    raise FormatError, 'property store header too short' if header_data.length < 8
    signature, offset2 = header_data.unpack 'V2'
    # report the value that was actually tested (this used to print @type)
    raise FormatError, 'unhandled block signature 0x%08x' % signature if signature != 0x000602b5
    # this is actually a big chunk of tag tuples.
    # a null reference here is read as an empty store rather than a crash.
    index_io = get_data_indirect_io offset2
    @index_data = index_io ? index_io.read.to_s : ''
    @length = @index_data.length / 8
  end

  # iterate through the property tuples, yielding key, type and value for each
  # 8 byte ('vvV') record. returns an enumerator when called without a block.
  def each
    return to_enum(:each) unless block_given?
    length.times do |i|
      key, type, value = handle_indirect_values(*@index_data[8 * i, 8].unpack('vvV'))
      yield key, type, value
    end
  end
end

# RawPropertyStoreTable is kind of like a database table.
# it has a fixed set of columns.
# #[] is kind of like getting a row from the table.
# those rows are currently encapsulated by Row, which has #each like
# RawPropertyStore.
# only used for the recipients array, and the attachments array. completely lazy, doesn't
# load any of the properties upon creation.
# Parser for a fixed-column property table (block type 2). Rows are built
# lazily: #[] and #each hand out Row objects that decode their cells on demand.
class RawPropertyStoreTable < BlockParser
  # One column definition, unpacked ('v3CC') from 8 bytes of the table header.
  # ref_type is passed on as the property type and type as the property key
  # (see Row#each); ind2_off and size locate the cell inside a row.
  class Column < Struct.new(:ref_type, :type, :ind2_off, :size, :slot)
    def initialize data
      super(*data.unpack('v3CC'))
    end

    # short lowercase name of the data type, or its hex code if unknown
    def nice_type_name
      Mapi::Types::DATA[ref_type].first[/_(.*)/, 1].downcase rescue '0x%04x' % ref_type
    end

    # short lowercase name of the property, or its hex code if unknown
    def nice_prop_name
      Mapi::PropertyStore::TAGS['%04x' % type].first[/_(.*)/, 1].downcase rescue '0x%04x' % type
    end

    def inspect
      "#<#{self.class} name=#{nice_prop_name.inspect}, type=#{nice_type_name.inspect}>"
    end
  end

  include Enumerable

  attr_reader :length, :index_data, :data2, :data3, :rec_size

  # desc - the descriptor to parse; handed to the superclass constructor.
  #
  # Raises FormatError when the block is not of type 2, when either header is
  # missing or truncated, when the column count does not match the schema, or
  # when a signature check fails.
  def initialize desc
    super
    raise FormatError, "expected type 2 - got #{@type}" unless @type == 2

    # a null reference gives no io at all; treat that like a truncated header.
    header_io = get_data_indirect_io @offset1
    header_data = header_io ? header_io.read.to_s : ''
    raise FormatError, 'table header too short' if header_data.length < 22
    # seven_c_blk
    # often: u1 == u2 and u3 == u2 + 2, then rec_size == u3 + 4. wtf
    seven_c, @num_list, u1, u2, u3, @rec_size, b_five_offset,
      ind2_offset, u7, u8 = header_data[0, 22].unpack('CCv4V2v2')
    @index_data = header_data[22..-1]

    raise FormatError if @num_list != schema.length or seven_c != 0x7c
    # another check
    min_size = schema.inject(0) { |total, col| total + col.size }
    # seem to have at max, 8 padding bytes on the end of the record. not sure if it means
    # anything. maybe its just space that hasn't been reclaimed due to columns being
    # removed or something. probably should just check lower bound.
    range = (min_size..min_size + 8)
    warn "rec_size seems wrong (#{range} !=== #{rec_size})" unless range === rec_size

    header_io2 = get_data_indirect_io b_five_offset
    header_data2 = header_io2 ? header_io2.read.to_s : ''
    raise FormatError, 'table index header too short' if header_data2.length < 8
    signature, offset2 = header_data2.unpack 'V2'
    # ??? seems a bit iffy
    # there's probably more to the differences than this, and the data2 difference below
    expect = desc.pst.header.version_2003? ? 0x000404b5 : 0x000204b5
    raise FormatError, 'unhandled block signature 0x%08x' % signature if signature != expect

    # this holds all the row data
    # handle multiple block issue: round every range down to a whole number of records.
    @data3_io = get_data_indirect_io ind2_offset
    if RangesIOIdxChain === @data3_io and @rec_size != 0
      ranges = @data3_io.ranges.map { |offset, size| [offset, size / @rec_size * @rec_size] }
      # NOTE(review): @data3_idxs used to be set by an accidental chained assignment
      # and is not read anywhere in this part of the file; kept in case it is used elsewhere.
      @data3_idxs = ranges
      @data3_io.instance_variable_set :@ranges, ranges
    end
    # a null row reference means there are no rows, not an error.
    @data3 = (@data3_io && @data3_io.read) || ''

    # there must be something to the data in data2. i think data2 is the array of objects essentially.
    # currently its only used to imply a length
    # actually, at size 6, its just some auxiliary data. i'm thinking either Vv/vV, for 97, and something
    # wider for 03. the second value is just the index (0...length), and the first value is
    # some kind of offset i expect. actually, they were all id2 values, in another case.
    # so maybe they're get_data_indirect values too?
    # actually, it turned out they were identical to the PR_ATTACHMENT_ID2 values...
    # id2_values = ie, data2.unpack('v*').to_enum(:each_slice, 3).transpose[0]
    # table[i].assoc(PR_ATTACHMENT_ID2).last == id2_values[i], for all i.
    @data2 = get_data_indirect(offset2) rescue nil
    # data2.length / 6 may have been ok for 97 files, but the new 0x0004 style block must
    # have different size records... so derive the row count from the row data instead.
    # a zero record size can only describe an empty table.
    @length = @rec_size == 0 ? 0 : @data3.length / @rec_size

    # lets try and at least use data2 for a warning for now
    if data2
      data2_rec_size = desc.pst.header.version_2003? ? 8 : 6
      warn 'somthing seems wrong with data3' unless @length == (data2.length / data2_rec_size)
    end
  end

  # the column definitions, one Column per 8 bytes of index_data
  def schema
    @schema ||= index_data.scan(/.{8}/m).map { |data| Column.new data }
  end

  # returns the Row starting idx records into data3. nothing is decoded yet.
  def [] idx
    # handle funky rounding
    Row.new self, idx * @rec_size
  end

  # yields each Row in turn. returns an enumerator when called without a block.
  def each
    return to_enum(:each) unless block_given?
    length.times { |i| yield self[i] }
  end

  # A single table row: a view onto data3 starting at byte offset x.
  class Row
    include Enumerable

    def initialize array_parser, x
      @array_parser, @x = array_parser, x
    end

    # iterate through the property tuples, yielding key, type and value for
    # every column of this row.
    def each
      (@array_parser.index_data.length / 8).times do |i|
        ref_type, type, ind2_off, size, slot = @array_parser.index_data[8 * i, 8].unpack 'v3CC'
        value = @array_parser.data3[@x + ind2_off, size]
        if size <= 4
          # cells narrower than 4 bytes must be zero extended first: unpack('V')
          # yields nil for short input, which made every boolean cell come out true.
          value = value.to_s.ljust(4, "\0").unpack('V')[0]
        end
        # note the swap: the column's type field is the property key, and its
        # ref_type field is the property type.
        key, type, value = @array_parser.handle_indirect_values type, ref_type, value
        yield key, type, value
      end
    end
  end
end

# The attachment table of an item, found through the item's idx2 entry for
# ID2_ATTACHMENTS. #to_a returns one property list per attachment.
class AttachmentTable < BlockParser
  # a "fake" MAPI property name for this constant. if you get a mapi property with
  # this value, it is the id2 value to use to get attachment data.
  PR_ATTACHMENT_ID2 = 0x67f2

  attr_reader :desc, :table

  # leaves table as nil when the item has no list_index or no attachment entry
  def initialize desc
    @desc = desc
    # no super, we only actually want BlockParser2#idx2
    @table = nil
    return unless desc.list_index
    return unless idx = idx2[ID2_ATTACHMENTS]
    # FIXME make a fake desc.
    @desc2 = OpenStruct.new :desc => idx, :pst => desc.pst, :list_index => desc.list_index
    @table = RawPropertyStoreTable.new @desc2
  end

  # returns an array of property lists ([key, type, value] tuples), one per
  # attachment, or [] when there is no table.
  def to_a
    return [] if !table
    table.map do |attachment|
      attachment = attachment.to_a
      # potentially merge with yet more properties
      # this still seems pretty broken - especially the property overlap
      if attachment_id2 = attachment.assoc(PR_ATTACHMENT_ID2)
        # re-point the fake desc at the attachment's own block and let its
        # properties override or extend the ones from the table row.
        @desc2.desc = idx2[attachment_id2.last]
        RawPropertyStore.new(@desc2).each do |a, b, c|
          record = attachment.assoc a
          unless record
            record = []
            attachment << record
          end
          record.replace [a, b, c]
        end
      end
      attachment
    end
  end
end

# there is no equivalent to this in libpst. ID2_RECIPIENTS was just guessed given the above
# AttachmentTable.
class RecipientTable < BlockParser
  attr_reader :desc, :table

  # leaves table as nil when the item has no list_index or no recipient entry
  def initialize desc
    @desc = desc
    # no super, we only actually want BlockParser2#idx2
    @table = nil
    return unless desc.list_index
    return unless idx = idx2[ID2_RECIPIENTS]
    # FIXME make a fake desc.
    desc2 = OpenStruct.new :desc => idx, :pst => desc.pst, :list_index => desc.list_index
    @table = RawPropertyStoreTable.new desc2
  end

  # returns an array of property lists, one per recipient, or [] when there is no table.
  def to_a
    return [] if !table
    table.map { |x| x.to_a }
  end
end

#
# higher level item code. wraps up the raw properties above, and gives nice
# objects to work with. handles item relationships too.
+ # ---------------------------------------------------------------------------- + # + + def self.make_property_set property_list + hash = property_list.inject({}) do |hash, (key, type, value)| + hash.update PropertySet::Key.new(key) => value + end + PropertySet.new hash + end + + class Attachment < Mapi::Attachment + def initialize list + super Pst.make_property_set(list) + + @embedded_msg = props.attach_data if Item === props.attach_data + end + end + + class Recipient < Mapi::Recipient + def initialize list + super Pst.make_property_set(list) + end + end + + class Item < Mapi::Message + class EntryID < Struct.new(:u1, :entry_id, :id) + UNPACK_STR = 'VA16V' + + def initialize data + data = data.unpack(UNPACK_STR) if String === data + super(*data) + end + end + + include RecursivelyEnumerable + + attr_accessor :type, :parent + + def initialize desc, list, type=nil + @desc = desc + super Pst.make_property_set(list) + + # this is kind of weird, but the ids of the special folders are stored in a hash + # when the root item is loaded + if ipm_wastebasket_entryid + desc.pst.special_folder_ids[ipm_wastebasket_entryid] = :wastebasket + end + + if finder_entryid + desc.pst.special_folder_ids[finder_entryid] = :finder + end + + # and then here, those are used, along with a crappy heuristic to determine if we are an + # item +=begin +i think the low bits of the desc_id can give some info on the type. + +it seems that 0x4 is for regular messages (and maybe contacts etc) +0x2 is for folders, and 0x8 is for special things like rules etc, that aren't visible. +=end + unless type + type = props.valid_folder_mask || ipm_subtree_entryid || props.content_count || props.subfolders ? 
:folder : :message + if type == :folder + type = desc.pst.special_folder_ids[desc.desc_id] || type + end + end + + @type = type + end + + def each_child + id = ipm_subtree_entryid + if id + root = @desc.pst.desc_from_id id + raise "couldn't find root" unless root + raise 'both kinds of children' unless @desc.children.empty? + children = root.children + # lets look up the other ids we have. + # typically the wastebasket one "deleted items" is in the children already, but + # the search folder isn't. + extras = [ipm_wastebasket_entryid, finder_entryid].compact.map do |id| + root = @desc.pst.desc_from_id id + warn "couldn't find root for id #{id}" unless root + root + end.compact + # i do this instead of union, so as not to mess with the order of the + # existing children. + children += (extras - children) + children + else + @desc.children + end.each do |desc| + item = @desc.pst.pst_parse_item(desc) + item.parent = self + yield item + end + end + + def path + parents, item = [], self + parents.unshift item while item = item.parent + # remove root + parents.shift + parents.map { |item| item.props.display_name or raise 'unable to construct path' } * '/' + end + + def children + to_enum(:each_child).to_a + end + + # these are still around because they do different stuff + + # Top of Personal Folder Record + def ipm_subtree_entryid + @ipm_subtree_entryid ||= EntryID.new(props.ipm_subtree_entryid.read).id rescue nil + end + + # Deleted Items Folder Record + def ipm_wastebasket_entryid + @ipm_wastebasket_entryid ||= EntryID.new(props.ipm_wastebasket_entryid.read).id rescue nil + end + + # Search Root Record + def finder_entryid + @finder_entryid ||= EntryID.new(props.finder_entryid.read).id rescue nil + end + + # all these have been replaced with the method_missing below +=begin + # States which folders are valid for this message store + #def valid_folder_mask + # props[0x35df] + #end + + # Number of emails stored in a folder + def content_count + props[0x3602] + end + + # 
Has children + def subfolders + props[0x360a] + end +=end + + # i think i will change these, so they can inherit the lazyness from RawPropertyStoreTable. + # so if you want the last attachment, you can get it without creating the others perhaps. + # it just has to handle the no table at all case a bit more gracefully. + + def attachments + @attachments ||= AttachmentTable.new(@desc).to_a.map { |list| Attachment.new list } + end + + def recipients + #[] + @recipients ||= RecipientTable.new(@desc).to_a.map { |list| Recipient.new list } + end + + def each_recursive(&block) + #p :self => self + children.each do |child| + #p :child => child + block[child] + child.each_recursive(&block) + end + end + + def inspect + attrs = %w[display_name subject sender_name subfolders] +# attrs = %w[display_name valid_folder_mask ipm_wastebasket_entryid finder_entryid content_count subfolders] + str = attrs.map { |a| b = props.send a; " #{a}=#{b.inspect}" if b }.compact * ',' + + type_s = type == :message ? 'Message' : type == :folder ? 'Folder' : type.to_s.capitalize + 'Folder' + str2 = 'desc_id=0x%x' % @desc.desc_id + + !str.empty? ? "#<Pst::#{type_s} #{str2}#{str}>" : "#<Pst::#{type_s} #{str2} props=#{props.inspect}>" #\n" + props.transport_message_headers + ">" + end + end + + # corresponds to + # * _pst_parse_item + def pst_parse_item desc + Item.new desc, RawPropertyStore.new(desc).to_a + end + + # + # other random code + # ---------------------------------------------------------------------------- + # + + def dump_debug_info + puts "* pst header" + p header + +=begin +Looking at the output of this, for blank-o1997.pst, i see this part: +... +- (26624,516) desc block data (overlap of 4 bytes) +- (27136,516) desc block data (gap of 508 bytes) +- (28160,516) desc block data (gap of 2620 bytes) +... 
+ +which confirms my belief that the block size for idx and desc is more likely 512 +=end + if 0 + 0 == 0 + puts '* file range usage' + file_ranges = + # these 3 things, should account for most of the data in the file. + [[0, Header::SIZE, 'pst file header']] + + @idx_offsets.map { |offset| [offset, Index::BLOCK_SIZE, 'idx block data'] } + + @desc_offsets.map { |offset| [offset, Desc::BLOCK_SIZE, 'desc block data'] } + + @idx.map { |idx| [idx.offset, idx.size, 'idx id=0x%x (%s)' % [idx.id, idx.type]] } + (file_ranges.sort_by { |idx| idx.first } + [nil]).to_enum(:each_cons, 2).each do |(offset, size, name), next_record| + # i think there is a padding of the size out to 64 bytes + # which is equivalent to padding out the final offset, because i think the offset is + # similarly oriented + pad_amount = 64 + warn 'i am wrong about the offset padding' if offset % pad_amount != 0 + # so, assuming i'm not wrong about that, then we can calculate how much padding is needed. + pad = pad_amount - (size % pad_amount) + pad = 0 if pad == pad_amount + gap = next_record ? next_record.first - (offset + size + pad) : 0 + extra = case gap <=> 0 + when -1; ["overlap of #{gap.abs} bytes)"] + when 0; [] + when +1; ["gap of #{gap} bytes"] + end + # how about we check that padding + @io.pos = offset + size + pad_bytes = @io.read(pad) + extra += ["padding not all zero"] unless pad_bytes == 0.chr * pad + puts "- #{offset}:#{size}+#{pad} #{name.inspect}" + (extra.empty? ? '' : ' [' + extra * ', ' + ']') + end + end + + # i think the idea of the idx, and indeed the idx2, is just to be able to + # refer to data indirectly, which means it can get moved around, and you just update + # the idx table. it is simply a list of file offsets and sizes. + # not sure i get how id2 plays into it though.... + # the sizes seem to be all even. is that a co-incidence? and the ids are all even. 
that + # seems to be related to something else (see the (id & 2) == 1 stuff) + puts '* idx entries' + @idx.each { |idx| puts "- #{idx.inspect}" } + + # if you look at the desc tree, you notice a few things: + # 1. there is a desc that seems to be the parent of all the folders, messages etc. + # it is the one whose parent is itself. + # one of its children is referenced as the subtree_entryid of the first desc item, + # the root. + # 2. typically only 2 types of desc records have idx2_id != 0. messages themselves, + # and the desc with id = 0x61 - the xattrib container. everything else uses the + # regular ids to find its data. i think it should be reframed as small blocks and + # big blocks, but i'll look into it more. + # + # idx_id and idx2_id are for getting to the data. desc_id and parent_desc_id just define + # the parent <-> child relationship, and the desc_ids are how the items are referred to in + # entryids. + # note that these aren't unique! eg for 0, 4 etc. i expect these'd never change, as the ids + # are stored in entryids. whereas the idx and idx2 could be a bit more volatile. + puts '* desc tree' + # make a dummy root hold everything just for convenience + root = Desc.new '' + def root.inspect; "#<Pst::Root>"; end + root.children.replace @orphans + # this still loads the whole thing as a string for gsub. should use directo output io + # version. + puts root.to_tree.gsub(/, (parent_desc_id|idx2_id)=0x0(?!\d)/, '') + + # this is fairly easy to understand, its just an attempt to display the pst items in a tree form + # which resembles what you'd see in outlook. 
+ puts '* item tree' + # now streams directly + root_item.to_tree STDOUT + end + + def root_desc + @desc.first + end + + def root_item + item = pst_parse_item root_desc + item.type = :root + item + end + + def root + root_item + end + + # depth first search of all items + include Enumerable + + def each(&block) + root = self.root + block[root] + root.each_recursive(&block) + end + + def name + @name ||= root_item.props.display_name + end + + def inspect + "#<Pst name=#{name.inspect} io=#{io.inspect}>" + end +end +end + |