aboutsummaryrefslogtreecommitdiffstats
path: root/vendor/ruby-msg/lib/mapi/pst.rb
diff options
context:
space:
mode:
Diffstat (limited to 'vendor/ruby-msg/lib/mapi/pst.rb')
-rw-r--r--vendor/ruby-msg/lib/mapi/pst.rb1806
1 files changed, 1806 insertions, 0 deletions
diff --git a/vendor/ruby-msg/lib/mapi/pst.rb b/vendor/ruby-msg/lib/mapi/pst.rb
new file mode 100644
index 000000000..9ac64b097
--- /dev/null
+++ b/vendor/ruby-msg/lib/mapi/pst.rb
@@ -0,0 +1,1806 @@
+#
+# = Introduction
+#
+# This file is mostly an attempt to port libpst to ruby, and simplify it in the process. It
+# will leverage much of the existing MAPI => MIME conversion developed for Msg files, and as
+# such is purely concerned with the file structure details.
+#
+# = TODO
+#
+# 1. solve recipient table problem (test4).
+# this is done. turns out it was due to id2 clashes. find better solution
+# 2. check parse consistency. an initial conversion of a 30M file to pst, shows
+#    a number of messages converting badly. compare with libpst too.
+# 3. xattribs
+# 4. generalise the Mapi stuff better
+# 5. refactor index load
+# 6. msg serialization?
+#
+
+=begin
+
+quick plan for cleanup.
+
+have working tests for 97 and 03 file formats, so safe.
+
+want to fix up:
+
+64 bit unpacks scattered around. it's ugly. not sure how best to handle it, but am slightly tempted
+to override String#unpack to support a 64 bit little endian unpack (like L vs N/V, for Q). one way or
+another need to fix it. Could really slow everything else down if it's parsing the unpack strings twice,
+once in ruby, for every single unpack i do :/
+
+the index loading process, and the lack of shared code between normal vs 64 bit variants, and Index vs Desc.
+should be able to reduce code by factor of 4. also think I should move load code into the class too. then
+maybe have something like:
+
+class Header
+ def index_class
+ version_2003 ? Index64 : Index
+ end
+end
+
+def load_idx
+ header.index_class.load_index
+end
+
+OR
+
+def initialize
+ @header = ...
+ extend @header.index_class::Load
+ load_idx
+end
+
+need to think about the role of the mapi code, and Pst::Item etc, but that layer can come later.
+
+=end
+
+require 'mapi'
+require 'enumerator'
+require 'ostruct'
+require 'ole/ranges_io'
+
+module Mapi
+class Pst
+ class FormatError < StandardError
+ end
+
+ # unfortunately there is no Q analogue which is little endian only.
+ # this translates T as an unsigned quad word, little endian byte order, to
+ # not pollute the rest of the code.
+ #
+ # didn't want to override String#unpack, cause its too hacky, and incomplete.
+ # Unpack +str+ according to +unpack_spec+, where the spec may additionally
+ # contain the custom code 'T': an unsigned 64 bit quad word in little endian
+ # byte order (1.8-era ruby has no native unpack code for that).
+ #
+ # Works by rewriting each 'T' into a pair of 'V' (32 bit LE) fields,
+ # unpacking, then folding each low/high pair back into one value. The
+ # rewritten spec and the element positions of the 'T' fields are cached
+ # per spec string, so the spec is only re-parsed once.
+ def self.unpack str, unpack_spec
+ return str.unpack(unpack_spec) unless unpack_spec['T']
+ @unpack_cache ||= {}
+ t_offsets, new_spec = @unpack_cache[unpack_spec]
+ unless t_offsets
+ t_offsets = []
+ offset = 0
+ new_spec = ''
+ unpack_spec.scan(/([^\d])_?(\*|\d+)?/o) do
+ # 'a' takes a byte count but yields a single string element
+ num_elems = $1.downcase == 'a' ? 1 : ($2 || 1).to_i
+ if $1 == 'T'
+ num_elems.times { |i| t_offsets << offset + i }
+ new_spec << "V#{num_elems * 2}"
+ else
+ new_spec << $~[0]
+ end
+ offset += num_elems
+ end
+ @unpack_cache[unpack_spec] = [t_offsets, new_spec]
+ end
+ a = str.unpack(new_spec)
+ # t_offsets are in collapsed (original-spec) element space; processing them
+ # in ascending order keeps them valid as each 2-element pair collapses to 1.
+ t_offsets.each do |offset|
+ low, high = a[offset, 2]
+ # nil halves (short input string) propagate as a nil element
+ a[offset, 2] = low && high ? low + (high << 32) : nil
+ end
+ a
+ end
+
+ #
+ # this is the header and encryption encapsulation code
+ # ----------------------------------------------------------------------------
+ #
+
+ # class which encapsulates the pst header
+ class Header
+ SIZE = 512
+ MAGIC = 0x2142444e
+
+ # these are the constants defined in libpst.c, that
+ # are referenced in pst_open()
+ INDEX_TYPE_OFFSET = 0x0A
+ FILE_SIZE_POINTER = 0xA8
+ FILE_SIZE_POINTER_64 = 0xB8
+ SECOND_POINTER = 0xBC
+ INDEX_POINTER = 0xC4
+ SECOND_POINTER_64 = 0xE0
+ INDEX_POINTER_64 = 0xF0
+ ENC_OFFSET = 0x1CD
+
+ attr_reader :magic, :index_type, :encrypt_type, :size
+ attr_reader :index1_count, :index1, :index2_count, :index2
+ attr_reader :version
+ def initialize data
+ @magic = data.unpack('N')[0]
+ @index_type = data[INDEX_TYPE_OFFSET]
+ @version = {0x0e => 1997, 0x17 => 2003}[@index_type]
+
+ if version_2003?
+ # don't know?
+ # >> data1.unpack('V*').zip(data2.unpack('V*')).enum_with_index.select { |(c, d), i| c != d and not [46, 56, 60].include?(i) }.select { |(a, b), i| b == 0 }.map { |(a, b), i| [a / 256, i] }
+ # [8, 76], [32768, 84], [128, 89]
+ # >> data1.unpack('C*').zip(data2.unpack('C*')).enum_with_index.select { |(c, d), i| c != d and not [184..187, 224..227, 240..243].any? { |r| r === i } }.select { |(a, b), i| b == 0 and ((Math.log(a) / Math.log(2)) % 1) < 0.0001 }
+ # [[[2, 0], 61], [[2, 0], 76], [[2, 0], 195], [[2, 0], 257], [[8, 0], 305], [[128, 0], 338], [[128, 0], 357]]
+ # i have only 2 psts to base this guess on, so i can't really come up with anything that looks reasonable yet. not sure what the offset is. unfortunately there is so much in the header
+ # that isn't understood...
+ @encrypt_type = 1
+
+ @index2_count, @index2 = data[SECOND_POINTER_64 - 4, 8].unpack('V2')
+ @index1_count, @index1 = data[INDEX_POINTER_64 - 4, 8].unpack('V2')
+
+ @size = data[FILE_SIZE_POINTER_64, 4].unpack('V')[0]
+ else
+ @encrypt_type = data[ENC_OFFSET]
+
+ @index2_count, @index2 = data[SECOND_POINTER - 4, 8].unpack('V2')
+ @index1_count, @index1 = data[INDEX_POINTER - 4, 8].unpack('V2')
+
+ @size = data[FILE_SIZE_POINTER, 4].unpack('V')[0]
+ end
+
+ validate!
+ end
+
+ def version_2003?
+ version == 2003
+ end
+
+ def encrypted?
+ encrypt_type != 0
+ end
+
+ def validate!
+ raise FormatError, "bad signature on pst file (#{'0x%x' % magic})" unless magic == MAGIC
+ raise FormatError, "only index types 0x0e and 0x17 are handled (#{'0x%x' % index_type})" unless [0x0e, 0x17].include?(index_type)
+ raise FormatError, "only encrytion types 0 and 1 are handled (#{encrypt_type.inspect})" unless [0, 1].include?(encrypt_type)
+ end
+ end
+
+ # compressible encryption! :D
+ #
+ # simple substitution. see libpst.c
+ # maybe test switch to using a String#tr!
+ # Implements pst "compressible encryption": a fixed byte-for-byte
+ # substitution cipher (tables from libpst.c). ENCRYPT_TABLE is derived as
+ # the inverse permutation of DECRYPT_TABLE.
+ class CompressibleEncryption
+ DECRYPT_TABLE = [
+ 0x47, 0xf1, 0xb4, 0xe6, 0x0b, 0x6a, 0x72, 0x48,
+ 0x85, 0x4e, 0x9e, 0xeb, 0xe2, 0xf8, 0x94, 0x53, # 0x0f
+ 0xe0, 0xbb, 0xa0, 0x02, 0xe8, 0x5a, 0x09, 0xab,
+ 0xdb, 0xe3, 0xba, 0xc6, 0x7c, 0xc3, 0x10, 0xdd, # 0x1f
+ 0x39, 0x05, 0x96, 0x30, 0xf5, 0x37, 0x60, 0x82,
+ 0x8c, 0xc9, 0x13, 0x4a, 0x6b, 0x1d, 0xf3, 0xfb, # 0x2f
+ 0x8f, 0x26, 0x97, 0xca, 0x91, 0x17, 0x01, 0xc4,
+ 0x32, 0x2d, 0x6e, 0x31, 0x95, 0xff, 0xd9, 0x23, # 0x3f
+ 0xd1, 0x00, 0x5e, 0x79, 0xdc, 0x44, 0x3b, 0x1a,
+ 0x28, 0xc5, 0x61, 0x57, 0x20, 0x90, 0x3d, 0x83, # 0x4f
+ 0xb9, 0x43, 0xbe, 0x67, 0xd2, 0x46, 0x42, 0x76,
+ 0xc0, 0x6d, 0x5b, 0x7e, 0xb2, 0x0f, 0x16, 0x29, # 0x5f
+ 0x3c, 0xa9, 0x03, 0x54, 0x0d, 0xda, 0x5d, 0xdf,
+ 0xf6, 0xb7, 0xc7, 0x62, 0xcd, 0x8d, 0x06, 0xd3, # 0x6f
+ 0x69, 0x5c, 0x86, 0xd6, 0x14, 0xf7, 0xa5, 0x66,
+ 0x75, 0xac, 0xb1, 0xe9, 0x45, 0x21, 0x70, 0x0c, # 0x7f
+ 0x87, 0x9f, 0x74, 0xa4, 0x22, 0x4c, 0x6f, 0xbf,
+ 0x1f, 0x56, 0xaa, 0x2e, 0xb3, 0x78, 0x33, 0x50, # 0x8f
+ 0xb0, 0xa3, 0x92, 0xbc, 0xcf, 0x19, 0x1c, 0xa7,
+ 0x63, 0xcb, 0x1e, 0x4d, 0x3e, 0x4b, 0x1b, 0x9b, # 0x9f
+ 0x4f, 0xe7, 0xf0, 0xee, 0xad, 0x3a, 0xb5, 0x59,
+ 0x04, 0xea, 0x40, 0x55, 0x25, 0x51, 0xe5, 0x7a, # 0xaf
+ 0x89, 0x38, 0x68, 0x52, 0x7b, 0xfc, 0x27, 0xae,
+ 0xd7, 0xbd, 0xfa, 0x07, 0xf4, 0xcc, 0x8e, 0x5f, # 0xbf
+ 0xef, 0x35, 0x9c, 0x84, 0x2b, 0x15, 0xd5, 0x77,
+ 0x34, 0x49, 0xb6, 0x12, 0x0a, 0x7f, 0x71, 0x88, # 0xcf
+ 0xfd, 0x9d, 0x18, 0x41, 0x7d, 0x93, 0xd8, 0x58,
+ 0x2c, 0xce, 0xfe, 0x24, 0xaf, 0xde, 0xb8, 0x36, # 0xdf
+ 0xc8, 0xa1, 0x80, 0xa6, 0x99, 0x98, 0xa8, 0x2f,
+ 0x0e, 0x81, 0x65, 0x73, 0xe4, 0xc2, 0xa2, 0x8a, # 0xef
+ 0xd4, 0xe1, 0x11, 0xd0, 0x08, 0x8b, 0x2a, 0xf2,
+ 0xed, 0x9a, 0x64, 0x3f, 0xc1, 0x6c, 0xf9, 0xec # 0xff
+ ]
+
+ # inverse permutation of DECRYPT_TABLE
+ ENCRYPT_TABLE = [nil] * 256
+ DECRYPT_TABLE.each_with_index { |i, j| ENCRYPT_TABLE[i] = j }
+
+ # byte-by-byte reference implementation (ruby 1.8 String#[] => Fixnum)
+ def self.decrypt_alt encrypted
+ decrypted = ''
+ encrypted.length.times { |i| decrypted << DECRYPT_TABLE[encrypted[i]] }
+ decrypted
+ end
+
+ def self.encrypt_alt decrypted
+ encrypted = ''
+ decrypted.length.times { |i| encrypted << ENCRYPT_TABLE[decrypted[i]] }
+ encrypted
+ end
+
+ # an alternate implementation that is possibly faster....
+ # TODO - bench
+ # tables rendered as String#tr argument strings, with tr's special
+ # characters (^ - \) escaped
+ DECRYPT_STR, ENCRYPT_STR = [DECRYPT_TABLE, (0...256)].map do |values|
+ values.map { |i| i.chr }.join.gsub(/([\^\-\\])/, "\\\\\\1")
+ end
+
+ def self.decrypt encrypted
+ encrypted.tr ENCRYPT_STR, DECRYPT_STR
+ end
+
+ def self.encrypt decrypted
+ decrypted.tr DECRYPT_STR, ENCRYPT_STR
+ end
+ end
+
+ # A RangesIO (from ole/ranges_io) that optionally decrypts what it reads
+ # with CompressibleEncryption, controlled by the :decrypt param.
+ class RangesIOEncryptable < RangesIO
+ def initialize io, mode='r', params={}
+ # allow mode to be omitted, ie (io, params)
+ mode, params = 'r', mode if Hash === mode
+ @decrypt = !!params[:decrypt]
+ super
+ end
+
+ def encrypted?
+ @decrypt
+ end
+
+ # reads via RangesIO, then decrypts the buffer if required
+ def read limit=nil
+ buf = super
+ buf = CompressibleEncryption.decrypt(buf) if encrypted?
+ buf
+ end
+ end
+
+ attr_reader :io, :header, :idx, :desc, :special_folder_ids
+
+ # corresponds to
+ # * pst_open
+ # * pst_load_index
+ def initialize io
+ @io = io
+ io.pos = 0
+ @header = Header.new io.read(Header::SIZE)
+
+ # would prefer this to be in Header#validate, but it doesn't have the io size.
+ # should perhaps downgrade this to just be a warning...
+ raise FormatError, "header size field invalid (#{header.size} != #{io.size}}" unless header.size == io.size
+
+ load_idx
+ load_desc
+ load_xattrib
+
+ @special_folder_ids = {}
+ end
+
+ # whether the file's data blocks use compressible encryption
+ def encrypted?
+ @header.encrypted?
+ end
+
+ # until i properly fix logging...
+ # forwards to the library-wide logger
+ def warn s
+ Mapi::Log.warn s
+ end
+
+ #
+ # this is the index and desc record loading code
+ # ----------------------------------------------------------------------------
+ #
+
+ ToTree = Module.new
+
+ # sketch of the planned index-load refactor (see the =begin notes at the
+ # top of the file) - stubs only, not used by the loading code yet
+ module Index2
+ BLOCK_SIZE = 512
+ module RecursiveLoad
+ def load_chain
+ #...
+ end
+ end
+
+ module Base
+ def read
+ #...
+ end
+ end
+
+ class Version1997 < Struct.new(:a)#...)
+ SIZE = 12
+
+ include RecursiveLoad
+ include Base
+ end
+
+ class Version2003 < Struct.new(:a)#...)
+ SIZE = 24
+
+ include RecursiveLoad
+ include Base
+ end
+ end
+
+ # sketch of the planned desc-load refactor, parallel to Index2 - stubs
+ # only, not used by the loading code yet
+ module Desc2
+ module Base
+ def desc
+ #...
+ end
+ end
+
+ class Version1997 < Struct.new(:a)#...)
+ #include Index::RecursiveLoad
+ include Base
+ end
+
+ class Version2003 < Struct.new(:a)#...)
+ #include Index::RecursiveLoad
+ include Base
+ end
+ end
+
+ # more constants from libpst.c
+ # these relate to the index block
+ ITEM_COUNT_OFFSET = 0x1f0 # count byte
+ LEVEL_INDICATOR_OFFSET = 0x1f3 # node or leaf
+ BACKLINK_OFFSET = 0x1f8 # backlink u1 value
+
+ # these 3 classes are used to hold various file records
+
+ # pst_index
+ class Index < Struct.new(:id, :offset, :size, :u1)
+ UNPACK_STR = 'VVvv'
+ SIZE = 12
+ BLOCK_SIZE = 512 # index blocks was 516 but bogus
+ COUNT_MAX = 41 # max active items (ITEM_COUNT_OFFSET / Index::SIZE = 41)
+
+ attr_accessor :pst
+ def initialize data
+ data = Pst.unpack data, UNPACK_STR if String === data
+ super(*data)
+ end
+
+ def type
+ @type ||= begin
+ if id & 0x2 == 0
+ :data
+ else
+ first_byte, second_byte = read.unpack('CC')
+ if first_byte == 1
+ raise second_byte unless second_byte == 1
+ :data_chain_header
+ elsif first_byte == 2
+ raise second_byte unless second_byte == 0
+ :id2_assoc
+ else
+ raise FormatError, 'unknown first byte for block - %p' % first_byte
+ end
+ end
+ end
+ end
+
+ def data?
+ (id & 0x2) == 0
+ end
+
+ def read decrypt=true
+ # only data blocks are every encrypted
+ decrypt = false unless data?
+ pst.pst_read_block_size offset, size, decrypt
+ end
+
+ # show all numbers in hex
+ def inspect
+ super.gsub(/=(\d+)/) { '=0x%x' % $1.to_i }.sub(/Index /, "Index type=#{type.inspect}, ")
+ end
+ end
+
+ # mostly guesses.
+ ITEM_COUNT_OFFSET_64 = 0x1e8
+ LEVEL_INDICATOR_OFFSET_64 = 0x1eb # diff of 3 between these 2 as above...
+
+ # will maybe inherit from Index64, in order to get the same #type function.
+ # will maybe inherit from Index64, in order to get the same #type function.
+ # 2003 (unicode) format idx record - 64 bit id/offset fields, plus an
+ # extra trailing u2 dword. also carries the recursive b-tree loader for
+ # this format.
+ class Index64 < Index
+ UNPACK_STR = 'TTvvV'
+ SIZE = 24
+ BLOCK_SIZE = 512
+ COUNT_MAX = 20 # bit of a guess really. 512 / 24 = 21, but doesn't leave enough header room
+
+ # this is the extra item on the end of the UNPACK_STR above
+ attr_accessor :u2
+
+ def initialize data
+ data = Pst.unpack data, UNPACK_STR if String === data
+ @u2 = data.pop
+ super data
+ end
+
+ def inspect
+ super.sub(/>$/, ', u2=%p>' % u2)
+ end
+
+ # load the whole idx tree from the file, returning a flat list of leaves
+ def self.load_chain io, header
+ load_idx_rec io, header.index1, 0, 0
+ end
+
+ # almost identical to load code for Index, just different offsets and unpack strings.
+ # can probably merge them, or write a generic load_tree function or something.
+ # NOTE: byte indexing into buf assumes ruby 1.8 String#[] => Fixnum
+ def self.load_idx_rec io, offset, linku1, start_val
+ io.seek offset
+ buf = io.read BLOCK_SIZE
+ idxs = []
+
+ item_count = buf[ITEM_COUNT_OFFSET_64]
+ raise "have too many active items in index (#{item_count})" if item_count > COUNT_MAX
+
+ #idx = Index.new buf[BACKLINK_OFFSET, Index::SIZE]
+ #raise 'blah 1' unless idx.id == linku1
+
+ if buf[LEVEL_INDICATOR_OFFSET_64] == 0
+ # leaf pointers
+ # split the data into item_count index objects
+ buf[0, SIZE * item_count].scan(/.{#{SIZE}}/mo).each_with_index do |data, i|
+ idx = new data
+ # first entry
+ raise 'blah 3' if i == 0 and start_val != 0 and idx.id != start_val
+ #idx.pst = self
+ break if idx.id == 0
+ idxs << idx
+ end
+ else
+ # node pointers
+ # split the data into item_count table pointers
+ buf[0, SIZE * item_count].scan(/.{#{SIZE}}/mo).each_with_index do |data, i|
+ start, u1, offset = Pst.unpack data, 'T3'
+ # for the first value, we expect the start to be equal
+ raise 'blah 3' if i == 0 and start_val != 0 and start != start_val
+ break if start == 0
+ idxs += load_idx_rec io, offset, u1, start
+ end
+ end
+
+ idxs
+ end
+ end
+
+ # pst_desc
+ # pst_desc
+ # 2003 (unicode) format desc record: three 64 bit ids (desc, idx, idx2)
+ # plus parent id and u2 dwords. also carries the recursive loader for the
+ # desc b-tree in this format.
+ class Desc64 < Struct.new(:desc_id, :idx_id, :idx2_id, :parent_desc_id, :u2)
+ UNPACK_STR = 'T3VV'
+ SIZE = 32
+ BLOCK_SIZE = 512 # descriptor blocks was 520 but bogus
+ COUNT_MAX = 15 # guess as per Index64
+
+ include RecursivelyEnumerable
+
+ attr_accessor :pst
+ attr_reader :children
+ def initialize data
+ super(*Pst.unpack(data, UNPACK_STR))
+ @children = []
+ end
+
+ # the idx record holding this desc's primary data stream
+ def desc
+ pst.idx_from_id idx_id
+ end
+
+ # the idx record holding this desc's id2 association list (may be nil)
+ def list_index
+ pst.idx_from_id idx2_id
+ end
+
+ # load the whole desc tree; 0x21 is the expected root desc id
+ def self.load_chain io, header
+ load_desc_rec io, header.index2, 0, 0x21
+ end
+
+ # NOTE: byte indexing into buf assumes ruby 1.8 String#[] => Fixnum.
+ # node (non-leaf) blocks use Index64-sized pointer entries.
+ def self.load_desc_rec io, offset, linku1, start_val
+ io.seek offset
+ buf = io.read BLOCK_SIZE
+ descs = []
+ item_count = buf[ITEM_COUNT_OFFSET_64]
+
+ # not real desc
+ #desc = Desc.new buf[BACKLINK_OFFSET, 4]
+ #raise 'blah 1' unless desc.desc_id == linku1
+
+ if buf[LEVEL_INDICATOR_OFFSET_64] == 0
+ # leaf pointers
+ raise "have too many active items in index (#{item_count})" if item_count > COUNT_MAX
+ # split the data into item_count desc objects
+ buf[0, SIZE * item_count].scan(/.{#{SIZE}}/mo).each_with_index do |data, i|
+ desc = new data
+ # first entry
+ raise 'blah 3' if i == 0 and start_val != 0 and desc.desc_id != start_val
+ break if desc.desc_id == 0
+ descs << desc
+ end
+ else
+ # node pointers
+ raise "have too many active items in index (#{item_count})" if item_count > Index64::COUNT_MAX
+ # split the data into item_count table pointers
+ buf[0, Index64::SIZE * item_count].scan(/.{#{Index64::SIZE}}/mo).each_with_index do |data, i|
+ start, u1, offset = Pst.unpack data, 'T3'
+ # for the first value, we expect the start to be equal note that ids -1, so even for the
+ # first we expect it to be equal. thats the 0x21 (dec 33) desc record. this means we assert
+ # that the first desc record is always 33...
+ # thats because 0x21 is the pst root itself...
+ raise 'blah 3' if i == 0 and start_val != -1 and start != start_val
+ # this shouldn't really happen i'd imagine
+ break if start == 0
+ descs += load_desc_rec io, offset, u1, start
+ end
+ end
+
+ descs
+ end
+
+ # for RecursivelyEnumerable
+ def each_child(&block)
+ @children.each(&block)
+ end
+ end
+
+ # _pst_table_ptr_struct
+ # _pst_table_ptr_struct
+ # a node-level pointer entry in the 1997 format b-trees: first id in the
+ # subtree, backlink value, and file offset of the child block.
+ class TablePtr < Struct.new(:start, :u1, :offset)
+ UNPACK_STR = 'V3'
+ SIZE = 12
+
+ # +data+ is either a raw SIZE-byte string or a pre-unpacked array
+ def initialize data
+ data = data.unpack(UNPACK_STR) if String === data
+ super(*data)
+ end
+ end
+
+ # pst_desc
+ # idx_id is a pointer to an idx record which gets the primary data stream for the Desc record.
+ # idx2_id gets you an idx record, that when read gives you an ID2 association list, which just maps
+ # another set of ids to index values
+ # 1997 (ANSI) format desc record - see the class comment above: idx_id
+ # points at the primary data stream, idx2_id at the id2 association list.
+ class Desc < Struct.new(:desc_id, :idx_id, :idx2_id, :parent_desc_id)
+ UNPACK_STR = 'V4'
+ SIZE = 16
+ BLOCK_SIZE = 512 # descriptor blocks was 520 but bogus
+ COUNT_MAX = 31 # max active desc records (ITEM_COUNT_OFFSET / Desc::SIZE = 31)
+
+ include ToTree
+
+ attr_accessor :pst
+ attr_reader :children
+ def initialize data
+ super(*data.unpack(UNPACK_STR))
+ @children = []
+ end
+
+ # the idx record holding this desc's primary data stream
+ def desc
+ pst.idx_from_id idx_id
+ end
+
+ # the idx record holding this desc's id2 association list (may be nil)
+ def list_index
+ pst.idx_from_id idx2_id
+ end
+
+ # show all numbers in hex
+ def inspect
+ super.gsub(/=(\d+)/) { '=0x%x' % $1.to_i }
+ end
+ end
+
+ # corresponds to
+ # * _pst_build_id_ptr
+ # load the idx b-tree into the flat @idx list and the @idx_from_id lookup
+ # hash, dispatching on file format version.
+ #
+ # corresponds to
+ # * _pst_build_id_ptr
+ def load_idx
+ @idx = []
+ @idx_offsets = []
+ if header.version_2003?
+ @idx = Index64.load_chain io, header
+ @idx.each { |idx| idx.pst = self }
+ else
+ load_idx_rec header.index1, header.index1_count, 0
+ end
+
+ # we'll typically be accessing by id, so create a hash as a lookup cache
+ @idx_from_id = {}
+ @idx.each do |idx|
+ warn "there are duplicate idx records with id #{idx.id}" if @idx_from_id[idx.id]
+ @idx_from_id[idx.id] = idx
+ end
+ end
+
+ # load the flat idx table, which maps ids to file ranges. this is the recursive helper
+ #
+ # corresponds to
+ # * _pst_build_id_ptr
+ # load the flat idx table, which maps ids to file ranges. this is the recursive helper
+ # (1997 format only). leaf blocks append Index records to @idx; node
+ # blocks recurse via TablePtr entries.
+ # NOTE: byte indexing into buf assumes ruby 1.8 String#[] => Fixnum
+ #
+ # corresponds to
+ # * _pst_build_id_ptr
+ def load_idx_rec offset, linku1, start_val
+ @idx_offsets << offset
+
+ #_pst_read_block_size(pf, offset, BLOCK_SIZE, &buf, 0, 0) < BLOCK_SIZE)
+ buf = pst_read_block_size offset, Index::BLOCK_SIZE, false
+
+ item_count = buf[ITEM_COUNT_OFFSET]
+ raise "have too many active items in index (#{item_count})" if item_count > Index::COUNT_MAX
+
+ # backlink check - the record embedded at BACKLINK_OFFSET must match linku1
+ idx = Index.new buf[BACKLINK_OFFSET, Index::SIZE]
+ raise 'blah 1' unless idx.id == linku1
+
+ if buf[LEVEL_INDICATOR_OFFSET] == 0
+ # leaf pointers
+ # split the data into item_count index objects
+ buf[0, Index::SIZE * item_count].scan(/.{#{Index::SIZE}}/mo).each_with_index do |data, i|
+ idx = Index.new data
+ # first entry
+ raise 'blah 3' if i == 0 and start_val != 0 and idx.id != start_val
+ idx.pst = self
+ # this shouldn't really happen i'd imagine
+ break if idx.id == 0
+ @idx << idx
+ end
+ else
+ # node pointers
+ # split the data into item_count table pointers
+ buf[0, TablePtr::SIZE * item_count].scan(/.{#{TablePtr::SIZE}}/mo).each_with_index do |data, i|
+ table = TablePtr.new data
+ # for the first value, we expect the start to be equal
+ raise 'blah 3' if i == 0 and start_val != 0 and table.start != start_val
+ # this shouldn't really happen i'd imagine
+ break if table.start == 0
+ load_idx_rec table.offset, table.u1, table.start
+ end
+ end
+ end
+
+ # most access to idx objects will use this function
+ #
+ # corresponds to
+ # * _pst_getID
+ # look up an idx record by id (nil if absent)
+ def idx_from_id id
+ @idx_from_id[id]
+ end
+
+ # corresponds to
+ # * _pst_build_desc_ptr
+ # * record_descriptor
+ # load the desc b-tree: flat @desc list, @desc_from_id lookup, then link
+ # each desc into its parent's children, collecting parentless ones in
+ # @orphans.
+ #
+ # corresponds to
+ # * _pst_build_desc_ptr
+ # * record_descriptor
+ def load_desc
+ @desc = []
+ @desc_offsets = []
+ if header.version_2003?
+ @desc = Desc64.load_chain io, header
+ @desc.each { |desc| desc.pst = self }
+ else
+ load_desc_rec header.index2, header.index2_count, 0x21
+ end
+
+ # first create a lookup cache
+ @desc_from_id = {}
+ @desc.each do |desc|
+ desc.pst = self
+ warn "there are duplicate desc records with id #{desc.desc_id}" if @desc_from_id[desc.desc_id]
+ @desc_from_id[desc.desc_id] = desc
+ end
+
+ # now turn the flat list of loaded desc records into a tree
+
+ # well, they have no parent, so they're more like, the toplevel descs.
+ @orphans = []
+ # now assign each node to the parents child array, putting the orphans in the above
+ @desc.each do |desc|
+ parent = @desc_from_id[desc.parent_desc_id]
+ # note, besides this, its possible to create other circular structures.
+ if parent == desc
+ # this actually happens usually, for the root_item it appears.
+ #warn "desc record's parent is itself (#{desc.inspect})"
+ # maybe add some more checks in here for circular structures
+ elsif parent
+ parent.children << desc
+ next
+ end
+ @orphans << desc
+ end
+
+ # maybe change this to some sort of sane-ness check. orphans are expected
+# warn "have #{@orphans.length} orphan desc record(s)." unless @orphans.empty?
+ end
+
+ # load the flat list of desc records recursively
+ #
+ # corresponds to
+ # * _pst_build_desc_ptr
+ # * record_descriptor
+ # load the flat list of desc records recursively (1997 format only).
+ # leaf blocks append Desc records to @desc; node blocks recurse via
+ # TablePtr entries. NOTE: byte indexing assumes ruby 1.8 String#[].
+ #
+ # corresponds to
+ # * _pst_build_desc_ptr
+ # * record_descriptor
+ def load_desc_rec offset, linku1, start_val
+ @desc_offsets << offset
+
+ buf = pst_read_block_size offset, Desc::BLOCK_SIZE, false
+ item_count = buf[ITEM_COUNT_OFFSET]
+
+ # not real desc
+ desc = Desc.new buf[BACKLINK_OFFSET, 4]
+ raise 'blah 1' unless desc.desc_id == linku1
+
+ if buf[LEVEL_INDICATOR_OFFSET] == 0
+ # leaf pointers
+ raise "have too many active items in index (#{item_count})" if item_count > Desc::COUNT_MAX
+ # split the data into item_count desc objects
+ buf[0, Desc::SIZE * item_count].scan(/.{#{Desc::SIZE}}/mo).each_with_index do |data, i|
+ desc = Desc.new data
+ # first entry
+ raise 'blah 3' if i == 0 and start_val != 0 and desc.desc_id != start_val
+ # this shouldn't really happen i'd imagine
+ break if desc.desc_id == 0
+ @desc << desc
+ end
+ else
+ # node pointers
+ raise "have too many active items in index (#{item_count})" if item_count > Index::COUNT_MAX
+ # split the data into item_count table pointers
+ buf[0, TablePtr::SIZE * item_count].scan(/.{#{TablePtr::SIZE}}/mo).each_with_index do |data, i|
+ table = TablePtr.new data
+ # for the first value, we expect the start to be equal note that ids -1, so even for the
+ # first we expect it to be equal. thats the 0x21 (dec 33) desc record. this means we assert
+ # that the first desc record is always 33...
+ raise 'blah 3' if i == 0 and start_val != -1 and table.start != start_val
+ # this shouldn't really happen i'd imagine
+ break if table.start == 0
+ load_desc_rec table.offset, table.u1, table.start
+ end
+ end
+ end
+
+ # as for idx
+ #
+ # corresponds to:
+ # * _pst_getDptr
+ # look up a desc record by id (nil if absent). corresponds to _pst_getDptr
+ def desc_from_id id
+ @desc_from_id[id]
+ end
+
+ # corresponds to
+ # * pst_load_extended_attributes
+ # locate the extended attributes desc record (fixed id 0x61) and warn if
+ # it (or its data stream) is missing. actual parsing is not implemented.
+ #
+ # corresponds to
+ # * pst_load_extended_attributes
+ def load_xattrib
+ unless desc = desc_from_id(0x61)
+ warn "no extended attributes desc record found"
+ return
+ end
+ unless desc.desc
+ warn "no desc idx for extended attributes"
+ return
+ end
+ # intentionally empty - placeholder for the unimplemented parse below
+ if desc.list_index
+ end
+ #warn "skipping loading xattribs"
+ # FIXME implement loading xattribs
+ end
+
+ # corresponds to:
+ # * _pst_read_block_size
+ # * _pst_read_block ??
+ # * _pst_ff_getIDblock_dec ??
+ # * _pst_ff_getIDblock ??
+ # read +size+ bytes at +offset+ from the underlying io, decrypting when
+ # the file is encrypted and +decrypt+ is requested. warns on short reads.
+ #
+ # corresponds to:
+ # * _pst_read_block_size
+ # * _pst_read_block ??
+ # * _pst_ff_getIDblock_dec ??
+ # * _pst_ff_getIDblock ??
+ def pst_read_block_size offset, size, decrypt=true
+ io.seek offset
+ buf = io.read size
+ warn "tried to read #{size} bytes but only got #{buf.length}" if buf.length != size
+ encrypted? && decrypt ? CompressibleEncryption.decrypt(buf) : buf
+ end
+
+ #
+ # id2
+ # ----------------------------------------------------------------------------
+ #
+
+ # one entry of a 1997-format id2 association list: maps a local id2 value
+ # to an idx id, with table2 optionally chaining to a further list.
+ class ID2Assoc < Struct.new(:id2, :id, :table2)
+ UNPACK_STR = 'V3'
+ SIZE = 12
+
+ # +data+ is either a raw SIZE-byte string or a pre-unpacked array
+ def initialize data
+ data = data.unpack(UNPACK_STR) if String === data
+ super(*data)
+ end
+ end
+
+ # 2003-format id2 association entry (64 bit id/table2 fields), plus the
+ # loader that follows table2 chains recursively.
+ class ID2Assoc64 < Struct.new(:id2, :u1, :id, :table2)
+ UNPACK_STR = 'VVT2'
+ SIZE = 24
+
+ def initialize data
+ if String === data
+ data = Pst.unpack data, UNPACK_STR
+ end
+ super(*data)
+ end
+
+ # read the id2 block pointed at by +idx+ and return the flattened list
+ # of entries, following table2 chains. entries start at byte 8 here
+ # (vs 4 in the 1997 loader - presumably a larger block header; confirm)
+ def self.load_chain idx
+ buf = idx.read
+ type, count = buf.unpack 'v2'
+ unless type == 0x0002
+ raise 'unknown id2 type 0x%04x' % type
+ #return
+ end
+ id2 = []
+ count.times do |i|
+ assoc = new buf[8 + SIZE * i, SIZE]
+ id2 << assoc
+ if assoc.table2 != 0
+ id2 += load_chain idx.pst.idx_from_id(assoc.table2)
+ end
+ end
+ id2
+ end
+ end
+
+ # wraps a list of id2 assoc records with an id2 => idx lookup
+ class ID2Mapping
+ attr_reader :list
+ def initialize pst, list
+ @pst = pst
+ @list = list
+ # create a lookup.
+ @id_from_id2 = {}
+ @list.each do |id2|
+ # NOTE we keep the first value seen if there are duplicates (the
+ # `next` below skips later ones). this "fixes" test4-o1997.pst for
+ # the time being.
+ warn "there are duplicate id2 records with id #{id2.id2}" if @id_from_id2[id2.id2]
+ next if @id_from_id2[id2.id2]
+ @id_from_id2[id2.id2] = id2.id
+ end
+ end
+
+ # TODO: fix logging
+ def warn s
+ Mapi::Log.warn s
+ end
+
+ # resolve an id2 value to its idx record (nil if unknown)
+ #
+ # corresponds to:
+ # * _pst_getID2
+ def [] id
+ #id2 = @list.find { |x| x.id2 == id }
+ id = @id_from_id2[id]
+ id and @pst.idx_from_id(id)
+ end
+ end
+
+ # load the id2 association list rooted at +idx+, wrapped in an
+ # ID2Mapping, dispatching on file format version
+ def load_idx2 idx
+ if header.version_2003?
+ id2 = ID2Assoc64.load_chain idx
+ else
+ id2 = load_idx2_rec idx
+ end
+ ID2Mapping.new self, id2
+ end
+
+ # corresponds to
+ # * _pst_build_id2
+ # recursively read a 1997-format id2 block, following table2 chains.
+ # entries start at byte 4 (after the v2 type/count header).
+ #
+ # corresponds to
+ # * _pst_build_id2
+ def load_idx2_rec idx
+ # i should perhaps use a idx chain style read here?
+ buf = pst_read_block_size idx.offset, idx.size, false
+ type, count = buf.unpack 'v2'
+ unless type == 0x0002
+ raise 'unknown id2 type 0x%04x' % type
+ #return
+ end
+ id2 = []
+ count.times do |i|
+ assoc = ID2Assoc.new buf[4 + ID2Assoc::SIZE * i, ID2Assoc::SIZE]
+ id2 << assoc
+ if assoc.table2 != 0
+ id2 += load_idx2_rec idx_from_id(assoc.table2)
+ end
+ end
+ id2
+ end
+
+ # an IO over the concatenated file ranges of an idx data chain, with
+ # decryption applied when the chain's blocks require it
+ class RangesIOIdxChain < RangesIOEncryptable
+ def initialize pst, idx_head
+ @idxs = pst.id2_block_idx_chain idx_head
+ # whether or not a given idx needs encrypting
+ # (the block's last expression is the mapped value; the local
+ # assignment is redundant but harmless)
+ decrypts = @idxs.map do |idx|
+ decrypt = (idx.id & 2) != 0 ? false : pst.encrypted?
+ end.uniq
+ # all blocks in the chain must agree - mixed chains unsupported
+ raise NotImplementedError, 'partial encryption in RangesIOID2' if decrypts.length > 1
+ decrypt = decrypts.first
+ # convert idxs to ranges
+ ranges = @idxs.map { |idx| [idx.offset, idx.size] }
+ super pst.io, :ranges => ranges, :decrypt => decrypt
+ end
+ end
+
+ # convenience constructor: resolve an id2 value through the idx2 mapping,
+ # then open a RangesIOIdxChain over the result
+ class RangesIOID2 < RangesIOIdxChain
+ def self.new pst, id2, idx2
+ RangesIOIdxChain.new pst, idx2[id2]
+ end
+ end
+
+ # corresponds to:
+ # * _pst_ff_getID2block
+ # * _pst_ff_getID2data
+ # * _pst_ff_compile_ID
+ # resolve +idx+ to the flat list of plain data idx records it covers:
+ # a data block is returned as-is, an indirect block (id bit 1 set) is
+ # parsed as a list of ids and expanded, recursing when fdepth > 1.
+ #
+ # corresponds to:
+ # * _pst_ff_getID2block
+ # * _pst_ff_getID2data
+ # * _pst_ff_compile_ID
+ def id2_block_idx_chain idx
+ if (idx.id & 0x2) == 0
+ [idx]
+ else
+ buf = idx.read
+ type, fdepth, count = buf[0, 4].unpack 'CCv'
+ unless type == 1 # libpst.c:3958
+ warn 'Error in idx_chain - %p, %p, %p - attempting to ignore' % [type, fdepth, count]
+ return [idx]
+ end
+ # there are 4 unaccounted for bytes here, 4...8
+ if header.version_2003?
+ ids = buf[8, count * 8].unpack("T#{count}")
+ else
+ ids = buf[8, count * 4].unpack('V*')
+ end
+ if fdepth == 1
+ ids.map { |id| idx_from_id id }
+ else
+ ids.map { |id| id2_block_idx_chain idx_from_id(id) }.flatten
+ end
+ end
+ end
+
+ #
+ # main block parsing code. gets raw properties
+ # ----------------------------------------------------------------------------
+ #
+
+ # the job of this class, is to take a desc record, and be able to enumerate through the
+ # mapi properties of the associated thing.
+ #
+ # corresponds to
+ # * _pst_parse_block
+ # * _pst_process (in some ways. although perhaps thats more the Item::Properties#add_property)
+ class BlockParser
+ include Mapi::Types::Constants
+
+ # block type signature word => internal type number
+ TYPES = {
+ 0xbcec => 1,
+ 0x7cec => 2,
+ # type 3 is removed. an artifact of not handling the indirect blocks properly in libpst.
+ }
+
+ # numeric property tags resolved from the PropertySet name tables
+ PR_SUBJECT = PropertySet::TAGS.find { |num, (name, type)| name == 'PR_SUBJECT' }.first.hex
+ PR_BODY_HTML = PropertySet::TAGS.find { |num, (name, type)| name == 'PR_BODY_HTML' }.first.hex
+
+ # this stuff could maybe be moved to Ole::Types? or leverage it somehow?
+ # whether or not a type is immeidate is more a property of the pst encoding though i expect.
+ # what i probably can add is a generic concept of whether a type is of variadic length or not.
+
+ # these lists are very incomplete. think they are largely copied from libpst
+
+ # property types stored inline in the 4 byte value slot
+ IMMEDIATE_TYPES = [
+ PT_SHORT, PT_LONG, PT_BOOLEAN
+ ]
+
+ # property types where the value slot is a pointer to the real data
+ INDIRECT_TYPES = [
+ PT_DOUBLE, PT_OBJECT,
+ 0x0014, # whats this? probably something like PT_LONGLONG, given the correspondence with the
+ # ole variant types. (= VT_I8)
+ PT_STRING8, PT_UNICODE, # unicode isn't in libpst, but added here for outlook 2003 down the track
+ PT_SYSTIME,
+ 0x0048, # another unknown
+ 0x0102, # this is PT_BINARY vs PT_CLSID
+ #0x1003, # these are vector types, but they're commented out for now because i'd expect that
+ #0x1014, # there's extra decoding needed that i'm not doing. (probably just need a simple
+ # # PT_* => unpack string mapping for the immediate types, and just do unpack('V*') etc
+ #0x101e,
+ #0x1102
+ ]
+
+ # the attachment and recipient arrays appear to be always stored with these fixed
+ # id2 values. seems strange. are there other extra streams? can find out by making higher
+ # level IO wrapper, which has the id2 value, and doing the diff of available id2 values versus
+ # used id2 values in properties of an item.
+ ID2_ATTACHMENTS = 0x671
+ ID2_RECIPIENTS = 0x692
+
+ attr_reader :desc, :data, :data_chunks, :offset_tables
+ # Read the desc's primary data stream as one or more raw chunks and
+ # decode the per-chunk offset tables that index the values within.
+ def initialize desc
+ raise FormatError, "unable to get associated index record for #{desc.inspect}" unless desc.desc
+ @desc = desc
+ #@data = desc.desc.read
+ if Pst::Index === desc.desc
+ #@data = RangesIOIdxChain.new(desc.pst, desc.desc).read
+ idxs = desc.pst.id2_block_idx_chain desc.desc
+ # this gets me the plain index chain.
+ else
+ # fake desc
+ #@data = desc.desc.read
+ idxs = [desc.desc]
+ end
+
+ @data_chunks = idxs.map { |idx| idx.read }
+ @data = @data_chunks.first
+
+ # parses the first chunk's header, setting @index_offset, @type, @offset1
+ load_header
+
+ # subsequent chunks carry their index offset in their first word
+ @index_offsets = [@index_offset] + @data_chunks[1..-1].map { |chunk| chunk.unpack('v')[0] }
+ @offset_tables = []
+ @ignored = []
+ @data_chunks.zip(@index_offsets).each do |chunk, offset|
+ ignore = chunk[offset, 2].unpack('v')[0]
+ @ignored << ignore
+# p ignore
+ @offset_tables.push offset_table = []
+ # maybe its ok if there aren't to be any values ?
+ raise FormatError if offset == 0
+ # table is a list of word offsets; consecutive pairs delimit values
+ offsets = chunk[offset + 2..-1].unpack('v*')
+ #p offsets
+ offsets[0, ignore + 2].each_cons 2 do |from, to|
+ #next if to == 0
+ raise FormatError, [from, to].inspect if from > to
+ offset_table << [from, to]
+ end
+ end
+
+ @offset_table = @offset_tables.first
+ @idxs = idxs
+
+ # now, we may have multiple different blocks
+ end
+
+ # a given desc record may or may not have associated idx2 data. we lazily load it here, so it will never
+ # actually be requested unless get_data_indirect actually needs to use it.
+ # lazily-loaded id2 mapping for this desc (see comment above); raises
+ # FormatError if the desc has no id2 stream at all
+ def idx2
+ return @idx2 if @idx2
+ raise FormatError, 'idx2 requested but no idx2 available' unless desc.list_index
+ # should check this can't return nil
+ @idx2 = desc.pst.load_idx2 desc.list_index
+ end
+
+ # parse the block header from the first chunk: index table offset, type
+ # signature word (validated against TYPES), and offset1
+ def load_header
+ @index_offset, type, @offset1 = data.unpack 'vvV'
+ raise FormatError, 'unknown block type signature 0x%04x' % type unless TYPES[type]
+ @type = TYPES[type]
+ end
+
+ # based on the value of offset, return either some data from buf, or some data from the
+ # id2 chain id2, where offset is some key into a lookup table that is stored as the id2
+ # chain. i think i may need to create a BlockParser class that wraps up all this mess.
+ #
+ # corresponds to:
+ # * _pst_getBlockOffsetPointer
+ # * _pst_getBlockOffset
+ def get_data_indirect offset
+ return get_data_indirect_io(offset).read
+
+ if offset == 0
+ nil
+ elsif (offset & 0xf) == 0xf
+ RangesIOID2.new(desc.pst, offset, idx2).read
+ else
+ low, high = offset & 0xf, offset >> 4
+ raise FormatError if low != 0 or (high & 0x1) != 0 or (high / 2) > @offset_table.length
+ from, to = @offset_table[high / 2]
+ data[from...to]
+ end
+ end
+
+ # like get_data_indirect, but returns an IO over the data. offset == 0 =>
+ # nil; low nibble 0xf => id2 lookup; otherwise high bits select a chunk
+ # and an entry in that chunk's offset table. malformed offsets warn and
+ # yield an empty StringIO rather than raising.
+ def get_data_indirect_io offset
+ if offset == 0
+ nil
+ elsif (offset & 0xf) == 0xf
+ if idx2[offset]
+ RangesIOID2.new desc.pst, offset, idx2
+ else
+ warn "tried to get idx2 record for #{offset} but failed"
+ return StringIO.new('')
+ end
+ else
+ low, high = offset & 0xf, offset >> 4
+ if low != 0 or (high & 0x1) != 0
+# raise FormatError,
+ warn "bad - #{low} #{high} (1)"
+ return StringIO.new('')
+ end
+ # lets see which block it should come from.
+ # presumably 4096 table slots per block - TODO confirm against spec
+ block_idx, i = high.divmod 4096
+ unless block_idx < @data_chunks.length
+ warn "bad - block_idx to high (not #{block_idx} < #{@data_chunks.length})"
+ return StringIO.new('')
+ end
+ data_chunk, offset_table = @data_chunks[block_idx], @offset_tables[block_idx]
+ if i / 2 >= offset_table.length
+ warn "bad - #{low} #{high} - #{i / 2} >= #{offset_table.length} (2)"
+ return StringIO.new('')
+ end
+ #warn "ok - #{low} #{high} #{offset_table.length}"
+ from, to = offset_table[i / 2]
+ StringIO.new data_chunk[from...to]
+ end
+ end
+
		# Decode a single raw property triple into its final ruby value.
		#
		# +key+ is the MAPI property tag, +type+ the MAPI value type, and
		# +value+ either an immediate value or a pointer to indirect data.
		# Returns the (possibly rewritten) [key, type, value] triple.
		def handle_indirect_values key, type, value
			case type
			when PT_BOOLEAN
				value = value != 0
			when *IMMEDIATE_TYPES # not including PT_BOOLEAN which we just did above
				# no processing currently applied (needed?).
			when *INDIRECT_TYPES
				# the value is a pointer
				if String === value # ie, value size > 4 above
					value = StringIO.new value
				else
					value = get_data_indirect_io(value)
				end
				# keep strings as immediate values for now, for compatibility with how i set up
				# Msg::Properties::ENCODINGS
				if value
					if type == PT_STRING8
						value = value.read
					elsif type == PT_UNICODE
						value = Ole::Types::FROM_UTF16.iconv value.read
					end
				end
				# special subject handling
				if key == PR_BODY_HTML and value
					# to keep the msg code happy, which thinks body_html will be an io
					# although, in 2003 version, they are 0102 already
					value = StringIO.new value unless value.respond_to?(:read)
				end
				if key == PR_SUBJECT and value
					# subjects are prefixed with a 2-byte marker; strip it off
					ignore, offset = value.unpack 'C2'
					offset = (offset == 1 ? nil : offset - 3)
					value = value[2..-1]
=begin
				index = value =~ /^[A-Z]*:/ ? $~[0].length - 1 : nil
				unless ignore == 1 and offset == index
					warn 'something wrong with subject hack' 
					$x = [ignore, offset, value]
					require 'irb'
					IRB.start
					exit
				end
=end
=begin
new idea:

making sense of the \001\00[156] i've seen prefixing subject. i think its to do with the placement
of the ':', or the ' '. And perhaps an optimization to do with thread topic, and ignoring the prefixes
added by mailers. thread topic is equal to subject with all that crap removed.

can test by creating some mails with bizarre subjects.

subject="\001\005RE: blah blah"
subject="\001\001blah blah"
subject="\001\032Out of Office AutoReply: blah blah"
subject="\001\020Undeliverable: blah blah"

looks like it

=end

					# now what i think, is that perhaps, value[offset..-1] ...
					# or something like that should be stored as a special tag. ie, do a double yield
					# for this case. probably PR_CONVERSATION_TOPIC, in which case i'd write instead:
					# yield [PR_SUBJECT, ref_type, value]
					# yield [PR_CONVERSATION_TOPIC, ref_type, value[offset..-1]
					# next # to skip the yield.
				end

				# special handling for embedded objects
				# used for attach_data for attached messages. in which case attach_method should == 5,
				# for embedded object.
				if type == PT_OBJECT and value
					value = value.read if value.respond_to?(:read)
					# first dword is the id2 key of the embedded object's data
					id2, unknown = value.unpack 'V2'
					io = RangesIOID2.new desc.pst, id2, idx2

					# hacky
					desc2 = OpenStruct.new(:desc => io, :pst => desc.pst, :list_index => desc.list_index, :children => [])
					# put nil instead of desc.list_index, otherwise the attachment is attached to itself ad infinitum.
					# should try and fix that FIXME
					# this shouldn't be done always. for an attached message, yes, but for an attached
					# meta file, for example, it shouldn't. difference between embedded_ole vs embedded_msg
					# really.
					# note that in the case where its a embedded ole, you actually get a regular serialized ole
					# object, so i need to create an ole storage object on a rangesioidxchain!
					# eg:
=begin
att.props.display_name # => "Picture (Metafile)"
io = att.props.attach_data
io.read(32).unpack('H*') # => ["d0cf11e0a1b11ae100000.... note the docfile signature.
# plug some missing rangesio holes:
def io.rewind; seek 0; end
def io.flush; raise IOError; end
ole = Ole::Storage.open io
puts ole.root.to_tree

- #<Dirent:"Root Entry">
  |- #<Dirent:"\001Ole" size=20 data="\001\000\000\002\000...">
  |- #<Dirent:"CONTENTS" size=65696 data="\327\315\306\232\000...">
  \- #<Dirent:"\003MailStream" size=12 data="\001\000\000\000[...">
=end
					# until properly fixed, i have disabled this code here, so this will break
					# nested messages temporarily.
					#value = Item.new desc2, RawPropertyStore.new(desc2).to_a
					#desc2.list_index = nil
					value = io
				end
			# this is PT_MV_STRING8, i guess.
			# should probably have the 0x1000 flag, and do the or-ring.
			# example of 0x1102 is PR_OUTLOOK_2003_ENTRYIDS. less sure about that one.
			when 0x101e, 0x1102
				# example data:
				# 0x802b "\003\000\000\000\020\000\000\000\030\000\000\000#\000\000\000BusinessCompetitionFavorites"
				# this 0x802b would be an extended attribute for categories / keywords.
				value = get_data_indirect_io(value).read unless String === value
				# layout: count (V), then count offsets (V each), then the packed strings
				num = value.unpack('V')[0]
				offsets = value[4, 4 * num].unpack("V#{num}")
				value = (offsets + [value.length]).to_enum(:each_cons, 2).map { |from, to| value[from...to] }
				value.map! { |str| StringIO.new str } if type == 0x1102
			else
				name = Mapi::Types::DATA[type].first rescue nil
				warn '0x%04x %p' % [key, get_data_indirect_io(value).read]
				raise NotImplementedError, 'unsupported mapi property type - 0x%04x (%p)' % [type, name]
			end
			[key, type, value]
		end
+ end
+
+=begin
+* recipients:
+
+ affects: ["0x200764", "0x2011c4", "0x201b24", "0x201b44", "0x201ba4", "0x201c24", "0x201cc4", "0x202504"]
+
+after adding the rawpropertystoretable fix, all except the second parse properly, and satisfy:
+
+ item.props.display_to == item.recipients.map { |r| r.props.display_name if r.props.recipient_type == 1 }.compact * '; '
+
+only the second still has a problem
+
+#[#<struct Pst::Desc desc_id=0x2011c4, idx_id=0x397c, idx2_id=0x398a, parent_desc_id=0x8082>]
+
+think this is related to a multi block #data3. ie, when you use @x * rec_size, and it
goes > 8190, or thereabouts, then it stuffs up. probably there is header gunk, or something,
+similar to when #data is multi block.
+
+same problem affects the attachment table in test4.
+
+fixed that issue. round data3 ranges to rec_size.
+
+fix other issue with attached objects.
+
+all recipients and attachments in test2 are fine.
+
+only remaining issue is test4 recipients of 200044. strange.
+
+=end
+
+ # RawPropertyStore is used to iterate through the properties of an item, or the auxiliary
+ # data for an attachment. its just a parser for the way the properties are serialized, when the
+ # properties don't have to conform to a column structure.
+ #
+ # structure of this chunk of data is often
+ # header, property keys, data values, and then indexes.
+ # the property keys has value in it. value can be the actual value if its a short type,
	# otherwise you look up the value in the indices, where you get the offsets to use in the
+ # main data body. due to the indirect thing though, any of these parts could actually come
+ # from a separate stream.
+ class RawPropertyStore < BlockParser
+ include Enumerable
+
+ attr_reader :length
+ def initialize desc
+ super
+ raise FormatError, "expected type 1 - got #{@type}" unless @type == 1
+
+ # the way that offset works, data1 may be a subset of buf, or something from id2. if its from buf,
+ # it will be offset based on index_offset and offset. so it could be some random chunk of data anywhere
+ # in the thing.
+ header_data = get_data_indirect @offset1
+ raise FormatError if header_data.length < 8
+ signature, offset2 = header_data.unpack 'V2'
+ #p [@type, signature]
+ raise FormatError, 'unhandled block signature 0x%08x' % @type if signature != 0x000602b5
+ # this is actually a big chunk of tag tuples.
+ @index_data = get_data_indirect offset2
+ @length = @index_data.length / 8
+ end
+
+ # iterate through the property tuples
+ def each
+ length.times do |i|
+ key, type, value = handle_indirect_values(*@index_data[8 * i, 8].unpack('vvV'))
+ yield key, type, value
+ end
+ end
+ end
+
+ # RawPropertyStoreTable is kind of like a database table.
+ # it has a fixed set of columns.
+ # #[] is kind of like getting a row from the table.
+ # those rows are currently encapsulated by Row, which has #each like
+ # RawPropertyStore.
+ # only used for the recipients array, and the attachments array. completely lazy, doesn't
+ # load any of the properties upon creation.
+ class RawPropertyStoreTable < BlockParser
+ class Column < Struct.new(:ref_type, :type, :ind2_off, :size, :slot)
+ def initialize data
+ super(*data.unpack('v3CC'))
+ end
+
+ def nice_type_name
+ Mapi::Types::DATA[ref_type].first[/_(.*)/, 1].downcase rescue '0x%04x' % ref_type
+ end
+
+ def nice_prop_name
+ Mapi::PropertyStore::TAGS['%04x' % type].first[/_(.*)/, 1].downcase rescue '0x%04x' % type
+ end
+
+ def inspect
+ "#<#{self.class} name=#{nice_prop_name.inspect}, type=#{nice_type_name.inspect}>"
+ end
+ end
+
+ include Enumerable
+
+ attr_reader :length, :index_data, :data2, :data3, :rec_size
+ def initialize desc
+ super
+ raise FormatError, "expected type 2 - got #{@type}" unless @type == 2
+
+ header_data = get_data_indirect @offset1
+ # seven_c_blk
+ # often: u1 == u2 and u3 == u2 + 2, then rec_size == u3 + 4. wtf
+ seven_c, @num_list, u1, u2, u3, @rec_size, b_five_offset,
+ ind2_offset, u7, u8 = header_data[0, 22].unpack('CCv4V2v2')
+ @index_data = header_data[22..-1]
+
+ raise FormatError if @num_list != schema.length or seven_c != 0x7c
+ # another check
+ min_size = schema.inject(0) { |total, col| total + col.size }
+ # seem to have at max, 8 padding bytes on the end of the record. not sure if it means
+ # anything. maybe its just space that hasn't been reclaimed due to columns being
+ # removed or something. probably should just check lower bound.
+ range = (min_size..min_size + 8)
+ warn "rec_size seems wrong (#{range} !=== #{rec_size})" unless range === rec_size
+
+ header_data2 = get_data_indirect b_five_offset
+ raise FormatError if header_data2.length < 8
+ signature, offset2 = header_data2.unpack 'V2'
+ # ??? seems a bit iffy
+ # there's probably more to the differences than this, and the data2 difference below
+ expect = desc.pst.header.version_2003? ? 0x000404b5 : 0x000204b5
+ raise FormatError, 'unhandled block signature 0x%08x' % signature if signature != expect
+
+ # this holds all the row data
+ # handle multiple block issue.
+ @data3_io = get_data_indirect_io ind2_offset
+ if RangesIOIdxChain === @data3_io
+ @data3_idxs =
+ # modify ranges
+ ranges = @data3_io.ranges.map { |offset, size| [offset, size / @rec_size * @rec_size] }
+ @data3_io.instance_variable_set :@ranges, ranges
+ end
+ @data3 = @data3_io.read
+
+ # there must be something to the data in data2. i think data2 is the array of objects essentially.
+ # currently its only used to imply a length
+ # actually, at size 6, its just some auxiliary data. i'm thinking either Vv/vV, for 97, and something
+ # wider for 03. the second value is just the index (0...length), and the first value is
+ # some kind of offset i expect. actually, they were all id2 values, in another case.
+ # so maybe they're get_data_indirect values too?
+ # actually, it turned out they were identical to the PR_ATTACHMENT_ID2 values...
+ # id2_values = ie, data2.unpack('v*').to_enum(:each_slice, 3).transpose[0]
+ # table[i].assoc(PR_ATTACHMENT_ID2).last == id2_values[i], for all i.
+ @data2 = get_data_indirect(offset2) rescue nil
+ #if data2
+ # @length = (data2.length / 6.0).ceil
+ #else
+ # the above / 6, may have been ok for 97 files, but the new 0x0004 style block must have
+ # different size records... just use this instead:
+ # hmmm, actually, we can still figure it out:
+ @length = @data3.length / @rec_size
+ #end
+
+ # lets try and at least use data2 for a warning for now
+ if data2
+ data2_rec_size = desc.pst.header.version_2003? ? 8 : 6
+ warn 'somthing seems wrong with data3' unless @length == (data2.length / data2_rec_size)
+ end
+ end
+
+ def schema
+ @schema ||= index_data.scan(/.{8}/m).map { |data| Column.new data }
+ end
+
+ def [] idx
+ # handle funky rounding
+ Row.new self, idx * @rec_size
+ end
+
+ def each
+ length.times { |i| yield self[i] }
+ end
+
+ class Row
+ include Enumerable
+
+ def initialize array_parser, x
+ @array_parser, @x = array_parser, x
+ end
+
+ # iterate through the property tuples
+ def each
+ (@array_parser.index_data.length / 8).times do |i|
+ ref_type, type, ind2_off, size, slot = @array_parser.index_data[8 * i, 8].unpack 'v3CC'
+ # check this rescue too
+ value = @array_parser.data3[@x + ind2_off, size]
+# if INDIRECT_TYPES.include? ref_type
+ if size <= 4
+ value = value.unpack('V')[0]
+ end
+ #p ['0x%04x' % ref_type, '0x%04x' % type, (Msg::Properties::MAPITAGS['%04x' % type].first[/^.._(.*)/, 1].downcase rescue nil),
+ # value_orig, value, (get_data_indirect(value_orig.unpack('V')[0]) rescue nil), size, ind2_off, slot]
+ key, type, value = @array_parser.handle_indirect_values type, ref_type, value
+ yield key, type, value
+ end
+ end
+ end
+ end
+
	# Lazily exposes an item's attachment table (rows of properties per
	# attachment), merging in the per-attachment property store referenced
	# via PR_ATTACHMENT_ID2 where present.
	class AttachmentTable < BlockParser
		# a "fake" MAPI property name for this constant. if you get a mapi property with
		# this value, it is the id2 value to use to get attachment data.
		PR_ATTACHMENT_ID2 = 0x67f2

		attr_reader :desc, :table
		def initialize desc
			@desc = desc
			# no super, we only actually want BlockParser2#idx2
			@table = nil
			# no list_index means no id2 chain, hence no attachments
			return unless desc.list_index
			return unless idx = idx2[ID2_ATTACHMENTS]
			# FIXME make a fake desc.
			@desc2 = OpenStruct.new :desc => idx, :pst => desc.pst, :list_index => desc.list_index
			@table = RawPropertyStoreTable.new @desc2
		end

		# Returns an array of property-triple arrays, one per attachment
		# (empty when the item has no attachment table).
		def to_a
			return [] if !table
			table.map do |attachment|
				attachment = attachment.to_a
				#p attachment
				# potentially merge with yet more properties
				# this still seems pretty broken - especially the property overlap
				if attachment_id2 = attachment.assoc(PR_ATTACHMENT_ID2)
					#p attachment_id2.last
					#p idx2[attachment_id2.last]
					# NOTE: @desc2 is mutated and reused for each attachment row
					@desc2.desc = idx2[attachment_id2.last]
					RawPropertyStore.new(@desc2).each do |a, b, c|
						# merged properties overwrite existing records with the same key
						record = attachment.assoc a
						attachment << record = [] unless record
						record.replace [a, b, c]
					end
				end
				attachment
			end
		end
	end
+
+ # there is no equivalent to this in libpst. ID2_RECIPIENTS was just guessed given the above
+ # AttachmentTable.
+ class RecipientTable < BlockParser
+ attr_reader :desc, :table
+ def initialize desc
+ @desc = desc
+ # no super, we only actually want BlockParser2#idx2
+ @table = nil
+ return unless desc.list_index
+ return unless idx = idx2[ID2_RECIPIENTS]
+ # FIXME make a fake desc.
+ desc2 = OpenStruct.new :desc => idx, :pst => desc.pst, :list_index => desc.list_index
+ @table = RawPropertyStoreTable.new desc2
+ end
+
+ def to_a
+ return [] if !table
+ table.map { |x| x.to_a }
+ end
+ end
+
+ #
+ # higher level item code. wraps up the raw properties above, and gives nice
+ # objects to work with. handles item relationships too.
+ # ----------------------------------------------------------------------------
+ #
+
+ def self.make_property_set property_list
+ hash = property_list.inject({}) do |hash, (key, type, value)|
+ hash.update PropertySet::Key.new(key) => value
+ end
+ PropertySet.new hash
+ end
+
+ class Attachment < Mapi::Attachment
+ def initialize list
+ super Pst.make_property_set(list)
+
+ @embedded_msg = props.attach_data if Item === props.attach_data
+ end
+ end
+
+ class Recipient < Mapi::Recipient
+ def initialize list
+ super Pst.make_property_set(list)
+ end
+ end
+
	# A single pst item - either a folder or a message (or the root). Wraps
	# the raw property triples in a Mapi::Message, and provides tree
	# navigation via the descriptor records.
	class Item < Mapi::Message
		# Parsed form of a serialized entry id: unknown dword, 16-byte store
		# id, and the desc_id it points at.
		class EntryID < Struct.new(:u1, :entry_id, :id)
			UNPACK_STR = 'VA16V'

			def initialize data
				data = data.unpack(UNPACK_STR) if String === data
				super(*data)
			end
		end

		include RecursivelyEnumerable

		# type is :folder, :message, :root, or a special folder symbol;
		# parent is filled in lazily during traversal (see #each_child)
		attr_accessor :type, :parent

		def initialize desc, list, type=nil
			@desc = desc
			super Pst.make_property_set(list)

			# this is kind of weird, but the ids of the special folders are stored in a hash
			# when the root item is loaded
			if ipm_wastebasket_entryid
				desc.pst.special_folder_ids[ipm_wastebasket_entryid] = :wastebasket
			end

			if finder_entryid
				desc.pst.special_folder_ids[finder_entryid] = :finder
			end

			# and then here, those are used, along with a crappy heuristic to determine if we are an
			# item
=begin
i think the low bits of the desc_id can give some info on the type.

it seems that 0x4 is for regular messages (and maybe contacts etc)
0x2 is for folders, and 0x8 is for special things like rules etc, that aren't visible.
=end
			unless type
				# folder-ish properties present => folder, else message
				type = props.valid_folder_mask || ipm_subtree_entryid || props.content_count || props.subfolders ? :folder : :message
				if type == :folder
					# promote to :wastebasket / :finder when the id was registered above
					type = desc.pst.special_folder_ids[desc.desc_id] || type
				end
			end

			@type = type
		end

		# Yield each immediate child Item. For the root item this resolves
		# ipm_subtree_entryid and also pulls in the wastebasket and finder
		# folders, which may not be regular children of the subtree.
		def each_child
			id = ipm_subtree_entryid
			if id
				root = @desc.pst.desc_from_id id
				raise "couldn't find root" unless root
				raise 'both kinds of children' unless @desc.children.empty?
				children = root.children
				# lets look up the other ids we have.
				# typically the wastebasket one "deleted items" is in the children already, but
				# the search folder isn't.
				extras = [ipm_wastebasket_entryid, finder_entryid].compact.map do |id|
					root = @desc.pst.desc_from_id id
					warn "couldn't find root for id #{id}" unless root
					root
				end.compact
				# i do this instead of union, so as not to mess with the order of the
				# existing children.
				children += (extras - children)
				children
			else
				@desc.children
			end.each do |desc|
				item = @desc.pst.pst_parse_item(desc)
				item.parent = self
				yield item
			end
		end

		# '/'-joined display names from the root down to this item (root excluded)
		def path
			parents, item = [], self
			parents.unshift item while item = item.parent
			# remove root
			parents.shift
			parents.map { |item| item.props.display_name or raise 'unable to construct path' } * '/'
		end

		def children
			to_enum(:each_child).to_a
		end

		# these are still around because they do different stuff

		# Top of Personal Folder Record
		def ipm_subtree_entryid
			# NOTE(review): rescue nil swallows any parse failure; missing or
			# malformed entry ids just read as nil
			@ipm_subtree_entryid ||= EntryID.new(props.ipm_subtree_entryid.read).id rescue nil
		end

		# Deleted Items Folder Record
		def ipm_wastebasket_entryid
			@ipm_wastebasket_entryid ||= EntryID.new(props.ipm_wastebasket_entryid.read).id rescue nil
		end

		# Search Root Record
		def finder_entryid
			@finder_entryid ||= EntryID.new(props.finder_entryid.read).id rescue nil
		end

		# all these have been replaced with the method_missing below
=begin
		# States which folders are valid for this message store
		#def valid_folder_mask
		#	props[0x35df]
		#end

		# Number of emails stored in a folder
		def content_count
			props[0x3602] 
		end

		# Has children
		def subfolders
			props[0x360a]
		end
=end

		# i think i will change these, so they can inherit the lazyness from RawPropertyStoreTable.
		# so if you want the last attachment, you can get it without creating the others perhaps.
		# it just has to handle the no table at all case a bit more gracefully.

		def attachments
			@attachments ||= AttachmentTable.new(@desc).to_a.map { |list| Attachment.new list }
		end

		def recipients
			#[]
			@recipients ||= RecipientTable.new(@desc).to_a.map { |list| Recipient.new list }
		end

		# depth-first traversal of all descendants (self excluded)
		def each_recursive(&block)
			#p :self => self
			children.each do |child|
				#p :child => child
				block[child]
				child.each_recursive(&block)
			end
		end

		def inspect
			attrs = %w[display_name subject sender_name subfolders]
#			attrs = %w[display_name valid_folder_mask ipm_wastebasket_entryid finder_entryid content_count subfolders]
			str = attrs.map { |a| b = props.send a; " #{a}=#{b.inspect}" if b }.compact * ','

			type_s = type == :message ? 'Message' : type == :folder ? 'Folder' : type.to_s.capitalize + 'Folder'
			str2 = 'desc_id=0x%x' % @desc.desc_id

			!str.empty? ? "#<Pst::#{type_s} #{str2}#{str}>" : "#<Pst::#{type_s} #{str2} props=#{props.inspect}>" #\n" + props.transport_message_headers + ">"
		end
	end
+
+ # corresponds to
+ # * _pst_parse_item
+ def pst_parse_item desc
+ Item.new desc, RawPropertyStore.new(desc).to_a
+ end
+
+ #
+ # other random code
+ # ----------------------------------------------------------------------------
+ #
+
+ def dump_debug_info
+ puts "* pst header"
+ p header
+
+=begin
+Looking at the output of this, for blank-o1997.pst, i see this part:
+...
+- (26624,516) desc block data (overlap of 4 bytes)
+- (27136,516) desc block data (gap of 508 bytes)
+- (28160,516) desc block data (gap of 2620 bytes)
+...
+
+which confirms my belief that the block size for idx and desc is more likely 512
+=end
+ if 0 + 0 == 0
+ puts '* file range usage'
+ file_ranges =
+ # these 3 things, should account for most of the data in the file.
+ [[0, Header::SIZE, 'pst file header']] +
+ @idx_offsets.map { |offset| [offset, Index::BLOCK_SIZE, 'idx block data'] } +
+ @desc_offsets.map { |offset| [offset, Desc::BLOCK_SIZE, 'desc block data'] } +
+ @idx.map { |idx| [idx.offset, idx.size, 'idx id=0x%x (%s)' % [idx.id, idx.type]] }
+ (file_ranges.sort_by { |idx| idx.first } + [nil]).to_enum(:each_cons, 2).each do |(offset, size, name), next_record|
+ # i think there is a padding of the size out to 64 bytes
+ # which is equivalent to padding out the final offset, because i think the offset is
+ # similarly oriented
+ pad_amount = 64
+ warn 'i am wrong about the offset padding' if offset % pad_amount != 0
+ # so, assuming i'm not wrong about that, then we can calculate how much padding is needed.
+ pad = pad_amount - (size % pad_amount)
+ pad = 0 if pad == pad_amount
+ gap = next_record ? next_record.first - (offset + size + pad) : 0
+ extra = case gap <=> 0
+ when -1; ["overlap of #{gap.abs} bytes)"]
+ when 0; []
+ when +1; ["gap of #{gap} bytes"]
+ end
+ # how about we check that padding
+ @io.pos = offset + size
+ pad_bytes = @io.read(pad)
+ extra += ["padding not all zero"] unless pad_bytes == 0.chr * pad
+ puts "- #{offset}:#{size}+#{pad} #{name.inspect}" + (extra.empty? ? '' : ' [' + extra * ', ' + ']')
+ end
+ end
+
+ # i think the idea of the idx, and indeed the idx2, is just to be able to
+ # refer to data indirectly, which means it can get moved around, and you just update
+ # the idx table. it is simply a list of file offsets and sizes.
+ # not sure i get how id2 plays into it though....
+ # the sizes seem to be all even. is that a co-incidence? and the ids are all even. that
+ # seems to be related to something else (see the (id & 2) == 1 stuff)
+ puts '* idx entries'
+ @idx.each { |idx| puts "- #{idx.inspect}" }
+
+ # if you look at the desc tree, you notice a few things:
+ # 1. there is a desc that seems to be the parent of all the folders, messages etc.
+ # it is the one whose parent is itself.
+ # one of its children is referenced as the subtree_entryid of the first desc item,
+ # the root.
+ # 2. typically only 2 types of desc records have idx2_id != 0. messages themselves,
+ # and the desc with id = 0x61 - the xattrib container. everything else uses the
+ # regular ids to find its data. i think it should be reframed as small blocks and
+ # big blocks, but i'll look into it more.
+ #
+ # idx_id and idx2_id are for getting to the data. desc_id and parent_desc_id just define
+ # the parent <-> child relationship, and the desc_ids are how the items are referred to in
+ # entryids.
+ # note that these aren't unique! eg for 0, 4 etc. i expect these'd never change, as the ids
+ # are stored in entryids. whereas the idx and idx2 could be a bit more volatile.
+ puts '* desc tree'
+ # make a dummy root hold everything just for convenience
+ root = Desc.new ''
+ def root.inspect; "#<Pst::Root>"; end
+ root.children.replace @orphans
+ # this still loads the whole thing as a string for gsub. should use directo output io
+ # version.
+ puts root.to_tree.gsub(/, (parent_desc_id|idx2_id)=0x0(?!\d)/, '')
+
+ # this is fairly easy to understand, its just an attempt to display the pst items in a tree form
+ # which resembles what you'd see in outlook.
+ puts '* item tree'
+ # now streams directly
+ root_item.to_tree STDOUT
+ end
+
	# The root descriptor record.
	# NOTE(review): assumes @desc is ordered with the root record first -
	# confirm against the desc loading code.
	def root_desc
		@desc.first
	end
+
+ def root_item
+ item = pst_parse_item root_desc
+ item.type = :root
+ item
+ end
+
	# Convenience alias for #root_item.
	def root
		root_item
	end
+
+ # depth first search of all items
+ include Enumerable
+
+ def each(&block)
+ root = self.root
+ block[root]
+ root.each_recursive(&block)
+ end
+
+ def name
+ @name ||= root_item.props.display_name
+ end
+
+ def inspect
+ "#<Pst name=#{name.inspect} io=#{io.inspect}>"
+ end
+end
+end
+