#
# = Introduction
#
# This file is mostly an attempt to port libpst to ruby, and simplify it in the process. It
# will leverage much of the existing MAPI => MIME conversion developed for Msg files, and as
# such is purely concerned with the file structure details.
#
# = TODO
# 
# 1. solve recipient table problem (test4).
#    this is done. it turned out to be due to id2 clashes. find a better solution.
# 2. check parse consistency. an initial conversion of a 30M file to pst shows
#    a number of messages converting badly. compare with libpst too.
# 3. xattribs
# 4. generalise the Mapi stuff better
# 5. refactor index load
# 6. msg serialization?
#

=begin

quick plan for cleanup.

have working tests for 97 and 03 file formats, so safe.

want to fix up:

64 bit unpacks scattered around. it's ugly. not sure how best to handle it, but am slightly tempted
to override String#unpack to support a 64 bit little endian unpack (like L vs N/V, for Q). one way or
another it needs fixing. could really slow everything else down if it's parsing the unpack strings
twice, once in ruby, for every single unpack i do :/

the index loading process, and the lack of shared code between the normal vs 64 bit variants, and Index vs Desc.
should be able to reduce the code by a factor of 4. also think I should move the load code into the class too. then
maybe have something like:

class Header
	def index_class
		version_2003 ? Index64 : Index
	end
end

def load_idx
	header.index_class.load_index
end

OR

def initialize
	@header = ...
	extend @header.index_class::Load
	load_idx
end

need to think about the role of the mapi code, and Pst::Item etc, but that layer can come later.

=end

require 'mapi'
require 'enumerator'
require 'ostruct'
require 'ole/ranges_io'

module Mapi
class Pst
	class FormatError < StandardError
	end

	# unfortunately there is no Q analogue which is little endian only.
	# so this treats 'T' as an unsigned quad word in little endian byte order, to avoid
	# polluting the rest of the code with manual 64 bit handling.
	#
	# didn't want to override String#unpack, because it's too hacky, and incomplete.
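	#
	# a minimal usage sketch (the first 8 bytes below are a little endian quad, followed
	# by a little endian dword):
	#
	#   Pst.unpack [1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0].pack('C*'), 'TV'
	#   # => [1, 2]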
	def self.unpack str, unpack_spec
		return str.unpack(unpack_spec) unless unpack_spec['T']
		@unpack_cache ||= {}
		t_offsets, new_spec = @unpack_cache[unpack_spec]
		unless t_offsets
			t_offsets = []
			offset = 0
			new_spec = ''
			unpack_spec.scan(/([^\d])_?(\*|\d+)?/o) do
				num_elems = $1.downcase == 'a' ? 1 : ($2 || 1).to_i
				if $1 == 'T'
					num_elems.times { |i| t_offsets << offset + i }
					new_spec << "V#{num_elems * 2}"
				else
					new_spec << $~[0]
				end
				offset += num_elems
			end
			@unpack_cache[unpack_spec] = [t_offsets, new_spec]
		end
		a = str.unpack(new_spec)
		t_offsets.each do |offset|
			low, high = a[offset, 2]
			a[offset, 2] = low && high ? low + (high << 32) : nil
		end
		a
	end

	#
	# this is the header and encryption encapsulation code
	# ----------------------------------------------------------------------------
	#

	# class which encapsulates the pst header
	class Header
		SIZE = 512
		MAGIC = 0x2142444e

		# these are the constants defined in libpst.c, that
		# are referenced in pst_open()
		INDEX_TYPE_OFFSET = 0x0A
		FILE_SIZE_POINTER = 0xA8
		FILE_SIZE_POINTER_64 = 0xB8
		SECOND_POINTER = 0xBC
		INDEX_POINTER = 0xC4
		SECOND_POINTER_64 = 0xE0
		INDEX_POINTER_64 = 0xF0
		ENC_OFFSET = 0x1CD

		attr_reader :magic, :index_type, :encrypt_type, :size
		attr_reader :index1_count, :index1, :index2_count, :index2
		attr_reader :version
		def initialize data
			@magic = data.unpack('N')[0]
			@index_type = data[INDEX_TYPE_OFFSET]
			@version = {0x0e => 1997, 0x17 => 2003}[@index_type]

			if version_2003?
				# don't know?
				# >> data1.unpack('V*').zip(data2.unpack('V*')).enum_with_index.select { |(c, d), i| c != d and not [46, 56, 60].include?(i) }.select { |(a, b), i| b == 0 }.map { |(a, b), i| [a / 256, i] }
				#   [8, 76], [32768, 84], [128, 89]
				# >> data1.unpack('C*').zip(data2.unpack('C*')).enum_with_index.select { |(c, d), i| c != d and not [184..187, 224..227, 240..243].any? { |r| r === i } }.select { |(a, b), i| b == 0 and ((Math.log(a) / Math.log(2)) % 1) < 0.0001 }
				#   [[[2, 0], 61], [[2, 0], 76], [[2, 0], 195], [[2, 0], 257], [[8, 0], 305], [[128, 0], 338], [[128, 0], 357]]
				# i have only 2 psts to base this guess on, so i can't really come up with anything
				# that looks reasonable yet. not sure what the offset is. unfortunately there is so
				# much in the header that isn't understood...
				@encrypt_type = 1

				@index2_count, @index2 = data[SECOND_POINTER_64 - 4, 8].unpack('V2')
				@index1_count, @index1 = data[INDEX_POINTER_64  - 4, 8].unpack('V2')

				@size = data[FILE_SIZE_POINTER_64, 4].unpack('V')[0]
			else
				@encrypt_type = data[ENC_OFFSET]

				@index2_count, @index2 = data[SECOND_POINTER - 4, 8].unpack('V2')
				@index1_count, @index1 = data[INDEX_POINTER  - 4, 8].unpack('V2')

				@size = data[FILE_SIZE_POINTER, 4].unpack('V')[0]
			end

			validate!
		end

		def version_2003?
			version == 2003
		end

		def encrypted?
			encrypt_type != 0
		end

		def validate!
			raise FormatError, "bad signature on pst file (#{'0x%x' % magic})" unless magic == MAGIC
			raise FormatError, "only index types 0x0e and 0x17 are handled (#{'0x%x' % index_type})" unless [0x0e, 0x17].include?(index_type)
			raise FormatError, "only encryption types 0 and 1 are handled (#{encrypt_type.inspect})" unless [0, 1].include?(encrypt_type)
		end
	end
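
	# a minimal usage sketch for Header (assuming 'test.pst' is a readable pst file):
	#
	#   header = Header.new File.open('test.pst', 'rb') { |f| f.read Header::SIZE }
	#   header.version     # => 1997 or 2003
	#   header.encrypted?  # => true when the file uses "compressible encryption"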

	# compressible encryption! :D
	#
	# simple substitution. see libpst.c
	# maybe test switching to String#tr!
	class CompressibleEncryption
		DECRYPT_TABLE = [
			0x47, 0xf1, 0xb4, 0xe6, 0x0b, 0x6a, 0x72, 0x48,
			0x85, 0x4e, 0x9e, 0xeb, 0xe2, 0xf8, 0x94, 0x53, # 0x0f
			0xe0, 0xbb, 0xa0, 0x02, 0xe8, 0x5a, 0x09, 0xab,
			0xdb, 0xe3, 0xba, 0xc6, 0x7c, 0xc3, 0x10, 0xdd, # 0x1f
			0x39, 0x05, 0x96, 0x30, 0xf5, 0x37, 0x60, 0x82,
			0x8c, 0xc9, 0x13, 0x4a, 0x6b, 0x1d, 0xf3, 0xfb, # 0x2f
			0x8f, 0x26, 0x97, 0xca, 0x91, 0x17, 0x01, 0xc4,
			0x32, 0x2d, 0x6e, 0x31, 0x95, 0xff, 0xd9, 0x23, # 0x3f
			0xd1, 0x00, 0x5e, 0x79, 0xdc, 0x44, 0x3b, 0x1a,
			0x28, 0xc5, 0x61, 0x57, 0x20, 0x90, 0x3d, 0x83, # 0x4f
			0xb9, 0x43, 0xbe, 0x67, 0xd2, 0x46, 0x42, 0x76,
			0xc0, 0x6d, 0x5b, 0x7e, 0xb2, 0x0f, 0x16, 0x29, # 0x5f
			0x3c, 0xa9, 0x03, 0x54, 0x0d, 0xda, 0x5d, 0xdf,
			0xf6, 0xb7, 0xc7, 0x62, 0xcd, 0x8d, 0x06, 0xd3, # 0x6f
			0x69, 0x5c, 0x86, 0xd6, 0x14, 0xf7, 0xa5, 0x66,
			0x75, 0xac, 0xb1, 0xe9, 0x45, 0x21, 0x70, 0x0c, # 0x7f
			0x87, 0x9f, 0x74, 0xa4, 0x22, 0x4c, 0x6f, 0xbf,
			0x1f, 0x56, 0xaa, 0x2e, 0xb3, 0x78, 0x33, 0x50, # 0x8f
			0xb0, 0xa3, 0x92, 0xbc, 0xcf, 0x19, 0x1c, 0xa7,
			0x63, 0xcb, 0x1e, 0x4d, 0x3e, 0x4b, 0x1b, 0x9b, # 0x9f
			0x4f, 0xe7, 0xf0, 0xee, 0xad, 0x3a, 0xb5, 0x59,
			0x04, 0xea, 0x40, 0x55, 0x25, 0x51, 0xe5, 0x7a, # 0xaf
			0x89, 0x38, 0x68, 0x52, 0x7b, 0xfc, 0x27, 0xae,
			0xd7, 0xbd, 0xfa, 0x07, 0xf4, 0xcc, 0x8e, 0x5f, # 0xbf
			0xef, 0x35, 0x9c, 0x84, 0x2b, 0x15, 0xd5, 0x77,
			0x34, 0x49, 0xb6, 0x12, 0x0a, 0x7f, 0x71, 0x88, # 0xcf
			0xfd, 0x9d, 0x18, 0x41, 0x7d, 0x93, 0xd8, 0x58,
			0x2c, 0xce, 0xfe, 0x24, 0xaf, 0xde, 0xb8, 0x36, # 0xdf
			0xc8, 0xa1, 0x80, 0xa6, 0x99, 0x98, 0xa8, 0x2f,
			0x0e, 0x81, 0x65, 0x73, 0xe4, 0xc2, 0xa2, 0x8a, # 0xef
			0xd4, 0xe1, 0x11, 0xd0, 0x08, 0x8b, 0x2a, 0xf2,
			0xed, 0x9a, 0x64, 0x3f, 0xc1, 0x6c, 0xf9, 0xec  # 0xff
		]

		ENCRYPT_TABLE = [nil] * 256
		DECRYPT_TABLE.each_with_index { |i, j| ENCRYPT_TABLE[i] = j }

		def self.decrypt_alt encrypted
			decrypted = ''
			encrypted.length.times { |i| decrypted << DECRYPT_TABLE[encrypted[i]] }
			decrypted
		end

		def self.encrypt_alt decrypted
			encrypted = ''
			decrypted.length.times { |i| encrypted << ENCRYPT_TABLE[decrypted[i]] }
			encrypted
		end

		# an alternate implementation that is possibly faster....
		# TODO - bench
		DECRYPT_STR, ENCRYPT_STR = [DECRYPT_TABLE, (0...256)].map do |values|
			values.map { |i| i.chr }.join.gsub(/([\^\-\\])/, "\\\\\\1")
		end
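
		# a rough benchmark sketch for the bench TODO above. "blob" is hypothetical - any
		# decent sized chunk of encrypted block data would do:
		#
		#   require 'benchmark'
		#   Benchmark.bm 7 do |x|
		#     x.report('table:') { 1000.times { CompressibleEncryption.decrypt_alt blob } }
		#     x.report('tr:')    { 1000.times { CompressibleEncryption.decrypt     blob } }
		#   end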

		def self.decrypt encrypted
			encrypted.tr ENCRYPT_STR, DECRYPT_STR
		end

		def self.encrypt decrypted
			decrypted.tr DECRYPT_STR, ENCRYPT_STR
		end
	end
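
	# the two substitutions are inverses of each other, so e.g. (sketch):
	#   CompressibleEncryption.decrypt(CompressibleEncryption.encrypt('some text')) # => "some text"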

	class RangesIOEncryptable < RangesIO
		def initialize io, mode='r', params={}
			mode, params = 'r', mode if Hash === mode
			@decrypt = !!params[:decrypt]
			super
		end

		def encrypted?
			@decrypt
		end

		def read limit=nil
			buf = super
			buf = CompressibleEncryption.decrypt(buf) if encrypted?
			buf
		end
	end
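
	# usage sketch (idx here is hypothetical - any Index record, using its offset and size
	# as the single range):
	#   io = RangesIOEncryptable.new pst.io, :ranges => [[idx.offset, idx.size]], :decrypt => pst.encrypted?
	#   io.read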

	attr_reader :io, :header, :idx, :desc, :special_folder_ids

	# corresponds to
	# * pst_open
	# * pst_load_index
	def initialize io
		@io = io
		io.pos = 0
		@header = Header.new io.read(Header::SIZE)

		# would prefer this to be in Header#validate, but it doesn't have the io size.
		# should perhaps downgrade this to just be a warning...
		raise FormatError, "header size field invalid (#{header.size} != #{io.size})" unless header.size == io.size

		load_idx
		load_desc
		load_xattrib

		@special_folder_ids = {}
	end
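
	# a minimal end to end sketch (assuming 0x21 is the root desc record, as noted in
	# load_desc_rec below):
	#
	#   pst  = Pst.new File.open('test.pst', 'rb')
	#   root = pst.pst_parse_item pst.desc_from_id(0x21)
	#   root.each_recursive { |item| puts item.inspect }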

	def encrypted?
		@header.encrypted?
	end

	# until i properly fix logging...
	def warn s
		Mapi::Log.warn s
	end

	#
	# this is the index and desc record loading code
	# ----------------------------------------------------------------------------
	#

	ToTree = Module.new

	module Index2
		BLOCK_SIZE = 512
		module RecursiveLoad
			def load_chain
				#...
			end
		end

		module Base
			def read
				#...
			end
		end

		class Version1997 < Struct.new(:a)#...)
			SIZE = 12

			include RecursiveLoad
			include Base
		end

		class Version2003 < Struct.new(:a)#...)
			SIZE = 24

			include RecursiveLoad
			include Base
		end
	end

	module Desc2
		module Base
			def desc
				#...
			end
		end

		class Version1997 < Struct.new(:a)#...)
			#include Index::RecursiveLoad
			include Base
		end

		class Version2003 < Struct.new(:a)#...)
			#include Index::RecursiveLoad
			include Base
		end
	end

	# more constants from libpst.c
	# these relate to the index block
	ITEM_COUNT_OFFSET = 0x1f0 # count byte
	LEVEL_INDICATOR_OFFSET = 0x1f3 # node or leaf
	BACKLINK_OFFSET = 0x1f8 # backlink u1 value

	# these 3 classes are used to hold various file records

	# pst_index
	class Index < Struct.new(:id, :offset, :size, :u1)
		UNPACK_STR = 'VVvv'
		SIZE = 12
		BLOCK_SIZE = 512 # index block size was 516 in libpst, but that seems bogus
		COUNT_MAX = 41 # max active items (ITEM_COUNT_OFFSET / Index::SIZE = 41)

		attr_accessor :pst
		def initialize data
			data = Pst.unpack data, UNPACK_STR if String === data
			super(*data)
		end

		def type
			@type ||= begin
				if id & 0x2 == 0
					:data
				else
					first_byte, second_byte = read.unpack('CC')
					if first_byte == 1
						raise second_byte unless second_byte == 1
						:data_chain_header
					elsif first_byte == 2
						raise second_byte unless second_byte == 0
						:id2_assoc
					else
						raise FormatError, 'unknown first byte for block - %p' % first_byte
					end
				end
			end
		end

		def data?
			(id & 0x2) == 0
		end

		def read decrypt=true
			# only data blocks are ever encrypted
			decrypt = false unless data?
			pst.pst_read_block_size offset, size, decrypt
		end

		# show all numbers in hex
		def inspect
			super.gsub(/=(\d+)/) { '=0x%x' % $1.to_i }.sub(/Index /, "Index type=#{type.inspect}, ")
		end
	end

	# mostly guesses.
	ITEM_COUNT_OFFSET_64 = 0x1e8
	LEVEL_INDICATOR_OFFSET_64 = 0x1eb # diff of 3 between these 2 as above...

	# inherits from Index, in order to get the same #type function.
	class Index64 < Index
		UNPACK_STR = 'TTvvV'
		SIZE = 24
		BLOCK_SIZE = 512
		COUNT_MAX = 20 # bit of a guess really. 512 / 24 = 21, but doesn't leave enough header room

		# this is the extra item on the end of the UNPACK_STR above
		attr_accessor :u2

		def initialize data
			data = Pst.unpack data, UNPACK_STR if String === data
			@u2 = data.pop
			super data
		end

		def inspect
			super.sub(/>$/, ', u2=%p>' % u2)
		end

		def self.load_chain io, header
			load_idx_rec io, header.index1, 0, 0
		end

		# almost identical to load code for Index, just different offsets and unpack strings.
		# can probably merge them, or write a generic load_tree function or something.
		def self.load_idx_rec io, offset, linku1, start_val
			io.seek offset
			buf = io.read BLOCK_SIZE
			idxs = []

			item_count = buf[ITEM_COUNT_OFFSET_64]
			raise "have too many active items in index (#{item_count})" if item_count > COUNT_MAX

			#idx = Index.new buf[BACKLINK_OFFSET, Index::SIZE]
			#raise 'blah 1' unless idx.id == linku1

			if buf[LEVEL_INDICATOR_OFFSET_64] == 0
				# leaf pointers
				# split the data into item_count index objects
				buf[0, SIZE * item_count].scan(/.{#{SIZE}}/mo).each_with_index do |data, i|
					idx = new data
					# first entry
					raise 'blah 3' if i == 0 and start_val != 0 and idx.id != start_val
					#idx.pst = self
					break if idx.id == 0
					idxs << idx
				end
			else
				# node pointers
				# split the data into item_count table pointers
				buf[0, SIZE * item_count].scan(/.{#{SIZE}}/mo).each_with_index do |data, i|
					start, u1, offset = Pst.unpack data, 'T3'
					# for the first value, we expect the start to be equal
					raise 'blah 3' if i == 0 and start_val != 0 and start != start_val
					break if start == 0
					idxs += load_idx_rec io, offset, u1, start
				end
			end

			idxs
		end
	end

	# pst_desc
	class Desc64 < Struct.new(:desc_id, :idx_id, :idx2_id, :parent_desc_id, :u2)
		UNPACK_STR = 'T3VV'
		SIZE = 32
		BLOCK_SIZE = 512 # descriptor block size was 520 in libpst, but that seems bogus
		COUNT_MAX = 15 # guess as per Index64

		include RecursivelyEnumerable

		attr_accessor :pst
		attr_reader :children
		def initialize data
			super(*Pst.unpack(data, UNPACK_STR))
			@children = []
		end

		def desc
			pst.idx_from_id idx_id
		end

		def list_index
			pst.idx_from_id idx2_id
		end

		def self.load_chain io, header
			load_desc_rec io, header.index2, 0, 0x21
		end

		def self.load_desc_rec io, offset, linku1, start_val
			io.seek offset
			buf = io.read BLOCK_SIZE
			descs = []
			item_count = buf[ITEM_COUNT_OFFSET_64]

			# not real desc
			#desc = Desc.new buf[BACKLINK_OFFSET, 4]
			#raise 'blah 1' unless desc.desc_id == linku1

			if buf[LEVEL_INDICATOR_OFFSET_64] == 0
				# leaf pointers
				raise "have too many active items in index (#{item_count})" if item_count > COUNT_MAX
				# split the data into item_count desc objects
				buf[0, SIZE * item_count].scan(/.{#{SIZE}}/mo).each_with_index do |data, i|
					desc = new data
					# first entry
					raise 'blah 3' if i == 0 and start_val != 0 and desc.desc_id != start_val
					break if desc.desc_id == 0
					descs << desc
				end
			else
				# node pointers
				raise "have too many active items in index (#{item_count})" if item_count > Index64::COUNT_MAX
				# split the data into item_count table pointers
				buf[0, Index64::SIZE * item_count].scan(/.{#{Index64::SIZE}}/mo).each_with_index do |data, i|
					start, u1, offset = Pst.unpack data, 'T3'
					# for the first value, we expect the start to be equal. note that start_val is
					# 0x21 rather than 0 for the initial call (unlike the idx case), so even for the
					# first, top level block we expect it to be equal. this means we assert that the
					# first desc record is always 0x21 (dec 33), because 0x21 is the pst root itself...
					raise 'blah 3' if i == 0 and start_val != -1 and start != start_val
					# this shouldn't really happen i'd imagine
					break if start == 0
					descs += load_desc_rec io, offset, u1, start
				end
			end

			descs
		end

		def each_child(&block)
			@children.each(&block)
		end
	end

	# _pst_table_ptr_struct
	class TablePtr < Struct.new(:start, :u1, :offset)
		UNPACK_STR = 'V3'
		SIZE = 12

		def initialize data
			data = data.unpack(UNPACK_STR) if String === data
			super(*data)
		end
	end

	# pst_desc
	# idx_id is a pointer to an idx record which gets the primary data stream for the Desc record.
	# idx2_id gets you an idx record, that when read gives you an ID2 association list, which just maps
	# another set of ids to index values
	class Desc < Struct.new(:desc_id, :idx_id, :idx2_id, :parent_desc_id)
		UNPACK_STR = 'V4'
		SIZE = 16
		BLOCK_SIZE = 512 # descriptor block size was 520 in libpst, but that seems bogus
		COUNT_MAX = 31 # max active desc records (ITEM_COUNT_OFFSET / Desc::SIZE = 31)

		include ToTree

		attr_accessor :pst
		attr_reader :children
		def initialize data
			super(*data.unpack(UNPACK_STR))
			@children = []
		end

		def desc
			pst.idx_from_id idx_id
		end

		def list_index
			pst.idx_from_id idx2_id
		end

		# show all numbers in hex
		def inspect
			super.gsub(/=(\d+)/) { '=0x%x' % $1.to_i }
		end
	end

	# corresponds to
	# * _pst_build_id_ptr
	def load_idx
		@idx = []
		@idx_offsets = []
		if header.version_2003?
			@idx = Index64.load_chain io, header
			@idx.each { |idx| idx.pst = self }
		else
			load_idx_rec header.index1, header.index1_count, 0
		end

		# we'll typically be accessing by id, so create a hash as a lookup cache
		@idx_from_id = {}
 		@idx.each do |idx|
			warn "there are duplicate idx records with id #{idx.id}" if @idx_from_id[idx.id]
			@idx_from_id[idx.id] = idx
		end
	end

	# load the flat idx table, which maps ids to file ranges. this is the recursive helper
	#
	# corresponds to
	# * _pst_build_id_ptr
	def load_idx_rec offset, linku1, start_val
		@idx_offsets << offset

		#_pst_read_block_size(pf, offset, BLOCK_SIZE, &buf, 0, 0) < BLOCK_SIZE)
		buf = pst_read_block_size offset, Index::BLOCK_SIZE, false

		item_count = buf[ITEM_COUNT_OFFSET]
		raise "have too many active items in index (#{item_count})" if item_count > Index::COUNT_MAX

		idx = Index.new buf[BACKLINK_OFFSET, Index::SIZE]
		raise 'blah 1' unless idx.id == linku1

		if buf[LEVEL_INDICATOR_OFFSET] == 0
			# leaf pointers
			# split the data into item_count index objects
			buf[0, Index::SIZE * item_count].scan(/.{#{Index::SIZE}}/mo).each_with_index do |data, i|
				idx = Index.new data
				# first entry
				raise 'blah 3' if i == 0 and start_val != 0 and idx.id != start_val
				idx.pst = self
				# this shouldn't really happen i'd imagine
				break if idx.id == 0
				@idx << idx
			end
		else
			# node pointers
			# split the data into item_count table pointers
			buf[0, TablePtr::SIZE * item_count].scan(/.{#{TablePtr::SIZE}}/mo).each_with_index do |data, i|
				table = TablePtr.new data
				# for the first value, we expect the start to be equal
				raise 'blah 3' if i == 0 and start_val != 0 and table.start != start_val
				# this shouldn't really happen i'd imagine
				break if table.start == 0
				load_idx_rec table.offset, table.u1, table.start
			end
		end
	end

	# most access to idx objects will use this function
	#
	# corresponds to
	# * _pst_getID
	def idx_from_id id
		@idx_from_id[id]
	end

	# corresponds to
	# * _pst_build_desc_ptr
	# * record_descriptor
	def load_desc
		@desc = []
		@desc_offsets = []
		if header.version_2003?
			@desc = Desc64.load_chain io, header
			@desc.each { |desc| desc.pst = self }
		else
			load_desc_rec header.index2, header.index2_count, 0x21
		end

		# first create a lookup cache
		@desc_from_id = {}
 		@desc.each do |desc|
			desc.pst = self
			warn "there are duplicate desc records with id #{desc.desc_id}" if @desc_from_id[desc.desc_id]
			@desc_from_id[desc.desc_id] = desc
		end

		# now turn the flat list of loaded desc records into a tree

		# well, they have no parent, so they're more like, the toplevel descs.
		@orphans = []
		# now assign each node to the parents child array, putting the orphans in the above
		@desc.each do |desc|
			parent = @desc_from_id[desc.parent_desc_id]
			# note, besides this, its possible to create other circular structures.
			if parent == desc
				# this actually happens usually, for the root_item it appears.
				#warn "desc record's parent is itself (#{desc.inspect})"
			# maybe add some more checks in here for circular structures
			elsif parent
				parent.children << desc
				next
			end
			@orphans << desc
		end

		# maybe change this to some sort of sane-ness check. orphans are expected
#		warn "have #{@orphans.length} orphan desc record(s)." unless @orphans.empty?
	end

	# load the flat list of desc records recursively
	#
	# corresponds to
	# * _pst_build_desc_ptr
	# * record_descriptor
	def load_desc_rec offset, linku1, start_val
		@desc_offsets << offset
		
		buf = pst_read_block_size offset, Desc::BLOCK_SIZE, false
		item_count = buf[ITEM_COUNT_OFFSET]

		# not real desc
		desc = Desc.new buf[BACKLINK_OFFSET, 4]
		raise 'blah 1' unless desc.desc_id == linku1

		if buf[LEVEL_INDICATOR_OFFSET] == 0
			# leaf pointers
			raise "have too many active items in index (#{item_count})" if item_count > Desc::COUNT_MAX
			# split the data into item_count desc objects
			buf[0, Desc::SIZE * item_count].scan(/.{#{Desc::SIZE}}/mo).each_with_index do |data, i|
				desc = Desc.new data
				# first entry
				raise 'blah 3' if i == 0 and start_val != 0 and desc.desc_id != start_val
				# this shouldn't really happen i'd imagine
				break if desc.desc_id == 0
				@desc << desc
			end
		else
			# node pointers
			raise "have too many active items in index (#{item_count})" if item_count > Index::COUNT_MAX
			# split the data into item_count table pointers
			buf[0, TablePtr::SIZE * item_count].scan(/.{#{TablePtr::SIZE}}/mo).each_with_index do |data, i|
				table = TablePtr.new data
				# for the first value, we expect the start to be equal. note that start_val is
				# 0x21 rather than 0 for the initial call (unlike the idx case), so even for the
				# first, top level block we expect it to be equal. this means we assert that the
				# first desc record is always 0x21 (dec 33)...
				raise 'blah 3' if i == 0 and start_val != -1 and table.start != start_val
				# this shouldn't really happen i'd imagine
				break if table.start == 0
				load_desc_rec table.offset, table.u1, table.start
			end
		end
	end

	# as for idx
	# 
	# corresponds to:
	# * _pst_getDptr
	def desc_from_id id
		@desc_from_id[id]
	end

	# corresponds to
	# * pst_load_extended_attributes
	def load_xattrib
		unless desc = desc_from_id(0x61)
			warn "no extended attributes desc record found"
			return
		end
		unless desc.desc
			warn "no desc idx for extended attributes"
			return
		end
		if desc.list_index
		end
		#warn "skipping loading xattribs"
		# FIXME implement loading xattribs
	end

	# corresponds to:
	# * _pst_read_block_size
	# * _pst_read_block ??
	# * _pst_ff_getIDblock_dec ??
	# * _pst_ff_getIDblock ??
	def pst_read_block_size offset, size, decrypt=true
		io.seek offset
		buf = io.read size
		warn "tried to read #{size} bytes but only got #{buf.length}" if buf.length != size
		encrypted? && decrypt ? CompressibleEncryption.decrypt(buf) : buf
	end

	#
	# id2 
	# ----------------------------------------------------------------------------
	#

	class ID2Assoc < Struct.new(:id2, :id, :table2)
		UNPACK_STR = 'V3'
		SIZE = 12

		def initialize data
			data = data.unpack(UNPACK_STR) if String === data
			super(*data)
		end
	end

	class ID2Assoc64 < Struct.new(:id2, :u1, :id, :table2)
		UNPACK_STR = 'VVT2'
		SIZE = 24

		def initialize data
			if String === data
				data = Pst.unpack data, UNPACK_STR
			end
			super(*data)
		end

		def self.load_chain idx
			buf = idx.read
			type, count = buf.unpack 'v2'
			unless type == 0x0002
				raise 'unknown id2 type 0x%04x' % type
				#return
			end
			id2 = []
			count.times do |i|
				assoc = new buf[8 + SIZE * i, SIZE]
				id2 << assoc
				if assoc.table2 != 0
					id2 += load_chain idx.pst.idx_from_id(assoc.table2)
				end
			end
			id2
		end
	end

	class ID2Mapping
		attr_reader :list
		def initialize pst, list
			@pst = pst
			@list = list
			# create a lookup. 
			@id_from_id2 = {}
			@list.each do |id2|
				# NOTE we keep the first value seen when there are duplicates (later ones are
				# skipped below). this "fixes" test4-o1997.pst for the time being.
				warn "there are duplicate id2 records with id #{id2.id2}" if @id_from_id2[id2.id2]
				next if @id_from_id2[id2.id2]
				@id_from_id2[id2.id2] = id2.id
			end
		end

		# TODO: fix logging
		def warn s
			Mapi::Log.warn s
		end

		# corresponds to:
		# * _pst_getID2
		def [] id
			#id2 = @list.find { |x| x.id2 == id }
			id = @id_from_id2[id]
			id and @pst.idx_from_id(id)
		end
	end

	def load_idx2 idx
		if header.version_2003?
			id2 = ID2Assoc64.load_chain idx
		else
			id2 = load_idx2_rec idx
		end
		ID2Mapping.new self, id2
	end
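
	# e.g. (sketch) - looking up one of the fixed id2 streams for a desc record:
	#   idx2 = pst.load_idx2 desc.list_index
	#   idx2[0x671]  # => the idx record behind the attachment table data, or nil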

	# corresponds to
	# * _pst_build_id2
	def load_idx2_rec idx
		# i should perhaps use a idx chain style read here?
		buf = pst_read_block_size idx.offset, idx.size, false
		type, count = buf.unpack 'v2'
		unless type == 0x0002
			raise 'unknown id2 type 0x%04x' % type
			#return
		end
		id2 = []
		count.times do |i|
			assoc = ID2Assoc.new buf[4 + ID2Assoc::SIZE * i, ID2Assoc::SIZE]
			id2 << assoc
			if assoc.table2 != 0
				id2 += load_idx2_rec idx_from_id(assoc.table2)
			end
		end
		id2
	end

	class RangesIOIdxChain < RangesIOEncryptable
		def initialize pst, idx_head
			@idxs = pst.id2_block_idx_chain idx_head
			# whether or not a given idx needs encrypting
			decrypts = @idxs.map do |idx|
				decrypt = (idx.id & 2) != 0 ? false : pst.encrypted?
			end.uniq
			raise NotImplementedError, 'partial encryption in RangesIOID2' if decrypts.length > 1
			decrypt = decrypts.first
			# convert idxs to ranges
			ranges = @idxs.map { |idx| [idx.offset, idx.size] }
			super pst.io, :ranges => ranges, :decrypt => decrypt
		end
	end

	class RangesIOID2 < RangesIOIdxChain
		def self.new pst, id2, idx2
			RangesIOIdxChain.new pst, idx2[id2]
		end
	end

	# corresponds to:
	# * _pst_ff_getID2block
	# * _pst_ff_getID2data
	# * _pst_ff_compile_ID
	def id2_block_idx_chain idx
		if (idx.id & 0x2) == 0
			[idx]
		else
			buf = idx.read
			type, fdepth, count = buf[0, 4].unpack 'CCv'
			unless type == 1 # libpst.c:3958
				warn 'Error in idx_chain - %p, %p, %p - attempting to ignore' % [type, fdepth, count]
				return [idx]
			end
			# there are 4 unaccounted for bytes here, 4...8
			if header.version_2003?
				ids = buf[8, count * 8].unpack("T#{count}")
			else
				ids = buf[8, count * 4].unpack('V*')
			end
			if fdepth == 1
				ids.map { |id| idx_from_id id }
			else
				ids.map { |id| id2_block_idx_chain idx_from_id(id) }.flatten
			end
		end
	end
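
	# sketch - reading the full data stream behind an index record, whether it's a plain
	# data block or a data chain (RangesIOIdxChain above wraps the same thing up as an IO):
	#   idxs = pst.id2_block_idx_chain idx
	#   data = idxs.map { |i| i.read }.join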

	#
	# main block parsing code. gets raw properties
	# ----------------------------------------------------------------------------
	#

	# the job of this class, is to take a desc record, and be able to enumerate through the
	# mapi properties of the associated thing.
	#
	# corresponds to
	# * _pst_parse_block
	# * _pst_process (in some ways. although perhaps thats more the Item::Properties#add_property)
	class BlockParser
		include Mapi::Types::Constants

		TYPES = {
			0xbcec => 1,
			0x7cec => 2,
			# type 3 is removed. an artifact of not handling the indirect blocks properly in libpst.
		}

		PR_SUBJECT = PropertySet::TAGS.find { |num, (name, type)| name == 'PR_SUBJECT' }.first.hex
		PR_BODY_HTML = PropertySet::TAGS.find { |num, (name, type)| name == 'PR_BODY_HTML' }.first.hex

		# this stuff could maybe be moved to Ole::Types? or leverage it somehow?
		# whether or not a type is immediate is more a property of the pst encoding though i expect.
		# what i probably can add is a generic concept of whether a type is of variadic length or not.

		# these lists are very incomplete. think they are largely copied from libpst

		IMMEDIATE_TYPES = [
			PT_SHORT, PT_LONG, PT_BOOLEAN
		]

		INDIRECT_TYPES = [
			PT_DOUBLE, PT_OBJECT,
			0x0014, # whats this? probably something like PT_LONGLONG, given the correspondence with the
							# ole variant types. (= VT_I8)
			PT_STRING8, PT_UNICODE, # unicode isn't in libpst, but added here for outlook 2003 down the track
			PT_SYSTIME,
			0x0048, # another unknown
			0x0102, # this is PT_BINARY vs PT_CLSID
			#0x1003, # these are vector types, but they're commented out for now because i'd expect that
			#0x1014, # there's extra decoding needed that i'm not doing. (probably just need a simple
			#        # PT_* => unpack string mapping for the immediate types, and just do unpack('V*') etc
			#0x101e,
			#0x1102
		]

		# the attachment and recipient arrays appear to be always stored with these fixed
		# id2 values. seems strange. are there other extra streams? can find out by making higher
		# level IO wrapper, which has the id2 value, and doing the diff of available id2 values versus
		# used id2 values in properties of an item.
		ID2_ATTACHMENTS = 0x671
		ID2_RECIPIENTS = 0x692

		attr_reader :desc, :data, :data_chunks, :offset_tables
		def initialize desc
			raise FormatError, "unable to get associated index record for #{desc.inspect}" unless desc.desc
			@desc = desc
			#@data = desc.desc.read
			if Pst::Index === desc.desc
				#@data = RangesIOIdxChain.new(desc.pst, desc.desc).read
				idxs = desc.pst.id2_block_idx_chain desc.desc
				# this gets me the plain index chain.
			else
				# fake desc
				#@data = desc.desc.read
				idxs = [desc.desc]
			end

			@data_chunks = idxs.map { |idx| idx.read }
			@data = @data_chunks.first

			load_header

			@index_offsets = [@index_offset] + @data_chunks[1..-1].map { |chunk| chunk.unpack('v')[0] }
			@offset_tables = []
			@ignored = []
			@data_chunks.zip(@index_offsets).each do |chunk, offset|
				ignore = chunk[offset, 2].unpack('v')[0]
				@ignored << ignore
#				p ignore
				@offset_tables.push offset_table = []
				# maybe it's ok if there aren't meant to be any values?
				raise FormatError if offset == 0
				offsets = chunk[offset + 2..-1].unpack('v*')
				#p offsets
				offsets[0, ignore + 2].each_cons 2 do |from, to|
					#next if to == 0
					raise FormatError, [from, to].inspect if from > to
					offset_table << [from, to]
				end
			end

			@offset_table = @offset_tables.first
			@idxs = idxs

			# now, we may have multiple different blocks
		end

		# a given desc record may or may not have associated idx2 data. we lazily load it here, so it will never
		# actually be requested unless get_data_indirect actually needs to use it.
		def idx2
			return @idx2 if @idx2
			raise FormatError, 'idx2 requested but no idx2 available' unless desc.list_index
			# should check this can't return nil
			@idx2 = desc.pst.load_idx2 desc.list_index
		end

		def load_header
			@index_offset, type, @offset1 = data.unpack 'vvV'
			raise FormatError, 'unknown block type signature 0x%04x' % type unless TYPES[type]
			@type = TYPES[type]
		end

		# based on the value of offset, return either some data from buf, or some data from the
		# id2 chain id2, where offset is some key into a lookup table that is stored as the id2
		# chain. i think i may need to create a BlockParser class that wraps up all this mess.
		#
		# corresponds to:
		# * _pst_getBlockOffsetPointer
		# * _pst_getBlockOffset
		def get_data_indirect offset
			return get_data_indirect_io(offset).read

			# NOTE - everything below is the older implementation; the early return above
			# means it is never reached.
			if offset == 0
				nil
			elsif (offset & 0xf) == 0xf
				RangesIOID2.new(desc.pst, offset, idx2).read
			else
				low, high = offset & 0xf, offset >> 4
				raise FormatError if low != 0 or (high & 0x1) != 0 or (high / 2) > @offset_table.length
				from, to = @offset_table[high / 2]
				data[from...to]
			end
		end

		def get_data_indirect_io offset
			if offset == 0
				nil
			elsif (offset & 0xf) == 0xf
				if idx2[offset]
					RangesIOID2.new desc.pst, offset, idx2
				else
					warn "tried to get idx2 record for #{offset} but failed"
					return StringIO.new('')
				end
			else
				low, high = offset & 0xf, offset >> 4
				if low != 0 or (high & 0x1) != 0
#				raise FormatError, 
					warn "bad - #{low} #{high} (1)" 
					return StringIO.new('')
				end
				# lets see which block it should come from.
				block_idx, i = high.divmod 4096
				unless block_idx < @data_chunks.length
					warn "bad - block_idx too high (not #{block_idx} < #{@data_chunks.length})"
					return StringIO.new('')
				end
				data_chunk, offset_table = @data_chunks[block_idx], @offset_tables[block_idx]
				if i / 2 >= offset_table.length
					warn "bad - #{low} #{high} - #{i / 2} >= #{offset_table.length} (2)"
					return StringIO.new('')
				end
				#warn "ok  - #{low} #{high} #{offset_table.length}"
				from, to = offset_table[i / 2]
				StringIO.new data_chunk[from...to]
			end
		end

		def handle_indirect_values key, type, value
			case type
			when PT_BOOLEAN
				value = value != 0
			when *IMMEDIATE_TYPES # not including PT_BOOLEAN which we just did above
				# no processing currently applied (needed?).
			when *INDIRECT_TYPES
				# the value is a pointer
				if String === value # ie, value size > 4 above
					value = StringIO.new value
				else
					value = get_data_indirect_io(value)
				end
				# keep strings as immediate values for now, for compatibility with how i set up
				# Msg::Properties::ENCODINGS
				if value
					if type == PT_STRING8
						value = value.read
					elsif type == PT_UNICODE
						value = Ole::Types::FROM_UTF16.iconv value.read
					end
				end
				# special subject handling
				if key == PR_BODY_HTML and value
					# to keep the msg code happy, which thinks body_html will be an io
					# although, in 2003 version, they are 0102 already
					value = StringIO.new value unless value.respond_to?(:read)
				end
				if key == PR_SUBJECT and value
					ignore, offset = value.unpack 'C2'
					offset = (offset == 1 ? nil : offset - 3)
					value = value[2..-1]
=begin
					index = value =~ /^[A-Z]*:/ ? $~[0].length - 1 : nil
					unless ignore == 1 and offset == index
						warn 'something wrong with subject hack' 
						$x = [ignore, offset, value]
						require 'irb'
						IRB.start
						exit
					end
=end
=begin
new idea:

making sense of the \001\00[156] i've seen prefixing subject. i think its to do with the placement
of the ':', or the ' '. And perhaps an optimization to do with thread topic, and ignoring the prefixes
added by mailers. thread topic is equal to subject with all that crap removed.

can test by creating some mails with bizarre subjects.

subject="\001\005RE: blah blah"
subject="\001\001blah blah"
subject="\001\032Out of Office AutoReply: blah blah"
subject="\001\020Undeliverable: blah blah"

looks like it

=end

					# now what i think, is that perhaps, value[offset..-1] ...
					# or something like that should be stored as a special tag. ie, do a double yield
					# for this case. probably PR_CONVERSATION_TOPIC, in which case i'd write instead:
					# yield [PR_SUBJECT, ref_type, value]
					# yield [PR_CONVERSATION_TOPIC, ref_type, value[offset..-1]
					# next # to skip the yield.
				end

				# special handling for embedded objects
				# used for attach_data for attached messages. in which case attach_method should == 5,
				# for embedded object.
				if type == PT_OBJECT and value
					value = value.read if value.respond_to?(:read)
					id2, unknown = value.unpack 'V2'
					io = RangesIOID2.new desc.pst, id2, idx2

					# hacky
					desc2 = OpenStruct.new(:desc => io, :pst => desc.pst, :list_index => desc.list_index, :children => [])
					# put nil instead of desc.list_index, otherwise the attachment is attached to itself ad infinitum.
					# should try and fix that FIXME
					# this shouldn't be done always. for an attached message, yes, but for an attached
					# meta file, for example, it shouldn't. difference between embedded_ole vs embedded_msg
					# really.
					# note that in the case where its a embedded ole, you actually get a regular serialized ole
					# object, so i need to create an ole storage object on a rangesioidxchain!
					# eg:
=begin
att.props.display_name # => "Picture (Metafile)"
io = att.props.attach_data
io.read(32).unpack('H*') # => ["d0cf11e0a1b11ae100000.... note the docfile signature.
# plug some missing rangesio holes:
def io.rewind; seek 0; end
def io.flush; raise IOError; end
ole = Ole::Storage.open io
puts ole.root.to_tree

- #<Dirent:"Root Entry">
  |- #<Dirent:"\001Ole" size=20 data="\001\000\000\002\000...">
  |- #<Dirent:"CONTENTS" size=65696 data="\327\315\306\232\000...">
  \- #<Dirent:"\003MailStream" size=12 data="\001\000\000\000[...">
=end
					# until properly fixed, i have disabled this code here, so this will break
					# nested messages temporarily.
					#value = Item.new desc2, RawPropertyStore.new(desc2).to_a
					#desc2.list_index = nil
					value = io
				end
			# this is PT_MV_STRING8, i guess.
			# should probably have the 0x1000 flag, and do the OR-ing.
			# example of 0x1102 is PR_OUTLOOK_2003_ENTRYIDS. less sure about that one.
			when 0x101e, 0x1102 
				# example data:
				# 0x802b "\003\000\000\000\020\000\000\000\030\000\000\000#\000\000\000BusinessCompetitionFavorites"
				# this 0x802b would be an extended attribute for categories / keywords.
				value = get_data_indirect_io(value).read unless String === value
				num = value.unpack('V')[0]
				offsets = value[4, 4 * num].unpack("V#{num}")
				value = (offsets + [value.length]).to_enum(:each_cons, 2).map { |from, to| value[from...to] }
				value.map! { |str| StringIO.new str } if type == 0x1102
			else
				name = Mapi::Types::DATA[type].first rescue nil
				warn '0x%04x %p' % [key, get_data_indirect_io(value).read]
				raise NotImplementedError, 'unsupported mapi property type - 0x%04x (%p)' % [type, name]
			end
			[key, type, value]
		end
	end

=begin
* recipients:

	affects: ["0x200764", "0x2011c4", "0x201b24", "0x201b44", "0x201ba4", "0x201c24", "0x201cc4", "0x202504"]

after adding the rawpropertystoretable fix, all except the second parse properly, and satisfy:

  item.props.display_to == item.recipients.map { |r| r.props.display_name if r.props.recipient_type == 1 }.compact * '; '

only the second still has a problem

#[#<struct Pst::Desc desc_id=0x2011c4, idx_id=0x397c, idx2_id=0x398a, parent_desc_id=0x8082>]

think this is related to a multi block #data3. ie, when you use @x * rec_size, and it
goes > 8190, or there abouts, then it stuffs up. probably there is header gunk, or something,
similar to when #data is multi block.

same problem affects the attachment table in test4. 

fixed that issue. round data3 ranges to rec_size. 

fix other issue with attached objects.

all recipients and attachments in test2 are fine.

only remaining issue is test4 recipients of 200044. strange.

=end

	# RawPropertyStore is used to iterate through the properties of an item, or the auxiliary
	# data for an attachment. its just a parser for the way the properties are serialized, when the
	# properties don't have to conform to a column structure.
	#
	# structure of this chunk of data is often
	#   header, property keys, data values, and then indexes.
	# the property keys has value in it. value can be the actual value if its a short type,
	# otherwise you lookup the value in the indicies, where you get the offsets to use in the
	# main data body. due to the indirect thing though, any of these parts could actually come
	# from a separate stream.
	class RawPropertyStore < BlockParser
		include Enumerable

		attr_reader :length
		def initialize desc
			super
			raise FormatError, "expected type 1 - got #{@type}" unless @type == 1

			# the way that offset works, data1 may be a subset of buf, or something from id2. if its from buf,
			# it will be offset based on index_offset and offset. so it could be some random chunk of data anywhere
			# in the thing.
			header_data = get_data_indirect @offset1
			raise FormatError if header_data.length < 8
			signature, offset2 = header_data.unpack 'V2'
			#p [@type, signature]
			raise FormatError, 'unhandled block signature 0x%08x' % signature if signature != 0x000602b5
			# this is actually a big chunk of tag tuples.
			@index_data = get_data_indirect offset2
			@length = @index_data.length / 8
		end

		# iterate through the property tuples
		def each
			length.times do |i|
				key, type, value = handle_indirect_values(*@index_data[8 * i, 8].unpack('vvV'))
				yield key, type, value
			end
		end
	end
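
	# usage sketch (desc being any Desc record with an associated idx record):
	#   RawPropertyStore.new(desc).each do |key, type, value|
	#     puts '0x%04x (0x%04x) => %p' % [key, type, value]
	#   end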

	# RawPropertyStoreTable is kind of like a database table.
	# it has a fixed set of columns.
	# #[] is kind of like getting a row from the table.
	# those rows are currently encapsulated by Row, which has #each like
	# RawPropertyStore.
	# only used for the recipients array, and the attachments array. completely lazy, doesn't
	# load any of the properties upon creation. 
	class RawPropertyStoreTable < BlockParser
		class Column < Struct.new(:ref_type, :type, :ind2_off, :size, :slot)
			def initialize data
				super(*data.unpack('v3CC'))
			end

			def nice_type_name
				Mapi::Types::DATA[ref_type].first[/_(.*)/, 1].downcase rescue '0x%04x' % ref_type
			end

			def nice_prop_name
				Mapi::PropertyStore::TAGS['%04x' % type].first[/_(.*)/, 1].downcase rescue '0x%04x' % type
			end

			def inspect
				"#<#{self.class} name=#{nice_prop_name.inspect}, type=#{nice_type_name.inspect}>"
			end
		end

		include Enumerable

		attr_reader :length, :index_data, :data2, :data3, :rec_size
		def initialize desc
			super
			raise FormatError, "expected type 2 - got #{@type}" unless @type == 2

			header_data = get_data_indirect @offset1
			# seven_c_blk
			# often: u1 == u2 and u3 == u2 + 2, then rec_size == u3 + 4. wtf
			seven_c, @num_list, u1, u2, u3, @rec_size, b_five_offset,
				ind2_offset, u7, u8 = header_data[0, 22].unpack('CCv4V2v2')
			@index_data = header_data[22..-1]

			raise FormatError if @num_list != schema.length or seven_c != 0x7c
			# another check
			min_size = schema.inject(0) { |total, col| total + col.size }
			# seem to have at max, 8 padding bytes on the end of the record. not sure if it means
			# anything. maybe its just space that hasn't been reclaimed due to columns being
			# removed or something. probably should just check lower bound. 
			range = (min_size..min_size + 8)
			warn "rec_size seems wrong (#{range} !=== #{rec_size})" unless range === rec_size

			header_data2 = get_data_indirect b_five_offset
			raise FormatError if header_data2.length < 8
			signature, offset2 = header_data2.unpack 'V2'
			# ??? seems a bit iffy
			# there's probably more to the differences than this, and the data2 difference below
			expect = desc.pst.header.version_2003? ? 0x000404b5 : 0x000204b5
			raise FormatError, 'unhandled block signature 0x%08x' % signature if signature != expect

			# this holds all the row data
			# handle multiple block issue.
			@data3_io = get_data_indirect_io ind2_offset
			if RangesIOIdxChain === @data3_io
				# round each range down to a multiple of @rec_size (the multi block data3 fix
				# mentioned in the notes above)
				@data3_idxs = ranges = @data3_io.ranges.map { |offset, size| [offset, size / @rec_size * @rec_size] }
				@data3_io.instance_variable_set :@ranges, ranges
			end
			@data3 = @data3_io.read

			# there must be something to the data in data2. i think data2 is the array of objects essentially.
			# currently its only used to imply a length
			# actually, at size 6, its just some auxiliary data. i'm thinking either Vv/vV, for 97, and something
			# wider for 03. the second value is just the index (0...length), and the first value is
			# some kind of offset i expect. actually, they were all id2 values, in another case.
			# so maybe they're get_data_indirect values too?
			# actually, it turned out they were identical to the PR_ATTACHMENT_ID2 values...
			# id2_values = ie, data2.unpack('v*').to_enum(:each_slice, 3).transpose[0]
			# table[i].assoc(PR_ATTACHMENT_ID2).last == id2_values[i], for all i. 
			@data2 = get_data_indirect(offset2) rescue nil
			#if data2
			#	@length = (data2.length / 6.0).ceil
			#else
			# the above / 6, may have been ok for 97 files, but the new 0x0004 style block must have
			# different size records... just use this instead:
				# hmmm, actually, we can still figure it out:
				@length = @data3.length / @rec_size
			#end

			# lets try and at least use data2 for a warning for now
			if data2
				data2_rec_size = desc.pst.header.version_2003? ? 8 : 6
				warn 'something seems wrong with data3' unless @length == (data2.length / data2_rec_size)
			end
		end

		def schema
			@schema ||= index_data.scan(/.{8}/m).map { |data| Column.new data }
		end

		def [] idx
			# handle funky rounding
			Row.new self, idx * @rec_size
		end

		def each
			length.times { |i| yield self[i] }
		end

		class Row
			include Enumerable

			def initialize array_parser, x
				@array_parser, @x = array_parser, x
			end

			# iterate through the property tuples
			def each
				(@array_parser.index_data.length / 8).times do |i|
					ref_type, type, ind2_off, size, slot = @array_parser.index_data[8 * i, 8].unpack 'v3CC'
					# check this rescue too
					value = @array_parser.data3[@x + ind2_off, size]
#					if INDIRECT_TYPES.include? ref_type
					if size <= 4
						value = value.unpack('V')[0]
					end
					#p ['0x%04x' % ref_type, '0x%04x' % type, (Msg::Properties::MAPITAGS['%04x' % type].first[/^.._(.*)/, 1].downcase rescue nil),
					#		value_orig, value, (get_data_indirect(value_orig.unpack('V')[0]) rescue nil), size, ind2_off, slot]
					key, type, value = @array_parser.handle_indirect_values type, ref_type, value
					yield key, type, value
				end
			end
		end
	end
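
	# usage sketch (desc2 being a desc - possibly a fake one - whose data is a 0x7cec style
	# table, as set up in AttachmentTable / RecipientTable below):
	#   table = RawPropertyStoreTable.new desc2
	#   table.schema.map { |col| col.nice_prop_name }  # => the column names
	#   table.each { |row| p row.to_a }                # each row as [key, type, value] triples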

	class AttachmentTable < BlockParser
		# a "fake" MAPI property tag. if you get a mapi property with this tag, its value
		# is the id2 value to use to get the attachment data.
		PR_ATTACHMENT_ID2 = 0x67f2

		attr_reader :desc, :table
		def initialize desc
			@desc = desc
			# no super, we only actually want BlockParser2#idx2
			@table = nil
			return unless desc.list_index
			return unless idx = idx2[ID2_ATTACHMENTS]
			# FIXME make a fake desc.
			@desc2 = OpenStruct.new :desc => idx, :pst => desc.pst, :list_index => desc.list_index
			@table = RawPropertyStoreTable.new @desc2
		end

		def to_a
			return [] if !table
			table.map do |attachment|
				attachment = attachment.to_a
				#p attachment
				# potentially merge with yet more properties
				# this still seems pretty broken - especially the property overlap
				if attachment_id2 = attachment.assoc(PR_ATTACHMENT_ID2)
					#p attachment_id2.last
					#p idx2[attachment_id2.last]
					@desc2.desc = idx2[attachment_id2.last]
					RawPropertyStore.new(@desc2).each do |a, b, c|
						record = attachment.assoc a
						attachment << record = [] unless record
						record.replace [a, b, c]
					end
				end
				attachment
			end
		end
	end

	# there is no equivalent to this in libpst. ID2_RECIPIENTS was just guessed given the above
	# AttachmentTable.
	class RecipientTable < BlockParser
		attr_reader :desc, :table
		def initialize desc
			@desc = desc
			# no super, we only actually want BlockParser2#idx2
			@table = nil
			return unless desc.list_index
			return unless idx = idx2[ID2_RECIPIENTS]
			# FIXME make a fake desc.
			desc2 = OpenStruct.new :desc => idx, :pst => desc.pst, :list_index => desc.list_index
			@table = RawPropertyStoreTable.new desc2
		end

		def to_a
			return [] if !table
			table.map { |x| x.to_a }
		end
	end

	#
	# higher level item code. wraps up the raw properties above, and gives nice
	# objects to work with. handles item relationships too.
	# ----------------------------------------------------------------------------
	#

	def self.make_property_set property_list
		hash = property_list.inject({}) do |hash, (key, type, value)|
			hash.update PropertySet::Key.new(key) => value
		end
		PropertySet.new hash
	end
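
	# e.g. (sketch, assuming 0x0037 - the standard PR_SUBJECT tag - resolves to #subject the
	# same way the named properties used below do):
	#   props = Pst.make_property_set [[0x0037, 0x001e, 'some subject']]
	#   props.subject  # => "some subject"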

	class Attachment < Mapi::Attachment
		def initialize list
			super Pst.make_property_set(list)

			@embedded_msg = props.attach_data if Item === props.attach_data
		end
	end

	class Recipient < Mapi::Recipient
		def initialize list
			super Pst.make_property_set(list)
		end
	end

	class Item < Mapi::Message
		class EntryID < Struct.new(:u1, :entry_id, :id)
			UNPACK_STR = 'VA16V'

			def initialize data
				data = data.unpack(UNPACK_STR) if String === data
				super(*data)
			end
		end

		include RecursivelyEnumerable

		attr_accessor :type, :parent

		def initialize desc, list, type=nil
			@desc = desc
			super Pst.make_property_set(list)

			# this is kind of weird, but the ids of the special folders are stored in a hash
			# when the root item is loaded
			if ipm_wastebasket_entryid
				desc.pst.special_folder_ids[ipm_wastebasket_entryid] = :wastebasket
			end

			if finder_entryid
				desc.pst.special_folder_ids[finder_entryid] = :finder
			end

			# and then here, those are used, along with a crappy heuristic, to determine what
			# type of item we are
=begin
i think the low bits of the desc_id can give some info on the type.

it seems that 0x4 is for regular messages (and maybe contacts etc)
0x2 is for folders, and 0x8 is for special things like rules etc, that aren't visible.
=end
			unless type
				type = props.valid_folder_mask || ipm_subtree_entryid || props.content_count || props.subfolders ? :folder : :message
				if type == :folder
					type = desc.pst.special_folder_ids[desc.desc_id] || type
				end
			end

			@type = type
		end

		def each_child
			id = ipm_subtree_entryid
			if id
				root = @desc.pst.desc_from_id id
				raise "couldn't find root" unless root
				raise 'both kinds of children' unless @desc.children.empty?
				children = root.children
				# let's look up the other ids we have.
				# typically the wastebasket one "deleted items" is in the children already, but
				# the search folder isn't.
				extras = [ipm_wastebasket_entryid, finder_entryid].compact.map do |id|
					root = @desc.pst.desc_from_id id
					warn "couldn't find root for id #{id}" unless root
					root
				end.compact
				# i do this instead of union, so as not to mess with the order of the
				# existing children.
				children += (extras - children)
				children
			else
				@desc.children
			end.each do |desc|
				item = @desc.pst.pst_parse_item(desc)
				item.parent = self
				yield item
			end
		end
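
		# a hedged usage sketch of the above (folder names are illustrative only):
		#
		#   pst.root.each_child do |child|
		#     puts "#{child.type}: #{child.props.display_name}"
		#   end
		#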

		def path
			parents, item = [], self
			parents.unshift item while item = item.parent
			# remove root
			parents.shift
			parents.map { |item| item.props.display_name or raise 'unable to construct path' } * '/'
		end
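
		# a hedged example, for a message sitting in a subfolder of the inbox
		# (only the parent folders appear; the item itself doesn't):
		#
		#   message.path   # => "Inbox/Some Subfolder"
		#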

		def children
			to_enum(:each_child).to_a
		end

		# these are kept as explicit methods (rather than relying on method_missing) because
		# they parse the raw property data into an EntryID first

		# Top of Personal Folder Record
		def ipm_subtree_entryid
			@ipm_subtree_entryid ||= EntryID.new(props.ipm_subtree_entryid.read).id rescue nil
		end

		# Deleted Items Folder Record
		def ipm_wastebasket_entryid
			@ipm_wastebasket_entryid ||= EntryID.new(props.ipm_wastebasket_entryid.read).id rescue nil
		end

		# Search Root Record
		def finder_entryid
			@finder_entryid ||= EntryID.new(props.finder_entryid.read).id rescue nil
		end

		# all these have been replaced with the method_missing below
=begin
		# States which folders are valid for this message store 
		#def valid_folder_mask
		#	props[0x35df]
		#end

		# Number of emails stored in a folder
		def content_count
			props[0x3602] 
		end

		# Has children
		def subfolders
			props[0x360a]
		end
=end

		# i think i will change these, so they can inherit the laziness from RawPropertyStoreTable.
		# then, if you want the last attachment, you could get it without creating the others.
		# it just has to handle the no-table-at-all case a bit more gracefully.

		def attachments
			@attachments ||= AttachmentTable.new(@desc).to_a.map { |list| Attachment.new list }
		end

		def recipients
			#[]
			@recipients ||= RecipientTable.new(@desc).to_a.map { |list| Recipient.new list }
		end
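
		# a hedged usage sketch; which properties are actually present varies by message,
		# attach_filename / display_name are just the usual MAPI names:
		#
		#   message.attachments.each { |a| puts a.props.attach_filename }
		#   message.recipients.each  { |r| puts r.props.display_name }
		#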

		def each_recursive(&block)
			#p :self => self
			children.each do |child|
				#p :child => child
				block[child]
				child.each_recursive(&block)
			end
		end

		def inspect
			attrs = %w[display_name subject sender_name subfolders]
#			attrs = %w[display_name valid_folder_mask ipm_wastebasket_entryid finder_entryid content_count subfolders]
			str = attrs.map { |a| b = props.send a; " #{a}=#{b.inspect}" if b }.compact * ','

			type_s = type == :message ? 'Message' : type == :folder ? 'Folder' : type.to_s.capitalize + 'Folder'
			str2 = 'desc_id=0x%x' % @desc.desc_id

			!str.empty? ? "#<Pst::#{type_s} #{str2}#{str}>" : "#<Pst::#{type_s} #{str2} props=#{props.inspect}>" #\n" + props.transport_message_headers + ">"
		end
	end

	# corresponds to
	# * _pst_parse_item
	def pst_parse_item desc
		Item.new desc, RawPropertyStore.new(desc).to_a
	end

	#
	# other random code
	# ----------------------------------------------------------------------------
	#

	def dump_debug_info
		puts "* pst header"
		p header

=begin
Looking at the output of this, for blank-o1997.pst, i see this part:
...
- (26624,516) desc block data (overlap of 4 bytes)
- (27136,516) desc block data (gap of 508 bytes)
- (28160,516) desc block data (gap of 2620 bytes)
...

which confirms my belief that the block size for idx and desc is more likely 512
=end
		if true # debug toggle: set to false to skip the file range dump
			puts '* file range usage'
			file_ranges =
				# these 3 things, should account for most of the data in the file.
				[[0, Header::SIZE, 'pst file header']] +
				@idx_offsets.map { |offset| [offset, Index::BLOCK_SIZE, 'idx block data'] } +
				@desc_offsets.map { |offset| [offset, Desc::BLOCK_SIZE, 'desc block data'] } +
				@idx.map { |idx| [idx.offset, idx.size, 'idx id=0x%x (%s)' % [idx.id, idx.type]] }
			(file_ranges.sort_by { |idx| idx.first } + [nil]).to_enum(:each_cons, 2).each do |(offset, size, name), next_record|
				# i think the size is padded out to a multiple of 64 bytes, which is equivalent to
				# padding out the final offset, because the offsets themselves seem to be 64 byte
				# aligned (checked just below)
				pad_amount = 64
				warn 'i am wrong about the offset padding' if offset % pad_amount != 0
				# so, assuming i'm not wrong about that, then we can calculate how much padding is needed.
				pad = pad_amount - (size % pad_amount)
				pad = 0 if pad == pad_amount
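				# worked example: the 516 byte desc blocks in the dump above give
				# 516 % 64 == 4, so pad == 64 - 4 == 60, and the padded end is
				# offset + 516 + 60 == offset + 576, a multiple of 64.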
				gap = next_record ? next_record.first - (offset + size + pad) : 0
				extra = case gap <=> 0
					when -1; ["overlap of #{gap.abs} bytes"]
					when  0; []
					when +1; ["gap of #{gap} bytes"]
				end
				# while we're at it, check that the padding bytes really are all zero
				@io.pos = offset + size
				pad_bytes = @io.read(pad)
				extra += ["padding not all zero"] unless pad_bytes == 0.chr * pad
				puts "- #{offset}:#{size}+#{pad} #{name.inspect}" + (extra.empty? ? '' : ' [' + extra * ', ' + ']')
			end
		end

		# i think the idea of the idx, and indeed the idx2, is just to be able to
		# refer to data indirectly, which means it can get moved around, and you just update
		# the idx table. it is simply a list of file offsets and sizes.
		# not sure i get how id2 plays into it though....
		# the sizes seem to be all even. is that a coincidence? and the ids are all even. that
		# seems to be related to something else (see the (id & 2) == 1 stuff)
		puts '* idx entries'
		@idx.each { |idx| puts "- #{idx.inspect}" }

		# if you look at the desc tree, you notice a few things:
		# 1. there is a desc that seems to be the parent of all the folders, messages etc.
		#    it is the one whose parent is itself.
		#    one of its children is referenced as the subtree_entryid of the first desc item,
		#    the root.
		# 2. typically only 2 types of desc records have idx2_id != 0. messages themselves,
		#    and the desc with id = 0x61 - the xattrib container. everything else uses the
		#    regular ids to find its data. i think it should be reframed as small blocks and
		#    big blocks, but i'll look into it more.
		#
		# idx_id and idx2_id are for getting to the data. desc_id and parent_desc_id just define
		# the parent <-> child relationship, and the desc_ids are how the items are referred to in
		# entryids.
		# note that these aren't unique! eg for 0, 4 etc. i expect these'd never change, as the ids
		# are stored in entryids. whereas the idx and idx2 could be a bit more volatile.
		puts '* desc tree'
		# make a dummy root hold everything just for convenience
		root = Desc.new ''
		def root.inspect; "#<Pst::Root>"; end
		root.children.replace @orphans
		# this still loads the whole thing as a string for the gsub. should use the direct io
		# output version instead.
		puts root.to_tree.gsub(/, (parent_desc_id|idx2_id)=0x0(?!\d)/, '')

		# this is fairly easy to understand, it's just an attempt to display the pst items in a
		# tree form which resembles what you'd see in outlook.
		puts '* item tree'
		# now streams directly
		root_item.to_tree STDOUT
	end

	def root_desc
		@desc.first
	end

	def root_item
		item = pst_parse_item root_desc
		item.type = :root
		item
	end

	def root
		root_item
	end

	# depth first search of all items
	include Enumerable

	def each(&block)
		root = self.root
		block[root]
		root.each_recursive(&block)
	end
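
	# a hedged usage sketch of the Enumerable interface (assuming Pst.new is handed an
	# open io, as the rest of this file does):
	#
	#   pst = Pst.new open('some.pst', 'rb')
	#   pst.select { |item| item.type == :message }.each do |message|
	#     puts message.props.subject
	#   end
	#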

	def name
		@name ||= root_item.props.display_name
	end
	
	def inspect
		"#<Pst name=#{name.inspect} io=#{io.inspect}>"
	end
end
end