3 files changed, 367 insertions, 0 deletions
diff --git a/vendor/ruby-msg/contrib/rtf2html.c b/vendor/ruby-msg/contrib/rtf2html.c
new file mode 100644
index 000000000..937e22ff1
--- /dev/null
+++ b/vendor/ruby-msg/contrib/rtf2html.c
@@ -0,0 +1,155 @@
+#include <stdio.h>
+#include <string.h>
+#define bool int
+#define false 0
+#define true 1
+// RTF/HTML functions
+// --------------------
+//
+// Sometimes in MAPI, the PR_BODY_HTML property contains the HTML of a message.
+// But more usually, the HTML is encoded inside the RTF body (which you get in the
+// PR_RTF_COMPRESSED property). These routines concern the decoding of the HTML
+// from this RTF body.
+//
+// An encoded htmlrtf file is a valid RTF document, but which contains additional
+// html markup information in its comments, and sometimes contains the equivalent
+// rtf markup outside the comments. Therefore, when it is displayed by a plain
+// simple RTF reader, the html comments are ignored and only the rtf markup has
+// effect. Typically, this rtf markup is not as rich as the html markup would have been.
+// But for an html-aware reader (such as the code below), we can ignore all the
+// rtf markup, and extract the html markup out of the comments, and get a valid
+// html document.
+//
+// There are actually two kinds of html markup in comments. Most of them are
+// prefixed by "\*\htmltagNNN", for some number NNN. But sometimes there's one
+// prefixed by "\*\mhtmltagNNN" followed by "\*\htmltagNNN". In this case,
+// the two are equivalent, but the m-tag is for a MIME Multipart/Mixed Message
+// and contains tags that refer to content-ids (e.g. img src="cid:072344a7")
+// while the normal tag just refers to a name (e.g. img src="fred.jpg")
+// The code below keeps the m-tag and discards the normal tag.
+// If there are any m-tags like this, then the message also contains an
+// attachment with a PR_CONTENT_ID property e.g. "072344a7". Actually,
+// sometimes the m-tag is e.g. img src="http://outlook/welcome.html" and the
+// attachment has a PR_CONTENT_LOCATION "http://outlook/welcome.html" instead
+// of a PR_CONTENT_ID.
+//
+// This code is experimental. It works on my own message archive, of about
+// a thousand html-encoded messages, received in Outlook97 and Outlook2000
+// and OutlookXP. But I can't guarantee that it will work on all rtf-encoded
+// messages. Indeed, it used to be the case that people would simply stick
+// {\fromhtml at the start of an html document, and } at the end, and send
+// this as RTF. If someone did this, then it will almost work in my function
+// but not quite. (Because I ignore \r and \n, and respect only \par. Thus,
+// any linefeeds in the erroneous encoded-html will be ignored.)
+
+
+
+
+
+// ISRTFHTML -- Given an uncompressed RTF body of the message, this
+// function tells you whether it encodes some html.
+// [in] (buf,len) indicate the start and length of the uncompressed RTF body.
+// The body need not be null-terminated: nothing at or beyond buf+len is read.
+// [return-value] true only if the first "\from..." control word in the body is
+// "\fromhtml"; false if it is anything else (e.g. "\fromtext") or there is none.
+bool isrtfhtml(const char *buf,unsigned int len)
+{ // The first "\from" decides. memcmp plus an explicit check of the bytes that
+  // remain replaces strncmp, which could compare past the end of the body.
+  unsigned int i;
+  for (i=0; len>=5 && i<=len-5; i++)
+    if (memcmp(buf+i,"\\from",5)==0) return len-i>=9 && memcmp(buf+i,"\\fromhtml",9)==0;
+  return false;
+}
+
+
+
+
+// RTF_AT -- true if the n bytes of lit occur at c. Never reads at or beyond max.
+static int rtf_at(const char *c,const char *max,const char *lit,unsigned int n)
+{ return (unsigned long)(max-c)>=n && memcmp(c,lit,n)==0; }
+// RTF_HEXVAL -- value 0..15 of one hex digit, or 0 if ch is not a hex digit.
+static unsigned int rtf_hexval(unsigned char ch)
+{ if (ch>='0' && ch<='9') return ch-'0';
+  if (ch>='A' && ch<='F') return ch-'A'+10;
+  return (ch>='a' && ch<='f') ? ch-'a'+10 : 0;
+}
+// RTF_DIGITS -- skip the digits at c and one optional space after them, never passing max; *val gets the number.
+static char *rtf_digits(char *c,const char *max,int *val)
+{ unsigned int v=0; // unsigned, so an absurdly long number wraps rather than overflows
+  while (c<max && *c>='0' && *c<='9') {v=v*10+(unsigned int)(*c-'0'); c++;}
+  if (c<max && *c==' ') c++;
+  *val=(int)(v&0x7fffffffu); return c;
+}
+// DECODERTFHTML -- Given an uncompressed RTF body of the message, and assuming
+// that it contains encoded-html, this function turns it into regular html.
+// [in] (buf,*len) indicate the start and length of the uncompressed RTF body.
+// The body need not be null-terminated: nothing at or beyond buf+*len is read.
+// [out] the buffer is overwritten with the HTML version, null-terminated, and *len
+// is the number of bytes written, counting that null. (If *len was 0, nothing is done.)
+//
+// Notes: (1) because of how the encoding works, the HTML version is necessarily
+// shorter than the encoded version. That's why it's safe for the function to
+// place the decoded html in the same buffer that formerly held the encoded stuff.
+// (2) Some messages include characters \'XX, where XX is a hexadecimal number.
+// This function simply emits the byte with that value. The result will only make
+// sense if the right code-page is being used. I don't know how rtf specifies which
+// code page it wants.
+// (3) By experiment, I discovered that \pntext{..} and \liN and \fi-N are RTF
+// markup that should be removed. There might be other RTF markup that should
+// also be removed. But I don't know what else.
+// (4) Control words are matched by prefix only, so \pard is read as \par then "d".
+void decodertfhtml(char *buf,unsigned int *len)
+{ // c -- pointer to where we're reading from; it is never advanced beyond max
+  // d -- pointer to where we're writing to. Invariant: d<=c
+  // max -- how far we can read from (i.e. to the end of the original rtf)
+  // ignore_tag -- stores 'N': after \mhtmltagN, we will ignore the subsequent \htmltagN.
+  char *c=buf, *max=buf+*len, *d=buf; int ignore_tag=-1, tag;
+  if (*len==0) return; // no room even for the terminating null
+  // First, we skip forwards to the first \htmltag.
+  while (c<max && !rtf_at(c,max,"{\\*\\htmltag",11)) c++;
+  // Now work through the document. Our plan is as follows:
+  // * Ignore { and }. These are part of RTF markup.
+  // * Ignore \htmlrtf...\htmlrtf0. This is how RTF keeps its equivalent markup separate from the html.
+  // * Ignore \r and \n. The real carriage returns are stored in \par tags.
+  // * Ignore \pntext{..} and \liN and \fi-N. These are RTF junk.
+  // * Convert \par into \r\n, and \tab into three spaces
+  // * Convert \'XX into the byte whose value is the hex number XX
+  // * Convert \{ and \} into { and }. This is how RTF escapes its curly braces.
+  // * When we get \*\mhtmltagN, keep the tag, but ignore the subsequent \*\htmltagN
+  // * When we get \*\htmltagN, keep the tag as long as it isn't subsequent to a \*\mhtmltagN
+  // * All other text should be kept as it is.
+  while (c<max)
+  { if (*c=='{' || *c=='}' || *c=='\r' || *c=='\n') c++;
+    else if (rtf_at(c,max,"\\*\\htmltag",10))
+    { c=rtf_digits(c+10,max,&tag);
+      if (tag==ignore_tag) {while (c<max && *c!='}') c++; if (c<max) c++;}
+      ignore_tag=-1;
+    }
+    else if (rtf_at(c,max,"\\*\\mhtmltag",11)) c=rtf_digits(c+11,max,&ignore_tag);
+    else if (rtf_at(c,max,"\\par",4)) {*d++='\r'; *d++='\n'; c+=4; if (c<max && *c==' ') c++;}
+    else if (rtf_at(c,max,"\\tab",4)) {memset(d,' ',3); d+=3; c+=4; if (c<max && *c==' ') c++;}
+    else if (rtf_at(c,max,"\\li",3)) c=rtf_digits(c+3,max,&tag);
+    else if (rtf_at(c,max,"\\fi-",4)) c=rtf_digits(c+4,max,&tag);
+    else if (max-c>=4 && rtf_at(c,max,"\\'",2))
+    { *((unsigned char*)d) = (unsigned char)(rtf_hexval(c[2])*16+rtf_hexval(c[3])); c+=4; d++;
+    }
+    else if (rtf_at(c,max,"\\pntext",7)) {c+=7; while (c<max && *c!='}') c++;}
+    else if (rtf_at(c,max,"\\htmlrtf",8))
+    { c++; while (c<max && !rtf_at(c,max,"\\htmlrtf0",9)) c++;
+      if (c<max) c+=9; if (c<max && *c==' ') c++;
+    }
+    else if (rtf_at(c,max,"\\{",2)) {*d++='{'; c+=2;}
+    else if (rtf_at(c,max,"\\}",2)) {*d++='}'; c+=2;}
+    else *d++=*c++;
+  }
+  *d++=0; *len = (unsigned int)(d-buf);
+}
+
+
+// Filter: reads encoded RTF (at most 1MB) from stdin and writes the decoded HTML to stdout.
+int main(void)
+{	static char buf[1024*1024]; // static: an array this large does not belong on the stack
+	unsigned int len = (unsigned int)fread(buf, 1, sizeof buf, stdin);
+	decodertfhtml(buf, &len);
+	return fwrite(buf, 1, len, stdout) == len ? 0 : 1;
+}
diff --git a/vendor/ruby-msg/contrib/rtfdecompr.c b/vendor/ruby-msg/contrib/rtfdecompr.c
new file mode 100644
index 000000000..633d50286
--- /dev/null
+++ b/vendor/ruby-msg/contrib/rtfdecompr.c
@@ -0,0 +1,105 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+// Read an unsigned 32-bit little-endian value from p one byte at a time, so the
+// result depends neither on the host's byte order nor on how p is aligned.
+static unsigned int read_le32(const unsigned char *p)
+{
+	return (unsigned int)p[0] | ((unsigned int)p[1] << 8) |
+		((unsigned int)p[2] << 16) | ((unsigned int)p[3] << 24);
+}
+
+// Decode one compressed-RTF stream (the format of PR_RTF_COMPRESSED) read from srcf
+// and write the plain RTF to stdout. Diagnostics go to stderr so they cannot end up
+// inside the output. A short or implausible header, a size mismatch, or running out
+// of memory ends the program with exit(1); an unknown magic number is only reported.
+// The CRC stored in the header is not checked.
+void decompress_rtf(FILE *srcf)
+{
+	// the window of decompressed bytes that can be referenced for copies.
+	// a circular buffer is used rather than indexing directly into the output,
+	// so that the output can be streamed. it starts out holding the fixed
+	// dictionary that every compressed stream may refer back to.
+	unsigned char buf[4096] =
+		"{\\rtf1\\ansi\\mac\\deff0\\deftab720{\\fonttbl;}"
+		"{\\f0\\fnil \\froman \\fswiss \\fmodern \\fscript "
+		"\\fdecor MS Sans SerifSymbolArialTimes New RomanCourier"
+		"{\\colortbl\\red0\\green0\\blue0\r\n\\par " // CR then LF, as in the format's dictionary
+		"\\pard\\plain\\f0\\fs20\\b\\i\\u\\tab\\tx";
+
+	#define BUF_MASK 4095
+
+	unsigned int wp = (unsigned int)strlen((char *)buf); // write position in the window
+	// get header fields (as defined in RTFLIB.H). compr_size counts the three
+	// fields after it (12 bytes), so anything smaller cannot be a valid header.
+	unsigned char hdr[16];
+	if (fread(hdr, 1, sizeof hdr, srcf) != sizeof hdr || read_le32(hdr) < 12) {
+		fprintf(stderr, "Invalid compressed-RTF header\n");
+		exit(1);
+	}
+	size_t src_len = read_le32(hdr) - 12; // payload bytes that follow the header
+	unsigned int uncompr_size = read_le32(hdr + 4);
+	unsigned int magic = read_le32(hdr + 8);
+
+	unsigned char *data = malloc(src_len ? src_len : 1);
+	if (data == NULL) {
+		fprintf(stderr, "out of memory\n");
+		exit(1);
+	}
+	size_t got = fread(data, 1, src_len, srcf);
+	if (got != src_len) {
+		fprintf(stderr, "compressed-RTF data size mismatch (%d != %d)\n", (int)got, (int)src_len);
+		exit(1);
+	}
+	// shouldn't be any more than that. hdr is reused as scratch space here: the
+	// old code read into a pointer that had not been pointed at anything yet.
+	if (fread(hdr, 1, sizeof hdr, srcf) > 0) {
+		fprintf(stderr, "warning: data after the size\n");
+	}
+
+	// process the data
+	const unsigned char *src = data, *end = data + src_len;
+	if (magic == 0x414c454d) { // magic number that identifies the stream as a uncompressed stream
+		// the payload is the RTF itself; never write more than was actually read
+		fwrite(src, 1, uncompr_size < src_len ? uncompr_size : src_len, stdout);
+	}
+	else if (magic == 0x75465a4c) { // magic number that identifies the stream as a compressed stream
+		unsigned int out = 0; // bytes written so far; uncompr_size is the limit
+		unsigned int flagCount = 0, flags = 0;
+		while (out < uncompr_size && src < end) {
+			// each flag byte flags 8 literals/references, 1 per bit
+			flags = (flagCount++ % 8 == 0) ? *src++ : flags >> 1;
+			if (flags & 1) { // each flag bit is 1 for reference, 0 for literal
+				if (end - src < 2) break; // truncated input: no room for a reference
+				unsigned int rp = *src++;
+				unsigned int l = *src++;
+				// offset is a 12 bit number. 2^12 is 4096, so thats fine
+				rp = (rp << 4) | (l >> 4); // the offset relative to block start
+				l = (l & 0xf) + 2; // the number of bytes to copy
+				if (rp == wp) break; // a reference to the write position marks the end
+				for (; l > 0 && out < uncompr_size; l--, out++) {
+					putchar(buf[wp] = buf[rp]);
+					wp = (wp + 1) & BUF_MASK;
+					rp = (rp + 1) & BUF_MASK;
+				}
+			}
+			else if (src < end) {
+				putchar(buf[wp] = *src++);
+				wp = (wp + 1) & BUF_MASK;
+				out++;
+			}
+		}
+	}
+	else { // unknown magic number
+		fprintf(stderr, "Unknown compression type (magic number %04x)", magic);
+	}
+	free(data);
+}
+
+int main(int argc, char *argv[])
+{	FILE *file = argc > 1 ? fopen(argv[1], "rb") : NULL; // argv[1] names the compressed-RTF file to decode
+	if (file == NULL) { fprintf(stderr, "usage: rtfdecompr <compressed-rtf-file>\n"); return 1; }
+	decompress_rtf(file); // the decoded RTF goes to stdout
+	return fclose(file) == 0 ? 0 : 1;
+}
diff --git a/vendor/ruby-msg/contrib/wmf.rb b/vendor/ruby-msg/contrib/wmf.rb
new file mode 100644
index 000000000..531e5fc99
--- /dev/null
+++ b/vendor/ruby-msg/contrib/wmf.rb
@@ -0,0 +1,107 @@
+
+# this file will be used later to enhance the msg conversion.
+
+# doesn't really work very well....
+
+def wmf_getdimensions wmf_data
+	# check if we have a placeable metafile
+	if wmf_data.unpack('L')[0] == 0x9ac6cdd7
+		# do check sum test
+		shorts = wmf_data.unpack 'S11'
+		warn 'bad wmf header checksum' unless shorts.pop == shorts.inject(0) { |a, b| a ^ b }
+		# determine dimensions
+		left, top, right, bottom, twips_per_inch = wmf_data[6, 10].unpack 'S5'
+		p [left, top, right, bottom, twips_per_inch]
+		[right - left, bottom - top].map { |i| (i * 96.0 / twips_per_inch).round }
+	else
+		[nil, nil]
+	end
+end
+
+=begin
+
+some attachment stuff
+rendering_position
+object_type
+attach_num
+attach_method
+
+rendering_position is around (1 << 32) - 1 if it's inline
+
+attach_method 1 for plain data?
+attach_method 6 for embedded ole
+
+display_name instead of reading the embedded ole type.
+
+
+PR_RTF_IN_SYNC property is missing or set to FALSE.
+
+
+Before reading from the uncompressed RTF stream, sort the message's attachment
+table on the value of the PR_RENDERING_POSITION property. The attachments will
+now be in order by how they appear in the message.
+
+As your client scans through the RTF stream, check for the token "\objattph".
+The character following the token is the place to put the next attachment from
+the sorted table. Handle attachments that have set their PR_RENDERING_POSITION
+property to -1 separately.
+
+eg from rtf.
+
+\b\f2\fs20{\object\objemb{\*\objclass PBrush}\objw1320\objh1274{\*\objdata
+01050000 <- looks like standard header
+02000000 <- not sure
+07000000 <- this means length of following is 7. 
+50427275736800 <- Pbrush\000 in hex
+00000000 <- ?
+00000000 <- ?
+e0570000 <- this is 22496. length of the following in hex
+this is the bitmap data, starting with BM....
+424dde57000000000000360000002800000058000000550000000100180000000000a857000000
+000000000000000000000000000000c8d0d4c8d0d4c8d0d4c8d0d4c8d0d4c8d0d4c8d0d4c8d0d4
+
+---------------
+
+tested 3 different embedded files:
+
+1. excel embedded
+   - "\002OlePres000"[40..-1] can be saved to '.wmf' and opened.
+   - "\002OlePres001" similarly.
+     much better looking image. strange
+   - For the rtf serialization, it has the file contents as an
+     ole, "d0cf11e" serialization, which i can't do yet. this can
+     be extracted as a working .xls
+     followed by a METAFILEPICT chunk, corresponding to one of the
+     ole pres chunks.
+     then the very same metafile chunk in the result bit.
+
+2. pbrush embedded image
+   - "\002OlePres000" wmf as above.
+   - "\001Ole10Native" is a long followed by a plain old .bmp
+   - Serialization:
+     Basic header as before, then bitmap data follows, then the
+     metafile chunk follows, though labeled PBrush again this time.
+     the result chunk was corrupted
+
+3. metafile embedded image
+   - no presentation section, just a
+   - "CONTENTS" section, which can be saved directly as a wmf.
+     different header to the other 2 metafiles. it starts with
+     9AC6CDD7, which is the Aldus placeable metafile header.
+     (http://wvware.sourceforge.net/caolan/ora-wmf.html)
+     you can decode the left, top, right, bottom, and then
+     multiply by 96, and divide by the metafile unit converter thing
+     to get pixel values.
+
+the above ones were always the plain metafiles
+word filetype (0 = memory, 1 = disk)
+word headersize (always 9)
+word version
+thus leading to the
+0100
+0900
+0003
+pattern i usually see.
+
+=end
+