From 9d1321cca685d4a25cdb615199ef464da3ba4d5d Mon Sep 17 00:00:00 2001 From: Peter Collingbourne Date: Thu, 18 Feb 2010 02:26:35 +0000 Subject: Implement decoding of Outlook msg/oft files --- vendor/ruby-msg/contrib/rtf2html.c | 155 +++++++++++++++++++++++++++++++++++++ 1 file changed, 155 insertions(+) create mode 100644 vendor/ruby-msg/contrib/rtf2html.c (limited to 'vendor/ruby-msg/contrib/rtf2html.c') diff --git a/vendor/ruby-msg/contrib/rtf2html.c b/vendor/ruby-msg/contrib/rtf2html.c new file mode 100644 index 000000000..937e22ff1 --- /dev/null +++ b/vendor/ruby-msg/contrib/rtf2html.c @@ -0,0 +1,155 @@ +#include <string.h> +#define bool int +#define false 0 +#define true 1 + +// RTF/HTML functions +// -------------------- +// +// Sometimes in MAPI, the PR_BODY_HTML property contains the HTML of a message. +// But more usually, the HTML is encoded inside the RTF body (which you get in the +// PR_RTF_COMPRESSED property). These routines concern the decoding of the HTML +// from this RTF body. +// +// An encoded htmlrtf file is a valid RTF document, but which contains additional +// html markup information in its comments, and sometimes contains the equivalent +// rtf markup outside the comments. Therefore, when it is displayed by a plain +// simple RTF reader, the html comments are ignored and only the rtf markup has +// effect. Typically, this rtf markup is not as rich as the html markup would have been. +// But for an html-aware reader (such as the code below), we can ignore all the +// rtf markup, and extract the html markup out of the comments, and get a valid +// html document. +// +// There are actually two kinds of html markup in comments. Most of them are +// prefixed by "\*\htmltagNNN", for some number NNN. But sometimes there's one +// prefixed by "\*\mhtmltagNNN" followed by "\*\htmltagNNN". In this case, +// the two are equivalent, but the m-tag is for a MIME Multipart/Mixed Message +// and contains tags that refer to content-ids (e.g. 
img src="cid:072344a7") +// while the normal tag just refers to a name (e.g. img src="fred.jpg") +// The code below keeps the m-tag and discards the normal tag. +// If there are any m-tags like this, then the message also contains an +// attachment with a PR_CONTENT_ID property e.g. "072344a7". Actually, +// sometimes the m-tag is e.g. img src="http://outlook/welcome.html" and the +// attachment has a PR_CONTENT_LOCATION "http://outlook/welcome.html" instead +// of a PR_CONTENT_ID. +// +// This code is experimental. It works on my own message archive, of about +// a thousand html-encoded messages, received in Outlook97 and Outlook2000 +// and OutlookXP. But I can't guarantee that it will work on all rtf-encoded +// messages. Indeed, it used to be the case that people would simply stick +// {\fromhtml at the start of an html document, and } at the end, and send +// this as RTF. If someone did this, then it will almost work in my function +// but not quite. (Because I ignore \r and \n, and respect only \par. Thus, +// any linefeeds in the erroneous encoded-html will be ignored.) + + + + + +// ISRTFHTML -- Given an uncompressed RTF body of the message, this +// function tells you whether it encodes some html. +// [in] (buf,len) indicate the start and length of the uncompressed RTF body. +// [return-value] true or false, for whether it really does encode some html +bool isrtfhtml(const char *buf,unsigned int len) +{ // We look for the words "\fromhtml" somewhere in the file. + // If the rtf encodes text rather than html, then instead + // it will only find "\fromtext". 
+ const char *c; + for (c=buf; c='0' && *c<='9') {tag=tag*10+*c-'0'; c++;} + if (*c==' ') c++; + if (tag==ignore_tag) {while (c='0' && *c<='9') {tag=tag*10+*c-'0'; c++;} + if (*c==' ') c++; + ignore_tag=tag; + } + else if (strncmp(c,"\\par",4)==0) {strcpy(d,"\r\n"); d+=2; c+=4; if (*c==' ') c++;} + else if (strncmp(c,"\\tab",4)==0) {strcpy(d," "); d+=3; c+=4; if (*c==' ') c++;} + else if (strncmp(c,"\\li",3)==0) + { c+=3; while (*c>='0' && *c<='9') c++; if (*c==' ') c++; + } + else if (strncmp(c,"\\fi-",4)==0) + { c+=4; while (*c>='0' && *c<='9') c++; if (*c==' ') c++; + } + else if (strncmp(c,"\\'",2)==0) + { unsigned int hi=c[2], lo=c[3]; + if (hi>='0' && hi<='9') hi-='0'; else if (hi>='A' && hi<='Z') hi-='A'; else if (hi>='a' && hi<='z') hi-='a'; + if (lo>='0' && lo<='9') lo-='0'; else if (lo>='A' && lo<='Z') lo-='A'; else if (lo>='a' && lo<='z') lo-='a'; + *((unsigned char*)d) = (unsigned char)(hi*16+lo); + c+=4; d++; + } + else if (strncmp(c,"\\pntext",7)==0) {c+=7; while (c