#include #define bool int #define false 0 #define true 1 // RTF/HTML functions // -------------------- // // Sometimes in MAPI, the PR_BODY_HTML property contains the HTML of a message. // But more usually, the HTML is encoded inside the RTF body (which you get in the // PR_RTF_COMPRESSED property). These routines concern the decoding of the HTML // from this RTF body. // // An encoded htmlrtf file is a valid RTF document, but one which contains additional // html markup information in its comments, and sometimes contains the equivalent // rtf markup outside the comments. Therefore, when it is displayed by a plain, // simple RTF reader, the html comments are ignored and only the rtf markup has // effect. Typically, this rtf markup is not as rich as the html markup would have been. // But for an html-aware reader (such as the code below), we can ignore all the // rtf markup, and extract the html markup out of the comments, and get a valid // html document. // // There are actually two kinds of html markup in comments. Most of them are // prefixed by "\*\htmltagNNN", for some number NNN. But sometimes there's one // prefixed by "\*\mhtmltagNNN" followed by "\*\htmltagNNN". In this case, // the two are equivalent, but the m-tag is for a MIME Multipart/Mixed Message // and contains tags that refer to content-ids (e.g. img src="cid:072344a7") // while the normal tag just refers to a name (e.g. img src="fred.jpg"). // The code below keeps the m-tag and discards the normal tag. // If there are any m-tags like this, then the message also contains an // attachment with a PR_CONTENT_ID property e.g. "072344a7". Actually, // sometimes the m-tag is e.g. img src="http://outlook/welcome.html" and the // attachment has a PR_CONTENT_LOCATION "http://outlook/welcome.html" instead // of a PR_CONTENT_ID. // // This code is experimental. It works on my own message archive, of about // a thousand html-encoded messages, received in Outlook97 and Outlook2000 // and OutlookXP. 
But I can't guarantee that it will work on all rtf-encoded // messages. Indeed, it used to be the case that people would simply stick // {\fromhtml at the start of an html document, and } at the end, and send // this as RTF. If someone did this, then it will almost work in my function // but not quite. (Because I ignore \r and \n, and respect only \par. Thus, // any linefeeds in the erroneously encoded html will be ignored.) // ISRTFHTML -- Given an uncompressed RTF body of the message, this // function tells you whether it encodes some html. // [in] (buf,len) indicate the start and length of the uncompressed RTF body. // [return-value] true or false, for whether it really does encode some html. bool isrtfhtml(const char *buf,unsigned int len) { // We look for the words "\fromhtml" somewhere in the file. // If the rtf encodes text rather than html, then instead // it will only find "\fromtext". const char *c; for (c=buf; c='0' && *c<='9') {tag=tag*10+*c-'0'; c++;} if (*c==' ') c++; if (tag==ignore_tag) {while (c='0' && *c<='9') {tag=tag*10+*c-'0'; c++;} if (*c==' ') c++; ignore_tag=tag; } else if (strncmp(c,"\\par",4)==0) {strcpy(d,"\r\n"); d+=2; c+=4; if (*c==' ') c++;} else if (strncmp(c,"\\tab",4)==0) {strcpy(d," "); d+=3; c+=4; if (*c==' ') c++;} else if (strncmp(c,"\\li",3)==0) { c+=3; while (*c>='0' && *c<='9') c++; if (*c==' ') c++; } else if (strncmp(c,"\\fi-",4)==0) { c+=4; while (*c>='0' && *c<='9') c++; if (*c==' ') c++; } else if (strncmp(c,"\\'",2)==0) { unsigned int hi=c[2], lo=c[3]; if (hi>='0' && hi<='9') hi-='0'; else if (hi>='A' && hi<='Z') hi-='A'; else if (hi>='a' && hi<='z') hi-='a'; if (lo>='0' && lo<='9') lo-='0'; else if (lo>='A' && lo<='Z') lo-='A'; else if (lo>='a' && lo<='z') lo-='a'; *((unsigned char*)d) = (unsigned char)(hi*16+lo); c+=4; d++; } else if (strncmp(c,"\\pntext",7)==0) {c+=7; while (c