From 39cc341b8f6299fbf8a62b243d278d1e48c8def7 Mon Sep 17 00:00:00 2001 From: Wilmer van der Gaast Date: Tue, 3 Jan 2006 19:30:54 +0100 Subject: strip_html now replaces non-ASCII characters (entities like &eacute;) to their UTF-8 versions instead of Latin1. Also added &[aeiou]uml; entities to the list. However, I still don't know if this is really important anyway... --- util.c | 57 +++++++++++++++++++++++++++++++++------------------------ 1 file changed, 33 insertions(+), 24 deletions(-) diff --git a/util.c b/util.c index 57ee0522..3fad6314 100644 --- a/util.c +++ b/util.c @@ -180,34 +180,39 @@ time_t get_time(int year, int month, int day, int hour, int min, int sec) typedef struct htmlentity { char code[8]; - char is; + char is[4]; } htmlentity_t; /* FIXME: This is ISO8859-1(5) centric, so might cause problems with other charsets. */ -static htmlentity_t ent[] = +static const htmlentity_t ent[] = { - { "lt", '<' }, - { "gt", '>' }, - { "amp", '&' }, - { "quot", '"' }, - { "aacute", 'á' }, - { "eacute", 'é' }, - { "iacute", 'é' }, - { "oacute", 'ó' }, - { "uacute", 'ú' }, - { "agrave", 'à' }, - { "egrave", 'è' }, - { "igrave", 'ì' }, - { "ograve", 'ò' }, - { "ugrave", 'ù' }, - { "acirc", 'â' }, - { "ecirc", 'ê' }, - { "icirc", 'î' }, - { "ocirc", 'ô' }, - { "ucirc", 'û' }, - { "nbsp", ' ' }, - { "", 0 } + { "lt", "<" }, + { "gt", ">" }, + { "amp", "&" }, + { "quot", "\"" }, + { "aacute", "á" }, + { "eacute", "é" }, + { "iacute", "é" }, + { "oacute", "ó" }, + { "uacute", "ú" }, + { "agrave", "à" }, + { "egrave", "è" }, + { "igrave", "ì" }, + { "ograve", "ò" }, + { "ugrave", "ù" }, + { "acirc", "â" }, + { "ecirc", "ê" }, + { "icirc", "î" }, + { "ocirc", "ô" }, + { "ucirc", "û" }, + { "auml", "ä" }, + { "euml", "ë" }, + { "iuml", "ï" }, + { "ouml", "ö" }, + { "uuml", "ü" }, + { "nbsp", " " }, + { "", "" } }; void strip_html( char *in ) @@ -256,7 +261,11 @@ void strip_html( char *in ) for( i = 0; *ent[i].code; i ++ ) if( g_strncasecmp( ent[i].code, cs, strlen( ent[i].code ) ) 
== 0 ) { - *(s++) = ent[i].is; + int j; + + for( j = 0; ent[i].is[j]; j ++ ) + *(s++) = ent[i].is[j]; + matched = 1; break; } -- cgit v1.2.3