diff options
author | francis <francis> | 2008-06-04 20:00:41 +0000 |
---|---|---|
committer | francis <francis> | 2008-06-04 20:00:41 +0000 |
commit | db34071cfb64ffb81b48322ee007c231a21ac46b (patch) | |
tree | df25ecc1815e4958e637acb5b905a8cba7f4ff0d | |
parent | 1ee4c3baa402127e0b874ecc2d27a09125a84eb3 (diff) |
Strip non-UTF-8
-rw-r--r-- | app/models/incoming_message.rb | 9 |
1 file changed, 7 insertions, 2 deletions
diff --git a/app/models/incoming_message.rb b/app/models/incoming_message.rb index bee105165..28613279f 100644 --- a/app/models/incoming_message.rb +++ b/app/models/incoming_message.rb @@ -18,7 +18,7 @@ # Copyright (c) 2007 UK Citizens Online Democracy. All rights reserved. # Email: francis@mysociety.org; WWW: http://www.mysociety.org/ # -# $Id: incoming_message.rb,v 1.108 2008-06-04 19:16:25 francis Exp $ +# $Id: incoming_message.rb,v 1.109 2008-06-04 20:00:41 francis Exp $ # TODO # Move some of the (e.g. quoting) functions here into rblib, as they feel @@ -441,11 +441,16 @@ text = IncomingMessage.mask_string_multicharset(text, 'request-144-a724c835@what # Or is it good windows-1252, most likely text = Iconv.conv('utf-8', 'windows-1252', text) rescue Iconv::IllegalSequence - # Just use it even though it is nonsense - treat as UTF-8 + # Text looks like unlabelled nonsense, strip out anything that isn't UTF-8 + text = Iconv.conv('utf-8//IGNORE', 'utf-8', text) + "\n\n[ WhatDoTheyKnow note: The above text was badly encoded, and has had strange characters removed. ]" end end end + + # An assertion that we have ended up with UTF-8 XXX can remove as this should + # always be fine if code above is + Iconv.conv('utf-8', 'utf-8', text) # Fix DOS style linefeeds to Unix style ones (or other later regexps won't work) # Needed for e.g. http://www.whatdotheyknow.com/request/60/response/98 |