diff options
-rw-r--r-- | spec/fixtures/files/mislabelled-as-iso-8859-1.email | 20 | ||||
-rw-r--r-- | spec/lib/mail_handler/mail_handler_spec.rb | 15 |
2 files changed, 35 insertions, 0 deletions
diff --git a/spec/fixtures/files/mislabelled-as-iso-8859-1.email b/spec/fixtures/files/mislabelled-as-iso-8859-1.email new file mode 100644 index 000000000..6c8e6109e --- /dev/null +++ b/spec/fixtures/files/mislabelled-as-iso-8859-1.email @@ -0,0 +1,20 @@ +From foo@bar Thu Mar 01 15:02:33 2012 +Return-path: <foo@bar> +Envelope-to: foi@quux +Delivery-date: Thu, 01 Mar 2012 15:02:33 +0000 +Date: Thu, 01 Mar 2012 15:01:58 +0000 +Subject: some FOI request +To: foi@quux +From: foo@bar +MIME-Version: 1.0 +Content-Type: text/plain; charset="iso-8859-1" +Content-Transfer-Encoding: 7bit +Message-Id: <2468@bar.local> + +Dear Whoever, + +THERE'S A DASH NEXT – REQUEST FOR INFORMATION + +Best regards, +Other Person + diff --git a/spec/lib/mail_handler/mail_handler_spec.rb b/spec/lib/mail_handler/mail_handler_spec.rb index 6b01326ed..3f3be1f20 100644 --- a/spec/lib/mail_handler/mail_handler_spec.rb +++ b/spec/lib/mail_handler/mail_handler_spec.rb @@ -26,6 +26,21 @@ describe 'when creating a mail object from raw data' do MailHandler.get_part_body(mail).is_utf8?.should == true end + it 'should convert a Windows-1252 body mislabelled as ISO-8859-1 to UTF-8' do + mail = get_fixture_mail('mislabelled-as-iso-8859-1.email') + body = MailHandler.get_part_body(mail) + body.is_utf8?.should == true + # This email is broken in at least these two ways: + # 1. It contains a top bit set character (0x96) despite the + # "Content-Transfer-Encoding: 7bit" + # 2. The charset in the Content-Type header is "iso-8859-1" + # but 0x96 is actually a Windows-1252 en dash, which would + # be Unicode codepoint U+2013. It should be possible to + # spot the mislabelling, since 0x96 isn't a printable + # ISO-8859-1 character. + body.should match / \xe2\x80\x93 / + end + end describe 'when asked for the from name' do |