about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
-rw-r--r-- spec/fixtures/files/mislabelled-as-iso-8859-1.email 20
-rw-r--r-- spec/lib/mail_handler/mail_handler_spec.rb 15
2 files changed, 35 insertions, 0 deletions
diff --git a/spec/fixtures/files/mislabelled-as-iso-8859-1.email b/spec/fixtures/files/mislabelled-as-iso-8859-1.email
new file mode 100644
index 000000000..6c8e6109e
--- /dev/null
+++ b/spec/fixtures/files/mislabelled-as-iso-8859-1.email
@@ -0,0 +1,20 @@
+From foo@bar Thu Mar 01 15:02:33 2012
+Return-path: <foo@bar>
+Envelope-to: foi@quux
+Delivery-date: Thu, 01 Mar 2012 15:02:33 +0000
+Date: Thu, 01 Mar 2012 15:01:58 +0000
+Subject: some FOI request
+To: foi@quux
+From: foo@bar
+MIME-Version: 1.0
+Content-Type: text/plain; charset="iso-8859-1"
+Content-Transfer-Encoding: 7bit
+Message-Id: <2468@bar.local>
+
+Dear Whoever,
+
+THERE'S A DASH NEXT – REQUEST FOR INFORMATION
+
+Best regards,
+Other Person
+
diff --git a/spec/lib/mail_handler/mail_handler_spec.rb b/spec/lib/mail_handler/mail_handler_spec.rb
index 6b01326ed..3f3be1f20 100644
--- a/spec/lib/mail_handler/mail_handler_spec.rb
+++ b/spec/lib/mail_handler/mail_handler_spec.rb
@@ -26,6 +26,21 @@ describe 'when creating a mail object from raw data' do
MailHandler.get_part_body(mail).is_utf8?.should == true
end
+ it 'should convert a Windows-1252 body mislabelled as ISO-8859-1 to UTF-8' do
+ mail = get_fixture_mail('mislabelled-as-iso-8859-1.email')
+ body = MailHandler.get_part_body(mail)
+ body.is_utf8?.should == true
+ # The fixture email is broken in at least these two ways:
+ # 1. It contains a top-bit-set byte (0x96) despite the
+ # "Content-Transfer-Encoding: 7bit" header.
+ # 2. The charset in the Content-Type header is "iso-8859-1",
+ # but 0x96 is really a Windows-1252 en dash (Unicode U+2013).
+ # It should be possible to spot the mislabelling, since
+ # 0x96 is not a printable ISO-8859-1 character.
+ # The regexp below is U+2013 encoded as UTF-8 (bytes E2 80 93).
+ body.should match / \xe2\x80\x93 /
+ end
+
end
describe 'when asked for the from name' do