diff options
Diffstat (limited to 'spec/lib')
-rw-r--r-- | spec/lib/basic_encoding_tests.rb | 157 | ||||
-rw-r--r-- | spec/lib/mail_handler/mail_handler_spec.rb | 80 |
2 files changed, 237 insertions, 0 deletions
diff --git a/spec/lib/basic_encoding_tests.rb b/spec/lib/basic_encoding_tests.rb new file mode 100644 index 000000000..35d35fd4a --- /dev/null +++ b/spec/lib/basic_encoding_tests.rb @@ -0,0 +1,157 @@ +# -*- coding: utf-8 -*- +require File.expand_path(File.dirname(__FILE__) + '/../spec_helper') + +def bytes_to_binary_string( bytes, claimed_encoding = nil ) + claimed_encoding ||= 'ASCII-8BIT' + bytes_string = bytes.pack('c*') + if RUBY_VERSION.to_f >= 1.9 + bytes_string.force_encoding! claimed_encoding + end + bytes_string +end + +random_string = bytes_to_binary_string [ 0x0f, 0x58, 0x1c, 0x8f, 0xa4, 0xcf, + 0xf6, 0x8c, 0x9d, 0xa7, 0x06, 0xd9, + 0xf7, 0x90, 0x6c, 0x6f] + +windows_1252_string = bytes_to_binary_string [ 0x44, 0x41, 0x53, 0x48, 0x20, + 0x96, 0x20, 0x44, 0x41, 0x53, + 0x48 ] + +# It's a shame this example is so long, but if we don't take enough it +# gets misinterpreted as Shift_JIS + +gb_18030_bytes = [ 0xb9, 0xf3, 0xb9, 0xab, 0xcb, 0xbe, 0xb8, 0xba, 0xd4, 0xf0, + 0xc8, 0xcb, 0x28, 0xbe, 0xad, 0xc0, 0xed, 0x2f, 0xb2, 0xc6, + 0xce, 0xf1, 0x29, 0xc4, 0xfa, 0xba, 0xc3, 0xa3, 0xba, 0x0d, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0xb1, 0xbe, 0xb9, 0xab, 0xcb, 0xbe, 0xd4, + 0xda, 0x31, 0x39, 0x39, 0x37, 0xc4, 0xea, 0xb3, 0xc9, 0xc1, + 0xa2, 0xb9, 0xfa, 0xbc, 0xd2, 0xb9, 0xa4, 0xc9, 0xcc, 0xd7, + 0xa2, 0xb2, 0xe1, 0x2e, 0xca, 0xb5, 0xc1, 0xa6, 0xd0, 0xdb, + 0xba, 0xf1, 0xa1, 0xa3, 0xd3, 0xd0, 0xb6, 0xc0, 0xc1, 0xa2, + 0xcb, 0xb0, 0xce, 0xf1, 0x0d, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0xd7, 0xa8, 0xd2, 0xb5, + 0xc8, 0xcb, 0xd4, 0xb1, 0x3b, 0xd4, 0xda, 0xc8, 0xab, 0xb9, + 0xfa, 0xb8, 0xf7, 0xb3, 0xc7, 0xca, 0xd0, 0xc9, 0xe8, 0xc1, + 0xa2, 0xb7, 0xd6, 0xb9, 0xab, 0xcb, 0xbe, 0xa3, 0xa8, 0xd5, + 0xe3, 0xbd, 0xad, 0xa1, 0xa2, 0xc9, 0xcf, 0xba, 0xa3, 0xa1, + 0xa2, 0xb9, 0xe3, 0xd6, 0xdd, 0xa1, 0xa2, 0xbd, 0xad, 0xcb, + 0xd5, 0xb5, 0xc8, 0x0d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0xb5, 0xd8, 0xb7, 0xbd, 0xa3, + 0xa9, 0xd2, 0xf2, 0xbd, 0xf8, 0xcf, 0xee, 0xbd, 0xcf, 0xb6, + 0xe0, 0xcf, 0xd6, 0xcd, 0xea, 0xb3, 0xc9, 0xb2, 0xbb, 0xc1, + 0xcb, 0xc3, 0xbf, 0xd4, 0xc2, 0xcf, 0xfa, 0xca, 0xdb, 0xb6, + 0xee, 0xb6, 0xc8, 0xa1, 0xa3, 0xc3, 0xbf, 0xd4, 0xc2, 0xd3, + 0xd0, 0xd2, 0xbb, 0xb2, 0xbf, 0xb7, 0xd6, 0x0d, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0xd4, + 0xf6, 0xd6, 0xb5, 0xb6, 0x90, 0xa3, 0xa8, 0x36, 0x2d, 0x37, + 0x25, 0xd7, 0xf3, 0xd3, 0xd2, 0x29, 0xba, 0xcd, 0xc6, 0xd5, + 0xc6, 0xb1, 0xa3, 0xa8, 0x30, 0x2e, 0x35, 0x25, 0x2d, 0x32, + 0x25, 0x20, 0xd7, 0xf3, 0xd3, 0xd2, 0xa3, 0xa9, 0xd3, 0xc5, + 0xbb, 0xdd, 0xb4, 0xfa, 0xbf, 0xaa, 0xbb, 0xf2, 0xba, 0xcf, + 0xd7, 0xf7, 0xa3, 0xac, 0x0d, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0xb5, 0xe3, 0xca, 0xfd, + 0xbd, 0xcf, 0xb5, 0xcd, 0xa1, 0xa3, 0xb4, 0xfa, 0xc0, 0xed, + 0xb7, 0xb6, 0xce, 0xa7, 0xc8, 0xe7, 0xcf, 0xc2, 0xa3, 0xba, + 0x0d, 0x0a ] + +gb_18030_spam_string = bytes_to_binary_string gb_18030_bytes + +describe "normalize_string_to_utf8" do + + describe "when passed uniterpretable character data" do + + it "should reject it as invalid" do + + expect { + normalize_string_to_utf8 random_string + }.to raise_error(EncodingNormalizationError) + + expect { + normalize_string_to_utf8 random_string, 'UTF-8' + }.to raise_error(EncodingNormalizationError) + + end + end + + describe "when passed unlabelled Windows 1252 data" do + + it "should correctly convert it to UTF-8" do + + normalized = normalize_string_to_utf8 windows_1252_string + + normalized.should == "DASH – DASH" + + end + + end + + describe "when passed GB 18030 data" do + + it "should correctly convert it to UTF-8 if unlabelled" do + + normalized = normalize_string_to_utf8 gb_18030_spam_string + + normalized.should start_with("贵公司负责人") + + end + + end + +end + +describe "convert_string_to_utf8_or_binary" do + + describe "when passed uniterpretable character data" do + + it "should return it as a binary string" do + + converted = convert_string_to_utf8_or_binary random_string + converted.should == random_string + + if RUBY_VERSION.to_f >= 1.9 + converted.encoding.should == 'ASCII-8BIT' + end + + converted = convert_string_to_utf8_or_binary random_string,'UTF-8' + converted.should == random_string + + if RUBY_VERSION.to_f >= 1.9 + converted.encoding.should == 'ASCII-8BIT' + end + + end + end + + describe "when passed unlabelled Windows 1252 data" do + + it "should correctly convert it to UTF-8" do + + converted = convert_string_to_utf8_or_binary windows_1252_string + + converted.should == "DASH – DASH" + + if RUBY_VERSION.to_f >= 1.9 + converted.encoding.should == 'UTF-8' + end + end + + end + + describe "when passed GB 18030 data" do + + it "should correctly convert it to UTF-8 if unlabelled" do + + converted = convert_string_to_utf8_or_binary gb_18030_spam_string + + converted.should start_with("贵公司负责人") + + if RUBY_VERSION.to_f >= 1.9 + converted.encoding.should == 'UTF-8' + end + end + + end + +end diff --git a/spec/lib/mail_handler/mail_handler_spec.rb b/spec/lib/mail_handler/mail_handler_spec.rb index 79b779687..01bf179f8 100644 --- a/spec/lib/mail_handler/mail_handler_spec.rb +++ b/spec/lib/mail_handler/mail_handler_spec.rb @@ -20,12 +20,33 @@ describe 'when creating a mail object from raw data' do mail.to.should == ["request-66666-caa77777@whatdotheyknow.com", "foi@example.com"] end + it 'should return nil for malformed To: and Cc: lines' do + mail = get_fixture_mail('malformed-to-and-cc.email') + mail.to.should == nil + mail.cc.should == nil + end + it 'should convert an iso8859 email to utf8' do mail = get_fixture_mail('iso8859_2_raw_email.email') mail.subject.should match /gjatë/u MailHandler.get_part_body(mail).is_utf8?.should == true end + it 'should convert a Windows-1252 body mislabelled as ISO-8859-1 to UTF-8' do + mail = get_fixture_mail('mislabelled-as-iso-8859-1.email') + body = MailHandler.get_part_body(mail) + body.is_utf8?.should == true + # This email is broken in at least these two ways: + # 1. It contains a top bit set character (0x96) despite the + # "Content-Transfer-Encoding: 7bit" + # 2. The charset in the Content-Type header is "iso-8859-1" + # but 0x96 is actually a Windows-1252 en dash, which would + # be Unicode codepoint 2013. It should be possible to + # spot the mislabelling, since 0x96 isn't a valid + # ISO-8859-1 character. + body.should match(/ \xe2\x80\x93 /) + end + end describe 'when asked for the from name' do @@ -275,6 +296,12 @@ end describe 'when getting attachment attributes' do + it 'should handle a mail with a non-multipart part with no charset in the Content-Type header' do + mail = get_fixture_mail('part-without-charset-in-content-type.email') + attributes = MailHandler.get_attachment_attributes(mail) + attributes.size.should == 2 + end + it 'should get two attachment parts from a multipart mail with text and html alternatives and an image' do mail = get_fixture_mail('quoted-subject-iso8859-1.email') @@ -282,6 +309,13 @@ describe 'when getting attachment attributes' do attributes.size.should == 2 end + it 'should get one attachment from a multipart mail with text and HTML alternatives, which should be UTF-8' do + mail = get_fixture_mail('iso8859_2_raw_email.email') + attributes = MailHandler.get_attachment_attributes(mail) + attributes.length.should == 1 + attributes[0][:body].is_utf8?.should == true + end + it 'should expand a mail attached as text' do # Note that this spec will only pass using Tmail in the timezone set as datetime headers # are rendered out in the local time - using the Mail gem this is not necessary @@ -304,6 +338,52 @@ describe 'when getting attachment attributes' do attributes = MailHandler.get_attachment_attributes(mail) end + it 'should ignore truncated TNEF attachment' do + mail = get_fixture_mail('tnef-attachment-truncated.email') + attributes = MailHandler.get_attachment_attributes(mail) + attributes.length.should == 2 + end + + it 'should ignore anything beyond the final MIME boundary' do + pending do + # This example raw email has a premature closing boundary for + # the outer multipart/mixed - my reading of RFC 1521 is that + # the "epilogue" beyond that should be ignored. + # See https://github.com/mysociety/alaveteli/issues/922 for + # more discussion. + mail = get_fixture_mail('nested-attachments-premature-end.email') + attributes = MailHandler.get_attachment_attributes(mail) + attributes.length.should == 3 + end + end + + it 'should cope with a missing final MIME boundary' do + mail = get_fixture_mail('multipart-no-final-boundary.email') + attributes = MailHandler.get_attachment_attributes(mail) + attributes.length.should == 1 + attributes[0][:body].should match(/This is an acknowledgement of your email/) + attributes[0][:content_type].should == "text/plain" + attributes[0][:url_part_number].should == 1 + end + + it 'should ignore a TNEF attachment with no usable contents' do + # FIXME: "no usable contents" is slightly misleading. The + # attachment in this example email does have usable content in + # the body of the TNEF attachment, but the invocation of tnef + # historically used to unpack these attachments doesn't add + # the --save-body parameter, so that they have been ignored so + # far. We probably should include the body from such + # attachments, but, at the moment, with the pending upgrade to + # Rails 3, we just want to check that the behaviour is the + # same as before. + mail = get_fixture_mail('tnef-attachment-empty.email') + attributes = MailHandler.get_attachment_attributes(mail) + attributes.length.should == 2 + # This is the size of the TNEF-encoded attachment; currently, + # we expect the code just to return this without decoding: + attributes[1][:body].length.should == 7769 + end + it 'should produce a consistent set of url_part_numbers, content_types, within_rfc822_subjects and filenames from an example mail with lots of attachments' do mail = get_fixture_mail('many-attachments-date-header.email') |