aboutsummaryrefslogtreecommitdiffstats
path: root/spec/lib
diff options
context:
space:
mode:
Diffstat (limited to 'spec/lib')
-rw-r--r--spec/lib/basic_encoding_tests.rb157
-rw-r--r--spec/lib/mail_handler/mail_handler_spec.rb101
2 files changed, 258 insertions, 0 deletions
diff --git a/spec/lib/basic_encoding_tests.rb b/spec/lib/basic_encoding_tests.rb
new file mode 100644
index 000000000..35d35fd4a
--- /dev/null
+++ b/spec/lib/basic_encoding_tests.rb
@@ -0,0 +1,157 @@
+# -*- coding: utf-8 -*-
+require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
+
+def bytes_to_binary_string( bytes, claimed_encoding = nil )
+ claimed_encoding ||= 'ASCII-8BIT'
+ bytes_string = bytes.pack('c*')
+ if RUBY_VERSION.to_f >= 1.9
+ bytes_string.force_encoding! claimed_encoding
+ end
+ bytes_string
+end
+
+random_string = bytes_to_binary_string [ 0x0f, 0x58, 0x1c, 0x8f, 0xa4, 0xcf,
+ 0xf6, 0x8c, 0x9d, 0xa7, 0x06, 0xd9,
+ 0xf7, 0x90, 0x6c, 0x6f]
+
+windows_1252_string = bytes_to_binary_string [ 0x44, 0x41, 0x53, 0x48, 0x20,
+ 0x96, 0x20, 0x44, 0x41, 0x53,
+ 0x48 ]
+
+# It's a shame this example is so long, but if we don't take enough it
+# gets misinterpreted as Shift_JIS
+
+gb_18030_bytes = [ 0xb9, 0xf3, 0xb9, 0xab, 0xcb, 0xbe, 0xb8, 0xba, 0xd4, 0xf0,
+ 0xc8, 0xcb, 0x28, 0xbe, 0xad, 0xc0, 0xed, 0x2f, 0xb2, 0xc6,
+ 0xce, 0xf1, 0x29, 0xc4, 0xfa, 0xba, 0xc3, 0xa3, 0xba, 0x0d,
+ 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+ 0x20, 0x20, 0x20, 0xb1, 0xbe, 0xb9, 0xab, 0xcb, 0xbe, 0xd4,
+ 0xda, 0x31, 0x39, 0x39, 0x37, 0xc4, 0xea, 0xb3, 0xc9, 0xc1,
+ 0xa2, 0xb9, 0xfa, 0xbc, 0xd2, 0xb9, 0xa4, 0xc9, 0xcc, 0xd7,
+ 0xa2, 0xb2, 0xe1, 0x2e, 0xca, 0xb5, 0xc1, 0xa6, 0xd0, 0xdb,
+ 0xba, 0xf1, 0xa1, 0xa3, 0xd3, 0xd0, 0xb6, 0xc0, 0xc1, 0xa2,
+ 0xcb, 0xb0, 0xce, 0xf1, 0x0d, 0x0a, 0x20, 0x20, 0x20, 0x20,
+ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0xd7, 0xa8, 0xd2, 0xb5,
+ 0xc8, 0xcb, 0xd4, 0xb1, 0x3b, 0xd4, 0xda, 0xc8, 0xab, 0xb9,
+ 0xfa, 0xb8, 0xf7, 0xb3, 0xc7, 0xca, 0xd0, 0xc9, 0xe8, 0xc1,
+ 0xa2, 0xb7, 0xd6, 0xb9, 0xab, 0xcb, 0xbe, 0xa3, 0xa8, 0xd5,
+ 0xe3, 0xbd, 0xad, 0xa1, 0xa2, 0xc9, 0xcf, 0xba, 0xa3, 0xa1,
+ 0xa2, 0xb9, 0xe3, 0xd6, 0xdd, 0xa1, 0xa2, 0xbd, 0xad, 0xcb,
+ 0xd5, 0xb5, 0xc8, 0x0d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
+ 0x20, 0x20, 0x20, 0x20, 0x20, 0xb5, 0xd8, 0xb7, 0xbd, 0xa3,
+ 0xa9, 0xd2, 0xf2, 0xbd, 0xf8, 0xcf, 0xee, 0xbd, 0xcf, 0xb6,
+ 0xe0, 0xcf, 0xd6, 0xcd, 0xea, 0xb3, 0xc9, 0xb2, 0xbb, 0xc1,
+ 0xcb, 0xc3, 0xbf, 0xd4, 0xc2, 0xcf, 0xfa, 0xca, 0xdb, 0xb6,
+ 0xee, 0xb6, 0xc8, 0xa1, 0xa3, 0xc3, 0xbf, 0xd4, 0xc2, 0xd3,
+ 0xd0, 0xd2, 0xbb, 0xb2, 0xbf, 0xb7, 0xd6, 0x0d, 0x0a, 0x20,
+ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0xd4,
+ 0xf6, 0xd6, 0xb5, 0xb6, 0x90, 0xa3, 0xa8, 0x36, 0x2d, 0x37,
+ 0x25, 0xd7, 0xf3, 0xd3, 0xd2, 0x29, 0xba, 0xcd, 0xc6, 0xd5,
+ 0xc6, 0xb1, 0xa3, 0xa8, 0x30, 0x2e, 0x35, 0x25, 0x2d, 0x32,
+ 0x25, 0x20, 0xd7, 0xf3, 0xd3, 0xd2, 0xa3, 0xa9, 0xd3, 0xc5,
+ 0xbb, 0xdd, 0xb4, 0xfa, 0xbf, 0xaa, 0xbb, 0xf2, 0xba, 0xcf,
+ 0xd7, 0xf7, 0xa3, 0xac, 0x0d, 0x0a, 0x20, 0x20, 0x20, 0x20,
+ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0xb5, 0xe3, 0xca, 0xfd,
+ 0xbd, 0xcf, 0xb5, 0xcd, 0xa1, 0xa3, 0xb4, 0xfa, 0xc0, 0xed,
+ 0xb7, 0xb6, 0xce, 0xa7, 0xc8, 0xe7, 0xcf, 0xc2, 0xa3, 0xba,
+ 0x0d, 0x0a ]
+
+gb_18030_spam_string = bytes_to_binary_string gb_18030_bytes
+
+describe "normalize_string_to_utf8" do
+
+ describe "when passed uniterpretable character data" do
+
+ it "should reject it as invalid" do
+
+ expect {
+ normalize_string_to_utf8 random_string
+ }.to raise_error(EncodingNormalizationError)
+
+ expect {
+ normalize_string_to_utf8 random_string, 'UTF-8'
+ }.to raise_error(EncodingNormalizationError)
+
+ end
+ end
+
+ describe "when passed unlabelled Windows 1252 data" do
+
+ it "should correctly convert it to UTF-8" do
+
+ normalized = normalize_string_to_utf8 windows_1252_string
+
+ normalized.should == "DASH – DASH"
+
+ end
+
+ end
+
+ describe "when passed GB 18030 data" do
+
+ it "should correctly convert it to UTF-8 if unlabelled" do
+
+ normalized = normalize_string_to_utf8 gb_18030_spam_string
+
+ normalized.should start_with("贵公司负责人")
+
+ end
+
+ end
+
+end
+
+describe "convert_string_to_utf8_or_binary" do
+
+ describe "when passed uniterpretable character data" do
+
+ it "should return it as a binary string" do
+
+ converted = convert_string_to_utf8_or_binary random_string
+ converted.should == random_string
+
+ if RUBY_VERSION.to_f >= 1.9
+ converted.encoding.should == 'ASCII-8BIT'
+ end
+
+ converted = convert_string_to_utf8_or_binary random_string,'UTF-8'
+ converted.should == random_string
+
+ if RUBY_VERSION.to_f >= 1.9
+ converted.encoding.should == 'ASCII-8BIT'
+ end
+
+ end
+ end
+
+ describe "when passed unlabelled Windows 1252 data" do
+
+ it "should correctly convert it to UTF-8" do
+
+ converted = convert_string_to_utf8_or_binary windows_1252_string
+
+ converted.should == "DASH – DASH"
+
+ if RUBY_VERSION.to_f >= 1.9
+ converted.encoding.should == 'UTF-8'
+ end
+ end
+
+ end
+
+ describe "when passed GB 18030 data" do
+
+ it "should correctly convert it to UTF-8 if unlabelled" do
+
+ converted = convert_string_to_utf8_or_binary gb_18030_spam_string
+
+ converted.should start_with("贵公司负责人")
+
+ if RUBY_VERSION.to_f >= 1.9
+ converted.encoding.should == 'UTF-8'
+ end
+ end
+
+ end
+
+end
diff --git a/spec/lib/mail_handler/mail_handler_spec.rb b/spec/lib/mail_handler/mail_handler_spec.rb
index 79b779687..272b56d0b 100644
--- a/spec/lib/mail_handler/mail_handler_spec.rb
+++ b/spec/lib/mail_handler/mail_handler_spec.rb
@@ -20,12 +20,46 @@ describe 'when creating a mail object from raw data' do
mail.to.should == ["request-66666-caa77777@whatdotheyknow.com", "foi@example.com"]
end
+ it 'should return nil for malformed To: and Cc: lines' do
+ mail = get_fixture_mail('malformed-to-and-cc.email')
+ mail.to.should == nil
+ mail.cc.should == nil
+ end
+
it 'should convert an iso8859 email to utf8' do
mail = get_fixture_mail('iso8859_2_raw_email.email')
mail.subject.should match /gjatë/u
MailHandler.get_part_body(mail).is_utf8?.should == true
end
+ it 'should not be confused by subject lines with malformed UTF-8 at the end' do
+ # The base64 subject line was generated with:
+ # printf "hello\360" | base64
+ # ... and wrapping the result in '=?UTF-8?B?' and '?='
+ mail = get_fixture_mail('subject-bad-utf-8-trailing-base64.email')
+ mail.subject.should == 'hello'
+ # The quoted printable subject line was generated with:
+ # printf "hello\360" | qprint -b -e
+ # ... and wrapping the result in '=?UTF-8?Q?' and '?='
+ mail = get_fixture_mail('subject-bad-utf-8-trailing-quoted-printable.email')
+ mail.subject.should == 'hello'
+ end
+
+ it 'should convert a Windows-1252 body mislabelled as ISO-8859-1 to UTF-8' do
+ mail = get_fixture_mail('mislabelled-as-iso-8859-1.email')
+ body = MailHandler.get_part_body(mail)
+ body.is_utf8?.should == true
+ # This email is broken in at least these two ways:
+ # 1. It contains a top bit set character (0x96) despite the
+ # "Content-Transfer-Encoding: 7bit"
+ # 2. The charset in the Content-Type header is "iso-8859-1"
+ # but 0x96 is actually a Windows-1252 en dash, which would
+ # be Unicode codepoint 2013. It should be possible to
+ # spot the mislabelling, since 0x96 isn't a valid
+ # ISO-8859-1 character.
+ body.should match(/ \xe2\x80\x93 /)
+ end
+
end
describe 'when asked for the from name' do
@@ -275,6 +309,12 @@ end
describe 'when getting attachment attributes' do
+ it 'should handle a mail with a non-multipart part with no charset in the Content-Type header' do
+ mail = get_fixture_mail('part-without-charset-in-content-type.email')
+ attributes = MailHandler.get_attachment_attributes(mail)
+ attributes.size.should == 2
+ end
+
it 'should get two attachment parts from a multipart mail with text and html alternatives
and an image' do
mail = get_fixture_mail('quoted-subject-iso8859-1.email')
@@ -282,6 +322,13 @@ describe 'when getting attachment attributes' do
attributes.size.should == 2
end
+ it 'should get one attachment from a multipart mail with text and HTML alternatives, which should be UTF-8' do
+ mail = get_fixture_mail('iso8859_2_raw_email.email')
+ attributes = MailHandler.get_attachment_attributes(mail)
+ attributes.length.should == 1
+ attributes[0][:body].is_utf8?.should == true
+ end
+
it 'should expand a mail attached as text' do
# Note that this spec will only pass using Tmail in the timezone set as datetime headers
# are rendered out in the local time - using the Mail gem this is not necessary
@@ -304,6 +351,52 @@ describe 'when getting attachment attributes' do
attributes = MailHandler.get_attachment_attributes(mail)
end
+ it 'should ignore truncated TNEF attachment' do
+ mail = get_fixture_mail('tnef-attachment-truncated.email')
+ attributes = MailHandler.get_attachment_attributes(mail)
+ attributes.length.should == 2
+ end
+
+ it 'should ignore anything beyond the final MIME boundary' do
+ pending do
+ # This example raw email has a premature closing boundary for
+ # the outer multipart/mixed - my reading of RFC 1521 is that
+ # the "epilogue" beyond that should be ignored.
+ # See https://github.com/mysociety/alaveteli/issues/922 for
+ # more discussion.
+ mail = get_fixture_mail('nested-attachments-premature-end.email')
+ attributes = MailHandler.get_attachment_attributes(mail)
+ attributes.length.should == 3
+ end
+ end
+
+ it 'should cope with a missing final MIME boundary' do
+ mail = get_fixture_mail('multipart-no-final-boundary.email')
+ attributes = MailHandler.get_attachment_attributes(mail)
+ attributes.length.should == 1
+ attributes[0][:body].should match(/This is an acknowledgement of your email/)
+ attributes[0][:content_type].should == "text/plain"
+ attributes[0][:url_part_number].should == 1
+ end
+
+ it 'should ignore a TNEF attachment with no usable contents' do
+ # FIXME: "no usable contents" is slightly misleading. The
+ # attachment in this example email does have usable content in
+ # the body of the TNEF attachment, but the invocation of tnef
+ # historically used to unpack these attachments doesn't add
+ # the --save-body parameter, so that they have been ignored so
+ # far. We probably should include the body from such
+ # attachments, but, at the moment, with the pending upgrade to
+ # Rails 3, we just want to check that the behaviour is the
+ # same as before.
+ mail = get_fixture_mail('tnef-attachment-empty.email')
+ attributes = MailHandler.get_attachment_attributes(mail)
+ attributes.length.should == 2
+ # This is the size of the TNEF-encoded attachment; currently,
+ # we expect the code just to return this without decoding:
+ attributes[1][:body].length.should == 7769
+ end
+
it 'should produce a consistent set of url_part_numbers, content_types, within_rfc822_subjects
and filenames from an example mail with lots of attachments' do
mail = get_fixture_mail('many-attachments-date-header.email')
@@ -385,3 +478,11 @@ describe 'when getting attachment attributes' do
end
end
end
+
+describe 'when getting the address part from an address string' do
+
+ it 'should handle non-ascii characters in the name input' do
+ address = "\"Someone’s name\" <test@example.com>"
+ MailHandler.address_from_string(address).should == 'test@example.com'
+ end
+end