diff --git a/src/mimeparser.rs b/src/mimeparser.rs index 21d03e029..624695fc3 100644 --- a/src/mimeparser.rs +++ b/src/mimeparser.rs @@ -26,6 +26,7 @@ use crate::param::*; use crate::peerstate::Peerstate; use crate::simplify::*; use crate::stock::StockMessage; +use charset::Charset; use percent_encoding::percent_decode_str; /// A parsed MIME message. @@ -687,7 +688,7 @@ impl MimeMessage { let (mime_type, msg_type) = get_mime_type(mail)?; let raw_mime = mail.ctype.mimetype.to_lowercase(); - let filename = get_attachment_filename(mail)?; + let filename = get_attachment_filename(context, mail)?; let old_part_count = self.parts.len(); @@ -1275,7 +1276,10 @@ fn is_attachment_disposition(mail: &mailparse::ParsedMail<'_>) -> bool { /// returned. If Content-Disposition is "attachment" but filename is /// not specified, filename is guessed. If Content-Disposition cannot /// be parsed, returns an error. -fn get_attachment_filename(mail: &mailparse::ParsedMail) -> Result> { +fn get_attachment_filename( + context: &Context, + mail: &mailparse::ParsedMail, +) -> Result> { let ct = mail.get_content_disposition(); // try to get file name as "encoded-words" from @@ -1291,7 +1295,7 @@ fn get_attachment_filename(mail: &mailparse::ParsedMail) -> Result Result>(); + let (utf8_str, _, _) = charset.decode(&*decoded_bytes); + Some(utf8_str.into()) + } else { + warn!(context, "latin1 encoding does not exist"); + None + } + } + } else { + warn!(context, "apostroped encoding invalid"); + None + } } } } @@ -1462,53 +1483,146 @@ mod tests { assert!(is_attachment_disposition(&mail.subparts[1])); } - fn load_mail_with_attachment(raw: &[u8]) -> ParsedMail { + fn load_mail_with_attachment<'a>(t: &'a TestContext, raw: &'a [u8]) -> ParsedMail<'a> { let mail = mailparse::parse_mail(raw).unwrap(); - assert!(get_attachment_filename(&mail).unwrap().is_none()); - assert!(get_attachment_filename(&mail.subparts[0]) + assert!(get_attachment_filename(&t.ctx, &mail).unwrap().is_none()); + assert!(get_attachment_filename(&t.ctx, &mail.subparts[0]) .unwrap() .is_none()); mail } - #[test] - fn test_get_attachment_filename() { - let mail = load_mail_with_attachment(include_bytes!( - "../test-data/message/attach_filename_simple.eml" - )); - let filename = get_attachment_filename(&mail.subparts[1]).unwrap(); + #[async_std::test] + async fn test_get_attachment_filename() { + let t = TestContext::new().await; + let mail = load_mail_with_attachment( + &t, + include_bytes!("../test-data/message/attach_filename_simple.eml"), + ); + let filename = get_attachment_filename(&t.ctx, &mail.subparts[1]).unwrap(); assert_eq!(filename, Some("test.html".to_string())) } - #[test] - fn test_get_attachment_filename_encoded_words() { - let mail = load_mail_with_attachment(include_bytes!( - "../test-data/message/attach_filename_encoded_words.eml" - )); - let filename = get_attachment_filename(&mail.subparts[1]).unwrap(); + #[async_std::test] + async fn test_get_attachment_filename_encoded_words() { + let t = TestContext::new().await; + let mail = load_mail_with_attachment( + &t, + include_bytes!("../test-data/message/attach_filename_encoded_words.eml"), + ); + let filename = get_attachment_filename(&t.ctx, &mail.subparts[1]).unwrap(); assert_eq!(filename, Some("Maßnahmen Okt. 2020.html".to_string())) } - #[test] - fn test_get_attachment_filename_encoded_words_cont() { + #[async_std::test] + async fn test_get_attachment_filename_encoded_words_binary() { + let t = TestContext::new().await; + let mail = load_mail_with_attachment( + &t, + include_bytes!("../test-data/message/attach_filename_encoded_words_binary.eml"), + ); + let filename = get_attachment_filename(&t.ctx, &mail.subparts[1]).unwrap(); + assert_eq!(filename, Some(" § 165 Abs".to_string())) + } + + #[async_std::test] + async fn test_get_attachment_filename_encoded_words_windows1251() { + let t = TestContext::new().await; + let mail = load_mail_with_attachment( + &t, + include_bytes!("../test-data/message/attach_filename_encoded_words_windows1251.eml"), + ); + let filename = get_attachment_filename(&t.ctx, &mail.subparts[1]).unwrap(); + assert_eq!(filename, Some("file Что нового 2020.pdf".to_string())) + } + + #[async_std::test] + async fn test_get_attachment_filename_encoded_words_cont() { // test continued encoded-words and also test apostropes work that way - let mail = load_mail_with_attachment(include_bytes!( - "../test-data/message/attach_filename_encoded_words_cont.eml" - )); - let filename = get_attachment_filename(&mail.subparts[1]).unwrap(); + let t = TestContext::new().await; + let mail = load_mail_with_attachment( + &t, + include_bytes!("../test-data/message/attach_filename_encoded_words_cont.eml"), + ); + let filename = get_attachment_filename(&t.ctx, &mail.subparts[1]).unwrap(); assert_eq!(filename, Some("Maßn'ah'men Okt. 2020.html".to_string())) } - #[test] - fn test_get_attachment_filename_combined() { + #[async_std::test] + async fn test_get_attachment_filename_encoded_words_bad_delimiter() { + let t = TestContext::new().await; + let mail = load_mail_with_attachment( + &t, + include_bytes!("../test-data/message/attach_filename_encoded_words_bad_delimiter.eml"), + ); + let filename = get_attachment_filename(&t.ctx, &mail.subparts[1]).unwrap(); + // not decoded as a space is missing after encoded-words part + assert_eq!(filename, Some("=?utf-8?q?foo?=.bar".to_string())) + } + + #[async_std::test] + async fn test_get_attachment_filename_apostrophed() { + let t = TestContext::new().await; + let mail = load_mail_with_attachment( + &t, + include_bytes!("../test-data/message/attach_filename_apostrophed.eml"), + ); + let filename = get_attachment_filename(&t.ctx, &mail.subparts[1]).unwrap(); + assert_eq!(filename, Some("Maßnahmen Okt. 2021.html".to_string())) + } + + #[async_std::test] + async fn test_get_attachment_filename_apostrophed_cont() { + let t = TestContext::new().await; + let mail = load_mail_with_attachment( + &t, + include_bytes!("../test-data/message/attach_filename_apostrophed_cont.eml"), + ); + let filename = get_attachment_filename(&t.ctx, &mail.subparts[1]).unwrap(); + assert_eq!(filename, Some("Maßnahmen März 2022.html".to_string())) + } + + #[async_std::test] + async fn test_get_attachment_filename_apostrophed_windows1251() { + let t = TestContext::new().await; + let mail = load_mail_with_attachment( + &t, + include_bytes!("../test-data/message/attach_filename_apostrophed_windows1251.eml"), + ); + let filename = get_attachment_filename(&t.ctx, &mail.subparts[1]).unwrap(); + assert_eq!(filename, Some("программирование.HTM".to_string())) + } + + #[async_std::test] + async fn test_get_attachment_filename_apostrophed_cp1252() { + let t = TestContext::new().await; + let mail = load_mail_with_attachment( + &t, + include_bytes!("../test-data/message/attach_filename_apostrophed_cp1252.eml"), + ); + let filename = get_attachment_filename(&t.ctx, &mail.subparts[1]).unwrap(); + assert_eq!(filename, Some("Auftragsbestätigung.pdf".to_string())) + } + + #[async_std::test] + async fn test_get_attachment_filename_combined() { // test that if `filename` and `filename*0` are given, the filename is not doubled - let mail = load_mail_with_attachment(include_bytes!( - "../test-data/message/attach_filename_combined.eml" - )); - let filename = get_attachment_filename(&mail.subparts[1]).unwrap(); + let t = TestContext::new().await; + let mail = load_mail_with_attachment( + &t, + include_bytes!("../test-data/message/attach_filename_combined.eml"), + ); + let filename = get_attachment_filename(&t.ctx, &mail.subparts[1]).unwrap(); assert_eq!(filename, Some("Maßnahmen Okt. 2020.html".to_string())) } + #[test] + fn test_charset_latin1() { + // make sure, latin1 exists under this name + // as we're using it as default in get_attachment_filename() for non-utf-8 + assert!(Charset::for_label(b"latin1").is_some()); + } + #[test] fn test_mailparse_content_type() { let ctype = diff --git a/standards.md b/standards.md index 9aceb6285..8ce5707fa 100644 --- a/standards.md +++ b/standards.md @@ -6,6 +6,7 @@ Tasks | Standards ---------------------------------|--------------------------------------------- Transport | IMAP v4 ([RFC 3501](https://tools.ietf.org/html/rfc3501)), SMTP ([RFC 5321](https://tools.ietf.org/html/rfc5321)) and Internet Message Format (IMF, [RFC 5322](https://tools.ietf.org/html/rfc5322)) Embedded media | MIME Document Series ([RFC 2045](https://tools.ietf.org/html/rfc2045), [RFC 2046](https://tools.ietf.org/html/rfc2046)), Content-Disposition Header ([RFC 2183](https://tools.ietf.org/html/rfc2183)), Multipart/Related ([RFC 2387](https://tools.ietf.org/html/rfc2387)) +Filename encoding | Encoded Words ([RFC 2047](https://tools.ietf.org/html/rfc2047)), Encoded Word Extensions ([RFC 2231](https://tools.ietf.org/html/rfc2231)) Identify server folders | IMAP LIST Extension ([RFC 6154](https://tools.ietf.org/html/rfc6154)) Push | IMAP IDLE ([RFC 2177](https://tools.ietf.org/html/rfc2177)) Authorization | OAuth2 ([RFC 6749](https://tools.ietf.org/html/rfc6749)) diff --git a/test-data/message/attach_filename_apostrophed.eml b/test-data/message/attach_filename_apostrophed.eml new file mode 100644 index 000000000..f45d4826e --- /dev/null +++ b/test-data/message/attach_filename_apostrophed.eml @@ -0,0 +1,24 @@ +Subject: Test apostrophed filenames +Message-ID: 12345@testrun.org +Date: Sat, 07 Dec 2019 19:00:27 +0000 +X-Mailer: Kopano 8.7.16 +To: recp@testrun.org +From: sender@testrun.org +Content-Type: multipart/mixed; boundary="==BREAK==" + + +--==BREAK== +Content-Type: text/plain; charset=utf-8 + +apostrophed filenames as of +https://tools.ietf.org/html/rfc2231 + +--==BREAK== +Content-Type: text/html +Content-Disposition: attachment; + filename*=utf-8''Ma%C3%9Fnahmen%20Okt.%202021.html +Content-Transfer-Encoding: base64 + +PGh0bWw+PGJvZHk+dGV4dDwvYm9keT5kYXRh + +--==BREAK==-- diff --git a/test-data/message/attach_filename_apostrophed_cont.eml b/test-data/message/attach_filename_apostrophed_cont.eml new file mode 100644 index 000000000..b7d9cf6a4 --- /dev/null +++ b/test-data/message/attach_filename_apostrophed_cont.eml @@ -0,0 +1,34 @@ +Subject: Test apostrophed filenames +Message-ID: 12345@testrun.org +Date: Sat, 07 Dec 2019 19:00:27 +0000 +X-Mailer: Kopano 8.7.16 +To: recp@testrun.org +From: sender@testrun.org +Content-Type: multipart/mixed; boundary="==BREAK==" + + +--==BREAK== +Content-Type: text/plain; charset=utf-8 + +apostrophed filenames as of +https://tools.ietf.org/html/rfc2231, +span over several header lines. + +note, that, in contrast to encoded-words, +the character-set is not repeated. + +as a side-effect, +this tests unquoted header attributes in filename*1* +and lower-case-urlencoded utf-8 + +--==BREAK== +Content-Type: text/html +Content-Disposition: attachment; + filename*0*="utf-8''Ma%C3%9Fna"; + filename*1*=hm; + filename*2*="en%20M%c3%a4rz%202022.html"; +Content-Transfer-Encoding: base64 + +PGh0bWw+PGJvZHk+dGV4dDwvYm9keT5kYXRh + +--==BREAK==-- diff --git a/test-data/message/attach_filename_apostrophed_cp1252.eml b/test-data/message/attach_filename_apostrophed_cp1252.eml new file mode 100644 index 000000000..01df40f4c --- /dev/null +++ b/test-data/message/attach_filename_apostrophed_cp1252.eml @@ -0,0 +1,23 @@ +Subject: Test apostrophed filenames +Message-ID: 12345@testrun.org +Date: Sat, 07 Dec 2019 19:00:27 +0000 +X-Mailer: Kopano 8.7.16 +To: recp@testrun.org +From: sender@testrun.org +Content-Type: multipart/mixed; boundary="==BREAK==" + + +--==BREAK== +Content-Type: text/plain; charset=utf-8 + +testing cp1252 aka ANSI aka Windows-1252 + +--==BREAK== +Content-Type: text/html +Content-Disposition: attachment; + filename*=Cp1252''Auftragsbest%E4tigung.pdf; +Content-Transfer-Encoding: base64 + +PGh0bWw+PGJvZHk+dGV4dDwvYm9keT5kYXRh + +--==BREAK==-- diff --git a/test-data/message/attach_filename_apostrophed_windows1251.eml b/test-data/message/attach_filename_apostrophed_windows1251.eml new file mode 100644 index 000000000..a040a96da --- /dev/null +++ b/test-data/message/attach_filename_apostrophed_windows1251.eml @@ -0,0 +1,29 @@ +Subject: Test apostrophed filenames +Message-ID: 12345@testrun.org +Date: Sat, 07 Dec 2019 19:00:27 +0000 +X-Mailer: Kopano 8.7.16 +To: recp@testrun.org +From: sender@testrun.org +Content-Type: multipart/mixed; boundary="==BREAK==" + + +--==BREAK== +Content-Type: text/plain; charset=utf-8 + +apostrophed filenames as of +https://tools.ietf.org/html/rfc2231, +testing non-utf-8 charset + +examples: +%EF%F0%EE%E3%F0%E0%EC%EC%E8%F0%EE%E2%E0%ED%E8%E5 = программирование = programming + +--==BREAK== +Content-Type: text/html +Content-Disposition: attachment; + filename*0*=windows-1251''%EF%F0%EE%E3%F0%E0%EC%EC%E8%F0%EE%E2%E0; + filename*1*=%ED%E8%E5.HTM +Content-Transfer-Encoding: base64 + +PGh0bWw+PGJvZHk+dGV4dDwvYm9keT5kYXRh + +--==BREAK==-- diff --git a/test-data/message/attach_filename_encoded_words_bad_delimiter.eml b/test-data/message/attach_filename_encoded_words_bad_delimiter.eml new file mode 100644 index 000000000..a1ae8207c --- /dev/null +++ b/test-data/message/attach_filename_encoded_words_bad_delimiter.eml @@ -0,0 +1,29 @@ +Subject: Test encoded-words filenames +Message-ID: 123456@testrun.org +Date: Sat, 07 Dec 2019 19:00:27 +0000 +Chat-Version: 1.0 +To: recp@testrun.org +From: sender@testrun.org +Content-Type: multipart/mixed; boundary="==BREAK==" + + +--==BREAK== +Content-Type: text/plain; charset=utf-8 + +there MUST be a space between encoded words and plain text, +if there is none, decoding should return the original string +https://tools.ietf.org/html/rfc2047 5.1: + +"Ordinary ASCII text and 'encoded-word's may appear together in the +same header field. However, an 'encoded-word' that appears in a +header field defined as '*text' MUST be separated from any adjacent +'encoded-word' or 'text' by 'linear-white-space'." + +--==BREAK== +Content-Type: text/html +Content-Disposition: attachment; filename="=?utf-8?q?foo?=.bar"; +Content-Transfer-Encoding: base64 + +PGh0bWw+PGJvZHk+dGV4dDwvYm9keT5kYXRh + +--==BREAK==-- diff --git a/test-data/message/attach_filename_encoded_words_binary.eml b/test-data/message/attach_filename_encoded_words_binary.eml new file mode 100644 index 000000000..e028f1340 --- /dev/null +++ b/test-data/message/attach_filename_encoded_words_binary.eml @@ -0,0 +1,26 @@ +Subject: Test binary-encoded-words filenames +Message-ID: 123456@testrun.org +Date: Sat, 07 Dec 2019 19:00:27 +0000 +Chat-Version: 1.0 +To: recp@testrun.org +From: sender@testrun.org +Content-Type: multipart/mixed; boundary="==BREAK==" + + +--==BREAK== +Content-Type: text/plain; charset=utf-8 + +test binary word-encoded filename, +filename is " § 165 Abs" - note the leading space. + +as a side-effect, this also tests that the encoding-name +also works in UPPERCASE. + +--==BREAK== +Content-Type: text/html +Content-Disposition: attachment; filename="=?UTF-8?B?IMKnIDE2NSBBYnM=?="; +Content-Transfer-Encoding: base64 + +PGh0bWw+PGJvZHk+dGV4dDwvYm9keT5kYXRh + +--==BREAK==-- diff --git a/test-data/message/attach_filename_encoded_words_windows1251.eml b/test-data/message/attach_filename_encoded_words_windows1251.eml new file mode 100644 index 000000000..cee238324 --- /dev/null +++ b/test-data/message/attach_filename_encoded_words_windows1251.eml @@ -0,0 +1,31 @@ +Subject: Test encoded-words +Message-ID: 123456@testrun.org +Date: Sat, 07 Dec 2019 19:00:27 +0000 +Chat-Version: 1.0 +To: recp@testrun.org +From: sender@testrun.org +Content-Type: multipart/mixed; boundary="==BREAK==" + + +--==BREAK== +Content-Type: text/plain; charset=utf-8 + +testing encoded-words filenames with windows-1251 (cyrillic) encoding. + +as a side-effect, this also tests that encoded words work together with +plain text as long as they're separated by spaces, see +https://tools.ietf.org/html/rfc2047 5.1: + +"Ordinary ASCII text and 'encoded-word's may appear together in the +same header field. However, an 'encoded-word' that appears in a +header field defined as '*text' MUST be separated from any adjacent +'encoded-word' or 'text' by 'linear-white-space'." + +--==BREAK== +Content-Type: text/html +Content-Disposition: attachment; filename="file =?Windows-1251?B?1/LuIO3u4u7j7g==?= 2020.pdf"; +Content-Transfer-Encoding: base64 + +PGh0bWw+PGJvZHk+dGV4dDwvYm9keT5kYXRh + +--==BREAK==--