Merge pull request #2094 from deltachat/fix-non-utf8-filenames

test and fix non-utf8 filenames
2026-05-18 22:36:29 +03:00 · 2020-12-04 16:32:03 +01:00
parent 7c9624e822 af045c245d
commit cdb7f1dd9f
9 changed files with 347 additions and 36 deletions
--- a/src/mimeparser.rs
+++ b/src/mimeparser.rs
@@ -26,6 +26,7 @@ use crate::param::*;
 use crate::peerstate::Peerstate;
 use crate::simplify::*;
 use crate::stock::StockMessage;
+use charset::Charset;
 use percent_encoding::percent_decode_str;

 /// A parsed MIME message.
@@ -687,7 +688,7 @@ impl MimeMessage {
        let (mime_type, msg_type) = get_mime_type(mail)?;
        let raw_mime = mail.ctype.mimetype.to_lowercase();

-        let filename = get_attachment_filename(mail)?;
+        let filename = get_attachment_filename(context, mail)?;

        let old_part_count = self.parts.len();

@@ -1275,7 +1276,10 @@ fn is_attachment_disposition(mail: &mailparse::ParsedMail<'_>) -> bool {
 /// returned. If Content-Disposition is "attachment" but filename is
 /// not specified, filename is guessed. If Content-Disposition cannot
 /// be parsed, returns an error.
-fn get_attachment_filename(mail: &mailparse::ParsedMail) -> Result<Option<String>> {
+fn get_attachment_filename(
+    context: &Context,
+    mail: &mailparse::ParsedMail,
+) -> Result<Option<String>> {
    let ct = mail.get_content_disposition();

    // try to get file name as "encoded-words" from
@@ -1291,7 +1295,7 @@ fn get_attachment_filename(mail: &mailparse::ParsedMail) -> Result<Option<String
        desired_filename = ct
            .params
            .iter()
-            .filter(|(key, _value)| key.starts_with("filename"))
+            .filter(|(key, _value)| key.starts_with("filename*"))
            .fold(None, |acc, (key, value)| {
                if key.ends_with('*') {
                    apostrophe_encoded = true;
@@ -1303,13 +1307,30 @@ fn get_attachment_filename(mail: &mailparse::ParsedMail) -> Result<Option<String
                }
            });
        if apostrophe_encoded {
-            // we're currently always assuming utf-8, this might need adaption, however, should not break things.
            if let Some(name) = desired_filename {
-                desired_filename = if let Some(name) = name.splitn(3, '\'').last() {
-                    Some(percent_decode_str(&name).decode_utf8_lossy().to_string())
-                } else {
-                    None
-                }
+                let mut parts = name.splitn(3, '\'');
+                desired_filename =
+                    if let (Some(charset), Some(value)) = (parts.next(), parts.last()) {
+                        let decoded_bytes = percent_decode_str(&value);
+                        if charset.to_lowercase() == "utf-8" {
+                            Some(decoded_bytes.decode_utf8_lossy().to_string())
+                        } else {
+                            // the encoded_words crate says latin-1 is not reported; moreover, latin1 is a good default
+                            if let Some(charset) = Charset::for_label(charset.as_bytes())
+                                .or_else(|| Charset::for_label(b"latin1"))
+                            {
+                                let decoded_bytes = decoded_bytes.collect::<Vec<u8>>();
+                                let (utf8_str, _, _) = charset.decode(&*decoded_bytes);
+                                Some(utf8_str.into())
+                            } else {
+                                warn!(context, "latin1 encoding does not exist");
+                                None
+                            }
+                        }
+                    } else {
+                        warn!(context, "apostroped encoding invalid");
+                        None
+                    }
            }
        }
    }
@@ -1462,53 +1483,146 @@ mod tests {
        assert!(is_attachment_disposition(&mail.subparts[1]));
    }

-    fn load_mail_with_attachment(raw: &[u8]) -> ParsedMail {
+    fn load_mail_with_attachment<'a>(t: &'a TestContext, raw: &'a [u8]) -> ParsedMail<'a> {
        let mail = mailparse::parse_mail(raw).unwrap();
-        assert!(get_attachment_filename(&mail).unwrap().is_none());
-        assert!(get_attachment_filename(&mail.subparts[0])
+        assert!(get_attachment_filename(&t.ctx, &mail).unwrap().is_none());
+        assert!(get_attachment_filename(&t.ctx, &mail.subparts[0])
            .unwrap()
            .is_none());
        mail
    }

-    #[test]
-    fn test_get_attachment_filename() {
-        let mail = load_mail_with_attachment(include_bytes!(
-            "../test-data/message/attach_filename_simple.eml"
-        ));
-        let filename = get_attachment_filename(&mail.subparts[1]).unwrap();
+    #[async_std::test]
+    async fn test_get_attachment_filename() {
+        let t = TestContext::new().await;
+        let mail = load_mail_with_attachment(
+            &t,
+            include_bytes!("../test-data/message/attach_filename_simple.eml"),
+        );
+        let filename = get_attachment_filename(&t.ctx, &mail.subparts[1]).unwrap();
        assert_eq!(filename, Some("test.html".to_string()))
    }

-    #[test]
-    fn test_get_attachment_filename_encoded_words() {
-        let mail = load_mail_with_attachment(include_bytes!(
-            "../test-data/message/attach_filename_encoded_words.eml"
-        ));
-        let filename = get_attachment_filename(&mail.subparts[1]).unwrap();
+    #[async_std::test]
+    async fn test_get_attachment_filename_encoded_words() {
+        let t = TestContext::new().await;
+        let mail = load_mail_with_attachment(
+            &t,
+            include_bytes!("../test-data/message/attach_filename_encoded_words.eml"),
+        );
+        let filename = get_attachment_filename(&t.ctx, &mail.subparts[1]).unwrap();
        assert_eq!(filename, Some("Maßnahmen Okt. 2020.html".to_string()))
    }

-    #[test]
-    fn test_get_attachment_filename_encoded_words_cont() {
+    #[async_std::test]
+    async fn test_get_attachment_filename_encoded_words_binary() {
+        let t = TestContext::new().await;
+        let mail = load_mail_with_attachment(
+            &t,
+            include_bytes!("../test-data/message/attach_filename_encoded_words_binary.eml"),
+        );
+        let filename = get_attachment_filename(&t.ctx, &mail.subparts[1]).unwrap();
+        assert_eq!(filename, Some(" § 165 Abs".to_string()))
+    }
+
+    #[async_std::test]
+    async fn test_get_attachment_filename_encoded_words_windows1251() {
+        let t = TestContext::new().await;
+        let mail = load_mail_with_attachment(
+            &t,
+            include_bytes!("../test-data/message/attach_filename_encoded_words_windows1251.eml"),
+        );
+        let filename = get_attachment_filename(&t.ctx, &mail.subparts[1]).unwrap();
+        assert_eq!(filename, Some("file Что нового 2020.pdf".to_string()))
+    }
+
+    #[async_std::test]
+    async fn test_get_attachment_filename_encoded_words_cont() {
        // test continued encoded-words and also test apostropes work that way
-        let mail = load_mail_with_attachment(include_bytes!(
-            "../test-data/message/attach_filename_encoded_words_cont.eml"
-        ));
-        let filename = get_attachment_filename(&mail.subparts[1]).unwrap();
+        let t = TestContext::new().await;
+        let mail = load_mail_with_attachment(
+            &t,
+            include_bytes!("../test-data/message/attach_filename_encoded_words_cont.eml"),
+        );
+        let filename = get_attachment_filename(&t.ctx, &mail.subparts[1]).unwrap();
        assert_eq!(filename, Some("Maßn'ah'men Okt. 2020.html".to_string()))
    }

-    #[test]
-    fn test_get_attachment_filename_combined() {
+    #[async_std::test]
+    async fn test_get_attachment_filename_encoded_words_bad_delimiter() {
+        let t = TestContext::new().await;
+        let mail = load_mail_with_attachment(
+            &t,
+            include_bytes!("../test-data/message/attach_filename_encoded_words_bad_delimiter.eml"),
+        );
+        let filename = get_attachment_filename(&t.ctx, &mail.subparts[1]).unwrap();
+        // not decoded, because the required space after the encoded-words part is missing
+        assert_eq!(filename, Some("=?utf-8?q?foo?=.bar".to_string()))
+    }
+
+    #[async_std::test]
+    async fn test_get_attachment_filename_apostrophed() {
+        let t = TestContext::new().await;
+        let mail = load_mail_with_attachment(
+            &t,
+            include_bytes!("../test-data/message/attach_filename_apostrophed.eml"),
+        );
+        let filename = get_attachment_filename(&t.ctx, &mail.subparts[1]).unwrap();
+        assert_eq!(filename, Some("Maßnahmen Okt. 2021.html".to_string()))
+    }
+
+    #[async_std::test]
+    async fn test_get_attachment_filename_apostrophed_cont() {
+        let t = TestContext::new().await;
+        let mail = load_mail_with_attachment(
+            &t,
+            include_bytes!("../test-data/message/attach_filename_apostrophed_cont.eml"),
+        );
+        let filename = get_attachment_filename(&t.ctx, &mail.subparts[1]).unwrap();
+        assert_eq!(filename, Some("Maßnahmen März 2022.html".to_string()))
+    }
+
+    #[async_std::test]
+    async fn test_get_attachment_filename_apostrophed_windows1251() {
+        let t = TestContext::new().await;
+        let mail = load_mail_with_attachment(
+            &t,
+            include_bytes!("../test-data/message/attach_filename_apostrophed_windows1251.eml"),
+        );
+        let filename = get_attachment_filename(&t.ctx, &mail.subparts[1]).unwrap();
+        assert_eq!(filename, Some("программирование.HTM".to_string()))
+    }
+
+    #[async_std::test]
+    async fn test_get_attachment_filename_apostrophed_cp1252() {
+        let t = TestContext::new().await;
+        let mail = load_mail_with_attachment(
+            &t,
+            include_bytes!("../test-data/message/attach_filename_apostrophed_cp1252.eml"),
+        );
+        let filename = get_attachment_filename(&t.ctx, &mail.subparts[1]).unwrap();
+        assert_eq!(filename, Some("Auftragsbestätigung.pdf".to_string()))
+    }
+
+    #[async_std::test]
+    async fn test_get_attachment_filename_combined() {
        // test that if `filename` and `filename*0` are given, the filename is not doubled
-        let mail = load_mail_with_attachment(include_bytes!(
-            "../test-data/message/attach_filename_combined.eml"
-        ));
-        let filename = get_attachment_filename(&mail.subparts[1]).unwrap();
+        let t = TestContext::new().await;
+        let mail = load_mail_with_attachment(
+            &t,
+            include_bytes!("../test-data/message/attach_filename_combined.eml"),
+        );
+        let filename = get_attachment_filename(&t.ctx, &mail.subparts[1]).unwrap();
        assert_eq!(filename, Some("Maßnahmen Okt. 2020.html".to_string()))
    }

+    #[test]
+    fn test_charset_latin1() {
+        // make sure latin1 exists under this name,
+        // as we use it as the default in get_attachment_filename() for non-utf-8 charsets
+        assert!(Charset::for_label(b"latin1").is_some());
+    }
+
    #[test]
    fn test_mailparse_content_type() {
        let ctype =
--- a/standards.md
+++ b/standards.md
@@ -6,6 +6,7 @@ Tasks                            | Standards
 ---------------------------------|---------------------------------------------
 Transport                        | IMAP v4 ([RFC 3501](https://tools.ietf.org/html/rfc3501)), SMTP ([RFC 5321](https://tools.ietf.org/html/rfc5321)) and Internet Message Format (IMF, [RFC 5322](https://tools.ietf.org/html/rfc5322))
 Embedded media                   | MIME Document Series ([RFC 2045](https://tools.ietf.org/html/rfc2045), [RFC 2046](https://tools.ietf.org/html/rfc2046)), Content-Disposition Header ([RFC 2183](https://tools.ietf.org/html/rfc2183)), Multipart/Related ([RFC 2387](https://tools.ietf.org/html/rfc2387))
+Filename encoding                | Encoded Words ([RFC 2047](https://tools.ietf.org/html/rfc2047)), Encoded Word Extensions ([RFC 2231](https://tools.ietf.org/html/rfc2231))
 Identify server folders          | IMAP LIST Extension ([RFC 6154](https://tools.ietf.org/html/rfc6154))
 Push                             | IMAP IDLE ([RFC 2177](https://tools.ietf.org/html/rfc2177))
 Authorization                    | OAuth2 ([RFC 6749](https://tools.ietf.org/html/rfc6749))
--- a/test-data/message/attach_filename_apostrophed.eml
+++ b/test-data/message/attach_filename_apostrophed.eml
@@ -0,0 +1,24 @@
+Subject: Test apostrophed filenames
+Message-ID: 12345@testrun.org
+Date: Sat, 07 Dec 2019 19:00:27 +0000
+X-Mailer: Kopano 8.7.16
+To: recp@testrun.org
+From: sender@testrun.org
+Content-Type: multipart/mixed; boundary="==BREAK=="
+
+
+--==BREAK==
+Content-Type: text/plain; charset=utf-8
+
+apostrophed filenames as of
+https://tools.ietf.org/html/rfc2231
+
+--==BREAK==
+Content-Type: text/html
+Content-Disposition: attachment;
+ filename*=utf-8''Ma%C3%9Fnahmen%20Okt.%202021.html
+Content-Transfer-Encoding: base64
+
+PGh0bWw+PGJvZHk+dGV4dDwvYm9keT5kYXRh
+
+--==BREAK==--
--- a/test-data/message/attach_filename_apostrophed_cont.eml
+++ b/test-data/message/attach_filename_apostrophed_cont.eml
@@ -0,0 +1,34 @@
+Subject: Test apostrophed filenames
+Message-ID: 12345@testrun.org
+Date: Sat, 07 Dec 2019 19:00:27 +0000
+X-Mailer: Kopano 8.7.16
+To: recp@testrun.org
+From: sender@testrun.org
+Content-Type: multipart/mixed; boundary="==BREAK=="
+
+
+--==BREAK==
+Content-Type: text/plain; charset=utf-8
+
+apostrophed filenames as of
+https://tools.ietf.org/html/rfc2231,
+span over several header lines.
+
+note, that, in contrast to encoded-words,
+the character-set is not repeated.
+
+as a side-effect,
+this tests unquoted header attributes in filename*1*
+and lower-case-urlencoded utf-8
+
+--==BREAK==
+Content-Type: text/html
+Content-Disposition: attachment;
+ filename*0*="utf-8''Ma%C3%9Fna";
+ filename*1*=hm;
+ filename*2*="en%20M%c3%a4rz%202022.html";
+Content-Transfer-Encoding: base64
+
+PGh0bWw+PGJvZHk+dGV4dDwvYm9keT5kYXRh
+
+--==BREAK==--
--- a/test-data/message/attach_filename_apostrophed_cp1252.eml
+++ b/test-data/message/attach_filename_apostrophed_cp1252.eml
@@ -0,0 +1,23 @@
+Subject: Test apostrophed filenames
+Message-ID: 12345@testrun.org
+Date: Sat, 07 Dec 2019 19:00:27 +0000
+X-Mailer: Kopano 8.7.16
+To: recp@testrun.org
+From: sender@testrun.org
+Content-Type: multipart/mixed; boundary="==BREAK=="
+
+
+--==BREAK==
+Content-Type: text/plain; charset=utf-8
+
+testing cp1252 aka ANSI aka Windows-1252
+
+--==BREAK==
+Content-Type: text/html
+Content-Disposition: attachment;
+ filename*=Cp1252''Auftragsbest%E4tigung.pdf;
+Content-Transfer-Encoding: base64
+
+PGh0bWw+PGJvZHk+dGV4dDwvYm9keT5kYXRh
+
+--==BREAK==--
--- a/test-data/message/attach_filename_apostrophed_windows1251.eml
+++ b/test-data/message/attach_filename_apostrophed_windows1251.eml
@@ -0,0 +1,29 @@
+Subject: Test apostrophed filenames
+Message-ID: 12345@testrun.org
+Date: Sat, 07 Dec 2019 19:00:27 +0000
+X-Mailer: Kopano 8.7.16
+To: recp@testrun.org
+From: sender@testrun.org
+Content-Type: multipart/mixed; boundary="==BREAK=="
+
+
+--==BREAK==
+Content-Type: text/plain; charset=utf-8
+
+apostrophed filenames as of
+https://tools.ietf.org/html/rfc2231,
+testing non-utf-8 charset
+
+examples:
+%EF%F0%EE%E3%F0%E0%EC%EC%E8%F0%EE%E2%E0%ED%E8%E5 = программирование = programming
+
+--==BREAK==
+Content-Type: text/html
+Content-Disposition: attachment;
+ filename*0*=windows-1251''%EF%F0%EE%E3%F0%E0%EC%EC%E8%F0%EE%E2%E0;
+ filename*1*=%ED%E8%E5.HTM
+Content-Transfer-Encoding: base64
+
+PGh0bWw+PGJvZHk+dGV4dDwvYm9keT5kYXRh
+
+--==BREAK==--
--- a/test-data/message/attach_filename_encoded_words_bad_delimiter.eml
+++ b/test-data/message/attach_filename_encoded_words_bad_delimiter.eml
@@ -0,0 +1,29 @@
+Subject: Test encoded-words filenames
+Message-ID: 123456@testrun.org
+Date: Sat, 07 Dec 2019 19:00:27 +0000
+Chat-Version: 1.0
+To: recp@testrun.org
+From: sender@testrun.org
+Content-Type: multipart/mixed; boundary="==BREAK=="
+
+
+--==BREAK==
+Content-Type: text/plain; charset=utf-8
+
+there MUST be a space between encoded words and plain text,
+if there is none, decoding should return the original string
+https://tools.ietf.org/html/rfc2047 5.1:
+
+"Ordinary ASCII text and 'encoded-word's may appear together in the
+same header field.  However, an 'encoded-word' that appears in a
+header field defined as '*text' MUST be separated from any adjacent
+'encoded-word' or 'text' by 'linear-white-space'."
+
+--==BREAK==
+Content-Type: text/html
+Content-Disposition: attachment; filename="=?utf-8?q?foo?=.bar";
+Content-Transfer-Encoding: base64
+
+PGh0bWw+PGJvZHk+dGV4dDwvYm9keT5kYXRh
+
+--==BREAK==--
--- a/test-data/message/attach_filename_encoded_words_binary.eml
+++ b/test-data/message/attach_filename_encoded_words_binary.eml
@@ -0,0 +1,26 @@
+Subject: Test binary-encoded-words filenames
+Message-ID: 123456@testrun.org
+Date: Sat, 07 Dec 2019 19:00:27 +0000
+Chat-Version: 1.0
+To: recp@testrun.org
+From: sender@testrun.org
+Content-Type: multipart/mixed; boundary="==BREAK=="
+
+
+--==BREAK==
+Content-Type: text/plain; charset=utf-8
+
+test binary word-encoded filename,
+filename is " § 165 Abs" - note the leading space.
+
+as a side-effect, this also tests that the encoding-name
+also works in UPPERCASE.
+
+--==BREAK==
+Content-Type: text/html
+Content-Disposition: attachment; filename="=?UTF-8?B?IMKnIDE2NSBBYnM=?=";
+Content-Transfer-Encoding: base64
+
+PGh0bWw+PGJvZHk+dGV4dDwvYm9keT5kYXRh
+
+--==BREAK==--
--- a/test-data/message/attach_filename_encoded_words_windows1251.eml
+++ b/test-data/message/attach_filename_encoded_words_windows1251.eml
@@ -0,0 +1,31 @@
+Subject: Test encoded-words
+Message-ID: 123456@testrun.org
+Date: Sat, 07 Dec 2019 19:00:27 +0000
+Chat-Version: 1.0
+To: recp@testrun.org
+From: sender@testrun.org
+Content-Type: multipart/mixed; boundary="==BREAK=="
+
+
+--==BREAK==
+Content-Type: text/plain; charset=utf-8
+
+testing encoded-words filenames with windows-1251 (cyrillic) encoding.
+
+as a side-effect, this also tests that encoded words work together with
+plain text as long as they're separated by spaces, see
+https://tools.ietf.org/html/rfc2047 5.1:
+
+"Ordinary ASCII text and 'encoded-word's may appear together in the
+same header field.  However, an 'encoded-word' that appears in a
+header field defined as '*text' MUST be separated from any adjacent
+'encoded-word' or 'text' by 'linear-white-space'."
+
+--==BREAK==
+Content-Type: text/html
+Content-Disposition: attachment; filename="file =?Windows-1251?B?1/LuIO3u4u7j7g==?= 2020.pdf";
+Content-Transfer-Encoding: base64
+
+PGh0bWw+PGJvZHk+dGV4dDwvYm9keT5kYXRh
+
+--==BREAK==--