Merge pull request #2094 from deltachat/fix-non-utf8-filenames

test and fix non-utf8 filenames
This commit is contained in:
bjoern
2020-12-04 16:32:03 +01:00
committed by GitHub
9 changed files with 347 additions and 36 deletions

View File

@@ -26,6 +26,7 @@ use crate::param::*;
use crate::peerstate::Peerstate;
use crate::simplify::*;
use crate::stock::StockMessage;
use charset::Charset;
use percent_encoding::percent_decode_str;
/// A parsed MIME message.
@@ -687,7 +688,7 @@ impl MimeMessage {
let (mime_type, msg_type) = get_mime_type(mail)?;
let raw_mime = mail.ctype.mimetype.to_lowercase();
let filename = get_attachment_filename(mail)?;
let filename = get_attachment_filename(context, mail)?;
let old_part_count = self.parts.len();
@@ -1275,7 +1276,10 @@ fn is_attachment_disposition(mail: &mailparse::ParsedMail<'_>) -> bool {
/// returned. If Content-Disposition is "attachment" but filename is
/// not specified, filename is guessed. If Content-Disposition cannot
/// be parsed, returns an error.
fn get_attachment_filename(mail: &mailparse::ParsedMail) -> Result<Option<String>> {
fn get_attachment_filename(
context: &Context,
mail: &mailparse::ParsedMail,
) -> Result<Option<String>> {
let ct = mail.get_content_disposition();
// try to get file name as "encoded-words" from
@@ -1291,7 +1295,7 @@ fn get_attachment_filename(mail: &mailparse::ParsedMail) -> Result<Option<String
desired_filename = ct
.params
.iter()
.filter(|(key, _value)| key.starts_with("filename"))
.filter(|(key, _value)| key.starts_with("filename*"))
.fold(None, |acc, (key, value)| {
if key.ends_with('*') {
apostrophe_encoded = true;
@@ -1303,13 +1307,30 @@ fn get_attachment_filename(mail: &mailparse::ParsedMail) -> Result<Option<String
}
});
if apostrophe_encoded {
// we're currently always assuming utf-8, this might need adaption, however, should not break things.
if let Some(name) = desired_filename {
desired_filename = if let Some(name) = name.splitn(3, '\'').last() {
Some(percent_decode_str(&name).decode_utf8_lossy().to_string())
} else {
None
}
let mut parts = name.splitn(3, '\'');
desired_filename =
if let (Some(charset), Some(value)) = (parts.next(), parts.last()) {
let decoded_bytes = percent_decode_str(&value);
if charset.to_lowercase() == "utf-8" {
Some(decoded_bytes.decode_utf8_lossy().to_string())
} else {
// the encoded_words crate says that latin-1 is not reported; moreover, latin1 is a good default
if let Some(charset) = Charset::for_label(charset.as_bytes())
.or_else(|| Charset::for_label(b"latin1"))
{
let decoded_bytes = decoded_bytes.collect::<Vec<u8>>();
let (utf8_str, _, _) = charset.decode(&*decoded_bytes);
Some(utf8_str.into())
} else {
warn!(context, "latin1 encoding does not exist");
None
}
}
} else {
warn!(context, "apostroped encoding invalid");
None
}
}
}
}
@@ -1462,53 +1483,146 @@ mod tests {
assert!(is_attachment_disposition(&mail.subparts[1]));
}
fn load_mail_with_attachment(raw: &[u8]) -> ParsedMail {
fn load_mail_with_attachment<'a>(t: &'a TestContext, raw: &'a [u8]) -> ParsedMail<'a> {
let mail = mailparse::parse_mail(raw).unwrap();
assert!(get_attachment_filename(&mail).unwrap().is_none());
assert!(get_attachment_filename(&mail.subparts[0])
assert!(get_attachment_filename(&t.ctx, &mail).unwrap().is_none());
assert!(get_attachment_filename(&t.ctx, &mail.subparts[0])
.unwrap()
.is_none());
mail
}
#[test]
fn test_get_attachment_filename() {
let mail = load_mail_with_attachment(include_bytes!(
"../test-data/message/attach_filename_simple.eml"
));
let filename = get_attachment_filename(&mail.subparts[1]).unwrap();
#[async_std::test]
async fn test_get_attachment_filename() {
let t = TestContext::new().await;
let mail = load_mail_with_attachment(
&t,
include_bytes!("../test-data/message/attach_filename_simple.eml"),
);
let filename = get_attachment_filename(&t.ctx, &mail.subparts[1]).unwrap();
assert_eq!(filename, Some("test.html".to_string()))
}
#[test]
fn test_get_attachment_filename_encoded_words() {
let mail = load_mail_with_attachment(include_bytes!(
"../test-data/message/attach_filename_encoded_words.eml"
));
let filename = get_attachment_filename(&mail.subparts[1]).unwrap();
#[async_std::test]
async fn test_get_attachment_filename_encoded_words() {
let t = TestContext::new().await;
let mail = load_mail_with_attachment(
&t,
include_bytes!("../test-data/message/attach_filename_encoded_words.eml"),
);
let filename = get_attachment_filename(&t.ctx, &mail.subparts[1]).unwrap();
assert_eq!(filename, Some("Maßnahmen Okt. 2020.html".to_string()))
}
#[test]
fn test_get_attachment_filename_encoded_words_cont() {
/// Loads `attach_filename_encoded_words_binary.eml` and checks that the filename
/// reported for the second subpart is `" § 165 Abs"`, including the leading space.
#[async_std::test]
async fn test_get_attachment_filename_encoded_words_binary() {
    let test_context = TestContext::new().await;
    let raw = include_bytes!("../test-data/message/attach_filename_encoded_words_binary.eml");
    let parsed = load_mail_with_attachment(&test_context, raw);
    // subparts[1] is the part whose filename this test asserts on
    let attachment = &parsed.subparts[1];
    let actual = get_attachment_filename(&test_context.ctx, attachment).unwrap();
    assert_eq!(actual, Some(String::from(" § 165 Abs")));
}
/// Loads `attach_filename_encoded_words_windows1251.eml` and checks that the
/// filename of the second subpart comes back as `"file Что нового 2020.pdf"`.
#[async_std::test]
async fn test_get_attachment_filename_encoded_words_windows1251() {
    let test_context = TestContext::new().await;
    let raw = include_bytes!("../test-data/message/attach_filename_encoded_words_windows1251.eml");
    let parsed = load_mail_with_attachment(&test_context, raw);
    let actual = get_attachment_filename(&test_context.ctx, &parsed.subparts[1]).unwrap();
    let expected = Some("file Что нового 2020.pdf".to_owned());
    assert_eq!(actual, expected);
}
#[async_std::test]
async fn test_get_attachment_filename_encoded_words_cont() {
// test continued encoded-words and also test that apostrophes work that way
let mail = load_mail_with_attachment(include_bytes!(
"../test-data/message/attach_filename_encoded_words_cont.eml"
));
let filename = get_attachment_filename(&mail.subparts[1]).unwrap();
let t = TestContext::new().await;
let mail = load_mail_with_attachment(
&t,
include_bytes!("../test-data/message/attach_filename_encoded_words_cont.eml"),
);
let filename = get_attachment_filename(&t.ctx, &mail.subparts[1]).unwrap();
assert_eq!(filename, Some("Maßn'ah'men Okt. 2020.html".to_string()))
}
#[test]
fn test_get_attachment_filename_combined() {
/// Loads `attach_filename_encoded_words_bad_delimiter.eml` and checks that the
/// filename of the second subpart is returned undecoded.
#[async_std::test]
async fn test_get_attachment_filename_encoded_words_bad_delimiter() {
    let test_context = TestContext::new().await;
    let raw =
        include_bytes!("../test-data/message/attach_filename_encoded_words_bad_delimiter.eml");
    let parsed = load_mail_with_attachment(&test_context, raw);
    let actual = get_attachment_filename(&test_context.ctx, &parsed.subparts[1]).unwrap();
    // The encoded-word is not followed by a space, so the raw string is expected
    // to come back unchanged rather than decoded.
    let expected = Some(String::from("=?utf-8?q?foo?=.bar"));
    assert_eq!(actual, expected);
}
/// Loads `attach_filename_apostrophed.eml` and checks that the filename of the
/// second subpart is `"Maßnahmen Okt. 2021.html"`.
#[async_std::test]
async fn test_get_attachment_filename_apostrophed() {
    let test_context = TestContext::new().await;
    let raw = include_bytes!("../test-data/message/attach_filename_apostrophed.eml");
    let parsed = load_mail_with_attachment(&test_context, raw);
    let attachment = &parsed.subparts[1];
    let actual = get_attachment_filename(&test_context.ctx, attachment).unwrap();
    assert_eq!(actual, Some(String::from("Maßnahmen Okt. 2021.html")));
}
/// Loads `attach_filename_apostrophed_cont.eml` and checks that the filename of
/// the second subpart is `"Maßnahmen März 2022.html"`.
#[async_std::test]
async fn test_get_attachment_filename_apostrophed_cont() {
    let test_context = TestContext::new().await;
    let raw = include_bytes!("../test-data/message/attach_filename_apostrophed_cont.eml");
    let parsed = load_mail_with_attachment(&test_context, raw);
    let actual = get_attachment_filename(&test_context.ctx, &parsed.subparts[1]).unwrap();
    let expected = Some("Maßnahmen März 2022.html".to_owned());
    assert_eq!(actual, expected);
}
/// Loads `attach_filename_apostrophed_windows1251.eml` and checks that the
/// filename of the second subpart is `"программирование.HTM"`.
#[async_std::test]
async fn test_get_attachment_filename_apostrophed_windows1251() {
    let test_context = TestContext::new().await;
    let raw = include_bytes!("../test-data/message/attach_filename_apostrophed_windows1251.eml");
    let parsed = load_mail_with_attachment(&test_context, raw);
    let attachment = &parsed.subparts[1];
    let actual = get_attachment_filename(&test_context.ctx, attachment).unwrap();
    assert_eq!(actual, Some(String::from("программирование.HTM")));
}
/// Loads `attach_filename_apostrophed_cp1252.eml` and checks that the filename
/// of the second subpart is `"Auftragsbestätigung.pdf"`.
#[async_std::test]
async fn test_get_attachment_filename_apostrophed_cp1252() {
    let test_context = TestContext::new().await;
    let raw = include_bytes!("../test-data/message/attach_filename_apostrophed_cp1252.eml");
    let parsed = load_mail_with_attachment(&test_context, raw);
    let actual = get_attachment_filename(&test_context.ctx, &parsed.subparts[1]).unwrap();
    let expected = Some("Auftragsbestätigung.pdf".to_owned());
    assert_eq!(actual, expected);
}
#[async_std::test]
async fn test_get_attachment_filename_combined() {
// test that if `filename` and `filename*0` are given, the filename is not doubled
let mail = load_mail_with_attachment(include_bytes!(
"../test-data/message/attach_filename_combined.eml"
));
let filename = get_attachment_filename(&mail.subparts[1]).unwrap();
let t = TestContext::new().await;
let mail = load_mail_with_attachment(
&t,
include_bytes!("../test-data/message/attach_filename_combined.eml"),
);
let filename = get_attachment_filename(&t.ctx, &mail.subparts[1]).unwrap();
assert_eq!(filename, Some("Maßnahmen Okt. 2020.html".to_string()))
}
/// Guards the fallback used by `get_attachment_filename()` for non-utf-8
/// charsets: the label "latin1" must be resolvable under exactly this name.
#[test]
fn test_charset_latin1() {
    let fallback = Charset::for_label(b"latin1");
    assert!(fallback.is_some());
}
#[test]
fn test_mailparse_content_type() {
let ctype =

View File

@@ -6,6 +6,7 @@ Tasks | Standards
---------------------------------|---------------------------------------------
Transport | IMAP v4 ([RFC 3501](https://tools.ietf.org/html/rfc3501)), SMTP ([RFC 5321](https://tools.ietf.org/html/rfc5321)) and Internet Message Format (IMF, [RFC 5322](https://tools.ietf.org/html/rfc5322))
Embedded media | MIME Document Series ([RFC 2045](https://tools.ietf.org/html/rfc2045), [RFC 2046](https://tools.ietf.org/html/rfc2046)), Content-Disposition Header ([RFC 2183](https://tools.ietf.org/html/rfc2183)), Multipart/Related ([RFC 2387](https://tools.ietf.org/html/rfc2387))
Filename encoding | Encoded Words ([RFC 2047](https://tools.ietf.org/html/rfc2047)), Encoded Word Extensions ([RFC 2231](https://tools.ietf.org/html/rfc2231))
Identify server folders | IMAP LIST Extension ([RFC 6154](https://tools.ietf.org/html/rfc6154))
Push | IMAP IDLE ([RFC 2177](https://tools.ietf.org/html/rfc2177))
Authorization | OAuth2 ([RFC 6749](https://tools.ietf.org/html/rfc6749))

View File

@@ -0,0 +1,24 @@
Subject: Test apostrophed filenames
Message-ID: 12345@testrun.org
Date: Sat, 07 Dec 2019 19:00:27 +0000
X-Mailer: Kopano 8.7.16
To: recp@testrun.org
From: sender@testrun.org
Content-Type: multipart/mixed; boundary="==BREAK=="
--==BREAK==
Content-Type: text/plain; charset=utf-8
apostrophed filenames as of
https://tools.ietf.org/html/rfc2231
--==BREAK==
Content-Type: text/html
Content-Disposition: attachment;
filename*=utf-8''Ma%C3%9Fnahmen%20Okt.%202021.html
Content-Transfer-Encoding: base64
PGh0bWw+PGJvZHk+dGV4dDwvYm9keT5kYXRh
--==BREAK==--

View File

@@ -0,0 +1,34 @@
Subject: Test apostrophed filenames
Message-ID: 12345@testrun.org
Date: Sat, 07 Dec 2019 19:00:27 +0000
X-Mailer: Kopano 8.7.16
To: recp@testrun.org
From: sender@testrun.org
Content-Type: multipart/mixed; boundary="==BREAK=="
--==BREAK==
Content-Type: text/plain; charset=utf-8
apostrophed filenames as of
https://tools.ietf.org/html/rfc2231,
span over several header lines.
note, that, in contrast to encoded-words,
the character-set is not repeated.
as a side-effect,
this tests unquoted header attributes in filename*1*
and lower-case-urlencoded utf-8
--==BREAK==
Content-Type: text/html
Content-Disposition: attachment;
filename*0*="utf-8''Ma%C3%9Fna";
filename*1*=hm;
filename*2*="en%20M%c3%a4rz%202022.html";
Content-Transfer-Encoding: base64
PGh0bWw+PGJvZHk+dGV4dDwvYm9keT5kYXRh
--==BREAK==--

View File

@@ -0,0 +1,23 @@
Subject: Test apostrophed filenames
Message-ID: 12345@testrun.org
Date: Sat, 07 Dec 2019 19:00:27 +0000
X-Mailer: Kopano 8.7.16
To: recp@testrun.org
From: sender@testrun.org
Content-Type: multipart/mixed; boundary="==BREAK=="
--==BREAK==
Content-Type: text/plain; charset=utf-8
testing cp1252 aka ANSI aka Windows-1252
--==BREAK==
Content-Type: text/html
Content-Disposition: attachment;
filename*=Cp1252''Auftragsbest%E4tigung.pdf;
Content-Transfer-Encoding: base64
PGh0bWw+PGJvZHk+dGV4dDwvYm9keT5kYXRh
--==BREAK==--

View File

@@ -0,0 +1,29 @@
Subject: Test apostrophed filenames
Message-ID: 12345@testrun.org
Date: Sat, 07 Dec 2019 19:00:27 +0000
X-Mailer: Kopano 8.7.16
To: recp@testrun.org
From: sender@testrun.org
Content-Type: multipart/mixed; boundary="==BREAK=="
--==BREAK==
Content-Type: text/plain; charset=utf-8
apostrophed filenames as of
https://tools.ietf.org/html/rfc2231,
testing non-utf-8 charset
examples:
%EF%F0%EE%E3%F0%E0%EC%EC%E8%F0%EE%E2%E0%ED%E8%E5 = программирование = programming
--==BREAK==
Content-Type: text/html
Content-Disposition: attachment;
filename*0*=windows-1251''%EF%F0%EE%E3%F0%E0%EC%EC%E8%F0%EE%E2%E0;
filename*1*=%ED%E8%E5.HTM
Content-Transfer-Encoding: base64
PGh0bWw+PGJvZHk+dGV4dDwvYm9keT5kYXRh
--==BREAK==--

View File

@@ -0,0 +1,29 @@
Subject: Test encoded-words filenames
Message-ID: 123456@testrun.org
Date: Sat, 07 Dec 2019 19:00:27 +0000
Chat-Version: 1.0
To: recp@testrun.org
From: sender@testrun.org
Content-Type: multipart/mixed; boundary="==BREAK=="
--==BREAK==
Content-Type: text/plain; charset=utf-8
there MUST be a space between encoded words and plain text,
if there is none, decoding should return the original string
https://tools.ietf.org/html/rfc2047 5.1:
"Ordinary ASCII text and 'encoded-word's may appear together in the
same header field. However, an 'encoded-word' that appears in a
header field defined as '*text' MUST be separated from any adjacent
'encoded-word' or 'text' by 'linear-white-space'."
--==BREAK==
Content-Type: text/html
Content-Disposition: attachment; filename="=?utf-8?q?foo?=.bar";
Content-Transfer-Encoding: base64
PGh0bWw+PGJvZHk+dGV4dDwvYm9keT5kYXRh
--==BREAK==--

View File

@@ -0,0 +1,26 @@
Subject: Test binary-encoded-words filenames
Message-ID: 123456@testrun.org
Date: Sat, 07 Dec 2019 19:00:27 +0000
Chat-Version: 1.0
To: recp@testrun.org
From: sender@testrun.org
Content-Type: multipart/mixed; boundary="==BREAK=="
--==BREAK==
Content-Type: text/plain; charset=utf-8
test binary word-encoded filename,
filename is " § 165 Abs" - note the leading space.
as a side-effect, this also tests that the encoding-name
also works in UPPERCASE.
--==BREAK==
Content-Type: text/html
Content-Disposition: attachment; filename="=?UTF-8?B?IMKnIDE2NSBBYnM=?=";
Content-Transfer-Encoding: base64
PGh0bWw+PGJvZHk+dGV4dDwvYm9keT5kYXRh
--==BREAK==--

View File

@@ -0,0 +1,31 @@
Subject: Test encoded-words
Message-ID: 123456@testrun.org
Date: Sat, 07 Dec 2019 19:00:27 +0000
Chat-Version: 1.0
To: recp@testrun.org
From: sender@testrun.org
Content-Type: multipart/mixed; boundary="==BREAK=="
--==BREAK==
Content-Type: text/plain; charset=utf-8
testing encoded-words filenames with windows-1251 (cyrillic) encoding.
as a side-effect, this also tests that encoded words work together with
plain text as long as they're separated by spaces, see
https://tools.ietf.org/html/rfc2047 5.1:
"Ordinary ASCII text and 'encoded-word's may appear together in the
same header field. However, an 'encoded-word' that appears in a
header field defined as '*text' MUST be separated from any adjacent
'encoded-word' or 'text' by 'linear-white-space'."
--==BREAK==
Content-Type: text/html
Content-Disposition: attachment; filename="file =?Windows-1251?B?1/LuIO3u4u7j7g==?= 2020.pdf";
Content-Transfer-Encoding: base64
PGh0bWw+PGJvZHk+dGV4dDwvYm9keT5kYXRh
--==BREAK==--