mirror of
https://github.com/chatmail/core.git
synced 2026-04-27 18:36:30 +03:00
Fix #1804: remove <!doctype html> and accept invalid HTML
This fixes #1804 in two ways: First, it removes a <!doctype html> from the start of the mail, if there is one. Then, it parses the HTML itself if quick-xml fails, simply stripping everything between < and >. Either of these would have fixed this specific issue. Also, add tests for both fixes.
This commit is contained in:
@@ -2623,4 +2623,26 @@ mod tests {
|
||||
);
|
||||
assert_eq!(last_msg.from_id, DC_CONTACT_ID_INFO);
|
||||
}
|
||||
|
||||
#[async_std::test]
|
||||
async fn test_html_only_mail() {
|
||||
let t = TestContext::new_alice().await;
|
||||
t.ctx
|
||||
.set_config(Config::ShowEmails, Some("2"))
|
||||
.await
|
||||
.unwrap();
|
||||
dc_receive_imf(
|
||||
&t.ctx,
|
||||
include_bytes!("../test-data/message/wrong-html.eml"),
|
||||
"INBOX",
|
||||
0,
|
||||
false,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
let chats = Chatlist::try_load(&t.ctx, 0, None, None).await.unwrap();
|
||||
let msg_id = chats.get_msg_id(0).unwrap();
|
||||
let msg = Message::load_from_db(&t.ctx, msg_id).await.unwrap();
|
||||
assert_eq!(msg.text.unwrap(), " Guten Abend, \n\n Lots of text \n\n text with Umlaut ä... \n\n MfG [...]");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -25,7 +25,19 @@ enum AddText {
|
||||
// dehtml() returns way too many newlines; however, an optimisation on this issue is not needed as
|
||||
// the newlines are typically removed in further processing by the caller
|
||||
pub fn dehtml(buf: &str) -> String {
|
||||
let buf = buf.trim();
|
||||
let s = dehtml_quick_xml(buf);
|
||||
if !s.trim().is_empty() {
|
||||
return s;
|
||||
}
|
||||
let s = dehtml_manually(buf);
|
||||
if !s.trim().is_empty() {
|
||||
return s;
|
||||
}
|
||||
buf.to_string()
|
||||
}
|
||||
|
||||
pub fn dehtml_quick_xml(buf: &str) -> String {
|
||||
let buf = buf.trim().trim_start_matches("<!doctype html>");
|
||||
|
||||
let mut dehtml = Dehtml {
|
||||
strbuilder: String::with_capacity(buf.len()),
|
||||
@@ -171,6 +183,24 @@ fn dehtml_starttag_cb<B: std::io::BufRead>(
|
||||
}
|
||||
}
|
||||
|
||||
/// Crude HTML-to-text conversion that does not parse the markup at all.
///
/// Every character from a `<` up to and including the next `>` is dropped;
/// everything else is copied verbatim. The `<` and `>` characters themselves
/// never appear in the output, even when they are unbalanced.
pub fn dehtml_manually(buf: &str) -> String {
    // Tracks whether the scan is currently between a `<` and its closing `>`.
    let mut inside_tag = false;

    buf.chars()
        .filter(|&ch| match ch {
            '<' => {
                inside_tag = true;
                false
            }
            '>' => {
                inside_tag = false;
                false
            }
            _ => !inside_tag,
        })
        .collect()
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
@@ -254,4 +284,22 @@ mod tests {
|
||||
let txt = dehtml(input);
|
||||
assert_eq!(txt.trim(), "lots of text");
|
||||
}
|
||||
|
||||
/// Runs `dehtml` followed by `simplify` on inputs that start with a
/// `<!...>` declaration or consist solely of bracketed text.
#[test]
fn test_doctype_html() {
    use crate::simplify::simplify;

    // Each entry is (raw input, expected text after dehtml + simplify + trim).
    let cases = [
        ("<!doctype html>\n<b>fat text</b>", "*fat text*"),
        // at least DC should show the text if the html is invalid
        ("<!some invalid html code>\n<b>some text</b>", "some text"),
        ("<This text is in brackets>", "<This text is in brackets>"),
    ];

    for &(input, expected) in cases.iter() {
        let txt = simplify(dehtml(input), false).0;
        assert_eq!(txt.trim(), expected);
    }
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user