Fix #1804: remove <!doctype html> and accept invalid HTML

This fixes #1804 in two ways: First, it removes a <!doctype html> from the start of the mail, if there is one. Second, if quick-xml fails, it parses the HTML itself, just stripping everything between < and >. Either of these alone would have fixed this specific issue. Also, add tests for both fixes.
2026-04-27 18:36:30 +03:00 · 2020-08-19 12:20:01 +02:00
parent f1ec1a0765
commit 1a736ca6c3
3 changed files with 185 additions and 1 deletions
--- a/src/dc_receive_imf.rs
+++ b/src/dc_receive_imf.rs
@@ -2623,4 +2623,26 @@ mod tests {
        );
        assert_eq!(last_msg.from_id, DC_CONTACT_ID_INFO);
    }
+
+    #[async_std::test]
+    async fn test_html_only_mail() { // regression test for #1804, driven by the wrong-html.eml fixture
+        let t = TestContext::new_alice().await;
+        t.ctx
+            .set_config(Config::ShowEmails, Some("2")) // NOTE(review): "2" presumably makes plain emails show up in the chatlist — confirm
+            .await
+            .unwrap();
+        dc_receive_imf(
+            &t.ctx,
+            include_bytes!("../test-data/message/wrong-html.eml"), // raw mail bytes, embedded at compile time
+            "INBOX",
+            0,
+            false,
+        )
+        .await
+        .unwrap();
+        let chats = Chatlist::try_load(&t.ctx, 0, None, None).await.unwrap();
+        let msg_id = chats.get_msg_id(0).unwrap(); // message id of the chatlist entry at index 0
+        let msg = Message::load_from_db(&t.ctx, msg_id).await.unwrap();
+        assert_eq!(msg.text.unwrap(), "   Guten Abend,   \n\n   Lots of text   \n\n   text with Umlaut ä...   \n\n   MfG    [...]"); // expected plain text of the received message
+    }
 }
--- a/src/dehtml.rs
+++ b/src/dehtml.rs
@@ -25,7 +25,19 @@ enum AddText {
 // dehtml() returns way too many newlines; however, an optimisation on this issue is not needed as
 // the newlines are typically removed in further processing by the caller
 pub fn dehtml(buf: &str) -> String {
-    let buf = buf.trim();
+    let s = dehtml_quick_xml(buf); // first attempt: the quick-xml based conversion
+    if !s.trim().is_empty() {
+        return s;
+    }
+    let s = dehtml_manually(buf); // fallback: just strip everything between '<' and '>'
+    if !s.trim().is_empty() {
+        return s;
+    }
+    buf.to_string() // both attempts produced only whitespace: hand back the input unchanged
+}
+
+pub fn dehtml_quick_xml(buf: &str) -> String {
+    let buf = buf.trim().trim_start_matches("<!doctype html>");

    let mut dehtml = Dehtml {
        strbuilder: String::with_capacity(buf.len()),
@@ -171,6 +183,24 @@ fn dehtml_starttag_cb<B: std::io::BufRead>(
    }
 }

+pub fn dehtml_manually(buf: &str) -> String {
+    // Parser-free fallback: drop '<', '>' and every char between them; an unclosed '<' drops the rest of the input
+    let mut strbuilder = String::new();
+    let mut show_next_chars = true;
+    for c in buf.chars() {
+        match c {
+            '<' => show_next_chars = false,
+            '>' => show_next_chars = true,
+            _ => {
+                if show_next_chars {
+                    strbuilder.push(c)
+                }
+            }
+        }
+    }
+    strbuilder
+}
+
 #[cfg(test)]
 mod tests {
    use super::*;
@@ -254,4 +284,22 @@ mod tests {
        let txt = dehtml(input);
        assert_eq!(txt.trim(), "lots of text");
    }
+
+    #[test]
+    fn test_doctype_html() {
+        use crate::simplify::simplify;
+
+        let input = "<!doctype html>\n<b>fat text</b>"; // case 1: leading doctype declaration
+        let txt = simplify(dehtml(input), false).0;
+        assert_eq!(txt.trim(), "*fat text*"); // bold markup is expected to come out as *...*
+
+        let input = "<!some invalid html code>\n<b>some text</b>"; // case 2: malformed declaration
+        let txt = simplify(dehtml(input), false).0;
+        assert_eq!(txt.trim(), "some text");
+        // Invalid HTML: no formatting is expected here, but DC should at least show the text.
+
+        let input = "<This text is in brackets>"; // case 3: nothing outside the angle brackets
+        let txt = simplify(dehtml(input), false).0;
+        assert_eq!(txt.trim(), "<This text is in brackets>"); // expected back unchanged
+    }
 }