add option to access original message (#2125)

* draft API to deal with uncut message texts * add column mime_modified * add mime_modified flag to MimeParser and save it in the database * save mime_headers also when mime_modified is set * cargo fmt * set mime_modified on parsed html-texts and when there are multiple alternative-parts; add test for that * prototype functions, add to repl and ffi * use correct mime_modified flag * basically parse Mime-Structure to HTML * add basic tests for HTML-parsing * convert text/plain to html for getting original * respect charset for plain texts * make test more specific * fix handling non-utf-8 charsets for plain messages * add test for plain_to_html() * add failing test for plaintext linkify * linkify urls in plain text * fix regex * plain text linkify: add failing test for encapsulated links as <https://domain.com> * plain text linkify: make encapsulated links as <https://domain.com> work * plain text linkify: require word boundary at beginning of link, add tests for that * plain text linkify: linkify emails * plain text: support format=flowed * plain text: support quotes * make clippy happy * set mime-modified also when simplify() cuts non-html messages, add tests for that * streamline mime recursion * repl tool: write original html to file for further processing * convert cid:- to data:-protocol * add a test for cid: to data: conversion * make clippy happy * fix html-tests to work with windows-lineends * clarify what the returned html-code may contain * add some more detailed doc comments * add mime_modified column only if not exist this additional check is needed as the column may added with another dbversion in some shipped beta-versions. 
* incorporate documentation suggestions from review * rename get_original_mime_html() to more simple get_html() * rename api is_mime_modified() to more simple has_html(); internally, mime_modified-flag stays as-is, however * rename MimeS to MimeMultipartType * do not set mime-modified flag for encrypted messages that need extra-handling for saved mime-structure * fix typo * move get_msg_html() to MsgId.get_html() * incorporate more documentation suggestions from review * remove unused return value from collect_texts_recursive() * avoid mime_modified being mutable in write-parts-loop * move 'use futures::future::FutureExt' atop of html.rs * move attributes defining plain-text to a dedicated structure * more PlainText to separate file * escape cid when building regex * let dc_get_msg_html() return NULL when calling with bad param
2026-04-20 23:16:30 +03:00 · 2021-01-11 17:40:35 +01:00
parent bb9e6038c4
commit e2688f6355
21 changed files with 1141 additions and 47 deletions
--- a/src/html.rs
+++ b/src/html.rs
@@ -0,0 +1,396 @@
+//! # Get message as HTML.
+//!
+//! Use `Message.has_html()` to check if the UI shall render a
+//! corresponding button and `MsgId.get_html()` to get the full message.
+//!
+//! Even when the original mime-message is not HTML,
+//! `MsgId.get_html()` will return HTML -
+//! this allows nice quoting, handling linebreaks properly etc.
+use futures::future::FutureExt;
+use std::future::Future;
+use std::pin::Pin;
+
+use lettre_email::mime::{self, Mime};
+
+use crate::context::Context;
+use crate::error::Result;
+use crate::headerdef::{HeaderDef, HeaderDefMap};
+use crate::message::{Message, MsgId};
+use crate::mimeparser::parse_message_id;
+use crate::plaintext::PlainText;
+use mailparse::ParsedContentType;
+
+impl Message {
+    /// Check if the message can be retrieved as HTML.
+    /// This returns the `mime_modified` flag of the message; typically, the flag is set
+    /// when the mime structure of the message is modified, meaning that some text is cut
+    /// or the original message is in HTML and `simplify()` may hide some information.
+    /// The corresponding ffi-function is `dc_msg_has_html()`.
+    /// To get the HTML-code of the message, use `MsgId.get_html()`.
+    pub fn has_html(&self) -> bool {
+        self.mime_modified
+    }
+}
+
+/// Rough classification of the content type of a mime-part.
+/// While iterating over a mime structure, it is used to decide
+/// whether a part has subparts to descend into or is handled directly.
+enum MimeMultipartType {
+    Multiple, // mimetype starts with `multipart` and has a `boundary` parameter
+    Single, // any other mimetype
+    Message, // `message/rfc822`
+}
+
+/// Classifies the content type of a ParsedMail part.
+/// The mimetype is compared in lowercase: a type starting with `multipart`
+/// that also carries a `boundary` parameter is `Multiple`,
+/// `message/rfc822` is `Message` and everything else is `Single`.
+async fn get_mime_multipart_type(ctype: &ParsedContentType) -> MimeMultipartType {
+    let lowered = ctype.mimetype.to_lowercase();
+    let has_boundary = ctype.params.contains_key("boundary");
+    match lowered.as_str() {
+        m if m.starts_with("multipart") && has_boundary => MimeMultipartType::Multiple,
+        "message/rfc822" => MimeMultipartType::Message,
+        _ => MimeMultipartType::Single,
+    }
+}
+
+/// HtmlMsgParser converts a mime-message to HTML; instances are created by `from_bytes()`.
+#[derive(Debug)]
+struct HtmlMsgParser {
+    pub html: String, // the text/html part or, if there is none, HTML made from `plain`
+    pub plain: Option<PlainText>, // the last text/plain part found, if any
+}
+
+impl HtmlMsgParser {
+    /// Parses a raw mime-message and returns a parser
+    /// whose `html` field holds the main text of the message as HTML.
+    ///
+    /// If a non-empty text/html part was found, it is used and `cid:` image
+    /// references are replaced by `data:` urls; otherwise the text/plain part,
+    /// if any, is converted to HTML.
+    /// Returns an error if the mime structure cannot be parsed.
+    pub async fn from_bytes(context: &Context, rawmime: &[u8]) -> Result<Self> {
+        let mut parser = HtmlMsgParser {
+            html: String::new(),
+            plain: None,
+        };
+
+        let parsedmail = mailparse::parse_mail(rawmime)?;
+
+        parser.collect_texts_recursive(context, &parsedmail).await?;
+
+        if parser.html.is_empty() {
+            if let Some(plain) = &parser.plain {
+                parser.html = plain.to_html().await;
+            }
+        } else {
+            parser.cid_to_data_recursive(context, &parsedmail).await?;
+        }
+
+        Ok(parser)
+    }
+
+    /// Function iterates over all mime-parts recursively
+    /// and saves the decoded body of text/html parts in `self.html`
+    /// and of text/plain parts in `self.plain`.
+    /// If there are several parts of one type, the last one found wins;
+    /// usually, there is at most one plain-text and one HTML-text part.
+    ///
+    /// Nested `message/rfc822` parts are parsed and descended into.
+    /// As they are untrusted input, a parse failure is returned as an error
+    /// instead of panicking.
+    fn collect_texts_recursive<'a>(
+        &'a mut self,
+        context: &'a Context,
+        mail: &'a mailparse::ParsedMail<'a>,
+    ) -> Pin<Box<dyn Future<Output = Result<()>> + 'a + Send>> {
+        // Boxed future to deal with recursion
+        async move {
+            match get_mime_multipart_type(&mail.ctype).await {
+                MimeMultipartType::Multiple => {
+                    for cur_data in mail.subparts.iter() {
+                        self.collect_texts_recursive(context, cur_data).await?;
+                    }
+                    Ok(())
+                }
+                MimeMultipartType::Message => {
+                    let raw = mail.get_body_raw()?;
+                    if raw.is_empty() {
+                        return Ok(());
+                    }
+                    let mail = mailparse::parse_mail(&raw)?;
+                    self.collect_texts_recursive(context, &mail).await
+                }
+                MimeMultipartType::Single => {
+                    let mimetype = mail.ctype.mimetype.parse::<Mime>()?;
+                    if mimetype == mime::TEXT_HTML {
+                        // a part whose body cannot be decoded is skipped
+                        if let Ok(decoded_data) = mail.get_body() {
+                            self.html = decoded_data;
+                        }
+                    } else if mimetype == mime::TEXT_PLAIN {
+                        if let Ok(decoded_data) = mail.get_body() {
+                            let params = &mail.ctype.params;
+                            self.plain = Some(PlainText {
+                                text: decoded_data,
+                                flowed: params
+                                    .get("format")
+                                    .map_or(false, |v| v.eq_ignore_ascii_case("flowed")),
+                                delsp: params
+                                    .get("delsp")
+                                    .map_or(false, |v| v.eq_ignore_ascii_case("yes")),
+                            });
+                        }
+                    }
+                    Ok(())
+                }
+            }
+        }
+        .boxed()
+    }
+
+    /// Replace cid:-protocol by the data:-protocol where appropriate.
+    /// This allows the final html-file to be self-contained.
+    ///
+    /// Function iterates over all mime-parts recursively; for each image part
+    /// with a parsable `Content-Id`, `cid:<id>` inside `<img ...>` tags of `self.html`
+    /// is replaced by a base64 `data:` url of that image.
+    /// Image parts without a usable `Content-Id` or body are skipped;
+    /// a nested `message/rfc822` part that cannot be parsed results in an error.
+    fn cid_to_data_recursive<'a>(
+        &'a mut self,
+        context: &'a Context,
+        mail: &'a mailparse::ParsedMail<'a>,
+    ) -> Pin<Box<dyn Future<Output = Result<()>> + 'a + Send>> {
+        // Boxed future to deal with recursion
+        async move {
+            match get_mime_multipart_type(&mail.ctype).await {
+                MimeMultipartType::Multiple => {
+                    for cur_data in mail.subparts.iter() {
+                        self.cid_to_data_recursive(context, cur_data).await?;
+                    }
+                    Ok(())
+                }
+                MimeMultipartType::Message => {
+                    let raw = mail.get_body_raw()?;
+                    if raw.is_empty() {
+                        return Ok(());
+                    }
+                    // nested mails are untrusted input: report the error, do not panic
+                    let mail = mailparse::parse_mail(&raw)?;
+                    self.cid_to_data_recursive(context, &mail).await
+                }
+                MimeMultipartType::Single => {
+                    let mimetype = mail.ctype.mimetype.parse::<Mime>()?;
+                    if mimetype.type_() != mime::IMAGE {
+                        return Ok(());
+                    }
+                    let cid = mail
+                        .headers
+                        .get_header_value(HeaderDef::ContentId)
+                        .and_then(|cid| parse_message_id(&cid).ok());
+                    let cid = match cid {
+                        Some(cid) => cid,
+                        None => return Ok(()),
+                    };
+                    let replacement = match mimepart_to_data_url(mail).await {
+                        Ok(replacement) => replacement,
+                        Err(_) => return Ok(()),
+                    };
+                    let re_string = format!(
+                        "(<img[^>]*src[^>]*=[^>]*)(cid:{})([^>]*>)",
+                        regex::escape(&cid)
+                    );
+                    match regex::Regex::new(&re_string) {
+                        Ok(re) => {
+                            // A closure is used as replacer, so that a `$` inside
+                            // the data: url is never expanded as a group reference.
+                            let inlined = re.replace_all(&self.html, |caps: &regex::Captures| {
+                                format!("{}{}{}", &caps[1], replacement, &caps[3])
+                            });
+                            self.html = inlined.into_owned();
+                        }
+                        Err(e) => warn!(
+                            context,
+                            "Cannot create regex for cid: {} throws {}",
+                            re_string,
+                            e
+                        ),
+                    }
+                    Ok(())
+                }
+            }
+        }
+        .boxed()
+    }
+}
+
+/// Builds a data: url as defined in [RFC 2397](https://tools.ietf.org/html/rfc2397)
+/// from a mime part: the raw body of the part is base64-encoded
+/// and prefixed by the mimetype of the part.
+async fn mimepart_to_data_url(mail: &mailparse::ParsedMail<'_>) -> Result<String> {
+    let encoded = base64::encode(mail.get_body_raw()?);
+    let mimetype = &mail.ctype.mimetype;
+    Ok(format!("data:{};base64,{}", mimetype, encoded))
+}
+
+impl MsgId {
+    /// Returns the message with this id as HTML.
+    /// The raw mime is read from the `mime_headers` column of the message,
+    /// so that field has to be set; this is the case at least
+    /// when `Message.has_html()` returns true
+    /// (raw mime is not saved unconditionally in the database to save space).
+    /// If no raw mime is stored or it cannot be parsed,
+    /// a text starting with `parser error:` is returned instead of HTML.
+    /// The corresponding ffi-function is `dc_get_msg_html()`.
+    pub async fn get_html(self, context: &Context) -> String {
+        let stored: Option<String> = context
+            .sql
+            .query_get_value(
+                context,
+                "SELECT mime_headers FROM msgs WHERE id=?;",
+                paramsv![self],
+            )
+            .await;
+
+        let rawmime = match stored {
+            Some(rawmime) => rawmime,
+            None => return format!("parser error: no mime for {}", self),
+        };
+
+        HtmlMsgParser::from_bytes(context, rawmime.as_bytes())
+            .await
+            .map(|parser| parser.html)
+            .unwrap_or_else(|err| format!("parser error: {}", err))
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::test_utils::*;
+
+    /// A message without Content-Type is converted from plain text to HTML.
+    #[async_std::test]
+    async fn test_htmlparse_plain_unspecified() {
+        let t = TestContext::new().await;
+        let raw = include_bytes!("../test-data/message/text_plain_unspecified.eml");
+        let parser = HtmlMsgParser::from_bytes(&t.ctx, raw).await.unwrap();
+        assert_eq!(
+            parser.html,
+            r##"<!DOCTYPE html>
+<html><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8" /></head><body>
+This message does not have Content-Type nor Subject.<br/>
+<br/>
+</body></html>
+"##
+        );
+    }
+
+    /// Plain text in a non-UTF-8 charset ends up correctly in the generated HTML.
+    #[async_std::test]
+    async fn test_htmlparse_plain_iso88591() {
+        let t = TestContext::new().await;
+        let raw = include_bytes!("../test-data/message/text_plain_iso88591.eml");
+        let parser = HtmlMsgParser::from_bytes(&t.ctx, raw).await.unwrap();
+        assert_eq!(
+            parser.html,
+            r##"<!DOCTYPE html>
+<html><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8" /></head><body>
+message with a non-UTF-8 encoding: äöüßÄÖÜ<br/>
+<br/>
+</body></html>
+"##
+        );
+    }
+
+    /// `format=flowed` is detected and soft line breaks are merged.
+    #[async_std::test]
+    async fn test_htmlparse_plain_flowed() {
+        let t = TestContext::new().await;
+        let raw = include_bytes!("../test-data/message/text_plain_flowed.eml");
+        let parser = HtmlMsgParser::from_bytes(&t.ctx, raw).await.unwrap();
+        assert!(parser.plain.unwrap().flowed);
+        assert_eq!(
+            parser.html,
+            r##"<!DOCTYPE html>
+<html><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8" /></head><body>
+This line ends with a space and will be merged with the next one due to format=flowed.<br/>
+<br/>
+This line does not end with a space<br/>
+and will be wrapped as usual.<br/>
+<br/>
+</body></html>
+"##
+        );
+    }
+
+    /// A plain-text-only message is converted to HTML with special characters escaped.
+    #[async_std::test]
+    async fn test_htmlparse_alt_plain() {
+        let t = TestContext::new().await;
+        let raw = include_bytes!("../test-data/message/text_alt_plain.eml");
+        let parser = HtmlMsgParser::from_bytes(&t.ctx, raw).await.unwrap();
+        assert_eq!(
+            parser.html,
+            r##"<!DOCTYPE html>
+<html><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8" /></head><body>
+mime-modified should not be set set as there is no html and no special stuff;<br/>
+although not being a delta-message.<br/>
+test some special html-characters as &lt; &gt; and &amp; but also &quot; and &#x27; :)<br/>
+<br/>
+<br/>
+</body></html>
+"##
+        );
+    }
+
+    /// The HTML of an HTML message is returned as-is.
+    #[async_std::test]
+    async fn test_htmlparse_html() {
+        let t = TestContext::new().await;
+        let raw = include_bytes!("../test-data/message/text_html.eml");
+        let parser = HtmlMsgParser::from_bytes(&t.ctx, raw).await.unwrap();
+
+        // on windows, `\r\n` line endings are returned from mimeparser,
+        // however, rust multiline-strings use just `\n`;
+        // therefore, we just remove `\r` before comparison.
+        assert_eq!(
+            parser.html.replace("\r", ""),
+            r##"
+<html>
+  <p>mime-modified <b>set</b>; simplify is always regarded as lossy.</p>
+</html>"##
+        );
+    }
+
+    /// The HTML part of the message is returned as-is.
+    #[async_std::test]
+    async fn test_htmlparse_alt_html() {
+        let t = TestContext::new().await;
+        let raw = include_bytes!("../test-data/message/text_alt_html.eml");
+        let parser = HtmlMsgParser::from_bytes(&t.ctx, raw).await.unwrap();
+        assert_eq!(
+            parser.html.replace("\r", ""), // see comment in test_htmlparse_html()
+            r##"<html>
+  <p>mime-modified <b>set</b>; simplify is always regarded as lossy.</p>
+</html>
+
+"##
+        );
+    }
+
+    /// The HTML part is returned by the parser.
+    #[async_std::test]
+    async fn test_htmlparse_alt_plain_html() {
+        let t = TestContext::new().await;
+        let raw = include_bytes!("../test-data/message/text_alt_plain_html.eml");
+        let parser = HtmlMsgParser::from_bytes(&t.ctx, raw).await.unwrap();
+        assert_eq!(
+            parser.html.replace("\r", ""), // see comment in test_htmlparse_html()
+            r##"<html>
+  <p>
+    this is <b>html</b>
+  </p>
+</html>
+
+"##
+        );
+    }
+
+    /// `cid:` references to a related image part are replaced by `data:` urls.
+    #[async_std::test]
+    async fn test_htmlparse_apple_cid_jpg() {
+        // load raw mime html-data with related image-part (cid:)
+        // and make sure, Content-Id has angle-brackets that are removed correctly.
+        let t = TestContext::new().await;
+        let raw = include_bytes!("../test-data/message/apple_cid_jpg.eml");
+        let test = String::from_utf8_lossy(raw);
+        assert!(test.contains("Content-Id: <8AE052EF-BC90-486F-BB78-58D3590308EC@fritz.box>"));
+        assert!(test.contains("cid:8AE052EF-BC90-486F-BB78-58D3590308EC@fritz.box"));
+        assert!(!test.contains("data:"));
+
+        // parsing converts cid: to data:
+        let parser = HtmlMsgParser::from_bytes(&t.ctx, raw).await.unwrap();
+        assert!(parser.html.contains("<html>"));
+        assert!(!parser.html.contains("Content-Id:"));
+        assert!(parser.html.contains("data:image/jpeg;base64,/9j/4AAQ"));
+        assert!(!parser.html.contains("cid:"));
+    }
+}