refactor: make dc_dehtml() function safe

* Make dc_dehtml() function safe * Change type of is_msgrmsg parameter to bool * Narrow type of local variable in simplify_plain_text() * Export fewer fields of `Simplify' record * Demote is_cut_* from fields of `Simplify' to local variables * Refactor part of simplify_plain_text() Refactor footer ("-- " and similar) code into separate function, and re-implement it with standard Rust string methods. It simplifies code and allows removing one mutable local variable. * Replace dc_split_into_lines with String.split() * src/dc_simplify.rs(find_message_footer): adjust type signature to accept slice of &str, not slice of pointers * src/dc_simplify.rs(simplify_plain_text): adjust code to use '==' operator instead of strcmp(3). * src/dc_simplify.rs(is_empty_line, is_quoted_headline, is_plain_quote): + adjust type signatures to accept &str, not 'const char *' + remove no longer needed 'unsafe' qualifier * src/dc_tools.rs(dc_split_into_lines, dc_free_splitted_lines): remove no longer used functions. In addition to the added type-safety, this change reduces the number of allocations: String.split returns an iterator of &str. * Make simplify_plain_text() safe * Make Simplify.simplify return String, not pointer * Refactor Simplify.simplify to use String methods, not pointers * Make Simplify.simplify() safe * Avoid needless allocation in Simplify.simplify when input is html * Add tests for simplify utilities * Document discussion about is_empty_line()
2026-05-03 13:26:28 +03:00 · 2019-08-26 15:15:14 +00:00
parent 8a73f84003
commit d7d7147549
4 changed files with 127 additions and 233 deletions
--- a/src/dc_dehtml.rs
+++ b/src/dc_dehtml.rs
@@ -2,9 +2,6 @@ use lazy_static::lazy_static;
 use quick_xml;
 use quick_xml::events::{BytesEnd, BytesStart, BytesText};

-use crate::dc_tools::*;
-use crate::x::*;
-
 lazy_static! {
    static ref LINE_RE: regex::Regex = regex::Regex::new(r"(\r?\n)+").unwrap();
 }
@@ -24,19 +21,20 @@ enum AddText {

 // dc_dehtml() returns way too many lineends; however, an optimisation on this issue is not needed as
 // the lineends are typically remove in further processing by the caller
-pub unsafe fn dc_dehtml(buf_terminated: *mut libc::c_char) -> *mut libc::c_char {
-    dc_trim(buf_terminated);
-    if *buf_terminated.offset(0isize) as libc::c_int == 0i32 {
-        return dc_strdup(b"\x00" as *const u8 as *const libc::c_char);
+pub fn dc_dehtml(buf_terminated: &str) -> String {
+    let buf_terminated = buf_terminated.trim();
+
+    if buf_terminated.is_empty() {
+        return "".into();
    }

    let mut dehtml = Dehtml {
-        strbuilder: String::with_capacity(strlen(buf_terminated)),
+        strbuilder: String::with_capacity(buf_terminated.len()),
        add_text: AddText::YesRemoveLineEnds,
        last_href: None,
    };

-    let mut reader = quick_xml::Reader::from_str(as_str(buf_terminated));
+    let mut reader = quick_xml::Reader::from_str(buf_terminated);

    let mut buf = Vec::new();

@@ -61,7 +59,7 @@ pub unsafe fn dc_dehtml(buf_terminated: *mut libc::c_char) -> *mut libc::c_char
        buf.clear();
    }

-    dehtml.strbuilder.strdup()
+    dehtml.strbuilder
 }

 fn dehtml_text_cb(event: &BytesText, dehtml: &mut Dehtml) {
--- a/src/dc_mimeparser.rs
+++ b/src/dc_mimeparser.rs
@@ -1154,28 +1154,30 @@ unsafe fn dc_mimeparser_add_single_part_if_known(
                        if ok_to_continue {
                            /* check header directly as is_send_by_messenger is not yet set up */
                            let is_msgrmsg =
-                                (!dc_mimeparser_lookup_optional_field(&mimeparser, "Chat-Version")
-                                    .is_null()) as libc::c_int;
+                                !dc_mimeparser_lookup_optional_field(&mimeparser, "Chat-Version")
+                                    .is_null();

-                            let simplified_txt = simplifier.unwrap().simplify(
-                                decoded_data,
-                                decoded_data_bytes as libc::c_int,
-                                mime_type == 70i32,
-                                is_msgrmsg,
-                            );
-                            if !simplified_txt.is_null()
-                                && 0 != *simplified_txt.offset(0isize) as libc::c_int
-                            {
+                            let simplified_txt =
+                                if decoded_data_bytes <= 0 || decoded_data.is_null() {
+                                    "".into()
+                                } else {
+                                    let input_c = strndup(decoded_data, decoded_data_bytes as _);
+                                    let input = to_string_lossy(input_c);
+                                    let is_html = mime_type == 70;
+                                    free(input_c as *mut _);
+
+                                    simplifier.unwrap().simplify(&input, is_html, is_msgrmsg)
+                                };
+                            if !simplified_txt.is_empty() {
                                let mut part = dc_mimepart_new();
                                part.type_0 = 10i32;
                                part.int_mimetype = mime_type;
-                                part.msg = simplified_txt;
+                                part.msg = simplified_txt.strdup();
                                part.msg_raw =
                                    strndup(decoded_data, decoded_data_bytes as libc::c_ulong);
                                do_add_single_part(mimeparser, part);
-                            } else {
-                                free(simplified_txt as *mut libc::c_void);
                            }
+
                            if simplifier.unwrap().is_forwarded {
                                mimeparser.is_forwarded = 1i32
                            }
--- a/src/dc_simplify.rs
+++ b/src/dc_simplify.rs
@@ -1,64 +1,49 @@
 use crate::dc_dehtml::*;
-use crate::dc_tools::*;
-use crate::x::*;

 #[derive(Copy, Clone)]
 pub struct Simplify {
    pub is_forwarded: bool,
-    pub is_cut_at_begin: bool,
-    pub is_cut_at_end: bool,
+}
+
+/// Return index of footer line in vector of message lines, or vector length if
+/// no footer is found.
+///
+/// Also return whether a non-standard (rfc3676, §4.3) footer is found.
+fn find_message_footer(lines: &[&str]) -> (usize, bool) {
+    for ix in 0..lines.len() {
+        let line = lines[ix];
+
+        // quoted-printable may encode `-- ` to `-- =20` which is converted
+        // back to `--  `
+        match line.as_ref() {
+            "-- " | "--  " => return (ix, false),
+            "--" | "---" | "----" => return (ix, true),
+            _ => (),
+        }
+    }
+    return (lines.len(), false);
 }

 impl Simplify {
    pub fn new() -> Self {
        Simplify {
            is_forwarded: false,
-            is_cut_at_begin: false,
-            is_cut_at_end: false,
        }
    }

    /// Simplify and normalise text: Remove quotes, signatures, unnecessary
    /// lineends etc.
    /// The data returned from simplify() must be free()'d when no longer used.
-    pub unsafe fn simplify(
-        &mut self,
-        in_unterminated: *const libc::c_char,
-        in_bytes: libc::c_int,
-        is_html: bool,
-        is_msgrmsg: libc::c_int,
-    ) -> *mut libc::c_char {
-        if in_bytes <= 0 {
-            return "".strdup();
-        }
+    pub fn simplify(&mut self, input: &str, is_html: bool, is_msgrmsg: bool) -> String {
+        let mut out = if is_html {
+            dc_dehtml(input)
+        } else {
+            input.to_string()
+        };

-        /* create a copy of the given buffer */
-        let mut out: *mut libc::c_char;
-        let mut temp: *mut libc::c_char;
-        self.is_forwarded = false;
-        self.is_cut_at_begin = false;
-        self.is_cut_at_end = false;
-        out = strndup(
-            in_unterminated as *mut libc::c_char,
-            in_bytes as libc::c_ulong,
-        );
-        if out.is_null() {
-            return dc_strdup(b"\x00" as *const u8 as *const libc::c_char);
-        }
-        if is_html {
-            temp = dc_dehtml(out);
-            if !temp.is_null() {
-                free(out as *mut libc::c_void);
-                out = temp
-            }
-        }
-        dc_remove_cr_chars(out);
-        temp = self.simplify_plain_text(out, is_msgrmsg);
-        if !temp.is_null() {
-            free(out as *mut libc::c_void);
-            out = temp
-        }
-        dc_remove_cr_chars(out);
+        out.retain(|c| c != '\r');
+        out = self.simplify_plain_text(&out, is_msgrmsg);
+        out.retain(|c| c != '\r');

        out
    }
@@ -67,75 +52,48 @@ impl Simplify {
     * Simplify Plain Text
     */
    #[allow(non_snake_case)]
-    unsafe fn simplify_plain_text(
-        &mut self,
-        buf_terminated: *const libc::c_char,
-        is_msgrmsg: libc::c_int,
-    ) -> *mut libc::c_char {
+    fn simplify_plain_text(&mut self, buf_terminated: &str, is_msgrmsg: bool) -> String {
        /* This function ...
        ... removes all text after the line `-- ` (footer mark)
        ... removes full quotes at the beginning and at the end of the text -
            these are all lines starting with the character `>`
        ... remove a non-empty line before the removed quote (contains sth. like "On 2.9.2016, Bjoern wrote:" in different formats and lanugages) */
        /* split the given buffer into lines */
-        let lines = dc_split_into_lines(buf_terminated);
+        let lines: Vec<_> = buf_terminated.split('\n').collect();
        let mut l_first: usize = 0;
-        let mut l_last = lines.len();
-        let mut line: *mut libc::c_char;
-        let mut footer_mark: libc::c_int = 0i32;
-        for l in l_first..l_last {
-            line = lines[l];
-            if strcmp(line, b"-- \x00" as *const u8 as *const libc::c_char) == 0i32
-                || strcmp(line, b"--  \x00" as *const u8 as *const libc::c_char) == 0i32
-            {
-                footer_mark = 1i32
-            }
-            if strcmp(line, b"--\x00" as *const u8 as *const libc::c_char) == 0i32
-                || strcmp(line, b"---\x00" as *const u8 as *const libc::c_char) == 0i32
-                || strcmp(line, b"----\x00" as *const u8 as *const libc::c_char) == 0i32
-            {
-                footer_mark = 1i32;
-                self.is_cut_at_end = true
-            }
-            if 0 != footer_mark {
-                l_last = l;
-                /* done */
-                break;
-            }
-        }
+        let mut is_cut_at_begin = false;
+        let (mut l_last, mut is_cut_at_end) = find_message_footer(&lines);
+
        if l_last > l_first + 2 {
-            let line0: *mut libc::c_char = lines[l_first];
-            let line1: *mut libc::c_char = lines[l_first + 1];
-            let line2: *mut libc::c_char = lines[l_first + 2];
-            if strcmp(
-                line0,
-                b"---------- Forwarded message ----------\x00" as *const u8 as *const libc::c_char,
-            ) == 0i32
-                && strncmp(line1, b"From: \x00" as *const u8 as *const libc::c_char, 6) == 0i32
-                && *line2.offset(0isize) as libc::c_int == 0i32
+            let line0 = lines[l_first];
+            let line1 = lines[l_first + 1];
+            let line2 = lines[l_first + 2];
+            if line0 == "---------- Forwarded message ----------"
+                && line1.starts_with("From: ")
+                && line2.is_empty()
            {
                self.is_forwarded = true;
                l_first += 3
            }
        }
        for l in l_first..l_last {
-            line = lines[l];
-            if strncmp(line, b"-----\x00" as *const u8 as *const libc::c_char, 5) == 0i32
-                || strncmp(line, b"_____\x00" as *const u8 as *const libc::c_char, 5) == 0i32
-                || strncmp(line, b"=====\x00" as *const u8 as *const libc::c_char, 5) == 0i32
-                || strncmp(line, b"*****\x00" as *const u8 as *const libc::c_char, 5) == 0i32
-                || strncmp(line, b"~~~~~\x00" as *const u8 as *const libc::c_char, 5) == 0i32
+            let line = lines[l];
+            if line == "-----"
+                || line == "_____"
+                || line == "====="
+                || line == "*****"
+                || line == "~~~~~"
            {
                l_last = l;
-                self.is_cut_at_end = true;
+                is_cut_at_end = true;
                /* done */
                break;
            }
        }
-        if 0 == is_msgrmsg {
+        if !is_msgrmsg {
            let mut l_lastQuotedLine = None;
            for l in (l_first..l_last).rev() {
-                line = lines[l];
+                let line = lines[l];
                if is_plain_quote(line) {
                    l_lastQuotedLine = Some(l)
                } else if !is_empty_line(line) {
@@ -144,25 +102,25 @@ impl Simplify {
            }
            if l_lastQuotedLine.is_some() {
                l_last = l_lastQuotedLine.unwrap();
-                self.is_cut_at_end = true;
+                is_cut_at_end = true;
                if l_last > 1 {
                    if is_empty_line(lines[l_last - 1]) {
                        l_last -= 1
                    }
                }
                if l_last > 1 {
-                    line = lines[l_last - 1];
+                    let line = lines[l_last - 1];
                    if is_quoted_headline(line) {
                        l_last -= 1
                    }
                }
            }
        }
-        if 0 == is_msgrmsg {
+        if !is_msgrmsg {
            let mut l_lastQuotedLine_0 = None;
            let mut hasQuotedHeadline = 0;
            for l in l_first..l_last {
-                line = lines[l];
+                let line = lines[l];
                if is_plain_quote(line) {
                    l_lastQuotedLine_0 = Some(l)
                } else if !is_empty_line(line) {
@@ -179,19 +137,19 @@ impl Simplify {
            }
            if l_lastQuotedLine_0.is_some() {
                l_first = l_lastQuotedLine_0.unwrap() + 1;
-                self.is_cut_at_begin = true
+                is_cut_at_begin = true
            }
        }
        /* re-create buffer from the remaining lines */
        let mut ret = String::new();
-        if self.is_cut_at_begin {
+        if is_cut_at_begin {
            ret += "[...]";
        }
        /* we write empty lines only in case and non-empty line follows */
        let mut pending_linebreaks: libc::c_int = 0i32;
        let mut content_lines_added: libc::c_int = 0i32;
        for l in l_first..l_last {
-            line = lines[l];
+            let line = lines[l];
            if is_empty_line(line) {
                pending_linebreaks += 1
            } else {
@@ -205,142 +163,105 @@ impl Simplify {
                    }
                }
                // the incoming message might contain invalid UTF8
-                ret += &to_string_lossy(line);
+                ret += line;
                content_lines_added += 1;
                pending_linebreaks = 1i32
            }
        }
-        if self.is_cut_at_end && (!self.is_cut_at_begin || 0 != content_lines_added) {
+        if is_cut_at_end && (!is_cut_at_begin || 0 != content_lines_added) {
            ret += " [...]";
        }
-        dc_free_splitted_lines(lines);

-        ret.strdup()
+        ret
    }
 }

 /**
 * Tools
 */
-unsafe fn is_empty_line(buf: *const libc::c_char) -> bool {
-    /* force unsigned - otherwise the `> ' '` comparison will fail */
-    let mut p1: *const libc::c_uchar = buf as *const libc::c_uchar;
-    while 0 != *p1 {
-        if *p1 as libc::c_int > ' ' as i32 {
+fn is_empty_line(buf: &str) -> bool {
+    // XXX: can it be simplified to buf.chars().all(|c| c.is_whitespace())?
+    //
+    // Strictly speaking, it is not equivalent (^A is not whitespace, but less than ' '),
+    // but having control sequences in email body?!
+    //
+    // See discussion at: https://github.com/deltachat/deltachat-core-rust/pull/402#discussion_r317062392
+    for c in buf.chars() {
+        if c > ' ' {
            return false;
        }
-        p1 = p1.offset(1isize)
    }

    true
 }

-unsafe fn is_quoted_headline(buf: *const libc::c_char) -> bool {
+fn is_quoted_headline(buf: &str) -> bool {
    /* This function may be called for the line _directly_ before a quote.
    The function checks if the line contains sth. like "On 01.02.2016, xy@z wrote:" in various languages.
    - Currently, we simply check if the last character is a ':'.
    - Checking for the existence of an email address may fail (headlines may show the user's name instead of the address) */
-    let buf_len: libc::c_int = strlen(buf) as libc::c_int;
-    if buf_len > 80i32 {
-        return false;
-    }
-    if buf_len > 0i32 && *buf.offset((buf_len - 1i32) as isize) as libc::c_int == ':' as i32 {
-        return true;
-    }

-    false
+    buf.len() <= 80 && buf.ends_with(':')
 }

-unsafe fn is_plain_quote(buf: *const libc::c_char) -> bool {
-    if *buf.offset(0isize) as libc::c_int == '>' as i32 {
-        return true;
-    }
-
-    false
+fn is_plain_quote(buf: &str) -> bool {
+    buf.starts_with(">")
 }

 #[cfg(test)]
 mod tests {
    use super::*;
-    use std::ffi::CStr;

    #[test]
    fn test_simplify_trim() {
-        unsafe {
-            let mut simplify = Simplify::new();
-            let html: *const libc::c_char =
-                b"\r\r\nline1<br>\r\n\r\n\r\rline2\n\r\x00" as *const u8 as *const libc::c_char;
-            let plain: *mut libc::c_char =
-                simplify.simplify(html, strlen(html) as libc::c_int, true, 0);
+        let mut simplify = Simplify::new();
+        let html = "\r\r\nline1<br>\r\n\r\n\r\rline2\n\r";
+        let plain = simplify.simplify(html, true, false);

-            assert_eq!(
-                CStr::from_ptr(plain as *const libc::c_char)
-                    .to_str()
-                    .unwrap(),
-                "line1\nline2",
-            );
-
-            free(plain as *mut libc::c_void);
-        }
+        assert_eq!(plain, "line1\nline2");
    }

    #[test]
    fn test_simplify_parse_href() {
-        unsafe {
-            let mut simplify = Simplify::new();
-            let html: *const libc::c_char =
-                b"<a href=url>text</a\x00" as *const u8 as *const libc::c_char;
-            let plain: *mut libc::c_char =
-                simplify.simplify(html, strlen(html) as libc::c_int, true, 0);
+        let mut simplify = Simplify::new();
+        let html = "<a href=url>text</a";
+        let plain = simplify.simplify(html, true, false);

-            assert_eq!(
-                CStr::from_ptr(plain as *const libc::c_char)
-                    .to_str()
-                    .unwrap(),
-                "[text](url)",
-            );
-
-            free(plain as *mut libc::c_void);
-        }
+        assert_eq!(plain, "[text](url)");
    }

    #[test]
    fn test_simplify_bold_text() {
-        unsafe {
-            let mut simplify = Simplify::new();
-            let html: *const libc::c_char =
-                b"<!DOCTYPE name [<!DOCTYPE ...>]><!-- comment -->text <b><?php echo ... ?>bold</b><![CDATA[<>]]>\x00"
-                as *const u8 as *const libc::c_char;
-            let plain = simplify.simplify(html, strlen(html) as libc::c_int, true, 0);
+        let mut simplify = Simplify::new();
+        let html = "<!DOCTYPE name [<!DOCTYPE ...>]><!-- comment -->text <b><?php echo ... ?>bold</b><![CDATA[<>]]>";
+        let plain = simplify.simplify(html, true, false);

-            assert_eq!(
-                CStr::from_ptr(plain as *const libc::c_char)
-                    .to_str()
-                    .unwrap(),
-                "text *bold*<>",
-            );
-
-            free(plain as *mut libc::c_void);
-        }
+        assert_eq!(plain, "text *bold*<>");
    }

    #[test]
    fn test_simplify_html_encoded() {
-        unsafe {
-            let mut simplify = Simplify::new();
-            let html =
-                b"&lt;&gt;&quot;&apos;&amp; &auml;&Auml;&ouml;&Ouml;&uuml;&Uuml;&szlig; foo&AElig;&ccedil;&Ccedil; &diams;&lrm;&rlm;&zwnj;&noent;&zwj;\x00"
-                as *const u8 as *const libc::c_char;
-            let plain = simplify.simplify(html, strlen(html) as libc::c_int, true, 0);
+        let mut simplify = Simplify::new();
+        let html =
+                "&lt;&gt;&quot;&apos;&amp; &auml;&Auml;&ouml;&Ouml;&uuml;&Uuml;&szlig; foo&AElig;&ccedil;&Ccedil; &diams;&lrm;&rlm;&zwnj;&noent;&zwj;";

-            assert_eq!(
-                CStr::from_ptr(plain as *const libc::c_char)
-                    .to_str()
-                    .unwrap(),
-                "<>\"\'& äÄöÖüÜß fooÆçÇ \u{2666}\u{200e}\u{200f}\u{200c}&noent;\u{200d}"
-            );
+        let plain = simplify.simplify(html, true, false);

-            free(plain as *mut libc::c_void);
-        }
+        assert_eq!(
+            plain,
+            "<>\"\'& äÄöÖüÜß fooÆçÇ \u{2666}\u{200e}\u{200f}\u{200c}&noent;\u{200d}"
+        );
+    }
+
+    #[test]
+    fn test_simplify_utilities() {
+        assert!(is_empty_line(" \t"));
+        assert!(is_empty_line(""));
+        assert!(is_empty_line(" \r"));
+        assert!(!is_empty_line(" x"));
+        assert!(is_plain_quote("> hello world"));
+        assert!(is_plain_quote(">>"));
+        assert!(!is_plain_quote("Life is pain"));
+        assert!(!is_plain_quote(""));
    }
 }
--- a/src/dc_tools.rs
+++ b/src/dc_tools.rs
@@ -355,33 +355,6 @@ unsafe fn dc_utf8_strnlen(s: *const libc::c_char, n: size_t) -> size_t {
    j
 }

-/* split string into lines*/
-pub unsafe fn dc_split_into_lines(buf_terminated: *const libc::c_char) -> Vec<*mut libc::c_char> {
-    let mut lines = Vec::new();
-    let mut line_chars = 0;
-    let mut p1: *const libc::c_char = buf_terminated;
-    let mut line_start: *const libc::c_char = p1;
-    while 0 != *p1 {
-        if *p1 as libc::c_int == '\n' as i32 {
-            lines.push(strndup(line_start, line_chars));
-            p1 = p1.offset(1isize);
-            line_start = p1;
-            line_chars = 0;
-        } else {
-            p1 = p1.offset(1isize);
-            line_chars += 1;
-        }
-    }
-    lines.push(strndup(line_start, line_chars));
-    lines
-}
-
-pub unsafe fn dc_free_splitted_lines(lines: Vec<*mut libc::c_char>) {
-    for s in lines {
-        free(s as *mut libc::c_void);
-    }
-}
-
 pub unsafe fn dc_str_from_clist(
    list: *const clist,
    delimiter: *const libc::c_char,