diff --git a/src/dc_dehtml.rs b/src/dc_dehtml.rs index 049aaeda0..cdf66e824 100644 --- a/src/dc_dehtml.rs +++ b/src/dc_dehtml.rs @@ -2,9 +2,6 @@ use lazy_static::lazy_static; use quick_xml; use quick_xml::events::{BytesEnd, BytesStart, BytesText}; -use crate::dc_tools::*; -use crate::x::*; - lazy_static! { static ref LINE_RE: regex::Regex = regex::Regex::new(r"(\r?\n)+").unwrap(); } @@ -24,19 +21,20 @@ enum AddText { // dc_dehtml() returns way too many lineends; however, an optimisation on this issue is not needed as // the lineends are typically remove in further processing by the caller -pub unsafe fn dc_dehtml(buf_terminated: *mut libc::c_char) -> *mut libc::c_char { - dc_trim(buf_terminated); - if *buf_terminated.offset(0isize) as libc::c_int == 0i32 { - return dc_strdup(b"\x00" as *const u8 as *const libc::c_char); +pub fn dc_dehtml(buf_terminated: &str) -> String { + let buf_terminated = buf_terminated.trim(); + + if buf_terminated.is_empty() { + return "".into(); } let mut dehtml = Dehtml { - strbuilder: String::with_capacity(strlen(buf_terminated)), + strbuilder: String::with_capacity(buf_terminated.len()), add_text: AddText::YesRemoveLineEnds, last_href: None, }; - let mut reader = quick_xml::Reader::from_str(as_str(buf_terminated)); + let mut reader = quick_xml::Reader::from_str(buf_terminated); let mut buf = Vec::new(); @@ -61,7 +59,7 @@ pub unsafe fn dc_dehtml(buf_terminated: *mut libc::c_char) -> *mut libc::c_char buf.clear(); } - dehtml.strbuilder.strdup() + dehtml.strbuilder } fn dehtml_text_cb(event: &BytesText, dehtml: &mut Dehtml) { diff --git a/src/dc_mimeparser.rs b/src/dc_mimeparser.rs index dd97633aa..118c61258 100644 --- a/src/dc_mimeparser.rs +++ b/src/dc_mimeparser.rs @@ -1154,28 +1154,30 @@ unsafe fn dc_mimeparser_add_single_part_if_known( if ok_to_continue { /* check header directly as is_send_by_messenger is not yet set up */ let is_msgrmsg = - (!dc_mimeparser_lookup_optional_field(&mimeparser, "Chat-Version") - .is_null()) as libc::c_int; + !dc_mimeparser_lookup_optional_field(&mimeparser, "Chat-Version") + .is_null(); - let simplified_txt = simplifier.unwrap().simplify( - decoded_data, - decoded_data_bytes as libc::c_int, - mime_type == 70i32, - is_msgrmsg, - ); - if !simplified_txt.is_null() - && 0 != *simplified_txt.offset(0isize) as libc::c_int - { + let simplified_txt = + if decoded_data_bytes <= 0 || decoded_data.is_null() { + "".into() + } else { + let input_c = strndup(decoded_data, decoded_data_bytes as _); + let input = to_string_lossy(input_c); + let is_html = mime_type == 70; + free(input_c as *mut _); + + simplifier.unwrap().simplify(&input, is_html, is_msgrmsg) + }; + if !simplified_txt.is_empty() { let mut part = dc_mimepart_new(); part.type_0 = 10i32; part.int_mimetype = mime_type; - part.msg = simplified_txt; + part.msg = simplified_txt.strdup(); part.msg_raw = strndup(decoded_data, decoded_data_bytes as libc::c_ulong); do_add_single_part(mimeparser, part); - } else { - free(simplified_txt as *mut libc::c_void); } + if simplifier.unwrap().is_forwarded { mimeparser.is_forwarded = 1i32 } diff --git a/src/dc_simplify.rs b/src/dc_simplify.rs index ea7e17691..8d8fe4ed7 100644 --- a/src/dc_simplify.rs +++ b/src/dc_simplify.rs @@ -1,64 +1,49 @@ use crate::dc_dehtml::*; -use crate::dc_tools::*; -use crate::x::*; #[derive(Copy, Clone)] pub struct Simplify { pub is_forwarded: bool, - pub is_cut_at_begin: bool, - pub is_cut_at_end: bool, +} + +/// Return index of footer line in vector of message lines, or vector length if +/// no footer is found. +/// +/// Also return whether not-standard (rfc3676, §4.3) footer is found. +fn find_message_footer(lines: &[&str]) -> (usize, bool) { + for ix in 0..lines.len() { + let line = lines[ix]; + + // quoted-printable may encode `-- ` to `-- =20` which is converted + // back to `-- ` + match line.as_ref() { + "-- " | "-- " => return (ix, false), + "--" | "---" | "----" => return (ix, true), + _ => (), + } + } + return (lines.len(), false); } impl Simplify { pub fn new() -> Self { Simplify { is_forwarded: false, - is_cut_at_begin: false, - is_cut_at_end: false, } } /// Simplify and normalise text: Remove quotes, signatures, unnecessary /// lineends etc. /// The data returned from simplify() must be free()'d when no longer used. - pub unsafe fn simplify( - &mut self, - in_unterminated: *const libc::c_char, - in_bytes: libc::c_int, - is_html: bool, - is_msgrmsg: libc::c_int, - ) -> *mut libc::c_char { - if in_bytes <= 0 { - return "".strdup(); - } + pub fn simplify(&mut self, input: &str, is_html: bool, is_msgrmsg: bool) -> String { + let mut out = if is_html { + dc_dehtml(input) + } else { + input.to_string() + }; - /* create a copy of the given buffer */ - let mut out: *mut libc::c_char; - let mut temp: *mut libc::c_char; - self.is_forwarded = false; - self.is_cut_at_begin = false; - self.is_cut_at_end = false; - out = strndup( - in_unterminated as *mut libc::c_char, - in_bytes as libc::c_ulong, - ); - if out.is_null() { - return dc_strdup(b"\x00" as *const u8 as *const libc::c_char); - } - if is_html { - temp = dc_dehtml(out); - if !temp.is_null() { - free(out as *mut libc::c_void); - out = temp - } - } - dc_remove_cr_chars(out); - temp = self.simplify_plain_text(out, is_msgrmsg); - if !temp.is_null() { - free(out as *mut libc::c_void); - out = temp - } - dc_remove_cr_chars(out); + out.retain(|c| c != '\r'); + out = self.simplify_plain_text(&out, is_msgrmsg); + out.retain(|c| c != '\r'); out } @@ -67,75 +52,48 @@ impl Simplify { * Simplify Plain Text */ #[allow(non_snake_case)] - unsafe fn simplify_plain_text( - &mut self, - buf_terminated: *const libc::c_char, - is_msgrmsg: libc::c_int, - ) -> *mut libc::c_char { + fn simplify_plain_text(&mut self, buf_terminated: &str, is_msgrmsg: bool) -> String { /* This function ... ... removes all text after the line `-- ` (footer mark) ... removes full quotes at the beginning and at the end of the text - these are all lines starting with the character `>` ... remove a non-empty line before the removed quote (contains sth. like "On 2.9.2016, Bjoern wrote:" in different formats and lanugages) */ /* split the given buffer into lines */ - let lines = dc_split_into_lines(buf_terminated); + let lines: Vec<_> = buf_terminated.split('\n').collect(); let mut l_first: usize = 0; - let mut l_last = lines.len(); - let mut line: *mut libc::c_char; - let mut footer_mark: libc::c_int = 0i32; - for l in l_first..l_last { - line = lines[l]; - if strcmp(line, b"-- \x00" as *const u8 as *const libc::c_char) == 0i32 - || strcmp(line, b"-- \x00" as *const u8 as *const libc::c_char) == 0i32 - { - footer_mark = 1i32 - } - if strcmp(line, b"--\x00" as *const u8 as *const libc::c_char) == 0i32 - || strcmp(line, b"---\x00" as *const u8 as *const libc::c_char) == 0i32 - || strcmp(line, b"----\x00" as *const u8 as *const libc::c_char) == 0i32 - { - footer_mark = 1i32; - self.is_cut_at_end = true - } - if 0 != footer_mark { - l_last = l; - /* done */ - break; - } - } + let mut is_cut_at_begin = false; + let (mut l_last, mut is_cut_at_end) = find_message_footer(&lines); + if l_last > l_first + 2 { - let line0: *mut libc::c_char = lines[l_first]; - let line1: *mut libc::c_char = lines[l_first + 1]; - let line2: *mut libc::c_char = lines[l_first + 2]; - if strcmp( - line0, - b"---------- Forwarded message ----------\x00" as *const u8 as *const libc::c_char, - ) == 0i32 - && strncmp(line1, b"From: \x00" as *const u8 as *const libc::c_char, 6) == 0i32 - && *line2.offset(0isize) as libc::c_int == 0i32 + let line0 = lines[l_first]; + let line1 = lines[l_first + 1]; + let line2 = lines[l_first + 2]; + if line0 == "---------- Forwarded message ----------" + && line1.starts_with("From: ") + && line2.is_empty() { self.is_forwarded = true; l_first += 3 } } for l in l_first..l_last { - line = lines[l]; - if strncmp(line, b"-----\x00" as *const u8 as *const libc::c_char, 5) == 0i32 - || strncmp(line, b"_____\x00" as *const u8 as *const libc::c_char, 5) == 0i32 - || strncmp(line, b"=====\x00" as *const u8 as *const libc::c_char, 5) == 0i32 - || strncmp(line, b"*****\x00" as *const u8 as *const libc::c_char, 5) == 0i32 - || strncmp(line, b"~~~~~\x00" as *const u8 as *const libc::c_char, 5) == 0i32 + let line = lines[l]; + if line == "-----" + || line == "_____" + || line == "=====" + || line == "*****" + || line == "~~~~~" { l_last = l; - self.is_cut_at_end = true; + is_cut_at_end = true; /* done */ break; } } - if 0 == is_msgrmsg { + if !is_msgrmsg { let mut l_lastQuotedLine = None; for l in (l_first..l_last).rev() { - line = lines[l]; + let line = lines[l]; if is_plain_quote(line) { l_lastQuotedLine = Some(l) } else if !is_empty_line(line) { @@ -144,25 +102,25 @@ impl Simplify { } if l_lastQuotedLine.is_some() { l_last = l_lastQuotedLine.unwrap(); - self.is_cut_at_end = true; + is_cut_at_end = true; if l_last > 1 { if is_empty_line(lines[l_last - 1]) { l_last -= 1 } } if l_last > 1 { - line = lines[l_last - 1]; + let line = lines[l_last - 1]; if is_quoted_headline(line) { l_last -= 1 } } } } - if 0 == is_msgrmsg { + if !is_msgrmsg { let mut l_lastQuotedLine_0 = None; let mut hasQuotedHeadline = 0; for l in l_first..l_last { - line = lines[l]; + let line = lines[l]; if is_plain_quote(line) { l_lastQuotedLine_0 = Some(l) } else if !is_empty_line(line) { @@ -179,19 +137,19 @@ impl Simplify { } if l_lastQuotedLine_0.is_some() { l_first = l_lastQuotedLine_0.unwrap() + 1; - self.is_cut_at_begin = true + is_cut_at_begin = true } } /* re-create buffer from the remaining lines */ let mut ret = String::new(); - if self.is_cut_at_begin { + if is_cut_at_begin { ret += "[...]"; } /* we write empty lines only in case and non-empty line follows */ let mut pending_linebreaks: libc::c_int = 0i32; let mut content_lines_added: libc::c_int = 0i32; for l in l_first..l_last { - line = lines[l]; + let line = lines[l]; if is_empty_line(line) { pending_linebreaks += 1 } else { @@ -205,142 +163,105 @@ impl Simplify { } } // the incoming message might contain invalid UTF8 - ret += &to_string_lossy(line); + ret += line; content_lines_added += 1; pending_linebreaks = 1i32 } } - if self.is_cut_at_end && (!self.is_cut_at_begin || 0 != content_lines_added) { + if is_cut_at_end && (!is_cut_at_begin || 0 != content_lines_added) { ret += " [...]"; } - dc_free_splitted_lines(lines); - ret.strdup() + ret } } /** * Tools */ -unsafe fn is_empty_line(buf: *const libc::c_char) -> bool { - /* force unsigned - otherwise the `> ' '` comparison will fail */ - let mut p1: *const libc::c_uchar = buf as *const libc::c_uchar; - while 0 != *p1 { - if *p1 as libc::c_int > ' ' as i32 { +fn is_empty_line(buf: &str) -> bool { + // XXX: can it be simplified to buf.chars().all(|c| c.is_whitespace())? + // + // Strictly speaking, it is not equivalent (^A is not whitespace, but less than ' '), + // but having control sequences in email body?! + // + // See discussion at: https://github.com/deltachat/deltachat-core-rust/pull/402#discussion_r317062392 + for c in buf.chars() { + if c > ' ' { return false; } - p1 = p1.offset(1isize) } true } -unsafe fn is_quoted_headline(buf: *const libc::c_char) -> bool { +fn is_quoted_headline(buf: &str) -> bool { /* This function may be called for the line _directly_ before a quote. The function checks if the line contains sth. like "On 01.02.2016, xy@z wrote:" in various languages. - Currently, we simply check if the last character is a ':'. - Checking for the existence of an email address may fail (headlines may show the user's name instead of the address) */ - let buf_len: libc::c_int = strlen(buf) as libc::c_int; - if buf_len > 80i32 { - return false; - } - if buf_len > 0i32 && *buf.offset((buf_len - 1i32) as isize) as libc::c_int == ':' as i32 { - return true; - } - false + buf.len() <= 80 && buf.ends_with(':') } -unsafe fn is_plain_quote(buf: *const libc::c_char) -> bool { - if *buf.offset(0isize) as libc::c_int == '>' as i32 { - return true; - } - - false +fn is_plain_quote(buf: &str) -> bool { + buf.starts_with(">") } #[cfg(test)] mod tests { use super::*; - use std::ffi::CStr; #[test] fn test_simplify_trim() { - unsafe { - let mut simplify = Simplify::new(); - let html: *const libc::c_char = - b"\r\r\nline1
\r\n\r\n\r\rline2\n\r\x00" as *const u8 as *const libc::c_char; - let plain: *mut libc::c_char = - simplify.simplify(html, strlen(html) as libc::c_int, true, 0); + let mut simplify = Simplify::new(); + let html = "\r\r\nline1
\r\n\r\n\r\rline2\n\r"; + let plain = simplify.simplify(html, true, false); - assert_eq!( - CStr::from_ptr(plain as *const libc::c_char) - .to_str() - .unwrap(), - "line1\nline2", - ); - - free(plain as *mut libc::c_void); - } + assert_eq!(plain, "line1\nline2"); } #[test] fn test_simplify_parse_href() { - unsafe { - let mut simplify = Simplify::new(); - let html: *const libc::c_char = - b"text]>text bold]]>\x00" - as *const u8 as *const libc::c_char; - let plain = simplify.simplify(html, strlen(html) as libc::c_int, true, 0); + let mut simplify = Simplify::new(); + let html = "]>text bold]]>"; + let plain = simplify.simplify(html, true, false); - assert_eq!( - CStr::from_ptr(plain as *const libc::c_char) - .to_str() - .unwrap(), - "text *bold*<>", - ); - - free(plain as *mut libc::c_void); - } + assert_eq!(plain, "text *bold*<>"); } #[test] fn test_simplify_html_encoded() { - unsafe { - let mut simplify = Simplify::new(); - let html = - b"<>"'& äÄöÖüÜß fooÆçÇ ♦‎‏‌&noent;‍\x00" - as *const u8 as *const libc::c_char; - let plain = simplify.simplify(html, strlen(html) as libc::c_int, true, 0); + let mut simplify = Simplify::new(); + let html = + "<>"'& äÄöÖüÜß fooÆçÇ ♦‎‏‌&noent;‍"; - assert_eq!( - CStr::from_ptr(plain as *const libc::c_char) - .to_str() - .unwrap(), - "<>\"\'& äÄöÖüÜß fooÆçÇ \u{2666}\u{200e}\u{200f}\u{200c}&noent;\u{200d}" - ); + let plain = simplify.simplify(html, true, false); - free(plain as *mut libc::c_void); - } + assert_eq!( + plain, + "<>\"\'& äÄöÖüÜß fooÆçÇ \u{2666}\u{200e}\u{200f}\u{200c}&noent;\u{200d}" + ); + } + + #[test] + fn test_simplify_utilities() { + assert!(is_empty_line(" \t")); + assert!(is_empty_line("")); + assert!(is_empty_line(" \r")); + assert!(!is_empty_line(" x")); + assert!(is_plain_quote("> hello world")); + assert!(is_plain_quote(">>")); + assert!(!is_plain_quote("Life is pain")); + assert!(!is_plain_quote("")); } } diff --git a/src/dc_tools.rs b/src/dc_tools.rs index b500fcc55..0ef03eb20 100644 --- a/src/dc_tools.rs +++ b/src/dc_tools.rs @@ -355,33 +355,6 @@ unsafe fn dc_utf8_strnlen(s: *const libc::c_char, n: size_t) -> size_t { j } -/* split string into lines*/ -pub unsafe fn dc_split_into_lines(buf_terminated: *const libc::c_char) -> Vec<*mut libc::c_char> { - let mut lines = Vec::new(); - let mut line_chars = 0; - let mut p1: *const libc::c_char = buf_terminated; - let mut line_start: *const libc::c_char = p1; - while 0 != *p1 { - if *p1 as libc::c_int == '\n' as i32 { - lines.push(strndup(line_start, line_chars)); - p1 = p1.offset(1isize); - line_start = p1; - line_chars = 0; - } else { - p1 = p1.offset(1isize); - line_chars += 1; - } - } - lines.push(strndup(line_start, line_chars)); - lines -} - -pub unsafe fn dc_free_splitted_lines(lines: Vec<*mut libc::c_char>) { - for s in lines { - free(s as *mut libc::c_void); - } -} - pub unsafe fn dc_str_from_clist( list: *const clist, delimiter: *const libc::c_char,