refactor: make dc_dehtml() function safe

* Make dc_dehtml() function safe

* Change type of is_msgrmsg parameter to bool

* Narrow type of local variable in simplify_plain_text()

* Export fewer fields of `Simplify' record

* Demote is_cut_* from fields of `Simplify' to local variables

* Refactor part of simplify_plain_text()

Refactor footer ("-- " and similar) code into separate function,
and re-implement it with standard Rust string methods.

It simplifies code and allows removing one mutable local variable.

* Replace dc_split_into_lines with String.split()

  * src/dc_simplify.rs(find_message_footer): adjust type signature to accept
    slice of &str, not slice of pointers

  * src/dc_simplify.rs(simplify_plain_text): adjust code to use '==' operator
    instead of strcmp(3).

  * src/dc_simplify.rs(is_empty_line, is_quoted_headline, is_plain_quote):
    + adjust type signatures to accept &str, not 'const char *'
    + remove no longer needed 'unsafe' qualifier

  * src/dc_tools(dc_split_into_lines, dc_free_splitted_lines): remove no longer
    used functions.

Besides improving type safety, this change reduces the number of
allocations: String.split returns an iterator of &str.

* Make simplify_plain_text() safe

* Make Simplify.simplify return String, not pointer

* Refactor Simplify.simplify to use String methods, not pointers

* Make Simplify.simplify() safe

* Avoid needless allocation in Simplify.simplify when input is html

* Add tests for simplify utilities

* Document discussion about is_empty_line()
This commit is contained in:
Dmitry Bogatov
2019-08-26 15:15:14 +00:00
committed by Friedel Ziegelmayer
parent 8a73f84003
commit d7d7147549
4 changed files with 127 additions and 233 deletions

View File

@@ -2,9 +2,6 @@ use lazy_static::lazy_static;
use quick_xml; use quick_xml;
use quick_xml::events::{BytesEnd, BytesStart, BytesText}; use quick_xml::events::{BytesEnd, BytesStart, BytesText};
use crate::dc_tools::*;
use crate::x::*;
lazy_static! { lazy_static! {
static ref LINE_RE: regex::Regex = regex::Regex::new(r"(\r?\n)+").unwrap(); static ref LINE_RE: regex::Regex = regex::Regex::new(r"(\r?\n)+").unwrap();
} }
@@ -24,19 +21,20 @@ enum AddText {
// dc_dehtml() returns way too many lineends; however, an optimisation on this issue is not needed as // dc_dehtml() returns way too many lineends; however, an optimisation on this issue is not needed as
// the lineends are typically remove in further processing by the caller // the lineends are typically remove in further processing by the caller
pub unsafe fn dc_dehtml(buf_terminated: *mut libc::c_char) -> *mut libc::c_char { pub fn dc_dehtml(buf_terminated: &str) -> String {
dc_trim(buf_terminated); let buf_terminated = buf_terminated.trim();
if *buf_terminated.offset(0isize) as libc::c_int == 0i32 {
return dc_strdup(b"\x00" as *const u8 as *const libc::c_char); if buf_terminated.is_empty() {
return "".into();
} }
let mut dehtml = Dehtml { let mut dehtml = Dehtml {
strbuilder: String::with_capacity(strlen(buf_terminated)), strbuilder: String::with_capacity(buf_terminated.len()),
add_text: AddText::YesRemoveLineEnds, add_text: AddText::YesRemoveLineEnds,
last_href: None, last_href: None,
}; };
let mut reader = quick_xml::Reader::from_str(as_str(buf_terminated)); let mut reader = quick_xml::Reader::from_str(buf_terminated);
let mut buf = Vec::new(); let mut buf = Vec::new();
@@ -61,7 +59,7 @@ pub unsafe fn dc_dehtml(buf_terminated: *mut libc::c_char) -> *mut libc::c_char
buf.clear(); buf.clear();
} }
dehtml.strbuilder.strdup() dehtml.strbuilder
} }
fn dehtml_text_cb(event: &BytesText, dehtml: &mut Dehtml) { fn dehtml_text_cb(event: &BytesText, dehtml: &mut Dehtml) {

View File

@@ -1154,28 +1154,30 @@ unsafe fn dc_mimeparser_add_single_part_if_known(
if ok_to_continue { if ok_to_continue {
/* check header directly as is_send_by_messenger is not yet set up */ /* check header directly as is_send_by_messenger is not yet set up */
let is_msgrmsg = let is_msgrmsg =
(!dc_mimeparser_lookup_optional_field(&mimeparser, "Chat-Version") !dc_mimeparser_lookup_optional_field(&mimeparser, "Chat-Version")
.is_null()) as libc::c_int; .is_null();
let simplified_txt = simplifier.unwrap().simplify( let simplified_txt =
decoded_data, if decoded_data_bytes <= 0 || decoded_data.is_null() {
decoded_data_bytes as libc::c_int, "".into()
mime_type == 70i32, } else {
is_msgrmsg, let input_c = strndup(decoded_data, decoded_data_bytes as _);
); let input = to_string_lossy(input_c);
if !simplified_txt.is_null() let is_html = mime_type == 70;
&& 0 != *simplified_txt.offset(0isize) as libc::c_int free(input_c as *mut _);
{
simplifier.unwrap().simplify(&input, is_html, is_msgrmsg)
};
if !simplified_txt.is_empty() {
let mut part = dc_mimepart_new(); let mut part = dc_mimepart_new();
part.type_0 = 10i32; part.type_0 = 10i32;
part.int_mimetype = mime_type; part.int_mimetype = mime_type;
part.msg = simplified_txt; part.msg = simplified_txt.strdup();
part.msg_raw = part.msg_raw =
strndup(decoded_data, decoded_data_bytes as libc::c_ulong); strndup(decoded_data, decoded_data_bytes as libc::c_ulong);
do_add_single_part(mimeparser, part); do_add_single_part(mimeparser, part);
} else {
free(simplified_txt as *mut libc::c_void);
} }
if simplifier.unwrap().is_forwarded { if simplifier.unwrap().is_forwarded {
mimeparser.is_forwarded = 1i32 mimeparser.is_forwarded = 1i32
} }

View File

@@ -1,64 +1,49 @@
use crate::dc_dehtml::*; use crate::dc_dehtml::*;
use crate::dc_tools::*;
use crate::x::*;
#[derive(Copy, Clone)] #[derive(Copy, Clone)]
pub struct Simplify { pub struct Simplify {
pub is_forwarded: bool, pub is_forwarded: bool,
pub is_cut_at_begin: bool, }
pub is_cut_at_end: bool,
/// Return index of footer line in vector of message lines, or vector length if
/// no footer is found.
///
/// Also return whether not-standard (rfc3676, §4.3) footer is found.
fn find_message_footer(lines: &[&str]) -> (usize, bool) {
for ix in 0..lines.len() {
let line = lines[ix];
// quoted-printable may encode `-- ` to `-- =20` which is converted
// back to `-- `
match line.as_ref() {
"-- " | "-- " => return (ix, false),
"--" | "---" | "----" => return (ix, true),
_ => (),
}
}
return (lines.len(), false);
} }
impl Simplify { impl Simplify {
pub fn new() -> Self { pub fn new() -> Self {
Simplify { Simplify {
is_forwarded: false, is_forwarded: false,
is_cut_at_begin: false,
is_cut_at_end: false,
} }
} }
/// Simplify and normalise text: Remove quotes, signatures, unnecessary /// Simplify and normalise text: Remove quotes, signatures, unnecessary
/// lineends etc. /// lineends etc.
/// The data returned from simplify() must be free()'d when no longer used. /// The data returned from simplify() must be free()'d when no longer used.
pub unsafe fn simplify( pub fn simplify(&mut self, input: &str, is_html: bool, is_msgrmsg: bool) -> String {
&mut self, let mut out = if is_html {
in_unterminated: *const libc::c_char, dc_dehtml(input)
in_bytes: libc::c_int, } else {
is_html: bool, input.to_string()
is_msgrmsg: libc::c_int, };
) -> *mut libc::c_char {
if in_bytes <= 0 {
return "".strdup();
}
/* create a copy of the given buffer */ out.retain(|c| c != '\r');
let mut out: *mut libc::c_char; out = self.simplify_plain_text(&out, is_msgrmsg);
let mut temp: *mut libc::c_char; out.retain(|c| c != '\r');
self.is_forwarded = false;
self.is_cut_at_begin = false;
self.is_cut_at_end = false;
out = strndup(
in_unterminated as *mut libc::c_char,
in_bytes as libc::c_ulong,
);
if out.is_null() {
return dc_strdup(b"\x00" as *const u8 as *const libc::c_char);
}
if is_html {
temp = dc_dehtml(out);
if !temp.is_null() {
free(out as *mut libc::c_void);
out = temp
}
}
dc_remove_cr_chars(out);
temp = self.simplify_plain_text(out, is_msgrmsg);
if !temp.is_null() {
free(out as *mut libc::c_void);
out = temp
}
dc_remove_cr_chars(out);
out out
} }
@@ -67,75 +52,48 @@ impl Simplify {
* Simplify Plain Text * Simplify Plain Text
*/ */
#[allow(non_snake_case)] #[allow(non_snake_case)]
unsafe fn simplify_plain_text( fn simplify_plain_text(&mut self, buf_terminated: &str, is_msgrmsg: bool) -> String {
&mut self,
buf_terminated: *const libc::c_char,
is_msgrmsg: libc::c_int,
) -> *mut libc::c_char {
/* This function ... /* This function ...
... removes all text after the line `-- ` (footer mark) ... removes all text after the line `-- ` (footer mark)
... removes full quotes at the beginning and at the end of the text - ... removes full quotes at the beginning and at the end of the text -
these are all lines starting with the character `>` these are all lines starting with the character `>`
... remove a non-empty line before the removed quote (contains sth. like "On 2.9.2016, Bjoern wrote:" in different formats and lanugages) */ ... remove a non-empty line before the removed quote (contains sth. like "On 2.9.2016, Bjoern wrote:" in different formats and lanugages) */
/* split the given buffer into lines */ /* split the given buffer into lines */
let lines = dc_split_into_lines(buf_terminated); let lines: Vec<_> = buf_terminated.split('\n').collect();
let mut l_first: usize = 0; let mut l_first: usize = 0;
let mut l_last = lines.len(); let mut is_cut_at_begin = false;
let mut line: *mut libc::c_char; let (mut l_last, mut is_cut_at_end) = find_message_footer(&lines);
let mut footer_mark: libc::c_int = 0i32;
for l in l_first..l_last {
line = lines[l];
if strcmp(line, b"-- \x00" as *const u8 as *const libc::c_char) == 0i32
|| strcmp(line, b"-- \x00" as *const u8 as *const libc::c_char) == 0i32
{
footer_mark = 1i32
}
if strcmp(line, b"--\x00" as *const u8 as *const libc::c_char) == 0i32
|| strcmp(line, b"---\x00" as *const u8 as *const libc::c_char) == 0i32
|| strcmp(line, b"----\x00" as *const u8 as *const libc::c_char) == 0i32
{
footer_mark = 1i32;
self.is_cut_at_end = true
}
if 0 != footer_mark {
l_last = l;
/* done */
break;
}
}
if l_last > l_first + 2 { if l_last > l_first + 2 {
let line0: *mut libc::c_char = lines[l_first]; let line0 = lines[l_first];
let line1: *mut libc::c_char = lines[l_first + 1]; let line1 = lines[l_first + 1];
let line2: *mut libc::c_char = lines[l_first + 2]; let line2 = lines[l_first + 2];
if strcmp( if line0 == "---------- Forwarded message ----------"
line0, && line1.starts_with("From: ")
b"---------- Forwarded message ----------\x00" as *const u8 as *const libc::c_char, && line2.is_empty()
) == 0i32
&& strncmp(line1, b"From: \x00" as *const u8 as *const libc::c_char, 6) == 0i32
&& *line2.offset(0isize) as libc::c_int == 0i32
{ {
self.is_forwarded = true; self.is_forwarded = true;
l_first += 3 l_first += 3
} }
} }
for l in l_first..l_last { for l in l_first..l_last {
line = lines[l]; let line = lines[l];
if strncmp(line, b"-----\x00" as *const u8 as *const libc::c_char, 5) == 0i32 if line == "-----"
|| strncmp(line, b"_____\x00" as *const u8 as *const libc::c_char, 5) == 0i32 || line == "_____"
|| strncmp(line, b"=====\x00" as *const u8 as *const libc::c_char, 5) == 0i32 || line == "====="
|| strncmp(line, b"*****\x00" as *const u8 as *const libc::c_char, 5) == 0i32 || line == "*****"
|| strncmp(line, b"~~~~~\x00" as *const u8 as *const libc::c_char, 5) == 0i32 || line == "~~~~~"
{ {
l_last = l; l_last = l;
self.is_cut_at_end = true; is_cut_at_end = true;
/* done */ /* done */
break; break;
} }
} }
if 0 == is_msgrmsg { if !is_msgrmsg {
let mut l_lastQuotedLine = None; let mut l_lastQuotedLine = None;
for l in (l_first..l_last).rev() { for l in (l_first..l_last).rev() {
line = lines[l]; let line = lines[l];
if is_plain_quote(line) { if is_plain_quote(line) {
l_lastQuotedLine = Some(l) l_lastQuotedLine = Some(l)
} else if !is_empty_line(line) { } else if !is_empty_line(line) {
@@ -144,25 +102,25 @@ impl Simplify {
} }
if l_lastQuotedLine.is_some() { if l_lastQuotedLine.is_some() {
l_last = l_lastQuotedLine.unwrap(); l_last = l_lastQuotedLine.unwrap();
self.is_cut_at_end = true; is_cut_at_end = true;
if l_last > 1 { if l_last > 1 {
if is_empty_line(lines[l_last - 1]) { if is_empty_line(lines[l_last - 1]) {
l_last -= 1 l_last -= 1
} }
} }
if l_last > 1 { if l_last > 1 {
line = lines[l_last - 1]; let line = lines[l_last - 1];
if is_quoted_headline(line) { if is_quoted_headline(line) {
l_last -= 1 l_last -= 1
} }
} }
} }
} }
if 0 == is_msgrmsg { if !is_msgrmsg {
let mut l_lastQuotedLine_0 = None; let mut l_lastQuotedLine_0 = None;
let mut hasQuotedHeadline = 0; let mut hasQuotedHeadline = 0;
for l in l_first..l_last { for l in l_first..l_last {
line = lines[l]; let line = lines[l];
if is_plain_quote(line) { if is_plain_quote(line) {
l_lastQuotedLine_0 = Some(l) l_lastQuotedLine_0 = Some(l)
} else if !is_empty_line(line) { } else if !is_empty_line(line) {
@@ -179,19 +137,19 @@ impl Simplify {
} }
if l_lastQuotedLine_0.is_some() { if l_lastQuotedLine_0.is_some() {
l_first = l_lastQuotedLine_0.unwrap() + 1; l_first = l_lastQuotedLine_0.unwrap() + 1;
self.is_cut_at_begin = true is_cut_at_begin = true
} }
} }
/* re-create buffer from the remaining lines */ /* re-create buffer from the remaining lines */
let mut ret = String::new(); let mut ret = String::new();
if self.is_cut_at_begin { if is_cut_at_begin {
ret += "[...]"; ret += "[...]";
} }
/* we write empty lines only in case and non-empty line follows */ /* we write empty lines only in case and non-empty line follows */
let mut pending_linebreaks: libc::c_int = 0i32; let mut pending_linebreaks: libc::c_int = 0i32;
let mut content_lines_added: libc::c_int = 0i32; let mut content_lines_added: libc::c_int = 0i32;
for l in l_first..l_last { for l in l_first..l_last {
line = lines[l]; let line = lines[l];
if is_empty_line(line) { if is_empty_line(line) {
pending_linebreaks += 1 pending_linebreaks += 1
} else { } else {
@@ -205,142 +163,105 @@ impl Simplify {
} }
} }
// the incoming message might contain invalid UTF8 // the incoming message might contain invalid UTF8
ret += &to_string_lossy(line); ret += line;
content_lines_added += 1; content_lines_added += 1;
pending_linebreaks = 1i32 pending_linebreaks = 1i32
} }
} }
if self.is_cut_at_end && (!self.is_cut_at_begin || 0 != content_lines_added) { if is_cut_at_end && (!is_cut_at_begin || 0 != content_lines_added) {
ret += " [...]"; ret += " [...]";
} }
dc_free_splitted_lines(lines);
ret.strdup() ret
} }
} }
/** /**
* Tools * Tools
*/ */
unsafe fn is_empty_line(buf: *const libc::c_char) -> bool { fn is_empty_line(buf: &str) -> bool {
/* force unsigned - otherwise the `> ' '` comparison will fail */ // XXX: can it be simplified to buf.chars().all(|c| c.is_whitespace())?
let mut p1: *const libc::c_uchar = buf as *const libc::c_uchar; //
while 0 != *p1 { // Strictly speaking, it is not equivalent (^A is not whitespace, but less than ' '),
if *p1 as libc::c_int > ' ' as i32 { // but having control sequences in email body?!
//
// See discussion at: https://github.com/deltachat/deltachat-core-rust/pull/402#discussion_r317062392
for c in buf.chars() {
if c > ' ' {
return false; return false;
} }
p1 = p1.offset(1isize)
} }
true true
} }
unsafe fn is_quoted_headline(buf: *const libc::c_char) -> bool { fn is_quoted_headline(buf: &str) -> bool {
/* This function may be called for the line _directly_ before a quote. /* This function may be called for the line _directly_ before a quote.
The function checks if the line contains sth. like "On 01.02.2016, xy@z wrote:" in various languages. The function checks if the line contains sth. like "On 01.02.2016, xy@z wrote:" in various languages.
- Currently, we simply check if the last character is a ':'. - Currently, we simply check if the last character is a ':'.
- Checking for the existence of an email address may fail (headlines may show the user's name instead of the address) */ - Checking for the existence of an email address may fail (headlines may show the user's name instead of the address) */
let buf_len: libc::c_int = strlen(buf) as libc::c_int;
if buf_len > 80i32 {
return false;
}
if buf_len > 0i32 && *buf.offset((buf_len - 1i32) as isize) as libc::c_int == ':' as i32 {
return true;
}
false buf.len() <= 80 && buf.ends_with(':')
} }
unsafe fn is_plain_quote(buf: *const libc::c_char) -> bool { fn is_plain_quote(buf: &str) -> bool {
if *buf.offset(0isize) as libc::c_int == '>' as i32 { buf.starts_with(">")
return true;
}
false
} }
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::*; use super::*;
use std::ffi::CStr;
#[test] #[test]
fn test_simplify_trim() { fn test_simplify_trim() {
unsafe { let mut simplify = Simplify::new();
let mut simplify = Simplify::new(); let html = "\r\r\nline1<br>\r\n\r\n\r\rline2\n\r";
let html: *const libc::c_char = let plain = simplify.simplify(html, true, false);
b"\r\r\nline1<br>\r\n\r\n\r\rline2\n\r\x00" as *const u8 as *const libc::c_char;
let plain: *mut libc::c_char =
simplify.simplify(html, strlen(html) as libc::c_int, true, 0);
assert_eq!( assert_eq!(plain, "line1\nline2");
CStr::from_ptr(plain as *const libc::c_char)
.to_str()
.unwrap(),
"line1\nline2",
);
free(plain as *mut libc::c_void);
}
} }
#[test] #[test]
fn test_simplify_parse_href() { fn test_simplify_parse_href() {
unsafe { let mut simplify = Simplify::new();
let mut simplify = Simplify::new(); let html = "<a href=url>text</a";
let html: *const libc::c_char = let plain = simplify.simplify(html, true, false);
b"<a href=url>text</a\x00" as *const u8 as *const libc::c_char;
let plain: *mut libc::c_char =
simplify.simplify(html, strlen(html) as libc::c_int, true, 0);
assert_eq!( assert_eq!(plain, "[text](url)");
CStr::from_ptr(plain as *const libc::c_char)
.to_str()
.unwrap(),
"[text](url)",
);
free(plain as *mut libc::c_void);
}
} }
#[test] #[test]
fn test_simplify_bold_text() { fn test_simplify_bold_text() {
unsafe { let mut simplify = Simplify::new();
let mut simplify = Simplify::new(); let html = "<!DOCTYPE name [<!DOCTYPE ...>]><!-- comment -->text <b><?php echo ... ?>bold</b><![CDATA[<>]]>";
let html: *const libc::c_char = let plain = simplify.simplify(html, true, false);
b"<!DOCTYPE name [<!DOCTYPE ...>]><!-- comment -->text <b><?php echo ... ?>bold</b><![CDATA[<>]]>\x00"
as *const u8 as *const libc::c_char;
let plain = simplify.simplify(html, strlen(html) as libc::c_int, true, 0);
assert_eq!( assert_eq!(plain, "text *bold*<>");
CStr::from_ptr(plain as *const libc::c_char)
.to_str()
.unwrap(),
"text *bold*<>",
);
free(plain as *mut libc::c_void);
}
} }
#[test] #[test]
fn test_simplify_html_encoded() { fn test_simplify_html_encoded() {
unsafe { let mut simplify = Simplify::new();
let mut simplify = Simplify::new(); let html =
let html = "&lt;&gt;&quot;&apos;&amp; &auml;&Auml;&ouml;&Ouml;&uuml;&Uuml;&szlig; foo&AElig;&ccedil;&Ccedil; &diams;&lrm;&rlm;&zwnj;&noent;&zwj;";
b"&lt;&gt;&quot;&apos;&amp; &auml;&Auml;&ouml;&Ouml;&uuml;&Uuml;&szlig; foo&AElig;&ccedil;&Ccedil; &diams;&lrm;&rlm;&zwnj;&noent;&zwj;\x00"
as *const u8 as *const libc::c_char;
let plain = simplify.simplify(html, strlen(html) as libc::c_int, true, 0);
assert_eq!( let plain = simplify.simplify(html, true, false);
CStr::from_ptr(plain as *const libc::c_char)
.to_str()
.unwrap(),
"<>\"\'& äÄöÖüÜß fooÆçÇ \u{2666}\u{200e}\u{200f}\u{200c}&noent;\u{200d}"
);
free(plain as *mut libc::c_void); assert_eq!(
} plain,
"<>\"\'& äÄöÖüÜß fooÆçÇ \u{2666}\u{200e}\u{200f}\u{200c}&noent;\u{200d}"
);
}
#[test]
fn test_simplify_utilities() {
assert!(is_empty_line(" \t"));
assert!(is_empty_line(""));
assert!(is_empty_line(" \r"));
assert!(!is_empty_line(" x"));
assert!(is_plain_quote("> hello world"));
assert!(is_plain_quote(">>"));
assert!(!is_plain_quote("Life is pain"));
assert!(!is_plain_quote(""));
} }
} }

View File

@@ -355,33 +355,6 @@ unsafe fn dc_utf8_strnlen(s: *const libc::c_char, n: size_t) -> size_t {
j j
} }
/* split string into lines*/
pub unsafe fn dc_split_into_lines(buf_terminated: *const libc::c_char) -> Vec<*mut libc::c_char> {
let mut lines = Vec::new();
let mut line_chars = 0;
let mut p1: *const libc::c_char = buf_terminated;
let mut line_start: *const libc::c_char = p1;
while 0 != *p1 {
if *p1 as libc::c_int == '\n' as i32 {
lines.push(strndup(line_start, line_chars));
p1 = p1.offset(1isize);
line_start = p1;
line_chars = 0;
} else {
p1 = p1.offset(1isize);
line_chars += 1;
}
}
lines.push(strndup(line_start, line_chars));
lines
}
pub unsafe fn dc_free_splitted_lines(lines: Vec<*mut libc::c_char>) {
for s in lines {
free(s as *mut libc::c_void);
}
}
pub unsafe fn dc_str_from_clist( pub unsafe fn dc_str_from_clist(
list: *const clist, list: *const clist,
delimiter: *const libc::c_char, delimiter: *const libc::c_char,