Simplify simplify.rs

* Remove Simplify structure.

* Match lines starting with 5 markers, not only lines consisting of exactly 5 markers.

This fixes a regression introduced by the C to Rust conversion, see
2bb1c280d5/src/dc_simplify.c (L154)

* Add tests.
This commit is contained in:
Alexander Krotov
2019-12-16 02:31:38 +03:00
committed by GitHub
parent 898e641256
commit 1ee15942cc
2 changed files with 179 additions and 162 deletions

View File

@@ -1,38 +1,44 @@
use crate::dehtml::*; use crate::dehtml::*;
#[derive(Copy, Clone)] /// Remove standard (RFC 3676, §4.3) footer if it is found.
pub struct Simplify { fn remove_message_footer<'a>(lines: &'a [&str]) -> &'a [&'a str] {
pub is_forwarded: bool,
}
/// Return index of footer line in vector of message lines, or vector length if
/// no footer is found.
///
/// Also return whether not-standard (rfc3676, §4.3) footer is found.
fn find_message_footer(lines: &[&str]) -> (usize, bool) {
for (ix, &line) in lines.iter().enumerate() { for (ix, &line) in lines.iter().enumerate() {
// quoted-printable may encode `-- ` to `-- =20` which is converted // quoted-printable may encode `-- ` to `-- =20` which is converted
// back to `-- ` // back to `-- `
match line { match line {
"-- " | "-- " => return (ix, false), "-- " | "-- " => return &lines[..ix],
"--" | "---" | "----" => return (ix, true),
_ => (), _ => (),
} }
} }
(lines.len(), false) lines
} }
impl Simplify { /// Remove nonstandard footer and a boolean indicating whether such
pub fn new() -> Self { /// footer was removed.
Simplify { fn remove_nonstandard_footer<'a>(lines: &'a [&str]) -> (&'a [&'a str], bool) {
is_forwarded: false, for (ix, &line) in lines.iter().enumerate() {
if line == "--"
|| line == "---"
|| line == "----"
|| line.starts_with("-----")
|| line.starts_with("_____")
|| line.starts_with("=====")
|| line.starts_with("*****")
|| line.starts_with("~~~~~")
{
return (&lines[..ix], true);
} }
} }
(lines, false)
}
/// Simplify and normalise text: Remove quotes, signatures, unnecessary fn split_lines(buf: &str) -> Vec<&str> {
/// lineends etc. buf.split('\n').collect()
/// The data returned from simplify() must be free()'d when no longer used. }
pub fn simplify(&mut self, input: &str, is_html: bool, is_msgrmsg: bool) -> String {
/// Simplify message text for chat display.
/// Remove quotes, signatures, trailing empty lines etc.
pub fn simplify(input: &str, is_html: bool, is_chat_message: bool) -> (String, bool) {
let mut out = if is_html { let mut out = if is_html {
dehtml(input) dehtml(input)
} else { } else {
@@ -40,67 +46,56 @@ impl Simplify {
}; };
out.retain(|c| c != '\r'); out.retain(|c| c != '\r');
out = self.simplify_plain_text(&out, is_msgrmsg); let lines = split_lines(&out);
out.retain(|c| c != '\r'); let (lines, is_forwarded) = skip_forward_header(&lines);
out let lines = remove_message_footer(lines);
let (lines, has_nonstandard_footer) = remove_nonstandard_footer(lines);
let (lines, has_bottom_quote) = if !is_chat_message {
remove_bottom_quote(lines)
} else {
(lines, false)
};
let (lines, has_top_quote) = if !is_chat_message {
remove_top_quote(lines)
} else {
(lines, false)
};
// re-create buffer from the remaining lines
let text = render_message(
lines,
has_top_quote,
has_nonstandard_footer || has_bottom_quote,
);
(text, is_forwarded)
} }
/** /// Skips "forwarded message" header.
* Simplify Plain Text /// Returns message body lines and a boolean indicating whether
*/ /// a message is forwarded or not.
#[allow(non_snake_case, clippy::mut_range_bound, clippy::needless_range_loop)] fn skip_forward_header<'a>(lines: &'a [&str]) -> (&'a [&'a str], bool) {
fn simplify_plain_text(&mut self, buf_terminated: &str, is_msgrmsg: bool) -> String { if lines.len() >= 3
/* This function ... && lines[0] == "---------- Forwarded message ----------"
... removes all text after the line `-- ` (footer mark) && lines[1].starts_with("From: ")
... removes full quotes at the beginning and at the end of the text - && lines[2].is_empty()
these are all lines starting with the character `>`
... remove a non-empty line before the removed quote (contains sth. like "On 2.9.2016, Bjoern wrote:" in different formats and languages) */
/* split the given buffer into lines */
let lines: Vec<_> = buf_terminated.split('\n').collect();
let mut l_first: usize = 0;
let mut is_cut_at_begin = false;
let (mut l_last, mut is_cut_at_end) = find_message_footer(&lines);
if l_last > l_first + 2 {
let line0 = lines[l_first];
let line1 = lines[l_first + 1];
let line2 = lines[l_first + 2];
if line0 == "---------- Forwarded message ----------"
&& line1.starts_with("From: ")
&& line2.is_empty()
{ {
self.is_forwarded = true; (&lines[3..], true)
l_first += 3 } else {
(lines, false)
} }
} }
for l in l_first..l_last {
let line = lines[l]; fn remove_bottom_quote<'a>(lines: &'a [&str]) -> (&'a [&'a str], bool) {
if line == "-----" let mut last_quoted_line = None;
|| line == "_____" for (l, line) in lines.iter().enumerate().rev() {
|| line == "====="
|| line == "*****"
|| line == "~~~~~"
{
l_last = l;
is_cut_at_end = true;
/* done */
break;
}
}
if !is_msgrmsg {
let mut l_lastQuotedLine = None;
for l in (l_first..l_last).rev() {
let line = lines[l];
if is_plain_quote(line) { if is_plain_quote(line) {
l_lastQuotedLine = Some(l) last_quoted_line = Some(l)
} else if !is_empty_line(line) { } else if !is_empty_line(line) {
break; break;
} }
} }
if let Some(last_quoted_line) = l_lastQuotedLine { if let Some(mut l_last) = last_quoted_line {
l_last = last_quoted_line;
is_cut_at_end = true;
if l_last > 1 && is_empty_line(lines[l_last - 1]) { if l_last > 1 && is_empty_line(lines[l_last - 1]) {
l_last -= 1 l_last -= 1
} }
@@ -110,48 +105,49 @@ impl Simplify {
l_last -= 1 l_last -= 1
} }
} }
(&lines[..l_last], true)
} else {
(lines, false)
} }
} }
if !is_msgrmsg {
let mut l_lastQuotedLine_0 = None; fn remove_top_quote<'a>(lines: &'a [&str]) -> (&'a [&'a str], bool) {
let mut hasQuotedHeadline = 0; let mut last_quoted_line = None;
for l in l_first..l_last { let mut has_quoted_headline = false;
let line = lines[l]; for (l, line) in lines.iter().enumerate() {
if is_plain_quote(line) { if is_plain_quote(line) {
l_lastQuotedLine_0 = Some(l) last_quoted_line = Some(l)
} else if !is_empty_line(line) { } else if !is_empty_line(line) {
if is_quoted_headline(line) if is_quoted_headline(line) && !has_quoted_headline && last_quoted_line.is_none() {
&& 0 == hasQuotedHeadline has_quoted_headline = true
&& l_lastQuotedLine_0.is_none()
{
hasQuotedHeadline = 1i32
} else { } else {
/* non-quoting line found */ /* non-quoting line found */
break; break;
} }
} }
} }
if let Some(last_quoted_line) = l_lastQuotedLine_0 { if let Some(last_quoted_line) = last_quoted_line {
l_first = last_quoted_line + 1; (&lines[last_quoted_line + 1..], true)
is_cut_at_begin = true } else {
(lines, false)
} }
} }
/* re-create buffer from the remaining lines */
fn render_message(lines: &[&str], is_cut_at_begin: bool, is_cut_at_end: bool) -> String {
let mut ret = String::new(); let mut ret = String::new();
if is_cut_at_begin { if is_cut_at_begin {
ret += "[...]"; ret += "[...]";
} }
/* we write empty lines only in case a non-empty line follows */ /* we write empty lines only in case a non-empty line follows */
let mut pending_linebreaks = 0; let mut pending_linebreaks = 0;
let mut content_lines_added = 0; let mut empty_body = true;
for l in l_first..l_last { for line in lines {
let line = lines[l];
if is_empty_line(line) { if is_empty_line(line) {
pending_linebreaks += 1 pending_linebreaks += 1
} else { } else {
if 0 != content_lines_added { if !empty_body {
if pending_linebreaks > 2i32 { if pending_linebreaks > 2 {
pending_linebreaks = 2i32 pending_linebreaks = 2
} }
while 0 != pending_linebreaks { while 0 != pending_linebreaks {
ret += "\n"; ret += "\n";
@@ -160,17 +156,15 @@ impl Simplify {
} }
// the incoming message might contain invalid UTF8 // the incoming message might contain invalid UTF8
ret += line; ret += line;
content_lines_added += 1; empty_body = false;
pending_linebreaks = 1i32 pending_linebreaks = 1
} }
} }
if is_cut_at_end && (!is_cut_at_begin || 0 != content_lines_added) { if is_cut_at_end && (!is_cut_at_begin || !empty_body) {
ret += " [...]"; ret += " [...]";
} }
ret ret
} }
}
/** /**
* Tools * Tools
@@ -213,50 +207,59 @@ mod tests {
#[test] #[test]
// proptest does not support [[:graphical:][:space:]] regex. // proptest does not support [[:graphical:][:space:]] regex.
fn test_simplify_plain_text_fuzzy(input in "[!-~\t \n]+") { fn test_simplify_plain_text_fuzzy(input in "[!-~\t \n]+") {
let output = Simplify::new().simplify_plain_text(&input, true); let (output, _is_forwarded) = simplify(&input, false, true);
assert!(output.split('\n').all(|s| s != "-- ")); assert!(output.split('\n').all(|s| s != "-- "));
} }
} }
#[test] #[test]
fn test_simplify_trim() { fn test_simplify_trim() {
let mut simplify = Simplify::new();
let html = "\r\r\nline1<br>\r\n\r\n\r\rline2\n\r"; let html = "\r\r\nline1<br>\r\n\r\n\r\rline2\n\r";
let plain = simplify.simplify(html, true, false); let (plain, is_forwarded) = simplify(html, true, false);
assert_eq!(plain, "line1\nline2"); assert_eq!(plain, "line1\nline2");
assert!(!is_forwarded);
} }
#[test] #[test]
fn test_simplify_parse_href() { fn test_simplify_parse_href() {
let mut simplify = Simplify::new();
let html = "<a href=url>text</a"; let html = "<a href=url>text</a";
let plain = simplify.simplify(html, true, false); let (plain, is_forwarded) = simplify(html, true, false);
assert_eq!(plain, "[text](url)"); assert_eq!(plain, "[text](url)");
assert!(!is_forwarded);
} }
#[test] #[test]
fn test_simplify_bold_text() { fn test_simplify_bold_text() {
let mut simplify = Simplify::new();
let html = "<!DOCTYPE name [<!DOCTYPE ...>]><!-- comment -->text <b><?php echo ... ?>bold</b><![CDATA[<>]]>"; let html = "<!DOCTYPE name [<!DOCTYPE ...>]><!-- comment -->text <b><?php echo ... ?>bold</b><![CDATA[<>]]>";
let plain = simplify.simplify(html, true, false); let (plain, is_forwarded) = simplify(html, true, false);
assert_eq!(plain, "text *bold*<>"); assert_eq!(plain, "text *bold*<>");
assert!(!is_forwarded);
}
#[test]
fn test_simplify_forwarded_message() {
let text = "---------- Forwarded message ----------\r\nFrom: test@example.com\r\n\r\nForwarded message\r\n-- \r\nSignature goes here";
let (plain, is_forwarded) = simplify(text, false, false);
assert_eq!(plain, "Forwarded message");
assert!(is_forwarded);
} }
#[test] #[test]
fn test_simplify_html_encoded() { fn test_simplify_html_encoded() {
let mut simplify = Simplify::new();
let html = let html =
"&lt;&gt;&quot;&apos;&amp; &auml;&Auml;&ouml;&Ouml;&uuml;&Uuml;&szlig; foo&AElig;&ccedil;&Ccedil; &diams;&lrm;&rlm;&zwnj;&noent;&zwj;"; "&lt;&gt;&quot;&apos;&amp; &auml;&Auml;&ouml;&Ouml;&uuml;&Uuml;&szlig; foo&AElig;&ccedil;&Ccedil; &diams;&lrm;&rlm;&zwnj;&noent;&zwj;";
let plain = simplify.simplify(html, true, false); let (plain, is_forwarded) = simplify(html, true, false);
assert_eq!( assert_eq!(
plain, plain,
"<>\"\'& äÄöÖüÜß fooÆçÇ \u{2666}\u{200e}\u{200f}\u{200c}&noent;\u{200d}" "<>\"\'& äÄöÖüÜß fooÆçÇ \u{2666}\u{200e}\u{200f}\u{200c}&noent;\u{200d}"
); );
assert!(!is_forwarded);
} }
#[test] #[test]
@@ -270,4 +273,19 @@ mod tests {
assert!(!is_plain_quote("Life is pain")); assert!(!is_plain_quote("Life is pain"));
assert!(!is_plain_quote("")); assert!(!is_plain_quote(""));
} }
#[test]
fn test_remove_top_quote() {
let (lines, has_top_quote) = remove_top_quote(&["> first", "> second"]);
assert!(lines.is_empty());
assert!(has_top_quote);
let (lines, has_top_quote) = remove_top_quote(&["> first", "> second", "not a quote"]);
assert_eq!(lines, &["not a quote"]);
assert!(has_top_quote);
let (lines, has_top_quote) = remove_top_quote(&["not a quote", "> first", "> second"]);
assert_eq!(lines, &["not a quote", "> first", "> second"]);
assert!(!has_top_quote);
}
} }

View File

@@ -576,12 +576,11 @@ impl<'a> MimeParser<'a> {
} }
}; };
let mut simplifier = Simplify::new(); let (simplified_txt, is_forwarded) = if decoded_data.is_empty() {
let simplified_txt = if decoded_data.is_empty() { ("".into(), false)
"".into()
} else { } else {
let is_html = mime_type == mime::TEXT_HTML; let is_html = mime_type == mime::TEXT_HTML;
simplifier.simplify(&decoded_data, is_html, self.has_chat_version()) simplify(&decoded_data, is_html, self.has_chat_version())
}; };
if !simplified_txt.is_empty() { if !simplified_txt.is_empty() {
@@ -593,7 +592,7 @@ impl<'a> MimeParser<'a> {
self.do_add_single_part(part); self.do_add_single_part(part);
} }
if simplifier.is_forwarded { if is_forwarded {
self.is_forwarded = true; self.is_forwarded = true;
} }
} }