From 2d30afd212f4abe33b89d21e79289dec367e7671 Mon Sep 17 00:00:00 2001 From: link2xt Date: Mon, 26 Jun 2023 20:21:01 +0000 Subject: [PATCH] fix: do not run simplify() on dehtml() output simplify() is written to process incoming plaintext messages and extract footers and quotes from them. Incoming messages contain various quote styles and simplify() implements heuristics to detects them. If dehtml() output is processed by simplify(), simplify() heuristics may erroneously detect footers and quotes in produced plaintext. dehtml() should directly detect quotes instead of converting them to plaintext quotes for parsing with simplify(). --- src/dehtml.rs | 193 +++++++++++++++++++++++++++------------ src/mimeparser.rs | 18 ++-- src/receive_imf/tests.rs | 2 +- src/simplify.rs | 18 ++-- test-data/spaces.html | 172 ++++++++++++++++++++++++++++++++++ 5 files changed, 333 insertions(+), 70 deletions(-) create mode 100644 test-data/spaces.html diff --git a/src/dehtml.rs b/src/dehtml.rs index a8b694bfb..02dbea4e7 100644 --- a/src/dehtml.rs +++ b/src/dehtml.rs @@ -10,10 +10,11 @@ use quick_xml::{ Reader, }; -static LINE_RE: Lazy = Lazy::new(|| regex::Regex::new(r"(\r?\n)+").unwrap()); +use crate::simplify::{simplify_quote, SimplifiedText}; struct Dehtml { strbuilder: String, + quote: String, add_text: AddText, last_href: Option, /// GMX wraps a quote in `
`. After a `
`, this count is @@ -29,17 +30,22 @@ struct Dehtml { } impl Dehtml { - fn line_prefix(&self) -> &str { - if self.divs_since_quoted_content_div > 0 || self.blockquotes_since_blockquote > 0 { - "> " + /// Returns true if HTML parser is currently inside the quote. + fn is_quote(&self) -> bool { + self.divs_since_quoted_content_div > 0 || self.blockquotes_since_blockquote > 0 + } + + /// Returns the buffer where the text should be written. + /// + /// If the parser is inside the quote, returns the quote buffer. + fn get_buf(&mut self) -> &mut String { + if self.is_quote() { + &mut self.quote } else { - "" + &mut self.strbuilder } } - fn append_prefix(&self, line_end: &str) -> String { - // line_end is e.g. "\n\n". We add "> " if necessary. - line_end.to_string() + self.line_prefix() - } + fn get_add_text(&self) -> AddText { if self.divs_since_quote_div > 0 && self.divs_since_quoted_content_div == 0 { AddText::No // Everything between `
` and `
` is metadata which we don't want @@ -61,25 +67,60 @@ enum AddText { YesPreserveLineEnds, } -// dehtml() returns way too many newlines; however, an optimisation on this issue is not needed as -// the newlines are typically removed in further processing by the caller -pub fn dehtml(buf: &str) -> Option { - let s = dehtml_quick_xml(buf); +pub(crate) fn dehtml(buf: &str) -> Option { + let (s, quote) = dehtml_quick_xml(buf); if !s.trim().is_empty() { - return Some(s); + let text = dehtml_cleanup(s); + let top_quote = if !quote.trim().is_empty() { + Some(dehtml_cleanup(simplify_quote("e).0)) + } else { + None + }; + return Some(SimplifiedText { + text, + top_quote, + ..Default::default() + }); } let s = dehtml_manually(buf); if !s.trim().is_empty() { - return Some(s); + let text = dehtml_cleanup(s); + return Some(SimplifiedText { + text, + ..Default::default() + }); } None } -fn dehtml_quick_xml(buf: &str) -> String { +fn dehtml_cleanup(mut text: String) -> String { + text.retain(|c| c != '\r'); + let lines = text.trim().split('\n'); + let mut text = String::new(); + let mut linebreak = false; + for line in lines { + if line.chars().all(char::is_whitespace) { + linebreak = true; + } else { + if !text.is_empty() { + text += "\n"; + if linebreak { + text += "\n"; + } + } + text += line.trim_end(); + linebreak = false; + } + } + text +} + +fn dehtml_quick_xml(buf: &str) -> (String, String) { let buf = buf.trim().trim_start_matches(""); let mut dehtml = Dehtml { strbuilder: String::with_capacity(buf.len()), + quote: String::new(), add_text: AddText::YesRemoveLineEnds, last_href: None, divs_since_quote_div: 0, @@ -131,22 +172,33 @@ fn dehtml_quick_xml(buf: &str) -> String { buf.clear(); } - dehtml.strbuilder + (dehtml.strbuilder, dehtml.quote) } fn dehtml_text_cb(event: &BytesText, dehtml: &mut Dehtml) { + static LINE_RE: Lazy = Lazy::new(|| regex::Regex::new(r"(\r?\n)+").unwrap()); + if dehtml.get_add_text() == AddText::YesPreserveLineEnds || dehtml.get_add_text() == AddText::YesRemoveLineEnds { let last_added = escaper::decode_html_buf_sloppy(event as &[_]).unwrap_or_default(); if dehtml.get_add_text() == AddText::YesRemoveLineEnds { - dehtml.strbuilder += LINE_RE.replace_all(&last_added, "\r").as_ref(); - } else if !dehtml.line_prefix().is_empty() { - let l = dehtml.append_prefix("\n"); - dehtml.strbuilder += LINE_RE.replace_all(&last_added, l.as_str()).as_ref(); + // Replace all line ends with spaces. + // E.g. `\r\n\r\n` is replaced with one space. + let last_added = LINE_RE.replace_all(&last_added, " "); + + // Add a space if `last_added` starts with a space + // and there is no whitespace at the end of the buffer yet. + // Trim the rest of leading whitespace from `last_added`. + let buf = dehtml.get_buf(); + if !buf.ends_with(' ') && !buf.ends_with('\n') && last_added.starts_with(' ') { + *buf += " "; + } + + *buf += last_added.trim_start(); } else { - dehtml.strbuilder += &last_added; + *dehtml.get_buf() += LINE_RE.replace_all(&last_added, "\n").as_ref(); } } } @@ -158,35 +210,36 @@ fn dehtml_endtag_cb(event: &BytesEnd, dehtml: &mut Dehtml) { match tag.as_str() { "style" | "script" | "title" | "pre" => { - dehtml.strbuilder += &dehtml.append_prefix("\n\n"); + *dehtml.get_buf() += "\n\n"; dehtml.add_text = AddText::YesRemoveLineEnds; } "div" => { pop_tag(&mut dehtml.divs_since_quote_div); pop_tag(&mut dehtml.divs_since_quoted_content_div); - dehtml.strbuilder += &dehtml.append_prefix("\n\n"); + *dehtml.get_buf() += "\n\n"; dehtml.add_text = AddText::YesRemoveLineEnds; } "a" => { if let Some(ref last_href) = dehtml.last_href.take() { - if dehtml.strbuilder.ends_with('[') { - dehtml.strbuilder.truncate(dehtml.strbuilder.len() - 1); + let buf = dehtml.get_buf(); + if buf.ends_with('[') { + buf.truncate(buf.len() - 1); } else { - dehtml.strbuilder += "]("; - dehtml.strbuilder += last_href; - dehtml.strbuilder += ")"; + *buf += "]("; + *buf += last_href; + *buf += ")"; } } } "b" | "strong" => { if dehtml.get_add_text() != AddText::No { - dehtml.strbuilder += "*"; + *dehtml.get_buf() += "*"; } } "i" | "em" => { if dehtml.get_add_text() != AddText::No { - dehtml.strbuilder += "_"; + *dehtml.get_buf() += "_"; } } "blockquote" => pop_tag(&mut dehtml.blockquotes_since_blockquote), @@ -206,7 +259,7 @@ fn dehtml_starttag_cb( match tag.as_str() { "p" | "table" | "td" => { if !dehtml.strbuilder.is_empty() { - dehtml.strbuilder += &dehtml.append_prefix("\n\n"); + *dehtml.get_buf() += "\n\n"; } dehtml.add_text = AddText::YesRemoveLineEnds; } @@ -215,18 +268,18 @@ fn dehtml_starttag_cb( maybe_push_tag(event, reader, "quote", &mut dehtml.divs_since_quote_div); maybe_push_tag(event, reader, "quoted-content", &mut dehtml.divs_since_quoted_content_div); - dehtml.strbuilder += &dehtml.append_prefix("\n\n"); + *dehtml.get_buf() += "\n\n"; dehtml.add_text = AddText::YesRemoveLineEnds; } "br" => { - dehtml.strbuilder += &dehtml.append_prefix("\n"); + *dehtml.get_buf() += "\n"; dehtml.add_text = AddText::YesRemoveLineEnds; } "style" | "script" | "title" => { dehtml.add_text = AddText::No; } "pre" => { - dehtml.strbuilder += &dehtml.append_prefix("\n\n"); + *dehtml.get_buf() += "\n\n"; dehtml.add_text = AddText::YesPreserveLineEnds; } "a" => { @@ -247,18 +300,18 @@ fn dehtml_starttag_cb( if !href.is_empty() { dehtml.last_href = Some(href); - dehtml.strbuilder += "["; + *dehtml.get_buf() += "["; } } } "b" | "strong" => { if dehtml.get_add_text() != AddText::No { - dehtml.strbuilder += "*"; + *dehtml.get_buf() += "*"; } } "i" | "em" => { if dehtml.get_add_text() != AddText::No { - dehtml.strbuilder += "_"; + *dehtml.get_buf() += "_"; } } "blockquote" => dehtml.blockquotes_since_blockquote += 1, @@ -319,7 +372,6 @@ pub fn dehtml_manually(buf: &str) -> String { #[cfg(test)] mod tests { use super::*; - use crate::simplify::{simplify, SimplifiedText}; #[test] fn test_dehtml() { @@ -333,18 +385,18 @@ mod tests { (" bar foo", "* bar _ foo"), ("& bar", "& bar"), // Despite missing ', this should be shown: - ("", "No link: "), + ("", "No link:"), ( "No link: ", - "No link: ", + "No link:", ), ("\nfat text", "*fat text*"), // Invalid html (at least DC should show the text if the html is invalid): ("\nsome text", "some text"), ]; for (input, output) in cases { - assert_eq!(simplify(dehtml(input).unwrap(), true).text, output); + assert_eq!(dehtml(input).unwrap().text, output); } let none_cases = vec![" ", ""]; for input in none_cases { @@ -354,31 +406,54 @@ mod tests { #[test] fn test_dehtml_parse_br() { - let html = "\r\r\nline1
\r\n\r\n\r\rline2
line3\n\r"; - let plain = dehtml(html).unwrap(); + let html = "line1
line2"; + let plain = dehtml(html).unwrap().text; + assert_eq!(plain, "line1\nline2"); - assert_eq!(plain, "line1\n\r\r\rline2\nline3"); + let html = "line1
line2"; + let plain = dehtml(html).unwrap().text; + assert_eq!(plain, "line1\nline2"); + + let html = "line1

line2"; + let plain = dehtml(html).unwrap().text; + assert_eq!(plain, "line1\n\nline2"); + + let html = "\r\r\nline1
\r\n\r\n\r\rline2
line3\n\r"; + let plain = dehtml(html).unwrap().text; + assert_eq!(plain, "line1\nline2\nline3"); + } + + #[test] + fn test_dehtml_parse_span() { + assert_eq!(dehtml("Foobar").unwrap().text, "Foobar"); + assert_eq!(dehtml("Foo bar").unwrap().text, "Foo bar"); + assert_eq!(dehtml("Foo bar").unwrap().text, "Foo bar"); + assert_eq!(dehtml("Foo\nbar").unwrap().text, "Foo bar"); + assert_eq!(dehtml("\nFoo bar").unwrap().text, "Foo bar"); + assert_eq!(dehtml("Foo\n\nbar").unwrap().text, "Foo bar"); + assert_eq!(dehtml("Foo\nbar").unwrap().text, "Foo bar"); + assert_eq!(dehtml("Foo\nbar").unwrap().text, "Foo bar"); } #[test] fn test_dehtml_parse_p() { let html = "

Foo

Bar

"; - let plain = dehtml(html).unwrap(); + let plain = dehtml(html).unwrap().text; assert_eq!(plain, "Foo\n\nBar"); let html = "

Foo

Bar"; - let plain = dehtml(html).unwrap(); + let plain = dehtml(html).unwrap().text; assert_eq!(plain, "Foo\n\nBar"); let html = "

Foo

Bar

Baz"; - let plain = dehtml(html).unwrap(); + let plain = dehtml(html).unwrap().text; assert_eq!(plain, "Foo\n\nBar\n\nBaz"); } #[test] fn test_dehtml_parse_href() { let html = "text"); } @@ -396,7 +471,7 @@ mod tests { let html = "<>"'& äÄöÖüÜß fooÆçÇ ♦‎‏‌&noent;‍"; - let plain = dehtml(html).unwrap(); + let plain = dehtml(html).unwrap().text; assert_eq!( plain, @@ -420,32 +495,38 @@ mod tests { "##; let txt = dehtml(input).unwrap(); - assert_eq!(txt.trim(), "lots of text"); + assert_eq!(txt.text.trim(), "lots of text"); } #[test] fn test_pre_tag() { let input = "

\ntwo\nlines\n
"; let txt = dehtml(input).unwrap(); - assert_eq!(txt.trim(), "two\nlines"); + assert_eq!(txt.text.trim(), "two\nlines"); } #[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn test_quote_div() { let input = include_str!("../test-data/message/gmx-quote-body.eml"); let dehtml = dehtml(input).unwrap(); - println!("{dehtml}"); let SimplifiedText { text, is_forwarded, is_cut, top_quote, footer, - } = simplify(dehtml, false); + } = dehtml; assert_eq!(text, "Test"); assert_eq!(is_forwarded, false); assert_eq!(is_cut, false); assert_eq!(top_quote.as_deref(), Some("test")); assert_eq!(footer, None); } + + #[test] + fn test_spaces() { + let input = include_str!("../test-data/spaces.html"); + let txt = dehtml(input).unwrap(); + assert_eq!(txt.text, "Welcome back to Strolling!\n\nHey there,\n\nWelcome back! Use this link to securely sign in to your Strolling account:\n\nSign in to Strolling\n\nFor your security, the link will expire in 24 hours time.\n\nSee you soon!\n\nYou can also copy\n\nhttps://strolling.rosano.ca/members/?token=XXX\n\nIf you did not make this request, you can safely ignore this email.\n\nThis message was sent from [strolling.rosano.ca](https://strolling.rosano.ca/) to [alice@example.org](mailto:alice@example.org)"); + } } diff --git a/src/mimeparser.rs b/src/mimeparser.rs index 30293456d..43387eab7 100644 --- a/src/mimeparser.rs +++ b/src/mimeparser.rs @@ -1076,16 +1076,20 @@ impl MimeMessage { Default::default() } else { let is_html = mime_type == mime::TEXT_HTML; - let out = if is_html { + if is_html { self.is_mime_modified = true; - dehtml(&decoded_data).unwrap_or_else(|| { + if let Some(text) = dehtml(&decoded_data) { + text + } else { dehtml_failed = true; - decoded_data.clone() - }) + SimplifiedText { + text: decoded_data.clone(), + ..Default::default() + } + } } else { - decoded_data.clone() - }; - simplify(out, self.has_chat_version()) + simplify(decoded_data.clone(), self.has_chat_version()) + } }; self.is_mime_modified = self.is_mime_modified diff --git a/src/receive_imf/tests.rs b/src/receive_imf/tests.rs index 8079dde7f..303630d31 100644 --- a/src/receive_imf/tests.rs +++ b/src/receive_imf/tests.rs @@ -734,7 +734,7 @@ async fn load_imf_email(context: &Context, imf_raw: &[u8]) -> Message { async fn test_html_only_mail() { let t = TestContext::new_alice().await; let msg = load_imf_email(&t, include_bytes!("../../test-data/message/wrong-html.eml")).await; - assert_eq!(msg.text.unwrap(), " Guten Abend, \n\n Lots of text \n\n text with Umlaut ä... \n\n MfG [...]"); + assert_eq!(msg.text.unwrap(), "Guten Abend,\n\nLots of text\n\ntext with Umlaut ä...\n\nMfG\n\n--------------------------------------\n\n[Camping ](https://example.com/)\n\nsomeaddress\n\nsometown"); } static GH_MAILINGLIST: &[u8] = diff --git a/src/simplify.rs b/src/simplify.rs index e70a0e6e1..1d07bb6a4 100644 --- a/src/simplify.rs +++ b/src/simplify.rs @@ -72,7 +72,7 @@ pub(crate) fn split_lines(buf: &str) -> Vec<&str> { } /// Simplified text and some additional information gained from the input. -#[derive(Debug, Default)] +#[derive(Debug, Default, PartialEq, Eq)] pub(crate) struct SimplifiedText { /// The text itself. pub text: String, @@ -91,6 +91,14 @@ pub(crate) struct SimplifiedText { pub footer: Option, } +pub(crate) fn simplify_quote(quote: &str) -> (String, bool) { + let quote_lines = split_lines(quote); + let (quote_lines, quote_footer_lines) = remove_message_footer("e_lines); + let is_cut = quote_footer_lines.is_some(); + + (render_message(quote_lines, false), is_cut) +} + /// Simplify message text for chat display. /// Remove quotes, signatures, trailing empty lines etc. pub(crate) fn simplify(mut input: String, is_chat_message: bool) -> SimplifiedText { @@ -125,11 +133,9 @@ pub(crate) fn simplify(mut input: String, is_chat_message: bool) -> SimplifiedTe if !is_chat_message { top_quote = top_quote.map(|quote| { - let quote_lines = split_lines("e); - let (quote_lines, quote_footer_lines) = remove_message_footer("e_lines); - is_cut = is_cut || quote_footer_lines.is_some(); - - render_message(quote_lines, false) + let (quote, quote_cut) = simplify_quote("e); + is_cut |= quote_cut; + quote }); } diff --git a/test-data/spaces.html b/test-data/spaces.html new file mode 100644 index 000000000..f207ef261 --- /dev/null +++ b/test-data/spaces.html @@ -0,0 +1,172 @@ + + + + + + 🔑 Secure sign in link for Strolling + + + + + + + + + +
  +
+ + + + + + + + + + + +
+ + + + + + + + + + + + + + + +
+

Hey there,

+

Welcome back! Use this link to securely sign in to your Strolling account:

+ + + + + + +
+ + + + + + +
Sign in to Strolling
+
+

For your security, the link will expire in 24 hours time.

+

See you soon!

+
+

You can also copy & paste this URL into your browser:

+

https://strolling.rosano.ca/members/?token=XXX&action=signin&r=https%3A%2F%2Fstrolling.rosano.ca%2F

+
+

If you did not make this request, you can safely ignore this email.

+
+

This message was sent from strolling.rosano.ca to alice@example.org

+
+
+ + + +
+
 
+ +