diff --git a/src/dehtml.rs b/src/dehtml.rs index a8b694bfb..02dbea4e7 100644 --- a/src/dehtml.rs +++ b/src/dehtml.rs @@ -10,10 +10,11 @@ use quick_xml::{ Reader, }; -static LINE_RE: Lazy = Lazy::new(|| regex::Regex::new(r"(\r?\n)+").unwrap()); +use crate::simplify::{simplify_quote, SimplifiedText}; struct Dehtml { strbuilder: String, + quote: String, add_text: AddText, last_href: Option, /// GMX wraps a quote in `
`. After a `
`, this count is @@ -29,17 +30,22 @@ struct Dehtml { } impl Dehtml { - fn line_prefix(&self) -> &str { - if self.divs_since_quoted_content_div > 0 || self.blockquotes_since_blockquote > 0 { - "> " + /// Returns true if HTML parser is currently inside the quote. + fn is_quote(&self) -> bool { + self.divs_since_quoted_content_div > 0 || self.blockquotes_since_blockquote > 0 + } + + /// Returns the buffer where the text should be written. + /// + /// If the parser is inside the quote, returns the quote buffer. + fn get_buf(&mut self) -> &mut String { + if self.is_quote() { + &mut self.quote } else { - "" + &mut self.strbuilder } } - fn append_prefix(&self, line_end: &str) -> String { - // line_end is e.g. "\n\n". We add "> " if necessary. - line_end.to_string() + self.line_prefix() - } + fn get_add_text(&self) -> AddText { if self.divs_since_quote_div > 0 && self.divs_since_quoted_content_div == 0 { AddText::No // Everything between `
` and `
` is metadata which we don't want @@ -61,25 +67,60 @@ enum AddText { YesPreserveLineEnds, } -// dehtml() returns way too many newlines; however, an optimisation on this issue is not needed as -// the newlines are typically removed in further processing by the caller -pub fn dehtml(buf: &str) -> Option { - let s = dehtml_quick_xml(buf); +pub(crate) fn dehtml(buf: &str) -> Option { + let (s, quote) = dehtml_quick_xml(buf); if !s.trim().is_empty() { - return Some(s); + let text = dehtml_cleanup(s); + let top_quote = if !quote.trim().is_empty() { + Some(dehtml_cleanup(simplify_quote("e).0)) + } else { + None + }; + return Some(SimplifiedText { + text, + top_quote, + ..Default::default() + }); } let s = dehtml_manually(buf); if !s.trim().is_empty() { - return Some(s); + let text = dehtml_cleanup(s); + return Some(SimplifiedText { + text, + ..Default::default() + }); } None } -fn dehtml_quick_xml(buf: &str) -> String { +fn dehtml_cleanup(mut text: String) -> String { + text.retain(|c| c != '\r'); + let lines = text.trim().split('\n'); + let mut text = String::new(); + let mut linebreak = false; + for line in lines { + if line.chars().all(char::is_whitespace) { + linebreak = true; + } else { + if !text.is_empty() { + text += "\n"; + if linebreak { + text += "\n"; + } + } + text += line.trim_end(); + linebreak = false; + } + } + text +} + +fn dehtml_quick_xml(buf: &str) -> (String, String) { let buf = buf.trim().trim_start_matches(""); let mut dehtml = Dehtml { strbuilder: String::with_capacity(buf.len()), + quote: String::new(), add_text: AddText::YesRemoveLineEnds, last_href: None, divs_since_quote_div: 0, @@ -131,22 +172,33 @@ fn dehtml_quick_xml(buf: &str) -> String { buf.clear(); } - dehtml.strbuilder + (dehtml.strbuilder, dehtml.quote) } fn dehtml_text_cb(event: &BytesText, dehtml: &mut Dehtml) { + static LINE_RE: Lazy = Lazy::new(|| regex::Regex::new(r"(\r?\n)+").unwrap()); + if dehtml.get_add_text() == AddText::YesPreserveLineEnds || dehtml.get_add_text() == AddText::YesRemoveLineEnds { let last_added = escaper::decode_html_buf_sloppy(event as &[_]).unwrap_or_default(); if dehtml.get_add_text() == AddText::YesRemoveLineEnds { - dehtml.strbuilder += LINE_RE.replace_all(&last_added, "\r").as_ref(); - } else if !dehtml.line_prefix().is_empty() { - let l = dehtml.append_prefix("\n"); - dehtml.strbuilder += LINE_RE.replace_all(&last_added, l.as_str()).as_ref(); + // Replace all line ends with spaces. + // E.g. `\r\n\r\n` is replaced with one space. + let last_added = LINE_RE.replace_all(&last_added, " "); + + // Add a space if `last_added` starts with a space + // and there is no whitespace at the end of the buffer yet. + // Trim the rest of leading whitespace from `last_added`. + let buf = dehtml.get_buf(); + if !buf.ends_with(' ') && !buf.ends_with('\n') && last_added.starts_with(' ') { + *buf += " "; + } + + *buf += last_added.trim_start(); } else { - dehtml.strbuilder += &last_added; + *dehtml.get_buf() += LINE_RE.replace_all(&last_added, "\n").as_ref(); } } } @@ -158,35 +210,36 @@ fn dehtml_endtag_cb(event: &BytesEnd, dehtml: &mut Dehtml) { match tag.as_str() { "style" | "script" | "title" | "pre" => { - dehtml.strbuilder += &dehtml.append_prefix("\n\n"); + *dehtml.get_buf() += "\n\n"; dehtml.add_text = AddText::YesRemoveLineEnds; } "div" => { pop_tag(&mut dehtml.divs_since_quote_div); pop_tag(&mut dehtml.divs_since_quoted_content_div); - dehtml.strbuilder += &dehtml.append_prefix("\n\n"); + *dehtml.get_buf() += "\n\n"; dehtml.add_text = AddText::YesRemoveLineEnds; } "a" => { if let Some(ref last_href) = dehtml.last_href.take() { - if dehtml.strbuilder.ends_with('[') { - dehtml.strbuilder.truncate(dehtml.strbuilder.len() - 1); + let buf = dehtml.get_buf(); + if buf.ends_with('[') { + buf.truncate(buf.len() - 1); } else { - dehtml.strbuilder += "]("; - dehtml.strbuilder += last_href; - dehtml.strbuilder += ")"; + *buf += "]("; + *buf += last_href; + *buf += ")"; } } } "b" | "strong" => { if dehtml.get_add_text() != AddText::No { - dehtml.strbuilder += "*"; + *dehtml.get_buf() += "*"; } } "i" | "em" => { if dehtml.get_add_text() != AddText::No { - dehtml.strbuilder += "_"; + *dehtml.get_buf() += "_"; } } "blockquote" => pop_tag(&mut dehtml.blockquotes_since_blockquote), @@ -206,7 +259,7 @@ fn dehtml_starttag_cb( match tag.as_str() { "p" | "table" | "td" => { if !dehtml.strbuilder.is_empty() { - dehtml.strbuilder += &dehtml.append_prefix("\n\n"); + *dehtml.get_buf() += "\n\n"; } dehtml.add_text = AddText::YesRemoveLineEnds; } @@ -215,18 +268,18 @@ fn dehtml_starttag_cb( maybe_push_tag(event, reader, "quote", &mut dehtml.divs_since_quote_div); maybe_push_tag(event, reader, "quoted-content", &mut dehtml.divs_since_quoted_content_div); - dehtml.strbuilder += &dehtml.append_prefix("\n\n"); + *dehtml.get_buf() += "\n\n"; dehtml.add_text = AddText::YesRemoveLineEnds; } "br" => { - dehtml.strbuilder += &dehtml.append_prefix("\n"); + *dehtml.get_buf() += "\n"; dehtml.add_text = AddText::YesRemoveLineEnds; } "style" | "script" | "title" => { dehtml.add_text = AddText::No; } "pre" => { - dehtml.strbuilder += &dehtml.append_prefix("\n\n"); + *dehtml.get_buf() += "\n\n"; dehtml.add_text = AddText::YesPreserveLineEnds; } "a" => { @@ -247,18 +300,18 @@ fn dehtml_starttag_cb( if !href.is_empty() { dehtml.last_href = Some(href); - dehtml.strbuilder += "["; + *dehtml.get_buf() += "["; } } } "b" | "strong" => { if dehtml.get_add_text() != AddText::No { - dehtml.strbuilder += "*"; + *dehtml.get_buf() += "*"; } } "i" | "em" => { if dehtml.get_add_text() != AddText::No { - dehtml.strbuilder += "_"; + *dehtml.get_buf() += "_"; } } "blockquote" => dehtml.blockquotes_since_blockquote += 1, @@ -319,7 +372,6 @@ pub fn dehtml_manually(buf: &str) -> String { #[cfg(test)] mod tests { use super::*; - use crate::simplify::{simplify, SimplifiedText}; #[test] fn test_dehtml() { @@ -333,18 +385,18 @@ mod tests { (" bar foo", "* bar _ foo"), ("& bar", "& bar"), // Despite missing ', this should be shown: - ("", "No link: "), + ("", "No link:"), ( "No link: ", - "No link: ", + "No link:", ), ("\nfat text", "*fat text*"), // Invalid html (at least DC should show the text if the html is invalid): ("\nsome text", "some text"), ]; for (input, output) in cases { - assert_eq!(simplify(dehtml(input).unwrap(), true).text, output); + assert_eq!(dehtml(input).unwrap().text, output); } let none_cases = vec![" ", ""]; for input in none_cases { @@ -354,31 +406,54 @@ mod tests { #[test] fn test_dehtml_parse_br() { - let html = "\r\r\nline1
\r\n\r\n\r\rline2
line3\n\r"; - let plain = dehtml(html).unwrap(); + let html = "line1
line2"; + let plain = dehtml(html).unwrap().text; + assert_eq!(plain, "line1\nline2"); - assert_eq!(plain, "line1\n\r\r\rline2\nline3"); + let html = "line1
line2"; + let plain = dehtml(html).unwrap().text; + assert_eq!(plain, "line1\nline2"); + + let html = "line1

line2"; + let plain = dehtml(html).unwrap().text; + assert_eq!(plain, "line1\n\nline2"); + + let html = "\r\r\nline1
\r\n\r\n\r\rline2
line3\n\r"; + let plain = dehtml(html).unwrap().text; + assert_eq!(plain, "line1\nline2\nline3"); + } + + #[test] + fn test_dehtml_parse_span() { + assert_eq!(dehtml("Foobar").unwrap().text, "Foobar"); + assert_eq!(dehtml("Foo bar").unwrap().text, "Foo bar"); + assert_eq!(dehtml("Foo bar").unwrap().text, "Foo bar"); + assert_eq!(dehtml("Foo\nbar").unwrap().text, "Foo bar"); + assert_eq!(dehtml("\nFoo bar").unwrap().text, "Foo bar"); + assert_eq!(dehtml("Foo\n\nbar").unwrap().text, "Foo bar"); + assert_eq!(dehtml("Foo\nbar").unwrap().text, "Foo bar"); + assert_eq!(dehtml("Foo\nbar").unwrap().text, "Foo bar"); } #[test] fn test_dehtml_parse_p() { let html = "

Foo

Bar

"; - let plain = dehtml(html).unwrap(); + let plain = dehtml(html).unwrap().text; assert_eq!(plain, "Foo\n\nBar"); let html = "

Foo

Bar"; - let plain = dehtml(html).unwrap(); + let plain = dehtml(html).unwrap().text; assert_eq!(plain, "Foo\n\nBar"); let html = "

Foo

Bar

Baz"; - let plain = dehtml(html).unwrap(); + let plain = dehtml(html).unwrap().text; assert_eq!(plain, "Foo\n\nBar\n\nBaz"); } #[test] fn test_dehtml_parse_href() { let html = "text"); } @@ -396,7 +471,7 @@ mod tests { let html = "<>"'& äÄöÖüÜß fooÆçÇ ♦‎‏‌&noent;‍"; - let plain = dehtml(html).unwrap(); + let plain = dehtml(html).unwrap().text; assert_eq!( plain, @@ -420,32 +495,38 @@ mod tests { "##; let txt = dehtml(input).unwrap(); - assert_eq!(txt.trim(), "lots of text"); + assert_eq!(txt.text.trim(), "lots of text"); } #[test] fn test_pre_tag() { let input = "

\ntwo\nlines\n
"; let txt = dehtml(input).unwrap(); - assert_eq!(txt.trim(), "two\nlines"); + assert_eq!(txt.text.trim(), "two\nlines"); } #[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn test_quote_div() { let input = include_str!("../test-data/message/gmx-quote-body.eml"); let dehtml = dehtml(input).unwrap(); - println!("{dehtml}"); let SimplifiedText { text, is_forwarded, is_cut, top_quote, footer, - } = simplify(dehtml, false); + } = dehtml; assert_eq!(text, "Test"); assert_eq!(is_forwarded, false); assert_eq!(is_cut, false); assert_eq!(top_quote.as_deref(), Some("test")); assert_eq!(footer, None); } + + #[test] + fn test_spaces() { + let input = include_str!("../test-data/spaces.html"); + let txt = dehtml(input).unwrap(); + assert_eq!(txt.text, "Welcome back to Strolling!\n\nHey there,\n\nWelcome back! Use this link to securely sign in to your Strolling account:\n\nSign in to Strolling\n\nFor your security, the link will expire in 24 hours time.\n\nSee you soon!\n\nYou can also copy\n\nhttps://strolling.rosano.ca/members/?token=XXX\n\nIf you did not make this request, you can safely ignore this email.\n\nThis message was sent from [strolling.rosano.ca](https://strolling.rosano.ca/) to [alice@example.org](mailto:alice@example.org)"); + } } diff --git a/src/mimeparser.rs b/src/mimeparser.rs index 30293456d..43387eab7 100644 --- a/src/mimeparser.rs +++ b/src/mimeparser.rs @@ -1076,16 +1076,20 @@ impl MimeMessage { Default::default() } else { let is_html = mime_type == mime::TEXT_HTML; - let out = if is_html { + if is_html { self.is_mime_modified = true; - dehtml(&decoded_data).unwrap_or_else(|| { + if let Some(text) = dehtml(&decoded_data) { + text + } else { dehtml_failed = true; - decoded_data.clone() - }) + SimplifiedText { + text: decoded_data.clone(), + ..Default::default() + } + } } else { - decoded_data.clone() - }; - simplify(out, self.has_chat_version()) + simplify(decoded_data.clone(), self.has_chat_version()) + } }; self.is_mime_modified = self.is_mime_modified diff --git a/src/receive_imf/tests.rs b/src/receive_imf/tests.rs index 8079dde7f..303630d31 100644 --- a/src/receive_imf/tests.rs +++ b/src/receive_imf/tests.rs @@ -734,7 +734,7 @@ async fn load_imf_email(context: &Context, imf_raw: &[u8]) -> Message { async fn test_html_only_mail() { let t = TestContext::new_alice().await; let msg = load_imf_email(&t, include_bytes!("../../test-data/message/wrong-html.eml")).await; - assert_eq!(msg.text.unwrap(), " Guten Abend, \n\n Lots of text \n\n text with Umlaut ä... \n\n MfG [...]"); + assert_eq!(msg.text.unwrap(), "Guten Abend,\n\nLots of text\n\ntext with Umlaut ä...\n\nMfG\n\n--------------------------------------\n\n[Camping ](https://example.com/)\n\nsomeaddress\n\nsometown"); } static GH_MAILINGLIST: &[u8] = diff --git a/src/simplify.rs b/src/simplify.rs index e70a0e6e1..1d07bb6a4 100644 --- a/src/simplify.rs +++ b/src/simplify.rs @@ -72,7 +72,7 @@ pub(crate) fn split_lines(buf: &str) -> Vec<&str> { } /// Simplified text and some additional information gained from the input. -#[derive(Debug, Default)] +#[derive(Debug, Default, PartialEq, Eq)] pub(crate) struct SimplifiedText { /// The text itself. pub text: String, @@ -91,6 +91,14 @@ pub(crate) struct SimplifiedText { pub footer: Option, } +pub(crate) fn simplify_quote(quote: &str) -> (String, bool) { + let quote_lines = split_lines(quote); + let (quote_lines, quote_footer_lines) = remove_message_footer("e_lines); + let is_cut = quote_footer_lines.is_some(); + + (render_message(quote_lines, false), is_cut) +} + /// Simplify message text for chat display. /// Remove quotes, signatures, trailing empty lines etc. pub(crate) fn simplify(mut input: String, is_chat_message: bool) -> SimplifiedText { @@ -125,11 +133,9 @@ pub(crate) fn simplify(mut input: String, is_chat_message: bool) -> SimplifiedTe if !is_chat_message { top_quote = top_quote.map(|quote| { - let quote_lines = split_lines("e); - let (quote_lines, quote_footer_lines) = remove_message_footer("e_lines); - is_cut = is_cut || quote_footer_lines.is_some(); - - render_message(quote_lines, false) + let (quote, quote_cut) = simplify_quote("e); + is_cut |= quote_cut; + quote }); } diff --git a/test-data/spaces.html b/test-data/spaces.html new file mode 100644 index 000000000..f207ef261 --- /dev/null +++ b/test-data/spaces.html @@ -0,0 +1,172 @@ + + + + + + 🔑 Secure sign in link for Strolling + + + + + + + + + +
  +
+ + + + + + + + + + + +
+ + + + + + + + + + + + + + + +
+

Hey there,

+

Welcome back! Use this link to securely sign in to your Strolling account:

+ + + + + + +
+ + + + + + +
Sign in to Strolling
+
+

For your security, the link will expire in 24 hours time.

+

See you soon!

+
+

You can also copy & paste this URL into your browser:

+

https://strolling.rosano.ca/members/?token=XXX&action=signin&r=https%3A%2F%2Fstrolling.rosano.ca%2F

+
+

If you did not make this request, you can safely ignore this email.

+
+

This message was sent from strolling.rosano.ca to alice@example.org

+
+
+ + + +
+
 
+ +