diff --git a/src/dehtml.rs b/src/dehtml.rs index 17ecef037..3464fef85 100644 --- a/src/dehtml.rs +++ b/src/dehtml.rs @@ -2,8 +2,13 @@ //! //! A module to remove HTML tags from the email text +use std::io::BufRead; + use once_cell::sync::Lazy; -use quick_xml::events::{BytesEnd, BytesStart, BytesText}; +use quick_xml::{ + events::{BytesEnd, BytesStart, BytesText}, + Reader, +}; static LINE_RE: Lazy = Lazy::new(|| regex::Regex::new(r"(\r?\n)+").unwrap()); @@ -11,9 +16,37 @@ struct Dehtml { strbuilder: String, add_text: AddText, last_href: Option, + /// GMX wraps a quote in `
<div name="quote">`. After a `<div name="quote">
`, this count is + /// increased at each `<div>
` and decreased at each `</div>
`. This way we know when the quote ends. + /// If this is > `0`, then we are inside a `<div name="quote">
` + divs_since_quote_div: u32, + /// Everything between
<div name="quote"> and <div name="quoted-content">
is usually metadata + /// If this is > `0`, then we are inside a `<div name="quoted-content">
`. + divs_since_quoted_content_div: u32, } -#[derive(Debug, PartialEq)] +impl Dehtml { + fn line_prefix(&self) -> &str { + if self.divs_since_quoted_content_div > 0 { + "> " + } else { + "" + } + } + fn append_prefix(&self, line_end: impl AsRef) -> String { + // line_end is e.g. "\n\n". We add "> " if necessary. + line_end.as_ref().to_owned() + self.line_prefix() + } + fn get_add_text(&self) -> AddText { + if self.divs_since_quote_div > 0 && self.divs_since_quoted_content_div == 0 { + AddText::No // Everything between
<div name="quote"> and <div name="quoted-content">
is metadata which we don't want + } else { + self.add_text + } + } +} + +#[derive(Debug, PartialEq, Clone, Copy)] enum AddText { No, YesRemoveLineEnds, @@ -41,6 +74,8 @@ pub fn dehtml_quick_xml(buf: &str) -> String { strbuilder: String::with_capacity(buf.len()), add_text: AddText::YesRemoveLineEnds, last_href: None, + divs_since_quote_div: 0, + divs_since_quoted_content_div: 0, }; let mut reader = quick_xml::Reader::from_str(buf); @@ -79,13 +114,16 @@ pub fn dehtml_quick_xml(buf: &str) -> String { } fn dehtml_text_cb(event: &BytesText, dehtml: &mut Dehtml) { - if dehtml.add_text == AddText::YesPreserveLineEnds - || dehtml.add_text == AddText::YesRemoveLineEnds + if dehtml.get_add_text() == AddText::YesPreserveLineEnds + || dehtml.get_add_text() == AddText::YesRemoveLineEnds { let last_added = escaper::decode_html_buf_sloppy(event.escaped()).unwrap_or_default(); - if dehtml.add_text == AddText::YesRemoveLineEnds { + if dehtml.get_add_text() == AddText::YesRemoveLineEnds { dehtml.strbuilder += LINE_RE.replace_all(&last_added, "\r").as_ref(); + } else if !dehtml.line_prefix().is_empty() { + let l = dehtml.append_prefix("\n"); + dehtml.strbuilder += LINE_RE.replace_all(&last_added, l.as_str()).as_ref(); } else { dehtml.strbuilder += &last_added; } @@ -93,13 +131,16 @@ fn dehtml_text_cb(event: &BytesText, dehtml: &mut Dehtml) { } fn dehtml_cdata_cb(event: &BytesText, dehtml: &mut Dehtml) { - if dehtml.add_text == AddText::YesPreserveLineEnds - || dehtml.add_text == AddText::YesRemoveLineEnds + if dehtml.get_add_text() == AddText::YesPreserveLineEnds + || dehtml.get_add_text() == AddText::YesRemoveLineEnds { let last_added = escaper::decode_html_buf_sloppy(event.escaped()).unwrap_or_default(); - if dehtml.add_text == AddText::YesRemoveLineEnds { + if dehtml.get_add_text() == AddText::YesRemoveLineEnds { dehtml.strbuilder += LINE_RE.replace_all(&last_added, "\r").as_ref(); + } else if !dehtml.line_prefix().is_empty() { + let l = dehtml.append_prefix("\n"); + 
dehtml.strbuilder += LINE_RE.replace_all(&last_added, l.as_str()).as_ref(); } else { dehtml.strbuilder += &last_added; } @@ -110,8 +151,15 @@ fn dehtml_endtag_cb(event: &BytesEnd, dehtml: &mut Dehtml) { let tag = String::from_utf8_lossy(event.name()).trim().to_lowercase(); match tag.as_str() { - "p" | "div" | "table" | "td" | "style" | "script" | "title" | "pre" => { - dehtml.strbuilder += "\n\n"; + "p" | "table" | "td" | "style" | "script" | "title" | "pre" => { + dehtml.strbuilder += &dehtml.append_prefix("\n\n"); + dehtml.add_text = AddText::YesRemoveLineEnds; + } + "div" => { + pop_tag(&mut dehtml.divs_since_quote_div); + pop_tag(&mut dehtml.divs_since_quoted_content_div); + + dehtml.strbuilder += &dehtml.append_prefix("\n\n"); dehtml.add_text = AddText::YesRemoveLineEnds; } "a" => { @@ -122,10 +170,14 @@ fn dehtml_endtag_cb(event: &BytesEnd, dehtml: &mut Dehtml) { } } "b" | "strong" => { - dehtml.strbuilder += "*"; + if dehtml.get_add_text() != AddText::No { + dehtml.strbuilder += "*"; + } } "i" | "em" => { - dehtml.strbuilder += "_"; + if dehtml.get_add_text() != AddText::No { + dehtml.strbuilder += "_"; + } } _ => {} } @@ -139,19 +191,27 @@ fn dehtml_starttag_cb( let tag = String::from_utf8_lossy(event.name()).trim().to_lowercase(); match tag.as_str() { - "p" | "div" | "table" | "td" => { - dehtml.strbuilder += "\n\n"; + "p" | "table" | "td" => { + dehtml.strbuilder += &dehtml.append_prefix("\n\n"); + dehtml.add_text = AddText::YesRemoveLineEnds; + } + #[rustfmt::skip] + "div" => { + maybe_push_tag(event, reader, "quote", &mut dehtml.divs_since_quote_div); + maybe_push_tag(event, reader, "quoted-content", &mut dehtml.divs_since_quoted_content_div); + + dehtml.strbuilder += &dehtml.append_prefix("\n\n"); dehtml.add_text = AddText::YesRemoveLineEnds; } "br" => { - dehtml.strbuilder += "\n"; + dehtml.strbuilder += &dehtml.append_prefix("\n"); dehtml.add_text = AddText::YesRemoveLineEnds; } "style" | "script" | "title" => { dehtml.add_text = AddText::No; } "pre" 
=> { - dehtml.strbuilder += "\n\n"; + dehtml.strbuilder += &dehtml.append_prefix("\n\n"); dehtml.add_text = AddText::YesPreserveLineEnds; } "a" => { @@ -172,15 +232,51 @@ fn dehtml_starttag_cb( } } "b" | "strong" => { - dehtml.strbuilder += "*"; + if dehtml.get_add_text() != AddText::No { + dehtml.strbuilder += "*"; + } } "i" | "em" => { - dehtml.strbuilder += "_"; + if dehtml.get_add_text() != AddText::No { + dehtml.strbuilder += "_"; + } } _ => {} } } +/// In order to know when a specific tag is closed, we need to count the opening and closing tags. +/// The `counts`s are stored in the `Dehtml` struct. +fn pop_tag(count: &mut u32) { + if *count > 0 { + *count -= 1; + } +} + +/// In order to know when a specific tag is closed, we need to count the opening and closing tags. +/// The `counts`s are stored in the `Dehtml` struct. +fn maybe_push_tag( + event: &BytesStart, + reader: &Reader, + tag_name: &str, + count: &mut u32, +) { + if *count > 0 || tag_contains_attr(event, reader, tag_name) { + *count += 1; + } +} + +fn tag_contains_attr(event: &BytesStart, reader: &Reader, name: &str) -> bool { + event.attributes().any(|r| { + r.map(|a| { + a.unescape_and_decode_value(reader) + .map(|v| v == name) + .unwrap_or(false) + }) + .unwrap_or(false) + }) +} + pub fn dehtml_manually(buf: &str) -> String { // Just strip out everything between "<" and ">" let mut strbuilder = String::new(); @@ -288,4 +384,15 @@ mod tests { let txt = dehtml(input).unwrap(); assert_eq!(txt.trim(), "lots of text"); } + + #[async_std::test] + async fn test_quote_div() { + let input = include_str!("../test-data/message/gmx-quote-body.eml"); + let dehtml = dehtml(input).unwrap(); + println!("{}", dehtml); + let (msg, forwawded, top_quote) = simplify(dehtml, false); + assert_eq!(msg, "Test"); + assert_eq!(forwawded, false); + assert_eq!(top_quote.as_deref(), Some("test")); + } } diff --git a/src/mimeparser.rs b/src/mimeparser.rs index 2be629b1b..6a7767d8c 100644 --- a/src/mimeparser.rs +++ 
b/src/mimeparser.rs @@ -2513,4 +2513,13 @@ On 2020-10-25, Bob wrote: ); assert_eq!(mimeparser.parts[0].typ, Viewtype::File); } + + #[async_std::test] + async fn test_quote_div() { + let t = TestContext::new().await; + let raw = include_bytes!("../test-data/message/gmx-quote.eml"); + let mimeparser = MimeMessage::from_bytes(&t.ctx, raw).await.unwrap(); + assert_eq!(mimeparser.parts[0].msg, "YIPPEEEEEE\n\nMulti-line"); + assert_eq!(mimeparser.parts[0].param.get(Param::Quote).unwrap(), "Now?"); + } } diff --git a/src/simplify.rs b/src/simplify.rs index 792a7c887..6c46414e1 100644 --- a/src/simplify.rs +++ b/src/simplify.rs @@ -91,6 +91,14 @@ pub fn simplify(mut input: String, is_chat_message: bool) -> (String, bool, Opti render_message(lines, has_nonstandard_footer || bottom_quote.is_some()) } }; + + if !is_chat_message { + top_quote = top_quote.map(|quote| { + let quote_lines = split_lines("e); + let quote_lines = remove_message_footer("e_lines); + render_message(quote_lines, false) + }); + } (text, is_forwarded, top_quote) } @@ -218,19 +226,10 @@ fn render_message(lines: &[&str], is_cut_at_end: bool) -> String { * Tools */ fn is_empty_line(buf: &str) -> bool { - // XXX: can it be simplified to buf.chars().all(|c| c.is_whitespace())? - // - // Strictly speaking, it is not equivalent (^A is not whitespace, but less than ' '), - // but having control sequences in email body?! 
- // - // See discussion at: https://github.com/deltachat/deltachat-core-rust/pull/402#discussion_r317062392 - for c in buf.chars() { - if c > ' ' { - return false; - } - } - - true + buf.chars().all(char::is_whitespace) + // for some time, this checked for `char <= ' '`, + // see discussion at: https://github.com/deltachat/deltachat-core-rust/pull/402#discussion_r317062392 + // and https://github.com/deltachat/deltachat-core-rust/pull/2104/files#r538973613 } fn is_quoted_headline(buf: &str) -> bool { diff --git a/test-data/message/blockquote-tag.eml b/test-data/message/blockquote-tag.eml new file mode 100644 index 000000000..6943ffcb4 --- /dev/null +++ b/test-data/message/blockquote-tag.eml @@ -0,0 +1,47 @@ +Return-Path: +User-Agent: K-9 Mail for Android +In-Reply-To: +MIME-Version: 1.0 +Content-Type: multipart/alternative; boundary="----MLV7YOLJ7ED4UZKNGQYQ63O0RJGHU0" +Content-Transfer-Encoding: 7bit +Subject: Re: Test +To: Alice +From: Bob +Message-ID: + +------MLV7YOLJ7ED4UZKNGQYQ63O0RJGHU0 +Content-Type: text/definitelynotplainthiswouldbetooeasy; + charset=utf-8 +Content-Transfer-Encoding: quoted-printable + +Hi Alice, + +some text. + +Am 21=2E Juni 2020 10:38:44 MESZ schrieb Alice : +>Dear Bob, +> +>let's meet +> +>Alice + +--=20 +Diese Nachricht wurde von meinem Android-Ger=C3=A4t mit K-9 Mail gesendet= +=2E +------MLV7YOLJ7ED4UZKNGQYQ63O0RJGHU0 +Content-Type: text/html; + charset=utf-8 +Content-Transfer-Encoding: quoted-printable + +Hi Alice,

some text.

+
Am 21=2E Juni 2020 10:38:44 M= +ESZ schrieb Alice <jonathanschmiederer@gmx=2Ede>: +
Sehr geehrte/r Frau/Herr Brenner,

ich habe in= + meinen JuFo-Unterlagen den angeh=C3=A4ngten Gutschein gefunden=2E
Ist e= +s noch m=C3=B6glich, diesen einzul=C3=B6sen?

Mit freundlichen Gr=C3= +=BC=C3=9Fen
Alice

--
= +Diese Nachricht wurde von meinem Android-Ger=C3=A4t mit K-9 Mail gesendet= +=2E +------MLV7YOLJ7ED4UZKNGQYQ63O0RJGHU0-- diff --git a/test-data/message/gmx-quote-body.eml b/test-data/message/gmx-quote-body.eml new file mode 100644 index 000000000..77e282baa --- /dev/null +++ b/test-data/message/gmx-quote-body.eml @@ -0,0 +1,14 @@ +
Test
+ +
  +
  +
+
Gesendet: Freitag, 04. Dezember 2020 um 18:46 Uhr
+Von: "Bob" <bob@gmx.de>
+An: alice@gmx.de
+Betreff: test
+ +
test
+
+
+
diff --git a/test-data/message/gmx-quote.eml b/test-data/message/gmx-quote.eml new file mode 100644 index 000000000..594a72ef5 --- /dev/null +++ b/test-data/message/gmx-quote.eml @@ -0,0 +1,38 @@ +Return-Path: +Delivered-To: bob@gmx.de +MIME-Version: 1.0 +Message-ID: +From: Alice +To: bob@gmx.de +Subject: Aw: Re: Re: Re: Message from bob@gmx.de +Content-Type: text/html; charset=UTF-8 +Date: Tue, 8 Dec 2020 12:59:55 +0100 +Importance: normal +Sensitivity: Normal +In-Reply-To: +References: + + + +
YIPPEEEEEE
+ +
 
+ +
Multi-line
+ +
  +
  +
+
Gesendet: Dienstag, 08. Dezember 2020 um 12:59 Uhr
+Von: bob@gmx.de
+An: "Alice" <alice@gmx.de>
+Betreff: Re: Re: Re: Message from bob@gmx.de
+ +
Now?
+
+--
+Sent with my Delta Chat Messenger: https://delta.chat
+
+
+