Parse name="quote" divs (#2104)

fix #1560 Replies in html-only format are not converted nicely wrt Quoting
This commit is contained in:
Hocuri
2020-12-13 18:02:20 +01:00
committed by GitHub
parent 518e87b0cf
commit ec83fae314
6 changed files with 245 additions and 31 deletions

View File

@@ -2,8 +2,13 @@
//!
//! A module to remove HTML tags from the email text
use std::io::BufRead;
use once_cell::sync::Lazy;
use quick_xml::events::{BytesEnd, BytesStart, BytesText};
use quick_xml::{
events::{BytesEnd, BytesStart, BytesText},
Reader,
};
static LINE_RE: Lazy<regex::Regex> = Lazy::new(|| regex::Regex::new(r"(\r?\n)+").unwrap());
@@ -11,9 +16,37 @@ struct Dehtml {
strbuilder: String,
add_text: AddText,
last_href: Option<String>,
/// GMX wraps a quote in `<div name="quote">`. After a `<div name="quote">`, this count is
/// increased at each `<div>` and decreased at each `</div>`. This way we know when the quote ends.
/// If this is > `0`, then we are inside a `<div name="quote">`
divs_since_quote_div: u32,
/// Everything between `<div name="quote">` and `<div name="quoted-content">` is usually metadata.
/// If this is > `0`, then we are inside a `<div name="quoted-content">`.
divs_since_quoted_content_div: u32,
}
#[derive(Debug, PartialEq)]
impl Dehtml {
    /// Returns the prefix that every output line has to start with at the current position:
    /// `"> "` while we are inside a `<div name="quoted-content">`, an empty string otherwise.
    fn line_prefix(&self) -> &str {
        match self.divs_since_quoted_content_div {
            0 => "",
            _ => "> ",
        }
    }

    /// Returns `line_end` (e.g. `"\n\n"`) followed by the current line prefix, so that the
    /// line started by the line break is marked as quoted if necessary.
    fn append_prefix(&self, line_end: impl AsRef<str>) -> String {
        let mut prefixed = String::from(line_end.as_ref());
        prefixed.push_str(self.line_prefix());
        prefixed
    }

    /// Returns the text mode that applies at the current position.
    ///
    /// This is `AddText::No` while we are inside a `<div name="quote">` but not (yet) inside
    /// a `<div name="quoted-content">`, and the stored `add_text` mode everywhere else.
    fn get_add_text(&self) -> AddText {
        // Everything between `<div name="quote">` and `<div name="quoted-content">` is
        // metadata which we don't want.
        let in_quote_metadata =
            self.divs_since_quote_div > 0 && self.divs_since_quoted_content_div == 0;
        if in_quote_metadata {
            AddText::No
        } else {
            self.add_text
        }
    }
}
#[derive(Debug, PartialEq, Clone, Copy)]
enum AddText {
No,
YesRemoveLineEnds,
@@ -41,6 +74,8 @@ pub fn dehtml_quick_xml(buf: &str) -> String {
strbuilder: String::with_capacity(buf.len()),
add_text: AddText::YesRemoveLineEnds,
last_href: None,
divs_since_quote_div: 0,
divs_since_quoted_content_div: 0,
};
let mut reader = quick_xml::Reader::from_str(buf);
@@ -79,13 +114,16 @@ pub fn dehtml_quick_xml(buf: &str) -> String {
}
fn dehtml_text_cb(event: &BytesText, dehtml: &mut Dehtml) {
if dehtml.add_text == AddText::YesPreserveLineEnds
|| dehtml.add_text == AddText::YesRemoveLineEnds
if dehtml.get_add_text() == AddText::YesPreserveLineEnds
|| dehtml.get_add_text() == AddText::YesRemoveLineEnds
{
let last_added = escaper::decode_html_buf_sloppy(event.escaped()).unwrap_or_default();
if dehtml.add_text == AddText::YesRemoveLineEnds {
if dehtml.get_add_text() == AddText::YesRemoveLineEnds {
dehtml.strbuilder += LINE_RE.replace_all(&last_added, "\r").as_ref();
} else if !dehtml.line_prefix().is_empty() {
let l = dehtml.append_prefix("\n");
dehtml.strbuilder += LINE_RE.replace_all(&last_added, l.as_str()).as_ref();
} else {
dehtml.strbuilder += &last_added;
}
@@ -93,13 +131,16 @@ fn dehtml_text_cb(event: &BytesText, dehtml: &mut Dehtml) {
}
fn dehtml_cdata_cb(event: &BytesText, dehtml: &mut Dehtml) {
if dehtml.add_text == AddText::YesPreserveLineEnds
|| dehtml.add_text == AddText::YesRemoveLineEnds
if dehtml.get_add_text() == AddText::YesPreserveLineEnds
|| dehtml.get_add_text() == AddText::YesRemoveLineEnds
{
let last_added = escaper::decode_html_buf_sloppy(event.escaped()).unwrap_or_default();
if dehtml.add_text == AddText::YesRemoveLineEnds {
if dehtml.get_add_text() == AddText::YesRemoveLineEnds {
dehtml.strbuilder += LINE_RE.replace_all(&last_added, "\r").as_ref();
} else if !dehtml.line_prefix().is_empty() {
let l = dehtml.append_prefix("\n");
dehtml.strbuilder += LINE_RE.replace_all(&last_added, l.as_str()).as_ref();
} else {
dehtml.strbuilder += &last_added;
}
@@ -110,8 +151,15 @@ fn dehtml_endtag_cb(event: &BytesEnd, dehtml: &mut Dehtml) {
let tag = String::from_utf8_lossy(event.name()).trim().to_lowercase();
match tag.as_str() {
"p" | "div" | "table" | "td" | "style" | "script" | "title" | "pre" => {
dehtml.strbuilder += "\n\n";
"p" | "table" | "td" | "style" | "script" | "title" | "pre" => {
dehtml.strbuilder += &dehtml.append_prefix("\n\n");
dehtml.add_text = AddText::YesRemoveLineEnds;
}
"div" => {
pop_tag(&mut dehtml.divs_since_quote_div);
pop_tag(&mut dehtml.divs_since_quoted_content_div);
dehtml.strbuilder += &dehtml.append_prefix("\n\n");
dehtml.add_text = AddText::YesRemoveLineEnds;
}
"a" => {
@@ -122,10 +170,14 @@ fn dehtml_endtag_cb(event: &BytesEnd, dehtml: &mut Dehtml) {
}
}
"b" | "strong" => {
dehtml.strbuilder += "*";
if dehtml.get_add_text() != AddText::No {
dehtml.strbuilder += "*";
}
}
"i" | "em" => {
dehtml.strbuilder += "_";
if dehtml.get_add_text() != AddText::No {
dehtml.strbuilder += "_";
}
}
_ => {}
}
@@ -139,19 +191,27 @@ fn dehtml_starttag_cb<B: std::io::BufRead>(
let tag = String::from_utf8_lossy(event.name()).trim().to_lowercase();
match tag.as_str() {
"p" | "div" | "table" | "td" => {
dehtml.strbuilder += "\n\n";
"p" | "table" | "td" => {
dehtml.strbuilder += &dehtml.append_prefix("\n\n");
dehtml.add_text = AddText::YesRemoveLineEnds;
}
#[rustfmt::skip]
"div" => {
maybe_push_tag(event, reader, "quote", &mut dehtml.divs_since_quote_div);
maybe_push_tag(event, reader, "quoted-content", &mut dehtml.divs_since_quoted_content_div);
dehtml.strbuilder += &dehtml.append_prefix("\n\n");
dehtml.add_text = AddText::YesRemoveLineEnds;
}
"br" => {
dehtml.strbuilder += "\n";
dehtml.strbuilder += &dehtml.append_prefix("\n");
dehtml.add_text = AddText::YesRemoveLineEnds;
}
"style" | "script" | "title" => {
dehtml.add_text = AddText::No;
}
"pre" => {
dehtml.strbuilder += "\n\n";
dehtml.strbuilder += &dehtml.append_prefix("\n\n");
dehtml.add_text = AddText::YesPreserveLineEnds;
}
"a" => {
@@ -172,15 +232,51 @@ fn dehtml_starttag_cb<B: std::io::BufRead>(
}
}
"b" | "strong" => {
dehtml.strbuilder += "*";
if dehtml.get_add_text() != AddText::No {
dehtml.strbuilder += "*";
}
}
"i" | "em" => {
dehtml.strbuilder += "_";
if dehtml.get_add_text() != AddText::No {
dehtml.strbuilder += "_";
}
}
_ => {}
}
}
/// Records a closing tag in one of the nesting counters stored in the `Dehtml` struct.
///
/// In order to know when a specific tag is closed, the opening and closing tags are counted.
/// A counter that is already `0` is left untouched; any other value is decreased by one.
fn pop_tag(count: &mut u32) {
    *count = count.saturating_sub(1);
}
/// Records an opening tag in one of the nesting counters stored in the `Dehtml` struct.
///
/// In order to know when a specific tag is closed, the opening and closing tags are counted.
/// The counter is increased if it is already running (`> 0`) or if one of the attributes of
/// `event` has the value `tag_name`, which starts the counting. Otherwise it stays at `0`.
fn maybe_push_tag(
    event: &BytesStart,
    reader: &Reader<impl BufRead>,
    tag_name: &str,
    count: &mut u32,
) {
    let already_counting = *count > 0;
    // The attributes are only inspected while the counter is not running yet.
    if !already_counting && !tag_contains_attr(event, reader, tag_name) {
        return;
    }
    *count += 1;
}
/// Returns `true` if at least one attribute of `event` has a value equal to `name`.
///
/// Only the attribute *values* are compared; the attribute key is not looked at.
/// Attributes that cannot be parsed, or whose value cannot be unescaped and decoded,
/// are treated as not matching.
fn tag_contains_attr(event: &BytesStart, reader: &Reader<impl BufRead>, name: &str) -> bool {
    event
        .attributes()
        .filter_map(Result::ok)
        .any(|attr| match attr.unescape_and_decode_value(reader) {
            Ok(value) => value == name,
            Err(_) => false,
        })
}
pub fn dehtml_manually(buf: &str) -> String {
// Just strip out everything between "<" and ">"
let mut strbuilder = String::new();
@@ -288,4 +384,15 @@ mod tests {
let txt = dehtml(input).unwrap();
assert_eq!(txt.trim(), "lots of text");
}
/// Converts the `gmx-quote-body.eml` fixture to plain text and checks that `simplify()`
/// splits the result into the message text and the top quote.
#[async_std::test]
async fn test_quote_div() {
    let input = include_str!("../test-data/message/gmx-quote-body.eml");
    // Named `plain` so that the local does not shadow the `dehtml()` function under test.
    let plain = dehtml(input).unwrap();
    // Printed to ease debugging; the output is only shown if the test fails.
    println!("{}", plain);
    let (msg, is_forwarded, top_quote) = simplify(plain, false);
    assert_eq!(msg, "Test");
    assert!(!is_forwarded);
    assert_eq!(top_quote.as_deref(), Some("test"));
}
}

View File

@@ -2513,4 +2513,13 @@ On 2020-10-25, Bob wrote:
);
assert_eq!(mimeparser.parts[0].typ, Viewtype::File);
}
/// Parses the `gmx-quote.eml` fixture and checks that the first part carries the reply
/// text as message and the quoted text as `Param::Quote`.
#[async_std::test]
async fn test_quote_div() {
    let context = TestContext::new().await;
    let raw = include_bytes!("../test-data/message/gmx-quote.eml");
    let parsed = MimeMessage::from_bytes(&context.ctx, raw).await.unwrap();

    let first_part = &parsed.parts[0];
    assert_eq!(first_part.msg, "YIPPEEEEEE\n\nMulti-line");
    assert_eq!(first_part.param.get(Param::Quote).unwrap(), "Now?");
}
}

View File

@@ -91,6 +91,14 @@ pub fn simplify(mut input: String, is_chat_message: bool) -> (String, bool, Opti
render_message(lines, has_nonstandard_footer || bottom_quote.is_some())
}
};
if !is_chat_message {
top_quote = top_quote.map(|quote| {
let quote_lines = split_lines(&quote);
let quote_lines = remove_message_footer(&quote_lines);
render_message(quote_lines, false)
});
}
(text, is_forwarded, top_quote)
}
@@ -218,19 +226,10 @@ fn render_message(lines: &[&str], is_cut_at_end: bool) -> String {
* Tools
*/
/// Returns `true` if `buf` is empty or consists only of whitespace characters.
fn is_empty_line(buf: &str) -> bool {
    // For some time, this checked for `char <= ' '` instead, which treated control
    // characters as empty but not Unicode whitespace such as the non-breaking space;
    // see the discussions at
    // https://github.com/deltachat/deltachat-core-rust/pull/402#discussion_r317062392
    // and https://github.com/deltachat/deltachat-core-rust/pull/2104/files#r538973613
    buf.chars().all(char::is_whitespace)
}
fn is_quoted_headline(buf: &str) -> bool {

View File

@@ -0,0 +1,47 @@
Return-Path: <bob@example.org>
User-Agent: K-9 Mail for Android
In-Reply-To: <hasnihae@gmx.de>
MIME-Version: 1.0
Content-Type: multipart/alternative; boundary="----MLV7YOLJ7ED4UZKNGQYQ63O0RJGHU0"
Content-Transfer-Encoding: 7bit
Subject: Re: Test
To: Alice <alice@example.org>
From: Bob <bob@example.org>
Message-ID: <haeisnr@example.org>
------MLV7YOLJ7ED4UZKNGQYQ63O0RJGHU0
Content-Type: text/definitelynotplainthiswouldbetooeasy;
charset=utf-8
Content-Transfer-Encoding: quoted-printable
Hi Alice,
some text.
Am 21=2E Juni 2020 10:38:44 MESZ schrieb Alice <alice@example=2Eorg>:
>Dear Bob,
>
>let's meet
>
>Alice
--=20
Diese Nachricht wurde von meinem Android-Ger=C3=A4t mit K-9 Mail gesendet=
=2E
------MLV7YOLJ7ED4UZKNGQYQ63O0RJGHU0
Content-Type: text/html;
charset=utf-8
Content-Transfer-Encoding: quoted-printable
<html><head></head><body>Hi Alice,<br><br>some text.<br><br>
<div class=3D"gmail_quote">Am 21=2E Juni 2020 10:38:44 M=
ESZ schrieb Alice &lt;jonathanschmiederer@gmx=2Ede&gt;:<bloc=
kquote class=3D"gmail_quote" style=3D"margin: 0pt 0pt 0pt 0=2E8ex; border-l=
eft: 1px solid rgb(204, 204, 204); padding-left: 1ex;">
<pre class=3D"k9mail">Sehr geehrte/r Frau/Herr Brenner,<br><br>ich habe in=
meinen JuFo-Unterlagen den angeh=C3=A4ngten Gutschein gefunden=2E<br>Ist e=
s noch m=C3=B6glich, diesen einzul=C3=B6sen?<br><br>Mit freundlichen Gr=C3=
=BC=C3=9Fen<br>Alice<br></pre></blockquote></div><br>-- <br>=
Diese Nachricht wurde von meinem Android-Ger=C3=A4t mit K-9 Mail gesendet=
=2E</body></html>
------MLV7YOLJ7ED4UZKNGQYQ63O0RJGHU0--

View File

@@ -0,0 +1,14 @@
<html><head></head><body><div style="font-family: Verdana;font-size: 12.0px;"><div>Test</div>
<div>&nbsp;
<div>&nbsp;
<div name="quote" style="margin:10px 5px 5px 10px; padding: 10px 0 10px 10px; border-left:2px solid #C3D9E5; word-wrap: break-word; -webkit-nbsp-mode: space; -webkit-line-break: after-white-space;">
<div style="margin:0 0 10px 0;"><b>Gesendet:</b>&nbsp;Freitag, 04. Dezember 2020 um 18:46 Uhr<br/>
<b>Von:</b>&nbsp;&quot;Bob&quot; &lt;bob@gmx.de&gt;<br/>
<b>An:</b>&nbsp;alice@gmx.de<br/>
<b>Betreff:</b>&nbsp;test</div>
<div name="quoted-content">test</div>
</div>
</div>
</div></div></body></html>

View File

@@ -0,0 +1,38 @@
Return-Path: <alice@gmx.de>
Delivered-To: bob@gmx.de
MIME-Version: 1.0
Message-ID: <trinity-fa44240c-65ef-4323-b531-d1d3e5e84313-1607428795002@3c-app-gmx-bs36>
From: Alice <alice@gmx.de>
To: bob@gmx.de
Subject: Aw: Re: Re: Re: Message from bob@gmx.de
Content-Type: text/html; charset=UTF-8
Date: Tue, 8 Dec 2020 12:59:55 +0100
Importance: normal
Sensitivity: Normal
In-Reply-To: <Mr.3p9V5c7XjPQ.bcqzV4ls4ID@testrun.org>
References: <Mr.SRvG2sUyI8E.pASbMFvaNqy@testrun.org>
<trinity-764a001f-6593-4c33-b681-942d2ffb9f9f-1607359668635@3c-app-gmx-bs26>
<Mr.3p9V5c7XjPQ.bcqzV4ls4ID@testrun.org>
<html><head></head><body><div style="font-family: Verdana;font-size: 12.0px;"><div>YIPPEEEEEE</div>
<div>&nbsp;</div>
<div>Multi-line</div>
<div>&nbsp;
<div>&nbsp;
<div name="quote" style="margin:10px 5px 5px 10px; padding: 10px 0 10px 10px; border-left:2px solid #C3D9E5; word-wrap: break-word; -webkit-nbsp-mode: space; -webkit-line-break: after-white-space;">
<div style="margin:0 0 10px 0;"><b>Gesendet:</b>&nbsp;Dienstag, 08. Dezember 2020 um 12:59 Uhr<br/>
<b>Von:</b>&nbsp;bob@gmx.de<br/>
<b>An:</b>&nbsp;&quot;Alice&quot; &lt;alice@gmx.de&gt;<br/>
<b>Betreff:</b>&nbsp;Re: Re: Re: Message from bob@gmx.de</div>
<div name="quoted-content">Now?<br/>
<br/>
--<br/>
Sent with my Delta Chat Messenger: <a href="https://delta.chat" target="_blank">https://delta.chat</a><br/>
&nbsp;</div>
</div>
</div>
</div></div></body></html>