Parse <blockquote> tags for better quote detection (#2313)

This commit is contained in:
Hocuri
2021-04-07 18:45:00 +02:00
committed by GitHub
parent 720135a915
commit 179a2a50e6
3 changed files with 67 additions and 2 deletions

View File

@@ -23,11 +23,14 @@ struct Dehtml {
/// Everything between <div name="quote"> and <div name="quoted-content"> is usually metadata
/// If this is > `0`, then we are inside a `<div name="quoted-content">`.
divs_since_quoted_content_div: u32,
/// All-Inkl just puts the quote into `<blockquote> </blockquote>`. This count is
/// increased at each `<blockquote>` and decreased at each `</blockquote>`.
blockquotes_since_blockquote: u32,
}
impl Dehtml {
fn line_prefix(&self) -> &str {
if self.divs_since_quoted_content_div > 0 {
if self.divs_since_quoted_content_div > 0 || self.blockquotes_since_blockquote > 0 {
"> "
} else {
""
@@ -67,7 +70,7 @@ pub fn dehtml(buf: &str) -> Option<String> {
None
}
pub fn dehtml_quick_xml(buf: &str) -> String {
fn dehtml_quick_xml(buf: &str) -> String {
let buf = buf.trim().trim_start_matches("<!doctype html>");
let mut dehtml = Dehtml {
@@ -76,6 +79,7 @@ pub fn dehtml_quick_xml(buf: &str) -> String {
last_href: None,
divs_since_quote_div: 0,
divs_since_quoted_content_div: 0,
blockquotes_since_blockquote: 0,
};
let mut reader = quick_xml::Reader::from_str(buf);
@@ -179,6 +183,7 @@ fn dehtml_endtag_cb(event: &BytesEnd, dehtml: &mut Dehtml) {
dehtml.strbuilder += "_";
}
}
"blockquote" => pop_tag(&mut dehtml.blockquotes_since_blockquote),
_ => {}
}
}
@@ -241,6 +246,7 @@ fn dehtml_starttag_cb<B: std::io::BufRead>(
dehtml.strbuilder += "_";
}
}
"blockquote" => dehtml.blockquotes_since_blockquote += 1,
_ => {}
}
}

View File

@@ -2744,6 +2744,19 @@ On 2020-10-25, Bob wrote:
assert_eq!(mimeparser.parts[0].param.get(Param::Quote).unwrap(), "Now?");
}
#[async_std::test]
async fn test_allinkl_blockquote() {
    // ALL-INKL webmail wraps the quoted original in a bare `<blockquote>` element
    // instead of the `<div name="quote">` markup used by other providers.
    let raw = include_bytes!("../test-data/message/allinkl-quote.eml");
    let t = TestContext::new().await;
    let mimeparser = MimeMessage::from_bytes(&t, raw).await.unwrap();

    // Only the first parsed part is inspected.
    let part = &mimeparser.parts[0];

    // The reply text must come first in the message body ...
    assert!(part.msg.starts_with("It's 1.0."));

    // ... and the blockquote content must be stored as the quote parameter.
    let quote = part.param.get(Param::Quote).unwrap();
    assert_eq!(quote, "What's the version?");
}
#[async_std::test]
async fn test_add_subj_to_multimedia_msg() {
let t = TestContext::new_alice().await;

View File

@@ -0,0 +1,46 @@
Return-Path: <alice@example.org>
Delivered-To: bob@example.org
Received: from hq5.merlinux.eu
by hq5.merlinux.eu with LMTP
id eHU/Co4EUmBAQQAAPzvFDg
(envelope-from <alice@example.org>)
for <bob@example.org>; Wed, 17 Mar 2021 14:30:54 +0100
Received: from dd37930.kasserver.com (dd37930.kasserver.com [85.13.154.127])
by hq5.merlinux.eu (Postfix) with ESMTPS id CB5D927A0071
for <bob@example.org>; Wed, 17 Mar 2021 14:30:53 +0100 (CET)
Received: from dd37930.kasserver.com (dd0805.kasserver.com [85.13.161.253])
by dd37930.kasserver.com (Postfix) with ESMTPSA id 724E853C0979
for <bob@example.org>; Wed, 17 Mar 2021 14:30:53 +0100 (CET)
MIME-Version: 1.0
Content-Type: text/html; charset=ISO-8859-1
Content-Transfer-Encoding: quoted-printable
X-SenderIP: 217.80.3.233
User-Agent: ALL-INKL Webmail 2.11
In-Reply-To: <Mr.nru4puZrBpw.JfbybhIh75A@testrun.org>
References: <Mr.nru4puZrBpw.JfbybhIh75A@testrun.org><Mr.nru4puZrBpw.JfbybhIh75A@testrun.org>
Subject: Re: Message from Hocuri
From: alice@example.org
To: bob@example.org
Message-Id: <20210317133053.724E853C0979@dd37930.kasserver.com>
Date: Wed, 17 Mar 2021 14:30:53 +0100 (CET)
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www=
=2Ew3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"><html lang=3D"de" xml:lang=
=3D"en" xmlns=3D"http://www.w3.org/1999/xhtml"><head><meta http-equiv=3D"Co=
ntent-Type" content=3D"text/html; charset=3DISO-8859-1" /><title></title><s=
tyle type=3D"text/css">html,body{background-color:#fff;color:#333;line-heig=
ht:1.4;font-family:sans-serif,Arial,Verdana,Trebuchet MS;}</style></head><b=
ody><p>It's 1.0.</p>
<div ></div>
<p>Hocuri schrieb am 17.03.2021 14:25 (GMT +01:00):</p>
<blockquote cite=3D"mid:Mr.nru4puZrBpw.JfbybhIh75A@testrun.org">
<pre>What's the version?
--=20
Sent with my Delta Chat Messenger: <a href=3D"https://delta.chat" target=3D=
"_blank" rel=3D"nofollow noopener" title=3D"https://delta.chat">https://del=
ta.chat</a>
</pre>
</blockquote></body></html>