fix(dehtml): do not insert unnecessary newlines when parsing <p> tags

Previously, parsing of `<p>Foo</p><p>Bar</p>`
resulted in `\n\nFoo\n\n\n\nBar\n\n`.

Now it results in `Foo\n\nBar`.
This commit is contained in:
link2xt
2023-06-16 13:45:39 +00:00
parent 92e34d67e6
commit 00cb72f04d

View File

@@ -152,7 +152,7 @@ fn dehtml_endtag_cb(event: &BytesEnd, dehtml: &mut Dehtml) {
.to_lowercase();
match tag.as_str() {
"p" | "table" | "td" | "style" | "script" | "title" | "pre" => {
"style" | "script" | "title" | "pre" => {
dehtml.strbuilder += &dehtml.append_prefix("\n\n");
dehtml.add_text = AddText::YesRemoveLineEnds;
}
@@ -200,7 +200,9 @@ fn dehtml_starttag_cb<B: std::io::BufRead>(
match tag.as_str() {
"p" | "table" | "td" => {
dehtml.strbuilder += &dehtml.append_prefix("\n\n");
if !dehtml.strbuilder.is_empty() {
dehtml.strbuilder += &dehtml.append_prefix("\n\n");
}
dehtml.add_text = AddText::YesRemoveLineEnds;
}
#[rustfmt::skip]
@@ -353,6 +355,21 @@ mod tests {
assert_eq!(plain, "line1\n\r\r\rline2\nline3");
}
/// Verifies how consecutive `<p>` elements are rendered as plain text:
/// paragraphs are separated by a single blank line (`\n\n`), and the output
/// carries no newlines before the first or after the last paragraph.
/// The same result is expected whether or not the `<p>` tags are closed.
#[test]
fn test_dehtml_parse_p() {
    // Each entry pairs an HTML input with the plain text it must produce.
    let cases = [
        // Both paragraphs explicitly closed.
        ("<p>Foo</p><p>Bar</p>", "Foo\n\nBar"),
        // Paragraphs opened but never closed.
        ("<p>Foo<p>Bar", "Foo\n\nBar"),
        // A mix of closed and unclosed paragraphs.
        ("<p>Foo</p><p>Bar<p>Baz", "Foo\n\nBar\n\nBaz"),
    ];

    for (html, expected) in cases {
        assert_eq!(dehtml(html).unwrap(), expected);
    }
}
#[test]
fn test_dehtml_parse_href() {
let html = "<a href=url>text</a";