fix(dehtml): do not insert unnecessary newlines when parsing <p> tags

Previously, parsing of `<p>Foo</p><p>Bar</p>`
resulted in `\n\nFoo\n\n\n\nBar\n\n`.

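Now it results in `Foo\n\nBar`. The same change applies to `<table>` and `<td>`: their end tags are no longer handled by the branch that appends `\n\n`, and their start tags skip the leading `\n\n` while the output buffer is still empty.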
This commit is contained in:
link2xt
2023-06-16 13:45:39 +00:00
parent 92e34d67e6
commit 00cb72f04d

View File

@@ -152,7 +152,7 @@ fn dehtml_endtag_cb(event: &BytesEnd, dehtml: &mut Dehtml) {
.to_lowercase(); .to_lowercase();
match tag.as_str() { match tag.as_str() {
"p" | "table" | "td" | "style" | "script" | "title" | "pre" => { "style" | "script" | "title" | "pre" => {
dehtml.strbuilder += &dehtml.append_prefix("\n\n"); dehtml.strbuilder += &dehtml.append_prefix("\n\n");
dehtml.add_text = AddText::YesRemoveLineEnds; dehtml.add_text = AddText::YesRemoveLineEnds;
} }
@@ -200,7 +200,9 @@ fn dehtml_starttag_cb<B: std::io::BufRead>(
match tag.as_str() { match tag.as_str() {
"p" | "table" | "td" => { "p" | "table" | "td" => {
dehtml.strbuilder += &dehtml.append_prefix("\n\n"); if !dehtml.strbuilder.is_empty() {
dehtml.strbuilder += &dehtml.append_prefix("\n\n");
}
dehtml.add_text = AddText::YesRemoveLineEnds; dehtml.add_text = AddText::YesRemoveLineEnds;
} }
#[rustfmt::skip] #[rustfmt::skip]
@@ -353,6 +355,21 @@ mod tests {
assert_eq!(plain, "line1\n\r\r\rline2\nline3"); assert_eq!(plain, "line1\n\r\r\rline2\nline3");
} }
/// Verifies that paragraphs are separated by exactly one blank line and that
/// no newlines are emitted before the first paragraph or after the last one.
///
/// Covers explicitly closed `<p>` tags, unclosed `<p>` tags, and a mix of both.
#[test]
fn test_dehtml_parse_p() {
    // Each entry pairs an HTML input with the plain text it must convert to.
    let cases = [
        ("<p>Foo</p><p>Bar</p>", "Foo\n\nBar"),
        ("<p>Foo<p>Bar", "Foo\n\nBar"),
        ("<p>Foo</p><p>Bar<p>Baz", "Foo\n\nBar\n\nBaz"),
    ];

    for (markup, expected) in cases {
        let rendered = dehtml(markup).unwrap();
        assert_eq!(rendered, expected);
    }
}
#[test] #[test]
fn test_dehtml_parse_href() { fn test_dehtml_parse_href() {
let html = "<a href=url>text</a"; let html = "<a href=url>text</a";