diff --git a/Cargo.lock b/Cargo.lock index 7c931d7e3..a0b68dacb 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4639,9 +4639,9 @@ checksum = "a993555f31e5a609f617c12db6250dedcac1b0a85076912c436e6fc9b2c8e6a3" [[package]] name = "quick-xml" -version = "0.37.5" +version = "0.38.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "331e97a1af0bf59823e6eadffe373d7b27f485be8748f71471c662c1f269b7fb" +checksum = "42a232e7487fc2ef313d96dde7948e7a3c05101870d8985e4fd8d26aedd27b89" dependencies = [ "memchr", ] diff --git a/Cargo.toml b/Cargo.toml index 859b597e4..9a1cd34cc 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -82,7 +82,7 @@ percent-encoding = "2.3" pgp = { version = "0.17.0", default-features = false } pin-project = "1" qrcodegen = "1.7.0" -quick-xml = "0.37" +quick-xml = { version = "0.38", features = ["escape-html"] } rand = { workspace = true } regex = { workspace = true } rusqlite = { workspace = true, features = ["sqlcipher"] } diff --git a/src/configure/auto_mozilla.rs b/src/configure/auto_mozilla.rs index 62de51741..664334ed9 100644 --- a/src/configure/auto_mozilla.rs +++ b/src/configure/auto_mozilla.rs @@ -106,7 +106,7 @@ fn parse_server( } } Event::Text(ref event) => { - let val = event.unescape().unwrap_or_default().trim().to_owned(); + let val = event.xml_content().unwrap_or_default().trim().to_owned(); match tag_config { MozConfigTag::Hostname => hostname = Some(val), diff --git a/src/configure/auto_outlook.rs b/src/configure/auto_outlook.rs index 6f4687ee8..40bbcfe5e 100644 --- a/src/configure/auto_outlook.rs +++ b/src/configure/auto_outlook.rs @@ -79,7 +79,7 @@ fn parse_protocol( } } Event::Text(ref e) => { - let val = e.unescape().unwrap_or_default(); + let val = e.xml_content().unwrap_or_default(); if let Some(ref tag) = current_tag { match tag.as_str() { @@ -123,7 +123,7 @@ fn parse_redirecturl( let mut buf = Vec::new(); match reader.read_event_into(&mut buf)? 
{ Event::Text(ref e) => { - let val = e.unescape().unwrap_or_default(); + let val = e.xml_content().unwrap_or_default(); Ok(val.trim().to_string()) } _ => Ok("".to_string()), diff --git a/src/dehtml.rs b/src/dehtml.rs index 007708195..a6d70b1f7 100644 --- a/src/dehtml.rs +++ b/src/dehtml.rs @@ -7,6 +7,7 @@ use std::sync::LazyLock; use quick_xml::{ Reader, + errors::Error as QuickXmlError, events::{BytesEnd, BytesStart, BytesText}, }; @@ -132,6 +133,7 @@ fn dehtml_quick_xml(buf: &str) -> (String, String) { reader.config_mut().check_end_names = false; let mut buf = Vec::new(); + let mut char_buf = String::with_capacity(4); loop { match reader.read_event_into(&mut buf) { @@ -140,16 +142,9 @@ fn dehtml_quick_xml(buf: &str) -> (String, String) { } Ok(quick_xml::events::Event::End(ref e)) => dehtml_endtag_cb(e, &mut dehtml), Ok(quick_xml::events::Event::Text(ref e)) => dehtml_text_cb(e, &mut dehtml), - Ok(quick_xml::events::Event::CData(e)) => match e.escape() { - Ok(e) => dehtml_text_cb(&e, &mut dehtml), - Err(e) => { - eprintln!( - "CDATA escape error at position {}: {:?}", - reader.buffer_position(), - e, - ); - } - }, + Ok(quick_xml::events::Event::CData(e)) => { + str_cb(&String::from_utf8_lossy(&e as &[_]), &mut dehtml) + } Ok(quick_xml::events::Event::Empty(ref e)) => { // Handle empty tags as a start tag immediately followed by end tag. // For example, `

<p/>` is treated as `<p></p>

`. @@ -159,6 +154,33 @@ fn dehtml_quick_xml(buf: &str) -> (String, String) { &mut dehtml, ); } + Ok(quick_xml::events::Event::GeneralRef(ref e)) => { + match e.resolve_char_ref() { + Err(err) => eprintln!( + "resolve_char_ref() error at position {}: {:?}", + reader.buffer_position(), + err, + ), + Ok(Some(ch)) => { + char_buf.clear(); + char_buf.push(ch); + str_cb(&char_buf, &mut dehtml); + } + Ok(None) => { + let event_str = String::from_utf8_lossy(e); + if let Some(s) = quick_xml::escape::resolve_html5_entity(&event_str) { + str_cb(s, &mut dehtml); + } else { + // Nonstandard entity. Add escaped. + str_cb(&format!("&{event_str};"), &mut dehtml); + } + } + } + } + Err(QuickXmlError::IllFormed(_)) => { + // This is probably not HTML at all and should be left as is. + str_cb(&String::from_utf8_lossy(&buf), &mut dehtml); + } Err(e) => { eprintln!( "Parse html error: Error at position {}: {:?}", @@ -176,36 +198,36 @@ fn dehtml_quick_xml(buf: &str) -> (String, String) { } fn dehtml_text_cb(event: &BytesText, dehtml: &mut Dehtml) { - static LINE_RE: LazyLock<regex::Regex> = - LazyLock::new(|| regex::Regex::new(r"(\r?\n)+").unwrap()); - if dehtml.get_add_text() == AddText::YesPreserveLineEnds || dehtml.get_add_text() == AddText::YesRemoveLineEnds { let event = event as &[_]; let event_str = std::str::from_utf8(event).unwrap_or_default(); - let mut last_added = escaper::decode_html_buf_sloppy(event).unwrap_or_default(); - if event_str.starts_with(&last_added) { - last_added = event_str.to_string(); + str_cb(event_str, dehtml); + } +} + +fn str_cb(event_str: &str, dehtml: &mut Dehtml) { + static LINE_RE: LazyLock<regex::Regex> = + LazyLock::new(|| regex::Regex::new(r"(\r?\n)+").unwrap()); + + let add_text = dehtml.get_add_text(); + if add_text == AddText::YesRemoveLineEnds { + // Replace all line ends with spaces. + // E.g. `\r\n\r\n` is replaced with one space. 
+ let event_str = LINE_RE.replace_all(event_str, " "); + + // Add a space if `event_str` starts with a space + // and there is no whitespace at the end of the buffer yet. + // Trim the rest of leading whitespace from `event_str`. + let buf = dehtml.get_buf(); + if !buf.ends_with(' ') && !buf.ends_with('\n') && event_str.starts_with(' ') { + *buf += " "; } - if dehtml.get_add_text() == AddText::YesRemoveLineEnds { - // Replace all line ends with spaces. - // E.g. `\r\n\r\n` is replaced with one space. - let last_added = LINE_RE.replace_all(&last_added, " "); - - // Add a space if `last_added` starts with a space - // and there is no whitespace at the end of the buffer yet. - // Trim the rest of leading whitespace from `last_added`. - let buf = dehtml.get_buf(); - if !buf.ends_with(' ') && !buf.ends_with('\n') && last_added.starts_with(' ') { - *buf += " "; - } - - *buf += last_added.trim_start(); - } else { - *dehtml.get_buf() += LINE_RE.replace_all(&last_added, "\n").as_ref(); - } + *buf += event_str.trim_start(); + } else if add_text == AddText::YesPreserveLineEnds { + *dehtml.get_buf() += LINE_RE.replace_all(event_str, "\n").as_ref(); } } diff --git a/src/location.rs b/src/location.rs index 696146120..4f9fe867e 100644 --- a/src/location.rs +++ b/src/location.rs @@ -140,7 +140,7 @@ impl Kml { if self.tag == KmlTag::PlacemarkTimestampWhen || self.tag == KmlTag::PlacemarkPointCoordinates { - let val = event.unescape().unwrap_or_default(); + let val = event.xml_content().unwrap_or_default(); let val = val.replace(['\n', '\r', '\t', ' '], "");