refactor(dehtml): switch to quick-xml for parsing

This commit is contained in:
dignifiedquire
2019-08-16 23:09:31 +02:00
committed by holger krekel
parent eb2b4028ab
commit c901ab844f
4 changed files with 142 additions and 110 deletions

View File

@@ -1,9 +1,9 @@
use lazy_static::lazy_static;
use quick_xml;
use quick_xml::events::{BytesEnd, BytesStart, BytesText};
use crate::dc_saxparser::*;
use crate::dc_tools::*;
use crate::x::*;
use std::ptr;
lazy_static! {
static ref LINE_RE: regex::Regex = regex::Regex::new(r"(\r?\n)+").unwrap();
@@ -35,56 +35,67 @@ pub unsafe fn dc_dehtml(buf_terminated: *mut libc::c_char) -> *mut libc::c_char
add_text: AddText::YesRemoveLineEnds,
last_href: None,
};
let mut saxparser = dc_saxparser_t {
starttag_cb: None,
endtag_cb: None,
text_cb: None,
userdata: ptr::null_mut(),
};
dc_saxparser_init(
&mut saxparser,
&mut dehtml as *mut Dehtml as *mut libc::c_void,
);
dc_saxparser_set_tag_handler(
&mut saxparser,
Some(dehtml_starttag_cb),
Some(dehtml_endtag_cb),
);
dc_saxparser_set_text_handler(&mut saxparser, Some(dehtml_text_cb));
dc_saxparser_parse(&mut saxparser, buf_terminated);
let mut reader = quick_xml::Reader::from_str(as_str(buf_terminated));
let mut buf = Vec::new();
loop {
match reader.read_event(&mut buf) {
Ok(quick_xml::events::Event::Start(ref e)) => {
dehtml_starttag_cb(e, &mut dehtml, &reader)
}
Ok(quick_xml::events::Event::End(ref e)) => dehtml_endtag_cb(e, &mut dehtml),
Ok(quick_xml::events::Event::Text(ref e)) => dehtml_text_cb(e, &mut dehtml),
Ok(quick_xml::events::Event::CData(ref e)) => dehtml_cdata_cb(e, &mut dehtml),
Err(e) => {
eprintln!(
"Parse html error: Error at position {}: {:?}",
reader.buffer_position(),
e
);
}
Ok(quick_xml::events::Event::Eof) => break,
_ => (),
}
buf.clear();
}
dehtml.strbuilder.strdup()
}
unsafe fn dehtml_text_cb(
userdata: *mut libc::c_void,
text: *const libc::c_char,
_len: libc::c_int,
) {
let dehtml = &mut *(userdata as *mut Dehtml);
fn dehtml_text_cb(event: &BytesText, dehtml: &mut Dehtml) {
if dehtml.add_text == AddText::YesPreserveLineEnds
|| dehtml.add_text == AddText::YesRemoveLineEnds
{
let last_added = std::ffi::CStr::from_ptr(text)
.to_str()
.expect("invalid utf8");
// TODO: why does len does not match?
// assert_eq!(last_added.len(), len as usize);
let last_added = escaper::decode_html_buf_sloppy(event.escaped()).unwrap_or_default();
if dehtml.add_text == AddText::YesRemoveLineEnds {
dehtml.strbuilder += LINE_RE.replace_all(last_added.as_ref(), "\r").as_ref();
dehtml.strbuilder += LINE_RE.replace_all(&last_added, "\r").as_ref();
} else {
dehtml.strbuilder += last_added.as_ref();
dehtml.strbuilder += &last_added;
}
}
}
unsafe fn dehtml_endtag_cb(userdata: *mut libc::c_void, tag: *const libc::c_char) {
let mut dehtml = &mut *(userdata as *mut Dehtml);
let tag = std::ffi::CStr::from_ptr(tag).to_string_lossy();
fn dehtml_cdata_cb(event: &BytesText, dehtml: &mut Dehtml) {
if dehtml.add_text == AddText::YesPreserveLineEnds
|| dehtml.add_text == AddText::YesRemoveLineEnds
{
let last_added = escaper::decode_html_buf_sloppy(event.escaped()).unwrap_or_default();
match tag.as_ref() {
if dehtml.add_text == AddText::YesRemoveLineEnds {
dehtml.strbuilder += LINE_RE.replace_all(&last_added, "\r").as_ref();
} else {
dehtml.strbuilder += &last_added;
}
}
}
fn dehtml_endtag_cb(event: &BytesEnd, dehtml: &mut Dehtml) {
let tag = String::from_utf8_lossy(event.name()).trim().to_lowercase();
match tag.as_str() {
"p" | "div" | "table" | "td" | "style" | "script" | "title" | "pre" => {
dehtml.strbuilder += "\n\n";
dehtml.add_text = AddText::YesRemoveLineEnds;
@@ -106,15 +117,14 @@ unsafe fn dehtml_endtag_cb(userdata: *mut libc::c_void, tag: *const libc::c_char
}
}
unsafe fn dehtml_starttag_cb(
userdata: *mut libc::c_void,
tag: *const libc::c_char,
attr: *mut *mut libc::c_char,
fn dehtml_starttag_cb<B: std::io::BufRead>(
event: &BytesStart,
dehtml: &mut Dehtml,
reader: &quick_xml::Reader<B>,
) {
let mut dehtml = &mut *(userdata as *mut Dehtml);
let tag = std::ffi::CStr::from_ptr(tag).to_string_lossy();
let tag = String::from_utf8_lossy(event.name()).trim().to_lowercase();
match tag.as_ref() {
match tag.as_str() {
"p" | "div" | "table" | "td" => {
dehtml.strbuilder += "\n\n";
dehtml.add_text = AddText::YesRemoveLineEnds;
@@ -131,14 +141,21 @@ unsafe fn dehtml_starttag_cb(
dehtml.add_text = AddText::YesPreserveLineEnds;
}
"a" => {
let text_c = std::ffi::CStr::from_ptr(dc_attr_find(
attr,
b"href\x00" as *const u8 as *const libc::c_char,
));
let text_r = text_c.to_str().expect("invalid utf8");
if !text_r.is_empty() {
dehtml.last_href = Some(text_r.to_string());
dehtml.strbuilder += "[";
if let Some(href) = event.html_attributes().find(|attr| {
attr.as_ref()
.map(|a| String::from_utf8_lossy(a.key).trim().to_lowercase() == "href")
.unwrap_or_default()
}) {
let href = href
.unwrap()
.unescape_and_decode_value(reader)
.unwrap_or_default()
.to_lowercase();
if !href.is_empty() {
dehtml.last_href = Some(href);
dehtml.strbuilder += "[";
}
}
}
"b" | "strong" => {

View File

@@ -311,8 +311,7 @@ mod tests {
let html: *const libc::c_char =
b"<!DOCTYPE name [<!DOCTYPE ...>]><!-- comment -->text <b><?php echo ... ?>bold</b><![CDATA[<>]]>\x00"
as *const u8 as *const libc::c_char;
let plain: *mut libc::c_char =
simplify.simplify(html, strlen(html) as libc::c_int, true, 0);
let plain = simplify.simplify(html, strlen(html) as libc::c_int, true, 0);
assert_eq!(
CStr::from_ptr(plain as *const libc::c_char)
@@ -329,17 +328,16 @@ mod tests {
fn test_simplify_html_encoded() {
unsafe {
let mut simplify = Simplify::new();
let html: *const libc::c_char =
b"&lt;&gt;&quot;&apos;&amp; &auml;&Auml;&ouml;&Ouml;&uuml;&Uuml;&szlig; foo&AElig;&ccedil;&Ccedil; &diams;&noent;&lrm;&rlm;&zwnj;&zwj;\x00"
let html =
b"&lt;&gt;&quot;&apos;&amp; &auml;&Auml;&ouml;&Ouml;&uuml;&Uuml;&szlig; foo&AElig;&ccedil;&Ccedil; &diams;&lrm;&rlm;&zwnj;&noent;&zwj;\x00"
as *const u8 as *const libc::c_char;
let plain: *mut libc::c_char =
simplify.simplify(html, strlen(html) as libc::c_int, true, 0);
let plain = simplify.simplify(html, strlen(html) as libc::c_int, true, 0);
assert_eq!(
strcmp(plain,
b"<>\"\'& \xc3\xa4\xc3\x84\xc3\xb6\xc3\x96\xc3\xbc\xc3\x9c\xc3\x9f foo\xc3\x86\xc3\xa7\xc3\x87 \xe2\x99\xa6&noent;\x00"
as *const u8 as *const libc::c_char),
0,
CStr::from_ptr(plain as *const libc::c_char)
.to_str()
.unwrap(),
"<>\"\'& äÄöÖüÜß fooÆçÇ \u{2666}\u{200e}\u{200f}\u{200c}&noent;\u{200d}"
);
free(plain as *mut libc::c_void);