refactor(dehtml): switch to quick-xml for parsing

This commit is contained in:
dignifiedquire
2019-08-16 23:09:31 +02:00
committed by holger krekel
parent eb2b4028ab
commit c901ab844f
4 changed files with 142 additions and 110 deletions

112
Cargo.lock generated
View File

@@ -182,7 +182,7 @@ version = "0.8.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"memchr 2.2.1 (registry+https://github.com/rust-lang/crates.io-index)",
"safemem 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)",
"safemem 0.3.2 (registry+https://github.com/rust-lang/crates.io-index)",
"slice-deque 0.2.4 (registry+https://github.com/rust-lang/crates.io-index)",
]
@@ -232,8 +232,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"error-chain 0.12.1 (registry+https://github.com/rust-lang/crates.io-index)",
"semver 0.9.0 (registry+https://github.com/rust-lang/crates.io-index)",
"serde 1.0.98 (registry+https://github.com/rust-lang/crates.io-index)",
"serde_derive 1.0.98 (registry+https://github.com/rust-lang/crates.io-index)",
"serde 1.0.99 (registry+https://github.com/rust-lang/crates.io-index)",
"serde_derive 1.0.99 (registry+https://github.com/rust-lang/crates.io-index)",
"serde_json 1.0.40 (registry+https://github.com/rust-lang/crates.io-index)",
]
@@ -331,7 +331,7 @@ dependencies = [
"idna 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)",
"log 0.4.8 (registry+https://github.com/rust-lang/crates.io-index)",
"publicsuffix 1.5.2 (registry+https://github.com/rust-lang/crates.io-index)",
"serde 1.0.98 (registry+https://github.com/rust-lang/crates.io-index)",
"serde 1.0.99 (registry+https://github.com/rust-lang/crates.io-index)",
"serde_json 1.0.40 (registry+https://github.com/rust-lang/crates.io-index)",
"time 0.1.42 (registry+https://github.com/rust-lang/crates.io-index)",
"try_from 0.3.2 (registry+https://github.com/rust-lang/crates.io-index)",
@@ -409,8 +409,8 @@ name = "ctor"
version = "0.1.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"quote 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)",
"syn 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)",
"quote 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)",
"syn 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
@@ -468,6 +468,7 @@ dependencies = [
"charset 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)",
"chrono 0.4.7 (registry+https://github.com/rust-lang/crates.io-index)",
"deltachat_derive 0.1.0",
"escaper 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)",
"failure 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)",
"failure_derive 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)",
"hex 0.3.2 (registry+https://github.com/rust-lang/crates.io-index)",
@@ -486,7 +487,7 @@ dependencies = [
"phf 0.7.24 (git+https://github.com/sfackler/rust-phf?rev=0d00821)",
"pkg-config 0.3.15 (registry+https://github.com/rust-lang/crates.io-index)",
"pretty_assertions 0.6.1 (registry+https://github.com/rust-lang/crates.io-index)",
"pretty_env_logger 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)",
"pretty_env_logger 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)",
"quick-xml 0.15.0 (registry+https://github.com/rust-lang/crates.io-index)",
"r2d2 0.8.5 (registry+https://github.com/rust-lang/crates.io-index)",
"r2d2_sqlite 0.12.0 (registry+https://github.com/rust-lang/crates.io-index)",
@@ -495,7 +496,7 @@ dependencies = [
"reqwest 0.9.19 (registry+https://github.com/rust-lang/crates.io-index)",
"rusqlite 0.20.0 (registry+https://github.com/rust-lang/crates.io-index)",
"rustyline 4.1.0 (registry+https://github.com/rust-lang/crates.io-index)",
"serde 1.0.98 (registry+https://github.com/rust-lang/crates.io-index)",
"serde 1.0.99 (registry+https://github.com/rust-lang/crates.io-index)",
"serde_json 1.0.40 (registry+https://github.com/rust-lang/crates.io-index)",
"sha2 0.8.0 (registry+https://github.com/rust-lang/crates.io-index)",
"smallvec 0.6.10 (registry+https://github.com/rust-lang/crates.io-index)",
@@ -620,6 +621,11 @@ dependencies = [
"cfg-if 0.1.9 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "entities"
version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "enum_primitive"
version = "0.1.1"
@@ -649,6 +655,14 @@ dependencies = [
"version_check 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "escaper"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"entities 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "failure"
version = "0.1.5"
@@ -849,8 +863,8 @@ dependencies = [
"backtrace 0.3.34 (registry+https://github.com/rust-lang/crates.io-index)",
"failure 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)",
"os_type 2.2.0 (registry+https://github.com/rust-lang/crates.io-index)",
"serde 1.0.98 (registry+https://github.com/rust-lang/crates.io-index)",
"serde_derive 1.0.98 (registry+https://github.com/rust-lang/crates.io-index)",
"serde 1.0.99 (registry+https://github.com/rust-lang/crates.io-index)",
"serde_derive 1.0.99 (registry+https://github.com/rust-lang/crates.io-index)",
"tempdir 0.3.7 (registry+https://github.com/rust-lang/crates.io-index)",
"termcolor 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)",
"toml 0.4.10 (registry+https://github.com/rust-lang/crates.io-index)",
@@ -1011,8 +1025,8 @@ dependencies = [
"log 0.4.8 (registry+https://github.com/rust-lang/crates.io-index)",
"native-tls 0.2.3 (registry+https://github.com/rust-lang/crates.io-index)",
"nom 4.2.3 (registry+https://github.com/rust-lang/crates.io-index)",
"serde 1.0.98 (registry+https://github.com/rust-lang/crates.io-index)",
"serde_derive 1.0.98 (registry+https://github.com/rust-lang/crates.io-index)",
"serde 1.0.99 (registry+https://github.com/rust-lang/crates.io-index)",
"serde_derive 1.0.99 (registry+https://github.com/rust-lang/crates.io-index)",
"serde_json 1.0.40 (registry+https://github.com/rust-lang/crates.io-index)",
]
@@ -1199,7 +1213,7 @@ dependencies = [
"log 0.4.8 (registry+https://github.com/rust-lang/crates.io-index)",
"openssl 0.10.24 (registry+https://github.com/rust-lang/crates.io-index)",
"openssl-probe 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)",
"openssl-sys 0.9.48 (registry+https://github.com/rust-lang/crates.io-index)",
"openssl-sys 0.9.49 (registry+https://github.com/rust-lang/crates.io-index)",
"schannel 0.1.15 (registry+https://github.com/rust-lang/crates.io-index)",
"security-framework 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)",
"security-framework-sys 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)",
@@ -1325,7 +1339,7 @@ dependencies = [
"foreign-types 0.3.2 (registry+https://github.com/rust-lang/crates.io-index)",
"lazy_static 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)",
"libc 0.2.62 (registry+https://github.com/rust-lang/crates.io-index)",
"openssl-sys 0.9.48 (registry+https://github.com/rust-lang/crates.io-index)",
"openssl-sys 0.9.49 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
@@ -1343,7 +1357,7 @@ dependencies = [
[[package]]
name = "openssl-sys"
version = "0.9.48"
version = "0.9.49"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"autocfg 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)",
@@ -1575,7 +1589,7 @@ dependencies = [
[[package]]
name = "pretty_env_logger"
version = "0.3.0"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"chrono 0.4.7 (registry+https://github.com/rust-lang/crates.io-index)",
@@ -1588,9 +1602,9 @@ name = "proc-macro-hack"
version = "0.5.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"proc-macro2 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)",
"quote 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)",
"syn 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)",
"proc-macro2 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)",
"quote 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)",
"syn 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
@@ -1603,7 +1617,7 @@ dependencies = [
[[package]]
name = "proc-macro2"
version = "1.0.0"
version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"unicode-xid 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)",
@@ -1656,10 +1670,10 @@ dependencies = [
[[package]]
name = "quote"
version = "1.0.0"
version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"proc-macro2 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)",
"proc-macro2 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
@@ -1904,7 +1918,7 @@ dependencies = [
"mime 0.3.13 (registry+https://github.com/rust-lang/crates.io-index)",
"mime_guess 2.0.1 (registry+https://github.com/rust-lang/crates.io-index)",
"native-tls 0.2.3 (registry+https://github.com/rust-lang/crates.io-index)",
"serde 1.0.98 (registry+https://github.com/rust-lang/crates.io-index)",
"serde 1.0.99 (registry+https://github.com/rust-lang/crates.io-index)",
"serde_json 1.0.40 (registry+https://github.com/rust-lang/crates.io-index)",
"serde_urlencoded 0.5.5 (registry+https://github.com/rust-lang/crates.io-index)",
"time 0.1.42 (registry+https://github.com/rust-lang/crates.io-index)",
@@ -2005,7 +2019,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "safemem"
version = "0.3.1"
version = "0.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
@@ -2068,7 +2082,7 @@ version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"semver-parser 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)",
"serde 1.0.98 (registry+https://github.com/rust-lang/crates.io-index)",
"serde 1.0.99 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
@@ -2078,20 +2092,20 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "serde"
version = "1.0.98"
version = "1.0.99"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"serde_derive 1.0.98 (registry+https://github.com/rust-lang/crates.io-index)",
"serde_derive 1.0.99 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "serde_derive"
version = "1.0.98"
version = "1.0.99"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"proc-macro2 0.4.30 (registry+https://github.com/rust-lang/crates.io-index)",
"quote 0.6.13 (registry+https://github.com/rust-lang/crates.io-index)",
"syn 0.15.44 (registry+https://github.com/rust-lang/crates.io-index)",
"proc-macro2 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)",
"quote 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)",
"syn 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
@@ -2101,7 +2115,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"itoa 0.4.4 (registry+https://github.com/rust-lang/crates.io-index)",
"ryu 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)",
"serde 1.0.98 (registry+https://github.com/rust-lang/crates.io-index)",
"serde 1.0.99 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
@@ -2111,7 +2125,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"dtoa 0.4.4 (registry+https://github.com/rust-lang/crates.io-index)",
"itoa 0.4.4 (registry+https://github.com/rust-lang/crates.io-index)",
"serde 1.0.98 (registry+https://github.com/rust-lang/crates.io-index)",
"serde 1.0.99 (registry+https://github.com/rust-lang/crates.io-index)",
"url 1.7.2 (registry+https://github.com/rust-lang/crates.io-index)",
]
@@ -2258,11 +2272,11 @@ dependencies = [
[[package]]
name = "syn"
version = "1.0.1"
version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"proc-macro2 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)",
"quote 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)",
"proc-macro2 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)",
"quote 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)",
"unicode-xid 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)",
]
@@ -2312,7 +2326,7 @@ name = "termcolor"
version = "1.0.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"wincolor 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)",
"wincolor 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
@@ -2469,7 +2483,7 @@ name = "toml"
version = "0.4.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"serde 1.0.98 (registry+https://github.com/rust-lang/crates.io-index)",
"serde 1.0.99 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
@@ -2676,7 +2690,7 @@ dependencies = [
[[package]]
name = "wincolor"
version = "1.0.1"
version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"winapi 0.3.7 (registry+https://github.com/rust-lang/crates.io-index)",
@@ -2803,9 +2817,11 @@ dependencies = [
"checksum ed25519-dalek 1.0.0-pre.1 (registry+https://github.com/rust-lang/crates.io-index)" = "81956bcf7ef761fb4e1d88de3fa181358a0d26cbcb9755b587a08f9119824b86"
"checksum either 1.5.2 (registry+https://github.com/rust-lang/crates.io-index)" = "5527cfe0d098f36e3f8839852688e63c8fff1c90b2b405aef730615f9a7bcf7b"
"checksum encoding_rs 0.8.17 (registry+https://github.com/rust-lang/crates.io-index)" = "4155785c79f2f6701f185eb2e6b4caf0555ec03477cb4c70db67b465311620ed"
"checksum entities 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)" = "b5320ae4c3782150d900b79807611a59a99fc9a1d61d686faafc24b93fc8d7ca"
"checksum enum_primitive 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "be4551092f4d519593039259a9ed8daedf0da12e5109c5280338073eaeb81180"
"checksum env_logger 0.6.2 (registry+https://github.com/rust-lang/crates.io-index)" = "aafcde04e90a5226a6443b7aabdb016ba2f8307c847d524724bd9b346dd1a2d3"
"checksum error-chain 0.12.1 (registry+https://github.com/rust-lang/crates.io-index)" = "3ab49e9dcb602294bc42f9a7dfc9bc6e936fca4418ea300dbfb84fe16de0b7d9"
"checksum escaper 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "39da344028c2227132b2dfa7c186e2104ecc153467583d00ed9c398f9ff693b0"
"checksum failure 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)" = "795bd83d3abeb9220f257e597aa0080a508b27533824adf336529648f6abf7e2"
"checksum failure_derive 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)" = "ea1063915fd7ef4309e222a5a07cf9c319fb9c7836b1f89b85458672dbb127e1"
"checksum fake-simd 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)" = "e88a8acf291dafb59c2d96e8f59828f3838bb1a70398823ade51a84de6a6deed"
@@ -2884,7 +2900,7 @@ dependencies = [
"checksum openssl 0.10.24 (registry+https://github.com/rust-lang/crates.io-index)" = "8152bb5a9b5b721538462336e3bef9a539f892715e5037fda0f984577311af15"
"checksum openssl-probe 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)" = "77af24da69f9d9341038eba93a073b1fdaaa1b788221b00a69bce9e762cb32de"
"checksum openssl-src 111.5.0+1.1.1c (registry+https://github.com/rust-lang/crates.io-index)" = "4bdebf3f49173e2f693e3ad83eed3aa9fe9e5a60e23c84010c29063b1497049e"
"checksum openssl-sys 0.9.48 (registry+https://github.com/rust-lang/crates.io-index)" = "b5ba300217253bcc5dc68bed23d782affa45000193866e025329aa8a7a9f05b8"
"checksum openssl-sys 0.9.49 (registry+https://github.com/rust-lang/crates.io-index)" = "f4fad9e54bd23bd4cbbe48fdc08a1b8091707ac869ef8508edea2fec77dcc884"
"checksum os_type 2.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "7edc011af0ae98b7f88cf7e4a83b70a54a75d2b8cb013d6efd02e5956207e9eb"
"checksum output_vt100 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)" = "53cdc5b785b7a58c5aad8216b3dfa114df64b0b06ae6e1501cef91df2fbdf8f9"
"checksum owning_ref 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "49a4b8ea2179e6a2e27411d3bca09ca6dd630821cf6894c6c7c8467a8ee7ef13"
@@ -2905,16 +2921,16 @@ dependencies = [
"checksum pkg-config 0.3.15 (registry+https://github.com/rust-lang/crates.io-index)" = "a7c1d2cfa5a714db3b5f24f0915e74fcdf91d09d496ba61329705dda7774d2af"
"checksum ppv-lite86 0.2.5 (registry+https://github.com/rust-lang/crates.io-index)" = "e3cbf9f658cdb5000fcf6f362b8ea2ba154b9f146a61c7a20d647034c6b6561b"
"checksum pretty_assertions 0.6.1 (registry+https://github.com/rust-lang/crates.io-index)" = "3f81e1644e1b54f5a68959a29aa86cde704219254669da328ecfdf6a1f09d427"
"checksum pretty_env_logger 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "df8b3f4e0475def7d9c2e5de8e5a1306949849761e107b360d03e98eafaffd61"
"checksum pretty_env_logger 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)" = "717ee476b1690853d222af4634056d830b5197ffd747726a9a1eee6da9f49074"
"checksum proc-macro-hack 0.5.9 (registry+https://github.com/rust-lang/crates.io-index)" = "e688f31d92ffd7c1ddc57a1b4e6d773c0f2a14ee437a4b0a4f5a69c80eb221c8"
"checksum proc-macro2 0.4.30 (registry+https://github.com/rust-lang/crates.io-index)" = "cf3d2011ab5c909338f7887f4fc896d35932e29146c12c8d01da6b22a80ba759"
"checksum proc-macro2 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "19f287c234c9b2d0308d692dee5c449c1a171167a6f8150f7cf2a49d8fd96967"
"checksum proc-macro2 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)" = "4c5c2380ae88876faae57698be9e9775e3544decad214599c3a6266cca6ac802"
"checksum publicsuffix 1.5.2 (registry+https://github.com/rust-lang/crates.io-index)" = "5afecba86dcf1e4fd610246f89899d1924fe12e1e89f555eb7c7f710f3c5ad1d"
"checksum pulldown-cmark 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "eef52fac62d0ea7b9b4dc7da092aa64ea7ec3d90af6679422d3d7e0e14b6ee15"
"checksum quick-error 1.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "9274b940887ce9addde99c4eee6b5c44cc494b182b97e73dc8ffdcb3397fd3f0"
"checksum quick-xml 0.15.0 (registry+https://github.com/rust-lang/crates.io-index)" = "c2b074258da4f2ccb1c450380c5bd8e77db2508de2161f574c70930d8e880482"
"checksum quote 0.6.13 (registry+https://github.com/rust-lang/crates.io-index)" = "6ce23b6b870e8f94f81fb0a363d65d86675884b34a09043c81e5562f11c1f8e1"
"checksum quote 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "7ab938ebe6f1c82426b5fb82eaf10c3e3028c53deaa3fbe38f5904b37cf4d767"
"checksum quote 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)" = "49d77c41ca8767f2f41394c11a4eebccab83da25e7cc035387a3125f02be90a3"
"checksum r2d2 0.8.5 (registry+https://github.com/rust-lang/crates.io-index)" = "bc42ce75d9f4447fb2a04bbe1ed5d18dd949104572850ec19b164e274919f81b"
"checksum r2d2_sqlite 0.12.0 (registry+https://github.com/rust-lang/crates.io-index)" = "806e268035ce9e5a604bf617ac8a073ef28b59ef2e48e8338db0baf530caef33"
"checksum rand 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)" = "552840b97013b1a26992c11eac34bdd778e464601a4c2054b5f0bff7c6761293"
@@ -2948,7 +2964,7 @@ dependencies = [
"checksum rustc_version 0.2.3 (registry+https://github.com/rust-lang/crates.io-index)" = "138e3e0acb6c9fb258b19b67cb8abd63c00679d2851805ea151465464fe9030a"
"checksum rustyline 4.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "0f47ea1ceb347d2deae482d655dc8eef4bd82363d3329baffa3818bd76fea48b"
"checksum ryu 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "c92464b447c0ee8c4fb3824ecc8383b81717b9f1e74ba2e72540aef7b9f82997"
"checksum safemem 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)" = "e133ccc4f4d1cd4f89cc8a7ff618287d56dc7f638b8e38fc32c5fdcadc339dd5"
"checksum safemem 0.3.2 (registry+https://github.com/rust-lang/crates.io-index)" = "d2b08423011dae9a5ca23f07cf57dac3857f5c885d352b76f6d95f4aea9434d0"
"checksum same-file 1.0.5 (registry+https://github.com/rust-lang/crates.io-index)" = "585e8ddcedc187886a30fa705c47985c3fa88d06624095856b36ca0b82ff4421"
"checksum schannel 0.1.15 (registry+https://github.com/rust-lang/crates.io-index)" = "f2f6abf258d99c3c1c5c2131d99d064e94b7b3dd5f416483057f308fea253339"
"checksum scheduled-thread-pool 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "bd07742e081ff6c077f5f6b283f12f32b9e7cc765b316160d66289b74546fbb3"
@@ -2958,8 +2974,8 @@ dependencies = [
"checksum security-framework-sys 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)" = "9636f8989cbf61385ae4824b98c1aaa54c994d7d8b41f11c601ed799f0549a56"
"checksum semver 0.9.0 (registry+https://github.com/rust-lang/crates.io-index)" = "1d7eb9ef2c18661902cc47e535f9bc51b78acd254da71d375c2f6720d9a40403"
"checksum semver-parser 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)" = "388a1df253eca08550bef6c72392cfe7c30914bf41df5269b68cbd6ff8f570a3"
"checksum serde 1.0.98 (registry+https://github.com/rust-lang/crates.io-index)" = "7fe5626ac617da2f2d9c48af5515a21d5a480dbd151e01bb1c355e26a3e68113"
"checksum serde_derive 1.0.98 (registry+https://github.com/rust-lang/crates.io-index)" = "01e69e1b8a631f245467ee275b8c757b818653c6d704cdbcaeb56b56767b529c"
"checksum serde 1.0.99 (registry+https://github.com/rust-lang/crates.io-index)" = "fec2851eb56d010dc9a21b89ca53ee75e6528bab60c11e89d38390904982da9f"
"checksum serde_derive 1.0.99 (registry+https://github.com/rust-lang/crates.io-index)" = "cb4dc18c61206b08dc98216c98faa0232f4337e1e1b8574551d5bad29ea1b425"
"checksum serde_json 1.0.40 (registry+https://github.com/rust-lang/crates.io-index)" = "051c49229f282f7c6f3813f8286cc1e3323e8051823fce42c7ea80fe13521704"
"checksum serde_urlencoded 0.5.5 (registry+https://github.com/rust-lang/crates.io-index)" = "642dd69105886af2efd227f75a520ec9b44a820d65bc133a9131f7d229fd165a"
"checksum sha-1 0.8.1 (registry+https://github.com/rust-lang/crates.io-index)" = "23962131a91661d643c98940b20fcaffe62d776a823247be80a48fcb8b6fce68"
@@ -2979,7 +2995,7 @@ dependencies = [
"checksum subtle 2.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "01f40907d9ffc762709e4ff3eb4a6f6b41b650375a3f09ac92b641942b7fb082"
"checksum syn 0.14.9 (registry+https://github.com/rust-lang/crates.io-index)" = "261ae9ecaa397c42b960649561949d69311f08eeaea86a65696e6e46517cf741"
"checksum syn 0.15.44 (registry+https://github.com/rust-lang/crates.io-index)" = "9ca4b3b69a77cbe1ffc9e198781b7acb0c7365a883670e8f1c1bc66fba79a5c5"
"checksum syn 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)" = "863ecbce06044c8380458360b4146d7372edadfedd77f120ba8c193da427b708"
"checksum syn 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)" = "2ae5cd13590144ea968ba5d5520da7a4c08415861014399b5b349f74591c375f"
"checksum synstructure 0.10.2 (registry+https://github.com/rust-lang/crates.io-index)" = "02353edf96d6e4dc81aea2d8490a7e9db177bf8acb0e951c24940bf866cb313f"
"checksum tempdir 0.3.7 (registry+https://github.com/rust-lang/crates.io-index)" = "15f2b5fb00ccdf689e0149d1b1b3c03fead81c2b37735d812fa8bddbbf41b6d8"
"checksum tempfile 3.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "7a6e24d9338a0a5be79593e2fa15a648add6138caa803e2d5bc782c371732ca9"
@@ -3029,7 +3045,7 @@ dependencies = [
"checksum winapi-util 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)" = "7168bab6e1daee33b4557efd0e95d5ca70a03706d39fa5f3fe7a236f584b03c9"
"checksum winapi-x86_64-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
"checksum wincolor 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)" = "eeb06499a3a4d44302791052df005d5232b927ed1a9658146d842165c4de7767"
"checksum wincolor 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)" = "561ed901ae465d6185fa7864d63fbd5720d0ef718366c9a4dc83cf6170d7e9ba"
"checksum wincolor 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)" = "96f5016b18804d24db43cebf3c77269e7569b8954a8464501c216cc5e070eaa9"
"checksum winreg 0.6.2 (registry+https://github.com/rust-lang/crates.io-index)" = "b2986deb581c4fe11b621998a5e53361efe6b48a151178d0cd9eeffa4dc6acc9"
"checksum winutil 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "7daf138b6b14196e3830a588acf1e86966c694d3e8fb026fb105b8b5dca07e6e"
"checksum ws2_32-sys 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "d59cefebd0c892fa2dd6de581e937301d8552cb44489cdff035c6187cb63fa5e"

View File

@@ -48,6 +48,7 @@ byteorder = "1.3.1"
itertools = "0.8.0"
image-meta = "0.1.0"
quick-xml = "0.15.0"
escaper = "0.1.0"
[dev-dependencies]
tempfile = "3.0"

View File

@@ -1,9 +1,9 @@
use lazy_static::lazy_static;
use quick_xml;
use quick_xml::events::{BytesEnd, BytesStart, BytesText};
use crate::dc_saxparser::*;
use crate::dc_tools::*;
use crate::x::*;
use std::ptr;
lazy_static! {
static ref LINE_RE: regex::Regex = regex::Regex::new(r"(\r?\n)+").unwrap();
@@ -35,56 +35,67 @@ pub unsafe fn dc_dehtml(buf_terminated: *mut libc::c_char) -> *mut libc::c_char
add_text: AddText::YesRemoveLineEnds,
last_href: None,
};
let mut saxparser = dc_saxparser_t {
starttag_cb: None,
endtag_cb: None,
text_cb: None,
userdata: ptr::null_mut(),
};
dc_saxparser_init(
&mut saxparser,
&mut dehtml as *mut Dehtml as *mut libc::c_void,
);
dc_saxparser_set_tag_handler(
&mut saxparser,
Some(dehtml_starttag_cb),
Some(dehtml_endtag_cb),
);
dc_saxparser_set_text_handler(&mut saxparser, Some(dehtml_text_cb));
dc_saxparser_parse(&mut saxparser, buf_terminated);
let mut reader = quick_xml::Reader::from_str(as_str(buf_terminated));
let mut buf = Vec::new();
loop {
match reader.read_event(&mut buf) {
Ok(quick_xml::events::Event::Start(ref e)) => {
dehtml_starttag_cb(e, &mut dehtml, &reader)
}
Ok(quick_xml::events::Event::End(ref e)) => dehtml_endtag_cb(e, &mut dehtml),
Ok(quick_xml::events::Event::Text(ref e)) => dehtml_text_cb(e, &mut dehtml),
Ok(quick_xml::events::Event::CData(ref e)) => dehtml_cdata_cb(e, &mut dehtml),
Err(e) => {
eprintln!(
"Parse html error: Error at position {}: {:?}",
reader.buffer_position(),
e
);
}
Ok(quick_xml::events::Event::Eof) => break,
_ => (),
}
buf.clear();
}
dehtml.strbuilder.strdup()
}
unsafe fn dehtml_text_cb(
userdata: *mut libc::c_void,
text: *const libc::c_char,
_len: libc::c_int,
) {
let dehtml = &mut *(userdata as *mut Dehtml);
fn dehtml_text_cb(event: &BytesText, dehtml: &mut Dehtml) {
if dehtml.add_text == AddText::YesPreserveLineEnds
|| dehtml.add_text == AddText::YesRemoveLineEnds
{
let last_added = std::ffi::CStr::from_ptr(text)
.to_str()
.expect("invalid utf8");
// TODO: why does len does not match?
// assert_eq!(last_added.len(), len as usize);
let last_added = escaper::decode_html_buf_sloppy(event.escaped()).unwrap_or_default();
if dehtml.add_text == AddText::YesRemoveLineEnds {
dehtml.strbuilder += LINE_RE.replace_all(last_added.as_ref(), "\r").as_ref();
dehtml.strbuilder += LINE_RE.replace_all(&last_added, "\r").as_ref();
} else {
dehtml.strbuilder += last_added.as_ref();
dehtml.strbuilder += &last_added;
}
}
}
unsafe fn dehtml_endtag_cb(userdata: *mut libc::c_void, tag: *const libc::c_char) {
let mut dehtml = &mut *(userdata as *mut Dehtml);
let tag = std::ffi::CStr::from_ptr(tag).to_string_lossy();
fn dehtml_cdata_cb(event: &BytesText, dehtml: &mut Dehtml) {
if dehtml.add_text == AddText::YesPreserveLineEnds
|| dehtml.add_text == AddText::YesRemoveLineEnds
{
let last_added = escaper::decode_html_buf_sloppy(event.escaped()).unwrap_or_default();
match tag.as_ref() {
if dehtml.add_text == AddText::YesRemoveLineEnds {
dehtml.strbuilder += LINE_RE.replace_all(&last_added, "\r").as_ref();
} else {
dehtml.strbuilder += &last_added;
}
}
}
fn dehtml_endtag_cb(event: &BytesEnd, dehtml: &mut Dehtml) {
let tag = String::from_utf8_lossy(event.name()).trim().to_lowercase();
match tag.as_str() {
"p" | "div" | "table" | "td" | "style" | "script" | "title" | "pre" => {
dehtml.strbuilder += "\n\n";
dehtml.add_text = AddText::YesRemoveLineEnds;
@@ -106,15 +117,14 @@ unsafe fn dehtml_endtag_cb(userdata: *mut libc::c_void, tag: *const libc::c_char
}
}
unsafe fn dehtml_starttag_cb(
userdata: *mut libc::c_void,
tag: *const libc::c_char,
attr: *mut *mut libc::c_char,
fn dehtml_starttag_cb<B: std::io::BufRead>(
event: &BytesStart,
dehtml: &mut Dehtml,
reader: &quick_xml::Reader<B>,
) {
let mut dehtml = &mut *(userdata as *mut Dehtml);
let tag = std::ffi::CStr::from_ptr(tag).to_string_lossy();
let tag = String::from_utf8_lossy(event.name()).trim().to_lowercase();
match tag.as_ref() {
match tag.as_str() {
"p" | "div" | "table" | "td" => {
dehtml.strbuilder += "\n\n";
dehtml.add_text = AddText::YesRemoveLineEnds;
@@ -131,14 +141,21 @@ unsafe fn dehtml_starttag_cb(
dehtml.add_text = AddText::YesPreserveLineEnds;
}
"a" => {
let text_c = std::ffi::CStr::from_ptr(dc_attr_find(
attr,
b"href\x00" as *const u8 as *const libc::c_char,
));
let text_r = text_c.to_str().expect("invalid utf8");
if !text_r.is_empty() {
dehtml.last_href = Some(text_r.to_string());
dehtml.strbuilder += "[";
if let Some(href) = event.html_attributes().find(|attr| {
attr.as_ref()
.map(|a| String::from_utf8_lossy(a.key).trim().to_lowercase() == "href")
.unwrap_or_default()
}) {
let href = href
.unwrap()
.unescape_and_decode_value(reader)
.unwrap_or_default()
.to_lowercase();
if !href.is_empty() {
dehtml.last_href = Some(href);
dehtml.strbuilder += "[";
}
}
}
"b" | "strong" => {

View File

@@ -311,8 +311,7 @@ mod tests {
let html: *const libc::c_char =
b"<!DOCTYPE name [<!DOCTYPE ...>]><!-- comment -->text <b><?php echo ... ?>bold</b><![CDATA[<>]]>\x00"
as *const u8 as *const libc::c_char;
let plain: *mut libc::c_char =
simplify.simplify(html, strlen(html) as libc::c_int, true, 0);
let plain = simplify.simplify(html, strlen(html) as libc::c_int, true, 0);
assert_eq!(
CStr::from_ptr(plain as *const libc::c_char)
@@ -329,17 +328,16 @@ mod tests {
fn test_simplify_html_encoded() {
unsafe {
let mut simplify = Simplify::new();
let html: *const libc::c_char =
b"&lt;&gt;&quot;&apos;&amp; &auml;&Auml;&ouml;&Ouml;&uuml;&Uuml;&szlig; foo&AElig;&ccedil;&Ccedil; &diams;&noent;&lrm;&rlm;&zwnj;&zwj;\x00"
let html =
b"&lt;&gt;&quot;&apos;&amp; &auml;&Auml;&ouml;&Ouml;&uuml;&Uuml;&szlig; foo&AElig;&ccedil;&Ccedil; &diams;&lrm;&rlm;&zwnj;&noent;&zwj;\x00"
as *const u8 as *const libc::c_char;
let plain: *mut libc::c_char =
simplify.simplify(html, strlen(html) as libc::c_int, true, 0);
let plain = simplify.simplify(html, strlen(html) as libc::c_int, true, 0);
assert_eq!(
strcmp(plain,
b"<>\"\'& \xc3\xa4\xc3\x84\xc3\xb6\xc3\x96\xc3\xbc\xc3\x9c\xc3\x9f foo\xc3\x86\xc3\xa7\xc3\x87 \xe2\x99\xa6&noent;\x00"
as *const u8 as *const libc::c_char),
0,
CStr::from_ptr(plain as *const libc::c_char)
.to_str()
.unwrap(),
"<>\"\'& äÄöÖüÜß fooÆçÇ \u{2666}\u{200e}\u{200f}\u{200c}&noent;\u{200d}"
);
free(plain as *mut libc::c_void);