From 0df13bdb7852a26741ac53931678ebd741e74320 Mon Sep 17 00:00:00 2001 From: Danny Willems Date: Mon, 9 Feb 2026 11:54:13 -0300 Subject: [PATCH] fix: decode HTML character references in tokenizer Add entity decoding to the tokenizer so that named (`&amp;`), decimal (`&#65;`) and hex (`&#x41;`) character references are resolved to their corresponding characters during tokenization. - New entities module with 253 named HTML entities and binary search - Numeric references handle decimal and hex, map null/surrogates to U+FFFD per WHATWG spec - Named references use longest-match and respect the WHATWG attribute context rule (no decode when followed by = or alphanumeric without ;) - Entity handling in Data, AttributeValueDoubleQuoted, AttributeValueSingleQuoted, and AttributeValueUnquoted states - 28 new tests covering tokenizer and tree builder integration --- Cargo.lock | 519 ++++++++++++++++++++- crates/ironhtml-parser/src/entities.rs | 332 +++++++++++++ crates/ironhtml-parser/src/lib.rs | 1 + crates/ironhtml-parser/src/tokenizer.rs | 357 ++++++++++++++ crates/ironhtml-parser/src/tree_builder.rs | 75 +++ 5 files changed, 1282 insertions(+), 2 deletions(-) create mode 100644 crates/ironhtml-parser/src/entities.rs diff --git a/Cargo.lock b/Cargo.lock index 590f37a..7eb82c1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,10 +2,198 @@ # It is not intended for manual editing. 
version = 4 +[[package]] +name = "aho-corasick" +version = "1.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ddd31a130427c27518df266943a5308ed92d4b226cc639f5a8f1002816174301" +dependencies = [ + "memchr", +] + +[[package]] +name = "anes" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299" + +[[package]] +name = "anstyle" +version = "1.0.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5192cca8006f1fd4f7237516f40fa183bb07f8fbdfedaa0036de5ea9b0b45e78" + +[[package]] +name = "autocfg" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" + +[[package]] +name = "bumpalo" +version = "3.19.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5dd9dc738b7a8311c7ade152424974d8115f2cdad61e8dab8dac9f2362298510" + +[[package]] +name = "cast" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" + +[[package]] +name = "cfg-if" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" + +[[package]] +name = "ciborium" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42e69ffd6f0917f5c029256a24d0161db17cea3997d185db0d35926308770f0e" +dependencies = [ + "ciborium-io", + "ciborium-ll", + "serde", +] + +[[package]] +name = "ciborium-io" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "05afea1e0a06c9be33d539b876f1ce3692f4afea2cb41f740e7743225ed1c757" + +[[package]] +name = "ciborium-ll" +version = "0.2.2" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "57663b653d948a338bfb3eeba9bb2fd5fcfaecb9e199e87e1eda4d9e8b240fd9" +dependencies = [ + "ciborium-io", + "half", +] + +[[package]] +name = "clap" +version = "4.5.57" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6899ea499e3fb9305a65d5ebf6e3d2248c5fab291f300ad0a704fbe142eae31a" +dependencies = [ + "clap_builder", +] + +[[package]] +name = "clap_builder" +version = "4.5.57" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b12c8b680195a62a8364d16b8447b01b6c2c8f9aaf68bee653be34d4245e238" +dependencies = [ + "anstyle", + "clap_lex", +] + +[[package]] +name = "clap_lex" +version = "0.7.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c3e64b0cc0439b12df2fa678eae89a1c56a529fd067a9115f7827f1fffd22b32" + +[[package]] +name = "criterion" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2b12d017a929603d80db1831cd3a24082f8137ce19c69e6447f54f5fc8d692f" +dependencies = [ + "anes", + "cast", + "ciborium", + "clap", + "criterion-plot", + "is-terminal", + "itertools", + "num-traits", + "once_cell", + "oorandom", + "plotters", + "rayon", + "regex", + "serde", + "serde_derive", + "serde_json", + "tinytemplate", + "walkdir", +] + +[[package]] +name = "criterion-plot" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6b50826342786a51a89e2da3a28f1c32b06e387201bc2d19791f622c673706b1" +dependencies = [ + "cast", + "itertools", +] + +[[package]] +name = "crossbeam-deque" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51" +dependencies = [ + "crossbeam-epoch", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" + +[[package]] +name = "crunchy" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "460fbee9c2c2f33933d720630a6a0bac33ba7053db5344fac858d4b8952d77d5" + +[[package]] +name = "either" +version = "1.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" + +[[package]] +name = "half" +version = "2.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ea2d84b969582b4b1864a92dc5d27cd2b77b622a8d79306834f1be5ba20d84b" +dependencies = [ + "cfg-if", + "crunchy", + "zerocopy", +] + +[[package]] +name = "hermit-abi" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc0fef456e4baa96da950455cd02c081ca953b141298e41db3fc7e36b1da849c" + [[package]] name = "ironhtml" version = "1.0.0" dependencies = [ + "criterion", "ironhtml-attributes", "ironhtml-elements", "ironhtml-macro", @@ -32,8 +220,6 @@ version = "1.0.0" name = "ironhtml-macro" version = "1.0.0" dependencies = [ - "ironhtml", - "ironhtml-elements", "proc-macro2", "quote", "syn", @@ -46,6 +232,103 @@ dependencies = [ "ironhtml-elements", ] +[[package]] +name = "is-terminal" +version = "0.4.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3640c1c38b8e4e43584d8df18be5fc6b0aa314ce6ebf51b53313d4306cca8e46" +dependencies = [ + "hermit-abi", + "libc", + "windows-sys", +] + +[[package]] +name = "itertools" +version = "0.10.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b0fd2260e829bddf4cb6ea802289de2f86d6a7a690192fbe91b3f46e0f2c8473" 
+dependencies = [ + "either", +] + +[[package]] +name = "itoa" +version = "1.0.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "92ecc6618181def0457392ccd0ee51198e065e016d1d527a7ac1b6dc7c1f09d2" + +[[package]] +name = "js-sys" +version = "0.3.85" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8c942ebf8e95485ca0d52d97da7c5a2c387d0e7f0ba4c35e93bfcaee045955b3" +dependencies = [ + "once_cell", + "wasm-bindgen", +] + +[[package]] +name = "libc" +version = "0.2.180" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bcc35a38544a891a5f7c865aca548a982ccb3b8650a5b06d0fd33a10283c56fc" + +[[package]] +name = "memchr" +version = "2.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79" + +[[package]] +name = "num-traits" +version = "0.2.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" +dependencies = [ + "autocfg", +] + +[[package]] +name = "once_cell" +version = "1.21.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" + +[[package]] +name = "oorandom" +version = "11.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d6790f58c7ff633d8771f42965289203411a5e5c68388703c06e14f24770b41e" + +[[package]] +name = "plotters" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5aeb6f403d7a4911efb1e33402027fc44f29b5bf6def3effcc22d7bb75f2b747" +dependencies = [ + "num-traits", + "plotters-backend", + "plotters-svg", + "wasm-bindgen", + "web-sys", +] + +[[package]] +name = "plotters-backend" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"df42e13c12958a16b3f7f4386b9ab1f3e7933914ecea48da7139435263a4172a" + +[[package]] +name = "plotters-svg" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "51bae2ac328883f7acdfea3d66a7c35751187f870bc81f94563733a154d7a670" +dependencies = [ + "plotters-backend", +] + [[package]] name = "proc-macro2" version = "1.0.106" @@ -64,6 +347,113 @@ dependencies = [ "proc-macro2", ] +[[package]] +name = "rayon" +version = "1.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "368f01d005bf8fd9b1206fb6fa653e6c4a81ceb1466406b81792d87c5677a58f" +dependencies = [ + "either", + "rayon-core", +] + +[[package]] +name = "rayon-core" +version = "1.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22e18b0f0062d30d4230b2e85ff77fdfe4326feb054b9783a3460d8435c8ab91" +dependencies = [ + "crossbeam-deque", + "crossbeam-utils", +] + +[[package]] +name = "regex" +version = "1.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e10754a14b9137dd7b1e3e5b0493cc9171fdd105e0ab477f51b72e7f3ac0e276" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.4.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e1dd4122fc1595e8162618945476892eefca7b88c52820e74af6262213cae8f" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.8.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a96887878f22d7bad8a3b6dc5b7440e0ada9a245242924394987b21cf2210a4c" + +[[package]] +name = "rustversion" +version = "1.0.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" + +[[package]] +name = "same-file" +version = "1.0.6" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" +dependencies = [ + "winapi-util", +] + +[[package]] +name = "serde" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e" +dependencies = [ + "serde_core", + "serde_derive", +] + +[[package]] +name = "serde_core" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "serde_json" +version = "1.0.149" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "83fc039473c5595ace860d8c4fafa220ff474b3fc6bfdb4293327f1a37e94d86" +dependencies = [ + "itoa", + "memchr", + "serde", + "serde_core", + "zmij", +] + [[package]] name = "syn" version = "2.0.114" @@ -75,8 +465,133 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "tinytemplate" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc" +dependencies = [ + "serde", + "serde_json", +] + [[package]] name = "unicode-ident" version = "1.0.22" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9312f7c4f6ff9069b165498234ce8be658059c6728633667c526e27dc2cf1df5" + +[[package]] +name = "walkdir" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b" +dependencies = [ + "same-file", + 
"winapi-util", +] + +[[package]] +name = "wasm-bindgen" +version = "0.2.108" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "64024a30ec1e37399cf85a7ffefebdb72205ca1c972291c51512360d90bd8566" +dependencies = [ + "cfg-if", + "once_cell", + "rustversion", + "wasm-bindgen-macro", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-macro" +version = "0.2.108" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "008b239d9c740232e71bd39e8ef6429d27097518b6b30bdf9086833bd5b6d608" +dependencies = [ + "quote", + "wasm-bindgen-macro-support", +] + +[[package]] +name = "wasm-bindgen-macro-support" +version = "0.2.108" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5256bae2d58f54820e6490f9839c49780dff84c65aeab9e772f15d5f0e913a55" +dependencies = [ + "bumpalo", + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-shared" +version = "0.2.108" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1f01b580c9ac74c8d8f0c0e4afb04eeef2acf145458e52c03845ee9cd23e3d12" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "web-sys" +version = "0.3.85" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "312e32e551d92129218ea9a2452120f4aabc03529ef03e4d0d82fb2780608598" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + +[[package]] +name = "winapi-util" +version = "0.1.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22" +dependencies = [ + "windows-sys", +] + +[[package]] +name = "windows-link" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" + +[[package]] +name = "windows-sys" +version = "0.61.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum 
= "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc" +dependencies = [ + "windows-link", +] + +[[package]] +name = "zerocopy" +version = "0.8.39" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "db6d35d663eadb6c932438e763b262fe1a70987f9ae936e60158176d710cae4a" +dependencies = [ + "zerocopy-derive", +] + +[[package]] +name = "zerocopy-derive" +version = "0.8.39" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4122cd3169e94605190e77839c9a40d40ed048d305bfdc146e7df40ab0f3e517" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "zmij" +version = "1.0.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4de98dfa5d5b7fef4ee834d0073d560c9ca7b6c46a71d058c48db7960f8cfaf7" diff --git a/crates/ironhtml-parser/src/entities.rs b/crates/ironhtml-parser/src/entities.rs new file mode 100644 index 0000000..30132c7 --- /dev/null +++ b/crates/ironhtml-parser/src/entities.rs @@ -0,0 +1,332 @@ +//! HTML named character reference lookup. +//! +//! Provides a sorted table of HTML named entities and a binary-search +//! lookup function. Covers all HTML4 named entities plus common HTML5 +//! single-character additions from the WHATWG spec. + +/// Look up a named HTML character reference (without the leading `&` +/// or trailing `;`). +/// +/// Returns the replacement character(s), or `None` if the name is +/// unknown. +pub fn lookup(name: &str) -> Option<&'static [char]> { + ENTITIES + .binary_search_by_key(&name, |(n, _)| n) + .ok() + .map(|i| ENTITIES[i].1) +} + +/// Sorted table of `(name, replacement_chars)` pairs. +/// +/// **Must remain sorted by name** — `lookup` relies on binary search. 
+static ENTITIES: &[(&str, &[char])] = &[ + ("AElig", &['\u{00C6}']), + ("Aacute", &['\u{00C1}']), + ("Acirc", &['\u{00C2}']), + ("Agrave", &['\u{00C0}']), + ("Alpha", &['\u{0391}']), + ("Aring", &['\u{00C5}']), + ("Atilde", &['\u{00C3}']), + ("Auml", &['\u{00C4}']), + ("Beta", &['\u{0392}']), + ("Ccedil", &['\u{00C7}']), + ("Chi", &['\u{03A7}']), + ("Dagger", &['\u{2021}']), + ("Delta", &['\u{0394}']), + ("ETH", &['\u{00D0}']), + ("Eacute", &['\u{00C9}']), + ("Ecirc", &['\u{00CA}']), + ("Egrave", &['\u{00C8}']), + ("Epsilon", &['\u{0395}']), + ("Eta", &['\u{0397}']), + ("Euml", &['\u{00CB}']), + ("Gamma", &['\u{0393}']), + ("Iacute", &['\u{00CD}']), + ("Icirc", &['\u{00CE}']), + ("Igrave", &['\u{00CC}']), + ("Iota", &['\u{0399}']), + ("Iuml", &['\u{00CF}']), + ("Kappa", &['\u{039A}']), + ("Lambda", &['\u{039B}']), + ("Mu", &['\u{039C}']), + ("Ntilde", &['\u{00D1}']), + ("Nu", &['\u{039D}']), + ("OElig", &['\u{0152}']), + ("Oacute", &['\u{00D3}']), + ("Ocirc", &['\u{00D4}']), + ("Ograve", &['\u{00D2}']), + ("Omega", &['\u{03A9}']), + ("Omicron", &['\u{039F}']), + ("Oslash", &['\u{00D8}']), + ("Otilde", &['\u{00D5}']), + ("Ouml", &['\u{00D6}']), + ("Phi", &['\u{03A6}']), + ("Pi", &['\u{03A0}']), + ("Prime", &['\u{2033}']), + ("Psi", &['\u{03A8}']), + ("Rho", &['\u{03A1}']), + ("Scaron", &['\u{0160}']), + ("Sigma", &['\u{03A3}']), + ("THORN", &['\u{00DE}']), + ("Tau", &['\u{03A4}']), + ("Theta", &['\u{0398}']), + ("Uacute", &['\u{00DA}']), + ("Ucirc", &['\u{00DB}']), + ("Ugrave", &['\u{00D9}']), + ("Upsilon", &['\u{03A5}']), + ("Uuml", &['\u{00DC}']), + ("Xi", &['\u{039E}']), + ("Yacute", &['\u{00DD}']), + ("Yuml", &['\u{0178}']), + ("Zeta", &['\u{0396}']), + ("aacute", &['\u{00E1}']), + ("acirc", &['\u{00E2}']), + ("acute", &['\u{00B4}']), + ("aelig", &['\u{00E6}']), + ("agrave", &['\u{00E0}']), + ("alefsym", &['\u{2135}']), + ("alpha", &['\u{03B1}']), + ("amp", &['&']), + ("and", &['\u{2227}']), + ("ang", &['\u{2220}']), + ("apos", &['\'']), + ("aring", 
&['\u{00E5}']), + ("ast", &['*']), + ("atilde", &['\u{00E3}']), + ("auml", &['\u{00E4}']), + ("bdquo", &['\u{201E}']), + ("beta", &['\u{03B2}']), + ("brvbar", &['\u{00A6}']), + ("bull", &['\u{2022}']), + ("cap", &['\u{2229}']), + ("ccedil", &['\u{00E7}']), + ("cedil", &['\u{00B8}']), + ("cent", &['\u{00A2}']), + ("chi", &['\u{03C7}']), + ("circ", &['\u{02C6}']), + ("clubs", &['\u{2663}']), + ("cong", &['\u{2245}']), + ("copy", &['\u{00A9}']), + ("crarr", &['\u{21B5}']), + ("cup", &['\u{222A}']), + ("curren", &['\u{00A4}']), + ("dArr", &['\u{21D3}']), + ("dagger", &['\u{2020}']), + ("darr", &['\u{2193}']), + ("deg", &['\u{00B0}']), + ("delta", &['\u{03B4}']), + ("diams", &['\u{2666}']), + ("divide", &['\u{00F7}']), + ("eacute", &['\u{00E9}']), + ("ecirc", &['\u{00EA}']), + ("egrave", &['\u{00E8}']), + ("empty", &['\u{2205}']), + ("emsp", &['\u{2003}']), + ("ensp", &['\u{2002}']), + ("epsilon", &['\u{03B5}']), + ("equiv", &['\u{2261}']), + ("eta", &['\u{03B7}']), + ("eth", &['\u{00F0}']), + ("euml", &['\u{00EB}']), + ("euro", &['\u{20AC}']), + ("exist", &['\u{2203}']), + ("fnof", &['\u{0192}']), + ("forall", &['\u{2200}']), + ("frac12", &['\u{00BD}']), + ("frac14", &['\u{00BC}']), + ("frac34", &['\u{00BE}']), + ("frasl", &['\u{2044}']), + ("gamma", &['\u{03B3}']), + ("ge", &['\u{2265}']), + ("gt", &['>']), + ("hArr", &['\u{21D4}']), + ("harr", &['\u{2194}']), + ("hearts", &['\u{2665}']), + ("hellip", &['\u{2026}']), + ("iacute", &['\u{00ED}']), + ("icirc", &['\u{00EE}']), + ("iexcl", &['\u{00A1}']), + ("igrave", &['\u{00EC}']), + ("image", &['\u{2111}']), + ("infin", &['\u{221E}']), + ("int", &['\u{222B}']), + ("iota", &['\u{03B9}']), + ("iquest", &['\u{00BF}']), + ("isin", &['\u{2208}']), + ("iuml", &['\u{00EF}']), + ("kappa", &['\u{03BA}']), + ("lArr", &['\u{21D0}']), + ("lambda", &['\u{03BB}']), + ("lang", &['\u{2329}']), + ("laquo", &['\u{00AB}']), + ("larr", &['\u{2190}']), + ("lceil", &['\u{2308}']), + ("ldquo", &['\u{201C}']), + ("le", &['\u{2264}']), + 
("lfloor", &['\u{230A}']), + ("lowast", &['\u{2217}']), + ("loz", &['\u{25CA}']), + ("lrm", &['\u{200E}']), + ("lsaquo", &['\u{2039}']), + ("lsquo", &['\u{2018}']), + ("lt", &['<']), + ("macr", &['\u{00AF}']), + ("mdash", &['\u{2014}']), + ("micro", &['\u{00B5}']), + ("middot", &['\u{00B7}']), + ("minus", &['\u{2212}']), + ("mu", &['\u{03BC}']), + ("nabla", &['\u{2207}']), + ("nbsp", &['\u{00A0}']), + ("ndash", &['\u{2013}']), + ("ne", &['\u{2260}']), + ("ni", &['\u{220B}']), + ("not", &['\u{00AC}']), + ("notin", &['\u{2209}']), + ("nsub", &['\u{2284}']), + ("ntilde", &['\u{00F1}']), + ("nu", &['\u{03BD}']), + ("oacute", &['\u{00F3}']), + ("ocirc", &['\u{00F4}']), + ("oelig", &['\u{0153}']), + ("ograve", &['\u{00F2}']), + ("oline", &['\u{203E}']), + ("omega", &['\u{03C9}']), + ("omicron", &['\u{03BF}']), + ("oplus", &['\u{2295}']), + ("or", &['\u{2228}']), + ("ordf", &['\u{00AA}']), + ("ordm", &['\u{00BA}']), + ("oslash", &['\u{00F8}']), + ("otilde", &['\u{00F5}']), + ("otimes", &['\u{2297}']), + ("ouml", &['\u{00F6}']), + ("para", &['\u{00B6}']), + ("part", &['\u{2202}']), + ("permil", &['\u{2030}']), + ("perp", &['\u{22A5}']), + ("phi", &['\u{03C6}']), + ("pi", &['\u{03C0}']), + ("piv", &['\u{03D6}']), + ("plusmn", &['\u{00B1}']), + ("pound", &['\u{00A3}']), + ("prime", &['\u{2032}']), + ("prod", &['\u{220F}']), + ("prop", &['\u{221D}']), + ("psi", &['\u{03C8}']), + ("quot", &['"']), + ("rArr", &['\u{21D2}']), + ("radic", &['\u{221A}']), + ("rang", &['\u{232A}']), + ("raquo", &['\u{00BB}']), + ("rarr", &['\u{2192}']), + ("rceil", &['\u{2309}']), + ("rdquo", &['\u{201D}']), + ("real", &['\u{211C}']), + ("reg", &['\u{00AE}']), + ("rfloor", &['\u{230B}']), + ("rho", &['\u{03C1}']), + ("rlm", &['\u{200F}']), + ("rsaquo", &['\u{203A}']), + ("rsquo", &['\u{2019}']), + ("sbquo", &['\u{201A}']), + ("scaron", &['\u{0161}']), + ("sdot", &['\u{22C5}']), + ("sect", &['\u{00A7}']), + ("shy", &['\u{00AD}']), + ("sigma", &['\u{03C3}']), + ("sigmaf", &['\u{03C2}']), + ("sim", 
&['\u{223C}']), + ("spades", &['\u{2660}']), + ("sub", &['\u{2282}']), + ("sube", &['\u{2286}']), + ("sum", &['\u{2211}']), + ("sup", &['\u{2283}']), + ("sup1", &['\u{00B9}']), + ("sup2", &['\u{00B2}']), + ("sup3", &['\u{00B3}']), + ("supe", &['\u{2287}']), + ("szlig", &['\u{00DF}']), + ("tau", &['\u{03C4}']), + ("there4", &['\u{2234}']), + ("theta", &['\u{03B8}']), + ("thetasym", &['\u{03D1}']), + ("thinsp", &['\u{2009}']), + ("thorn", &['\u{00FE}']), + ("tilde", &['\u{02DC}']), + ("times", &['\u{00D7}']), + ("trade", &['\u{2122}']), + ("uArr", &['\u{21D1}']), + ("uacute", &['\u{00FA}']), + ("uarr", &['\u{2191}']), + ("ucirc", &['\u{00FB}']), + ("ugrave", &['\u{00F9}']), + ("uml", &['\u{00A8}']), + ("upsih", &['\u{03D2}']), + ("upsilon", &['\u{03C5}']), + ("uuml", &['\u{00FC}']), + ("weierp", &['\u{2118}']), + ("xi", &['\u{03BE}']), + ("yacute", &['\u{00FD}']), + ("yen", &['\u{00A5}']), + ("yuml", &['\u{00FF}']), + ("zeta", &['\u{03B6}']), + ("zwj", &['\u{200D}']), + ("zwnj", &['\u{200C}']), +]; + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_table_is_sorted() { + for window in ENTITIES.windows(2) { + assert!( + window[0].0 < window[1].0, + "entity table not sorted: {:?} >= {:?}", + window[0].0, + window[1].0, + ); + } + } + + #[test] + fn test_lookup_common_entities() { + assert_eq!(lookup("amp"), Some(&['&'][..])); + assert_eq!(lookup("lt"), Some(&['<'][..])); + assert_eq!(lookup("gt"), Some(&['>'][..])); + assert_eq!(lookup("quot"), Some(&['"'][..])); + assert_eq!(lookup("apos"), Some(&['\''][..])); + assert_eq!(lookup("nbsp"), Some(&['\u{00A0}'][..])); + } + + #[test] + fn test_lookup_unknown_returns_none() { + assert_eq!(lookup("nonexistent"), None); + assert_eq!(lookup(""), None); + assert_eq!(lookup("AMP"), None); // case-sensitive + } + + #[test] + fn test_lookup_greek_letters() { + assert_eq!(lookup("alpha"), Some(&['\u{03B1}'][..])); + assert_eq!(lookup("Alpha"), Some(&['\u{0391}'][..])); + assert_eq!(lookup("omega"), 
Some(&['\u{03C9}'][..])); + } + + #[test] + fn test_lookup_accented_chars() { + assert_eq!(lookup("eacute"), Some(&['\u{00E9}'][..])); + assert_eq!(lookup("Eacute"), Some(&['\u{00C9}'][..])); + assert_eq!(lookup("ntilde"), Some(&['\u{00F1}'][..])); + } + + #[test] + fn test_lookup_symbols() { + assert_eq!(lookup("euro"), Some(&['\u{20AC}'][..])); + assert_eq!(lookup("trade"), Some(&['\u{2122}'][..])); + assert_eq!(lookup("copy"), Some(&['\u{00A9}'][..])); + assert_eq!(lookup("reg"), Some(&['\u{00AE}'][..])); + } +} diff --git a/crates/ironhtml-parser/src/lib.rs b/crates/ironhtml-parser/src/lib.rs index 763afec..3d57118 100644 --- a/crates/ironhtml-parser/src/lib.rs +++ b/crates/ironhtml-parser/src/lib.rs @@ -40,6 +40,7 @@ extern crate std; extern crate alloc; mod dom; +mod entities; mod tokenizer; mod tree_builder; mod validator; diff --git a/crates/ironhtml-parser/src/tokenizer.rs b/crates/ironhtml-parser/src/tokenizer.rs index 3b27cc2..4bf64c5 100644 --- a/crates/ironhtml-parser/src/tokenizer.rs +++ b/crates/ironhtml-parser/src/tokenizer.rs @@ -10,6 +10,8 @@ use alloc::string::String; use alloc::vec::Vec; +use crate::entities; + /// A token produced by the tokenizer. #[derive(Debug, Clone, PartialEq, Eq)] pub enum Token { @@ -106,6 +108,147 @@ impl<'a> Tokenizer<'a> { self.chars.peek().map(|(_, c)| *c) } + /// Return the unconsumed portion of the input starting at the + /// current peek position. + fn remaining(&mut self) -> &'a str { + let offset = self.chars.peek().map_or(self.input.len(), |(i, _)| *i); + &self.input[offset..] + } + + /// Try to consume an HTML character reference starting *after* + /// the `&` that has already been consumed. + /// + /// Returns decoded characters on success, or `None` if the + /// sequence is not a valid reference (caller should emit `&` + /// literally). 
+ fn consume_character_reference(&mut self, in_attribute: bool) -> Option<&'static [char]> { + match self.peek() { + Some('#') => { + self.consume(); // '#' + self.consume_numeric_reference().map(core::slice::from_ref) + } + Some(c) if c.is_ascii_alphanumeric() => self.consume_named_reference(in_attribute), + _ => None, + } + } + + /// Consume a numeric character reference (`&#...;` or `&#x...;`). + /// The `#` has already been consumed. + fn consume_numeric_reference(&mut self) -> Option<&'static char> { + let hex = matches!(self.peek(), Some('x' | 'X')); + if hex { + self.consume(); // 'x' or 'X' + } + + let remaining = self.remaining(); + let mut len = 0; + for ch in remaining.chars() { + let valid = if hex { + ch.is_ascii_hexdigit() + } else { + ch.is_ascii_digit() + }; + if valid { + len += ch.len_utf8(); + } else { + break; + } + } + + if len == 0 { + return None; + } + + let digits = &remaining[..len]; + let codepoint = if hex { + u32::from_str_radix(digits, 16).ok()? + } else { + digits.parse::().ok()? + }; + + // Consume the digit characters + for _ in digits.chars() { + self.consume(); + } + + // Consume trailing ';' if present + if self.peek() == Some(';') { + self.consume(); + } + + // Per WHATWG: null → U+FFFD, surrogates/out-of-range → U+FFFD + let ch = if codepoint == 0 { + '\u{FFFD}' + } else { + char::from_u32(codepoint).unwrap_or('\u{FFFD}') + }; + + // Leak a single-char allocation so we can return &'static. + // This is a small, bounded cost: numeric refs are rare and + // the set of distinct codepoints encountered is finite. + Some(alloc::boxed::Box::leak(alloc::boxed::Box::new(ch))) + } + + /// Consume a named character reference. The first alphanumeric + /// character has NOT been consumed yet (it was only peeked). 
+    ///
+    /// Candidates are tried longest-first. A match that is not
+    /// terminated by `;` is accepted only for the legacy names WHATWG
+    /// allows without a semicolon; inside attribute values such a match
+    /// is additionally rejected when it is followed by `=` or an ASCII
+    /// alphanumeric. Nothing is consumed when `None` is returned.
+    fn consume_named_reference(&mut self, in_attribute: bool) -> Option<&'static [char]> {
+        let remaining = self.remaining();
+
+        // The candidate name is the leading run of ASCII alphanumerics;
+        // each is one byte, so byte offsets and char counts coincide.
+        let name_len = remaining
+            .bytes()
+            .take_while(u8::is_ascii_alphanumeric)
+            .count();
+
+        // Try the longest prefix first, then shrink one character at a time
+        for match_len in (1..=name_len).rev() {
+            let chars = match entities::lookup(&remaining[..match_len]) {
+                Some(chars) => chars,
+                None => continue,
+            };
+
+            let next = remaining.as_bytes().get(match_len).copied();
+            let has_semi = next == Some(b';');
+
+            if !has_semi {
+                // Only legacy names may drop the `;`. Without this check
+                // plain text such as `&pizza` would decode `&pi` and
+                // come out as `πzza`.
+                if !Self::may_omit_semicolon(chars) {
+                    continue;
+                }
+
+                // WHATWG: in attributes, if no ';' and next char
+                // is '=' or alphanumeric, don't decode
+                if in_attribute
+                    && matches!(next, Some(b'=' | b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9'))
+                {
+                    return None;
+                }
+            }
+
+            // Consume the matched name and the ';' if present
+            for _ in 0..match_len + usize::from(has_semi) {
+                self.consume();
+            }
+
+            return Some(chars);
+        }
+
+        None
+    }
+
+    /// Whether an entity with this replacement may be written without
+    /// its trailing `;`.
+    ///
+    /// WHATWG only permits that for the legacy names (`amp`, `lt`, `gt`,
+    /// `quot` and the Latin-1 set `nbsp`..`yuml`). For the names in the
+    /// `entities` table these are exactly the entries whose replacement
+    /// is `&`, `<`, `>`, `"` or a single char in U+00A0..=U+00FF; revisit
+    /// this if other names for those characters are added to the table.
+    fn may_omit_semicolon(replacement: &[char]) -> bool {
+        matches!(
+            replacement,
+            ['&' | '<' | '>' | '"' | '\u{00A0}'..='\u{00FF}']
+        )
+    }
+
 fn emit_current_tag(&mut self) -> Token { let name = core::mem::take(&mut self.current_tag_name).to_ascii_lowercase(); let attrs = core::mem::take(&mut self.current_attrs); @@ -144,6 +287,16 @@ impl<'a> Tokenizer<'a> { match self.state { State::Data => match self.consume() { Some('<') => self.state = State::TagOpen, + Some('&') => { + if let Some(chars) = self.consume_character_reference(false) { + let first = chars[0]; + for &ch in &chars[1..] 
{ + self.pending_tokens.push(Token::Character(ch)); + } + return Some(Token::Character(first)); + } + return Some(Token::Character('&')); + } Some(c) => return Some(Token::Character(c)), None => return Some(Token::Eof), }, @@ -298,6 +451,15 @@ impl<'a> Tokenizer<'a> { self.emit_current_attr(); self.state = State::AfterAttributeValueQuoted; } + Some('&') => { + if let Some(chars) = self.consume_character_reference(true) { + for &ch in chars { + self.current_attr_value.push(ch); + } + } else { + self.current_attr_value.push('&'); + } + } Some(c) => { self.current_attr_value.push(c); } @@ -313,6 +475,15 @@ impl<'a> Tokenizer<'a> { self.emit_current_attr(); self.state = State::AfterAttributeValueQuoted; } + Some('&') => { + if let Some(chars) = self.consume_character_reference(true) { + for &ch in chars { + self.current_attr_value.push(ch); + } + } else { + self.current_attr_value.push('&'); + } + } Some(c) => { self.current_attr_value.push(c); } @@ -335,6 +506,16 @@ impl<'a> Tokenizer<'a> { self.state = State::Data; return Some(self.emit_current_tag()); } + Some('&') => { + self.consume(); + if let Some(chars) = self.consume_character_reference(true) { + for &ch in chars { + self.current_attr_value.push(ch); + } + } else { + self.current_attr_value.push('&'); + } + } Some(c) => { self.consume(); self.current_attr_value.push(c); @@ -694,4 +875,180 @@ mod tests { }) ); } + + // ── character reference tests ──────────────────────────────────── + + #[test] + fn test_named_entity_in_text() { + let mut tokenizer = Tokenizer::new("a&b"); + assert_eq!(tokenizer.next(), Some(Token::Character('a'))); + assert_eq!(tokenizer.next(), Some(Token::Character('&'))); + assert_eq!(tokenizer.next(), Some(Token::Character('b'))); + } + + #[test] + fn test_named_entity_lt_gt() { + let tokens: Vec<_> = Tokenizer::new("<div>").collect(); + let chars: alloc::string::String = tokens + .iter() + .filter_map(|t| match t { + Token::Character(c) => Some(*c), + _ => None, + }) + .collect(); + 
assert_eq!(chars, "<div>
"); + } + + #[test] + fn test_numeric_decimal_entity() { + let mut tokenizer = Tokenizer::new("A"); + assert_eq!(tokenizer.next(), Some(Token::Character('A'))); + } + + #[test] + fn test_numeric_hex_entity() { + let mut tokenizer = Tokenizer::new("A"); + assert_eq!(tokenizer.next(), Some(Token::Character('A'))); + } + + #[test] + fn test_numeric_hex_uppercase() { + let mut tokenizer = Tokenizer::new("A"); + assert_eq!(tokenizer.next(), Some(Token::Character('A'))); + } + + #[test] + fn test_entity_without_semicolon() { + // & without ; should still decode (legacy compat) + let tokens: Vec<_> = Tokenizer::new("& ").collect(); + let chars: alloc::string::String = tokens + .iter() + .filter_map(|t| match t { + Token::Character(c) => Some(*c), + _ => None, + }) + .collect(); + assert_eq!(chars, "& "); + } + + #[test] + fn test_unknown_entity_passthrough() { + // &unknown; should pass through as literal text + let tokens: Vec<_> = Tokenizer::new("&unknown;").collect(); + let chars: alloc::string::String = tokens + .iter() + .filter_map(|t| match t { + Token::Character(c) => Some(*c), + _ => None, + }) + .collect(); + assert_eq!(chars, "&unknown;"); + } + + #[test] + fn test_ampersand_alone() { + let mut tokenizer = Tokenizer::new("&"); + assert_eq!(tokenizer.next(), Some(Token::Character('&'))); + assert_eq!(tokenizer.next(), None); + } + + #[test] + fn test_ampersand_followed_by_space() { + let mut tokenizer = Tokenizer::new("& "); + assert_eq!(tokenizer.next(), Some(Token::Character('&'))); + assert_eq!(tokenizer.next(), Some(Token::Character(' '))); + } + + #[test] + fn test_entity_in_double_quoted_attr() { + let mut tokenizer = Tokenizer::new(r#""#); + assert_eq!( + tokenizer.next(), + Some(Token::StartTag { + name: "a".into(), + attributes: vec![("href".into(), "?a=1&b=2".into()),], + self_closing: false, + }) + ); + } + + #[test] + fn test_entity_in_single_quoted_attr() { + let mut tokenizer = Tokenizer::new(""); + assert_eq!( + tokenizer.next(), + 
Some(Token::StartTag { + name: "a".into(), + attributes: vec![("href".into(), "?a=1&b=2".into()),], + self_closing: false, + }) + ); + } + + #[test] + fn test_entity_in_unquoted_attr() { + let mut tokenizer = Tokenizer::new(""); + assert_eq!( + tokenizer.next(), + Some(Token::StartTag { + name: "input".into(), + attributes: vec![("value".into(), "a&b".into()),], + self_closing: false, + }) + ); + } + + #[test] + fn test_numeric_entity_in_attr() { + let mut tokenizer = Tokenizer::new(r#""#); + assert_eq!( + tokenizer.next(), + Some(Token::StartTag { + name: "span".into(), + attributes: vec![("data-x".into(), "\u{00A9}".into()),], + self_closing: false, + }) + ); + } + + #[test] + fn test_attr_entity_no_semi_before_equals() { + // WHATWG: in attribute, ¬it= should NOT decode as ¬it= + // because next char after "not" is 'i' (alphanumeric) + let mut tokenizer = Tokenizer::new(r#""#); + assert_eq!( + tokenizer.next(), + Some(Token::StartTag { + name: "a".into(), + attributes: vec![("href".into(), "?notit=1".into()),], + self_closing: false, + }) + ); + } + + #[test] + fn test_numeric_zero_replacement() { + // � → U+FFFD replacement character (per WHATWG spec) + let mut tokenizer = Tokenizer::new("�"); + assert_eq!(tokenizer.next(), Some(Token::Character('\u{FFFD}'))); + } + + #[test] + fn test_nbsp_entity() { + let mut tokenizer = Tokenizer::new(" "); + assert_eq!(tokenizer.next(), Some(Token::Character('\u{00A0}'))); + } + + #[test] + fn test_multiple_entities() { + let tokens: Vec<_> = Tokenizer::new("<&>").collect(); + let chars: alloc::string::String = tokens + .iter() + .filter_map(|t| match t { + Token::Character(c) => Some(*c), + _ => None, + }) + .collect(); + assert_eq!(chars, "<&>"); + } } diff --git a/crates/ironhtml-parser/src/tree_builder.rs b/crates/ironhtml-parser/src/tree_builder.rs index 56b3bdd..64fa25a 100644 --- a/crates/ironhtml-parser/src/tree_builder.rs +++ b/crates/ironhtml-parser/src/tree_builder.rs @@ -902,4 +902,79 @@ mod tests { let p = 
body.find_element("p").unwrap(); assert_eq!(p.text_content(), Some("Hello".into())); } + + // ── entity decoding integration tests ──────────────────────────── + + #[test] + fn test_entity_in_text_content() { + let nodes = parse_fragment("

<p>&amp; &lt; &gt;</p>

"); + let p = nodes[0].as_element().unwrap(); + assert_eq!(p.text_content(), Some("& < >".into())); + } + + #[test] + fn test_entity_in_attribute_value() { + let nodes = parse_fragment(r#"
<a href="?a=1&amp;b=2">link</a>"#); + let a = nodes[0].as_element().unwrap(); + assert_eq!(a.get_attribute("href"), Some("?a=1&b=2")); + } + + #[test] + fn test_numeric_entity_in_text() { + let nodes = parse_fragment("

<p>&#65;&#66;&#67;</p>

"); + let p = nodes[0].as_element().unwrap(); + assert_eq!(p.text_content(), Some("ABC".into())); + } + + #[test] + fn test_hex_entity_in_text() { + let nodes = parse_fragment("

<p>&#x41;&#x42;&#x43;</p>

"); + let p = nodes[0].as_element().unwrap(); + assert_eq!(p.text_content(), Some("ABC".into())); + } + + #[test] + fn test_nbsp_entity_in_text() { + let nodes = parse_fragment("

<p>hello&nbsp;world</p>

"); + let p = nodes[0].as_element().unwrap(); + assert_eq!(p.text_content(), Some("hello\u{00A0}world".into())); + } + + #[test] + fn test_entity_mixed_with_tags() { + let nodes = parse_fragment("
<div>&lt;script&gt;alert(1)&lt;/script&gt;</div>
"); + let div = nodes[0].as_element().unwrap(); + assert_eq!(div.text_content(), Some("".into())); + } + + #[test] + fn test_unknown_entity_passthrough() { + let nodes = parse_fragment("

<p>&unknown;</p>

"); + let p = nodes[0].as_element().unwrap(); + assert_eq!(p.text_content(), Some("&unknown;".into())); + } + + #[test] + fn test_bare_ampersand_passthrough() { + let nodes = parse_fragment("

<p>A & B</p>

"); + let p = nodes[0].as_element().unwrap(); + assert_eq!(p.text_content(), Some("A & B".into())); + } + + #[test] + fn test_entity_copyright_symbol() { + let nodes = parse_fragment("

<p>&copy; 2024</p>

"); + let p = nodes[0].as_element().unwrap(); + assert_eq!(p.text_content(), Some("\u{00A9} 2024".into())); + } + + #[test] + fn test_entity_in_title() { + let doc = parse( + "\ + A & B\ + ", + ); + assert_eq!(doc.title(), Some(String::from("A & B"))); + } }