diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..f6e42b5 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,23 @@ +name: test + +on: + push: + branches: + - master + - main + pull_request: + +jobs: + test: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: erlef/setup-beam@v1 + with: + otp-version: "28" + gleam-version: "1.13.0" + rebar3-version: "3" + # elixir-version: "1" + - run: gleam deps download + - run: gleam test + - run: gleam format --check src test diff --git a/CHANGELOG.md b/CHANGELOG.md index 937bc90..20eed64 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,24 @@ All notable changes to this project are documented in this file. +## [1.2.3] - 2026-01-08 +### Changed +- Replaced `escape_html` implementation with `houdini.escape` for faster, + allocation-friendly HTML escaping. +- Replaced `unescape_html` with `odysseus.unescape` for comprehensive HTML + entity unescaping (named entities, numeric decimal and hex entities). +- Added dependencies: `houdini`, `odysseus`. + +### Tests +- Added tests for HTML escape/unescape and numeric entities (decimal and hex). + +Contributed by: Daniele (`lupodevelop`) +Suggested by: Louis Pilfold (`@lpil`) + +Suggested by: NNB (`@NNBnh`) +Suggested change: updated README logo pointer to use the raw.githubusercontent URL +(pointing to the repository commit) so the logo is resolvable on Hexdocs. + ## [1.2.2] - 2026-01-05 ### Added - Added internal helper `grapheme_len/1` (internal) to centralize grapheme cluster length computation and avoid repetitive `string.to_graphemes |> list.length` patterns. diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..750ed0e --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,34 @@ +# Contributing to str + +Thanks for helping! Short, practical guide. + +## Quick start +- Fork, create a branch: `git switch -c feat/your-change`. +- Run `gleam format` and `gleam test` locally. 
+- Open a PR against `main` with a short description and tests. + +## Setup +- Requirements: Gleam (see `gleam.toml`) + +Commands: +```bash +gleam format +gleam test +``` + +## Commits +Use brief prefixes: `feat:`, `fix:`, `chore:`, `test:`, `perf:`. +Example: `feat(display): add truncate_display` +No strict enforcement: use these prefixes as a guideline, not a hard rule. + +## PR checklist +- [ ] Tests added/updated +- [ ] `gleam format` & `gleam test` pass +- [ ] Update `CHANGELOG.md` if behaviour changes +- [ ] Document noteworthy changes in `README.md`, `docs/` or `examples/` + +## Deprecations +- Report breaking changes in an issue and add migration notes in PRs. See `DEPRECATIONS.md` if present. + +## Testing +- Add unit tests for edge cases (ZWJ, skin tones, combining marks, CJK, ambiguous widths). diff --git a/README.md b/README.md index 8f6b7a3..765b690 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@

- str logo + str logo

str

@@ -327,6 +327,8 @@ gleam test python3 scripts/generate_character_tables.py ``` +Note: as of **1.2.3**, `escape_html` now uses the `houdini` library for fast, allocation‑friendly escaping, and `unescape_html` uses `odysseus` for comprehensive entity support (named, decimal and hex numeric entities). See [CHANGELOG.md](CHANGELOG.md) for details. + --- ## 📊 Test Coverage diff --git a/gleam.toml b/gleam.toml index 7117b4f..80e9ec3 100644 --- a/gleam.toml +++ b/gleam.toml @@ -1,17 +1,18 @@ name = "str" -version = "1.2.2" +version = "1.2.3" # Project metadata (fill or replace placeholders before publishing) description = "Unicode-aware string utilities for Gleam: grapheme-safe operations, pragmatic ASCII transliteration, and slug generation." licenses = ["MIT"] repository = { type = "github", user = "lupodevelop", repo = "str" } -links = [{ title = "Repository", href = "https://github.com/lupodevelop/str" }] # For a full reference of all the available options, see: # https://gleam.run/writing-gleam/gleam-toml/ [dependencies] gleam_stdlib = ">= 0.44.0 and < 2.0.0" +houdini = ">= 1.0.0 and < 2.0.0" +odysseus = ">= 1.0.0 and < 2.0.0" [dev-dependencies] gleeunit = ">= 1.0.0 and < 2.0.0" diff --git a/manifest.toml b/manifest.toml index 61ab519..cf214d7 100644 --- a/manifest.toml +++ b/manifest.toml @@ -4,8 +4,12 @@ packages = [ { name = "gleam_stdlib", version = "0.65.0", build_tools = ["gleam"], requirements = [], otp_app = "gleam_stdlib", source = "hex", outer_checksum = "7C69C71D8C493AE11A5184828A77110EB05A7786EBF8B25B36A72F879C3EE107" }, { name = "gleeunit", version = "1.9.0", build_tools = ["gleam"], requirements = ["gleam_stdlib"], otp_app = "gleeunit", source = "hex", outer_checksum = "DA9553CE58B67924B3C631F96FE3370C49EB6D6DC6B384EC4862CC4AAA718F3C" }, + { name = "houdini", version = "1.2.0", build_tools = ["gleam"], requirements = [], otp_app = "houdini", source = "hex", outer_checksum = "5DB1053F1AF828049C2B206D4403C18970ABEF5C18671CA3C2D2ED0DD64F6385" }, + { name 
= "odysseus", version = "1.0.0", build_tools = ["gleam"], requirements = [], otp_app = "odysseus", source = "hex", outer_checksum = "6A97DA1075BDDEA8B60F47B1DFFAD49309FA27E73843F13A0AF32EA7087BA11C" }, ] [requirements] gleam_stdlib = { version = ">= 0.44.0 and < 2.0.0" } gleeunit = { version = ">= 1.0.0 and < 2.0.0" } +houdini = { version = ">= 1.0.0 and < 2.0.0" } +odysseus = { version = ">= 1.0.0 and < 2.0.0" } diff --git a/src/str/core.gleam b/src/str/core.gleam index ad7304a..ad07923 100644 --- a/src/str/core.gleam +++ b/src/str/core.gleam @@ -13,6 +13,8 @@ import gleam/dict import gleam/int import gleam/list import gleam/string +import houdini +import odysseus import str/config /// Detects if a grapheme cluster likely contains emoji components. @@ -1766,12 +1768,7 @@ pub fn is_hex(text: String) -> Bool { /// escape_html("Say \"hello\"") -> "Say "hello"" /// pub fn escape_html(text: String) -> String { - text - |> string.replace("&", "&") - |> string.replace("<", "<") - |> string.replace(">", ">") - |> string.replace("\"", """) - |> string.replace("'", "'") + houdini.escape(text) } /// Unescapes HTML entities to their character equivalents. @@ -1781,12 +1778,7 @@ pub fn escape_html(text: String) -> String { /// unescape_html("Tom & Jerry") -> "Tom & Jerry" /// pub fn unescape_html(text: String) -> String { - text - |> string.replace("'", "'") - |> string.replace(""", "\"") - |> string.replace(">", ">") - |> string.replace("<", "<") - |> string.replace("&", "&") + odysseus.unescape(text) } /// Escapes regex metacharacters so the string can be used as a literal pattern. diff --git a/test/str_html_escape_extended_test.gleam b/test/str_html_escape_extended_test.gleam new file mode 100644 index 0000000..0335224 --- /dev/null +++ b/test/str_html_escape_extended_test.gleam @@ -0,0 +1,64 @@ +import gleam/list +import gleeunit +import str + +pub fn main() -> Nil { + gleeunit.main() +} + +pub fn roundtrip_basic_entities_test() { + let cases = [ + "
<div>Hello</div>
", + "Tom & Jerry", + "Say \"hello\"", + "It's me", + "5 < 10 && 10 > 5", + "Ampersand: &", + ] + + list.fold(cases, True, fn(_, s) { + let escaped = str.escape_html(s) + let unescaped = str.unescape_html(escaped) + assert unescaped == s + True + }) +} + +pub fn numeric_and_named_entities_test() { + assert str.unescape_html("<>&''"") == "<>&''\"" + assert str.unescape_html("" and " and "") == "\" and \" and \"" + assert str.unescape_html("I like 'quotes'") == "I like 'quotes'" + assert str.unescape_html("Hex: '") == "Hex: '" +} + +pub fn malformed_and_unknown_entity_test() { + // Missing semicolon should remain unchanged + assert str.unescape_html("This & is broken") == "This & is broken" + + // Unknown entity should remain unchanged + assert str.unescape_html("This ¬anentity; remains") + == "This ¬anentity; remains" +} + +pub fn combined_and_adjacent_entities_test() { + assert str.unescape_html("<< >>") == "<< >>" + assert str.unescape_html("&&&") == "&&&" +} + +pub fn unicode_and_emoji_roundtrip_test() { + let s = "Café — ️👩‍👩‍👧‍👦 \u{00A0}" + let escaped = str.escape_html(s) + // Expect unescape to restore the original (escape may not change emoji/nbspace) + assert str.unescape_html(escaped) == s +} + +pub fn idempotence_and_double_escape_test() { + let s = "&" + let once = str.escape_html(s) + let twice = str.escape_html(once) + assert once == "&" + assert twice == "&amp;" + // unescape decodes one level: "&amp;" -> "&"; double unescape restores original + assert str.unescape_html(twice) == "&" + assert str.unescape_html(str.unescape_html(twice)) == s +} diff --git a/test/str_html_escape_fuzz_test.gleam b/test/str_html_escape_fuzz_test.gleam new file mode 100644 index 0000000..5debba8 --- /dev/null +++ b/test/str_html_escape_fuzz_test.gleam @@ -0,0 +1,70 @@ +import gleeunit +import str +import gleam/list +import gleam/string + +pub fn main() -> Nil { + gleeunit.main() +} + +// Deterministic, simple generator over a token pool. 
+fn gen_token_pool() -> List(String) { + [ + "a","b","c","1","2","3"," ","\n","<",">","&","\"","'", + "&","<",">",""","'","'",""","¬anentity;", + "&","&","&#", "&#x", + "\u{00A0}", // NBSP + "Café","naïve","ø","漢","字", + "👩‍👩‍👧‍👦","👨‍👩‍👧","️","✈️","🏳️‍🌈", + "\u{0301}", // combining acute + "α","β","γ" + ] +} + +// Deterministic pseudo-random index using seed and i +fn idx_for(seed: Int, i: Int, len: Int) -> Int { + // simple LCG-ish formula; keep small to avoid large-int overhead + let v = seed * 1103515245 + 12345 + i + let v_pos = case v < 0 { True -> -v False -> v } + v_pos % len +} + +fn gen_string(seed: Int, tokens: List(String), n: Int) -> String { + let len = list.length(tokens) + let seq = list.range(0, n - 1) + seq + |> list.map(fn(i) { + let j = idx_for(seed, i, len) + case list.drop(tokens, j) { + [first, ..] -> first + [] -> "" + } + }) + |> list.fold("", fn(acc, s) { acc <> s }) +} + +fn run_cfg(seed: Int, n: Int, tokens: List(String)) -> Bool { + let s = gen_string(seed, tokens, n) + // Roundtrip: unescape(escape(s)) == s + let escaped = str.escape_html(s) + let unescaped = str.unescape_html(escaped) + assert unescaped == s + + // Escaped string must not contain raw angle brackets or quotes + assert string.contains(escaped, "<") == False + assert string.contains(escaped, ">") == False + assert string.contains(escaped, "\"") == False + assert string.contains(escaped, "'") == False + + True +} + +pub fn fuzz_roundtrip_test() { + let tokens = gen_token_pool() + + run_cfg(1, 20, tokens) + run_cfg(42, 50, tokens) + run_cfg(123, 200, tokens) + + True +} diff --git a/test/str_html_escape_test.gleam b/test/str_html_escape_test.gleam new file mode 100644 index 0000000..ae1c9cb --- /dev/null +++ b/test/str_html_escape_test.gleam @@ -0,0 +1,31 @@ +import str + +pub fn escape_basic_test() { + assert str.escape_html("
<div>Hello</div>") == "&lt;div&gt;Hello&lt;/div&gt;" + assert str.escape_html("Tom & Jerry") == "Tom &amp; Jerry" + assert str.escape_html("Say \"hello\"") == "Say &quot;hello&quot;" +} + +pub fn unescape_basic_test() { + assert str.unescape_html("&lt;div&gt;") == "<div>" + assert str.unescape_html("Tom &amp; Jerry") == "Tom & Jerry" + assert str.unescape_html("Say &quot;hello&quot;") == "Say \"hello\"" + assert str.unescape_html("It&#39;s me") == "It's me" +} + +pub fn roundtrip_test() { + let s = "Hello & < > \"" + let escaped = str.escape_html(s) + assert str.unescape_html(escaped) == s +} + +pub fn numeric_entities_test() { + // Decimal numeric entity + assert str.unescape_html("I like &#39;quotes&#39;") == "I like 'quotes'" + + // Hex numeric entity + assert str.unescape_html("Hex: &#x27;") == "Hex: '" + + // Double quote numeric and hex + assert str.unescape_html("&quot; and &#34; and &#x22;") == "\" and \" and \"" +}