From c1887729cf7b12bc02ed1bd572731a16edda9ce0 Mon Sep 17 00:00:00 2001 From: Daniele Date: Sat, 28 Feb 2026 09:54:45 +0100 Subject: [PATCH] Bump 2.0.1: fixes, docs & API cleanup Release bump to 2.0.1 with bug fixes, internal refactors, docs/examples updates, and tests. Changes: - Bumped package version to 2.0.1 and added changelog entry. - Fix truncate behavior: do not append suffix when text already fits within max_len. - Make starts_with_any / ends_with_any grapheme-aware (use public starts_with/ends_with). - Correct center docs to indicate right-biased padding when uneven. - Simplify find_emoji_index API (remove unused start parameter) and tighten its loop. - Replace repeat_str implementation to a direct recursive loop to avoid intermediate list allocation. - Update examples and docs to use the public `str` API (and `str/advanced` where applicable) and to reflect internal module path changes (src/str/internal/*). - Update tokenizer docs to reference public `str.chars` / `str.chars_stdlib` and mark internal tokenizer module. - Add regression tests for truncate noop, grapheme-aware starts/ends_with_any, and center right-bias. Other: various documentation cleanups and example callsite adjustments to match the 2.0 public API. --- CHANGELOG.md | 25 ++++ EXAMPLES.md | 234 +++++++++++++++++++----------------- docs/str_core.md | 9 +- docs/str_extra.md | 33 ++--- docs/str_tokenize.md | 25 ++-- gleam.toml | 2 +- src/str/internal/core.gleam | 36 +++--- test/str_core_test.gleam | 35 ++++++ 8 files changed, 240 insertions(+), 159 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f8ad8c8..d5f2dab 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,31 @@ All notable changes to this project are documented in this file. +## [2.0.1] - 2026-02-28 + +### Fixed + +- **`truncate_with_flag`**: text that already fits within `max_len` was still being truncated when `text_len + suffix_len > max_len`. 
For example `truncate("Hello!", 8, "...")` returned `"Hello..."` instead of `"Hello!"`. The suffix is now only appended when the text is actually cut. +- **`starts_with_any` / `ends_with_any`**: used the stdlib's byte-level `string.starts_with`/`string.ends_with` instead of the library's grapheme-aware versions, causing incorrect matches on strings with combining marks (e.g. `"e\u{0301}"` would wrongly match prefix `"e"`). +- **`center` doc comment**: stated "left side receives more" but the implementation gives more padding to the right side. Comment corrected. + +### Changed + +- **`find_emoji_index`**: removed unused `start` parameter that was always passed as `0`, simplified the internal loop. +- **`repeat_str`**: replaced `list.range` + `list.fold` with a direct recursive loop to avoid an intermediate list allocation. + +### Tests + +- Added regression tests for the `truncate` noop case, grapheme-aware `starts_with_any`/`ends_with_any`, and `center` right-bias. + +### Documentation + +- Updated examples and documentation to reflect the 2.0 public API and clean up outdated snippets. + +Contributed by: Daniele (`lupodevelop`) + +--- + ## [2.0.0] - 2026-01-24 ### Major Release โ€” Unified API diff --git a/EXAMPLES.md b/EXAMPLES.md index e6fe15f..626b8ff 100644 --- a/EXAMPLES.md +++ b/EXAMPLES.md @@ -10,53 +10,54 @@ interop should live in the *integrating application* (not in `src/str/*`). ### Grapheme-Aware Indexing and Search ```gleam -import str/core +import str pub fn search_examples() { // Find first occurrence (grapheme-aware!) - let idx = core.index_of("Hello ๐Ÿ‘จโ€๐Ÿ‘ฉโ€๐Ÿ‘งโ€๐Ÿ‘ฆ World", "World") + let idx = str.index_of("Hello ๐Ÿ‘จโ€๐Ÿ‘ฉโ€๐Ÿ‘งโ€๐Ÿ‘ฆ World", "World") // Ok(8) - the emoji is ONE grapheme cluster! 
// Find last occurrence - let last = core.last_index_of("hello hello hello", "hello") + let last = str.last_index_of("hello hello hello", "hello") // Ok(12) // Check for multiple needles - let has_any = core.contains_any("hello world", ["foo", "world"]) + let has_any = str.contains_any("hello world", ["foo", "world"]) // True - let has_all = core.contains_all("hello world", ["hello", "world"]) + let has_all = str.contains_all("hello world", ["hello", "world"]) // True } ``` -### Experimental Search Strategies & Caching (1.2.2) +### Experimental Search Strategies & Caching ```gleam -import str/core +import str +import str/advanced pub fn search_strategy_examples() { // 1) Use the automatic heuristic (experimental) // The heuristic chooses between a sliding matcher and KMP based on // pattern/text characteristics. It is opt-in and may choose a // non-optimal strategy in some cases. - let auto = core.index_of_auto("some long text...", "pat") + let auto = str.index_of_auto("some long text...", "pat") // 2) Force a specific strategy: use this when performance is critical // and you know which algorithm is better for your input shape. - let forced_kmp = core.index_of_strategy("long text...", "pattern", core.Kmp) - let forced_sliding = core.index_of_strategy("short text", "pat", core.Sliding) + let forced_kmp = str.index_of_strategy("long text...", "pattern", str.Kmp) + let forced_sliding = str.index_of_strategy("short text", "pat", str.Sliding) // 3) Caching KMP maps: precompute pattern maps once and reuse them // across multiple searches to avoid rebuilding prefix tables. let pattern = "abababab..." 
- let maps = core.build_kmp_maps(pattern) + let maps = advanced.build_kmp_maps(pattern) let pmap = maps.0 let pimap = maps.1 // Reuse maps across many texts - let idx1 = core.kmp_index_of_with_maps("first long text...", pattern, pmap, pimap) - let occurrences = core.kmp_search_all_with_maps("another text...", pmap, pimap) + let idx1 = advanced.kmp_index_of_with_maps("first long text...", pattern, pmap, pimap) + let occurrences = advanced.kmp_search_all_with_maps("another text...", pmap, pimap) // Guidance: prefer explicit strategy or caching in hot loops; use // `index_of_auto` for convenience and exploratory testing. @@ -67,74 +68,74 @@ pub fn search_strategy_examples() { > thresholds in `src/str/config.gleam`. For production-critical paths, > prefer `index_of_strategy` or precomputing maps via `build_kmp_maps`. -### Grapheme-Aware Length and String Checks (NEW in 1.1.0) +### Grapheme-Aware Length and String Checks ```gleam -import str/core +import str pub fn length_examples() { - // Grapheme-aware length (NEW in 1.1.0) + // Grapheme-aware length // Unlike standard string length, counts grapheme clusters correctly - let len = core.length("Hello") + let len = str.length("Hello") // 5 // Family emoji is a SINGLE grapheme cluster - let emoji_len = core.length("๐Ÿ‘จโ€๐Ÿ‘ฉโ€๐Ÿ‘งโ€๐Ÿ‘ฆ") + let emoji_len = str.length("๐Ÿ‘จโ€๐Ÿ‘ฉโ€๐Ÿ‘งโ€๐Ÿ‘ฆ") // 1 // Flag is also a single grapheme - let flag_len = core.length("๐Ÿ‡ฎ๐Ÿ‡น") + let flag_len = str.length("๐Ÿ‡ฎ๐Ÿ‡น") // 1 // Combining characters stay attached - let cafe_len = core.length("cafรฉ") + let cafe_len = str.length("cafรฉ") // 4 (even with combining accent) } pub fn contains_examples() { - // Grapheme-aware contains (NEW in 1.1.0) - let found = core.contains("hello world", "world") + // Grapheme-aware contains + let found = str.contains("hello world", "world") // True - let not_found = core.contains("hello", "x") + let not_found = str.contains("hello", "x") // False // Works correctly with emoji - let emoji_found 
= core.contains("๐Ÿ‘จโ€๐Ÿ‘ฉโ€๐Ÿ‘งโ€๐Ÿ‘ฆ family", "๐Ÿ‘จโ€๐Ÿ‘ฉโ€๐Ÿ‘งโ€๐Ÿ‘ฆ") + let emoji_found = str.contains("๐Ÿ‘จโ€๐Ÿ‘ฉโ€๐Ÿ‘งโ€๐Ÿ‘ฆ family", "๐Ÿ‘จโ€๐Ÿ‘ฉโ€๐Ÿ‘งโ€๐Ÿ‘ฆ") // True } pub fn prefix_suffix_examples() { - // Grapheme-aware starts_with (NEW in 1.1.0) - let starts = core.starts_with("hello", "he") + // Grapheme-aware starts_with + let starts = str.starts_with("hello", "he") // True // Empty prefix always matches - let empty_prefix = core.starts_with("hello", "") + let empty_prefix = str.starts_with("hello", "") // True // Works with emoji on grapheme boundaries - let emoji_starts = core.starts_with("๐Ÿ‘จโ€๐Ÿ‘ฉโ€๐Ÿ‘งโ€๐Ÿ‘ฆabc", "๐Ÿ‘จโ€๐Ÿ‘ฉโ€๐Ÿ‘งโ€๐Ÿ‘ฆ") + let emoji_starts = str.starts_with("๐Ÿ‘จโ€๐Ÿ‘ฉโ€๐Ÿ‘งโ€๐Ÿ‘ฆabc", "๐Ÿ‘จโ€๐Ÿ‘ฉโ€๐Ÿ‘งโ€๐Ÿ‘ฆ") // True - // Grapheme-aware ends_with (NEW in 1.1.0) - let ends = core.ends_with("hello.txt", ".txt") + // Grapheme-aware ends_with + let ends = str.ends_with("hello.txt", ".txt") // True - let emoji_ends = core.ends_with("abc๐Ÿ‘จโ€๐Ÿ‘ฉโ€๐Ÿ‘งโ€๐Ÿ‘ฆ", "๐Ÿ‘จโ€๐Ÿ‘ฉโ€๐Ÿ‘งโ€๐Ÿ‘ฆ") + let emoji_ends = str.ends_with("abc๐Ÿ‘จโ€๐Ÿ‘ฉโ€๐Ÿ‘งโ€๐Ÿ‘ฆ", "๐Ÿ‘จโ€๐Ÿ‘ฉโ€๐Ÿ‘งโ€๐Ÿ‘ฆ") // True } pub fn empty_check_examples() { - // is_empty check (NEW in 1.1.0) - let empty = core.is_empty("") + // is_empty check + let empty = str.is_empty("") // True - let not_empty = core.is_empty(" ") + let not_empty = str.is_empty(" ") // False (whitespace is not empty) // Combine with is_blank for whitespace check - let blank = core.is_blank(" ") + let blank = str.is_blank(" ") // True } ``` @@ -142,15 +143,15 @@ pub fn empty_check_examples() { ### Replace First/Last Occurrence ```gleam -import str/core +import str pub fn replace_examples() { // Replace only first occurrence (stdlib only has replace all) let text = "hello hello hello" - let first = core.replace_first(text, "hello", "hi") + let first = str.replace_first(text, "hello", "hi") // "hi hello hello" - let last = core.replace_last(text, "hello", "bye") + let last 
= str.replace_last(text, "hello", "bye") // "hello hello bye" } ``` @@ -158,17 +159,17 @@ pub fn replace_examples() { ### HTML Escaping for Web Applications ```gleam -import str/core +import str pub fn html_examples() { // Escape user input before rendering let user_input = "" - let safe = core.escape_html(user_input) + let safe = str.escape_html(user_input) // "<script>alert('xss')</script>" // Unescape for display let escaped = "<div>Hello</div>" - let original = core.unescape_html(escaped) + let original = str.unescape_html(escaped) // "
<div>Hello</div>
" } ``` @@ -176,48 +177,48 @@ pub fn html_examples() { ### String Validation ```gleam -import str/core +import str pub fn validation_examples() { // Case validation (ignores non-letter characters) - assert core.is_uppercase("HELLO123") == True - assert core.is_lowercase("hello_world") == True + assert str.is_uppercase("HELLO123") == True + assert str.is_lowercase("hello_world") == True - // Title Case validation (NEW in 1.0.0) - assert core.is_title_case("Hello World") == True - assert core.is_title_case("hello World") == False - assert core.is_title_case("Hello 123 World") == True // numbers ignored + // Title Case validation + assert str.is_title_case("Hello World") == True + assert str.is_title_case("hello World") == False + assert str.is_title_case("Hello 123 World") == True // numbers ignored // ASCII validation - assert core.is_ascii("hello!@#") == True - assert core.is_ascii("cafรฉ") == False + assert str.is_ascii("hello!@#") == True + assert str.is_ascii("cafรฉ") == False // Hex validation (useful for color codes, UUIDs, etc.) 
- assert core.is_hex("DEADBEEF") == True - assert core.is_hex("ff00ff") == True + assert str.is_hex("DEADBEEF") == True + assert str.is_hex("ff00ff") == True // Printable check (no control characters) - assert core.is_printable("hello") == True - assert core.is_printable("hello\n") == False + assert str.is_printable("hello") == True + assert str.is_printable("hello\n") == False } ``` ### String Similarity and Distance ```gleam -import str/core +import str pub fn similarity_examples() { // Levenshtein distance (edit operations needed) - let dist = core.distance("kitten", "sitting") + let dist = str.distance("kitten", "sitting") // 3 // Similarity as percentage (0.0 to 1.0) - let sim = core.similarity("hello", "hallo") + let sim = str.similarity("hello", "hallo") // 0.8 (80% similar) // Hamming distance (same length strings only) - let ham = core.hamming_distance("karolin", "kathrin") + let ham = str.hamming_distance("karolin", "kathrin") // Ok(3) } ``` @@ -225,135 +226,135 @@ pub fn similarity_examples() { ### Take/Drop from Right ```gleam -import str/core +import str pub fn take_drop_examples() { // Get last N graphemes - let last3 = core.take_right("hello world", 3) + let last3 = str.take_right("hello world", 3) // "rld" // Drop last N graphemes - let without_ext = core.drop_right("file.txt", 4) + let without_ext = str.drop_right("file.txt", 4) // "file" // Works with emoji too! 
- let emoji_end = core.take_right("Hello ๐Ÿ‘‹๐Ÿฝ", 1) + let emoji_end = str.take_right("Hello ๐Ÿ‘‹๐Ÿฝ", 1) // "๐Ÿ‘‹๐Ÿฝ" (single grapheme cluster with skin tone) } ``` -### Capitalize and Case Manipulation (NEW in 1.0.0) +### Capitalize and Case Manipulation ```gleam -import str/core +import str pub fn capitalize_examples() { // Capitalize: first letter uppercase, rest lowercase - let text = core.capitalize("hELLO wORLD") + let text = str.capitalize("hELLO wORLD") // "Hello world" // Swap case - let swapped = core.swapcase("Hello World") + let swapped = str.swapcase("Hello World") // "hELLO wORLD" } ``` -### Partition and Split (NEW in 1.0.0) +### Partition and Split ```gleam -import str/core +import str pub fn partition_examples() { // Partition from first occurrence - let #(before, sep, after) = core.partition("a-b-c", "-") + let #(before, sep, after) = str.partition("a-b-c", "-") // #("a", "-", "b-c") - // Partition from LAST occurrence (rpartition - NEW!) + // Partition from LAST occurrence // Note: if not found, returns #("", "", text) like Python - let #(before2, sep2, after2) = core.rpartition("a-b-c", "-") + let #(before2, sep2, after2) = str.rpartition("a-b-c", "-") // #("a-b", "-", "c") - // Split with max parts limit (splitn - NEW!) - let parts = core.splitn("one-two-three-four", "-", 2) + // Split with max parts limit + let parts = str.splitn("one-two-three-four", "-", 2) // ["one", "two-three-four"] - let parts3 = core.splitn("a:b:c:d", ":", 3) + let parts3 = str.splitn("a:b:c:d", ":", 3) // ["a", "b", "c:d"] } ``` -### Padding and Filling (NEW in 1.0.0) +### Padding and Filling ```gleam -import str/core +import str pub fn padding_examples() { // Standard padding - let padded = core.pad_left("42", 5, "0") + let padded = str.pad_left("42", 5, "0") // "00042" - // Flexible fill with position type (NEW!) 
- let left_fill = core.fill("x", 5, "-", core.Left) + // Flexible fill with position type + let left_fill = str.fill("x", 5, "-", str.Left) // "----x" - let right_fill = core.fill("x", 5, "-", core.Right) + let right_fill = str.fill("x", 5, "-", str.Right) // "x----" - let center_fill = core.fill("x", 5, "-", core.Both) + let center_fill = str.fill("x", 5, "-", str.Both) // "--x--" } ``` -### Chunking Strings (NEW in 1.0.0) +### Chunking Strings ```gleam -import str/core +import str pub fn chunk_examples() { // Split into fixed-size chunks - let chunks = core.chunk("abcdefg", 3) + let chunks = str.chunk("abcdefg", 3) // ["abc", "def", "g"] - let pairs = core.chunk("abcdef", 2) + let pairs = str.chunk("abcdef", 2) // ["ab", "cd", "ef"] // Works with emoji (grapheme-aware!) - let emoji_chunks = core.chunk("๐Ÿ‘จโ€๐Ÿ‘ฉโ€๐Ÿ‘งโ€๐Ÿ‘ฆab", 2) + let emoji_chunks = str.chunk("๐Ÿ‘จโ€๐Ÿ‘ฉโ€๐Ÿ‘งโ€๐Ÿ‘ฆab", 2) // ["๐Ÿ‘จโ€๐Ÿ‘ฉโ€๐Ÿ‘งโ€๐Ÿ‘ฆa", "b"] } ``` -### Prefix/Suffix Checking (NEW in 1.0.0) +### Prefix/Suffix Checking ```gleam -import str/core +import str pub fn prefix_suffix_examples() { - // Check multiple prefixes at once (starts_with_any - NEW!) - let is_greeting = core.starts_with_any("hello world", ["hi", "hello", "hey"]) + // Check multiple prefixes at once + let is_greeting = str.starts_with_any("hello world", ["hi", "hello", "hey"]) // True - // Check multiple suffixes at once (ends_with_any - NEW!) 
- let is_image = core.ends_with_any("photo.png", [".jpg", ".png", ".gif"]) + // Check multiple suffixes at once + let is_image = str.ends_with_any("photo.png", [".jpg", ".png", ".gif"]) // True - let is_code = core.ends_with_any("main.gleam", [".gleam", ".erl", ".ex"]) + let is_code = str.ends_with_any("main.gleam", [".gleam", ".erl", ".ex"]) // True } ``` -### Whitespace Normalization (NEW in 1.0.0) +### Whitespace Normalization ```gleam -import str/core +import str pub fn whitespace_examples() { // Collapse all whitespace to single spaces - let normalized = core.normalize_whitespace(" hello world \n\t test ") + let normalized = str.normalize_whitespace(" hello world \n\t test ") // "hello world test" // Great for cleaning user input - let clean = core.normalize_whitespace(" John Doe ") + let clean = str.normalize_whitespace(" John Doe ") // "John Doe" } ``` @@ -361,19 +362,19 @@ pub fn whitespace_examples() { ### Text Utilities ```gleam -import str/core +import str pub fn utility_examples() { // Reverse word order - let reversed = core.reverse_words("hello beautiful world") + let reversed = str.reverse_words("hello beautiful world") // "world beautiful hello" // Extract initials - let init = core.initials("John Fitzgerald Kennedy") + let init = str.initials("John Fitzgerald Kennedy") // "JFK" // Regex escaping for pattern matching - let pattern = core.escape_regex("hello.world[test]") + let pattern = str.escape_regex("hello.world[test]") // "hello\\.world\\[test\\]" } ``` @@ -395,8 +396,8 @@ pub fn otp_nfd(s: String) -> String { } // Use it when calling into `str`: -let folded = str::extra::ascii_fold_with_normalizer("Crรจme Brรปlรฉe", otp_nfd) -let slug = str::extra::slugify_opts_with_normalizer("Crรจme Brรปlรฉe", 0, "-", False, otp_nfd) +let folded = str.ascii_fold_with_normalizer("Crรจme Brรปlรฉe", otp_nfd) +let slug = str.slugify_opts_with_normalizer("Crรจme Brรปlรฉe", 0, "-", False, otp_nfd) ``` Notes: @@ -412,7 +413,7 @@ A short wrapper is available for 
convenience. Example usage: ```gleam // short alias: uses default separator `-` and no token limit let s = "Cafรฉ โค๏ธ Gleam" -let slug = str::extra::slugify_with_normalizer(s, otp_nfd) +let slug = str.slugify_with_normalizer(s, otp_nfd) ``` ## 3) No-decompose variants @@ -421,7 +422,7 @@ If you prefer not to run the library's limited Latin decomposer you can call the `_no_decompose_` variants and still pass a normalizer: ```gleam -let folded = str::extra::ascii_fold_no_decompose_with_normalizer(s, otp_nfd) +let folded = str.ascii_fold_no_decompose_with_normalizer(s, otp_nfd) ``` This gives you full control over decomposition/normalization order. @@ -440,8 +441,8 @@ cd /path/to/project && gleam test ## 5) Regenerating character tables (docs) -If you extend `src/str/internal_translit.gleam` or -`src/str/internal_decompose.gleam`, regenerate the JSON used by the +If you extend `src/str/internal/translit.gleam` or +`src/str/internal/decompose.gleam`, regenerate the JSON used by the docs: ```sh @@ -454,11 +455,11 @@ In tests it's handy to simulate NFD/NFC without OTP. Example: ```gleam let fake_nfd = fn(x) { string.replace(x, "รฉ", "e\u{0301}") } -let slug = str::extra::slugify_opts_with_normalizer("Cafรฉ", 0, "-", False, fake_nfd) +let slug = str.slugify_opts_with_normalizer("Cafรฉ", 0, "-", False, fake_nfd) assert slug == "cafe" ``` -## 8) Where to put NFC/NFD helpers (application-side) +## 7) Where to put NFC/NFD helpers (application-side) If you want to provide explicit `nfc`/`nfd` helpers that call OTP, put them in your application (not in the `str` library). Example (commented): @@ -474,7 +475,18 @@ them in your application (not in the `str` library). Example (commented): // } ``` -## 7) Tokenization reference +## 8) Tokenization reference If you need a pure-Gleam tokenizer for special processing, see -`src/str/tokenize.gleam` which provides a pedagogic reference implementation. 
+`src/str/internal/tokenize.gleam` which provides a pedagogic reference +implementation. Access the tokenizer via the public API: + +```gleam +import str + +let clusters = str.chars("cafรฉ") +// -> ["c", "a", "f", "รฉ"] + +let stdlib_clusters = str.chars_stdlib("cafรฉ") +// -> ["c", "a", "f", "รฉ"] +``` diff --git a/docs/str_core.md b/docs/str_core.md index ad96022..5a6a860 100644 --- a/docs/str_core.md +++ b/docs/str_core.md @@ -1,8 +1,11 @@ -# str/core โ€” Grapheme-Aware Core Utilities +# str โ€” Grapheme-Aware Core Utilities + +> **Note (2.0+):** The `str/core` module is now internal (`str/internal/core`). All functions +> documented here are available via `import str`. Use `str.function_name()` in your code. ## Overview -The `str/core` module provides fundamental string operations that correctly handle Unicode grapheme clusters, including: +The core of `str` provides fundamental string operations that correctly handle Unicode grapheme clusters, including: - Complex emoji sequences (ZWJ, skin tones, flags) - Combining character sequences (diacritics, accents) @@ -133,7 +136,7 @@ Pads text on the right. #### `center(text: String, width: Int, pad: String) -> String` -Centers text within the specified width (left-biased when uneven). +Centers text within the specified width (right-biased when uneven: extra padding goes to the right). **Example**: ```gleam diff --git a/docs/str_extra.md b/docs/str_extra.md index 243ee0d..cac484e 100644 --- a/docs/str_extra.md +++ b/docs/str_extra.md @@ -1,8 +1,11 @@ -# str/extra โ€” ASCII Transliteration and Slug Generation +# str โ€” ASCII Transliteration and Slug Generation + +> **Note (2.0+):** The `str/extra` module is now internal (`str/internal/extra`). All functions +> documented here are available via `import str`. Use `str.function_name()` in your code. 
## Overview -The `str/extra` module provides practical utilities for: +The `str` library provides practical utilities for: - Converting Unicode text to ASCII equivalents - Generating URL-friendly slugs @@ -240,26 +243,26 @@ The internal Latin decomposer handles: **Note**: Coverage is optimized for Western European languages. For comprehensive Unicode support, use an external transliteration library. -## Production Usage +### Production Usage ### Basic Slug Generation ```gleam -import str/extra +import str pub fn create_post_slug(title: String) -> String { - extra.slugify(title) + str.slugify(title) } ``` ### With OTP Normalization ```gleam -import str/extra +import str import unicode_helpers pub fn create_url_slug(title: String, max_words: Int) -> String { - extra.slugify_opts_with_normalizer( + str.slugify_opts_with_normalizer( title, max_words, "-", @@ -272,26 +275,26 @@ pub fn create_url_slug(title: String, max_words: Int) -> String { ### File Name Sanitization ```gleam -import str/extra +import str pub fn sanitize_filename(name: String) -> String { name - |> extra.ascii_fold() - |> extra.slugify_opts(0, "_", False) + |> str.ascii_fold() + |> str.slugify_opts(0, "_", False) } ``` ### Identifier Generation ```gleam -import str/extra +import str pub fn to_variable_name(text: String) -> String { - extra.to_snake_case(text) + str.to_snake_case(text) } pub fn to_function_name(text: String) -> String { - extra.to_camel_case(text) + str.to_camel_case(text) } ``` @@ -306,14 +309,14 @@ pub fn to_function_name(text: String) -> String { To add custom replacements: -1. Edit `src/str/internal_translit.gleam` +1. Edit `src/str/internal/translit.gleam` 2. Add entries to the `replacements()` table 3. Regenerate documentation: `python3 scripts/generate_character_tables.py` 4. 
Test with real-world examples ## See Also -- [str/core](str_core.md) โ€” Grapheme-aware core utilities +- [str](str_core.md) โ€” Grapheme-aware core utilities - [OTP Integration Guide](../examples/with_otp.md) โ€” Unicode normalization setup - [Examples](../EXAMPLES.md) โ€” Integration patterns - [Character Tables](character_tables.json) โ€” Machine-readable replacement data diff --git a/docs/str_tokenize.md b/docs/str_tokenize.md index 2200e3b..2881e68 100644 --- a/docs/str_tokenize.md +++ b/docs/str_tokenize.md @@ -1,16 +1,16 @@ **Tokenizer** - **Description:** The `str` library exposes two tokenizer functions for extracting grapheme clusters from text: - - `tokenize.chars/1`: an experimental pure-Gleam implementation that approximates grapheme segmentation. - - `tokenize.chars_stdlib/1`: a thin wrapper over the BEAM stdlib `string.to_graphemes/1` and the recommended choice for production. + - `str.chars/1`: an experimental pure-Gleam implementation that approximates grapheme segmentation. + - `str.chars_stdlib/1`: a thin wrapper over the BEAM stdlib `string.to_graphemes/1` and the recommended choice for production. - **When to use which:** - **`chars_stdlib/1` (recommended):** Use in production code. It uses the BEAM runtime's grapheme segmentation, is more accurate for edge cases (UAX #29) and typically faster. - **`chars/1` (experimental):** Useful when you want a self-contained, pure-Gleam implementation (for debugging, learning, or portability guarantees within Gleam code). It approximates common grapheme rules (combining marks, variation selectors, skin tones, ZWJ sequences) but may differ on rare or exotic sequences. - **Examples:** - - `tokenize.chars("cafรฉ")` -> `["c", "a", "f", "รฉ"]` - - `tokenize.chars_stdlib("๐Ÿ‘ฉ\u{200D}๐Ÿ‘ฉ")` -> `["๐Ÿ‘ฉ\u{200D}๐Ÿ‘ฉ"]` + - `str.chars("cafรฉ")` -> `["c", "a", "f", "รฉ"]` + - `str.chars_stdlib("๐Ÿ‘ฉ\u{200D}๐Ÿ‘ฉ")` -> `["๐Ÿ‘ฉ\u{200D}๐Ÿ‘ฉ"]` - **Notes:** - Both functions return a `List(String)` of grapheme clusters. 
@@ -19,7 +19,11 @@ **Guidance for library maintainers** - Keep `chars/1` as an experimental reference implementation. If a user-reported bug shows a clear mismatch between `chars/1` and the BEAM stdlib for a case that matters, prefer fixing docs or recommending `chars_stdlib/1` rather than changing the experimental algorithm in-place unless necessary. -# str/tokenize โ€” Pure-Gleam tokenizer (reference) + +# str/internal/tokenize โ€” Pure-Gleam tokenizer (reference) + +> **Note (2.0+):** This module is internal. Access these functions via the public `str` module: +> `str.chars(text)` and `str.chars_stdlib(text)`. This module contains a tokenizer implemented entirely in Gleam as a pedagogical reference. It is not intended to replace standard library APIs, but to show how to iterate grapheme clusters in pure Gleam without NIFs or native dependencies. @@ -28,16 +32,15 @@ Key functions - `chars(text: String) -> List(String)` : Returns the list of grapheme clusters for the input string. -- `words(text: String) -> List(String)` - : Simple whitespace-normalized word split. - -- `scan_with_state(text, init_state, fun)` - : Generic scanner that calls `fun` for each grapheme with a state value. +- `chars_stdlib(text: String) -> List(String)` + : Uses the BEAM stdlib grapheme segmentation (more accurate). Example ```gleam -let chars = str_tokenize::chars("cafรฉ") +import str + +let chars = str.chars("cafรฉ") // -> ["c", "a", "f", "รฉ"] ``` diff --git a/gleam.toml b/gleam.toml index 1c02158..0c5ec74 100644 --- a/gleam.toml +++ b/gleam.toml @@ -1,5 +1,5 @@ name = "str" -version = "2.0.0" +version = "2.0.1" # Project metadata (fill or replace placeholders before publishing) description = "Unicode-aware string utilities for Gleam: grapheme-safe operations, pragmatic ASCII transliteration, and slug generation." 
diff --git a/src/str/internal/core.gleam b/src/str/internal/core.gleam index ad07923..3f6d837 100644 --- a/src/str/internal/core.gleam +++ b/src/str/internal/core.gleam @@ -82,7 +82,7 @@ fn truncate_with_emoji_inclusion( // clusters to include up to that emoji to avoid splitting it. let window = list.take(clusters, take + grapheme_len(suffix)) - let keep = case find_emoji_index(window, 0) { + let keep = case find_emoji_index(window) { Ok(idx) -> // If emoji is beyond current `take`, include up to the emoji. case idx + 1 > take { @@ -98,21 +98,17 @@ fn truncate_with_emoji_inclusion( /// Finds the index of the first emoji-containing cluster in a list. /// /// Returns Ok(index) if found, Error(Nil) if no emoji detected. -fn find_emoji_index(clusters: List(String), start: Int) -> Result(Int, Nil) { - find_emoji_index_loop(clusters, start, 0) +fn find_emoji_index(clusters: List(String)) -> Result(Int, Nil) { + find_emoji_index_loop(clusters, 0) } -fn find_emoji_index_loop( - clusters: List(String), - idx: Int, - offset: Int, -) -> Result(Int, Nil) { +fn find_emoji_index_loop(clusters: List(String), index: Int) -> Result(Int, Nil) { case clusters { [] -> Error(Nil) [first, ..rest] -> case cluster_has_emoji(first) { - True -> Ok(idx + offset) - False -> find_emoji_index_loop(rest, idx, offset + 1) + True -> Ok(index) + False -> find_emoji_index_loop(rest, index + 1) } } } @@ -180,9 +176,13 @@ pub fn is_blank(text: String) -> Bool { /// /// Internal helper for padding operations. Returns empty string if n <= 0. fn repeat_str(s: String, n: Int) -> String { + repeat_str_loop(s, n, "") +} + +fn repeat_str_loop(s: String, n: Int, acc: String) -> String { case n <= 0 { - True -> "" - False -> list.fold(list.range(1, n), "", fn(acc, _) { acc <> s }) + True -> acc + False -> repeat_str_loop(s, n - 1, acc <> s) } } @@ -292,7 +292,7 @@ pub fn pad_right(text: String, width: Int, pad: String) -> String { } /// Centers text within the specified width using the given padding. 
-/// When padding is uneven, the left side receives more (left-biased). +/// When padding is uneven, the right side receives more (text shifts left). /// /// center("hi", 6, " ") -> " hi " /// center("hi", 5, " ") -> " hi " @@ -457,9 +457,9 @@ pub fn truncate_with_flag( case max_len <= 0 { True -> "" False -> - // If the text already fits and appending the suffix wouldn't exceed max_len, - // return the original text. - case total <= max_len && total + suffix_len <= max_len { + // If the text already fits within max_len, return it unchanged. + // The suffix is only added when text is actually truncated. + case total <= max_len { True -> text False -> case take <= 0 { @@ -1203,7 +1203,7 @@ pub fn ensure_suffix(text: String, suffix: String) -> String { /// starts_with_any("test", []) -> False /// pub fn starts_with_any(text: String, prefixes: List(String)) -> Bool { - list.any(prefixes, fn(prefix) { string.starts_with(text, prefix) }) + list.any(prefixes, fn(prefix) { starts_with(text, prefix) }) } /// Checks if text ends with any of the given suffixes. @@ -1213,7 +1213,7 @@ pub fn starts_with_any(text: String, prefixes: List(String)) -> Bool { /// ends_with_any("test", []) -> False /// pub fn ends_with_any(text: String, suffixes: List(String)) -> Bool { - list.any(suffixes, fn(suffix) { string.ends_with(text, suffix) }) + list.any(suffixes, fn(suffix) { ends_with(text, suffix) }) } // ============================================================================ diff --git a/test/str_core_test.gleam b/test/str_core_test.gleam index ac9731b..9ce3760 100644 --- a/test/str_core_test.gleam +++ b/test/str_core_test.gleam @@ -1212,3 +1212,38 @@ pub fn normalize_whitespace_empty_test() { pub fn normalize_whitespace_only_spaces_test() { assert str.normalize_whitespace(" ") == "" } + +// --- Fix regression tests --- + +/// truncate should NOT truncate text that already fits within max_len, +/// even when text + suffix would exceed max_len. 
+pub fn truncate_noop_when_text_fits_test() { + // 6 chars, max_len=8: text fits, no truncation needed + assert str.truncate("Hello!", 8, "...") == "Hello!" + // exact fit + assert str.truncate("12345678", 8, "...") == "12345678" + // well within limit + assert str.truncate("Hi", 10, "...") == "Hi" +} + +/// starts_with_any should be grapheme-aware (combining marks). +pub fn starts_with_any_grapheme_aware_test() { + // "e" + combining accent = one grapheme "รฉ" + let text = "e\u{0301}llo" + // byte-level would match "e", but grapheme-aware should not + assert str.starts_with_any(text, ["e"]) == False + assert str.starts_with_any(text, ["e\u{0301}"]) == True +} + +/// ends_with_any should be grapheme-aware (combining marks). +pub fn ends_with_any_grapheme_aware_test() { + let text = "caf" <> "e\u{0301}" + assert str.ends_with_any(text, ["e"]) == False + assert str.ends_with_any(text, ["e\u{0301}"]) == True +} + +/// center right-bias: odd padding gives more to the right side. +pub fn center_right_bias_test() { + // width=5, text="hi" (2), total_pad=3 โ†’ left=1, right=2 + assert str.center("hi", 5, " ") == " hi " +}