3 changes: 3 additions & 0 deletions .github/workflows/ci.yml
@@ -48,6 +48,7 @@ jobs:

      - name: Run tests with coverage
        run: just check-tests-coverage-normal

      - name: Upload coverage data
        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4
        with:
@@ -70,6 +71,7 @@ jobs:

      - name: Run tests with coverage
        run: just check-tests-coverage-antithesis

      - name: Upload coverage data
        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4
        with:
@@ -96,6 +98,7 @@ jobs:
        with:
          pattern: coverage-*
          merge-multiple: true

      - name: Check coverage is 100%
        run: just check-coverage-report

3 changes: 3 additions & 0 deletions RELEASE.md
@@ -0,0 +1,3 @@
RELEASE_TYPE: patch

This release adds support for alphabet parameters in `{"type": "string"}` and `{"type": "regex"}` schemas, allowing control over which characters are generated. Supported parameters are `codec`, `min_codepoint`, `max_codepoint`, `categories`, `exclude_categories`, `exclude_characters`, and `include_characters`.
53 changes: 50 additions & 3 deletions docs/library-api.md
@@ -192,13 +192,53 @@ Generate Unicode text strings.
**Parameters:**
- `min_size` (int): Minimum string length. Default: 0.
- `max_size` (int, optional): Maximum string length. Default: no limit.
- `codec` (string, optional): Restrict to characters encodable in this codec (e.g. `"ascii"`, `"utf-8"`, `"latin-1"`).
- `min_codepoint` (int, optional): Minimum Unicode codepoint.
- `max_codepoint` (int, optional): Maximum Unicode codepoint.
- `categories` (list of strings, optional): Include only characters from these Unicode general categories (e.g. `["L", "Nd"]`). Mutually exclusive with `exclude_categories`.
- `exclude_categories` (list of strings, optional): Exclude characters from these Unicode general categories. Mutually exclusive with `categories`.
- `include_characters` (string, optional): Always include these specific characters, even if excluded by other filters.
- `exclude_characters` (string, optional): Always exclude these specific characters.

When no character filtering parameters are set, the server generates from the
full Unicode range. Client libraries that cannot represent surrogates (e.g.
Rust) should send `exclude_categories: ["Cs"]` in their schemas by default.

**Schema:**
```json
{"type": "string", "min_size": <int>, "max_size": <int>}
{
"type": "string",
"min_size": <int>,
"max_size": <int>,
"codec": <string>,
"min_codepoint": <int>,
"max_codepoint": <int>,
"categories": [<string>, ...],
"exclude_categories": [<string>, ...],
"include_characters": <string>,
"exclude_characters": <string>
}
```

`max_size` is omitted when unspecified.
All fields except `type` and `min_size` are omitted when unspecified.

**Basic:** Always. No transform.

### `characters`

Generate single Unicode characters. This is a convenience for
`text(min_size=1, max_size=1)` — it returns the same type/schema with length
fixed to 1. The `characters` type is also accepted by `from_regex`'s `alphabet`
parameter.

**Parameters:** Same character filtering parameters as `text` (all optional):
`codec`, `min_codepoint`, `max_codepoint`, `categories`, `exclude_categories`,
`include_characters`, `exclude_characters`.

**Schema:**
```json
{"type": "string", "min_size": 1, "max_size": 1, "codec": <string>, ...}
```

**Basic:** Always. No transform.
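A minimal client-side sketch of that equivalence (the helper name is hypothetical):

```python
def characters_schema(**char_params: object) -> dict:
    # `characters` is just `text` with both size bounds pinned to 1; the
    # keyword arguments are the filtering keys listed above (codec, ...).
    return {"type": "string", "min_size": 1, "max_size": 1, **char_params}
```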

@@ -263,12 +303,19 @@ Generate strings matching a regular expression pattern.
**Parameters:**
- `pattern` (string): The regex pattern.
- `fullmatch` (bool): Whether the entire string must match. Default: `false`.
- `alphabet` (Characters, optional): Constrain which characters may appear in
generated strings. Accepts a `characters()` value. When unspecified, the
server generates from the full Unicode range.

**Schema:**
```json
{"type": "regex", "pattern": <string>, "fullmatch": <bool>}
{"type": "regex", "pattern": <string>, "fullmatch": <bool>, "alphabet": {...}}
```

The `alphabet` object contains the same character filtering keys as `text`:
`codec`, `min_codepoint`, `max_codepoint`, `categories`, `exclude_categories`,
`include_characters`, `exclude_characters`. It is omitted when unspecified.

**Basic:** Always. No transform.

### Format Generators
120 changes: 103 additions & 17 deletions src/hegel/conformance.py
@@ -2,20 +2,94 @@
import os
import subprocess
import tempfile
import unicodedata
from abc import ABC, abstractmethod
from collections.abc import Collection
from encodings.aliases import aliases
from pathlib import Path
from typing import Any, ClassVar

import pytest
from hypothesis import (
    Phase,
    assume,
    currently_in_test_context,
    given,
    note,
    settings as Settings,
    strategies as st,
)
from hypothesis.errors import InvalidArgument
from hypothesis.internal import charmap


def _can_encode(codec: str) -> bool:
    try:
        "".encode(codec)
        return True
    except Exception:
        return False


ALL_CATEGORIES = list(charmap.categories())
ALL_CODECS = sorted({c for c in set(aliases).union(aliases.values()) if _can_encode(c)})


@st.composite
def _character_params(draw: st.DrawFn) -> dict[str, Any]:
    params: dict[str, Any] = {}

    use_codec = draw(st.booleans())
    use_min_codepoint = draw(st.booleans())
    use_max_codepoint = draw(st.booleans())
    use_categories = draw(st.booleans())
    use_exclude_categories = draw(st.booleans())
    use_exclude_chars = draw(st.booleans())
    use_include_chars = draw(st.booleans())

    # categories and exclude_categories are mutually exclusive
    assume(not (use_categories and use_exclude_categories))

    if use_codec:
        params["codec"] = draw(st.sampled_from(ALL_CODECS))

    if use_min_codepoint:
        params["min_codepoint"] = draw(st.integers(0, 0x10FFFF))

    if use_max_codepoint:
        lo = params.get("min_codepoint", 0)
        params["max_codepoint"] = draw(st.integers(lo, 0x10FFFF))

    if use_categories:
        params["categories"] = draw(st.lists(st.sampled_from(ALL_CATEGORIES)))

    if use_exclude_categories:
        params["exclude_categories"] = draw(st.lists(st.sampled_from(ALL_CATEGORIES)))

    if use_exclude_chars:
        params["exclude_characters"] = draw(st.text())

    if use_include_chars:
        params["include_characters"] = draw(st.text())

    # reject invalid combinations
    try:
        st.characters(**params).validate()
    except (InvalidArgument, ValueError):
        assume(False)

    return params


@st.composite
def text_params_strategy(draw: st.DrawFn) -> dict[str, Any]:
    char_params = draw(_character_params())
    min_size = draw(st.integers(0, 20))
    max_size = draw(st.none() | st.integers(min_size, 20))
    params: dict[str, Any] = {"min_size": min_size, **char_params}
    if max_size is not None:
        params["max_size"] = max_size
    return params


@st.composite
@@ -329,35 +403,47 @@ def validate(

class TextConformance(ConformanceTest):
    def params_strategy(self) -> st.SearchStrategy[dict[str, Any]]:
        @st.composite
        def strategy(draw: st.DrawFn) -> dict[str, Any]:
            use_min_size = draw(st.booleans())
            use_max_size = draw(st.booleans())

            min_size = draw(st.integers(0, 50)) if use_min_size else 0
            max_size = (
                draw(st.integers(min_value=min_size, max_value=100))
                if use_max_size
                else None
            )

            return {"min_size": min_size, "max_size": max_size}

        return strategy()
        return text_params_strategy()

    def validate(
        self,
        metrics_list: list[dict[str, Any]],
        params: dict[str, Any],
    ) -> None:
        expanded_cats = (
            set(charmap.as_general_categories(params["categories"]))
            if "categories" in params
            else set()
        )
        expanded_exclude_cats = (
            set(charmap.as_general_categories(params["exclude_categories"]))
            if "exclude_categories" in params
            else set()
        )

        for metrics in metrics_list:
            if currently_in_test_context():
                note(f"metrics: {metrics}")
            length = metrics["length"]
            codepoints = metrics["codepoints"]
            length = len(codepoints)
            assert length >= params["min_size"]
            if params["max_size"] is not None:
            if params.get("max_size") is not None:
                assert length <= params["max_size"]

            for cp in codepoints:
                if "min_codepoint" in params:
                    assert cp >= params["min_codepoint"]
                if "max_codepoint" in params:
                    assert cp <= params["max_codepoint"]
                if "codec" in params:
                    chr(cp).encode(params["codec"])
                if expanded_cats:
                    assert unicodedata.category(chr(cp)) in expanded_cats
                if expanded_exclude_cats:
                    assert unicodedata.category(chr(cp)) not in expanded_exclude_cats
                if "exclude_characters" in params:
                    assert chr(cp) not in params["exclude_characters"]


class BinaryConformance(ConformanceTest):
    def params_strategy(self) -> st.SearchStrategy[dict[str, Any]]:
2 changes: 1 addition & 1 deletion src/hegel/protocol/connection.py
@@ -19,7 +19,7 @@
if TYPE_CHECKING:
    from hegel.protocol.stream import Stream

PROTOCOL_VERSION = 0.8
PROTOCOL_VERSION = 0.9
HANDSHAKE_STRING = b"hegel_handshake_start"


45 changes: 33 additions & 12 deletions src/hegel/schema.py
@@ -2,6 +2,7 @@
import json
from typing import Any

from cbor2 import CBORTag
from hypothesis import strategies as st
from hypothesis.internal.cache import LRUCache
from hypothesis.internal.conjecture.data import ConjectureData
@@ -20,6 +21,27 @@ def do_draw(self, data: ConjectureData) -> bool:
        return data.draw_boolean(p=self.p)


# used to allow encoding of surrogate code points. See https://github.com/hegeldev/hegel-core/pull/72
HEGEL_STRING_TAG = 6


def _encode_value(value: object) -> object:
    # prepare a final generated value for transport in the protocol.
    if isinstance(value, str):
        # this is sometimes called "WTF-8" encoding: https://wtf-8.codeberg.page/. It is
        # identical to UTF-8 except it drops the well-formed requirement that surrogates
        # not appear.
        return CBORTag(HEGEL_STRING_TAG, value.encode("utf-8", "surrogatepass"))
    if isinstance(value, list):
        return [_encode_value(v) for v in value]
    if isinstance(value, tuple):
        return [_encode_value(v) for v in value]
    # nocover because we don't actually generate dicts yet
    if isinstance(value, dict):  # pragma: no cover
        return {_encode_value(k): _encode_value(v) for k, v in value.items()}
    return value
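The `surrogatepass` error handler is what makes lone surrogates transportable here; a small stdlib-only illustration (independent of the code above):

```python
# A lone surrogate such as U+D800 is rejected by strict UTF-8, but
# "surrogatepass" encodes it with the generalized (WTF-8-style) byte
# pattern and decodes it back losslessly.
s = "ok\ud800"

raised = False
try:
    s.encode("utf-8")  # strict mode refuses lone surrogates
except UnicodeEncodeError:
    raised = True
assert raised

wtf8 = s.encode("utf-8", "surrogatepass")
assert wtf8 == b"ok\xed\xa0\x80"
assert wtf8.decode("utf-8", "surrogatepass") == s  # lossless round-trip
```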


def _from_schema(schema: dict[str, Any]) -> SearchStrategy[Any]:
    schema_type = schema["type"]

@@ -49,14 +71,11 @@ def _from_schema(schema: dict[str, Any]) -> SearchStrategy[Any]:
            exclude_max=schema.get("exclude_max", False),
        )
    if schema_type == "string":
        # Exclude null bytes due to reflect-cpp truncation bug:
        # https://github.com/getml/reflect-cpp/issues/559
        # Exclude surrogates (Cs category) as they're invalid in UTF-8/JSON
        characters_schema = {
            k: v for k, v in schema.items() if k not in {"type", "min_size", "max_size"}
        }
        return st.text(
            alphabet=st.characters(
                blacklist_characters="\x00",
                blacklist_categories=("Cs",),  # type: ignore[arg-type]
            ),
            alphabet=st.characters(**characters_schema),
            min_size=schema.get("min_size", 0),
            max_size=schema.get("max_size"),
        )
@@ -66,9 +85,16 @@ def _from_schema(schema: dict[str, Any]) -> SearchStrategy[Any]:
            max_size=schema.get("max_size"),
        )
    if schema_type == "regex":
        alphabet_schema = schema.get("alphabet")
        return st.from_regex(
            schema["pattern"],
            fullmatch=schema.get("fullmatch", False),
            alphabet=(
                # hypothesis typing bug, I think
                None
                if alphabet_schema is None
                else st.characters(**alphabet_schema)  # type: ignore
            ),
        )
    if schema_type == "list":
        return st.lists(
@@ -78,11 +104,6 @@ def _from_schema(schema: dict[str, Any]) -> SearchStrategy[Any]:
            unique=schema.get("unique", False),
        )
    if schema_type == "dict":
        # Possibly "dict" should be removed entirely and replaced by libraries calling "tuple"
        # themselves.
        #
        # We initially returned a tuple here to avoid json requiring string keys in dicts,
        # but since we switched to cbor that's no longer a problem.
        return st.dictionaries(
            keys=_from_schema(schema["keys"]),
            values=_from_schema(schema["values"]),
5 changes: 3 additions & 2 deletions src/hegel/server.py
@@ -25,7 +25,7 @@
from hypothesis.internal.conjecture.utils import calc_label_from_name, many

from hegel.protocol import Connection, ProtocolError, Stream
from hegel.schema import from_schema
from hegel.schema import _encode_value, from_schema
from hegel.utils import UniqueIdentifier, not_set

VARIABLES_LABEL = calc_label_from_name("Variables")
@@ -184,7 +184,8 @@ def handle_client_request(message: dict) -> Any:
    if command == "generate":
        schema = message["schema"]
        strategy = from_schema(schema)
        return data.draw(strategy)
        result = data.draw(strategy)
        return _encode_value(result)
    elif command == "start_span":
        label = message.get("label", 0)
        data.start_span(label)