Skip to content
Closed
2 changes: 1 addition & 1 deletion .github/coverage-ratchet.json
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
{
"nocov": 529
"nocov": 528
}
12 changes: 12 additions & 0 deletions RELEASE.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
RELEASE_TYPE: patch

Add more configuration options to `generators::text`. You can now write:

```rust
text().codec("ascii")
text().alphabet("abc")
text().min_codepoint(lo).max_codepoint(hi)
text().categories(&["Lu"])
```

This patch also adds `generators::characters`, with similar configuration options. `characters()` is a convenience for `text().min_size(1).max_size(1)`.
2 changes: 1 addition & 1 deletion justfile
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ check-coverage:

check-conformance:
cargo build --release --manifest-path tests/conformance/rust/Cargo.toml
uv run --with 'hegel-core==0.3.0' --with pytest --with hypothesis \
uv run --with 'hegel-core==0.3.1' --with pytest --with hypothesis \
pytest tests/conformance/test_conformance.py

# these aliases are provided as ux improvements for local developers. CI should use the longer
Expand Down
7 changes: 4 additions & 3 deletions src/generators/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -49,9 +49,10 @@ pub use generators::{BoxedGenerator, Filtered, FlatMapped, Generator, Mapped};
pub use misc::{BoolGenerator, JustGenerator, booleans, just, unit};
pub use numeric::{Float, FloatGenerator, Integer, IntegerGenerator, floats, integers};
pub use strings::{
BinaryGenerator, DateGenerator, DateTimeGenerator, DomainGenerator, EmailGenerator,
IpAddressGenerator, RegexGenerator, TextGenerator, TimeGenerator, UrlGenerator, binary, dates,
datetimes, domains, emails, from_regex, ip_addresses, text, times, urls,
BinaryGenerator, CharactersGenerator, DateGenerator, DateTimeGenerator, DomainGenerator,
EmailGenerator, IpAddressGenerator, RegexGenerator, TextGenerator, TimeGenerator, UrlGenerator,
binary, characters, dates, datetimes, domains, emails, from_regex, ip_addresses, text, times,
urls,
};
pub use time::{DurationGenerator, durations};
#[doc(hidden)]
Expand Down
257 changes: 248 additions & 9 deletions src/generators/strings.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,156 @@ use super::{BasicGenerator, Generator, TestCase};
use crate::cbor_utils::{cbor_array, cbor_map, map_insert};
use ciborium::Value;

/// Generator for single Unicode characters. Created by [`characters()`].
pub struct CharactersGenerator {
codec: Option<String>,
min_codepoint: Option<u32>,
max_codepoint: Option<u32>,
categories: Option<Vec<String>>,
exclude_categories: Option<Vec<String>>,
include_characters: Option<String>,
exclude_characters: Option<String>,
}

impl CharactersGenerator {
/// Restrict to this codec (e.g. `"ascii"`, `"utf-8"`).
pub fn codec(mut self, codec: &str) -> Self {
self.codec = Some(codec.to_string());
self
}

/// Set the minimum Unicode codepoint.
pub fn min_codepoint(mut self, cp: u32) -> Self {
self.min_codepoint = Some(cp);
self
}

/// Set the maximum Unicode codepoint.
pub fn max_codepoint(mut self, cp: u32) -> Self {
self.max_codepoint = Some(cp);
self
}

/// Include only characters from these Unicode categories (e.g. `["L", "Nd"]`).
/// Mutually exclusive with [`exclude_categories`](Self::exclude_categories).
pub fn categories(mut self, cats: &[&str]) -> Self {
self.categories = Some(cats.iter().map(|s| s.to_string()).collect());
self.exclude_categories = None;
self
}

/// Exclude characters from these Unicode categories.
/// Mutually exclusive with [`categories`](Self::categories).
///
/// The `Cs` (surrogate) category is always excluded because Rust strings
/// must be valid UTF-8 and cannot represent surrogate code points.
pub fn exclude_categories(mut self, cats: &[&str]) -> Self {
let mut all: Vec<String> = cats.iter().map(|s| s.to_string()).collect();
if !all.iter().any(|c| c == "Cs") {
all.push("Cs".to_string());
}
self.exclude_categories = Some(all);
self.categories = None;
self
}

/// Always include these specific characters, even if excluded by other filters.
pub fn include_characters(mut self, chars: &str) -> Self {
self.include_characters = Some(chars.to_string());
self
}

/// Always exclude these specific characters.
pub fn exclude_characters(mut self, chars: &str) -> Self {
self.exclude_characters = Some(chars.to_string());
self
}

fn insert_into_schema(&self, schema: &mut Value) {
if let Some(ref codec) = self.codec {
map_insert(schema, "codec", codec.as_str());
}
if let Some(codepoint) = self.min_codepoint {
map_insert(schema, "min_codepoint", codepoint as u64);
}
if let Some(codepoint) = self.max_codepoint {
map_insert(schema, "max_codepoint", codepoint as u64);
}
if let Some(ref cats) = self.categories {
map_insert(
schema,
"categories",
Value::Array(cats.iter().map(|s| Value::from(s.as_str())).collect()),
);
}
if let Some(ref cats) = self.exclude_categories {
map_insert(
schema,
"exclude_categories",
Value::Array(cats.iter().map(|s| Value::from(s.as_str())).collect()),
);
}
if let Some(ref chars) = self.include_characters {
map_insert(schema, "include_characters", chars.as_str());
}
if let Some(ref chars) = self.exclude_characters {
map_insert(schema, "exclude_characters", chars.as_str());
}
}

fn build_alphabet_schema(&self) -> Value {
let mut schema = cbor_map! {};
self.insert_into_schema(&mut schema);
schema
}
}

impl Generator<String> for CharactersGenerator {
fn do_draw(&self, tc: &TestCase) -> String {
let mut schema = cbor_map! {
"type" => "string",
"min_size" => 1u64,
"max_size" => 1u64
};
self.insert_into_schema(&mut schema);
super::generate_from_schema(tc, &schema)
}

fn as_basic(&self) -> Option<BasicGenerator<'_, String>> {
let mut schema = cbor_map! {
"type" => "string",
"min_size" => 1u64,
"max_size" => 1u64
};
self.insert_into_schema(&mut schema);
Some(BasicGenerator::new(schema, super::deserialize_value))
}
}

/// Generate single Unicode characters.
///
/// By default, surrogates (Unicode category `Cs`) are excluded because Rust's
/// `char` type cannot represent them. Other client libraries (e.g. TypeScript,
/// Python) may include surrogates by default.
pub fn characters() -> CharactersGenerator {
CharactersGenerator {
codec: None,
min_codepoint: None,
max_codepoint: None,
categories: None,
exclude_categories: Some(vec!["Cs".to_string()]),
include_characters: None,
exclude_characters: None,
}
}

/// Generator for Unicode text strings. Created by [`text()`].
pub struct TextGenerator {
min_size: usize,
max_size: Option<usize>,
characters: CharactersGenerator,
alphabet_called: bool,
character_param_called: bool,
}

impl TextGenerator {
Expand All @@ -21,7 +167,81 @@ impl TextGenerator {
self
}

/// Use a fixed set of characters. Each character in the string is a member
/// of the alphabet.
///
/// Mutually exclusive with the character filtering methods like `codec`,
/// `categories`, `min_codepoint`, etc.
pub fn alphabet(mut self, chars: &str) -> Self {
self.characters = CharactersGenerator {
codec: None,
min_codepoint: None,
max_codepoint: None,
categories: Some(vec![]),
exclude_categories: None,
include_characters: Some(chars.to_string()),
exclude_characters: None,
};
self.alphabet_called = true;
self
}

/// Restrict to characters encodable in this codec (e.g. `"ascii"`, `"utf-8"`, `"latin-1"`).
pub fn codec(mut self, codec: &str) -> Self {
self.character_param_called = true;
self.characters = self.characters.codec(codec);
self
}

/// Set the minimum Unicode codepoint.
pub fn min_codepoint(mut self, cp: u32) -> Self {
self.character_param_called = true;
self.characters = self.characters.min_codepoint(cp);
self
}

/// Set the maximum Unicode codepoint.
pub fn max_codepoint(mut self, cp: u32) -> Self {
self.character_param_called = true;
self.characters = self.characters.max_codepoint(cp);
self
}

/// Include only characters from these Unicode categories (e.g. `["L", "Nd"]`).
/// Mutually exclusive with [`exclude_categories`](Self::exclude_categories).
pub fn categories(mut self, cats: &[&str]) -> Self {
self.character_param_called = true;
self.characters = self.characters.categories(cats);
self
}

/// Exclude characters from these Unicode categories.
/// Mutually exclusive with [`categories`](Self::categories).
pub fn exclude_categories(mut self, cats: &[&str]) -> Self {
self.character_param_called = true;
self.characters = self.characters.exclude_categories(cats);
self
}

/// Always include these specific characters, even if excluded by other filters.
pub fn include_characters(mut self, chars: &str) -> Self {
self.character_param_called = true;
self.characters = self.characters.include_characters(chars);
self
}

/// Always exclude these specific characters.
pub fn exclude_characters(mut self, chars: &str) -> Self {
self.character_param_called = true;
self.characters = self.characters.exclude_characters(chars);
self
}

fn build_schema(&self) -> Value {
assert!(
!(self.alphabet_called && self.character_param_called),
"Cannot combine .alphabet() with character methods."
);
if let Some(max) = self.max_size {
assert!(self.min_size <= max, "Cannot have max_size < min_size");
}
Expand All @@ -35,6 +255,8 @@ impl TextGenerator {
map_insert(&mut schema, "max_size", max as u64);
}

self.characters.insert_into_schema(&mut schema);

schema
}
}
Expand All @@ -45,9 +267,10 @@ impl Generator<String> for TextGenerator {
}

fn as_basic(&self) -> Option<BasicGenerator<'_, String>> {
Some(BasicGenerator::new(self.build_schema(), |raw| {
super::deserialize_value(raw)
}))
Some(BasicGenerator::new(
self.build_schema(),
super::deserialize_value,
))
}
}

Expand All @@ -56,6 +279,9 @@ pub fn text() -> TextGenerator {
TextGenerator {
min_size: 0,
max_size: None,
characters: characters(),
alphabet_called: false,
character_param_called: false,
}
}

Expand All @@ -66,25 +292,36 @@ pub fn text() -> TextGenerator {
pub struct RegexGenerator {
pattern: String,
fullmatch: bool,
alphabet: Option<CharactersGenerator>,
}

impl RegexGenerator {
/// Set whether the entire string must match the pattern, not just contain a match.
// nocov start
pub fn fullmatch(mut self, fullmatch: bool) -> Self {
self.fullmatch = fullmatch;
self
// nocov end
}

/// Constrain which characters may appear in generated strings.
pub fn alphabet(mut self, alphabet: CharactersGenerator) -> Self {
self.alphabet = Some(alphabet);
self
}

// nocov start
fn build_schema(&self) -> Value {
cbor_map! {
let mut schema = cbor_map! {
"type" => "regex",
"pattern" => self.pattern.as_str(),
"fullmatch" => self.fullmatch
// nocov end
};

if let Some(ref alphabet) = self.alphabet {
map_insert(&mut schema, "alphabet", alphabet.build_alphabet_schema());
}

schema
}
}

Expand All @@ -97,10 +334,11 @@ impl Generator<String> for RegexGenerator {

// nocov start
fn as_basic(&self) -> Option<BasicGenerator<'_, String>> {
Some(BasicGenerator::new(self.build_schema(), |raw| {
super::deserialize_value(raw)
Some(BasicGenerator::new(
self.build_schema(),
super::deserialize_value,
// nocov end
}))
))
}
}

Expand All @@ -110,6 +348,7 @@ pub fn from_regex(pattern: &str) -> RegexGenerator {
RegexGenerator {
pattern: pattern.to_string(),
fullmatch: false,
alphabet: None,
// nocov end
}
}
Expand Down
Loading