From 6bc330114ccd43ddb62f6f76328f4733a7b3f28c Mon Sep 17 00:00:00 2001 From: Johnny Greco Date: Fri, 6 Feb 2026 15:24:31 -0500 Subject: [PATCH 1/3] update init constraint imports --- .../data-designer-config/src/data_designer/config/__init__.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/packages/data-designer-config/src/data_designer/config/__init__.py b/packages/data-designer-config/src/data_designer/config/__init__.py index bbddb8e00..306192f03 100644 --- a/packages/data-designer-config/src/data_designer/config/__init__.py +++ b/packages/data-designer-config/src/data_designer/config/__init__.py @@ -60,6 +60,8 @@ from data_designer.config.run_config import RunConfig # noqa: F401 from data_designer.config.sampler_constraints import ( # noqa: F401 ColumnInequalityConstraint, + ConstraintType, + InequalityOperator, ScalarInequalityConstraint, ) from data_designer.config.sampler_params import ( # noqa: F401 @@ -168,6 +170,8 @@ "RunConfig": (f"{_MOD_BASE}.run_config", "RunConfig"), # sampler_constraints "ColumnInequalityConstraint": (_MOD_SAMPLER_CONSTRAINTS, "ColumnInequalityConstraint"), + "ConstraintType": (_MOD_SAMPLER_CONSTRAINTS, "ConstraintType"), + "InequalityOperator": (_MOD_SAMPLER_CONSTRAINTS, "InequalityOperator"), "ScalarInequalityConstraint": (_MOD_SAMPLER_CONSTRAINTS, "ScalarInequalityConstraint"), # sampler_params "BernoulliMixtureSamplerParams": (_MOD_SAMPLER_PARAMS, "BernoulliMixtureSamplerParams"), From 24751469bf216c780b0e3fd2fdaf62e875da5be3 Mon Sep 17 00:00:00 2001 From: Johnny Greco Date: Fri, 6 Feb 2026 16:25:51 -0500 Subject: [PATCH 2/3] add missing columns --- AGENTS.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/AGENTS.md b/AGENTS.md index eb71694a2..865845a7b 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -507,6 +507,8 @@ When working with column configurations, understand these key types: - **`ExpressionColumnConfig`**: Expression-based derived columns (Python eval or Jinja2) - **`ValidationColumnConfig`**: Validation results 
(Python, SQL, Code, Remote validators) - **`SeedDatasetColumnConfig`**: Data from seed datasets +- **`EmbeddingColumnConfig`**: Embedding generation for text columns using a specified model +- **`CustomColumnConfig`**: Custom user-defined column generators via `@custom_column_generator` decorator See [packages/data-designer-config/src/data_designer/config/column_configs.py](packages/data-designer-config/src/data_designer/config/column_configs.py) for detailed schemas. From e864a758189db180f5b5214c82d7639a531aab57 Mon Sep 17 00:00:00 2001 From: Johnny Greco Date: Fri, 6 Feb 2026 18:15:29 -0500 Subject: [PATCH 3/3] add library claude skill --- skill/README.md | 79 ++ skill/data-designer/SKILL.md | 310 ++++++ .../examples/basic_text_generation.py | 91 ++ .../examples/custom_column_with_llm.py | 101 ++ skill/data-designer/examples/mcp_tool_use.py | 117 +++ .../examples/seed_dataset_with_judge.py | 116 +++ .../examples/structured_and_code.py | 99 ++ .../hooks/check_data_designer.sh | 33 + skill/data-designer/hooks/ruff_lint.sh | 60 ++ skill/data-designer/hooks/ty_check.sh | 58 ++ .../references/advanced_patterns.md | 446 +++++++++ .../data-designer/references/api_reference.md | 557 +++++++++++ .../echo_data_designer_library_path.py | 32 + .../data-designer/scripts/get_column_info.py | 95 ++ .../scripts/get_processor_info.py | 70 ++ .../data-designer/scripts/get_sampler_info.py | 127 +++ .../scripts/get_validator_info.py | 92 ++ .../data-designer/scripts/helpers/__init__.py | 0 .../scripts/helpers/pydantic_info_utils.py | 445 +++++++++ skill/test_info_scripts.py | 900 ++++++++++++++++++ 20 files changed, 3828 insertions(+) create mode 100644 skill/README.md create mode 100644 skill/data-designer/SKILL.md create mode 100644 skill/data-designer/examples/basic_text_generation.py create mode 100644 skill/data-designer/examples/custom_column_with_llm.py create mode 100644 skill/data-designer/examples/mcp_tool_use.py create mode 100644 
skill/data-designer/examples/seed_dataset_with_judge.py create mode 100644 skill/data-designer/examples/structured_and_code.py create mode 100755 skill/data-designer/hooks/check_data_designer.sh create mode 100755 skill/data-designer/hooks/ruff_lint.sh create mode 100755 skill/data-designer/hooks/ty_check.sh create mode 100644 skill/data-designer/references/advanced_patterns.md create mode 100644 skill/data-designer/references/api_reference.md create mode 100644 skill/data-designer/scripts/echo_data_designer_library_path.py create mode 100644 skill/data-designer/scripts/get_column_info.py create mode 100644 skill/data-designer/scripts/get_processor_info.py create mode 100644 skill/data-designer/scripts/get_sampler_info.py create mode 100644 skill/data-designer/scripts/get_validator_info.py create mode 100644 skill/data-designer/scripts/helpers/__init__.py create mode 100644 skill/data-designer/scripts/helpers/pydantic_info_utils.py create mode 100644 skill/test_info_scripts.py diff --git a/skill/README.md b/skill/README.md new file mode 100644 index 000000000..d31faaf98 --- /dev/null +++ b/skill/README.md @@ -0,0 +1,79 @@ +# Data Designer Skill for Claude Code + +A [Claude Code skill](https://docs.anthropic.com/en/docs/claude-code/skills) that teaches Claude how to generate synthetic datasets using [NVIDIA NeMo Data Designer](https://github.com/NVIDIA-NeMo/DataDesigner). + +When activated, Claude can design and build complete data generation pipelines — choosing the right column types, writing prompts, wiring up dependencies, and iterating on previews — all from a natural language description of the dataset you want. + +## What's in the skill + +``` +.claude/skills/data-designer/ +├── SKILL.md # Core skill definition and workflow guide +├── references/ +│ ├── api_reference.md # Complete API documentation +│ └── advanced_patterns.md # Custom columns, MCP tools, multimodal, etc. 
+├── examples/ # 5 runnable pattern-reference scripts +├── scripts/ # Discovery tools for API introspection +└── hooks/ # Session startup check, ruff lint, ty type-check +``` + +## Prerequisites + +- **[uv](https://docs.astral.sh/uv/getting-started/installation/)** — used for environment management and required by the skill's session hooks +- **[Claude Code](https://docs.anthropic.com/en/docs/claude-code/overview)** — the CLI that runs the skill +- **Python 3.10+** — any version from 3.10 to 3.13 works (`uv` will install it for you) +- **An LLM provider API key** — e.g., an [NVIDIA API key](https://build.nvidia.com/) (`NVIDIA_API_KEY`) + +## Quick start + +### 1. Set up a project and download the skill + +```bash +mkdir my-project && cd my-project +mkdir -p .claude/skills +``` + +Download the `skill/data-designer` folder into `.claude/skills/data-designer`: + +```bash +# with curl +curl -L https://github.com/NVIDIA-NeMo/DataDesigner/archive/refs/heads/main.tar.gz \ + | tar xz --strip-components=2 -C .claude/skills "DataDesigner-main/skill/data-designer" + +# or with wget +wget -qO- https://github.com/NVIDIA-NeMo/DataDesigner/archive/refs/heads/main.tar.gz \ + | tar xz --strip-components=2 -C .claude/skills "DataDesigner-main/skill/data-designer" +``` + +### 2. Create a Python environment and install Data Designer + +```bash +uv venv --python 3.13 +source .venv/bin/activate +uv pip install --pre data-designer +``` + +> **Note:** The `--pre` flag installs the latest pre-release. + +### 3. Set up your default model providers and models + +Use the Data Designer CLI to configure your LLM provider(s) and model(s) interactively: + +```bash +# Configure a provider (endpoint, API key, etc.) +data-designer config providers + +# Configure model(s) that use the provider +data-designer config models + +# Verify your configuration +data-designer config list +``` + +The CLI walks you through each setting with an interactive prompt. 
You only need to do this once — configurations are saved to `~/.data-designer/`. + +### 4. Launch Claude Code + +```bash +claude +``` diff --git a/skill/data-designer/SKILL.md b/skill/data-designer/SKILL.md new file mode 100644 index 000000000..6e9a7d07b --- /dev/null +++ b/skill/data-designer/SKILL.md @@ -0,0 +1,310 @@ +--- +name: data-designer +description: >- + Generate synthetic datasets using NVIDIA NeMo Data Designer. + Use when the user wants to create, design, or generate synthetic data, + build training/evaluation datasets, generate text/code/structured data + with LLMs, score data quality with LLM judges, validate generated code, + or work with the data-designer Python library in any capacity. +argument-hint: [describe the dataset you want to generate] +disable-model-invocation: true +hooks: + SessionStart: + - matcher: startup + hooks: + - type: command + command: "$CLAUDE_PROJECT_DIR/.claude/skills/data-designer/hooks/check_data_designer.sh" + once: true + PostToolUse: + - matcher: Write|Edit + hooks: + - type: command + command: "$CLAUDE_PROJECT_DIR/.claude/skills/data-designer/hooks/ruff_lint.sh" $filePath + - type: command + command: "$CLAUDE_PROJECT_DIR/.claude/skills/data-designer/hooks/ty_check.sh" $filePath +--- + +# Data Designer Synthetic Dataset Generator + +Generate synthetic datasets using NVIDIA NeMo Data Designer. + +--- + +## 1. Before You Start + +**Pre-flight check** runs automatically on session start (Claude Code hook). +For Cursor, run manually: `.claude/skills/data-designer/hooks/check_data_designer.sh` + +**Clarify with the user:** purpose (training/eval/fine-tuning), record count, schema (columns/fields), seed data, quality needs (validation/judging), and model provider. + +**Verify environment and discover model aliases:** Run `uv run data-designer config list` to confirm API keys are set and to see the available providers, model aliases, and backing models. 
Use the aliases from this output (e.g., `nvidia-text`, `openai-reasoning`) as the `model_alias` argument in column configs. + +--- + +## 2. Schema Design + +Run discovery scripts to see available types: +```bash +uv run .claude/skills/data-designer/scripts/get_column_info.py --list +uv run .claude/skills/data-designer/scripts/get_sampler_info.py --list +uv run .claude/skills/data-designer/scripts/get_processor_info.py --list +uv run .claude/skills/data-designer/scripts/get_validator_info.py --list +``` + +### Column Type Decision Tree + +``` +Need data? ++-- Statistical/random values --> SamplerColumnConfig +| +-- Categorical --> SamplerType.CATEGORY +| +-- Hierarchical --> SamplerType.SUBCATEGORY +| +-- Person data --> SamplerType.PERSON (Nemotron Personas) / PERSON_FROM_FAKER +| +-- Dates --> SamplerType.DATETIME / TIMEDELTA +| +-- IDs --> SamplerType.UUID +| +-- Numeric --> UNIFORM, GAUSSIAN, POISSON, BINOMIAL, BERNOULLI, SCIPY ++-- LLM-generated content +| +-- Free-form text --> LLMTextColumnConfig +| +-- Code output --> LLMCodeColumnConfig (20+ languages) +| +-- Multiple related fields that must be internally consistent +| --> LLMStructuredColumnConfig (Pydantic or JSON schema) +| Use when: a single entity has 2+ fields that should cohere +| (e.g., a customer with name + title + department, a product +| with SKU + regulatory class + clearance type). One LLM call +| ensures consistency; separate LLM text columns would not. 
+| +-- Quality scoring --> LLMJudgeColumnConfig (Score rubrics) ++-- Derived/computed --> ExpressionColumnConfig (Jinja2, no LLM) ++-- Custom logic --> CustomColumnConfig (@custom_column_generator) ++-- From seed data --> with_seed_dataset() (auto-creates SeedDatasetColumnConfig) ++-- Embeddings --> EmbeddingColumnConfig ++-- Validation gates --> ValidationColumnConfig (code lint, callable, remote) +``` + +Before writing any column config, run the relevant info script for exact field details: +```bash +uv run .claude/skills/data-designer/scripts/get_column_info.py +uv run .claude/skills/data-designer/scripts/get_sampler_info.py +uv run .claude/skills/data-designer/scripts/get_processor_info.py +uv run .claude/skills/data-designer/scripts/get_validator_info.py +``` + +--- + +## 3. Workflow + +1. **Initialize** `DataDesigner()` and `DataDesignerConfigBuilder()` +2. **Add columns** in any order (DAG auto-resolves dependencies from Jinja2 templates) +3. **Add optional features**: constraints, processors, validators, seed data, profilers +4. **Validate** with `data_designer.validate(config_builder)` +5. **Preview** with `data_designer.preview(config_builder, num_records=5)` +6. **Iterate** on prompts/params based on preview +7. **Create** full dataset with `data_designer.create(config_builder, num_records=N, dataset_name="name")` +8. **Load and inspect** results + +### Default Model Aliases + +No manual `ModelConfig` needed — default aliases load automatically per provider. +Alias pattern: `{provider}-text`, `{provider}-reasoning`, `{provider}-vision`, `{provider}-embedding`. +**Always run `uv run data-designer config list` before writing column configs** to discover which aliases are available for the user's provider. Use those aliases as the `model_alias` argument — do not guess or hardcode aliases without checking. 
+ +### Minimal Skeleton + +```python +import data_designer.config as dd +from data_designer.interface import DataDesigner + +data_designer = DataDesigner() +config_builder = dd.DataDesignerConfigBuilder() + +# Add columns here... + +data_designer.validate(config_builder) +preview = data_designer.preview(config_builder, num_records=5) +preview.display_sample_record() + +results = data_designer.create(config_builder, num_records=100, dataset_name="my-dataset") +dataset = results.load_dataset() +analysis = results.load_analysis() +``` + +--- + +## 4. Examples + +Read these before writing any Data Designer script. Each demonstrates **patterns and API usage**, not domain-specific content — adapt the structure to the user's requirements, don't copy example-specific values (categories, prompts, schemas, etc.). + +- **`examples/basic_text_generation.py`** -- Product reviews: CATEGORY sampler with weights, PERSON_FROM_FAKER with drop, UNIFORM with convert_to, ExpressionColumnConfig, LLMTextColumnConfig. +- **`examples/structured_and_code.py`** -- Programming tasks: Pydantic BaseModel as output_format, LLMStructuredColumnConfig with nested field access, LLMCodeColumnConfig. +- **`examples/seed_dataset_with_judge.py`** -- Clinical notes: LocalFileSeedSource with SHUFFLE, UUID/DATETIME samplers, LLMJudgeColumnConfig with Score rubrics. +- **`examples/custom_column_with_llm.py`** -- Writer-editor pattern: @custom_column_generator with model access, side_effect_columns, multi-model orchestration. +- **`examples/mcp_tool_use.py`** -- MCP tool calling: LocalStdioMCPProvider, ToolConfig, tool_alias on LLM columns, trace capture. + +--- + +## 5. Key Patterns + +### Jinja2 Templating + +All prompts use Jinja2 for column references. Dependencies are auto-resolved. 
+ +```python +# Basic reference +"Write a review for {{ product_name }}" + +# Nested object access (Person sampler, structured output) +"{{ customer.first_name }} from {{ customer.city }}" +"{{ product_info.price }}" + +# Conditional logic +"{% if rating >= 4 %}positive{% else %}negative{% endif %}" + +# Filters +"{{ name | upper }}" +"{{ price | round(2) }}" +``` + +### Drop Intermediate Columns + +Generate a column for dependency use but exclude from final output: +```python +dd.SamplerColumnConfig(name="person", ..., drop=True) +dd.ExpressionColumnConfig(name="name", expr="{{ person.full_name }}") +``` + +### Structured Entity with Extracted Fields + +Generate a coherent multi-field entity with LLMStructuredColumnConfig, +then extract individual fields into top-level columns with ExpressionColumnConfig: + +```python +class CustomerProfile(BaseModel): + facility_name: str = Field(description="Healthcare facility name") + bed_count: int = Field(description="Number of beds (50-2000)") + department: str = Field(description="Primary department") + +config_builder.add_column(dd.LLMStructuredColumnConfig( + name="customer_profile", + prompt="...", + output_format=CustomerProfile, + model_alias="", +)) +# Extract individual fields for a flat output schema +config_builder.add_column(dd.ExpressionColumnConfig( + name="facility_name", expr="{{ customer_profile.facility_name }}")) +config_builder.add_column(dd.ExpressionColumnConfig( + name="department", expr="{{ customer_profile.department }}")) +``` + +This ensures facility_name and department are contextually consistent +(generated together) while still appearing as flat columns in the output. 
+ +### Seed Datasets + +Bootstrap from existing data (CSV, Parquet, HuggingFace, DataFrame): +```python +seed_source = dd.LocalFileSeedSource(path="data.csv") +config_builder.with_seed_dataset(seed_source, sampling_strategy=dd.SamplingStrategy.SHUFFLE) +# Seed columns are auto-available in Jinja2 templates: {{ column_from_seed }} +``` + +### Processors (Post-Generation Transforms) + +Transform output schema after generation: +```python +config_builder.add_processor(dd.SchemaTransformProcessorConfig( + name="chat_format", + template={ + "messages": [ + {"role": "user", "content": "{{ question }}"}, + {"role": "assistant", "content": "{{ answer }}"}, + ] + }, +)) +# Access: results.load_processor_dataset("chat_format") +``` + +### Validators (Quality Gates) + +Validate generated code or data: +```python +config_builder.add_column(dd.ValidationColumnConfig( + name="code_check", + target_columns=["solution"], + validator_type=dd.ValidatorType.CODE, + validator_params=dd.CodeValidatorParams(code_lang=dd.CodeLang.PYTHON), + batch_size=20, +)) +``` + +### Constraints (Sampler Bounds) + +```python +config_builder.add_constraint(dd.ScalarInequalityConstraint( + target_column="age", operator=dd.InequalityOperator.GE, rhs=18, +)) +config_builder.add_constraint(dd.ColumnInequalityConstraint( + target_column="end_date", operator=dd.InequalityOperator.GT, rhs="start_date", +)) +``` + +### Trace & Reasoning Capture + +```python +dd.LLMTextColumnConfig( + name="answer", + prompt="...", + model_alias="nvidia-reasoning", + with_trace=dd.TraceType.ALL_MESSAGES, # -> answer__trace column + extract_reasoning_content=True, # -> answer__reasoning_content column +) +``` + +### Performance Tuning (RunConfig) + +```python +from data_designer.config import RunConfig +data_designer.set_run_config(RunConfig( + buffer_size=500, # records per batch (default: 1000) + disable_early_shutdown=True, # don't stop on high error rate + max_conversation_restarts=7, # retries for strict schemas + 
max_conversation_correction_steps=2, # in-conversation corrections +)) +``` + +--- + +## 6. Best Practices + +- Always preview (3-5 records) before full generation +- Use samplers for diversity control (not LLMs) +- Keep prompts deterministic and scoped +- Add validators/judges when quality matters +- Use temperature distributions for output diversity +- Drop intermediate columns to keep final dataset clean +- Use `ExpressionColumnConfig` for derived fields (no LLM cost) +- Use `SchemaTransformProcessorConfig` to reshape for training formats (e.g., chat messages) +- Use `LLMStructuredColumnConfig` when generating 2+ related fields that must + be internally consistent (e.g., a person's name + title, a product's SKU + + regulatory class). This is more coherent than separate `LLMTextColumnConfig` + calls and more realistic than hardcoded `CATEGORY` samplers. Extract + individual fields with `ExpressionColumnConfig` for a flat output schema. +- Always call `validate()` before `preview()`/`create()` + +--- + +## 7. Reference Material + +**Start here:** Read the example scripts in `examples/` (Section 4). + +Then consult these references as needed: + +- **`references/api_reference.md`** -- Complete API: all column types, sampler types, model config, constraints, processors, validators, seed sources, MCP, RunConfig, profilers, results. +- **`references/advanced_patterns.md`** -- Custom columns with `@custom_column_generator`, MCP tool integration, multimodal inputs, schema transforms, performance tuning, multi-stage refinement, conditional sampling, Nemotron Personas, config serialization. + +--- + +## 8. 
Output Expectations + +- Produce a single runnable Python script +- Clear inputs: `num_records`, `model_alias`, `dataset_name` +- Clear outputs: dataset path, analysis report +- Include error handling for common issues diff --git a/skill/data-designer/examples/basic_text_generation.py b/skill/data-designer/examples/basic_text_generation.py new file mode 100644 index 000000000..21b65bb96 --- /dev/null +++ b/skill/data-designer/examples/basic_text_generation.py @@ -0,0 +1,91 @@ +"""Product Review Dataset — basic text generation patterns. + +PATTERN REFERENCE ONLY — copy the structure, not the domain-specific values. + +Demonstrates: +- Default model aliases (nvidia-text) — no manual ModelConfig needed +- CATEGORY sampler with weights +- PERSON_FROM_FAKER sampler with drop=True +- UNIFORM sampler with convert_to="int" +- ExpressionColumnConfig for derived fields +- LLMTextColumnConfig with Jinja2 templates referencing nested fields +- preview() + create() workflow +""" + +import data_designer.config as dd +from data_designer.interface import DataDesigner + +data_designer = DataDesigner() + +config_builder = dd.DataDesignerConfigBuilder() + +# --- Sampler columns --- + +config_builder.add_column( + dd.SamplerColumnConfig( + name="category", + sampler_type=dd.SamplerType.CATEGORY, + params=dd.CategorySamplerParams( + values=["Electronics", "Clothing", "Books", "Home"], + weights=[0.3, 0.25, 0.25, 0.2], + ), + ) +) + +config_builder.add_column( + dd.SamplerColumnConfig( + name="customer", + sampler_type=dd.SamplerType.PERSON_FROM_FAKER, + params=dd.PersonFromFakerSamplerParams(locale="en_US"), + drop=True, # keep derived fields, drop raw person object + ) +) + +config_builder.add_column( + dd.SamplerColumnConfig( + name="rating", + sampler_type=dd.SamplerType.UNIFORM, + params=dd.UniformSamplerParams(low=1, high=5), + convert_to="int", + ) +) + +# --- Expression column (derived from sampler) --- + +config_builder.add_column( + dd.ExpressionColumnConfig( + 
name="customer_name", + expr="{{ customer.first_name }} {{ customer.last_name }}", + ) +) + +# --- LLM text columns --- + +config_builder.add_column( + dd.LLMTextColumnConfig( + name="product_name", + prompt=("Create a creative product name for a {{ category }} product. Respond with only the product name."), + model_alias="nvidia-text", + ) +) + +config_builder.add_column( + dd.LLMTextColumnConfig( + name="review", + prompt=( + "You are {{ customer_name }}, a {{ customer.age }}-year-old from " + "{{ customer.city }}. Write a {{ rating }}-star review for " + "{{ product_name }}. Be authentic and detailed." + ), + model_alias="nvidia-text", + ) +) + +# --- Preview then create --- + +preview = data_designer.preview(config_builder, num_records=3) +preview.display_sample_record() + +results = data_designer.create(config_builder, num_records=100, dataset_name="product-reviews") +dataset = results.load_dataset() +analysis = results.load_analysis() diff --git a/skill/data-designer/examples/custom_column_with_llm.py b/skill/data-designer/examples/custom_column_with_llm.py new file mode 100644 index 000000000..6a2c9787c --- /dev/null +++ b/skill/data-designer/examples/custom_column_with_llm.py @@ -0,0 +1,101 @@ +"""Writer-Editor Dataset — custom column generator with multi-model orchestration. + +PATTERN REFERENCE ONLY — copy the structure, not the domain-specific values. 
+ +Demonstrates: +- Default model aliases (nvidia-text, nvidia-reasoning) — no manual ModelConfig needed +- @custom_column_generator decorator with model_aliases and side_effect_columns +- Multi-model orchestration (writer + editor) within a single custom column +- GenerationStrategy.CELL_BY_CELL for row-based LLM access +- CATEGORY sampler for topic diversity +- preview() + create() workflow +""" + +import data_designer.config as dd +from data_designer.interface import DataDesigner + +data_designer = DataDesigner() + +config_builder = dd.DataDesignerConfigBuilder() + +# --- Sampler columns --- + +config_builder.add_column( + dd.SamplerColumnConfig( + name="topic", + sampler_type=dd.SamplerType.CATEGORY, + params=dd.CategorySamplerParams( + values=[ + "renewable energy", + "space exploration", + "artificial intelligence", + "ocean conservation", + "quantum computing", + ], + ), + ) +) + +config_builder.add_column( + dd.SamplerColumnConfig( + name="audience", + sampler_type=dd.SamplerType.CATEGORY, + params=dd.CategorySamplerParams( + values=["general public", "technical experts", "students"], + weights=[0.4, 0.3, 0.3], + ), + ) +) + + +# --- Custom column: writer-editor pattern --- + + +@dd.custom_column_generator( + required_columns=["topic", "audience"], + side_effect_columns=["draft", "editorial_feedback"], + model_aliases=["nvidia-text", "nvidia-reasoning"], +) +def writer_editor(row: dict, generator_params: None, models: dict) -> dict: + # Step 1: Writer drafts the article + draft, _ = models["nvidia-text"].generate( + prompt=(f"Write a 200-word article about {row['topic']} for a {row['audience']} audience."), + ) + + # Step 2: Editor provides critique + feedback, _ = models["nvidia-reasoning"].generate( + prompt=(f"Review this article and provide 3 specific improvement suggestions:\n\n{draft}"), + ) + + # Step 3: Writer revises based on feedback + final, _ = models["nvidia-text"].generate( + prompt=( + f"Revise this article based on the editorial 
feedback.\n\n" + f"Original draft:\n{draft}\n\n" + f"Feedback:\n{feedback}\n\n" + f"Write the improved version." + ), + ) + + row["final_article"] = final + row["draft"] = draft + row["editorial_feedback"] = feedback + return row + + +config_builder.add_column( + dd.CustomColumnConfig( + name="final_article", + generator_function=writer_editor, + generation_strategy=dd.GenerationStrategy.CELL_BY_CELL, + ) +) + +# --- Preview then create --- + +preview = data_designer.preview(config_builder, num_records=3) +preview.display_sample_record() + +results = data_designer.create(config_builder, num_records=50, dataset_name="writer-editor") +dataset = results.load_dataset() +analysis = results.load_analysis() diff --git a/skill/data-designer/examples/mcp_tool_use.py b/skill/data-designer/examples/mcp_tool_use.py new file mode 100644 index 000000000..e1471f764 --- /dev/null +++ b/skill/data-designer/examples/mcp_tool_use.py @@ -0,0 +1,117 @@ +"""Fact-Checked Q&A Dataset — MCP tool calling with trace capture. + +PATTERN REFERENCE ONLY — copy the structure, not the domain-specific values. 
+ +Demonstrates: +- Default model aliases (nvidia-text) — no manual ModelConfig needed +- FastMCP server with @mcp_server.tool() decorators +- LocalStdioMCPProvider launching the script as a subprocess +- ToolConfig with allow_tools whitelist +- tool_alias on LLMTextColumnConfig for grounded generation +- TraceType.ALL_MESSAGES for full tool-call history capture +- Self-contained server/client pattern (same script serves both roles) +""" + +import sys + +# --- MCP Server Definition --- +from mcp.server.fastmcp import FastMCP + +mcp_server = FastMCP("demo-tools") + +KNOWLEDGE_BASE = { + "python": "Python was created by Guido van Rossum and first released in 1991.", + "rust": "Rust was first released in 2010 and is known for memory safety.", + "javascript": "JavaScript was created by Brendan Eich in 1995 in 10 days.", + "go": "Go was designed at Google by Robert Griesemer, Rob Pike, and Ken Thompson.", + "java": "Java was developed by James Gosling at Sun Microsystems, released in 1995.", +} + + +@mcp_server.tool() +def lookup_language_facts(language: str) -> str: + """Look up facts about a programming language. + + Args: + language: Name of the programming language (e.g., 'python', 'rust'). + """ + key = language.lower().strip() + if key in KNOWLEDGE_BASE: + return KNOWLEDGE_BASE[key] + return f"No facts available for '{language}'. 
Available: {', '.join(KNOWLEDGE_BASE)}" + + +@mcp_server.tool() +def get_available_languages() -> str: + """Get the list of programming languages available in the knowledge base.""" + return ", ".join(sorted(KNOWLEDGE_BASE.keys())) + + +# --- Main: Data Designer Client --- + +if __name__ == "__main__": + if len(sys.argv) > 1 and sys.argv[1] == "serve": + mcp_server.run() + else: + import data_designer.config as dd + from data_designer.interface import DataDesigner + + # MCP provider: launch this script as subprocess server + provider = dd.LocalStdioMCPProvider( + name="demo-tools", + command=sys.executable, + args=[__file__, "serve"], + ) + + # Tool config: restrict to our two tools + tool_config = dd.ToolConfig( + tool_alias="lang-tools", + providers=["demo-tools"], + allow_tools=["lookup_language_facts", "get_available_languages"], + max_tool_call_turns=5, + timeout_sec=30.0, + ) + + data_designer = DataDesigner(mcp_providers=[provider]) + config_builder = dd.DataDesignerConfigBuilder(tool_configs=[tool_config]) + + # --- Sampler columns --- + + config_builder.add_column( + dd.SamplerColumnConfig( + name="language", + sampler_type=dd.SamplerType.CATEGORY, + params=dd.CategorySamplerParams( + values=["Python", "Rust", "JavaScript", "Go", "Java"], + ), + ) + ) + + # --- LLM column with tool access and trace capture --- + + config_builder.add_column( + dd.LLMTextColumnConfig( + name="fact_summary", + prompt=( + "Use the available tools to look up facts about {{ language }}, " + "then write a 2-3 sentence summary about the language." + ), + model_alias="nvidia-text", + system_prompt=( + "You are a helpful assistant with access to a programming language " + "knowledge base. Always use the lookup tool to get accurate facts " + "before writing your summary." 
+ ), + tool_alias="lang-tools", + with_trace=dd.TraceType.ALL_MESSAGES, + ) + ) + + # --- Preview --- + + preview = data_designer.preview(config_builder, num_records=3) + preview.display_sample_record() + + results = data_designer.create(config_builder, num_records=10, dataset_name="lang-facts") + dataset = results.load_dataset() + analysis = results.load_analysis() diff --git a/skill/data-designer/examples/seed_dataset_with_judge.py b/skill/data-designer/examples/seed_dataset_with_judge.py new file mode 100644 index 000000000..fba542dac --- /dev/null +++ b/skill/data-designer/examples/seed_dataset_with_judge.py @@ -0,0 +1,116 @@ +"""Clinical Notes Dataset — seed data + judge scoring patterns. + +PATTERN REFERENCE ONLY — copy the structure, not the domain-specific values. + +Demonstrates: +- Default model aliases (nvidia-text) — no manual ModelConfig needed +- LocalFileSeedSource with SamplingStrategy.SHUFFLE +- UUID and DATETIME samplers +- PERSON_FROM_FAKER sampler +- LLMTextColumnConfig referencing seed columns ({{ diagnosis }}, {{ symptoms }}) +- LLMJudgeColumnConfig with two Score rubrics +- preview() + create() workflow +""" + +import data_designer.config as dd +from data_designer.interface import DataDesigner + +data_designer = DataDesigner() + +config_builder = dd.DataDesignerConfigBuilder() + +# --- Seed dataset (columns: diagnosis, symptoms) --- + +seed_source = dd.LocalFileSeedSource(path="medical_conditions.csv") +config_builder.with_seed_dataset(seed_source, sampling_strategy=dd.SamplingStrategy.SHUFFLE) + +# --- Sampler columns --- + +config_builder.add_column( + dd.SamplerColumnConfig( + name="patient", + sampler_type=dd.SamplerType.PERSON_FROM_FAKER, + params=dd.PersonFromFakerSamplerParams(locale="en_US"), + ) +) + +config_builder.add_column( + dd.SamplerColumnConfig( + name="patient_id", + sampler_type=dd.SamplerType.UUID, + params=dd.UUIDSamplerParams(prefix="PT-", short_form=True, uppercase=True), + ) +) + +config_builder.add_column( + 
dd.SamplerColumnConfig( + name="visit_date", + sampler_type=dd.SamplerType.DATETIME, + params=dd.DatetimeSamplerParams(start="2024-01-01", end="2024-12-31"), + ) +) + +# --- LLM text column referencing seed + sampler columns --- + +config_builder.add_column( + dd.LLMTextColumnConfig( + name="clinical_notes", + prompt=( + "You are a physician writing clinical notes for patient " + "{{ patient.first_name }} {{ patient.last_name }} " + "(ID: {{ patient_id }}).\n\n" + "Visit date: {{ visit_date }}\n" + "Diagnosis: {{ diagnosis }}\n" + "Presenting symptoms: {{ symptoms }}\n\n" + "Write detailed clinical notes including assessment and treatment plan." + ), + model_alias="nvidia-text", + ) +) + +# --- LLM judge column with two scoring rubrics --- + +config_builder.add_column( + dd.LLMJudgeColumnConfig( + name="quality_scores", + prompt=( + "Evaluate the following clinical notes:\n\n{{ clinical_notes }}\n\nRate the notes on the criteria below." + ), + scores=[ + dd.Score( + name="Medical Accuracy", + description=( + "Are the assessment and treatment plan consistent with the stated diagnosis and symptoms?" 
+ ), + options={ + 1: "Major inaccuracies or contradictions", + 2: "Some inaccuracies", + 3: "Mostly accurate with minor issues", + 4: "Accurate and clinically sound", + 5: "Exemplary accuracy and clinical reasoning", + }, + ), + dd.Score( + name="Completeness", + description=("Does the note cover history, assessment, and treatment plan?"), + options={ + 1: "Missing most required sections", + 2: "Incomplete — key sections missing", + 3: "Adequate but could be more thorough", + 4: "Thorough with all major sections present", + 5: "Comprehensive and publication-ready", + }, + ), + ], + model_alias="nvidia-text", + ) +) + +# --- Preview then create --- + +preview = data_designer.preview(config_builder, num_records=3) +preview.display_sample_record() + +results = data_designer.create(config_builder, num_records=100, dataset_name="clinical-notes") +dataset = results.load_dataset() +analysis = results.load_analysis() diff --git a/skill/data-designer/examples/structured_and_code.py b/skill/data-designer/examples/structured_and_code.py new file mode 100644 index 000000000..b6f03786c --- /dev/null +++ b/skill/data-designer/examples/structured_and_code.py @@ -0,0 +1,99 @@ +"""Programming Task Dataset — structured output + code generation patterns. + +PATTERN REFERENCE ONLY — copy the structure, not the domain-specific values. 
+ +Demonstrates: +- Default model aliases (nvidia-text) — no manual ModelConfig needed +- Pydantic BaseModel as output_format for LLMStructuredColumnConfig +- Nested field access in downstream prompts ({{ task_spec.function_name }}) +- LLMCodeColumnConfig with CodeLang.PYTHON +- CATEGORY sampler (uniform, no weights) +- preview() + create() workflow +""" + +from pydantic import BaseModel, Field + +import data_designer.config as dd +from data_designer.interface import DataDesigner + +data_designer = DataDesigner() + +config_builder = dd.DataDesignerConfigBuilder() + +# --- Sampler columns --- + +config_builder.add_column( + dd.SamplerColumnConfig( + name="domain", + sampler_type=dd.SamplerType.CATEGORY, + params=dd.CategorySamplerParams( + values=[ + "data processing", + "string manipulation", + "math", + "file I/O", + "API calls", + ], + ), + ) +) + +config_builder.add_column( + dd.SamplerColumnConfig( + name="difficulty", + sampler_type=dd.SamplerType.CATEGORY, + params=dd.CategorySamplerParams(values=["easy", "medium", "hard"]), + ) +) + +# --- Pydantic schema for structured output --- + + +class TaskSpec(BaseModel): + task_description: str = Field(description="What the function should do") + function_name: str = Field(description="Name of the function") + input_params: list[str] = Field(description="List of parameter names") + return_type: str = Field(description="Expected return type") + + +# --- Structured LLM column --- + +config_builder.add_column( + dd.LLMStructuredColumnConfig( + name="task_spec", + prompt=( + "Create a {{ difficulty }} programming task in the {{ domain }} domain. " + "Define a function specification with a clear description, name, " + "parameters, and return type." 
+ ), + output_format=TaskSpec, + model_alias="nvidia-text", + ) +) + +# --- Code generation column (references nested structured fields) --- + +config_builder.add_column( + dd.LLMCodeColumnConfig( + name="solution", + prompt=( + "Write a Python function based on this specification:\n\n" + "Task: {{ task_spec.task_description }}\n" + "Function name: {{ task_spec.function_name }}\n" + "Parameters: {{ task_spec.input_params }}\n" + "Return type: {{ task_spec.return_type }}\n\n" + "Include docstring and type hints." + ), + code_lang=dd.CodeLang.PYTHON, + model_alias="nvidia-text", + ) +) + +# --- Preview then create --- + +preview = data_designer.preview(config_builder, num_records=3) +preview.display_sample_record() + +results = data_designer.create(config_builder, num_records=100, dataset_name="programming-tasks") +dataset = results.load_dataset() +analysis = results.load_analysis() diff --git a/skill/data-designer/hooks/check_data_designer.sh b/skill/data-designer/hooks/check_data_designer.sh new file mode 100755 index 000000000..bce0fb011 --- /dev/null +++ b/skill/data-designer/hooks/check_data_designer.sh @@ -0,0 +1,33 @@ +#!/bin/bash +# ============================================================================= +# Data Designer Context Injection Hook +# ============================================================================= +# This hook provides the agent with context about the data_designer library: +# - Whether the library is installed +# - The installed version +# - The library's location on disk (useful for reading source code) +# ============================================================================= + +# Check if data_designer is installed +if ! uv run python -c "import data_designer" 2>/dev/null; then + echo "=== Data Designer Context ===" + echo "STATUS: NOT INSTALLED" + echo "The data_designer library is not installed in the current environment." 
+ echo "To install, run: uv pip install data-designer" + echo "=============================" + exit 0 +fi + +# Get version and library path +VERSION=$(uv run python -c "import importlib.metadata; print(importlib.metadata.version('data-designer'))" 2>/dev/null) +LIB_PATH=$(uv run python "$(dirname "$0")/../scripts/echo_data_designer_library_path.py" 2>/dev/null) + +# Output formatted context for the agent +echo "=== Data Designer Library ===" +echo "STATUS: Installed" +echo "VERSION: ${VERSION}" +echo "LIBRARY_PATH: ${LIB_PATH}" +echo "" +echo "Use the discovery scripts (get_column_info.py, get_sampler_info.py, etc.)" +echo "to look up API details — prefer these over reading source code directly." +echo "=============================" diff --git a/skill/data-designer/hooks/ruff_lint.sh b/skill/data-designer/hooks/ruff_lint.sh new file mode 100755 index 000000000..1966a377a --- /dev/null +++ b/skill/data-designer/hooks/ruff_lint.sh @@ -0,0 +1,60 @@ +#!/bin/bash +# ============================================================================= +# Ruff Linting Hook +# ============================================================================= +# Runs ruff linting on modified Python files. +# Receives the file path as $1 from Claude Code. +# ============================================================================= + +show_help() { + cat << EOF +Usage: $(basename "$0") [OPTIONS] + +Run ruff linting on a Python file. + +Arguments: + file Path to the Python file to lint + +Options: + -h, --help Show this help message and exit + +Examples: + $(basename "$0") src/main.py + $(basename "$0") --help + +Exit Codes: + 0 No linting issues found + 1 Linting issues detected +EOF +} + +# Parse arguments +if [[ "$1" == "-h" || "$1" == "--help" ]]; then + show_help + exit 0 +fi + +FILE="$1" + +# Show help if no file argument provided +if [[ -z "$FILE" ]]; then + show_help + exit 0 +fi + +# Only process Python files +if [[ ! 
"$FILE" =~ \.py$ ]]; then + exit 0 +fi + +echo "=== Ruff Lint Check ===" +uvx ruff check "$FILE" +EXIT_CODE=$? + +if [ $EXIT_CODE -eq 0 ]; then + echo "No linting issues found." +else + echo "Linting issues detected. Consider running: uvx ruff check --fix $FILE" +fi +echo "=======================" +exit $EXIT_CODE diff --git a/skill/data-designer/hooks/ty_check.sh b/skill/data-designer/hooks/ty_check.sh new file mode 100755 index 000000000..c099d7eb7 --- /dev/null +++ b/skill/data-designer/hooks/ty_check.sh @@ -0,0 +1,58 @@ +#!/bin/bash +# ============================================================================= +# Ty Type Checking Hook +# ============================================================================= +# Runs ty type checking on modified Python files. +# Receives the file path as $1 from Claude Code. +# ============================================================================= + +show_help() { + cat << EOF +Usage: $(basename "$0") [OPTIONS] + +Run ty type checking on a Python file. + +Arguments: + file Path to the Python file to type check + +Options: + -h, --help Show this help message and exit + +Examples: + $(basename "$0") src/main.py + $(basename "$0") --help + +Exit Codes: + 0 No type errors found + 1 Type errors detected +EOF +} + +# Parse arguments +if [[ "$1" == "-h" || "$1" == "--help" ]]; then + show_help + exit 0 +fi + +FILE="$1" + +# Show help if no file argument provided +if [[ -z "$FILE" ]]; then + show_help + exit 0 +fi + +# Only process Python files +if [[ ! "$FILE" =~ \.py$ ]]; then + exit 0 +fi + +echo "=== Ty Type Check ===" +uvx ty check "$FILE" +EXIT_CODE=$? + +if [ $EXIT_CODE -eq 0 ]; then + echo "No type errors found." 
+fi +echo "=====================" +exit $EXIT_CODE diff --git a/skill/data-designer/references/advanced_patterns.md b/skill/data-designer/references/advanced_patterns.md new file mode 100644 index 000000000..0817f8e2a --- /dev/null +++ b/skill/data-designer/references/advanced_patterns.md @@ -0,0 +1,446 @@ +# Data Designer Advanced Patterns + +Advanced usage patterns for `data-designer`. See `references/api_reference.md` for the complete API. + +--- + +## Table of Contents + +1. [Custom Column Generators](#1-custom-column-generators) +2. [MCP Tool Integration](#2-mcp-tool-integration) +3. [Multimodal Inputs](#3-multimodal-inputs) +4. [Schema Transform Processors](#4-schema-transform-processors) +5. [Performance Tuning](#5-performance-tuning) +6. [Multi-Stage Refinement](#6-multi-stage-refinement) +7. [Conditional Sampling](#7-conditional-sampling) +8. [Trace & Reasoning Extraction](#8-trace--reasoning-extraction) +9. [Nemotron Personas](#9-nemotron-personas) +10. [Configuration Serialization](#10-configuration-serialization) + +--- + +## 1. Custom Column Generators + +Use `@custom_column_generator` for logic that can't be expressed with built-in column types. + +### Decorator Signature + +```python +from data_designer.config import custom_column_generator, GenerationStrategy + +@custom_column_generator( + required_columns: list[str] | None = None, # columns this generator reads + side_effect_columns: list[str] | None = None, # extra columns this generator creates + model_aliases: list[str] | None = None, # LLM models needed +) +``` + +### Function Signatures (1-3 args, names matter) + +```python +# 1-arg: row-only (cell_by_cell) or df-only (full_column) +def my_gen(row): ... +def my_gen(df): ... + +# 2-arg: with typed parameters +def my_gen(row, generator_params): ... + +# 3-arg: with LLM access +def my_gen(row, generator_params, models): ... +``` + +First param name determines strategy: `row` -> `CELL_BY_CELL`, `df` -> `FULL_COLUMN`. 
+ +### Writer-Editor Pattern (Multi-Model) + +```python +@custom_column_generator( + required_columns=["topic"], + side_effect_columns=["draft", "critique"], + model_aliases=["writer", "editor"], +) +def writer_editor(row: dict, generator_params: None, models: dict) -> dict: + draft, _ = models["writer"].generate(prompt=f"Write about '{row['topic']}'") + critique, _ = models["editor"].generate(prompt=f"Critique: {draft}") + revised, _ = models["writer"].generate( + prompt=f"Revise based on: {critique}\n\nOriginal: {draft}" + ) + row["final_text"] = revised + row["draft"] = draft + row["critique"] = critique + return row + +config_builder.add_column(dd.CustomColumnConfig( + name="final_text", + generator_function=writer_editor, + generation_strategy=dd.GenerationStrategy.CELL_BY_CELL, +)) +``` + +### With Typed Parameters + +```python +from pydantic import BaseModel + +class FormatConfig(BaseModel): + prefix: str = "Dr." + uppercase: bool = True + +@custom_column_generator(required_columns=["name"]) +def format_name(row, generator_params): + name = f"{generator_params.prefix} {row['name']}" + return name.upper() if generator_params.uppercase else name + +config_builder.add_column(dd.CustomColumnConfig( + name="formal_name", + generator_function=format_name, + generator_params=FormatConfig(prefix="Prof."), +)) +``` + +### Full-Column Strategy (Vectorized) + +```python +@custom_column_generator(required_columns=["score"]) +def normalize_scores(df): + return (df["score"] - df["score"].min()) / (df["score"].max() - df["score"].min()) + +config_builder.add_column(dd.CustomColumnConfig( + name="normalized_score", + generator_function=normalize_scores, + generation_strategy=dd.GenerationStrategy.FULL_COLUMN, +)) +``` + +### Testing Custom Generators + +Test generators outside the full pipeline: + +```python +data_designer = DataDesigner() +models = data_designer.get_models(["nvidia-text"]) +result = my_generator({"topic": "AI"}, None, models) +``` + +--- + +## 2. 
MCP Tool Integration + +### Self-Contained Server + Client Pattern + +A common pattern is a single script that acts as both MCP server and Data Designer client: + +```python +import sys +from mcp.server.fastmcp import FastMCP + +mcp_server = FastMCP("my-tools") + +@mcp_server.tool() +def search_docs(query: str) -> str: + """Search documents by keyword.""" + # ... implementation + return results + +if __name__ == "__main__": + if len(sys.argv) > 1 and sys.argv[1] == "serve": + mcp_server.run() + else: + import data_designer.config as dd + from data_designer.interface import DataDesigner + + provider = dd.LocalStdioMCPProvider( + name="my-tools", + command=sys.executable, + args=[__file__, "serve"], + ) + + tool_config = dd.ToolConfig( + tool_alias="search", + providers=["my-tools"], + allow_tools=["search_docs"], + max_tool_call_turns=10, + timeout_sec=30.0, + ) + + data_designer = DataDesigner(mcp_providers=[provider]) + config_builder = dd.DataDesignerConfigBuilder(tool_configs=[tool_config]) + + config_builder.add_column(dd.LLMTextColumnConfig( + name="answer", + prompt="Use tools to find and answer: {{ question }}", + model_alias="nvidia-text", + tool_alias="search", + system_prompt="You have access to search tools. Use them to find information.", + with_trace=dd.TraceType.ALL_MESSAGES, + )) +``` + +### Remote MCP Provider + +```python +mcp_provider = dd.MCPProvider( + name="remote-tools", + endpoint="https://mcp.example.com/sse", + api_key="REMOTE_API_KEY", +) +``` + +--- + +## 3. 
Multimodal Inputs + +### Image Context for Vision Models + +```python +# From URL column +dd.LLMTextColumnConfig( + name="description", + prompt="Describe this image in detail.", + model_alias="nvidia-vision", + multi_modal_context=[ + dd.ImageContext( + column_name="image_url", + data_type=dd.ModalityDataType.URL, + ) + ], +) + +# From Base64 column +dd.LLMTextColumnConfig( + name="summary", + prompt="Summarize this document.", + model_alias="nvidia-vision", + multi_modal_context=[ + dd.ImageContext( + column_name="base64_image", + data_type=dd.ModalityDataType.BASE64, + image_format=dd.ImageFormat.PNG, # required for base64 + ) + ], +) +``` + +Image formats: `PNG`, `JPG`/`JPEG`, `GIF`, `WEBP` + +Multiple images: column can contain a JSON array of URLs. + +--- + +## 4. Schema Transform Processors + +Create additional output datasets with transformed schemas. The original dataset passes through unchanged. + +### Chat Format Transform + +```python +config_builder.add_processor(dd.SchemaTransformProcessorConfig( + name="openai_chat", + template={ + "messages": [ + {"role": "system", "content": "{{ system_prompt }}"}, + {"role": "user", "content": "{{ question }}"}, + {"role": "assistant", "content": "{{ answer }}"}, + ], + "metadata": { + "category": "{{ category | upper }}", + "difficulty": "{{ difficulty }}", + }, + }, +)) + +# Load transformed data +transformed = results.load_processor_dataset("openai_chat") +``` + +### Multi-Turn Chat Transform + +```python +config_builder.add_processor(dd.SchemaTransformProcessorConfig( + name="multi_turn", + template={ + "conversations": [ + {"from": "human", "value": "{{ turn_1_user }}"}, + {"from": "gpt", "value": "{{ turn_1_assistant }}"}, + {"from": "human", "value": "{{ turn_2_user }}"}, + {"from": "gpt", "value": "{{ turn_2_assistant }}"}, + ], + }, +)) +``` + +--- + +## 5. 
Performance Tuning + +### Key Parameters + +| Parameter | Default | Tune When | +|-----------|---------|-----------| +| `buffer_size` | 1000 | Memory issues -> lower; faster batch cycling -> lower | +| `max_parallel_requests` | 4 | Low GPU util -> increase (self-hosted: try 256-1024) | +| `max_conversation_restarts` | 5 | Strict schemas -> increase to 7+ | +| `max_conversation_correction_steps` | 0 | Schema conformance issues -> set to 2-3 | +| `disable_early_shutdown` | False | Debugging -> set True | +| `non_inference_max_parallel_workers` | 4 | Many non-LLM columns -> increase | + +### Concurrency Formula + +``` +concurrent_requests = min(buffer_size, max_parallel_requests, remaining_cells) +``` + +### Execution Model + +1. Dataset split into batches of `buffer_size` +2. Within a batch, columns processed **sequentially** (DAG order) +3. Within a column, cells processed **in parallel** (up to limit) + +### Benchmarking + +Run 100 records with increasing `max_parallel_requests` (4 -> 8 -> 16 -> 32 -> ...). Stop when runtime plateaus. + +```python +from data_designer.config import RunConfig + +data_designer.set_run_config(RunConfig( + buffer_size=500, + max_conversation_restarts=7, + max_conversation_correction_steps=2, +)) +``` + +--- + +## 6. 
Multi-Stage Refinement + +Chain LLM columns for iterative improvement: + +```python +# Stage 1: Generate draft +config_builder.add_column(dd.LLMTextColumnConfig( + name="draft", prompt="Write an article about {{ topic }}.", model_alias="nvidia-text", +)) + +# Stage 2: Critique +config_builder.add_column(dd.LLMStructuredColumnConfig( + name="critique", + prompt="Identify 3 improvements for:\n\n{{ draft }}", + output_format=CritiqueSchema, + model_alias="nvidia-reasoning", +)) + +# Stage 3: Refine +config_builder.add_column(dd.LLMTextColumnConfig( + name="final_article", + prompt="Improve based on feedback:\n\nDraft: {{ draft }}\nFeedback: {{ critique.suggestions }}", + model_alias="nvidia-text", +)) + +# Stage 4: Judge +config_builder.add_column(dd.LLMJudgeColumnConfig( + name="quality", + prompt="Rate this article:\n\n{{ final_article }}", + scores=[dd.Score(name="Quality", description="...", options={1: "Poor", ..., 5: "Excellent"})], + model_alias="nvidia-text", +)) +``` + +--- + +## 7. Conditional Sampling + +Override sampler params based on other column values: + +```python +dd.SamplerColumnConfig( + name="review_style", + sampler_type=dd.SamplerType.CATEGORY, + params=dd.CategorySamplerParams( + values=["brief", "detailed", "rambling"], + weights=[0.4, 0.4, 0.2], + ), + conditional_params={ + "age_group == '18-25'": dd.CategorySamplerParams(values=["rambling"]), + "age_group == '65+'": dd.CategorySamplerParams(values=["detailed"]), + }, +) +``` + +--- + +## 8. Trace & Reasoning Extraction + +### Trace Types + +- `TraceType.NONE` (default): No trace +- `TraceType.LAST_MESSAGE`: Only final response -> `{name}__trace` +- `TraceType.ALL_MESSAGES`: Full conversation -> `{name}__trace` + +### Reasoning Content + +`extract_reasoning_content=True` creates `{name}__reasoning_content` with chain-of-thought. + +Available on all LLM column types. 
+ +### Use Cases + +- **Debugging**: `ALL_MESSAGES` to see full conversation including tool calls +- **Fine-tuning data**: `extract_reasoning_content=True` for clean reasoning extraction +- **Tool-use training**: `ALL_MESSAGES` to capture tool call patterns + +--- + +## 9. Nemotron Personas + +Census-grounded synthetic person data with rich personality traits. + +### Setup + +```bash +data-designer download personas --locale en_US +``` + +### Usage + +```python +dd.SamplerColumnConfig( + name="customer", + sampler_type=dd.SamplerType.PERSON, + params=dd.PersonSamplerParams( + locale="en_US", + sex="Female", + age_range=[25, 45], + with_synthetic_personas=True, # personality traits, cultural backgrounds + select_field_values={"state": ["NY", "CA"]}, + ), +) +``` + +### Available Locales + +`en_US`, `en_IN`, `en_SG`, `hi_Deva_IN`, `hi_Latn_IN`, `ja_JP`, `pt_BR` + +### Persona Fields + +Big Five personality traits, cultural backgrounds, skills, hobbies, career goals, plus domain-specific personas (professional, financial, healthcare, sports, arts, travel, culinary). + +--- + +## 10. Configuration Serialization + +### Save + +```python +config_builder.write_config("my_config.yaml") +config_builder.write_config("my_config.json", indent=2) +``` + +Note: `DataFrameSeedSource` is not serializable. Use `LocalFileSeedSource` for shareable configs. + +### Load + +```python +loaded = dd.DataDesignerConfigBuilder.from_config("my_config.yaml") +loaded = dd.DataDesignerConfigBuilder.from_config({"columns": [...]}) +``` diff --git a/skill/data-designer/references/api_reference.md b/skill/data-designer/references/api_reference.md new file mode 100644 index 000000000..962762f1e --- /dev/null +++ b/skill/data-designer/references/api_reference.md @@ -0,0 +1,557 @@ +# Data Designer API Reference + +Complete API reference for `data-designer` v0.5.x. All classes imported via `import data_designer.config as dd` unless noted. + +--- + +## Table of Contents + +1. 
[Core Classes](#1-core-classes) +2. [Column Types](#2-column-types) +3. [Sampler Types](#3-sampler-types) +4. [Model Configuration](#4-model-configuration) +5. [Seed Datasets](#5-seed-datasets) +6. [Constraints](#6-constraints) +7. [Processors](#7-processors) +8. [Validators](#8-validators) +9. [MCP / Tool Configuration](#9-mcp--tool-configuration) +10. [RunConfig](#10-runconfig) +11. [Profilers](#11-profilers) +12. [Results](#12-results) +13. [Default Model Aliases](#13-default-model-aliases) + +--- + +## 1. Core Classes + +### DataDesigner + +```python +from data_designer.interface import DataDesigner + +DataDesigner( + artifact_path: Path | str | None = None, # default: ./artifacts + model_providers: list[ModelProvider] | None = None, # default: nvidia + openai + openrouter + secret_resolver: SecretResolver | None = None, + seed_readers: list[SeedReader] | None = None, + managed_assets_path: Path | str | None = None, # default: ~/.data-designer/managed-assets + mcp_providers: list[MCPProviderT] | None = None, +) +``` + +| Method | Returns | Description | +|--------|---------|-------------| +| `preview(config_builder, *, num_records=10)` | `PreviewResults` | In-memory preview | +| `create(config_builder, *, num_records=10, dataset_name="dataset")` | `DatasetCreationResults` | Full generation + save | +| `validate(config_builder)` | `None` | Validate without generating | +| `set_run_config(run_config)` | `None` | Set runtime parameters | +| `get_models(model_aliases)` | `dict[str, ModelFacade]` | Get model facades for custom columns | +| `get_default_model_configs()` | `list[ModelConfig]` | List default model configs | +| `get_default_model_providers()` | `list[ModelProvider]` | List default providers | +| `info` | `InterfaceInfo` | Property: `dd.info.display("model_providers")` | + +### DataDesignerConfigBuilder + +```python +dd.DataDesignerConfigBuilder( + model_configs: list[ModelConfig] | str | Path | None = None, # None = use defaults + tool_configs: 
list[ToolConfig] | None = None, +) +``` + +| Method | Description | +|--------|-------------| +| `add_column(column_config)` | Add a column config | +| `delete_column(column_name)` | Remove a column | +| `get_column_config(name)` | Get column by name | +| `get_column_configs()` | Get all columns | +| `get_columns_of_type(column_type)` | Filter by type | +| `add_model_config(model_config)` | Add model config | +| `delete_model_config(alias)` | Remove model config | +| `add_tool_config(tool_config)` | Add MCP tool config | +| `delete_tool_config(alias)` | Remove tool config | +| `add_constraint(constraint)` | Add sampler constraint | +| `delete_constraints(target_column)` | Remove constraints | +| `add_processor(processor_config)` | Add processor | +| `add_profiler(profiler_config)` | Add profiler | +| `with_seed_dataset(seed_source, *, sampling_strategy=ORDERED, selection_strategy=None)` | Set seed data | +| `build()` | Build final config | +| `write_config(path, indent=2)` | Serialize to YAML/JSON | +| `from_config(config)` | Class method: load from file/dict | +| `info` | Property: `builder.info.display("samplers")` | +| `allowed_references` | Property: list of referenceable column names | + +--- + +## 2. 
Column Types + +### SamplerColumnConfig + +```python +dd.SamplerColumnConfig( + name: str, + sampler_type: dd.SamplerType, + params: SamplerParamsT, + conditional_params: dict[str, SamplerParamsT] = {}, # condition -> override params + convert_to: str | None = None, # "int", "float", "str" + drop: bool = False, +) +``` + +### LLMTextColumnConfig + +```python +dd.LLMTextColumnConfig( + name: str, + prompt: str, # Jinja2 template + model_alias: str, + system_prompt: str | None = None, + tool_alias: str | None = None, # MCP tool reference + with_trace: dd.TraceType = TraceType.NONE, # NONE, LAST_MESSAGE, ALL_MESSAGES + extract_reasoning_content: bool = False, # -> {name}__reasoning_content + multi_modal_context: list[dd.ImageContext] | None = None, + drop: bool = False, +) +``` + +### LLMCodeColumnConfig + +Extends LLMTextColumnConfig with: +```python +dd.LLMCodeColumnConfig( + ..., # all LLMTextColumnConfig fields + code_lang: dd.CodeLang, # PYTHON, JAVASCRIPT, SQL_POSTGRES, etc. +) +``` + +**CodeLang values**: BASH, C, COBOL, CPP, CSHARP, GO, JAVA, JAVASCRIPT, KOTLIN, PYTHON, RUBY, RUST, SCALA, SWIFT, TYPESCRIPT, SQL_SQLITE, SQL_TSQL, SQL_BIGQUERY, SQL_MYSQL, SQL_POSTGRES, SQL_ANSI + +### LLMStructuredColumnConfig + +```python +dd.LLMStructuredColumnConfig( + ..., # all LLMTextColumnConfig fields + output_format: dict | type[BaseModel], # Pydantic model or JSON schema dict +) +``` + +Access nested fields in downstream prompts: `{{ column_name.field_name }}` + +### LLMJudgeColumnConfig + +```python +dd.LLMJudgeColumnConfig( + ..., # all LLMTextColumnConfig fields + scores: list[dd.Score], +) + +dd.Score( + name: str, + description: str, + options: dict[int | str, str], # score_value -> description +) +``` + +### ExpressionColumnConfig + +```python +dd.ExpressionColumnConfig( + name: str, + expr: str, # Jinja2 expression + dtype: str = "str", # "int", "float", "str", "bool" + drop: bool = False, +) +``` + +### EmbeddingColumnConfig + +```python 
+dd.EmbeddingColumnConfig( + name: str, + target_column: str, # column to embed (text or JSON list of texts) + model_alias: str, # embedding model alias + drop: bool = False, +) +``` + +### ValidationColumnConfig + +```python +dd.ValidationColumnConfig( + name: str, + target_columns: list[str], + validator_type: dd.ValidatorType, # CODE, LOCAL_CALLABLE, REMOTE + validator_params: ValidatorParamsT, + batch_size: int = 10, + drop: bool = False, +) +``` + +### CustomColumnConfig + +```python +dd.CustomColumnConfig( + name: str, + generator_function: Any, # decorated with @custom_column_generator + generation_strategy: dd.GenerationStrategy = GenerationStrategy.CELL_BY_CELL, + generator_params: BaseModel | None = None, + drop: bool = False, +) +``` + +--- + +## 3. Sampler Types + +| SamplerType | Params Class | Key Params | +|-------------|--------------|------------| +| `CATEGORY` | `CategorySamplerParams` | `values: list`, `weights: list[float] | None` | +| `SUBCATEGORY` | `SubcategorySamplerParams` | `category: str` (parent column), `values: dict[str, list]` | +| `PERSON` | `PersonSamplerParams` | `locale`, `sex`, `city`, `age_range`, `with_synthetic_personas`, `select_field_values` | +| `PERSON_FROM_FAKER` | `PersonFromFakerSamplerParams` | `locale`, `sex`, `city`, `age_range` | +| `UUID` | `UUIDSamplerParams` | `prefix`, `short_form`, `uppercase` | +| `DATETIME` | `DatetimeSamplerParams` | `start`, `end`, `unit` (Y/M/D/h/m/s) | +| `TIMEDELTA` | `TimeDeltaSamplerParams` | `reference_column_name`, `dt_min`, `dt_max`, `unit` | +| `UNIFORM` | `UniformSamplerParams` | `low`, `high`, `decimal_places` | +| `GAUSSIAN` | `GaussianSamplerParams` | `mean`, `stddev`, `decimal_places` | +| `POISSON` | `PoissonSamplerParams` | `mean` | +| `BINOMIAL` | `BinomialSamplerParams` | `n`, `p` | +| `BERNOULLI` | `BernoulliSamplerParams` | `p` | +| `BERNOULLI_MIXTURE` | `BernoulliMixtureSamplerParams` | `p`, `dist_name`, `dist_params` | +| `SCIPY` | `ScipySamplerParams` | `dist_name`, 
`dist_params`, `decimal_places` | + +### Person Object Fields + +Access via `{{ person.field_name }}`: +- `first_name`, `last_name`, `full_name` +- `age`, `birth_date`, `sex` +- `email`, `phone` +- `city`, `state`, `country`, `address` + +**Nemotron Personas** (SamplerType.PERSON only): +- Supported locales: `en_US`, `en_IN`, `en_SG`, `hi_Deva_IN`, `hi_Latn_IN`, `ja_JP`, `pt_BR` +- Download: `data-designer download personas --locale en_US` +- Extra fields: Big Five personality traits, cultural backgrounds, domain-specific personas +- Filtering: `select_field_values={"state": ["NY", "CA"]}` + +--- + +## 4. Model Configuration + +### ModelConfig + +```python +dd.ModelConfig( + alias: str, # reference name for columns + model: str, # model identifier + provider: str | None = None, + inference_parameters: ChatCompletionInferenceParams | EmbeddingInferenceParams, + skip_health_check: bool = False, +) +``` + +### ChatCompletionInferenceParams + +```python +dd.ChatCompletionInferenceParams( + temperature: float | DistributionT | None, + top_p: float | DistributionT | None, + max_tokens: int | None, + max_parallel_requests: int = 4, + timeout: int | None, + extra_body: dict | None, +) +``` + +### Temperature/Top-p Distributions + +For diversity, use distributions instead of fixed values: + +```python +# Uniform +dd.UniformDistribution(params=dd.UniformDistributionParams(low=0.5, high=1.0)) + +# Manual (discrete) +dd.ManualDistribution(params=dd.ManualDistributionParams( + values=[0.8, 0.9, 1.0], weights=[0.2, 0.5, 0.3] +)) +``` + +### EmbeddingInferenceParams + +```python +dd.EmbeddingInferenceParams( + encoding_format: str = "float", # "float" or "base64" + dimensions: int | None = None, + max_parallel_requests: int = 4, +) +``` + +### ModelProvider + +```python +dd.ModelProvider( + name: str, + endpoint: str, + provider_type: str = "openai", # API format + api_key: str | None = None, + extra_body: dict | None = None, + extra_headers: dict | None = None, +) +``` + +--- + 
+## 5. Seed Datasets + +### Sources + +```python +# Local file (CSV, Parquet, JSON/JSONL, XLSX; supports wildcards) +dd.LocalFileSeedSource(path="data/*.parquet") +dd.LocalFileSeedSource.from_dataframe(df, path="saved.parquet") + +# HuggingFace +dd.HuggingFaceSeedSource(path="datasets/user/name/data/*.parquet", token="hf_xxx") + +# In-memory DataFrame (not serializable to YAML/JSON) +dd.DataFrameSeedSource(df=my_dataframe) +``` + +### Sampling & Selection Strategies + +```python +config_builder.with_seed_dataset( + seed_source, + sampling_strategy=dd.SamplingStrategy.SHUFFLE, # ORDERED (default) or SHUFFLE + selection_strategy=dd.IndexRange(start=0, end=99), # or PartitionBlock(index=0, num_partitions=10) +) +``` + +--- + +## 6. Constraints + +```python +# Column vs scalar +dd.ScalarInequalityConstraint(target_column="age", operator=dd.InequalityOperator.GE, rhs=18) + +# Column vs column +dd.ColumnInequalityConstraint(target_column="end_date", operator=dd.InequalityOperator.GT, rhs="start_date") +``` + +Operators: `LT`, `LE`, `GT`, `GE` + +Only for numerical sampler columns. + +--- + +## 7. Processors + +Run at `BuildStage.POST_BATCH` after column generation. + +### DropColumnsProcessorConfig + +```python +dd.DropColumnsProcessorConfig( + name: str, + column_names: list[str], +) +``` + +Dropped columns saved separately in `dropped-columns/` directory. + +### SchemaTransformProcessorConfig + +```python +dd.SchemaTransformProcessorConfig( + name: str, + template: dict[str, Any], # keys = new column names, values = Jinja2 templates +) +``` + +Creates an **additional** dataset alongside the original. Output in `processors-outputs/{name}/`. + +Example: +```python +dd.SchemaTransformProcessorConfig( + name="chat_format", + template={ + "messages": [ + {"role": "user", "content": "{{ question }}"}, + {"role": "assistant", "content": "{{ answer }}"}, + ], + "metadata": {"category": "{{ category | upper }}"}, + }, +) +``` + +--- + +## 8. 
Validators + +### CodeValidatorParams + +```python +dd.CodeValidatorParams(code_lang=dd.CodeLang.PYTHON) +``` + +Python: uses Ruff. Returns `is_valid`, `python_linter_score` (0-10), `python_linter_severity`, `python_linter_messages`. + +SQL: uses SQLFluff. Dialects: SQL_POSTGRES, SQL_ANSI, SQL_MYSQL, SQL_SQLITE, SQL_TSQL, SQL_BIGQUERY. Returns `is_valid`, `error_messages`. + +### LocalCallableValidatorParams + +```python +dd.LocalCallableValidatorParams( + validation_function: Callable[[pd.DataFrame], pd.DataFrame], # must return df with is_valid column + output_schema: dict | None = None, +) +``` + +### RemoteValidatorParams + +```python +dd.RemoteValidatorParams( + endpoint_url: str, + output_schema: dict | None = None, + timeout: float = 30.0, + max_retries: int = 3, + retry_backoff: float = 2.0, + max_parallel_requests: int = 4, +) +``` + +### Batch Size Recommendations + +- Code validators: 5-20 +- Local callable: 10-50 +- Remote validators: 1-10 + +--- + +## 9. MCP / Tool Configuration + +### Providers + +```python +# Local subprocess (stdio transport) +dd.LocalStdioMCPProvider( + name: str, + command: str, # e.g., "python" + args: list[str] | None, # e.g., ["-m", "my_mcp_server"] + env: dict[str, str] | None, +) + +# Remote SSE +dd.MCPProvider( + name: str, + endpoint: str, # e.g., "http://localhost:8080/sse" + api_key: str | None, +) +``` + +### ToolConfig + +```python +dd.ToolConfig( + tool_alias: str, + providers: list[str], # MCP provider names + allow_tools: list[str] | None, # allowlist (None = all tools) + max_tool_call_turns: int = 5, + timeout_sec: float | None = None, +) +``` + +### Usage + +```python +data_designer = DataDesigner(mcp_providers=[mcp_provider]) +config_builder = dd.DataDesignerConfigBuilder(tool_configs=[tool_config]) + +config_builder.add_column(dd.LLMTextColumnConfig( + name="answer", + prompt="Use tools to answer: {{ question }}", + model_alias="nvidia-text", + tool_alias="my-tools", +)) +``` + +--- + +## 10. 
RunConfig + +```python +dd.RunConfig( + buffer_size: int = 1000, # records per batch + disable_early_shutdown: bool = False, + shutdown_error_rate: float = 0.5, # 0.0-1.0 + shutdown_error_window: int = 10, # min tasks before monitoring + non_inference_max_parallel_workers: int = 4, + max_conversation_restarts: int = 5, # full restarts on failure + max_conversation_correction_steps: int = 0, # in-conversation corrections +) +``` + +Apply: `data_designer.set_run_config(RunConfig(...))` + +--- + +## 11. Profilers + +```python +dd.JudgeScoreProfilerConfig( + model_alias: str, + summary_score_sample_size: int | None = 20, +) +``` + +Add: `config_builder.add_profiler(dd.JudgeScoreProfilerConfig(model_alias="nvidia-text"))` + +--- + +## 12. Results + +### PreviewResults + +```python +preview.dataset # pd.DataFrame | None +preview.analysis # DatasetProfilerResults | None +preview.processor_artifacts # dict | None +preview.display_sample_record() +``` + +### DatasetCreationResults + +```python +results.load_dataset() # pd.DataFrame +results.load_analysis() # DatasetProfilerResults +results.load_processor_dataset(processor_name) # pd.DataFrame +results.get_path_to_processor_artifacts(name) # Path +``` + +### DatasetProfilerResults + +```python +analysis.num_records +analysis.percent_complete +analysis.column_statistics # list of column stats +analysis.to_report(save_path=None) # Rich console report (HTML/SVG save) +``` + +--- + +## 13. Default Model Aliases + +Auto-configured per provider. No manual `ModelConfig` needed. 
"""Print the absolute filesystem path to the installed data_designer package.

This script is used by agent hooks and other tools to inject context about the
data_designer library's source location for inspection or source exploration.

Usage:
    uv run echo_data_designer_library_path.py

Example output:
    /home/user/.venv/lib/python3.10/site-packages/data_designer
"""

import sys
from pathlib import Path


def main() -> None:
    """Print the data_designer library path and exit.

    The path is resolved by importing ``data_designer.config`` and walking two
    levels up from its ``__init__.py`` (config/ -> data_designer/). Exits with
    status 1 when the package is not importable.
    """
    try:
        import data_designer.config as dd
    except ImportError:
        print(
            "Error: data_designer is not installed in the current environment.",
            file=sys.stderr,
        )
        sys.exit(1)
    else:
        print(Path(dd.__file__).parent.parent)


if __name__ == "__main__":
    main()
"""Print LLM-optimized context for data designer column types.

Usage:
    uv run get_column_info.py # Print help
    uv run get_column_info.py all # Print all column types
    uv run get_column_info.py llm-text # Print specific column type
    uv run get_column_info.py --list # List column types and config classes
"""

import inspect
from enum import Enum
from typing import Literal, get_args, get_origin

from helpers.pydantic_info_utils import run_cli

import data_designer.config as dd

# Default descriptions for common fields that lack docstrings.
# These are derived from inspecting how fields are used in the codebase.
DEFAULT_FIELD_DESCRIPTIONS: dict[str, str] = {
    # Common base fields
    "name": "Unique column name in the generated dataset",
    "drop": "If True, exclude this column from the final dataset output",
    "column_type": "Discriminator identifying the column configuration type",
    # LLM-related fields
    "prompt": "Jinja2 template for the LLM prompt; can reference other columns via {{ column_name }}",
    "model_alias": "Reference to a ModelConfig.alias defined in the config builder",
    "system_prompt": "Optional system prompt to set LLM behavior and context",
    "multi_modal_context": "Optional list of ImageContext for vision model inputs",
    "tool_alias": "Optional reference to a ToolConfig.tool_alias for MCP tool access",
    "with_trace": "Trace capture mode: NONE, LAST_MESSAGE, or ALL_MESSAGES",
    "extract_reasoning_content": "If True, capture chain-of-thought in {name}__reasoning_content column",
    # LLM type-specific fields
    "code_lang": "Target programming language for code extraction from LLM response",
    "scores": "List of Score objects defining rubric criteria for LLM judge evaluation",
    "output_format": "Pydantic model or JSON schema dict defining the structured output shape",
    # Sampler fields
    "sampler_type": "Type of statistical sampler to use (e.g., CATEGORY, UNIFORM, PERSON)",
    "params": "Sampler-specific parameters (e.g., CategorySamplerParams, UniformSamplerParams)",
    "conditional_params": "Override params based on conditions referencing other columns",
    "convert_to": "Optional type cast for sampled values: 'int', 'float', or 'str'",
    # Expression fields
    "expr": "Jinja2 expression to compute the column value from other columns",
    "dtype": "Data type for expression result: 'int', 'float', 'str', or 'bool'",
    # Embedding fields
    "target_column": "Name of the text column to generate embeddings for",
    # Validation fields
    "target_columns": "List of column names to validate",
    "validator_type": "Validation method: CODE, LOCAL_CALLABLE, or REMOTE",
    "validator_params": "Validator-specific parameters (e.g., CodeValidatorParams)",
}


def discover_column_configs() -> dict[str, type]:
    """Dynamically discover all ColumnConfig classes from data_designer.config.

    A class qualifies when its name ends with ``ColumnConfig``, it is a Pydantic
    model (has ``model_fields``), and it declares a ``column_type`` field whose
    annotation is a ``Literal`` discriminator.

    Returns:
        Dict mapping column_type values (e.g., 'llm-text') to their config classes.
    """
    column_configs: dict[str, type] = {}
    for name in dir(dd):
        if not name.endswith("ColumnConfig"):
            continue
        obj = getattr(dd, name)
        if not (inspect.isclass(obj) and hasattr(obj, "model_fields")):
            continue
        if "column_type" not in obj.model_fields:
            continue
        annotation = obj.model_fields["column_type"].annotation
        if get_origin(annotation) is not Literal:
            continue
        args = get_args(annotation)
        if args:
            # Literal values may be Enum members; normalize to the raw value
            # for consistency with discover_processor_configs().
            key = args[0].value if isinstance(args[0], Enum) else args[0]
            column_configs[key] = obj
    return column_configs


def main() -> None:
    """CLI entry point: dispatch to the shared run_cli helper."""
    run_cli(
        discover_fn=discover_column_configs,
        type_key="column_type",
        type_label="column_type",
        class_label="config_class",
        default_descriptions=DEFAULT_FIELD_DESCRIPTIONS,
        script_name="get_column_info.py",
        description="Print LLM-optimized context for data designer column types.",
        header_title="Data Designer Column Types Reference",
        examples=[
            "uv run get_column_info.py all # Print all column types",
            "uv run get_column_info.py llm-text # Print specific column type",
            "uv run get_column_info.py --list # List column types and config classes",
        ],
        case_insensitive=False,
        uppercase_value=False,
    )


if __name__ == "__main__":
    main()
"""Print LLM-optimized context for data designer processor types.

Usage:
    uv run get_processor_info.py # Print help
    uv run get_processor_info.py all # Print all processor types
    uv run get_processor_info.py drop_columns # Print specific processor type
    uv run get_processor_info.py --list # List processor types and config classes
"""

import inspect
from enum import Enum
from typing import Literal, get_args, get_origin

from helpers.pydantic_info_utils import run_cli

import data_designer.config as dd

DEFAULT_FIELD_DESCRIPTIONS: dict[str, str] = {
    "name": "Unique processor name, used for artifact paths on disk",
    "build_stage": "Processing stage: currently only POST_BATCH is supported",
    "processor_type": "Discriminator identifying the processor type",
    "column_names": "List of column names to drop from the output dataset",
    "template": "Dict mapping new column names to Jinja2 template values (supports nested JSON)",
}


def discover_processor_configs() -> dict[str, type]:
    """Dynamically discover all ProcessorConfig classes from data_designer.config.

    Returns:
        Dict mapping processor_type values to their config classes.
    """
    configs: dict[str, type] = {}
    # Candidates: concrete *ProcessorConfig names, skipping the abstract base.
    candidates = (n for n in dir(dd) if n.endswith("ProcessorConfig") and n != "ProcessorConfig")
    for attr_name in candidates:
        candidate = getattr(dd, attr_name)
        if not inspect.isclass(candidate) or not hasattr(candidate, "model_fields"):
            continue
        if "processor_type" not in candidate.model_fields:
            continue
        annotation = candidate.model_fields["processor_type"].annotation
        if get_origin(annotation) is not Literal:
            continue
        literal_args = get_args(annotation)
        if not literal_args:
            continue
        first = literal_args[0]
        # Literal values may be Enum members; normalize to the raw value.
        key = first.value if isinstance(first, Enum) else first
        configs[key] = candidate
    return configs


def main() -> None:
    """CLI entry point: dispatch to the shared run_cli helper."""
    run_cli(
        discover_fn=discover_processor_configs,
        type_key="processor_type",
        type_label="processor_type",
        class_label="config_class",
        default_descriptions=DEFAULT_FIELD_DESCRIPTIONS,
        script_name="get_processor_info.py",
        description="Print LLM-optimized context for data designer processor types.",
        header_title="Data Designer Processor Types Reference",
        examples=[
            "uv run get_processor_info.py all # Print all processor types",
            "uv run get_processor_info.py drop_columns # Print specific processor type",
            "uv run get_processor_info.py --list # List processor types and config classes",
        ],
        case_insensitive=True,
        uppercase_value=False,
    )


if __name__ == "__main__":
    main()
"""Print LLM-optimized context for data designer sampler types.

Usage:
    uv run get_sampler_info.py # Print help
    uv run get_sampler_info.py all # Print all sampler types
    uv run get_sampler_info.py category # Print specific sampler type
    uv run get_sampler_info.py UNIFORM # Case-insensitive lookup
    uv run get_sampler_info.py --list # List sampler types and params classes
"""

import inspect
import sys
from enum import Enum

from helpers.pydantic_info_utils import run_cli

import data_designer.config as dd

# Default descriptions for common sampler param fields that lack docstrings.
# These are derived from inspecting how fields are used in the codebase.
DEFAULT_FIELD_DESCRIPTIONS: dict[str, str] = {
    # Category sampler
    "values": "List of categorical values to sample from",
    "weights": "Optional sampling weights (probabilities) for each value",
    # Subcategory sampler
    "category": "Parent category column name for hierarchical sampling",
    # Person sampler
    "locale": "Locale for generating names (e.g., 'en_US')",
    "sex": "Filter by sex: 'Male' or 'Female'",
    "city": "Filter by city or list of cities",
    "age_range": "[min, max] age range for filtering",
    "with_synthetic_personas": "Include personality traits from Nemotron Personas",
    # UUID sampler
    "prefix": "Optional prefix string for generated UUIDs",
    "short_form": "Truncate UUID to 8 characters",
    "uppercase": "Use uppercase letters in UUID",
    # Datetime sampler
    "start": "Start date/datetime string (e.g., '2024-01-01')",
    "end": "End date/datetime string (e.g., '2024-12-31')",
    "unit": "Time unit: 'Y' (year), 'M' (month), 'D' (day), 'h', 'm', 's'",
    # TimeDelta sampler
    "reference_column_name": "Column name to compute time offset from",
    "dt_min": "Minimum time offset value",
    "dt_max": "Maximum time offset value",
    # Numeric samplers
    "low": "Minimum value (inclusive)",
    "high": "Maximum value (exclusive)",
    "mean": "Mean of the distribution",
    "stddev": "Standard deviation of the distribution",
    "decimal_places": "Number of decimal places for rounding output",
    "n": "Number of trials (for binomial distribution)",
    "p": "Probability of success (0.0 to 1.0)",
    # Scipy sampler
    "dist_name": "Name of scipy.stats distribution (e.g., 'expon', 'gamma')",
    "dist_params": "Dictionary of distribution parameters (e.g., {'scale': 5.0})",
}


def discover_sampler_types() -> dict[str, type]:
    """Dynamically discover all sampler types and their param classes from data_designer.config.

    Exits with status 1 if the SamplerType enum cannot be found.

    Returns:
        Dict mapping sampler_type values (e.g., 'category') to their params classes.
    """
    # Find SamplerType enum
    sampler_type_enum = getattr(dd, "SamplerType", None)
    if sampler_type_enum is None or not issubclass(sampler_type_enum, Enum):
        print(
            "Error: Could not find SamplerType enum in data_designer.config",
            file=sys.stderr,
        )
        sys.exit(1)

    # Index all SamplerParams classes by their lowercased class-name prefix:
    # CategorySamplerParams -> 'category', PersonFromFakerSamplerParams -> 'personfromfaker'.
    params_classes: dict[str, type] = {}
    for name in dir(dd):
        if name.endswith("SamplerParams"):
            obj = getattr(dd, name)
            if inspect.isclass(obj) and hasattr(obj, "model_fields"):
                params_classes[name.replace("SamplerParams", "").lower()] = obj

    # Match each enum member to its params class by lowercasing the member name
    # and stripping underscores (PERSON_FROM_FAKER -> 'personfromfaker').
    # Members without a matching params class are silently omitted.
    sampler_types: dict[str, type] = {}
    for member in sampler_type_enum:
        sampler_name = member.name.lower()
        params_cls = params_classes.get(sampler_name.replace("_", ""))
        if params_cls is not None:
            sampler_types[sampler_name] = params_cls

    return sampler_types


def main() -> None:
    """CLI entry point: dispatch to the shared run_cli helper."""
    run_cli(
        discover_fn=discover_sampler_types,
        type_key="sampler_type",
        type_label="sampler_type",
        class_label="params_class",
        default_descriptions=DEFAULT_FIELD_DESCRIPTIONS,
        script_name="get_sampler_info.py",
        description="Print LLM-optimized context for data designer sampler types.",
        header_title="Data Designer Sampler Types Reference",
        examples=[
            "uv run get_sampler_info.py all # Print all sampler types",
            "uv run get_sampler_info.py category # Print specific sampler type",
            "uv run get_sampler_info.py UNIFORM # Case-insensitive lookup",
            "uv run get_sampler_info.py --list # List sampler types and params classes",
        ],
        case_insensitive=True,
        uppercase_value=True,
    )


if __name__ == "__main__":
    main()
def discover_validator_types() -> dict[str, type]:
    """Dynamically discover all validator types and their param classes from data_designer.config.

    Returns:
        Dict mapping validator_type values to their params classes.
    """
    enum_cls = getattr(dd, "ValidatorType", None)
    if enum_cls is None or not issubclass(enum_cls, Enum):
        print(
            "Error: Could not find ValidatorType enum in data_designer.config",
            file=sys.stderr,
        )
        sys.exit(1)

    # Index ValidatorParams classes by lowercased class-name prefix:
    # CodeValidatorParams -> 'code'.
    by_prefix: dict[str, type] = {}
    for attr in dir(dd):
        if not attr.endswith("ValidatorParams"):
            continue
        candidate = getattr(dd, attr)
        if inspect.isclass(candidate) and hasattr(candidate, "model_fields"):
            by_prefix[attr.replace("ValidatorParams", "").lower()] = candidate

    # Pair each enum member with its params class; member names are lowercased
    # and underscore-stripped for the lookup (LOCAL_CALLABLE -> 'localcallable').
    pairing: dict[str, type] = {}
    for member in enum_cls:
        member_key = member.name.lower()
        match = by_prefix.get(member_key.replace("_", ""))
        if match is not None:
            pairing[member_key] = match
    return pairing


def main() -> None:
    """CLI entry point: dispatch to the shared run_cli helper."""
    run_cli(
        discover_fn=discover_validator_types,
        type_key="validator_type",
        type_label="validator_type",
        class_label="params_class",
        default_descriptions=DEFAULT_FIELD_DESCRIPTIONS,
        script_name="get_validator_info.py",
        description="Print LLM-optimized context for data designer validator types.",
        header_title="Data Designer Validator Types Reference",
        examples=[
            "uv run get_validator_info.py all # Print all validator types",
            "uv run get_validator_info.py code # Print specific validator type",
            "uv run get_validator_info.py LOCAL_CALLABLE # Case-insensitive lookup",
            "uv run get_validator_info.py --list # List validator types and params classes",
        ],
        case_insensitive=True,
        uppercase_value=True,
    )


if __name__ == "__main__":
    main()
get_validator_info.py LOCAL_CALLABLE # Case-insensitive lookup", + "uv run get_validator_info.py --list # List validator types and params classes", + ], + case_insensitive=True, + uppercase_value=True, + ) + + +if __name__ == "__main__": + main() diff --git a/skill/data-designer/scripts/helpers/__init__.py b/skill/data-designer/scripts/helpers/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/skill/data-designer/scripts/helpers/pydantic_info_utils.py b/skill/data-designer/scripts/helpers/pydantic_info_utils.py new file mode 100644 index 000000000..a8801eaeb --- /dev/null +++ b/skill/data-designer/scripts/helpers/pydantic_info_utils.py @@ -0,0 +1,445 @@ +#!/usr/bin/env python3 +"""Shared utilities for printing LLM-optimized context from Pydantic models. + +This module provides common functionality for the four info scripts: +get_column_info.py, get_sampler_info.py, get_validator_info.py, and +get_processor_info.py. +""" + +import re +import sys +import types +import typing +from collections.abc import Callable +from enum import Enum +from typing import get_args, get_origin + +from pydantic import BaseModel + + +def _is_basemodel_subclass(cls) -> bool: + """Return True if cls is a concrete BaseModel subclass (not BaseModel itself).""" + return isinstance(cls, type) and issubclass(cls, BaseModel) and cls is not BaseModel + + +def _is_enum_subclass(cls) -> bool: + """Return True if cls is an Enum subclass (not Enum itself).""" + return isinstance(cls, type) and issubclass(cls, Enum) and cls is not Enum + + +def _extract_enum_class(annotation) -> type | None: + """Unwrap a type annotation to find an Enum class, if present. + + Handles X, X | None, Annotated[X, ...]. + Returns the Enum class or None. + """ + if annotation is None: + return None + + # Unwrap Annotated[X, ...] 
+ if get_origin(annotation) is typing.Annotated: + annotation = get_args(annotation)[0] + + # Direct enum class + if _is_enum_subclass(annotation): + return annotation + + # Union: X | None or typing.Union[X, None] + origin = get_origin(annotation) + if origin is typing.Union or origin is types.UnionType: + for arg in get_args(annotation): + if arg is type(None): + continue + if _is_enum_subclass(arg): + return arg + + return None + + +def extract_nested_basemodel(annotation) -> type | None: + """Unwrap a type annotation to find a single nested BaseModel subclass. + + Handles: X, list[X], X | None, list[X] | None, dict[K, V], Annotated[X, ...]. + Returns None for unions of 2+ BaseModel subclasses (discriminated unions), + primitives, enums, or BaseModel itself. + """ + if annotation is None: + return None + + # Unwrap Annotated[X, ...] + if get_origin(annotation) is typing.Annotated: + annotation = get_args(annotation)[0] + + # Direct BaseModel subclass + if _is_basemodel_subclass(annotation): + return annotation + + origin = get_origin(annotation) + + # list[X] -> check X + if origin is list: + args = get_args(annotation) + if args and _is_basemodel_subclass(args[0]): + return args[0] + return None + + # dict[K, V] -> check V + if origin is dict: + args = get_args(annotation) + if len(args) >= 2 and _is_basemodel_subclass(args[1]): + return args[1] + return None + + # Union: X | None, list[X] | None, or discriminated unions + if origin is typing.Union or origin is types.UnionType: + non_none_args = [a for a in get_args(annotation) if a is not type(None)] + basemodel_classes = [] + for arg in non_none_args: + # Recurse to handle list[X] | None etc. 
+ result = extract_nested_basemodel(arg) + if result is not None: + basemodel_classes.append(result) + elif _is_basemodel_subclass(arg): + basemodel_classes.append(arg) + # Only return if exactly one BaseModel subclass found + if len(basemodel_classes) == 1: + return basemodel_classes[0] + return None + + return None + + +def format_type(annotation) -> str: + """Format a type annotation for readable display. + + Strips module prefixes and simplifies complex types. + """ + type_str = str(annotation) + + # Remove module prefixes + type_str = re.sub(r"data_designer\.config\.\w+\.", "", type_str) + type_str = re.sub(r"pydantic\.main\.", "", type_str) + type_str = re.sub(r"typing\.", "", type_str) + + # Clean up enum types BEFORE other replacements: -> EnumName + type_str = re.sub(r"", r"\1", type_str) + + # Clean up class types: -> str + type_str = re.sub(r"", r"\1", type_str) + + # Simplify common patterns + type_str = type_str.replace("NoneType", "None") + + # Clean up Literal types for readability + if "Literal[" in type_str: + # Extract just the value from Literal['value'] + match = re.search(r"Literal\[([^\]]+)\]", type_str) + if match: + type_str = f"Literal[{match.group(1)}]" + + # Clean up Annotated types with Discriminator (too verbose) + if "Annotated[" in type_str and "Discriminator" in type_str: + # Extract just the union type, drop the Discriminator metadata + match = re.search(r"Annotated\[([^,]+(?:\s*\|\s*[^,]+)*),", type_str) + if match: + type_str = match.group(1).strip() + + return type_str + + +def get_brief_description(cls: type) -> str: + """Extract first line from class docstring.""" + if cls.__doc__: + doc = cls.__doc__.strip() + first_line = doc.split("\n")[0].strip() + return first_line + return "No description available." + + +def get_field_info( + cls: type, default_descriptions: dict[str, str] +) -> list[tuple[str, str, str, type | None, type | None]]: + """Extract field information from a Pydantic model. 
def get_field_info(
    cls: type, default_descriptions: dict[str, str]
) -> list[tuple[str, str, str, type | None, type | None]]:
    """Extract field information from a Pydantic model.

    Args:
        cls: The Pydantic model class to inspect.
        default_descriptions: Fallback descriptions for fields without docstrings.

    Returns:
        List of (field_name, type_str, description, nested_basemodel_cls, enum_cls) tuples.
    """
    model_fields: dict = getattr(cls, "model_fields", {})
    if not model_fields:
        return []
    rows = []
    for name, info in model_fields.items():
        annotation = info.annotation
        rows.append(
            (
                name,
                format_type(annotation),
                # Prefer the field's own description; fall back to the defaults.
                info.description or default_descriptions.get(name, ""),
                extract_nested_basemodel(annotation),
                _extract_enum_class(annotation),
            )
        )
    return rows


def _print_fields(
    fields: list[tuple[str, str, str, type | None, type | None]],
    default_descriptions: dict[str, str],
    indent: int = 4,
    seen: set | None = None,
    max_depth: int = 3,
    current_depth: int = 0,
) -> None:
    """Print fields with optional nested BaseModel expansion and enum values.

    Args:
        fields: List of (field_name, type_str, description, nested_cls, enum_cls) tuples.
        default_descriptions: Fallback descriptions for nested model fields.
        indent: Current indentation level (number of spaces).
        seen: Set of already-expanded class names to prevent cycles.
        max_depth: Maximum recursion depth for nested models.
        current_depth: Current recursion depth.
    """
    seen = set() if seen is None else seen
    pad = " " * indent

    for field_name, type_str, desc, nested_cls, enum_cls in fields:
        print(f"{pad}{field_name}:")
        print(f"{pad} type: {type_str}")
        if desc:
            print(f"{pad} description: {desc}")

        # List every enum member name when the field is enum-typed.
        if enum_cls is not None:
            member_names = ", ".join(member.name for member in enum_cls)
            print(f"{pad} values: [{member_names}]")

        # Expand a nested model at most once (cycle guard) and only while
        # within the depth budget.
        expandable = (
            nested_cls is not None
            and nested_cls.__name__ not in seen
            and current_depth < max_depth
        )
        if not expandable:
            continue
        seen.add(nested_cls.__name__)
        print(f"{pad} schema ({nested_cls.__name__}):")
        _print_fields(
            get_field_info(nested_cls, default_descriptions),
            default_descriptions,
            indent=indent + 4,
            seen=seen,
            max_depth=max_depth,
            current_depth=current_depth + 1,
        )
+ """ + class_name = cls.__name__ + description = get_brief_description(cls) + fields = get_field_info(cls, default_descriptions) + + display_value = type_value.upper() if uppercase_value else type_value + + print(f"{class_name}:") + print(f" {type_key}: {display_value}") + print(f" description: {description}") + print(" fields:") + + _print_fields(fields, default_descriptions) + + +def print_all_entries( + items: dict[str, type], + type_key: str, + header_title: str, + default_descriptions: dict[str, str], + uppercase_value: bool = False, +) -> None: + """Print YAML-style output for all items. + + Args: + items: Dict mapping type values to their classes. + type_key: The key name for the type (e.g., "sampler_type" or "column_type"). + header_title: Title for the header comment. + default_descriptions: Fallback descriptions for fields without docstrings. + uppercase_value: If True, print type values in uppercase. + """ + sorted_types = sorted(items.keys()) + + print(f"# {header_title}") + print(f"# {len(sorted_types)} types discovered from data_designer.config") + print() + + for type_value in sorted_types: + cls = items[type_value] + print_yaml_entry(type_key, type_value, cls, default_descriptions, uppercase_value) + print() + + +def print_single_entry( + items: dict[str, type], + lookup_key: str, + type_key: str, + default_descriptions: dict[str, str], + case_insensitive: bool = False, + uppercase_value: bool = False, +) -> None: + """Print YAML-style output for a specific item. + + Args: + items: Dict mapping type values to their classes. + lookup_key: The type value to look up. + type_key: The key name for the type (e.g., "sampler_type" or "column_type"). + default_descriptions: Fallback descriptions for fields without docstrings. + case_insensitive: If True, perform case-insensitive lookup. + uppercase_value: If True, print type value in uppercase. 
+ """ + normalized = lookup_key.lower() if case_insensitive else lookup_key + + if normalized not in items: + available = ", ".join(sorted(items.keys())) + print(f"Error: Unknown {type_key} '{lookup_key}'", file=sys.stderr) + print(f"Available types: {available}", file=sys.stderr) + sys.exit(1) + + cls = items[normalized] + print_yaml_entry(type_key, normalized, cls, default_descriptions, uppercase_value) + + +def print_list_table( + items: dict[str, type], + type_label: str, + class_label: str, +) -> None: + """Print available types with their class names in a table. + + Args: + items: Dict mapping type values to their classes. + type_label: Label for the type column (e.g., "sampler_type" or "column_type"). + class_label: Label for the class column (e.g., "params_class" or "config_class"). + """ + sorted_items = sorted(items.items()) + + # Calculate column widths + type_width = max(len(type_value) for type_value, _ in sorted_items) + type_width = max(type_width, len(type_label)) + + # Print header + print(f"{type_label:<{type_width}} {class_label}") + print(f"{'-' * type_width} {'-' * max(len(class_label), 25)}") + + # Print rows + for type_value, cls in sorted_items: + print(f"{type_value:<{type_width}} {cls.__name__}") + + +def print_help( + items: dict[str, type], + type_label: str, + class_label: str, + script_name: str, + description: str, + examples: list[str], +) -> None: + """Print help message with available types. + + Args: + items: Dict mapping type values to their classes. + type_label: Label for the type (e.g., "sampler_type" or "column_type"). + class_label: Label for the class column in list output. + script_name: Name of the script for usage examples. + description: Brief description of what the script does. + examples: List of example command lines. 
+ """ + available_types = sorted(items.keys()) + + print(f"Usage: uv run {script_name} <{type_label}>") + print() + print(description) + print() + print("Options:") + print(" -h, --help Show this help message") + print(f" -l, --list List {type_label}s and their {class_label}es") + print() + print("Arguments:") + print(f" {type_label} Type to print (use 'all' for all types)") + print() + print("Examples:") + for example in examples: + print(f" {example}") + print() + print(f"Available {type_label}s ({len(available_types)}):") + print() + print_list_table(items, type_label, class_label) + + +def run_cli( + discover_fn: Callable[[], dict[str, type]], + type_key: str, + type_label: str, + class_label: str, + default_descriptions: dict[str, str], + script_name: str, + description: str, + header_title: str, + examples: list[str], + case_insensitive: bool = False, + uppercase_value: bool = False, +) -> None: + """Run the CLI for a Pydantic info script. + + Args: + discover_fn: Function that returns dict mapping type values to classes. + type_key: The key name for the type in YAML output. + type_label: Label for the type in help text. + class_label: Label for the class column in list output. + default_descriptions: Fallback descriptions for fields without docstrings. + script_name: Name of the script for usage examples. + description: Brief description of what the script does. + header_title: Title for the header when printing all entries. + examples: List of example command lines. + case_insensitive: If True, perform case-insensitive lookup. + uppercase_value: If True, print type values in uppercase. 
+ """ + if len(sys.argv) == 1: + # No arguments: print help + items = discover_fn() + print_help(items, type_label, class_label, script_name, description, examples) + elif len(sys.argv) == 2: + arg = sys.argv[1] + if arg in ("-h", "--help"): + items = discover_fn() + print_help(items, type_label, class_label, script_name, description, examples) + elif arg in ("-l", "--list"): + items = discover_fn() + print_list_table(items, type_label, class_label) + elif arg == "all": + items = discover_fn() + print_all_entries(items, type_key, header_title, default_descriptions, uppercase_value) + else: + # Single argument: print specific type + items = discover_fn() + print_single_entry( + items, + arg, + type_key, + default_descriptions, + case_insensitive, + uppercase_value, + ) + else: + items = discover_fn() + print_help(items, type_label, class_label, script_name, description, examples) + sys.exit(1) diff --git a/skill/test_info_scripts.py b/skill/test_info_scripts.py new file mode 100644 index 000000000..4586de894 --- /dev/null +++ b/skill/test_info_scripts.py @@ -0,0 +1,900 @@ +#!/usr/bin/env python3 +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +"""Comprehensive tests for the data designer info scripts. + +Run from the skill/ directory: + uv run test_info_scripts.py + +Tests cover: + - Unit tests for pydantic_info_utils helper functions + - Integration tests for all four info scripts (column, sampler, validator, processor) + - CLI behavior: help, list, single entry, all entries, invalid input, exit codes + - Nested BaseModel expansion (Score, ImageContext) + - Enum value expansion (SamplerType, TraceType, CodeLang, etc.) 
#!/usr/bin/env python3
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""Comprehensive tests for the data designer info scripts.

Run from the skill/ directory:
    uv run test_info_scripts.py

Tests cover:
    - Unit tests for pydantic_info_utils helper functions
    - Integration tests for all four info scripts (column, sampler, validator, processor)
    - CLI behavior: help, list, single entry, all entries, invalid input, exit codes
    - Nested BaseModel expansion (Score, ImageContext)
    - Enum value expansion (SamplerType, TraceType, CodeLang, etc.)
    - Discriminated union non-expansion (params fields)
    - Cycle/depth protection in nested model printing
"""

import io
import re
import subprocess
import sys
from contextlib import redirect_stdout
from enum import Enum
from pathlib import Path
from typing import Annotated

# Resolve the scripts directory relative to this test file
SCRIPTS_DIR = Path(__file__).resolve().parent / "data-designer" / "scripts"

# Add scripts dir to sys.path so we can import the helpers module
sys.path.insert(0, str(SCRIPTS_DIR))

from pydantic import BaseModel, Field

# ---------------------------------------------------------------------------
# Test infrastructure
# ---------------------------------------------------------------------------

# Module-level tallies mutated by check() and reported in the final summary.
_passed = 0
_failed = 0
_errors: list[str] = []


def check(condition: bool, name: str) -> None:
    """Record one assertion result; failures are printed and remembered."""
    global _passed, _failed
    if condition:
        _passed += 1
    else:
        _failed += 1
        _errors.append(name)
        print(f" FAIL: {name}")


def run_script(script: str, *args: str, expect_fail: bool = False) -> tuple[str, str, int]:
    """Run a script in skill/data-designer/scripts/ via uv and return (stdout, stderr, returncode)."""
    script_path = str(SCRIPTS_DIR / script)
    result = subprocess.run(
        ["uv", "run", script_path, *args],
        capture_output=True,
        text=True,
        timeout=60,
    )
    # expect_fail suppresses the warning for invocations that are supposed to
    # exit non-zero (e.g. unknown-type lookups).
    if not expect_fail and result.returncode != 0:
        print(f" WARNING: {script} {' '.join(args)} exited with code {result.returncode}")
        if result.stderr:
            print(f" stderr: {result.stderr[:200]}")
    return result.stdout, result.stderr, result.returncode


def section(title: str) -> None:
    """Print a visual separator heading for a group of related checks."""
    print(f"\n{'=' * 60}")
    print(f" {title}")
    print(f"{'=' * 60}")


# ===========================================================================
# PART 1: Unit tests for pydantic_info_utils helpers
# ===========================================================================

section("Unit tests: type introspection helpers")
from helpers.pydantic_info_utils import (
    _extract_enum_class,
    _is_basemodel_subclass,
    _is_enum_subclass,
    extract_nested_basemodel,
    format_type,
    get_brief_description,
    get_field_info,
)

# ---- Test models & enums for unit tests ----


class MyEnum(str, Enum):
    A = "a"
    B = "b"
    C = "c"


class AnotherEnum(Enum):
    X = 1
    Y = 2


class Nested(BaseModel):
    x: int = 0
    y: str = "hello"


class Nested2(BaseModel):
    z: float = 1.0


class Outer(BaseModel):
    """Outer model for testing."""

    plain: str = "foo"
    nested: Nested = Field(default_factory=Nested)
    nested_list: list[Nested] = Field(default_factory=list)
    nested_optional: Nested | None = None
    nested_list_optional: list[Nested] | None = None
    nested_dict: dict[str, Nested] = Field(default_factory=dict)
    my_enum: MyEnum = MyEnum.A
    enum_optional: MyEnum | None = None
    annotated_enum: Annotated[MyEnum, "some metadata"] = MyEnum.A


class SelfRef(BaseModel):
    """Model that references itself indirectly."""

    name: str = ""
    child: "SelfRef | None" = None


SelfRef.model_rebuild()


# Model with a discriminated-union-like field (multiple BaseModel subclasses)
class DiscriminatedOuter(BaseModel):
    choice: Nested | Nested2 = Field(default_factory=Nested)


# ---- _is_basemodel_subclass ----
print("\n_is_basemodel_subclass:")
check(_is_basemodel_subclass(Nested) is True, "concrete subclass -> True")
check(_is_basemodel_subclass(BaseModel) is False, "BaseModel itself -> False")
check(_is_basemodel_subclass(str) is False, "str -> False")
check(_is_basemodel_subclass(int) is False, "int -> False")
check(_is_basemodel_subclass(MyEnum) is False, "Enum -> False")
check(_is_basemodel_subclass(None) is False, "None -> False")
check(_is_basemodel_subclass(list) is False, "list -> False")

# ---- _is_enum_subclass ----
print("\n_is_enum_subclass:")
check(_is_enum_subclass(MyEnum) is True, "str Enum subclass -> True")
check(_is_enum_subclass(AnotherEnum) is True, "int Enum subclass -> True")
check(_is_enum_subclass(Enum) is False, "Enum itself -> False")
check(_is_enum_subclass(str) is False, "str -> False")
check(_is_enum_subclass(BaseModel) is False, "BaseModel -> False")
check(_is_enum_subclass(None) is False, "None -> False")

# ---- _extract_enum_class ----
print("\n_extract_enum_class:")
check(_extract_enum_class(MyEnum) is MyEnum, "direct enum class")
check(_extract_enum_class(MyEnum | None) is MyEnum, "enum | None")
# NOTE(review): identical to the previous assertion — the label suggests this
# was meant to exercise typing.Optional[MyEnum] instead; confirm intent.
check(_extract_enum_class(MyEnum | None) is MyEnum, "Optional[enum]")
check(_extract_enum_class(Annotated[MyEnum, "meta"]) is MyEnum, "Annotated[enum, ...]")
check(_extract_enum_class(str) is None, "str -> None")
check(_extract_enum_class(int) is None, "int -> None")
check(_extract_enum_class(None) is None, "None -> None")
check(_extract_enum_class(Nested) is None, "BaseModel -> None")
check(_extract_enum_class(list[MyEnum]) is None, "list[enum] -> None (not unwrapped)")

# ---- extract_nested_basemodel ----
print("\nextract_nested_basemodel:")
check(extract_nested_basemodel(Nested) is Nested, "direct BaseModel subclass")
check(extract_nested_basemodel(list[Nested]) is Nested, "list[Model]")
check(extract_nested_basemodel(Nested | None) is Nested, "Model | None")
check(extract_nested_basemodel(list[Nested] | None) is Nested, "list[Model] | None")
check(extract_nested_basemodel(dict[str, Nested]) is Nested, "dict[str, Model]")
check(
    extract_nested_basemodel(Annotated[Nested, "meta"]) is Nested,
    "Annotated[Model, ...]",
)
check(
    extract_nested_basemodel(Annotated[list[Nested], "meta"]) is Nested,
    "Annotated[list[Model], ...]",
)

# Should return None for these:
check(extract_nested_basemodel(str) is None, "str -> None")
check(extract_nested_basemodel(int) is None, "int -> None")
check(extract_nested_basemodel(None) is None, "None -> None")
check(extract_nested_basemodel(BaseModel) is None, "BaseModel itself -> None")
check(extract_nested_basemodel(list[str]) is None, "list[str] -> None")
check(extract_nested_basemodel(dict[str, int]) is None, "dict[str, int] -> None")
check(extract_nested_basemodel(MyEnum) is None, "Enum -> None")
# Discriminated union: 2+ BaseModel subclasses -> None
check(
    extract_nested_basemodel(Nested | Nested2) is None,
    "Model | Model2 (discriminated) -> None",
)
check(
    extract_nested_basemodel(Nested | Nested2 | None) is None,
    "Model | Model2 | None -> None",
)

# ---- format_type ----
print("\nformat_type:")
check(format_type(str) == "str", "str")
check(format_type(int) == "int", "int")
check("None" in format_type(str | None), "str | None contains 'None'")
check("list" in format_type(list[str]).lower(), "list[str]")

# ---- get_brief_description ----
print("\nget_brief_description:")
check(get_brief_description(Outer) == "Outer model for testing.", "docstring extraction")
check(
    get_brief_description(type("NoDoc", (), {})) == "No description available.",
    "no docstring",
)

# ---- get_field_info: tuple structure ----
print("\nget_field_info tuple structure:")
fields = get_field_info(Outer, {})
check(len(fields) > 0, "returns non-empty list")
check(all(len(f) == 5 for f in fields), "all tuples have 5 elements")

# Verify nested detection in field tuples.
# Tuple layout assumed: (name, type, description, nested_cls, enum_cls).
field_dict = {f[0]: f for f in fields}

# plain str field: no nested, no enum
f = field_dict["plain"]
check(f[3] is None, "plain str -> nested_cls is None")
check(f[4] is None, "plain str -> enum_cls is None")

# nested field: should detect Nested
f = field_dict["nested"]
check(f[3] is Nested, "nested field -> nested_cls is Nested")
check(f[4] is None, "nested field -> enum_cls is None")

# nested_list: should detect Nested
f = field_dict["nested_list"]
check(f[3] is Nested, "nested_list -> nested_cls is Nested")

# nested_optional: should detect Nested
f = field_dict["nested_optional"]
check(f[3] is Nested, "nested_optional -> nested_cls is Nested")

# nested_list_optional: should detect Nested
f = field_dict["nested_list_optional"]
check(f[3] is Nested, "nested_list_optional -> nested_cls is Nested")

# nested_dict: should detect Nested
f = field_dict["nested_dict"]
check(f[3] is Nested, "nested_dict -> nested_cls is Nested")

# my_enum: should detect MyEnum
f = field_dict["my_enum"]
check(f[3] is None, "enum field -> nested_cls is None")
check(f[4] is MyEnum, "my_enum -> enum_cls is MyEnum")

# enum_optional: should detect MyEnum
f = field_dict["enum_optional"]
check(f[4] is MyEnum, "enum_optional -> enum_cls is MyEnum")

# annotated_enum
f = field_dict["annotated_enum"]
check(f[4] is MyEnum, "annotated_enum -> enum_cls is MyEnum")


# ===========================================================================
# PART 2: Unit tests for _print_fields and print_yaml_entry
# ===========================================================================

section("Unit tests: output formatting")

from helpers.pydantic_info_utils import _print_fields, print_yaml_entry

# ---- _print_fields: basic output ----
print("\n_print_fields basic output:")
fields = get_field_info(Nested, {})
buf = io.StringIO()
with redirect_stdout(buf):
    _print_fields(fields, {})
output = buf.getvalue()
check("x:" in output, "nested field 'x' printed")
check("y:" in output, "nested field 'y' printed")
check("type: int" in output, "type: int printed")
check("type: str" in output, "type: str printed")

# ---- _print_fields: enum expansion ----
print("\n_print_fields enum expansion:")
fields = get_field_info(Outer, {})
enum_fields = [f for f in fields if f[0] == "my_enum"]
buf = io.StringIO()
with redirect_stdout(buf):
    _print_fields(enum_fields, {})
output = buf.getvalue()
check("values:" in output, "values: line present for enum field")
check("A" in output and "B" in output and "C" in output, "all enum member names present")

# ---- _print_fields: nested model expansion ----
+print("\n_print_fields nested model expansion:") +fields = get_field_info(Outer, {}) +nested_fields = [f for f in fields if f[0] == "nested"] +buf = io.StringIO() +with redirect_stdout(buf): + _print_fields(nested_fields, {}) +output = buf.getvalue() +check("schema (Nested):" in output, "schema (Nested): header present") +check(" x:" in output, "nested field x printed at deeper indent") +check(" y:" in output, "nested field y printed at deeper indent") + +# ---- _print_fields: cycle protection ---- +print("\n_print_fields cycle protection:") +fields = get_field_info(SelfRef, {}) +buf = io.StringIO() +with redirect_stdout(buf): + _print_fields(fields, {}) +output = buf.getvalue() +# Should expand SelfRef once but not infinitely recurse +count = output.count("schema (SelfRef):") +check(count == 1, f"SelfRef expanded exactly once (got {count})") + +# ---- _print_fields: discriminated union not expanded ---- +print("\n_print_fields discriminated union not expanded:") +fields = get_field_info(DiscriminatedOuter, {}) +choice_field = [f for f in fields if f[0] == "choice"] +buf = io.StringIO() +with redirect_stdout(buf): + _print_fields(choice_field, {}) +output = buf.getvalue() +check("schema" not in output, "discriminated union field not expanded") + +# ---- _print_fields: depth protection ---- +print("\n_print_fields depth limit:") + + +class Level3(BaseModel): + val: int = 0 + + +class Level2(BaseModel): + child: Level3 = Field(default_factory=Level3) + + +class Level1(BaseModel): + child: Level2 = Field(default_factory=Level2) + + +class Level0(BaseModel): + child: Level1 = Field(default_factory=Level1) + + +fields = get_field_info(Level0, {}) +buf = io.StringIO() +with redirect_stdout(buf): + _print_fields(fields, {}, max_depth=2) +output = buf.getvalue() +check("schema (Level1):" in output, "depth 0->1: Level1 expanded") +check("schema (Level2):" in output, "depth 1->2: Level2 expanded") +check("schema (Level3):" not in output, "depth 2->3: Level3 NOT expanded 
(max_depth=2)") + +# ---- print_yaml_entry ---- +print("\nprint_yaml_entry:") +buf = io.StringIO() +with redirect_stdout(buf): + print_yaml_entry("test_type", "my_value", Outer, {"plain": "A plain field"}) +output = buf.getvalue() +check(output.startswith("Outer:"), "starts with class name") +check(" test_type: my_value" in output, "type key/value printed") +check(" description: Outer model for testing." in output, "description printed") +check(" fields:" in output, "fields header printed") +check(" plain:" in output, "field plain printed at indent 4") +check(" description: A plain field" in output, "default description used") +check("schema (Nested):" in output, "nested model expanded in yaml entry") +check("values: [A, B, C]" in output, "enum values expanded in yaml entry") + +# ---- print_yaml_entry with uppercase_value ---- +print("\nprint_yaml_entry uppercase_value:") +buf = io.StringIO() +with redirect_stdout(buf): + print_yaml_entry("test_type", "my_value", Nested, {}, uppercase_value=True) +output = buf.getvalue() +check(" test_type: MY_VALUE" in output, "uppercase value printed") + + +# =========================================================================== +# PART 3: Integration tests — get_column_info.py +# =========================================================================== + +section("Integration: get_column_info.py") + +SCRIPT = "get_column_info.py" + +# ---- help ---- +print("\nhelp output:") +out, err, rc = run_script(SCRIPT, "--help") +check(rc == 0, "help exits 0") +check("Usage:" in out, "help contains Usage:") +check("column_type" in out, "help mentions column_type") +check("config_class" in out, "help mentions config_class") +check("Examples:" in out, "help contains Examples section") +check("Available column_types" in out, "help lists available types") + +# No-arg should also show help +out2, _, rc2 = run_script(SCRIPT) +check(rc2 == 0, "no-arg exits 0") +check("Usage:" in out2, "no-arg shows help") + +# ---- list ---- 
+print("\nlist output:") +out, err, rc = run_script(SCRIPT, "--list") +check(rc == 0, "list exits 0") +check("column_type" in out, "list header has column_type") +check("config_class" in out, "list header has config_class") +expected_types = [ + "custom", + "embedding", + "expression", + "llm-code", + "llm-judge", + "llm-structured", + "llm-text", + "sampler", + "seed-dataset", + "validation", +] +for t in expected_types: + check(t in out, f"list contains '{t}'") +check("LLMJudgeColumnConfig" in out, "list contains LLMJudgeColumnConfig class name") +check("SamplerColumnConfig" in out, "list contains SamplerColumnConfig class name") + +# -l alias +out_l, _, _ = run_script(SCRIPT, "-l") +check(out_l == out, "-l produces same output as --list") + +# ---- single entry: llm-judge (nested model expansion) ---- +print("\nsingle entry: llm-judge:") +out, err, rc = run_script(SCRIPT, "llm-judge") +check(rc == 0, "llm-judge exits 0") +check(out.startswith("LLMJudgeColumnConfig:"), "starts with class name") +check(" column_type: llm-judge" in out, "column_type value") +check(" description:" in out, "has description") +check(" fields:" in out, "has fields header") +# Score nested expansion +check(" scores:" in out, "scores field present") +check(" type: list[Score]" in out, "scores type is list[Score]") +check(" schema (Score):" in out, "Score schema expanded") +check(" name:" in out, "Score.name field at indent 8") +check(" description:" in out, "Score.description field at indent 8") +check(" options:" in out, "Score.options field at indent 8") +check(" type: dict[int | str, str]" in out, "Score.options type at indent 10") +# ImageContext nested expansion (inherited from LLMTextColumnConfig) +check(" schema (ImageContext):" in out, "ImageContext schema expanded") +check(" modality:" in out, "ImageContext.modality field") +check(" data_type:" in out, "ImageContext.data_type field") +# Enum expansion +check(" values: [NONE, LAST_MESSAGE, ALL_MESSAGES]" in out, "TraceType enum 
values") + +# ---- single entry: llm-text (ImageContext expansion) ---- +print("\nsingle entry: llm-text:") +out, err, rc = run_script(SCRIPT, "llm-text") +check(rc == 0, "llm-text exits 0") +check(out.startswith("LLMTextColumnConfig:"), "starts with class name") +check(" schema (ImageContext):" in out, "ImageContext expanded") +check(" modality:" in out, "ImageContext.modality at indent 8") +check(" values: [IMAGE]" in out, "Modality enum values for modality field") +check(" values: [URL, BASE64]" in out, "ModalityDataType enum values") +check(" values: [PNG, JPG, JPEG, GIF, WEBP]" in out, "ImageFormat enum values") +# Verify scores is NOT present in llm-text (it's only in llm-judge) +check("scores:" not in out, "scores not in llm-text") + +# ---- single entry: sampler (enum expansion, no discriminated union expansion) ---- +print("\nsingle entry: sampler:") +out, err, rc = run_script(SCRIPT, "sampler") +check(rc == 0, "sampler exits 0") +check(out.startswith("SamplerColumnConfig:"), "starts with class name") +check(" sampler_type:" in out, "sampler_type field present") +check(" type: SamplerType" in out, "sampler_type type is SamplerType") +# Enum values for SamplerType +check(" values: [BERNOULLI, " in out, "SamplerType enum values start with BERNOULLI") +check("CATEGORY" in out, "CATEGORY in enum values") +check("UNIFORM" in out, "UNIFORM in enum values") +check("UUID]" in out, "UUID at end of enum values") +# params should NOT be expanded (it's a discriminated union of 14 BaseModel subclasses) +check( + "schema (CategorySamplerParams):" not in out, + "params not expanded to CategorySamplerParams", +) +check( + "schema (UniformSamplerParams):" not in out, + "params not expanded to UniformSamplerParams", +) + +# ---- single entry: llm-code (CodeLang enum expansion) ---- +print("\nsingle entry: llm-code:") +out, err, rc = run_script(SCRIPT, "llm-code") +check(rc == 0, "llm-code exits 0") +check(" code_lang:" in out, "code_lang field present") +check("PYTHON" in 
out, "PYTHON in CodeLang values") +check("JAVASCRIPT" in out, "JAVASCRIPT in CodeLang values") + +# ---- single entry: validation (ValidatorType enum) ---- +print("\nsingle entry: validation:") +out, err, rc = run_script(SCRIPT, "validation") +check(rc == 0, "validation exits 0") +check(" validator_type:" in out, "validator_type field present") +check(" values: [CODE, LOCAL_CALLABLE, REMOTE]" in out, "ValidatorType enum values") + +# ---- single entry: custom (GenerationStrategy enum) ---- +print("\nsingle entry: custom:") +out, err, rc = run_script(SCRIPT, "custom") +check(rc == 0, "custom exits 0") +check(" values: [CELL_BY_CELL, FULL_COLUMN]" in out, "GenerationStrategy enum values") + +# ---- all ---- +print("\nall output:") +out, err, rc = run_script(SCRIPT, "all") +check(rc == 0, "all exits 0") +check("# Data Designer Column Types Reference" in out, "header title present") +check("# 10 types discovered" in out, "type count in header") +for t in expected_types: + # Each type should appear as " column_type: " + check(f" column_type: {t}" in out, f"all output contains column_type: {t}") + +# ---- invalid type ---- +print("\ninvalid type:") +out, err, rc = run_script(SCRIPT, "nonexistent", expect_fail=True) +check(rc == 1, "invalid type exits 1") +check("Error:" in err, "error message on stderr") +check("nonexistent" in err, "mentions the invalid type") +check("Available types:" in err, "lists available types") + +# ---- case sensitivity (column_info is case-sensitive) ---- +print("\ncase sensitivity:") +out, err, rc = run_script(SCRIPT, "LLM-TEXT", expect_fail=True) +check(rc == 1, "uppercase LLM-TEXT fails (case-sensitive)") + +# ---- too many args ---- +print("\ntoo many args:") +out, err, rc = run_script(SCRIPT, "llm-text", "extra", expect_fail=True) +check(rc == 1, "too many args exits 1") +check("Usage:" in out, "shows help on too many args") + + +# =========================================================================== +# PART 4: Integration tests — 
# ===========================================================================

section("Integration: get_sampler_info.py")

SCRIPT = "get_sampler_info.py"

# ---- help ----
print("\nhelp output:")
out, err, rc = run_script(SCRIPT, "--help")
check(rc == 0, "sampler help exits 0")
check("sampler_type" in out, "help mentions sampler_type")
check("params_class" in out, "help mentions params_class")

# ---- list ----
print("\nlist output:")
out, err, rc = run_script(SCRIPT, "--list")
check(rc == 0, "sampler list exits 0")
expected_samplers = [
    "bernoulli",
    "bernoulli_mixture",
    "binomial",
    "category",
    "datetime",
    "gaussian",
    "person",
    "person_from_faker",
    "poisson",
    "scipy",
    "subcategory",
    "timedelta",
    "uniform",
    "uuid",
]
for s in expected_samplers:
    check(s in out, f"sampler list contains '{s}'")
check("CategorySamplerParams" in out, "list has CategorySamplerParams class")

# ---- single entry: category ----
print("\nsingle entry: category:")
out, err, rc = run_script(SCRIPT, "category")
check(rc == 0, "category exits 0")
check(out.startswith("CategorySamplerParams:"), "starts with class name")
# sampler_type should be UPPERCASE
check(" sampler_type: CATEGORY" in out, "sampler_type displayed as CATEGORY (uppercase)")
check(" values:" in out, "values field present")
check(" weights:" in out, "weights field present")

# ---- case insensitivity (sampler_info is case-insensitive) ----
print("\ncase insensitivity:")
out_lower, _, rc1 = run_script(SCRIPT, "category")
out_upper, _, rc2 = run_script(SCRIPT, "CATEGORY")
check(rc1 == 0 and rc2 == 0, "both cases succeed")
check(out_lower == out_upper, "case-insensitive: same output for category/CATEGORY")

out_mixed, _, rc3 = run_script(SCRIPT, "Category")
check(rc3 == 0, "mixed case succeeds")
check(out_mixed == out_lower, "mixed case same output")

# ---- all ----
print("\nall output:")
out, err, rc = run_script(SCRIPT, "all")
check(rc == 0, "sampler all exits 0")
check("# Data Designer Sampler Types Reference" in out, "header title")
check(f"# {len(expected_samplers)} types discovered" in out, "type count in header")
for s in expected_samplers:
    check(f" sampler_type: {s.upper()}" in out, f"all has sampler_type: {s.upper()}")

# ---- invalid type ----
print("\ninvalid type:")
_, err, rc = run_script(SCRIPT, "nonexistent", expect_fail=True)
check(rc == 1, "invalid sampler exits 1")
check("Error:" in err, "error on stderr")


# ===========================================================================
# PART 5: Integration tests — get_validator_info.py
# ===========================================================================

section("Integration: get_validator_info.py")

SCRIPT = "get_validator_info.py"

# ---- help ----
print("\nhelp output:")
out, err, rc = run_script(SCRIPT, "--help")
check(rc == 0, "validator help exits 0")
check("validator_type" in out, "help mentions validator_type")

# ---- list ----
print("\nlist output:")
out, err, rc = run_script(SCRIPT, "--list")
check(rc == 0, "validator list exits 0")
expected_validators = ["code", "local_callable", "remote"]
for v in expected_validators:
    check(v in out, f"validator list contains '{v}'")

# ---- single entry: code (CodeLang enum expansion) ----
print("\nsingle entry: code:")
out, err, rc = run_script(SCRIPT, "code")
check(rc == 0, "code validator exits 0")
check(out.startswith("CodeValidatorParams:"), "starts with class name")
check(" validator_type: CODE" in out, "validator_type displayed as CODE (uppercase)")
check(" code_lang:" in out, "code_lang field present")
check(" values:" in out, "CodeLang enum values present")
check("PYTHON" in out, "PYTHON in CodeLang values")
check("SQL_POSTGRES" in out, "SQL_POSTGRES in CodeLang values")

# ---- case insensitivity ----
print("\ncase insensitivity:")
out1, _, rc1 = run_script(SCRIPT, "code")
out2, _, rc2 = run_script(SCRIPT, "CODE")
check(rc1 == 0 and rc2 == 0, "both cases succeed")
check(out1 == out2, "case-insensitive: same output")

# ---- all ----
print("\nall output:")
out, err, rc = run_script(SCRIPT, "all")
check(rc == 0, "validator all exits 0")
check("# Data Designer Validator Types Reference" in out, "header title")
check(f"# {len(expected_validators)} types discovered" in out, "type count")

# ---- invalid ----
print("\ninvalid type:")
_, err, rc = run_script(SCRIPT, "nonexistent", expect_fail=True)
check(rc == 1, "invalid validator exits 1")


# ===========================================================================
# PART 6: Integration tests — get_processor_info.py
# ===========================================================================

section("Integration: get_processor_info.py")

SCRIPT = "get_processor_info.py"

# ---- help ----
print("\nhelp output:")
out, err, rc = run_script(SCRIPT, "--help")
check(rc == 0, "processor help exits 0")
check("processor_type" in out, "help mentions processor_type")

# ---- list ----
print("\nlist output:")
out, err, rc = run_script(SCRIPT, "--list")
check(rc == 0, "processor list exits 0")
expected_processors = ["drop_columns", "schema_transform"]
for p in expected_processors:
    check(p in out, f"processor list contains '{p}'")

# ---- single entry: drop_columns ----
print("\nsingle entry: drop_columns:")
out, err, rc = run_script(SCRIPT, "drop_columns")
check(rc == 0, "drop_columns exits 0")
check(out.startswith("DropColumnsProcessorConfig:"), "starts with class name")
# processor_info uses uppercase_value=False
check(
    " processor_type: drop_columns" in out,
    "processor_type is lowercase (not uppercased)",
)
check(" column_names:" in out, "column_names field present")
# BuildStage enum expansion
check(" values:" in out, "BuildStage enum values present")

# ---- single entry: schema_transform ----
print("\nsingle entry: schema_transform:")
out, err, rc = run_script(SCRIPT, "schema_transform")
check(rc == 0, "schema_transform exits 0")
check(out.startswith("SchemaTransformProcessorConfig:"), "starts with class name")
check(" template:" in out, "template field present")

# ---- case insensitivity ----
print("\ncase insensitivity:")
out1, _, rc1 = run_script(SCRIPT, "drop_columns")
out2, _, rc2 = run_script(SCRIPT, "DROP_COLUMNS")
check(rc1 == 0 and rc2 == 0, "both cases succeed")
check(out1 == out2, "case-insensitive: same output")

# ---- all ----
print("\nall output:")
out, err, rc = run_script(SCRIPT, "all")
check(rc == 0, "processor all exits 0")
check("# Data Designer Processor Types Reference" in out, "header title")

# ---- invalid ----
print("\ninvalid type:")
_, err, rc = run_script(SCRIPT, "nonexistent", expect_fail=True)
check(rc == 1, "invalid processor exits 1")


# ===========================================================================
# PART 7: Cross-cutting tests — YAML structure validation
# ===========================================================================

section("Cross-cutting: YAML structure validation")


def validate_yaml_structure(output: str, script_name: str) -> None:
    """Validate common YAML structure in output from any info script."""
    # NOTE(review): indentation inside the regex/string literals below may have
    # been collapsed in transit — the expected field indent is presumably 4
    # spaces; confirm against the original file.
    lines = output.rstrip().split("\n")

    # First line should be ClassName:
    check(
        re.match(r"^\w+:", lines[0]) is not None,
        f"{script_name}: first line is ClassName:",
    )

    # Should have fields: header
    check(
        any(line == " fields:" for line in lines),
        f"{script_name}: has 'fields:' at indent 2",
    )

    # Every field name should be at indent 4 under fields:
    in_fields = False
    field_names = []
    for line in lines:
        if line == " fields:":
            in_fields = True
            continue
        if in_fields and re.match(r"^ \w[\w_]*:$", line):
            field_names.append(line.strip().rstrip(":"))

    check(len(field_names) > 0, f"{script_name}: has at least one field")

    # Every field should have a type: line
    # NOTE(review): in_fields keeps whatever value the loop above left it with,
    # so this condition is effectively constant inside this loop — confirm the
    # intent was to re-track the fields: section here.
    for i, line in enumerate(lines):
        if re.match(r"^ \w[\w_]*:$", line) and in_fields:
            # Next non-empty line should contain "type:"
            if i + 1 < len(lines):
                check(
                    "type:" in lines[i + 1],
                    f"{script_name}: field '{line.strip().rstrip(':')}' has type line",
                )

    # No schema line should appear without a corresponding nested model
    for line in lines:
        if "schema (" in line:
            match = re.search(r"schema \((\w+)\):", line)
            check(
                match is not None,
                f"{script_name}: schema line has valid format",
            )

    # No values: line should be empty brackets
    for line in lines:
        if "values:" in line:
            check(
                "values: []" not in line,
                f"{script_name}: values: is not empty",
            )


print("\nValidating YAML structure for each script + type:")
for script, types_to_check in [
    (
        "get_column_info.py",
        ["llm-text", "llm-judge", "sampler", "expression", "validation", "custom"],
    ),
    ("get_sampler_info.py", ["category", "uniform", "person"]),
    ("get_validator_info.py", ["code", "remote"]),
    ("get_processor_info.py", ["drop_columns", "schema_transform"]),
]:
    for t in types_to_check:
        out, _, rc = run_script(script, t)
        if rc == 0:
            validate_yaml_structure(out, f"{script} {t}")


# ===========================================================================
# PART 8: Nested expansion consistency tests
# ===========================================================================

section("Nested expansion consistency")

# ImageContext should be expanded identically across all LLM column types
print("\nImageContext expansion consistency:")
llm_types_with_imagecontext = ["llm-text", "llm-code", "llm-judge", "llm-structured"]
imagecontext_blocks = {}
for t in llm_types_with_imagecontext:
    out, _, rc = run_script("get_column_info.py", t)
    if rc == 0 and "schema (ImageContext):" in out:
        # Extract the ImageContext block
        start = out.index("schema (ImageContext):")
        # Find the next field at indent 4 (sibling) or end of output
        rest = out[start:]
        lines = rest.split("\n")
        block_lines = [lines[0]]
        for line in lines[1:]:
            # Stop at a line that's at indent <= 6 (sibling or parent level)
            if line and not line.startswith(" "):
                break
            block_lines.append(line)
        imagecontext_blocks[t] = "\n".join(block_lines)

if len(imagecontext_blocks) >= 2:
    reference = list(imagecontext_blocks.values())[0]
    for t, block in imagecontext_blocks.items():
        check(block == reference, f"ImageContext block identical in {t}")
else:
    check(False, "ImageContext found in at least 2 LLM types")


# ===========================================================================
# PART 9: Indentation consistency tests
# ===========================================================================

section("Indentation consistency")


def check_indentation(output: str, label: str) -> None:
    """Verify indentation is always a multiple of 2 spaces."""
    for i, line in enumerate(output.split("\n"), 1):
        if not line.strip():
            continue
        leading = len(line) - len(line.lstrip())
        check(
            leading % 2 == 0,
            f"{label} line {i}: indent {leading} is multiple of 2",
        )


print("\nIndentation check for representative outputs:")
for script, arg in [
    ("get_column_info.py", "llm-judge"),
    ("get_sampler_info.py", "category"),
    ("get_validator_info.py", "code"),
    ("get_processor_info.py", "drop_columns"),
]:
    out, _, _ = run_script(script, arg)
    check_indentation(out, f"{script} {arg}")


# ===========================================================================
# Summary
# ===========================================================================

section("RESULTS")
total = _passed + _failed
print(f"\n Total: {total}")
print(f" Passed: {_passed}")
print(f" Failed: {_failed}")

if _errors:
    print("\n Failed tests:")
    for e in _errors:
        print(f" - {e}")

print()
# Non-zero exit status makes CI fail when any check failed.
sys.exit(0 if _failed == 0 else 1)