From e89279fef837946af6833ab9c4260e4cf06e86b2 Mon Sep 17 00:00:00 2001 From: Johnny Greco Date: Fri, 13 Feb 2026 16:54:37 -0500 Subject: [PATCH 01/37] feat: add agent-context CLI introspection - add the new agent-context command, controller, and introspection services for config model and method discovery - register the command in the main CLI and add broad unit test coverage for commands, controllers, and introspection formatting/inspection behavior - enrich config pydantic models with Field descriptions so introspection output provides clearer, user-facing schema documentation - add an agent-context CLI review document under docs/reviews --- docs/reviews/agent-context-cli-review.md | 383 ++++++++++++++++++ .../src/data_designer/config/base.py | 13 +- .../data_designer/config/column_configs.py | 127 ++++-- .../src/data_designer/config/mcp.py | 38 +- .../src/data_designer/config/models.py | 94 +++-- .../src/data_designer/config/processors.py | 10 +- .../config/sampler_constraints.py | 11 +- .../data_designer/config/sampler_params.py | 58 ++- .../src/data_designer/config/seed.py | 11 +- .../src/data_designer/config/seed_source.py | 12 +- .../cli/commands/agent_context.py | 114 ++++++ .../data_designer/cli/controllers/__init__.py | 2 - .../controllers/agent_context_controller.py | 257 ++++++++++++ .../src/data_designer/cli/main.py | 4 + .../cli/services/introspection/__init__.py | 62 +++ .../cli/services/introspection/discovery.py | 183 +++++++++ .../cli/services/introspection/formatters.py | 182 +++++++++ .../introspection/method_inspector.py | 250 ++++++++++++ .../introspection/pydantic_inspector.py | 252 ++++++++++++ .../tests/cli/commands/__init__.py | 2 + .../commands/test_agent_context_command.py | 137 +++++++ .../tests/cli/controllers/__init__.py | 2 + .../test_agent_context_controller.py | 163 ++++++++ .../tests/cli/services/__init__.py | 2 + .../cli/services/introspection/__init__.py | 2 + .../services/introspection/test_discovery.py | 155 +++++++ 
.../introspection/test_field_descriptions.py | 63 +++ .../services/introspection/test_formatters.py | 268 ++++++++++++ .../introspection/test_method_inspector.py | 159 ++++++++ .../introspection/test_pydantic_inspector.py | 300 ++++++++++++++ 30 files changed, 3200 insertions(+), 116 deletions(-) create mode 100644 docs/reviews/agent-context-cli-review.md create mode 100644 packages/data-designer/src/data_designer/cli/commands/agent_context.py create mode 100644 packages/data-designer/src/data_designer/cli/controllers/agent_context_controller.py create mode 100644 packages/data-designer/src/data_designer/cli/services/introspection/__init__.py create mode 100644 packages/data-designer/src/data_designer/cli/services/introspection/discovery.py create mode 100644 packages/data-designer/src/data_designer/cli/services/introspection/formatters.py create mode 100644 packages/data-designer/src/data_designer/cli/services/introspection/method_inspector.py create mode 100644 packages/data-designer/src/data_designer/cli/services/introspection/pydantic_inspector.py create mode 100644 packages/data-designer/tests/cli/commands/__init__.py create mode 100644 packages/data-designer/tests/cli/commands/test_agent_context_command.py create mode 100644 packages/data-designer/tests/cli/controllers/__init__.py create mode 100644 packages/data-designer/tests/cli/controllers/test_agent_context_controller.py create mode 100644 packages/data-designer/tests/cli/services/__init__.py create mode 100644 packages/data-designer/tests/cli/services/introspection/__init__.py create mode 100644 packages/data-designer/tests/cli/services/introspection/test_discovery.py create mode 100644 packages/data-designer/tests/cli/services/introspection/test_field_descriptions.py create mode 100644 packages/data-designer/tests/cli/services/introspection/test_formatters.py create mode 100644 packages/data-designer/tests/cli/services/introspection/test_method_inspector.py create mode 100644 
packages/data-designer/tests/cli/services/introspection/test_pydantic_inspector.py diff --git a/docs/reviews/agent-context-cli-review.md b/docs/reviews/agent-context-cli-review.md new file mode 100644 index 000000000..7a19e22c6 --- /dev/null +++ b/docs/reviews/agent-context-cli-review.md @@ -0,0 +1,383 @@ +# Agent Context CLI — Implementation Review + +## Summary + +This review covers the implementation of `data-designer agent-context`, a new CLI command group that exposes DataDesigner's full configuration API surface as agent-friendly introspection commands. The feature was built by porting and improving existing standalone skill scripts (`skill/data-designer/scripts/get_*.py`) into the library itself, expanding coverage from 4 config domains to the full API, and separating data extraction from presentation to support multiple output formats. + +**Key stats:** +- 10 new CLI subcommands +- 8 new source files + 1 modified source file +- 6 new test files (113 tests) +- 2 output formats: plain text (YAML-style) and JSON + +--- + +## Architecture + +``` +data-designer agent-context +├── columns [TYPE] # Column types & fields +├── samplers [TYPE] # Sampler types & params +├── validators [TYPE] # Validator types & params +├── processors [TYPE] # Processor types & configs +├── models # ModelConfig, inference params, distributions +├── builder # DataDesignerConfigBuilder method signatures +├── constraints # ScalarInequality, ColumnInequality, operators +├── seeds # SeedConfig, SeedSource types, SamplingStrategy +├── mcp # MCPProvider, LocalStdioMCPProvider, ToolConfig +└── overview # Compact cheatsheet: type counts + builder summary +``` + +The implementation follows a layered architecture with clear separation of concerns: + +``` +commands/agent_context.py # Thin Typer command wrappers + │ + ▼ +controllers/agent_context_controller.py # Orchestration: discovery → inspection → formatting → output + │ + ▼ +services/introspection/ + ├── discovery.py # Dynamic type discovery 
(8 functions) + ├── pydantic_inspector.py # Pydantic model introspection (dataclass-based) + ├── method_inspector.py # Class method introspection via inspect.signature() + └── formatters.py # Text and JSON output formatters +``` + +--- + +## New Files + +### Source Files + +All paths relative to `packages/data-designer/src/data_designer/cli/`. + +| File | Lines | Purpose | +|------|-------|---------| +| `services/introspection/__init__.py` | 64 | Public exports for all introspection modules | +| `services/introspection/pydantic_inspector.py` | 257 | Core Pydantic model introspection with `FieldDetail` and `ModelSchema` dataclasses | +| `services/introspection/discovery.py` | 284 | 8 discovery functions + centralized `DEFAULT_FIELD_DESCRIPTIONS` (108 entries) | +| `services/introspection/method_inspector.py` | 251 | Class method introspection via `inspect.signature()` with Google-style docstring parsing | +| `services/introspection/formatters.py` | 183 | Text (YAML-style) and JSON formatters for all data types | +| `controllers/agent_context_controller.py` | 263 | Controller orchestrating discovery, inspection, formatting, and output | +| `commands/agent_context.py` | 115 | Typer subcommand group with 10 commands | + +### Modified Files + +| File | Change | +|------|--------| +| `main.py` | Added `agent_context` import and `app.add_typer(...)` registration | +| `controllers/__init__.py` | Added `AgentContextController` to exports | + +### Test Files + +All paths relative to `packages/data-designer/tests/cli/`. 
+ +| File | Tests | Purpose | +|------|-------|---------| +| `services/introspection/test_pydantic_inspector.py` | 38 | Unit tests for type introspection (field extraction, enum detection, nested models, cycles, depth limits) | +| `services/introspection/test_discovery.py` | 17 | Tests all 8 discovery functions find expected types | +| `services/introspection/test_method_inspector.py` | 11 | Tests docstring parsing, method signature extraction, public/private filtering | +| `services/introspection/test_formatters.py` | 19 | Tests all text and JSON formatters (schemas, methods, type lists, overview) | +| `controllers/test_agent_context_controller.py` | 15 | Controller orchestration tests using `capsys` | +| `commands/test_agent_context_command.py` | 13 | End-to-end CLI integration tests via `typer.testing.CliRunner` | +| **Total** | **113** | | + +--- + +## Key Design Decisions + +### 1. Dataclass-based structured data (not raw tuples) + +The existing skill scripts used raw tuples for field information. The new implementation uses typed dataclasses: + +```python +@dataclass +class FieldDetail: + name: str + type_str: str + description: str + enum_values: list[str] | None = None + nested_schema: ModelSchema | None = None + +@dataclass +class ModelSchema: + class_name: str + description: str + type_key: str | None = None + type_value: str | None = None + fields: list[FieldDetail] = field(default_factory=list) +``` + +This enables clean separation between introspection and formatting, and makes JSON output trivial. + +### 2. Plain text output (no Rich/ANSI) + +All output uses `typer.echo()` producing plain text. Agents parse plain text more reliably than colored/ANSI output. The YAML-style text format is backward-compatible with the existing skill script output. + +### 3. 
Dynamic discovery for extensibility + +Column configs, sampler types, validator types, and processor configs are discovered dynamically by iterating `dir(data_designer.config)` and matching class name patterns. This means new types added to the config package are automatically picked up without code changes. + +### 4. Cycle and depth protection + +Nested model expansion uses a `seen` set (by class name) and `max_depth` parameter (default 3) to prevent infinite recursion from self-referential or deeply nested models. + +### 5. Centralized field descriptions + +All 108 default field descriptions are in a single `DEFAULT_FIELD_DESCRIPTIONS` dict in `discovery.py`, replacing 4 separate per-script copies. + +--- + +## Expected Behavior + +### Common Flags + +For type-based commands (columns, samplers, validators, processors): +- **Positional `TYPE`**: Show details for a specific type (e.g., `llm-text`, `category`) +- **`TYPE` = `all`**: Show details for all types in the category +- **No `TYPE` (no `--list`)**: Show summary table of available types +- **`--list` / `-l`**: Show summary table of available types +- **`--format json` / `-f json`**: JSON output instead of text + +For other commands (models, builder, constraints, seeds, mcp, overview): +- **`--format json` / `-f json`**: JSON output instead of text + +### Command-by-Command Behavior + +#### `data-designer agent-context columns` + +Shows column configuration types discovered from `data_designer.config`. 
+ +```bash +# List all column types +$ data-designer agent-context columns --list +column_type config_class +----------- ------------------------- +custom CustomColumnConfig +embedding EmbeddingColumnConfig +expression ExpressionColumnConfig +llm-code LLMCodeColumnConfig +llm-judge LLMJudgeColumnConfig +llm-structured LLMStructuredColumnConfig +llm-text LLMTextColumnConfig +sampler SamplerColumnConfig +seed-dataset SeedDatasetColumnConfig +validation ValidationColumnConfig + +# Show details for a specific type +$ data-designer agent-context columns llm-text +LLMTextColumnConfig: + column_type: llm-text + description: Configuration for LLM-based text generation columns. + fields: + name: + type: str + description: Unique column name in the generated dataset + prompt: + type: str + description: Jinja2 template for the LLM prompt... + ... + +# JSON format +$ data-designer agent-context columns llm-text --format json +{ + "class_name": "LLMTextColumnConfig", + "description": "Configuration for LLM-based text generation columns.", + "column_type": "llm-text", + "fields": [ + {"name": "name", "type": "str", "description": "..."}, + ... + ] +} + +# Unknown type exits with error +$ data-designer agent-context columns nonexistent +Error: Unknown column_type 'nonexistent' +Available types: custom, embedding, expression, ... +``` + +#### `data-designer agent-context samplers` + +Shows sampler types discovered from `SamplerType` enum and their params classes. Type lookups are case-insensitive, and the type value is displayed in uppercase (e.g., `CATEGORY`). + +```bash +$ data-designer agent-context samplers category +CategorySamplerParams: + sampler_type: CATEGORY + description: ... + fields: + values: + type: list[str] + description: List of categorical values to sample from + ... +``` + +#### `data-designer agent-context validators` + +Shows validator types (CODE, REMOTE, LOCAL_CALLABLE) and their params classes. Same pattern as samplers. 
+ +#### `data-designer agent-context processors` + +Shows processor types (drop_columns, templated_columns) and their config classes. + +#### `data-designer agent-context models` + +Shows all model-related types: `ModelConfig`, `ChatCompletionInferenceParams`, `EmbeddingInferenceParams`, `ImageInferenceParams`, `ImageContext`, `UniformDistribution`, `ManualDistribution`. + +```bash +$ data-designer agent-context models +# Data Designer Model Configuration Reference +# 7 types + +ChatCompletionInferenceParams: + description: ... + fields: + temperature: + type: float | UniformDistribution | ManualDistribution | None + ... +... +``` + +#### `data-designer agent-context builder` + +Shows `DataDesignerConfigBuilder` method signatures and documentation, extracted via `inspect.signature()` and Google-style docstring parsing. + +```bash +$ data-designer agent-context builder +DataDesignerConfigBuilder Methods: + + add_column(column: ColumnConfig) -> Self + Add a column configuration to the builder. + Parameters: + column: ColumnConfig — The column configuration to add. + ... +``` + +#### `data-designer agent-context constraints` + +Shows constraint types: `ScalarInequalityConstraint`, `ColumnInequalityConstraint`, `InequalityOperator`. + +#### `data-designer agent-context seeds` + +Shows seed dataset types: `SeedConfig`, `SamplingStrategy`, `LocalFileSeedSource`, `HuggingFaceSeedSource`, `DataFrameSeedSource`, `IndexRange`, `PartitionBlock`. + +#### `data-designer agent-context mcp` + +Shows MCP types: `MCPProvider`, `LocalStdioMCPProvider`, `ToolConfig`. + +#### `data-designer agent-context overview` + +Compact API cheatsheet with type counts, builder method summaries, and quick-start commands. 
+ +```bash +$ data-designer agent-context overview +Data Designer API Overview +========================== + +Type Counts: + Column types: 10 + Sampler types: 12 + Validator types: 3 + Processor types: 2 + Model configs: 7 + Constraint types: 3 + Seed types: 7 + MCP types: 3 + +Builder Methods (DataDesignerConfigBuilder): + add_column(...) — Add a column configuration to the builder. + add_constraint(...) — Add a constraint to the builder. + ... + +Quick Start Commands: + data-designer agent-context columns --list + data-designer agent-context columns all + data-designer agent-context columns llm-text + data-designer agent-context samplers category + data-designer agent-context builder +``` + +--- + +## Improvements Over Skill Scripts + +| Aspect | Skill Scripts | Agent Context CLI | +|--------|--------------|-------------------| +| **Location** | External (`skill/data-designer/scripts/`) | Library (`data_designer.cli`) | +| **Data structure** | Raw tuples, print directly | Dataclasses (`FieldDetail`, `ModelSchema`, `MethodInfo`, `ParamInfo`) | +| **Output formats** | Text only | Text + JSON (`--format json`) | +| **API coverage** | 4 domains (columns, samplers, validators, processors) | 9 domains (+models, builder, constraints, seeds, MCP, overview) | +| **Field descriptions** | 4 separate dicts | 1 centralized dict (108 entries) | +| **Builder introspection** | None | Full method signatures + docstring parsing | +| **Error handling** | Varies | Consistent: error message + exit code 1 | +| **Testability** | Script-level test | 113 unit + integration tests at every layer | + +--- + +## Test Coverage Summary + +### Unit Tests (85 tests) + +**`test_pydantic_inspector.py` (38 tests):** +- `_is_basemodel_subclass`: 5 tests (subclass, BaseModel itself, str, enum, non-type) +- `_is_enum_subclass`: 4 tests (subclass, Enum itself, str, non-type) +- `_extract_enum_class`: 5 tests (direct, optional, annotated, non-enum, None) +- `extract_nested_basemodel`: 10 tests (direct, 
list, optional, optional-list, dict, annotated, discriminated union, primitive, None, BaseModel itself) +- `format_type`: 3 tests (str, int, optional) +- `get_brief_description`: 2 tests (with/without docstring) +- `get_field_info`: 4 tests (returns FieldDetails, default descriptions, enum values, non-enum) +- `build_model_schema`: 5 tests (basic structure, type key/value, nested expansion, cycle protection, depth limiting) + +**`test_discovery.py` (17 tests):** +- 2 tests per discovery function (returns dict + contains expected keys) for all 8 functions +- 1 extra test for `discover_column_configs` (values are classes with model_fields) + +**`test_method_inspector.py` (11 tests):** +- `_parse_google_docstring_args`: 4 tests (basic, empty, no args section, multiline) +- `inspect_class_methods`: 7 tests (public only, returns MethodInfo, signature content, description, parameters, include private, init included) + +**`test_formatters.py` (19 tests):** +- `format_model_schema_text`: 4 tests (basic, type key, nested, enum values) +- `format_model_schema_json`: 3 tests (basic, type key, nested) +- `format_method_info_text`: 3 tests (basic, with class name, without) +- `format_method_info_json`: 2 tests (basic, multiple methods) +- `format_type_list_text`: 3 tests (basic, alignment, empty) +- `format_overview_text`: 4 tests (header, type counts, builder methods, quick start) + +### Controller Tests (15 tests) + +**`test_agent_context_controller.py`:** +- `show_columns`: 6 tests (list mode, specific type, all, nonexistent exits, JSON format, list JSON) +- `show_overview`: 2 tests (text, JSON) +- `show_samplers`: 2 tests (list, specific) +- `show_models`: 1 test +- `show_builder`: 1 test +- `show_constraints`: 1 test +- `show_seeds`: 1 test +- `show_mcp`: 1 test + +### Integration Tests (13 tests) + +**`test_agent_context_command.py`:** +- Via `typer.testing.CliRunner` against the real `app`: + - `agent-context --help` + - `columns --list`, `columns llm-text`, `columns 
llm-text --format json`, `columns nonexistent` + - `samplers category`, `samplers --list` + - `overview`, `builder`, `models`, `constraints`, `seeds`, `mcp` + +--- + +## Verification Checklist + +- [x] `make check-all` passes (ruff format + lint) +- [x] All 113 new tests pass +- [x] All 670 total project tests pass (113 new + 557 existing) +- [x] SPDX license headers on all files (2025-2026) +- [x] Type annotations on all functions +- [x] Absolute imports only (no relative imports) +- [x] No in-function imports (except `data_designer.config` in discovery functions, which is intentional to avoid circular imports at module load time) +- [x] Plain text output (no Rich/ANSI) for agent compatibility +- [x] JSON output is valid `json.loads()`-parseable +- [x] Error handling: unknown types produce clear error message + exit code 1 +- [x] Backward compatibility: YAML-style text format matches existing skill script output diff --git a/packages/data-designer-config/src/data_designer/config/base.py b/packages/data-designer-config/src/data_designer/config/base.py index a4e55fa27..3f02cd881 100644 --- a/packages/data-designer-config/src/data_designer/config/base.py +++ b/packages/data-designer-config/src/data_designer/config/base.py @@ -31,14 +31,19 @@ class SingleColumnConfig(ConfigBase, ABC): name: Unique name of the column to be generated. drop: If True, the column will be generated but removed from the final dataset. Useful for intermediate columns that are dependencies for other columns. + allow_resize: If True, the column is allowed to be resized during generation. column_type: Discriminator field that identifies the specific column type. Subclasses must override this field to specify the column type with a `Literal` value. 
""" - name: str - drop: bool = False - allow_resize: bool = False - column_type: str + name: str = Field(description="Unique name of the column to be generated") + drop: bool = Field( + default=False, description="If True, the column will be generated but removed from the final dataset" + ) + allow_resize: bool = Field( + default=False, description="If True, the column is allowed to be resized during generation" + ) + column_type: str = Field(description="Discriminator field that identifies the specific column type") @staticmethod def get_column_emoji() -> str: diff --git a/packages/data-designer-config/src/data_designer/config/column_configs.py b/packages/data-designer-config/src/data_designer/config/column_configs.py index 49dbb8311..aa1a7f2a0 100644 --- a/packages/data-designer-config/src/data_designer/config/column_configs.py +++ b/packages/data-designer-config/src/data_designer/config/column_configs.py @@ -56,11 +56,22 @@ class SamplerColumnConfig(SingleColumnConfig): ``` """ - sampler_type: SamplerType - params: Annotated[SamplerParamsT, Discriminator("sampler_type")] - conditional_params: dict[str, Annotated[SamplerParamsT, Discriminator("sampler_type")]] = {} - convert_to: str | None = None - column_type: Literal["sampler"] = "sampler" + sampler_type: SamplerType = Field( + description="Type of sampler to use (e.g., uuid, category, uniform, gaussian, person, datetime)" + ) + params: Annotated[SamplerParamsT, Discriminator("sampler_type")] = Field( + description="Parameters specific to the chosen sampler type" + ) + conditional_params: dict[str, Annotated[SamplerParamsT, Discriminator("sampler_type")]] = Field( + default_factory=dict, + description="Optional dictionary for conditional parameters; keys are conditions, values are params to use when met", + ) + convert_to: str | None = Field( + default=None, description="Optional type conversion after sampling: 'float', 'int', or 'str'" + ) + column_type: Literal["sampler"] = Field( + default="sampler", 
description="Discriminator field, always 'sampler' for this configuration type" + ) @staticmethod def get_column_emoji() -> str: @@ -136,14 +147,28 @@ class LLMTextColumnConfig(SingleColumnConfig): column_type: Discriminator field, always "llm-text" for this configuration type. """ - prompt: str - model_alias: str - system_prompt: str | None = None - multi_modal_context: list[ImageContext] | None = None - tool_alias: str | None = None - with_trace: TraceType = TraceType.NONE - extract_reasoning_content: bool = False - column_type: Literal["llm-text"] = "llm-text" + prompt: str = Field( + description="Jinja2 template for the LLM prompt; can reference other columns via {{ column_name }}" + ) + model_alias: str = Field(description="Alias of the model configuration to use for generation") + system_prompt: str | None = Field( + default=None, description="Optional system prompt to set model behavior and constraints" + ) + multi_modal_context: list[ImageContext] | None = Field( + default=None, description="Optional list of ImageContext for vision model inputs" + ) + tool_alias: str | None = Field( + default=None, description="Optional alias of the tool configuration to use for MCP tool calls" + ) + with_trace: TraceType = Field( + default=TraceType.NONE, description="Trace capture mode: NONE, LAST_MESSAGE, or ALL_MESSAGES" + ) + extract_reasoning_content: bool = Field( + default=False, description="If True, capture chain-of-thought in {name}__reasoning_content column" + ) + column_type: Literal["llm-text"] = Field( + default="llm-text", description="Discriminator field, always 'llm-text' for this configuration type" + ) @staticmethod def get_column_emoji() -> str: @@ -219,8 +244,12 @@ class LLMCodeColumnConfig(LLMTextColumnConfig): column containing the reasoning content from the final assistant response. 
""" - code_lang: CodeLang - column_type: Literal["llm-code"] = "llm-code" + code_lang: CodeLang = Field( + description="Target programming language or SQL dialect for code extraction from LLM response" + ) + column_type: Literal["llm-code"] = Field( + default="llm-code", description="Discriminator field, always 'llm-code' for this configuration type" + ) @staticmethod def get_column_emoji() -> str: @@ -252,8 +281,12 @@ class LLMStructuredColumnConfig(LLMTextColumnConfig): column containing the reasoning content from the final assistant response. """ - output_format: dict | type[BaseModel] - column_type: Literal["llm-structured"] = "llm-structured" + output_format: dict | type[BaseModel] = Field( + description="Pydantic model or JSON schema dict defining the expected structured output shape" + ) + column_type: Literal["llm-structured"] = Field( + default="llm-structured", description="Discriminator field, always 'llm-structured' for this configuration type" + ) @staticmethod def get_column_emoji() -> str: @@ -317,8 +350,12 @@ class LLMJudgeColumnConfig(LLMTextColumnConfig): column containing the reasoning content from the final assistant response. """ - scores: list[Score] = Field(..., min_length=1) - column_type: Literal["llm-judge"] = "llm-judge" + scores: list[Score] = Field( + ..., min_length=1, description="List of Score objects defining rubric criteria for LLM judge evaluation" + ) + column_type: Literal["llm-judge"] = Field( + default="llm-judge", description="Discriminator field, always 'llm-judge' for this configuration type" + ) @staticmethod def get_column_emoji() -> str: @@ -341,10 +378,14 @@ class ExpressionColumnConfig(SingleColumnConfig): column_type: Discriminator field, always "expression" for this configuration type. 
""" - name: str - expr: str - dtype: Literal["int", "float", "str", "bool"] = "str" - column_type: Literal["expression"] = "expression" + name: str = Field(description="Unique name of the column to be generated") + expr: str = Field(description="Jinja2 expression to compute the column value from other columns") + dtype: Literal["int", "float", "str", "bool"] = Field( + default="str", description="Data type for expression result: 'int', 'float', 'str', or 'bool'" + ) + column_type: Literal["expression"] = Field( + default="expression", description="Discriminator field, always 'expression' for this configuration type" + ) @staticmethod def get_column_emoji() -> str: @@ -410,11 +451,13 @@ class ValidationColumnConfig(SingleColumnConfig): column_type: Discriminator field, always "validation" for this configuration type. """ - target_columns: list[str] - validator_type: ValidatorType - validator_params: ValidatorParamsT + target_columns: list[str] = Field(description="List of column names to validate") + validator_type: ValidatorType = Field(description="Validation method: CODE, LOCAL_CALLABLE, or REMOTE") + validator_params: ValidatorParamsT = Field(description="Validator-specific parameters (e.g., CodeValidatorParams)") batch_size: int = Field(default=10, ge=1, description="Number of records to process in each batch") - column_type: Literal["validation"] = "validation" + column_type: Literal["validation"] = Field( + default="validation", description="Discriminator field, always 'validation' for this configuration type" + ) @staticmethod def get_column_emoji() -> str: @@ -441,7 +484,9 @@ class SeedDatasetColumnConfig(SingleColumnConfig): column_type: Discriminator field, always "seed-dataset" for this configuration type. 
""" - column_type: Literal["seed-dataset"] = "seed-dataset" + column_type: Literal["seed-dataset"] = Field( + default="seed-dataset", description="Discriminator field, always 'seed-dataset' for this configuration type" + ) @staticmethod def get_column_emoji() -> str: @@ -468,9 +513,11 @@ class EmbeddingColumnConfig(SingleColumnConfig): column_type: Discriminator field, always "embedding" for this configuration type. """ - target_column: str - model_alias: str - column_type: Literal["embedding"] = "embedding" + target_column: str = Field(description="Name of the text column to generate embeddings for") + model_alias: str = Field(description="Alias of the model to use for embedding generation") + column_type: Literal["embedding"] = Field( + default="embedding", description="Discriminator field, always 'embedding' for this configuration type" + ) @staticmethod def get_column_emoji() -> str: @@ -502,10 +549,16 @@ class ImageColumnConfig(SingleColumnConfig): column_type: Discriminator field, always "image" for this configuration type. 
""" - prompt: str - model_alias: str - multi_modal_context: list[ImageContext] | None = None - column_type: Literal["image"] = "image" + prompt: str = Field( + description="Jinja2 template for the image generation prompt; can reference other columns via {{ column_name }}" + ) + model_alias: str = Field(description="Alias of the model to use for image generation") + multi_modal_context: list[ImageContext] | None = Field( + default=None, description="Optional list of ImageContext for image-to-image generation inputs" + ) + column_type: Literal["image"] = Field( + default="image", description="Discriminator field, always 'image' for this configuration type" + ) @staticmethod def get_column_emoji() -> str: @@ -562,7 +615,9 @@ class CustomColumnConfig(SingleColumnConfig): default=None, description="Optional typed configuration object passed as second argument to generator function", ) - column_type: Literal["custom"] = "custom" + column_type: Literal["custom"] = Field( + default="custom", description="Discriminator field, always 'custom' for this configuration type" + ) @field_validator("generator_function") @classmethod diff --git a/packages/data-designer-config/src/data_designer/config/mcp.py b/packages/data-designer-config/src/data_designer/config/mcp.py index fe870fa86..e0683e3c7 100644 --- a/packages/data-designer-config/src/data_designer/config/mcp.py +++ b/packages/data-designer-config/src/data_designer/config/mcp.py @@ -33,10 +33,12 @@ class MCPProvider(ConfigBase): ... 
) """ - provider_type: Literal["sse"] = "sse" - name: str - endpoint: str - api_key: str | None = None + provider_type: Literal["sse"] = Field( + default="sse", description="Transport type discriminator, always 'sse' for remote MCP providers" + ) + name: str = Field(description="Unique name used to reference this MCP provider") + endpoint: str = Field(description="SSE endpoint URL for connecting to the remote MCP server") + api_key: str | None = Field(default=None, description="Optional API key for authentication") class LocalStdioMCPProvider(ConfigBase): @@ -63,11 +65,15 @@ class LocalStdioMCPProvider(ConfigBase): ... ) """ - provider_type: Literal["stdio"] = "stdio" - name: str - command: str - args: list[str] = Field(default_factory=list) - env: dict[str, str] = Field(default_factory=dict) + provider_type: Literal["stdio"] = Field( + default="stdio", description="Transport type discriminator, always 'stdio' for local subprocess MCP providers" + ) + name: str = Field(description="Unique name used to reference this MCP provider") + command: str = Field(description="Executable to launch the MCP server via stdio transport") + args: list[str] = Field(default_factory=list, description="Arguments passed to the MCP server executable") + env: dict[str, str] = Field( + default_factory=dict, description="Environment variables passed to the MCP server subprocess" + ) MCPProviderT: TypeAlias = Annotated[MCPProvider | LocalStdioMCPProvider, Field(discriminator="provider_type")] @@ -102,8 +108,12 @@ class ToolConfig(ConfigBase): ... 
) """ - tool_alias: str - providers: list[str] - allow_tools: list[str] | None = None - max_tool_call_turns: int = Field(default=5, ge=1) - timeout_sec: float | None = Field(default=None, gt=0) + tool_alias: str = Field(description="User-defined alias to reference this tool configuration in column configs") + providers: list[str] = Field(description="Names of the MCP providers to use for tool calls") + allow_tools: list[str] | None = Field( + default=None, description="Optional allowlist of tool names that restricts which tools are permitted" + ) + max_tool_call_turns: int = Field( + default=5, ge=1, description="Maximum number of tool-calling turns permitted in a single generation" + ) + timeout_sec: float | None = Field(default=None, gt=0, description="Timeout in seconds for MCP tool calls") diff --git a/packages/data-designer-config/src/data_designer/config/models.py b/packages/data-designer-config/src/data_designer/config/models.py index 578b34eec..536348907 100644 --- a/packages/data-designer-config/src/data_designer/config/models.py +++ b/packages/data-designer-config/src/data_designer/config/models.py @@ -56,9 +56,11 @@ class DistributionType(str, Enum): class ModalityContext(ABC, BaseModel): - modality: Modality - column_name: str - data_type: ModalityDataType | None = None + modality: Modality = Field(description="The modality type for this context") + column_name: str = Field(description="Name of the column containing the modality data") + data_type: ModalityDataType | None = Field( + default=None, description="Format of the modality data ('url' or 'base64')" + ) @abstractmethod def get_contexts(self, record: dict, *, base_path: str | None = None) -> list[dict[str, Any]]: ... @@ -76,8 +78,8 @@ class ImageContext(ModalityContext): image_format: Image format (required when data_type is explicitly "base64"). 
""" - modality: Modality = Modality.IMAGE - image_format: ImageFormat | None = None + modality: Modality = Field(default=Modality.IMAGE, description="The modality type, always 'image' for ImageContext") + image_format: ImageFormat | None = Field(default=None, description="Image format (required for base64 data)") def get_contexts(self, record: dict, *, base_path: str | None = None) -> list[dict[str, Any]]: """Get the contexts for the image modality. @@ -179,8 +181,8 @@ def _validate_image_format(self) -> Self: class Distribution(ABC, ConfigBase, Generic[DistributionParamsT]): - distribution_type: DistributionType - params: DistributionParamsT + distribution_type: DistributionType = Field(description="Type of distribution for sampling") + params: DistributionParamsT = Field(description="Parameters for the distribution") @abstractmethod def sample(self) -> float: ... @@ -194,8 +196,10 @@ class ManualDistributionParams(ConfigBase): weights: Optional list of weights for each value. If not provided, all values have equal probability. """ - values: list[float] = Field(min_length=1) - weights: list[float] | None = None + values: list[float] = Field(min_length=1, description="List of possible values to sample from") + weights: list[float] | None = Field( + default=None, description="Optional probability weights for each value; automatically normalized to sum to 1.0" + ) @model_validator(mode="after") def _normalize_weights(self) -> Self: @@ -221,8 +225,10 @@ class ManualDistribution(Distribution[ManualDistributionParams]): params: Distribution parameters (values, weights). 
""" - distribution_type: DistributionType | None = "manual" - params: ManualDistributionParams + distribution_type: DistributionType | None = Field( + default="manual", description="Type of distribution, always 'manual' for this class" + ) + params: ManualDistributionParams = Field(description="Manual distribution parameters (values and optional weights)") def sample(self) -> float: """Sample a value from the manual distribution. @@ -241,8 +247,8 @@ class UniformDistributionParams(ConfigBase): high: Upper bound (exclusive). """ - low: float - high: float + low: float = Field(description="Lower bound of the uniform distribution (inclusive)") + high: float = Field(description="Upper bound of the uniform distribution (exclusive)") @model_validator(mode="after") def _validate_low_lt_high(self) -> Self: @@ -262,8 +268,10 @@ class UniformDistribution(Distribution[UniformDistributionParams]): params: Distribution parameters (low, high). """ - distribution_type: DistributionType | None = "uniform" - params: UniformDistributionParams + distribution_type: DistributionType | None = Field( + default="uniform", description="Type of distribution, always 'uniform' for this class" + ) + params: UniformDistributionParams = Field(description="Uniform distribution parameters (low and high bounds)") def sample(self) -> float: """Sample a value from the uniform distribution. @@ -293,10 +301,14 @@ class BaseInferenceParams(ConfigBase, ABC): extra_body: Additional parameters to pass to the model API. 
""" - generation_type: GenerationType - max_parallel_requests: int = Field(default=4, ge=1) - timeout: int | None = Field(default=None, ge=1) - extra_body: dict[str, Any] | None = None + generation_type: GenerationType = Field(description="Type of generation (chat-completion, embedding, or image)") + max_parallel_requests: int = Field( + default=4, ge=1, description="Maximum number of parallel requests to the model API" + ) + timeout: int | None = Field(default=None, ge=1, description="Timeout in seconds for each request") + extra_body: dict[str, Any] | None = Field( + default=None, description="Additional parameters to pass to the model API" + ) @property def generate_kwargs(self) -> dict[str, Any]: @@ -361,10 +373,19 @@ class ChatCompletionInferenceParams(BaseInferenceParams): max_tokens: Maximum number of tokens to generate in the response. """ - generation_type: Literal[GenerationType.CHAT_COMPLETION] = GenerationType.CHAT_COMPLETION - temperature: float | DistributionT | None = None - top_p: float | DistributionT | None = None - max_tokens: int | None = Field(default=None, ge=1) + generation_type: Literal[GenerationType.CHAT_COMPLETION] = Field( + default=GenerationType.CHAT_COMPLETION, + description="Type of generation, always 'chat-completion' for this class", + ) + temperature: float | DistributionT | None = Field( + default=None, description="Sampling temperature (0.0-2.0); can be a fixed value or a distribution" + ) + top_p: float | DistributionT | None = Field( + default=None, description="Nucleus sampling probability (0.0-1.0); can be a fixed value or a distribution" + ) + max_tokens: int | None = Field( + default=None, ge=1, description="Maximum number of tokens to generate in the response" + ) @property def generate_kwargs(self) -> dict[str, Any]: @@ -446,9 +467,13 @@ class EmbeddingInferenceParams(BaseInferenceParams): dimensions: Number of dimensions for the embedding. 
""" - generation_type: Literal[GenerationType.EMBEDDING] = GenerationType.EMBEDDING - encoding_format: Literal["float", "base64"] = "float" - dimensions: int | None = None + generation_type: Literal[GenerationType.EMBEDDING] = Field( + default=GenerationType.EMBEDDING, description="Type of generation, always 'embedding' for this class" + ) + encoding_format: Literal["float", "base64"] = Field( + default="float", description="Format of the embedding encoding ('float' or 'base64')" + ) + dimensions: int | None = Field(default=None, description="Number of dimensions for the embedding") @property def generate_kwargs(self) -> dict[str, float | int]: @@ -489,7 +514,9 @@ class ImageInferenceParams(BaseInferenceParams): ``` """ - generation_type: Literal[GenerationType.IMAGE] = GenerationType.IMAGE + generation_type: Literal[GenerationType.IMAGE] = Field( + default=GenerationType.IMAGE, description="Type of generation, always 'image' for this class" + ) InferenceParamsT: TypeAlias = Annotated[ @@ -510,11 +537,14 @@ class ModelConfig(ConfigBase): skip_health_check: Whether to skip the health check for this model. Defaults to False. 
""" - alias: str - model: str - inference_parameters: InferenceParamsT = Field(default_factory=ChatCompletionInferenceParams) - provider: str | None = None - skip_health_check: bool = False + alias: str = Field(description="User-defined alias to reference in column configurations") + model: str = Field(description="Model identifier (e.g., from build.nvidia.com or other providers)") + inference_parameters: InferenceParamsT = Field( + default_factory=ChatCompletionInferenceParams, + description="Inference parameters for the model (temperature, top_p, max_tokens, etc.)", + ) + provider: str | None = Field(default=None, description="Optional model provider name if using custom providers") + skip_health_check: bool = Field(default=False, description="Whether to skip the health check for this model") @property def generation_type(self) -> GenerationType: diff --git a/packages/data-designer-config/src/data_designer/config/processors.py b/packages/data-designer-config/src/data_designer/config/processors.py index 733dd5ab5..6ac61c0e8 100644 --- a/packages/data-designer-config/src/data_designer/config/processors.py +++ b/packages/data-designer-config/src/data_designer/config/processors.py @@ -57,7 +57,10 @@ class DropColumnsProcessorConfig(ProcessorConfig): """ column_names: list[str] = Field(description="List of column names to drop from the output dataset.") - processor_type: Literal[ProcessorType.DROP_COLUMNS] = ProcessorType.DROP_COLUMNS + processor_type: Literal[ProcessorType.DROP_COLUMNS] = Field( + default=ProcessorType.DROP_COLUMNS, + description="Discriminator field, always 'drop_columns' for this processor type", + ) class SchemaTransformProcessorConfig(ProcessorConfig): @@ -97,7 +100,10 @@ class SchemaTransformProcessorConfig(ProcessorConfig): References to columns "col1" and "col2" in the templates will be replaced with the actual values of the columns in the dataset. 
""", ) - processor_type: Literal[ProcessorType.SCHEMA_TRANSFORM] = ProcessorType.SCHEMA_TRANSFORM + processor_type: Literal[ProcessorType.SCHEMA_TRANSFORM] = Field( + default=ProcessorType.SCHEMA_TRANSFORM, + description="Discriminator field, always 'schema_transform' for this processor type", + ) @field_validator("template") def validate_template(cls, v: dict[str, Any]) -> dict[str, Any]: diff --git a/packages/data-designer-config/src/data_designer/config/sampler_constraints.py b/packages/data-designer-config/src/data_designer/config/sampler_constraints.py index 86dc2c09c..e935424e7 100644 --- a/packages/data-designer-config/src/data_designer/config/sampler_constraints.py +++ b/packages/data-designer-config/src/data_designer/config/sampler_constraints.py @@ -6,6 +6,7 @@ from abc import ABC, abstractmethod from enum import Enum +from pydantic import Field from typing_extensions import TypeAlias from data_designer.config.base import ConfigBase @@ -24,7 +25,7 @@ class InequalityOperator(str, Enum): class Constraint(ConfigBase, ABC): - target_column: str + target_column: str = Field(description="Name of the column this constraint applies to") @property @abstractmethod @@ -32,8 +33,8 @@ def constraint_type(self) -> ConstraintType: ... 
class ScalarInequalityConstraint(Constraint): - rhs: float - operator: InequalityOperator + rhs: float = Field(description="Scalar value to compare against") + operator: InequalityOperator = Field(description="Comparison operator (lt, le, gt, ge)") @property def constraint_type(self) -> ConstraintType: @@ -41,8 +42,8 @@ def constraint_type(self) -> ConstraintType: class ColumnInequalityConstraint(Constraint): - rhs: str - operator: InequalityOperator + rhs: str = Field(description="Name of the other column to compare against") + operator: InequalityOperator = Field(description="Comparison operator (lt, le, gt, ge)") @property def constraint_type(self) -> ConstraintType: diff --git a/packages/data-designer-config/src/data_designer/config/sampler_params.py b/packages/data-designer-config/src/data_designer/config/sampler_params.py index aafad16db..01af4c983 100644 --- a/packages/data-designer-config/src/data_designer/config/sampler_params.py +++ b/packages/data-designer-config/src/data_designer/config/sampler_params.py @@ -68,7 +68,9 @@ class CategorySamplerParams(ConfigBase): "Larger values will be sampled with higher probability." ), ) - sampler_type: Literal[SamplerType.CATEGORY] = SamplerType.CATEGORY + sampler_type: Literal[SamplerType.CATEGORY] = Field( + default=SamplerType.CATEGORY, description="Sampler type discriminator, always 'category' for this sampler" + ) @model_validator(mode="after") def _normalize_weights_if_needed(self) -> Self: @@ -109,7 +111,9 @@ class DatetimeSamplerParams(ConfigBase): default="D", description="Sampling units, e.g. 
the smallest possible time interval between samples.", ) - sampler_type: Literal[SamplerType.DATETIME] = SamplerType.DATETIME + sampler_type: Literal[SamplerType.DATETIME] = Field( + default=SamplerType.DATETIME, description="Sampler type discriminator, always 'datetime' for this sampler" + ) @field_validator("start", "end") @classmethod @@ -140,7 +144,9 @@ class SubcategorySamplerParams(ConfigBase): ..., description="Mapping from each value of parent category to a list of subcategory values.", ) - sampler_type: Literal[SamplerType.SUBCATEGORY] = SamplerType.SUBCATEGORY + sampler_type: Literal[SamplerType.SUBCATEGORY] = Field( + default=SamplerType.SUBCATEGORY, description="Sampler type discriminator, always 'subcategory' for this sampler" + ) class TimeDeltaSamplerParams(ConfigBase): @@ -192,7 +198,9 @@ class TimeDeltaSamplerParams(ConfigBase): default="D", description="Sampling units, e.g. the smallest possible time interval between samples.", ) - sampler_type: Literal[SamplerType.TIMEDELTA] = SamplerType.TIMEDELTA + sampler_type: Literal[SamplerType.TIMEDELTA] = Field( + default=SamplerType.TIMEDELTA, description="Sampler type discriminator, always 'timedelta' for this sampler" + ) @model_validator(mode="after") def _validate_min_less_than_max(self) -> Self: @@ -225,7 +233,9 @@ class UUIDSamplerParams(ConfigBase): default=False, description="If true, all letters in the UUID will be capitalized.", ) - sampler_type: Literal[SamplerType.UUID] = SamplerType.UUID + sampler_type: Literal[SamplerType.UUID] = Field( + default=SamplerType.UUID, description="Sampler type discriminator, always 'uuid' for this sampler" + ) @property def last_index(self) -> int: @@ -264,7 +274,9 @@ class ScipySamplerParams(ConfigBase): decimal_places: int | None = Field( default=None, description="Number of decimal places to round the sampled values to." 
) - sampler_type: Literal[SamplerType.SCIPY] = SamplerType.SCIPY + sampler_type: Literal[SamplerType.SCIPY] = Field( + default=SamplerType.SCIPY, description="Sampler type discriminator, always 'scipy' for this sampler" + ) class BinomialSamplerParams(ConfigBase): @@ -281,7 +293,9 @@ class BinomialSamplerParams(ConfigBase): n: int = Field(..., description="Number of trials.") p: float = Field(..., description="Probability of success on each trial.", ge=0.0, le=1.0) - sampler_type: Literal[SamplerType.BINOMIAL] = SamplerType.BINOMIAL + sampler_type: Literal[SamplerType.BINOMIAL] = Field( + default=SamplerType.BINOMIAL, description="Sampler type discriminator, always 'binomial' for this sampler" + ) class BernoulliSamplerParams(ConfigBase): @@ -297,7 +311,9 @@ class BernoulliSamplerParams(ConfigBase): """ p: float = Field(..., description="Probability of success.", ge=0.0, le=1.0) - sampler_type: Literal[SamplerType.BERNOULLI] = SamplerType.BERNOULLI + sampler_type: Literal[SamplerType.BERNOULLI] = Field( + default=SamplerType.BERNOULLI, description="Sampler type discriminator, always 'bernoulli' for this sampler" + ) class BernoulliMixtureSamplerParams(ConfigBase): @@ -337,7 +353,10 @@ class BernoulliMixtureSamplerParams(ConfigBase): ..., description="Parameters of the scipy.stats distribution given in `dist_name`.", ) - sampler_type: Literal[SamplerType.BERNOULLI_MIXTURE] = SamplerType.BERNOULLI_MIXTURE + sampler_type: Literal[SamplerType.BERNOULLI_MIXTURE] = Field( + default=SamplerType.BERNOULLI_MIXTURE, + description="Sampler type discriminator, always 'bernoulli_mixture' for this sampler", + ) class GaussianSamplerParams(ConfigBase): @@ -361,7 +380,9 @@ class GaussianSamplerParams(ConfigBase): decimal_places: int | None = Field( default=None, description="Number of decimal places to round the sampled values to." 
) - sampler_type: Literal[SamplerType.GAUSSIAN] = SamplerType.GAUSSIAN + sampler_type: Literal[SamplerType.GAUSSIAN] = Field( + default=SamplerType.GAUSSIAN, description="Sampler type discriminator, always 'gaussian' for this sampler" + ) class PoissonSamplerParams(ConfigBase): @@ -381,7 +402,9 @@ class PoissonSamplerParams(ConfigBase): """ mean: float = Field(..., description="Mean number of events in a fixed interval.") - sampler_type: Literal[SamplerType.POISSON] = SamplerType.POISSON + sampler_type: Literal[SamplerType.POISSON] = Field( + default=SamplerType.POISSON, description="Sampler type discriminator, always 'poisson' for this sampler" + ) class UniformSamplerParams(ConfigBase): @@ -403,7 +426,9 @@ class UniformSamplerParams(ConfigBase): decimal_places: int | None = Field( default=None, description="Number of decimal places to round the sampled values to." ) - sampler_type: Literal[SamplerType.UNIFORM] = SamplerType.UNIFORM + sampler_type: Literal[SamplerType.UNIFORM] = Field( + default=SamplerType.UNIFORM, description="Sampler type discriminator, always 'uniform' for this sampler" + ) ######################################### @@ -481,7 +506,9 @@ class PersonSamplerParams(ConfigBase): default=False, description="If True, then append synthetic persona columns to each generated person.", ) - sampler_type: Literal[SamplerType.PERSON] = SamplerType.PERSON + sampler_type: Literal[SamplerType.PERSON] = Field( + default=SamplerType.PERSON, description="Sampler type discriminator, always 'person' for this sampler" + ) @property def generator_kwargs(self) -> list[str]: @@ -564,7 +591,10 @@ class PersonFromFakerSamplerParams(ConfigBase): min_length=2, max_length=2, ) - sampler_type: Literal[SamplerType.PERSON_FROM_FAKER] = SamplerType.PERSON_FROM_FAKER + sampler_type: Literal[SamplerType.PERSON_FROM_FAKER] = Field( + default=SamplerType.PERSON_FROM_FAKER, + description="Sampler type discriminator, always 'person_from_faker' for this sampler", + ) @property def 
generator_kwargs(self) -> list[str]: diff --git a/packages/data-designer-config/src/data_designer/config/seed.py b/packages/data-designer-config/src/data_designer/config/seed.py index bdd9dae29..c791f954a 100644 --- a/packages/data-designer-config/src/data_designer/config/seed.py +++ b/packages/data-designer-config/src/data_designer/config/seed.py @@ -111,6 +111,11 @@ class SeedConfig(ConfigBase): ) """ - source: SeedSourceT - sampling_strategy: SamplingStrategy = SamplingStrategy.ORDERED - selection_strategy: IndexRange | PartitionBlock | None = None + source: SeedSourceT = Field(description="A SeedSource defining where the seed data exists") + sampling_strategy: SamplingStrategy = Field( + default=SamplingStrategy.ORDERED, + description="Strategy for how to sample rows: ORDERED (sequential) or SHUFFLE (random)", + ) + selection_strategy: IndexRange | PartitionBlock | None = Field( + default=None, description="Optional strategy to select a subset of the dataset (IndexRange or PartitionBlock)" + ) diff --git a/packages/data-designer-config/src/data_designer/config/seed_source.py b/packages/data-designer-config/src/data_designer/config/seed_source.py index c9f31eb46..df43deccd 100644 --- a/packages/data-designer-config/src/data_designer/config/seed_source.py +++ b/packages/data-designer-config/src/data_designer/config/seed_source.py @@ -26,13 +26,15 @@ class SeedSource(BaseModel, ABC): This serves as a discriminated union discriminator. 
""" - seed_type: str + seed_type: str = Field(description="Discriminator field identifying the seed source type") class LocalFileSeedSource(SeedSource): - seed_type: Literal["local"] = "local" + seed_type: Literal["local"] = Field( + default="local", description="Seed source type discriminator, always 'local' for local file sources" + ) - path: str + path: str = Field(description="Path to the local seed dataset file") @field_validator("path", mode="after") def validate_path(cls, v: str) -> str: @@ -53,7 +55,9 @@ def from_dataframe(cls, df: pd.DataFrame, path: str) -> Self: class HuggingFaceSeedSource(SeedSource): - seed_type: Literal["hf"] = "hf" + seed_type: Literal["hf"] = Field( + default="hf", description="Seed source type discriminator, always 'hf' for HuggingFace sources" + ) path: str = Field( ..., diff --git a/packages/data-designer/src/data_designer/cli/commands/agent_context.py b/packages/data-designer/src/data_designer/cli/commands/agent_context.py new file mode 100644 index 000000000..8a2a4e7e1 --- /dev/null +++ b/packages/data-designer/src/data_designer/cli/commands/agent_context.py @@ -0,0 +1,114 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +import typer + +from data_designer.cli.controllers.agent_context_controller import AgentContextController + +agent_context_app = typer.Typer( + name="agent-context", + help="Introspect Data Designer's API for agent consumption.", + no_args_is_help=True, +) + + +def _make_controller(output_format: str) -> AgentContextController: + return AgentContextController(output_format=output_format) + + +@agent_context_app.command(name="columns") +def columns_command( + type_name: str | None = typer.Argument( + None, help="Column type to display (e.g., 'llm-text'), or 'all' for everything." 
+ ), + list_mode: bool = typer.Option(False, "--list", "-l", help="Show summary table of available types."), + output_format: str = typer.Option("text", "--format", "-f", help="Output format: 'text' (default) or 'json'."), +) -> None: + """Show column configuration types and their fields.""" + _make_controller(output_format).show_columns(type_name, list_mode) + + +@agent_context_app.command(name="samplers") +def samplers_command( + type_name: str | None = typer.Argument( + None, help="Sampler type to display (e.g., 'category'), or 'all' for everything." + ), + list_mode: bool = typer.Option(False, "--list", "-l", help="Show summary table of available types."), + output_format: str = typer.Option("text", "--format", "-f", help="Output format: 'text' (default) or 'json'."), +) -> None: + """Show sampler types and their parameter fields.""" + _make_controller(output_format).show_samplers(type_name, list_mode) + + +@agent_context_app.command(name="validators") +def validators_command( + type_name: str | None = typer.Argument( + None, help="Validator type to display (e.g., 'code'), or 'all' for everything." + ), + list_mode: bool = typer.Option(False, "--list", "-l", help="Show summary table of available types."), + output_format: str = typer.Option("text", "--format", "-f", help="Output format: 'text' (default) or 'json'."), +) -> None: + """Show validator types and their parameter fields.""" + _make_controller(output_format).show_validators(type_name, list_mode) + + +@agent_context_app.command(name="processors") +def processors_command( + type_name: str | None = typer.Argument( + None, help="Processor type to display (e.g., 'drop_columns'), or 'all' for everything." 
+ ), + list_mode: bool = typer.Option(False, "--list", "-l", help="Show summary table of available types."), + output_format: str = typer.Option("text", "--format", "-f", help="Output format: 'text' (default) or 'json'."), +) -> None: + """Show processor types and their configuration fields.""" + _make_controller(output_format).show_processors(type_name, list_mode) + + +@agent_context_app.command(name="models") +def models_command( + output_format: str = typer.Option("text", "--format", "-f", help="Output format: 'text' (default) or 'json'."), +) -> None: + """Show model configuration types (ModelConfig, inference params, distributions).""" + _make_controller(output_format).show_models() + + +@agent_context_app.command(name="builder") +def builder_command( + output_format: str = typer.Option("text", "--format", "-f", help="Output format: 'text' (default) or 'json'."), +) -> None: + """Show DataDesignerConfigBuilder method signatures and documentation.""" + _make_controller(output_format).show_builder() + + +@agent_context_app.command(name="constraints") +def constraints_command( + output_format: str = typer.Option("text", "--format", "-f", help="Output format: 'text' (default) or 'json'."), +) -> None: + """Show constraint types (ScalarInequality, ColumnInequality, operators).""" + _make_controller(output_format).show_constraints() + + +@agent_context_app.command(name="seeds") +def seeds_command( + output_format: str = typer.Option("text", "--format", "-f", help="Output format: 'text' (default) or 'json'."), +) -> None: + """Show seed dataset types (SeedConfig, sources, sampling strategies).""" + _make_controller(output_format).show_seeds() + + +@agent_context_app.command(name="mcp") +def mcp_command( + output_format: str = typer.Option("text", "--format", "-f", help="Output format: 'text' (default) or 'json'."), +) -> None: + """Show MCP provider types (MCPProvider, LocalStdioMCPProvider, ToolConfig).""" + _make_controller(output_format).show_mcp() + + 
+@agent_context_app.command(name="overview") +def overview_command( + output_format: str = typer.Option("text", "--format", "-f", help="Output format: 'text' (default) or 'json'."), +) -> None: + """Show compact API cheatsheet with type counts, builder summary, and quick start commands.""" + _make_controller(output_format).show_overview() diff --git a/packages/data-designer/src/data_designer/cli/controllers/__init__.py b/packages/data-designer/src/data_designer/cli/controllers/__init__.py index 3d2894b76..e5725ea5a 100644 --- a/packages/data-designer/src/data_designer/cli/controllers/__init__.py +++ b/packages/data-designer/src/data_designer/cli/controllers/__init__.py @@ -1,4 +1,2 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 - -from __future__ import annotations diff --git a/packages/data-designer/src/data_designer/cli/controllers/agent_context_controller.py b/packages/data-designer/src/data_designer/cli/controllers/agent_context_controller.py new file mode 100644 index 000000000..f49cc1918 --- /dev/null +++ b/packages/data-designer/src/data_designer/cli/controllers/agent_context_controller.py @@ -0,0 +1,257 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +import json + +import typer + +from data_designer.cli.services.introspection.discovery import ( + discover_column_configs, + discover_constraint_types, + discover_mcp_types, + discover_model_configs, + discover_processor_configs, + discover_sampler_types, + discover_seed_types, + discover_validator_types, +) +from data_designer.cli.services.introspection.formatters import ( + format_method_info_json, + format_method_info_text, + format_model_schema_json, + format_model_schema_text, + format_overview_text, + format_type_list_text, +) +from data_designer.cli.services.introspection.method_inspector import inspect_class_methods +from data_designer.cli.services.introspection.pydantic_inspector import build_model_schema + + +class AgentContextController: + """Controller for agent-context CLI commands. + + Orchestrates discovery, inspection, formatting, and output for all + agent-context subcommands. + """ + + def __init__(self, output_format: str = "text") -> None: + self._format = output_format + + def show_columns(self, type_name: str | None, list_mode: bool) -> None: + """Show column configuration types.""" + items = discover_column_configs() + self._show_typed_items( + items=items, + type_name=type_name, + list_mode=list_mode, + type_key="column_type", + type_label="column_type", + class_label="config_class", + header_title="Data Designer Column Types Reference", + ) + + def show_samplers(self, type_name: str | None, list_mode: bool) -> None: + """Show sampler types and their param classes.""" + items = discover_sampler_types() + self._show_typed_items( + items=items, + type_name=type_name, + list_mode=list_mode, + type_key="sampler_type", + type_label="sampler_type", + class_label="params_class", + header_title="Data Designer Sampler Types Reference", + case_insensitive=True, + uppercase_value=True, + ) + + def show_validators(self, type_name: str | None, list_mode: bool) -> None: + 
"""Show validator types and their param classes.""" + items = discover_validator_types() + self._show_typed_items( + items=items, + type_name=type_name, + list_mode=list_mode, + type_key="validator_type", + type_label="validator_type", + class_label="params_class", + header_title="Data Designer Validator Types Reference", + case_insensitive=True, + uppercase_value=True, + ) + + def show_processors(self, type_name: str | None, list_mode: bool) -> None: + """Show processor types and their config classes.""" + items = discover_processor_configs() + self._show_typed_items( + items=items, + type_name=type_name, + list_mode=list_mode, + type_key="processor_type", + type_label="processor_type", + class_label="config_class", + header_title="Data Designer Processor Types Reference", + case_insensitive=True, + ) + + def show_models(self) -> None: + """Show model configuration types.""" + items = discover_model_configs() + self._show_all_schemas(items, "Data Designer Model Configuration Reference") + + def show_builder(self) -> None: + """Show DataDesignerConfigBuilder method signatures and docs.""" + from data_designer.config.config_builder import DataDesignerConfigBuilder + + methods = inspect_class_methods(DataDesignerConfigBuilder) + if self._format == "json": + typer.echo(json.dumps(format_method_info_json(methods), indent=2)) + else: + typer.echo(format_method_info_text(methods, class_name="DataDesignerConfigBuilder")) + + def show_constraints(self) -> None: + """Show constraint types.""" + items = discover_constraint_types() + self._show_all_schemas(items, "Data Designer Constraint Types Reference") + + def show_seeds(self) -> None: + """Show seed dataset types.""" + items = discover_seed_types() + self._show_all_schemas(items, "Data Designer Seed Dataset Types Reference") + + def show_mcp(self) -> None: + """Show MCP provider types.""" + items = discover_mcp_types() + self._show_all_schemas(items, "Data Designer MCP Types Reference") + + def show_overview(self) -> 
None: + """Show compact API overview cheatsheet.""" + from data_designer.config.config_builder import DataDesignerConfigBuilder + + type_counts = { + "Column types": len(discover_column_configs()), + "Sampler types": len(discover_sampler_types()), + "Validator types": len(discover_validator_types()), + "Processor types": len(discover_processor_configs()), + "Model configs": len(discover_model_configs()), + "Constraint types": len(discover_constraint_types()), + "Seed types": len(discover_seed_types()), + "MCP types": len(discover_mcp_types()), + } + + builder_methods = inspect_class_methods(DataDesignerConfigBuilder) + + if self._format == "json": + typer.echo( + json.dumps( + {"type_counts": type_counts, "builder_methods": format_method_info_json(builder_methods)}, indent=2 + ) + ) + else: + typer.echo(format_overview_text(type_counts, builder_methods)) + + def _show_typed_items( + self, + items: dict[str, type], + type_name: str | None, + list_mode: bool, + type_key: str, + type_label: str, + class_label: str, + header_title: str, + case_insensitive: bool = False, + uppercase_value: bool = False, + ) -> None: + """Shared logic for type-based commands (columns, samplers, validators, processors).""" + if list_mode: + if self._format == "json": + typer.echo(json.dumps({k: v.__name__ for k, v in sorted(items.items())}, indent=2)) + else: + typer.echo(format_type_list_text(items, type_label, class_label)) + return + + if type_name is None: + if self._format == "json": + typer.echo(json.dumps({k: v.__name__ for k, v in sorted(items.items())}, indent=2)) + else: + typer.echo(format_type_list_text(items, type_label, class_label)) + return + + if type_name == "all": + self._show_all_typed(items, type_key, header_title, uppercase_value) + return + + lookup = type_name.lower() if case_insensitive else type_name + if lookup not in items: + available = ", ".join(sorted(items.keys())) + typer.echo(f"Error: Unknown {type_key} '{type_name}'", err=True) + typer.echo(f"Available 
types: {available}", err=True) + raise typer.Exit(code=1) + + cls = items[lookup] + display_value = lookup.upper() if uppercase_value else lookup + schema = build_model_schema(cls, type_key=type_key, type_value=display_value) + + if self._format == "json": + typer.echo(json.dumps(format_model_schema_json(schema), indent=2)) + else: + typer.echo(format_model_schema_text(schema)) + + def _show_all_typed( + self, + items: dict[str, type], + type_key: str, + header_title: str, + uppercase_value: bool = False, + ) -> None: + """Show all types for a typed command.""" + sorted_types = sorted(items.keys()) + + if self._format == "json": + all_schemas = [] + for type_value in sorted_types: + cls = items[type_value] + display_value = type_value.upper() if uppercase_value else type_value + schema = build_model_schema(cls, type_key=type_key, type_value=display_value) + all_schemas.append(format_model_schema_json(schema)) + typer.echo(json.dumps(all_schemas, indent=2)) + else: + lines = [f"# {header_title}", f"# {len(sorted_types)} types discovered from data_designer.config", ""] + for type_value in sorted_types: + cls = items[type_value] + display_value = type_value.upper() if uppercase_value else type_value + schema = build_model_schema(cls, type_key=type_key, type_value=display_value) + lines.append(format_model_schema_text(schema)) + lines.append("") + typer.echo("\n".join(lines)) + + def _show_all_schemas(self, items: dict[str, type], header_title: str) -> None: + """Show all schemas for simple discovery commands (models, constraints, seeds, mcp).""" + if self._format == "json": + all_schemas = [] + for name in sorted(items.keys()): + cls = items[name] + if hasattr(cls, "model_fields"): + schema = build_model_schema(cls) + all_schemas.append(format_model_schema_json(schema)) + else: + all_schemas.append({"class_name": cls.__name__, "description": cls.__doc__ or ""}) + typer.echo(json.dumps(all_schemas, indent=2)) + else: + lines = [f"# {header_title}", f"# {len(items)} 
types", ""] + for name in sorted(items.keys()): + cls = items[name] + if hasattr(cls, "model_fields"): + schema = build_model_schema(cls) + lines.append(format_model_schema_text(schema)) + else: + lines.append(f"{cls.__name__}:") + if cls.__doc__: + lines.append(f" description: {cls.__doc__.strip().split(chr(10))[0]}") + if hasattr(cls, "__members__"): + members = [m.name for m in cls] + lines.append(f" values: [{', '.join(members)}]") + lines.append("") + typer.echo("\n".join(lines)) diff --git a/packages/data-designer/src/data_designer/cli/main.py b/packages/data-designer/src/data_designer/cli/main.py index a45276c4e..1afb9d640 100644 --- a/packages/data-designer/src/data_designer/cli/main.py +++ b/packages/data-designer/src/data_designer/cli/main.py @@ -5,6 +5,7 @@ import typer +from data_designer.cli.commands import agent_context from data_designer.cli.lazy_group import create_lazy_typer_group _CMD = "data_designer.cli.commands" @@ -102,6 +103,9 @@ app.add_typer(config_app, name="config", rich_help_panel="Setup") app.add_typer(download_app, name="download", rich_help_panel="Setup") +# Add agent command groups +app.add_typer(agent_context.agent_context_app, name="agent-context", rich_help_panel="Agent") + def main() -> None: """Main entry point for the CLI.""" diff --git a/packages/data-designer/src/data_designer/cli/services/introspection/__init__.py b/packages/data-designer/src/data_designer/cli/services/introspection/__init__.py new file mode 100644 index 000000000..fe6a08e50 --- /dev/null +++ b/packages/data-designer/src/data_designer/cli/services/introspection/__init__.py @@ -0,0 +1,62 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +from data_designer.cli.services.introspection.discovery import ( + discover_column_configs, + discover_constraint_types, + discover_mcp_types, + discover_model_configs, + discover_processor_configs, + discover_sampler_types, + discover_seed_types, + discover_validator_types, +) +from data_designer.cli.services.introspection.formatters import ( + format_method_info_json, + format_method_info_text, + format_model_schema_json, + format_model_schema_text, + format_overview_text, + format_type_list_text, +) +from data_designer.cli.services.introspection.method_inspector import ( + MethodInfo, + ParamInfo, + inspect_class_methods, +) +from data_designer.cli.services.introspection.pydantic_inspector import ( + FieldDetail, + ModelSchema, + build_model_schema, + format_type, + get_brief_description, + get_field_info, +) + +__all__ = [ + "FieldDetail", + "MethodInfo", + "ModelSchema", + "ParamInfo", + "build_model_schema", + "discover_column_configs", + "discover_constraint_types", + "discover_mcp_types", + "discover_model_configs", + "discover_processor_configs", + "discover_sampler_types", + "discover_seed_types", + "discover_validator_types", + "format_method_info_json", + "format_method_info_text", + "format_model_schema_json", + "format_model_schema_text", + "format_overview_text", + "format_type_list_text", + "format_type", + "get_brief_description", + "get_field_info", + "inspect_class_methods", +] diff --git a/packages/data-designer/src/data_designer/cli/services/introspection/discovery.py b/packages/data-designer/src/data_designer/cli/services/introspection/discovery.py new file mode 100644 index 000000000..5f6e46d98 --- /dev/null +++ b/packages/data-designer/src/data_designer/cli/services/introspection/discovery.py @@ -0,0 +1,183 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
def discover_column_configs() -> dict[str, type]:
    """Dynamically discover all ColumnConfig classes from data_designer.config.

    Returns:
        Dict mapping column_type literal values (e.g., 'llm-text') to their config classes.
    """
    import data_designer.config as dd

    discovered: dict[str, type] = {}
    for attr_name in dir(dd):
        # Only classes following the *ColumnConfig naming convention qualify.
        if not attr_name.endswith("ColumnConfig"):
            continue
        candidate = getattr(dd, attr_name)
        if not inspect.isclass(candidate) or not hasattr(candidate, "model_fields"):
            continue
        column_type_field = candidate.model_fields.get("column_type")
        if column_type_field is None:
            continue
        # The discriminator must be a Literal so we can read its value statically.
        annotation = column_type_field.annotation
        if get_origin(annotation) is not Literal:
            continue
        literal_args = get_args(annotation)
        if literal_args:
            discovered[literal_args[0]] = candidate
    return discovered
def discover_validator_types() -> dict[str, type]:
    """Dynamically discover all validator types and their param classes from data_designer.config.

    Returns:
        Dict mapping validator type names to their params classes.
    """
    import data_designer.config as dd

    enum_cls = getattr(dd, "ValidatorType", None)
    if enum_cls is None or not issubclass(enum_cls, Enum):
        return {}

    # Index every *ValidatorParams class by its lowercased, suffix-stripped name.
    params_by_name: dict[str, type] = {
        attr.replace("ValidatorParams", "").lower(): obj
        for attr in dir(dd)
        if attr.endswith("ValidatorParams")
        and inspect.isclass(obj := getattr(dd, attr))
        and hasattr(obj, "model_fields")
    }

    # Pair each enum member with its params class, ignoring underscores in the
    # member name so e.g. SOME_TYPE matches SomeTypeValidatorParams.
    discovered: dict[str, type] = {}
    for member in enum_cls:
        key = member.name.lower()
        params_cls = params_by_name.get(key.replace("_", ""))
        if params_cls is not None:
            discovered[key] = params_cls
    return discovered
def discover_model_configs() -> dict[str, type]:
    """Return model-related configuration classes from data_designer.config.

    Returns:
        Dict mapping class names to their types.
    """
    import data_designer.config as dd

    # Curated export list; keys are derived from the class names themselves.
    exported = (
        dd.ModelConfig,
        dd.ChatCompletionInferenceParams,
        dd.EmbeddingInferenceParams,
        dd.ImageInferenceParams,
        dd.ImageContext,
        dd.UniformDistribution,
        dd.ManualDistribution,
    )
    return {cls.__name__: cls for cls in exported}
def discover_mcp_types() -> dict[str, type]:
    """Return MCP-related classes from data_designer.config.

    Returns:
        Dict mapping class names to their types.
    """
    import data_designer.config as dd

    # Curated export list; keys are derived from the class names themselves.
    mcp_classes = (dd.MCPProvider, dd.LocalStdioMCPProvider, dd.ToolConfig)
    return {cls.__name__: cls for cls in mcp_classes}
+# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +from data_designer.cli.services.introspection.method_inspector import MethodInfo, ParamInfo +from data_designer.cli.services.introspection.pydantic_inspector import FieldDetail, ModelSchema + + +def _format_field_text(field: FieldDetail, indent: int = 4) -> list[str]: + """Format a single field as YAML-style text lines, recursing into nested schemas.""" + pad = " " * indent + lines: list[str] = [] + lines.append(f"{pad}{field.name}:") + lines.append(f"{pad} type: {field.type_str}") + if field.description: + lines.append(f"{pad} description: {field.description}") + if field.enum_values: + lines.append(f"{pad} values: [{', '.join(field.enum_values)}]") + if field.nested_schema: + lines.append(f"{pad} schema ({field.nested_schema.class_name}):") + for nested_field in field.nested_schema.fields: + lines.extend(_format_field_text(nested_field, indent=indent + 4)) + return lines + + +def format_model_schema_text(schema: ModelSchema, indent: int = 0) -> str: + """Format a ModelSchema as YAML-style text for backward compatibility with the existing skill scripts.""" + lines: list[str] = [] + pad = " " * indent + lines.append(f"{pad}{schema.class_name}:") + if schema.type_key and schema.type_value: + lines.append(f"{pad} {schema.type_key}: {schema.type_value}") + lines.append(f"{pad} description: {schema.description}") + lines.append(f"{pad} fields:") + for field in schema.fields: + lines.extend(_format_field_text(field, indent=indent + 4)) + return "\n".join(lines) + + +def _format_field_json(field: FieldDetail) -> dict: + """Convert a FieldDetail to a JSON-serializable dict, recursing into nested schemas.""" + result: dict = { + "name": field.name, + "type": field.type_str, + } + if field.description: + result["description"] = field.description + if field.enum_values: + result["values"] = field.enum_values + if field.nested_schema: + result["schema"] = 
format_model_schema_json(field.nested_schema) + return result + + +def format_model_schema_json(schema: ModelSchema) -> dict: + """Convert a ModelSchema to a JSON-serializable dict.""" + result: dict = { + "class_name": schema.class_name, + "description": schema.description, + } + if schema.type_key and schema.type_value: + result[schema.type_key] = schema.type_value + result["fields"] = [_format_field_json(f) for f in schema.fields] + return result + + +def _format_param_text(param: ParamInfo, indent: int) -> str: + """Format a single method parameter as a text line.""" + pad = " " * indent + parts = [f"{pad}{param.name}: {param.type_str}"] + if param.default is not None: + parts[0] += f" = {param.default}" + if param.description: + parts[0] += f" \u2014 {param.description}" + return parts[0] + + +def format_method_info_text(methods: list[MethodInfo], class_name: str | None = None) -> str: + """Format a list of MethodInfo as readable text with signatures and parameter details.""" + lines: list[str] = [] + if class_name: + lines.append(f"{class_name} Methods:") + lines.append("") + + for method in methods: + lines.append(f" {method.signature}") + if method.description: + lines.append(f" {method.description}") + if method.parameters: + lines.append(" Parameters:") + for param in method.parameters: + lines.append(_format_param_text(param, indent=6)) + lines.append("") + + return "\n".join(lines).rstrip() + + +def _param_to_json(param: ParamInfo) -> dict: + """Convert a ParamInfo to a JSON-serializable dict.""" + result: dict = { + "name": param.name, + "type": param.type_str, + } + if param.default is not None: + result["default"] = param.default + if param.description: + result["description"] = param.description + return result + + +def format_method_info_json(methods: list[MethodInfo]) -> list[dict]: + """Convert a list of MethodInfo to a JSON-serializable list of dicts.""" + result: list[dict] = [] + for method in methods: + entry: dict = { + "name": method.name, 
def format_type_list_text(items: dict[str, type], type_label: str, class_label: str) -> str:
    """Format a summary table of type->class mappings, matching the existing print_list_table style."""
    rows = sorted(items.items())
    if not rows:
        return f"{type_label} {class_label}\n(no items)"

    # First column is padded to the widest type value (or the header itself).
    first_col = max(max(len(name) for name, _ in rows), len(type_label))
    header = f"{type_label:<{first_col}} {class_label}"
    rule = f"{'-' * first_col} {'-' * max(len(class_label), 25)}"
    body = [f"{name:<{first_col}} {cls.__name__}" for name, cls in rows]
    return "\n".join([header, rule, *body])
data-designer agent-context columns llm-text") + lines.append(" data-designer agent-context samplers category") + lines.append(" data-designer agent-context builder") + + return "\n".join(lines) + + +def _short_sig(method: MethodInfo) -> str: + """Create a compact signature like 'add_column(...)' for overview display.""" + return f"{method.name}(...)" diff --git a/packages/data-designer/src/data_designer/cli/services/introspection/method_inspector.py b/packages/data-designer/src/data_designer/cli/services/introspection/method_inspector.py new file mode 100644 index 000000000..2451b63e5 --- /dev/null +++ b/packages/data-designer/src/data_designer/cli/services/introspection/method_inspector.py @@ -0,0 +1,250 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +import inspect +import re +from dataclasses import dataclass, field + + +@dataclass +class ParamInfo: + name: str + type_str: str + default: str | None + description: str + + +@dataclass +class MethodInfo: + name: str + signature: str + description: str + return_type: str + parameters: list[ParamInfo] = field(default_factory=list) + + +def _parse_google_docstring_args(docstring: str | None) -> dict[str, str]: + """Parse Args section from a Google-style docstring. + + Returns: + Dict mapping parameter names to their descriptions. 
+ """ + if not docstring: + return {} + + lines = docstring.split("\n") + result: dict[str, str] = {} + in_args_section = False + current_param: str | None = None + current_desc_lines: list[str] = [] + args_indent: int | None = None + + section_pattern = re.compile(r"^(\s*)(Args|Returns|Raises|Yields|Note|Notes|Example|Examples|Attributes)\s*:") + + for line in lines: + if re.match(r"^\s*Args\s*:\s*$", line): + in_args_section = True + args_indent = len(line) - len(line.lstrip()) + continue + + if not in_args_section: + continue + + if not line.strip(): + if current_param is not None: + current_desc_lines.append("") + continue + + match = section_pattern.match(line) + if match and match.group(2) != "Args": + section_indent = len(line) - len(line.lstrip()) + if args_indent is not None and section_indent <= args_indent: + break + + line_indent = len(line) - len(line.lstrip()) + stripped = line.strip() + + param_match = re.match(r"^(\*{0,2}\w+)\s*(?:\(.+?\))?\s*:\s*(.*)$", stripped) + if param_match and args_indent is not None and line_indent > args_indent: + if current_param is not None: + result[current_param] = _join_desc_lines(current_desc_lines) + current_param = param_match.group(1) + current_desc_lines = [param_match.group(2).strip()] + elif current_param is not None: + if args_indent is not None and line_indent <= args_indent: + break + current_desc_lines.append(stripped) + + if current_param is not None: + result[current_param] = _join_desc_lines(current_desc_lines) + + return result + + +def _join_desc_lines(lines: list[str]) -> str: + """Join description lines, collapsing whitespace and stripping trailing blanks.""" + while lines and not lines[-1]: + lines.pop() + return " ".join(part for part in lines if part) + + +def _format_annotation(annotation: type | str) -> str: + """Format a type annotation to a readable string.""" + if annotation is inspect.Parameter.empty: + return "Any" + + if isinstance(annotation, str): + return annotation + + if 
def _format_signature(method_name: str, sig: inspect.Signature) -> str:
    """Format a method signature as a readable string, skipping 'self'.

    The bare "*" keyword-only separator is emitted before the first
    keyword-only parameter only when the signature has no *args parameter
    (a *args already acts as the separator). This replaces the previous
    insert-then-filter approach, which re-split the fully rendered parameter
    string on ", " to drop the sentinel — fragile string surgery on text that
    can also contain ", " inside default-value reprs.

    Args:
        method_name: Name used as the call prefix.
        sig: Signature obtained via inspect.signature().

    Returns:
        A string like "name(a: int, *, b: str = 'x') -> ReturnType".
    """
    has_var_positional = any(p.kind == inspect.Parameter.VAR_POSITIONAL for p in sig.parameters.values())

    params: list[str] = []
    star_emitted = False
    for param in sig.parameters.values():
        if param.name == "self":
            continue

        if param.kind == inspect.Parameter.KEYWORD_ONLY and not star_emitted and not has_var_positional:
            star_emitted = True
            params.append("*")

        type_str = _format_annotation(param.annotation)
        default_str = ""
        if param.default is not inspect.Parameter.empty:
            # Class defaults render by name; everything else by repr.
            default_str = (
                f" = {param.default!r}" if not isinstance(param.default, type) else f" = {param.default.__name__}"
            )

        if param.kind == inspect.Parameter.VAR_POSITIONAL:
            params.append(f"*{param.name}: {type_str}")
        elif param.kind == inspect.Parameter.VAR_KEYWORD:
            params.append(f"**{param.name}")
        else:
            params.append(f"{param.name}: {type_str}{default_str}")

    return f"{method_name}({', '.join(params)}) -> {_format_return_type(sig)}"
line.strip() + if stripped: + return stripped + return "" + + +def _build_param_info(sig: inspect.Signature, docstring_args: dict[str, str]) -> list[ParamInfo]: + """Build ParamInfo list from a signature and parsed docstring args.""" + params: list[ParamInfo] = [] + for param in sig.parameters.values(): + if param.name == "self": + continue + if param.kind == inspect.Parameter.VAR_KEYWORD: + name = f"**{param.name}" + elif param.kind == inspect.Parameter.VAR_POSITIONAL: + name = f"*{param.name}" + else: + name = param.name + + type_str = _format_annotation(param.annotation) + default: str | None = None + if param.default is not inspect.Parameter.empty: + default = repr(param.default) if not isinstance(param.default, type) else param.default.__name__ + + raw_name = param.name + description = docstring_args.get(raw_name, "") + if not description: + description = docstring_args.get(f"**{raw_name}", "") + if not description: + description = docstring_args.get(f"*{raw_name}", "") + + params.append(ParamInfo(name=name, type_str=type_str, default=default, description=description)) + + return params + + +def _is_dunder(name: str) -> bool: + """Check if a method name is a dunder method (excluding __init__).""" + return name.startswith("__") and name.endswith("__") and name != "__init__" + + +def _is_private(name: str) -> bool: + """Check if a method name is private (starts with underscore, not dunder).""" + return name.startswith("_") and not (name.startswith("__") and name.endswith("__")) + + +def inspect_class_methods(cls: type, include_private: bool = False) -> list[MethodInfo]: + """Introspect public methods of a class using inspect.signature() and docstring parsing. + + Args: + cls: The class to introspect. + include_private: If True, include methods starting with underscore. + + Returns: + List of MethodInfo objects for each method. 
+ """ + methods: list[MethodInfo] = [] + + for name, method in inspect.getmembers(cls, predicate=inspect.isfunction): + if _is_dunder(name): + continue + if _is_private(name) and not include_private: + continue + + try: + sig = inspect.signature(method) + except (ValueError, TypeError): + continue + + docstring = inspect.getdoc(method) + docstring_args = _parse_google_docstring_args(docstring) + + signature_str = _format_signature(name, sig) + description = _get_first_docstring_line(docstring) + return_type = _format_return_type(sig) + parameters = _build_param_info(sig, docstring_args) + + methods.append( + MethodInfo( + name=name, + signature=signature_str, + description=description, + return_type=return_type, + parameters=parameters, + ) + ) + + return methods diff --git a/packages/data-designer/src/data_designer/cli/services/introspection/pydantic_inspector.py b/packages/data-designer/src/data_designer/cli/services/introspection/pydantic_inspector.py new file mode 100644 index 000000000..70fe69c92 --- /dev/null +++ b/packages/data-designer/src/data_designer/cli/services/introspection/pydantic_inspector.py @@ -0,0 +1,252 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +import re +import types +import typing +from dataclasses import dataclass, field +from enum import Enum +from typing import Any, get_args, get_origin + +from pydantic import BaseModel + + +@dataclass +class FieldDetail: + """Structured representation of a single Pydantic model field.""" + + name: str + type_str: str + description: str + enum_values: list[str] | None = None + nested_schema: ModelSchema | None = None + + +@dataclass +class ModelSchema: + """Structured representation of a Pydantic model's schema.""" + + class_name: str + description: str + type_key: str | None = None + type_value: str | None = None + fields: list[FieldDetail] = field(default_factory=list) + + +def _is_basemodel_subclass(cls: Any) -> bool: + """Return True if cls is a concrete BaseModel subclass (not BaseModel itself).""" + return isinstance(cls, type) and issubclass(cls, BaseModel) and cls is not BaseModel + + +def _is_enum_subclass(cls: Any) -> bool: + """Return True if cls is an Enum subclass (not Enum itself).""" + return isinstance(cls, type) and issubclass(cls, Enum) and cls is not Enum + + +def _extract_enum_class(annotation: Any) -> type | None: + """Unwrap a type annotation to find an Enum class, if present. + + Handles X, X | None, Annotated[X, ...]. + Returns the Enum class or None. + """ + if annotation is None: + return None + + # Unwrap Annotated[X, ...] + if get_origin(annotation) is typing.Annotated: + annotation = get_args(annotation)[0] + + if _is_enum_subclass(annotation): + return annotation + + origin = get_origin(annotation) + if origin is typing.Union or origin is types.UnionType: + for arg in get_args(annotation): + if arg is type(None): + continue + if _is_enum_subclass(arg): + return arg + + return None + + +def extract_nested_basemodel(annotation: Any) -> type | None: + """Unwrap a type annotation to find a single nested BaseModel subclass. 
def format_type(annotation: Any) -> str:
    """Format a type annotation for readable display.

    Strips module prefixes and simplifies complex types.

    Fix: the enum/class cleanup substitutions had empty regex patterns with a
    backreference (`re.sub(r"", r"\\1", ...)`), which raises re.error
    ("invalid group reference") on every call — the `<enum '...'>` /
    `<class '...'>` pattern text had been lost. The patterns are restored here.
    """
    type_str = str(annotation)

    # Remove module prefixes
    type_str = re.sub(r"data_designer\.config\.\w+\.", "", type_str)
    type_str = re.sub(r"pydantic\.main\.", "", type_str)
    type_str = re.sub(r"typing\.", "", type_str)

    # Clean up enum reprs BEFORE the class rule: <enum 'Mode'> -> Mode
    type_str = re.sub(r"<enum '(\w+)'>", r"\1", type_str)

    # Clean up class reprs: <class 'str'> -> str
    type_str = re.sub(r"<class '([\w.]+)'>", r"\1", type_str)

    type_str = type_str.replace("NoneType", "None")

    # Collapse a Literal repr down to just the bracketed literal portion.
    if "Literal[" in type_str:
        match = re.search(r"Literal\[([^\]]+)\]", type_str)
        if match:
            type_str = f"Literal[{match.group(1)}]"

    # Clean up Annotated types with Discriminator (too verbose)
    if "Annotated[" in type_str and "Discriminator" in type_str:
        match = re.search(r"Annotated\[([^,]+(?:\s*\|\s*[^,]+)*),", type_str)
        if match:
            type_str = match.group(1).strip()

    return type_str
def build_model_schema(
    cls: type,
    type_key: str | None = None,
    type_value: str | None = None,
    seen: set[str] | None = None,
    max_depth: int = 3,
    current_depth: int = 0,
) -> ModelSchema:
    """Build a structured ModelSchema from a Pydantic model class.

    Args:
        cls: The Pydantic model class to inspect.
        type_key: Optional key name for the type discriminator (e.g., "column_type").
        type_value: Optional value for the type discriminator (e.g., "llm-text").
        seen: Set of already-expanded class names to prevent cycles.
        max_depth: Maximum recursion depth for nested models.
        current_depth: Current recursion depth.

    Returns:
        A ModelSchema with recursively expanded nested schemas.
    """
    if seen is None:
        seen = set()

    class_name = cls.__name__
    description = get_brief_description(cls)
    fields = get_field_info(cls)

    # get_field_info() leaves nested_schema unset; walk the raw field
    # annotations to expand at most one nested BaseModel per field.
    model_fields_raw: dict[str, Any] = getattr(cls, "model_fields", {})
    for field_detail in fields:
        raw_field_info = model_fields_raw.get(field_detail.name)
        if raw_field_info is None:
            continue

        nested_cls = extract_nested_basemodel(raw_field_info.annotation)
        if nested_cls is not None and nested_cls.__name__ not in seen and current_depth < max_depth:
            # NOTE(review): `seen` is the same set object shared across sibling
            # fields and recursive calls, so a model appearing under two
            # different fields is expanded only the first time it is
            # encountered — not just on true cycles. Confirm this
            # dedup-everywhere behavior is intended.
            seen.add(nested_cls.__name__)
            field_detail.nested_schema = build_model_schema(
                nested_cls,
                seen=seen,
                max_depth=max_depth,
                current_depth=current_depth + 1,
            )

    return ModelSchema(
        class_name=class_name,
        description=description,
        type_key=type_key,
        type_value=type_value,
        fields=fields,
    )
+# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +import json + +from typer.testing import CliRunner + +from data_designer.cli.main import app + +runner = CliRunner() + + +# --------------------------------------------------------------------------- +# help +# --------------------------------------------------------------------------- + + +def test_agent_context_help() -> None: + result = runner.invoke(app, ["agent-context", "--help"]) + assert result.exit_code == 0 + assert "columns" in result.output + + +# --------------------------------------------------------------------------- +# columns +# --------------------------------------------------------------------------- + + +def test_columns_list() -> None: + result = runner.invoke(app, ["agent-context", "columns", "--list"]) + assert result.exit_code == 0 + assert "llm-text" in result.output + + +def test_columns_specific_type() -> None: + result = runner.invoke(app, ["agent-context", "columns", "llm-text"]) + assert result.exit_code == 0 + assert "LLMTextColumnConfig" in result.output + + +def test_columns_json_format() -> None: + result = runner.invoke(app, ["agent-context", "columns", "llm-text", "--format", "json"]) + assert result.exit_code == 0 + data = json.loads(result.output) + assert isinstance(data, dict) + assert data["class_name"] == "LLMTextColumnConfig" + + +def test_columns_nonexistent_exits_with_error() -> None: + result = runner.invoke(app, ["agent-context", "columns", "nonexistent"]) + assert result.exit_code == 1 + + +# --------------------------------------------------------------------------- +# samplers +# --------------------------------------------------------------------------- + + +def test_samplers_specific() -> None: + result = runner.invoke(app, ["agent-context", "samplers", "category"]) + assert result.exit_code == 0 + assert "CATEGORY" in result.output + + +def test_samplers_list() -> None: + result = runner.invoke(app, ["agent-context", "samplers", 
"--list"]) + assert result.exit_code == 0 + assert "category" in result.output + + +# --------------------------------------------------------------------------- +# overview +# --------------------------------------------------------------------------- + + +def test_overview() -> None: + result = runner.invoke(app, ["agent-context", "overview"]) + assert result.exit_code == 0 + assert "Type Counts" in result.output + + +# --------------------------------------------------------------------------- +# builder +# --------------------------------------------------------------------------- + + +def test_builder() -> None: + result = runner.invoke(app, ["agent-context", "builder"]) + assert result.exit_code == 0 + assert "add_column" in result.output + + +# --------------------------------------------------------------------------- +# models +# --------------------------------------------------------------------------- + + +def test_models() -> None: + result = runner.invoke(app, ["agent-context", "models"]) + assert result.exit_code == 0 + assert "ModelConfig" in result.output + + +# --------------------------------------------------------------------------- +# constraints +# --------------------------------------------------------------------------- + + +def test_constraints() -> None: + result = runner.invoke(app, ["agent-context", "constraints"]) + assert result.exit_code == 0 + output = result.output + assert "ScalarInequalityConstraint" in output or "InequalityOperator" in output + + +# --------------------------------------------------------------------------- +# seeds +# --------------------------------------------------------------------------- + + +def test_seeds() -> None: + result = runner.invoke(app, ["agent-context", "seeds"]) + assert result.exit_code == 0 + assert "SeedConfig" in result.output + + +# --------------------------------------------------------------------------- +# mcp +# 
--------------------------------------------------------------------------- + + +def test_mcp() -> None: + result = runner.invoke(app, ["agent-context", "mcp"]) + assert result.exit_code == 0 + assert "ToolConfig" in result.output diff --git a/packages/data-designer/tests/cli/controllers/__init__.py b/packages/data-designer/tests/cli/controllers/__init__.py new file mode 100644 index 000000000..e5725ea5a --- /dev/null +++ b/packages/data-designer/tests/cli/controllers/__init__.py @@ -0,0 +1,2 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 diff --git a/packages/data-designer/tests/cli/controllers/test_agent_context_controller.py b/packages/data-designer/tests/cli/controllers/test_agent_context_controller.py new file mode 100644 index 000000000..54cb5511f --- /dev/null +++ b/packages/data-designer/tests/cli/controllers/test_agent_context_controller.py @@ -0,0 +1,163 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +import json + +import click.exceptions +import pytest + +from data_designer.cli.controllers.agent_context_controller import AgentContextController + +# --------------------------------------------------------------------------- +# show_columns +# --------------------------------------------------------------------------- + + +def test_show_columns_list_mode(capsys: pytest.CaptureFixture[str]) -> None: + controller = AgentContextController(output_format="text") + controller.show_columns(type_name=None, list_mode=True) + captured = capsys.readouterr() + assert "llm-text" in captured.out + assert "sampler" in captured.out + + +def test_show_columns_specific_type(capsys: pytest.CaptureFixture[str]) -> None: + controller = AgentContextController(output_format="text") + controller.show_columns(type_name="llm-text", list_mode=False) + captured = capsys.readouterr() + assert "LLMTextColumnConfig" in captured.out + + +def test_show_columns_all(capsys: pytest.CaptureFixture[str]) -> None: + controller = AgentContextController(output_format="text") + controller.show_columns(type_name="all", list_mode=False) + captured = capsys.readouterr() + assert "llm-text" in captured.out + assert "sampler" in captured.out + + +def test_show_columns_nonexistent_type_exits() -> None: + controller = AgentContextController(output_format="text") + with pytest.raises(click.exceptions.Exit): + controller.show_columns(type_name="nonexistent_type_xyz", list_mode=False) + + +def test_show_columns_json_format(capsys: pytest.CaptureFixture[str]) -> None: + controller = AgentContextController(output_format="json") + controller.show_columns(type_name="llm-text", list_mode=False) + captured = capsys.readouterr() + data = json.loads(captured.out) + assert isinstance(data, dict) + assert data["class_name"] == "LLMTextColumnConfig" + + +def test_show_columns_list_json_format(capsys: pytest.CaptureFixture[str]) -> None: + 
controller = AgentContextController(output_format="json") + controller.show_columns(type_name=None, list_mode=True) + captured = capsys.readouterr() + data = json.loads(captured.out) + assert isinstance(data, dict) + assert "llm-text" in data + + +# --------------------------------------------------------------------------- +# show_overview +# --------------------------------------------------------------------------- + + +def test_show_overview_text(capsys: pytest.CaptureFixture[str]) -> None: + controller = AgentContextController(output_format="text") + controller.show_overview() + captured = capsys.readouterr() + assert "Data Designer API Overview" in captured.out + assert "Type Counts:" in captured.out + + +def test_show_overview_json(capsys: pytest.CaptureFixture[str]) -> None: + controller = AgentContextController(output_format="json") + controller.show_overview() + captured = capsys.readouterr() + data = json.loads(captured.out) + assert "type_counts" in data + assert "builder_methods" in data + + +# --------------------------------------------------------------------------- +# show_samplers +# --------------------------------------------------------------------------- + + +def test_show_samplers_list(capsys: pytest.CaptureFixture[str]) -> None: + controller = AgentContextController(output_format="text") + controller.show_samplers(type_name=None, list_mode=True) + captured = capsys.readouterr() + assert "category" in captured.out + + +def test_show_samplers_specific(capsys: pytest.CaptureFixture[str]) -> None: + controller = AgentContextController(output_format="text") + controller.show_samplers(type_name="category", list_mode=False) + captured = capsys.readouterr() + assert "CATEGORY" in captured.out + + +# --------------------------------------------------------------------------- +# show_models +# --------------------------------------------------------------------------- + + +def test_show_models(capsys: pytest.CaptureFixture[str]) -> None: + controller 
= AgentContextController(output_format="text") + controller.show_models() + captured = capsys.readouterr() + assert "ModelConfig" in captured.out + + +# --------------------------------------------------------------------------- +# show_builder +# --------------------------------------------------------------------------- + + +def test_show_builder(capsys: pytest.CaptureFixture[str]) -> None: + controller = AgentContextController(output_format="text") + controller.show_builder() + captured = capsys.readouterr() + assert "add_column" in captured.out + + +# --------------------------------------------------------------------------- +# show_constraints +# --------------------------------------------------------------------------- + + +def test_show_constraints(capsys: pytest.CaptureFixture[str]) -> None: + controller = AgentContextController(output_format="text") + controller.show_constraints() + captured = capsys.readouterr() + assert "ScalarInequalityConstraint" in captured.out + + +# --------------------------------------------------------------------------- +# show_seeds +# --------------------------------------------------------------------------- + + +def test_show_seeds(capsys: pytest.CaptureFixture[str]) -> None: + controller = AgentContextController(output_format="text") + controller.show_seeds() + captured = capsys.readouterr() + assert "SeedConfig" in captured.out + + +# --------------------------------------------------------------------------- +# show_mcp +# --------------------------------------------------------------------------- + + +def test_show_mcp(capsys: pytest.CaptureFixture[str]) -> None: + controller = AgentContextController(output_format="text") + controller.show_mcp() + captured = capsys.readouterr() + assert "ToolConfig" in captured.out diff --git a/packages/data-designer/tests/cli/services/__init__.py b/packages/data-designer/tests/cli/services/__init__.py new file mode 100644 index 000000000..e5725ea5a --- /dev/null +++ 
b/packages/data-designer/tests/cli/services/__init__.py @@ -0,0 +1,2 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 diff --git a/packages/data-designer/tests/cli/services/introspection/__init__.py b/packages/data-designer/tests/cli/services/introspection/__init__.py new file mode 100644 index 000000000..e5725ea5a --- /dev/null +++ b/packages/data-designer/tests/cli/services/introspection/__init__.py @@ -0,0 +1,2 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 diff --git a/packages/data-designer/tests/cli/services/introspection/test_discovery.py b/packages/data-designer/tests/cli/services/introspection/test_discovery.py new file mode 100644 index 000000000..d1a7c8d54 --- /dev/null +++ b/packages/data-designer/tests/cli/services/introspection/test_discovery.py @@ -0,0 +1,155 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +from data_designer.cli.services.introspection.discovery import ( + discover_column_configs, + discover_constraint_types, + discover_mcp_types, + discover_model_configs, + discover_processor_configs, + discover_sampler_types, + discover_seed_types, + discover_validator_types, +) + +# --------------------------------------------------------------------------- +# discover_column_configs +# --------------------------------------------------------------------------- + + +def test_discover_column_configs_returns_dict() -> None: + result = discover_column_configs() + assert isinstance(result, dict) + assert len(result) > 0 + + +def test_discover_column_configs_contains_expected_keys() -> None: + result = discover_column_configs() + for expected_key in ("llm-text", "sampler", "expression"): + assert expected_key in result, f"Expected key '{expected_key}' not found in {list(result.keys())}" + + +def test_discover_column_configs_values_are_classes() -> None: + result = discover_column_configs() + for cls in result.values(): + assert isinstance(cls, type) + assert hasattr(cls, "model_fields") + + +# --------------------------------------------------------------------------- +# discover_sampler_types +# --------------------------------------------------------------------------- + + +def test_discover_sampler_types_returns_dict() -> None: + result = discover_sampler_types() + assert isinstance(result, dict) + assert len(result) > 0 + + +def test_discover_sampler_types_contains_expected_keys() -> None: + result = discover_sampler_types() + for expected_key in ("category", "uniform", "person"): + assert expected_key in result, f"Expected key '{expected_key}' not found in {list(result.keys())}" + + +# --------------------------------------------------------------------------- +# discover_validator_types +# --------------------------------------------------------------------------- + + +def 
test_discover_validator_types_returns_dict() -> None: + result = discover_validator_types() + assert isinstance(result, dict) + assert len(result) > 0 + + +def test_discover_validator_types_contains_expected_keys() -> None: + result = discover_validator_types() + for expected_key in ("code", "remote"): + assert expected_key in result, f"Expected key '{expected_key}' not found in {list(result.keys())}" + + +# --------------------------------------------------------------------------- +# discover_processor_configs +# --------------------------------------------------------------------------- + + +def test_discover_processor_configs_returns_dict() -> None: + result = discover_processor_configs() + assert isinstance(result, dict) + assert len(result) > 0 + + +def test_discover_processor_configs_contains_expected_keys() -> None: + result = discover_processor_configs() + assert "drop_columns" in result, f"Expected 'drop_columns' not found in {list(result.keys())}" + + +# --------------------------------------------------------------------------- +# discover_model_configs +# --------------------------------------------------------------------------- + + +def test_discover_model_configs_returns_dict() -> None: + result = discover_model_configs() + assert isinstance(result, dict) + assert len(result) > 0 + + +def test_discover_model_configs_contains_expected_keys() -> None: + result = discover_model_configs() + for expected_key in ("ModelConfig", "ChatCompletionInferenceParams"): + assert expected_key in result, f"Expected key '{expected_key}' not found in {list(result.keys())}" + + +# --------------------------------------------------------------------------- +# discover_constraint_types +# --------------------------------------------------------------------------- + + +def test_discover_constraint_types_returns_dict() -> None: + result = discover_constraint_types() + assert isinstance(result, dict) + assert len(result) > 0 + + +def 
test_discover_constraint_types_contains_expected_keys() -> None: + result = discover_constraint_types() + assert "ScalarInequalityConstraint" in result + + +# --------------------------------------------------------------------------- +# discover_seed_types +# --------------------------------------------------------------------------- + + +def test_discover_seed_types_returns_dict() -> None: + result = discover_seed_types() + assert isinstance(result, dict) + assert len(result) > 0 + + +def test_discover_seed_types_contains_expected_keys() -> None: + result = discover_seed_types() + for expected_key in ("SeedConfig", "LocalFileSeedSource"): + assert expected_key in result, f"Expected key '{expected_key}' not found in {list(result.keys())}" + + +# --------------------------------------------------------------------------- +# discover_mcp_types +# --------------------------------------------------------------------------- + + +def test_discover_mcp_types_returns_dict() -> None: + result = discover_mcp_types() + assert isinstance(result, dict) + assert len(result) > 0 + + +def test_discover_mcp_types_contains_expected_keys() -> None: + result = discover_mcp_types() + for expected_key in ("MCPProvider", "ToolConfig"): + assert expected_key in result, f"Expected key '{expected_key}' not found in {list(result.keys())}" diff --git a/packages/data-designer/tests/cli/services/introspection/test_field_descriptions.py b/packages/data-designer/tests/cli/services/introspection/test_field_descriptions.py new file mode 100644 index 000000000..dd684f0eb --- /dev/null +++ b/packages/data-designer/tests/cli/services/introspection/test_field_descriptions.py @@ -0,0 +1,63 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +import pytest + +from data_designer.cli.services.introspection.discovery import ( + discover_column_configs, + discover_constraint_types, + discover_mcp_types, + discover_model_configs, + discover_processor_configs, + discover_sampler_types, + discover_seed_types, + discover_validator_types, +) + + +def _collect_models_with_fields() -> list[tuple[str, str, type]]: + """Collect all discovered model classes and their fields. + + Returns: + List of (source_label, field_name, model_class) tuples. + """ + items: list[tuple[str, str, type]] = [] + + discovery_sources: list[tuple[str, dict[str, type]]] = [ + ("column_configs", discover_column_configs()), + ("sampler_types", discover_sampler_types()), + ("validator_types", discover_validator_types()), + ("processor_configs", discover_processor_configs()), + ("model_configs", discover_model_configs()), + ("constraint_types", discover_constraint_types()), + ("seed_types", discover_seed_types()), + ("mcp_types", discover_mcp_types()), + ] + + for source_label, discovered in discovery_sources: + for type_name, cls in discovered.items(): + if not hasattr(cls, "model_fields"): + continue + for field_name in cls.model_fields: + items.append((f"{source_label}:{type_name}", field_name, cls)) + + return items + + +_ALL_FIELDS = _collect_models_with_fields() + + +@pytest.mark.parametrize( + "source_label,field_name,cls", + _ALL_FIELDS, + ids=[f"{src}.{field}" for src, field, _ in _ALL_FIELDS], +) +def test_all_discovered_fields_have_descriptions(source_label: str, field_name: str, cls: type) -> None: + """Every field in discovered config models must have a non-empty description.""" + field_info = cls.model_fields[field_name] + assert field_info.description, ( + f"{cls.__name__}.{field_name} (from {source_label}) has no Field(description=...). " + f"Add a description to this field in the source model." 
+ ) diff --git a/packages/data-designer/tests/cli/services/introspection/test_formatters.py b/packages/data-designer/tests/cli/services/introspection/test_formatters.py new file mode 100644 index 000000000..e9d99d63f --- /dev/null +++ b/packages/data-designer/tests/cli/services/introspection/test_formatters.py @@ -0,0 +1,268 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +from data_designer.cli.services.introspection.formatters import ( + format_method_info_json, + format_method_info_text, + format_model_schema_json, + format_model_schema_text, + format_overview_text, + format_type_list_text, +) +from data_designer.cli.services.introspection.method_inspector import MethodInfo, ParamInfo +from data_designer.cli.services.introspection.pydantic_inspector import FieldDetail, ModelSchema + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _make_field(name: str = "my_field", type_str: str = "str", description: str = "A field") -> FieldDetail: + return FieldDetail(name=name, type_str=type_str, description=description) + + +def _make_schema( + class_name: str = "TestModel", + description: str = "A test model.", + type_key: str | None = None, + type_value: str | None = None, + fields: list[FieldDetail] | None = None, +) -> ModelSchema: + return ModelSchema( + class_name=class_name, + description=description, + type_key=type_key, + type_value=type_value, + fields=fields or [_make_field()], + ) + + +def _make_method( + name: str = "do_thing", + signature: str = "do_thing(x: int) -> str", + description: str = "Does a thing.", + return_type: str = "str", + parameters: list[ParamInfo] | None = None, +) -> MethodInfo: + return MethodInfo( + name=name, + signature=signature, + description=description, + 
return_type=return_type, + parameters=parameters or [ParamInfo(name="x", type_str="int", default=None, description="An integer")], + ) + + +# --------------------------------------------------------------------------- +# format_model_schema_text +# --------------------------------------------------------------------------- + + +def test_format_model_schema_text_basic() -> None: + schema = _make_schema() + text = format_model_schema_text(schema) + assert "TestModel:" in text + assert "description: A test model." in text + assert "my_field:" in text + assert "type: str" in text + + +def test_format_model_schema_text_with_type_key() -> None: + schema = _make_schema(type_key="column_type", type_value="llm-text") + text = format_model_schema_text(schema) + assert "column_type: llm-text" in text + + +def test_format_model_schema_text_with_nested_schema() -> None: + nested = _make_schema(class_name="NestedModel", description="Nested.") + outer_field = FieldDetail( + name="child", + type_str="NestedModel", + description="A nested model", + nested_schema=nested, + ) + schema = _make_schema(fields=[outer_field]) + text = format_model_schema_text(schema) + assert "schema (NestedModel):" in text + + +def test_format_model_schema_text_with_enum_values() -> None: + field = FieldDetail( + name="color", + type_str="ColorEnum", + description="Pick a color", + enum_values=["RED", "GREEN", "BLUE"], + ) + schema = _make_schema(fields=[field]) + text = format_model_schema_text(schema) + assert "values: [RED, GREEN, BLUE]" in text + + +# --------------------------------------------------------------------------- +# format_model_schema_json +# --------------------------------------------------------------------------- + + +def test_format_model_schema_json_basic() -> None: + schema = _make_schema() + result = format_model_schema_json(schema) + assert isinstance(result, dict) + assert result["class_name"] == "TestModel" + assert result["description"] == "A test model." 
+ assert isinstance(result["fields"], list) + assert len(result["fields"]) == 1 + assert result["fields"][0]["name"] == "my_field" + + +def test_format_model_schema_json_with_type_key() -> None: + schema = _make_schema(type_key="column_type", type_value="sampler") + result = format_model_schema_json(schema) + assert result["column_type"] == "sampler" + + +def test_format_model_schema_json_with_nested() -> None: + nested = _make_schema(class_name="Inner", description="Inner model.") + outer_field = FieldDetail( + name="inner", + type_str="Inner", + description="Nested", + nested_schema=nested, + ) + schema = _make_schema(fields=[outer_field]) + result = format_model_schema_json(schema) + inner_field = result["fields"][0] + assert "schema" in inner_field + assert inner_field["schema"]["class_name"] == "Inner" + + +# --------------------------------------------------------------------------- +# format_method_info_text +# --------------------------------------------------------------------------- + + +def test_format_method_info_text_basic() -> None: + methods = [_make_method()] + text = format_method_info_text(methods) + assert "do_thing(x: int) -> str" in text + assert "Does a thing." 
in text + assert "Parameters:" in text + + +def test_format_method_info_text_with_class_name() -> None: + methods = [_make_method()] + text = format_method_info_text(methods, class_name="MyBuilder") + assert "MyBuilder Methods:" in text + + +def test_format_method_info_text_no_class_name() -> None: + methods = [_make_method()] + text = format_method_info_text(methods, class_name=None) + assert "Methods:" not in text + + +# --------------------------------------------------------------------------- +# format_method_info_json +# --------------------------------------------------------------------------- + + +def test_format_method_info_json_basic() -> None: + methods = [_make_method()] + result = format_method_info_json(methods) + assert isinstance(result, list) + assert len(result) == 1 + entry = result[0] + assert entry["name"] == "do_thing" + assert entry["signature"] == "do_thing(x: int) -> str" + assert entry["return_type"] == "str" + assert "description" in entry + assert "parameters" in entry + + +def test_format_method_info_json_multiple_methods() -> None: + methods = [ + _make_method(name="method_a", signature="method_a() -> None", return_type="None", parameters=[]), + _make_method(name="method_b"), + ] + result = format_method_info_json(methods) + assert len(result) == 2 + names = [e["name"] for e in result] + assert "method_a" in names + assert "method_b" in names + + +# --------------------------------------------------------------------------- +# format_type_list_text +# --------------------------------------------------------------------------- + + +def test_format_type_list_text_basic() -> None: + class FakeA: + pass + + class FakeB: + pass + + items: dict[str, type] = {"alpha": FakeA, "beta": FakeB} + text = format_type_list_text(items, "type_name", "class_name") + assert "type_name" in text + assert "class_name" in text + assert "alpha" in text + assert "FakeA" in text + assert "beta" in text + assert "FakeB" in text + + +def 
test_format_type_list_text_alignment() -> None: + class C: + pass + + items: dict[str, type] = {"short": C, "very_long_name": C} + text = format_type_list_text(items, "Type", "Class") + lines = text.strip().split("\n") + # Header + separator + 2 data rows + assert len(lines) == 4 + + +def test_format_type_list_text_empty() -> None: + text = format_type_list_text({}, "Type", "Class") + assert "(no items)" in text + + +# --------------------------------------------------------------------------- +# format_overview_text +# --------------------------------------------------------------------------- + + +def test_format_overview_text_contains_header() -> None: + type_counts = {"Column types": 5, "Sampler types": 3} + methods = [_make_method()] + text = format_overview_text(type_counts, methods) + assert "Data Designer API Overview" in text + + +def test_format_overview_text_contains_type_counts() -> None: + type_counts = {"Column types": 5, "Sampler types": 3} + methods = [_make_method()] + text = format_overview_text(type_counts, methods) + assert "Type Counts:" in text + assert "Column types:" in text + assert "5" in text + assert "Sampler types:" in text + assert "3" in text + + +def test_format_overview_text_contains_builder_methods() -> None: + type_counts = {"Column types": 5} + methods = [_make_method(name="add_column")] + text = format_overview_text(type_counts, methods) + assert "Builder Methods" in text + assert "add_column(...)" in text + + +def test_format_overview_text_contains_quick_start() -> None: + type_counts = {"Column types": 1} + text = format_overview_text(type_counts, []) + assert "Quick Start Commands:" in text + assert "agent-context columns --list" in text diff --git a/packages/data-designer/tests/cli/services/introspection/test_method_inspector.py b/packages/data-designer/tests/cli/services/introspection/test_method_inspector.py new file mode 100644 index 000000000..37007e252 --- /dev/null +++ 
b/packages/data-designer/tests/cli/services/introspection/test_method_inspector.py @@ -0,0 +1,159 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +from data_designer.cli.services.introspection.method_inspector import ( + MethodInfo, + _parse_google_docstring_args, + inspect_class_methods, +) + +# --------------------------------------------------------------------------- +# Test helper class +# --------------------------------------------------------------------------- + + +class SampleClass: + """A sample class for testing method introspection.""" + + def public_method(self, x: int, y: str = "default") -> str: + """Do something public. + + Args: + x: The integer input. + y: An optional string. + + Returns: + A result string. + """ + return f"{x}-{y}" + + def another_public(self) -> None: + """Another public method with no args.""" + + def _private_method(self, z: float) -> float: + """A private helper. + + Args: + z: A float value. + """ + return z * 2 + + def __dunder_method__(self) -> None: + """Should be excluded (dunder).""" + + def __init__(self) -> None: + """Init should be included.""" + + +# --------------------------------------------------------------------------- +# _parse_google_docstring_args +# --------------------------------------------------------------------------- + + +def test_parse_google_docstring_args_basic() -> None: + docstring = """Do something. + + Args: + x: The first parameter. + y: The second parameter. + + Returns: + A result. + """ + result = _parse_google_docstring_args(docstring) + assert "x" in result + assert result["x"] == "The first parameter." + assert "y" in result + assert result["y"] == "The second parameter." 
+ + +def test_parse_google_docstring_args_empty() -> None: + assert _parse_google_docstring_args(None) == {} + assert _parse_google_docstring_args("") == {} + + +def test_parse_google_docstring_args_no_args_section() -> None: + docstring = """Just a description. + + Returns: + Something. + """ + result = _parse_google_docstring_args(docstring) + assert result == {} + + +def test_parse_google_docstring_args_multiline_description() -> None: + docstring = """Do something. + + Args: + x: First line of description + continued on second line. + y: Another param. + """ + result = _parse_google_docstring_args(docstring) + assert "x" in result + assert "continued" in result["x"] + assert "y" in result + + +# --------------------------------------------------------------------------- +# inspect_class_methods - exclude private +# --------------------------------------------------------------------------- + + +def test_inspect_class_methods_public_only() -> None: + methods = inspect_class_methods(SampleClass, include_private=False) + names = [m.name for m in methods] + assert "public_method" in names + assert "another_public" in names + assert "_private_method" not in names + assert "__dunder_method__" not in names + + +def test_inspect_class_methods_returns_method_info() -> None: + methods = inspect_class_methods(SampleClass, include_private=False) + assert all(isinstance(m, MethodInfo) for m in methods) + + +def test_inspect_class_methods_signature_content() -> None: + methods = inspect_class_methods(SampleClass, include_private=False) + public = next(m for m in methods if m.name == "public_method") + assert "x: int" in public.signature + assert "y: str" in public.signature + assert "str" in public.return_type + + +def test_inspect_class_methods_description() -> None: + methods = inspect_class_methods(SampleClass, include_private=False) + public = next(m for m in methods if m.name == "public_method") + assert public.description == "Do something public." 
+ + +def test_inspect_class_methods_parameters() -> None: + methods = inspect_class_methods(SampleClass, include_private=False) + public = next(m for m in methods if m.name == "public_method") + param_names = [p.name for p in public.parameters] + assert "x" in param_names + assert "y" in param_names + x_param = next(p for p in public.parameters if p.name == "x") + assert x_param.description == "The integer input." + + +# --------------------------------------------------------------------------- +# inspect_class_methods - include private +# --------------------------------------------------------------------------- + + +def test_inspect_class_methods_include_private() -> None: + methods = inspect_class_methods(SampleClass, include_private=True) + names = [m.name for m in methods] + assert "_private_method" in names + assert "__dunder_method__" not in names + + +def test_inspect_class_methods_init_included() -> None: + methods = inspect_class_methods(SampleClass, include_private=True) + names = [m.name for m in methods] + assert "__init__" in names diff --git a/packages/data-designer/tests/cli/services/introspection/test_pydantic_inspector.py b/packages/data-designer/tests/cli/services/introspection/test_pydantic_inspector.py new file mode 100644 index 000000000..d29e3e844 --- /dev/null +++ b/packages/data-designer/tests/cli/services/introspection/test_pydantic_inspector.py @@ -0,0 +1,300 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +from enum import Enum +from typing import Annotated + +from pydantic import BaseModel, Field + +from data_designer.cli.services.introspection.pydantic_inspector import ( + FieldDetail, + ModelSchema, + _extract_enum_class, + _is_basemodel_subclass, + _is_enum_subclass, + build_model_schema, + extract_nested_basemodel, + format_type, + get_brief_description, + get_field_info, +) + +# --------------------------------------------------------------------------- +# Test models / enums +# --------------------------------------------------------------------------- + + +class ColorEnum(str, Enum): + RED = "red" + GREEN = "green" + BLUE = "blue" + + +class SizeEnum(str, Enum): + SMALL = "small" + LARGE = "large" + + +class InnerModel(BaseModel): + x: int = 0 + y: str = "hello" + + +class OuterModel(BaseModel): + """Outer model for testing.""" + + plain: str = "foo" + nested: InnerModel = Field(default_factory=InnerModel) + my_enum: ColorEnum = ColorEnum.RED + + +class SelfRefModel(BaseModel): + """A model that references itself (for cycle testing).""" + + name: str = "" + child: SelfRefModel | None = None + + +class DeepA(BaseModel): + val: int = 0 + b: DeepB | None = None + + +class DeepB(BaseModel): + val: int = 0 + + +# --------------------------------------------------------------------------- +# _is_basemodel_subclass +# --------------------------------------------------------------------------- + + +def test_is_basemodel_subclass_with_subclass() -> None: + assert _is_basemodel_subclass(InnerModel) is True + + +def test_is_basemodel_subclass_with_basemodel_itself() -> None: + assert _is_basemodel_subclass(BaseModel) is False + + +def test_is_basemodel_subclass_with_str() -> None: + assert _is_basemodel_subclass(str) is False + + +def test_is_basemodel_subclass_with_enum() -> None: + assert _is_basemodel_subclass(ColorEnum) is False + + +def test_is_basemodel_subclass_with_non_type() -> 
None: + assert _is_basemodel_subclass("not a type") is False + + +# --------------------------------------------------------------------------- +# _is_enum_subclass +# --------------------------------------------------------------------------- + + +def test_is_enum_subclass_with_enum_subclass() -> None: + assert _is_enum_subclass(ColorEnum) is True + + +def test_is_enum_subclass_with_enum_itself() -> None: + assert _is_enum_subclass(Enum) is False + + +def test_is_enum_subclass_with_str() -> None: + assert _is_enum_subclass(str) is False + + +def test_is_enum_subclass_with_non_type() -> None: + assert _is_enum_subclass(42) is False + + +# --------------------------------------------------------------------------- +# _extract_enum_class +# --------------------------------------------------------------------------- + + +def test_extract_enum_class_direct_enum() -> None: + assert _extract_enum_class(ColorEnum) is ColorEnum + + +def test_extract_enum_class_optional_enum() -> None: + assert _extract_enum_class(ColorEnum | None) is ColorEnum + + +def test_extract_enum_class_annotated_enum() -> None: + assert _extract_enum_class(Annotated[ColorEnum, "metadata"]) is ColorEnum + + +def test_extract_enum_class_non_enum() -> None: + assert _extract_enum_class(str) is None + + +def test_extract_enum_class_none() -> None: + assert _extract_enum_class(None) is None + + +# --------------------------------------------------------------------------- +# extract_nested_basemodel +# --------------------------------------------------------------------------- + + +def test_extract_nested_basemodel_direct() -> None: + assert extract_nested_basemodel(InnerModel) is InnerModel + + +def test_extract_nested_basemodel_list() -> None: + assert extract_nested_basemodel(list[InnerModel]) is InnerModel + + +def test_extract_nested_basemodel_optional() -> None: + assert extract_nested_basemodel(InnerModel | None) is InnerModel + + +def test_extract_nested_basemodel_optional_list() -> None: + 
assert extract_nested_basemodel(list[InnerModel] | None) is InnerModel + + +def test_extract_nested_basemodel_dict() -> None: + assert extract_nested_basemodel(dict[str, InnerModel]) is InnerModel + + +def test_extract_nested_basemodel_annotated() -> None: + assert extract_nested_basemodel(Annotated[InnerModel, "info"]) is InnerModel + + +def test_extract_nested_basemodel_discriminated_union_returns_none() -> None: + """Unions of 2+ BaseModel subclasses should return None.""" + assert extract_nested_basemodel(InnerModel | OuterModel) is None + + +def test_extract_nested_basemodel_primitive_returns_none() -> None: + assert extract_nested_basemodel(str) is None + assert extract_nested_basemodel(int) is None + + +def test_extract_nested_basemodel_none_returns_none() -> None: + assert extract_nested_basemodel(None) is None + + +def test_extract_nested_basemodel_basemodel_itself_returns_none() -> None: + assert extract_nested_basemodel(BaseModel) is None + + +# --------------------------------------------------------------------------- +# format_type +# --------------------------------------------------------------------------- + + +def test_format_type_str() -> None: + result = format_type(str) + assert "str" in result + + +def test_format_type_int() -> None: + result = format_type(int) + assert "int" in result + + +def test_format_type_optional() -> None: + result = format_type(str | None) + assert "str" in result + assert "None" in result + + +# --------------------------------------------------------------------------- +# get_brief_description +# --------------------------------------------------------------------------- + + +def test_get_brief_description_with_docstring() -> None: + result = get_brief_description(OuterModel) + assert result == "Outer model for testing." + + +def test_get_brief_description_without_docstring() -> None: + result = get_brief_description(InnerModel) + assert result == "No description available." 
+ + +# --------------------------------------------------------------------------- +# get_field_info +# --------------------------------------------------------------------------- + + +def test_get_field_info_returns_field_details() -> None: + fields = get_field_info(OuterModel) + assert isinstance(fields, list) + assert all(isinstance(f, FieldDetail) for f in fields) + names = [f.name for f in fields] + assert "plain" in names + assert "nested" in names + assert "my_enum" in names + + +def test_get_field_info_enum_values() -> None: + fields = get_field_info(OuterModel) + enum_field = next(f for f in fields if f.name == "my_enum") + assert enum_field.enum_values is not None + assert set(enum_field.enum_values) == {"RED", "GREEN", "BLUE"} + + +def test_get_field_info_non_enum_has_no_enum_values() -> None: + fields = get_field_info(OuterModel) + plain_field = next(f for f in fields if f.name == "plain") + assert plain_field.enum_values is None + + +# --------------------------------------------------------------------------- +# build_model_schema +# --------------------------------------------------------------------------- + + +def test_build_model_schema_basic_structure() -> None: + schema = build_model_schema(OuterModel) + assert isinstance(schema, ModelSchema) + assert schema.class_name == "OuterModel" + assert schema.description == "Outer model for testing." 
+ assert len(schema.fields) == 3 + + +def test_build_model_schema_with_type_key_and_value() -> None: + schema = build_model_schema(OuterModel, type_key="column_type", type_value="test") + assert schema.type_key == "column_type" + assert schema.type_value == "test" + + +def test_build_model_schema_nested_expansion() -> None: + schema = build_model_schema(OuterModel) + nested_field = next(f for f in schema.fields if f.name == "nested") + assert nested_field.nested_schema is not None + assert nested_field.nested_schema.class_name == "InnerModel" + nested_names = [f.name for f in nested_field.nested_schema.fields] + assert "x" in nested_names + assert "y" in nested_names + + +def test_build_model_schema_cycle_protection() -> None: + schema = build_model_schema(SelfRefModel) + child_field = next(f for f in schema.fields if f.name == "child") + # SelfRefModel references itself: the first expansion should happen, + # but the recursive child should NOT be expanded again (cycle detected). + if child_field.nested_schema is not None: + inner_child = next( + (f for f in child_field.nested_schema.fields if f.name == "child"), + None, + ) + if inner_child is not None: + assert inner_child.nested_schema is None + + +def test_build_model_schema_depth_limiting() -> None: + schema = build_model_schema(DeepA, max_depth=1) + b_field = next(f for f in schema.fields if f.name == "b") + if b_field.nested_schema is not None: + # At depth 1, nested model should not recurse further + for f in b_field.nested_schema.fields: + assert f.nested_schema is None From e0ef33d8e12ceb42f4626b6cf0ff92f0379dff77 Mon Sep 17 00:00:00 2001 From: Johnny Greco Date: Sun, 15 Feb 2026 15:01:49 -0500 Subject: [PATCH 02/37] fix: correct agent-context field descriptions in column configs Remove redundant `name` field re-declaration from ExpressionColumnConfig (already inherited from SingleColumnConfig) and fix validator_type description to use actual enum values instead of uppercase member names. 
--- .../src/data_designer/config/column_configs.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/packages/data-designer-config/src/data_designer/config/column_configs.py b/packages/data-designer-config/src/data_designer/config/column_configs.py index aa1a7f2a0..151202902 100644 --- a/packages/data-designer-config/src/data_designer/config/column_configs.py +++ b/packages/data-designer-config/src/data_designer/config/column_configs.py @@ -378,7 +378,6 @@ class ExpressionColumnConfig(SingleColumnConfig): column_type: Discriminator field, always "expression" for this configuration type. """ - name: str = Field(description="Unique name of the column to be generated") expr: str = Field(description="Jinja2 expression to compute the column value from other columns") dtype: Literal["int", "float", "str", "bool"] = Field( default="str", description="Data type for expression result: 'int', 'float', 'str', or 'bool'" @@ -452,7 +451,7 @@ class ValidationColumnConfig(SingleColumnConfig): """ target_columns: list[str] = Field(description="List of column names to validate") - validator_type: ValidatorType = Field(description="Validation method: CODE, LOCAL_CALLABLE, or REMOTE") + validator_type: ValidatorType = Field(description="Validation method: 'code', 'local_callable', or 'remote'") validator_params: ValidatorParamsT = Field(description="Validator-specific parameters (e.g., CodeValidatorParams)") batch_size: int = Field(default=10, ge=1, description="Number of records to process in each batch") column_type: Literal["validation"] = Field( From 914f3c48e725d9299a92143aa54c00f2eee634ff Mon Sep 17 00:00:00 2001 From: Johnny Greco Date: Sun, 15 Feb 2026 21:33:52 -0500 Subject: [PATCH 03/37] feat: enhance pydantic and method inspectors with richer field details Add required/default/constraints to FieldDetail, PropertyInfo dataclass, classmethod detection, inspect_class_properties, and __init__ docstring fallback. Enum values now use .value instead of .name. 
--- .../introspection/method_inspector.py | 87 ++++++++- .../introspection/pydantic_inspector.py | 37 +++- .../introspection/test_method_inspector.py | 177 +++++++++++++++++- .../introspection/test_pydantic_inspector.py | 132 +++++++++++-- 4 files changed, 411 insertions(+), 22 deletions(-) diff --git a/packages/data-designer/src/data_designer/cli/services/introspection/method_inspector.py b/packages/data-designer/src/data_designer/cli/services/introspection/method_inspector.py index 2451b63e5..e5784b7da 100644 --- a/packages/data-designer/src/data_designer/cli/services/introspection/method_inspector.py +++ b/packages/data-designer/src/data_designer/cli/services/introspection/method_inspector.py @@ -23,6 +23,14 @@ class MethodInfo: description: str return_type: str parameters: list[ParamInfo] = field(default_factory=list) + is_classmethod: bool = False + + +@dataclass +class PropertyInfo: + name: str + return_type: str + description: str def _parse_google_docstring_args(docstring: str | None) -> dict[str, str]: @@ -206,9 +214,23 @@ def _is_private(name: str) -> bool: return name.startswith("_") and not (name.startswith("__") and name.endswith("__")) +_DEFAULT_INIT_DOCSTRING = "Initialize self. See help(type(self)) for accurate signature." + + +def _is_default_init_docstring(docstring: str | None) -> bool: + """Check if a docstring is the unhelpful default __init__ docstring.""" + if not docstring: + return False + normalized = " ".join(docstring.strip().split()) + return normalized == _DEFAULT_INIT_DOCSTRING + + def inspect_class_methods(cls: type, include_private: bool = False) -> list[MethodInfo]: """Introspect public methods of a class using inspect.signature() and docstring parsing. + Detects regular methods, classmethods, and handles __init__ docstring fallback + to the class docstring when the default is unhelpful. + Args: cls: The class to introspect. include_private: If True, include methods starting with underscore. 
@@ -217,8 +239,19 @@ def inspect_class_methods(cls: type, include_private: bool = False) -> list[Meth List of MethodInfo objects for each method. """ methods: list[MethodInfo] = [] + classmethod_names = _collect_classmethod_names(cls) + + # inspect.isfunction finds regular methods; inspect.ismethod finds classmethods + seen: set[str] = set() + candidates: list[tuple[str, object]] = [] + candidates.extend(inspect.getmembers(cls, predicate=inspect.isfunction)) + candidates.extend(inspect.getmembers(cls, predicate=inspect.ismethod)) + + for name, method in candidates: + if name in seen: + continue + seen.add(name) - for name, method in inspect.getmembers(cls, predicate=inspect.isfunction): if _is_dunder(name): continue if _is_private(name) and not include_private: @@ -230,6 +263,10 @@ def inspect_class_methods(cls: type, include_private: bool = False) -> list[Meth continue docstring = inspect.getdoc(method) + + if name == "__init__" and _is_default_init_docstring(docstring): + docstring = inspect.getdoc(cls) or "" + docstring_args = _parse_google_docstring_args(docstring) signature_str = _format_signature(name, sig) @@ -244,7 +281,55 @@ def inspect_class_methods(cls: type, include_private: bool = False) -> list[Meth description=description, return_type=return_type, parameters=parameters, + is_classmethod=name in classmethod_names, ) ) return methods + + +def _collect_classmethod_names(cls: type) -> set[str]: + """Collect the names of all classmethods defined on a class and its bases.""" + names: set[str] = set() + for klass in cls.__mro__: + for name, value in vars(klass).items(): + if isinstance(value, classmethod): + names.add(name) + return names + + +def inspect_class_properties(cls: type, include_private: bool = False) -> list[PropertyInfo]: + """Introspect properties of a class. + + Args: + cls: The class to introspect. + include_private: If True, include properties starting with underscore. + + Returns: + List of PropertyInfo objects for each property. 
+ """ + properties: list[PropertyInfo] = [] + + for name in dir(cls): + if _is_dunder(name): + continue + if _is_private(name) and not include_private: + continue + + attr = inspect.getattr_static(cls, name, None) + if not isinstance(attr, property): + continue + + return_type = "Any" + if attr.fget is not None: + hints = getattr(attr.fget, "__annotations__", {}) + ret = hints.get("return") + if ret is not None: + return_type = _format_annotation(ret) + + docstring = attr.fget.__doc__ if attr.fget is not None else None + description = _get_first_docstring_line(docstring) + + properties.append(PropertyInfo(name=name, return_type=return_type, description=description)) + + return properties diff --git a/packages/data-designer/src/data_designer/cli/services/introspection/pydantic_inspector.py b/packages/data-designer/src/data_designer/cli/services/introspection/pydantic_inspector.py index 70fe69c92..e9287659a 100644 --- a/packages/data-designer/src/data_designer/cli/services/introspection/pydantic_inspector.py +++ b/packages/data-designer/src/data_designer/cli/services/introspection/pydantic_inspector.py @@ -11,6 +11,7 @@ from typing import Any, get_args, get_origin from pydantic import BaseModel +from pydantic_core import PydanticUndefined @dataclass @@ -20,7 +21,10 @@ class FieldDetail: name: str type_str: str description: str + required: bool = True + default: str | None = None enum_values: list[str] | None = None + constraints: dict[str, Any] | None = None nested_schema: ModelSchema | None = None @@ -165,6 +169,18 @@ def get_brief_description(cls: type) -> str: return "No description available." 
+def _extract_constraints(field_info: Any) -> dict[str, Any] | None: + """Extract numeric/string constraints from a Pydantic FieldInfo's metadata.""" + constraint_keys = {"ge", "le", "gt", "lt", "min_length", "max_length"} + constraints: dict[str, Any] = {} + for meta in getattr(field_info, "metadata", []): + for key in constraint_keys: + val = getattr(meta, key, None) + if val is not None: + constraints[key] = val + return constraints or None + + def get_field_info(cls: type) -> list[FieldDetail]: """Extract field information from a Pydantic model. @@ -172,8 +188,9 @@ def get_field_info(cls: type) -> list[FieldDetail]: cls: The Pydantic model class to inspect. Returns: - List of FieldDetail objects with name, type_str, description, enum_values, - and nested_schema (initially None, populated by build_model_schema). + List of FieldDetail objects with name, type_str, description, required, + default, enum_values, constraints, and nested_schema (initially None, + populated by build_model_schema). 
""" fields: list[FieldDetail] = [] model_fields: dict[str, Any] = getattr(cls, "model_fields", {}) @@ -182,17 +199,31 @@ def get_field_info(cls: type) -> list[FieldDetail]: type_str = format_type(field_info.annotation) description = field_info.description or "" + required = field_info.is_required() + + default: str | None = None + if not required: + if field_info.default is not PydanticUndefined and field_info.default is not None: + default = repr(field_info.default) + elif field_info.default_factory is not None: + default = f"{field_info.default_factory.__name__}()" + enum_cls = _extract_enum_class(field_info.annotation) enum_values: list[str] | None = None if enum_cls is not None: - enum_values = [member.name for member in enum_cls] + enum_values = [str(member.value) for member in enum_cls] + + constraints = _extract_constraints(field_info) fields.append( FieldDetail( name=field_name, type_str=type_str, description=description, + required=required, + default=default, enum_values=enum_values, + constraints=constraints, nested_schema=None, ) ) diff --git a/packages/data-designer/tests/cli/services/introspection/test_method_inspector.py b/packages/data-designer/tests/cli/services/introspection/test_method_inspector.py index 37007e252..bc8e0def3 100644 --- a/packages/data-designer/tests/cli/services/introspection/test_method_inspector.py +++ b/packages/data-designer/tests/cli/services/introspection/test_method_inspector.py @@ -5,12 +5,14 @@ from data_designer.cli.services.introspection.method_inspector import ( MethodInfo, + PropertyInfo, _parse_google_docstring_args, inspect_class_methods, + inspect_class_properties, ) # --------------------------------------------------------------------------- -# Test helper class +# Test helper classes # --------------------------------------------------------------------------- @@ -47,6 +49,68 @@ def __init__(self) -> None: """Init should be included.""" +class ClassWithClassmethod: + """A class with a classmethod for 
testing.""" + + @classmethod + def from_value(cls, value: int) -> ClassWithClassmethod: + """Create an instance from a value. + + Args: + value: The input value. + """ + return cls() + + def regular_method(self) -> str: + """A regular method.""" + return "hello" + + +class ClassWithProperties: + """A class with properties for testing.""" + + def __init__(self) -> None: + self._name = "test" + self._count = 0 + + @property + def name(self) -> str: + """Get the name.""" + return self._name + + @property + def count(self) -> int: + """Get the count value.""" + return self._count + + @property + def _private_prop(self) -> bool: + """A private property.""" + return True + + +class ClassWithDefaultInitDocstring: + """A useful class that does important things. + + This is a longer description of the class. + """ + + def __init__(self, x: int = 0) -> None: + self.x = x + + +class ClassWithCustomInitDocstring: + """Class-level docstring.""" + + def __init__(self, x: int) -> None: + """Custom init docstring. + + Args: + x: An integer. 
+ """ + self.x = x + + # --------------------------------------------------------------------------- # _parse_google_docstring_args # --------------------------------------------------------------------------- @@ -157,3 +221,114 @@ def test_inspect_class_methods_init_included() -> None: methods = inspect_class_methods(SampleClass, include_private=True) names = [m.name for m in methods] assert "__init__" in names + + +# --------------------------------------------------------------------------- +# inspect_class_methods - classmethod detection +# --------------------------------------------------------------------------- + + +def test_inspect_class_methods_detects_classmethod() -> None: + methods = inspect_class_methods(ClassWithClassmethod, include_private=False) + names = [m.name for m in methods] + assert "from_value" in names + assert "regular_method" in names + + +def test_inspect_class_methods_classmethod_is_classmethod_flag() -> None: + methods = inspect_class_methods(ClassWithClassmethod, include_private=False) + from_value = next(m for m in methods if m.name == "from_value") + regular = next(m for m in methods if m.name == "regular_method") + assert from_value.is_classmethod is True + assert regular.is_classmethod is False + + +def test_inspect_class_methods_classmethod_signature() -> None: + methods = inspect_class_methods(ClassWithClassmethod, include_private=False) + from_value = next(m for m in methods if m.name == "from_value") + assert "value: int" in from_value.signature + + +def test_inspect_class_methods_classmethod_description() -> None: + methods = inspect_class_methods(ClassWithClassmethod, include_private=False) + from_value = next(m for m in methods if m.name == "from_value") + assert from_value.description == "Create an instance from a value." 
+ + +def test_inspect_class_methods_classmethod_parameters() -> None: + methods = inspect_class_methods(ClassWithClassmethod, include_private=False) + from_value = next(m for m in methods if m.name == "from_value") + param_names = [p.name for p in from_value.parameters] + assert "value" in param_names + value_param = next(p for p in from_value.parameters if p.name == "value") + assert value_param.description == "The input value." + + +def test_inspect_class_methods_regular_not_classmethod() -> None: + methods = inspect_class_methods(SampleClass, include_private=False) + for m in methods: + assert m.is_classmethod is False + + +# --------------------------------------------------------------------------- +# inspect_class_properties +# --------------------------------------------------------------------------- + + +def test_inspect_class_properties_finds_public() -> None: + props = inspect_class_properties(ClassWithProperties, include_private=False) + names = [p.name for p in props] + assert "name" in names + assert "count" in names + assert "_private_prop" not in names + + +def test_inspect_class_properties_returns_property_info() -> None: + props = inspect_class_properties(ClassWithProperties, include_private=False) + assert all(isinstance(p, PropertyInfo) for p in props) + + +def test_inspect_class_properties_return_types() -> None: + props = inspect_class_properties(ClassWithProperties, include_private=False) + name_prop = next(p for p in props if p.name == "name") + count_prop = next(p for p in props if p.name == "count") + assert name_prop.return_type == "str" + assert count_prop.return_type == "int" + + +def test_inspect_class_properties_descriptions() -> None: + props = inspect_class_properties(ClassWithProperties, include_private=False) + name_prop = next(p for p in props if p.name == "name") + count_prop = next(p for p in props if p.name == "count") + assert name_prop.description == "Get the name." + assert count_prop.description == "Get the count value." 
+ + +def test_inspect_class_properties_include_private() -> None: + props = inspect_class_properties(ClassWithProperties, include_private=True) + names = [p.name for p in props] + assert "_private_prop" in names + + +def test_inspect_class_properties_empty_class() -> None: + props = inspect_class_properties(SampleClass, include_private=False) + assert props == [] + + +# --------------------------------------------------------------------------- +# __init__ docstring fallback +# --------------------------------------------------------------------------- + + +def test_init_default_docstring_falls_back_to_class() -> None: + methods = inspect_class_methods(ClassWithDefaultInitDocstring, include_private=True) + init = next((m for m in methods if m.name == "__init__"), None) + assert init is not None + assert init.description == "A useful class that does important things." + + +def test_init_custom_docstring_preserved() -> None: + methods = inspect_class_methods(ClassWithCustomInitDocstring, include_private=True) + init = next((m for m in methods if m.name == "__init__"), None) + assert init is not None + assert init.description == "Custom init docstring." 
diff --git a/packages/data-designer/tests/cli/services/introspection/test_pydantic_inspector.py b/packages/data-designer/tests/cli/services/introspection/test_pydantic_inspector.py index d29e3e844..f5ab0be5c 100644 --- a/packages/data-designer/tests/cli/services/introspection/test_pydantic_inspector.py +++ b/packages/data-designer/tests/cli/services/introspection/test_pydantic_inspector.py @@ -11,6 +11,7 @@ from data_designer.cli.services.introspection.pydantic_inspector import ( FieldDetail, ModelSchema, + _extract_constraints, _extract_enum_class, _is_basemodel_subclass, _is_enum_subclass, @@ -57,13 +58,33 @@ class SelfRefModel(BaseModel): child: SelfRefModel | None = None +class DeepB(BaseModel): + val: int = 0 + + class DeepA(BaseModel): val: int = 0 b: DeepB | None = None -class DeepB(BaseModel): - val: int = 0 +# Rebuild models that use forward references (required due to `from __future__ import annotations`) +SelfRefModel.model_rebuild() +DeepA.model_rebuild() + + +class RequiredFieldModel(BaseModel): + """Model with required and optional fields for testing.""" + + required_name: str + optional_name: str = "default_val" + + +class ConstrainedModel(BaseModel): + """Model with constrained fields for testing.""" + + score: float = Field(default=0.5, ge=0.0, le=1.0) + label: str = Field(default="", min_length=1, max_length=100) + count: int = Field(default=0, gt=-1, lt=1000) # --------------------------------------------------------------------------- @@ -235,11 +256,11 @@ def test_get_field_info_returns_field_details() -> None: assert "my_enum" in names -def test_get_field_info_enum_values() -> None: +def test_get_field_info_enum_values_use_dot_value() -> None: fields = get_field_info(OuterModel) enum_field = next(f for f in fields if f.name == "my_enum") assert enum_field.enum_values is not None - assert set(enum_field.enum_values) == {"RED", "GREEN", "BLUE"} + assert set(enum_field.enum_values) == {"red", "green", "blue"} def 
test_get_field_info_non_enum_has_no_enum_values() -> None: @@ -248,6 +269,84 @@ def test_get_field_info_non_enum_has_no_enum_values() -> None: assert plain_field.enum_values is None +# --------------------------------------------------------------------------- +# required / default +# --------------------------------------------------------------------------- + + +def test_get_field_info_required_field() -> None: + fields = get_field_info(RequiredFieldModel) + req = next(f for f in fields if f.name == "required_name") + assert req.required is True + assert req.default is None + + +def test_get_field_info_optional_field_default() -> None: + fields = get_field_info(RequiredFieldModel) + opt = next(f for f in fields if f.name == "optional_name") + assert opt.required is False + assert opt.default == "'default_val'" + + +def test_get_field_info_default_factory() -> None: + fields = get_field_info(OuterModel) + nested = next(f for f in fields if f.name == "nested") + assert nested.required is False + assert nested.default == "InnerModel()" + + +def test_get_field_info_none_default_not_shown() -> None: + """Fields with default=None (like SelfRefModel.child) should have default=None in FieldDetail.""" + fields = get_field_info(SelfRefModel) + child = next(f for f in fields if f.name == "child") + assert child.required is False + assert child.default is None + + +# --------------------------------------------------------------------------- +# constraints +# --------------------------------------------------------------------------- + + +def test_extract_constraints_from_constrained_model() -> None: + fields = get_field_info(ConstrainedModel) + score = next(f for f in fields if f.name == "score") + assert score.constraints is not None + assert score.constraints["ge"] == 0.0 + assert score.constraints["le"] == 1.0 + + +def test_extract_constraints_gt_lt() -> None: + fields = get_field_info(ConstrainedModel) + count = next(f for f in fields if f.name == "count") + assert 
count.constraints is not None + assert count.constraints["gt"] == -1 + assert count.constraints["lt"] == 1000 + + +def test_extract_constraints_string_lengths() -> None: + fields = get_field_info(ConstrainedModel) + label = next(f for f in fields if f.name == "label") + assert label.constraints is not None + assert label.constraints["min_length"] == 1 + assert label.constraints["max_length"] == 100 + + +def test_extract_constraints_none_for_unconstrained() -> None: + fields = get_field_info(InnerModel) + x_field = next(f for f in fields if f.name == "x") + assert x_field.constraints is None + + +def test_extract_constraints_helper_with_no_metadata() -> None: + """_extract_constraints returns None when field_info has no constraint metadata.""" + + class FakeFieldInfo: + metadata: list = [] + + assert _extract_constraints(FakeFieldInfo()) is None + + # --------------------------------------------------------------------------- # build_model_schema # --------------------------------------------------------------------------- @@ -280,21 +379,20 @@ def test_build_model_schema_nested_expansion() -> None: def test_build_model_schema_cycle_protection() -> None: schema = build_model_schema(SelfRefModel) child_field = next(f for f in schema.fields if f.name == "child") - # SelfRefModel references itself: the first expansion should happen, - # but the recursive child should NOT be expanded again (cycle detected). 
- if child_field.nested_schema is not None: - inner_child = next( - (f for f in child_field.nested_schema.fields if f.name == "child"), - None, - ) - if inner_child is not None: - assert inner_child.nested_schema is None + # First level: SelfRefModel.child should be expanded into a nested schema + assert child_field.nested_schema is not None, "First-level expansion must happen" + assert child_field.nested_schema.class_name == "SelfRefModel" + # Second level: the recursive child.child should NOT be expanded (cycle detected) + inner_child = next(f for f in child_field.nested_schema.fields if f.name == "child") + assert inner_child.nested_schema is None, "Cycle protection must block second-level expansion" def test_build_model_schema_depth_limiting() -> None: schema = build_model_schema(DeepA, max_depth=1) b_field = next(f for f in schema.fields if f.name == "b") - if b_field.nested_schema is not None: - # At depth 1, nested model should not recurse further - for f in b_field.nested_schema.fields: - assert f.nested_schema is None + # At max_depth=1, the first nested level (DeepB) should still be expanded + assert b_field.nested_schema is not None, "First-level nesting must be expanded at max_depth=1" + assert b_field.nested_schema.class_name == "DeepB" + # But any further nesting within DeepB should be blocked + for f in b_field.nested_schema.fields: + assert f.nested_schema is None, f"Field '{f.name}' should not be expanded beyond max_depth" From ac91dc5c562af9dd5a439246f6e766d81256ae7e Mon Sep 17 00:00:00 2001 From: Johnny Greco Date: Sun, 15 Feb 2026 21:34:05 -0500 Subject: [PATCH 04/37] feat: add Field descriptions and docstrings to config models Add description= to RunConfig fields, class docstrings to constraint and seed source types for richer introspection output. 
--- .../src/data_designer/config/run_config.py | 42 +++++++++++++++---- .../config/sampler_constraints.py | 4 ++ .../src/data_designer/config/seed_source.py | 4 ++ 3 files changed, 43 insertions(+), 7 deletions(-) diff --git a/packages/data-designer-config/src/data_designer/config/run_config.py b/packages/data-designer-config/src/data_designer/config/run_config.py index 03b2ed297..538844add 100644 --- a/packages/data-designer-config/src/data_designer/config/run_config.py +++ b/packages/data-designer-config/src/data_designer/config/run_config.py @@ -35,13 +35,41 @@ class RunConfig(ConfigBase): Default is 0. """ - disable_early_shutdown: bool = False - shutdown_error_rate: float = Field(default=0.5, ge=0.0, le=1.0) - shutdown_error_window: int = Field(default=10, ge=0) - buffer_size: int = Field(default=1000, gt=0) - non_inference_max_parallel_workers: int = Field(default=4, ge=1) - max_conversation_restarts: int = Field(default=5, ge=0) - max_conversation_correction_steps: int = Field(default=0, ge=0) + disable_early_shutdown: bool = Field( + default=False, + description="If True, disables early-shutdown behavior; generation continues regardless of error rate", + ) + shutdown_error_rate: float = Field( + default=0.5, + ge=0.0, + le=1.0, + description="Error rate threshold (0.0-1.0) that triggers early shutdown when early shutdown is enabled", + ) + shutdown_error_window: int = Field( + default=10, + ge=0, + description="Minimum number of completed tasks before error rate monitoring begins", + ) + buffer_size: int = Field( + default=1000, + gt=0, + description="Number of records to process in each batch during dataset generation", + ) + non_inference_max_parallel_workers: int = Field( + default=4, + ge=1, + description="Maximum number of worker threads used for non-inference cell-by-cell generators", + ) + max_conversation_restarts: int = Field( + default=5, + ge=0, + description="Maximum number of full conversation restarts permitted per ModelFacade.generate() 
call", + ) + max_conversation_correction_steps: int = Field( + default=0, + ge=0, + description="Maximum number of correction rounds permitted within a single conversation", + ) @model_validator(mode="after") def normalize_shutdown_settings(self) -> Self: diff --git a/packages/data-designer-config/src/data_designer/config/sampler_constraints.py b/packages/data-designer-config/src/data_designer/config/sampler_constraints.py index e935424e7..73128fd6c 100644 --- a/packages/data-designer-config/src/data_designer/config/sampler_constraints.py +++ b/packages/data-designer-config/src/data_designer/config/sampler_constraints.py @@ -33,6 +33,8 @@ def constraint_type(self) -> ConstraintType: ... class ScalarInequalityConstraint(Constraint): + """Constraint that compares a column's values against a scalar threshold.""" + rhs: float = Field(description="Scalar value to compare against") operator: InequalityOperator = Field(description="Comparison operator (lt, le, gt, ge)") @@ -42,6 +44,8 @@ def constraint_type(self) -> ConstraintType: class ColumnInequalityConstraint(Constraint): + """Constraint that compares a column's values against another column's values.""" + rhs: str = Field(description="Name of the other column to compare against") operator: InequalityOperator = Field(description="Comparison operator (lt, le, gt, ge)") diff --git a/packages/data-designer-config/src/data_designer/config/seed_source.py b/packages/data-designer-config/src/data_designer/config/seed_source.py index df43deccd..7244a6e41 100644 --- a/packages/data-designer-config/src/data_designer/config/seed_source.py +++ b/packages/data-designer-config/src/data_designer/config/seed_source.py @@ -30,6 +30,8 @@ class SeedSource(BaseModel, ABC): class LocalFileSeedSource(SeedSource): + """Seed source that reads data from a local file (e.g., Parquet, CSV, JSONL).""" + seed_type: Literal["local"] = Field( default="local", description="Seed source type discriminator, always 'local' for local file sources" ) @@ 
-55,6 +57,8 @@ def from_dataframe(cls, df: pd.DataFrame, path: str) -> Self: class HuggingFaceSeedSource(SeedSource): + """Seed source that reads data from a HuggingFace dataset repository.""" + seed_type: Literal["hf"] = Field( default="hf", description="Seed source type discriminator, always 'hf' for HuggingFace sources" ) From 311201c643f98fe2f9b47e79f16e53a83b223504 Mon Sep 17 00:00:00 2001 From: Johnny Greco Date: Sun, 15 Feb 2026 21:34:17 -0500 Subject: [PATCH 05/37] feat: enhance formatters with rich field display, dedup, and new format functions Add required/default/constraints to field rendering, schema deduplication via seen_schemas, and new formatters for interface, imports, and namespace tree output. --- .../cli/services/introspection/formatters.py | 191 ++++++++++++- .../services/introspection/test_formatters.py | 265 +++++++++++++++++- 2 files changed, 436 insertions(+), 20 deletions(-) diff --git a/packages/data-designer/src/data_designer/cli/services/introspection/formatters.py b/packages/data-designer/src/data_designer/cli/services/introspection/formatters.py index 097f9c2ea..2a26f8201 100644 --- a/packages/data-designer/src/data_designer/cli/services/introspection/formatters.py +++ b/packages/data-designer/src/data_designer/cli/services/introspection/formatters.py @@ -3,29 +3,57 @@ from __future__ import annotations +from typing import Any + from data_designer.cli.services.introspection.method_inspector import MethodInfo, ParamInfo from data_designer.cli.services.introspection.pydantic_inspector import FieldDetail, ModelSchema +_AGENT_GUIDANCE_FOOTER = ( + "Use `data-designer introspect ` for API details.\n" + "Only read source files directly if these commands don't cover your need." 
+) + -def _format_field_text(field: FieldDetail, indent: int = 4) -> list[str]: - """Format a single field as YAML-style text lines, recursing into nested schemas.""" +def _format_field_text(field: FieldDetail, indent: int = 4, seen_schemas: set[str] | None = None) -> list[str]: + """Format a single field as YAML-style text lines, recursing into nested schemas. + + When ``seen_schemas`` is provided, nested schemas that have already been rendered + are replaced with a short back-reference to reduce output duplication. + """ pad = " " * indent lines: list[str] = [] - lines.append(f"{pad}{field.name}:") - lines.append(f"{pad} type: {field.type_str}") + header = f"{pad}{field.name}: {field.type_str}" + if field.default is not None: + header += f" = {field.default}" + if field.required: + header += " [required]" + lines.append(header) if field.description: lines.append(f"{pad} description: {field.description}") if field.enum_values: lines.append(f"{pad} values: [{', '.join(field.enum_values)}]") + if field.constraints: + constraint_parts = [f"{k}={v}" for k, v in field.constraints.items()] + lines.append(f"{pad} constraints: {', '.join(constraint_parts)}") if field.nested_schema: - lines.append(f"{pad} schema ({field.nested_schema.class_name}):") - for nested_field in field.nested_schema.fields: - lines.extend(_format_field_text(nested_field, indent=indent + 4)) + schema_name = field.nested_schema.class_name + if seen_schemas is not None and schema_name in seen_schemas: + lines.append(f"{pad} schema: (see {schema_name} above)") + else: + if seen_schemas is not None: + seen_schemas.add(schema_name) + lines.append(f"{pad} schema ({schema_name}):") + for nested_field in field.nested_schema.fields: + lines.extend(_format_field_text(nested_field, indent=indent + 4, seen_schemas=seen_schemas)) return lines -def format_model_schema_text(schema: ModelSchema, indent: int = 0) -> str: - """Format a ModelSchema as YAML-style text for backward compatibility with the existing skill 
scripts.""" +def format_model_schema_text(schema: ModelSchema, indent: int = 0, seen_schemas: set[str] | None = None) -> str: + """Format a ModelSchema as YAML-style text for backward compatibility with the existing skill scripts. + + When ``seen_schemas`` is provided, nested schemas that have already been rendered + across prior calls are replaced with a short back-reference. + """ lines: list[str] = [] pad = " " * indent lines.append(f"{pad}{schema.class_name}:") @@ -34,7 +62,7 @@ def format_model_schema_text(schema: ModelSchema, indent: int = 0) -> str: lines.append(f"{pad} description: {schema.description}") lines.append(f"{pad} fields:") for field in schema.fields: - lines.extend(_format_field_text(field, indent=indent + 4)) + lines.extend(_format_field_text(field, indent=indent + 4, seen_schemas=seen_schemas)) return "\n".join(lines) @@ -43,11 +71,16 @@ def _format_field_json(field: FieldDetail) -> dict: result: dict = { "name": field.name, "type": field.type_str, + "required": field.required, } + if field.default is not None: + result["default"] = field.default if field.description: result["description"] = field.description if field.enum_values: result["values"] = field.enum_values + if field.constraints: + result["constraints"] = field.constraints if field.nested_schema: result["schema"] = format_model_schema_json(field.nested_schema) return result @@ -168,11 +201,13 @@ def format_overview_text(type_counts: dict[str, int], builder_methods: list[Meth lines.append("") lines.append("Quick Start Commands:") - lines.append(" data-designer agent-context columns --list") - lines.append(" data-designer agent-context columns all") - lines.append(" data-designer agent-context columns llm-text") - lines.append(" data-designer agent-context samplers category") - lines.append(" data-designer agent-context builder") + lines.append(" data-designer introspect columns --list") + lines.append(" data-designer introspect columns all") + lines.append(" data-designer introspect 
columns llm-text") + lines.append(" data-designer introspect samplers category") + lines.append(" data-designer introspect builder") + lines.append(" data-designer introspect interface") + lines.append(" data-designer introspect imports") return "\n".join(lines) @@ -180,3 +215,129 @@ def format_overview_text(type_counts: dict[str, int], builder_methods: list[Meth def _short_sig(method: MethodInfo) -> str: """Create a compact signature like 'add_column(...)' for overview display.""" return f"{method.name}(...)" + + +# --------------------------------------------------------------------------- +# Namespace / code-structure formatters +# --------------------------------------------------------------------------- + + +def _render_tree_lines(node: dict[str, Any], prefix: str = "", is_last: bool = True) -> list[str]: + """Recursively render a namespace tree node into box-drawing lines.""" + connector = "└── " if is_last else "├── " + suffix = "/" if node["is_package"] else ".py" + lines: list[str] = [f"{prefix}{connector}{node['name']}{suffix}"] + + children = node.get("children", []) + child_prefix = prefix + (" " if is_last else "│ ") + for i, child in enumerate(children): + lines.extend(_render_tree_lines(child, child_prefix, is_last=(i == len(children) - 1))) + return lines + + +def format_namespace_text(data: dict[str, Any]) -> str: + """Format a namespace tree as a text tree diagram with box-drawing characters.""" + lines: list[str] = [] + lines.append("data_designer code structure") + lines.append("=" * 28) + lines.append("") + + paths = data.get("paths", []) + if paths: + lines.append("Install path:") + for p in paths: + lines.append(f" {p}") + lines.append("") + + tree = data["tree"] + lines.append(f"{tree['name']}/") + children = tree.get("children", []) + for i, child in enumerate(children): + lines.extend(_render_tree_lines(child, prefix="", is_last=(i == len(children) - 1))) + + lines.append("") + lines.append(_AGENT_GUIDANCE_FOOTER) + return 
"\n".join(lines) + + +def format_namespace_json(data: dict[str, Any]) -> dict[str, Any]: + """Return the namespace tree dict as-is for JSON output.""" + return data + + +# --------------------------------------------------------------------------- +# Interface formatters +# --------------------------------------------------------------------------- + + +def format_interface_text( + classes_with_methods: list[tuple[str, list[MethodInfo]]], + pydantic_schemas: list[ModelSchema], +) -> str: + """Format interface classes as readable text for agent consumption.""" + lines: list[str] = [] + lines.append("Data Designer Interface Reference") + lines.append("=" * 34) + lines.append("") + + for class_name, methods in classes_with_methods: + lines.append(format_method_info_text(methods, class_name=class_name)) + lines.append("") + + for schema in pydantic_schemas: + lines.append(format_model_schema_text(schema)) + lines.append("") + + return "\n".join(lines).rstrip() + + +def format_interface_json( + classes_with_methods: list[tuple[str, list[MethodInfo]]], + pydantic_schemas: list[ModelSchema], +) -> dict[str, Any]: + """Convert interface classes to a JSON-serializable dict.""" + methods_dict: dict[str, list[dict]] = {} + for class_name, methods in classes_with_methods: + methods_dict[class_name] = format_method_info_json(methods) + + schemas_list: list[dict] = [format_model_schema_json(s) for s in pydantic_schemas] + + return {"methods": methods_dict, "schemas": schemas_list} + + +# --------------------------------------------------------------------------- +# Imports formatters +# --------------------------------------------------------------------------- + + +def format_imports_text(categories: dict[str, list[dict[str, str]]]) -> str: + """Format categorized import names as readable text with import statements.""" + lines: list[str] = [] + lines.append("Data Designer Import Reference") + lines.append("=" * 30) + lines.append("") + + for category, entries in 
sorted(categories.items()): + lines.append(f"{category} ({len(entries)} names):") + by_module: dict[str, list[str]] = {} + for entry in entries: + by_module.setdefault(entry["module"], []).append(entry["name"]) + + for module, names in sorted(by_module.items()): + sorted_names = sorted(names) + if len(sorted_names) <= 3: + names_str = ", ".join(sorted_names) + lines.append(f" from {module} import {names_str}") + else: + lines.append(f" from {module} import (") + for name in sorted_names: + lines.append(f" {name},") + lines.append(" )") + lines.append("") + + return "\n".join(lines).rstrip() + + +def format_imports_json(categories: dict[str, list[dict[str, str]]]) -> dict[str, Any]: + """Return the categories dict as-is for JSON output.""" + return categories diff --git a/packages/data-designer/tests/cli/services/introspection/test_formatters.py b/packages/data-designer/tests/cli/services/introspection/test_formatters.py index e9d99d63f..291980ba9 100644 --- a/packages/data-designer/tests/cli/services/introspection/test_formatters.py +++ b/packages/data-designer/tests/cli/services/introspection/test_formatters.py @@ -4,10 +4,16 @@ from __future__ import annotations from data_designer.cli.services.introspection.formatters import ( + format_imports_json, + format_imports_text, + format_interface_json, + format_interface_text, format_method_info_json, format_method_info_text, format_model_schema_json, format_model_schema_text, + format_namespace_json, + format_namespace_text, format_overview_text, format_type_list_text, ) @@ -65,8 +71,8 @@ def test_format_model_schema_text_basic() -> None: text = format_model_schema_text(schema) assert "TestModel:" in text assert "description: A test model." 
in text - assert "my_field:" in text - assert "type: str" in text + assert "my_field: str" in text + assert "[required]" in text def test_format_model_schema_text_with_type_key() -> None: @@ -93,11 +99,39 @@ def test_format_model_schema_text_with_enum_values() -> None: name="color", type_str="ColorEnum", description="Pick a color", - enum_values=["RED", "GREEN", "BLUE"], + enum_values=["red", "green", "blue"], ) schema = _make_schema(fields=[field]) text = format_model_schema_text(schema) - assert "values: [RED, GREEN, BLUE]" in text + assert "values: [red, green, blue]" in text + + +def test_format_model_schema_text_with_default() -> None: + field = FieldDetail( + name="count", + type_str="int", + description="A count", + required=False, + default="0", + ) + schema = _make_schema(fields=[field]) + text = format_model_schema_text(schema) + assert "count: int = 0" in text + assert "[required]" not in text + + +def test_format_model_schema_text_with_constraints() -> None: + field = FieldDetail( + name="score", + type_str="float", + description="A score", + required=False, + default="0.5", + constraints={"ge": 0.0, "le": 1.0}, + ) + schema = _make_schema(fields=[field]) + text = format_model_schema_text(schema) + assert "constraints: ge=0.0, le=1.0" in text # --------------------------------------------------------------------------- @@ -122,6 +156,47 @@ def test_format_model_schema_json_with_type_key() -> None: assert result["column_type"] == "sampler" +def test_format_model_schema_json_includes_required_and_default() -> None: + field = FieldDetail(name="val", type_str="int", description="Value", required=False, default="42") + schema = _make_schema(fields=[field]) + result = format_model_schema_json(schema) + f = result["fields"][0] + assert f["required"] is False + assert f["default"] == "42" + + +def test_format_model_schema_json_required_field_no_default() -> None: + field = FieldDetail(name="val", type_str="str", description="Value", required=True) + schema = 
_make_schema(fields=[field]) + result = format_model_schema_json(schema) + f = result["fields"][0] + assert f["required"] is True + assert "default" not in f + + +def test_format_model_schema_json_includes_constraints() -> None: + field = FieldDetail( + name="score", + type_str="float", + description="Score", + required=False, + default="0.5", + constraints={"ge": 0.0, "le": 1.0}, + ) + schema = _make_schema(fields=[field]) + result = format_model_schema_json(schema) + f = result["fields"][0] + assert f["constraints"] == {"ge": 0.0, "le": 1.0} + + +def test_format_model_schema_json_no_constraints_key_when_none() -> None: + field = FieldDetail(name="val", type_str="str", description="Value") + schema = _make_schema(fields=[field]) + result = format_model_schema_json(schema) + f = result["fields"][0] + assert "constraints" not in f + + def test_format_model_schema_json_with_nested() -> None: nested = _make_schema(class_name="Inner", description="Inner model.") outer_field = FieldDetail( @@ -265,4 +340,184 @@ def test_format_overview_text_contains_quick_start() -> None: type_counts = {"Column types": 1} text = format_overview_text(type_counts, []) assert "Quick Start Commands:" in text - assert "agent-context columns --list" in text + assert "introspect columns --list" in text + + +# --------------------------------------------------------------------------- +# Namespace / code-structure formatters +# --------------------------------------------------------------------------- + + +def _make_namespace_data() -> dict: + return { + "paths": ["/fake/site-packages/data_designer"], + "tree": { + "name": "data_designer", + "is_package": True, + "children": [ + { + "name": "config", + "is_package": True, + "children": [ + {"name": "column_configs", "is_package": False, "children": []}, + {"name": "models", "is_package": False, "children": []}, + ], + }, + { + "name": "errors", + "is_package": False, + "children": [], + }, + ], + }, + } + + +def 
test_format_namespace_text_contains_tree_characters() -> None: + text = format_namespace_text(_make_namespace_data()) + assert "├──" in text or "└──" in text + assert "│" in text + + +def test_format_namespace_text_shows_install_path() -> None: + text = format_namespace_text(_make_namespace_data()) + assert "Install path:" in text + assert "/fake/site-packages/data_designer" in text + + +def test_format_namespace_text_packages_have_trailing_slash() -> None: + text = format_namespace_text(_make_namespace_data()) + assert "config/" in text + + +def test_format_namespace_text_modules_have_py_extension() -> None: + text = format_namespace_text(_make_namespace_data()) + assert "errors.py" in text + assert "column_configs.py" in text + + +def test_format_namespace_text_contains_agent_guidance() -> None: + text = format_namespace_text(_make_namespace_data()) + assert "Only read source files directly" in text + + +def test_format_namespace_json_returns_passthrough() -> None: + data = _make_namespace_data() + result = format_namespace_json(data) + assert result is data + + +# --------------------------------------------------------------------------- +# Interface formatters +# --------------------------------------------------------------------------- + + +def _make_interface_data() -> tuple[list[tuple[str, list[MethodInfo]]], list[ModelSchema]]: + methods_data = [ + ("DataDesigner", [_make_method(name="create", signature="create(...) 
-> DatasetCreationResults")]), + ("DatasetCreationResults", [_make_method(name="load_dataset", signature="load_dataset() -> pd.DataFrame")]), + ] + schemas = [_make_schema(class_name="RunConfig", description="Runtime configuration.")] + return methods_data, schemas + + +def test_format_interface_text_contains_class_names() -> None: + methods_data, schemas = _make_interface_data() + text = format_interface_text(methods_data, schemas) + assert "DataDesigner" in text + assert "DatasetCreationResults" in text + assert "RunConfig" in text + + +def test_format_interface_text_contains_methods() -> None: + methods_data, schemas = _make_interface_data() + text = format_interface_text(methods_data, schemas) + assert "create" in text + assert "load_dataset" in text + + +def test_format_interface_text_contains_run_config_fields() -> None: + methods_data, schemas = _make_interface_data() + text = format_interface_text(methods_data, schemas) + assert "my_field" in text + + +def test_format_interface_json_structure() -> None: + methods_data, schemas = _make_interface_data() + result = format_interface_json(methods_data, schemas) + assert isinstance(result, dict) + assert "methods" in result + assert "schemas" in result + assert "DataDesigner" in result["methods"] + assert "DatasetCreationResults" in result["methods"] + assert isinstance(result["schemas"], list) + assert len(result["schemas"]) == 1 + + +# --------------------------------------------------------------------------- +# Imports formatters +# --------------------------------------------------------------------------- + + +def _make_imports_data() -> dict[str, list[dict[str, str]]]: + return { + "Column Configs": [ + {"name": "LLMTextColumnConfig", "module": "data_designer.config"}, + {"name": "SamplerColumnConfig", "module": "data_designer.config"}, + ], + "Interface": [ + {"name": "DataDesigner", "module": "data_designer.interface"}, + ], + } + + +def test_format_imports_text_contains_from_import() -> None: + text = 
format_imports_text(_make_imports_data()) + assert "from data_designer.config import" in text + + +def test_format_imports_text_has_category_headers() -> None: + text = format_imports_text(_make_imports_data()) + assert "Column Configs (2 names):" in text + assert "Interface (1 names):" in text + + +def test_format_imports_json_structure() -> None: + data = _make_imports_data() + result = format_imports_json(data) + assert isinstance(result, dict) + assert "Column Configs" in result + assert "Interface" in result + + +# --------------------------------------------------------------------------- +# Schema deduplication +# --------------------------------------------------------------------------- + + +def test_format_field_text_deduplicates_nested_schemas() -> None: + """When seen_schemas is passed, second occurrence of a nested schema shows a back-reference.""" + nested = _make_schema(class_name="SharedNested", description="Shared nested model.") + field1 = FieldDetail(name="field_a", type_str="SharedNested", description="First ref", nested_schema=nested) + field2 = FieldDetail(name="field_b", type_str="SharedNested", description="Second ref", nested_schema=nested) + + schema1 = _make_schema(class_name="Model1", fields=[field1]) + schema2 = _make_schema(class_name="Model2", fields=[field2]) + + seen: set[str] = set() + text1 = format_model_schema_text(schema1, seen_schemas=seen) + text2 = format_model_schema_text(schema2, seen_schemas=seen) + + assert "schema (SharedNested):" in text1 + assert "see SharedNested above" in text2 + assert "schema (SharedNested):" not in text2 + + +def test_format_field_text_no_dedup_without_seen_set() -> None: + """Without seen_schemas, nested schemas always expand fully.""" + nested = _make_schema(class_name="Inner", description="Inner model.") + field1 = FieldDetail(name="x", type_str="Inner", description="Ref", nested_schema=nested) + schema = _make_schema(fields=[field1]) + + text = format_model_schema_text(schema) + assert 
"schema (Inner):" in text From 55bb9147015ce53e28f20ab3dec1723d27fa09a8 Mon Sep 17 00:00:00 2001 From: Johnny Greco Date: Sun, 15 Feb 2026 21:34:26 -0500 Subject: [PATCH 06/37] feat: add discovery for namespace tree, interface classes, and imports Add discover_namespace_tree, discover_interface_classes, and discover_importable_names functions. Move config imports to module level. --- .../cli/services/introspection/discovery.py | 131 ++++++++++++++++-- .../services/introspection/test_discovery.py | 100 +++++++++++++ .../introspection/test_field_descriptions.py | 2 + 3 files changed, 224 insertions(+), 9 deletions(-) diff --git a/packages/data-designer/src/data_designer/cli/services/introspection/discovery.py b/packages/data-designer/src/data_designer/cli/services/introspection/discovery.py index 5f6e46d98..0ffc9aac6 100644 --- a/packages/data-designer/src/data_designer/cli/services/introspection/discovery.py +++ b/packages/data-designer/src/data_designer/cli/services/introspection/discovery.py @@ -3,9 +3,60 @@ from __future__ import annotations +import importlib import inspect +import pkgutil from enum import Enum -from typing import Literal, get_args, get_origin +from typing import Any, Literal, get_args, get_origin + +import data_designer +import data_designer.config as dd +import data_designer.interface as interface_mod +from data_designer.config.preview_results import PreviewResults +from data_designer.config.run_config import RunConfig +from data_designer.interface.data_designer import DataDesigner +from data_designer.interface.results import DatasetCreationResults + + +def _walk_namespace(package_path: list[str], prefix: str, max_depth: int, current_depth: int) -> list[dict[str, Any]]: + """Recursively walk a namespace package and build a tree of children nodes.""" + if current_depth >= max_depth: + return [] + + children: list[dict[str, Any]] = [] + for importer, name, is_pkg in pkgutil.iter_modules(package_path): + node: dict[str, Any] = { + "name": name, + 
"is_package": is_pkg, + "children": [], + } + if is_pkg: + full_name = f"{prefix}.{name}" + try: + sub_mod = importlib.import_module(full_name) + sub_path = getattr(sub_mod, "__path__", []) + node["children"] = _walk_namespace(list(sub_path), full_name, max_depth, current_depth + 1) + except Exception: + pass + children.append(node) + + children.sort(key=lambda n: (not n["is_package"], n["name"])) + return children + + +def discover_namespace_tree(max_depth: int = 2) -> dict[str, Any]: + """Walk the data_designer namespace and return install paths plus a module tree. + + Returns: + Dict with ``paths`` (list of install directories) and ``tree`` (nested node dict). + """ + paths = list(data_designer.__path__) + tree: dict[str, Any] = { + "name": "data_designer", + "is_package": True, + "children": _walk_namespace(paths, "data_designer", max_depth, 0), + } + return {"paths": paths, "tree": tree} def discover_column_configs() -> dict[str, type]: @@ -14,7 +65,6 @@ def discover_column_configs() -> dict[str, type]: Returns: Dict mapping column_type literal values (e.g., 'llm-text') to their config classes. """ - import data_designer.config as dd column_configs: dict[str, type] = {} for name in dir(dd): @@ -36,7 +86,6 @@ def discover_sampler_types() -> dict[str, type]: Returns: Dict mapping sampler type names (e.g., 'category') to their params classes. """ - import data_designer.config as dd sampler_type_enum = getattr(dd, "SamplerType", None) if sampler_type_enum is None or not issubclass(sampler_type_enum, Enum): @@ -67,7 +116,6 @@ def discover_validator_types() -> dict[str, type]: Returns: Dict mapping validator type names to their params classes. """ - import data_designer.config as dd validator_type_enum = getattr(dd, "ValidatorType", None) if validator_type_enum is None or not issubclass(validator_type_enum, Enum): @@ -98,7 +146,6 @@ def discover_processor_configs() -> dict[str, type]: Returns: Dict mapping processor_type values to their config classes. 
""" - import data_designer.config as dd processor_configs: dict[str, type] = {} for name in dir(dd): @@ -121,7 +168,6 @@ def discover_model_configs() -> dict[str, type]: Returns: Dict mapping class names to their types. """ - import data_designer.config as dd return { "ModelConfig": dd.ModelConfig, @@ -140,7 +186,6 @@ def discover_constraint_types() -> dict[str, type]: Returns: Dict mapping class names to their types. """ - import data_designer.config as dd return { "ScalarInequalityConstraint": dd.ScalarInequalityConstraint, @@ -155,7 +200,6 @@ def discover_seed_types() -> dict[str, type]: Returns: Dict mapping class names to their types. """ - import data_designer.config as dd return { "SeedConfig": dd.SeedConfig, @@ -174,10 +218,79 @@ def discover_mcp_types() -> dict[str, type]: Returns: Dict mapping class names to their types. """ - import data_designer.config as dd return { "MCPProvider": dd.MCPProvider, "LocalStdioMCPProvider": dd.LocalStdioMCPProvider, "ToolConfig": dd.ToolConfig, } + + +def discover_interface_classes() -> dict[str, type]: + """Return the key interface-layer classes an agent uses after building a config. + + Returns: + Dict mapping class names to their types for DataDesigner, DatasetCreationResults, + PreviewResults, and RunConfig. 
+ """ + return { + "DataDesigner": DataDesigner, + "DatasetCreationResults": DatasetCreationResults, + "PreviewResults": PreviewResults, + "RunConfig": RunConfig, + } + + +_MODULE_CATEGORIES: dict[str, str] = { + "column_configs": "Column Configs", + "column_types": "Column Types", + "config_builder": "Builder", + "custom_column": "Custom Columns", + "data_designer_config": "Core Config", + "mcp": "MCP", + "models": "Model Configs", + "processors": "Processors", + "run_config": "Runtime Config", + "sampler_constraints": "Constraints", + "sampler_params": "Sampler Params", + "seed": "Seed Config", + "seed_source": "Seed Sources", + "validator_params": "Validator Params", + "analysis.column_profilers": "Analysis", + "utils": "Utilities", + "version": "Utilities", +} + + +def _categorize_module(module_path: str) -> str: + """Map a module path from _LAZY_IMPORTS to a human-readable category name.""" + prefix = "data_designer.config." + suffix = module_path.removeprefix(prefix) if module_path.startswith(prefix) else module_path + + for key, category in _MODULE_CATEGORIES.items(): + if suffix == key or suffix.startswith(key + "."): + return category + return "Other" + + +def discover_importable_names() -> dict[str, list[dict[str, str]]]: + """Discover all importable names from data_designer.config and data_designer.interface. + + Reads _LAZY_IMPORTS from the config module and __all__ from the interface module, + grouping names by source-module category. + + Returns: + Dict mapping category names to lists of ``{"name": str, "module": str}`` entries. 
+ """ + lazy_imports: dict[str, tuple[str, str]] = getattr(dd, "_LAZY_IMPORTS", {}) + + categories: dict[str, list[dict[str, str]]] = {} + for name, (module_path, _attr) in sorted(lazy_imports.items()): + category = _categorize_module(module_path) + categories.setdefault(category, []).append({"name": name, "module": "data_designer.config"}) + + interface_all: list[str] = getattr(interface_mod, "__all__", []) + if interface_all: + categories["Interface"] = [{"name": n, "module": "data_designer.interface"} for n in sorted(interface_all)] + + return categories diff --git a/packages/data-designer/tests/cli/services/introspection/test_discovery.py b/packages/data-designer/tests/cli/services/introspection/test_discovery.py index d1a7c8d54..0d8f72e3f 100644 --- a/packages/data-designer/tests/cli/services/introspection/test_discovery.py +++ b/packages/data-designer/tests/cli/services/introspection/test_discovery.py @@ -6,8 +6,11 @@ from data_designer.cli.services.introspection.discovery import ( discover_column_configs, discover_constraint_types, + discover_importable_names, + discover_interface_classes, discover_mcp_types, discover_model_configs, + discover_namespace_tree, discover_processor_configs, discover_sampler_types, discover_seed_types, @@ -153,3 +156,100 @@ def test_discover_mcp_types_contains_expected_keys() -> None: result = discover_mcp_types() for expected_key in ("MCPProvider", "ToolConfig"): assert expected_key in result, f"Expected key '{expected_key}' not found in {list(result.keys())}" + + +# --------------------------------------------------------------------------- +# discover_namespace_tree +# --------------------------------------------------------------------------- + + +def test_discover_namespace_tree_returns_paths_and_tree() -> None: + result = discover_namespace_tree() + assert "paths" in result + assert "tree" in result + + +def test_discover_namespace_tree_paths_non_empty() -> None: + result = discover_namespace_tree() + assert 
isinstance(result["paths"], list) + assert len(result["paths"]) > 0 + for p in result["paths"]: + assert isinstance(p, str) + + +def test_discover_namespace_tree_root_is_data_designer() -> None: + result = discover_namespace_tree() + tree = result["tree"] + assert tree["name"] == "data_designer" + assert tree["is_package"] is True + + +def test_discover_namespace_tree_contains_expected_children() -> None: + result = discover_namespace_tree() + child_names = [c["name"] for c in result["tree"]["children"]] + for expected in ("config", "engine", "cli"): + assert expected in child_names, f"Expected '{expected}' in {child_names}" + + +def test_discover_namespace_tree_children_have_correct_structure() -> None: + result = discover_namespace_tree() + for child in result["tree"]["children"]: + assert "name" in child + assert "is_package" in child + assert "children" in child + assert isinstance(child["name"], str) + assert isinstance(child["is_package"], bool) + assert isinstance(child["children"], list) + + +# --------------------------------------------------------------------------- +# discover_interface_classes +# --------------------------------------------------------------------------- + + +def test_discover_interface_classes_returns_dict() -> None: + result = discover_interface_classes() + assert isinstance(result, dict) + assert len(result) > 0 + + +def test_discover_interface_classes_contains_expected_keys() -> None: + result = discover_interface_classes() + for expected_key in ("DataDesigner", "DatasetCreationResults", "PreviewResults", "RunConfig"): + assert expected_key in result, f"Expected key '{expected_key}' not found in {list(result.keys())}" + + +def test_discover_interface_classes_values_are_classes() -> None: + result = discover_interface_classes() + for cls in result.values(): + assert isinstance(cls, type) + + +# --------------------------------------------------------------------------- +# discover_importable_names +# 
--------------------------------------------------------------------------- + + +def test_discover_importable_names_returns_dict() -> None: + result = discover_importable_names() + assert isinstance(result, dict) + assert len(result) > 0 + + +def test_discover_importable_names_has_column_configs_category() -> None: + result = discover_importable_names() + assert "Column Configs" in result, f"Expected 'Column Configs' in {list(result.keys())}" + + +def test_discover_importable_names_has_interface_category() -> None: + result = discover_importable_names() + assert "Interface" in result, f"Expected 'Interface' in {list(result.keys())}" + + +def test_discover_importable_names_entries_have_name_and_module() -> None: + result = discover_importable_names() + for category, entries in result.items(): + assert isinstance(entries, list), f"Category '{category}' value is not a list" + for entry in entries: + assert "name" in entry, f"Entry in '{category}' missing 'name'" + assert "module" in entry, f"Entry in '{category}' missing 'module'" diff --git a/packages/data-designer/tests/cli/services/introspection/test_field_descriptions.py b/packages/data-designer/tests/cli/services/introspection/test_field_descriptions.py index dd684f0eb..d8f8accba 100644 --- a/packages/data-designer/tests/cli/services/introspection/test_field_descriptions.py +++ b/packages/data-designer/tests/cli/services/introspection/test_field_descriptions.py @@ -8,6 +8,7 @@ from data_designer.cli.services.introspection.discovery import ( discover_column_configs, discover_constraint_types, + discover_interface_classes, discover_mcp_types, discover_model_configs, discover_processor_configs, @@ -34,6 +35,7 @@ def _collect_models_with_fields() -> list[tuple[str, str, type]]: ("constraint_types", discover_constraint_types()), ("seed_types", discover_seed_types()), ("mcp_types", discover_mcp_types()), + ("interface_classes", discover_interface_classes()), ] for source_label, discovered in discovery_sources: From 
de03cdbe822370bf67430a0bc8318441093e84f7 Mon Sep 17 00:00:00 2001 From: Johnny Greco Date: Sun, 15 Feb 2026 21:34:41 -0500 Subject: [PATCH 07/37] refactor: rename agent-context CLI to introspect and add new subcommands Rename CLI command from agent-context to introspect, add OutputFormat enum for validated --format options, and add interface, imports, and code-structure subcommands with fuzzy category matching. --- .../cli/commands/agent_context.py | 88 ++++++- .../controllers/agent_context_controller.py | 120 ++++++++-- .../src/data_designer/cli/main.py | 6 +- .../cli/services/introspection/__init__.py | 22 ++ .../commands/test_agent_context_command.py | 170 ++++++++++++- .../test_agent_context_controller.py | 226 ++++++++++++++++++ 6 files changed, 587 insertions(+), 45 deletions(-) diff --git a/packages/data-designer/src/data_designer/cli/commands/agent_context.py b/packages/data-designer/src/data_designer/cli/commands/agent_context.py index 8a2a4e7e1..c1b77aee0 100644 --- a/packages/data-designer/src/data_designer/cli/commands/agent_context.py +++ b/packages/data-designer/src/data_designer/cli/commands/agent_context.py @@ -3,19 +3,29 @@ from __future__ import annotations +from enum import Enum + import typer from data_designer.cli.controllers.agent_context_controller import AgentContextController + +class OutputFormat(str, Enum): + """Supported output formats for introspect commands.""" + + TEXT = "text" + JSON = "json" + + agent_context_app = typer.Typer( - name="agent-context", + name="introspect", help="Introspect Data Designer's API for agent consumption.", no_args_is_help=True, ) -def _make_controller(output_format: str) -> AgentContextController: - return AgentContextController(output_format=output_format) +def _make_controller(output_format: OutputFormat) -> AgentContextController: + return AgentContextController(output_format=output_format.value) @agent_context_app.command(name="columns") @@ -24,7 +34,9 @@ def columns_command( None, help="Column type to 
display (e.g., 'llm-text'), or 'all' for everything." ), list_mode: bool = typer.Option(False, "--list", "-l", help="Show summary table of available types."), - output_format: str = typer.Option("text", "--format", "-f", help="Output format: 'text' (default) or 'json'."), + output_format: OutputFormat = typer.Option( + OutputFormat.TEXT, "--format", "-f", help="Output format: 'text' or 'json'." + ), ) -> None: """Show column configuration types and their fields.""" _make_controller(output_format).show_columns(type_name, list_mode) @@ -36,7 +48,9 @@ def samplers_command( None, help="Sampler type to display (e.g., 'category'), or 'all' for everything." ), list_mode: bool = typer.Option(False, "--list", "-l", help="Show summary table of available types."), - output_format: str = typer.Option("text", "--format", "-f", help="Output format: 'text' (default) or 'json'."), + output_format: OutputFormat = typer.Option( + OutputFormat.TEXT, "--format", "-f", help="Output format: 'text' or 'json'." + ), ) -> None: """Show sampler types and their parameter fields.""" _make_controller(output_format).show_samplers(type_name, list_mode) @@ -48,7 +62,9 @@ def validators_command( None, help="Validator type to display (e.g., 'code'), or 'all' for everything." ), list_mode: bool = typer.Option(False, "--list", "-l", help="Show summary table of available types."), - output_format: str = typer.Option("text", "--format", "-f", help="Output format: 'text' (default) or 'json'."), + output_format: OutputFormat = typer.Option( + OutputFormat.TEXT, "--format", "-f", help="Output format: 'text' or 'json'." + ), ) -> None: """Show validator types and their parameter fields.""" _make_controller(output_format).show_validators(type_name, list_mode) @@ -60,7 +76,9 @@ def processors_command( None, help="Processor type to display (e.g., 'drop_columns'), or 'all' for everything." 
), list_mode: bool = typer.Option(False, "--list", "-l", help="Show summary table of available types."), - output_format: str = typer.Option("text", "--format", "-f", help="Output format: 'text' (default) or 'json'."), + output_format: OutputFormat = typer.Option( + OutputFormat.TEXT, "--format", "-f", help="Output format: 'text' or 'json'." + ), ) -> None: """Show processor types and their configuration fields.""" _make_controller(output_format).show_processors(type_name, list_mode) @@ -68,7 +86,9 @@ def processors_command( @agent_context_app.command(name="models") def models_command( - output_format: str = typer.Option("text", "--format", "-f", help="Output format: 'text' (default) or 'json'."), + output_format: OutputFormat = typer.Option( + OutputFormat.TEXT, "--format", "-f", help="Output format: 'text' or 'json'." + ), ) -> None: """Show model configuration types (ModelConfig, inference params, distributions).""" _make_controller(output_format).show_models() @@ -76,7 +96,9 @@ def models_command( @agent_context_app.command(name="builder") def builder_command( - output_format: str = typer.Option("text", "--format", "-f", help="Output format: 'text' (default) or 'json'."), + output_format: OutputFormat = typer.Option( + OutputFormat.TEXT, "--format", "-f", help="Output format: 'text' or 'json'." + ), ) -> None: """Show DataDesignerConfigBuilder method signatures and documentation.""" _make_controller(output_format).show_builder() @@ -84,7 +106,9 @@ def builder_command( @agent_context_app.command(name="constraints") def constraints_command( - output_format: str = typer.Option("text", "--format", "-f", help="Output format: 'text' (default) or 'json'."), + output_format: OutputFormat = typer.Option( + OutputFormat.TEXT, "--format", "-f", help="Output format: 'text' or 'json'." 
+ ), ) -> None: """Show constraint types (ScalarInequality, ColumnInequality, operators).""" _make_controller(output_format).show_constraints() @@ -92,7 +116,9 @@ def constraints_command( @agent_context_app.command(name="seeds") def seeds_command( - output_format: str = typer.Option("text", "--format", "-f", help="Output format: 'text' (default) or 'json'."), + output_format: OutputFormat = typer.Option( + OutputFormat.TEXT, "--format", "-f", help="Output format: 'text' or 'json'." + ), ) -> None: """Show seed dataset types (SeedConfig, sources, sampling strategies).""" _make_controller(output_format).show_seeds() @@ -100,15 +126,51 @@ def seeds_command( @agent_context_app.command(name="mcp") def mcp_command( - output_format: str = typer.Option("text", "--format", "-f", help="Output format: 'text' (default) or 'json'."), + output_format: OutputFormat = typer.Option( + OutputFormat.TEXT, "--format", "-f", help="Output format: 'text' or 'json'." + ), ) -> None: """Show MCP provider types (MCPProvider, LocalStdioMCPProvider, ToolConfig).""" _make_controller(output_format).show_mcp() +@agent_context_app.command(name="interface") +def interface_command( + output_format: OutputFormat = typer.Option( + OutputFormat.TEXT, "--format", "-f", help="Output format: 'text' or 'json'." + ), +) -> None: + """Show DataDesigner class methods, result types, and RunConfig fields.""" + _make_controller(output_format).show_interface() + + +@agent_context_app.command(name="imports") +def imports_command( + category: str | None = typer.Argument(None, help="Filter by category (e.g., 'columns'), or omit for all."), + output_format: OutputFormat = typer.Option( + OutputFormat.TEXT, "--format", "-f", help="Output format: 'text' or 'json'." 
+ ), +) -> None: + """Show categorized import reference for data_designer.config and data_designer.interface.""" + _make_controller(output_format).show_imports(category) + + +@agent_context_app.command(name="code-structure") +def code_structure_command( + depth: int = typer.Option(2, "--depth", "-d", help="Max tree depth (default: 2)."), + output_format: OutputFormat = typer.Option( + OutputFormat.TEXT, "--format", "-f", help="Output format: 'text' or 'json'." + ), +) -> None: + """Show the data_designer package structure and install paths.""" + _make_controller(output_format).show_code_structure(depth=depth) + + @agent_context_app.command(name="overview") def overview_command( - output_format: str = typer.Option("text", "--format", "-f", help="Output format: 'text' (default) or 'json'."), + output_format: OutputFormat = typer.Option( + OutputFormat.TEXT, "--format", "-f", help="Output format: 'text' or 'json'." + ), ) -> None: """Show compact API cheatsheet with type counts, builder summary, and quick start commands.""" _make_controller(output_format).show_overview() diff --git a/packages/data-designer/src/data_designer/cli/controllers/agent_context_controller.py b/packages/data-designer/src/data_designer/cli/controllers/agent_context_controller.py index f49cc1918..9406b9ff6 100644 --- a/packages/data-designer/src/data_designer/cli/controllers/agent_context_controller.py +++ b/packages/data-designer/src/data_designer/cli/controllers/agent_context_controller.py @@ -10,30 +10,40 @@ from data_designer.cli.services.introspection.discovery import ( discover_column_configs, discover_constraint_types, + discover_importable_names, + discover_interface_classes, discover_mcp_types, discover_model_configs, + discover_namespace_tree, discover_processor_configs, discover_sampler_types, discover_seed_types, discover_validator_types, ) from data_designer.cli.services.introspection.formatters import ( + format_imports_json, + format_imports_text, + format_interface_json, + 
format_interface_text, format_method_info_json, format_method_info_text, format_model_schema_json, format_model_schema_text, + format_namespace_json, + format_namespace_text, format_overview_text, format_type_list_text, ) from data_designer.cli.services.introspection.method_inspector import inspect_class_methods from data_designer.cli.services.introspection.pydantic_inspector import build_model_schema +from data_designer.config.config_builder import DataDesignerConfigBuilder class AgentContextController: - """Controller for agent-context CLI commands. + """Controller for introspect CLI commands. Orchestrates discovery, inspection, formatting, and output for all - agent-context subcommands. + introspect subcommands. """ def __init__(self, output_format: str = "text") -> None: @@ -103,8 +113,6 @@ def show_models(self) -> None: def show_builder(self) -> None: """Show DataDesignerConfigBuilder method signatures and docs.""" - from data_designer.config.config_builder import DataDesignerConfigBuilder - methods = inspect_class_methods(DataDesignerConfigBuilder) if self._format == "json": typer.echo(json.dumps(format_method_info_json(methods), indent=2)) @@ -126,10 +134,58 @@ def show_mcp(self) -> None: items = discover_mcp_types() self._show_all_schemas(items, "Data Designer MCP Types Reference") + def show_interface(self) -> None: + """Show DataDesigner, result types, and RunConfig.""" + classes = discover_interface_classes() + + method_class_names = ["DataDesigner", "DatasetCreationResults", "PreviewResults"] + pydantic_class_names = ["RunConfig"] + + classes_with_methods: list[tuple[str, list]] = [] + for name in method_class_names: + cls = classes[name] + methods = inspect_class_methods(cls) + classes_with_methods.append((name, methods)) + + pydantic_schemas = [] + for name in pydantic_class_names: + cls = classes[name] + schema = build_model_schema(cls) + pydantic_schemas.append(schema) + + if self._format == "json": + 
typer.echo(json.dumps(format_interface_json(classes_with_methods, pydantic_schemas), indent=2)) + else: + typer.echo(format_interface_text(classes_with_methods, pydantic_schemas)) + + def show_imports(self, category: str | None = None) -> None: + """Show categorized import reference for data_designer.config and data_designer.interface.""" + categories = discover_importable_names() + + if category is not None: + matched_key = self._match_category(category, list(categories.keys())) + if matched_key is None: + available = ", ".join(sorted(categories.keys())) + typer.echo(f"Error: No category matching '{category}'.", err=True) + typer.echo(f"Available categories: {available}", err=True) + raise typer.Exit(code=1) + categories = {matched_key: categories[matched_key]} + + if self._format == "json": + typer.echo(json.dumps(format_imports_json(categories), indent=2)) + else: + typer.echo(format_imports_text(categories)) + + def show_code_structure(self, depth: int = 2) -> None: + """Show the data_designer package structure and install paths.""" + data = discover_namespace_tree(max_depth=depth) + if self._format == "json": + typer.echo(json.dumps(format_namespace_json(data), indent=2)) + else: + typer.echo(format_namespace_text(data)) + def show_overview(self) -> None: """Show compact API overview cheatsheet.""" - from data_designer.config.config_builder import DataDesignerConfigBuilder - type_counts = { "Column types": len(discover_column_configs()), "Sampler types": len(discover_sampler_types()), @@ -152,6 +208,43 @@ def show_overview(self) -> None: else: typer.echo(format_overview_text(type_counts, builder_methods)) + @staticmethod + def _match_category(query: str, keys: list[str]) -> str | None: + """Match a user query to a category key using progressive fuzzy matching. + + Tries: exact match, first-word stem match, any-word stem match, substring match. 
+ """ + normalized = query.lower().rstrip("s") + + # Exact match (case-insensitive) + for key in keys: + if key.lower() == query.lower(): + return key + + # First-word stem match + for key in keys: + first_word = key.lower().split()[0].rstrip("s") + if first_word == normalized: + return key + + # Any-word stem match + for key in keys: + words = key.lower().split() + for word in words: + if word.rstrip("s") == normalized: + return key + + # Substring match (earliest position wins) + best_key: str | None = None + best_pos = float("inf") + for key in keys: + pos = key.lower().find(query.lower()) + if pos != -1 and pos < best_pos: + best_pos = pos + best_key = key + + return best_key + def _show_typed_items( self, items: dict[str, type], @@ -165,14 +258,7 @@ def _show_typed_items( uppercase_value: bool = False, ) -> None: """Shared logic for type-based commands (columns, samplers, validators, processors).""" - if list_mode: - if self._format == "json": - typer.echo(json.dumps({k: v.__name__ for k, v in sorted(items.items())}, indent=2)) - else: - typer.echo(format_type_list_text(items, type_label, class_label)) - return - - if type_name is None: + if list_mode or type_name is None: if self._format == "json": typer.echo(json.dumps({k: v.__name__ for k, v in sorted(items.items())}, indent=2)) else: @@ -218,12 +304,13 @@ def _show_all_typed( all_schemas.append(format_model_schema_json(schema)) typer.echo(json.dumps(all_schemas, indent=2)) else: + seen_schemas: set[str] = set() lines = [f"# {header_title}", f"# {len(sorted_types)} types discovered from data_designer.config", ""] for type_value in sorted_types: cls = items[type_value] display_value = type_value.upper() if uppercase_value else type_value schema = build_model_schema(cls, type_key=type_key, type_value=display_value) - lines.append(format_model_schema_text(schema)) + lines.append(format_model_schema_text(schema, seen_schemas=seen_schemas)) lines.append("") typer.echo("\n".join(lines)) @@ -240,12 +327,13 @@ def 
_show_all_schemas(self, items: dict[str, type], header_title: str) -> None: all_schemas.append({"class_name": cls.__name__, "description": cls.__doc__ or ""}) typer.echo(json.dumps(all_schemas, indent=2)) else: + seen_schemas: set[str] = set() lines = [f"# {header_title}", f"# {len(items)} types", ""] for name in sorted(items.keys()): cls = items[name] if hasattr(cls, "model_fields"): schema = build_model_schema(cls) - lines.append(format_model_schema_text(schema)) + lines.append(format_model_schema_text(schema, seen_schemas=seen_schemas)) else: lines.append(f"{cls.__name__}:") if cls.__doc__: diff --git a/packages/data-designer/src/data_designer/cli/main.py b/packages/data-designer/src/data_designer/cli/main.py index 1afb9d640..df69cea70 100644 --- a/packages/data-designer/src/data_designer/cli/main.py +++ b/packages/data-designer/src/data_designer/cli/main.py @@ -100,11 +100,11 @@ ) # Add setup command groups -app.add_typer(config_app, name="config", rich_help_panel="Setup") -app.add_typer(download_app, name="download", rich_help_panel="Setup") +app.add_typer(config_app, name="config", rich_help_panel="Setup Commands") +app.add_typer(download_app, name="download", rich_help_panel="Setup Commands") # Add agent command groups -app.add_typer(agent_context.agent_context_app, name="agent-context", rich_help_panel="Agent") +app.add_typer(agent_context.agent_context_app, name="introspect", rich_help_panel="Agent Commands") def main() -> None: diff --git a/packages/data-designer/src/data_designer/cli/services/introspection/__init__.py b/packages/data-designer/src/data_designer/cli/services/introspection/__init__.py index fe6a08e50..f09c64909 100644 --- a/packages/data-designer/src/data_designer/cli/services/introspection/__init__.py +++ b/packages/data-designer/src/data_designer/cli/services/introspection/__init__.py @@ -6,25 +6,36 @@ from data_designer.cli.services.introspection.discovery import ( discover_column_configs, discover_constraint_types, + 
discover_importable_names, + discover_interface_classes, discover_mcp_types, discover_model_configs, + discover_namespace_tree, discover_processor_configs, discover_sampler_types, discover_seed_types, discover_validator_types, ) from data_designer.cli.services.introspection.formatters import ( + format_imports_json, + format_imports_text, + format_interface_json, + format_interface_text, format_method_info_json, format_method_info_text, format_model_schema_json, format_model_schema_text, + format_namespace_json, + format_namespace_text, format_overview_text, format_type_list_text, ) from data_designer.cli.services.introspection.method_inspector import ( MethodInfo, ParamInfo, + PropertyInfo, inspect_class_methods, + inspect_class_properties, ) from data_designer.cli.services.introspection.pydantic_inspector import ( FieldDetail, @@ -40,23 +51,34 @@ "MethodInfo", "ModelSchema", "ParamInfo", + "PropertyInfo", "build_model_schema", "discover_column_configs", "discover_constraint_types", + "discover_importable_names", + "discover_interface_classes", "discover_mcp_types", "discover_model_configs", + "discover_namespace_tree", "discover_processor_configs", "discover_sampler_types", "discover_seed_types", "discover_validator_types", + "format_imports_json", + "format_imports_text", + "format_interface_json", + "format_interface_text", "format_method_info_json", "format_method_info_text", "format_model_schema_json", "format_model_schema_text", + "format_namespace_json", + "format_namespace_text", "format_overview_text", "format_type_list_text", "format_type", "get_brief_description", "get_field_info", "inspect_class_methods", + "inspect_class_properties", ] diff --git a/packages/data-designer/tests/cli/commands/test_agent_context_command.py b/packages/data-designer/tests/cli/commands/test_agent_context_command.py index 162ec7c9b..4bd3960ca 100644 --- a/packages/data-designer/tests/cli/commands/test_agent_context_command.py +++ 
b/packages/data-designer/tests/cli/commands/test_agent_context_command.py @@ -18,7 +18,7 @@ def test_agent_context_help() -> None: - result = runner.invoke(app, ["agent-context", "--help"]) + result = runner.invoke(app, ["introspect", "--help"]) assert result.exit_code == 0 assert "columns" in result.output @@ -29,19 +29,19 @@ def test_agent_context_help() -> None: def test_columns_list() -> None: - result = runner.invoke(app, ["agent-context", "columns", "--list"]) + result = runner.invoke(app, ["introspect", "columns", "--list"]) assert result.exit_code == 0 assert "llm-text" in result.output def test_columns_specific_type() -> None: - result = runner.invoke(app, ["agent-context", "columns", "llm-text"]) + result = runner.invoke(app, ["introspect", "columns", "llm-text"]) assert result.exit_code == 0 assert "LLMTextColumnConfig" in result.output def test_columns_json_format() -> None: - result = runner.invoke(app, ["agent-context", "columns", "llm-text", "--format", "json"]) + result = runner.invoke(app, ["introspect", "columns", "llm-text", "--format", "json"]) assert result.exit_code == 0 data = json.loads(result.output) assert isinstance(data, dict) @@ -49,7 +49,7 @@ def test_columns_json_format() -> None: def test_columns_nonexistent_exits_with_error() -> None: - result = runner.invoke(app, ["agent-context", "columns", "nonexistent"]) + result = runner.invoke(app, ["introspect", "columns", "nonexistent"]) assert result.exit_code == 1 @@ -59,13 +59,13 @@ def test_columns_nonexistent_exits_with_error() -> None: def test_samplers_specific() -> None: - result = runner.invoke(app, ["agent-context", "samplers", "category"]) + result = runner.invoke(app, ["introspect", "samplers", "category"]) assert result.exit_code == 0 assert "CATEGORY" in result.output def test_samplers_list() -> None: - result = runner.invoke(app, ["agent-context", "samplers", "--list"]) + result = runner.invoke(app, ["introspect", "samplers", "--list"]) assert result.exit_code == 0 assert 
"category" in result.output @@ -76,7 +76,7 @@ def test_samplers_list() -> None: def test_overview() -> None: - result = runner.invoke(app, ["agent-context", "overview"]) + result = runner.invoke(app, ["introspect", "overview"]) assert result.exit_code == 0 assert "Type Counts" in result.output @@ -87,9 +87,20 @@ def test_overview() -> None: def test_builder() -> None: - result = runner.invoke(app, ["agent-context", "builder"]) + result = runner.invoke(app, ["introspect", "builder"]) assert result.exit_code == 0 assert "add_column" in result.output + assert "DataDesignerConfigBuilder" in result.output + assert "Parameters:" in result.output + + +def test_builder_json() -> None: + result = runner.invoke(app, ["introspect", "builder", "--format", "json"]) + assert result.exit_code == 0 + data = json.loads(result.output) + assert isinstance(data, list) + method_names = [m["name"] for m in data] + assert "add_column" in method_names # --------------------------------------------------------------------------- @@ -98,9 +109,19 @@ def test_builder() -> None: def test_models() -> None: - result = runner.invoke(app, ["agent-context", "models"]) + result = runner.invoke(app, ["introspect", "models"]) assert result.exit_code == 0 assert "ModelConfig" in result.output + assert "description:" in result.output + + +def test_models_json() -> None: + result = runner.invoke(app, ["introspect", "models", "--format", "json"]) + assert result.exit_code == 0 + data = json.loads(result.output) + assert isinstance(data, list) + class_names = [item.get("class_name", "") for item in data if isinstance(item, dict)] + assert "ModelConfig" in class_names # --------------------------------------------------------------------------- @@ -109,7 +130,7 @@ def test_models() -> None: def test_constraints() -> None: - result = runner.invoke(app, ["agent-context", "constraints"]) + result = runner.invoke(app, ["introspect", "constraints"]) assert result.exit_code == 0 output = result.output assert 
"ScalarInequalityConstraint" in output or "InequalityOperator" in output @@ -121,9 +142,19 @@ def test_constraints() -> None: def test_seeds() -> None: - result = runner.invoke(app, ["agent-context", "seeds"]) + result = runner.invoke(app, ["introspect", "seeds"]) assert result.exit_code == 0 assert "SeedConfig" in result.output + assert "SamplingStrategy" in result.output + + +def test_seeds_json() -> None: + result = runner.invoke(app, ["introspect", "seeds", "--format", "json"]) + assert result.exit_code == 0 + data = json.loads(result.output) + assert isinstance(data, list) + class_names = [item.get("class_name", "") for item in data if isinstance(item, dict)] + assert "SeedConfig" in class_names # --------------------------------------------------------------------------- @@ -132,6 +163,119 @@ def test_seeds() -> None: def test_mcp() -> None: - result = runner.invoke(app, ["agent-context", "mcp"]) + result = runner.invoke(app, ["introspect", "mcp"]) assert result.exit_code == 0 assert "ToolConfig" in result.output + assert "MCPProvider" in result.output or "LocalStdioMCPProvider" in result.output + + +def test_mcp_json() -> None: + result = runner.invoke(app, ["introspect", "mcp", "--format", "json"]) + assert result.exit_code == 0 + data = json.loads(result.output) + assert isinstance(data, list) + class_names = [item.get("class_name", "") for item in data if isinstance(item, dict)] + assert "ToolConfig" in class_names + + +# --------------------------------------------------------------------------- +# code-structure +# --------------------------------------------------------------------------- + + +def test_code_structure() -> None: + result = runner.invoke(app, ["introspect", "code-structure"]) + assert result.exit_code == 0 + assert "data_designer code structure" in result.output + assert "├──" in result.output + + +def test_code_structure_shows_subpackages() -> None: + result = runner.invoke(app, ["introspect", "code-structure"]) + assert 
result.exit_code == 0 + for pkg in ("config/", "engine/", "cli/"): + assert pkg in result.output, f"Expected '{pkg}' in output" + + +def test_code_structure_json_format() -> None: + result = runner.invoke(app, ["introspect", "code-structure", "--format", "json"]) + assert result.exit_code == 0 + data = json.loads(result.output) + assert "paths" in data + assert "tree" in data + + +def test_code_structure_shows_agent_guidance() -> None: + result = runner.invoke(app, ["introspect", "code-structure"]) + assert result.exit_code == 0 + assert "Only read source files directly" in result.output + + +# --------------------------------------------------------------------------- +# interface +# --------------------------------------------------------------------------- + + +def test_interface() -> None: + result = runner.invoke(app, ["introspect", "interface"]) + assert result.exit_code == 0 + assert "DataDesigner" in result.output + assert "create" in result.output + + +def test_interface_json() -> None: + result = runner.invoke(app, ["introspect", "interface", "--format", "json"]) + assert result.exit_code == 0 + data = json.loads(result.output) + assert "methods" in data + assert "schemas" in data + + +def test_interface_shows_result_types() -> None: + result = runner.invoke(app, ["introspect", "interface"]) + assert result.exit_code == 0 + assert "DatasetCreationResults" in result.output + + +# --------------------------------------------------------------------------- +# imports +# --------------------------------------------------------------------------- + + +def test_imports() -> None: + result = runner.invoke(app, ["introspect", "imports"]) + assert result.exit_code == 0 + assert "from data_designer.config import" in result.output + + +def test_imports_json() -> None: + result = runner.invoke(app, ["introspect", "imports", "--format", "json"]) + assert result.exit_code == 0 + data = json.loads(result.output) + assert isinstance(data, dict) + assert len(data) > 0 + + 
+# --------------------------------------------------------------------------- +# format validation +# --------------------------------------------------------------------------- + + +def test_invalid_format_rejected() -> None: + result = runner.invoke(app, ["introspect", "columns", "--list", "--format", "xml"]) + assert result.exit_code != 0 + + +def test_invalid_format_rejected_on_builder() -> None: + result = runner.invoke(app, ["introspect", "builder", "--format", "yaml"]) + assert result.exit_code != 0 + + +def test_valid_format_text() -> None: + result = runner.invoke(app, ["introspect", "columns", "--list", "--format", "text"]) + assert result.exit_code == 0 + + +def test_valid_format_json() -> None: + result = runner.invoke(app, ["introspect", "columns", "--list", "--format", "json"]) + assert result.exit_code == 0 diff --git a/packages/data-designer/tests/cli/controllers/test_agent_context_controller.py b/packages/data-designer/tests/cli/controllers/test_agent_context_controller.py index 54cb5511f..34b53b5a2 100644 --- a/packages/data-designer/tests/cli/controllers/test_agent_context_controller.py +++ b/packages/data-designer/tests/cli/controllers/test_agent_context_controller.py @@ -82,6 +82,10 @@ def test_show_overview_json(capsys: pytest.CaptureFixture[str]) -> None: data = json.loads(captured.out) assert "type_counts" in data assert "builder_methods" in data + assert isinstance(data["type_counts"], dict) + assert len(data["type_counts"]) > 0 + assert isinstance(data["builder_methods"], list) + assert len(data["builder_methods"]) > 0 # --------------------------------------------------------------------------- @@ -161,3 +165,225 @@ def test_show_mcp(capsys: pytest.CaptureFixture[str]) -> None: controller.show_mcp() captured = capsys.readouterr() assert "ToolConfig" in captured.out + + +# --------------------------------------------------------------------------- +# show_interface +# 
--------------------------------------------------------------------------- + + +def test_show_interface_text(capsys: pytest.CaptureFixture[str]) -> None: + controller = AgentContextController(output_format="text") + controller.show_interface() + captured = capsys.readouterr() + assert "DataDesigner" in captured.out + assert "DatasetCreationResults" in captured.out + assert "RunConfig" in captured.out + + +def test_show_interface_json(capsys: pytest.CaptureFixture[str]) -> None: + controller = AgentContextController(output_format="json") + controller.show_interface() + captured = capsys.readouterr() + data = json.loads(captured.out) + assert "methods" in data + assert "schemas" in data + assert "DataDesigner" in data["methods"] + assert isinstance(data["schemas"], list) + assert len(data["schemas"]) > 0 + + +# --------------------------------------------------------------------------- +# show_validators +# --------------------------------------------------------------------------- + + +def test_show_validators_list_text(capsys: pytest.CaptureFixture[str]) -> None: + controller = AgentContextController(output_format="text") + controller.show_validators(type_name=None, list_mode=True) + captured = capsys.readouterr() + assert "validator_type" in captured.out + assert "params_class" in captured.out + + +def test_show_validators_list_json(capsys: pytest.CaptureFixture[str]) -> None: + controller = AgentContextController(output_format="json") + controller.show_validators(type_name=None, list_mode=True) + captured = capsys.readouterr() + data = json.loads(captured.out) + assert isinstance(data, dict) + assert len(data) > 0 + + +def test_show_validators_specific_text(capsys: pytest.CaptureFixture[str]) -> None: + controller = AgentContextController(output_format="text") + controller.show_validators(type_name="code", list_mode=False) + captured = capsys.readouterr() + assert "CODE" in captured.out + + +def test_show_validators_specific_json(capsys: 
pytest.CaptureFixture[str]) -> None: + controller = AgentContextController(output_format="json") + controller.show_validators(type_name="code", list_mode=False) + captured = capsys.readouterr() + data = json.loads(captured.out) + assert isinstance(data, dict) + assert "fields" in data + + +# --------------------------------------------------------------------------- +# show_processors +# --------------------------------------------------------------------------- + + +def test_show_processors_list_text(capsys: pytest.CaptureFixture[str]) -> None: + controller = AgentContextController(output_format="text") + controller.show_processors(type_name=None, list_mode=True) + captured = capsys.readouterr() + assert "processor_type" in captured.out + assert "config_class" in captured.out + + +def test_show_processors_list_json(capsys: pytest.CaptureFixture[str]) -> None: + controller = AgentContextController(output_format="json") + controller.show_processors(type_name=None, list_mode=True) + captured = capsys.readouterr() + data = json.loads(captured.out) + assert isinstance(data, dict) + assert len(data) > 0 + + +# --------------------------------------------------------------------------- +# show_imports (with category filter) +# --------------------------------------------------------------------------- + + +def test_show_imports_text(capsys: pytest.CaptureFixture[str]) -> None: + controller = AgentContextController(output_format="text") + controller.show_imports() + captured = capsys.readouterr() + assert "from data_designer.config import" in captured.out + + +def test_show_imports_with_category_filter(capsys: pytest.CaptureFixture[str]) -> None: + controller = AgentContextController(output_format="text") + controller.show_imports(category="columns") + captured = capsys.readouterr() + assert "Column Configs" in captured.out + assert "from data_designer.config import" in captured.out + + +def test_show_imports_with_category_filter_json(capsys: pytest.CaptureFixture[str]) 
-> None: + controller = AgentContextController(output_format="json") + controller.show_imports(category="columns") + captured = capsys.readouterr() + data = json.loads(captured.out) + assert isinstance(data, dict) + assert "Column Configs" in data + + +def test_show_imports_with_invalid_category(capsys: pytest.CaptureFixture[str]) -> None: + controller = AgentContextController(output_format="text") + with pytest.raises(click.exceptions.Exit): + controller.show_imports(category="nonexistent_xyz") + + +# --------------------------------------------------------------------------- +# show_code_structure +# --------------------------------------------------------------------------- + + +def test_show_code_structure_text(capsys: pytest.CaptureFixture[str]) -> None: + controller = AgentContextController(output_format="text") + controller.show_code_structure() + captured = capsys.readouterr() + assert "data_designer code structure" in captured.out + assert "data_designer/" in captured.out + + +def test_show_code_structure_json(capsys: pytest.CaptureFixture[str]) -> None: + controller = AgentContextController(output_format="json") + controller.show_code_structure() + captured = capsys.readouterr() + data = json.loads(captured.out) + assert "paths" in data + assert "tree" in data + assert data["tree"]["name"] == "data_designer" + + +# --------------------------------------------------------------------------- +# _match_category +# --------------------------------------------------------------------------- + + +def test_match_category_exact_match() -> None: + keys = ["Column Configs", "Builder", "Model Configs"] + assert AgentContextController._match_category("Column Configs", keys) == "Column Configs" + + +def test_match_category_exact_match_case_insensitive() -> None: + keys = ["Column Configs", "Builder", "Model Configs"] + assert AgentContextController._match_category("column configs", keys) == "Column Configs" + assert AgentContextController._match_category("BUILDER", 
keys) == "Builder" + + +def test_match_category_first_word_stem_match() -> None: + keys = ["Column Configs", "Builder", "Model Configs"] + # "columns" -> rstrip("s") -> "column", matches first word "column" of "Column Configs" + assert AgentContextController._match_category("columns", keys) == "Column Configs" + + +def test_match_category_first_word_stem_match_singular() -> None: + keys = ["Column Configs", "Builder", "Model Configs"] + # "column" is already stemmed, matches first word "column" + assert AgentContextController._match_category("column", keys) == "Column Configs" + + +def test_match_category_any_word_stem_match() -> None: + keys = ["Column Configs", "Builder", "Model Configs"] + # "configs" -> rstrip("s") -> "config", matches second word of "Column Configs" + assert AgentContextController._match_category("configs", keys) == "Column Configs" + + +def test_match_category_substring_match() -> None: + keys = ["Column Configs", "Builder", "Model Configs"] + # "uild" is a substring of "Builder" + assert AgentContextController._match_category("uild", keys) == "Builder" + + +def test_match_category_substring_picks_earliest_position() -> None: + keys = ["ABC-foo", "foo-ABC"] + # "foo" appears at position 4 in "ABC-foo" and position 0 in "foo-ABC" + assert AgentContextController._match_category("foo", keys) == "foo-ABC" + + +def test_match_category_no_match() -> None: + keys = ["Column Configs", "Builder", "Model Configs"] + assert AgentContextController._match_category("zzzzz_nonexistent", keys) is None + + +def test_match_category_empty_string() -> None: + keys = ["Column Configs", "Builder", "Model Configs"] + # Empty string is a substring of everything; earliest position (0) wins + result = AgentContextController._match_category("", keys) + assert result is not None + + +def test_match_category_process_rstrip_s_edge_case() -> None: + """Words ending in 's' naturally (like 'process') still work after rstrip('s').""" + keys = ["Processors", "Builder"] + # 
"process" -> rstrip("s") -> "proces" + # First-word stem: "Processors" first word is "processor" -> rstrip("s") -> "processor" != "proces" + # Any-word stem: same + # Falls to substring: "process" is a substring of "Processors" at pos 0 + assert AgentContextController._match_category("process", keys) == "Processors" + + +def test_match_category_empty_keys_list() -> None: + assert AgentContextController._match_category("anything", []) is None + + +def test_match_category_model_stem() -> None: + keys = ["Column Configs", "Builder", "Model Configs"] + # "models" -> rstrip("s") -> "model", matches first word "model" of "Model Configs" + assert AgentContextController._match_category("models", keys) == "Model Configs" From f18506cb9104b8f945714dff0a3824bbda1ab22a Mon Sep 17 00:00:00 2001 From: Johnny Greco Date: Sun, 15 Feb 2026 21:34:49 -0500 Subject: [PATCH 08/37] test: add CLI usage scenario integration tests Add end-to-end tests for preview, validate, and introspect commands covering non-interactive preview, interactive navigation, error messages, and JSON contract validation. --- .../cli/commands/test_usage_scenarios.py | 135 ++++++++++++++++++ 1 file changed, 135 insertions(+) create mode 100644 packages/data-designer/tests/cli/commands/test_usage_scenarios.py diff --git a/packages/data-designer/tests/cli/commands/test_usage_scenarios.py b/packages/data-designer/tests/cli/commands/test_usage_scenarios.py new file mode 100644 index 000000000..4a7d2e694 --- /dev/null +++ b/packages/data-designer/tests/cli/commands/test_usage_scenarios.py @@ -0,0 +1,135 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +import json +import re +import types +from pathlib import Path +from unittest.mock import patch + +from typer.testing import CliRunner + +from data_designer.cli.main import app + +runner = CliRunner() + +ANSI_ESCAPE_RE = re.compile(r"\x1B\[[0-?]*[ -/]*[@-~]") + + +class _AlwaysTTY: + def isatty(self) -> bool: + return True + + +def _normalize_text(text: str) -> str: + without_ansi = ANSI_ESCAPE_RE.sub("", text) + return re.sub(r"\s+", " ", without_ansi).strip().lower() + + +def _write_usage_config(tmp_path: Path) -> Path: + config_path = tmp_path / "usage_config.py" + config_path.write_text( + """from __future__ import annotations + +import data_designer.config as dd + + +def load_config_builder() -> dd.DataDesignerConfigBuilder: + builder = dd.DataDesignerConfigBuilder() + builder.add_column( + dd.SamplerColumnConfig( + name="record_id", + sampler_type=dd.SamplerType.UUID, + params=dd.UUIDSamplerParams(), + ) + ) + builder.add_column( + dd.SamplerColumnConfig( + name="category", + sampler_type=dd.SamplerType.CATEGORY, + params=dd.CategorySamplerParams(values=["A", "B", "C"]), + ) + ) + builder.add_column( + dd.ExpressionColumnConfig( + name="summary", + expr="{{ category }}::{{ record_id }}", + ) + ) + return builder +""", + encoding="utf-8", + ) + return config_path + + +def test_usage_preview_non_interactive_shows_records(tmp_path: Path) -> None: + config_path = _write_usage_config(tmp_path) + result = runner.invoke( + app, + ["preview", str(config_path), "--num-records", "3", "--non-interactive"], + color=False, + ) + + normalized = _normalize_text(result.output) + assert result.exit_code == 0 + assert "record 1 of 3" in normalized + assert "record 3 of 3" in normalized + assert "preview complete" in normalized + + +def test_usage_interactive_preview_navigation(tmp_path: Path) -> None: + config_path = _write_usage_config(tmp_path) + fake_sys = 
types.SimpleNamespace(stdin=_AlwaysTTY(), stdout=_AlwaysTTY()) + + with ( + patch("data_designer.cli.controllers.generation_controller.sys", fake_sys), + patch( + "data_designer.cli.controllers.generation_controller.wait_for_navigation_key", + side_effect=["n", "p", "q"], + ), + ): + result = runner.invoke( + app, + ["preview", str(config_path), "--num-records", "3"], + color=False, + ) + + normalized = _normalize_text(result.output) + assert result.exit_code == 0 + assert "record 1 of 3" in normalized + assert "record 2 of 3" in normalized + assert "done browsing." in normalized + + +def test_usage_validate_unsupported_extension_is_actionable(tmp_path: Path) -> None: + bad_config = tmp_path / "config.txt" + bad_config.write_text("not supported", encoding="utf-8") + + result = runner.invoke(app, ["validate", str(bad_config)], color=False) + normalized = _normalize_text(result.output) + + assert result.exit_code == 1 + assert "unsupported file extension" in normalized + assert "supported extensions" in normalized + + +def test_usage_introspect_columns_json_contract() -> None: + result = runner.invoke(app, ["introspect", "columns", "llm-text", "--format", "json"], color=False) + assert result.exit_code == 0 + + payload = json.loads(result.output) + assert isinstance(payload, dict) + assert payload.get("class_name") == "LLMTextColumnConfig" + assert isinstance(payload.get("fields"), list) + + +def test_usage_introspect_unknown_type_error_is_actionable() -> None: + result = runner.invoke(app, ["introspect", "columns", "nonexistent"], color=False) + normalized = _normalize_text(result.output) + + assert result.exit_code == 1 + assert "error: unknown column_type" in normalized + assert "available types:" in normalized From fe2d87e5d2289f9d6c18610ea7000cc9fe32c88d Mon Sep 17 00:00:00 2001 From: Johnny Greco Date: Sun, 15 Feb 2026 22:47:53 -0500 Subject: [PATCH 09/37] refactor: replace introspect command with types and reference command groups Split the monolithic 
`introspect` CLI into two focused command groups: - `types`: explore configuration types (columns, samplers, validators, etc.) - `reference`: reference docs (overview, builder, interface, imports, code-structure) --- .../cli/commands/agent_context.py | 176 ------------------ .../data_designer/cli/commands/reference.py | 66 +++++++ .../src/data_designer/cli/commands/types.py | 125 +++++++++++++ .../data_designer/cli/controllers/__init__.py | 16 ++ ...troller.py => introspection_controller.py} | 27 +-- .../src/data_designer/cli/main.py | 5 +- ...mand.py => test_introspection_commands.py} | 79 ++++---- ...er.py => test_introspection_controller.py} | 124 ++++++------ 8 files changed, 333 insertions(+), 285 deletions(-) delete mode 100644 packages/data-designer/src/data_designer/cli/commands/agent_context.py create mode 100644 packages/data-designer/src/data_designer/cli/commands/reference.py create mode 100644 packages/data-designer/src/data_designer/cli/commands/types.py rename packages/data-designer/src/data_designer/cli/controllers/{agent_context_controller.py => introspection_controller.py} (95%) rename packages/data-designer/tests/cli/commands/{test_agent_context_command.py => test_introspection_commands.py} (72%) rename packages/data-designer/tests/cli/controllers/{test_agent_context_controller.py => test_introspection_controller.py} (73%) diff --git a/packages/data-designer/src/data_designer/cli/commands/agent_context.py b/packages/data-designer/src/data_designer/cli/commands/agent_context.py deleted file mode 100644 index c1b77aee0..000000000 --- a/packages/data-designer/src/data_designer/cli/commands/agent_context.py +++ /dev/null @@ -1,176 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
-# SPDX-License-Identifier: Apache-2.0 - -from __future__ import annotations - -from enum import Enum - -import typer - -from data_designer.cli.controllers.agent_context_controller import AgentContextController - - -class OutputFormat(str, Enum): - """Supported output formats for introspect commands.""" - - TEXT = "text" - JSON = "json" - - -agent_context_app = typer.Typer( - name="introspect", - help="Introspect Data Designer's API for agent consumption.", - no_args_is_help=True, -) - - -def _make_controller(output_format: OutputFormat) -> AgentContextController: - return AgentContextController(output_format=output_format.value) - - -@agent_context_app.command(name="columns") -def columns_command( - type_name: str | None = typer.Argument( - None, help="Column type to display (e.g., 'llm-text'), or 'all' for everything." - ), - list_mode: bool = typer.Option(False, "--list", "-l", help="Show summary table of available types."), - output_format: OutputFormat = typer.Option( - OutputFormat.TEXT, "--format", "-f", help="Output format: 'text' or 'json'." - ), -) -> None: - """Show column configuration types and their fields.""" - _make_controller(output_format).show_columns(type_name, list_mode) - - -@agent_context_app.command(name="samplers") -def samplers_command( - type_name: str | None = typer.Argument( - None, help="Sampler type to display (e.g., 'category'), or 'all' for everything." - ), - list_mode: bool = typer.Option(False, "--list", "-l", help="Show summary table of available types."), - output_format: OutputFormat = typer.Option( - OutputFormat.TEXT, "--format", "-f", help="Output format: 'text' or 'json'." - ), -) -> None: - """Show sampler types and their parameter fields.""" - _make_controller(output_format).show_samplers(type_name, list_mode) - - -@agent_context_app.command(name="validators") -def validators_command( - type_name: str | None = typer.Argument( - None, help="Validator type to display (e.g., 'code'), or 'all' for everything." 
- ), - list_mode: bool = typer.Option(False, "--list", "-l", help="Show summary table of available types."), - output_format: OutputFormat = typer.Option( - OutputFormat.TEXT, "--format", "-f", help="Output format: 'text' or 'json'." - ), -) -> None: - """Show validator types and their parameter fields.""" - _make_controller(output_format).show_validators(type_name, list_mode) - - -@agent_context_app.command(name="processors") -def processors_command( - type_name: str | None = typer.Argument( - None, help="Processor type to display (e.g., 'drop_columns'), or 'all' for everything." - ), - list_mode: bool = typer.Option(False, "--list", "-l", help="Show summary table of available types."), - output_format: OutputFormat = typer.Option( - OutputFormat.TEXT, "--format", "-f", help="Output format: 'text' or 'json'." - ), -) -> None: - """Show processor types and their configuration fields.""" - _make_controller(output_format).show_processors(type_name, list_mode) - - -@agent_context_app.command(name="models") -def models_command( - output_format: OutputFormat = typer.Option( - OutputFormat.TEXT, "--format", "-f", help="Output format: 'text' or 'json'." - ), -) -> None: - """Show model configuration types (ModelConfig, inference params, distributions).""" - _make_controller(output_format).show_models() - - -@agent_context_app.command(name="builder") -def builder_command( - output_format: OutputFormat = typer.Option( - OutputFormat.TEXT, "--format", "-f", help="Output format: 'text' or 'json'." - ), -) -> None: - """Show DataDesignerConfigBuilder method signatures and documentation.""" - _make_controller(output_format).show_builder() - - -@agent_context_app.command(name="constraints") -def constraints_command( - output_format: OutputFormat = typer.Option( - OutputFormat.TEXT, "--format", "-f", help="Output format: 'text' or 'json'." 
- ), -) -> None: - """Show constraint types (ScalarInequality, ColumnInequality, operators).""" - _make_controller(output_format).show_constraints() - - -@agent_context_app.command(name="seeds") -def seeds_command( - output_format: OutputFormat = typer.Option( - OutputFormat.TEXT, "--format", "-f", help="Output format: 'text' or 'json'." - ), -) -> None: - """Show seed dataset types (SeedConfig, sources, sampling strategies).""" - _make_controller(output_format).show_seeds() - - -@agent_context_app.command(name="mcp") -def mcp_command( - output_format: OutputFormat = typer.Option( - OutputFormat.TEXT, "--format", "-f", help="Output format: 'text' or 'json'." - ), -) -> None: - """Show MCP provider types (MCPProvider, LocalStdioMCPProvider, ToolConfig).""" - _make_controller(output_format).show_mcp() - - -@agent_context_app.command(name="interface") -def interface_command( - output_format: OutputFormat = typer.Option( - OutputFormat.TEXT, "--format", "-f", help="Output format: 'text' or 'json'." - ), -) -> None: - """Show DataDesigner class methods, result types, and RunConfig fields.""" - _make_controller(output_format).show_interface() - - -@agent_context_app.command(name="imports") -def imports_command( - category: str | None = typer.Argument(None, help="Filter by category (e.g., 'columns'), or omit for all."), - output_format: OutputFormat = typer.Option( - OutputFormat.TEXT, "--format", "-f", help="Output format: 'text' or 'json'." - ), -) -> None: - """Show categorized import reference for data_designer.config and data_designer.interface.""" - _make_controller(output_format).show_imports(category) - - -@agent_context_app.command(name="code-structure") -def code_structure_command( - depth: int = typer.Option(2, "--depth", "-d", help="Max tree depth (default: 2)."), - output_format: OutputFormat = typer.Option( - OutputFormat.TEXT, "--format", "-f", help="Output format: 'text' or 'json'." 
- ), -) -> None: - """Show the data_designer package structure and install paths.""" - _make_controller(output_format).show_code_structure(depth=depth) - - -@agent_context_app.command(name="overview") -def overview_command( - output_format: OutputFormat = typer.Option( - OutputFormat.TEXT, "--format", "-f", help="Output format: 'text' or 'json'." - ), -) -> None: - """Show compact API cheatsheet with type counts, builder summary, and quick start commands.""" - _make_controller(output_format).show_overview() diff --git a/packages/data-designer/src/data_designer/cli/commands/reference.py b/packages/data-designer/src/data_designer/cli/commands/reference.py new file mode 100644 index 000000000..53c831c35 --- /dev/null +++ b/packages/data-designer/src/data_designer/cli/commands/reference.py @@ -0,0 +1,66 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +import typer + +from data_designer.cli.controllers.introspection_controller import IntrospectionController, OutputFormat + +reference_app = typer.Typer( + name="reference", + help="Reference documentation for Data Designer (overview, interface, code structure, builder, imports).", + no_args_is_help=True, +) + + +@reference_app.command(name="overview") +def overview_command( + output_format: OutputFormat = typer.Option( + OutputFormat.TEXT, "--format", "-f", help="Output format: 'text' or 'json'." + ), +) -> None: + """Show compact API cheatsheet with type counts, builder summary, and quick start commands.""" + IntrospectionController(output_format=output_format.value).show_overview() + + +@reference_app.command(name="code-structure") +def code_structure_command( + depth: int = typer.Option(2, "--depth", "-d", help="Max tree depth (default: 2)."), + output_format: OutputFormat = typer.Option( + OutputFormat.TEXT, "--format", "-f", help="Output format: 'text' or 'json'." 
+ ), +) -> None: + """Show the data_designer package structure and install paths.""" + IntrospectionController(output_format=output_format.value).show_code_structure(depth=depth) + + +@reference_app.command(name="builder") +def builder_command( + output_format: OutputFormat = typer.Option( + OutputFormat.TEXT, "--format", "-f", help="Output format: 'text' or 'json'." + ), +) -> None: + """Show DataDesignerConfigBuilder method signatures and documentation.""" + IntrospectionController(output_format=output_format.value).show_builder() + + +@reference_app.command(name="interface") +def interface_command( + output_format: OutputFormat = typer.Option( + OutputFormat.TEXT, "--format", "-f", help="Output format: 'text' or 'json'." + ), +) -> None: + """Show DataDesigner class methods, result types, and RunConfig fields.""" + IntrospectionController(output_format=output_format.value).show_interface() + + +@reference_app.command(name="imports") +def imports_command( + category: str | None = typer.Argument(None, help="Filter by category (e.g., 'columns'), or omit for all."), + output_format: OutputFormat = typer.Option( + OutputFormat.TEXT, "--format", "-f", help="Output format: 'text' or 'json'." + ), +) -> None: + """Show categorized import reference for data_designer.config and data_designer.interface.""" + IntrospectionController(output_format=output_format.value).show_imports(category) diff --git a/packages/data-designer/src/data_designer/cli/commands/types.py b/packages/data-designer/src/data_designer/cli/commands/types.py new file mode 100644 index 000000000..5e3195790 --- /dev/null +++ b/packages/data-designer/src/data_designer/cli/commands/types.py @@ -0,0 +1,125 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +import typer + +from data_designer.cli.controllers.introspection_controller import IntrospectionController, OutputFormat + +types_app = typer.Typer( + name="types", + help="Explore Data Designer configuration types (columns, samplers, validators, etc.).", + no_args_is_help=True, +) + + +def _print_usage_hint(command_name: str) -> None: + """Print a usage hint after the type list (text format only).""" + typer.echo("") + typer.echo(f"Tip: Run `data-designer types {command_name} ` for full schema details.") + typer.echo(f" Run `data-designer types {command_name} all` to see every type expanded.") + + +@types_app.command(name="columns") +def columns_command( + type_name: str | None = typer.Argument( + None, help="Column type to display (e.g., 'llm-text'), or 'all' for everything." + ), + output_format: OutputFormat = typer.Option( + OutputFormat.TEXT, "--format", "-f", help="Output format: 'text' or 'json'." + ), +) -> None: + """Show column configuration types and their fields.""" + ctrl = IntrospectionController(output_format=output_format.value) + ctrl.show_columns(type_name) + if type_name is None and output_format == OutputFormat.TEXT: + _print_usage_hint("columns") + + +@types_app.command(name="samplers") +def samplers_command( + type_name: str | None = typer.Argument( + None, help="Sampler type to display (e.g., 'category'), or 'all' for everything." + ), + output_format: OutputFormat = typer.Option( + OutputFormat.TEXT, "--format", "-f", help="Output format: 'text' or 'json'." 
+ ), +) -> None: + """Show sampler types and their parameter fields.""" + ctrl = IntrospectionController(output_format=output_format.value) + ctrl.show_samplers(type_name) + if type_name is None and output_format == OutputFormat.TEXT: + _print_usage_hint("samplers") + + +@types_app.command(name="validators") +def validators_command( + type_name: str | None = typer.Argument( + None, help="Validator type to display (e.g., 'code'), or 'all' for everything." + ), + output_format: OutputFormat = typer.Option( + OutputFormat.TEXT, "--format", "-f", help="Output format: 'text' or 'json'." + ), +) -> None: + """Show validator types and their parameter fields.""" + ctrl = IntrospectionController(output_format=output_format.value) + ctrl.show_validators(type_name) + if type_name is None and output_format == OutputFormat.TEXT: + _print_usage_hint("validators") + + +@types_app.command(name="processors") +def processors_command( + type_name: str | None = typer.Argument( + None, help="Processor type to display (e.g., 'drop_columns'), or 'all' for everything." + ), + output_format: OutputFormat = typer.Option( + OutputFormat.TEXT, "--format", "-f", help="Output format: 'text' or 'json'." + ), +) -> None: + """Show processor types and their configuration fields.""" + ctrl = IntrospectionController(output_format=output_format.value) + ctrl.show_processors(type_name) + if type_name is None and output_format == OutputFormat.TEXT: + _print_usage_hint("processors") + + +@types_app.command(name="models") +def models_command( + output_format: OutputFormat = typer.Option( + OutputFormat.TEXT, "--format", "-f", help="Output format: 'text' or 'json'." 
+ ), +) -> None: + """Show model configuration types (ModelConfig, inference params, distributions).""" + IntrospectionController(output_format=output_format.value).show_models() + + +@types_app.command(name="constraints") +def constraints_command( + output_format: OutputFormat = typer.Option( + OutputFormat.TEXT, "--format", "-f", help="Output format: 'text' or 'json'." + ), +) -> None: + """Show constraint types (ScalarInequality, ColumnInequality, operators).""" + IntrospectionController(output_format=output_format.value).show_constraints() + + +@types_app.command(name="seeds") +def seeds_command( + output_format: OutputFormat = typer.Option( + OutputFormat.TEXT, "--format", "-f", help="Output format: 'text' or 'json'." + ), +) -> None: + """Show seed dataset types (SeedConfig, sources, sampling strategies).""" + IntrospectionController(output_format=output_format.value).show_seeds() + + +@types_app.command(name="mcp") +def mcp_command( + output_format: OutputFormat = typer.Option( + OutputFormat.TEXT, "--format", "-f", help="Output format: 'text' or 'json'." + ), +) -> None: + """Show MCP provider types (MCPProvider, LocalStdioMCPProvider, ToolConfig).""" + IntrospectionController(output_format=output_format.value).show_mcp() diff --git a/packages/data-designer/src/data_designer/cli/controllers/__init__.py b/packages/data-designer/src/data_designer/cli/controllers/__init__.py index e5725ea5a..f568a3015 100644 --- a/packages/data-designer/src/data_designer/cli/controllers/__init__.py +++ b/packages/data-designer/src/data_designer/cli/controllers/__init__.py @@ -1,2 +1,18 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +from data_designer.cli.controllers.download_controller import DownloadController +from data_designer.cli.controllers.generation_controller import GenerationController +from data_designer.cli.controllers.introspection_controller import IntrospectionController +from data_designer.cli.controllers.model_controller import ModelController +from data_designer.cli.controllers.provider_controller import ProviderController + +__all__ = [ + "DownloadController", + "GenerationController", + "IntrospectionController", + "ModelController", + "ProviderController", +] diff --git a/packages/data-designer/src/data_designer/cli/controllers/agent_context_controller.py b/packages/data-designer/src/data_designer/cli/controllers/introspection_controller.py similarity index 95% rename from packages/data-designer/src/data_designer/cli/controllers/agent_context_controller.py rename to packages/data-designer/src/data_designer/cli/controllers/introspection_controller.py index 9406b9ff6..7cf315fc8 100644 --- a/packages/data-designer/src/data_designer/cli/controllers/agent_context_controller.py +++ b/packages/data-designer/src/data_designer/cli/controllers/introspection_controller.py @@ -1,9 +1,10 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations import json +from enum import Enum import typer @@ -39,7 +40,14 @@ from data_designer.config.config_builder import DataDesignerConfigBuilder -class AgentContextController: +class OutputFormat(str, Enum): + """Supported output formats for introspect commands.""" + + TEXT = "text" + JSON = "json" + + +class IntrospectionController: """Controller for introspect CLI commands. 
Orchestrates discovery, inspection, formatting, and output for all @@ -49,26 +57,24 @@ class AgentContextController: def __init__(self, output_format: str = "text") -> None: self._format = output_format - def show_columns(self, type_name: str | None, list_mode: bool) -> None: + def show_columns(self, type_name: str | None) -> None: """Show column configuration types.""" items = discover_column_configs() self._show_typed_items( items=items, type_name=type_name, - list_mode=list_mode, type_key="column_type", type_label="column_type", class_label="config_class", header_title="Data Designer Column Types Reference", ) - def show_samplers(self, type_name: str | None, list_mode: bool) -> None: + def show_samplers(self, type_name: str | None) -> None: """Show sampler types and their param classes.""" items = discover_sampler_types() self._show_typed_items( items=items, type_name=type_name, - list_mode=list_mode, type_key="sampler_type", type_label="sampler_type", class_label="params_class", @@ -77,13 +83,12 @@ def show_samplers(self, type_name: str | None, list_mode: bool) -> None: uppercase_value=True, ) - def show_validators(self, type_name: str | None, list_mode: bool) -> None: + def show_validators(self, type_name: str | None) -> None: """Show validator types and their param classes.""" items = discover_validator_types() self._show_typed_items( items=items, type_name=type_name, - list_mode=list_mode, type_key="validator_type", type_label="validator_type", class_label="params_class", @@ -92,13 +97,12 @@ def show_validators(self, type_name: str | None, list_mode: bool) -> None: uppercase_value=True, ) - def show_processors(self, type_name: str | None, list_mode: bool) -> None: + def show_processors(self, type_name: str | None) -> None: """Show processor types and their config classes.""" items = discover_processor_configs() self._show_typed_items( items=items, type_name=type_name, - list_mode=list_mode, type_key="processor_type", type_label="processor_type", 
class_label="config_class", @@ -249,7 +253,6 @@ def _show_typed_items( self, items: dict[str, type], type_name: str | None, - list_mode: bool, type_key: str, type_label: str, class_label: str, @@ -258,7 +261,7 @@ def _show_typed_items( uppercase_value: bool = False, ) -> None: """Shared logic for type-based commands (columns, samplers, validators, processors).""" - if list_mode or type_name is None: + if type_name is None: if self._format == "json": typer.echo(json.dumps({k: v.__name__ for k, v in sorted(items.items())}, indent=2)) else: diff --git a/packages/data-designer/src/data_designer/cli/main.py b/packages/data-designer/src/data_designer/cli/main.py index df69cea70..672c4d8af 100644 --- a/packages/data-designer/src/data_designer/cli/main.py +++ b/packages/data-designer/src/data_designer/cli/main.py @@ -5,7 +5,7 @@ import typer -from data_designer.cli.commands import agent_context +from data_designer.cli.commands import reference, types from data_designer.cli.lazy_group import create_lazy_typer_group _CMD = "data_designer.cli.commands" @@ -104,7 +104,8 @@ app.add_typer(download_app, name="download", rich_help_panel="Setup Commands") # Add agent command groups -app.add_typer(agent_context.agent_context_app, name="introspect", rich_help_panel="Agent Commands") +app.add_typer(types.types_app, name="types", rich_help_panel="Agent Commands") +app.add_typer(reference.reference_app, name="reference", rich_help_panel="Agent Commands") def main() -> None: diff --git a/packages/data-designer/tests/cli/commands/test_agent_context_command.py b/packages/data-designer/tests/cli/commands/test_introspection_commands.py similarity index 72% rename from packages/data-designer/tests/cli/commands/test_agent_context_command.py rename to packages/data-designer/tests/cli/commands/test_introspection_commands.py index 4bd3960ca..e054e0ecd 100644 --- a/packages/data-designer/tests/cli/commands/test_agent_context_command.py +++ 
b/packages/data-designer/tests/cli/commands/test_introspection_commands.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations @@ -17,31 +17,38 @@ # --------------------------------------------------------------------------- -def test_agent_context_help() -> None: - result = runner.invoke(app, ["introspect", "--help"]) +def test_types_help() -> None: + result = runner.invoke(app, ["types", "--help"]) assert result.exit_code == 0 assert "columns" in result.output +def test_reference_help() -> None: + result = runner.invoke(app, ["reference", "--help"]) + assert result.exit_code == 0 + assert "overview" in result.output + + # --------------------------------------------------------------------------- # columns # --------------------------------------------------------------------------- def test_columns_list() -> None: - result = runner.invoke(app, ["introspect", "columns", "--list"]) + result = runner.invoke(app, ["types", "columns"]) assert result.exit_code == 0 assert "llm-text" in result.output + assert "data-designer types columns" in result.output def test_columns_specific_type() -> None: - result = runner.invoke(app, ["introspect", "columns", "llm-text"]) + result = runner.invoke(app, ["types", "columns", "llm-text"]) assert result.exit_code == 0 assert "LLMTextColumnConfig" in result.output def test_columns_json_format() -> None: - result = runner.invoke(app, ["introspect", "columns", "llm-text", "--format", "json"]) + result = runner.invoke(app, ["types", "columns", "llm-text", "--format", "json"]) assert result.exit_code == 0 data = json.loads(result.output) assert isinstance(data, dict) @@ -49,7 +56,7 @@ def test_columns_json_format() -> None: def test_columns_nonexistent_exits_with_error() -> None: - result = 
runner.invoke(app, ["introspect", "columns", "nonexistent"]) + result = runner.invoke(app, ["types", "columns", "nonexistent"]) assert result.exit_code == 1 @@ -59,15 +66,16 @@ def test_columns_nonexistent_exits_with_error() -> None: def test_samplers_specific() -> None: - result = runner.invoke(app, ["introspect", "samplers", "category"]) + result = runner.invoke(app, ["types", "samplers", "category"]) assert result.exit_code == 0 assert "CATEGORY" in result.output def test_samplers_list() -> None: - result = runner.invoke(app, ["introspect", "samplers", "--list"]) + result = runner.invoke(app, ["types", "samplers"]) assert result.exit_code == 0 assert "category" in result.output + assert "data-designer types samplers" in result.output # --------------------------------------------------------------------------- @@ -76,7 +84,7 @@ def test_samplers_list() -> None: def test_overview() -> None: - result = runner.invoke(app, ["introspect", "overview"]) + result = runner.invoke(app, ["reference", "overview"]) assert result.exit_code == 0 assert "Type Counts" in result.output @@ -87,7 +95,7 @@ def test_overview() -> None: def test_builder() -> None: - result = runner.invoke(app, ["introspect", "builder"]) + result = runner.invoke(app, ["reference", "builder"]) assert result.exit_code == 0 assert "add_column" in result.output assert "DataDesignerConfigBuilder" in result.output @@ -95,7 +103,7 @@ def test_builder() -> None: def test_builder_json() -> None: - result = runner.invoke(app, ["introspect", "builder", "--format", "json"]) + result = runner.invoke(app, ["reference", "builder", "--format", "json"]) assert result.exit_code == 0 data = json.loads(result.output) assert isinstance(data, list) @@ -109,14 +117,14 @@ def test_builder_json() -> None: def test_models() -> None: - result = runner.invoke(app, ["introspect", "models"]) + result = runner.invoke(app, ["types", "models"]) assert result.exit_code == 0 assert "ModelConfig" in result.output assert "description:" in 
result.output def test_models_json() -> None: - result = runner.invoke(app, ["introspect", "models", "--format", "json"]) + result = runner.invoke(app, ["types", "models", "--format", "json"]) assert result.exit_code == 0 data = json.loads(result.output) assert isinstance(data, list) @@ -130,7 +138,7 @@ def test_models_json() -> None: def test_constraints() -> None: - result = runner.invoke(app, ["introspect", "constraints"]) + result = runner.invoke(app, ["types", "constraints"]) assert result.exit_code == 0 output = result.output assert "ScalarInequalityConstraint" in output or "InequalityOperator" in output @@ -142,14 +150,14 @@ def test_constraints() -> None: def test_seeds() -> None: - result = runner.invoke(app, ["introspect", "seeds"]) + result = runner.invoke(app, ["types", "seeds"]) assert result.exit_code == 0 assert "SeedConfig" in result.output assert "SamplingStrategy" in result.output def test_seeds_json() -> None: - result = runner.invoke(app, ["introspect", "seeds", "--format", "json"]) + result = runner.invoke(app, ["types", "seeds", "--format", "json"]) assert result.exit_code == 0 data = json.loads(result.output) assert isinstance(data, list) @@ -163,14 +171,14 @@ def test_seeds_json() -> None: def test_mcp() -> None: - result = runner.invoke(app, ["introspect", "mcp"]) + result = runner.invoke(app, ["types", "mcp"]) assert result.exit_code == 0 assert "ToolConfig" in result.output assert "MCPProvider" in result.output or "LocalStdioMCPProvider" in result.output def test_mcp_json() -> None: - result = runner.invoke(app, ["introspect", "mcp", "--format", "json"]) + result = runner.invoke(app, ["types", "mcp", "--format", "json"]) assert result.exit_code == 0 data = json.loads(result.output) assert isinstance(data, list) @@ -184,21 +192,21 @@ def test_mcp_json() -> None: def test_code_structure() -> None: - result = runner.invoke(app, ["introspect", "code-structure"]) + result = runner.invoke(app, ["reference", "code-structure"]) assert 
result.exit_code == 0 assert "data_designer code structure" in result.output assert "├──" in result.output def test_code_structure_shows_subpackages() -> None: - result = runner.invoke(app, ["introspect", "code-structure"]) + result = runner.invoke(app, ["reference", "code-structure"]) assert result.exit_code == 0 for pkg in ("config/", "engine/", "cli/"): assert pkg in result.output, f"Expected '{pkg}' in output" def test_code_structure_json_format() -> None: - result = runner.invoke(app, ["introspect", "code-structure", "--format", "json"]) + result = runner.invoke(app, ["reference", "code-structure", "--format", "json"]) assert result.exit_code == 0 data = json.loads(result.output) assert "paths" in data @@ -206,7 +214,7 @@ def test_code_structure_json_format() -> None: def test_code_structure_shows_agent_guidance() -> None: - result = runner.invoke(app, ["introspect", "code-structure"]) + result = runner.invoke(app, ["reference", "code-structure"]) assert result.exit_code == 0 assert "Only read source files directly" in result.output @@ -217,14 +225,14 @@ def test_code_structure_shows_agent_guidance() -> None: def test_interface() -> None: - result = runner.invoke(app, ["introspect", "interface"]) + result = runner.invoke(app, ["reference", "interface"]) assert result.exit_code == 0 assert "DataDesigner" in result.output assert "create" in result.output def test_interface_json() -> None: - result = runner.invoke(app, ["introspect", "interface", "--format", "json"]) + result = runner.invoke(app, ["reference", "interface", "--format", "json"]) assert result.exit_code == 0 data = json.loads(result.output) assert "methods" in data @@ -232,7 +240,7 @@ def test_interface_json() -> None: def test_interface_shows_result_types() -> None: - result = runner.invoke(app, ["introspect", "interface"]) + result = runner.invoke(app, ["reference", "interface"]) assert result.exit_code == 0 assert "DatasetCreationResults" in result.output @@ -243,17 +251,20 @@ def 
test_interface_shows_result_types() -> None: def test_imports() -> None: - result = runner.invoke(app, ["introspect", "imports"]) + result = runner.invoke(app, ["reference", "imports"]) assert result.exit_code == 0 - assert "from data_designer.config import" in result.output + assert "import data_designer.config as dd" in result.output + assert "dd." in result.output def test_imports_json() -> None: - result = runner.invoke(app, ["introspect", "imports", "--format", "json"]) + result = runner.invoke(app, ["reference", "imports", "--format", "json"]) assert result.exit_code == 0 data = json.loads(result.output) assert isinstance(data, dict) - assert len(data) > 0 + assert "recommended_imports" in data + assert "categories" in data + assert len(data["categories"]) > 0 # --------------------------------------------------------------------------- @@ -262,20 +273,20 @@ def test_imports_json() -> None: def test_invalid_format_rejected() -> None: - result = runner.invoke(app, ["introspect", "columns", "--list", "--format", "xml"]) + result = runner.invoke(app, ["types", "columns", "--format", "xml"]) assert result.exit_code != 0 def test_invalid_format_rejected_on_builder() -> None: - result = runner.invoke(app, ["introspect", "builder", "--format", "yaml"]) + result = runner.invoke(app, ["reference", "builder", "--format", "yaml"]) assert result.exit_code != 0 def test_valid_format_text() -> None: - result = runner.invoke(app, ["introspect", "columns", "--list", "--format", "text"]) + result = runner.invoke(app, ["types", "columns", "--format", "text"]) assert result.exit_code == 0 def test_valid_format_json() -> None: - result = runner.invoke(app, ["introspect", "columns", "--list", "--format", "json"]) + result = runner.invoke(app, ["types", "columns", "--format", "json"]) assert result.exit_code == 0 diff --git a/packages/data-designer/tests/cli/controllers/test_agent_context_controller.py b/packages/data-designer/tests/cli/controllers/test_introspection_controller.py 
similarity index 73% rename from packages/data-designer/tests/cli/controllers/test_agent_context_controller.py rename to packages/data-designer/tests/cli/controllers/test_introspection_controller.py index 34b53b5a2..2b4c6b532 100644 --- a/packages/data-designer/tests/cli/controllers/test_agent_context_controller.py +++ b/packages/data-designer/tests/cli/controllers/test_introspection_controller.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations @@ -8,7 +8,7 @@ import click.exceptions import pytest -from data_designer.cli.controllers.agent_context_controller import AgentContextController +from data_designer.cli.controllers.introspection_controller import IntrospectionController # --------------------------------------------------------------------------- # show_columns @@ -16,37 +16,37 @@ def test_show_columns_list_mode(capsys: pytest.CaptureFixture[str]) -> None: - controller = AgentContextController(output_format="text") - controller.show_columns(type_name=None, list_mode=True) + controller = IntrospectionController(output_format="text") + controller.show_columns(type_name=None) captured = capsys.readouterr() assert "llm-text" in captured.out assert "sampler" in captured.out def test_show_columns_specific_type(capsys: pytest.CaptureFixture[str]) -> None: - controller = AgentContextController(output_format="text") - controller.show_columns(type_name="llm-text", list_mode=False) + controller = IntrospectionController(output_format="text") + controller.show_columns(type_name="llm-text") captured = capsys.readouterr() assert "LLMTextColumnConfig" in captured.out def test_show_columns_all(capsys: pytest.CaptureFixture[str]) -> None: - controller = AgentContextController(output_format="text") - 
controller.show_columns(type_name="all", list_mode=False) + controller = IntrospectionController(output_format="text") + controller.show_columns(type_name="all") captured = capsys.readouterr() assert "llm-text" in captured.out assert "sampler" in captured.out def test_show_columns_nonexistent_type_exits() -> None: - controller = AgentContextController(output_format="text") + controller = IntrospectionController(output_format="text") with pytest.raises(click.exceptions.Exit): - controller.show_columns(type_name="nonexistent_type_xyz", list_mode=False) + controller.show_columns(type_name="nonexistent_type_xyz") def test_show_columns_json_format(capsys: pytest.CaptureFixture[str]) -> None: - controller = AgentContextController(output_format="json") - controller.show_columns(type_name="llm-text", list_mode=False) + controller = IntrospectionController(output_format="json") + controller.show_columns(type_name="llm-text") captured = capsys.readouterr() data = json.loads(captured.out) assert isinstance(data, dict) @@ -54,8 +54,8 @@ def test_show_columns_json_format(capsys: pytest.CaptureFixture[str]) -> None: def test_show_columns_list_json_format(capsys: pytest.CaptureFixture[str]) -> None: - controller = AgentContextController(output_format="json") - controller.show_columns(type_name=None, list_mode=True) + controller = IntrospectionController(output_format="json") + controller.show_columns(type_name=None) captured = capsys.readouterr() data = json.loads(captured.out) assert isinstance(data, dict) @@ -68,7 +68,7 @@ def test_show_columns_list_json_format(capsys: pytest.CaptureFixture[str]) -> No def test_show_overview_text(capsys: pytest.CaptureFixture[str]) -> None: - controller = AgentContextController(output_format="text") + controller = IntrospectionController(output_format="text") controller.show_overview() captured = capsys.readouterr() assert "Data Designer API Overview" in captured.out @@ -76,7 +76,7 @@ def test_show_overview_text(capsys: 
pytest.CaptureFixture[str]) -> None: def test_show_overview_json(capsys: pytest.CaptureFixture[str]) -> None: - controller = AgentContextController(output_format="json") + controller = IntrospectionController(output_format="json") controller.show_overview() captured = capsys.readouterr() data = json.loads(captured.out) @@ -94,15 +94,15 @@ def test_show_overview_json(capsys: pytest.CaptureFixture[str]) -> None: def test_show_samplers_list(capsys: pytest.CaptureFixture[str]) -> None: - controller = AgentContextController(output_format="text") - controller.show_samplers(type_name=None, list_mode=True) + controller = IntrospectionController(output_format="text") + controller.show_samplers(type_name=None) captured = capsys.readouterr() assert "category" in captured.out def test_show_samplers_specific(capsys: pytest.CaptureFixture[str]) -> None: - controller = AgentContextController(output_format="text") - controller.show_samplers(type_name="category", list_mode=False) + controller = IntrospectionController(output_format="text") + controller.show_samplers(type_name="category") captured = capsys.readouterr() assert "CATEGORY" in captured.out @@ -113,7 +113,7 @@ def test_show_samplers_specific(capsys: pytest.CaptureFixture[str]) -> None: def test_show_models(capsys: pytest.CaptureFixture[str]) -> None: - controller = AgentContextController(output_format="text") + controller = IntrospectionController(output_format="text") controller.show_models() captured = capsys.readouterr() assert "ModelConfig" in captured.out @@ -125,7 +125,7 @@ def test_show_models(capsys: pytest.CaptureFixture[str]) -> None: def test_show_builder(capsys: pytest.CaptureFixture[str]) -> None: - controller = AgentContextController(output_format="text") + controller = IntrospectionController(output_format="text") controller.show_builder() captured = capsys.readouterr() assert "add_column" in captured.out @@ -137,7 +137,7 @@ def test_show_builder(capsys: pytest.CaptureFixture[str]) -> None: def 
test_show_constraints(capsys: pytest.CaptureFixture[str]) -> None: - controller = AgentContextController(output_format="text") + controller = IntrospectionController(output_format="text") controller.show_constraints() captured = capsys.readouterr() assert "ScalarInequalityConstraint" in captured.out @@ -149,7 +149,7 @@ def test_show_constraints(capsys: pytest.CaptureFixture[str]) -> None: def test_show_seeds(capsys: pytest.CaptureFixture[str]) -> None: - controller = AgentContextController(output_format="text") + controller = IntrospectionController(output_format="text") controller.show_seeds() captured = capsys.readouterr() assert "SeedConfig" in captured.out @@ -161,7 +161,7 @@ def test_show_seeds(capsys: pytest.CaptureFixture[str]) -> None: def test_show_mcp(capsys: pytest.CaptureFixture[str]) -> None: - controller = AgentContextController(output_format="text") + controller = IntrospectionController(output_format="text") controller.show_mcp() captured = capsys.readouterr() assert "ToolConfig" in captured.out @@ -173,7 +173,7 @@ def test_show_mcp(capsys: pytest.CaptureFixture[str]) -> None: def test_show_interface_text(capsys: pytest.CaptureFixture[str]) -> None: - controller = AgentContextController(output_format="text") + controller = IntrospectionController(output_format="text") controller.show_interface() captured = capsys.readouterr() assert "DataDesigner" in captured.out @@ -182,7 +182,7 @@ def test_show_interface_text(capsys: pytest.CaptureFixture[str]) -> None: def test_show_interface_json(capsys: pytest.CaptureFixture[str]) -> None: - controller = AgentContextController(output_format="json") + controller = IntrospectionController(output_format="json") controller.show_interface() captured = capsys.readouterr() data = json.loads(captured.out) @@ -199,16 +199,16 @@ def test_show_interface_json(capsys: pytest.CaptureFixture[str]) -> None: def test_show_validators_list_text(capsys: pytest.CaptureFixture[str]) -> None: - controller = 
AgentContextController(output_format="text") - controller.show_validators(type_name=None, list_mode=True) + controller = IntrospectionController(output_format="text") + controller.show_validators(type_name=None) captured = capsys.readouterr() assert "validator_type" in captured.out assert "params_class" in captured.out def test_show_validators_list_json(capsys: pytest.CaptureFixture[str]) -> None: - controller = AgentContextController(output_format="json") - controller.show_validators(type_name=None, list_mode=True) + controller = IntrospectionController(output_format="json") + controller.show_validators(type_name=None) captured = capsys.readouterr() data = json.loads(captured.out) assert isinstance(data, dict) @@ -216,15 +216,15 @@ def test_show_validators_list_json(capsys: pytest.CaptureFixture[str]) -> None: def test_show_validators_specific_text(capsys: pytest.CaptureFixture[str]) -> None: - controller = AgentContextController(output_format="text") - controller.show_validators(type_name="code", list_mode=False) + controller = IntrospectionController(output_format="text") + controller.show_validators(type_name="code") captured = capsys.readouterr() assert "CODE" in captured.out def test_show_validators_specific_json(capsys: pytest.CaptureFixture[str]) -> None: - controller = AgentContextController(output_format="json") - controller.show_validators(type_name="code", list_mode=False) + controller = IntrospectionController(output_format="json") + controller.show_validators(type_name="code") captured = capsys.readouterr() data = json.loads(captured.out) assert isinstance(data, dict) @@ -237,16 +237,16 @@ def test_show_validators_specific_json(capsys: pytest.CaptureFixture[str]) -> No def test_show_processors_list_text(capsys: pytest.CaptureFixture[str]) -> None: - controller = AgentContextController(output_format="text") - controller.show_processors(type_name=None, list_mode=True) + controller = IntrospectionController(output_format="text") + 
controller.show_processors(type_name=None) captured = capsys.readouterr() assert "processor_type" in captured.out assert "config_class" in captured.out def test_show_processors_list_json(capsys: pytest.CaptureFixture[str]) -> None: - controller = AgentContextController(output_format="json") - controller.show_processors(type_name=None, list_mode=True) + controller = IntrospectionController(output_format="json") + controller.show_processors(type_name=None) captured = capsys.readouterr() data = json.loads(captured.out) assert isinstance(data, dict) @@ -259,31 +259,33 @@ def test_show_processors_list_json(capsys: pytest.CaptureFixture[str]) -> None: def test_show_imports_text(capsys: pytest.CaptureFixture[str]) -> None: - controller = AgentContextController(output_format="text") + controller = IntrospectionController(output_format="text") controller.show_imports() captured = capsys.readouterr() - assert "from data_designer.config import" in captured.out + assert "import data_designer.config as dd" in captured.out + assert "dd." in captured.out def test_show_imports_with_category_filter(capsys: pytest.CaptureFixture[str]) -> None: - controller = AgentContextController(output_format="text") + controller = IntrospectionController(output_format="text") controller.show_imports(category="columns") captured = capsys.readouterr() assert "Column Configs" in captured.out - assert "from data_designer.config import" in captured.out + assert "dd." 
in captured.out def test_show_imports_with_category_filter_json(capsys: pytest.CaptureFixture[str]) -> None: - controller = AgentContextController(output_format="json") + controller = IntrospectionController(output_format="json") controller.show_imports(category="columns") captured = capsys.readouterr() data = json.loads(captured.out) assert isinstance(data, dict) - assert "Column Configs" in data + assert "categories" in data + assert "Column Configs" in data["categories"] def test_show_imports_with_invalid_category(capsys: pytest.CaptureFixture[str]) -> None: - controller = AgentContextController(output_format="text") + controller = IntrospectionController(output_format="text") with pytest.raises(click.exceptions.Exit): controller.show_imports(category="nonexistent_xyz") @@ -294,7 +296,7 @@ def test_show_imports_with_invalid_category(capsys: pytest.CaptureFixture[str]) def test_show_code_structure_text(capsys: pytest.CaptureFixture[str]) -> None: - controller = AgentContextController(output_format="text") + controller = IntrospectionController(output_format="text") controller.show_code_structure() captured = capsys.readouterr() assert "data_designer code structure" in captured.out @@ -302,7 +304,7 @@ def test_show_code_structure_text(capsys: pytest.CaptureFixture[str]) -> None: def test_show_code_structure_json(capsys: pytest.CaptureFixture[str]) -> None: - controller = AgentContextController(output_format="json") + controller = IntrospectionController(output_format="json") controller.show_code_structure() captured = capsys.readouterr() data = json.loads(captured.out) @@ -318,54 +320,54 @@ def test_show_code_structure_json(capsys: pytest.CaptureFixture[str]) -> None: def test_match_category_exact_match() -> None: keys = ["Column Configs", "Builder", "Model Configs"] - assert AgentContextController._match_category("Column Configs", keys) == "Column Configs" + assert IntrospectionController._match_category("Column Configs", keys) == "Column Configs" def 
test_match_category_exact_match_case_insensitive() -> None: keys = ["Column Configs", "Builder", "Model Configs"] - assert AgentContextController._match_category("column configs", keys) == "Column Configs" - assert AgentContextController._match_category("BUILDER", keys) == "Builder" + assert IntrospectionController._match_category("column configs", keys) == "Column Configs" + assert IntrospectionController._match_category("BUILDER", keys) == "Builder" def test_match_category_first_word_stem_match() -> None: keys = ["Column Configs", "Builder", "Model Configs"] # "columns" -> rstrip("s") -> "column", matches first word "column" of "Column Configs" - assert AgentContextController._match_category("columns", keys) == "Column Configs" + assert IntrospectionController._match_category("columns", keys) == "Column Configs" def test_match_category_first_word_stem_match_singular() -> None: keys = ["Column Configs", "Builder", "Model Configs"] # "column" is already stemmed, matches first word "column" - assert AgentContextController._match_category("column", keys) == "Column Configs" + assert IntrospectionController._match_category("column", keys) == "Column Configs" def test_match_category_any_word_stem_match() -> None: keys = ["Column Configs", "Builder", "Model Configs"] # "configs" -> rstrip("s") -> "config", matches second word of "Column Configs" - assert AgentContextController._match_category("configs", keys) == "Column Configs" + assert IntrospectionController._match_category("configs", keys) == "Column Configs" def test_match_category_substring_match() -> None: keys = ["Column Configs", "Builder", "Model Configs"] # "uild" is a substring of "Builder" - assert AgentContextController._match_category("uild", keys) == "Builder" + assert IntrospectionController._match_category("uild", keys) == "Builder" def test_match_category_substring_picks_earliest_position() -> None: keys = ["ABC-foo", "foo-ABC"] # "foo" appears at position 4 in "ABC-foo" and position 0 in "foo-ABC" - 
assert AgentContextController._match_category("foo", keys) == "foo-ABC" + assert IntrospectionController._match_category("foo", keys) == "foo-ABC" def test_match_category_no_match() -> None: keys = ["Column Configs", "Builder", "Model Configs"] - assert AgentContextController._match_category("zzzzz_nonexistent", keys) is None + assert IntrospectionController._match_category("zzzzz_nonexistent", keys) is None def test_match_category_empty_string() -> None: keys = ["Column Configs", "Builder", "Model Configs"] # Empty string is a substring of everything; earliest position (0) wins - result = AgentContextController._match_category("", keys) + result = IntrospectionController._match_category("", keys) assert result is not None @@ -376,14 +378,14 @@ def test_match_category_process_rstrip_s_edge_case() -> None: # First-word stem: "Processors" first word is "processor" -> rstrip("s") -> "processor" != "proces" # Any-word stem: same # Falls to substring: "process" is a substring of "Processors" at pos 0 - assert AgentContextController._match_category("process", keys) == "Processors" + assert IntrospectionController._match_category("process", keys) == "Processors" def test_match_category_empty_keys_list() -> None: - assert AgentContextController._match_category("anything", []) is None + assert IntrospectionController._match_category("anything", []) is None def test_match_category_model_stem() -> None: keys = ["Column Configs", "Builder", "Model Configs"] # "models" -> rstrip("s") -> "model", matches first word "model" of "Model Configs" - assert AgentContextController._match_category("models", keys) == "Model Configs" + assert IntrospectionController._match_category("models", keys) == "Model Configs" From 37ae075be555fd77d894d0749d012662925135d9 Mon Sep 17 00:00:00 2001 From: Johnny Greco Date: Sun, 15 Feb 2026 22:48:43 -0500 Subject: [PATCH 10/37] refactor: update formatters and tests for new types/reference CLI structure Update command references from `introspect` to 
`types`/`reference`, enhance import display to use `dd.` alias pattern with recommended imports section, and fix singular/plural noun in category headers. --- .../cli/services/introspection/formatters.py | 72 +++++++++++++------ .../cli/commands/test_usage_scenarios.py | 6 +- .../services/introspection/test_formatters.py | 46 ++++++++++-- 3 files changed, 94 insertions(+), 30 deletions(-) diff --git a/packages/data-designer/src/data_designer/cli/services/introspection/formatters.py b/packages/data-designer/src/data_designer/cli/services/introspection/formatters.py index 2a26f8201..91a2b82e6 100644 --- a/packages/data-designer/src/data_designer/cli/services/introspection/formatters.py +++ b/packages/data-designer/src/data_designer/cli/services/introspection/formatters.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations @@ -9,7 +9,8 @@ from data_designer.cli.services.introspection.pydantic_inspector import FieldDetail, ModelSchema _AGENT_GUIDANCE_FOOTER = ( - "Use `data-designer introspect ` for API details.\n" + "Use `data-designer types ` to explore configuration types.\n" + "Use `data-designer reference ` for builder, imports, and overview.\n" "Only read source files directly if these commands don't cover your need." 
) @@ -201,13 +202,13 @@ def format_overview_text(type_counts: dict[str, int], builder_methods: list[Meth lines.append("") lines.append("Quick Start Commands:") - lines.append(" data-designer introspect columns --list") - lines.append(" data-designer introspect columns all") - lines.append(" data-designer introspect columns llm-text") - lines.append(" data-designer introspect samplers category") - lines.append(" data-designer introspect builder") - lines.append(" data-designer introspect interface") - lines.append(" data-designer introspect imports") + lines.append(" data-designer types columns") + lines.append(" data-designer types columns all") + lines.append(" data-designer types columns llm-text") + lines.append(" data-designer types samplers category") + lines.append(" data-designer reference builder") + lines.append(" data-designer reference interface") + lines.append(" data-designer reference imports") return "\n".join(lines) @@ -310,25 +311,44 @@ def format_interface_json( # --------------------------------------------------------------------------- +_CONFIG_MODULE = "data_designer.config" +_INTERFACE_MODULE = "data_designer.interface" +_CONFIG_ALIAS = "dd" + +_RECOMMENDED_IMPORTS = [ + f"import {_CONFIG_MODULE} as {_CONFIG_ALIAS}", + f"from {_INTERFACE_MODULE} import DataDesigner", +] + + def format_imports_text(categories: dict[str, list[dict[str, str]]]) -> str: - """Format categorized import names as readable text with import statements.""" + """Format categorized import names as readable text with access patterns.""" lines: list[str] = [] lines.append("Data Designer Import Reference") lines.append("=" * 30) lines.append("") - for category, entries in sorted(categories.items()): - lines.append(f"{category} ({len(entries)} names):") - by_module: dict[str, list[str]] = {} - for entry in entries: - by_module.setdefault(entry["module"], []).append(entry["name"]) + lines.append("Recommended imports:") + for imp in _RECOMMENDED_IMPORTS: + lines.append(f" 
{imp}") + lines.append("") - for module, names in sorted(by_module.items()): - sorted_names = sorted(names) + for category, entries in sorted(categories.items()): + count = len(entries) + noun = "name" if count == 1 else "names" + lines.append(f"{category} ({count} {noun}):") + + is_config = any(e["module"] == _CONFIG_MODULE for e in entries) + if is_config: + for entry in sorted(entries, key=lambda e: e["name"]): + lines.append(f" {_CONFIG_ALIAS}.{entry['name']}") + else: + sorted_names = sorted(e["name"] for e in entries) if len(sorted_names) <= 3: names_str = ", ".join(sorted_names) - lines.append(f" from {module} import {names_str}") + lines.append(f" from {entries[0]['module']} import {names_str}") else: + module = entries[0]["module"] lines.append(f" from {module} import (") for name in sorted_names: lines.append(f" {name},") @@ -339,5 +359,17 @@ def format_imports_text(categories: dict[str, list[dict[str, str]]]) -> str: def format_imports_json(categories: dict[str, list[dict[str, str]]]) -> dict[str, Any]: - """Return the categories dict as-is for JSON output.""" - return categories + """Return a structured JSON with recommended imports, alias, and categorized names.""" + structured: dict[str, Any] = { + "recommended_imports": _RECOMMENDED_IMPORTS, + "config_alias": _CONFIG_ALIAS, + "categories": {}, + } + for category, entries in sorted(categories.items()): + module = entries[0]["module"] if entries else _CONFIG_MODULE + structured["categories"][category] = { + "module": module, + "access_pattern": f"{_CONFIG_ALIAS}." 
if module == _CONFIG_MODULE else f"from {module} import ", + "names": sorted(e["name"] for e in entries), + } + return structured diff --git a/packages/data-designer/tests/cli/commands/test_usage_scenarios.py b/packages/data-designer/tests/cli/commands/test_usage_scenarios.py index 4a7d2e694..33926ac79 100644 --- a/packages/data-designer/tests/cli/commands/test_usage_scenarios.py +++ b/packages/data-designer/tests/cli/commands/test_usage_scenarios.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations @@ -117,7 +117,7 @@ def test_usage_validate_unsupported_extension_is_actionable(tmp_path: Path) -> N def test_usage_introspect_columns_json_contract() -> None: - result = runner.invoke(app, ["introspect", "columns", "llm-text", "--format", "json"], color=False) + result = runner.invoke(app, ["types", "columns", "llm-text", "--format", "json"], color=False) assert result.exit_code == 0 payload = json.loads(result.output) @@ -127,7 +127,7 @@ def test_usage_introspect_columns_json_contract() -> None: def test_usage_introspect_unknown_type_error_is_actionable() -> None: - result = runner.invoke(app, ["introspect", "columns", "nonexistent"], color=False) + result = runner.invoke(app, ["types", "columns", "nonexistent"], color=False) normalized = _normalize_text(result.output) assert result.exit_code == 1 diff --git a/packages/data-designer/tests/cli/services/introspection/test_formatters.py b/packages/data-designer/tests/cli/services/introspection/test_formatters.py index 291980ba9..f3b3ca1d9 100644 --- a/packages/data-designer/tests/cli/services/introspection/test_formatters.py +++ b/packages/data-designer/tests/cli/services/introspection/test_formatters.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA 
CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations @@ -340,7 +340,7 @@ def test_format_overview_text_contains_quick_start() -> None: type_counts = {"Column types": 1} text = format_overview_text(type_counts, []) assert "Quick Start Commands:" in text - assert "introspect columns --list" in text + assert "types columns" in text # --------------------------------------------------------------------------- @@ -471,23 +471,55 @@ def _make_imports_data() -> dict[str, list[dict[str, str]]]: } -def test_format_imports_text_contains_from_import() -> None: +def test_format_imports_text_contains_recommended_imports() -> None: text = format_imports_text(_make_imports_data()) - assert "from data_designer.config import" in text + assert "Recommended imports:" in text + assert "import data_designer.config as dd" in text + assert "from data_designer.interface import DataDesigner" in text + + +def test_format_imports_text_config_names_use_dd_prefix() -> None: + text = format_imports_text(_make_imports_data()) + assert "dd.LLMTextColumnConfig" in text + assert "dd.SamplerColumnConfig" in text + assert "from data_designer.config import" not in text + + +def test_format_imports_text_interface_uses_from_import() -> None: + text = format_imports_text(_make_imports_data()) + assert "from data_designer.interface import DataDesigner" in text def test_format_imports_text_has_category_headers() -> None: text = format_imports_text(_make_imports_data()) assert "Column Configs (2 names):" in text - assert "Interface (1 names):" in text + assert "Interface (1 name):" in text def test_format_imports_json_structure() -> None: data = _make_imports_data() result = format_imports_json(data) assert isinstance(result, dict) - assert "Column Configs" in result - assert "Interface" in result + assert "recommended_imports" in result + 
assert "config_alias" in result + assert result["config_alias"] == "dd" + assert "categories" in result + assert "Column Configs" in result["categories"] + assert "Interface" in result["categories"] + + +def test_format_imports_json_category_structure() -> None: + data = _make_imports_data() + result = format_imports_json(data) + config_cat = result["categories"]["Column Configs"] + assert config_cat["module"] == "data_designer.config" + assert config_cat["access_pattern"] == "dd." + assert "LLMTextColumnConfig" in config_cat["names"] + + interface_cat = result["categories"]["Interface"] + assert interface_cat["module"] == "data_designer.interface" + assert "from data_designer.interface import " in interface_cat["access_pattern"] + assert "DataDesigner" in interface_cat["names"] # --------------------------------------------------------------------------- From 78950d9175bc6bcbfd8db32847ee1d8c379416b9 Mon Sep 17 00:00:00 2001 From: Johnny Greco Date: Sun, 15 Feb 2026 22:58:45 -0500 Subject: [PATCH 11/37] drop stale review --- docs/reviews/agent-context-cli-review.md | 383 ----------------------- 1 file changed, 383 deletions(-) delete mode 100644 docs/reviews/agent-context-cli-review.md diff --git a/docs/reviews/agent-context-cli-review.md b/docs/reviews/agent-context-cli-review.md deleted file mode 100644 index 7a19e22c6..000000000 --- a/docs/reviews/agent-context-cli-review.md +++ /dev/null @@ -1,383 +0,0 @@ -# Agent Context CLI — Implementation Review - -## Summary - -This review covers the implementation of `data-designer agent-context`, a new CLI command group that exposes DataDesigner's full configuration API surface as agent-friendly introspection commands. The feature was built by porting and improving existing standalone skill scripts (`skill/data-designer/scripts/get_*.py`) into the library itself, expanding coverage from 4 config domains to the full API, and separating data extraction from presentation to support multiple output formats. 
- -**Key stats:** -- 10 new CLI subcommands -- 8 new source files + 1 modified source file -- 6 new test files (113 tests) -- 2 output formats: plain text (YAML-style) and JSON - ---- - -## Architecture - -``` -data-designer agent-context -├── columns [TYPE] # Column types & fields -├── samplers [TYPE] # Sampler types & params -├── validators [TYPE] # Validator types & params -├── processors [TYPE] # Processor types & configs -├── models # ModelConfig, inference params, distributions -├── builder # DataDesignerConfigBuilder method signatures -├── constraints # ScalarInequality, ColumnInequality, operators -├── seeds # SeedConfig, SeedSource types, SamplingStrategy -├── mcp # MCPProvider, LocalStdioMCPProvider, ToolConfig -└── overview # Compact cheatsheet: type counts + builder summary -``` - -The implementation follows a layered architecture with clear separation of concerns: - -``` -commands/agent_context.py # Thin Typer command wrappers - │ - ▼ -controllers/agent_context_controller.py # Orchestration: discovery → inspection → formatting → output - │ - ▼ -services/introspection/ - ├── discovery.py # Dynamic type discovery (8 functions) - ├── pydantic_inspector.py # Pydantic model introspection (dataclass-based) - ├── method_inspector.py # Class method introspection via inspect.signature() - └── formatters.py # Text and JSON output formatters -``` - ---- - -## New Files - -### Source Files - -All paths relative to `packages/data-designer/src/data_designer/cli/`. 
- -| File | Lines | Purpose | -|------|-------|---------| -| `services/introspection/__init__.py` | 64 | Public exports for all introspection modules | -| `services/introspection/pydantic_inspector.py` | 257 | Core Pydantic model introspection with `FieldDetail` and `ModelSchema` dataclasses | -| `services/introspection/discovery.py` | 284 | 8 discovery functions + centralized `DEFAULT_FIELD_DESCRIPTIONS` (108 entries) | -| `services/introspection/method_inspector.py` | 251 | Class method introspection via `inspect.signature()` with Google-style docstring parsing | -| `services/introspection/formatters.py` | 183 | Text (YAML-style) and JSON formatters for all data types | -| `controllers/agent_context_controller.py` | 263 | Controller orchestrating discovery, inspection, formatting, and output | -| `commands/agent_context.py` | 115 | Typer subcommand group with 10 commands | - -### Modified Files - -| File | Change | -|------|--------| -| `main.py` | Added `agent_context` import and `app.add_typer(...)` registration | -| `controllers/__init__.py` | Added `AgentContextController` to exports | - -### Test Files - -All paths relative to `packages/data-designer/tests/cli/`. 
- -| File | Tests | Purpose | -|------|-------|---------| -| `services/introspection/test_pydantic_inspector.py` | 38 | Unit tests for type introspection (field extraction, enum detection, nested models, cycles, depth limits) | -| `services/introspection/test_discovery.py` | 17 | Tests all 8 discovery functions find expected types | -| `services/introspection/test_method_inspector.py` | 11 | Tests docstring parsing, method signature extraction, public/private filtering | -| `services/introspection/test_formatters.py` | 19 | Tests all text and JSON formatters (schemas, methods, type lists, overview) | -| `controllers/test_agent_context_controller.py` | 15 | Controller orchestration tests using `capsys` | -| `commands/test_agent_context_command.py` | 13 | End-to-end CLI integration tests via `typer.testing.CliRunner` | -| **Total** | **113** | | - ---- - -## Key Design Decisions - -### 1. Dataclass-based structured data (not raw tuples) - -The existing skill scripts used raw tuples for field information. The new implementation uses typed dataclasses: - -```python -@dataclass -class FieldDetail: - name: str - type_str: str - description: str - enum_values: list[str] | None = None - nested_schema: ModelSchema | None = None - -@dataclass -class ModelSchema: - class_name: str - description: str - type_key: str | None = None - type_value: str | None = None - fields: list[FieldDetail] = field(default_factory=list) -``` - -This enables clean separation between introspection and formatting, and makes JSON output trivial. - -### 2. Plain text output (no Rich/ANSI) - -All output uses `typer.echo()` producing plain text. Agents parse plain text more reliably than colored/ANSI output. The YAML-style text format is backward-compatible with the existing skill script output. - -### 3. 
Dynamic discovery for extensibility - -Column configs, sampler types, validator types, and processor configs are discovered dynamically by iterating `dir(data_designer.config)` and matching class name patterns. This means new types added to the config package are automatically picked up without code changes. - -### 4. Cycle and depth protection - -Nested model expansion uses a `seen` set (by class name) and `max_depth` parameter (default 3) to prevent infinite recursion from self-referential or deeply nested models. - -### 5. Centralized field descriptions - -All 108 default field descriptions are in a single `DEFAULT_FIELD_DESCRIPTIONS` dict in `discovery.py`, replacing 4 separate per-script copies. - ---- - -## Expected Behavior - -### Common Flags - -For type-based commands (columns, samplers, validators, processors): -- **Positional `TYPE`**: Show details for a specific type (e.g., `llm-text`, `category`) -- **`TYPE` = `all`**: Show details for all types in the category -- **No `TYPE` (no `--list`)**: Show summary table of available types -- **`--list` / `-l`**: Show summary table of available types -- **`--format json` / `-f json`**: JSON output instead of text - -For other commands (models, builder, constraints, seeds, mcp, overview): -- **`--format json` / `-f json`**: JSON output instead of text - -### Command-by-Command Behavior - -#### `data-designer agent-context columns` - -Shows column configuration types discovered from `data_designer.config`. 
- -```bash -# List all column types -$ data-designer agent-context columns --list -column_type config_class ------------ ------------------------- -custom CustomColumnConfig -embedding EmbeddingColumnConfig -expression ExpressionColumnConfig -llm-code LLMCodeColumnConfig -llm-judge LLMJudgeColumnConfig -llm-structured LLMStructuredColumnConfig -llm-text LLMTextColumnConfig -sampler SamplerColumnConfig -seed-dataset SeedDatasetColumnConfig -validation ValidationColumnConfig - -# Show details for a specific type -$ data-designer agent-context columns llm-text -LLMTextColumnConfig: - column_type: llm-text - description: Configuration for LLM-based text generation columns. - fields: - name: - type: str - description: Unique column name in the generated dataset - prompt: - type: str - description: Jinja2 template for the LLM prompt... - ... - -# JSON format -$ data-designer agent-context columns llm-text --format json -{ - "class_name": "LLMTextColumnConfig", - "description": "Configuration for LLM-based text generation columns.", - "column_type": "llm-text", - "fields": [ - {"name": "name", "type": "str", "description": "..."}, - ... - ] -} - -# Unknown type exits with error -$ data-designer agent-context columns nonexistent -Error: Unknown column_type 'nonexistent' -Available types: custom, embedding, expression, ... -``` - -#### `data-designer agent-context samplers` - -Shows sampler types discovered from `SamplerType` enum and their params classes. Type lookups are case-insensitive, and the type value is displayed in uppercase (e.g., `CATEGORY`). - -```bash -$ data-designer agent-context samplers category -CategorySamplerParams: - sampler_type: CATEGORY - description: ... - fields: - values: - type: list[str] - description: List of categorical values to sample from - ... -``` - -#### `data-designer agent-context validators` - -Shows validator types (CODE, REMOTE, LOCAL_CALLABLE) and their params classes. Same pattern as samplers. 
- -#### `data-designer agent-context processors` - -Shows processor types (drop_columns, templated_columns) and their config classes. - -#### `data-designer agent-context models` - -Shows all model-related types: `ModelConfig`, `ChatCompletionInferenceParams`, `EmbeddingInferenceParams`, `ImageInferenceParams`, `ImageContext`, `UniformDistribution`, `ManualDistribution`. - -```bash -$ data-designer agent-context models -# Data Designer Model Configuration Reference -# 7 types - -ChatCompletionInferenceParams: - description: ... - fields: - temperature: - type: float | UniformDistribution | ManualDistribution | None - ... -... -``` - -#### `data-designer agent-context builder` - -Shows `DataDesignerConfigBuilder` method signatures and documentation, extracted via `inspect.signature()` and Google-style docstring parsing. - -```bash -$ data-designer agent-context builder -DataDesignerConfigBuilder Methods: - - add_column(column: ColumnConfig) -> Self - Add a column configuration to the builder. - Parameters: - column: ColumnConfig — The column configuration to add. - ... -``` - -#### `data-designer agent-context constraints` - -Shows constraint types: `ScalarInequalityConstraint`, `ColumnInequalityConstraint`, `InequalityOperator`. - -#### `data-designer agent-context seeds` - -Shows seed dataset types: `SeedConfig`, `SamplingStrategy`, `LocalFileSeedSource`, `HuggingFaceSeedSource`, `DataFrameSeedSource`, `IndexRange`, `PartitionBlock`. - -#### `data-designer agent-context mcp` - -Shows MCP types: `MCPProvider`, `LocalStdioMCPProvider`, `ToolConfig`. - -#### `data-designer agent-context overview` - -Compact API cheatsheet with type counts, builder method summaries, and quick-start commands. 
- -```bash -$ data-designer agent-context overview -Data Designer API Overview -========================== - -Type Counts: - Column types: 10 - Sampler types: 12 - Validator types: 3 - Processor types: 2 - Model configs: 7 - Constraint types: 3 - Seed types: 7 - MCP types: 3 - -Builder Methods (DataDesignerConfigBuilder): - add_column(...) — Add a column configuration to the builder. - add_constraint(...) — Add a constraint to the builder. - ... - -Quick Start Commands: - data-designer agent-context columns --list - data-designer agent-context columns all - data-designer agent-context columns llm-text - data-designer agent-context samplers category - data-designer agent-context builder -``` - ---- - -## Improvements Over Skill Scripts - -| Aspect | Skill Scripts | Agent Context CLI | -|--------|--------------|-------------------| -| **Location** | External (`skill/data-designer/scripts/`) | Library (`data_designer.cli`) | -| **Data structure** | Raw tuples, print directly | Dataclasses (`FieldDetail`, `ModelSchema`, `MethodInfo`, `ParamInfo`) | -| **Output formats** | Text only | Text + JSON (`--format json`) | -| **API coverage** | 4 domains (columns, samplers, validators, processors) | 9 domains (+models, builder, constraints, seeds, MCP, overview) | -| **Field descriptions** | 4 separate dicts | 1 centralized dict (108 entries) | -| **Builder introspection** | None | Full method signatures + docstring parsing | -| **Error handling** | Varies | Consistent: error message + exit code 1 | -| **Testability** | Script-level test | 113 unit + integration tests at every layer | - ---- - -## Test Coverage Summary - -### Unit Tests (85 tests) - -**`test_pydantic_inspector.py` (38 tests):** -- `_is_basemodel_subclass`: 5 tests (subclass, BaseModel itself, str, enum, non-type) -- `_is_enum_subclass`: 4 tests (subclass, Enum itself, str, non-type) -- `_extract_enum_class`: 5 tests (direct, optional, annotated, non-enum, None) -- `extract_nested_basemodel`: 10 tests (direct, 
list, optional, optional-list, dict, annotated, discriminated union, primitive, None, BaseModel itself) -- `format_type`: 3 tests (str, int, optional) -- `get_brief_description`: 2 tests (with/without docstring) -- `get_field_info`: 4 tests (returns FieldDetails, default descriptions, enum values, non-enum) -- `build_model_schema`: 5 tests (basic structure, type key/value, nested expansion, cycle protection, depth limiting) - -**`test_discovery.py` (17 tests):** -- 2 tests per discovery function (returns dict + contains expected keys) for all 8 functions -- 1 extra test for `discover_column_configs` (values are classes with model_fields) - -**`test_method_inspector.py` (11 tests):** -- `_parse_google_docstring_args`: 4 tests (basic, empty, no args section, multiline) -- `inspect_class_methods`: 7 tests (public only, returns MethodInfo, signature content, description, parameters, include private, init included) - -**`test_formatters.py` (19 tests):** -- `format_model_schema_text`: 4 tests (basic, type key, nested, enum values) -- `format_model_schema_json`: 3 tests (basic, type key, nested) -- `format_method_info_text`: 3 tests (basic, with class name, without) -- `format_method_info_json`: 2 tests (basic, multiple methods) -- `format_type_list_text`: 3 tests (basic, alignment, empty) -- `format_overview_text`: 4 tests (header, type counts, builder methods, quick start) - -### Controller Tests (15 tests) - -**`test_agent_context_controller.py`:** -- `show_columns`: 6 tests (list mode, specific type, all, nonexistent exits, JSON format, list JSON) -- `show_overview`: 2 tests (text, JSON) -- `show_samplers`: 2 tests (list, specific) -- `show_models`: 1 test -- `show_builder`: 1 test -- `show_constraints`: 1 test -- `show_seeds`: 1 test -- `show_mcp`: 1 test - -### Integration Tests (13 tests) - -**`test_agent_context_command.py`:** -- Via `typer.testing.CliRunner` against the real `app`: - - `agent-context --help` - - `columns --list`, `columns llm-text`, `columns 
llm-text --format json`, `columns nonexistent` - - `samplers category`, `samplers --list` - - `overview`, `builder`, `models`, `constraints`, `seeds`, `mcp` - ---- - -## Verification Checklist - -- [x] `make check-all` passes (ruff format + lint) -- [x] All 113 new tests pass -- [x] All 670 total project tests pass (113 new + 557 existing) -- [x] SPDX license headers on all files (2025-2026) -- [x] Type annotations on all functions -- [x] Absolute imports only (no relative imports) -- [x] No in-function imports (except `data_designer.config` in discovery functions, which is intentional to avoid circular imports at module load time) -- [x] Plain text output (no Rich/ANSI) for agent compatibility -- [x] JSON output is valid `json.loads()`-parseable -- [x] Error handling: unknown types produce clear error message + exit code 1 -- [x] Backward compatibility: YAML-style text format matches existing skill script output From 52fd3aabe4d95d0fe88d4b104f14f930de559f8b Mon Sep 17 00:00:00 2001 From: Johnny Greco Date: Sun, 15 Feb 2026 23:24:14 -0500 Subject: [PATCH 12/37] refactor: replace hardcoded discovery functions with introspection-based discovery Use _LAZY_IMPORTS as the single source of truth for config exports so discovery functions stay in sync automatically when new types are added. Add _discover_by_modules() helper and make discover_interface_classes() scan interface.__all__ dynamically. Dynamically classify interface classes in the controller using ConfigBase instead of hardcoded lists. Add field descriptions to ModelProvider (newly discovered by dynamic scan). 
--- .../src/data_designer/config/models.py | 16 ++-- .../controllers/introspection_controller.py | 19 ++--- .../cli/services/introspection/discovery.py | 82 +++++++++---------- .../services/introspection/test_discovery.py | 37 +++++++++ 4 files changed, 93 insertions(+), 61 deletions(-) diff --git a/packages/data-designer-config/src/data_designer/config/models.py b/packages/data-designer-config/src/data_designer/config/models.py index 536348907..13ce8d60e 100644 --- a/packages/data-designer-config/src/data_designer/config/models.py +++ b/packages/data-designer-config/src/data_designer/config/models.py @@ -581,12 +581,16 @@ class ModelProvider(ConfigBase): extra_headers: Additional headers to pass in API requests. """ - name: str - endpoint: str - provider_type: str = "openai" - api_key: str | None = None - extra_body: dict[str, Any] | None = None - extra_headers: dict[str, str] | None = None + name: str = Field(description="Name of the model provider.") + endpoint: str = Field(description="API endpoint URL for the provider.") + provider_type: str = Field(default="openai", description="Provider type. Determines the API format to use.") + api_key: str | None = Field(default=None, description="Optional API key for authentication.") + extra_body: dict[str, Any] | None = Field( + default=None, description="Additional parameters to pass in API requests." + ) + extra_headers: dict[str, str] | None = Field( + default=None, description="Additional headers to pass in API requests." 
+ ) def load_model_configs(model_configs: list[ModelConfig] | str | Path) -> list[ModelConfig]: diff --git a/packages/data-designer/src/data_designer/cli/controllers/introspection_controller.py b/packages/data-designer/src/data_designer/cli/controllers/introspection_controller.py index 7cf315fc8..94c67e12a 100644 --- a/packages/data-designer/src/data_designer/cli/controllers/introspection_controller.py +++ b/packages/data-designer/src/data_designer/cli/controllers/introspection_controller.py @@ -37,6 +37,7 @@ ) from data_designer.cli.services.introspection.method_inspector import inspect_class_methods from data_designer.cli.services.introspection.pydantic_inspector import build_model_schema +from data_designer.config.base import ConfigBase from data_designer.config.config_builder import DataDesignerConfigBuilder @@ -142,20 +143,14 @@ def show_interface(self) -> None: """Show DataDesigner, result types, and RunConfig.""" classes = discover_interface_classes() - method_class_names = ["DataDesigner", "DatasetCreationResults", "PreviewResults"] - pydantic_class_names = ["RunConfig"] - classes_with_methods: list[tuple[str, list]] = [] - for name in method_class_names: - cls = classes[name] - methods = inspect_class_methods(cls) - classes_with_methods.append((name, methods)) - pydantic_schemas = [] - for name in pydantic_class_names: - cls = classes[name] - schema = build_model_schema(cls) - pydantic_schemas.append(schema) + for name, cls in classes.items(): + if isinstance(cls, type) and issubclass(cls, ConfigBase): + pydantic_schemas.append(build_model_schema(cls)) + else: + methods = inspect_class_methods(cls) + classes_with_methods.append((name, methods)) if self._format == "json": typer.echo(json.dumps(format_interface_json(classes_with_methods, pydantic_schemas), indent=2)) diff --git a/packages/data-designer/src/data_designer/cli/services/introspection/discovery.py b/packages/data-designer/src/data_designer/cli/services/introspection/discovery.py index 
0ffc9aac6..9eb203804 100644 --- a/packages/data-designer/src/data_designer/cli/services/introspection/discovery.py +++ b/packages/data-designer/src/data_designer/cli/services/introspection/discovery.py @@ -14,8 +14,6 @@ import data_designer.interface as interface_mod from data_designer.config.preview_results import PreviewResults from data_designer.config.run_config import RunConfig -from data_designer.interface.data_designer import DataDesigner -from data_designer.interface.results import DatasetCreationResults def _walk_namespace(package_path: list[str], prefix: str, max_depth: int, current_depth: int) -> list[dict[str, Any]]: @@ -162,22 +160,35 @@ def discover_processor_configs() -> dict[str, type]: return processor_configs +def _discover_by_modules(*module_suffixes: str) -> dict[str, type]: + """Discover config types by filtering _LAZY_IMPORTS on source-module suffix. + + Args: + module_suffixes: One or more module suffixes to match against + (e.g., ``"models"``, ``"seed"``). + + Returns: + Dict mapping class/object names to their resolved types. + """ + lazy_imports: dict[str, tuple[str, str]] = getattr(dd, "_LAZY_IMPORTS", {}) + prefix = "data_designer.config." + result: dict[str, type] = {} + for name, (module_path, _attr) in lazy_imports.items(): + suffix = module_path.removeprefix(prefix) if module_path.startswith(prefix) else module_path + if suffix in module_suffixes: + obj = getattr(dd, name, None) + if obj is not None: + result[name] = obj + return result + + def discover_model_configs() -> dict[str, type]: """Return model-related configuration classes from data_designer.config. Returns: Dict mapping class names to their types. 
""" - - return { - "ModelConfig": dd.ModelConfig, - "ChatCompletionInferenceParams": dd.ChatCompletionInferenceParams, - "EmbeddingInferenceParams": dd.EmbeddingInferenceParams, - "ImageInferenceParams": dd.ImageInferenceParams, - "ImageContext": dd.ImageContext, - "UniformDistribution": dd.UniformDistribution, - "ManualDistribution": dd.ManualDistribution, - } + return _discover_by_modules("models") def discover_constraint_types() -> dict[str, type]: @@ -186,12 +197,7 @@ def discover_constraint_types() -> dict[str, type]: Returns: Dict mapping class names to their types. """ - - return { - "ScalarInequalityConstraint": dd.ScalarInequalityConstraint, - "ColumnInequalityConstraint": dd.ColumnInequalityConstraint, - "InequalityOperator": dd.InequalityOperator, - } + return _discover_by_modules("sampler_constraints") def discover_seed_types() -> dict[str, type]: @@ -200,16 +206,7 @@ def discover_seed_types() -> dict[str, type]: Returns: Dict mapping class names to their types. """ - - return { - "SeedConfig": dd.SeedConfig, - "SamplingStrategy": dd.SamplingStrategy, - "LocalFileSeedSource": dd.LocalFileSeedSource, - "HuggingFaceSeedSource": dd.HuggingFaceSeedSource, - "DataFrameSeedSource": dd.DataFrameSeedSource, - "IndexRange": dd.IndexRange, - "PartitionBlock": dd.PartitionBlock, - } + return _discover_by_modules("seed", "seed_source") def discover_mcp_types() -> dict[str, type]: @@ -218,27 +215,26 @@ def discover_mcp_types() -> dict[str, type]: Returns: Dict mapping class names to their types. """ - - return { - "MCPProvider": dd.MCPProvider, - "LocalStdioMCPProvider": dd.LocalStdioMCPProvider, - "ToolConfig": dd.ToolConfig, - } + return _discover_by_modules("mcp") def discover_interface_classes() -> dict[str, type]: - """Return the key interface-layer classes an agent uses after building a config. + """Discover interface-layer classes plus config-layer types used in the interface workflow. 
+ + Dynamically scans ``data_designer.interface.__all__`` for non-exception classes and + adds ``PreviewResults`` and ``RunConfig`` from the config layer. Returns: - Dict mapping class names to their types for DataDesigner, DatasetCreationResults, - PreviewResults, and RunConfig. + Dict mapping class names to their types. """ - return { - "DataDesigner": DataDesigner, - "DatasetCreationResults": DatasetCreationResults, - "PreviewResults": PreviewResults, - "RunConfig": RunConfig, - } + result: dict[str, type] = {} + for name in getattr(interface_mod, "__all__", []): + obj = getattr(interface_mod, name, None) + if obj is not None and inspect.isclass(obj) and not issubclass(obj, Exception): + result[name] = obj + result["PreviewResults"] = PreviewResults + result["RunConfig"] = RunConfig + return result _MODULE_CATEGORIES: dict[str, str] = { diff --git a/packages/data-designer/tests/cli/services/introspection/test_discovery.py b/packages/data-designer/tests/cli/services/introspection/test_discovery.py index 0d8f72e3f..c628a8c8b 100644 --- a/packages/data-designer/tests/cli/services/introspection/test_discovery.py +++ b/packages/data-designer/tests/cli/services/introspection/test_discovery.py @@ -4,6 +4,7 @@ from __future__ import annotations from data_designer.cli.services.introspection.discovery import ( + _discover_by_modules, discover_column_configs, discover_constraint_types, discover_importable_names, @@ -253,3 +254,39 @@ def test_discover_importable_names_entries_have_name_and_module() -> None: for entry in entries: assert "name" in entry, f"Entry in '{category}' missing 'name'" assert "module" in entry, f"Entry in '{category}' missing 'module'" + + +# --------------------------------------------------------------------------- +# _discover_by_modules +# --------------------------------------------------------------------------- + + +def test_discover_by_modules_returns_only_matching_modules() -> None: + result = _discover_by_modules("models") + import 
data_designer.config as dd + + lazy_imports: dict[str, tuple[str, str]] = getattr(dd, "_LAZY_IMPORTS", {}) + model_names = {name for name, (mod, _) in lazy_imports.items() if mod == "data_designer.config.models"} + assert set(result.keys()) == model_names + + +def test_discover_by_modules_with_multiple_suffixes() -> None: + result = _discover_by_modules("seed", "seed_source") + assert "SeedConfig" in result + assert "LocalFileSeedSource" in result + + +def test_discover_by_modules_unknown_suffix_returns_empty() -> None: + result = _discover_by_modules("nonexistent_module") + assert result == {} + + +# --------------------------------------------------------------------------- +# discover_interface_classes — error class exclusion +# --------------------------------------------------------------------------- + + +def test_discover_interface_classes_excludes_exceptions() -> None: + result = discover_interface_classes() + for name, cls in result.items(): + assert not issubclass(cls, Exception), f"{name} is an Exception subclass and should be excluded" From 35d10e212e1fc7aaf3ff8660ec4d67b86b068c8b Mon Sep 17 00:00:00 2001 From: Johnny Greco Date: Mon, 16 Feb 2026 11:31:02 -0500 Subject: [PATCH 13/37] fix: improve introspection defaults and depth checks - validate negative depth values in code-structure discovery/CLI paths and return actionable errors - preserve machine-typed field defaults in JSON schema output via default/default_factory handling - surface namespace import warnings and include enum values in seed JSON output, with coverage updates across introspection tests --- .../data_designer/cli/commands/reference.py | 8 ++- .../controllers/introspection_controller.py | 11 ++- .../cli/services/introspection/discovery.py | 34 +++++++-- .../cli/services/introspection/formatters.py | 31 ++++++-- .../introspection/pydantic_inspector.py | 52 ++++++++++++-- .../commands/test_introspection_commands.py | 7 ++ .../test_introspection_controller.py | 25 +++++++ 
.../services/introspection/test_discovery.py | 21 ++++++ .../services/introspection/test_formatters.py | 71 +++++++++++++++++++ .../introspection/test_pydantic_inspector.py | 21 +++++- 10 files changed, 260 insertions(+), 21 deletions(-) diff --git a/packages/data-designer/src/data_designer/cli/commands/reference.py b/packages/data-designer/src/data_designer/cli/commands/reference.py index 53c831c35..9e1aef3b7 100644 --- a/packages/data-designer/src/data_designer/cli/commands/reference.py +++ b/packages/data-designer/src/data_designer/cli/commands/reference.py @@ -26,7 +26,13 @@ def overview_command( @reference_app.command(name="code-structure") def code_structure_command( - depth: int = typer.Option(2, "--depth", "-d", help="Max tree depth (default: 2)."), + depth: int = typer.Option( + 2, + "--depth", + "-d", + help="Max tree depth (default: 2). Must be >= 0.", + min=0, + ), output_format: OutputFormat = typer.Option( OutputFormat.TEXT, "--format", "-f", help="Output format: 'text' or 'json'." 
), diff --git a/packages/data-designer/src/data_designer/cli/controllers/introspection_controller.py b/packages/data-designer/src/data_designer/cli/controllers/introspection_controller.py index 94c67e12a..28aab13fe 100644 --- a/packages/data-designer/src/data_designer/cli/controllers/introspection_controller.py +++ b/packages/data-designer/src/data_designer/cli/controllers/introspection_controller.py @@ -177,6 +177,9 @@ def show_imports(self, category: str | None = None) -> None: def show_code_structure(self, depth: int = 2) -> None: """Show the data_designer package structure and install paths.""" + if depth < 0: + typer.echo("Error: --depth must be >= 0.", err=True) + raise typer.Exit(code=1) data = discover_namespace_tree(max_depth=depth) if self._format == "json": typer.echo(json.dumps(format_namespace_json(data), indent=2)) @@ -322,7 +325,13 @@ def _show_all_schemas(self, items: dict[str, type], header_title: str) -> None: schema = build_model_schema(cls) all_schemas.append(format_model_schema_json(schema)) else: - all_schemas.append({"class_name": cls.__name__, "description": cls.__doc__ or ""}) + entry: dict = { + "class_name": cls.__name__, + "description": (cls.__doc__ or "").strip().split("\n")[0], + } + if hasattr(cls, "__members__"): + entry["values"] = [str(member.value) for member in cls] + all_schemas.append(entry) typer.echo(json.dumps(all_schemas, indent=2)) else: seen_schemas: set[str] = set() diff --git a/packages/data-designer/src/data_designer/cli/services/introspection/discovery.py b/packages/data-designer/src/data_designer/cli/services/introspection/discovery.py index 9eb203804..09d3afd74 100644 --- a/packages/data-designer/src/data_designer/cli/services/introspection/discovery.py +++ b/packages/data-designer/src/data_designer/cli/services/introspection/discovery.py @@ -16,8 +16,17 @@ from data_designer.config.run_config import RunConfig -def _walk_namespace(package_path: list[str], prefix: str, max_depth: int, current_depth: int) -> 
list[dict[str, Any]]: - """Recursively walk a namespace package and build a tree of children nodes.""" +def _walk_namespace( + package_path: list[str], + prefix: str, + max_depth: int, + current_depth: int, + import_errors: list[dict[str, str]], +) -> list[dict[str, Any]]: + """Recursively walk a namespace package and build a tree of children nodes. + + Import failures are appended to import_errors as {"module": full_name, "message": str}. + """ if current_depth >= max_depth: return [] @@ -33,9 +42,11 @@ def _walk_namespace(package_path: list[str], prefix: str, max_depth: int, curren try: sub_mod = importlib.import_module(full_name) sub_path = getattr(sub_mod, "__path__", []) - node["children"] = _walk_namespace(list(sub_path), full_name, max_depth, current_depth + 1) - except Exception: - pass + node["children"] = _walk_namespace( + list(sub_path), full_name, max_depth, current_depth + 1, import_errors + ) + except Exception as e: + import_errors.append({"module": full_name, "message": str(e)}) children.append(node) children.sort(key=lambda n: (not n["is_package"], n["name"])) @@ -47,14 +58,23 @@ def discover_namespace_tree(max_depth: int = 2) -> dict[str, Any]: Returns: Dict with ``paths`` (list of install directories) and ``tree`` (nested node dict). + + Raises: + ValueError: If max_depth < 0. 
""" + if max_depth < 0: + raise ValueError("max_depth must be >= 0.") paths = list(data_designer.__path__) + import_errors: list[dict[str, str]] = [] tree: dict[str, Any] = { "name": "data_designer", "is_package": True, - "children": _walk_namespace(paths, "data_designer", max_depth, 0), + "children": _walk_namespace(paths, "data_designer", max_depth, 0, import_errors), } - return {"paths": paths, "tree": tree} + result: dict[str, Any] = {"paths": paths, "tree": tree} + if import_errors: + result["import_errors"] = import_errors + return result def discover_column_configs() -> dict[str, type]: diff --git a/packages/data-designer/src/data_designer/cli/services/introspection/formatters.py b/packages/data-designer/src/data_designer/cli/services/introspection/formatters.py index 91a2b82e6..0320054fe 100644 --- a/packages/data-designer/src/data_designer/cli/services/introspection/formatters.py +++ b/packages/data-designer/src/data_designer/cli/services/introspection/formatters.py @@ -24,7 +24,11 @@ def _format_field_text(field: FieldDetail, indent: int = 4, seen_schemas: set[st pad = " " * indent lines: list[str] = [] header = f"{pad}{field.name}: {field.type_str}" - if field.default is not None: + if field.default_factory: + header += f" = {field.default_factory}()" + elif field.has_literal_default(): + header += f" = {field.default_json!r}" + elif field.default: header += f" = {field.default}" if field.required: header += " [required]" @@ -68,13 +72,21 @@ def format_model_schema_text(schema: ModelSchema, indent: int = 0, seen_schemas: def _format_field_json(field: FieldDetail) -> dict: - """Convert a FieldDetail to a JSON-serializable dict, recursing into nested schemas.""" + """Convert a FieldDetail to a JSON-serializable dict, recursing into nested schemas. + + Emits machine-typed defaults: "default" (native JSON value, including null) when + the field has a literal default, and "default_factory" (string) when it uses a factory. 
+ """ result: dict = { "name": field.name, "type": field.type_str, "required": field.required, } - if field.default is not None: + if field.default_factory: + result["default_factory"] = field.default_factory + elif field.has_literal_default(): + result["default"] = field.default_json + elif field.default is not None: result["default"] = field.default if field.description: result["description"] = field.description @@ -256,13 +268,24 @@ def format_namespace_text(data: dict[str, Any]) -> str: for i, child in enumerate(children): lines.extend(_render_tree_lines(child, prefix="", is_last=(i == len(children) - 1))) + import_errors = data.get("import_errors", []) + if import_errors: + lines.append("") + lines.append("Warnings (submodules that could not be imported):") + for err in import_errors: + lines.append(f" {err.get('module', '?')}: {err.get('message', '')}") + lines.append("") + lines.append("") lines.append(_AGENT_GUIDANCE_FOOTER) return "\n".join(lines) def format_namespace_json(data: dict[str, Any]) -> dict[str, Any]: - """Return the namespace tree dict as-is for JSON output.""" + """Return the namespace tree dict as-is for JSON output. + + When discovery collected import_errors, they are included under "import_errors". 
+ """ return data diff --git a/packages/data-designer/src/data_designer/cli/services/introspection/pydantic_inspector.py b/packages/data-designer/src/data_designer/cli/services/introspection/pydantic_inspector.py index e9287659a..f8ae68e80 100644 --- a/packages/data-designer/src/data_designer/cli/services/introspection/pydantic_inspector.py +++ b/packages/data-designer/src/data_designer/cli/services/introspection/pydantic_inspector.py @@ -13,6 +13,8 @@ from pydantic import BaseModel from pydantic_core import PydanticUndefined +_UNDEFINED: Any = object() + @dataclass class FieldDetail: @@ -23,10 +25,16 @@ class FieldDetail: description: str required: bool = True default: str | None = None + default_json: Any = _UNDEFINED + default_factory: str | None = None enum_values: list[str] | None = None constraints: dict[str, Any] | None = None nested_schema: ModelSchema | None = None + def has_literal_default(self) -> bool: + """True if this field has a literal default value (including None).""" + return self.default_json is not _UNDEFINED + @dataclass class ModelSchema: @@ -181,6 +189,27 @@ def _extract_constraints(field_info: Any) -> dict[str, Any] | None: return constraints or None +def _default_to_json(value: Any) -> Any: + """Convert a Pydantic default value to a JSON-serializable value. + + Returns the value unchanged if it is already JSON-serializable (bool, int, float, + str, None, list, dict with JSON-serializable values). Enum members are converted + to their .value. Other types are returned as a string representation for stability. + """ + if value is None: + return None + if isinstance(value, (bool, int, float, str)): + return value + if isinstance(value, Enum): + return value.value + if isinstance(value, (list, dict)): + try: + return value + except Exception: + pass + return repr(value) + + def get_field_info(cls: type) -> list[FieldDetail]: """Extract field information from a Pydantic model. 
@@ -201,12 +230,18 @@ def get_field_info(cls: type) -> list[FieldDetail]: required = field_info.is_required() - default: str | None = None + default_json: Any = _UNDEFINED + default_factory_name: str | None = None + default_display: str | None = None if not required: - if field_info.default is not PydanticUndefined and field_info.default is not None: - default = repr(field_info.default) - elif field_info.default_factory is not None: - default = f"{field_info.default_factory.__name__}()" + if field_info.default_factory is not None: + default_factory_name = getattr( + field_info.default_factory, "__name__", repr(field_info.default_factory) + ) + elif field_info.default is not PydanticUndefined: + default_json = _default_to_json(field_info.default) + if default_json is not _UNDEFINED: + default_display = repr(field_info.default) enum_cls = _extract_enum_class(field_info.annotation) enum_values: list[str] | None = None @@ -215,13 +250,18 @@ def get_field_info(cls: type) -> list[FieldDetail]: constraints = _extract_constraints(field_info) + if default_display is None and default_factory_name is not None: + default_display = f"{default_factory_name}()" + fields.append( FieldDetail( name=field_name, type_str=type_str, description=description, required=required, - default=default, + default=default_display, + default_json=default_json, + default_factory=default_factory_name, enum_values=enum_values, constraints=constraints, nested_schema=None, diff --git a/packages/data-designer/tests/cli/commands/test_introspection_commands.py b/packages/data-designer/tests/cli/commands/test_introspection_commands.py index e054e0ecd..ae0271c20 100644 --- a/packages/data-designer/tests/cli/commands/test_introspection_commands.py +++ b/packages/data-designer/tests/cli/commands/test_introspection_commands.py @@ -219,6 +219,13 @@ def test_code_structure_shows_agent_guidance() -> None: assert "Only read source files directly" in result.output +def 
test_code_structure_negative_depth_exits_with_error() -> None: + """Invalid --depth < 0 is rejected with actionable error.""" + result = runner.invoke(app, ["reference", "code-structure", "--depth", "-1"], color=False) + assert result.exit_code != 0 + assert "depth" in result.output.lower() or "0" in result.output + + # --------------------------------------------------------------------------- # interface # --------------------------------------------------------------------------- diff --git a/packages/data-designer/tests/cli/controllers/test_introspection_controller.py b/packages/data-designer/tests/cli/controllers/test_introspection_controller.py index 2b4c6b532..9c347bc11 100644 --- a/packages/data-designer/tests/cli/controllers/test_introspection_controller.py +++ b/packages/data-designer/tests/cli/controllers/test_introspection_controller.py @@ -313,6 +313,31 @@ def test_show_code_structure_json(capsys: pytest.CaptureFixture[str]) -> None: assert data["tree"]["name"] == "data_designer" +def test_show_code_structure_negative_depth_exits(capsys: pytest.CaptureFixture[str]) -> None: + """Invalid depth < 0 should exit with code 1 and an actionable message.""" + import click.exceptions + + controller = IntrospectionController(output_format="text") + with pytest.raises(click.exceptions.Exit): + controller.show_code_structure(depth=-1) + captured = capsys.readouterr() + assert "depth" in captured.err.lower() or ">= 0" in captured.err + + +def test_show_seeds_json_includes_enum_values(capsys: pytest.CaptureFixture[str]) -> None: + """JSON schema list for seeds includes 'values' for enum-only types (e.g. 
SamplingStrategy).""" + controller = IntrospectionController(output_format="json") + controller.show_seeds() + captured = capsys.readouterr() + data = json.loads(captured.out) + assert isinstance(data, list) + enum_entries = [e for e in data if e.get("class_name") == "SamplingStrategy"] + assert len(enum_entries) >= 1 + assert "values" in enum_entries[0] + assert isinstance(enum_entries[0]["values"], list) + assert "ordered" in enum_entries[0]["values"] or "shuffle" in enum_entries[0]["values"] + + # --------------------------------------------------------------------------- # _match_category # --------------------------------------------------------------------------- diff --git a/packages/data-designer/tests/cli/services/introspection/test_discovery.py b/packages/data-designer/tests/cli/services/introspection/test_discovery.py index c628a8c8b..f38a66512 100644 --- a/packages/data-designer/tests/cli/services/introspection/test_discovery.py +++ b/packages/data-designer/tests/cli/services/introspection/test_discovery.py @@ -3,6 +3,8 @@ from __future__ import annotations +import pytest + from data_designer.cli.services.introspection.discovery import ( _discover_by_modules, discover_column_configs, @@ -203,6 +205,25 @@ def test_discover_namespace_tree_children_have_correct_structure() -> None: assert isinstance(child["children"], list) +def test_discover_namespace_tree_negative_depth_raises() -> None: + """Invalid max_depth < 0 raises ValueError with actionable message.""" + with pytest.raises(ValueError, match="max_depth must be >= 0"): + discover_namespace_tree(max_depth=-1) + + +def test_discover_namespace_tree_import_errors_structure() -> None: + """When present, import_errors is a list of dicts with module and message.""" + result = discover_namespace_tree() + if "import_errors" in result: + errors = result["import_errors"] + assert isinstance(errors, list) + for err in errors: + assert "module" in err + assert "message" in err + assert isinstance(err["module"], 
str) + assert isinstance(err["message"], str) + + # --------------------------------------------------------------------------- # discover_interface_classes # --------------------------------------------------------------------------- diff --git a/packages/data-designer/tests/cli/services/introspection/test_formatters.py b/packages/data-designer/tests/cli/services/introspection/test_formatters.py index f3b3ca1d9..00199ffc0 100644 --- a/packages/data-designer/tests/cli/services/introspection/test_formatters.py +++ b/packages/data-designer/tests/cli/services/introspection/test_formatters.py @@ -174,6 +174,58 @@ def test_format_model_schema_json_required_field_no_default() -> None: assert "default" not in f +def test_format_model_schema_json_native_default_value() -> None: + """JSON output uses native types for default (e.g. int, bool, null).""" + field = FieldDetail( + name="count", + type_str="int", + description="Count", + required=False, + default_json=42, + default_factory=None, + ) + schema = _make_schema(fields=[field]) + result = format_model_schema_json(schema) + f = result["fields"][0] + assert f["required"] is False + assert f["default"] == 42 + assert "default_factory" not in f + + +def test_format_model_schema_json_default_factory_key() -> None: + """JSON output includes default_factory when the field uses a factory.""" + field = FieldDetail( + name="items", + type_str="list[str]", + description="Items", + required=False, + default_factory="list", + ) + schema = _make_schema(fields=[field]) + result = format_model_schema_json(schema) + f = result["fields"][0] + assert f["required"] is False + assert f["default_factory"] == "list" + assert "default" not in f + + +def test_format_model_schema_json_explicit_null_default() -> None: + """JSON output uses null for explicit None default.""" + field = FieldDetail( + name="optional", + type_str="str | None", + description="Optional", + required=False, + default_json=None, + default_factory=None, + ) + schema = 
_make_schema(fields=[field]) + result = format_model_schema_json(schema) + f = result["fields"][0] + assert f["required"] is False + assert f["default"] is None + + def test_format_model_schema_json_includes_constraints() -> None: field = FieldDetail( name="score", @@ -407,6 +459,25 @@ def test_format_namespace_json_returns_passthrough() -> None: assert result is data +def test_format_namespace_text_shows_import_warnings_when_present() -> None: + data = _make_namespace_data() + data["import_errors"] = [ + {"module": "data_designer.fake_submodule", "message": "No module named 'fake'"}, + ] + text = format_namespace_text(data) + assert "Warnings" in text + assert "data_designer.fake_submodule" in text + assert "No module named" in text + + +def test_format_namespace_json_includes_import_errors_when_present() -> None: + data = _make_namespace_data() + data["import_errors"] = [{"module": "data_designer.foo", "message": "err"}] + result = format_namespace_json(data) + assert "import_errors" in result + assert result["import_errors"] == [{"module": "data_designer.foo", "message": "err"}] + + # --------------------------------------------------------------------------- # Interface formatters # --------------------------------------------------------------------------- diff --git a/packages/data-designer/tests/cli/services/introspection/test_pydantic_inspector.py b/packages/data-designer/tests/cli/services/introspection/test_pydantic_inspector.py index f5ab0be5c..9f37e92e3 100644 --- a/packages/data-designer/tests/cli/services/introspection/test_pydantic_inspector.py +++ b/packages/data-designer/tests/cli/services/introspection/test_pydantic_inspector.py @@ -296,11 +296,28 @@ def test_get_field_info_default_factory() -> None: def test_get_field_info_none_default_not_shown() -> None: - """Fields with default=None (like SelfRefModel.child) should have default=None in FieldDetail.""" + """Fields with default=None (like SelfRefModel.child) have default_json=None in 
FieldDetail.""" fields = get_field_info(SelfRefModel) child = next(f for f in fields if f.name == "child") assert child.required is False - assert child.default is None + assert child.default_json is None + + +def test_get_field_info_optional_field_default_json_native() -> None: + """Optional scalar defaults are stored as native default_json for machine consumption.""" + fields = get_field_info(RequiredFieldModel) + opt = next(f for f in fields if f.name == "optional_name") + assert opt.required is False + assert opt.default_json == "default_val" + assert opt.default_factory is None + + +def test_get_field_info_default_factory_set() -> None: + """Fields with default_factory set have default_factory name and default_json undefined.""" + fields = get_field_info(OuterModel) + nested = next(f for f in fields if f.name == "nested") + assert nested.required is False + assert nested.default_factory == "InnerModel" # --------------------------------------------------------------------------- From f7fa98dc136a39a2e0a9e006a788e4bc478f613e Mon Sep 17 00:00:00 2001 From: Johnny Greco Date: Mon, 16 Feb 2026 15:58:18 -0500 Subject: [PATCH 14/37] fix: align enum output across text/json and remove dead try/except - Use enum .value in text path of _show_all_schemas for parity with JSON - Simplify _default_to_json list/dict branch (remove dead try/except) - Add test_show_seeds_text_uses_enum_values_not_names for format parity --- .../cli/controllers/introspection_controller.py | 2 +- .../cli/services/introspection/pydantic_inspector.py | 5 +---- .../cli/controllers/test_introspection_controller.py | 9 +++++++++ 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/packages/data-designer/src/data_designer/cli/controllers/introspection_controller.py b/packages/data-designer/src/data_designer/cli/controllers/introspection_controller.py index 28aab13fe..d0a330fbc 100644 --- a/packages/data-designer/src/data_designer/cli/controllers/introspection_controller.py +++ 
b/packages/data-designer/src/data_designer/cli/controllers/introspection_controller.py @@ -346,7 +346,7 @@ def _show_all_schemas(self, items: dict[str, type], header_title: str) -> None: if cls.__doc__: lines.append(f" description: {cls.__doc__.strip().split(chr(10))[0]}") if hasattr(cls, "__members__"): - members = [m.name for m in cls] + members = [str(m.value) for m in cls] lines.append(f" values: [{', '.join(members)}]") lines.append("") typer.echo("\n".join(lines)) diff --git a/packages/data-designer/src/data_designer/cli/services/introspection/pydantic_inspector.py b/packages/data-designer/src/data_designer/cli/services/introspection/pydantic_inspector.py index f8ae68e80..ed7848e47 100644 --- a/packages/data-designer/src/data_designer/cli/services/introspection/pydantic_inspector.py +++ b/packages/data-designer/src/data_designer/cli/services/introspection/pydantic_inspector.py @@ -203,10 +203,7 @@ def _default_to_json(value: Any) -> Any: if isinstance(value, Enum): return value.value if isinstance(value, (list, dict)): - try: - return value - except Exception: - pass + return value return repr(value) diff --git a/packages/data-designer/tests/cli/controllers/test_introspection_controller.py b/packages/data-designer/tests/cli/controllers/test_introspection_controller.py index 9c347bc11..578b58e71 100644 --- a/packages/data-designer/tests/cli/controllers/test_introspection_controller.py +++ b/packages/data-designer/tests/cli/controllers/test_introspection_controller.py @@ -338,6 +338,15 @@ def test_show_seeds_json_includes_enum_values(capsys: pytest.CaptureFixture[str] assert "ordered" in enum_entries[0]["values"] or "shuffle" in enum_entries[0]["values"] +def test_show_seeds_text_uses_enum_values_not_names(capsys: pytest.CaptureFixture[str]) -> None: + """Text output for seeds uses enum .value (e.g. 
ordered, shuffle) for parity with JSON.""" + controller = IntrospectionController(output_format="text") + controller.show_seeds() + captured = capsys.readouterr() + assert "SamplingStrategy" in captured.out + assert "values: [ordered, shuffle]" in captured.out or "values: [shuffle, ordered]" in captured.out + + # --------------------------------------------------------------------------- # _match_category # --------------------------------------------------------------------------- From 93e0a617b48c50391f18dee87a1417b6bec71519 Mon Sep 17 00:00:00 2001 From: Johnny Greco Date: Mon, 16 Feb 2026 16:19:55 -0500 Subject: [PATCH 15/37] fix: surface namespace import failures in debug logs Log subpackage import exceptions during namespace discovery so skipped modules are traceable during development without changing best-effort traversal behavior. --- .../src/data_designer/cli/services/introspection/discovery.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/packages/data-designer/src/data_designer/cli/services/introspection/discovery.py b/packages/data-designer/src/data_designer/cli/services/introspection/discovery.py index 09d3afd74..9d8e1d5bc 100644 --- a/packages/data-designer/src/data_designer/cli/services/introspection/discovery.py +++ b/packages/data-designer/src/data_designer/cli/services/introspection/discovery.py @@ -5,6 +5,7 @@ import importlib import inspect +import logging import pkgutil from enum import Enum from typing import Any, Literal, get_args, get_origin @@ -15,6 +16,8 @@ from data_designer.config.preview_results import PreviewResults from data_designer.config.run_config import RunConfig +logger = logging.getLogger(__name__) + def _walk_namespace( package_path: list[str], @@ -46,6 +49,7 @@ def _walk_namespace( list(sub_path), full_name, max_depth, current_depth + 1, import_errors ) except Exception as e: + logger.debug("Failed to import %s during namespace discovery.", full_name, exc_info=True) import_errors.append({"module": full_name, 
"message": str(e)}) children.append(node) From 2ae47e74336edf90689ac37f268fa3c2f93e1c76 Mon Sep 17 00:00:00 2001 From: Johnny Greco Date: Mon, 16 Feb 2026 16:22:10 -0500 Subject: [PATCH 16/37] sort --- .../cli/services/introspection/__init__.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/packages/data-designer/src/data_designer/cli/services/introspection/__init__.py b/packages/data-designer/src/data_designer/cli/services/introspection/__init__.py index f09c64909..93748bfd4 100644 --- a/packages/data-designer/src/data_designer/cli/services/introspection/__init__.py +++ b/packages/data-designer/src/data_designer/cli/services/introspection/__init__.py @@ -47,11 +47,6 @@ ) __all__ = [ - "FieldDetail", - "MethodInfo", - "ModelSchema", - "ParamInfo", - "PropertyInfo", "build_model_schema", "discover_column_configs", "discover_constraint_types", @@ -64,6 +59,7 @@ "discover_sampler_types", "discover_seed_types", "discover_validator_types", + "FieldDetail", "format_imports_json", "format_imports_text", "format_interface_json", @@ -81,4 +77,8 @@ "get_field_info", "inspect_class_methods", "inspect_class_properties", + "MethodInfo", + "ModelSchema", + "ParamInfo", + "PropertyInfo", ] From 37a3c6c8d3bb8d451b922105d6e59750fae4919b Mon Sep 17 00:00:00 2001 From: Johnny Greco Date: Mon, 16 Feb 2026 17:27:59 -0500 Subject: [PATCH 17/37] refactor introspection discovery and normalize typed schema output --- .../controllers/introspection_controller.py | 126 ++++++----- .../cli/services/introspection/discovery.py | 200 +++++++++++------- .../cli/services/introspection/formatters.py | 10 +- .../introspection/pydantic_inspector.py | 21 +- .../commands/test_introspection_commands.py | 53 ++++- .../test_introspection_controller.py | 20 +- .../services/introspection/test_formatters.py | 25 +++ .../introspection/test_pydantic_inspector.py | 26 +++ 8 files changed, 346 insertions(+), 135 deletions(-) diff --git 
a/packages/data-designer/src/data_designer/cli/controllers/introspection_controller.py b/packages/data-designer/src/data_designer/cli/controllers/introspection_controller.py index d0a330fbc..cf6d5657e 100644 --- a/packages/data-designer/src/data_designer/cli/controllers/introspection_controller.py +++ b/packages/data-designer/src/data_designer/cli/controllers/introspection_controller.py @@ -4,6 +4,8 @@ from __future__ import annotations import json +from collections.abc import Callable +from dataclasses import dataclass from enum import Enum import typer @@ -48,6 +50,18 @@ class OutputFormat(str, Enum): JSON = "json" +@dataclass(frozen=True) +class _TypedCommandSpec: + """Configuration for typed introspection commands.""" + + discover_items: Callable[[], dict[str, type]] + type_key: str + type_label: str + class_label: str + header_title: str + case_insensitive: bool = False + + class IntrospectionController: """Controller for introspect CLI commands. @@ -55,61 +69,58 @@ class IntrospectionController: introspect subcommands. 
""" - def __init__(self, output_format: str = "text") -> None: - self._format = output_format - - def show_columns(self, type_name: str | None) -> None: - """Show column configuration types.""" - items = discover_column_configs() - self._show_typed_items( - items=items, - type_name=type_name, + _TYPED_COMMAND_SPECS: dict[str, _TypedCommandSpec] = { + "columns": _TypedCommandSpec( + discover_items=discover_column_configs, type_key="column_type", type_label="column_type", class_label="config_class", header_title="Data Designer Column Types Reference", - ) - - def show_samplers(self, type_name: str | None) -> None: - """Show sampler types and their param classes.""" - items = discover_sampler_types() - self._show_typed_items( - items=items, - type_name=type_name, + ), + "samplers": _TypedCommandSpec( + discover_items=discover_sampler_types, type_key="sampler_type", type_label="sampler_type", class_label="params_class", header_title="Data Designer Sampler Types Reference", case_insensitive=True, - uppercase_value=True, - ) - - def show_validators(self, type_name: str | None) -> None: - """Show validator types and their param classes.""" - items = discover_validator_types() - self._show_typed_items( - items=items, - type_name=type_name, + ), + "validators": _TypedCommandSpec( + discover_items=discover_validator_types, type_key="validator_type", type_label="validator_type", class_label="params_class", header_title="Data Designer Validator Types Reference", case_insensitive=True, - uppercase_value=True, - ) - - def show_processors(self, type_name: str | None) -> None: - """Show processor types and their config classes.""" - items = discover_processor_configs() - self._show_typed_items( - items=items, - type_name=type_name, + ), + "processors": _TypedCommandSpec( + discover_items=discover_processor_configs, type_key="processor_type", type_label="processor_type", class_label="config_class", header_title="Data Designer Processor Types Reference", case_insensitive=True, - ) + 
), + } + + def __init__(self, output_format: str = "text") -> None: + self._format = output_format + + def show_columns(self, type_name: str | None) -> None: + """Show column configuration types.""" + self._show_typed_command(command_name="columns", type_name=type_name) + + def show_samplers(self, type_name: str | None) -> None: + """Show sampler types and their param classes.""" + self._show_typed_command(command_name="samplers", type_name=type_name) + + def show_validators(self, type_name: str | None) -> None: + """Show validator types and their param classes.""" + self._show_typed_command(command_name="validators", type_name=type_name) + + def show_processors(self, type_name: str | None) -> None: + """Show processor types and their config classes.""" + self._show_typed_command(command_name="processors", type_name=type_name) def show_models(self) -> None: """Show model configuration types.""" @@ -210,6 +221,19 @@ def show_overview(self) -> None: else: typer.echo(format_overview_text(type_counts, builder_methods)) + def _show_typed_command(self, command_name: str, type_name: str | None) -> None: + """Resolve a typed-command spec and render it.""" + spec = self._TYPED_COMMAND_SPECS[command_name] + self._show_typed_items( + items=spec.discover_items(), + type_name=type_name, + type_key=spec.type_key, + type_label=spec.type_label, + class_label=spec.class_label, + header_title=spec.header_title, + case_insensitive=spec.case_insensitive, + ) + @staticmethod def _match_category(query: str, keys: list[str]) -> str | None: """Match a user query to a category key using progressive fuzzy matching. 
@@ -256,7 +280,6 @@ def _show_typed_items( class_label: str, header_title: str, case_insensitive: bool = False, - uppercase_value: bool = False, ) -> None: """Shared logic for type-based commands (columns, samplers, validators, processors).""" if type_name is None: @@ -266,20 +289,28 @@ def _show_typed_items( typer.echo(format_type_list_text(items, type_label, class_label)) return - if type_name == "all": - self._show_all_typed(items, type_key, header_title, uppercase_value) + if type_name.lower() == "all": + self._show_all_typed(items, type_key, header_title) return - lookup = type_name.lower() if case_insensitive else type_name - if lookup not in items: + canonical_value: str | None = None + cls: type | None = None + if case_insensitive: + matched = {k.lower(): (k, v) for k, v in items.items()}.get(type_name.lower()) + if matched is not None: + canonical_value, cls = matched + else: + if type_name in items: + canonical_value = type_name + cls = items[type_name] + + if canonical_value is None or cls is None: available = ", ".join(sorted(items.keys())) typer.echo(f"Error: Unknown {type_key} '{type_name}'", err=True) typer.echo(f"Available types: {available}", err=True) raise typer.Exit(code=1) - cls = items[lookup] - display_value = lookup.upper() if uppercase_value else lookup - schema = build_model_schema(cls, type_key=type_key, type_value=display_value) + schema = build_model_schema(cls, type_key=type_key, type_value=canonical_value) if self._format == "json": typer.echo(json.dumps(format_model_schema_json(schema), indent=2)) @@ -291,7 +322,6 @@ def _show_all_typed( items: dict[str, type], type_key: str, header_title: str, - uppercase_value: bool = False, ) -> None: """Show all types for a typed command.""" sorted_types = sorted(items.keys()) @@ -300,8 +330,7 @@ def _show_all_typed( all_schemas = [] for type_value in sorted_types: cls = items[type_value] - display_value = type_value.upper() if uppercase_value else type_value - schema = build_model_schema(cls, 
type_key=type_key, type_value=display_value) + schema = build_model_schema(cls, type_key=type_key, type_value=type_value) all_schemas.append(format_model_schema_json(schema)) typer.echo(json.dumps(all_schemas, indent=2)) else: @@ -309,8 +338,7 @@ def _show_all_typed( lines = [f"# {header_title}", f"# {len(sorted_types)} types discovered from data_designer.config", ""] for type_value in sorted_types: cls = items[type_value] - display_value = type_value.upper() if uppercase_value else type_value - schema = build_model_schema(cls, type_key=type_key, type_value=display_value) + schema = build_model_schema(cls, type_key=type_key, type_value=type_value) lines.append(format_model_schema_text(schema, seen_schemas=seen_schemas)) lines.append("") typer.echo("\n".join(lines)) diff --git a/packages/data-designer/src/data_designer/cli/services/introspection/discovery.py b/packages/data-designer/src/data_designer/cli/services/introspection/discovery.py index 9d8e1d5bc..fcab153a6 100644 --- a/packages/data-designer/src/data_designer/cli/services/introspection/discovery.py +++ b/packages/data-designer/src/data_designer/cli/services/introspection/discovery.py @@ -19,6 +19,112 @@ logger = logging.getLogger(__name__) +def _extract_literal_discriminator_value(annotation: Any) -> str | None: + """Extract the first literal discriminator value from a type annotation. + + Supports ``Literal["value"]`` and ``Literal[SomeEnum.MEMBER]``. + Returns ``None`` when the annotation is not a literal discriminator. + """ + if get_origin(annotation) is not Literal: + return None + + args = get_args(annotation) + if not args: + return None + + value = args[0] + if isinstance(value, Enum): + return str(value.value) + return str(value) + + +def _discover_configs_by_discriminator( + class_name_suffix: str, + discriminator_field: str, + exclude_class_names: set[str] | None = None, +) -> dict[str, type]: + """Discover config classes whose discriminator field is a Literal value. 
+
+    Args:
+        class_name_suffix: Class-name suffix to select candidate classes.
+        discriminator_field: Pydantic field name containing the discriminator.
+        exclude_class_names: Optional set of class names to skip.
+
+    Returns:
+        Dict mapping discriminator values to config classes.
+    """
+    excluded = exclude_class_names or set()
+    discovered: dict[str, type] = {}
+
+    for name in dir(dd):
+        if name in excluded or not name.endswith(class_name_suffix):
+            continue
+
+        obj = getattr(dd, name)
+        if not (inspect.isclass(obj) and hasattr(obj, "model_fields")):
+            continue
+        if discriminator_field not in obj.model_fields:
+            continue
+
+        annotation = obj.model_fields[discriminator_field].annotation
+        discriminator_value = _extract_literal_discriminator_value(annotation)
+        if discriminator_value is not None:
+            discovered[discriminator_value] = obj
+
+    return discovered
+
+
+def _discover_params_by_discriminator(
+    params_class_suffix: str,
+    discriminator_field: str,
+    enum_name: str,
+) -> dict[str, type]:
+    """Discover params classes keyed by their literal discriminator value.
+
+    Args:
+        params_class_suffix: Class-name suffix to select params classes.
+        discriminator_field: Field name that stores the literal discriminator.
+        enum_name: Name of the enum in data_designer.config whose member values drive fallback name matching.
+    Returns:
+        Dict mapping discriminator values to params classes.
+ """ + discovered: dict[str, type] = {} + normalized_name_map: dict[str, type] = {} + + for name in dir(dd): + if not name.endswith(params_class_suffix): + continue + + obj = getattr(dd, name) + if not (inspect.isclass(obj) and hasattr(obj, "model_fields")): + continue + + if discriminator_field in obj.model_fields: + annotation = obj.model_fields[discriminator_field].annotation + discriminator_value = _extract_literal_discriminator_value(annotation) + if discriminator_value is not None: + discovered[discriminator_value] = obj + continue + + normalized_name = name.removesuffix(params_class_suffix).replace("_", "").lower() + normalized_name_map[normalized_name] = obj + + enum_cls = getattr(dd, enum_name, None) + if enum_cls is None or not (inspect.isclass(enum_cls) and issubclass(enum_cls, Enum)): + return discovered + + for member in enum_cls: + value = str(member.value) + if value in discovered: + continue + normalized_value = value.replace("_", "").lower() + params_cls = normalized_name_map.get(normalized_value) + if params_cls is not None: + discovered[value] = params_cls + + return discovered + + def _walk_namespace( package_path: list[str], prefix: str, @@ -87,79 +193,36 @@ def discover_column_configs() -> dict[str, type]: Returns: Dict mapping column_type literal values (e.g., 'llm-text') to their config classes. 
""" - - column_configs: dict[str, type] = {} - for name in dir(dd): - if name.endswith("ColumnConfig"): - obj = getattr(dd, name) - if inspect.isclass(obj) and hasattr(obj, "model_fields"): - if "column_type" in obj.model_fields: - annotation = obj.model_fields["column_type"].annotation - if get_origin(annotation) is Literal: - args = get_args(annotation) - if args: - column_configs[args[0]] = obj - return column_configs + return _discover_configs_by_discriminator( + class_name_suffix="ColumnConfig", + discriminator_field="column_type", + ) def discover_sampler_types() -> dict[str, type]: - """Dynamically discover all sampler types and their param classes from data_designer.config. + """Dynamically discover sampler types and params classes from data_designer.config. Returns: Dict mapping sampler type names (e.g., 'category') to their params classes. """ - - sampler_type_enum = getattr(dd, "SamplerType", None) - if sampler_type_enum is None or not issubclass(sampler_type_enum, Enum): - return {} - - params_classes: dict[str, type] = {} - for name in dir(dd): - if name.endswith("SamplerParams"): - obj = getattr(dd, name) - if inspect.isclass(obj) and hasattr(obj, "model_fields"): - normalized = name.replace("SamplerParams", "").lower() - params_classes[normalized] = obj - - sampler_types: dict[str, type] = {} - for member in sampler_type_enum: - sampler_name = member.name.lower() - normalized_name = sampler_name.replace("_", "") - params_cls = params_classes.get(normalized_name) - if params_cls is not None: - sampler_types[sampler_name] = params_cls - - return sampler_types + return _discover_params_by_discriminator( + params_class_suffix="SamplerParams", + discriminator_field="sampler_type", + enum_name="SamplerType", + ) def discover_validator_types() -> dict[str, type]: - """Dynamically discover all validator types and their param classes from data_designer.config. + """Dynamically discover validator types and params classes from data_designer.config. 
Returns: Dict mapping validator type names to their params classes. """ - - validator_type_enum = getattr(dd, "ValidatorType", None) - if validator_type_enum is None or not issubclass(validator_type_enum, Enum): - return {} - - params_classes: dict[str, type] = {} - for name in dir(dd): - if name.endswith("ValidatorParams"): - obj = getattr(dd, name) - if inspect.isclass(obj) and hasattr(obj, "model_fields"): - normalized = name.replace("ValidatorParams", "").lower() - params_classes[normalized] = obj - - validator_types: dict[str, type] = {} - for member in validator_type_enum: - validator_name = member.name.lower() - normalized_name = validator_name.replace("_", "") - params_cls = params_classes.get(normalized_name) - if params_cls is not None: - validator_types[validator_name] = params_cls - - return validator_types + return _discover_params_by_discriminator( + params_class_suffix="ValidatorParams", + discriminator_field="validator_type", + enum_name="ValidatorType", + ) def discover_processor_configs() -> dict[str, type]: @@ -168,20 +231,11 @@ def discover_processor_configs() -> dict[str, type]: Returns: Dict mapping processor_type values to their config classes. 
""" - - processor_configs: dict[str, type] = {} - for name in dir(dd): - if name.endswith("ProcessorConfig") and name != "ProcessorConfig": - obj = getattr(dd, name) - if inspect.isclass(obj) and hasattr(obj, "model_fields"): - if "processor_type" in obj.model_fields: - annotation = obj.model_fields["processor_type"].annotation - if get_origin(annotation) is Literal: - args = get_args(annotation) - if args: - key = args[0].value if isinstance(args[0], Enum) else args[0] - processor_configs[key] = obj - return processor_configs + return _discover_configs_by_discriminator( + class_name_suffix="ProcessorConfig", + discriminator_field="processor_type", + exclude_class_names={"ProcessorConfig"}, + ) def _discover_by_modules(*module_suffixes: str) -> dict[str, type]: diff --git a/packages/data-designer/src/data_designer/cli/services/introspection/formatters.py b/packages/data-designer/src/data_designer/cli/services/introspection/formatters.py index 0320054fe..aaae8f28a 100644 --- a/packages/data-designer/src/data_designer/cli/services/introspection/formatters.py +++ b/packages/data-designer/src/data_designer/cli/services/introspection/formatters.py @@ -15,6 +15,11 @@ ) +def _schema_dedupe_key(schema: ModelSchema) -> str: + """Return a stable key used for nested-schema deduplication in text output.""" + return schema.schema_ref or schema.class_name + + def _format_field_text(field: FieldDetail, indent: int = 4, seen_schemas: set[str] | None = None) -> list[str]: """Format a single field as YAML-style text lines, recursing into nested schemas. 
@@ -41,12 +46,13 @@ def _format_field_text(field: FieldDetail, indent: int = 4, seen_schemas: set[st constraint_parts = [f"{k}={v}" for k, v in field.constraints.items()] lines.append(f"{pad} constraints: {', '.join(constraint_parts)}") if field.nested_schema: + schema_key = _schema_dedupe_key(field.nested_schema) schema_name = field.nested_schema.class_name - if seen_schemas is not None and schema_name in seen_schemas: + if seen_schemas is not None and schema_key in seen_schemas: lines.append(f"{pad} schema: (see {schema_name} above)") else: if seen_schemas is not None: - seen_schemas.add(schema_name) + seen_schemas.add(schema_key) lines.append(f"{pad} schema ({schema_name}):") for nested_field in field.nested_schema.fields: lines.extend(_format_field_text(nested_field, indent=indent + 4, seen_schemas=seen_schemas)) diff --git a/packages/data-designer/src/data_designer/cli/services/introspection/pydantic_inspector.py b/packages/data-designer/src/data_designer/cli/services/introspection/pydantic_inspector.py index ed7848e47..7643cdfbc 100644 --- a/packages/data-designer/src/data_designer/cli/services/introspection/pydantic_inspector.py +++ b/packages/data-designer/src/data_designer/cli/services/introspection/pydantic_inspector.py @@ -42,6 +42,7 @@ class ModelSchema: class_name: str description: str + schema_ref: str | None = None type_key: str | None = None type_value: str | None = None fields: list[FieldDetail] = field(default_factory=list) @@ -198,12 +199,14 @@ def _default_to_json(value: Any) -> Any: """ if value is None: return None - if isinstance(value, (bool, int, float, str)): - return value if isinstance(value, Enum): return value.value - if isinstance(value, (list, dict)): + if isinstance(value, (bool, int, float, str)): return value + if isinstance(value, list): + return [_default_to_json(item) for item in value] + if isinstance(value, dict): + return {k: _default_to_json(v) for k, v in value.items()} return repr(value) @@ -238,7 +241,7 @@ def 
get_field_info(cls: type) -> list[FieldDetail]: elif field_info.default is not PydanticUndefined: default_json = _default_to_json(field_info.default) if default_json is not _UNDEFINED: - default_display = repr(field_info.default) + default_display = repr(default_json) enum_cls = _extract_enum_class(field_info.annotation) enum_values: list[str] | None = None @@ -271,7 +274,7 @@ def build_model_schema( cls: type, type_key: str | None = None, type_value: str | None = None, - seen: set[str] | None = None, + seen: set[type] | None = None, max_depth: int = 3, current_depth: int = 0, ) -> ModelSchema: @@ -293,6 +296,7 @@ def build_model_schema( class_name = cls.__name__ description = get_brief_description(cls) + schema_ref = f"{cls.__module__}.{cls.__qualname__}" fields = get_field_info(cls) model_fields_raw: dict[str, Any] = getattr(cls, "model_fields", {}) @@ -302,11 +306,11 @@ def build_model_schema( continue nested_cls = extract_nested_basemodel(raw_field_info.annotation) - if nested_cls is not None and nested_cls.__name__ not in seen and current_depth < max_depth: - seen.add(nested_cls.__name__) + if nested_cls is not None and nested_cls not in seen and current_depth < max_depth: + next_seen = seen | {nested_cls} field_detail.nested_schema = build_model_schema( nested_cls, - seen=seen, + seen=next_seen, max_depth=max_depth, current_depth=current_depth + 1, ) @@ -314,6 +318,7 @@ def build_model_schema( return ModelSchema( class_name=class_name, description=description, + schema_ref=schema_ref, type_key=type_key, type_value=type_value, fields=fields, diff --git a/packages/data-designer/tests/cli/commands/test_introspection_commands.py b/packages/data-designer/tests/cli/commands/test_introspection_commands.py index ae0271c20..3e37bbeb9 100644 --- a/packages/data-designer/tests/cli/commands/test_introspection_commands.py +++ b/packages/data-designer/tests/cli/commands/test_introspection_commands.py @@ -5,6 +5,7 @@ import json +import pytest from typer.testing import 
CliRunner from data_designer.cli.main import app @@ -68,7 +69,14 @@ def test_columns_nonexistent_exits_with_error() -> None: def test_samplers_specific() -> None: result = runner.invoke(app, ["types", "samplers", "category"]) assert result.exit_code == 0 - assert "CATEGORY" in result.output + assert "sampler_type: category" in result.output + + +def test_samplers_all_case_insensitive() -> None: + result = runner.invoke(app, ["types", "samplers", "ALL"]) + assert result.exit_code == 0 + assert "Data Designer Sampler Types Reference" in result.output + assert "sampler_type: category" in result.output def test_samplers_list() -> None: @@ -78,6 +86,24 @@ def test_samplers_list() -> None: assert "data-designer types samplers" in result.output +# --------------------------------------------------------------------------- +# validators +# --------------------------------------------------------------------------- + + +def test_validators_specific() -> None: + result = runner.invoke(app, ["types", "validators", "code"]) + assert result.exit_code == 0 + assert "validator_type: code" in result.output + + +def test_validators_all_case_insensitive() -> None: + result = runner.invoke(app, ["types", "validators", "ALL"]) + assert result.exit_code == 0 + assert "Data Designer Validator Types Reference" in result.output + assert "validator_type: code" in result.output + + # --------------------------------------------------------------------------- # overview # --------------------------------------------------------------------------- @@ -274,6 +300,31 @@ def test_imports_json() -> None: assert len(data["categories"]) > 0 +@pytest.mark.parametrize( + "args", + [ + ["types", "columns", "all"], + ["types", "samplers", "all"], + ["types", "validators", "all"], + ["types", "processors", "all"], + ["types", "models"], + ["types", "constraints"], + ["types", "seeds"], + ["types", "mcp"], + ["reference", "overview"], + ["reference", "builder"], + ["reference", "interface"], + 
["reference", "imports"], + ["reference", "code-structure"], + ], +) +def test_json_contract_for_all_introspection_commands(args: list[str]) -> None: + result = runner.invoke(app, [*args, "--format", "json"]) + assert result.exit_code == 0 + payload = json.loads(result.output) + assert payload is not None + + # --------------------------------------------------------------------------- # format validation # --------------------------------------------------------------------------- diff --git a/packages/data-designer/tests/cli/controllers/test_introspection_controller.py b/packages/data-designer/tests/cli/controllers/test_introspection_controller.py index 578b58e71..b1184ad1d 100644 --- a/packages/data-designer/tests/cli/controllers/test_introspection_controller.py +++ b/packages/data-designer/tests/cli/controllers/test_introspection_controller.py @@ -104,7 +104,15 @@ def test_show_samplers_specific(capsys: pytest.CaptureFixture[str]) -> None: controller = IntrospectionController(output_format="text") controller.show_samplers(type_name="category") captured = capsys.readouterr() - assert "CATEGORY" in captured.out + assert "sampler_type: category" in captured.out + + +def test_show_samplers_all_case_insensitive(capsys: pytest.CaptureFixture[str]) -> None: + controller = IntrospectionController(output_format="text") + controller.show_samplers(type_name="ALL") + captured = capsys.readouterr() + assert "Data Designer Sampler Types Reference" in captured.out + assert "sampler_type: category" in captured.out # --------------------------------------------------------------------------- @@ -219,7 +227,15 @@ def test_show_validators_specific_text(capsys: pytest.CaptureFixture[str]) -> No controller = IntrospectionController(output_format="text") controller.show_validators(type_name="code") captured = capsys.readouterr() - assert "CODE" in captured.out + assert "validator_type: code" in captured.out + + +def test_show_validators_all_case_insensitive(capsys: 
pytest.CaptureFixture[str]) -> None: + controller = IntrospectionController(output_format="text") + controller.show_validators(type_name="ALL") + captured = capsys.readouterr() + assert "Data Designer Validator Types Reference" in captured.out + assert "validator_type: code" in captured.out def test_show_validators_specific_json(capsys: pytest.CaptureFixture[str]) -> None: diff --git a/packages/data-designer/tests/cli/services/introspection/test_formatters.py b/packages/data-designer/tests/cli/services/introspection/test_formatters.py index 00199ffc0..4130137f0 100644 --- a/packages/data-designer/tests/cli/services/introspection/test_formatters.py +++ b/packages/data-designer/tests/cli/services/introspection/test_formatters.py @@ -32,6 +32,7 @@ def _make_field(name: str = "my_field", type_str: str = "str", description: str def _make_schema( class_name: str = "TestModel", description: str = "A test model.", + schema_ref: str | None = None, type_key: str | None = None, type_value: str | None = None, fields: list[FieldDetail] | None = None, @@ -39,6 +40,7 @@ def _make_schema( return ModelSchema( class_name=class_name, description=description, + schema_ref=schema_ref, type_key=type_key, type_value=type_value, fields=fields or [_make_field()], @@ -624,3 +626,26 @@ def test_format_field_text_no_dedup_without_seen_set() -> None: text = format_model_schema_text(schema) assert "schema (Inner):" in text + + +def test_format_field_text_dedup_uses_schema_ref_to_avoid_name_collisions() -> None: + """Schemas with identical class names but different refs should both expand.""" + nested_a = _make_schema(class_name="SharedName", schema_ref="pkg.alpha.SharedName") + nested_b = _make_schema(class_name="SharedName", schema_ref="pkg.beta.SharedName") + + schema_a = _make_schema( + class_name="OuterA", + fields=[FieldDetail(name="a", type_str="SharedName", description="Ref A", nested_schema=nested_a)], + ) + schema_b = _make_schema( + class_name="OuterB", + fields=[FieldDetail(name="b", 
type_str="SharedName", description="Ref B", nested_schema=nested_b)], + ) + + seen: set[str] = set() + text_a = format_model_schema_text(schema_a, seen_schemas=seen) + text_b = format_model_schema_text(schema_b, seen_schemas=seen) + + assert "schema (SharedName):" in text_a + assert "schema (SharedName):" in text_b + assert "see SharedName above" not in text_b diff --git a/packages/data-designer/tests/cli/services/introspection/test_pydantic_inspector.py b/packages/data-designer/tests/cli/services/introspection/test_pydantic_inspector.py index 9f37e92e3..a269b680f 100644 --- a/packages/data-designer/tests/cli/services/introspection/test_pydantic_inspector.py +++ b/packages/data-designer/tests/cli/services/introspection/test_pydantic_inspector.py @@ -67,6 +67,11 @@ class DeepA(BaseModel): b: DeepB | None = None +class SiblingNestedModel(BaseModel): + first: InnerModel = Field(default_factory=InnerModel) + second: InnerModel = Field(default_factory=InnerModel) + + # Rebuild models that use forward references (required due to `from __future__ import annotations`) SelfRefModel.model_rebuild() DeepA.model_rebuild() @@ -320,6 +325,15 @@ def test_get_field_info_default_factory_set() -> None: assert nested.default_factory == "InnerModel" +def test_get_field_info_str_enum_default_json_uses_member_value() -> None: + """Defaults for str-enum fields should be normalized to the enum member's .value.""" + fields = get_field_info(OuterModel) + enum_field = next(f for f in fields if f.name == "my_enum") + assert enum_field.required is False + assert enum_field.default_json == "red" + assert enum_field.default == "'red'" + + # --------------------------------------------------------------------------- # constraints # --------------------------------------------------------------------------- @@ -413,3 +427,15 @@ def test_build_model_schema_depth_limiting() -> None: # But any further nesting within DeepB should be blocked for f in b_field.nested_schema.fields: assert f.nested_schema 
is None, f"Field '{f.name}' should not be expanded beyond max_depth" + + +def test_build_model_schema_repeated_sibling_nested_expands_each_field() -> None: + """Sibling fields of the same nested type should each include a nested schema.""" + schema = build_model_schema(SiblingNestedModel) + first = next(f for f in schema.fields if f.name == "first") + second = next(f for f in schema.fields if f.name == "second") + + assert first.nested_schema is not None + assert second.nested_schema is not None + assert first.nested_schema.class_name == "InnerModel" + assert second.nested_schema.class_name == "InnerModel" From 7496c8cc0b5607542d9003a8a0954cfb623f7ef2 Mon Sep 17 00:00:00 2001 From: Johnny Greco Date: Mon, 16 Feb 2026 22:58:06 -0500 Subject: [PATCH 18/37] feat: add data-designer list-assets agent-helper command Gives agents a quick way to check which Nemotron-Persona locales are installed and usable in PersonSamplerParams. --- .../data_designer/cli/commands/list_assets.py | 19 +++ .../cli/controllers/list_assets_controller.py | 52 ++++++++ .../src/data_designer/cli/main.py | 9 +- .../cli/commands/test_list_assets_command.py | 33 +++++ .../test_list_assets_controller.py | 126 ++++++++++++++++++ 5 files changed, 236 insertions(+), 3 deletions(-) create mode 100644 packages/data-designer/src/data_designer/cli/commands/list_assets.py create mode 100644 packages/data-designer/src/data_designer/cli/controllers/list_assets_controller.py create mode 100644 packages/data-designer/tests/cli/commands/test_list_assets_command.py create mode 100644 packages/data-designer/tests/cli/controllers/test_list_assets_controller.py diff --git a/packages/data-designer/src/data_designer/cli/commands/list_assets.py b/packages/data-designer/src/data_designer/cli/commands/list_assets.py new file mode 100644 index 000000000..68fb3c418 --- /dev/null +++ b/packages/data-designer/src/data_designer/cli/commands/list_assets.py @@ -0,0 +1,19 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA 
CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +import typer + +from data_designer.cli.controllers.introspection_controller import OutputFormat +from data_designer.cli.controllers.list_assets_controller import ListAssetsController +from data_designer.config.utils.constants import DATA_DESIGNER_HOME + + +def list_assets_command( + output_format: OutputFormat = typer.Option( + OutputFormat.TEXT, "--format", "-f", help="Output format: 'text' or 'json'." + ), +) -> None: + """List installed and available Nemotron-Persona datasets.""" + ListAssetsController(DATA_DESIGNER_HOME).list_assets(output_format.value) diff --git a/packages/data-designer/src/data_designer/cli/controllers/list_assets_controller.py b/packages/data-designer/src/data_designer/cli/controllers/list_assets_controller.py new file mode 100644 index 000000000..6fae6b84c --- /dev/null +++ b/packages/data-designer/src/data_designer/cli/controllers/list_assets_controller.py @@ -0,0 +1,52 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +import json +from pathlib import Path + +import typer + +from data_designer.cli.repositories.persona_repository import PersonaRepository +from data_designer.cli.services.download_service import DownloadService + + +class ListAssetsController: + """Controller for listing managed dataset assets.""" + + def __init__(self, config_dir: Path) -> None: + self.persona_repository = PersonaRepository() + self.service = DownloadService(config_dir, self.persona_repository) + + def list_assets(self, output_format: str) -> None: + """List installed and available Nemotron-Persona datasets. + + Args: + output_format: "text" or "json". 
+ """ + all_locales = self.persona_repository.list_all() + installed: list[str] = [] + not_installed: list[str] = [] + + for locale in all_locales: + if self.service.is_locale_downloaded(locale.code): + installed.append(locale.code) + else: + not_installed.append(locale.code) + + if output_format == "json": + typer.echo(json.dumps({"installed": installed, "not_installed": not_installed})) + return + + typer.echo("Nemotron-Persona Datasets") + typer.echo("-" * 25) + + if installed: + typer.echo(f"Usable locales in PersonSamplerParams: {', '.join(installed)}") + else: + typer.echo("No persona datasets installed.") + + if not_installed: + typer.echo(f"Not installed: {', '.join(not_installed)}") + typer.echo("The user can run `data-designer download personas --locale ` to install.") diff --git a/packages/data-designer/src/data_designer/cli/main.py b/packages/data-designer/src/data_designer/cli/main.py index 672c4d8af..108fbecdf 100644 --- a/packages/data-designer/src/data_designer/cli/main.py +++ b/packages/data-designer/src/data_designer/cli/main.py @@ -5,7 +5,7 @@ import typer -from data_designer.cli.commands import reference, types +from data_designer.cli.commands import list_assets, reference, types from data_designer.cli.lazy_group import create_lazy_typer_group _CMD = "data_designer.cli.commands" @@ -104,8 +104,11 @@ app.add_typer(download_app, name="download", rich_help_panel="Setup Commands") # Add agent command groups -app.add_typer(types.types_app, name="types", rich_help_panel="Agent Commands") -app.add_typer(reference.reference_app, name="reference", rich_help_panel="Agent Commands") +app.add_typer(types.types_app, name="types", rich_help_panel="Agent-Helper Commands") +app.add_typer(reference.reference_app, name="reference", rich_help_panel="Agent-Helper Commands") +app.command( + name="list-assets", help="List installed and available managed assets", rich_help_panel="Agent-Helper Commands" +)(list_assets.list_assets_command) def main() -> None: diff --git 
a/packages/data-designer/tests/cli/commands/test_list_assets_command.py b/packages/data-designer/tests/cli/commands/test_list_assets_command.py new file mode 100644 index 000000000..4a360a123 --- /dev/null +++ b/packages/data-designer/tests/cli/commands/test_list_assets_command.py @@ -0,0 +1,33 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +from unittest.mock import MagicMock, patch + +from data_designer.cli.commands.list_assets import list_assets_command +from data_designer.cli.controllers.list_assets_controller import ListAssetsController +from data_designer.config.utils.constants import DATA_DESIGNER_HOME + + +@patch("data_designer.cli.commands.list_assets.ListAssetsController") +def test_list_assets_command_delegates_to_controller(mock_controller_cls: MagicMock) -> None: + """Command creates controller with DATA_DESIGNER_HOME and delegates.""" + mock_controller = MagicMock(spec=ListAssetsController) + mock_controller_cls.return_value = mock_controller + + list_assets_command(output_format=MagicMock(value="text")) + + mock_controller_cls.assert_called_once_with(DATA_DESIGNER_HOME) + mock_controller.list_assets.assert_called_once_with("text") + + +@patch("data_designer.cli.commands.list_assets.ListAssetsController") +def test_list_assets_command_passes_json_format(mock_controller_cls: MagicMock) -> None: + """Command forwards the json format value to the controller.""" + mock_controller = MagicMock(spec=ListAssetsController) + mock_controller_cls.return_value = mock_controller + + list_assets_command(output_format=MagicMock(value="json")) + + mock_controller.list_assets.assert_called_once_with("json") diff --git a/packages/data-designer/tests/cli/controllers/test_list_assets_controller.py b/packages/data-designer/tests/cli/controllers/test_list_assets_controller.py new file mode 100644 index 000000000..627c9a65e --- /dev/null +++ 
b/packages/data-designer/tests/cli/controllers/test_list_assets_controller.py @@ -0,0 +1,126 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +import json +from pathlib import Path + +import pytest + +from data_designer.cli.controllers.list_assets_controller import ListAssetsController + +# --------------------------------------------------------------------------- +# fixtures +# --------------------------------------------------------------------------- + + +@pytest.fixture +def controller(tmp_path: Path) -> ListAssetsController: + """Create a controller with no datasets installed.""" + return ListAssetsController(tmp_path) + + +@pytest.fixture +def controller_with_datasets(tmp_path: Path) -> ListAssetsController: + """Create a controller with en_US and ja_JP already installed.""" + managed = tmp_path / "managed-assets" / "datasets" + managed.mkdir(parents=True) + (managed / "en_US.parquet").touch() + (managed / "ja_JP.parquet").touch() + return ListAssetsController(tmp_path) + + +# --------------------------------------------------------------------------- +# init +# --------------------------------------------------------------------------- + + +def test_init(tmp_path: Path) -> None: + """Controller sets up repository and service.""" + ctrl = ListAssetsController(tmp_path) + assert ctrl.persona_repository is not None + assert ctrl.service.config_dir == tmp_path + + +# --------------------------------------------------------------------------- +# text format +# --------------------------------------------------------------------------- + + +def test_text_none_installed(controller: ListAssetsController, capsys: pytest.CaptureFixture[str]) -> None: + """Text output shows no-installed message when nothing is downloaded.""" + controller.list_assets("text") + out = capsys.readouterr().out + + assert "Nemotron-Persona Datasets" in out + 
assert "No persona datasets installed." in out + assert "Not installed:" in out + assert "The user can run" in out + + +def test_text_some_installed( + controller_with_datasets: ListAssetsController, capsys: pytest.CaptureFixture[str] +) -> None: + """Text output lists usable locales and not-installed ones.""" + controller_with_datasets.list_assets("text") + out = capsys.readouterr().out + + assert "Usable locales in PersonSamplerParams:" in out + assert "en_US" in out + assert "ja_JP" in out + assert "Not installed:" in out + + +def test_text_all_installed_omits_not_installed_section(tmp_path: Path, capsys: pytest.CaptureFixture[str]) -> None: + """When every locale is installed the not-installed section is omitted.""" + managed = tmp_path / "managed-assets" / "datasets" + managed.mkdir(parents=True) + ctrl = ListAssetsController(tmp_path) + for locale in ctrl.persona_repository.list_all(): + (managed / f"{locale.code}.parquet").touch() + + ctrl.list_assets("text") + out = capsys.readouterr().out + + assert "Usable locales in PersonSamplerParams:" in out + assert "Not installed" not in out + + +# --------------------------------------------------------------------------- +# json format +# --------------------------------------------------------------------------- + + +def test_json_structure(controller: ListAssetsController, capsys: pytest.CaptureFixture[str]) -> None: + """JSON output has the expected keys and types.""" + controller.list_assets("json") + data = json.loads(capsys.readouterr().out) + + assert isinstance(data["installed"], list) + assert isinstance(data["not_installed"], list) + + +def test_json_partitions_correctly( + controller_with_datasets: ListAssetsController, capsys: pytest.CaptureFixture[str] +) -> None: + """JSON output places downloaded locales in installed and the rest in not_installed.""" + controller_with_datasets.list_assets("json") + data = json.loads(capsys.readouterr().out) + + assert "en_US" in data["installed"] + assert "ja_JP" in 
data["installed"] + assert "en_US" not in data["not_installed"] + assert "ja_JP" not in data["not_installed"] + assert len(data["installed"]) + len(data["not_installed"]) == len( + controller_with_datasets.persona_repository.list_all() + ) + + +def test_json_none_installed(controller: ListAssetsController, capsys: pytest.CaptureFixture[str]) -> None: + """JSON output when nothing is installed.""" + controller.list_assets("json") + data = json.loads(capsys.readouterr().out) + + assert data["installed"] == [] + assert len(data["not_installed"]) > 0 From 221e1cc599624f70b915cbe3444a5a1f6dddf1b0 Mon Sep 17 00:00:00 2001 From: Johnny Greco Date: Tue, 17 Feb 2026 19:46:55 -0500 Subject: [PATCH 19/37] refactor: replace types/reference commands with inspect agent-helper Consolidate the types and reference command groups into a single inspect command group under agent_helpers. Remove JSON output format and simplify the introspection controller and service layer. Use singular subcommand names for inspect (column, sampler, validator, processor) to semantically distinguish from the plural list commands. Rename constraints to sampler-constraints for clarity. 
--- .../cli/commands/agent_helpers/__init__.py | 4 + .../cli/commands/agent_helpers/inspect.py | 58 ++ .../data_designer/cli/commands/reference.py | 72 --- .../src/data_designer/cli/commands/types.py | 125 ----- .../data_designer/cli/controllers/__init__.py | 2 + .../controllers/introspection_controller.py | 273 ++------- .../src/data_designer/cli/main.py | 9 +- .../cli/services/introspection/__init__.py | 42 +- .../cli/services/introspection/discovery.py | 172 ------ .../cli/services/introspection/formatters.py | 318 ----------- .../introspection/pydantic_inspector.py | 271 +++++---- .../cli/commands/agent_helpers/__init__.py | 4 + .../test_introspection_commands.py | 181 ++++++ .../test_usage_scenarios.py | 13 +- .../commands/test_introspection_commands.py | 350 ------------ .../test_introspection_controller.py | 339 +---------- .../services/introspection/test_discovery.py | 186 ------- .../introspection/test_field_descriptions.py | 8 - .../services/introspection/test_formatters.py | 527 ------------------ .../introspection/test_pydantic_inspector.py | 321 +++++------ 20 files changed, 627 insertions(+), 2648 deletions(-) create mode 100644 packages/data-designer/src/data_designer/cli/commands/agent_helpers/__init__.py create mode 100644 packages/data-designer/src/data_designer/cli/commands/agent_helpers/inspect.py delete mode 100644 packages/data-designer/src/data_designer/cli/commands/reference.py delete mode 100644 packages/data-designer/src/data_designer/cli/commands/types.py create mode 100644 packages/data-designer/tests/cli/commands/agent_helpers/__init__.py create mode 100644 packages/data-designer/tests/cli/commands/agent_helpers/test_introspection_commands.py rename packages/data-designer/tests/cli/commands/{ => agent_helpers}/test_usage_scenarios.py (88%) delete mode 100644 packages/data-designer/tests/cli/commands/test_introspection_commands.py diff --git a/packages/data-designer/src/data_designer/cli/commands/agent_helpers/__init__.py 
b/packages/data-designer/src/data_designer/cli/commands/agent_helpers/__init__.py new file mode 100644 index 000000000..f1ea03ddb --- /dev/null +++ b/packages/data-designer/src/data_designer/cli/commands/agent_helpers/__init__.py @@ -0,0 +1,4 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations diff --git a/packages/data-designer/src/data_designer/cli/commands/agent_helpers/inspect.py b/packages/data-designer/src/data_designer/cli/commands/agent_helpers/inspect.py new file mode 100644 index 000000000..4ab4d71d5 --- /dev/null +++ b/packages/data-designer/src/data_designer/cli/commands/agent_helpers/inspect.py @@ -0,0 +1,58 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +import typer + +from data_designer.cli.controllers.introspection_controller import IntrospectionController + +inspect_app = typer.Typer( + name="inspect", + help="Inspect configuration types and Python API (schemas, method signatures).", + no_args_is_help=True, +) + + +@inspect_app.command(name="column") +def columns_command( + type_name: str = typer.Argument(help="Column type to display (e.g., 'llm-text'), or 'all' for everything."), +) -> None: + """Show schema for a column config type (use `list columns` for valid names).""" + IntrospectionController().show_columns(type_name) + + +@inspect_app.command(name="sampler") +def samplers_command( + type_name: str = typer.Argument(help="Sampler type to display (e.g., 'category'), or 'all' for everything."), +) -> None: + """Show schema for a sampler params type (use `list samplers` for valid names).""" + IntrospectionController().show_samplers(type_name) + + +@inspect_app.command(name="validator") +def validators_command( + type_name: str = typer.Argument(help="Validator type to display (e.g., 
'code'), or 'all' for everything."), +) -> None: + """Show schema for a validator params type (use `list validators` for valid names).""" + IntrospectionController().show_validators(type_name) + + +@inspect_app.command(name="processor") +def processors_command( + type_name: str = typer.Argument(help="Processor type to display (e.g., 'drop_columns'), or 'all' for everything."), +) -> None: + """Show schema for a processor config type (use `list processors` for valid names).""" + IntrospectionController().show_processors(type_name) + + +@inspect_app.command(name="sampler-constraints") +def constraints_command() -> None: + """Show sampler constraint schemas (scalar inequality, column inequality, operators).""" + IntrospectionController().show_sampler_constraints() + + +@inspect_app.command(name="builder") +def builder_command() -> None: + """Show config builder method signatures and docstrings.""" + IntrospectionController().show_builder() diff --git a/packages/data-designer/src/data_designer/cli/commands/reference.py b/packages/data-designer/src/data_designer/cli/commands/reference.py deleted file mode 100644 index 9e1aef3b7..000000000 --- a/packages/data-designer/src/data_designer/cli/commands/reference.py +++ /dev/null @@ -1,72 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 - -from __future__ import annotations - -import typer - -from data_designer.cli.controllers.introspection_controller import IntrospectionController, OutputFormat - -reference_app = typer.Typer( - name="reference", - help="Reference documentation for Data Designer (overview, interface, code structure, builder, imports).", - no_args_is_help=True, -) - - -@reference_app.command(name="overview") -def overview_command( - output_format: OutputFormat = typer.Option( - OutputFormat.TEXT, "--format", "-f", help="Output format: 'text' or 'json'." 
- ), -) -> None: - """Show compact API cheatsheet with type counts, builder summary, and quick start commands.""" - IntrospectionController(output_format=output_format.value).show_overview() - - -@reference_app.command(name="code-structure") -def code_structure_command( - depth: int = typer.Option( - 2, - "--depth", - "-d", - help="Max tree depth (default: 2). Must be >= 0.", - min=0, - ), - output_format: OutputFormat = typer.Option( - OutputFormat.TEXT, "--format", "-f", help="Output format: 'text' or 'json'." - ), -) -> None: - """Show the data_designer package structure and install paths.""" - IntrospectionController(output_format=output_format.value).show_code_structure(depth=depth) - - -@reference_app.command(name="builder") -def builder_command( - output_format: OutputFormat = typer.Option( - OutputFormat.TEXT, "--format", "-f", help="Output format: 'text' or 'json'." - ), -) -> None: - """Show DataDesignerConfigBuilder method signatures and documentation.""" - IntrospectionController(output_format=output_format.value).show_builder() - - -@reference_app.command(name="interface") -def interface_command( - output_format: OutputFormat = typer.Option( - OutputFormat.TEXT, "--format", "-f", help="Output format: 'text' or 'json'." - ), -) -> None: - """Show DataDesigner class methods, result types, and RunConfig fields.""" - IntrospectionController(output_format=output_format.value).show_interface() - - -@reference_app.command(name="imports") -def imports_command( - category: str | None = typer.Argument(None, help="Filter by category (e.g., 'columns'), or omit for all."), - output_format: OutputFormat = typer.Option( - OutputFormat.TEXT, "--format", "-f", help="Output format: 'text' or 'json'." 
- ), -) -> None: - """Show categorized import reference for data_designer.config and data_designer.interface.""" - IntrospectionController(output_format=output_format.value).show_imports(category) diff --git a/packages/data-designer/src/data_designer/cli/commands/types.py b/packages/data-designer/src/data_designer/cli/commands/types.py deleted file mode 100644 index 5e3195790..000000000 --- a/packages/data-designer/src/data_designer/cli/commands/types.py +++ /dev/null @@ -1,125 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 - -from __future__ import annotations - -import typer - -from data_designer.cli.controllers.introspection_controller import IntrospectionController, OutputFormat - -types_app = typer.Typer( - name="types", - help="Explore Data Designer configuration types (columns, samplers, validators, etc.).", - no_args_is_help=True, -) - - -def _print_usage_hint(command_name: str) -> None: - """Print a usage hint after the type list (text format only).""" - typer.echo("") - typer.echo(f"Tip: Run `data-designer types {command_name} ` for full schema details.") - typer.echo(f" Run `data-designer types {command_name} all` to see every type expanded.") - - -@types_app.command(name="columns") -def columns_command( - type_name: str | None = typer.Argument( - None, help="Column type to display (e.g., 'llm-text'), or 'all' for everything." - ), - output_format: OutputFormat = typer.Option( - OutputFormat.TEXT, "--format", "-f", help="Output format: 'text' or 'json'." 
- ), -) -> None: - """Show column configuration types and their fields.""" - ctrl = IntrospectionController(output_format=output_format.value) - ctrl.show_columns(type_name) - if type_name is None and output_format == OutputFormat.TEXT: - _print_usage_hint("columns") - - -@types_app.command(name="samplers") -def samplers_command( - type_name: str | None = typer.Argument( - None, help="Sampler type to display (e.g., 'category'), or 'all' for everything." - ), - output_format: OutputFormat = typer.Option( - OutputFormat.TEXT, "--format", "-f", help="Output format: 'text' or 'json'." - ), -) -> None: - """Show sampler types and their parameter fields.""" - ctrl = IntrospectionController(output_format=output_format.value) - ctrl.show_samplers(type_name) - if type_name is None and output_format == OutputFormat.TEXT: - _print_usage_hint("samplers") - - -@types_app.command(name="validators") -def validators_command( - type_name: str | None = typer.Argument( - None, help="Validator type to display (e.g., 'code'), or 'all' for everything." - ), - output_format: OutputFormat = typer.Option( - OutputFormat.TEXT, "--format", "-f", help="Output format: 'text' or 'json'." - ), -) -> None: - """Show validator types and their parameter fields.""" - ctrl = IntrospectionController(output_format=output_format.value) - ctrl.show_validators(type_name) - if type_name is None and output_format == OutputFormat.TEXT: - _print_usage_hint("validators") - - -@types_app.command(name="processors") -def processors_command( - type_name: str | None = typer.Argument( - None, help="Processor type to display (e.g., 'drop_columns'), or 'all' for everything." - ), - output_format: OutputFormat = typer.Option( - OutputFormat.TEXT, "--format", "-f", help="Output format: 'text' or 'json'." 
- ), -) -> None: - """Show processor types and their configuration fields.""" - ctrl = IntrospectionController(output_format=output_format.value) - ctrl.show_processors(type_name) - if type_name is None and output_format == OutputFormat.TEXT: - _print_usage_hint("processors") - - -@types_app.command(name="models") -def models_command( - output_format: OutputFormat = typer.Option( - OutputFormat.TEXT, "--format", "-f", help="Output format: 'text' or 'json'." - ), -) -> None: - """Show model configuration types (ModelConfig, inference params, distributions).""" - IntrospectionController(output_format=output_format.value).show_models() - - -@types_app.command(name="constraints") -def constraints_command( - output_format: OutputFormat = typer.Option( - OutputFormat.TEXT, "--format", "-f", help="Output format: 'text' or 'json'." - ), -) -> None: - """Show constraint types (ScalarInequality, ColumnInequality, operators).""" - IntrospectionController(output_format=output_format.value).show_constraints() - - -@types_app.command(name="seeds") -def seeds_command( - output_format: OutputFormat = typer.Option( - OutputFormat.TEXT, "--format", "-f", help="Output format: 'text' or 'json'." - ), -) -> None: - """Show seed dataset types (SeedConfig, sources, sampling strategies).""" - IntrospectionController(output_format=output_format.value).show_seeds() - - -@types_app.command(name="mcp") -def mcp_command( - output_format: OutputFormat = typer.Option( - OutputFormat.TEXT, "--format", "-f", help="Output format: 'text' or 'json'." 
- ), -) -> None: - """Show MCP provider types (MCPProvider, LocalStdioMCPProvider, ToolConfig).""" - IntrospectionController(output_format=output_format.value).show_mcp() diff --git a/packages/data-designer/src/data_designer/cli/controllers/__init__.py b/packages/data-designer/src/data_designer/cli/controllers/__init__.py index f568a3015..5f59c1bba 100644 --- a/packages/data-designer/src/data_designer/cli/controllers/__init__.py +++ b/packages/data-designer/src/data_designer/cli/controllers/__init__.py @@ -6,6 +6,7 @@ from data_designer.cli.controllers.download_controller import DownloadController from data_designer.cli.controllers.generation_controller import GenerationController from data_designer.cli.controllers.introspection_controller import IntrospectionController +from data_designer.cli.controllers.list_controller import ListController from data_designer.cli.controllers.model_controller import ModelController from data_designer.cli.controllers.provider_controller import ProviderController @@ -13,6 +14,7 @@ "DownloadController", "GenerationController", "IntrospectionController", + "ListController", "ModelController", "ProviderController", ] diff --git a/packages/data-designer/src/data_designer/cli/controllers/introspection_controller.py b/packages/data-designer/src/data_designer/cli/controllers/introspection_controller.py index cf6d5657e..30f7720b5 100644 --- a/packages/data-designer/src/data_designer/cli/controllers/introspection_controller.py +++ b/packages/data-designer/src/data_designer/cli/controllers/introspection_controller.py @@ -3,53 +3,27 @@ from __future__ import annotations -import json from collections.abc import Callable from dataclasses import dataclass -from enum import Enum import typer from data_designer.cli.services.introspection.discovery import ( discover_column_configs, discover_constraint_types, - discover_importable_names, - discover_interface_classes, - discover_mcp_types, - discover_model_configs, - discover_namespace_tree, 
discover_processor_configs, discover_sampler_types, - discover_seed_types, discover_validator_types, ) from data_designer.cli.services.introspection.formatters import ( - format_imports_json, - format_imports_text, - format_interface_json, - format_interface_text, - format_method_info_json, format_method_info_text, - format_model_schema_json, - format_model_schema_text, - format_namespace_json, - format_namespace_text, - format_overview_text, format_type_list_text, ) from data_designer.cli.services.introspection.method_inspector import inspect_class_methods -from data_designer.cli.services.introspection.pydantic_inspector import build_model_schema -from data_designer.config.base import ConfigBase +from data_designer.cli.services.introspection.pydantic_inspector import format_model_text from data_designer.config.config_builder import DataDesignerConfigBuilder -class OutputFormat(str, Enum): - """Supported output formats for introspect commands.""" - - TEXT = "text" - JSON = "json" - - @dataclass(frozen=True) class _TypedCommandSpec: """Configuration for typed introspection commands.""" @@ -60,6 +34,10 @@ class _TypedCommandSpec: class_label: str header_title: str case_insensitive: bool = False + related_inspect_tip: str | None = None + + +_CONFIG_IMPORT = "import data_designer.config as dd" class IntrospectionController: @@ -76,6 +54,11 @@ class IntrospectionController: type_label="column_type", class_label="config_class", header_title="Data Designer Column Types Reference", + case_insensitive=True, + related_inspect_tip=( + "Tip: Use 'data-designer inspect sampler ' for sampler params," + " 'inspect validator ' for validator params." 
+ ), ), "samplers": _TypedCommandSpec( discover_items=discover_sampler_types, @@ -103,8 +86,13 @@ class IntrospectionController: ), } - def __init__(self, output_format: str = "text") -> None: - self._format = output_format + def _emit_import_hint(self, import_stmt: str, access: str | None = None) -> None: + """Print a one-line import hint.""" + line = f"# {import_stmt}" + if access: + line += f" \u2192 {access}" + typer.echo(line) + typer.echo("") def show_columns(self, type_name: str | None) -> None: """Show column configuration types.""" @@ -122,105 +110,18 @@ def show_processors(self, type_name: str | None) -> None: """Show processor types and their config classes.""" self._show_typed_command(command_name="processors", type_name=type_name) - def show_models(self) -> None: - """Show model configuration types.""" - items = discover_model_configs() - self._show_all_schemas(items, "Data Designer Model Configuration Reference") - def show_builder(self) -> None: """Show DataDesignerConfigBuilder method signatures and docs.""" + self._emit_import_hint(_CONFIG_IMPORT, "dd.DataDesignerConfigBuilder") methods = inspect_class_methods(DataDesignerConfigBuilder) - if self._format == "json": - typer.echo(json.dumps(format_method_info_json(methods), indent=2)) - else: - typer.echo(format_method_info_text(methods, class_name="DataDesignerConfigBuilder")) + typer.echo(format_method_info_text(methods, class_name="DataDesignerConfigBuilder")) - def show_constraints(self) -> None: - """Show constraint types.""" + def show_sampler_constraints(self) -> None: + """Show sampler constraint types.""" + self._emit_import_hint(_CONFIG_IMPORT, "dd.") items = discover_constraint_types() self._show_all_schemas(items, "Data Designer Constraint Types Reference") - def show_seeds(self) -> None: - """Show seed dataset types.""" - items = discover_seed_types() - self._show_all_schemas(items, "Data Designer Seed Dataset Types Reference") - - def show_mcp(self) -> None: - """Show MCP provider 
types.""" - items = discover_mcp_types() - self._show_all_schemas(items, "Data Designer MCP Types Reference") - - def show_interface(self) -> None: - """Show DataDesigner, result types, and RunConfig.""" - classes = discover_interface_classes() - - classes_with_methods: list[tuple[str, list]] = [] - pydantic_schemas = [] - for name, cls in classes.items(): - if isinstance(cls, type) and issubclass(cls, ConfigBase): - pydantic_schemas.append(build_model_schema(cls)) - else: - methods = inspect_class_methods(cls) - classes_with_methods.append((name, methods)) - - if self._format == "json": - typer.echo(json.dumps(format_interface_json(classes_with_methods, pydantic_schemas), indent=2)) - else: - typer.echo(format_interface_text(classes_with_methods, pydantic_schemas)) - - def show_imports(self, category: str | None = None) -> None: - """Show categorized import reference for data_designer.config and data_designer.interface.""" - categories = discover_importable_names() - - if category is not None: - matched_key = self._match_category(category, list(categories.keys())) - if matched_key is None: - available = ", ".join(sorted(categories.keys())) - typer.echo(f"Error: No category matching '{category}'.", err=True) - typer.echo(f"Available categories: {available}", err=True) - raise typer.Exit(code=1) - categories = {matched_key: categories[matched_key]} - - if self._format == "json": - typer.echo(json.dumps(format_imports_json(categories), indent=2)) - else: - typer.echo(format_imports_text(categories)) - - def show_code_structure(self, depth: int = 2) -> None: - """Show the data_designer package structure and install paths.""" - if depth < 0: - typer.echo("Error: --depth must be >= 0.", err=True) - raise typer.Exit(code=1) - data = discover_namespace_tree(max_depth=depth) - if self._format == "json": - typer.echo(json.dumps(format_namespace_json(data), indent=2)) - else: - typer.echo(format_namespace_text(data)) - - def show_overview(self) -> None: - """Show compact API 
overview cheatsheet.""" - type_counts = { - "Column types": len(discover_column_configs()), - "Sampler types": len(discover_sampler_types()), - "Validator types": len(discover_validator_types()), - "Processor types": len(discover_processor_configs()), - "Model configs": len(discover_model_configs()), - "Constraint types": len(discover_constraint_types()), - "Seed types": len(discover_seed_types()), - "MCP types": len(discover_mcp_types()), - } - - builder_methods = inspect_class_methods(DataDesignerConfigBuilder) - - if self._format == "json": - typer.echo( - json.dumps( - {"type_counts": type_counts, "builder_methods": format_method_info_json(builder_methods)}, indent=2 - ) - ) - else: - typer.echo(format_overview_text(type_counts, builder_methods)) - def _show_typed_command(self, command_name: str, type_name: str | None) -> None: """Resolve a typed-command spec and render it.""" spec = self._TYPED_COMMAND_SPECS[command_name] @@ -232,45 +133,9 @@ def _show_typed_command(self, command_name: str, type_name: str | None) -> None: class_label=spec.class_label, header_title=spec.header_title, case_insensitive=spec.case_insensitive, + related_inspect_tip=spec.related_inspect_tip, ) - @staticmethod - def _match_category(query: str, keys: list[str]) -> str | None: - """Match a user query to a category key using progressive fuzzy matching. - - Tries: exact match, first-word stem match, any-word stem match, substring match. 
- """ - normalized = query.lower().rstrip("s") - - # Exact match (case-insensitive) - for key in keys: - if key.lower() == query.lower(): - return key - - # First-word stem match - for key in keys: - first_word = key.lower().split()[0].rstrip("s") - if first_word == normalized: - return key - - # Any-word stem match - for key in keys: - words = key.lower().split() - for word in words: - if word.rstrip("s") == normalized: - return key - - # Substring match (earliest position wins) - best_key: str | None = None - best_pos = float("inf") - for key in keys: - pos = key.lower().find(query.lower()) - if pos != -1 and pos < best_pos: - best_pos = pos - best_key = key - - return best_key - def _show_typed_items( self, items: dict[str, type], @@ -280,13 +145,12 @@ def _show_typed_items( class_label: str, header_title: str, case_insensitive: bool = False, + related_inspect_tip: str | None = None, ) -> None: """Shared logic for type-based commands (columns, samplers, validators, processors).""" if type_name is None: - if self._format == "json": - typer.echo(json.dumps({k: v.__name__ for k, v in sorted(items.items())}, indent=2)) - else: - typer.echo(format_type_list_text(items, type_label, class_label)) + self._emit_import_hint(_CONFIG_IMPORT, "dd.") + typer.echo(format_type_list_text(items, type_label, class_label)) return if type_name.lower() == "all": @@ -310,12 +174,11 @@ def _show_typed_items( typer.echo(f"Available types: {available}", err=True) raise typer.Exit(code=1) - schema = build_model_schema(cls, type_key=type_key, type_value=canonical_value) - - if self._format == "json": - typer.echo(json.dumps(format_model_schema_json(schema), indent=2)) - else: - typer.echo(format_model_schema_text(schema)) + self._emit_import_hint(_CONFIG_IMPORT, f"dd.{cls.__name__}") + typer.echo(format_model_text(cls, type_key=type_key, type_value=canonical_value)) + if related_inspect_tip: + typer.echo("") + typer.echo(related_inspect_tip) def _show_all_typed( self, @@ -324,57 +187,31 @@ 
def _show_all_typed( header_title: str, ) -> None: """Show all types for a typed command.""" + self._emit_import_hint(_CONFIG_IMPORT, "dd.") sorted_types = sorted(items.keys()) - if self._format == "json": - all_schemas = [] - for type_value in sorted_types: - cls = items[type_value] - schema = build_model_schema(cls, type_key=type_key, type_value=type_value) - all_schemas.append(format_model_schema_json(schema)) - typer.echo(json.dumps(all_schemas, indent=2)) - else: - seen_schemas: set[str] = set() - lines = [f"# {header_title}", f"# {len(sorted_types)} types discovered from data_designer.config", ""] - for type_value in sorted_types: - cls = items[type_value] - schema = build_model_schema(cls, type_key=type_key, type_value=type_value) - lines.append(format_model_schema_text(schema, seen_schemas=seen_schemas)) - lines.append("") - typer.echo("\n".join(lines)) + seen_schemas: set[str] = set() + lines = [f"# {header_title}", f"# {len(sorted_types)} types discovered from data_designer.config", ""] + for type_value in sorted_types: + cls = items[type_value] + lines.append(format_model_text(cls, type_key=type_key, type_value=type_value, seen_schemas=seen_schemas)) + lines.append("") + typer.echo("\n".join(lines)) def _show_all_schemas(self, items: dict[str, type], header_title: str) -> None: - """Show all schemas for simple discovery commands (models, constraints, seeds, mcp).""" - if self._format == "json": - all_schemas = [] - for name in sorted(items.keys()): - cls = items[name] - if hasattr(cls, "model_fields"): - schema = build_model_schema(cls) - all_schemas.append(format_model_schema_json(schema)) - else: - entry: dict = { - "class_name": cls.__name__, - "description": (cls.__doc__ or "").strip().split("\n")[0], - } - if hasattr(cls, "__members__"): - entry["values"] = [str(member.value) for member in cls] - all_schemas.append(entry) - typer.echo(json.dumps(all_schemas, indent=2)) - else: - seen_schemas: set[str] = set() - lines = [f"# {header_title}", f"# 
{len(items)} types", ""] - for name in sorted(items.keys()): - cls = items[name] - if hasattr(cls, "model_fields"): - schema = build_model_schema(cls) - lines.append(format_model_schema_text(schema, seen_schemas=seen_schemas)) - else: - lines.append(f"{cls.__name__}:") - if cls.__doc__: - lines.append(f" description: {cls.__doc__.strip().split(chr(10))[0]}") - if hasattr(cls, "__members__"): - members = [str(m.value) for m in cls] - lines.append(f" values: [{', '.join(members)}]") - lines.append("") - typer.echo("\n".join(lines)) + """Show all schemas for simple discovery commands (e.g. constraints).""" + seen_schemas: set[str] = set() + lines = [f"# {header_title}", f"# {len(items)} types", ""] + for name in sorted(items.keys()): + cls = items[name] + if hasattr(cls, "model_fields"): + lines.append(format_model_text(cls, seen_schemas=seen_schemas)) + else: + lines.append(f"{cls.__name__}:") + if cls.__doc__: + lines.append(f" description: {cls.__doc__.strip().split(chr(10))[0]}") + if hasattr(cls, "__members__"): + members = [str(m.value) for m in cls] + lines.append(f" values: [{', '.join(members)}]") + lines.append("") + typer.echo("\n".join(lines)) diff --git a/packages/data-designer/src/data_designer/cli/main.py b/packages/data-designer/src/data_designer/cli/main.py index 108fbecdf..bd782704c 100644 --- a/packages/data-designer/src/data_designer/cli/main.py +++ b/packages/data-designer/src/data_designer/cli/main.py @@ -5,7 +5,7 @@ import typer -from data_designer.cli.commands import list_assets, reference, types +from data_designer.cli.commands.agent_helpers import inspect as inspect_cmd from data_designer.cli.lazy_group import create_lazy_typer_group _CMD = "data_designer.cli.commands" @@ -104,11 +104,8 @@ app.add_typer(download_app, name="download", rich_help_panel="Setup Commands") # Add agent command groups -app.add_typer(types.types_app, name="types", rich_help_panel="Agent-Helper Commands") -app.add_typer(reference.reference_app, name="reference", 
rich_help_panel="Agent-Helper Commands") -app.command( - name="list-assets", help="List installed and available managed assets", rich_help_panel="Agent-Helper Commands" -)(list_assets.list_assets_command) +title_agent_helpers = "Agent-Helper Commands" +app.add_typer(inspect_cmd.inspect_app, name="inspect", rich_help_panel=title_agent_helpers) def main() -> None: diff --git a/packages/data-designer/src/data_designer/cli/services/introspection/__init__.py b/packages/data-designer/src/data_designer/cli/services/introspection/__init__.py index 93748bfd4..1063da54b 100644 --- a/packages/data-designer/src/data_designer/cli/services/introspection/__init__.py +++ b/packages/data-designer/src/data_designer/cli/services/introspection/__init__.py @@ -6,28 +6,13 @@ from data_designer.cli.services.introspection.discovery import ( discover_column_configs, discover_constraint_types, - discover_importable_names, - discover_interface_classes, - discover_mcp_types, - discover_model_configs, - discover_namespace_tree, discover_processor_configs, discover_sampler_types, - discover_seed_types, discover_validator_types, ) from data_designer.cli.services.introspection.formatters import ( - format_imports_json, - format_imports_text, - format_interface_json, - format_interface_text, format_method_info_json, format_method_info_text, - format_model_schema_json, - format_model_schema_text, - format_namespace_json, - format_namespace_text, - format_overview_text, format_type_list_text, ) from data_designer.cli.services.introspection.method_inspector import ( @@ -38,47 +23,28 @@ inspect_class_properties, ) from data_designer.cli.services.introspection.pydantic_inspector import ( - FieldDetail, - ModelSchema, - build_model_schema, + _extract_nested_basemodel, + format_model_text, format_type, get_brief_description, - get_field_info, ) __all__ = [ - "build_model_schema", + "_extract_nested_basemodel", "discover_column_configs", "discover_constraint_types", - "discover_importable_names", - 
"discover_interface_classes", - "discover_mcp_types", - "discover_model_configs", - "discover_namespace_tree", "discover_processor_configs", "discover_sampler_types", - "discover_seed_types", "discover_validator_types", - "FieldDetail", - "format_imports_json", - "format_imports_text", - "format_interface_json", - "format_interface_text", "format_method_info_json", "format_method_info_text", - "format_model_schema_json", - "format_model_schema_text", - "format_namespace_json", - "format_namespace_text", - "format_overview_text", + "format_model_text", "format_type_list_text", "format_type", "get_brief_description", - "get_field_info", "inspect_class_methods", "inspect_class_properties", "MethodInfo", - "ModelSchema", "ParamInfo", "PropertyInfo", ] diff --git a/packages/data-designer/src/data_designer/cli/services/introspection/discovery.py b/packages/data-designer/src/data_designer/cli/services/introspection/discovery.py index fcab153a6..c7ab08bf4 100644 --- a/packages/data-designer/src/data_designer/cli/services/introspection/discovery.py +++ b/packages/data-designer/src/data_designer/cli/services/introspection/discovery.py @@ -3,20 +3,11 @@ from __future__ import annotations -import importlib import inspect -import logging -import pkgutil from enum import Enum from typing import Any, Literal, get_args, get_origin -import data_designer import data_designer.config as dd -import data_designer.interface as interface_mod -from data_designer.config.preview_results import PreviewResults -from data_designer.config.run_config import RunConfig - -logger = logging.getLogger(__name__) def _extract_literal_discriminator_value(annotation: Any) -> str | None: @@ -125,68 +116,6 @@ def _discover_params_by_discriminator( return discovered -def _walk_namespace( - package_path: list[str], - prefix: str, - max_depth: int, - current_depth: int, - import_errors: list[dict[str, str]], -) -> list[dict[str, Any]]: - """Recursively walk a namespace package and build a tree of children 
nodes. - - Import failures are appended to import_errors as {"module": full_name, "message": str}. - """ - if current_depth >= max_depth: - return [] - - children: list[dict[str, Any]] = [] - for importer, name, is_pkg in pkgutil.iter_modules(package_path): - node: dict[str, Any] = { - "name": name, - "is_package": is_pkg, - "children": [], - } - if is_pkg: - full_name = f"{prefix}.{name}" - try: - sub_mod = importlib.import_module(full_name) - sub_path = getattr(sub_mod, "__path__", []) - node["children"] = _walk_namespace( - list(sub_path), full_name, max_depth, current_depth + 1, import_errors - ) - except Exception as e: - logger.debug("Failed to import %s during namespace discovery.", full_name, exc_info=True) - import_errors.append({"module": full_name, "message": str(e)}) - children.append(node) - - children.sort(key=lambda n: (not n["is_package"], n["name"])) - return children - - -def discover_namespace_tree(max_depth: int = 2) -> dict[str, Any]: - """Walk the data_designer namespace and return install paths plus a module tree. - - Returns: - Dict with ``paths`` (list of install directories) and ``tree`` (nested node dict). - - Raises: - ValueError: If max_depth < 0. - """ - if max_depth < 0: - raise ValueError("max_depth must be >= 0.") - paths = list(data_designer.__path__) - import_errors: list[dict[str, str]] = [] - tree: dict[str, Any] = { - "name": "data_designer", - "is_package": True, - "children": _walk_namespace(paths, "data_designer", max_depth, 0, import_errors), - } - result: dict[str, Any] = {"paths": paths, "tree": tree} - if import_errors: - result["import_errors"] = import_errors - return result - - def discover_column_configs() -> dict[str, type]: """Dynamically discover all ColumnConfig classes from data_designer.config. 
@@ -260,15 +189,6 @@ def _discover_by_modules(*module_suffixes: str) -> dict[str, type]: return result -def discover_model_configs() -> dict[str, type]: - """Return model-related configuration classes from data_designer.config. - - Returns: - Dict mapping class names to their types. - """ - return _discover_by_modules("models") - - def discover_constraint_types() -> dict[str, type]: """Return constraint-related classes from data_designer.config. @@ -276,95 +196,3 @@ def discover_constraint_types() -> dict[str, type]: Dict mapping class names to their types. """ return _discover_by_modules("sampler_constraints") - - -def discover_seed_types() -> dict[str, type]: - """Return seed dataset-related classes from data_designer.config. - - Returns: - Dict mapping class names to their types. - """ - return _discover_by_modules("seed", "seed_source") - - -def discover_mcp_types() -> dict[str, type]: - """Return MCP-related classes from data_designer.config. - - Returns: - Dict mapping class names to their types. - """ - return _discover_by_modules("mcp") - - -def discover_interface_classes() -> dict[str, type]: - """Discover interface-layer classes plus config-layer types used in the interface workflow. - - Dynamically scans ``data_designer.interface.__all__`` for non-exception classes and - adds ``PreviewResults`` and ``RunConfig`` from the config layer. - - Returns: - Dict mapping class names to their types. 
- """ - result: dict[str, type] = {} - for name in getattr(interface_mod, "__all__", []): - obj = getattr(interface_mod, name, None) - if obj is not None and inspect.isclass(obj) and not issubclass(obj, Exception): - result[name] = obj - result["PreviewResults"] = PreviewResults - result["RunConfig"] = RunConfig - return result - - -_MODULE_CATEGORIES: dict[str, str] = { - "column_configs": "Column Configs", - "column_types": "Column Types", - "config_builder": "Builder", - "custom_column": "Custom Columns", - "data_designer_config": "Core Config", - "mcp": "MCP", - "models": "Model Configs", - "processors": "Processors", - "run_config": "Runtime Config", - "sampler_constraints": "Constraints", - "sampler_params": "Sampler Params", - "seed": "Seed Config", - "seed_source": "Seed Sources", - "validator_params": "Validator Params", - "analysis.column_profilers": "Analysis", - "utils": "Utilities", - "version": "Utilities", -} - - -def _categorize_module(module_path: str) -> str: - """Map a module path from _LAZY_IMPORTS to a human-readable category name.""" - prefix = "data_designer.config." - suffix = module_path.removeprefix(prefix) if module_path.startswith(prefix) else module_path - - for key, category in _MODULE_CATEGORIES.items(): - if suffix == key or suffix.startswith(key + "."): - return category - return "Other" - - -def discover_importable_names() -> dict[str, list[dict[str, str]]]: - """Discover all importable names from data_designer.config and data_designer.interface. - - Reads _LAZY_IMPORTS from the config module and __all__ from the interface module, - grouping names by source-module category. - - Returns: - Dict mapping category names to lists of ``{"name": str, "module": str}`` entries. 
- """ - lazy_imports: dict[str, tuple[str, str]] = getattr(dd, "_LAZY_IMPORTS", {}) - - categories: dict[str, list[dict[str, str]]] = {} - for name, (module_path, _attr) in sorted(lazy_imports.items()): - category = _categorize_module(module_path) - categories.setdefault(category, []).append({"name": name, "module": "data_designer.config"}) - - interface_all: list[str] = getattr(interface_mod, "__all__", []) - if interface_all: - categories["Interface"] = [{"name": n, "module": "data_designer.interface"} for n in sorted(interface_all)] - - return categories diff --git a/packages/data-designer/src/data_designer/cli/services/introspection/formatters.py b/packages/data-designer/src/data_designer/cli/services/introspection/formatters.py index aaae8f28a..f1a3532f5 100644 --- a/packages/data-designer/src/data_designer/cli/services/introspection/formatters.py +++ b/packages/data-designer/src/data_designer/cli/services/introspection/formatters.py @@ -3,118 +3,7 @@ from __future__ import annotations -from typing import Any - from data_designer.cli.services.introspection.method_inspector import MethodInfo, ParamInfo -from data_designer.cli.services.introspection.pydantic_inspector import FieldDetail, ModelSchema - -_AGENT_GUIDANCE_FOOTER = ( - "Use `data-designer types ` to explore configuration types.\n" - "Use `data-designer reference ` for builder, imports, and overview.\n" - "Only read source files directly if these commands don't cover your need." -) - - -def _schema_dedupe_key(schema: ModelSchema) -> str: - """Return a stable key used for nested-schema deduplication in text output.""" - return schema.schema_ref or schema.class_name - - -def _format_field_text(field: FieldDetail, indent: int = 4, seen_schemas: set[str] | None = None) -> list[str]: - """Format a single field as YAML-style text lines, recursing into nested schemas. 
- - When ``seen_schemas`` is provided, nested schemas that have already been rendered - are replaced with a short back-reference to reduce output duplication. - """ - pad = " " * indent - lines: list[str] = [] - header = f"{pad}{field.name}: {field.type_str}" - if field.default_factory: - header += f" = {field.default_factory}()" - elif field.has_literal_default(): - header += f" = {field.default_json!r}" - elif field.default: - header += f" = {field.default}" - if field.required: - header += " [required]" - lines.append(header) - if field.description: - lines.append(f"{pad} description: {field.description}") - if field.enum_values: - lines.append(f"{pad} values: [{', '.join(field.enum_values)}]") - if field.constraints: - constraint_parts = [f"{k}={v}" for k, v in field.constraints.items()] - lines.append(f"{pad} constraints: {', '.join(constraint_parts)}") - if field.nested_schema: - schema_key = _schema_dedupe_key(field.nested_schema) - schema_name = field.nested_schema.class_name - if seen_schemas is not None and schema_key in seen_schemas: - lines.append(f"{pad} schema: (see {schema_name} above)") - else: - if seen_schemas is not None: - seen_schemas.add(schema_key) - lines.append(f"{pad} schema ({schema_name}):") - for nested_field in field.nested_schema.fields: - lines.extend(_format_field_text(nested_field, indent=indent + 4, seen_schemas=seen_schemas)) - return lines - - -def format_model_schema_text(schema: ModelSchema, indent: int = 0, seen_schemas: set[str] | None = None) -> str: - """Format a ModelSchema as YAML-style text for backward compatibility with the existing skill scripts. - - When ``seen_schemas`` is provided, nested schemas that have already been rendered - across prior calls are replaced with a short back-reference. 
- """ - lines: list[str] = [] - pad = " " * indent - lines.append(f"{pad}{schema.class_name}:") - if schema.type_key and schema.type_value: - lines.append(f"{pad} {schema.type_key}: {schema.type_value}") - lines.append(f"{pad} description: {schema.description}") - lines.append(f"{pad} fields:") - for field in schema.fields: - lines.extend(_format_field_text(field, indent=indent + 4, seen_schemas=seen_schemas)) - return "\n".join(lines) - - -def _format_field_json(field: FieldDetail) -> dict: - """Convert a FieldDetail to a JSON-serializable dict, recursing into nested schemas. - - Emits machine-typed defaults: "default" (native JSON value, including null) when - the field has a literal default, and "default_factory" (string) when it uses a factory. - """ - result: dict = { - "name": field.name, - "type": field.type_str, - "required": field.required, - } - if field.default_factory: - result["default_factory"] = field.default_factory - elif field.has_literal_default(): - result["default"] = field.default_json - elif field.default is not None: - result["default"] = field.default - if field.description: - result["description"] = field.description - if field.enum_values: - result["values"] = field.enum_values - if field.constraints: - result["constraints"] = field.constraints - if field.nested_schema: - result["schema"] = format_model_schema_json(field.nested_schema) - return result - - -def format_model_schema_json(schema: ModelSchema) -> dict: - """Convert a ModelSchema to a JSON-serializable dict.""" - result: dict = { - "class_name": schema.class_name, - "description": schema.description, - } - if schema.type_key and schema.type_value: - result[schema.type_key] = schema.type_value - result["fields"] = [_format_field_json(f) for f in schema.fields] - return result def _format_param_text(param: ParamInfo, indent: int) -> str: @@ -195,210 +84,3 @@ def format_type_list_text(items: dict[str, type], type_label: str, class_label: lines.append(f"{type_value:<{type_width}} 
{cls.__name__}") return "\n".join(lines) - - -def format_overview_text(type_counts: dict[str, int], builder_methods: list[MethodInfo]) -> str: - """Format a compact API overview cheatsheet.""" - lines: list[str] = [] - lines.append("Data Designer API Overview") - lines.append("=" * 26) - lines.append("") - - lines.append("Type Counts:") - label_width = max(len(label) for label in type_counts) + 1 if type_counts else 10 - for label, count in type_counts.items(): - lines.append(f" {label + ':':<{label_width}} {count:>3}") - lines.append("") - - if builder_methods: - lines.append("Builder Methods (DataDesignerConfigBuilder):") - sig_width = max(len(_short_sig(m)) for m in builder_methods) - for method in builder_methods: - short = _short_sig(method) - desc = method.description - lines.append(f" {short:<{sig_width}} \u2014 {desc}") - lines.append("") - - lines.append("Quick Start Commands:") - lines.append(" data-designer types columns") - lines.append(" data-designer types columns all") - lines.append(" data-designer types columns llm-text") - lines.append(" data-designer types samplers category") - lines.append(" data-designer reference builder") - lines.append(" data-designer reference interface") - lines.append(" data-designer reference imports") - - return "\n".join(lines) - - -def _short_sig(method: MethodInfo) -> str: - """Create a compact signature like 'add_column(...)' for overview display.""" - return f"{method.name}(...)" - - -# --------------------------------------------------------------------------- -# Namespace / code-structure formatters -# --------------------------------------------------------------------------- - - -def _render_tree_lines(node: dict[str, Any], prefix: str = "", is_last: bool = True) -> list[str]: - """Recursively render a namespace tree node into box-drawing lines.""" - connector = "└── " if is_last else "├── " - suffix = "/" if node["is_package"] else ".py" - lines: list[str] = [f"{prefix}{connector}{node['name']}{suffix}"] - - 
children = node.get("children", []) - child_prefix = prefix + (" " if is_last else "│ ") - for i, child in enumerate(children): - lines.extend(_render_tree_lines(child, child_prefix, is_last=(i == len(children) - 1))) - return lines - - -def format_namespace_text(data: dict[str, Any]) -> str: - """Format a namespace tree as a text tree diagram with box-drawing characters.""" - lines: list[str] = [] - lines.append("data_designer code structure") - lines.append("=" * 28) - lines.append("") - - paths = data.get("paths", []) - if paths: - lines.append("Install path:") - for p in paths: - lines.append(f" {p}") - lines.append("") - - tree = data["tree"] - lines.append(f"{tree['name']}/") - children = tree.get("children", []) - for i, child in enumerate(children): - lines.extend(_render_tree_lines(child, prefix="", is_last=(i == len(children) - 1))) - - import_errors = data.get("import_errors", []) - if import_errors: - lines.append("") - lines.append("Warnings (submodules that could not be imported):") - for err in import_errors: - lines.append(f" {err.get('module', '?')}: {err.get('message', '')}") - lines.append("") - - lines.append("") - lines.append(_AGENT_GUIDANCE_FOOTER) - return "\n".join(lines) - - -def format_namespace_json(data: dict[str, Any]) -> dict[str, Any]: - """Return the namespace tree dict as-is for JSON output. - - When discovery collected import_errors, they are included under "import_errors". 
- """ - return data - - -# --------------------------------------------------------------------------- -# Interface formatters -# --------------------------------------------------------------------------- - - -def format_interface_text( - classes_with_methods: list[tuple[str, list[MethodInfo]]], - pydantic_schemas: list[ModelSchema], -) -> str: - """Format interface classes as readable text for agent consumption.""" - lines: list[str] = [] - lines.append("Data Designer Interface Reference") - lines.append("=" * 34) - lines.append("") - - for class_name, methods in classes_with_methods: - lines.append(format_method_info_text(methods, class_name=class_name)) - lines.append("") - - for schema in pydantic_schemas: - lines.append(format_model_schema_text(schema)) - lines.append("") - - return "\n".join(lines).rstrip() - - -def format_interface_json( - classes_with_methods: list[tuple[str, list[MethodInfo]]], - pydantic_schemas: list[ModelSchema], -) -> dict[str, Any]: - """Convert interface classes to a JSON-serializable dict.""" - methods_dict: dict[str, list[dict]] = {} - for class_name, methods in classes_with_methods: - methods_dict[class_name] = format_method_info_json(methods) - - schemas_list: list[dict] = [format_model_schema_json(s) for s in pydantic_schemas] - - return {"methods": methods_dict, "schemas": schemas_list} - - -# --------------------------------------------------------------------------- -# Imports formatters -# --------------------------------------------------------------------------- - - -_CONFIG_MODULE = "data_designer.config" -_INTERFACE_MODULE = "data_designer.interface" -_CONFIG_ALIAS = "dd" - -_RECOMMENDED_IMPORTS = [ - f"import {_CONFIG_MODULE} as {_CONFIG_ALIAS}", - f"from {_INTERFACE_MODULE} import DataDesigner", -] - - -def format_imports_text(categories: dict[str, list[dict[str, str]]]) -> str: - """Format categorized import names as readable text with access patterns.""" - lines: list[str] = [] - lines.append("Data Designer Import 
Reference") - lines.append("=" * 30) - lines.append("") - - lines.append("Recommended imports:") - for imp in _RECOMMENDED_IMPORTS: - lines.append(f" {imp}") - lines.append("") - - for category, entries in sorted(categories.items()): - count = len(entries) - noun = "name" if count == 1 else "names" - lines.append(f"{category} ({count} {noun}):") - - is_config = any(e["module"] == _CONFIG_MODULE for e in entries) - if is_config: - for entry in sorted(entries, key=lambda e: e["name"]): - lines.append(f" {_CONFIG_ALIAS}.{entry['name']}") - else: - sorted_names = sorted(e["name"] for e in entries) - if len(sorted_names) <= 3: - names_str = ", ".join(sorted_names) - lines.append(f" from {entries[0]['module']} import {names_str}") - else: - module = entries[0]["module"] - lines.append(f" from {module} import (") - for name in sorted_names: - lines.append(f" {name},") - lines.append(" )") - lines.append("") - - return "\n".join(lines).rstrip() - - -def format_imports_json(categories: dict[str, list[dict[str, str]]]) -> dict[str, Any]: - """Return a structured JSON with recommended imports, alias, and categorized names.""" - structured: dict[str, Any] = { - "recommended_imports": _RECOMMENDED_IMPORTS, - "config_alias": _CONFIG_ALIAS, - "categories": {}, - } - for category, entries in sorted(categories.items()): - module = entries[0]["module"] if entries else _CONFIG_MODULE - structured["categories"][category] = { - "module": module, - "access_pattern": f"{_CONFIG_ALIAS}." 
if module == _CONFIG_MODULE else f"from {module} import ", - "names": sorted(e["name"] for e in entries), - } - return structured diff --git a/packages/data-designer/src/data_designer/cli/services/introspection/pydantic_inspector.py b/packages/data-designer/src/data_designer/cli/services/introspection/pydantic_inspector.py index 7643cdfbc..a331279b3 100644 --- a/packages/data-designer/src/data_designer/cli/services/introspection/pydantic_inspector.py +++ b/packages/data-designer/src/data_designer/cli/services/introspection/pydantic_inspector.py @@ -6,47 +6,12 @@ import re import types import typing -from dataclasses import dataclass, field from enum import Enum from typing import Any, get_args, get_origin from pydantic import BaseModel from pydantic_core import PydanticUndefined -_UNDEFINED: Any = object() - - -@dataclass -class FieldDetail: - """Structured representation of a single Pydantic model field.""" - - name: str - type_str: str - description: str - required: bool = True - default: str | None = None - default_json: Any = _UNDEFINED - default_factory: str | None = None - enum_values: list[str] | None = None - constraints: dict[str, Any] | None = None - nested_schema: ModelSchema | None = None - - def has_literal_default(self) -> bool: - """True if this field has a literal default value (including None).""" - return self.default_json is not _UNDEFINED - - -@dataclass -class ModelSchema: - """Structured representation of a Pydantic model's schema.""" - - class_name: str - description: str - schema_ref: str | None = None - type_key: str | None = None - type_value: str | None = None - fields: list[FieldDetail] = field(default_factory=list) - def _is_basemodel_subclass(cls: Any) -> bool: """Return True if cls is a concrete BaseModel subclass (not BaseModel itself).""" @@ -85,7 +50,7 @@ def _extract_enum_class(annotation: Any) -> type | None: return None -def extract_nested_basemodel(annotation: Any) -> type | None: +def _extract_nested_basemodel(annotation: Any) 
-> type | None: """Unwrap a type annotation to find a single nested BaseModel subclass. Handles: X, list[X], X | None, list[X] | None, dict[K, V], Annotated[X, ...]. @@ -123,7 +88,7 @@ def extract_nested_basemodel(annotation: Any) -> type | None: non_none_args = [a for a in get_args(annotation) if a is not type(None)] basemodel_classes: list[type] = [] for arg in non_none_args: - result = extract_nested_basemodel(arg) + result = _extract_nested_basemodel(arg) if result is not None: basemodel_classes.append(result) elif _is_basemodel_subclass(arg): @@ -147,6 +112,9 @@ def format_type(annotation: Any) -> str: type_str = re.sub(r"pydantic\.main\.", "", type_str) type_str = re.sub(r"typing\.", "", type_str) + # Clean up enum members used inside Literal or other contexts: -> 'value' + type_str = re.sub(r"<\w+\.\w+: '([^']+)'>", r"'\1'", type_str) + # Clean up enum types BEFORE other replacements: -> EnumName type_str = re.sub(r"", r"\1", type_str) @@ -162,9 +130,16 @@ def format_type(annotation: Any) -> str: # Clean up Annotated types with Discriminator (too verbose) if "Annotated[" in type_str and "Discriminator" in type_str: - match = re.search(r"Annotated\[([^,]+(?:\s*\|\s*[^,]+)*),", type_str) - if match: - type_str = match.group(1).strip() + start = type_str.index("Annotated[") + len("Annotated[") + depth = 0 + for i, ch in enumerate(type_str[start:], start): + if ch in "([": + depth += 1 + elif ch in ")]": + depth -= 1 + elif ch == "," and depth == 0: + type_str = type_str[start:i].strip() + break return type_str @@ -210,116 +185,136 @@ def _default_to_json(value: Any) -> Any: return repr(value) -def get_field_info(cls: type) -> list[FieldDetail]: - """Extract field information from a Pydantic model. - - Args: - cls: The Pydantic model class to inspect. - - Returns: - List of FieldDetail objects with name, type_str, description, required, - default, enum_values, constraints, and nested_schema (initially None, - populated by build_model_schema). 
- """ - fields: list[FieldDetail] = [] - model_fields: dict[str, Any] = getattr(cls, "model_fields", {}) - if model_fields: - for field_name, field_info in model_fields.items(): - type_str = format_type(field_info.annotation) - description = field_info.description or "" - - required = field_info.is_required() - - default_json: Any = _UNDEFINED - default_factory_name: str | None = None - default_display: str | None = None - if not required: - if field_info.default_factory is not None: - default_factory_name = getattr( - field_info.default_factory, "__name__", repr(field_info.default_factory) +def _format_field( + field_name: str, + field_info: Any, + indent: int, + seen_schemas: set[str] | None, + seen_types: set[type], + max_depth: int, + depth: int, +) -> list[str]: + """Format a single Pydantic field as YAML-style text lines, recursing into nested schemas.""" + pad = " " * indent + lines: list[str] = [] + + type_str = format_type(field_info.annotation) + description: str = field_info.description or "" + required: bool = field_info.is_required() + + header = f"{pad}{field_name}: {type_str}" + if not required: + if field_info.default_factory is not None: + factory_name = getattr(field_info.default_factory, "__name__", repr(field_info.default_factory)) + header += f" = {factory_name}()" + elif field_info.default is not PydanticUndefined: + header += f" = {_default_to_json(field_info.default)!r}" + if required: + header += " [required]" + lines.append(header) + + if description: + lines.append(f"{pad} description: {description}") + + enum_cls = _extract_enum_class(field_info.annotation) + if enum_cls is not None: + enum_values = [str(member.value) for member in enum_cls] + lines.append(f"{pad} values: [{', '.join(enum_values)}]") + + constraints = _extract_constraints(field_info) + if constraints: + constraint_parts = [f"{k}={v}" for k, v in constraints.items()] + lines.append(f"{pad} constraints: {', '.join(constraint_parts)}") + + nested_cls = 
_extract_nested_basemodel(field_info.annotation) + if nested_cls is not None and nested_cls not in seen_types and depth < max_depth: + schema_key = f"{nested_cls.__module__}.{nested_cls.__qualname__}" + schema_name = nested_cls.__name__ + if seen_schemas is not None and schema_key in seen_schemas: + lines.append(f"{pad} schema: (see {schema_name} above)") + else: + if seen_schemas is not None: + seen_schemas.add(schema_key) + lines.append(f"{pad} schema ({schema_name}):") + next_seen = seen_types | {nested_cls} + nested_model_fields: dict[str, Any] = getattr(nested_cls, "model_fields", {}) + for nested_name, nested_info in nested_model_fields.items(): + lines.extend( + _format_field( + field_name=nested_name, + field_info=nested_info, + indent=indent + 4, + seen_schemas=seen_schemas, + seen_types=next_seen, + max_depth=max_depth, + depth=depth + 1, ) - elif field_info.default is not PydanticUndefined: - default_json = _default_to_json(field_info.default) - if default_json is not _UNDEFINED: - default_display = repr(default_json) - - enum_cls = _extract_enum_class(field_info.annotation) - enum_values: list[str] | None = None - if enum_cls is not None: - enum_values = [str(member.value) for member in enum_cls] - - constraints = _extract_constraints(field_info) - - if default_display is None and default_factory_name is not None: - default_display = f"{default_factory_name}()" - - fields.append( - FieldDetail( - name=field_name, - type_str=type_str, - description=description, - required=required, - default=default_display, - default_json=default_json, - default_factory=default_factory_name, - enum_values=enum_values, - constraints=constraints, - nested_schema=None, ) - ) - return fields + + return lines -def build_model_schema( +def format_model_text( cls: type, type_key: str | None = None, type_value: str | None = None, - seen: set[type] | None = None, + indent: int = 0, + seen_schemas: set[str] | None = None, max_depth: int = 3, - current_depth: int = 0, -) -> 
ModelSchema: - """Build a structured ModelSchema from a Pydantic model class. +) -> str: + """Format a Pydantic model as YAML-style text for agent context. Args: - cls: The Pydantic model class to inspect. - type_key: Optional key name for the type discriminator (e.g., "column_type"). - type_value: Optional value for the type discriminator (e.g., "llm-text"). - seen: Set of already-expanded class names to prevent cycles. + cls: The Pydantic model class to format. + type_key: Optional discriminator key name (e.g., "column_type"). + type_value: Optional discriminator value (e.g., "llm-text"). + indent: Base indentation level. + seen_schemas: Set of schema refs already rendered (mutated for cross-model dedup). max_depth: Maximum recursion depth for nested models. - current_depth: Current recursion depth. - - Returns: - A ModelSchema with recursively expanded nested schemas. """ - if seen is None: - seen = set() - - class_name = cls.__name__ - description = get_brief_description(cls) - schema_ref = f"{cls.__module__}.{cls.__qualname__}" - fields = get_field_info(cls) - - model_fields_raw: dict[str, Any] = getattr(cls, "model_fields", {}) - for field_detail in fields: - raw_field_info = model_fields_raw.get(field_detail.name) - if raw_field_info is None: - continue - - nested_cls = extract_nested_basemodel(raw_field_info.annotation) - if nested_cls is not None and nested_cls not in seen and current_depth < max_depth: - next_seen = seen | {nested_cls} - field_detail.nested_schema = build_model_schema( - nested_cls, - seen=next_seen, - max_depth=max_depth, - current_depth=current_depth + 1, - ) - - return ModelSchema( - class_name=class_name, - description=description, - schema_ref=schema_ref, + return _format_model_text( + cls, type_key=type_key, type_value=type_value, - fields=fields, + indent=indent, + seen_schemas=seen_schemas, + seen_types=set(), + max_depth=max_depth, + depth=0, ) + + +def _format_model_text( + cls: type, + type_key: str | None, + type_value: str | 
None, + indent: int, + seen_schemas: set[str] | None, + seen_types: set[type], + max_depth: int, + depth: int, +) -> str: + """Recursive implementation of format_model_text.""" + pad = " " * indent + lines: list[str] = [] + lines.append(f"{pad}{cls.__name__}:") + if type_key and type_value: + lines.append(f"{pad} {type_key}: {type_value}") + lines.append(f"{pad} description: {get_brief_description(cls)}") + lines.append(f"{pad} fields:") + + model_fields: dict[str, Any] = getattr(cls, "model_fields", {}) + for field_name, field_info in model_fields.items(): + lines.extend( + _format_field( + field_name=field_name, + field_info=field_info, + indent=indent + 4, + seen_schemas=seen_schemas, + seen_types=seen_types, + max_depth=max_depth, + depth=depth, + ) + ) + + return "\n".join(lines) diff --git a/packages/data-designer/tests/cli/commands/agent_helpers/__init__.py b/packages/data-designer/tests/cli/commands/agent_helpers/__init__.py new file mode 100644 index 000000000..f1ea03ddb --- /dev/null +++ b/packages/data-designer/tests/cli/commands/agent_helpers/__init__.py @@ -0,0 +1,4 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations diff --git a/packages/data-designer/tests/cli/commands/agent_helpers/test_introspection_commands.py b/packages/data-designer/tests/cli/commands/agent_helpers/test_introspection_commands.py new file mode 100644 index 000000000..f3815df55 --- /dev/null +++ b/packages/data-designer/tests/cli/commands/agent_helpers/test_introspection_commands.py @@ -0,0 +1,181 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +from typer.testing import CliRunner + +from data_designer.cli.main import app + +runner = CliRunner() + + +# --------------------------------------------------------------------------- +# help +# --------------------------------------------------------------------------- + + +def test_inspect_help() -> None: + result = runner.invoke(app, ["inspect", "--help"]) + assert result.exit_code == 0 + assert "column" in result.output + + +# --------------------------------------------------------------------------- +# columns +# --------------------------------------------------------------------------- + + +def test_columns_no_arg_fails() -> None: + result = runner.invoke(app, ["inspect", "column"]) + assert result.exit_code != 0 + + +def test_columns_specific_type() -> None: + result = runner.invoke(app, ["inspect", "column", "llm-text"]) + assert result.exit_code == 0 + assert "LLMTextColumnConfig" in result.output + + +def test_columns_nonexistent_exits_with_error() -> None: + result = runner.invoke(app, ["inspect", "column", "nonexistent"]) + assert result.exit_code == 1 + + +# --------------------------------------------------------------------------- +# samplers +# --------------------------------------------------------------------------- + + +def test_samplers_specific() -> None: + result = runner.invoke(app, ["inspect", "sampler", "category"]) + assert result.exit_code == 0 + assert "sampler_type: category" in result.output + + +def test_samplers_all_case_insensitive() -> None: + result = runner.invoke(app, ["inspect", "sampler", "ALL"]) + assert result.exit_code == 0 + assert "Data Designer Sampler Types Reference" in result.output + assert "sampler_type: category" in result.output + + +def test_samplers_no_arg_fails() -> None: + result = runner.invoke(app, ["inspect", "sampler"]) + assert result.exit_code != 0 + + +# 
--------------------------------------------------------------------------- +# validators +# --------------------------------------------------------------------------- + + +def test_validators_no_arg_fails() -> None: + result = runner.invoke(app, ["inspect", "validator"]) + assert result.exit_code != 0 + + +def test_validators_specific() -> None: + result = runner.invoke(app, ["inspect", "validator", "code"]) + assert result.exit_code == 0 + assert "validator_type: code" in result.output + + +def test_validators_all_case_insensitive() -> None: + result = runner.invoke(app, ["inspect", "validator", "ALL"]) + assert result.exit_code == 0 + assert "Data Designer Validator Types Reference" in result.output + assert "validator_type: code" in result.output + + +# --------------------------------------------------------------------------- +# processors +# --------------------------------------------------------------------------- + + +def test_processors_no_arg_fails() -> None: + result = runner.invoke(app, ["inspect", "processor"]) + assert result.exit_code != 0 + + +# --------------------------------------------------------------------------- +# builder +# --------------------------------------------------------------------------- + + +def test_builder() -> None: + result = runner.invoke(app, ["inspect", "builder"]) + assert result.exit_code == 0 + assert "add_column" in result.output + assert "DataDesignerConfigBuilder" in result.output + assert "Parameters:" in result.output + + +# --------------------------------------------------------------------------- +# constraints +# --------------------------------------------------------------------------- + + +def test_constraints() -> None: + result = runner.invoke(app, ["inspect", "sampler-constraints"]) + assert result.exit_code == 0 + output = result.output + assert "ScalarInequalityConstraint" in output or "InequalityOperator" in output + + +# --------------------------------------------------------------------------- 
+# import hints +# --------------------------------------------------------------------------- + + +def test_import_hint_shown_in_text_output() -> None: + result = runner.invoke(app, ["inspect", "column", "llm-text"]) + assert result.exit_code == 0 + assert "import data_designer.config as dd" in result.output + assert "dd.LLMTextColumnConfig" in result.output + + +# --------------------------------------------------------------------------- +# list +# --------------------------------------------------------------------------- + + +def test_list_help() -> None: + result = runner.invoke(app, ["list", "--help"]) + assert result.exit_code == 0 + for subcmd in ("model-aliases", "persona-datasets", "columns", "samplers", "validators", "processors"): + assert subcmd in result.output + + +def test_list_model_aliases() -> None: + result = runner.invoke(app, ["list", "model-aliases"]) + assert result.exit_code == 0 + + +def test_list_persona_datasets() -> None: + result = runner.invoke(app, ["list", "persona-datasets"]) + assert result.exit_code == 0 + assert "Nemotron-Personas Datasets" in result.output + + +def test_list_column_types() -> None: + result = runner.invoke(app, ["list", "columns"]) + assert result.exit_code == 0 + assert "llm-text" in result.output + + +def test_list_sampler_types() -> None: + result = runner.invoke(app, ["list", "samplers"]) + assert result.exit_code == 0 + assert "category" in result.output + + +def test_list_validator_types() -> None: + result = runner.invoke(app, ["list", "validators"]) + assert result.exit_code == 0 + assert "code" in result.output + + +def test_list_processor_types() -> None: + result = runner.invoke(app, ["list", "processors"]) + assert result.exit_code == 0 diff --git a/packages/data-designer/tests/cli/commands/test_usage_scenarios.py b/packages/data-designer/tests/cli/commands/agent_helpers/test_usage_scenarios.py similarity index 88% rename from packages/data-designer/tests/cli/commands/test_usage_scenarios.py rename 
to packages/data-designer/tests/cli/commands/agent_helpers/test_usage_scenarios.py index 33926ac79..12c55ddbd 100644 --- a/packages/data-designer/tests/cli/commands/test_usage_scenarios.py +++ b/packages/data-designer/tests/cli/commands/agent_helpers/test_usage_scenarios.py @@ -3,7 +3,6 @@ from __future__ import annotations -import json import re import types from pathlib import Path @@ -116,18 +115,8 @@ def test_usage_validate_unsupported_extension_is_actionable(tmp_path: Path) -> N assert "supported extensions" in normalized -def test_usage_introspect_columns_json_contract() -> None: - result = runner.invoke(app, ["types", "columns", "llm-text", "--format", "json"], color=False) - assert result.exit_code == 0 - - payload = json.loads(result.output) - assert isinstance(payload, dict) - assert payload.get("class_name") == "LLMTextColumnConfig" - assert isinstance(payload.get("fields"), list) - - def test_usage_introspect_unknown_type_error_is_actionable() -> None: - result = runner.invoke(app, ["types", "columns", "nonexistent"], color=False) + result = runner.invoke(app, ["inspect", "column", "nonexistent"], color=False) normalized = _normalize_text(result.output) assert result.exit_code == 1 diff --git a/packages/data-designer/tests/cli/commands/test_introspection_commands.py b/packages/data-designer/tests/cli/commands/test_introspection_commands.py deleted file mode 100644 index 3e37bbeb9..000000000 --- a/packages/data-designer/tests/cli/commands/test_introspection_commands.py +++ /dev/null @@ -1,350 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
-# SPDX-License-Identifier: Apache-2.0 - -from __future__ import annotations - -import json - -import pytest -from typer.testing import CliRunner - -from data_designer.cli.main import app - -runner = CliRunner() - - -# --------------------------------------------------------------------------- -# help -# --------------------------------------------------------------------------- - - -def test_types_help() -> None: - result = runner.invoke(app, ["types", "--help"]) - assert result.exit_code == 0 - assert "columns" in result.output - - -def test_reference_help() -> None: - result = runner.invoke(app, ["reference", "--help"]) - assert result.exit_code == 0 - assert "overview" in result.output - - -# --------------------------------------------------------------------------- -# columns -# --------------------------------------------------------------------------- - - -def test_columns_list() -> None: - result = runner.invoke(app, ["types", "columns"]) - assert result.exit_code == 0 - assert "llm-text" in result.output - assert "data-designer types columns" in result.output - - -def test_columns_specific_type() -> None: - result = runner.invoke(app, ["types", "columns", "llm-text"]) - assert result.exit_code == 0 - assert "LLMTextColumnConfig" in result.output - - -def test_columns_json_format() -> None: - result = runner.invoke(app, ["types", "columns", "llm-text", "--format", "json"]) - assert result.exit_code == 0 - data = json.loads(result.output) - assert isinstance(data, dict) - assert data["class_name"] == "LLMTextColumnConfig" - - -def test_columns_nonexistent_exits_with_error() -> None: - result = runner.invoke(app, ["types", "columns", "nonexistent"]) - assert result.exit_code == 1 - - -# --------------------------------------------------------------------------- -# samplers -# --------------------------------------------------------------------------- - - -def test_samplers_specific() -> None: - result = runner.invoke(app, ["types", "samplers", "category"]) - 
assert result.exit_code == 0 - assert "sampler_type: category" in result.output - - -def test_samplers_all_case_insensitive() -> None: - result = runner.invoke(app, ["types", "samplers", "ALL"]) - assert result.exit_code == 0 - assert "Data Designer Sampler Types Reference" in result.output - assert "sampler_type: category" in result.output - - -def test_samplers_list() -> None: - result = runner.invoke(app, ["types", "samplers"]) - assert result.exit_code == 0 - assert "category" in result.output - assert "data-designer types samplers" in result.output - - -# --------------------------------------------------------------------------- -# validators -# --------------------------------------------------------------------------- - - -def test_validators_specific() -> None: - result = runner.invoke(app, ["types", "validators", "code"]) - assert result.exit_code == 0 - assert "validator_type: code" in result.output - - -def test_validators_all_case_insensitive() -> None: - result = runner.invoke(app, ["types", "validators", "ALL"]) - assert result.exit_code == 0 - assert "Data Designer Validator Types Reference" in result.output - assert "validator_type: code" in result.output - - -# --------------------------------------------------------------------------- -# overview -# --------------------------------------------------------------------------- - - -def test_overview() -> None: - result = runner.invoke(app, ["reference", "overview"]) - assert result.exit_code == 0 - assert "Type Counts" in result.output - - -# --------------------------------------------------------------------------- -# builder -# --------------------------------------------------------------------------- - - -def test_builder() -> None: - result = runner.invoke(app, ["reference", "builder"]) - assert result.exit_code == 0 - assert "add_column" in result.output - assert "DataDesignerConfigBuilder" in result.output - assert "Parameters:" in result.output - - -def test_builder_json() -> None: - result 
= runner.invoke(app, ["reference", "builder", "--format", "json"]) - assert result.exit_code == 0 - data = json.loads(result.output) - assert isinstance(data, list) - method_names = [m["name"] for m in data] - assert "add_column" in method_names - - -# --------------------------------------------------------------------------- -# models -# --------------------------------------------------------------------------- - - -def test_models() -> None: - result = runner.invoke(app, ["types", "models"]) - assert result.exit_code == 0 - assert "ModelConfig" in result.output - assert "description:" in result.output - - -def test_models_json() -> None: - result = runner.invoke(app, ["types", "models", "--format", "json"]) - assert result.exit_code == 0 - data = json.loads(result.output) - assert isinstance(data, list) - class_names = [item.get("class_name", "") for item in data if isinstance(item, dict)] - assert "ModelConfig" in class_names - - -# --------------------------------------------------------------------------- -# constraints -# --------------------------------------------------------------------------- - - -def test_constraints() -> None: - result = runner.invoke(app, ["types", "constraints"]) - assert result.exit_code == 0 - output = result.output - assert "ScalarInequalityConstraint" in output or "InequalityOperator" in output - - -# --------------------------------------------------------------------------- -# seeds -# --------------------------------------------------------------------------- - - -def test_seeds() -> None: - result = runner.invoke(app, ["types", "seeds"]) - assert result.exit_code == 0 - assert "SeedConfig" in result.output - assert "SamplingStrategy" in result.output - - -def test_seeds_json() -> None: - result = runner.invoke(app, ["types", "seeds", "--format", "json"]) - assert result.exit_code == 0 - data = json.loads(result.output) - assert isinstance(data, list) - class_names = [item.get("class_name", "") for item in data if 
isinstance(item, dict)] - assert "SeedConfig" in class_names - - -# --------------------------------------------------------------------------- -# mcp -# --------------------------------------------------------------------------- - - -def test_mcp() -> None: - result = runner.invoke(app, ["types", "mcp"]) - assert result.exit_code == 0 - assert "ToolConfig" in result.output - assert "MCPProvider" in result.output or "LocalStdioMCPProvider" in result.output - - -def test_mcp_json() -> None: - result = runner.invoke(app, ["types", "mcp", "--format", "json"]) - assert result.exit_code == 0 - data = json.loads(result.output) - assert isinstance(data, list) - class_names = [item.get("class_name", "") for item in data if isinstance(item, dict)] - assert "ToolConfig" in class_names - - -# --------------------------------------------------------------------------- -# code-structure -# --------------------------------------------------------------------------- - - -def test_code_structure() -> None: - result = runner.invoke(app, ["reference", "code-structure"]) - assert result.exit_code == 0 - assert "data_designer code structure" in result.output - assert "├──" in result.output - - -def test_code_structure_shows_subpackages() -> None: - result = runner.invoke(app, ["reference", "code-structure"]) - assert result.exit_code == 0 - for pkg in ("config/", "engine/", "cli/"): - assert pkg in result.output, f"Expected '{pkg}' in output" - - -def test_code_structure_json_format() -> None: - result = runner.invoke(app, ["reference", "code-structure", "--format", "json"]) - assert result.exit_code == 0 - data = json.loads(result.output) - assert "paths" in data - assert "tree" in data - - -def test_code_structure_shows_agent_guidance() -> None: - result = runner.invoke(app, ["reference", "code-structure"]) - assert result.exit_code == 0 - assert "Only read source files directly" in result.output - - -def test_code_structure_negative_depth_exits_with_error() -> None: - """Invalid 
--depth < 0 is rejected with actionable error.""" - result = runner.invoke(app, ["reference", "code-structure", "--depth", "-1"], color=False) - assert result.exit_code != 0 - assert "depth" in result.output.lower() or "0" in result.output - - -# --------------------------------------------------------------------------- -# interface -# --------------------------------------------------------------------------- - - -def test_interface() -> None: - result = runner.invoke(app, ["reference", "interface"]) - assert result.exit_code == 0 - assert "DataDesigner" in result.output - assert "create" in result.output - - -def test_interface_json() -> None: - result = runner.invoke(app, ["reference", "interface", "--format", "json"]) - assert result.exit_code == 0 - data = json.loads(result.output) - assert "methods" in data - assert "schemas" in data - - -def test_interface_shows_result_types() -> None: - result = runner.invoke(app, ["reference", "interface"]) - assert result.exit_code == 0 - assert "DatasetCreationResults" in result.output - - -# --------------------------------------------------------------------------- -# imports -# --------------------------------------------------------------------------- - - -def test_imports() -> None: - result = runner.invoke(app, ["reference", "imports"]) - assert result.exit_code == 0 - assert "import data_designer.config as dd" in result.output - assert "dd." 
in result.output - - -def test_imports_json() -> None: - result = runner.invoke(app, ["reference", "imports", "--format", "json"]) - assert result.exit_code == 0 - data = json.loads(result.output) - assert isinstance(data, dict) - assert "recommended_imports" in data - assert "categories" in data - assert len(data["categories"]) > 0 - - -@pytest.mark.parametrize( - "args", - [ - ["types", "columns", "all"], - ["types", "samplers", "all"], - ["types", "validators", "all"], - ["types", "processors", "all"], - ["types", "models"], - ["types", "constraints"], - ["types", "seeds"], - ["types", "mcp"], - ["reference", "overview"], - ["reference", "builder"], - ["reference", "interface"], - ["reference", "imports"], - ["reference", "code-structure"], - ], -) -def test_json_contract_for_all_introspection_commands(args: list[str]) -> None: - result = runner.invoke(app, [*args, "--format", "json"]) - assert result.exit_code == 0 - payload = json.loads(result.output) - assert payload is not None - - -# --------------------------------------------------------------------------- -# format validation -# --------------------------------------------------------------------------- - - -def test_invalid_format_rejected() -> None: - result = runner.invoke(app, ["types", "columns", "--format", "xml"]) - assert result.exit_code != 0 - - -def test_invalid_format_rejected_on_builder() -> None: - result = runner.invoke(app, ["reference", "builder", "--format", "yaml"]) - assert result.exit_code != 0 - - -def test_valid_format_text() -> None: - result = runner.invoke(app, ["types", "columns", "--format", "text"]) - assert result.exit_code == 0 - - -def test_valid_format_json() -> None: - result = runner.invoke(app, ["types", "columns", "--format", "json"]) - assert result.exit_code == 0 diff --git a/packages/data-designer/tests/cli/controllers/test_introspection_controller.py b/packages/data-designer/tests/cli/controllers/test_introspection_controller.py index b1184ad1d..f3ab12ea4 100644 
--- a/packages/data-designer/tests/cli/controllers/test_introspection_controller.py +++ b/packages/data-designer/tests/cli/controllers/test_introspection_controller.py @@ -3,8 +3,6 @@ from __future__ import annotations -import json - import click.exceptions import pytest @@ -16,7 +14,7 @@ def test_show_columns_list_mode(capsys: pytest.CaptureFixture[str]) -> None: - controller = IntrospectionController(output_format="text") + controller = IntrospectionController() controller.show_columns(type_name=None) captured = capsys.readouterr() assert "llm-text" in captured.out @@ -24,14 +22,14 @@ def test_show_columns_list_mode(capsys: pytest.CaptureFixture[str]) -> None: def test_show_columns_specific_type(capsys: pytest.CaptureFixture[str]) -> None: - controller = IntrospectionController(output_format="text") + controller = IntrospectionController() controller.show_columns(type_name="llm-text") captured = capsys.readouterr() assert "LLMTextColumnConfig" in captured.out def test_show_columns_all(capsys: pytest.CaptureFixture[str]) -> None: - controller = IntrospectionController(output_format="text") + controller = IntrospectionController() controller.show_columns(type_name="all") captured = capsys.readouterr() assert "llm-text" in captured.out @@ -39,403 +37,98 @@ def test_show_columns_all(capsys: pytest.CaptureFixture[str]) -> None: def test_show_columns_nonexistent_type_exits() -> None: - controller = IntrospectionController(output_format="text") + controller = IntrospectionController() with pytest.raises(click.exceptions.Exit): controller.show_columns(type_name="nonexistent_type_xyz") -def test_show_columns_json_format(capsys: pytest.CaptureFixture[str]) -> None: - controller = IntrospectionController(output_format="json") - controller.show_columns(type_name="llm-text") - captured = capsys.readouterr() - data = json.loads(captured.out) - assert isinstance(data, dict) - assert data["class_name"] == "LLMTextColumnConfig" - - -def test_show_columns_list_json_format(capsys: 
pytest.CaptureFixture[str]) -> None: - controller = IntrospectionController(output_format="json") - controller.show_columns(type_name=None) - captured = capsys.readouterr() - data = json.loads(captured.out) - assert isinstance(data, dict) - assert "llm-text" in data - - -# --------------------------------------------------------------------------- -# show_overview -# --------------------------------------------------------------------------- - - -def test_show_overview_text(capsys: pytest.CaptureFixture[str]) -> None: - controller = IntrospectionController(output_format="text") - controller.show_overview() - captured = capsys.readouterr() - assert "Data Designer API Overview" in captured.out - assert "Type Counts:" in captured.out - - -def test_show_overview_json(capsys: pytest.CaptureFixture[str]) -> None: - controller = IntrospectionController(output_format="json") - controller.show_overview() - captured = capsys.readouterr() - data = json.loads(captured.out) - assert "type_counts" in data - assert "builder_methods" in data - assert isinstance(data["type_counts"], dict) - assert len(data["type_counts"]) > 0 - assert isinstance(data["builder_methods"], list) - assert len(data["builder_methods"]) > 0 - - # --------------------------------------------------------------------------- # show_samplers # --------------------------------------------------------------------------- def test_show_samplers_list(capsys: pytest.CaptureFixture[str]) -> None: - controller = IntrospectionController(output_format="text") + controller = IntrospectionController() controller.show_samplers(type_name=None) captured = capsys.readouterr() assert "category" in captured.out def test_show_samplers_specific(capsys: pytest.CaptureFixture[str]) -> None: - controller = IntrospectionController(output_format="text") + controller = IntrospectionController() controller.show_samplers(type_name="category") captured = capsys.readouterr() assert "sampler_type: category" in captured.out def 
test_show_samplers_all_case_insensitive(capsys: pytest.CaptureFixture[str]) -> None: - controller = IntrospectionController(output_format="text") + controller = IntrospectionController() controller.show_samplers(type_name="ALL") captured = capsys.readouterr() assert "Data Designer Sampler Types Reference" in captured.out assert "sampler_type: category" in captured.out -# --------------------------------------------------------------------------- -# show_models -# --------------------------------------------------------------------------- - - -def test_show_models(capsys: pytest.CaptureFixture[str]) -> None: - controller = IntrospectionController(output_format="text") - controller.show_models() - captured = capsys.readouterr() - assert "ModelConfig" in captured.out - - # --------------------------------------------------------------------------- # show_builder # --------------------------------------------------------------------------- def test_show_builder(capsys: pytest.CaptureFixture[str]) -> None: - controller = IntrospectionController(output_format="text") + controller = IntrospectionController() controller.show_builder() captured = capsys.readouterr() assert "add_column" in captured.out # --------------------------------------------------------------------------- -# show_constraints +# show_sampler_constraints # --------------------------------------------------------------------------- -def test_show_constraints(capsys: pytest.CaptureFixture[str]) -> None: - controller = IntrospectionController(output_format="text") - controller.show_constraints() +def test_show_sampler_constraints(capsys: pytest.CaptureFixture[str]) -> None: + controller = IntrospectionController() + controller.show_sampler_constraints() captured = capsys.readouterr() assert "ScalarInequalityConstraint" in captured.out -# --------------------------------------------------------------------------- -# show_seeds -# --------------------------------------------------------------------------- - 
- -def test_show_seeds(capsys: pytest.CaptureFixture[str]) -> None: - controller = IntrospectionController(output_format="text") - controller.show_seeds() - captured = capsys.readouterr() - assert "SeedConfig" in captured.out - - -# --------------------------------------------------------------------------- -# show_mcp -# --------------------------------------------------------------------------- - - -def test_show_mcp(capsys: pytest.CaptureFixture[str]) -> None: - controller = IntrospectionController(output_format="text") - controller.show_mcp() - captured = capsys.readouterr() - assert "ToolConfig" in captured.out - - -# --------------------------------------------------------------------------- -# show_interface -# --------------------------------------------------------------------------- - - -def test_show_interface_text(capsys: pytest.CaptureFixture[str]) -> None: - controller = IntrospectionController(output_format="text") - controller.show_interface() - captured = capsys.readouterr() - assert "DataDesigner" in captured.out - assert "DatasetCreationResults" in captured.out - assert "RunConfig" in captured.out - - -def test_show_interface_json(capsys: pytest.CaptureFixture[str]) -> None: - controller = IntrospectionController(output_format="json") - controller.show_interface() - captured = capsys.readouterr() - data = json.loads(captured.out) - assert "methods" in data - assert "schemas" in data - assert "DataDesigner" in data["methods"] - assert isinstance(data["schemas"], list) - assert len(data["schemas"]) > 0 - - # --------------------------------------------------------------------------- # show_validators # --------------------------------------------------------------------------- def test_show_validators_list_text(capsys: pytest.CaptureFixture[str]) -> None: - controller = IntrospectionController(output_format="text") + controller = IntrospectionController() controller.show_validators(type_name=None) captured = capsys.readouterr() assert 
"validator_type" in captured.out assert "params_class" in captured.out -def test_show_validators_list_json(capsys: pytest.CaptureFixture[str]) -> None: - controller = IntrospectionController(output_format="json") - controller.show_validators(type_name=None) - captured = capsys.readouterr() - data = json.loads(captured.out) - assert isinstance(data, dict) - assert len(data) > 0 - - def test_show_validators_specific_text(capsys: pytest.CaptureFixture[str]) -> None: - controller = IntrospectionController(output_format="text") + controller = IntrospectionController() controller.show_validators(type_name="code") captured = capsys.readouterr() assert "validator_type: code" in captured.out def test_show_validators_all_case_insensitive(capsys: pytest.CaptureFixture[str]) -> None: - controller = IntrospectionController(output_format="text") + controller = IntrospectionController() controller.show_validators(type_name="ALL") captured = capsys.readouterr() assert "Data Designer Validator Types Reference" in captured.out assert "validator_type: code" in captured.out -def test_show_validators_specific_json(capsys: pytest.CaptureFixture[str]) -> None: - controller = IntrospectionController(output_format="json") - controller.show_validators(type_name="code") - captured = capsys.readouterr() - data = json.loads(captured.out) - assert isinstance(data, dict) - assert "fields" in data - - # --------------------------------------------------------------------------- # show_processors # --------------------------------------------------------------------------- def test_show_processors_list_text(capsys: pytest.CaptureFixture[str]) -> None: - controller = IntrospectionController(output_format="text") + controller = IntrospectionController() controller.show_processors(type_name=None) captured = capsys.readouterr() assert "processor_type" in captured.out assert "config_class" in captured.out - - -def test_show_processors_list_json(capsys: pytest.CaptureFixture[str]) -> None: - controller 
= IntrospectionController(output_format="json") - controller.show_processors(type_name=None) - captured = capsys.readouterr() - data = json.loads(captured.out) - assert isinstance(data, dict) - assert len(data) > 0 - - -# --------------------------------------------------------------------------- -# show_imports (with category filter) -# --------------------------------------------------------------------------- - - -def test_show_imports_text(capsys: pytest.CaptureFixture[str]) -> None: - controller = IntrospectionController(output_format="text") - controller.show_imports() - captured = capsys.readouterr() - assert "import data_designer.config as dd" in captured.out - assert "dd." in captured.out - - -def test_show_imports_with_category_filter(capsys: pytest.CaptureFixture[str]) -> None: - controller = IntrospectionController(output_format="text") - controller.show_imports(category="columns") - captured = capsys.readouterr() - assert "Column Configs" in captured.out - assert "dd." in captured.out - - -def test_show_imports_with_category_filter_json(capsys: pytest.CaptureFixture[str]) -> None: - controller = IntrospectionController(output_format="json") - controller.show_imports(category="columns") - captured = capsys.readouterr() - data = json.loads(captured.out) - assert isinstance(data, dict) - assert "categories" in data - assert "Column Configs" in data["categories"] - - -def test_show_imports_with_invalid_category(capsys: pytest.CaptureFixture[str]) -> None: - controller = IntrospectionController(output_format="text") - with pytest.raises(click.exceptions.Exit): - controller.show_imports(category="nonexistent_xyz") - - -# --------------------------------------------------------------------------- -# show_code_structure -# --------------------------------------------------------------------------- - - -def test_show_code_structure_text(capsys: pytest.CaptureFixture[str]) -> None: - controller = IntrospectionController(output_format="text") - 
controller.show_code_structure() - captured = capsys.readouterr() - assert "data_designer code structure" in captured.out - assert "data_designer/" in captured.out - - -def test_show_code_structure_json(capsys: pytest.CaptureFixture[str]) -> None: - controller = IntrospectionController(output_format="json") - controller.show_code_structure() - captured = capsys.readouterr() - data = json.loads(captured.out) - assert "paths" in data - assert "tree" in data - assert data["tree"]["name"] == "data_designer" - - -def test_show_code_structure_negative_depth_exits(capsys: pytest.CaptureFixture[str]) -> None: - """Invalid depth < 0 should exit with code 1 and an actionable message.""" - import click.exceptions - - controller = IntrospectionController(output_format="text") - with pytest.raises(click.exceptions.Exit): - controller.show_code_structure(depth=-1) - captured = capsys.readouterr() - assert "depth" in captured.err.lower() or ">= 0" in captured.err - - -def test_show_seeds_json_includes_enum_values(capsys: pytest.CaptureFixture[str]) -> None: - """JSON schema list for seeds includes 'values' for enum-only types (e.g. SamplingStrategy).""" - controller = IntrospectionController(output_format="json") - controller.show_seeds() - captured = capsys.readouterr() - data = json.loads(captured.out) - assert isinstance(data, list) - enum_entries = [e for e in data if e.get("class_name") == "SamplingStrategy"] - assert len(enum_entries) >= 1 - assert "values" in enum_entries[0] - assert isinstance(enum_entries[0]["values"], list) - assert "ordered" in enum_entries[0]["values"] or "shuffle" in enum_entries[0]["values"] - - -def test_show_seeds_text_uses_enum_values_not_names(capsys: pytest.CaptureFixture[str]) -> None: - """Text output for seeds uses enum .value (e.g. 
ordered, shuffle) for parity with JSON.""" - controller = IntrospectionController(output_format="text") - controller.show_seeds() - captured = capsys.readouterr() - assert "SamplingStrategy" in captured.out - assert "values: [ordered, shuffle]" in captured.out or "values: [shuffle, ordered]" in captured.out - - -# --------------------------------------------------------------------------- -# _match_category -# --------------------------------------------------------------------------- - - -def test_match_category_exact_match() -> None: - keys = ["Column Configs", "Builder", "Model Configs"] - assert IntrospectionController._match_category("Column Configs", keys) == "Column Configs" - - -def test_match_category_exact_match_case_insensitive() -> None: - keys = ["Column Configs", "Builder", "Model Configs"] - assert IntrospectionController._match_category("column configs", keys) == "Column Configs" - assert IntrospectionController._match_category("BUILDER", keys) == "Builder" - - -def test_match_category_first_word_stem_match() -> None: - keys = ["Column Configs", "Builder", "Model Configs"] - # "columns" -> rstrip("s") -> "column", matches first word "column" of "Column Configs" - assert IntrospectionController._match_category("columns", keys) == "Column Configs" - - -def test_match_category_first_word_stem_match_singular() -> None: - keys = ["Column Configs", "Builder", "Model Configs"] - # "column" is already stemmed, matches first word "column" - assert IntrospectionController._match_category("column", keys) == "Column Configs" - - -def test_match_category_any_word_stem_match() -> None: - keys = ["Column Configs", "Builder", "Model Configs"] - # "configs" -> rstrip("s") -> "config", matches second word of "Column Configs" - assert IntrospectionController._match_category("configs", keys) == "Column Configs" - - -def test_match_category_substring_match() -> None: - keys = ["Column Configs", "Builder", "Model Configs"] - # "uild" is a substring of "Builder" - assert 
IntrospectionController._match_category("uild", keys) == "Builder" - - -def test_match_category_substring_picks_earliest_position() -> None: - keys = ["ABC-foo", "foo-ABC"] - # "foo" appears at position 4 in "ABC-foo" and position 0 in "foo-ABC" - assert IntrospectionController._match_category("foo", keys) == "foo-ABC" - - -def test_match_category_no_match() -> None: - keys = ["Column Configs", "Builder", "Model Configs"] - assert IntrospectionController._match_category("zzzzz_nonexistent", keys) is None - - -def test_match_category_empty_string() -> None: - keys = ["Column Configs", "Builder", "Model Configs"] - # Empty string is a substring of everything; earliest position (0) wins - result = IntrospectionController._match_category("", keys) - assert result is not None - - -def test_match_category_process_rstrip_s_edge_case() -> None: - """Words ending in 's' naturally (like 'process') still work after rstrip('s').""" - keys = ["Processors", "Builder"] - # "process" -> rstrip("s") -> "proces" - # First-word stem: "Processors" first word is "processor" -> rstrip("s") -> "processor" != "proces" - # Any-word stem: same - # Falls to substring: "process" is a substring of "Processors" at pos 0 - assert IntrospectionController._match_category("process", keys) == "Processors" - - -def test_match_category_empty_keys_list() -> None: - assert IntrospectionController._match_category("anything", []) is None - - -def test_match_category_model_stem() -> None: - keys = ["Column Configs", "Builder", "Model Configs"] - # "models" -> rstrip("s") -> "model", matches first word "model" of "Model Configs" - assert IntrospectionController._match_category("models", keys) == "Model Configs" diff --git a/packages/data-designer/tests/cli/services/introspection/test_discovery.py b/packages/data-designer/tests/cli/services/introspection/test_discovery.py index f38a66512..7d9087cbb 100644 --- a/packages/data-designer/tests/cli/services/introspection/test_discovery.py +++ 
b/packages/data-designer/tests/cli/services/introspection/test_discovery.py @@ -3,20 +3,12 @@ from __future__ import annotations -import pytest - from data_designer.cli.services.introspection.discovery import ( _discover_by_modules, discover_column_configs, discover_constraint_types, - discover_importable_names, - discover_interface_classes, - discover_mcp_types, - discover_model_configs, - discover_namespace_tree, discover_processor_configs, discover_sampler_types, - discover_seed_types, discover_validator_types, ) @@ -94,23 +86,6 @@ def test_discover_processor_configs_contains_expected_keys() -> None: assert "drop_columns" in result, f"Expected 'drop_columns' not found in {list(result.keys())}" -# --------------------------------------------------------------------------- -# discover_model_configs -# --------------------------------------------------------------------------- - - -def test_discover_model_configs_returns_dict() -> None: - result = discover_model_configs() - assert isinstance(result, dict) - assert len(result) > 0 - - -def test_discover_model_configs_contains_expected_keys() -> None: - result = discover_model_configs() - for expected_key in ("ModelConfig", "ChatCompletionInferenceParams"): - assert expected_key in result, f"Expected key '{expected_key}' not found in {list(result.keys())}" - - # --------------------------------------------------------------------------- # discover_constraint_types # --------------------------------------------------------------------------- @@ -127,156 +102,6 @@ def test_discover_constraint_types_contains_expected_keys() -> None: assert "ScalarInequalityConstraint" in result -# --------------------------------------------------------------------------- -# discover_seed_types -# --------------------------------------------------------------------------- - - -def test_discover_seed_types_returns_dict() -> None: - result = discover_seed_types() - assert isinstance(result, dict) - assert len(result) > 0 - - -def 
test_discover_seed_types_contains_expected_keys() -> None: - result = discover_seed_types() - for expected_key in ("SeedConfig", "LocalFileSeedSource"): - assert expected_key in result, f"Expected key '{expected_key}' not found in {list(result.keys())}" - - -# --------------------------------------------------------------------------- -# discover_mcp_types -# --------------------------------------------------------------------------- - - -def test_discover_mcp_types_returns_dict() -> None: - result = discover_mcp_types() - assert isinstance(result, dict) - assert len(result) > 0 - - -def test_discover_mcp_types_contains_expected_keys() -> None: - result = discover_mcp_types() - for expected_key in ("MCPProvider", "ToolConfig"): - assert expected_key in result, f"Expected key '{expected_key}' not found in {list(result.keys())}" - - -# --------------------------------------------------------------------------- -# discover_namespace_tree -# --------------------------------------------------------------------------- - - -def test_discover_namespace_tree_returns_paths_and_tree() -> None: - result = discover_namespace_tree() - assert "paths" in result - assert "tree" in result - - -def test_discover_namespace_tree_paths_non_empty() -> None: - result = discover_namespace_tree() - assert isinstance(result["paths"], list) - assert len(result["paths"]) > 0 - for p in result["paths"]: - assert isinstance(p, str) - - -def test_discover_namespace_tree_root_is_data_designer() -> None: - result = discover_namespace_tree() - tree = result["tree"] - assert tree["name"] == "data_designer" - assert tree["is_package"] is True - - -def test_discover_namespace_tree_contains_expected_children() -> None: - result = discover_namespace_tree() - child_names = [c["name"] for c in result["tree"]["children"]] - for expected in ("config", "engine", "cli"): - assert expected in child_names, f"Expected '{expected}' in {child_names}" - - -def 
test_discover_namespace_tree_children_have_correct_structure() -> None: - result = discover_namespace_tree() - for child in result["tree"]["children"]: - assert "name" in child - assert "is_package" in child - assert "children" in child - assert isinstance(child["name"], str) - assert isinstance(child["is_package"], bool) - assert isinstance(child["children"], list) - - -def test_discover_namespace_tree_negative_depth_raises() -> None: - """Invalid max_depth < 0 raises ValueError with actionable message.""" - with pytest.raises(ValueError, match="max_depth must be >= 0"): - discover_namespace_tree(max_depth=-1) - - -def test_discover_namespace_tree_import_errors_structure() -> None: - """When present, import_errors is a list of dicts with module and message.""" - result = discover_namespace_tree() - if "import_errors" in result: - errors = result["import_errors"] - assert isinstance(errors, list) - for err in errors: - assert "module" in err - assert "message" in err - assert isinstance(err["module"], str) - assert isinstance(err["message"], str) - - -# --------------------------------------------------------------------------- -# discover_interface_classes -# --------------------------------------------------------------------------- - - -def test_discover_interface_classes_returns_dict() -> None: - result = discover_interface_classes() - assert isinstance(result, dict) - assert len(result) > 0 - - -def test_discover_interface_classes_contains_expected_keys() -> None: - result = discover_interface_classes() - for expected_key in ("DataDesigner", "DatasetCreationResults", "PreviewResults", "RunConfig"): - assert expected_key in result, f"Expected key '{expected_key}' not found in {list(result.keys())}" - - -def test_discover_interface_classes_values_are_classes() -> None: - result = discover_interface_classes() - for cls in result.values(): - assert isinstance(cls, type) - - -# --------------------------------------------------------------------------- -# 
discover_importable_names -# --------------------------------------------------------------------------- - - -def test_discover_importable_names_returns_dict() -> None: - result = discover_importable_names() - assert isinstance(result, dict) - assert len(result) > 0 - - -def test_discover_importable_names_has_column_configs_category() -> None: - result = discover_importable_names() - assert "Column Configs" in result, f"Expected 'Column Configs' in {list(result.keys())}" - - -def test_discover_importable_names_has_interface_category() -> None: - result = discover_importable_names() - assert "Interface" in result, f"Expected 'Interface' in {list(result.keys())}" - - -def test_discover_importable_names_entries_have_name_and_module() -> None: - result = discover_importable_names() - for category, entries in result.items(): - assert isinstance(entries, list), f"Category '{category}' value is not a list" - for entry in entries: - assert "name" in entry, f"Entry in '{category}' missing 'name'" - assert "module" in entry, f"Entry in '{category}' missing 'module'" - - # --------------------------------------------------------------------------- # _discover_by_modules # --------------------------------------------------------------------------- @@ -300,14 +125,3 @@ def test_discover_by_modules_with_multiple_suffixes() -> None: def test_discover_by_modules_unknown_suffix_returns_empty() -> None: result = _discover_by_modules("nonexistent_module") assert result == {} - - -# --------------------------------------------------------------------------- -# discover_interface_classes — error class exclusion -# --------------------------------------------------------------------------- - - -def test_discover_interface_classes_excludes_exceptions() -> None: - result = discover_interface_classes() - for name, cls in result.items(): - assert not issubclass(cls, Exception), f"{name} is an Exception subclass and should be excluded" diff --git 
a/packages/data-designer/tests/cli/services/introspection/test_field_descriptions.py b/packages/data-designer/tests/cli/services/introspection/test_field_descriptions.py index d8f8accba..edfbec7ab 100644 --- a/packages/data-designer/tests/cli/services/introspection/test_field_descriptions.py +++ b/packages/data-designer/tests/cli/services/introspection/test_field_descriptions.py @@ -8,12 +8,8 @@ from data_designer.cli.services.introspection.discovery import ( discover_column_configs, discover_constraint_types, - discover_interface_classes, - discover_mcp_types, - discover_model_configs, discover_processor_configs, discover_sampler_types, - discover_seed_types, discover_validator_types, ) @@ -31,11 +27,7 @@ def _collect_models_with_fields() -> list[tuple[str, str, type]]: ("sampler_types", discover_sampler_types()), ("validator_types", discover_validator_types()), ("processor_configs", discover_processor_configs()), - ("model_configs", discover_model_configs()), ("constraint_types", discover_constraint_types()), - ("seed_types", discover_seed_types()), - ("mcp_types", discover_mcp_types()), - ("interface_classes", discover_interface_classes()), ] for source_label, discovered in discovery_sources: diff --git a/packages/data-designer/tests/cli/services/introspection/test_formatters.py b/packages/data-designer/tests/cli/services/introspection/test_formatters.py index 4130137f0..cc3e32e56 100644 --- a/packages/data-designer/tests/cli/services/introspection/test_formatters.py +++ b/packages/data-designer/tests/cli/services/introspection/test_formatters.py @@ -4,49 +4,17 @@ from __future__ import annotations from data_designer.cli.services.introspection.formatters import ( - format_imports_json, - format_imports_text, - format_interface_json, - format_interface_text, format_method_info_json, format_method_info_text, - format_model_schema_json, - format_model_schema_text, - format_namespace_json, - format_namespace_text, - format_overview_text, format_type_list_text, ) 
from data_designer.cli.services.introspection.method_inspector import MethodInfo, ParamInfo -from data_designer.cli.services.introspection.pydantic_inspector import FieldDetail, ModelSchema # --------------------------------------------------------------------------- # Helpers # --------------------------------------------------------------------------- -def _make_field(name: str = "my_field", type_str: str = "str", description: str = "A field") -> FieldDetail: - return FieldDetail(name=name, type_str=type_str, description=description) - - -def _make_schema( - class_name: str = "TestModel", - description: str = "A test model.", - schema_ref: str | None = None, - type_key: str | None = None, - type_value: str | None = None, - fields: list[FieldDetail] | None = None, -) -> ModelSchema: - return ModelSchema( - class_name=class_name, - description=description, - schema_ref=schema_ref, - type_key=type_key, - type_value=type_value, - fields=fields or [_make_field()], - ) - - def _make_method( name: str = "do_thing", signature: str = "do_thing(x: int) -> str", @@ -63,209 +31,6 @@ def _make_method( ) -# --------------------------------------------------------------------------- -# format_model_schema_text -# --------------------------------------------------------------------------- - - -def test_format_model_schema_text_basic() -> None: - schema = _make_schema() - text = format_model_schema_text(schema) - assert "TestModel:" in text - assert "description: A test model." 
in text - assert "my_field: str" in text - assert "[required]" in text - - -def test_format_model_schema_text_with_type_key() -> None: - schema = _make_schema(type_key="column_type", type_value="llm-text") - text = format_model_schema_text(schema) - assert "column_type: llm-text" in text - - -def test_format_model_schema_text_with_nested_schema() -> None: - nested = _make_schema(class_name="NestedModel", description="Nested.") - outer_field = FieldDetail( - name="child", - type_str="NestedModel", - description="A nested model", - nested_schema=nested, - ) - schema = _make_schema(fields=[outer_field]) - text = format_model_schema_text(schema) - assert "schema (NestedModel):" in text - - -def test_format_model_schema_text_with_enum_values() -> None: - field = FieldDetail( - name="color", - type_str="ColorEnum", - description="Pick a color", - enum_values=["red", "green", "blue"], - ) - schema = _make_schema(fields=[field]) - text = format_model_schema_text(schema) - assert "values: [red, green, blue]" in text - - -def test_format_model_schema_text_with_default() -> None: - field = FieldDetail( - name="count", - type_str="int", - description="A count", - required=False, - default="0", - ) - schema = _make_schema(fields=[field]) - text = format_model_schema_text(schema) - assert "count: int = 0" in text - assert "[required]" not in text - - -def test_format_model_schema_text_with_constraints() -> None: - field = FieldDetail( - name="score", - type_str="float", - description="A score", - required=False, - default="0.5", - constraints={"ge": 0.0, "le": 1.0}, - ) - schema = _make_schema(fields=[field]) - text = format_model_schema_text(schema) - assert "constraints: ge=0.0, le=1.0" in text - - -# --------------------------------------------------------------------------- -# format_model_schema_json -# --------------------------------------------------------------------------- - - -def test_format_model_schema_json_basic() -> None: - schema = _make_schema() - result = 
format_model_schema_json(schema) - assert isinstance(result, dict) - assert result["class_name"] == "TestModel" - assert result["description"] == "A test model." - assert isinstance(result["fields"], list) - assert len(result["fields"]) == 1 - assert result["fields"][0]["name"] == "my_field" - - -def test_format_model_schema_json_with_type_key() -> None: - schema = _make_schema(type_key="column_type", type_value="sampler") - result = format_model_schema_json(schema) - assert result["column_type"] == "sampler" - - -def test_format_model_schema_json_includes_required_and_default() -> None: - field = FieldDetail(name="val", type_str="int", description="Value", required=False, default="42") - schema = _make_schema(fields=[field]) - result = format_model_schema_json(schema) - f = result["fields"][0] - assert f["required"] is False - assert f["default"] == "42" - - -def test_format_model_schema_json_required_field_no_default() -> None: - field = FieldDetail(name="val", type_str="str", description="Value", required=True) - schema = _make_schema(fields=[field]) - result = format_model_schema_json(schema) - f = result["fields"][0] - assert f["required"] is True - assert "default" not in f - - -def test_format_model_schema_json_native_default_value() -> None: - """JSON output uses native types for default (e.g. 
int, bool, null).""" - field = FieldDetail( - name="count", - type_str="int", - description="Count", - required=False, - default_json=42, - default_factory=None, - ) - schema = _make_schema(fields=[field]) - result = format_model_schema_json(schema) - f = result["fields"][0] - assert f["required"] is False - assert f["default"] == 42 - assert "default_factory" not in f - - -def test_format_model_schema_json_default_factory_key() -> None: - """JSON output includes default_factory when the field uses a factory.""" - field = FieldDetail( - name="items", - type_str="list[str]", - description="Items", - required=False, - default_factory="list", - ) - schema = _make_schema(fields=[field]) - result = format_model_schema_json(schema) - f = result["fields"][0] - assert f["required"] is False - assert f["default_factory"] == "list" - assert "default" not in f - - -def test_format_model_schema_json_explicit_null_default() -> None: - """JSON output uses null for explicit None default.""" - field = FieldDetail( - name="optional", - type_str="str | None", - description="Optional", - required=False, - default_json=None, - default_factory=None, - ) - schema = _make_schema(fields=[field]) - result = format_model_schema_json(schema) - f = result["fields"][0] - assert f["required"] is False - assert f["default"] is None - - -def test_format_model_schema_json_includes_constraints() -> None: - field = FieldDetail( - name="score", - type_str="float", - description="Score", - required=False, - default="0.5", - constraints={"ge": 0.0, "le": 1.0}, - ) - schema = _make_schema(fields=[field]) - result = format_model_schema_json(schema) - f = result["fields"][0] - assert f["constraints"] == {"ge": 0.0, "le": 1.0} - - -def test_format_model_schema_json_no_constraints_key_when_none() -> None: - field = FieldDetail(name="val", type_str="str", description="Value") - schema = _make_schema(fields=[field]) - result = format_model_schema_json(schema) - f = result["fields"][0] - assert "constraints" 
not in f - - -def test_format_model_schema_json_with_nested() -> None: - nested = _make_schema(class_name="Inner", description="Inner model.") - outer_field = FieldDetail( - name="inner", - type_str="Inner", - description="Nested", - nested_schema=nested, - ) - schema = _make_schema(fields=[outer_field]) - result = format_model_schema_json(schema) - inner_field = result["fields"][0] - assert "schema" in inner_field - assert inner_field["schema"]["class_name"] == "Inner" - - # --------------------------------------------------------------------------- # format_method_info_text # --------------------------------------------------------------------------- @@ -357,295 +122,3 @@ class C: def test_format_type_list_text_empty() -> None: text = format_type_list_text({}, "Type", "Class") assert "(no items)" in text - - -# --------------------------------------------------------------------------- -# format_overview_text -# --------------------------------------------------------------------------- - - -def test_format_overview_text_contains_header() -> None: - type_counts = {"Column types": 5, "Sampler types": 3} - methods = [_make_method()] - text = format_overview_text(type_counts, methods) - assert "Data Designer API Overview" in text - - -def test_format_overview_text_contains_type_counts() -> None: - type_counts = {"Column types": 5, "Sampler types": 3} - methods = [_make_method()] - text = format_overview_text(type_counts, methods) - assert "Type Counts:" in text - assert "Column types:" in text - assert "5" in text - assert "Sampler types:" in text - assert "3" in text - - -def test_format_overview_text_contains_builder_methods() -> None: - type_counts = {"Column types": 5} - methods = [_make_method(name="add_column")] - text = format_overview_text(type_counts, methods) - assert "Builder Methods" in text - assert "add_column(...)" in text - - -def test_format_overview_text_contains_quick_start() -> None: - type_counts = {"Column types": 1} - text = 
format_overview_text(type_counts, []) - assert "Quick Start Commands:" in text - assert "types columns" in text - - -# --------------------------------------------------------------------------- -# Namespace / code-structure formatters -# --------------------------------------------------------------------------- - - -def _make_namespace_data() -> dict: - return { - "paths": ["/fake/site-packages/data_designer"], - "tree": { - "name": "data_designer", - "is_package": True, - "children": [ - { - "name": "config", - "is_package": True, - "children": [ - {"name": "column_configs", "is_package": False, "children": []}, - {"name": "models", "is_package": False, "children": []}, - ], - }, - { - "name": "errors", - "is_package": False, - "children": [], - }, - ], - }, - } - - -def test_format_namespace_text_contains_tree_characters() -> None: - text = format_namespace_text(_make_namespace_data()) - assert "├──" in text or "└──" in text - assert "│" in text - - -def test_format_namespace_text_shows_install_path() -> None: - text = format_namespace_text(_make_namespace_data()) - assert "Install path:" in text - assert "/fake/site-packages/data_designer" in text - - -def test_format_namespace_text_packages_have_trailing_slash() -> None: - text = format_namespace_text(_make_namespace_data()) - assert "config/" in text - - -def test_format_namespace_text_modules_have_py_extension() -> None: - text = format_namespace_text(_make_namespace_data()) - assert "errors.py" in text - assert "column_configs.py" in text - - -def test_format_namespace_text_contains_agent_guidance() -> None: - text = format_namespace_text(_make_namespace_data()) - assert "Only read source files directly" in text - - -def test_format_namespace_json_returns_passthrough() -> None: - data = _make_namespace_data() - result = format_namespace_json(data) - assert result is data - - -def test_format_namespace_text_shows_import_warnings_when_present() -> None: - data = _make_namespace_data() - data["import_errors"] 
= [ - {"module": "data_designer.fake_submodule", "message": "No module named 'fake'"}, - ] - text = format_namespace_text(data) - assert "Warnings" in text - assert "data_designer.fake_submodule" in text - assert "No module named" in text - - -def test_format_namespace_json_includes_import_errors_when_present() -> None: - data = _make_namespace_data() - data["import_errors"] = [{"module": "data_designer.foo", "message": "err"}] - result = format_namespace_json(data) - assert "import_errors" in result - assert result["import_errors"] == [{"module": "data_designer.foo", "message": "err"}] - - -# --------------------------------------------------------------------------- -# Interface formatters -# --------------------------------------------------------------------------- - - -def _make_interface_data() -> tuple[list[tuple[str, list[MethodInfo]]], list[ModelSchema]]: - methods_data = [ - ("DataDesigner", [_make_method(name="create", signature="create(...) -> DatasetCreationResults")]), - ("DatasetCreationResults", [_make_method(name="load_dataset", signature="load_dataset() -> pd.DataFrame")]), - ] - schemas = [_make_schema(class_name="RunConfig", description="Runtime configuration.")] - return methods_data, schemas - - -def test_format_interface_text_contains_class_names() -> None: - methods_data, schemas = _make_interface_data() - text = format_interface_text(methods_data, schemas) - assert "DataDesigner" in text - assert "DatasetCreationResults" in text - assert "RunConfig" in text - - -def test_format_interface_text_contains_methods() -> None: - methods_data, schemas = _make_interface_data() - text = format_interface_text(methods_data, schemas) - assert "create" in text - assert "load_dataset" in text - - -def test_format_interface_text_contains_run_config_fields() -> None: - methods_data, schemas = _make_interface_data() - text = format_interface_text(methods_data, schemas) - assert "my_field" in text - - -def test_format_interface_json_structure() -> None: - 
methods_data, schemas = _make_interface_data() - result = format_interface_json(methods_data, schemas) - assert isinstance(result, dict) - assert "methods" in result - assert "schemas" in result - assert "DataDesigner" in result["methods"] - assert "DatasetCreationResults" in result["methods"] - assert isinstance(result["schemas"], list) - assert len(result["schemas"]) == 1 - - -# --------------------------------------------------------------------------- -# Imports formatters -# --------------------------------------------------------------------------- - - -def _make_imports_data() -> dict[str, list[dict[str, str]]]: - return { - "Column Configs": [ - {"name": "LLMTextColumnConfig", "module": "data_designer.config"}, - {"name": "SamplerColumnConfig", "module": "data_designer.config"}, - ], - "Interface": [ - {"name": "DataDesigner", "module": "data_designer.interface"}, - ], - } - - -def test_format_imports_text_contains_recommended_imports() -> None: - text = format_imports_text(_make_imports_data()) - assert "Recommended imports:" in text - assert "import data_designer.config as dd" in text - assert "from data_designer.interface import DataDesigner" in text - - -def test_format_imports_text_config_names_use_dd_prefix() -> None: - text = format_imports_text(_make_imports_data()) - assert "dd.LLMTextColumnConfig" in text - assert "dd.SamplerColumnConfig" in text - assert "from data_designer.config import" not in text - - -def test_format_imports_text_interface_uses_from_import() -> None: - text = format_imports_text(_make_imports_data()) - assert "from data_designer.interface import DataDesigner" in text - - -def test_format_imports_text_has_category_headers() -> None: - text = format_imports_text(_make_imports_data()) - assert "Column Configs (2 names):" in text - assert "Interface (1 name):" in text - - -def test_format_imports_json_structure() -> None: - data = _make_imports_data() - result = format_imports_json(data) - assert isinstance(result, dict) - assert 
"recommended_imports" in result - assert "config_alias" in result - assert result["config_alias"] == "dd" - assert "categories" in result - assert "Column Configs" in result["categories"] - assert "Interface" in result["categories"] - - -def test_format_imports_json_category_structure() -> None: - data = _make_imports_data() - result = format_imports_json(data) - config_cat = result["categories"]["Column Configs"] - assert config_cat["module"] == "data_designer.config" - assert config_cat["access_pattern"] == "dd." - assert "LLMTextColumnConfig" in config_cat["names"] - - interface_cat = result["categories"]["Interface"] - assert interface_cat["module"] == "data_designer.interface" - assert "from data_designer.interface import " in interface_cat["access_pattern"] - assert "DataDesigner" in interface_cat["names"] - - -# --------------------------------------------------------------------------- -# Schema deduplication -# --------------------------------------------------------------------------- - - -def test_format_field_text_deduplicates_nested_schemas() -> None: - """When seen_schemas is passed, second occurrence of a nested schema shows a back-reference.""" - nested = _make_schema(class_name="SharedNested", description="Shared nested model.") - field1 = FieldDetail(name="field_a", type_str="SharedNested", description="First ref", nested_schema=nested) - field2 = FieldDetail(name="field_b", type_str="SharedNested", description="Second ref", nested_schema=nested) - - schema1 = _make_schema(class_name="Model1", fields=[field1]) - schema2 = _make_schema(class_name="Model2", fields=[field2]) - - seen: set[str] = set() - text1 = format_model_schema_text(schema1, seen_schemas=seen) - text2 = format_model_schema_text(schema2, seen_schemas=seen) - - assert "schema (SharedNested):" in text1 - assert "see SharedNested above" in text2 - assert "schema (SharedNested):" not in text2 - - -def test_format_field_text_no_dedup_without_seen_set() -> None: - """Without 
seen_schemas, nested schemas always expand fully.""" - nested = _make_schema(class_name="Inner", description="Inner model.") - field1 = FieldDetail(name="x", type_str="Inner", description="Ref", nested_schema=nested) - schema = _make_schema(fields=[field1]) - - text = format_model_schema_text(schema) - assert "schema (Inner):" in text - - -def test_format_field_text_dedup_uses_schema_ref_to_avoid_name_collisions() -> None: - """Schemas with identical class names but different refs should both expand.""" - nested_a = _make_schema(class_name="SharedName", schema_ref="pkg.alpha.SharedName") - nested_b = _make_schema(class_name="SharedName", schema_ref="pkg.beta.SharedName") - - schema_a = _make_schema( - class_name="OuterA", - fields=[FieldDetail(name="a", type_str="SharedName", description="Ref A", nested_schema=nested_a)], - ) - schema_b = _make_schema( - class_name="OuterB", - fields=[FieldDetail(name="b", type_str="SharedName", description="Ref B", nested_schema=nested_b)], - ) - - seen: set[str] = set() - text_a = format_model_schema_text(schema_a, seen_schemas=seen) - text_b = format_model_schema_text(schema_b, seen_schemas=seen) - - assert "schema (SharedName):" in text_a - assert "schema (SharedName):" in text_b - assert "see SharedName above" not in text_b diff --git a/packages/data-designer/tests/cli/services/introspection/test_pydantic_inspector.py b/packages/data-designer/tests/cli/services/introspection/test_pydantic_inspector.py index a269b680f..bd60c0bdc 100644 --- a/packages/data-designer/tests/cli/services/introspection/test_pydantic_inspector.py +++ b/packages/data-designer/tests/cli/services/introspection/test_pydantic_inspector.py @@ -9,17 +9,14 @@ from pydantic import BaseModel, Field from data_designer.cli.services.introspection.pydantic_inspector import ( - FieldDetail, - ModelSchema, _extract_constraints, _extract_enum_class, + _extract_nested_basemodel, _is_basemodel_subclass, _is_enum_subclass, - build_model_schema, - 
extract_nested_basemodel, + format_model_text, format_type, get_brief_description, - get_field_info, ) # --------------------------------------------------------------------------- @@ -169,45 +166,45 @@ def test_extract_enum_class_none() -> None: def test_extract_nested_basemodel_direct() -> None: - assert extract_nested_basemodel(InnerModel) is InnerModel + assert _extract_nested_basemodel(InnerModel) is InnerModel def test_extract_nested_basemodel_list() -> None: - assert extract_nested_basemodel(list[InnerModel]) is InnerModel + assert _extract_nested_basemodel(list[InnerModel]) is InnerModel def test_extract_nested_basemodel_optional() -> None: - assert extract_nested_basemodel(InnerModel | None) is InnerModel + assert _extract_nested_basemodel(InnerModel | None) is InnerModel def test_extract_nested_basemodel_optional_list() -> None: - assert extract_nested_basemodel(list[InnerModel] | None) is InnerModel + assert _extract_nested_basemodel(list[InnerModel] | None) is InnerModel def test_extract_nested_basemodel_dict() -> None: - assert extract_nested_basemodel(dict[str, InnerModel]) is InnerModel + assert _extract_nested_basemodel(dict[str, InnerModel]) is InnerModel def test_extract_nested_basemodel_annotated() -> None: - assert extract_nested_basemodel(Annotated[InnerModel, "info"]) is InnerModel + assert _extract_nested_basemodel(Annotated[InnerModel, "info"]) is InnerModel def test_extract_nested_basemodel_discriminated_union_returns_none() -> None: """Unions of 2+ BaseModel subclasses should return None.""" - assert extract_nested_basemodel(InnerModel | OuterModel) is None + assert _extract_nested_basemodel(InnerModel | OuterModel) is None def test_extract_nested_basemodel_primitive_returns_none() -> None: - assert extract_nested_basemodel(str) is None - assert extract_nested_basemodel(int) is None + assert _extract_nested_basemodel(str) is None + assert _extract_nested_basemodel(int) is None def test_extract_nested_basemodel_none_returns_none() -> None: 
- assert extract_nested_basemodel(None) is None + assert _extract_nested_basemodel(None) is None def test_extract_nested_basemodel_basemodel_itself_returns_none() -> None: - assert extract_nested_basemodel(BaseModel) is None + assert _extract_nested_basemodel(BaseModel) is None # --------------------------------------------------------------------------- @@ -247,195 +244,209 @@ def test_get_brief_description_without_docstring() -> None: # --------------------------------------------------------------------------- -# get_field_info +# _extract_constraints # --------------------------------------------------------------------------- -def test_get_field_info_returns_field_details() -> None: - fields = get_field_info(OuterModel) - assert isinstance(fields, list) - assert all(isinstance(f, FieldDetail) for f in fields) - names = [f.name for f in fields] - assert "plain" in names - assert "nested" in names - assert "my_enum" in names +def test_extract_constraints_from_constrained_model() -> None: + score_info = ConstrainedModel.model_fields["score"] + constraints = _extract_constraints(score_info) + assert constraints is not None + assert constraints["ge"] == 0.0 + assert constraints["le"] == 1.0 + + +def test_extract_constraints_gt_lt() -> None: + count_info = ConstrainedModel.model_fields["count"] + constraints = _extract_constraints(count_info) + assert constraints is not None + assert constraints["gt"] == -1 + assert constraints["lt"] == 1000 + +def test_extract_constraints_string_lengths() -> None: + label_info = ConstrainedModel.model_fields["label"] + constraints = _extract_constraints(label_info) + assert constraints is not None + assert constraints["min_length"] == 1 + assert constraints["max_length"] == 100 -def test_get_field_info_enum_values_use_dot_value() -> None: - fields = get_field_info(OuterModel) - enum_field = next(f for f in fields if f.name == "my_enum") - assert enum_field.enum_values is not None - assert set(enum_field.enum_values) == {"red", 
"green", "blue"} +def test_extract_constraints_none_for_unconstrained() -> None: + x_info = InnerModel.model_fields["x"] + assert _extract_constraints(x_info) is None + + +def test_extract_constraints_helper_with_no_metadata() -> None: + """_extract_constraints returns None when field_info has no constraint metadata.""" + + class FakeFieldInfo: + metadata: list = [] -def test_get_field_info_non_enum_has_no_enum_values() -> None: - fields = get_field_info(OuterModel) - plain_field = next(f for f in fields if f.name == "plain") - assert plain_field.enum_values is None + assert _extract_constraints(FakeFieldInfo()) is None # --------------------------------------------------------------------------- -# required / default +# format_model_text # --------------------------------------------------------------------------- -def test_get_field_info_required_field() -> None: - fields = get_field_info(RequiredFieldModel) - req = next(f for f in fields if f.name == "required_name") - assert req.required is True - assert req.default is None +def test_format_model_text_basic_structure() -> None: + text = format_model_text(OuterModel) + assert "OuterModel:" in text + assert "description: Outer model for testing." 
in text + assert "fields:" in text + assert "plain:" in text + assert "nested:" in text + assert "my_enum:" in text -def test_get_field_info_optional_field_default() -> None: - fields = get_field_info(RequiredFieldModel) - opt = next(f for f in fields if f.name == "optional_name") - assert opt.required is False - assert opt.default == "'default_val'" +def test_format_model_text_with_type_key_and_value() -> None: + text = format_model_text(OuterModel, type_key="column_type", type_value="test") + assert "column_type: test" in text -def test_get_field_info_default_factory() -> None: - fields = get_field_info(OuterModel) - nested = next(f for f in fields if f.name == "nested") - assert nested.required is False - assert nested.default == "InnerModel()" +def test_format_model_text_required_field() -> None: + text = format_model_text(RequiredFieldModel) + assert "required_name: str [required]" in text -def test_get_field_info_none_default_not_shown() -> None: - """Fields with default=None (like SelfRefModel.child) have default_json=None in FieldDetail.""" - fields = get_field_info(SelfRefModel) - child = next(f for f in fields if f.name == "child") - assert child.required is False - assert child.default_json is None +def test_format_model_text_optional_field_default() -> None: + text = format_model_text(RequiredFieldModel) + assert "optional_name: str = 'default_val'" in text + assert "[required]" not in text.split("optional_name")[1].split("\n")[0] -def test_get_field_info_optional_field_default_json_native() -> None: - """Optional scalar defaults are stored as native default_json for machine consumption.""" - fields = get_field_info(RequiredFieldModel) - opt = next(f for f in fields if f.name == "optional_name") - assert opt.required is False - assert opt.default_json == "default_val" - assert opt.default_factory is None +def test_format_model_text_default_factory() -> None: + text = format_model_text(OuterModel) + assert "= InnerModel()" in text -def 
test_get_field_info_default_factory_set() -> None: - """Fields with default_factory set have default_factory name and default_json undefined.""" - fields = get_field_info(OuterModel) - nested = next(f for f in fields if f.name == "nested") - assert nested.required is False - assert nested.default_factory == "InnerModel" +def test_format_model_text_none_default() -> None: + text = format_model_text(SelfRefModel) + assert "child:" in text + assert "= None" in text -def test_get_field_info_str_enum_default_json_uses_member_value() -> None: - """Defaults for str-enum fields should be normalized to the enum member's .value.""" - fields = get_field_info(OuterModel) - enum_field = next(f for f in fields if f.name == "my_enum") - assert enum_field.required is False - assert enum_field.default_json == "red" - assert enum_field.default == "'red'" +def test_format_model_text_enum_default_uses_member_value() -> None: + text = format_model_text(OuterModel) + assert "my_enum: ColorEnum = 'red'" in text -# --------------------------------------------------------------------------- -# constraints -# --------------------------------------------------------------------------- +def test_format_model_text_enum_values() -> None: + text = format_model_text(OuterModel) + assert "values: [red, green, blue]" in text -def test_extract_constraints_from_constrained_model() -> None: - fields = get_field_info(ConstrainedModel) - score = next(f for f in fields if f.name == "score") - assert score.constraints is not None - assert score.constraints["ge"] == 0.0 - assert score.constraints["le"] == 1.0 +def test_format_model_text_constraints() -> None: + text = format_model_text(ConstrainedModel) + assert "constraints: ge=0.0, le=1.0" in text -def test_extract_constraints_gt_lt() -> None: - fields = get_field_info(ConstrainedModel) - count = next(f for f in fields if f.name == "count") - assert count.constraints is not None - assert count.constraints["gt"] == -1 - assert count.constraints["lt"] == 
1000 +def test_format_model_text_nested_expansion() -> None: + text = format_model_text(OuterModel) + assert "schema (InnerModel):" in text + # Nested fields should appear indented under the schema + assert "x: int = 0" in text + assert "y: str = 'hello'" in text -def test_extract_constraints_string_lengths() -> None: - fields = get_field_info(ConstrainedModel) - label = next(f for f in fields if f.name == "label") - assert label.constraints is not None - assert label.constraints["min_length"] == 1 - assert label.constraints["max_length"] == 100 +def test_format_model_text_cycle_protection() -> None: + text = format_model_text(SelfRefModel) + # First level should expand + assert "schema (SelfRefModel):" in text + # The recursive child.child should NOT expand again (only one "schema (SelfRefModel):") + assert text.count("schema (SelfRefModel):") == 1 -def test_extract_constraints_none_for_unconstrained() -> None: - fields = get_field_info(InnerModel) - x_field = next(f for f in fields if f.name == "x") - assert x_field.constraints is None +def test_format_model_text_depth_limiting() -> None: + text = format_model_text(DeepA, max_depth=1) + # First level (DeepB) should expand + assert "schema (DeepB):" in text -def test_extract_constraints_helper_with_no_metadata() -> None: - """_extract_constraints returns None when field_info has no constraint metadata.""" +def test_format_model_text_sibling_nested_expands_each() -> None: + """Sibling fields of the same nested type should each include a nested schema.""" + text = format_model_text(SiblingNestedModel) + # Both first and second fields should have InnerModel expanded + assert text.count("schema (InnerModel):") == 2 - class FakeFieldInfo: - metadata: list = [] - assert _extract_constraints(FakeFieldInfo()) is None +def test_format_model_text_deduplication_with_seen_schemas() -> None: + """When seen_schemas is passed across calls, second occurrence shows a back-reference.""" + seen: set[str] = set() + text1 = 
format_model_text(OuterModel, seen_schemas=seen) + text2 = format_model_text(SiblingNestedModel, seen_schemas=seen) + assert "schema (InnerModel):" in text1 + assert "see InnerModel above" in text2 -# --------------------------------------------------------------------------- -# build_model_schema -# --------------------------------------------------------------------------- +def test_format_model_text_no_dedup_without_seen_set() -> None: + """Without seen_schemas, nested schemas always expand fully.""" + text = format_model_text(OuterModel) + assert "schema (InnerModel):" in text -def test_build_model_schema_basic_structure() -> None: - schema = build_model_schema(OuterModel) - assert isinstance(schema, ModelSchema) - assert schema.class_name == "OuterModel" - assert schema.description == "Outer model for testing." - assert len(schema.fields) == 3 +def test_format_model_text_max_depth_zero_blocks_all_nesting() -> None: + """At max_depth=0, nested schemas should not expand.""" + text = format_model_text(OuterModel, max_depth=0) + assert "schema (InnerModel):" not in text + assert "nested:" in text # field still listed, just not expanded -def test_build_model_schema_with_type_key_and_value() -> None: - schema = build_model_schema(OuterModel, type_key="column_type", type_value="test") - assert schema.type_key == "column_type" - assert schema.type_value == "test" +def test_format_model_text_dedup_distinguishes_same_name_different_module() -> None: + """Schemas with same __name__ but different __module__ should not dedup.""" + class SharedNameA(BaseModel): + x: int = 0 -def test_build_model_schema_nested_expansion() -> None: - schema = build_model_schema(OuterModel) - nested_field = next(f for f in schema.fields if f.name == "nested") - assert nested_field.nested_schema is not None - assert nested_field.nested_schema.class_name == "InnerModel" - nested_names = [f.name for f in nested_field.nested_schema.fields] - assert "x" in nested_names - assert "y" in nested_names 
+ class SharedNameB(BaseModel): + y: str = "" + # Make them look like same-named classes from different modules + SharedNameB.__name__ = "SharedNameA" + SharedNameB.__qualname__ = "SharedNameA" + SharedNameA.__module__ = "pkg.alpha" + SharedNameB.__module__ = "pkg.beta" -def test_build_model_schema_cycle_protection() -> None: - schema = build_model_schema(SelfRefModel) - child_field = next(f for f in schema.fields if f.name == "child") - # First level: SelfRefModel.child should be expanded into a nested schema - assert child_field.nested_schema is not None, "First-level expansion must happen" - assert child_field.nested_schema.class_name == "SelfRefModel" - # Second level: the recursive child.child should NOT be expanded (cycle detected) - inner_child = next(f for f in child_field.nested_schema.fields if f.name == "child") - assert inner_child.nested_schema is None, "Cycle protection must block second-level expansion" + class WrapperA(BaseModel): + a: SharedNameA = Field(default_factory=SharedNameA) + class WrapperB(BaseModel): + b: SharedNameB = Field(default_factory=SharedNameB) -def test_build_model_schema_depth_limiting() -> None: - schema = build_model_schema(DeepA, max_depth=1) - b_field = next(f for f in schema.fields if f.name == "b") - # At max_depth=1, the first nested level (DeepB) should still be expanded - assert b_field.nested_schema is not None, "First-level nesting must be expanded at max_depth=1" - assert b_field.nested_schema.class_name == "DeepB" - # But any further nesting within DeepB should be blocked - for f in b_field.nested_schema.fields: - assert f.nested_schema is None, f"Field '{f.name}' should not be expanded beyond max_depth" + WrapperA.model_rebuild() + WrapperB.model_rebuild() + seen: set[str] = set() + text_a = format_model_text(WrapperA, seen_schemas=seen) + text_b = format_model_text(WrapperB, seen_schemas=seen) -def test_build_model_schema_repeated_sibling_nested_expands_each_field() -> None: - """Sibling fields of the same 
nested type should each include a nested schema.""" - schema = build_model_schema(SiblingNestedModel) - first = next(f for f in schema.fields if f.name == "first") - second = next(f for f in schema.fields if f.name == "second") - - assert first.nested_schema is not None - assert second.nested_schema is not None - assert first.nested_schema.class_name == "InnerModel" - assert second.nested_schema.class_name == "InnerModel" + assert "schema (SharedNameA):" in text_a + assert "schema (SharedNameA):" in text_b + assert "see SharedNameA above" not in text_b + + +class Level3(BaseModel): + val: int = 0 + + +class Level2(BaseModel): + val: int = 0 + child: Level3 | None = None + + +class Level1(BaseModel): + val: int = 0 + child: Level2 | None = None + + +Level1.model_rebuild() +Level2.model_rebuild() + + +def test_format_model_text_depth_limiting_blocks_deeper_nesting() -> None: + """With max_depth=1, Level2 expands but Level3 does not.""" + text = format_model_text(Level1, max_depth=1) + assert "schema (Level2):" in text + assert "schema (Level3):" not in text From b1778a701d48137e2dc9a92711649aa46f4144ae Mon Sep 17 00:00:00 2001 From: Johnny Greco Date: Tue, 17 Feb 2026 19:47:06 -0500 Subject: [PATCH 20/37] feat: add list agent-helper command group Replace the flat list-assets command with a list command group under agent_helpers. Add subcommands for columns, samplers, validators, processors, model-aliases, and persona-datasets. Each subcommand includes tip text pointing to the corresponding inspect subcommand. 
--- .../cli/commands/agent_helpers/list.py | 51 ++ .../data_designer/cli/commands/list_assets.py | 19 - .../cli/controllers/list_assets_controller.py | 52 -- .../cli/controllers/list_controller.py | 161 ++++++ .../agent_helpers/test_list_command.py | 115 +++++ .../cli/commands/test_list_assets_command.py | 33 -- .../test_list_assets_controller.py | 126 ----- .../cli/controllers/test_list_controller.py | 462 ++++++++++++++++++ 8 files changed, 789 insertions(+), 230 deletions(-) create mode 100644 packages/data-designer/src/data_designer/cli/commands/agent_helpers/list.py delete mode 100644 packages/data-designer/src/data_designer/cli/commands/list_assets.py delete mode 100644 packages/data-designer/src/data_designer/cli/controllers/list_assets_controller.py create mode 100644 packages/data-designer/src/data_designer/cli/controllers/list_controller.py create mode 100644 packages/data-designer/tests/cli/commands/agent_helpers/test_list_command.py delete mode 100644 packages/data-designer/tests/cli/commands/test_list_assets_command.py delete mode 100644 packages/data-designer/tests/cli/controllers/test_list_assets_controller.py create mode 100644 packages/data-designer/tests/cli/controllers/test_list_controller.py diff --git a/packages/data-designer/src/data_designer/cli/commands/agent_helpers/list.py b/packages/data-designer/src/data_designer/cli/commands/agent_helpers/list.py new file mode 100644 index 000000000..78596e7e5 --- /dev/null +++ b/packages/data-designer/src/data_designer/cli/commands/agent_helpers/list.py @@ -0,0 +1,51 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +import typer + +from data_designer.cli.controllers.list_controller import ListController +from data_designer.config.utils.constants import DATA_DESIGNER_HOME + +list_app = typer.Typer( + name="list", + help="List valid values for configuration fields.", + no_args_is_help=True, +) + + +@list_app.command(name="model-aliases") +def model_aliases_command() -> None: + """List configured model aliases and their backing models.""" + ListController(DATA_DESIGNER_HOME).list_model_aliases() + + +@list_app.command(name="persona-datasets") +def persona_datasets_command() -> None: + """List available persona datasets and their install status.""" + ListController(DATA_DESIGNER_HOME).list_persona_datasets() + + +@list_app.command(name="columns") +def column_types_command() -> None: + """List available column types and their config classes.""" + ListController(DATA_DESIGNER_HOME).list_column_types() + + +@list_app.command(name="samplers") +def sampler_types_command() -> None: + """List available sampler types and their params classes.""" + ListController(DATA_DESIGNER_HOME).list_sampler_types() + + +@list_app.command(name="validators") +def validator_types_command() -> None: + """List available validator types and their params classes.""" + ListController(DATA_DESIGNER_HOME).list_validator_types() + + +@list_app.command(name="processors") +def processor_types_command() -> None: + """List available processor types and their config classes.""" + ListController(DATA_DESIGNER_HOME).list_processor_types() diff --git a/packages/data-designer/src/data_designer/cli/commands/list_assets.py b/packages/data-designer/src/data_designer/cli/commands/list_assets.py deleted file mode 100644 index 68fb3c418..000000000 --- a/packages/data-designer/src/data_designer/cli/commands/list_assets.py +++ /dev/null @@ -1,19 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. 
All rights reserved. -# SPDX-License-Identifier: Apache-2.0 - -from __future__ import annotations - -import typer - -from data_designer.cli.controllers.introspection_controller import OutputFormat -from data_designer.cli.controllers.list_assets_controller import ListAssetsController -from data_designer.config.utils.constants import DATA_DESIGNER_HOME - - -def list_assets_command( - output_format: OutputFormat = typer.Option( - OutputFormat.TEXT, "--format", "-f", help="Output format: 'text' or 'json'." - ), -) -> None: - """List installed and available Nemotron-Persona datasets.""" - ListAssetsController(DATA_DESIGNER_HOME).list_assets(output_format.value) diff --git a/packages/data-designer/src/data_designer/cli/controllers/list_assets_controller.py b/packages/data-designer/src/data_designer/cli/controllers/list_assets_controller.py deleted file mode 100644 index 6fae6b84c..000000000 --- a/packages/data-designer/src/data_designer/cli/controllers/list_assets_controller.py +++ /dev/null @@ -1,52 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 - -from __future__ import annotations - -import json -from pathlib import Path - -import typer - -from data_designer.cli.repositories.persona_repository import PersonaRepository -from data_designer.cli.services.download_service import DownloadService - - -class ListAssetsController: - """Controller for listing managed dataset assets.""" - - def __init__(self, config_dir: Path) -> None: - self.persona_repository = PersonaRepository() - self.service = DownloadService(config_dir, self.persona_repository) - - def list_assets(self, output_format: str) -> None: - """List installed and available Nemotron-Persona datasets. - - Args: - output_format: "text" or "json". 
- """ - all_locales = self.persona_repository.list_all() - installed: list[str] = [] - not_installed: list[str] = [] - - for locale in all_locales: - if self.service.is_locale_downloaded(locale.code): - installed.append(locale.code) - else: - not_installed.append(locale.code) - - if output_format == "json": - typer.echo(json.dumps({"installed": installed, "not_installed": not_installed})) - return - - typer.echo("Nemotron-Persona Datasets") - typer.echo("-" * 25) - - if installed: - typer.echo(f"Usable locales in PersonSamplerParams: {', '.join(installed)}") - else: - typer.echo("No persona datasets installed.") - - if not_installed: - typer.echo(f"Not installed: {', '.join(not_installed)}") - typer.echo("The user can run `data-designer download personas --locale ` to install.") diff --git a/packages/data-designer/src/data_designer/cli/controllers/list_controller.py b/packages/data-designer/src/data_designer/cli/controllers/list_controller.py new file mode 100644 index 000000000..b649511a0 --- /dev/null +++ b/packages/data-designer/src/data_designer/cli/controllers/list_controller.py @@ -0,0 +1,161 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +from pathlib import Path + +import typer + +from data_designer.cli.repositories.model_repository import ModelRepository +from data_designer.cli.repositories.persona_repository import PersonaRepository +from data_designer.cli.repositories.provider_repository import ProviderRepository +from data_designer.cli.services.download_service import DownloadService +from data_designer.cli.services.introspection.discovery import ( + discover_column_configs, + discover_processor_configs, + discover_sampler_types, + discover_validator_types, +) +from data_designer.config.default_model_settings import get_providers_with_missing_api_keys + + +class ListController: + """Controller for listing valid configuration values.""" + + def __init__(self, config_dir: Path) -> None: + self._config_dir = config_dir + self._model_repository = ModelRepository(config_dir) + self._provider_repository = ProviderRepository(config_dir) + self._persona_repository = PersonaRepository() + self._download_service = DownloadService(config_dir, self._persona_repository) + + def list_model_aliases(self) -> None: + """List configured model aliases. + + Only shows aliases whose backing provider has a valid API key. + """ + provider_registry = self._provider_repository.load() + + if not provider_registry or not provider_registry.providers: + typer.echo("No model providers configured. Run `data-designer config models` to configure your models.") + return + + missing_key_providers = get_providers_with_missing_api_keys(provider_registry.providers) + valid_provider_names = {p.name for p in provider_registry.providers} - {p.name for p in missing_key_providers} + + if not valid_provider_names: + typer.echo( + "No model providers are configured with valid API keys. " + "Run `data-designer config models` to configure your models." 
+ ) + return + + default_provider = provider_registry.default or provider_registry.providers[0].name + + model_registry = self._model_repository.load() + configs = model_registry.model_configs if model_registry else [] + + if not configs: + typer.echo("No model aliases configured.") + typer.echo("Run `data-designer config models` to add models.") + return + + filtered = [mc for mc in configs if (mc.provider or default_provider) in valid_provider_names] + + if not filtered: + typer.echo( + "All configured model aliases use providers without valid API keys. " + "Run `data-designer config models` to configure your models." + ) + return + + c1, c2, c3 = "model_alias", "model", "provider" + w1 = max(len(c1), max(len(mc.alias) for mc in filtered)) + w2 = max(len(c2), max(len(mc.model) for mc in filtered)) + w3 = max(len(c3), max(len(mc.provider or "default") for mc in filtered)) + typer.echo(f"{c1:<{w1}} {c2:<{w2}} {c3}") + typer.echo(f"{'-' * w1} {'-' * w2} {'-' * w3}") + for mc in filtered: + typer.echo(f"{mc.alias:<{w1}} {mc.model:<{w2}} {mc.provider or 'default'}") + + if len(filtered) < len(configs): + typer.echo(f"\n({len(configs) - len(filtered)} model alias(es) hidden — providers missing API keys)") + + def list_persona_datasets(self) -> None: + """List persona datasets available for PersonSamplerParams.""" + managed_locales = self._persona_repository.list_all() + entries: list[dict[str, str | bool]] = [] + for locale in managed_locales: + installed = self._download_service.is_locale_downloaded(locale.code) + entries.append({"locale": locale.code, "installed": installed}) + + typer.echo("Nemotron-Personas Datasets") + typer.echo("-" * 26) + col1 = "locale" + col2 = "status" + max_width = max(len(col1), max(len(str(entry["locale"])) for entry in entries)) + typer.echo(f"{col1:<{max_width}} {col2}") + typer.echo(f"{'-' * max_width} {'-' * len('not installed')}") + for entry in entries: + status = "installed" if entry["installed"] else "not installed" + 
typer.echo(f"{str(entry['locale']):<{max_width}} {status}") + typer.echo("") + typer.echo("Tip: Use the PersonSamplerParams locale parameter to select a dataset.") + typer.echo("The user can run `data-designer download personas --locale ` to install a dataset.") + + def list_column_types(self) -> None: + """List available column configuration types.""" + items = discover_column_configs() + sorted_types = sorted(items.keys()) + + col1, col2 = "column_type", "config_class" + max_width = max(len(col1), max(len(t) for t in sorted_types)) + typer.echo(f"{col1:<{max_width}} {col2}") + typer.echo(f"{'-' * max_width} {'-' * max(len(items[t].__name__) for t in sorted_types)}") + for t in sorted_types: + typer.echo(f"{t:<{max_width}} {items[t].__name__}") + typer.echo("") + typer.echo("Tip: Run `data-designer inspect column ` for full schema details.") + + def list_sampler_types(self) -> None: + """List available sampler types.""" + items = discover_sampler_types() + sorted_types = sorted(items.keys()) + + col1, col2 = "sampler_type", "params_class" + max_width = max(len(col1), max(len(t) for t in sorted_types)) + typer.echo(f"{col1:<{max_width}} {col2}") + typer.echo(f"{'-' * max_width} {'-' * max(len(items[t].__name__) for t in sorted_types)}") + for t in sorted_types: + typer.echo(f"{t:<{max_width}} {items[t].__name__}") + typer.echo("") + typer.echo("Tip: Run `data-designer inspect sampler ` for full schema details.") + + def list_validator_types(self) -> None: + """List available validator types.""" + items = discover_validator_types() + sorted_types = sorted(items.keys()) + + col1, col2 = "validator_type", "params_class" + max_width = max(len(col1), max(len(t) for t in sorted_types)) + typer.echo(f"{col1:<{max_width}} {col2}") + typer.echo(f"{'-' * max_width} {'-' * max(len(items[t].__name__) for t in sorted_types)}") + for t in sorted_types: + typer.echo(f"{t:<{max_width}} {items[t].__name__}") + typer.echo("") + typer.echo("Tip: Run `data-designer inspect validator ` 
for full schema details.") + + def list_processor_types(self) -> None: + """List available processor types.""" + items = discover_processor_configs() + sorted_types = sorted(items.keys()) + + col1, col2 = "processor_type", "config_class" + max_width = max(len(col1), max(len(t) for t in sorted_types)) + typer.echo(f"{col1:<{max_width}} {col2}") + typer.echo(f"{'-' * max_width} {'-' * max(len(items[t].__name__) for t in sorted_types)}") + for t in sorted_types: + typer.echo(f"{t:<{max_width}} {items[t].__name__}") + typer.echo("") + typer.echo("Tip: Run `data-designer inspect processor ` for full schema details.") diff --git a/packages/data-designer/tests/cli/commands/agent_helpers/test_list_command.py b/packages/data-designer/tests/cli/commands/agent_helpers/test_list_command.py new file mode 100644 index 000000000..0b97a0847 --- /dev/null +++ b/packages/data-designer/tests/cli/commands/agent_helpers/test_list_command.py @@ -0,0 +1,115 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +from unittest.mock import MagicMock, patch + +from data_designer.cli.commands.agent_helpers.list import ( + column_types_command, + model_aliases_command, + persona_datasets_command, + processor_types_command, + sampler_types_command, + validator_types_command, +) +from data_designer.cli.controllers.list_controller import ListController +from data_designer.config.utils.constants import DATA_DESIGNER_HOME + +_PATCH_TARGET = "data_designer.cli.commands.agent_helpers.list.ListController" + + +# --------------------------------------------------------------------------- +# model-aliases +# --------------------------------------------------------------------------- + + +@patch(_PATCH_TARGET) +def test_model_aliases_delegates_text(mock_cls: MagicMock) -> None: + mock_ctrl = MagicMock(spec=ListController) + mock_cls.return_value = mock_ctrl + + model_aliases_command() + + mock_cls.assert_called_once_with(DATA_DESIGNER_HOME) + mock_ctrl.list_model_aliases.assert_called_once_with() + + +# --------------------------------------------------------------------------- +# persona-datasets +# --------------------------------------------------------------------------- + + +@patch(_PATCH_TARGET) +def test_persona_datasets_delegates(mock_cls: MagicMock) -> None: + mock_ctrl = MagicMock(spec=ListController) + mock_cls.return_value = mock_ctrl + + persona_datasets_command() + + mock_cls.assert_called_once_with(DATA_DESIGNER_HOME) + mock_ctrl.list_persona_datasets.assert_called_once_with() + + +# --------------------------------------------------------------------------- +# columns +# --------------------------------------------------------------------------- + + +@patch(_PATCH_TARGET) +def test_column_types_delegates(mock_cls: MagicMock) -> None: + mock_ctrl = MagicMock(spec=ListController) + mock_cls.return_value = mock_ctrl + + column_types_command() + + 
mock_cls.assert_called_once_with(DATA_DESIGNER_HOME) + mock_ctrl.list_column_types.assert_called_once_with() + + +# --------------------------------------------------------------------------- +# samplers +# --------------------------------------------------------------------------- + + +@patch(_PATCH_TARGET) +def test_sampler_types_delegates(mock_cls: MagicMock) -> None: + mock_ctrl = MagicMock(spec=ListController) + mock_cls.return_value = mock_ctrl + + sampler_types_command() + + mock_cls.assert_called_once_with(DATA_DESIGNER_HOME) + mock_ctrl.list_sampler_types.assert_called_once_with() + + +# --------------------------------------------------------------------------- +# validators +# --------------------------------------------------------------------------- + + +@patch(_PATCH_TARGET) +def test_validator_types_delegates(mock_cls: MagicMock) -> None: + mock_ctrl = MagicMock(spec=ListController) + mock_cls.return_value = mock_ctrl + + validator_types_command() + + mock_cls.assert_called_once_with(DATA_DESIGNER_HOME) + mock_ctrl.list_validator_types.assert_called_once_with() + + +# --------------------------------------------------------------------------- +# processors +# --------------------------------------------------------------------------- + + +@patch(_PATCH_TARGET) +def test_processor_types_delegates(mock_cls: MagicMock) -> None: + mock_ctrl = MagicMock(spec=ListController) + mock_cls.return_value = mock_ctrl + + processor_types_command() + + mock_cls.assert_called_once_with(DATA_DESIGNER_HOME) + mock_ctrl.list_processor_types.assert_called_once_with() diff --git a/packages/data-designer/tests/cli/commands/test_list_assets_command.py b/packages/data-designer/tests/cli/commands/test_list_assets_command.py deleted file mode 100644 index 4a360a123..000000000 --- a/packages/data-designer/tests/cli/commands/test_list_assets_command.py +++ /dev/null @@ -1,33 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. 
All rights reserved. -# SPDX-License-Identifier: Apache-2.0 - -from __future__ import annotations - -from unittest.mock import MagicMock, patch - -from data_designer.cli.commands.list_assets import list_assets_command -from data_designer.cli.controllers.list_assets_controller import ListAssetsController -from data_designer.config.utils.constants import DATA_DESIGNER_HOME - - -@patch("data_designer.cli.commands.list_assets.ListAssetsController") -def test_list_assets_command_delegates_to_controller(mock_controller_cls: MagicMock) -> None: - """Command creates controller with DATA_DESIGNER_HOME and delegates.""" - mock_controller = MagicMock(spec=ListAssetsController) - mock_controller_cls.return_value = mock_controller - - list_assets_command(output_format=MagicMock(value="text")) - - mock_controller_cls.assert_called_once_with(DATA_DESIGNER_HOME) - mock_controller.list_assets.assert_called_once_with("text") - - -@patch("data_designer.cli.commands.list_assets.ListAssetsController") -def test_list_assets_command_passes_json_format(mock_controller_cls: MagicMock) -> None: - """Command forwards the json format value to the controller.""" - mock_controller = MagicMock(spec=ListAssetsController) - mock_controller_cls.return_value = mock_controller - - list_assets_command(output_format=MagicMock(value="json")) - - mock_controller.list_assets.assert_called_once_with("json") diff --git a/packages/data-designer/tests/cli/controllers/test_list_assets_controller.py b/packages/data-designer/tests/cli/controllers/test_list_assets_controller.py deleted file mode 100644 index 627c9a65e..000000000 --- a/packages/data-designer/tests/cli/controllers/test_list_assets_controller.py +++ /dev/null @@ -1,126 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
-# SPDX-License-Identifier: Apache-2.0 - -from __future__ import annotations - -import json -from pathlib import Path - -import pytest - -from data_designer.cli.controllers.list_assets_controller import ListAssetsController - -# --------------------------------------------------------------------------- -# fixtures -# --------------------------------------------------------------------------- - - -@pytest.fixture -def controller(tmp_path: Path) -> ListAssetsController: - """Create a controller with no datasets installed.""" - return ListAssetsController(tmp_path) - - -@pytest.fixture -def controller_with_datasets(tmp_path: Path) -> ListAssetsController: - """Create a controller with en_US and ja_JP already installed.""" - managed = tmp_path / "managed-assets" / "datasets" - managed.mkdir(parents=True) - (managed / "en_US.parquet").touch() - (managed / "ja_JP.parquet").touch() - return ListAssetsController(tmp_path) - - -# --------------------------------------------------------------------------- -# init -# --------------------------------------------------------------------------- - - -def test_init(tmp_path: Path) -> None: - """Controller sets up repository and service.""" - ctrl = ListAssetsController(tmp_path) - assert ctrl.persona_repository is not None - assert ctrl.service.config_dir == tmp_path - - -# --------------------------------------------------------------------------- -# text format -# --------------------------------------------------------------------------- - - -def test_text_none_installed(controller: ListAssetsController, capsys: pytest.CaptureFixture[str]) -> None: - """Text output shows no-installed message when nothing is downloaded.""" - controller.list_assets("text") - out = capsys.readouterr().out - - assert "Nemotron-Persona Datasets" in out - assert "No persona datasets installed." 
in out - assert "Not installed:" in out - assert "The user can run" in out - - -def test_text_some_installed( - controller_with_datasets: ListAssetsController, capsys: pytest.CaptureFixture[str] -) -> None: - """Text output lists usable locales and not-installed ones.""" - controller_with_datasets.list_assets("text") - out = capsys.readouterr().out - - assert "Usable locales in PersonSamplerParams:" in out - assert "en_US" in out - assert "ja_JP" in out - assert "Not installed:" in out - - -def test_text_all_installed_omits_not_installed_section(tmp_path: Path, capsys: pytest.CaptureFixture[str]) -> None: - """When every locale is installed the not-installed section is omitted.""" - managed = tmp_path / "managed-assets" / "datasets" - managed.mkdir(parents=True) - ctrl = ListAssetsController(tmp_path) - for locale in ctrl.persona_repository.list_all(): - (managed / f"{locale.code}.parquet").touch() - - ctrl.list_assets("text") - out = capsys.readouterr().out - - assert "Usable locales in PersonSamplerParams:" in out - assert "Not installed" not in out - - -# --------------------------------------------------------------------------- -# json format -# --------------------------------------------------------------------------- - - -def test_json_structure(controller: ListAssetsController, capsys: pytest.CaptureFixture[str]) -> None: - """JSON output has the expected keys and types.""" - controller.list_assets("json") - data = json.loads(capsys.readouterr().out) - - assert isinstance(data["installed"], list) - assert isinstance(data["not_installed"], list) - - -def test_json_partitions_correctly( - controller_with_datasets: ListAssetsController, capsys: pytest.CaptureFixture[str] -) -> None: - """JSON output places downloaded locales in installed and the rest in not_installed.""" - controller_with_datasets.list_assets("json") - data = json.loads(capsys.readouterr().out) - - assert "en_US" in data["installed"] - assert "ja_JP" in data["installed"] - assert "en_US" not 
in data["not_installed"] - assert "ja_JP" not in data["not_installed"] - assert len(data["installed"]) + len(data["not_installed"]) == len( - controller_with_datasets.persona_repository.list_all() - ) - - -def test_json_none_installed(controller: ListAssetsController, capsys: pytest.CaptureFixture[str]) -> None: - """JSON output when nothing is installed.""" - controller.list_assets("json") - data = json.loads(capsys.readouterr().out) - - assert data["installed"] == [] - assert len(data["not_installed"]) > 0 diff --git a/packages/data-designer/tests/cli/controllers/test_list_controller.py b/packages/data-designer/tests/cli/controllers/test_list_controller.py new file mode 100644 index 000000000..d270deb51 --- /dev/null +++ b/packages/data-designer/tests/cli/controllers/test_list_controller.py @@ -0,0 +1,462 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +import os +from pathlib import Path +from unittest.mock import MagicMock, patch + +import pytest + +from data_designer.cli.controllers.list_controller import ListController + +# --------------------------------------------------------------------------- +# fixtures +# --------------------------------------------------------------------------- + + +@pytest.fixture +def controller(tmp_path: Path) -> ListController: + """Controller with no datasets installed and no model configs.""" + return ListController(tmp_path) + + +@pytest.fixture +def controller_with_datasets(tmp_path: Path) -> ListController: + """Controller with en_US and ja_JP persona datasets installed.""" + managed = tmp_path / "managed-assets" / "datasets" + managed.mkdir(parents=True) + (managed / "en_US.parquet").touch() + (managed / "ja_JP.parquet").touch() + return ListController(tmp_path) + + +@pytest.fixture +def controller_all_installed(tmp_path: Path) -> ListController: + """Controller with ALL managed persona 
datasets installed.""" + ctrl = ListController(tmp_path) + managed = tmp_path / "managed-assets" / "datasets" + managed.mkdir(parents=True) + for locale in ctrl._persona_repository.list_all(): + (managed / f"{locale.code}.parquet").touch() + return ctrl + + +def _make_model_config(alias: str, model: str, provider: str | None = None) -> MagicMock: + mc = MagicMock() + mc.alias = alias + mc.model = model + mc.provider = provider + return mc + + +def _make_provider(name: str, api_key: str | None = "sk-valid-key") -> MagicMock: + p = MagicMock() + p.name = name + p.api_key = api_key + return p + + +def _make_provider_registry( + providers: list[MagicMock], + default: str | None = None, +) -> MagicMock: + registry = MagicMock() + registry.providers = providers + registry.default = default + return registry + + +# --------------------------------------------------------------------------- +# list_model_aliases — text +# --------------------------------------------------------------------------- + + +def test_model_aliases_text_empty(tmp_path: Path, capsys: pytest.CaptureFixture[str]) -> None: + ctrl = ListController(tmp_path) + provider_reg = _make_provider_registry([_make_provider("nvidia")]) + model_reg = MagicMock() + model_reg.model_configs = [] + with ( + patch.object(ctrl._provider_repository, "load", return_value=provider_reg), + patch.object(ctrl._model_repository, "load", return_value=model_reg), + ): + ctrl.list_model_aliases() + out = capsys.readouterr().out + assert "No model aliases configured." 
in out + assert "data-designer config models" in out + + +def test_model_aliases_text_with_models(tmp_path: Path, capsys: pytest.CaptureFixture[str]) -> None: + ctrl = ListController(tmp_path) + provider_reg = _make_provider_registry( + [_make_provider("nvidia"), _make_provider("openai")], + default="nvidia", + ) + model_reg = MagicMock() + model_reg.model_configs = [ + _make_model_config("my-model", "meta/llama-3.1-8b-instruct", "nvidia"), + _make_model_config("judge", "openai/gpt-4o", None), + ] + with ( + patch.object(ctrl._provider_repository, "load", return_value=provider_reg), + patch.object(ctrl._model_repository, "load", return_value=model_reg), + ): + ctrl.list_model_aliases() + out = capsys.readouterr().out + assert "my-model" in out + assert "meta/llama-3.1-8b-instruct" in out + assert "nvidia" in out + assert "judge" in out + assert "default" in out + + +def test_model_aliases_text_empty_model_configs(tmp_path: Path, capsys: pytest.CaptureFixture[str]) -> None: + ctrl = ListController(tmp_path) + provider_reg = _make_provider_registry([_make_provider("nvidia")]) + model_reg = MagicMock() + model_reg.model_configs = [] + with ( + patch.object(ctrl._provider_repository, "load", return_value=provider_reg), + patch.object(ctrl._model_repository, "load", return_value=model_reg), + ): + ctrl.list_model_aliases() + out = capsys.readouterr().out + assert "No model aliases configured." 
in out + + +# --------------------------------------------------------------------------- +# list_model_aliases — provider validation (text) +# --------------------------------------------------------------------------- + + +def test_model_aliases_text_no_provider_config(tmp_path: Path, capsys: pytest.CaptureFixture[str]) -> None: + ctrl = ListController(tmp_path) + with patch.object(ctrl._provider_repository, "load", return_value=None): + ctrl.list_model_aliases() + out = capsys.readouterr().out + assert "No model providers configured" in out + assert "data-designer config models" in out + + +def test_model_aliases_text_empty_providers(tmp_path: Path, capsys: pytest.CaptureFixture[str]) -> None: + ctrl = ListController(tmp_path) + provider_reg = _make_provider_registry([]) + with patch.object(ctrl._provider_repository, "load", return_value=provider_reg): + ctrl.list_model_aliases() + out = capsys.readouterr().out + assert "No model providers configured" in out + + +def test_model_aliases_text_all_providers_missing_keys(tmp_path: Path, capsys: pytest.CaptureFixture[str]) -> None: + ctrl = ListController(tmp_path) + provider_reg = _make_provider_registry( + [ + _make_provider("nvidia", api_key=None), + _make_provider("openai", api_key=None), + ] + ) + with patch.object(ctrl._provider_repository, "load", return_value=provider_reg): + ctrl.list_model_aliases() + out = capsys.readouterr().out + assert "No model providers are configured with valid API keys" in out + assert "data-designer config models" in out + + +def test_model_aliases_text_filters_by_provider(tmp_path: Path, capsys: pytest.CaptureFixture[str]) -> None: + ctrl = ListController(tmp_path) + provider_reg = _make_provider_registry( + [_make_provider("nvidia", api_key="sk-valid"), _make_provider("openai", api_key=None)], + default="nvidia", + ) + model_reg = MagicMock() + model_reg.model_configs = [ + _make_model_config("nv-model", "meta/llama-3.1-8b-instruct", "nvidia"), + _make_model_config("oai-model", 
"openai/gpt-4o", "openai"), + ] + with ( + patch.object(ctrl._provider_repository, "load", return_value=provider_reg), + patch.object(ctrl._model_repository, "load", return_value=model_reg), + ): + ctrl.list_model_aliases() + out = capsys.readouterr().out + assert "nv-model" in out + assert "oai-model" not in out + + +def test_model_aliases_text_default_provider_resolution(tmp_path: Path, capsys: pytest.CaptureFixture[str]) -> None: + """Model with provider=None resolves to default provider for filtering.""" + ctrl = ListController(tmp_path) + provider_reg = _make_provider_registry( + [_make_provider("nvidia", api_key="sk-valid"), _make_provider("openai", api_key=None)], + default="nvidia", + ) + model_reg = MagicMock() + model_reg.model_configs = [ + _make_model_config("my-model", "meta/llama-3.1-8b-instruct", None), + ] + with ( + patch.object(ctrl._provider_repository, "load", return_value=provider_reg), + patch.object(ctrl._model_repository, "load", return_value=model_reg), + ): + ctrl.list_model_aliases() + out = capsys.readouterr().out + assert "my-model" in out + assert "default" in out + + +def test_model_aliases_text_default_provider_resolution_excluded( + tmp_path: Path, capsys: pytest.CaptureFixture[str] +) -> None: + """Model with provider=None is excluded when default provider lacks a valid key.""" + ctrl = ListController(tmp_path) + provider_reg = _make_provider_registry( + [_make_provider("nvidia", api_key=None), _make_provider("openai", api_key="sk-valid")], + default="nvidia", + ) + model_reg = MagicMock() + model_reg.model_configs = [ + _make_model_config("my-model", "meta/llama-3.1-8b-instruct", None), + _make_model_config("oai-model", "openai/gpt-4o", "openai"), + ] + with ( + patch.object(ctrl._provider_repository, "load", return_value=provider_reg), + patch.object(ctrl._model_repository, "load", return_value=model_reg), + ): + ctrl.list_model_aliases() + out = capsys.readouterr().out + assert "my-model" not in out + assert "oai-model" in out + 
+ +def test_model_aliases_text_all_models_filtered(tmp_path: Path, capsys: pytest.CaptureFixture[str]) -> None: + ctrl = ListController(tmp_path) + provider_reg = _make_provider_registry( + [_make_provider("nvidia", api_key="sk-valid"), _make_provider("openai", api_key=None)], + default="nvidia", + ) + model_reg = MagicMock() + model_reg.model_configs = [ + _make_model_config("oai-model", "openai/gpt-4o", "openai"), + ] + with ( + patch.object(ctrl._provider_repository, "load", return_value=provider_reg), + patch.object(ctrl._model_repository, "load", return_value=model_reg), + ): + ctrl.list_model_aliases() + out = capsys.readouterr().out + assert "All configured model aliases use providers without valid API keys" in out + assert "data-designer config models" in out + + +def test_model_aliases_text_default_from_first_provider(tmp_path: Path, capsys: pytest.CaptureFixture[str]) -> None: + """When provider_registry.default is None, first provider is used as default.""" + ctrl = ListController(tmp_path) + provider_reg = _make_provider_registry( + [_make_provider("nvidia", api_key="sk-valid")], + default=None, + ) + model_reg = MagicMock() + model_reg.model_configs = [ + _make_model_config("my-model", "meta/llama-3.1-8b-instruct", None), + ] + with ( + patch.object(ctrl._provider_repository, "load", return_value=provider_reg), + patch.object(ctrl._model_repository, "load", return_value=model_reg), + ): + ctrl.list_model_aliases() + out = capsys.readouterr().out + assert "my-model" in out + + +def test_model_aliases_env_var_api_key_set(tmp_path: Path, capsys: pytest.CaptureFixture[str]) -> None: + """Provider whose api_key names an env var that IS set should be treated as valid.""" + ctrl = ListController(tmp_path) + provider_reg = _make_provider_registry( + [_make_provider("nvidia", api_key="NVIDIA_API_KEY")], + default="nvidia", + ) + model_reg = MagicMock() + model_reg.model_configs = [ + _make_model_config("nv-model", "meta/llama-3.1-8b-instruct", "nvidia"), + ] + 
with ( + patch.object(ctrl._provider_repository, "load", return_value=provider_reg), + patch.object(ctrl._model_repository, "load", return_value=model_reg), + patch.dict(os.environ, {"NVIDIA_API_KEY": "real-key"}), + ): + ctrl.list_model_aliases() + out = capsys.readouterr().out + assert "nv-model" in out + + +def test_model_aliases_env_var_api_key_unset(tmp_path: Path, capsys: pytest.CaptureFixture[str]) -> None: + """Provider whose api_key names an env var that is NOT set should be invalid.""" + ctrl = ListController(tmp_path) + provider_reg = _make_provider_registry( + [_make_provider("nvidia", api_key="NVIDIA_API_KEY")], + default="nvidia", + ) + model_reg = MagicMock() + model_reg.model_configs = [ + _make_model_config("nv-model", "meta/llama-3.1-8b-instruct", "nvidia"), + ] + env = {k: v for k, v in os.environ.items() if k != "NVIDIA_API_KEY"} + with ( + patch.object(ctrl._provider_repository, "load", return_value=provider_reg), + patch.object(ctrl._model_repository, "load", return_value=model_reg), + patch.dict(os.environ, env, clear=True), + ): + ctrl.list_model_aliases() + out = capsys.readouterr().out + assert "No model providers are configured with valid API keys" in out + + +def test_model_aliases_model_registry_returns_none(tmp_path: Path, capsys: pytest.CaptureFixture[str]) -> None: + """When _model_repository.load() returns None, show 'No model aliases configured.'""" + ctrl = ListController(tmp_path) + provider_reg = _make_provider_registry([_make_provider("nvidia")]) + with ( + patch.object(ctrl._provider_repository, "load", return_value=provider_reg), + patch.object(ctrl._model_repository, "load", return_value=None), + ): + ctrl.list_model_aliases() + out = capsys.readouterr().out + assert "No model aliases configured." 
in out + + +def test_model_aliases_multiple_models_same_provider(tmp_path: Path, capsys: pytest.CaptureFixture[str]) -> None: + """All models on a single valid provider should appear in the output.""" + ctrl = ListController(tmp_path) + provider_reg = _make_provider_registry( + [_make_provider("nvidia", api_key="sk-valid")], + default="nvidia", + ) + model_reg = MagicMock() + model_reg.model_configs = [ + _make_model_config("model-a", "meta/llama-3.1-8b-instruct", "nvidia"), + _make_model_config("model-b", "meta/llama-3.1-70b-instruct", "nvidia"), + _make_model_config("model-c", "nvidia/nemotron-4-340b", "nvidia"), + ] + with ( + patch.object(ctrl._provider_repository, "load", return_value=provider_reg), + patch.object(ctrl._model_repository, "load", return_value=model_reg), + ): + ctrl.list_model_aliases() + out = capsys.readouterr().out + assert "model-a" in out + assert "model-b" in out + assert "model-c" in out + + +def test_model_aliases_filtered_count_hint(tmp_path: Path, capsys: pytest.CaptureFixture[str]) -> None: + """Output should contain a hint about how many aliases were hidden by filtering.""" + ctrl = ListController(tmp_path) + provider_reg = _make_provider_registry( + [_make_provider("nvidia", api_key="sk-valid"), _make_provider("openai", api_key=None)], + default="nvidia", + ) + model_reg = MagicMock() + model_reg.model_configs = [ + _make_model_config("nv-model", "meta/llama-3.1-8b-instruct", "nvidia"), + _make_model_config("oai-model", "openai/gpt-4o", "openai"), + ] + with ( + patch.object(ctrl._provider_repository, "load", return_value=provider_reg), + patch.object(ctrl._model_repository, "load", return_value=model_reg), + ): + ctrl.list_model_aliases() + out = capsys.readouterr().out + assert "nv-model" in out + assert "oai-model" not in out + assert "1 model alias(es) hidden" in out + + +# --------------------------------------------------------------------------- +# list_persona_datasets — text +# 
--------------------------------------------------------------------------- + + +def test_persona_datasets_text_none_installed(controller: ListController, capsys: pytest.CaptureFixture[str]) -> None: + controller.list_persona_datasets() + out = capsys.readouterr().out + assert "Nemotron-Personas Datasets" in out + assert "not installed" in out + + +def test_persona_datasets_text_some_installed( + controller_with_datasets: ListController, capsys: pytest.CaptureFixture[str] +) -> None: + controller_with_datasets.list_persona_datasets() + out = capsys.readouterr().out + assert "en_US" in out + assert "installed" in out + assert "ja_JP" in out + + +def test_persona_datasets_text_all_installed( + controller_all_installed: ListController, capsys: pytest.CaptureFixture[str] +) -> None: + controller_all_installed.list_persona_datasets() + out = capsys.readouterr().out + lines = out.strip().splitlines() + locale_lines = [line for line in lines if "installed" in line and "---" not in line and "status" not in line] + assert len(locale_lines) > 0 + for line in locale_lines: + assert "not installed" not in line + + +# --------------------------------------------------------------------------- +# list_column_types — text +# --------------------------------------------------------------------------- + + +def test_column_types_text(controller: ListController, capsys: pytest.CaptureFixture[str]) -> None: + controller.list_column_types() + out = capsys.readouterr().out + assert "column_type" in out + assert "config_class" in out + assert "llm-text" in out + assert "sampler" in out + assert "data-designer inspect column" in out + + +# --------------------------------------------------------------------------- +# list_sampler_types — text +# --------------------------------------------------------------------------- + + +def test_sampler_types_text(controller: ListController, capsys: pytest.CaptureFixture[str]) -> None: + controller.list_sampler_types() + out = capsys.readouterr().out 
+ assert "sampler_type" in out + assert "params_class" in out + assert "category" in out + assert "data-designer inspect sampler" in out + + +# --------------------------------------------------------------------------- +# list_validator_types — text +# --------------------------------------------------------------------------- + + +def test_validator_types_text(controller: ListController, capsys: pytest.CaptureFixture[str]) -> None: + controller.list_validator_types() + out = capsys.readouterr().out + assert "validator_type" in out + assert "params_class" in out + assert "data-designer inspect validator" in out + + +# --------------------------------------------------------------------------- +# list_processor_types — text +# --------------------------------------------------------------------------- + + +def test_processor_types_text(controller: ListController, capsys: pytest.CaptureFixture[str]) -> None: + controller.list_processor_types() + out = capsys.readouterr().out + assert "processor_type" in out + assert "config_class" in out + assert "data-designer inspect processor" in out From 03db8035710f0a0d1d51ef210513b86f6614c1e6 Mon Sep 17 00:00:00 2001 From: Johnny Greco Date: Tue, 17 Feb 2026 19:47:16 -0500 Subject: [PATCH 21/37] docs: clarify that constraints apply only to sampler columns Update docstrings and field descriptions in sampler_constraints.py to make explicit that Constraint, ScalarInequalityConstraint, and ColumnInequalityConstraint are scoped to sampler columns. 
--- .../src/data_designer/config/sampler_constraints.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/packages/data-designer-config/src/data_designer/config/sampler_constraints.py b/packages/data-designer-config/src/data_designer/config/sampler_constraints.py index 73128fd6c..eb6470159 100644 --- a/packages/data-designer-config/src/data_designer/config/sampler_constraints.py +++ b/packages/data-designer-config/src/data_designer/config/sampler_constraints.py @@ -25,7 +25,9 @@ class InequalityOperator(str, Enum): class Constraint(ConfigBase, ABC): - target_column: str = Field(description="Name of the column this constraint applies to") + """Base class for sampler column constraints.""" + + target_column: str = Field(description="Name of the sampler column this constraint applies to") @property @abstractmethod @@ -33,7 +35,7 @@ def constraint_type(self) -> ConstraintType: ... class ScalarInequalityConstraint(Constraint): - """Constraint that compares a column's values against a scalar threshold.""" + """Sampler constraint that compares a sampler column's generated values against a scalar threshold.""" rhs: float = Field(description="Scalar value to compare against") operator: InequalityOperator = Field(description="Comparison operator (lt, le, gt, ge)") @@ -44,7 +46,7 @@ def constraint_type(self) -> ConstraintType: class ColumnInequalityConstraint(Constraint): - """Constraint that compares a column's values against another column's values.""" + """Sampler constraint that compares a sampler column's generated values against another sampler column's values.""" rhs: str = Field(description="Name of the other column to compare against") operator: InequalityOperator = Field(description="Comparison operator (lt, le, gt, ge)") From ea03168d20a80051599058351d760dca5ec0211c Mon Sep 17 00:00:00 2001 From: Johnny Greco Date: Tue, 17 Feb 2026 19:56:20 -0500 Subject: [PATCH 22/37] refactor: rename inspect "builder" subcommand to "config_builder" --- 
.../src/data_designer/cli/commands/agent_helpers/inspect.py | 6 +++--- .../commands/agent_helpers/test_introspection_commands.py | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/packages/data-designer/src/data_designer/cli/commands/agent_helpers/inspect.py b/packages/data-designer/src/data_designer/cli/commands/agent_helpers/inspect.py index 4ab4d71d5..365ca8869 100644 --- a/packages/data-designer/src/data_designer/cli/commands/agent_helpers/inspect.py +++ b/packages/data-designer/src/data_designer/cli/commands/agent_helpers/inspect.py @@ -52,7 +52,7 @@ def constraints_command() -> None: IntrospectionController().show_sampler_constraints() -@inspect_app.command(name="builder") -def builder_command() -> None: - """Show config builder method signatures and docstrings.""" +@inspect_app.command(name="config_builder") +def config_builder_command() -> None: + """Show DataDesignerConfigBuilder method signatures and docstrings.""" IntrospectionController().show_builder() diff --git a/packages/data-designer/tests/cli/commands/agent_helpers/test_introspection_commands.py b/packages/data-designer/tests/cli/commands/agent_helpers/test_introspection_commands.py index f3815df55..1d4e72768 100644 --- a/packages/data-designer/tests/cli/commands/agent_helpers/test_introspection_commands.py +++ b/packages/data-designer/tests/cli/commands/agent_helpers/test_introspection_commands.py @@ -99,12 +99,12 @@ def test_processors_no_arg_fails() -> None: # --------------------------------------------------------------------------- -# builder +# config_builder # --------------------------------------------------------------------------- -def test_builder() -> None: - result = runner.invoke(app, ["inspect", "builder"]) +def test_config_builder() -> None: + result = runner.invoke(app, ["inspect", "config_builder"]) assert result.exit_code == 0 assert "add_column" in result.output assert "DataDesignerConfigBuilder" in result.output From 
98288c6e83e48ca1f867f95c83ce43d6989a6bea Mon Sep 17 00:00:00 2001 From: Johnny Greco Date: Tue, 17 Feb 2026 19:58:25 -0500 Subject: [PATCH 23/37] docs: improve agent-helper CLI help descriptions for agent consumption --- .../cli/commands/agent_helpers/inspect.py | 33 ++++++++++++------- .../cli/commands/agent_helpers/list.py | 17 ++++++---- .../src/data_designer/cli/main.py | 4 +-- 3 files changed, 34 insertions(+), 20 deletions(-) diff --git a/packages/data-designer/src/data_designer/cli/commands/agent_helpers/inspect.py b/packages/data-designer/src/data_designer/cli/commands/agent_helpers/inspect.py index 365ca8869..977696d93 100644 --- a/packages/data-designer/src/data_designer/cli/commands/agent_helpers/inspect.py +++ b/packages/data-designer/src/data_designer/cli/commands/agent_helpers/inspect.py @@ -9,50 +9,61 @@ inspect_app = typer.Typer( name="inspect", - help="Inspect configuration types and Python API (schemas, method signatures).", + help=( + "Return detailed schemas (fields, types, defaults, constraints) for configuration types," + " or method signatures for the Python API. Use `list` first to discover valid type names." + ), no_args_is_help=True, ) @inspect_app.command(name="column") def columns_command( - type_name: str = typer.Argument(help="Column type to display (e.g., 'llm-text'), or 'all' for everything."), + type_name: str = typer.Argument( + help="Column type name (e.g., 'llm-text', 'sampler'). Pass 'all' to dump every column type." + ), ) -> None: - """Show schema for a column config type (use `list columns` for valid names).""" + """Return the full schema for a column config type, including field names, types, defaults, and descriptions. 
Run `list columns` to discover valid type names.""" IntrospectionController().show_columns(type_name) @inspect_app.command(name="sampler") def samplers_command( - type_name: str = typer.Argument(help="Sampler type to display (e.g., 'category'), or 'all' for everything."), + type_name: str = typer.Argument( + help="Sampler type name (e.g., 'category', 'uniform'). Pass 'all' to dump every sampler type." + ), ) -> None: - """Show schema for a sampler params type (use `list samplers` for valid names).""" + """Return the full params schema for a sampler type, including field names, types, defaults, and descriptions. Run `list samplers` to discover valid type names.""" IntrospectionController().show_samplers(type_name) @inspect_app.command(name="validator") def validators_command( - type_name: str = typer.Argument(help="Validator type to display (e.g., 'code'), or 'all' for everything."), + type_name: str = typer.Argument( + help="Validator type name (e.g., 'code', 'python'). Pass 'all' to dump every validator type." + ), ) -> None: - """Show schema for a validator params type (use `list validators` for valid names).""" + """Return the full params schema for a validator type, including field names, types, defaults, and descriptions. Run `list validators` to discover valid type names.""" IntrospectionController().show_validators(type_name) @inspect_app.command(name="processor") def processors_command( - type_name: str = typer.Argument(help="Processor type to display (e.g., 'drop_columns'), or 'all' for everything."), + type_name: str = typer.Argument( + help="Processor type name (e.g., 'drop_columns'). Pass 'all' to dump every processor type." + ), ) -> None: - """Show schema for a processor config type (use `list processors` for valid names).""" + """Return the full config schema for a processor type, including field names, types, defaults, and descriptions. 
Run `list processors` to discover valid type names.""" IntrospectionController().show_processors(type_name) @inspect_app.command(name="sampler-constraints") def constraints_command() -> None: - """Show sampler constraint schemas (scalar inequality, column inequality, operators).""" + """Return schemas for sampler constraint types: ScalarInequalityConstraint, ColumnInequalityConstraint, and the InequalityOperator enum. Use when adding value constraints to sampler columns.""" IntrospectionController().show_sampler_constraints() @inspect_app.command(name="config_builder") def config_builder_command() -> None: - """Show DataDesignerConfigBuilder method signatures and docstrings.""" + """Return DataDesignerConfigBuilder method signatures and docstrings. Use to understand available builder methods and their parameters.""" IntrospectionController().show_builder() diff --git a/packages/data-designer/src/data_designer/cli/commands/agent_helpers/list.py b/packages/data-designer/src/data_designer/cli/commands/agent_helpers/list.py index 78596e7e5..7f4717b2c 100644 --- a/packages/data-designer/src/data_designer/cli/commands/agent_helpers/list.py +++ b/packages/data-designer/src/data_designer/cli/commands/agent_helpers/list.py @@ -10,42 +10,45 @@ list_app = typer.Typer( name="list", - help="List valid values for configuration fields.", + help=( + "Enumerate valid names and classes for configuration fields." + " Use these names as arguments to `inspect` commands for detailed schemas." + ), no_args_is_help=True, ) @list_app.command(name="model-aliases") def model_aliases_command() -> None: - """List configured model aliases and their backing models.""" + """List all configured model aliases with their backing model identifiers. 
Required to set model_alias on LLM column configs.""" ListController(DATA_DESIGNER_HOME).list_model_aliases() @list_app.command(name="persona-datasets") def persona_datasets_command() -> None: - """List available persona datasets and their install status.""" + """List available Nemotron-Persona datasets and whether each is installed locally.""" ListController(DATA_DESIGNER_HOME).list_persona_datasets() @list_app.command(name="columns") def column_types_command() -> None: - """List available column types and their config classes.""" + """List all column type names and their config classes. Pass a name to `inspect column ` for the full schema.""" ListController(DATA_DESIGNER_HOME).list_column_types() @list_app.command(name="samplers") def sampler_types_command() -> None: - """List available sampler types and their params classes.""" + """List all sampler type names and their params classes. Pass a name to `inspect sampler ` for the full schema.""" ListController(DATA_DESIGNER_HOME).list_sampler_types() @list_app.command(name="validators") def validator_types_command() -> None: - """List available validator types and their params classes.""" + """List all validator type names and their params classes. Pass a name to `inspect validator ` for the full schema.""" ListController(DATA_DESIGNER_HOME).list_validator_types() @list_app.command(name="processors") def processor_types_command() -> None: - """List available processor types and their config classes.""" + """List all processor type names and their config classes. 
Pass a name to `inspect processor ` for the full schema.""" ListController(DATA_DESIGNER_HOME).list_processor_types() diff --git a/packages/data-designer/src/data_designer/cli/main.py b/packages/data-designer/src/data_designer/cli/main.py index bd782704c..ae184df9c 100644 --- a/packages/data-designer/src/data_designer/cli/main.py +++ b/packages/data-designer/src/data_designer/cli/main.py @@ -103,8 +103,8 @@ app.add_typer(config_app, name="config", rich_help_panel="Setup Commands") app.add_typer(download_app, name="download", rich_help_panel="Setup Commands") -# Add agent command groups -title_agent_helpers = "Agent-Helper Commands" +# Add agent command groups (designed for AI agent consumption: schema introspection and valid-value discovery) +title_agent_helpers = "Agent-Helper Commands (schema introspection and valid-value discovery)" app.add_typer(inspect_cmd.inspect_app, name="inspect", rich_help_panel=title_agent_helpers) From d11aa41702aa58c84dbb613b0804b8695af13e33 Mon Sep 17 00:00:00 2001 From: Johnny Greco Date: Tue, 17 Feb 2026 19:59:43 -0500 Subject: [PATCH 24/37] fix: use hyphenated config-builder for CLI subcommand name --- .../src/data_designer/cli/commands/agent_helpers/inspect.py | 2 +- .../cli/commands/agent_helpers/test_introspection_commands.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/packages/data-designer/src/data_designer/cli/commands/agent_helpers/inspect.py b/packages/data-designer/src/data_designer/cli/commands/agent_helpers/inspect.py index 977696d93..f4b22df69 100644 --- a/packages/data-designer/src/data_designer/cli/commands/agent_helpers/inspect.py +++ b/packages/data-designer/src/data_designer/cli/commands/agent_helpers/inspect.py @@ -63,7 +63,7 @@ def constraints_command() -> None: IntrospectionController().show_sampler_constraints() -@inspect_app.command(name="config_builder") +@inspect_app.command(name="config-builder") def config_builder_command() -> None: """Return DataDesignerConfigBuilder method 
signatures and docstrings. Use to understand available builder methods and their parameters.""" IntrospectionController().show_builder() diff --git a/packages/data-designer/tests/cli/commands/agent_helpers/test_introspection_commands.py b/packages/data-designer/tests/cli/commands/agent_helpers/test_introspection_commands.py index 1d4e72768..7783c61e2 100644 --- a/packages/data-designer/tests/cli/commands/agent_helpers/test_introspection_commands.py +++ b/packages/data-designer/tests/cli/commands/agent_helpers/test_introspection_commands.py @@ -99,12 +99,12 @@ def test_processors_no_arg_fails() -> None: # --------------------------------------------------------------------------- -# config_builder +# config-builder # --------------------------------------------------------------------------- def test_config_builder() -> None: - result = runner.invoke(app, ["inspect", "config_builder"]) + result = runner.invoke(app, ["inspect", "config-builder"]) assert result.exit_code == 0 assert "add_column" in result.output assert "DataDesignerConfigBuilder" in result.output From 7520939af688a81693120e8869a9788bea71e8db Mon Sep 17 00:00:00 2001 From: Johnny Greco Date: Tue, 17 Feb 2026 20:01:26 -0500 Subject: [PATCH 25/37] docs: tighten agent-helper CLI help descriptions --- .../cli/commands/agent_helpers/inspect.py | 33 +++++++------------ .../cli/commands/agent_helpers/list.py | 17 ++++------ .../src/data_designer/cli/main.py | 4 +-- 3 files changed, 20 insertions(+), 34 deletions(-) diff --git a/packages/data-designer/src/data_designer/cli/commands/agent_helpers/inspect.py b/packages/data-designer/src/data_designer/cli/commands/agent_helpers/inspect.py index f4b22df69..35bcc9fb8 100644 --- a/packages/data-designer/src/data_designer/cli/commands/agent_helpers/inspect.py +++ b/packages/data-designer/src/data_designer/cli/commands/agent_helpers/inspect.py @@ -9,61 +9,50 @@ inspect_app = typer.Typer( name="inspect", - help=( - "Return detailed schemas (fields, types, defaults, 
constraints) for configuration types," - " or method signatures for the Python API. Use `list` first to discover valid type names." - ), + help="Show schemas and method signatures for configuration types. Run `list` to discover valid type names.", no_args_is_help=True, ) @inspect_app.command(name="column") def columns_command( - type_name: str = typer.Argument( - help="Column type name (e.g., 'llm-text', 'sampler'). Pass 'all' to dump every column type." - ), + type_name: str = typer.Argument(help="Type name (e.g. 'llm-text', 'sampler'), or 'all'."), ) -> None: - """Return the full schema for a column config type, including field names, types, defaults, and descriptions. Run `list columns` to discover valid type names.""" + """Show schema for a column config type. Run `list columns` for valid names.""" IntrospectionController().show_columns(type_name) @inspect_app.command(name="sampler") def samplers_command( - type_name: str = typer.Argument( - help="Sampler type name (e.g., 'category', 'uniform'). Pass 'all' to dump every sampler type." - ), + type_name: str = typer.Argument(help="Type name (e.g. 'category', 'uniform'), or 'all'."), ) -> None: - """Return the full params schema for a sampler type, including field names, types, defaults, and descriptions. Run `list samplers` to discover valid type names.""" + """Show schema for a sampler params type. Run `list samplers` for valid names.""" IntrospectionController().show_samplers(type_name) @inspect_app.command(name="validator") def validators_command( - type_name: str = typer.Argument( - help="Validator type name (e.g., 'code', 'python'). Pass 'all' to dump every validator type." - ), + type_name: str = typer.Argument(help="Type name (e.g. 'code', 'python'), or 'all'."), ) -> None: - """Return the full params schema for a validator type, including field names, types, defaults, and descriptions. Run `list validators` to discover valid type names.""" + """Show schema for a validator params type. 
Run `list validators` for valid names.""" IntrospectionController().show_validators(type_name) @inspect_app.command(name="processor") def processors_command( - type_name: str = typer.Argument( - help="Processor type name (e.g., 'drop_columns'). Pass 'all' to dump every processor type." - ), + type_name: str = typer.Argument(help="Type name (e.g. 'drop_columns'), or 'all'."), ) -> None: - """Return the full config schema for a processor type, including field names, types, defaults, and descriptions. Run `list processors` to discover valid type names.""" + """Show schema for a processor config type. Run `list processors` for valid names.""" IntrospectionController().show_processors(type_name) @inspect_app.command(name="sampler-constraints") def constraints_command() -> None: - """Return schemas for sampler constraint types: ScalarInequalityConstraint, ColumnInequalityConstraint, and the InequalityOperator enum. Use when adding value constraints to sampler columns.""" + """Show constraint schemas for sampler columns.""" IntrospectionController().show_sampler_constraints() @inspect_app.command(name="config-builder") def config_builder_command() -> None: - """Return DataDesignerConfigBuilder method signatures and docstrings. Use to understand available builder methods and their parameters.""" + """Show DataDesignerConfigBuilder method signatures and docstrings.""" IntrospectionController().show_builder() diff --git a/packages/data-designer/src/data_designer/cli/commands/agent_helpers/list.py b/packages/data-designer/src/data_designer/cli/commands/agent_helpers/list.py index 7f4717b2c..868ad4670 100644 --- a/packages/data-designer/src/data_designer/cli/commands/agent_helpers/list.py +++ b/packages/data-designer/src/data_designer/cli/commands/agent_helpers/list.py @@ -10,45 +10,42 @@ list_app = typer.Typer( name="list", - help=( - "Enumerate valid names and classes for configuration fields." - " Use these names as arguments to `inspect` commands for detailed schemas." 
- ), + help="List valid type names for use with `inspect` commands.", no_args_is_help=True, ) @list_app.command(name="model-aliases") def model_aliases_command() -> None: - """List all configured model aliases with their backing model identifiers. Required to set model_alias on LLM column configs.""" + """List configured model aliases and backing models. Needed for model_alias on LLM columns.""" ListController(DATA_DESIGNER_HOME).list_model_aliases() @list_app.command(name="persona-datasets") def persona_datasets_command() -> None: - """List available Nemotron-Persona datasets and whether each is installed locally.""" + """List Nemotron-Persona datasets and install status.""" ListController(DATA_DESIGNER_HOME).list_persona_datasets() @list_app.command(name="columns") def column_types_command() -> None: - """List all column type names and their config classes. Pass a name to `inspect column ` for the full schema.""" + """List column type names and config classes.""" ListController(DATA_DESIGNER_HOME).list_column_types() @list_app.command(name="samplers") def sampler_types_command() -> None: - """List all sampler type names and their params classes. Pass a name to `inspect sampler ` for the full schema.""" + """List sampler type names and params classes.""" ListController(DATA_DESIGNER_HOME).list_sampler_types() @list_app.command(name="validators") def validator_types_command() -> None: - """List all validator type names and their params classes. Pass a name to `inspect validator ` for the full schema.""" + """List validator type names and params classes.""" ListController(DATA_DESIGNER_HOME).list_validator_types() @list_app.command(name="processors") def processor_types_command() -> None: - """List all processor type names and their config classes. 
Pass a name to `inspect processor ` for the full schema.""" + """List processor type names and config classes.""" ListController(DATA_DESIGNER_HOME).list_processor_types() diff --git a/packages/data-designer/src/data_designer/cli/main.py b/packages/data-designer/src/data_designer/cli/main.py index ae184df9c..bd782704c 100644 --- a/packages/data-designer/src/data_designer/cli/main.py +++ b/packages/data-designer/src/data_designer/cli/main.py @@ -103,8 +103,8 @@ app.add_typer(config_app, name="config", rich_help_panel="Setup Commands") app.add_typer(download_app, name="download", rich_help_panel="Setup Commands") -# Add agent command groups (designed for AI agent consumption: schema introspection and valid-value discovery) -title_agent_helpers = "Agent-Helper Commands (schema introspection and valid-value discovery)" +# Add agent command groups +title_agent_helpers = "Agent-Helper Commands" app.add_typer(inspect_cmd.inspect_app, name="inspect", rich_help_panel=title_agent_helpers) From 4d19e90a469d2655eacac20cfc40a767f4e6dae6 Mon Sep 17 00:00:00 2001 From: Johnny Greco Date: Tue, 17 Feb 2026 20:03:52 -0500 Subject: [PATCH 26/37] docs: use column header names in list command tips for clarity --- .../data_designer/cli/controllers/list_controller.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/packages/data-designer/src/data_designer/cli/controllers/list_controller.py b/packages/data-designer/src/data_designer/cli/controllers/list_controller.py index b649511a0..c5ae34b56 100644 --- a/packages/data-designer/src/data_designer/cli/controllers/list_controller.py +++ b/packages/data-designer/src/data_designer/cli/controllers/list_controller.py @@ -101,8 +101,8 @@ def list_persona_datasets(self) -> None: status = "installed" if entry["installed"] else "not installed" typer.echo(f"{str(entry['locale']):<{max_width}} {status}") typer.echo("") - typer.echo("Tip: Use the PersonSamplerParams locale parameter to select a dataset.") - 
typer.echo("The user can run `data-designer download personas --locale ` to install a dataset.") + typer.echo("Use the PersonSamplerParams locale parameter to select a dataset.") + typer.echo("Run `data-designer download personas --locale ` to install a dataset.") def list_column_types(self) -> None: """List available column configuration types.""" @@ -116,7 +116,7 @@ def list_column_types(self) -> None: for t in sorted_types: typer.echo(f"{t:<{max_width}} {items[t].__name__}") typer.echo("") - typer.echo("Tip: Run `data-designer inspect column ` for full schema details.") + typer.echo("Run `data-designer inspect column ` to see that type's full schema.") def list_sampler_types(self) -> None: """List available sampler types.""" @@ -130,7 +130,7 @@ def list_sampler_types(self) -> None: for t in sorted_types: typer.echo(f"{t:<{max_width}} {items[t].__name__}") typer.echo("") - typer.echo("Tip: Run `data-designer inspect sampler ` for full schema details.") + typer.echo("Run `data-designer inspect sampler ` to see that type's full schema.") def list_validator_types(self) -> None: """List available validator types.""" @@ -144,7 +144,7 @@ def list_validator_types(self) -> None: for t in sorted_types: typer.echo(f"{t:<{max_width}} {items[t].__name__}") typer.echo("") - typer.echo("Tip: Run `data-designer inspect validator ` for full schema details.") + typer.echo("Run `data-designer inspect validator ` to see that type's full schema.") def list_processor_types(self) -> None: """List available processor types.""" @@ -158,4 +158,4 @@ def list_processor_types(self) -> None: for t in sorted_types: typer.echo(f"{t:<{max_width}} {items[t].__name__}") typer.echo("") - typer.echo("Tip: Run `data-designer inspect processor ` for full schema details.") + typer.echo("Run `data-designer inspect processor ` to see that type's full schema.") From 45e06a824d27444da5a2d75d551e9767b33812b5 Mon Sep 17 00:00:00 2001 From: Johnny Greco Date: Tue, 17 Feb 2026 20:08:05 -0500 Subject: [PATCH 
27/37] docs: sharpen inspect and list group-level help descriptions --- .../cli/commands/agent_helpers/inspect.py | 10 +++++----- .../data_designer/cli/commands/agent_helpers/list.py | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/packages/data-designer/src/data_designer/cli/commands/agent_helpers/inspect.py b/packages/data-designer/src/data_designer/cli/commands/agent_helpers/inspect.py index 35bcc9fb8..c4f08dee6 100644 --- a/packages/data-designer/src/data_designer/cli/commands/agent_helpers/inspect.py +++ b/packages/data-designer/src/data_designer/cli/commands/agent_helpers/inspect.py @@ -9,7 +9,7 @@ inspect_app = typer.Typer( name="inspect", - help="Show schemas and method signatures for configuration types. Run `list` to discover valid type names.", + help="Inspect detailed schemas for configuration objects and the Python API.", no_args_is_help=True, ) @@ -18,7 +18,7 @@ def columns_command( type_name: str = typer.Argument(help="Type name (e.g. 'llm-text', 'sampler'), or 'all'."), ) -> None: - """Show schema for a column config type. Run `list columns` for valid names.""" + """Show schema for a column config type.""" IntrospectionController().show_columns(type_name) @@ -26,7 +26,7 @@ def columns_command( def samplers_command( type_name: str = typer.Argument(help="Type name (e.g. 'category', 'uniform'), or 'all'."), ) -> None: - """Show schema for a sampler params type. Run `list samplers` for valid names.""" + """Show schema for a sampler params type.""" IntrospectionController().show_samplers(type_name) @@ -34,7 +34,7 @@ def samplers_command( def validators_command( type_name: str = typer.Argument(help="Type name (e.g. 'code', 'python'), or 'all'."), ) -> None: - """Show schema for a validator params type. 
Run `list validators` for valid names.""" + """Show schema for a validator params type.""" IntrospectionController().show_validators(type_name) @@ -42,7 +42,7 @@ def validators_command( def processors_command( type_name: str = typer.Argument(help="Type name (e.g. 'drop_columns'), or 'all'."), ) -> None: - """Show schema for a processor config type. Run `list processors` for valid names.""" + """Show schema for a processor config type.""" IntrospectionController().show_processors(type_name) diff --git a/packages/data-designer/src/data_designer/cli/commands/agent_helpers/list.py b/packages/data-designer/src/data_designer/cli/commands/agent_helpers/list.py index 868ad4670..813f6cc21 100644 --- a/packages/data-designer/src/data_designer/cli/commands/agent_helpers/list.py +++ b/packages/data-designer/src/data_designer/cli/commands/agent_helpers/list.py @@ -10,7 +10,7 @@ list_app = typer.Typer( name="list", - help="List valid type names for use with `inspect` commands.", + help="List available types, model aliases, and persona datasets.", no_args_is_help=True, ) From e8d370837a5e5ae97257f4db399f05b44e3e1034 Mon Sep 17 00:00:00 2001 From: Johnny Greco Date: Tue, 17 Feb 2026 20:15:21 -0500 Subject: [PATCH 28/37] refactor: remove related_inspect_tip from inspect command output --- .../cli/controllers/introspection_controller.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/packages/data-designer/src/data_designer/cli/controllers/introspection_controller.py b/packages/data-designer/src/data_designer/cli/controllers/introspection_controller.py index 30f7720b5..a2b6c9818 100644 --- a/packages/data-designer/src/data_designer/cli/controllers/introspection_controller.py +++ b/packages/data-designer/src/data_designer/cli/controllers/introspection_controller.py @@ -34,7 +34,6 @@ class _TypedCommandSpec: class_label: str header_title: str case_insensitive: bool = False - related_inspect_tip: str | None = None _CONFIG_IMPORT = "import data_designer.config as dd" @@ -55,10 
+54,6 @@ class IntrospectionController: class_label="config_class", header_title="Data Designer Column Types Reference", case_insensitive=True, - related_inspect_tip=( - "Tip: Use 'data-designer inspect sampler ' for sampler params," - " 'inspect validator ' for validator params." - ), ), "samplers": _TypedCommandSpec( discover_items=discover_sampler_types, @@ -133,7 +128,6 @@ def _show_typed_command(self, command_name: str, type_name: str | None) -> None: class_label=spec.class_label, header_title=spec.header_title, case_insensitive=spec.case_insensitive, - related_inspect_tip=spec.related_inspect_tip, ) def _show_typed_items( @@ -176,9 +170,6 @@ def _show_typed_items( self._emit_import_hint(_CONFIG_IMPORT, f"dd.{cls.__name__}") typer.echo(format_model_text(cls, type_key=type_key, type_value=canonical_value)) - if related_inspect_tip: - typer.echo("") - typer.echo(related_inspect_tip) def _show_all_typed( self, From 122346fe4221505971e6dcf0f82ad3b05e2747bc Mon Sep 17 00:00:00 2001 From: Johnny Greco Date: Tue, 17 Feb 2026 20:59:28 -0500 Subject: [PATCH 29/37] refactor: remove dead code from introspection services Remove PropertyInfo, inspect_class_properties, _collect_classmethod_names, is_classmethod field from MethodInfo, format_method_info_json, and _param_to_json. None of these are called in production. Also removes _extract_nested_basemodel and PropertyInfo from __init__.py re-exports. 
--- .../cli/services/introspection/__init__.py | 8 --- .../cli/services/introspection/formatters.py | 42 +++---------- .../introspection/method_inspector.py | 61 +------------------ 3 files changed, 8 insertions(+), 103 deletions(-) diff --git a/packages/data-designer/src/data_designer/cli/services/introspection/__init__.py b/packages/data-designer/src/data_designer/cli/services/introspection/__init__.py index 1063da54b..4396183e5 100644 --- a/packages/data-designer/src/data_designer/cli/services/introspection/__init__.py +++ b/packages/data-designer/src/data_designer/cli/services/introspection/__init__.py @@ -11,40 +11,32 @@ discover_validator_types, ) from data_designer.cli.services.introspection.formatters import ( - format_method_info_json, format_method_info_text, format_type_list_text, ) from data_designer.cli.services.introspection.method_inspector import ( MethodInfo, ParamInfo, - PropertyInfo, inspect_class_methods, - inspect_class_properties, ) from data_designer.cli.services.introspection.pydantic_inspector import ( - _extract_nested_basemodel, format_model_text, format_type, get_brief_description, ) __all__ = [ - "_extract_nested_basemodel", "discover_column_configs", "discover_constraint_types", "discover_processor_configs", "discover_sampler_types", "discover_validator_types", - "format_method_info_json", "format_method_info_text", "format_model_text", "format_type_list_text", "format_type", "get_brief_description", "inspect_class_methods", - "inspect_class_properties", "MethodInfo", "ParamInfo", - "PropertyInfo", ] diff --git a/packages/data-designer/src/data_designer/cli/services/introspection/formatters.py b/packages/data-designer/src/data_designer/cli/services/introspection/formatters.py index f1a3532f5..e2c7f8c44 100644 --- a/packages/data-designer/src/data_designer/cli/services/introspection/formatters.py +++ b/packages/data-designer/src/data_designer/cli/services/introspection/formatters.py @@ -5,16 +5,18 @@ from 
data_designer.cli.services.introspection.method_inspector import MethodInfo, ParamInfo +_MIN_CLASS_COL_WIDTH = 25 + def _format_param_text(param: ParamInfo, indent: int) -> str: """Format a single method parameter as a text line.""" pad = " " * indent - parts = [f"{pad}{param.name}: {param.type_str}"] + line = f"{pad}{param.name}: {param.type_str}" if param.default is not None: - parts[0] += f" = {param.default}" + line += f" = {param.default}" if param.description: - parts[0] += f" \u2014 {param.description}" - return parts[0] + line += f" \u2014 {param.description}" + return line def format_method_info_text(methods: list[MethodInfo], class_name: str | None = None) -> str: @@ -37,36 +39,6 @@ def format_method_info_text(methods: list[MethodInfo], class_name: str | None = return "\n".join(lines).rstrip() -def _param_to_json(param: ParamInfo) -> dict: - """Convert a ParamInfo to a JSON-serializable dict.""" - result: dict = { - "name": param.name, - "type": param.type_str, - } - if param.default is not None: - result["default"] = param.default - if param.description: - result["description"] = param.description - return result - - -def format_method_info_json(methods: list[MethodInfo]) -> list[dict]: - """Convert a list of MethodInfo to a JSON-serializable list of dicts.""" - result: list[dict] = [] - for method in methods: - entry: dict = { - "name": method.name, - "signature": method.signature, - "return_type": method.return_type, - } - if method.description: - entry["description"] = method.description - if method.parameters: - entry["parameters"] = [_param_to_json(p) for p in method.parameters] - result.append(entry) - return result - - def format_type_list_text(items: dict[str, type], type_label: str, class_label: str) -> str: """Format a summary table of type->class mappings, matching the existing print_list_table style.""" sorted_items = sorted(items.items()) @@ -78,7 +50,7 @@ def format_type_list_text(items: dict[str, type], type_label: str, class_label: lines: 
list[str] = [] lines.append(f"{type_label:<{type_width}} {class_label}") - lines.append(f"{'-' * type_width} {'-' * max(len(class_label), 25)}") + lines.append(f"{'-' * type_width} {'-' * max(len(class_label), _MIN_CLASS_COL_WIDTH)}") for type_value, cls in sorted_items: lines.append(f"{type_value:<{type_width}} {cls.__name__}") diff --git a/packages/data-designer/src/data_designer/cli/services/introspection/method_inspector.py b/packages/data-designer/src/data_designer/cli/services/introspection/method_inspector.py index e5784b7da..456439ae6 100644 --- a/packages/data-designer/src/data_designer/cli/services/introspection/method_inspector.py +++ b/packages/data-designer/src/data_designer/cli/services/introspection/method_inspector.py @@ -23,14 +23,9 @@ class MethodInfo: description: str return_type: str parameters: list[ParamInfo] = field(default_factory=list) - is_classmethod: bool = False -@dataclass -class PropertyInfo: - name: str - return_type: str - description: str +_DEFAULT_INIT_DOCSTRING = "Initialize self. See help(type(self)) for accurate signature." def _parse_google_docstring_args(docstring: str | None) -> dict[str, str]: @@ -93,8 +88,6 @@ def _parse_google_docstring_args(docstring: str | None) -> dict[str, str]: def _join_desc_lines(lines: list[str]) -> str: """Join description lines, collapsing whitespace and stripping trailing blanks.""" - while lines and not lines[-1]: - lines.pop() return " ".join(part for part in lines if part) @@ -214,9 +207,6 @@ def _is_private(name: str) -> bool: return name.startswith("_") and not (name.startswith("__") and name.endswith("__")) -_DEFAULT_INIT_DOCSTRING = "Initialize self. See help(type(self)) for accurate signature." 
- - def _is_default_init_docstring(docstring: str | None) -> bool: """Check if a docstring is the unhelpful default __init__ docstring.""" if not docstring: @@ -239,7 +229,6 @@ def inspect_class_methods(cls: type, include_private: bool = False) -> list[Meth List of MethodInfo objects for each method. """ methods: list[MethodInfo] = [] - classmethod_names = _collect_classmethod_names(cls) # inspect.isfunction finds regular methods; inspect.ismethod finds classmethods seen: set[str] = set() @@ -281,55 +270,7 @@ def inspect_class_methods(cls: type, include_private: bool = False) -> list[Meth description=description, return_type=return_type, parameters=parameters, - is_classmethod=name in classmethod_names, ) ) return methods - - -def _collect_classmethod_names(cls: type) -> set[str]: - """Collect the names of all classmethods defined on a class and its bases.""" - names: set[str] = set() - for klass in cls.__mro__: - for name, value in vars(klass).items(): - if isinstance(value, classmethod): - names.add(name) - return names - - -def inspect_class_properties(cls: type, include_private: bool = False) -> list[PropertyInfo]: - """Introspect properties of a class. - - Args: - cls: The class to introspect. - include_private: If True, include properties starting with underscore. - - Returns: - List of PropertyInfo objects for each property. 
- """ - properties: list[PropertyInfo] = [] - - for name in dir(cls): - if _is_dunder(name): - continue - if _is_private(name) and not include_private: - continue - - attr = inspect.getattr_static(cls, name, None) - if not isinstance(attr, property): - continue - - return_type = "Any" - if attr.fget is not None: - hints = getattr(attr.fget, "__annotations__", {}) - ret = hints.get("return") - if ret is not None: - return_type = _format_annotation(ret) - - docstring = attr.fget.__doc__ if attr.fget is not None else None - description = _get_first_docstring_line(docstring) - - properties.append(PropertyInfo(name=name, return_type=return_type, description=description)) - - return properties From 9ad0399855747f9cd8dcf217ecdd29a700f308b3 Mon Sep 17 00:00:00 2001 From: Johnny Greco Date: Tue, 17 Feb 2026 20:59:52 -0500 Subject: [PATCH 30/37] fix: harden introspection service layer - Fix get_brief_description to iterate lines instead of taking first (handles whitespace-first-line docstrings) - Remove dead elif branch in _extract_nested_basemodel union handler - Remove input mutation in _join_desc_lines - Move _DEFAULT_INIT_DOCSTRING constant to top of method_inspector.py - Extract _MIN_CLASS_COL_WIDTH and _NO_DESCRIPTION constants - Use plain string variable instead of single-element list in formatters - Document enum_name param in _discover_params_by_discriminator --- .../cli/services/introspection/discovery.py | 1 + .../services/introspection/pydantic_inspector.py | 13 +++++++------ 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/packages/data-designer/src/data_designer/cli/services/introspection/discovery.py b/packages/data-designer/src/data_designer/cli/services/introspection/discovery.py index c7ab08bf4..663efccf6 100644 --- a/packages/data-designer/src/data_designer/cli/services/introspection/discovery.py +++ b/packages/data-designer/src/data_designer/cli/services/introspection/discovery.py @@ -75,6 +75,7 @@ def _discover_params_by_discriminator( Args: 
params_class_suffix: Class-name suffix to select params classes. discriminator_field: Field name that stores the literal discriminator. + enum_name: Enum class name to use for fallback name-matching. Returns: Dict mapping discriminator values to params classes. diff --git a/packages/data-designer/src/data_designer/cli/services/introspection/pydantic_inspector.py b/packages/data-designer/src/data_designer/cli/services/introspection/pydantic_inspector.py index a331279b3..8c7c61e99 100644 --- a/packages/data-designer/src/data_designer/cli/services/introspection/pydantic_inspector.py +++ b/packages/data-designer/src/data_designer/cli/services/introspection/pydantic_inspector.py @@ -12,6 +12,8 @@ from pydantic import BaseModel from pydantic_core import PydanticUndefined +_NO_DESCRIPTION = "No description available." + def _is_basemodel_subclass(cls: Any) -> bool: """Return True if cls is a concrete BaseModel subclass (not BaseModel itself).""" @@ -91,8 +93,6 @@ def _extract_nested_basemodel(annotation: Any) -> type | None: result = _extract_nested_basemodel(arg) if result is not None: basemodel_classes.append(result) - elif _is_basemodel_subclass(arg): - basemodel_classes.append(arg) if len(basemodel_classes) == 1: return basemodel_classes[0] return None @@ -147,10 +147,11 @@ def format_type(annotation: Any) -> str: def get_brief_description(cls: type) -> str: """Extract first line from class docstring.""" if cls.__doc__: - doc = cls.__doc__.strip() - first_line = doc.split("\n")[0].strip() - return first_line - return "No description available." 
+ for line in cls.__doc__.strip().split("\n"): + stripped = line.strip() + if stripped: + return stripped + return _NO_DESCRIPTION def _extract_constraints(field_info: Any) -> dict[str, Any] | None: From fe9ebf6a6ae888115960fd3e525c359f18a292c9 Mon Sep 17 00:00:00 2001 From: Johnny Greco Date: Tue, 17 Feb 2026 20:59:59 -0500 Subject: [PATCH 31/37] refactor: clean up IntrospectionController - Remove unreachable type_name=None branch from _show_typed_items (Typer always provides a string); move list-mode handling to _show_typed_command instead - Replace inline chr(10) docstring splitting with get_brief_description - Remove template dd. hint from show_sampler_constraints - Remove unused related_inspect_tip parameter --- .../controllers/introspection_controller.py | 32 ++++++++----------- 1 file changed, 13 insertions(+), 19 deletions(-) diff --git a/packages/data-designer/src/data_designer/cli/controllers/introspection_controller.py b/packages/data-designer/src/data_designer/cli/controllers/introspection_controller.py index a2b6c9818..3f0a1e058 100644 --- a/packages/data-designer/src/data_designer/cli/controllers/introspection_controller.py +++ b/packages/data-designer/src/data_designer/cli/controllers/introspection_controller.py @@ -15,12 +15,9 @@ discover_sampler_types, discover_validator_types, ) -from data_designer.cli.services.introspection.formatters import ( - format_method_info_text, - format_type_list_text, -) +from data_designer.cli.services.introspection.formatters import format_method_info_text, format_type_list_text from data_designer.cli.services.introspection.method_inspector import inspect_class_methods -from data_designer.cli.services.introspection.pydantic_inspector import format_model_text +from data_designer.cli.services.introspection.pydantic_inspector import format_model_text, get_brief_description from data_designer.config.config_builder import DataDesignerConfigBuilder @@ -113,19 +110,24 @@ def show_builder(self) -> None: def 
show_sampler_constraints(self) -> None: """Show sampler constraint types.""" - self._emit_import_hint(_CONFIG_IMPORT, "dd.") + self._emit_import_hint(_CONFIG_IMPORT) items = discover_constraint_types() self._show_all_schemas(items, "Data Designer Constraint Types Reference") def _show_typed_command(self, command_name: str, type_name: str | None) -> None: """Resolve a typed-command spec and render it.""" spec = self._TYPED_COMMAND_SPECS[command_name] + items = spec.discover_items() + + if type_name is None: + self._emit_import_hint(_CONFIG_IMPORT) + typer.echo(format_type_list_text(items, spec.type_label, spec.class_label)) + return + self._show_typed_items( - items=spec.discover_items(), + items=items, type_name=type_name, type_key=spec.type_key, - type_label=spec.type_label, - class_label=spec.class_label, header_title=spec.header_title, case_insensitive=spec.case_insensitive, ) @@ -133,20 +135,12 @@ def _show_typed_command(self, command_name: str, type_name: str | None) -> None: def _show_typed_items( self, items: dict[str, type], - type_name: str | None, + type_name: str, type_key: str, - type_label: str, - class_label: str, header_title: str, case_insensitive: bool = False, - related_inspect_tip: str | None = None, ) -> None: """Shared logic for type-based commands (columns, samplers, validators, processors).""" - if type_name is None: - self._emit_import_hint(_CONFIG_IMPORT, "dd.") - typer.echo(format_type_list_text(items, type_label, class_label)) - return - if type_name.lower() == "all": self._show_all_typed(items, type_key, header_title) return @@ -200,7 +194,7 @@ def _show_all_schemas(self, items: dict[str, type], header_title: str) -> None: else: lines.append(f"{cls.__name__}:") if cls.__doc__: - lines.append(f" description: {cls.__doc__.strip().split(chr(10))[0]}") + lines.append(f" description: {get_brief_description(cls)}") if hasattr(cls, "__members__"): members = [str(m.value) for m in cls] lines.append(f" values: [{', '.join(members)}]") From 
f1e3593c9dda0e664ca16f57fcbc95508860a259 Mon Sep 17 00:00:00 2001 From: Johnny Greco Date: Tue, 17 Feb 2026 21:00:07 -0500 Subject: [PATCH 32/37] fix: harden ListController and eliminate DRY violation - Fix max() crash on empty discovery results in all list_* methods - Extract _print_type_table helper to replace 4 near-identical methods - Add import hint (# import data_designer.config as dd) to all list output - Remove inconsistent Nemotron-Personas banner from list_persona_datasets - Add early return for empty persona dataset list --- .../cli/controllers/list_controller.py | 76 +++++++++---------- 1 file changed, 35 insertions(+), 41 deletions(-) diff --git a/packages/data-designer/src/data_designer/cli/controllers/list_controller.py b/packages/data-designer/src/data_designer/cli/controllers/list_controller.py index c5ae34b56..98a376c0e 100644 --- a/packages/data-designer/src/data_designer/cli/controllers/list_controller.py +++ b/packages/data-designer/src/data_designer/cli/controllers/list_controller.py @@ -19,6 +19,8 @@ ) from data_designer.config.default_model_settings import get_providers_with_missing_api_keys +_IMPORT_HINT = "# import data_designer.config as dd" + class ListController: """Controller for listing valid configuration values.""" @@ -85,13 +87,17 @@ def list_model_aliases(self) -> None: def list_persona_datasets(self) -> None: """List persona datasets available for PersonSamplerParams.""" managed_locales = self._persona_repository.list_all() + if not managed_locales: + typer.echo("No persona datasets found.") + return + entries: list[dict[str, str | bool]] = [] for locale in managed_locales: installed = self._download_service.is_locale_downloaded(locale.code) entries.append({"locale": locale.code, "installed": installed}) - typer.echo("Nemotron-Personas Datasets") - typer.echo("-" * 26) + typer.echo(_IMPORT_HINT) + typer.echo("") col1 = "locale" col2 = "status" max_width = max(len(col1), max(len(str(entry["locale"])) for entry in entries)) @@ 
-104,58 +110,46 @@ def list_persona_datasets(self) -> None: typer.echo("Use the PersonSamplerParams locale parameter to select a dataset.") typer.echo("Run `data-designer download personas --locale ` to install a dataset.") - def list_column_types(self) -> None: - """List available column configuration types.""" - items = discover_column_configs() - sorted_types = sorted(items.keys()) + def _print_type_table( + self, + items: dict[str, type], + col1: str, + col2: str, + inspect_command: str, + ) -> None: + """Print a two-column table of discovered types with an inspect tip.""" + if not items: + typer.echo("No items found.") + return - col1, col2 = "column_type", "config_class" + sorted_types = sorted(items.keys()) max_width = max(len(col1), max(len(t) for t in sorted_types)) + + typer.echo(_IMPORT_HINT) + typer.echo("") typer.echo(f"{col1:<{max_width}} {col2}") typer.echo(f"{'-' * max_width} {'-' * max(len(items[t].__name__) for t in sorted_types)}") for t in sorted_types: typer.echo(f"{t:<{max_width}} {items[t].__name__}") typer.echo("") - typer.echo("Run `data-designer inspect column ` to see that type's full schema.") + typer.echo(f"Run `data-designer inspect {inspect_command}` to see that type's full schema.") + + def list_column_types(self) -> None: + """List available column configuration types.""" + self._print_type_table(discover_column_configs(), "column_type", "config_class", "column ") def list_sampler_types(self) -> None: """List available sampler types.""" - items = discover_sampler_types() - sorted_types = sorted(items.keys()) - - col1, col2 = "sampler_type", "params_class" - max_width = max(len(col1), max(len(t) for t in sorted_types)) - typer.echo(f"{col1:<{max_width}} {col2}") - typer.echo(f"{'-' * max_width} {'-' * max(len(items[t].__name__) for t in sorted_types)}") - for t in sorted_types: - typer.echo(f"{t:<{max_width}} {items[t].__name__}") - typer.echo("") - typer.echo("Run `data-designer inspect sampler ` to see that type's full schema.") + 
self._print_type_table(discover_sampler_types(), "sampler_type", "params_class", "sampler ") def list_validator_types(self) -> None: """List available validator types.""" - items = discover_validator_types() - sorted_types = sorted(items.keys()) - - col1, col2 = "validator_type", "params_class" - max_width = max(len(col1), max(len(t) for t in sorted_types)) - typer.echo(f"{col1:<{max_width}} {col2}") - typer.echo(f"{'-' * max_width} {'-' * max(len(items[t].__name__) for t in sorted_types)}") - for t in sorted_types: - typer.echo(f"{t:<{max_width}} {items[t].__name__}") - typer.echo("") - typer.echo("Run `data-designer inspect validator ` to see that type's full schema.") + self._print_type_table( + discover_validator_types(), "validator_type", "params_class", "validator " + ) def list_processor_types(self) -> None: """List available processor types.""" - items = discover_processor_configs() - sorted_types = sorted(items.keys()) - - col1, col2 = "processor_type", "config_class" - max_width = max(len(col1), max(len(t) for t in sorted_types)) - typer.echo(f"{col1:<{max_width}} {col2}") - typer.echo(f"{'-' * max_width} {'-' * max(len(items[t].__name__) for t in sorted_types)}") - for t in sorted_types: - typer.echo(f"{t:<{max_width}} {items[t].__name__}") - typer.echo("") - typer.echo("Run `data-designer inspect processor ` to see that type's full schema.") + self._print_type_table( + discover_processor_configs(), "processor_type", "config_class", "processor " + ) From 8ca35c474af6b70cf4f0bcd2c23973a44670e1f5 Mon Sep 17 00:00:00 2001 From: Johnny Greco Date: Tue, 17 Feb 2026 21:00:14 -0500 Subject: [PATCH 33/37] docs: polish help text and field description consistency - Remove trailing periods from ModelProvider field descriptions to match the convention used across all other config models - Fix ambiguous 'sampler' example in inspect column help text - Update main CLI help to "Data Designer CLI for humans and agents." 
--- .../src/data_designer/config/models.py | 16 ++++++---------- .../cli/commands/agent_helpers/inspect.py | 2 +- .../data-designer/src/data_designer/cli/main.py | 2 +- 3 files changed, 8 insertions(+), 12 deletions(-) diff --git a/packages/data-designer-config/src/data_designer/config/models.py b/packages/data-designer-config/src/data_designer/config/models.py index 13ce8d60e..76b66ac0f 100644 --- a/packages/data-designer-config/src/data_designer/config/models.py +++ b/packages/data-designer-config/src/data_designer/config/models.py @@ -581,16 +581,12 @@ class ModelProvider(ConfigBase): extra_headers: Additional headers to pass in API requests. """ - name: str = Field(description="Name of the model provider.") - endpoint: str = Field(description="API endpoint URL for the provider.") - provider_type: str = Field(default="openai", description="Provider type. Determines the API format to use.") - api_key: str | None = Field(default=None, description="Optional API key for authentication.") - extra_body: dict[str, Any] | None = Field( - default=None, description="Additional parameters to pass in API requests." - ) - extra_headers: dict[str, str] | None = Field( - default=None, description="Additional headers to pass in API requests." - ) + name: str = Field(description="Name of the model provider") + endpoint: str = Field(description="API endpoint URL for the provider") + provider_type: str = Field(default="openai", description="Provider type. 
Determines the API format to use") + api_key: str | None = Field(default=None, description="Optional API key for authentication") + extra_body: dict[str, Any] | None = Field(default=None, description="Additional parameters to pass in API requests") + extra_headers: dict[str, str] | None = Field(default=None, description="Additional headers to pass in API requests") def load_model_configs(model_configs: list[ModelConfig] | str | Path) -> list[ModelConfig]: diff --git a/packages/data-designer/src/data_designer/cli/commands/agent_helpers/inspect.py b/packages/data-designer/src/data_designer/cli/commands/agent_helpers/inspect.py index c4f08dee6..24794b97c 100644 --- a/packages/data-designer/src/data_designer/cli/commands/agent_helpers/inspect.py +++ b/packages/data-designer/src/data_designer/cli/commands/agent_helpers/inspect.py @@ -16,7 +16,7 @@ @inspect_app.command(name="column") def columns_command( - type_name: str = typer.Argument(help="Type name (e.g. 'llm-text', 'sampler'), or 'all'."), + type_name: str = typer.Argument(help="Type name (e.g. 
'llm-text', 'expression'), or 'all'."), ) -> None: """Show schema for a column config type.""" IntrospectionController().show_columns(type_name) diff --git a/packages/data-designer/src/data_designer/cli/main.py b/packages/data-designer/src/data_designer/cli/main.py index bd782704c..63b5f0733 100644 --- a/packages/data-designer/src/data_designer/cli/main.py +++ b/packages/data-designer/src/data_designer/cli/main.py @@ -13,7 +13,7 @@ # Initialize Typer app with custom configuration app = typer.Typer( name="data-designer", - help="Data Designer CLI - Configure model providers and models for synthetic data generation", + help="Data Designer CLI for humans and agents.", cls=create_lazy_typer_group( { "preview": { From 589aafa2339aadf5c502494952ef98a6e6a2092b Mon Sep 17 00:00:00 2001 From: Johnny Greco Date: Tue, 17 Feb 2026 21:00:45 -0500 Subject: [PATCH 34/37] test: add coverage for introspection edge cases and crash paths - Add empty discovery tests for all ListController list_* methods - Add empty persona dataset test - Add processors specific/all/nonexistent tests for IntrospectionController - Add mixed-case lookup tests (LLM-TEXT, CATEGORY) - Add method_inspector edge cases (empty class, signature error, varargs, keyword-only params, _is_dunder/_is_private parametrized) - Add _extract_literal_discriminator_value direct tests - Add _default_to_json parametrized tests (9 branches) - Add format_type regex branch tests - Add format_model_text empty model test - Add format_method_info_text edge cases (empty list, no description, no parameters) - Add processors CLI command tests - Fix broken persona banner assertion after banner removal --- .../test_introspection_commands.py | 14 +- .../test_introspection_controller.py | 40 ++++ .../cli/controllers/test_list_controller.py | 47 ++++- .../services/introspection/test_discovery.py | 31 +++ .../services/introspection/test_formatters.py | 71 ++++--- .../introspection/test_method_inspector.py | 182 ++++++++++-------- 
.../introspection/test_pydantic_inspector.py | 88 +++++++++ 7 files changed, 363 insertions(+), 110 deletions(-) diff --git a/packages/data-designer/tests/cli/commands/agent_helpers/test_introspection_commands.py b/packages/data-designer/tests/cli/commands/agent_helpers/test_introspection_commands.py index 7783c61e2..bdaca7c25 100644 --- a/packages/data-designer/tests/cli/commands/agent_helpers/test_introspection_commands.py +++ b/packages/data-designer/tests/cli/commands/agent_helpers/test_introspection_commands.py @@ -98,6 +98,18 @@ def test_processors_no_arg_fails() -> None: assert result.exit_code != 0 +def test_processors_specific_type() -> None: + result = runner.invoke(app, ["inspect", "processor", "drop_columns"]) + assert result.exit_code == 0 + assert "DropColumnsProcessorConfig" in result.output + + +def test_processors_all() -> None: + result = runner.invoke(app, ["inspect", "processor", "all"]) + assert result.exit_code == 0 + assert "Data Designer Processor Types Reference" in result.output + + # --------------------------------------------------------------------------- # config-builder # --------------------------------------------------------------------------- @@ -155,7 +167,7 @@ def test_list_model_aliases() -> None: def test_list_persona_datasets() -> None: result = runner.invoke(app, ["list", "persona-datasets"]) assert result.exit_code == 0 - assert "Nemotron-Personas Datasets" in result.output + assert "locale" in result.output def test_list_column_types() -> None: diff --git a/packages/data-designer/tests/cli/controllers/test_introspection_controller.py b/packages/data-designer/tests/cli/controllers/test_introspection_controller.py index f3ab12ea4..195104cef 100644 --- a/packages/data-designer/tests/cli/controllers/test_introspection_controller.py +++ b/packages/data-designer/tests/cli/controllers/test_introspection_controller.py @@ -132,3 +132,43 @@ def test_show_processors_list_text(capsys: pytest.CaptureFixture[str]) -> None: captured = 
capsys.readouterr() assert "processor_type" in captured.out assert "config_class" in captured.out + + +def test_show_processors_specific_type(capsys: pytest.CaptureFixture[str]) -> None: + controller = IntrospectionController() + controller.show_processors(type_name="drop_columns") + captured = capsys.readouterr() + assert "DropColumnsProcessorConfig" in captured.out + + +def test_show_processors_all(capsys: pytest.CaptureFixture[str]) -> None: + controller = IntrospectionController() + controller.show_processors(type_name="all") + captured = capsys.readouterr() + assert "Data Designer Processor Types Reference" in captured.out + assert "processor_type:" in captured.out + + +def test_show_processors_nonexistent() -> None: + controller = IntrospectionController() + with pytest.raises(click.exceptions.Exit): + controller.show_processors(type_name="badname") + + +# --------------------------------------------------------------------------- +# case-insensitive lookup (P1-3) +# --------------------------------------------------------------------------- + + +def test_show_columns_mixed_case(capsys: pytest.CaptureFixture[str]) -> None: + controller = IntrospectionController() + controller.show_columns(type_name="LLM-TEXT") + captured = capsys.readouterr() + assert "LLMTextColumnConfig" in captured.out + + +def test_show_samplers_mixed_case(capsys: pytest.CaptureFixture[str]) -> None: + controller = IntrospectionController() + controller.show_samplers(type_name="CATEGORY") + captured = capsys.readouterr() + assert "sampler_type: category" in captured.out diff --git a/packages/data-designer/tests/cli/controllers/test_list_controller.py b/packages/data-designer/tests/cli/controllers/test_list_controller.py index d270deb51..293516644 100644 --- a/packages/data-designer/tests/cli/controllers/test_list_controller.py +++ b/packages/data-designer/tests/cli/controllers/test_list_controller.py @@ -381,7 +381,7 @@ def test_model_aliases_filtered_count_hint(tmp_path: Path, capsys: 
pytest.Captur def test_persona_datasets_text_none_installed(controller: ListController, capsys: pytest.CaptureFixture[str]) -> None: controller.list_persona_datasets() out = capsys.readouterr().out - assert "Nemotron-Personas Datasets" in out + assert "locale" in out assert "not installed" in out @@ -460,3 +460,48 @@ def test_processor_types_text(controller: ListController, capsys: pytest.Capture assert "processor_type" in out assert "config_class" in out assert "data-designer inspect processor" in out + + +# --------------------------------------------------------------------------- +# list_*_types — empty discovery (P0-1) +# --------------------------------------------------------------------------- + + +def test_list_column_types_empty_discovery(controller: ListController, capsys: pytest.CaptureFixture[str]) -> None: + with patch("data_designer.cli.controllers.list_controller.discover_column_configs", return_value={}): + controller.list_column_types() + out = capsys.readouterr().out + assert "No items found" in out + + +def test_list_sampler_types_empty_discovery(controller: ListController, capsys: pytest.CaptureFixture[str]) -> None: + with patch("data_designer.cli.controllers.list_controller.discover_sampler_types", return_value={}): + controller.list_sampler_types() + out = capsys.readouterr().out + assert "No items found" in out + + +def test_list_validator_types_empty_discovery(controller: ListController, capsys: pytest.CaptureFixture[str]) -> None: + with patch("data_designer.cli.controllers.list_controller.discover_validator_types", return_value={}): + controller.list_validator_types() + out = capsys.readouterr().out + assert "No items found" in out + + +def test_list_processor_types_empty_discovery(controller: ListController, capsys: pytest.CaptureFixture[str]) -> None: + with patch("data_designer.cli.controllers.list_controller.discover_processor_configs", return_value={}): + controller.list_processor_types() + out = capsys.readouterr().out + assert "No 
items found" in out + + +# --------------------------------------------------------------------------- +# list_persona_datasets — empty (P0-2) +# --------------------------------------------------------------------------- + + +def test_list_persona_datasets_empty(controller: ListController, capsys: pytest.CaptureFixture[str]) -> None: + with patch.object(controller._persona_repository, "list_all", return_value=[]): + controller.list_persona_datasets() + out = capsys.readouterr().out + assert "No persona datasets found" in out diff --git a/packages/data-designer/tests/cli/services/introspection/test_discovery.py b/packages/data-designer/tests/cli/services/introspection/test_discovery.py index 7d9087cbb..e2ae5a921 100644 --- a/packages/data-designer/tests/cli/services/introspection/test_discovery.py +++ b/packages/data-designer/tests/cli/services/introspection/test_discovery.py @@ -3,8 +3,12 @@ from __future__ import annotations +from enum import Enum +from typing import Literal + from data_designer.cli.services.introspection.discovery import ( _discover_by_modules, + _extract_literal_discriminator_value, discover_column_configs, discover_constraint_types, discover_processor_configs, @@ -125,3 +129,30 @@ def test_discover_by_modules_with_multiple_suffixes() -> None: def test_discover_by_modules_unknown_suffix_returns_empty() -> None: result = _discover_by_modules("nonexistent_module") assert result == {} + + +# --------------------------------------------------------------------------- +# _extract_literal_discriminator_value (P1-5) +# --------------------------------------------------------------------------- + + +class _TestEnum(str, Enum): + A = "alpha" + B = "beta" + + +def test_extract_literal_value_string() -> None: + assert _extract_literal_discriminator_value(Literal["foo"]) == "foo" + + +def test_extract_literal_value_enum() -> None: + result = _extract_literal_discriminator_value(Literal[_TestEnum.A]) + assert result == "alpha" + + +def 
test_extract_literal_non_literal() -> None: + assert _extract_literal_discriminator_value(str) is None + + +def test_extract_literal_int_value() -> None: + assert _extract_literal_discriminator_value(Literal[42]) == "42" diff --git a/packages/data-designer/tests/cli/services/introspection/test_formatters.py b/packages/data-designer/tests/cli/services/introspection/test_formatters.py index cc3e32e56..2bb19f2b1 100644 --- a/packages/data-designer/tests/cli/services/introspection/test_formatters.py +++ b/packages/data-designer/tests/cli/services/introspection/test_formatters.py @@ -4,7 +4,6 @@ from __future__ import annotations from data_designer.cli.services.introspection.formatters import ( - format_method_info_json, format_method_info_text, format_type_list_text, ) @@ -56,36 +55,6 @@ def test_format_method_info_text_no_class_name() -> None: assert "Methods:" not in text -# --------------------------------------------------------------------------- -# format_method_info_json -# --------------------------------------------------------------------------- - - -def test_format_method_info_json_basic() -> None: - methods = [_make_method()] - result = format_method_info_json(methods) - assert isinstance(result, list) - assert len(result) == 1 - entry = result[0] - assert entry["name"] == "do_thing" - assert entry["signature"] == "do_thing(x: int) -> str" - assert entry["return_type"] == "str" - assert "description" in entry - assert "parameters" in entry - - -def test_format_method_info_json_multiple_methods() -> None: - methods = [ - _make_method(name="method_a", signature="method_a() -> None", return_type="None", parameters=[]), - _make_method(name="method_b"), - ] - result = format_method_info_json(methods) - assert len(result) == 2 - names = [e["name"] for e in result] - assert "method_a" in names - assert "method_b" in names - - # --------------------------------------------------------------------------- # format_type_list_text # 
--------------------------------------------------------------------------- @@ -122,3 +91,43 @@ class C: def test_format_type_list_text_empty() -> None: text = format_type_list_text({}, "Type", "Class") assert "(no items)" in text + + +# --------------------------------------------------------------------------- +# format_method_info_text — edge cases (P1-7) +# --------------------------------------------------------------------------- + + +def test_format_method_info_text_empty_list() -> None: + text = format_method_info_text([], class_name="MyClass") + assert "MyClass Methods:" in text + lines = text.strip().split("\n") + assert len(lines) <= 2 + + +def test_format_method_info_text_no_description() -> None: + method = MethodInfo( + name="do_thing", + signature="do_thing() -> None", + description="", + return_type="None", + parameters=[], + ) + text = format_method_info_text([method]) + lines = text.strip().split("\n") + sig_line_idx = next(i for i, line in enumerate(lines) if "do_thing()" in line) + if sig_line_idx + 1 < len(lines): + next_line = lines[sig_line_idx + 1].strip() + assert next_line == "" or next_line.startswith("Parameters:") or "do_thing" not in next_line + + +def test_format_method_info_text_no_parameters() -> None: + method = MethodInfo( + name="do_thing", + signature="do_thing() -> None", + description="Does a thing.", + return_type="None", + parameters=[], + ) + text = format_method_info_text([method]) + assert "Parameters:" not in text diff --git a/packages/data-designer/tests/cli/services/introspection/test_method_inspector.py b/packages/data-designer/tests/cli/services/introspection/test_method_inspector.py index bc8e0def3..539ea2bfd 100644 --- a/packages/data-designer/tests/cli/services/introspection/test_method_inspector.py +++ b/packages/data-designer/tests/cli/services/introspection/test_method_inspector.py @@ -3,12 +3,14 @@ from __future__ import annotations +import pytest + from 
data_designer.cli.services.introspection.method_inspector import ( MethodInfo, - PropertyInfo, + _is_dunder, + _is_private, _parse_google_docstring_args, inspect_class_methods, - inspect_class_properties, ) # --------------------------------------------------------------------------- @@ -66,29 +68,6 @@ def regular_method(self) -> str: return "hello" -class ClassWithProperties: - """A class with properties for testing.""" - - def __init__(self) -> None: - self._name = "test" - self._count = 0 - - @property - def name(self) -> str: - """Get the name.""" - return self._name - - @property - def count(self) -> int: - """Get the count value.""" - return self._count - - @property - def _private_prop(self) -> bool: - """A private property.""" - return True - - class ClassWithDefaultInitDocstring: """A useful class that does important things. @@ -235,14 +214,6 @@ def test_inspect_class_methods_detects_classmethod() -> None: assert "regular_method" in names -def test_inspect_class_methods_classmethod_is_classmethod_flag() -> None: - methods = inspect_class_methods(ClassWithClassmethod, include_private=False) - from_value = next(m for m in methods if m.name == "from_value") - regular = next(m for m in methods if m.name == "regular_method") - assert from_value.is_classmethod is True - assert regular.is_classmethod is False - - def test_inspect_class_methods_classmethod_signature() -> None: methods = inspect_class_methods(ClassWithClassmethod, include_private=False) from_value = next(m for m in methods if m.name == "from_value") @@ -264,71 +235,128 @@ def test_inspect_class_methods_classmethod_parameters() -> None: assert value_param.description == "The input value." 
-def test_inspect_class_methods_regular_not_classmethod() -> None: - methods = inspect_class_methods(SampleClass, include_private=False) - for m in methods: - assert m.is_classmethod is False +# --------------------------------------------------------------------------- +# __init__ docstring fallback +# --------------------------------------------------------------------------- + + +def test_init_default_docstring_falls_back_to_class() -> None: + methods = inspect_class_methods(ClassWithDefaultInitDocstring, include_private=True) + init = next((m for m in methods if m.name == "__init__"), None) + assert init is not None + assert init.description == "A useful class that does important things." + + +def test_init_custom_docstring_preserved() -> None: + methods = inspect_class_methods(ClassWithCustomInitDocstring, include_private=True) + init = next((m for m in methods if m.name == "__init__"), None) + assert init is not None + assert init.description == "Custom init docstring." # --------------------------------------------------------------------------- -# inspect_class_properties +# inspect_class_methods — edge cases (P1-4) # --------------------------------------------------------------------------- -def test_inspect_class_properties_finds_public() -> None: - props = inspect_class_properties(ClassWithProperties, include_private=False) - names = [p.name for p in props] - assert "name" in names - assert "count" in names - assert "_private_prop" not in names +class EmptyClass: + """A class with no public methods (no __init__ either).""" + + + +class ClassWithBadSignature: + """A class where one method has an uninspectable signature.""" + def good_method(self) -> str: + """Works fine.""" + return "ok" -def test_inspect_class_properties_returns_property_info() -> None: - props = inspect_class_properties(ClassWithProperties, include_private=False) - assert all(isinstance(p, PropertyInfo) for p in props) +class ClassWithVarArgs: + """A class with *args and **kwargs.""" 
-def test_inspect_class_properties_return_types() -> None: - props = inspect_class_properties(ClassWithProperties, include_private=False) - name_prop = next(p for p in props if p.name == "name") - count_prop = next(p for p in props if p.name == "count") - assert name_prop.return_type == "str" - assert count_prop.return_type == "int" + def method_with_varargs(self, *args: str, **kwargs: int) -> None: + """A method with varargs.""" -def test_inspect_class_properties_descriptions() -> None: - props = inspect_class_properties(ClassWithProperties, include_private=False) - name_prop = next(p for p in props if p.name == "name") - count_prop = next(p for p in props if p.name == "count") - assert name_prop.description == "Get the name." - assert count_prop.description == "Get the count value." +def test_inspect_class_methods_empty_class() -> None: + methods = inspect_class_methods(EmptyClass, include_private=False) + assert methods == [] -def test_inspect_class_properties_include_private() -> None: - props = inspect_class_properties(ClassWithProperties, include_private=True) - names = [p.name for p in props] - assert "_private_prop" in names +def test_inspect_class_methods_signature_error_skipped() -> None: + import inspect as _inspect + from unittest.mock import patch + original_sig = _inspect.signature + + def bad_signature(method: object) -> _inspect.Signature: + if getattr(method, "__name__", "") == "good_method": + raise ValueError("cannot inspect") + return original_sig(method) + + with patch( + "data_designer.cli.services.introspection.method_inspector.inspect.signature", side_effect=bad_signature + ): + methods = inspect_class_methods(ClassWithBadSignature, include_private=False) + + names = [m.name for m in methods] + assert "good_method" not in names -def test_inspect_class_properties_empty_class() -> None: - props = inspect_class_properties(SampleClass, include_private=False) - assert props == [] + +def test_inspect_class_methods_varargs_and_kwargs() -> None: + 
methods = inspect_class_methods(ClassWithVarArgs, include_private=False) + m = next(m for m in methods if m.name == "method_with_varargs") + assert "*args" in m.signature + assert "**kwargs" in m.signature # --------------------------------------------------------------------------- -# __init__ docstring fallback +# _is_dunder / _is_private (P2-1) # --------------------------------------------------------------------------- -def test_init_default_docstring_falls_back_to_class() -> None: - methods = inspect_class_methods(ClassWithDefaultInitDocstring, include_private=True) - init = next((m for m in methods if m.name == "__init__"), None) - assert init is not None - assert init.description == "A useful class that does important things." +@pytest.mark.parametrize( + ("name", "expected"), + [ + ("__init__", False), + ("__str__", True), + ("__repr__", True), + ("regular", False), + ("_private", False), + ], +) +def test_is_dunder(name: str, expected: bool) -> None: + assert _is_dunder(name) is expected + + +@pytest.mark.parametrize( + ("name", "expected"), + [ + ("_foo", True), + ("_private_method", True), + ("__init__", False), + ("__str__", False), + ("public", False), + ], +) +def test_is_private(name: str, expected: bool) -> None: + assert _is_private(name) is expected -def test_init_custom_docstring_preserved() -> None: - methods = inspect_class_methods(ClassWithCustomInitDocstring, include_private=True) - init = next((m for m in methods if m.name == "__init__"), None) - assert init is not None - assert init.description == "Custom init docstring." 
+# --------------------------------------------------------------------------- +# keyword-only params (P2-9) +# --------------------------------------------------------------------------- + + +class ClassWithKeywordOnly: + """A class with keyword-only parameters.""" + + def method_with_kw(self, *, kw: str = "x") -> None: + """A method with keyword-only arg.""" + + +def test_format_signature_keyword_only() -> None: + methods = inspect_class_methods(ClassWithKeywordOnly, include_private=False) + m = next(m for m in methods if m.name == "method_with_kw") + assert "*, " in m.signature or "*," in m.signature diff --git a/packages/data-designer/tests/cli/services/introspection/test_pydantic_inspector.py b/packages/data-designer/tests/cli/services/introspection/test_pydantic_inspector.py index bd60c0bdc..0889021e9 100644 --- a/packages/data-designer/tests/cli/services/introspection/test_pydantic_inspector.py +++ b/packages/data-designer/tests/cli/services/introspection/test_pydantic_inspector.py @@ -6,9 +6,11 @@ from enum import Enum from typing import Annotated +import pytest from pydantic import BaseModel, Field from data_designer.cli.services.introspection.pydantic_inspector import ( + _default_to_json, _extract_constraints, _extract_enum_class, _extract_nested_basemodel, @@ -450,3 +452,89 @@ def test_format_model_text_depth_limiting_blocks_deeper_nesting() -> None: text = format_model_text(Level1, max_depth=1) assert "schema (Level2):" in text assert "schema (Level3):" not in text + + +# --------------------------------------------------------------------------- +# _default_to_json (P1-6) +# --------------------------------------------------------------------------- + + +class _JsonTestEnum(str, Enum): + MEMBER = "member_value" + + +class _CustomObj: + def __repr__(self) -> str: + return "CustomObj()" + + +@pytest.mark.parametrize( + ("value", "expected"), + [ + (None, None), + (_JsonTestEnum.MEMBER, "member_value"), + (True, True), + (42, 42), + (3.14, 3.14), + 
("hello", "hello"), + ([1, 2], [1, 2]), + ({"a": 1}, {"a": 1}), + ], +) +def test_default_to_json(value: object, expected: object) -> None: + assert _default_to_json(value) == expected + + +def test_default_to_json_custom_object() -> None: + obj = _CustomObj() + assert _default_to_json(obj) == "CustomObj()" + + +# --------------------------------------------------------------------------- +# format_type — regex branches (P1-8) +# --------------------------------------------------------------------------- + + +def test_format_type_none_type() -> None: + result = format_type(type(None)) + assert result == "None" + + +def test_format_type_enum_class() -> None: + result = format_type(ColorEnum) + assert result == "ColorEnum" + + +def test_format_type_module_prefix_stripping() -> None: + import data_designer.config as dd + + result = format_type(list[dd.CategorySamplerParams]) + assert "data_designer.config." not in result + assert "CategorySamplerParams" in result + + +def test_format_type_literal() -> None: + from typing import Literal + + result = format_type(Literal["foo", "bar"]) + assert "Literal[" in result + assert "foo" in result + assert "bar" in result + + +# --------------------------------------------------------------------------- +# format_model_text — empty model (P1-10) +# --------------------------------------------------------------------------- + + +class EmptyModel(BaseModel): + """An empty model with no fields.""" + + +def test_format_model_text_empty_model() -> None: + text = format_model_text(EmptyModel) + assert "EmptyModel:" in text + assert "fields:" in text + lines = text.strip().split("\n") + field_lines = [line for line in lines if line.startswith(" ") and ":" in line and "fields:" not in line] + assert len(field_lines) == 0 From 50fffa6906c5f4a27b619bf3bca6eefb7063cff7 Mon Sep 17 00:00:00 2001 From: Johnny Greco Date: Tue, 17 Feb 2026 21:17:10 -0500 Subject: [PATCH 35/37] refactor: simplify introspection inspectors without changing output 
- method_inspector: prevent redundant "*" insertion in _format_signature instead of inserting then stripping it post-hoc - pydantic_inspector: merge format_model_text/_format_model_text wrapper into a single function with defaulted parameters - pydantic_inspector: replace string-level bracket-matching hack for Annotated[X, Discriminator] with proper type-level unwrapping - pydantic_inspector: condense union handling in _extract_nested_basemodel --- .../introspection/method_inspector.py | 9 +-- .../introspection/pydantic_inspector.py | 60 ++++++------------- .../introspection/test_method_inspector.py | 1 - 3 files changed, 21 insertions(+), 49 deletions(-) diff --git a/packages/data-designer/src/data_designer/cli/services/introspection/method_inspector.py b/packages/data-designer/src/data_designer/cli/services/introspection/method_inspector.py index 456439ae6..1ce2d5c4c 100644 --- a/packages/data-designer/src/data_designer/cli/services/introspection/method_inspector.py +++ b/packages/data-designer/src/data_designer/cli/services/introspection/method_inspector.py @@ -109,12 +109,13 @@ def _format_signature(method_name: str, sig: inspect.Signature) -> str: """Format a method signature as a readable string, skipping 'self'.""" params: list[str] = [] seen_keyword_only = False + has_var_positional = any(p.kind == inspect.Parameter.VAR_POSITIONAL for p in sig.parameters.values()) for param in sig.parameters.values(): if param.name == "self": continue - if param.kind == inspect.Parameter.KEYWORD_ONLY and not seen_keyword_only: + if param.kind == inspect.Parameter.KEYWORD_ONLY and not seen_keyword_only and not has_var_positional: seen_keyword_only = True params.append("*") @@ -135,12 +136,6 @@ def _format_signature(method_name: str, sig: inspect.Signature) -> str: return_type = _format_return_type(sig) params_str = ", ".join(params) - # Remove the extra "*, " if a *args was already present - if any(p.kind == inspect.Parameter.VAR_POSITIONAL for p in sig.parameters.values()): 
- parts = params_str.split(", ") - parts = [p for p in parts if p != "*"] - params_str = ", ".join(parts) - return f"{method_name}({params_str}) -> {return_type}" diff --git a/packages/data-designer/src/data_designer/cli/services/introspection/pydantic_inspector.py b/packages/data-designer/src/data_designer/cli/services/introspection/pydantic_inspector.py index 8c7c61e99..bb1ec7f5f 100644 --- a/packages/data-designer/src/data_designer/cli/services/introspection/pydantic_inspector.py +++ b/packages/data-designer/src/data_designer/cli/services/introspection/pydantic_inspector.py @@ -88,11 +88,7 @@ def _extract_nested_basemodel(annotation: Any) -> type | None: # Union: X | None, list[X] | None, or discriminated unions if origin is typing.Union or origin is types.UnionType: non_none_args = [a for a in get_args(annotation) if a is not type(None)] - basemodel_classes: list[type] = [] - for arg in non_none_args: - result = _extract_nested_basemodel(arg) - if result is not None: - basemodel_classes.append(result) + basemodel_classes = [m for a in non_none_args if (m := _extract_nested_basemodel(a)) is not None] if len(basemodel_classes) == 1: return basemodel_classes[0] return None @@ -100,11 +96,22 @@ def _extract_nested_basemodel(annotation: Any) -> type | None: return None +def _unwrap_annotated_discriminator(annotation: Any) -> Any: + """Strip Annotated wrapper containing a Discriminator.""" + if get_origin(annotation) is not typing.Annotated: + return annotation + args = get_args(annotation) + if len(args) >= 2 and any("Discriminator" in str(a) for a in args[1:]): + return args[0] + return annotation + + def format_type(annotation: Any) -> str: """Format a type annotation for readable display. Strips module prefixes and simplifies complex types. 
""" + annotation = _unwrap_annotated_discriminator(annotation) type_str = str(annotation) # Remove module prefixes @@ -128,19 +135,6 @@ def format_type(annotation: Any) -> str: if match: type_str = f"Literal[{match.group(1)}]" - # Clean up Annotated types with Discriminator (too verbose) - if "Annotated[" in type_str and "Discriminator" in type_str: - start = type_str.index("Annotated[") + len("Annotated[") - depth = 0 - for i, ch in enumerate(type_str[start:], start): - if ch in "([": - depth += 1 - elif ch in ")]": - depth -= 1 - elif ch == "," and depth == 0: - type_str = type_str[start:i].strip() - break - return type_str @@ -262,6 +256,8 @@ def format_model_text( indent: int = 0, seen_schemas: set[str] | None = None, max_depth: int = 3, + seen_types: set[type] | None = None, + depth: int = 0, ) -> str: """Format a Pydantic model as YAML-style text for agent context. @@ -272,30 +268,12 @@ def format_model_text( indent: Base indentation level. seen_schemas: Set of schema refs already rendered (mutated for cross-model dedup). max_depth: Maximum recursion depth for nested models. + seen_types: Set of types already rendered (prevents infinite recursion). + depth: Current recursion depth. 
""" - return _format_model_text( - cls, - type_key=type_key, - type_value=type_value, - indent=indent, - seen_schemas=seen_schemas, - seen_types=set(), - max_depth=max_depth, - depth=0, - ) - - -def _format_model_text( - cls: type, - type_key: str | None, - type_value: str | None, - indent: int, - seen_schemas: set[str] | None, - seen_types: set[type], - max_depth: int, - depth: int, -) -> str: - """Recursive implementation of format_model_text.""" + if seen_types is None: + seen_types = set() + pad = " " * indent lines: list[str] = [] lines.append(f"{pad}{cls.__name__}:") diff --git a/packages/data-designer/tests/cli/services/introspection/test_method_inspector.py b/packages/data-designer/tests/cli/services/introspection/test_method_inspector.py index 539ea2bfd..8d1670208 100644 --- a/packages/data-designer/tests/cli/services/introspection/test_method_inspector.py +++ b/packages/data-designer/tests/cli/services/introspection/test_method_inspector.py @@ -263,7 +263,6 @@ class EmptyClass: """A class with no public methods (no __init__ either).""" - class ClassWithBadSignature: """A class where one method has an uninspectable signature.""" From ad8da0274bf5691f3e1d63d5647cf5aeb3041141 Mon Sep 17 00:00:00 2001 From: Johnny Greco Date: Thu, 19 Feb 2026 16:59:13 -0500 Subject: [PATCH 36/37] refactor: lazy-load inspect CLI commands Register inspect subcommands via the lazy Typer group so inspect follows the same deferred-import pattern as other CLI commands. 
--- .../src/data_designer/cli/main.py | 44 ++++++++++++++++++- 1 file changed, 42 insertions(+), 2 deletions(-) diff --git a/packages/data-designer/src/data_designer/cli/main.py b/packages/data-designer/src/data_designer/cli/main.py index 63b5f0733..935fa77bc 100644 --- a/packages/data-designer/src/data_designer/cli/main.py +++ b/packages/data-designer/src/data_designer/cli/main.py @@ -5,7 +5,6 @@ import typer -from data_designer.cli.commands.agent_helpers import inspect as inspect_cmd from data_designer.cli.lazy_group import create_lazy_typer_group _CMD = "data_designer.cli.commands" @@ -99,13 +98,54 @@ no_args_is_help=True, ) +# Create inspect command group +inspect_app = typer.Typer( + name="inspect", + help="Inspect detailed schemas for configuration objects and the Python API.", + cls=create_lazy_typer_group( + { + "column": { + "module": f"{_CMD}.agent_helpers.inspect", + "attr": "columns_command", + "help": "Show schema for a column config type", + }, + "sampler": { + "module": f"{_CMD}.agent_helpers.inspect", + "attr": "samplers_command", + "help": "Show schema for a sampler params type", + }, + "validator": { + "module": f"{_CMD}.agent_helpers.inspect", + "attr": "validators_command", + "help": "Show schema for a validator params type", + }, + "processor": { + "module": f"{_CMD}.agent_helpers.inspect", + "attr": "processors_command", + "help": "Show schema for a processor config type", + }, + "sampler-constraints": { + "module": f"{_CMD}.agent_helpers.inspect", + "attr": "constraints_command", + "help": "Show constraint schemas for sampler columns", + }, + "config-builder": { + "module": f"{_CMD}.agent_helpers.inspect", + "attr": "config_builder_command", + "help": "Show DataDesignerConfigBuilder method signatures and docstrings", + }, + } + ), + no_args_is_help=True, +) + # Add setup command groups app.add_typer(config_app, name="config", rich_help_panel="Setup Commands") app.add_typer(download_app, name="download", rich_help_panel="Setup Commands") # Add 
agent command groups title_agent_helpers = "Agent-Helper Commands" -app.add_typer(inspect_cmd.inspect_app, name="inspect", rich_help_panel=title_agent_helpers) +app.add_typer(inspect_app, name="inspect", rich_help_panel=title_agent_helpers) def main() -> None: From 83ccb30a37f47aa24053257272e5384754c9065d Mon Sep 17 00:00:00 2001 From: Johnny Greco Date: Thu, 19 Feb 2026 17:02:16 -0500 Subject: [PATCH 37/37] fix: restore agent-helper list CLI commands Re-register the list helper group in the CLI so agent-facing list subcommands are available again alongside inspect. --- .../src/data_designer/cli/main.py | 42 +++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/packages/data-designer/src/data_designer/cli/main.py b/packages/data-designer/src/data_designer/cli/main.py index 935fa77bc..aac5d316a 100644 --- a/packages/data-designer/src/data_designer/cli/main.py +++ b/packages/data-designer/src/data_designer/cli/main.py @@ -98,6 +98,47 @@ no_args_is_help=True, ) +# Create list command group +list_app = typer.Typer( + name="list", + help="List available types, model aliases, and persona datasets.", + cls=create_lazy_typer_group( + { + "model-aliases": { + "module": f"{_CMD}.agent_helpers.list", + "attr": "model_aliases_command", + "help": "List configured model aliases and backing models", + }, + "persona-datasets": { + "module": f"{_CMD}.agent_helpers.list", + "attr": "persona_datasets_command", + "help": "List Nemotron-Persona datasets and install status", + }, + "columns": { + "module": f"{_CMD}.agent_helpers.list", + "attr": "column_types_command", + "help": "List column type names and config classes", + }, + "samplers": { + "module": f"{_CMD}.agent_helpers.list", + "attr": "sampler_types_command", + "help": "List sampler type names and params classes", + }, + "validators": { + "module": f"{_CMD}.agent_helpers.list", + "attr": "validator_types_command", + "help": "List validator type names and params classes", + }, + "processors": { + "module": 
f"{_CMD}.agent_helpers.list", + "attr": "processor_types_command", + "help": "List processor type names and config classes", + }, + } + ), + no_args_is_help=True, +) + # Create inspect command group inspect_app = typer.Typer( name="inspect", @@ -145,6 +186,7 @@ # Add agent command groups title_agent_helpers = "Agent-Helper Commands" +app.add_typer(list_app, name="list", rich_help_panel=title_agent_helpers) app.add_typer(inspect_app, name="inspect", rich_help_panel=title_agent_helpers)