diff --git a/packages/data-designer-config/src/data_designer/config/utils/constants.py b/packages/data-designer-config/src/data_designer/config/utils/constants.py index 3b229923e..51bd278ff 100644 --- a/packages/data-designer-config/src/data_designer/config/utils/constants.py +++ b/packages/data-designer-config/src/data_designer/config/utils/constants.py @@ -372,3 +372,5 @@ class NordColor(Enum): LOCALES_WITH_MANAGED_DATASETS = list[str](NEMOTRON_PERSONAS_DATASET_SIZES.keys()) NEMOTRON_PERSONAS_DATASET_PREFIX = "nemotron-personas-dataset-" + +HUGGINGFACE_HUB_DATASET_URL_PREFIX = "https://huggingface.co/datasets/" diff --git a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/artifact_storage.py b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/artifact_storage.py index 905b03503..35e7d4f8a 100644 --- a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/artifact_storage.py +++ b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/artifact_storage.py @@ -24,7 +24,10 @@ logger = logging.getLogger(__name__) BATCH_FILE_NAME_FORMAT = "batch_{batch_number:05d}.parquet" -SDG_CONFIG_FILENAME = "sdg.json" +SDG_CONFIG_FILENAME = "builder_config.json" +METADATA_FILENAME = "metadata.json" +FINAL_DATASET_FOLDER_NAME = "parquet-files" +PROCESSORS_OUTPUTS_FOLDER_NAME = "processors-files" class BatchStage(StrEnum): @@ -37,10 +40,10 @@ class BatchStage(StrEnum): class ArtifactStorage(BaseModel): artifact_path: Path | str dataset_name: str = "dataset" - final_dataset_folder_name: str = "parquet-files" + final_dataset_folder_name: str = FINAL_DATASET_FOLDER_NAME partial_results_folder_name: str = "tmp-partial-parquet-files" dropped_columns_folder_name: str = "dropped-columns-parquet-files" - processors_outputs_folder_name: str = "processors-files" + processors_outputs_folder_name: str = PROCESSORS_OUTPUTS_FOLDER_NAME @property def artifact_path_exists(self) -> bool: @@ -72,7 +75,7 @@ def 
final_dataset_path(self) -> Path: @property def metadata_file_path(self) -> Path: - return self.base_dataset_path / "metadata.json" + return self.base_dataset_path / METADATA_FILENAME @property def partial_results_path(self) -> Path: @@ -259,7 +262,7 @@ def write_metadata(self, metadata: dict) -> Path: """ self.mkdir_if_needed(self.base_dataset_path) with open(self.metadata_file_path, "w") as file: - json.dump(metadata, file, indent=4, sort_keys=True) + json.dump(metadata, file, indent=2, sort_keys=True) return self.metadata_file_path def update_metadata(self, updates: dict) -> Path: diff --git a/packages/data-designer/src/data_designer/integrations/huggingface/__init__.py b/packages/data-designer/src/data_designer/integrations/huggingface/__init__.py new file mode 100644 index 000000000..9db421562 --- /dev/null +++ b/packages/data-designer/src/data_designer/integrations/huggingface/__init__.py @@ -0,0 +1,7 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +from data_designer.integrations.huggingface.client import HuggingFaceHubClient, HuggingFaceHubClientUploadError +from data_designer.integrations.huggingface.dataset_card import DataDesignerDatasetCard + +__all__ = ["HuggingFaceHubClient", "HuggingFaceHubClientUploadError", "DataDesignerDatasetCard"] diff --git a/packages/data-designer/src/data_designer/integrations/huggingface/client.py b/packages/data-designer/src/data_designer/integrations/huggingface/client.py new file mode 100644 index 000000000..0812b8dee --- /dev/null +++ b/packages/data-designer/src/data_designer/integrations/huggingface/client.py @@ -0,0 +1,431 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +import json +import logging +import tempfile +from pathlib import Path + +from huggingface_hub import HfApi +from huggingface_hub.errors import HFValidationError +from huggingface_hub.utils import HfHubHTTPError, validate_repo_id + +from data_designer.config.utils.constants import HUGGINGFACE_HUB_DATASET_URL_PREFIX +from data_designer.engine.dataset_builders.artifact_storage import ( + FINAL_DATASET_FOLDER_NAME, + METADATA_FILENAME, + PROCESSORS_OUTPUTS_FOLDER_NAME, + SDG_CONFIG_FILENAME, +) +from data_designer.errors import DataDesignerError +from data_designer.integrations.huggingface.dataset_card import DataDesignerDatasetCard +from data_designer.logging import RandomEmoji + +logger = logging.getLogger(__name__) + + +class HuggingFaceHubClientUploadError(DataDesignerError): + """Error during Hugging Face dataset upload.""" + + +class HuggingFaceHubClient: + """Client for interacting with Hugging Face Hub to upload datasets.""" + + def __init__(self, token: str | None = None): + """Initialize Hugging Face Hub client. + + Args: + token: Hugging Face API token. If None, the token is automatically + resolved from HF_TOKEN environment variable or cached credentials + from `hf auth login`. + """ + self._token = token + self._api = HfApi(token=token) + + @property + def has_token(self) -> bool: + """Check if a token was explicitly provided. + + Returns: + True if a token was provided during initialization, False otherwise. + """ + return self._token is not None + + def upload_dataset( + self, + repo_id: str, + base_dataset_path: Path, + description: str, + *, + private: bool = False, + tags: list[str] | None = None, + ) -> str: + """Upload dataset to Hugging Face Hub. 
+ + Uploads the complete dataset including: + - Main parquet batch files from parquet-files/ → data/ + - Processor output batch files from processors-files/{name}/ → {name}/ + - Existing builder_config.json and metadata.json files + - Auto-generated README.md (dataset card) + + Args: + repo_id: Hugging Face dataset repo ID (e.g., "username/dataset-name") + base_dataset_path: Path to base_dataset_path (contains parquet-files/, builder_config.json, etc.) + description: Custom description text for dataset card + private: Whether to create private repo + tags: Additional custom tags for the dataset + + Returns: + URL to the uploaded dataset + + Raises: + HuggingFaceUploadError: If validation fails or upload encounters errors + """ + logger.info(f"🤗 Uploading dataset to Hugging Face Hub: {repo_id}") + + self._validate_repo_id(repo_id=repo_id) + self._validate_dataset_path(base_dataset_path=base_dataset_path) + self._create_or_get_repo(repo_id=repo_id, private=private) + + logger.info(f" |-- {RandomEmoji.data()} Uploading dataset card...") + try: + self._upload_dataset_card( + repo_id=repo_id, + metadata_path=base_dataset_path / METADATA_FILENAME, + builder_config_path=base_dataset_path / SDG_CONFIG_FILENAME, + description=description, + tags=tags, + ) + except Exception as e: + raise HuggingFaceHubClientUploadError(f"Failed to upload dataset card: {e}") from e + + self._upload_main_dataset_files(repo_id=repo_id, parquet_folder=base_dataset_path / FINAL_DATASET_FOLDER_NAME) + self._upload_processor_files( + repo_id=repo_id, processors_folder=base_dataset_path / PROCESSORS_OUTPUTS_FOLDER_NAME + ) + self._upload_config_files( + repo_id=repo_id, + metadata_path=base_dataset_path / METADATA_FILENAME, + builder_config_path=base_dataset_path / SDG_CONFIG_FILENAME, + ) + + url = f"{HUGGINGFACE_HUB_DATASET_URL_PREFIX}{repo_id}" + logger.info(f" |-- {RandomEmoji.success()} Dataset uploaded successfully! 
View at: {url}") + return url + + def _create_or_get_repo(self, repo_id: str, *, private: bool = False) -> None: + """Create or get existing repository on Hugging Face Hub. + + Args: + repo_id: Hugging Face dataset repo ID + private: Whether to create private repo + + Raises: + HuggingFaceUploadError: If repository creation fails + """ + logger.info(f" |-- {RandomEmoji.working()} Checking if repository exists...") + try: + repo_exists = self._api.repo_exists(repo_id=repo_id, repo_type="dataset") + if repo_exists: + logger.info(f" |-- {RandomEmoji.success()} Repository already exists, updating content...") + else: + logger.info(f" |-- {RandomEmoji.working()} Creating new repository...") + + self._api.create_repo( + repo_id=repo_id, + repo_type="dataset", + exist_ok=True, + private=private, + ) + except HfHubHTTPError as e: + if e.response.status_code == 401: + raise HuggingFaceHubClientUploadError( + "Authentication failed. Please provide a valid Hugging Face token. " + "You can set it via the token parameter or HF_TOKEN environment variable, " + "or run 'hf auth login'." + ) from e + elif e.response.status_code == 403: + raise HuggingFaceHubClientUploadError( + f"Permission denied. You don't have access to create repository '{repo_id}'. " + "Check your token permissions or repository ownership." + ) from e + else: + raise HuggingFaceHubClientUploadError(f"Failed to create repository '{repo_id}': {e}") from e + except Exception as e: + raise HuggingFaceHubClientUploadError(f"Unexpected error creating repository '{repo_id}': {e}") from e + + def _upload_main_dataset_files(self, repo_id: str, parquet_folder: Path) -> None: + """Upload main parquet dataset files. 
+ + Args: + repo_id: Hugging Face dataset repo ID + parquet_folder: Path to folder containing parquet files + + Raises: + HuggingFaceUploadError: If upload fails + """ + logger.info(f" |-- {RandomEmoji.loading()} Uploading main dataset files...") + try: + self._api.upload_folder( + repo_id=repo_id, + folder_path=str(parquet_folder), + path_in_repo="data", + repo_type="dataset", + commit_message="Upload main dataset files", + ) + except Exception as e: + raise HuggingFaceHubClientUploadError(f"Failed to upload parquet files: {e}") from e + + def _upload_processor_files(self, repo_id: str, processors_folder: Path) -> None: + """Upload processor output files. + + Args: + repo_id: Hugging Face dataset repo ID + processors_folder: Path to folder containing processor output directories + + Raises: + HuggingFaceUploadError: If upload fails + """ + if not processors_folder.exists(): + return + + processor_dirs = [d for d in processors_folder.iterdir() if d.is_dir()] + if not processor_dirs: + return + + logger.info(f" |-- {RandomEmoji.loading()} Uploading processor outputs ({len(processor_dirs)} processors)...") + for processor_dir in processor_dirs: + try: + self._api.upload_folder( + repo_id=repo_id, + folder_path=str(processor_dir), + path_in_repo=processor_dir.name, + repo_type="dataset", + commit_message=f"Upload {processor_dir.name} processor outputs", + ) + except Exception as e: + raise HuggingFaceHubClientUploadError( + f"Failed to upload processor outputs for '{processor_dir.name}': {e}" + ) from e + + def _upload_config_files(self, repo_id: str, metadata_path: Path, builder_config_path: Path) -> None: + """Upload configuration files (builder_config.json and metadata.json). 
+ + Args: + repo_id: Hugging Face dataset repo ID + metadata_path: Path to metadata.json file + builder_config_path: Path to builder_config.json file + + Raises: + HuggingFaceUploadError: If upload fails + """ + logger.info(f" |-- {RandomEmoji.loading()} Uploading configuration files...") + + if builder_config_path.exists(): + try: + self._api.upload_file( + repo_id=repo_id, + path_or_fileobj=str(builder_config_path), + path_in_repo=SDG_CONFIG_FILENAME, + repo_type="dataset", + commit_message="Upload builder_config.json", + ) + except Exception as e: + raise HuggingFaceHubClientUploadError(f"Failed to upload builder_config.json: {e}") from e + + if metadata_path.exists(): + tmp_path = None + try: + updated_metadata = self._update_metadata_paths(metadata_path) + with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as tmp_file: + json.dump(updated_metadata, tmp_file, indent=2) + tmp_path = tmp_file.name + + self._api.upload_file( + repo_id=repo_id, + path_or_fileobj=tmp_path, + path_in_repo=METADATA_FILENAME, + repo_type="dataset", + commit_message=f"Upload {METADATA_FILENAME}", + ) + except Exception as e: + raise HuggingFaceHubClientUploadError(f"Failed to upload {METADATA_FILENAME}: {e}") from e + finally: + if tmp_path and Path(tmp_path).exists(): + Path(tmp_path).unlink() + + def _upload_dataset_card( + self, + repo_id: str, + metadata_path: Path, + builder_config_path: Path, + description: str, + tags: list[str] | None = None, + ) -> None: + """Generate and upload dataset card from metadata.json. 
+ + Args: + repo_id: Hugging Face dataset repo ID + metadata_path: Path to metadata.json file + builder_config_path: Path to builder_config.json file + description: Custom description text for dataset card + tags: Additional custom tags for the dataset + + Raises: + HuggingFaceUploadError: If dataset card generation or upload fails + """ + try: + with open(metadata_path) as f: + metadata = json.load(f) + except json.JSONDecodeError as e: + raise HuggingFaceHubClientUploadError(f"Failed to parse {METADATA_FILENAME}: {e}") from e + except Exception as e: + raise HuggingFaceHubClientUploadError(f"Failed to read {METADATA_FILENAME}: {e}") from e + + builder_config = None + if builder_config_path.exists(): + try: + with open(builder_config_path) as f: + builder_config = json.load(f) + except json.JSONDecodeError as e: + raise HuggingFaceHubClientUploadError(f"Failed to parse builder_config.json: {e}") from e + except Exception as e: + raise HuggingFaceHubClientUploadError(f"Failed to read builder_config.json: {e}") from e + + try: + card = DataDesignerDatasetCard.from_metadata( + metadata=metadata, + builder_config=builder_config, + repo_id=repo_id, + description=description, + tags=tags, + ) + except Exception as e: + raise HuggingFaceHubClientUploadError(f"Failed to generate dataset card: {e}") from e + + try: + card.push_to_hub(repo_id, repo_type="dataset") + except Exception as e: + raise HuggingFaceHubClientUploadError(f"Failed to push dataset card to hub: {e}") from e + + @staticmethod + def _validate_repo_id(repo_id: str) -> None: + """Validate Hugging Face dataset repository ID format. + + Args: + repo_id: Repository ID to validate + + Raises: + HuggingFaceHubClientUploadError: If repo_id format is invalid + """ + # Check if repo_id is empty + if not repo_id or not repo_id.strip(): + raise HuggingFaceHubClientUploadError("repo_id must be a non-empty string") + + # Check for exactly one slash (username/dataset-name format). 
This is not enforced by huggingface_hub's validator. + if repo_id.count("/") != 1: + raise HuggingFaceHubClientUploadError( + f"Invalid repo_id format: '{repo_id}'. Expected format: 'username/dataset-name'" + ) + + # Use huggingface_hub's validator for additional checks (characters, length, etc.) + try: + validate_repo_id(repo_id) + except HFValidationError as e: + raise HuggingFaceHubClientUploadError(f"Invalid repo_id format: '{repo_id}': {e}") from e + + @staticmethod + def _update_metadata_paths(metadata_path: Path) -> dict: + """Update file paths in metadata.json to match Hugging Face dataset repository structure. + + Local paths: + - parquet-files/batch_00000.parquet → data/batch_00000.parquet + - processors-files/processor1/batch_00000.parquet → processor1/batch_00000.parquet + + Args: + metadata_path: Path to metadata.json file + + Returns: + Updated metadata dictionary with corrected paths + """ + with open(metadata_path) as f: + metadata = json.load(f) + + if "file_paths" in metadata: + updated_file_paths = {} + + # Update parquet files path: parquet-files/ → data/ + if FINAL_DATASET_FOLDER_NAME in metadata["file_paths"]: + updated_file_paths["data"] = [ + path.replace(f"{FINAL_DATASET_FOLDER_NAME}/", "data/") + for path in metadata["file_paths"][FINAL_DATASET_FOLDER_NAME] + ] + + # Update processor files paths: processors-files/{name}/ → {name}/ + if "processor-files" in metadata["file_paths"]: + updated_file_paths["processor-files"] = {} + for processor_name, paths in metadata["file_paths"]["processor-files"].items(): + updated_file_paths["processor-files"][processor_name] = [ + path.replace(f"{PROCESSORS_OUTPUTS_FOLDER_NAME}/{processor_name}/", f"{processor_name}/") + for path in paths + ] + + metadata["file_paths"] = updated_file_paths + + return metadata + + @staticmethod + def _validate_dataset_path(base_dataset_path: Path) -> None: + """Validate dataset directory structure. 
+ + Args: + base_dataset_path: Path to dataset directory + + Raises: + HuggingFaceUploadError: If directory structure is invalid + """ + if not base_dataset_path.exists(): + raise HuggingFaceHubClientUploadError(f"Dataset path does not exist: {base_dataset_path}") + + if not base_dataset_path.is_dir(): + raise HuggingFaceHubClientUploadError(f"Dataset path is not a directory: {base_dataset_path}") + + metadata_path = base_dataset_path / METADATA_FILENAME + if not metadata_path.exists(): + raise HuggingFaceHubClientUploadError(f"Required file not found: {metadata_path}") + + if not metadata_path.is_file(): + raise HuggingFaceHubClientUploadError(f"{METADATA_FILENAME} is not a file: {metadata_path}") + + parquet_dir = base_dataset_path / FINAL_DATASET_FOLDER_NAME + if not parquet_dir.exists(): + raise HuggingFaceHubClientUploadError( + f"Required directory not found: {parquet_dir}. " + "Dataset must contain parquet-files directory with batch files." + ) + + if not parquet_dir.is_dir(): + raise HuggingFaceHubClientUploadError(f"parquet-files is not a directory: {parquet_dir}") + + if not any(parquet_dir.glob("*.parquet")): + raise HuggingFaceHubClientUploadError( + f"parquet-files directory is empty: {parquet_dir}. At least one .parquet file is required." 
+ ) + + try: + with open(metadata_path) as f: + json.load(f) + except json.JSONDecodeError as e: + raise HuggingFaceHubClientUploadError(f"Invalid JSON in {METADATA_FILENAME}: {e}") + + builder_config_path = base_dataset_path / SDG_CONFIG_FILENAME + if builder_config_path.exists(): + if not builder_config_path.is_file(): + raise HuggingFaceHubClientUploadError(f"{SDG_CONFIG_FILENAME} is not a file: {builder_config_path}") + try: + with open(builder_config_path) as f: + json.load(f) + except json.JSONDecodeError as e: + raise HuggingFaceHubClientUploadError(f"Invalid JSON in {SDG_CONFIG_FILENAME}: {e}") diff --git a/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card.py b/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card.py new file mode 100644 index 000000000..3c57f7431 --- /dev/null +++ b/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card.py @@ -0,0 +1,130 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +from datetime import datetime +from pathlib import Path + +from huggingface_hub import CardData, DatasetCard + +TEMPLATE_DATA_DESIGNER_DATASET_CARD_PATH = Path(__file__).parent / "dataset_card_template.md" +DEFAULT_DATASET_CARD_TAGS = ["synthetic", "datadesigner"] + + +class DataDesignerDatasetCard(DatasetCard): + """Dataset card for NeMo Data Designer datasets. + + This class extends Hugging Face's DatasetCard with a custom template + specifically designed for Data Designer generated datasets. + The template is located at `data_designer/integrations/huggingface/dataset_card_template.md`. 
+ """ + + default_template_path = TEMPLATE_DATA_DESIGNER_DATASET_CARD_PATH + + @classmethod + def from_metadata( + cls, + metadata: dict, + builder_config: dict | None, + repo_id: str, + description: str, + tags: list[str] | None = None, + ) -> DataDesignerDatasetCard: + """Create dataset card from metadata.json and builder_config.json. + + Args: + metadata: Contents of metadata.json + builder_config: Contents of builder_config.json (optional) + repo_id: Hugging Face dataset repo ID + description: Custom description text + tags: Additional custom tags for the dataset. + + Returns: + DataDesignerDatasetCard instance ready to upload + """ + # Extract info from metadata + target_num_records = metadata.get("target_num_records", 0) + schema = metadata.get("schema", {}) + column_stats = metadata.get("column_statistics", []) + + # Get actual num_records from column_statistics if available + if column_stats: + actual_num_records = column_stats[0].get("num_records", target_num_records) + else: + actual_num_records = target_num_records + + # Compute size category + size_categories = cls._compute_size_category(actual_num_records) + + # Extract column types from builder_config.json if available + config_types: dict[str, int] = {} + num_columns_configured = 0 + if builder_config: + columns = builder_config.get("data_designer", {}).get("columns", []) + num_columns_configured = len(columns) + for col in columns: + col_type = col.get("column_type", "unknown") + if isinstance(col_type, dict): + col_type = col_type.get("value", "unknown") + config_types[col_type] = config_types.get(col_type, 0) + 1 + + # Extract processor names from file_paths + processor_names = [] + if "file_paths" in metadata and "processor-files" in metadata["file_paths"]: + processor_names = list(metadata["file_paths"]["processor-files"].keys()) + + # Prepare tags: default tags + custom tags + all_tags = DEFAULT_DATASET_CARD_TAGS + (tags or []) + + # Prepare CardData (metadata for YAML frontmatter) + card_data 
= CardData( + size_categories=size_categories, + tags=all_tags, + ) + + # Prepare template variables + template_vars = { + "repo_id": repo_id, + "num_records": actual_num_records, + "target_num_records": target_num_records, + "num_columns": len(schema), + "size_categories": size_categories, + "all_columns": schema, + "column_statistics": column_stats, + "num_columns_configured": num_columns_configured, + "config_types": config_types, + "percent_complete": 100 * actual_num_records / target_num_records if target_num_records > 0 else 0, + "current_year": datetime.now().year, + "has_processors": len(processor_names) > 0, + "processor_names": processor_names, + "tags": all_tags, + "custom_description": description, + } + + # Create card from template + card = cls.from_template(card_data, template_path=str(cls.default_template_path), **template_vars) + return card + + @staticmethod + def _compute_size_category(num_records: int) -> str: + """Compute Hugging Face dataset size category from record count. + + Args: + num_records: Number of records in the dataset + + Returns: + Size category string for Hugging Face dataset repository tags + """ + if num_records < 1000: + return "n<1K" + elif num_records < 10000: + return "1K<n<10K" + elif num_records < 100000: + return "10K<n<100K" + elif num_records < 1000000: + return "100K<n<1M" + elif num_records < 10000000: + return "1M<n<10M" + else: + return "n>10M" diff --git a/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card_template.md b/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card_template.md new file mode 100644 index 000000000..f01ce33d2 --- /dev/null +++ b/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card_template.md @@ -0,0 +1,111 @@ +--- +size_categories: {{ size_categories }} +tags: +{% for tag in tags %} + - {{ tag }} +{% endfor %} +configs: +- config_name: data + data_files: "data/*.parquet" + default: true +{% if has_processors %}{% for processor_name in processor_names %}- config_name: {{ processor_name }} + data_files: "{{ processor_name }}/*.parquet" +{% endfor %}{% endif %} +--- + +
+<div align="center">
+
+<h1>{{ repo_id.split('/')[-1] | title }}</h1>
+
+<p>Made with ❤️ using 🎨 NeMo Data Designer</p>
+</div>
+ +--- + +{{ custom_description }} + +--- + +## 🚀 Quick Start + +```python +from datasets import load_dataset + +# Load the main dataset +dataset = load_dataset("{{ repo_id }}", "data", split="train") +df = dataset.to_pandas() +{% if has_processors %} + +# Load processor outputs (if available){% for processor_name in processor_names %} +processor_{{ processor_name }} = load_dataset("{{ repo_id }}", "{{ processor_name }}", split="train") +df_{{ processor_name }} = processor_{{ processor_name }}.to_pandas() +{% endfor %}{% endif %} +``` + +--- + +## 📊 Dataset Summary + +- **📈 Records**: {{ "{:,}".format(num_records) }} +- **📋 Columns**: {{ num_columns }} +{% if target_num_records != num_records %} +- **✅ Completion**: {{ "%.1f" | format(percent_complete) }}% ({{ "{:,}".format(target_num_records) }} requested) +{% endif %} + +--- + +## 📋 Schema & Statistics + +{% if column_statistics %} +| Column | Type | Column Type | Unique (%) | Null (%) | Details | +|--------|------|-------------|------------|----------|---------| +{% for stat in column_statistics -%} +| `{{ stat.column_name }}` | `{{ stat.simple_dtype }}` | {{ stat.column_type }} | {{ stat.num_unique }} ({{ "%.1f" | format((stat.num_unique / stat.num_records * 100) if stat.num_records > 0 else 0) }}%) | {{ stat.num_null if stat.num_null > 0 else 0 }} ({{ "%.1f" | format((stat.num_null / stat.num_records * 100) if stat.num_records > 0 else 0) }}%) | {% if stat.column_type in ["llm-text", "llm-code", "llm-structured", "llm-judge"] %}Tokens: {{ "%.0f" | format(stat.output_tokens_mean) if stat.output_tokens_mean is defined else "N/A" }} out / {{ "%.0f" | format(stat.input_tokens_mean) if stat.input_tokens_mean is defined else "N/A" }} in{% elif stat.column_type == "sampler" and stat.sampler_type is defined %}{% if stat.sampler_type is mapping %}{{ stat.sampler_type.value }}{% else %}{{ stat.sampler_type }}{% endif %}{% else %}-{% endif %} | +{% endfor -%} +{% else %} +| Column | Type | +|--------|------| +{% for 
col_name, dtype in all_columns.items() | sort -%} +| `{{ col_name }}` | {{ dtype }} | +{% endfor %} +{% endif %} + +--- + +## ⚙️ Generation Details + +{% if config_types %} +Generated with {{ num_columns_configured }} column configuration(s): + +{% for col_type, count in config_types.items() | sort %} +- **{{ col_type }}**: {{ count }} column(s) +{% endfor %} + +{% endif %} +📄 Full configuration available in [`builder_config.json`](builder_config.json) and detailed metadata in [`metadata.json`](metadata.json). + +--- + +## 📚 Citation + +```bibtex +@misc{nemo-data-designer, + author = {The NeMo Data Designer Team, NVIDIA}, + title = {NeMo Data Designer: A framework for generating synthetic data from scratch or based on your own seed data}, + howpublished = {\url{https://github.com/NVIDIA-NeMo/DataDesigner}}, + year = {{ current_year }}, + note = {GitHub Repository}, +} +``` + +--- + +## 💡 About NeMo Data Designer + +NeMo Data Designer is a general framework for generating high-quality synthetic data that goes beyond simple LLM prompting. 
It provides: + +- **Diverse data generation** using statistical samplers, LLMs, or existing seed datasets +- **Relationship control** between fields with dependency-aware generation +- **Quality validation** with built-in Python, SQL, and custom local and remote validators +- **LLM-as-a-judge** scoring for quality assessment +- **Fast iteration** with preview mode before full-scale generation + +For more information, visit: [https://github.com/NVIDIA-NeMo/DataDesigner](https://github.com/NVIDIA-NeMo/DataDesigner) (`pip install data-designer`) diff --git a/packages/data-designer/src/data_designer/interface/results.py b/packages/data-designer/src/data_designer/interface/results.py index b9467c581..f86acced3 100644 --- a/packages/data-designer/src/data_designer/interface/results.py +++ b/packages/data-designer/src/data_designer/interface/results.py @@ -12,6 +12,7 @@ from data_designer.config.utils.visualization import WithRecordSamplerMixin from data_designer.engine.dataset_builders.artifact_storage import ArtifactStorage from data_designer.engine.dataset_builders.errors import ArtifactStorageError +from data_designer.integrations.huggingface.client import HuggingFaceHubClient from data_designer.lazy_heavy_imports import pd if TYPE_CHECKING: @@ -96,3 +97,49 @@ def get_path_to_processor_artifacts(self, processor_name: str) -> Path: if not self.artifact_storage.processors_outputs_path.exists(): raise ArtifactStorageError(f"Processor {processor_name} has no artifacts.") return self.artifact_storage.processors_outputs_path / processor_name + + def push_to_hub( + self, + repo_id: str, + description: str, + *, + token: str | None = None, + private: bool = False, + tags: list[str] | None = None, + ) -> str: + """Push dataset to HuggingFace Hub. 
+ + Uploads all artifacts including: + - Main parquet batch files (data subset) + - Processor output batch files ({processor_name} subsets) + - Configuration (builder_config.json) + - Metadata (metadata.json) + - Auto-generated dataset card (README.md) + + Args: + repo_id: HuggingFace repo ID (e.g., "username/my-dataset") + description: Custom description text for the dataset card. + Appears after the title. + token: HuggingFace API token. If None, the token is automatically + resolved from HF_TOKEN environment variable or cached credentials + from `hf auth login`. + private: Create private repo + tags: Additional custom tags for the dataset. + + Returns: + URL to the uploaded dataset + + Example: + >>> results = data_designer.create(config, num_records=1000) + >>> description = "This dataset contains synthetic conversations for training chatbots." + >>> results.push_to_hub("username/my-synthetic-dataset", description, tags=["chatbot", "conversation"]) + 'https://huggingface.co/datasets/username/my-synthetic-dataset' + """ + client = HuggingFaceHubClient(token=token) + return client.upload_dataset( + repo_id=repo_id, + base_dataset_path=self.artifact_storage.base_dataset_path, + private=private, + description=description, + tags=tags, + ) diff --git a/packages/data-designer/tests/integrations/huggingface/__init__.py b/packages/data-designer/tests/integrations/huggingface/__init__.py new file mode 100644 index 000000000..52a7a9daf --- /dev/null +++ b/packages/data-designer/tests/integrations/huggingface/__init__.py @@ -0,0 +1,2 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 diff --git a/packages/data-designer/tests/integrations/huggingface/test_client.py b/packages/data-designer/tests/integrations/huggingface/test_client.py new file mode 100644 index 000000000..735ea3bcd --- /dev/null +++ b/packages/data-designer/tests/integrations/huggingface/test_client.py @@ -0,0 +1,559 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +import json +from pathlib import Path +from unittest.mock import MagicMock, patch + +import pytest +from huggingface_hub.utils import HfHubHTTPError + +from data_designer.integrations.huggingface.client import HuggingFaceHubClient, HuggingFaceHubClientUploadError + + +@pytest.fixture +def mock_hf_api() -> MagicMock: + """Mock HfApi for testing.""" + with patch("data_designer.integrations.huggingface.client.HfApi") as mock: + api_instance = MagicMock() + mock.return_value = api_instance + yield api_instance + + +@pytest.fixture +def mock_dataset_card() -> MagicMock: + """Mock DataDesignerDatasetCard for testing.""" + with patch("data_designer.integrations.huggingface.client.DataDesignerDatasetCard") as mock: + card_instance = MagicMock() + mock.from_metadata.return_value = card_instance + yield mock + + +@pytest.fixture +def sample_dataset_path(tmp_path: Path) -> Path: + """Create a sample dataset directory structure. 
+ + Structure mirrors actual DataDesigner output: + - parquet-files/: Main dataset batch files + - processors-files/{processor_name}/: Processor output batch files (same structure) + - metadata.json: Dataset metadata + - builder_config.json: Configuration + """ + base_path = tmp_path / "dataset" + base_path.mkdir() + + # Create parquet-files directory with batch files + parquet_dir = base_path / "parquet-files" + parquet_dir.mkdir() + (parquet_dir / "batch_00000.parquet").write_text("dummy parquet data") + (parquet_dir / "batch_00001.parquet").write_text("dummy parquet data") + + # Create processors-files directory with same structure as main parquet-files + processors_dir = base_path / "processors-files" + processors_dir.mkdir() + processor1_dir = processors_dir / "processor1" + processor1_dir.mkdir() + (processor1_dir / "batch_00000.parquet").write_text("dummy processor output") + (processor1_dir / "batch_00001.parquet").write_text("dummy processor output") + + processor2_dir = processors_dir / "processor2" + processor2_dir.mkdir() + (processor2_dir / "batch_00000.parquet").write_text("dummy processor output") + + # Create metadata.json with matching column statistics + metadata = { + "target_num_records": 100, + "total_num_batches": 2, + "buffer_size": 50, + "schema": {"col1": "string"}, + "file_paths": { + "parquet-files": ["parquet-files/batch_00000.parquet", "parquet-files/batch_00001.parquet"], + "processor-files": { + "processor1": ["processors-files/processor1/batch_00000.parquet"], + "processor2": ["processors-files/processor2/batch_00000.parquet"], + }, + }, + "num_completed_batches": 2, + "dataset_name": "dataset", + "column_statistics": [ + { + "column_name": "col1", + "num_records": 100, + "num_unique": 100, + "num_null": 0, + "simple_dtype": "string", + "pyarrow_dtype": "string", + "column_type": "sampler", + "sampler_type": "uuid", + } + ], + } + (base_path / "metadata.json").write_text(json.dumps(metadata)) + + # Create builder_config.json with 
realistic BuilderConfig structure + builder_config = { + "data_designer": { + "columns": [ + { + "name": "col1", + "column_type": "sampler", + "sampler_type": "uuid", + "params": {}, + } + ], + "model_configs": [], + "constraints": None, + "seed_config": None, + "profilers": None, + } + } + (base_path / "builder_config.json").write_text(json.dumps(builder_config)) + + return base_path + + +def test_client_initialization() -> None: + """Test HuggingFaceHubClient initialization.""" + with patch("data_designer.integrations.huggingface.client.HfApi"): + client = HuggingFaceHubClient(token="test-token") + assert client.has_token is True + + +def test_client_initialization_no_token() -> None: + """Test HuggingFaceHubClient initialization without token.""" + with patch("data_designer.integrations.huggingface.client.HfApi"): + client = HuggingFaceHubClient() + assert client.has_token is False + + +def test_upload_dataset_creates_repo( + mock_hf_api: MagicMock, mock_dataset_card: MagicMock, sample_dataset_path: Path +) -> None: + """Test that upload_dataset creates a repository.""" + client = HuggingFaceHubClient(token="test-token") + + client.upload_dataset( + repo_id="test/dataset", + base_dataset_path=sample_dataset_path, + description="Test dataset", + ) + + # Verify repo creation was called + mock_hf_api.create_repo.assert_called_once() + assert mock_hf_api.create_repo.call_args.kwargs["repo_id"] == "test/dataset" + + +def test_upload_dataset_uploads_parquet_files( + mock_hf_api: MagicMock, mock_dataset_card: MagicMock, sample_dataset_path: Path +) -> None: + """Test that upload_dataset uploads parquet files.""" + client = HuggingFaceHubClient(token="test-token") + + client.upload_dataset( + repo_id="test/dataset", + base_dataset_path=sample_dataset_path, + description="Test dataset", + ) + + # Check that upload_folder was called for parquet files + calls = [call for call in mock_hf_api.upload_folder.call_args_list if call.kwargs["path_in_repo"] == "data"] + assert 
len(calls) >= 1 + + +def test_upload_dataset_uploads_processor_outputs( + mock_hf_api: MagicMock, mock_dataset_card: MagicMock, sample_dataset_path: Path +) -> None: + """Test that upload_dataset uploads processor outputs.""" + client = HuggingFaceHubClient(token="test-token") + + client.upload_dataset( + repo_id="test/dataset", + base_dataset_path=sample_dataset_path, + description="Test dataset", + ) + + # Check that upload_folder was called for processor outputs + calls = [call for call in mock_hf_api.upload_folder.call_args_list if "processor1" in call.kwargs["path_in_repo"]] + assert len(calls) >= 1 + + +def test_upload_dataset_uploads_config_files( + mock_hf_api: MagicMock, mock_dataset_card: MagicMock, sample_dataset_path: Path +) -> None: + """Test that upload_dataset uploads builder_config.json and metadata.json.""" + client = HuggingFaceHubClient(token="test-token") + + client.upload_dataset( + repo_id="test/dataset", + base_dataset_path=sample_dataset_path, + description="Test dataset", + ) + + # Check that upload_file was called for config files + upload_file_calls = mock_hf_api.upload_file.call_args_list + uploaded_files = [call.kwargs["path_in_repo"] for call in upload_file_calls] + assert "builder_config.json" in uploaded_files + assert "metadata.json" in uploaded_files + + +def test_upload_dataset_returns_url( + mock_hf_api: MagicMock, mock_dataset_card: MagicMock, sample_dataset_path: Path +) -> None: + """Test that upload_dataset returns the correct URL.""" + client = HuggingFaceHubClient(token="test-token") + + url = client.upload_dataset( + repo_id="test/dataset", + base_dataset_path=sample_dataset_path, + description="Test dataset", + ) + + assert url == "https://huggingface.co/datasets/test/dataset" + + +def test_upload_dataset_with_private_repo( + mock_hf_api: MagicMock, mock_dataset_card: MagicMock, sample_dataset_path: Path +) -> None: + """Test upload_dataset with private repository.""" + client = HuggingFaceHubClient(token="test-token") + 
+ client.upload_dataset( + repo_id="test/dataset", + base_dataset_path=sample_dataset_path, + description="Test dataset", + private=True, + ) + + mock_hf_api.create_repo.assert_called_once_with( + repo_id="test/dataset", + repo_type="dataset", + exist_ok=True, + private=True, + ) + + +def test_upload_dataset_card_missing_metadata(tmp_path: Path) -> None: + """Test upload fails when metadata.json is missing.""" + client = HuggingFaceHubClient(token="test-token") + + # Create directory without metadata.json + base_path = tmp_path / "dataset" + base_path.mkdir() + + with pytest.raises(HuggingFaceHubClientUploadError, match="Required file not found"): + client.upload_dataset( + repo_id="test/dataset", + base_dataset_path=base_path, + description="Test description", + ) + + +def test_upload_dataset_card_calls_push_to_hub(mock_hf_api: MagicMock, sample_dataset_path: Path) -> None: + """Test upload_dataset generates and pushes dataset card.""" + client = HuggingFaceHubClient(token="test-token") + + with patch("data_designer.integrations.huggingface.client.DataDesignerDatasetCard") as mock_card_class: + mock_card = MagicMock() + mock_card_class.from_metadata.return_value = mock_card + + client.upload_dataset( + repo_id="test/dataset", + base_dataset_path=sample_dataset_path, + description="Test description", + ) + + # Verify card was created and pushed + mock_card_class.from_metadata.assert_called_once() + mock_card.push_to_hub.assert_called_once() + + +def test_upload_dataset_without_processors( + mock_hf_api: MagicMock, mock_dataset_card: MagicMock, tmp_path: Path +) -> None: + """Test upload_dataset when no processor outputs exist.""" + # Create dataset path without processors directory + base_path = tmp_path / "dataset" + base_path.mkdir() + + parquet_dir = base_path / "parquet-files" + parquet_dir.mkdir() + (parquet_dir / "batch_00000.parquet").write_text("dummy data") + + metadata = {"target_num_records": 10, "schema": {"col1": "string"}, "column_statistics": []} + 
(base_path / "metadata.json").write_text(json.dumps(metadata)) + + client = HuggingFaceHubClient(token="test-token") + + client.upload_dataset( + repo_id="test/dataset", + base_dataset_path=base_path, + description="Test dataset", + ) + + # Should only upload parquet files, not processors + folder_calls = mock_hf_api.upload_folder.call_args_list + data_calls = [call for call in folder_calls if call.kwargs["path_in_repo"] == "data"] + processor_calls = [call for call in folder_calls if "processor" in call.kwargs["path_in_repo"]] + + assert len(data_calls) == 1 # Main parquet files uploaded + assert len(processor_calls) == 0 # No processor files + + +def test_upload_dataset_without_builder_config( + mock_hf_api: MagicMock, mock_dataset_card: MagicMock, tmp_path: Path +) -> None: + """Test upload_dataset when builder_config.json doesn't exist.""" + base_path = tmp_path / "dataset" + base_path.mkdir() + + parquet_dir = base_path / "parquet-files" + parquet_dir.mkdir() + (parquet_dir / "batch_00000.parquet").write_text("dummy data") + + metadata = {"target_num_records": 10, "schema": {"col1": "string"}, "column_statistics": []} + (base_path / "metadata.json").write_text(json.dumps(metadata)) + + # No builder_config.json file + + client = HuggingFaceHubClient(token="test-token") + + client.upload_dataset( + repo_id="test/dataset", + base_dataset_path=base_path, + description="Test dataset", + ) + + # Should only upload metadata.json, not builder_config.json + file_calls = mock_hf_api.upload_file.call_args_list + uploaded_files = [call.kwargs["path_in_repo"] for call in file_calls] + + assert len(uploaded_files) == 1 # Only metadata.json + assert "metadata.json" in uploaded_files + assert "builder_config.json" not in uploaded_files + + +def test_upload_dataset_multiple_processors( + mock_hf_api: MagicMock, mock_dataset_card: MagicMock, sample_dataset_path: Path +) -> None: + """Test that multiple processor outputs are uploaded correctly.""" + client = 
HuggingFaceHubClient(token="test-token") + + client.upload_dataset( + repo_id="test/dataset", + base_dataset_path=sample_dataset_path, + description="Test dataset", + ) + + # Check that both processors were uploaded + folder_calls = mock_hf_api.upload_folder.call_args_list + processor_calls = [call for call in folder_calls if "processor" in call.kwargs["path_in_repo"]] + + assert len(processor_calls) >= 2 + processor_paths = [call.kwargs["path_in_repo"] for call in processor_calls] + assert any("processor1" in path for path in processor_paths) + assert any("processor2" in path for path in processor_paths) + + +# Error handling and validation tests + + +def test_validate_repo_id_invalid_format(sample_dataset_path: Path) -> None: + """Test upload fails with invalid repo_id formats.""" + client = HuggingFaceHubClient(token="test-token") + + # Missing slash + with pytest.raises(HuggingFaceHubClientUploadError, match="Invalid repo_id format"): + client.upload_dataset("my-dataset", sample_dataset_path, "Test") + + # Too many slashes (caught by regex) + with pytest.raises(HuggingFaceHubClientUploadError, match="Invalid repo_id format"): + client.upload_dataset("user/org/dataset", sample_dataset_path, "Test") + + # Invalid characters (space) + with pytest.raises(HuggingFaceHubClientUploadError, match="Invalid repo_id format"): + client.upload_dataset("user/my dataset", sample_dataset_path, "Test") + + # Empty string + with pytest.raises(HuggingFaceHubClientUploadError, match="must be a non-empty string"): + client.upload_dataset("", sample_dataset_path, "Test") + + +def test_validate_dataset_path_not_exists(tmp_path: Path) -> None: + """Test upload fails when dataset path doesn't exist.""" + client = HuggingFaceHubClient(token="test-token") + non_existent = tmp_path / "does-not-exist" + + with pytest.raises(HuggingFaceHubClientUploadError, match="does not exist"): + client.upload_dataset("test/dataset", non_existent, "Test") + + +def 
test_validate_dataset_path_is_file(tmp_path: Path) -> None: + """Test upload fails when dataset path is a file.""" + client = HuggingFaceHubClient(token="test-token") + file_path = tmp_path / "file.txt" + file_path.write_text("not a directory") + + with pytest.raises(HuggingFaceHubClientUploadError, match="not a directory"): + client.upload_dataset("test/dataset", file_path, "Test") + + +def test_validate_dataset_path_missing_metadata(tmp_path: Path) -> None: + """Test upload fails when metadata.json is missing.""" + client = HuggingFaceHubClient(token="test-token") + base_path = tmp_path / "dataset" + base_path.mkdir() + + with pytest.raises(HuggingFaceHubClientUploadError, match="Required file not found"): + client.upload_dataset("test/dataset", base_path, "Test") + + +def test_validate_dataset_path_missing_parquet_folder(tmp_path: Path) -> None: + """Test upload fails when parquet-files directory is missing.""" + client = HuggingFaceHubClient(token="test-token") + base_path = tmp_path / "dataset" + base_path.mkdir() + (base_path / "metadata.json").write_text('{"target_num_records": 10}') + + with pytest.raises(HuggingFaceHubClientUploadError, match="Required directory not found"): + client.upload_dataset("test/dataset", base_path, "Test") + + +def test_validate_dataset_path_empty_parquet_folder(tmp_path: Path) -> None: + """Test upload fails when parquet-files directory is empty.""" + client = HuggingFaceHubClient(token="test-token") + base_path = tmp_path / "dataset" + base_path.mkdir() + (base_path / "metadata.json").write_text('{"target_num_records": 10}') + parquet_dir = base_path / "parquet-files" + parquet_dir.mkdir() + + with pytest.raises(HuggingFaceHubClientUploadError, match="directory is empty"): + client.upload_dataset("test/dataset", base_path, "Test") + + +def test_validate_dataset_path_invalid_metadata_json(tmp_path: Path) -> None: + """Test upload fails when metadata.json contains invalid JSON.""" + client = 
HuggingFaceHubClient(token="test-token") + base_path = tmp_path / "dataset" + base_path.mkdir() + (base_path / "metadata.json").write_text("invalid json {{{") + parquet_dir = base_path / "parquet-files" + parquet_dir.mkdir() + (parquet_dir / "batch_00000.parquet").write_text("data") + + with pytest.raises(HuggingFaceHubClientUploadError, match="Invalid JSON"): + client.upload_dataset("test/dataset", base_path, "Test") + + +def test_validate_dataset_path_invalid_builder_config_json(tmp_path: Path) -> None: + """Test upload fails when builder_config.json contains invalid JSON.""" + client = HuggingFaceHubClient(token="test-token") + base_path = tmp_path / "dataset" + base_path.mkdir() + (base_path / "metadata.json").write_text('{"target_num_records": 10}') + (base_path / "builder_config.json").write_text("invalid json {{{") + parquet_dir = base_path / "parquet-files" + parquet_dir.mkdir() + (parquet_dir / "batch_00000.parquet").write_text("data") + + with pytest.raises(HuggingFaceHubClientUploadError, match="Invalid JSON"): + client.upload_dataset("test/dataset", base_path, "Test") + + +def test_upload_dataset_invalid_repo_id(mock_hf_api: MagicMock, sample_dataset_path: Path) -> None: + """Test upload_dataset fails with invalid repo_id.""" + client = HuggingFaceHubClient(token="test-token") + + with pytest.raises(HuggingFaceHubClientUploadError, match="Invalid repo_id format"): + client.upload_dataset( + repo_id="invalid-repo-id", # Missing slash + base_dataset_path=sample_dataset_path, + description="Test dataset", + ) + + +def test_upload_dataset_authentication_error(mock_hf_api: MagicMock, sample_dataset_path: Path) -> None: + """Test upload_dataset handles authentication errors.""" + client = HuggingFaceHubClient(token="invalid-token") + + # Mock 401 authentication error + error_response = MagicMock() + error_response.status_code = 401 + mock_hf_api.create_repo.side_effect = HfHubHTTPError("Unauthorized", response=error_response) + + with 
pytest.raises(HuggingFaceHubClientUploadError, match="Authentication failed"): + client.upload_dataset( + repo_id="test/dataset", + base_dataset_path=sample_dataset_path, + description="Test dataset", + ) + + +def test_upload_dataset_permission_error(mock_hf_api: MagicMock, sample_dataset_path: Path) -> None: + """Test upload_dataset handles permission errors.""" + client = HuggingFaceHubClient(token="test-token") + + # Mock 403 permission error + error_response = MagicMock() + error_response.status_code = 403 + mock_hf_api.create_repo.side_effect = HfHubHTTPError("Forbidden", response=error_response) + + with pytest.raises(HuggingFaceHubClientUploadError, match="Permission denied"): + client.upload_dataset( + repo_id="test/dataset", + base_dataset_path=sample_dataset_path, + description="Test dataset", + ) + + +def test_upload_dataset_card_invalid_json(tmp_path: Path) -> None: + """Test upload fails when metadata.json contains invalid JSON.""" + client = HuggingFaceHubClient(token="test-token") + base_path = tmp_path / "dataset" + base_path.mkdir() + (base_path / "metadata.json").write_text("invalid json") + + # Create parquet directory so validation reaches the metadata JSON check + parquet_dir = base_path / "parquet-files" + parquet_dir.mkdir() + (parquet_dir / "batch_00000.parquet").write_text("data") + + with pytest.raises(HuggingFaceHubClientUploadError, match="Invalid JSON"): + client.upload_dataset( + repo_id="test/dataset", + base_dataset_path=base_path, + description="Test description", + ) + + +def test_update_metadata_paths(tmp_path: Path) -> None: + """Test that _update_metadata_paths correctly updates file paths for HuggingFace Hub.""" + metadata = { + "target_num_records": 100, + "file_paths": { + "parquet-files": [ + "parquet-files/batch_00000.parquet", + "parquet-files/batch_00001.parquet", + ], + "processor-files": { + "processor1": ["processors-files/processor1/batch_00000.parquet"], + "processor2": 
["processors-files/processor2/batch_00000.parquet"], + }, + }, + } + + metadata_path = tmp_path / "metadata.json" + with open(metadata_path, "w") as f: + json.dump(metadata, f) + + updated = HuggingFaceHubClient._update_metadata_paths(metadata_path) + + assert updated["file_paths"]["data"] == [ + "data/batch_00000.parquet", + "data/batch_00001.parquet", + ] + assert updated["file_paths"]["processor-files"]["processor1"] == ["processor1/batch_00000.parquet"] + assert updated["file_paths"]["processor-files"]["processor2"] == ["processor2/batch_00000.parquet"] + assert "parquet-files" not in updated["file_paths"] diff --git a/packages/data-designer/tests/integrations/huggingface/test_dataset_card.py b/packages/data-designer/tests/integrations/huggingface/test_dataset_card.py new file mode 100644 index 000000000..ce7b28325 --- /dev/null +++ b/packages/data-designer/tests/integrations/huggingface/test_dataset_card.py @@ -0,0 +1,267 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

"""Unit tests for DataDesignerDatasetCard generation from dataset metadata."""

from __future__ import annotations

import pytest

from data_designer.integrations.huggingface.dataset_card import DataDesignerDatasetCard


@pytest.fixture
def stub_metadata() -> dict:
    """Stub metadata fixture with single column that can be used/modified by most tests."""
    return {
        "target_num_records": 100,
        "schema": {"col1": "string"},
        "column_statistics": [
            {
                "column_name": "col1",
                "num_records": 100,
                "num_unique": 100,
                "num_null": 0,
                "simple_dtype": "string",
                "column_type": "sampler",
            }
        ],
    }


def test_compute_size_category() -> None:
    """Test size category computation for various dataset sizes."""
    # NOTE(review): the original assertions were garbled during extraction
    # ('== "1K10M"'); reconstructed against the Hugging Face `size_categories`
    # buckets implied by the surviving fragments ("n<1K", "1K", "10M") —
    # confirm the intermediate thresholds against _compute_size_category.
    assert DataDesignerDatasetCard._compute_size_category(500) == "n<1K"
    assert DataDesignerDatasetCard._compute_size_category(5000) == "1K<n<10K"
    assert DataDesignerDatasetCard._compute_size_category(50000) == "10K<n<100K"
    assert DataDesignerDatasetCard._compute_size_category(500000) == "100K<n<1M"
    assert DataDesignerDatasetCard._compute_size_category(5000000) == "1M<n<10M"
    assert DataDesignerDatasetCard._compute_size_category(50000000) == "n>10M"


def test_from_metadata_minimal(stub_metadata: dict) -> None:
    """Test creating dataset card from minimal metadata."""
    # Add second column for this test
    stub_metadata["schema"]["col2"] = "int64"

    card = DataDesignerDatasetCard.from_metadata(
        metadata=stub_metadata,
        builder_config=None,
        repo_id="test/dataset",
        description="Test dataset for unit testing.",
    )

    # Verify card was created
    assert card is not None
    assert "test/dataset" in str(card)
    assert "100" in str(card)
    assert "col1" in str(card)
    assert "2" in str(card)  # Number of columns


def test_from_metadata_with_builder_config(stub_metadata: dict) -> None:
    """Test creating dataset card with builder config."""
    # Customize for this test
    stub_metadata["target_num_records"] = 50
    stub_metadata["schema"] = {"name": "string", "age": "int64"}
    stub_metadata["column_statistics"] = [
        {
            "column_name": "name",
            "num_records": 50,
            "num_unique": 50,
            "num_null": 0,
            "simple_dtype": "string",
            "column_type": "sampler",
            "sampler_type": "person",
        },
        {
            "column_name": "age",
            "num_records": 50,
            "num_unique": 30,
            "num_null": 0,
            "simple_dtype": "int64",
            "column_type": "sampler",
            "sampler_type": "uniform",
        },
    ]

    builder_config = {
        "data_designer": {
            "columns": [
                {"name": "name", "column_type": "sampler"},
                {"name": "age", "column_type": "sampler"},
            ]
        }
    }

    card = DataDesignerDatasetCard.from_metadata(
        metadata=stub_metadata,
        builder_config=builder_config,
        repo_id="test/dataset-with-config",
        description="Test dataset with builder config.",
    )

    # Verify card includes config info
    assert card is not None
    assert "sampler" in str(card)
    assert "2 column" in str(card)


def test_from_metadata_with_llm_columns(stub_metadata: dict) -> None:
    """Test creating dataset card with LLM column statistics."""
    # Customize for LLM test
    stub_metadata["target_num_records"] = 10
    stub_metadata["schema"] = {"prompt": "string", "response": "string"}
    stub_metadata["column_statistics"] = [
        {
            "column_name": "response",
            "num_records": 10,
            "num_unique": 10,
            "num_null": 0,
            "simple_dtype": "string",
            "column_type": "llm-text",
            "output_tokens_mean": 50.5,
            "input_tokens_mean": 20.3,
        }
    ]

    card = DataDesignerDatasetCard.from_metadata(
        metadata=stub_metadata,
        builder_config=None,
        repo_id="test/llm-dataset",
        description="Test dataset with LLM columns.",
    )

    # Verify LLM statistics are included
    assert card is not None
    assert "Tokens:" in str(card) and "out" in str(card) and "in" in str(card)


def test_from_metadata_with_processors(stub_metadata: dict) -> None:
    """Test creating dataset card with processor outputs includes loading examples."""
    # Add processor files for this test
    stub_metadata["file_paths"] = {
        "parquet-files": ["parquet-files/batch_00000.parquet"],
        "processor-files": {
            "processor1": ["processors-files/processor1/batch_00000.parquet"],
            "processor2": ["processors-files/processor2/batch_00000.parquet"],
        },
    }

    card = DataDesignerDatasetCard.from_metadata(
        metadata=stub_metadata,
        builder_config=None,
        repo_id="test/dataset-with-processors",
        description="Test dataset with processor outputs.",
    )

    card_str = str(card)
    assert card is not None
    assert "processor1" in card_str
    assert "processor2" in card_str
    assert '"processor1"' in card_str
    assert '"processor2"' in card_str
    assert "Load processor outputs" in card_str


def test_from_metadata_with_custom_description(stub_metadata: dict) -> None:
    """Test creating dataset card with custom description."""
    # Add second column for this test
    stub_metadata["schema"]["col2"] = "int64"

    description = "This dataset contains synthetic data for testing chatbot responses."

    card = DataDesignerDatasetCard.from_metadata(
        metadata=stub_metadata,
        builder_config=None,
        repo_id="test/dataset-with-description",
        description=description,
    )

    card_str = str(card)
    assert card is not None
    assert "This dataset contains synthetic data for testing chatbot responses." in card_str


def test_from_metadata_description_placement(stub_metadata: dict) -> None:
    """Test that description appears in the correct location."""
    # Use 50 records for this test
    stub_metadata["target_num_records"] = 50
    stub_metadata["column_statistics"][0]["num_records"] = 50

    card = DataDesignerDatasetCard.from_metadata(
        metadata=stub_metadata,
        builder_config=None,
        repo_id="test/dataset-description-placement",
        description="Test description placement.",
    )

    card_str = str(card)
    assert card is not None
    assert "Test description placement." in card_str
    assert "About NeMo Data Designer" in card_str
    # Description should appear before Dataset Summary
    desc_pos = card_str.find("Test description placement.")
    summary_pos = card_str.find("Dataset Summary")
    assert desc_pos < summary_pos


def test_from_metadata_default_tags(stub_metadata: dict) -> None:
    """Test that default tags are included when no custom tags are provided."""
    card = DataDesignerDatasetCard.from_metadata(
        metadata=stub_metadata,
        builder_config=None,
        repo_id="test/dataset-default-tags",
        description="Test dataset with default tags.",
    )

    card_str = str(card)
    assert card is not None
    # Check that default tags appear in the YAML frontmatter
    assert "- synthetic" in card_str
    assert "- datadesigner" in card_str


def test_from_metadata_with_custom_tags(stub_metadata: dict) -> None:
    """Test that custom tags are added to default tags."""
    custom_tags = ["chatbot", "conversation", "qa"]

    card = DataDesignerDatasetCard.from_metadata(
        metadata=stub_metadata,
        builder_config=None,
        repo_id="test/dataset-custom-tags",
        description="Test dataset with custom tags.",
        tags=custom_tags,
    )

    card_str = str(card)
    assert card is not None
    # Check that both default and custom tags appear in the YAML frontmatter
    assert "- synthetic" in card_str
    assert "- datadesigner" in card_str
    assert "- chatbot" in card_str
    assert "- conversation" in card_str
    assert "- qa" in card_str


def test_from_metadata_tags_in_yaml_frontmatter(stub_metadata: dict) -> None:
    """Test that tags appear in the YAML frontmatter section."""
    # Use 50 records for this test
    stub_metadata["target_num_records"] = 50
    stub_metadata["column_statistics"][0]["num_records"] = 50

    card = DataDesignerDatasetCard.from_metadata(
        metadata=stub_metadata,
        builder_config=None,
        repo_id="test/dataset-tags-frontmatter",
        description="Test dataset.",
        tags=["custom-tag"],
    )

    card_str = str(card)
    assert card is not None
    # Tags should appear before the main content (in YAML frontmatter)
    tags_section = card_str.find("tags:")
    quick_start_section = card_str.find("## 🚀 Quick Start")
    assert tags_section < quick_start_section
    assert tags_section != -1  # Make sure tags section exists
    # Verify tags appear before the closing of YAML frontmatter
    assert tags_section < card_str.find("---", tags_section)