From 94c2112dff4638bcff2fbba51b1215bedd948c17 Mon Sep 17 00:00:00 2001 From: davidberenstein1957 Date: Fri, 12 Dec 2025 07:03:01 +0100 Subject: [PATCH 1/5] feat: integrate Hugging Face Hub functionality with dataset push/pull capabilities - Added `HuggingFaceHubMixin` to facilitate pushing and pulling datasets to/from Hugging Face Hub. - Implemented `pull_from_hub` method in `DatasetCreationResults` for loading datasets and artifacts from the hub. - Created `HubDatasetResults` class to encapsulate results from hub operations, including datasets, analysis, and metadata. - Developed integration tests for verifying push and pull operations with Hugging Face Hub. - Introduced dataset card generation for datasets pushed to the hub, enhancing documentation and usability. --- .../interface/huggingface/__init__.py | 8 + .../interface/huggingface/dataset_card.py | 21 + .../huggingface/dataset_card_template.md | 175 +++ .../interface/huggingface/hub_mixin.py | 1118 +++++++++++++++++ .../interface/huggingface/hub_results.py | 43 + src/data_designer/interface/results.py | 56 +- tests/interface/test_hub_integration.py | 353 ++++++ 7 files changed, 1773 insertions(+), 1 deletion(-) create mode 100644 src/data_designer/interface/huggingface/__init__.py create mode 100644 src/data_designer/interface/huggingface/dataset_card.py create mode 100644 src/data_designer/interface/huggingface/dataset_card_template.md create mode 100644 src/data_designer/interface/huggingface/hub_mixin.py create mode 100644 src/data_designer/interface/huggingface/hub_results.py create mode 100644 tests/interface/test_hub_integration.py diff --git a/src/data_designer/interface/huggingface/__init__.py b/src/data_designer/interface/huggingface/__init__.py new file mode 100644 index 000000000..7c6d2eaec --- /dev/null +++ b/src/data_designer/interface/huggingface/__init__.py @@ -0,0 +1,8 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +from data_designer.interface.huggingface.hub_mixin import HuggingFaceHubMixin, pull_from_hub +from data_designer.interface.huggingface.hub_results import HubDatasetResults + +__all__ = ["HuggingFaceHubMixin", "pull_from_hub", "HubDatasetResults"] + diff --git a/src/data_designer/interface/huggingface/dataset_card.py b/src/data_designer/interface/huggingface/dataset_card.py new file mode 100644 index 000000000..6bf634887 --- /dev/null +++ b/src/data_designer/interface/huggingface/dataset_card.py @@ -0,0 +1,21 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +from pathlib import Path + +from huggingface_hub import DatasetCard + +TEMPLATE_DATA_DESIGNER_DATASET_CARD_PATH = Path(__file__).parent / "dataset_card_template.md" + + +class DataDesignerDatasetCard(DatasetCard): + """Dataset card for NeMo Data Designer datasets. + + This class extends Hugging Face's DatasetCard with a custom template + specifically designed for Data Designer generated datasets. + The template is located at `data_designer/interface/huggingface/dataset_card_template.md`. + """ + + default_template_path = TEMPLATE_DATA_DESIGNER_DATASET_CARD_PATH diff --git a/src/data_designer/interface/huggingface/dataset_card_template.md b/src/data_designer/interface/huggingface/dataset_card_template.md new file mode 100644 index 000000000..d676a4d6e --- /dev/null +++ b/src/data_designer/interface/huggingface/dataset_card_template.md @@ -0,0 +1,175 @@ +--- +size_categories: {{ size_categories }} +tags: +{% for tag in tags %} + - {{ tag }} +{% endfor %} +--- + +# Dataset Card + +This dataset was generated using **NeMo Data Designer**, a comprehensive framework for creating high-quality synthetic datasets from scratch or using seed data. 
+ +## About NeMo Data Designer + +NeMo Data Designer is a general framework for generating high-quality synthetic data that goes beyond simple LLM prompting. It provides: + +- **Diverse data generation** using statistical samplers, LLMs, or existing seed datasets +- **Relationship control** between fields with dependency-aware generation +- **Quality validation** with built-in Python, SQL, and custom local and remote validators +- **LLM-as-a-judge** scoring for quality assessment +- **Fast iteration** with preview mode before full-scale generation + +For more information, visit: [https://github.com/NVIDIA-NeMo/DataDesigner](https://github.com/NVIDIA-NeMo/DataDesigner) + +## Quick Start + +Load this dataset for fine-tuning: + +```python +from datasets import load_dataset + +dataset = load_dataset("{{ repo_id }}") +# Access the data +df = dataset["train"].to_pandas() +``` + +Or with NeMo Data Designer: + +```python +from data_designer.interface.results import DatasetCreationResults + +# Load dataset with all artifacts (analysis, configs, etc.) 
+results = DatasetCreationResults.pull_from_hub("{{ repo_id }}") + +# Access the dataset +df = results.load_dataset() + +# Access the analysis +analysis = results.load_analysis() + +# Access the config builder +config_builder = results._config_builder +``` + +## Dataset Summary + +- **Number of records**: {% if num_records is defined and num_records is not none %}{{ "{:,}".format(num_records) }}{% else %}N/A{% endif %} +- **Number of columns**: {{ num_columns }} +- **Size category**: {{ size_categories }} +{% if target_num_records is defined and target_num_records is not none and target_num_records != num_records %} +- **Target records**: {{ "{:,}".format(target_num_records) }} ({{ "%.1f" | format(percent_complete) if percent_complete is defined and percent_complete is not none else "N/A" }}% complete) +{% endif %} + +## Sample Data + +{% if num_samples > 0 %} +Here are sample records from the dataset: + +{% for idx in range(num_samples) %} +### Example {{ idx + 1 }} + +```json +{{ sample_records[idx] | tojson(indent=2) }} +``` +{% endfor %} +{% else %} +No sample records available. +{% endif %} + +## Schema + +{% if all_columns is defined and all_columns %} +| Column | Type | Description | +|--------|------|-------------| +{% for col_name, dtype in all_columns | dictsort -%} +| `{{ col_name }}` | {{ dtype }} | {% if column_configs %}{% for col_config in column_configs %}{% if col_config.get('name') == col_name %}{% set col_type = col_config.get('column_type') %}{% if col_type is mapping %}{{ col_type.get('value', '') }}{% elif col_type %}{{ col_type }}{% endif %}{% endif %}{% endfor %}{% endif %} | +{% endfor -%} +{% else %} +No column information available. 
+{% endif %} + +## Data Quality + +{% if column_stats_by_type %} +### Column Statistics + +{% for col_type in sorted_column_types %} +{% set stats_list = column_stats_by_type[col_type] %} +{% if stats_list %} +{% set col_type_label = col_type.replace("_", " ").title().replace("Llm", "LLM") %} +#### {{ col_type_label }} Columns + +{% if col_type == "sampler" %} +| Column | Data Type | Unique Values | Sampler Type | +|--------|-----------|---------------|--------------| +{% for stat in stats_list -%} +| **{{ stat.get('column_name', 'unknown') }}** | {{ stat.get('simple_dtype', 'unknown') }} | {% if 'num_unique' in stat and stat['num_unique'] is not none %}{{ stat['num_unique'] }}{% else %}N/A{% endif %} ({% if 'num_unique' in stat and stat['num_unique'] is not none and num_records > 0 %}{{ "%.1f" | format((stat['num_unique'] / num_records * 100)) }}{% else %}0.0{% endif %}%) | {% if 'sampler_type' in stat and stat['sampler_type'] is not none %}{% set sampler_type = stat['sampler_type'] %}{% if sampler_type is mapping %}{{ sampler_type.get('value', 'N/A') }}{% else %}{{ sampler_type }}{% endif %}{% else %}N/A{% endif %} | +{% endfor -%} + +{% elif col_type in ["llm_text", "llm_structured", "llm_code", "llm_judge"] %} +| Column | Data Type | Unique Values | Prompt Tokens (avg) | Completion Tokens (avg) | +|--------|-----------|---------------|---------------------|--------------------------| +{% for stat in stats_list -%} +| **{{ stat.get('column_name', 'unknown') }}** | {{ stat.get('simple_dtype', 'unknown') }} | {% if 'num_unique' in stat and stat['num_unique'] is not none %}{{ stat['num_unique'] }}{% else %}N/A{% endif %} ({% if 'num_unique' in stat and stat['num_unique'] is not none and num_records > 0 %}{{ "%.1f" | format((stat['num_unique'] / num_records * 100)) }}{% else %}0.0{% endif %}%) | {% if 'prompt_tokens_mean' in stat and stat['prompt_tokens_mean'] is not none %}{{ "%.1f" | format(stat['prompt_tokens_mean']) }}{% else %}N/A{% endif %} ± {% if 
'prompt_tokens_stddev' in stat and stat['prompt_tokens_stddev'] is not none %}{{ "%.1f" | format(stat['prompt_tokens_stddev']) }}{% else %}N/A{% endif %} | {% if 'completion_tokens_mean' in stat and stat['completion_tokens_mean'] is not none %}{{ "%.1f" | format(stat['completion_tokens_mean']) }}{% else %}N/A{% endif %} ± {% if 'completion_tokens_stddev' in stat and stat['completion_tokens_stddev'] is not none %}{{ "%.1f" | format(stat['completion_tokens_stddev']) }}{% else %}N/A{% endif %} | +{% endfor -%} + +{% else %} +| Column | Data Type | Unique Values | Null Values | +|--------|-----------|---------------|-------------| +{% for stat in stats_list -%} +| **{{ stat.get('column_name', 'unknown') }}** | {{ stat.get('simple_dtype', 'unknown') }} | {% if 'num_unique' in stat and stat['num_unique'] is not none %}{{ stat['num_unique'] }}{% else %}N/A{% endif %} ({% if 'num_unique' in stat and stat['num_unique'] is not none and num_records > 0 %}{{ "%.1f" | format((stat['num_unique'] / num_records * 100)) }}{% else %}0.0{% endif %}%) | {% if 'num_null' in stat and stat['num_null'] is not none %}{{ stat['num_null'] }}{% else %}0{% endif %} ({% if 'num_null' in stat and stat['num_null'] is not none and num_records > 0 %}{{ "%.1f" | format((stat['num_null'] / num_records * 100)) }}{% else %}0.0{% endif %}%) | +{% endfor -%} +{% endif %} +{% endif %} + +{% endfor %} +{% elif column_statistics %} +{% for stat in column_statistics[:10] %} +- **{{ stat.get('column_name', 'unknown') }}** ({{ stat.get('column_type', 'unknown') }}): {% if 'num_unique' in stat and stat['num_unique'] is not none %}{{ stat['num_unique'] }} unique values{% if num_records > 0 %} ({{ "%.1f" | format((stat['num_unique'] / num_records * 100)) }}% coverage){% endif %}{% else %}N/A{% endif %}{% if 'num_null' in stat and stat['num_null'] is not none and stat['num_null'] > 0 %}, {{ stat['num_null'] }} nulls{% endif %} +{% endfor %} +{% if column_statistics | length > 10 %} +*... 
and {{ (column_statistics | length) - 10 }} more columns* +{% endif %} +{% endif %} + +## Configuration Details + +{% if column_configs %} +This dataset was generated with {{ column_configs | length }} column configuration(s). + +### Generation Strategy + +{% for config_type, count in config_types | dictsort %} +- **{{ config_type }}**: {{ count }} column(s) +{% endfor %} + +### Column Configurations + +{% for col_config in column_configs %} +- **{{ col_config.get('name', 'unknown') }}**: {% set col_type = col_config.get('column_type') %}{% if col_type is mapping %}{{ col_type.get('value', 'unknown') }}{% elif col_type %}{{ col_type }}{% else %}unknown{% endif %} +{% endfor %} +{% else %} +No column configurations available. +{% endif %} + +{% if metadata %} +## Metadata + +```json +{{ metadata | tojson(indent=2) }} +``` +{% endif %} + +## Citation + +If you use this dataset in your research, please cite: + +```bibtex +@software{data_designer, + title={NeMo Data Designer: A Framework for Synthetic Dataset Generation}, + author={NVIDIA}, + year={2025}, + url={https://github.com/NVIDIA-NeMo/DataDesigner} +} +``` diff --git a/src/data_designer/interface/huggingface/hub_mixin.py b/src/data_designer/interface/huggingface/hub_mixin.py new file mode 100644 index 000000000..b3117abf4 --- /dev/null +++ b/src/data_designer/interface/huggingface/hub_mixin.py @@ -0,0 +1,1118 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +import json +from pathlib import Path +from tempfile import TemporaryDirectory +from typing import Any, Protocol + +import pandas as pd +from datasets import Dataset, DatasetDict, load_dataset +from huggingface_hub import HfApi, get_token, hf_hub_download, list_repo_files +from huggingface_hub.utils import HfHubHTTPError + +from data_designer.engine.dataset_builders.errors import ArtifactStorageError +from data_designer.interface.huggingface.dataset_card import DataDesignerDatasetCard +from data_designer.interface.huggingface.hub_results import HubDatasetResults + + +def _resolve_hf_token(token: str | None) -> str | None: + """Resolve the Hugging Face token from parameter or huggingface_hub. + + This function tries to resolve a token in the following order: + 1. Token provided as parameter + 2. huggingface_hub's get_token() (checks environment variables, cache, config file, etc.) + + Args: + token: Token provided as parameter. + + Returns: + Resolved token or None if not found. + """ + if token is not None: + return token + + # Try to get token from huggingface_hub (checks env vars, cache, config file, etc.) + try: + token = get_token() + if token: + return token + except Exception: + # If get_token fails, continue to return None + pass + + # Return None - huggingface_hub will handle authentication if user is logged in + return None + + +def _size_categories_parser(num_records: int) -> str: + """Parse dataset size into Hugging Face size category. + + Uses the same category names as Argilla's size_categories_parser. + + Args: + num_records: Number of records in the dataset. 
+ + Returns: + Size category string matching Hugging Face format (e.g., "n<1K", "1K1T" + + +def _build_card_template_variables( + dataset_df: pd.DataFrame, + analysis: Any, + config_builder: Any, + metadata: dict[str, Any] | None, + repo_id: str, +) -> dict[str, Any]: + """Build template variables for the dataset card. + + Args: + dataset_df: The dataset as a pandas DataFrame. + analysis: Profiling analysis results. + config_builder: Configuration builder. + metadata: Optional metadata dictionary. + repo_id: Repository ID. + + Returns: + Dictionary of template variables. + """ + column_configs = config_builder.get_column_configs() + column_names = set(dataset_df.columns) + + # Prepare column information + unconfigured_columns = {} + all_columns = {} + # Always populate all_columns for template use + for col_name in sorted(column_names): + all_columns[col_name] = str(dataset_df[col_name].dtype) + + if column_configs: + configured_names = {col.name for col in column_configs} + unconfigured = column_names - configured_names + for col_name in sorted(unconfigured): + unconfigured_columns[col_name] = str(dataset_df[col_name].dtype) + + # Prepare sample records + num_samples = min(5, len(dataset_df)) + sample_records = [] + if num_samples > 0: + sample_df = dataset_df.head(num_samples) + records = sample_df.to_dict(orient="records") + for record in records: + # Convert to JSON-serializable format, handling complex types + serializable_record = {} + for k, v in record.items(): + if isinstance(v, (str, int, float, bool, type(None))): + serializable_record[k] = v + else: + # Convert complex types to string representation + serializable_record[k] = str(v) + sample_records.append(serializable_record) + + # Convert column_configs to dicts for safer template rendering + column_configs_dicts = [] + if column_configs: + for col_config in column_configs: + if hasattr(col_config, "model_dump"): + # Pydantic model + config_dict = col_config.model_dump(mode="json") + # Convert 
column_type enum to dict if it exists + if "column_type" in config_dict and hasattr(config_dict["column_type"], "value"): + config_dict["column_type"] = {"value": config_dict["column_type"].value} + elif "column_type" in config_dict and not isinstance(config_dict["column_type"], dict): + # If it's an enum object, convert it + col_type = getattr(col_config, "column_type", None) + if col_type and hasattr(col_type, "value"): + config_dict["column_type"] = {"value": col_type.value} + else: + config_dict["column_type"] = {"value": str(config_dict.get("column_type", "unknown"))} + column_configs_dicts.append(config_dict) + elif hasattr(col_config, "__dict__"): + # Regular object - convert to dict + config_dict = {} + for key in dir(col_config): + if not key.startswith("_") and not callable(getattr(col_config, key, None)): + try: + value = getattr(col_config, key, None) + if isinstance(value, (str, int, float, bool, type(None))): + config_dict[key] = value + elif hasattr(value, "value"): # Enum + config_dict[key] = {"value": value.value} if key == "column_type" else value.value + else: + config_dict[key] = str(value) if value is not None else None + except Exception: + pass + column_configs_dicts.append(config_dict) + else: + column_configs_dicts.append(col_config) + + # Prepare config types summary + config_types: dict[str, int] = {} + if column_configs: + for col_config in column_configs: + config_type = type(col_config).__name__ + config_types[config_type] = config_types.get(config_type, 0) + 1 + + # Group column statistics by type + from data_designer.config.column_types import DataDesignerColumnType, get_column_display_order + + column_stats_by_type: dict[str, list] = {} + display_order = get_column_display_order() + + for column_type in analysis.column_types: + # column_type is already a string, convert to DataDesignerColumnType enum + try: + column_type_enum = DataDesignerColumnType(column_type) + except (ValueError, TypeError): + # Skip invalid column types + 
continue + stats = analysis.get_column_statistics_by_type(column_type_enum) + if stats: + # Convert stat objects to dicts for safer template rendering + stats_dicts = [] + for stat in stats: + if hasattr(stat, "model_dump"): + # Pydantic model - convert to dict + stat_dict = stat.model_dump(mode="json") + # Handle enum fields like sampler_type + if "sampler_type" in stat_dict and not isinstance(stat_dict["sampler_type"], (str, dict)): + sampler_type = getattr(stat, "sampler_type", None) + if sampler_type and hasattr(sampler_type, "value"): + stat_dict["sampler_type"] = {"value": sampler_type.value} + else: + stat_dict["sampler_type"] = {"value": str(stat_dict.get("sampler_type", "unknown"))} + stats_dicts.append(stat_dict) + elif hasattr(stat, "__dict__"): + # Regular object - convert to dict + stat_dict = {} + for key in dir(stat): + if not key.startswith("_") and not callable(getattr(stat, key, None)): + try: + value = getattr(stat, key, None) + if isinstance(value, (str, int, float, bool, type(None))): + stat_dict[key] = value + elif hasattr(value, "value"): # Enum + # For enums, store as dict with value key for consistency + stat_dict[key] = ( + {"value": value.value} + if key in ["sampler_type", "column_type"] + else value.value + ) + else: + stat_dict[key] = str(value) if value is not None else None + except Exception: + pass + stats_dicts.append(stat_dict) + else: + stats_dicts.append(stat) + column_stats_by_type[column_type] = stats_dicts + + # Sort column types by display order + sorted_column_types = sorted( + column_stats_by_type.keys(), + key=lambda x: display_order.index(x) if x in display_order else len(display_order), + ) + + # Convert column_statistics to dicts for safer template rendering + column_statistics_dicts = [] + if analysis.column_statistics: + for stat in analysis.column_statistics: + if hasattr(stat, "model_dump"): + # Pydantic model + column_statistics_dicts.append(stat.model_dump(mode="json")) + elif hasattr(stat, "__dict__"): + # 
Regular object - convert to dict + stat_dict = {} + for key in dir(stat): + if not key.startswith("_") and not callable(getattr(stat, key, None)): + try: + value = getattr(stat, key, None) + if isinstance(value, (str, int, float, bool, type(None))): + stat_dict[key] = value + elif hasattr(value, "value"): # Enum + stat_dict[key] = value.value + else: + stat_dict[key] = str(value) if value is not None else None + except Exception: + pass + column_statistics_dicts.append(stat_dict) + else: + column_statistics_dicts.append(stat) + + return { + "size_categories": _size_categories_parser(len(dataset_df)), + "num_records": len(dataset_df), + "target_num_records": analysis.target_num_records, + "percent_complete": analysis.percent_complete, + "num_columns": len(dataset_df.columns), + "repo_id": repo_id, + "metadata": metadata or {}, + "column_configs": column_configs_dicts if column_configs_dicts else [], + "unconfigured_columns": unconfigured_columns, + "all_columns": all_columns, + "column_statistics": column_statistics_dicts, + "column_stats_by_type": column_stats_by_type, + "sorted_column_types": sorted_column_types, + "num_samples": num_samples, + "sample_records": sample_records, + "config_types": config_types, + } + + +class HasDataset(Protocol): + """Protocol for classes that have a load_dataset method.""" + + def load_dataset(self) -> pd.DataFrame: ... + + +class HasArtifactStorage(Protocol): + """Protocol for classes that have artifact_storage with metadata_file_path.""" + + @property + def artifact_storage(self) -> Any: ... + + +class HuggingFaceHubMixin: + """Mixin class for pushing and pulling datasets to/from Hugging Face Hub. + + This mixin provides the `push_to_hub` and `pull_from_hub` methods to classes that implement + the `HasDataset` and `HasArtifactStorage` protocols. 
class HuggingFaceHubMixin:
    """Mixin class for pushing and pulling datasets to/from Hugging Face Hub.

    This mixin provides the `push_to_hub` and `pull_from_hub` methods to classes that implement
    the `HasDataset` and `HasArtifactStorage` protocols. The upload helpers additionally read the
    optional `_analysis` and `_config_builder` attributes of the host instance.
    """

    def push_to_hub(
        self: Any,
        repo_id: str,
        *,
        token: str | None = None,
        generate_card: bool = True,
        **kwargs: Any,
    ) -> None:
        """Push the dataset to Hugging Face Hub.

        The DataFrame returned by `load_dataset()` is converted to a Hugging Face
        `Dataset` and pushed; the additional artifacts are uploaded next and the
        dataset card last.

        Args:
            repo_id: The ID of the Hugging Face Hub repository (e.g., "username/dataset-name").
            token: Hugging Face token for authentication. If None, it is resolved by
                `_resolve_token`.
            generate_card: Whether to generate and upload a dataset card. Defaults to True.
            **kwargs: Additional arguments to pass to `dataset.push_to_hub()`.

        Raises:
            ArtifactStorageError: If a card is requested but the instance has no
                `_analysis` or `_config_builder`. Note that this is raised after the
                dataset and artifacts have already been uploaded.
        """
        resolved_token = self._resolve_token(token)
        dataset_df = self.load_dataset()

        Dataset.from_pandas(dataset_df).push_to_hub(repo_id, token=resolved_token, **kwargs)

        # Analysis, processor outputs, metadata and config files.
        self._upload_additional_artifacts(repo_id, resolved_token)

        if generate_card:
            self._upload_dataset_card(repo_id, resolved_token, dataset_df)

    def _resolve_token(self, token: str | None) -> str | None:
        """Resolve the Hugging Face token by delegating to `_resolve_hf_token`.

        Args:
            token: Token provided as parameter.

        Returns:
            Resolved token or None if not found.
        """
        return _resolve_hf_token(token)

    @staticmethod
    def _warn_upload_failed(what: str, error: Exception) -> None:
        """Log a warning for a best-effort upload that failed."""
        import logging

        logging.getLogger(__name__).warning("Failed to upload %s: %s", what, error)

    @staticmethod
    def _hub_shard_paths(file_paths: list[Any]) -> list[str]:
        """Replace local data-file paths with `data/train-XXXXX-of-YYYYY.parquet` names.

        The names only follow the pattern the pushed dataset is expected to use;
        the real shard names are not known at this point. Files whose name
        contains "batch_" use the text after the last underscore of the stem as
        the shard number, all others use their position in `file_paths`.
        """
        total = str(len(file_paths)).zfill(5)
        shard_paths = []
        for position, local_path in enumerate(file_paths):
            local = Path(local_path)
            shard = local.stem.split("_")[-1] if "batch_" in local.name else str(position)
            shard_paths.append(f"data/train-{shard.zfill(5)}-of-{total}.parquet")
        return shard_paths

    @staticmethod
    def _upload_json(hf_api: HfApi, repo_id: str, payload: Any, path_in_repo: str) -> None:
        """Serialise `payload` to a temporary JSON file and upload it as `path_in_repo`."""
        with TemporaryDirectory() as tmpdir:
            local_path = Path(tmpdir) / path_in_repo
            with open(local_path, "w", encoding="utf-8") as f:
                # default=str: values json cannot encode are stored as their string form.
                json.dump(payload, f, indent=2, default=str)
            hf_api.upload_file(
                path_or_fileobj=str(local_path),
                path_in_repo=path_in_repo,
                repo_id=repo_id,
                repo_type="dataset",
            )

    def _upload_additional_artifacts(
        self: Any,
        repo_id: str,
        token: str | None,
    ) -> None:
        """Upload additional artifacts to Hugging Face Hub.

        This includes:
        - Analysis results (as JSON)
        - Processor datasets (as parquet files)
        - Processor artifacts (directories)
        - Metadata (metadata.json, with local file paths rewritten)
        - Configuration files (column_configs.json, model_configs.json)

        Every upload is best-effort: a failure is logged as a warning and the
        remaining artifacts are still attempted.

        Args:
            repo_id: The ID of the Hugging Face Hub repository.
            token: Hugging Face token for authentication.
        """
        hf_api = HfApi(token=token)
        storage = getattr(self, "artifact_storage", None)

        analysis = getattr(self, "_analysis", None)
        if analysis is not None:
            try:
                HuggingFaceHubMixin._upload_json(hf_api, repo_id, analysis.model_dump(mode="json"), "analysis.json")
            except Exception as exc:
                HuggingFaceHubMixin._warn_upload_failed("analysis results", exc)

        processors_path = getattr(storage, "processors_outputs_path", None)
        if processors_path is not None and processors_path.exists():
            self._upload_processor_artifacts(hf_api, repo_id, processors_path)

        metadata_path = getattr(storage, "metadata_file_path", None)
        if metadata_path is not None and metadata_path.exists():
            try:
                with open(metadata_path, "r", encoding="utf-8") as f:
                    metadata = json.load(f)
                # Local absolute paths are meaningless on the Hub; swap them for
                # repo-relative shard names before publishing.
                if "file_paths" in metadata and isinstance(metadata["file_paths"], list):
                    metadata["file_paths"] = HuggingFaceHubMixin._hub_shard_paths(metadata["file_paths"])
                HuggingFaceHubMixin._upload_json(hf_api, repo_id, metadata, "metadata.json")
            except Exception as exc:
                HuggingFaceHubMixin._warn_upload_failed("metadata", exc)

        base_path = getattr(storage, "base_dataset_path", None)
        if base_path is not None:
            for config_file in ("column_configs.json", "model_configs.json"):
                config_path = base_path / config_file
                if not config_path.exists():
                    continue
                try:
                    hf_api.upload_file(
                        path_or_fileobj=str(config_path),
                        path_in_repo=config_file,
                        repo_id=repo_id,
                        repo_type="dataset",
                    )
                except Exception as exc:
                    HuggingFaceHubMixin._warn_upload_failed(config_file, exc)

    def _upload_processor_artifacts(
        self: Any,
        hf_api: HfApi,
        repo_id: str,
        processors_path: Path,
    ) -> None:
        """Upload processor datasets and artifacts to Hugging Face Hub.

        For each sub-directory of `processors_path`, the parquet files directly
        inside it are concatenated and uploaded as `processors/<name>.parquet`;
        every other file below it is uploaded under `processors/` with its path
        relative to `processors_path`. Failures are logged and skipped.

        Args:
            hf_api: Hugging Face API client.
            repo_id: The ID of the Hugging Face Hub repository.
            processors_path: Path to the processors outputs directory.
        """
        for processor_dir in sorted(d for d in processors_path.iterdir() if d.is_dir()):
            processor_name = processor_dir.name

            # Sorted so the combined row order does not depend on the order the
            # filesystem happens to list the files in.
            parquet_files = sorted(processor_dir.glob("*.parquet"))
            if parquet_files:
                try:
                    combined_df = pd.concat([pd.read_parquet(f) for f in parquet_files], ignore_index=True)
                    with TemporaryDirectory() as tmpdir:
                        processor_parquet = Path(tmpdir) / f"{processor_name}.parquet"
                        combined_df.to_parquet(processor_parquet, index=False)
                        hf_api.upload_file(
                            path_or_fileobj=str(processor_parquet),
                            path_in_repo=f"processors/{processor_name}.parquet",
                            repo_id=repo_id,
                            repo_type="dataset",
                        )
                except Exception as exc:
                    HuggingFaceHubMixin._warn_upload_failed(f"processor dataset {processor_name}", exc)

            # NOTE(review): parquet files in nested sub-directories are neither
            # combined above nor uploaded here -- confirm that is intended.
            for artifact_file in sorted(processor_dir.rglob("*")):
                if not artifact_file.is_file() or artifact_file.suffix == ".parquet":
                    continue
                try:
                    relative_path = artifact_file.relative_to(processors_path)
                    hf_api.upload_file(
                        path_or_fileobj=str(artifact_file),
                        path_in_repo=f"processors/{relative_path.as_posix()}",
                        repo_id=repo_id,
                        repo_type="dataset",
                    )
                except Exception as exc:
                    HuggingFaceHubMixin._warn_upload_failed(f"processor artifact {artifact_file}", exc)

    def _upload_dataset_card(
        self: Any,
        repo_id: str,
        token: str | None,
        dataset_df: pd.DataFrame,
    ) -> None:
        """Generate and upload the dataset card to Hugging Face Hub.

        Args:
            repo_id: The ID of the Hugging Face Hub repository.
            token: Hugging Face token for authentication.
            dataset_df: The dataset as a pandas DataFrame.

        Raises:
            ArtifactStorageError: If the instance has no `_analysis` or `_config_builder`.
        """
        import logging

        logger = logging.getLogger(__name__)

        analysis = getattr(self, "_analysis", None)
        config_builder = getattr(self, "_config_builder", None)
        if analysis is None or config_builder is None:
            raise ArtifactStorageError(
                "Cannot generate dataset card: missing analysis or config_builder. "
                "Ensure the class has _analysis and _config_builder attributes."
            )

        # Metadata is optional for the card: an unreadable file is reported and skipped.
        metadata: dict[str, Any] | None = None
        storage = getattr(self, "artifact_storage", None)
        metadata_path = getattr(storage, "metadata_file_path", None)
        if metadata_path is not None and metadata_path.exists():
            try:
                with open(metadata_path, "r", encoding="utf-8") as f:
                    metadata = json.load(f)
            except (OSError, ValueError) as exc:
                logger.warning("Could not read metadata for the dataset card: %s", exc)

        from huggingface_hub import DatasetCardData

        template_variables = _build_card_template_variables(
            dataset_df=dataset_df,
            analysis=analysis,
            config_builder=config_builder,
            metadata=metadata,
            repo_id=repo_id,
        )
        # The template iterates over all_columns, so it must always be defined.
        template_variables.setdefault("all_columns", {})

        # tags is passed both as card metadata and as a template variable so the
        # template can render the list explicitly.
        tags_list = ["synthetic-data", "data-designer", "nemo", "synthetic", "nvidia"]
        card = DataDesignerDatasetCard.from_template(
            card_data=DatasetCardData(
                size_categories=_size_categories_parser(len(dataset_df)),
                tags=tags_list,
            ),
            tags=tags_list,
            **template_variables,
        )

        with TemporaryDirectory() as tmpdir:
            card_path = Path(tmpdir) / "README.md"
            try:
                card.save(filepath=str(card_path))
            except Exception as e:
                logger.error("Error saving dataset card: %s", e)
                logger.error("Template variables keys: %s", list(template_variables.keys()))
                # A None template variable is the usual suspect; list them before re-raising.
                for key, value in template_variables.items():
                    if value is None:
                        logger.warning("Template variable '%s' is None", key)
                raise
            HfApi(token=token).upload_file(
                path_or_fileobj=str(card_path),
                path_in_repo="README.md",
                repo_id=repo_id,
                repo_type="dataset",
            )
'{key}' is None") + raise + hf_api = HfApi(token=token) + hf_api.upload_file( + path_or_fileobj=str(card_path), + path_in_repo="README.md", + repo_id=repo_id, + repo_type="dataset", + ) + + @classmethod + def pull_from_hub( + cls: type[Any], + repo_id: str, + *, + token: str | None = None, + artifact_path: Path | str | None = None, + split: str | None = None, + **kwargs: Any, + ) -> Any: + """Load a dataset and all artifacts from Hugging Face Hub as a DatasetCreationResults object. + + This classmethod downloads all artifacts from the Hugging Face Hub and reconstructs + a DatasetCreationResults object that can be used just like one created from a local + dataset generation run. + + Args: + repo_id: The ID of the Hugging Face Hub repository (e.g., "username/dataset-name"). + token: Hugging Face token for authentication. If None, will check environment + variables HF_TOKEN or HUGGINGFACE_HUB_TOKEN. + artifact_path: Optional path to save downloaded artifacts. If None, a temporary + directory will be used (note: temporary directories are cleaned up when + the object is garbage collected). + split: The split to load from the dataset. If None, the default split will be used. + **kwargs: Additional arguments to pass to `pull_from_hub()` function. + + Returns: + A DatasetCreationResults object containing the dataset, analysis, and all artifacts. 
+ + Example: + ```python + from data_designer.interface.results import DatasetCreationResults + + # Load from hub (uses temporary directory) + results = DatasetCreationResults.pull_from_hub("username/dataset-name") + + # Load to a specific directory + results = DatasetCreationResults.pull_from_hub( + "username/dataset-name", + artifact_path="./downloaded_datasets/my_dataset" + ) + + # Access the dataset and analysis + df = results.load_dataset() + analysis = results.load_analysis() + ``` + """ + import tempfile + + from data_designer.config.config_builder import DataDesignerConfigBuilder + from data_designer.config.models import ModelConfig + from data_designer.engine.dataset_builders.artifact_storage import ArtifactStorage + + # Pull all artifacts from hub using the function + hub_results = pull_from_hub( + repo_id=repo_id, + token=token, + split=split, + include_analysis=True, + include_processors=True, + include_configs=True, + **kwargs, + ) + + # Determine artifact path + if artifact_path is None: + # Use a temporary directory + # Note: The directory will persist as long as the DatasetCreationResults object exists + # Users should provide artifact_path for persistent storage + temp_dir = tempfile.mkdtemp(prefix="data_designer_hub_") + artifact_path = Path(temp_dir) + else: + artifact_path = Path(artifact_path) + artifact_path.mkdir(parents=True, exist_ok=True) + + # Create artifact storage first to get the resolved dataset name + dataset_name = "dataset" + artifact_storage = ArtifactStorage( + artifact_path=artifact_path, + dataset_name=dataset_name, + ) + base_path = artifact_storage.base_dataset_path + base_path.mkdir(parents=True, exist_ok=True) + + # Save main dataset as parquet files + final_dataset_path = artifact_storage.final_dataset_path + final_dataset_path.mkdir(parents=True, exist_ok=True) + hub_results.dataset.to_parquet(final_dataset_path / "data.parquet", index=False) + + # Save metadata if available + if hub_results.metadata: + metadata_path = 
base_path / "metadata.json" + with open(metadata_path, "w") as f: + json.dump(hub_results.metadata, f, indent=2) + + # Save processor datasets and artifacts + if hub_results.processor_datasets: + processors_path = base_path / "processors-files" + processors_path.mkdir(parents=True, exist_ok=True) + for processor_name, processor_df in hub_results.processor_datasets.items(): + processor_dir = processors_path / processor_name + processor_dir.mkdir(parents=True, exist_ok=True) + processor_df.to_parquet(processor_dir / f"{processor_name}.parquet", index=False) + + # Copy processor artifacts if available + if hub_results.processor_artifacts: + processors_path = base_path / "processors-files" + processors_path.mkdir(parents=True, exist_ok=True) + import shutil + + for processor_name, artifact_dir in hub_results.processor_artifacts.items(): + if artifact_dir.exists(): + target_dir = processors_path / processor_name + if target_dir.exists(): + shutil.rmtree(target_dir) + shutil.copytree(artifact_dir, target_dir) + + # Save config files + if hub_results.column_configs: + config_path = base_path / "column_configs.json" + with open(config_path, "w") as f: + json.dump(hub_results.column_configs, f, indent=2) + + if hub_results.model_configs: + config_path = base_path / "model_configs.json" + with open(config_path, "w") as f: + json.dump(hub_results.model_configs, f, indent=2) + + # Reconstruct config builder from config files + config_builder: DataDesignerConfigBuilder | None = None + if hub_results.column_configs and hub_results.model_configs: + # Load model configs + model_configs = [ModelConfig.model_validate(mc) for mc in hub_results.model_configs] + config_builder = DataDesignerConfigBuilder(model_configs=model_configs) + + # Build dynamic mapping from column_type to config class (includes plugins) + def _get_column_config_class_mapping() -> dict[str, type[Any]]: + """Build a mapping from column_type string to config class dynamically.""" + from 
data_designer.config.column_configs import ( + ExpressionColumnConfig, + LLMCodeColumnConfig, + LLMJudgeColumnConfig, + LLMStructuredColumnConfig, + LLMTextColumnConfig, + SamplerColumnConfig, + SeedDatasetColumnConfig, + ValidationColumnConfig, + ) + from data_designer.plugin_manager import PluginManager + + mapping: dict[str, type[Any]] = { + "sampler": SamplerColumnConfig, + "llm_text": LLMTextColumnConfig, + "llm_structured": LLMStructuredColumnConfig, + "llm_code": LLMCodeColumnConfig, + "llm_judge": LLMJudgeColumnConfig, + "expression": ExpressionColumnConfig, + "seed_dataset": SeedDatasetColumnConfig, + "validation": ValidationColumnConfig, + } + + # Add plugin column configs dynamically + plugin_manager = PluginManager() + for plugin in plugin_manager.get_column_generator_plugins(): + mapping[plugin.name] = plugin.config_cls + + return mapping + + column_config_class_mapping = _get_column_config_class_mapping() + + def _load_column_config(col_config_dict: dict[str, Any]) -> Any | None: + """Load a single column config from dict using dynamic class mapping.""" + column_type = col_config_dict.get("column_type") + if not column_type: + return None + + config_class = column_config_class_mapping.get(column_type) + if config_class is None: + # Unknown column type - might be from a plugin or future version + import logging + + logger = logging.getLogger(__name__) + logger.warning( + f"Skipping column config with unknown type '{column_type}': {col_config_dict.get('name', 'unknown')}" + ) + return None + + try: + return config_class.model_validate(col_config_dict) + except Exception as e: + # Skip columns that fail validation + import logging + + logger = logging.getLogger(__name__) + logger.warning( + f"Failed to load column config '{col_config_dict.get('name', 'unknown')}': {e}. Skipping." 
def _load_hub_json_or_none(repo_id: str, filename: str, token: str | None) -> Any | None:
    """Download ``filename`` from a dataset repo and return its parsed JSON content.

    Optional hub files are loaded on a best-effort basis: every failure is logged
    and ``None`` is returned so the caller can continue without the file.

    Args:
        repo_id: The ID of the Hugging Face Hub dataset repository.
        filename: Path of the JSON file inside the repository.
        token: Already-resolved Hugging Face token (may be None).

    Returns:
        The parsed JSON value, or None if the file could not be downloaded or parsed.
    """
    import logging

    logger = logging.getLogger(__name__)
    try:
        local_path = hf_hub_download(
            repo_id=repo_id,
            filename=filename,
            repo_type="dataset",
            token=token,
        )
        with open(local_path, "r", encoding="utf-8") as f:
            return json.load(f)
    except (HfHubHTTPError, FileNotFoundError):
        # The file is optional and may simply not have been pushed.
        logger.debug("'%s' is not available in '%s'; continuing without it.", filename, repo_id)
    except Exception as e:
        # Still best-effort, but leave a trace instead of failing silently.
        logger.warning("Could not load '%s' from '%s': %s", filename, repo_id, e)
    return None


def _flatten_hub_column_configs(raw_column_configs: list[Any]) -> list[Any]:
    """Expand multi-column entries (dicts holding a list under 'columns') into single entries."""
    flattened: list[Any] = []
    for config in raw_column_configs:
        if isinstance(config, dict) and isinstance(config.get("columns"), list):
            # This is a MultiColumnConfig - extract individual column configs
            flattened.extend(config["columns"])
        else:
            # This is a single column config
            flattened.append(config)
    return flattened


def _pull_processor_outputs(
    repo_id: str, token: str | None
) -> tuple[dict[str, pd.DataFrame] | None, dict[str, Path] | None]:
    """Download processor datasets and artifacts stored under ``processors/`` in the repo.

    Args:
        repo_id: The ID of the Hugging Face Hub dataset repository.
        token: Already-resolved Hugging Face token (may be None).

    Returns:
        A ``(processor_datasets, processor_artifacts)`` tuple. Each element is None
        when nothing of that kind could be loaded. Artifact paths point into
        directories created with ``tempfile.mkdtemp``; they are NOT removed
        automatically, so the caller owns their cleanup.
    """
    import logging
    import shutil
    import tempfile

    logger = logging.getLogger(__name__)
    root_prefix = "processors/"
    parquet_suffix = ".parquet"

    try:
        repo_files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=token)
    except Exception as e:
        logger.warning("Could not list files of '%s'; skipping processors: %s", repo_id, e)
        return None, None

    # Group files by processor name. Paths look like
    # "processors/<name>.parquet" or "processors/<name>/<file>".
    processor_groups: dict[str, list[str]] = {}
    for file_path in repo_files:
        if not file_path.startswith(root_prefix):
            continue
        head = file_path[len(root_prefix):].split("/")[0]
        # Strip the suffix only at the end; str.replace would also hit the middle of a name.
        processor_name = head[: -len(parquet_suffix)] if head.endswith(parquet_suffix) else head
        if not processor_name:
            continue
        processor_groups.setdefault(processor_name, []).append(file_path)

    processor_datasets: dict[str, pd.DataFrame] = {}
    processor_artifacts: dict[str, Path] = {}

    for processor_name, files in processor_groups.items():
        parquet_files = [f for f in files if f.endswith(parquet_suffix)]
        if parquet_files:
            try:
                local_path = hf_hub_download(
                    repo_id=repo_id,
                    filename=parquet_files[0],  # Use first parquet file
                    repo_type="dataset",
                    token=token,
                )
                processor_datasets[processor_name] = pd.read_parquet(local_path)
            except Exception as e:
                logger.warning("Could not load processor dataset '%s': %s", processor_name, e)

        other_files = [f for f in files if not f.endswith(parquet_suffix)]
        if not other_files:
            continue

        # mkdtemp (not TemporaryDirectory) so the directory still exists after this
        # function returns and the returned path is actually usable by the caller.
        temp_root = Path(tempfile.mkdtemp(prefix="data_designer_hub_artifacts_"))
        artifact_dir = temp_root / processor_name
        artifact_dir.mkdir(parents=True, exist_ok=True)
        nested_prefix = f"{root_prefix}{processor_name}/"
        copied_any = False

        for artifact_file in other_files:
            # Preserve the relative path structure below the processor directory.
            if artifact_file.startswith(nested_prefix):
                relative_path = artifact_file[len(nested_prefix):]
            else:
                relative_path = artifact_file[len(root_prefix):]
            if not relative_path:
                continue
            try:
                local_path = hf_hub_download(
                    repo_id=repo_id,
                    filename=artifact_file,
                    repo_type="dataset",
                    token=token,
                )
                target_path = artifact_dir / relative_path
                target_path.parent.mkdir(parents=True, exist_ok=True)
                shutil.copy2(local_path, target_path)
                copied_any = True
            except Exception as e:
                logger.warning("Could not download processor artifact '%s': %s", artifact_file, e)

        if copied_any:
            processor_artifacts[processor_name] = artifact_dir
        else:
            # Nothing was downloaded; do not leave an empty temp directory behind.
            shutil.rmtree(temp_root, ignore_errors=True)

    return (processor_datasets or None), (processor_artifacts or None)


def pull_from_hub(
    repo_id: str,
    *,
    token: str | None = None,
    split: str | None = None,
    include_analysis: bool = True,
    include_processors: bool = True,
    include_configs: bool = True,
    **kwargs: Any,
) -> HubDatasetResults:
    """Load a dataset and all associated artifacts from Hugging Face Hub.

    The main dataset is mandatory. Analysis results, processor datasets, processor
    artifacts and configuration files are optional and loaded on a best-effort
    basis: when one of them is missing or unreadable, the problem is logged and the
    corresponding field of the result is left as None.

    Args:
        repo_id: The ID of the Hugging Face Hub repository (e.g., "username/dataset-name").
        token: Hugging Face token for authentication. If None, will check environment
            variables HF_TOKEN or HUGGINGFACE_HUB_TOKEN.
        split: The split to load from the dataset. If None and the hub returns several
            splits, the first one is used.
        include_analysis: Whether to load analysis results. Defaults to True.
        include_processors: Whether to load processor datasets and artifacts. Defaults to True.
        include_configs: Whether to load configuration files. Defaults to True.
        **kwargs: Additional arguments to pass to `datasets.load_dataset()`.

    Returns:
        A HubDatasetResults object containing the dataset and all associated artifacts.
        Paths in ``processor_artifacts`` point into temporary directories that are
        not deleted automatically; copy them elsewhere or remove them when done.

    Raises:
        ValueError: If the repository yields a split mapping without any split.

    Example:
        ```python
        from data_designer.interface.huggingface import pull_from_hub

        # Load a dataset with all artifacts from Hugging Face Hub
        results = pull_from_hub("username/dataset-name")
        df = results.dataset
        analysis = results.analysis
        processor_data = results.processor_datasets

        # Load only the main dataset
        results = pull_from_hub("username/dataset-name", include_analysis=False, include_processors=False)

        # Load a specific split
        results = pull_from_hub("username/dataset-name", split="train")
        ```
    """
    import logging

    from data_designer.config.analysis.dataset_profiler import DatasetProfilerResults

    logger = logging.getLogger(__name__)
    resolved_token = _resolve_hf_token(token)

    # Load main dataset from hub
    hf_dataset = load_dataset(repo_id, split=split, token=resolved_token, **kwargs)

    # A split mapping (DatasetDict or any dict-like fallback) must be narrowed to one split.
    if isinstance(hf_dataset, (DatasetDict, dict)):
        if split is None:
            split = next(iter(hf_dataset.keys()), None)
            if split is None:
                raise ValueError(f"Dataset '{repo_id}' does not contain any split.")
        hf_dataset = hf_dataset[split]

    dataset_df = hf_dataset.to_pandas()

    # Load analysis results if requested
    analysis: DatasetProfilerResults | None = None
    if include_analysis:
        analysis_data = _load_hub_json_or_none(repo_id, "analysis.json", resolved_token)
        if analysis_data is not None:
            try:
                analysis = DatasetProfilerResults.model_validate(analysis_data)
            except Exception as e:
                logger.warning("analysis.json of '%s' could not be validated: %s", repo_id, e)

    # Load processor datasets and artifacts if requested
    processor_datasets: dict[str, pd.DataFrame] | None = None
    processor_artifacts: dict[str, Path] | None = None
    if include_processors:
        processor_datasets, processor_artifacts = _pull_processor_outputs(repo_id, resolved_token)

    # Load configuration files if requested
    metadata: dict[str, Any] | None = None
    column_configs: list[dict[str, Any]] | None = None
    model_configs: list[dict[str, Any]] | None = None
    if include_configs:
        metadata = _load_hub_json_or_none(repo_id, "metadata.json", resolved_token)

        raw_column_configs = _load_hub_json_or_none(repo_id, "column_configs.json", resolved_token)
        if isinstance(raw_column_configs, list):
            column_configs = _flatten_hub_column_configs(raw_column_configs)
        elif raw_column_configs is not None:
            logger.warning("column_configs.json of '%s' is not a list; ignoring it.", repo_id)

        model_configs = _load_hub_json_or_none(repo_id, "model_configs.json", resolved_token)

    return HubDatasetResults(
        dataset=dataset_df,
        analysis=analysis,
        processor_datasets=processor_datasets,
        processor_artifacts=processor_artifacts,
        metadata=metadata,
        column_configs=column_configs,
        model_configs=model_configs,
    )
@dataclass
class HubDatasetResults:
    """Bundle of everything retrieved from a Hugging Face Hub dataset repository.

    Only ``dataset`` is required. Every other field is optional and stays ``None``
    when the corresponding content was not requested or could not be loaded.

    Attributes:
        dataset: The main dataset as a pandas DataFrame.
        analysis: Analysis results if available.
        processor_datasets: Dictionary of processor datasets, keyed by processor name.
        processor_artifacts: Dictionary of paths to processor artifacts, keyed by
            processor name.
        metadata: Metadata dictionary if available.
        column_configs: Column configurations if available.
        model_configs: Model configurations if available.
    """

    # Required payload.
    dataset: pd.DataFrame

    # Optional profiling output.
    analysis: DatasetProfilerResults | None = None

    # Optional processor outputs, both keyed by processor name.
    processor_datasets: dict[str, pd.DataFrame] | None = None
    processor_artifacts: dict[str, Path] | None = None

    # Optional run metadata and raw (JSON-decoded) configuration.
    metadata: dict[str, Any] | None = None
    column_configs: list[dict[str, Any]] | None = None
    model_configs: list[dict[str, Any]] | None = None
@classmethod
def pull_from_hub(
    cls,
    repo_id: str,
    *,
    token: str | None = None,
    artifact_path: Path | str | None = None,
    split: str | None = None,
    **kwargs: Any,
) -> DatasetCreationResults:
    """Load a dataset and all artifacts from Hugging Face Hub as a DatasetCreationResults object.

    This classmethod is a typed entry point that forwards every argument unchanged
    to the `pull_from_hub` classmethod inherited from `HuggingFaceHubMixin`, which
    downloads the artifacts and reconstructs a DatasetCreationResults object that
    can be used like one created from a local dataset generation run.

    Args:
        repo_id: The ID of the Hugging Face Hub repository (e.g., "username/dataset-name").
        token: Hugging Face token for authentication. If None, will check environment
            variables HF_TOKEN or HUGGINGFACE_HUB_TOKEN.
        artifact_path: Optional path to save downloaded artifacts. If None, a temporary
            directory is used. NOTE: the mixin implementation creates that directory
            with `tempfile.mkdtemp()` and does not remove it, so it is not cleaned up
            automatically; pass `artifact_path` for storage you control.
        split: The split to load from the dataset. If None, the default split will be used.
        **kwargs: Additional arguments forwarded to the module-level `pull_from_hub()`
            function, which passes them on to `datasets.load_dataset()`.

    Returns:
        A DatasetCreationResults object containing the dataset, analysis, and all artifacts.

    Raises:
        ArtifactStorageError: Raised by the mixin implementation when no analysis
            results are found in the hub repository.

    Example:
        ```python
        from data_designer.interface.results import DatasetCreationResults

        # Load from hub (uses temporary directory)
        results = DatasetCreationResults.pull_from_hub("username/dataset-name")

        # Load to a specific directory
        results = DatasetCreationResults.pull_from_hub(
            "username/dataset-name",
            artifact_path="./downloaded_datasets/my_dataset"
        )

        # Access the dataset and analysis
        df = results.load_dataset()
        analysis = results.load_analysis()
        ```
    """
    # Delegate to the mixin method using super() to avoid recursion
    return super(DatasetCreationResults, cls).pull_from_hub(
        repo_id, token=token, artifact_path=artifact_path, split=split, **kwargs
    )
+# SPDX-License-Identifier: Apache-2.0 + +"""Integration tests for Hugging Face Hub push/pull functionality.""" + +import json +import tempfile +from unittest.mock import MagicMock, patch + +import pandas as pd +import pytest + +from data_designer.essentials import ( + CategorySamplerParams, + DataDesigner, + DataDesignerConfigBuilder, + LLMTextColumnConfig, + SamplerColumnConfig, + SamplerType, +) +from data_designer.interface.huggingface import pull_from_hub +from data_designer.interface.results import DatasetCreationResults + + +@pytest.fixture +def stub_model_configs(): + """Mock model configs for testing.""" + from data_designer.config.models import InferenceParameters, ModelConfig + + return [ + ModelConfig( + alias="nvidia-text", + model="nvidia/nvidia-nemotron-nano-9b-v2", + provider="nvidia", + inference_parameters=InferenceParameters( + temperature=0.5, + top_p=1.0, + max_tokens=1024, + ), + ) + ] + + +@pytest.fixture +def sample_dataset_config(stub_model_configs): + """Create a sample dataset configuration matching the README example.""" + config_builder = DataDesignerConfigBuilder(model_configs=stub_model_configs) + + # Add a product category + config_builder.add_column( + SamplerColumnConfig( + name="product_category", + sampler_type=SamplerType.CATEGORY, + params=CategorySamplerParams( + values=["Electronics", "Clothing", "Home & Kitchen", "Books"], + ), + ) + ) + + # For integration tests, we'll mock LLM calls, but keep the config structure + # Generate personalized customer reviews (will be mocked in tests) + config_builder.add_column( + LLMTextColumnConfig( + name="review", + model_alias="nvidia-text", + prompt="""Write a brief product review for a {{ product_category }} item you recently purchased.""", + ) + ) + + return config_builder + + +@pytest.fixture +def simple_dataset_config(): + """Create a simple dataset configuration without LLM calls for faster testing.""" + config_builder = DataDesignerConfigBuilder() + + # Add a product category + 
config_builder.add_column( + SamplerColumnConfig( + name="product_category", + sampler_type=SamplerType.CATEGORY, + params=CategorySamplerParams( + values=["Electronics", "Clothing", "Home & Kitchen", "Books"], + ), + ) + ) + + # Add a simple numeric column + from data_designer.config.sampler_params import UniformSamplerParams + + config_builder.add_column( + SamplerColumnConfig( + name="rating", + sampler_type=SamplerType.UNIFORM, + params=UniformSamplerParams(low=1, high=5), + ) + ) + + return config_builder + + +@pytest.mark.integration +@patch("data_designer.interface.huggingface.hub_mixin.Dataset") +@patch("data_designer.interface.huggingface.hub_mixin.HfApi") +@patch("data_designer.interface.huggingface.hub_mixin.load_dataset") +def test_push_and_pull_from_hub_integration( + mock_load_dataset, + mock_hf_api_class, + mock_dataset_class, + simple_dataset_config, + tmp_path, +): + """Integration test: create dataset, push to hub, pull from hub, verify round-trip.""" + # Initialize DataDesigner + data_designer = DataDesigner() + + # Create a small dataset (10 records for testing) - using simple config without LLM + num_records = 10 + results = data_designer.create(config_builder=simple_dataset_config, num_records=num_records) + + # Verify dataset was created + original_df = results.load_dataset() + assert len(original_df) == num_records + assert "product_category" in original_df.columns + assert "rating" in original_df.columns + + # Get original analysis + original_analysis = results.load_analysis() + + # Mock Hugging Face Hub interactions + mock_hf_dataset = MagicMock() + mock_dataset_class.from_pandas.return_value = mock_hf_dataset + + mock_hf_api = MagicMock() + mock_hf_api_class.return_value = mock_hf_api + + # Mock the uploaded files for pull_from_hub + uploaded_files = {} + + def mock_upload_file(**kwargs): + """Capture uploaded files.""" + path_or_fileobj = kwargs.get("path_or_fileobj") + path_in_repo = kwargs.get("path_in_repo") + if 
isinstance(path_or_fileobj, str): + with open(path_or_fileobj, "rb") as f: + uploaded_files[path_in_repo] = f.read() + else: + uploaded_files[path_in_repo] = path_or_fileobj.read() + + mock_hf_api.upload_file.side_effect = mock_upload_file + + # Mock load_dataset for pull_from_hub + def mock_load_dataset_for_pull(repo_id, split=None, token=None, **kwargs): + """Mock loading dataset from hub.""" + # Return the original dataset + mock_hf_dataset_for_pull = MagicMock() + mock_hf_dataset_for_pull.to_pandas.return_value = original_df + return mock_hf_dataset_for_pull + + mock_load_dataset.side_effect = mock_load_dataset_for_pull + + # Mock hf_hub_download for pull_from_hub + def mock_hf_hub_download(repo_id, filename, repo_type, token=None): + """Mock downloading files from hub.""" + if filename in uploaded_files: + # Create a temporary file with the uploaded content + temp_file = tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".json") + if filename.endswith(".json"): + content = uploaded_files[filename].decode("utf-8") if isinstance(uploaded_files[filename], bytes) else uploaded_files[filename] + temp_file.write(content) + temp_file.close() + return temp_file.name + raise FileNotFoundError(f"File {filename} not found") + + # Mock list_repo_files + def mock_list_repo_files(repo_id, repo_type, token=None): + """Mock listing repo files.""" + return list(uploaded_files.keys()) + + # Push to hub + repo_id = "test-user/test-dataset" + with patch("data_designer.interface.huggingface.hub_mixin.hf_hub_download", side_effect=mock_hf_hub_download), patch( + "data_designer.interface.huggingface.hub_mixin.list_repo_files", side_effect=mock_list_repo_files + ): + results.push_to_hub(repo_id, token="test-token", generate_card=True) + + # Verify dataset was pushed + mock_dataset_class.from_pandas.assert_called_once() + mock_hf_dataset.push_to_hub.assert_called_once_with(repo_id, token="test-token") + + # Verify analysis.json was uploaded + assert "analysis.json" in 
uploaded_files + analysis_data = json.loads(uploaded_files["analysis.json"].decode("utf-8")) + assert analysis_data["num_records"] == num_records + + # Verify README.md was uploaded + assert "README.md" in uploaded_files + readme_content = uploaded_files["README.md"].decode("utf-8") + assert "NeMo Data Designer" in readme_content + assert repo_id in readme_content + + # Pull from hub + pulled_results = DatasetCreationResults.pull_from_hub( + repo_id=repo_id, + token="test-token", + artifact_path=tmp_path / "pulled_artifacts", + ) + + # Verify pulled dataset matches original + pulled_df = pulled_results.load_dataset() + pd.testing.assert_frame_equal(pulled_df, original_df, check_dtype=False) + + # Verify pulled analysis matches original + pulled_analysis = pulled_results.load_analysis() + assert pulled_analysis.num_records == original_analysis.num_records + assert pulled_analysis.target_num_records == original_analysis.target_num_records + assert len(pulled_analysis.column_statistics) == len(original_analysis.column_statistics) + + # Verify config builder was reconstructed + pulled_config_builder = pulled_results._config_builder + assert pulled_config_builder is not None + pulled_column_configs = pulled_config_builder.get_column_configs() + assert len(pulled_column_configs) == 2 # product_category and rating + + # Verify artifact storage structure exists + assert pulled_results.artifact_storage.base_dataset_path.exists() + assert (pulled_results.artifact_storage.base_dataset_path / "parquet-files").exists() + + +@pytest.mark.integration +@patch("data_designer.interface.huggingface.hub_mixin.Dataset") +@patch("data_designer.interface.huggingface.hub_mixin.HfApi") +@patch("data_designer.interface.huggingface.hub_mixin.load_dataset") +def test_push_and_pull_with_pull_from_hub_function( + mock_load_dataset, + mock_hf_api_class, + mock_dataset_class, + simple_dataset_config, +): + """Integration test: create dataset, push to hub, pull using pull_from_hub function.""" + # 
Initialize DataDesigner + data_designer = DataDesigner() + + # Create a small dataset - using simple config without LLM + num_records = 5 + results = data_designer.create(config_builder=simple_dataset_config, num_records=num_records) + + original_df = results.load_dataset() + + # Mock Hugging Face Hub interactions + mock_hf_dataset = MagicMock() + mock_dataset_class.from_pandas.return_value = mock_hf_dataset + + mock_hf_api = MagicMock() + mock_hf_api_class.return_value = mock_hf_api + + uploaded_files = {} + + def mock_upload_file(**kwargs): + """Capture uploaded files.""" + path_or_fileobj = kwargs.get("path_or_fileobj") + path_in_repo = kwargs.get("path_in_repo") + if isinstance(path_or_fileobj, str): + with open(path_or_fileobj, "rb") as f: + uploaded_files[path_in_repo] = f.read() + else: + uploaded_files[path_in_repo] = path_or_fileobj.read() + + mock_hf_api.upload_file.side_effect = mock_upload_file + + # Mock load_dataset for pull_from_hub + def mock_load_dataset_for_pull(repo_id, split=None, token=None, **kwargs): + """Mock loading dataset from hub.""" + mock_hf_dataset_for_pull = MagicMock() + mock_hf_dataset_for_pull.to_pandas.return_value = original_df + return mock_hf_dataset_for_pull + + mock_load_dataset.side_effect = mock_load_dataset_for_pull + + # Mock hf_hub_download for pull_from_hub + def mock_hf_hub_download(repo_id, filename, repo_type, token=None): + """Mock downloading files from hub.""" + if filename in uploaded_files: + temp_file = tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".json") + if filename.endswith(".json"): + content = uploaded_files[filename].decode("utf-8") if isinstance(uploaded_files[filename], bytes) else uploaded_files[filename] + temp_file.write(content) + temp_file.close() + return temp_file.name + raise FileNotFoundError(f"File {filename} not found") + + # Mock list_repo_files + def mock_list_repo_files(repo_id, repo_type, token=None): + """Mock listing repo files.""" + return list(uploaded_files.keys()) + 
+ # Push to hub + repo_id = "test-user/test-dataset-2" + with patch("data_designer.interface.huggingface.hub_mixin.hf_hub_download", side_effect=mock_hf_hub_download), patch( + "data_designer.interface.huggingface.hub_mixin.list_repo_files", side_effect=mock_list_repo_files + ): + results.push_to_hub(repo_id, token="test-token", generate_card=True) + + # Pull using pull_from_hub function + hub_results = pull_from_hub( + repo_id=repo_id, + token="test-token", + include_analysis=True, + include_configs=True, + ) + + # Verify pulled dataset matches original + pd.testing.assert_frame_equal(hub_results.dataset, original_df, check_dtype=False) + + # Verify analysis was loaded + assert hub_results.analysis is not None + assert hub_results.analysis.num_records == num_records + + # Verify configs were loaded + assert hub_results.column_configs is not None + assert len(hub_results.column_configs) == 2 + + +@pytest.mark.integration +def test_real_push_to_hub(simple_dataset_config): + """Real integration test: create dataset and push to actual Hugging Face Hub.""" + import os + + # Only run if HF_TOKEN is set + token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_HUB_TOKEN") + if not token: + pytest.skip("HF_TOKEN or HUGGINGFACE_HUB_TOKEN not set, skipping real push test") + + # Initialize DataDesigner + data_designer = DataDesigner() + + # Create a small dataset - using simple config without LLM + num_records = 1 + results = data_designer.create(config_builder=simple_dataset_config, num_records=num_records) + + # Verify dataset was created + original_df = results.load_dataset() + assert len(original_df) == num_records + assert "product_category" in original_df.columns + assert "rating" in original_df.columns + + # Push to actual Hugging Face Hub + repo_id = "davidberenstein1957/datadesigner-test" + print(f"\n🚀 Pushing dataset to {repo_id}...") + results.push_to_hub(repo_id, token=token, generate_card=True) + print(f"✅ Successfully pushed dataset to {repo_id}!") + From 
0617aa02b052c14fd67fdfc347c1ad3195bbf84f Mon Sep 17 00:00:00 2001 From: davidberenstein1957 Date: Sat, 13 Dec 2025 09:05:44 +0100 Subject: [PATCH 2/5] feat: add metadata sanitization for file paths in HuggingFaceHubMixin - Introduced `_sanitize_metadata_file_paths` method to convert local file paths in metadata to remote paths suitable for Hugging Face Hub. - Updated existing metadata handling to utilize the new sanitization method, ensuring consistent file path formatting. - Enhanced comments for clarity on metadata processing and artifact uploading. --- .../interface/huggingface/hub_mixin.py | 96 ++++++++++++------- 1 file changed, 62 insertions(+), 34 deletions(-) diff --git a/src/data_designer/interface/huggingface/hub_mixin.py b/src/data_designer/interface/huggingface/hub_mixin.py index b3117abf4..711bb028f 100644 --- a/src/data_designer/interface/huggingface/hub_mixin.py +++ b/src/data_designer/interface/huggingface/hub_mixin.py @@ -335,6 +335,7 @@ def push_to_hub( hf_dataset.push_to_hub(repo_id, token=resolved_token, **kwargs) # Push additional artifacts (analysis, processor datasets, configs) + # Pass the repo_id so we can list actual files for metadata sanitization self._upload_additional_artifacts(repo_id, resolved_token) # Generate and upload dataset card if requested @@ -352,6 +353,61 @@ def _resolve_token(self, token: str | None) -> str | None: """ return _resolve_hf_token(token) + def _sanitize_metadata_file_paths(self, metadata: dict[str, Any]) -> dict[str, Any]: + """Sanitize file paths in metadata by converting local paths to remote paths. + + Args: + metadata: Metadata dictionary that may contain file_paths. + + Returns: + Metadata dictionary with sanitized file paths. 
+ """ + if "file_paths" not in metadata or not isinstance(metadata["file_paths"], list): + return metadata + + sanitized_paths = [] + base_path = self.artifact_storage.base_dataset_path + + for file_path in metadata["file_paths"]: + path_str = str(file_path) + path_obj = Path(path_str) + + # Try to get relative path from base_dataset_path + try: + if path_obj.is_absolute(): + try: + relative_path = path_obj.relative_to(base_path) + remote_path = f"data/{relative_path.as_posix()}" + sanitized_paths.append(remote_path) + continue + except ValueError: + # Path is not relative to base_path, try fallback + pass + except Exception: + # If Path operations fail, try string-based extraction + pass + + # Fallback: extract directory structure from path string + if "parquet-files" in path_str: + idx = path_str.find("parquet-files") + if idx != -1: + remaining = path_str[idx + len("parquet-files") :].lstrip("/\\") + sanitized_paths.append(f"data/parquet-files/{remaining}") + else: + sanitized_paths.append(f"data/parquet-files/{path_obj.name}") + else: + sanitized_paths.append(f"data/{path_obj.name}") + + if sanitized_paths: + metadata = metadata.copy() + metadata["file_paths"] = sanitized_paths + else: + # If no paths could be sanitized, remove file_paths + metadata = metadata.copy() + metadata.pop("file_paths", None) + + return metadata + def _upload_additional_artifacts( self: Any, repo_id: str, @@ -409,38 +465,8 @@ def _upload_additional_artifacts( with open(metadata_path, "r") as f: metadata = json.load(f) - # Sanitize metadata: convert local file paths to Hugging Face Hub relative paths - if "file_paths" in metadata and isinstance(metadata["file_paths"], list): - # Convert absolute local paths to relative paths within the HF repo - # Hugging Face datasets library manages the file structure when pushing - # Files are typically stored as data/train-XXXXX-of-YYYYY.parquet - # Since we don't know the exact naming ahead of time, we use a pattern - # that matches the HF datasets 
structure - sanitized_paths = [] - num_files = len(metadata["file_paths"]) - for idx, file_path in enumerate(metadata["file_paths"]): - path_obj = Path(file_path) - # Hugging Face uses format: data/train-XXXXX-of-YYYYY.parquet - # where XXXXX is zero-padded file index and YYYYY is total files - if "batch_" in path_obj.name: - try: - # Extract batch number from filename - batch_num = path_obj.stem.split("_")[-1] - # Format as HF datasets style: train-00000-of-00001.parquet - sanitized_paths.append( - f"data/train-{batch_num.zfill(5)}-of-{str(num_files).zfill(5)}.parquet" - ) - except Exception: - # Fallback: use generic pattern - sanitized_paths.append( - f"data/train-{str(idx).zfill(5)}-of-{str(num_files).zfill(5)}.parquet" - ) - else: - # Use generic HF datasets naming pattern - sanitized_paths.append( - f"data/train-{str(idx).zfill(5)}-of-{str(num_files).zfill(5)}.parquet" - ) - metadata["file_paths"] = sanitized_paths + # Sanitize metadata: convert local file paths to remote Hugging Face Hub paths + metadata = self._sanitize_metadata_file_paths(metadata) # Write sanitized metadata to temp file and upload with TemporaryDirectory() as tmpdir: @@ -564,7 +590,7 @@ def _upload_dataset_card( "Ensure the class has _analysis and _config_builder attributes." 
) - # Load metadata if available + # Load metadata if available and sanitize file paths metadata: dict[str, Any] | None = None if hasattr(self, "artifact_storage") and hasattr(self.artifact_storage, "metadata_file_path"): metadata_path = self.artifact_storage.metadata_file_path @@ -572,6 +598,8 @@ def _upload_dataset_card( try: with open(metadata_path, "r") as f: metadata = json.load(f) + # Sanitize file paths for the dataset card + metadata = self._sanitize_metadata_file_paths(metadata) except Exception: # If metadata can't be loaded, continue without it pass @@ -594,7 +622,7 @@ def _upload_dataset_card( # Create card using DatasetCard.from_template with card_data and template_variables # DataDesignerDatasetCard extends DatasetCard and uses default_template_path # Unpack template_variables as kwargs for the template - tags_list = ["synthetic-data", "data-designer", "nemo", "synthetic", "nvidia"] + tags_list = ["datadesigner", "synthetic"] card = DataDesignerDatasetCard.from_template( card_data=DatasetCardData( size_categories=_size_categories_parser(len(dataset_df)), From a6a0b9d90753223bf1926df51351f5d8861ce02b Mon Sep 17 00:00:00 2001 From: davidberenstein1957 Date: Wed, 17 Dec 2025 08:54:17 +0100 Subject: [PATCH 3/5] feat: integrate Hugging Face Hub client and dataset card functionality - Introduced `HuggingFaceHubClient` for managing dataset interactions with Hugging Face Hub, including pushing and pulling datasets. - Replaced `HuggingFaceHubMixin` with the new client in `DatasetCreationResults` for improved structure and clarity. - Added `DataDesignerDatasetCard` class to create custom dataset cards for datasets pushed to the hub. - Implemented methods for reconstructing dataset creation results from hub data, enhancing usability and integration. - Updated tests to cover new functionality and ensure robust integration with Hugging Face Hub. 
--- .../config/utils/visualization.py | 7 +- .../integrations/huggingface/__init__.py | 8 + .../integrations/huggingface/client.py | 1062 +++++++++++++++ .../huggingface/dataset_card.py | 2 +- .../huggingface/dataset_card_template.md | 21 +- .../huggingface/hub_results.py | 0 .../huggingface/reconstruction.py | 272 ++++ .../interface/huggingface/__init__.py | 16 +- .../interface/huggingface/hub_mixin.py | 1146 ----------------- src/data_designer/interface/results.py | 76 +- tests/integrations/huggingface/test_client.py | 430 +++++++ tests/interface/test_hub_integration.py | 28 +- tests/interface/test_results.py | 2 +- 13 files changed, 1890 insertions(+), 1180 deletions(-) create mode 100644 src/data_designer/integrations/huggingface/__init__.py create mode 100644 src/data_designer/integrations/huggingface/client.py rename src/data_designer/{interface => integrations}/huggingface/dataset_card.py (86%) rename src/data_designer/{interface => integrations}/huggingface/dataset_card_template.md (91%) rename src/data_designer/{interface => integrations}/huggingface/hub_results.py (100%) create mode 100644 src/data_designer/integrations/huggingface/reconstruction.py delete mode 100644 src/data_designer/interface/huggingface/hub_mixin.py create mode 100644 tests/integrations/huggingface/test_client.py diff --git a/src/data_designer/config/utils/visualization.py b/src/data_designer/config/utils/visualization.py index 1f843c86b..9d21169fd 100644 --- a/src/data_designer/config/utils/visualization.py +++ b/src/data_designer/config/utils/visualization.py @@ -120,10 +120,15 @@ def display_sample_record( else: processor_data_to_display[processor] = self.processor_artifacts[processor] + # Use property if available, otherwise fall back to protected attribute + try: + config_builder = self.config_builder + except AttributeError: + config_builder = self._config_builder display_sample_record( record=record, processor_data_to_display=processor_data_to_display, - 
config_builder=self._config_builder, + config_builder=config_builder, background_color=background_color, syntax_highlighting_theme=syntax_highlighting_theme, hide_seed_columns=hide_seed_columns, diff --git a/src/data_designer/integrations/huggingface/__init__.py b/src/data_designer/integrations/huggingface/__init__.py new file mode 100644 index 000000000..a379d8a18 --- /dev/null +++ b/src/data_designer/integrations/huggingface/__init__.py @@ -0,0 +1,8 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +from data_designer.integrations.huggingface.client import HuggingFaceHubClient, resolve_hf_token +from data_designer.integrations.huggingface.hub_results import HubDatasetResults +from data_designer.integrations.huggingface.reconstruction import reconstruct_dataset_creation_results + +__all__ = ["HuggingFaceHubClient", "HubDatasetResults", "resolve_hf_token", "reconstruct_dataset_creation_results"] diff --git a/src/data_designer/integrations/huggingface/client.py b/src/data_designer/integrations/huggingface/client.py new file mode 100644 index 000000000..c197a239d --- /dev/null +++ b/src/data_designer/integrations/huggingface/client.py @@ -0,0 +1,1062 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +import json +import logging +import shutil +from pathlib import Path +from tempfile import TemporaryDirectory +from typing import Any, Protocol + +import pandas as pd +from datasets import Dataset, DatasetDict, load_dataset +from huggingface_hub import DatasetCardData, HfApi, get_token, hf_hub_download, list_repo_files +from huggingface_hub.utils import HfHubHTTPError + +from data_designer.config.analysis.dataset_profiler import DatasetProfilerResults +from data_designer.config.column_types import DataDesignerColumnType, get_column_display_order +from data_designer.config.config_builder import DataDesignerConfigBuilder +from data_designer.engine.analysis.utils.column_statistics_calculations import ( + convert_pyarrow_dtype_to_simple_dtype, +) +from data_designer.engine.dataset_builders.errors import ArtifactStorageError +from data_designer.integrations.huggingface.dataset_card import DataDesignerDatasetCard +from data_designer.integrations.huggingface.hub_results import HubDatasetResults + +logger = logging.getLogger(__name__) + + +class HasDataset(Protocol): + """Protocol for classes that have a load_dataset method.""" + + def load_dataset(self) -> pd.DataFrame: ... + + +class HasArtifactStorage(Protocol): + """Protocol for classes that have artifact_storage with metadata_file_path.""" + + @property + def artifact_storage(self) -> Any: ... + + +def resolve_hf_token(token: str | None) -> str | None: + """Resolve the Hugging Face token from parameter or huggingface_hub. + + This function tries to resolve a token in the following order: + 1. Token provided as parameter + 2. huggingface_hub's get_token() (checks environment variables, cache, config file, etc.) + + Args: + token: Token provided as parameter. + + Returns: + Resolved token or None if not found. 
+ """ + if token is not None: + return token + + try: + token = get_token() + if token: + return token + except Exception: + pass + + return None + + +def parse_size_category(num_records: int) -> str: + """Parse dataset size into Hugging Face size category. + + Uses the same category names as Argilla's size_categories_parser. + + Args: + num_records: Number of records in the dataset. + + Returns: + Size category string matching Hugging Face format (e.g., "n<1K", "1K1T" + + +def pydantic_to_dict(obj: Any) -> dict[str, Any]: + """Convert a Pydantic model to a dict, handling enum fields properly. + + Args: + obj: Pydantic model instance. + + Returns: + Dictionary representation of the object. + """ + if not hasattr(obj, "model_dump"): + return obj + + result = obj.model_dump(mode="json") + for key in ["column_type", "sampler_type"]: + if key not in result: + continue + value = result[key] + if isinstance(value, dict) and "value" in value: + continue + if hasattr(value, "value"): + result[key] = {"value": value.value} + elif isinstance(value, str): + result[key] = {"value": value} + return result + + +class HuggingFaceHubClient: + """Client for pushing and pulling datasets to/from Hugging Face Hub. + + This class encapsulates all Hugging Face Hub operations and can be composed + into other classes to provide hub functionality without using mixins. + """ + + def __init__( + self, + dataset_provider: HasDataset, + artifact_storage_provider: HasArtifactStorage | None = None, + analysis: DatasetProfilerResults | None = None, + config_builder: DataDesignerConfigBuilder | None = None, + ) -> None: + """Initialize the Hugging Face Hub client. + + Args: + dataset_provider: Object that provides the dataset via load_dataset(). + artifact_storage_provider: Object that provides artifact storage. + analysis: Optional analysis results for dataset card generation. + config_builder: Optional config builder for dataset card generation. 
+ """ + self._dataset_provider = dataset_provider + self._artifact_storage_provider = artifact_storage_provider + self._analysis = analysis + self._config_builder = config_builder + + def push_to_hub( + self, + repo_id: str, + *, + token: str | None = None, + generate_card: bool = True, + **kwargs: Any, + ) -> None: + """Push the dataset to Hugging Face Hub. + + This method converts the pandas DataFrame to a HuggingFace Dataset, pushes it to + the Hugging Face Hub, and optionally generates and uploads a dataset card. + + Args: + repo_id: The ID of the Hugging Face Hub repository (e.g., "username/dataset-name"). + token: Hugging Face token for authentication. If None, will check environment + variables HF_TOKEN or HUGGINGFACE_HUB_TOKEN. + generate_card: Whether to generate and upload a dataset card. Defaults to True. + **kwargs: Additional arguments to pass to `dataset.push_to_hub()`. + + Raises: + ArtifactStorageError: If there's an error loading the dataset or metadata. + """ + resolved_token = resolve_hf_token(token) + dataset_df = self._dataset_provider.load_dataset() + hf_dataset = Dataset.from_pandas(dataset_df) + hf_dataset.push_to_hub(repo_id, token=resolved_token, **kwargs) + + if self._artifact_storage_provider: + self._upload_additional_artifacts(repo_id, resolved_token) + + if generate_card: + self._upload_dataset_card(repo_id, resolved_token, dataset_df) + + def _upload_additional_artifacts( + self, + repo_id: str, + token: str | None, + ) -> None: + """Upload additional artifacts to Hugging Face Hub. + + This includes: + - Analysis results (as JSON) + - Processor datasets (as parquet files) + - Processor artifacts (directories) + - Configuration files (column_configs.json, model_configs.json) + + Args: + repo_id: The ID of the Hugging Face Hub repository. + token: Hugging Face token for authentication. 
+ """ + if not self._artifact_storage_provider: + return + + hf_api = HfApi(token=token) + artifact_storage = self._artifact_storage_provider.artifact_storage + + self._upload_analysis(hf_api, repo_id) + self._upload_processor_artifacts(hf_api, repo_id, artifact_storage) + self._upload_metadata(hf_api, repo_id, artifact_storage) + self._upload_config_files(hf_api, repo_id, artifact_storage) + + def _upload_analysis(self, hf_api: HfApi, repo_id: str) -> None: + """Upload analysis results as JSON. + + Args: + hf_api: Hugging Face API client. + repo_id: The ID of the Hugging Face Hub repository. + """ + if self._analysis is None: + return + + try: + analysis_json = self._analysis.model_dump(mode="json") + with TemporaryDirectory() as tmpdir: + analysis_path = Path(tmpdir) / "analysis.json" + with open(analysis_path, "w") as f: + json.dump(analysis_json, f, indent=2, default=str) + hf_api.upload_file( + path_or_fileobj=str(analysis_path), + path_in_repo="analysis.json", + repo_id=repo_id, + repo_type="dataset", + ) + except Exception as e: + logger.warning(f"Failed to upload analysis results: {e}") + + def _upload_processor_artifacts( + self, + hf_api: HfApi, + repo_id: str, + artifact_storage: Any, + ) -> None: + """Upload processor datasets and artifacts. + + Args: + hf_api: Hugging Face API client. + repo_id: The ID of the Hugging Face Hub repository. + artifact_storage: Artifact storage object. 
+ """ + if not hasattr(artifact_storage, "processors_outputs_path"): + return + + processors_path = artifact_storage.processors_outputs_path + if not processors_path.exists(): + return + + for processor_dir in processors_path.iterdir(): + if not processor_dir.is_dir(): + continue + processor_name = processor_dir.name + self._upload_processor_dataset(hf_api, repo_id, processor_dir, processor_name) + self._upload_processor_files(hf_api, repo_id, processors_path, processor_dir, processor_name) + + def _upload_processor_dataset( + self, + hf_api: HfApi, + repo_id: str, + processor_dir: Path, + processor_name: str, + ) -> None: + """Upload a processor dataset as a parquet file. + + Args: + hf_api: Hugging Face API client. + repo_id: The ID of the Hugging Face Hub repository. + processor_dir: Directory containing the processor files. + processor_name: Name of the processor. + """ + parquet_files = list(processor_dir.glob("*.parquet")) + if not parquet_files: + return + + try: + dfs = [pd.read_parquet(f) for f in parquet_files] + combined_df = pd.concat(dfs, ignore_index=True) + + with TemporaryDirectory() as tmpdir: + processor_parquet = Path(tmpdir) / f"{processor_name}.parquet" + combined_df.to_parquet(processor_parquet, index=False) + hf_api.upload_file( + path_or_fileobj=str(processor_parquet), + path_in_repo=f"processors/{processor_name}.parquet", + repo_id=repo_id, + repo_type="dataset", + ) + except Exception as e: + logger.warning(f"Failed to upload processor dataset {processor_name}: {e}") + + def _upload_processor_files( + self, + hf_api: HfApi, + repo_id: str, + processors_path: Path, + processor_dir: Path, + processor_name: str, + ) -> None: + """Upload non-parquet files from a processor directory. + + Args: + hf_api: Hugging Face API client. + repo_id: The ID of the Hugging Face Hub repository. + processors_path: Base path for all processors. + processor_dir: Directory containing the processor files. + processor_name: Name of the processor. 
+ """ + for artifact_file in processor_dir.rglob("*"): + if not artifact_file.is_file() or artifact_file.suffix == ".parquet": + continue + try: + relative_path = artifact_file.relative_to(processors_path) + hf_api.upload_file( + path_or_fileobj=str(artifact_file), + path_in_repo=f"processors/{relative_path.as_posix()}", + repo_id=repo_id, + repo_type="dataset", + ) + except Exception as e: + logger.warning(f"Failed to upload processor artifact {artifact_file}: {e}") + + def _upload_metadata( + self, + hf_api: HfApi, + repo_id: str, + artifact_storage: Any, + ) -> None: + """Upload metadata file with sanitized file paths. + + Args: + hf_api: Hugging Face API client. + repo_id: The ID of the Hugging Face Hub repository. + artifact_storage: Artifact storage object. + """ + if not hasattr(artifact_storage, "metadata_file_path"): + return + + metadata_path = artifact_storage.metadata_file_path + if not metadata_path.exists(): + return + + try: + with open(metadata_path, "r") as f: + metadata = json.load(f) + + sanitized_metadata = self._sanitize_metadata_file_paths(metadata, artifact_storage) + + with TemporaryDirectory() as tmpdir: + sanitized_metadata_path = Path(tmpdir) / "metadata.json" + with open(sanitized_metadata_path, "w") as f: + json.dump(sanitized_metadata, f, indent=2, default=str) + hf_api.upload_file( + path_or_fileobj=str(sanitized_metadata_path), + path_in_repo="metadata.json", + repo_id=repo_id, + repo_type="dataset", + ) + except Exception as e: + logger.warning(f"Failed to upload metadata: {e}") + + def _sanitize_metadata_file_paths(self, metadata: dict[str, Any], artifact_storage: Any) -> dict[str, Any]: + """Sanitize file paths in metadata by converting local paths to remote paths. + + Args: + metadata: Metadata dictionary that may contain file_paths. + artifact_storage: Artifact storage object. + + Returns: + Metadata dictionary with sanitized file paths. 
+ """ + if "file_paths" not in metadata or not isinstance(metadata["file_paths"], list): + return metadata + + sanitized_paths = [] + base_path = artifact_storage.base_dataset_path + + for file_path in metadata["file_paths"]: + path_obj = Path(str(file_path)) + sanitized = None + + if path_obj.is_absolute(): + try: + relative_path = path_obj.relative_to(base_path) + sanitized = f"data/{relative_path.as_posix()}" + except ValueError: + pass + + if not sanitized: + path_str = str(file_path) + if "parquet-files" in path_str: + idx = path_str.find("parquet-files") + remaining = path_str[idx + len("parquet-files") :].lstrip("/\\") if idx != -1 else path_obj.name + sanitized = f"data/parquet-files/{remaining}" + else: + sanitized = f"data/{path_obj.name}" + + sanitized_paths.append(sanitized) + + result = metadata.copy() + if sanitized_paths: + result["file_paths"] = sanitized_paths + else: + result.pop("file_paths", None) + return result + + def _upload_config_files( + self, + hf_api: HfApi, + repo_id: str, + artifact_storage: Any, + ) -> None: + """Upload configuration files (column_configs.json, model_configs.json). + + Args: + hf_api: Hugging Face API client. + repo_id: The ID of the Hugging Face Hub repository. + artifact_storage: Artifact storage object. + """ + if not hasattr(artifact_storage, "base_dataset_path"): + return + + base_path = artifact_storage.base_dataset_path + config_files = ["column_configs.json", "model_configs.json"] + for config_file in config_files: + config_path = base_path / config_file + if config_path.exists(): + try: + hf_api.upload_file( + path_or_fileobj=str(config_path), + path_in_repo=config_file, + repo_id=repo_id, + repo_type="dataset", + ) + except Exception as e: + logger.warning(f"Failed to upload {config_file}: {e}") + + def _upload_dataset_card( + self, + repo_id: str, + token: str | None, + dataset_df: pd.DataFrame, + ) -> None: + """Generate and upload the dataset card to Hugging Face Hub. 
+ + Args: + repo_id: The ID of the Hugging Face Hub repository. + token: Hugging Face token for authentication. + dataset_df: The dataset as a pandas DataFrame. + + Raises: + ArtifactStorageError: If analysis or config_builder is missing. + """ + if self._analysis is None or self._config_builder is None: + raise ArtifactStorageError( + "Cannot generate dataset card: missing analysis or config_builder. " + "Ensure the client was initialized with analysis and config_builder." + ) + + metadata = self._load_metadata_for_card() + template_variables = self._build_card_template_variables( + dataset_df=dataset_df, + analysis=self._analysis, + config_builder=self._config_builder, + metadata=metadata, + repo_id=repo_id, + ) + + card = self._create_dataset_card(dataset_df, template_variables) + self._save_and_upload_card(card, repo_id, token) + + def _load_metadata_for_card(self) -> dict[str, Any] | None: + """Load and sanitize metadata for dataset card generation. + + Returns: + Sanitized metadata dictionary or None if not available. + """ + if not self._artifact_storage_provider: + return None + + artifact_storage = self._artifact_storage_provider.artifact_storage + if not hasattr(artifact_storage, "metadata_file_path"): + return None + + metadata_path = artifact_storage.metadata_file_path + if not metadata_path.exists(): + return None + + try: + with open(metadata_path, "r") as f: + metadata = json.load(f) + return self._sanitize_metadata_file_paths(metadata, artifact_storage) + except Exception: + return None + + def _build_card_template_variables( + self, + dataset_df: pd.DataFrame, + analysis: DatasetProfilerResults, + config_builder: DataDesignerConfigBuilder, + metadata: dict[str, Any] | None, + repo_id: str, + ) -> dict[str, Any]: + """Build template variables for the dataset card. + + Args: + dataset_df: The dataset as a pandas DataFrame. + analysis: Profiling analysis results. + config_builder: Configuration builder. + metadata: Optional metadata dictionary. 
+ repo_id: Repository ID. + + Returns: + Dictionary of template variables. + """ + column_configs = config_builder.get_column_configs() + column_names = set(dataset_df.columns) + + all_columns = self._build_column_info(dataset_df, column_names) + unconfigured_columns = self._find_unconfigured_columns(dataset_df, column_names, column_configs) + sample_records = self._build_sample_records(dataset_df) + config_types = self._build_config_types_summary(column_configs) + column_stats_by_type = self._build_column_stats_by_type(analysis) + + return { + "size_categories": parse_size_category(len(dataset_df)), + "num_records": len(dataset_df), + "target_num_records": analysis.target_num_records, + "percent_complete": analysis.percent_complete, + "num_columns": len(dataset_df.columns), + "repo_id": repo_id, + "metadata": metadata or {}, + "column_configs": [pydantic_to_dict(col_config) for col_config in column_configs] if column_configs else [], + "unconfigured_columns": unconfigured_columns, + "all_columns": all_columns, + "column_statistics": ( + [pydantic_to_dict(stat) for stat in analysis.column_statistics] if analysis.column_statistics else [] + ), + "column_stats_by_type": column_stats_by_type, + "sorted_column_types": self._sort_column_types(column_stats_by_type), + "num_samples": len(sample_records), + "sample_records": sample_records, + "config_types": config_types, + } + + def _build_column_info(self, dataset_df: pd.DataFrame, column_names: set[str]) -> dict[str, str]: + """Build column information dictionary with normalized types. + + Args: + dataset_df: The dataset as a pandas DataFrame. + column_names: Set of column names. + + Returns: + Dictionary mapping column names to their normalized types. 
+ """ + all_columns: dict[str, str] = {} + for col_name in sorted(column_names): + try: + normalized_type = convert_pyarrow_dtype_to_simple_dtype(dataset_df[col_name].dtype.pyarrow_dtype) + except Exception: + normalized_type = str(dataset_df[col_name].dtype) + all_columns[col_name] = normalized_type + return all_columns + + def _find_unconfigured_columns( + self, + dataset_df: pd.DataFrame, + column_names: set[str], + column_configs: list[Any] | None, + ) -> dict[str, str]: + """Find columns that don't have configurations. + + Args: + dataset_df: The dataset as a pandas DataFrame. + column_names: Set of all column names. + column_configs: List of column configurations. + + Returns: + Dictionary mapping unconfigured column names to their types. + """ + if not column_configs: + return {} + + configured_names = {col.name for col in column_configs} + unconfigured = column_names - configured_names + return {col_name: str(dataset_df[col_name].dtype) for col_name in sorted(unconfigured)} + + def _build_sample_records(self, dataset_df: pd.DataFrame) -> list[dict[str, Any]]: + """Build sample records for the dataset card. + + Args: + dataset_df: The dataset as a pandas DataFrame. + + Returns: + List of sample records as dictionaries. + """ + num_samples = min(5, len(dataset_df)) + if num_samples == 0: + return [] + + sample_df = dataset_df.head(num_samples) + records = sample_df.to_dict(orient="records") + return [ + {k: v if isinstance(v, (str, int, float, bool, type(None))) else str(v) for k, v in record.items()} + for record in records + ] + + def _build_config_types_summary(self, column_configs: list[Any] | None) -> dict[str, int]: + """Build summary of configuration types. + + Args: + column_configs: List of column configurations. + + Returns: + Dictionary mapping config type names to counts. 
+ """ + if not column_configs: + return {} + + config_types: dict[str, int] = {} + for col_config in column_configs: + config_type = type(col_config).__name__ + config_types[config_type] = config_types.get(config_type, 0) + 1 + return config_types + + def _build_column_stats_by_type(self, analysis: DatasetProfilerResults) -> dict[str, list[dict[str, Any]]]: + """Build column statistics grouped by type. + + Args: + analysis: Profiling analysis results. + + Returns: + Dictionary mapping column types to lists of statistics dictionaries. + """ + column_stats_by_type: dict[str, list[Any]] = {} + for column_type in analysis.column_types: + try: + column_type_enum = DataDesignerColumnType(column_type) + stats = analysis.get_column_statistics_by_type(column_type_enum) + if stats: + column_stats_by_type[column_type] = stats + except (ValueError, TypeError): + continue + + return {col_type: [pydantic_to_dict(stat) for stat in stats_list] for col_type, stats_list in column_stats_by_type.items()} + + def _sort_column_types(self, column_stats_by_type: dict[str, list[dict[str, Any]]]) -> list[str]: + """Sort column types by display order. + + Args: + column_stats_by_type: Dictionary mapping column types to statistics. + + Returns: + Sorted list of column type names. + """ + display_order = get_column_display_order() + return sorted( + column_stats_by_type.keys(), + key=lambda x: display_order.index(x) if x in display_order else len(display_order), + ) + + def _create_dataset_card( + self, + dataset_df: pd.DataFrame, + template_variables: dict[str, Any], + ) -> DataDesignerDatasetCard: + """Create a dataset card from template variables. + + Args: + dataset_df: The dataset as a pandas DataFrame. + template_variables: Template variables for the card. + + Returns: + DataDesignerDatasetCard instance. 
+ """ + tags_list = ["datadesigner", "synthetic"] + return DataDesignerDatasetCard.from_template( + card_data=DatasetCardData( + size_categories=parse_size_category(len(dataset_df)), + tags=tags_list, + ), + tags=tags_list, + **template_variables, + ) + + def _save_and_upload_card( + self, + card: DataDesignerDatasetCard, + repo_id: str, + token: str | None, + ) -> None: + """Save dataset card to temporary file and upload to hub. + + Args: + card: The dataset card to upload. + repo_id: The ID of the Hugging Face Hub repository. + token: Hugging Face token for authentication. + + Raises: + ArtifactStorageError: If card saving fails. + """ + with TemporaryDirectory() as tmpdir: + card_path = Path(tmpdir) / "README.md" + try: + card.save(filepath=str(card_path)) + except Exception as e: + raise ArtifactStorageError(f"Failed to save dataset card: {e}") from e + + HfApi(token=token).upload_file( + path_or_fileobj=str(card_path), + path_in_repo="README.md", + repo_id=repo_id, + repo_type="dataset", + ) + + @staticmethod + def pull_from_hub( + repo_id: str, + *, + token: str | None = None, + split: str | None = None, + include_analysis: bool = True, + include_processors: bool = True, + include_configs: bool = True, + **kwargs: Any, + ) -> HubDatasetResults: + """Load a dataset and all associated artifacts from Hugging Face Hub. + + This function loads a dataset from the Hugging Face Hub along with analysis results, + processor datasets, processor artifacts, and configuration files if available. + + Args: + repo_id: The ID of the Hugging Face Hub repository (e.g., "username/dataset-name"). + token: Hugging Face token for authentication. If None, will check environment + variables HF_TOKEN or HUGGINGFACE_HUB_TOKEN. + split: The split to load from the dataset. If None, the default split will be used. + include_analysis: Whether to load analysis results. Defaults to True. + include_processors: Whether to load processor datasets and artifacts. Defaults to True. 
+ include_configs: Whether to load configuration files. Defaults to True. + **kwargs: Additional arguments to pass to `datasets.load_dataset()`. + + Returns: + A HubDatasetResults object containing the dataset and all associated artifacts. + """ + resolved_token = resolve_hf_token(token) + hf_dataset = HuggingFaceHubClient._load_dataset_from_hub(repo_id, split, resolved_token, **kwargs) + dataset_df = hf_dataset.to_pandas() + + analysis = None + if include_analysis: + analysis = HuggingFaceHubClient._load_analysis_from_hub(repo_id, resolved_token) + + processor_datasets = None + processor_artifacts = None + if include_processors: + processor_datasets, processor_artifacts = HuggingFaceHubClient._load_processors_from_hub( + repo_id, resolved_token + ) + + metadata = None + column_configs = None + model_configs = None + if include_configs: + metadata, column_configs, model_configs = HuggingFaceHubClient._load_configs_from_hub( + repo_id, resolved_token + ) + + return HubDatasetResults( + dataset=dataset_df, + analysis=analysis, + processor_datasets=processor_datasets, + processor_artifacts=processor_artifacts, + metadata=metadata, + column_configs=column_configs, + model_configs=model_configs, + ) + + @staticmethod + def _load_dataset_from_hub( + repo_id: str, + split: str | None, + token: str | None, + **kwargs: Any, + ) -> Dataset: + """Load the main dataset from Hugging Face Hub. + + Args: + repo_id: The ID of the Hugging Face Hub repository. + split: The split to load. If None, the first split will be used. + token: Hugging Face token for authentication. + **kwargs: Additional arguments to pass to load_dataset. + + Returns: + HuggingFace Dataset object. 
+ """ + hf_dataset = load_dataset(repo_id, split=split, token=token, **kwargs) + + if isinstance(hf_dataset, (DatasetDict, dict)): + if split is None: + split = next(iter(hf_dataset.keys())) + hf_dataset = hf_dataset[split] + + return hf_dataset + + @staticmethod + def _load_analysis_from_hub( + repo_id: str, + token: str | None, + ) -> DatasetProfilerResults | None: + """Load analysis results from Hugging Face Hub. + + Args: + repo_id: The ID of the Hugging Face Hub repository. + token: Hugging Face token for authentication. + + Returns: + DatasetProfilerResults if available, None otherwise. + """ + try: + analysis_path = hf_hub_download( + repo_id=repo_id, + filename="analysis.json", + repo_type="dataset", + token=token, + ) + with open(analysis_path, "r") as f: + return DatasetProfilerResults.model_validate(json.load(f)) + except (HfHubHTTPError, FileNotFoundError, Exception): + return None + + @staticmethod + def _load_processors_from_hub( + repo_id: str, + token: str | None, + ) -> tuple[dict[str, pd.DataFrame] | None, dict[str, Path] | None]: + """Load processor datasets and artifacts from Hugging Face Hub. + + Args: + repo_id: The ID of the Hugging Face Hub repository. + token: Hugging Face token for authentication. + + Returns: + Tuple of (processor_datasets dict, processor_artifacts dict), or (None, None) if unavailable. 
+ """ + try: + repo_files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=token) + processor_files = [f for f in repo_files if f.startswith("processors/")] + + processor_groups = HuggingFaceHubClient._group_processor_files(processor_files) + processor_datasets = HuggingFaceHubClient._download_processor_datasets( + repo_id, token, processor_groups + ) + processor_artifacts = HuggingFaceHubClient._download_processor_artifacts( + repo_id, token, processor_groups + ) + + return processor_datasets or None, processor_artifacts or None + except (HfHubHTTPError, FileNotFoundError, Exception): + return None, None + + @staticmethod + def _group_processor_files(processor_files: list[str]) -> dict[str, list[str]]: + """Group processor files by processor name. + + Args: + processor_files: List of file paths in the processors/ directory. + + Returns: + Dictionary mapping processor names to lists of file paths. + """ + processor_groups: dict[str, list[str]] = {} + for file_path in processor_files: + parts = file_path.replace("processors/", "").split("/") + processor_name = parts[0].replace(".parquet", "") + if processor_name not in processor_groups: + processor_groups[processor_name] = [] + processor_groups[processor_name].append(file_path) + return processor_groups + + @staticmethod + def _download_processor_datasets( + repo_id: str, + token: str | None, + processor_groups: dict[str, list[str]], + ) -> dict[str, pd.DataFrame]: + """Download processor datasets from the hub. + + Args: + repo_id: The ID of the Hugging Face Hub repository. + token: Hugging Face token for authentication. + processor_groups: Dictionary mapping processor names to file paths. + + Returns: + Dictionary mapping processor names to DataFrames. 
+ """ + processor_datasets: dict[str, pd.DataFrame] = {} + for processor_name, files in processor_groups.items(): + parquet_file = next((f for f in files if f.endswith(".parquet")), None) + if parquet_file: + try: + local_path = hf_hub_download( + repo_id=repo_id, + filename=parquet_file, + repo_type="dataset", + token=token, + ) + processor_datasets[processor_name] = pd.read_parquet(local_path) + except Exception: + pass + return processor_datasets + + @staticmethod + def _download_processor_artifacts( + repo_id: str, + token: str | None, + processor_groups: dict[str, list[str]], + ) -> dict[str, Path]: + """Download processor artifacts from the hub. + + Args: + repo_id: The ID of the Hugging Face Hub repository. + token: Hugging Face token for authentication. + processor_groups: Dictionary mapping processor names to file paths. + + Returns: + Dictionary mapping processor names to artifact directory paths. + """ + processor_artifacts: dict[str, Path] = {} + for processor_name, files in processor_groups.items(): + other_files = [f for f in files if not f.endswith(".parquet")] + if other_files: + with TemporaryDirectory() as tmpdir: + artifact_dir = Path(tmpdir) / processor_name + artifact_dir.mkdir(parents=True, exist_ok=True) + + for artifact_file in other_files: + try: + local_path = hf_hub_download( + repo_id=repo_id, + filename=artifact_file, + repo_type="dataset", + token=token, + ) + relative_path = artifact_file.replace(f"processors/{processor_name}/", "") + if relative_path: + target_path = artifact_dir / relative_path + target_path.parent.mkdir(parents=True, exist_ok=True) + shutil.copy2(local_path, target_path) + except Exception: + pass + + if any(artifact_dir.rglob("*")): + processor_artifacts[processor_name] = artifact_dir + + return processor_artifacts + + @staticmethod + def _load_configs_from_hub( + repo_id: str, + token: str | None, + ) -> tuple[dict[str, Any] | None, list[dict[str, Any]] | None, list[dict[str, Any]] | None]: + """Load 
configuration files from Hugging Face Hub. + + Args: + repo_id: The ID of the Hugging Face Hub repository. + token: Hugging Face token for authentication. + + Returns: + Tuple of (metadata, column_configs, model_configs), with None values if unavailable. + """ + metadata = HuggingFaceHubClient._load_metadata_from_hub(repo_id, token) + column_configs = HuggingFaceHubClient._load_column_configs_from_hub(repo_id, token) + model_configs = HuggingFaceHubClient._load_model_configs_from_hub(repo_id, token) + + return metadata, column_configs, model_configs + + @staticmethod + def _load_metadata_from_hub(repo_id: str, token: str | None) -> dict[str, Any] | None: + """Load metadata from Hugging Face Hub. + + Args: + repo_id: The ID of the Hugging Face Hub repository. + token: Hugging Face token for authentication. + + Returns: + Metadata dictionary or None if unavailable. + """ + try: + metadata_path = hf_hub_download( + repo_id=repo_id, + filename="metadata.json", + repo_type="dataset", + token=token, + ) + with open(metadata_path, "r") as f: + return json.load(f) + except (HfHubHTTPError, FileNotFoundError, Exception): + return None + + @staticmethod + def _load_column_configs_from_hub(repo_id: str, token: str | None) -> list[dict[str, Any]] | None: + """Load column configurations from Hugging Face Hub. + + Args: + repo_id: The ID of the Hugging Face Hub repository. + token: Hugging Face token for authentication. + + Returns: + List of column config dictionaries or None if unavailable. 
+ """ + try: + config_path = hf_hub_download( + repo_id=repo_id, + filename="column_configs.json", + repo_type="dataset", + token=token, + ) + with open(config_path, "r") as f: + raw_column_configs = json.load(f) + + column_configs = [] + for config in raw_column_configs: + if "columns" in config and isinstance(config["columns"], list): + column_configs.extend(config["columns"]) + else: + column_configs.append(config) + return column_configs + except (HfHubHTTPError, FileNotFoundError, Exception): + return None + + @staticmethod + def _load_model_configs_from_hub(repo_id: str, token: str | None) -> list[dict[str, Any]] | None: + """Load model configurations from Hugging Face Hub. + + Args: + repo_id: The ID of the Hugging Face Hub repository. + token: Hugging Face token for authentication. + + Returns: + List of model config dictionaries or None if unavailable. + """ + try: + config_path = hf_hub_download( + repo_id=repo_id, + filename="model_configs.json", + repo_type="dataset", + token=token, + ) + with open(config_path, "r") as f: + return json.load(f) + except (HfHubHTTPError, FileNotFoundError, Exception): + return None diff --git a/src/data_designer/interface/huggingface/dataset_card.py b/src/data_designer/integrations/huggingface/dataset_card.py similarity index 86% rename from src/data_designer/interface/huggingface/dataset_card.py rename to src/data_designer/integrations/huggingface/dataset_card.py index 6bf634887..9b16b173d 100644 --- a/src/data_designer/interface/huggingface/dataset_card.py +++ b/src/data_designer/integrations/huggingface/dataset_card.py @@ -15,7 +15,7 @@ class DataDesignerDatasetCard(DatasetCard): This class extends Hugging Face's DatasetCard with a custom template specifically designed for Data Designer generated datasets. - The template is located at `data_designer/interface/huggingface/dataset_card_template.md`. + The template is located at `data_designer/integrations/huggingface/dataset_card_template.md`. 
""" default_template_path = TEMPLATE_DATA_DESIGNER_DATASET_CARD_PATH diff --git a/src/data_designer/interface/huggingface/dataset_card_template.md b/src/data_designer/integrations/huggingface/dataset_card_template.md similarity index 91% rename from src/data_designer/interface/huggingface/dataset_card_template.md rename to src/data_designer/integrations/huggingface/dataset_card_template.md index d676a4d6e..fd81c8d47 100644 --- a/src/data_designer/interface/huggingface/dataset_card_template.md +++ b/src/data_designer/integrations/huggingface/dataset_card_template.md @@ -20,7 +20,7 @@ NeMo Data Designer is a general framework for generating high-quality synthetic - **LLM-as-a-judge** scoring for quality assessment - **Fast iteration** with preview mode before full-scale generation -For more information, visit: [https://github.com/NVIDIA-NeMo/DataDesigner](https://github.com/NVIDIA-NeMo/DataDesigner) +For more information, visit: [https://github.com/NVIDIA-NeMo/DataDesigner](https://github.com/NVIDIA-NeMo/DataDesigner) (`pip install data-designer`) ## Quick Start @@ -49,7 +49,7 @@ df = results.load_dataset() analysis = results.load_analysis() # Access the config builder -config_builder = results._config_builder +config_builder = results.config_builder ``` ## Dataset Summary @@ -80,10 +80,10 @@ No sample records available. ## Schema {% if all_columns is defined and all_columns %} -| Column | Type | Description | -|--------|------|-------------| +| Column | Type | +|--------|------| {% for col_name, dtype in all_columns | dictsort -%} -| `{{ col_name }}` | {{ dtype }} | {% if column_configs %}{% for col_config in column_configs %}{% if col_config.get('name') == col_name %}{% set col_type = col_config.get('column_type') %}{% if col_type is mapping %}{{ col_type.get('value', '') }}{% elif col_type %}{{ col_type }}{% endif %}{% endif %}{% endfor %}{% endif %} | +| `{{ col_name }}` | {{ dtype }} | {% endfor -%} {% else %} No column information available. 
@@ -166,10 +166,11 @@ No column configurations available. If you use this dataset in your research, please cite: ```bibtex -@software{data_designer, - title={NeMo Data Designer: A Framework for Synthetic Dataset Generation}, - author={NVIDIA}, - year={2025}, - url={https://github.com/NVIDIA-NeMo/DataDesigner} +@misc{nemo-data-designer, + author = {The NeMo Data Designer Team, NVIDIA}, + title = {NeMo Data Designer: A framework for generating synthetic data from scratch or based on your own seed data}, + howpublished = {\url{https://github.com/NVIDIA-NeMo/DataDesigner}}, + year = {2025}, + note = {GitHub Repository}, } ``` diff --git a/src/data_designer/interface/huggingface/hub_results.py b/src/data_designer/integrations/huggingface/hub_results.py similarity index 100% rename from src/data_designer/interface/huggingface/hub_results.py rename to src/data_designer/integrations/huggingface/hub_results.py diff --git a/src/data_designer/integrations/huggingface/reconstruction.py b/src/data_designer/integrations/huggingface/reconstruction.py new file mode 100644 index 000000000..61417c69d --- /dev/null +++ b/src/data_designer/integrations/huggingface/reconstruction.py @@ -0,0 +1,272 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +import json +import logging +import shutil +import tempfile +from pathlib import Path +from typing import Any + +from huggingface_hub import hf_hub_download + +from data_designer.config.config_builder import DataDesignerConfigBuilder +from data_designer.config.models import ModelConfig +from data_designer.engine.dataset_builders.artifact_storage import ArtifactStorage +from data_designer.engine.dataset_builders.errors import ArtifactStorageError +from data_designer.integrations.huggingface.client import resolve_hf_token +from data_designer.integrations.huggingface.hub_results import HubDatasetResults + +logger = logging.getLogger(__name__) + + +def reconstruct_dataset_creation_results( + hub_results: HubDatasetResults, + repo_id: str, + artifact_path: Path | str | None = None, + token: str | None = None, +) -> tuple[ArtifactStorage, DataDesignerConfigBuilder]: + """Reconstruct ArtifactStorage and DataDesignerConfigBuilder from hub results. + + This function downloads all artifacts from the Hugging Face Hub and reconstructs + the necessary components for a DatasetCreationResults object. + + Args: + hub_results: Results from pulling from the hub. + repo_id: The ID of the Hugging Face Hub repository. + artifact_path: Optional path to save downloaded artifacts. If None, a temporary + directory will be used. + token: Hugging Face token for authentication. + + Returns: + Tuple of (ArtifactStorage, DataDesignerConfigBuilder). + + Raises: + ArtifactStorageError: If analysis results are not found or reconstruction fails. 
+ """ + if hub_results.analysis is None: + raise ArtifactStorageError("Cannot reconstruct DatasetCreationResults: analysis results not found in hub.") + + if artifact_path is None: + temp_dir = tempfile.mkdtemp(prefix="data_designer_hub_") + artifact_path = Path(temp_dir) + else: + artifact_path = Path(artifact_path) + artifact_path.mkdir(parents=True, exist_ok=True) + + dataset_name = "dataset" + artifact_storage = ArtifactStorage( + artifact_path=artifact_path, + dataset_name=dataset_name, + ) + base_path = artifact_storage.base_dataset_path + base_path.mkdir(parents=True, exist_ok=True) + + _save_main_dataset(hub_results, artifact_storage) + _save_metadata(hub_results, base_path) + _save_processor_datasets(hub_results, base_path) + _save_processor_artifacts(hub_results, base_path) + _save_config_files(hub_results, base_path) + + config_builder = _reconstruct_config_builder(hub_results, repo_id, token) + + return artifact_storage, config_builder + + +def _save_main_dataset(hub_results: HubDatasetResults, artifact_storage: ArtifactStorage) -> None: + """Save the main dataset as parquet files. + + Args: + hub_results: Results from pulling from the hub. + artifact_storage: Artifact storage object. + """ + final_dataset_path = artifact_storage.final_dataset_path + final_dataset_path.mkdir(parents=True, exist_ok=True) + hub_results.dataset.to_parquet(final_dataset_path / "data.parquet", index=False) + + +def _save_metadata(hub_results: HubDatasetResults, base_path: Path) -> None: + """Save metadata if available. + + Args: + hub_results: Results from pulling from the hub. + base_path: Base path for artifacts. + """ + if hub_results.metadata: + with open(base_path / "metadata.json", "w") as f: + json.dump(hub_results.metadata, f, indent=2) + + +def _save_processor_datasets(hub_results: HubDatasetResults, base_path: Path) -> None: + """Save processor datasets if available. + + Args: + hub_results: Results from pulling from the hub. + base_path: Base path for artifacts. 
+ """ + if not hub_results.processor_datasets: + return + + processors_path = base_path / "processors-files" + processors_path.mkdir(parents=True, exist_ok=True) + for processor_name, processor_df in hub_results.processor_datasets.items(): + processor_dir = processors_path / processor_name + processor_dir.mkdir(parents=True, exist_ok=True) + processor_df.to_parquet(processor_dir / f"{processor_name}.parquet", index=False) + + +def _save_processor_artifacts(hub_results: HubDatasetResults, base_path: Path) -> None: + """Save processor artifacts if available. + + Args: + hub_results: Results from pulling from the hub. + base_path: Base path for artifacts. + """ + if not hub_results.processor_artifacts: + return + + processors_path = base_path / "processors-files" + processors_path.mkdir(parents=True, exist_ok=True) + for processor_name, artifact_dir in hub_results.processor_artifacts.items(): + if not artifact_dir.exists(): + continue + target_dir = processors_path / processor_name + if target_dir.exists(): + shutil.rmtree(target_dir) + shutil.copytree(artifact_dir, target_dir) + + +def _save_config_files(hub_results: HubDatasetResults, base_path: Path) -> None: + """Save configuration files if available. + + Args: + hub_results: Results from pulling from the hub. + base_path: Base path for artifacts. + """ + if hub_results.column_configs: + with open(base_path / "column_configs.json", "w") as f: + json.dump(hub_results.column_configs, f, indent=2) + + if hub_results.model_configs: + with open(base_path / "model_configs.json", "w") as f: + json.dump(hub_results.model_configs, f, indent=2) + + +def _reconstruct_config_builder( + hub_results: HubDatasetResults, + repo_id: str, + token: str | None, +) -> DataDesignerConfigBuilder: + """Reconstruct the config builder from hub results or hub files. + + Args: + hub_results: Results from pulling from the hub. + repo_id: The ID of the Hugging Face Hub repository. + token: Hugging Face token for authentication. 
+ + Returns: + DataDesignerConfigBuilder instance. + """ + if hub_results.column_configs and hub_results.model_configs: + model_configs = [ModelConfig.model_validate(mc) for mc in hub_results.model_configs] + config_builder = DataDesignerConfigBuilder(model_configs=model_configs) + column_config_class_mapping = _get_column_config_class_mapping() + + for col_config_dict in hub_results.column_configs: + configs_to_add = ( + col_config_dict["columns"] + if "columns" in col_config_dict and isinstance(col_config_dict["columns"], list) + else [col_config_dict] + ) + for single_col_config_dict in configs_to_add: + col_config = _load_column_config(single_col_config_dict, column_config_class_mapping) + if col_config is not None: + config_builder.add_column(col_config) + + return config_builder + + resolved_token = resolve_hf_token(token) + try: + model_configs_path = hf_hub_download( + repo_id=repo_id, + filename="model_configs.json", + repo_type="dataset", + token=resolved_token, + ) + with open(model_configs_path, "r") as f: + model_configs_data = json.load(f) + model_configs = [ModelConfig.model_validate(mc) for mc in model_configs_data] + return DataDesignerConfigBuilder(model_configs=model_configs) + except Exception: + return DataDesignerConfigBuilder() + + +def _get_column_config_class_mapping() -> dict[str, type[Any]]: + """Build a mapping from column_type string to config class dynamically. + + Returns: + Dictionary mapping column type strings to config classes. 
+ """ + from data_designer.config.column_configs import ( + ExpressionColumnConfig, + LLMCodeColumnConfig, + LLMJudgeColumnConfig, + LLMStructuredColumnConfig, + LLMTextColumnConfig, + SamplerColumnConfig, + SeedDatasetColumnConfig, + ValidationColumnConfig, + ) + from data_designer.plugin_manager import PluginManager + + mapping: dict[str, type[Any]] = { + "sampler": SamplerColumnConfig, + "llm_text": LLMTextColumnConfig, + "llm_structured": LLMStructuredColumnConfig, + "llm_code": LLMCodeColumnConfig, + "llm_judge": LLMJudgeColumnConfig, + "expression": ExpressionColumnConfig, + "seed_dataset": SeedDatasetColumnConfig, + "validation": ValidationColumnConfig, + } + + plugin_manager = PluginManager() + for plugin in plugin_manager.get_column_generator_plugins(): + mapping[plugin.name] = plugin.config_cls + + return mapping + + +def _load_column_config( + col_config_dict: dict[str, Any], + column_config_class_mapping: dict[str, type[Any]], +) -> Any | None: + """Load a single column config from dict using dynamic class mapping. + + Args: + col_config_dict: Dictionary representation of column config. + column_config_class_mapping: Mapping from column type to config class. + + Returns: + Column config instance or None if loading fails. + """ + column_type = col_config_dict.get("column_type") + if not column_type: + return None + + config_class = column_config_class_mapping.get(column_type) + if config_class is None: + logger.warning( + f"Skipping column config with unknown type '{column_type}': {col_config_dict.get('name', 'unknown')}" + ) + return None + + try: + return config_class.model_validate(col_config_dict) + except Exception as e: + logger.warning( + f"Failed to load column config '{col_config_dict.get('name', 'unknown')}': {e}. Skipping." 
+ ) + return None diff --git a/src/data_designer/interface/huggingface/__init__.py b/src/data_designer/interface/huggingface/__init__.py index 7c6d2eaec..ea58067e3 100644 --- a/src/data_designer/interface/huggingface/__init__.py +++ b/src/data_designer/interface/huggingface/__init__.py @@ -1,8 +1,18 @@ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 -from data_designer.interface.huggingface.hub_mixin import HuggingFaceHubMixin, pull_from_hub -from data_designer.interface.huggingface.hub_results import HubDatasetResults +# Backward compatibility: re-export from new location +from data_designer.integrations.huggingface import ( + HubDatasetResults, + HuggingFaceHubClient, + resolve_hf_token, +) -__all__ = ["HuggingFaceHubMixin", "pull_from_hub", "HubDatasetResults"] +# For backward compatibility, provide pull_from_hub as a function +pull_from_hub = HuggingFaceHubClient.pull_from_hub + +# Legacy alias for mixin (deprecated, use HuggingFaceHubClient instead) +HuggingFaceHubMixin = HuggingFaceHubClient + +__all__ = ["HuggingFaceHubMixin", "HuggingFaceHubClient", "pull_from_hub", "HubDatasetResults", "resolve_hf_token"] diff --git a/src/data_designer/interface/huggingface/hub_mixin.py b/src/data_designer/interface/huggingface/hub_mixin.py deleted file mode 100644 index 711bb028f..000000000 --- a/src/data_designer/interface/huggingface/hub_mixin.py +++ /dev/null @@ -1,1146 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
-# SPDX-License-Identifier: Apache-2.0 - -from __future__ import annotations - -import json -from pathlib import Path -from tempfile import TemporaryDirectory -from typing import Any, Protocol - -import pandas as pd -from datasets import Dataset, DatasetDict, load_dataset -from huggingface_hub import HfApi, get_token, hf_hub_download, list_repo_files -from huggingface_hub.utils import HfHubHTTPError - -from data_designer.engine.dataset_builders.errors import ArtifactStorageError -from data_designer.interface.huggingface.dataset_card import DataDesignerDatasetCard -from data_designer.interface.huggingface.hub_results import HubDatasetResults - - -def _resolve_hf_token(token: str | None) -> str | None: - """Resolve the Hugging Face token from parameter or huggingface_hub. - - This function tries to resolve a token in the following order: - 1. Token provided as parameter - 2. huggingface_hub's get_token() (checks environment variables, cache, config file, etc.) - - Args: - token: Token provided as parameter. - - Returns: - Resolved token or None if not found. - """ - if token is not None: - return token - - # Try to get token from huggingface_hub (checks env vars, cache, config file, etc.) - try: - token = get_token() - if token: - return token - except Exception: - # If get_token fails, continue to return None - pass - - # Return None - huggingface_hub will handle authentication if user is logged in - return None - - -def _size_categories_parser(num_records: int) -> str: - """Parse dataset size into Hugging Face size category. - - Uses the same category names as Argilla's size_categories_parser. - - Args: - num_records: Number of records in the dataset. 
- - Returns: - Size category string matching Hugging Face format (e.g., "n<1K", "1K1T" - - -def _build_card_template_variables( - dataset_df: pd.DataFrame, - analysis: Any, - config_builder: Any, - metadata: dict[str, Any] | None, - repo_id: str, -) -> dict[str, Any]: - """Build template variables for the dataset card. - - Args: - dataset_df: The dataset as a pandas DataFrame. - analysis: Profiling analysis results. - config_builder: Configuration builder. - metadata: Optional metadata dictionary. - repo_id: Repository ID. - - Returns: - Dictionary of template variables. - """ - column_configs = config_builder.get_column_configs() - column_names = set(dataset_df.columns) - - # Prepare column information - unconfigured_columns = {} - all_columns = {} - # Always populate all_columns for template use - for col_name in sorted(column_names): - all_columns[col_name] = str(dataset_df[col_name].dtype) - - if column_configs: - configured_names = {col.name for col in column_configs} - unconfigured = column_names - configured_names - for col_name in sorted(unconfigured): - unconfigured_columns[col_name] = str(dataset_df[col_name].dtype) - - # Prepare sample records - num_samples = min(5, len(dataset_df)) - sample_records = [] - if num_samples > 0: - sample_df = dataset_df.head(num_samples) - records = sample_df.to_dict(orient="records") - for record in records: - # Convert to JSON-serializable format, handling complex types - serializable_record = {} - for k, v in record.items(): - if isinstance(v, (str, int, float, bool, type(None))): - serializable_record[k] = v - else: - # Convert complex types to string representation - serializable_record[k] = str(v) - sample_records.append(serializable_record) - - # Convert column_configs to dicts for safer template rendering - column_configs_dicts = [] - if column_configs: - for col_config in column_configs: - if hasattr(col_config, "model_dump"): - # Pydantic model - config_dict = col_config.model_dump(mode="json") - # Convert 
column_type enum to dict if it exists - if "column_type" in config_dict and hasattr(config_dict["column_type"], "value"): - config_dict["column_type"] = {"value": config_dict["column_type"].value} - elif "column_type" in config_dict and not isinstance(config_dict["column_type"], dict): - # If it's an enum object, convert it - col_type = getattr(col_config, "column_type", None) - if col_type and hasattr(col_type, "value"): - config_dict["column_type"] = {"value": col_type.value} - else: - config_dict["column_type"] = {"value": str(config_dict.get("column_type", "unknown"))} - column_configs_dicts.append(config_dict) - elif hasattr(col_config, "__dict__"): - # Regular object - convert to dict - config_dict = {} - for key in dir(col_config): - if not key.startswith("_") and not callable(getattr(col_config, key, None)): - try: - value = getattr(col_config, key, None) - if isinstance(value, (str, int, float, bool, type(None))): - config_dict[key] = value - elif hasattr(value, "value"): # Enum - config_dict[key] = {"value": value.value} if key == "column_type" else value.value - else: - config_dict[key] = str(value) if value is not None else None - except Exception: - pass - column_configs_dicts.append(config_dict) - else: - column_configs_dicts.append(col_config) - - # Prepare config types summary - config_types: dict[str, int] = {} - if column_configs: - for col_config in column_configs: - config_type = type(col_config).__name__ - config_types[config_type] = config_types.get(config_type, 0) + 1 - - # Group column statistics by type - from data_designer.config.column_types import DataDesignerColumnType, get_column_display_order - - column_stats_by_type: dict[str, list] = {} - display_order = get_column_display_order() - - for column_type in analysis.column_types: - # column_type is already a string, convert to DataDesignerColumnType enum - try: - column_type_enum = DataDesignerColumnType(column_type) - except (ValueError, TypeError): - # Skip invalid column types - 
continue - stats = analysis.get_column_statistics_by_type(column_type_enum) - if stats: - # Convert stat objects to dicts for safer template rendering - stats_dicts = [] - for stat in stats: - if hasattr(stat, "model_dump"): - # Pydantic model - convert to dict - stat_dict = stat.model_dump(mode="json") - # Handle enum fields like sampler_type - if "sampler_type" in stat_dict and not isinstance(stat_dict["sampler_type"], (str, dict)): - sampler_type = getattr(stat, "sampler_type", None) - if sampler_type and hasattr(sampler_type, "value"): - stat_dict["sampler_type"] = {"value": sampler_type.value} - else: - stat_dict["sampler_type"] = {"value": str(stat_dict.get("sampler_type", "unknown"))} - stats_dicts.append(stat_dict) - elif hasattr(stat, "__dict__"): - # Regular object - convert to dict - stat_dict = {} - for key in dir(stat): - if not key.startswith("_") and not callable(getattr(stat, key, None)): - try: - value = getattr(stat, key, None) - if isinstance(value, (str, int, float, bool, type(None))): - stat_dict[key] = value - elif hasattr(value, "value"): # Enum - # For enums, store as dict with value key for consistency - stat_dict[key] = ( - {"value": value.value} - if key in ["sampler_type", "column_type"] - else value.value - ) - else: - stat_dict[key] = str(value) if value is not None else None - except Exception: - pass - stats_dicts.append(stat_dict) - else: - stats_dicts.append(stat) - column_stats_by_type[column_type] = stats_dicts - - # Sort column types by display order - sorted_column_types = sorted( - column_stats_by_type.keys(), - key=lambda x: display_order.index(x) if x in display_order else len(display_order), - ) - - # Convert column_statistics to dicts for safer template rendering - column_statistics_dicts = [] - if analysis.column_statistics: - for stat in analysis.column_statistics: - if hasattr(stat, "model_dump"): - # Pydantic model - column_statistics_dicts.append(stat.model_dump(mode="json")) - elif hasattr(stat, "__dict__"): - # 
Regular object - convert to dict - stat_dict = {} - for key in dir(stat): - if not key.startswith("_") and not callable(getattr(stat, key, None)): - try: - value = getattr(stat, key, None) - if isinstance(value, (str, int, float, bool, type(None))): - stat_dict[key] = value - elif hasattr(value, "value"): # Enum - stat_dict[key] = value.value - else: - stat_dict[key] = str(value) if value is not None else None - except Exception: - pass - column_statistics_dicts.append(stat_dict) - else: - column_statistics_dicts.append(stat) - - return { - "size_categories": _size_categories_parser(len(dataset_df)), - "num_records": len(dataset_df), - "target_num_records": analysis.target_num_records, - "percent_complete": analysis.percent_complete, - "num_columns": len(dataset_df.columns), - "repo_id": repo_id, - "metadata": metadata or {}, - "column_configs": column_configs_dicts if column_configs_dicts else [], - "unconfigured_columns": unconfigured_columns, - "all_columns": all_columns, - "column_statistics": column_statistics_dicts, - "column_stats_by_type": column_stats_by_type, - "sorted_column_types": sorted_column_types, - "num_samples": num_samples, - "sample_records": sample_records, - "config_types": config_types, - } - - -class HasDataset(Protocol): - """Protocol for classes that have a load_dataset method.""" - - def load_dataset(self) -> pd.DataFrame: ... - - -class HasArtifactStorage(Protocol): - """Protocol for classes that have artifact_storage with metadata_file_path.""" - - @property - def artifact_storage(self) -> Any: ... - - -class HuggingFaceHubMixin: - """Mixin class for pushing and pulling datasets to/from Hugging Face Hub. - - This mixin provides the `push_to_hub` and `pull_from_hub` methods to classes that implement - the `HasDataset` and `HasArtifactStorage` protocols. 
- """ - - def push_to_hub( - self: Any, - repo_id: str, - *, - token: str | None = None, - generate_card: bool = True, - **kwargs: Any, - ) -> None: - """Push the dataset to Hugging Face Hub. - - This method converts the pandas DataFrame to a HuggingFace Dataset, pushes it to - the Hugging Face Hub, and optionally generates and uploads a dataset card. - - Args: - repo_id: The ID of the Hugging Face Hub repository (e.g., "username/dataset-name"). - token: Hugging Face token for authentication. If None, will check environment - variables HF_TOKEN or HUGGINGFACE_HUB_TOKEN. - generate_card: Whether to generate and upload a dataset card. Defaults to True. - **kwargs: Additional arguments to pass to `dataset.push_to_hub()`. - - Raises: - ArtifactStorageError: If there's an error loading the dataset or metadata. - """ - # Resolve token - resolved_token = self._resolve_token(token) - - # Load dataset - dataset_df = self.load_dataset() - - # Convert pandas DataFrame to HuggingFace Dataset - hf_dataset = Dataset.from_pandas(dataset_df) - - # Push dataset to hub - hf_dataset.push_to_hub(repo_id, token=resolved_token, **kwargs) - - # Push additional artifacts (analysis, processor datasets, configs) - # Pass the repo_id so we can list actual files for metadata sanitization - self._upload_additional_artifacts(repo_id, resolved_token) - - # Generate and upload dataset card if requested - if generate_card: - self._upload_dataset_card(repo_id, resolved_token, dataset_df) - - def _resolve_token(self, token: str | None) -> str | None: - """Resolve the Hugging Face token from parameter, environment variables, or huggingface_hub. - - Args: - token: Token provided as parameter. - - Returns: - Resolved token or None if not found. - """ - return _resolve_hf_token(token) - - def _sanitize_metadata_file_paths(self, metadata: dict[str, Any]) -> dict[str, Any]: - """Sanitize file paths in metadata by converting local paths to remote paths. 
- - Args: - metadata: Metadata dictionary that may contain file_paths. - - Returns: - Metadata dictionary with sanitized file paths. - """ - if "file_paths" not in metadata or not isinstance(metadata["file_paths"], list): - return metadata - - sanitized_paths = [] - base_path = self.artifact_storage.base_dataset_path - - for file_path in metadata["file_paths"]: - path_str = str(file_path) - path_obj = Path(path_str) - - # Try to get relative path from base_dataset_path - try: - if path_obj.is_absolute(): - try: - relative_path = path_obj.relative_to(base_path) - remote_path = f"data/{relative_path.as_posix()}" - sanitized_paths.append(remote_path) - continue - except ValueError: - # Path is not relative to base_path, try fallback - pass - except Exception: - # If Path operations fail, try string-based extraction - pass - - # Fallback: extract directory structure from path string - if "parquet-files" in path_str: - idx = path_str.find("parquet-files") - if idx != -1: - remaining = path_str[idx + len("parquet-files") :].lstrip("/\\") - sanitized_paths.append(f"data/parquet-files/{remaining}") - else: - sanitized_paths.append(f"data/parquet-files/{path_obj.name}") - else: - sanitized_paths.append(f"data/{path_obj.name}") - - if sanitized_paths: - metadata = metadata.copy() - metadata["file_paths"] = sanitized_paths - else: - # If no paths could be sanitized, remove file_paths - metadata = metadata.copy() - metadata.pop("file_paths", None) - - return metadata - - def _upload_additional_artifacts( - self: Any, - repo_id: str, - token: str | None, - ) -> None: - """Upload additional artifacts to Hugging Face Hub. - - This includes: - - Analysis results (as JSON) - - Processor datasets (as parquet files) - - Processor artifacts (directories) - - Configuration files (column_configs.json, model_configs.json) - - Args: - repo_id: The ID of the Hugging Face Hub repository. - token: Hugging Face token for authentication. 
- """ - hf_api = HfApi(token=token) - - # Get analysis from the instance - analysis = getattr(self, "_analysis", None) - - # Upload analysis results - if analysis is not None: - try: - analysis_json = analysis.model_dump(mode="json") - with TemporaryDirectory() as tmpdir: - analysis_path = Path(tmpdir) / "analysis.json" - with open(analysis_path, "w") as f: - json.dump(analysis_json, f, indent=2, default=str) - hf_api.upload_file( - path_or_fileobj=str(analysis_path), - path_in_repo="analysis.json", - repo_id=repo_id, - repo_type="dataset", - ) - except Exception as e: - # Log but don't fail if analysis can't be uploaded - import logging - - logger = logging.getLogger(__name__) - logger.warning(f"Failed to upload analysis results: {e}") - - # Upload processor datasets and artifacts - if hasattr(self, "artifact_storage") and hasattr(self.artifact_storage, "processors_outputs_path"): - processors_path = self.artifact_storage.processors_outputs_path - if processors_path.exists(): - self._upload_processor_artifacts(hf_api, repo_id, processors_path) - - # Upload metadata if it exists (sanitize file paths first) - if hasattr(self, "artifact_storage") and hasattr(self.artifact_storage, "metadata_file_path"): - metadata_path = self.artifact_storage.metadata_file_path - if metadata_path.exists(): - try: - with open(metadata_path, "r") as f: - metadata = json.load(f) - - # Sanitize metadata: convert local file paths to remote Hugging Face Hub paths - metadata = self._sanitize_metadata_file_paths(metadata) - - # Write sanitized metadata to temp file and upload - with TemporaryDirectory() as tmpdir: - sanitized_metadata_path = Path(tmpdir) / "metadata.json" - with open(sanitized_metadata_path, "w") as f: - json.dump(metadata, f, indent=2, default=str) - hf_api.upload_file( - path_or_fileobj=str(sanitized_metadata_path), - path_in_repo="metadata.json", - repo_id=repo_id, - repo_type="dataset", - ) - except Exception as e: - import logging - - logger = 
logging.getLogger(__name__) - logger.warning(f"Failed to upload metadata: {e}") - - # Upload configuration files if they exist - if hasattr(self, "artifact_storage") and hasattr(self.artifact_storage, "base_dataset_path"): - base_path = self.artifact_storage.base_dataset_path - config_files = ["column_configs.json", "model_configs.json"] - for config_file in config_files: - config_path = base_path / config_file - if config_path.exists(): - try: - hf_api.upload_file( - path_or_fileobj=str(config_path), - path_in_repo=config_file, - repo_id=repo_id, - repo_type="dataset", - ) - except Exception as e: - import logging - - logger = logging.getLogger(__name__) - logger.warning(f"Failed to upload {config_file}: {e}") - - def _upload_processor_artifacts( - self: Any, - hf_api: HfApi, - repo_id: str, - processors_path: Path, - ) -> None: - """Upload processor datasets and artifacts to Hugging Face Hub. - - Args: - hf_api: Hugging Face API client. - repo_id: The ID of the Hugging Face Hub repository. - processors_path: Path to the processors outputs directory. 
- """ - # Find all processor directories - processor_dirs = [d for d in processors_path.iterdir() if d.is_dir()] - - for processor_dir in processor_dirs: - processor_name = processor_dir.name - - # Check if it's a dataset (contains parquet files) - parquet_files = list(processor_dir.glob("*.parquet")) - if parquet_files: - # Upload as a dataset (combine all parquet files) - try: - # Load all parquet files and combine - dfs = [pd.read_parquet(f) for f in parquet_files] - combined_df = pd.concat(dfs, ignore_index=True) - - # Upload as a separate dataset file - with TemporaryDirectory() as tmpdir: - processor_parquet = Path(tmpdir) / f"{processor_name}.parquet" - combined_df.to_parquet(processor_parquet, index=False) - hf_api.upload_file( - path_or_fileobj=str(processor_parquet), - path_in_repo=f"processors/{processor_name}.parquet", - repo_id=repo_id, - repo_type="dataset", - ) - except Exception as e: - import logging - - logger = logging.getLogger(__name__) - logger.warning(f"Failed to upload processor dataset {processor_name}: {e}") - - # Upload other files in the processor directory as artifacts - other_files = [f for f in processor_dir.rglob("*") if f.is_file() and f.suffix != ".parquet"] - for artifact_file in other_files: - try: - # Preserve directory structure relative to processor_dir - relative_path = artifact_file.relative_to(processors_path) - hf_api.upload_file( - path_or_fileobj=str(artifact_file), - path_in_repo=f"processors/{relative_path.as_posix()}", - repo_id=repo_id, - repo_type="dataset", - ) - except Exception as e: - import logging - - logger = logging.getLogger(__name__) - logger.warning(f"Failed to upload processor artifact {artifact_file}: {e}") - - def _upload_dataset_card( - self: Any, - repo_id: str, - token: str | None, - dataset_df: pd.DataFrame, - ) -> None: - """Generate and upload the dataset card to Hugging Face Hub. - - Args: - repo_id: The ID of the Hugging Face Hub repository. - token: Hugging Face token for authentication. 
- dataset_df: The dataset as a pandas DataFrame. - """ - # Get analysis and config_builder from the instance - analysis = getattr(self, "_analysis", None) - config_builder = getattr(self, "_config_builder", None) - - if analysis is None or config_builder is None: - raise ArtifactStorageError( - "Cannot generate dataset card: missing analysis or config_builder. " - "Ensure the class has _analysis and _config_builder attributes." - ) - - # Load metadata if available and sanitize file paths - metadata: dict[str, Any] | None = None - if hasattr(self, "artifact_storage") and hasattr(self.artifact_storage, "metadata_file_path"): - metadata_path = self.artifact_storage.metadata_file_path - if metadata_path.exists(): - try: - with open(metadata_path, "r") as f: - metadata = json.load(f) - # Sanitize file paths for the dataset card - metadata = self._sanitize_metadata_file_paths(metadata) - except Exception: - # If metadata can't be loaded, continue without it - pass - - # Generate dataset card using from_template pattern (similar to Argilla) - from huggingface_hub import DatasetCardData - - # Build template variables for the card - template_variables = _build_card_template_variables( - dataset_df=dataset_df, - analysis=analysis, - config_builder=config_builder, - metadata=metadata, - repo_id=repo_id, - ) - # Ensure all_columns is always defined - if "all_columns" not in template_variables: - template_variables["all_columns"] = {} - - # Create card using DatasetCard.from_template with card_data and template_variables - # DataDesignerDatasetCard extends DatasetCard and uses default_template_path - # Unpack template_variables as kwargs for the template - tags_list = ["datadesigner", "synthetic"] - card = DataDesignerDatasetCard.from_template( - card_data=DatasetCardData( - size_categories=_size_categories_parser(len(dataset_df)), - tags=tags_list, - ), - tags=tags_list, # Also pass as template variable for explicit rendering - **template_variables, - ) - - # Save card to 
temporary directory and upload - with TemporaryDirectory() as tmpdir: - card_path = Path(tmpdir) / "README.md" - try: - card.save(filepath=str(card_path)) - except Exception as e: - import logging - - logger = logging.getLogger(__name__) - logger.error(f"Error saving dataset card: {e}") - logger.error(f"Template variables keys: {list(template_variables.keys())}") - # Try to identify which variable is causing the issue - for key, value in template_variables.items(): - if value is None: - logger.warning(f"Template variable '{key}' is None") - raise - hf_api = HfApi(token=token) - hf_api.upload_file( - path_or_fileobj=str(card_path), - path_in_repo="README.md", - repo_id=repo_id, - repo_type="dataset", - ) - - @classmethod - def pull_from_hub( - cls: type[Any], - repo_id: str, - *, - token: str | None = None, - artifact_path: Path | str | None = None, - split: str | None = None, - **kwargs: Any, - ) -> Any: - """Load a dataset and all artifacts from Hugging Face Hub as a DatasetCreationResults object. - - This classmethod downloads all artifacts from the Hugging Face Hub and reconstructs - a DatasetCreationResults object that can be used just like one created from a local - dataset generation run. - - Args: - repo_id: The ID of the Hugging Face Hub repository (e.g., "username/dataset-name"). - token: Hugging Face token for authentication. If None, will check environment - variables HF_TOKEN or HUGGINGFACE_HUB_TOKEN. - artifact_path: Optional path to save downloaded artifacts. If None, a temporary - directory will be used (note: temporary directories are cleaned up when - the object is garbage collected). - split: The split to load from the dataset. If None, the default split will be used. - **kwargs: Additional arguments to pass to `pull_from_hub()` function. - - Returns: - A DatasetCreationResults object containing the dataset, analysis, and all artifacts. 
- - Example: - ```python - from data_designer.interface.results import DatasetCreationResults - - # Load from hub (uses temporary directory) - results = DatasetCreationResults.pull_from_hub("username/dataset-name") - - # Load to a specific directory - results = DatasetCreationResults.pull_from_hub( - "username/dataset-name", - artifact_path="./downloaded_datasets/my_dataset" - ) - - # Access the dataset and analysis - df = results.load_dataset() - analysis = results.load_analysis() - ``` - """ - import tempfile - - from data_designer.config.config_builder import DataDesignerConfigBuilder - from data_designer.config.models import ModelConfig - from data_designer.engine.dataset_builders.artifact_storage import ArtifactStorage - - # Pull all artifacts from hub using the function - hub_results = pull_from_hub( - repo_id=repo_id, - token=token, - split=split, - include_analysis=True, - include_processors=True, - include_configs=True, - **kwargs, - ) - - # Determine artifact path - if artifact_path is None: - # Use a temporary directory - # Note: The directory will persist as long as the DatasetCreationResults object exists - # Users should provide artifact_path for persistent storage - temp_dir = tempfile.mkdtemp(prefix="data_designer_hub_") - artifact_path = Path(temp_dir) - else: - artifact_path = Path(artifact_path) - artifact_path.mkdir(parents=True, exist_ok=True) - - # Create artifact storage first to get the resolved dataset name - dataset_name = "dataset" - artifact_storage = ArtifactStorage( - artifact_path=artifact_path, - dataset_name=dataset_name, - ) - base_path = artifact_storage.base_dataset_path - base_path.mkdir(parents=True, exist_ok=True) - - # Save main dataset as parquet files - final_dataset_path = artifact_storage.final_dataset_path - final_dataset_path.mkdir(parents=True, exist_ok=True) - hub_results.dataset.to_parquet(final_dataset_path / "data.parquet", index=False) - - # Save metadata if available - if hub_results.metadata: - metadata_path = 
base_path / "metadata.json" - with open(metadata_path, "w") as f: - json.dump(hub_results.metadata, f, indent=2) - - # Save processor datasets and artifacts - if hub_results.processor_datasets: - processors_path = base_path / "processors-files" - processors_path.mkdir(parents=True, exist_ok=True) - for processor_name, processor_df in hub_results.processor_datasets.items(): - processor_dir = processors_path / processor_name - processor_dir.mkdir(parents=True, exist_ok=True) - processor_df.to_parquet(processor_dir / f"{processor_name}.parquet", index=False) - - # Copy processor artifacts if available - if hub_results.processor_artifacts: - processors_path = base_path / "processors-files" - processors_path.mkdir(parents=True, exist_ok=True) - import shutil - - for processor_name, artifact_dir in hub_results.processor_artifacts.items(): - if artifact_dir.exists(): - target_dir = processors_path / processor_name - if target_dir.exists(): - shutil.rmtree(target_dir) - shutil.copytree(artifact_dir, target_dir) - - # Save config files - if hub_results.column_configs: - config_path = base_path / "column_configs.json" - with open(config_path, "w") as f: - json.dump(hub_results.column_configs, f, indent=2) - - if hub_results.model_configs: - config_path = base_path / "model_configs.json" - with open(config_path, "w") as f: - json.dump(hub_results.model_configs, f, indent=2) - - # Reconstruct config builder from config files - config_builder: DataDesignerConfigBuilder | None = None - if hub_results.column_configs and hub_results.model_configs: - # Load model configs - model_configs = [ModelConfig.model_validate(mc) for mc in hub_results.model_configs] - config_builder = DataDesignerConfigBuilder(model_configs=model_configs) - - # Build dynamic mapping from column_type to config class (includes plugins) - def _get_column_config_class_mapping() -> dict[str, type[Any]]: - """Build a mapping from column_type string to config class dynamically.""" - from 
data_designer.config.column_configs import ( - ExpressionColumnConfig, - LLMCodeColumnConfig, - LLMJudgeColumnConfig, - LLMStructuredColumnConfig, - LLMTextColumnConfig, - SamplerColumnConfig, - SeedDatasetColumnConfig, - ValidationColumnConfig, - ) - from data_designer.plugin_manager import PluginManager - - mapping: dict[str, type[Any]] = { - "sampler": SamplerColumnConfig, - "llm_text": LLMTextColumnConfig, - "llm_structured": LLMStructuredColumnConfig, - "llm_code": LLMCodeColumnConfig, - "llm_judge": LLMJudgeColumnConfig, - "expression": ExpressionColumnConfig, - "seed_dataset": SeedDatasetColumnConfig, - "validation": ValidationColumnConfig, - } - - # Add plugin column configs dynamically - plugin_manager = PluginManager() - for plugin in plugin_manager.get_column_generator_plugins(): - mapping[plugin.name] = plugin.config_cls - - return mapping - - column_config_class_mapping = _get_column_config_class_mapping() - - def _load_column_config(col_config_dict: dict[str, Any]) -> Any | None: - """Load a single column config from dict using dynamic class mapping.""" - column_type = col_config_dict.get("column_type") - if not column_type: - return None - - config_class = column_config_class_mapping.get(column_type) - if config_class is None: - # Unknown column type - might be from a plugin or future version - import logging - - logger = logging.getLogger(__name__) - logger.warning( - f"Skipping column config with unknown type '{column_type}': {col_config_dict.get('name', 'unknown')}" - ) - return None - - try: - return config_class.model_validate(col_config_dict) - except Exception as e: - # Skip columns that fail validation - import logging - - logger = logging.getLogger(__name__) - logger.warning( - f"Failed to load column config '{col_config_dict.get('name', 'unknown')}': {e}. Skipping." 
- ) - return None - - for col_config_dict in hub_results.column_configs: - # Handle MultiColumnConfig (has 'columns' key) by flattening it - if "columns" in col_config_dict and isinstance(col_config_dict["columns"], list): - # This is a MultiColumnConfig - extract individual column configs - for single_col_config_dict in col_config_dict["columns"]: - col_config = _load_column_config(single_col_config_dict) - if col_config is not None: - config_builder.add_column(col_config) - else: - # This is a single column config - single_col_config = _load_column_config(col_config_dict) - if single_col_config is not None: - config_builder.add_column(single_col_config) - - # If config builder couldn't be reconstructed, create a minimal one - if config_builder is None: - # Try to get model configs from environment or use defaults - resolved_token = _resolve_hf_token(token) - try: - model_configs_path = hf_hub_download( - repo_id=repo_id, - filename="model_configs.json", - repo_type="dataset", - token=resolved_token, - ) - with open(model_configs_path, "r") as f: - model_configs_data = json.load(f) - model_configs = [ModelConfig.model_validate(mc) for mc in model_configs_data] - config_builder = DataDesignerConfigBuilder(model_configs=model_configs) - except Exception: - # Fallback to default model configs - config_builder = DataDesignerConfigBuilder() - - # Ensure we have analysis - if hub_results.analysis is None: - raise ArtifactStorageError("Cannot reconstruct DatasetCreationResults: analysis results not found in hub.") - - return cls( - artifact_storage=artifact_storage, - analysis=hub_results.analysis, - config_builder=config_builder, - ) - - -def pull_from_hub( - repo_id: str, - *, - token: str | None = None, - split: str | None = None, - include_analysis: bool = True, - include_processors: bool = True, - include_configs: bool = True, - **kwargs: Any, -) -> HubDatasetResults: - """Load a dataset and all associated artifacts from Hugging Face Hub. 
- - This function loads a dataset from the Hugging Face Hub along with analysis results, - processor datasets, processor artifacts, and configuration files if available. - It is similar to Argilla's `from_hub` method but returns a comprehensive results object. - - Args: - repo_id: The ID of the Hugging Face Hub repository (e.g., "username/dataset-name"). - token: Hugging Face token for authentication. If None, will check environment - variables HF_TOKEN or HUGGINGFACE_HUB_TOKEN. - split: The split to load from the dataset. If None, the default split will be used. - include_analysis: Whether to load analysis results. Defaults to True. - include_processors: Whether to load processor datasets and artifacts. Defaults to True. - include_configs: Whether to load configuration files. Defaults to True. - **kwargs: Additional arguments to pass to `datasets.load_dataset()`. - - Returns: - A HubDatasetResults object containing the dataset and all associated artifacts. - - Example: - ```python - from data_designer.interface.huggingface import pull_from_hub - - # Load a dataset with all artifacts from Hugging Face Hub - results = pull_from_hub("username/dataset-name") - df = results.dataset - analysis = results.analysis - processor_data = results.processor_datasets - - # Load only the main dataset - results = pull_from_hub("username/dataset-name", include_analysis=False, include_processors=False) - - # Load a specific split - results = pull_from_hub("username/dataset-name", split="train") - ``` - """ - from data_designer.config.analysis.dataset_profiler import DatasetProfilerResults - - # Resolve token - resolved_token = _resolve_hf_token(token) - - # Load main dataset from hub - hf_dataset = load_dataset(repo_id, split=split, token=resolved_token, **kwargs) - - # Handle DatasetDict - if isinstance(hf_dataset, DatasetDict): - if split is None: - # Use the first split if no split specified - split = next(iter(hf_dataset.keys())) - hf_dataset = hf_dataset[split] - elif 
isinstance(hf_dataset, dict): - # Fallback for dict-like objects - if split is None: - split = next(iter(hf_dataset.keys())) - hf_dataset = hf_dataset[split] - - # Convert to pandas DataFrame - dataset_df = hf_dataset.to_pandas() - - # Load analysis results if requested - analysis: DatasetProfilerResults | None = None - if include_analysis: - try: - analysis_path = hf_hub_download( - repo_id=repo_id, - filename="analysis.json", - repo_type="dataset", - token=resolved_token, - ) - with open(analysis_path, "r") as f: - analysis_data = json.load(f) - analysis = DatasetProfilerResults.model_validate(analysis_data) - except (HfHubHTTPError, FileNotFoundError): - # Analysis file may not exist, continue without it - pass - except Exception: - # Other errors loading analysis, continue without it - pass - - # Load processor datasets and artifacts if requested - processor_datasets: dict[str, pd.DataFrame] | None = None - processor_artifacts: dict[str, Path] | None = None - if include_processors: - try: - repo_files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=resolved_token) - processor_files = [f for f in repo_files if f.startswith("processors/")] - - processor_datasets = {} - processor_artifacts = {} - - # Group files by processor name - processor_groups: dict[str, list[str]] = {} - for file_path in processor_files: - # Extract processor name from path like "processors/processor_name.parquet" - # or "processors/processor_name/file.txt" - parts = file_path.replace("processors/", "").split("/") - processor_name = parts[0].replace(".parquet", "") - - if processor_name not in processor_groups: - processor_groups[processor_name] = [] - processor_groups[processor_name].append(file_path) - - # Download and load processor datasets - for processor_name, files in processor_groups.items(): - parquet_files = [f for f in files if f.endswith(".parquet")] - if parquet_files: - # Download the parquet file - parquet_file = parquet_files[0] # Use first parquet file - try: - 
local_path = hf_hub_download( - repo_id=repo_id, - filename=parquet_file, - repo_type="dataset", - token=resolved_token, - ) - processor_datasets[processor_name] = pd.read_parquet(local_path) - except Exception: - pass - - # Download other artifacts - other_files = [f for f in files if not f.endswith(".parquet")] - if other_files: - # Download to a temporary directory - import shutil - - with TemporaryDirectory() as tmpdir: - artifact_dir = Path(tmpdir) / processor_name - artifact_dir.mkdir(parents=True, exist_ok=True) - - for artifact_file in other_files: - try: - local_path = hf_hub_download( - repo_id=repo_id, - filename=artifact_file, - repo_type="dataset", - token=resolved_token, - ) - # Preserve relative path structure - relative_path = artifact_file.replace(f"processors/{processor_name}/", "") - if relative_path: - target_path = artifact_dir / relative_path - target_path.parent.mkdir(parents=True, exist_ok=True) - shutil.copy2(local_path, target_path) - except Exception: - pass - - # Only add if directory has files - if any(artifact_dir.rglob("*")): - # Copy to a persistent location or return the temp directory - # For now, we'll return the temp directory path - # Note: This will be cleaned up when tmpdir is deleted - # In a real implementation, you might want to copy to a user-specified location - processor_artifacts[processor_name] = artifact_dir - - if not processor_datasets: - processor_datasets = None - if not processor_artifacts: - processor_artifacts = None - except (HfHubHTTPError, FileNotFoundError): - # Processors may not exist, continue without them - pass - except Exception: - # Other errors loading processors, continue without them - pass - - # Load configuration files if requested - metadata: dict[str, Any] | None = None - column_configs: list[dict[str, Any]] | None = None - model_configs: list[dict[str, Any]] | None = None - - if include_configs: - # Load metadata - try: - metadata_path = hf_hub_download( - repo_id=repo_id, - 
filename="metadata.json", - repo_type="dataset", - token=resolved_token, - ) - with open(metadata_path, "r") as f: - metadata = json.load(f) - except (HfHubHTTPError, FileNotFoundError): - pass - except Exception: - pass - - # Load column configs - try: - config_path = hf_hub_download( - repo_id=repo_id, - filename="column_configs.json", - repo_type="dataset", - token=resolved_token, - ) - with open(config_path, "r") as f: - raw_column_configs = json.load(f) - # Flatten MultiColumnConfig objects (those with 'columns' key) into individual column configs - column_configs = [] - for config in raw_column_configs: - if "columns" in config and isinstance(config["columns"], list): - # This is a MultiColumnConfig - extract individual column configs - column_configs.extend(config["columns"]) - else: - # This is a single column config - column_configs.append(config) - except (HfHubHTTPError, FileNotFoundError): - pass - except Exception: - pass - - # Load model configs - try: - config_path = hf_hub_download( - repo_id=repo_id, - filename="model_configs.json", - repo_type="dataset", - token=resolved_token, - ) - with open(config_path, "r") as f: - model_configs = json.load(f) - except (HfHubHTTPError, FileNotFoundError): - pass - except Exception: - pass - - return HubDatasetResults( - dataset=dataset_df, - analysis=analysis, - processor_datasets=processor_datasets, - processor_artifacts=processor_artifacts, - metadata=metadata, - column_configs=column_configs, - model_configs=model_configs, - ) - diff --git a/src/data_designer/interface/results.py b/src/data_designer/interface/results.py index 48b3f3bad..824d1a739 100644 --- a/src/data_designer/interface/results.py +++ b/src/data_designer/interface/results.py @@ -13,10 +13,13 @@ from data_designer.config.utils.visualization import WithRecordSamplerMixin from data_designer.engine.dataset_builders.artifact_storage import ArtifactStorage from data_designer.engine.dataset_builders.errors import ArtifactStorageError -from 
data_designer.interface.huggingface import HuggingFaceHubMixin +from data_designer.integrations.huggingface import ( + HuggingFaceHubClient, + reconstruct_dataset_creation_results, +) -class DatasetCreationResults(WithRecordSamplerMixin, HuggingFaceHubMixin): +class DatasetCreationResults(WithRecordSamplerMixin): """Results container for a Data Designer dataset creation run. This class provides access to the generated dataset, profiling analysis, and @@ -41,6 +44,12 @@ def __init__( self.artifact_storage = artifact_storage self._analysis = analysis self._config_builder = config_builder + self._hub_client = HuggingFaceHubClient( + dataset_provider=self, + artifact_storage_provider=self, + analysis=analysis, + config_builder=config_builder, + ) def load_analysis(self) -> DatasetProfilerResults: """Load the profiling analysis results for the generated dataset. @@ -51,6 +60,15 @@ def load_analysis(self) -> DatasetProfilerResults: """ return self._analysis + @property + def config_builder(self) -> DataDesignerConfigBuilder: + """Load the configuration builder used to create the dataset. + + Returns: + DataDesignerConfigBuilder containing the configuration used to create the dataset. + """ + return self._config_builder + def load_dataset(self) -> pd.DataFrame: """Load the generated dataset as a pandas DataFrame. @@ -92,6 +110,36 @@ def get_path_to_processor_artifacts(self, processor_name: str) -> Path: raise ArtifactStorageError(f"Processor {processor_name} has no artifacts.") return self.artifact_storage.processors_outputs_path / processor_name + def push_to_hub( + self, + repo_id: str, + *, + token: str | None = None, + generate_card: bool = True, + **kwargs: Any, + ) -> None: + """Push the dataset to Hugging Face Hub. + + This method converts the pandas DataFrame to a HuggingFace Dataset, pushes it to + the Hugging Face Hub, and optionally generates and uploads a dataset card. 
+ + Args: + repo_id: The ID of the Hugging Face Hub repository (e.g., "username/dataset-name"). + token: Hugging Face token for authentication. If None, will check environment + variables HF_TOKEN or HUGGINGFACE_HUB_TOKEN. + generate_card: Whether to generate and upload a dataset card. Defaults to True. + **kwargs: Additional arguments to pass to `dataset.push_to_hub()`. + + Raises: + ArtifactStorageError: If there's an error loading the dataset or metadata. + """ + self._hub_client.push_to_hub( + repo_id=repo_id, + token=token, + generate_card=generate_card, + **kwargs, + ) + @classmethod def pull_from_hub( cls, @@ -139,7 +187,25 @@ def pull_from_hub( analysis = results.load_analysis() ``` """ - # Delegate to the mixin method using super() to avoid recursion - return super(DatasetCreationResults, cls).pull_from_hub( - repo_id, token=token, artifact_path=artifact_path, split=split, **kwargs + hub_results = HuggingFaceHubClient.pull_from_hub( + repo_id=repo_id, + token=token, + split=split, + include_analysis=True, + include_processors=True, + include_configs=True, + **kwargs, + ) + + artifact_storage, config_builder = reconstruct_dataset_creation_results( + hub_results=hub_results, + repo_id=repo_id, + artifact_path=artifact_path, + token=token, + ) + + return cls( + artifact_storage=artifact_storage, + analysis=hub_results.analysis, + config_builder=config_builder, ) diff --git a/tests/integrations/huggingface/test_client.py b/tests/integrations/huggingface/test_client.py new file mode 100644 index 000000000..1c7d688ae --- /dev/null +++ b/tests/integrations/huggingface/test_client.py @@ -0,0 +1,430 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +"""Unit tests for HuggingFaceHubClient.""" + +import json +from pathlib import Path +from tempfile import TemporaryDirectory +from unittest.mock import Mock, patch + +import pandas as pd +import pytest + +from data_designer.config.analysis.dataset_profiler import DatasetProfilerResults +from data_designer.config.config_builder import DataDesignerConfigBuilder +from data_designer.engine.dataset_builders.artifact_storage import ArtifactStorage +from data_designer.integrations.huggingface.client import ( + HuggingFaceHubClient, + parse_size_category, + pydantic_to_dict, + resolve_hf_token, +) + + +class TestResolveHfToken: + """Tests for resolve_hf_token function.""" + + def test_resolve_token_provided(self) -> None: + """Test that provided token is returned.""" + assert resolve_hf_token("test-token") == "test-token" + + def test_resolve_token_from_hub(self) -> None: + """Test that token is resolved from huggingface_hub.""" + with patch("data_designer.integrations.huggingface.client.get_token", return_value="hub-token"): + assert resolve_hf_token(None) == "hub-token" + + def test_resolve_token_none(self) -> None: + """Test that None is returned when no token is available.""" + with patch("data_designer.integrations.huggingface.client.get_token", return_value=None): + assert resolve_hf_token(None) is None + + def test_resolve_token_exception(self) -> None: + """Test that exceptions are handled gracefully.""" + with patch("data_designer.integrations.huggingface.client.get_token", side_effect=Exception()): + assert resolve_hf_token(None) is None + + +class TestParseSizeCategory: + """Tests for parse_size_category function.""" + + def test_small_dataset(self) -> None: + """Test small dataset category.""" + assert parse_size_category(500) == "n<1K" + + def test_medium_dataset(self) -> None: + """Test medium dataset category.""" + assert parse_size_category(5000) == "1K None: + """Test large dataset category.""" + assert 
parse_size_category(500000) == "100K None: + """Test very large dataset category.""" + assert parse_size_category(50000000) == "10M None: + """Test extremely large dataset category.""" + assert parse_size_category(5000000000) == "1B1T" + + +class TestPydanticToDict: + """Tests for pydantic_to_dict function.""" + + def test_pydantic_model(self) -> None: + """Test conversion of Pydantic model.""" + from pydantic import BaseModel + + class TestModel(BaseModel): + name: str + value: int + + obj = TestModel(name="test", value=42) + result = pydantic_to_dict(obj) + assert result == {"name": "test", "value": 42} + + def test_non_pydantic_object(self) -> None: + """Test that non-Pydantic objects are returned as-is.""" + obj = {"key": "value"} + assert pydantic_to_dict(obj) == obj + + +class TestHuggingFaceHubClient: + """Tests for HuggingFaceHubClient class.""" + + @pytest.fixture + def mock_dataset_provider(self) -> Mock: + """Create a mock dataset provider.""" + provider = Mock() + provider.load_dataset.return_value = pd.DataFrame({"col1": [1, 2, 3], "col2": ["a", "b", "c"]}) + return provider + + @pytest.fixture + def mock_artifact_storage(self) -> Mock: + """Create a mock artifact storage.""" + storage = Mock(spec=ArtifactStorage) + storage.base_dataset_path = Path("/tmp/test") + storage.processors_outputs_path = Path("/tmp/test/processors") + storage.metadata_file_path = Path("/tmp/test/metadata.json") + return storage + + @pytest.fixture + def mock_artifact_storage_provider(self, mock_artifact_storage: Mock) -> Mock: + """Create a mock artifact storage provider.""" + provider = Mock() + provider.artifact_storage = mock_artifact_storage + return provider + + @pytest.fixture + def mock_analysis(self) -> Mock: + """Create a mock analysis.""" + analysis = Mock(spec=DatasetProfilerResults) + analysis.num_records = 3 + analysis.target_num_records = 3 + analysis.percent_complete = 100.0 + analysis.column_types = [] + analysis.column_statistics = [] + 
analysis.get_column_statistics_by_type.return_value = [] + return analysis + + @pytest.fixture + def mock_config_builder(self) -> Mock: + """Create a mock config builder.""" + builder = Mock(spec=DataDesignerConfigBuilder) + builder.get_column_configs.return_value = [] + return builder + + @pytest.fixture + def client( + self, + mock_dataset_provider: Mock, + mock_artifact_storage_provider: Mock, + mock_analysis: Mock, + mock_config_builder: Mock, + ) -> HuggingFaceHubClient: + """Create a HuggingFaceHubClient instance.""" + return HuggingFaceHubClient( + dataset_provider=mock_dataset_provider, + artifact_storage_provider=mock_artifact_storage_provider, + analysis=mock_analysis, + config_builder=mock_config_builder, + ) + + def test_init(self, client: HuggingFaceHubClient) -> None: + """Test client initialization.""" + assert client._dataset_provider is not None + assert client._artifact_storage_provider is not None + assert client._analysis is not None + assert client._config_builder is not None + + @patch("data_designer.integrations.huggingface.client.Dataset") + @patch("data_designer.integrations.huggingface.client.resolve_hf_token") + def test_push_to_hub_basic( + self, + mock_resolve_token: Mock, + mock_dataset_class: Mock, + client: HuggingFaceHubClient, + ) -> None: + """Test basic push_to_hub functionality.""" + mock_resolve_token.return_value = "test-token" + mock_hf_dataset = Mock() + mock_dataset_class.from_pandas.return_value = mock_hf_dataset + + with patch.object(client, "_upload_additional_artifacts"), patch.object(client, "_upload_dataset_card"): + client.push_to_hub("test-user/test-dataset", token="test-token", generate_card=False) + + mock_dataset_class.from_pandas.assert_called_once() + mock_hf_dataset.push_to_hub.assert_called_once_with("test-user/test-dataset", token="test-token") + + @patch("data_designer.integrations.huggingface.client.HfApi") + def test_upload_analysis( + self, + mock_hf_api_class: Mock, + client: HuggingFaceHubClient, + 
mock_analysis: Mock, + ) -> None: + """Test uploading analysis results.""" + mock_hf_api = Mock() + mock_hf_api_class.return_value = mock_hf_api + mock_analysis.model_dump.return_value = {"num_records": 3} + + client._upload_analysis(mock_hf_api, "test-user/test-dataset") + + mock_hf_api.upload_file.assert_called_once() + call_args = mock_hf_api.upload_file.call_args + assert call_args.kwargs["path_in_repo"] == "analysis.json" + assert call_args.kwargs["repo_id"] == "test-user/test-dataset" + + @patch("data_designer.integrations.huggingface.client.HfApi") + def test_upload_analysis_none( + self, + mock_hf_api_class: Mock, + mock_dataset_provider: Mock, + mock_artifact_storage_provider: Mock, + mock_config_builder: Mock, + ) -> None: + """Test uploading analysis when analysis is None.""" + client = HuggingFaceHubClient( + dataset_provider=mock_dataset_provider, + artifact_storage_provider=mock_artifact_storage_provider, + analysis=None, + config_builder=mock_config_builder, + ) + mock_hf_api = Mock() + mock_hf_api_class.return_value = mock_hf_api + + client._upload_analysis(mock_hf_api, "test-user/test-dataset") + + mock_hf_api.upload_file.assert_not_called() + + def test_sanitize_metadata_file_paths_absolute( + self, + client: HuggingFaceHubClient, + mock_artifact_storage: Mock, + ) -> None: + """Test sanitizing absolute file paths.""" + mock_artifact_storage.base_dataset_path = Path("/base/path") + metadata = { + "file_paths": [ + "/base/path/data/file1.parquet", + "/base/path/data/file2.parquet", + ] + } + + result = client._sanitize_metadata_file_paths(metadata, mock_artifact_storage) + + assert result["file_paths"] == ["data/data/file1.parquet", "data/data/file2.parquet"] + + def test_sanitize_metadata_file_paths_parquet_files( + self, + client: HuggingFaceHubClient, + mock_artifact_storage: Mock, + ) -> None: + """Test sanitizing parquet file paths.""" + mock_artifact_storage.base_dataset_path = Path("/base/path") + metadata = { + "file_paths": [ + 
"/some/path/parquet-files/file1.parquet", + ] + } + + result = client._sanitize_metadata_file_paths(metadata, mock_artifact_storage) + + assert result["file_paths"] == ["data/parquet-files/file1.parquet"] + + def test_sanitize_metadata_file_paths_no_file_paths( + self, + client: HuggingFaceHubClient, + mock_artifact_storage: Mock, + ) -> None: + """Test sanitizing metadata without file_paths.""" + metadata = {"other_key": "value"} + + result = client._sanitize_metadata_file_paths(metadata, mock_artifact_storage) + + assert result == metadata + + def test_build_column_info(self, client: HuggingFaceHubClient) -> None: + """Test building column information.""" + df = pd.DataFrame({"col1": [1, 2, 3], "col2": ["a", "b", "c"]}) + column_names = set(df.columns) + + result = client._build_column_info(df, column_names) + + assert "col1" in result + assert "col2" in result + assert isinstance(result["col1"], str) + assert isinstance(result["col2"], str) + + def test_find_unconfigured_columns(self, client: HuggingFaceHubClient) -> None: + """Test finding unconfigured columns.""" + df = pd.DataFrame({"col1": [1, 2], "col2": ["a", "b"], "col3": [1.0, 2.0]}) + column_names = {"col1", "col2", "col3"} + mock_config1 = Mock() + mock_config1.name = "col1" + mock_config2 = Mock() + mock_config2.name = "col2" + mock_configs = [mock_config1, mock_config2] + + result = client._find_unconfigured_columns(df, column_names, mock_configs) + + assert "col3" in result + assert "col1" not in result + assert "col2" not in result + assert isinstance(result["col3"], str) + + def test_build_sample_records(self, client: HuggingFaceHubClient) -> None: + """Test building sample records.""" + df = pd.DataFrame({"col1": [1, 2, 3], "col2": ["a", "b", "c"]}) + + result = client._build_sample_records(df) + + assert len(result) == 3 + assert all(isinstance(r, dict) for r in result) + assert result[0]["col1"] == 1 + assert result[0]["col2"] == "a" + + def test_build_sample_records_empty(self, client: 
HuggingFaceHubClient) -> None: + """Test building sample records from empty dataset.""" + df = pd.DataFrame() + + result = client._build_sample_records(df) + + assert result == [] + + def test_build_config_types_summary(self, client: HuggingFaceHubClient) -> None: + """Test building config types summary.""" + class Config1: + pass + + class Config2: + pass + + mock_configs = [ + type("Config1Instance", (Config1,), {})(), + type("Config1Instance2", (Config1,), {})(), + type("Config2Instance", (Config2,), {})(), + ] + + result = client._build_config_types_summary(mock_configs) + + assert result["Config1Instance"] == 1 + assert result["Config1Instance2"] == 1 + assert result["Config2Instance"] == 1 + + @patch("data_designer.integrations.huggingface.client.load_dataset") + @patch("data_designer.integrations.huggingface.client.resolve_hf_token") + def test_pull_from_hub_basic( + self, + mock_resolve_token: Mock, + mock_load_dataset: Mock, + ) -> None: + """Test basic pull_from_hub functionality.""" + mock_resolve_token.return_value = "test-token" + mock_hf_dataset = Mock() + mock_hf_dataset.to_pandas.return_value = pd.DataFrame({"col1": [1, 2, 3]}) + mock_load_dataset.return_value = mock_hf_dataset + + with patch.object(HuggingFaceHubClient, "_load_analysis_from_hub", return_value=None), patch.object( + HuggingFaceHubClient, "_load_processors_from_hub", return_value=(None, None) + ), patch.object(HuggingFaceHubClient, "_load_configs_from_hub", return_value=(None, None, None)): + result = HuggingFaceHubClient.pull_from_hub( + "test-user/test-dataset", + token="test-token", + include_analysis=False, + include_processors=False, + include_configs=False, + ) + + assert result.dataset is not None + assert len(result.dataset) == 3 + + @patch("data_designer.integrations.huggingface.client.hf_hub_download") + def test_load_analysis_from_hub_success( + self, + mock_hf_hub_download: Mock, + ) -> None: + """Test loading analysis from hub successfully.""" + with TemporaryDirectory() 
as tmpdir: + analysis_path = Path(tmpdir) / "analysis.json" + analysis_data = { + "num_records": 10, + "target_num_records": 10, + "column_statistics": [ + { + "column_name": "test_col", + "num_records": 10, + "num_null": 0, + "num_unique": 5, + "pyarrow_dtype": "string", + "simple_dtype": "string", + "column_type": "general", + } + ], + } + with open(analysis_path, "w") as f: + json.dump(analysis_data, f) + + mock_hf_hub_download.return_value = str(analysis_path) + + result = HuggingFaceHubClient._load_analysis_from_hub("test-user/test-dataset", "test-token") + + assert result is not None + assert result.num_records == 10 + assert len(result.column_statistics) == 1 + + @patch("data_designer.integrations.huggingface.client.hf_hub_download") + def test_load_analysis_from_hub_not_found( + self, + mock_hf_hub_download: Mock, + ) -> None: + """Test loading analysis when file is not found.""" + from huggingface_hub.utils import HfHubHTTPError + + mock_hf_hub_download.side_effect = HfHubHTTPError("Not found", response=Mock(status_code=404)) + + result = HuggingFaceHubClient._load_analysis_from_hub("test-user/test-dataset", "test-token") + + assert result is None + + def test_group_processor_files(self) -> None: + """Test grouping processor files by processor name.""" + processor_files = [ + "processors/processor1/file1.parquet", + "processors/processor1/file2.txt", + "processors/processor2/file3.parquet", + ] + + result = HuggingFaceHubClient._group_processor_files(processor_files) + + assert "processor1" in result + assert "processor2" in result + assert len(result["processor1"]) == 2 + assert len(result["processor2"]) == 1 diff --git a/tests/interface/test_hub_integration.py b/tests/interface/test_hub_integration.py index 46515bde2..cb41e36f4 100644 --- a/tests/interface/test_hub_integration.py +++ b/tests/interface/test_hub_integration.py @@ -18,7 +18,7 @@ SamplerColumnConfig, SamplerType, ) -from data_designer.interface.huggingface import pull_from_hub +from 
data_designer.integrations.huggingface import HuggingFaceHubClient from data_designer.interface.results import DatasetCreationResults @@ -101,9 +101,9 @@ def simple_dataset_config(): @pytest.mark.integration -@patch("data_designer.interface.huggingface.hub_mixin.Dataset") -@patch("data_designer.interface.huggingface.hub_mixin.HfApi") -@patch("data_designer.interface.huggingface.hub_mixin.load_dataset") +@patch("data_designer.integrations.huggingface.client.Dataset") +@patch("data_designer.integrations.huggingface.client.HfApi") +@patch("data_designer.integrations.huggingface.client.load_dataset") def test_push_and_pull_from_hub_integration( mock_load_dataset, mock_hf_api_class, @@ -180,8 +180,9 @@ def mock_list_repo_files(repo_id, repo_type, token=None): # Push to hub repo_id = "test-user/test-dataset" - with patch("data_designer.interface.huggingface.hub_mixin.hf_hub_download", side_effect=mock_hf_hub_download), patch( - "data_designer.interface.huggingface.hub_mixin.list_repo_files", side_effect=mock_list_repo_files + with ( + patch("data_designer.integrations.huggingface.client.hf_hub_download", side_effect=mock_hf_hub_download), + patch("data_designer.integrations.huggingface.client.list_repo_files", side_effect=mock_list_repo_files), ): results.push_to_hub(repo_id, token="test-token", generate_card=True) @@ -218,7 +219,7 @@ def mock_list_repo_files(repo_id, repo_type, token=None): assert len(pulled_analysis.column_statistics) == len(original_analysis.column_statistics) # Verify config builder was reconstructed - pulled_config_builder = pulled_results._config_builder + pulled_config_builder = pulled_results.config_builder assert pulled_config_builder is not None pulled_column_configs = pulled_config_builder.get_column_configs() assert len(pulled_column_configs) == 2 # product_category and rating @@ -229,9 +230,9 @@ def mock_list_repo_files(repo_id, repo_type, token=None): @pytest.mark.integration -@patch("data_designer.interface.huggingface.hub_mixin.Dataset") 
-@patch("data_designer.interface.huggingface.hub_mixin.HfApi") -@patch("data_designer.interface.huggingface.hub_mixin.load_dataset") +@patch("data_designer.integrations.huggingface.client.Dataset") +@patch("data_designer.integrations.huggingface.client.HfApi") +@patch("data_designer.integrations.huggingface.client.load_dataset") def test_push_and_pull_with_pull_from_hub_function( mock_load_dataset, mock_hf_api_class, @@ -297,13 +298,14 @@ def mock_list_repo_files(repo_id, repo_type, token=None): # Push to hub repo_id = "test-user/test-dataset-2" - with patch("data_designer.interface.huggingface.hub_mixin.hf_hub_download", side_effect=mock_hf_hub_download), patch( - "data_designer.interface.huggingface.hub_mixin.list_repo_files", side_effect=mock_list_repo_files + with ( + patch("data_designer.integrations.huggingface.client.hf_hub_download", side_effect=mock_hf_hub_download), + patch("data_designer.integrations.huggingface.client.list_repo_files", side_effect=mock_list_repo_files), ): results.push_to_hub(repo_id, token="test-token", generate_card=True) # Pull using pull_from_hub function - hub_results = pull_from_hub( + hub_results = HuggingFaceHubClient.pull_from_hub( repo_id=repo_id, token="test-token", include_analysis=True, diff --git a/tests/interface/test_results.py b/tests/interface/test_results.py index ac159fc42..1ff598dad 100644 --- a/tests/interface/test_results.py +++ b/tests/interface/test_results.py @@ -40,7 +40,7 @@ def test_init(stub_artifact_storage, stub_dataset_profiler_results, stub_complet ) assert results.artifact_storage == stub_artifact_storage assert results._analysis == stub_dataset_profiler_results - assert results._config_builder == stub_complete_builder + assert results.config_builder == stub_complete_builder def test_load_dataset(stub_dataset_creation_results, stub_artifact_storage, stub_dataframe): From 670278c52d743e0044515d3aef704dd62d28b317 Mon Sep 17 00:00:00 2001 From: davidberenstein1957 Date: Wed, 17 Dec 2025 08:56:14 +0100 
Subject: [PATCH 4/5] refactor: streamline config_builder access in WithRecordSamplerMixin - Simplified the retrieval of the config_builder property in the WithRecordSamplerMixin class. - Removed the try-except block for accessing config_builder, directly using the property for improved clarity and efficiency. --- src/data_designer/config/utils/visualization.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/src/data_designer/config/utils/visualization.py b/src/data_designer/config/utils/visualization.py index 9d21169fd..c95123bbe 100644 --- a/src/data_designer/config/utils/visualization.py +++ b/src/data_designer/config/utils/visualization.py @@ -120,15 +120,10 @@ def display_sample_record( else: processor_data_to_display[processor] = self.processor_artifacts[processor] - # Use property if available, otherwise fall back to protected attribute - try: - config_builder = self.config_builder - except AttributeError: - config_builder = self._config_builder display_sample_record( record=record, processor_data_to_display=processor_data_to_display, - config_builder=config_builder, + config_builder=self.config_builder, background_color=background_color, syntax_highlighting_theme=syntax_highlighting_theme, hide_seed_columns=hide_seed_columns, From 5c33306e6c5611c3d3129587653fd3f339be4f4e Mon Sep 17 00:00:00 2001 From: davidberenstein1957 Date: Wed, 17 Dec 2025 09:34:15 +0100 Subject: [PATCH 5/5] refactor: improve code readability and formatting in Hugging Face integration files - Reformatted dictionary comprehension for better readability in `HuggingFaceHubClient`. - Consolidated multiple lines into a single line for downloading processor datasets and artifacts. - Streamlined logging message formatting in `_load_column_config` function. - Removed unnecessary blank lines in various files to enhance code cleanliness. 
--- .../integrations/huggingface/client.py | 13 ++++++------- .../integrations/huggingface/hub_results.py | 1 - .../integrations/huggingface/reconstruction.py | 4 +--- src/data_designer/interface/huggingface/__init__.py | 1 - tests/integrations/huggingface/test_client.py | 9 ++++++--- 5 files changed, 13 insertions(+), 15 deletions(-) diff --git a/src/data_designer/integrations/huggingface/client.py b/src/data_designer/integrations/huggingface/client.py index c197a239d..d22a2064a 100644 --- a/src/data_designer/integrations/huggingface/client.py +++ b/src/data_designer/integrations/huggingface/client.py @@ -652,7 +652,10 @@ def _build_column_stats_by_type(self, analysis: DatasetProfilerResults) -> dict[ except (ValueError, TypeError): continue - return {col_type: [pydantic_to_dict(stat) for stat in stats_list] for col_type, stats_list in column_stats_by_type.items()} + return { + col_type: [pydantic_to_dict(stat) for stat in stats_list] + for col_type, stats_list in column_stats_by_type.items() + } def _sort_column_types(self, column_stats_by_type: dict[str, list[dict[str, Any]]]) -> list[str]: """Sort column types by display order. 
@@ -857,12 +860,8 @@ def _load_processors_from_hub( processor_files = [f for f in repo_files if f.startswith("processors/")] processor_groups = HuggingFaceHubClient._group_processor_files(processor_files) - processor_datasets = HuggingFaceHubClient._download_processor_datasets( - repo_id, token, processor_groups - ) - processor_artifacts = HuggingFaceHubClient._download_processor_artifacts( - repo_id, token, processor_groups - ) + processor_datasets = HuggingFaceHubClient._download_processor_datasets(repo_id, token, processor_groups) + processor_artifacts = HuggingFaceHubClient._download_processor_artifacts(repo_id, token, processor_groups) return processor_datasets or None, processor_artifacts or None except (HfHubHTTPError, FileNotFoundError, Exception): diff --git a/src/data_designer/integrations/huggingface/hub_results.py b/src/data_designer/integrations/huggingface/hub_results.py index 980bbdd5f..ec37e5b52 100644 --- a/src/data_designer/integrations/huggingface/hub_results.py +++ b/src/data_designer/integrations/huggingface/hub_results.py @@ -40,4 +40,3 @@ class HubDatasetResults: model_configs: list[dict[str, Any]] | None = None """Model configurations if available.""" - diff --git a/src/data_designer/integrations/huggingface/reconstruction.py b/src/data_designer/integrations/huggingface/reconstruction.py index 61417c69d..4f832d00a 100644 --- a/src/data_designer/integrations/huggingface/reconstruction.py +++ b/src/data_designer/integrations/huggingface/reconstruction.py @@ -266,7 +266,5 @@ def _load_column_config( try: return config_class.model_validate(col_config_dict) except Exception as e: - logger.warning( - f"Failed to load column config '{col_config_dict.get('name', 'unknown')}': {e}. Skipping." - ) + logger.warning(f"Failed to load column config '{col_config_dict.get('name', 'unknown')}': {e}. 
Skipping.") return None diff --git a/src/data_designer/interface/huggingface/__init__.py b/src/data_designer/interface/huggingface/__init__.py index ea58067e3..b225ea5e7 100644 --- a/src/data_designer/interface/huggingface/__init__.py +++ b/src/data_designer/interface/huggingface/__init__.py @@ -15,4 +15,3 @@ HuggingFaceHubMixin = HuggingFaceHubClient __all__ = ["HuggingFaceHubMixin", "HuggingFaceHubClient", "pull_from_hub", "HubDatasetResults", "resolve_hf_token"] - diff --git a/tests/integrations/huggingface/test_client.py b/tests/integrations/huggingface/test_client.py index 1c7d688ae..a626ac6cf 100644 --- a/tests/integrations/huggingface/test_client.py +++ b/tests/integrations/huggingface/test_client.py @@ -321,6 +321,7 @@ def test_build_sample_records_empty(self, client: HuggingFaceHubClient) -> None: def test_build_config_types_summary(self, client: HuggingFaceHubClient) -> None: """Test building config types summary.""" + class Config1: pass @@ -352,9 +353,11 @@ def test_pull_from_hub_basic( mock_hf_dataset.to_pandas.return_value = pd.DataFrame({"col1": [1, 2, 3]}) mock_load_dataset.return_value = mock_hf_dataset - with patch.object(HuggingFaceHubClient, "_load_analysis_from_hub", return_value=None), patch.object( - HuggingFaceHubClient, "_load_processors_from_hub", return_value=(None, None) - ), patch.object(HuggingFaceHubClient, "_load_configs_from_hub", return_value=(None, None, None)): + with ( + patch.object(HuggingFaceHubClient, "_load_analysis_from_hub", return_value=None), + patch.object(HuggingFaceHubClient, "_load_processors_from_hub", return_value=(None, None)), + patch.object(HuggingFaceHubClient, "_load_configs_from_hub", return_value=(None, None, None)), + ): result = HuggingFaceHubClient.pull_from_hub( "test-user/test-dataset", token="test-token",