Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions src/atdata/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,10 @@
create_repository as create_repository,
)

from .dataset_meta import (
DatasetMeta as DatasetMeta,
)

from .index import (
Index as Index,
)
Expand Down
124 changes: 114 additions & 10 deletions src/atdata/atmosphere/records.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
BlobEntry,
ShardChecksum,
)
from ..dataset_meta import DatasetMeta, _resolve_meta

# Import for type checking only to avoid circular imports
from typing import TYPE_CHECKING
Expand Down Expand Up @@ -157,7 +158,8 @@ def publish(
self,
dataset: "Dataset[ST]",
*,
name: str,
name: str | None = None,
meta: DatasetMeta | None = None,
schema_uri: Optional[str] = None,
description: Optional[str] = None,
tags: Optional[list[str]] = None,
Expand All @@ -172,7 +174,11 @@ def publish(

Args:
dataset: The Dataset to publish.
name: Human-readable dataset name.
name: Human-readable dataset name. Can be provided via
*meta* instead.
meta: Optional :class:`~atdata.DatasetMeta` bundling name,
schema_ref, description, tags, license, and metadata.
Explicit keyword arguments override fields in *meta*.
schema_uri: AT URI of the schema record. If not provided and
auto_publish_schema is True, the schema will be published.
description: Human-readable description.
Expand All @@ -194,7 +200,21 @@ def publish(

Raises:
ValueError: If schema_uri is not provided and auto_publish_schema is False.
TypeError: If neither *name* nor *meta* is provided.
"""
resolved = _resolve_meta(
meta,
name=name,
schema_ref=schema_uri,
description=description,
tags=tags,
license=license,
)
name = resolved.name
schema_uri = resolved.schema_ref if schema_uri is None else schema_uri
description = resolved.description
tags = resolved.tags
license = resolved.license
if schema_uri is None:
if not auto_publish_schema:
raise ValueError(
Expand Down Expand Up @@ -253,7 +273,8 @@ def publish_with_urls(
urls: list[str],
schema_uri: str,
*,
name: str,
name: str | None = None,
meta: DatasetMeta | None = None,
description: Optional[str] = None,
tags: Optional[list[str]] = None,
license: Optional[str] = None,
Expand All @@ -272,7 +293,11 @@ def publish_with_urls(
Args:
urls: List of individual shard URLs.
schema_uri: AT URI of the schema record.
name: Human-readable dataset name.
name: Human-readable dataset name. Can be provided via
*meta* instead.
meta: Optional :class:`~atdata.DatasetMeta` bundling name,
schema_ref, description, tags, license, and metadata.
Explicit keyword arguments override fields in *meta*.
description: Human-readable description.
tags: Searchable tags for discovery.
license: SPDX license identifier.
Expand All @@ -285,7 +310,23 @@ def publish_with_urls(

Returns:
The AT URI of the created dataset record.

Raises:
TypeError: If neither *name* nor *meta* is provided.
"""
resolved = _resolve_meta(
meta,
name=name,
description=description,
tags=tags,
license=license,
metadata=metadata,
)
name = resolved.name
description = resolved.description
tags = resolved.tags
license = resolved.license
metadata = resolved.metadata
if checksums and len(checksums) != len(urls):
raise ValueError(
f"checksums length ({len(checksums)}) must match "
Expand Down Expand Up @@ -319,7 +360,8 @@ def publish_with_s3(
keys: list[str],
schema_uri: str,
*,
name: str,
name: str | None = None,
meta: DatasetMeta | None = None,
region: Optional[str] = None,
endpoint: Optional[str] = None,
description: Optional[str] = None,
Expand All @@ -337,7 +379,11 @@ def publish_with_s3(
bucket: S3 bucket name.
keys: List of S3 object keys for shard files.
schema_uri: AT URI of the schema record.
name: Human-readable dataset name.
name: Human-readable dataset name. Can be provided via
*meta* instead.
meta: Optional :class:`~atdata.DatasetMeta` bundling name,
schema_ref, description, tags, license, and metadata.
Explicit keyword arguments override fields in *meta*.
region: AWS region (e.g., 'us-east-1').
endpoint: Custom S3-compatible endpoint URL.
description: Human-readable description.
Expand All @@ -351,7 +397,23 @@ def publish_with_s3(

Returns:
The AT URI of the created dataset record.

Raises:
TypeError: If neither *name* nor *meta* is provided.
"""
resolved = _resolve_meta(
meta,
name=name,
description=description,
tags=tags,
license=license,
metadata=metadata,
)
name = resolved.name
description = resolved.description
tags = resolved.tags
license = resolved.license
metadata = resolved.metadata
if checksums and len(checksums) != len(keys):
raise ValueError(
f"checksums length ({len(checksums)}) must match "
Expand Down Expand Up @@ -384,7 +446,8 @@ def publish_with_blob_refs(
blob_refs: list[dict],
schema_uri: str,
*,
name: str,
name: str | None = None,
meta: DatasetMeta | None = None,
description: Optional[str] = None,
tags: Optional[list[str]] = None,
license: Optional[str] = None,
Expand All @@ -406,7 +469,11 @@ def publish_with_blob_refs(
``Atmosphere.upload_blob()``. Each dict must contain
``$type``, ``ref`` (with ``$link``), ``mimeType``, and ``size``.
schema_uri: AT URI of the schema record.
name: Human-readable dataset name.
name: Human-readable dataset name. Can be provided via
*meta* instead.
meta: Optional :class:`~atdata.DatasetMeta` bundling name,
schema_ref, description, tags, license, and metadata.
Explicit keyword arguments override fields in *meta*.
description: Human-readable description.
tags: Searchable tags for discovery.
license: SPDX license identifier.
Expand All @@ -419,7 +486,23 @@ def publish_with_blob_refs(

Returns:
The AT URI of the created dataset record.

Raises:
TypeError: If neither *name* nor *meta* is provided.
"""
resolved = _resolve_meta(
meta,
name=name,
description=description,
tags=tags,
license=license,
metadata=metadata,
)
name = resolved.name
description = resolved.description
tags = resolved.tags
license = resolved.license
metadata = resolved.metadata
if checksums and len(checksums) != len(blob_refs):
raise ValueError(
f"checksums length ({len(checksums)}) must match "
Expand Down Expand Up @@ -452,7 +535,8 @@ def publish_with_blobs(
blobs: list[bytes],
schema_uri: str,
*,
name: str,
name: str | None = None,
meta: DatasetMeta | None = None,
description: Optional[str] = None,
tags: Optional[list[str]] = None,
license: Optional[str] = None,
Expand All @@ -471,7 +555,11 @@ def publish_with_blobs(
Args:
blobs: List of binary data (e.g., tar shards) to upload as blobs.
schema_uri: AT URI of the schema record.
name: Human-readable dataset name.
name: Human-readable dataset name. Can be provided via
*meta* instead.
meta: Optional :class:`~atdata.DatasetMeta` bundling name,
schema_ref, description, tags, license, and metadata.
Explicit keyword arguments override fields in *meta*.
description: Human-readable description.
tags: Searchable tags for discovery.
license: SPDX license identifier.
Expand All @@ -484,10 +572,26 @@ def publish_with_blobs(
Returns:
The AT URI of the created dataset record.

Raises:
TypeError: If neither *name* nor *meta* is provided.

Note:
Blobs are only retained by the PDS when referenced in a committed
record. This method handles that automatically.
"""
resolved = _resolve_meta(
meta,
name=name,
description=description,
tags=tags,
license=license,
metadata=metadata,
)
name = resolved.name
description = resolved.description
tags = resolved.tags
license = resolved.license
metadata = resolved.metadata
import hashlib

blob_entries = []
Expand Down
103 changes: 103 additions & 0 deletions src/atdata/dataset_meta.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
"""DatasetMeta parameter object for bundling shared metadata fields.

Reduces parameter explosion across ``Index.insert_dataset``,
``Index.write_samples``, and ``DatasetPublisher.publish*`` by collecting
the six metadata fields (name, schema_ref, description, tags, license,
metadata) into a single dataclass.
"""

from __future__ import annotations

import dataclasses
from dataclasses import dataclass


@dataclass
class DatasetMeta:
    """Parameter object collecting the metadata attached to a dataset.

    One instance stands in for the six keyword arguments that are
    otherwise repeated across ``write_samples``, ``insert_dataset``,
    and the atmosphere ``publish*`` family of methods.

    Args:
        name: Human-readable name for the dataset.
        schema_ref: Optional schema reference (AT URI or local ref).
        description: Optional dataset description.
        tags: Optional tags for discovery.
        license: Optional SPDX license identifier.
        metadata: Optional arbitrary metadata dict.

    Examples:
        >>> meta = DatasetMeta(name="mnist", tags=["vision"])
        >>> meta.name
        'mnist'
    """

    # Field order and defaults are part of the public constructor
    # signature — do not reorder.
    name: str
    schema_ref: str | None = None
    description: str | None = None
    tags: list[str] | None = None
    license: str | None = None
    metadata: dict | None = None


def _resolve_meta(
    meta: DatasetMeta | None = None,
    *,
    name: str | None = None,
    schema_ref: str | None = None,
    description: str | None = None,
    tags: list[str] | None = None,
    license: str | None = None,
    metadata: dict | None = None,
) -> DatasetMeta:
    """Build a ``DatasetMeta`` from a meta object, flat kwargs, or both.

    Explicit keyword arguments take precedence over the matching
    fields of *meta* whenever both are supplied (explicit wins).

    Args:
        meta: Optional pre-built metadata object.
        name: Dataset name (required if *meta* is ``None``).
        schema_ref: Optional schema reference override.
        description: Optional description override.
        tags: Optional tags override.
        license: Optional license override.
        metadata: Optional metadata dict override.

    Returns:
        Resolved ``DatasetMeta`` instance.

    Raises:
        TypeError: If neither *name* nor *meta* is provided.
    """
    # Collect only the kwargs the caller actually set; None means
    # "not provided" for every field here.
    explicit: dict = {
        field: value
        for field, value in (
            ("name", name),
            ("schema_ref", schema_ref),
            ("description", description),
            ("tags", tags),
            ("license", license),
            ("metadata", metadata),
        )
        if value is not None
    }

    if meta is None:
        if name is None:
            raise TypeError("Either 'meta' or 'name' must be provided.")
        # Unset fields fall back to the dataclass defaults (all None).
        return DatasetMeta(**explicit)

    # Explicit kwargs override the corresponding fields of *meta*;
    # with no overrides the original object is returned unchanged.
    return dataclasses.replace(meta, **explicit) if explicit else meta
Loading
Loading