diff --git a/CLAUDE.md b/CLAUDE.md index 5f1019f..b9c5b93 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -51,6 +51,9 @@ just test # Run all tests with coverage just test tests/test_dataset.py # Run specific test file just lint # Run ruff check + format check just docs # Build documentation (runs quartodoc + quarto) +just sync-lexicons # Fetch latest lexicons from atdata-lexicon repo +just sync-lexicons v1.0 # Pin to a specific tag/branch (ref is a positional arg) +just sync-lexicons-local # Sync vendored → package (offline, no fetch) just bench # Run full benchmark suite just bench-io # Run I/O benchmarks only just bench-index # Run index provider benchmarks diff --git a/justfile b/justfile index bbb5f87..6014c4d 100644 --- a/justfile +++ b/justfile @@ -4,14 +4,55 @@ setup: git config core.hooksPath .githooks @echo "Git hooks activated from .githooks/" -sync-lexicons: - cp lexicons/*.json src/atdata/lexicons/ +# Fetch latest lexicons from atdata-lexicon repo and sync to both local copies. +# Uses gh CLI to download a tarball — no clone needed. +# Pass a ref positionally to pin a specific version (default: main). +sync-lexicons ref="main": + #!/usr/bin/env bash + set -euo pipefail + REPO="forecast-bio/atdata-lexicon" + VENDOR="lexicons/science/alt/dataset" + PKG="src/atdata/lexicons/science/alt/dataset" + TMPDIR=$(mktemp -d) + trap 'rm -rf "$TMPDIR"' EXIT + echo "Fetching lexicons from $REPO@{{ref}}..." 
+ gh api "repos/$REPO/tarball/{{ref}}" > "$TMPDIR/archive.tar.gz" + tar xzf "$TMPDIR/archive.tar.gz" -C "$TMPDIR" --strip-components=1 + # Copy NSID lexicons to both vendor and package directories + for f in "$TMPDIR/lexicons/science/alt/dataset/"*.json; do + name=$(basename "$f") + cp "$f" "$VENDOR/$name" + cp "$f" "$PKG/$name" + echo " synced $name" + done + echo "Lexicons synced from $REPO@{{ref}}" + # Also sync top-level shim files if they exist upstream + for f in "$TMPDIR/lexicons/"*.json; do + [ -f "$f" ] || continue + name=$(basename "$f") + cp "$f" "lexicons/$name" + if [ -f "src/atdata/lexicons/$name" ]; then cp "$f" "src/atdata/lexicons/$name"; fi + done + +# Sync local vendored lexicons → package (no network, for offline use) +sync-lexicons-local: + #!/usr/bin/env bash + set -euo pipefail + for f in lexicons/science/alt/dataset/*.json; do + name=$(basename "$f") + cp "$f" "src/atdata/lexicons/science/alt/dataset/$name" + done + for f in lexicons/*.json; do + name=$(basename "$f") + if [ -f "src/atdata/lexicons/$name" ]; then cp "$f" "src/atdata/lexicons/$name"; fi + done + echo "Local lexicon sync complete" gen-lexicon-docs: uv run python scripts/gen_lexicon_docs.py test *args: - just sync-lexicons + just sync-lexicons-local uv run pytest {{args}} lint: diff --git a/lexicons/science/alt/dataset/entry.json b/lexicons/science/alt/dataset/entry.json index 6cde1cb..159c9d7 100644 --- a/lexicons/science/alt/dataset/entry.json +++ b/lexicons/science/alt/dataset/entry.json @@ -78,10 +78,44 @@ "contentMetadata": { "type": "unknown", "description": "Dataset-level content metadata (e.g., instrument settings, acquisition parameters). Structure is validated against the schema referenced by metadataSchemaRef when present. Stored as an open JSON object." + }, + "manifests": { + "type": "array", + "description": "Per-shard manifest references for query-based access. 
Each entry pairs a shard with its manifest header blob and optional Parquet samples blob.", + "items": { + "type": "ref", + "ref": "#shardManifestRef" + }, + "maxLength": 10000 } } } }, + "shardManifestRef": { + "type": "object", + "description": "References to manifest sidecar data for a single shard. The header contains schema info, sample count, and per-field aggregates. The samples file is a Parquet table with per-sample metadata for query-based access.", + "required": [ + "header" + ], + "properties": { + "header": { + "type": "blob", + "description": "Manifest JSON header blob containing shard-level metadata (schema, sample count, field aggregates)", + "accept": [ + "application/json" + ], + "maxSize": 1048576 + }, + "samples": { + "type": "blob", + "description": "Optional Parquet file with per-sample metadata for query-based filtering", + "accept": [ + "application/octet-stream" + ], + "maxSize": 104857600 + } + } + }, "shardChecksum": { "type": "object", "description": "Content hash for shard integrity verification. Algorithm is flexible to allow SHA-256, BLAKE3, or other hash functions.", diff --git a/src/atdata/lexicons/science/alt/dataset/entry.json b/src/atdata/lexicons/science/alt/dataset/entry.json index 6cde1cb..159c9d7 100644 --- a/src/atdata/lexicons/science/alt/dataset/entry.json +++ b/src/atdata/lexicons/science/alt/dataset/entry.json @@ -78,10 +78,44 @@ "contentMetadata": { "type": "unknown", "description": "Dataset-level content metadata (e.g., instrument settings, acquisition parameters). Structure is validated against the schema referenced by metadataSchemaRef when present. Stored as an open JSON object." + }, + "manifests": { + "type": "array", + "description": "Per-shard manifest references for query-based access. 
Each entry pairs a shard with its manifest header blob and optional Parquet samples blob.", + "items": { + "type": "ref", + "ref": "#shardManifestRef" + }, + "maxLength": 10000 } } } }, + "shardManifestRef": { + "type": "object", + "description": "References to manifest sidecar data for a single shard. The header contains schema info, sample count, and per-field aggregates. The samples file is a Parquet table with per-sample metadata for query-based access.", + "required": [ + "header" + ], + "properties": { + "header": { + "type": "blob", + "description": "Manifest JSON header blob containing shard-level metadata (schema, sample count, field aggregates)", + "accept": [ + "application/json" + ], + "maxSize": 1048576 + }, + "samples": { + "type": "blob", + "description": "Optional Parquet file with per-sample metadata for query-based filtering", + "accept": [ + "application/octet-stream" + ], + "maxSize": 104857600 + } + } + }, "shardChecksum": { "type": "object", "description": "Content hash for shard integrity verification. Algorithm is flexible to allow SHA-256, BLAKE3, or other hash functions.",