From ea81c1adfba4febc58d1b68812bfbb9df11a72fa Mon Sep 17 00:00:00 2001
From: Maxine Levesque <220467675+maxine-at-forecast@users.noreply.github.com>
Date: Fri, 3 Apr 2026 19:27:47 -0700
Subject: [PATCH 1/2] feat: add manifests property to dataset entry lexicon

Add shardManifestRef def and optional manifests array to
science.alt.dataset.entry lexicon, aligning the JSON spec with
the existing Python types (ShardManifestRef, LexDatasetEntry.manifests).

Each manifest ref pairs a shard with its header blob (JSON metadata)
and optional Parquet samples blob for query-based access.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 lexicons/science/alt/dataset/entry.json       | 34 +++++++++++++++++++
 .../lexicons/science/alt/dataset/entry.json   | 34 +++++++++++++++++++
 2 files changed, 68 insertions(+)

diff --git a/lexicons/science/alt/dataset/entry.json b/lexicons/science/alt/dataset/entry.json
index 6cde1cb..159c9d7 100644
--- a/lexicons/science/alt/dataset/entry.json
+++ b/lexicons/science/alt/dataset/entry.json
@@ -78,10 +78,44 @@
           "contentMetadata": {
             "type": "unknown",
             "description": "Dataset-level content metadata (e.g., instrument settings, acquisition parameters). Structure is validated against the schema referenced by metadataSchemaRef when present. Stored as an open JSON object."
+          },
+          "manifests": {
+            "type": "array",
+            "description": "Per-shard manifest references for query-based access. Each entry pairs a shard with its manifest header blob and optional Parquet samples blob.",
+            "items": {
+              "type": "ref",
+              "ref": "#shardManifestRef"
+            },
+            "maxLength": 10000
           }
         }
       }
     },
+    "shardManifestRef": {
+      "type": "object",
+      "description": "References to manifest sidecar data for a single shard. The header contains schema info, sample count, and per-field aggregates. The samples file is a Parquet table with per-sample metadata for query-based access.",
+      "required": [
+        "header"
+      ],
+      "properties": {
+        "header": {
+          "type": "blob",
+          "description": "Manifest JSON header blob containing shard-level metadata (schema, sample count, field aggregates)",
+          "accept": [
+            "application/json"
+          ],
+          "maxSize": 1048576
+        },
+        "samples": {
+          "type": "blob",
+          "description": "Optional Parquet file with per-sample metadata for query-based filtering",
+          "accept": [
+            "application/octet-stream"
+          ],
+          "maxSize": 104857600
+        }
+      }
+    },
     "shardChecksum": {
       "type": "object",
       "description": "Content hash for shard integrity verification. Algorithm is flexible to allow SHA-256, BLAKE3, or other hash functions.",
diff --git a/src/atdata/lexicons/science/alt/dataset/entry.json b/src/atdata/lexicons/science/alt/dataset/entry.json
index 6cde1cb..159c9d7 100644
--- a/src/atdata/lexicons/science/alt/dataset/entry.json
+++ b/src/atdata/lexicons/science/alt/dataset/entry.json
@@ -78,10 +78,44 @@
           "contentMetadata": {
             "type": "unknown",
             "description": "Dataset-level content metadata (e.g., instrument settings, acquisition parameters). Structure is validated against the schema referenced by metadataSchemaRef when present. Stored as an open JSON object."
+          },
+          "manifests": {
+            "type": "array",
+            "description": "Per-shard manifest references for query-based access. Each entry pairs a shard with its manifest header blob and optional Parquet samples blob.",
+            "items": {
+              "type": "ref",
+              "ref": "#shardManifestRef"
+            },
+            "maxLength": 10000
           }
         }
       }
     },
+    "shardManifestRef": {
+      "type": "object",
+      "description": "References to manifest sidecar data for a single shard. The header contains schema info, sample count, and per-field aggregates. The samples file is a Parquet table with per-sample metadata for query-based access.",
+      "required": [
+        "header"
+      ],
+      "properties": {
+        "header": {
+          "type": "blob",
+          "description": "Manifest JSON header blob containing shard-level metadata (schema, sample count, field aggregates)",
+          "accept": [
+            "application/json"
+          ],
+          "maxSize": 1048576
+        },
+        "samples": {
+          "type": "blob",
+          "description": "Optional Parquet file with per-sample metadata for query-based filtering",
+          "accept": [
+            "application/octet-stream"
+          ],
+          "maxSize": 104857600
+        }
+      }
+    },
     "shardChecksum": {
       "type": "object",
       "description": "Content hash for shard integrity verification. Algorithm is flexible to allow SHA-256, BLAKE3, or other hash functions.",

From 30c2b2a02ec96fc4bcf61268091f2f8e9055eefe Mon Sep 17 00:00:00 2001
From: Maxine Levesque <220467675+maxine-at-forecast@users.noreply.github.com>
Date: Fri, 3 Apr 2026 23:04:17 -0700
Subject: [PATCH 2/2] feat: add lexicon sync from atdata-lexicon repo
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replace broken sync-lexicons recipe (was copying flat lexicons/*.json)
with two recipes:
- just sync-lexicons: fetches latest from forecast-bio/atdata-lexicon
  via gh API tarball, syncs to both vendored and package directories.
  Accepts an optional positional ref argument for pinning to a tag/branch.
- just sync-lexicons-local: offline vendor→package sync (no network).

Test recipe now uses sync-lexicons-local to avoid network dependency.
Updated CLAUDE.md with the new commands.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 CLAUDE.md |  3 +++
 justfile  | 47 ++++++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 47 insertions(+), 3 deletions(-)

diff --git a/CLAUDE.md b/CLAUDE.md
index 5f1019f..b9c5b93 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -51,6 +51,9 @@ just test              # Run all tests with coverage
 just test tests/test_dataset.py  # Run specific test file
 just lint              # Run ruff check + format check
 just docs              # Build documentation (runs quartodoc + quarto)
+just sync-lexicons     # Fetch latest lexicons from atdata-lexicon repo
+just sync-lexicons v1.0  # Pin to a specific tag/branch (positional argument)
+just sync-lexicons-local  # Sync vendored → package (offline, no fetch)
 just bench             # Run full benchmark suite
 just bench-io          # Run I/O benchmarks only
 just bench-index       # Run index provider benchmarks
diff --git a/justfile b/justfile
index bbb5f87..6014c4d 100644
--- a/justfile
+++ b/justfile
@@ -4,14 +4,55 @@ setup:
     git config core.hooksPath .githooks
     @echo "Git hooks activated from .githooks/"
 
-sync-lexicons:
-    cp lexicons/*.json src/atdata/lexicons/
+# Fetch lexicons from the atdata-lexicon repo and sync them to both local copies.
+# Uses gh CLI to download a tarball — no clone needed.
+# The ref is positional: `just sync-lexicons v1.0` pins a tag/branch (default: main).
+sync-lexicons ref="main":
+    #!/usr/bin/env bash
+    set -euo pipefail
+    REPO="forecast-bio/atdata-lexicon"
+    VENDOR="lexicons/science/alt/dataset"
+    PKG="src/atdata/lexicons/science/alt/dataset"
+    WORKDIR=$(mktemp -d)  # deliberately not TMPDIR, to avoid shadowing the standard env var
+    trap 'rm -rf "$WORKDIR"' EXIT
+    echo "Fetching lexicons from $REPO@{{ref}}..."
+    gh api "repos/$REPO/tarball/{{ref}}" > "$WORKDIR/archive.tar.gz"
+    tar xzf "$WORKDIR/archive.tar.gz" -C "$WORKDIR" --strip-components=1
+    # Copy NSID lexicons to both vendor and package directories
+    for f in "$WORKDIR/lexicons/science/alt/dataset/"*.json; do
+        name=$(basename "$f")
+        cp "$f" "$VENDOR/$name"
+        cp "$f" "$PKG/$name"
+        echo "  synced $name"
+    done
+    # Also sync top-level shim files if they exist upstream (package copy only if already present)
+    for f in "$WORKDIR/lexicons/"*.json; do
+        [ -f "$f" ] || continue
+        name=$(basename "$f")
+        cp "$f" "lexicons/$name"
+        if [ -f "src/atdata/lexicons/$name" ]; then cp "$f" "src/atdata/lexicons/$name"; fi
+    done
+    echo "Lexicons synced from $REPO@{{ref}}"
+
+# Offline sync: copy the vendored lexicons into the package tree (no network access)
+sync-lexicons-local:
+    #!/usr/bin/env bash
+    set -euo pipefail
+    for src in lexicons/science/alt/dataset/*.json; do
+        base=$(basename "$src")
+        cp "$src" "src/atdata/lexicons/science/alt/dataset/$base"
+    done
+    for src in lexicons/*.json; do
+        dest="src/atdata/lexicons/$(basename "$src")"
+        if [ -f "$dest" ]; then cp "$src" "$dest"; fi
+    done
+    echo "Local lexicon sync complete"
 
 gen-lexicon-docs:
     uv run python scripts/gen_lexicon_docs.py
 
 test *args:
-    just sync-lexicons
+    just sync-lexicons-local
     uv run pytest {{args}}
 
 lint: