Skip to content

Commit 3ddd8b6

Browse files
committed
update agents and tests
1 parent 045b9bb commit 3ddd8b6

4 files changed

Lines changed: 239 additions & 41 deletions

File tree

AGENTS.md

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -221,12 +221,23 @@ Datasets are typically uploaded to the `just-dna-seq` organization on Hugging Fa
221221
- **Delete redundant tests**: If test A (e.g., set equality) fully covers test B (e.g., count check), keep only test A.
222222
- **Domain constants are OK**: Hardcoding expected enum values or well-known constants from specs is fine; hardcoding row counts or unique counts derived from data inspection is not.
223223

224+
#### Verifying Bug-Catching Claims
225+
226+
When claiming a test "would have caught" a bug, **demonstrate it**:
227+
228+
1. **Isolate the buggy logic** in a test or script
229+
2. **Run it and show failure** against correct expectations
230+
3. **Then show the fix passes** the same test
231+
232+
Never claim "tests would have caught this" without running the buggy code against the test.
233+
224234
#### Anti-Patterns to Avoid
225235

226236
- Testing only "happy path" with trivial data
227237
- Hardcoding expected values that drift from source (use derived ground truth)
228238
- Mocking data transformations instead of running real pipelines
229239
- Ignoring edge cases (nulls, empty strings, boundary values, unicode, malformed data)
240+
- **Claiming tests "would catch bugs" without demonstrating failure on buggy code**
230241

231242
**Meaningless Tests to Avoid** (common AI-generated anti-patterns):
232243

src/prepare_annotations/core/dagster_io_managers.py

Lines changed: 42 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -124,60 +124,64 @@ class ModuleIOManager(IOManager):
124124
"""
125125
IO Manager for OakVar module assets.
126126
127+
This IO manager stores the actual paths returned by assets, so downstream
128+
assets can load them correctly without needing to guess/reconstruct paths.
129+
127130
Handles paths for:
128131
- Module SQLite databases in data/modules/just_{module}/
129132
- Converted parquet files in data/output/modules/{module}/
133+
134+
Path metadata is stored in a JSON file alongside the Dagster storage.
130135
"""
131136

132-
# Mapping of asset keys to actual SQLite filenames (as they exist in GitHub repos)
133-
SQLITE_FILENAMES: dict[str, str] = {
134-
"longevitymap": "longevitymap.sqlite",
135-
"lipidmetabolism": "lipid_metabolism.sqlite",
136-
"vo2max": "vo2max.sqlite",
137-
"superhuman": "superhuman.sqlite",
138-
"coronary": "coronary.sqlite",
139-
}
137+
def __init__(self) -> None:
138+
# Path registry persisted to disk
139+
self._registry_path = MODULES_OUTPUT_DIR / ".asset_paths.json"
140140

141-
def _get_asset_path(self, asset_key: str) -> Path:
142-
"""Resolve the path for a module asset based on its key."""
143-
if asset_key.endswith("_sqlite"):
144-
module = asset_key.replace("_sqlite", "")
145-
sqlite_filename = self.SQLITE_FILENAMES.get(module, f"{module}.sqlite")
146-
return MODULES_DIR / f"just_{module}" / sqlite_filename
147-
148-
# Handle longevitymap_with_ensembl specifically
149-
if asset_key == "longevitymap_with_ensembl":
150-
return MODULES_OUTPUT_DIR / "longevitymap" / "longevitymap_ensembl_joined.parquet"
151-
152-
# Standard module assets: {module}_{type} (e.g., longevitymap_annotations)
153-
parts = asset_key.split("_")
154-
if len(parts) >= 2:
155-
module = parts[0]
156-
type_name = "_".join(parts[1:])
157-
return MODULES_OUTPUT_DIR / module / f"{type_name}.parquet"
158-
159-
return MODULES_OUTPUT_DIR / asset_key
141+
def _load_registry(self) -> dict[str, str]:
142+
"""Load the asset path registry from disk."""
143+
if self._registry_path.exists():
144+
import json
145+
return json.loads(self._registry_path.read_text())
146+
return {}
147+
148+
def _save_registry(self, registry: dict[str, str]) -> None:
149+
"""Save the asset path registry to disk."""
150+
import json
151+
self._registry_path.parent.mkdir(parents=True, exist_ok=True)
152+
self._registry_path.write_text(json.dumps(registry, indent=2))
160153

161154
def handle_output(self, context: OutputContext, obj: Any) -> None:
162-
"""Log where the asset was stored."""
155+
"""Store the asset path in registry for later retrieval."""
156+
asset_key = context.asset_key.to_user_string()
157+
163158
if isinstance(obj, Path):
159+
# Store the actual path returned by the asset
160+
registry = self._load_registry()
161+
registry[asset_key] = str(obj)
162+
self._save_registry(registry)
164163
context.log.info(f"Module asset stored at: {obj}")
165164
else:
166165
context.log.info(f"Module asset materialized: {type(obj).__name__}")
167166

168167
def load_input(self, context: InputContext) -> Any:
169-
"""Load asset by returning its expected path."""
168+
"""Load asset by looking up its stored path."""
170169
asset_key = context.upstream_output.asset_key.to_user_string() if context.upstream_output else "unknown"
171-
path = self._get_asset_path(asset_key)
172170

173-
if not path.exists():
174-
raise FileNotFoundError(
175-
f"Module asset not found at {path}. "
176-
f"Materialize the {asset_key} asset first."
177-
)
178-
179-
context.log.info(f"Loading module data from: {path}")
180-
return path
171+
# Look up the actual path from registry
172+
registry = self._load_registry()
173+
if asset_key in registry:
174+
path = Path(registry[asset_key])
175+
if path.exists():
176+
context.log.info(f"Loading module data from registry: {path}")
177+
return path
178+
179+
# Fallback: raise clear error
180+
raise FileNotFoundError(
181+
f"Module asset '{asset_key}' not found in registry. "
182+
f"Materialize the {asset_key} asset first. "
183+
f"Registry path: {self._registry_path}"
184+
)
181185

182186

183187
@io_manager

src/prepare_annotations/huggingface/uploader.py

Lines changed: 27 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,12 @@
66
"""
77

88
from pathlib import Path
9-
from typing import Optional, Dict, List
9+
from typing import Optional, List
1010
from eliot import start_action
1111
from huggingface_hub import HfApi, hf_hub_download, list_repo_files, CommitOperationAdd
1212
from huggingface_hub.utils import RepositoryNotFoundError, HfHubHTTPError
13+
from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type
14+
import httpx
1315

1416
from prepare_annotations.core.models import SingleUploadResult, BatchUploadResult
1517
from prepare_annotations.huggingface.dataset_cards import (
@@ -18,6 +20,10 @@
1820
save_dataset_card
1921
)
2022

23+
# Retry configuration for HuggingFace uploads
24+
HF_UPLOAD_MAX_RETRIES = 3
25+
HF_UPLOAD_TIMEOUT_SECONDS = 300 # 5 minutes
26+
2127

2228
def upload_to_hf_if_changed(
2329
parquet_file: Path,
@@ -336,20 +342,38 @@ def upload_files_batch(
336342
if commit_message is None:
337343
commit_message = f"Upload {num_uploading} parquet file{'s' if num_uploading > 1 else ''}"
338344

339-
try:
340-
commit_info = api.create_commit(
345+
# Retry logic for transient network errors
346+
@retry(
347+
stop=stop_after_attempt(HF_UPLOAD_MAX_RETRIES),
348+
wait=wait_exponential(multiplier=1, min=4, max=60),
349+
retry=retry_if_exception_type((httpx.TimeoutException, httpx.NetworkError)),
350+
reraise=True,
351+
)
352+
def create_commit_with_retry():
353+
return api.create_commit(
341354
repo_id=repo_id,
342355
repo_type=repo_type,
343356
operations=operations,
344357
commit_message=commit_message,
345358
)
359+
360+
try:
361+
commit_info = create_commit_with_retry()
346362

347363
action.log(
348364
message_type="success",
349365
step="commit_created",
350366
commit_url=commit_info.commit_url,
351367
num_files=num_uploading
352368
)
369+
except (httpx.TimeoutException, httpx.NetworkError) as e:
370+
action.log(
371+
message_type="error",
372+
step="commit_failed_after_retries",
373+
error=str(e),
374+
max_retries=HF_UPLOAD_MAX_RETRIES
375+
)
376+
raise
353377
except Exception as e:
354378
action.log(
355379
message_type="error",

tests/test_module_io_manager.py

Lines changed: 159 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,159 @@
1+
"""
2+
Tests for ModuleIOManager path registry.
3+
4+
The IO manager now properly stores the paths returned by assets in a registry,
5+
instead of trying to guess/reconstruct paths from asset keys.
6+
"""
7+
from __future__ import annotations
8+
9+
from pathlib import Path
10+
from unittest.mock import MagicMock
11+
12+
import pytest
13+
14+
from prepare_annotations.core.dagster_io_managers import ModuleIOManager
15+
16+
17+
@pytest.fixture
18+
def io_manager(tmp_path: Path) -> ModuleIOManager:
19+
"""Create an IO manager with a temp registry path."""
20+
manager = ModuleIOManager()
21+
manager._registry_path = tmp_path / ".asset_paths.json"
22+
return manager
23+
24+
25+
class TestModuleIOManagerRegistry:
26+
"""Tests for the path registry functionality."""
27+
28+
def test_handle_output_stores_path_in_registry(
29+
self,
30+
io_manager: ModuleIOManager,
31+
tmp_path: Path,
32+
) -> None:
33+
"""handle_output should store the returned Path in the registry."""
34+
# Create a mock output context
35+
context = MagicMock()
36+
context.asset_key.to_user_string.return_value = "coronary_with_ensembl"
37+
38+
# The asset returns this path
39+
asset_output_path = tmp_path / "coronary" / "coronary_ensembl_joined.parquet"
40+
asset_output_path.parent.mkdir(parents=True, exist_ok=True)
41+
asset_output_path.touch()
42+
43+
# Handle the output
44+
io_manager.handle_output(context, asset_output_path)
45+
46+
# Registry should contain the path
47+
registry = io_manager._load_registry()
48+
assert "coronary_with_ensembl" in registry
49+
assert registry["coronary_with_ensembl"] == str(asset_output_path)
50+
51+
def test_load_input_retrieves_path_from_registry(
52+
self,
53+
io_manager: ModuleIOManager,
54+
tmp_path: Path,
55+
) -> None:
56+
"""load_input should retrieve the path from registry."""
57+
# Create a file
58+
asset_path = tmp_path / "longevitymap" / "annotations.parquet"
59+
asset_path.parent.mkdir(parents=True, exist_ok=True)
60+
asset_path.touch()
61+
62+
# Manually store in registry (simulating prior handle_output)
63+
registry = {"longevitymap_annotations": str(asset_path)}
64+
io_manager._save_registry(registry)
65+
66+
# Create mock input context
67+
context = MagicMock()
68+
context.upstream_output.asset_key.to_user_string.return_value = "longevitymap_annotations"
69+
70+
# Load should return the path
71+
result = io_manager.load_input(context)
72+
assert result == asset_path
73+
74+
def test_load_input_raises_if_not_in_registry(
75+
self,
76+
io_manager: ModuleIOManager,
77+
) -> None:
78+
"""load_input should raise FileNotFoundError if asset not in registry."""
79+
context = MagicMock()
80+
context.upstream_output.asset_key.to_user_string.return_value = "unknown_asset"
81+
82+
with pytest.raises(FileNotFoundError, match="not found in registry"):
83+
io_manager.load_input(context)
84+
85+
def test_registry_persists_across_instances(
86+
self,
87+
tmp_path: Path,
88+
) -> None:
89+
"""Registry should persist to disk and be readable by new instances."""
90+
registry_path = tmp_path / ".asset_paths.json"
91+
92+
# First instance stores a path
93+
manager1 = ModuleIOManager()
94+
manager1._registry_path = registry_path
95+
96+
asset_path = tmp_path / "vo2max" / "weights.parquet"
97+
asset_path.parent.mkdir(parents=True, exist_ok=True)
98+
asset_path.touch()
99+
100+
context = MagicMock()
101+
context.asset_key.to_user_string.return_value = "vo2max_weights"
102+
manager1.handle_output(context, asset_path)
103+
104+
# New instance should be able to load
105+
manager2 = ModuleIOManager()
106+
manager2._registry_path = registry_path
107+
108+
context2 = MagicMock()
109+
context2.upstream_output.asset_key.to_user_string.return_value = "vo2max_weights"
110+
111+
result = manager2.load_input(context2)
112+
assert result == asset_path
113+
114+
def test_multiple_assets_stored_in_registry(
115+
self,
116+
io_manager: ModuleIOManager,
117+
tmp_path: Path,
118+
) -> None:
119+
"""Multiple assets should be stored in the same registry."""
120+
# Store multiple assets
121+
assets = {
122+
"coronary_annotations": tmp_path / "coronary" / "annotations.parquet",
123+
"coronary_studies": tmp_path / "coronary" / "studies.parquet",
124+
"coronary_with_ensembl": tmp_path / "coronary" / "coronary_ensembl_joined.parquet",
125+
}
126+
127+
for asset_key, path in assets.items():
128+
path.parent.mkdir(parents=True, exist_ok=True)
129+
path.touch()
130+
131+
context = MagicMock()
132+
context.asset_key.to_user_string.return_value = asset_key
133+
io_manager.handle_output(context, path)
134+
135+
# All should be in registry
136+
registry = io_manager._load_registry()
137+
assert len(registry) == 3
138+
for asset_key, path in assets.items():
139+
assert registry[asset_key] == str(path)
140+
141+
142+
class TestModuleIOManagerNonPathOutputs:
143+
"""Tests for non-Path outputs (e.g., dicts from upload assets)."""
144+
145+
def test_handle_output_ignores_non_path_objects(
146+
self,
147+
io_manager: ModuleIOManager,
148+
) -> None:
149+
"""Non-Path outputs should not be stored in registry."""
150+
context = MagicMock()
151+
context.asset_key.to_user_string.return_value = "coronary_hf_upload"
152+
153+
# Upload assets return dicts, not Paths
154+
upload_result = {"repo_id": "just-dna-seq/annotators", "num_uploaded": 3}
155+
io_manager.handle_output(context, upload_result)
156+
157+
# Should not be in registry
158+
registry = io_manager._load_registry()
159+
assert "coronary_hf_upload" not in registry

0 commit comments

Comments
 (0)