Skip to content

Commit bd374f3

Browse files
committed
Add CLI validation for 10x renal dataset
1 parent c07cb1c commit bd374f3

File tree

7 files changed

+415
-227
lines changed

7 files changed

+415
-227
lines changed

README.md

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,13 @@ Validation summary from a local download of the public bundle:
5454
An executable smoke-test example is included in
5555
`examples/smoke_test_10x_renal_ffpe_protein.py`.
5656

57+
After installing the package, the same workflow is also available as a CLI command:
58+
59+
```bash
60+
pyxenium validate-renal-ffpe-protein \
61+
"Y:/long/10X_datasets/Xenium/Xenium_Renal/Xenium_V1_Human_Kidney_FFPE_Protein"
62+
```
63+
5764
```bash
5865
python examples/smoke_test_10x_renal_ffpe_protein.py \
5966
"Y:/long/10X_datasets/Xenium/Xenium_Renal/Xenium_V1_Human_Kidney_FFPE_Protein"
@@ -62,15 +69,15 @@ python examples/smoke_test_10x_renal_ffpe_protein.py \
6269
To also write a compact Markdown/JSON/CSV report bundle:
6370

6471
```bash
65-
python examples/smoke_test_10x_renal_ffpe_protein.py \
72+
pyxenium validate-renal-ffpe-protein \
6673
"Y:/long/10X_datasets/Xenium/Xenium_Renal/Xenium_V1_Human_Kidney_FFPE_Protein" \
6774
--output-dir ./smoke_test_outputs
6875
```
6976

7077
To export the loaded object for downstream analysis:
7178

7279
```bash
73-
python examples/smoke_test_10x_renal_ffpe_protein.py \
80+
pyxenium validate-renal-ffpe-protein \
7481
"Y:/long/10X_datasets/Xenium/Xenium_Renal/Xenium_V1_Human_Kidney_FFPE_Protein" \
7582
--write-h5ad ./renal_ffpe_protein.h5ad
7683
```

examples/smoke_test_10x_renal_ffpe_protein.py

Lines changed: 17 additions & 225 deletions
Original file line numberDiff line numberDiff line change
@@ -6,211 +6,19 @@
66
import sys
77
from pathlib import Path
88

9-
import numpy as np
10-
import pandas as pd
11-
129
try:
13-
from pyXenium.datasets import RENAL_FFPE_PROTEIN_10X_DATASET
14-
from pyXenium.io.xenium_gene_protein_loader import load_xenium_gene_protein
10+
from pyXenium.validation.renal_ffpe_protein import (
11+
DEFAULT_DATASET_PATH,
12+
run_validated_renal_ffpe_smoke,
13+
)
1514
except ModuleNotFoundError:
1615
repo_src = Path(__file__).resolve().parents[1] / "src"
1716
if str(repo_src) not in sys.path:
1817
sys.path.insert(0, str(repo_src))
19-
from pyXenium.datasets import RENAL_FFPE_PROTEIN_10X_DATASET
20-
from pyXenium.io.xenium_gene_protein_loader import load_xenium_gene_protein
21-
22-
23-
DEFAULT_DATASET_PATH = (
24-
r"Y:\long\10X_datasets\Xenium\Xenium_Renal\Xenium_V1_Human_Kidney_FFPE_Protein"
25-
)
26-
EXPECTED_CELLS = 465545
27-
EXPECTED_RNA_FEATURES = 405
28-
EXPECTED_PROTEIN_MARKERS = 27
29-
30-
31-
def _top_rna_features(adata, top_n: int) -> list[dict]:
32-
feature_names = (
33-
adata.var["name"].astype(str).tolist()
34-
if "name" in adata.var.columns
35-
else adata.var_names.astype(str).tolist()
18+
from pyXenium.validation.renal_ffpe_protein import (
19+
DEFAULT_DATASET_PATH,
20+
run_validated_renal_ffpe_smoke,
3621
)
37-
nnz = np.asarray(adata.X.getnnz(axis=0)).ravel()
38-
total = np.asarray(adata.X.sum(axis=0)).ravel()
39-
40-
order = np.argsort(-total)[:top_n]
41-
rows = []
42-
for idx in order:
43-
rows.append(
44-
{
45-
"feature": feature_names[idx],
46-
"detected_cells": int(nnz[idx]),
47-
"total_counts": float(total[idx]),
48-
}
49-
)
50-
return rows
51-
52-
53-
def _top_protein_markers(adata, top_n: int) -> list[dict]:
54-
protein = adata.obsm["protein"]
55-
protein_df = protein if isinstance(protein, pd.DataFrame) else pd.DataFrame(protein, index=adata.obs_names)
56-
57-
mean_signal = protein_df.mean(axis=0).sort_values(ascending=False)
58-
rows = []
59-
for marker, value in mean_signal.head(top_n).items():
60-
rows.append(
61-
{
62-
"marker": str(marker),
63-
"mean_signal": float(value),
64-
"positive_cells": int((protein_df[marker] > 0).sum()),
65-
}
66-
)
67-
return rows
68-
69-
70-
def _top_clusters(adata, top_n: int) -> list[dict]:
71-
if "cluster" not in adata.obs.columns:
72-
return []
73-
74-
counts = adata.obs["cluster"].astype(str).value_counts().head(top_n)
75-
return [{"cluster": str(cluster), "n_cells": int(count)} for cluster, count in counts.items()]
76-
77-
78-
def build_summary(base_path: str, prefer: str, top_n: int = 10) -> tuple[dict, object]:
    """Load the dataset and collect the statistics used by the smoke test.

    Args:
        base_path: Directory handed to ``load_xenium_gene_protein``; also
            searched for ``metrics_summary.csv``.
        prefer: Backend preference forwarded to the loader.
        top_n: Number of rows kept in each "top" table.

    Returns:
        ``(summary, adata)`` where ``summary`` is a dict of counts, flags and
        ranked tables, and ``adata`` is the object returned by the loader.
    """
    adata = load_xenium_gene_protein(base_path=base_path, prefer=prefer)

    # A missing protein matrix (or one without ``shape``) counts as 0 markers.
    protein_shape = getattr(adata.obsm.get("protein"), "shape", None)
    n_protein_markers = 0 if protein_shape is None else int(protein_shape[1])

    summary = {
        "dataset_title": RENAL_FFPE_PROTEIN_10X_DATASET.title,
        "dataset_url": RENAL_FFPE_PROTEIN_10X_DATASET.url,
        "base_path": base_path,
        "prefer": prefer,
        "n_cells": int(adata.n_obs),
        "n_rna_features": int(adata.n_vars),
        "n_protein_markers": n_protein_markers,
        "x_nnz": int(getattr(adata.X, "nnz", 0)),
        "has_spatial": "spatial" in adata.obsm,
        "has_cluster": "cluster" in adata.obs.columns,
        "obsm_keys": sorted(adata.obsm.keys()),
        # Filled in below when metrics_summary.csv provides the value.
        "metrics_summary_num_cells_detected": None,
        "top_rna_features_by_total_counts": _top_rna_features(adata, top_n=top_n),
        "top_protein_markers_by_mean_signal": _top_protein_markers(adata, top_n=top_n),
        "largest_clusters": _top_clusters(adata, top_n=top_n),
    }

    metrics_csv = Path(base_path) / "metrics_summary.csv"
    if not metrics_csv.exists():
        return summary, adata

    metrics = pd.read_csv(metrics_csv)
    if not metrics.empty and "num_cells_detected" in metrics.columns:
        # Only the first row of the metrics file is consulted.
        summary["metrics_summary_num_cells_detected"] = int(
            metrics.loc[0, "num_cells_detected"]
        )
    return summary, adata
110-
111-
112-
def render_markdown_report(payload: dict) -> str:
    """Render the smoke-test payload as a Markdown document.

    Args:
        payload: Dict with the keys ``summary``, ``validated_reference`` and
            ``issues``.

    Returns:
        The report text, terminated by a newline.
    """
    summary = payload["summary"]
    reference = payload["validated_reference"]
    issues = payload["issues"]

    # (label, summary key) pairs rendered as "- label: `value`" bullets.
    core_fields = (
        ("Cells", "n_cells"),
        ("RNA features", "n_rna_features"),
        ("Protein markers", "n_protein_markers"),
        ("Sparse matrix nnz", "x_nnz"),
        ("Spatial coordinates present", "has_spatial"),
        ("Cluster labels present", "has_cluster"),
        ("metrics_summary.csv detected cells", "metrics_summary_num_cells_detected"),
    )
    reference_fields = (
        ("Expected cells", "expected_cells"),
        ("Expected RNA features", "expected_rna_features"),
        ("Expected protein markers", "expected_protein_markers"),
    )

    report = [
        "# pyXenium Smoke Test Report",
        "",
        f"Dataset: {summary['dataset_title']}",
        f"Source: {summary['dataset_url']}",
        f"Local path: `{summary['base_path']}`",
        f"Backend preference: `{summary['prefer']}`",
        "",
        "## Core Results",
        "",
    ]
    report += [f"- {label}: `{summary[key]}`" for label, key in core_fields]

    report += ["", "## Validated Reference", ""]
    report += [f"- {label}: `{reference[key]}`" for label, key in reference_fields]

    report += ["", "## Largest Clusters", ""]
    report += [
        f"- `{entry['cluster']}`: `{entry['n_cells']}` cells"
        for entry in summary["largest_clusters"]
    ]

    report += ["", "## Top RNA Features by Total Counts", ""]
    report += [
        f"- `{entry['feature']}`: total counts `{entry['total_counts']:.0f}`, "
        f"detected cells `{entry['detected_cells']}`"
        for entry in summary["top_rna_features_by_total_counts"]
    ]

    report += ["", "## Top Protein Markers by Mean Signal", ""]
    report += [
        f"- `{entry['marker']}`: mean signal `{entry['mean_signal']:.4f}`, "
        f"positive cells `{entry['positive_cells']}`"
        for entry in summary["top_protein_markers_by_mean_signal"]
    ]

    report += ["", "## Issues", ""]
    if issues:
        report += [f"- {issue}" for issue in issues]
    else:
        report.append("- No issues detected.")

    # The trailing empty element makes the joined text end with a newline.
    report.append("")
    return "\n".join(report)
168-
169-
170-
def write_output_artifacts(payload: dict, output_dir: str | None) -> None:
    """Write the JSON, Markdown and CSV report files into ``output_dir``.

    Nothing is written when ``output_dir`` is empty or ``None``. Otherwise the
    directory is created if needed and receives ``summary.json``,
    ``report.md`` and one CSV per ranked table in ``payload["summary"]``.

    Args:
        payload: Smoke-test payload; must be JSON-serializable.
        output_dir: Destination directory, or a falsy value to skip writing.
    """
    if output_dir:
        target = Path(output_dir)
        target.mkdir(parents=True, exist_ok=True)

        summary_json = json.dumps(payload, indent=2) + "\n"
        (target / "summary.json").write_text(summary_json, encoding="utf-8")
        (target / "report.md").write_text(render_markdown_report(payload), encoding="utf-8")

        # (summary key, output file name) for each tabular section.
        csv_tables = (
            ("top_rna_features_by_total_counts", "top_rna_features.csv"),
            ("top_protein_markers_by_mean_signal", "top_protein_markers.csv"),
            ("largest_clusters", "largest_clusters.csv"),
        )
        for key, filename in csv_tables:
            pd.DataFrame(payload["summary"][key]).to_csv(target / filename, index=False)
186-
187-
188-
def validate_summary(summary: dict) -> list[str]:
    """Compare a summary against the expected reference values.

    Args:
        summary: Dict as produced by ``build_summary``.

    Returns:
        Human-readable descriptions of every mismatch, in check order; an
        empty list means the summary matches the reference.
    """
    problems: list[str] = []

    # Exact-count checks: (summary key, expected value, noun used in message).
    count_checks = (
        ("n_cells", EXPECTED_CELLS, "cells"),
        ("n_rna_features", EXPECTED_RNA_FEATURES, "RNA features"),
        ("n_protein_markers", EXPECTED_PROTEIN_MARKERS, "protein markers"),
    )
    for key, expected, noun in count_checks:
        observed = summary[key]
        if observed != expected:
            problems.append(f"Expected {expected} {noun}, observed {observed}.")

    # Presence checks for annotations the summary flags as booleans.
    presence_checks = (
        ("has_spatial", "Expected adata.obsm['spatial'] to be present."),
        ("has_cluster", "Expected adata.obs['cluster'] to be present."),
    )
    for key, message in presence_checks:
        if not summary[key]:
            problems.append(message)

    # Cross-check against metrics_summary.csv only when a value was recorded.
    reported_cells = summary["metrics_summary_num_cells_detected"]
    if reported_cells is not None and reported_cells != summary["n_cells"]:
        problems.append(
            "metrics_summary.csv reports "
            f"{reported_cells} detected cells, but pyXenium loaded {summary['n_cells']} cells."
        )

    return problems
21422

21523

21624
def parse_args() -> argparse.Namespace:
@@ -266,32 +74,16 @@ def parse_args() -> argparse.Namespace:
26674

26775
def main() -> int:
26876
args = parse_args()
269-
summary, adata = build_summary(base_path=args.base_path, prefer=args.prefer, top_n=args.top_n)
270-
issues = validate_summary(summary)
271-
272-
payload = {
273-
"summary": summary,
274-
"validated_reference": {
275-
"expected_cells": EXPECTED_CELLS,
276-
"expected_rna_features": EXPECTED_RNA_FEATURES,
277-
"expected_protein_markers": EXPECTED_PROTEIN_MARKERS,
278-
},
279-
"issues": issues,
280-
}
281-
282-
rendered = json.dumps(payload, indent=2)
283-
print(rendered)
284-
285-
if args.output_json:
286-
Path(args.output_json).write_text(rendered + "\n", encoding="utf-8")
287-
write_output_artifacts(payload, args.output_dir)
288-
289-
if args.write_h5ad:
290-
h5ad_path = Path(args.write_h5ad)
291-
h5ad_path.parent.mkdir(parents=True, exist_ok=True)
292-
adata.write_h5ad(h5ad_path)
293-
294-
if issues and not args.allow_mismatch:
77+
payload = run_validated_renal_ffpe_smoke(
78+
base_path=args.base_path,
79+
prefer=args.prefer,
80+
top_n=args.top_n,
81+
output_json=args.output_json,
82+
output_dir=args.output_dir,
83+
write_h5ad=args.write_h5ad,
84+
)
85+
print(json.dumps(payload, indent=2))
86+
if payload["issues"] and not args.allow_mismatch:
29587
return 1
29688
return 0
29789

src/pyXenium/__main__.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,10 @@
1+
import json
12
from pathlib import Path
23

34
import click
45

56
from .io.io import copy_bundled_dataset, load_toy
7+
from .validation import DEFAULT_DATASET_PATH, run_validated_renal_ffpe_smoke
68

79

810
@click.group()
@@ -38,6 +40,33 @@ def datasets(name, url, dest):
3840
click.echo(f"Copied bundled toy dataset to {target}")
3941

4042

43+
@app.command("validate-renal-ffpe-protein")
@click.argument("base_path", required=False, default=DEFAULT_DATASET_PATH)
@click.option(
    "--prefer",
    type=click.Choice(["auto", "zarr", "h5", "mex"]),
    default="auto",
    show_default=True,
)
@click.option("--top-n", type=int, default=10, show_default=True)
@click.option("--allow-mismatch", is_flag=True, default=False)
@click.option(
    "--output-json",
    default=None,
    help="Optional path to write the summary JSON.",
)
@click.option(
    "--output-dir",
    default=None,
    help="Optional directory for report.md, summary.json, and CSV summaries.",
)
@click.option(
    "--write-h5ad",
    default=None,
    help="Optional path to export the loaded AnnData object as an .h5ad file.",
)
def validate_renal_ffpe_protein(base_path, prefer, top_n, allow_mismatch, output_json, output_dir, write_h5ad):
    """Validate pyXenium against the public 10x FFPE renal RNA + Protein dataset."""
    # Every option except --allow-mismatch is forwarded unchanged to the
    # shared validation routine.
    smoke_options = {
        "base_path": base_path,
        "prefer": prefer,
        "top_n": top_n,
        "output_json": output_json,
        "output_dir": output_dir,
        "write_h5ad": write_h5ad,
    }
    payload = run_validated_renal_ffpe_smoke(**smoke_options)

    # The payload is always echoed, even when validation issues were found.
    click.echo(json.dumps(payload, indent=2))

    if not payload["issues"] or allow_mismatch:
        return
    # Reported issues without --allow-mismatch give a non-zero exit status.
    raise click.exceptions.Exit(1)
68+
69+
4170
def main():
    """Invoke the ``app`` command-line object defined in this module."""
    app()
4372

src/pyXenium/validation/__init__.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
from .renal_ffpe_protein import (
2+
DEFAULT_DATASET_PATH,
3+
EXPECTED_CELLS,
4+
EXPECTED_PROTEIN_MARKERS,
5+
EXPECTED_RNA_FEATURES,
6+
build_summary,
7+
render_markdown_report,
8+
run_validated_renal_ffpe_smoke,
9+
validate_summary,
10+
write_output_artifacts,
11+
)
12+
13+
__all__ = [
14+
"DEFAULT_DATASET_PATH",
15+
"EXPECTED_CELLS",
16+
"EXPECTED_RNA_FEATURES",
17+
"EXPECTED_PROTEIN_MARKERS",
18+
"build_summary",
19+
"validate_summary",
20+
"render_markdown_report",
21+
"write_output_artifacts",
22+
"run_validated_renal_ffpe_smoke",
23+
]

0 commit comments

Comments
 (0)