Skip to content

Commit c07cb1c

Browse files
committed
Expand 10x renal dataset smoke test workflow
1 parent 99ee456 commit c07cb1c

File tree

2 files changed

+168
-3
lines changed

2 files changed

+168
-3
lines changed

README.md

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,22 @@ python examples/smoke_test_10x_renal_ffpe_protein.py \
5959
"Y:/long/10X_datasets/Xenium/Xenium_Renal/Xenium_V1_Human_Kidney_FFPE_Protein"
6060
```
6161

62+
To also write a compact Markdown/JSON/CSV report bundle:
63+
64+
```bash
65+
python examples/smoke_test_10x_renal_ffpe_protein.py \
66+
"Y:/long/10X_datasets/Xenium/Xenium_Renal/Xenium_V1_Human_Kidney_FFPE_Protein" \
67+
--output-dir ./smoke_test_outputs
68+
```
69+
70+
To export the loaded object for downstream analysis:
71+
72+
```bash
73+
python examples/smoke_test_10x_renal_ffpe_protein.py \
74+
"Y:/long/10X_datasets/Xenium/Xenium_Renal/Xenium_V1_Human_Kidney_FFPE_Protein" \
75+
--write-h5ad ./renal_ffpe_protein.h5ad
76+
```
77+
6278
Quick Start
6379
-----------
6480

examples/smoke_test_10x_renal_ffpe_protein.py

Lines changed: 152 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
import sys
77
from pathlib import Path
88

9+
import numpy as np
910
import pandas as pd
1011

1112
try:
@@ -27,7 +28,54 @@
2728
EXPECTED_PROTEIN_MARKERS = 27
2829

2930

30-
def build_summary(base_path: str, prefer: str) -> dict:
31+
def _top_rna_features(adata, top_n: int) -> list[dict]:
    """Summarise the features with the highest total counts in ``adata.X``.

    Args:
        adata: Object exposing ``X`` (a 2-D matrix, sparse or dense), ``var``
            (a DataFrame) and ``var_names``.
        top_n: Maximum number of features to report. Non-positive values
            yield an empty list.

    Returns:
        A list of dicts ordered by descending total counts, each holding
        ``feature`` (taken from ``adata.var["name"]`` when that column exists,
        otherwise from ``adata.var_names``), ``detected_cells`` (number of
        non-zero entries in the feature's column) and ``total_counts`` (the
        column sum).
    """
    # A negative slice bound would silently drop features from the tail
    # instead of limiting the report, so treat it like "nothing requested".
    if top_n <= 0:
        return []

    if "name" in adata.var.columns:
        feature_names = adata.var["name"].astype(str).tolist()
    else:
        feature_names = adata.var_names.astype(str).tolist()

    matrix = adata.X
    if hasattr(matrix, "getnnz"):
        # scipy sparse matrices: per-column count of stored entries.
        nnz = np.asarray(matrix.getnnz(axis=0)).ravel()
    else:
        # Dense arrays (and matrix types without getnnz) have no stored-entry
        # count, so count the non-zero values per column instead.
        nnz = np.asarray((matrix != 0).sum(axis=0)).ravel()
    total = np.asarray(matrix.sum(axis=0)).ravel()

    # A stable sort keeps tied features in their original column order.
    order = np.argsort(-total, kind="stable")[:top_n]
    return [
        {
            "feature": feature_names[idx],
            "detected_cells": int(nnz[idx]),
            "total_counts": float(total[idx]),
        }
        for idx in order
    ]
51+
52+
53+
def _top_protein_markers(adata, top_n: int) -> list[dict]:
    """Summarise the protein markers with the highest mean signal.

    Args:
        adata: Object exposing a mapping ``obsm`` and ``obs_names``. The
            ``"protein"`` entry of ``obsm`` may be a DataFrame or anything
            ``pandas.DataFrame`` can wrap (e.g. a 2-D array).
        top_n: Maximum number of markers to report. Non-positive values
            yield an empty list.

    Returns:
        A list of dicts ordered by descending mean signal, each holding
        ``marker`` (the column label as a string), ``mean_signal`` (column
        mean) and ``positive_cells`` (number of rows with a value > 0).
        Returns an empty list when ``obsm`` has no ``"protein"`` entry.
    """
    # The protein layer is optional, so a missing entry must not abort the
    # whole summary with a KeyError.
    protein = adata.obsm.get("protein")
    if protein is None or top_n <= 0:
        return []

    if isinstance(protein, pd.DataFrame):
        protein_df = protein
    else:
        protein_df = pd.DataFrame(protein, index=adata.obs_names)

    # Work positionally so duplicated marker names cannot turn a column
    # lookup into a multi-column frame.
    mean_signal = protein_df.mean(axis=0).to_numpy(dtype=float)
    positive_cells = (protein_df > 0).sum(axis=0).to_numpy()
    order = np.argsort(-mean_signal, kind="stable")[:top_n]

    return [
        {
            "marker": str(protein_df.columns[pos]),
            "mean_signal": float(mean_signal[pos]),
            "positive_cells": int(positive_cells[pos]),
        }
        for pos in order
    ]
68+
69+
70+
def _top_clusters(adata, top_n: int) -> list[dict]:
    """List the most populated values of ``adata.obs["cluster"]``.

    Args:
        adata: Object exposing ``obs`` as a DataFrame.
        top_n: Maximum number of clusters to report. Non-positive values
            yield an empty list.

    Returns:
        A list of ``{"cluster": str, "n_cells": int}`` dicts ordered by
        descending cell count. Empty when ``obs`` has no ``cluster`` column.
        Rows with a missing label are ignored.
    """
    if "cluster" not in adata.obs.columns or top_n <= 0:
        return []

    # Drop missing labels before stringifying; otherwise they would be
    # counted as a pseudo-cluster named "nan".
    labels = adata.obs["cluster"].dropna().astype(str)
    counts = labels.value_counts().head(top_n)
    return [{"cluster": str(cluster), "n_cells": int(count)} for cluster, count in counts.items()]
76+
77+
78+
def build_summary(base_path: str, prefer: str, top_n: int = 10) -> tuple[dict, object]:
3179
adata = load_xenium_gene_protein(base_path=base_path, prefer=prefer)
3280

3381
protein = adata.obsm.get("protein")
@@ -47,6 +95,9 @@ def build_summary(base_path: str, prefer: str) -> dict:
4795
"has_cluster": "cluster" in adata.obs.columns,
4896
"obsm_keys": sorted(adata.obsm.keys()),
4997
"metrics_summary_num_cells_detected": None,
98+
"top_rna_features_by_total_counts": _top_rna_features(adata, top_n=top_n),
99+
"top_protein_markers_by_mean_signal": _top_protein_markers(adata, top_n=top_n),
100+
"largest_clusters": _top_clusters(adata, top_n=top_n),
50101
}
51102

52103
metrics_path = Path(base_path) / "metrics_summary.csv"
@@ -55,7 +106,83 @@ def build_summary(base_path: str, prefer: str) -> dict:
55106
if "num_cells_detected" in metrics.columns and not metrics.empty:
56107
summary["metrics_summary_num_cells_detected"] = int(metrics.loc[0, "num_cells_detected"])
57108

58-
return summary
109+
return summary, adata
110+
111+
112+
def render_markdown_report(payload: dict) -> str:
    """Render the smoke-test payload as a Markdown report.

    Args:
        payload: Mapping with three entries: ``summary`` (the dataset summary
            dict, including the ``largest_clusters``,
            ``top_rna_features_by_total_counts`` and
            ``top_protein_markers_by_mean_signal`` row lists),
            ``validated_reference`` (the expected counts) and ``issues``
            (an iterable of issue strings).

    Returns:
        The report as a single newline-joined string ending with a newline.
        A list section without rows shows ``- None available.`` rather than
        an empty heading, mirroring the placeholder used for ``issues``.

    Raises:
        KeyError: If ``payload`` or one of its nested dicts lacks a key the
            report reads.
    """
    summary = payload["summary"]
    validated = payload["validated_reference"]
    issues = payload["issues"]

    def _section(title: str, bullets: list[str], empty: str = "- None available.") -> list[str]:
        # Blank line, heading, blank line, then the bullets (or a placeholder
        # so the heading is never left dangling).
        return ["", title, "", *(bullets or [empty])]

    lines = [
        "# pyXenium Smoke Test Report",
        "",
        f"Dataset: {summary['dataset_title']}",
        f"Source: {summary['dataset_url']}",
        f"Local path: `{summary['base_path']}`",
        f"Backend preference: `{summary['prefer']}`",
    ]

    lines += _section(
        "## Core Results",
        [
            f"- Cells: `{summary['n_cells']}`",
            f"- RNA features: `{summary['n_rna_features']}`",
            f"- Protein markers: `{summary['n_protein_markers']}`",
            f"- Sparse matrix nnz: `{summary['x_nnz']}`",
            f"- Spatial coordinates present: `{summary['has_spatial']}`",
            f"- Cluster labels present: `{summary['has_cluster']}`",
            f"- metrics_summary.csv detected cells: `{summary['metrics_summary_num_cells_detected']}`",
        ],
    )

    lines += _section(
        "## Validated Reference",
        [
            f"- Expected cells: `{validated['expected_cells']}`",
            f"- Expected RNA features: `{validated['expected_rna_features']}`",
            f"- Expected protein markers: `{validated['expected_protein_markers']}`",
        ],
    )

    lines += _section(
        "## Largest Clusters",
        [f"- `{row['cluster']}`: `{row['n_cells']}` cells" for row in summary["largest_clusters"]],
    )

    lines += _section(
        "## Top RNA Features by Total Counts",
        [
            f"- `{row['feature']}`: total counts `{row['total_counts']:.0f}`, detected cells `{row['detected_cells']}`"
            for row in summary["top_rna_features_by_total_counts"]
        ],
    )

    lines += _section(
        "## Top Protein Markers by Mean Signal",
        [
            f"- `{row['marker']}`: mean signal `{row['mean_signal']:.4f}`, positive cells `{row['positive_cells']}`"
            for row in summary["top_protein_markers_by_mean_signal"]
        ],
    )

    lines += _section(
        "## Issues",
        [f"- {issue}" for issue in issues],
        empty="- No issues detected.",
    )

    # Trailing empty element so the joined text ends with a newline.
    lines.append("")
    return "\n".join(lines)
168+
169+
170+
def write_output_artifacts(payload: dict, output_dir: str | None) -> None:
    """Write the report bundle for ``payload`` into ``output_dir``.

    Does nothing when ``output_dir`` is empty or ``None``. Otherwise the
    directory is created if needed (including parents) and these files are
    written: ``summary.json`` (the payload as indented JSON), ``report.md``
    (output of ``render_markdown_report``), and one CSV per row list in
    ``payload["summary"]`` (``top_rna_features.csv``,
    ``top_protein_markers.csv``, ``largest_clusters.csv``).

    Args:
        payload: Report payload; must be JSON-serialisable and contain the
            ``summary`` row lists named above.
        output_dir: Target directory, or a falsy value to skip writing.
    """
    if not output_dir:
        return

    out = Path(output_dir)
    out.mkdir(parents=True, exist_ok=True)

    (out / "summary.json").write_text(json.dumps(payload, indent=2) + "\n", encoding="utf-8")
    (out / "report.md").write_text(render_markdown_report(payload), encoding="utf-8")

    # Pin the column order per table so an empty row list still produces a
    # CSV with its header instead of a blank file.
    tables = (
        (
            "top_rna_features.csv",
            "top_rna_features_by_total_counts",
            ["feature", "detected_cells", "total_counts"],
        ),
        (
            "top_protein_markers.csv",
            "top_protein_markers_by_mean_signal",
            ["marker", "mean_signal", "positive_cells"],
        ),
        (
            "largest_clusters.csv",
            "largest_clusters",
            ["cluster", "n_cells"],
        ),
    )
    summary = payload["summary"]
    for filename, key, columns in tables:
        pd.DataFrame(summary[key], columns=columns).to_csv(out / filename, index=False)
59186

60187

61188
def validate_summary(summary: dict) -> list[str]:
@@ -108,6 +235,12 @@ def parse_args() -> argparse.Namespace:
108235
default="auto",
109236
help="Preferred matrix backend passed to load_xenium_gene_protein().",
110237
)
238+
parser.add_argument(
239+
"--top-n",
240+
type=int,
241+
default=10,
242+
help="Number of top RNA features, protein markers, and clusters to report.",
243+
)
111244
parser.add_argument(
112245
"--allow-mismatch",
113246
action="store_true",
@@ -118,12 +251,22 @@ def parse_args() -> argparse.Namespace:
118251
default=None,
119252
help="Optional path to write the summary JSON.",
120253
)
254+
parser.add_argument(
255+
"--output-dir",
256+
default=None,
257+
help="Optional directory for report.md, summary.json, and CSV summaries.",
258+
)
259+
parser.add_argument(
260+
"--write-h5ad",
261+
default=None,
262+
help="Optional path to export the loaded AnnData object as an .h5ad file.",
263+
)
121264
return parser.parse_args()
122265

123266

124267
def main() -> int:
125268
args = parse_args()
126-
summary = build_summary(base_path=args.base_path, prefer=args.prefer)
269+
summary, adata = build_summary(base_path=args.base_path, prefer=args.prefer, top_n=args.top_n)
127270
issues = validate_summary(summary)
128271

129272
payload = {
@@ -141,6 +284,12 @@ def main() -> int:
141284

142285
if args.output_json:
143286
Path(args.output_json).write_text(rendered + "\n", encoding="utf-8")
287+
write_output_artifacts(payload, args.output_dir)
288+
289+
if args.write_h5ad:
290+
h5ad_path = Path(args.write_h5ad)
291+
h5ad_path.parent.mkdir(parents=True, exist_ok=True)
292+
adata.write_h5ad(h5ad_path)
144293

145294
if issues and not args.allow_mismatch:
146295
return 1

0 commit comments

Comments
 (0)