|
| 1 | +from __future__ import annotations |
| 2 | + |
| 3 | +import argparse |
| 4 | +import json |
| 5 | +import os |
| 6 | +import sys |
| 7 | +from pathlib import Path |
| 8 | + |
| 9 | +import pandas as pd |
| 10 | + |
try:
    from pyXenium.datasets import RENAL_FFPE_PROTEIN_10X_DATASET
    from pyXenium.io.xenium_gene_protein_loader import load_xenium_gene_protein
except ModuleNotFoundError:
    # The import failed, so add the "src" directory that sits beside this
    # file's parent directory to sys.path (once) and retry the same imports.
    repo_src = Path(__file__).resolve().parents[1] / "src"
    if str(repo_src) not in sys.path:
        sys.path.insert(0, str(repo_src))
    from pyXenium.datasets import RENAL_FFPE_PROTEIN_10X_DATASET
    from pyXenium.io.xenium_gene_protein_loader import load_xenium_gene_protein
| 20 | + |
| 21 | + |
# Fallback dataset directory, used by parse_args() when no positional path is
# given and the PYXENIUM_DATASET_PATH environment variable is unset.
# Raw string so the backslashes in the path stay literal.
DEFAULT_DATASET_PATH = (
    r"Y:\long\10X_datasets\Xenium\Xenium_Renal\Xenium_V1_Human_Kidney_FFPE_Protein"
)
# Reference counts that validate_summary() compares the loaded data against;
# main() also echoes them under "validated_reference" in its JSON output.
EXPECTED_CELLS = 465545
EXPECTED_RNA_FEATURES = 405
EXPECTED_PROTEIN_MARKERS = 27
| 28 | + |
| 29 | + |
def build_summary(base_path: str, prefer: str) -> dict:
    """Load the dataset and gather the facts reported by the smoke test.

    Args:
        base_path: Directory passed to ``load_xenium_gene_protein`` and searched
            for ``metrics_summary.csv``.
        prefer: Backend preference forwarded to ``load_xenium_gene_protein``.

    Returns:
        A dict holding the dataset title and URL, the two arguments, the
        observed cell / RNA-feature / protein-marker counts, ``adata.X.nnz``
        (0 when ``X`` has no ``nnz`` attribute), presence flags for
        ``obsm["spatial"]`` and ``obs["cluster"]``, the sorted ``obsm`` keys,
        and ``metrics_summary_num_cells_detected``: the first-row
        ``num_cells_detected`` value of ``metrics_summary.csv`` when that file
        and column exist and the table is non-empty, otherwise ``None``.
    """
    loaded = load_xenium_gene_protein(base_path=base_path, prefer=prefer)

    # An absent "protein" entry (or any object without ``shape``) counts as
    # zero markers; otherwise the second shape dimension is the marker count.
    marker_shape = getattr(loaded.obsm.get("protein"), "shape", None)
    if marker_shape is None:
        marker_count = 0
    else:
        marker_count = int(marker_shape[1])

    # Keys are inserted in the order they should appear in the JSON output.
    report = {
        "dataset_title": RENAL_FFPE_PROTEIN_10X_DATASET.title,
        "dataset_url": RENAL_FFPE_PROTEIN_10X_DATASET.url,
        "base_path": base_path,
        "prefer": prefer,
        "n_cells": int(loaded.n_obs),
        "n_rna_features": int(loaded.n_vars),
        "n_protein_markers": marker_count,
        "x_nnz": int(getattr(loaded.X, "nnz", 0)),
        "has_spatial": "spatial" in loaded.obsm,
        "has_cluster": "cluster" in loaded.obs.columns,
        "obsm_keys": sorted(loaded.obsm.keys()),
        "metrics_summary_num_cells_detected": None,
    }

    csv_path = Path(base_path) / "metrics_summary.csv"
    if not csv_path.exists():
        return report

    table = pd.read_csv(csv_path)
    if table.empty or "num_cells_detected" not in table.columns:
        return report

    report["metrics_summary_num_cells_detected"] = int(table.loc[0, "num_cells_detected"])
    return report
| 59 | + |
| 60 | + |
def validate_summary(summary: dict) -> list[str]:
    """Compare a summary dict with the reference values and list mismatches.

    Args:
        summary: Mapping with the keys ``n_cells``, ``n_rna_features``,
            ``n_protein_markers``, ``has_spatial``, ``has_cluster`` and
            ``metrics_summary_num_cells_detected``.

    Returns:
        Human-readable problem descriptions, in check order; empty when every
        check passes.
    """
    # (summary key, reference value, noun used in the message)
    count_checks = (
        ("n_cells", EXPECTED_CELLS, "cells"),
        ("n_rna_features", EXPECTED_RNA_FEATURES, "RNA features"),
        ("n_protein_markers", EXPECTED_PROTEIN_MARKERS, "protein markers"),
    )
    problems: list[str] = [
        f"Expected {reference} {noun}, observed {summary[key]}."
        for key, reference, noun in count_checks
        if summary[key] != reference
    ]

    if not summary["has_spatial"]:
        problems.append("Expected adata.obsm['spatial'] to be present.")
    if not summary["has_cluster"]:
        problems.append("Expected adata.obs['cluster'] to be present.")

    # The metrics file count is optional; only cross-check it when it was read.
    reported = summary["metrics_summary_num_cells_detected"]
    if reported is None:
        return problems
    loaded_cells = summary["n_cells"]
    if reported != loaded_cells:
        problems.append(
            f"metrics_summary.csv reports {reported} detected cells, "
            f"but pyXenium loaded {loaded_cells} cells."
        )

    return problems
| 87 | + |
| 88 | + |
def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the smoke-test script.

    Returns:
        Namespace with ``base_path`` (optional positional; falls back to the
        ``PYXENIUM_DATASET_PATH`` environment variable, then to
        ``DEFAULT_DATASET_PATH``), ``prefer`` (one of ``auto``, ``zarr``,
        ``h5``, ``mex``; default ``auto``), ``allow_mismatch`` (bool flag) and
        ``output_json`` (path string or ``None``).
    """
    description = (
        "Smoke-test pyXenium on the official 10x Genomics FFPE Human Renal Cell Carcinoma "
        "RNA + Protein Xenium dataset."
    )
    cli = argparse.ArgumentParser(description=description)

    # The environment variable is read once, when the parser is built.
    fallback_path = os.environ.get("PYXENIUM_DATASET_PATH", DEFAULT_DATASET_PATH)
    base_path_help = (
        "Local path to the Xenium dataset directory. Defaults to the "
        "PYXENIUM_DATASET_PATH environment variable or the validated local path."
    )
    cli.add_argument("base_path", nargs="?", default=fallback_path, help=base_path_help)

    backends = ("auto", "zarr", "h5", "mex")
    cli.add_argument(
        "--prefer",
        choices=backends,
        default="auto",
        help="Preferred matrix backend passed to load_xenium_gene_protein().",
    )

    cli.add_argument(
        "--allow-mismatch",
        action="store_true",
        help="Print the summary even if the observed values differ from the validated reference.",
    )

    cli.add_argument(
        "--output-json",
        default=None,
        help="Optional path to write the summary JSON.",
    )

    return cli.parse_args()
| 122 | + |
| 123 | + |
def main() -> int:
    """Run the smoke test: load, validate, print, and optionally save as JSON.

    The JSON document (summary, reference values, issues) is always printed to
    stdout and, when ``--output-json`` is given, also written to that path with
    a trailing newline.

    Returns:
        1 when validation found issues and ``--allow-mismatch`` was not passed,
        otherwise 0.
    """
    options = parse_args()
    summary = build_summary(base_path=options.base_path, prefer=options.prefer)
    problems = validate_summary(summary)

    reference = {
        "expected_cells": EXPECTED_CELLS,
        "expected_rna_features": EXPECTED_RNA_FEATURES,
        "expected_protein_markers": EXPECTED_PROTEIN_MARKERS,
    }
    document = json.dumps(
        {"summary": summary, "validated_reference": reference, "issues": problems},
        indent=2,
    )
    print(document)

    destination = options.output_json
    if destination:
        Path(destination).write_text(f"{document}\n", encoding="utf-8")

    # Mismatches only fail the run when the caller did not opt out.
    failed = bool(problems) and not options.allow_mismatch
    return int(failed)
| 148 | + |
| 149 | + |
# When executed as a script, use main()'s return value as the exit status.
if __name__ == "__main__":
    raise SystemExit(main())
0 commit comments