|
6 | 6 | import sys |
7 | 7 | from pathlib import Path |
8 | 8 |
|
9 | | -import numpy as np |
10 | | -import pandas as pd |
11 | | - |
12 | 9 | try: |
13 | | - from pyXenium.datasets import RENAL_FFPE_PROTEIN_10X_DATASET |
14 | | - from pyXenium.io.xenium_gene_protein_loader import load_xenium_gene_protein |
| 10 | + from pyXenium.validation.renal_ffpe_protein import ( |
| 11 | + DEFAULT_DATASET_PATH, |
| 12 | + run_validated_renal_ffpe_smoke, |
| 13 | + ) |
15 | 14 | except ModuleNotFoundError: |
16 | 15 | repo_src = Path(__file__).resolve().parents[1] / "src" |
17 | 16 | if str(repo_src) not in sys.path: |
18 | 17 | sys.path.insert(0, str(repo_src)) |
19 | | - from pyXenium.datasets import RENAL_FFPE_PROTEIN_10X_DATASET |
20 | | - from pyXenium.io.xenium_gene_protein_loader import load_xenium_gene_protein |
21 | | - |
22 | | - |
23 | | -DEFAULT_DATASET_PATH = ( |
24 | | - r"Y:\long\10X_datasets\Xenium\Xenium_Renal\Xenium_V1_Human_Kidney_FFPE_Protein" |
25 | | -) |
26 | | -EXPECTED_CELLS = 465545 |
27 | | -EXPECTED_RNA_FEATURES = 405 |
28 | | -EXPECTED_PROTEIN_MARKERS = 27 |
29 | | - |
30 | | - |
31 | | -def _top_rna_features(adata, top_n: int) -> list[dict]: |
32 | | - feature_names = ( |
33 | | - adata.var["name"].astype(str).tolist() |
34 | | - if "name" in adata.var.columns |
35 | | - else adata.var_names.astype(str).tolist() |
| 18 | + from pyXenium.validation.renal_ffpe_protein import ( |
| 19 | + DEFAULT_DATASET_PATH, |
| 20 | + run_validated_renal_ffpe_smoke, |
36 | 21 | ) |
37 | | - nnz = np.asarray(adata.X.getnnz(axis=0)).ravel() |
38 | | - total = np.asarray(adata.X.sum(axis=0)).ravel() |
39 | | - |
40 | | - order = np.argsort(-total)[:top_n] |
41 | | - rows = [] |
42 | | - for idx in order: |
43 | | - rows.append( |
44 | | - { |
45 | | - "feature": feature_names[idx], |
46 | | - "detected_cells": int(nnz[idx]), |
47 | | - "total_counts": float(total[idx]), |
48 | | - } |
49 | | - ) |
50 | | - return rows |
51 | | - |
52 | | - |
53 | | -def _top_protein_markers(adata, top_n: int) -> list[dict]: |
54 | | - protein = adata.obsm["protein"] |
55 | | - protein_df = protein if isinstance(protein, pd.DataFrame) else pd.DataFrame(protein, index=adata.obs_names) |
56 | | - |
57 | | - mean_signal = protein_df.mean(axis=0).sort_values(ascending=False) |
58 | | - rows = [] |
59 | | - for marker, value in mean_signal.head(top_n).items(): |
60 | | - rows.append( |
61 | | - { |
62 | | - "marker": str(marker), |
63 | | - "mean_signal": float(value), |
64 | | - "positive_cells": int((protein_df[marker] > 0).sum()), |
65 | | - } |
66 | | - ) |
67 | | - return rows |
68 | | - |
69 | | - |
70 | | -def _top_clusters(adata, top_n: int) -> list[dict]: |
71 | | - if "cluster" not in adata.obs.columns: |
72 | | - return [] |
73 | | - |
74 | | - counts = adata.obs["cluster"].astype(str).value_counts().head(top_n) |
75 | | - return [{"cluster": str(cluster), "n_cells": int(count)} for cluster, count in counts.items()] |
76 | | - |
77 | | - |
78 | | -def build_summary(base_path: str, prefer: str, top_n: int = 10) -> tuple[dict, object]: |
79 | | - adata = load_xenium_gene_protein(base_path=base_path, prefer=prefer) |
80 | | - |
81 | | - protein = adata.obsm.get("protein") |
82 | | - protein_shape = getattr(protein, "shape", None) |
83 | | - protein_markers = int(protein_shape[1]) if protein_shape is not None else 0 |
84 | | - |
85 | | - summary = { |
86 | | - "dataset_title": RENAL_FFPE_PROTEIN_10X_DATASET.title, |
87 | | - "dataset_url": RENAL_FFPE_PROTEIN_10X_DATASET.url, |
88 | | - "base_path": base_path, |
89 | | - "prefer": prefer, |
90 | | - "n_cells": int(adata.n_obs), |
91 | | - "n_rna_features": int(adata.n_vars), |
92 | | - "n_protein_markers": protein_markers, |
93 | | - "x_nnz": int(getattr(adata.X, "nnz", 0)), |
94 | | - "has_spatial": "spatial" in adata.obsm, |
95 | | - "has_cluster": "cluster" in adata.obs.columns, |
96 | | - "obsm_keys": sorted(adata.obsm.keys()), |
97 | | - "metrics_summary_num_cells_detected": None, |
98 | | - "top_rna_features_by_total_counts": _top_rna_features(adata, top_n=top_n), |
99 | | - "top_protein_markers_by_mean_signal": _top_protein_markers(adata, top_n=top_n), |
100 | | - "largest_clusters": _top_clusters(adata, top_n=top_n), |
101 | | - } |
102 | | - |
103 | | - metrics_path = Path(base_path) / "metrics_summary.csv" |
104 | | - if metrics_path.exists(): |
105 | | - metrics = pd.read_csv(metrics_path) |
106 | | - if "num_cells_detected" in metrics.columns and not metrics.empty: |
107 | | - summary["metrics_summary_num_cells_detected"] = int(metrics.loc[0, "num_cells_detected"]) |
108 | | - |
109 | | - return summary, adata |
110 | | - |
111 | | - |
112 | | -def render_markdown_report(payload: dict) -> str: |
113 | | - summary = payload["summary"] |
114 | | - validated = payload["validated_reference"] |
115 | | - issues = payload["issues"] |
116 | | - |
117 | | - lines = [ |
118 | | - "# pyXenium Smoke Test Report", |
119 | | - "", |
120 | | - f"Dataset: {summary['dataset_title']}", |
121 | | - f"Source: {summary['dataset_url']}", |
122 | | - f"Local path: `{summary['base_path']}`", |
123 | | - f"Backend preference: `{summary['prefer']}`", |
124 | | - "", |
125 | | - "## Core Results", |
126 | | - "", |
127 | | - f"- Cells: `{summary['n_cells']}`", |
128 | | - f"- RNA features: `{summary['n_rna_features']}`", |
129 | | - f"- Protein markers: `{summary['n_protein_markers']}`", |
130 | | - f"- Sparse matrix nnz: `{summary['x_nnz']}`", |
131 | | - f"- Spatial coordinates present: `{summary['has_spatial']}`", |
132 | | - f"- Cluster labels present: `{summary['has_cluster']}`", |
133 | | - f"- metrics_summary.csv detected cells: `{summary['metrics_summary_num_cells_detected']}`", |
134 | | - "", |
135 | | - "## Validated Reference", |
136 | | - "", |
137 | | - f"- Expected cells: `{validated['expected_cells']}`", |
138 | | - f"- Expected RNA features: `{validated['expected_rna_features']}`", |
139 | | - f"- Expected protein markers: `{validated['expected_protein_markers']}`", |
140 | | - "", |
141 | | - "## Largest Clusters", |
142 | | - "", |
143 | | - ] |
144 | | - |
145 | | - for row in summary["largest_clusters"]: |
146 | | - lines.append(f"- `{row['cluster']}`: `{row['n_cells']}` cells") |
147 | | - |
148 | | - lines.extend(["", "## Top RNA Features by Total Counts", ""]) |
149 | | - for row in summary["top_rna_features_by_total_counts"]: |
150 | | - lines.append( |
151 | | - f"- `{row['feature']}`: total counts `{row['total_counts']:.0f}`, detected cells `{row['detected_cells']}`" |
152 | | - ) |
153 | | - |
154 | | - lines.extend(["", "## Top Protein Markers by Mean Signal", ""]) |
155 | | - for row in summary["top_protein_markers_by_mean_signal"]: |
156 | | - lines.append( |
157 | | - f"- `{row['marker']}`: mean signal `{row['mean_signal']:.4f}`, positive cells `{row['positive_cells']}`" |
158 | | - ) |
159 | | - |
160 | | - lines.extend(["", "## Issues", ""]) |
161 | | - if issues: |
162 | | - lines.extend(f"- {issue}" for issue in issues) |
163 | | - else: |
164 | | - lines.append("- No issues detected.") |
165 | | - |
166 | | - lines.append("") |
167 | | - return "\n".join(lines) |
168 | | - |
169 | | - |
170 | | -def write_output_artifacts(payload: dict, output_dir: str | None) -> None: |
171 | | - if not output_dir: |
172 | | - return |
173 | | - |
174 | | - out = Path(output_dir) |
175 | | - out.mkdir(parents=True, exist_ok=True) |
176 | | - |
177 | | - (out / "summary.json").write_text(json.dumps(payload, indent=2) + "\n", encoding="utf-8") |
178 | | - (out / "report.md").write_text(render_markdown_report(payload), encoding="utf-8") |
179 | | - pd.DataFrame(payload["summary"]["top_rna_features_by_total_counts"]).to_csv( |
180 | | - out / "top_rna_features.csv", index=False |
181 | | - ) |
182 | | - pd.DataFrame(payload["summary"]["top_protein_markers_by_mean_signal"]).to_csv( |
183 | | - out / "top_protein_markers.csv", index=False |
184 | | - ) |
185 | | - pd.DataFrame(payload["summary"]["largest_clusters"]).to_csv(out / "largest_clusters.csv", index=False) |
186 | | - |
187 | | - |
188 | | -def validate_summary(summary: dict) -> list[str]: |
189 | | - issues: list[str] = [] |
190 | | - |
191 | | - if summary["n_cells"] != EXPECTED_CELLS: |
192 | | - issues.append(f"Expected {EXPECTED_CELLS} cells, observed {summary['n_cells']}.") |
193 | | - if summary["n_rna_features"] != EXPECTED_RNA_FEATURES: |
194 | | - issues.append( |
195 | | - f"Expected {EXPECTED_RNA_FEATURES} RNA features, observed {summary['n_rna_features']}." |
196 | | - ) |
197 | | - if summary["n_protein_markers"] != EXPECTED_PROTEIN_MARKERS: |
198 | | - issues.append( |
199 | | - f"Expected {EXPECTED_PROTEIN_MARKERS} protein markers, observed {summary['n_protein_markers']}." |
200 | | - ) |
201 | | - if not summary["has_spatial"]: |
202 | | - issues.append("Expected adata.obsm['spatial'] to be present.") |
203 | | - if not summary["has_cluster"]: |
204 | | - issues.append("Expected adata.obs['cluster'] to be present.") |
205 | | - |
206 | | - metric_cells = summary["metrics_summary_num_cells_detected"] |
207 | | - if metric_cells is not None and metric_cells != summary["n_cells"]: |
208 | | - issues.append( |
209 | | - "metrics_summary.csv reports " |
210 | | - f"{metric_cells} detected cells, but pyXenium loaded {summary['n_cells']} cells." |
211 | | - ) |
212 | | - |
213 | | - return issues |
214 | 22 |
|
215 | 23 |
|
216 | 24 | def parse_args() -> argparse.Namespace: |
@@ -266,32 +74,16 @@ def parse_args() -> argparse.Namespace: |
266 | 74 |
|
267 | 75 | def main() -> int: |
268 | 76 | args = parse_args() |
269 | | - summary, adata = build_summary(base_path=args.base_path, prefer=args.prefer, top_n=args.top_n) |
270 | | - issues = validate_summary(summary) |
271 | | - |
272 | | - payload = { |
273 | | - "summary": summary, |
274 | | - "validated_reference": { |
275 | | - "expected_cells": EXPECTED_CELLS, |
276 | | - "expected_rna_features": EXPECTED_RNA_FEATURES, |
277 | | - "expected_protein_markers": EXPECTED_PROTEIN_MARKERS, |
278 | | - }, |
279 | | - "issues": issues, |
280 | | - } |
281 | | - |
282 | | - rendered = json.dumps(payload, indent=2) |
283 | | - print(rendered) |
284 | | - |
285 | | - if args.output_json: |
286 | | - Path(args.output_json).write_text(rendered + "\n", encoding="utf-8") |
287 | | - write_output_artifacts(payload, args.output_dir) |
288 | | - |
289 | | - if args.write_h5ad: |
290 | | - h5ad_path = Path(args.write_h5ad) |
291 | | - h5ad_path.parent.mkdir(parents=True, exist_ok=True) |
292 | | - adata.write_h5ad(h5ad_path) |
293 | | - |
294 | | - if issues and not args.allow_mismatch: |
| 77 | + payload = run_validated_renal_ffpe_smoke( |
| 78 | + base_path=args.base_path, |
| 79 | + prefer=args.prefer, |
| 80 | + top_n=args.top_n, |
| 81 | + output_json=args.output_json, |
| 82 | + output_dir=args.output_dir, |
| 83 | + write_h5ad=args.write_h5ad, |
| 84 | + ) |
| 85 | + print(json.dumps(payload, indent=2)) |
| 86 | + if payload["issues"] and not args.allow_mismatch: |
295 | 87 | return 1 |
296 | 88 | return 0 |
297 | 89 |
|
|
0 commit comments