66import sys
77from pathlib import Path
88
9+ import numpy as np
910import pandas as pd
1011
1112try :
2728EXPECTED_PROTEIN_MARKERS = 27
2829
2930
30- def build_summary (base_path : str , prefer : str ) -> dict :
def _top_rna_features(adata, top_n: int) -> list[dict]:
    """Return the ``top_n`` RNA features ranked by total counts.

    Args:
        adata: Object exposing ``X`` (a count matrix), ``var`` (a table of
            per-feature annotations) and ``var_names``.
            NOTE(review): presumably an AnnData object; confirm with the loader.
        top_n: Maximum number of features to report. Values <= 0 yield an
            empty list.

    Returns:
        A list of dicts with keys ``feature``, ``detected_cells`` (number of
        non-zero entries in the feature's column) and ``total_counts`` (column
        sum), ordered by descending ``total_counts``.
    """
    if top_n <= 0:
        return []

    # Prefer the human-readable "name" column when the annotation table has one.
    if "name" in adata.var.columns:
        feature_names = adata.var["name"].astype(str).tolist()
    else:
        feature_names = adata.var_names.astype(str).tolist()

    matrix = adata.X
    if hasattr(matrix, "getnnz"):
        # Sparse matrices report their stored non-zeros per column directly.
        nnz = np.asarray(matrix.getnnz(axis=0)).ravel()
    else:
        # Dense arrays have no getnnz(); count the non-zero entries instead.
        nnz = np.count_nonzero(np.asarray(matrix), axis=0)

    # Cast to float before negating: negating an unsigned integer total wraps
    # around and would rank zero-count features first.
    total = np.asarray(matrix.sum(axis=0), dtype=np.float64).ravel()

    # A stable sort keeps ties in their original feature order.
    order = np.argsort(-total, kind="stable")[:top_n]
    return [
        {
            "feature": feature_names[idx],
            "detected_cells": int(nnz[idx]),
            "total_counts": float(total[idx]),
        }
        for idx in order
    ]
51+
52+
def _top_protein_markers(adata, top_n: int) -> list[dict]:
    """Return the ``top_n`` protein markers ranked by mean signal.

    Args:
        adata: Object exposing a mapping ``obsm`` and ``obs_names``. The
            ``"protein"`` entry of ``obsm`` may be a DataFrame, an array, or
            absent.
        top_n: Maximum number of markers to report. Values <= 0 yield an
            empty list.

    Returns:
        A list of dicts with keys ``marker``, ``mean_signal`` (column mean)
        and ``positive_cells`` (number of rows with a value > 0), ordered by
        descending ``mean_signal``. Empty when no protein table is present.
    """
    protein = adata.obsm.get("protein")
    # A dataset without a protein layer is reported as "no markers" rather
    # than failing, mirroring how missing cluster labels are handled.
    if protein is None or top_n <= 0:
        return []

    if isinstance(protein, pd.DataFrame):
        protein_df = protein
    else:
        if hasattr(protein, "toarray"):
            # Sparse matrices must be densified before building a DataFrame.
            protein = protein.toarray()
        protein_df = pd.DataFrame(protein, index=adata.obs_names)

    mean_signal = protein_df.mean(axis=0).sort_values(ascending=False, kind="stable")
    return [
        {
            "marker": str(marker),
            "mean_signal": float(value),
            "positive_cells": int((protein_df[marker] > 0).sum()),
        }
        for marker, value in mean_signal.head(top_n).items()
    ]
68+
69+
def _top_clusters(adata, top_n: int) -> list[dict]:
    """Return the ``top_n`` most populated clusters.

    Args:
        adata: Object exposing an ``obs`` table that may contain a
            ``"cluster"`` column.
        top_n: Maximum number of clusters to report. Values <= 0 yield an
            empty list.

    Returns:
        A list of dicts with keys ``cluster`` (label as a string) and
        ``n_cells``, ordered by descending cell count. Empty when ``obs`` has
        no ``"cluster"`` column.
    """
    # Guard non-positive top_n explicitly: Series.head(-n) would otherwise
    # return "all but the last n" entries instead of nothing.
    if top_n <= 0 or "cluster" not in adata.obs.columns:
        return []

    # Labels are compared as strings, so every label is stringified first.
    counts = adata.obs["cluster"].astype(str).value_counts().head(top_n)
    return [
        {"cluster": str(label), "n_cells": int(n_cells)}
        for label, n_cells in counts.items()
    ]
76+
77+
78+ def build_summary (base_path : str , prefer : str , top_n : int = 10 ) -> tuple [dict , object ]:
3179 adata = load_xenium_gene_protein (base_path = base_path , prefer = prefer )
3280
3381 protein = adata .obsm .get ("protein" )
@@ -47,6 +95,9 @@ def build_summary(base_path: str, prefer: str) -> dict:
4795 "has_cluster" : "cluster" in adata .obs .columns ,
4896 "obsm_keys" : sorted (adata .obsm .keys ()),
4997 "metrics_summary_num_cells_detected" : None ,
98+ "top_rna_features_by_total_counts" : _top_rna_features (adata , top_n = top_n ),
99+ "top_protein_markers_by_mean_signal" : _top_protein_markers (adata , top_n = top_n ),
100+ "largest_clusters" : _top_clusters (adata , top_n = top_n ),
50101 }
51102
52103 metrics_path = Path (base_path ) / "metrics_summary.csv"
@@ -55,7 +106,83 @@ def build_summary(base_path: str, prefer: str) -> dict:
55106 if "num_cells_detected" in metrics .columns and not metrics .empty :
56107 summary ["metrics_summary_num_cells_detected" ] = int (metrics .loc [0 , "num_cells_detected" ])
57108
58- return summary
109+ return summary , adata
110+
111+
def render_markdown_report(payload: dict) -> str:
    """Render the smoke-test payload as a Markdown report.

    Args:
        payload: Mapping with the entries ``"summary"`` (dataset metadata,
            core counts and the three ranked lists), ``"validated_reference"``
            (expected counts) and ``"issues"`` (an iterable of messages).

    Returns:
        The report as a single string terminated by one newline. No emitted
        line carries leading or trailing whitespace.

    Raises:
        KeyError: If ``payload`` or its nested mappings lack a key read here.
    """
    summary = payload["summary"]
    validated = payload["validated_reference"]
    issues = payload["issues"]

    lines = [
        "# pyXenium Smoke Test Report",
        "",
        f"Dataset: {summary['dataset_title']}",
        f"Source: {summary['dataset_url']}",
        f"Local path: `{summary['base_path']}`",
        f"Backend preference: `{summary['prefer']}`",
        "",
        "## Core Results",
        "",
        f"- Cells: `{summary['n_cells']}`",
        f"- RNA features: `{summary['n_rna_features']}`",
        f"- Protein markers: `{summary['n_protein_markers']}`",
        f"- Sparse matrix nnz: `{summary['x_nnz']}`",
        f"- Spatial coordinates present: `{summary['has_spatial']}`",
        f"- Cluster labels present: `{summary['has_cluster']}`",
        f"- metrics_summary.csv detected cells: `{summary['metrics_summary_num_cells_detected']}`",
        "",
        "## Validated Reference",
        "",
        f"- Expected cells: `{validated['expected_cells']}`",
        f"- Expected RNA features: `{validated['expected_rna_features']}`",
        f"- Expected protein markers: `{validated['expected_protein_markers']}`",
        "",
        "## Largest Clusters",
        "",
    ]

    # Every list section falls back to a placeholder bullet so that an empty
    # result is visible in the report, matching the "Issues" section.
    cluster_lines = [
        f"- `{row['cluster']}`: `{row['n_cells']}` cells"
        for row in summary["largest_clusters"]
    ]
    lines.extend(cluster_lines or ["- No clusters reported."])

    lines.extend(["", "## Top RNA Features by Total Counts", ""])
    rna_lines = [
        f"- `{row['feature']}`: total counts `{row['total_counts']:.0f}`, "
        f"detected cells `{row['detected_cells']}`"
        for row in summary["top_rna_features_by_total_counts"]
    ]
    lines.extend(rna_lines or ["- No RNA features reported."])

    lines.extend(["", "## Top Protein Markers by Mean Signal", ""])
    protein_lines = [
        f"- `{row['marker']}`: mean signal `{row['mean_signal']:.4f}`, "
        f"positive cells `{row['positive_cells']}`"
        for row in summary["top_protein_markers_by_mean_signal"]
    ]
    lines.extend(protein_lines or ["- No protein markers reported."])

    lines.extend(["", "## Issues", ""])
    issue_lines = [f"- {issue}" for issue in issues]
    lines.extend(issue_lines or ["- No issues detected."])

    # The trailing empty element makes the joined text end with a newline.
    lines.append("")
    return "\n".join(lines)
168+
169+
def write_output_artifacts(payload: dict, output_dir: str | None) -> None:
    """Write the JSON summary, Markdown report and CSV tables to a directory.

    Args:
        payload: Report payload; ``payload["summary"]`` must contain the three
            ranked lists written out as CSV files.
        output_dir: Target directory, created (with parents) when missing.
            Nothing is written when this is ``None`` or empty.

    Raises:
        KeyError: If ``payload`` lacks ``"summary"`` or one of the list keys.
    """
    if not output_dir:
        return

    out = Path(output_dir)
    out.mkdir(parents=True, exist_ok=True)

    (out / "summary.json").write_text(json.dumps(payload, indent=2) + "\n", encoding="utf-8")
    (out / "report.md").write_text(render_markdown_report(payload), encoding="utf-8")

    summary = payload["summary"]
    # (file name, summary key, header columns used when the list is empty)
    tables = (
        (
            "top_rna_features.csv",
            "top_rna_features_by_total_counts",
            ["feature", "detected_cells", "total_counts"],
        ),
        (
            "top_protein_markers.csv",
            "top_protein_markers_by_mean_signal",
            ["marker", "mean_signal", "positive_cells"],
        ),
        (
            "largest_clusters.csv",
            "largest_clusters",
            ["cluster", "n_cells"],
        ),
    )
    for filename, key, columns in tables:
        rows = summary[key]
        # An empty list would otherwise produce a CSV without a header row;
        # supplying the column names keeps the file schema stable.
        frame = pd.DataFrame(rows) if rows else pd.DataFrame(columns=columns)
        frame.to_csv(out / filename, index=False)
59186
60187
61188def validate_summary (summary : dict ) -> list [str ]:
@@ -108,6 +235,12 @@ def parse_args() -> argparse.Namespace:
108235 default = "auto" ,
109236 help = "Preferred matrix backend passed to load_xenium_gene_protein()." ,
110237 )
238+ parser .add_argument (
239+ "--top-n" ,
240+ type = int ,
241+ default = 10 ,
242+ help = "Number of top RNA features, protein markers, and clusters to report." ,
243+ )
111244 parser .add_argument (
112245 "--allow-mismatch" ,
113246 action = "store_true" ,
@@ -118,12 +251,22 @@ def parse_args() -> argparse.Namespace:
118251 default = None ,
119252 help = "Optional path to write the summary JSON." ,
120253 )
254+ parser .add_argument (
255+ "--output-dir" ,
256+ default = None ,
257+ help = "Optional directory for report.md, summary.json, and CSV summaries." ,
258+ )
259+ parser .add_argument (
260+ "--write-h5ad" ,
261+ default = None ,
262+ help = "Optional path to export the loaded AnnData object as an .h5ad file." ,
263+ )
121264 return parser .parse_args ()
122265
123266
124267def main () -> int :
125268 args = parse_args ()
126- summary = build_summary (base_path = args .base_path , prefer = args .prefer )
269+ summary , adata = build_summary (base_path = args .base_path , prefer = args .prefer , top_n = args . top_n )
127270 issues = validate_summary (summary )
128271
129272 payload = {
@@ -141,6 +284,12 @@ def main() -> int:
141284
142285 if args .output_json :
143286 Path (args .output_json ).write_text (rendered + "\n " , encoding = "utf-8" )
287+ write_output_artifacts (payload , args .output_dir )
288+
289+ if args .write_h5ad :
290+ h5ad_path = Path (args .write_h5ad )
291+ h5ad_path .parent .mkdir (parents = True , exist_ok = True )
292+ adata .write_h5ad (h5ad_path )
144293
145294 if issues and not args .allow_mismatch :
146295 return 1
0 commit comments