|
| 1 | +# -*- coding: utf-8 -*- |
| 2 | +""" |
| 3 | +xenium_gene_protein_loader.py |
| 4 | +
|
| 5 | +通用读取 10x Xenium In Situ Gene + Protein 输出(XOA ≥ v4.0)为 AnnData 的工具。 |
| 6 | +- 自动从 cell_feature_matrix(MEX 或 Zarr 或 HDF5)中拆分 RNA / Protein |
| 7 | +- RNA 进入 .X(稀疏计数矩阵),Protein 进入 .obsm["protein"](float 强度,DataFrame) |
| 8 | +- 附带 cells.csv.gz(obs)与几何信息(obs['x'], obs['y'] / 多边形到 .obsm/.uns) |
| 9 | +- 可选读取 morphology_focus/ OME-TIFF 与 QC mask(默认关闭,仅留钩子) |
| 10 | +
|
| 11 | +参考: |
| 12 | +- Protein features & outputs(XOA v4.0+):10x 文档“Protein data outputs” |
| 13 | +- morphology_focus 命名示例与位置:10x “Understanding Xenium Outputs” |
| 14 | +- Protein 算法 & 背景偏置说明:10x “Xenium Protein Algorithms” |
| 15 | +""" |
| 16 | +from __future__ import annotations |
| 17 | + |
| 18 | +import io |
| 19 | +import os |
| 20 | +import gzip |
| 21 | +import json |
| 22 | +import warnings |
| 23 | +from typing import Optional, Tuple, Dict |
| 24 | + |
| 25 | +import numpy as np |
| 26 | +import pandas as pd |
| 27 | +import anndata as ad |
| 28 | +from scipy import sparse |
| 29 | + |
| 30 | +# 可选依赖,按需导入 |
| 31 | +try: |
| 32 | + import zarr |
| 33 | +except Exception: # pragma: no cover |
| 34 | + zarr = None |
| 35 | +try: |
| 36 | + import h5py |
| 37 | +except Exception: # pragma: no cover |
| 38 | + h5py = None |
| 39 | + |
| 40 | +# 为远程/本地透明访问 |
| 41 | +import fsspec |
| 42 | + |
| 43 | + |
| 44 | +# --------------------------- |
| 45 | +# 工具函数 |
| 46 | +# --------------------------- |
| 47 | + |
| 48 | +def _open_text(path_or_url: str): |
| 49 | + """以文本模式打开本地或远程文件(自动识别 .gz)""" |
| 50 | + f = fsspec.open(path_or_url, mode="rb").open() |
| 51 | + # 自动解压 .gz |
| 52 | + if path_or_url.endswith(".gz"): |
| 53 | + return io.TextIOWrapper(gzip.GzipFile(fileobj=f), encoding="utf-8") |
| 54 | + # 普通文本 |
| 55 | + return io.TextIOWrapper(f, encoding="utf-8") |
| 56 | + |
| 57 | + |
| 58 | +def _exists(path_or_url: str) -> bool: |
| 59 | + try: |
| 60 | + with fsspec.open(path_or_url).open() as _: |
| 61 | + return True |
| 62 | + except Exception: |
| 63 | + return False |
| 64 | + |
| 65 | + |
| 66 | +def _join(base: str, *names: str) -> str: |
| 67 | + base = base.rstrip("/") |
| 68 | + |
| 69 | + # 若 base 看起来已经是完整文件名,直接返回 |
| 70 | + if names and any(n.startswith("http") or n.startswith("gs://") for n in names): |
| 71 | + # 子路径本身是完整 URL |
| 72 | + return names[-1] |
| 73 | + |
| 74 | + for n in names: |
| 75 | + if n is None or n == "": |
| 76 | + continue |
| 77 | + if n.startswith("/"): |
| 78 | + base = n # 允许绝对路径覆盖 |
| 79 | + else: |
| 80 | + base = f"{base}/{n}" |
| 81 | + return base |
| 82 | + |
| 83 | + |
| 84 | +# --------------------------- |
| 85 | +# 读 MEX(三件套) |
| 86 | +# --------------------------- |
| 87 | + |
| 88 | +def _read_mex_triplet(mex_dir: str, |
| 89 | + matrix_name: str = "matrix.mtx.gz", |
| 90 | + features_name: str = "features.tsv.gz", |
| 91 | + barcodes_name: str = "barcodes.tsv.gz" |
| 92 | + ) -> Tuple[sparse.csr_matrix, pd.DataFrame, pd.Index]: |
| 93 | + """读取 MEX 为 (csr_matrix, features_df, barcodes_index)""" |
| 94 | + from scipy.io import mmread |
| 95 | + |
| 96 | + mtx_fp = _join(mex_dir, matrix_name) |
| 97 | + feat_fp = _join(mex_dir, features_name) |
| 98 | + bar_fp = _join(mex_dir, barcodes_name) |
| 99 | + |
| 100 | + if not all(_exists(p) for p in (mtx_fp, feat_fp, bar_fp)): |
| 101 | + raise FileNotFoundError( |
| 102 | + f"MEX 文件不完整:\n{mtx_fp}\n{feat_fp}\n{bar_fp}" |
| 103 | + ) |
| 104 | + |
| 105 | + with fsspec.open(mtx_fp).open() as f: |
| 106 | + if mtx_fp.endswith(".gz"): |
| 107 | + m = mmread(gzip.GzipFile(fileobj=f)).tocsr() |
| 108 | + else: |
| 109 | + m = mmread(f).tocsr() |
| 110 | + |
| 111 | + with _open_text(feat_fp) as f: |
| 112 | + # 10x features.tsv.gz 通常 3~5 列:id, name, feature_type[, genome, ...] |
| 113 | + features = pd.read_csv( |
| 114 | + f, sep="\t", header=None, comment="#", engine="python" |
| 115 | + ) |
| 116 | + # 兜底列名 |
| 117 | + cols = ["id", "name", "feature_type"] + [f"col{i}" for i in range(max(0, features.shape[1] - 3))] |
| 118 | + features.columns = cols[:features.shape[1]] |
| 119 | + |
| 120 | + with _open_text(bar_fp) as f: |
| 121 | + barcodes = pd.read_csv(f, sep="\t", header=None, engine="python")[0].astype(str).values |
| 122 | + |
| 123 | + return m, features, pd.Index(barcodes, name="barcode") |
| 124 | + |
| 125 | + |
| 126 | +# --------------------------- |
| 127 | +# 读 Zarr / HDF5(cell_feature_matrix) |
| 128 | +# --------------------------- |
| 129 | + |
| 130 | +def _read_cell_feature_matrix_zarr(zarr_root: str) -> Tuple[sparse.csr_matrix, pd.DataFrame, pd.Index]: |
| 131 | + """读取 10x Zarr 版 cell_feature_matrix.""" |
| 132 | + if zarr is None: |
| 133 | + raise ImportError("需要 zarr>=2 才能读取 Zarr 格式的 cell_feature_matrix") |
| 134 | + |
| 135 | + # 兼容两种布局: |
| 136 | + # 1) <sample>/cell_feature_matrix.zarr/ |
| 137 | + # 2) <sample>/cell_feature_matrix/ (直接是 Zarr store) |
| 138 | + cand = [] |
| 139 | + for name in ("cell_feature_matrix.zarr", "cell_feature_matrix"): |
| 140 | + p = _join(zarr_root, name) |
| 141 | + if _exists(p): |
| 142 | + cand.append(p) |
| 143 | + if not cand: |
| 144 | + raise FileNotFoundError("未找到 Zarr cell_feature_matrix(*.zarr 或同名目录)") |
| 145 | + |
| 146 | + store_path = cand[0] |
| 147 | + store = zarr.open_group(fsspec.get_mapper(store_path), mode="r") |
| 148 | + |
| 149 | + # 官方约定字段(不同版本可能略有差异) |
| 150 | + data = store["X/data"][:] |
| 151 | + indices = store["X/indices"][:] |
| 152 | + indptr = store["X/indptr"][:] |
| 153 | + shape = tuple(store["X/shape"][:]) |
| 154 | + X = sparse.csr_matrix((data, indices, indptr), shape=shape) |
| 155 | + |
| 156 | + feat = pd.DataFrame({ |
| 157 | + "id": store["features/id"][:].astype(str), |
| 158 | + "name": store["features/name"][:].astype(str), |
| 159 | + "feature_type": store["features/feature_type"][:].astype(str), |
| 160 | + }) |
| 161 | + |
| 162 | + barcodes = pd.Index(store["barcodes"][:].astype(str), name="barcode") |
| 163 | + return X, feat, barcodes |
| 164 | + |
| 165 | + |
| 166 | +def _read_cell_feature_matrix_h5(h5_path: str) -> Tuple[sparse.csr_matrix, pd.DataFrame, pd.Index]: |
| 167 | + """读取 10x HDF5 版 cell_feature_matrix(若提供)""" |
| 168 | + if h5py is None: |
| 169 | + raise ImportError("需要 h5py 才能读取 HDF5 格式的 cell_feature_matrix") |
| 170 | + |
| 171 | + with fsspec.open(h5_path).open() as fb: |
| 172 | + with h5py.File(fb, "r") as f: |
| 173 | + grp = f["X"] |
| 174 | + X = sparse.csr_matrix((grp["data"][:], grp["indices"][:], grp["indptr"][:]), |
| 175 | + shape=tuple(grp["shape"][:])) |
| 176 | + feat = pd.DataFrame({ |
| 177 | + "id": f["features"]["id"][:].astype(str), |
| 178 | + "name": f["features"]["name"][:].astype(str), |
| 179 | + "feature_type": f["features"]["feature_type"][:].astype(str), |
| 180 | + }) |
| 181 | + barcodes = pd.Index(f["barcodes"][:].astype(str), name="barcode") |
| 182 | + return X, feat, barcodes |
| 183 | + |
| 184 | + |
| 185 | +# --------------------------- |
| 186 | +# 主函数:读取 Gene + Protein |
| 187 | +# --------------------------- |
| 188 | + |
| 189 | +def load_xenium_gene_protein( |
| 190 | + base_path: str, |
| 191 | + *, |
| 192 | + prefer: str = "auto", # "auto" | "zarr" | "h5" | "mex" |
| 193 | + mex_dirname: str = "cell_feature_matrix", |
| 194 | + mex_matrix_name: str = "matrix.mtx.gz", |
| 195 | + mex_features_name: str = "features.tsv.gz", |
| 196 | + mex_barcodes_name: str = "barcodes.tsv.gz", |
| 197 | + cells_csv: str = "cells.csv.gz", |
| 198 | + cells_parquet: Optional[str] = None, |
| 199 | + read_morphology: bool = False, # 预留:是否读 morphology_focus/ |
| 200 | + attach_boundaries: bool = True, # 若可用,将 cell/nucleus 边界附加进 .uns/.obsm |
| 201 | +) -> ad.AnnData: |
| 202 | + """ |
| 203 | + 从 Xenium 输出目录(本地/远程)读取同时包含 RNA + Protein 的 AnnData。 |
| 204 | + 参数 |
| 205 | + ---- |
| 206 | + base_path: 目录或 URL(例如 HuggingFace 原始文件链接所在的“目录 URL”) |
| 207 | + prefer: 读取优先级(默认 auto:依次尝试 zarr > h5 > mex) |
| 208 | + 其它参数: 自定义 MEX 三件套与 cells 文件名 |
| 209 | + 返回 |
| 210 | + ---- |
| 211 | + AnnData |
| 212 | + - .X: RNA(csr,计数) |
| 213 | + - .var: RNA 基因注释(仅 feature_type == "Gene Expression") |
| 214 | + - .obsm["protein"]: pd.DataFrame(列为蛋白 marker,值为 scaled mean intensity) |
| 215 | + - .obs: 细胞表(cells.csv.gz/parquet) |
| 216 | + - .uns/.obsm: 可选地附加边界/质心等 |
| 217 | + """ |
| 218 | + # 1) 选择可用的 cell_feature_matrix 源 |
| 219 | + # Zarr |
| 220 | + zarr_candidate = None |
| 221 | + for name in ("cell_feature_matrix.zarr", "cell_feature_matrix"): |
| 222 | + p = _join(base_path, name) |
| 223 | + if _exists(p): |
| 224 | + zarr_candidate = p |
| 225 | + break |
| 226 | + # HDF5 |
| 227 | + h5_candidate = _join(base_path, "cell_feature_matrix.h5") if _exists(_join(base_path, "cell_feature_matrix.h5")) else None |
| 228 | + # MEX |
| 229 | + mex_candidate = _join(base_path, mex_dirname) if _exists(_join(base_path, mex_dirname)) else None |
| 230 | + |
| 231 | + # 策略 |
| 232 | + order = { |
| 233 | + "zarr": 0, "h5": 1, "mex": 2 |
| 234 | + } |
| 235 | + tried = [] |
| 236 | + |
| 237 | + def read_cfm(): |
| 238 | + if prefer in ("auto", "zarr") and zarr_candidate: |
| 239 | + tried.append(zarr_candidate) |
| 240 | + return _read_cell_feature_matrix_zarr(base_path) |
| 241 | + if prefer in ("auto", "h5") and h5_candidate: |
| 242 | + tried.append(h5_candidate) |
| 243 | + return _read_cell_feature_matrix_h5(h5_candidate) |
| 244 | + if prefer in ("auto", "mex") and mex_candidate: |
| 245 | + tried.append(mex_candidate) |
| 246 | + return _read_mex_triplet(mex_candidate, mex_matrix_name, mex_features_name, mex_barcodes_name) |
| 247 | + raise FileNotFoundError(f"未发现可用的 cell_feature_matrix(尝试:{tried or '无'})") |
| 248 | + |
| 249 | + X_all, feat_all, barcodes = read_cfm() |
| 250 | + |
| 251 | + # 2) 拆分 feature_type |
| 252 | + if "feature_type" not in feat_all.columns: |
| 253 | + # 旧版兜底(当缺少类型时视为全是 Gene) |
| 254 | + feat_all["feature_type"] = "Gene Expression" |
| 255 | + |
| 256 | + # 官方:蛋白为“Protein Expression”,RNA 为“Gene Expression” |
| 257 | + mask_rna = feat_all["feature_type"].astype(str).str.lower().str.contains("gene") |
| 258 | + mask_pro = feat_all["feature_type"].astype(str).str.lower().str.contains("protein") |
| 259 | + |
| 260 | + # 索引映射 |
| 261 | + idx_rna = np.where(mask_rna.values)[0] |
| 262 | + idx_pro = np.where(mask_pro.values)[0] |
| 263 | + |
| 264 | + # 任何一种缺失都允许(用户可能只有 RNA 或只有 Protein) |
| 265 | + X_rna = X_all[:, idx_rna].tocsr() if idx_rna.size else sparse.csr_matrix((X_all.shape[0], 0)) |
| 266 | + var_rna = feat_all.loc[mask_rna, ["id", "name", "feature_type"]].copy() |
| 267 | + var_rna.index = var_rna["id"].values |
| 268 | + |
| 269 | + if idx_pro.size: |
| 270 | + X_pro = X_all[:, idx_pro].astype(np.float32) # protein 是强度 |
| 271 | + var_pro = feat_all.loc[mask_pro, ["id", "name", "feature_type"]].copy() |
| 272 | + var_pro.index = var_pro["id"].values |
| 273 | + |
| 274 | + # 以 DataFrame 放到 obsm["protein"](行与 .obs 对齐;列用 protein marker name,若重名则回退 id) |
| 275 | + pro_names = var_pro["name"].astype(str).values |
| 276 | + # 防止重名 |
| 277 | + if len(set(pro_names)) != len(pro_names): |
| 278 | + pro_names = [f"{n}_{i}" for i, n in enumerate(pro_names)] |
| 279 | + protein_df = pd.DataFrame(X_pro.toarray(), index=barcodes, columns=pro_names) |
| 280 | + else: |
| 281 | + protein_df = pd.DataFrame(index=barcodes) |
| 282 | + |
| 283 | + # 3) cells 表 |
| 284 | + obs = None |
| 285 | + if cells_parquet and _exists(_join(base_path, cells_parquet)): |
| 286 | + obs = pd.read_parquet(_join(base_path, cells_parquet)) |
| 287 | + elif _exists(_join(base_path, cells_csv)): |
| 288 | + with _open_text(_join(base_path, cells_csv)) as f: |
| 289 | + obs = pd.read_csv(f) |
| 290 | + else: |
| 291 | + warnings.warn("未找到 cells 表(cells.csv.gz 或 parquet); 将仅使用条码作为 obs。") |
| 292 | + obs = pd.DataFrame(index=barcodes) |
| 293 | + |
| 294 | + # 对齐 obs 顺序,确保与 barcodes 一致 |
| 295 | + if "cell_id" in obs.columns: |
| 296 | + obs = obs.set_index("cell_id") |
| 297 | + if obs.index.name is None or obs.index.name != "barcode": |
| 298 | + obs.index.name = "barcode" |
| 299 | + # 可能出现 obs 与 barcodes 不完全一致,按 barcodes 重建并对齐 |
| 300 | + obs = obs.reindex(barcodes).copy() |
| 301 | + |
| 302 | + # 4) 组装 AnnData |
| 303 | + adata = ad.AnnData(X=X_rna, obs=obs, var=var_rna) |
| 304 | + adata.layers["rna"] = adata.X.copy() # 显式命名 |
| 305 | + adata.obsm["protein"] = protein_df |
| 306 | + |
| 307 | + # 常用元数据 |
| 308 | + adata.uns.setdefault("modality", {}) |
| 309 | + adata.uns["modality"]["rna"] = {"feature_type": "Gene Expression"} |
| 310 | + if protein_df.shape[1] > 0: |
| 311 | + adata.uns["modality"]["protein"] = {"feature_type": "Protein Expression", "value": "scaled_mean_intensity"} # 见 10x 文档 |
| 312 | + |
| 313 | + # 5) 可选:附加边界/质心(如可用) |
| 314 | + if attach_boundaries: |
| 315 | + # cells.csv.gz 中常有 centroid_x/centroid_y/… 字段(不同版本字段名略异) |
| 316 | + for cand_x, cand_y in (("x_centroid", "y_centroid"), ("cell_x_centroid", "cell_y_centroid"), |
| 317 | + ("centroid_x", "centroid_y")): |
| 318 | + if cand_x in adata.obs.columns and cand_y in adata.obs.columns: |
| 319 | + adata.obsm["spatial"] = adata.obs[[cand_x, cand_y]].to_numpy() |
| 320 | + break |
| 321 | + |
| 322 | + # cell_boundaries.csv.gz / nucleus_boundaries.csv.gz 可存于 .uns/.obsm |
| 323 | + for fname, key in (("cell_boundaries.csv.gz", "cell_boundaries"), |
| 324 | + ("nucleus_boundaries.csv.gz", "nucleus_boundaries")): |
| 325 | + p = _join(base_path, fname) |
| 326 | + if _exists(p): |
| 327 | + with _open_text(p) as f: |
| 328 | + bdf = pd.read_csv(f) |
| 329 | + adata.uns[key] = bdf # 先原样放入;高阶用户可用 shapely/squidpy 转多边形 |
| 330 | + |
| 331 | + # 6) 预留:读 morphology_focus(默认关闭) |
| 332 | + if read_morphology: |
| 333 | + # 仅做占位说明;实际读取/拼接 OME-TIFF 建议在单独的图像模块中完成 |
| 334 | + adata.uns.setdefault("morphology_focus", {"path": _join(base_path, "morphology_focus")}) |
| 335 | + |
| 336 | + return adata |
0 commit comments