Skip to content

Commit 4b3d7d0

Browse files
committed
add new feature
1 parent ee229ec commit 4b3d7d0

2 files changed

Lines changed: 337 additions & 0 deletions

File tree

src/pyXenium/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
from ._version import __version__
from .io.partial_xenium_loader import load_anndata_from_partial
from .io.xenium_gene_protein_loader import load_xenium_gene_protein

# src/pyXenium/__init__.py

# Declare the public API explicitly; the loaders above are re-exported on
# purpose, so list them (previously __all__ was empty, which made
# `from pyXenium import *` export nothing).
__all__ = ["load_anndata_from_partial", "load_xenium_gene_protein"]
Lines changed: 336 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,336 @@
1+
# -*- coding: utf-8 -*-
2+
"""
3+
xenium_gene_protein_loader.py
4+
5+
通用读取 10x Xenium In Situ Gene + Protein 输出(XOA ≥ v4.0)为 AnnData 的工具。
6+
- 自动从 cell_feature_matrix(MEX 或 Zarr 或 HDF5)中拆分 RNA / Protein
7+
- RNA 进入 .X(稀疏计数矩阵),Protein 进入 .obsm["protein"](float 强度,DataFrame)
8+
- 附带 cells.csv.gz(obs)与几何信息(obs['x'], obs['y'] / 多边形到 .obsm/.uns)
9+
- 可选读取 morphology_focus/ OME-TIFF 与 QC mask(默认关闭,仅留钩子)
10+
11+
参考:
12+
- Protein features & outputs(XOA v4.0+):10x 文档“Protein data outputs”
13+
- morphology_focus 命名示例与位置:10x “Understanding Xenium Outputs”
14+
- Protein 算法 & 背景偏置说明:10x “Xenium Protein Algorithms”
15+
"""
16+
from __future__ import annotations
17+
18+
import io
19+
import os
20+
import gzip
21+
import json
22+
import warnings
23+
from typing import Optional, Tuple, Dict
24+
25+
import numpy as np
26+
import pandas as pd
27+
import anndata as ad
28+
from scipy import sparse
29+
30+
# 可选依赖,按需导入
31+
try:
32+
import zarr
33+
except Exception: # pragma: no cover
34+
zarr = None
35+
try:
36+
import h5py
37+
except Exception: # pragma: no cover
38+
h5py = None
39+
40+
# 为远程/本地透明访问
41+
import fsspec
42+
43+
44+
# ---------------------------
45+
# 工具函数
46+
# ---------------------------
47+
48+
def _open_text(path_or_url: str):
    """Open a local or remote file in text mode (transparently inflating .gz)."""
    raw = fsspec.open(path_or_url, mode="rb").open()
    if not path_or_url.endswith(".gz"):
        # Plain text: wrap the binary stream directly.
        return io.TextIOWrapper(raw, encoding="utf-8")
    # Gzip-compressed: decompress on the fly.
    return io.TextIOWrapper(gzip.GzipFile(fileobj=raw), encoding="utf-8")
56+
57+
58+
def _exists(path_or_url: str) -> bool:
    """Best-effort probe: True iff *path_or_url* can be opened via fsspec."""
    try:
        fsspec.open(path_or_url).open().close()
    except Exception:
        # Deliberately broad: any failure (missing file, auth, network) means
        # "treat as absent" so callers can fall through to other sources.
        return False
    return True
64+
65+
66+
def _join(base: str, *names: str) -> str:
67+
base = base.rstrip("/")
68+
69+
# 若 base 看起来已经是完整文件名,直接返回
70+
if names and any(n.startswith("http") or n.startswith("gs://") for n in names):
71+
# 子路径本身是完整 URL
72+
return names[-1]
73+
74+
for n in names:
75+
if n is None or n == "":
76+
continue
77+
if n.startswith("/"):
78+
base = n # 允许绝对路径覆盖
79+
else:
80+
base = f"{base}/{n}"
81+
return base
82+
83+
84+
# ---------------------------
85+
# 读 MEX(三件套)
86+
# ---------------------------
87+
88+
def _read_mex_triplet(mex_dir: str,
                      matrix_name: str = "matrix.mtx.gz",
                      features_name: str = "features.tsv.gz",
                      barcodes_name: str = "barcodes.tsv.gz"
                      ) -> Tuple[sparse.csr_matrix, pd.DataFrame, pd.Index]:
    """Read a 10x MEX triplet as (cells x features csr, features_df, barcode Index).

    Fix: ``matrix.mtx`` in the 10x MEX convention stores features x barcodes,
    but the caller treats rows as cells (it aligns rows with the barcode index
    and selects feature columns), so the matrix is transposed here.
    """
    from scipy.io import mmread

    mtx_fp = _join(mex_dir, matrix_name)
    feat_fp = _join(mex_dir, features_name)
    bar_fp = _join(mex_dir, barcodes_name)

    if not all(_exists(p) for p in (mtx_fp, feat_fp, bar_fp)):
        raise FileNotFoundError(
            f"MEX 文件不完整:\n{mtx_fp}\n{feat_fp}\n{bar_fp}"
        )

    with fsspec.open(mtx_fp).open() as f:
        if mtx_fp.endswith(".gz"):
            m = mmread(gzip.GzipFile(fileobj=f))
        else:
            m = mmread(f)
    # 10x MEX is features x barcodes; transpose to cells x features.
    m = m.T.tocsr()

    with _open_text(feat_fp) as f:
        # 10x features.tsv.gz usually has 3-5 columns: id, name, feature_type[, genome, ...]
        features = pd.read_csv(
            f, sep="\t", header=None, comment="#", engine="python"
        )
        # Fallback names so any extra columns still get labels.
        cols = ["id", "name", "feature_type"] + [f"col{i}" for i in range(max(0, features.shape[1] - 3))]
        features.columns = cols[:features.shape[1]]

    with _open_text(bar_fp) as f:
        barcodes = pd.read_csv(f, sep="\t", header=None, engine="python")[0].astype(str).values

    return m, features, pd.Index(barcodes, name="barcode")
124+
125+
126+
# ---------------------------
127+
# 读 Zarr / HDF5(cell_feature_matrix)
128+
# ---------------------------
129+
130+
def _read_cell_feature_matrix_zarr(zarr_root: str) -> Tuple[sparse.csr_matrix, pd.DataFrame, pd.Index]:
    """Read the 10x Zarr flavour of cell_feature_matrix.

    Two layouts are accepted:
      1) <sample>/cell_feature_matrix.zarr/
      2) <sample>/cell_feature_matrix/   (the directory itself is the store)
    """
    if zarr is None:
        raise ImportError("需要 zarr>=2 才能读取 Zarr 格式的 cell_feature_matrix")

    candidates = [
        p
        for p in (_join(zarr_root, n) for n in ("cell_feature_matrix.zarr", "cell_feature_matrix"))
        if _exists(p)
    ]
    if not candidates:
        raise FileNotFoundError("未找到 Zarr cell_feature_matrix(*.zarr 或同名目录)")

    store = zarr.open_group(fsspec.get_mapper(candidates[0]), mode="r")

    # Field names follow the official layout (minor differences may exist
    # between XOA versions).
    X = sparse.csr_matrix(
        (store["X/data"][:], store["X/indices"][:], store["X/indptr"][:]),
        shape=tuple(store["X/shape"][:]),
    )

    feat = pd.DataFrame({
        "id": store["features/id"][:].astype(str),
        "name": store["features/name"][:].astype(str),
        "feature_type": store["features/feature_type"][:].astype(str),
    })

    barcodes = pd.Index(store["barcodes"][:].astype(str), name="barcode")
    return X, feat, barcodes
164+
165+
166+
def _read_cell_feature_matrix_h5(h5_path: str) -> Tuple[sparse.csr_matrix, pd.DataFrame, pd.Index]:
    """Read the HDF5 flavour of cell_feature_matrix, if provided.

    Returns (csr_matrix, features DataFrame, barcode Index).

    NOTE(review): this assumes datasets live under an "X" group plus root
    "features"/"barcodes" entries. The canonical 10x cell-feature HDF5 uses a
    "matrix" group (CSC, features x cells) instead — confirm against an actual
    Xenium cell_feature_matrix.h5 before relying on this reader.
    """
    if h5py is None:
        raise ImportError("需要 h5py 才能读取 HDF5 格式的 cell_feature_matrix")

    # h5py accepts a file-like object, so remote files work through fsspec.
    with fsspec.open(h5_path).open() as fb:
        with h5py.File(fb, "r") as f:
            grp = f["X"]
            # Rebuild the sparse matrix from its CSR components.
            X = sparse.csr_matrix((grp["data"][:], grp["indices"][:], grp["indptr"][:]),
                                  shape=tuple(grp["shape"][:]))
            # astype(str) decodes HDF5 byte strings to unicode (ASCII-safe).
            feat = pd.DataFrame({
                "id": f["features"]["id"][:].astype(str),
                "name": f["features"]["name"][:].astype(str),
                "feature_type": f["features"]["feature_type"][:].astype(str),
            })
            barcodes = pd.Index(f["barcodes"][:].astype(str), name="barcode")
            return X, feat, barcodes
183+
184+
185+
# ---------------------------
186+
# 主函数:读取 Gene + Protein
187+
# ---------------------------
188+
189+
def load_xenium_gene_protein(
    base_path: str,
    *,
    prefer: str = "auto",  # "auto" | "zarr" | "h5" | "mex"
    mex_dirname: str = "cell_feature_matrix",
    mex_matrix_name: str = "matrix.mtx.gz",
    mex_features_name: str = "features.tsv.gz",
    mex_barcodes_name: str = "barcodes.tsv.gz",
    cells_csv: str = "cells.csv.gz",
    cells_parquet: Optional[str] = None,
    read_morphology: bool = False,  # reserved: read morphology_focus/
    attach_boundaries: bool = True,  # attach cell/nucleus boundaries to .uns/.obsm if available
) -> ad.AnnData:
    """
    Load an AnnData holding both RNA and Protein from a Xenium output
    directory (local path or remote URL).

    Parameters
    ----------
    base_path : str
        Directory or URL (e.g. the "directory URL" holding HuggingFace raw files).
    prefer : str
        Source priority (default "auto": try zarr > h5 > mex in order).
    other parameters :
        Custom MEX triplet and cells-table file names.

    Returns
    -------
    AnnData
        - .X: RNA (csr, counts)
        - .var: RNA gene annotation (feature_type == "Gene Expression")
        - .obsm["protein"]: pd.DataFrame (columns are protein markers,
          values are scaled mean intensity)
        - .obs: cell table (cells.csv.gz / parquet)
        - .uns/.obsm: optionally boundaries / centroids

    Fix: removed a dead ``order`` priority dict that was defined but never used.
    """
    # 1) Locate an available cell_feature_matrix source.
    # Zarr
    zarr_candidate = None
    for name in ("cell_feature_matrix.zarr", "cell_feature_matrix"):
        p = _join(base_path, name)
        if _exists(p):
            zarr_candidate = p
            break
    # HDF5
    h5_candidate = _join(base_path, "cell_feature_matrix.h5") if _exists(_join(base_path, "cell_feature_matrix.h5")) else None
    # MEX
    mex_candidate = _join(base_path, mex_dirname) if _exists(_join(base_path, mex_dirname)) else None

    tried = []

    def read_cfm():
        # Priority zarr > h5 > mex; `prefer` restricts to a single source.
        if prefer in ("auto", "zarr") and zarr_candidate:
            tried.append(zarr_candidate)
            # The zarr reader re-resolves the store under base_path itself.
            return _read_cell_feature_matrix_zarr(base_path)
        if prefer in ("auto", "h5") and h5_candidate:
            tried.append(h5_candidate)
            return _read_cell_feature_matrix_h5(h5_candidate)
        if prefer in ("auto", "mex") and mex_candidate:
            tried.append(mex_candidate)
            return _read_mex_triplet(mex_candidate, mex_matrix_name, mex_features_name, mex_barcodes_name)
        raise FileNotFoundError(f"未发现可用的 cell_feature_matrix(尝试:{tried or '无'})")

    X_all, feat_all, barcodes = read_cfm()

    # 2) Split by feature_type.
    if "feature_type" not in feat_all.columns:
        # Legacy fallback: without a type column, treat everything as RNA.
        feat_all["feature_type"] = "Gene Expression"

    # Official labels: protein is "Protein Expression", RNA is "Gene Expression".
    mask_rna = feat_all["feature_type"].astype(str).str.lower().str.contains("gene")
    mask_pro = feat_all["feature_type"].astype(str).str.lower().str.contains("protein")

    # Column index lookup per modality.
    idx_rna = np.where(mask_rna.values)[0]
    idx_pro = np.where(mask_pro.values)[0]

    # Either modality may be absent (RNA-only or protein-only runs).
    X_rna = X_all[:, idx_rna].tocsr() if idx_rna.size else sparse.csr_matrix((X_all.shape[0], 0))
    var_rna = feat_all.loc[mask_rna, ["id", "name", "feature_type"]].copy()
    var_rna.index = var_rna["id"].values

    if idx_pro.size:
        X_pro = X_all[:, idx_pro].astype(np.float32)  # protein values are intensities
        var_pro = feat_all.loc[mask_pro, ["id", "name", "feature_type"]].copy()
        var_pro.index = var_pro["id"].values

        # Store a DataFrame in obsm["protein"] (rows align with .obs; columns
        # use the marker name, suffixed with a position when duplicated).
        pro_names = var_pro["name"].astype(str).values
        if len(set(pro_names)) != len(pro_names):
            pro_names = [f"{n}_{i}" for i, n in enumerate(pro_names)]
        protein_df = pd.DataFrame(X_pro.toarray(), index=barcodes, columns=pro_names)
    else:
        protein_df = pd.DataFrame(index=barcodes)

    # 3) Cell table.
    obs = None
    if cells_parquet and _exists(_join(base_path, cells_parquet)):
        obs = pd.read_parquet(_join(base_path, cells_parquet))
    elif _exists(_join(base_path, cells_csv)):
        with _open_text(_join(base_path, cells_csv)) as f:
            obs = pd.read_csv(f)
    else:
        warnings.warn("未找到 cells 表(cells.csv.gz 或 parquet); 将仅使用条码作为 obs。")
        obs = pd.DataFrame(index=barcodes)

    # Align obs with the matrix barcode order.
    if "cell_id" in obs.columns:
        obs = obs.set_index("cell_id")
    if obs.index.name is None or obs.index.name != "barcode":
        obs.index.name = "barcode"
    # obs may not match barcodes exactly; rebuild against barcodes so rows line up with X.
    obs = obs.reindex(barcodes).copy()

    # 4) Assemble the AnnData.
    adata = ad.AnnData(X=X_rna, obs=obs, var=var_rna)
    adata.layers["rna"] = adata.X.copy()  # explicit layer name
    adata.obsm["protein"] = protein_df

    # Common metadata.
    adata.uns.setdefault("modality", {})
    adata.uns["modality"]["rna"] = {"feature_type": "Gene Expression"}
    if protein_df.shape[1] > 0:
        adata.uns["modality"]["protein"] = {"feature_type": "Protein Expression", "value": "scaled_mean_intensity"}  # per 10x docs

    # 5) Optional: attach centroids / boundaries when available.
    if attach_boundaries:
        # cells.csv.gz commonly carries centroid columns (names vary by version).
        for cand_x, cand_y in (("x_centroid", "y_centroid"), ("cell_x_centroid", "cell_y_centroid"),
                               ("centroid_x", "centroid_y")):
            if cand_x in adata.obs.columns and cand_y in adata.obs.columns:
                adata.obsm["spatial"] = adata.obs[[cand_x, cand_y]].to_numpy()
                break

        # cell_boundaries.csv.gz / nucleus_boundaries.csv.gz go to .uns as-is.
        for fname, key in (("cell_boundaries.csv.gz", "cell_boundaries"),
                           ("nucleus_boundaries.csv.gz", "nucleus_boundaries")):
            p = _join(base_path, fname)
            if _exists(p):
                with _open_text(p) as f:
                    bdf = pd.read_csv(f)
                adata.uns[key] = bdf  # stored raw; convert to polygons (shapely/squidpy) downstream

    # 6) Reserved: morphology_focus (disabled by default).
    if read_morphology:
        # Placeholder only; actual OME-TIFF reading belongs in an image module.
        adata.uns.setdefault("morphology_focus", {"path": _join(base_path, "morphology_focus")})

    return adata

0 commit comments

Comments
 (0)