Skip to content

Commit 19a91b7

Browse files
committed
fix
1 parent 7fd9384 commit 19a91b7

1 file changed

Lines changed: 99 additions & 16 deletions

File tree

src/pyXenium/io/xenium_gene_protein_loader.py

Lines changed: 99 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -163,23 +163,106 @@ def _read_cell_feature_matrix_zarr(zarr_root: str) -> Tuple[sparse.csr_matrix, p
163163
return X, feat, barcodes
164164

165165

166+
from typing import Tuple
167+
import pandas as pd
168+
from scipy import sparse
169+
import h5py, fsspec
170+
166171
def _read_cell_feature_matrix_h5(h5_path: str) -> Tuple[sparse.csr_matrix, pd.DataFrame, pd.Index]:
    """Read a 10x-style HDF5 cell_feature_matrix as a cells x features CSR matrix.

    The reader tolerates several layout variations:

    * the matrix group may be named ``X``, ``matrix`` or ``cell_feature_matrix``;
    * the stored sparse layout may be CSC or CSR (inferred from ``len(indptr)``);
    * ``features`` / ``barcodes`` may live inside the matrix group, next to it,
      or at the file root;
    * feature names may be stored as ``name`` or ``gene_names``.

    Args:
        h5_path: Local path or fsspec-compatible URL of the HDF5 file.

    Returns:
        A tuple ``(X, feat, barcodes)`` where ``X`` is a CSR matrix of shape
        ``(n_barcodes, n_features)``, ``feat`` is a DataFrame with columns
        ``id``, ``name`` and ``feature_type``, and ``barcodes`` is a
        ``pd.Index`` named ``"barcode"``.

    Raises:
        KeyError: If the matrix group, ``features``, the feature names or
            ``barcodes`` cannot be located.
        ValueError: If the sparse layout cannot be inferred from ``indptr``,
            or if the barcode/feature counts disagree with the matrix shape.
    """
    from contextlib import ExitStack

    def _decode(value):
        # h5py hands strings back as bytes; str(b"x") would yield "b'x'".
        return value.decode("utf-8") if isinstance(value, bytes) else str(value)

    def _as_str(ds):
        """Load a dataset and convert byte strings (fixed or variable length) to str."""
        values = ds[()]
        kind = getattr(getattr(values, "dtype", None), "kind", None)
        if kind not in ("S", "O"):
            return values
        items = values.tolist()
        if isinstance(items, list):
            return [_decode(v) for v in items]
        return _decode(items)

    with ExitStack() as stack:
        # Prefer fsspec so both local and remote paths work. If that route fails
        # for any reason, deliberately fall back to letting h5py open the path
        # itself. The stack guarantees that every handle opened here (including
        # the fsspec file object) is closed when the function exits.
        try:
            fb = stack.enter_context(fsspec.open(h5_path))
            f = stack.enter_context(h5py.File(fb, "r"))
        except Exception:
            stack.close()  # release a partially opened fsspec handle
            f = stack.enter_context(h5py.File(h5_path, "r"))

        # 1) Locate the matrix group.
        grp = None
        for key in ("X", "matrix", "cell_feature_matrix"):
            grp = f.get(key)
            if grp is not None:
                break
        if grp is None:
            raise KeyError("Neither 'X' nor 'matrix' nor 'cell_feature_matrix' exists in HDF5.")

        data = grp["data"][()]
        indices = grp["indices"][()]
        indptr = grp["indptr"][()]
        # Stored as (n_features, n_barcodes) in 10x HDF5.
        shape = tuple(int(dim) for dim in grp["shape"][()])

        # 2) Infer the compressed axis from len(indptr).
        #    CSC: len(indptr) == n_cols + 1 == shape[1] + 1  (usual 10x layout)
        #    CSR: len(indptr) == n_rows + 1 == shape[0] + 1
        # CSC is tested first so that a square matrix, where both conditions
        # hold, is interpreted using the usual 10x layout.
        n_ptr = len(indptr)
        if n_ptr == shape[1] + 1:
            mat = sparse.csc_matrix((data, indices, indptr), shape=shape)
        elif n_ptr == shape[0] + 1:
            mat = sparse.csr_matrix((data, indices, indptr), shape=shape)
        else:
            raise ValueError(
                f"Cannot infer matrix format: len(indptr)={len(indptr)}, "
                f"shape={shape} (expect {shape[0]+1} for CSR rows or {shape[1]+1} for CSC cols)."
            )
        # Stored orientation is features x barcodes; callers get cells x features.
        X = mat.T.tocsr()

        # 3) Locate features / barcodes: inside the matrix group, beside it, or at the root.
        def _find(name):
            for node in (grp, grp.parent, f):
                if node is not None and name in node:
                    return node[name]
            return None

        feat_grp = _find("features")
        if feat_grp is None:
            raise KeyError("Cannot find 'features' group.")

        name_ds = feat_grp.get("name")
        if name_ds is None:
            name_ds = feat_grp.get("gene_names")
        if name_ds is None:
            raise KeyError("Cannot find 'features/name' (or 'gene_names').")

        feat = pd.DataFrame({
            "id": _as_str(feat_grp["id"]),
            "name": _as_str(name_ds),
            "feature_type": _as_str(feat_grp["feature_type"]),
        })

        bc_ds = _find("barcodes")
        if bc_ds is None:
            raise KeyError("Cannot find 'barcodes'.")
        barcodes = pd.Index(_as_str(bc_ds), name="barcode")

        # 4) Consistency checks, so a mismatched file fails early and loudly.
        n_cells, n_features = X.shape
        if len(barcodes) != n_cells:
            raise ValueError(f"Barcodes length {len(barcodes)} != X.shape[0] (cells) {n_cells}.")
        if len(feat) != n_features:
            raise ValueError(f"Features length {len(feat)} != X.shape[1] (features) {n_features}.")

        # Everything returned is already in memory, so closing the file is safe.
        return X, feat, barcodes
183266

184267

185268
# ---------------------------

0 commit comments

Comments
 (0)