@@ -163,23 +163,106 @@ def _read_cell_feature_matrix_zarr(zarr_root: str) -> Tuple[sparse.csr_matrix, p
163163 return X , feat , barcodes
164164
165165
166+ from typing import Tuple
167+ import pandas as pd
168+ from scipy import sparse
169+ import h5py , fsspec
170+
def _read_cell_feature_matrix_h5(h5_path: str) -> Tuple[sparse.csr_matrix, pd.DataFrame, pd.Index]:
    """Read a 10x-style HDF5 ``cell_feature_matrix`` as a cells x features CSR matrix.

    The matrix group is looked up under ``"X"``, ``"matrix"`` or
    ``"cell_feature_matrix"`` (first match wins) and must contain the
    ``data``, ``indices``, ``indptr`` and ``shape`` datasets. ``shape`` is
    interpreted as ``(n_features, n_barcodes)``; the storage layout (CSR vs.
    CSC) is inferred from ``len(indptr)`` and the result is always transposed
    to ``(n_barcodes, n_features)``.

    ``features`` and ``barcodes`` are searched for in the matrix group, then
    in its parent, then at the file root.

    Args:
        h5_path: Path or URL of the HDF5 file. It is opened through
            ``fsspec`` first; if that fails for any reason, the path is
            handed to ``h5py.File`` directly.

    Returns:
        A tuple ``(X, feat, barcodes)`` where ``X`` is a
        ``scipy.sparse.csr_matrix`` of shape ``(n_barcodes, n_features)``,
        ``feat`` is a DataFrame with columns ``id``, ``name`` and
        ``feature_type``, and ``barcodes`` is an Index named ``"barcode"``.

    Raises:
        KeyError: If the matrix group, ``features``, the feature names or
            ``barcodes`` cannot be found.
        ValueError: If ``shape`` is not two-dimensional, the sparse layout
            cannot be inferred, or the barcode/feature counts disagree with
            the matrix shape.
    """
    # Function-scope import: keeps this block self-contained without touching
    # the module-level import section.
    from contextlib import ExitStack

    def _as_str(ds):
        """Load a dataset fully and convert bytes/object arrays to ``str``."""
        arr = ds[()]
        if getattr(arr, "dtype", None) is not None and arr.dtype.kind in ("S", "O"):
            return arr.astype(str)
        return arr

    def _first_present(node, names):
        """Return the first child of *node* found under any of *names*, else None."""
        for name in names:
            child = node.get(name)
            if child is not None:
                return child
        return None

    # Open via fsspec (local or remote). Every handle opened here is tracked
    # in ``handles`` so that both the h5py file AND the underlying fsspec
    # file object are closed, also when opening fails half-way.
    try:
        with ExitStack() as attempt:
            fb = attempt.enter_context(fsspec.open(h5_path))
            f = attempt.enter_context(h5py.File(fb, "r"))
            # Success: transfer ownership so leaving ``attempt`` closes nothing.
            handles = attempt.pop_all()
    except Exception:
        # Deliberate best-effort fallback: let h5py open the path itself.
        handles = ExitStack()
        f = handles.enter_context(h5py.File(h5_path, "r"))

    with handles:
        # 1) Locate the matrix group.
        grp = _first_present(f, ("X", "matrix", "cell_feature_matrix"))
        if grp is None:
            raise KeyError("Neither 'X' nor 'matrix' nor 'cell_feature_matrix' exists in HDF5.")

        data = grp["data"][()]
        indices = grp["indices"][()]
        indptr = grp["indptr"][()]
        # Interpreted as (n_features, n_barcodes).
        shape = tuple(int(dim) for dim in grp["shape"][()])
        if len(shape) != 2:
            raise ValueError(f"Expected a 2-D matrix shape, got shape={shape}.")

        # 2) Infer CSR vs. CSC from the length of indptr.
        #    CSR: len(indptr) == shape[0] + 1 (row-compressed, rows = features)
        #    CSC: len(indptr) == shape[1] + 1 (column-compressed, cols = barcodes)
        # NOTE(review): for a square matrix both conditions hold and CSR wins;
        # confirm that this is the intended layout for such files.
        if len(indptr) == shape[0] + 1:
            mat = sparse.csr_matrix((data, indices, indptr), shape=shape)
        elif len(indptr) == shape[1] + 1:
            mat = sparse.csc_matrix((data, indices, indptr), shape=shape)
        else:
            raise ValueError(
                f"Cannot infer matrix format: len(indptr)={len(indptr)}, "
                f"shape={shape} (expect {shape[0]+1} for CSR rows or {shape[1]+1} for CSC cols)."
            )
        # Callers get cells x features, so transpose in either case.
        X = mat.T.tocsr()  # (n_barcodes, n_features)

        # 3) Locate features / barcodes: under the matrix group, its parent,
        #    or the file root.
        def _find(node, name):
            if name in node:
                return node[name]
            if hasattr(node, "parent") and node.parent is not None and name in node.parent:
                return node.parent[name]
            if name in f:
                return f[name]
            return None

        feat_grp = _find(grp, "features")
        if feat_grp is None:
            raise KeyError("Cannot find 'features' group.")

        name_ds = _first_present(feat_grp, ("name", "gene_names"))
        if name_ds is None:
            raise KeyError("Cannot find 'features/name' (or 'gene_names').")

        feat = pd.DataFrame({
            "id": _as_str(feat_grp["id"]),
            "name": _as_str(name_ds),
            "feature_type": _as_str(feat_grp["feature_type"]),
        })

        bc_ds = _find(grp, "barcodes")
        if bc_ds is None:
            raise KeyError("Cannot find 'barcodes'.")
        barcodes = pd.Index(_as_str(bc_ds), name="barcode")

        # 4) Consistency checks, to surface layout problems early.
        n_cells, n_features = X.shape
        if len(barcodes) != n_cells:
            raise ValueError(f"Barcodes length {len(barcodes)} != X.shape[0] (cells) {n_cells}.")
        if len(feat) != n_features:
            raise ValueError(f"Features length {len(feat)} != X.shape[1] (features) {n_features}.")

        # Everything returned was read into memory above, so it stays valid
        # after the file handles are closed.
        return X, feat, barcodes
183266
184267
185268# ---------------------------
0 commit comments