77import pandas as pd
88from anndata import AnnData
99
10- # 依赖内部 ProteinMicroEnv
10+ # Dependency: internal ProteinMicroEnv
1111try :
1212 from pyXenium .analysis .protein_microenvironment import ProteinMicroEnv
1313except Exception as e :
1414 raise ImportError (
15- "未能导入 pyXenium.analysis.protein_microenvironment.ProteinMicroEnv, "
16- "请确认该类已包含在包内并可被导入。 "
15+ "Failed to import pyXenium.analysis.protein_microenvironment.ProteinMicroEnv. "
16+ "Please ensure that this class is included in the package and can be imported. "
1717 ) from e
1818
19-
2019# ---------------------------
21- # 工具:方法自适应与存储
20+ # Tools: method adaptation and storage
2221# ---------------------------
2322
2423def _subset_kwargs_by_signature (func , ** kwargs ):
@@ -34,15 +33,15 @@ def _call_first_available(obj, names: Sequence[str], **kwargs):
3433 return fn (** _subset_kwargs_by_signature (fn , ** kwargs ))
3534 except Exception as e :
3635 last = e
37- raise RuntimeError (f"尝试的方法均不可用: { names } \n 最后错误: { last !r} " )
36+ raise RuntimeError (f"All attempted methods are unavailable: { names } \n Last error: { last !r} " )
3837
3938def _protein_df (adata : AnnData ) -> pd .DataFrame :
4039 prot = adata .obsm .get ("protein" , None )
4140 if prot is None :
42- raise ValueError ("当前 AnnData 不包含 obsm['protein']。 " )
41+ raise ValueError ("The current AnnData does not contain obsm['protein']. " )
4342 if isinstance (prot , pd .DataFrame ):
4443 return prot
45- # 若不是 DataFrame,则构造列名兜底
44+ # If not a DataFrame, construct fallback column names
4645 cols = getattr (prot , "columns" , None )
4746 if cols is None :
4847 cols = [f"p{ i } " for i in range (prot .shape [1 ])]
@@ -53,11 +52,11 @@ def _normalize_name(s: str) -> str:
5352
5453def _resolve_protein_column (adata : AnnData , preferred : Sequence [str ]) -> Optional [str ]:
5554 """
56- 在 obsm['protein'].columns 中按同义名/大小写不敏感方式查找最佳列。
55+ Find the best column in obsm['protein'].columns using synonyms and case-insensitive matching.
5756 """
5857 prot = _protein_df (adata )
5958 norm_cols = {_normalize_name (c ): c for c in prot .columns }
60- # 一些常用同义词
59+ # Some common synonyms
6160 synonyms = {
6261 "cd8" : ["cd8" , "cd8a" ],
6362 "cd45" : ["cd45" , "ptprc" , "cd45ra" , "cd45rb" , "cd45ro" ],
@@ -68,13 +67,13 @@ def _resolve_protein_column(adata: AnnData, preferred: Sequence[str]) -> Optiona
6867 "alphasma" : ["alphasma" , "αsma" , "alpha-sma" , "acta2" ],
6968 "cd31" : ["cd31" , "pecam1" ],
7069 }
71- # 把 preferred 展开成同义词列表
70+ # Expand preferred list into synonyms
7271 cand_norms : list [str ] = []
7372 for p in preferred :
7473 key = _normalize_name (p )
75- # 同义词集合
74+ # Synonym set
7675 cand_norms .extend (synonyms .get (key , [key ]))
77- # 逐个匹配
76+ # Match one by one
7877 for c in cand_norms :
7978 if c in norm_cols :
8079 return norm_cols [c ]
@@ -90,7 +89,7 @@ def _guess_transcripts_path(base_path: str) -> str:
9089 for p in cands :
9190 if os .path .exists (p ):
9291 return p
93- # 没找到就返回首选,交由 PM 本身报错
92+ # If not found, return the first candidate and let ProteinMicroEnv handle the error.
9493 return cands [0 ]
9594
9695def _pm_init (
@@ -137,7 +136,7 @@ def _compute_gene_stats(
137136 return_long : bool = False ,
138137):
139138 """
140- 返回通常是: index=锚点(或条形码), columns=每个 (gene@ring) 的宽表。
139+ Typically returns a wide DataFrame indexed by anchors (or barcodes), with one column per (gene@ring).
141140 """
142141 candidates = [
143142 ("compute_transcript_stats" , dict (genes = genes ,
@@ -159,7 +158,7 @@ def _compute_gene_stats(
159158 return fn (** _subset_kwargs_by_signature (fn , ** kw ))
160159 except Exception as e :
161160 last = e
162- raise RuntimeError (f"ProteinMicroEnv 无可用基因统计接口;最后错误: { last !r} " )
161+ raise RuntimeError (f"No available gene statistics interface in ProteinMicroEnv; last error: { last !r} " )
163162
164163def _compute_neighbor_cells (env : ProteinMicroEnv , cell_types : pd .Series , how : str = "fraction" ):
165164 return _call_first_available (env , ["compute_neighbor_cells" , "neighbor_cell_stats" ],
@@ -173,25 +172,25 @@ def _store_anchor_df(
173172 uns_key : str ,
174173):
175174 """
176- 存储策略:
177- 1) 完整 DataFrame 放到 uns[uns_key](保留行索引信息);
178- 2) 同步一份到 obsm[obsm_key]:构造 (n_obs × df.shape[1]) 的矩阵,
179- 非锚点行填 NaN;行匹配优先按名字对齐,否则按传入的 anchor_indices 顺序放置。
175+ Storage strategy:
176+ 1) Put the complete DataFrame into uns[uns_key] (preserve row index information);
177+ 2) Also sync a copy to obsm[obsm_key]: construct an (n_obs × df.shape[1]) matrix,
178+ fill non-anchor rows with NaN; align rows by name if possible, otherwise place in order of the provided anchor_indices.
180179 """
181180 if not isinstance (df , pd .DataFrame ):
182- # 尝试包装
181+ # Try to wrap
183182 df = pd .DataFrame (df )
184183
185- # 1) 放 uns
184+ # 1) Store the full DataFrame in uns
186185 adata .uns [uns_key ] = df .copy ()
187186
188- # 2) 铺到 obsm(与 obs 对齐)
187+ # 2) sync to obsm (aligned with obs)
189188 mat = np .full ((adata .n_obs , df .shape [1 ]), np .nan , dtype = float )
190189
191190 if np .all (np .isin (df .index , adata .obs_names )):
192191 row_idx = adata .obs_names .get_indexer (df .index )
193192 else :
194- # 行数不一定等于锚点数,这里取两者的 min 并按次序放置
193+ # The number of rows may not equal the number of anchors; take the smaller of the two and place in order
195194 n = min (len (anchor_indices ), len (df ))
196195 row_idx = np .asarray (anchor_indices [:n ], dtype = int )
197196 mat [row_idx , :] = df .iloc [:n , :].to_numpy ()
@@ -203,9 +202,8 @@ def _store_anchor_df(
203202 adata .obsm [obsm_key ] = mat
204203 adata .uns [obsm_key + "_cols" ] = list (df .columns )
205204
206-
207205# ---------------------------
208- # 免疫微环境
206+ # Immune microenvironment
209207# ---------------------------
210208
211209def run_immune_microenvironment (
@@ -220,14 +218,14 @@ def run_immune_microenvironment(
220218) -> Dict [str , Any ]:
221219 prot = _protein_df (adata )
222220
223- # 锚点: CD8 & CD45(若有 CD3 也可;不存在的自动忽略)
221+ # Anchors: CD8 & CD45 (plus CD3 if available; markers that are not present are ignored automatically)
224222 col_cd8 = _resolve_protein_column (adata , ["CD8" , "CD8A" ])
225223 col_cd45 = _resolve_protein_column (adata , ["CD45" , "PTPRC" ])
226224 col_cd3 = _resolve_protein_column (adata , ["CD3" , "CD3E" , "CD3D" ])
227225
228226 markers = [c for c in [col_cd8 , col_cd45 , col_cd3 ] if c is not None ]
229227 if not markers :
230- raise ValueError ("找不到用于免疫锚点的蛋白列( CD8/CD45[可选CD3])。 " )
228+ raise ValueError ("Cannot find a protein column for immune anchors ( CD8/CD45 [optional CD3]). " )
231229
232230 mask = np .ones (adata .n_obs , dtype = bool )
233231 for m in markers :
@@ -237,23 +235,23 @@ def run_immune_microenvironment(
237235 if anchor_idx .size :
238236 adata .obs .iloc [anchor_idx , adata .obs .columns .get_loc (f"{ out_prefix } _is_anchor" )] = True
239237
240- # 初始化 ProteinMicroEnv
238+ # Initialize ProteinMicroEnv
241239 if transcripts_path is None and base_path is not None :
242240 transcripts_path = _guess_transcripts_path (base_path )
243241 env = _pm_init (adata , transcripts_path , pixel_size_um = pixel_size_um , qv_threshold = qv_threshold )
244242
245- # 设置锚点与邻域
243+ # Set anchors and neighborhood
246244 _set_anchors (env , anchor_idx )
247245 _set_rings (env , ring_edges_um )
248246
249- # 基因邻域统计(宽表)
247+ # Gene neighborhood statistics (wide table)
250248 gene_df = _compute_gene_stats (env , list (genes ), background = "global" , area_norm = True , return_long = False )
251249 if isinstance (gene_df , pd .DataFrame ):
252250 _store_anchor_df (adata , gene_df , anchor_idx ,
253251 obsm_key = f"{ out_prefix } _gene_stats" ,
254252 uns_key = f"{ out_prefix } _gene_stats" )
255253
256- # 邻居细胞组成( alphaSMA= CAF; CD31= Endothelial)
254+ # Neighbor cell composition ( alphaSMA = CAF; CD31 = Endothelial)
257255 col_asma = _resolve_protein_column (adata , ["alphaSMA" , "ACTA2" ])
258256 col_cd31 = _resolve_protein_column (adata , ["CD31" , "PECAM1" ])
259257 cell_type = pd .Series ("Other" , index = adata .obs_names , dtype = object )
@@ -262,7 +260,7 @@ def run_immune_microenvironment(
262260 if col_cd31 is not None :
263261 cell_type .loc [prot [col_cd31 ].to_numpy (dtype = float ) > 0.0 ] = "Endothelial"
264262
265- # how='fraction':返回各环占比;如果你的 PM 实现支持 how= 'count',也可以改成 'count'
263+ # how='fraction': returns the per-ring fraction; if your ProteinMicroEnv implementation supports how='count', you can switch to 'count'.
266264 comp_df = _compute_neighbor_cells (env , cell_type , how = "fraction" )
267265 if isinstance (comp_df , pd .DataFrame ):
268266 _store_anchor_df (adata , comp_df , anchor_idx ,
@@ -271,9 +269,8 @@ def run_immune_microenvironment(
271269
272270 return {"env" : env , "anchors" : anchor_idx }
273271
274-
275272# ---------------------------
276- # 肿瘤-间质边界
273+ # Tumor-Stroma border
277274# ---------------------------
278275
279276def run_tumor_stroma_border (
@@ -288,13 +285,13 @@ def run_tumor_stroma_border(
288285) -> Dict [str , Any ]:
289286 prot = _protein_df (adata )
290287
291- # 锚点:上皮蛋白(优先 PanCK,其次 E-Cadherin/EPCAM)
288+ # Anchor: epithelial protein (prefer PanCK, then E-Cadherin/EPCAM)
292289 col_panck = _resolve_protein_column (adata , ["PanCK" ])
293290 col_ecad = _resolve_protein_column (adata , ["E-Cadherin" , "ECADHERIN" , "ECAD" ])
294291 col_epcam = _resolve_protein_column (adata , ["EPCAM" ])
295292 anchor_col = next ((c for c in [col_panck , col_ecad , col_epcam ] if c is not None ), None )
296293 if anchor_col is None :
297- raise ValueError ("找不到用于肿瘤锚点的蛋白列( PanCK/E-Cadherin/EPCAM)。 " )
294+ raise ValueError ("Cannot find a protein column for tumor anchor ( PanCK/E-Cadherin/EPCAM). " )
298295
299296 mask = prot [anchor_col ].to_numpy (dtype = float ) > 0.0
300297 anchor_idx = np .where (mask )[0 ]
@@ -309,7 +306,7 @@ def run_tumor_stroma_border(
309306 _set_anchors (env , anchor_idx )
310307 _set_rings (env , ring_edges_um )
311308
312- # 邻居细胞组成: CAF / Endothelial
309+ # Neighbor cell composition: CAF / Endothelial
313310 col_asma = _resolve_protein_column (adata , ["alphaSMA" , "ACTA2" ])
314311 col_cd31 = _resolve_protein_column (adata , ["CD31" , "PECAM1" ])
315312 cell_type = pd .Series ("Other" , index = adata .obs_names , dtype = object )
@@ -324,7 +321,7 @@ def run_tumor_stroma_border(
324321 obsm_key = f"{ out_prefix } _neighbor_composition" ,
325322 uns_key = f"{ out_prefix } _neighbor_composition" )
326323
327- # ECM 基因邻域统计
324+ # ECM gene neighborhood statistics
328325 ecm_df = _compute_gene_stats (env , list (ecm_genes ), background = "global" , area_norm = True , return_long = False )
329326 if isinstance (ecm_df , pd .DataFrame ):
330327 _store_anchor_df (adata , ecm_df , anchor_idx ,
@@ -333,9 +330,8 @@ def run_tumor_stroma_border(
333330
334331 return {"env" : env , "anchors" : anchor_idx }
335332
336-
337333# ---------------------------
338- # 统一入口(Notebook 友好)
334+ # Unified entry point (Notebook-friendly)
339335# ---------------------------
340336
341337def analyze_microenvironment (
@@ -349,10 +345,10 @@ def analyze_microenvironment(
349345 output_dir : Optional [str ] = None ,
350346) -> AnnData :
351347 """
352- 统一入口:执行 'immune' 或 'tumor_border' 分析。
353- - 结果表(按锚点×特征)存到 adata.uns[...],并铺平到 adata.obsm[...](非锚点= NaN)。
354- - 锚点布尔标记写入 adata.obs['<prefix>_is_anchor']。
355- - 若 output_dir 指定,会把主要结果另存 CSV。
348+ Unified entry point: execute 'immune' or 'tumor_border' analysis.
349+ - The result tables (anchor x feature) are stored in adata.uns[...] and flattened into adata.obsm[...] (non-anchors = NaN).
350+ - Anchor boolean labels are written to adata.obs['<prefix>_is_anchor'].
351+ - If output_dir is specified, the main results will be saved as CSV files.
356352 """
357353 mode = mode .lower ()
358354 if mode == "immune" :
@@ -376,16 +372,16 @@ def analyze_microenvironment(
376372 f"{ prefix } _neighbor_composition" ,
377373 ]
378374 else :
379- raise ValueError ("mode 必须是 'immune' 或 'tumor_border'。 " )
375+ raise ValueError ("mode must be 'immune' or 'tumor_border'. " )
380376
381- # 可选把 uns 表格另存 CSV
377+ # Optionally save uns tables as CSV
382378 if output_dir :
383379 os .makedirs (output_dir , exist_ok = True )
384380 for k in keys_to_dump :
385381 df = adata .uns .get (k , None )
386382 if isinstance (df , pd .DataFrame ):
387383 df .to_csv (os .path .join (output_dir , f"{ k } .csv" ))
388- # 同时保存锚点名单
384+ # Also save anchor list
389385 anchors = res .get ("anchors" , np .array ([], dtype = int ))
390386 pd .Series (adata .obs_names [anchors ]).to_csv (
391387 os .path .join (output_dir , f"{ prefix } _anchors.csv" ), index = False , header = False
0 commit comments