Skip to content

Commit f5c9e4c

Browse files
committed
add new feature
1 parent 1a4e436 commit f5c9e4c

1 file changed

Lines changed: 44 additions & 48 deletions

File tree

src/pyXenium/analysis/microenv_analysis.py

Lines changed: 44 additions & 48 deletions
Original file line number | Diff line number | Diff line change
@@ -7,18 +7,17 @@
77
import pandas as pd
88
from anndata import AnnData
99

10-
# 依赖内部 ProteinMicroEnv
10+
# Dependency: internal ProteinMicroEnv
1111
try:
1212
from pyXenium.analysis.protein_microenvironment import ProteinMicroEnv
1313
except Exception as e:
1414
raise ImportError(
15-
"未能导入 pyXenium.analysis.protein_microenvironment.ProteinMicroEnv"
16-
"请确认该类已包含在包内并可被导入。"
15+
"Failed to import pyXenium.analysis.protein_microenvironment.ProteinMicroEnv. "
16+
"Please ensure that this class is included in the package and can be imported."
1717
) from e
1818

19-
2019
# ---------------------------
21-
# 工具:方法自适应与存储
20+
# Tools: method adaptation and storage
2221
# ---------------------------
2322

2423
def _subset_kwargs_by_signature(func, **kwargs):
@@ -34,15 +33,15 @@ def _call_first_available(obj, names: Sequence[str], **kwargs):
3433
return fn(**_subset_kwargs_by_signature(fn, **kwargs))
3534
except Exception as e:
3635
last = e
37-
raise RuntimeError(f"尝试的方法均不可用:{names}\n最后错误:{last!r}")
36+
raise RuntimeError(f"All attempted methods are unavailable: {names}\nLast error: {last!r}")
3837

3938
def _protein_df(adata: AnnData) -> pd.DataFrame:
4039
prot = adata.obsm.get("protein", None)
4140
if prot is None:
42-
raise ValueError("当前 AnnData 不包含 obsm['protein']")
41+
raise ValueError("The current AnnData does not contain obsm['protein'].")
4342
if isinstance(prot, pd.DataFrame):
4443
return prot
45-
# 若不是 DataFrame,则构造列名兜底
44+
# If not a DataFrame, construct fallback column names
4645
cols = getattr(prot, "columns", None)
4746
if cols is None:
4847
cols = [f"p{i}" for i in range(prot.shape[1])]
@@ -53,11 +52,11 @@ def _normalize_name(s: str) -> str:
5352

5453
def _resolve_protein_column(adata: AnnData, preferred: Sequence[str]) -> Optional[str]:
5554
"""
56-
obsm['protein'].columns 中按同义名/大小写不敏感方式查找最佳列。
55+
Find the best column in obsm['protein'].columns using synonyms and case-insensitive matching.
5756
"""
5857
prot = _protein_df(adata)
5958
norm_cols = {_normalize_name(c): c for c in prot.columns}
60-
# 一些常用同义词
59+
# Some common synonyms
6160
synonyms = {
6261
"cd8": ["cd8", "cd8a"],
6362
"cd45": ["cd45", "ptprc", "cd45ra", "cd45rb", "cd45ro"],
@@ -68,13 +67,13 @@ def _resolve_protein_column(adata: AnnData, preferred: Sequence[str]) -> Optiona
6867
"alphasma": ["alphasma", "αsma", "alpha-sma", "acta2"],
6968
"cd31": ["cd31", "pecam1"],
7069
}
71-
# preferred 展开成同义词列表
70+
# Expand preferred list into synonyms
7271
cand_norms: list[str] = []
7372
for p in preferred:
7473
key = _normalize_name(p)
75-
# 同义词集合
74+
# Synonym set
7675
cand_norms.extend(synonyms.get(key, [key]))
77-
# 逐个匹配
76+
# Match one by one
7877
for c in cand_norms:
7978
if c in norm_cols:
8079
return norm_cols[c]
@@ -90,7 +89,7 @@ def _guess_transcripts_path(base_path: str) -> str:
9089
for p in cands:
9190
if os.path.exists(p):
9291
return p
93-
# 没找到就返回首选,交由 PM 本身报错
92+
# If not found, return the first candidate and let ProteinMicroEnv handle the error.
9493
return cands[0]
9594

9695
def _pm_init(
@@ -137,7 +136,7 @@ def _compute_gene_stats(
137136
return_long: bool = False,
138137
):
139138
"""
140-
返回通常是:index=锚点(或条形码),columns=每个 (gene@ring) 的宽表。
139+
Typically returns a wide DataFrame indexed by anchor (or barcode), with one column per (gene@ring).
141140
"""
142141
candidates = [
143142
("compute_transcript_stats", dict(genes=genes,
@@ -159,7 +158,7 @@ def _compute_gene_stats(
159158
return fn(**_subset_kwargs_by_signature(fn, **kw))
160159
except Exception as e:
161160
last = e
162-
raise RuntimeError(f"ProteinMicroEnv 无可用基因统计接口;最后错误:{last!r}")
161+
raise RuntimeError(f"No available gene statistics interface in ProteinMicroEnv; last error: {last!r}")
163162

164163
def _compute_neighbor_cells(env: ProteinMicroEnv, cell_types: pd.Series, how: str = "fraction"):
165164
return _call_first_available(env, ["compute_neighbor_cells", "neighbor_cell_stats"],
@@ -173,25 +172,25 @@ def _store_anchor_df(
173172
uns_key: str,
174173
):
175174
"""
176-
存储策略:
177-
1) 完整 DataFrame 放到 uns[uns_key](保留行索引信息);
178-
2) 同步一份到 obsm[obsm_key]:构造 (n_obs × df.shape[1]) 的矩阵,
179-
非锚点行填 NaN;行匹配优先按名字对齐,否则按传入的 anchor_indices 顺序放置。
175+
Storage strategy:
176+
1) Put the complete DataFrame into uns[uns_key] (preserve row index information);
177+
2) Also sync a copy to obsm[obsm_key]: construct an (n_obs × df.shape[1]) matrix,
178+
fill non-anchor rows with NaN; align rows by name if possible, otherwise place in order of the provided anchor_indices.
180179
"""
181180
if not isinstance(df, pd.DataFrame):
182-
# 尝试包装
181+
# Try to wrap
183182
df = pd.DataFrame(df)
184183

185-
# 1) uns
184+
# 1) in uns
186185
adata.uns[uns_key] = df.copy()
187186

188-
# 2) 铺到 obsm(与 obs 对齐)
187+
# 2) sync to obsm (aligned with obs)
189188
mat = np.full((adata.n_obs, df.shape[1]), np.nan, dtype=float)
190189

191190
if np.all(np.isin(df.index, adata.obs_names)):
192191
row_idx = adata.obs_names.get_indexer(df.index)
193192
else:
194-
# 行数不一定等于锚点数,这里取两者的 min 并按次序放置
193+
# The number of rows may not equal the number of anchors; take the smaller of the two and place in order
195194
n = min(len(anchor_indices), len(df))
196195
row_idx = np.asarray(anchor_indices[:n], dtype=int)
197196
mat[row_idx, :] = df.iloc[:n, :].to_numpy()
@@ -203,9 +202,8 @@ def _store_anchor_df(
203202
adata.obsm[obsm_key] = mat
204203
adata.uns[obsm_key + "_cols"] = list(df.columns)
205204

206-
207205
# ---------------------------
208-
# 免疫微环境
206+
# Immune microenvironment
209207
# ---------------------------
210208

211209
def run_immune_microenvironment(
@@ -220,14 +218,14 @@ def run_immune_microenvironment(
220218
) -> Dict[str, Any]:
221219
prot = _protein_df(adata)
222220

223-
# 锚点:CD8 & CD45(若有 CD3 也可;不存在的自动忽略)
221+
# Anchors: CD8 & CD45 (plus CD3 if available; any marker that is not present is automatically ignored)
224222
col_cd8 = _resolve_protein_column(adata, ["CD8", "CD8A"])
225223
col_cd45 = _resolve_protein_column(adata, ["CD45", "PTPRC"])
226224
col_cd3 = _resolve_protein_column(adata, ["CD3", "CD3E", "CD3D"])
227225

228226
markers = [c for c in [col_cd8, col_cd45, col_cd3] if c is not None]
229227
if not markers:
230-
raise ValueError("找不到用于免疫锚点的蛋白列(CD8/CD45[可选CD3])。")
228+
raise ValueError("Cannot find a protein column for immune anchors (CD8/CD45 [optional CD3]).")
231229

232230
mask = np.ones(adata.n_obs, dtype=bool)
233231
for m in markers:
@@ -237,23 +235,23 @@ def run_immune_microenvironment(
237235
if anchor_idx.size:
238236
adata.obs.iloc[anchor_idx, adata.obs.columns.get_loc(f"{out_prefix}_is_anchor")] = True
239237

240-
# 初始化 ProteinMicroEnv
238+
# Initialize ProteinMicroEnv
241239
if transcripts_path is None and base_path is not None:
242240
transcripts_path = _guess_transcripts_path(base_path)
243241
env = _pm_init(adata, transcripts_path, pixel_size_um=pixel_size_um, qv_threshold=qv_threshold)
244242

245-
# 设置锚点与邻域
243+
# Set anchors and neighborhood
246244
_set_anchors(env, anchor_idx)
247245
_set_rings(env, ring_edges_um)
248246

249-
# 基因邻域统计(宽表)
247+
# Gene neighborhood statistics (wide table)
250248
gene_df = _compute_gene_stats(env, list(genes), background="global", area_norm=True, return_long=False)
251249
if isinstance(gene_df, pd.DataFrame):
252250
_store_anchor_df(adata, gene_df, anchor_idx,
253251
obsm_key=f"{out_prefix}_gene_stats",
254252
uns_key=f"{out_prefix}_gene_stats")
255253

256-
# 邻居细胞组成(alphaSMA=CAF;CD31=Endothelial)
254+
# Neighbor cell composition (alphaSMA = CAF; CD31 = Endothelial)
257255
col_asma = _resolve_protein_column(adata, ["alphaSMA", "ACTA2"])
258256
col_cd31 = _resolve_protein_column(adata, ["CD31", "PECAM1"])
259257
cell_type = pd.Series("Other", index=adata.obs_names, dtype=object)
@@ -262,7 +260,7 @@ def run_immune_microenvironment(
262260
if col_cd31 is not None:
263261
cell_type.loc[prot[col_cd31].to_numpy(dtype=float) > 0.0] = "Endothelial"
264262

265-
# how='fraction':返回各环占比;如果你的 PM 实现支持 how='count',也可以改成 'count'
263+
# how='fraction': returns fraction of each ring; if your ProteinMicroEnv implementation supports 'count', you can also use 'count'.
266264
comp_df = _compute_neighbor_cells(env, cell_type, how="fraction")
267265
if isinstance(comp_df, pd.DataFrame):
268266
_store_anchor_df(adata, comp_df, anchor_idx,
@@ -271,9 +269,8 @@ def run_immune_microenvironment(
271269

272270
return {"env": env, "anchors": anchor_idx}
273271

274-
275272
# ---------------------------
276-
# 肿瘤-间质边界
273+
# Tumor-Stroma border
277274
# ---------------------------
278275

279276
def run_tumor_stroma_border(
@@ -288,13 +285,13 @@ def run_tumor_stroma_border(
288285
) -> Dict[str, Any]:
289286
prot = _protein_df(adata)
290287

291-
# 锚点:上皮蛋白(优先 PanCK,其次 E-Cadherin/EPCAM)
288+
# Anchor: epithelial protein (prefer PanCK, then E-Cadherin/EPCAM)
292289
col_panck = _resolve_protein_column(adata, ["PanCK"])
293290
col_ecad = _resolve_protein_column(adata, ["E-Cadherin", "ECADHERIN", "ECAD"])
294291
col_epcam = _resolve_protein_column(adata, ["EPCAM"])
295292
anchor_col = next((c for c in [col_panck, col_ecad, col_epcam] if c is not None), None)
296293
if anchor_col is None:
297-
raise ValueError("找不到用于肿瘤锚点的蛋白列(PanCK/E-Cadherin/EPCAM)。")
294+
raise ValueError("Cannot find a protein column for tumor anchor (PanCK/E-Cadherin/EPCAM).")
298295

299296
mask = prot[anchor_col].to_numpy(dtype=float) > 0.0
300297
anchor_idx = np.where(mask)[0]
@@ -309,7 +306,7 @@ def run_tumor_stroma_border(
309306
_set_anchors(env, anchor_idx)
310307
_set_rings(env, ring_edges_um)
311308

312-
# 邻居细胞组成:CAF / Endothelial
309+
# Neighbor cell composition: CAF / Endothelial
313310
col_asma = _resolve_protein_column(adata, ["alphaSMA", "ACTA2"])
314311
col_cd31 = _resolve_protein_column(adata, ["CD31", "PECAM1"])
315312
cell_type = pd.Series("Other", index=adata.obs_names, dtype=object)
@@ -324,7 +321,7 @@ def run_tumor_stroma_border(
324321
obsm_key=f"{out_prefix}_neighbor_composition",
325322
uns_key=f"{out_prefix}_neighbor_composition")
326323

327-
# ECM 基因邻域统计
324+
# ECM gene neighborhood statistics
328325
ecm_df = _compute_gene_stats(env, list(ecm_genes), background="global", area_norm=True, return_long=False)
329326
if isinstance(ecm_df, pd.DataFrame):
330327
_store_anchor_df(adata, ecm_df, anchor_idx,
@@ -333,9 +330,8 @@ def run_tumor_stroma_border(
333330

334331
return {"env": env, "anchors": anchor_idx}
335332

336-
337333
# ---------------------------
338-
# 统一入口(Notebook 友好)
334+
# Unified entry point (Notebook-friendly)
339335
# ---------------------------
340336

341337
def analyze_microenvironment(
@@ -349,10 +345,10 @@ def analyze_microenvironment(
349345
output_dir: Optional[str] = None,
350346
) -> AnnData:
351347
"""
352-
统一入口:执行 'immune' 'tumor_border' 分析。
353-
- 结果表(按锚点×特征)存到 adata.uns[...],并铺平到 adata.obsm[...](非锚点=NaN)。
354-
- 锚点布尔标记写入 adata.obs['<prefix>_is_anchor']
355-
- output_dir 指定,会把主要结果另存 CSV
348+
Unified entry point: execute 'immune' or 'tumor_border' analysis.
349+
- The result tables (anchor x feature) are stored in adata.uns[...] and flattened into adata.obsm[...] (non-anchors = NaN).
350+
- Anchor boolean labels are written to adata.obs['<prefix>_is_anchor'].
351+
- If output_dir is specified, the main results will be saved as CSV files.
356352
"""
357353
mode = mode.lower()
358354
if mode == "immune":
@@ -376,16 +372,16 @@ def analyze_microenvironment(
376372
f"{prefix}_neighbor_composition",
377373
]
378374
else:
379-
raise ValueError("mode 必须是 'immune' 'tumor_border'")
375+
raise ValueError("mode must be 'immune' or 'tumor_border'.")
380376

381-
# 可选把 uns 表格另存 CSV
377+
# Optionally save uns tables as CSV
382378
if output_dir:
383379
os.makedirs(output_dir, exist_ok=True)
384380
for k in keys_to_dump:
385381
df = adata.uns.get(k, None)
386382
if isinstance(df, pd.DataFrame):
387383
df.to_csv(os.path.join(output_dir, f"{k}.csv"))
388-
# 同时保存锚点名单
384+
# Also save anchor list
389385
anchors = res.get("anchors", np.array([], dtype=int))
390386
pd.Series(adata.obs_names[anchors]).to_csv(
391387
os.path.join(output_dir, f"{prefix}_anchors.csv"), index=False, header=False

0 commit comments

Comments
 (0)