Translate Chinese comments, docstrings, and error messages in microenv_analysis.py to English

hutaobo · hutaobo · commit f5c9e4cc4377 · 2025-10-28T10:45:04.000+01:00
diff --git a/src/pyXenium/analysis/microenv_analysis.py b/src/pyXenium/analysis/microenv_analysis.py
@@ -7,18 +7,17 @@
 import pandas as pd
 from anndata import AnnData
 
-# 依赖内部 ProteinMicroEnv
+# Dependency: internal ProteinMicroEnv
 try:
     from pyXenium.analysis.protein_microenvironment import ProteinMicroEnv
 except Exception as e:
     raise ImportError(
-        "未能导入 pyXenium.analysis.protein_microenvironment.ProteinMicroEnv，"
-        "请确认该类已包含在包内并可被导入。"
+        "Failed to import pyXenium.analysis.protein_microenvironment.ProteinMicroEnv. "
+        "Please ensure that this class is included in the package and can be imported."
     ) from e
 
-
 # ---------------------------
-# 工具：方法自适应与存储
+# Utilities: signature-adaptive method dispatch and result storage
 # ---------------------------
 
 def _subset_kwargs_by_signature(func, **kwargs):
@@ -34,15 +33,15 @@ def _call_first_available(obj, names: Sequence[str], **kwargs):
                 return fn(**_subset_kwargs_by_signature(fn, **kwargs))
             except Exception as e:
                 last = e
-    raise RuntimeError(f"尝试的方法均不可用：{names}\n最后错误：{last!r}")
+    raise RuntimeError(f"All attempted methods are unavailable: {names}\nLast error: {last!r}")
 
 def _protein_df(adata: AnnData) -> pd.DataFrame:
     prot = adata.obsm.get("protein", None)
     if prot is None:
-        raise ValueError("当前 AnnData 不包含 obsm['protein']。")
+        raise ValueError("The current AnnData does not contain obsm['protein'].")
     if isinstance(prot, pd.DataFrame):
         return prot
-    # 若不是 DataFrame，则构造列名兜底
+    # If not a DataFrame, construct fallback column names
     cols = getattr(prot, "columns", None)
     if cols is None:
         cols = [f"p{i}" for i in range(prot.shape[1])]
@@ -53,11 +52,11 @@ def _normalize_name(s: str) -> str:
 
 def _resolve_protein_column(adata: AnnData, preferred: Sequence[str]) -> Optional[str]:
     """
-    在 obsm['protein'].columns 中按同义名/大小写不敏感方式查找最佳列。
+    Find the best column in obsm['protein'].columns using synonyms and case-insensitive matching.
     """
     prot = _protein_df(adata)
     norm_cols = {_normalize_name(c): c for c in prot.columns}
-    # 一些常用同义词
+    # Some common synonyms
     synonyms = {
         "cd8": ["cd8", "cd8a"],
         "cd45": ["cd45", "ptprc", "cd45ra", "cd45rb", "cd45ro"],
@@ -68,13 +67,13 @@ def _resolve_protein_column(adata: AnnData, preferred: Sequence[str]) -> Optiona
         "alphasma": ["alphasma", "αsma", "alpha-sma", "acta2"],
         "cd31": ["cd31", "pecam1"],
     }
-    # 把 preferred 展开成同义词列表
+    # Expand preferred list into synonyms
     cand_norms: list[str] = []
     for p in preferred:
         key = _normalize_name(p)
-        # 同义词集合
+        # Synonym set
         cand_norms.extend(synonyms.get(key, [key]))
-    # 逐个匹配
+    # Match one by one
     for c in cand_norms:
         if c in norm_cols:
             return norm_cols[c]
@@ -90,7 +89,7 @@ def _guess_transcripts_path(base_path: str) -> str:
     for p in cands:
         if os.path.exists(p):
             return p
-    # 没找到就返回首选，交由 PM 本身报错
+    # If not found, return the first candidate and let ProteinMicroEnv handle the error.
     return cands[0]
 
 def _pm_init(
@@ -137,7 +136,7 @@ def _compute_gene_stats(
     return_long: bool = False,
 ):
     """
-    返回通常是：index=锚点（或条形码），columns=每个 (gene@ring) 的宽表。
+    Typically returns a wide DataFrame indexed by anchor (or barcode), with one column per (gene@ring) pair.
     """
     candidates = [
         ("compute_transcript_stats", dict(genes=genes,
@@ -159,7 +158,7 @@ def _compute_gene_stats(
                 return fn(**_subset_kwargs_by_signature(fn, **kw))
             except Exception as e:
                 last = e
-    raise RuntimeError(f"ProteinMicroEnv 无可用基因统计接口；最后错误：{last!r}")
+    raise RuntimeError(f"No available gene statistics interface in ProteinMicroEnv; last error: {last!r}")
 
 def _compute_neighbor_cells(env: ProteinMicroEnv, cell_types: pd.Series, how: str = "fraction"):
     return _call_first_available(env, ["compute_neighbor_cells", "neighbor_cell_stats"],
@@ -173,25 +172,25 @@ def _store_anchor_df(
     uns_key: str,
 ):
     """
-    存储策略：
-      1) 完整 DataFrame 放到 uns[uns_key]（保留行索引信息）；
-      2) 同步一份到 obsm[obsm_key]：构造 (n_obs × df.shape[1]) 的矩阵，
-         非锚点行填 NaN；行匹配优先按名字对齐，否则按传入的 anchor_indices 顺序放置。
+    Storage strategy:
+      1) Put the complete DataFrame into uns[uns_key] (preserve row index information);
+      2) Also sync a copy to obsm[obsm_key]: construct an (n_obs × df.shape[1]) matrix,
+         fill non-anchor rows with NaN; align rows by name if possible, otherwise place in order of the provided anchor_indices.
     """
     if not isinstance(df, pd.DataFrame):
-        # 尝试包装
+        # Try to wrap
         df = pd.DataFrame(df)
 
-    # 1) 放 uns
+    # 1) Store the full table in uns
     adata.uns[uns_key] = df.copy()
 
-    # 2) 铺到 obsm（与 obs 对齐）
+    # 2) sync to obsm (aligned with obs)
     mat = np.full((adata.n_obs, df.shape[1]), np.nan, dtype=float)
 
     if np.all(np.isin(df.index, adata.obs_names)):
         row_idx = adata.obs_names.get_indexer(df.index)
     else:
-        # 行数不一定等于锚点数，这里取两者的 min 并按次序放置
+        # The number of rows may not equal the number of anchors; take the smaller of the two and place in order
         n = min(len(anchor_indices), len(df))
         row_idx = np.asarray(anchor_indices[:n], dtype=int)
         mat[row_idx, :] = df.iloc[:n, :].to_numpy()
@@ -203,9 +202,8 @@ def _store_anchor_df(
     adata.obsm[obsm_key] = mat
     adata.uns[obsm_key + "_cols"] = list(df.columns)
 
-
 # ---------------------------
-# 免疫微环境
+# Immune microenvironment
 # ---------------------------
 
 def run_immune_microenvironment(
@@ -220,14 +218,14 @@ def run_immune_microenvironment(
 ) -> Dict[str, Any]:
     prot = _protein_df(adata)
 
-    # 锚点：CD8 & CD45（若有 CD3 也可；不存在的自动忽略）
+    # Anchors: CD8 & CD45 (plus CD3 if available); markers with no matching protein column are skipped automatically
     col_cd8  = _resolve_protein_column(adata, ["CD8", "CD8A"])
     col_cd45 = _resolve_protein_column(adata, ["CD45", "PTPRC"])
     col_cd3  = _resolve_protein_column(adata, ["CD3", "CD3E", "CD3D"])
 
     markers = [c for c in [col_cd8, col_cd45, col_cd3] if c is not None]
     if not markers:
-        raise ValueError("找不到用于免疫锚点的蛋白列（CD8/CD45[可选CD3]）。")
+        raise ValueError("Cannot find a protein column for immune anchors (CD8/CD45 [optional CD3]).")
 
     mask = np.ones(adata.n_obs, dtype=bool)
     for m in markers:
@@ -237,23 +235,23 @@ def run_immune_microenvironment(
     if anchor_idx.size:
         adata.obs.iloc[anchor_idx, adata.obs.columns.get_loc(f"{out_prefix}_is_anchor")] = True
 
-    # 初始化 ProteinMicroEnv
+    # Initialize ProteinMicroEnv
     if transcripts_path is None and base_path is not None:
         transcripts_path = _guess_transcripts_path(base_path)
     env = _pm_init(adata, transcripts_path, pixel_size_um=pixel_size_um, qv_threshold=qv_threshold)
 
-    # 设置锚点与邻域
+    # Set anchors and neighborhood
     _set_anchors(env, anchor_idx)
     _set_rings(env, ring_edges_um)
 
-    # 基因邻域统计（宽表）
+    # Gene neighborhood statistics (wide table)
     gene_df = _compute_gene_stats(env, list(genes), background="global", area_norm=True, return_long=False)
     if isinstance(gene_df, pd.DataFrame):
         _store_anchor_df(adata, gene_df, anchor_idx,
                          obsm_key=f"{out_prefix}_gene_stats",
                          uns_key=f"{out_prefix}_gene_stats")
 
-    # 邻居细胞组成（alphaSMA=CAF；CD31=Endothelial）
+    # Neighbor cell composition (alphaSMA = CAF; CD31 = Endothelial)
     col_asma = _resolve_protein_column(adata, ["alphaSMA", "ACTA2"])
     col_cd31 = _resolve_protein_column(adata, ["CD31", "PECAM1"])
     cell_type = pd.Series("Other", index=adata.obs_names, dtype=object)
@@ -262,7 +260,7 @@ def run_immune_microenvironment(
     if col_cd31 is not None:
         cell_type.loc[prot[col_cd31].to_numpy(dtype=float) > 0.0] = "Endothelial"
 
-    # how='fraction'：返回各环占比；如果你的 PM 实现支持 how='count'，也可以改成 'count'
+    # how='fraction': returns per-ring fractions; if your ProteinMicroEnv implementation supports how='count', you can switch to 'count'.
     comp_df = _compute_neighbor_cells(env, cell_type, how="fraction")
     if isinstance(comp_df, pd.DataFrame):
         _store_anchor_df(adata, comp_df, anchor_idx,
@@ -271,9 +269,8 @@ def run_immune_microenvironment(
 
     return {"env": env, "anchors": anchor_idx}
 
-
 # ---------------------------
-# 肿瘤-间质边界
+# Tumor-Stroma border
 # ---------------------------
 
 def run_tumor_stroma_border(
@@ -288,13 +285,13 @@ def run_tumor_stroma_border(
 ) -> Dict[str, Any]:
     prot = _protein_df(adata)
 
-    # 锚点：上皮蛋白（优先 PanCK，其次 E-Cadherin/EPCAM）
+    # Anchor: epithelial protein (prefer PanCK, then E-Cadherin/EPCAM)
     col_panck = _resolve_protein_column(adata, ["PanCK"])
     col_ecad  = _resolve_protein_column(adata, ["E-Cadherin", "ECADHERIN", "ECAD"])
     col_epcam = _resolve_protein_column(adata, ["EPCAM"])
     anchor_col = next((c for c in [col_panck, col_ecad, col_epcam] if c is not None), None)
     if anchor_col is None:
-        raise ValueError("找不到用于肿瘤锚点的蛋白列（PanCK/E-Cadherin/EPCAM）。")
+        raise ValueError("Cannot find a protein column for tumor anchor (PanCK/E-Cadherin/EPCAM).")
 
     mask = prot[anchor_col].to_numpy(dtype=float) > 0.0
     anchor_idx = np.where(mask)[0]
@@ -309,7 +306,7 @@ def run_tumor_stroma_border(
     _set_anchors(env, anchor_idx)
     _set_rings(env, ring_edges_um)
 
-    # 邻居细胞组成：CAF / Endothelial
+    # Neighbor cell composition: CAF / Endothelial
     col_asma = _resolve_protein_column(adata, ["alphaSMA", "ACTA2"])
     col_cd31 = _resolve_protein_column(adata, ["CD31", "PECAM1"])
     cell_type = pd.Series("Other", index=adata.obs_names, dtype=object)
@@ -324,7 +321,7 @@ def run_tumor_stroma_border(
                          obsm_key=f"{out_prefix}_neighbor_composition",
                          uns_key=f"{out_prefix}_neighbor_composition")
 
-    # ECM 基因邻域统计
+    # ECM gene neighborhood statistics
     ecm_df = _compute_gene_stats(env, list(ecm_genes), background="global", area_norm=True, return_long=False)
     if isinstance(ecm_df, pd.DataFrame):
         _store_anchor_df(adata, ecm_df, anchor_idx,
@@ -333,9 +330,8 @@ def run_tumor_stroma_border(
 
     return {"env": env, "anchors": anchor_idx}
 
-
 # ---------------------------
-# 统一入口（Notebook 友好）
+# Unified entry point (Notebook-friendly)
 # ---------------------------
 
 def analyze_microenvironment(
@@ -349,10 +345,10 @@ def analyze_microenvironment(
     output_dir: Optional[str] = None,
 ) -> AnnData:
     """
-    统一入口：执行 'immune' 或 'tumor_border' 分析。
-    - 结果表（按锚点×特征）存到 adata.uns[...]，并铺平到 adata.obsm[...]（非锚点=NaN）。
-    - 锚点布尔标记写入 adata.obs['<prefix>_is_anchor']。
-    - 若 output_dir 指定，会把主要结果另存 CSV。
+    Unified entry point: execute 'immune' or 'tumor_border' analysis.
+    - The result tables (anchor x feature) are stored in adata.uns[...] and flattened into adata.obsm[...] (non-anchors = NaN).
+    - Anchor boolean labels are written to adata.obs['<prefix>_is_anchor'].
+    - If output_dir is specified, the main results will be saved as CSV files.
     """
     mode = mode.lower()
     if mode == "immune":
@@ -376,16 +372,16 @@ def analyze_microenvironment(
             f"{prefix}_neighbor_composition",
         ]
     else:
-        raise ValueError("mode 必须是 'immune' 或 'tumor_border'。")
+        raise ValueError("mode must be 'immune' or 'tumor_border'.")
 
-    # 可选把 uns 表格另存 CSV
+    # Optionally save uns tables as CSV
     if output_dir:
         os.makedirs(output_dir, exist_ok=True)
         for k in keys_to_dump:
             df = adata.uns.get(k, None)
             if isinstance(df, pd.DataFrame):
                 df.to_csv(os.path.join(output_dir, f"{k}.csv"))
-        # 同时保存锚点名单
+        # Also save anchor list
         anchors = res.get("anchors", np.array([], dtype=int))
         pd.Series(adata.obs_names[anchors]).to_csv(
             os.path.join(output_dir, f"{prefix}_anchors.csv"), index=False, header=False