bionetslab
diff --git a/‎pixi.toml‎
Lines changed: 24 additions & 1 deletion b/‎pixi.toml‎
Lines changed: 24 additions & 1 deletion
diff --git a/‎src/netmap/downstream/final_downstream.py‎
Lines changed: 166 additions & 0 deletions b/‎src/netmap/downstream/final_downstream.py‎
Lines changed: 166 additions & 0 deletions
diff --git a/‎src/netmap/downstream/plotting.py‎
Lines changed: 2 additions & 4 deletions b/‎src/netmap/downstream/plotting.py‎
Lines changed: 2 additions & 4 deletions
diff --git a/‎src/netmap/masking/internal.py‎
Lines changed: 2 additions & 2 deletions b/‎src/netmap/masking/internal.py‎
Lines changed: 2 additions & 2 deletions
@@ -13,6 +13,8 @@ install-kernel = "python -m ipykernel install --user --name pixi_netmap --displa
 uninstall-kernel = "jupyter kernelspec uninstall pixi_netmap"
 
 [dependencies]
+anndata = ">=0.12.8,<0.13"
+scanpy = ">=1.12,<2"
 pytorch-gpu = "*"
 cuda-version = "12.8.*"
 cupy = ">=13.5.1,<14"
@@ -21,7 +23,28 @@ scikit-image = ">=0.25.2,<0.26"
 sphinx = "*"
 sphinx-autobuild = "*"
 furo = "*"
-
+numba = ">=0.63.1,<0.64"
+libopenblas = ">=0.3.31,<0.4"
+jupyter = ">=1.1.1,<2"
+gtfparse = ">=2.5.0,<3"
+ipykernel = ">=7.1.0,<8"
+scikit-learn =  "*"
+pandas= "*"
+scipy= "*"
+numpy= "*"
+seaborn= "*"
+statsmodels= "*"
+captum= "*"
+python-igraph= "*"
+leidenalg= "*"
+matplotlib= "*"
+pyvis= "*"
+pyyaml= "*"
+plotnine = ">=0.15.3,<0.16"
+patchworklib = ">=0.6.3,<0.7"
+pyarrow = ">=20.0.0,<21"
+celltypist = ">=1.7.1,<2"
+scvi = ">=0.6.8,<0.7"
 
 [pypi-dependencies]
 netmap = { git = "https://github.com/bionetslab/netmap" }
@@ -28,6 +28,172 @@
 from collections import Counter
 
 
+def select_top_edges(gene_inter_adata, adata, top_per_source=10, col_cluster='leiden_remap', min_reg_size=10, verbose=True, return_copy = False, tf_column=None, min_edge_support=0.5):
+    """
+    Select, per cluster, the strongest target edges of every source gene.
+
+    For each cluster the function computes the mean of ``layers['mask']``
+    over the cluster's cells for every edge ("edge support"), keeps the
+    edges whose support is at least ``min_edge_support``, ranks the
+    remaining edges of each source by the sum of ``X`` over the cluster's
+    cells and keeps the ``top_per_source`` best ones. A source contributes
+    edges only if at least ``min_reg_size`` of its edges are left after
+    that. The per-cluster edge lists are then handed to
+    ``process_cell_edges``.
+
+    Parameters
+    ----------
+    gene_inter_adata : AnnData
+        Gene interaction AnnData. ``var`` must contain the columns 'source'
+        and 'target', ``layers`` must contain 'mask', and ``obs`` must
+        contain ``col_cluster``.
+    adata : AnnData
+        Currently unused; kept for interface compatibility.
+    top_per_source : int, default=10
+        Maximum number of targets kept per source and cluster.
+    col_cluster : str, default='leiden_remap'
+        Column in ``gene_inter_adata.obs`` defining the clusters.
+    min_reg_size : int, default=10
+        Minimum number of selected targets a source needs in a cluster to
+        be retained for that cluster.
+    verbose : bool, default=True
+        Print the cluster currently being processed.
+    return_copy : bool, default=False
+        Currently unused; kept for interface compatibility.
+    tf_column : str or None, default=None
+        Name of a boolean column in ``gene_inter_adata.var``. If given,
+        only sources of edges flagged in this column are considered.
+    min_edge_support : float, default=0.5
+        Minimum mean of the 'mask' layer over a cluster's cells that an
+        edge needs in order to be considered for that cluster.
+
+    Returns
+    -------
+    dict
+        The result of ``process_cell_edges`` applied to a dictionary that
+        maps every cluster label to its list of selected edge names, each
+        formatted as ``f"{source}_{target}"``.
+
+    Notes
+    -----
+    ``gene_inter_adata.var['edge_support_c']`` is overwritten for every
+    cluster, so after the call it holds the support of the last cluster.
+    """
+
+    cluster_labels = gene_inter_adata.obs[col_cluster]
+    clusters = list(np.unique(cluster_labels))
+
+    # The admissible sources do not depend on the cluster, so they are
+    # resolved once instead of once per cluster.
+    if tf_column is not None:
+        var = gene_inter_adata.var
+        allowed_sources = set(var[var[tf_column]]['source'].unique())
+    else:
+        allowed_sources = None
+
+    keep_edges_dict = {}
+
+    for c in clusters:
+        if verbose:
+            print(f"Selecting targets for cluster: {c}")
+
+        cells_c = np.asarray(cluster_labels == c)
+        # np.asarray(...).ravel() flattens the 1 x n_edges np.matrix that
+        # sparse layers return; it is a no-op for dense arrays.
+        gene_inter_adata.var['edge_support_c'] = np.asarray(
+            gene_inter_adata.layers['mask'][cells_c, :].mean(axis=0)
+        ).ravel()
+
+        var = gene_inter_adata.var
+        supported = var['edge_support_c'] >= min_edge_support
+        if allowed_sources is not None:
+            supported &= var['source'].isin(allowed_sources)
+
+        # Explicit copy: a new column is added below and must not touch var.
+        candidates = var.loc[supported].copy()
+        # One slice per cluster instead of one slice per (cluster, source).
+        candidates['sum_of_edge'] = np.asarray(
+            gene_inter_adata[cells_c, candidates.index].X.sum(axis=0)
+        ).ravel()
+
+        cluster_edges = []
+        for source, df_targets in candidates.groupby('source', sort=False, observed=True):
+            df_targets = df_targets.sort_values('sum_of_edge', ascending=False).head(top_per_source)
+
+            if len(df_targets) >= min_reg_size:
+                cluster_edges.extend(f"{source}_{t}" for t in df_targets['target'])
+
+        # Registered outside the source loop so that a cluster without any
+        # admissible source still gets an (empty) entry.
+        keep_edges_dict[c] = cluster_edges
+
+    keep_edges_dict = process_cell_edges(keep_edges_dict)
+    return keep_edges_dict
+
+
+def process_cell_edges(keep_edges):
+    """
+    Build group-specific and complete edge tables from per-group edge lists.
+
+    Parameters
+    ----------
+    keep_edges : dict
+        Maps a group label (e.g. a cluster) to an iterable of edge names of
+        the form ``"<source>_<target>"``. Duplicates within one group are
+        ignored. Names are split at the first underscore, so a source name
+        that itself contains an underscore is split incorrectly.
+
+    Returns
+    -------
+    dict
+        ``{'unique': {...}, 'all': {...}}``. Both inner dictionaries map
+        each group label to ``{'edges': DataFrame, 'summary': DataFrame}``:
+        'edges' has the columns 'source' and 'target', 'summary' has the
+        columns 'source' and 'count' (number of edges per source), sorted
+        by descending count. 'unique' only contains the edges that occur
+        in no other group, 'all' contains every edge of the group.
+    """
+    edge_sets = {cell: set(edges) for cell, edges in keep_edges.items()}
+
+    # Number of groups every edge occurs in. An edge is specific to a group
+    # exactly when this is 1, which avoids rebuilding the union of all
+    # other groups for every single group.
+    n_groups_with_edge = Counter(
+        edge for edges in edge_sets.values() for edge in edges
+    )
+
+    def get_edge_table(edges):
+        # "<source>_<target>" strings -> two-column table.
+        return pd.DataFrame(
+            [e.split('_', 1) for e in edges],
+            columns=['source', 'target']
+        )
+
+    def get_source_summary(edges):
+        # Count edges per source, using the same split as the edge table.
+        source_counts = Counter(e.split('_', 1)[0] for e in edges)
+        summary = pd.DataFrame({
+            'source': list(source_counts.keys()),
+            'count': list(source_counts.values()),
+        })
+        # A stable sort keeps ties in a reproducible order.
+        return summary.sort_values('count', ascending=False, kind='stable')
+
+    results = {'unique': {}, 'all': {}}
+    for cell, edges in edge_sets.items():
+        # Sorting removes the dependence on set iteration order, so the
+        # tables are identical between runs.
+        all_edges = sorted(edges)
+        unique_edges = [e for e in all_edges if n_groups_with_edge[e] == 1]
+
+        results['unique'][cell] = {
+            'edges': get_edge_table(unique_edges),
+            'summary': get_source_summary(unique_edges)
+        }
+
+        results['all'][cell] = {
+            'edges': get_edge_table(all_edges),
+            'summary': get_source_summary(all_edges)
+        }
+
+    return results
+
+
+def compute_signatures_UCell_scores(selected_edges, adata, key='unique') -> pd.DataFrame:
+    """
+    Turn the selected edges into gene signatures and score them with UCell.
+
+    For every group in ``selected_edges[key]`` the targets of each source
+    form one signature named ``f"{group}_{source}"``. All signatures are
+    scored in a single ``ucell.compute_ucell_scores`` call on ``adata``.
+
+    Parameters
+    ----------
+    selected_edges : dict
+        Nested dictionary in which ``selected_edges[key][group]['edges']``
+        is a DataFrame with the columns 'source' and 'target'.
+    adata : AnnData
+        Expression AnnData handed to ``ucell.compute_ucell_scores``. The
+        columns of ``adata.obs`` are read after scoring, so the scoring
+        call is expected to store its results there.
+    key : str, default='unique'
+        Entry of ``selected_edges`` to build the signatures from.
+
+    Returns
+    -------
+    pd.DataFrame
+        The columns of ``adata.obs`` whose name contains '_UCell', with a
+        trailing '_UCell' removed from the column names.
+    """
+
+    all_signatures = {}
+    for ct, entry in selected_edges[key].items():
+        targets_by_source = entry['edges'].groupby('source')['target'].apply(list).to_dict()
+        # Prefix with the group label so equal sources of different groups
+        # do not overwrite each other; update() avoids rebuilding the dict.
+        all_signatures.update(
+            {f"{ct}_{source}": targets for source, targets in targets_by_source.items()}
+        )
+
+    ucell.compute_ucell_scores(adata, signatures=all_signatures, n_jobs=1)
+
+    # NOTE(review): this also picks up '_UCell' columns that were already in
+    # adata.obs before this call - confirm that is intended.
+    data_ucell = adata.obs.filter(like='_UCell')
+    # Strip only the trailing marker so a '_UCell' inside a name survives.
+    data_ucell.columns = [x.removesuffix('_UCell') for x in data_ucell.columns]
+
+    return data_ucell
+
+
+def aggregate_edges(selected_edges, grn_adata, key='unique') -> pd.DataFrame:
+    """
+    Sum the edge values of every selected regulon for each observation.
+
+    For every group in ``selected_edges[key]`` and every source within it,
+    the edges ``f"{source}_{target}"`` of that source are looked up as
+    variables of ``grn_adata`` and their ``X`` values are summed per row.
+
+    Parameters
+    ----------
+    selected_edges : dict
+        Nested dictionary in which ``selected_edges[key][group]['edges']``
+        is a DataFrame with the columns 'source' and 'target'.
+    grn_adata : AnnData
+        AnnData whose variable names are edge names of the form
+        ``"<source>_<target>"``.
+    key : str, default='unique'
+        Entry of ``selected_edges`` to aggregate.
+
+    Returns
+    -------
+    pd.DataFrame
+        One column per regulon, named ``f"{group}_{source}"``, with one row
+        per observation of ``grn_adata`` in the order of ``grn_adata`` (the
+        index is a default integer index, not the observation names).
+    """
+
+    regulons = {}
+    for ct, entry in selected_edges[key].items():
+        edges = entry['edges']
+        edge_names = edges['source'] + "_" + edges['target']
+        # Group the prepared names by source instead of reading the grouping
+        # column inside DataFrameGroupBy.apply, which pandas deprecates.
+        names_by_source = edge_names.groupby(edges['source']).apply(list)
+        for g, names in names_by_source.items():
+            # Flatten the n_obs x 1 np.matrix that sparse X returns; this
+            # is a no-op for dense arrays.
+            regulons[f'{ct}_{g}'] = np.asarray(grn_adata[:, names].X.sum(axis=1)).ravel()
+
+    return pd.DataFrame(regulons)
+
+
+
+
 def filter_clusters_by_cell_count(grn_adata: ad.AnnData, metric_tag: float, top_fraction: float) -> Tuple[Optional[Dict[str, float]], ad.AnnData]:
     """
     Filter features (genes/edges) based on cell count differences between two clusters,
 
@@ -38,20 +38,18 @@ def rank_regulon_groups_dotplot(grn_adata_filtered, adata_regl, original_cluster
         regulon = adata_regl.var.regulon_name[ri]
         sou = adata_regl.var.source[ri]
         count = count+1
-        fractions[f'{regulon}_UCell'] =   grn_adata_filtered.var[grn_adata_filtered.var.source == sou][colheaders].sum()
+        fractions[f'{regulon}'] =   grn_adata_filtered.var[grn_adata_filtered.var.source == sou][colheaders].sum()
 
     fractions = pd.DataFrame(fractions)
     fractions.index = [x.replace('_nonzero', '') for x in fractions.index]
 
-
     # return fig needs to be true: get plot, modify sizes, then plot or return
     pp = sc.pl.rank_genes_groups_dotplot(adata_regl, n_genes=n_genes, key=key, groupby=new_cluster_column, cmap=cmap, figsize=figsize, values_to_plot=values_to_plot, return_fig = True)
     fractions = fractions.reindex(list(pp.dot_size_df.index))
 
     pp.dot_size_df = fractions.loc[:, pp.dot_color_df.columns]
-    print(fractions)
     pp.dot_size_df = pp.dot_size_df/(pp.dot_size_df.max())
-    
+    pp.dot_size_df = pp.dot_size_df.fillna(0)
     if return_fig:
         return pp
     else:
 
@@ -153,13 +153,13 @@ def add_neighbourhood_expression_mask(adata, grn_adata, strict=False, layer = 'X
     return grn_adata
 
 
-def add_cluster_based_candidate_edges(grn_adata, cluster_column = 'leiden_remap'):
+def add_cluster_based_candidate_edges(grn_adata, cluster_column = 'leiden_remap', threshold = 0.5):
     vc = grn_adata.obs[cluster_column].value_counts()
     grn_adata.var[f'count_nonzero_norm'] = grn_adata.var[f'count_nonzero']/grn_adata.obs.shape[0]
     for ps in list(vc.index):
         grn_adata.var[f'{ps}_nonzero'] = (grn_adata[grn_adata.obs[cluster_column] == ps].layers['mask'].sum(axis = 0))/vc[ps]
         #grn_adata.var[f'{ps}_candidate_edge'] = np.abs(grn_adata.var[f'{ps}_nonzero']-grn_adata.var[f'count_nonzero_norm'])>0.1
-        grn_adata.var[f'{ps}_candidate_edge'] = grn_adata.var[f'{ps}_nonzero']>0.5
+        grn_adata.var[f'{ps}_candidate_edge'] = grn_adata.var[f'{ps}_nonzero']>threshold
 
     value_cols = [f'{ps}_candidate_edge' for ps in vc.index]
     grn_adata.var['candidate_edge'] = grn_adata.var[value_cols].sum(axis = 1)