@@ -348,118 +348,7 @@ def filter_signatures_by_Ucell(selected_edges, adata) -> pd.DataFrame:
348348 data_ucell = adata .obs .filter (like = '_UCell' )
349349 return data_ucell
350350
def select_top_edges(gene_inter_adata, adata, top_per_source=10, col_cluster='leiden_remap', min_reg_size=10, verbose=True, return_copy=False, min_edge_support=0.5):
    """
    Select, per cluster, the top targets of every source and summarise the
    edges that are unique to each cluster.

    Genes are ranked per cluster with ``sc.tl.rank_genes_groups`` (t-test).
    For each cluster, every gene gets a ``diff`` value: the absolute
    difference between its rank in that cluster and its mean rank in the
    other clusters. For each source, the targets whose ``diff`` is closest
    to the source's own ``diff`` are kept.

    Parameters
    ----------
    gene_inter_adata : AnnData
        Gene interaction AnnData. ``var`` must contain the columns 'source',
        'target' and one ``'<cluster>_nonzero'`` column per cluster;
        ``obs`` must contain `col_cluster`.
    adata : AnnData
        Expression AnnData used for ranking genes. It is modified in place
        by ``sc.tl.rank_genes_groups``.
    top_per_source : int, default=10
        Maximum number of targets to select per source and cluster.
    col_cluster : str, default='leiden_remap'
        Column in ``obs`` (of both AnnData objects) defining clusters.
    min_reg_size : int, default=10
        A source's targets are kept only if at least this many were
        selected. Because at most `top_per_source` targets are selected,
        a value larger than `top_per_source` keeps nothing.
    verbose : bool, default=True
        Print progress messages.
    return_copy : bool, default=False
        Currently unused; kept for interface compatibility.
    min_edge_support : float, default=0.5
        Minimum value of ``'<cluster>_nonzero'`` required for an edge to be
        considered in that cluster.

    Returns
    -------
    dict
        The result of ``process_cell_edges`` applied to a mapping
        ``cluster -> list of "source_target"`` edge names.

    Raises
    ------
    ValueError
        If the two AnnData objects share no cluster label in `col_cluster`.
    """
    if verbose:
        print(f"Initial shape: {gene_inter_adata.shape}")

    # Rank genes per cluster (writes the ranking into `adata`).
    sc.tl.rank_genes_groups(adata, groupby=col_cluster, method='t-test')

    # Only clusters present in both objects can be processed. Sorting by the
    # string form gives a reproducible iteration order.
    clusters = sorted(
        set(np.unique(gene_inter_adata.obs[col_cluster])).intersection(adata.obs[col_cluster]),
        key=str,
    )
    if not clusters:
        raise ValueError(
            f"No clusters in common between gene_inter_adata and adata for column '{col_cluster}'"
        )

    # One rank column per cluster, merged on gene name.
    rank_dfs = []
    for c in clusters:
        if verbose:
            print(f"Ranking cluster: {c}")
        rank_col = f"rank_{c}"
        ranked = sc.get.rank_genes_groups_df(adata, group=str(c))
        ranked = ranked.sort_values('scores', ascending=False)
        ranked[rank_col] = np.arange(1, len(ranked) + 1)
        rank_dfs.append(ranked[['names', rank_col]])
    df_rank = reduce(lambda l, r: pd.merge(l, r, on='names', how='inner'), rank_dfs)

    all_rank_cols = [col for col in df_rank.columns if col != 'names']
    edge_table = gene_inter_adata.var
    sources = edge_table['source'].unique()

    keep_edges_dict = {}
    for c in clusters:
        if verbose:
            print(f"Selecting targets for cluster: {c}")

        # Rank of each gene in this cluster versus its mean rank elsewhere.
        own_col = f"rank_{c}"
        other_cols = list(all_rank_cols)
        other_cols.remove(own_col)
        df_rank_c = df_rank.copy()
        df_rank_c['avg'] = df_rank_c[other_cols].mean(axis=1)
        df_rank_c['diff'] = (df_rank_c[own_col] - df_rank_c['avg']).abs()
        df_rank_c = df_rank_c.sort_values('diff', ascending=False)

        # Gene -> diff lookup; on duplicated names the first (largest diff)
        # row wins.
        diff_by_gene = (
            df_rank_c.drop_duplicates('names').set_index('names')['diff'].to_dict()
        )
        target_diffs = df_rank_c[['names', 'diff']]

        # Edges with enough support in this cluster.
        supported = edge_table[edge_table[f'{c}_nonzero'] >= min_edge_support]

        cluster_edges = []
        for source in sources:
            # Sources missing from the ranking cannot be positioned; skip.
            if source not in diff_by_gene:
                continue
            tf_rank = diff_by_gene[source]

            df_targets = supported[supported['source'] == source].merge(
                target_diffs, left_on='target', right_on='names', how='left'
            )
            # Targets without a ranking get NaN and sort to the end.
            df_targets['rank_distance'] = (df_targets['diff'] - tf_rank).abs()
            df_targets = df_targets.sort_values('rank_distance').head(top_per_source)

            if len(df_targets) >= min_reg_size:
                cluster_edges.extend(f"{source}_{t}" for t in df_targets['target'])

        keep_edges_dict[c] = cluster_edges

    # The original passed an undefined name here and always raised NameError.
    return process_cell_edges(keep_edges_dict)
430-
431-
432-
def process_cell_edges(keep_edges):
    """
    Find the edges that occur in exactly one group and summarise them.

    Parameters
    ----------
    keep_edges : dict
        Mapping ``group -> iterable of edge names``. Each edge name is a
        string of the form ``"<source>_<target>"``; it is split on the first
        underscore, so the target may itself contain underscores.

    Returns
    -------
    dict
        ``{'unique': {...}, 'pairwise': {}}``. ``'unique'`` maps every group
        to a dict with

        * ``'edges'``: DataFrame with columns 'source' and 'target' holding
          the edges found only in that group, sorted by edge name;
        * ``'summary'``: DataFrame with columns 'source' and 'count' giving
          the number of unique edges per source, sorted by descending count
          (ties broken by source name).

        ``'pairwise'`` is not populated by this function.
    """
    results = {'unique': {}, 'pairwise': {}}

    # De-duplicate every group's edges once.
    edge_sets = {cell: set(edges) for cell, edges in keep_edges.items()}

    # Number of groups each edge appears in. An edge is unique to a group
    # exactly when it appears in that group and in no other.
    group_counts = Counter(edge for edges in edge_sets.values() for edge in edges)

    def get_source_summary(edge_list):
        # Count edges per source (the text before the first underscore).
        source_counts = Counter(e.split('_', 1)[0] for e in edge_list)
        summary = pd.DataFrame({
            'source': list(source_counts.keys()),
            'count': list(source_counts.values()),
        })
        return summary.sort_values(['count', 'source'], ascending=[False, True])

    for cell, edges in edge_sets.items():
        # Sorted so that row order does not depend on set iteration order.
        unique = sorted(edge for edge in edges if group_counts[edge] == 1)

        df = pd.DataFrame(
            [e.split('_', 1) for e in unique],
            columns=['source', 'target']
        )

        results['unique'][cell] = {
            'edges': df,
            'summary': get_source_summary(unique)
        }

    return results
463352
464353
465354def compute_signatures_UCell_scores (selected_edges , adata , key = 'unique' ) -> pd .DataFrame :
0 commit comments