
Commit dfd3bcf

add example folder
1 parent 53bc280 commit dfd3bcf

6 files changed

Lines changed: 5134 additions & 112 deletions


examples/1-preprocess-data.ipynb

Lines changed: 703 additions & 0 deletions
Large diffs are not rendered by default.
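The preprocessing notebook's diff is not rendered here, but the training script below expects adata.X to be log-normalized and not scaled. A minimal scanpy sketch consistent with those expectations (the raw input path and QC thresholds are placeholders, not taken from this commit):

import scanpy as sc

# Hypothetical raw input; the reprocessed output path matches what 2-train-model.py reads.
adata = sc.read_h5ad('netmap/data/blood/raw/bd-rhap-rep1.h5ad')

# Basic QC (placeholder thresholds).
sc.pp.filter_cells(adata, min_genes=200)
sc.pp.filter_genes(adata, min_cells=3)

# Log-normalize; do NOT run sc.pp.scale(), the training script expects unscaled data.
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)

adata.write_h5ad('netmap/data/blood/reprocessed/bd-rhap-rep1.h5ad')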

examples/2-train-model.py

Lines changed: 80 additions & 0 deletions
@@ -0,0 +1,80 @@
import anndata as ad
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from itertools import combinations
from matplotlib.pyplot import rc_context
import numpy as np
import sys
import os
import os.path as op

import anndata
import scanpy as sc
from netmap.downstream import final_downstream

import warnings

from netmap.utils.data_utils import *
from netmap.utils.tf_utils import *
from netmap.utils.netmap_config import NetmapConfig

from netmap.model.train_model import create_model_zoo
from netmap.grn.inferrence import inferrence, inferrence_model_wise
from netmap.masking.internal import *
from netmap.masking.external import *

from netmap.downstream.edge_selection import *
from netmap.downstream.clustering import *
from netmap.downstream.final_downstream import *

import scipy.sparse as scs
import torch


def train_model(adata, output_dir, model_name):

    gene_names = np.array(adata.var.index)
    data_tensor = adata.X  # Log-normalized, but not standardized, data.

    # Densify sparse matrices before converting to a torch tensor.
    if scs.issparse(data_tensor):
        data_tensor = torch.tensor(data_tensor.todense(), dtype=torch.float32)
    else:
        data_tensor = torch.tensor(data_tensor, dtype=torch.float32)

    # Train an ensemble of 10 'NBAutoencoder' models.
    model_zoo = create_model_zoo(data_tensor, n_models=10, n_epochs=10000, model_type='NBAutoencoder',
                                 latent_dim=8, dropout_rate=0.1, hidden_dim=[64])

    # Attribute gene-gene interactions with GradientShap against a zero background.
    grn_adata = inferrence(model_zoo, data_tensor.cuda(), gene_names, xai_method='GradientShap',
                           background_type='zeros',
                           backing_file=op.join(output_dir, f'{model_name}.parquet'),
                           return_in_memory=False)

    # Save anndata obs to grn obs for reference.
    grn_adata.obs = adata.obs
    grn_adata.write_h5ad(op.join(output_dir, f'{model_name}_grn.h5ad'))

    grn_adata.var.to_csv(op.join(output_dir, f'{model_name}_var.tsv'), sep='\t')
    # Save the original obs.
    adata.obs.to_csv(op.join(output_dir, f'{model_name}_obs.tsv'), sep='\t')


if __name__ == '__main__':

    # Define your output dir.
    output_dir = "netmap/case_studies/blood"
    os.makedirs(output_dir, exist_ok=True)

    # This is the folder and filename.
    model_name = 'bd-rhap-rep1-X'
    model_output_dir = op.join(output_dir, model_name)
    os.makedirs(model_output_dir, exist_ok=True)

    ## Load your data.
    adata = sc.read_h5ad('netmap/data/blood/reprocessed/bd-rhap-rep1.h5ad')
    # Use the correct layer!
    # Here we use the X layer. The data needs to be sc.pp.normalize_total(target_sum=10000)
    # and sc.pp.log1p() transformed, but not scaled (no sc.pp.scale()).
    train_model(adata, output_dir=model_output_dir, model_name=model_name)
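Since the inference call moves the data tensor to .cuda(), running this script assumes a CUDA-capable GPU; adjust the hard-coded input and output paths before invoking it as python examples/2-train-model.py.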

examples/3-downstream.ipynb

Lines changed: 4349 additions & 0 deletions
Large diffs are not rendered by default.
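The downstream notebook's diff is likewise not rendered. A minimal sketch of loading the artifacts that 2-train-model.py writes, using only anndata/pandas (paths follow the names used in that script; the actual downstream analysis lives in the notebook):

import anndata as ad
import pandas as pd
import os.path as op

model_name = 'bd-rhap-rep1-X'
output_dir = op.join('netmap/case_studies/blood', model_name)

# GRN AnnData written by train_model(), with the expression obs copied onto it.
grn_adata = ad.read_h5ad(op.join(output_dir, f'{model_name}_grn.h5ad'))

# Edge table (var) and original cell metadata (obs) written as TSVs.
var_df = pd.read_csv(op.join(output_dir, f'{model_name}_var.tsv'), sep='\t', index_col=0)
obs_df = pd.read_csv(op.join(output_dir, f'{model_name}_obs.tsv'), sep='\t', index_col=0)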

src/netmap/downstream/final_downstream.py

Lines changed: 0 additions & 111 deletions
@@ -348,118 +348,7 @@ def filter_signatures_by_Ucell(selected_edges, adata) -> pd.DataFrame:
    data_ucell = adata.obs.filter(like='_UCell')
    return data_ucell


def select_top_edges(gene_inter_adata, adata, top_per_source=10, col_cluster='leiden_remap', min_reg_size=10, verbose=True, return_copy=False):
    """
    Selects top gene targets per source from a clustered gene interaction AnnData.

    Parameters
    ----------
    gene_inter_adata : AnnData
        Gene interaction AnnData with `var` containing 'source' and 'target'.
    adata : AnnData
        Expression AnnData for ranking genes.
    top_per_source : int, default=10
        Number of top targets to select per source.
    col_cluster : str, default='leiden_remap'
        Column in obs defining clusters.

    Returns
    -------
    dict
        Output of process_cell_edges: per-cluster unique edges and per-source summaries.
    """

    min_edge_support = 0.5

    if verbose: print(f"Initial shape: {gene_inter_adata.shape}")

    # Rank genes per cluster
    sc.tl.rank_genes_groups(adata, groupby=col_cluster, method='t-test')
    clusters = list(set(np.unique(gene_inter_adata.obs[col_cluster])).intersection(adata.obs[col_cluster]))

    # Merge ranking data across clusters
    rank_dfs = []
    for c in clusters:
        if verbose: print(f"Ranking cluster: {c}")
        df = sc.get.rank_genes_groups_df(adata, group=str(c))
        df = df.sort_values('scores', ascending=False)
        df[f"rank_{c}"] = np.arange(1, len(df) + 1)
        rank_dfs.append(df[['names', f'rank_{c}']])
    df_rank = reduce(lambda l, r: pd.merge(l, r, on='names', how='inner'), rank_dfs)

    keep_edges_dict = {}
    # Compute rank differences per cluster
    for c in clusters:
        Keep_edges, reglon_sizes = [], []
        if verbose: print(f"Selecting targets for cluster: {c}")
        df_rank_c = df_rank.copy()
        rank_cols = [col for col in df_rank.columns if col != 'names']
        rank_cols.remove(f"rank_{c}")
        df_rank_c['avg'] = df_rank_c[rank_cols].mean(axis=1)
        df_rank_c['diff'] = (df_rank_c[f"rank_{c}"] - df_rank_c['avg']).abs()
        df_rank_c = df_rank_c.sort_values('diff', ascending=False)

        for source in gene_inter_adata.var["source"].unique():
            if df_rank_c.loc[df_rank_c['names'] == source, 'diff'].shape[0] > 0:
                tf_rank = df_rank_c.loc[df_rank_c['names'] == source, 'diff'].values[0]

                df_targets = (
                    gene_inter_adata.var[
                        (gene_inter_adata.var['source'] == source) &
                        (gene_inter_adata.var[f'{c}_nonzero'] >= min_edge_support)
                    ]
                    .merge(df_rank_c[['names', 'diff']],
                           left_on='target', right_on='names', how='left')
                )

                df_targets['rank_distance'] = (df_targets['diff'] - tf_rank).abs()
                df_targets = df_targets.sort_values('rank_distance').head(top_per_source)

                reglon_sizes.append(len(df_targets))
                if len(df_targets) >= min_reg_size:
                    Keep_edges.extend(f"{source}_{t}" for t in df_targets['target'])

        keep_edges_dict[c] = Keep_edges
    keep_edges_dict = process_cell_edges(keep_edges_dict)
    return keep_edges_dict


def process_cell_edges(keep_edges):
    results = {'unique': {}, 'pairwise': {}}
    all_cells = list(keep_edges.keys())

    def get_source_summary(edge_set):
        # Edges are 'source_target' strings; count edges per source.
        sources = []
        for e in edge_set:
            sources.append(e.split('_')[0])

        source_dict = dict(Counter(sources))
        sources = pd.DataFrame({'source': source_dict.keys(), 'count': source_dict.values()}).sort_values('count', ascending=False)
        return sources

    # Calculate edges unique to each cluster
    for cell in all_cells:
        others = set().union(*(set(keep_edges[c]) for c in all_cells if c != cell))
        unique = set(keep_edges[cell]) - others

        df = pd.DataFrame(
            [e.split('_', 1) for e in unique],
            columns=['source', 'target']
        )

        results['unique'][cell] = {
            'edges': df,
            'summary': get_source_summary(unique)
        }

    return results


def compute_signatures_UCell_scores(selected_edges, adata, key='unique') -> pd.DataFrame:

src/netmap/downstream/plotting.py

Lines changed: 1 addition & 0 deletions
@@ -50,6 +50,7 @@ def rank_regulon_groups_dotplot(grn_adata_filtered, adata_regl, original_cluster
    pp.dot_size_df = fractions.loc[:, pp.dot_color_df.columns]
    pp.dot_size_df = pp.dot_size_df/(pp.dot_size_df.max())
    pp.dot_size_df = pp.dot_size_df.fillna(0)
+
    if return_fig:
        return pp
    else:

src/netmap/downstream/regulon.py

Lines changed: 1 addition & 1 deletion
@@ -184,7 +184,7 @@ def aggregate_edges(selected_edges, grn_adata, key='unique') -> pd.DataFrame:
        print(ct)
        sign = selected_edges[key][ct]['edges'].groupby('source').apply(lambda x: (x['source'] + "_" + x['target']).tolist()).to_dict()
        for g in sign:
-           regulons[f'{ct}_{g}'] = grn_adata[:, sign[g]].X.sum(axis = 1)
+           regulons[f'{ct}_{g}'] = grn_adata[:, sign[g]].X.sum(axis = 1)/len(sign[g])
    regulons = pd.DataFrame(regulons)
    return regulons
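The one-line change in aggregate_edges divides each regulon's summed attributions by the number of edges in that regulon, turning a raw sum into a per-edge mean so that regulons of different sizes are on a comparable scale. A toy numpy illustration of the effect (numbers are made up, not from the package):

import numpy as np

# 3 cells x 4 edges that all belong to one regulon.
X = np.array([[0.2, 0.1, 0.0, 0.3],
              [0.4, 0.0, 0.1, 0.1],
              [0.0, 0.2, 0.2, 0.4]])

raw_sum = X.sum(axis=1)                      # old behaviour: grows with regulon size
per_edge_mean = X.sum(axis=1) / X.shape[1]   # new behaviour: size-normalized score per cell
print(raw_sum, per_edge_mean)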
