Skip to content

Commit 23cfe1b

Browse files
add further documentation
1 parent d176d97 commit 23cfe1b

21 files changed

Lines changed: 148 additions & 21609 deletions

src/netmap/grn/inferrence.py

Lines changed: 40 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -346,7 +346,36 @@ def attribution_one_model(
346346

347347
def inferrence_model_wise(models, data_train_full_tensor, gene_names, xai_method, n_models = [10, 25, 50], background_type = 'zeros'):
348348

349+
"""
350+
The main inference function to compute the entire GRN model wise. Computes all
351+
attributions for all targets, aggregates them on the fly and creates an anndata.AnnData
352+
object with the edge names in the var slot.
353+
354+
Parameters
355+
----------
356+
models : list[torch.nn.Module]
357+
List of trained autoencoder models
358+
359+
data_train_full_tensor: torch.tensor
360+
input data tensor
361+
362+
gene_names: np.array
363+
Gene names indicating the order of the genes in the torch tensor
364+
365+
xai_method: str
366+
Method to be used [GradientShap, Deconvolution, GuidedBackprop]
367+
368+
n_models: list [int]
369+
returns aggregates of the attributions at these levels.
370+
371+
background_type: str
372+
Background to compute the LRP values against. One of ['zeros', 'randomize', 'data']
349373
374+
Returns
375+
-------
376+
grn_adata : anndata.AnnData
377+
A complete, aggregated GRN object
378+
"""
350379

351380
tms = []
352381

@@ -361,13 +390,9 @@ def inferrence_model_wise(models, data_train_full_tensor, gene_names, xai_method
361390
explainer, xai_type = _get_explainer(trained_model, xai_method)
362391
tms.append(explainer)
363392

364-
365-
thresholds = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
366-
367393
attributions = {}
368394
attribution_collector = None
369395
keynames = []
370-
top_egde_collector = {}
371396

372397

373398
for m in range(len(tms)):
@@ -380,19 +405,6 @@ def inferrence_model_wise(models, data_train_full_tensor, gene_names, xai_method
380405
background_type = background_type)
381406

382407

383-
# grn_adata_eph = attribution_to_anndata(current_attribution, var=cou)
384-
# b = np.argsort(grn_adata_eph.X, axis=1)
385-
# grn_adata_eph.layers['sorted'] = b
386-
# grn_adata_eph = edge_selection.add_top_edge_annotation_global(grn_adata=grn_adata_eph, top_edges = thresholds, key_name=f'agg_{m}')
387-
# df_subset = grn_adata_eph.var.iloc[:, 2:]
388-
# integral_results = df_subset.apply(
389-
# lambda row: np.sum(integrate.cumulative_trapezoid(row, thresholds )),
390-
# axis=1,
391-
# )
392-
# integral_results = integral_results/1000
393-
# top_egde_collector[f'agg_{m}'] = integral_results
394-
395-
396408
if attribution_collector is not None:
397409
# add current attribution to the collector
398410
attribution_collector = aggregate_attributions([attribution_collector, current_attribution], strategy='sum')
@@ -402,11 +414,13 @@ def inferrence_model_wise(models, data_train_full_tensor, gene_names, xai_method
402414
attribution_collector = current_attribution
403415

404416

405-
406-
if (m+1) in n_models:
407-
# dont reset, just save the correct matrix
408-
attributions[f'aggregated_{(m+1)}'] = attribution_collector/(m+1)
409-
keynames.append(f'aggregated_{(m+1)}')
417+
try:
418+
if (m+1) in n_models:
419+
# dont reset, just save the correct matrix
420+
attributions[f'aggregated_{(m+1)}'] = attribution_collector/(m+1)
421+
keynames.append(f'aggregated_{(m+1)}')
422+
except:
423+
pass
410424

411425

412426
# top_egde_collector = pd.DataFrame(top_egde_collector)
@@ -418,7 +432,8 @@ def inferrence_model_wise(models, data_train_full_tensor, gene_names, xai_method
418432

419433
grn_adata = attribution_to_anndata(attributions[keynames[0]], var=cou)
420434

421-
for k in keynames[1:len(keynames)]:
422-
# add remaining versions as masks
423-
grn_adata.layers[k] = attributions[k]
435+
if len(keynames)>0:
436+
for k in keynames[1:len(keynames)]:
437+
# add remaining versions as masks
438+
grn_adata.layers[k] = attributions[k]
424439
return grn_adata

src/netmap/masking/external.py

Lines changed: 44 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ def _create_edge_mask_from_GRN(grn_df, gene_list, name_grn='external_grn'):
2020
names: numpy.ndarray: edge name vector (GeneA_GeneB)
2121
2222
"""
23+
2324
# Create a mapping from gene names to their matrix indices for efficient look-up.
2425
gene_to_index = {gene: i for i, gene in enumerate(gene_list)}
2526
num_genes = len(gene_list)
@@ -55,22 +56,28 @@ def _create_edge_mask_from_GRN(grn_df, gene_list, name_grn='external_grn'):
5556

5657

5758
def _get_all_genes_in_grn_object(grnad):
59+
"""
60+
Helper function to get all genes if not available
61+
62+
Args:
63+
grnad (anndata.Anndata) An anndata object containing a var object with the columns source
64+
and target
65+
"""
5866
all_sources = np.unique(grnad.var.source)
5967
all_targets = np.unique(grnad.var.target)
6068
all_genes = np.unique(np.concatenate([all_sources, all_targets]))
6169
return all_genes
6270

6371

6472
def add_external_grn(grn_ad, external_grn, name_grn = 'external_grn'):
65-
66-
"""
67-
Adds three columns to a anndate GRN object.
68-
is_target
69-
is_source
70-
is_egde
71-
7273
"""
74+
Add annotation columns for a reference GRN
7375
76+
Args:
77+
grn_ad
78+
:param external_grn: pd.DataFrame containing a source column and a target column
79+
:param name_grn:
80+
"""
7481
all_my_genes = _get_all_genes_in_grn_object(grn_ad)
7582
edge_mask = _create_edge_mask_from_GRN(external_grn, all_my_genes, name_grn = name_grn)
7683
grn_ad.var = grn_ad.var.merge(edge_mask, left_index=True, right_index=True)
@@ -81,6 +88,15 @@ def add_external_grn(grn_ad, external_grn, name_grn = 'external_grn'):
8188

8289

8390
def get_genome_annotation_from_gtf(gtf_df):
91+
""" Add genome information from a pandas data frame to the object.
92+
Returns the gene features from a gtf file.
93+
94+
Args:
95+
gtf_df (pd.DataFrame): Genome information
96+
97+
Returns:
98+
pd.DataFrame with genome information
99+
"""
84100
genes = gtf_df.filter(feature="gene")
85101
genes = pd.DataFrame(genes)
86102
genes.columns = gtf_df.columns
@@ -92,17 +108,36 @@ def get_genome_annotation_from_gtf(gtf_df):
92108
return genes
93109

94110

95-
def preprocess_bed_file(bed_file, gtf_df):
111+
def preprocess_bed_file(bed_file):
112+
""" Read the bed file as a tab separated csv file and obtain all TFs
113+
that are related to a gene from the object.
114+
115+
Args:
116+
bed_file (str): path containing the bed file
117+
118+
Returns:
119+
pd.DataFrame: Dataframe relating the TFs to the genes
120+
"""
96121
## ALL cis regulatory motifs
97122
crm_df = pd.read_csv(bed_file, sep="\t", header=None)
98123
crm_df.columns = ['chr', 'start', 'end', 'TF_list','TF_number', 'strand', 'number1', 'number2', 'large_number']
99-
crm_by_chr = {chr_: df for chr_, df in crm_df.groupby("chr")}
100124
crm_df['TF_list_list'] = crm_df['TF_list'].str.split(",")
101125
return crm_df
102126

103127

104128

105129
def get_regulators(crm_df, genes, window):
130+
""" Obtain the regulators of a target gene by searching
131+
in a window up and down from the TSS
132+
133+
Args:
134+
crm_df (_type_): _description_
135+
genes (_type_): _description_
136+
window (_type_): _description_
137+
138+
Returns:
139+
_type_: _description_
140+
"""
106141
gene_to_tfs = defaultdict(set)
107142

108143
crm_by_chr = {chr_: df for chr_, df in crm_df.groupby("chr")}

src/netmap/masking/internal.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -148,6 +148,16 @@ def dict_to_dataframe(mask_dict, column_order_list):
148148

149149

150150
def add_neighbourhood_expression_mask(adata, grn_adata):
151+
""" Create a mask indicating whether the edge is likely actually
152+
expressed or not.
153+
154+
Args:
155+
adata (_type_): _description_
156+
grn_adata (_type_): _description_
157+
158+
Returns:
159+
_type_: _description_
160+
"""
151161
counts = pd.DataFrame(adata.X)
152162
counts.columns =adata.var.index
153163
ne = get_neighborhood_expression(adata, required_neighbours=5)

src/netmap/model/train_model.py

Lines changed: 51 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,27 @@
22
from netmap.model.nbautoencoder import NegativeBinomialAutoencoder
33
from netmap.model.zinbautoencoder import ZINBAutoencoder
44

5+
import torch
6+
from torch.utils.data import DataLoader, TensorDataset
7+
from tqdm import tqdm
8+
59

10+
def create_model_zoo(data_tensor, n_models = 10, n_epochs = 10000, model_type = 'ZINBAutoencoder', dropout_rate = 0.02, latent_dim=8, hidden_dim=[128]):
11+
""" Creates a set of Autoencoders of the data using the speicified architecture. The architecture of the encoder can be specified using
12+
the `hidden_dim` parameter, the decoder architecture is mirrored. Early stopping is used by default.
613
7-
def create_model_zoo(data_tensor, n_models = 4, n_epochs = 500, model_type = 'ZINBAutoencoder', dropout_rate = 0.02, latent_dim=8, hidden_dim=[128]):
14+
Args:
15+
data_tensor (torch.tensor): The raw gene expression data
16+
n_models (int, optional): The number of models to compute. Defaults to 10.
17+
n_epochs (int, optional): Maximum number of epochs, if early stopping is not triggered. Defaults to 10000. Use
18+
model_type (str, optional): Model type, one of [ZINBAutoencoder, NegativeBinomialAutoencoder] Defaults to 'ZINBAutoencoder'.
19+
dropout_rate (float, optional): Dropout rate used during training. Defaults to 0.02.
20+
latent_dim (int, optional): Number of neurons in the latent dimension. Defaults to 8.
21+
hidden_dim (list, optional): Architecture specification, list of ints. Defaults to [128].
22+
23+
Returns:
24+
Model (list): The list of trained models.
25+
"""
826
model_zoo = []
927
counter = 0
1028
failures = 0
@@ -23,7 +41,7 @@ def create_model_zoo(data_tensor, n_models = 4, n_epochs = 500, model_type = 'ZI
2341

2442
optimizer2 = torch.optim.Adam(trained_model2.parameters(), lr=1e-4)
2543

26-
trained_model2 = train_autoencoder_early_stopping(
44+
trained_model2 = _train_autoencoder_early_stopping(
2745
trained_model2,
2846
data_train2.cuda(),
2947
data_test2.cuda(),
@@ -40,13 +58,24 @@ def create_model_zoo(data_tensor, n_models = 4, n_epochs = 500, model_type = 'ZI
4058
return model_zoo
4159

4260

43-
def train_autoencoder(
61+
def _train_autoencoder(
4462
model,
4563
data_train,
4664
optimizer,
4765
batch_size=32, # Minibatch size
4866
num_epochs=100,
4967
):
68+
"""Legacy version of the training loop without early stopping
69+
70+
Args:
71+
model (_type_): The model to be trained
72+
data_train (_type_): Training data
73+
optimizer (_type_): optimizer to be used
74+
batch_size (int, optional): Batch size. Defaults to 32.
75+
76+
Returns:
77+
Model: trained model
78+
"""
5079
# Prepare DataLoader for training
5180
train_dataset = TensorDataset(data_train)
5281
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
@@ -73,26 +102,36 @@ def train_autoencoder(
73102

74103
return model
75104

76-
import torch
77-
from torch.utils.data import DataLoader, TensorDataset
78-
79105

80-
from tqdm import tqdm
81-
from torch.utils.data import TensorDataset, DataLoader
82-
from tqdm import tqdm
83-
import torch
84106

85-
def train_autoencoder_early_stopping(
107+
def _train_autoencoder_early_stopping(
86108
model,
87109
data_train,
88110
data_val,
89111
optimizer,
90112
batch_size=32,
91-
num_epochs=100,
113+
num_epochs=10000,
92114
patience=10,
93115
min_delta=0.001,
94116
validation_freq = 10,
95117
):
118+
"""Training loop for the autoencoders.
119+
120+
Args:
121+
model (_type_): An instance of an autoencoder model
122+
data_train (_type_): Training data split
123+
data_val (_type_): Validation data split used for early stopping
124+
optimizer (_type_): Optimizer used
125+
batch_size (int, optional): Minibatch size. Defaults to 32.
126+
num_epochs (int, optional): Number of epochs. Defaults to 10000.
127+
patience (int, optional): Number of epochs with delta loss smaller
128+
than min delta before early stopping is triggered. Defaults to 10.
129+
min_delta (float, optional): Loss delta for early stopping. Defaults to 0.001.
130+
validation_freq (int, optional): Number of epochs before validation is run. Defaults to 10.
131+
132+
Returns:
133+
Model: Trained model with the parametrization of the best loss.
134+
"""
96135
# Prepare DataLoaders
97136
train_dataset = TensorDataset(data_train)
98137
val_dataset = TensorDataset(data_val)

src/netmap/utils/data_utils.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,10 @@ def attribution_to_anndata(attribution_list, var = None, obs = None)-> anndata.A
88
Transform attribution data frame into an anndata object
99
1010
Args:
11-
attribution_list: (sparse) Data frame of attribution values (one column per edge)
11+
attribution_list: (sparse) Data frame of attribution values (one column per edge)
1212
1313
returns:
14-
Anndata object with attribution values in X.
14+
anndata.Anndata: Anndata object with attribution values in X.
1515
"""
1616
print('Creating anndata')
1717
adata = anndata.AnnData(attribution_list)

src/netmap/utils/netmap_config.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ class NetmapConfig:
99
input_data: str = "data.h5ad"
1010
layer: str = 'X'
1111
output_directory: str = "netmap"
12-
transcription_factors: str = "/data_nfs/datasets/SCENIC_DB/tf_lists/allTFs_hg38.txt"
12+
transcription_factors: str = ""
1313
tf_only: bool = True
1414
penalize_error: bool = True
1515
adata_filename: str = "grn_lrp.h5ad"

src/old/edge_selection.py

Lines changed: 0 additions & 47 deletions
This file was deleted.

0 commit comments

Comments
 (0)