From 393b751848d088f2d02505dc782d520f8a23823d Mon Sep 17 00:00:00 2001 From: Maya Sheth Date: Fri, 6 Jun 2025 10:07:08 -0700 Subject: [PATCH 01/22] first attempt at implementation, untested --- workflow/Snakefile_training | 40 +++++++++------ workflow/rules/compare_models.smk | 5 +- workflow/rules/crispr_features.smk | 27 ++++++---- workflow/rules/feature_analysis.smk | 50 +++++++++---------- workflow/rules/train_model.smk | 14 +++--- workflow/rules/utils.smk | 26 ++++++++++ .../feature_tables/combine_feature_tables.R | 14 ++++-- .../overlap_features_with_crispr_data.R | 5 +- .../feature_tables/process_crispr_data.R | 36 +++++++++++-- .../model_training/compare_all_models.py | 32 +++++++----- 10 files changed, 165 insertions(+), 84 deletions(-) diff --git a/workflow/Snakefile_training b/workflow/Snakefile_training index 545a8fe..b9b40c1 100755 --- a/workflow/Snakefile_training +++ b/workflow/Snakefile_training @@ -1,27 +1,34 @@ from snakemake.utils import min_version min_version("7.0") +conda: "mamba" import pandas as pd import os import yaml -configfile: "config/config_training.yaml" model_config = pd.read_table(config["model_config"], na_values="").fillna("None").set_index("model", drop=False) dataset_config = pd.read_table(config["dataset_config"], na_values="").set_index("biosample", drop=False) conda: "mamba" + +# import utils and make config paths absolute MAX_MEM_MB = config["max_memory_allocation_mb"] include: "rules/utils.smk" E2G_DIR_PATH = os.path.abspath(config["E2G_DIR_PATH"]) config = make_paths_absolute(config, E2G_DIR_PATH) +config["results_dir"] = os.path.join(E2G_DIR_PATH, config["results_dir"]) # manually modify results dir since may not exist -# Need to manually make results_dir an absolute path since above may -# not work if results_dir folder isn't created -# If results_dir is already an absolute path, this is a no-op -config["results_dir"] = os.path.join(E2G_DIR_PATH, config["results_dir"]) +# define some global variables +RESULTS_DIR = 
os.path.join(config["results_dir"], "dataset_results") # for datasets +MODELS_RESULTS_DIR = os.path.join(config["results_dir"], "model_results") +SCRIPTS_DIR = os.path.join(E2G_DIR_PATH, "workflow/scripts") +CRISPR_CELL_TYPES = config["crispr_cell_types"] +# process configs model_config = process_model_config(model_config) +model_dataset_dict = make_model_dataset_dict(model_config, dataset_config) # dictionary of dictionaries: model: {crispr_ct: dataset, ...} +# import ABC submodule module ABC: snakefile: f"{config['ABC_DIR_PATH']}/workflow/Snakefile" @@ -31,39 +38,40 @@ abc_config = get_abc_config(config) use rule * from ABC exclude all as abc_* -RESULTS_DIR = config["results_dir"] -SCRIPTS_DIR = os.path.join(E2G_DIR_PATH, "workflow/scripts") - -# These rules requires the variables above to be defined +# import all rules (require the variables above to be defined) include: "rules/genomewide_features.smk" include: "rules/crispr_features.smk" include: "rules/train_model.smk" include: "rules/feature_analysis.smk" include: "rules/compare_models.smk" -# Validate ABC_directory column and make ABC directory dict +# validate ABC_directory column and make ABC directory dict ABC_BIOSAMPLES_DIR = process_abc_directory_column(model_config) # specify target files output_files = [] -output_files.extend(expand(os.path.join(RESULTS_DIR, "{dataset}", "{model}", "for_training.EPCrisprBenchmark_ensemble_data_GRCh38.K562_features_NAfilled.tsv.gz"), zip, dataset=model_config["dataset"], model=model_config["model"])) -output_files.extend(expand(os.path.join(RESULTS_DIR, "{dataset}", "{model}", "model", "model_full.pkl"), zip, dataset=model_config["dataset"], model=model_config["model"])) # trained models +output_files.extend(expand(os.path.join(MODELS_RESULTS_DIR, "{model}", "for_training.EPCrisprBenchmark_ensemble_data_GRCh38.K562_features_NAfilled.tsv.gz"), model=model_config["model"])) +output_files.extend(expand(os.path.join(MODELS_RESULTS_DIR, "{model}", "model", 
"model_full.pkl"), model=model_config["model"])) # trained models + +# output_files.extend(expand(os.path.join(RESULTS_DIR, "{dataset}", "{model}", "for_training.EPCrisprBenchmark_ensemble_data_GRCh38.K562_features_NAfilled.tsv.gz"), zip, dataset=model_config["dataset"], model=model_config["model"])) +# output_files.extend(expand(os.path.join(RESULTS_DIR, "{dataset}", "{model}", "model", "model_full.pkl"), zip, dataset=model_config["dataset"], model=model_config["model"])) # trained models output_files.append(os.path.join(RESULTS_DIR, "performance_across_models.tsv")) # comparison across models output_files.extend(expand(os.path.join(RESULTS_DIR, "performance_across_models_{metric}.pdf"), metric=["auprc", "precision"])) # plot of comparisons if config["run_feature_analysis"]: - output_files.extend(expand(os.path.join(RESULTS_DIR, "{dataset}", "{model}", "feature_analysis", "forward_feature_selection_auprc.pdf"), zip, dataset=model_config["dataset"], model=model_config["model"])) - output_files.extend(expand(os.path.join(RESULTS_DIR, "{dataset}", "{model}", "feature_analysis", "backward_feature_selection_auprc.pdf"), zip, dataset=model_config["dataset"], model=model_config["model"])) - output_files.extend(expand(os.path.join(RESULTS_DIR, "{dataset}", "{model}", "feature_analysis", "permutation_feature_importance_auprc.pdf"), zip, dataset=model_config["dataset"], model=model_config["model"])) + output_files.extend(expand(os.path.join(MODELS_RESULTS_DIR, "{model}", "feature_analysis", "forward_feature_selection_auprc.pdf"), model=model_config["model"])) + output_files.extend(expand(os.path.join(MODELS_RESULTS_DIR, "{model}", "feature_analysis", "backward_feature_selection_auprc.pdf"), model=model_config["model"])) + output_files.extend(expand(os.path.join(MODELS_RESULTS_DIR, "{model}", "feature_analysis", "permutation_feature_importance_auprc.pdf"), model=model_config["model"])) # only test all feature sets if polynomial==False and n_features<14 for row in 
model_config.itertuples(index=False): if not row.polynomial == 'True': features = pd.read_table(row.feature_table) n_features = len(features) + if n_features<14: - output_files.append(os.path.join(RESULTS_DIR, row.dataset, row.model, "feature_analysis", "all_feature_sets.tsv")) + output_files.append(os.path.join(MODELS_RESULTS_DIR, row.model, "feature_analysis", "all_feature_sets.tsv")) rule all: input: diff --git a/workflow/rules/compare_models.smk b/workflow/rules/compare_models.smk index b0623d8..8fb446c 100755 --- a/workflow/rules/compare_models.smk +++ b/workflow/rules/compare_models.smk @@ -2,7 +2,8 @@ # compare cv-performance on training data across all models (note, this is not the true benchmarking performance CRISPR elements not overlapping prediction elements aren't considered) rule gather_model_performances: input: - all_predictions = expand(os.path.join(RESULTS_DIR, "{dataset}", "{model}", "model", "training_predictions.tsv"), zip, dataset=model_config["dataset"], model=model_config["model"]) + all_predictions = expand(os.path.join(MODELS_RESULTS_DIR "{model}", "model", "training_predictions.tsv"), model=model_config["model"]), + all_missing = expand(os.path.join(MODELS_RESULTS_DIR "{model}", "combined_CRISPR_dataset.missing_from_features.{nafill}.tsv.gz"), model=model_config["model"]), output: comp_table = os.path.join(RESULTS_DIR, "performance_across_models.tsv") params: @@ -17,6 +18,8 @@ rule gather_model_performances: shell: """ python {params.scripts_dir}/model_training/compare_all_models.py \ + --all_pred "{input.all_predictions}" \ + --all_missing "{input.all_missing}" \ --model_config_file {params.model_config_file} \ --output_file {output.comp_table} \ --crispr_data {params.crispr_dataset} \ diff --git a/workflow/rules/crispr_features.smk b/workflow/rules/crispr_features.smk index 378fdf0..8748c76 100644 --- a/workflow/rules/crispr_features.smk +++ b/workflow/rules/crispr_features.smk @@ -28,18 +28,18 @@ rule format_external_features_config: 
script: "../scripts/feature_tables/format_external_features_config.R" -# overlap feature table with K562 CRISPR data -rule overlap_features_crispr: +# overlap feature table with CRISPR data for this dataset +rule overlap_features_crispr_for_dataset: input: features = os.path.join(RESULTS_DIR, "{dataset}", "genomewide_features.tsv.gz"), - crispr = config['crispr_dataset'], feature_table_file = os.path.join(RESULTS_DIR, "{dataset}", "feature_table.tsv"), + crispr = config['crispr_dataset'], tss = config['gene_TSS500'] params: - tpm_threshold = lambda wildcards: model_config.loc[wildcards.model, 'tpm_threshold'] + crispr_cell_type = lambda wildcards: dataset_config.loc[wildcards.dataset, "crispr_cell_type"] output: - features = os.path.join(RESULTS_DIR, "{dataset}", "{model}", "EPCrisprBenchmark_ensemble_data_GRCh38.K562_features_{nafill}.tsv.gz"), - missing = os.path.join(RESULTS_DIR, "{dataset}", "{model}", "missing.EPCrisprBenchmark_ensemble_data_GRCh38.K562_features_{nafill}.tsv.gz") + features = os.path.join(RESULTS_DIR, "{dataset}", "CRISPR_dataset.overlapping_features.{nafill}.tsv.gz"), + missing = os.path.join(RESULTS_DIR, "{dataset}", "CRISPR_dataset.missing_from_features.{nafill}.tsv.gz") conda: "../envs/encode_re2g.yml" resources: @@ -47,15 +47,22 @@ rule overlap_features_crispr: script: "../scripts/feature_tables/overlap_features_with_crispr_data.R" -# process data for model training: rename columns, apply filter features, filter to gene list +# process data for model training: rename columns, apply filter features, filter to gene list, combine across datasets # note: we use the NAfilled CRISPR feature data here! 
+def get_crispr_files_for_model(wildcards, file_type): + datasets = model_dataset_dict[wildcards.model].values() + files = [os.path.join(RESULTS_DIR, ds, f"CRISPR_dataset.{file_type}.{wildcards.nafill}.tsv.gz") for ds in datasets] + return files + rule process_crispr_data: input: - crispr_features = os.path.join(RESULTS_DIR, "{dataset}", "{model}", "EPCrisprBenchmark_ensemble_data_GRCh38.K562_features_{nafill}.tsv.gz") + crispr_features = lambda wildcards: get_crispr_files_for_model(wildcards, "overlapping_features"), + crispr_missing = lambda wildcards: get_crispr_files_for_model(wildcards, "missing_from_features"), params: - genes = config["gene_TSS500"] + genes = config["gene_TSS500"], output: - processed = os.path.join(RESULTS_DIR, "{dataset}", "{model}", "for_training.EPCrisprBenchmark_ensemble_data_GRCh38.K562_features_{nafill}.tsv.gz") + processed = os.path.join(MODELS_RESULTS_DIR, "{model}", "for_training.combined_CRISPR_dataset.overlapping_features.{nafill}.tsv.gz") + missing = os.path.join(MODELS_RESULTS_DIR, "{model}", "combined_CRISPR_dataset.missing_from_features.{nafill}.tsv.gz") conda: "../envs/encode_re2g.yml" resources: diff --git a/workflow/rules/feature_analysis.smk b/workflow/rules/feature_analysis.smk index 02354a1..afad160 100755 --- a/workflow/rules/feature_analysis.smk +++ b/workflow/rules/feature_analysis.smk @@ -1,16 +1,16 @@ # forward sequential feature selection rule calculate_forward_feature_selection: input: - crispr_features_processed = os.path.join(RESULTS_DIR, "{dataset}", "{model}", "for_training.EPCrisprBenchmark_ensemble_data_GRCh38.K562_features_NAfilled.tsv.gz"), + crispr_features_processed = os.path.join(MODELS_RESULTS_DIR, "{model}", "for_training.combined_CRISPR_dataset.overlapping_features.NAfilled.tsv.gz"), feature_table = lambda wildcards: model_config.loc[wildcards.model, 'feature_table'], - model_params = os.path.join(RESULTS_DIR, "{dataset}", "{model}", "model", "training_params.pkl") + model_params = 
os.path.join(MODELS_RESULTS_DIR, "{model}", "model", "training_params.pkl") params: epsilon = config["epsilon"], scripts_dir = SCRIPTS_DIR, polynomial = lambda wildcards: model_config.loc[wildcards.model, 'polynomial'], - out_dir = os.path.join(RESULTS_DIR, "{dataset}", "{model}", "feature_analysis") + out_dir = os.path.join(MODELS_RESULTS_DIR, "{model}", "feature_analysis") output: - results = os.path.join(RESULTS_DIR, "{dataset}", "{model}", "feature_analysis", "forward_feature_selection.tsv"), + results = os.path.join(MODELS_RESULTS_DIR, "{model}", "feature_analysis", "forward_feature_selection.tsv"), conda: "../envs/encode_re2g.yml" resources: @@ -28,13 +28,13 @@ rule calculate_forward_feature_selection: rule plot_forward_feature_selection: input: - results = os.path.join(RESULTS_DIR, "{dataset}", "{model}", "feature_analysis", "forward_feature_selection.tsv"), + results = os.path.join(MODELS_RESULTS_DIR, "{model}", "feature_analysis", "forward_feature_selection.tsv"), feature_table = lambda wildcards: model_config.loc[wildcards.model, 'feature_table'], params: polynomial = lambda wildcards: model_config.loc[wildcards.model, 'polynomial'] output: - out_auprc = os.path.join(RESULTS_DIR, "{dataset}", "{model}", "feature_analysis", "forward_feature_selection_auprc.pdf"), - out_prec = os.path.join(RESULTS_DIR, "{dataset}", "{model}", "feature_analysis", "forward_feature_selection_precision.pdf") + out_auprc = os.path.join(MODELS_RESULTS_DIR, "{model}", "feature_analysis", "forward_feature_selection_auprc.pdf"), + out_prec = os.path.join(MODELS_RESULTS_DIR, "{model}", "feature_analysis", "forward_feature_selection_precision.pdf") conda: "../envs/encode_re2g.yml" resources: @@ -45,16 +45,16 @@ rule plot_forward_feature_selection: # backward sequential feature selection rule calculate_backward_feature_selection: input: - crispr_features_processed = os.path.join(RESULTS_DIR, "{dataset}", "{model}", 
"for_training.EPCrisprBenchmark_ensemble_data_GRCh38.K562_features_NAfilled.tsv.gz"), + crispr_features_processed = os.path.join(MODELS_RESULTS_DIR, "{model}", "for_training.combined_CRISPR_dataset.overlapping_features.NAfilled.tsv.gz"), feature_table = lambda wildcards: model_config.loc[wildcards.model, 'feature_table'], - model_params = os.path.join(RESULTS_DIR, "{dataset}", "{model}", "model", "training_params.pkl") + model_params = os.path.join(MODELS_RESULTS_DIR, "{model}", "model", "training_params.pkl") params: epsilon = config["epsilon"], scripts_dir = SCRIPTS_DIR, polynomial = lambda wildcards: model_config.loc[wildcards.model, 'polynomial'], - out_dir = os.path.join(RESULTS_DIR, "{dataset}", "{model}", "feature_analysis") + out_dir = os.path.join(MODELS_RESULTS_DIR, "{model}", "feature_analysis") output: - results = os.path.join(RESULTS_DIR, "{dataset}", "{model}", "feature_analysis", "backward_feature_selection.tsv"), + results = os.path.join(MODELS_RESULTS_DIR, "{model}", "feature_analysis", "backward_feature_selection.tsv"), conda: "../envs/encode_re2g.yml" resources: @@ -72,13 +72,13 @@ rule calculate_backward_feature_selection: rule plot_backward_feature_selection: input: - results = os.path.join(RESULTS_DIR, "{dataset}", "{model}", "feature_analysis", "backward_feature_selection.tsv"), + results = os.path.join(MODELS_RESULTS_DIR, "{dataset}", "{model}", "feature_analysis", "backward_feature_selection.tsv"), feature_table = lambda wildcards: model_config.loc[wildcards.model, 'feature_table'] params: polynomial = lambda wildcards: model_config.loc[wildcards.model, 'polynomial'] output: - out_auprc = os.path.join(RESULTS_DIR, "{dataset}", "{model}", "feature_analysis", "backward_feature_selection_auprc.pdf"), - out_prec = os.path.join(RESULTS_DIR, "{dataset}", "{model}", "feature_analysis", "backward_feature_selection_precision.pdf") + out_auprc = os.path.join(MODELS_RESULTS_DIR, "{dataset}", "{model}", "feature_analysis", 
"backward_feature_selection_auprc.pdf"), + out_prec = os.path.join(MODELS_RESULTS_DIR, "{dataset}", "{model}", "feature_analysis", "backward_feature_selection_precision.pdf") conda: "../envs/encode_re2g.yml" resources: @@ -89,15 +89,15 @@ rule plot_backward_feature_selection: # compare all features sets rule compare_all_feature_sets: input: - crispr_features_processed = os.path.join(RESULTS_DIR, "{dataset}", "{model}", "for_training.EPCrisprBenchmark_ensemble_data_GRCh38.K562_features_NAfilled.tsv.gz"), + crispr_features_processed = os.path.join(MODELS_RESULTS_DIR, "{model}", "for_training.combined_CRISPR_dataset.overlapping_features.NAfilled.tsv.gz"), feature_table = lambda wildcards: model_config.loc[wildcards.model, 'feature_table'], - model_params = os.path.join(RESULTS_DIR, "{dataset}", "{model}", "model", "training_params.pkl") + model_params = os.path.join(MODELS_RESULTS_DIR, "{model}", "model", "training_params.pkl") params: epsilon = config["epsilon"], scripts_dir = SCRIPTS_DIR, - out_dir = os.path.join(RESULTS_DIR, "{dataset}", "{model}", "feature_analysis") + out_dir = os.path.join(MODELS_RESULTS_DIR "{model}", "feature_analysis") output: - results = os.path.join(RESULTS_DIR, "{dataset}", "{model}", "feature_analysis", "all_feature_sets.tsv"), + results = os.path.join(MODELS_RESULTS_DIR "{model}", "feature_analysis", "all_feature_sets.tsv"), conda: "../envs/encode_re2g.yml" resources: @@ -116,17 +116,17 @@ rule compare_all_feature_sets: # permuation feature importance rule calculate_permutation_feature_importance: input: - crispr_features_processed = os.path.join(RESULTS_DIR, "{dataset}", "{model}", "for_training.EPCrisprBenchmark_ensemble_data_GRCh38.K562_features_NAfilled.tsv.gz"), + crispr_features_processed = os.path.join(MODELS_RESULTS_DIR, "{model}", "for_training.combined_CRISPR_dataset.overlapping_features.NAfilled.tsv.gz"), feature_table = lambda wildcards: model_config.loc[wildcards.model, 'feature_table'], - model_params = 
os.path.join(RESULTS_DIR, "{dataset}", "{model}", "model", "training_params.pkl") + model_params = os.path.join(MODELS_RESULTS_DIR "{model}", "model", "training_params.pkl") params: epsilon = config["epsilon"], n_repeats = 20, scripts_dir = SCRIPTS_DIR, polynomial = lambda wildcards: model_config.loc[wildcards.model, 'polynomial'], - out_dir = os.path.join(RESULTS_DIR, "{dataset}", "{model}", "feature_analysis") + out_dir = os.path.join(MODELS_RESULTS_DIR "{model}", "feature_analysis") output: - results = os.path.join(RESULTS_DIR, "{dataset}", "{model}", "feature_analysis", "permutation_feature_importance.tsv"), + results = os.path.join(MODELS_RESULTS_DIR "{model}", "feature_analysis", "permutation_feature_importance.tsv"), conda: "../envs/encode_re2g.yml" resources: @@ -145,14 +145,14 @@ rule calculate_permutation_feature_importance: rule plot_permutation_feature_importance: input: - results = os.path.join(RESULTS_DIR, "{dataset}", "{model}", "feature_analysis", "permutation_feature_importance.tsv"), + results = os.path.join(MODELS_RESULTS_DIR "{model}", "feature_analysis", "permutation_feature_importance.tsv"), feature_table = lambda wildcards: model_config.loc[wildcards.model, 'feature_table'] params: polynomial = lambda wildcards: model_config.loc[wildcards.model, 'polynomial'], n_repeats = 20 output: - out_auprc = os.path.join(RESULTS_DIR, "{dataset}", "{model}", "feature_analysis", "permutation_feature_importance_auprc.pdf"), - out_prec = os.path.join(RESULTS_DIR, "{dataset}", "{model}", "feature_analysis", "permutation_feature_importance_precision.pdf") + out_auprc = os.path.join(MODELS_RESULTS_DIR "{model}", "feature_analysis", "permutation_feature_importance_auprc.pdf"), + out_prec = os.path.join(MODELS_RESULTS_DIR "{model}", "feature_analysis", "permutation_feature_importance_precision.pdf") conda: "../envs/encode_re2g.yml" resources: diff --git a/workflow/rules/train_model.smk b/workflow/rules/train_model.smk index 9a7e683..e5a3e15 100644 --- 
a/workflow/rules/train_model.smk +++ b/workflow/rules/train_model.smk @@ -7,7 +7,7 @@ rule generate_model_params: default_params = config["default_params"], scripts_dir = SCRIPTS_DIR output: - final_params = os.path.join(RESULTS_DIR, "{dataset}", "{model}", "model", "training_params.pkl") + final_params = os.path.join(MODELS_RESULTS_DIR, "{model}", "model", "training_params.pkl") conda: "../envs/encode_re2g.yml" shell: @@ -21,19 +21,17 @@ rule generate_model_params: # generate trained model and cross-validated predictions on CRISPR data rule train_model: input: - crispr_features_processed = os.path.join(RESULTS_DIR, "{dataset}", "{model}", "for_training.EPCrisprBenchmark_ensemble_data_GRCh38.K562_features_NAfilled.tsv.gz"), + crispr_features_processed = os.path.join(MODELS_RESULTS_DIR, "{model}", "for_training.combined_CRISPR_dataset.overlapping_features.NAfilled.tsv.gz"), feature_table = lambda wildcards: model_config.loc[wildcards.model, 'feature_table'], - model_params = os.path.join(RESULTS_DIR, "{dataset}", "{model}", "model", "training_params.pkl") + model_params = os.path.join(MODELS_RESULTS_DIR, "{model}", "model", "training_params.pkl") params: epsilon = config["epsilon"], scripts_dir = SCRIPTS_DIR, polynomial = lambda wildcards: model_config.loc[wildcards.model, 'polynomial'], - out_dir = os.path.join(RESULTS_DIR, "{dataset}", "{model}", "model") + out_dir = os.path.join(MODELS_RESULTS_DIR, "{model}") output: - trained_model = os.path.join(RESULTS_DIR, "{dataset}", "{model}", "model", "model_full.pkl"), - pred = os.path.join(RESULTS_DIR, "{dataset}", "{model}", "model", "training_predictions.tsv"), - #in_order = os.path.join(RESULTS_DIR, "{dataset}", "{model}", "model", "training_data_in_order.tsv"), - #shap = os.path.join(RESULTS_DIR, "{dataset}", "{model}", "model", "shap_scores.tsv") + trained_model = os.path.join(MODELS_RESULTS_DIR, "{model}", "model", "model_full.pkl"), + pred = os.path.join(MODELS_RESULTS_DIR, "{model}", "model", 
"training_predictions.tsv"), conda: "../envs/encode_re2g.yml" resources: diff --git a/workflow/rules/utils.smk b/workflow/rules/utils.smk index fa1db11..129a6fd 100755 --- a/workflow/rules/utils.smk +++ b/workflow/rules/utils.smk @@ -35,6 +35,32 @@ def process_model_config(model_config): return model_config +# create dictionary of dictionaries to map datasets to crispr cell types: model: {crispr_ct: dataset, ...} +def make_model_dataset_dict(model_config, dataset_config): + + # dict of dataset: crispr_cell_type + dataset_ct_dict = dict(zip(dataset_config['biosample'], dataset_config['crispr_cell_type'])) + + model_dicts = [] + for index, row in model_config.iterrows(): + model_datasets = [item.strip() for item in row["dataset"].split(",")] # return list of datasets + mapped_cts = [dataset_ct_dict[ds] for ds in model_datasets] + + # make sure 1:1 correspondance with CRISPR cell types + if sorted(CRISPR_CELL_TYPES) != sorted(mapped_cts): + print(f"Model datasets: {model_datasets} -> cell types: {mapped_cts}") + raise Exception(f"Datasets specified for {row["model"]} do not map to all CRISPR cell types.") + + this_model_dict = dict(zip(model_datasets, mapped_cts)) + model_dicts.append(this_model_dict) + + # combine into dictionary of dicitonaries + model_dataset_dict = dict(zip(model_config["model"]), model_dicts) + + return model_dataset_dict + + + def get_abc_config(config): abc_config_file = os.path.join(config["ABC_DIR_PATH"], "config/config.yaml") with open(abc_config_file, 'r') as stream: diff --git a/workflow/scripts/feature_tables/combine_feature_tables.R b/workflow/scripts/feature_tables/combine_feature_tables.R index 54cbe91..cdee111 100644 --- a/workflow/scripts/feature_tables/combine_feature_tables.R +++ b/workflow/scripts/feature_tables/combine_feature_tables.R @@ -6,13 +6,17 @@ library(dplyr) model_config = fread(snakemake@input$model_config) ds = snakemake@wildcards$dataset -# merge feature tables -models_this = dplyr::filter(model_config, dataset==ds) 
-for (i in 1:nrow(models_this)){ - ft = fread(models_this$feature_table[i]) - if (i==1){df = ft} else {df = rbind(df, ft)} +# merge feature tables for models with this dataset +ft_files = c() +for (i in 1:nrow(model_config)){ + model_datasets = model_config$dataset[i] %>% strsplit(",") %>% unlist() %>% trimws() + if (ds %in% model_datasets) { + ft_files = c(ft_files, model_config$feature_table[i]) + } } +df <- lapply(ft_files, fread) %>% rbindlist() %>% as.data.frame() + # for sc-E2G pipeline if (("ARC.E2G.Score" %in% df$feature) | ("Kendall" %in% df$feature)){ ARC_rows = data.frame(c("RNA_meanLogNorm", "RNA_pseudobulkTPM", "RNA_percentCellsDetected", "Kendall", "ARC.E2G.Score", "ABC.Score", "normalizedATAC_enh"), diff --git a/workflow/scripts/feature_tables/overlap_features_with_crispr_data.R b/workflow/scripts/feature_tables/overlap_features_with_crispr_data.R index 34d2850..abf57f9 100755 --- a/workflow/scripts/feature_tables/overlap_features_with_crispr_data.R +++ b/workflow/scripts/feature_tables/overlap_features_with_crispr_data.R @@ -81,9 +81,10 @@ aggregate_features <- function(merged, feature_score_cols, agg_cols, agg_fun) { # load feature table features <- fread(snakemake@input$features) -# load crispri data and only retain relevant columns +# load crispri data and only retain relevant columns and filter to cell type crispr <- fread(snakemake@input$crispr) -crispr <- select(crispr, -c(pair_uid, merged_uid, merged_start, merged_end)) +crispr <- select(crispr, -c(pair_uid, merged_uid, merged_start, merged_end)) %>% + filter(CellType == snakemake@params$crispr_cell_type) # load feature config file and only retain entries for features in input data config <- fread(snakemake@input$feature_table_file) diff --git a/workflow/scripts/feature_tables/process_crispr_data.R b/workflow/scripts/feature_tables/process_crispr_data.R index e382a16..ef5e45b 100755 --- a/workflow/scripts/feature_tables/process_crispr_data.R +++ b/workflow/scripts/feature_tables/process_crispr_data.R @@ 
-8,19 +8,45 @@ suppressPackageStartupMessages({ library(tibble) }) +combine_crispr_features <- function(space_sep_string) { + crispr_feature_files <- space_sep_string %>% strsplit(" ") %>% unlist() %>% trimws() + crispr_features <- lapply(crispr_feature_files, fread) + + # get common columns (e.g. features in all) + col_names_list <- lapply(crispr_features, colnames) + common_cols <- Reduce(intersect, col_names_list) + + # filter crispr features to selected columns then combine + crispr_features <- lapply(crispr_features, function(df) select(df, all_of(common_cols))) %>% + rbindlist() %>% as.data.frame() + + return(crispr_features) +} + + + # load inputs -df <- fread(snakemake@input$crispr_features) +crispr_features <- combine_crispr_features(snakemake@input$crispr_features) +crispr_missing <- combine_crispr_features(snakemake@input$crispr_missing) + + genes <- fread(snakemake@params$genes) colnames(genes) <- c('chr', 'start', 'end', 'gene', 'score', 'strand', 'Ensembl_ID', 'gene_type') # rename some column names for later utility -df = dplyr::rename(df, 'chr'='chrom', 'start'='chromStart', 'end'='chromEnd', 'TargetGene'='measuredGeneSymbol') +crispr_features <- crispr_features %>% + rename('chr'='chrom', 'start'='chromStart', 'end'='chromEnd', 'TargetGene'='measuredGeneSymbol') +crispr_missing <- crispr_missing %>% + rename('chr'='chrom', 'start'='chromStart', 'end'='chromEnd', 'TargetGene'='measuredGeneSymbol') # drop elements where Regulated=NA -df = dplyr::filter(df, Regulated!="NA") +crispr_features = filter(crispr_features, Regulated != "NA" & !is.na(Regulated)) +crispr_missing = filter(crispr_missing, Regulated != "NA" & !is.na(Regulated)) # filter to elements with target gene in gene universe (should be redundant with applying filter_genes=tss_universe in feature overlap) -df = dplyr::filter(df, TargetGene %in% genes$gene) +crispr_features = filter(crispr_features, TargetGene %in% genes$gene) +crispr_missing = filter(crispr_missing, TargetGene %in% genes$gene) # save
output to file -fwrite(df, file = snakemake@output$processed, sep = "\t", na = "NA", quote = FALSE) +fwrite(crispr_features, file = snakemake@output$processed, sep = "\t", na = "NA", quote = FALSE) +fwrite(crispr_missing, file = snakemake@output$missing, sep = "\t", na = "NA", quote = FALSE) diff --git a/workflow/scripts/model_training/compare_all_models.py b/workflow/scripts/model_training/compare_all_models.py index 9b38528..0841b47 100644 --- a/workflow/scripts/model_training/compare_all_models.py +++ b/workflow/scripts/model_training/compare_all_models.py @@ -12,7 +12,7 @@ def performance_summary( - model_id, dataset, model_name, out_dir, crispr_data="", n_boot=1000 + model_id, dataset, pred_file, missing_file, model_name, out_dir, crispr_data="", n_boot=1000 ): # read in predicitons if model_id == "distance": @@ -26,15 +26,15 @@ def performance_summary( Y_pred_all = crispr_data["distance"] * -1 pct_missing = 0 else: # normal models - pred_file = os.path.join( - out_dir, dataset, model_id, "model", "training_predictions.tsv" - ) - missing_file = os.path.join( - out_dir, - dataset, - model_id, - "missing.EPCrisprBenchmark_ensemble_data_GRCh38.K562_features_NAfilled.tsv.gz", - ) + # pred_file = os.path.join( + # out_dir, dataset, model_id, "model", "training_predictions.tsv" + # ) + # missing_file = os.path.join( + # out_dir, + # dataset, + # model_id, + # "missing.EPCrisprBenchmark_ensemble_data_GRCh38.K562_features_NAfilled.tsv.gz", + # ) pred_df = pd.read_csv(pred_file, sep="\t") missing_df = pd.read_csv(missing_file, sep="\t") @@ -101,17 +101,25 @@ def performance_summary( @click.command() +@click.option("--all_pred", required=True) +@click.option("--all_missing", required=True) @click.option("--model_config_file", required=True) @click.option("--output_file", required=True) @click.option("--crispr_data", required=True) @click.option("--out_dir", required=True) -def main(model_config_file, output_file, crispr_data, out_dir): +def main(all_pred, all_missing, 
model_config_file, output_file, crispr_data, out_dir): + all_pred = all_pred.split(" ") + all_missing = all_missing.split(" ") + model_name = "E2G" model_config = ( pd.read_table(model_config_file, na_values="") .fillna("None") .set_index("model", drop=False) ) + model_config["prediction_file"] = all_pred + model_config["missing_file"] = all_missing + # initiate final df df = pd.DataFrame( columns=[ @@ -130,7 +138,7 @@ def main(model_config_file, output_file, crispr_data, out_dir): # iterate through rows of model config and add results to final df for row in model_config.itertuples(index=False): - res_row = performance_summary(row.model, row.dataset, model_name, out_dir) + res_row = performance_summary(row.model, row.dataset, row.prediction_file, row.missing_file, model_name, out_dir) df = pd.concat([df, res_row]) # add row for distance res_row = performance_summary( From a0a10c61084911a60b96a0a893c15fb414a0e160 Mon Sep 17 00:00:00 2001 From: Maya Sheth Date: Fri, 6 Jun 2025 18:33:47 -0700 Subject: [PATCH 02/22] fix some typos --- workflow/Snakefile_training | 8 ++++---- workflow/rules/compare_models.smk | 12 ++++++------ workflow/rules/crispr_features.smk | 2 +- workflow/rules/feature_analysis.smk | 22 +++++++++++----------- workflow/rules/utils.smk | 22 ++++++++++++---------- 5 files changed, 34 insertions(+), 32 deletions(-) diff --git a/workflow/Snakefile_training b/workflow/Snakefile_training index b9b40c1..ee7366e 100755 --- a/workflow/Snakefile_training +++ b/workflow/Snakefile_training @@ -19,7 +19,7 @@ config = make_paths_absolute(config, E2G_DIR_PATH) config["results_dir"] = os.path.join(E2G_DIR_PATH, config["results_dir"]) # manually modify results dir since may not exist # define some global variables -RESULTS_DIR = os.path.join(config["results_dir"], "dataset_results") # for datasets +RESULTS_DIR = config["results_dir"] MODELS_RESULTS_DIR = os.path.join(config["results_dir"], "model_results") SCRIPTS_DIR = os.path.join(E2G_DIR_PATH, "workflow/scripts") 
CRISPR_CELL_TYPES = config["crispr_cell_types"] @@ -27,6 +27,7 @@ CRISPR_CELL_TYPES = config["crispr_cell_types"] # process configs model_config = process_model_config(model_config) model_dataset_dict = make_model_dataset_dict(model_config, dataset_config) # dictionary of dictionaries: model: {crispr_ct: dataset, ...} +print(model_dataset_dict) # import ABC submodule module ABC: @@ -50,14 +51,13 @@ ABC_BIOSAMPLES_DIR = process_abc_directory_column(model_config) # specify target files output_files = [] -output_files.extend(expand(os.path.join(MODELS_RESULTS_DIR, "{model}", "for_training.EPCrisprBenchmark_ensemble_data_GRCh38.K562_features_NAfilled.tsv.gz"), model=model_config["model"])) output_files.extend(expand(os.path.join(MODELS_RESULTS_DIR, "{model}", "model", "model_full.pkl"), model=model_config["model"])) # trained models # output_files.extend(expand(os.path.join(RESULTS_DIR, "{dataset}", "{model}", "for_training.EPCrisprBenchmark_ensemble_data_GRCh38.K562_features_NAfilled.tsv.gz"), zip, dataset=model_config["dataset"], model=model_config["model"])) # output_files.extend(expand(os.path.join(RESULTS_DIR, "{dataset}", "{model}", "model", "model_full.pkl"), zip, dataset=model_config["dataset"], model=model_config["model"])) # trained models -output_files.append(os.path.join(RESULTS_DIR, "performance_across_models.tsv")) # comparison across models -output_files.extend(expand(os.path.join(RESULTS_DIR, "performance_across_models_{metric}.pdf"), metric=["auprc", "precision"])) # plot of comparisons +output_files.append(os.path.join(MODELS_RESULTS_DIR, "performance_across_models.tsv")) # comparison across models +output_files.extend(expand(os.path.join(MODELS_RESULTS_DIR, "performance_across_models_{metric}.pdf"), metric=["auprc", "precision"])) # plot of comparisons if config["run_feature_analysis"]: output_files.extend(expand(os.path.join(MODELS_RESULTS_DIR, "{model}", "feature_analysis", "forward_feature_selection_auprc.pdf"), model=model_config["model"])) diff 
--git a/workflow/rules/compare_models.smk b/workflow/rules/compare_models.smk index 8fb446c..fe8c194 100755 --- a/workflow/rules/compare_models.smk +++ b/workflow/rules/compare_models.smk @@ -2,10 +2,10 @@ # compare cv-performance on training data across all models (note, this is not the true benchmarking performance CRISPR elements not overlapping prediction elements aren't considered) rule gather_model_performances: input: - all_predictions = expand(os.path.join(MODELS_RESULTS_DIR "{model}", "model", "training_predictions.tsv"), model=model_config["model"]), - all_missing = expand(os.path.join(MODELS_RESULTS_DIR "{model}", "combined_CRISPR_dataset.missing_from_features.{nafill}.tsv.gz"), model=model_config["model"]), + all_predictions = expand(os.path.join(MODELS_RESULTS_DIR, "{model}", "model", "training_predictions.tsv"), model=model_config["model"]), + all_missing = expand(os.path.join(MODELS_RESULTS_DIR, "{model}", "combined_CRISPR_dataset.missing_from_features.NAfilled.tsv.gz"), model=model_config["model"]), output: - comp_table = os.path.join(RESULTS_DIR, "performance_across_models.tsv") + comp_table = os.path.join(MODELS_RESULTS_DIR, "performance_across_models.tsv") params: scripts_dir = SCRIPTS_DIR, out_dir = RESULTS_DIR, @@ -28,10 +28,10 @@ rule gather_model_performances: rule plot_model_performances: input: - comp_table = os.path.join(RESULTS_DIR, "performance_across_models.tsv") + comp_table = os.path.join(MODELS_RESULTS_DIR, "performance_across_models.tsv") output: - comp_plot_auprc = os.path.join(RESULTS_DIR, "performance_across_models_auprc.pdf"), - comp_plot_prec = os.path.join(RESULTS_DIR, "performance_across_models_precision.pdf") + comp_plot_auprc = os.path.join(MODELS_RESULTS_DIR, "performance_across_models_auprc.pdf"), + comp_plot_prec = os.path.join(MODELS_RESULTS_DIR, "performance_across_models_precision.pdf") conda: "../envs/encode_re2g.yml" resources: diff --git a/workflow/rules/crispr_features.smk b/workflow/rules/crispr_features.smk 
index 8748c76..1d974f9 100644 --- a/workflow/rules/crispr_features.smk +++ b/workflow/rules/crispr_features.smk @@ -61,7 +61,7 @@ rule process_crispr_data: params: genes = config["gene_TSS500"], output: - processed = os.path.join(MODELS_RESULTS_DIR, "{model}", "for_training.combined_CRISPR_dataset.overlapping_features.{nafill}.tsv.gz") + processed = os.path.join(MODELS_RESULTS_DIR, "{model}", "for_training.combined_CRISPR_dataset.overlapping_features.{nafill}.tsv.gz"), missing = os.path.join(MODELS_RESULTS_DIR, "{model}", "combined_CRISPR_dataset.missing_from_features.{nafill}.tsv.gz") conda: "../envs/encode_re2g.yml" diff --git a/workflow/rules/feature_analysis.smk b/workflow/rules/feature_analysis.smk index afad160..0ba62e0 100755 --- a/workflow/rules/feature_analysis.smk +++ b/workflow/rules/feature_analysis.smk @@ -72,13 +72,13 @@ rule calculate_backward_feature_selection: rule plot_backward_feature_selection: input: - results = os.path.join(MODELS_RESULTS_DIR, "{dataset}", "{model}", "feature_analysis", "backward_feature_selection.tsv"), + results = os.path.join(MODELS_RESULTS_DIR, "{model}", "feature_analysis", "backward_feature_selection.tsv"), feature_table = lambda wildcards: model_config.loc[wildcards.model, 'feature_table'] params: polynomial = lambda wildcards: model_config.loc[wildcards.model, 'polynomial'] output: - out_auprc = os.path.join(MODELS_RESULTS_DIR, "{dataset}", "{model}", "feature_analysis", "backward_feature_selection_auprc.pdf"), - out_prec = os.path.join(MODELS_RESULTS_DIR, "{dataset}", "{model}", "feature_analysis", "backward_feature_selection_precision.pdf") + out_auprc = os.path.join(MODELS_RESULTS_DIR, "{model}", "feature_analysis", "backward_feature_selection_auprc.pdf"), + out_prec = os.path.join(MODELS_RESULTS_DIR, "{model}", "feature_analysis", "backward_feature_selection_precision.pdf") conda: "../envs/encode_re2g.yml" resources: @@ -95,9 +95,9 @@ rule compare_all_feature_sets: params: epsilon = config["epsilon"], scripts_dir = 
SCRIPTS_DIR, - out_dir = os.path.join(MODELS_RESULTS_DIR "{model}", "feature_analysis") + out_dir = os.path.join(MODELS_RESULTS_DIR, "{model}", "feature_analysis") output: - results = os.path.join(MODELS_RESULTS_DIR "{model}", "feature_analysis", "all_feature_sets.tsv"), + results = os.path.join(MODELS_RESULTS_DIR, "{model}", "feature_analysis", "all_feature_sets.tsv"), conda: "../envs/encode_re2g.yml" resources: @@ -118,15 +118,15 @@ rule calculate_permutation_feature_importance: input: crispr_features_processed = os.path.join(MODELS_RESULTS_DIR, "{model}", "for_training.combined_CRISPR_dataset.overlapping_features.NAfilled.tsv.gz"), feature_table = lambda wildcards: model_config.loc[wildcards.model, 'feature_table'], - model_params = os.path.join(MODELS_RESULTS_DIR "{model}", "model", "training_params.pkl") + model_params = os.path.join(MODELS_RESULTS_DIR, "{model}", "model", "training_params.pkl") params: epsilon = config["epsilon"], n_repeats = 20, scripts_dir = SCRIPTS_DIR, polynomial = lambda wildcards: model_config.loc[wildcards.model, 'polynomial'], - out_dir = os.path.join(MODELS_RESULTS_DIR "{model}", "feature_analysis") + out_dir = os.path.join(MODELS_RESULTS_DIR, "{model}", "feature_analysis") output: - results = os.path.join(MODELS_RESULTS_DIR "{model}", "feature_analysis", "permutation_feature_importance.tsv"), + results = os.path.join(MODELS_RESULTS_DIR, "{model}", "feature_analysis", "permutation_feature_importance.tsv"), conda: "../envs/encode_re2g.yml" resources: @@ -145,14 +145,14 @@ rule calculate_permutation_feature_importance: rule plot_permutation_feature_importance: input: - results = os.path.join(MODELS_RESULTS_DIR "{model}", "feature_analysis", "permutation_feature_importance.tsv"), + results = os.path.join(MODELS_RESULTS_DIR, "{model}", "feature_analysis", "permutation_feature_importance.tsv"), feature_table = lambda wildcards: model_config.loc[wildcards.model, 'feature_table'] params: polynomial = lambda wildcards: 
model_config.loc[wildcards.model, 'polynomial'], n_repeats = 20 output: - out_auprc = os.path.join(MODELS_RESULTS_DIR "{model}", "feature_analysis", "permutation_feature_importance_auprc.pdf"), - out_prec = os.path.join(MODELS_RESULTS_DIR "{model}", "feature_analysis", "permutation_feature_importance_precision.pdf") + out_auprc = os.path.join(MODELS_RESULTS_DIR, "{model}", "feature_analysis", "permutation_feature_importance_auprc.pdf"), + out_prec = os.path.join(MODELS_RESULTS_DIR, "{model}", "feature_analysis", "permutation_feature_importance_precision.pdf") conda: "../envs/encode_re2g.yml" resources: diff --git a/workflow/rules/utils.smk b/workflow/rules/utils.smk index 129a6fd..55d487a 100755 --- a/workflow/rules/utils.smk +++ b/workflow/rules/utils.smk @@ -49,13 +49,13 @@ def make_model_dataset_dict(model_config, dataset_config): # make sure 1:1 correspondance with CRISPR cell types if sorted(CRISPR_CELL_TYPES) != sorted(mapped_cts): print(f"Model datasets: {model_datasets} -> cell types: {mapped_cts}") - raise Exception(f"Datasets specified for {row["model"]} do not map to all CRISPR cell types.") + raise Exception(f"Datasets specified for {row['model']} do not map to all CRISPR cell types.") - this_model_dict = dict(zip(model_datasets, mapped_cts)) + this_model_dict = dict(zip(mapped_cts, model_datasets)) model_dicts.append(this_model_dict) # combine into dictionary of dicitonaries - model_dataset_dict = dict(zip(model_config["model"]), model_dicts) + model_dataset_dict = dict(zip(model_config["model"], model_dicts)) return model_dataset_dict @@ -146,13 +146,15 @@ def process_abc_directory_column(model_config): # Make a dictionary of dataset:ABC_dir pairs ABC_BIOSAMPLES_DIR = {} for row in model_config.itertuples(index=False): - if row.dataset not in ABC_BIOSAMPLES_DIR: - if row.ABC_directory=="None": # ABC directory is not provided - if row.dataset not in dataset_config['biosample']: # is there info to run ABC? 
- raise Exception(f"Dataset {row.dataset} not specified in dataset_config.") - ABC_BIOSAMPLES_DIR[row.dataset] = os.path.join(RESULTS_DIR, row.dataset) - else: # ABC directory is provided - ABC_BIOSAMPLES_DIR[row.dataset] = row.ABC_directory + model_datasets = [item.strip() for item in row.dataset.split(",")] + for ds in model_datasets: + if ds not in ABC_BIOSAMPLES_DIR: + if row.ABC_directory=="None": # ABC directory is not provided + if ds not in dataset_config['biosample']: # is there info to run ABC? + raise Exception(f"Dataset {ds} not specified in dataset_config.") + ABC_BIOSAMPLES_DIR[ds] = os.path.join(RESULTS_DIR, ds) + else: # ABC directory is provided + ABC_BIOSAMPLES_DIR[ds] = ds return ABC_BIOSAMPLES_DIR From fc8466182cb1fed834ea6db579f737265ac00331 Mon Sep 17 00:00:00 2001 From: Maya Sheth Date: Sat, 7 Jun 2025 17:57:24 -0700 Subject: [PATCH 03/22] allow for multiple crispr datasets --- workflow/rules/compare_models.smk | 2 +- workflow/rules/crispr_features.smk | 13 +++---- workflow/rules/feature_analysis.smk | 8 ++--- workflow/rules/train_model.smk | 4 +-- workflow/rules/utils.smk | 4 ++- .../feature_tables/combine_feature_tables.R | 4 +-- .../gen_num_candidate_enh_gene.py | 2 +- .../gen_num_sum_nearby_enhancers.py | 12 ++++--- .../feature_tables/gen_num_tss_enh_gene.py | 2 +- .../overlap_features_with_crispr_data.R | 4 +-- .../feature_tables/process_crispr_data.R | 2 +- .../compare_all_feature_sets.py | 34 +++++++++++++------ .../model_training/compare_all_models.py | 2 +- .../scripts/model_training/train_model.py | 15 ++++---- 14 files changed, 65 insertions(+), 43 deletions(-) diff --git a/workflow/rules/compare_models.smk b/workflow/rules/compare_models.smk index fe8c194..b684cbc 100755 --- a/workflow/rules/compare_models.smk +++ b/workflow/rules/compare_models.smk @@ -3,7 +3,7 @@ rule gather_model_performances: input: all_predictions = expand(os.path.join(MODELS_RESULTS_DIR, "{model}", "model", "training_predictions.tsv"), 
model=model_config["model"]), - all_missing = expand(os.path.join(MODELS_RESULTS_DIR, "{model}", "combined_CRISPR_dataset.missing_from_features.NAfilled.tsv.gz"), model=model_config["model"]), + all_missing = expand(os.path.join(MODELS_RESULTS_DIR, "{model}", "merged_CRISPR_dataset.missing_from_features.NAfilled.tsv.gz"), model=model_config["model"]), output: comp_table = os.path.join(MODELS_RESULTS_DIR, "performance_across_models.tsv") params: diff --git a/workflow/rules/crispr_features.smk b/workflow/rules/crispr_features.smk index 1d974f9..b7a9e3a 100644 --- a/workflow/rules/crispr_features.smk +++ b/workflow/rules/crispr_features.smk @@ -33,13 +33,13 @@ rule overlap_features_crispr_for_dataset: input: features = os.path.join(RESULTS_DIR, "{dataset}", "genomewide_features.tsv.gz"), feature_table_file = os.path.join(RESULTS_DIR, "{dataset}", "feature_table.tsv"), - crispr = config['crispr_dataset'], + crispr = lambda wildcards: config['crispr_dataset'][wildcards.crispr_dataset], tss = config['gene_TSS500'] params: crispr_cell_type = lambda wildcards: dataset_config.loc[wildcards.dataset, "crispr_cell_type"] output: - features = os.path.join(RESULTS_DIR, "{dataset}", "CRISPR_dataset.overlapping_features.{nafill}.tsv.gz"), - missing = os.path.join(RESULTS_DIR, "{dataset}", "CRISPR_dataset.missing_from_features.{nafill}.tsv.gz") + features = os.path.join(RESULTS_DIR, "{dataset}", "CRISPR_dataset_{crispr_dataset}.overlapping_features.{nafill}.tsv.gz"), + missing = os.path.join(RESULTS_DIR, "{dataset}", "CRISPR_dataset_{crispr_dataset}.missing_from_features.{nafill}.tsv.gz") conda: "../envs/encode_re2g.yml" resources: @@ -51,7 +51,8 @@ rule overlap_features_crispr_for_dataset: # note: we use the NAfilled CRISPR feature data here! 
def get_crispr_files_for_model(wildcards, file_type): datasets = model_dataset_dict[wildcards.model].values() - files = [os.path.join(RESULTS_DIR, ds, f"CRISPR_dataset.{file_type}.{wildcards.nafill}.tsv.gz") for ds in datasets] + crispr_dataset = model_config.loc[wildcards.model, "crispr_dataset"] + files = [os.path.join(RESULTS_DIR, ds, f"CRISPR_dataset_{crispr_dataset}.{file_type}.{wildcards.nafill}.tsv.gz") for ds in datasets] return files rule process_crispr_data: @@ -61,8 +62,8 @@ rule process_crispr_data: params: genes = config["gene_TSS500"], output: - processed = os.path.join(MODELS_RESULTS_DIR, "{model}", "for_training.combined_CRISPR_dataset.overlapping_features.{nafill}.tsv.gz"), - missing = os.path.join(MODELS_RESULTS_DIR, "{model}", "combined_CRISPR_dataset.missing_from_features.{nafill}.tsv.gz") + processed = os.path.join(MODELS_RESULTS_DIR, "{model}", "for_training.merged_CRISPR_dataset.overlapping_features.{nafill}.tsv.gz"), + missing = os.path.join(MODELS_RESULTS_DIR, "{model}", "merged_CRISPR_dataset.missing_from_features.{nafill}.tsv.gz") conda: "../envs/encode_re2g.yml" resources: diff --git a/workflow/rules/feature_analysis.smk b/workflow/rules/feature_analysis.smk index 0ba62e0..3f6925e 100755 --- a/workflow/rules/feature_analysis.smk +++ b/workflow/rules/feature_analysis.smk @@ -1,7 +1,7 @@ # forward sequential feature selection rule calculate_forward_feature_selection: input: - crispr_features_processed = os.path.join(MODELS_RESULTS_DIR, "{model}", "for_training.combined_CRISPR_dataset.overlapping_features.NAfilled.tsv.gz"), + crispr_features_processed = os.path.join(MODELS_RESULTS_DIR, "{model}", "for_training.merged_CRISPR_dataset.overlapping_features.NAfilled.tsv.gz"), feature_table = lambda wildcards: model_config.loc[wildcards.model, 'feature_table'], model_params = os.path.join(MODELS_RESULTS_DIR, "{model}", "model", "training_params.pkl") params: @@ -45,7 +45,7 @@ rule plot_forward_feature_selection: # backward sequential feature 
selection rule calculate_backward_feature_selection: input: - crispr_features_processed = os.path.join(MODELS_RESULTS_DIR, "{model}", "for_training.combined_CRISPR_dataset.overlapping_features.NAfilled.tsv.gz"), + crispr_features_processed = os.path.join(MODELS_RESULTS_DIR, "{model}", "for_training.merged_CRISPR_dataset.overlapping_features.NAfilled.tsv.gz"), feature_table = lambda wildcards: model_config.loc[wildcards.model, 'feature_table'], model_params = os.path.join(MODELS_RESULTS_DIR, "{model}", "model", "training_params.pkl") params: @@ -89,7 +89,7 @@ rule plot_backward_feature_selection: # compare all features sets rule compare_all_feature_sets: input: - crispr_features_processed = os.path.join(MODELS_RESULTS_DIR, "{model}", "for_training.combined_CRISPR_dataset.overlapping_features.NAfilled.tsv.gz"), + crispr_features_processed = os.path.join(MODELS_RESULTS_DIR, "{model}", "for_training.merged_CRISPR_dataset.overlapping_features.NAfilled.tsv.gz"), feature_table = lambda wildcards: model_config.loc[wildcards.model, 'feature_table'], model_params = os.path.join(MODELS_RESULTS_DIR, "{model}", "model", "training_params.pkl") params: @@ -116,7 +116,7 @@ rule compare_all_feature_sets: # permuation feature importance rule calculate_permutation_feature_importance: input: - crispr_features_processed = os.path.join(MODELS_RESULTS_DIR, "{model}", "for_training.combined_CRISPR_dataset.overlapping_features.NAfilled.tsv.gz"), + crispr_features_processed = os.path.join(MODELS_RESULTS_DIR, "{model}", "for_training.merged_CRISPR_dataset.overlapping_features.NAfilled.tsv.gz"), feature_table = lambda wildcards: model_config.loc[wildcards.model, 'feature_table'], model_params = os.path.join(MODELS_RESULTS_DIR, "{model}", "model", "training_params.pkl") params: diff --git a/workflow/rules/train_model.smk b/workflow/rules/train_model.smk index e5a3e15..4f8093b 100644 --- a/workflow/rules/train_model.smk +++ b/workflow/rules/train_model.smk @@ -21,14 +21,14 @@ rule 
generate_model_params: # generate trained model and cross-validated predictions on CRISPR data rule train_model: input: - crispr_features_processed = os.path.join(MODELS_RESULTS_DIR, "{model}", "for_training.combined_CRISPR_dataset.overlapping_features.NAfilled.tsv.gz"), + crispr_features_processed = os.path.join(MODELS_RESULTS_DIR, "{model}", "for_training.merged_CRISPR_dataset.overlapping_features.NAfilled.tsv.gz"), feature_table = lambda wildcards: model_config.loc[wildcards.model, 'feature_table'], model_params = os.path.join(MODELS_RESULTS_DIR, "{model}", "model", "training_params.pkl") params: epsilon = config["epsilon"], scripts_dir = SCRIPTS_DIR, polynomial = lambda wildcards: model_config.loc[wildcards.model, 'polynomial'], - out_dir = os.path.join(MODELS_RESULTS_DIR, "{model}") + out_dir = os.path.join(MODELS_RESULTS_DIR, "{model}", "model") output: trained_model = os.path.join(MODELS_RESULTS_DIR, "{model}", "model", "model_full.pkl"), pred = os.path.join(MODELS_RESULTS_DIR, "{model}", "model", "training_predictions.tsv"), diff --git a/workflow/rules/utils.smk b/workflow/rules/utils.smk index 55d487a..f82287c 100755 --- a/workflow/rules/utils.smk +++ b/workflow/rules/utils.smk @@ -47,7 +47,9 @@ def make_model_dataset_dict(model_config, dataset_config): mapped_cts = [dataset_ct_dict[ds] for ds in model_datasets] # make sure 1:1 correspondance with CRISPR cell types - if sorted(CRISPR_CELL_TYPES) != sorted(mapped_cts): + crispr_data_cell_types = config["crispr_cell_types"][row["crispr_dataset"]] + + if sorted(crispr_data_cell_types) != sorted(mapped_cts): print(f"Model datasets: {model_datasets} -> cell types: {mapped_cts}") raise Exception(f"Datasets specified for {row['model']} do not map to all CRISPR cell types.") diff --git a/workflow/scripts/feature_tables/combine_feature_tables.R b/workflow/scripts/feature_tables/combine_feature_tables.R index cdee111..0268b49 100644 --- a/workflow/scripts/feature_tables/combine_feature_tables.R +++ 
b/workflow/scripts/feature_tables/combine_feature_tables.R @@ -4,12 +4,12 @@ library(dplyr) # inputs from snakemake model_config = fread(snakemake@input$model_config) -ds = snakemake@wildcards$dataset +ds = snakemake@wildcards$dataset # merge feature tables for models with this dataset ft_files = c() for (i in 1:nrow(model_config)){ - model_datasets = model_config$dataset[i] %>% strsplit(",") %>% trimws() + model_datasets = model_config$dataset[i] %>% strsplit(",") %>% unlist() %>% trimws() if (ds %in% model_datasets) { ft_files = c(ft_files, model_config$feature_table[i]) } diff --git a/workflow/scripts/feature_tables/gen_num_candidate_enh_gene.py b/workflow/scripts/feature_tables/gen_num_candidate_enh_gene.py index f1ecca0..6ea4300 100644 --- a/workflow/scripts/feature_tables/gen_num_candidate_enh_gene.py +++ b/workflow/scripts/feature_tables/gen_num_candidate_enh_gene.py @@ -44,7 +44,7 @@ def determine_num_candidate_enh_gene(pred_df, out_file): @click.option("--abc_predictions") @click.option("--out_file") def main(abc_predictions, out_file): - pred_df = pd.read_csv(abc_predictions, sep="\t") + pred_df = pd.read_csv(abc_predictions, sep="\t", compression="gzip") if len(pred_df) == 0: raise Exception("Did not find any enhancers in the Predictions file") diff --git a/workflow/scripts/feature_tables/gen_num_sum_nearby_enhancers.py b/workflow/scripts/feature_tables/gen_num_sum_nearby_enhancers.py index d1aab36..942fcca 100644 --- a/workflow/scripts/feature_tables/gen_num_sum_nearby_enhancers.py +++ b/workflow/scripts/feature_tables/gen_num_sum_nearby_enhancers.py @@ -35,11 +35,13 @@ def generate_num_sum_enhancers( ) # select columns from EnhancerPredictionsAllPutative - os.system( - "zcat {} | csvtk cut -t -f chr,start,end,name,activity_base | sed '1d' > {}".format( - pred_file, pred_slim - ) - ) + df = pd.read_csv(pred_file, sep="\t", usecols=["chr", "start", "end", "name", "activity_base"], compression= "gzip") + df.to_csv(pred_slim, sep="\t", index=False, 
header=False) + # os.system( + # "zcat {} | csvtk cut -t -f chr,start,end,name,activity_base | sed '1d' > {}".format( + # pred_file, pred_slim + # ) + # ) # intersect with expanded enhancer regions and count n enhancers os.system( diff --git a/workflow/scripts/feature_tables/gen_num_tss_enh_gene.py b/workflow/scripts/feature_tables/gen_num_tss_enh_gene.py index 211a2bb..4643858 100644 --- a/workflow/scripts/feature_tables/gen_num_tss_enh_gene.py +++ b/workflow/scripts/feature_tables/gen_num_tss_enh_gene.py @@ -53,7 +53,7 @@ def determine_num_tss_enh_gene( @click.option("--enhancer_tss_int") @click.option("--out_file") def main(abc_predictions, ref_gene_tss, extended_enhancers, enhancer_tss_int, out_file): - pred_df = pd.read_csv(abc_predictions, sep="\t") + pred_df = pd.read_csv(abc_predictions, sep="\t", compression="gzip") if len(pred_df) == 0: raise Exception("Did not find any enhancers in the Predictions file") diff --git a/workflow/scripts/feature_tables/overlap_features_with_crispr_data.R b/workflow/scripts/feature_tables/overlap_features_with_crispr_data.R index abf57f9..ca470d9 100755 --- a/workflow/scripts/feature_tables/overlap_features_with_crispr_data.R +++ b/workflow/scripts/feature_tables/overlap_features_with_crispr_data.R @@ -57,7 +57,7 @@ merge_feature_to_crispr <- function(crispr, features, feature_score_cols, agg_fu output <- merged # sort output by cre position - output <- output[order(dataset, chrom, chromStart, chromEnd, measuredGeneSymbol), ] + output <- output[order(chrom, chromStart, chromEnd, measuredGeneSymbol), ] return(list(merged=output, missing=missing)) } @@ -83,7 +83,7 @@ features <- fread(snakemake@input$features) # load crispri data and only retain relevant columns and filter to cell type crispr <- fread(snakemake@input$crispr) -crispr <- select(crispr, -c(pair_uid, merged_uid, merged_start, merged_end)) %>% +crispr <- select(crispr, -any_of(c("pair_uid", "merged_uid", "merged_start", "merged_end"))) %>% filter(CellType == 
snakemake@params$crispr_cell_type) # load feature config file and only retain entries for features in input data diff --git a/workflow/scripts/feature_tables/process_crispr_data.R b/workflow/scripts/feature_tables/process_crispr_data.R index ef5e45b..73f070c 100755 --- a/workflow/scripts/feature_tables/process_crispr_data.R +++ b/workflow/scripts/feature_tables/process_crispr_data.R @@ -9,7 +9,7 @@ suppressPackageStartupMessages({ }) combine_crispr_features <- function(space_sep_string) { - crispr_feature_files <- space_sep_list %>% strsplit(" ") %>% trimws() + crispr_feature_files <- space_sep_string %>% strsplit(" ") %>% unlist() %>% trimws() crispr_features <- lapply(crispr_feature_files, fread) # get common columns (e.g. features in all) diff --git a/workflow/scripts/model_training/compare_all_feature_sets.py b/workflow/scripts/model_training/compare_all_feature_sets.py index fc84e78..04546c1 100644 --- a/workflow/scripts/model_training/compare_all_feature_sets.py +++ b/workflow/scripts/model_training/compare_all_feature_sets.py @@ -5,11 +5,9 @@ import scipy from training_functions import ( statistic_aupr, - statistic_precision, + statistic_precision_at_threshold, train_and_predict_once, - bootstrap_pvalue, - statistic_delta_aupr, - statistic_delta_precision, + threshold_70_pct_recall ) @@ -32,10 +30,13 @@ def compare_feature_sets(df_dataset, feature_table, epsilon, params, n_boot): df.loc[len(df)] = fill_zeros df["features"] = df.apply(lambda row: list(df.columns[row == 1]), axis=1) - df["n_features"] = 0 - df["AUPRC"] = 0 - df["AUPRC_95CI_low"] = 0 - df["AUPRC_95CI_high"] = 0 + df["n_features"] = 0.0 + df["AUPRC"] = 0.0 + df["AUPRC_95CI_low"] = 0.0 + df["AUPRC_95CI_high"] = 0.0 + df["precision_70pct_recall"] = 0.0 + df["precision_95CI_low"] = 0.0 + df["precision_95CI_high"] = 0.0 for i in range(len(df)): # iterate through sets model_name = "row_" + str(i) @@ -48,9 +49,8 @@ def compare_feature_sets(df_dataset, feature_table, epsilon, params, n_boot): ) Y_pred 
= df_dataset[model_name + ".Score"] - data = (Y_true, Y_pred) res_aupr = scipy.stats.bootstrap( - data, + (Y_true, Y_pred), statistic_aupr, n_resamples=n_boot, paired=True, @@ -61,6 +61,20 @@ def compare_feature_sets(df_dataset, feature_table, epsilon, params, n_boot): df.loc[i, "AUPRC_95CI_low"] = res_aupr.confidence_interval[0] df.loc[i, "AUPRC_95CI_high"] = res_aupr.confidence_interval[1] + thresh = threshold_70_pct_recall(Y_true, Y_pred) + + res_prec = scipy.stats.bootstrap( + (Y_true, Y_pred), + lambda Y_true, Y_pred: statistic_precision_at_threshold(Y_true, Y_pred, thresh), + n_resamples=n_boot, + paired=True, + confidence_level=0.95, + method="BCa", + ) + df.loc[i, "precision_70pct_recall"] = np.mean(res_prec.bootstrap_distribution) + df.loc[i, "precision_95CI_low"] = res_prec.confidence_interval[0] + df.loc[i, "precision_95CI_high"] = res_prec.confidence_interval[1] + # sort table by AUPRC df = df.sort_values(by="AUPRC", ascending=False) diff --git a/workflow/scripts/model_training/compare_all_models.py b/workflow/scripts/model_training/compare_all_models.py index 0841b47..4db2672 100644 --- a/workflow/scripts/model_training/compare_all_models.py +++ b/workflow/scripts/model_training/compare_all_models.py @@ -142,7 +142,7 @@ def main(all_pred, all_missing, model_config_file, output_file, crispr_data, out df = pd.concat([df, res_row]) # add row for distance res_row = performance_summary( - "distance", "baseline", "", out_dir, crispr_data=crispr_data + "distance", "baseline", "", "", "", out_dir, crispr_data=crispr_data ) df = pd.concat([df, res_row]) diff --git a/workflow/scripts/model_training/train_model.py b/workflow/scripts/model_training/train_model.py index 653104f..8d88ef4 100644 --- a/workflow/scripts/model_training/train_model.py +++ b/workflow/scripts/model_training/train_model.py @@ -25,6 +25,7 @@ def train_and_predict( polynomial_names = poly.get_feature_names_out(feature_list_core) X = pd.DataFrame(X, columns=polynomial_names) + Y = 
df_dataset["Regulated"].values.astype(np.int64) # initialize df for feature weights & metrics @@ -88,32 +89,34 @@ def train_and_predict( n_test_pos = np.sum(Y_test) n_test_neg = len(Y_test) - n_test_pos + multiple_labels = n_test_pos > 0 and n_test_neg > 0 + ll_train = log_loss(Y_train, model.predict_proba(X_train)[:, 1]) ll_test_full = ( log_loss(Y_test, probs_full[idx_test, 1]) - if n_test_pos > 0 + if multiple_labels else np.NaN ) - ll_test = log_loss(Y_test, probs[:, 1]) if n_test_pos > 0 else np.NaN + ll_test = log_loss(Y_test, probs[:, 1]) if multiple_labels else np.NaN auroc_train = roc_auc_score(Y_train, model.predict_proba(X_train)[:, 1]) auroc_test_full = ( roc_auc_score(Y_test, probs_full[idx_test, 1]) - if n_test_pos > 0 + if multiple_labels else np.NaN ) auroc_test = ( - roc_auc_score(Y_test, probs[:, 1]) if n_test_pos > 0 else np.NaN + roc_auc_score(Y_test, probs[:, 1]) if multiple_labels > 0 else np.NaN ) auprc_train = statistic_aupr( Y_train, model.predict_proba(X_train)[:, 1] ) auprc_test_full = ( statistic_aupr(Y_test, probs_full[idx_test, 1]) - if n_test_pos > 0 + if multiple_labels else np.NaN ) auprc_test = ( - statistic_aupr(Y_test, probs[:, 1]) if n_test_pos > 0 else np.NaN + statistic_aupr(Y_test, probs[:, 1]) if multiple_labels else np.NaN ) df_temp = pd.DataFrame( From 88396be9646278bcaed0ed9ab83140b1f95b91ef Mon Sep 17 00:00:00 2001 From: Maya Sheth Date: Sat, 7 Jun 2025 18:17:33 -0700 Subject: [PATCH 04/22] update snakefile --- workflow/Snakefile_training | 1 - 1 file changed, 1 deletion(-) diff --git a/workflow/Snakefile_training b/workflow/Snakefile_training index ee7366e..c844bdd 100755 --- a/workflow/Snakefile_training +++ b/workflow/Snakefile_training @@ -22,7 +22,6 @@ config["results_dir"] = os.path.join(E2G_DIR_PATH, config["results_dir"]) # manu RESULTS_DIR = config["results_dir"] MODELS_RESULTS_DIR = os.path.join(config["results_dir"], "model_results") SCRIPTS_DIR = os.path.join(E2G_DIR_PATH, "workflow/scripts") 
-CRISPR_CELL_TYPES = config["crispr_cell_types"] # process configs model_config = process_model_config(model_config) From 9fa5b1172ce5f604c37ab9766499dcc96985d4de Mon Sep 17 00:00:00 2001 From: Maya Sheth Date: Sun, 8 Jun 2025 19:31:28 -0700 Subject: [PATCH 05/22] update to fully working --- workflow/rules/compare_models.smk | 6 +++-- workflow/rules/genomewide_features.smk | 2 +- .../model_training/compare_all_models.py | 26 +++++++++++-------- 3 files changed, 20 insertions(+), 14 deletions(-) diff --git a/workflow/rules/compare_models.smk b/workflow/rules/compare_models.smk index b684cbc..d6ce2ed 100755 --- a/workflow/rules/compare_models.smk +++ b/workflow/rules/compare_models.smk @@ -10,7 +10,8 @@ rule gather_model_performances: scripts_dir = SCRIPTS_DIR, out_dir = RESULTS_DIR, model_config_file = config["model_config"], - crispr_dataset = config["crispr_dataset"] + crispr_dataset_names = [n for n in model_config["crispr_dataset"].unique()], + crispr_dataset = lambda wildcards: [config["crispr_dataset"][cd] for cd in model_config["crispr_dataset"].unique()] conda: "../envs/encode_re2g.yml" resources: @@ -22,7 +23,8 @@ rule gather_model_performances: --all_missing "{input.all_missing}" \ --model_config_file {params.model_config_file} \ --output_file {output.comp_table} \ - --crispr_data {params.crispr_dataset} \ + --crispr_names "{params.crispr_dataset_names}" \ + --crispr_data "{params.crispr_dataset}" \ --out_dir {params.out_dir} """ diff --git a/workflow/rules/genomewide_features.smk b/workflow/rules/genomewide_features.smk index 8afe56e..0e7c8f8 100644 --- a/workflow/rules/genomewide_features.smk +++ b/workflow/rules/genomewide_features.smk @@ -172,7 +172,7 @@ rule add_external_features: input: predictions_extended = os.path.join(RESULTS_DIR, "{biosample}", "ActivityOnly_features.tsv.gz"), feature_table_file = os.path.join(RESULTS_DIR, "{biosample}", "feature_table.tsv"), - external_features_config = ancient(os.path.join(RESULTS_DIR, "{biosample}", 
"external_features_config.tsv")) + external_features_config = os.path.join(RESULTS_DIR, "{biosample}", "external_features_config.tsv") output: plus_external_features = os.path.join(RESULTS_DIR, "{biosample}", "ActivityOnly_plus_external_features.tsv.gz") conda: diff --git a/workflow/scripts/model_training/compare_all_models.py b/workflow/scripts/model_training/compare_all_models.py index 4db2672..b9244c8 100644 --- a/workflow/scripts/model_training/compare_all_models.py +++ b/workflow/scripts/model_training/compare_all_models.py @@ -15,7 +15,7 @@ def performance_summary( model_id, dataset, pred_file, missing_file, model_name, out_dir, crispr_data="", n_boot=1000 ): # read in predicitons - if model_id == "distance": + if model_id.startswith("distance"): crispr_data = pd.read_csv(crispr_data, sep="\t") crispr_data["distance"] = np.abs( (crispr_data["chromStart"] + crispr_data["chromEnd"]) / 2 @@ -106,10 +106,12 @@ def performance_summary( @click.option("--model_config_file", required=True) @click.option("--output_file", required=True) @click.option("--crispr_data", required=True) +@click.option("--crispr_names", required=True) @click.option("--out_dir", required=True) -def main(all_pred, all_missing, model_config_file, output_file, crispr_data, out_dir): - all_pred = all_pred.split(" ") - all_missing = all_missing.split(" ") +def main(all_pred, all_missing, model_config_file, output_file, crispr_data, crispr_names, out_dir): + all_pred_files = all_pred.split(" ") + all_missing_files = all_missing.split(" ") + crispr_dict = dict(zip(crispr_names.split(" "), crispr_data.split(" "))) model_name = "E2G" model_config = ( @@ -117,8 +119,8 @@ def main(all_pred, all_missing, model_config_file, output_file, crispr_data, out .fillna("None") .set_index("model", drop=False) ) - model_config["prediction_file"] = all_pred - model_config["missing_file"] = all_missing + model_config["prediction_file"] = all_pred_files + model_config["missing_file"] = all_missing_files # initiate 
final df df = pd.DataFrame( @@ -140,11 +142,13 @@ def main(all_pred, all_missing, model_config_file, output_file, crispr_data, out for row in model_config.itertuples(index=False): res_row = performance_summary(row.model, row.dataset, row.prediction_file, row.missing_file, model_name, out_dir) df = pd.concat([df, res_row]) - # add row for distance - res_row = performance_summary( - "distance", "baseline", "", "", "", out_dir, crispr_data=crispr_data - ) - df = pd.concat([df, res_row]) + + for key, file_path in crispr_dict.items(): + # add row(s) for distance + res_row = performance_summary( + f"distance_{key}", "baseline", "", "", "", out_dir, crispr_data=file_path + ) + df = pd.concat([df, res_row]) # sort table by AUPRC df = df.sort_values(by="AUPRC", ascending=False) From e8bc14af134513d0b543e4ad13e962d03b53f892 Mon Sep 17 00:00:00 2001 From: Maya Sheth Date: Mon, 9 Jun 2025 08:00:58 -0700 Subject: [PATCH 06/22] update readme and example configs" --- README.md | 8 +++++--- config/config_biosamples_chr22.tsv | 4 ++-- config/config_models_test.tsv | 4 ++-- config/config_training.yaml | 5 ++++- 4 files changed, 13 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index f5e5255..f8fb8bf 100755 --- a/README.md +++ b/README.md @@ -49,16 +49,18 @@ The way we choose the model depends on the biosamples input. The code for model ## Train model -**Important: Only train models for biosamples matching the corresponding CRISPR data (in this case, K562)** +**Important: Only train models for biosamples matching the corresponding CRISPR data** - Much of the the model training code was adapted from Alireza Karbalayghareh's [original implementation](https://github.com/karbalayghareh/ENCODE-E2G). 
Modify `config/config_training.yaml` with your model and dataset configs -- `model_config` has columns: model, dataset, ABC_directory, feature_table, polynomial (do you want to use polynomial features?), and override_params (are there model training parameters you would like to change from the default logistic regression settings specfied in `config/config_training.yaml`?) +- `model_config` has columns: model, dataset, crispr_dataset, feature_table, polynomial (do you want to use polynomial features?), and override_params (are there model training parameters you would like to change from the default logistic regression settings specfied in `config/config_training.yaml`?) - See [this example](https://pastebin.com/zt1868R3) `model_config` for how to specfiy override parameters. If there are no override_params, leave the column blank but still include the header. + - The `dataset` column is a comma-separated list with values corresponding to biosaples in `dataset_config`. Datasets must be provided for each `crispr_cell_type` included in the corresponding `crispr_dataset`. + - The `crispr_dataset` column corresponds to a key under `crispr_dataset` in `config_training`. - Feature tables must be specified for each model (example: `resources/feature_tables`) with columns: feature (name in final table), input_col (name in ABC output), second_input (multiplied by input_col if provided), aggregate_function (how to combine feature values when a CRISPR element overlaps more than one ABC element), fill_value (how to replace NAs), nice_name (used when plotting) - Note that trained models generated using polynomial features cannot directly be used in the **Apply model** workflow - `dataset_config` is an ABC biosamples config to generate ABC predictions for datasets without an existing ABC directory. -- Each dataset must correspond to a unique ABC_directory, with "biosample" in `dataset_config` equals "dataset" in `model_config`. 
If no ABC_directory is indicated in `model_config`, it must have an entry in `dataset_config`. + Each dataset must correspond to a "biosample" in `dataset_config` equals "dataset" in `model_config`. The column `crispr_cell_type` indicates the which cell type in the CRISPR data corresponds to this biosample. - If you are including features in addition to those generated within the pipeline (e.g. a value in input_col or second_input of a feature table is not included in `reference_features` in `config/config_training/yaml`), you must also define how to add these values with an external_features_config, which you include in `dataset_config` in the optional column external_features_config: - An `external_features_config` has columns feature (corresponding to the missing input_col or second_input value), source_col (column name in the source file), aggregate_function (how to combine values when merging different element definitions), join_by, and source_file - join_by must be either "TargetGene" (feature is defined per gene) or "overlap" (feature is defined per element-gene pair) diff --git a/config/config_biosamples_chr22.tsv b/config/config_biosamples_chr22.tsv index b9559d7..670633b 100755 --- a/config/config_biosamples_chr22.tsv +++ b/config/config_biosamples_chr22.tsv @@ -1,2 +1,2 @@ -biosample DHS ATAC H3K27ac default_accessibility_feature HiC_file HiC_type HiC_resolution alt_TSS alt_genes -K562_chr22 ABC/example_chr/chr22/ENCFF860XAE.chr22.sorted.se.bam DHS https://www.encodeproject.org/files/ENCFF621AIY/@@download/ENCFF621AIY.hic hic 5000 ABC/example_chr/chr22/RefSeqCurated.170308.bed.CollapsedGeneBounds.chr22.hg38.TSS500bp.bed ABC/example_chr/chr22/RefSeqCurated.170308.bed.CollapsedGeneBounds.chr22.hg38.bed \ No newline at end of file +biosample crispr_cell_type DHS ATAC H3K27ac default_accessibility_feature HiC_file HiC_type HiC_resolution alt_TSS alt_genes +K562_chr22 K562 ABC/example_chr/chr22/ENCFF860XAE.chr22.sorted.se.bam DHS 
https://www.encodeproject.org/files/ENCFF621AIY/@@download/ENCFF621AIY.hic hic 5000 ABC/example_chr/chr22/RefSeqCurated.170308.bed.CollapsedGeneBounds.chr22.hg38.TSS500bp.bed ABC/example_chr/chr22/RefSeqCurated.170308.bed.CollapsedGeneBounds.chr22.hg38.bed \ No newline at end of file diff --git a/config/config_models_test.tsv b/config/config_models_test.tsv index a4a7988..392ed18 100644 --- a/config/config_models_test.tsv +++ b/config/config_models_test.tsv @@ -1,2 +1,2 @@ -model dataset ABC_directory feature_table polynomial override_params -DNase_megamap K562_chr22 resources/feature_tables/final_feature_set_DNase_hic.tsv False +model dataset crispr_dataset feature_table polynomial override_params +DNase_megamap K562_chr22 training resources/feature_tables/final_feature_set_DNase_hic.tsv FALSE \ No newline at end of file diff --git a/config/config_training.yaml b/config/config_training.yaml index fa8536b..1b78793 100755 --- a/config/config_training.yaml +++ b/config/config_training.yaml @@ -12,7 +12,10 @@ max_memory_allocation_mb: 250000 # for submiting jobs gene_TSS500: "reference/CollapsedGeneBounds.hg38.TSS500bp.bed" # TSS reference file chr_sizes: "reference/GRCh38_EBV.no_alt.chrom.sizes.tsv" gene_classes: "resources/external_features/gene_promoter_class_RefSeqCurated.170308.bed.CollapsedGeneBounds.hg38.TSS500bp.tsv" -crispr_dataset: "reference/EPCrisprBenchmark_ensemble_data_GRCh38.tsv.gz" # CRISPR dataset +crispr_dataset: + training: "reference/EPCrisprBenchmark_ensemble_data_GRCh38.tsv.gz" # CRISPR dataset +crispr_cell_types: + training: ["K562"] # list of features that are generated by default, referenced by their column names in source files; reference_features: ["numTSSEnhGene", "distance", "activity_base", "TargetGenePromoterActivityQuantile", "numNearbyEnhancers", "sumNearbyEnhancers", "is_ubiquitous_uniform", "P2PromoterClass", "numCandidateEnhGene", "hic_contact_pl_scaled_adj", "ABC.Score", "ABC.Numerator", "ABC.Denominator", "normalized_dhs_prom", 
"normalized_dhs_enh", "normalized_atac_prom", "normalized_atac_enh", "normalized_h3k27ac_enh", "normalized_h3k27ac_prom"] From 110e62f8cc2a1f9796a23e8295c82486f2c1800b Mon Sep 17 00:00:00 2001 From: Maya Sheth Date: Thu, 7 Aug 2025 10:29:53 -0700 Subject: [PATCH 07/22] fix pandas indexing --- workflow/rules/utils.smk | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/workflow/rules/utils.smk b/workflow/rules/utils.smk index f82287c..de3c3f5 100755 --- a/workflow/rules/utils.smk +++ b/workflow/rules/utils.smk @@ -35,25 +35,25 @@ def process_model_config(model_config): return model_config -# create dictionary of dictionaries to map datasets to crispr cell types: model: {crispr_ct: dataset, ...} +# create dictionary of dictionaries to map datasets to crispr cell types: {model: {crispr_ct: dataset, ...}} def make_model_dataset_dict(model_config, dataset_config): # dict of dataset: crispr_cell_type - dataset_ct_dict = dict(zip(dataset_config['biosample'], dataset_config['crispr_cell_type'])) + dataset_celltype_dict = dict(zip(dataset_config['biosample'], dataset_config['crispr_cell_type'])) model_dicts = [] for index, row in model_config.iterrows(): model_datasets = [item.strip() for item in row["dataset"].split(",")] # return list of datasets - mapped_cts = [dataset_ct_dict[ds] for ds in model_datasets] + mapped_celltypes = [dataset_celltype_dict[dataset] for dataset in model_datasets] # make sure 1:1 correspondance with CRISPR cell types crispr_data_cell_types = config["crispr_cell_types"][row["crispr_dataset"]] - if sorted(crispr_data_cell_types) != sorted(mapped_cts): - print(f"Model datasets: {model_datasets} -> cell types: {mapped_cts}") + if sorted(crispr_data_cell_types) != sorted(mapped_celltypes): + print(f"Model datasets: {model_datasets} -> cell types: {mapped_celltypes}") raise Exception(f"Datasets specified for {row['model']} do not map to all CRISPR cell types.") - this_model_dict = dict(zip(mapped_cts, model_datasets)) + 
this_model_dict = dict(zip(mapped_celltypes, model_datasets)) model_dicts.append(this_model_dict) # combine into dictionary of dicitonaries @@ -114,7 +114,7 @@ def make_accessibility_file_df(biosample_df, biosample_activities): def get_input_for_bw(this_biosample, this_simple_id): df_sub = ACCESSIBILITY_DF.loc[(ACCESSIBILITY_DF["biosample"]==this_biosample) & (ACCESSIBILITY_DF["access_simple_id"]==this_simple_id)] - return df_sub["single_access_file"][0] + return df_sub["single_access_file"].item() def expand_biosample_df(biosample_df): # add new columns From a85df948687dd49c23996bbe44c7e691b4e6d77f Mon Sep 17 00:00:00 2001 From: mayasheth <43573959+mayasheth@users.noreply.github.com> Date: Thu, 7 Aug 2025 10:38:50 -0700 Subject: [PATCH 08/22] fix typo --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index f8fb8bf..03298c3 100755 --- a/README.md +++ b/README.md @@ -55,7 +55,7 @@ The way we choose the model depends on the biosamples input. The code for model Modify `config/config_training.yaml` with your model and dataset configs - `model_config` has columns: model, dataset, crispr_dataset, feature_table, polynomial (do you want to use polynomial features?), and override_params (are there model training parameters you would like to change from the default logistic regression settings specfied in `config/config_training.yaml`?) - See [this example](https://pastebin.com/zt1868R3) `model_config` for how to specfiy override parameters. If there are no override_params, leave the column blank but still include the header. - - The `dataset` column is a comma-separated list with values corresponding to biosaples in `dataset_config`. Datasets must be provided for each `crispr_cell_type` included in the corresponding `crispr_dataset`. + - The `dataset` column is a comma-separated list with values corresponding to biosamples in `dataset_config`. 
Datasets must be provided for each `crispr_cell_type` included in the corresponding `crispr_dataset`. - The `crispr_dataset` column corresponds to a key under `crispr_dataset` in `config_training`. - Feature tables must be specified for each model (example: `resources/feature_tables`) with columns: feature (name in final table), input_col (name in ABC output), second_input (multiplied by input_col if provided), aggregate_function (how to combine feature values when a CRISPR element overlaps more than one ABC element), fill_value (how to replace NAs), nice_name (used when plotting) - Note that trained models generated using polynomial features cannot directly be used in the **Apply model** workflow From 77d1e052f5ca0e87248b51c646f378bb93695eef Mon Sep 17 00:00:00 2001 From: mayasheth <43573959+mayasheth@users.noreply.github.com> Date: Thu, 7 Aug 2025 10:40:11 -0700 Subject: [PATCH 09/22] clarify variable names --- workflow/rules/utils.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/rules/utils.smk b/workflow/rules/utils.smk index de3c3f5..2fcc66b 100755 --- a/workflow/rules/utils.smk +++ b/workflow/rules/utils.smk @@ -229,4 +229,4 @@ def get_tpm_threshold(biosample, model_name, biosample_df=None): return 0 else: tpm_file = os.path.basename(tpm_files[0]) - return tpm_file.split("threshold_")[1] \ No newline at end of file + return tpm_file.split("threshold_")[1] From 6b81c64b59d1b2ece56b3021b5ae16f9acbf6766 Mon Sep 17 00:00:00 2001 From: mayasheth <43573959+mayasheth@users.noreply.github.com> Date: Thu, 7 Aug 2025 10:41:33 -0700 Subject: [PATCH 10/22] actually clarify variable names From 8cd2472b8d6aeb0721640c9820b5fd6345d71655 Mon Sep 17 00:00:00 2001 From: Kayla Brand Date: Tue, 10 Feb 2026 15:30:26 -0800 Subject: [PATCH 11/22] handle crispr_dataset as dict --- workflow/rules/compare_models.smk | 4 +- .../model_training/compare_all_models.py | 77 +++++++++++-------- 2 files changed, 47 insertions(+), 34 deletions(-) diff --git 
a/workflow/rules/compare_models.smk b/workflow/rules/compare_models.smk index d6ce2ed..384bbbb 100755 --- a/workflow/rules/compare_models.smk +++ b/workflow/rules/compare_models.smk @@ -10,8 +10,7 @@ rule gather_model_performances: scripts_dir = SCRIPTS_DIR, out_dir = RESULTS_DIR, model_config_file = config["model_config"], - crispr_dataset_names = [n for n in model_config["crispr_dataset"].unique()], - crispr_dataset = lambda wildcards: [config["crispr_dataset"][cd] for cd in model_config["crispr_dataset"].unique()] + crispr_dataset = config["crispr_dataset"] conda: "../envs/encode_re2g.yml" resources: @@ -23,7 +22,6 @@ rule gather_model_performances: --all_missing "{input.all_missing}" \ --model_config_file {params.model_config_file} \ --output_file {output.comp_table} \ - --crispr_names "{params.crispr_dataset_names}" \ --crispr_data "{params.crispr_dataset}" \ --out_dir {params.out_dir} """ diff --git a/workflow/scripts/model_training/compare_all_models.py b/workflow/scripts/model_training/compare_all_models.py index b9244c8..832407b 100644 --- a/workflow/scripts/model_training/compare_all_models.py +++ b/workflow/scripts/model_training/compare_all_models.py @@ -4,6 +4,7 @@ import numpy as np import pandas as pd import scipy +import ast from training_functions import ( statistic_aupr, statistic_precision_at_threshold, @@ -15,26 +16,44 @@ def performance_summary( model_id, dataset, pred_file, missing_file, model_name, out_dir, crispr_data="", n_boot=1000 ): # read in predicitons - if model_id.startswith("distance"): - crispr_data = pd.read_csv(crispr_data, sep="\t") - crispr_data["distance"] = np.abs( - (crispr_data["chromStart"] + crispr_data["chromEnd"]) / 2 - - (crispr_data["startTSS"] + crispr_data["endTSS"]) / 2 + if model_id == "distance": + # Determine whether crispr_data is a str or dict + if crispr_data.strip().startswith('{') and crispr_data.strip().endswith('}'): + all_crispr_dfs = [] + print("INFO: --crispr_data appears to be a dictionary. 
Parsing and concatenating all files.") + try: + # Safely parse the string into a Python dictionary + crispr_data_dict = ast.literal_eval(crispr_data) + if not isinstance(crispr_data_dict, dict): + raise TypeError("Parsed data is not a dictionary.") + + # Iterate through the file paths in the dictionary's values + for key, filepath in crispr_data_dict.items(): + print(f" - Loading '{key}': {filepath}") + df = pd.read_csv(filepath, sep="\t") + all_crispr_dfs.append(df) + + # Concatenate all loaded DataFrames into one + crispr_df = pd.concat(all_crispr_dfs, ignore_index=True) + + except (ValueError, SyntaxError, TypeError) as e: + # If parsing fails, raise an error because the format was ambiguous + raise ValueError(f"Failed to parse --crispr_data as a dictionary. Error: {e}. Content: {crispr_data}") + else: + # If it's not a dictionary string, treat it as a single file path + print(f"INFO: --crispr_data is a single file path. Loading: {crispr_data}") + crispr_df = pd.read_csv(crispr_data, sep="\t") + + # Calculate distance + crispr_df["distance"] = np.abs( + (crispr_df["chromStart"] + crispr_df["chromEnd"]) / 2 + - (crispr_df["startTSS"] + crispr_df["endTSS"]) / 2 ) - crispr_data = crispr_data.dropna(subset=["Regulated", "distance"]) - Y_true_all = crispr_data["Regulated"].values.astype(np.int64) - Y_pred_all = crispr_data["distance"] * -1 + crispr_df = crispr_df.dropna(subset=["Regulated", "distance"]) + Y_true_all = crispr_df["Regulated"].values.astype(np.int64) + Y_pred_all = crispr_df["distance"] * -1 pct_missing = 0 else: # normal models - # pred_file = os.path.join( - # out_dir, dataset, model_id, "model", "training_predictions.tsv" - # ) - # missing_file = os.path.join( - # out_dir, - # dataset, - # model_id, - # "missing.EPCrisprBenchmark_ensemble_data_GRCh38.K562_features_NAfilled.tsv.gz", - # ) pred_df = pd.read_csv(pred_file, sep="\t") missing_df = pd.read_csv(missing_file, sep="\t") @@ -106,12 +125,10 @@ def performance_summary( 
@click.option("--model_config_file", required=True) @click.option("--output_file", required=True) @click.option("--crispr_data", required=True) -@click.option("--crispr_names", required=True) @click.option("--out_dir", required=True) -def main(all_pred, all_missing, model_config_file, output_file, crispr_data, crispr_names, out_dir): - all_pred_files = all_pred.split(" ") - all_missing_files = all_missing.split(" ") - crispr_dict = dict(zip(crispr_names.split(" "), crispr_data.split(" "))) +def main(all_pred, all_missing, model_config_file, output_file, crispr_data, out_dir): + all_pred = all_pred.split(" ") + all_missing = all_missing.split(" ") model_name = "E2G" model_config = ( @@ -119,8 +136,8 @@ def main(all_pred, all_missing, model_config_file, output_file, crispr_data, cri .fillna("None") .set_index("model", drop=False) ) - model_config["prediction_file"] = all_pred_files - model_config["missing_file"] = all_missing_files + model_config["prediction_file"] = all_pred + model_config["missing_file"] = all_missing # initiate final df df = pd.DataFrame( @@ -142,13 +159,11 @@ def main(all_pred, all_missing, model_config_file, output_file, crispr_data, cri for row in model_config.itertuples(index=False): res_row = performance_summary(row.model, row.dataset, row.prediction_file, row.missing_file, model_name, out_dir) df = pd.concat([df, res_row]) - - for key, file_path in crispr_dict.items(): - # add row(s) for distance - res_row = performance_summary( - f"distance_{key}", "baseline", "", "", "", out_dir, crispr_data=file_path - ) - df = pd.concat([df, res_row]) + # add row for distance + res_row = performance_summary( + "distance", "baseline", "", "", "", out_dir, crispr_data=crispr_data + ) + df = pd.concat([df, res_row]) # sort table by AUPRC df = df.sort_values(by="AUPRC", ascending=False) From 661056f13736aef2942d494db4cdd798b0d77274 Mon Sep 17 00:00:00 2001 From: Kayla Brand Date: Tue, 10 Feb 2026 15:34:02 -0800 Subject: [PATCH 12/22] set cpus-per-task = # 
threads to mitigate srun: fatal: cpus-per-task set by two different environment variables --- workflow/rules/predictions.smk | 1 + 1 file changed, 1 insertion(+) diff --git a/workflow/rules/predictions.smk b/workflow/rules/predictions.smk index ae06a5a..9e5066d 100755 --- a/workflow/rules/predictions.smk +++ b/workflow/rules/predictions.smk @@ -113,6 +113,7 @@ rule write_accessibility_bw_file: "../envs/encode_re2g.yml" resources: mem_mb=ABC.determine_mem_mb, + cpus_per_task=16, runtime_hr=6 threads: 16 shell: From 87227506ea680da1b2c5385a52b86b81f92522c8 Mon Sep 17 00:00:00 2001 From: Kayla Brand Date: Tue, 10 Feb 2026 15:35:34 -0800 Subject: [PATCH 13/22] improve time+memory efficiency by utilizing bedtools native overlap counting capability --- workflow/rules/genomewide_features.smk | 7 +-- .../feature_tables/gen_num_tss_enh_gene.py | 62 ++++++++++++------- 2 files changed, 44 insertions(+), 25 deletions(-) diff --git a/workflow/rules/genomewide_features.smk b/workflow/rules/genomewide_features.smk index 0e7c8f8..ac91ce6 100644 --- a/workflow/rules/genomewide_features.smk +++ b/workflow/rules/genomewide_features.smk @@ -93,20 +93,19 @@ rule generate_num_tss_enh_gene: conda: "../envs/encode_re2g.yml" resources: - mem_mb=partial(ABC.determine_mem_mb, min_gb=32) + mem_mb=partial(ABC.determine_mem_mb) #, min_gb=32) output: numTSSEnhGene = os.path.join(RESULTS_DIR, "{biosample}", "new_features", "NumTSSEnhGene.tsv"), extendedEnhancerRegions = temp(os.path.join(RESULTS_DIR, "{biosample}", "new_features", "extendedEnhancerRegions.txt")), - enhancerTSSInt = temp(os.path.join(RESULTS_DIR, "{biosample}", "new_features", "extendedEnhancerRegions_TSS_int.tsv.gz")) + # enhancerTSSInt = temp(os.path.join(RESULTS_DIR, "{biosample}", "new_features", "extendedEnhancerRegions_TSS_int.tsv.gz")) shell: """ python {params.scripts_dir}/feature_tables/gen_num_tss_enh_gene.py \ --abc_predictions {input.abc_predictions} \ --ref_gene_tss {params.gene_TSS500} \ --extended_enhancers 
{output.extendedEnhancerRegions} \ - --enhancer_tss_int {output.enhancerTSSInt} \ --out_file {output.numTSSEnhGene} - """ + """ # --enhancer_tss_int {output.enhancerTSSInt} \ # generate features "numNearbyEnhancers" and "sumNearbyEnhancers" rule generate_num_sum_enhancers: diff --git a/workflow/scripts/feature_tables/gen_num_tss_enh_gene.py b/workflow/scripts/feature_tables/gen_num_tss_enh_gene.py index 4643858..82b0db9 100644 --- a/workflow/scripts/feature_tables/gen_num_tss_enh_gene.py +++ b/workflow/scripts/feature_tables/gen_num_tss_enh_gene.py @@ -4,7 +4,7 @@ def determine_num_tss_enh_gene( - pred_df, ref_gene_tss, extended_enhancers, enhancer_tss_int, out_file + pred_df, ref_gene_tss, extended_enhancers, out_file #enhancer_tss_int ): # make the end be midpoint of enhancer + distance (This gives you the end coordinate of distance range) pred_df["midpoint"] = ((pred_df["start"] + pred_df["end"]) / 2).astype("int") @@ -23,26 +23,46 @@ def determine_num_tss_enh_gene( pred_df[["chr", "start", "new_end", "name", "TargetGene"]].to_csv( extended_enhancers, sep="\t", index=False ) + + # This pipeline performs the intersection, counting, and saves the final small result. + # It is the command-line equivalent of groupby().size() and is very memory-efficient. - # Intersect midpoint of enhancer to target gene regions with TSSs (includes overlap with target gene) - os.system( - "sed '1d' {} | bedtools intersect -a stdin -b {} -wa -wb | cut -f4,5 | pigz > {}".format( - extended_enhancers, ref_gene_tss, enhancer_tss_int - ) - ) - print("Reading in {}".format(enhancer_tss_int)) - predictions = pd.read_csv(enhancer_tss_int, sep="\t", names=["class", "gene"]) + # Define the desired header for the output file + header = "name\tgene\tcount\n" + # Define the memory-efficient command-line pipeline + cmd = f""" + # Step 1: Write the header to the output file first. 
+ printf "{header}" > {out_file}; - # Calculate the number of TSS regions that fall within the enhancer to target gene regions. - num_tss_between_enh_and_gene = ( - predictions.groupby(["class", "gene"]).size().reset_index() - ) + # Step 2: Run the intersection, select the correct columns, and APPEND to the output file. + # The '>>' append operator is crucial here. + bedtools intersect -a {extended_enhancers} -b {ref_gene_tss} -wa -c \ + | cut -f4,5,6 \ + >> {out_file} + """ - num_tss_between_enh_and_gene.to_csv( - out_file, - sep="\t", - index=False, - ) + os.system(cmd) + +# bedtools creates chr, start, new_end, name, TargetGene + # # Intersect midpoint of enhancer to target gene regions with TSSs (includes overlap with target gene) + # os.system( + # "sed '1d' {} | bedtools intersect -a stdin -b {} -wa -wb | cut -f4,5 | pigz > {}".format( + # extended_enhancers, ref_gene_tss, enhancer_tss_int + # ) + # ) + # print("Reading in {}".format(enhancer_tss_int)) + # predictions = pd.read_csv(enhancer_tss_int, sep="\t", names=["class", "gene"]) + + # # Calculate the number of TSS regions that fall within the enhancer to target gene regions. 
+ # num_tss_between_enh_and_gene = ( + # predictions.groupby(["class", "gene"]).size().reset_index() + # ) + + # num_tss_between_enh_and_gene.to_csv( + # out_file, + # sep="\t", + # index=False, + # ) print("Saved num TSS between enh and gene") @@ -50,15 +70,15 @@ def determine_num_tss_enh_gene( @click.option("--abc_predictions") @click.option("--ref_gene_tss") @click.option("--extended_enhancers") -@click.option("--enhancer_tss_int") +# @click.option("--enhancer_tss_int") @click.option("--out_file") -def main(abc_predictions, ref_gene_tss, extended_enhancers, enhancer_tss_int, out_file): +def main(abc_predictions, ref_gene_tss, extended_enhancers, out_file): #enhancer_tss_int pred_df = pd.read_csv(abc_predictions, sep="\t", compression="gzip") if len(pred_df) == 0: raise Exception("Did not find any enhancers in the Predictions file") determine_num_tss_enh_gene( - pred_df, ref_gene_tss, extended_enhancers, enhancer_tss_int, out_file + pred_df, ref_gene_tss, extended_enhancers, out_file #enhancer_tss_int ) From 85e1ed80a03192e6d7619d913b48a8fb688df861 Mon Sep 17 00:00:00 2001 From: Kayla Brand Date: Tue, 10 Feb 2026 16:07:19 -0800 Subject: [PATCH 14/22] black v26.1.0 reformatting --- .../gen_num_candidate_enh_gene.py | 4 +- .../gen_num_sum_nearby_enhancers.py | 7 +- .../feature_tables/gen_num_tss_enh_gene.py | 39 +- .../model_application/generate_plots.py | 12 +- .../scripts/model_application/get_stats.py | 14 +- workflow/scripts/model_application/run_e2g.py | 21 +- .../backward_sequential_feature_selection.py | 566 +++++++++--------- .../compare_all_feature_sets.py | 6 +- .../model_training/compare_all_models.py | 36 +- .../forward_sequential_feature_selection.py | 522 ++++++++-------- .../permutation_feature_importance.py | 192 +++--- .../scripts/model_training/train_model.py | 289 ++++----- .../model_training/training_functions.py | 1 - 13 files changed, 857 insertions(+), 852 deletions(-) diff --git a/workflow/scripts/feature_tables/gen_num_candidate_enh_gene.py 
b/workflow/scripts/feature_tables/gen_num_candidate_enh_gene.py index 6ea4300..1070e7f 100644 --- a/workflow/scripts/feature_tables/gen_num_candidate_enh_gene.py +++ b/workflow/scripts/feature_tables/gen_num_candidate_enh_gene.py @@ -44,7 +44,7 @@ def determine_num_candidate_enh_gene(pred_df, out_file): @click.option("--abc_predictions") @click.option("--out_file") def main(abc_predictions, out_file): - pred_df = pd.read_csv(abc_predictions, sep="\t", compression="gzip") + pred_df = pd.read_csv(abc_predictions, sep="\t") if len(pred_df) == 0: raise Exception("Did not find any enhancers in the Predictions file") @@ -52,4 +52,4 @@ def main(abc_predictions, out_file): if __name__ == "__main__": - main() + main() \ No newline at end of file diff --git a/workflow/scripts/feature_tables/gen_num_sum_nearby_enhancers.py b/workflow/scripts/feature_tables/gen_num_sum_nearby_enhancers.py index 942fcca..c887fa6 100644 --- a/workflow/scripts/feature_tables/gen_num_sum_nearby_enhancers.py +++ b/workflow/scripts/feature_tables/gen_num_sum_nearby_enhancers.py @@ -35,7 +35,12 @@ def generate_num_sum_enhancers( ) # select columns from EnhancerPredictionsAllPutative - df = pd.read_csv(pred_file, sep="\t", usecols=["chr", "start", "end", "name", "activity_base"], compression= "gzip") + df = pd.read_csv( + pred_file, + sep="\t", + usecols=["chr", "start", "end", "name", "activity_base"], + compression="gzip", + ) df.to_csv(pred_slim, sep="\t", index=False, header=False) # os.system( # "zcat {} | csvtk cut -t -f chr,start,end,name,activity_base | sed '1d' > {}".format( diff --git a/workflow/scripts/feature_tables/gen_num_tss_enh_gene.py b/workflow/scripts/feature_tables/gen_num_tss_enh_gene.py index 82b0db9..bf68eee 100644 --- a/workflow/scripts/feature_tables/gen_num_tss_enh_gene.py +++ b/workflow/scripts/feature_tables/gen_num_tss_enh_gene.py @@ -4,7 +4,7 @@ def determine_num_tss_enh_gene( - pred_df, ref_gene_tss, extended_enhancers, out_file #enhancer_tss_int + pred_df, ref_gene_tss, 
extended_enhancers, out_file # enhancer_tss_int ): # make the end be midpoint of enhancer + distance (This gives you the end coordinate of distance range) pred_df["midpoint"] = ((pred_df["start"] + pred_df["end"]) / 2).astype("int") @@ -23,46 +23,18 @@ def determine_num_tss_enh_gene( pred_df[["chr", "start", "new_end", "name", "TargetGene"]].to_csv( extended_enhancers, sep="\t", index=False ) - + # This pipeline performs the intersection, counting, and saves the final small result. # It is the command-line equivalent of groupby().size() and is very memory-efficient. - - # Define the desired header for the output file header = "name\tgene\tcount\n" - # Define the memory-efficient command-line pipeline cmd = f""" - # Step 1: Write the header to the output file first. printf "{header}" > {out_file}; - - # Step 2: Run the intersection, select the correct columns, and APPEND to the output file. - # The '>>' append operator is crucial here. bedtools intersect -a {extended_enhancers} -b {ref_gene_tss} -wa -c \ | cut -f4,5,6 \ >> {out_file} """ - os.system(cmd) -# bedtools creates chr, start, new_end, name, TargetGene - # # Intersect midpoint of enhancer to target gene regions with TSSs (includes overlap with target gene) - # os.system( - # "sed '1d' {} | bedtools intersect -a stdin -b {} -wa -wb | cut -f4,5 | pigz > {}".format( - # extended_enhancers, ref_gene_tss, enhancer_tss_int - # ) - # ) - # print("Reading in {}".format(enhancer_tss_int)) - # predictions = pd.read_csv(enhancer_tss_int, sep="\t", names=["class", "gene"]) - - # # Calculate the number of TSS regions that fall within the enhancer to target gene regions. 
- # num_tss_between_enh_and_gene = ( - # predictions.groupby(["class", "gene"]).size().reset_index() - # ) - - # num_tss_between_enh_and_gene.to_csv( - # out_file, - # sep="\t", - # index=False, - # ) print("Saved num TSS between enh and gene") @@ -70,16 +42,13 @@ def determine_num_tss_enh_gene( @click.option("--abc_predictions") @click.option("--ref_gene_tss") @click.option("--extended_enhancers") -# @click.option("--enhancer_tss_int") @click.option("--out_file") -def main(abc_predictions, ref_gene_tss, extended_enhancers, out_file): #enhancer_tss_int +def main(abc_predictions, ref_gene_tss, extended_enhancers, out_file): pred_df = pd.read_csv(abc_predictions, sep="\t", compression="gzip") if len(pred_df) == 0: raise Exception("Did not find any enhancers in the Predictions file") - determine_num_tss_enh_gene( - pred_df, ref_gene_tss, extended_enhancers, out_file #enhancer_tss_int - ) + determine_num_tss_enh_gene(pred_df, ref_gene_tss, extended_enhancers, out_file) if __name__ == "__main__": diff --git a/workflow/scripts/model_application/generate_plots.py b/workflow/scripts/model_application/generate_plots.py index 9fe0a42..9d22879 100755 --- a/workflow/scripts/model_application/generate_plots.py +++ b/workflow/scripts/model_application/generate_plots.py @@ -31,13 +31,13 @@ def load_and_save_stat_files(stat_files, output_table) -> List[pd.DataFrame]: cell_cluster = Path(stat_file).parts[-3] model_name = Path(stat_file).parts[-2] pred_id = f"{cell_cluster}_{model_name}" - results[pred_id] = pd.read_csv(stat_file, sep="\t").set_index("Metric").assign( - cell_cluster=cell_cluster, - model_name=model_name + results[pred_id] = ( + pd.read_csv(stat_file, sep="\t") + .set_index("Metric") + .assign(cell_cluster=cell_cluster, model_name=model_name) ) results_no_ind[pred_id] = pd.read_csv(stat_file, sep="\t").assign( - cell_cluster=cell_cluster, - model_name=model_name + cell_cluster=cell_cluster, model_name=model_name ) # save merged dataframe @@ -178,7 +178,7 @@ def 
save_outlier_stats(stat_dfs, metric, pdf_writer): pred_values = {} for this_df in stat_dfs.values(): pred_id = f"{this_df['cell_cluster'].iloc[0]}_{this_df['model_name'].iloc[0]}" - pred_values[pred_id] = this_df.loc[metric, VALUE_KEY] + pred_values[pred_id] = this_df.loc[metric, VALUE_KEY] sorted_items = sorted(pred_values.items(), key=lambda item: item[1]) diff --git a/workflow/scripts/model_application/get_stats.py b/workflow/scripts/model_application/get_stats.py index b38389b..cb8cd9d 100755 --- a/workflow/scripts/model_application/get_stats.py +++ b/workflow/scripts/model_application/get_stats.py @@ -19,29 +19,31 @@ def count_bam_total(bam_file: str) -> int: no_alt_chrom_df = df[df["chr"].isin(NORMAL_CHROMOSOMES)] return no_alt_chrom_df["mapped_reads"].sum() + def count_lines(file_path): count = 0 if file_path.endswith(".gz"): - with gzip.open(file_path, 'rt') as file: # 'rt' mode is for reading text - reader = csv.reader(file, delimiter='\t') + with gzip.open(file_path, "rt") as file: # 'rt' mode is for reading text + reader = csv.reader(file, delimiter="\t") for row in reader: if row[0] in NORMAL_CHROMOSOMES: count += 1 else: - reader = csv.reader(file_path, delimiter='\t') + reader = csv.reader(file_path, delimiter="\t") for row in reader: if row[0] in NORMAL_CHROMOSOMES: count += 1 return count + def get_num_reads(accessibility_files): total_counts = 0 for access_in in accessibility_files: if access_in.endswith(".bam"): total_counts += count_bam_total(access_in) elif "tagAlign" in access_in: - total_counts += count_lines(access_in)/2 - else: # hope it's a frag file + total_counts += count_lines(access_in) / 2 + else: # hope it's a frag file total_counts += count_lines(access_in) return total_counts @@ -81,7 +83,7 @@ def get_mean_num_enh_per_gene_no_prom(df): def get_mean_log_dist_to_tss(df): dist_cols = [x for x in ["distance", "distanceToTSS"] if x in df.columns] - if len(dist_cols)>0: + if len(dist_cols) > 0: log_dist = df[dist_cols[0]].apply(np.log10) 
log_dist = log_dist.replace(-np.inf, 0) return log_dist.mean() diff --git a/workflow/scripts/model_application/run_e2g.py b/workflow/scripts/model_application/run_e2g.py index cd6daa6..bab67fb 100755 --- a/workflow/scripts/model_application/run_e2g.py +++ b/workflow/scripts/model_application/run_e2g.py @@ -7,7 +7,9 @@ SCORE_COLUMN_BASE = "E2G.Score" -def make_e2g_predictions(df_enhancers, feature_list, trained_model, tpm_threshold, epsilon): +def make_e2g_predictions( + df_enhancers, feature_list, trained_model, tpm_threshold, epsilon +): # transform the features X = df_enhancers.loc[:, feature_list] X = np.log(np.abs(X) + epsilon) @@ -15,13 +17,18 @@ def make_e2g_predictions(df_enhancers, feature_list, trained_model, tpm_threshol with open(trained_model, "rb") as f: model = pickle.load(f) probs = model.predict_proba(X) - + if ("RNA_pseudobulkTPM" in df_enhancers.columns) and (tpm_threshold > 0): df_enhancers[SCORE_COLUMN_BASE + ".ignoreTPM"] = probs[:, 1] - df_enhancers[SCORE_COLUMN_BASE] = [score if tpm >= tpm_threshold else 0 - for (score, tpm) in zip(df_enhancers[SCORE_COLUMN_BASE + ".ignoreTPM"] , df_enhancers["RNA_pseudobulkTPM"])] + df_enhancers[SCORE_COLUMN_BASE] = [ + score if tpm >= tpm_threshold else 0 + for (score, tpm) in zip( + df_enhancers[SCORE_COLUMN_BASE + ".ignoreTPM"], + df_enhancers["RNA_pseudobulkTPM"], + ) + ] else: - df_enhancers[SCORE_COLUMN_BASE] = probs[:, 1] + df_enhancers[SCORE_COLUMN_BASE] = probs[:, 1] return df_enhancers @@ -33,7 +40,9 @@ def make_e2g_predictions(df_enhancers, feature_list, trained_model, tpm_threshol @click.option("--tpm_threshold", default=0) @click.option("--epsilon", type=float, default=0.01) @click.option("--output_file", required=True) -def main(predictions, feature_table_file, trained_model, tpm_threshold, epsilon, output_file): +def main( + predictions, feature_table_file, trained_model, tpm_threshold, epsilon, output_file +): feature_table = pd.read_csv(feature_table_file, sep="\t") feature_list = 
feature_table["feature"] diff --git a/workflow/scripts/model_training/backward_sequential_feature_selection.py b/workflow/scripts/model_training/backward_sequential_feature_selection.py index 9bdf854..68fd808 100644 --- a/workflow/scripts/model_training/backward_sequential_feature_selection.py +++ b/workflow/scripts/model_training/backward_sequential_feature_selection.py @@ -7,286 +7,286 @@ from sklearn.preprocessing import PolynomialFeatures from sklearn.linear_model import LogisticRegression from training_functions import ( - statistic_aupr, - statistic_precision, - train_and_predict_once, - bootstrap_pvalue, - statistic_delta_aupr, - threshold_70_pct_recall, - statistic_precision_at_threshold, - statistic_delta_precision_at_threshold, + statistic_aupr, + statistic_precision, + train_and_predict_once, + bootstrap_pvalue, + statistic_delta_aupr, + threshold_70_pct_recall, + statistic_precision_at_threshold, + statistic_delta_precision_at_threshold, ) def SBFS(df_dataset, feature_table, model_name, epsilon, params, polynomial=False): - # specify feature list - feature_list_core = feature_table["feature"] - X = df_dataset.loc[:, feature_list_core] - model_name_core = model_name - feature_list = feature_list_core - - # transform features first - X = np.log(np.abs(X) + epsilon) - Y_true = df_dataset["Regulated"].values.astype(np.int64) - - # optionally add polynomial features - if polynomial: - poly = PolynomialFeatures(degree=2) - X = poly.fit_transform(X) - polynomial_names = poly.get_feature_names_out(feature_list_core) - X = pd.DataFrame(X, columns=polynomial_names) - feature_list = pd.Series(polynomial_names) - - n_features = len(feature_list) - - feature_removed = [] - all_auprs = [] - all_precisions = [] - - # train all feature model - model_name = model_name_core + "_full" - df_dataset = train_and_predict_once( - df_dataset, X, Y_true, feature_list, model_name, params - ) - Y_pred = df_dataset[model_name + ".Score"] - aupr = statistic_aupr(Y_true, Y_pred) - 
thresh = threshold_70_pct_recall(Y_true, Y_pred) - precision_at_70_pct_recall = statistic_precision_at_threshold( - Y_true, Y_pred, thresh - ) - - feature_removed = feature_removed + ["None"] - all_auprs = all_auprs + [aupr] - all_precisions = all_precisions + [precision_at_70_pct_recall] - - for i in range(n_features - 1): # iterate (# of feature) times - best_k = -1 - best_aupr = 0 - best_precision = 0 - for k in range( - len(feature_list) - ): # iterate through each feature, remove one with least effect - new_feature = feature_list[k] # feature to remove - print(new_feature) - features = feature_list.loc[feature_list != new_feature].reset_index( - drop=True - ) - print(features) - model_name = model_name_core + "_" + str(i + 1) + "_" + str(k + 1) - print(model_name) - - df_dataset = train_and_predict_once( - df_dataset, X, Y_true, features, model_name, params - ) - - # evaluate model once trained - Y_pred = df_dataset[model_name + ".Score"] - aupr = statistic_aupr(Y_true, Y_pred) - thresh = threshold_70_pct_recall(Y_true, Y_pred) - precision_at_70_pct_recall = statistic_precision_at_threshold( - Y_true, Y_pred, thresh - ) - - if aupr >= best_aupr: - best_aupr = aupr - best_k = k - best_precision = precision_at_70_pct_recall - - # update best feature - feature_removed = feature_removed + [feature_list[best_k]] - all_auprs = all_auprs + [best_aupr] - all_precisions = all_precisions + [best_precision] - feature_list = feature_list.loc[ - feature_list != feature_list[best_k] - ].reset_index(drop=True) - # add remaining feature - feature_removed = feature_removed + [feature_list[0]] - print(feature_removed) - print(all_auprs) - print(all_precisions) - - return feature_removed + # specify feature list + feature_list_core = feature_table["feature"] + X = df_dataset.loc[:, feature_list_core] + model_name_core = model_name + feature_list = feature_list_core + + # transform features first + X = np.log(np.abs(X) + epsilon) + Y_true = 
df_dataset["Regulated"].values.astype(np.int64) + + # optionally add polynomial features + if polynomial: + poly = PolynomialFeatures(degree=2) + X = poly.fit_transform(X) + polynomial_names = poly.get_feature_names_out(feature_list_core) + X = pd.DataFrame(X, columns=polynomial_names) + feature_list = pd.Series(polynomial_names) + + n_features = len(feature_list) + + feature_removed = [] + all_auprs = [] + all_precisions = [] + + # train all feature model + model_name = model_name_core + "_full" + df_dataset = train_and_predict_once( + df_dataset, X, Y_true, feature_list, model_name, params + ) + Y_pred = df_dataset[model_name + ".Score"] + aupr = statistic_aupr(Y_true, Y_pred) + thresh = threshold_70_pct_recall(Y_true, Y_pred) + precision_at_70_pct_recall = statistic_precision_at_threshold( + Y_true, Y_pred, thresh + ) + + feature_removed = feature_removed + ["None"] + all_auprs = all_auprs + [aupr] + all_precisions = all_precisions + [precision_at_70_pct_recall] + + for i in range(n_features - 1): # iterate (# of feature) times + best_k = -1 + best_aupr = 0 + best_precision = 0 + for k in range( + len(feature_list) + ): # iterate through each feature, remove one with least effect + new_feature = feature_list[k] # feature to remove + print(new_feature) + features = feature_list.loc[feature_list != new_feature].reset_index( + drop=True + ) + print(features) + model_name = model_name_core + "_" + str(i + 1) + "_" + str(k + 1) + print(model_name) + + df_dataset = train_and_predict_once( + df_dataset, X, Y_true, features, model_name, params + ) + + # evaluate model once trained + Y_pred = df_dataset[model_name + ".Score"] + aupr = statistic_aupr(Y_true, Y_pred) + thresh = threshold_70_pct_recall(Y_true, Y_pred) + precision_at_70_pct_recall = statistic_precision_at_threshold( + Y_true, Y_pred, thresh + ) + + if aupr >= best_aupr: + best_aupr = aupr + best_k = k + best_precision = precision_at_70_pct_recall + + # update best feature + feature_removed = feature_removed 
+ [feature_list[best_k]] + all_auprs = all_auprs + [best_aupr] + all_precisions = all_precisions + [best_precision] + feature_list = feature_list.loc[ + feature_list != feature_list[best_k] + ].reset_index(drop=True) + # add remaining feature + feature_removed = feature_removed + [feature_list[0]] + print(feature_removed) + print(all_auprs) + print(all_precisions) + + return feature_removed def SBFS_significance( - df_dataset, - feature_table, - model_name, - epsilon, - params, - polynomial=False, - n_boot=1000, + df_dataset, + feature_table, + model_name, + epsilon, + params, + polynomial=False, + n_boot=1000, ): - feature_list = SBFS( - df_dataset, feature_table, model_name, epsilon, params, polynomial - ) # get order of features - feature_list_core = feature_table["feature"] - model_name_core = model_name - - # calculate polynomial features - if polynomial: - X_core = df_dataset.loc[:, feature_list_core] - poly = PolynomialFeatures(degree=2) - X_2 = poly.fit_transform(X_core) - polynomial_names = poly.get_feature_names_out(X_core.columns) - X_2 = pd.DataFrame(X_2, columns=polynomial_names) - X = X_2 - else: - X = df_dataset.loc[:, feature_list_core] - - X = np.log(np.abs(X) + epsilon) - Y_true = df_dataset["Regulated"].values.astype(np.int64) - - df = pd.DataFrame( - columns=[ - "feature_removed", - "aupr", - "delta_aupr", - "delta_aupr_low", - "delta_aupr_high", - "pval_aupr", - "precision", - "delta_precision", - "delta_precision_low", - "delta_precision_high", - "pval_precision", - ] - ) - # train first model (all features) - model_name = model_name_core + "_full" - feature_list.remove("None") - df_dataset = train_and_predict_once( - df_dataset, X, Y_true, feature_list, model_name, params - ) - Y_new = df_dataset[model_name + ".Score"] - - data = (Y_true, Y_new) - res_aupr = scipy.stats.bootstrap( - data, - statistic_aupr, - n_resamples=n_boot, - paired=True, - confidence_level=0.95, - method="BCa", - ) - thresh = threshold_70_pct_recall(Y_true, Y_new) - 
res_precision = scipy.stats.bootstrap( - data, - lambda Y_true, Y_new: statistic_precision_at_threshold(Y_true, Y_new, thresh), - n_resamples=n_boot, - paired=True, - confidence_level=0.95, - method="BCa", - ) - aupr = np.mean(res_aupr.bootstrap_distribution) - precision = np.mean(res_precision.bootstrap_distribution) - - df_temp = pd.DataFrame( - { - "feature_removed": "None", - "aupr": aupr, - "delta_aupr": 0, - "delta_aupr_low": 0, - "delta_aupr_high": 0, - "pval_aupr": 1, - "precision": precision, - "delta_precision": 0, - "delta_precision_low": 0, - "delta_precision_high": 0, - "pval_precision": 1, - }, - index=[0], - ) - df = pd.concat([df, df_temp]) - - Y_last = Y_new - - # loop feature list from SBFS through to train next model and compare - for i in range(len(feature_list)): - to_remove = feature_list[0] - print(to_remove) - if len(feature_list) == 1: - df_dataset["all_true"] = 1 - Y_new = df_dataset["all_true"] - else: - feature_list.remove(to_remove) - model_name = model_name_core + "_" + str(i + 1) - df_dataset = train_and_predict_once( - df_dataset, X, Y_true, feature_list, model_name, params - ) - Y_new = df_dataset[model_name + ".Score"] - - data_compare = (Y_true, Y_last, Y_new) - thresh_last = threshold_70_pct_recall(Y_true, Y_last) - thresh_new = threshold_70_pct_recall(Y_true, Y_new) - res_delta_aupr = scipy.stats.bootstrap( - data_compare, - statistic_delta_aupr, - n_resamples=n_boot, - paired=True, - confidence_level=0.95, - method="BCa", - ) - res_delta_precision = scipy.stats.bootstrap( - data_compare, - lambda Y_true, Y_last, Y_new,: statistic_delta_precision_at_threshold( - Y_true, Y_last, Y_new, thresh_last, thresh_new - ), - n_resamples=n_boot, - paired=True, - confidence_level=0.95, - method="BCa", - ) - delta_aupr = np.mean( - res_delta_aupr.bootstrap_distribution - ) # aupr_new - aupr_last - delta_precision = np.mean(res_delta_precision.bootstrap_distribution) - pval_aupr = bootstrap_pvalue(delta_aupr, res_delta_aupr) - pval_precision 
= bootstrap_pvalue(delta_precision, res_delta_precision) - - data = (Y_true, Y_new) - res_aupr = scipy.stats.bootstrap( - data, - statistic_aupr, - n_resamples=n_boot, - paired=True, - confidence_level=0.95, - method="BCa", - ) - thresh = threshold_70_pct_recall(Y_true, Y_new) - res_precision = scipy.stats.bootstrap( - data, - lambda Y_true, Y_new: statistic_precision_at_threshold( - Y_true, Y_new, thresh - ), - n_resamples=n_boot, - paired=True, - confidence_level=0.95, - method="BCa", - ) - aupr = np.mean(res_aupr.bootstrap_distribution) - precision = np.mean(res_precision.bootstrap_distribution) - - df_temp = pd.DataFrame( - { - "feature_removed": to_remove, - "aupr": aupr, - "delta_aupr": delta_aupr, - "delta_aupr_low": res_delta_aupr.confidence_interval[0], - "delta_aupr_high": res_delta_aupr.confidence_interval[1], - "pval_aupr": pval_aupr, - "precision": precision, - "delta_precision": delta_precision, - "delta_precision_low": res_delta_precision.confidence_interval[0], - "delta_precision_high": res_delta_precision.confidence_interval[1], - "pval_precision": pval_precision, - }, - index=[0], - ) - df = pd.concat([df, df_temp]) - Y_last = Y_new - - return df + feature_list = SBFS( + df_dataset, feature_table, model_name, epsilon, params, polynomial + ) # get order of features + feature_list_core = feature_table["feature"] + model_name_core = model_name + + # calculate polynomial features + if polynomial: + X_core = df_dataset.loc[:, feature_list_core] + poly = PolynomialFeatures(degree=2) + X_2 = poly.fit_transform(X_core) + polynomial_names = poly.get_feature_names_out(X_core.columns) + X_2 = pd.DataFrame(X_2, columns=polynomial_names) + X = X_2 + else: + X = df_dataset.loc[:, feature_list_core] + + X = np.log(np.abs(X) + epsilon) + Y_true = df_dataset["Regulated"].values.astype(np.int64) + + df = pd.DataFrame( + columns=[ + "feature_removed", + "aupr", + "delta_aupr", + "delta_aupr_low", + "delta_aupr_high", + "pval_aupr", + "precision", + 
"delta_precision", + "delta_precision_low", + "delta_precision_high", + "pval_precision", + ] + ) + # train first model (all features) + model_name = model_name_core + "_full" + feature_list.remove("None") + df_dataset = train_and_predict_once( + df_dataset, X, Y_true, feature_list, model_name, params + ) + Y_new = df_dataset[model_name + ".Score"] + + data = (Y_true, Y_new) + res_aupr = scipy.stats.bootstrap( + data, + statistic_aupr, + n_resamples=n_boot, + paired=True, + confidence_level=0.95, + method="BCa", + ) + thresh = threshold_70_pct_recall(Y_true, Y_new) + res_precision = scipy.stats.bootstrap( + data, + lambda Y_true, Y_new: statistic_precision_at_threshold(Y_true, Y_new, thresh), + n_resamples=n_boot, + paired=True, + confidence_level=0.95, + method="BCa", + ) + aupr = np.mean(res_aupr.bootstrap_distribution) + precision = np.mean(res_precision.bootstrap_distribution) + + df_temp = pd.DataFrame( + { + "feature_removed": "None", + "aupr": aupr, + "delta_aupr": 0, + "delta_aupr_low": 0, + "delta_aupr_high": 0, + "pval_aupr": 1, + "precision": precision, + "delta_precision": 0, + "delta_precision_low": 0, + "delta_precision_high": 0, + "pval_precision": 1, + }, + index=[0], + ) + df = pd.concat([df, df_temp]) + + Y_last = Y_new + + # loop feature list from SBFS through to train next model and compare + for i in range(len(feature_list)): + to_remove = feature_list[0] + print(to_remove) + if len(feature_list) == 1: + df_dataset["all_true"] = 1 + Y_new = df_dataset["all_true"] + else: + feature_list.remove(to_remove) + model_name = model_name_core + "_" + str(i + 1) + df_dataset = train_and_predict_once( + df_dataset, X, Y_true, feature_list, model_name, params + ) + Y_new = df_dataset[model_name + ".Score"] + + data_compare = (Y_true, Y_last, Y_new) + thresh_last = threshold_70_pct_recall(Y_true, Y_last) + thresh_new = threshold_70_pct_recall(Y_true, Y_new) + res_delta_aupr = scipy.stats.bootstrap( + data_compare, + statistic_delta_aupr, + 
n_resamples=n_boot, + paired=True, + confidence_level=0.95, + method="BCa", + ) + res_delta_precision = scipy.stats.bootstrap( + data_compare, + lambda Y_true, Y_last, Y_new,: statistic_delta_precision_at_threshold( + Y_true, Y_last, Y_new, thresh_last, thresh_new + ), + n_resamples=n_boot, + paired=True, + confidence_level=0.95, + method="BCa", + ) + delta_aupr = np.mean( + res_delta_aupr.bootstrap_distribution + ) # aupr_new - aupr_last + delta_precision = np.mean(res_delta_precision.bootstrap_distribution) + pval_aupr = bootstrap_pvalue(delta_aupr, res_delta_aupr) + pval_precision = bootstrap_pvalue(delta_precision, res_delta_precision) + + data = (Y_true, Y_new) + res_aupr = scipy.stats.bootstrap( + data, + statistic_aupr, + n_resamples=n_boot, + paired=True, + confidence_level=0.95, + method="BCa", + ) + thresh = threshold_70_pct_recall(Y_true, Y_new) + res_precision = scipy.stats.bootstrap( + data, + lambda Y_true, Y_new: statistic_precision_at_threshold( + Y_true, Y_new, thresh + ), + n_resamples=n_boot, + paired=True, + confidence_level=0.95, + method="BCa", + ) + aupr = np.mean(res_aupr.bootstrap_distribution) + precision = np.mean(res_precision.bootstrap_distribution) + + df_temp = pd.DataFrame( + { + "feature_removed": to_remove, + "aupr": aupr, + "delta_aupr": delta_aupr, + "delta_aupr_low": res_delta_aupr.confidence_interval[0], + "delta_aupr_high": res_delta_aupr.confidence_interval[1], + "pval_aupr": pval_aupr, + "precision": precision, + "delta_precision": delta_precision, + "delta_precision_low": res_delta_precision.confidence_interval[0], + "delta_precision_high": res_delta_precision.confidence_interval[1], + "pval_precision": pval_precision, + }, + index=[0], + ) + df = pd.concat([df, df_temp]) + Y_last = Y_new + + return df @click.command() @@ -297,20 +297,20 @@ def SBFS_significance( @click.option("--epsilon", type=float, default=0.01) @click.option("--params_file", required=True) def main( - crispr_features_file, feature_table_file, out_dir, 
polynomial, params_file, epsilon + crispr_features_file, feature_table_file, out_dir, polynomial, params_file, epsilon ): - model_name = "E2G" - df_dataset = pd.read_csv(crispr_features_file, sep="\t") - feature_table = pd.read_csv(feature_table_file, sep="\t") - with open(params_file, "rb") as handle: - params = pickle.load(handle) + model_name = "E2G" + df_dataset = pd.read_csv(crispr_features_file, sep="\t") + feature_table = pd.read_csv(feature_table_file, sep="\t") + with open(params_file, "rb") as handle: + params = pickle.load(handle) - res = SBFS_significance( - df_dataset, feature_table, model_name, epsilon, params, polynomial, n_boot=1000 - ) + res = SBFS_significance( + df_dataset, feature_table, model_name, epsilon, params, polynomial, n_boot=1000 + ) - res.to_csv(out_dir + "/backward_feature_selection.tsv", sep="\t", index=False) + res.to_csv(out_dir + "/backward_feature_selection.tsv", sep="\t", index=False) if __name__ == "__main__": - main() + main() diff --git a/workflow/scripts/model_training/compare_all_feature_sets.py b/workflow/scripts/model_training/compare_all_feature_sets.py index 04546c1..f8594a1 100644 --- a/workflow/scripts/model_training/compare_all_feature_sets.py +++ b/workflow/scripts/model_training/compare_all_feature_sets.py @@ -7,7 +7,7 @@ statistic_aupr, statistic_precision_at_threshold, train_and_predict_once, - threshold_70_pct_recall + threshold_70_pct_recall, ) @@ -65,7 +65,9 @@ def compare_feature_sets(df_dataset, feature_table, epsilon, params, n_boot): res_prec = scipy.stats.bootstrap( (Y_true, Y_pred), - lambda Y_true, Y_pred: statistic_precision_at_threshold(Y_true, Y_pred, thresh), + lambda Y_true, Y_pred: statistic_precision_at_threshold( + Y_true, Y_pred, thresh + ), n_resamples=n_boot, paired=True, confidence_level=0.95, diff --git a/workflow/scripts/model_training/compare_all_models.py b/workflow/scripts/model_training/compare_all_models.py index 832407b..3277ad4 100644 --- 
a/workflow/scripts/model_training/compare_all_models.py +++ b/workflow/scripts/model_training/compare_all_models.py @@ -13,32 +13,43 @@ def performance_summary( - model_id, dataset, pred_file, missing_file, model_name, out_dir, crispr_data="", n_boot=1000 + model_id, + dataset, + pred_file, + missing_file, + model_name, + out_dir, + crispr_data="", + n_boot=1000, ): # read in predicitons if model_id == "distance": # Determine whether crispr_data is a str or dict - if crispr_data.strip().startswith('{') and crispr_data.strip().endswith('}'): + if crispr_data.strip().startswith("{") and crispr_data.strip().endswith("}"): all_crispr_dfs = [] - print("INFO: --crispr_data appears to be a dictionary. Parsing and concatenating all files.") + print( + "INFO: --crispr_data appears to be a dictionary. Parsing and concatenating all files." + ) try: # Safely parse the string into a Python dictionary crispr_data_dict = ast.literal_eval(crispr_data) if not isinstance(crispr_data_dict, dict): raise TypeError("Parsed data is not a dictionary.") - + # Iterate through the file paths in the dictionary's values for key, filepath in crispr_data_dict.items(): print(f" - Loading '{key}': {filepath}") df = pd.read_csv(filepath, sep="\t") all_crispr_dfs.append(df) - + # Concatenate all loaded DataFrames into one crispr_df = pd.concat(all_crispr_dfs, ignore_index=True) - + except (ValueError, SyntaxError, TypeError) as e: # If parsing fails, raise an error because the format was ambiguous - raise ValueError(f"Failed to parse --crispr_data as a dictionary. Error: {e}. Content: {crispr_data}") + raise ValueError( + f"Failed to parse --crispr_data as a dictionary. Error: {e}. Content: {crispr_data}" + ) else: # If it's not a dictionary string, treat it as a single file path print(f"INFO: --crispr_data is a single file path. 
Loading: {crispr_data}") @@ -138,7 +149,7 @@ def main(all_pred, all_missing, model_config_file, output_file, crispr_data, out ) model_config["prediction_file"] = all_pred model_config["missing_file"] = all_missing - + # initiate final df df = pd.DataFrame( columns=[ @@ -157,7 +168,14 @@ def main(all_pred, all_missing, model_config_file, output_file, crispr_data, out # iterate through rows of model config and add results to final df for row in model_config.itertuples(index=False): - res_row = performance_summary(row.model, row.dataset, row.prediction_file, row.missing_file, model_name, out_dir) + res_row = performance_summary( + row.model, + row.dataset, + row.prediction_file, + row.missing_file, + model_name, + out_dir, + ) df = pd.concat([df, res_row]) # add row for distance res_row = performance_summary( diff --git a/workflow/scripts/model_training/forward_sequential_feature_selection.py b/workflow/scripts/model_training/forward_sequential_feature_selection.py index 8a9f76a..7a320c2 100644 --- a/workflow/scripts/model_training/forward_sequential_feature_selection.py +++ b/workflow/scripts/model_training/forward_sequential_feature_selection.py @@ -7,264 +7,264 @@ from sklearn.preprocessing import PolynomialFeatures from sklearn.linear_model import LogisticRegression from training_functions import ( - statistic_aupr, - statistic_precision, - train_and_predict_once, - bootstrap_pvalue, - statistic_delta_aupr, - threshold_70_pct_recall, - statistic_precision_at_threshold, - statistic_delta_precision_at_threshold, + statistic_aupr, + statistic_precision, + train_and_predict_once, + bootstrap_pvalue, + statistic_delta_aupr, + threshold_70_pct_recall, + statistic_precision_at_threshold, + statistic_delta_precision_at_threshold, ) def SFFS(df_dataset, feature_table, model_name, epsilon, params, polynomial=False): - # specify feature list - feature_list_core = feature_table["feature"] - X = df_dataset.loc[:, feature_list_core] - model_name_core = model_name - feature_list 
= feature_list_core - - # transform features first - X = np.log(np.abs(X) + epsilon) - Y_true = df_dataset["Regulated"].values.astype(np.int64) - - # optionally add polynomial features - if polynomial: - poly = PolynomialFeatures(degree=2) - X = poly.fit_transform(X) - polynomial_names = poly.get_feature_names_out(feature_list_core) - X = pd.DataFrame(X, columns=polynomial_names) - feature_list = pd.Series(polynomial_names) - - n_features = len(feature_list) - - best_features = [] - all_auprs = [] - all_precisions = [] - - for i in range(n_features): # iterate (# of feature) times - best_k = -1 - best_aupr = 0 - best_precision = 0 - for k in range( - len(feature_list) - ): # iterate through each feature, choose best one - new_feature = feature_list[k] - print(new_feature) - features = best_features + [new_feature] - print(features) - model_name = model_name_core + "_" + str(i + 1) + "_" + str(k + 1) - print(model_name) - - df_dataset = train_and_predict_once( - df_dataset, X, Y_true, features, model_name, params - ) - - # evaluate model once trained - Y_pred = df_dataset[model_name + ".Score"] - aupr = statistic_aupr(Y_true, Y_pred) - thresh = threshold_70_pct_recall(Y_true, Y_pred) - precision_at_70_pct_recall = statistic_precision_at_threshold( - Y_true, Y_pred, thresh - ) - - if aupr >= best_aupr: - best_aupr = aupr - best_k = k - best_precision = precision_at_70_pct_recall - - # update best feature - best_features = best_features + [feature_list[best_k]] - all_auprs = all_auprs + [best_aupr] - all_precisions = all_precisions + [best_precision] - feature_list = feature_list.loc[ - feature_list != feature_list[best_k] - ].reset_index(drop=True) - print(best_features) - print(all_auprs) - print(all_precisions) - - return best_features + # specify feature list + feature_list_core = feature_table["feature"] + X = df_dataset.loc[:, feature_list_core] + model_name_core = model_name + feature_list = feature_list_core + + # transform features first + X = 
np.log(np.abs(X) + epsilon) + Y_true = df_dataset["Regulated"].values.astype(np.int64) + + # optionally add polynomial features + if polynomial: + poly = PolynomialFeatures(degree=2) + X = poly.fit_transform(X) + polynomial_names = poly.get_feature_names_out(feature_list_core) + X = pd.DataFrame(X, columns=polynomial_names) + feature_list = pd.Series(polynomial_names) + + n_features = len(feature_list) + + best_features = [] + all_auprs = [] + all_precisions = [] + + for i in range(n_features): # iterate (# of feature) times + best_k = -1 + best_aupr = 0 + best_precision = 0 + for k in range( + len(feature_list) + ): # iterate through each feature, choose best one + new_feature = feature_list[k] + print(new_feature) + features = best_features + [new_feature] + print(features) + model_name = model_name_core + "_" + str(i + 1) + "_" + str(k + 1) + print(model_name) + + df_dataset = train_and_predict_once( + df_dataset, X, Y_true, features, model_name, params + ) + + # evaluate model once trained + Y_pred = df_dataset[model_name + ".Score"] + aupr = statistic_aupr(Y_true, Y_pred) + thresh = threshold_70_pct_recall(Y_true, Y_pred) + precision_at_70_pct_recall = statistic_precision_at_threshold( + Y_true, Y_pred, thresh + ) + + if aupr >= best_aupr: + best_aupr = aupr + best_k = k + best_precision = precision_at_70_pct_recall + + # update best feature + best_features = best_features + [feature_list[best_k]] + all_auprs = all_auprs + [best_aupr] + all_precisions = all_precisions + [best_precision] + feature_list = feature_list.loc[ + feature_list != feature_list[best_k] + ].reset_index(drop=True) + print(best_features) + print(all_auprs) + print(all_precisions) + + return best_features def SFFS_significance( - df_dataset, - feature_table, - model_name, - epsilon, - params, - polynomial=False, - n_boot=1000, + df_dataset, + feature_table, + model_name, + epsilon, + params, + polynomial=False, + n_boot=1000, ): - feature_list = SFFS( - df_dataset, feature_table, 
model_name, epsilon, params, polynomial - ) # get order of features - feature_list_core = feature_table["feature"] - model_name_core = model_name - - # calculate polynomial features - if polynomial: - X_core = df_dataset.loc[:, feature_list_core] - poly = PolynomialFeatures(degree=2) - X_2 = poly.fit_transform(X_core) - polynomial_names = poly.get_feature_names_out(X_core.columns) - X_2 = pd.DataFrame(X_2, columns=polynomial_names) - X = X_2 - else: - X = df_dataset.loc[:, feature_list_core] - - X = np.log(np.abs(X) + epsilon) - Y_true = df_dataset["Regulated"].values.astype(np.int64) - - df = pd.DataFrame( - columns=[ - "feature_added", - "aupr", - "delta_aupr", - "delta_aupr_low", - "delta_aupr_high", - "pval_aupr", - "precision", - "delta_precision", - "delta_precision_low", - "delta_precision_high", - "pval_precision", - ] - ) - # evaluate baseline model - df_dataset["all_true"] = 1 - Y_new = df_dataset["all_true"] - - data = (Y_true, Y_new) - res_aupr = scipy.stats.bootstrap( - data, - statistic_aupr, - n_resamples=n_boot, - paired=True, - confidence_level=0.95, - method="BCa", - ) - thresh = threshold_70_pct_recall( - Y_true, Y_new - ) # will return None if max recall < 70% - res_precision = scipy.stats.bootstrap( - data, - lambda Y_true, Y_pred: statistic_precision_at_threshold(Y_true, Y_pred, thresh), - n_resamples=n_boot, - paired=True, - confidence_level=0.95, - method="BCa", - ) - - aupr = np.mean(res_aupr.bootstrap_distribution) - precision = np.mean(res_precision.bootstrap_distribution) - - df_temp = pd.DataFrame( - { - "feature_added": "None", - "aupr": aupr, - "delta_aupr": 0, - "delta_aupr_low": 0, - "delta_aupr_high": 0, - "pval_aupr": 1, - "precision": precision, - "delta_precision": 0, - "delta_precision_low": 0, - "delta_precision_high": 0, - "pval_precision": 1, - }, - index=[0], - ) - df = pd.concat([df, df_temp]) - Y_last = Y_new - - features = [] - # loop through to train next model and compare - for i in range(0, len(feature_list)): - 
feature_added = feature_list[i] - features = features + [feature_added] - model_name = model_name_core + "_" + str(i + 1) - print(model_name) - df_dataset = train_and_predict_once( - df_dataset, X, Y_true, features, model_name, params - ) - Y_new = df_dataset[model_name + ".Score"] - - data_compare = (Y_true, Y_last, Y_new) - thresh_last = threshold_70_pct_recall(Y_true, Y_last) - thresh_new = threshold_70_pct_recall(Y_true, Y_new) - res_delta_aupr = scipy.stats.bootstrap( - data_compare, - statistic_delta_aupr, - n_resamples=n_boot, - paired=True, - confidence_level=0.95, - method="BCa", - ) - res_delta_precision = scipy.stats.bootstrap( - data_compare, - lambda Y_true, Y_last, Y_new,: statistic_delta_precision_at_threshold( - Y_true, Y_last, Y_new, thresh_last, thresh_new - ), - n_resamples=n_boot, - paired=True, - confidence_level=0.95, - method="BCa", - ) - delta_aupr = np.mean( - res_delta_aupr.bootstrap_distribution - ) # aupr_new - aupr_last - delta_precision = np.mean(res_delta_precision.bootstrap_distribution) - pval_aupr = bootstrap_pvalue(delta_aupr, res_delta_aupr) - pval_precision = bootstrap_pvalue(delta_precision, res_delta_precision) - - data = (Y_true, Y_new) - res_aupr = scipy.stats.bootstrap( - data, - statistic_aupr, - n_resamples=n_boot, - paired=True, - confidence_level=0.95, - method="BCa", - ) - thresh = threshold_70_pct_recall( - Y_true, Y_new - ) # will return None if max recall < 70% - res_precision = scipy.stats.bootstrap( - data, - lambda Y_true, Y_pred: statistic_precision_at_threshold( - Y_true, Y_pred, thresh - ), - n_resamples=n_boot, - paired=True, - confidence_level=0.95, - method="BCa", - ) - - aupr = np.mean(res_aupr.bootstrap_distribution) - precision = np.mean(res_precision.bootstrap_distribution) - - df_temp = pd.DataFrame( - { - "feature_added": feature_added, - "aupr": aupr, - "delta_aupr": delta_aupr, - "delta_aupr_low": res_delta_aupr.confidence_interval[0], - "delta_aupr_high": res_delta_aupr.confidence_interval[1], - 
"pval_aupr": pval_aupr, - "precision": precision, - "delta_precision": delta_precision, - "delta_precision_low": res_delta_precision.confidence_interval[0], - "delta_precision_high": res_delta_precision.confidence_interval[1], - "pval_precision": pval_precision, - }, - index=[0], - ) - df = pd.concat([df, df_temp]) - Y_last = Y_new - - return df + feature_list = SFFS( + df_dataset, feature_table, model_name, epsilon, params, polynomial + ) # get order of features + feature_list_core = feature_table["feature"] + model_name_core = model_name + + # calculate polynomial features + if polynomial: + X_core = df_dataset.loc[:, feature_list_core] + poly = PolynomialFeatures(degree=2) + X_2 = poly.fit_transform(X_core) + polynomial_names = poly.get_feature_names_out(X_core.columns) + X_2 = pd.DataFrame(X_2, columns=polynomial_names) + X = X_2 + else: + X = df_dataset.loc[:, feature_list_core] + + X = np.log(np.abs(X) + epsilon) + Y_true = df_dataset["Regulated"].values.astype(np.int64) + + df = pd.DataFrame( + columns=[ + "feature_added", + "aupr", + "delta_aupr", + "delta_aupr_low", + "delta_aupr_high", + "pval_aupr", + "precision", + "delta_precision", + "delta_precision_low", + "delta_precision_high", + "pval_precision", + ] + ) + # evaluate baseline model + df_dataset["all_true"] = 1 + Y_new = df_dataset["all_true"] + + data = (Y_true, Y_new) + res_aupr = scipy.stats.bootstrap( + data, + statistic_aupr, + n_resamples=n_boot, + paired=True, + confidence_level=0.95, + method="BCa", + ) + thresh = threshold_70_pct_recall( + Y_true, Y_new + ) # will return None if max recall < 70% + res_precision = scipy.stats.bootstrap( + data, + lambda Y_true, Y_pred: statistic_precision_at_threshold(Y_true, Y_pred, thresh), + n_resamples=n_boot, + paired=True, + confidence_level=0.95, + method="BCa", + ) + + aupr = np.mean(res_aupr.bootstrap_distribution) + precision = np.mean(res_precision.bootstrap_distribution) + + df_temp = pd.DataFrame( + { + "feature_added": "None", + "aupr": aupr, 
+ "delta_aupr": 0, + "delta_aupr_low": 0, + "delta_aupr_high": 0, + "pval_aupr": 1, + "precision": precision, + "delta_precision": 0, + "delta_precision_low": 0, + "delta_precision_high": 0, + "pval_precision": 1, + }, + index=[0], + ) + df = pd.concat([df, df_temp]) + Y_last = Y_new + + features = [] + # loop through to train next model and compare + for i in range(0, len(feature_list)): + feature_added = feature_list[i] + features = features + [feature_added] + model_name = model_name_core + "_" + str(i + 1) + print(model_name) + df_dataset = train_and_predict_once( + df_dataset, X, Y_true, features, model_name, params + ) + Y_new = df_dataset[model_name + ".Score"] + + data_compare = (Y_true, Y_last, Y_new) + thresh_last = threshold_70_pct_recall(Y_true, Y_last) + thresh_new = threshold_70_pct_recall(Y_true, Y_new) + res_delta_aupr = scipy.stats.bootstrap( + data_compare, + statistic_delta_aupr, + n_resamples=n_boot, + paired=True, + confidence_level=0.95, + method="BCa", + ) + res_delta_precision = scipy.stats.bootstrap( + data_compare, + lambda Y_true, Y_last, Y_new,: statistic_delta_precision_at_threshold( + Y_true, Y_last, Y_new, thresh_last, thresh_new + ), + n_resamples=n_boot, + paired=True, + confidence_level=0.95, + method="BCa", + ) + delta_aupr = np.mean( + res_delta_aupr.bootstrap_distribution + ) # aupr_new - aupr_last + delta_precision = np.mean(res_delta_precision.bootstrap_distribution) + pval_aupr = bootstrap_pvalue(delta_aupr, res_delta_aupr) + pval_precision = bootstrap_pvalue(delta_precision, res_delta_precision) + + data = (Y_true, Y_new) + res_aupr = scipy.stats.bootstrap( + data, + statistic_aupr, + n_resamples=n_boot, + paired=True, + confidence_level=0.95, + method="BCa", + ) + thresh = threshold_70_pct_recall( + Y_true, Y_new + ) # will return None if max recall < 70% + res_precision = scipy.stats.bootstrap( + data, + lambda Y_true, Y_pred: statistic_precision_at_threshold( + Y_true, Y_pred, thresh + ), + n_resamples=n_boot, + 
paired=True, + confidence_level=0.95, + method="BCa", + ) + + aupr = np.mean(res_aupr.bootstrap_distribution) + precision = np.mean(res_precision.bootstrap_distribution) + + df_temp = pd.DataFrame( + { + "feature_added": feature_added, + "aupr": aupr, + "delta_aupr": delta_aupr, + "delta_aupr_low": res_delta_aupr.confidence_interval[0], + "delta_aupr_high": res_delta_aupr.confidence_interval[1], + "pval_aupr": pval_aupr, + "precision": precision, + "delta_precision": delta_precision, + "delta_precision_low": res_delta_precision.confidence_interval[0], + "delta_precision_high": res_delta_precision.confidence_interval[1], + "pval_precision": pval_precision, + }, + index=[0], + ) + df = pd.concat([df, df_temp]) + Y_last = Y_new + + return df @click.command() @@ -275,21 +275,21 @@ def SFFS_significance( @click.option("--epsilon", type=float, default=0.01) @click.option("--params_file", required=True) def main( - crispr_features_file, feature_table_file, out_dir, polynomial, epsilon, params_file + crispr_features_file, feature_table_file, out_dir, polynomial, epsilon, params_file ): - model_name = "E2G" - df_dataset = pd.read_csv(crispr_features_file, sep="\t") - feature_table = pd.read_csv(feature_table_file, sep="\t") + model_name = "E2G" + df_dataset = pd.read_csv(crispr_features_file, sep="\t") + feature_table = pd.read_csv(feature_table_file, sep="\t") - with open(params_file, "rb") as handle: - params = pickle.load(handle) + with open(params_file, "rb") as handle: + params = pickle.load(handle) - res = SFFS_significance( - df_dataset, feature_table, model_name, epsilon, params, polynomial, n_boot=1000 - ) + res = SFFS_significance( + df_dataset, feature_table, model_name, epsilon, params, polynomial, n_boot=1000 + ) - res.to_csv(out_dir + "/forward_feature_selection.tsv", sep="\t", index=False) + res.to_csv(out_dir + "/forward_feature_selection.tsv", sep="\t", index=False) if __name__ == "__main__": - main() + main() diff --git 
a/workflow/scripts/model_training/permutation_feature_importance.py b/workflow/scripts/model_training/permutation_feature_importance.py index 6bea740..7c5b4f8 100644 --- a/workflow/scripts/model_training/permutation_feature_importance.py +++ b/workflow/scripts/model_training/permutation_feature_importance.py @@ -7,89 +7,89 @@ from sklearn.preprocessing import PolynomialFeatures from sklearn.linear_model import LogisticRegression from training_functions import ( - statistic_aupr, - statistic_precision, - train_and_predict_once, - bootstrap_pvalue, - statistic_delta_aupr, - statistic_delta_precision_at_threshold, - threshold_70_pct_recall, + statistic_aupr, + statistic_precision, + train_and_predict_once, + bootstrap_pvalue, + statistic_delta_aupr, + statistic_delta_precision_at_threshold, + threshold_70_pct_recall, ) def permutation_feature_importance( - df_dataset, - feature_table, - model_name, - epsilon, - params, - n_repeats=20, - polynomial=False, + df_dataset, + feature_table, + model_name, + epsilon, + params, + n_repeats=20, + polynomial=False, ): - # specify feature list - feature_list_core = feature_table["feature"] - model_name_core = model_name - - X = df_dataset.loc[:, feature_list_core] - feature_list = feature_list_core - - # transform features first - X = np.log(np.abs(X) + epsilon) - Y_true = df_dataset["Regulated"].values.astype(np.int64) - - # optionally add polynomial features - if polynomial: - poly = PolynomialFeatures(degree=2) - X = poly.fit_transform(X) - polynomial_names = poly.get_feature_names_out(feature_list_core) - X = pd.DataFrame(X, columns=polynomial_names) - feature_list = pd.Series(polynomial_names) - - # train full model - model_name = model_name_core + "_full" - df_dataset = train_and_predict_once( - df_dataset, X, Y_true, feature_list, model_name, params - ) - Y_full = df_dataset[model_name + ".Score"] - - df = pd.DataFrame(columns=["feature_permuted", "delta_aupr", "delta_precision"]) - - for i in range(len(feature_list)): # 
iterate through features - print(feature_list[i]) - delta_aupr_feature = [] - delta_precision_feature = [] - original_values = X[feature_list[i]] - - for j in range(n_repeats): - X[feature_list[i]] = np.random.permutation(X[feature_list[i]]) - model_name = model_name_core + "_shuff_" + feature_list[i] - df_dataset = train_and_predict_once( - df_dataset, X, Y_true, feature_list, model_name, params - ) - Y_shuffle = df_dataset[model_name + ".Score"] - - delta_aupr = statistic_delta_aupr(Y_true, Y_full, Y_shuffle) - thresh_full = threshold_70_pct_recall(Y_true, Y_full) - thresh_shuffle = threshold_70_pct_recall(Y_true, Y_shuffle) - delta_precision = statistic_delta_precision_at_threshold( - Y_true, Y_full, Y_shuffle, thresh_full, thresh_shuffle - ) - - delta_aupr_feature = delta_aupr_feature + [delta_aupr] - delta_precision_feature = delta_precision_feature + [delta_precision] - - df_temp = pd.DataFrame( - { - "delta_aupr": delta_aupr_feature, - "delta_precision": delta_precision_feature, - } - ) - df_temp["feature_permuted"] = feature_list[i] - df = pd.concat([df, df_temp]) - - X[feature_list[i]] = original_values # reset feature values - - return df + # specify feature list + feature_list_core = feature_table["feature"] + model_name_core = model_name + + X = df_dataset.loc[:, feature_list_core] + feature_list = feature_list_core + + # transform features first + X = np.log(np.abs(X) + epsilon) + Y_true = df_dataset["Regulated"].values.astype(np.int64) + + # optionally add polynomial features + if polynomial: + poly = PolynomialFeatures(degree=2) + X = poly.fit_transform(X) + polynomial_names = poly.get_feature_names_out(feature_list_core) + X = pd.DataFrame(X, columns=polynomial_names) + feature_list = pd.Series(polynomial_names) + + # train full model + model_name = model_name_core + "_full" + df_dataset = train_and_predict_once( + df_dataset, X, Y_true, feature_list, model_name, params + ) + Y_full = df_dataset[model_name + ".Score"] + + df = 
pd.DataFrame(columns=["feature_permuted", "delta_aupr", "delta_precision"]) + + for i in range(len(feature_list)): # iterate through features + print(feature_list[i]) + delta_aupr_feature = [] + delta_precision_feature = [] + original_values = X[feature_list[i]] + + for j in range(n_repeats): + X[feature_list[i]] = np.random.permutation(X[feature_list[i]]) + model_name = model_name_core + "_shuff_" + feature_list[i] + df_dataset = train_and_predict_once( + df_dataset, X, Y_true, feature_list, model_name, params + ) + Y_shuffle = df_dataset[model_name + ".Score"] + + delta_aupr = statistic_delta_aupr(Y_true, Y_full, Y_shuffle) + thresh_full = threshold_70_pct_recall(Y_true, Y_full) + thresh_shuffle = threshold_70_pct_recall(Y_true, Y_shuffle) + delta_precision = statistic_delta_precision_at_threshold( + Y_true, Y_full, Y_shuffle, thresh_full, thresh_shuffle + ) + + delta_aupr_feature = delta_aupr_feature + [delta_aupr] + delta_precision_feature = delta_precision_feature + [delta_precision] + + df_temp = pd.DataFrame( + { + "delta_aupr": delta_aupr_feature, + "delta_precision": delta_precision_feature, + } + ) + df_temp["feature_permuted"] = feature_list[i] + df = pd.concat([df, df_temp]) + + X[feature_list[i]] = original_values # reset feature values + + return df @click.command() @@ -101,28 +101,28 @@ def permutation_feature_importance( @click.option("--n_repeats", type=float, default=20) @click.option("--params_file", required=True) def main( - crispr_features_file, - feature_table_file, - out_dir, - polynomial, - epsilon, - n_repeats, - params_file, + crispr_features_file, + feature_table_file, + out_dir, + polynomial, + epsilon, + n_repeats, + params_file, ): - model_name = "E2G" - n_repeats = int(n_repeats) - df_dataset = pd.read_csv(crispr_features_file, sep="\t") - feature_table = pd.read_csv(feature_table_file, sep="\t") + model_name = "E2G" + n_repeats = int(n_repeats) + df_dataset = pd.read_csv(crispr_features_file, sep="\t") + feature_table = 
pd.read_csv(feature_table_file, sep="\t") - with open(params_file, "rb") as handle: - params = pickle.load(handle) + with open(params_file, "rb") as handle: + params = pickle.load(handle) - res = permutation_feature_importance( - df_dataset, feature_table, model_name, epsilon, params, n_repeats, polynomial - ) + res = permutation_feature_importance( + df_dataset, feature_table, model_name, epsilon, params, n_repeats, polynomial + ) - res.to_csv(out_dir + "/permutation_feature_importance.tsv", sep="\t", index=False) + res.to_csv(out_dir + "/permutation_feature_importance.tsv", sep="\t", index=False) if __name__ == "__main__": - main() + main() diff --git a/workflow/scripts/model_training/train_model.py b/workflow/scripts/model_training/train_model.py index 8d88ef4..9402059 100644 --- a/workflow/scripts/model_training/train_model.py +++ b/workflow/scripts/model_training/train_model.py @@ -8,142 +8,143 @@ from sklearn.linear_model import LogisticRegression from training_functions import statistic_aupr + def train_and_predict( - df_dataset, feature_table, model_name, out_dir, epsilon, params, polynomial=False + df_dataset, feature_table, model_name, out_dir, epsilon, params, polynomial=False ): - # specify feature list - feature_list_core = feature_table["feature"] - X = df_dataset.loc[:, feature_list_core] - - # transform features first - X = np.log(np.abs(X) + epsilon) - - # optionally add polynomial features - if polynomial: - poly = PolynomialFeatures(degree=2) - X = poly.fit_transform(X) - polynomial_names = poly.get_feature_names_out(feature_list_core) - X = pd.DataFrame(X, columns=polynomial_names) - - - Y = df_dataset["Regulated"].values.astype(np.int64) - - # initialize df for feature weights & metrics - df_coef = pd.DataFrame(columns=["feature", "coefficient", "test_chr"]) - df_metrics = pd.DataFrame( - columns=[ - "test_chr", - "log_loss_test_full", - "log_loss_train", - "log_loss_test", - "AUROC_test_full", - "AUROC_train", - "AUROC_test", - 
"AUPRC_test_full", - "AUPRC_train", - "AUPRC_test", - "n_test_pos", - "n_test_neg", - "n_train_neg", - "n_train_pos", - ] - ) - - # train aggregate model across all chromosomes, calc weights and performance metrics, save full model - model_full = LogisticRegression(**params).fit(X, Y) - probs_full = model_full.predict_proba(X) - df_dataset[model_name + ".Score_full"] = probs_full[:, 1] - coefficients = model_full.coef_[0] - df_temp = pd.DataFrame( - {"feature": X.columns, "coefficient": coefficients, "test_chr": "none"} - ) - df_coef = pd.concat([df_coef, df_temp]) - with open(out_dir + f"/model_full.pkl", "wb") as f: - pickle.dump(model_full, f) - - # logistic regression predictions on chromosome-wise cross validation - idx = np.arange(len(Y)) - chr_list = np.unique(df_dataset["chr"]) - if len(chr_list) > 1: - for chr in chr_list: - idx_test = df_dataset[df_dataset["chr"] == chr].index.values - - if len(idx_test) > 0: - idx_train = np.delete(idx, idx_test) - X_test = X.loc[idx_test, :] - X_train = X.loc[idx_train, :] - Y_test = Y[idx_test] - Y_train = Y[idx_train] - - model = LogisticRegression(**params).fit(X_train, Y_train) - - with open(out_dir + f"/model_test_{chr}.pkl", "wb") as f: - pickle.dump(model, f) - - probs = model.predict_proba(X_test) - df_dataset.loc[idx_test, model_name + ".Score"] = probs[:, 1] - - # performance metrics - n_train_pos = np.sum(Y_train) - n_train_neg = len(Y_train) - n_train_pos - n_test_pos = np.sum(Y_test) - n_test_neg = len(Y_test) - n_test_pos - - multiple_labels = n_test_pos > 0 and n_test_neg > 0 - - ll_train = log_loss(Y_train, model.predict_proba(X_train)[:, 1]) - ll_test_full = ( - log_loss(Y_test, probs_full[idx_test, 1]) - if multiple_labels - else np.NaN - ) - ll_test = log_loss(Y_test, probs[:, 1]) if multiple_labels else np.NaN - auroc_train = roc_auc_score(Y_train, model.predict_proba(X_train)[:, 1]) - auroc_test_full = ( - roc_auc_score(Y_test, probs_full[idx_test, 1]) - if multiple_labels - else np.NaN - ) - 
auroc_test = ( - roc_auc_score(Y_test, probs[:, 1]) if multiple_labels > 0 else np.NaN - ) - auprc_train = statistic_aupr( - Y_train, model.predict_proba(X_train)[:, 1] - ) - auprc_test_full = ( - statistic_aupr(Y_test, probs_full[idx_test, 1]) - if multiple_labels - else np.NaN - ) - auprc_test = ( - statistic_aupr(Y_test, probs[:, 1]) if multiple_labels else np.NaN - ) - - df_temp = pd.DataFrame( - { - "test_chr": [chr], - "log_loss_test_full": [ll_test_full], - "log_loss_train": [ll_train], - "log_loss_test": [ll_test], - "AUROC_test_full": [auroc_test_full], - "AUROC_train": [auroc_train], - "AUROC_test": [auroc_test], - "AUPRC_test_full": [auprc_test_full], - "AUPRC_train": [auprc_train], - "AUPRC_test": [auprc_test], - "n_test_pos": [n_test_pos], - "n_test_neg": [n_test_neg], - "n_train_pos": [n_train_pos], - "n_train_neg": [n_train_neg], - } - ) - df_metrics = pd.concat([df_metrics, df_temp]) - - # save dfs - df_dataset.to_csv(out_dir + "/training_predictions.tsv", sep="\t", index=False) - df_coef.to_csv(out_dir + "/model_coefficients.tsv", sep="\t", index=False) - df_metrics.to_csv(out_dir + "/performance_metrics.tsv", sep="\t", index=False) - + # specify feature list + feature_list_core = feature_table["feature"] + X = df_dataset.loc[:, feature_list_core] + + # transform features first + X = np.log(np.abs(X) + epsilon) + + # optionally add polynomial features + if polynomial: + poly = PolynomialFeatures(degree=2) + X = poly.fit_transform(X) + polynomial_names = poly.get_feature_names_out(feature_list_core) + X = pd.DataFrame(X, columns=polynomial_names) + + Y = df_dataset["Regulated"].values.astype(np.int64) + + # initialize df for feature weights & metrics + df_coef = pd.DataFrame(columns=["feature", "coefficient", "test_chr"]) + df_metrics = pd.DataFrame( + columns=[ + "test_chr", + "log_loss_test_full", + "log_loss_train", + "log_loss_test", + "AUROC_test_full", + "AUROC_train", + "AUROC_test", + "AUPRC_test_full", + "AUPRC_train", + "AUPRC_test", + 
"n_test_pos", + "n_test_neg", + "n_train_neg", + "n_train_pos", + ] + ) + + # train aggregate model across all chromosomes, calc weights and performance metrics, save full model + model_full = LogisticRegression(**params).fit(X, Y) + probs_full = model_full.predict_proba(X) + df_dataset[model_name + ".Score_full"] = probs_full[:, 1] + coefficients = model_full.coef_[0] + df_temp = pd.DataFrame( + {"feature": X.columns, "coefficient": coefficients, "test_chr": "none"} + ) + df_coef = pd.concat([df_coef, df_temp]) + with open(out_dir + f"/model_full.pkl", "wb") as f: + pickle.dump(model_full, f) + + # logistic regression predictions on chromosome-wise cross validation + idx = np.arange(len(Y)) + chr_list = np.unique(df_dataset["chr"]) + if len(chr_list) > 1: + for chr in chr_list: + idx_test = df_dataset[df_dataset["chr"] == chr].index.values + + if len(idx_test) > 0: + idx_train = np.delete(idx, idx_test) + X_test = X.loc[idx_test, :] + X_train = X.loc[idx_train, :] + Y_test = Y[idx_test] + Y_train = Y[idx_train] + + model = LogisticRegression(**params).fit(X_train, Y_train) + + with open(out_dir + f"/model_test_{chr}.pkl", "wb") as f: + pickle.dump(model, f) + + probs = model.predict_proba(X_test) + df_dataset.loc[idx_test, model_name + ".Score"] = probs[:, 1] + + # performance metrics + n_train_pos = np.sum(Y_train) + n_train_neg = len(Y_train) - n_train_pos + n_test_pos = np.sum(Y_test) + n_test_neg = len(Y_test) - n_test_pos + + multiple_labels = n_test_pos > 0 and n_test_neg > 0 + + ll_train = log_loss(Y_train, model.predict_proba(X_train)[:, 1]) + ll_test_full = ( + log_loss(Y_test, probs_full[idx_test, 1]) + if multiple_labels + else np.NaN + ) + ll_test = log_loss(Y_test, probs[:, 1]) if multiple_labels else np.NaN + auroc_train = roc_auc_score(Y_train, model.predict_proba(X_train)[:, 1]) + auroc_test_full = ( + roc_auc_score(Y_test, probs_full[idx_test, 1]) + if multiple_labels + else np.NaN + ) + auroc_test = ( + roc_auc_score(Y_test, probs[:, 1]) + if 
multiple_labels > 0 + else np.NaN + ) + auprc_train = statistic_aupr( + Y_train, model.predict_proba(X_train)[:, 1] + ) + auprc_test_full = ( + statistic_aupr(Y_test, probs_full[idx_test, 1]) + if multiple_labels + else np.NaN + ) + auprc_test = ( + statistic_aupr(Y_test, probs[:, 1]) if multiple_labels else np.NaN + ) + + df_temp = pd.DataFrame( + { + "test_chr": [chr], + "log_loss_test_full": [ll_test_full], + "log_loss_train": [ll_train], + "log_loss_test": [ll_test], + "AUROC_test_full": [auroc_test_full], + "AUROC_train": [auroc_train], + "AUROC_test": [auroc_test], + "AUPRC_test_full": [auprc_test_full], + "AUPRC_train": [auprc_train], + "AUPRC_test": [auprc_test], + "n_test_pos": [n_test_pos], + "n_test_neg": [n_test_neg], + "n_train_pos": [n_train_pos], + "n_train_neg": [n_train_neg], + } + ) + df_metrics = pd.concat([df_metrics, df_temp]) + + # save dfs + df_dataset.to_csv(out_dir + "/training_predictions.tsv", sep="\t", index=False) + df_coef.to_csv(out_dir + "/model_coefficients.tsv", sep="\t", index=False) + df_metrics.to_csv(out_dir + "/performance_metrics.tsv", sep="\t", index=False) @click.command() @@ -154,18 +155,18 @@ def train_and_predict( @click.option("--epsilon", type=float, default=0.01) @click.option("--params_file", required=True) def main( - crispr_features_file, feature_table_file, out_dir, polynomial, params_file, epsilon + crispr_features_file, feature_table_file, out_dir, polynomial, params_file, epsilon ): - model_name = "E2G" - df_dataset = pd.read_csv(crispr_features_file, sep="\t") - feature_table = pd.read_csv(feature_table_file, sep="\t") - with open(params_file, "rb") as handle: - params = pickle.load(handle) + model_name = "E2G" + df_dataset = pd.read_csv(crispr_features_file, sep="\t") + feature_table = pd.read_csv(feature_table_file, sep="\t") + with open(params_file, "rb") as handle: + params = pickle.load(handle) - train_and_predict( - df_dataset, feature_table, model_name, out_dir, epsilon, params, polynomial - ) + 
train_and_predict( + df_dataset, feature_table, model_name, out_dir, epsilon, params, polynomial + ) if __name__ == "__main__": - main() + main() diff --git a/workflow/scripts/model_training/training_functions.py b/workflow/scripts/model_training/training_functions.py index 34b6ad8..d0d5a84 100644 --- a/workflow/scripts/model_training/training_functions.py +++ b/workflow/scripts/model_training/training_functions.py @@ -4,7 +4,6 @@ from sklearn.preprocessing import PolynomialFeatures from sklearn.linear_model import LogisticRegression - ## statistic functions for delta auPR/precision to be used for scipy.stats.bootstrap From 009cd97897c66ab3cb0e7a8888b4227c01f85743 Mon Sep 17 00:00:00 2001 From: Kayla Brand Date: Tue, 10 Feb 2026 18:19:57 -0800 Subject: [PATCH 15/22] Exclude ABC submodule from black checks --- pyproject.toml | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 pyproject.toml diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..061d7cd --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,4 @@ +[tool.black] +extend-exclude = ''' +^ABC/ +''' From 8952800ebeeb01e92bbe5c3e2248c0d4260fcaa4 Mon Sep 17 00:00:00 2001 From: Kayla Brand Date: Tue, 10 Feb 2026 18:40:00 -0800 Subject: [PATCH 16/22] remove extraneous comments from code counting num TSS btw enh and gene --- workflow/rules/genomewide_features.smk | 5 ++--- workflow/scripts/feature_tables/gen_num_tss_enh_gene.py | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/workflow/rules/genomewide_features.smk b/workflow/rules/genomewide_features.smk index ac91ce6..a3a33e3 100644 --- a/workflow/rules/genomewide_features.smk +++ b/workflow/rules/genomewide_features.smk @@ -93,11 +93,10 @@ rule generate_num_tss_enh_gene: conda: "../envs/encode_re2g.yml" resources: - mem_mb=partial(ABC.determine_mem_mb) #, min_gb=32) + mem_mb=partial(ABC.determine_mem_mb) output: numTSSEnhGene = os.path.join(RESULTS_DIR, "{biosample}", "new_features", "NumTSSEnhGene.tsv"), 
extendedEnhancerRegions = temp(os.path.join(RESULTS_DIR, "{biosample}", "new_features", "extendedEnhancerRegions.txt")), - # enhancerTSSInt = temp(os.path.join(RESULTS_DIR, "{biosample}", "new_features", "extendedEnhancerRegions_TSS_int.tsv.gz")) shell: """ python {params.scripts_dir}/feature_tables/gen_num_tss_enh_gene.py \ @@ -105,7 +104,7 @@ rule generate_num_tss_enh_gene: --ref_gene_tss {params.gene_TSS500} \ --extended_enhancers {output.extendedEnhancerRegions} \ --out_file {output.numTSSEnhGene} - """ # --enhancer_tss_int {output.enhancerTSSInt} \ + """ # generate features "numNearbyEnhancers" and "sumNearbyEnhancers" rule generate_num_sum_enhancers: diff --git a/workflow/scripts/feature_tables/gen_num_tss_enh_gene.py b/workflow/scripts/feature_tables/gen_num_tss_enh_gene.py index bf68eee..716b051 100644 --- a/workflow/scripts/feature_tables/gen_num_tss_enh_gene.py +++ b/workflow/scripts/feature_tables/gen_num_tss_enh_gene.py @@ -4,7 +4,7 @@ def determine_num_tss_enh_gene( - pred_df, ref_gene_tss, extended_enhancers, out_file # enhancer_tss_int + pred_df, ref_gene_tss, extended_enhancers, out_file ): # make the end be midpoint of enhancer + distance (This gives you the end coordinate of distance range) pred_df["midpoint"] = ((pred_df["start"] + pred_df["end"]) / 2).astype("int") From 6e7b2c846aa8ee53d746b5a40939d273dd827ae7 Mon Sep 17 00:00:00 2001 From: Kayla Brand Date: Wed, 11 Feb 2026 14:32:52 -0800 Subject: [PATCH 17/22] Only apply black to workflow dir --- .circleci/config.yml | 10 +++++----- pyproject.toml | 4 ---- 2 files changed, 5 insertions(+), 9 deletions(-) delete mode 100644 pyproject.toml diff --git a/.circleci/config.yml b/.circleci/config.yml index 8b4d912..205fcbb 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -21,11 +21,11 @@ jobs: - run: name: Install basic OS pkgs command: apt-get update && apt-get -y install curl vim - # - run: - # name: Run Linter (python black) - # command: | - # mamba install black - # black . 
--check + - run: + name: Run Linter (python black) + command: | + mamba install black + black workflow/ --check - run: name: Install env no_output_timeout: 30m diff --git a/pyproject.toml b/pyproject.toml deleted file mode 100644 index 061d7cd..0000000 --- a/pyproject.toml +++ /dev/null @@ -1,4 +0,0 @@ -[tool.black] -extend-exclude = ''' -^ABC/ -''' From c6e004f52fe2d76b350623ca2c16c036ffe1791c Mon Sep 17 00:00:00 2001 From: Kayla Brand Date: Wed, 11 Feb 2026 14:52:11 -0800 Subject: [PATCH 18/22] unified crispr_dataset handling --- workflow/rules/compare_models.smk | 6 +- .../model_training/compare_all_models.py | 80 +++++++------------ 2 files changed, 34 insertions(+), 52 deletions(-) diff --git a/workflow/rules/compare_models.smk b/workflow/rules/compare_models.smk index 384bbbb..03f4d55 100755 --- a/workflow/rules/compare_models.smk +++ b/workflow/rules/compare_models.smk @@ -10,7 +10,8 @@ rule gather_model_performances: scripts_dir = SCRIPTS_DIR, out_dir = RESULTS_DIR, model_config_file = config["model_config"], - crispr_dataset = config["crispr_dataset"] + crispr_dataset_names = [n for n in model_config["crispr_dataset"].unique()], + crispr_dataset = lambda wildcards: [config["crispr_dataset"][cd] for cd in model_config["crispr_dataset"].unique()] conda: "../envs/encode_re2g.yml" resources: @@ -22,6 +23,7 @@ rule gather_model_performances: --all_missing "{input.all_missing}" \ --model_config_file {params.model_config_file} \ --output_file {output.comp_table} \ + --crispr_names "{params.crispr_dataset_names}" \ --crispr_data "{params.crispr_dataset}" \ --out_dir {params.out_dir} """ @@ -37,4 +39,4 @@ rule plot_model_performances: resources: mem_mb=ABC.determine_mem_mb script: - "../scripts/model_training/plot_model_comparison.R" + "../scripts/model_training/plot_model_comparison.R" \ No newline at end of file diff --git a/workflow/scripts/model_training/compare_all_models.py b/workflow/scripts/model_training/compare_all_models.py index 3277ad4..e360d56 100644 
--- a/workflow/scripts/model_training/compare_all_models.py +++ b/workflow/scripts/model_training/compare_all_models.py @@ -4,7 +4,6 @@ import numpy as np import pandas as pd import scipy -import ast from training_functions import ( statistic_aupr, statistic_precision_at_threshold, @@ -23,46 +22,15 @@ def performance_summary( n_boot=1000, ): # read in predicitons - if model_id == "distance": - # Determine whether crispr_data is a str or dict - if crispr_data.strip().startswith("{") and crispr_data.strip().endswith("}"): - all_crispr_dfs = [] - print( - "INFO: --crispr_data appears to be a dictionary. Parsing and concatenating all files." - ) - try: - # Safely parse the string into a Python dictionary - crispr_data_dict = ast.literal_eval(crispr_data) - if not isinstance(crispr_data_dict, dict): - raise TypeError("Parsed data is not a dictionary.") - - # Iterate through the file paths in the dictionary's values - for key, filepath in crispr_data_dict.items(): - print(f" - Loading '{key}': {filepath}") - df = pd.read_csv(filepath, sep="\t") - all_crispr_dfs.append(df) - - # Concatenate all loaded DataFrames into one - crispr_df = pd.concat(all_crispr_dfs, ignore_index=True) - - except (ValueError, SyntaxError, TypeError) as e: - # If parsing fails, raise an error because the format was ambiguous - raise ValueError( - f"Failed to parse --crispr_data as a dictionary. Error: {e}. Content: {crispr_data}" - ) - else: - # If it's not a dictionary string, treat it as a single file path - print(f"INFO: --crispr_data is a single file path. 
Loading: {crispr_data}") - crispr_df = pd.read_csv(crispr_data, sep="\t") - - # Calculate distance - crispr_df["distance"] = np.abs( - (crispr_df["chromStart"] + crispr_df["chromEnd"]) / 2 - - (crispr_df["startTSS"] + crispr_df["endTSS"]) / 2 + if model_id.startswith("distance"): + crispr_data = pd.read_csv(crispr_data, sep="\t") + crispr_data["distance"] = np.abs( + (crispr_data["chromStart"] + crispr_data["chromEnd"]) / 2 + - (crispr_data["startTSS"] + crispr_data["endTSS"]) / 2 ) - crispr_df = crispr_df.dropna(subset=["Regulated", "distance"]) - Y_true_all = crispr_df["Regulated"].values.astype(np.int64) - Y_pred_all = crispr_df["distance"] * -1 + crispr_data = crispr_data.dropna(subset=["Regulated", "distance"]) + Y_true_all = crispr_data["Regulated"].values.astype(np.int64) + Y_pred_all = crispr_data["distance"] * -1 pct_missing = 0 else: # normal models pred_df = pd.read_csv(pred_file, sep="\t") @@ -136,10 +104,20 @@ def performance_summary( @click.option("--model_config_file", required=True) @click.option("--output_file", required=True) @click.option("--crispr_data", required=True) +@click.option("--crispr_names", required=True) @click.option("--out_dir", required=True) -def main(all_pred, all_missing, model_config_file, output_file, crispr_data, out_dir): - all_pred = all_pred.split(" ") - all_missing = all_missing.split(" ") +def main( + all_pred, + all_missing, + model_config_file, + output_file, + crispr_data, + crispr_names, + out_dir, +): + all_pred_files = all_pred.split(" ") + all_missing_files = all_missing.split(" ") + crispr_dict = dict(zip(crispr_names.split(" "), crispr_data.split(" "))) model_name = "E2G" model_config = ( @@ -147,8 +125,8 @@ def main(all_pred, all_missing, model_config_file, output_file, crispr_data, out .fillna("None") .set_index("model", drop=False) ) - model_config["prediction_file"] = all_pred - model_config["missing_file"] = all_missing + model_config["prediction_file"] = all_pred_files + model_config["missing_file"] = 
all_missing_files # initiate final df df = pd.DataFrame( @@ -177,11 +155,13 @@ def main(all_pred, all_missing, model_config_file, output_file, crispr_data, out out_dir, ) df = pd.concat([df, res_row]) - # add row for distance - res_row = performance_summary( - "distance", "baseline", "", "", "", out_dir, crispr_data=crispr_data - ) - df = pd.concat([df, res_row]) + + for key, file_path in crispr_dict.items(): + # add row(s) for distance + res_row = performance_summary( + f"distance_{key}", "baseline", "", "", "", out_dir, crispr_data=file_path + ) + df = pd.concat([df, res_row]) # sort table by AUPRC df = df.sort_values(by="AUPRC", ascending=False) From 7dde7c6b0a2d081b6eb2b9c385e53f54b6c782be Mon Sep 17 00:00:00 2001 From: Kayla Brand Date: Wed, 11 Feb 2026 15:59:00 -0800 Subject: [PATCH 19/22] pulled gen_num_candidate_enh_gene.py from branch train_multiple_celltypes --- workflow/scripts/feature_tables/gen_num_candidate_enh_gene.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/workflow/scripts/feature_tables/gen_num_candidate_enh_gene.py b/workflow/scripts/feature_tables/gen_num_candidate_enh_gene.py index 1070e7f..6ea4300 100644 --- a/workflow/scripts/feature_tables/gen_num_candidate_enh_gene.py +++ b/workflow/scripts/feature_tables/gen_num_candidate_enh_gene.py @@ -44,7 +44,7 @@ def determine_num_candidate_enh_gene(pred_df, out_file): @click.option("--abc_predictions") @click.option("--out_file") def main(abc_predictions, out_file): - pred_df = pd.read_csv(abc_predictions, sep="\t") + pred_df = pd.read_csv(abc_predictions, sep="\t", compression="gzip") if len(pred_df) == 0: raise Exception("Did not find any enhancers in the Predictions file") @@ -52,4 +52,4 @@ def main(abc_predictions, out_file): if __name__ == "__main__": - main() \ No newline at end of file + main() From c4cb5749743fcbd797e9b57e7d01e02a6892af72 Mon Sep 17 00:00:00 2001 From: Kayla Brand Date: Wed, 11 Feb 2026 17:44:14 -0800 Subject: [PATCH 20/22] vectorized approach to 
counting enh btw E-G runs in 4% as much time but requires 70-80% more RAM --- .../gen_num_candidate_enh_gene.py | 74 +++++++++++++------ 1 file changed, 53 insertions(+), 21 deletions(-) diff --git a/workflow/scripts/feature_tables/gen_num_candidate_enh_gene.py b/workflow/scripts/feature_tables/gen_num_candidate_enh_gene.py index 6ea4300..4390a4f 100644 --- a/workflow/scripts/feature_tables/gen_num_candidate_enh_gene.py +++ b/workflow/scripts/feature_tables/gen_num_candidate_enh_gene.py @@ -3,35 +3,67 @@ import pandas as pd -def _populate_enhancer_count_from_tss(df, enhancers, is_upstream): - enh_indexes = enhancers.index - if is_upstream: - # start counting from the enhancer closest to TSS - enh_indexes = reversed(enh_indexes) +def determine_num_candidate_enh_gene(pred_df, out_file): + # Concept: Start by making a working copy of our data so we don't accidentally + # modify the original DataFrame that was passed into the function. + df = pred_df.copy() - count_from_tss = 0 - for enh_idx in enh_indexes: - count_from_tss += 1 - df.loc[enh_idx, "NumCandidateEnhGene"] = count_from_tss + # Concept: To determine if an enhancer is upstream or downstream, we need a single + # point to represent it. The midpoint is a simple and effective choice. + df["midpoint"] = ((df["start"] + df["end"]) / 2).astype("int") + # Concept: For our counting logic to work correctly, all enhancers must be + # sorted by their genomic position. This creates a predictable 'map' of the + # chromosome that we can operate on. We sort by gene first, then position. 
+ df = df.sort_values(by=["TargetGene", "midpoint"], ascending=[True, True]) -def determine_num_candidate_enh_gene(pred_df, out_file): - # Need df to be sorted by midpoint for each chromosome - pred_df["midpoint"] = ((pred_df["start"] + pred_df["end"]) / 2).astype("int") - df = pred_df.sort_values(by=["chr", "midpoint"], ascending=True).reset_index( - drop=True + # Concept: We need to handle enhancers before the TSS differently from those after it. + # Let's create two 'masks'—like stencils—to quickly select all the upstream + # enhancers or all the downstream enhancers in one go. + is_downstream = df["midpoint"] > df["TargetGeneTSS"] + is_upstream = df["midpoint"] < df["TargetGeneTSS"] + + # --- Upstream Calculation --- + # Concept: To rank upstream enhancers, we need to count *away* from the TSS. + # This means starting with the enhancer closest to the TSS (the one with the + # highest 'midpoint' coordinate) and counting backwards. + # We achieve this by creating a temporary, sorted copy of just the upstream enhancers. + df.loc[is_upstream, "NumCandidateEnhGene"] = ( + # 1. Select only the upstream enhancers. This creates a temporary table. + df[is_upstream] + # 2. Sort this temporary table in DESCENDING order of position. Now, the + # enhancer closest to the TSS is at the top of each gene's group. + .sort_values("midpoint", ascending=False) + # 3. Group by gene and then perform a cumulative count. `cumcount` automatically + # creates the ranking (0, 1, 2, ...) for us within each gene's group. + .groupby("TargetGene").cumcount() + # 4. `cumcount` starts at 0, but we want our ranks to start at 1. 
+ + 1 ) - gene_groups = df.groupby(["TargetGene", "TargetGeneTSS"]) - for (gene, tss), indexes in gene_groups.groups.items(): - enhancers = df.loc[indexes] - upstream_enh = enhancers[enhancers["midpoint"] < tss] - downstream_enh = enhancers[enhancers["midpoint"] > tss] - _populate_enhancer_count_from_tss(df, upstream_enh, is_upstream=True) - _populate_enhancer_count_from_tss(df, downstream_enh, is_upstream=False) + # --- Downstream Calculation --- + # Concept: For downstream enhancers, the default sort order (ascending) is already + # correct for counting away from the TSS. The enhancer with the lowest + # 'midpoint' coordinate is the one closest to the TSS. + df.loc[is_downstream, "NumCandidateEnhGene"] = ( + # 1. Select only the downstream enhancers. + df[is_downstream] + # 2. Group by gene and perform the cumulative count. No special sorting needed. + .groupby("TargetGene").cumcount() + # 3. Add 1 to start the rank from 1 instead of 0. + + 1 + ) + # Concept: Any enhancers located exactly at the TSS (or any other unhandled cases) + # will have a missing ('NaN') rank. We'll fill these with 0 to be safe. df = df.fillna(value=0) df["NumCandidateEnhGene"] = df["NumCandidateEnhGene"].astype("int") + + # Concept: Before saving, restore the desired chromosomal sort order + df = df.sort_values(by=["chr", "midpoint"], ascending=True).reset_index(drop=True) + + # Concept: Finally, select only the columns we need for the final report and + # save the result to a file. 
df[["name", "TargetGene", "NumCandidateEnhGene"]].to_csv( out_file, sep="\t", From e00074b5c6b6f2ff475b0304fd642e3093bde583 Mon Sep 17 00:00:00 2001 From: Kayla Brand Date: Thu, 12 Feb 2026 14:25:58 -0800 Subject: [PATCH 21/22] allow user to specify cluster_max_cores for jobs --- workflow/rules/predictions.smk | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/workflow/rules/predictions.smk b/workflow/rules/predictions.smk index 9e5066d..ce79d59 100755 --- a/workflow/rules/predictions.smk +++ b/workflow/rules/predictions.smk @@ -113,9 +113,9 @@ rule write_accessibility_bw_file: "../envs/encode_re2g.yml" resources: mem_mb=ABC.determine_mem_mb, - cpus_per_task=16, + cpus_per_task = min(16, config.get("cluster_max_cores", 16)), runtime_hr=6 - threads: 16 + threads: min(16, config.get("cluster_max_cores", 16)) shell: """ LC_ALL=C @@ -131,7 +131,7 @@ rule write_accessibility_bw_file: awk '$4 > 0' > {output.out_bg} else # tagAlign # remove alt chromosomes and sort - zcat {input.input_file} | \ + pigz -cd {input.input_file} | \ awk 'NR==FNR {{keep[$1]; next}} $1 in keep' {params.chr_sizes} - | \ sort -k1,1 -k2,2n --parallel={threads} -S $BUFFER_SIZE | \ bedtools genomecov -bg -split -i - -g {params.chr_sizes} | \ From 4083e9aea2601a25f2fd006e4a9ba3a9e93bdf55 Mon Sep 17 00:00:00 2001 From: Kayla Brand Date: Thu, 12 Feb 2026 14:29:49 -0800 Subject: [PATCH 22/22] black reformatting --- workflow/scripts/feature_tables/gen_num_tss_enh_gene.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/workflow/scripts/feature_tables/gen_num_tss_enh_gene.py b/workflow/scripts/feature_tables/gen_num_tss_enh_gene.py index 716b051..985ee34 100644 --- a/workflow/scripts/feature_tables/gen_num_tss_enh_gene.py +++ b/workflow/scripts/feature_tables/gen_num_tss_enh_gene.py @@ -3,9 +3,7 @@ import pandas as pd -def determine_num_tss_enh_gene( - pred_df, ref_gene_tss, extended_enhancers, out_file -): +def determine_num_tss_enh_gene(pred_df, ref_gene_tss, 
extended_enhancers, out_file): # make the end be midpoint of enhancer + distance (This gives you the end coordinate of distance range) pred_df["midpoint"] = ((pred_df["start"] + pred_df["end"]) / 2).astype("int") pred_df["new_end"] = (pred_df["midpoint"] + pred_df["distance"]).astype("int")