diff --git a/.circleci/config.yml b/.circleci/config.yml index 8b4d912..205fcbb 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -21,11 +21,11 @@ jobs: - run: name: Install basic OS pkgs command: apt-get update && apt-get -y install curl vim - # - run: - # name: Run Linter (python black) - # command: | - # mamba install black - # black . --check + - run: + name: Run Linter (python black) + command: | + mamba install black + black workflow/ --check - run: name: Install env no_output_timeout: 30m diff --git a/README.md b/README.md index f5e5255..03298c3 100755 --- a/README.md +++ b/README.md @@ -49,16 +49,18 @@ The way we choose the model depends on the biosamples input. The code for model ## Train model -**Important: Only train models for biosamples matching the corresponding CRISPR data (in this case, K562)** +**Important: Only train models for biosamples matching the corresponding CRISPR data** - Much of the the model training code was adapted from Alireza Karbalayghareh's [original implementation](https://github.com/karbalayghareh/ENCODE-E2G). Modify `config/config_training.yaml` with your model and dataset configs -- `model_config` has columns: model, dataset, ABC_directory, feature_table, polynomial (do you want to use polynomial features?), and override_params (are there model training parameters you would like to change from the default logistic regression settings specfied in `config/config_training.yaml`?) +- `model_config` has columns: model, dataset, crispr_dataset, feature_table, polynomial (do you want to use polynomial features?), and override_params (are there model training parameters you would like to change from the default logistic regression settings specfied in `config/config_training.yaml`?) - See [this example](https://pastebin.com/zt1868R3) `model_config` for how to specfiy override parameters. If there are no override_params, leave the column blank but still include the header. 
+ - The `dataset` column is a comma-separated list with values corresponding to biosamples in `dataset_config`. Datasets must be provided for each `crispr_cell_type` included in the corresponding `crispr_dataset`. + - The `crispr_dataset` column corresponds to a key under `crispr_dataset` in `config_training`. - Feature tables must be specified for each model (example: `resources/feature_tables`) with columns: feature (name in final table), input_col (name in ABC output), second_input (multiplied by input_col if provided), aggregate_function (how to combine feature values when a CRISPR element overlaps more than one ABC element), fill_value (how to replace NAs), nice_name (used when plotting) - Note that trained models generated using polynomial features cannot directly be used in the **Apply model** workflow - `dataset_config` is an ABC biosamples config to generate ABC predictions for datasets without an existing ABC directory. -- Each dataset must correspond to a unique ABC_directory, with "biosample" in `dataset_config` equals "dataset" in `model_config`. If no ABC_directory is indicated in `model_config`, it must have an entry in `dataset_config`. + Each dataset must correspond to a "biosample" in `dataset_config` that equals "dataset" in `model_config`. The column `crispr_cell_type` indicates which cell type in the CRISPR data corresponds to this biosample. - If you are including features in addition to those generated within the pipeline (e.g.
a value in input_col or second_input of a feature table is not included in `reference_features` in `config/config_training/yaml`), you must also define how to add these values with an external_features_config, which you include in `dataset_config` in the optional column external_features_config: - An `external_features_config` has columns feature (corresponding to the missing input_col or second_input value), source_col (column name in the source file), aggregate_function (how to combine values when merging different element definitions), join_by, and source_file - join_by must be either "TargetGene" (feature is defined per gene) or "overlap" (feature is defined per element-gene pair) diff --git a/config/config_biosamples_chr22.tsv b/config/config_biosamples_chr22.tsv index b9559d7..670633b 100755 --- a/config/config_biosamples_chr22.tsv +++ b/config/config_biosamples_chr22.tsv @@ -1,2 +1,2 @@ -biosample DHS ATAC H3K27ac default_accessibility_feature HiC_file HiC_type HiC_resolution alt_TSS alt_genes -K562_chr22 ABC/example_chr/chr22/ENCFF860XAE.chr22.sorted.se.bam DHS https://www.encodeproject.org/files/ENCFF621AIY/@@download/ENCFF621AIY.hic hic 5000 ABC/example_chr/chr22/RefSeqCurated.170308.bed.CollapsedGeneBounds.chr22.hg38.TSS500bp.bed ABC/example_chr/chr22/RefSeqCurated.170308.bed.CollapsedGeneBounds.chr22.hg38.bed \ No newline at end of file +biosample crispr_cell_type DHS ATAC H3K27ac default_accessibility_feature HiC_file HiC_type HiC_resolution alt_TSS alt_genes +K562_chr22 K562 ABC/example_chr/chr22/ENCFF860XAE.chr22.sorted.se.bam DHS https://www.encodeproject.org/files/ENCFF621AIY/@@download/ENCFF621AIY.hic hic 5000 ABC/example_chr/chr22/RefSeqCurated.170308.bed.CollapsedGeneBounds.chr22.hg38.TSS500bp.bed ABC/example_chr/chr22/RefSeqCurated.170308.bed.CollapsedGeneBounds.chr22.hg38.bed \ No newline at end of file diff --git a/config/config_models_test.tsv b/config/config_models_test.tsv index a4a7988..392ed18 100644 --- a/config/config_models_test.tsv 
+++ b/config/config_models_test.tsv @@ -1,2 +1,2 @@ -model dataset ABC_directory feature_table polynomial override_params -DNase_megamap K562_chr22 resources/feature_tables/final_feature_set_DNase_hic.tsv False +model dataset crispr_dataset feature_table polynomial override_params +DNase_megamap K562_chr22 training resources/feature_tables/final_feature_set_DNase_hic.tsv FALSE \ No newline at end of file diff --git a/config/config_training.yaml b/config/config_training.yaml index fa8536b..1b78793 100755 --- a/config/config_training.yaml +++ b/config/config_training.yaml @@ -12,7 +12,10 @@ max_memory_allocation_mb: 250000 # for submiting jobs gene_TSS500: "reference/CollapsedGeneBounds.hg38.TSS500bp.bed" # TSS reference file chr_sizes: "reference/GRCh38_EBV.no_alt.chrom.sizes.tsv" gene_classes: "resources/external_features/gene_promoter_class_RefSeqCurated.170308.bed.CollapsedGeneBounds.hg38.TSS500bp.tsv" -crispr_dataset: "reference/EPCrisprBenchmark_ensemble_data_GRCh38.tsv.gz" # CRISPR dataset +crispr_dataset: + training: "reference/EPCrisprBenchmark_ensemble_data_GRCh38.tsv.gz" # CRISPR dataset +crispr_cell_types: + training: ["K562"] # list of features that are generated by default, referenced by their column names in source files; reference_features: ["numTSSEnhGene", "distance", "activity_base", "TargetGenePromoterActivityQuantile", "numNearbyEnhancers", "sumNearbyEnhancers", "is_ubiquitous_uniform", "P2PromoterClass", "numCandidateEnhGene", "hic_contact_pl_scaled_adj", "ABC.Score", "ABC.Numerator", "ABC.Denominator", "normalized_dhs_prom", "normalized_dhs_enh", "normalized_atac_prom", "normalized_atac_enh", "normalized_h3k27ac_enh", "normalized_h3k27ac_prom"] diff --git a/workflow/Snakefile_training b/workflow/Snakefile_training index 545a8fe..c844bdd 100755 --- a/workflow/Snakefile_training +++ b/workflow/Snakefile_training @@ -1,27 +1,34 @@ from snakemake.utils import min_version min_version("7.0") +conda: "mamba" import pandas as pd import os import yaml 
-configfile: "config/config_training.yaml" model_config = pd.read_table(config["model_config"], na_values="").fillna("None").set_index("model", drop=False) dataset_config = pd.read_table(config["dataset_config"], na_values="").set_index("biosample", drop=False) conda: "mamba" + +# import utils and make config paths absolute MAX_MEM_MB = config["max_memory_allocation_mb"] include: "rules/utils.smk" E2G_DIR_PATH = os.path.abspath(config["E2G_DIR_PATH"]) config = make_paths_absolute(config, E2G_DIR_PATH) +config["results_dir"] = os.path.join(E2G_DIR_PATH, config["results_dir"]) # manually modify results dir since may not exist -# Need to manually make results_dir an absolute path since above may -# not work if results_dir folder isn't created -# If results_dir is already an absolute path, this is a no-op -config["results_dir"] = os.path.join(E2G_DIR_PATH, config["results_dir"]) +# define some global variables +RESULTS_DIR = config["results_dir"] +MODELS_RESULTS_DIR = os.path.join(config["results_dir"], "model_results") +SCRIPTS_DIR = os.path.join(E2G_DIR_PATH, "workflow/scripts") +# process configs model_config = process_model_config(model_config) +model_dataset_dict = make_model_dataset_dict(model_config, dataset_config) # dictionary of dictionaries: model: {crispr_ct: dataset, ...} +print(model_dataset_dict) +# import ABC submodule module ABC: snakefile: f"{config['ABC_DIR_PATH']}/workflow/Snakefile" @@ -31,39 +38,39 @@ abc_config = get_abc_config(config) use rule * from ABC exclude all as abc_* -RESULTS_DIR = config["results_dir"] -SCRIPTS_DIR = os.path.join(E2G_DIR_PATH, "workflow/scripts") - -# These rules requires the variables above to be defined +# import all rules (require the variables above to be defined) include: "rules/genomewide_features.smk" include: "rules/crispr_features.smk" include: "rules/train_model.smk" include: "rules/feature_analysis.smk" include: "rules/compare_models.smk" -# Validate ABC_directory column and make ABC directory dict +# 
validate ABC_directory column and make ABC directory dict ABC_BIOSAMPLES_DIR = process_abc_directory_column(model_config) # specify target files output_files = [] -output_files.extend(expand(os.path.join(RESULTS_DIR, "{dataset}", "{model}", "for_training.EPCrisprBenchmark_ensemble_data_GRCh38.K562_features_NAfilled.tsv.gz"), zip, dataset=model_config["dataset"], model=model_config["model"])) -output_files.extend(expand(os.path.join(RESULTS_DIR, "{dataset}", "{model}", "model", "model_full.pkl"), zip, dataset=model_config["dataset"], model=model_config["model"])) # trained models +output_files.extend(expand(os.path.join(MODELS_RESULTS_DIR, "{model}", "model", "model_full.pkl"), model=model_config["model"])) # trained models -output_files.append(os.path.join(RESULTS_DIR, "performance_across_models.tsv")) # comparison across models -output_files.extend(expand(os.path.join(RESULTS_DIR, "performance_across_models_{metric}.pdf"), metric=["auprc", "precision"])) # plot of comparisons +# output_files.extend(expand(os.path.join(RESULTS_DIR, "{dataset}", "{model}", "for_training.EPCrisprBenchmark_ensemble_data_GRCh38.K562_features_NAfilled.tsv.gz"), zip, dataset=model_config["dataset"], model=model_config["model"])) +# output_files.extend(expand(os.path.join(RESULTS_DIR, "{dataset}", "{model}", "model", "model_full.pkl"), zip, dataset=model_config["dataset"], model=model_config["model"])) # trained models + +output_files.append(os.path.join(MODELS_RESULTS_DIR, "performance_across_models.tsv")) # comparison across models +output_files.extend(expand(os.path.join(MODELS_RESULTS_DIR, "performance_across_models_{metric}.pdf"), metric=["auprc", "precision"])) # plot of comparisons if config["run_feature_analysis"]: - output_files.extend(expand(os.path.join(RESULTS_DIR, "{dataset}", "{model}", "feature_analysis", "forward_feature_selection_auprc.pdf"), zip, dataset=model_config["dataset"], model=model_config["model"])) - output_files.extend(expand(os.path.join(RESULTS_DIR, 
"{dataset}", "{model}", "feature_analysis", "backward_feature_selection_auprc.pdf"), zip, dataset=model_config["dataset"], model=model_config["model"])) - output_files.extend(expand(os.path.join(RESULTS_DIR, "{dataset}", "{model}", "feature_analysis", "permutation_feature_importance_auprc.pdf"), zip, dataset=model_config["dataset"], model=model_config["model"])) + output_files.extend(expand(os.path.join(MODELS_RESULTS_DIR, "{model}", "feature_analysis", "forward_feature_selection_auprc.pdf"), model=model_config["model"])) + output_files.extend(expand(os.path.join(MODELS_RESULTS_DIR, "{model}", "feature_analysis", "backward_feature_selection_auprc.pdf"), model=model_config["model"])) + output_files.extend(expand(os.path.join(MODELS_RESULTS_DIR, "{model}", "feature_analysis", "permutation_feature_importance_auprc.pdf"), model=model_config["model"])) # only test all feature sets if polynomial==False and n_features<14 for row in model_config.itertuples(index=False): if not row.polynomial == 'True': features = pd.read_table(row.feature_table) n_features = len(features) + if n_features<14: - output_files.append(os.path.join(RESULTS_DIR, row.dataset, row.model, "feature_analysis", "all_feature_sets.tsv")) + output_files.append(os.path.join(MODELS_RESULTS_DIR, row.model, "feature_analysis", "all_feature_sets.tsv")) rule all: input: diff --git a/workflow/rules/compare_models.smk b/workflow/rules/compare_models.smk index b0623d8..03f4d55 100755 --- a/workflow/rules/compare_models.smk +++ b/workflow/rules/compare_models.smk @@ -2,14 +2,16 @@ # compare cv-performance on training data across all models (note, this is not the true benchmarking performance CRISPR elements not overlapping prediction elements aren't considered) rule gather_model_performances: input: - all_predictions = expand(os.path.join(RESULTS_DIR, "{dataset}", "{model}", "model", "training_predictions.tsv"), zip, dataset=model_config["dataset"], model=model_config["model"]) + all_predictions = 
expand(os.path.join(MODELS_RESULTS_DIR, "{model}", "model", "training_predictions.tsv"), model=model_config["model"]), + all_missing = expand(os.path.join(MODELS_RESULTS_DIR, "{model}", "merged_CRISPR_dataset.missing_from_features.NAfilled.tsv.gz"), model=model_config["model"]), output: - comp_table = os.path.join(RESULTS_DIR, "performance_across_models.tsv") + comp_table = os.path.join(MODELS_RESULTS_DIR, "performance_across_models.tsv") params: scripts_dir = SCRIPTS_DIR, out_dir = RESULTS_DIR, model_config_file = config["model_config"], - crispr_dataset = config["crispr_dataset"] + crispr_dataset_names = [n for n in model_config["crispr_dataset"].unique()], + crispr_dataset = lambda wildcards: [config["crispr_dataset"][cd] for cd in model_config["crispr_dataset"].unique()] conda: "../envs/encode_re2g.yml" resources: @@ -17,21 +19,24 @@ rule gather_model_performances: shell: """ python {params.scripts_dir}/model_training/compare_all_models.py \ + --all_pred "{input.all_predictions}" \ + --all_missing "{input.all_missing}" \ --model_config_file {params.model_config_file} \ --output_file {output.comp_table} \ - --crispr_data {params.crispr_dataset} \ + --crispr_names "{params.crispr_dataset_names}" \ + --crispr_data "{params.crispr_dataset}" \ --out_dir {params.out_dir} """ rule plot_model_performances: input: - comp_table = os.path.join(RESULTS_DIR, "performance_across_models.tsv") + comp_table = os.path.join(MODELS_RESULTS_DIR, "performance_across_models.tsv") output: - comp_plot_auprc = os.path.join(RESULTS_DIR, "performance_across_models_auprc.pdf"), - comp_plot_prec = os.path.join(RESULTS_DIR, "performance_across_models_precision.pdf") + comp_plot_auprc = os.path.join(MODELS_RESULTS_DIR, "performance_across_models_auprc.pdf"), + comp_plot_prec = os.path.join(MODELS_RESULTS_DIR, "performance_across_models_precision.pdf") conda: "../envs/encode_re2g.yml" resources: mem_mb=ABC.determine_mem_mb script: - "../scripts/model_training/plot_model_comparison.R" + 
"../scripts/model_training/plot_model_comparison.R" \ No newline at end of file diff --git a/workflow/rules/crispr_features.smk b/workflow/rules/crispr_features.smk index 378fdf0..b7a9e3a 100644 --- a/workflow/rules/crispr_features.smk +++ b/workflow/rules/crispr_features.smk @@ -28,18 +28,18 @@ rule format_external_features_config: script: "../scripts/feature_tables/format_external_features_config.R" -# overlap feature table with K562 CRISPR data -rule overlap_features_crispr: +# overlap feature table with CRISPR data for this dataset +rule overlap_features_crispr_for_dataset: input: features = os.path.join(RESULTS_DIR, "{dataset}", "genomewide_features.tsv.gz"), - crispr = config['crispr_dataset'], feature_table_file = os.path.join(RESULTS_DIR, "{dataset}", "feature_table.tsv"), + crispr = lambda wildcards: config['crispr_dataset'][wildcards.crispr_dataset], tss = config['gene_TSS500'] params: - tpm_threshold = lambda wildcards: model_config.loc[wildcards.model, 'tpm_threshold'] + crispr_cell_type = lambda wildcards: dataset_config.loc[wildcards.dataset, "crispr_cell_type"] output: - features = os.path.join(RESULTS_DIR, "{dataset}", "{model}", "EPCrisprBenchmark_ensemble_data_GRCh38.K562_features_{nafill}.tsv.gz"), - missing = os.path.join(RESULTS_DIR, "{dataset}", "{model}", "missing.EPCrisprBenchmark_ensemble_data_GRCh38.K562_features_{nafill}.tsv.gz") + features = os.path.join(RESULTS_DIR, "{dataset}", "CRISPR_dataset_{crispr_dataset}.overlapping_features.{nafill}.tsv.gz"), + missing = os.path.join(RESULTS_DIR, "{dataset}", "CRISPR_dataset_{crispr_dataset}.missing_from_features.{nafill}.tsv.gz") conda: "../envs/encode_re2g.yml" resources: @@ -47,15 +47,23 @@ rule overlap_features_crispr: script: "../scripts/feature_tables/overlap_features_with_crispr_data.R" -# process data for model training: rename columns, apply filter features, filter to gene list +# process data for model training: rename columns, apply filter features, filter to gene list, combine across 
datasets # note: we use the NAfilled CRISPR feature data here! +def get_crispr_files_for_model(wildcards, file_type): + datasets = model_dataset_dict[wildcards.model].values() + crispr_dataset = model_config.loc[wildcards.model, "crispr_dataset"] + files = [os.path.join(RESULTS_DIR, ds, f"CRISPR_dataset_{crispr_dataset}.{file_type}.{wildcards.nafill}.tsv.gz") for ds in datasets] + return files + rule process_crispr_data: input: - crispr_features = os.path.join(RESULTS_DIR, "{dataset}", "{model}", "EPCrisprBenchmark_ensemble_data_GRCh38.K562_features_{nafill}.tsv.gz") + crispr_features = lambda wildcards: get_crispr_files_for_model(wildcards, "overlapping_features"), + crispr_missing = lambda wildcards: get_crispr_files_for_model(wildcards, "missing_from_features"), params: - genes = config["gene_TSS500"] + genes = config["gene_TSS500"], output: - processed = os.path.join(RESULTS_DIR, "{dataset}", "{model}", "for_training.EPCrisprBenchmark_ensemble_data_GRCh38.K562_features_{nafill}.tsv.gz") + processed = os.path.join(MODELS_RESULTS_DIR, "{model}", "for_training.merged_CRISPR_dataset.overlapping_features.{nafill}.tsv.gz"), + missing = os.path.join(MODELS_RESULTS_DIR, "{model}", "merged_CRISPR_dataset.missing_from_features.{nafill}.tsv.gz") conda: "../envs/encode_re2g.yml" resources: diff --git a/workflow/rules/feature_analysis.smk b/workflow/rules/feature_analysis.smk index 02354a1..3f6925e 100755 --- a/workflow/rules/feature_analysis.smk +++ b/workflow/rules/feature_analysis.smk @@ -1,16 +1,16 @@ # forward sequential feature selection rule calculate_forward_feature_selection: input: - crispr_features_processed = os.path.join(RESULTS_DIR, "{dataset}", "{model}", "for_training.EPCrisprBenchmark_ensemble_data_GRCh38.K562_features_NAfilled.tsv.gz"), + crispr_features_processed = os.path.join(MODELS_RESULTS_DIR, "{model}", "for_training.merged_CRISPR_dataset.overlapping_features.NAfilled.tsv.gz"), feature_table = lambda wildcards: model_config.loc[wildcards.model, 
'feature_table'], - model_params = os.path.join(RESULTS_DIR, "{dataset}", "{model}", "model", "training_params.pkl") + model_params = os.path.join(MODELS_RESULTS_DIR, "{model}", "model", "training_params.pkl") params: epsilon = config["epsilon"], scripts_dir = SCRIPTS_DIR, polynomial = lambda wildcards: model_config.loc[wildcards.model, 'polynomial'], - out_dir = os.path.join(RESULTS_DIR, "{dataset}", "{model}", "feature_analysis") + out_dir = os.path.join(MODELS_RESULTS_DIR, "{model}", "feature_analysis") output: - results = os.path.join(RESULTS_DIR, "{dataset}", "{model}", "feature_analysis", "forward_feature_selection.tsv"), + results = os.path.join(MODELS_RESULTS_DIR, "{model}", "feature_analysis", "forward_feature_selection.tsv"), conda: "../envs/encode_re2g.yml" resources: @@ -28,13 +28,13 @@ rule calculate_forward_feature_selection: rule plot_forward_feature_selection: input: - results = os.path.join(RESULTS_DIR, "{dataset}", "{model}", "feature_analysis", "forward_feature_selection.tsv"), + results = os.path.join(MODELS_RESULTS_DIR, "{model}", "feature_analysis", "forward_feature_selection.tsv"), feature_table = lambda wildcards: model_config.loc[wildcards.model, 'feature_table'], params: polynomial = lambda wildcards: model_config.loc[wildcards.model, 'polynomial'] output: - out_auprc = os.path.join(RESULTS_DIR, "{dataset}", "{model}", "feature_analysis", "forward_feature_selection_auprc.pdf"), - out_prec = os.path.join(RESULTS_DIR, "{dataset}", "{model}", "feature_analysis", "forward_feature_selection_precision.pdf") + out_auprc = os.path.join(MODELS_RESULTS_DIR, "{model}", "feature_analysis", "forward_feature_selection_auprc.pdf"), + out_prec = os.path.join(MODELS_RESULTS_DIR, "{model}", "feature_analysis", "forward_feature_selection_precision.pdf") conda: "../envs/encode_re2g.yml" resources: @@ -45,16 +45,16 @@ rule plot_forward_feature_selection: # backward sequential feature selection rule calculate_backward_feature_selection: input: - 
crispr_features_processed = os.path.join(RESULTS_DIR, "{dataset}", "{model}", "for_training.EPCrisprBenchmark_ensemble_data_GRCh38.K562_features_NAfilled.tsv.gz"), + crispr_features_processed = os.path.join(MODELS_RESULTS_DIR, "{model}", "for_training.merged_CRISPR_dataset.overlapping_features.NAfilled.tsv.gz"), feature_table = lambda wildcards: model_config.loc[wildcards.model, 'feature_table'], - model_params = os.path.join(RESULTS_DIR, "{dataset}", "{model}", "model", "training_params.pkl") + model_params = os.path.join(MODELS_RESULTS_DIR, "{model}", "model", "training_params.pkl") params: epsilon = config["epsilon"], scripts_dir = SCRIPTS_DIR, polynomial = lambda wildcards: model_config.loc[wildcards.model, 'polynomial'], - out_dir = os.path.join(RESULTS_DIR, "{dataset}", "{model}", "feature_analysis") + out_dir = os.path.join(MODELS_RESULTS_DIR, "{model}", "feature_analysis") output: - results = os.path.join(RESULTS_DIR, "{dataset}", "{model}", "feature_analysis", "backward_feature_selection.tsv"), + results = os.path.join(MODELS_RESULTS_DIR, "{model}", "feature_analysis", "backward_feature_selection.tsv"), conda: "../envs/encode_re2g.yml" resources: @@ -72,13 +72,13 @@ rule calculate_backward_feature_selection: rule plot_backward_feature_selection: input: - results = os.path.join(RESULTS_DIR, "{dataset}", "{model}", "feature_analysis", "backward_feature_selection.tsv"), + results = os.path.join(MODELS_RESULTS_DIR, "{model}", "feature_analysis", "backward_feature_selection.tsv"), feature_table = lambda wildcards: model_config.loc[wildcards.model, 'feature_table'] params: polynomial = lambda wildcards: model_config.loc[wildcards.model, 'polynomial'] output: - out_auprc = os.path.join(RESULTS_DIR, "{dataset}", "{model}", "feature_analysis", "backward_feature_selection_auprc.pdf"), - out_prec = os.path.join(RESULTS_DIR, "{dataset}", "{model}", "feature_analysis", "backward_feature_selection_precision.pdf") + out_auprc = os.path.join(MODELS_RESULTS_DIR, "{model}", 
"feature_analysis", "backward_feature_selection_auprc.pdf"), + out_prec = os.path.join(MODELS_RESULTS_DIR, "{model}", "feature_analysis", "backward_feature_selection_precision.pdf") conda: "../envs/encode_re2g.yml" resources: @@ -89,15 +89,15 @@ rule plot_backward_feature_selection: # compare all features sets rule compare_all_feature_sets: input: - crispr_features_processed = os.path.join(RESULTS_DIR, "{dataset}", "{model}", "for_training.EPCrisprBenchmark_ensemble_data_GRCh38.K562_features_NAfilled.tsv.gz"), + crispr_features_processed = os.path.join(MODELS_RESULTS_DIR, "{model}", "for_training.merged_CRISPR_dataset.overlapping_features.NAfilled.tsv.gz"), feature_table = lambda wildcards: model_config.loc[wildcards.model, 'feature_table'], - model_params = os.path.join(RESULTS_DIR, "{dataset}", "{model}", "model", "training_params.pkl") + model_params = os.path.join(MODELS_RESULTS_DIR, "{model}", "model", "training_params.pkl") params: epsilon = config["epsilon"], scripts_dir = SCRIPTS_DIR, - out_dir = os.path.join(RESULTS_DIR, "{dataset}", "{model}", "feature_analysis") + out_dir = os.path.join(MODELS_RESULTS_DIR, "{model}", "feature_analysis") output: - results = os.path.join(RESULTS_DIR, "{dataset}", "{model}", "feature_analysis", "all_feature_sets.tsv"), + results = os.path.join(MODELS_RESULTS_DIR, "{model}", "feature_analysis", "all_feature_sets.tsv"), conda: "../envs/encode_re2g.yml" resources: @@ -116,17 +116,17 @@ rule compare_all_feature_sets: # permuation feature importance rule calculate_permutation_feature_importance: input: - crispr_features_processed = os.path.join(RESULTS_DIR, "{dataset}", "{model}", "for_training.EPCrisprBenchmark_ensemble_data_GRCh38.K562_features_NAfilled.tsv.gz"), + crispr_features_processed = os.path.join(MODELS_RESULTS_DIR, "{model}", "for_training.merged_CRISPR_dataset.overlapping_features.NAfilled.tsv.gz"), feature_table = lambda wildcards: model_config.loc[wildcards.model, 'feature_table'], - model_params = 
os.path.join(RESULTS_DIR, "{dataset}", "{model}", "model", "training_params.pkl") + model_params = os.path.join(MODELS_RESULTS_DIR, "{model}", "model", "training_params.pkl") params: epsilon = config["epsilon"], n_repeats = 20, scripts_dir = SCRIPTS_DIR, polynomial = lambda wildcards: model_config.loc[wildcards.model, 'polynomial'], - out_dir = os.path.join(RESULTS_DIR, "{dataset}", "{model}", "feature_analysis") + out_dir = os.path.join(MODELS_RESULTS_DIR, "{model}", "feature_analysis") output: - results = os.path.join(RESULTS_DIR, "{dataset}", "{model}", "feature_analysis", "permutation_feature_importance.tsv"), + results = os.path.join(MODELS_RESULTS_DIR, "{model}", "feature_analysis", "permutation_feature_importance.tsv"), conda: "../envs/encode_re2g.yml" resources: @@ -145,14 +145,14 @@ rule calculate_permutation_feature_importance: rule plot_permutation_feature_importance: input: - results = os.path.join(RESULTS_DIR, "{dataset}", "{model}", "feature_analysis", "permutation_feature_importance.tsv"), + results = os.path.join(MODELS_RESULTS_DIR, "{model}", "feature_analysis", "permutation_feature_importance.tsv"), feature_table = lambda wildcards: model_config.loc[wildcards.model, 'feature_table'] params: polynomial = lambda wildcards: model_config.loc[wildcards.model, 'polynomial'], n_repeats = 20 output: - out_auprc = os.path.join(RESULTS_DIR, "{dataset}", "{model}", "feature_analysis", "permutation_feature_importance_auprc.pdf"), - out_prec = os.path.join(RESULTS_DIR, "{dataset}", "{model}", "feature_analysis", "permutation_feature_importance_precision.pdf") + out_auprc = os.path.join(MODELS_RESULTS_DIR, "{model}", "feature_analysis", "permutation_feature_importance_auprc.pdf"), + out_prec = os.path.join(MODELS_RESULTS_DIR, "{model}", "feature_analysis", "permutation_feature_importance_precision.pdf") conda: "../envs/encode_re2g.yml" resources: diff --git a/workflow/rules/genomewide_features.smk b/workflow/rules/genomewide_features.smk index 8afe56e..a3a33e3 
100644 --- a/workflow/rules/genomewide_features.smk +++ b/workflow/rules/genomewide_features.smk @@ -93,18 +93,16 @@ rule generate_num_tss_enh_gene: conda: "../envs/encode_re2g.yml" resources: - mem_mb=partial(ABC.determine_mem_mb, min_gb=32) + mem_mb=partial(ABC.determine_mem_mb) output: numTSSEnhGene = os.path.join(RESULTS_DIR, "{biosample}", "new_features", "NumTSSEnhGene.tsv"), extendedEnhancerRegions = temp(os.path.join(RESULTS_DIR, "{biosample}", "new_features", "extendedEnhancerRegions.txt")), - enhancerTSSInt = temp(os.path.join(RESULTS_DIR, "{biosample}", "new_features", "extendedEnhancerRegions_TSS_int.tsv.gz")) shell: """ python {params.scripts_dir}/feature_tables/gen_num_tss_enh_gene.py \ --abc_predictions {input.abc_predictions} \ --ref_gene_tss {params.gene_TSS500} \ --extended_enhancers {output.extendedEnhancerRegions} \ - --enhancer_tss_int {output.enhancerTSSInt} \ --out_file {output.numTSSEnhGene} """ @@ -172,7 +170,7 @@ rule add_external_features: input: predictions_extended = os.path.join(RESULTS_DIR, "{biosample}", "ActivityOnly_features.tsv.gz"), feature_table_file = os.path.join(RESULTS_DIR, "{biosample}", "feature_table.tsv"), - external_features_config = ancient(os.path.join(RESULTS_DIR, "{biosample}", "external_features_config.tsv")) + external_features_config = os.path.join(RESULTS_DIR, "{biosample}", "external_features_config.tsv") output: plus_external_features = os.path.join(RESULTS_DIR, "{biosample}", "ActivityOnly_plus_external_features.tsv.gz") conda: diff --git a/workflow/rules/predictions.smk b/workflow/rules/predictions.smk index ae06a5a..ce79d59 100755 --- a/workflow/rules/predictions.smk +++ b/workflow/rules/predictions.smk @@ -113,8 +113,9 @@ rule write_accessibility_bw_file: "../envs/encode_re2g.yml" resources: mem_mb=ABC.determine_mem_mb, + cpus_per_task = min(16, config.get("cluster_max_cores", 16)), runtime_hr=6 - threads: 16 + threads: min(16, config.get("cluster_max_cores", 16)) shell: """ LC_ALL=C @@ -130,7 +131,7 @@ 
rule write_accessibility_bw_file: awk '$4 > 0' > {output.out_bg} else # tagAlign # remove alt chromosomes and sort - zcat {input.input_file} | \ + pigz -cd {input.input_file} | \ awk 'NR==FNR {{keep[$1]; next}} $1 in keep' {params.chr_sizes} - | \ sort -k1,1 -k2,2n --parallel={threads} -S $BUFFER_SIZE | \ bedtools genomecov -bg -split -i - -g {params.chr_sizes} | \ diff --git a/workflow/rules/train_model.smk b/workflow/rules/train_model.smk index 9a7e683..4f8093b 100644 --- a/workflow/rules/train_model.smk +++ b/workflow/rules/train_model.smk @@ -7,7 +7,7 @@ rule generate_model_params: default_params = config["default_params"], scripts_dir = SCRIPTS_DIR output: - final_params = os.path.join(RESULTS_DIR, "{dataset}", "{model}", "model", "training_params.pkl") + final_params = os.path.join(MODELS_RESULTS_DIR, "{model}", "model", "training_params.pkl") conda: "../envs/encode_re2g.yml" shell: @@ -21,19 +21,17 @@ rule generate_model_params: # generate trained model and cross-validated predictions on CRISPR data rule train_model: input: - crispr_features_processed = os.path.join(RESULTS_DIR, "{dataset}", "{model}", "for_training.EPCrisprBenchmark_ensemble_data_GRCh38.K562_features_NAfilled.tsv.gz"), + crispr_features_processed = os.path.join(MODELS_RESULTS_DIR, "{model}", "for_training.merged_CRISPR_dataset.overlapping_features.NAfilled.tsv.gz"), feature_table = lambda wildcards: model_config.loc[wildcards.model, 'feature_table'], - model_params = os.path.join(RESULTS_DIR, "{dataset}", "{model}", "model", "training_params.pkl") + model_params = os.path.join(MODELS_RESULTS_DIR, "{model}", "model", "training_params.pkl") params: epsilon = config["epsilon"], scripts_dir = SCRIPTS_DIR, polynomial = lambda wildcards: model_config.loc[wildcards.model, 'polynomial'], - out_dir = os.path.join(RESULTS_DIR, "{dataset}", "{model}", "model") + out_dir = os.path.join(MODELS_RESULTS_DIR, "{model}", "model") output: - trained_model = os.path.join(RESULTS_DIR, "{dataset}", "{model}", 
"model", "model_full.pkl"), - pred = os.path.join(RESULTS_DIR, "{dataset}", "{model}", "model", "training_predictions.tsv"), - #in_order = os.path.join(RESULTS_DIR, "{dataset}", "{model}", "model", "training_data_in_order.tsv"), - #shap = os.path.join(RESULTS_DIR, "{dataset}", "{model}", "model", "shap_scores.tsv") + trained_model = os.path.join(MODELS_RESULTS_DIR, "{model}", "model", "model_full.pkl"), + pred = os.path.join(MODELS_RESULTS_DIR, "{model}", "model", "training_predictions.tsv"), conda: "../envs/encode_re2g.yml" resources: diff --git a/workflow/rules/utils.smk b/workflow/rules/utils.smk index fa1db11..2fcc66b 100755 --- a/workflow/rules/utils.smk +++ b/workflow/rules/utils.smk @@ -35,6 +35,34 @@ def process_model_config(model_config): return model_config +# create dictionary of dictionaries to map datasets to crispr cell types: {model: {crispr_ct: dataset, ...}} +def make_model_dataset_dict(model_config, dataset_config): + + # dict of dataset: crispr_cell_type + dataset_celltype_dict = dict(zip(dataset_config['biosample'], dataset_config['crispr_cell_type'])) + + model_dicts = [] + for index, row in model_config.iterrows(): + model_datasets = [item.strip() for item in row["dataset"].split(",")] # return list of datasets + mapped_celltypes = [dataset_celltype_dict[dataset] for dataset in model_datasets] + + # make sure 1:1 correspondence with CRISPR cell types + crispr_data_cell_types = config["crispr_cell_types"][row["crispr_dataset"]] + + if sorted(crispr_data_cell_types) != sorted(mapped_celltypes): + print(f"Model datasets: {model_datasets} -> cell types: {mapped_celltypes}") + raise Exception(f"Datasets specified for {row['model']} do not map to all CRISPR cell types.") + + this_model_dict = dict(zip(mapped_celltypes, model_datasets)) + model_dicts.append(this_model_dict) + + # combine into dictionary of dictionaries + model_dataset_dict = dict(zip(model_config["model"], model_dicts)) + + return model_dataset_dict + + + def get_abc_config(config):
abc_config_file = os.path.join(config["ABC_DIR_PATH"], "config/config.yaml") with open(abc_config_file, 'r') as stream: @@ -86,7 +114,7 @@ def make_accessibility_file_df(biosample_df, biosample_activities): def get_input_for_bw(this_biosample, this_simple_id): df_sub = ACCESSIBILITY_DF.loc[(ACCESSIBILITY_DF["biosample"]==this_biosample) & (ACCESSIBILITY_DF["access_simple_id"]==this_simple_id)] - return df_sub["single_access_file"][0] + return df_sub["single_access_file"].item() def expand_biosample_df(biosample_df): # add new columns @@ -120,13 +148,15 @@ def process_abc_directory_column(model_config): # Make a dictionary of dataset:ABC_dir pairs ABC_BIOSAMPLES_DIR = {} for row in model_config.itertuples(index=False): - if row.dataset not in ABC_BIOSAMPLES_DIR: - if row.ABC_directory=="None": # ABC directory is not provided - if row.dataset not in dataset_config['biosample']: # is there info to run ABC? - raise Exception(f"Dataset {row.dataset} not specified in dataset_config.") - ABC_BIOSAMPLES_DIR[row.dataset] = os.path.join(RESULTS_DIR, row.dataset) - else: # ABC directory is provided - ABC_BIOSAMPLES_DIR[row.dataset] = row.ABC_directory + model_datasets = [item.strip() for item in row.dataset.split(",")] + for ds in model_datasets: + if ds not in ABC_BIOSAMPLES_DIR: + if row.ABC_directory=="None": # ABC directory is not provided + if ds not in dataset_config['biosample']: # is there info to run ABC? 
+ raise Exception(f"Dataset {ds} not specified in dataset_config.") + ABC_BIOSAMPLES_DIR[ds] = os.path.join(RESULTS_DIR, ds) + else: # ABC directory is provided + ABC_BIOSAMPLES_DIR[ds] = row.ABC_directory return ABC_BIOSAMPLES_DIR @@ -199,4 +229,4 @@ def get_tpm_threshold(biosample, model_name, biosample_df=None): return 0 else: tpm_file = os.path.basename(tpm_files[0]) - return tpm_file.split("threshold_")[1] \ No newline at end of file + return tpm_file.split("threshold_")[1] diff --git a/workflow/scripts/feature_tables/combine_feature_tables.R b/workflow/scripts/feature_tables/combine_feature_tables.R index 54cbe91..0268b49 100644 --- a/workflow/scripts/feature_tables/combine_feature_tables.R +++ b/workflow/scripts/feature_tables/combine_feature_tables.R @@ -4,15 +4,19 @@ library(dplyr) # inputs from snakemake model_config = fread(snakemake@input$model_config) -ds = snakemake@wildcards$dataset +ds = snakemake@wildcards$dataset -# merge feature tables -models_this = dplyr::filter(model_config, dataset==ds) -for (i in 1:nrow(models_this)){ - ft = fread(models_this$feature_table[i]) - if (i==1){df = ft} else {df = rbind(df, ft)} +# merge feature tables for models with this dataset +ft_files = c() +for (i in 1:nrow(model_config)){ + model_datasets = model_config$dataset[i] %>% strsplit(",") %>% unlist() %>% trimws() + if (ds %in% model_datasets) { + ft_files = c(ft_files, model_config$feature_table[i]) + } } +df <- lapply(ft_files, fread) %>% rbindlist() %>% as.data.frame() + # for sc-E2G pipeline if (("ARC.E2G.Score" %in% df$feature) | ("Kendall" %in% df$feature)){ ARC_rows = data.frame(c("RNA_meanLogNorm", "RNA_pseudobulkTPM", "RNA_percentCellsDetected", "Kendall", "ARC.E2G.Score", "ABC.Score", "normalizedATAC_enh"), diff --git a/workflow/scripts/feature_tables/gen_num_candidate_enh_gene.py b/workflow/scripts/feature_tables/gen_num_candidate_enh_gene.py index f1ecca0..4390a4f 100644 --- a/workflow/scripts/feature_tables/gen_num_candidate_enh_gene.py +++
b/workflow/scripts/feature_tables/gen_num_candidate_enh_gene.py @@ -3,35 +3,67 @@ import pandas as pd -def _populate_enhancer_count_from_tss(df, enhancers, is_upstream): - enh_indexes = enhancers.index - if is_upstream: - # start counting from the enhancer closest to TSS - enh_indexes = reversed(enh_indexes) +def determine_num_candidate_enh_gene(pred_df, out_file): + # Concept: Start by making a working copy of our data so we don't accidentally + # modify the original DataFrame that was passed into the function. + df = pred_df.copy() - count_from_tss = 0 - for enh_idx in enh_indexes: - count_from_tss += 1 - df.loc[enh_idx, "NumCandidateEnhGene"] = count_from_tss + # Concept: To determine if an enhancer is upstream or downstream, we need a single + # point to represent it. The midpoint is a simple and effective choice. + df["midpoint"] = ((df["start"] + df["end"]) / 2).astype("int") + # Concept: For our counting logic to work correctly, all enhancers must be + # sorted by their genomic position. This creates a predictable 'map' of the + # chromosome that we can operate on. We sort by gene first, then position. + df = df.sort_values(by=["TargetGene", "midpoint"], ascending=[True, True]) -def determine_num_candidate_enh_gene(pred_df, out_file): - # Need df to be sorted by midpoint for each chromosome - pred_df["midpoint"] = ((pred_df["start"] + pred_df["end"]) / 2).astype("int") - df = pred_df.sort_values(by=["chr", "midpoint"], ascending=True).reset_index( - drop=True + # Concept: We need to handle enhancers before the TSS differently from those after it. + # Let's create two 'masks'—like stencils—to quickly select all the upstream + # enhancers or all the downstream enhancers in one go. + is_downstream = df["midpoint"] > df["TargetGeneTSS"] + is_upstream = df["midpoint"] < df["TargetGeneTSS"] + + # --- Upstream Calculation --- + # Concept: To rank upstream enhancers, we need to count *away* from the TSS. 
+ # This means starting with the enhancer closest to the TSS (the one with the + # highest 'midpoint' coordinate) and counting backwards. + # We achieve this by creating a temporary, sorted copy of just the upstream enhancers. + df.loc[is_upstream, "NumCandidateEnhGene"] = ( + # 1. Select only the upstream enhancers. This creates a temporary table. + df[is_upstream] + # 2. Sort this temporary table in DESCENDING order of position. Now, the + # enhancer closest to the TSS is at the top of each gene's group. + .sort_values("midpoint", ascending=False) + # 3. Group by gene and then perform a cumulative count. `cumcount` automatically + # creates the ranking (0, 1, 2, ...) for us within each gene's group. + .groupby("TargetGene").cumcount() + # 4. `cumcount` starts at 0, but we want our ranks to start at 1. + + 1 ) - gene_groups = df.groupby(["TargetGene", "TargetGeneTSS"]) - for (gene, tss), indexes in gene_groups.groups.items(): - enhancers = df.loc[indexes] - upstream_enh = enhancers[enhancers["midpoint"] < tss] - downstream_enh = enhancers[enhancers["midpoint"] > tss] - _populate_enhancer_count_from_tss(df, upstream_enh, is_upstream=True) - _populate_enhancer_count_from_tss(df, downstream_enh, is_upstream=False) + # --- Downstream Calculation --- + # Concept: For downstream enhancers, the default sort order (ascending) is already + # correct for counting away from the TSS. The enhancer with the lowest + # 'midpoint' coordinate is the one closest to the TSS. + df.loc[is_downstream, "NumCandidateEnhGene"] = ( + # 1. Select only the downstream enhancers. + df[is_downstream] + # 2. Group by gene and perform the cumulative count. No special sorting needed. + .groupby("TargetGene").cumcount() + # 3. Add 1 to start the rank from 1 instead of 0. + + 1 + ) + # Concept: Any enhancers located exactly at the TSS (or any other unhandled cases) + # will have a missing ('NaN') rank. We'll fill these with 0 to be safe. 
df = df.fillna(value=0) df["NumCandidateEnhGene"] = df["NumCandidateEnhGene"].astype("int") + + # Concept: Before saving, restore the desired chromosomal sort order + df = df.sort_values(by=["chr", "midpoint"], ascending=True).reset_index(drop=True) + + # Concept: Finally, select only the columns we need for the final report and + # save the result to a file. df[["name", "TargetGene", "NumCandidateEnhGene"]].to_csv( out_file, sep="\t", @@ -44,7 +76,7 @@ def determine_num_candidate_enh_gene(pred_df, out_file): @click.option("--abc_predictions") @click.option("--out_file") def main(abc_predictions, out_file): - pred_df = pd.read_csv(abc_predictions, sep="\t") + pred_df = pd.read_csv(abc_predictions, sep="\t", compression="gzip") if len(pred_df) == 0: raise Exception("Did not find any enhancers in the Predictions file") diff --git a/workflow/scripts/feature_tables/gen_num_sum_nearby_enhancers.py b/workflow/scripts/feature_tables/gen_num_sum_nearby_enhancers.py index d1aab36..c887fa6 100644 --- a/workflow/scripts/feature_tables/gen_num_sum_nearby_enhancers.py +++ b/workflow/scripts/feature_tables/gen_num_sum_nearby_enhancers.py @@ -35,11 +35,18 @@ def generate_num_sum_enhancers( ) # select columns from EnhancerPredictionsAllPutative - os.system( - "zcat {} | csvtk cut -t -f chr,start,end,name,activity_base | sed '1d' > {}".format( - pred_file, pred_slim - ) + df = pd.read_csv( + pred_file, + sep="\t", + usecols=["chr", "start", "end", "name", "activity_base"], + compression="gzip", ) + df.to_csv(pred_slim, sep="\t", index=False, header=False) + # os.system( + # "zcat {} | csvtk cut -t -f chr,start,end,name,activity_base | sed '1d' > {}".format( + # pred_file, pred_slim + # ) + # ) # intersect with expanded enhancer regions and count n enhancers os.system( diff --git a/workflow/scripts/feature_tables/gen_num_tss_enh_gene.py b/workflow/scripts/feature_tables/gen_num_tss_enh_gene.py index 211a2bb..985ee34 100644 --- 
a/workflow/scripts/feature_tables/gen_num_tss_enh_gene.py +++ b/workflow/scripts/feature_tables/gen_num_tss_enh_gene.py @@ -3,9 +3,7 @@ import pandas as pd -def determine_num_tss_enh_gene( - pred_df, ref_gene_tss, extended_enhancers, enhancer_tss_int, out_file -): +def determine_num_tss_enh_gene(pred_df, ref_gene_tss, extended_enhancers, out_file): # make the end be midpoint of enhancer + distance (This gives you the end coordinate of distance range) pred_df["midpoint"] = ((pred_df["start"] + pred_df["end"]) / 2).astype("int") pred_df["new_end"] = (pred_df["midpoint"] + pred_df["distance"]).astype("int") @@ -24,25 +22,17 @@ def determine_num_tss_enh_gene( extended_enhancers, sep="\t", index=False ) - # Intersect midpoint of enhancer to target gene regions with TSSs (includes overlap with target gene) - os.system( - "sed '1d' {} | bedtools intersect -a stdin -b {} -wa -wb | cut -f4,5 | pigz > {}".format( - extended_enhancers, ref_gene_tss, enhancer_tss_int - ) - ) - print("Reading in {}".format(enhancer_tss_int)) - predictions = pd.read_csv(enhancer_tss_int, sep="\t", names=["class", "gene"]) - - # Calculate the number of TSS regions that fall within the enhancer to target gene regions. - num_tss_between_enh_and_gene = ( - predictions.groupby(["class", "gene"]).size().reset_index() - ) + # This pipeline performs the intersection, counting, and saves the final small result. + # It is the command-line equivalent of groupby().size() and is very memory-efficient. 
+ header = "name\tgene\tcount\n" + cmd = f""" + printf "{header}" > {out_file}; + sed '1d' {extended_enhancers} | bedtools intersect -a stdin -b {ref_gene_tss} -wa -c \ + | cut -f4,5,6 \ + >> {out_file} + """ + os.system(cmd) - num_tss_between_enh_and_gene.to_csv( - out_file, - sep="\t", - index=False, - ) print("Saved num TSS between enh and gene") @@ -50,16 +40,13 @@ def determine_num_tss_enh_gene( @click.option("--abc_predictions") @click.option("--ref_gene_tss") @click.option("--extended_enhancers") -@click.option("--enhancer_tss_int") @click.option("--out_file") -def main(abc_predictions, ref_gene_tss, extended_enhancers, enhancer_tss_int, out_file): - pred_df = pd.read_csv(abc_predictions, sep="\t") +def main(abc_predictions, ref_gene_tss, extended_enhancers, out_file): + pred_df = pd.read_csv(abc_predictions, sep="\t", compression="gzip") if len(pred_df) == 0: raise Exception("Did not find any enhancers in the Predictions file") - determine_num_tss_enh_gene( - pred_df, ref_gene_tss, extended_enhancers, enhancer_tss_int, out_file - ) + determine_num_tss_enh_gene(pred_df, ref_gene_tss, extended_enhancers, out_file) if __name__ == "__main__": diff --git a/workflow/scripts/feature_tables/overlap_features_with_crispr_data.R b/workflow/scripts/feature_tables/overlap_features_with_crispr_data.R index 34d2850..ca470d9 100755 --- a/workflow/scripts/feature_tables/overlap_features_with_crispr_data.R +++ b/workflow/scripts/feature_tables/overlap_features_with_crispr_data.R @@ -57,7 +57,7 @@ merge_feature_to_crispr <- function(crispr, features, feature_score_cols, agg_fu output <- merged # sort output by cre position - output <- output[order(dataset, chrom, chromStart, chromEnd, measuredGeneSymbol), ] + output <- output[order(chrom, chromStart, chromEnd, measuredGeneSymbol), ] return(list(merged=output, missing=missing)) } @@ -81,9 +81,10 @@ aggregate_features <- function(merged, feature_score_cols, agg_cols, agg_fun) { # load feature table features <- fread(snakemake@input$features) -#
load crispri data and only retain relevant columns +# load crispri data and only retain relevant columns and filter to cell type crispr <- fread(snakemake@input$crispr) -crispr <- select(crispr, -c(pair_uid, merged_uid, merged_start, merged_end)) +crispr <- select(crispr, -any_of(c("pair_uid", "merged_uid", "merged_start", "merged_end"))) %>% + filter(CellType == snakemake@params$crispr_cell_type) # load feature config file and only retain entries for features in input data config <- fread(snakemake@input$feature_table_file) diff --git a/workflow/scripts/feature_tables/process_crispr_data.R b/workflow/scripts/feature_tables/process_crispr_data.R index e382a16..73f070c 100755 --- a/workflow/scripts/feature_tables/process_crispr_data.R +++ b/workflow/scripts/feature_tables/process_crispr_data.R @@ -8,19 +8,45 @@ suppressPackageStartupMessages({ library(tibble) }) +combine_crispr_features <- function(space_sep_string) { + crispr_feature_files <- space_sep_string %>% strsplit(" ") %>% unlist() %>% trimws() + crispr_features <- lapply(crispr_feature_files, fread) + + # get common columns (e.g. 
features in all) + col_names_list <- lapply(crispr_features, colnames) + common_cols <- Reduce(intersect, col_names_list) + + # filter crispr features to selected columns then combine + crispr_features <- lapply(crispr_features, function(df) select(df, all_of(common_cols))) %>% + rbindlist() %>% as.data.frame() + + return(crispr_features) +} + + + # load inputs -df <- fread(snakemake@input$crispr_features) +crispr_features <- combine_crispr_features(snakemake@input$crispr_features) +crispr_missing <- combine_crispr_features(snakemake@input$crispr_missing) + + genes <- fread(snakemake@params$genes) colnames(genes) <- c('chr', 'start', 'end', 'gene', 'score', 'strand', 'Ensembl_ID', 'gene_type') # rename some column names for later utility -df = dplyr::rename(df, 'chr'='chrom', 'start'='chromStart', 'end'='chromEnd', 'TargetGene'='measuredGeneSymbol') +crispr_features <- crispr_features %>% + rename('chr'='chrom', 'start'='chromStart', 'end'='chromEnd', 'TargetGene'='measuredGeneSymbol') +crispr_missing <- crispr_missing %>% + rename('chr'='chrom', 'start'='chromStart', 'end'='chromEnd', 'TargetGene'='measuredGeneSymbol') # drop elements where Regulated=NA -df = dplyr::filter(df, Regulated!="NA") +crispr_features = filter(crispr_features, Regulated != "NA" & !is.na(Regulated)) +crispr_missing = filter(crispr_missing, Regulated != "NA" & !is.na(Regulated)) # filter to elements with target gene in gene universe (should be redundant with applying filter_genes=tss_universe in feature overlap) -df = dplyr::filter(df, TargetGene %in% genes$gene) +crispr_features = filter(crispr_features, TargetGene %in% genes$gene) +crispr_missing = filter(crispr_missing, TargetGene %in% genes$gene) # save output to file -fwrite(df, file = snakemake@output$processed, sep = "\t", na = "NA", quote = FALSE) +fwrite(crispr_features, file = snakemake@output$processed, sep = "\t", na = "NA", quote = FALSE) +fwrite(crispr_missing, file = snakemake@output$missing, sep = "\t", na = "NA", quote = 
FALSE) diff --git a/workflow/scripts/model_application/generate_plots.py b/workflow/scripts/model_application/generate_plots.py index 9fe0a42..9d22879 100755 --- a/workflow/scripts/model_application/generate_plots.py +++ b/workflow/scripts/model_application/generate_plots.py @@ -31,13 +31,13 @@ def load_and_save_stat_files(stat_files, output_table) -> List[pd.DataFrame]: cell_cluster = Path(stat_file).parts[-3] model_name = Path(stat_file).parts[-2] pred_id = f"{cell_cluster}_{model_name}" - results[pred_id] = pd.read_csv(stat_file, sep="\t").set_index("Metric").assign( - cell_cluster=cell_cluster, - model_name=model_name + results[pred_id] = ( + pd.read_csv(stat_file, sep="\t") + .set_index("Metric") + .assign(cell_cluster=cell_cluster, model_name=model_name) ) results_no_ind[pred_id] = pd.read_csv(stat_file, sep="\t").assign( - cell_cluster=cell_cluster, - model_name=model_name + cell_cluster=cell_cluster, model_name=model_name ) # save merged dataframe @@ -178,7 +178,7 @@ def save_outlier_stats(stat_dfs, metric, pdf_writer): pred_values = {} for this_df in stat_dfs.values(): pred_id = f"{this_df['cell_cluster'].iloc[0]}_{this_df['model_name'].iloc[0]}" - pred_values[pred_id] = this_df.loc[metric, VALUE_KEY] + pred_values[pred_id] = this_df.loc[metric, VALUE_KEY] sorted_items = sorted(pred_values.items(), key=lambda item: item[1]) diff --git a/workflow/scripts/model_application/get_stats.py b/workflow/scripts/model_application/get_stats.py index b38389b..cb8cd9d 100755 --- a/workflow/scripts/model_application/get_stats.py +++ b/workflow/scripts/model_application/get_stats.py @@ -19,29 +19,31 @@ def count_bam_total(bam_file: str) -> int: no_alt_chrom_df = df[df["chr"].isin(NORMAL_CHROMOSOMES)] return no_alt_chrom_df["mapped_reads"].sum() + def count_lines(file_path): count = 0 if file_path.endswith(".gz"): - with gzip.open(file_path, 'rt') as file: # 'rt' mode is for reading text - reader = csv.reader(file, delimiter='\t') + with gzip.open(file_path, "rt") as 
file: # 'rt' mode is for reading text + reader = csv.reader(file, delimiter="\t") for row in reader: if row[0] in NORMAL_CHROMOSOMES: count += 1 else: - reader = csv.reader(file_path, delimiter='\t') + reader = csv.reader(file_path, delimiter="\t") for row in reader: if row[0] in NORMAL_CHROMOSOMES: count += 1 return count + def get_num_reads(accessibility_files): total_counts = 0 for access_in in accessibility_files: if access_in.endswith(".bam"): total_counts += count_bam_total(access_in) elif "tagAlign" in access_in: - total_counts += count_lines(access_in)/2 - else: # hope it's a frag file + total_counts += count_lines(access_in) / 2 + else: # hope it's a frag file total_counts += count_lines(access_in) return total_counts @@ -81,7 +83,7 @@ def get_mean_num_enh_per_gene_no_prom(df): def get_mean_log_dist_to_tss(df): dist_cols = [x for x in ["distance", "distanceToTSS"] if x in df.columns] - if len(dist_cols)>0: + if len(dist_cols) > 0: log_dist = df[dist_cols[0]].apply(np.log10) log_dist = log_dist.replace(-np.inf, 0) return log_dist.mean() diff --git a/workflow/scripts/model_application/run_e2g.py b/workflow/scripts/model_application/run_e2g.py index cd6daa6..bab67fb 100755 --- a/workflow/scripts/model_application/run_e2g.py +++ b/workflow/scripts/model_application/run_e2g.py @@ -7,7 +7,9 @@ SCORE_COLUMN_BASE = "E2G.Score" -def make_e2g_predictions(df_enhancers, feature_list, trained_model, tpm_threshold, epsilon): +def make_e2g_predictions( + df_enhancers, feature_list, trained_model, tpm_threshold, epsilon +): # transform the features X = df_enhancers.loc[:, feature_list] X = np.log(np.abs(X) + epsilon) @@ -15,13 +17,18 @@ def make_e2g_predictions(df_enhancers, feature_list, trained_model, tpm_threshol with open(trained_model, "rb") as f: model = pickle.load(f) probs = model.predict_proba(X) - + if ("RNA_pseudobulkTPM" in df_enhancers.columns) and (tpm_threshold > 0): df_enhancers[SCORE_COLUMN_BASE + ".ignoreTPM"] = probs[:, 1] - 
df_enhancers[SCORE_COLUMN_BASE] = [score if tpm >= tpm_threshold else 0 - for (score, tpm) in zip(df_enhancers[SCORE_COLUMN_BASE + ".ignoreTPM"] , df_enhancers["RNA_pseudobulkTPM"])] + df_enhancers[SCORE_COLUMN_BASE] = [ + score if tpm >= tpm_threshold else 0 + for (score, tpm) in zip( + df_enhancers[SCORE_COLUMN_BASE + ".ignoreTPM"], + df_enhancers["RNA_pseudobulkTPM"], + ) + ] else: - df_enhancers[SCORE_COLUMN_BASE] = probs[:, 1] + df_enhancers[SCORE_COLUMN_BASE] = probs[:, 1] return df_enhancers @@ -33,7 +40,9 @@ def make_e2g_predictions(df_enhancers, feature_list, trained_model, tpm_threshol @click.option("--tpm_threshold", default=0) @click.option("--epsilon", type=float, default=0.01) @click.option("--output_file", required=True) -def main(predictions, feature_table_file, trained_model, tpm_threshold, epsilon, output_file): +def main( + predictions, feature_table_file, trained_model, tpm_threshold, epsilon, output_file +): feature_table = pd.read_csv(feature_table_file, sep="\t") feature_list = feature_table["feature"] diff --git a/workflow/scripts/model_training/backward_sequential_feature_selection.py b/workflow/scripts/model_training/backward_sequential_feature_selection.py index 9bdf854..68fd808 100644 --- a/workflow/scripts/model_training/backward_sequential_feature_selection.py +++ b/workflow/scripts/model_training/backward_sequential_feature_selection.py @@ -7,286 +7,286 @@ from sklearn.preprocessing import PolynomialFeatures from sklearn.linear_model import LogisticRegression from training_functions import ( - statistic_aupr, - statistic_precision, - train_and_predict_once, - bootstrap_pvalue, - statistic_delta_aupr, - threshold_70_pct_recall, - statistic_precision_at_threshold, - statistic_delta_precision_at_threshold, + statistic_aupr, + statistic_precision, + train_and_predict_once, + bootstrap_pvalue, + statistic_delta_aupr, + threshold_70_pct_recall, + statistic_precision_at_threshold, + statistic_delta_precision_at_threshold, ) def 
SBFS(df_dataset, feature_table, model_name, epsilon, params, polynomial=False): - # specify feature list - feature_list_core = feature_table["feature"] - X = df_dataset.loc[:, feature_list_core] - model_name_core = model_name - feature_list = feature_list_core - - # transform features first - X = np.log(np.abs(X) + epsilon) - Y_true = df_dataset["Regulated"].values.astype(np.int64) - - # optionally add polynomial features - if polynomial: - poly = PolynomialFeatures(degree=2) - X = poly.fit_transform(X) - polynomial_names = poly.get_feature_names_out(feature_list_core) - X = pd.DataFrame(X, columns=polynomial_names) - feature_list = pd.Series(polynomial_names) - - n_features = len(feature_list) - - feature_removed = [] - all_auprs = [] - all_precisions = [] - - # train all feature model - model_name = model_name_core + "_full" - df_dataset = train_and_predict_once( - df_dataset, X, Y_true, feature_list, model_name, params - ) - Y_pred = df_dataset[model_name + ".Score"] - aupr = statistic_aupr(Y_true, Y_pred) - thresh = threshold_70_pct_recall(Y_true, Y_pred) - precision_at_70_pct_recall = statistic_precision_at_threshold( - Y_true, Y_pred, thresh - ) - - feature_removed = feature_removed + ["None"] - all_auprs = all_auprs + [aupr] - all_precisions = all_precisions + [precision_at_70_pct_recall] - - for i in range(n_features - 1): # iterate (# of feature) times - best_k = -1 - best_aupr = 0 - best_precision = 0 - for k in range( - len(feature_list) - ): # iterate through each feature, remove one with least effect - new_feature = feature_list[k] # feature to remove - print(new_feature) - features = feature_list.loc[feature_list != new_feature].reset_index( - drop=True - ) - print(features) - model_name = model_name_core + "_" + str(i + 1) + "_" + str(k + 1) - print(model_name) - - df_dataset = train_and_predict_once( - df_dataset, X, Y_true, features, model_name, params - ) - - # evaluate model once trained - Y_pred = df_dataset[model_name + ".Score"] - aupr = 
statistic_aupr(Y_true, Y_pred) - thresh = threshold_70_pct_recall(Y_true, Y_pred) - precision_at_70_pct_recall = statistic_precision_at_threshold( - Y_true, Y_pred, thresh - ) - - if aupr >= best_aupr: - best_aupr = aupr - best_k = k - best_precision = precision_at_70_pct_recall - - # update best feature - feature_removed = feature_removed + [feature_list[best_k]] - all_auprs = all_auprs + [best_aupr] - all_precisions = all_precisions + [best_precision] - feature_list = feature_list.loc[ - feature_list != feature_list[best_k] - ].reset_index(drop=True) - # add remaining feature - feature_removed = feature_removed + [feature_list[0]] - print(feature_removed) - print(all_auprs) - print(all_precisions) - - return feature_removed + # specify feature list + feature_list_core = feature_table["feature"] + X = df_dataset.loc[:, feature_list_core] + model_name_core = model_name + feature_list = feature_list_core + + # transform features first + X = np.log(np.abs(X) + epsilon) + Y_true = df_dataset["Regulated"].values.astype(np.int64) + + # optionally add polynomial features + if polynomial: + poly = PolynomialFeatures(degree=2) + X = poly.fit_transform(X) + polynomial_names = poly.get_feature_names_out(feature_list_core) + X = pd.DataFrame(X, columns=polynomial_names) + feature_list = pd.Series(polynomial_names) + + n_features = len(feature_list) + + feature_removed = [] + all_auprs = [] + all_precisions = [] + + # train all feature model + model_name = model_name_core + "_full" + df_dataset = train_and_predict_once( + df_dataset, X, Y_true, feature_list, model_name, params + ) + Y_pred = df_dataset[model_name + ".Score"] + aupr = statistic_aupr(Y_true, Y_pred) + thresh = threshold_70_pct_recall(Y_true, Y_pred) + precision_at_70_pct_recall = statistic_precision_at_threshold( + Y_true, Y_pred, thresh + ) + + feature_removed = feature_removed + ["None"] + all_auprs = all_auprs + [aupr] + all_precisions = all_precisions + [precision_at_70_pct_recall] + + for i in 
range(n_features - 1): # iterate (# of feature) times + best_k = -1 + best_aupr = 0 + best_precision = 0 + for k in range( + len(feature_list) + ): # iterate through each feature, remove one with least effect + new_feature = feature_list[k] # feature to remove + print(new_feature) + features = feature_list.loc[feature_list != new_feature].reset_index( + drop=True + ) + print(features) + model_name = model_name_core + "_" + str(i + 1) + "_" + str(k + 1) + print(model_name) + + df_dataset = train_and_predict_once( + df_dataset, X, Y_true, features, model_name, params + ) + + # evaluate model once trained + Y_pred = df_dataset[model_name + ".Score"] + aupr = statistic_aupr(Y_true, Y_pred) + thresh = threshold_70_pct_recall(Y_true, Y_pred) + precision_at_70_pct_recall = statistic_precision_at_threshold( + Y_true, Y_pred, thresh + ) + + if aupr >= best_aupr: + best_aupr = aupr + best_k = k + best_precision = precision_at_70_pct_recall + + # update best feature + feature_removed = feature_removed + [feature_list[best_k]] + all_auprs = all_auprs + [best_aupr] + all_precisions = all_precisions + [best_precision] + feature_list = feature_list.loc[ + feature_list != feature_list[best_k] + ].reset_index(drop=True) + # add remaining feature + feature_removed = feature_removed + [feature_list[0]] + print(feature_removed) + print(all_auprs) + print(all_precisions) + + return feature_removed def SBFS_significance( - df_dataset, - feature_table, - model_name, - epsilon, - params, - polynomial=False, - n_boot=1000, + df_dataset, + feature_table, + model_name, + epsilon, + params, + polynomial=False, + n_boot=1000, ): - feature_list = SBFS( - df_dataset, feature_table, model_name, epsilon, params, polynomial - ) # get order of features - feature_list_core = feature_table["feature"] - model_name_core = model_name - - # calculate polynomial features - if polynomial: - X_core = df_dataset.loc[:, feature_list_core] - poly = PolynomialFeatures(degree=2) - X_2 = poly.fit_transform(X_core) 
- polynomial_names = poly.get_feature_names_out(X_core.columns) - X_2 = pd.DataFrame(X_2, columns=polynomial_names) - X = X_2 - else: - X = df_dataset.loc[:, feature_list_core] - - X = np.log(np.abs(X) + epsilon) - Y_true = df_dataset["Regulated"].values.astype(np.int64) - - df = pd.DataFrame( - columns=[ - "feature_removed", - "aupr", - "delta_aupr", - "delta_aupr_low", - "delta_aupr_high", - "pval_aupr", - "precision", - "delta_precision", - "delta_precision_low", - "delta_precision_high", - "pval_precision", - ] - ) - # train first model (all features) - model_name = model_name_core + "_full" - feature_list.remove("None") - df_dataset = train_and_predict_once( - df_dataset, X, Y_true, feature_list, model_name, params - ) - Y_new = df_dataset[model_name + ".Score"] - - data = (Y_true, Y_new) - res_aupr = scipy.stats.bootstrap( - data, - statistic_aupr, - n_resamples=n_boot, - paired=True, - confidence_level=0.95, - method="BCa", - ) - thresh = threshold_70_pct_recall(Y_true, Y_new) - res_precision = scipy.stats.bootstrap( - data, - lambda Y_true, Y_new: statistic_precision_at_threshold(Y_true, Y_new, thresh), - n_resamples=n_boot, - paired=True, - confidence_level=0.95, - method="BCa", - ) - aupr = np.mean(res_aupr.bootstrap_distribution) - precision = np.mean(res_precision.bootstrap_distribution) - - df_temp = pd.DataFrame( - { - "feature_removed": "None", - "aupr": aupr, - "delta_aupr": 0, - "delta_aupr_low": 0, - "delta_aupr_high": 0, - "pval_aupr": 1, - "precision": precision, - "delta_precision": 0, - "delta_precision_low": 0, - "delta_precision_high": 0, - "pval_precision": 1, - }, - index=[0], - ) - df = pd.concat([df, df_temp]) - - Y_last = Y_new - - # loop feature list from SBFS through to train next model and compare - for i in range(len(feature_list)): - to_remove = feature_list[0] - print(to_remove) - if len(feature_list) == 1: - df_dataset["all_true"] = 1 - Y_new = df_dataset["all_true"] - else: - feature_list.remove(to_remove) - model_name = 
model_name_core + "_" + str(i + 1) - df_dataset = train_and_predict_once( - df_dataset, X, Y_true, feature_list, model_name, params - ) - Y_new = df_dataset[model_name + ".Score"] - - data_compare = (Y_true, Y_last, Y_new) - thresh_last = threshold_70_pct_recall(Y_true, Y_last) - thresh_new = threshold_70_pct_recall(Y_true, Y_new) - res_delta_aupr = scipy.stats.bootstrap( - data_compare, - statistic_delta_aupr, - n_resamples=n_boot, - paired=True, - confidence_level=0.95, - method="BCa", - ) - res_delta_precision = scipy.stats.bootstrap( - data_compare, - lambda Y_true, Y_last, Y_new,: statistic_delta_precision_at_threshold( - Y_true, Y_last, Y_new, thresh_last, thresh_new - ), - n_resamples=n_boot, - paired=True, - confidence_level=0.95, - method="BCa", - ) - delta_aupr = np.mean( - res_delta_aupr.bootstrap_distribution - ) # aupr_new - aupr_last - delta_precision = np.mean(res_delta_precision.bootstrap_distribution) - pval_aupr = bootstrap_pvalue(delta_aupr, res_delta_aupr) - pval_precision = bootstrap_pvalue(delta_precision, res_delta_precision) - - data = (Y_true, Y_new) - res_aupr = scipy.stats.bootstrap( - data, - statistic_aupr, - n_resamples=n_boot, - paired=True, - confidence_level=0.95, - method="BCa", - ) - thresh = threshold_70_pct_recall(Y_true, Y_new) - res_precision = scipy.stats.bootstrap( - data, - lambda Y_true, Y_new: statistic_precision_at_threshold( - Y_true, Y_new, thresh - ), - n_resamples=n_boot, - paired=True, - confidence_level=0.95, - method="BCa", - ) - aupr = np.mean(res_aupr.bootstrap_distribution) - precision = np.mean(res_precision.bootstrap_distribution) - - df_temp = pd.DataFrame( - { - "feature_removed": to_remove, - "aupr": aupr, - "delta_aupr": delta_aupr, - "delta_aupr_low": res_delta_aupr.confidence_interval[0], - "delta_aupr_high": res_delta_aupr.confidence_interval[1], - "pval_aupr": pval_aupr, - "precision": precision, - "delta_precision": delta_precision, - "delta_precision_low": res_delta_precision.confidence_interval[0], 
- "delta_precision_high": res_delta_precision.confidence_interval[1], - "pval_precision": pval_precision, - }, - index=[0], - ) - df = pd.concat([df, df_temp]) - Y_last = Y_new - - return df + feature_list = SBFS( + df_dataset, feature_table, model_name, epsilon, params, polynomial + ) # get order of features + feature_list_core = feature_table["feature"] + model_name_core = model_name + + # calculate polynomial features + if polynomial: + X_core = df_dataset.loc[:, feature_list_core] + poly = PolynomialFeatures(degree=2) + X_2 = poly.fit_transform(X_core) + polynomial_names = poly.get_feature_names_out(X_core.columns) + X_2 = pd.DataFrame(X_2, columns=polynomial_names) + X = X_2 + else: + X = df_dataset.loc[:, feature_list_core] + + X = np.log(np.abs(X) + epsilon) + Y_true = df_dataset["Regulated"].values.astype(np.int64) + + df = pd.DataFrame( + columns=[ + "feature_removed", + "aupr", + "delta_aupr", + "delta_aupr_low", + "delta_aupr_high", + "pval_aupr", + "precision", + "delta_precision", + "delta_precision_low", + "delta_precision_high", + "pval_precision", + ] + ) + # train first model (all features) + model_name = model_name_core + "_full" + feature_list.remove("None") + df_dataset = train_and_predict_once( + df_dataset, X, Y_true, feature_list, model_name, params + ) + Y_new = df_dataset[model_name + ".Score"] + + data = (Y_true, Y_new) + res_aupr = scipy.stats.bootstrap( + data, + statistic_aupr, + n_resamples=n_boot, + paired=True, + confidence_level=0.95, + method="BCa", + ) + thresh = threshold_70_pct_recall(Y_true, Y_new) + res_precision = scipy.stats.bootstrap( + data, + lambda Y_true, Y_new: statistic_precision_at_threshold(Y_true, Y_new, thresh), + n_resamples=n_boot, + paired=True, + confidence_level=0.95, + method="BCa", + ) + aupr = np.mean(res_aupr.bootstrap_distribution) + precision = np.mean(res_precision.bootstrap_distribution) + + df_temp = pd.DataFrame( + { + "feature_removed": "None", + "aupr": aupr, + "delta_aupr": 0, + "delta_aupr_low": 
0, + "delta_aupr_high": 0, + "pval_aupr": 1, + "precision": precision, + "delta_precision": 0, + "delta_precision_low": 0, + "delta_precision_high": 0, + "pval_precision": 1, + }, + index=[0], + ) + df = pd.concat([df, df_temp]) + + Y_last = Y_new + + # loop feature list from SBFS through to train next model and compare + for i in range(len(feature_list)): + to_remove = feature_list[0] + print(to_remove) + if len(feature_list) == 1: + df_dataset["all_true"] = 1 + Y_new = df_dataset["all_true"] + else: + feature_list.remove(to_remove) + model_name = model_name_core + "_" + str(i + 1) + df_dataset = train_and_predict_once( + df_dataset, X, Y_true, feature_list, model_name, params + ) + Y_new = df_dataset[model_name + ".Score"] + + data_compare = (Y_true, Y_last, Y_new) + thresh_last = threshold_70_pct_recall(Y_true, Y_last) + thresh_new = threshold_70_pct_recall(Y_true, Y_new) + res_delta_aupr = scipy.stats.bootstrap( + data_compare, + statistic_delta_aupr, + n_resamples=n_boot, + paired=True, + confidence_level=0.95, + method="BCa", + ) + res_delta_precision = scipy.stats.bootstrap( + data_compare, + lambda Y_true, Y_last, Y_new,: statistic_delta_precision_at_threshold( + Y_true, Y_last, Y_new, thresh_last, thresh_new + ), + n_resamples=n_boot, + paired=True, + confidence_level=0.95, + method="BCa", + ) + delta_aupr = np.mean( + res_delta_aupr.bootstrap_distribution + ) # aupr_new - aupr_last + delta_precision = np.mean(res_delta_precision.bootstrap_distribution) + pval_aupr = bootstrap_pvalue(delta_aupr, res_delta_aupr) + pval_precision = bootstrap_pvalue(delta_precision, res_delta_precision) + + data = (Y_true, Y_new) + res_aupr = scipy.stats.bootstrap( + data, + statistic_aupr, + n_resamples=n_boot, + paired=True, + confidence_level=0.95, + method="BCa", + ) + thresh = threshold_70_pct_recall(Y_true, Y_new) + res_precision = scipy.stats.bootstrap( + data, + lambda Y_true, Y_new: statistic_precision_at_threshold( + Y_true, Y_new, thresh + ), + n_resamples=n_boot, 
+ paired=True, + confidence_level=0.95, + method="BCa", + ) + aupr = np.mean(res_aupr.bootstrap_distribution) + precision = np.mean(res_precision.bootstrap_distribution) + + df_temp = pd.DataFrame( + { + "feature_removed": to_remove, + "aupr": aupr, + "delta_aupr": delta_aupr, + "delta_aupr_low": res_delta_aupr.confidence_interval[0], + "delta_aupr_high": res_delta_aupr.confidence_interval[1], + "pval_aupr": pval_aupr, + "precision": precision, + "delta_precision": delta_precision, + "delta_precision_low": res_delta_precision.confidence_interval[0], + "delta_precision_high": res_delta_precision.confidence_interval[1], + "pval_precision": pval_precision, + }, + index=[0], + ) + df = pd.concat([df, df_temp]) + Y_last = Y_new + + return df @click.command() @@ -297,20 +297,20 @@ def SBFS_significance( @click.option("--epsilon", type=float, default=0.01) @click.option("--params_file", required=True) def main( - crispr_features_file, feature_table_file, out_dir, polynomial, params_file, epsilon + crispr_features_file, feature_table_file, out_dir, polynomial, params_file, epsilon ): - model_name = "E2G" - df_dataset = pd.read_csv(crispr_features_file, sep="\t") - feature_table = pd.read_csv(feature_table_file, sep="\t") - with open(params_file, "rb") as handle: - params = pickle.load(handle) + model_name = "E2G" + df_dataset = pd.read_csv(crispr_features_file, sep="\t") + feature_table = pd.read_csv(feature_table_file, sep="\t") + with open(params_file, "rb") as handle: + params = pickle.load(handle) - res = SBFS_significance( - df_dataset, feature_table, model_name, epsilon, params, polynomial, n_boot=1000 - ) + res = SBFS_significance( + df_dataset, feature_table, model_name, epsilon, params, polynomial, n_boot=1000 + ) - res.to_csv(out_dir + "/backward_feature_selection.tsv", sep="\t", index=False) + res.to_csv(out_dir + "/backward_feature_selection.tsv", sep="\t", index=False) if __name__ == "__main__": - main() + main() diff --git 
a/workflow/scripts/model_training/compare_all_feature_sets.py b/workflow/scripts/model_training/compare_all_feature_sets.py index fc84e78..f8594a1 100644 --- a/workflow/scripts/model_training/compare_all_feature_sets.py +++ b/workflow/scripts/model_training/compare_all_feature_sets.py @@ -5,11 +5,9 @@ import scipy from training_functions import ( statistic_aupr, - statistic_precision, + statistic_precision_at_threshold, train_and_predict_once, - bootstrap_pvalue, - statistic_delta_aupr, - statistic_delta_precision, + threshold_70_pct_recall, ) @@ -32,10 +30,13 @@ def compare_feature_sets(df_dataset, feature_table, epsilon, params, n_boot): df.loc[len(df)] = fill_zeros df["features"] = df.apply(lambda row: list(df.columns[row == 1]), axis=1) - df["n_features"] = 0 - df["AUPRC"] = 0 - df["AUPRC_95CI_low"] = 0 - df["AUPRC_95CI_high"] = 0 + df["n_features"] = 0.0 + df["AUPRC"] = 0.0 + df["AUPRC_95CI_low"] = 0.0 + df["AUPRC_95CI_high"] = 0.0 + df["precision_70pct_recall"] = 0.0 + df["precision_95CI_low"] = 0.0 + df["precision_95CI_high"] = 0.0 for i in range(len(df)): # iterate through sets model_name = "row_" + str(i) @@ -48,9 +49,8 @@ def compare_feature_sets(df_dataset, feature_table, epsilon, params, n_boot): ) Y_pred = df_dataset[model_name + ".Score"] - data = (Y_true, Y_pred) res_aupr = scipy.stats.bootstrap( - data, + (Y_true, Y_pred), statistic_aupr, n_resamples=n_boot, paired=True, @@ -61,6 +61,22 @@ def compare_feature_sets(df_dataset, feature_table, epsilon, params, n_boot): df.loc[i, "AUPRC_95CI_low"] = res_aupr.confidence_interval[0] df.loc[i, "AUPRC_95CI_high"] = res_aupr.confidence_interval[1] + thresh = threshold_70_pct_recall(Y_true, Y_pred) + + res_prec = scipy.stats.bootstrap( + (Y_true, Y_pred), + lambda Y_true, Y_pred: statistic_precision_at_threshold( + Y_true, Y_pred, thresh + ), + n_resamples=n_boot, + paired=True, + confidence_level=0.95, + method="BCa", + ) + df.loc[i, "precision_70pct_recall"] = np.mean(res_prec.bootstrap_distribution) + 
df.loc[i, "precision_95CI_low"] = res_prec.confidence_interval[0] + df.loc[i, "precision_95CI_high"] = res_prec.confidence_interval[1] + # sort table by AUPRC df = df.sort_values(by="AUPRC", ascending=False) diff --git a/workflow/scripts/model_training/compare_all_models.py b/workflow/scripts/model_training/compare_all_models.py index 9b38528..e360d56 100644 --- a/workflow/scripts/model_training/compare_all_models.py +++ b/workflow/scripts/model_training/compare_all_models.py @@ -12,10 +12,17 @@ def performance_summary( - model_id, dataset, model_name, out_dir, crispr_data="", n_boot=1000 + model_id, + dataset, + pred_file, + missing_file, + model_name, + out_dir, + crispr_data="", + n_boot=1000, ): # read in predicitons - if model_id == "distance": + if model_id.startswith("distance"): crispr_data = pd.read_csv(crispr_data, sep="\t") crispr_data["distance"] = np.abs( (crispr_data["chromStart"] + crispr_data["chromEnd"]) / 2 @@ -26,15 +33,6 @@ def performance_summary( Y_pred_all = crispr_data["distance"] * -1 pct_missing = 0 else: # normal models - pred_file = os.path.join( - out_dir, dataset, model_id, "model", "training_predictions.tsv" - ) - missing_file = os.path.join( - out_dir, - dataset, - model_id, - "missing.EPCrisprBenchmark_ensemble_data_GRCh38.K562_features_NAfilled.tsv.gz", - ) pred_df = pd.read_csv(pred_file, sep="\t") missing_df = pd.read_csv(missing_file, sep="\t") @@ -101,17 +99,35 @@ def performance_summary( @click.command() +@click.option("--all_pred", required=True) +@click.option("--all_missing", required=True) @click.option("--model_config_file", required=True) @click.option("--output_file", required=True) @click.option("--crispr_data", required=True) +@click.option("--crispr_names", required=True) @click.option("--out_dir", required=True) -def main(model_config_file, output_file, crispr_data, out_dir): +def main( + all_pred, + all_missing, + model_config_file, + output_file, + crispr_data, + crispr_names, + out_dir, +): + all_pred_files = 
all_pred.split(" ") + all_missing_files = all_missing.split(" ") + crispr_dict = dict(zip(crispr_names.split(" "), crispr_data.split(" "))) + model_name = "E2G" model_config = ( pd.read_table(model_config_file, na_values="") .fillna("None") .set_index("model", drop=False) ) + model_config["prediction_file"] = all_pred_files + model_config["missing_file"] = all_missing_files + # initiate final df df = pd.DataFrame( columns=[ @@ -130,13 +146,22 @@ def main(model_config_file, output_file, crispr_data, out_dir): # iterate through rows of model config and add results to final df for row in model_config.itertuples(index=False): - res_row = performance_summary(row.model, row.dataset, model_name, out_dir) + res_row = performance_summary( + row.model, + row.dataset, + row.prediction_file, + row.missing_file, + model_name, + out_dir, + ) + df = pd.concat([df, res_row]) + + for key, file_path in crispr_dict.items(): + # add row(s) for distance + res_row = performance_summary( + f"distance_{key}", "baseline", "", "", "", out_dir, crispr_data=file_path + ) df = pd.concat([df, res_row]) - # add row for distance - res_row = performance_summary( - "distance", "baseline", "", out_dir, crispr_data=crispr_data - ) - df = pd.concat([df, res_row]) # sort table by AUPRC df = df.sort_values(by="AUPRC", ascending=False) diff --git a/workflow/scripts/model_training/forward_sequential_feature_selection.py b/workflow/scripts/model_training/forward_sequential_feature_selection.py index 8a9f76a..7a320c2 100644 --- a/workflow/scripts/model_training/forward_sequential_feature_selection.py +++ b/workflow/scripts/model_training/forward_sequential_feature_selection.py @@ -7,264 +7,264 @@ from sklearn.preprocessing import PolynomialFeatures from sklearn.linear_model import LogisticRegression from training_functions import ( - statistic_aupr, - statistic_precision, - train_and_predict_once, - bootstrap_pvalue, - statistic_delta_aupr, - threshold_70_pct_recall, - statistic_precision_at_threshold, - 
statistic_delta_precision_at_threshold, + statistic_aupr, + statistic_precision, + train_and_predict_once, + bootstrap_pvalue, + statistic_delta_aupr, + threshold_70_pct_recall, + statistic_precision_at_threshold, + statistic_delta_precision_at_threshold, ) def SFFS(df_dataset, feature_table, model_name, epsilon, params, polynomial=False): - # specify feature list - feature_list_core = feature_table["feature"] - X = df_dataset.loc[:, feature_list_core] - model_name_core = model_name - feature_list = feature_list_core - - # transform features first - X = np.log(np.abs(X) + epsilon) - Y_true = df_dataset["Regulated"].values.astype(np.int64) - - # optionally add polynomial features - if polynomial: - poly = PolynomialFeatures(degree=2) - X = poly.fit_transform(X) - polynomial_names = poly.get_feature_names_out(feature_list_core) - X = pd.DataFrame(X, columns=polynomial_names) - feature_list = pd.Series(polynomial_names) - - n_features = len(feature_list) - - best_features = [] - all_auprs = [] - all_precisions = [] - - for i in range(n_features): # iterate (# of feature) times - best_k = -1 - best_aupr = 0 - best_precision = 0 - for k in range( - len(feature_list) - ): # iterate through each feature, choose best one - new_feature = feature_list[k] - print(new_feature) - features = best_features + [new_feature] - print(features) - model_name = model_name_core + "_" + str(i + 1) + "_" + str(k + 1) - print(model_name) - - df_dataset = train_and_predict_once( - df_dataset, X, Y_true, features, model_name, params - ) - - # evaluate model once trained - Y_pred = df_dataset[model_name + ".Score"] - aupr = statistic_aupr(Y_true, Y_pred) - thresh = threshold_70_pct_recall(Y_true, Y_pred) - precision_at_70_pct_recall = statistic_precision_at_threshold( - Y_true, Y_pred, thresh - ) - - if aupr >= best_aupr: - best_aupr = aupr - best_k = k - best_precision = precision_at_70_pct_recall - - # update best feature - best_features = best_features + [feature_list[best_k]] - all_auprs = 
all_auprs + [best_aupr] - all_precisions = all_precisions + [best_precision] - feature_list = feature_list.loc[ - feature_list != feature_list[best_k] - ].reset_index(drop=True) - print(best_features) - print(all_auprs) - print(all_precisions) - - return best_features + # specify feature list + feature_list_core = feature_table["feature"] + X = df_dataset.loc[:, feature_list_core] + model_name_core = model_name + feature_list = feature_list_core + + # transform features first + X = np.log(np.abs(X) + epsilon) + Y_true = df_dataset["Regulated"].values.astype(np.int64) + + # optionally add polynomial features + if polynomial: + poly = PolynomialFeatures(degree=2) + X = poly.fit_transform(X) + polynomial_names = poly.get_feature_names_out(feature_list_core) + X = pd.DataFrame(X, columns=polynomial_names) + feature_list = pd.Series(polynomial_names) + + n_features = len(feature_list) + + best_features = [] + all_auprs = [] + all_precisions = [] + + for i in range(n_features): # iterate (# of feature) times + best_k = -1 + best_aupr = 0 + best_precision = 0 + for k in range( + len(feature_list) + ): # iterate through each feature, choose best one + new_feature = feature_list[k] + print(new_feature) + features = best_features + [new_feature] + print(features) + model_name = model_name_core + "_" + str(i + 1) + "_" + str(k + 1) + print(model_name) + + df_dataset = train_and_predict_once( + df_dataset, X, Y_true, features, model_name, params + ) + + # evaluate model once trained + Y_pred = df_dataset[model_name + ".Score"] + aupr = statistic_aupr(Y_true, Y_pred) + thresh = threshold_70_pct_recall(Y_true, Y_pred) + precision_at_70_pct_recall = statistic_precision_at_threshold( + Y_true, Y_pred, thresh + ) + + if aupr >= best_aupr: + best_aupr = aupr + best_k = k + best_precision = precision_at_70_pct_recall + + # update best feature + best_features = best_features + [feature_list[best_k]] + all_auprs = all_auprs + [best_aupr] + all_precisions = all_precisions + 
[best_precision] + feature_list = feature_list.loc[ + feature_list != feature_list[best_k] + ].reset_index(drop=True) + print(best_features) + print(all_auprs) + print(all_precisions) + + return best_features def SFFS_significance( - df_dataset, - feature_table, - model_name, - epsilon, - params, - polynomial=False, - n_boot=1000, + df_dataset, + feature_table, + model_name, + epsilon, + params, + polynomial=False, + n_boot=1000, ): - feature_list = SFFS( - df_dataset, feature_table, model_name, epsilon, params, polynomial - ) # get order of features - feature_list_core = feature_table["feature"] - model_name_core = model_name - - # calculate polynomial features - if polynomial: - X_core = df_dataset.loc[:, feature_list_core] - poly = PolynomialFeatures(degree=2) - X_2 = poly.fit_transform(X_core) - polynomial_names = poly.get_feature_names_out(X_core.columns) - X_2 = pd.DataFrame(X_2, columns=polynomial_names) - X = X_2 - else: - X = df_dataset.loc[:, feature_list_core] - - X = np.log(np.abs(X) + epsilon) - Y_true = df_dataset["Regulated"].values.astype(np.int64) - - df = pd.DataFrame( - columns=[ - "feature_added", - "aupr", - "delta_aupr", - "delta_aupr_low", - "delta_aupr_high", - "pval_aupr", - "precision", - "delta_precision", - "delta_precision_low", - "delta_precision_high", - "pval_precision", - ] - ) - # evaluate baseline model - df_dataset["all_true"] = 1 - Y_new = df_dataset["all_true"] - - data = (Y_true, Y_new) - res_aupr = scipy.stats.bootstrap( - data, - statistic_aupr, - n_resamples=n_boot, - paired=True, - confidence_level=0.95, - method="BCa", - ) - thresh = threshold_70_pct_recall( - Y_true, Y_new - ) # will return None if max recall < 70% - res_precision = scipy.stats.bootstrap( - data, - lambda Y_true, Y_pred: statistic_precision_at_threshold(Y_true, Y_pred, thresh), - n_resamples=n_boot, - paired=True, - confidence_level=0.95, - method="BCa", - ) - - aupr = np.mean(res_aupr.bootstrap_distribution) - precision = 
np.mean(res_precision.bootstrap_distribution) - - df_temp = pd.DataFrame( - { - "feature_added": "None", - "aupr": aupr, - "delta_aupr": 0, - "delta_aupr_low": 0, - "delta_aupr_high": 0, - "pval_aupr": 1, - "precision": precision, - "delta_precision": 0, - "delta_precision_low": 0, - "delta_precision_high": 0, - "pval_precision": 1, - }, - index=[0], - ) - df = pd.concat([df, df_temp]) - Y_last = Y_new - - features = [] - # loop through to train next model and compare - for i in range(0, len(feature_list)): - feature_added = feature_list[i] - features = features + [feature_added] - model_name = model_name_core + "_" + str(i + 1) - print(model_name) - df_dataset = train_and_predict_once( - df_dataset, X, Y_true, features, model_name, params - ) - Y_new = df_dataset[model_name + ".Score"] - - data_compare = (Y_true, Y_last, Y_new) - thresh_last = threshold_70_pct_recall(Y_true, Y_last) - thresh_new = threshold_70_pct_recall(Y_true, Y_new) - res_delta_aupr = scipy.stats.bootstrap( - data_compare, - statistic_delta_aupr, - n_resamples=n_boot, - paired=True, - confidence_level=0.95, - method="BCa", - ) - res_delta_precision = scipy.stats.bootstrap( - data_compare, - lambda Y_true, Y_last, Y_new,: statistic_delta_precision_at_threshold( - Y_true, Y_last, Y_new, thresh_last, thresh_new - ), - n_resamples=n_boot, - paired=True, - confidence_level=0.95, - method="BCa", - ) - delta_aupr = np.mean( - res_delta_aupr.bootstrap_distribution - ) # aupr_new - aupr_last - delta_precision = np.mean(res_delta_precision.bootstrap_distribution) - pval_aupr = bootstrap_pvalue(delta_aupr, res_delta_aupr) - pval_precision = bootstrap_pvalue(delta_precision, res_delta_precision) - - data = (Y_true, Y_new) - res_aupr = scipy.stats.bootstrap( - data, - statistic_aupr, - n_resamples=n_boot, - paired=True, - confidence_level=0.95, - method="BCa", - ) - thresh = threshold_70_pct_recall( - Y_true, Y_new - ) # will return None if max recall < 70% - res_precision = scipy.stats.bootstrap( - data, - 
lambda Y_true, Y_pred: statistic_precision_at_threshold( - Y_true, Y_pred, thresh - ), - n_resamples=n_boot, - paired=True, - confidence_level=0.95, - method="BCa", - ) - - aupr = np.mean(res_aupr.bootstrap_distribution) - precision = np.mean(res_precision.bootstrap_distribution) - - df_temp = pd.DataFrame( - { - "feature_added": feature_added, - "aupr": aupr, - "delta_aupr": delta_aupr, - "delta_aupr_low": res_delta_aupr.confidence_interval[0], - "delta_aupr_high": res_delta_aupr.confidence_interval[1], - "pval_aupr": pval_aupr, - "precision": precision, - "delta_precision": delta_precision, - "delta_precision_low": res_delta_precision.confidence_interval[0], - "delta_precision_high": res_delta_precision.confidence_interval[1], - "pval_precision": pval_precision, - }, - index=[0], - ) - df = pd.concat([df, df_temp]) - Y_last = Y_new - - return df + feature_list = SFFS( + df_dataset, feature_table, model_name, epsilon, params, polynomial + ) # get order of features + feature_list_core = feature_table["feature"] + model_name_core = model_name + + # calculate polynomial features + if polynomial: + X_core = df_dataset.loc[:, feature_list_core] + poly = PolynomialFeatures(degree=2) + X_2 = poly.fit_transform(X_core) + polynomial_names = poly.get_feature_names_out(X_core.columns) + X_2 = pd.DataFrame(X_2, columns=polynomial_names) + X = X_2 + else: + X = df_dataset.loc[:, feature_list_core] + + X = np.log(np.abs(X) + epsilon) + Y_true = df_dataset["Regulated"].values.astype(np.int64) + + df = pd.DataFrame( + columns=[ + "feature_added", + "aupr", + "delta_aupr", + "delta_aupr_low", + "delta_aupr_high", + "pval_aupr", + "precision", + "delta_precision", + "delta_precision_low", + "delta_precision_high", + "pval_precision", + ] + ) + # evaluate baseline model + df_dataset["all_true"] = 1 + Y_new = df_dataset["all_true"] + + data = (Y_true, Y_new) + res_aupr = scipy.stats.bootstrap( + data, + statistic_aupr, + n_resamples=n_boot, + paired=True, + confidence_level=0.95, + 
method="BCa", + ) + thresh = threshold_70_pct_recall( + Y_true, Y_new + ) # will return None if max recall < 70% + res_precision = scipy.stats.bootstrap( + data, + lambda Y_true, Y_pred: statistic_precision_at_threshold(Y_true, Y_pred, thresh), + n_resamples=n_boot, + paired=True, + confidence_level=0.95, + method="BCa", + ) + + aupr = np.mean(res_aupr.bootstrap_distribution) + precision = np.mean(res_precision.bootstrap_distribution) + + df_temp = pd.DataFrame( + { + "feature_added": "None", + "aupr": aupr, + "delta_aupr": 0, + "delta_aupr_low": 0, + "delta_aupr_high": 0, + "pval_aupr": 1, + "precision": precision, + "delta_precision": 0, + "delta_precision_low": 0, + "delta_precision_high": 0, + "pval_precision": 1, + }, + index=[0], + ) + df = pd.concat([df, df_temp]) + Y_last = Y_new + + features = [] + # loop through to train next model and compare + for i in range(0, len(feature_list)): + feature_added = feature_list[i] + features = features + [feature_added] + model_name = model_name_core + "_" + str(i + 1) + print(model_name) + df_dataset = train_and_predict_once( + df_dataset, X, Y_true, features, model_name, params + ) + Y_new = df_dataset[model_name + ".Score"] + + data_compare = (Y_true, Y_last, Y_new) + thresh_last = threshold_70_pct_recall(Y_true, Y_last) + thresh_new = threshold_70_pct_recall(Y_true, Y_new) + res_delta_aupr = scipy.stats.bootstrap( + data_compare, + statistic_delta_aupr, + n_resamples=n_boot, + paired=True, + confidence_level=0.95, + method="BCa", + ) + res_delta_precision = scipy.stats.bootstrap( + data_compare, + lambda Y_true, Y_last, Y_new,: statistic_delta_precision_at_threshold( + Y_true, Y_last, Y_new, thresh_last, thresh_new + ), + n_resamples=n_boot, + paired=True, + confidence_level=0.95, + method="BCa", + ) + delta_aupr = np.mean( + res_delta_aupr.bootstrap_distribution + ) # aupr_new - aupr_last + delta_precision = np.mean(res_delta_precision.bootstrap_distribution) + pval_aupr = bootstrap_pvalue(delta_aupr, 
res_delta_aupr) + pval_precision = bootstrap_pvalue(delta_precision, res_delta_precision) + + data = (Y_true, Y_new) + res_aupr = scipy.stats.bootstrap( + data, + statistic_aupr, + n_resamples=n_boot, + paired=True, + confidence_level=0.95, + method="BCa", + ) + thresh = threshold_70_pct_recall( + Y_true, Y_new + ) # will return None if max recall < 70% + res_precision = scipy.stats.bootstrap( + data, + lambda Y_true, Y_pred: statistic_precision_at_threshold( + Y_true, Y_pred, thresh + ), + n_resamples=n_boot, + paired=True, + confidence_level=0.95, + method="BCa", + ) + + aupr = np.mean(res_aupr.bootstrap_distribution) + precision = np.mean(res_precision.bootstrap_distribution) + + df_temp = pd.DataFrame( + { + "feature_added": feature_added, + "aupr": aupr, + "delta_aupr": delta_aupr, + "delta_aupr_low": res_delta_aupr.confidence_interval[0], + "delta_aupr_high": res_delta_aupr.confidence_interval[1], + "pval_aupr": pval_aupr, + "precision": precision, + "delta_precision": delta_precision, + "delta_precision_low": res_delta_precision.confidence_interval[0], + "delta_precision_high": res_delta_precision.confidence_interval[1], + "pval_precision": pval_precision, + }, + index=[0], + ) + df = pd.concat([df, df_temp]) + Y_last = Y_new + + return df @click.command() @@ -275,21 +275,21 @@ def SFFS_significance( @click.option("--epsilon", type=float, default=0.01) @click.option("--params_file", required=True) def main( - crispr_features_file, feature_table_file, out_dir, polynomial, epsilon, params_file + crispr_features_file, feature_table_file, out_dir, polynomial, epsilon, params_file ): - model_name = "E2G" - df_dataset = pd.read_csv(crispr_features_file, sep="\t") - feature_table = pd.read_csv(feature_table_file, sep="\t") + model_name = "E2G" + df_dataset = pd.read_csv(crispr_features_file, sep="\t") + feature_table = pd.read_csv(feature_table_file, sep="\t") - with open(params_file, "rb") as handle: - params = pickle.load(handle) + with open(params_file, "rb") as 
handle: + params = pickle.load(handle) - res = SFFS_significance( - df_dataset, feature_table, model_name, epsilon, params, polynomial, n_boot=1000 - ) + res = SFFS_significance( + df_dataset, feature_table, model_name, epsilon, params, polynomial, n_boot=1000 + ) - res.to_csv(out_dir + "/forward_feature_selection.tsv", sep="\t", index=False) + res.to_csv(out_dir + "/forward_feature_selection.tsv", sep="\t", index=False) if __name__ == "__main__": - main() + main() diff --git a/workflow/scripts/model_training/permutation_feature_importance.py b/workflow/scripts/model_training/permutation_feature_importance.py index 6bea740..7c5b4f8 100644 --- a/workflow/scripts/model_training/permutation_feature_importance.py +++ b/workflow/scripts/model_training/permutation_feature_importance.py @@ -7,89 +7,89 @@ from sklearn.preprocessing import PolynomialFeatures from sklearn.linear_model import LogisticRegression from training_functions import ( - statistic_aupr, - statistic_precision, - train_and_predict_once, - bootstrap_pvalue, - statistic_delta_aupr, - statistic_delta_precision_at_threshold, - threshold_70_pct_recall, + statistic_aupr, + statistic_precision, + train_and_predict_once, + bootstrap_pvalue, + statistic_delta_aupr, + statistic_delta_precision_at_threshold, + threshold_70_pct_recall, ) def permutation_feature_importance( - df_dataset, - feature_table, - model_name, - epsilon, - params, - n_repeats=20, - polynomial=False, + df_dataset, + feature_table, + model_name, + epsilon, + params, + n_repeats=20, + polynomial=False, ): - # specify feature list - feature_list_core = feature_table["feature"] - model_name_core = model_name - - X = df_dataset.loc[:, feature_list_core] - feature_list = feature_list_core - - # transform features first - X = np.log(np.abs(X) + epsilon) - Y_true = df_dataset["Regulated"].values.astype(np.int64) - - # optionally add polynomial features - if polynomial: - poly = PolynomialFeatures(degree=2) - X = poly.fit_transform(X) - 
polynomial_names = poly.get_feature_names_out(feature_list_core) - X = pd.DataFrame(X, columns=polynomial_names) - feature_list = pd.Series(polynomial_names) - - # train full model - model_name = model_name_core + "_full" - df_dataset = train_and_predict_once( - df_dataset, X, Y_true, feature_list, model_name, params - ) - Y_full = df_dataset[model_name + ".Score"] - - df = pd.DataFrame(columns=["feature_permuted", "delta_aupr", "delta_precision"]) - - for i in range(len(feature_list)): # iterate through features - print(feature_list[i]) - delta_aupr_feature = [] - delta_precision_feature = [] - original_values = X[feature_list[i]] - - for j in range(n_repeats): - X[feature_list[i]] = np.random.permutation(X[feature_list[i]]) - model_name = model_name_core + "_shuff_" + feature_list[i] - df_dataset = train_and_predict_once( - df_dataset, X, Y_true, feature_list, model_name, params - ) - Y_shuffle = df_dataset[model_name + ".Score"] - - delta_aupr = statistic_delta_aupr(Y_true, Y_full, Y_shuffle) - thresh_full = threshold_70_pct_recall(Y_true, Y_full) - thresh_shuffle = threshold_70_pct_recall(Y_true, Y_shuffle) - delta_precision = statistic_delta_precision_at_threshold( - Y_true, Y_full, Y_shuffle, thresh_full, thresh_shuffle - ) - - delta_aupr_feature = delta_aupr_feature + [delta_aupr] - delta_precision_feature = delta_precision_feature + [delta_precision] - - df_temp = pd.DataFrame( - { - "delta_aupr": delta_aupr_feature, - "delta_precision": delta_precision_feature, - } - ) - df_temp["feature_permuted"] = feature_list[i] - df = pd.concat([df, df_temp]) - - X[feature_list[i]] = original_values # reset feature values - - return df + # specify feature list + feature_list_core = feature_table["feature"] + model_name_core = model_name + + X = df_dataset.loc[:, feature_list_core] + feature_list = feature_list_core + + # transform features first + X = np.log(np.abs(X) + epsilon) + Y_true = df_dataset["Regulated"].values.astype(np.int64) + + # optionally add polynomial 
features + if polynomial: + poly = PolynomialFeatures(degree=2) + X = poly.fit_transform(X) + polynomial_names = poly.get_feature_names_out(feature_list_core) + X = pd.DataFrame(X, columns=polynomial_names) + feature_list = pd.Series(polynomial_names) + + # train full model + model_name = model_name_core + "_full" + df_dataset = train_and_predict_once( + df_dataset, X, Y_true, feature_list, model_name, params + ) + Y_full = df_dataset[model_name + ".Score"] + + df = pd.DataFrame(columns=["feature_permuted", "delta_aupr", "delta_precision"]) + + for i in range(len(feature_list)): # iterate through features + print(feature_list[i]) + delta_aupr_feature = [] + delta_precision_feature = [] + original_values = X[feature_list[i]] + + for j in range(n_repeats): + X[feature_list[i]] = np.random.permutation(X[feature_list[i]]) + model_name = model_name_core + "_shuff_" + feature_list[i] + df_dataset = train_and_predict_once( + df_dataset, X, Y_true, feature_list, model_name, params + ) + Y_shuffle = df_dataset[model_name + ".Score"] + + delta_aupr = statistic_delta_aupr(Y_true, Y_full, Y_shuffle) + thresh_full = threshold_70_pct_recall(Y_true, Y_full) + thresh_shuffle = threshold_70_pct_recall(Y_true, Y_shuffle) + delta_precision = statistic_delta_precision_at_threshold( + Y_true, Y_full, Y_shuffle, thresh_full, thresh_shuffle + ) + + delta_aupr_feature = delta_aupr_feature + [delta_aupr] + delta_precision_feature = delta_precision_feature + [delta_precision] + + df_temp = pd.DataFrame( + { + "delta_aupr": delta_aupr_feature, + "delta_precision": delta_precision_feature, + } + ) + df_temp["feature_permuted"] = feature_list[i] + df = pd.concat([df, df_temp]) + + X[feature_list[i]] = original_values # reset feature values + + return df @click.command() @@ -101,28 +101,28 @@ def permutation_feature_importance( @click.option("--n_repeats", type=float, default=20) @click.option("--params_file", required=True) def main( - crispr_features_file, - feature_table_file, - out_dir, - 
polynomial, - epsilon, - n_repeats, - params_file, + crispr_features_file, + feature_table_file, + out_dir, + polynomial, + epsilon, + n_repeats, + params_file, ): - model_name = "E2G" - n_repeats = int(n_repeats) - df_dataset = pd.read_csv(crispr_features_file, sep="\t") - feature_table = pd.read_csv(feature_table_file, sep="\t") + model_name = "E2G" + n_repeats = int(n_repeats) + df_dataset = pd.read_csv(crispr_features_file, sep="\t") + feature_table = pd.read_csv(feature_table_file, sep="\t") - with open(params_file, "rb") as handle: - params = pickle.load(handle) + with open(params_file, "rb") as handle: + params = pickle.load(handle) - res = permutation_feature_importance( - df_dataset, feature_table, model_name, epsilon, params, n_repeats, polynomial - ) + res = permutation_feature_importance( + df_dataset, feature_table, model_name, epsilon, params, n_repeats, polynomial + ) - res.to_csv(out_dir + "/permutation_feature_importance.tsv", sep="\t", index=False) + res.to_csv(out_dir + "/permutation_feature_importance.tsv", sep="\t", index=False) if __name__ == "__main__": - main() + main() diff --git a/workflow/scripts/model_training/train_model.py b/workflow/scripts/model_training/train_model.py index 653104f..9402059 100644 --- a/workflow/scripts/model_training/train_model.py +++ b/workflow/scripts/model_training/train_model.py @@ -8,139 +8,143 @@ from sklearn.linear_model import LogisticRegression from training_functions import statistic_aupr + def train_and_predict( - df_dataset, feature_table, model_name, out_dir, epsilon, params, polynomial=False + df_dataset, feature_table, model_name, out_dir, epsilon, params, polynomial=False ): - # specify feature list - feature_list_core = feature_table["feature"] - X = df_dataset.loc[:, feature_list_core] - - # transform features first - X = np.log(np.abs(X) + epsilon) - - # optionally add polynomial features - if polynomial: - poly = PolynomialFeatures(degree=2) - X = poly.fit_transform(X) - polynomial_names = 
poly.get_feature_names_out(feature_list_core) - X = pd.DataFrame(X, columns=polynomial_names) - - Y = df_dataset["Regulated"].values.astype(np.int64) - - # initialize df for feature weights & metrics - df_coef = pd.DataFrame(columns=["feature", "coefficient", "test_chr"]) - df_metrics = pd.DataFrame( - columns=[ - "test_chr", - "log_loss_test_full", - "log_loss_train", - "log_loss_test", - "AUROC_test_full", - "AUROC_train", - "AUROC_test", - "AUPRC_test_full", - "AUPRC_train", - "AUPRC_test", - "n_test_pos", - "n_test_neg", - "n_train_neg", - "n_train_pos", - ] - ) - - # train aggregate model across all chromosomes, calc weights and performance metrics, save full model - model_full = LogisticRegression(**params).fit(X, Y) - probs_full = model_full.predict_proba(X) - df_dataset[model_name + ".Score_full"] = probs_full[:, 1] - coefficients = model_full.coef_[0] - df_temp = pd.DataFrame( - {"feature": X.columns, "coefficient": coefficients, "test_chr": "none"} - ) - df_coef = pd.concat([df_coef, df_temp]) - with open(out_dir + f"/model_full.pkl", "wb") as f: - pickle.dump(model_full, f) - - # logistic regression predictions on chromosome-wise cross validation - idx = np.arange(len(Y)) - chr_list = np.unique(df_dataset["chr"]) - if len(chr_list) > 1: - for chr in chr_list: - idx_test = df_dataset[df_dataset["chr"] == chr].index.values - - if len(idx_test) > 0: - idx_train = np.delete(idx, idx_test) - X_test = X.loc[idx_test, :] - X_train = X.loc[idx_train, :] - Y_test = Y[idx_test] - Y_train = Y[idx_train] - - model = LogisticRegression(**params).fit(X_train, Y_train) - - with open(out_dir + f"/model_test_{chr}.pkl", "wb") as f: - pickle.dump(model, f) - - probs = model.predict_proba(X_test) - df_dataset.loc[idx_test, model_name + ".Score"] = probs[:, 1] - - # performance metrics - n_train_pos = np.sum(Y_train) - n_train_neg = len(Y_train) - n_train_pos - n_test_pos = np.sum(Y_test) - n_test_neg = len(Y_test) - n_test_pos - - ll_train = log_loss(Y_train, 
model.predict_proba(X_train)[:, 1]) - ll_test_full = ( - log_loss(Y_test, probs_full[idx_test, 1]) - if n_test_pos > 0 - else np.NaN - ) - ll_test = log_loss(Y_test, probs[:, 1]) if n_test_pos > 0 else np.NaN - auroc_train = roc_auc_score(Y_train, model.predict_proba(X_train)[:, 1]) - auroc_test_full = ( - roc_auc_score(Y_test, probs_full[idx_test, 1]) - if n_test_pos > 0 - else np.NaN - ) - auroc_test = ( - roc_auc_score(Y_test, probs[:, 1]) if n_test_pos > 0 else np.NaN - ) - auprc_train = statistic_aupr( - Y_train, model.predict_proba(X_train)[:, 1] - ) - auprc_test_full = ( - statistic_aupr(Y_test, probs_full[idx_test, 1]) - if n_test_pos > 0 - else np.NaN - ) - auprc_test = ( - statistic_aupr(Y_test, probs[:, 1]) if n_test_pos > 0 else np.NaN - ) - - df_temp = pd.DataFrame( - { - "test_chr": [chr], - "log_loss_test_full": [ll_test_full], - "log_loss_train": [ll_train], - "log_loss_test": [ll_test], - "AUROC_test_full": [auroc_test_full], - "AUROC_train": [auroc_train], - "AUROC_test": [auroc_test], - "AUPRC_test_full": [auprc_test_full], - "AUPRC_train": [auprc_train], - "AUPRC_test": [auprc_test], - "n_test_pos": [n_test_pos], - "n_test_neg": [n_test_neg], - "n_train_pos": [n_train_pos], - "n_train_neg": [n_train_neg], - } - ) - df_metrics = pd.concat([df_metrics, df_temp]) - - # save dfs - df_dataset.to_csv(out_dir + "/training_predictions.tsv", sep="\t", index=False) - df_coef.to_csv(out_dir + "/model_coefficients.tsv", sep="\t", index=False) - df_metrics.to_csv(out_dir + "/performance_metrics.tsv", sep="\t", index=False) - + # specify feature list + feature_list_core = feature_table["feature"] + X = df_dataset.loc[:, feature_list_core] + + # transform features first + X = np.log(np.abs(X) + epsilon) + + # optionally add polynomial features + if polynomial: + poly = PolynomialFeatures(degree=2) + X = poly.fit_transform(X) + polynomial_names = poly.get_feature_names_out(feature_list_core) + X = pd.DataFrame(X, columns=polynomial_names) + + Y = 
df_dataset["Regulated"].values.astype(np.int64) + + # initialize df for feature weights & metrics + df_coef = pd.DataFrame(columns=["feature", "coefficient", "test_chr"]) + df_metrics = pd.DataFrame( + columns=[ + "test_chr", + "log_loss_test_full", + "log_loss_train", + "log_loss_test", + "AUROC_test_full", + "AUROC_train", + "AUROC_test", + "AUPRC_test_full", + "AUPRC_train", + "AUPRC_test", + "n_test_pos", + "n_test_neg", + "n_train_neg", + "n_train_pos", + ] + ) + + # train aggregate model across all chromosomes, calc weights and performance metrics, save full model + model_full = LogisticRegression(**params).fit(X, Y) + probs_full = model_full.predict_proba(X) + df_dataset[model_name + ".Score_full"] = probs_full[:, 1] + coefficients = model_full.coef_[0] + df_temp = pd.DataFrame( + {"feature": X.columns, "coefficient": coefficients, "test_chr": "none"} + ) + df_coef = pd.concat([df_coef, df_temp]) + with open(out_dir + f"/model_full.pkl", "wb") as f: + pickle.dump(model_full, f) + + # logistic regression predictions on chromosome-wise cross validation + idx = np.arange(len(Y)) + chr_list = np.unique(df_dataset["chr"]) + if len(chr_list) > 1: + for chr in chr_list: + idx_test = df_dataset[df_dataset["chr"] == chr].index.values + + if len(idx_test) > 0: + idx_train = np.delete(idx, idx_test) + X_test = X.loc[idx_test, :] + X_train = X.loc[idx_train, :] + Y_test = Y[idx_test] + Y_train = Y[idx_train] + + model = LogisticRegression(**params).fit(X_train, Y_train) + + with open(out_dir + f"/model_test_{chr}.pkl", "wb") as f: + pickle.dump(model, f) + + probs = model.predict_proba(X_test) + df_dataset.loc[idx_test, model_name + ".Score"] = probs[:, 1] + + # performance metrics + n_train_pos = np.sum(Y_train) + n_train_neg = len(Y_train) - n_train_pos + n_test_pos = np.sum(Y_test) + n_test_neg = len(Y_test) - n_test_pos + + multiple_labels = n_test_pos > 0 and n_test_neg > 0 + + ll_train = log_loss(Y_train, model.predict_proba(X_train)[:, 1]) + ll_test_full = ( + 
log_loss(Y_test, probs_full[idx_test, 1]) + if multiple_labels + else np.NaN + ) + ll_test = log_loss(Y_test, probs[:, 1]) if multiple_labels else np.NaN + auroc_train = roc_auc_score(Y_train, model.predict_proba(X_train)[:, 1]) + auroc_test_full = ( + roc_auc_score(Y_test, probs_full[idx_test, 1]) + if multiple_labels + else np.NaN + ) + auroc_test = ( + roc_auc_score(Y_test, probs[:, 1]) + if multiple_labels + else np.NaN + ) + auprc_train = statistic_aupr( + Y_train, model.predict_proba(X_train)[:, 1] + ) + auprc_test_full = ( + statistic_aupr(Y_test, probs_full[idx_test, 1]) + if multiple_labels + else np.NaN + ) + auprc_test = ( + statistic_aupr(Y_test, probs[:, 1]) if multiple_labels else np.NaN + ) + + df_temp = pd.DataFrame( + { + "test_chr": [chr], + "log_loss_test_full": [ll_test_full], + "log_loss_train": [ll_train], + "log_loss_test": [ll_test], + "AUROC_test_full": [auroc_test_full], + "AUROC_train": [auroc_train], + "AUROC_test": [auroc_test], + "AUPRC_test_full": [auprc_test_full], + "AUPRC_train": [auprc_train], + "AUPRC_test": [auprc_test], + "n_test_pos": [n_test_pos], + "n_test_neg": [n_test_neg], + "n_train_pos": [n_train_pos], + "n_train_neg": [n_train_neg], + } + ) + df_metrics = pd.concat([df_metrics, df_temp]) + + # save dfs + df_dataset.to_csv(out_dir + "/training_predictions.tsv", sep="\t", index=False) + df_coef.to_csv(out_dir + "/model_coefficients.tsv", sep="\t", index=False) + df_metrics.to_csv(out_dir + "/performance_metrics.tsv", sep="\t", index=False) @@ -151,18 +155,18 @@ def train_and_predict( @click.option("--epsilon", type=float, default=0.01) @click.option("--params_file", required=True) def main( - crispr_features_file, feature_table_file, out_dir, polynomial, params_file, epsilon + crispr_features_file, feature_table_file, out_dir, polynomial, params_file, epsilon ): - model_name = "E2G" - df_dataset = pd.read_csv(crispr_features_file, sep="\t") - feature_table = pd.read_csv(feature_table_file, sep="\t") 
- with open(params_file, "rb") as handle: - params = pickle.load(handle) + model_name = "E2G" + df_dataset = pd.read_csv(crispr_features_file, sep="\t") + feature_table = pd.read_csv(feature_table_file, sep="\t") + with open(params_file, "rb") as handle: + params = pickle.load(handle) - train_and_predict( - df_dataset, feature_table, model_name, out_dir, epsilon, params, polynomial - ) + train_and_predict( + df_dataset, feature_table, model_name, out_dir, epsilon, params, polynomial + ) if __name__ == "__main__": - main() + main() diff --git a/workflow/scripts/model_training/training_functions.py b/workflow/scripts/model_training/training_functions.py index 34b6ad8..d0d5a84 100644 --- a/workflow/scripts/model_training/training_functions.py +++ b/workflow/scripts/model_training/training_functions.py @@ -4,7 +4,6 @@ from sklearn.preprocessing import PolynomialFeatures from sklearn.linear_model import LogisticRegression - ## statistic functions for delta auPR/precision to be used for scipy.stats.bootstrap