EngreitzLab · mayasheth · Jun 6, 2025 · Jun 7, 2025 · Jun 8, 2025 · Jun 8, 2025
diff --git a/.circleci/config.yml b/.circleci/config.yml
@@ -21,11 +21,11 @@ jobs:
       - run:
           name: Install basic OS pkgs
           command: apt-get update && apt-get -y install curl vim
-      # - run:
-      #     name: Run Linter (python black)
-      #     command: |
-      #       mamba install black
-      #       black . --check
+       - run:
+           name: Run Linter (python black)
+           command: |
+             mamba install black
+             black workflow/ --check
       - run:
           name: Install env
           no_output_timeout: 30m

diff --git a/README.md b/README.md
@@ -49,16 +49,18 @@ The way we choose the model depends on the biosamples input. The code for model
 
 ## Train model
 
-**Important: Only train models for biosamples matching the corresponding CRISPR data (in this case, K562)**
+**Important: Only train models for biosamples matching the corresponding CRISPR data**
 - Much of the the model training code was adapted from Alireza Karbalayghareh's [original implementation](https://github.com/karbalayghareh/ENCODE-E2G).
 
 Modify `config/config_training.yaml` with your model and dataset configs
-- `model_config` has columns:  model, dataset, ABC_directory, feature_table, polynomial (do you want to use polynomial features?), and override_params (are there model training parameters you would like to change from the default logistic regression settings specfied in `config/config_training.yaml`?)
+- `model_config` has columns:  model, dataset, crispr_dataset, feature_table, polynomial (do you want to use polynomial features?), and override_params (are there model training parameters you would like to change from the default logistic regression settings specfied in `config/config_training.yaml`?)
     - See [this example](https://pastebin.com/zt1868R3) `model_config` for how to specfiy override parameters. If there are no override_params, leave the column blank but still include the header.
+    - The `dataset` column is a comma-separated list with values corresponding to biosamples in `dataset_config`. Datasets must be provided for each `crispr_cell_type` included in the corresponding `crispr_dataset`.
+    - The `crispr_dataset` column corresponds to a key under `crispr_dataset` in `config_training`.
     - Feature tables must be specified for each model (example: `resources/feature_tables`) with columns: feature (name in final table), input_col (name in ABC output), second_input (multiplied by input_col if provided), aggregate_function (how to combine feature values when a CRISPR element overlaps more than one ABC element), fill_value (how to replace NAs), nice_name (used when plotting)
     - Note that trained models generated using polynomial features cannot directly be used in the **Apply model** workflow
 - `dataset_config` is an ABC biosamples config to generate ABC predictions for datasets without an existing ABC directory. 
-- Each dataset must correspond to a unique ABC_directory, with "biosample" in `dataset_config` equals "dataset" in `model_config`. If no ABC_directory is indicated in `model_config`, it must have an entry in `dataset_config`.
+    Each dataset must correspond to a "biosample" in `dataset_config` equals "dataset" in `model_config`. The column `crispr_cell_type` indicates the which cell type in the CRISPR data corresponds to this biosample.
 - If you are including features in addition to those generated within the pipeline (e.g. a value in input_col or second_input of a feature table is not included in `reference_features` in `config/config_training/yaml`), you must also define how to add these values with an external_features_config, which you include in `dataset_config` in the optional column external_features_config:
     - An `external_features_config` has columns feature (corresponding to the missing input_col or second_input value), source_col (column name in the source file), aggregate_function (how to combine values when merging different element definitions), join_by, and source_file
     - join_by must be either "TargetGene" (feature is defined per gene) or "overlap" (feature is defined per element-gene pair)

diff --git a/config/config_biosamples_chr22.tsv b/config/config_biosamples_chr22.tsv
@@ -1,2 +1,2 @@
-biosample	DHS	ATAC	H3K27ac	default_accessibility_feature	HiC_file	HiC_type	HiC_resolution	alt_TSS	alt_genes
-K562_chr22	ABC/example_chr/chr22/ENCFF860XAE.chr22.sorted.se.bam			DHS	https://www.encodeproject.org/files/ENCFF621AIY/@@download/ENCFF621AIY.hic	hic	5000	ABC/example_chr/chr22/RefSeqCurated.170308.bed.CollapsedGeneBounds.chr22.hg38.TSS500bp.bed	ABC/example_chr/chr22/RefSeqCurated.170308.bed.CollapsedGeneBounds.chr22.hg38.bed
+biosample	crispr_cell_type	DHS	ATAC	H3K27ac	default_accessibility_feature	HiC_file	HiC_type	HiC_resolution	alt_TSS	alt_genes
+K562_chr22	K562	ABC/example_chr/chr22/ENCFF860XAE.chr22.sorted.se.bam			DHS	https://www.encodeproject.org/files/ENCFF621AIY/@@download/ENCFF621AIY.hic	hic	5000	ABC/example_chr/chr22/RefSeqCurated.170308.bed.CollapsedGeneBounds.chr22.hg38.TSS500bp.bed	ABC/example_chr/chr22/RefSeqCurated.170308.bed.CollapsedGeneBounds.chr22.hg38.bed
diff --git a/config/config_models_test.tsv b/config/config_models_test.tsv
@@ -1,2 +1,2 @@
-model	dataset	ABC_directory	feature_table	polynomial	override_params
-DNase_megamap	K562_chr22		resources/feature_tables/final_feature_set_DNase_hic.tsv	False 	
+model	dataset	crispr_dataset	feature_table	polynomial	override_params
+DNase_megamap	K562_chr22	training	resources/feature_tables/final_feature_set_DNase_hic.tsv	FALSE	
diff --git a/config/config_training.yaml b/config/config_training.yaml
@@ -12,7 +12,10 @@ max_memory_allocation_mb: 250000 # for submiting jobs
 gene_TSS500: "reference/CollapsedGeneBounds.hg38.TSS500bp.bed" # TSS reference file
 chr_sizes: "reference/GRCh38_EBV.no_alt.chrom.sizes.tsv"
 gene_classes: "resources/external_features/gene_promoter_class_RefSeqCurated.170308.bed.CollapsedGeneBounds.hg38.TSS500bp.tsv"
-crispr_dataset: "reference/EPCrisprBenchmark_ensemble_data_GRCh38.tsv.gz" # CRISPR dataset
+crispr_dataset:
+  training: "reference/EPCrisprBenchmark_ensemble_data_GRCh38.tsv.gz" # CRISPR dataset
+crispr_cell_types: 
+  training: ["K562"]
 # list of features that are generated by default, referenced by their column names in source files;
 reference_features: ["numTSSEnhGene", "distance", "activity_base", "TargetGenePromoterActivityQuantile", "numNearbyEnhancers", "sumNearbyEnhancers", "is_ubiquitous_uniform", "P2PromoterClass", "numCandidateEnhGene", "hic_contact_pl_scaled_adj", "ABC.Score", "ABC.Numerator", "ABC.Denominator", "normalized_dhs_prom", "normalized_dhs_enh", "normalized_atac_prom", "normalized_atac_enh", "normalized_h3k27ac_enh", "normalized_h3k27ac_prom"] 
 

diff --git a/workflow/Snakefile_training b/workflow/Snakefile_training
@@ -1,27 +1,34 @@
 from snakemake.utils import min_version
 min_version("7.0")
+conda: "mamba"
 
 import pandas as pd
 import os
 import yaml
 
-configfile: "config/config_training.yaml"
 model_config = pd.read_table(config["model_config"], na_values="").fillna("None").set_index("model", drop=False)
 dataset_config = pd.read_table(config["dataset_config"], na_values="").set_index("biosample", drop=False)
 conda: "mamba"
+
+# import utils and make config paths absolute
 MAX_MEM_MB = config["max_memory_allocation_mb"]
 include: "rules/utils.smk"
 
 E2G_DIR_PATH = os.path.abspath(config["E2G_DIR_PATH"])
 config = make_paths_absolute(config, E2G_DIR_PATH)
+config["results_dir"] = os.path.join(E2G_DIR_PATH, config["results_dir"]) # manually modify results dir since may not exist
 
-# Need to manually make results_dir an absolute path since above may
-# not work if results_dir folder isn't created
-# If results_dir is already an absolute path, this is a no-op
-config["results_dir"] = os.path.join(E2G_DIR_PATH, config["results_dir"])
+# define some global variables 
+RESULTS_DIR = config["results_dir"]
+MODELS_RESULTS_DIR = os.path.join(config["results_dir"], "model_results")
+SCRIPTS_DIR = os.path.join(E2G_DIR_PATH, "workflow/scripts")
 
+# process configs
 model_config = process_model_config(model_config)
+model_dataset_dict = make_model_dataset_dict(model_config, dataset_config) # dictionary of dictionaries: model: {crispr_ct: dataset, ...}
+print(model_dataset_dict)
 
+# import ABC submodule
 module ABC:
     snakefile:
         f"{config['ABC_DIR_PATH']}/workflow/Snakefile"
@@ -31,39 +38,39 @@ abc_config = get_abc_config(config)
 
 use rule * from ABC exclude all as abc_*
 
-RESULTS_DIR = config["results_dir"]
-SCRIPTS_DIR = os.path.join(E2G_DIR_PATH, "workflow/scripts")
-
-# These rules requires the variables above to be defined
+# import all rules (require the variables above to be defined)
 include: "rules/genomewide_features.smk"
 include: "rules/crispr_features.smk"
 include: "rules/train_model.smk"
 include: "rules/feature_analysis.smk"
 include: "rules/compare_models.smk"
 
-# Validate ABC_directory column and make ABC directory dict
+# validate ABC_directory column and make ABC directory dict
 ABC_BIOSAMPLES_DIR = process_abc_directory_column(model_config)
 
 # specify target files
 output_files = []
-output_files.extend(expand(os.path.join(RESULTS_DIR, "{dataset}",  "{model}", "for_training.EPCrisprBenchmark_ensemble_data_GRCh38.K562_features_NAfilled.tsv.gz"), zip, dataset=model_config["dataset"], model=model_config["model"]))
-output_files.extend(expand(os.path.join(RESULTS_DIR, "{dataset}", "{model}", "model", "model_full.pkl"), zip, dataset=model_config["dataset"], model=model_config["model"])) # trained models
+output_files.extend(expand(os.path.join(MODELS_RESULTS_DIR, "{model}", "model", "model_full.pkl"), model=model_config["model"])) # trained models
 
-output_files.append(os.path.join(RESULTS_DIR, "performance_across_models.tsv")) # comparison across models
-output_files.extend(expand(os.path.join(RESULTS_DIR, "performance_across_models_{metric}.pdf"), metric=["auprc", "precision"])) # plot of comparisons
+# output_files.extend(expand(os.path.join(RESULTS_DIR, "{dataset}",  "{model}", "for_training.EPCrisprBenchmark_ensemble_data_GRCh38.K562_features_NAfilled.tsv.gz"), zip, dataset=model_config["dataset"], model=model_config["model"]))
+# output_files.extend(expand(os.path.join(RESULTS_DIR, "{dataset}", "{model}", "model", "model_full.pkl"), zip, dataset=model_config["dataset"], model=model_config["model"])) # trained models
+
+output_files.append(os.path.join(MODELS_RESULTS_DIR, "performance_across_models.tsv")) # comparison across models
+output_files.extend(expand(os.path.join(MODELS_RESULTS_DIR, "performance_across_models_{metric}.pdf"), metric=["auprc", "precision"])) # plot of comparisons
 
 if config["run_feature_analysis"]:
-    output_files.extend(expand(os.path.join(RESULTS_DIR, "{dataset}", "{model}", "feature_analysis", "forward_feature_selection_auprc.pdf"), zip, dataset=model_config["dataset"], model=model_config["model"]))
-    output_files.extend(expand(os.path.join(RESULTS_DIR, "{dataset}", "{model}", "feature_analysis", "backward_feature_selection_auprc.pdf"), zip, dataset=model_config["dataset"], model=model_config["model"]))
-    output_files.extend(expand(os.path.join(RESULTS_DIR, "{dataset}", "{model}", "feature_analysis", "permutation_feature_importance_auprc.pdf"), zip, dataset=model_config["dataset"], model=model_config["model"]))
+    output_files.extend(expand(os.path.join(MODELS_RESULTS_DIR, "{model}", "feature_analysis", "forward_feature_selection_auprc.pdf"), model=model_config["model"]))
+    output_files.extend(expand(os.path.join(MODELS_RESULTS_DIR, "{model}", "feature_analysis", "backward_feature_selection_auprc.pdf"), model=model_config["model"]))
+    output_files.extend(expand(os.path.join(MODELS_RESULTS_DIR, "{model}", "feature_analysis", "permutation_feature_importance_auprc.pdf"), model=model_config["model"]))
 
     # only test all feature sets if polynomial==False and n_features<14
     for row in model_config.itertuples(index=False):
         if not row.polynomial == 'True':
             features = pd.read_table(row.feature_table)
             n_features = len(features) 
+
             if n_features<14:
-                output_files.append(os.path.join(RESULTS_DIR, row.dataset, row.model, "feature_analysis", "all_feature_sets.tsv"))
+                output_files.append(os.path.join(MODELS_RESULTS_DIR, row.model, "feature_analysis", "all_feature_sets.tsv"))
 
 rule all:
     input: 

diff --git a/workflow/rules/compare_models.smk b/workflow/rules/compare_models.smk
@@ -2,36 +2,41 @@
 # compare cv-performance on training data across all models (note, this is not the true benchmarking performance CRISPR elements not overlapping prediction elements aren't considered)  
 rule gather_model_performances:
 	input:
-		all_predictions = expand(os.path.join(RESULTS_DIR, "{dataset}", "{model}", "model", "training_predictions.tsv"), zip, dataset=model_config["dataset"], model=model_config["model"])
+		all_predictions = expand(os.path.join(MODELS_RESULTS_DIR, "{model}", "model", "training_predictions.tsv"), model=model_config["model"]),
+		all_missing = expand(os.path.join(MODELS_RESULTS_DIR, "{model}", "merged_CRISPR_dataset.missing_from_features.NAfilled.tsv.gz"), model=model_config["model"]),
 	output:
-		comp_table = os.path.join(RESULTS_DIR, "performance_across_models.tsv")
+		comp_table = os.path.join(MODELS_RESULTS_DIR, "performance_across_models.tsv")
 	params:
 		scripts_dir = SCRIPTS_DIR,
 		out_dir = RESULTS_DIR,
 		model_config_file = config["model_config"],
-		crispr_dataset = config["crispr_dataset"]
+		crispr_dataset_names = [n for n in model_config["crispr_dataset"].unique()],
+		crispr_dataset = lambda wildcards: [config["crispr_dataset"][cd] for cd in model_config["crispr_dataset"].unique()] 
 	conda:
 		"../envs/encode_re2g.yml" 
 	resources:
 		mem_mb=64*1000
 	shell: 
 		""" 
 		python {params.scripts_dir}/model_training/compare_all_models.py \
+			--all_pred "{input.all_predictions}" \
+			--all_missing "{input.all_missing}" \
 			--model_config_file {params.model_config_file} \
 			--output_file {output.comp_table}  \
-			--crispr_data {params.crispr_dataset} \
+			--crispr_names "{params.crispr_dataset_names}" \
+			--crispr_data "{params.crispr_dataset}" \
 			--out_dir {params.out_dir}
 		"""
 
 rule plot_model_performances:
 	input:
-		comp_table = os.path.join(RESULTS_DIR, "performance_across_models.tsv")
+		comp_table = os.path.join(MODELS_RESULTS_DIR, "performance_across_models.tsv")
 	output:
-		comp_plot_auprc = os.path.join(RESULTS_DIR, "performance_across_models_auprc.pdf"),
-		comp_plot_prec = os.path.join(RESULTS_DIR, "performance_across_models_precision.pdf")
+		comp_plot_auprc = os.path.join(MODELS_RESULTS_DIR, "performance_across_models_auprc.pdf"),
+		comp_plot_prec = os.path.join(MODELS_RESULTS_DIR, "performance_across_models_precision.pdf")
 	conda:
 		"../envs/encode_re2g.yml" 
 	resources:
 		mem_mb=ABC.determine_mem_mb
 	script:
-		"../scripts/model_training/plot_model_comparison.R"
+		"../scripts/model_training/plot_model_comparison.R"
diff --git a/workflow/rules/crispr_features.smk b/workflow/rules/crispr_features.smk
@@ -28,34 +28,42 @@ rule format_external_features_config:
 	script:
 		"../scripts/feature_tables/format_external_features_config.R"
 
-# overlap feature table  with K562 CRISPR data
-rule overlap_features_crispr:
+# overlap feature table  with CRISPR data for this dataset
+rule overlap_features_crispr_for_dataset:
 	input:
 		features = os.path.join(RESULTS_DIR, "{dataset}", "genomewide_features.tsv.gz"),
-		crispr = config['crispr_dataset'],
 		feature_table_file = os.path.join(RESULTS_DIR, "{dataset}", "feature_table.tsv"),
+		crispr = lambda wildcards: config['crispr_dataset'][wildcards.crispr_dataset],
 		tss = config['gene_TSS500']
 	params:
-		tpm_threshold = lambda wildcards: model_config.loc[wildcards.model, 'tpm_threshold']
+		crispr_cell_type = lambda wildcards: dataset_config.loc[wildcards.dataset, "crispr_cell_type"]
 	output: 
-		features = os.path.join(RESULTS_DIR, "{dataset}", "{model}", "EPCrisprBenchmark_ensemble_data_GRCh38.K562_features_{nafill}.tsv.gz"),
-		missing = os.path.join(RESULTS_DIR, "{dataset}",  "{model}", "missing.EPCrisprBenchmark_ensemble_data_GRCh38.K562_features_{nafill}.tsv.gz")
+		features = os.path.join(RESULTS_DIR, "{dataset}", "CRISPR_dataset_{crispr_dataset}.overlapping_features.{nafill}.tsv.gz"),
+		missing = os.path.join(RESULTS_DIR, "{dataset}", "CRISPR_dataset_{crispr_dataset}.missing_from_features.{nafill}.tsv.gz")
 	conda:
 		"../envs/encode_re2g.yml" 
 	resources:
 		mem_mb=32*1000
 	script:
 		"../scripts/feature_tables/overlap_features_with_crispr_data.R"
 
-# process data for model training: rename columns, apply filter features, filter to gene list
+# process data for model training: rename columns, apply filter features, filter to gene list, combine across datasets
 # note: we use the NAfilled CRISPR feature data here!
+def get_crispr_files_for_model(wildcards, file_type):
+	datasets = model_dataset_dict[wildcards.model].values()
+	crispr_dataset = model_config.loc[wildcards.model, "crispr_dataset"]
+	files = [os.path.join(RESULTS_DIR, ds, f"CRISPR_dataset_{crispr_dataset}.{file_type}.{wildcards.nafill}.tsv.gz") for ds in datasets]
+	return files
+
 rule process_crispr_data:
 	input:
-		crispr_features = os.path.join(RESULTS_DIR, "{dataset}", "{model}", "EPCrisprBenchmark_ensemble_data_GRCh38.K562_features_{nafill}.tsv.gz")
+		crispr_features = lambda wildcards: get_crispr_files_for_model(wildcards, "overlapping_features"),
+		crispr_missing = lambda wildcards: get_crispr_files_for_model(wildcards, "missing_from_features"),
 	params:
-		genes = config["gene_TSS500"]
+		genes = config["gene_TSS500"],
 	output:
-		processed = os.path.join(RESULTS_DIR, "{dataset}",  "{model}",  "for_training.EPCrisprBenchmark_ensemble_data_GRCh38.K562_features_{nafill}.tsv.gz")
+		processed = os.path.join(MODELS_RESULTS_DIR, "{model}",  "for_training.merged_CRISPR_dataset.overlapping_features.{nafill}.tsv.gz"),
+		missing = os.path.join(MODELS_RESULTS_DIR, "{model}",  "merged_CRISPR_dataset.missing_from_features.{nafill}.tsv.gz")
 	conda:
 		"../envs/encode_re2g.yml" 
 	resources: