From 59c9b71f440569b6d063ee86dd7061cc5049ba45 Mon Sep 17 00:00:00 2001 From: Scott Simpkins Date: Sat, 17 Apr 2021 16:10:03 -0700 Subject: [PATCH] First attempt at updating tests to be two-sided --- ...ided_empirical_pval.R => empirical_pval.R} | 219 +- lib/filenames.r | 186 +- lib/zscore-pval_by-drug-and-goterm_fast.r | 701 +++--- scripts/predict_gene-set_targets.r | 2193 +++++++++-------- 4 files changed, 1671 insertions(+), 1628 deletions(-) rename lib/{one-sided_empirical_pval.R => empirical_pval.R} (82%) diff --git a/lib/one-sided_empirical_pval.R b/lib/empirical_pval.R similarity index 82% rename from lib/one-sided_empirical_pval.R rename to lib/empirical_pval.R index dac491e..6ebf657 100644 --- a/lib/one-sided_empirical_pval.R +++ b/lib/empirical_pval.R @@ -1,105 +1,114 @@ -################################################################# -###### Copyright: Regents of the University of Minnesota ###### -################################################################# - - -# In this script, I write two functions that take in -# a vector of test statistics or other values and a -# vector containing a distribution against which to -# test the values in the first vector. The first -# function resembles what is already in my -# chemical genomics process prediction pipeline, -# and the second function is the "new" way of -# doing it that should be faster and use less -# memory. 
- -# This is basically the old function, although it was not -# nicely wrapped like this :) -#two_vector_sig_old = function(test_vec, distrib_vec) { -# num_controls = length(distrib_vec) -# vapply(test_vec, function(x) { -# sum(x <= distrib_vec) / num_controls -# }, numeric(1)) -#} - -empirical_pval_factory = function(alternative = c('greater', 'less')) { - - if (alternative == 'greater') { - compare_fn = `>=` - } else if (alternative == 'less') { - compare_fn = `<=` - } else { - stop('The argument for "alternative" must be in c("greater", "less")') - } - - - f = function(test_vec, distrib_vec) { - - i_test_vec = order(test_vec, decreasing = TRUE) - i_distrib_vec = order(distrib_vec, decreasing = TRUE) - - num_distrib = length(distrib_vec) - num_test = length(test_vec) - - test_sig = numeric(num_test) - - i = 1 - j = 1 - while (i <= num_distrib & j <= num_test) { - # Weird way to do the comparison, but it works and makes - # it so the "greater" and "less" functions are always the - # same. Functional programming!!! 
- if (compare_fn(distrib_vec[i_distrib_vec[i]], test_vec[i_test_vec[j]])) { - i = i + 1 - } else { - test_sig[i_test_vec[j]] = i - 1 - j = j + 1 - } - - } - - if (j <= num_test) { - test_sig[i_test_vec[j:num_test]] = i - 1 - } - - test_sig / num_distrib - } - - return(f) -} - -empirical_pval_greater = empirical_pval_factory('greater') -empirical_pval_less = empirical_pval_factory('less') - -#empirical_pval_greater = function(test_vec, distrib_vec) { -# i_test_vec = order(test_vec, decreasing = TRUE) -# i_distrib_vec = order(distrib_vec, decreasing = TRUE) -# -# num_distrib = length(distrib_vec) -# num_test = length(test_vec) -# -# test_sig = numeric(num_test) -# -# i = 1 -# j = 1 -# while (i <= num_distrib & j <= num_test) { -# if (distrib_vec[i_distrib_vec[i]] >= test_vec[i_test_vec[j]]) { -# i = i + 1 -# } else if (distrib_vec[i_distrib_vec[i]] < test_vec[i_test_vec[j]]) { -# test_sig[i_test_vec[j]] = i - 1 -# j = j + 1 -# } -# -# } -# -# if (j =< num_test) { -# test_sig[i_test_vec[j:num_test]] = i - 1 -# } -# -# test_sig / num_distrib -# -#} - - - - +################################################################# +###### Copyright: Regents of the University of Minnesota ###### +################################################################# + + +# In this script, I write two functions that take in +# a vector of test statistics or other values and a +# vector containing a distribution against which to +# test the values in the first vector. The first +# function resembles what is already in my +# chemical genomics process prediction pipeline, +# and the second function is the "new" way of +# doing it that should be faster and use less +# memory. 
+ +# This is basically the old function, although it was not +# nicely wrapped like this :) +#two_vector_sig_old = function(test_vec, distrib_vec) { +# num_controls = length(distrib_vec) +# vapply(test_vec, function(x) { +# sum(x <= distrib_vec) / num_controls +# }, numeric(1)) +#} + +empirical_pval_factory = function(alternative = c('greater', 'less')) { + + if (alternative == 'greater') { + order_fn = function(x) order(x, decreasing = TRUE) + compare_fn = `>=` + } else if (alternative == 'less') { + order_fn = function(x) order(x, decreasing = FALSE) + compare_fn = `<=` + } else { + stop('The argument for "alternative" must be in c("greater", "less")') + } + + + f = function(test_vec, distrib_vec) { + + i_test_vec = order_fn(test_vec) + i_distrib_vec = order_fn(distrib_vec) + + num_distrib = length(distrib_vec) + num_test = length(test_vec) + + test_sig = numeric(num_test) + + i = 1 + j = 1 + while (i <= num_distrib & j <= num_test) { + # Weird way to do the comparison, but it works and makes + # it so the "greater," "less," and "two-sided" functions are always the + # same. Functional programming!!! 
+ if (compare_fn(distrib_vec[i_distrib_vec[i]], test_vec[i_test_vec[j]])) { + i = i + 1 + } else { + test_sig[i_test_vec[j]] = i - 1 + j = j + 1 + } + + } + + if (j <= num_test) { + test_sig[i_test_vec[j:num_test]] = i - 1 + } + + test_sig / num_distrib + } + + return(f) +} + +empirical_pval_greater = empirical_pval_factory('greater') +empirical_pval_less = empirical_pval_factory('less') + +empirical_pval_two_sided = function(test_vec, distrib_vec) { + 2 * pmin( + empirical_pval_greater(test_vec, distrib_vec), + empirical_pval_less(test_vec, distrib_vec) + ) +} + +#empirical_pval_greater = function(test_vec, distrib_vec) { +# i_test_vec = order(test_vec, decreasing = TRUE) +# i_distrib_vec = order(distrib_vec, decreasing = TRUE) +# +# num_distrib = length(distrib_vec) +# num_test = length(test_vec) +# +# test_sig = numeric(num_test) +# +# i = 1 +# j = 1 +# while (i <= num_distrib & j <= num_test) { +# if (distrib_vec[i_distrib_vec[i]] >= test_vec[i_test_vec[j]]) { +# i = i + 1 +# } else if (distrib_vec[i_distrib_vec[i]] < test_vec[i_test_vec[j]]) { +# test_sig[i_test_vec[j]] = i - 1 +# j = j + 1 +# } +# +# } +# +# if (j =< num_test) { +# test_sig[i_test_vec[j:num_test]] = i - 1 +# } +# +# test_sig / num_distrib +# +#} + + + + diff --git a/lib/filenames.r b/lib/filenames.r index 2d3660b..eb59fa4 100644 --- a/lib/filenames.r +++ b/lib/filenames.r @@ -1,93 +1,93 @@ -################################################################# -###### Copyright: Regents of the University of Minnesota ###### -################################################################# - -# The functions in this R file are to unify file naming across -# all target prediction scripts. 
-get_resampled_profile_folder = function(outdir) { - file.path(outdir, 'resampled_profiles') -} - -get_resampled_profile_filename = function(folder) { - file.path(folder, 'resampled_profiles.txt.gz') -} - -get_resampled_profile_config_filename = function(folder) { - file.path(folder, 'resampled_profiles_config.yaml') -} - -get_dummy_config_filename = function(name) { - file.path(get_dummy_folder(name), 'config.yaml') -} - -get_dummy_folder = function(name) { - file.path(Sys.getenv('TARGET_PATH'), 'data', 'CG', name) -} - -get_gene_target_folder = function(outdir) { - file.path(outdir, 'gene_target_prediction') -} - -get_gene_target_prediction_filename = function(folder, similarity) { - file.path(folder, sprintf('gene_target_prediction_%s.txt.gz', similarity)) -} - -get_gene_target_prediction_resampled_filename = function(folder, rand_scheme, num_rand, seed, similarity) { - file.path(folder, sprintf('gene_target_prediction_resampled-%s_%s-rands_seed-%s_%s.txt.gz', rand_scheme, num_rand, seed, similarity)) -} - -get_gene_set_target_prediction_folder = function(outdir) { - file.path(outdir, 'gene_set_target_prediction') -} - -get_gene_set_target_prediction_filename_v1 = function(folder, rand_scheme, num_rand, seed, similarity, per_cond_seed, per_cond_num_rand, gene_set_name, test_run) { - # If either the randomization scheme or the number of randomizations - # is set to zero, then set both to zero, including the seed! 
- if (rand_scheme == 0 | num_rand == 0) { - rand_scheme = 0 - num_rand = 0 - seed = 0 - } - file.path(folder, sprintf('gene_set_target_prediction_resampled-%s_%s-rands_seed-%s_%s_per-cond-seed-%s_%s-per-cond-rands_%s-gene-sets.txt.gz', rand_scheme, num_rand, seed, similarity, per_cond_seed, per_cond_num_rand, gene_set_name, ifelse(test_run, 'test-run', ''))) -} - -get_gene_set_target_prediction_filename_v2 = function(folder, rand_scheme, num_rand, seed, similarity, per_cond_seed, per_cond_num_rand, gene_set_name, min_termsize, max_termsize, test_run) { - # If either the randomization scheme or the number of randomizations - # is set to zero, then set both to zero, including the seed! - if (rand_scheme == 0 | num_rand == 0) { - rand_scheme = 0 - num_rand = 0 - seed = 0 - } - file.path(folder, sprintf('gene_set_target_prediction_%s_%s_%s_%s_%s_%s_%s_%s-%s%s.txt.gz', rand_scheme, num_rand, seed, similarity, per_cond_seed, per_cond_num_rand, gene_set_name, min_termsize, max_termsize, ifelse(test_run, '_test-run', ''))) -} - -get_gene_set_target_prediction_terms_filename = function(folder, gene_set_name, min_termsize, max_termsize, test_run) { - - # This generates a file showing the gene sets that were actually used in the gene-set - # target prediction, once terms with too few or too many annotations were removed. 
- file.path(folder, sprintf('gene_sets_%s_%s-%s%s.txt', gene_set_name, min_termsize, max_termsize, ifelse(test_run, '_test-run', ''))) -} - -get_gene_set_hist_folder = function(outdir) { - x = get_gene_set_target_prediction_folder(outdir) - file.path(x, 'gene_sets_and stats') -} - -get_gene_set_target_prediction_size_hist_file = function(folder, gene_set_name, min_termsize, max_termsize, test_run) { - file.path(folder, sprintf('gene_set_size_dist_%s_%s-%s%s.pdf', gene_set_name, min_termsize, max_termsize, ifelse(test_run, '_test-run', ''))) -} - -get_final_results_folder = function(outdir, gene_set_name) { - file.path(outdir, 'final_results', gene_set_name) -} - -get_final_results_tabs_folder = function(outdir, gene_set_name) { - final_results_folder = get_final_results_folder(outdir, gene_set_name) - file.path(final_results_folder, 'tables') -} - -get_final_results_plots_folder = function(outdir, gene_set_name) { - final_results_folder = get_final_results_folder(outdir, gene_set_name) - file.path(final_results_folder, 'plots') -} +################################################################# +###### Copyright: Regents of the University of Minnesota ###### +################################################################# + +# The functions in this R file are to unify file naming across +# all target prediction scripts. 
+get_resampled_profile_folder = function(outdir) { + file.path(outdir, 'resampled_profiles') +} + +get_resampled_profile_filename = function(folder) { + file.path(folder, 'resampled_profiles.txt.gz') +} + +get_resampled_profile_config_filename = function(folder) { + file.path(folder, 'resampled_profiles_config.yaml') +} + +get_dummy_config_filename = function(name) { + file.path(get_dummy_folder(name), 'config.yaml') +} + +get_dummy_folder = function(name) { + file.path(Sys.getenv('TARGET_PATH'), 'data', 'CG', name) +} + +get_gene_target_folder = function(outdir) { + file.path(outdir, 'gene_target_prediction') +} + +get_gene_target_prediction_filename = function(folder, similarity) { + file.path(folder, sprintf('gene_target_prediction_%s.txt.gz', similarity)) +} + +get_gene_target_prediction_resampled_filename = function(folder, rand_scheme, num_rand, seed, similarity) { + file.path(folder, sprintf('gene_target_prediction_resampled-%s_%s-rands_seed-%s_%s.txt.gz', rand_scheme, num_rand, seed, similarity)) +} + +get_gene_set_target_prediction_folder = function(outdir) { + file.path(outdir, 'gene_set_target_prediction') +} + +get_gene_set_target_prediction_filename_v1 = function(folder, rand_scheme, num_rand, seed, similarity, per_cond_seed, per_cond_num_rand, gene_set_name, test_run) { + # If either the randomization scheme or the number of randomizations + # is set to zero, then set both to zero, including the seed! 
+ if (rand_scheme == 0 | num_rand == 0) { + rand_scheme = 0 + num_rand = 0 + seed = 0 + } + file.path(folder, sprintf('gene_set_target_prediction_resampled-%s_%s-rands_seed-%s_%s_per-cond-seed-%s_%s-per-cond-rands_%s-gene-sets.txt.gz', rand_scheme, num_rand, seed, similarity, per_cond_seed, per_cond_num_rand, gene_set_name, ifelse(test_run, 'test-run', ''))) +} + +get_gene_set_target_prediction_filename_v2 = function(folder, rand_scheme, num_rand, seed, similarity, per_cond_seed, per_cond_num_rand, gene_set_name, min_termsize, max_termsize, alternative, test_run) { + # If either the randomization scheme or the number of randomizations + # is set to zero, then set both to zero, including the seed! + if (rand_scheme == 0 | num_rand == 0) { + rand_scheme = 0 + num_rand = 0 + seed = 0 + } + file.path(folder, sprintf('gene_set_target_prediction_%s_%s_%s_%s_%s_%s_%s_%s-%s_%s_%s.txt.gz', rand_scheme, num_rand, seed, similarity, per_cond_seed, per_cond_num_rand, gene_set_name, min_termsize, max_termsize, alternative, ifelse(test_run, '_test-run', ''))) +} + +get_gene_set_target_prediction_terms_filename = function(folder, gene_set_name, min_termsize, max_termsize, test_run) { + + # This generates a file showing the gene sets that were actually used in the gene-set + # target prediction, once terms with too few or too many annotations were removed. 
+ file.path(folder, sprintf('gene_sets_%s_%s-%s%s.txt', gene_set_name, min_termsize, max_termsize, ifelse(test_run, '_test-run', ''))) +} + +get_gene_set_hist_folder = function(outdir) { + x = get_gene_set_target_prediction_folder(outdir) + file.path(x, 'gene_sets_and stats') +} + +get_gene_set_target_prediction_size_hist_file = function(folder, gene_set_name, min_termsize, max_termsize, test_run) { + file.path(folder, sprintf('gene_set_size_dist_%s_%s-%s%s.pdf', gene_set_name, min_termsize, max_termsize, ifelse(test_run, '_test-run', ''))) +} + +get_final_results_folder = function(outdir, gene_set_name) { + file.path(outdir, 'final_results', gene_set_name) +} + +get_final_results_tabs_folder = function(outdir, gene_set_name) { + final_results_folder = get_final_results_folder(outdir, gene_set_name) + file.path(final_results_folder, 'tables') +} + +get_final_results_plots_folder = function(outdir, gene_set_name) { + final_results_folder = get_final_results_folder(outdir, gene_set_name) + file.path(final_results_folder, 'plots') +} diff --git a/lib/zscore-pval_by-drug-and-goterm_fast.r b/lib/zscore-pval_by-drug-and-goterm_fast.r index 14b4418..25fc9db 100755 --- a/lib/zscore-pval_by-drug-and-goterm_fast.r +++ b/lib/zscore-pval_by-drug-and-goterm_fast.r @@ -1,338 +1,363 @@ -################################################################# -###### Copyright: Regents of the University of Minnesota ###### -################################################################# - -library(ggplot2) -library(data.table) - -# Source in libraries specific to this part of the script! -TARGET_PATH = Sys.getenv('TARGET_PATH') -source(file.path(TARGET_PATH, 'lib/one-sided_empirical_pval.R')) - -# target_prediction_mat should have drugs/conditions as rows and target prediction scores as columns -# go_term_mat should have the GO terms as columns and the predicted target genes as rows (and 1's in -# the matrix if the target gene is in a go term. 
- -# One note: I have recently changed all of the .maxcombine arguments to 100000, -# in an attempt to speed up combining all of my results once the calculations -# have completed. -best_score_per_col_group = function(mat, group_vec) { - - groups = unique(group_vec) - num_conds = nrow(mat) - final_mat = matrix(numeric(), nrow = num_conds, ncol = length(groups), dimnames = list(rownames(mat), groups)) - which_mat = matrix(character(), nrow = num_conds, ncol = length(groups), dimnames = list(rownames(mat), groups)) - for (i in seq_along(groups)) { - group = groups[i] - col_group = mat[, group_vec == group, drop = FALSE] - if (dim(col_group)[2] == 1) { - final_mat[, i] = col_group - which_mat[, i] = colnames(col_group) - } else { - best_inds = apply(col_group, 1, which.max) - best_queries = colnames(col_group)[best_inds] - best_scores = col_group[cbind(1:num_conds, best_inds)] - final_mat[, i] = best_scores - which_mat[, i] = best_queries - } - } - - return(list(best_scores = final_mat, best_queries = which_mat)) -} - -compute_per_gene_set_pvals_zscores = function(condition_x_gene_set_pred_sum_mat, control_condition_x_gene_set_pred_sum_mat_list, control_condition_x_gene_set_pred_sum_means_list, control_condition_x_gene_set_pred_sum_stdevs_list, control_types) { - - # Get some dimensions - num_control_types = length(control_types) - num_conditions = dim(condition_x_gene_set_pred_sum_mat)[1] - num_gene_sets = dim(condition_x_gene_set_pred_sum_mat)[2] - - ##### P-VALUES ##### - # Get p-values computed using different control types - # as null distributions - # Note: I compute the pvals for ALL conditions that I - # have already specified I want to predict targets for, - # not just for the treatments - - - # Assemble output data structure! 
- per_gene_set_pval_mats_by_control = replicate(num_control_types, - matrix(numeric(num_conditions * num_gene_sets), - nrow = num_conditions, - dimnames = dimnames(condition_x_gene_set_pred_sum_mat)), - simplify = FALSE) - names(per_gene_set_pval_mats_by_control) = control_types - - # Yes, this is a for loop in R. Satan's getting a bit chilled right now... - for (i in 1:num_control_types) { - - for (j in 1:num_gene_sets) { - - cat(sprintf('computing %s-derived per-gene-set pval for gene set %s/%s\r', control_types[i], j, num_gene_sets)) - # message(sprintf('computing %s-derived per-gene-set pval for gene set %s/%s', control_types[i], j, num_gene_sets)) - per_gene_set_pval_mats_by_control[[i]][, j] = empirical_pval_greater(condition_x_gene_set_pred_sum_mat[, j], - control_condition_x_gene_set_pred_sum_mat_list[[i]][, j]) - - } - cat('\n\n') - - } - - message('completed per-gene-set pval calculations') - message(str(per_gene_set_pval_mats_by_control)) - - ##### Z-SCORES ##### - # Get z-scores computed using different control types - # as null distributions - - # Assemble output data structure! - per_gene_set_zscore_mats_by_control = replicate(num_control_types, - matrix(numeric(num_conditions * num_gene_sets), - nrow = num_conditions, - dimnames = dimnames(condition_x_gene_set_pred_sum_mat)), - simplify = FALSE) - names(per_gene_set_zscore_mats_by_control) = control_types - - # Yes, this is a for loop in R. Satan's getting a bit chilled right now... 
- for (i in 1:num_control_types) { - - for (j in 1:num_conditions) { - - cat(sprintf('computing %s-derived per-gene-set zscore for condition %s/%s\r', control_types[i], j, num_conditions)) - #message(sprintf('computing %s-derived per-gene-set zscore for condition %s/%s', control_types[i], j, num_conditions)) - per_gene_set_zscore_mats_by_control[[i]][j, ] = - (condition_x_gene_set_pred_sum_mat[j, ] - control_condition_x_gene_set_pred_sum_means_list[[i]]) / - control_condition_x_gene_set_pred_sum_stdevs_list[[i]] - - } - cat('\n\n') - - } - - message('Assembling final data structure...') - - # print(str(drug_subset_go_pred_sum_mat)) - - per_gene_set_final_list = vector('list', num_control_types) - names(per_gene_set_final_list) = control_types - - for (i in 1:num_control_types) { - - per_gene_set_final_list[[i]] = list(pval = per_gene_set_pval_mats_by_control[[i]], - zscore = per_gene_set_zscore_mats_by_control[[i]]) - } - - return(per_gene_set_final_list) - -} - -compute_per_condition_pvals_zscores_2 = function(condition_x_gene_score_mat, condition_x_gene_set_sum_mat, gene_set_mat, num_per_cond_rand, gene_set_sizes, condition_x_gene_score_means, condition_x_gene_score_stdevs, seed) { - - # Get some dimensions - num_conditions = dim(condition_x_gene_set_sum_mat)[1] - num_gene_sets = dim(condition_x_gene_set_sum_mat)[2] - num_genes = dim(condition_x_gene_score_mat)[2] - - ##### P-VALUES ##### - # Tricky part: compute p values for each per-drug - # zscore. 
- # Current scheme: shuffling gene labels on target_prediction_scores - - # Preallocate the output data structures before running - # the loop that computes the # of randoms that beat real - # predictions for each drug X gene set combination - rand_result_mat = matrix(numeric(num_conditions * num_gene_sets), - nrow = num_conditions, - dimnames = dimnames(condition_x_gene_set_sum_mat)) - - rand_beats_real_count_mat = matrix(integer(num_conditions * num_gene_sets), - nrow = num_conditions, - dimnames = dimnames(condition_x_gene_set_sum_mat)) - - - - # Seed the random number generator right before - # performing the randomizations!!! - set.seed(seed) - - for (i in 1:num_per_cond_rand) { - - cat(sprintf('computing per-condition p values, randomization: %s/%s\r', i, num_per_cond_rand)) - #message(sprintf('computing per-condition p values, randomization: %s/%s', i, num_per_cond_rand)) - - # Shuffle the gene set matrix (this is faster - # than shuffling the condition X gene set - # prediction matrix) - shuffled_gene_set_mat = gene_set_mat[sample(x = 1:num_genes, size = num_genes, replace = FALSE), ] - - # Multiply the gene-level target prediction scores by - # the shuffled gene set matrix to get per-condition - # randomized sums! Assign the result to the same variable - # that occupies the same position in memory to reduce - # memory use issues (have not tested exactly how much - # benefit comes from this - there is a slight speed - # decrease) - rand_result_mat[] = condition_x_gene_score_mat %*% shuffled_gene_set_mat - - # Determine which condition X gene set predictions using - # the shuffled gene sets (shuffled on the gene side) - # had the same or larger sum of gene-level target score - # across all the genes in the gene set of interest. 
- # Use subsetting to make sure the matrix is modified - # in place to save memory - rand_beats_real_count_mat[] = rand_beats_real_count_mat + as.integer(rand_result_mat >= condition_x_gene_set_sum_mat) - - } - cat('\n\n') - - per_condition_pval_mat = rand_beats_real_count_mat / num_per_cond_rand - - ##### Z-SCORES ##### - # Preallocate the output z-score matrix - per_condition_zscore_mat = matrix(numeric(num_conditions * num_gene_sets), - nrow = num_conditions, - dimnames = dimnames(condition_x_gene_set_sum_mat)) - - # Yes, another for loop. They work great lol. - for (i in 1:num_gene_sets) { - - cat(sprintf('computing per-condition zscores for gene set %s/%s\r', i, num_gene_sets)) - #message(sprintf('computing per-condition zscores for gene set %s/%s', i, num_gene_sets)) - per_condition_zscore_mat[, i] = sqrt(gene_set_sizes[i]) * ((condition_x_gene_set_sum_mat[, i] / gene_set_sizes[i]) - condition_x_gene_score_means) / condition_x_gene_score_stdevs - - } - cat('\n\n') - - per_condition_final_list <- list(pval = per_condition_pval_mat, zscore = per_condition_zscore_mat) - - return(per_condition_final_list) - -} - - -# Define a function to extract the worst p value (or z score, if the -# p values match) between the two different control types (dmso and -# randomized drugs) -get_worst_case_pval_zscore <- function(scheme1_pval_mat, scheme1_zscore_mat, scheme2_pval_mat, scheme2_zscore_mat, scheme1_name, scheme2_name) { - - # Get sets of indices corresponding to where I should select - # the scheme1 control-derived result vs the scheme2omized drug-derived - # result - scheme1_beats_scheme2_pval <- scheme1_pval_mat < scheme2_pval_mat - scheme2_beats_scheme1_pval <- scheme1_pval_mat > scheme2_pval_mat - scheme1_equal_scheme2_pval <- scheme1_pval_mat == scheme2_pval_mat - - scheme1_beats_scheme2_zscore <- scheme1_zscore_mat > scheme2_zscore_mat - scheme2_beats_scheme1_zscore <- scheme1_zscore_mat < scheme2_zscore_mat - scheme1_equal_scheme2_zscore <- scheme1_zscore_mat == 
scheme2_zscore_mat - - pick_scheme1_indices <- scheme2_beats_scheme1_pval | (scheme1_equal_scheme2_pval & scheme2_beats_scheme1_zscore) - pick_scheme2_indices <- scheme1_beats_scheme2_pval | (scheme1_equal_scheme2_pval & scheme1_beats_scheme2_zscore) - tie_indices <- scheme1_equal_scheme2_pval & scheme1_equal_scheme2_zscore - - mat_dim <- dim(scheme1_pval_mat) - mat_dimnames <- dimnames(scheme1_pval_mat) - - # Initialize some matrices - worst_pval_mat <- matrix(numeric(), nrow = mat_dim[1], ncol = mat_dim[2], dimnames = mat_dimnames) - worst_zscore_mat <- matrix(numeric(), nrow = mat_dim[1], ncol = mat_dim[2], dimnames = mat_dimnames) - worst_control_name_mat <- matrix(character(), nrow = mat_dim[1], ncol = mat_dim[2], dimnames = mat_dimnames) - - # Fill in my worst case pval/zscore/control_name matrices! - worst_pval_mat[pick_scheme1_indices] <- scheme1_pval_mat[pick_scheme1_indices] - worst_pval_mat[pick_scheme2_indices] <- scheme2_pval_mat[pick_scheme2_indices] - worst_pval_mat[tie_indices] <- scheme1_pval_mat[tie_indices] - - worst_zscore_mat[pick_scheme1_indices] <- scheme1_zscore_mat[pick_scheme1_indices] - worst_zscore_mat[pick_scheme2_indices] <- scheme2_zscore_mat[pick_scheme2_indices] - worst_zscore_mat[tie_indices] <- scheme1_zscore_mat[tie_indices] - - worst_control_name_mat[pick_scheme1_indices] <- scheme1_name - worst_control_name_mat[pick_scheme2_indices] <- scheme2_name - worst_control_name_mat[tie_indices] <- 'tie' - - list(worst_pval = worst_pval_mat, - worst_zscore = worst_zscore_mat, - control_name = worst_control_name_mat - ) -} - -# Function to export diagnostic plots for any two schemes that generate a matrix of pvals -# and a matrix of zscores - -print_diagnostic_plots_pval_zscore_2_schemes <- function(scheme1_pval_mat, scheme1_zscore_mat, scheme2_pval_mat, scheme2_zscore_mat, scheme1_name, scheme2_name, plots_folder) { - - dir.create(plots_folder, recursive = TRUE) - - master_plot_list <- list( - # pval vs zscore plots - 
scheme1_pval_vs_zscore_plot = qplot(x = as.vector(scheme1_zscore_mat), - y = -log10(as.vector(scheme1_pval_mat)), - xlab = sprintf('%s z-score', scheme1_name), - ylab = sprintf('-log10 %s p value', scheme1_name), - main = sprintf('Comparison of all %s-derived\nz-scores and p values', scheme1_name) - ), - - scheme2_pval_vs_zscore_plot = qplot(x = as.vector(scheme2_zscore_mat), - y = -log10(as.vector(scheme2_pval_mat)), - xlab = sprintf('%s z-score', scheme2_name), - ylab = sprintf('-log10 %s p value', scheme2_name), - main = sprintf('Comparison of all %s-derived\nz-scores and p values', scheme2_name) - ), - - # Across scheme pval vs pval and zscore vs zscore plots - scheme1_vs_scheme2_pval_plot = qplot(x = -log10(as.vector(scheme1_pval_mat)), - y = -log10(as.vector(scheme2_pval_mat)), - xlab = sprintf('-log10 %s p value', scheme1_name), - ylab = sprintf('-log10 %s p value', scheme2_name), - main = sprintf('Comparison of all %s- and %s-derived p values', scheme1_name, scheme2_name) - ), - - scheme1_vs_scheme2_zscore_plot = qplot(x = as.vector(scheme1_zscore_mat), - y = as.vector(scheme2_zscore_mat), - xlab = sprintf('%s z-score', scheme1_name), - ylab = sprintf('%s z-score', scheme2_name), - main = sprintf('Comparison of all %s- and %s-derived z-scores', scheme1_name, scheme2_name) - ), - - # Plot only each condition's top p values or top zscores from each scheme - # against each other - scheme1_vs_scheme2_top_pval_plot = qplot(x = -log10(apply(scheme1_pval_mat, 1, min)), - y = -log10(apply(scheme2_pval_mat, 1, min)), - xlab = sprintf('-log10 %s min p value\nfor each condition', scheme1_name), - ylab = sprintf('-log10 %s min p value\nfor each condition', scheme2_name), - main = sprintf('Comparison of all %s- and %s-derived p values', scheme1_name, scheme2_name) - ), - - scheme1_vs_scheme2_top_zscore_plot = qplot(x = apply(scheme1_zscore_mat, 1, max), - y = apply(scheme2_zscore_mat, 1, max), - xlab = sprintf('%s max z score\nfor each condition', scheme1_name), - ylab 
= sprintf('%s max z score\nfor each condition', scheme2_name), - main = sprintf('Comparison of all %s- and %s-derived z scores', scheme1_name, scheme2_name) - ) - ) - - # Add some common parameters to each plot for aesthetics - - # And some code to print all of these out to plots - qc_plot <- function(ggplot_obj, filename) { - png(file = file.path(plots_folder, filename), height = 4, width = 4, units = 'in', dpi = 200) - print(ggplot_obj) - dev.off() - } - - # Change names of list to reflect actual identities - new_names <- names(master_plot_list) - new_names <- vapply(new_names, function(x) sub(pattern = 'scheme1', replacement = scheme1_name, x = x), character(1)) - new_names <- vapply(new_names, function(x) sub(pattern = 'scheme2', replacement = scheme2_name, x = x), character(1)) - names(master_plot_list) <- new_names - - lapply(names(master_plot_list), function(x) { - qc_plot(master_plot_list[[x]], sprintf('%s.png', x)) - }) - - return(NULL) - -} - - +################################################################# +###### Copyright: Regents of the University of Minnesota ###### +################################################################# + +library(ggplot2) +library(data.table) + +# Source in libraries specific to this part of the script! +TARGET_PATH = Sys.getenv('TARGET_PATH') +source(file.path(TARGET_PATH, 'lib/empirical_pval.R')) + +# target_prediction_mat should have drugs/conditions as rows and target prediction scores as columns +# go_term_mat should have the GO terms as columns and the predicted target genes as rows (and 1's in +# the matrix if the target gene is in a go term. + +# One note: I have recently changed all of the .maxcombine arguments to 100000, +# in an attempt to speed up combining all of my results once the calculations +# have completed. 
+best_score_per_col_group = function(mat, group_vec) { + + groups = unique(group_vec) + num_conds = nrow(mat) + final_mat = matrix(numeric(), nrow = num_conds, ncol = length(groups), dimnames = list(rownames(mat), groups)) + which_mat = matrix(character(), nrow = num_conds, ncol = length(groups), dimnames = list(rownames(mat), groups)) + for (i in seq_along(groups)) { + group = groups[i] + col_group = mat[, group_vec == group, drop = FALSE] + if (dim(col_group)[2] == 1) { + final_mat[, i] = col_group + which_mat[, i] = colnames(col_group) + } else { + best_inds = apply(col_group, 1, which.max) + best_queries = colnames(col_group)[best_inds] + best_scores = col_group[cbind(1:num_conds, best_inds)] + final_mat[, i] = best_scores + which_mat[, i] = best_queries + } + } + + return(list(best_scores = final_mat, best_queries = which_mat)) +} + +compute_per_gene_set_pvals_zscores = function(condition_x_gene_set_pred_sum_mat, control_condition_x_gene_set_pred_sum_mat_list, control_condition_x_gene_set_pred_sum_means_list, control_condition_x_gene_set_pred_sum_stdevs_list, control_types, alternative = c('greater', 'less', 'two-sided')) { + # determine which direction of effect to detect + alternative <- match.arg(alternative) + empirical_pval_fn <- switch( + alternative, + greater = empirical_pval_greater, + less = empirical_pval_less, + `two-sided` = empirical_pval_two_sided + ) + + # Get some dimensions + num_control_types = length(control_types) + num_conditions = dim(condition_x_gene_set_pred_sum_mat)[1] + num_gene_sets = dim(condition_x_gene_set_pred_sum_mat)[2] + + ##### P-VALUES ##### + # Get p-values computed using different control types + # as null distributions + # Note: I compute the pvals for ALL conditions that I + # have already specified I want to predict targets for, + # not just for the treatments + + + # Assemble output data structure! 
+ per_gene_set_pval_mats_by_control = replicate(num_control_types, + matrix(numeric(num_conditions * num_gene_sets), + nrow = num_conditions, + dimnames = dimnames(condition_x_gene_set_pred_sum_mat)), + simplify = FALSE) + names(per_gene_set_pval_mats_by_control) = control_types + + # Yes, this is a for loop in R. Satan's getting a bit chilled right now... + for (i in 1:num_control_types) { + + for (j in 1:num_gene_sets) { + + cat(sprintf('computing %s-derived per-gene-set pval for gene set %s/%s\r', control_types[i], j, num_gene_sets)) + # message(sprintf('computing %s-derived per-gene-set pval for gene set %s/%s', control_types[i], j, num_gene_sets)) + per_gene_set_pval_mats_by_control[[i]][, j] = empirical_pval_fn(condition_x_gene_set_pred_sum_mat[, j], + control_condition_x_gene_set_pred_sum_mat_list[[i]][, j]) + + } + cat('\n\n') + + } + + message('completed per-gene-set pval calculations') + message(str(per_gene_set_pval_mats_by_control)) + + ##### Z-SCORES ##### + # Get z-scores computed using different control types + # as null distributions + + # Assemble output data structure! + per_gene_set_zscore_mats_by_control = replicate(num_control_types, + matrix(numeric(num_conditions * num_gene_sets), + nrow = num_conditions, + dimnames = dimnames(condition_x_gene_set_pred_sum_mat)), + simplify = FALSE) + names(per_gene_set_zscore_mats_by_control) = control_types + + # Yes, this is a for loop in R. Satan's getting a bit chilled right now... 
+    for (i in 1:num_control_types) {
+
+        for (j in 1:num_conditions) {
+
+            cat(sprintf('computing %s-derived per-gene-set zscore for condition %s/%s\r', control_types[i], j, num_conditions))
+            #message(sprintf('computing %s-derived per-gene-set zscore for condition %s/%s', control_types[i], j, num_conditions))
+            per_gene_set_zscore_mats_by_control[[i]][j, ] =
+                (condition_x_gene_set_pred_sum_mat[j, ] - control_condition_x_gene_set_pred_sum_means_list[[i]]) /
+                control_condition_x_gene_set_pred_sum_stdevs_list[[i]] # per-row z: (observed - control mean) / control sd, vectorized over gene sets
+
+        }
+        cat('\n\n')
+
+    }
+
+    message('Assembling final data structure...')
+
+    # print(str(drug_subset_go_pred_sum_mat))
+
+    per_gene_set_final_list = vector('list', num_control_types)
+    names(per_gene_set_final_list) = control_types
+
+    for (i in 1:num_control_types) {
+
+        per_gene_set_final_list[[i]] = list(pval = per_gene_set_pval_mats_by_control[[i]],
+                                            zscore = per_gene_set_zscore_mats_by_control[[i]])
+    }
+
+    return(per_gene_set_final_list) # list[control type] -> list(pval = matrix, zscore = matrix)
+
+}
+
+compute_per_condition_pvals_zscores_2 = function(condition_x_gene_score_mat, condition_x_gene_set_sum_mat, gene_set_mat, num_per_cond_rand, gene_set_sizes, condition_x_gene_score_means, condition_x_gene_score_stdevs, seed, alternative = c('greater', 'less', 'two-sided')) {
+
+    # determine which direction of effect to detect
+    alternative = match.arg(alternative)
+
+    # Get some dimensions
+    num_conditions = dim(condition_x_gene_set_sum_mat)[1]
+    num_gene_sets = dim(condition_x_gene_set_sum_mat)[2]
+    num_genes = dim(condition_x_gene_score_mat)[2]
+
+    ##### P-VALUES #####
+    # Tricky part: compute p values for each per-drug
+    # zscore.
+    # Current scheme: shuffling gene labels on target_prediction_scores
+
+    # Preallocate the output data structures before running
+    # the loop that computes the # of randoms that beat real
+    # predictions for each drug X gene set combination
+    rand_result_mat = matrix(numeric(num_conditions * num_gene_sets),
+                             nrow = num_conditions,
+                             dimnames = dimnames(condition_x_gene_set_sum_mat))
+
+    rand_beats_real_count_mat = matrix(integer(num_conditions * num_gene_sets),
+                                       nrow = num_conditions,
+                                       dimnames = dimnames(condition_x_gene_set_sum_mat))
+
+
+
+    # Seed the random number generator right before
+    # performing the randomizations!!!
+    set.seed(seed) # reproducible permutations
+
+    for (i in 1:num_per_cond_rand) {
+
+        cat(sprintf('computing per-condition p values, randomization: %s/%s\r', i, num_per_cond_rand))
+        #message(sprintf('computing per-condition p values, randomization: %s/%s', i, num_per_cond_rand))
+
+        # Shuffle the gene set matrix (this is faster
+        # than shuffling the condition X gene set
+        # prediction matrix)
+        shuffled_gene_set_mat = gene_set_mat[sample(x = 1:num_genes, size = num_genes, replace = FALSE), ] # permutes the gene (row) labels
+
+        # Multiply the gene-level target prediction scores by
+        # the shuffled gene set matrix to get per-condition
+        # randomized sums! Assign the result to the same variable
+        # that occupies the same position in memory to reduce
+        # memory use issues (have not tested exactly how much
+        # benefit comes from this - there is a slight speed
+        # decrease)
+        rand_result_mat[] = condition_x_gene_score_mat %*% shuffled_gene_set_mat # [] keeps dims/dimnames and reuses the allocation
+
+        # Determine which condition X gene set predictions using
+        # the shuffled gene sets (shuffled on the gene side)
+        # had the same or larger sum of gene-level target score
+        # across all the genes in the gene set of interest.
+        # Use subsetting to make sure the matrix is modified
+        # in place to save memory
+        rand_beats_real_count_mat[] = rand_beats_real_count_mat + as.integer(rand_result_mat >= condition_x_gene_set_sum_mat)
+
+    }
+    cat('\n\n')
+
+    per_condition_pval_mat = switch(
+        alternative,
+        greater = rand_beats_real_count_mat / num_per_cond_rand, # fraction of permutations with rand >= real
+        less = 1 - rand_beats_real_count_mat / num_per_cond_rand, # NOTE(review): this counts rand < real (ties excluded); confirm vs convention in lib/empirical_pval.R
+        `two-sided` = 2 * pmin(rand_beats_real_count_mat / num_per_cond_rand, 1 - rand_beats_real_count_mat / num_per_cond_rand) # fold both tails: 2 * min(p_greater, p_less); pmin needs BOTH arguments (pmin(x) alone is a no-op)
+    )
+
+    ##### Z-SCORES #####
+    # Preallocate the output z-score matrix
+    per_condition_zscore_mat = matrix(numeric(num_conditions * num_gene_sets),
+                                      nrow = num_conditions,
+                                      dimnames = dimnames(condition_x_gene_set_sum_mat))
+
+    # Yes, another for loop. They work great lol.
+    for (i in 1:num_gene_sets) {
+
+        cat(sprintf('computing per-condition zscores for gene set %s/%s\r', i, num_gene_sets))
+        #message(sprintf('computing per-condition zscores for gene set %s/%s', i, num_gene_sets))
+        per_condition_zscore_mat[, i] = sqrt(gene_set_sizes[i]) * ((condition_x_gene_set_sum_mat[, i] / gene_set_sizes[i]) - condition_x_gene_score_means) / condition_x_gene_score_stdevs # z of the mean of n scores: sqrt(n) * (mean - mu) / sigma
+
+    }
+    cat('\n\n')
+
+    per_condition_final_list <- list(pval = per_condition_pval_mat, zscore = per_condition_zscore_mat)
+
+    return(per_condition_final_list)
+
+}
+
+
+# Define a function to extract the worst p value (or z score, if the
+# p values match) between the two different control types (dmso and
+# randomized drugs)
+get_worst_case_pval_zscore <- function(scheme1_pval_mat, scheme1_zscore_mat, scheme2_pval_mat, scheme2_zscore_mat, scheme1_name, scheme2_name, alternative = c('greater', 'less', 'two-sided')) {
+
+    alternative = match.arg(alternative)
+
+    zscore_compare_fn <- switch(
+        alternative,
+        greater = `>`,
+        less = `<`,
+        `two-sided` = function(x,y) abs(x) > abs(y) # two-sided: larger magnitude in either direction is "stronger"
+    )
+
+    # Get sets of indices corresponding to where I should select
+    # the scheme1 control-derived result vs the scheme2 drug-derived
+    # result
+
+    scheme1_beats_scheme2_pval <- scheme1_pval_mat < scheme2_pval_mat
+    scheme2_beats_scheme1_pval <- scheme2_pval_mat < scheme1_pval_mat
+    scheme1_equal_scheme2_pval <- scheme1_pval_mat == scheme2_pval_mat
+
+    scheme1_beats_scheme2_zscore <- zscore_compare_fn(scheme1_zscore_mat, scheme2_zscore_mat)
+    scheme2_beats_scheme1_zscore <- zscore_compare_fn(scheme2_zscore_mat, scheme1_zscore_mat)
+    scheme1_equal_scheme2_zscore <- scheme1_zscore_mat == scheme2_zscore_mat
+
+    pick_scheme1_indices <- scheme2_beats_scheme1_pval | (scheme1_equal_scheme2_pval & scheme2_beats_scheme1_zscore) # scheme2 is MORE significant, so scheme1 is the worst case
+    pick_scheme2_indices <- scheme1_beats_scheme2_pval | (scheme1_equal_scheme2_pval & scheme1_beats_scheme2_zscore)
+    tie_indices <- scheme1_equal_scheme2_pval & scheme1_equal_scheme2_zscore
+
+    mat_dim <- dim(scheme1_pval_mat)
+    mat_dimnames <- dimnames(scheme1_pval_mat)
+
+    # Initialize some matrices
+    worst_pval_mat <- matrix(NA_real_, nrow = mat_dim[1], ncol = mat_dim[2], dimnames = mat_dimnames) # NA fill: zero-length data (numeric()) is not valid for a non-empty matrix()
+    worst_zscore_mat <- matrix(NA_real_, nrow = mat_dim[1], ncol = mat_dim[2], dimnames = mat_dimnames)
+    worst_control_name_mat <- matrix(NA_character_, nrow = mat_dim[1], ncol = mat_dim[2], dimnames = mat_dimnames)
+
+    # Fill in my worst case pval/zscore/control_name matrices!
+    worst_pval_mat[pick_scheme1_indices] <- scheme1_pval_mat[pick_scheme1_indices]
+    worst_pval_mat[pick_scheme2_indices] <- scheme2_pval_mat[pick_scheme2_indices]
+    worst_pval_mat[tie_indices] <- scheme1_pval_mat[tie_indices]
+
+    worst_zscore_mat[pick_scheme1_indices] <- scheme1_zscore_mat[pick_scheme1_indices]
+    worst_zscore_mat[pick_scheme2_indices] <- scheme2_zscore_mat[pick_scheme2_indices]
+    worst_zscore_mat[tie_indices] <- scheme1_zscore_mat[tie_indices]
+
+    worst_control_name_mat[pick_scheme1_indices] <- scheme1_name
+    worst_control_name_mat[pick_scheme2_indices] <- scheme2_name
+    worst_control_name_mat[tie_indices] <- 'tie'
+
+    list(worst_pval = worst_pval_mat,
+         worst_zscore = worst_zscore_mat,
+         control_name = worst_control_name_mat
+         )
+}
+
+# Function to export diagnostic plots for any two schemes that generate a matrix of pvals
+# and a matrix of zscores
+
+print_diagnostic_plots_pval_zscore_2_schemes <- function(scheme1_pval_mat, scheme1_zscore_mat, scheme2_pval_mat, scheme2_zscore_mat, scheme1_name, scheme2_name, plots_folder) {
+
+    dir.create(plots_folder, recursive = TRUE)
+
+    master_plot_list <- list(
+        # pval vs zscore plots
+        scheme1_pval_vs_zscore_plot = qplot(x = as.vector(scheme1_zscore_mat),
+                                            y = -log10(as.vector(scheme1_pval_mat)),
+                                            xlab = sprintf('%s z-score', scheme1_name),
+                                            ylab = sprintf('-log10 %s p value', scheme1_name),
+                                            main = sprintf('Comparison of all %s-derived\nz-scores and p values', scheme1_name)
+                                            ),
+
+        scheme2_pval_vs_zscore_plot = qplot(x = as.vector(scheme2_zscore_mat),
+                                            y = -log10(as.vector(scheme2_pval_mat)),
+                                            xlab = sprintf('%s z-score', scheme2_name),
+                                            ylab = sprintf('-log10 %s p value', scheme2_name),
+                                            main = sprintf('Comparison of all %s-derived\nz-scores and p values', scheme2_name)
+                                            ),
+
+        # Across scheme pval vs pval and zscore vs zscore plots
+        scheme1_vs_scheme2_pval_plot = qplot(x = -log10(as.vector(scheme1_pval_mat)),
+                                             y = -log10(as.vector(scheme2_pval_mat)),
+                                             xlab = sprintf('-log10 %s p value', scheme1_name),
+                                             ylab = sprintf('-log10 %s p value', scheme2_name),
+                                             main = sprintf('Comparison of all %s- and %s-derived p values', scheme1_name, scheme2_name)
+                                             ),
+
+        scheme1_vs_scheme2_zscore_plot = qplot(x = as.vector(scheme1_zscore_mat),
+                                               y = as.vector(scheme2_zscore_mat),
+                                               xlab = sprintf('%s z-score', scheme1_name),
+                                               ylab = sprintf('%s z-score', scheme2_name),
+                                               main = sprintf('Comparison of all %s- and %s-derived z-scores', scheme1_name, scheme2_name)
+                                               ),
+
+        # Plot only each condition's top p values or top zscores from each scheme
+        # against each other
+        scheme1_vs_scheme2_top_pval_plot = qplot(x = -log10(apply(scheme1_pval_mat, 1, min)),
+                                                 y = -log10(apply(scheme2_pval_mat, 1, min)),
+                                                 xlab = sprintf('-log10 %s min p value\nfor each condition', scheme1_name),
+                                                 ylab = sprintf('-log10 %s min p value\nfor each condition', scheme2_name),
+                                                 main = sprintf('Comparison of all %s- and %s-derived p values', scheme1_name, scheme2_name)
+                                                 ),
+
+        scheme1_vs_scheme2_top_zscore_plot = qplot(x = apply(scheme1_zscore_mat, 1, max),
+                                                   y = apply(scheme2_zscore_mat, 1, max),
+                                                   xlab = sprintf('%s max z score\nfor each condition', scheme1_name),
+                                                   ylab = sprintf('%s max z score\nfor each condition', scheme2_name),
+                                                   main = sprintf('Comparison of all %s- and %s-derived z scores', scheme1_name, scheme2_name)
+                                                   )
+        )
+
+    # Add some common parameters to each plot for aesthetics
+
+    # And some code to print all of these out to plots
+    qc_plot <- function(ggplot_obj, filename) {
+        png(file = file.path(plots_folder, filename), height = 4, width = 4, units = 'in', res = 200) # res, not dpi: png() has no dpi argument and requires res when units != "px"
+        print(ggplot_obj)
+        dev.off()
+    }
+
+    # Change names of list to reflect actual identities
+    new_names <- names(master_plot_list)
+    new_names <- vapply(new_names, function(x) sub(pattern = 'scheme1', replacement = scheme1_name, x = x), character(1))
+    new_names <- vapply(new_names, function(x) sub(pattern = 'scheme2', replacement = scheme2_name, x = x), character(1))
+    names(master_plot_list) <- new_names
+ + lapply(names(master_plot_list), function(x) { + qc_plot(master_plot_list[[x]], sprintf('%s.png', x)) + }) + + return(NULL) + +} + + diff --git a/scripts/predict_gene-set_targets.r b/scripts/predict_gene-set_targets.r index 1a5f636..d2f8670 100755 --- a/scripts/predict_gene-set_targets.r +++ b/scripts/predict_gene-set_targets.r @@ -1,1092 +1,1101 @@ -#!/usr/bin/env Rscript - -################################################################# -###### Copyright: Regents of the University of Minnesota ###### -################################################################# - -## CHANGE THIS HEADING DOCUMENTATION!!!!!!!! -# -# -## This script will take in the results of chemical genomics target prediction -## and return "z scores" describing the relationship between a drug's average -## target prediction score for genes that match a GO term, and the drug's overall -## average target prediction score. -# -## This script exports all zscore calculations -## Analysis is done using other scripts -# - - -library(optparse) -library(yaml) -library(reshape2) -library(data.table) - -# Source in libraries for gene set target prediction -TARGET_PATH = Sys.getenv('TARGET_PATH') -source(file.path(TARGET_PATH, 'lib/go_drivers.r')) -source(file.path(TARGET_PATH, 'lib/file_naming.r')) -source(file.path(TARGET_PATH, 'lib/top_table.r')) -source(file.path(TARGET_PATH, 'lib/table_to_globular.r')) -source(file.path(TARGET_PATH, 'lib/zscore-pval_by-drug-and-goterm_fast.r')) -source(file.path(TARGET_PATH, 'lib/pos_args.r')) -source(file.path(TARGET_PATH, 'lib/filenames.r')) -source(file.path(TARGET_PATH, 'lib/common_vars.r')) -source(file.path(TARGET_PATH, 'lib/datasets.r')) -source(file.path(TARGET_PATH, 'lib/scott_themes.r')) -source(file.path(TARGET_PATH, 'lib/dummy_dataset.r')) - -# print(sessionInfo()) - -# Some function definitions... 
-save_data_factory = function(folder) { - # This function returns a function that saves data in a - # folder that is specified at the time of function creation. - # The resulting function only requires a "save_point" - # argument, which is x. - - force(folder) - fun = function(x) { - outfile = file.path(folder, sprintf('save_point_%s.RData', x)) - message(sprintf('\nsaving data at save point %s...', x)) - message(sprintf('location: %s\n', outfile)) - # Get a list of everything except for functions, and also - # remove the save_points and load_point variablex. We want - # both functions and save/load points to be defined for - # each run of the code. (don't want old bugs hanging - # around or wasting time on saving when the user doesn't - # want it) - # If I think about it, removing the load_point variable - # probably doesn't matter, but I'm preventing it from - # saving anyway. - object_vec = setdiff(ls(envir = .GlobalEnv), lsf.str(envir = .GlobalEnv)) - object_vec = setdiff(object_vec, c('save_points', 'load_point')) - #print(object_vec) - save(list = object_vec, file = outfile, envir = .GlobalEnv) - message('done') - } - return(fun) -} - - -load_data_factory = function(folder) { - # This function returns a function that saves loads data - # from a folder that is specified at the time of function - # creation. The resulting function only requires a - # "load_point" argument, which is x. - - force(folder) - fun = function(x) { - infile = file.path(folder, sprintf('save_point_%s.RData', x)) - message(sprintf('\nloading data from save point %s...', x)) - message(sprintf('location: %s\n', infile)) - load(infile, envir = globalenv()) - message('done') - } - return(fun) -} - - -# Parse the arguments, as these need to be read regardless of which starting -# point in the code the user wants to start the analysis from. 
-# Note: while it is totally ok (and more or less the point) for the "load_point" -# and "save_points" arguments to change between runs on the same dataset -# (for example, stepping through the analysis, one save point at a time), -# "--test_run" only matters when no load point is specified (or it is 0). -positional_arguments_list = c(CONFIG_FILE = 'yaml-formatted gene-set prediction configuration file.') - -option_list = list(make_option('--test_run', action = 'store_true', default = FALSE, help = 'Performs predictions for a small subset of the data, to ensure that everything is working properly before the full set of predictions are made (can take days for large screens). This option only has an effect when "--load_point" is not specified or is 0. If "--test_run" has been specified and the analysis saved halfway through using "--save_points," then the analysis will still be in "test run" mode when loading that save point, and vice versa if "--test_run" was not specified. NOTE: the results from this run will not be accurate!!!'), - make_option('--load_point', default = 0, type = 'integer', help = 'Point in the processing at which a previously partially-processed, saved dataset should be loaded and its processing continued through the rest of the script. Possible values are in c(0, 1, 2). 0 (default): run all code from the beginning; 1: resume the analysis at per-gene-set p-value computations; 2: resume the analysis at per-condition p-value computations. While primarily for testing purposes, this could be useful during real analyses.'), - make_option('--save_points', default = '', type = 'character', help = 'Point(s) in the processing at which to save the data in their present, partially-processed form. See "--load_point" for descriptions of these points. If the user wishes to specify multiple save points, these must be comma-delimited *without* spaces (e.g. --save_points 1,2). 
Data are saved in "*.RData" format.') - ) -final_usage_positional = get_usage_and_positional(positional_arguments_list) - -parser = OptionParser(usage = final_usage_positional, option_list = option_list) -arguments = parse_args(parser, positional_arguments = length(positional_arguments_list)) -opt = arguments$options -arg = arguments$args - -print(opt) -print(arg) - - - -####### First, deal with the save/load point arguments -# Load 'em in... -load_point = opt$load_point -save_points_raw = opt$save_points -save_points_split = strsplit(save_points_raw, ',')[[1]] -save_points = as.integer(save_points_split) - -# Make sure no saving is supposed to happen before loading -if (any(save_points < load_point)) { - stop('Specified intermediate file save points must occur after the specified load point!') -} - -# Looks like I have to parse the config file, *just a bit*, so -# that it knows where the output directory is. Grrrrrrr... -config_f = file(arg[1], 'rt') -config_params = yaml.load_file(config_f) -close(config_f) - -# Get the output folder for intermediate data -# (and create if it doesn't exist!) -output_table_folder = get_gene_set_target_prediction_folder(config_params$Required_arguments$output_folder) -intermed_folder = file.path(output_table_folder, 'intermediate_data') -dir.create(intermed_folder, recursive = TRUE) - -# Generate the save_data and load_data functions -save_data = save_data_factory(intermed_folder) -load_data = load_data_factory(intermed_folder) - -# If the user does not specify a load point (or explicitly wants -# to load from the beginning), then run the code from the -# beginning! Otherwise, all of this code will be skipped, as the -# options have already been defined and saved in the user's -# R environment from the first round of processing! - -# Now, on to the real stuff! 
-if (load_point < 1) { - - - #positional_arguments_list = c(gene_predictions = 'Gzipped file containing the gene-level target predictions for each condition.', - # rand_gene_predictions = 'Gzipped file containing the gene-level target predictions for each resampled condition.', - # gene_sets = 'Two-column, tab-delimited file (WITH HEADER) that maps gene identifiers (must have same column name as corresponding column in the query info table) to gene sets.', - # query_info_table = 'Table with a "query_key" column that maps to the query identifiers in the genetic interaction data and another column that maps to the gene identifiers in the gene_sets file.', - # driver_cutoff = 'Similarity value (similarity to the condition profile) above which a gene is reported as a "driver" of particular gene-set prediction. Default value cannot be specified because unbounded similarity measures are accepted as input.', - # output_table = 'File into which the results are written. (it is gzipped, so the extension should be *.txt.gz)' - # ) - - #option_list = list( - # make_option('--sample_table', help = 'File with a table that maps conditions to control information'), - # make_option('--neg_control_column', help = 'Name of True/False column in the sample_table that indicates which samples are to be used as negative controls in gene-set prediction. If this option is not specified, only the resampled conditions will be used for gene-set prediction.'), - # make_option('--condition_name_column', help = 'Name of a column in the sample table with human-readable condition names'), - # make_option(c('-n', '--per_condition_randomizations'), type = 'integer', help = 'The number of "per-condition" randomizations to perform. 
When not specified, the default is to perform the same number of randomizations that are present in the rand_gene_predictions file.'), - # make_option('--gene_set_names', help = 'Two-column, tab-delimited file that maps each gene set identifier to an interpretable name for gene-set prediciton output. The values in the first column must match the gene sets in the gene_sets file above. The values in the second column will be included in the final results table (along with those in the first column).'), - # make_option('--gene_name_column', help = 'Column in "query_info_table" that contains interpretable gene names. In S. cerevisiae, for example this would be the column that contains "TUB3" in the row for ORF YML124C'), - # make_option('--test_run', action = 'store_true', default = FALSE, help = 'Performs predictions for a small subset of the data, to ensure that everything is working properly before the full set of predictions are made (can take days for large screens). The results from this run will not be accurate!!!'), - # make_option('--num_cores', type = 'integer', default = 1, help = 'The number of cores used to run the gene set predictions!'), - # make_option(c('-v', '--verbosity'), type = 'integer', default = 1, help = 'The level of verbosity printed to stdout. Ranges from 0 to 3, 1, is default.') - # ) - - - ################# Load in the required files, deal with arguments - - # Note: I had concerns that not parsing the config file within - # this block of code would lead the user to be able to alter - # the config params on a saved run that was loaded halfway into - # the analysis. I now believe this is unfounded, as all variables - # will be restored to those from the config file at the time the - # analysis was initiated when the user loads all variables in - # from a specified save point. - - ####### Now for the parameters in the config file!!! 
- output_folder = config_params$Required_arguments$output_folder - gene_pred_folder = get_gene_target_folder(output_folder) - gene_pred_file = get_gene_target_prediction_filename(gene_pred_folder, - config_params$Required_arguments$cg_gi_similarity_measure - ) - gene_prediction_tab = fread(sprintf('gzip -dc %s', gene_pred_file), header = TRUE, colClasses = c('character', 'character', 'character', 'numeric')) - - # Since the user can specify not to use the resampled profiles derived - # from the entire dataset, here I start dealing with that. The code - # will look a bit clunky, but this will be more reliable/predictable - # than the alternative of creating an empty data.table to funnel - # through all of the same steps. That will likely wreak havoc. - zero_resampled_profiles = (config_params$Required_arguments$`per-array_resampling_scheme` == 0) | (config_params$Required_arguments$`num_per-array_resampled_profiles`== 0) - - if (!zero_resampled_profiles) { - - rand_gene_pred_file = get_gene_target_prediction_resampled_filename( - gene_pred_folder, - config_params$Required_arguments$`per-array_resampling_scheme`, - config_params$Required_arguments$`num_per-array_resampled_profiles`, - config_params$Required_arguments$`per-array_resampling_seed`, - config_params$Required_arguments$cg_gi_similarity_measure - ) - rand_gene_prediction_tab = fread(sprintf('gzip -dc %s', rand_gene_pred_file), header = TRUE, colClasses = c('character', 'character', 'character', 'numeric')) - } else { - - # If there are no resampled profiles, make sure that the config_params, - # which are used later in the script, accurately reflect that. 
- config_params$Required_arguments$`per-array_resampling_scheme` = 0 - config_params$Required_arguments$`num_per-array_resampled_profiles` = 0 - config_params$Required_arguments$`per-array_resampling_seed` = 0 - } - - sample_table_filename = config_params$Required_arguments$cg_col_info_table - sample_table = fread(sample_table_filename, header = TRUE, colClasses = 'character') - - gene_set_name = config_params$Required_arguments$gene_set_name - gi_data_name = config_params$Required_arguments$gi_dataset_name - gene_set_info = get_gene_set_info(gene_set_name, TARGET_PATH) - print(gene_set_info) - gene_set_tab_full = fread(gene_set_info$filename, header = TRUE, sep = '\t', colClasses = 'character') - gene_set_gene_id_col = names(gene_set_tab_full)[1] - gene_set_id_col = names(gene_set_tab_full)[2] - gene_set_name_col = gene_set_info$interpretable_column - print(gene_set_tab_full) - print(gene_set_id_col) - print(gene_set_name_col) - gene_set_tab = gene_set_tab_full[, c(gene_set_gene_id_col, gene_set_id_col), with = FALSE] - - # Here, I read in the min and max allowable gene set sizes - min_term_size = as.numeric(config_params$Required_arguments$min_term_size) - max_term_size = as.numeric(config_params$Required_arguments$max_term_size) - if (is.na(min_term_size) | is.na(max_term_size)) { - stop('Required arguments "min_term_size" and/or "max_term_size" not specified\nproperly. 
Please specify as a number and re-run this script.') - } - - gi_info = get_gi_info(config_params$Required_arguments$gi_dataset_name, TARGET_PATH) - #gi_tab = fread(sprintf('gzip -dc %s', gi_filenames$gi_tab)) - query_info_tab = fread(gi_info$gi_query_tab) - - #query_info_file = config_params$Required_arguments$gi_query_info_table - #query_info_tab = fread(query_info_file, header = TRUE, colClasses = 'character') - - driver_cutoff = as.numeric(config_params$Required_arguments$driver_cutoff) - if (is.na(driver_cutoff)) { stop('driver_cutoff argument must be numeric.') } - - output_table_folder = get_gene_set_target_prediction_folder(output_folder) - output_table = get_gene_set_target_prediction_filename_v2( - output_table_folder, - config_params$Required_arguments$`per-array_resampling_scheme`, - config_params$Required_arguments$`num_per-array_resampled_profiles`, - config_params$Required_arguments$`per-array_resampling_seed`, - config_params$Required_arguments$cg_gi_similarity_measure, - config_params$Required_arguments$`per-condition_randomization_seed`, - config_params$Required_arguments$`num_per-condition_randomizations`, - config_params$Required_arguments$gene_set_name, - config_params$Required_arguments$min_term_size, - config_params$Required_arguments$max_term_size, - opt$test_run - ) - - dir.create(output_table_folder, recursive = TRUE) - - gene_set_hist_folder = get_gene_set_hist_folder(output_folder) - dir.create(gene_set_hist_folder) - gene_set_size_hist_file = get_gene_set_target_prediction_size_hist_file( - gene_set_hist_folder, - config_params$Required_arguments$gene_set_name, - config_params$Required_arguments$min_term_size, - config_params$Required_arguments$max_term_size, - opt$test_run - ) - - gene_set_out_tab = get_gene_set_target_prediction_terms_filename( - gene_set_hist_folder, - config_params$Required_arguments$gene_set_name, - config_params$Required_arguments$min_term_size, - config_params$Required_arguments$max_term_size, - opt$test_run - ) - 
- # This way, `true_false_control_map` adapts to changes in `bool_vec` - init_control_map = c(`TRUE` = 'expt_control', `FALSE` = 'treatment') - true_false_control_map = init_control_map[as.character(bool_vec)] - names(true_false_control_map) = names(bool_vec) - #true_false_control_map = c('TRUE' = 'expt_control', 'FALSE' = 'treatment', 'True' = 'expt_control', 'False' = 'treatment') - - ################### Load in optional files and options - # Gene set ID to interpretable name table - # Might want to change this to "if (!gene_set_name_col == '')" instead of expecting it to be null or NA - print(gene_set_name_col) - if (!gene_set_name_col == '') { - #gene_set_name_file = config_params$Options$gene_set_target_prediction$gene_set_name_table - gene_set_name_tab = unique(gene_set_tab_full[, c(gene_set_id_col, gene_set_name_col), with = FALSE]) - print(gene_set_name_tab) - setnames(gene_set_name_tab, c('gene_set', 'gene_set_name')) - } else { - gene_set_name_tab = NULL - } - - # Note: if no column is specified containing interpretable gene names, - # then this variable will already be NULL. - gene_name_column = gi_info$query_genename_col - if (!is.null(gene_name_column)) { - if (! gene_name_column %in% names(query_info_tab)) { - stop(sprintf('column %s was specified as the column in the query info table that contains interpretable gene names, but the columns does not exist! 
query info table is here:\n%s', gene_name_column, gi_info$gi_query_tab)) - } - } - - ################ Deal with the consequences of using a dummy dataset, if applicable ############### - # (must be done before pulling other information from sample table, as this modifies the sample - # table to include dummy controls, if they exist) - dummy_name = config_params$Options$dummy_dataset$name - if (!is.null(dummy_name)) { - dummy_config_f = file(get_dummy_config_filename(dummy_name), 'rt') - dummy_config_params = yaml.load_file(dummy_config_f) - close(dummy_config_f) - dummy_dt = fread(sprintf('gzip -dc %s', file.path(get_dummy_folder(dummy_name), dummy_config_params$cg_data_table)), colClasses = c('character', 'character','character','character','numeric')) - dummy_col_tab = fread(file.path(get_dummy_folder(dummy_name), dummy_config_params$cg_col_info_tab), header = TRUE, colClasses = 'character') - - # If a column specifying negative experimental controls was specified, then use it - # to filter the dummy dataset. otherwise, do not use the dummy dataset here! - if (!is.null(config_params$Options$dummy_dataset$negative_control_column)) { - print(unique(dummy_dt[, list(Strain_ID, Barcode)], by = NULL)) - print(unique(dummy_dt[, list(screen_name, expt_id)], by = NULL)) - message(sprintf('dummy matrix dimensions before filtering for "cols_to_include": (%s, %s)', - dim(unique(dummy_dt[, list(Strain_ID, Barcode)], by = NULL))[1], - dim(unique(dummy_dt[, list(screen_name, expt_id)], by = NULL))[1])) - select_rows_dummy = bool_vec[dummy_col_tab[[config_params$Options$dummy_dataset$negative_control_column]]] - dummy_col_tab = dummy_col_tab[select_rows_dummy] - } - - # Create a final sample table with the dummy controls added in. 
- sample_table = add_dummy_controls_to_cg_sample_tab(sample_table, - config_params$Options$gene_set_target_prediction$negative_control_column, - config_params$Options$gene_set_target_prediction$condition_name_column, - dummy_col_tab, - config_params$Options$dummy_dataset$negative_control_column, - config_params$Options$dummy_dataset$condition_name_column) - } - - ################ Read in the sample table and negative control/condition name columns, if they exist - # Handle the negative control column - neg_control_col = config_params$Options$gene_set_target_prediction$negative_control_column - if (!is.null(neg_control_col)) { - if (! neg_control_col %in% names(sample_table)) { - stop(sprintf('column %s not in cg_col_info_table!', neg_control_col)) - } - control_map = true_false_control_map[sample_table[[neg_control_col]]] - names(control_map) = sample_table[, sprintf('%s_%s', screen_name, expt_id)] - } else { - control_map = rep('treatment', dim(sample_table)[1]) - names(control_map) = sample_table[, sprintf('%s_%s', screen_name, expt_id)] - } - - # Handle the condition name column - cond_name_col = config_params$Options$gene_set_target_prediction$condition_name_column - if (!is.null(cond_name_col)) { - ### May need to revisit if the columns are split or not... - if (! cond_name_col %in% names(sample_table)) { - stop(sprintf('column %s not in cg_col_info_table!', cond_name_col)) - } - condition_name_tab = sample_table[, list(condition = sprintf('%s_%s', screen_name, expt_id), condition_name = sample_table[[cond_name_col]])] - - # The condition_tab needs to have rand-by-strain conditions added so the join - # doesn't cause us to lose all or the rand-by-strain conditions! 
- # **But only if there is a table of rand-by-strain (resampled) gene target - # predictions!** - if (!zero_resampled_profiles) { - rand_condition_name_tab = unique(rand_gene_prediction_tab[, list(screen_name, expt_id)])[, list(condition = sprintf('%s_%s', screen_name, expt_id), condition_name = sprintf('%s_%s', screen_name, expt_id))] - condition_name_tab = rbind(condition_name_tab, rand_condition_name_tab) - } - - print(condition_name_tab) - } else { - condition_name_tab = NULL - } - - - print(sample_table) - print(control_map) - - ############ Here, convert all the tables into matrices and them remove - ############ before any workspaces can be saved (lots of unnecessary - ############ space would be taken up! - - ###### Smash some data around - gene_prediction_mat = acast(gene_prediction_tab, screen_name + expt_id ~ query_key, value.var = 'score') - if (!zero_resampled_profiles) { - rand_gene_prediction_mat = acast(rand_gene_prediction_tab, screen_name + expt_id ~ query_key, value.var = 'score') - all_prediction_mat = rbind(gene_prediction_mat, rand_gene_prediction_mat) - - ###### Line up the control vectors to the matrix with all predictions - rand_control_map = rep('rand-by-strain', dim(unique(rand_gene_prediction_tab[, list(screen_name, expt_id)]))[1]) - names(rand_control_map) = unique(rand_gene_prediction_tab[, sprintf('%s_%s', screen_name, expt_id)]) - - # Don't need these! - rm(rand_gene_prediction_tab) - rm(rand_gene_prediction_mat) - - } else { - all_prediction_mat = gene_prediction_mat - } - - # Don't need these! - rm(gene_prediction_tab) - rm(gene_prediction_mat) - - #all_prediction_tab = rbind(gene_prediction_tab, rand_gene_prediction_tab) - #all_prediction_mat = acast(all_prediction_tab, screen_name + expt_id ~ query_key, value.var = 'score') - - - gc() - -} - -# Here is very important testing/rerunning code. If the user wants to save -# at "save point 1," then the R environment is saved here before moving on. 
-# If the user wants to load the data previously saved at "save point 1," -# then the previously saved R environment is loaded here. -if (1 %in% save_points) { - save_data(1) -} -if (load_point == 1) { - load_data(1) -} - -# If the user-specified load point is less than 2, then this -# next section of code must be run (because the data were -# loaded in somewhere before this code and must be processed -# to completion!) -if (load_point < 2) { - - - # Check for real and random conditions overlapping (should NEVER happen) - # Only check if there are "random" (really, "resampled") conditions - if (!zero_resampled_profiles) { - print(control_map[1:10]) - print(rand_control_map[1:10]) - overlap_conds = intersect(names(control_map), names(rand_control_map)) - if (length(overlap_conds) > 0) { - stop(sprintf('The following real and random conditions have the same name:\n%s', paste(overlap_conds, collapse = '\n'))) - } - } - - # Combine the control maps and get one master control type vector!!! - if (!zero_resampled_profiles) { - - all_control_map = c(control_map, rand_control_map) - } else { - all_control_map = control_map - } - - all_control_vec = all_control_map[rownames(all_prediction_mat)] - print(all_control_vec) - print(sprintf('Number of %s conditions: %s', names(table(all_control_vec)), table(all_control_vec))) - - # If any sample types are still NA, filter them out of the dataset - inds_to_remove = is.na(all_control_vec) - conds_to_remove = rownames(all_prediction_mat)[inds_to_remove] - - all_control_vec = all_control_vec[!is.na(all_control_vec)] - all_prediction_mat = all_prediction_mat[!is.na(all_control_vec), ] - - # Get a matrix with one column per unique query gene (there may be multiple alleles for the same gene), - # and for each row (condition) in that column, we take the maximum prediction score against all - # alleles of the same query gene. Also, get a matrix that indicates which allele the selected - # max score came from. 
- - # First, get the vector of query genes that define where the multiple-allele cases are. - #print(gene_set_tab) - #print(query_info_tab) - # WHY DO I DRAW THE NAME OF THE QUERY COLUMN FROM THE GENE_SET_TAB?! - # Well now I don't :) - query_column = gi_info$query_sys_name_col - query_map = query_info_tab[[query_column]] - names(query_map) = query_info_tab[['query_key']] - query_gene_vec = query_map[colnames(all_prediction_mat)] - - print(str(all_prediction_mat)) - print(str(query_gene_vec)) - print(all_prediction_mat[1:5, (dim(all_prediction_mat)[2]-9):(dim(all_prediction_mat)[2])]) - print(query_gene_vec[(length(query_gene_vec) - 9):length(query_gene_vec)]) - - # Get the matrix with the best predictions among multiple alleles - best_query_mat_list = best_score_per_col_group(all_prediction_mat, query_gene_vec) - - best_prediction_mat = best_query_mat_list[['best_scores']] - best_query_mat = best_query_mat_list[['best_queries']] - - # Get a map from query key column to an interpretable name, if the - # gene_name_column exists. Then create a matrix that matches the - # best_query_mat matrix and contains interpretable query gene - # names. If that column was not given, create a matrix of NA - # values instead. - if (!is.null(gene_name_column)) { - gene_name_map = query_info_tab[[gene_name_column]] - names(gene_name_map) = query_info_tab[['query_key']] - best_query_name_mat = matrix(gene_name_map[best_query_mat], nrow = nrow(best_query_mat), ncol = ncol(best_query_mat), dimnames = dimnames(best_query_mat)) - } else { - best_query_name_mat = matrix(NA_character_, nrow = nrow(best_query_mat), ncol = ncol(best_query_mat), dimnames = dimnames(best_query_mat)) - } - - ###### Determine which sample types exist and will be used to make predictions - # At some point, should the user be able to specify this instead of just assuming - # that everything needs to have its target predicted? 
- sample_types_to_predict_for <- unique(all_control_vec) - - ######################### - ######################### - - ######### - ### What are the control types? (There may be experimental controls, or maybe not!) - controls <- intersect(sample_types_to_predict_for, c('expt_control', 'rand-by-strain')) - - # Reshape the gene set table into a matrix - gene_set_mat_formula = as.formula(sprintf('%s ~ %s', names(gene_set_tab)[1], names(gene_set_tab)[2])) - gene_set_tab[, annotation := 1] - gene_set_matrix = acast(data = gene_set_tab, formula = gene_set_mat_formula, value.var = 'annotation', fill = 0) - - ## Get a matrix of all possible propagated orf to GO term mappings - #raw_go_matrix <- get_gene_go_bp_is_a_propagated_matrix(raw_orfs) - # - ## Only select GO terms with 4 to 200 genes from the lightnet - ## mapped to them - #go_term_sizes <- apply(raw_go_matrix, 2, sum) - #go_matrix <- raw_go_matrix[, go_term_sizes >= 4 & go_term_sizes <= 200] - - # Match up the gene set matrix with the prediction/query matrices: - common_genes = intersect(rownames(gene_set_matrix), colnames(best_prediction_mat)) - gene_set_matrix = gene_set_matrix[common_genes, , drop = FALSE] - best_prediction_mat <- best_prediction_mat[, common_genes, drop = FALSE] - best_query_mat <- best_query_mat[, common_genes, drop = FALSE] - best_query_name_mat = best_query_name_mat[, common_genes, drop = FALSE] - - # Remove GO terms from the gene set matrix without any annotations (must only be - # annotated from genes not in the set that is predicted against in the gene-level - # target prediction step). - nonzero_degree_gene_set_inds = colSums(gene_set_matrix) > 0 - gene_set_matrix = gene_set_matrix[, nonzero_degree_gene_set_inds, drop = FALSE] - print(sprintf('Removed %s gene sets with no annotations', sum(!nonzero_degree_gene_set_inds))) - - # Now, remove GO terms from the gene set matrix that do not meet the term size - # criteria in the configuration file. 
- term_size_min_pass = (colSums(gene_set_matrix) >= min_term_size) - term_size_max_pass = (colSums(gene_set_matrix) <= max_term_size) - gene_set_matrix = gene_set_matrix[, term_size_min_pass & term_size_max_pass, drop = FALSE] - print(sprintf('Removed %s gene sets with fewer than %s annotations', sum(!term_size_min_pass), min_term_size)) - print(sprintf('Removed %s gene sets with greater than %s annotations', sum(!term_size_max_pass), max_term_size)) - - # Get a gene set annotation table filtered to contain only the query genes from the - # selected genetic interaction network and for which the terms pass the defined - # size criteria - accepted_gene_ids = as.character(row(gene_set_matrix, as.factor = TRUE))[gene_set_matrix == 1] - accepted_gene_set_ids = as.character(col(gene_set_matrix, as.factor = TRUE))[gene_set_matrix == 1] - print(str(accepted_gene_ids)) - print(str(accepted_gene_set_ids)) - accepted_annotation_tab = data.table(accepted_gene_ids, accepted_gene_set_ids) - setnames(accepted_annotation_tab, c(gene_set_gene_id_col, gene_set_id_col)) - gene_set_tab_final = copy(gene_set_tab_full) - setkeyv(gene_set_tab_final, c(gene_set_gene_id_col, gene_set_id_col)) - gene_set_tab_final = gene_set_tab_final[accepted_annotation_tab] - print(gene_set_tab_final) - - # This is a check specific to protein complexes (couldn't get the more general - # version to work) - # print(gene_set_tab_final[, .N, by = list(complex)][, range(N)]) - - # Print out a list of the gene_sets actually used in this analysis (and the - # annotations after filtering to only include the predicted gene-level targets). - # Also, make a plot of term size distribution. 
- write.table(gene_set_tab_final, gene_set_out_tab, quote = FALSE, sep = '\t', row.names = FALSE, col.names = TRUE) - - ### Plot the gene set size distribution - setkeyv(gene_set_tab_final, gene_set_id_col) - gene_set_size_dist = gene_set_tab_final[, .N, by = gene_set_id_col] - - gene_set_size_dist_plot = ggplot(gene_set_size_dist, aes(N)) + - geom_histogram() + - scott_theme_1() + - labs(x = 'gene set size', y = 'frequency', title = sprintf('%s\ngene set size distribution', gene_set_name)) + - theme(axis.title.x = element_text(margin = margin(10, 0, 0, 0)), - axis.title.y = element_text(margin = margin(0, 10, 0, 0)), - plot.title = element_text(margin = margin(0, 0, 15, 0)) - ) + - scale_y_continuous(expand = c(0, 0)) - - pdf(gene_set_size_hist_file) - print(gene_set_size_dist_plot) - dev.off() - - # Get p values and zscores for each drug --> go combination - # This will be done on a per-GO term and a per-drug basis, and - # the per-GO computations can be performed using as many - # different control types of samples as possible. From this - # function call, one gets back all p values and z scores for - # ALL different computation methods. 
- - # First, create a vector that splits the matrix up into - # treatments and controls - # treat_control_vec <- vector('character', dim(tp_mat)[1]) - # treat_control_vec[] <- 'Treatment' - # treat_control_vec[grepl('DMSO', dimnames(tp_mat)[[1]], fixed = TRUE)] <- 'DMSO' - # treat_control_vec[grepl('Rand-by-strain', dimnames(tp_mat)[[1]], fixed = TRUE)] <- 'Rand-by-strain' - - - # test_set <- c(which(treat_control_vec == 'Treatment')[1:20], - # which(treat_control_vec == 'DMSO')[1:20], - # which(treat_control_vec == 'Rand-by-strain')[1:20] - # ) - - # Select a subset for debugging purposes - # Selects 100 of each sample type (or fewer, if fewer than - # 100 of a sample type - test_inds <- do.call(c, lapply(unique(all_control_vec), function(x) { - inds <- which(all_control_vec == x) - inds[1:(min(length(inds), 100))] - })) - - - # Is this a test to make sure everything works, or is this the real deal? - # Look for the command line option "--test_run" - if (opt$test_run) { - condition_inds <- test_inds - } else { - condition_inds <- TRUE - } - - - # stop('time to figure out what is going terribly wrong') - - ###### The following steps of code, until the end of this section, are - ###### preparation for performing the z-score and p-value computations - target_prediction_mat = best_prediction_mat[condition_inds,] - sample_type_split_vec = all_control_vec[condition_inds] - - condition_gene_score_means <- rowMeans(target_prediction_mat) - condition_gene_score_stdevs <- apply(target_prediction_mat, 1, sd) - - gene_set_sizes <- colSums(gene_set_matrix) - - condition_gene_set_pred_sum_mat <- target_prediction_mat %*% gene_set_matrix - - # sample_types <- unique(sample_type_split_vec) - - # controls <- sample_types[sample_types != treatment_sample_type] - - # Perform some subsetting to only include sample types to make - # predictions for - condition_subset_gene_set_pred_sum_mat <- condition_gene_set_pred_sum_mat[sample_type_split_vec %in% sample_types_to_predict_for, ] - 
target_prediction_subset_mat <- target_prediction_mat[sample_type_split_vec %in% sample_types_to_predict_for, ] - condition_subset_gene_score_means = condition_gene_score_means[sample_type_split_vec %in% sample_types_to_predict_for] - condition_subset_gene_score_stdevs = condition_gene_score_stdevs[sample_type_split_vec %in% sample_types_to_predict_for] - - # Change order of controls so that the first control type has the - # largest number of conditions. If they tie, order() will not change - # their order. - sample_type_counts = table(sample_type_split_vec) - control_counts = sample_type_counts[controls] - control_order = order(control_counts, decreasing = TRUE) - - # Here's the final ordering step - controls = controls[control_order] - - #control_gene_set_pred_sum_mat_list <- foreach(control = controls) %do% { - # condition_gene_set_pred_sum_mat[sample_type_split_vec %in% control, ] - #} - - control_gene_set_pred_sum_mat_list = vector('list', length(controls)) - for (i in seq_along(controls)) { - control = controls[[i]] - control_gene_set_pred_sum_mat_list[[i]] = condition_gene_set_pred_sum_mat[sample_type_split_vec == control, ] - } - names(control_gene_set_pred_sum_mat_list) = controls - - - rm(target_prediction_mat) - rm(condition_gene_set_pred_sum_mat) - - gc() - - #control_gene_set_pred_sum_means_list <- foreach(control_gene_set_pred_sum_mat = control_gene_set_pred_sum_mat_list) %do% { - # colMeans(control_gene_set_pred_sum_mat) - #} - - #control_gene_set_pred_sum_stdevs_list <- foreach(control_gene_set_pred_sum_mat = control_gene_set_pred_sum_mat_list) %do% { - # apply(control_gene_set_pred_sum_mat, 2, sd) - #} - - control_gene_set_pred_sum_means_list = lapply(control_gene_set_pred_sum_mat_list, colMeans) - control_gene_set_pred_sum_stdevs_list = lapply(control_gene_set_pred_sum_mat_list, - function(x) apply(x, 2, sd)) - - names(control_gene_set_pred_sum_means_list) = controls - names(control_gene_set_pred_sum_stdevs_list) = controls - -} - - -# Here is 
very important testing/rerunning code. If the user wants to save -# at "save point 2," then the R environment is saved here before moving on. -# If the user wants to load the data previously saved at "save point 2," -# then the previously saved R environment is loaded here. -if (2 %in% save_points) { - save_data(2) -} -if (load_point == 2) { - load_data(2) -} - - -########### Get p-values and z-scores for per-gene-set evaluations - -# If the user-specified load point is less than 3, then this -# next section of code must be run (because the data were -# loaded in somewhere before this code and must be processed -# to completion!) -if (load_point < 3) { - - if (!(is.null(neg_control_col) & zero_resampled_profiles)) { - - # Only compute per-gene-set scores if negative control and/or - # resampled profiles are present. Otherwise, skip and replace - # with a reasonable data structure. - per_gene_set_pvals_zscores = compute_per_gene_set_pvals_zscores(condition_subset_gene_set_pred_sum_mat, control_gene_set_pred_sum_mat_list, control_gene_set_pred_sum_means_list, control_gene_set_pred_sum_stdevs_list, controls) - } else { - - # Else, this list is just NULL instead. To be dealt with later. - per_gene_set_pvals_zscores = NULL - } - -} - -# Again, here is important testing/rerunning code. If the user wants to save -# at "save point 3," then the R environment is saved here before moving on. -# If the user wants to load the data previously saved at "save point 3," -# then the previously saved R environment is loaded here. -if (3 %in% save_points) { - save_data(3) -} -if (load_point == 3) { - load_data(3) -} - -########### Get p-values and z-scores for per-condition evaluations - -# If the user-specified load point is less than 4, then this -# next section of code must be run (because the data were -# loaded in somewhere before this code and must be processed -# to completion!) 
-if (load_point < 4) { - - # Settle the per-condition randomization question - num_per_condition_rand = config_params$Required_arguments$`num_per-condition_randomizations` - - # Get the seed (required input from the user) - # Seed isn't set until right before randomizations - # are performed, inside the function below. - seed = config_params$Required_arguments$`per-condition_randomization_seed` - print(seed) - if (seed == 'rand') { - seed = NULL - } else if (!is.numeric(seed)) { - stop('specified per-condition_randomization_seed is neither numeric nor "rand"') - } - - per_condition_pvals_zscores = compute_per_condition_pvals_zscores_2(target_prediction_subset_mat, condition_subset_gene_set_pred_sum_mat, gene_set_matrix, num_per_condition_rand, gene_set_sizes, condition_subset_gene_score_means, condition_subset_gene_score_stdevs, seed) - - # Combine results into a final list and remove the individual lists - all_pvals_zscores = list(per_gene_set = per_gene_set_pvals_zscores, per_condition = per_condition_pvals_zscores) - - # Remove any components of the list that are NULL. If both - # are NULL, stop the code with an error since we need at - # least one set of p-values and z-scores to move on. - all_pvals_zscores = all_pvals_zscores[!vapply(all_pvals_zscores, is.null, logical(1))] - - if (length(all_pvals_zscores) == 0) { - - stop('There are no computed p-values/z-scores, from either the per-gene-set or per-condition analyses, with which to move forward. Please check your config file and your data to ensure you either have generated resampled profiles and/or have enough negative control profiles for per-gene-set analyses, and/or you have set the number of per-condition randomizations to a value greater than zero.') - } - - if(!is.null(per_gene_set_pvals_zscores)) rm(per_gene_set_pvals_zscores) - if(!is.null(per_condition_pvals_zscores)) rm(per_condition_pvals_zscores) - gc() - -} - -# Save/load point 4! 
-if (4 %in% save_points) { - save_data(4) -} -if (load_point == 4) { - load_data(4) -} - -# Now, get all versions of p values and zscores! -#system.time(all_pvals_zscores <- compute_zscores_pvals_by_go_and_drug(target_prediction_mat = best_prediction_mat[condition_inds,], -# go_term_mat = gene_set_matrix, -# sample_type_split_vec = all_control_vec[condition_inds], -# control_types = controls, -# types_to_predict_for = sample_types_to_predict_for, -# num_per_condition_rand = num_per_condition_rand, -# load_point = load_point, -# save_points = save_points, -# gene_set_outdir = output_table_folder)) - - -######### Combine the p-values and z-scores so that there is one p-value -######### and z-score per condition X gene set pair. The largest p-value and its -######### accompanying z-score are chosen, to minimize the chance that the -######### results we call significant are actually easily observable among any -######### appropriate null models. - -# If the data are already loaded, proceed with processing! 
-if (load_point < 5) { - - print(str(all_pvals_zscores)) - print(controls) - - #print(which(is.na(all_pvals_zscores[['per_gene_set']][[controls[1]]][['pval']]), arr.ind = TRUE)) - #print(sum(is.na(all_pvals_zscores[['per_gene_set']][[controls[1]]][['pval']]))) - - #print(which(is.na(all_pvals_zscores[['per_gene_set']][[controls[1]]][['zscore']]), arr.ind = TRUE)) - #print(sum(is.na(all_pvals_zscores[['per_gene_set']][[controls[1]]][['zscore']]))) - - #bad_go_term_inds = unique(which(is.na(all_pvals_zscores[['per_gene_set']][[controls[1]]][['zscore']]), arr.ind = TRUE)[, 2]) - #print(str(gene_set_matrix[, bad_go_term_inds])) - #print(colSums(gene_set_matrix[, bad_go_term_inds])) - - #print(which(is.na(all_pvals_zscores[['per_gene_set']][[controls[2]]][['pval']]))) - #print(which(is.na(all_pvals_zscores[['per_gene_set']][[controls[2]]][['zscore']]))) - # get worst per-go pvals/zscores - if (length(all_pvals_zscores[['per_gene_set']]) == 2) { - per_gene_set_worst_case_mats <- get_worst_case_pval_zscore(all_pvals_zscores[['per_gene_set']][[controls[1]]][['pval']], - all_pvals_zscores[['per_gene_set']][[controls[1]]][['zscore']], - all_pvals_zscores[['per_gene_set']][[controls[2]]][['pval']], - all_pvals_zscores[['per_gene_set']][[controls[2]]][['zscore']], - controls[1], - controls[2] - ) - } else if (length(all_pvals_zscores[['per_gene_set']]) == 1) { - print(str(all_pvals_zscores)) - per_gene_set_worst_case_mats = list(worst_pval = all_pvals_zscores[['per_gene_set']][[controls[1]]][['pval']], - worst_zscore = all_pvals_zscores[['per_gene_set']][[controls[1]]][['zscore']], - control_name = matrix(controls[1], nrow = dim(all_pvals_zscores[['per_gene_set']][[controls[1]]][['pval']])[1], ncol = dim(all_pvals_zscores[['per_gene_set']][[controls[1]]][['pval']])[2], dimnames = dimnames(all_pvals_zscores[['per_gene_set']][[controls[1]]])) - ) - } else if (length(all_pvals_zscores[['per_gene_set']]) > 2) { - - # Since this code can now tolerate having no per_gene_set- - # 
derived predictions, it's only a problem if there are - # too many control types - more than there should be. - stop(sprintf('Not sure how this happened, as there are %s control types when there should be either 1 or 2', length(controls))) - } - - # get worst overall pvals/zscores (worst between - # per-gene-set and per-condition, but ONLY if - # there are both per_gene_set pvals/zscores and - # per_condition pvals/zscores) - if (length(all_pvals_zscores[['per_gene_set']]) == 0) { - - # If there are no per-gene-set predictions, then we just use - # the per-condition predictions. - overall_worst_case_mats = list(worst_pval = all_pvals_zscores[['per_condition']][['pval']], - worst_zscore = all_pvals_zscores[['per_condition']][['zscore']], - control_name = matrix('per_condition', nrow = dim(all_pvals_zscores[['per_condition']][['pval']])[1], ncol = dim(all_pvals_zscores[['per_condition']][['pval']])[2], dimnames = dimnames(all_pvals_zscores[['per_condition']][['pval']])) - ) - - } else if (length(all_pvals_zscores[['per_condition']]) == 0) { - - # Same thing for per-condition predictions - overall_worst_case_mats = list(worst_pval = per_gene_set_worst_case_mats[['worst_pval']], - worst_zscore = per_gene_set_worst_case_mats[['worst_zscore']], - control_name = matrix('per_gene_set', nrow = dim(all_pvals_zscores[['per_gene_set']][['pval']])[1], ncol = dim(all_pvals_zscores[['per_gene_set']][['pval']])[2], dimnames = dimnames(all_pvals_zscores[['per_gene_set']][['pval']])) - ) - } else { - - # Otherwise, there is at least one per-condition and one per-gene-set - # set of statistics, this is how to combine them. 
- overall_worst_case_mats <- get_worst_case_pval_zscore(per_gene_set_worst_case_mats$worst_pval, - per_gene_set_worst_case_mats$worst_zscore, - all_pvals_zscores$per_condition$pval, - all_pvals_zscores$per_condition$zscore, - 'per_gene_set', - 'per_condition' - ) - } - - # Specify which per-gene-set scheme was used if the worst pval/zscore - # was generated using the per-gene-set scheme. But if there are no - # per_gene_set indices, then don't do it! - if (exists('per_gene_set_worst_case_mats')) { - per_gene_set_indices <- overall_worst_case_mats$control_name == 'per_gene_set' - overall_worst_case_mats$control_name[per_gene_set_indices] <- per_gene_set_worst_case_mats$control_name[per_gene_set_indices] - } -} - -# Save/load point 5! -if (5 %in% save_points) { - save_data(5) -} -if (load_point == 5) { - load_data(5) -} - -########### Compute and assemble the drivers of the gene-set predictions - -# If the data are already loaded, proceed with processing! -if (load_point < 6) { - - # I need to subset the target prediction matrix so I can get the correct - # process prediction drivers back! - best_prediction_subset_mat = best_prediction_mat[condition_inds, ][all_control_vec[condition_inds] %in% sample_types_to_predict_for, ] - best_query_subset_mat = best_query_mat[condition_inds, ][all_control_vec[condition_inds] %in% sample_types_to_predict_for, ] - best_query_name_subset_mat = best_query_name_mat[condition_inds, ][all_control_vec[condition_inds] %in% sample_types_to_predict_for, ] - - # Here I will get a table of drivers of my go process predictions - gene_set_drivers_dt = get_gene_set_drivers(best_prediction_subset_mat, best_query_subset_mat, best_query_name_subset_mat, gene_set_matrix, cutoff = driver_cutoff) - # If the driver gene names were not specified, remove that column from the target prediction table! - if (is.null(gene_name_column)) { - gene_set_drivers_dt[, driver_name := NULL] - } - -} - -# Save/load point 6! 
-if (6 %in% save_points) { - save_data(6) -} -if (load_point == 6) { - load_data(6) -} - -## Shut down MPI cluster, if it exists -#if (ncores > 1) { -# closeCluster(clus) -#} - -# print(gene_set_drivers_dt[condition == 'SHANGHAI-1511_000088']) - -########### Table mashing and exporting! - -# If the data are already loaded, proceed with processing! -if (load_point < 7) { - - # Set the key on the go_drivers_dt so that it can be joined with the - # process prediction data.tables - setkeyv(gene_set_drivers_dt, c('condition', 'gene_set')) - - ## Melt per-gene-set only predictions into a data table - #per_gene_set_worst_case_df <- lapply(per_gene_set_worst_case_mats, as.vector) - #per_gene_set_worst_case_df <- as.data.frame(per_gene_set_worst_case_df) - # - #per_gene_set_GENE_SET <- as.character(col(per_gene_set_worst_case_mats$worst_pval, as.factor = TRUE)) - #per_gene_set_CONDITION <- as.character(row(per_gene_set_worst_case_mats$worst_pval, as.factor = TRUE)) - # - #per_go_worst_case_dt <- data.table(condition = per_gene_set_CONDITION, gene_set = per_gene_set_GENE_SET, per_gene_set_worst_case_df) - - # Melt overall only predictions into a data table - overall_worst_case_df <- lapply(overall_worst_case_mats, as.vector) - overall_worst_case_df <- as.data.frame(overall_worst_case_df) - - overall_GENE_SET <- as.character(col(overall_worst_case_mats$worst_pval, as.factor = TRUE)) - overall_CONDITION <- as.character(row(overall_worst_case_mats$worst_pval, as.factor = TRUE)) - - overall_worst_case_dt <- data.table(condition = overall_CONDITION, gene_set = overall_GENE_SET, overall_worst_case_df) - - - # Join the process score tables with the process driver table - #setkeyv(per_go_worst_case_dt, c('condition', 'gene_set')) - setkeyv(overall_worst_case_dt, c('condition', 'gene_set')) - - #per_gene_set_worst_case_dt <- gene_set_drivers_dt[per_gene_set_worst_case_dt] - overall_worst_case_dt <- gene_set_drivers_dt[overall_worst_case_dt] - - # Add a column indicating 
control/treatment category for each condition. - overall_worst_case_dt[, sample_type := all_control_vec[condition]] - - # If the user provided a table mapping gene set IDs to interpretable gene set names, - # then add those to the final table! - if (!is.null(gene_set_name_tab)) { - # setkey(per_gene_set_worst_case_dt, gene_set) - setkey(overall_worst_case_dt, gene_set) - setkey(gene_set_name_tab, gene_set) - # per_gene_set_worst_case_dt = per_gene_set_worst_case_dt[gene_set_name_tab] - overall_worst_case_dt = overall_worst_case_dt[gene_set_name_tab, nomatch = 0] - } - - # If the user provided a table mapping condition IDs to to interpretable condition names, - # then add those to the final table! - if (!is.null(condition_name_tab)) { - # setkey(per_gene_set_worst_case_dt, gene_set) - setkey(overall_worst_case_dt, condition) - setkey(condition_name_tab, condition) - # per_gene_set_worst_case_dt = per_gene_set_worst_case_dt[gene_set_name_tab] - overall_worst_case_dt = overall_worst_case_dt[condition_name_tab, nomatch = 0] - } - - - ## Check size distribution of these go terms - #go_terms <- per_go_worst_case_dt[, unique(GO)] - #go_termsizes <- colSums(go_matrix[, go_terms]) - #hist(go_termsizes, breaks = 200) - ## Comment: I think the size distribution checks out just fine :) - ## However, these terms may be slightly different from the ones - ## I have predicted in the past. I am more confident in these - ## propagations, however - - ##### Print out some diagnostic plots describing how all of the pvals/zscores - ##### relate to each other across the types of control conditions I used to - ##### derive them - #### - ##### Need to fix speed, or DO NOT RUN UNLESS YOU HAVE EXTRA TIME!!! 
- ##### Also, plots probably look fugly at this point - #####with(all_pvals_zscores$per_go, print_diagnostic_plots_pval_zscore_2_schemes(DMSO$pval, - ##### DMSO$zscore, - ##### `Rand-by-strain`$pval, - ##### `Rand-by-strain`$zscore, - ##### 'DMSO', - ##### 'Rand-by-strain', - ##### file.path(prefix, sprintf('%s_process-prediction_qc-plots', Sys.Date())) - ##### )) - ##### - #####print_diagnostic_plots_pval_zscore_2_schemes(per_go_worst_case_mats$worst_pval, - ##### per_go_worst_case_mats$worst_zscore, - ##### overall_worst_case_mats$worst_pval, - ##### overall_worst_case_mats$worst_zscore, - ##### 'per-GO', - ##### 'per-drug', - ##### file.path(prefix, sprintf('%s_process-prediction_qc-plots', Sys.Date())) - ##### ) - ##### - ##### - # - # - ## Sort each prediction table by condition, pval, and zscore - #setkeyv(per_go_worst_case_dt, c('drug')) - #per_go_worst_case_dt <- per_go_worst_case_dt[, .SD[order(-worst_pval, worst_zscore, decreasing = TRUE)], by = list(drug)] - # - setkey(overall_worst_case_dt, condition) - # overall_worst_case_dt <- overall_worst_case_dt[, .SD[order(-worst_pval, worst_zscore, decreasing = TRUE)], by = list(drug)] - overall_worst_case_dt <- overall_worst_case_dt[order(worst_pval, -worst_zscore)] - - # Set the order of the columns!!! 
(This varies depending on which other tables were provided and joined to the predictions) - print(names(overall_worst_case_dt)) - cols = c('condition', 'condition_name', 'gene_set', 'gene_set_name', 'p_value', 'z_score', 'driver_query', 'driver_name', 'driver_score', 'sample_type', 'control_name') - setnames(overall_worst_case_dt, c('worst_pval', 'worst_zscore'), c('p_value', 'z_score')) - if (is.null(gene_set_name_tab)) cols = c(cols[1:(which(cols == 'gene_set_name') - 1)], cols[(which(cols == 'gene_set_name') + 1):length(cols)]) - if (is.null(condition_name_tab)) cols = c(cols[1:(which(cols == 'condition_name') - 1)], cols[(which(cols == 'condition_name') + 1):length(cols)]) - if (is.null(gene_name_column)) cols = c(cols[1:(which(cols == 'driver_name') - 1)], cols[(which(cols == 'driver_name') + 1):length(cols)]) - setcolorder(overall_worst_case_dt, cols) - - # Write out to (compressed) table-formatted file - #outfile_per_go <- gzfile(file.path(prefix, sprintf('%s_per-go_pval-zscore.txt.gz', Sys.Date())), open = 'wt') - #write.table(per_go_worst_case_dt, file = outfile_per_go, quote = FALSE, row.names = FALSE, col.names = TRUE, sep = '\t') - #close(outfile_per_go) - - - outfile_overall <- gzfile(output_table, open = 'wb') - write.table(overall_worst_case_dt, file = outfile_overall, quote = FALSE, row.names = FALSE, col.names = TRUE, sep = '\t') - close(outfile_overall) - - # Write out a top table and a globular table for human consumption - - # First, make filenames for the top table and glob top table outfiles - #out_top_per_go <- file.path(prefix, sprintf('%s_per-go_pval-zscore_top-ten.txt.gz', Sys.Date())) - #out_top_glob_per_go <- file.path(prefix, sprintf('%s_per-go_pval-zscore_top-ten_glob.txt.gz', Sys.Date())) - ## Now export prediction tables for just per_go predictions - #top_ten_tab_per_go <- export_top_table(dat = per_go_worst_case_dt, outfile = out_top_per_go, select_expr = rank(worst_pval, ties.method = 'min') <= 10, split_by_expr = list(drug), 
order_within_by_expr = order(-worst_pval, worst_zscore, decreasing = TRUE), connection_FUN = 'gzfile') - #table_to_glob(dat = top_ten_tab_per_go, outfile = out_top_glob_per_go, by_vec = c('drug'), connection_FUN = 'gzfile') - - # More filenames - #out_top_overall <- file.path(prefix, sprintf('%s_overall_pval-zscore_top-ten.txt.gz', Sys.Date())) - #out_top_glob_overall <- file.path(prefix, sprintf('%s_overall_pval-zscore_top-ten_glob.txt.gz', Sys.Date())) - ## And, export the overall pval/zscore top table and glob top table files - #top_ten_tab_overall <- export_top_table(dat = overall_worst_case_dt, outfile = out_top_overall, select_expr = rank(worst_pval, ties.method = 'min') <= 10, split_by_expr = list(drug), order_within_by_expr = order(-worst_pval, worst_zscore, decreasing = TRUE), connection_FUN = 'gzfile') - #table_to_glob(dat = top_ten_tab_overall, outfile = out_top_glob_overall, by_vec = c('drug'), connection_FUN = 'gzfile') - -} - -# Save/load point 7! This would be purely for debugging, I think. -if (7 %in% save_points) { - save_data(7) -} -if (load_point == 7) { - load_data(7) -} - - - +#!/usr/bin/env Rscript + +################################################################# +###### Copyright: Regents of the University of Minnesota ###### +################################################################# + +## CHANGE THIS HEADING DOCUMENTATION!!!!!!!! +# +# +## This script will take in the results of chemical genomics target prediction +## and return "z scores" describing the relationship between a drug's average +## target prediction score for genes that match a GO term, and the drug's overall +## average target prediction score. 
+# +## This script exports all zscore calculations +## Analysis is done using other scripts +# + + +library(optparse) +library(yaml) +library(reshape2) +library(data.table) + +# Source in libraries for gene set target prediction +TARGET_PATH = Sys.getenv('TARGET_PATH') +source(file.path(TARGET_PATH, 'lib/go_drivers.r')) +source(file.path(TARGET_PATH, 'lib/file_naming.r')) +source(file.path(TARGET_PATH, 'lib/top_table.r')) +source(file.path(TARGET_PATH, 'lib/table_to_globular.r')) +source(file.path(TARGET_PATH, 'lib/zscore-pval_by-drug-and-goterm_fast.r')) +source(file.path(TARGET_PATH, 'lib/pos_args.r')) +source(file.path(TARGET_PATH, 'lib/filenames.r')) +source(file.path(TARGET_PATH, 'lib/common_vars.r')) +source(file.path(TARGET_PATH, 'lib/datasets.r')) +source(file.path(TARGET_PATH, 'lib/scott_themes.r')) +source(file.path(TARGET_PATH, 'lib/dummy_dataset.r')) + +# print(sessionInfo()) + +# Some function definitions... +save_data_factory = function(folder) { + # This function returns a function that saves data in a + # folder that is specified at the time of function creation. + # The resulting function only requires a "save_point" + # argument, which is x. + + force(folder) + fun = function(x) { + outfile = file.path(folder, sprintf('save_point_%s.RData', x)) + message(sprintf('\nsaving data at save point %s...', x)) + message(sprintf('location: %s\n', outfile)) + # Get a list of everything except for functions, and also + # remove the save_points and load_point variablex. We want + # both functions and save/load points to be defined for + # each run of the code. (don't want old bugs hanging + # around or wasting time on saving when the user doesn't + # want it) + # If I think about it, removing the load_point variable + # probably doesn't matter, but I'm preventing it from + # saving anyway. 
+ object_vec = setdiff(ls(envir = .GlobalEnv), lsf.str(envir = .GlobalEnv)) + object_vec = setdiff(object_vec, c('save_points', 'load_point')) + #print(object_vec) + save(list = object_vec, file = outfile, envir = .GlobalEnv) + message('done') + } + return(fun) +} + + +load_data_factory = function(folder) { + # This function returns a function that saves loads data + # from a folder that is specified at the time of function + # creation. The resulting function only requires a + # "load_point" argument, which is x. + + force(folder) + fun = function(x) { + infile = file.path(folder, sprintf('save_point_%s.RData', x)) + message(sprintf('\nloading data from save point %s...', x)) + message(sprintf('location: %s\n', infile)) + load(infile, envir = globalenv()) + message('done') + } + return(fun) +} + + +# Parse the arguments, as these need to be read regardless of which starting +# point in the code the user wants to start the analysis from. +# Note: while it is totally ok (and more or less the point) for the "load_point" +# and "save_points" arguments to change between runs on the same dataset +# (for example, stepping through the analysis, one save point at a time), +# "--test_run" only matters when no load point is specified (or it is 0). +positional_arguments_list = c(CONFIG_FILE = 'yaml-formatted gene-set prediction configuration file.') + +option_list = list(make_option('--test_run', action = 'store_true', default = FALSE, help = 'Performs predictions for a small subset of the data, to ensure that everything is working properly before the full set of predictions are made (can take days for large screens). This option only has an effect when "--load_point" is not specified or is 0. If "--test_run" has been specified and the analysis saved halfway through using "--save_points," then the analysis will still be in "test run" mode when loading that save point, and vice versa if "--test_run" was not specified. 
NOTE: the results from this run will not be accurate!!!'), + make_option('--load_point', default = 0, type = 'integer', help = 'Point in the processing at which a previously partially-processed, saved dataset should be loaded and its processing continued through the rest of the script. Possible values are in c(0, 1, 2). 0 (default): run all code from the beginning; 1: resume the analysis at per-gene-set p-value computations; 2: resume the analysis at per-condition p-value computations. While primarily for testing purposes, this could be useful during real analyses.'), + make_option('--save_points', default = '', type = 'character', help = 'Point(s) in the processing at which to save the data in their present, partially-processed form. See "--load_point" for descriptions of these points. If the user wishes to specify multiple save points, these must be comma-delimited *without* spaces (e.g. --save_points 1,2). Data are saved in "*.RData" format.') + ) +final_usage_positional = get_usage_and_positional(positional_arguments_list) + +parser = OptionParser(usage = final_usage_positional, option_list = option_list) +arguments = parse_args(parser, positional_arguments = length(positional_arguments_list)) +opt = arguments$options +arg = arguments$args + +print(opt) +print(arg) + + + +####### First, deal with the save/load point arguments +# Load 'em in... +load_point = opt$load_point +save_points_raw = opt$save_points +save_points_split = strsplit(save_points_raw, ',')[[1]] +save_points = as.integer(save_points_split) + +# Make sure no saving is supposed to happen before loading +if (any(save_points < load_point)) { + stop('Specified intermediate file save points must occur after the specified load point!') +} + +# Looks like I have to parse the config file, *just a bit*, so +# that it knows where the output directory is. Grrrrrrr... 
+config_f = file(arg[1], 'rt') +config_params = yaml.load_file(config_f) +close(config_f) + +# Validate alternative hypothesis entry in config_params +alt_options <- c("greater", "less", "two-sided") +if (!config_params$Required_arguments$alternative %in% alt_options) { + stop(sprintf("Required argument 'alternative' must be one of c('%s')", paste(alt_options, collapse = "', '"))) +} + +# Get the output folder for intermediate data +# (and create if it doesn't exist!) +output_table_folder = get_gene_set_target_prediction_folder(config_params$Required_arguments$output_folder) +intermed_folder = file.path(output_table_folder, 'intermediate_data') +dir.create(intermed_folder, recursive = TRUE) + +# Generate the save_data and load_data functions +save_data = save_data_factory(intermed_folder) +load_data = load_data_factory(intermed_folder) + +# If the user does not specify a load point (or explicitly wants +# to load from the beginning), then run the code from the +# beginning! Otherwise, all of this code will be skipped, as the +# options have already been defined and saved in the user's +# R environment from the first round of processing! + +# Now, on to the real stuff! 
+if (load_point < 1) { + + + #positional_arguments_list = c(gene_predictions = 'Gzipped file containing the gene-level target predictions for each condition.', + # rand_gene_predictions = 'Gzipped file containing the gene-level target predictions for each resampled condition.', + # gene_sets = 'Two-column, tab-delimited file (WITH HEADER) that maps gene identifiers (must have same column name as corresponding column in the query info table) to gene sets.', + # query_info_table = 'Table with a "query_key" column that maps to the query identifiers in the genetic interaction data and another column that maps to the gene identifiers in the gene_sets file.', + # driver_cutoff = 'Similarity value (similarity to the condition profile) above which a gene is reported as a "driver" of particular gene-set prediction. Default value cannot be specified because unbounded similarity measures are accepted as input.', + # output_table = 'File into which the results are written. (it is gzipped, so the extension should be *.txt.gz)' + # ) + + #option_list = list( + # make_option('--sample_table', help = 'File with a table that maps conditions to control information'), + # make_option('--neg_control_column', help = 'Name of True/False column in the sample_table that indicates which samples are to be used as negative controls in gene-set prediction. If this option is not specified, only the resampled conditions will be used for gene-set prediction.'), + # make_option('--condition_name_column', help = 'Name of a column in the sample table with human-readable condition names'), + # make_option(c('-n', '--per_condition_randomizations'), type = 'integer', help = 'The number of "per-condition" randomizations to perform. 
When not specified, the default is to perform the same number of randomizations that are present in the rand_gene_predictions file.'), + # make_option('--gene_set_names', help = 'Two-column, tab-delimited file that maps each gene set identifier to an interpretable name for gene-set prediciton output. The values in the first column must match the gene sets in the gene_sets file above. The values in the second column will be included in the final results table (along with those in the first column).'), + # make_option('--gene_name_column', help = 'Column in "query_info_table" that contains interpretable gene names. In S. cerevisiae, for example this would be the column that contains "TUB3" in the row for ORF YML124C'), + # make_option('--test_run', action = 'store_true', default = FALSE, help = 'Performs predictions for a small subset of the data, to ensure that everything is working properly before the full set of predictions are made (can take days for large screens). The results from this run will not be accurate!!!'), + # make_option('--num_cores', type = 'integer', default = 1, help = 'The number of cores used to run the gene set predictions!'), + # make_option(c('-v', '--verbosity'), type = 'integer', default = 1, help = 'The level of verbosity printed to stdout. Ranges from 0 to 3, 1, is default.') + # ) + + + ################# Load in the required files, deal with arguments + + # Note: I had concerns that not parsing the config file within + # this block of code would lead the user to be able to alter + # the config params on a saved run that was loaded halfway into + # the analysis. I now believe this is unfounded, as all variables + # will be restored to those from the config file at the time the + # analysis was initiated when the user loads all variables in + # from a specified save point. + + ####### Now for the parameters in the config file!!! 
+ output_folder = config_params$Required_arguments$output_folder + gene_pred_folder = get_gene_target_folder(output_folder) + gene_pred_file = get_gene_target_prediction_filename(gene_pred_folder, + config_params$Required_arguments$cg_gi_similarity_measure + ) + gene_prediction_tab = fread(sprintf('gzip -dc %s', gene_pred_file), header = TRUE, colClasses = c('character', 'character', 'character', 'numeric')) + + # Since the user can specify not to use the resampled profiles derived + # from the entire dataset, here I start dealing with that. The code + # will look a bit clunky, but this will be more reliable/predictable + # than the alternative of creating an empty data.table to funnel + # through all of the same steps. That will likely wreak havoc. + zero_resampled_profiles = (config_params$Required_arguments$`per-array_resampling_scheme` == 0) | (config_params$Required_arguments$`num_per-array_resampled_profiles`== 0) + + if (!zero_resampled_profiles) { + + rand_gene_pred_file = get_gene_target_prediction_resampled_filename( + gene_pred_folder, + config_params$Required_arguments$`per-array_resampling_scheme`, + config_params$Required_arguments$`num_per-array_resampled_profiles`, + config_params$Required_arguments$`per-array_resampling_seed`, + config_params$Required_arguments$cg_gi_similarity_measure + ) + rand_gene_prediction_tab = fread(sprintf('gzip -dc %s', rand_gene_pred_file), header = TRUE, colClasses = c('character', 'character', 'character', 'numeric')) + } else { + + # If there are no resampled profiles, make sure that the config_params, + # which are used later in the script, accurately reflect that. 
+ config_params$Required_arguments$`per-array_resampling_scheme` = 0 + config_params$Required_arguments$`num_per-array_resampled_profiles` = 0 + config_params$Required_arguments$`per-array_resampling_seed` = 0 + } + + sample_table_filename = config_params$Required_arguments$cg_col_info_table + sample_table = fread(sample_table_filename, header = TRUE, colClasses = 'character') + + gene_set_name = config_params$Required_arguments$gene_set_name + gi_data_name = config_params$Required_arguments$gi_dataset_name + gene_set_info = get_gene_set_info(gene_set_name, TARGET_PATH) + print(gene_set_info) + gene_set_tab_full = fread(gene_set_info$filename, header = TRUE, sep = '\t', colClasses = 'character') + gene_set_gene_id_col = names(gene_set_tab_full)[1] + gene_set_id_col = names(gene_set_tab_full)[2] + gene_set_name_col = gene_set_info$interpretable_column + print(gene_set_tab_full) + print(gene_set_id_col) + print(gene_set_name_col) + gene_set_tab = gene_set_tab_full[, c(gene_set_gene_id_col, gene_set_id_col), with = FALSE] + + # Here, I read in the min and max allowable gene set sizes + min_term_size = as.numeric(config_params$Required_arguments$min_term_size) + max_term_size = as.numeric(config_params$Required_arguments$max_term_size) + if (is.na(min_term_size) | is.na(max_term_size)) { + stop('Required arguments "min_term_size" and/or "max_term_size" not specified\nproperly. 
Please specify as a number and re-run this script.') + } + + gi_info = get_gi_info(config_params$Required_arguments$gi_dataset_name, TARGET_PATH) + #gi_tab = fread(sprintf('gzip -dc %s', gi_filenames$gi_tab)) + query_info_tab = fread(gi_info$gi_query_tab) + + #query_info_file = config_params$Required_arguments$gi_query_info_table + #query_info_tab = fread(query_info_file, header = TRUE, colClasses = 'character') + + driver_cutoff = as.numeric(config_params$Required_arguments$driver_cutoff) + if (is.na(driver_cutoff)) { stop('driver_cutoff argument must be numeric.') } + + output_table_folder = get_gene_set_target_prediction_folder(output_folder) + output_table = get_gene_set_target_prediction_filename_v2( + output_table_folder, + config_params$Required_arguments$`per-array_resampling_scheme`, + config_params$Required_arguments$`num_per-array_resampled_profiles`, + config_params$Required_arguments$`per-array_resampling_seed`, + config_params$Required_arguments$cg_gi_similarity_measure, + config_params$Required_arguments$`per-condition_randomization_seed`, + config_params$Required_arguments$`num_per-condition_randomizations`, + config_params$Required_arguments$gene_set_name, + config_params$Required_arguments$min_term_size, + config_params$Required_arguments$max_term_size, + config_params$Required_arguments$alternative, + opt$test_run + ) + + dir.create(output_table_folder, recursive = TRUE) + + gene_set_hist_folder = get_gene_set_hist_folder(output_folder) + dir.create(gene_set_hist_folder) + gene_set_size_hist_file = get_gene_set_target_prediction_size_hist_file( + gene_set_hist_folder, + config_params$Required_arguments$gene_set_name, + config_params$Required_arguments$min_term_size, + config_params$Required_arguments$max_term_size, + opt$test_run + ) + + gene_set_out_tab = get_gene_set_target_prediction_terms_filename( + gene_set_hist_folder, + config_params$Required_arguments$gene_set_name, + config_params$Required_arguments$min_term_size, + 
config_params$Required_arguments$max_term_size, + opt$test_run + ) + + # This way, `true_false_control_map` adapts to changes in `bool_vec` + init_control_map = c(`TRUE` = 'expt_control', `FALSE` = 'treatment') + true_false_control_map = init_control_map[as.character(bool_vec)] + names(true_false_control_map) = names(bool_vec) + #true_false_control_map = c('TRUE' = 'expt_control', 'FALSE' = 'treatment', 'True' = 'expt_control', 'False' = 'treatment') + + ################### Load in optional files and options + # Gene set ID to interpretable name table + # Might want to change this to "if (!gene_set_name_col == '')" instead of expecting it to be null or NA + print(gene_set_name_col) + if (!gene_set_name_col == '') { + #gene_set_name_file = config_params$Options$gene_set_target_prediction$gene_set_name_table + gene_set_name_tab = unique(gene_set_tab_full[, c(gene_set_id_col, gene_set_name_col), with = FALSE]) + print(gene_set_name_tab) + setnames(gene_set_name_tab, c('gene_set', 'gene_set_name')) + } else { + gene_set_name_tab = NULL + } + + # Note: if no column is specified containing interpretable gene names, + # then this variable will already be NULL. + gene_name_column = gi_info$query_genename_col + if (!is.null(gene_name_column)) { + if (! gene_name_column %in% names(query_info_tab)) { + stop(sprintf('column %s was specified as the column in the query info table that contains interpretable gene names, but the columns does not exist! 
query info table is here:\n%s', gene_name_column, gi_info$gi_query_tab)) + } + } + + ################ Deal with the consequences of using a dummy dataset, if applicable ############### + # (must be done before pulling other information from sample table, as this modifies the sample + # table to include dummy controls, if they exist) + dummy_name = config_params$Options$dummy_dataset$name + if (!is.null(dummy_name)) { + dummy_config_f = file(get_dummy_config_filename(dummy_name), 'rt') + dummy_config_params = yaml.load_file(dummy_config_f) + close(dummy_config_f) + dummy_dt = fread(sprintf('gzip -dc %s', file.path(get_dummy_folder(dummy_name), dummy_config_params$cg_data_table)), colClasses = c('character', 'character','character','character','numeric')) + dummy_col_tab = fread(file.path(get_dummy_folder(dummy_name), dummy_config_params$cg_col_info_tab), header = TRUE, colClasses = 'character') + + # If a column specifying negative experimental controls was specified, then use it + # to filter the dummy dataset. otherwise, do not use the dummy dataset here! + if (!is.null(config_params$Options$dummy_dataset$negative_control_column)) { + print(unique(dummy_dt[, list(Strain_ID, Barcode)], by = NULL)) + print(unique(dummy_dt[, list(screen_name, expt_id)], by = NULL)) + message(sprintf('dummy matrix dimensions before filtering for "cols_to_include": (%s, %s)', + dim(unique(dummy_dt[, list(Strain_ID, Barcode)], by = NULL))[1], + dim(unique(dummy_dt[, list(screen_name, expt_id)], by = NULL))[1])) + select_rows_dummy = bool_vec[dummy_col_tab[[config_params$Options$dummy_dataset$negative_control_column]]] + dummy_col_tab = dummy_col_tab[select_rows_dummy] + } + + # Create a final sample table with the dummy controls added in. 
+ sample_table = add_dummy_controls_to_cg_sample_tab(sample_table, + config_params$Options$gene_set_target_prediction$negative_control_column, + config_params$Options$gene_set_target_prediction$condition_name_column, + dummy_col_tab, + config_params$Options$dummy_dataset$negative_control_column, + config_params$Options$dummy_dataset$condition_name_column) + } + + ################ Read in the sample table and negative control/condition name columns, if they exist + # Handle the negative control column + neg_control_col = config_params$Options$gene_set_target_prediction$negative_control_column + if (!is.null(neg_control_col)) { + if (! neg_control_col %in% names(sample_table)) { + stop(sprintf('column %s not in cg_col_info_table!', neg_control_col)) + } + control_map = true_false_control_map[sample_table[[neg_control_col]]] + names(control_map) = sample_table[, sprintf('%s_%s', screen_name, expt_id)] + } else { + control_map = rep('treatment', dim(sample_table)[1]) + names(control_map) = sample_table[, sprintf('%s_%s', screen_name, expt_id)] + } + + # Handle the condition name column + cond_name_col = config_params$Options$gene_set_target_prediction$condition_name_column + if (!is.null(cond_name_col)) { + ### May need to revisit if the columns are split or not... + if (! cond_name_col %in% names(sample_table)) { + stop(sprintf('column %s not in cg_col_info_table!', cond_name_col)) + } + condition_name_tab = sample_table[, list(condition = sprintf('%s_%s', screen_name, expt_id), condition_name = sample_table[[cond_name_col]])] + + # The condition_tab needs to have rand-by-strain conditions added so the join + # doesn't cause us to lose all or the rand-by-strain conditions! 
+ # **But only if there is a table of rand-by-strain (resampled) gene target + # predictions!** + if (!zero_resampled_profiles) { + rand_condition_name_tab = unique(rand_gene_prediction_tab[, list(screen_name, expt_id)])[, list(condition = sprintf('%s_%s', screen_name, expt_id), condition_name = sprintf('%s_%s', screen_name, expt_id))] + condition_name_tab = rbind(condition_name_tab, rand_condition_name_tab) + } + + print(condition_name_tab) + } else { + condition_name_tab = NULL + } + + + print(sample_table) + print(control_map) + + ############ Here, convert all the tables into matrices and them remove + ############ before any workspaces can be saved (lots of unnecessary + ############ space would be taken up! + + ###### Smash some data around + gene_prediction_mat = acast(gene_prediction_tab, screen_name + expt_id ~ query_key, value.var = 'score') + if (!zero_resampled_profiles) { + rand_gene_prediction_mat = acast(rand_gene_prediction_tab, screen_name + expt_id ~ query_key, value.var = 'score') + all_prediction_mat = rbind(gene_prediction_mat, rand_gene_prediction_mat) + + ###### Line up the control vectors to the matrix with all predictions + rand_control_map = rep('rand-by-strain', dim(unique(rand_gene_prediction_tab[, list(screen_name, expt_id)]))[1]) + names(rand_control_map) = unique(rand_gene_prediction_tab[, sprintf('%s_%s', screen_name, expt_id)]) + + # Don't need these! + rm(rand_gene_prediction_tab) + rm(rand_gene_prediction_mat) + + } else { + all_prediction_mat = gene_prediction_mat + } + + # Don't need these! + rm(gene_prediction_tab) + rm(gene_prediction_mat) + + #all_prediction_tab = rbind(gene_prediction_tab, rand_gene_prediction_tab) + #all_prediction_mat = acast(all_prediction_tab, screen_name + expt_id ~ query_key, value.var = 'score') + + + gc() + +} + +# Here is very important testing/rerunning code. If the user wants to save +# at "save point 1," then the R environment is saved here before moving on. 
+# If the user wants to load the data previously saved at "save point 1," +# then the previously saved R environment is loaded here. +if (1 %in% save_points) { + save_data(1) +} +if (load_point == 1) { + load_data(1) +} + +# If the user-specified load point is less than 2, then this +# next section of code must be run (because the data were +# loaded in somewhere before this code and must be processed +# to completion!) +if (load_point < 2) { + + + # Check for real and random conditions overlapping (should NEVER happen) + # Only check if there are "random" (really, "resampled") conditions + if (!zero_resampled_profiles) { + print(control_map[1:10]) + print(rand_control_map[1:10]) + overlap_conds = intersect(names(control_map), names(rand_control_map)) + if (length(overlap_conds) > 0) { + stop(sprintf('The following real and random conditions have the same name:\n%s', paste(overlap_conds, collapse = '\n'))) + } + } + + # Combine the control maps and get one master control type vector!!! + if (!zero_resampled_profiles) { + + all_control_map = c(control_map, rand_control_map) + } else { + all_control_map = control_map + } + + all_control_vec = all_control_map[rownames(all_prediction_mat)] + print(all_control_vec) + print(sprintf('Number of %s conditions: %s', names(table(all_control_vec)), table(all_control_vec))) + + # If any sample types are still NA, filter them out of the dataset + inds_to_remove = is.na(all_control_vec) + conds_to_remove = rownames(all_prediction_mat)[inds_to_remove] + + all_control_vec = all_control_vec[!is.na(all_control_vec)] + all_prediction_mat = all_prediction_mat[!is.na(all_control_vec), ] + + # Get a matrix with one column per unique query gene (there may be multiple alleles for the same gene), + # and for each row (condition) in that column, we take the maximum prediction score against all + # alleles of the same query gene. Also, get a matrix that indicates which allele the selected + # max score came from. 
+ + # First, get the vector of query genes that define where the multiple-allele cases are. + #print(gene_set_tab) + #print(query_info_tab) + # WHY DO I DRAW THE NAME OF THE QUERY COLUMN FROM THE GENE_SET_TAB?! + # Well now I don't :) + query_column = gi_info$query_sys_name_col + query_map = query_info_tab[[query_column]] + names(query_map) = query_info_tab[['query_key']] + query_gene_vec = query_map[colnames(all_prediction_mat)] + + print(str(all_prediction_mat)) + print(str(query_gene_vec)) + print(all_prediction_mat[1:5, (dim(all_prediction_mat)[2]-9):(dim(all_prediction_mat)[2])]) + print(query_gene_vec[(length(query_gene_vec) - 9):length(query_gene_vec)]) + + # Get the matrix with the best predictions among multiple alleles + best_query_mat_list = best_score_per_col_group(all_prediction_mat, query_gene_vec) + + best_prediction_mat = best_query_mat_list[['best_scores']] + best_query_mat = best_query_mat_list[['best_queries']] + + # Get a map from query key column to an interpretable name, if the + # gene_name_column exists. Then create a matrix that matches the + # best_query_mat matrix and contains interpretable query gene + # names. If that column was not given, create a matrix of NA + # values instead. + if (!is.null(gene_name_column)) { + gene_name_map = query_info_tab[[gene_name_column]] + names(gene_name_map) = query_info_tab[['query_key']] + best_query_name_mat = matrix(gene_name_map[best_query_mat], nrow = nrow(best_query_mat), ncol = ncol(best_query_mat), dimnames = dimnames(best_query_mat)) + } else { + best_query_name_mat = matrix(NA_character_, nrow = nrow(best_query_mat), ncol = ncol(best_query_mat), dimnames = dimnames(best_query_mat)) + } + + ###### Determine which sample types exist and will be used to make predictions + # At some point, should the user be able to specify this instead of just assuming + # that everything needs to have its target predicted? 
+ sample_types_to_predict_for <- unique(all_control_vec) + + ######################### + ######################### + + ######### + ### What are the control types? (There may be experimental controls, or maybe not!) + controls <- intersect(sample_types_to_predict_for, c('expt_control', 'rand-by-strain')) + + # Reshape the gene set table into a matrix + gene_set_mat_formula = as.formula(sprintf('%s ~ %s', names(gene_set_tab)[1], names(gene_set_tab)[2])) + gene_set_tab[, annotation := 1] + gene_set_matrix = acast(data = gene_set_tab, formula = gene_set_mat_formula, value.var = 'annotation', fill = 0) + + ## Get a matrix of all possible propagated orf to GO term mappings + #raw_go_matrix <- get_gene_go_bp_is_a_propagated_matrix(raw_orfs) + # + ## Only select GO terms with 4 to 200 genes from the lightnet + ## mapped to them + #go_term_sizes <- apply(raw_go_matrix, 2, sum) + #go_matrix <- raw_go_matrix[, go_term_sizes >= 4 & go_term_sizes <= 200] + + # Match up the gene set matrix with the prediction/query matrices: + common_genes = intersect(rownames(gene_set_matrix), colnames(best_prediction_mat)) + gene_set_matrix = gene_set_matrix[common_genes, , drop = FALSE] + best_prediction_mat <- best_prediction_mat[, common_genes, drop = FALSE] + best_query_mat <- best_query_mat[, common_genes, drop = FALSE] + best_query_name_mat = best_query_name_mat[, common_genes, drop = FALSE] + + # Remove GO terms from the gene set matrix without any annotations (must only be + # annotated from genes not in the set that is predicted against in the gene-level + # target prediction step). + nonzero_degree_gene_set_inds = colSums(gene_set_matrix) > 0 + gene_set_matrix = gene_set_matrix[, nonzero_degree_gene_set_inds, drop = FALSE] + print(sprintf('Removed %s gene sets with no annotations', sum(!nonzero_degree_gene_set_inds))) + + # Now, remove GO terms from the gene set matrix that do not meet the term size + # criteria in the configuration file. 
+ term_size_min_pass = (colSums(gene_set_matrix) >= min_term_size) + term_size_max_pass = (colSums(gene_set_matrix) <= max_term_size) + gene_set_matrix = gene_set_matrix[, term_size_min_pass & term_size_max_pass, drop = FALSE] + print(sprintf('Removed %s gene sets with fewer than %s annotations', sum(!term_size_min_pass), min_term_size)) + print(sprintf('Removed %s gene sets with greater than %s annotations', sum(!term_size_max_pass), max_term_size)) + + # Get a gene set annotation table filtered to contain only the query genes from the + # selected genetic interaction network and for which the terms pass the defined + # size criteria + accepted_gene_ids = as.character(row(gene_set_matrix, as.factor = TRUE))[gene_set_matrix == 1] + accepted_gene_set_ids = as.character(col(gene_set_matrix, as.factor = TRUE))[gene_set_matrix == 1] + print(str(accepted_gene_ids)) + print(str(accepted_gene_set_ids)) + accepted_annotation_tab = data.table(accepted_gene_ids, accepted_gene_set_ids) + setnames(accepted_annotation_tab, c(gene_set_gene_id_col, gene_set_id_col)) + gene_set_tab_final = copy(gene_set_tab_full) + setkeyv(gene_set_tab_final, c(gene_set_gene_id_col, gene_set_id_col)) + gene_set_tab_final = gene_set_tab_final[accepted_annotation_tab] + print(gene_set_tab_final) + + # This is a check specific to protein complexes (couldn't get the more general + # version to work) + # print(gene_set_tab_final[, .N, by = list(complex)][, range(N)]) + + # Print out a list of the gene_sets actually used in this analysis (and the + # annotations after filtering to only include the predicted gene-level targets). + # Also, make a plot of term size distribution. 
+ write.table(gene_set_tab_final, gene_set_out_tab, quote = FALSE, sep = '\t', row.names = FALSE, col.names = TRUE) + + ### Plot the gene set size distribution + setkeyv(gene_set_tab_final, gene_set_id_col) + gene_set_size_dist = gene_set_tab_final[, .N, by = gene_set_id_col] + + gene_set_size_dist_plot = ggplot(gene_set_size_dist, aes(N)) + + geom_histogram() + + scott_theme_1() + + labs(x = 'gene set size', y = 'frequency', title = sprintf('%s\ngene set size distribution', gene_set_name)) + + theme(axis.title.x = element_text(margin = margin(10, 0, 0, 0)), + axis.title.y = element_text(margin = margin(0, 10, 0, 0)), + plot.title = element_text(margin = margin(0, 0, 15, 0)) + ) + + scale_y_continuous(expand = c(0, 0)) + + pdf(gene_set_size_hist_file) + print(gene_set_size_dist_plot) + dev.off() + + # Get p values and zscores for each drug --> go combination + # This will be done on a per-GO term and a per-drug basis, and + # the per-GO computations can be performed using as many + # different control types of samples as possible. From this + # function call, one gets back all p values and z scores for + # ALL different computation methods. 
+ + # First, create a vector that splits the matrix up into + # treatments and controls + # treat_control_vec <- vector('character', dim(tp_mat)[1]) + # treat_control_vec[] <- 'Treatment' + # treat_control_vec[grepl('DMSO', dimnames(tp_mat)[[1]], fixed = TRUE)] <- 'DMSO' + # treat_control_vec[grepl('Rand-by-strain', dimnames(tp_mat)[[1]], fixed = TRUE)] <- 'Rand-by-strain' + + + # test_set <- c(which(treat_control_vec == 'Treatment')[1:20], + # which(treat_control_vec == 'DMSO')[1:20], + # which(treat_control_vec == 'Rand-by-strain')[1:20] + # ) + + # Select a subset for debugging purposes + # Selects 100 of each sample type (or fewer, if fewer than + # 100 of a sample type + test_inds <- do.call(c, lapply(unique(all_control_vec), function(x) { + inds <- which(all_control_vec == x) + inds[1:(min(length(inds), 100))] + })) + + + # Is this a test to make sure everything works, or is this the real deal? + # Look for the command line option "--test_run" + if (opt$test_run) { + condition_inds <- test_inds + } else { + condition_inds <- TRUE + } + + + # stop('time to figure out what is going terribly wrong') + + ###### The following steps of code, until the end of this section, are + ###### preparation for performing the z-score and p-value computations + target_prediction_mat = best_prediction_mat[condition_inds,] + sample_type_split_vec = all_control_vec[condition_inds] + + condition_gene_score_means <- rowMeans(target_prediction_mat) + condition_gene_score_stdevs <- apply(target_prediction_mat, 1, sd) + + gene_set_sizes <- colSums(gene_set_matrix) + + condition_gene_set_pred_sum_mat <- target_prediction_mat %*% gene_set_matrix + + # sample_types <- unique(sample_type_split_vec) + + # controls <- sample_types[sample_types != treatment_sample_type] + + # Perform some subsetting to only include sample types to make + # predictions for + condition_subset_gene_set_pred_sum_mat <- condition_gene_set_pred_sum_mat[sample_type_split_vec %in% sample_types_to_predict_for, ] + 
target_prediction_subset_mat <- target_prediction_mat[sample_type_split_vec %in% sample_types_to_predict_for, ] + condition_subset_gene_score_means = condition_gene_score_means[sample_type_split_vec %in% sample_types_to_predict_for] + condition_subset_gene_score_stdevs = condition_gene_score_stdevs[sample_type_split_vec %in% sample_types_to_predict_for] + + # Change order of controls so that the first control type has the + # largest number of conditions. If they tie, order() will not change + # their order. + sample_type_counts = table(sample_type_split_vec) + control_counts = sample_type_counts[controls] + control_order = order(control_counts, decreasing = TRUE) + + # Here's the final ordering step + controls = controls[control_order] + + #control_gene_set_pred_sum_mat_list <- foreach(control = controls) %do% { + # condition_gene_set_pred_sum_mat[sample_type_split_vec %in% control, ] + #} + + control_gene_set_pred_sum_mat_list = vector('list', length(controls)) + for (i in seq_along(controls)) { + control = controls[[i]] + control_gene_set_pred_sum_mat_list[[i]] = condition_gene_set_pred_sum_mat[sample_type_split_vec == control, ] + } + names(control_gene_set_pred_sum_mat_list) = controls + + + rm(target_prediction_mat) + rm(condition_gene_set_pred_sum_mat) + + gc() + + #control_gene_set_pred_sum_means_list <- foreach(control_gene_set_pred_sum_mat = control_gene_set_pred_sum_mat_list) %do% { + # colMeans(control_gene_set_pred_sum_mat) + #} + + #control_gene_set_pred_sum_stdevs_list <- foreach(control_gene_set_pred_sum_mat = control_gene_set_pred_sum_mat_list) %do% { + # apply(control_gene_set_pred_sum_mat, 2, sd) + #} + + control_gene_set_pred_sum_means_list = lapply(control_gene_set_pred_sum_mat_list, colMeans) + control_gene_set_pred_sum_stdevs_list = lapply(control_gene_set_pred_sum_mat_list, + function(x) apply(x, 2, sd)) + + names(control_gene_set_pred_sum_means_list) = controls + names(control_gene_set_pred_sum_stdevs_list) = controls + +} + + +# Here is 
very important testing/rerunning code. If the user wants to save +# at "save point 2," then the R environment is saved here before moving on. +# If the user wants to load the data previously saved at "save point 2," +# then the previously saved R environment is loaded here. +if (2 %in% save_points) { + save_data(2) +} +if (load_point == 2) { + load_data(2) +} + + +########### Get p-values and z-scores for per-gene-set evaluations + +# If the user-specified load point is less than 3, then this +# next section of code must be run (because the data were +# loaded in somewhere before this code and must be processed +# to completion!) +if (load_point < 3) { + + if (!(is.null(neg_control_col) & zero_resampled_profiles)) { + + # Only compute per-gene-set scores if negative control and/or + # resampled profiles are present. Otherwise, skip and replace + # with a reasonable data structure. + per_gene_set_pvals_zscores = compute_per_gene_set_pvals_zscores(condition_subset_gene_set_pred_sum_mat, control_gene_set_pred_sum_mat_list, control_gene_set_pred_sum_means_list, control_gene_set_pred_sum_stdevs_list, controls, config_params$Required_arguments$alternative) + } else { + + # Else, this list is just NULL instead. To be dealt with later. + per_gene_set_pvals_zscores = NULL + } + +} + +# Again, here is important testing/rerunning code. If the user wants to save +# at "save point 3," then the R environment is saved here before moving on. +# If the user wants to load the data previously saved at "save point 3," +# then the previously saved R environment is loaded here. +if (3 %in% save_points) { + save_data(3) +} +if (load_point == 3) { + load_data(3) +} + +########### Get p-values and z-scores for per-condition evaluations + +# If the user-specified load point is less than 4, then this +# next section of code must be run (because the data were +# loaded in somewhere before this code and must be processed +# to completion!) 
+if (load_point < 4) {
+
+    # Settle the per-condition randomization question
+    num_per_condition_rand = config_params$Required_arguments$`num_per-condition_randomizations`
+
+    # Get the seed (required input from the user)
+    # Seed isn't set until right before randomizations
+    # are performed, inside the function below.
+    seed = config_params$Required_arguments$`per-condition_randomization_seed`
+    # NOTE(review): looks like a leftover debug print — confirm it is intended output
+    print(seed)
+    # 'rand' means "no fixed seed": NULL is passed through so the
+    # downstream function does not call set.seed().
+    if (seed == 'rand') {
+        seed = NULL
+    } else if (!is.numeric(seed)) {
+        stop('specified per-condition_randomization_seed is neither numeric nor "rand"')
+    }
+
+    per_condition_pvals_zscores = compute_per_condition_pvals_zscores_2(target_prediction_subset_mat, condition_subset_gene_set_pred_sum_mat, gene_set_matrix, num_per_condition_rand, gene_set_sizes, condition_subset_gene_score_means, condition_subset_gene_score_stdevs, seed, config_params$Required_arguments$alternative)
+
+    # Combine results into a final list and remove the individual lists
+    all_pvals_zscores = list(per_gene_set = per_gene_set_pvals_zscores, per_condition = per_condition_pvals_zscores)
+
+    # Remove any components of the list that are NULL. If both
+    # are NULL, stop the code with an error since we need at
+    # least one set of p-values and z-scores to move on.
+    all_pvals_zscores = all_pvals_zscores[!vapply(all_pvals_zscores, is.null, logical(1))]
+
+    if (length(all_pvals_zscores) == 0) {
+
+        stop('There are no computed p-values/z-scores, from either the per-gene-set or per-condition analyses, with which to move forward. Please check your config file and your data to ensure you either have generated resampled profiles and/or have enough negative control profiles for per-gene-set analyses, and/or you have set the number of per-condition randomizations to a value greater than zero.')
+    }
+
+    # The per-analysis lists now live inside all_pvals_zscores; drop the
+    # originals to reclaim memory.
+    if(!is.null(per_gene_set_pvals_zscores)) rm(per_gene_set_pvals_zscores)
+    if(!is.null(per_condition_pvals_zscores)) rm(per_condition_pvals_zscores)
+    gc()
+
+}
+
+# Save/load point 4!
+if (4 %in% save_points) {
+    save_data(4)
+}
+if (load_point == 4) {
+    load_data(4)
+}
+
+# Now, get all versions of p values and zscores!
+#system.time(all_pvals_zscores <- compute_zscores_pvals_by_go_and_drug(target_prediction_mat = best_prediction_mat[condition_inds,],
+#                                                                      go_term_mat = gene_set_matrix,
+#                                                                      sample_type_split_vec = all_control_vec[condition_inds],
+#                                                                      control_types = controls,
+#                                                                      types_to_predict_for = sample_types_to_predict_for,
+#                                                                      num_per_condition_rand = num_per_condition_rand,
+#                                                                      load_point = load_point,
+#                                                                      save_points = save_points,
+#                                                                      gene_set_outdir = output_table_folder))
+
+
+######### Combine the p-values and z-scores so that there is one p-value
+######### and z-score per condition X gene set pair. The largest p-value and its
+######### accompanying z-score are chosen, to minimize the chance that the
+######### results we call significant are actually easily observable among any
+######### appropriate null models.
+
+# If the data are already loaded, proceed with processing!
+if (load_point < 5) {
+
+    print(str(all_pvals_zscores))
+    print(controls)
+
+    #print(which(is.na(all_pvals_zscores[['per_gene_set']][[controls[1]]][['pval']]), arr.ind = TRUE))
+    #print(sum(is.na(all_pvals_zscores[['per_gene_set']][[controls[1]]][['pval']])))
+
+    #print(which(is.na(all_pvals_zscores[['per_gene_set']][[controls[1]]][['zscore']]), arr.ind = TRUE))
+    #print(sum(is.na(all_pvals_zscores[['per_gene_set']][[controls[1]]][['zscore']])))
+
+    #bad_go_term_inds = unique(which(is.na(all_pvals_zscores[['per_gene_set']][[controls[1]]][['zscore']]), arr.ind = TRUE)[, 2])
+    #print(str(gene_set_matrix[, bad_go_term_inds]))
+    #print(colSums(gene_set_matrix[, bad_go_term_inds]))
+
+    #print(which(is.na(all_pvals_zscores[['per_gene_set']][[controls[2]]][['pval']])))
+    #print(which(is.na(all_pvals_zscores[['per_gene_set']][[controls[2]]][['zscore']])))
+    # Get worst per-gene-set pvals/zscores across the control types
+    if (length(all_pvals_zscores[['per_gene_set']]) == 2) {
+        per_gene_set_worst_case_mats <- get_worst_case_pval_zscore(all_pvals_zscores[['per_gene_set']][[controls[1]]][['pval']],
+                                                                   all_pvals_zscores[['per_gene_set']][[controls[1]]][['zscore']],
+                                                                   all_pvals_zscores[['per_gene_set']][[controls[2]]][['pval']],
+                                                                   all_pvals_zscores[['per_gene_set']][[controls[2]]][['zscore']],
+                                                                   controls[1],
+                                                                   controls[2],
+                                                                   config_params$Required_arguments$alternative
+                                                                   )
+    } else if (length(all_pvals_zscores[['per_gene_set']]) == 1) {
+        print(str(all_pvals_zscores))
+        # Only one control type: its matrices are already the worst case.
+        # BUGFIX: take dim/dimnames for the control_name matrix from the
+        # 'pval' matrix itself, not from the enclosing per-control list
+        # (dimnames() on a list returns NULL, matching the other branches).
+        per_gene_set_worst_case_mats = list(worst_pval = all_pvals_zscores[['per_gene_set']][[controls[1]]][['pval']],
+                                            worst_zscore = all_pvals_zscores[['per_gene_set']][[controls[1]]][['zscore']],
+                                            control_name = matrix(controls[1], nrow = dim(all_pvals_zscores[['per_gene_set']][[controls[1]]][['pval']])[1], ncol = dim(all_pvals_zscores[['per_gene_set']][[controls[1]]][['pval']])[2], dimnames = dimnames(all_pvals_zscores[['per_gene_set']][[controls[1]]][['pval']]))
+                                            )
+    } else if (length(all_pvals_zscores[['per_gene_set']]) > 2) {
+
+        # Since this code can now tolerate having no per_gene_set-
+        # derived predictions, it's only a problem if there are
+        # too many control types - more than there should be.
+        stop(sprintf('Not sure how this happened, as there are %s control types when there should be either 1 or 2', length(controls)))
+    }
+
+    # get worst overall pvals/zscores (worst between
+    # per-gene-set and per-condition, but ONLY if
+    # there are both per_gene_set pvals/zscores and
+    # per_condition pvals/zscores)
+    if (length(all_pvals_zscores[['per_gene_set']]) == 0) {
+
+        # If there are no per-gene-set predictions, then we just use
+        # the per-condition predictions.
+        overall_worst_case_mats = list(worst_pval = all_pvals_zscores[['per_condition']][['pval']],
+                                       worst_zscore = all_pvals_zscores[['per_condition']][['zscore']],
+                                       control_name = matrix('per_condition', nrow = dim(all_pvals_zscores[['per_condition']][['pval']])[1], ncol = dim(all_pvals_zscores[['per_condition']][['pval']])[2], dimnames = dimnames(all_pvals_zscores[['per_condition']][['pval']]))
+                                       )
+
+    } else if (length(all_pvals_zscores[['per_condition']]) == 0) {
+
+        # Same thing when there are no per-condition predictions.
+        # BUGFIX: size the control_name matrix from the already-computed
+        # per-gene-set worst-case pval matrix. The per_gene_set element is
+        # a list keyed by control type, so it has no 'pval' element of its
+        # own — the old dim(...[['per_gene_set']][['pval']]) was NULL and
+        # matrix(nrow = NA) would error here.
+        overall_worst_case_mats = list(worst_pval = per_gene_set_worst_case_mats[['worst_pval']],
+                                       worst_zscore = per_gene_set_worst_case_mats[['worst_zscore']],
+                                       control_name = matrix('per_gene_set', nrow = dim(per_gene_set_worst_case_mats[['worst_pval']])[1], ncol = dim(per_gene_set_worst_case_mats[['worst_pval']])[2], dimnames = dimnames(per_gene_set_worst_case_mats[['worst_pval']]))
+                                       )
+    } else {
+
+        # Otherwise, there is at least one per-condition and one per-gene-set
+        # set of statistics, this is how to combine them.
+        overall_worst_case_mats <- get_worst_case_pval_zscore(per_gene_set_worst_case_mats$worst_pval,
+                                                              per_gene_set_worst_case_mats$worst_zscore,
+                                                              all_pvals_zscores$per_condition$pval,
+                                                              all_pvals_zscores$per_condition$zscore,
+                                                              'per_gene_set',
+                                                              'per_condition',
+                                                              config_params$Required_arguments$alternative
+                                                              )
+    }
+
+    # Specify which per-gene-set scheme was used if the worst pval/zscore
+    # was generated using the per-gene-set scheme. But if there are no
+    # per_gene_set indices, then don't do it!
+    if (exists('per_gene_set_worst_case_mats')) {
+        # Replace the generic 'per_gene_set' label with the actual control
+        # type that produced the worst-case value at each position.
+        per_gene_set_indices <- overall_worst_case_mats$control_name == 'per_gene_set'
+        overall_worst_case_mats$control_name[per_gene_set_indices] <- per_gene_set_worst_case_mats$control_name[per_gene_set_indices]
+    }
+}
+
+# Save/load point 5!
+if (5 %in% save_points) {
+    save_data(5)
+}
+if (load_point == 5) {
+    load_data(5)
+}
+
+########### Compute and assemble the drivers of the gene-set predictions
+
+# If the data are already loaded, proceed with processing!
+if (load_point < 6) {
+
+    # I need to subset the target prediction matrix so I can get the correct
+    # process prediction drivers back!
+    best_prediction_subset_mat = best_prediction_mat[condition_inds, ][all_control_vec[condition_inds] %in% sample_types_to_predict_for, ]
+    best_query_subset_mat = best_query_mat[condition_inds, ][all_control_vec[condition_inds] %in% sample_types_to_predict_for, ]
+    best_query_name_subset_mat = best_query_name_mat[condition_inds, ][all_control_vec[condition_inds] %in% sample_types_to_predict_for, ]
+
+    # Here I will get a table of drivers of my gene-set predictions
+    gene_set_drivers_dt = get_gene_set_drivers(best_prediction_subset_mat, best_query_subset_mat, best_query_name_subset_mat, gene_set_matrix, cutoff = driver_cutoff)
+    # If the driver gene names were not specified, remove that column from the target prediction table!
+    if (is.null(gene_name_column)) {
+        gene_set_drivers_dt[, driver_name := NULL]
+    }
+
+}
+
+# Save/load point 6!
+if (6 %in% save_points) {
+    save_data(6)
+}
+if (load_point == 6) {
+    load_data(6)
+}
+
+## Shut down MPI cluster, if it exists
+#if (ncores > 1) {
+#    closeCluster(clus)
+#}
+
+# print(gene_set_drivers_dt[condition == 'SHANGHAI-1511_000088'])
+
+########### Table mashing and exporting!
+
+# If the data are already loaded, proceed with processing!
+if (load_point < 7) {
+
+    # Set the key on the gene_set_drivers_dt so that it can be joined with
+    # the process prediction data.tables
+    setkeyv(gene_set_drivers_dt, c('condition', 'gene_set'))
+
+    ## Melt per-gene-set only predictions into a data table
+    #per_gene_set_worst_case_df <- lapply(per_gene_set_worst_case_mats, as.vector)
+    #per_gene_set_worst_case_df <- as.data.frame(per_gene_set_worst_case_df)
+    #
+    #per_gene_set_GENE_SET <- as.character(col(per_gene_set_worst_case_mats$worst_pval, as.factor = TRUE))
+    #per_gene_set_CONDITION <- as.character(row(per_gene_set_worst_case_mats$worst_pval, as.factor = TRUE))
+    #
+    #per_go_worst_case_dt <- data.table(condition = per_gene_set_CONDITION, gene_set = per_gene_set_GENE_SET, per_gene_set_worst_case_df)
+
+    # Melt overall predictions into a data table: one row per
+    # condition X gene set pair, using the matrix dimnames as labels.
+    overall_worst_case_df <- lapply(overall_worst_case_mats, as.vector)
+    overall_worst_case_df <- as.data.frame(overall_worst_case_df)
+
+    overall_GENE_SET <- as.character(col(overall_worst_case_mats$worst_pval, as.factor = TRUE))
+    overall_CONDITION <- as.character(row(overall_worst_case_mats$worst_pval, as.factor = TRUE))
+
+    overall_worst_case_dt <- data.table(condition = overall_CONDITION, gene_set = overall_GENE_SET, overall_worst_case_df)
+
+
+    # Join the process score tables with the process driver table
+    #setkeyv(per_go_worst_case_dt, c('condition', 'gene_set'))
+    setkeyv(overall_worst_case_dt, c('condition', 'gene_set'))
+
+    #per_gene_set_worst_case_dt <- gene_set_drivers_dt[per_gene_set_worst_case_dt]
+    overall_worst_case_dt <- gene_set_drivers_dt[overall_worst_case_dt]
+
+    # Add a column indicating control/treatment category for each condition.
+    # NOTE(review): this indexes all_control_vec by the character 'condition'
+    # column, which assumes all_control_vec is named by condition ID — confirm.
+    overall_worst_case_dt[, sample_type := all_control_vec[condition]]
+
+    # If the user provided a table mapping gene set IDs to interpretable gene set names,
+    # then add those to the final table!
+    if (!is.null(gene_set_name_tab)) {
+        # setkey(per_gene_set_worst_case_dt, gene_set)
+        setkey(overall_worst_case_dt, gene_set)
+        setkey(gene_set_name_tab, gene_set)
+        # per_gene_set_worst_case_dt = per_gene_set_worst_case_dt[gene_set_name_tab]
+        overall_worst_case_dt = overall_worst_case_dt[gene_set_name_tab, nomatch = 0]
+    }
+
+    # If the user provided a table mapping condition IDs to interpretable condition names,
+    # then add those to the final table!
+    if (!is.null(condition_name_tab)) {
+        # setkey(per_gene_set_worst_case_dt, gene_set)
+        setkey(overall_worst_case_dt, condition)
+        setkey(condition_name_tab, condition)
+        # per_gene_set_worst_case_dt = per_gene_set_worst_case_dt[gene_set_name_tab]
+        overall_worst_case_dt = overall_worst_case_dt[condition_name_tab, nomatch = 0]
+    }
+
+
+    ## Check size distribution of these go terms
+    #go_terms <- per_go_worst_case_dt[, unique(GO)]
+    #go_termsizes <- colSums(go_matrix[, go_terms])
+    #hist(go_termsizes, breaks = 200)
+    ## Comment: I think the size distribution checks out just fine :)
+    ## However, these terms may be slightly different from the ones
+    ## I have predicted in the past. I am more confident in these
+    ## propagations, however
+
+    ##### Print out some diagnostic plots describing how all of the pvals/zscores
+    ##### relate to each other across the types of control conditions I used to
+    ##### derive them
+    ####
+    ##### Need to fix speed, or DO NOT RUN UNLESS YOU HAVE EXTRA TIME!!!
+    ##### Also, plots probably look fugly at this point
+    #####with(all_pvals_zscores$per_go, print_diagnostic_plots_pval_zscore_2_schemes(DMSO$pval,
+    #####                                                                            DMSO$zscore,
+    #####                                                                            `Rand-by-strain`$pval,
+    #####                                                                            `Rand-by-strain`$zscore,
+    #####                                                                            'DMSO',
+    #####                                                                            'Rand-by-strain',
+    #####                                                                            file.path(prefix, sprintf('%s_process-prediction_qc-plots', Sys.Date()))
+    #####                                                                            ))
+    #####
+    #####print_diagnostic_plots_pval_zscore_2_schemes(per_go_worst_case_mats$worst_pval,
+    #####                                             per_go_worst_case_mats$worst_zscore,
+    #####                                             overall_worst_case_mats$worst_pval,
+    #####                                             overall_worst_case_mats$worst_zscore,
+    #####                                             'per-GO',
+    #####                                             'per-drug',
+    #####                                             file.path(prefix, sprintf('%s_process-prediction_qc-plots', Sys.Date()))
+    #####                                             )
+    #####
+    #####
+    #
+    #
+    ## Sort each prediction table by condition, pval, and zscore
+    #setkeyv(per_go_worst_case_dt, c('drug'))
+    #per_go_worst_case_dt <- per_go_worst_case_dt[, .SD[order(-worst_pval, worst_zscore, decreasing = TRUE)], by = list(drug)]
+    #
+    setkey(overall_worst_case_dt, condition)
+    # overall_worst_case_dt <- overall_worst_case_dt[, .SD[order(-worst_pval, worst_zscore, decreasing = TRUE)], by = list(drug)]
+    # Most significant predictions first: ascending p-value, descending z-score
+    overall_worst_case_dt <- overall_worst_case_dt[order(worst_pval, -worst_zscore)]
+
+    # Set the order of the columns!!! (This varies depending on which other tables were provided and joined to the predictions)
+    print(names(overall_worst_case_dt))
+    cols = c('condition', 'condition_name', 'gene_set', 'gene_set_name', 'p_value', 'z_score', 'driver_query', 'driver_name', 'driver_score', 'sample_type', 'control_name')
+    setnames(overall_worst_case_dt, c('worst_pval', 'worst_zscore'), c('p_value', 'z_score'))
+    # Drop optional columns from the ordering when their source tables were
+    # not provided. setdiff() preserves the relative order of the remaining
+    # columns and, unlike the old cols[1:(i-1)]/cols[(i+1):length(cols)]
+    # splice, cannot run past the end of the vector.
+    if (is.null(gene_set_name_tab)) cols = setdiff(cols, 'gene_set_name')
+    if (is.null(condition_name_tab)) cols = setdiff(cols, 'condition_name')
+    if (is.null(gene_name_column)) cols = setdiff(cols, 'driver_name')
+    setcolorder(overall_worst_case_dt, cols)
+
+    # Write out to (compressed) table-formatted file
+    #outfile_per_go <- gzfile(file.path(prefix, sprintf('%s_per-go_pval-zscore.txt.gz', Sys.Date())), open = 'wt')
+    #write.table(per_go_worst_case_dt, file = outfile_per_go, quote = FALSE, row.names = FALSE, col.names = TRUE, sep = '\t')
+    #close(outfile_per_go)
+
+
+    # BUGFIX: open the gzip connection in text mode ('wt'), not binary
+    # ('wb') — write.table() writes text and cannot write to a binary-mode
+    # connection (the commented-out export above correctly used 'wt').
+    outfile_overall <- gzfile(output_table, open = 'wt')
+    write.table(overall_worst_case_dt, file = outfile_overall, quote = FALSE, row.names = FALSE, col.names = TRUE, sep = '\t')
+    close(outfile_overall)
+
+    # Write out a top table and a globular table for human consumption
+
+    # First, make filenames for the top table and glob top table outfiles
+    #out_top_per_go <- file.path(prefix, sprintf('%s_per-go_pval-zscore_top-ten.txt.gz', Sys.Date()))
+    #out_top_glob_per_go <- file.path(prefix, sprintf('%s_per-go_pval-zscore_top-ten_glob.txt.gz', Sys.Date()))
+    ## Now export prediction tables for just per_go predictions
+    #top_ten_tab_per_go <- export_top_table(dat = per_go_worst_case_dt, outfile = out_top_per_go, select_expr = rank(worst_pval, ties.method = 'min') <= 10, split_by_expr = list(drug), order_within_by_expr = order(-worst_pval, worst_zscore, decreasing = TRUE), connection_FUN = 'gzfile')
+    #table_to_glob(dat = top_ten_tab_per_go, outfile = out_top_glob_per_go, by_vec = c('drug'), connection_FUN = 'gzfile')
+
+    # More filenames
+    #out_top_overall <- file.path(prefix, sprintf('%s_overall_pval-zscore_top-ten.txt.gz', Sys.Date()))
+    #out_top_glob_overall <- file.path(prefix, sprintf('%s_overall_pval-zscore_top-ten_glob.txt.gz', Sys.Date()))
+    ## And, export the overall pval/zscore top table and glob top table files
+    #top_ten_tab_overall <- export_top_table(dat = overall_worst_case_dt, outfile = out_top_overall, select_expr = rank(worst_pval, ties.method = 'min') <= 10, split_by_expr = list(drug), order_within_by_expr = order(-worst_pval, worst_zscore, decreasing = TRUE), connection_FUN = 'gzfile')
+    #table_to_glob(dat = top_ten_tab_overall, outfile = out_top_glob_overall, by_vec = c('drug'), connection_FUN = 'gzfile')
+
+}
+
+# Save/load point 7! This would be purely for debugging, I think.
+if (7 %in% save_points) {
+    save_data(7)
+}
+if (load_point == 7) {
+    load_data(7)
+}
+
+