From ab8b6307bd3bf42891e46c1e9c8eb5767a80a6e3 Mon Sep 17 00:00:00 2001 From: stemangiola Date: Wed, 17 Jun 2020 10:17:52 +1000 Subject: [PATCH 1/7] update gitignore --- .gitignore | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.gitignore b/.gitignore index 0a0275d..18dc1f5 100644 --- a/.gitignore +++ b/.gitignore @@ -13,5 +13,9 @@ dev/false_positive_study.RData dev/false_positive_study.pdf dev/*rds dev/*rda +dev/*csv +dev/*xlsx +dev/*RData +dev/*txt dev/approximation_bias_correction_cache* temp* From e931417e2df5a18eb57527ea61e1c601e01962a2 Mon Sep 17 00:00:00 2001 From: stemangiola Date: Wed, 17 Jun 2020 11:47:56 +1000 Subject: [PATCH 2/7] isolate rbinom function --- R/utilities.R | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/R/utilities.R b/R/utilities.R index 8473ab9..405e330 100644 --- a/R/utilities.R +++ b/R/utilities.R @@ -677,14 +677,11 @@ fit_to_counts_rng_approximated = function(fit, adj_prob_theshold, how_many_poste CI = map( data, ~ { - .x_supersampled = .x %>% sample_n(how_many_posterior_draws, replace = T) - draws = rnbinom(n =how_many_posterior_draws, mu = exp(.x_supersampled$mu + .x_supersampled$exposure), size = 1/exp(.x_supersampled$sigma) * truncation_compensation ) - draws %>% - # Process quantile - quantile(c(adj_prob_theshold, 1 - adj_prob_theshold)) %>% - tibble::as_tibble(rownames="prop") %>% - tidyr::spread(prop, value) %>% - setNames(c(".lower", ".upper")) %>% + get_CI_semi_analytically_rnbinom( + .x, + adj_prob_theshold, + how_many_posterior_draws + ) %>% # Add mean and sd dplyr::mutate(mean = mean(draws), sd = sd(draws)) } @@ -698,6 +695,16 @@ fit_to_counts_rng_approximated = function(fit, adj_prob_theshold, how_many_poste mutate(S = as.integer(S), G = as.integer(G)) + +get_CI_semi_analytically_rnbinom = function(.x, .quantile, how_many_posterior_draws){ + .x_supersampled = .x %>% sample_n(how_many_posterior_draws, replace = T) + draws = rnbinom(n =how_many_posterior_draws, mu = exp(.x_supersampled$mu + .x_supersampled$exposure), size = 1/exp(.x_supersampled$sigma) * truncation_compensation ) + draws %>% + # Process quantile + quantile(c(.quantile, 1 - .quantile)) %>% + tibble::as_tibble(rownames="prop") %>% + tidyr::spread(prop, value) %>% + setNames(c(".lower", ".upper")) } save_generated_quantities_in_case = function(.data, fit, save_generated_quantities){ From 50d42e797ab17e0088367781f64a0b7600925b3b Mon Sep 17 00:00:00 2001 From: stemangiola Date: Wed, 17 Jun 2020 12:21:07 +1000 Subject: [PATCH 3/7] added pnbinom approach --- R/utilities.R | 50 ++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 46 insertions(+), 4 deletions(-) diff --git a/R/utilities.R b/R/utilities.R index 405e330..b55f089 100644 --- a/R/utilities.R +++ b/R/utilities.R @@ -677,13 +677,15 @@ fit_to_counts_rng_approximated = function(fit, adj_prob_theshold, how_many_poste CI = map( data, ~ { - get_CI_semi_analytically_rnbinom( + get_CI_semi_analytically_pnbinom( .x, - adj_prob_theshold, - how_many_posterior_draws + adj_prob_theshold ) %>% # Add mean and sd - dplyr::mutate(mean = mean(draws), sd = sd(draws)) + dplyr::mutate( + mean = mean(exp(.x$mu + .x$exposure)), + sd = mean(1/exp(.x$sigma) * truncation_compensation) + ) } ) ) %>% @@ -696,6 +698,46 @@ fit_to_counts_rng_approximated = function(fit, adj_prob_theshold, how_many_poste + + } + +get_CI_semi_analytically_pnbinom_core = function(.x, .quantile){ + ab<-range( + qnbinom( + .quantile, + mu = exp(.x$mu + .x$exposure), + size = 1/exp(.x$sigma) * truncation_compensation + ) + ); + + if(sum(ab) == 0) return(0); + + opt<-optim( + mean(ab), + function (x) + sapply( + x, + function(p) + ((.quantile)-mean(pnbinom(p, mu = exp(.x$mu + .x$exposure), + size = 1/exp(.x$sigma) * truncation_compensation )))^2 + ), + method='Brent', + lower=ab[1], + upper=ab[2] + ) + opt$par +} + +get_CI_semi_analytically_pnbinom = function(.x, .quantile){ + + tibble( + .lower = .x %>% get_CI_semi_analytically_pnbinom_core(.quantile), + .upper = .x %>% get_CI_semi_analytically_pnbinom_core(1-.quantile) + ) + + +} + get_CI_semi_analytically_rnbinom = function(.x, .quantile, how_many_posterior_draws){ .x_supersampled = .x %>% sample_n(how_many_posterior_draws, replace = T) draws = rnbinom(n =how_many_posterior_draws, mu = exp(.x_supersampled$mu + .x_supersampled$exposure), size = 1/exp(.x_supersampled$sigma) * truncation_compensation ) From 525f5c8f06a860b506ac3ea069fbf4926a3f1923 Mon Sep 17 00:00:00 2001 From: stemangiola Date: Wed, 17 Jun 2020 13:22:12 +1000 Subject: [PATCH 4/7] allow future map --- R/utilities.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/utilities.R b/R/utilities.R index b55f089..474a3af 100644 --- a/R/utilities.R +++ b/R/utilities.R @@ -674,7 +674,7 @@ fit_to_counts_rng_approximated = function(fit, adj_prob_theshold, how_many_poste left_join(draws_exposure, by = c(".draw", "S")) %>% nest(data = -c(S, G)) %>% mutate( - CI = map( + CI = future_map( data, ~ { get_CI_semi_analytically_pnbinom( From 9b504a5e70565860b2f8992490e90c16178b0241 Mon Sep 17 00:00:00 2001 From: stemangiola Date: Mon, 2 Nov 2020 11:12:08 +1100 Subject: [PATCH 5/7] fix optim procedure --- R/ppcseq.R | 4 +- R/utilities.R | 108 ++++++++++++++++++++------------- inst/stan/negBinomial_MPI.stan | 1 + 3 files changed, 68 insertions(+), 45 deletions(-) diff --git a/R/ppcseq.R b/R/ppcseq.R index 0ec98d0..84395d8 100644 --- a/R/ppcseq.R +++ b/R/ppcseq.R @@ -483,7 +483,7 @@ do_inference = function(my_df, as_matrix() %>% t # Get the matrix of the idexes of the outlier data points - # to explude from the model if it is the second passage + # to exclude from the model if it is the second passage to_exclude_MPI = get_outlier_data_to_exlude(counts_MPI, to_exclude, shards) # Package data @@ -547,7 +547,7 @@ do_inference = function(my_df, ifelse_pipe( approximate_posterior_analysis, - ~ .x %>% fit_to_counts_rng_approximated(adj_prob_theshold, how_many_posterior_draws * 10, truncation_compensation, cores), + ~ .x %>% fit_to_counts_rng_approximated(adj_prob_theshold, how_many_posterior_draws * 10, truncation_compensation, cores, how_many_to_check), ~ .x %>% fit_to_counts_rng(adj_prob_theshold) ) %>% diff --git a/R/utilities.R b/R/utilities.R index 474a3af..059438d 100644 --- a/R/utilities.R +++ b/R/utilities.R @@ -525,8 +525,6 @@ format_results = function(.data, formula, .transcript, .abundance, .sample, do_c ) } - - #' Select only significant genes plus background for efficient normalisation #' #' @importFrom rstan sampling @@ -564,7 +562,6 @@ select_to_check_and_house_keeping = function(.data, .do_check, .significance, .t } } - #' add_exposure_rate #' #' @importFrom tidyr separate @@ -655,58 +652,83 @@ fit_to_counts_rng = function(fit, adj_prob_theshold){ #' #' @export -fit_to_counts_rng_approximated = function(fit, adj_prob_theshold, how_many_posterior_draws, truncation_compensation, cores){ +fit_to_counts_rng_approximated = function(fit, adj_prob_theshold, how_many_posterior_draws, truncation_compensation, cores, how_many_to_check){ writeLines(sprintf("executing %s", "fit_to_counts_rng_approximated")) - draws_mu = - fit %>% extract("lambda_log_param") %>% `[[` (1) %>% as.data.frame() %>% setNames(sprintf("mu.%s", colnames(.))) %>% - as_tibble() %>% mutate(.draw = 1:n()) %>% gather(par, mu, -.draw) %>% separate(par, c("par", "S", "G"), sep="\\.") %>% select(-par) - draws_sigma = - fit %>% extract("sigma_raw") %>% `[[` (1) %>% as.data.frame() %>% setNames(sprintf("sigma.%s", colnames(.) %>% gsub("V", "", .))) %>% - as_tibble() %>% mutate(.draw = 1:n()) %>% gather(par, sigma, -.draw) %>% separate(par, c("par", "G"), sep="\\.") %>% select(-par) - draws_exposure = - fit %>% extract("exposure_rate") %>% `[[` (1) %>% as.data.frame() %>% setNames(sprintf("exposure.%s", colnames(.) %>% gsub("V", "", .))) %>% - as_tibble() %>% mutate(.draw = 1:n()) %>% gather(par, exposure, -.draw) %>% separate(par, c("par", "S"), sep="\\.") %>% select(-par) - - draws_mu %>% - left_join(draws_sigma, by = c(".draw", "G")) %>% - left_join(draws_exposure, by = c(".draw", "S")) %>% - nest(data = -c(S, G)) %>% - mutate( - CI = future_map( - data, - ~ { - get_CI_semi_analytically_pnbinom( - .x, - adj_prob_theshold - ) %>% - # Add mean and sd - dplyr::mutate( - mean = mean(exp(.x$mu + .x$exposure)), - sd = mean(1/exp(.x$sigma) * truncation_compensation) - ) - } - ) - ) %>% - select(-data) %>% - unnest(CI) %>% + draws_mu = fit %>% extract("lambda_log_param") %>% .[[1]] %>% .[,,1:how_many_to_check] - # Adapt to old dataset - mutate(.variable = "counts_rng") %>% - mutate(S = as.integer(S), G = as.integer(G)) + # %>% as.data.frame() %>% setNames(sprintf("mu.%s", colnames(.))) %>% + # as_tibble() %>% mutate(.draw = 1:n()) %>% gather(par, mu, -.draw) %>% separate(par, c("par", "S", "G"), sep="\\.") %>% select(-par) + draws_sigma = fit %>% extract("sigma_raw") %>% .[[1]] %>% .[,1:how_many_to_check] + # %>% as.data.frame() %>% setNames(sprintf("sigma.%s", colnames(.) %>% gsub("V", "", .))) %>% + # as_tibble() %>% mutate(.draw = 1:n()) %>% gather(par, sigma, -.draw) %>% separate(par, c("par", "G"), sep="\\.") %>% select(-par) + draws_exposure = fit %>% extract("exposure_rate") %>% .[[1]] - } + # %>% as.data.frame() %>% setNames(sprintf("exposure.%s", colnames(.) %>% gsub("V", "", .))) %>% + # as_tibble() %>% mutate(.draw = 1:n()) %>% gather(par, exposure, -.draw) %>% separate(par, c("par", "S"), sep="\\.") %>% select(-par) + + expand_grid(S = 1:(dim(draws_mu)[2]), G = 1:(dim(draws_mu)[3])) %>% + mutate(truncation_compensation = !!truncation_compensation) %>% + mutate( + CI = pmap( + list( S,G, truncation_compensation), + ~ tibble( + mu = draws_mu[,..1, ..2], + sigma = draws_sigma[,..2] , + exposure = draws_exposure[,..1], + truncation_compensation = ..3 + ) %>% + get_CI_semi_analytically_pnbinom( adj_prob_theshold ) %>% + + # Add mean and sd + dplyr::mutate( + mean = mean(exp(draws_mu[,..1, ..2] + draws_exposure[,..1])), + sd = mean(1/exp(draws_sigma[,..2]) * truncation_compensation) + ) + ) + ) %>% + mutate(.variable = "counts_rng") %>% + unnest(CI) + + # draws_mu %>% + # left_join(draws_sigma, by = c(".draw", "G")) %>% + # left_join(draws_exposure, by = c(".draw", "S")) %>% + # nest(data = -c(S, G)) %>% + # mutate( + # CI = future_map( + # data, + # ~ { + # get_CI_semi_analytically_pnbinom( + # .x, + # adj_prob_theshold + # ) %>% + # # Add mean and sd + # dplyr::mutate( + # mean = mean(exp(.x$mu + .x$exposure)), + # sd = mean(1/exp(.x$sigma) * truncation_compensation) + # ) + # } + # ) + # ) %>% + # select(-data) %>% + # unnest(CI) %>% + # + # # Adapt to old dataset + # mutate(.variable = "counts_rng") %>% + # mutate(S = as.integer(S), G = as.integer(G)) + +} get_CI_semi_analytically_pnbinom_core = function(.x, .quantile){ ab<-range( qnbinom( .quantile, mu = exp(.x$mu + .x$exposure), - size = 1/exp(.x$sigma) * truncation_compensation + size = 1/exp(.x$sigma) * .x$truncation_compensation ) ); @@ -719,7 +741,7 @@ get_CI_semi_analytically_pnbinom_core = function(.x, .quantile){ x, function(p) ((.quantile)-mean(pnbinom(p, mu = exp(.x$mu + .x$exposure), - size = 1/exp(.x$sigma) * truncation_compensation )))^2 + size = 1/exp(.x$sigma) * .x$truncation_compensation )))^2 ), method='Brent', lower=ab[1], @@ -740,7 +762,7 @@ get_CI_semi_analytically_pnbinom = function(.x, .quantile){ get_CI_semi_analytically_rnbinom = function(.x, .quantile, how_many_posterior_draws){ .x_supersampled = .x %>% sample_n(how_many_posterior_draws, replace = T) - draws = rnbinom(n =how_many_posterior_draws, mu = exp(.x_supersampled$mu + .x_supersampled$exposure), size = 1/exp(.x_supersampled$sigma) * truncation_compensation ) + draws = rnbinom(n =how_many_posterior_draws, mu = exp(.x_supersampled$mu + .x_supersampled$exposure), size = 1/exp(.x_supersampled$sigma) * .x$truncation_compensation ) draws %>% # Process quantile quantile(c(.quantile, 1 - .quantile)) %>% diff --git a/inst/stan/negBinomial_MPI.stan b/inst/stan/negBinomial_MPI.stan index b00dafe..2baa956 100644 --- a/inst/stan/negBinomial_MPI.stan +++ b/inst/stan/negBinomial_MPI.stan @@ -112,6 +112,7 @@ functions{ } matrix merge_coefficients(row_vector intercept, row_vector alpha_sub_1, matrix alpha_2, int C, int S, int G){ + // Here I build the coefficient matrix appending the house keeping genes matrix[C,G] my_alpha; From ca1bd6c65d11e8d2f17338e915c9481c3dba46a8 Mon Sep 17 00:00:00 2001 From: stemangiola Date: Mon, 2 Nov 2020 13:58:40 +1100 Subject: [PATCH 6/7] avoid recomputation for discovery run --- R/ppcseq.R | 26 ++++++++++++++++---------- R/utilities.R | 4 ++-- 2 files changed, 18 insertions(+), 12 deletions(-) diff --git a/R/ppcseq.R b/R/ppcseq.R index 84395d8..c6863f1 100644 --- a/R/ppcseq.R +++ b/R/ppcseq.R @@ -64,7 +64,8 @@ identify_outliers = function(.data, tol_rel_obj = 0.01, just_discovery = F, seed = sample(1:99999, size = 1), - adj_prob_theshold_2 = NULL + adj_prob_theshold_2 = NULL, + return_fit = FALSE ) { # Prepare column same enquo .sample = enquo(.sample) @@ -281,14 +282,19 @@ identify_outliers = function(.data, merge_results( # Calculate CI 2 for discovery for plotting - res_discovery %>% - left_join( - (.) %>% - attr("fit") %>% - fit_to_counts_rng_approximated(adj_prob_theshold_2, how_many_posterior_draws_2, truncation_compensation = 0.7352941, cores) %>% - select(S, G, .lower_1 = .lower, .upper_1 = .upper) - ), - res_test, formula, + res_discovery, + + # Just used for article comparative analyses between 1 and 2 steps + # %>% + # left_join( + # (.) %>% + # attr("fit") %>% + # fit_to_counts_rng_approximated(adj_prob_theshold_2, how_many_posterior_draws_2, truncation_compensation = 0.7352941, cores) %>% + # select(S, G, .lower_1 = .lower, .upper_1 = .upper) + # ), + + res_test, + formula, !!.transcript, !!.abundance, !!.sample, @@ -296,7 +302,7 @@ identify_outliers = function(.data, ) %>% # Add fit attribute if any - add_attr(res_discovery %>% attr("fit"), "fit 1") %>% + add_attr(res_discovery %>% attr("fit"), "fit 1") %>% add_attr(res_test %>% attr("fit"), "fit 2") %>% # Add total draws diff --git a/R/utilities.R b/R/utilities.R index 059438d..e7190ef 100644 --- a/R/utilities.R +++ b/R/utilities.R @@ -448,8 +448,8 @@ merge_results = function(res_discovery, res_test, formula, .transcript, .abundan !!.abundance, !!.sample, mean, - `.lower_1`, - `.upper_1`, + # `.lower_1`, + # `.upper_1`, `exposure rate`, slope_1 = slope, one_of(parse_formula(formula)) From 2eaa20f6a169b2ba6eb8ef48a155c55ad1cc06c8 Mon Sep 17 00:00:00 2001 From: stemangiola Date: Mon, 2 Nov 2020 13:58:53 +1100 Subject: [PATCH 7/7] dont save fit --- R/ppcseq.R | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/R/ppcseq.R b/R/ppcseq.R index c6863f1..5af8588 100644 --- a/R/ppcseq.R +++ b/R/ppcseq.R @@ -301,9 +301,13 @@ identify_outliers = function(.data, do_check_only_on_detrimental ) %>% - # Add fit attribute if any + # If return_fit + when( + return_fit ~ (.) %>% add_attr(res_discovery %>% attr("fit"), "fit 1") %>% - add_attr(res_test %>% attr("fit"), "fit 2") %>% + add_attr(res_test %>% attr("fit"), "fit 2"), + ~ (.) + ) %>% # Add total draws add_attr(res_test %>% attr("total_draws"), "total_draws")