From ab8b6307bd3bf42891e46c1e9c8eb5767a80a6e3 Mon Sep 17 00:00:00 2001
From: stemangiola <mangiolastefano@gmail.com>
Date: Wed, 17 Jun 2020 10:17:52 +1000
Subject: [PATCH 1/7] update gitignore

---
 .gitignore | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/.gitignore b/.gitignore
index 0a0275d..18dc1f5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -13,5 +13,9 @@ dev/false_positive_study.RData
 dev/false_positive_study.pdf
 dev/*rds
 dev/*rda
+dev/*csv
+dev/*xlsx
+dev/*RData
+dev/*txt
 dev/approximation_bias_correction_cache*
 temp*

From e931417e2df5a18eb57527ea61e1c601e01962a2 Mon Sep 17 00:00:00 2001
From: stemangiola <mangiolastefano@gmail.com>
Date: Wed, 17 Jun 2020 11:47:56 +1000
Subject: [PATCH 2/7] isolate rbinom function

---
 R/utilities.R | 23 +++++++++++++++--------
 1 file changed, 15 insertions(+), 8 deletions(-)

diff --git a/R/utilities.R b/R/utilities.R
index 8473ab9..405e330 100644
--- a/R/utilities.R
+++ b/R/utilities.R
@@ -677,14 +677,11 @@ fit_to_counts_rng_approximated = function(fit, adj_prob_theshold, how_many_poste
 			CI = map(
 				data,
 				~ {
-					.x_supersampled = .x %>%	sample_n(how_many_posterior_draws, replace = T)
-					draws = rnbinom(n =how_many_posterior_draws,	mu = exp(.x_supersampled$mu + .x_supersampled$exposure),	size = 1/exp(.x_supersampled$sigma) * truncation_compensation	)
-					draws %>%
-						# Process quantile
-						quantile(c(adj_prob_theshold, 1 - adj_prob_theshold)) %>%
-				  	tibble::as_tibble(rownames="prop") %>%
-						tidyr::spread(prop, value) %>%
-						setNames(c(".lower", ".upper")) %>%
+					get_CI_semi_analytically_rnbinom(
+						.x,
+						adj_prob_theshold,
+						how_many_posterior_draws
+					) %>%
 						# Add mean and sd
 						dplyr::mutate(mean = mean(draws), sd = sd(draws))
 				}
@@ -698,6 +695,16 @@ fit_to_counts_rng_approximated = function(fit, adj_prob_theshold, how_many_poste
 		mutate(S = as.integer(S), G = as.integer(G))
 
 
+
+get_CI_semi_analytically_rnbinom = function(.x, .quantile, how_many_posterior_draws){
+	.x_supersampled = .x %>%	sample_n(how_many_posterior_draws, replace = T)
+	draws = rnbinom(n =how_many_posterior_draws,	mu = exp(.x_supersampled$mu + .x_supersampled$exposure),	size = 1/exp(.x_supersampled$sigma) * truncation_compensation	)
+	draws %>%
+		# Process quantile
+		quantile(c(.quantile, 1 - .quantile)) %>%
+		tibble::as_tibble(rownames="prop") %>%
+		tidyr::spread(prop, value) %>%
+		setNames(c(".lower", ".upper"))
 }
 
 save_generated_quantities_in_case = function(.data, fit, save_generated_quantities){

From 50d42e797ab17e0088367781f64a0b7600925b3b Mon Sep 17 00:00:00 2001
From: stemangiola <mangiolastefano@gmail.com>
Date: Wed, 17 Jun 2020 12:21:07 +1000
Subject: [PATCH 3/7] added pnbinom approach

---
 R/utilities.R | 50 ++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 46 insertions(+), 4 deletions(-)

diff --git a/R/utilities.R b/R/utilities.R
index 405e330..b55f089 100644
--- a/R/utilities.R
+++ b/R/utilities.R
@@ -677,13 +677,15 @@ fit_to_counts_rng_approximated = function(fit, adj_prob_theshold, how_many_poste
 			CI = map(
 				data,
 				~ {
-					get_CI_semi_analytically_rnbinom(
+					get_CI_semi_analytically_pnbinom(
 						.x,
-						adj_prob_theshold,
-						how_many_posterior_draws
+						adj_prob_theshold
 					) %>%
 						# Add mean and sd
-						dplyr::mutate(mean = mean(draws), sd = sd(draws))
+						dplyr::mutate(
+							mean = mean(exp(.x$mu + .x$exposure)),
+							sd = mean(1/exp(.x$sigma) * truncation_compensation)
+						)
 				}
 			)
 		) %>%
@@ -696,6 +698,46 @@ fit_to_counts_rng_approximated = function(fit, adj_prob_theshold, how_many_poste
 
 
 
+
+	}
+
+get_CI_semi_analytically_pnbinom_core = function(.x, .quantile){
+	ab<-range(
+		qnbinom(
+			.quantile,
+			mu = exp(.x$mu + .x$exposure),
+			size = 1/exp(.x$sigma) * truncation_compensation
+		)
+	);
+
+	if(sum(ab) == 0) return(0);
+
+	opt<-optim(
+		mean(ab),
+		function (x)
+			sapply(
+				x,
+				function(p)
+					((.quantile)-mean(pnbinom(p, 	mu = exp(.x$mu + .x$exposure),
+																		size = 1/exp(.x$sigma) * truncation_compensation )))^2
+			),
+		method='Brent',
+		lower=ab[1],
+		upper=ab[2]
+	)
+	opt$par
+}
+
+get_CI_semi_analytically_pnbinom = function(.x, .quantile){
+
+	tibble(
+		.lower = .x %>% get_CI_semi_analytically_pnbinom_core(.quantile),
+		.upper = .x %>% get_CI_semi_analytically_pnbinom_core(1-.quantile)
+	)
+
+
+}
+
 get_CI_semi_analytically_rnbinom = function(.x, .quantile, how_many_posterior_draws){
 	.x_supersampled = .x %>%	sample_n(how_many_posterior_draws, replace = T)
 	draws = rnbinom(n =how_many_posterior_draws,	mu = exp(.x_supersampled$mu + .x_supersampled$exposure),	size = 1/exp(.x_supersampled$sigma) * truncation_compensation	)

From 525f5c8f06a860b506ac3ea069fbf4926a3f1923 Mon Sep 17 00:00:00 2001
From: stemangiola <mangiolastefano@gmail.com>
Date: Wed, 17 Jun 2020 13:22:12 +1000
Subject: [PATCH 4/7] allow future map

---
 R/utilities.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/utilities.R b/R/utilities.R
index b55f089..474a3af 100644
--- a/R/utilities.R
+++ b/R/utilities.R
@@ -674,7 +674,7 @@ fit_to_counts_rng_approximated = function(fit, adj_prob_theshold, how_many_poste
 		left_join(draws_exposure, by = c(".draw", "S")) %>%
 		nest(data = -c(S, G)) %>%
 		mutate(
-			CI = map(
+			CI = future_map(
 				data,
 				~ {
 					get_CI_semi_analytically_pnbinom(

From 9b504a5e70565860b2f8992490e90c16178b0241 Mon Sep 17 00:00:00 2001
From: stemangiola <mangiolastefano@gmail.com>
Date: Mon, 2 Nov 2020 11:12:08 +1100
Subject: [PATCH 5/7] fix optim procedure

---
 R/ppcseq.R                     |   4 +-
 R/utilities.R                  | 108 ++++++++++++++++++++-------------
 inst/stan/negBinomial_MPI.stan |   1 +
 3 files changed, 68 insertions(+), 45 deletions(-)

diff --git a/R/ppcseq.R b/R/ppcseq.R
index 0ec98d0..84395d8 100644
--- a/R/ppcseq.R
+++ b/R/ppcseq.R
@@ -483,7 +483,7 @@ do_inference = function(my_df,
 		as_matrix() %>% t
 
 	# Get the matrix of the idexes of the outlier data points
-	# to explude from the model if it is the second passage
+	# to exclude from the model if it is the second passage
 	to_exclude_MPI = get_outlier_data_to_exlude(counts_MPI, to_exclude, shards)
 
 	# Package data
@@ -547,7 +547,7 @@ do_inference = function(my_df,
 
 		ifelse_pipe(
 			approximate_posterior_analysis,
-			~ .x %>% fit_to_counts_rng_approximated(adj_prob_theshold, how_many_posterior_draws * 10, truncation_compensation, cores),
+			~ .x %>% fit_to_counts_rng_approximated(adj_prob_theshold, how_many_posterior_draws * 10, truncation_compensation, cores, how_many_to_check),
 			~ .x %>% fit_to_counts_rng(adj_prob_theshold)
 		) %>%
 
diff --git a/R/utilities.R b/R/utilities.R
index 474a3af..059438d 100644
--- a/R/utilities.R
+++ b/R/utilities.R
@@ -525,8 +525,6 @@ format_results = function(.data, formula, .transcript, .abundance, .sample, do_c
 		)
 }
 
-
-
 #' Select only significant genes plus background for efficient normalisation
 #'
 #' @importFrom rstan sampling
@@ -564,7 +562,6 @@ select_to_check_and_house_keeping = function(.data, .do_check, .significance, .t
 		}
 }
 
-
 #' add_exposure_rate
 #'
 #' @importFrom tidyr separate
@@ -655,58 +652,83 @@ fit_to_counts_rng = function(fit, adj_prob_theshold){
 #'
 #' @export
 
-fit_to_counts_rng_approximated = function(fit, adj_prob_theshold, how_many_posterior_draws, truncation_compensation, cores){
+fit_to_counts_rng_approximated = function(fit, adj_prob_theshold, how_many_posterior_draws, truncation_compensation, cores, how_many_to_check){
 
 	writeLines(sprintf("executing %s", "fit_to_counts_rng_approximated"))
 
-	draws_mu =
-		fit %>% extract("lambda_log_param") %>% `[[` (1) %>% as.data.frame() %>% setNames(sprintf("mu.%s", colnames(.))) %>%
-		as_tibble() %>% mutate(.draw = 1:n()) %>% gather(par, mu, -.draw) %>% separate(par, c("par", "S", "G"), sep="\\.") %>% select(-par)
-	draws_sigma =
-		fit %>% extract("sigma_raw") %>% `[[` (1) %>% as.data.frame() %>% setNames(sprintf("sigma.%s", colnames(.) %>% gsub("V", "", .))) %>%
-		as_tibble() %>% mutate(.draw = 1:n()) %>% gather(par, sigma, -.draw) %>% separate(par, c("par", "G"), sep="\\.") %>% select(-par)
-	draws_exposure =
-		fit %>% extract("exposure_rate") %>% `[[` (1) %>% as.data.frame() %>% setNames(sprintf("exposure.%s", colnames(.) %>% gsub("V", "", .))) %>%
-		as_tibble() %>% mutate(.draw = 1:n()) %>% gather(par, exposure, -.draw) %>% separate(par, c("par", "S"), sep="\\.") %>% select(-par)
-
-	draws_mu %>%
-		left_join(draws_sigma, by = c(".draw", "G")) %>%
-		left_join(draws_exposure, by = c(".draw", "S")) %>%
-		nest(data = -c(S, G)) %>%
-		mutate(
-			CI = future_map(
-				data,
-				~ {
-					get_CI_semi_analytically_pnbinom(
-						.x,
-						adj_prob_theshold
-					) %>%
-						# Add mean and sd
-						dplyr::mutate(
-							mean = mean(exp(.x$mu + .x$exposure)),
-							sd = mean(1/exp(.x$sigma) * truncation_compensation)
-						)
-				}
-			)
-		) %>%
-		select(-data) %>%
-		unnest(CI) %>%
+	draws_mu = fit %>% extract("lambda_log_param") %>% .[[1]] %>% .[,,1:how_many_to_check]
 
-		# Adapt to old dataset
-		mutate(.variable = "counts_rng") %>%
-		mutate(S = as.integer(S), G = as.integer(G))
+	# %>% as.data.frame() %>% setNames(sprintf("mu.%s", colnames(.))) %>%
+	# 	as_tibble() %>% mutate(.draw = 1:n()) %>% gather(par, mu, -.draw) %>% separate(par, c("par", "S", "G"), sep="\\.") %>% select(-par)
 
+	draws_sigma = fit %>% extract("sigma_raw") %>% .[[1]] %>% .[,1:how_many_to_check]
 
+	# %>% as.data.frame() %>% setNames(sprintf("sigma.%s", colnames(.) %>% gsub("V", "", .))) %>%
+	# 	as_tibble() %>% mutate(.draw = 1:n()) %>% gather(par, sigma, -.draw) %>% separate(par, c("par", "G"), sep="\\.") %>% select(-par)
 
+	draws_exposure = 	fit %>% extract("exposure_rate") %>% .[[1]]
 
-	}
+	# %>% as.data.frame() %>% setNames(sprintf("exposure.%s", colnames(.) %>% gsub("V", "", .))) %>%
+	# 	as_tibble() %>% mutate(.draw = 1:n()) %>% gather(par, exposure, -.draw) %>% separate(par, c("par", "S"), sep="\\.") %>% select(-par)
+
+	expand_grid(S = 1:(dim(draws_mu)[2]), G = 1:(dim(draws_mu)[3])) %>%
+		mutate(truncation_compensation = !!truncation_compensation) %>%
+		mutate(
+			CI = pmap(
+				list(  S,G, truncation_compensation),
+				~ tibble(
+					mu = draws_mu[,..1, ..2],
+					sigma = draws_sigma[,..2]	,
+					exposure = draws_exposure[,..1],
+					truncation_compensation = ..3
+				) %>%
+				get_CI_semi_analytically_pnbinom(	adj_prob_theshold	) %>%
+
+				# Add mean and sd
+				dplyr::mutate(
+					mean = mean(exp(draws_mu[,..1, ..2] + draws_exposure[,..1])),
+					sd = mean(1/exp(draws_sigma[,..2]) * truncation_compensation)
+				)
+			)
+		) %>%
+		mutate(.variable = "counts_rng") %>%
+		unnest(CI)
+
+	# draws_mu %>%
+	# 	left_join(draws_sigma, by = c(".draw", "G")) %>%
+	# 	left_join(draws_exposure, by = c(".draw", "S")) %>%
+	# 	nest(data = -c(S, G)) %>%
+	# 	mutate(
+	# 		CI = future_map(
+	# 			data,
+	# 			~ {
+	# 				get_CI_semi_analytically_pnbinom(
+	# 					.x,
+	# 					adj_prob_theshold
+	# 				) %>%
+	# 					# Add mean and sd
+	# 					dplyr::mutate(
+	# 						mean = mean(exp(.x$mu + .x$exposure)),
+	# 						sd = mean(1/exp(.x$sigma) * truncation_compensation)
+	# 					)
+	# 			}
+	# 		)
+	# 	) %>%
+	# 	select(-data) %>%
+	# 	unnest(CI) %>%
+	#
+	# 	# Adapt to old dataset
+	# 	mutate(.variable = "counts_rng") %>%
+	# 	mutate(S = as.integer(S), G = as.integer(G))
+
+}
 
 get_CI_semi_analytically_pnbinom_core = function(.x, .quantile){
 	ab<-range(
 		qnbinom(
 			.quantile,
 			mu = exp(.x$mu + .x$exposure),
-			size = 1/exp(.x$sigma) * truncation_compensation
+			size = 1/exp(.x$sigma) * .x$truncation_compensation
 		)
 	);
 
@@ -719,7 +741,7 @@ get_CI_semi_analytically_pnbinom_core = function(.x, .quantile){
 				x,
 				function(p)
 					((.quantile)-mean(pnbinom(p, 	mu = exp(.x$mu + .x$exposure),
-																		size = 1/exp(.x$sigma) * truncation_compensation )))^2
+																		size = 1/exp(.x$sigma) * .x$truncation_compensation )))^2
 			),
 		method='Brent',
 		lower=ab[1],
@@ -740,7 +762,7 @@ get_CI_semi_analytically_pnbinom = function(.x, .quantile){
 
 get_CI_semi_analytically_rnbinom = function(.x, .quantile, how_many_posterior_draws){
 	.x_supersampled = .x %>%	sample_n(how_many_posterior_draws, replace = T)
-	draws = rnbinom(n =how_many_posterior_draws,	mu = exp(.x_supersampled$mu + .x_supersampled$exposure),	size = 1/exp(.x_supersampled$sigma) * truncation_compensation	)
+	draws = rnbinom(n =how_many_posterior_draws,	mu = exp(.x_supersampled$mu + .x_supersampled$exposure),	size = 1/exp(.x_supersampled$sigma) * .x$truncation_compensation	)
 	draws %>%
 		# Process quantile
 		quantile(c(.quantile, 1 - .quantile)) %>%
diff --git a/inst/stan/negBinomial_MPI.stan b/inst/stan/negBinomial_MPI.stan
index b00dafe..2baa956 100644
--- a/inst/stan/negBinomial_MPI.stan
+++ b/inst/stan/negBinomial_MPI.stan
@@ -112,6 +112,7 @@ functions{
   }
 
 matrix merge_coefficients(row_vector intercept, row_vector alpha_sub_1, matrix alpha_2,  int C, int S, int G){
+	// Here I build the coefficient matrix appending the house keeping genes
 	matrix[C,G] my_alpha;
 
 

From ca1bd6c65d11e8d2f17338e915c9481c3dba46a8 Mon Sep 17 00:00:00 2001
From: stemangiola <mangiolastefano@gmail.com>
Date: Mon, 2 Nov 2020 13:58:40 +1100
Subject: [PATCH 6/7] avoid recomputation for discovery run

---
 R/ppcseq.R    | 26 ++++++++++++++++----------
 R/utilities.R |  4 ++--
 2 files changed, 18 insertions(+), 12 deletions(-)

diff --git a/R/ppcseq.R b/R/ppcseq.R
index 84395d8..c6863f1 100644
--- a/R/ppcseq.R
+++ b/R/ppcseq.R
@@ -64,7 +64,8 @@ identify_outliers = function(.data,
 														 tol_rel_obj = 0.01,
 														 just_discovery = F,
 														 seed = sample(1:99999, size = 1),
-														 adj_prob_theshold_2 = NULL
+														 adj_prob_theshold_2 = NULL,
+														 return_fit = FALSE
 ) {
 	# Prepare column same enquo
 	.sample = enquo(.sample)
@@ -281,14 +282,19 @@ identify_outliers = function(.data,
 	merge_results(
 
 		# Calculate CI 2 for discovery for plotting
-		res_discovery %>%
-			left_join(
-				(.) %>%
-					attr("fit") %>%
-					fit_to_counts_rng_approximated(adj_prob_theshold_2, how_many_posterior_draws_2, truncation_compensation = 0.7352941, cores) %>%
-					select(S, G, .lower_1 = .lower, .upper_1 = .upper)
-			),
-		res_test, formula,
+		res_discovery,
+
+		# Just used for article comparative analyses between 1 and 2 steps
+		# %>%
+		# 	left_join(
+		# 		(.) %>%
+		# 			attr("fit") %>%
+		# 			fit_to_counts_rng_approximated(adj_prob_theshold_2, how_many_posterior_draws_2, truncation_compensation = 0.7352941, cores) %>%
+		# 			select(S, G, .lower_1 = .lower, .upper_1 = .upper)
+		# 	),
+
+		res_test,
+		formula,
 		!!.transcript,
 		!!.abundance,
 		!!.sample,
@@ -296,7 +302,7 @@ identify_outliers = function(.data,
 	) %>%
 
 		# Add fit attribute if any
-		add_attr(res_discovery %>% attr("fit"), "fit 1") %>%
+				add_attr(res_discovery %>% attr("fit"), "fit 1") %>%
 		add_attr(res_test %>% attr("fit"), "fit 2") %>%
 
 		# Add total draws
diff --git a/R/utilities.R b/R/utilities.R
index 059438d..e7190ef 100644
--- a/R/utilities.R
+++ b/R/utilities.R
@@ -448,8 +448,8 @@ merge_results = function(res_discovery, res_test, formula, .transcript, .abundan
 			!!.abundance,
 			!!.sample,
 			mean,
-			`.lower_1`,
-			`.upper_1`,
+			# `.lower_1`,
+			# `.upper_1`,
 			`exposure rate`,
 			slope_1 = slope,
 			one_of(parse_formula(formula))

From 2eaa20f6a169b2ba6eb8ef48a155c55ad1cc06c8 Mon Sep 17 00:00:00 2001
From: stemangiola <mangiolastefano@gmail.com>
Date: Mon, 2 Nov 2020 13:58:53 +1100
Subject: [PATCH 7/7] dont save fit

---
 R/ppcseq.R | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/R/ppcseq.R b/R/ppcseq.R
index c6863f1..5af8588 100644
--- a/R/ppcseq.R
+++ b/R/ppcseq.R
@@ -301,9 +301,13 @@ identify_outliers = function(.data,
 		do_check_only_on_detrimental
 	) %>%
 
-		# Add fit attribute if any
+		# If return_fit
+		when(
+			return_fit ~ (.) %>%
 				add_attr(res_discovery %>% attr("fit"), "fit 1") %>%
-		add_attr(res_test %>% attr("fit"), "fit 2") %>%
+				add_attr(res_test %>% attr("fit"), "fit 2"),
+			~ (.)
+		)	%>%
 
 		# Add total draws
 		add_attr(res_test %>% attr("total_draws"), "total_draws")