diff --git a/R/colocboost.R b/R/colocboost.R index 81e8490..aba9cdf 100644 --- a/R/colocboost.R +++ b/R/colocboost.R @@ -195,6 +195,181 @@ colocboost <- function(X = NULL, Y = NULL, # individual data return(NULL) } + # - check input data: individual level data and summary-level data + validated_data <- colocboost_validate_input_data( + X = X, Y = Y, + sumstat = sumstat, LD = LD, + dict_YX = dict_YX, dict_sumstatLD = dict_sumstatLD, + effect_est = effect_est, effect_se = effect_se, effect_n = effect_n, + overlap_variables = overlap_variables, + M = M, min_abs_corr = min_abs_corr + ) + if (is.null(validated_data)) { + return(NULL) + } + # Extract validated data + X <- validated_data$X + Y <- validated_data$Y + yx_dict <- validated_data$yx_dict + keep_variable_individual <- validated_data$keep_variable_individual + sumstat <- validated_data$sumstat + LD <- validated_data$LD + sumstatLD_dict <- validated_data$sumstatLD_dict + keep_variable_sumstat <- validated_data$keep_variable_sumstat + Z <- validated_data$Z + N_sumstat <- validated_data$N_sumstat + Var_y <- validated_data$Var_y + SeBhat <- validated_data$SeBhat + + # Update parameters if LD was not provided + M <- validated_data$M + min_abs_corr <- validated_data$min_abs_corr + jk_equiv_corr <- validated_data$jk_equiv_corr + jk_equiv_loglik <- validated_data$jk_equiv_loglik + func_simplex <- validated_data$func_simplex + + # - initial colocboost object + keep_variables <- c(keep_variable_individual, keep_variable_sumstat) + overlapped_variables <- Reduce("intersect", keep_variables) + mean_variables <- mean(sapply(keep_variables, length)) + min_variables <- min(sapply(keep_variables, length)) + if (min_variables < 100) { + warning( + "Warning message about the number of variables.\n", + "The smallest number of variables across outcomes is ", min_variables, " < 100.", + " If this is what you expected, this is not a problem.", + " If this is not what you expected, please check input data." + ) + } + if (length(overlapped_variables) <= 1) { + warning( + "Error: No or only 1 overlapping variables were found across all outcomes, colocalization cannot be performed. ", + "Please verify the variable names across different outcomes." + ) + return(NULL) + } else if ((length(overlapped_variables) / mean_variables) < 0.1) { + warning( + "Warning message about the overlapped variables.\n", + "The average number of variables across outcomes is ", mean_variables, + ". But only ", length(overlapped_variables), " number of variables overlapped (<10%).\n", + " If this is what you expected, this is not a problem.", + " If this is not what you expected, please check if the variable name matched across outcomes." + ) + } + cb_data <- colocboost_init_data( + X = X, Y = Y, dict_YX = yx_dict, + Z = Z, LD = LD, N_sumstat = N_sumstat, dict_sumstatLD = sumstatLD_dict, + Var_y = Var_y, SeBhat = SeBhat, + keep_variables = keep_variables, + focal_outcome_idx = focal_outcome_idx, + focal_outcome_variables = focal_outcome_variables, + overlap_variables = overlap_variables, + intercept = intercept, + standardize = standardize, + residual_correlation = residual_correlation + ) + + ################## colocboost updates ################################### + message("Starting gradient boosting algorithm.") + cb_obj <- colocboost_workhorse(cb_data, + M = M, + prioritize_jkstar = prioritize_jkstar, + tau = tau, + learning_rate_init = learning_rate_init, + learning_rate_decay = learning_rate_decay, + func_simplex = func_simplex, + lambda = lambda, + lambda_focal_outcome = lambda_focal_outcome, + stop_thresh = stop_thresh, + func_multi_test = func_multi_test, + stop_null = stop_null, + multi_test_max = multi_test_max, + multi_test_thresh = multi_test_thresh, + ash_prior = ash_prior, + p.adjust.methods = p.adjust.methods, + jk_equiv_corr = jk_equiv_corr, + jk_equiv_loglik = jk_equiv_loglik, + func_compare = func_compare, + coloc_thresh = coloc_thresh, + LD_free = LD_free, + dynamic_learning_rate = dynamic_learning_rate, + focal_outcome_idx = focal_outcome_idx, + outcome_names = outcome_names + ) + + # --- post-processing of the colocboost updates + message("Performing inference on colocalization events.") + cb_output <- colocboost_assemble(cb_obj, + coverage = coverage, + weight_fudge_factor = weight_fudge_factor, + check_null = check_null, + check_null_method = check_null_method, + check_null_max = check_null_max, + check_null_max_ucos = check_null_max_ucos, + dedup = dedup, + overlap = overlap, + n_purity = n_purity, + min_abs_corr = min_abs_corr, + sec_coverage_thresh = sec_coverage_thresh, + median_abs_corr = median_abs_corr, + min_cluster_corr = min_cluster_corr, + median_cos_abs_corr = median_cos_abs_corr, + weaker_effect = weaker_effect, + merge_cos = merge_cos, + tol = tol, + output_level = output_level + ) + class(cb_output) <- "colocboost" + return(cb_output) +} + + + + +#' @title Validate and Process All Input Data for ColocBoost +#' +#' @description Internal function to validate and process both individual-level and summary-level input data +#' +#' @param X A list of genotype matrices for different outcomes, or a single matrix if all outcomes share the same genotypes. +#' @param Y A list of vectors of outcomes or an N by L matrix if it is considered for the same X and multiple outcomes. +#' @param sumstat A list of data.frames of summary statistics. +#' @param LD A list of correlation matrices indicating the LD matrix for each genotype. +#' @param dict_YX A L by 2 matrix of dictionary for X and Y if there exist subsets of outcomes corresponding to the same X matrix. +#' @param dict_sumstatLD A L by 2 matrix of dictionary for sumstat and LD if there exist subsets of outcomes corresponding to the same sumstat. +#' @param effect_est Matrix of variable regression coefficients (i.e. regression beta values) in the genomic region +#' @param effect_se Matrix of standard errors associated with the beta values +#' @param effect_n A scalar or a vector of sample sizes for estimating regression coefficients. +#' @param overlap_variables If overlap_variables = TRUE, only perform colocalization in the overlapped region. +#' @param M The maximum number of gradient boosting rounds for each outcome (default is 500). +#' @param min_abs_corr Minimum absolute correlation allowed in a confidence set. +#' +#' @return A list containing: +#' \item{X}{Processed list of genotype matrices} +#' \item{Y}{Processed list of phenotype vectors} +#' \item{yx_dict}{Dictionary mapping Y to X} +#' \item{keep_variable_individual}{List of variable names for each X matrix} +#' \item{sumstat}{Processed list of summary statistics data.frames} +#' \item{LD}{Processed list of LD matrices} +#' \item{sumstatLD_dict}{Dictionary mapping sumstat to LD} +#' \item{keep_variable_sumstat}{List of variant names for each sumstat} +#' \item{Z}{List of z-scores for each outcome} +#' \item{N_sumstat}{List of sample sizes for each outcome} +#' \item{Var_y}{List of phenotype variances for each outcome} +#' \item{SeBhat}{List of standard errors for each outcome} +#' \item{M_updated}{Updated M value (may be changed if LD not provided)} +#' \item{min_abs_corr_updated}{Updated min_abs_corr value (may be changed if LD not provided)} +#' \item{jk_equiv_corr_updated}{Updated jk_equiv_corr value} +#' \item{jk_equiv_loglik_updated}{Updated jk_equiv_loglik value} +#' \item{func_simplex_updated}{Updated func_simplex value} +#' +#' @keywords internal +colocboost_validate_input_data <- function(X = NULL, Y = NULL, + sumstat = NULL, LD = NULL, + dict_YX = NULL, dict_sumstatLD = NULL, + effect_est = NULL, effect_se = NULL, effect_n = NULL, + overlap_variables = FALSE, + M = 500, min_abs_corr = 0.5) { + # - check individual level data if (!is.null(X) & !is.null(Y)) { # --- check input @@ -247,7 +422,17 @@ colocboost <- function(X = NULL, Y = NULL, # individual data return(xx) }) } - + + # Remove duplicates and report: if duplicate columns of X + X <- lapply(seq_along(X), function(i) { + df <- X[[i]] + if (anyDuplicated(colnames(df))) { + message(paste("Removed duplicate columns from X matrix ", i)) + df[, !duplicated(colnames(df)), drop = FALSE] + } else { + df + } + }) keep_variable_individual <- lapply(X, colnames) if (!is.list(X) & !is.list(Y)) { warning("Error: Input X and Y must be the list containing genotype matrics and all phenotype vectors!") @@ -324,7 +509,7 @@ colocboost <- function(X = NULL, Y = NULL, # individual data } else { yx_dict <- keep_variable_individual <- NULL } - + # - check summary-level data if ((!is.null(sumstat)) | (!is.null(effect_est) & !is.null(effect_se))) { # --- check input of (effect_est, effect_se) @@ -333,11 +518,19 @@ colocboost <- function(X = NULL, Y = NULL, # individual data warning("Error: effect_est and effect_se should be the same dimension! Please check!") return(NULL) } - variables <- rownames(effect_est) effect_est <- as.matrix(effect_est) effect_se <- as.matrix(effect_se) + variables <- rownames(effect_est) if (is.null(variables)) { variables <- paste0("variant_", 1:nrow(effect_est)) + } else { + # add - if duplciates occurs, only keep one + if (anyDuplicated(variables)) { + keep_idx <- !duplicated(variables) + variables <- variables[keep_idx] + effect_est <- effect_est[keep_idx, , drop = FALSE] + effect_se <- effect_se[keep_idx, , drop = FALSE] + } } sumstat <- list() for (sum_iy in 1:ncol(effect_est)) { @@ -358,7 +551,7 @@ colocboost <- function(X = NULL, Y = NULL, # individual data sumstat[[sum_iy]] <- sumstat_y } } - + if (!is.null(sumstat)) { if (is.data.frame(sumstat)) { sumstat <- list(sumstat) @@ -367,6 +560,7 @@ colocboost <- function(X = NULL, Y = NULL, # individual data warning("Error: Input sumstat must be the list containing summary level data for all outcomes!") return(NULL) } + sumstat <- lapply(sumstat, as.data.frame) # --- check if variables names in summary data variable.tmp <- sapply(sumstat, function(xx) { if (("variant" %in% colnames(xx))) { @@ -388,11 +582,33 @@ colocboost <- function(X = NULL, Y = NULL, # individual data }) } } - keep_variable_sumstat <- lapply(sumstat, function(xx) { - xx$variant + + # Remove NA for sumstat$variant columns - add on + sumstat <- lapply(seq_along(sumstat), function(i) { + xx <- sumstat[[i]] + if (anyNA(xx$variant)) { + warning(paste("Removed variant with NA from sumstat", i)) + xx = as.data.frame(xx[!is.na(xx$variant), , drop = FALSE]) + } + return(xx) }) - + # Remove duplicates and report: if duplicate variant in summary statistics + sumstat <- lapply(seq_along(sumstat), function(i) { + xx <- sumstat[[i]] + if (anyDuplicated(xx$variant)) { + warning(paste("Removed duplicate variants from sumstat", i)) + xx = as.data.frame(xx[!duplicated(xx$variant), , drop = FALSE]) + } + return(xx) + }) + # --- check input of LD + M_updated <- M + min_abs_corr_updated <- min_abs_corr + jk_equiv_corr_updated <- 0.8 + jk_equiv_loglik_updated <- 1 + func_simplex_updated <- "LD_z2z" + if (is.null(LD)) { # if no LD input, set diagonal matrix to LD warning( @@ -404,55 +620,84 @@ colocboost <- function(X = NULL, Y = NULL, # individual data LD <- 1 sumstatLD_dict <- rep(1, length(sumstat)) # change some algorithm parameters - M <- 1 # one iteration - min_abs_corr <- 0 # remove purity checking - jk_equiv_corr <- 0 - jk_equiv_loglik <- 0.1 - func_simplex <- "only_z2z" + M_updated <- 1 # one iteration + min_abs_corr_updated <- 0 # remove purity checking + jk_equiv_corr_updated <- 0 + jk_equiv_loglik_updated <- 0.1 + func_simplex_updated <- "only_z2z" + } else { - if (is.data.frame(LD)) { - LD <- as.matrix(LD) - } - if (is.matrix(LD)) { - LD <- list(LD) - } + + if (is.data.frame(LD)) LD <- as.matrix(LD) + if (is.matrix(LD)) LD <- list(LD) # - check if NA in LD matrix num_na <- sapply(LD, sum) if (any(is.na(num_na))){ warning("Error: Input LD must not contain missing values (NA).") return(NULL) } + # Create sumstat-LD mapping === if (length(LD) == 1) { sumstatLD_dict <- rep(1, length(sumstat)) } else if (length(LD) == length(sumstat)) { - sumstatLD_dict <- 1:length(sumstat) + sumstatLD_dict <- seq_along(sumstat) } else { if (is.null(dict_sumstatLD)) { - warning("Error: Please provide the dict_sumstatLD since you have multiple sumstat but only few LD!") + warning('Error: Please provide dict_sumstatLD: you have ', length(sumstat), + ' sumstats but only ', length(LD), ' LD matrices') return(NULL) - } else { - # - dict for sumstat to LD mapping - sumstatLD_dict <- rep(NA, length(sumstat)) - for (i in 1:length(sumstat)) { - tmp <- unique(dict_sumstatLD[dict_sumstatLD[, 1] == i, 2]) - if (length(tmp) == 0) { - warning(paste("Error: You don't provide matched LD for sumstat", i)) - return(NULL) - } else if (length(tmp) != 1) { - warning(paste("Error: You provide multiple matched LD for sumstat", i)) - return(NULL) - } else { - sumstatLD_dict[i] <- tmp - } + } + if (length(dict_sumstatLD) != length(sumstat)) { + warning('Error: dict_sumstatLD must have length ', length(sumstat)) + return(NULL) + } + if (any(is.na(dict_sumstatLD))) { + warning('Error: dict_sumstatLD contains NA values') + return(NULL) + } + if (any(dict_sumstatLD < 1) || any(dict_sumstatLD > length(LD))) { + warning('Error: dict_sumstatLD values must be between 1 and ', length(LD)) + return(NULL) + } + sumstatLD_dict <- as.integer(dict_sumstatLD) + } + + # === Filter variants for each sumstat === + for (i in seq_along(sumstat)) { + # Get sumstat variants (adjust field name based on your data structure) + sumstat_variants <- sumstat[[i]]$variant + n_total <- length(sumstat_variants) + # Get LD variants + ld_idx <- sumstatLD_dict[i] + current_ld <- LD[[ld_idx]] + ld_variants <- rownames(current_ld) + if (is.null(ld_variants)) { + if (ncol(current_ld) != n_total){ + warning('Error: LD matrix ', ld_idx, ' has no rownames. Please ensure all LD matrices have variant names as rownames.') + return(NULL) } - if (max(sumstatLD_dict) > length(LD)) { - warning("Error: You don't provide enough LD matrices!") + } + # Find common variants + common_variants <- intersect(sumstat_variants, ld_variants) + n_removed <- n_total - length(common_variants) + # Filter if needed + if (n_removed > 0) { + warning('Sumstat ', i, ': removing ', n_removed, ' out of ', n_total, + ' variants since those variants are not in LD matrix ', ld_idx) + keep_idx <- match(common_variants, sumstat_variants) + if (length(keep_idx) == 0){ + warning('Error: Sumstat data ', i, ' is empty after filtering. Returning NULL') return(NULL) } + # Filter all relevant fields - ADJUST THESE FIELD NAMES TO YOUR DATA + sumstat[[i]] <- sumstat[[i]][keep_idx, , drop = FALSE] } } } - + keep_variable_sumstat <- lapply(sumstat, function(xx) { + xx$variant + }) + # - checking sample size existency n_exist <- sapply(sumstat, function(ss) { "n" %in% colnames(ss) @@ -469,7 +714,7 @@ colocboost <- function(X = NULL, Y = NULL, # individual data "Outcome ", paste(p_no, collapse = ", "), " in sumstat don't contain 'n'!" ) } - + Z <- N_sumstat <- Var_y <- SeBhat <- vector(mode = "list", length = length(sumstat)) for (i.summstat in 1:length(sumstat)) { summstat_tmp <- sumstat[[i.summstat]] @@ -499,7 +744,7 @@ colocboost <- function(X = NULL, Y = NULL, # individual data warning(paste("summary statistic dataset", i.summstat, "contains NA values that are replaced with 0")) z[is.na(z)] <- 0 } - + # - check N if (!("n" %in% colVar)) { z <- z @@ -512,7 +757,7 @@ colocboost <- function(X = NULL, Y = NULL, # individual data } n <- median(n) N_sumstat[[i.summstat]] <- n - + # When n is provided, compute the adjusted z-scores. z <- z * sqrt((n - 1) / (z^2 + n - 2)) if ("var_y" %in% colVar) { @@ -532,99 +777,31 @@ colocboost <- function(X = NULL, Y = NULL, # individual data } } else { Z <- N_sumstat <- Var_y <- SeBhat <- sumstatLD_dict <- keep_variable_sumstat <- NULL + M_updated <- M + min_abs_corr_updated <- min_abs_corr + jk_equiv_corr_updated <- 0.8 + jk_equiv_loglik_updated <- 1 + func_simplex_updated <- "LD_z2z" } - # - initial colocboost object - keep_variables <- c(keep_variable_individual, keep_variable_sumstat) - overlapped_variables <- Reduce("intersect", keep_variables) - mean_variables <- mean(sapply(keep_variables, length)) - min_variables <- min(sapply(keep_variables, length)) - if (min_variables < 100) { - warning( - "Warning message about the number of variables.\n", - "The smallest number of variables across outcomes is ", min_variables, " < 100.", - " If this is what you expected, this is not a problem.", - " If this is not what you expected, please check input data." - ) - } - if (length(overlapped_variables) <= 1) { - warning( - "Error: No or only 1 overlapping variables were found across all outcomes, colocalization cannot be performed. ", - "Please verify the variable names across different outcomes." - ) - return(NULL) - } else if ((length(overlapped_variables) / mean_variables) < 0.1) { - warning( - "Warning message about the overlapped variables.\n", - "The average number of variables across outcomes is ", mean_variables, - ". But only ", length(overlapped_variables), " number of variables overlapped (<10%).\n", - " If this is what you expected, this is not a problem.", - " If this is not what you expected, please check if the variable name matched across outcomes." - ) - } - cb_data <- colocboost_init_data( - X = X, Y = Y, dict_YX = yx_dict, - Z = Z, LD = LD, N_sumstat = N_sumstat, dict_sumstatLD = sumstatLD_dict, - Var_y = Var_y, SeBhat = SeBhat, - keep_variables = keep_variables, - focal_outcome_idx = focal_outcome_idx, - focal_outcome_variables = focal_outcome_variables, - overlap_variables = overlap_variables, - intercept = intercept, - standardize = standardize, - residual_correlation = residual_correlation - ) - - ################## colocboost updates ################################### - message("Starting gradient boosting algorithm.") - cb_obj <- colocboost_workhorse(cb_data, - M = M, - prioritize_jkstar = prioritize_jkstar, - tau = tau, - learning_rate_init = learning_rate_init, - learning_rate_decay = learning_rate_decay, - func_simplex = func_simplex, - lambda = lambda, - lambda_focal_outcome = lambda_focal_outcome, - stop_thresh = stop_thresh, - func_multi_test = func_multi_test, - stop_null = stop_null, - multi_test_max = multi_test_max, - multi_test_thresh = multi_test_thresh, - ash_prior = ash_prior, - p.adjust.methods = p.adjust.methods, - jk_equiv_corr = jk_equiv_corr, - jk_equiv_loglik = jk_equiv_loglik, - func_compare = func_compare, - coloc_thresh = coloc_thresh, - LD_free = LD_free, - dynamic_learning_rate = dynamic_learning_rate, - focal_outcome_idx = focal_outcome_idx, - outcome_names = outcome_names - ) - - # --- post-processing of the colocboost updates - message("Performing inference on colocalization events.") - cb_output <- colocboost_assemble(cb_obj, - coverage = coverage, - weight_fudge_factor = weight_fudge_factor, - check_null = check_null, - check_null_method = check_null_method, - check_null_max = check_null_max, - check_null_max_ucos = check_null_max_ucos, - dedup = dedup, - overlap = overlap, - n_purity = n_purity, - min_abs_corr = min_abs_corr, - sec_coverage_thresh = sec_coverage_thresh, - median_abs_corr = median_abs_corr, - min_cluster_corr = min_cluster_corr, - median_cos_abs_corr = median_cos_abs_corr, - weaker_effect = weaker_effect, - merge_cos = merge_cos, - tol = tol, - output_level = output_level - ) - class(cb_output) <- "colocboost" - return(cb_output) + + return(list( + X = X, + Y = Y, + yx_dict = yx_dict, + keep_variable_individual = keep_variable_individual, + sumstat = sumstat, + LD = LD, + sumstatLD_dict = sumstatLD_dict, + keep_variable_sumstat = keep_variable_sumstat, + Z = Z, + N_sumstat = N_sumstat, + Var_y = Var_y, + SeBhat = SeBhat, + M = M_updated, + min_abs_corr = min_abs_corr_updated, + jk_equiv_corr = jk_equiv_corr_updated, + jk_equiv_loglik = jk_equiv_loglik_updated, + func_simplex = func_simplex_updated + )) } diff --git a/R/colocboost_assemble.R b/R/colocboost_assemble.R index aba5405..94cbbe5 100644 --- a/R/colocboost_assemble.R +++ b/R/colocboost_assemble.R @@ -58,7 +58,7 @@ colocboost_assemble <- function(cb_obj, ) # - save model and all coloc and single information for diagnostic if (output_level != 1) { - tmp <- get_full_output(cb_obj = cb_obj, past_out = NULL, variables = NULL) + tmp <- get_full_output(cb_obj = cb_obj, past_out = NULL, variables = data_info$variables) if (output_level == 2) { cb_output <- c(cb_output, list("ucos_details" = tmp$ucos_detials)) cb_output <- cb_output[c("cos_summary", "vcp", "cos_details", "data_info", "model_info", "ucos_details")] @@ -79,6 +79,7 @@ colocboost_assemble <- function(cb_obj, } else if (cb_obj$cb_model_para$model_used == "one_causal"){ # fixme later check_null_max <- check_null_max * check_null + check_null_max_ucos <- check_null_max_ucos * check_null } cb_obj <- get_max_profile(cb_obj, check_null_max = check_null_max, check_null_max_ucos = check_null_max_ucos, @@ -209,6 +210,9 @@ colocboost_assemble <- function(cb_obj, # - colocalization results cb_obj$cb_model_para$weight_fudge_factor <- weight_fudge_factor cb_obj$cb_model_para$coverage <- coverage + cb_obj$cb_model_para$min_abs_corr <- min_abs_corr + cb_obj$cb_model_para$median_abs_corr <- median_abs_corr + cb_obj$cb_model_para$n_purity <- n_purity cos_results <- get_cos_details(cb_obj, coloc_out = past_out$cos$cos, data_info = data_info) cb_output <- list( "vcp" = cos_results$vcp, diff --git a/R/colocboost_check_update_jk.R b/R/colocboost_check_update_jk.R index 623faa8..d449b03 100644 --- a/R/colocboost_check_update_jk.R +++ b/R/colocboost_check_update_jk.R @@ -66,15 +66,15 @@ boost_check_update_jk_nofocal <- function(cb_model, cb_model_para, cb_data) { # -- define jk and jk_each cor_vals_each <- do.call(cbind, lapply(model_update, function(cc) cc$correlation)) - abs_cor_vals_each <- sweep(abs(cor_vals_each), 2, adj_dep, `*`) + # abs_cor_vals_each <- sweep(abs(cor_vals_each), 2, adj_dep, `*`) + abs_cor_vals_each <- abs(cor_vals_each) * rep(adj_dep, each = nrow(cor_vals_each)) abs_cor_vals <- rowSums(abs_cor_vals_each) - jk <- which(abs_cor_vals == max(abs_cor_vals)) - jk <- ifelse(length(jk) == 1, jk, sample(jk, 1)) - jk_each <- apply(abs_cor_vals_each, 2, function(temp) { - jk_temp <- which(temp == max(temp)) - return(ifelse(length(jk_temp) == 1, jk_temp, sample(jk_temp, 1))) - }) + # jk <- which(abs_cor_vals == max(abs_cor_vals)) + # jk <- ifelse(length(jk) == 1, jk, sample(jk, 1)) + # jk_each <- apply(abs_cor_vals_each, 2, which.max) + jk <- which.max(abs_cor_vals) + jk_each <- sapply(1:ncol(abs_cor_vals_each), function(j) which.max(abs_cor_vals_each[, j])) update_jk[c(1, pos.update + 1)] <- c(jk, jk_each) @@ -142,7 +142,7 @@ boost_check_update_jk_nofocal <- function(cb_model, cb_model_para, cb_data) { update_status[pos.update[true_update]] <- 1 # - re-define jk_star abs_cor_vals_redefine <- rowSums(abs_cor_vals_each[, true_update, drop = FALSE]) - jk_redefine <- which(abs_cor_vals_redefine == max(abs_cor_vals_redefine)) + jk_redefine <- which.max(abs_cor_vals_redefine == max(abs_cor_vals_redefine)) jk_redefine <- ifelse(length(jk_redefine) == 1, jk_redefine, sample(jk_redefine, 1)) real_update_jk[pos.update[true_update]] <- jk_redefine } else { @@ -351,11 +351,10 @@ boost_check_update_jk_focal <- function(cb_model, cb_model_para, cb_data, # -- define jk and jk_each cor_vals_each <- do.call(cbind, lapply(model_update, function(cc) as.vector(cc$correlation))) - abs_cor_vals_each <- sweep(abs(cor_vals_each), 2, adj_dep, `*`) - jk_each <- apply(abs_cor_vals_each, 2, function(temp) { - jk_temp <- which(temp == max(temp)) - return(ifelse(length(jk_temp) == 1, jk_temp, sample(jk_temp, 1))) - }) + # abs_cor_vals_each <- sweep(abs(cor_vals_each), 2, adj_dep, `*`) + abs_cor_vals_each <- abs(cor_vals_each) * rep(adj_dep, each = nrow(cor_vals_each)) + # jk_each <- apply(abs_cor_vals_each, 2, which.max) + jk_each <- sapply(1:ncol(abs_cor_vals_each), function(j) which.max(abs_cor_vals_each[, j])) pp_focal <- which(pos.update == focal_outcome_idx) jk_focal <- jk_each[pp_focal] @@ -503,8 +502,8 @@ get_LD_jk1_jk2 <- function(jk1, jk2, if (length(XtX) == 1){ LD_temp <- 0 } else { - jk1.remain <- which(remain_jk == jk1) - jk2.remain <- which(remain_jk == jk2) + jk1.remain <- match(jk1, remain_jk) + jk2.remain <- match(jk2, remain_jk) LD_temp <- XtX[jk1.remain, jk2.remain] } } @@ -545,29 +544,59 @@ check_pair_jkeach <- function(jk_each, cb_data, X_dict, jk_equiv_corr = 0.8, jk_equiv_loglik = 0.001) { + + + #' @importFrom stats cor + get_LD_jk_each <- function(jk_each, + X = NULL, XtX = NULL, N = NULL, + remain_jk = NULL) { + if (!is.null(X)) { + LD_temp <- suppressWarnings({ + get_cormat(X[, jk_each]) + }) + LD_temp[which(is.na(LD_temp))] <- 0 + # LD_temp <- LD_temp[1, 2] + } else if (!is.null(XtX)) { + if (length(XtX) == 1){ + LD_temp <- matrix(0, length(jk_each), length(jk_each)) + } else { + jk.remain <- match(jk_each, remain_jk) + LD_temp <- XtX[jk.remain, jk.remain] + LD_temp[which(is.na(LD_temp))] <- 0 + } + } + return(LD_temp) + } + + detect_func <- function(idx, LD_all, jk_i, jk_j, i, j){ + change_log_jk_i <- model_update[[idx]]$change_loglike[jk_i] + change_log_jk_j <- model_update[[idx]]$change_loglike[jk_j] + change_each <- abs(change_log_jk_i - change_log_jk_j) + LD_temp <- LD_all[[idx]][i, j] + return((change_each <= jk_equiv_loglik) & (abs(LD_temp) >= jk_equiv_corr)) + } + data_update <- cb_data$data[pos.update] + LD_all <- lapply(1:length(jk_each), function(idx){ + get_LD_jk_each(jk_each, + X = cb_data$data[[X_dict[idx]]]$X, + XtX = cb_data$data[[X_dict[idx]]]$XtX, + N = data_update[[idx]]$N, + remain_jk = setdiff(1:length(model_update[[idx]]$res), data_update[[idx]]$variable_miss) + ) + }) + # -- check if jk_i ~ jk_j change_each_pair <- matrix(FALSE, nrow = length(jk_each), ncol = length(jk_each)) for (i in 1:length(jk_each)) { - jk_i <- jk_each[i] - change_log_jk_i <- model_update[[i]]$change_loglike[jk_i] - for (j in 1:length(jk_each)) { + for (j in i:length(jk_each)) { if (j != i) { jk_j <- jk_each[j] - - if (!(jk_i %in% data_update[[i]]$variable_miss) & !(jk_j %in% data_update[[i]]$variable_miss)) { - change_log_jk_j <- model_update[[i]]$change_loglike[jk_j] - change_each <- abs(change_log_jk_i - change_log_jk_j) - LD_temp <- get_LD_jk1_jk2(jk_i, jk_j, - X = cb_data$data[[X_dict[i]]]$X, - XtX = cb_data$data[[X_dict[i]]]$XtX, - N = data_update[[i]]$N, - remain_jk = setdiff(1:length(model_update[[i]]$res), data_update[[i]]$variable_miss) - ) - change_each_pair[i, j] <- (change_each <= jk_equiv_loglik) & (abs(LD_temp) >= jk_equiv_corr) - } else { - change_each_pair[i, j] <- FALSE + change_each_pair[i, j] <- detect_func(idx = i, LD_all, jk_i, jk_j, i, j) + # if jk_i and jk_j are equivalent on dataset i, then we don't need to check dataset j + if ( !change_each_pair[i, j] ){ + change_each_pair[j, i] <- detect_func(idx = j, LD_all, jk_i, jk_j, i, j) } } else { change_each_pair[i, j] <- FALSE diff --git a/R/colocboost_inference.R b/R/colocboost_inference.R index dc53908..882fc7a 100644 --- a/R/colocboost_inference.R +++ b/R/colocboost_inference.R @@ -132,7 +132,8 @@ get_modularity <- function(Weight, B) { if (m == 0) return(0) - cate <- B %*% t(B) + # cate <- B %*% t(B) + cate <- tcrossprod(B) if (m_pos == 0 & m_neg == 0) return(0) @@ -190,12 +191,12 @@ get_n_cluster <- function(hc, Sigma, m = ncol(Sigma), min_cluster_corr = 0.8) { #' @noRd w_purity <- function(weights, X = NULL, Xcorr = NULL, N = NULL, n = 100, coverage = 0.95, min_abs_corr = 0.5, median_abs_corr = NULL, miss_idx = NULL) { - tmp <- apply(weights, 2, w_cs, coverage = coverage) - tmp_purity <- apply(tmp, 2, function(tt) { - pos <- which(tt == 1) + + tmp_purity <- apply(weights, 2, function(w) { + pos <- w_cs(w, coverage = coverage) # deal with missing snp here if (!is.null(Xcorr)) { - pos <- match(pos, setdiff(1:length(tmp), miss_idx)) + pos <- match(pos, setdiff(1:length(w), miss_idx)) } get_purity(pos, X = X, Xcorr = Xcorr, N = N, n = n) }) @@ -373,7 +374,12 @@ check_null_post <- function(cb_obj, } if (!weaker_effect) { check_cs_change <- cs_change - check_null_tmp <- sapply(1:cb_obj$cb_model_para$L, function(j) cb_obj$cb_model[[j]]$check_null_max) + if (cb_obj$cb_model_para$L == 1){ + check_null_max_tmp <- cb_obj$cb_model[[j]]$check_null_max_ucos + } else { + check_null_max_tmp <- cb_obj$cb_model[[j]]$check_null_max + } + check_null_tmp <- sapply(1:cb_obj$cb_model_para$L, function(j) check_null_max_tmp) } else { check_null_tmp <- rep(check_null, cb_obj$cb_model_para$L) } @@ -420,7 +426,7 @@ get_purity <- function(pos, X = NULL, Xcorr = NULL, N = NULL, n = 100) { corr[which(is.na(corr))] <- 0 value <- abs(get_upper_tri(corr)) } else { - if (sum(Xcorr) == 1){ + if (length(Xcorr) == 1){ value <- 0 } else { Xcorr <- Xcorr # if (!is.null(N)) Xcorr/(N-1) else Xcorr diff --git a/R/colocboost_init.R b/R/colocboost_init.R index 8685b01..edca937 100644 --- a/R/colocboost_init.R +++ b/R/colocboost_init.R @@ -163,7 +163,8 @@ colocboost_init_model <- function(cb_data, tmp <- list( "res" = NULL, "beta" = rep(0, P), - "weights_path" = c(), + # "weights_path" = c(), + "weights_path" = list(), "profile_loglike_each" = NULL, "obj_path" = 999999, "obj_single" = 999999, @@ -619,7 +620,7 @@ process_sumstat <- function(Z, N, Var_y, SeBhat, ld_matrices, variant_lists, dic Z_extend[pos_target] <- current_z[pos_z] # Calculate submatrix for each unique entry (not duplicates) - if (sum(current_ld_matrix) == 1){ + if (length(current_ld_matrix) == 1){ ld_submatrix <- current_ld_matrix } else { ld_submatrix <- NULL diff --git a/R/colocboost_one_causal.R b/R/colocboost_one_causal.R index 0ed96b5..09692d5 100644 --- a/R/colocboost_one_causal.R +++ b/R/colocboost_one_causal.R @@ -79,9 +79,8 @@ colocboost_one_iteration <- function(cb_model, cb_model_para, cb_data) { cb_model_para[rm_elements] <- NULL for (i in 1:length(cb_model)) { cb_model[[i]]$obj_path <- as.numeric(unlist(cb_model[[i]]$obj_path[-1])) - } - for (i in 1:length(cb_model)) { cb_model[[i]]$obj_single <- as.numeric(unlist(cb_model[[i]]$obj_single[-1])) + cb_model[[i]]$weights_path <- do.call(rbind, cb_model[[i]]$weights_path) } cb_obj <- list("cb_data" = cb_data, "cb_model" = cb_model, "cb_model_para" = cb_model_para) @@ -238,7 +237,7 @@ colocboost_diagLD <- function(cb_model, cb_model_para, cb_data) { cb_model_para, cb_data ) - weights <- rbind(weights, cb_model_tmp[[iy]]$weights_path) + weights <- rbind(weights, unlist(cb_model_tmp[[iy]]$weights_path)) } ###### overlap weights overlap_pair <- check_pair_overlap(weights, coverage = 0.95) @@ -307,11 +306,10 @@ colocboost_diagLD <- function(cb_model, cb_model_para, cb_data) { cb_model_para[rm_elements] <- NULL for (i in 1:length(cb_model)) { cb_model[[i]]$obj_path <- as.numeric(unlist(cb_model[[i]]$obj_path[-1])) - } - for (i in 1:length(cb_model)) { cb_model[[i]]$obj_single <- as.numeric(unlist(cb_model[[i]]$obj_single[-1])) + cb_model[[i]]$weights_path <- do.call(rbind, cb_model[[i]]$weights_path) } - + cb_obj <- list("cb_data" = cb_data, "cb_model" = cb_model, "cb_model_para" = cb_model_para) class(cb_obj) <- "colocboost" return(cb_obj) diff --git a/R/colocboost_plot.R b/R/colocboost_plot.R index 90141d6..1f0f7c1 100644 --- a/R/colocboost_plot.R +++ b/R/colocboost_plot.R @@ -14,8 +14,9 @@ #' @param plot_focal_cos_outcome_only Logical, if TRUE only plots colocalization including at least on colocalized outcome with focal outcome, default is FALSE. #' @param points_color Background color for non-colocalized variables, default is "grey80". #' @param cos_color Optional custom colors for CoS. -#' @param add_vertical Logical, if TRUE adds vertical lines at specified positions, default is FALSE -#' @param add_vertical_idx Optional indices for vertical lines. +#' @param add_highlight Logical, if TRUE adds vertical lines at specified positions, default is FALSE +#' @param add_highlight_style Optional style of add_highlight variables, default is "vertical_lines", other choice is "star" - red star. +#' @param add_highlight_idx Optional indices for add_highlight variables. #' @param outcome_names Optional vector of outcomes names for the subtitle of each figure. \code{outcome_names=NULL} for the outcome name shown in \code{data_info}. #' @param plot_cols Number of columns in the plot grid, default is 2. If you have many colocalization. please consider increasing this. #' @param variant_coord Logical, if TRUE uses variant coordinates on x-axis, default is FALSE. This is required the variable names including position information. @@ -78,8 +79,9 @@ colocboost_plot <- function(cb_output, y = "log10p", plot_focal_cos_outcome_only = FALSE, points_color = "grey80", cos_color = NULL, - add_vertical = FALSE, - add_vertical_idx = NULL, + add_highlight = FALSE, + add_highlight_idx = NULL, + add_highlight_style = "vertical_lines", outcome_names = NULL, plot_cols = 2, variant_coord = FALSE, @@ -102,33 +104,34 @@ colocboost_plot <- function(cb_output, y = "log10p", if (!inherits(cb_output, "colocboost")) { stop("Input of colocboost_plot must be a 'colocboost' object!") } - + # get cb_plot_input data from colocboost results cb_plot_input <- get_input_plot(cb_output, - plot_cos_idx = plot_cos_idx, - variant_coord = variant_coord, - outcome_names = outcome_names, - plot_focal_only = plot_focal_only, - plot_focal_cos_outcome_only = plot_focal_cos_outcome_only, - show_cos_to_uncoloc = show_cos_to_uncoloc, - show_cos_to_uncoloc_idx = show_cos_to_uncoloc_idx, - show_cos_to_uncoloc_outcome = show_cos_to_uncoloc_outcome, - plot_ucos = plot_ucos, plot_ucos_idx = plot_ucos_idx + plot_cos_idx = plot_cos_idx, + variant_coord = variant_coord, + outcome_names = outcome_names, + plot_focal_only = plot_focal_only, + plot_focal_cos_outcome_only = plot_focal_cos_outcome_only, + show_cos_to_uncoloc = show_cos_to_uncoloc, + show_cos_to_uncoloc_idx = show_cos_to_uncoloc_idx, + show_cos_to_uncoloc_outcome = show_cos_to_uncoloc_outcome, + plot_ucos = plot_ucos, plot_ucos_idx = plot_ucos_idx ) # get initial set up of plot cb_plot_init <- plot_initial(cb_plot_input, - y = y, points_color = points_color, cos_color = cos_color, - ylim_each = ylim_each, title_specific = title_specific, - outcome_legend_pos = outcome_legend_pos, outcome_legend_size = outcome_legend_size, - cos_legend_pos = cos_legend_pos, plot_ucos = plot_ucos, - show_variable = show_variable, lab_style = lab_style, axis_style = axis_style, - title_style = title_style, ... + y = y, points_color = points_color, cos_color = cos_color, + ylim_each = ylim_each, title_specific = title_specific, + outcome_legend_pos = outcome_legend_pos, outcome_legend_size = outcome_legend_size, + cos_legend_pos = cos_legend_pos, plot_ucos = plot_ucos, + show_variable = show_variable, lab_style = lab_style, axis_style = axis_style, + title_style = title_style, ... ) - + colocboost_plot_basic <- function(cb_plot_input, cb_plot_init, outcome_idx = NULL, plot_all_outcome = FALSE, grange = NULL, plot_cols = 2, - add_vertical = FALSE, add_vertical_idx = NULL, + add_highlight = FALSE, add_highlight_idx = NULL, + add_highlight_style = "vertical_lines", show_top_variables = TRUE, ...) { args <- list(...) @@ -147,12 +150,12 @@ colocboost_plot <- function(cb_output, y = "log10p", args$font.lab <- cb_plot_init$lab_face # - change position cb_plot_init$outcome_legend_pos <- switch(cb_plot_init$outcome_legend_pos, - "right" = 4, - "left" = 2, - "top" = 3, - "bottom" = 1 + "right" = 4, + "left" = 2, + "top" = 3, + "bottom" = 1 ) - + # - begin plotting coloc_cos <- cb_plot_input$cos outcomes <- cb_plot_input$outcomes @@ -229,12 +232,7 @@ colocboost_plot <- function(cb_output, y = "log10p", cex = cb_plot_init$outcome_legend_size, font = 1 ) } - if (add_vertical) { - for (iii in 1:length(add_vertical_idx)) { - abline(v = add_vertical_idx[iii], col = "#E31A1C", lwd = 1.5, lty = "dashed") - } - } - + # mark variables in CoS to colocalized outcomes if (!is.null(coloc_cos)) { n.coloc <- length(coloc_cos) @@ -242,7 +240,7 @@ colocboost_plot <- function(cb_output, y = "log10p", if (length(y)==1){coloc_index = lapply(coloc_index, function(i) 1 )} legend_text <- list(col = vector()) legend_text$col <- head(cb_plot_init$col, n.coloc) - + # check which coloc set for this outcome p.coloc <- sapply(coloc_index, function(idx) sum(idx == iy) != 0) p.coloc <- which(p.coloc) @@ -261,7 +259,7 @@ colocboost_plot <- function(cb_output, y = "log10p", points(x_hits, y_hits, pch = 21, bg = legend_text$col[i.cs], col = "#E31A1C", cex = 3, lwd = 3) } } - + # mark variables in CoS to uncolocalized outcomes uncoloc <- cb_plot_input$uncoloc if (!is.null(uncoloc)) { @@ -276,8 +274,8 @@ colocboost_plot <- function(cb_output, y = "log10p", x0 <- intersect(args$x, cs) y1 <- args$y[match(x0, args$x)] points(x0, y1, - pch = 4, col = adjustcolor(legend_text$col[uncoloc$cos_idx_to_uncoloc[i.uncoloc]], alpha.f = 0.4), - cex = 1.5, lwd = 1.5 + pch = 4, col = adjustcolor(legend_text$col[uncoloc$cos_idx_to_uncoloc[i.uncoloc]], alpha.f = 0.4), + cex = 1.5, lwd = 1.5 ) shape_col <- c(shape_col, adjustcolor(legend_text$col[uncoloc$cos_idx_to_uncoloc[i.uncoloc]], alpha.f = 1)) texts_col <- c(texts_col, adjustcolor(legend_text$col[uncoloc$cos_idx_to_uncoloc[i.uncoloc]], alpha.f = 0.8)) @@ -294,35 +292,51 @@ colocboost_plot <- function(cb_output, y = "log10p", if (length(texts) == 0) { next } - + # Get current plot area coordinates usr <- par("usr") x_pos <- usr[1] + cb_plot_init$cos_legend_pos[1] * (usr[2] - usr[1]) # 5% from left edge y_pos <- usr[3] + cb_plot_init$cos_legend_pos[2] * (usr[4] - usr[3]) # 50% from bottom edge legend(x = x_pos, y = y_pos, texts, - bty = "n", col = shape_col, text.col = texts_col, - cex = 1.5, pt.cex = 1.5, pch = 4, x.intersp = cos_legend_pos[3], y.intersp = cos_legend_pos[4] + bty = "n", col = shape_col, text.col = texts_col, + cex = 1.5, pt.cex = 1.5, pch = 4, x.intersp = cos_legend_pos[3], y.intersp = cos_legend_pos[4] ) } } + if (add_highlight) { + for (iii in 1:length(add_highlight_idx)) { + if ( !(add_highlight_style %in% c("vertical_lines", "star")) ){ + message("add_highlight_style is not 'vertical_lines' and 'star', defaulting to 'vertical_lines'") + add_highlight_style = "vertical_lines" + } + if (add_highlight_style == "vertical_lines"){ + abline(v = add_highlight_idx[iii], col = "#E31A1C", lwd = 1.5, lty = "dashed") + } else if (add_highlight_style == "star") { + points(add_highlight_idx[iii], args$y[match(add_highlight_idx[iii],args$x)], + pch = 8, col = "#E31A1C", lwd = 2, lty = "dashed", cex = 2.5) + } + + } + } } if (!is.null(cb_plot_init$title)) { mtext(cb_plot_init$title, - side = 3, line = 0, outer = TRUE, - cex = cb_plot_init$title_size, font = cb_plot_init$title_face + side = 3, line = 0, outer = TRUE, + cex = cb_plot_init$title_size, font = cb_plot_init$title_face ) } return(invisible()) } - + colocboost_plot_basic(cb_plot_input, cb_plot_init, - grange = grange, - outcome_idx = outcome_idx, - plot_all_outcome = plot_all_outcome, - plot_cols = plot_cols, - add_vertical = add_vertical, add_vertical_idx = add_vertical_idx, - show_top_variables = show_top_variables, - ... + grange = grange, + outcome_idx = outcome_idx, + plot_all_outcome = plot_all_outcome, + plot_cols = plot_cols, + add_highlight = add_highlight, add_highlight_idx = add_highlight_idx, + add_highlight_style = add_highlight_style, + show_top_variables = show_top_variables, + ... ) } @@ -339,7 +353,7 @@ get_input_plot <- function(cb_output, plot_cos_idx = NULL, show_cos_to_uncoloc_outcome = NULL, plot_ucos = FALSE, plot_ucos_idx = NULL) { - + # check ucos exists if (plot_ucos && !"ucos_details" %in% names(cb_output)) { warning( @@ -354,7 +368,7 @@ get_input_plot <- function(cb_output, plot_cos_idx = NULL, cb_output$data_info$outcome_info$outcome_names <- outcome_names names(cb_output$data_info$z) <- outcome_names } - + # extract results from colocboost analysis_outcome <- cb_output$data_info$outcome_info$outcome_names focal_outcome_idx <- which(cb_output$data_info$outcome_info$is_focal) @@ -371,7 +385,7 @@ get_input_plot <- function(cb_output, plot_cos_idx = NULL, Z <- cb_output$data_info$z coef <- cb_output$data_info$coef vcp <- list(as.numeric(cb_output$vcp)) - + # if finemapping if (cb_output$data_info$n_outcomes == 1) { cb_output$cos_details$cos$cos_variables <- cb_output$ucos_details$ucos$ucos_variables @@ -406,7 +420,7 @@ get_input_plot <- function(cb_output, plot_cos_idx = NULL, } }) ncos <- length(cb_output$cos_details$cos$cos_index) - + select_cs <- 1:ncos if (!is.null(plot_cos_idx)) { if (length(setdiff(plot_cos_idx, c(1:ncos))) != 0) { @@ -477,7 +491,7 @@ get_input_plot <- function(cb_output, plot_cos_idx = NULL, "coloc_index" = coloc_index, "select_cos" = select_cs ) - + # check plot uncolocalizaed confidence sets from ucos_details if (plot_ucos & cb_output$data_info$n_outcomes > 1){ ucos_details <- cb_output$ucos_details @@ -528,7 +542,7 @@ get_input_plot <- function(cb_output, plot_cos_idx = NULL, plot_input$ucos_cos_int_weights <- combined } } - + # check if plot cos to uncolocalized outcome # use the updated coloc_cos and related components from plot_input if available coloc_cos <- plot_input$cos @@ -613,7 +627,7 @@ get_input_plot <- function(cb_output, plot_cos_idx = NULL, ) plot_input$uncoloc <- uncoloc } - + class(plot_input) <- "colocboost" return(plot_input) } @@ -634,11 +648,11 @@ plot_initial <- function(cb_plot_input, y = "log10p", ...) { args <- list(...) if (!exists("pch", args)) args$pch <- 16 - + # - set background point color and cos color pools args$bg <- points_color if (is.null(cos_color)) { - + # cos_color <- c( # "#377EB8", "#E69F00", "#33A02C", "#984EA3", "#F46D43", # "#A65628", "#1F4E79", "#B2182B", "#D73027", "#F781BF", @@ -688,12 +702,12 @@ plot_initial <- function(cb_plot_input, y = "log10p", "#D5D8DC", "#BDC3C7", "#95A5A6", "#7F8C8D", "#17202A", "#808080", "#A9A9A9", "#C0C0C0", "#696969", "#778899" ) - - + + # cos_color <- c("#1F70A9", "#33A02C", "#CAB2D6", "#EA7827") } args$col <- cos_color[cb_plot_input$select_cos] - + # - set data and x-lab and y-lab if (y == "log10p") { plot_data <- lapply(cb_plot_input$Zscores, function(z) { @@ -729,12 +743,12 @@ plot_initial <- function(cb_plot_input, y = "log10p", if (!exists("ylab", args)) args$ylab <- ylab args$lab_size <- as.numeric(lab_style[1]) args$lab_face <- lab_style[2] - + # - set title format args$title <- title_specific args$title_size <- as.numeric(title_style[1]) args$title_face <- title_style[2] - + # - set x-axis and y-axis args$x <- cb_plot_input$x$pos args$y <- plot_data @@ -744,7 +758,7 @@ plot_initial <- function(cb_plot_input, y = "log10p", } args$axis_size <- as.numeric(axis_style[1]) args$axis_face <- axis_style[2] - + # - set ylim for each subfigure if (exists("ylim", args)) { ymax <- rep(args$ylim[2], length(args$y)) @@ -776,7 +790,7 @@ plot_initial <- function(cb_plot_input, y = "log10p", } args$ymax <- ymax args$ymin <- ymin - + # - set legend text position and format args$outcome_legend_pos <- outcome_legend_pos args$outcome_legend_size <- outcome_legend_size @@ -788,7 +802,7 @@ plot_initial <- function(cb_plot_input, y = "log10p", args$outcome_legend_angle <- 0 } args$cos_legend_pos <- cos_legend_pos - + return(args) } diff --git a/R/colocboost_update.R b/R/colocboost_update.R index 6100828..734bcb1 100644 --- a/R/colocboost_update.R +++ b/R/colocboost_update.R @@ -74,7 +74,8 @@ colocboost_update <- function(cb_model, cb_model_para, cb_data) { # weights <- adj_dep * ld_feature / scaling_factor * exp_abs_cor / sum(exp_abs_cor) weights <- adj_dep * obj_ld * exp_abs_cor / sum(exp_abs_cor) weights <- weights / sum(weights) - cb_model[[i]]$weights_path <- rbind(cb_model[[i]]$weights_path, as.vector(weights)) + # cb_model[[i]]$weights_path <- rbind(cb_model[[i]]$weights_path, as.vector(weights)) + cb_model[[i]]$weights_path <- c(cb_model[[i]]$weights_path, list(as.vector(weights))) ########## END: MAIN CALCULATION ################### diff --git a/R/colocboost_utils.R b/R/colocboost_utils.R index c4da011..9bfa3fd 100644 --- a/R/colocboost_utils.R +++ b/R/colocboost_utils.R @@ -426,15 +426,20 @@ get_max_profile <- function(cb_obj, check_null_max = 0.025, } +# ### Function for check cs for each weight +# w_cs <- function(weights, coverage = 0.95) { +# indices <- unlist(get_in_cos(weights, coverage = coverage)) +# result <- rep(0, length(weights)) +# result[indices] <- 1 +# return(result) +# } + ### Function for check cs for each weight w_cs <- function(weights, coverage = 0.95) { indices <- unlist(get_in_cos(weights, coverage = coverage)) - result <- rep(0, length(weights)) - result[indices] <- 1 - return(result) + return(indices) } - #' Pure R implementation (fallback) #' @noRd get_merge_ordered_with_indices <- function(vector_list) { @@ -505,12 +510,14 @@ get_merge_ordered_with_indices <- function(vector_list) { # Step 4: Topological sort using Kahn's algorithm # Start with nodes that have no incoming edges queue <- all_elements[sapply(all_elements, function(elem) in_degree[[elem]] == 0)] - result <- character() + result <- list() + result_idx <- 1 while (length(queue) > 0) { # Take the first element from the queue current <- queue[1] queue <- queue[-1] - result <- c(result, current) + result[[result_idx]] <- current # List assignment is fast + result_idx <- result_idx + 1 # Process all neighbors (elements that must come after current) neighbors <- graph[[current]] for (next_elem in neighbors) { @@ -520,6 +527,7 @@ get_merge_ordered_with_indices <- function(vector_list) { } } } + result <- unlist(result) # Step 5: Check for cycles and use fallback if needed if (length(result) != n_elements) { # Different variable orders detected - use priority-based fallback @@ -657,8 +665,6 @@ get_cos_details <- function(cb_obj, coloc_out, data_info = NULL) { }) int_weight <- lapply(cos_weights, get_integrated_weight, weight_fudge_factor = cb_obj$cb_model_para$weight_fudge_factor) names(int_weight) <- names(cos_weights) <- colocset_names - vcp <- as.vector(1 - apply(1 - do.call(cbind, int_weight), 1, prod)) - names(vcp) <- data_info$variables # - resummary results cos_re_idx <- lapply(int_weight, function(w) { @@ -668,7 +674,55 @@ get_cos_details <- function(cb_obj, coloc_out, data_info = NULL) { data_info$variables[idx] }) coloc_csets <- list("cos_index" = cos_re_idx, "cos_variables" = cos_re_var) - + + # - recalculate purity + purity <- vector(mode = "list", length = length(cos_re_idx)) + for (ee in 1:length(cos_re_idx)) { + coloc_t <- coloc_outcome_index[[ee]] + p_tmp <- c() + for (i3 in coloc_t) { + pos <- cos_re_idx[[ee]] + X_dict <- cb_obj$cb_data$dict[i3] + if (!is.null(cb_obj$cb_data$data[[X_dict]]$XtX)) { + pos <- match(pos, setdiff(1:cb_obj$cb_model_para$P, cb_obj$cb_data$data[[i3]]$variable_miss)) + } + tmp <- matrix(get_purity(pos, + X = cb_obj$cb_data$data[[X_dict]]$X, + Xcorr = cb_obj$cb_data$data[[X_dict]]$XtX, + N = cb_obj$cb_data$data[[i3]]$N, n = cb_obj$cb_model_para$n_purity + ), 1, 3) + p_tmp <- rbind(p_tmp, tmp) + } + purity[[ee]] <- matrix(apply(p_tmp, 2, max), 1, 3) + } + purity_all <- do.call(rbind, purity) + purity_all <- as.data.frame(purity_all) + colnames(purity_all) <- c("min_abs_corr", "mean_abs_corr", "median_abs_corr") + coloc_out$purity <- purity_all + if (is.null(cb_obj$cb_model_para$median_abs_corr)) { + is_pure <- which(purity_all[, 1] >= cb_obj$cb_model_para$min_abs_corr) + } else { + is_pure <- which(purity_all[, 1] >= cb_obj$cb_model_para$min_abs_corr | purity_all[, 3] >= cb_obj$cb_model_para$median_abs_corr) + } + if (length(is_pure)==0){ + coloc_results <- NULL + vcp <- NULL + return(list("cos_results" = coloc_results, "vcp" = vcp)) + } else if (length(is_pure)!=length(cos_re_idx)){ + int_weight = int_weight[is_pure] + coloc_csets <- lapply(coloc_csets, function(cs) cs[is_pure]) + coloc_outcomes <- lapply(coloc_outcomes, function(cs) cs[is_pure]) + normalization_evidence <- normalization_evidence[is_pure] + npc <- npc[is_pure] + cos_min_npc_outcome <- cos_min_npc_outcome[is_pure] + cos_weights <- cos_weights[is_pure] + coloc_out$purity <- purity_all[is_pure,,drop = FALSE] + colocset_names <- colocset_names[is_pure] + } + vcp <- as.vector(1 - apply(1 - do.call(cbind, int_weight), 1, prod)) + names(vcp) <- data_info$variables + + # - hits variables in each csets coloc_hits <- coloc_hits_variablenames <- coloc_hits_names <- c() for (i in 1:length(int_weight)) { @@ -839,7 +893,7 @@ get_full_output <- function(cb_obj, past_out = NULL, variables = NULL, cb_output cb_model_para <- cb_obj$cb_model_para ## - obtain the order of variables based on the variables names if it has position information - if (!is.null(variables)) { + if (is.null(variables)) { ordered <- 1:length(cb_obj$cb_model_para$variables) } else { ordered <- match(variables, cb_obj$cb_model_para$variables) @@ -933,7 +987,7 @@ get_full_output <- function(cb_obj, past_out = NULL, variables = NULL, cb_output # - hits variables in each csets cs_hits <- sapply(1:length(specific_w), function(jj) { inw <- specific_w[[jj]] - sample(which(inw == max(inw)), 1) + which(inw == max(inw))[1] }) cs_hits_variablenames <- sapply(cs_hits, function(ch) variables[ch]) specific_cs_hits <- data.frame("top_index" = cs_hits, "top_variables" = cs_hits_variablenames) # save diff --git a/R/colocboost_workhorse.R b/R/colocboost_workhorse.R index 7583eee..4462680 100644 --- a/R/colocboost_workhorse.R +++ b/R/colocboost_workhorse.R @@ -103,6 +103,9 @@ colocboost_workhorse <- function(cb_data, } if (sum(cb_model_para$update_y == 1) == 0) { + for (i in 1:length(cb_model)){ + cb_model[[i]]$weights_path <- matrix(0, ncol = cb_model_para$P) + } cb_obj <- list("cb_data" = cb_data, "cb_model" = cb_model, "cb_model_para" = cb_model_para) class(cb_obj) <- "colocboost" return(cb_obj) @@ -236,9 +239,8 @@ colocboost_workhorse <- function(cb_data, cb_model_para$num_updates <- m for (i in 1:length(cb_model)) { cb_model[[i]]$obj_path <- as.numeric(unlist(cb_model[[i]]$obj_path[-1])) - } - for (i in 1:length(cb_model)) { cb_model[[i]]$obj_single <- as.numeric(unlist(cb_model[[i]]$obj_single[-1])) + cb_model[[i]]$weights_path <- do.call(rbind, cb_model[[i]]$weights_path) } if (m == M) { diff --git a/man/colocboost_plot.Rd b/man/colocboost_plot.Rd index fe155fc..8e5c7df 100644 --- a/man/colocboost_plot.Rd +++ b/man/colocboost_plot.Rd @@ -18,8 +18,9 @@ colocboost_plot( plot_focal_cos_outcome_only = FALSE, points_color = "grey80", cos_color = NULL, - add_vertical = FALSE, - add_vertical_idx = NULL, + add_highlight = FALSE, + add_highlight_idx = NULL, + add_highlight_style = "vertical_lines", outcome_names = NULL, plot_cols = 2, variant_coord = FALSE, @@ -62,9 +63,11 @@ colocboost_plot( \item{cos_color}{Optional custom colors for CoS.} -\item{add_vertical}{Logical, if TRUE adds vertical lines at specified positions, default is FALSE} +\item{add_highlight}{Logical, if TRUE adds vertical lines at specified positions, default is FALSE} -\item{add_vertical_idx}{Optional indices for vertical lines.} +\item{add_highlight_idx}{Optional indices for add_highlight variables.} + +\item{add_highlight_style}{Optional style of add_highlight variables, default is "vertical_lines", other choice is "star" - red star.} \item{outcome_names}{Optional vector of outcomes names for the subtitle of each figure. \code{outcome_names=NULL} for the outcome name shown in \code{data_info}.} diff --git a/man/colocboost_validate_input_data.Rd b/man/colocboost_validate_input_data.Rd new file mode 100644 index 0000000..b8616e9 --- /dev/null +++ b/man/colocboost_validate_input_data.Rd @@ -0,0 +1,70 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/colocboost.R +\name{colocboost_validate_input_data} +\alias{colocboost_validate_input_data} +\title{Validate and Process All Input Data for ColocBoost} +\usage{ +colocboost_validate_input_data( + X = NULL, + Y = NULL, + sumstat = NULL, + LD = NULL, + dict_YX = NULL, + dict_sumstatLD = NULL, + effect_est = NULL, + effect_se = NULL, + effect_n = NULL, + overlap_variables = FALSE, + M = 500, + min_abs_corr = 0.5 +) +} +\arguments{ +\item{X}{A list of genotype matrices for different outcomes, or a single matrix if all outcomes share the same genotypes.} + +\item{Y}{A list of vectors of outcomes or an N by L matrix if it is considered for the same X and multiple outcomes.} + +\item{sumstat}{A list of data.frames of summary statistics.} + +\item{LD}{A list of correlation matrices indicating the LD matrix for each genotype.} + +\item{dict_YX}{A L by 2 matrix of dictionary for X and Y if there exist subsets of outcomes corresponding to the same X matrix.} + +\item{dict_sumstatLD}{A L by 2 matrix of dictionary for sumstat and LD if there exist subsets of outcomes corresponding to the same sumstat.} + +\item{effect_est}{Matrix of variable regression coefficients (i.e. regression beta values) in the genomic region} + +\item{effect_se}{Matrix of standard errors associated with the beta values} + +\item{effect_n}{A scalar or a vector of sample sizes for estimating regression coefficients.} + +\item{overlap_variables}{If overlap_variables = TRUE, only perform colocalization in the overlapped region.} + +\item{M}{The maximum number of gradient boosting rounds for each outcome (default is 500).} + +\item{min_abs_corr}{Minimum absolute correlation allowed in a confidence set.} +} +\value{ +A list containing: +\item{X}{Processed list of genotype matrices} +\item{Y}{Processed list of phenotype vectors} +\item{yx_dict}{Dictionary mapping Y to X} +\item{keep_variable_individual}{List of variable names for each X matrix} +\item{sumstat}{Processed list of summary statistics data.frames} +\item{LD}{Processed list of LD matrices} +\item{sumstatLD_dict}{Dictionary mapping sumstat to LD} +\item{keep_variable_sumstat}{List of variant names for each sumstat} +\item{Z}{List of z-scores for each outcome} +\item{N_sumstat}{List of sample sizes for each outcome} +\item{Var_y}{List of phenotype variances for each outcome} +\item{SeBhat}{List of standard errors for each outcome} +\item{M_updated}{Updated M value (may be changed if LD not provided)} +\item{min_abs_corr_updated}{Updated min_abs_corr value (may be changed if LD not provided)} +\item{jk_equiv_corr_updated}{Updated jk_equiv_corr value} +\item{jk_equiv_loglik_updated}{Updated jk_equiv_loglik value} +\item{func_simplex_updated}{Updated func_simplex value} +} +\description{ +Internal function to validate and process both individual-level and summary-level input data +} +\keyword{internal} diff --git a/tests/testthat/test_plot.R b/tests/testthat/test_plot.R index 775d3fe..d49aace 100644 --- a/tests/testthat/test_plot.R +++ b/tests/testthat/test_plot.R @@ -176,7 +176,9 @@ test_that("colocboost_plot handles layout options", { test_that("colocboost_plot handles additional visualization options", { # Test with vertical line options - expect_error(suppressWarnings(colocboost_plot(cb_res, add_vertical = TRUE, add_vertical_idx = c(5, 10))), NA) + expect_error(suppressWarnings(colocboost_plot(cb_res, add_highlight = TRUE, add_highlight_idx = c(5, 10))), NA) + expect_error(suppressWarnings(colocboost_plot(cb_res, add_highlight = TRUE, add_highlight_idx = c(5, 10), add_highlight_style = "vertical_lines")), NA) + expect_error(suppressWarnings(colocboost_plot(cb_res, add_highlight = TRUE, add_highlight_idx = c(5, 10), add_highlight_style = "star")), NA) # Test with show_top_variables option expect_error(suppressWarnings(colocboost_plot(cb_res, show_top_variables = TRUE)), NA) diff --git a/tests/testthat/test_sumstats.R b/tests/testthat/test_sumstats.R index deb3aaa..ef896c3 100644 --- a/tests/testthat/test_sumstats.R +++ b/tests/testthat/test_sumstats.R @@ -247,7 +247,7 @@ test_that("colocboost handles mismatched inputs correctly", { # Expect error with mismatched dimensions expect_warning( - colocboost( + colocboost_validate_input_data( effect_est = bad_effect_est, effect_se = test_sumstat_data$effect_se ), @@ -281,4 +281,77 @@ test_that("colocboost handles missing sample size correctly", { # Still should get a valid result expect_s3_class(result, "colocboost") -}) \ No newline at end of file +}) + +# Test 8: Handling of NA variants +test_that("colocboost removes NA variants correctly", { + sumstat_with_na <- test_sumstat_data$sumstat + sumstat_with_na[[1]]$variant[1:2] <- NA + + warnings <- capture_warnings({ + validated_data <- colocboost_validate_input_data( + sumstat = sumstat_with_na, + LD = test_sumstat_data$LD + ) + }) + + expect_true(any(grepl("Removed variant with NA from sumstat 1", warnings))) + # Should have 2 fewer variants + expect_true(length(validated_data$Z[[1]]) == length(validated_data$Z[[2]])-2) +}) + +# Test 9: Handling of duplicate variants +test_that("colocboost removes duplicate variants correctly", { + sumstat_with_dup <- test_sumstat_data$sumstat + # Duplicate the first variant + sumstat_with_dup[[1]] <- rbind( + sumstat_with_dup[[1]][1, ], + sumstat_with_dup[[1]] + ) + + warnings <- capture_warnings({ + result <- colocboost( + sumstat = sumstat_with_dup, + LD = test_sumstat_data$LD + ) + }) + + expect_true(any(grepl("Removed duplicate variants from sumstat 1", warnings))) + expect_s3_class(result, "colocboost") + # Should be back to original count + expect_equal(length(result$data_info$variables), ncol(test_sumstat_data$X)) +}) + +# Test 10: Handling of mismatched variants between sumstat and LD +test_that("colocboost handles variant mismatch between sumstat and LD", { + # Create LD with different variant names + LD_modified <- test_sumstat_data$LD + rownames(LD_modified) <- colnames(LD_modified) <- paste0("SNP", 11:30) + + warnings <- capture_warnings({ + result <- colocboost_validate_input_data( + sumstat = test_sumstat_data$sumstat, + LD = LD_modified + ) + }) + + # Should warn about removing variants + expect_true(any(grepl("removing.*variants since those variants are not in LD", warnings))) +}) + +# Test 11: Error when no common variants +test_that("colocboost errors with no common variants", { + # Create LD with completely different variant names + LD_no_overlap <- test_sumstat_data$LD + rownames(LD_no_overlap) <- colnames(LD_no_overlap) <- paste0("DIFF", 1:20) + + warnings <- capture_warnings( + colocboost_validate_input_data( + sumstat = test_sumstat_data$sumstat, + LD = LD_no_overlap + ) + ) + expect_true(any(grepl("removing.*variants since those variants are not in LD", warnings))) + expect_true(any(grepl("is empty after filtering", warnings))) +}) + diff --git a/tests/testthat/test_utils.R b/tests/testthat/test_utils.R index a622540..d3d480d 100644 --- a/tests/testthat/test_utils.R +++ b/tests/testthat/test_utils.R @@ -85,12 +85,12 @@ test_that("w_cs correctly identifies confidence set for weight vector", { result <- w_cs(w, coverage = 0.8) # Expected result for 80% coverage: w[1] + w[2] = 0.8, so first 2 elements should be 1 - expected <- c(1, 1, 0, 0) + expected <- c(1,2) expect_equal(result, expected) # Test with different coverage result2 <- w_cs(w, coverage = 0.9) - expected2 <- c(1, 1, 1, 1) # First 4 elements cover 90% since 3 and 4 with the same 0.1 weight + expected2 <- c(1,2,3,4) # First 4 elements cover 90% since 3 and 4 with the same 0.1 weight expect_equal(result2, expected2) }) @@ -490,7 +490,7 @@ test_that("get_merge_ordered_with_indices handles conflicting variable orders co test_that("get_merge_ordered_with_indices handles edge cases", { # Test with NULL inputs - expect_equal(get_merge_ordered_with_indices(list()), character(0)) + expect_equal(get_merge_ordered_with_indices(list()), NULL) # Test with single vector expect_equal(get_merge_ordered_with_indices(list(c("A", "B", "C"))), c("A", "B", "C")) diff --git a/vignettes/ColocBoost_Wrapper_Pipeline.Rmd b/vignettes/ColocBoost_Wrapper_Pipeline.Rmd index 6b5a5a4..800d2f7 100644 --- a/vignettes/ColocBoost_Wrapper_Pipeline.Rmd +++ b/vignettes/ColocBoost_Wrapper_Pipeline.Rmd @@ -22,259 +22,8 @@ This vignette demonstrates how to use the bioinformatics pipeline for ColocBoost - See more details about input data preparation in `xqtl_protocol` with [link](https://statfungen.github.io/xqtl-protocol/code/mnm_analysis/mnm_methods/colocboost.html). -# 1. Loading Data using `colocboost_analysis_pipeline` function +Step 1: Loading individual-level and summary statistics using `load_multitask_regional_data` function from multiple cohorts or datasets -This function harmonizes the input data and prepares it for colocalization analysis. +Step 2: Perform ColocBoost using `colocboost_analysis_pipeline` function -In this section, we introduce how to load the regional data required for the ColocBoost analysis using the `load_multitask_regional_data` function. -This function loads mixed datasets for a specific region, including individual-level data (genotype, phenotype, covariate data), summary statistics -(sumstats, LD), or a combination of both. It runs `load_regional_univariate_data` for each individual-level dataset and `load_rss_data` for each summary statistics dataset. -The output is a list with `individual_data` and `sumstat_data` components, where `individual_data` is a list of individual-level data and `sumstat_data` is a list of summary statistics data. -This list is then passed to the `colocboost_analysis_pipeline` function for the colocalization analysis. - - -Below are the input parameters for this function for loading individual-level data: - -## 1.1. Loading individual-level data from multiple cohorts - -inputs: -- **`region`**: String ; Genomic region of interest in the format of `chr:start-end` for the phenotype region you want to analyze. -- **`genotype_list`**: Character vector; Paths for PLINK bed files containing genotype data (do NOT include .bed suffix). -- **`phenotype_list`**: Character vector; Paths for phenotype file names. -- **`covariate_list`**: Character vector; Paths for covariate file names for each phenotype. Must have the same length as the phenotype file vector. -- **`conditions_list_individual`**: Character vector; Strings representing different conditions or groups used for naming. Must have the same length as the phenotype file vector. -- **`match_geno_pheno`**: Integer vector; Indices of phenotypes matched to genotype if multiple genotype PLINK files are used. For each phenotype file in `phenotype_list`, the index of the genotype file in `genotype_list` it matches with. -- **`association_window`**: String; Genomic region of interest in the format of `chr:start-end` for the association analysis window of variants to test (cis or trans). If not provided, all genotype data will be loaded. -- **`extract_region_name`**: List of character vectors; Phenotype names (e.g., gene ID `ENSG00000269699`) to subset the phenotype data when there are multiple phenotypes availible in the region. Must have the same length as the phenotype file vector. Default is `NULL`, which will use all phenotypes in the region. -- **`region_name_col`**: Integer; 1-based index of the column containing the region name (i.e. 4 for gene ID in a bed file). Required if `extract_region_name` is not `NULL`, or if multiple phenotypes fall into the same region in one phenotype file -- **`keep_indel`**: Logical; indicating whether to keep insertions/deletions (INDELs). Default is `TRUE`. -- **`keep_samples`**: Character vector; Sample names to keep. Default is `NULL`. Currently only supports keeping the same samples from all genotype and phenotype files. -- **`maf_cutoff`**: Numeric; Minimum minor allele frequency (MAF) cutoff. Default is 0. -- **`mac_cutoff`**: Numeric; Minimum minor allele count (MAC) cutoff. Default is 0. -- **`xvar_cutoff`**: Numeric; Minimum genotype variance cutoff. Default is 0. -- **`imiss_cutoff`**: Numeric; Maximum individual missingness cutoff. Default is 0. - -outputs: -- **`region_data`**: List (with `individual_data`, `sumstat_data`); Output of the `load_multitask_regional_data` function. If only individual-level data is loaded, `sumstat_data` will be `NULL`. - - -**Indivudual-level data loading example** - -The following example demonstrates how to set up input data with 3 phenotypes and 2 cohorts. The first cohort has 2 phenotypes and the second cohort has 1 phenotype. The first phenotype has 2 genes and the second phenotype has 1 gene. - -```{r, data-loader-individual} -# Example of loading individual-level data -region = "chr1:1000000-2000000" -genotype_list = c("plink_cohort1.1", "plink_cohort1.2") -phenotype_list = c("phenotype1_cohort1.bed.gz", "phenotype2_cohort1.bed.gz", "phenotype1_cohort2.bed.gz") -covariate_list = c("covariate1_cohort1.bed.gz", "covariate2_cohort1.bed.gz", "covariate1_cohort2.bed.gz") -conditions_list_individual = c("phenotype1_cohort1", "phenotype2_cohort1", "phenotype1_cohort2") -match_geno_pheno = c(1,1,2) -association_window = "chr1:1000000-2000000" # set to be the same as region for cis-analysis -extract_region_name = list(c("ENSG00000269699, ENSG00000789633"), c("ENSG00000269699"), c("ENSG00000269699", "ENSG00000789633")) -region_name_col = 4 -keep_indel = TRUE -keep_samples = c("SAMPLE1", "SAMPLE2", "SAMPLE3") - -# Following parameters need to be set according to your data -maf_cutoff = 0.01 -mac_cutoff = 10 -xvar_cutoff = 0 -imiss_cutoff = 0.9 - -# More advanced parameters see pecotmr::load_multitask_regional_data() - -region_data_individual <- load_multitask_regional_data( - region = region, - genotype_list = genotype_list, - phenotype_list = phenotype_list, - covariate_list = covariate_list, - conditions_list_individual = conditions_list_individual, - match_geno_pheno = match_geno_pheno, - association_window = association_window, - region_name_col = region_name_col, - extract_region_name = extract_region_name, - keep_indel = keep_indel, - keep_samples = keep_samples, - maf_cutoff = maf_cutoff, - mac_cutoff = mac_cutoff, - xvar_cutoff = xvar_cutoff, - imiss_cutoff = imiss_cutoff -) - -``` - - - -## 1.2. Loading summary statistics from multiple cohorts or datasets - -inputs: -- **`sumstat_path_list`**: Character vector; Paths to the summary statistics. -- **`column_file_path_list`**: Character vector; Paths to the column mapping files. See below for expected format. -- **`LD_meta_file_path_list`**: Character vector; Paths to LD metadata files. See below for expected format. -- **`conditions_list_sumstat`**: Character vector; Strings representing different sumstats used for naming. Must have the same length as the sumstat file vector. -- **`match_LD_sumstat`**: List of character vectors; Mapping each LD metadata file to the summary-statistics conditions to pair with it. Length must equal `LD_meta_file_path_list`. Each element is a character vector of names present in `conditions_list_sumstat`. If omitted or an element is empty, defaults to all conditions for the first LD. -- **`association_window`**: String; Genomic region of interest in the format of `chr:start-end` for the association analysis window of variants to test (cis or trans). If not provided, all genotype data will be loaded. -- **`n_samples`**: Integer vector; Sample size. Set a 0 if `n_cases`/`n_controls` are passed explicitly. If unknown, set as 0 and include `n_samples` column in the column mapping file to retrieve from the sumstat file. -- **`n_cases`**: Integer vector; Number of cases. Set a 0 if `n_samples` is passed explicitly. If unknown, set as 0 and include `n_cases` column in the column mapping file to retrieve from the sumstat file. -- **`n_controls`**: Integer vector; Number of controls. Set a 0 if `n_samples` is passed explicitly. If unknown, set as 0 and include `n_controls` column in the column mapping file to retrieve from the sumstat file. - -outputs: -- **`region_data`**: List (with `individual_data`, `sumstat_data`); Output of the `load_multitask_regional_data` function. If only summary statistics data is loaded, `individual_data` will be `NULL`. - -**Summary statistics loading example** - -The following example demonstrates how to set up input data with 2 summary statistics and one LD reference. - -```{r, data-loader-sumstat} -# Example of loading summary statistics -sumstat_path_list = c("sumstat1.tsv.gz", "sumstat2.tsv.gz") -column_file_path_list = c("column_mapping_sumstat1.yml", "column_mapping_sumstat2.yml") -LD_meta_file_path_list = c("ld_meta_file.tsv") -conditions_list_sumstat = c("sumstat_1", "sumstat_2") -match_LD_sumstat = c("sumstat_1", "sumstat_2") -association_window = "chr1:1000000-2000000" - -# Following parameters need to be set according to your data -n_samples = c(300000, 0) -n_cases = c(0, 20000) -n_controls = c(0, 40000) - - -# More advanced parameters see pecotmr::load_multitask_regional_data() - -region_data_sumstat <- load_multitask_regional_data( - sumstat_path_list = sumstat_path_list, - column_file_path_list = column_file_path_list, - LD_meta_file_path_list = LD_meta_file_path_list, - conditions_list_sumstat = conditions_list_sumstat, - match_LD_sumstat = match_LD_sumstat, - association_window = association_window, - n_samples = n_samples, - n_cases = n_cases, - n_controls = n_controls -) -``` - - - -**Expected format for column mapping file** -The column mapping file is YAML (`.yml`) with key: value pairs mapping your input column names to the standardized names expected by the loader. -Required columns are `chrom`, `pos`, `A1`, and `A2`, and either `z` or `beta` and `sebeta`. -Either 'n_case' and 'n_control' or 'n_samples' can be passed as part of the column mapping, but will be overwritten by the n_cases and n_controls or n_samples parameterspassed explicitly. -```yaml -# required -chrom: chromosome -pos: position -A1: effect_allele -A2: non_effect_allele - -z: zscore -# or -beta: beta -sebeta: sebeta - -# optional, will be overridden by n_samples or n_cases and n_controls if passed explicitly -n_case: NCASE -n_control: NCONTROL -# or -n_sample: N -``` - - -**Expected format for LD metadata file** -LD files sould be in the format generated by for instance `plink --r squared`, then xz compressed. -The LD metadata file is a tab-separated file with the following columns: -- `chrom`: chromosome -- `start`: start position -- `end`: end position -- `ld_path, bim_path`: path to the LD block file and bim file -``` -1 1000000 2000000 ld_block_1.ld.gz,ld_block_1.bim -``` - - -# 2. Perform ColocBoost using `colocboost_analysis_pipeline` function - -In this section, we load region data for a combination of individual-level and summary statistics data, then perform the colocalization analysis using the `colocboost_analysis_pipeline` function. -The colocalization analysis can be run in any one of three modes, or in a combination of these modes (names assume that individual-level data is xQTL data and summary statistics data is GWAS data): - -- **`xQTL-only mode`**: Only perform colocalization analysis on the individual-level data. Summary statistics data is not used. -- **`joint GWAS mode`**: Perform colocalization analysis in disease-agnostic mode on the individual-level and summary statistics data together. -- **`separate GWAS mode`**: Perform colocalization analysis in disease-prioritized mode on the the individual-level data and each summary statistics dataset separately, treating each summary statistics dataset as the focal trait. - -inputs: -- **`region_data`**: List (with `individual_data`, `sumstat_data`); Output of the `load_multitask_regional_data` function. -- **`focal_trait`**: String; For xQTL-only mode, the name of the trait to perform disease-prioritized ColocBoost, from `conditions_list_individual`. If not provided, xQTL-only mode will be run without disease-prioritized mode. -- **`event_filters`**: List of character vectors; Patterns for filtering events based on context names. -Example: for sQTL, `list(type_pattern = ".*clu_(\\d+_[+-?]).*", valid_pattern = "clu_(\\d+_[+-?]):PR:", exclude_pattern = "clu_(\\d+_[+-?]):IN:")`. -- **`maf_cutoff`**: Numeric; Minor allele frequency cutoff. Default is 0.005. -- **`pip_cutoff_to_skip_ind`**: Integer vector; Cutoff values for skipping analysis based on pre-screening with single-effect SuSiE (L=1). Context is skipped if none of the variants in the context have PIP values greater than the cutoff. Default is 0 (does not run single-effect SuSiE). Passing a negative value sets the cutoff to 3/number of variants. -- **`pip_cutoff_to_skip_sumstat`**: Integer vector; Cutoff values for skipping analysis based on pre-screening with single-effect SuSiE (L=1). Sumstat is skipped if none of the variants in the sumstat have PIP values greater than the cutoff. Default is 0 (does not run single-effect SuSiE). Passing a negative value sets the cutoff to 3/number of variants. -- **`qc_method`**: String; Quality control method to use. Options are "rss_qc", "dentist", or "slalom". Default is `rss_qc`. -- **`impute`**: Logical; if TRUE, performs imputation for outliers identified in the analysis. Default is `TRUE`. -- **`impute_opts`**: List of lists; Imputation options including rcond, R2_threshold, and minimum_ld. Default is `list(rcond = 0.01, R2_threshold = 0.6, minimum_ld = 5)`. -- **`xqtl_coloc`**: Logical; if TRUE, performs xQTL-only mode. Default is `TRUE`. -- **`joint_gwas`**: Logical; if TRUE, performs joint GWAS mode, mapping all individual-level and sumstat data together.Default is `FALSE`. -- **`separate_gwas`**: Logical; if TRUE, runs separate GWAS mode, where each sumstat dataset is analyzed separately with all individual-level data, treating each sumstat as the focal trait in disease-prioritized mode. Default is `FALSE`. - -outputs: -- **`colocboost_results`**: List of colocboost objects (with `xqtl_coloc`, `joint_gwas`, `separate_gwas`); Output of the `colocboost_analysis_pipeline` function. If the mode is not run, the corresponding element will be `NULL`. - -```{r, colocboost-analysis} -# load in individual-level and sumstat data -region_data_combined <- load_multitask_regional_data( - region = region, - genotype_list = genotype_list, - phenotype_list = phenotype_list, - covariate_list = covariate_list, - conditions_list_individual = conditions_list_individual, - match_geno_pheno = match_geno_pheno, - association_window = association_window, - region_name_col = region_name_col, - extract_region_name = extract_region_name, - keep_indel = keep_indel, - keep_samples = keep_samples, - maf_cutoff = maf_cutoff, - mac_cutoff = mac_cutoff, - xvar_cutoff = xvar_cutoff, - imiss_cutoff = imiss_cutoff, - sumstat_path_list = sumstat_path_list, - column_file_path_list = column_file_path_list, - LD_meta_file_path_list = LD_meta_file_path_list, - conditions_list_sumstat = conditions_list_sumstat, - match_LD_sumstat = match_LD_sumstat, - n_samples = n_samples, - n_cases = n_cases, - n_controls = n_controls -) - -maf_cutoff = 0.01 -pip_cutoff_to_skip_ind = rep(0, length(phenotype_list)) -pip_cutoff_to_skip_sumstat = rep(0, length(sumstat_path_list)) -qc_method = "rss_qc" - -# run colocboost analysis -colocboost_results <- colocboost_analysis_pipeline( - region_data_combined, - maf_cutoff = maf_cutoff, - pip_cutoff_to_skip_ind = pip_cutoff_to_skip_ind, - pip_cutoff_to_skip_sumstat = pip_cutoff_to_skip_sumstat, - qc_method = qc_method, - xqtl_coloc = TRUE, - joint_gwas = TRUE, - separate_gwas = TRUE -) - -# visualize results for xQTL-only mode -colocboost_plot(colocboost_results$xqtl_coloc) - -# visualize results for joint GWAS mode -colocboost_plot(colocboost_results$joint_gwas) - -# visualize results for separate GWAS mode -for (i in 1:length(colocboost_results$separate_gwas)) { - colocboost_plot(colocboost_results$separate_gwas[[i]]) -} -``` \ No newline at end of file diff --git a/vignettes/Visualization_ColocBoost_Output.Rmd b/vignettes/Visualization_ColocBoost_Output.Rmd index 1ffd7d9..8ed3e5b 100644 --- a/vignettes/Visualization_ColocBoost_Output.Rmd +++ b/vignettes/Visualization_ColocBoost_Output.Rmd @@ -94,9 +94,11 @@ There are three options available for plotting the CoS variants to uncolocalized colocboost_plot(res, show_cos_to_uncoloc = TRUE) ``` -### 2.4. Plot with an added vertical line +### 2.4. Plot with added highlight points -You can add a vertical line to the plot by setting `add_vertical = TRUE` and `add_vertical_idx = **`. This will add a vertical line at the specified index. +You can highlight specific variants in the plot by setting `add_highlight = TRUE` and `add_highlight_idx = **`. +This will add red dashed vertical lines (defult `add_highlight_style = "vertical_lines"`) at the specified index you want to highlight. +Alternatively, you can use `add_highlight_style = "star"` to change the highlight style to the red star for the specified variants. For example, to add a vertical line at true causal variants, you can set `add_vertical_idx = unique(unlist(Ind_5traits$true_effect_variants))`. Following plot also shows the top variants. @@ -104,8 +106,9 @@ Following plot also shows the top variants. ```{r vertical-plot} colocboost_plot( res, show_top_variables = TRUE, - add_vertical = TRUE, - add_vertical_idx = unique(unlist(Ind_5traits$true_effect_variants)) + add_highlight = TRUE, + add_highlight_idx = unique(unlist(Ind_5traits$true_effect_variants)), + add_highlight_style = "star" ) ```