diff --git a/R/colocboost.R b/R/colocboost.R
index 81e8490..aba9cdf 100644
--- a/R/colocboost.R
+++ b/R/colocboost.R
@@ -195,6 +195,181 @@ colocboost <- function(X = NULL, Y = NULL, # individual data
     return(NULL)
   }
 
+  # - check input data: individual level data and summary-level data
+  validated_data <- colocboost_validate_input_data(
+    X = X, Y = Y,
+    sumstat = sumstat, LD = LD,
+    dict_YX = dict_YX, dict_sumstatLD = dict_sumstatLD,
+    effect_est = effect_est, effect_se = effect_se, effect_n = effect_n,
+    overlap_variables = overlap_variables,
+    M = M, min_abs_corr = min_abs_corr
+  )
+  if (is.null(validated_data)) {
+    return(NULL)
+  }
+  # Extract validated data
+  X <- validated_data$X
+  Y <- validated_data$Y
+  yx_dict <- validated_data$yx_dict
+  keep_variable_individual <- validated_data$keep_variable_individual
+  sumstat <- validated_data$sumstat
+  LD <- validated_data$LD
+  sumstatLD_dict <- validated_data$sumstatLD_dict
+  keep_variable_sumstat <- validated_data$keep_variable_sumstat
+  Z <- validated_data$Z
+  N_sumstat <- validated_data$N_sumstat
+  Var_y <- validated_data$Var_y
+  SeBhat <- validated_data$SeBhat
+  
+  # Update parameters if LD was not provided
+  M <- validated_data$M
+  min_abs_corr <- validated_data$min_abs_corr
+  jk_equiv_corr <- validated_data$jk_equiv_corr
+  jk_equiv_loglik <- validated_data$jk_equiv_loglik
+  func_simplex <- validated_data$func_simplex
+  
+  # - initial colocboost object
+  keep_variables <- c(keep_variable_individual, keep_variable_sumstat)
+  overlapped_variables <- Reduce("intersect", keep_variables)
+  mean_variables <- mean(sapply(keep_variables, length))
+  min_variables <- min(sapply(keep_variables, length))
+  if (min_variables < 100) {
+    warning(
+      "Warning message about the number of variables.\n",
+      "The smallest number of variables across outcomes is ", min_variables, " < 100.",
+      " If this is what you expected, this is not a problem.",
+      " If this is not what you expected, please check input data."
+    )
+  }
+  if (length(overlapped_variables) <= 1) {
+    warning(
+      "Error: No or only 1 overlapping variables were found across all outcomes, colocalization cannot be performed. ",
+      "Please verify the variable names across different outcomes."
+    )
+    return(NULL)
+  } else if ((length(overlapped_variables) / mean_variables) < 0.1) {
+    warning(
+      "Warning message about the overlapped variables.\n",
+      "The average number of variables across outcomes is ", mean_variables,
+      ". But only ", length(overlapped_variables), " number of variables overlapped (<10%).\n",
+      " If this is what you expected, this is not a problem.",
+      " If this is not what you expected, please check if the variable name matched across outcomes."
+    )
+  }
+  cb_data <- colocboost_init_data(
+    X = X, Y = Y, dict_YX = yx_dict,
+    Z = Z, LD = LD, N_sumstat = N_sumstat, dict_sumstatLD = sumstatLD_dict,
+    Var_y = Var_y, SeBhat = SeBhat,
+    keep_variables = keep_variables,
+    focal_outcome_idx = focal_outcome_idx,
+    focal_outcome_variables = focal_outcome_variables,
+    overlap_variables = overlap_variables,
+    intercept = intercept,
+    standardize = standardize,
+    residual_correlation = residual_correlation
+  )
+
+  ##################  colocboost updates   ###################################
+  message("Starting gradient boosting algorithm.")
+  cb_obj <- colocboost_workhorse(cb_data,
+    M = M,
+    prioritize_jkstar = prioritize_jkstar,
+    tau = tau,
+    learning_rate_init = learning_rate_init,
+    learning_rate_decay = learning_rate_decay,
+    func_simplex = func_simplex,
+    lambda = lambda,
+    lambda_focal_outcome = lambda_focal_outcome,
+    stop_thresh = stop_thresh,
+    func_multi_test = func_multi_test,
+    stop_null = stop_null,
+    multi_test_max = multi_test_max,
+    multi_test_thresh = multi_test_thresh,
+    ash_prior = ash_prior,
+    p.adjust.methods = p.adjust.methods,
+    jk_equiv_corr = jk_equiv_corr,
+    jk_equiv_loglik = jk_equiv_loglik,
+    func_compare = func_compare,
+    coloc_thresh = coloc_thresh,
+    LD_free = LD_free,
+    dynamic_learning_rate = dynamic_learning_rate,
+    focal_outcome_idx = focal_outcome_idx,
+    outcome_names = outcome_names
+  )
+
+  # --- post-processing of the colocboost updates
+  message("Performing inference on colocalization events.")
+  cb_output <- colocboost_assemble(cb_obj,
+    coverage = coverage,
+    weight_fudge_factor = weight_fudge_factor,
+    check_null = check_null,
+    check_null_method = check_null_method,
+    check_null_max = check_null_max,
+    check_null_max_ucos = check_null_max_ucos,
+    dedup = dedup,
+    overlap = overlap,
+    n_purity = n_purity,
+    min_abs_corr = min_abs_corr,
+    sec_coverage_thresh = sec_coverage_thresh,
+    median_abs_corr = median_abs_corr,
+    min_cluster_corr = min_cluster_corr,
+    median_cos_abs_corr = median_cos_abs_corr,
+    weaker_effect = weaker_effect,
+    merge_cos = merge_cos,
+    tol = tol,
+    output_level = output_level
+  )
+  class(cb_output) <- "colocboost"
+  return(cb_output)
+}
+
+
+
+
+#' @title Validate and Process All Input Data for ColocBoost
+#'
+#' @description Internal function to validate and process both individual-level and summary-level input data
+#'
+#' @param X A list of genotype matrices for different outcomes, or a single matrix if all outcomes share the same genotypes.
+#' @param Y A list of vectors of outcomes or an N by L matrix if it is considered for the same X and multiple outcomes.
+#' @param sumstat A list of data.frames of summary statistics.
+#' @param LD A list of correlation matrices indicating the LD matrix for each genotype.
+#' @param dict_YX A L by 2 matrix of dictionary for X and Y if there exist subsets of outcomes corresponding to the same X matrix.
+#' @param dict_sumstatLD A L by 2 matrix of dictionary for sumstat and LD if there exist subsets of outcomes corresponding to the same sumstat.
+#' @param effect_est Matrix of variable regression coefficients (i.e. regression beta values) in the genomic region
+#' @param effect_se Matrix of standard errors associated with the beta values
+#' @param effect_n A scalar or a vector of sample sizes for estimating regression coefficients.
+#' @param overlap_variables If overlap_variables = TRUE, only perform colocalization in the overlapped region.
+#' @param M The maximum number of gradient boosting rounds for each outcome (default is 500).
+#' @param min_abs_corr Minimum absolute correlation allowed in a confidence set.
+#'
+#' @return A list containing:
+#'   \item{X}{Processed list of genotype matrices}
+#'   \item{Y}{Processed list of phenotype vectors}
+#'   \item{yx_dict}{Dictionary mapping Y to X}
+#'   \item{keep_variable_individual}{List of variable names for each X matrix}
+#'   \item{sumstat}{Processed list of summary statistics data.frames}
+#'   \item{LD}{Processed list of LD matrices}
+#'   \item{sumstatLD_dict}{Dictionary mapping sumstat to LD}
+#'   \item{keep_variable_sumstat}{List of variant names for each sumstat}
+#'   \item{Z}{List of z-scores for each outcome}
+#'   \item{N_sumstat}{List of sample sizes for each outcome}
+#'   \item{Var_y}{List of phenotype variances for each outcome}
+#'   \item{SeBhat}{List of standard errors for each outcome}
+#'   \item{M_updated}{Updated M value (may be changed if LD not provided)}
+#'   \item{min_abs_corr_updated}{Updated min_abs_corr value (may be changed if LD not provided)}
+#'   \item{jk_equiv_corr_updated}{Updated jk_equiv_corr value}
+#'   \item{jk_equiv_loglik_updated}{Updated jk_equiv_loglik value}
+#'   \item{func_simplex_updated}{Updated func_simplex value}
+#'
+#' @keywords internal
+colocboost_validate_input_data <- function(X = NULL, Y = NULL,
+                                           sumstat = NULL, LD = NULL,
+                                           dict_YX = NULL, dict_sumstatLD = NULL,
+                                           effect_est = NULL, effect_se = NULL, effect_n = NULL,
+                                           overlap_variables = FALSE,
+                                           M = 500, min_abs_corr = 0.5) {
+  
   # - check individual level data
   if (!is.null(X) & !is.null(Y)) {
     # --- check input
@@ -247,7 +422,17 @@ colocboost <- function(X = NULL, Y = NULL, # individual data
         return(xx)
       })
     }
-
+    
+    # Remove duplicates and report: if duplicate columns of X
+    X <- lapply(seq_along(X), function(i) {
+      df <- X[[i]]
+      if (anyDuplicated(colnames(df))) {
+        message(paste("Removed duplicate columns from X matrix ", i))
+        df[, !duplicated(colnames(df)), drop = FALSE]
+      } else {
+        df
+      }
+    })
     keep_variable_individual <- lapply(X, colnames)
     if (!is.list(X) & !is.list(Y)) {
       warning("Error: Input X and Y must be the list containing genotype matrics and all phenotype vectors!")
@@ -324,7 +509,7 @@ colocboost <- function(X = NULL, Y = NULL, # individual data
   } else {
     yx_dict <- keep_variable_individual <- NULL
   }
-
+  
   # - check summary-level data
   if ((!is.null(sumstat)) | (!is.null(effect_est) & !is.null(effect_se))) {
     # --- check input of (effect_est, effect_se)
@@ -333,11 +518,19 @@ colocboost <- function(X = NULL, Y = NULL, # individual data
         warning("Error: effect_est and effect_se should be the same dimension! Please check!")
         return(NULL)
       }
-      variables <- rownames(effect_est)
       effect_est <- as.matrix(effect_est)
       effect_se <- as.matrix(effect_se)
+      variables <- rownames(effect_est)
       if (is.null(variables)) {
         variables <- paste0("variant_", 1:nrow(effect_est))
+      } else {
+        # add - if duplciates occurs, only keep one
+        if (anyDuplicated(variables)) {
+          keep_idx <- !duplicated(variables)
+          variables <- variables[keep_idx]
+          effect_est <- effect_est[keep_idx, , drop = FALSE]
+          effect_se <- effect_se[keep_idx, , drop = FALSE]
+        }
       }
       sumstat <- list()
       for (sum_iy in 1:ncol(effect_est)) {
@@ -358,7 +551,7 @@ colocboost <- function(X = NULL, Y = NULL, # individual data
         sumstat[[sum_iy]] <- sumstat_y
       }
     }
-
+    
     if (!is.null(sumstat)) {
       if (is.data.frame(sumstat)) {
         sumstat <- list(sumstat)
@@ -367,6 +560,7 @@ colocboost <- function(X = NULL, Y = NULL, # individual data
         warning("Error: Input sumstat must be the list containing summary level data for all outcomes!")
         return(NULL)
       }
+      sumstat <- lapply(sumstat, as.data.frame)
       # --- check if variables names in summary data
       variable.tmp <- sapply(sumstat, function(xx) {
         if (("variant" %in% colnames(xx))) {
@@ -388,11 +582,33 @@ colocboost <- function(X = NULL, Y = NULL, # individual data
         })
       }
     }
-    keep_variable_sumstat <- lapply(sumstat, function(xx) {
-      xx$variant
+    
+    # Remove NA for sumstat$variant columns - add on
+    sumstat <- lapply(seq_along(sumstat), function(i) {
+      xx <- sumstat[[i]]
+      if (anyNA(xx$variant)) {
+        warning(paste("Removed variant with NA from sumstat", i))
+        xx = as.data.frame(xx[!is.na(xx$variant), , drop = FALSE])
+      }
+      return(xx)
     })
-
+    # Remove duplicates and report: if duplicate variant in summary statistics
+    sumstat <- lapply(seq_along(sumstat), function(i) {
+      xx <- sumstat[[i]]
+      if (anyDuplicated(xx$variant)) {
+        warning(paste("Removed duplicate variants from sumstat", i))
+        xx = as.data.frame(xx[!duplicated(xx$variant), , drop = FALSE])
+      }
+      return(xx)
+    })
+    
     # --- check input of LD
+    M_updated <- M
+    min_abs_corr_updated <- min_abs_corr
+    jk_equiv_corr_updated <- 0.8
+    jk_equiv_loglik_updated <- 1
+    func_simplex_updated <- "LD_z2z"
+    
     if (is.null(LD)) {
       # if no LD input, set diagonal matrix to LD
       warning(
@@ -404,55 +620,84 @@ colocboost <- function(X = NULL, Y = NULL, # individual data
       LD <- 1
       sumstatLD_dict <- rep(1, length(sumstat))
       # change some algorithm parameters
-      M <- 1 # one iteration
-      min_abs_corr <- 0 # remove purity checking
-      jk_equiv_corr <- 0
-      jk_equiv_loglik <- 0.1
-      func_simplex <- "only_z2z"
+      M_updated <- 1 # one iteration
+      min_abs_corr_updated <- 0 # remove purity checking
+      jk_equiv_corr_updated <- 0
+      jk_equiv_loglik_updated <- 0.1
+      func_simplex_updated <- "only_z2z"
+      
     } else {
-      if (is.data.frame(LD)) {
-        LD <- as.matrix(LD)
-      }
-      if (is.matrix(LD)) {
-        LD <- list(LD)
-      }
+      
+      if (is.data.frame(LD)) LD <- as.matrix(LD)
+      if (is.matrix(LD)) LD <- list(LD)
       # - check if NA in LD matrix
       num_na <- sapply(LD, sum)
       if (any(is.na(num_na))){
         warning("Error: Input LD must not contain missing values (NA).")
         return(NULL)
       }
+      # Create sumstat-LD mapping ===
       if (length(LD) == 1) {
         sumstatLD_dict <- rep(1, length(sumstat))
       } else if (length(LD) == length(sumstat)) {
-        sumstatLD_dict <- 1:length(sumstat)
+        sumstatLD_dict <- seq_along(sumstat)
       } else {
         if (is.null(dict_sumstatLD)) {
-          warning("Error: Please provide the dict_sumstatLD since you have multiple sumstat but only few LD!")
+          warning('Error: Please provide dict_sumstatLD: you have ', length(sumstat), 
+                  ' sumstats but only ', length(LD), ' LD matrices')
           return(NULL)
-        } else {
-          # - dict for sumstat to LD mapping
-          sumstatLD_dict <- rep(NA, length(sumstat))
-          for (i in 1:length(sumstat)) {
-            tmp <- unique(dict_sumstatLD[dict_sumstatLD[, 1] == i, 2])
-            if (length(tmp) == 0) {
-              warning(paste("Error: You don't provide matched LD for sumstat", i))
-              return(NULL)
-            } else if (length(tmp) != 1) {
-              warning(paste("Error: You provide multiple matched LD for sumstat", i))
-              return(NULL)
-            } else {
-              sumstatLD_dict[i] <- tmp
-            }
+        }
+        if (length(dict_sumstatLD) != length(sumstat)) {
+          warning('Error: dict_sumstatLD must have length ', length(sumstat))
+          return(NULL)
+        }
+        if (any(is.na(dict_sumstatLD))) {
+          warning('Error: dict_sumstatLD contains NA values')
+          return(NULL)
+        }
+        if (any(dict_sumstatLD < 1) || any(dict_sumstatLD > length(LD))) {
+          warning('Error: dict_sumstatLD values must be between 1 and ', length(LD))
+          return(NULL)
+        }
+        sumstatLD_dict <- as.integer(dict_sumstatLD)
+      }
+      
+      # === Filter variants for each sumstat ===
+      for (i in seq_along(sumstat)) {
+        # Get sumstat variants (adjust field name based on your data structure)
+        sumstat_variants <- sumstat[[i]]$variant
+        n_total <- length(sumstat_variants)
+        # Get LD variants
+        ld_idx <- sumstatLD_dict[i]
+        current_ld <- LD[[ld_idx]]
+        ld_variants <- rownames(current_ld)
+        if (is.null(ld_variants)) {
+          if (ncol(current_ld) != n_total){
+            warning('Error: LD matrix ', ld_idx, ' has no rownames. Please ensure all LD matrices have variant names as rownames.')
+            return(NULL)
           }
-          if (max(sumstatLD_dict) > length(LD)) {
-            warning("Error: You don't provide enough LD matrices!")
+        }
+        # Find common variants
+        common_variants <- intersect(sumstat_variants, ld_variants)
+        n_removed <- n_total - length(common_variants)
+        # Filter if needed
+        if (n_removed > 0) {
+          warning('Sumstat ', i, ': removing ', n_removed, ' out of ', n_total,
+                  ' variants since those variants are not in LD matrix ', ld_idx)
+          keep_idx <- match(common_variants, sumstat_variants)
+          if (length(keep_idx) == 0){
+            warning('Error: Sumstat data ', i, ' is empty after filtering. Returning NULL')
             return(NULL)
           }
+          # Filter all relevant fields - ADJUST THESE FIELD NAMES TO YOUR DATA
+          sumstat[[i]] <- sumstat[[i]][keep_idx, , drop = FALSE]
         }
       }
     }
-
+    keep_variable_sumstat <- lapply(sumstat, function(xx) {
+      xx$variant
+    })
+    
     # - checking sample size existency
     n_exist <- sapply(sumstat, function(ss) {
       "n" %in% colnames(ss)
@@ -469,7 +714,7 @@ colocboost <- function(X = NULL, Y = NULL, # individual data
         "Outcome ", paste(p_no, collapse = ", "), " in sumstat don't contain 'n'!"
       )
     }
-
+    
     Z <- N_sumstat <- Var_y <- SeBhat <- vector(mode = "list", length = length(sumstat))
     for (i.summstat in 1:length(sumstat)) {
       summstat_tmp <- sumstat[[i.summstat]]
@@ -499,7 +744,7 @@ colocboost <- function(X = NULL, Y = NULL, # individual data
         warning(paste("summary statistic dataset", i.summstat, "contains NA values that are replaced with 0"))
         z[is.na(z)] <- 0
       }
-
+      
       # - check N
       if (!("n" %in% colVar)) {
         z <- z
@@ -512,7 +757,7 @@ colocboost <- function(X = NULL, Y = NULL, # individual data
         }
         n <- median(n)
         N_sumstat[[i.summstat]] <- n
-
+        
         # When n is provided, compute the adjusted z-scores.
         z <- z * sqrt((n - 1) / (z^2 + n - 2))
         if ("var_y" %in% colVar) {
@@ -532,99 +777,31 @@ colocboost <- function(X = NULL, Y = NULL, # individual data
     }
   } else {
     Z <- N_sumstat <- Var_y <- SeBhat <- sumstatLD_dict <- keep_variable_sumstat <- NULL
+    M_updated <- M
+    min_abs_corr_updated <- min_abs_corr
+    jk_equiv_corr_updated <- 0.8
+    jk_equiv_loglik_updated <- 1
+    func_simplex_updated <- "LD_z2z"
   }
-  # - initial colocboost object
-  keep_variables <- c(keep_variable_individual, keep_variable_sumstat)
-  overlapped_variables <- Reduce("intersect", keep_variables)
-  mean_variables <- mean(sapply(keep_variables, length))
-  min_variables <- min(sapply(keep_variables, length))
-  if (min_variables < 100) {
-    warning(
-      "Warning message about the number of variables.\n",
-      "The smallest number of variables across outcomes is ", min_variables, " < 100.",
-      " If this is what you expected, this is not a problem.",
-      " If this is not what you expected, please check input data."
-    )
-  }
-  if (length(overlapped_variables) <= 1) {
-    warning(
-      "Error: No or only 1 overlapping variables were found across all outcomes, colocalization cannot be performed. ",
-      "Please verify the variable names across different outcomes."
-    )
-    return(NULL)
-  } else if ((length(overlapped_variables) / mean_variables) < 0.1) {
-    warning(
-      "Warning message about the overlapped variables.\n",
-      "The average number of variables across outcomes is ", mean_variables,
-      ". But only ", length(overlapped_variables), " number of variables overlapped (<10%).\n",
-      " If this is what you expected, this is not a problem.",
-      " If this is not what you expected, please check if the variable name matched across outcomes."
-    )
-  }
-  cb_data <- colocboost_init_data(
-    X = X, Y = Y, dict_YX = yx_dict,
-    Z = Z, LD = LD, N_sumstat = N_sumstat, dict_sumstatLD = sumstatLD_dict,
-    Var_y = Var_y, SeBhat = SeBhat,
-    keep_variables = keep_variables,
-    focal_outcome_idx = focal_outcome_idx,
-    focal_outcome_variables = focal_outcome_variables,
-    overlap_variables = overlap_variables,
-    intercept = intercept,
-    standardize = standardize,
-    residual_correlation = residual_correlation
-  )
-
-  ##################  colocboost updates   ###################################
-  message("Starting gradient boosting algorithm.")
-  cb_obj <- colocboost_workhorse(cb_data,
-    M = M,
-    prioritize_jkstar = prioritize_jkstar,
-    tau = tau,
-    learning_rate_init = learning_rate_init,
-    learning_rate_decay = learning_rate_decay,
-    func_simplex = func_simplex,
-    lambda = lambda,
-    lambda_focal_outcome = lambda_focal_outcome,
-    stop_thresh = stop_thresh,
-    func_multi_test = func_multi_test,
-    stop_null = stop_null,
-    multi_test_max = multi_test_max,
-    multi_test_thresh = multi_test_thresh,
-    ash_prior = ash_prior,
-    p.adjust.methods = p.adjust.methods,
-    jk_equiv_corr = jk_equiv_corr,
-    jk_equiv_loglik = jk_equiv_loglik,
-    func_compare = func_compare,
-    coloc_thresh = coloc_thresh,
-    LD_free = LD_free,
-    dynamic_learning_rate = dynamic_learning_rate,
-    focal_outcome_idx = focal_outcome_idx,
-    outcome_names = outcome_names
-  )
-
-  # --- post-processing of the colocboost updates
-  message("Performing inference on colocalization events.")
-  cb_output <- colocboost_assemble(cb_obj,
-    coverage = coverage,
-    weight_fudge_factor = weight_fudge_factor,
-    check_null = check_null,
-    check_null_method = check_null_method,
-    check_null_max = check_null_max,
-    check_null_max_ucos = check_null_max_ucos,
-    dedup = dedup,
-    overlap = overlap,
-    n_purity = n_purity,
-    min_abs_corr = min_abs_corr,
-    sec_coverage_thresh = sec_coverage_thresh,
-    median_abs_corr = median_abs_corr,
-    min_cluster_corr = min_cluster_corr,
-    median_cos_abs_corr = median_cos_abs_corr,
-    weaker_effect = weaker_effect,
-    merge_cos = merge_cos,
-    tol = tol,
-    output_level = output_level
-  )
-  class(cb_output) <- "colocboost"
-  return(cb_output)
+  
+  return(list(
+    X = X,
+    Y = Y,
+    yx_dict = yx_dict,
+    keep_variable_individual = keep_variable_individual,
+    sumstat = sumstat,
+    LD = LD,
+    sumstatLD_dict = sumstatLD_dict,
+    keep_variable_sumstat = keep_variable_sumstat,
+    Z = Z,
+    N_sumstat = N_sumstat,
+    Var_y = Var_y,
+    SeBhat = SeBhat,
+    M = M_updated,
+    min_abs_corr = min_abs_corr_updated,
+    jk_equiv_corr = jk_equiv_corr_updated,
+    jk_equiv_loglik = jk_equiv_loglik_updated,
+    func_simplex = func_simplex_updated
+  ))
 }
 
diff --git a/R/colocboost_assemble.R b/R/colocboost_assemble.R
index aba5405..94cbbe5 100644
--- a/R/colocboost_assemble.R
+++ b/R/colocboost_assemble.R
@@ -58,7 +58,7 @@ colocboost_assemble <- function(cb_obj,
     )
     # - save model and all coloc and single information for diagnostic
     if (output_level != 1) {
-      tmp <- get_full_output(cb_obj = cb_obj, past_out = NULL, variables = NULL)
+      tmp <- get_full_output(cb_obj = cb_obj, past_out = NULL, variables = data_info$variables)
       if (output_level == 2) {
         cb_output <- c(cb_output, list("ucos_details" = tmp$ucos_detials))
         cb_output <- cb_output[c("cos_summary", "vcp", "cos_details", "data_info", "model_info", "ucos_details")]
@@ -79,6 +79,7 @@ colocboost_assemble <- function(cb_obj,
     } else if (cb_obj$cb_model_para$model_used == "one_causal"){
       # fixme later
       check_null_max <- check_null_max * check_null
+      check_null_max_ucos <- check_null_max_ucos * check_null
     }
     cb_obj <- get_max_profile(cb_obj, check_null_max = check_null_max, 
                               check_null_max_ucos = check_null_max_ucos, 
@@ -209,6 +210,9 @@ colocboost_assemble <- function(cb_obj,
     # - colocalization results
     cb_obj$cb_model_para$weight_fudge_factor <- weight_fudge_factor
     cb_obj$cb_model_para$coverage <- coverage
+    cb_obj$cb_model_para$min_abs_corr <- min_abs_corr
+    cb_obj$cb_model_para$median_abs_corr <- median_abs_corr
+    cb_obj$cb_model_para$n_purity <- n_purity
     cos_results <- get_cos_details(cb_obj, coloc_out = past_out$cos$cos, data_info = data_info)
     cb_output <- list(
       "vcp" = cos_results$vcp,
diff --git a/R/colocboost_check_update_jk.R b/R/colocboost_check_update_jk.R
index 623faa8..d449b03 100644
--- a/R/colocboost_check_update_jk.R
+++ b/R/colocboost_check_update_jk.R
@@ -66,15 +66,15 @@ boost_check_update_jk_nofocal <- function(cb_model, cb_model_para, cb_data) {
 
     # -- define jk and jk_each
     cor_vals_each <- do.call(cbind, lapply(model_update, function(cc) cc$correlation))
-    abs_cor_vals_each <- sweep(abs(cor_vals_each), 2, adj_dep, `*`)
+    # abs_cor_vals_each <- sweep(abs(cor_vals_each), 2, adj_dep, `*`)
+    abs_cor_vals_each <- abs(cor_vals_each) * rep(adj_dep, each = nrow(cor_vals_each))
     abs_cor_vals <- rowSums(abs_cor_vals_each)
 
-    jk <- which(abs_cor_vals == max(abs_cor_vals))
-    jk <- ifelse(length(jk) == 1, jk, sample(jk, 1))
-    jk_each <- apply(abs_cor_vals_each, 2, function(temp) {
-      jk_temp <- which(temp == max(temp))
-      return(ifelse(length(jk_temp) == 1, jk_temp, sample(jk_temp, 1)))
-    })
+    # jk <- which(abs_cor_vals == max(abs_cor_vals))
+    # jk <- ifelse(length(jk) == 1, jk, sample(jk, 1))
+    # jk_each <- apply(abs_cor_vals_each, 2, which.max)
+    jk <- which.max(abs_cor_vals)
+    jk_each <- sapply(1:ncol(abs_cor_vals_each), function(j) which.max(abs_cor_vals_each[, j]))
     update_jk[c(1, pos.update + 1)] <- c(jk, jk_each)
 
     
@@ -142,7 +142,7 @@ boost_check_update_jk_nofocal <- function(cb_model, cb_model_para, cb_data) {
         update_status[pos.update[true_update]] <- 1
         # - re-define jk_star
         abs_cor_vals_redefine <- rowSums(abs_cor_vals_each[, true_update, drop = FALSE])
-        jk_redefine <- which(abs_cor_vals_redefine == max(abs_cor_vals_redefine))
+        jk_redefine <- which.max(abs_cor_vals_redefine == max(abs_cor_vals_redefine))
         jk_redefine <- ifelse(length(jk_redefine) == 1, jk_redefine, sample(jk_redefine, 1))
         real_update_jk[pos.update[true_update]] <- jk_redefine
       } else {
@@ -351,11 +351,10 @@ boost_check_update_jk_focal <- function(cb_model, cb_model_para, cb_data,
 
     # -- define jk and jk_each
     cor_vals_each <- do.call(cbind, lapply(model_update, function(cc) as.vector(cc$correlation)))
-    abs_cor_vals_each <- sweep(abs(cor_vals_each), 2, adj_dep, `*`)
-    jk_each <- apply(abs_cor_vals_each, 2, function(temp) {
-      jk_temp <- which(temp == max(temp))
-      return(ifelse(length(jk_temp) == 1, jk_temp, sample(jk_temp, 1)))
-    })
+    # abs_cor_vals_each <- sweep(abs(cor_vals_each), 2, adj_dep, `*`)
+    abs_cor_vals_each <- abs(cor_vals_each) * rep(adj_dep, each = nrow(cor_vals_each))
+    # jk_each <- apply(abs_cor_vals_each, 2, which.max)
+    jk_each <- sapply(1:ncol(abs_cor_vals_each), function(j) which.max(abs_cor_vals_each[, j]))
     pp_focal <- which(pos.update == focal_outcome_idx)
     jk_focal <- jk_each[pp_focal]
 
@@ -503,8 +502,8 @@ get_LD_jk1_jk2 <- function(jk1, jk2,
     if (length(XtX) == 1){
       LD_temp <- 0
     } else {
-      jk1.remain <- which(remain_jk == jk1)
-      jk2.remain <- which(remain_jk == jk2)
+      jk1.remain <- match(jk1, remain_jk)
+      jk2.remain <- match(jk2, remain_jk)
       LD_temp <- XtX[jk1.remain, jk2.remain]
     }
   }
@@ -545,29 +544,59 @@ check_pair_jkeach <- function(jk_each,
                               cb_data, X_dict,
                               jk_equiv_corr = 0.8,
                               jk_equiv_loglik = 0.001) {
+
+
+  #' @importFrom stats cor
+  get_LD_jk_each <- function(jk_each,
+                             X = NULL, XtX = NULL, N = NULL,
+                             remain_jk = NULL) {
+    if (!is.null(X)) {
+      LD_temp <- suppressWarnings({
+        get_cormat(X[, jk_each])
+      })
+      LD_temp[which(is.na(LD_temp))] <- 0
+      # LD_temp <- LD_temp[1, 2]
+    } else if (!is.null(XtX)) {
+      if (length(XtX) == 1){
+        LD_temp <- matrix(0, length(jk_each), length(jk_each))
+      } else {
+        jk.remain <- match(jk_each, remain_jk)
+        LD_temp <- XtX[jk.remain, jk.remain]
+        LD_temp[which(is.na(LD_temp))] <- 0
+      }
+    }
+    return(LD_temp)
+  }
+
+  detect_func <- function(idx, LD_all, jk_i, jk_j, i, j){
+    change_log_jk_i <- model_update[[idx]]$change_loglike[jk_i]
+    change_log_jk_j <- model_update[[idx]]$change_loglike[jk_j]
+    change_each <- abs(change_log_jk_i - change_log_jk_j)
+    LD_temp <- LD_all[[idx]][i, j]
+    return((change_each <= jk_equiv_loglik) & (abs(LD_temp) >= jk_equiv_corr))
+  }
+
   data_update <- cb_data$data[pos.update]
+  LD_all <- lapply(1:length(jk_each), function(idx){
+    get_LD_jk_each(jk_each,
+                   X = cb_data$data[[X_dict[idx]]]$X,
+                   XtX = cb_data$data[[X_dict[idx]]]$XtX,
+                   N = data_update[[idx]]$N,
+                   remain_jk = setdiff(1:length(model_update[[idx]]$res), data_update[[idx]]$variable_miss)
+    )
+  })
+
   # -- check if jk_i ~ jk_j
   change_each_pair <- matrix(FALSE, nrow = length(jk_each), ncol = length(jk_each))
   for (i in 1:length(jk_each)) {
-    
     jk_i <- jk_each[i]
-    change_log_jk_i <- model_update[[i]]$change_loglike[jk_i]
-    for (j in 1:length(jk_each)) {
+    for (j in i:length(jk_each)) {
       if (j != i) {
         jk_j <- jk_each[j]
-
-        if (!(jk_i %in% data_update[[i]]$variable_miss) & !(jk_j %in% data_update[[i]]$variable_miss)) {
-          change_log_jk_j <- model_update[[i]]$change_loglike[jk_j]
-          change_each <- abs(change_log_jk_i - change_log_jk_j)
-          LD_temp <- get_LD_jk1_jk2(jk_i, jk_j,
-            X = cb_data$data[[X_dict[i]]]$X,
-            XtX = cb_data$data[[X_dict[i]]]$XtX,
-            N = data_update[[i]]$N,
-            remain_jk = setdiff(1:length(model_update[[i]]$res), data_update[[i]]$variable_miss)
-          )
-          change_each_pair[i, j] <- (change_each <= jk_equiv_loglik) & (abs(LD_temp) >= jk_equiv_corr)
-        } else {
-          change_each_pair[i, j] <- FALSE
+        change_each_pair[i, j] <- detect_func(idx = i, LD_all, jk_i, jk_j, i, j)
+        # if jk_i and jk_j are equivalent on dataset i, then we don't need to check dataset j
+        if ( !change_each_pair[i, j] ){
+          change_each_pair[j, i] <- detect_func(idx = j, LD_all, jk_i, jk_j, i, j)
         }
       } else {
         change_each_pair[i, j] <- FALSE
diff --git a/R/colocboost_inference.R b/R/colocboost_inference.R
index dc53908..882fc7a 100644
--- a/R/colocboost_inference.R
+++ b/R/colocboost_inference.R
@@ -132,7 +132,8 @@ get_modularity <- function(Weight, B) {
   
   if (m == 0) return(0)
   
-  cate <- B %*% t(B)
+  # cate <- B %*% t(B)
+  cate <- tcrossprod(B)
   
   if (m_pos == 0 & m_neg == 0) return(0)
   
@@ -190,12 +191,12 @@ get_n_cluster <- function(hc, Sigma, m = ncol(Sigma), min_cluster_corr = 0.8) {
 #' @noRd
 w_purity <- function(weights, X = NULL, Xcorr = NULL, N = NULL, n = 100, coverage = 0.95,
                      min_abs_corr = 0.5, median_abs_corr = NULL, miss_idx = NULL) {
-  tmp <- apply(weights, 2, w_cs, coverage = coverage)
-  tmp_purity <- apply(tmp, 2, function(tt) {
-    pos <- which(tt == 1)
+  
+  tmp_purity <- apply(weights, 2, function(w) {
+    pos <- w_cs(w, coverage = coverage)
     # deal with missing snp here
     if (!is.null(Xcorr)) {
-      pos <- match(pos, setdiff(1:length(tmp), miss_idx))
+      pos <- match(pos, setdiff(1:length(w), miss_idx))
     }
     get_purity(pos, X = X, Xcorr = Xcorr, N = N, n = n)
   })
@@ -373,7 +374,12 @@ check_null_post <- function(cb_obj,
   }
   if (!weaker_effect) {
     check_cs_change <- cs_change
-    check_null_tmp <- sapply(1:cb_obj$cb_model_para$L, function(j) cb_obj$cb_model[[j]]$check_null_max)
+    if (cb_obj$cb_model_para$L == 1){
+      check_null_max_tmp <- cb_obj$cb_model[[j]]$check_null_max_ucos
+    } else {
+      check_null_max_tmp <- cb_obj$cb_model[[j]]$check_null_max
+    }
+    check_null_tmp <- sapply(1:cb_obj$cb_model_para$L, function(j) check_null_max_tmp)
   } else {
     check_null_tmp <- rep(check_null, cb_obj$cb_model_para$L)
   }
@@ -420,7 +426,7 @@ get_purity <- function(pos, X = NULL, Xcorr = NULL, N = NULL, n = 100) {
       corr[which(is.na(corr))] <- 0
       value <- abs(get_upper_tri(corr))
     } else {
-      if (sum(Xcorr) == 1){
+      if (length(Xcorr) == 1){
         value <- 0
       } else {
         Xcorr <- Xcorr # if (!is.null(N)) Xcorr/(N-1) else Xcorr
diff --git a/R/colocboost_init.R b/R/colocboost_init.R
index 8685b01..edca937 100644
--- a/R/colocboost_init.R
+++ b/R/colocboost_init.R
@@ -163,7 +163,8 @@ colocboost_init_model <- function(cb_data,
     tmp <- list(
       "res" = NULL,
       "beta" = rep(0, P),
-      "weights_path" = c(),
+      # "weights_path" = c(),
+      "weights_path" = list(),
       "profile_loglike_each" = NULL,
       "obj_path" = 999999,
       "obj_single" = 999999,
@@ -619,7 +620,7 @@ process_sumstat <- function(Z, N, Var_y, SeBhat, ld_matrices, variant_lists, dic
     Z_extend[pos_target] <- current_z[pos_z]
 
     # Calculate submatrix for each unique entry (not duplicates)
-    if (sum(current_ld_matrix) == 1){
+    if (length(current_ld_matrix) == 1){
       ld_submatrix <- current_ld_matrix
     } else {
       ld_submatrix <- NULL
diff --git a/R/colocboost_one_causal.R b/R/colocboost_one_causal.R
index 0ed96b5..09692d5 100644
--- a/R/colocboost_one_causal.R
+++ b/R/colocboost_one_causal.R
@@ -79,9 +79,8 @@ colocboost_one_iteration <- function(cb_model, cb_model_para, cb_data) {
   cb_model_para[rm_elements] <- NULL
   for (i in 1:length(cb_model)) {
     cb_model[[i]]$obj_path <- as.numeric(unlist(cb_model[[i]]$obj_path[-1]))
-  }
-  for (i in 1:length(cb_model)) {
     cb_model[[i]]$obj_single <- as.numeric(unlist(cb_model[[i]]$obj_single[-1]))
+    cb_model[[i]]$weights_path <- do.call(rbind, cb_model[[i]]$weights_path)
   }
 
   cb_obj <- list("cb_data" = cb_data, "cb_model" = cb_model, "cb_model_para" = cb_model_para)
@@ -238,7 +237,7 @@ colocboost_diagLD <- function(cb_model, cb_model_para, cb_data) {
         cb_model_para, 
         cb_data
       )
-      weights <- rbind(weights, cb_model_tmp[[iy]]$weights_path)
+      weights <- rbind(weights, unlist(cb_model_tmp[[iy]]$weights_path))
     }
     ###### overlap weights
     overlap_pair <- check_pair_overlap(weights, coverage = 0.95)
@@ -307,11 +306,10 @@ colocboost_diagLD <- function(cb_model, cb_model_para, cb_data) {
   cb_model_para[rm_elements] <- NULL
   for (i in 1:length(cb_model)) {
     cb_model[[i]]$obj_path <- as.numeric(unlist(cb_model[[i]]$obj_path[-1]))
-  }
-  for (i in 1:length(cb_model)) {
     cb_model[[i]]$obj_single <- as.numeric(unlist(cb_model[[i]]$obj_single[-1]))
+    cb_model[[i]]$weights_path <- do.call(rbind, cb_model[[i]]$weights_path)
   }
-
+  
   cb_obj <- list("cb_data" = cb_data, "cb_model" = cb_model, "cb_model_para" = cb_model_para)
   class(cb_obj) <- "colocboost"
   return(cb_obj)
diff --git a/R/colocboost_plot.R b/R/colocboost_plot.R
index 90141d6..1f0f7c1 100644
--- a/R/colocboost_plot.R
+++ b/R/colocboost_plot.R
@@ -14,8 +14,9 @@
 #' @param plot_focal_cos_outcome_only Logical, if TRUE only plots colocalization including at least on colocalized outcome with focal outcome, default is FALSE.
 #' @param points_color Background color for non-colocalized variables, default is "grey80".
 #' @param cos_color Optional custom colors for CoS.
-#' @param add_vertical Logical, if TRUE adds vertical lines at specified positions, default is FALSE
-#' @param add_vertical_idx Optional indices for vertical lines.
+#' @param add_highlight Logical, if TRUE adds vertical lines at specified positions, default is FALSE
+#' @param add_highlight_style Optional style of add_highlight variables, default is "vertical_lines", other choice is "star" - red star.
+#' @param add_highlight_idx Optional indices for add_highlight variables.
 #' @param outcome_names Optional vector of outcomes names for the subtitle of each figure. \code{outcome_names=NULL} for the outcome name shown in \code{data_info}.
 #' @param plot_cols Number of columns in the plot grid, default is 2. If you have many colocalization. please consider increasing this.
 #' @param variant_coord Logical, if TRUE uses variant coordinates on x-axis, default is FALSE. This is required the variable names including position information.
@@ -78,8 +79,9 @@ colocboost_plot <- function(cb_output, y = "log10p",
                             plot_focal_cos_outcome_only = FALSE,
                             points_color = "grey80",
                             cos_color = NULL,
-                            add_vertical = FALSE,
-                            add_vertical_idx = NULL,
+                            add_highlight = FALSE,
+                            add_highlight_idx = NULL,
+                            add_highlight_style = "vertical_lines",
                             outcome_names = NULL,
                             plot_cols = 2,
                             variant_coord = FALSE,
@@ -102,33 +104,34 @@ colocboost_plot <- function(cb_output, y = "log10p",
   if (!inherits(cb_output, "colocboost")) {
     stop("Input of colocboost_plot must be a 'colocboost' object!")
   }
-
+  
   # get cb_plot_input data from colocboost results
   cb_plot_input <- get_input_plot(cb_output,
-    plot_cos_idx = plot_cos_idx,
-    variant_coord = variant_coord,
-    outcome_names = outcome_names,
-    plot_focal_only = plot_focal_only,
-    plot_focal_cos_outcome_only = plot_focal_cos_outcome_only,
-    show_cos_to_uncoloc = show_cos_to_uncoloc,
-    show_cos_to_uncoloc_idx = show_cos_to_uncoloc_idx,
-    show_cos_to_uncoloc_outcome = show_cos_to_uncoloc_outcome,
-    plot_ucos = plot_ucos, plot_ucos_idx = plot_ucos_idx
+                                  plot_cos_idx = plot_cos_idx,
+                                  variant_coord = variant_coord,
+                                  outcome_names = outcome_names,
+                                  plot_focal_only = plot_focal_only,
+                                  plot_focal_cos_outcome_only = plot_focal_cos_outcome_only,
+                                  show_cos_to_uncoloc = show_cos_to_uncoloc,
+                                  show_cos_to_uncoloc_idx = show_cos_to_uncoloc_idx,
+                                  show_cos_to_uncoloc_outcome = show_cos_to_uncoloc_outcome,
+                                  plot_ucos = plot_ucos, plot_ucos_idx = plot_ucos_idx
   )
   # get initial set up of plot
   cb_plot_init <- plot_initial(cb_plot_input,
-    y = y, points_color = points_color, cos_color = cos_color,
-    ylim_each = ylim_each, title_specific = title_specific,
-    outcome_legend_pos = outcome_legend_pos, outcome_legend_size = outcome_legend_size,
-    cos_legend_pos = cos_legend_pos, plot_ucos = plot_ucos,
-    show_variable = show_variable, lab_style = lab_style, axis_style = axis_style,
-    title_style = title_style, ...
+                               y = y, points_color = points_color, cos_color = cos_color,
+                               ylim_each = ylim_each, title_specific = title_specific,
+                               outcome_legend_pos = outcome_legend_pos, outcome_legend_size = outcome_legend_size,
+                               cos_legend_pos = cos_legend_pos, plot_ucos = plot_ucos,
+                               show_variable = show_variable, lab_style = lab_style, axis_style = axis_style,
+                               title_style = title_style, ...
   )
-
+  
   colocboost_plot_basic <- function(cb_plot_input, cb_plot_init,
                                     outcome_idx = NULL, plot_all_outcome = FALSE,
                                     grange = NULL, plot_cols = 2,
-                                    add_vertical = FALSE, add_vertical_idx = NULL,
+                                    add_highlight = FALSE, add_highlight_idx = NULL,
+                                    add_highlight_style = "vertical_lines",
                                     show_top_variables = TRUE,
                                     ...) {
     args <- list(...)
@@ -147,12 +150,12 @@ colocboost_plot <- function(cb_output, y = "log10p",
     args$font.lab <- cb_plot_init$lab_face
     # - change position
     cb_plot_init$outcome_legend_pos <- switch(cb_plot_init$outcome_legend_pos,
-      "right" = 4,
-      "left" = 2,
-      "top" = 3,
-      "bottom" = 1
+                                              "right" = 4,
+                                              "left" = 2,
+                                              "top" = 3,
+                                              "bottom" = 1
     )
-
+    
     # - begin plotting
     coloc_cos <- cb_plot_input$cos
     outcomes <- cb_plot_input$outcomes
@@ -229,12 +232,7 @@ colocboost_plot <- function(cb_output, y = "log10p",
               cex = cb_plot_init$outcome_legend_size, font = 1
         )
       }
-      if (add_vertical) {
-        for (iii in 1:length(add_vertical_idx)) {
-          abline(v = add_vertical_idx[iii], col = "#E31A1C", lwd = 1.5, lty = "dashed")
-        }
-      }
-
+      
       # mark variables in CoS to colocalized outcomes
       if (!is.null(coloc_cos)) {
         n.coloc <- length(coloc_cos)
@@ -242,7 +240,7 @@ colocboost_plot <- function(cb_output, y = "log10p",
         if (length(y)==1){coloc_index = lapply(coloc_index, function(i) 1 )}
         legend_text <- list(col = vector())
         legend_text$col <- head(cb_plot_init$col, n.coloc)
-
+        
         # check which coloc set for this outcome
         p.coloc <- sapply(coloc_index, function(idx) sum(idx == iy) != 0)
         p.coloc <- which(p.coloc)
@@ -261,7 +259,7 @@ colocboost_plot <- function(cb_output, y = "log10p",
             points(x_hits, y_hits, pch = 21, bg = legend_text$col[i.cs], col = "#E31A1C", cex = 3, lwd = 3)
           }
         }
-
+        
         # mark variables in CoS to uncolocalized outcomes
         uncoloc <- cb_plot_input$uncoloc
         if (!is.null(uncoloc)) {
@@ -276,8 +274,8 @@ colocboost_plot <- function(cb_output, y = "log10p",
               x0 <- intersect(args$x, cs)
               y1 <- args$y[match(x0, args$x)]
               points(x0, y1,
-                pch = 4, col = adjustcolor(legend_text$col[uncoloc$cos_idx_to_uncoloc[i.uncoloc]], alpha.f = 0.4),
-                cex = 1.5, lwd = 1.5
+                     pch = 4, col = adjustcolor(legend_text$col[uncoloc$cos_idx_to_uncoloc[i.uncoloc]], alpha.f = 0.4),
+                     cex = 1.5, lwd = 1.5
               )
               shape_col <- c(shape_col, adjustcolor(legend_text$col[uncoloc$cos_idx_to_uncoloc[i.uncoloc]], alpha.f = 1))
               texts_col <- c(texts_col, adjustcolor(legend_text$col[uncoloc$cos_idx_to_uncoloc[i.uncoloc]], alpha.f = 0.8))
@@ -294,35 +292,51 @@ colocboost_plot <- function(cb_output, y = "log10p",
           if (length(texts) == 0) {
             next
           }
-
+          
           # Get current plot area coordinates
           usr <- par("usr")
           x_pos <- usr[1] + cb_plot_init$cos_legend_pos[1] * (usr[2] - usr[1])  # 5% from left edge
           y_pos <- usr[3] + cb_plot_init$cos_legend_pos[2] * (usr[4] - usr[3])  # 50% from bottom edge
           legend(x = x_pos, y = y_pos, texts,
-            bty = "n", col = shape_col, text.col = texts_col,
-            cex = 1.5, pt.cex = 1.5, pch = 4, x.intersp = cos_legend_pos[3], y.intersp = cos_legend_pos[4]
+                 bty = "n", col = shape_col, text.col = texts_col,
+                 cex = 1.5, pt.cex = 1.5, pch = 4, x.intersp = cos_legend_pos[3], y.intersp = cos_legend_pos[4]
           )
         }
       }
+      if (add_highlight) {
+        for (iii in 1:length(add_highlight_idx)) {
+          if ( !(add_highlight_style %in% c("vertical_lines", "star")) ){
+            message("add_highlight_style is not 'vertical_lines' and 'star', defaulting to 'vertical_lines'")
+            add_highlight_style = "vertical_lines"
+          }
+          if (add_highlight_style == "vertical_lines"){
+            abline(v = add_highlight_idx[iii], col = "#E31A1C", lwd = 1.5, lty = "dashed")
+          } else if (add_highlight_style == "star") {
+            points(add_highlight_idx[iii], args$y[match(add_highlight_idx[iii],args$x)], 
+                   pch = 8, col = "#E31A1C", lwd = 2, lty = "dashed", cex = 2.5)
+          } 
+          
+        }
+      }
     }
     if (!is.null(cb_plot_init$title)) {
       mtext(cb_plot_init$title,
-        side = 3, line = 0, outer = TRUE,
-        cex = cb_plot_init$title_size, font = cb_plot_init$title_face
+            side = 3, line = 0, outer = TRUE,
+            cex = cb_plot_init$title_size, font = cb_plot_init$title_face
       )
     }
     return(invisible())
   }
-
+  
   colocboost_plot_basic(cb_plot_input, cb_plot_init,
-    grange = grange,
-    outcome_idx = outcome_idx, 
-    plot_all_outcome = plot_all_outcome,
-    plot_cols = plot_cols,
-    add_vertical = add_vertical, add_vertical_idx = add_vertical_idx,
-    show_top_variables = show_top_variables,
-    ...
+                        grange = grange,
+                        outcome_idx = outcome_idx, 
+                        plot_all_outcome = plot_all_outcome,
+                        plot_cols = plot_cols,
+                        add_highlight = add_highlight, add_highlight_idx = add_highlight_idx,
+                        add_highlight_style = add_highlight_style,
+                        show_top_variables = show_top_variables,
+                        ...
   )
 }
 
@@ -339,7 +353,7 @@ get_input_plot <- function(cb_output, plot_cos_idx = NULL,
                            show_cos_to_uncoloc_outcome = NULL,
                            plot_ucos = FALSE,
                            plot_ucos_idx = NULL) {
-
+  
   # check ucos exists
   if (plot_ucos && !"ucos_details" %in% names(cb_output)) {
     warning(
@@ -354,7 +368,7 @@ get_input_plot <- function(cb_output, plot_cos_idx = NULL,
     cb_output$data_info$outcome_info$outcome_names <- outcome_names
     names(cb_output$data_info$z) <- outcome_names
   }
-
+  
   # extract results from colocboost
   analysis_outcome <- cb_output$data_info$outcome_info$outcome_names
   focal_outcome_idx <- which(cb_output$data_info$outcome_info$is_focal)
@@ -371,7 +385,7 @@ get_input_plot <- function(cb_output, plot_cos_idx = NULL,
   Z <- cb_output$data_info$z
   coef <- cb_output$data_info$coef
   vcp <- list(as.numeric(cb_output$vcp))
-
+  
   # if finemapping
   if (cb_output$data_info$n_outcomes == 1) {
     cb_output$cos_details$cos$cos_variables <- cb_output$ucos_details$ucos$ucos_variables
@@ -406,7 +420,7 @@ get_input_plot <- function(cb_output, plot_cos_idx = NULL,
       }
     })
     ncos <- length(cb_output$cos_details$cos$cos_index)
-
+    
     select_cs <- 1:ncos
     if (!is.null(plot_cos_idx)) {
       if (length(setdiff(plot_cos_idx, c(1:ncos))) != 0) {
@@ -477,7 +491,7 @@ get_input_plot <- function(cb_output, plot_cos_idx = NULL,
     "coloc_index" = coloc_index,
     "select_cos" = select_cs
   )
-
+  
   # check plot uncolocalizaed confidence sets from ucos_details
   if (plot_ucos & cb_output$data_info$n_outcomes > 1){
     ucos_details <- cb_output$ucos_details
@@ -528,7 +542,7 @@ get_input_plot <- function(cb_output, plot_cos_idx = NULL,
       plot_input$ucos_cos_int_weights <- combined
     }
   }
-
+  
   # check if plot cos to uncolocalized outcome
   # use the updated coloc_cos and related components from plot_input if available
   coloc_cos <- plot_input$cos
@@ -613,7 +627,7 @@ get_input_plot <- function(cb_output, plot_cos_idx = NULL,
     )
     plot_input$uncoloc <- uncoloc
   }
-
+  
   class(plot_input) <- "colocboost"
   return(plot_input)
 }
@@ -634,11 +648,11 @@ plot_initial <- function(cb_plot_input, y = "log10p",
                          ...) {
   args <- list(...)
   if (!exists("pch", args)) args$pch <- 16
-
+  
   # - set background point color and cos color pools
   args$bg <- points_color
   if (is.null(cos_color)) {
-
+    
     # cos_color <- c(
     #   "#377EB8", "#E69F00", "#33A02C", "#984EA3", "#F46D43",
     #   "#A65628", "#1F4E79", "#B2182B", "#D73027", "#F781BF",
@@ -688,12 +702,12 @@ plot_initial <- function(cb_plot_input, y = "log10p",
       "#D5D8DC", "#BDC3C7", "#95A5A6", "#7F8C8D", "#17202A",
       "#808080", "#A9A9A9", "#C0C0C0", "#696969", "#778899"
     )
-
-
+    
+    
     # cos_color <- c("#1F70A9", "#33A02C", "#CAB2D6", "#EA7827")
   }
   args$col <- cos_color[cb_plot_input$select_cos]
-
+  
   # - set data and x-lab and y-lab
   if (y == "log10p") {
     plot_data <- lapply(cb_plot_input$Zscores, function(z) {
@@ -729,12 +743,12 @@ plot_initial <- function(cb_plot_input, y = "log10p",
   if (!exists("ylab", args)) args$ylab <- ylab
   args$lab_size <- as.numeric(lab_style[1])
   args$lab_face <- lab_style[2]
-
+  
   # - set title format
   args$title <- title_specific
   args$title_size <- as.numeric(title_style[1])
   args$title_face <- title_style[2]
-
+  
   # - set x-axis and y-axis
   args$x <- cb_plot_input$x$pos
   args$y <- plot_data
@@ -744,7 +758,7 @@ plot_initial <- function(cb_plot_input, y = "log10p",
   }
   args$axis_size <- as.numeric(axis_style[1])
   args$axis_face <- axis_style[2]
-
+  
   # - set ylim for each subfigure
   if (exists("ylim", args)) {
     ymax <- rep(args$ylim[2], length(args$y))
@@ -776,7 +790,7 @@ plot_initial <- function(cb_plot_input, y = "log10p",
   }
   args$ymax <- ymax
   args$ymin <- ymin
-
+  
   # - set legend text position and format
   args$outcome_legend_pos <- outcome_legend_pos
   args$outcome_legend_size <- outcome_legend_size
@@ -788,7 +802,7 @@ plot_initial <- function(cb_plot_input, y = "log10p",
     args$outcome_legend_angle <- 0
   }
   args$cos_legend_pos <- cos_legend_pos
-
+  
   return(args)
 }
 
diff --git a/R/colocboost_update.R b/R/colocboost_update.R
index 6100828..734bcb1 100644
--- a/R/colocboost_update.R
+++ b/R/colocboost_update.R
@@ -74,7 +74,8 @@ colocboost_update <- function(cb_model, cb_model_para, cb_data) {
     # weights <- adj_dep * ld_feature / scaling_factor * exp_abs_cor / sum(exp_abs_cor)
     weights <- adj_dep * obj_ld * exp_abs_cor / sum(exp_abs_cor)
     weights <- weights / sum(weights)
-    cb_model[[i]]$weights_path <- rbind(cb_model[[i]]$weights_path, as.vector(weights))
+    # cb_model[[i]]$weights_path <- rbind(cb_model[[i]]$weights_path, as.vector(weights))
+    cb_model[[i]]$weights_path <- c(cb_model[[i]]$weights_path, list(as.vector(weights)))
     ########## END: MAIN CALCULATION ###################
 
 
diff --git a/R/colocboost_utils.R b/R/colocboost_utils.R
index c4da011..9bfa3fd 100644
--- a/R/colocboost_utils.R
+++ b/R/colocboost_utils.R
@@ -426,15 +426,20 @@ get_max_profile <- function(cb_obj, check_null_max = 0.025,
 }
 
 
+# ### Function for check cs for each weight
+# w_cs <- function(weights, coverage = 0.95) {
+#   indices <- unlist(get_in_cos(weights, coverage = coverage))
+#   result <- rep(0, length(weights))
+#   result[indices] <- 1
+#   return(result)
+# }
+
 ### Function for check cs for each weight
 w_cs <- function(weights, coverage = 0.95) {
   indices <- unlist(get_in_cos(weights, coverage = coverage))
-  result <- rep(0, length(weights))
-  result[indices] <- 1
-  return(result)
+  return(indices)
 }
 
-
 #' Pure R implementation (fallback)
 #' @noRd
 get_merge_ordered_with_indices <- function(vector_list) {
@@ -505,12 +510,14 @@ get_merge_ordered_with_indices <- function(vector_list) {
   # Step 4: Topological sort using Kahn's algorithm
   # Start with nodes that have no incoming edges
   queue <- all_elements[sapply(all_elements, function(elem) in_degree[[elem]] == 0)]
-  result <- character()
+  result <- list()
+  result_idx <- 1
   while (length(queue) > 0) {
     # Take the first element from the queue
     current <- queue[1]
     queue <- queue[-1]
-    result <- c(result, current)
+    result[[result_idx]] <- current  # List assignment is fast
+    result_idx <- result_idx + 1
     # Process all neighbors (elements that must come after current)
     neighbors <- graph[[current]]
     for (next_elem in neighbors) {
@@ -520,6 +527,7 @@ get_merge_ordered_with_indices <- function(vector_list) {
       }
     }
   }
+  result <- unlist(result)
   # Step 5: Check for cycles and use fallback if needed
   if (length(result) != n_elements) {
     # Different variable orders detected - use priority-based fallback
@@ -657,8 +665,6 @@ get_cos_details <- function(cb_obj, coloc_out, data_info = NULL) {
     })
     int_weight <- lapply(cos_weights, get_integrated_weight, weight_fudge_factor = cb_obj$cb_model_para$weight_fudge_factor)
     names(int_weight) <- names(cos_weights) <- colocset_names
-    vcp <- as.vector(1 - apply(1 - do.call(cbind, int_weight), 1, prod))
-    names(vcp) <- data_info$variables
 
     # - resummary results
     cos_re_idx <- lapply(int_weight, function(w) {
@@ -668,7 +674,55 @@ get_cos_details <- function(cb_obj, coloc_out, data_info = NULL) {
       data_info$variables[idx]
     })
     coloc_csets <- list("cos_index" = cos_re_idx, "cos_variables" = cos_re_var)
-
+    
+    # - recalculate purity
+    purity <- vector(mode = "list", length = length(cos_re_idx))
+    for (ee in 1:length(cos_re_idx)) {
+      coloc_t <- coloc_outcome_index[[ee]]
+      p_tmp <- c()
+      for (i3 in coloc_t) {
+        pos <- cos_re_idx[[ee]]
+        X_dict <- cb_obj$cb_data$dict[i3]
+        if (!is.null(cb_obj$cb_data$data[[X_dict]]$XtX)) {
+          pos <- match(pos, setdiff(1:cb_obj$cb_model_para$P, cb_obj$cb_data$data[[i3]]$variable_miss))
+        }
+        tmp <- matrix(get_purity(pos,
+                                 X = cb_obj$cb_data$data[[X_dict]]$X,
+                                 Xcorr = cb_obj$cb_data$data[[X_dict]]$XtX,
+                                 N = cb_obj$cb_data$data[[i3]]$N, n = cb_obj$cb_model_para$n_purity
+        ), 1, 3)
+        p_tmp <- rbind(p_tmp, tmp)
+      }
+      purity[[ee]] <- matrix(apply(p_tmp, 2, max), 1, 3)
+    }
+    purity_all <- do.call(rbind, purity)
+    purity_all <- as.data.frame(purity_all)
+    colnames(purity_all) <- c("min_abs_corr", "mean_abs_corr", "median_abs_corr")
+    coloc_out$purity <- purity_all
+    if (is.null(cb_obj$cb_model_para$median_abs_corr)) {
+      is_pure <- which(purity_all[, 1] >= cb_obj$cb_model_para$min_abs_corr)
+    } else {
+      is_pure <- which(purity_all[, 1] >= cb_obj$cb_model_para$min_abs_corr | purity_all[, 3] >= cb_obj$cb_model_para$median_abs_corr)
+    }
+    if (length(is_pure)==0){
+      coloc_results <- NULL
+      vcp <- NULL
+      return(list("cos_results" = coloc_results, "vcp" = vcp))
+    } else if (length(is_pure)!=length(cos_re_idx)){
+      int_weight = int_weight[is_pure]
+      coloc_csets <- lapply(coloc_csets, function(cs) cs[is_pure])
+      coloc_outcomes <- lapply(coloc_outcomes, function(cs) cs[is_pure])
+      normalization_evidence <- normalization_evidence[is_pure]
+      npc <- npc[is_pure]
+      cos_min_npc_outcome <- cos_min_npc_outcome[is_pure]
+      cos_weights <- cos_weights[is_pure]
+      coloc_out$purity <- purity_all[is_pure,,drop = FALSE]
+      colocset_names <- colocset_names[is_pure]
+    }
+    vcp <- as.vector(1 - apply(1 - do.call(cbind, int_weight), 1, prod))
+    names(vcp) <- data_info$variables
+    
+    
     # - hits variables in each csets
     coloc_hits <- coloc_hits_variablenames <- coloc_hits_names <- c()
     for (i in 1:length(int_weight)) {
@@ -839,7 +893,7 @@ get_full_output <- function(cb_obj, past_out = NULL, variables = NULL, cb_output
   cb_model_para <- cb_obj$cb_model_para
 
   ## - obtain the order of variables based on the variables names if it has position information
-  if (!is.null(variables)) {
+  if (is.null(variables)) {
     ordered <- 1:length(cb_obj$cb_model_para$variables)
   } else {
     ordered <- match(variables, cb_obj$cb_model_para$variables)
@@ -933,7 +987,7 @@ get_full_output <- function(cb_obj, past_out = NULL, variables = NULL, cb_output
         # - hits variables in each csets
         cs_hits <- sapply(1:length(specific_w), function(jj) {
           inw <- specific_w[[jj]]
-          sample(which(inw == max(inw)), 1)
+          which(inw == max(inw))[1]
         })
         cs_hits_variablenames <- sapply(cs_hits, function(ch) variables[ch])
         specific_cs_hits <- data.frame("top_index" = cs_hits, "top_variables" = cs_hits_variablenames) # save
diff --git a/R/colocboost_workhorse.R b/R/colocboost_workhorse.R
index 7583eee..4462680 100644
--- a/R/colocboost_workhorse.R
+++ b/R/colocboost_workhorse.R
@@ -103,6 +103,9 @@ colocboost_workhorse <- function(cb_data,
   }
 
   if (sum(cb_model_para$update_y == 1) == 0) {
+    for (i in 1:length(cb_model)){
+      cb_model[[i]]$weights_path <- matrix(0, ncol = cb_model_para$P)
+    }
     cb_obj <- list("cb_data" = cb_data, "cb_model" = cb_model, "cb_model_para" = cb_model_para)
     class(cb_obj) <- "colocboost"
     return(cb_obj)
@@ -236,9 +239,8 @@ colocboost_workhorse <- function(cb_data,
     cb_model_para$num_updates <- m
     for (i in 1:length(cb_model)) {
       cb_model[[i]]$obj_path <- as.numeric(unlist(cb_model[[i]]$obj_path[-1]))
-    }
-    for (i in 1:length(cb_model)) {
       cb_model[[i]]$obj_single <- as.numeric(unlist(cb_model[[i]]$obj_single[-1]))
+      cb_model[[i]]$weights_path <- do.call(rbind, cb_model[[i]]$weights_path)
     }
 
     if (m == M) {
diff --git a/man/colocboost_plot.Rd b/man/colocboost_plot.Rd
index fe155fc..8e5c7df 100644
--- a/man/colocboost_plot.Rd
+++ b/man/colocboost_plot.Rd
@@ -18,8 +18,9 @@ colocboost_plot(
   plot_focal_cos_outcome_only = FALSE,
   points_color = "grey80",
   cos_color = NULL,
-  add_vertical = FALSE,
-  add_vertical_idx = NULL,
+  add_highlight = FALSE,
+  add_highlight_idx = NULL,
+  add_highlight_style = "vertical_lines",
   outcome_names = NULL,
   plot_cols = 2,
   variant_coord = FALSE,
@@ -62,9 +63,11 @@ colocboost_plot(
 
 \item{cos_color}{Optional custom colors for CoS.}
 
-\item{add_vertical}{Logical, if TRUE adds vertical lines at specified positions, default is FALSE}
+\item{add_highlight}{Logical, if TRUE adds vertical lines at specified positions, default is FALSE}
 
-\item{add_vertical_idx}{Optional indices for vertical lines.}
+\item{add_highlight_idx}{Optional indices for add_highlight variables.}
+
+\item{add_highlight_style}{Optional style of add_highlight variables, default is "vertical_lines", other choice is "star" - red star.}
 
 \item{outcome_names}{Optional vector of outcomes names for the subtitle of each figure. \code{outcome_names=NULL} for the outcome name shown in \code{data_info}.}
 
diff --git a/man/colocboost_validate_input_data.Rd b/man/colocboost_validate_input_data.Rd
new file mode 100644
index 0000000..b8616e9
--- /dev/null
+++ b/man/colocboost_validate_input_data.Rd
@@ -0,0 +1,70 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/colocboost.R
+\name{colocboost_validate_input_data}
+\alias{colocboost_validate_input_data}
+\title{Validate and Process All Input Data for ColocBoost}
+\usage{
+colocboost_validate_input_data(
+  X = NULL,
+  Y = NULL,
+  sumstat = NULL,
+  LD = NULL,
+  dict_YX = NULL,
+  dict_sumstatLD = NULL,
+  effect_est = NULL,
+  effect_se = NULL,
+  effect_n = NULL,
+  overlap_variables = FALSE,
+  M = 500,
+  min_abs_corr = 0.5
+)
+}
+\arguments{
+\item{X}{A list of genotype matrices for different outcomes, or a single matrix if all outcomes share the same genotypes.}
+
+\item{Y}{A list of vectors of outcomes or an N by L matrix if it is considered for the same X and multiple outcomes.}
+
+\item{sumstat}{A list of data.frames of summary statistics.}
+
+\item{LD}{A list of correlation matrices indicating the LD matrix for each genotype.}
+
+\item{dict_YX}{A L by 2 matrix of dictionary for X and Y if there exist subsets of outcomes corresponding to the same X matrix.}
+
+\item{dict_sumstatLD}{A L by 2 matrix of dictionary for sumstat and LD if there exist subsets of outcomes corresponding to the same sumstat.}
+
+\item{effect_est}{Matrix of variable regression coefficients (i.e. regression beta values) in the genomic region}
+
+\item{effect_se}{Matrix of standard errors associated with the beta values}
+
+\item{effect_n}{A scalar or a vector of sample sizes for estimating regression coefficients.}
+
+\item{overlap_variables}{If overlap_variables = TRUE, only perform colocalization in the overlapped region.}
+
+\item{M}{The maximum number of gradient boosting rounds for each outcome (default is 500).}
+
+\item{min_abs_corr}{Minimum absolute correlation allowed in a confidence set.}
+}
+\value{
+A list containing:
+\item{X}{Processed list of genotype matrices}
+\item{Y}{Processed list of phenotype vectors}
+\item{yx_dict}{Dictionary mapping Y to X}
+\item{keep_variable_individual}{List of variable names for each X matrix}
+\item{sumstat}{Processed list of summary statistics data.frames}
+\item{LD}{Processed list of LD matrices}
+\item{sumstatLD_dict}{Dictionary mapping sumstat to LD}
+\item{keep_variable_sumstat}{List of variant names for each sumstat}
+\item{Z}{List of z-scores for each outcome}
+\item{N_sumstat}{List of sample sizes for each outcome}
+\item{Var_y}{List of phenotype variances for each outcome}
+\item{SeBhat}{List of standard errors for each outcome}
+\item{M_updated}{Updated M value (may be changed if LD not provided)}
+\item{min_abs_corr_updated}{Updated min_abs_corr value (may be changed if LD not provided)}
+\item{jk_equiv_corr_updated}{Updated jk_equiv_corr value}
+\item{jk_equiv_loglik_updated}{Updated jk_equiv_loglik value}
+\item{func_simplex_updated}{Updated func_simplex value}
+}
+\description{
+Internal function to validate and process both individual-level and summary-level input data
+}
+\keyword{internal}
diff --git a/tests/testthat/test_plot.R b/tests/testthat/test_plot.R
index 775d3fe..d49aace 100644
--- a/tests/testthat/test_plot.R
+++ b/tests/testthat/test_plot.R
@@ -176,7 +176,9 @@ test_that("colocboost_plot handles layout options", {
 test_that("colocboost_plot handles additional visualization options", {
   
   # Test with vertical line options
-  expect_error(suppressWarnings(colocboost_plot(cb_res, add_vertical = TRUE, add_vertical_idx = c(5, 10))), NA)
+  expect_error(suppressWarnings(colocboost_plot(cb_res, add_highlight = TRUE, add_highlight_idx = c(5, 10))), NA)
+  expect_error(suppressWarnings(colocboost_plot(cb_res, add_highlight = TRUE, add_highlight_idx = c(5, 10), add_highlight_style = "vertical_lines")), NA)
+  expect_error(suppressWarnings(colocboost_plot(cb_res, add_highlight = TRUE, add_highlight_idx = c(5, 10), add_highlight_style = "star")), NA)
   
   # Test with show_top_variables option
   expect_error(suppressWarnings(colocboost_plot(cb_res, show_top_variables = TRUE)), NA)
diff --git a/tests/testthat/test_sumstats.R b/tests/testthat/test_sumstats.R
index deb3aaa..ef896c3 100644
--- a/tests/testthat/test_sumstats.R
+++ b/tests/testthat/test_sumstats.R
@@ -247,7 +247,7 @@ test_that("colocboost handles mismatched inputs correctly", {
   
   # Expect error with mismatched dimensions
   expect_warning(
-    colocboost(
+    colocboost_validate_input_data(
       effect_est = bad_effect_est,
       effect_se = test_sumstat_data$effect_se
     ),
@@ -281,4 +281,77 @@ test_that("colocboost handles missing sample size correctly", {
   
   # Still should get a valid result
   expect_s3_class(result, "colocboost")
-})
\ No newline at end of file
+})
+
+# Test 8: Handling of NA variants
+test_that("colocboost removes NA variants correctly", {
+  sumstat_with_na <- test_sumstat_data$sumstat
+  sumstat_with_na[[1]]$variant[1:2] <- NA
+  
+  warnings <- capture_warnings({
+    validated_data <- colocboost_validate_input_data(
+      sumstat = sumstat_with_na,
+      LD = test_sumstat_data$LD
+    )
+  })
+  
+  expect_true(any(grepl("Removed variant with NA from sumstat 1", warnings)))
+  # Should have 2 fewer variants
+  expect_true(length(validated_data$Z[[1]]) == length(validated_data$Z[[2]])-2)
+})
+
+# Test 9: Handling of duplicate variants
+test_that("colocboost removes duplicate variants correctly", {
+  sumstat_with_dup <- test_sumstat_data$sumstat
+  # Duplicate the first variant
+  sumstat_with_dup[[1]] <- rbind(
+    sumstat_with_dup[[1]][1, ],
+    sumstat_with_dup[[1]]
+  )
+  
+  warnings <- capture_warnings({
+    result <- colocboost(
+      sumstat = sumstat_with_dup,
+      LD = test_sumstat_data$LD
+    )
+  })
+  
+  expect_true(any(grepl("Removed duplicate variants from sumstat 1", warnings)))
+  expect_s3_class(result, "colocboost")
+  # Should be back to original count
+  expect_equal(length(result$data_info$variables), ncol(test_sumstat_data$X))
+})
+
+# Test 10: Handling of mismatched variants between sumstat and LD
+test_that("colocboost handles variant mismatch between sumstat and LD", {
+  # Create LD with different variant names
+  LD_modified <- test_sumstat_data$LD
+  rownames(LD_modified) <- colnames(LD_modified) <- paste0("SNP", 11:30)
+  
+  warnings <- capture_warnings({
+    result <- colocboost_validate_input_data(
+      sumstat = test_sumstat_data$sumstat,
+      LD = LD_modified
+    )
+  })
+  
+  # Should warn about removing variants
+  expect_true(any(grepl("removing.*variants since those variants are not in LD", warnings)))
+})
+
+# Test 11: Error when no common variants
+test_that("colocboost errors with no common variants", {
+  # Create LD with completely different variant names
+  LD_no_overlap <- test_sumstat_data$LD
+  rownames(LD_no_overlap) <- colnames(LD_no_overlap) <- paste0("DIFF", 1:20)
+  
+  warnings <- capture_warnings(
+    colocboost_validate_input_data(
+      sumstat = test_sumstat_data$sumstat,
+      LD = LD_no_overlap
+    )
+  )
+  expect_true(any(grepl("removing.*variants since those variants are not in LD", warnings)))
+  expect_true(any(grepl("is empty after filtering", warnings)))
+})
+
diff --git a/tests/testthat/test_utils.R b/tests/testthat/test_utils.R
index a622540..d3d480d 100644
--- a/tests/testthat/test_utils.R
+++ b/tests/testthat/test_utils.R
@@ -85,12 +85,12 @@ test_that("w_cs correctly identifies confidence set for weight vector", {
   result <- w_cs(w, coverage = 0.8)
   
   # Expected result for 80% coverage: w[1] + w[2] = 0.8, so first 2 elements should be 1
-  expected <- c(1, 1, 0, 0)
+  expected <- c(1,2)
   expect_equal(result, expected)
   
   # Test with different coverage
   result2 <- w_cs(w, coverage = 0.9)
-  expected2 <- c(1, 1, 1, 1)  # First 4 elements cover 90% since 3 and 4 with the same 0.1 weight 
+  expected2 <- c(1,2,3,4)  # First 4 elements cover 90% since 3 and 4 with the same 0.1 weight 
   expect_equal(result2, expected2)
 })
 
@@ -490,7 +490,7 @@ test_that("get_merge_ordered_with_indices handles conflicting variable orders co
 test_that("get_merge_ordered_with_indices handles edge cases", {
   
   # Test with NULL inputs
-  expect_equal(get_merge_ordered_with_indices(list()), character(0))
+  expect_equal(get_merge_ordered_with_indices(list()), NULL)
   
   # Test with single vector
   expect_equal(get_merge_ordered_with_indices(list(c("A", "B", "C"))), c("A", "B", "C"))
diff --git a/vignettes/ColocBoost_Wrapper_Pipeline.Rmd b/vignettes/ColocBoost_Wrapper_Pipeline.Rmd
index 6b5a5a4..800d2f7 100644
--- a/vignettes/ColocBoost_Wrapper_Pipeline.Rmd
+++ b/vignettes/ColocBoost_Wrapper_Pipeline.Rmd
@@ -22,259 +22,8 @@ This vignette demonstrates how to use the bioinformatics pipeline for ColocBoost
 - See more details about input data preparation in `xqtl_protocol` with [link](https://statfungen.github.io/xqtl-protocol/code/mnm_analysis/mnm_methods/colocboost.html).
 
 
-# 1. Loading Data using `colocboost_analysis_pipeline` function
+Step 1: Loading individual-level and summary statistics using `load_multitask_regional_data` function from multiple cohorts or datasets
 
-This function harmonizes the input data and prepares it for colocalization analysis. 
 
+Step 2: Perform ColocBoost using `colocboost_analysis_pipeline` function
 
-In this section, we introduce how to load the regional data required for the ColocBoost analysis using the `load_multitask_regional_data` function. 
-This function loads mixed datasets for a specific region, including individual-level data (genotype, phenotype, covariate data), summary statistics 
-(sumstats, LD), or a combination of both. It runs `load_regional_univariate_data` for each individual-level dataset and `load_rss_data` for each summary statistics dataset.
-The output is a list with `individual_data` and `sumstat_data` components, where `individual_data` is a list of individual-level data and `sumstat_data` is a list of summary statistics data. 
-This list is then passed to the `colocboost_analysis_pipeline` function for the colocalization analysis.
-
-
-Below are the input parameters for this function for loading individual-level data:
-
-## 1.1. Loading individual-level data from multiple cohorts
-
-inputs:
-- **`region`**: String ; Genomic region of interest in the format of `chr:start-end` for the phenotype region you want to analyze. 
-- **`genotype_list`**: Character vector; Paths for PLINK bed files containing genotype data (do NOT include .bed suffix). 
-- **`phenotype_list`**: Character vector; Paths for phenotype file names.
-- **`covariate_list`**: Character vector; Paths for covariate file names for each phenotype. Must have the same length as the phenotype file vector.
-- **`conditions_list_individual`**: Character vector; Strings representing different conditions or groups used for naming. Must have the same length as the phenotype file vector.
-- **`match_geno_pheno`**: Integer vector; Indices of phenotypes matched to genotype if multiple genotype PLINK files are used. For each phenotype file in `phenotype_list`, the index of the genotype file in `genotype_list` it matches with.
-- **`association_window`**: String; Genomic region of interest in the format of `chr:start-end` for the association analysis window of variants to test (cis or trans). If not provided, all genotype data will be loaded.
-- **`extract_region_name`**: List of character vectors; Phenotype names (e.g., gene ID `ENSG00000269699`) to subset the phenotype data when there are multiple phenotypes availible in the region. Must have the same length as the phenotype file vector. Default is `NULL`, which will use all phenotypes in the region.
-- **`region_name_col`**: Integer; 1-based index of the column containing the region name (i.e. 4 for gene ID in a bed file). Required if `extract_region_name` is not `NULL`, or if multiple phenotypes fall into the same region in one phenotype file
-- **`keep_indel`**: Logical; indicating whether to keep insertions/deletions (INDELs). Default is `TRUE`.
-- **`keep_samples`**: Character vector; Sample names to keep. Default is `NULL`. Currently only supports keeping the same samples from all genotype and phenotype files.
-- **`maf_cutoff`**: Numeric; Minimum minor allele frequency (MAF) cutoff. Default is 0.
-- **`mac_cutoff`**: Numeric; Minimum minor allele count (MAC) cutoff. Default is 0.
-- **`xvar_cutoff`**: Numeric; Minimum genotype variance cutoff. Default is 0.
-- **`imiss_cutoff`**: Numeric; Maximum individual missingness cutoff. Default is 0.
-
-outputs: 
-- **`region_data`**: List (with `individual_data`, `sumstat_data`); Output of the `load_multitask_regional_data` function. If only individual-level data is loaded, `sumstat_data` will be `NULL`.
-
-
-**Indivudual-level data loading example**
-
-The following example demonstrates how to set up input data with 3 phenotypes and 2 cohorts. The first cohort has 2 phenotypes and the second cohort has 1 phenotype. The first phenotype has 2 genes and the second phenotype has 1 gene.
-
-```{r, data-loader-individual}
-# Example of loading individual-level data
-region = "chr1:1000000-2000000"
-genotype_list = c("plink_cohort1.1", "plink_cohort1.2")
-phenotype_list = c("phenotype1_cohort1.bed.gz", "phenotype2_cohort1.bed.gz", "phenotype1_cohort2.bed.gz")
-covariate_list = c("covariate1_cohort1.bed.gz", "covariate2_cohort1.bed.gz", "covariate1_cohort2.bed.gz")
-conditions_list_individual = c("phenotype1_cohort1", "phenotype2_cohort1", "phenotype1_cohort2")
-match_geno_pheno = c(1,1,2)
-association_window = "chr1:1000000-2000000" # set to be the same as region for cis-analysis
-extract_region_name = list(c("ENSG00000269699, ENSG00000789633"), c("ENSG00000269699"), c("ENSG00000269699", "ENSG00000789633"))
-region_name_col = 4
-keep_indel = TRUE
-keep_samples = c("SAMPLE1", "SAMPLE2", "SAMPLE3")
-
-# Following parameters need to be set according to your data
-maf_cutoff = 0.01
-mac_cutoff = 10
-xvar_cutoff = 0
-imiss_cutoff = 0.9
-
-# More advanced parameters see pecotmr::load_multitask_regional_data()
-
-region_data_individual <- load_multitask_regional_data(
-    region = region,
-    genotype_list = genotype_list,
-    phenotype_list = phenotype_list,
-    covariate_list = covariate_list,
-    conditions_list_individual = conditions_list_individual,
-    match_geno_pheno = match_geno_pheno,
-    association_window = association_window,
-    region_name_col = region_name_col,
-    extract_region_name = extract_region_name,
-    keep_indel = keep_indel,
-    keep_samples = keep_samples,
-    maf_cutoff = maf_cutoff,
-    mac_cutoff = mac_cutoff,
-    xvar_cutoff = xvar_cutoff,
-    imiss_cutoff = imiss_cutoff
-)
-
-```
-
-
-
-## 1.2. Loading summary statistics from multiple cohorts or datasets
-
-inputs:
-- **`sumstat_path_list`**: Character vector; Paths to the summary statistics.
-- **`column_file_path_list`**: Character vector; Paths to the column mapping files. See below for expected format.
-- **`LD_meta_file_path_list`**: Character vector; Paths to LD metadata files. See below for expected format.
-- **`conditions_list_sumstat`**: Character vector; Strings representing different sumstats used for naming. Must have the same length as the sumstat file vector.
-- **`match_LD_sumstat`**: List of character vectors; Mapping each LD metadata file to the summary-statistics conditions to pair with it. Length must equal `LD_meta_file_path_list`. Each element is a character vector of names present in `conditions_list_sumstat`. If omitted or an element is empty, defaults to all conditions for the first LD.
-- **`association_window`**: String; Genomic region of interest in the format of `chr:start-end` for the association analysis window of variants to test (cis or trans). If not provided, all genotype data will be loaded.
-- **`n_samples`**: Integer vector; Sample size. Set a 0 if `n_cases`/`n_controls` are passed explicitly. If unknown, set as 0 and include `n_samples` column in the column mapping file to retrieve from the sumstat file.
-- **`n_cases`**: Integer vector; Number of cases. Set a 0 if `n_samples` is passed explicitly. If unknown, set as 0 and include `n_cases` column in the column mapping file to retrieve from the sumstat file.
-- **`n_controls`**: Integer vector; Number of controls. Set a 0 if `n_samples` is passed explicitly. If unknown, set as 0 and include `n_controls` column in the column mapping file to retrieve from the sumstat file.
-
-outputs: 
-- **`region_data`**: List (with `individual_data`, `sumstat_data`); Output of the `load_multitask_regional_data` function. If only summary statistics data is loaded, `individual_data` will be `NULL`.
-
-**Summary statistics loading example**
-
-The following example demonstrates how to set up input data with 2 summary statistics and one LD reference.
-
-```{r, data-loader-sumstat}
-# Example of loading summary statistics
-sumstat_path_list = c("sumstat1.tsv.gz", "sumstat2.tsv.gz")
-column_file_path_list = c("column_mapping_sumstat1.yml", "column_mapping_sumstat2.yml")
-LD_meta_file_path_list = c("ld_meta_file.tsv")
-conditions_list_sumstat = c("sumstat_1", "sumstat_2")
-match_LD_sumstat = c("sumstat_1", "sumstat_2")
-association_window = "chr1:1000000-2000000"
-
-# Following parameters need to be set according to your data
-n_samples = c(300000, 0)
-n_cases = c(0, 20000)
-n_controls = c(0, 40000)
-
-
-# More advanced parameters see pecotmr::load_multitask_regional_data()
-
-region_data_sumstat <- load_multitask_regional_data(
-    sumstat_path_list = sumstat_path_list,
-    column_file_path_list = column_file_path_list,
-    LD_meta_file_path_list = LD_meta_file_path_list,
-    conditions_list_sumstat = conditions_list_sumstat,
-    match_LD_sumstat = match_LD_sumstat,
-    association_window = association_window,
-    n_samples = n_samples,
-    n_cases = n_cases,
-    n_controls = n_controls
-)
-```
-
-
-
-**Expected format for column mapping file**
-The column mapping file is YAML (`.yml`) with key: value pairs mapping your input column names to the standardized names expected by the loader. 
-Required columns are `chrom`, `pos`, `A1`, and `A2`, and either `z` or `beta` and `sebeta`. 
-Either 'n_case' and 'n_control' or 'n_samples' can be passed as part of the column mapping, but will be overwritten by the n_cases and n_controls or n_samples parameterspassed explicitly.
-```yaml
-# required
-chrom: chromosome
-pos: position
-A1: effect_allele
-A2: non_effect_allele
-
-z: zscore 
-# or
-beta: beta
-sebeta: sebeta
-
-# optional, will be overridden by n_samples or n_cases and n_controls if passed explicitly
-n_case: NCASE
-n_control: NCONTROL
-# or
-n_sample: N
-```
-
-
-**Expected format for LD metadata file**
-LD files sould be in the format generated by for instance `plink --r squared`, then xz compressed.
-The LD metadata file is a tab-separated file with the following columns:
-- `chrom`: chromosome
-- `start`: start position
-- `end`: end position
-- `ld_path, bim_path`: path to the LD block file and bim file
-```
-1	1000000	2000000	ld_block_1.ld.gz,ld_block_1.bim
-```
-
-
-# 2. Perform ColocBoost using `colocboost_analysis_pipeline` function
-
-In this section, we load region data for a combination of individual-level and summary statistics data, then perform the colocalization analysis using the `colocboost_analysis_pipeline` function. 
-The colocalization analysis can be run in any one of three modes, or in a combination of these modes (names assume that individual-level data is xQTL data and summary statistics data is GWAS data):
-
-- **`xQTL-only mode`**: Only perform colocalization analysis on the individual-level data. Summary statistics data is not used.
-- **`joint GWAS mode`**: Perform colocalization analysis in disease-agnostic mode on the individual-level and summary statistics data together.
-- **`separate GWAS mode`**: Perform colocalization analysis in disease-prioritized mode on the the individual-level data and each summary statistics dataset separately, treating each summary statistics dataset as the focal trait.
-
-inputs:
-- **`region_data`**: List (with `individual_data`, `sumstat_data`); Output of the `load_multitask_regional_data` function.
-- **`focal_trait`**: String; For xQTL-only mode, the name of the trait to perform disease-prioritized ColocBoost, from `conditions_list_individual`. If not provided, xQTL-only mode will be run without disease-prioritized mode.
-- **`event_filters`**: List of character vectors; Patterns for filtering events based on context names. 
-Example: for sQTL, `list(type_pattern = ".*clu_(\\d+_[+-?]).*", valid_pattern = "clu_(\\d+_[+-?]):PR:", exclude_pattern = "clu_(\\d+_[+-?]):IN:")`.
-- **`maf_cutoff`**: Numeric; Minor allele frequency cutoff. Default is 0.005.
-- **`pip_cutoff_to_skip_ind`**: Integer vector; Cutoff values for skipping analysis based on pre-screening with single-effect SuSiE (L=1). Context is skipped if none of the variants in the context have PIP values greater than the cutoff. Default is 0 (does not run single-effect SuSiE). Passing a negative value sets the cutoff to 3/number of variants.
-- **`pip_cutoff_to_skip_sumstat`**: Integer vector; Cutoff values for skipping analysis based on pre-screening with single-effect SuSiE (L=1). Sumstat is skipped if none of the variants in the sumstat have PIP values greater than the cutoff. Default is 0 (does not run single-effect SuSiE). Passing a negative value sets the cutoff to 3/number of variants.
-- **`qc_method`**: String; Quality control method to use. Options are "rss_qc", "dentist", or "slalom". Default is `rss_qc`.
-- **`impute`**: Logical; if TRUE, performs imputation for outliers identified in the analysis. Default is `TRUE`.
-- **`impute_opts`**: List of lists; Imputation options including rcond, R2_threshold, and minimum_ld. Default is `list(rcond = 0.01, R2_threshold = 0.6, minimum_ld = 5)`.
-- **`xqtl_coloc`**: Logical; if TRUE, performs xQTL-only mode. Default is `TRUE`.
-- **`joint_gwas`**: Logical; if TRUE, performs joint GWAS mode, mapping all individual-level and sumstat data together.Default is `FALSE`.
-- **`separate_gwas`**: Logical; if TRUE, runs separate GWAS mode, where each sumstat dataset is analyzed separately with all individual-level data, treating each sumstat as the focal trait in disease-prioritized mode. Default is `FALSE`.
-
-outputs:
-- **`colocboost_results`**: List of colocboost objects (with `xqtl_coloc`, `joint_gwas`, `separate_gwas`); Output of the `colocboost_analysis_pipeline` function. If the mode is not run, the corresponding element will be `NULL`.
-
-```{r, colocboost-analysis}
-# load in individual-level and sumstat data
-region_data_combined <- load_multitask_regional_data(
-    region = region,
-    genotype_list = genotype_list,
-    phenotype_list = phenotype_list,
-    covariate_list = covariate_list,
-    conditions_list_individual = conditions_list_individual,
-    match_geno_pheno = match_geno_pheno,
-    association_window = association_window,
-    region_name_col = region_name_col,
-    extract_region_name = extract_region_name,
-    keep_indel = keep_indel,
-    keep_samples = keep_samples,
-    maf_cutoff = maf_cutoff,
-    mac_cutoff = mac_cutoff,
-    xvar_cutoff = xvar_cutoff,
-    imiss_cutoff = imiss_cutoff,
-    sumstat_path_list = sumstat_path_list,
-    column_file_path_list = column_file_path_list,
-    LD_meta_file_path_list = LD_meta_file_path_list,
-    conditions_list_sumstat = conditions_list_sumstat,
-    match_LD_sumstat = match_LD_sumstat,
-    n_samples = n_samples,
-    n_cases = n_cases,
-    n_controls = n_controls
-)
-
-maf_cutoff = 0.01
-pip_cutoff_to_skip_ind = rep(0, length(phenotype_list))
-pip_cutoff_to_skip_sumstat = rep(0, length(sumstat_path_list))
-qc_method = "rss_qc" 
-
-# run colocboost analysis
-colocboost_results <- colocboost_analysis_pipeline(
-    region_data_combined,
-    maf_cutoff = maf_cutoff, 
-    pip_cutoff_to_skip_ind = pip_cutoff_to_skip_ind,
-    pip_cutoff_to_skip_sumstat = pip_cutoff_to_skip_sumstat,
-    qc_method = qc_method,
-    xqtl_coloc = TRUE,
-    joint_gwas = TRUE,
-    separate_gwas = TRUE
-)
-
-# visualize results for xQTL-only mode
-colocboost_plot(colocboost_results$xqtl_coloc)
-
-# visualize results for joint GWAS mode
-colocboost_plot(colocboost_results$joint_gwas)
-
-# visualize results for separate GWAS mode
-for (i in 1:length(colocboost_results$separate_gwas)) {
-    colocboost_plot(colocboost_results$separate_gwas[[i]])
-}
-```
\ No newline at end of file
diff --git a/vignettes/Visualization_ColocBoost_Output.Rmd b/vignettes/Visualization_ColocBoost_Output.Rmd
index 1ffd7d9..8ed3e5b 100644
--- a/vignettes/Visualization_ColocBoost_Output.Rmd
+++ b/vignettes/Visualization_ColocBoost_Output.Rmd
@@ -94,9 +94,11 @@ There are three options available for plotting the CoS variants to uncolocalized
 colocboost_plot(res, show_cos_to_uncoloc = TRUE)
 ```
 
-### 2.4. Plot with an added vertical line
+### 2.4. Plot with added highlight points
 
-You can add a vertical line to the plot by setting `add_vertical = TRUE` and `add_vertical_idx = **`. This will add a vertical line at the specified index.
+You can highlight specific variants in the plot by setting `add_highlight = TRUE` and `add_highlight_idx = **`. 
+This will add red dashed vertical lines (defult `add_highlight_style = "vertical_lines"`) at the specified index you want to highlight. 
+Alternatively, you can use `add_highlight_style = "star"` to change the highlight style to the red star for the specified variants.
 For example, to add a vertical line at true causal variants, you can set `add_vertical_idx = unique(unlist(Ind_5traits$true_effect_variants))`.
 Following plot also shows the top variants.
 
@@ -104,8 +106,9 @@ Following plot also shows the top variants.
 ```{r vertical-plot}
 colocboost_plot(
   res, show_top_variables = TRUE, 
-  add_vertical = TRUE, 
-  add_vertical_idx = unique(unlist(Ind_5traits$true_effect_variants))
+  add_highlight = TRUE, 
+  add_highlight_idx = unique(unlist(Ind_5traits$true_effect_variants)),
+  add_highlight_style = "star"
 )
 ```