From 47ae7d07aaa1779287cce5d2388fa82eb89f10ee Mon Sep 17 00:00:00 2001
From: Kate Lawrence <klawren@stanford.edu>
Date: Thu, 30 Oct 2025 10:38:40 -0700
Subject: [PATCH 1/3] add rss_qc as default qc_method

---
 R/colocboost_pipeline.R | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/R/colocboost_pipeline.R b/R/colocboost_pipeline.R
index 2b22d712..d12bd6d9 100644
--- a/R/colocboost_pipeline.R
+++ b/R/colocboost_pipeline.R
@@ -554,6 +554,7 @@ qc_regional_data <- function(region_data,
                              qc_method = c("rss_qc", "dentist", "slalom"),
                              impute = TRUE,
                              impute_opts = list(rcond = 0.01, R2_threshold = 0.6, minimum_ld = 5, lamb = 0.01)) {
+  if (!is.null(qc_method)) qc_method <- match.arg(qc_method)
   #### related internal functions
   # Add context names to colname of Y if missing
   add_context_to_Y <- function(res_Y) {
@@ -675,6 +676,7 @@ qc_regional_data <- function(region_data,
                                          qc_method = c("rss_qc", "dentist", "slalom"),
                                          impute = TRUE,
                                          impute_opts = list(rcond = 0.01, R2_threshold = 0.6, minimum_ld = 5, lamb = 0.01)) {
+    if (!is.null(qc_method)) qc_method <- match.arg(qc_method)
     n_LD <- length(sumstat_data$LD_info)
     final_sumstats <- final_LD <- NULL
     LD_match <- c()

From 4d60845831620da3a0b8d8a43be9f54aa4d9880f Mon Sep 17 00:00:00 2001
From: Kate Lawrence <klawren@stanford.edu>
Date: Thu, 30 Oct 2025 10:42:40 -0700
Subject: [PATCH 2/3] add default pip_cutoff for multiple individual contexts
 or sumstats

---
 R/colocboost_pipeline.R | 34 +++++++++++++++++++++++++++++++---
 1 file changed, 31 insertions(+), 3 deletions(-)

diff --git a/R/colocboost_pipeline.R b/R/colocboost_pipeline.R
index d12bd6d9..5b98aab4 100644
--- a/R/colocboost_pipeline.R
+++ b/R/colocboost_pipeline.R
@@ -615,6 +615,15 @@ qc_regional_data <- function(region_data,
     # - add context to colname of Y
     Y <- add_context_to_Y(Y)
     n_context <- length(X)
+    # Validate pip_cutoff_to_skip_ind length: allow scalar or exact length; otherwise error
+    if (length(pip_cutoff_to_skip_ind) == 1) {
+      pip_cutoff_vec <- rep(pip_cutoff_to_skip_ind, n_context)
+    } else if (length(pip_cutoff_to_skip_ind) == n_context) {
+      pip_cutoff_vec <- pip_cutoff_to_skip_ind
+    } else {
+      stop(sprintf("pip_cutoff_to_skip_ind length (%d) must be 1 or equal to number of contexts (%d)",
+                   length(pip_cutoff_to_skip_ind), n_context))
+    }
     residual_X <- residual_Y <- list()
     keep_contexts <- c()
     for (i_context in 1:n_context) {
@@ -626,8 +635,8 @@ qc_regional_data <- function(region_data,
       # - remove variants with maf < maf_cutoff
       # tmp <- filter_resX_maf(resX, maf, maf_cutoff = maf_cutoff)
       resX <- filter_X(resX, missing_rate_thresh = NULL, maf_thresh = maf_cutoff, maf = maf)
-      # Initial PIP check
-      resY <- filter_resY_pip(resX, resY, pip_cutoff = pip_cutoff_to_skip_ind[i_context], context = context)
+      # Initial PIP check (use recycled vector element)
+      resY <- filter_resY_pip(resX, resY, pip_cutoff = pip_cutoff_vec[i_context], context = context)
       if (!is.null(resY)) {
         residual_X <- c(residual_X, list(resX))
         residual_Y <- c(residual_Y, list(resY))
@@ -689,7 +698,26 @@ qc_regional_data <- function(region_data,
         n <- sumstat$n
         var_y <- sumstat$var_y
         conditions_sumstat <- names(sumstats)[ii]
-        pip_cutoff_to_skip_ld <- pip_cutoff_to_skip_sumstat[conditions_sumstat] %>% as.numeric()
+        # Determine PIP cutoff per sumstat with strict validation:
+        # - scalar applies to all studies
+        # - length equals number of studies -> positional mapping
+        # - named vector that covers ALL study names -> name-based mapping
+        # Otherwise: throw error (do not silently default)
+        if (length(pip_cutoff_to_skip_sumstat) == 1) {
+          pip_cutoff_to_skip_ld <- as.numeric(pip_cutoff_to_skip_sumstat)
+        } else if (length(pip_cutoff_to_skip_sumstat) == length(sumstats) && is.null(names(pip_cutoff_to_skip_sumstat))) {
+          pip_cutoff_to_skip_ld <- as.numeric(pip_cutoff_to_skip_sumstat[ii])
+        } else if (!is.null(names(pip_cutoff_to_skip_sumstat))) {
+          missing_names <- setdiff(names(sumstats), names(pip_cutoff_to_skip_sumstat))
+          if (length(missing_names) > 0) {
+            stop(sprintf("pip_cutoff_to_skip_sumstat must provide values for all studies; missing: %s",
+                         paste(missing_names, collapse = ", ")))
+          }
+          pip_cutoff_to_skip_ld <- as.numeric(pip_cutoff_to_skip_sumstat[conditions_sumstat])
+        } else {
+          stop(sprintf("pip_cutoff_to_skip_sumstat length (%d) must be 1, equal to number of studies (%d), or be a named vector covering all studies",
+                       length(pip_cutoff_to_skip_sumstat), length(sumstats)))
+        }
 
         # Preprocess the input data
         preprocess_results <- rss_basic_qc(sumstat$sumstats, LD_data, remove_indels = remove_indels)

From 705eae49f8b92fdeea9d82299481b48bcaa8b19d Mon Sep 17 00:00:00 2001
From: Kate Lawrence <klawren@stanford.edu>
Date: Thu, 30 Oct 2025 10:58:10 -0700
Subject: [PATCH 3/3] get names from region_name_col even if
 extract_region_name isn't provided. Add fallback region nameing (reigon_1,
 region_2, ..)

---
 R/file_utils.R | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/R/file_utils.R b/R/file_utils.R
index ce3253ff..6e2444a6 100644
--- a/R/file_utils.R
+++ b/R/file_utils.R
@@ -243,7 +243,20 @@ load_phenotype_data <- function(phenotype_path, region, extract_region_name = NU
         stop("region_name_col is out of bounds for the number of columns in tabix_data.")
       }
     } else {
-      return(tabix_data %>% t())
+      # No extract_region_name filter: still transpose and assign names appropriately
+      result <- tabix_data %>% t()
+      if (!is.null(region_name_col) && (region_name_col %% 1 == 0)) {
+        if (region_name_col <= ncol(tabix_data)) {
+          # After transpose, the desired name row is at index region_name_col
+          colnames(result) <- result[region_name_col, ]
+        } else {
+          stop("region_name_col is out of bounds for the number of columns in tabix_data.")
+        }
+      } else if (is.null(colnames(result))) {
+        # Fallback: synthesize names if still missing
+        colnames(result) <- paste0("region_", seq_len(ncol(result)))
+      }
+      return(result)
     }
   }))