From 00f205810c725ab6c964ff3cf4cf5a4c36d2016e Mon Sep 17 00:00:00 2001
From: Kate Lawrence <klawren@stanford.edu>
Date: Thu, 2 Oct 2025 11:24:16 -0700
Subject: [PATCH 01/15] clean NAs before mutated in alelle_qc

---
 R/allele_qc.R | 227 +++++++++++++++++++++++++++-----------------------
 1 file changed, 124 insertions(+), 103 deletions(-)

diff --git a/R/allele_qc.R b/R/allele_qc.R
index 7b32d0a8..69fb1471 100644
--- a/R/allele_qc.R
+++ b/R/allele_qc.R
@@ -18,184 +18,205 @@
 #' @param remove_unmatched Whether to remove unmatched variants. Default is `TRUE`.
 #' @return A single data frame with matched variants.
 #' @importFrom magrittr %>%
-#' @importFrom dplyr mutate inner_join filter pull select everything row_number if_else
+#' @importFrom dplyr mutate inner_join filter pull select everything row_number if_else any_of all_of rename
 #' @importFrom vctrs vec_duplicate_detect
-#' @importFrom dplyr if_else
 #' @importFrom tidyr separate
 #' @export
 allele_qc <- function(target_data, ref_variants, col_to_flip = NULL,
-                      match_min_prop = 0.2, remove_dups = TRUE,
-                      remove_indels = FALSE, remove_strand_ambiguous = TRUE,
-                      flip_strand = FALSE, remove_unmatched = TRUE, remove_same_vars = FALSE) {
-    strand_flip <- function(ref) {
-    # Define a mapping for complementary bases
-    base_mapping <- c("A" = "T", "T" = "A", "G" = "C", "C" = "G")
-
-    # Function to complement a single base
-    complement_base <- function(base) {
-      complement <- base_mapping[base]
-      return(complement)
-    }
-
-    # Function to reverse and complement a DNA sequence
-    reverse_complement <- function(sequence) {
-      reversed <- rev(strsplit(sequence, NULL)[[1]])
-      complemented <- sapply(reversed, complement_base, USE.NAMES = FALSE)
-      return(paste(complemented, collapse = ""))
-    }
-
-    complemented_sequence <- sapply(ref, reverse_complement, USE.NAMES = FALSE)
-
-    return(complemented_sequence)
+                     match_min_prop = 0.2, remove_dups = TRUE,
+                     remove_indels = FALSE, remove_strand_ambiguous = TRUE,
+                     flip_strand = FALSE, remove_unmatched = TRUE, remove_same_vars = FALSE) {
+	strand_flip <- function(ref) {
+	# Define a mapping for complementary bases
+	base_mapping <- c("A" = "T", "T" = "A", "G" = "C", "C" = "G")
+
+	# Function to complement a single base
+	complement_base <- function(base) {
+	  complement <- base_mapping[base]
+	  return(complement)
+	}
+
+	# Function to reverse and complement a DNA sequence
+	reverse_complement <- function(sequence) {
+	  reversed <- rev(strsplit(sequence, NULL)[[1]])
+	  complemented <- sapply(reversed, complement_base, USE.NAMES = FALSE)
+	  return(paste(complemented, collapse = ""))
+	}
+
+	complemented_sequence <- sapply(ref, reverse_complement, USE.NAMES = FALSE)
+
+	return(complemented_sequence)
+  }
+
+  # helper to sanitize column names to avoid NA/empty names that break dplyr verbs
+  sanitize_names <- function(df) {
+	 nm <- colnames(df)
+	 if (is.null(nm)) {
+	   nm <- rep("unnamed", ncol(df))
+	 }
+	 empty_idx <- is.na(nm) | nm == ""
+	 if (any(empty_idx)) {
+	   # assign stable placeholder names for empties
+	   nm[empty_idx] <- paste0("unnamed_", seq_len(sum(empty_idx)))
+	 }
+	 # ensure names are unique and syntactic
+	 nm <- base::make.unique(nm, sep = "_")
+	 colnames(df) <- nm
+	 df
   }
 
   # check if the pattern is ATCG/DI
   check_ATCG <- function(vec) {
-    pattern <- "^[ATCGDI]+$"
+	pattern <- "^[ATCGDI]+$"
 
-    # Function to check if a single element matches the pattern
-    check_element <- function(element) {
-      grepl(pattern, element)
-    }
+	# Function to check if a single element matches the pattern
+	check_element <- function(element) {
+	  grepl(pattern, element)
+	}
 
-    result <- sapply(vec, check_element, USE.NAMES = FALSE)
-    return(result)
+	result <- sapply(vec, check_element, USE.NAMES = FALSE)
+	return(result)
   }
 
   # transform all inputs to dataframe
   if (is.data.frame(target_data)) {
-     if (ncol(target_data) > 4 && all(c("chrom", "pos", "A2", "A1") %in% names(target_data))) {
-        # Extract variant columns and standardize
-        variant_cols <- c("chrom", "pos", "A2", "A1")
-        variant_df <- target_data %>% select(all_of(variant_cols))
-        other_cols <- target_data %>% select(-all_of(variant_cols))
-        target_data <- cbind(variant_id_to_df(variant_df), other_cols)
-     } else {
-        target_data <- variant_id_to_df(target_data)
-     }
+	 if (ncol(target_data) > 4 && all(c("chrom", "pos", "A2", "A1") %in% names(target_data))) {
+		# Extract variant columns and standardize
+		variant_cols <- c("chrom", "pos", "A2", "A1")
+		variant_df <- target_data %>% select(all_of(variant_cols))
+		other_cols <- target_data %>% select(-all_of(variant_cols))
+		target_data <- cbind(variant_id_to_df(variant_df), other_cols)
+	 } else {
+		target_data <- variant_id_to_df(target_data)
+	 }
   } else {
-        target_data <- variant_id_to_df(target_data)
+		target_data <- variant_id_to_df(target_data)
   }
   ref_variants <- variant_id_to_df(ref_variants)
+
   columns_to_remove <- c("chromosome", "position", "ref", "alt", "variant_id")
 
   # Check if any of the specified columns are present
   if (any(columns_to_remove %in% colnames(target_data))) {
-    target_data <- select(target_data, -any_of(columns_to_remove))
+	 target_data <- select(target_data, -any_of(columns_to_remove))
   }
   match_result <- merge(target_data, ref_variants, by = c("chrom", "pos"), all = FALSE, suffixes = c(".target", ".ref")) %>% as.data.frame()
+
+  # sanitize names after merge as well (merge can introduce empty names in edge cases)
+  match_result <- sanitize_names(match_result)
+
   if (nrow(match_result) == 0) {
-    warning("No matching variants found between target data and reference variants.") 
-    return(list(target_data_qced = match_result, qc_summary = match_result))
+	warning("No matching variants found between target data and reference variants.") 
+	return(list(target_data_qced = match_result, qc_summary = match_result))
   }
     # match target & ref by chrom and position
   match_result = match_result %>%
-    mutate(variants_id_original = paste(chrom, pos, A2.target, A1.target, sep = ":")) %>%
-    mutate(variants_id_qced = paste(chrom, pos, A2.ref, A1.ref, sep = ":")) %>%
-    # filter out totally same rows.
-    filter(duplicated(.) | !duplicated(.)) %>%
-    # upper case target/reference A1 A2
-    mutate(across(c(A1.target, A2.target, A1.ref, A2.ref), toupper)) %>%
-    mutate(flip1.ref = strand_flip(A1.ref), flip2.ref = strand_flip(A2.ref)) %>%
-    # these pairings are ambiguous: because we cannot tell it's an sign flip / strand flip
-    mutate(strand_unambiguous = if_else((A1.target == "A" & A2.target == "T") | (A1.target == "T" & A2.target == "A") |
-      (A1.target == "C" & A2.target == "G") | (A1.target == "G" & A2.target == "C"), FALSE, TRUE)) %>%
-    # filter out non-ATCG coded alleles
-    mutate(non_ATCG = !(check_ATCG(A1.target) & check_ATCG(A2.target))) %>%
-    # exact match should be kept all the time
-    mutate(exact_match = A1.target == A1.ref & A2.target == A2.ref) %>%
-    mutate(sign_flip = ((A1.target == A2.ref & A2.target == A1.ref) | (A1.target == flip2.ref & A2.target == flip1.ref)) & (A1.target != A1.ref & A2.target != A2.ref)) %>%
-    mutate(strand_flip = ((A1.target == flip1.ref & A2.target == flip2.ref) | (A1.target == flip2.ref & A2.target == flip1.ref)) & (A1.target != A1.ref & A2.target != A2.ref)) %>%
-    mutate(INDEL = (A2.target == "I" | A2.target == "D" | nchar(A2.target) > 1 | nchar(A1.target) > 1)) %>%
-    mutate(ID_match = ((A2.target == "D" | A2.target == "I") & (nchar(A1.ref) > 1 | nchar(A2.ref) > 1)))
+	mutate(variants_id_original = paste(chrom, pos, A2.target, A1.target, sep = ":")) %>%
+	mutate(variants_id_qced = paste(chrom, pos, A2.ref, A1.ref, sep = ":")) %>%
+	# filter out totally same rows.
+	filter(duplicated(.) | !duplicated(.)) %>%
+	# upper case target/reference A1 A2
+	mutate(across(c(A1.target, A2.target, A1.ref, A2.ref), toupper)) %>%
+	mutate(flip1.ref = strand_flip(A1.ref), flip2.ref = strand_flip(A2.ref)) %>%
+	# these pairings are ambiguous: because we cannot tell it's an sign flip / strand flip
+	mutate(strand_unambiguous = if_else((A1.target == "A" & A2.target == "T") | (A1.target == "T" & A2.target == "A") |
+	  (A1.target == "C" & A2.target == "G") | (A1.target == "G" & A2.target == "C"), FALSE, TRUE)) %>%
+	# filter out non-ATCG coded alleles
+	mutate(non_ATCG = !(check_ATCG(A1.target) & check_ATCG(A2.target))) %>%
+	# exact match should be kept all the time
+	mutate(exact_match = A1.target == A1.ref & A2.target == A2.ref) %>%
+	mutate(sign_flip = ((A1.target == A2.ref & A2.target == A1.ref) | (A1.target == flip2.ref & A2.target == flip1.ref)) & (A1.target != A1.ref & A2.target != A2.ref)) %>%
+	mutate(strand_flip = ((A1.target == flip1.ref & A2.target == flip2.ref) | (A1.target == flip2.ref & A2.target == flip1.ref)) & (A1.target != A1.ref & A2.target != A2.ref)) %>%
+	mutate(INDEL = (A2.target == "I" | A2.target == "D" | nchar(A2.target) > 1 | nchar(A1.target) > 1)) %>%
+	mutate(ID_match = ((A2.target == "D" | A2.target == "I") & (nchar(A1.ref) > 1 | nchar(A2.ref) > 1)))
 
   # if not remove, then this should'nt be a condition to filter out any variants
   if (!remove_strand_ambiguous) {
-    match_result <- match_result %>% mutate(strand_unambiguous = TRUE)
+	match_result <- match_result %>% mutate(strand_unambiguous = TRUE)
   }
   # if all strand flip is un-ambigous, then we know ambigous cases are indeed a strand flip
   # not a sign flip, then we infer there is no ambigous in the whole dataset, and keep those ambigous ones
   if (nrow(match_result %>% filter(strand_flip == TRUE) %>% filter(strand_unambiguous == TRUE)) == 0) {
-    match_result <- match_result %>% mutate(strand_unambiguous = TRUE)
+	match_result <- match_result %>% mutate(strand_unambiguous = TRUE)
   }
 
   # To keep variants: if it's a strand flip, we will keep those unambigous (because if ambigous, cannot know it's trand / sign flip, so discard all)
   # or exact match or indel match (ID_match)
   # If not a strand flip, then we will keep those that are exact match / those are sign flip / INDEL matched
   match_result <- match_result %>% mutate(keep = if_else(strand_flip, true = (strand_unambiguous | exact_match | ID_match), false =
-    (exact_match | sign_flip | ID_match)
+	(exact_match | sign_flip | ID_match)
   ))
 
   if (remove_indels) {
-    match_result <- match_result %>% mutate(keep = if_else(INDEL == FALSE, FALSE, TRUE))
+	match_result <- match_result %>% mutate(keep = if_else(INDEL == FALSE, FALSE, TRUE))
   }
 
   # flip the signs of the column col_to_flip if there is a sign flip
   if (!is.null(col_to_flip)) {
-    if (!is.null(match_result[, col_to_flip])) {
-      match_result[match_result$sign_flip, col_to_flip] <- -1 * match_result[match_result$sign_flip, col_to_flip]
-    } else {
-      stop("Column '", col_to_flip, "' not found in target_data.")
-    }
+	if (!is.null(match_result[, col_to_flip])) {
+	  match_result[match_result$sign_flip, col_to_flip] <- -1 * match_result[match_result$sign_flip, col_to_flip]
+	} else {
+	  stop("Column '", col_to_flip, "' not found in target_data.")
+	}
   }
   # flip the strands if there is a strand flip
   if (flip_strand) {
-    strand_flipped_indices <- which(match_result$strand_flip)
-    match_result[strand_flipped_indices, "A1.target"] <- strand_flip(match_result[strand_flipped_indices, "A1.target"])
-    match_result[strand_flipped_indices, "A2.target"] <- strand_flip(match_result[strand_flipped_indices, "A2.target"])
+	strand_flipped_indices <- which(match_result$strand_flip)
+	match_result[strand_flipped_indices, "A1.target"] <- strand_flip(match_result[strand_flipped_indices, "A1.target"])
+	match_result[strand_flipped_indices, "A2.target"] <- strand_flip(match_result[strand_flipped_indices, "A2.target"])
   }
 
   # Remove all unnecessary columns used to determine qc status
   # Finally keep those variants with FLAG keep = TRUE
   result <- match_result[match_result$keep, , drop = FALSE]
-    
+	
   # FIXME: I think this parameter is confusing. I inheritated directly from our function, whose default setting is TRUE.
   # It is removing all multi-allelic alleles which is unnecessary. I suggest remove this parameter directly.
   # What we are trying to avoid is the SAME allele having diferent z score. I defined one parameter remove_same_vars later, but I can re-use this
   # remove_dup name
   if (remove_dups) {
-    dups <- vec_duplicate_detect(result[, c("chrom", "pos", "variants_id_qced")])
-    if (any(dups)) {
-      result <- result[!dups, , drop = FALSE]
-      warning("Unexpected duplicates were removed.")
-    }
+	dups <- vec_duplicate_detect(result[, c("chrom", "pos", "variants_id_qced")])
+	if (any(dups)) {
+	  result <- result[!dups, , drop = FALSE]
+	  warning("Unexpected duplicates were removed.")
+	}
   }
-    
+	
   result <- result %>%
-    select(-(flip1.ref:keep)) %>%
-    select(-A1.target, -A2.target) %>%
-    rename(A1 = A1.ref, A2 = A2.ref, variant_id = variants_id_qced)
+	select(-(flip1.ref:keep)) %>%
+	select(-A1.target, -A2.target) %>%
+	rename(A1 = A1.ref, A2 = A2.ref, variant_id = variants_id_qced)
 
   # default FALSE, but if want to remove same variants having different z score, then set as TRUE
   if (remove_same_vars) {
-    same_vars <- vec_duplicate_detect(result[, c("chrom", "pos", "variant_id")])
-    if (any(same_vars)) {
-      result <- result[!same_vars, , drop = FALSE]
-      message("Same variants with different z scores are removed.")
-    }
+	same_vars <- vec_duplicate_detect(result[, c("chrom", "pos", "variant_id")])
+	if (any(same_vars)) {
+	  result <- result[!same_vars, , drop = FALSE]
+	  message("Same variants with different z scores are removed.")
+	}
   }
 
   if (!remove_unmatched) {
-    match_variant <- result %>% pull(variants_id_original)
-    match_result <- select(match_result, -(flip1.ref:keep)) %>%
-      select(-variants_id_original, -A1.target, -A2.target) %>%
-      rename(A1 = A1.ref, A2 = A2.ref, variant_id = variants_id_qced)
-    target_data <- target_data %>% mutate(variant_id = paste(chrom, pos, A2, A1, sep = ":"))
-    if (length(setdiff(target_data %>% pull(variant_id), match_variant)) > 0) {
-      unmatch_data <- target_data %>% filter(!variant_id %in% match_variant)
-      result <- rbind(result, unmatch_data %>% mutate(variants_id_original = variant_id))
-      result <- result[match(target_data$variant_id, result$variants_id_original), ] %>% select(-variants_id_original)
-    }
+	match_variant <- result %>% pull(variants_id_original)
+	match_result <- select(match_result, -(flip1.ref:keep)) %>%
+	  select(-variants_id_original, -A1.target, -A2.target) %>%
+	  rename(A1 = A1.ref, A2 = A2.ref, variant_id = variants_id_qced)
+	target_data <- target_data %>% mutate(variant_id = paste(chrom, pos, A2, A1, sep = ":"))
+	if (length(setdiff(target_data %>% pull(variant_id), match_variant)) > 0) {
+	  unmatch_data <- target_data %>% filter(!variant_id %in% match_variant)
+	  result <- rbind(result, unmatch_data %>% mutate(variants_id_original = variant_id))
+	  result <- result[match(target_data$variant_id, result$variants_id_original), ] %>% select(-variants_id_original)
+	}
   }
 
   min_match <- match_min_prop * nrow(ref_variants)
   if (nrow(result) < min_match) {
-    stop("Not enough variants have been matched.")
+	stop("Not enough variants have been matched.")
   }
 
   # throw an error if there are any variant_id that are duplicated (meaning that same variant having different other infos for example z score)
   if (!remove_same_vars & any(duplicated(result$variant_id))) {
-    stop("In the input, there are duplicated variants with different z scores. Please check the data and determine which to keep.")
+	stop("In the input, there are duplicated variants with different z scores. Please check the data and determine which to keep.")
   }
 
   return(list(target_data_qced = result, qc_summary = match_result))
@@ -288,4 +309,4 @@ align_variant_names <- function(source, reference, remove_indels = FALSE) {
     aligned_variants = aligned_variants,
     unmatched_indices = unmatched_indices
   )
-}
+}
\ No newline at end of file

From 756b6570d1e56c4618ff03d5fc21f2be459d4fca Mon Sep 17 00:00:00 2001
From: Kate Lawrence <klawren@stanford.edu>
Date: Thu, 2 Oct 2025 11:58:39 -0700
Subject: [PATCH 02/15] clean build from LD varaint ids

---
 R/allele_qc.R   |  8 +++++++-
 R/sumstats_qc.R | 22 ++++++++++++++++++++++
 2 files changed, 29 insertions(+), 1 deletion(-)

diff --git a/R/allele_qc.R b/R/allele_qc.R
index 69fb1471..fba18628 100644
--- a/R/allele_qc.R
+++ b/R/allele_qc.R
@@ -230,6 +230,7 @@ allele_qc <- function(target_data, ref_variants, col_to_flip = NULL,
 #'
 #' @param source A character vector of variant names in the format "chr:pos:A2:A1" or "chr:pos_A2_A1".
 #' @param reference A character vector of variant names in the format "chr:pos:A2:A1" or "chr:pos_A2_A1".
+#' @param remove_build_suffix Whether to strip trailing genome build suffixes like ":b38" or "_b38" before alignment. Default TRUE.
 #'
 #' @return A list with two elements:
 #' - aligned_variants: A character vector of aligned variant names.
@@ -241,7 +242,12 @@ allele_qc <- function(target_data, ref_variants, col_to_flip = NULL,
 #' align_variant_names(source, reference)
 #'
 #' @export
-align_variant_names <- function(source, reference, remove_indels = FALSE) {
+align_variant_names <- function(source, reference, remove_indels = FALSE, remove_build_suffix = TRUE) {
+  # Optionally strip build suffix like :b38 or _b38 from both sides for robust alignment
+  if (remove_build_suffix) {
+    source <- gsub("(:|_)b[0-9]+$", "", source)
+    reference <- gsub("(:|_)b[0-9]+$", "", reference)
+  }
   # Check if source and reference follow the expected pattern
   source_pattern <- grepl("^(chr)?[0-9]+:[0-9]+:[ATCG*]+:[ATCG*]+$|^(chr)?[0-9]+:[0-9]+_[ATCG*]+_[ATCG*]+$", source)
   reference_pattern <- grepl("^(chr)?[0-9]+:[0-9]+:[ATCG*]+:[ATCG*]+$|^(chr)?[0-9]+:[0-9]+_[ATCG*]+_[ATCG*]+$", reference)
diff --git a/R/sumstats_qc.R b/R/sumstats_qc.R
index c0424fe3..d0756bbb 100644
--- a/R/sumstats_qc.R
+++ b/R/sumstats_qc.R
@@ -54,6 +54,28 @@ rss_basic_qc <- function(sumstats, LD_data, skip_region = NULL, remove_indels =
 
   sumstats_processed <- allele_flip$target_data_qced %>% arrange(pos)
 
+  # Align and subset LD by mapping core IDs (strip trailing build suffix) to exact LD IDs
+  ld_mat <- LD_data$combined_LD_matrix
+  ld_ids <- tryCatch(rownames(ld_mat), error = function(e) NULL)
+  if (is.null(ld_ids)) {
+    stop("LD matrix rownames are NULL; cannot align variant IDs.")
+  }
+  present <- sumstats_processed$variant_id %in% ld_ids
+  if (sum(present) == 0) {
+    strip_build <- function(x) sub("(:|_)b[0-9]+$", "", x)
+    ld_core <- strip_build(ld_ids)
+    ss_core <- strip_build(sumstats_processed$variant_id)
+    map_idx <- match(ss_core, ld_core)
+    remap <- !is.na(map_idx)
+    if (sum(remap) > 0) {
+      sumstats_processed$variant_id[remap] <- ld_ids[map_idx[remap]]
+      present <- sumstats_processed$variant_id %in% ld_ids
+    }
+  }
+  if (sum(present) == 0) {
+    stop("No overlapping variants between sumstats and LD after alignment.")
+  }
+
   LD_mat_processed <- LD_data$combined_LD_matrix[sumstats_processed$variant_id, sumstats_processed$variant_id, drop = FALSE]
 
   return(list(sumstats = sumstats_processed, LD_mat = LD_mat_processed))

From 308487ecd5c3adeb7af1330d13f9dd30e0cb90af Mon Sep 17 00:00:00 2001
From: Kate Lawrence <klawren@stanford.edu>
Date: Thu, 2 Oct 2025 11:59:56 -0700
Subject: [PATCH 03/15] add missing defualt for pip_cutoff_to_skip_ld

---
 R/colocboost_pipeline.R | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/R/colocboost_pipeline.R b/R/colocboost_pipeline.R
index 2b22d712..766eef6e 100644
--- a/R/colocboost_pipeline.R
+++ b/R/colocboost_pipeline.R
@@ -687,7 +687,11 @@ qc_regional_data <- function(region_data,
         n <- sumstat$n
         var_y <- sumstat$var_y
         conditions_sumstat <- names(sumstats)[ii]
-        pip_cutoff_to_skip_ld <- pip_cutoff_to_skip_sumstat[conditions_sumstat] %>% as.numeric()
+        pip_cutoff_to_skip_ld <- tryCatch(
+          as.numeric(pip_cutoff_to_skip_sumstat[conditions_sumstat]),
+          error = function(e) 0
+        )
+        if (is.na(pip_cutoff_to_skip_ld)) pip_cutoff_to_skip_ld <- 0
 
         # Preprocess the input data
         preprocess_results <- rss_basic_qc(sumstat$sumstats, LD_data, remove_indels = remove_indels)

From 77630df03d4b5416dfbda75431d3aabcc61a764f Mon Sep 17 00:00:00 2001
From: Kate Lawrence <klawren@stanford.edu>
Date: Thu, 2 Oct 2025 12:36:42 -0700
Subject: [PATCH 04/15] chr anming for LD columns

---
 R/LD.R | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/R/LD.R b/R/LD.R
index 426790ff..9aeaea2b 100644
--- a/R/LD.R
+++ b/R/LD.R
@@ -355,6 +355,15 @@ load_LD_matrix <- function(LD_meta_file_path, region, extract_coordinates = NULL
     LD_matrices = extracted_LD_matrices_list,
     variants = extracted_LD_variants_list
   )
+  # Normalize LD variant IDs at load time: add 'chr' prefix if missing
+  if (!is.null(colnames(combined_LD_matrix))) {
+    cn <- as.character(colnames(combined_LD_matrix))
+    colnames(combined_LD_matrix) <- ifelse(startsWith(cn, "chr"), cn, paste0("chr", cn))
+  }
+  if (!is.null(rownames(combined_LD_matrix))) {
+    rn <- as.character(rownames(combined_LD_matrix))
+    rownames(combined_LD_matrix) <- ifelse(startsWith(rn, "chr"), rn, paste0("chr", rn))
+  }
   combined_LD_variants <- rownames(combined_LD_matrix)
 
   # Now create block_metadata with all the information we've accumulated

From 50e8dcb8d32e95b716d83644e7a220b742a47a65 Mon Sep 17 00:00:00 2001
From: Kate Lawrence <klawren@stanford.edu>
Date: Thu, 2 Oct 2025 12:42:23 -0700
Subject: [PATCH 05/15] chr naming for LD columns

---
 R/LD.R | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/R/LD.R b/R/LD.R
index 9aeaea2b..df0379a2 100644
--- a/R/LD.R
+++ b/R/LD.R
@@ -384,6 +384,8 @@ load_LD_matrix <- function(LD_meta_file_path, region, extract_coordinates = NULL
   ref_panel <- do.call(rbind, lapply(strsplit(rownames(combined_LD_matrix), ":"), function(x) {
     data.frame(chrom = x[1], pos = as.integer(x[2]), A2 = x[3], A1 = x[4])
   }))
+  # Normalize ref_panel chrom to drop 'chr' prefix for merging with sumstats (which often use numeric chrom)
+  ref_panel$chrom <- sub("^chr", "", as.character(ref_panel$chrom))
 
   merged_variant_list <- do.call(rbind, extracted_LD_variants_list)
   ref_panel$variant_id <- rownames(combined_LD_matrix)

From 84e2e517ddfeb9c214c132efd4c7880cb146b21a Mon Sep 17 00:00:00 2001
From: Kate Lawrence <klawren@stanford.edu>
Date: Thu, 2 Oct 2025 12:47:42 -0700
Subject: [PATCH 06/15] chr and build naming in LD variant ids

---
 R/sumstats_qc.R | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/R/sumstats_qc.R b/R/sumstats_qc.R
index d0756bbb..3923e168 100644
--- a/R/sumstats_qc.R
+++ b/R/sumstats_qc.R
@@ -63,8 +63,9 @@ rss_basic_qc <- function(sumstats, LD_data, skip_region = NULL, remove_indels =
   present <- sumstats_processed$variant_id %in% ld_ids
   if (sum(present) == 0) {
     strip_build <- function(x) sub("(:|_)b[0-9]+$", "", x)
-    ld_core <- strip_build(ld_ids)
-    ss_core <- strip_build(sumstats_processed$variant_id)
+    drop_chr <- function(x) sub("^chr", "", x)
+    ld_core <- drop_chr(strip_build(ld_ids))
+    ss_core <- drop_chr(strip_build(sumstats_processed$variant_id))
     map_idx <- match(ss_core, ld_core)
     remap <- !is.na(map_idx)
     if (sum(remap) > 0) {

From 089383f227b3848cd2343acc74b1bc4fd11d544b Mon Sep 17 00:00:00 2001
From: Kate Lawrence <klawren@stanford.edu>
Date: Thu, 2 Oct 2025 13:09:08 -0700
Subject: [PATCH 07/15] harmonize LD block variants, handel nas in printing

---
 R/LD.R    | 12 ++++++++++--
 R/raiss.R |  8 ++++----
 2 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/R/LD.R b/R/LD.R
index df0379a2..fb9d00b9 100644
--- a/R/LD.R
+++ b/R/LD.R
@@ -368,12 +368,20 @@ load_LD_matrix <- function(LD_meta_file_path, region, extract_coordinates = NULL
 
   # Now create block_metadata with all the information we've accumulated
   block_variants <- lapply(extracted_LD_variants_list, function(v) v$variants)
+  # Normalize block_variants to match combined_LD_variants format (add 'chr' if needed)
+  block_variants_normalized <- lapply(block_variants, function(v) {
+    if (!any(startsWith(v, "chr"))) {
+      paste0("chr", v)
+    } else {
+      v
+    }
+  })
   block_metadata <- data.frame(
     block_id = seq_along(LD_file_paths),
     chrom = block_chroms,
     size = sapply(block_variants, length),
-    start_idx = sapply(block_variants, function(v) min(match(v, combined_LD_variants))),
-    end_idx = sapply(block_variants, function(v) max(match(v, combined_LD_variants))),
+    start_idx = sapply(block_variants_normalized, function(v) min(match(v, combined_LD_variants))),
+    end_idx = sapply(block_variants_normalized, function(v) max(match(v, combined_LD_variants))),
     stringsAsFactors = FALSE
   )
 
diff --git a/R/raiss.R b/R/raiss.R
index 278292fa..5b27e572 100644
--- a/R/raiss.R
+++ b/R/raiss.R
@@ -330,10 +330,10 @@ filter_raiss_output <- function(zscores, R2_threshold = 0.6, minimum_ld = 5, ver
 
   # Count statistics before filtering
   NSNPs_bf_filt <- nrow(zscores)
-  NSNPs_initial <- sum(zscores$raiss_R2 == 2.0)
-  NSNPs_imputed <- sum(zscores$raiss_R2 != 2.0)
-  NSNPs_ld_filt <- sum(zscores$raiss_ld_score < minimum_ld)
-  NSNPs_R2_filt <- sum(zscores$raiss_R2 < R2_threshold)
+  NSNPs_initial <- sum(zscores$raiss_R2 == 2.0, na.rm = TRUE)
+  NSNPs_imputed <- sum(zscores$raiss_R2 != 2.0, na.rm = TRUE)
+  NSNPs_ld_filt <- sum(zscores$raiss_ld_score < minimum_ld, na.rm = TRUE)
+  NSNPs_R2_filt <- sum(zscores$raiss_R2 < R2_threshold, na.rm = TRUE)
 
   # Apply filters
   zscores_nofilter <- zscores

From 1b90a950b10be66350c6248ba8553978bfc7fbc6 Mon Sep 17 00:00:00 2001
From: Kate Lawrence <klawren@stanford.edu>
Date: Thu, 2 Oct 2025 13:42:00 -0700
Subject: [PATCH 08/15] harmonize all to have chr prefix

---
 R/colocboost_pipeline.R | 25 ++++++++++++++++++-------
 1 file changed, 18 insertions(+), 7 deletions(-)

diff --git a/R/colocboost_pipeline.R b/R/colocboost_pipeline.R
index 766eef6e..70df18ad 100644
--- a/R/colocboost_pipeline.R
+++ b/R/colocboost_pipeline.R
@@ -436,13 +436,24 @@ colocboost_analysis_pipeline <- function(region_data,
   if (!is.null(sumstat_data$sumstats)) {
     sumstats <- lapply(sumstat_data$sumstats, function(ss) {
       z <- ss$sumstats$z
-      variant <- paste0("chr", ss$sumstats$variant_id)
+      # Prefix 'chr' only if missing to avoid 'chrchr'
+      vi <- ss$sumstats$variant_id
+      has_chr <- startsWith(as.character(vi), "chr")
+      variant <- ifelse(has_chr, as.character(vi), paste0("chr", as.character(vi)))
       n <- ss$n
       data.frame("z" = z, "n" = n, "variant" = variant)
     })
     names(sumstats) <- names(sumstat_data$sumstats)
     LD_mat <- lapply(sumstat_data$LD_mat, function(ld) {
-      colnames(ld) <- rownames(ld) <- paste0("chr", colnames(ld))
+      # Ensure LD dimnames have 'chr' only once
+      cn <- colnames(ld)
+      rn <- rownames(ld)
+      if (!is.null(cn)) {
+        colnames(ld) <- ifelse(startsWith(as.character(cn), "chr"), as.character(cn), paste0("chr", as.character(cn)))
+      }
+      if (!is.null(rn)) {
+        rownames(ld) <- ifelse(startsWith(as.character(rn), "chr"), as.character(rn), paste0("chr", as.character(rn)))
+      }
       return(ld)
     })
     LD_match <- sumstat_data$LD_match
@@ -687,11 +698,11 @@ qc_regional_data <- function(region_data,
         n <- sumstat$n
         var_y <- sumstat$var_y
         conditions_sumstat <- names(sumstats)[ii]
-        pip_cutoff_to_skip_ld <- tryCatch(
-          as.numeric(pip_cutoff_to_skip_sumstat[conditions_sumstat]),
-          error = function(e) 0
-        )
-        if (is.na(pip_cutoff_to_skip_ld)) pip_cutoff_to_skip_ld <- 0
+        pip_cutoff_to_skip_ld <- if (conditions_sumstat %in% names(pip_cutoff_to_skip_sumstat)) {
+          as.numeric(pip_cutoff_to_skip_sumstat[conditions_sumstat])
+        } else {
+          0
+        }
 
         # Preprocess the input data
         preprocess_results <- rss_basic_qc(sumstat$sumstats, LD_data, remove_indels = remove_indels)

From 51ba8e8b86712cbf6322b3864172de0dbc38afcc Mon Sep 17 00:00:00 2001
From: Kate Lawrence <klawren@stanford.edu>
Date: Thu, 2 Oct 2025 14:02:34 -0700
Subject: [PATCH 09/15] remove nas from sumstats

---
 R/colocboost_pipeline.R | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/R/colocboost_pipeline.R b/R/colocboost_pipeline.R
index 70df18ad..eb296f2c 100644
--- a/R/colocboost_pipeline.R
+++ b/R/colocboost_pipeline.R
@@ -441,7 +441,15 @@ colocboost_analysis_pipeline <- function(region_data,
       has_chr <- startsWith(as.character(vi), "chr")
       variant <- ifelse(has_chr, as.character(vi), paste0("chr", as.character(vi)))
       n <- ss$n
-      data.frame("z" = z, "n" = n, "variant" = variant)
+      
+      # Filter out NA values from z-scores and corresponding variants
+      na_mask <- !is.na(z)
+      if (sum(na_mask) == 0) {
+        message("Warning: All z-scores are NA for this summary statistic dataset")
+        return(data.frame("z" = numeric(0), "n" = numeric(0), "variant" = character(0)))
+      }
+      
+      data.frame("z" = z[na_mask], "n" = n, "variant" = variant[na_mask])
     })
     names(sumstats) <- names(sumstat_data$sumstats)
     LD_mat <- lapply(sumstat_data$LD_mat, function(ld) {

From a2aa4e248edfe094e03c838435d63eb122806305 Mon Sep 17 00:00:00 2001
From: Kate Lawrence <klawren@stanford.edu>
Date: Fri, 10 Oct 2025 20:01:15 -0700
Subject: [PATCH 10/15] skip gwas in focal mode if colocboost errors

---
 R/colocboost_pipeline.R | 86 +++++++++++++++++++++++++++++++++--------
 1 file changed, 69 insertions(+), 17 deletions(-)

diff --git a/R/colocboost_pipeline.R b/R/colocboost_pipeline.R
index eb296f2c..d11feedd 100644
--- a/R/colocboost_pipeline.R
+++ b/R/colocboost_pipeline.R
@@ -470,6 +470,35 @@ colocboost_analysis_pipeline <- function(region_data,
     sumstats <- LD_mat <- dict_sumstatLD <- NULL
   }
 
+  # Helper: validate a single GWAS study data.frame and its LD matrix
+  is_valid_sumstat_entry <- function(df, ld_matrix) {
+    if (is.null(df) || nrow(df) == 0) return(FALSE)
+    if (!all(c("z", "n", "variant") %in% colnames(df))) return(FALSE)
+    n_vals <- suppressWarnings(as.numeric(unique(df$n)))
+    if (length(n_vals) == 0 || is.na(n_vals[1]) || !is.finite(n_vals[1]) || n_vals[1] < 3) return(FALSE)
+    if (all(is.na(df$z))) return(FALSE)
+    if (is.null(ld_matrix)) return(FALSE)
+    if (!is.matrix(ld_matrix)) return(FALSE)
+    if (any(!is.finite(ld_matrix))) return(FALSE)
+    # Ensure at least 1 overlapping variant with LD matrix dimnames
+    ld_vars <- intersect(rownames(ld_matrix), colnames(ld_matrix))
+    if (length(ld_vars) == 0) return(FALSE)
+    if (length(intersect(as.character(df$variant), as.character(ld_vars))) == 0) return(FALSE)
+    TRUE
+  }
+
+  # Helper: filter joint structures (sumstats list and dict_sumstatLD) to only valid entries
+  filter_valid_sumstats <- function(sumstats, LD_mat, dict_sumstatLD) {
+    if (is.null(sumstats) || is.null(dict_sumstatLD) || is.null(LD_mat)) return(list(sumstats = NULL, dict = NULL))
+    keep <- vector("logical", length(sumstats))
+    for (i in seq_along(sumstats)) {
+      ld_idx <- dict_sumstatLD[i, 2]
+      ld_matrix <- if (!is.na(ld_idx)) LD_mat[[ld_idx]] else NULL
+      keep[i] <- is_valid_sumstat_entry(sumstats[[i]], ld_matrix)
+    }
+    if (!any(keep)) return(list(sumstats = NULL, dict = NULL))
+    list(sumstats = sumstats[keep], dict = matrix(dict_sumstatLD[keep, , drop = FALSE], ncol = 2))
+  }
 
   ####### ========= streamline three types of analyses ======== ########
   if (is.null(X) & is.null(sumstats)) {
@@ -500,16 +529,22 @@ colocboost_analysis_pipeline <- function(region_data,
   if (joint_gwas & !is.null(sumstats)) {
     message(paste("====== Performing non-focaled version GWAS-xQTL ColocBoost on", length(Y), "contexts and", length(sumstats), "GWAS. ====="))
     t21 <- Sys.time()
-    traits <- c(names(Y), names(sumstats))
-    res_gwas <- colocboost(
-      X = X, Y = Y, sumstat = sumstats, LD = LD_mat,
-      dict_YX = dict_YX, dict_sumstatLD = dict_sumstatLD,
-      outcome_names = traits, focal_outcome_idx = NULL, 
-      output_level = 2, ...
-    )
-    t22 <- Sys.time()
-    analysis_results$joint_gwas <- res_gwas
-    analysis_results$computing_time$Analysis$joint_gwas <- t22 - t21
+    # Filter invalid GWAS before calling colocboost
+    filtered <- filter_valid_sumstats(sumstats, LD_mat, dict_sumstatLD)
+    if (is.null(filtered$sumstats)) {
+      message("All GWAS studies failed validation; skipping joint GWAS analysis for this region.")
+    } else {
+      traits <- c(names(Y), names(filtered$sumstats))
+      res_gwas <- colocboost(
+        X = X, Y = Y, sumstat = filtered$sumstats, LD = LD_mat,
+        dict_YX = dict_YX, dict_sumstatLD = filtered$dict,
+        outcome_names = traits, focal_outcome_idx = NULL, 
+        output_level = 2, ...
+      )
+      t22 <- Sys.time()
+      analysis_results$joint_gwas <- res_gwas
+      analysis_results$computing_time$Analysis$joint_gwas <- t22 - t21
+    }
   }
   # - run focaled version of ColocBoost for each GWAS
   if (separate_gwas & !is.null(sumstats)) {
@@ -517,15 +552,32 @@ colocboost_analysis_pipeline <- function(region_data,
     res_gwas_separate <- analysis_results$separate_gwas
     for (i_gwas in 1:nrow(dict_sumstatLD)) {
       current_study <- names(sumstats)[i_gwas]
-      message(paste("====== Performing focaled version GWAS-xQTL ColocBoost on", length(Y), "contexts and ", current_study, "GWAS. ====="))
       dict <- dict_sumstatLD[i_gwas, ]
+      # Validate this study before calling colocboost
+      ld_idx <- dict[2]
+      ld_matrix <- if (!is.na(ld_idx)) LD_mat[[ld_idx]] else NULL
+      valid <- is_valid_sumstat_entry(sumstats[[dict[1]]], ld_matrix)
+      if (!isTRUE(valid)) {
+        message(paste("Skipping", current_study, "due to invalid N/z/LD."))
+        next
+      }
+      message(paste("====== Performing focaled version GWAS-xQTL ColocBoost on", length(Y), "contexts and ", current_study, "GWAS. ====="))
       traits <- c(names(Y), current_study)
-      res_gwas_separate[[current_study]] <- colocboost(
-        X = X, Y = Y, sumstat = sumstats[dict[1]],
-        LD = LD_mat[dict[2]], dict_YX = dict_YX,
-        outcome_names = traits, focal_outcome_idx = length(traits), 
-        output_level = 2, ...
-      )
+      
+      # Try to run colocboost with error handling
+      tryCatch({
+        res_gwas_separate[[current_study]] <- colocboost(
+          X = X, Y = Y, sumstat = sumstats[dict[1]],
+          LD = LD_mat[dict[2]], dict_YX = dict_YX,
+          outcome_names = traits, focal_outcome_idx = length(traits), 
+          output_level = 2, ...
+        )
+      }, error = function(e) {
+        message(paste("Error in colocboost analysis for", current_study, ":"))
+        message(paste("Error message:", e$message))
+        message(paste("Continuing with next GWAS..."))
+        res_gwas_separate[[current_study]] <<- NULL
+      })
     }
     t32 <- Sys.time()
     analysis_results$separate_gwas <- res_gwas_separate

From e194bad0fcfc010267add1e572f68579905596c6 Mon Sep 17 00:00:00 2001
From: kal26 <kal26@users.noreply.github.com>
Date: Thu, 30 Oct 2025 17:34:57 +0000
Subject: [PATCH 11/15] Update documentation

---
 DESCRIPTION                | 2 +-
 NAMESPACE                  | 2 ++
 man/align_variant_names.Rd | 9 ++++++++-
 3 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 3ece6a87..87e3c40b 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -62,4 +62,4 @@ LinkingTo:
     RcppGSL
 NeedsCompilation: yes
 VignetteBuilder: knitr
-RoxygenNote: 7.3.2
+RoxygenNote: 7.3.3
diff --git a/NAMESPACE b/NAMESPACE
index ea3fe90e..a8ed34c0 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -141,6 +141,8 @@ importFrom(data.table,setnames)
 importFrom(doFuture,registerDoFuture)
 importFrom(dplyr,"%>%")
 importFrom(dplyr,across)
+importFrom(dplyr,all_of)
+importFrom(dplyr,any_of)
 importFrom(dplyr,arrange)
 importFrom(dplyr,as_tibble)
 importFrom(dplyr,bind_rows)
diff --git a/man/align_variant_names.Rd b/man/align_variant_names.Rd
index bd112604..d89b7ca5 100644
--- a/man/align_variant_names.Rd
+++ b/man/align_variant_names.Rd
@@ -4,12 +4,19 @@
 \alias{align_variant_names}
 \title{Align Variant Names}
 \usage{
-align_variant_names(source, reference, remove_indels = FALSE)
+align_variant_names(
+  source,
+  reference,
+  remove_indels = FALSE,
+  remove_build_suffix = TRUE
+)
 }
 \arguments{
 \item{source}{A character vector of variant names in the format "chr:pos:A2:A1" or "chr:pos_A2_A1".}
 
 \item{reference}{A character vector of variant names in the format "chr:pos:A2:A1" or "chr:pos_A2_A1".}
+
+\item{remove_build_suffix}{Whether to strip trailing genome build suffixes like ":b38" or "_b38" before alignment. Default TRUE.}
 }
 \value{
 A list with two elements:

From 47ae7d07aaa1779287cce5d2388fa82eb89f10ee Mon Sep 17 00:00:00 2001
From: Kate Lawrence <klawren@stanford.edu>
Date: Thu, 30 Oct 2025 10:38:40 -0700
Subject: [PATCH 12/15] add rss_qc as default qc_method

---
 R/colocboost_pipeline.R | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/R/colocboost_pipeline.R b/R/colocboost_pipeline.R
index 2b22d712..d12bd6d9 100644
--- a/R/colocboost_pipeline.R
+++ b/R/colocboost_pipeline.R
@@ -554,6 +554,7 @@ qc_regional_data <- function(region_data,
                              qc_method = c("rss_qc", "dentist", "slalom"),
                              impute = TRUE,
                              impute_opts = list(rcond = 0.01, R2_threshold = 0.6, minimum_ld = 5, lamb = 0.01)) {
+  if (!is.null(qc_method)) qc_method <- match.arg(qc_method)
   #### related internal functions
   # Add context names to colname of Y if missing
   add_context_to_Y <- function(res_Y) {
@@ -675,6 +676,7 @@ qc_regional_data <- function(region_data,
                                          qc_method = c("rss_qc", "dentist", "slalom"),
                                          impute = TRUE,
                                          impute_opts = list(rcond = 0.01, R2_threshold = 0.6, minimum_ld = 5, lamb = 0.01)) {
+    if (!is.null(qc_method)) qc_method <- match.arg(qc_method)
     n_LD <- length(sumstat_data$LD_info)
     final_sumstats <- final_LD <- NULL
     LD_match <- c()

From 4d60845831620da3a0b8d8a43be9f54aa4d9880f Mon Sep 17 00:00:00 2001
From: Kate Lawrence <klawren@stanford.edu>
Date: Thu, 30 Oct 2025 10:42:40 -0700
Subject: [PATCH 13/15] add default pip_cutoff for multiple individual contexts
 or sumstats

---
 R/colocboost_pipeline.R | 34 +++++++++++++++++++++++++++++++---
 1 file changed, 31 insertions(+), 3 deletions(-)

diff --git a/R/colocboost_pipeline.R b/R/colocboost_pipeline.R
index d12bd6d9..5b98aab4 100644
--- a/R/colocboost_pipeline.R
+++ b/R/colocboost_pipeline.R
@@ -615,6 +615,15 @@ qc_regional_data <- function(region_data,
     # - add context to colname of Y
     Y <- add_context_to_Y(Y)
     n_context <- length(X)
+    # Validate pip_cutoff_to_skip_ind length: allow scalar or exact length; otherwise error
+    if (length(pip_cutoff_to_skip_ind) == 1) {
+      pip_cutoff_vec <- rep(pip_cutoff_to_skip_ind, n_context)
+    } else if (length(pip_cutoff_to_skip_ind) == n_context) {
+      pip_cutoff_vec <- pip_cutoff_to_skip_ind
+    } else {
+      stop(sprintf("pip_cutoff_to_skip_ind length (%d) must be 1 or equal to number of contexts (%d)",
+                   length(pip_cutoff_to_skip_ind), n_context))
+    }
     residual_X <- residual_Y <- list()
     keep_contexts <- c()
     for (i_context in 1:n_context) {
@@ -626,8 +635,8 @@ qc_regional_data <- function(region_data,
       # - remove variants with maf < maf_cutoff
       # tmp <- filter_resX_maf(resX, maf, maf_cutoff = maf_cutoff)
       resX <- filter_X(resX, missing_rate_thresh = NULL, maf_thresh = maf_cutoff, maf = maf)
-      # Initial PIP check
-      resY <- filter_resY_pip(resX, resY, pip_cutoff = pip_cutoff_to_skip_ind[i_context], context = context)
+      # Initial PIP check (use recycled vector element)
+      resY <- filter_resY_pip(resX, resY, pip_cutoff = pip_cutoff_vec[i_context], context = context)
       if (!is.null(resY)) {
         residual_X <- c(residual_X, list(resX))
         residual_Y <- c(residual_Y, list(resY))
@@ -689,7 +698,26 @@ qc_regional_data <- function(region_data,
         n <- sumstat$n
         var_y <- sumstat$var_y
         conditions_sumstat <- names(sumstats)[ii]
-        pip_cutoff_to_skip_ld <- pip_cutoff_to_skip_sumstat[conditions_sumstat] %>% as.numeric()
+        # Determine PIP cutoff per sumstat with strict validation:
+        # - scalar applies to all studies
+        # - length equals number of studies -> positional mapping
+        # - named vector that covers ALL study names -> name-based mapping
+        # Otherwise: throw error (do not silently default)
+        if (length(pip_cutoff_to_skip_sumstat) == 1) {
+          pip_cutoff_to_skip_ld <- as.numeric(pip_cutoff_to_skip_sumstat)
+        } else if (length(pip_cutoff_to_skip_sumstat) == length(sumstats) && is.null(names(pip_cutoff_to_skip_sumstat))) {
+          pip_cutoff_to_skip_ld <- as.numeric(pip_cutoff_to_skip_sumstat[ii])
+        } else if (!is.null(names(pip_cutoff_to_skip_sumstat))) {
+          missing_names <- setdiff(names(sumstats), names(pip_cutoff_to_skip_sumstat))
+          if (length(missing_names) > 0) {
+            stop(sprintf("pip_cutoff_to_skip_sumstat must provide values for all studies; missing: %s",
+                         paste(missing_names, collapse = ", ")))
+          }
+          pip_cutoff_to_skip_ld <- as.numeric(pip_cutoff_to_skip_sumstat[conditions_sumstat])
+        } else {
+          stop(sprintf("pip_cutoff_to_skip_sumstat length (%d) must be 1, equal to number of studies (%d), or be a named vector covering all studies",
+                       length(pip_cutoff_to_skip_sumstat), length(sumstats)))
+        }
 
         # Preprocess the input data
         preprocess_results <- rss_basic_qc(sumstat$sumstats, LD_data, remove_indels = remove_indels)

From 705eae49f8b92fdeea9d82299481b48bcaa8b19d Mon Sep 17 00:00:00 2001
From: Kate Lawrence <klawren@stanford.edu>
Date: Thu, 30 Oct 2025 10:58:10 -0700
Subject: [PATCH 14/15] get names from region_name_col even if
 extract_region_name isn't provided. Add fallback region nameing (reigon_1,
 region_2, ..)

---
 R/file_utils.R | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/R/file_utils.R b/R/file_utils.R
index ce3253ff..6e2444a6 100644
--- a/R/file_utils.R
+++ b/R/file_utils.R
@@ -243,7 +243,20 @@ load_phenotype_data <- function(phenotype_path, region, extract_region_name = NU
         stop("region_name_col is out of bounds for the number of columns in tabix_data.")
       }
     } else {
-      return(tabix_data %>% t())
+      # No extract_region_name filter: still transpose and assign names appropriately
+      result <- tabix_data %>% t()
+      if (!is.null(region_name_col) && (region_name_col %% 1 == 0)) {
+        if (region_name_col <= ncol(tabix_data)) {
+          # After transpose, the desired name row is at index region_name_col
+          colnames(result) <- result[region_name_col, ]
+        } else {
+          stop("region_name_col is out of bounds for the number of columns in tabix_data.")
+        }
+      } else if (is.null(colnames(result))) {
+        # Fallback: synthesize names if still missing
+        colnames(result) <- paste0("region_", seq_len(ncol(result)))
+      }
+      return(result)
     }
   }))
 

From 922e236a413374d1771bfc506fa677bd1591017a Mon Sep 17 00:00:00 2001
From: Kate Lawrence <klawren@stanford.edu>
Date: Fri, 5 Dec 2025 08:09:19 -0800
Subject: [PATCH 15/15] Add comprehensive file existence checks with clear
 error messages

- Check for missing phenotype, covariate, genotype, sumstat, column mapping, and LD metadata files
- Error (not warn) when files are missing to catch path issues early
- Fix genotype file check to look for .bed/.bim/.fam or .pgen/.pvar/.psam files instead of base filename
- Improve error messages to show which files are missing with condition names
- Filter phenotypes without data for specific region before loading
---
 R/colocboost_pipeline.R | 188 +++++++++++++++++++++++++++++++++++++++-
 R/file_utils.R          |  30 +++++--
 2 files changed, 209 insertions(+), 9 deletions(-)

diff --git a/R/colocboost_pipeline.R b/R/colocboost_pipeline.R
index 5762707a..89997940 100644
--- a/R/colocboost_pipeline.R
+++ b/R/colocboost_pipeline.R
@@ -111,16 +111,143 @@ load_multitask_regional_data <- function(region, # a string of chr:start-end for
     for (i_data in 1:n_dataset) {
       # extract genotype file name
       genotype <- genotype_list[i_data]
+      
+      # Check if PLINK files exist (.bed/.bim/.fam for PLINK 1 or .pgen/.pvar/.psam for PLINK 2)
+      bed_file <- paste0(genotype, ".bed")
+      bim_file <- paste0(genotype, ".bim")
+      fam_file <- paste0(genotype, ".fam")
+      pgen_file <- paste0(genotype, ".pgen")
+      pvar_file <- paste0(genotype, ".pvar")
+      psam_file <- paste0(genotype, ".psam")
+      
+      plink1_exists <- file.exists(bed_file) && file.exists(bim_file) && file.exists(fam_file)
+      plink2_exists <- file.exists(pgen_file) && file.exists(pvar_file) && file.exists(psam_file)
+      
+      if (!plink1_exists && !plink2_exists) {
+        missing_files <- c()
+        if (!file.exists(bed_file)) missing_files <- c(missing_files, bed_file)
+        if (!file.exists(bim_file)) missing_files <- c(missing_files, bim_file)
+        if (!file.exists(fam_file)) missing_files <- c(missing_files, fam_file)
+        if (!file.exists(pgen_file)) missing_files <- c(missing_files, pgen_file)
+        if (!file.exists(pvar_file)) missing_files <- c(missing_files, pvar_file)
+        if (!file.exists(psam_file)) missing_files <- c(missing_files, psam_file)
+        
+        stop("Genotype files do not exist for dataset ", i_data, ".\n",
+             "  Base path: ", genotype, "\n",
+             "  Missing files: ", paste(missing_files, collapse = ", "),
+             call. = FALSE)
+      }
+      
       # extract phenotype and covariate file names
       pos <- which(match_geno_pheno == i_data)
       phenotype <- phenotype_list[pos]
       covariate <- covariate_list[pos]
       conditions <- conditions_list_individual[pos]
+      
+      # Check for missing files first and provide clear errors
+      missing_pheno <- !vapply(phenotype, file.exists, logical(1))
+      missing_covar <- !vapply(covariate, file.exists, logical(1))
+      
+      if (any(missing_pheno)) {
+        missing_pheno_files <- phenotype[missing_pheno]
+        missing_pheno_conditions <- conditions[missing_pheno]
+        stop("Phenotype file(s) do not exist:\n",
+             paste(paste0("  - ", missing_pheno_conditions, ": ", missing_pheno_files), collapse = "\n"),
+             call. = FALSE)
+      }
+      
+      if (any(missing_covar)) {
+        missing_covar_files <- covariate[missing_covar]
+        missing_covar_conditions <- conditions[missing_covar]
+        stop("Covariate file(s) do not exist:\n",
+             paste(paste0("  - ", missing_covar_conditions, ": ", missing_covar_files), collapse = "\n"),
+             call. = FALSE)
+      }
+      
+      # All files should exist at this point (errors would have been thrown)
+      
+      # Helper function to check if phenotype file has data for the specific region
+      # Note: The file may have data elsewhere, but we only care about this region
+      # This is a pre-check; the actual loader will do the definitive check
+      check_phenotype_has_data <- function(pheno_file, region_str) {
+        # File existence already checked above, so we can assume it exists here
+        # Parse region to get chromosome and coordinates
+        region_parts <- strsplit(region_str, ":", fixed = TRUE)[[1]]
+        if (length(region_parts) != 2) {
+          return(TRUE)  # If region format is wrong, let the actual load function handle it
+        }
+        chr <- region_parts[1]
+        coord_parts <- strsplit(region_parts[2], "-", fixed = TRUE)[[1]]
+        if (length(coord_parts) != 2) {
+          return(TRUE)  # If region format is wrong, let the actual load function handle it
+        }
+        start <- coord_parts[1]
+        end <- coord_parts[2]
+        
+        # Try both formats: with and without 'chr' prefix
+        # Different files may be indexed with different formats
+        region_formats <- c(
+          region_str,  # Try original format first (e.g., "chr1:0-1000000")
+          paste0(sub("^chr", "", chr), ":", start, "-", end)  # Try without chr (e.g., "1:0-1000000")
+        )
+        
+        # Use tabix to check if file has data for this specific region
+        # Use the same approach as tabix_region: try fread with tabix command
+        for (region_tabix in region_formats) {
+          # Try using fread with tabix command (same as tabix_region does)
+          cmd_output <- tryCatch(
+            {
+              data.table::fread(cmd = paste0("tabix -h ", pheno_file, " ", region_tabix), 
+                               sep = "auto", header = "auto")
+            },
+            error = function(e) NULL
+          )
+          
+          # If we got output with data, return TRUE
+          if (!is.null(cmd_output) && nrow(cmd_output) > 0) {
+            return(TRUE)
+          }
+          
+          # If fread failed, it might be because file isn't indexed or has other issues
+          # In that case, be lenient and let the actual loader handle it
+          # (The loader will give a proper error message if needed)
+        }
+        
+        # If we tried all formats and got no data, return FALSE
+        # But only if we successfully queried (didn't get errors)
+        # For now, if we reach here after trying both formats, assume no data
+        return(FALSE)
+      }
+      
+      # Filter to only phenotypes with data for this specific region
+      has_data <- vapply(phenotype, function(p) check_phenotype_has_data(p, region), logical(1))
+      valid_indices <- which(has_data)
+      
+      if (length(valid_indices) == 0) {
+        warning("No phenotype files have data for region ", region, 
+               " for genotype dataset ", i_data, ". Skipping this dataset.", call. = FALSE)
+        next
+      }
+      
+      if (length(valid_indices) < length(phenotype)) {
+        skipped_conditions <- conditions[!has_data]
+        warning("Filtered from ", length(phenotype), " to ", length(valid_indices),
+               " tissues with data in region ", region, 
+               ". Skipped tissues (no data in this region): ", paste(skipped_conditions, collapse = ", "),
+               call. = FALSE)
+      }
+      
+      # Filter lists to only valid phenotypes
+      phenotype_filtered <- phenotype[valid_indices]
+      covariate_filtered <- covariate[valid_indices]
+      conditions_filtered <- conditions[valid_indices]
+      
+      # Load all valid phenotypes together
       dat <- load_regional_univariate_data(
-        genotype = genotype, phenotype = phenotype,
-        covariate = covariate, region = region,
+        genotype = genotype, phenotype = phenotype_filtered,
+        covariate = covariate_filtered, region = region,
         association_window = association_window,
-        conditions = conditions, xvar_cutoff = xvar_cutoff,
+        conditions = conditions_filtered, xvar_cutoff = xvar_cutoff,
         maf_cutoff = maf_cutoff, mac_cutoff = mac_cutoff,
         imiss_cutoff = imiss_cutoff, keep_indel = keep_indel,
         keep_samples = keep_samples, keep_variants = keep_variants,
@@ -129,6 +256,7 @@ load_multitask_regional_data <- function(region, # a string of chr:start-end for
         region_name_col = region_name_col,
         scale_residuals = scale_residuals
       )
+      
       if (is.null(individual_data)) {
         individual_data <- dat
       } else {
@@ -150,11 +278,55 @@ load_multitask_regional_data <- function(region, # a string of chr:start-end for
     if (length(match_LD_sumstat) != length(LD_meta_file_path_list)) {
       stop("Please make sure 'match_LD_sumstat' matched 'LD_meta_file_path_list' if you load data from multiple sumstats.")
     }
+    
+    # Check for missing sumstat files
+    missing_sumstat <- !vapply(sumstat_path_list, file.exists, logical(1))
+    if (any(missing_sumstat)) {
+      missing_sumstat_files <- sumstat_path_list[missing_sumstat]
+      missing_sumstat_conditions <- if (!is.null(conditions_list_sumstat)) {
+        conditions_list_sumstat[missing_sumstat]
+      } else {
+        paste0("sumstat_", which(missing_sumstat))
+      }
+      stop("Summary statistics file(s) do not exist:\n",
+           paste(paste0("  - ", missing_sumstat_conditions, ": ", missing_sumstat_files), collapse = "\n"),
+           call. = FALSE)
+    }
+    
+    # Check for missing column mapping files
+    missing_column <- !vapply(column_file_path_list, file.exists, logical(1))
+    if (any(missing_column)) {
+      missing_column_files <- column_file_path_list[missing_column]
+      missing_column_conditions <- if (!is.null(conditions_list_sumstat)) {
+        conditions_list_sumstat[missing_column]
+      } else {
+        paste0("sumstat_", which(missing_column))
+      }
+      stop("Column mapping file(s) do not exist:\n",
+           paste(paste0("  - ", missing_column_conditions, ": ", missing_column_files), collapse = "\n"),
+           call. = FALSE)
+    }
+    
+    # Check for missing LD metadata files
+    missing_ld_meta <- !vapply(LD_meta_file_path_list, file.exists, logical(1))
+    if (any(missing_ld_meta)) {
+      missing_ld_meta_files <- LD_meta_file_path_list[missing_ld_meta]
+      stop("LD metadata file(s) do not exist:\n",
+           paste(paste0("  - LD_meta_", which(missing_ld_meta), ": ", missing_ld_meta_files), collapse = "\n"),
+           call. = FALSE)
+    }
+    
     # - load sumstat data from multiple datasets
     n_LD <- length(match_LD_sumstat)
     for (i_ld in 1:n_LD) {
       # extract LD meta file path name
       LD_meta_file_path <- LD_meta_file_path_list[i_ld]
+      
+      # Error if LD metadata file doesn't exist (already checked above, but double-check)
+      if (!file.exists(LD_meta_file_path)) {
+        stop("LD metadata file does not exist for dataset ", i_ld, ": ", LD_meta_file_path, call. = FALSE)
+      }
+      
       LD_info <- load_LD_matrix(LD_meta_file_path,
         region = association_window,
         extract_coordinates = extract_coordinates
@@ -165,6 +337,14 @@ load_multitask_regional_data <- function(region, # a string of chr:start-end for
       sumstats <- lapply(pos, function(ii) {
         sumstat_path <- sumstat_path_list[ii]
         column_file_path <- column_file_path_list[ii]
+        
+        # Error if files don't exist (already checked above, but double-check)
+        if (!file.exists(sumstat_path)) {
+          stop("Summary statistics file does not exist for ", conditions_list_sumstat[ii], ": ", sumstat_path, call. = FALSE)
+        }
+        if (!file.exists(column_file_path)) {
+          stop("Column mapping file does not exist for ", conditions_list_sumstat[ii], ": ", column_file_path, call. = FALSE)
+        }
         # FIXME later: when consider multiple LD reference
         tmp <- load_rss_data(
           sumstat_path = sumstat_path, column_file_path = column_file_path,
@@ -179,6 +359,8 @@ load_multitask_regional_data <- function(region, # a string of chr:start-end for
         }
         return(tmp)
       })
+      
+      # All files should exist at this point (errors would have been thrown)
       names(sumstats) <- conditions
       sumstat_data$sumstats <- c(sumstat_data$sumstats, list(sumstats))
       sumstat_data$LD_info <- c(sumstat_data$LD_info, list(LD_info))
diff --git a/R/file_utils.R b/R/file_utils.R
index 6e2444a6..6e022df5 100644
--- a/R/file_utils.R
+++ b/R/file_utils.R
@@ -209,7 +209,7 @@ NoPhenotypeError <- function(message) {
   structure(list(message = message), class = c("NoPhenotypeError", "error", "condition"))
 }
 
-#' @importFrom purrr map2 compact
+#' @importFrom purrr map2
 #' @importFrom readr read_delim cols
 #' @importFrom dplyr filter select mutate across everything
 #' @importFrom magrittr %>%
@@ -224,8 +224,7 @@ load_phenotype_data <- function(phenotype_path, region, extract_region_name = NU
   }
 
   # Use `map2` to iterate over `phenotype_path` and `extract_region_name` simultaneously
-  # `compact` should remove all NULL element
-  phenotype_data <- compact(map2(phenotype_path, extract_region_name, ~ {
+  phenotype_data <- map2(phenotype_path, extract_region_name, ~ {
     tabix_data <- if (!is.null(region)) tabix_region(.x, region, tabix_header = tabix_header) else read_delim(.x, "\t", col_types = cols())
     if (nrow(tabix_data) == 0) {
       message(paste("Phenotype file ", .x, " is empty for the specified region", if (!is.null(region)) "" else region))
@@ -258,10 +257,10 @@ load_phenotype_data <- function(phenotype_path, region, extract_region_name = NU
       }
       return(result)
     }
-  }))
+  })
 
-  # Check if all phenotype files are empty
-  if (length(phenotype_data) == 0) {
+  # Check if all phenotype files are empty (all elements NULL)
+  if (all(vapply(phenotype_data, is.null, logical(1)))) {
     stop(NoPhenotypeError(paste("All phenotype files are empty for the specified region", if (!is.null(region)) "" else region)))
   }
   return(phenotype_data)
@@ -469,6 +468,25 @@ load_regional_association_data <- function(genotype, # PLINK file
   ## Load phenotype and covariates and perform some pre-processing
   covar <- load_covariate_data(covariate)
   pheno <- load_phenotype_data(phenotype, region, extract_region_name = extract_region_name, region_name_col = region_name_col, tabix_header = tabix_header)
+
+  # Keep covariates / conditions aligned with successfully loaded phenotypes.
+  # load_phenotype_data() returns NULL for phenotype files that are empty in
+  # the requested region; we must drop the corresponding covariate / condition
+  # entries before constructing data_list.
+  keep_idx <- !vapply(pheno, is.null, logical(1))
+  if (!any(keep_idx)) {
+    stop(NoPhenotypeError(paste("All phenotype files are empty for the specified region", if (!is.null(region)) "" else region)))
+  }
+  if (any(!keep_idx)) {
+    dropped_conditions <- conditions[!keep_idx]
+    message(paste(
+      "Dropping phenotypes with no data in region", region, ":",
+      paste(dropped_conditions, collapse = ", ")
+    ))
+  }
+  pheno <- pheno[keep_idx]
+  covar <- covar[keep_idx]
+  conditions <- conditions[keep_idx]
   ### including Y ( cov ) and specific X and covar match, filter X variants based on the overlapped samples.
   data_list <- prepare_data_list(geno, pheno, covar, imiss_cutoff,
     maf_cutoff, mac_cutoff, xvar_cutoff,