From 00f205810c725ab6c964ff3cf4cf5a4c36d2016e Mon Sep 17 00:00:00 2001 From: Kate Lawrence Date: Thu, 2 Oct 2025 11:24:16 -0700 Subject: [PATCH 01/15] clean NAs before mutated in alelle_qc --- R/allele_qc.R | 227 +++++++++++++++++++++++++++----------------------- 1 file changed, 124 insertions(+), 103 deletions(-) diff --git a/R/allele_qc.R b/R/allele_qc.R index 7b32d0a8..69fb1471 100644 --- a/R/allele_qc.R +++ b/R/allele_qc.R @@ -18,184 +18,205 @@ #' @param remove_unmatched Whether to remove unmatched variants. Default is `TRUE`. #' @return A single data frame with matched variants. #' @importFrom magrittr %>% -#' @importFrom dplyr mutate inner_join filter pull select everything row_number if_else +#' @importFrom dplyr mutate inner_join filter pull select everything row_number if_else any_of all_of rename #' @importFrom vctrs vec_duplicate_detect -#' @importFrom dplyr if_else #' @importFrom tidyr separate #' @export allele_qc <- function(target_data, ref_variants, col_to_flip = NULL, - match_min_prop = 0.2, remove_dups = TRUE, - remove_indels = FALSE, remove_strand_ambiguous = TRUE, - flip_strand = FALSE, remove_unmatched = TRUE, remove_same_vars = FALSE) { - strand_flip <- function(ref) { - # Define a mapping for complementary bases - base_mapping <- c("A" = "T", "T" = "A", "G" = "C", "C" = "G") - - # Function to complement a single base - complement_base <- function(base) { - complement <- base_mapping[base] - return(complement) - } - - # Function to reverse and complement a DNA sequence - reverse_complement <- function(sequence) { - reversed <- rev(strsplit(sequence, NULL)[[1]]) - complemented <- sapply(reversed, complement_base, USE.NAMES = FALSE) - return(paste(complemented, collapse = "")) - } - - complemented_sequence <- sapply(ref, reverse_complement, USE.NAMES = FALSE) - - return(complemented_sequence) + match_min_prop = 0.2, remove_dups = TRUE, + remove_indels = FALSE, remove_strand_ambiguous = TRUE, + flip_strand = FALSE, remove_unmatched = TRUE, remove_same_vars = FALSE) { + strand_flip <- function(ref) { + # Define a mapping for complementary bases + base_mapping <- c("A" = "T", "T" = "A", "G" = "C", "C" = "G") + + # Function to complement a single base + complement_base <- function(base) { + complement <- base_mapping[base] + return(complement) + } + + # Function to reverse and complement a DNA sequence + reverse_complement <- function(sequence) { + reversed <- rev(strsplit(sequence, NULL)[[1]]) + complemented <- sapply(reversed, complement_base, USE.NAMES = FALSE) + return(paste(complemented, collapse = "")) + } + + complemented_sequence <- sapply(ref, reverse_complement, USE.NAMES = FALSE) + + return(complemented_sequence) + } + + # helper to sanitize column names to avoid NA/empty names that break dplyr verbs + sanitize_names <- function(df) { + nm <- colnames(df) + if (is.null(nm)) { + nm <- rep("unnamed", ncol(df)) + } + empty_idx <- is.na(nm) | nm == "" + if (any(empty_idx)) { + # assign stable placeholder names for empties + nm[empty_idx] <- paste0("unnamed_", seq_len(sum(empty_idx))) + } + # ensure names are unique and syntactic + nm <- base::make.unique(nm, sep = "_") + colnames(df) <- nm + df } # check if the pattern is ATCG/DI check_ATCG <- function(vec) { - pattern <- "^[ATCGDI]+$" + pattern <- "^[ATCGDI]+$" - # Function to check if a single element matches the pattern - check_element <- function(element) { - grepl(pattern, element) - } + # Function to check if a single element matches the pattern + check_element <- function(element) { + grepl(pattern, element) + } - result <- sapply(vec, check_element, USE.NAMES = FALSE) - return(result) + result <- sapply(vec, check_element, USE.NAMES = FALSE) + return(result) } # transform all inputs to dataframe if (is.data.frame(target_data)) { - if (ncol(target_data) > 4 && all(c("chrom", "pos", "A2", "A1") %in% names(target_data))) { - # Extract variant columns and standardize - variant_cols <- c("chrom", "pos", "A2", "A1") - variant_df <- target_data %>% select(all_of(variant_cols)) - other_cols <- target_data %>% select(-all_of(variant_cols)) - target_data <- cbind(variant_id_to_df(variant_df), other_cols) - } else { - target_data <- variant_id_to_df(target_data) - } + if (ncol(target_data) > 4 && all(c("chrom", "pos", "A2", "A1") %in% names(target_data))) { + # Extract variant columns and standardize + variant_cols <- c("chrom", "pos", "A2", "A1") + variant_df <- target_data %>% select(all_of(variant_cols)) + other_cols <- target_data %>% select(-all_of(variant_cols)) + target_data <- cbind(variant_id_to_df(variant_df), other_cols) + } else { + target_data <- variant_id_to_df(target_data) + } } else { - target_data <- variant_id_to_df(target_data) + target_data <- variant_id_to_df(target_data) } ref_variants <- variant_id_to_df(ref_variants) + columns_to_remove <- c("chromosome", "position", "ref", "alt", "variant_id") # Check if any of the specified columns are present if (any(columns_to_remove %in% colnames(target_data))) { - target_data <- select(target_data, -any_of(columns_to_remove)) + target_data <- select(target_data, -any_of(columns_to_remove)) } match_result <- merge(target_data, ref_variants, by = c("chrom", "pos"), all = FALSE, suffixes = c(".target", ".ref")) %>% as.data.frame() + + # sanitize names after merge as well (merge can introduce empty names in edge cases) + match_result <- sanitize_names(match_result) + if (nrow(match_result) == 0) { - warning("No matching variants found between target data and reference variants.") - return(list(target_data_qced = match_result, qc_summary = match_result)) + warning("No matching variants found between target data and reference variants.") + return(list(target_data_qced = match_result, qc_summary = match_result)) } # match target & ref by chrom and position match_result = match_result %>% - mutate(variants_id_original = paste(chrom, pos, A2.target, A1.target, sep = ":")) %>% - mutate(variants_id_qced = paste(chrom, pos, A2.ref, A1.ref, sep = ":")) %>% - # filter out totally same rows. - filter(duplicated(.) | !duplicated(.)) %>% - # upper case target/reference A1 A2 - mutate(across(c(A1.target, A2.target, A1.ref, A2.ref), toupper)) %>% - mutate(flip1.ref = strand_flip(A1.ref), flip2.ref = strand_flip(A2.ref)) %>% - # these pairings are ambiguous: because we cannot tell it's an sign flip / strand flip - mutate(strand_unambiguous = if_else((A1.target == "A" & A2.target == "T") | (A1.target == "T" & A2.target == "A") | - (A1.target == "C" & A2.target == "G") | (A1.target == "G" & A2.target == "C"), FALSE, TRUE)) %>% - # filter out non-ATCG coded alleles - mutate(non_ATCG = !(check_ATCG(A1.target) & check_ATCG(A2.target))) %>% - # exact match should be kept all the time - mutate(exact_match = A1.target == A1.ref & A2.target == A2.ref) %>% - mutate(sign_flip = ((A1.target == A2.ref & A2.target == A1.ref) | (A1.target == flip2.ref & A2.target == flip1.ref)) & (A1.target != A1.ref & A2.target != A2.ref)) %>% - mutate(strand_flip = ((A1.target == flip1.ref & A2.target == flip2.ref) | (A1.target == flip2.ref & A2.target == flip1.ref)) & (A1.target != A1.ref & A2.target != A2.ref)) %>% - mutate(INDEL = (A2.target == "I" | A2.target == "D" | nchar(A2.target) > 1 | nchar(A1.target) > 1)) %>% - mutate(ID_match = ((A2.target == "D" | A2.target == "I") & (nchar(A1.ref) > 1 | nchar(A2.ref) > 1))) + mutate(variants_id_original = paste(chrom, pos, A2.target, A1.target, sep = ":")) %>% + mutate(variants_id_qced = paste(chrom, pos, A2.ref, A1.ref, sep = ":")) %>% + # filter out totally same rows. + filter(duplicated(.) | !duplicated(.)) %>% + # upper case target/reference A1 A2 + mutate(across(c(A1.target, A2.target, A1.ref, A2.ref), toupper)) %>% + mutate(flip1.ref = strand_flip(A1.ref), flip2.ref = strand_flip(A2.ref)) %>% + # these pairings are ambiguous: because we cannot tell it's an sign flip / strand flip + mutate(strand_unambiguous = if_else((A1.target == "A" & A2.target == "T") | (A1.target == "T" & A2.target == "A") | + (A1.target == "C" & A2.target == "G") | (A1.target == "G" & A2.target == "C"), FALSE, TRUE)) %>% + # filter out non-ATCG coded alleles + mutate(non_ATCG = !(check_ATCG(A1.target) & check_ATCG(A2.target))) %>% + # exact match should be kept all the time + mutate(exact_match = A1.target == A1.ref & A2.target == A2.ref) %>% + mutate(sign_flip = ((A1.target == A2.ref & A2.target == A1.ref) | (A1.target == flip2.ref & A2.target == flip1.ref)) & (A1.target != A1.ref & A2.target != A2.ref)) %>% + mutate(strand_flip = ((A1.target == flip1.ref & A2.target == flip2.ref) | (A1.target == flip2.ref & A2.target == flip1.ref)) & (A1.target != A1.ref & A2.target != A2.ref)) %>% + mutate(INDEL = (A2.target == "I" | A2.target == "D" | nchar(A2.target) > 1 | nchar(A1.target) > 1)) %>% + mutate(ID_match = ((A2.target == "D" | A2.target == "I") & (nchar(A1.ref) > 1 | nchar(A2.ref) > 1))) # if not remove, then this should'nt be a condition to filter out any variants if (!remove_strand_ambiguous) { - match_result <- match_result %>% mutate(strand_unambiguous = TRUE) + match_result <- match_result %>% mutate(strand_unambiguous = TRUE) } # if all strand flip is un-ambigous, then we know ambigous cases are indeed a strand flip # not a sign flip, then we infer there is no ambigous in the whole dataset, and keep those ambigous ones if (nrow(match_result %>% filter(strand_flip == TRUE) %>% filter(strand_unambiguous == TRUE)) == 0) { - match_result <- match_result %>% mutate(strand_unambiguous = TRUE) + match_result <- match_result %>% mutate(strand_unambiguous = TRUE) } # To keep variants: if it's a strand flip, we will keep those unambigous (because if ambigous, cannot know it's trand / sign flip, so discard all) # or exact match or indel match (ID_match) # If not a strand flip, then we will keep those that are exact match / those are sign flip / INDEL matched match_result <- match_result %>% mutate(keep = if_else(strand_flip, true = (strand_unambiguous | exact_match | ID_match), false = - (exact_match | sign_flip | ID_match) + (exact_match | sign_flip | ID_match) )) if (remove_indels) { - match_result <- match_result %>% mutate(keep = if_else(INDEL == FALSE, FALSE, TRUE)) + match_result <- match_result %>% mutate(keep = if_else(INDEL == FALSE, FALSE, TRUE)) } # flip the signs of the column col_to_flip if there is a sign flip if (!is.null(col_to_flip)) { - if (!is.null(match_result[, col_to_flip])) { - match_result[match_result$sign_flip, col_to_flip] <- -1 * match_result[match_result$sign_flip, col_to_flip] - } else { - stop("Column '", col_to_flip, "' not found in target_data.") - } + if (!is.null(match_result[, col_to_flip])) { + match_result[match_result$sign_flip, col_to_flip] <- -1 * match_result[match_result$sign_flip, col_to_flip] + } else { + stop("Column '", col_to_flip, "' not found in target_data.") + } } # flip the strands if there is a strand flip if (flip_strand) { - strand_flipped_indices <- which(match_result$strand_flip) - match_result[strand_flipped_indices, "A1.target"] <- strand_flip(match_result[strand_flipped_indices, "A1.target"]) - match_result[strand_flipped_indices, "A2.target"] <- strand_flip(match_result[strand_flipped_indices, "A2.target"]) + strand_flipped_indices <- which(match_result$strand_flip) + match_result[strand_flipped_indices, "A1.target"] <- strand_flip(match_result[strand_flipped_indices, "A1.target"]) + match_result[strand_flipped_indices, "A2.target"] <- strand_flip(match_result[strand_flipped_indices, "A2.target"]) } # Remove all unnecessary columns used to determine qc status # Finally keep those variants with FLAG keep = TRUE result <- match_result[match_result$keep, , drop = FALSE] - + # FIXME: I think this parameter is confusing. I inheritated directly from our function, whose default setting is TRUE. # It is removing all multi-allelic alleles which is unnecessary. I suggest remove this parameter directly. # What we are trying to avoid is the SAME allele having diferent z score. I defined one parameter remove_same_vars later, but I can re-use this # remove_dup name if (remove_dups) { - dups <- vec_duplicate_detect(result[, c("chrom", "pos", "variants_id_qced")]) - if (any(dups)) { - result <- result[!dups, , drop = FALSE] - warning("Unexpected duplicates were removed.") - } + dups <- vec_duplicate_detect(result[, c("chrom", "pos", "variants_id_qced")]) + if (any(dups)) { + result <- result[!dups, , drop = FALSE] + warning("Unexpected duplicates were removed.") + } } - + result <- result %>% - select(-(flip1.ref:keep)) %>% - select(-A1.target, -A2.target) %>% - rename(A1 = A1.ref, A2 = A2.ref, variant_id = variants_id_qced) + select(-(flip1.ref:keep)) %>% + select(-A1.target, -A2.target) %>% + rename(A1 = A1.ref, A2 = A2.ref, variant_id = variants_id_qced) # default FALSE, but if want to remove same variants having different z score, then set as TRUE if (remove_same_vars) { - same_vars <- vec_duplicate_detect(result[, c("chrom", "pos", "variant_id")]) - if (any(same_vars)) { - result <- result[!same_vars, , drop = FALSE] - message("Same variants with different z scores are removed.") - } + same_vars <- vec_duplicate_detect(result[, c("chrom", "pos", "variant_id")]) + if (any(same_vars)) { + result <- result[!same_vars, , drop = FALSE] + message("Same variants with different z scores are removed.") + } } if (!remove_unmatched) { - match_variant <- result %>% pull(variants_id_original) - match_result <- select(match_result, -(flip1.ref:keep)) %>% - select(-variants_id_original, -A1.target, -A2.target) %>% - rename(A1 = A1.ref, A2 = A2.ref, variant_id = variants_id_qced) - target_data <- target_data %>% mutate(variant_id = paste(chrom, pos, A2, A1, sep = ":")) - if (length(setdiff(target_data %>% pull(variant_id), match_variant)) > 0) { - unmatch_data <- target_data %>% filter(!variant_id %in% match_variant) - result <- rbind(result, unmatch_data %>% mutate(variants_id_original = variant_id)) - result <- result[match(target_data$variant_id, result$variants_id_original), ] %>% select(-variants_id_original) - } + match_variant <- result %>% pull(variants_id_original) + match_result <- select(match_result, -(flip1.ref:keep)) %>% + select(-variants_id_original, -A1.target, -A2.target) %>% + rename(A1 = A1.ref, A2 = A2.ref, variant_id = variants_id_qced) + target_data <- target_data %>% mutate(variant_id = paste(chrom, pos, A2, A1, sep = ":")) + if (length(setdiff(target_data %>% pull(variant_id), match_variant)) > 0) { + unmatch_data <- target_data %>% filter(!variant_id %in% match_variant) + result <- rbind(result, unmatch_data %>% mutate(variants_id_original = variant_id)) + result <- result[match(target_data$variant_id, result$variants_id_original), ] %>% select(-variants_id_original) + } } min_match <- match_min_prop * nrow(ref_variants) if (nrow(result) < min_match) { - stop("Not enough variants have been matched.") + stop("Not enough variants have been matched.") } # throw an error if there are any variant_id that are duplicated (meaning that same variant having different other infos for example z score) if (!remove_same_vars & any(duplicated(result$variant_id))) { - stop("In the input, there are duplicated variants with different z scores. Please check the data and determine which to keep.") + stop("In the input, there are duplicated variants with different z scores. Please check the data and determine which to keep.") } return(list(target_data_qced = result, qc_summary = match_result)) @@ -288,4 +309,4 @@ align_variant_names <- function(source, reference, remove_indels = FALSE) { aligned_variants = aligned_variants, unmatched_indices = unmatched_indices ) -} +} \ No newline at end of file From 756b6570d1e56c4618ff03d5fc21f2be459d4fca Mon Sep 17 00:00:00 2001 From: Kate Lawrence Date: Thu, 2 Oct 2025 11:58:39 -0700 Subject: [PATCH 02/15] clean build from LD varaint ids --- R/allele_qc.R | 8 +++++++- R/sumstats_qc.R | 22 ++++++++++++++++++++++ 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/R/allele_qc.R b/R/allele_qc.R index 69fb1471..fba18628 100644 --- a/R/allele_qc.R +++ b/R/allele_qc.R @@ -230,6 +230,7 @@ allele_qc <- function(target_data, ref_variants, col_to_flip = NULL, #' #' @param source A character vector of variant names in the format "chr:pos:A2:A1" or "chr:pos_A2_A1". #' @param reference A character vector of variant names in the format "chr:pos:A2:A1" or "chr:pos_A2_A1". +#' @param remove_build_suffix Whether to strip trailing genome build suffixes like ":b38" or "_b38" before alignment. Default TRUE. #' #' @return A list with two elements: #' - aligned_variants: A character vector of aligned variant names. @@ -241,7 +242,12 @@ allele_qc <- function(target_data, ref_variants, col_to_flip = NULL, #' align_variant_names(source, reference) #' #' @export -align_variant_names <- function(source, reference, remove_indels = FALSE) { +align_variant_names <- function(source, reference, remove_indels = FALSE, remove_build_suffix = TRUE) { + # Optionally strip build suffix like :b38 or _b38 from both sides for robust alignment + if (remove_build_suffix) { + source <- gsub("(:|_)b[0-9]+$", "", source) + reference <- gsub("(:|_)b[0-9]+$", "", reference) + } # Check if source and reference follow the expected pattern source_pattern <- grepl("^(chr)?[0-9]+:[0-9]+:[ATCG*]+:[ATCG*]+$|^(chr)?[0-9]+:[0-9]+_[ATCG*]+_[ATCG*]+$", source) reference_pattern <- grepl("^(chr)?[0-9]+:[0-9]+:[ATCG*]+:[ATCG*]+$|^(chr)?[0-9]+:[0-9]+_[ATCG*]+_[ATCG*]+$", reference) diff --git a/R/sumstats_qc.R b/R/sumstats_qc.R index c0424fe3..d0756bbb 100644 --- a/R/sumstats_qc.R +++ b/R/sumstats_qc.R @@ -54,6 +54,28 @@ rss_basic_qc <- function(sumstats, LD_data, skip_region = NULL, remove_indels = sumstats_processed <- allele_flip$target_data_qced %>% arrange(pos) + # Align and subset LD by mapping core IDs (strip trailing build suffix) to exact LD IDs + ld_mat <- LD_data$combined_LD_matrix + ld_ids <- tryCatch(rownames(ld_mat), error = function(e) NULL) + if (is.null(ld_ids)) { + stop("LD matrix rownames are NULL; cannot align variant IDs.") + } + present <- sumstats_processed$variant_id %in% ld_ids + if (sum(present) == 0) { + strip_build <- function(x) sub("(:|_)b[0-9]+$", "", x) + ld_core <- strip_build(ld_ids) + ss_core <- strip_build(sumstats_processed$variant_id) + map_idx <- match(ss_core, ld_core) + remap <- !is.na(map_idx) + if (sum(remap) > 0) { + sumstats_processed$variant_id[remap] <- ld_ids[map_idx[remap]] + present <- sumstats_processed$variant_id %in% ld_ids + } + } + if (sum(present) == 0) { + stop("No overlapping variants between sumstats and LD after alignment.") + } + LD_mat_processed <- LD_data$combined_LD_matrix[sumstats_processed$variant_id, sumstats_processed$variant_id, drop = FALSE] return(list(sumstats = sumstats_processed, LD_mat = LD_mat_processed)) From 308487ecd5c3adeb7af1330d13f9dd30e0cb90af Mon Sep 17 00:00:00 2001 From: Kate Lawrence Date: Thu, 2 Oct 2025 11:59:56 -0700 Subject: [PATCH 03/15] add missing defualt for pip_cutoff_to_skip_ld --- R/colocboost_pipeline.R | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/R/colocboost_pipeline.R b/R/colocboost_pipeline.R index 2b22d712..766eef6e 100644 --- a/R/colocboost_pipeline.R +++ b/R/colocboost_pipeline.R @@ -687,7 +687,11 @@ qc_regional_data <- function(region_data, n <- sumstat$n var_y <- sumstat$var_y conditions_sumstat <- names(sumstats)[ii] - pip_cutoff_to_skip_ld <- pip_cutoff_to_skip_sumstat[conditions_sumstat] %>% as.numeric() + pip_cutoff_to_skip_ld <- tryCatch( + as.numeric(pip_cutoff_to_skip_sumstat[conditions_sumstat]), + error = function(e) 0 + ) + if (is.na(pip_cutoff_to_skip_ld)) pip_cutoff_to_skip_ld <- 0 # Preprocess the input data preprocess_results <- rss_basic_qc(sumstat$sumstats, LD_data, remove_indels = remove_indels) From 77630df03d4b5416dfbda75431d3aabcc61a764f Mon Sep 17 00:00:00 2001 From: Kate Lawrence Date: Thu, 2 Oct 2025 12:36:42 -0700 Subject: [PATCH 04/15] chr anming for LD columns --- R/LD.R | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/R/LD.R b/R/LD.R index 426790ff..9aeaea2b 100644 --- a/R/LD.R +++ b/R/LD.R @@ -355,6 +355,15 @@ load_LD_matrix <- function(LD_meta_file_path, region, extract_coordinates = NULL LD_matrices = extracted_LD_matrices_list, variants = extracted_LD_variants_list ) + # Normalize LD variant IDs at load time: add 'chr' prefix if missing + if (!is.null(colnames(combined_LD_matrix))) { + cn <- as.character(colnames(combined_LD_matrix)) + colnames(combined_LD_matrix) <- ifelse(startsWith(cn, "chr"), cn, paste0("chr", cn)) + } + if (!is.null(rownames(combined_LD_matrix))) { + rn <- as.character(rownames(combined_LD_matrix)) + rownames(combined_LD_matrix) <- ifelse(startsWith(rn, "chr"), rn, paste0("chr", rn)) + } combined_LD_variants <- rownames(combined_LD_matrix) # Now create block_metadata with all the information we've accumulated From 50e8dcb8d32e95b716d83644e7a220b742a47a65 Mon Sep 17 00:00:00 2001 From: Kate Lawrence Date: Thu, 2 Oct 2025 12:42:23 -0700 Subject: [PATCH 05/15] chr naming for LD columns --- R/LD.R | 2 ++ 1 file changed, 2 insertions(+) diff --git a/R/LD.R b/R/LD.R index 9aeaea2b..df0379a2 100644 --- a/R/LD.R +++ b/R/LD.R @@ -384,6 +384,8 @@ load_LD_matrix <- function(LD_meta_file_path, region, extract_coordinates = NULL ref_panel <- do.call(rbind, lapply(strsplit(rownames(combined_LD_matrix), ":"), function(x) { data.frame(chrom = x[1], pos = as.integer(x[2]), A2 = x[3], A1 = x[4]) })) + # Normalize ref_panel chrom to drop 'chr' prefix for merging with sumstats (which often use numeric chrom) + ref_panel$chrom <- sub("^chr", "", as.character(ref_panel$chrom)) merged_variant_list <- do.call(rbind, extracted_LD_variants_list) ref_panel$variant_id <- rownames(combined_LD_matrix) From 84e2e517ddfeb9c214c132efd4c7880cb146b21a Mon Sep 17 00:00:00 2001 From: Kate Lawrence Date: Thu, 2 Oct 2025 12:47:42 -0700 Subject: [PATCH 06/15] chr and build naming in LD variant ids --- R/sumstats_qc.R | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/R/sumstats_qc.R b/R/sumstats_qc.R index d0756bbb..3923e168 100644 --- a/R/sumstats_qc.R +++ b/R/sumstats_qc.R @@ -63,8 +63,9 @@ rss_basic_qc <- function(sumstats, LD_data, skip_region = NULL, remove_indels = present <- sumstats_processed$variant_id %in% ld_ids if (sum(present) == 0) { strip_build <- function(x) sub("(:|_)b[0-9]+$", "", x) - ld_core <- strip_build(ld_ids) - ss_core <- strip_build(sumstats_processed$variant_id) + drop_chr <- function(x) sub("^chr", "", x) + ld_core <- drop_chr(strip_build(ld_ids)) + ss_core <- drop_chr(strip_build(sumstats_processed$variant_id)) map_idx <- match(ss_core, ld_core) remap <- !is.na(map_idx) if (sum(remap) > 0) { From 089383f227b3848cd2343acc74b1bc4fd11d544b Mon Sep 17 00:00:00 2001 From: Kate Lawrence Date: Thu, 2 Oct 2025 13:09:08 -0700 Subject: [PATCH 07/15] harmonize LD block variants, handel nas in printing --- R/LD.R | 12 ++++++++++-- R/raiss.R | 8 ++++---- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/R/LD.R b/R/LD.R index df0379a2..fb9d00b9 100644 --- a/R/LD.R +++ b/R/LD.R @@ -368,12 +368,20 @@ load_LD_matrix <- function(LD_meta_file_path, region, extract_coordinates = NULL # Now create block_metadata with all the information we've accumulated block_variants <- lapply(extracted_LD_variants_list, function(v) v$variants) + # Normalize block_variants to match combined_LD_variants format (add 'chr' if needed) + block_variants_normalized <- lapply(block_variants, function(v) { + if (!any(startsWith(v, "chr"))) { + paste0("chr", v) + } else { + v + } + }) block_metadata <- data.frame( block_id = seq_along(LD_file_paths), chrom = block_chroms, size = sapply(block_variants, length), - start_idx = sapply(block_variants, function(v) min(match(v, combined_LD_variants))), - end_idx = sapply(block_variants, function(v) max(match(v, combined_LD_variants))), + start_idx = sapply(block_variants_normalized, function(v) min(match(v, combined_LD_variants))), + end_idx = sapply(block_variants_normalized, function(v) max(match(v, combined_LD_variants))), stringsAsFactors = FALSE ) diff --git a/R/raiss.R b/R/raiss.R index 278292fa..5b27e572 100644 --- a/R/raiss.R +++ b/R/raiss.R @@ -330,10 +330,10 @@ filter_raiss_output <- function(zscores, R2_threshold = 0.6, minimum_ld = 5, ver # Count statistics before filtering NSNPs_bf_filt <- nrow(zscores) - NSNPs_initial <- sum(zscores$raiss_R2 == 2.0) - NSNPs_imputed <- sum(zscores$raiss_R2 != 2.0) - NSNPs_ld_filt <- sum(zscores$raiss_ld_score < minimum_ld) - NSNPs_R2_filt <- sum(zscores$raiss_R2 < R2_threshold) + NSNPs_initial <- sum(zscores$raiss_R2 == 2.0, na.rm = TRUE) + NSNPs_imputed <- sum(zscores$raiss_R2 != 2.0, na.rm = TRUE) + NSNPs_ld_filt <- sum(zscores$raiss_ld_score < minimum_ld, na.rm = TRUE) + NSNPs_R2_filt <- sum(zscores$raiss_R2 < R2_threshold, na.rm = TRUE) # Apply filters zscores_nofilter <- zscores From 1b90a950b10be66350c6248ba8553978bfc7fbc6 Mon Sep 17 00:00:00 2001 From: Kate Lawrence Date: Thu, 2 Oct 2025 13:42:00 -0700 Subject: [PATCH 08/15] harmonize all to have chr prefix --- R/colocboost_pipeline.R | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/R/colocboost_pipeline.R b/R/colocboost_pipeline.R index 766eef6e..70df18ad 100644 --- a/R/colocboost_pipeline.R +++ b/R/colocboost_pipeline.R @@ -436,13 +436,24 @@ colocboost_analysis_pipeline <- function(region_data, if (!is.null(sumstat_data$sumstats)) { sumstats <- lapply(sumstat_data$sumstats, function(ss) { z <- ss$sumstats$z - variant <- paste0("chr", ss$sumstats$variant_id) + # Prefix 'chr' only if missing to avoid 'chrchr' + vi <- ss$sumstats$variant_id + has_chr <- startsWith(as.character(vi), "chr") + variant <- ifelse(has_chr, as.character(vi), paste0("chr", as.character(vi))) n <- ss$n data.frame("z" = z, "n" = n, "variant" = variant) }) names(sumstats) <- names(sumstat_data$sumstats) LD_mat <- lapply(sumstat_data$LD_mat, function(ld) { - colnames(ld) <- rownames(ld) <- paste0("chr", colnames(ld)) + # Ensure LD dimnames have 'chr' only once + cn <- colnames(ld) + rn <- rownames(ld) + if (!is.null(cn)) { + colnames(ld) <- ifelse(startsWith(as.character(cn), "chr"), as.character(cn), paste0("chr", as.character(cn))) + } + if (!is.null(rn)) { + rownames(ld) <- ifelse(startsWith(as.character(rn), "chr"), as.character(rn), paste0("chr", as.character(rn))) + } return(ld) }) LD_match <- sumstat_data$LD_match @@ -687,11 +698,11 @@ qc_regional_data <- function(region_data, n <- sumstat$n var_y <- sumstat$var_y conditions_sumstat <- names(sumstats)[ii] - pip_cutoff_to_skip_ld <- tryCatch( - as.numeric(pip_cutoff_to_skip_sumstat[conditions_sumstat]), - error = function(e) 0 - ) - if (is.na(pip_cutoff_to_skip_ld)) pip_cutoff_to_skip_ld <- 0 + pip_cutoff_to_skip_ld <- if (conditions_sumstat %in% names(pip_cutoff_to_skip_sumstat)) { + as.numeric(pip_cutoff_to_skip_sumstat[conditions_sumstat]) + } else { + 0 + } # Preprocess the input data preprocess_results <- rss_basic_qc(sumstat$sumstats, LD_data, remove_indels = remove_indels) From 51ba8e8b86712cbf6322b3864172de0dbc38afcc Mon Sep 17 00:00:00 2001 From: Kate Lawrence Date: Thu, 2 Oct 2025 14:02:34 -0700 Subject: [PATCH 09/15] remove nas from sumstats --- R/colocboost_pipeline.R | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/R/colocboost_pipeline.R b/R/colocboost_pipeline.R index 70df18ad..eb296f2c 100644 --- a/R/colocboost_pipeline.R +++ b/R/colocboost_pipeline.R @@ -441,7 +441,15 @@ colocboost_analysis_pipeline <- function(region_data, has_chr <- startsWith(as.character(vi), "chr") variant <- ifelse(has_chr, as.character(vi), paste0("chr", as.character(vi))) n <- ss$n - data.frame("z" = z, "n" = n, "variant" = variant) + + # Filter out NA values from z-scores and corresponding variants + na_mask <- !is.na(z) + if (sum(na_mask) == 0) { + message("Warning: All z-scores are NA for this summary statistic dataset") + return(data.frame("z" = numeric(0), "n" = numeric(0), "variant" = character(0))) + } + + data.frame("z" = z[na_mask], "n" = n, "variant" = variant[na_mask]) }) names(sumstats) <- names(sumstat_data$sumstats) LD_mat <- lapply(sumstat_data$LD_mat, function(ld) { From a2aa4e248edfe094e03c838435d63eb122806305 Mon Sep 17 00:00:00 2001 From: Kate Lawrence Date: Fri, 10 Oct 2025 20:01:15 -0700 Subject: [PATCH 10/15] skip gwas in focal mode if colocboost errors --- R/colocboost_pipeline.R | 86 +++++++++++++++++++++++++++++++++-------- 1 file changed, 69 insertions(+), 17 deletions(-) diff --git a/R/colocboost_pipeline.R b/R/colocboost_pipeline.R index eb296f2c..d11feedd 100644 --- a/R/colocboost_pipeline.R +++ b/R/colocboost_pipeline.R @@ -470,6 +470,35 @@ colocboost_analysis_pipeline <- function(region_data, sumstats <- LD_mat <- dict_sumstatLD <- NULL } + # Helper: validate a single GWAS study data.frame and its LD matrix + is_valid_sumstat_entry <- function(df, ld_matrix) { + if (is.null(df) || nrow(df) == 0) return(FALSE) + if (!all(c("z", "n", "variant") %in% colnames(df))) return(FALSE) + n_vals <- suppressWarnings(as.numeric(unique(df$n))) + if (length(n_vals) == 0 || is.na(n_vals[1]) || !is.finite(n_vals[1]) || n_vals[1] < 3) return(FALSE) + if (all(is.na(df$z))) return(FALSE) + if (is.null(ld_matrix)) return(FALSE) + if (!is.matrix(ld_matrix)) return(FALSE) + if (any(!is.finite(ld_matrix))) return(FALSE) + # Ensure at least 1 overlapping variant with LD matrix dimnames + ld_vars <- intersect(rownames(ld_matrix), colnames(ld_matrix)) + if (length(ld_vars) == 0) return(FALSE) + if (length(intersect(as.character(df$variant), as.character(ld_vars))) == 0) return(FALSE) + TRUE + } + + # Helper: filter joint structures (sumstats list and dict_sumstatLD) to only valid entries + filter_valid_sumstats <- function(sumstats, LD_mat, dict_sumstatLD) { + if (is.null(sumstats) || is.null(dict_sumstatLD) || is.null(LD_mat)) return(list(sumstats = NULL, dict = NULL)) + keep <- vector("logical", length(sumstats)) + for (i in seq_along(sumstats)) { + ld_idx <- dict_sumstatLD[i, 2] + ld_matrix <- if (!is.na(ld_idx)) LD_mat[[ld_idx]] else NULL + keep[i] <- is_valid_sumstat_entry(sumstats[[i]], ld_matrix) + } + if (!any(keep)) return(list(sumstats = NULL, dict = NULL)) + list(sumstats = sumstats[keep], dict = matrix(dict_sumstatLD[keep, , drop = FALSE], ncol = 2)) + } ####### ========= streamline three types of analyses ======== ######## if (is.null(X) & is.null(sumstats)) { @@ -500,16 +529,22 @@ colocboost_analysis_pipeline <- function(region_data, if (joint_gwas & !is.null(sumstats)) { message(paste("====== Performing non-focaled version GWAS-xQTL ColocBoost on", length(Y), "contexts and", length(sumstats), "GWAS. =====")) t21 <- Sys.time() - traits <- c(names(Y), names(sumstats)) - res_gwas <- colocboost( - X = X, Y = Y, sumstat = sumstats, LD = LD_mat, - dict_YX = dict_YX, dict_sumstatLD = dict_sumstatLD, - outcome_names = traits, focal_outcome_idx = NULL, - output_level = 2, ... - ) - t22 <- Sys.time() - analysis_results$joint_gwas <- res_gwas - analysis_results$computing_time$Analysis$joint_gwas <- t22 - t21 + # Filter invalid GWAS before calling colocboost + filtered <- filter_valid_sumstats(sumstats, LD_mat, dict_sumstatLD) + if (is.null(filtered$sumstats)) { + message("All GWAS studies failed validation; skipping joint GWAS analysis for this region.") + } else { + traits <- c(names(Y), names(filtered$sumstats)) + res_gwas <- colocboost( + X = X, Y = Y, sumstat = filtered$sumstats, LD = LD_mat, + dict_YX = dict_YX, dict_sumstatLD = filtered$dict, + outcome_names = traits, focal_outcome_idx = NULL, + output_level = 2, ... + ) + t22 <- Sys.time() + analysis_results$joint_gwas <- res_gwas + analysis_results$computing_time$Analysis$joint_gwas <- t22 - t21 + } } # - run focaled version of ColocBoost for each GWAS if (separate_gwas & !is.null(sumstats)) { @@ -517,15 +552,32 @@ colocboost_analysis_pipeline <- function(region_data, res_gwas_separate <- analysis_results$separate_gwas for (i_gwas in 1:nrow(dict_sumstatLD)) { current_study <- names(sumstats)[i_gwas] - message(paste("====== Performing focaled version GWAS-xQTL ColocBoost on", length(Y), "contexts and ", current_study, "GWAS. =====")) dict <- dict_sumstatLD[i_gwas, ] + # Validate this study before calling colocboost + ld_idx <- dict[2] + ld_matrix <- if (!is.na(ld_idx)) LD_mat[[ld_idx]] else NULL + valid <- is_valid_sumstat_entry(sumstats[[dict[1]]], ld_matrix) + if (!isTRUE(valid)) { + message(paste("Skipping", current_study, "due to invalid N/z/LD.")) + next + } + message(paste("====== Performing focaled version GWAS-xQTL ColocBoost on", length(Y), "contexts and ", current_study, "GWAS. =====")) traits <- c(names(Y), current_study) - res_gwas_separate[[current_study]] <- colocboost( - X = X, Y = Y, sumstat = sumstats[dict[1]], - LD = LD_mat[dict[2]], dict_YX = dict_YX, - outcome_names = traits, focal_outcome_idx = length(traits), - output_level = 2, ... - ) + + # Try to run colocboost with error handling + tryCatch({ + res_gwas_separate[[current_study]] <- colocboost( + X = X, Y = Y, sumstat = sumstats[dict[1]], + LD = LD_mat[dict[2]], dict_YX = dict_YX, + outcome_names = traits, focal_outcome_idx = length(traits), + output_level = 2, ... + ) + }, error = function(e) { + message(paste("Error in colocboost analysis for", current_study, ":")) + message(paste("Error message:", e$message)) + message(paste("Continuing with next GWAS...")) + res_gwas_separate[[current_study]] <<- NULL + }) } t32 <- Sys.time() analysis_results$separate_gwas <- res_gwas_separate From e194bad0fcfc010267add1e572f68579905596c6 Mon Sep 17 00:00:00 2001 From: kal26 Date: Thu, 30 Oct 2025 17:34:57 +0000 Subject: [PATCH 11/15] Update documentation --- DESCRIPTION | 2 +- NAMESPACE | 2 ++ man/align_variant_names.Rd | 9 ++++++++- 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 3ece6a87..87e3c40b 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -62,4 +62,4 @@ LinkingTo: RcppGSL NeedsCompilation: yes VignetteBuilder: knitr -RoxygenNote: 7.3.2 +RoxygenNote: 7.3.3 diff --git a/NAMESPACE b/NAMESPACE index ea3fe90e..a8ed34c0 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -141,6 +141,8 @@ importFrom(data.table,setnames) importFrom(doFuture,registerDoFuture) importFrom(dplyr,"%>%") importFrom(dplyr,across) +importFrom(dplyr,all_of) +importFrom(dplyr,any_of) importFrom(dplyr,arrange) importFrom(dplyr,as_tibble) importFrom(dplyr,bind_rows) diff --git a/man/align_variant_names.Rd b/man/align_variant_names.Rd index bd112604..d89b7ca5 100644 --- a/man/align_variant_names.Rd +++ b/man/align_variant_names.Rd @@ -4,12 +4,19 @@ \alias{align_variant_names} \title{Align Variant Names} \usage{ -align_variant_names(source, reference, remove_indels = FALSE) +align_variant_names( + source, + reference, + remove_indels = FALSE, + remove_build_suffix = TRUE +) } \arguments{ \item{source}{A character vector of variant names in the format "chr:pos:A2:A1" or "chr:pos_A2_A1".} \item{reference}{A character vector of variant names in the format "chr:pos:A2:A1" or "chr:pos_A2_A1".} + +\item{remove_build_suffix}{Whether to strip trailing genome build suffixes like ":b38" or "_b38" before alignment. Default TRUE.} } \value{ A list with two elements: From 47ae7d07aaa1779287cce5d2388fa82eb89f10ee Mon Sep 17 00:00:00 2001 From: Kate Lawrence Date: Thu, 30 Oct 2025 10:38:40 -0700 Subject: [PATCH 12/15] add rss_qc as default qc_method --- R/colocboost_pipeline.R | 2 ++ 1 file changed, 2 insertions(+) diff --git a/R/colocboost_pipeline.R b/R/colocboost_pipeline.R index 2b22d712..d12bd6d9 100644 --- a/R/colocboost_pipeline.R +++ b/R/colocboost_pipeline.R @@ -554,6 +554,7 @@ qc_regional_data <- function(region_data, qc_method = c("rss_qc", "dentist", "slalom"), impute = TRUE, impute_opts = list(rcond = 0.01, R2_threshold = 0.6, minimum_ld = 5, lamb = 0.01)) { + if (!is.null(qc_method)) qc_method <- match.arg(qc_method) #### related internal functions # Add context names to colname of Y if missing add_context_to_Y <- function(res_Y) { @@ -675,6 +676,7 @@ qc_regional_data <- function(region_data, qc_method = c("rss_qc", "dentist", "slalom"), impute = TRUE, impute_opts = list(rcond = 0.01, R2_threshold = 0.6, minimum_ld = 5, lamb = 0.01)) { + if (!is.null(qc_method)) qc_method <- match.arg(qc_method) n_LD <- length(sumstat_data$LD_info) final_sumstats <- final_LD <- NULL LD_match <- c() From 4d60845831620da3a0b8d8a43be9f54aa4d9880f Mon Sep 17 00:00:00 2001 From: Kate Lawrence Date: Thu, 30 Oct 2025 10:42:40 -0700 Subject: [PATCH 13/15] add default pip_cutoff for multiple individual contexts or sumstats --- R/colocboost_pipeline.R | 34 +++++++++++++++++++++++++++++++--- 1 file changed, 31 insertions(+), 3 deletions(-) diff --git a/R/colocboost_pipeline.R b/R/colocboost_pipeline.R index d12bd6d9..5b98aab4 100644 --- a/R/colocboost_pipeline.R +++ b/R/colocboost_pipeline.R @@ -615,6 +615,15 @@ qc_regional_data <- function(region_data, # - add context to colname of Y Y <- add_context_to_Y(Y) n_context <- length(X) + # Validate pip_cutoff_to_skip_ind length: allow scalar or exact length; otherwise error + if (length(pip_cutoff_to_skip_ind) == 1) { + pip_cutoff_vec <- rep(pip_cutoff_to_skip_ind, n_context) + } else if (length(pip_cutoff_to_skip_ind) == n_context) { + pip_cutoff_vec <- pip_cutoff_to_skip_ind + } else { + stop(sprintf("pip_cutoff_to_skip_ind length (%d) must be 1 or equal to number of contexts (%d)", + length(pip_cutoff_to_skip_ind), n_context)) + } residual_X <- residual_Y <- list() keep_contexts <- c() for (i_context in 1:n_context) { @@ -626,8 +635,8 @@ qc_regional_data <- function(region_data, # - remove variants with maf < maf_cutoff # tmp <- filter_resX_maf(resX, maf, maf_cutoff = maf_cutoff) resX <- filter_X(resX, missing_rate_thresh = NULL, maf_thresh = maf_cutoff, maf = maf) - # Initial PIP check - resY <- filter_resY_pip(resX, resY, pip_cutoff = pip_cutoff_to_skip_ind[i_context], context = context) + # Initial PIP check (use recycled vector element) + resY <- filter_resY_pip(resX, resY, pip_cutoff = pip_cutoff_vec[i_context], context = context) if (!is.null(resY)) { residual_X <- c(residual_X, list(resX)) residual_Y <- c(residual_Y, list(resY)) @@ -689,7 +698,26 @@ qc_regional_data <- function(region_data, n <- sumstat$n var_y <- sumstat$var_y conditions_sumstat <- names(sumstats)[ii] - pip_cutoff_to_skip_ld <- pip_cutoff_to_skip_sumstat[conditions_sumstat] %>% as.numeric() + # Determine PIP cutoff per sumstat with strict validation: + # - scalar applies to all studies + # - length equals number of studies -> positional mapping + # - named vector that covers ALL study names -> name-based mapping + # Otherwise: throw error (do not silently default) + if (length(pip_cutoff_to_skip_sumstat) == 1) { + pip_cutoff_to_skip_ld <- as.numeric(pip_cutoff_to_skip_sumstat) + } else if (length(pip_cutoff_to_skip_sumstat) == length(sumstats) && is.null(names(pip_cutoff_to_skip_sumstat))) { + pip_cutoff_to_skip_ld <- as.numeric(pip_cutoff_to_skip_sumstat[ii]) + } else if (!is.null(names(pip_cutoff_to_skip_sumstat))) { + missing_names <- setdiff(names(sumstats), names(pip_cutoff_to_skip_sumstat)) + if (length(missing_names) > 0) { + stop(sprintf("pip_cutoff_to_skip_sumstat must provide values for all studies; missing: %s", + paste(missing_names, collapse = ", "))) + } + pip_cutoff_to_skip_ld <- as.numeric(pip_cutoff_to_skip_sumstat[conditions_sumstat]) + } else { + stop(sprintf("pip_cutoff_to_skip_sumstat length (%d) must be 1, equal to number of studies (%d), or be a named vector covering all studies", + length(pip_cutoff_to_skip_sumstat), length(sumstats))) + } # Preprocess the input data preprocess_results <- rss_basic_qc(sumstat$sumstats, LD_data, remove_indels = remove_indels) From 705eae49f8b92fdeea9d82299481b48bcaa8b19d Mon Sep 17 00:00:00 2001 From: Kate Lawrence Date: Thu, 30 Oct 2025 10:58:10 -0700 Subject: [PATCH 14/15] get names from region_name_col even if extract_region_name isn't provided. Add fallback region nameing (reigon_1, region_2, ..) --- R/file_utils.R | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/R/file_utils.R b/R/file_utils.R index ce3253ff..6e2444a6 100644 --- a/R/file_utils.R +++ b/R/file_utils.R @@ -243,7 +243,20 @@ load_phenotype_data <- function(phenotype_path, region, extract_region_name = NU stop("region_name_col is out of bounds for the number of columns in tabix_data.") } } else { - return(tabix_data %>% t()) + # No extract_region_name filter: still transpose and assign names appropriately + result <- tabix_data %>% t() + if (!is.null(region_name_col) && (region_name_col %% 1 == 0)) { + if (region_name_col <= ncol(tabix_data)) { + # After transpose, the desired name row is at index region_name_col + colnames(result) <- result[region_name_col, ] + } else { + stop("region_name_col is out of bounds for the number of columns in tabix_data.") + } + } else if (is.null(colnames(result))) { + # Fallback: synthesize names if still missing + colnames(result) <- paste0("region_", seq_len(ncol(result))) + } + return(result) } })) From 922e236a413374d1771bfc506fa677bd1591017a Mon Sep 17 00:00:00 2001 From: Kate Lawrence Date: Fri, 5 Dec 2025 08:09:19 -0800 Subject: [PATCH 15/15] Add comprehensive file existence checks with clear error messages - Check for missing phenotype, covariate, genotype, sumstat, column mapping, and LD metadata files - Error (not warn) when files are missing to catch path issues early - Fix genotype file check to look for .bed/.bim/.fam or .pgen/.pvar/.psam files instead of base filename - Improve error messages to show which files are missing with condition names - Filter phenotypes without data for specific region before loading --- R/colocboost_pipeline.R | 188 +++++++++++++++++++++++++++++++++++++++- R/file_utils.R | 30 +++++-- 2 files changed, 209 insertions(+), 9 deletions(-) diff --git a/R/colocboost_pipeline.R b/R/colocboost_pipeline.R index 5762707a..89997940 100644 --- a/R/colocboost_pipeline.R +++ b/R/colocboost_pipeline.R @@ -111,16 +111,143 @@ load_multitask_regional_data <- function(region, # a string of chr:start-end for for (i_data in 1:n_dataset) { # extract genotype file name genotype <- genotype_list[i_data] + + # Check if PLINK files exist (.bed/.bim/.fam for PLINK 1 or .pgen/.pvar/.psam for PLINK 2) + bed_file <- paste0(genotype, ".bed") + bim_file <- paste0(genotype, ".bim") + fam_file <- paste0(genotype, ".fam") + pgen_file <- paste0(genotype, ".pgen") + pvar_file <- paste0(genotype, ".pvar") + psam_file <- paste0(genotype, ".psam") + + plink1_exists <- file.exists(bed_file) && file.exists(bim_file) && file.exists(fam_file) + plink2_exists <- file.exists(pgen_file) && file.exists(pvar_file) && file.exists(psam_file) + + if (!plink1_exists && !plink2_exists) { + missing_files <- c() + if (!file.exists(bed_file)) missing_files <- c(missing_files, bed_file) + if (!file.exists(bim_file)) missing_files <- c(missing_files, bim_file) + if (!file.exists(fam_file)) missing_files <- c(missing_files, fam_file) + if (!file.exists(pgen_file)) missing_files <- c(missing_files, pgen_file) + if (!file.exists(pvar_file)) missing_files <- c(missing_files, pvar_file) + if (!file.exists(psam_file)) missing_files <- c(missing_files, psam_file) + + stop("Genotype files do not exist for dataset ", i_data, ".\n", + " Base path: ", genotype, "\n", + " Missing files: ", paste(missing_files, collapse = ", "), + call. = FALSE) + } + # extract phenotype and covariate file names pos <- which(match_geno_pheno == i_data) phenotype <- phenotype_list[pos] covariate <- covariate_list[pos] conditions <- conditions_list_individual[pos] + + # Check for missing files first and provide clear errors + missing_pheno <- !vapply(phenotype, file.exists, logical(1)) + missing_covar <- !vapply(covariate, file.exists, logical(1)) + + if (any(missing_pheno)) { + missing_pheno_files <- phenotype[missing_pheno] + missing_pheno_conditions <- conditions[missing_pheno] + stop("Phenotype file(s) do not exist:\n", + paste(paste0(" - ", missing_pheno_conditions, ": ", missing_pheno_files), collapse = "\n"), + call. = FALSE) + } + + if (any(missing_covar)) { + missing_covar_files <- covariate[missing_covar] + missing_covar_conditions <- conditions[missing_covar] + stop("Covariate file(s) do not exist:\n", + paste(paste0(" - ", missing_covar_conditions, ": ", missing_covar_files), collapse = "\n"), + call. = FALSE) + } + + # All files should exist at this point (errors would have been thrown) + + # Helper function to check if phenotype file has data for the specific region + # Note: The file may have data elsewhere, but we only care about this region + # This is a pre-check; the actual loader will do the definitive check + check_phenotype_has_data <- function(pheno_file, region_str) { + # File existence already checked above, so we can assume it exists here + # Parse region to get chromosome and coordinates + region_parts <- strsplit(region_str, ":", fixed = TRUE)[[1]] + if (length(region_parts) != 2) { + return(TRUE) # If region format is wrong, let the actual load function handle it + } + chr <- region_parts[1] + coord_parts <- strsplit(region_parts[2], "-", fixed = TRUE)[[1]] + if (length(coord_parts) != 2) { + return(TRUE) # If region format is wrong, let the actual load function handle it + } + start <- coord_parts[1] + end <- coord_parts[2] + + # Try both formats: with and without 'chr' prefix + # Different files may be indexed with different formats + region_formats <- c( + region_str, # Try original format first (e.g., "chr1:0-1000000") + paste0(sub("^chr", "", chr), ":", start, "-", end) # Try without chr (e.g., "1:0-1000000") + ) + + # Use tabix to check if file has data for this specific region + # Use the same approach as tabix_region: try fread with tabix command + for (region_tabix in region_formats) { + # Try using fread with tabix command (same as tabix_region does) + cmd_output <- tryCatch( + { + data.table::fread(cmd = paste0("tabix -h ", pheno_file, " ", region_tabix), + sep = "auto", header = "auto") + }, + error = function(e) NULL + ) + + # If we got output with data, return TRUE + if (!is.null(cmd_output) && nrow(cmd_output) > 0) { + return(TRUE) + } + + # If fread failed, it might be because file isn't indexed or has other issues + # In that case, be lenient and let the actual loader handle it + # (The loader will give a proper error message if needed) + } + + # If we tried all formats and got no data, return FALSE + # But only if we successfully queried (didn't get errors) + # For now, if we reach here after trying both formats, assume no data + return(FALSE) + } + + # Filter to only phenotypes with data for this specific region + has_data <- vapply(phenotype, function(p) check_phenotype_has_data(p, region), logical(1)) + valid_indices <- which(has_data) + + if (length(valid_indices) == 0) { + warning("No phenotype files have data for region ", region, + " for genotype dataset ", i_data, ". Skipping this dataset.", call. = FALSE) + next + } + + if (length(valid_indices) < length(phenotype)) { + skipped_conditions <- conditions[!has_data] + warning("Filtered from ", length(phenotype), " to ", length(valid_indices), + " tissues with data in region ", region, + ". Skipped tissues (no data in this region): ", paste(skipped_conditions, collapse = ", "), + call. = FALSE) + } + + # Filter lists to only valid phenotypes + phenotype_filtered <- phenotype[valid_indices] + covariate_filtered <- covariate[valid_indices] + conditions_filtered <- conditions[valid_indices] + + # Load all valid phenotypes together dat <- load_regional_univariate_data( - genotype = genotype, phenotype = phenotype, - covariate = covariate, region = region, + genotype = genotype, phenotype = phenotype_filtered, + covariate = covariate_filtered, region = region, association_window = association_window, - conditions = conditions, xvar_cutoff = xvar_cutoff, + conditions = conditions_filtered, xvar_cutoff = xvar_cutoff, maf_cutoff = maf_cutoff, mac_cutoff = mac_cutoff, imiss_cutoff = imiss_cutoff, keep_indel = keep_indel, keep_samples = keep_samples, keep_variants = keep_variants, @@ -129,6 +256,7 @@ load_multitask_regional_data <- function(region, # a string of chr:start-end for region_name_col = region_name_col, scale_residuals = scale_residuals ) + if (is.null(individual_data)) { individual_data <- dat } else { @@ -150,11 +278,55 @@ load_multitask_regional_data <- function(region, # a string of chr:start-end for if (length(match_LD_sumstat) != length(LD_meta_file_path_list)) { stop("Please make sure 'match_LD_sumstat' matched 'LD_meta_file_path_list' if you load data from multiple sumstats.") } + + # Check for missing sumstat files + missing_sumstat <- !vapply(sumstat_path_list, file.exists, logical(1)) + if (any(missing_sumstat)) { + missing_sumstat_files <- sumstat_path_list[missing_sumstat] + missing_sumstat_conditions <- if (!is.null(conditions_list_sumstat)) { + conditions_list_sumstat[missing_sumstat] + } else { + paste0("sumstat_", which(missing_sumstat)) + } + stop("Summary statistics file(s) do not exist:\n", + paste(paste0(" - ", missing_sumstat_conditions, ": ", missing_sumstat_files), collapse = "\n"), + call. = FALSE) + } + + # Check for missing column mapping files + missing_column <- !vapply(column_file_path_list, file.exists, logical(1)) + if (any(missing_column)) { + missing_column_files <- column_file_path_list[missing_column] + missing_column_conditions <- if (!is.null(conditions_list_sumstat)) { + conditions_list_sumstat[missing_column] + } else { + paste0("sumstat_", which(missing_column)) + } + stop("Column mapping file(s) do not exist:\n", + paste(paste0(" - ", missing_column_conditions, ": ", missing_column_files), collapse = "\n"), + call. = FALSE) + } + + # Check for missing LD metadata files + missing_ld_meta <- !vapply(LD_meta_file_path_list, file.exists, logical(1)) + if (any(missing_ld_meta)) { + missing_ld_meta_files <- LD_meta_file_path_list[missing_ld_meta] + stop("LD metadata file(s) do not exist:\n", + paste(paste0(" - LD_meta_", which(missing_ld_meta), ": ", missing_ld_meta_files), collapse = "\n"), + call. = FALSE) + } + # - load sumstat data from multiple datasets n_LD <- length(match_LD_sumstat) for (i_ld in 1:n_LD) { # extract LD meta file path name LD_meta_file_path <- LD_meta_file_path_list[i_ld] + + # Error if LD metadata file doesn't exist (already checked above, but double-check) + if (!file.exists(LD_meta_file_path)) { + stop("LD metadata file does not exist for dataset ", i_ld, ": ", LD_meta_file_path, call. = FALSE) + } + LD_info <- load_LD_matrix(LD_meta_file_path, region = association_window, extract_coordinates = extract_coordinates @@ -165,6 +337,14 @@ load_multitask_regional_data <- function(region, # a string of chr:start-end for sumstats <- lapply(pos, function(ii) { sumstat_path <- sumstat_path_list[ii] column_file_path <- column_file_path_list[ii] + + # Error if files don't exist (already checked above, but double-check) + if (!file.exists(sumstat_path)) { + stop("Summary statistics file does not exist for ", conditions_list_sumstat[ii], ": ", sumstat_path, call. = FALSE) + } + if (!file.exists(column_file_path)) { + stop("Column mapping file does not exist for ", conditions_list_sumstat[ii], ": ", column_file_path, call. = FALSE) + } # FIXME later: when consider multiple LD reference tmp <- load_rss_data( sumstat_path = sumstat_path, column_file_path = column_file_path, @@ -179,6 +359,8 @@ load_multitask_regional_data <- function(region, # a string of chr:start-end for } return(tmp) }) + + # All files should exist at this point (errors would have been thrown) names(sumstats) <- conditions sumstat_data$sumstats <- c(sumstat_data$sumstats, list(sumstats)) sumstat_data$LD_info <- c(sumstat_data$LD_info, list(LD_info)) diff --git a/R/file_utils.R b/R/file_utils.R index 6e2444a6..6e022df5 100644 --- a/R/file_utils.R +++ b/R/file_utils.R @@ -209,7 +209,7 @@ NoPhenotypeError <- function(message) { structure(list(message = message), class = c("NoPhenotypeError", "error", "condition")) } -#' @importFrom purrr map2 compact +#' @importFrom purrr map2 #' @importFrom readr read_delim cols #' @importFrom dplyr filter select mutate across everything #' @importFrom magrittr %>% @@ -224,8 +224,7 @@ load_phenotype_data <- function(phenotype_path, region, extract_region_name = NU } # Use `map2` to iterate over `phenotype_path` and `extract_region_name` simultaneously - # `compact` should remove all NULL element - phenotype_data <- compact(map2(phenotype_path, extract_region_name, ~ { + phenotype_data <- map2(phenotype_path, extract_region_name, ~ { tabix_data <- if (!is.null(region)) tabix_region(.x, region, tabix_header = tabix_header) else read_delim(.x, "\t", col_types = cols()) if (nrow(tabix_data) == 0) { message(paste("Phenotype file ", .x, " is empty for the specified region", if (!is.null(region)) "" else region)) @@ -258,10 +257,10 @@ load_phenotype_data <- function(phenotype_path, region, extract_region_name = NU } return(result) } - })) + }) - # Check if all phenotype files are empty - if (length(phenotype_data) == 0) { + # Check if all phenotype files are empty (all elements NULL) + if (all(vapply(phenotype_data, is.null, logical(1)))) { stop(NoPhenotypeError(paste("All phenotype files are empty for the specified region", if (!is.null(region)) "" else region))) } return(phenotype_data) @@ -469,6 +468,25 @@ load_regional_association_data <- function(genotype, # PLINK file ## Load phenotype and covariates and perform some pre-processing covar <- load_covariate_data(covariate) pheno <- load_phenotype_data(phenotype, region, extract_region_name = extract_region_name, region_name_col = region_name_col, tabix_header = tabix_header) + + # Keep covariates / conditions aligned with successfully loaded phenotypes. + # load_phenotype_data() returns NULL for phenotype files that are empty in + # the requested region; we must drop the corresponding covariate / condition + # entries before constructing data_list. + keep_idx <- !vapply(pheno, is.null, logical(1)) + if (!any(keep_idx)) { + stop(NoPhenotypeError(paste("All phenotype files are empty for the specified region", if (!is.null(region)) "" else region))) + } + if (any(!keep_idx)) { + dropped_conditions <- conditions[!keep_idx] + message(paste( + "Dropping phenotypes with no data in region", region, ":", + paste(dropped_conditions, collapse = ", ") + )) + } + pheno <- pheno[keep_idx] + covar <- covar[keep_idx] + conditions <- conditions[keep_idx] ### including Y ( cov ) and specific X and covar match, filter X variants based on the overlapped samples. data_list <- prepare_data_list(geno, pheno, covar, imiss_cutoff, maf_cutoff, mac_cutoff, xvar_cutoff,