diff --git a/DESCRIPTION b/DESCRIPTION index 3ece6a87..87e3c40b 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -62,4 +62,4 @@ LinkingTo: RcppGSL NeedsCompilation: yes VignetteBuilder: knitr -RoxygenNote: 7.3.2 +RoxygenNote: 7.3.3 diff --git a/NAMESPACE b/NAMESPACE index ea3fe90e..a8ed34c0 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -141,6 +141,8 @@ importFrom(data.table,setnames) importFrom(doFuture,registerDoFuture) importFrom(dplyr,"%>%") importFrom(dplyr,across) +importFrom(dplyr,all_of) +importFrom(dplyr,any_of) importFrom(dplyr,arrange) importFrom(dplyr,as_tibble) importFrom(dplyr,bind_rows) diff --git a/R/LD.R b/R/LD.R index 426790ff..fb9d00b9 100644 --- a/R/LD.R +++ b/R/LD.R @@ -355,16 +355,33 @@ load_LD_matrix <- function(LD_meta_file_path, region, extract_coordinates = NULL LD_matrices = extracted_LD_matrices_list, variants = extracted_LD_variants_list ) + # Normalize LD variant IDs at load time: add 'chr' prefix if missing + if (!is.null(colnames(combined_LD_matrix))) { + cn <- as.character(colnames(combined_LD_matrix)) + colnames(combined_LD_matrix) <- ifelse(startsWith(cn, "chr"), cn, paste0("chr", cn)) + } + if (!is.null(rownames(combined_LD_matrix))) { + rn <- as.character(rownames(combined_LD_matrix)) + rownames(combined_LD_matrix) <- ifelse(startsWith(rn, "chr"), rn, paste0("chr", rn)) + } combined_LD_variants <- rownames(combined_LD_matrix) # Now create block_metadata with all the information we've accumulated block_variants <- lapply(extracted_LD_variants_list, function(v) v$variants) + # Normalize block_variants to match combined_LD_variants format (add 'chr' if needed) + block_variants_normalized <- lapply(block_variants, function(v) { + if (!any(startsWith(v, "chr"))) { + paste0("chr", v) + } else { + v + } + }) block_metadata <- data.frame( block_id = seq_along(LD_file_paths), chrom = block_chroms, size = sapply(block_variants, length), - start_idx = sapply(block_variants, function(v) min(match(v, combined_LD_variants))), - end_idx = sapply(block_variants, function(v) max(match(v, combined_LD_variants))), + start_idx = sapply(block_variants_normalized, function(v) min(match(v, combined_LD_variants))), + end_idx = sapply(block_variants_normalized, function(v) max(match(v, combined_LD_variants))), stringsAsFactors = FALSE ) @@ -375,6 +392,8 @@ load_LD_matrix <- function(LD_meta_file_path, region, extract_coordinates = NULL ref_panel <- do.call(rbind, lapply(strsplit(rownames(combined_LD_matrix), ":"), function(x) { data.frame(chrom = x[1], pos = as.integer(x[2]), A2 = x[3], A1 = x[4]) })) + # Normalize ref_panel chrom to drop 'chr' prefix for merging with sumstats (which often use numeric chrom) + ref_panel$chrom <- sub("^chr", "", as.character(ref_panel$chrom)) merged_variant_list <- do.call(rbind, extracted_LD_variants_list) ref_panel$variant_id <- rownames(combined_LD_matrix) diff --git a/R/allele_qc.R b/R/allele_qc.R index 7b32d0a8..fba18628 100644 --- a/R/allele_qc.R +++ b/R/allele_qc.R @@ -18,184 +18,205 @@ #' @param remove_unmatched Whether to remove unmatched variants. Default is `TRUE`. #' @return A single data frame with matched variants. #' @importFrom magrittr %>% -#' @importFrom dplyr mutate inner_join filter pull select everything row_number if_else +#' @importFrom dplyr mutate inner_join filter pull select everything row_number if_else any_of all_of rename #' @importFrom vctrs vec_duplicate_detect -#' @importFrom dplyr if_else #' @importFrom tidyr separate #' @export allele_qc <- function(target_data, ref_variants, col_to_flip = NULL, - match_min_prop = 0.2, remove_dups = TRUE, - remove_indels = FALSE, remove_strand_ambiguous = TRUE, - flip_strand = FALSE, remove_unmatched = TRUE, remove_same_vars = FALSE) { - strand_flip <- function(ref) { - # Define a mapping for complementary bases - base_mapping <- c("A" = "T", "T" = "A", "G" = "C", "C" = "G") - - # Function to complement a single base - complement_base <- function(base) { - complement <- base_mapping[base] - return(complement) - } - - # Function to reverse and complement a DNA sequence - reverse_complement <- function(sequence) { - reversed <- rev(strsplit(sequence, NULL)[[1]]) - complemented <- sapply(reversed, complement_base, USE.NAMES = FALSE) - return(paste(complemented, collapse = "")) - } - - complemented_sequence <- sapply(ref, reverse_complement, USE.NAMES = FALSE) - - return(complemented_sequence) + match_min_prop = 0.2, remove_dups = TRUE, + remove_indels = FALSE, remove_strand_ambiguous = TRUE, + flip_strand = FALSE, remove_unmatched = TRUE, remove_same_vars = FALSE) { + strand_flip <- function(ref) { + # Define a mapping for complementary bases + base_mapping <- c("A" = "T", "T" = "A", "G" = "C", "C" = "G") + + # Function to complement a single base + complement_base <- function(base) { + complement <- base_mapping[base] + return(complement) + } + + # Function to reverse and complement a DNA sequence + reverse_complement <- function(sequence) { + reversed <- rev(strsplit(sequence, NULL)[[1]]) + complemented <- sapply(reversed, complement_base, USE.NAMES = FALSE) + return(paste(complemented, collapse = "")) + } + + complemented_sequence <- sapply(ref, reverse_complement, USE.NAMES = FALSE) + + return(complemented_sequence) + } + + # helper to sanitize column names to avoid NA/empty names that break dplyr verbs + sanitize_names <- function(df) { + nm <- colnames(df) + if (is.null(nm)) { + nm <- rep("unnamed", ncol(df)) + } + empty_idx <- is.na(nm) | nm == "" + if (any(empty_idx)) { + # assign stable placeholder names for empties + nm[empty_idx] <- paste0("unnamed_", seq_len(sum(empty_idx))) + } + # ensure names are unique and syntactic + nm <- base::make.unique(nm, sep = "_") + colnames(df) <- nm + df } # check if the pattern is ATCG/DI check_ATCG <- function(vec) { - pattern <- "^[ATCGDI]+$" + pattern <- "^[ATCGDI]+$" - # Function to check if a single element matches the pattern - check_element <- function(element) { - grepl(pattern, element) - } + # Function to check if a single element matches the pattern + check_element <- function(element) { + grepl(pattern, element) + } - result <- sapply(vec, check_element, USE.NAMES = FALSE) - return(result) + result <- sapply(vec, check_element, USE.NAMES = FALSE) + return(result) } # transform all inputs to dataframe if (is.data.frame(target_data)) { - if (ncol(target_data) > 4 && all(c("chrom", "pos", "A2", "A1") %in% names(target_data))) { - # Extract variant columns and standardize - variant_cols <- c("chrom", "pos", "A2", "A1") - variant_df <- target_data %>% select(all_of(variant_cols)) - other_cols <- target_data %>% select(-all_of(variant_cols)) - target_data <- cbind(variant_id_to_df(variant_df), other_cols) - } else { - target_data <- variant_id_to_df(target_data) - } + if (ncol(target_data) > 4 && all(c("chrom", "pos", "A2", "A1") %in% names(target_data))) { + # Extract variant columns and standardize + variant_cols <- c("chrom", "pos", "A2", "A1") + variant_df <- target_data %>% select(all_of(variant_cols)) + other_cols <- target_data %>% select(-all_of(variant_cols)) + target_data <- cbind(variant_id_to_df(variant_df), other_cols) + } else { + target_data <- variant_id_to_df(target_data) + } } else { - target_data <- variant_id_to_df(target_data) + target_data <- variant_id_to_df(target_data) } ref_variants <- variant_id_to_df(ref_variants) + columns_to_remove <- c("chromosome", "position", "ref", "alt", "variant_id") # Check if any of the specified columns are present if (any(columns_to_remove %in% colnames(target_data))) { - target_data <- select(target_data, -any_of(columns_to_remove)) + target_data <- select(target_data, -any_of(columns_to_remove)) } match_result <- merge(target_data, ref_variants, by = c("chrom", "pos"), all = FALSE, suffixes = c(".target", ".ref")) %>% as.data.frame() + + # sanitize names after merge as well (merge can introduce empty names in edge cases) + match_result <- sanitize_names(match_result) + if (nrow(match_result) == 0) { - warning("No matching variants found between target data and reference variants.") - return(list(target_data_qced = match_result, qc_summary = match_result)) + warning("No matching variants found between target data and reference variants.") + return(list(target_data_qced = match_result, qc_summary = match_result)) } # match target & ref by chrom and position match_result = match_result %>% - mutate(variants_id_original = paste(chrom, pos, A2.target, A1.target, sep = ":")) %>% - mutate(variants_id_qced = paste(chrom, pos, A2.ref, A1.ref, sep = ":")) %>% - # filter out totally same rows. - filter(duplicated(.) | !duplicated(.)) %>% - # upper case target/reference A1 A2 - mutate(across(c(A1.target, A2.target, A1.ref, A2.ref), toupper)) %>% - mutate(flip1.ref = strand_flip(A1.ref), flip2.ref = strand_flip(A2.ref)) %>% - # these pairings are ambiguous: because we cannot tell it's an sign flip / strand flip - mutate(strand_unambiguous = if_else((A1.target == "A" & A2.target == "T") | (A1.target == "T" & A2.target == "A") | - (A1.target == "C" & A2.target == "G") | (A1.target == "G" & A2.target == "C"), FALSE, TRUE)) %>% - # filter out non-ATCG coded alleles - mutate(non_ATCG = !(check_ATCG(A1.target) & check_ATCG(A2.target))) %>% - # exact match should be kept all the time - mutate(exact_match = A1.target == A1.ref & A2.target == A2.ref) %>% - mutate(sign_flip = ((A1.target == A2.ref & A2.target == A1.ref) | (A1.target == flip2.ref & A2.target == flip1.ref)) & (A1.target != A1.ref & A2.target != A2.ref)) %>% - mutate(strand_flip = ((A1.target == flip1.ref & A2.target == flip2.ref) | (A1.target == flip2.ref & A2.target == flip1.ref)) & (A1.target != A1.ref & A2.target != A2.ref)) %>% - mutate(INDEL = (A2.target == "I" | A2.target == "D" | nchar(A2.target) > 1 | nchar(A1.target) > 1)) %>% - mutate(ID_match = ((A2.target == "D" | A2.target == "I") & (nchar(A1.ref) > 1 | nchar(A2.ref) > 1))) + mutate(variants_id_original = paste(chrom, pos, A2.target, A1.target, sep = ":")) %>% + mutate(variants_id_qced = paste(chrom, pos, A2.ref, A1.ref, sep = ":")) %>% + # filter out totally same rows. + filter(duplicated(.) | !duplicated(.)) %>% + # upper case target/reference A1 A2 + mutate(across(c(A1.target, A2.target, A1.ref, A2.ref), toupper)) %>% + mutate(flip1.ref = strand_flip(A1.ref), flip2.ref = strand_flip(A2.ref)) %>% + # these pairings are ambiguous: because we cannot tell it's an sign flip / strand flip + mutate(strand_unambiguous = if_else((A1.target == "A" & A2.target == "T") | (A1.target == "T" & A2.target == "A") | + (A1.target == "C" & A2.target == "G") | (A1.target == "G" & A2.target == "C"), FALSE, TRUE)) %>% + # filter out non-ATCG coded alleles + mutate(non_ATCG = !(check_ATCG(A1.target) & check_ATCG(A2.target))) %>% + # exact match should be kept all the time + mutate(exact_match = A1.target == A1.ref & A2.target == A2.ref) %>% + mutate(sign_flip = ((A1.target == A2.ref & A2.target == A1.ref) | (A1.target == flip2.ref & A2.target == flip1.ref)) & (A1.target != A1.ref & A2.target != A2.ref)) %>% + mutate(strand_flip = ((A1.target == flip1.ref & A2.target == flip2.ref) | (A1.target == flip2.ref & A2.target == flip1.ref)) & (A1.target != A1.ref & A2.target != A2.ref)) %>% + mutate(INDEL = (A2.target == "I" | A2.target == "D" | nchar(A2.target) > 1 | nchar(A1.target) > 1)) %>% + mutate(ID_match = ((A2.target == "D" | A2.target == "I") & (nchar(A1.ref) > 1 | nchar(A2.ref) > 1))) # if not remove, then this should'nt be a condition to filter out any variants if (!remove_strand_ambiguous) { - match_result <- match_result %>% mutate(strand_unambiguous = TRUE) + match_result <- match_result %>% mutate(strand_unambiguous = TRUE) } # if all strand flip is un-ambigous, then we know ambigous cases are indeed a strand flip # not a sign flip, then we infer there is no ambigous in the whole dataset, and keep those ambigous ones if (nrow(match_result %>% filter(strand_flip == TRUE) %>% filter(strand_unambiguous == TRUE)) == 0) { - match_result <- match_result %>% mutate(strand_unambiguous = TRUE) + match_result <- match_result %>% mutate(strand_unambiguous = TRUE) } # To keep variants: if it's a strand flip, we will keep those unambigous (because if ambigous, cannot know it's trand / sign flip, so discard all) # or exact match or indel match (ID_match) # If not a strand flip, then we will keep those that are exact match / those are sign flip / INDEL matched match_result <- match_result %>% mutate(keep = if_else(strand_flip, true = (strand_unambiguous | exact_match | ID_match), false = - (exact_match | sign_flip | ID_match) + (exact_match | sign_flip | ID_match) )) if (remove_indels) { - match_result <- match_result %>% mutate(keep = if_else(INDEL == FALSE, FALSE, TRUE)) + match_result <- match_result %>% mutate(keep = if_else(INDEL == FALSE, FALSE, TRUE)) } # flip the signs of the column col_to_flip if there is a sign flip if (!is.null(col_to_flip)) { - if (!is.null(match_result[, col_to_flip])) { - match_result[match_result$sign_flip, col_to_flip] <- -1 * match_result[match_result$sign_flip, col_to_flip] - } else { - stop("Column '", col_to_flip, "' not found in target_data.") - } + if (!is.null(match_result[, col_to_flip])) { + match_result[match_result$sign_flip, col_to_flip] <- -1 * match_result[match_result$sign_flip, col_to_flip] + } else { + stop("Column '", col_to_flip, "' not found in target_data.") + } } # flip the strands if there is a strand flip if (flip_strand) { - strand_flipped_indices <- which(match_result$strand_flip) - match_result[strand_flipped_indices, "A1.target"] <- strand_flip(match_result[strand_flipped_indices, "A1.target"]) - match_result[strand_flipped_indices, "A2.target"] <- strand_flip(match_result[strand_flipped_indices, "A2.target"]) + strand_flipped_indices <- which(match_result$strand_flip) + match_result[strand_flipped_indices, "A1.target"] <- strand_flip(match_result[strand_flipped_indices, "A1.target"]) + match_result[strand_flipped_indices, "A2.target"] <- strand_flip(match_result[strand_flipped_indices, "A2.target"]) } # Remove all unnecessary columns used to determine qc status # Finally keep those variants with FLAG keep = TRUE result <- match_result[match_result$keep, , drop = FALSE] - + # FIXME: I think this parameter is confusing. I inheritated directly from our function, whose default setting is TRUE. # It is removing all multi-allelic alleles which is unnecessary. I suggest remove this parameter directly. # What we are trying to avoid is the SAME allele having diferent z score. I defined one parameter remove_same_vars later, but I can re-use this # remove_dup name if (remove_dups) { - dups <- vec_duplicate_detect(result[, c("chrom", "pos", "variants_id_qced")]) - if (any(dups)) { - result <- result[!dups, , drop = FALSE] - warning("Unexpected duplicates were removed.") - } + dups <- vec_duplicate_detect(result[, c("chrom", "pos", "variants_id_qced")]) + if (any(dups)) { + result <- result[!dups, , drop = FALSE] + warning("Unexpected duplicates were removed.") + } } - + result <- result %>% - select(-(flip1.ref:keep)) %>% - select(-A1.target, -A2.target) %>% - rename(A1 = A1.ref, A2 = A2.ref, variant_id = variants_id_qced) + select(-(flip1.ref:keep)) %>% + select(-A1.target, -A2.target) %>% + rename(A1 = A1.ref, A2 = A2.ref, variant_id = variants_id_qced) # default FALSE, but if want to remove same variants having different z score, then set as TRUE if (remove_same_vars) { - same_vars <- vec_duplicate_detect(result[, c("chrom", "pos", "variant_id")]) - if (any(same_vars)) { - result <- result[!same_vars, , drop = FALSE] - message("Same variants with different z scores are removed.") - } + same_vars <- vec_duplicate_detect(result[, c("chrom", "pos", "variant_id")]) + if (any(same_vars)) { + result <- result[!same_vars, , drop = FALSE] + message("Same variants with different z scores are removed.") + } } if (!remove_unmatched) { - match_variant <- result %>% pull(variants_id_original) - match_result <- select(match_result, -(flip1.ref:keep)) %>% - select(-variants_id_original, -A1.target, -A2.target) %>% - rename(A1 = A1.ref, A2 = A2.ref, variant_id = variants_id_qced) - target_data <- target_data %>% mutate(variant_id = paste(chrom, pos, A2, A1, sep = ":")) - if (length(setdiff(target_data %>% pull(variant_id), match_variant)) > 0) { - unmatch_data <- target_data %>% filter(!variant_id %in% match_variant) - result <- rbind(result, unmatch_data %>% mutate(variants_id_original = variant_id)) - result <- result[match(target_data$variant_id, result$variants_id_original), ] %>% select(-variants_id_original) - } + match_variant <- result %>% pull(variants_id_original) + match_result <- select(match_result, -(flip1.ref:keep)) %>% + select(-variants_id_original, -A1.target, -A2.target) %>% + rename(A1 = A1.ref, A2 = A2.ref, variant_id = variants_id_qced) + target_data <- target_data %>% mutate(variant_id = paste(chrom, pos, A2, A1, sep = ":")) + if (length(setdiff(target_data %>% pull(variant_id), match_variant)) > 0) { + unmatch_data <- target_data %>% filter(!variant_id %in% match_variant) + result <- rbind(result, unmatch_data %>% mutate(variants_id_original = variant_id)) + result <- result[match(target_data$variant_id, result$variants_id_original), ] %>% select(-variants_id_original) + } } min_match <- match_min_prop * nrow(ref_variants) if (nrow(result) < min_match) { - stop("Not enough variants have been matched.") + stop("Not enough variants have been matched.") } # throw an error if there are any variant_id that are duplicated (meaning that same variant having different other infos for example z score) if (!remove_same_vars & any(duplicated(result$variant_id))) { - stop("In the input, there are duplicated variants with different z scores. Please check the data and determine which to keep.") + stop("In the input, there are duplicated variants with different z scores. Please check the data and determine which to keep.") } return(list(target_data_qced = result, qc_summary = match_result)) @@ -209,6 +230,7 @@ allele_qc <- function(target_data, ref_variants, col_to_flip = NULL, #' #' @param source A character vector of variant names in the format "chr:pos:A2:A1" or "chr:pos_A2_A1". #' @param reference A character vector of variant names in the format "chr:pos:A2:A1" or "chr:pos_A2_A1". +#' @param remove_build_suffix Whether to strip trailing genome build suffixes like ":b38" or "_b38" before alignment. Default TRUE. #' #' @return A list with two elements: #' - aligned_variants: A character vector of aligned variant names. @@ -220,7 +242,12 @@ allele_qc <- function(target_data, ref_variants, col_to_flip = NULL, #' align_variant_names(source, reference) #' #' @export -align_variant_names <- function(source, reference, remove_indels = FALSE) { +align_variant_names <- function(source, reference, remove_indels = FALSE, remove_build_suffix = TRUE) { + # Optionally strip build suffix like :b38 or _b38 from both sides for robust alignment + if (remove_build_suffix) { + source <- gsub("(:|_)b[0-9]+$", "", source) + reference <- gsub("(:|_)b[0-9]+$", "", reference) + } # Check if source and reference follow the expected pattern source_pattern <- grepl("^(chr)?[0-9]+:[0-9]+:[ATCG*]+:[ATCG*]+$|^(chr)?[0-9]+:[0-9]+_[ATCG*]+_[ATCG*]+$", source) reference_pattern <- grepl("^(chr)?[0-9]+:[0-9]+:[ATCG*]+:[ATCG*]+$|^(chr)?[0-9]+:[0-9]+_[ATCG*]+_[ATCG*]+$", reference) @@ -288,4 +315,4 @@ align_variant_names <- function(source, reference, remove_indels = FALSE) { aligned_variants = aligned_variants, unmatched_indices = unmatched_indices ) -} +} \ No newline at end of file diff --git a/R/colocboost_pipeline.R b/R/colocboost_pipeline.R index 2b22d712..d11feedd 100644 --- a/R/colocboost_pipeline.R +++ b/R/colocboost_pipeline.R @@ -436,13 +436,32 @@ colocboost_analysis_pipeline <- function(region_data, if (!is.null(sumstat_data$sumstats)) { sumstats <- lapply(sumstat_data$sumstats, function(ss) { z <- ss$sumstats$z - variant <- paste0("chr", ss$sumstats$variant_id) + # Prefix 'chr' only if missing to avoid 'chrchr' + vi <- ss$sumstats$variant_id + has_chr <- startsWith(as.character(vi), "chr") + variant <- ifelse(has_chr, as.character(vi), paste0("chr", as.character(vi))) n <- ss$n - data.frame("z" = z, "n" = n, "variant" = variant) + + # Filter out NA values from z-scores and corresponding variants + na_mask <- !is.na(z) + if (sum(na_mask) == 0) { + message("Warning: All z-scores are NA for this summary statistic dataset") + return(data.frame("z" = numeric(0), "n" = numeric(0), "variant" = character(0))) + } + + data.frame("z" = z[na_mask], "n" = n, "variant" = variant[na_mask]) }) names(sumstats) <- names(sumstat_data$sumstats) LD_mat <- lapply(sumstat_data$LD_mat, function(ld) { - colnames(ld) <- rownames(ld) <- paste0("chr", colnames(ld)) + # Ensure LD dimnames have 'chr' only once + cn <- colnames(ld) + rn <- rownames(ld) + if (!is.null(cn)) { + colnames(ld) <- ifelse(startsWith(as.character(cn), "chr"), as.character(cn), paste0("chr", as.character(cn))) + } + if (!is.null(rn)) { + rownames(ld) <- ifelse(startsWith(as.character(rn), "chr"), as.character(rn), paste0("chr", as.character(rn))) + } return(ld) }) LD_match <- sumstat_data$LD_match @@ -451,6 +470,35 @@ colocboost_analysis_pipeline <- function(region_data, sumstats <- LD_mat <- dict_sumstatLD <- NULL } + # Helper: validate a single GWAS study data.frame and its LD matrix + is_valid_sumstat_entry <- function(df, ld_matrix) { + if (is.null(df) || nrow(df) == 0) return(FALSE) + if (!all(c("z", "n", "variant") %in% colnames(df))) return(FALSE) + n_vals <- suppressWarnings(as.numeric(unique(df$n))) + if (length(n_vals) == 0 || is.na(n_vals[1]) || !is.finite(n_vals[1]) || n_vals[1] < 3) return(FALSE) + if (all(is.na(df$z))) return(FALSE) + if (is.null(ld_matrix)) return(FALSE) + if (!is.matrix(ld_matrix)) return(FALSE) + if (any(!is.finite(ld_matrix))) return(FALSE) + # Ensure at least 1 overlapping variant with LD matrix dimnames + ld_vars <- intersect(rownames(ld_matrix), colnames(ld_matrix)) + if (length(ld_vars) == 0) return(FALSE) + if (length(intersect(as.character(df$variant), as.character(ld_vars))) == 0) return(FALSE) + TRUE + } + + # Helper: filter joint structures (sumstats list and dict_sumstatLD) to only valid entries + filter_valid_sumstats <- function(sumstats, LD_mat, dict_sumstatLD) { + if (is.null(sumstats) || is.null(dict_sumstatLD) || is.null(LD_mat)) return(list(sumstats = NULL, dict = NULL)) + keep <- vector("logical", length(sumstats)) + for (i in seq_along(sumstats)) { + ld_idx <- dict_sumstatLD[i, 2] + ld_matrix <- if (!is.na(ld_idx)) LD_mat[[ld_idx]] else NULL + keep[i] <- is_valid_sumstat_entry(sumstats[[i]], ld_matrix) + } + if (!any(keep)) return(list(sumstats = NULL, dict = NULL)) + list(sumstats = sumstats[keep], dict = matrix(dict_sumstatLD[keep, , drop = FALSE], ncol = 2)) + } ####### ========= streamline three types of analyses ======== ######## if (is.null(X) & is.null(sumstats)) { @@ -481,16 +529,22 @@ colocboost_analysis_pipeline <- function(region_data, if (joint_gwas & !is.null(sumstats)) { message(paste("====== Performing non-focaled version GWAS-xQTL ColocBoost on", length(Y), "contexts and", length(sumstats), "GWAS. =====")) t21 <- Sys.time() - traits <- c(names(Y), names(sumstats)) - res_gwas <- colocboost( - X = X, Y = Y, sumstat = sumstats, LD = LD_mat, - dict_YX = dict_YX, dict_sumstatLD = dict_sumstatLD, - outcome_names = traits, focal_outcome_idx = NULL, - output_level = 2, ... - ) - t22 <- Sys.time() - analysis_results$joint_gwas <- res_gwas - analysis_results$computing_time$Analysis$joint_gwas <- t22 - t21 + # Filter invalid GWAS before calling colocboost + filtered <- filter_valid_sumstats(sumstats, LD_mat, dict_sumstatLD) + if (is.null(filtered$sumstats)) { + message("All GWAS studies failed validation; skipping joint GWAS analysis for this region.") + } else { + traits <- c(names(Y), names(filtered$sumstats)) + res_gwas <- colocboost( + X = X, Y = Y, sumstat = filtered$sumstats, LD = LD_mat, + dict_YX = dict_YX, dict_sumstatLD = filtered$dict, + outcome_names = traits, focal_outcome_idx = NULL, + output_level = 2, ... + ) + t22 <- Sys.time() + analysis_results$joint_gwas <- res_gwas + analysis_results$computing_time$Analysis$joint_gwas <- t22 - t21 + } } # - run focaled version of ColocBoost for each GWAS if (separate_gwas & !is.null(sumstats)) { @@ -498,15 +552,32 @@ colocboost_analysis_pipeline <- function(region_data, res_gwas_separate <- analysis_results$separate_gwas for (i_gwas in 1:nrow(dict_sumstatLD)) { current_study <- names(sumstats)[i_gwas] - message(paste("====== Performing focaled version GWAS-xQTL ColocBoost on", length(Y), "contexts and ", current_study, "GWAS. =====")) dict <- dict_sumstatLD[i_gwas, ] + # Validate this study before calling colocboost + ld_idx <- dict[2] + ld_matrix <- if (!is.na(ld_idx)) LD_mat[[ld_idx]] else NULL + valid <- is_valid_sumstat_entry(sumstats[[dict[1]]], ld_matrix) + if (!isTRUE(valid)) { + message(paste("Skipping", current_study, "due to invalid N/z/LD.")) + next + } + message(paste("====== Performing focaled version GWAS-xQTL ColocBoost on", length(Y), "contexts and ", current_study, "GWAS. =====")) traits <- c(names(Y), current_study) - res_gwas_separate[[current_study]] <- colocboost( - X = X, Y = Y, sumstat = sumstats[dict[1]], - LD = LD_mat[dict[2]], dict_YX = dict_YX, - outcome_names = traits, focal_outcome_idx = length(traits), - output_level = 2, ... - ) + + # Try to run colocboost with error handling + tryCatch({ + res_gwas_separate[[current_study]] <- colocboost( + X = X, Y = Y, sumstat = sumstats[dict[1]], + LD = LD_mat[dict[2]], dict_YX = dict_YX, + outcome_names = traits, focal_outcome_idx = length(traits), + output_level = 2, ... + ) + }, error = function(e) { + message(paste("Error in colocboost analysis for", current_study, ":")) + message(paste("Error message:", e$message)) + message(paste("Continuing with next GWAS...")) + res_gwas_separate[[current_study]] <<- NULL + }) } t32 <- Sys.time() analysis_results$separate_gwas <- res_gwas_separate @@ -687,7 +758,11 @@ qc_regional_data <- function(region_data, n <- sumstat$n var_y <- sumstat$var_y conditions_sumstat <- names(sumstats)[ii] - pip_cutoff_to_skip_ld <- pip_cutoff_to_skip_sumstat[conditions_sumstat] %>% as.numeric() + pip_cutoff_to_skip_ld <- if (conditions_sumstat %in% names(pip_cutoff_to_skip_sumstat)) { + as.numeric(pip_cutoff_to_skip_sumstat[conditions_sumstat]) + } else { + 0 + } # Preprocess the input data preprocess_results <- rss_basic_qc(sumstat$sumstats, LD_data, remove_indels = remove_indels) diff --git a/R/raiss.R b/R/raiss.R index 278292fa..5b27e572 100644 --- a/R/raiss.R +++ b/R/raiss.R @@ -330,10 +330,10 @@ filter_raiss_output <- function(zscores, R2_threshold = 0.6, minimum_ld = 5, ver # Count statistics before filtering NSNPs_bf_filt <- nrow(zscores) - NSNPs_initial <- sum(zscores$raiss_R2 == 2.0) - NSNPs_imputed <- sum(zscores$raiss_R2 != 2.0) - NSNPs_ld_filt <- sum(zscores$raiss_ld_score < minimum_ld) - NSNPs_R2_filt <- sum(zscores$raiss_R2 < R2_threshold) + NSNPs_initial <- sum(zscores$raiss_R2 == 2.0, na.rm = TRUE) + NSNPs_imputed <- sum(zscores$raiss_R2 != 2.0, na.rm = TRUE) + NSNPs_ld_filt <- sum(zscores$raiss_ld_score < minimum_ld, na.rm = TRUE) + NSNPs_R2_filt <- sum(zscores$raiss_R2 < R2_threshold, na.rm = TRUE) # Apply filters zscores_nofilter <- zscores diff --git a/R/sumstats_qc.R b/R/sumstats_qc.R index c0424fe3..3923e168 100644 --- a/R/sumstats_qc.R +++ b/R/sumstats_qc.R @@ -54,6 +54,29 @@ rss_basic_qc <- function(sumstats, LD_data, skip_region = NULL, remove_indels = sumstats_processed <- allele_flip$target_data_qced %>% arrange(pos) + # Align and subset LD by mapping core IDs (strip trailing build suffix) to exact LD IDs + ld_mat <- LD_data$combined_LD_matrix + ld_ids <- tryCatch(rownames(ld_mat), error = function(e) NULL) + if (is.null(ld_ids)) { + stop("LD matrix rownames are NULL; cannot align variant IDs.") + } + present <- sumstats_processed$variant_id %in% ld_ids + if (sum(present) == 0) { + strip_build <- function(x) sub("(:|_)b[0-9]+$", "", x) + drop_chr <- function(x) sub("^chr", "", x) + ld_core <- drop_chr(strip_build(ld_ids)) + ss_core <- drop_chr(strip_build(sumstats_processed$variant_id)) + map_idx <- match(ss_core, ld_core) + remap <- !is.na(map_idx) + if (sum(remap) > 0) { + sumstats_processed$variant_id[remap] <- ld_ids[map_idx[remap]] + present <- sumstats_processed$variant_id %in% ld_ids + } + } + if (sum(present) == 0) { + stop("No overlapping variants between sumstats and LD after alignment.") + } + LD_mat_processed <- LD_data$combined_LD_matrix[sumstats_processed$variant_id, sumstats_processed$variant_id, drop = FALSE] return(list(sumstats = sumstats_processed, LD_mat = LD_mat_processed)) diff --git a/man/align_variant_names.Rd b/man/align_variant_names.Rd index bd112604..d89b7ca5 100644 --- a/man/align_variant_names.Rd +++ b/man/align_variant_names.Rd @@ -4,12 +4,19 @@ \alias{align_variant_names} \title{Align Variant Names} \usage{ -align_variant_names(source, reference, remove_indels = FALSE) +align_variant_names( + source, + reference, + remove_indels = FALSE, + remove_build_suffix = TRUE +) } \arguments{ \item{source}{A character vector of variant names in the format "chr:pos:A2:A1" or "chr:pos_A2_A1".} \item{reference}{A character vector of variant names in the format "chr:pos:A2:A1" or "chr:pos_A2_A1".} + +\item{remove_build_suffix}{Whether to strip trailing genome build suffixes like ":b38" or "_b38" before alignment. Default TRUE.} } \value{ A list with two elements: