StatFunGen · kal26 · Oct 2, 2025 · Oct 2, 2025 · Oct 2, 2025 · Oct 2, 2025
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -62,4 +62,4 @@ LinkingTo:
     RcppGSL
 NeedsCompilation: yes
 VignetteBuilder: knitr
-RoxygenNote: 7.3.2
+RoxygenNote: 7.3.3
diff --git a/NAMESPACE b/NAMESPACE
@@ -141,6 +141,8 @@ importFrom(data.table,setnames)
 importFrom(doFuture,registerDoFuture)
 importFrom(dplyr,"%>%")
 importFrom(dplyr,across)
+importFrom(dplyr,all_of)
+importFrom(dplyr,any_of)
 importFrom(dplyr,arrange)
 importFrom(dplyr,as_tibble)
 importFrom(dplyr,bind_rows)

diff --git a/R/LD.R b/R/LD.R
@@ -355,16 +355,33 @@ load_LD_matrix <- function(LD_meta_file_path, region, extract_coordinates = NULL
     LD_matrices = extracted_LD_matrices_list,
     variants = extracted_LD_variants_list
   )
+  # Normalize LD variant IDs at load time: add 'chr' prefix if missing
+  if (!is.null(colnames(combined_LD_matrix))) {
+    cn <- as.character(colnames(combined_LD_matrix))
+    colnames(combined_LD_matrix) <- ifelse(startsWith(cn, "chr"), cn, paste0("chr", cn))
+  }
+  if (!is.null(rownames(combined_LD_matrix))) {
+    rn <- as.character(rownames(combined_LD_matrix))
+    rownames(combined_LD_matrix) <- ifelse(startsWith(rn, "chr"), rn, paste0("chr", rn))
+  }
   combined_LD_variants <- rownames(combined_LD_matrix)
 
   # Now create block_metadata with all the information we've accumulated
   block_variants <- lapply(extracted_LD_variants_list, function(v) v$variants)
+  # Normalize block_variants to match combined_LD_variants format (add 'chr' if needed)
+  block_variants_normalized <- lapply(block_variants, function(v) {
+    if (!any(startsWith(v, "chr"))) {
+      paste0("chr", v)
+    } else {
+      v
+    }
+  })
   block_metadata <- data.frame(
     block_id = seq_along(LD_file_paths),
     chrom = block_chroms,
     size = sapply(block_variants, length),
-    start_idx = sapply(block_variants, function(v) min(match(v, combined_LD_variants))),
-    end_idx = sapply(block_variants, function(v) max(match(v, combined_LD_variants))),
+    start_idx = sapply(block_variants_normalized, function(v) min(match(v, combined_LD_variants))),
+    end_idx = sapply(block_variants_normalized, function(v) max(match(v, combined_LD_variants))),
     stringsAsFactors = FALSE
   )
 
@@ -375,6 +392,8 @@ load_LD_matrix <- function(LD_meta_file_path, region, extract_coordinates = NULL
   ref_panel <- do.call(rbind, lapply(strsplit(rownames(combined_LD_matrix), ":"), function(x) {
     data.frame(chrom = x[1], pos = as.integer(x[2]), A2 = x[3], A1 = x[4])
   }))
+  # Normalize ref_panel chrom to drop 'chr' prefix for merging with sumstats (which often use numeric chrom)
+  ref_panel$chrom <- sub("^chr", "", as.character(ref_panel$chrom))
 
   merged_variant_list <- do.call(rbind, extracted_LD_variants_list)
   ref_panel$variant_id <- rownames(combined_LD_matrix)

diff --git a/R/allele_qc.R b/R/allele_qc.R
diff --git a/R/colocboost_pipeline.R b/R/colocboost_pipeline.R
@@ -436,13 +436,32 @@ colocboost_analysis_pipeline <- function(region_data,
   if (!is.null(sumstat_data$sumstats)) {
     sumstats <- lapply(sumstat_data$sumstats, function(ss) {
       z <- ss$sumstats$z
-      variant <- paste0("chr", ss$sumstats$variant_id)
+      # Prefix 'chr' only if missing to avoid 'chrchr'
+      vi <- ss$sumstats$variant_id
+      has_chr <- startsWith(as.character(vi), "chr")
+      variant <- ifelse(has_chr, as.character(vi), paste0("chr", as.character(vi)))
       n <- ss$n
-      data.frame("z" = z, "n" = n, "variant" = variant)
+
+      # Filter out NA values from z-scores and corresponding variants
+      na_mask <- !is.na(z)
+      if (sum(na_mask) == 0) {
+        message("Warning: All z-scores are NA for this summary statistic dataset")
+        return(data.frame("z" = numeric(0), "n" = numeric(0), "variant" = character(0)))
+      }
+
+      data.frame("z" = z[na_mask], "n" = n, "variant" = variant[na_mask])
     })
     names(sumstats) <- names(sumstat_data$sumstats)
     LD_mat <- lapply(sumstat_data$LD_mat, function(ld) {
-      colnames(ld) <- rownames(ld) <- paste0("chr", colnames(ld))
+      # Ensure LD dimnames have 'chr' only once
+      cn <- colnames(ld)
+      rn <- rownames(ld)
+      if (!is.null(cn)) {
+        colnames(ld) <- ifelse(startsWith(as.character(cn), "chr"), as.character(cn), paste0("chr", as.character(cn)))
+      }
+      if (!is.null(rn)) {
+        rownames(ld) <- ifelse(startsWith(as.character(rn), "chr"), as.character(rn), paste0("chr", as.character(rn)))
+      }
       return(ld)
     })
     LD_match <- sumstat_data$LD_match
@@ -451,6 +470,35 @@ colocboost_analysis_pipeline <- function(region_data,
     sumstats <- LD_mat <- dict_sumstatLD <- NULL
   }
 
+  # Helper: validate a single GWAS study data.frame and its LD matrix
+  is_valid_sumstat_entry <- function(df, ld_matrix) {
+    if (is.null(df) || nrow(df) == 0) return(FALSE)
+    if (!all(c("z", "n", "variant") %in% colnames(df))) return(FALSE)
+    n_vals <- suppressWarnings(as.numeric(unique(df$n)))
+    if (length(n_vals) == 0 || is.na(n_vals[1]) || !is.finite(n_vals[1]) || n_vals[1] < 3) return(FALSE)
+    if (all(is.na(df$z))) return(FALSE)
+    if (is.null(ld_matrix)) return(FALSE)
+    if (!is.matrix(ld_matrix)) return(FALSE)
+    if (any(!is.finite(ld_matrix))) return(FALSE)
+    # Ensure at least 1 overlapping variant with LD matrix dimnames
+    ld_vars <- intersect(rownames(ld_matrix), colnames(ld_matrix))
+    if (length(ld_vars) == 0) return(FALSE)
+    if (length(intersect(as.character(df$variant), as.character(ld_vars))) == 0) return(FALSE)
+    TRUE
+  }
+
+  # Helper: filter joint structures (sumstats list and dict_sumstatLD) to only valid entries
+  filter_valid_sumstats <- function(sumstats, LD_mat, dict_sumstatLD) {
+    if (is.null(sumstats) || is.null(dict_sumstatLD) || is.null(LD_mat)) return(list(sumstats = NULL, dict = NULL))
+    keep <- vector("logical", length(sumstats))
+    for (i in seq_along(sumstats)) {
+      ld_idx <- dict_sumstatLD[i, 2]
+      ld_matrix <- if (!is.na(ld_idx)) LD_mat[[ld_idx]] else NULL
+      keep[i] <- is_valid_sumstat_entry(sumstats[[i]], ld_matrix)
+    }
+    if (!any(keep)) return(list(sumstats = NULL, dict = NULL))
+    list(sumstats = sumstats[keep], dict = matrix(dict_sumstatLD[keep, , drop = FALSE], ncol = 2))
+  }
 
   ####### ========= streamline three types of analyses ======== ########
   if (is.null(X) & is.null(sumstats)) {
@@ -481,32 +529,55 @@ colocboost_analysis_pipeline <- function(region_data,
   if (joint_gwas & !is.null(sumstats)) {
     message(paste("====== Performing non-focaled version GWAS-xQTL ColocBoost on", length(Y), "contexts and", length(sumstats), "GWAS. ====="))
     t21 <- Sys.time()
-    traits <- c(names(Y), names(sumstats))
-    res_gwas <- colocboost(
-      X = X, Y = Y, sumstat = sumstats, LD = LD_mat,
-      dict_YX = dict_YX, dict_sumstatLD = dict_sumstatLD,
-      outcome_names = traits, focal_outcome_idx = NULL, 
-      output_level = 2, ...
-    )
-    t22 <- Sys.time()
-    analysis_results$joint_gwas <- res_gwas
-    analysis_results$computing_time$Analysis$joint_gwas <- t22 - t21
+    # Filter invalid GWAS before calling colocboost
+    filtered <- filter_valid_sumstats(sumstats, LD_mat, dict_sumstatLD)
+    if (is.null(filtered$sumstats)) {
+      message("All GWAS studies failed validation; skipping joint GWAS analysis for this region.")
+    } else {
+      traits <- c(names(Y), names(filtered$sumstats))
+      res_gwas <- colocboost(
+        X = X, Y = Y, sumstat = filtered$sumstats, LD = LD_mat,
+        dict_YX = dict_YX, dict_sumstatLD = filtered$dict,
+        outcome_names = traits, focal_outcome_idx = NULL, 
+        output_level = 2, ...
+      )
+      t22 <- Sys.time()
+      analysis_results$joint_gwas <- res_gwas
+      analysis_results$computing_time$Analysis$joint_gwas <- t22 - t21
+    }
   }
   # - run focaled version of ColocBoost for each GWAS
   if (separate_gwas & !is.null(sumstats)) {
     t31 <- Sys.time()
     res_gwas_separate <- analysis_results$separate_gwas
     for (i_gwas in 1:nrow(dict_sumstatLD)) {
       current_study <- names(sumstats)[i_gwas]
-      message(paste("====== Performing focaled version GWAS-xQTL ColocBoost on", length(Y), "contexts and ", current_study, "GWAS. ====="))
       dict <- dict_sumstatLD[i_gwas, ]
+      # Validate this study before calling colocboost
+      ld_idx <- dict[2]
+      ld_matrix <- if (!is.na(ld_idx)) LD_mat[[ld_idx]] else NULL
+      valid <- is_valid_sumstat_entry(sumstats[[dict[1]]], ld_matrix)
+      if (!isTRUE(valid)) {
+        message(paste("Skipping", current_study, "due to invalid N/z/LD."))
+        next
+      }
+      message(paste("====== Performing focaled version GWAS-xQTL ColocBoost on", length(Y), "contexts and ", current_study, "GWAS. ====="))
       traits <- c(names(Y), current_study)
-      res_gwas_separate[[current_study]] <- colocboost(
-        X = X, Y = Y, sumstat = sumstats[dict[1]],
-        LD = LD_mat[dict[2]], dict_YX = dict_YX,
-        outcome_names = traits, focal_outcome_idx = length(traits), 
-        output_level = 2, ...
-      )
+
+      # Try to run colocboost with error handling
+      tryCatch({
+        res_gwas_separate[[current_study]] <- colocboost(
+          X = X, Y = Y, sumstat = sumstats[dict[1]],
+          LD = LD_mat[dict[2]], dict_YX = dict_YX,
+          outcome_names = traits, focal_outcome_idx = length(traits), 
+          output_level = 2, ...
+        )
+      }, error = function(e) {
+        message(paste("Error in colocboost analysis for", current_study, ":"))
+        message(paste("Error message:", e$message))
+        message(paste("Continuing with next GWAS..."))
+        res_gwas_separate[[current_study]] <<- NULL
+      })
     }
     t32 <- Sys.time()
     analysis_results$separate_gwas <- res_gwas_separate
@@ -687,7 +758,11 @@ qc_regional_data <- function(region_data,
         n <- sumstat$n
         var_y <- sumstat$var_y
         conditions_sumstat <- names(sumstats)[ii]
-        pip_cutoff_to_skip_ld <- pip_cutoff_to_skip_sumstat[conditions_sumstat] %>% as.numeric()
+        pip_cutoff_to_skip_ld <- if (conditions_sumstat %in% names(pip_cutoff_to_skip_sumstat)) {
+          as.numeric(pip_cutoff_to_skip_sumstat[conditions_sumstat])
+        } else {
+          0
+        }
 
         # Preprocess the input data
         preprocess_results <- rss_basic_qc(sumstat$sumstats, LD_data, remove_indels = remove_indels)

diff --git a/R/raiss.R b/R/raiss.R
@@ -330,10 +330,10 @@ filter_raiss_output <- function(zscores, R2_threshold = 0.6, minimum_ld = 5, ver
 
   # Count statistics before filtering
   NSNPs_bf_filt <- nrow(zscores)
-  NSNPs_initial <- sum(zscores$raiss_R2 == 2.0)
-  NSNPs_imputed <- sum(zscores$raiss_R2 != 2.0)
-  NSNPs_ld_filt <- sum(zscores$raiss_ld_score < minimum_ld)
-  NSNPs_R2_filt <- sum(zscores$raiss_R2 < R2_threshold)
+  NSNPs_initial <- sum(zscores$raiss_R2 == 2.0, na.rm = TRUE)
+  NSNPs_imputed <- sum(zscores$raiss_R2 != 2.0, na.rm = TRUE)
+  NSNPs_ld_filt <- sum(zscores$raiss_ld_score < minimum_ld, na.rm = TRUE)
+  NSNPs_R2_filt <- sum(zscores$raiss_R2 < R2_threshold, na.rm = TRUE)
 
   # Apply filters
   zscores_nofilter <- zscores

diff --git a/R/sumstats_qc.R b/R/sumstats_qc.R
@@ -54,6 +54,29 @@ rss_basic_qc <- function(sumstats, LD_data, skip_region = NULL, remove_indels =
 
   sumstats_processed <- allele_flip$target_data_qced %>% arrange(pos)
 
+  # Align and subset LD by mapping core IDs (strip trailing build suffix) to exact LD IDs
+  ld_mat <- LD_data$combined_LD_matrix
+  ld_ids <- tryCatch(rownames(ld_mat), error = function(e) NULL)
+  if (is.null(ld_ids)) {
+    stop("LD matrix rownames are NULL; cannot align variant IDs.")
+  }
+  present <- sumstats_processed$variant_id %in% ld_ids
+  if (sum(present) == 0) {
+    strip_build <- function(x) sub("(:|_)b[0-9]+$", "", x)
+    drop_chr <- function(x) sub("^chr", "", x)
+    ld_core <- drop_chr(strip_build(ld_ids))
+    ss_core <- drop_chr(strip_build(sumstats_processed$variant_id))
+    map_idx <- match(ss_core, ld_core)
+    remap <- !is.na(map_idx)
+    if (sum(remap) > 0) {
+      sumstats_processed$variant_id[remap] <- ld_ids[map_idx[remap]]
+      present <- sumstats_processed$variant_id %in% ld_ids
+    }
+  }
+  if (sum(present) == 0) {
+    stop("No overlapping variants between sumstats and LD after alignment.")
+  }
+
   LD_mat_processed <- LD_data$combined_LD_matrix[sumstats_processed$variant_id, sumstats_processed$variant_id, drop = FALSE]
 
   return(list(sumstats = sumstats_processed, LD_mat = LD_mat_processed))

diff --git a/man/align_variant_names.Rd b/man/align_variant_names.Rd