StatFunGen · kal26 · Oct 2, 2025 · Oct 2, 2025 · Oct 2, 2025 · Oct 2, 2025
diff --git a/NAMESPACE b/NAMESPACE
@@ -141,6 +141,8 @@ importFrom(data.table,setnames)
 importFrom(doFuture,registerDoFuture)
 importFrom(dplyr,"%>%")
 importFrom(dplyr,across)
+importFrom(dplyr,all_of)
+importFrom(dplyr,any_of)
 importFrom(dplyr,arrange)
 importFrom(dplyr,as_tibble)
 importFrom(dplyr,bind_rows)

diff --git a/R/LD.R b/R/LD.R
@@ -355,16 +355,33 @@ load_LD_matrix <- function(LD_meta_file_path, region, extract_coordinates = NULL
     LD_matrices = extracted_LD_matrices_list,
     variants = extracted_LD_variants_list
   )
+  # Normalize LD variant IDs at load time: add 'chr' prefix if missing
+  if (!is.null(colnames(combined_LD_matrix))) {
+    cn <- as.character(colnames(combined_LD_matrix))
+    colnames(combined_LD_matrix) <- ifelse(startsWith(cn, "chr"), cn, paste0("chr", cn))
+  }
+  if (!is.null(rownames(combined_LD_matrix))) {
+    rn <- as.character(rownames(combined_LD_matrix))
+    rownames(combined_LD_matrix) <- ifelse(startsWith(rn, "chr"), rn, paste0("chr", rn))
+  }
   combined_LD_variants <- rownames(combined_LD_matrix)
 
   # Now create block_metadata with all the information we've accumulated
   block_variants <- lapply(extracted_LD_variants_list, function(v) v$variants)
+  # Normalize block_variants to match combined_LD_variants format (add 'chr' if needed)
+  block_variants_normalized <- lapply(block_variants, function(v) {
+    if (!any(startsWith(v, "chr"))) {
+      paste0("chr", v)
+    } else {
+      v
+    }
+  })
   block_metadata <- data.frame(
     block_id = seq_along(LD_file_paths),
     chrom = block_chroms,
     size = sapply(block_variants, length),
-    start_idx = sapply(block_variants, function(v) min(match(v, combined_LD_variants))),
-    end_idx = sapply(block_variants, function(v) max(match(v, combined_LD_variants))),
+    start_idx = sapply(block_variants_normalized, function(v) min(match(v, combined_LD_variants))),
+    end_idx = sapply(block_variants_normalized, function(v) max(match(v, combined_LD_variants))),
     stringsAsFactors = FALSE
   )
 
@@ -375,6 +392,8 @@ load_LD_matrix <- function(LD_meta_file_path, region, extract_coordinates = NULL
   ref_panel <- do.call(rbind, lapply(strsplit(rownames(combined_LD_matrix), ":"), function(x) {
     data.frame(chrom = x[1], pos = as.integer(x[2]), A2 = x[3], A1 = x[4])
   }))
+  # Normalize ref_panel chrom to drop 'chr' prefix for merging with sumstats (which often use numeric chrom)
+  ref_panel$chrom <- sub("^chr", "", as.character(ref_panel$chrom))
 
   merged_variant_list <- do.call(rbind, extracted_LD_variants_list)
   ref_panel$variant_id <- rownames(combined_LD_matrix)

diff --git a/R/allele_qc.R b/R/allele_qc.R
diff --git a/R/colocboost_pipeline.R b/R/colocboost_pipeline.R
diff --git a/R/file_utils.R b/R/file_utils.R
@@ -209,7 +209,7 @@ NoPhenotypeError <- function(message) {
   structure(list(message = message), class = c("NoPhenotypeError", "error", "condition"))
 }
 
-#' @importFrom purrr map2 compact
+#' @importFrom purrr map2
 #' @importFrom readr read_delim cols
 #' @importFrom dplyr filter select mutate across everything
 #' @importFrom magrittr %>%
@@ -224,8 +224,7 @@ load_phenotype_data <- function(phenotype_path, region, extract_region_name = NU
   }
 
   # Use `map2` to iterate over `phenotype_path` and `extract_region_name` simultaneously
-  # `compact` should remove all NULL element
-  phenotype_data <- compact(map2(phenotype_path, extract_region_name, ~ {
+  phenotype_data <- map2(phenotype_path, extract_region_name, ~ {
     tabix_data <- if (!is.null(region)) tabix_region(.x, region, tabix_header = tabix_header) else read_delim(.x, "\t", col_types = cols())
     if (nrow(tabix_data) == 0) {
       message(paste("Phenotype file ", .x, " is empty for the specified region", if (!is.null(region)) "" else region))
@@ -243,12 +242,25 @@ load_phenotype_data <- function(phenotype_path, region, extract_region_name = NU
         stop("region_name_col is out of bounds for the number of columns in tabix_data.")
       }
     } else {
-      return(tabix_data %>% t())
+      # No extract_region_name filter: still transpose and assign names appropriately
+      result <- tabix_data %>% t()
+      if (!is.null(region_name_col) && (region_name_col %% 1 == 0)) {
+        if (region_name_col <= ncol(tabix_data)) {
+          # After transpose, the desired name row is at index region_name_col
+          colnames(result) <- result[region_name_col, ]
+        } else {
+          stop("region_name_col is out of bounds for the number of columns in tabix_data.")
+        }
+      } else if (is.null(colnames(result))) {
+        # Fallback: synthesize names if still missing
+        colnames(result) <- paste0("region_", seq_len(ncol(result)))
+      }
+      return(result)
     }
-  }))
+  })
 
-  # Check if all phenotype files are empty
-  if (length(phenotype_data) == 0) {
+  # Check if all phenotype files are empty (all elements NULL)
+  if (all(vapply(phenotype_data, is.null, logical(1)))) {
     stop(NoPhenotypeError(paste("All phenotype files are empty for the specified region", if (!is.null(region)) "" else region)))
   }
   return(phenotype_data)
@@ -456,6 +468,25 @@ load_regional_association_data <- function(genotype, # PLINK file
   ## Load phenotype and covariates and perform some pre-processing
   covar <- load_covariate_data(covariate)
   pheno <- load_phenotype_data(phenotype, region, extract_region_name = extract_region_name, region_name_col = region_name_col, tabix_header = tabix_header)
+
+  # Keep covariates / conditions aligned with successfully loaded phenotypes.
+  # load_phenotype_data() returns NULL for phenotype files that are empty in
+  # the requested region; we must drop the corresponding covariate / condition
+  # entries before constructing data_list.
+  keep_idx <- !vapply(pheno, is.null, logical(1))
+  if (!any(keep_idx)) {
+    stop(NoPhenotypeError(paste("All phenotype files are empty for the specified region", if (!is.null(region)) "" else region)))
+  }
+  if (any(!keep_idx)) {
+    dropped_conditions <- conditions[!keep_idx]
+    message(paste(
+      "Dropping phenotypes with no data in region", region, ":",
+      paste(dropped_conditions, collapse = ", ")
+    ))
+  }
+  pheno <- pheno[keep_idx]
+  covar <- covar[keep_idx]
+  conditions <- conditions[keep_idx]
   ### including Y ( cov ) and specific X and covar match, filter X variants based on the overlapped samples.
   data_list <- prepare_data_list(geno, pheno, covar, imiss_cutoff,
     maf_cutoff, mac_cutoff, xvar_cutoff,

diff --git a/R/raiss.R b/R/raiss.R
@@ -330,10 +330,10 @@ filter_raiss_output <- function(zscores, R2_threshold = 0.6, minimum_ld = 5, ver
 
   # Count statistics before filtering
   NSNPs_bf_filt <- nrow(zscores)
-  NSNPs_initial <- sum(zscores$raiss_R2 == 2.0)
-  NSNPs_imputed <- sum(zscores$raiss_R2 != 2.0)
-  NSNPs_ld_filt <- sum(zscores$raiss_ld_score < minimum_ld)
-  NSNPs_R2_filt <- sum(zscores$raiss_R2 < R2_threshold)
+  NSNPs_initial <- sum(zscores$raiss_R2 == 2.0, na.rm = TRUE)
+  NSNPs_imputed <- sum(zscores$raiss_R2 != 2.0, na.rm = TRUE)
+  NSNPs_ld_filt <- sum(zscores$raiss_ld_score < minimum_ld, na.rm = TRUE)
+  NSNPs_R2_filt <- sum(zscores$raiss_R2 < R2_threshold, na.rm = TRUE)
 
   # Apply filters
   zscores_nofilter <- zscores

diff --git a/R/sumstats_qc.R b/R/sumstats_qc.R
@@ -54,6 +54,29 @@ rss_basic_qc <- function(sumstats, LD_data, skip_region = NULL, remove_indels =
 
   sumstats_processed <- allele_flip$target_data_qced %>% arrange(pos)
 
+  # Align and subset LD by mapping core IDs (strip trailing build suffix) to exact LD IDs
+  ld_mat <- LD_data$combined_LD_matrix
+  ld_ids <- tryCatch(rownames(ld_mat), error = function(e) NULL)
+  if (is.null(ld_ids)) {
+    stop("LD matrix rownames are NULL; cannot align variant IDs.")
+  }
+  present <- sumstats_processed$variant_id %in% ld_ids
+  if (sum(present) == 0) {
+    strip_build <- function(x) sub("(:|_)b[0-9]+$", "", x)
+    drop_chr <- function(x) sub("^chr", "", x)
+    ld_core <- drop_chr(strip_build(ld_ids))
+    ss_core <- drop_chr(strip_build(sumstats_processed$variant_id))
+    map_idx <- match(ss_core, ld_core)
+    remap <- !is.na(map_idx)
+    if (sum(remap) > 0) {
+      sumstats_processed$variant_id[remap] <- ld_ids[map_idx[remap]]
+      present <- sumstats_processed$variant_id %in% ld_ids
+    }
+  }
+  if (sum(present) == 0) {
+    stop("No overlapping variants between sumstats and LD after alignment.")
+  }
+
   LD_mat_processed <- LD_data$combined_LD_matrix[sumstats_processed$variant_id, sumstats_processed$variant_id, drop = FALSE]
 
   return(list(sumstats = sumstats_processed, LD_mat = LD_mat_processed))

diff --git a/man/align_variant_names.Rd b/man/align_variant_names.Rd