Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -62,4 +62,4 @@ LinkingTo:
RcppGSL
NeedsCompilation: yes
VignetteBuilder: knitr
RoxygenNote: 7.3.2
RoxygenNote: 7.3.3
2 changes: 2 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,8 @@ importFrom(data.table,setnames)
importFrom(doFuture,registerDoFuture)
importFrom(dplyr,"%>%")
importFrom(dplyr,across)
importFrom(dplyr,all_of)
importFrom(dplyr,any_of)
importFrom(dplyr,arrange)
importFrom(dplyr,as_tibble)
importFrom(dplyr,bind_rows)
Expand Down
23 changes: 21 additions & 2 deletions R/LD.R
Original file line number Diff line number Diff line change
Expand Up @@ -355,16 +355,33 @@ load_LD_matrix <- function(LD_meta_file_path, region, extract_coordinates = NULL
LD_matrices = extracted_LD_matrices_list,
variants = extracted_LD_variants_list
)
# Normalize LD variant IDs at load time: add 'chr' prefix if missing
if (!is.null(colnames(combined_LD_matrix))) {
cn <- as.character(colnames(combined_LD_matrix))
colnames(combined_LD_matrix) <- ifelse(startsWith(cn, "chr"), cn, paste0("chr", cn))
}
if (!is.null(rownames(combined_LD_matrix))) {
rn <- as.character(rownames(combined_LD_matrix))
rownames(combined_LD_matrix) <- ifelse(startsWith(rn, "chr"), rn, paste0("chr", rn))
}
combined_LD_variants <- rownames(combined_LD_matrix)

# Now create block_metadata with all the information we've accumulated
block_variants <- lapply(extracted_LD_variants_list, function(v) v$variants)
# Normalize block_variants to match combined_LD_variants format (add 'chr' if needed)
block_variants_normalized <- lapply(block_variants, function(v) {
if (!any(startsWith(v, "chr"))) {
paste0("chr", v)
} else {
v
}
})
block_metadata <- data.frame(
block_id = seq_along(LD_file_paths),
chrom = block_chroms,
size = sapply(block_variants, length),
start_idx = sapply(block_variants, function(v) min(match(v, combined_LD_variants))),
end_idx = sapply(block_variants, function(v) max(match(v, combined_LD_variants))),
start_idx = sapply(block_variants_normalized, function(v) min(match(v, combined_LD_variants))),
end_idx = sapply(block_variants_normalized, function(v) max(match(v, combined_LD_variants))),
stringsAsFactors = FALSE
)

Expand All @@ -375,6 +392,8 @@ load_LD_matrix <- function(LD_meta_file_path, region, extract_coordinates = NULL
ref_panel <- do.call(rbind, lapply(strsplit(rownames(combined_LD_matrix), ":"), function(x) {
data.frame(chrom = x[1], pos = as.integer(x[2]), A2 = x[3], A1 = x[4])
}))
# Normalize ref_panel chrom to drop 'chr' prefix for merging with sumstats (which often use numeric chrom)
ref_panel$chrom <- sub("^chr", "", as.character(ref_panel$chrom))

merged_variant_list <- do.call(rbind, extracted_LD_variants_list)
ref_panel$variant_id <- rownames(combined_LD_matrix)
Expand Down
235 changes: 131 additions & 104 deletions R/allele_qc.R

Large diffs are not rendered by default.

117 changes: 96 additions & 21 deletions R/colocboost_pipeline.R
Original file line number Diff line number Diff line change
Expand Up @@ -436,13 +436,32 @@ colocboost_analysis_pipeline <- function(region_data,
if (!is.null(sumstat_data$sumstats)) {
sumstats <- lapply(sumstat_data$sumstats, function(ss) {
z <- ss$sumstats$z
variant <- paste0("chr", ss$sumstats$variant_id)
# Prefix 'chr' only if missing to avoid 'chrchr'
vi <- ss$sumstats$variant_id
has_chr <- startsWith(as.character(vi), "chr")
variant <- ifelse(has_chr, as.character(vi), paste0("chr", as.character(vi)))
n <- ss$n
data.frame("z" = z, "n" = n, "variant" = variant)

# Filter out NA values from z-scores and corresponding variants
na_mask <- !is.na(z)
if (sum(na_mask) == 0) {
message("Warning: All z-scores are NA for this summary statistic dataset")
return(data.frame("z" = numeric(0), "n" = numeric(0), "variant" = character(0)))
}

data.frame("z" = z[na_mask], "n" = n, "variant" = variant[na_mask])
})
names(sumstats) <- names(sumstat_data$sumstats)
LD_mat <- lapply(sumstat_data$LD_mat, function(ld) {
colnames(ld) <- rownames(ld) <- paste0("chr", colnames(ld))
# Ensure LD dimnames have 'chr' only once
cn <- colnames(ld)
rn <- rownames(ld)
if (!is.null(cn)) {
colnames(ld) <- ifelse(startsWith(as.character(cn), "chr"), as.character(cn), paste0("chr", as.character(cn)))
}
if (!is.null(rn)) {
rownames(ld) <- ifelse(startsWith(as.character(rn), "chr"), as.character(rn), paste0("chr", as.character(rn)))
}
return(ld)
})
LD_match <- sumstat_data$LD_match
Expand All @@ -451,6 +470,35 @@ colocboost_analysis_pipeline <- function(region_data,
sumstats <- LD_mat <- dict_sumstatLD <- NULL
}

# Helper: validate a single GWAS study data.frame and its LD matrix
is_valid_sumstat_entry <- function(df, ld_matrix) {
if (is.null(df) || nrow(df) == 0) return(FALSE)
if (!all(c("z", "n", "variant") %in% colnames(df))) return(FALSE)
n_vals <- suppressWarnings(as.numeric(unique(df$n)))
if (length(n_vals) == 0 || is.na(n_vals[1]) || !is.finite(n_vals[1]) || n_vals[1] < 3) return(FALSE)
if (all(is.na(df$z))) return(FALSE)
if (is.null(ld_matrix)) return(FALSE)
if (!is.matrix(ld_matrix)) return(FALSE)
if (any(!is.finite(ld_matrix))) return(FALSE)
# Ensure at least 1 overlapping variant with LD matrix dimnames
ld_vars <- intersect(rownames(ld_matrix), colnames(ld_matrix))
if (length(ld_vars) == 0) return(FALSE)
if (length(intersect(as.character(df$variant), as.character(ld_vars))) == 0) return(FALSE)
TRUE
}

# Helper: filter joint structures (sumstats list and dict_sumstatLD) to only valid entries
filter_valid_sumstats <- function(sumstats, LD_mat, dict_sumstatLD) {
if (is.null(sumstats) || is.null(dict_sumstatLD) || is.null(LD_mat)) return(list(sumstats = NULL, dict = NULL))
keep <- vector("logical", length(sumstats))
for (i in seq_along(sumstats)) {
ld_idx <- dict_sumstatLD[i, 2]
ld_matrix <- if (!is.na(ld_idx)) LD_mat[[ld_idx]] else NULL
keep[i] <- is_valid_sumstat_entry(sumstats[[i]], ld_matrix)
}
if (!any(keep)) return(list(sumstats = NULL, dict = NULL))
list(sumstats = sumstats[keep], dict = matrix(dict_sumstatLD[keep, , drop = FALSE], ncol = 2))
}

####### ========= streamline three types of analyses ======== ########
if (is.null(X) & is.null(sumstats)) {
Expand Down Expand Up @@ -481,32 +529,55 @@ colocboost_analysis_pipeline <- function(region_data,
if (joint_gwas & !is.null(sumstats)) {
message(paste("====== Performing non-focaled version GWAS-xQTL ColocBoost on", length(Y), "contexts and", length(sumstats), "GWAS. ====="))
t21 <- Sys.time()
traits <- c(names(Y), names(sumstats))
res_gwas <- colocboost(
X = X, Y = Y, sumstat = sumstats, LD = LD_mat,
dict_YX = dict_YX, dict_sumstatLD = dict_sumstatLD,
outcome_names = traits, focal_outcome_idx = NULL,
output_level = 2, ...
)
t22 <- Sys.time()
analysis_results$joint_gwas <- res_gwas
analysis_results$computing_time$Analysis$joint_gwas <- t22 - t21
# Filter invalid GWAS before calling colocboost
filtered <- filter_valid_sumstats(sumstats, LD_mat, dict_sumstatLD)
if (is.null(filtered$sumstats)) {
message("All GWAS studies failed validation; skipping joint GWAS analysis for this region.")
} else {
traits <- c(names(Y), names(filtered$sumstats))
res_gwas <- colocboost(
X = X, Y = Y, sumstat = filtered$sumstats, LD = LD_mat,
dict_YX = dict_YX, dict_sumstatLD = filtered$dict,
outcome_names = traits, focal_outcome_idx = NULL,
output_level = 2, ...
)
t22 <- Sys.time()
analysis_results$joint_gwas <- res_gwas
analysis_results$computing_time$Analysis$joint_gwas <- t22 - t21
}
}
# - run focaled version of ColocBoost for each GWAS
if (separate_gwas & !is.null(sumstats)) {
t31 <- Sys.time()
res_gwas_separate <- analysis_results$separate_gwas
for (i_gwas in 1:nrow(dict_sumstatLD)) {
current_study <- names(sumstats)[i_gwas]
message(paste("====== Performing focaled version GWAS-xQTL ColocBoost on", length(Y), "contexts and ", current_study, "GWAS. ====="))
dict <- dict_sumstatLD[i_gwas, ]
# Validate this study before calling colocboost
ld_idx <- dict[2]
ld_matrix <- if (!is.na(ld_idx)) LD_mat[[ld_idx]] else NULL
valid <- is_valid_sumstat_entry(sumstats[[dict[1]]], ld_matrix)
if (!isTRUE(valid)) {
message(paste("Skipping", current_study, "due to invalid N/z/LD."))
next
}
message(paste("====== Performing focaled version GWAS-xQTL ColocBoost on", length(Y), "contexts and ", current_study, "GWAS. ====="))
traits <- c(names(Y), current_study)
res_gwas_separate[[current_study]] <- colocboost(
X = X, Y = Y, sumstat = sumstats[dict[1]],
LD = LD_mat[dict[2]], dict_YX = dict_YX,
outcome_names = traits, focal_outcome_idx = length(traits),
output_level = 2, ...
)

# Try to run colocboost with error handling
tryCatch({
res_gwas_separate[[current_study]] <- colocboost(
X = X, Y = Y, sumstat = sumstats[dict[1]],
LD = LD_mat[dict[2]], dict_YX = dict_YX,
outcome_names = traits, focal_outcome_idx = length(traits),
output_level = 2, ...
)
}, error = function(e) {
message(paste("Error in colocboost analysis for", current_study, ":"))
message(paste("Error message:", e$message))
message(paste("Continuing with next GWAS..."))
res_gwas_separate[[current_study]] <<- NULL
})
}
t32 <- Sys.time()
analysis_results$separate_gwas <- res_gwas_separate
Expand Down Expand Up @@ -687,7 +758,11 @@ qc_regional_data <- function(region_data,
n <- sumstat$n
var_y <- sumstat$var_y
conditions_sumstat <- names(sumstats)[ii]
pip_cutoff_to_skip_ld <- pip_cutoff_to_skip_sumstat[conditions_sumstat] %>% as.numeric()
pip_cutoff_to_skip_ld <- if (conditions_sumstat %in% names(pip_cutoff_to_skip_sumstat)) {
as.numeric(pip_cutoff_to_skip_sumstat[conditions_sumstat])
} else {
0
}

# Preprocess the input data
preprocess_results <- rss_basic_qc(sumstat$sumstats, LD_data, remove_indels = remove_indels)
Expand Down
8 changes: 4 additions & 4 deletions R/raiss.R
Original file line number Diff line number Diff line change
Expand Up @@ -330,10 +330,10 @@ filter_raiss_output <- function(zscores, R2_threshold = 0.6, minimum_ld = 5, ver

# Count statistics before filtering
NSNPs_bf_filt <- nrow(zscores)
NSNPs_initial <- sum(zscores$raiss_R2 == 2.0)
NSNPs_imputed <- sum(zscores$raiss_R2 != 2.0)
NSNPs_ld_filt <- sum(zscores$raiss_ld_score < minimum_ld)
NSNPs_R2_filt <- sum(zscores$raiss_R2 < R2_threshold)
NSNPs_initial <- sum(zscores$raiss_R2 == 2.0, na.rm = TRUE)
NSNPs_imputed <- sum(zscores$raiss_R2 != 2.0, na.rm = TRUE)
NSNPs_ld_filt <- sum(zscores$raiss_ld_score < minimum_ld, na.rm = TRUE)
NSNPs_R2_filt <- sum(zscores$raiss_R2 < R2_threshold, na.rm = TRUE)

# Apply filters
zscores_nofilter <- zscores
Expand Down
23 changes: 23 additions & 0 deletions R/sumstats_qc.R
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,29 @@ rss_basic_qc <- function(sumstats, LD_data, skip_region = NULL, remove_indels =

sumstats_processed <- allele_flip$target_data_qced %>% arrange(pos)

# Align and subset LD by mapping core IDs (strip trailing build suffix) to exact LD IDs
ld_mat <- LD_data$combined_LD_matrix
ld_ids <- tryCatch(rownames(ld_mat), error = function(e) NULL)
if (is.null(ld_ids)) {
stop("LD matrix rownames are NULL; cannot align variant IDs.")
}
present <- sumstats_processed$variant_id %in% ld_ids
if (sum(present) == 0) {
strip_build <- function(x) sub("(:|_)b[0-9]+$", "", x)
drop_chr <- function(x) sub("^chr", "", x)
ld_core <- drop_chr(strip_build(ld_ids))
ss_core <- drop_chr(strip_build(sumstats_processed$variant_id))
map_idx <- match(ss_core, ld_core)
remap <- !is.na(map_idx)
if (sum(remap) > 0) {
sumstats_processed$variant_id[remap] <- ld_ids[map_idx[remap]]
present <- sumstats_processed$variant_id %in% ld_ids
}
}
if (sum(present) == 0) {
stop("No overlapping variants between sumstats and LD after alignment.")
}

LD_mat_processed <- LD_data$combined_LD_matrix[sumstats_processed$variant_id, sumstats_processed$variant_id, drop = FALSE]

return(list(sumstats = sumstats_processed, LD_mat = LD_mat_processed))
Expand Down
9 changes: 8 additions & 1 deletion man/align_variant_names.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading