Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,8 @@ importFrom(data.table,setnames)
importFrom(doFuture,registerDoFuture)
importFrom(dplyr,"%>%")
importFrom(dplyr,across)
importFrom(dplyr,all_of)
importFrom(dplyr,any_of)
importFrom(dplyr,arrange)
importFrom(dplyr,as_tibble)
importFrom(dplyr,bind_rows)
Expand Down
23 changes: 21 additions & 2 deletions R/LD.R
Original file line number Diff line number Diff line change
Expand Up @@ -355,16 +355,33 @@ load_LD_matrix <- function(LD_meta_file_path, region, extract_coordinates = NULL
LD_matrices = extracted_LD_matrices_list,
variants = extracted_LD_variants_list
)
# Normalize LD variant IDs at load time: add 'chr' prefix if missing
if (!is.null(colnames(combined_LD_matrix))) {
cn <- as.character(colnames(combined_LD_matrix))
colnames(combined_LD_matrix) <- ifelse(startsWith(cn, "chr"), cn, paste0("chr", cn))
}
if (!is.null(rownames(combined_LD_matrix))) {
rn <- as.character(rownames(combined_LD_matrix))
rownames(combined_LD_matrix) <- ifelse(startsWith(rn, "chr"), rn, paste0("chr", rn))
}
combined_LD_variants <- rownames(combined_LD_matrix)

# Now create block_metadata with all the information we've accumulated
block_variants <- lapply(extracted_LD_variants_list, function(v) v$variants)
# Normalize block_variants to match combined_LD_variants format (add 'chr' if needed)
block_variants_normalized <- lapply(block_variants, function(v) {
if (!any(startsWith(v, "chr"))) {
paste0("chr", v)
} else {
v
}
})
block_metadata <- data.frame(
block_id = seq_along(LD_file_paths),
chrom = block_chroms,
size = sapply(block_variants, length),
start_idx = sapply(block_variants, function(v) min(match(v, combined_LD_variants))),
end_idx = sapply(block_variants, function(v) max(match(v, combined_LD_variants))),
start_idx = sapply(block_variants_normalized, function(v) min(match(v, combined_LD_variants))),
end_idx = sapply(block_variants_normalized, function(v) max(match(v, combined_LD_variants))),
stringsAsFactors = FALSE
)

Expand All @@ -375,6 +392,8 @@ load_LD_matrix <- function(LD_meta_file_path, region, extract_coordinates = NULL
ref_panel <- do.call(rbind, lapply(strsplit(rownames(combined_LD_matrix), ":"), function(x) {
data.frame(chrom = x[1], pos = as.integer(x[2]), A2 = x[3], A1 = x[4])
}))
# Normalize ref_panel chrom to drop 'chr' prefix for merging with sumstats (which often use numeric chrom)
ref_panel$chrom <- sub("^chr", "", as.character(ref_panel$chrom))

merged_variant_list <- do.call(rbind, extracted_LD_variants_list)
ref_panel$variant_id <- rownames(combined_LD_matrix)
Expand Down
235 changes: 131 additions & 104 deletions R/allele_qc.R

Large diffs are not rendered by default.

335 changes: 309 additions & 26 deletions R/colocboost_pipeline.R

Large diffs are not rendered by default.

45 changes: 38 additions & 7 deletions R/file_utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -209,7 +209,7 @@ NoPhenotypeError <- function(message) {
structure(list(message = message), class = c("NoPhenotypeError", "error", "condition"))
}

#' @importFrom purrr map2 compact
#' @importFrom purrr map2
#' @importFrom readr read_delim cols
#' @importFrom dplyr filter select mutate across everything
#' @importFrom magrittr %>%
Expand All @@ -224,8 +224,7 @@ load_phenotype_data <- function(phenotype_path, region, extract_region_name = NU
}

# Use `map2` to iterate over `phenotype_path` and `extract_region_name` simultaneously
# `compact` should remove all NULL element
phenotype_data <- compact(map2(phenotype_path, extract_region_name, ~ {
phenotype_data <- map2(phenotype_path, extract_region_name, ~ {
tabix_data <- if (!is.null(region)) tabix_region(.x, region, tabix_header = tabix_header) else read_delim(.x, "\t", col_types = cols())
if (nrow(tabix_data) == 0) {
message(paste("Phenotype file ", .x, " is empty for the specified region", if (!is.null(region)) "" else region))
Expand All @@ -243,12 +242,25 @@ load_phenotype_data <- function(phenotype_path, region, extract_region_name = NU
stop("region_name_col is out of bounds for the number of columns in tabix_data.")
}
} else {
return(tabix_data %>% t())
# No extract_region_name filter: still transpose and assign names appropriately
result <- tabix_data %>% t()
if (!is.null(region_name_col) && (region_name_col %% 1 == 0)) {
if (region_name_col <= ncol(tabix_data)) {
# After transpose, the desired name row is at index region_name_col
colnames(result) <- result[region_name_col, ]
} else {
stop("region_name_col is out of bounds for the number of columns in tabix_data.")
}
} else if (is.null(colnames(result))) {
# Fallback: synthesize names if still missing
colnames(result) <- paste0("region_", seq_len(ncol(result)))
}
return(result)
}
}))
})

# Check if all phenotype files are empty
if (length(phenotype_data) == 0) {
# Check if all phenotype files are empty (all elements NULL)
if (all(vapply(phenotype_data, is.null, logical(1)))) {
stop(NoPhenotypeError(paste("All phenotype files are empty for the specified region", if (!is.null(region)) "" else region)))
}
return(phenotype_data)
Expand Down Expand Up @@ -456,6 +468,25 @@ load_regional_association_data <- function(genotype, # PLINK file
## Load phenotype and covariates and perform some pre-processing
covar <- load_covariate_data(covariate)
pheno <- load_phenotype_data(phenotype, region, extract_region_name = extract_region_name, region_name_col = region_name_col, tabix_header = tabix_header)

# Keep covariates / conditions aligned with successfully loaded phenotypes.
# load_phenotype_data() returns NULL for phenotype files that are empty in
# the requested region; we must drop the corresponding covariate / condition
# entries before constructing data_list.
keep_idx <- !vapply(pheno, is.null, logical(1))
if (!any(keep_idx)) {
stop(NoPhenotypeError(paste("All phenotype files are empty for the specified region", if (!is.null(region)) "" else region)))
}
if (any(!keep_idx)) {
dropped_conditions <- conditions[!keep_idx]
message(paste(
"Dropping phenotypes with no data in region", region, ":",
paste(dropped_conditions, collapse = ", ")
))
}
pheno <- pheno[keep_idx]
covar <- covar[keep_idx]
conditions <- conditions[keep_idx]
### including Y ( cov ) and specific X and covar match, filter X variants based on the overlapped samples.
data_list <- prepare_data_list(geno, pheno, covar, imiss_cutoff,
maf_cutoff, mac_cutoff, xvar_cutoff,
Expand Down
8 changes: 4 additions & 4 deletions R/raiss.R
Original file line number Diff line number Diff line change
Expand Up @@ -330,10 +330,10 @@ filter_raiss_output <- function(zscores, R2_threshold = 0.6, minimum_ld = 5, ver

# Count statistics before filtering
NSNPs_bf_filt <- nrow(zscores)
NSNPs_initial <- sum(zscores$raiss_R2 == 2.0)
NSNPs_imputed <- sum(zscores$raiss_R2 != 2.0)
NSNPs_ld_filt <- sum(zscores$raiss_ld_score < minimum_ld)
NSNPs_R2_filt <- sum(zscores$raiss_R2 < R2_threshold)
NSNPs_initial <- sum(zscores$raiss_R2 == 2.0, na.rm = TRUE)
NSNPs_imputed <- sum(zscores$raiss_R2 != 2.0, na.rm = TRUE)
NSNPs_ld_filt <- sum(zscores$raiss_ld_score < minimum_ld, na.rm = TRUE)
NSNPs_R2_filt <- sum(zscores$raiss_R2 < R2_threshold, na.rm = TRUE)

# Apply filters
zscores_nofilter <- zscores
Expand Down
23 changes: 23 additions & 0 deletions R/sumstats_qc.R
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,29 @@ rss_basic_qc <- function(sumstats, LD_data, skip_region = NULL, remove_indels =

sumstats_processed <- allele_flip$target_data_qced %>% arrange(pos)

# Align and subset LD by mapping core IDs (strip trailing build suffix) to exact LD IDs
ld_mat <- LD_data$combined_LD_matrix
ld_ids <- tryCatch(rownames(ld_mat), error = function(e) NULL)
if (is.null(ld_ids)) {
stop("LD matrix rownames are NULL; cannot align variant IDs.")
}
present <- sumstats_processed$variant_id %in% ld_ids
if (sum(present) == 0) {
strip_build <- function(x) sub("(:|_)b[0-9]+$", "", x)
drop_chr <- function(x) sub("^chr", "", x)
ld_core <- drop_chr(strip_build(ld_ids))
ss_core <- drop_chr(strip_build(sumstats_processed$variant_id))
map_idx <- match(ss_core, ld_core)
remap <- !is.na(map_idx)
if (sum(remap) > 0) {
sumstats_processed$variant_id[remap] <- ld_ids[map_idx[remap]]
present <- sumstats_processed$variant_id %in% ld_ids
}
}
if (sum(present) == 0) {
stop("No overlapping variants between sumstats and LD after alignment.")
}

LD_mat_processed <- LD_data$combined_LD_matrix[sumstats_processed$variant_id, sumstats_processed$variant_id, drop = FALSE]

return(list(sumstats = sumstats_processed, LD_mat = LD_mat_processed))
Expand Down
9 changes: 8 additions & 1 deletion man/align_variant_names.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading