Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
198 changes: 2 additions & 196 deletions R/colocboost_pipeline.R
Original file line number Diff line number Diff line change
@@ -1,197 +1,3 @@
#' This function loads a mixture data sets for a specific region, including individual-level data (genotype, phenotype, covariate data)
#' or summary statistics (sumstats, LD). Run \code{load_regional_univariate_data} and \code{load_rss_data} multiple times for different datasets
#'
#' @section Loading individual level data from multiple corhorts
#' @param region A string of chr:start-end for the phenotype region.
#' @param genotype_list a vector of PLINK bed file containing genotype data.
#' @param phenotype_list A vector of phenotype file names.
#' @param covariate_list A vector of covariate file names corresponding to the phenotype file vector.
#' @param conditions_list_individual A vector of strings representing different conditions or groups.
#' @param match_geno_pheno A vector of index of phentoypes matched to genotype if mulitple genotype PLINK files
#' @param maf_cutoff Minimum minor allele frequency (MAF) cutoff. Default is 0.
#' @param mac_cutoff Minimum minor allele count (MAC) cutoff. Default is 0.
#' @param xvar_cutoff Minimum variance cutoff. Default is 0.
#' @param imiss_cutoff Maximum individual missingness cutoff. Default is 0.
#' @param association_window A string of chr:start-end for the association analysis window (cis or trans). If not provided, all genotype data will be loaded.
#' @param extract_region_name A list of vectors of strings (e.g., gene ID ENSG00000269699) to subset the information when there are multiple regions available. Default is NULL.
#' @param region_name_col Column name containing the region name. Default is NULL.
#' @param keep_indel Logical indicating whether to keep insertions/deletions (INDELs). Default is TRUE.
#' @param keep_samples A vector of sample names to keep. Default is NULL.
#' @param phenotype_header Number of rows to skip at the beginning of the transposed phenotype file (default is 4 for chr, start, end, and ID).
#' @param scale_residuals Logical indicating whether to scale residuals. Default is FALSE.
#' @param tabix_header Logical indicating whether the tabix file has a header. Default is TRUE.
#'
#' @section Loading summary statistics from multiple corhorts or data set
#' @param sumstat_path_list A vector of file path to the summary statistics.
#' @param column_file_path_list A vector of file path to the column file for mapping.
#' @param LD_meta_file_path_list A vector of path of LD_metadata, LD_metadata is a data frame specifying LD blocks with columns "chrom", "start", "end", and "path". "start" and "end" denote the positions of LD blocks. "path" is the path of each LD block, optionally including bim file paths.
#' @param match_LD_sumstat A vector of index of sumstat matched to LD if mulitple sumstat files
#' @param conditions_list_sumstat A vector of strings representing different sumstats.
#' @param n_samples User-specified sample size. If unknown, set as 0 to retrieve from the sumstat file.
#' @param n_cases User-specified number of cases.
#' @param n_controls User-specified number of controls.
#' @param region The region where tabix use to subset the input dataset.
#' @param extract_sumstats_region_name User-specified gene/phenotype name used to further subset the phenotype data.
#' @param sumstats_region_name_col Filter this specific column for the extract_sumstats_region_name.
#' @param comment_string comment sign in the column_mapping file, default is #
#' @param extract_coordinates Optional data frame with columns "chrom" and "pos" for specific coordinates extraction.
#'
#' @return A list containing the individual_data and sumstat_data:
#' individual_data contains the following components if exist
#' \itemize{
#' \item residual_Y: A list of residualized phenotype values (either a vector or a matrix).
#' \item residual_X: A list of residualized genotype matrices for each condition.
#' \item residual_Y_scalar: Scaling factor for residualized phenotype values.
#' \item residual_X_scalar: Scaling factor for residualized genotype values.
#' \item dropped_sample: A list of dropped samples for X, Y, and covariates.
#' \item covar: Covariate data.
#' \item Y: Original phenotype data.
#' \item X_data: Original genotype data.
#' \item X: Filtered genotype matrix.
#' \item maf: Minor allele frequency (MAF) for each variant.
#' \item chrom: Chromosome of the region.
#' \item grange: Genomic range of the region (start and end positions).
#' \item Y_coordinates: Phenotype coordinates if a region is specified.
#' }
#' sumstat_data contains the following components if exist
#' \itemize{
#' \item sumstats: A list of summary statistics for the matched LD_info, each sublist contains sumstats, n, var_y from \code{load_rss_data}.
#' \item LD_info: A list of LD information, each sublist contains combined_LD_variants, combined_LD_matrix, ref_panel \code{load_LD_matrix}.
#' }
#'
#' @export
load_multitask_regional_data <- function(region, # a string of chr:start-end for phenotype region
genotype_list = NULL, # PLINK file
phenotype_list = NULL, # a vector of phenotype file names
covariate_list = NULL, # a vector of covariate file names corresponding to the phenotype file vector
conditions_list_individual = NULL, # a vector of strings
match_geno_pheno = NULL, # a vector of index of phentoypes matched to genotype if mulitple genotype files
maf_cutoff = 0,
mac_cutoff = 0,
xvar_cutoff = 0,
imiss_cutoff = 0,
association_window = NULL,
extract_region_name = NULL,
region_name_col = NULL,
keep_indel = TRUE,
keep_samples = NULL,
keep_variants = NULL,
phenotype_header = 4, # skip first 4 rows of transposed phenotype for chr, start, end and ID
scale_residuals = FALSE,
tabix_header = TRUE,
# sumstat if need
sumstat_path_list = NULL,
column_file_path_list = NULL,
LD_meta_file_path_list = NULL,
match_LD_sumstat = NULL, # a vector of index of sumstat matched to LD if mulitple sumstat files
conditions_list_sumstat = NULL,
n_samples = 0,
n_cases = 0,
n_controls = 0,
extract_sumstats_region_name = NULL,
sumstats_region_name_col = NULL,
comment_string = "#",
extract_coordinates = NULL) {
if (is.null(genotype_list) & is.null(sumstat_path_list)) {
stop("Data load error. Please make sure at least one data set (sumstat_path_list or genotype_list) exists.")
}

# - if exist individual level data
individual_data <- NULL
if (!is.null(genotype_list)) {
#### FIXME: later if we have mulitple genotype list
if (length(genotype_list) != 1 & is.null(match_geno_pheno)) {
stop("Data load error. Please make sure 'match_geno_pheno' exists if you load data from multiple individual-level data.")
} else if (length(genotype_list) == 1 & is.null(match_geno_pheno)) {
match_geno_pheno <- rep(1, length(phenotype_list))
}

# - load individual data from multiple datasets
n_dataset <- unique(match_geno_pheno) ### FIXME
for (i_data in 1:n_dataset) {
# extract genotype file name
genotype <- genotype_list[i_data]
# extract phenotype and covariate file names
pos <- which(match_geno_pheno == i_data)
phenotype <- phenotype_list[pos]
covariate <- covariate_list[pos]
conditions <- conditions_list_individual[pos]
dat <- load_regional_univariate_data(
genotype = genotype, phenotype = phenotype,
covariate = covariate, region = region,
association_window = association_window,
conditions = conditions, xvar_cutoff = xvar_cutoff,
maf_cutoff = maf_cutoff, mac_cutoff = mac_cutoff,
imiss_cutoff = imiss_cutoff, keep_indel = keep_indel,
keep_samples = keep_samples, keep_variants = keep_variants,
extract_region_name = extract_region_name,
phenotype_header = phenotype_header,
region_name_col = region_name_col,
scale_residuals = scale_residuals
)
if (is.null(individual_data)) {
individual_data <- dat
} else {
individual_data <- lapply(names(dat), function(k) {
c(individual_data[[k]], dat[[k]])
})
individual_data$chrom <- dat$chrom
individual_data$grange <- dat$grange
}
}
}

# - if exist summstat data
sumstat_data <- NULL
if (!is.null(sumstat_path_list)) {
if (length(match_LD_sumstat) == 0) {
match_LD_sumstat[[1]] <- conditions_list_sumstat
}
if (length(match_LD_sumstat) != length(LD_meta_file_path_list)) {
stop("Please make sure 'match_LD_sumstat' matched 'LD_meta_file_path_list' if you load data from multiple sumstats.")
}
# - load sumstat data from multiple datasets
n_LD <- length(match_LD_sumstat)
for (i_ld in 1:n_LD) {
# extract LD meta file path name
LD_meta_file_path <- LD_meta_file_path_list[i_ld]
LD_info <- load_LD_matrix(LD_meta_file_path,
region = association_window,
extract_coordinates = extract_coordinates
)
# extract sumstat information
conditions <- match_LD_sumstat[[i_ld]]
pos <- match(conditions, conditions_list_sumstat)
sumstats <- lapply(pos, function(ii) {
sumstat_path <- sumstat_path_list[ii]
column_file_path <- column_file_path_list[ii]
# FIXME later: when consider multiple LD reference
tmp <- load_rss_data(
sumstat_path = sumstat_path, column_file_path = column_file_path,
n_sample = n_samples[ii], n_case = n_cases[ii], n_control = n_controls[ii],
region = association_window, extract_region_name = extract_sumstats_region_name,
region_name_col = sumstats_region_name_col, comment_string = comment_string
)
if (!("variant_id" %in% colnames(tmp$sumstats))) {
tmp$sumstats <- tmp$sumstats %>%
rowwise() %>%
mutate(variant_id = paste0(c(chrom, pos, A1, A2), collapse = ":"))
}
return(tmp)
})
names(sumstats) <- conditions
sumstat_data$sumstats <- c(sumstat_data$sumstats, list(sumstats))
sumstat_data$LD_info <- c(sumstat_data$LD_info, list(LD_info))
}
names(sumstat_data$sumstats) <- names(sumstat_data$LD_info) <- names(match_LD_sumstat)
}

return(list(
individual_data = individual_data,
sumstat_data = sumstat_data
))
}

#' Multi-trait colocalization analysis pipeline
#'
#' This function perform a multi-trait colocalization using ColocBoost
Expand All @@ -215,7 +21,7 @@ load_multitask_regional_data <- function(region, # a string of chr:start-end for
#' }
#' sumstat_data contains the following components if exist
#' \itemize{
#' \item sumstats: A list of summary statistics for the matched LD_info, each sublist contains sumstats, n, var_y from \code{load_rss_data}.
#' \item sumstats: A list of summary statistics f or the matched LD_info, each sublist contains sumstats, n, var_y from \code{load_rss_data}.
#' \item LD_info: A list of LD information, each sublist contains combined_LD_variants, combined_LD_matrix, ref_panel \code{load_LD_matrix}.
#' }
#'
Expand Down Expand Up @@ -744,7 +550,7 @@ qc_regional_data <- function(region_data,
final_LD <- c(final_LD, list(LD_mat) %>% setNames(conditions_sumstat))
LD_match <- c(LD_match, conditions_sumstat)
} else {
LD_match <- c(LD_match, LD_match[pos[1]])
LD_match <- c(LD_match, names(final_LD)[pos[1]])
}
}
}
Expand Down
Loading
Loading