From 96f293dee343206c310de5200d10e4b9727d65a1 Mon Sep 17 00:00:00 2001 From: xuewei cao <36172337+xueweic@users.noreply.github.com> Date: Tue, 13 May 2025 21:19:51 -0500 Subject: [PATCH 1/5] Update colocboost_pipeline.R --- R/colocboost_pipeline.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/colocboost_pipeline.R b/R/colocboost_pipeline.R index 2b22d712..2f5eab7a 100644 --- a/R/colocboost_pipeline.R +++ b/R/colocboost_pipeline.R @@ -744,7 +744,7 @@ qc_regional_data <- function(region_data, final_LD <- c(final_LD, list(LD_mat) %>% setNames(conditions_sumstat)) LD_match <- c(LD_match, conditions_sumstat) } else { - LD_match <- c(LD_match, LD_match[pos[1]]) + LD_match <- c(LD_match, names(final_LD)[pos[1]]) } } } From 5deb27ff318f294edcd4220f30290e799f6361d6 Mon Sep 17 00:00:00 2001 From: xuewei cao <36172337+xueweic@users.noreply.github.com> Date: Tue, 13 May 2025 21:48:20 -0500 Subject: [PATCH 2/5] Update colocboost_pipeline.R --- R/colocboost_pipeline.R | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/R/colocboost_pipeline.R b/R/colocboost_pipeline.R index 2f5eab7a..9abe8676 100644 --- a/R/colocboost_pipeline.R +++ b/R/colocboost_pipeline.R @@ -172,6 +172,7 @@ load_multitask_regional_data <- function(region, # a string of chr:start-end for region = association_window, extract_region_name = extract_sumstats_region_name, region_name_col = sumstats_region_name_col, comment_string = comment_string ) + if (nrow(tmp$sumstats) == 0){ return(NULL) } if (!("variant_id" %in% colnames(tmp$sumstats))) { tmp$sumstats <- tmp$sumstats %>% rowwise() %>% @@ -180,6 +181,11 @@ load_multitask_regional_data <- function(region, # a string of chr:start-end for return(tmp) }) names(sumstats) <- conditions + if_no_variants <- sapply(sumstats, is.null) + if (sum(if_no_variants)!=0){ + pos_no_variants <- which(if_no_variants) + sumstats <- sumstats[-pos_no_variants] + } sumstat_data$sumstats <- c(sumstat_data$sumstats, list(sumstats)) sumstat_data$LD_info <- c(sumstat_data$LD_info, list(LD_info)) } From c9a5b3f15714acee60a5f50dd726dfee5f853158 Mon Sep 17 00:00:00 2001 From: xuewei cao <36172337+xueweic@users.noreply.github.com> Date: Thu, 6 Nov 2025 14:10:41 -0500 Subject: [PATCH 3/5] Create ColocBoost_Wrapper_Pipeline.Rmd --- vignettes/ColocBoost_Wrapper_Pipeline.Rmd | 291 ++++++++++++++++++++++ 1 file changed, 291 insertions(+) create mode 100644 vignettes/ColocBoost_Wrapper_Pipeline.Rmd diff --git a/vignettes/ColocBoost_Wrapper_Pipeline.Rmd b/vignettes/ColocBoost_Wrapper_Pipeline.Rmd new file mode 100644 index 00000000..49bc4c11 --- /dev/null +++ b/vignettes/ColocBoost_Wrapper_Pipeline.Rmd @@ -0,0 +1,291 @@ +--- +title: "Bioinformatics Pipeline for ColocBoost" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{Bioinformatics Pipeline for ColocBoost} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>" +) +``` + + +This vignette demonstrates how to use the bioinformatics pipeline for ColocBoost to perform colocalization analysis with `colocboost` for multiple individual-level datasets and/or summary statistics datasets. + +- See more details about functions in the package `pecotmr` with [link](https://github.com/StatFunGen/pecotmr/tree/main) and +`colocboost_pipeline` with [link](https://github.com/StatFunGen/pecotmr/blob/main/R/colocboost_pipeline.R). +- See more details about input data preparation in `xqtl_protocol` with [link](https://statfungen.github.io/xqtl-protocol/code/mnm_analysis/mnm_methods/colocboost.html). + + +# 1. Loading Data using `colocboost_analysis_pipeline` function + +This function harmonizes the input data and prepares it for colocalization analysis. + + +In this section, we introduce how to load the regional data required for the ColocBoost analysis using the `load_multitask_regional_data` function. +This function loads mixed datasets for a specific region, including individual-level data (genotype, phenotype, covariate data), summary statistics +(sumstats, LD), or a combination of both. It runs `load_regional_univariate_data` for each individual-level dataset and `load_rss_data` for each summary statistics dataset. +The output is a list with `individual_data` and `sumstat_data` components, where `individual_data` is a list of individual-level data and `sumstat_data` is a list of summary statistics data. +This list is then passed to the `colocboost_analysis_pipeline` function for the colocalization analysis. + + +Below are the input parameters for this function for loading individual-level data: + +## 1.1. Loading individual-level data from multiple cohorts + +inputs: +- **`region`**: String ; Genomic region of interest in the format of `chr:start-end` for the phenotype region you want to analyze. +- **`genotype_list`**: Character vector; Paths for PLINK bed files containing genotype data (do NOT include .bed suffix). +- **`phenotype_list`**: Character vector; Paths for phenotype file names. +- **`covariate_list`**: Character vector; Paths for covariate file names for each phenotype. Must have the same length as the phenotype file vector. +- **`conditions_list_individual`**: Character vector; Strings representing different conditions or groups used for naming. Must have the same length as the phenotype file vector. +- **`match_geno_pheno`**: Integer vector; Indices of phenotypes matched to genotype if multiple genotype PLINK files are used. For each phenotype file in `phenotype_list`, the index of the genotype file in `genotype_list` it matches with. +- **`association_window`**: String; Genomic region of interest in the format of `chr:start-end` for the association analysis window of variants to test (cis or trans). If not provided, all genotype data will be loaded. +- **`extract_region_name`**: List of character vectors; Phenotype names (e.g., gene ID `ENSG00000269699`) to subset the phenotype data when there are multiple phenotypes availible in the region. Must have the same length as the phenotype file vector. Default is `NULL`, which will use all phenotypes in the region. +- **`region_name_col`**: Integer; 1-based index of the column containing the region name (i.e. 4 for gene ID in a bed file). Required if `extract_region_name` is not `NULL`, or if multiple phenotypes fall into the same region in one phenotype file +- **`keep_indel`**: Logical; indicating whether to keep insertions/deletions (INDELs). Default is `TRUE`. +- **`keep_samples`**: Character vector; Sample names to keep. Default is `NULL`. Currently only supports keeping the same samples from all genotype and phenotype files. +- **`maf_cutoff`**: Numeric; Minimum minor allele frequency (MAF) cutoff. Default is 0. +- **`mac_cutoff`**: Numeric; Minimum minor allele count (MAC) cutoff. Default is 0. +- **`xvar_cutoff`**: Numeric; Minimum genotype variance cutoff. Default is 0. +- **`imiss_cutoff`**: Numeric; Maximum individual missingness cutoff. Default is 0. + +outputs: +- **`region_data`**: List (with `individual_data`, `sumstat_data`); Output of the `load_multitask_regional_data` function. If only individual-level data is loaded, `sumstat_data` will be `NULL`. + + +**Indivudual-level data loading example** + +The following example demonstrates how to set up input data with 3 phenotypes and 2 cohorts. The first cohort has 2 phenotypes and the second cohort has 1 phenotype. The first phenotype has 2 genes and the second phenotype has 1 gene. + +```{r, data-loader-individual} +# Example of loading individual-level data +region = "chr1:1000000-2000000" +genotype_list = c("plink_cohort1.1", "plink_cohort1.2") +phenotype_list = c("phenotype1_cohort1.bed.gz", "phenotype2_cohort1.bed.gz", "phenotype1_cohort2.bed.gz") +covariate_list = c("covariate1_cohort1.bed.gz", "covariate2_cohort1.bed.gz", "covariate1_cohort2.bed.gz") +conditions_list_individual = c("phenotype1_cohort1", "phenotype2_cohort1", "phenotype1_cohort2") +match_geno_pheno = c(1,1,2) +association_window = "chr1:1000000-2000000" # set to be the same as region for cis-analysis +extract_region_name = list(c("ENSG00000269699, ENSG00000789633"), c("ENSG00000269699"), c("ENSG00000269699", "ENSG00000789633")) +region_name_col = 4 +keep_indel = TRUE +keep_samples = c("SAMPLE1", "SAMPLE2", "SAMPLE3") + +# Following parameters need to be set according to your data +maf_cutoff = 0.01 +mac_cutoff = 10 +xvar_cutoff = 0 +imiss_cutoff = 0.9 + +# More advanced parameters see pecotmr::load_multitask_regional_data() + +#### Comment out to avoid running this code here, as we do not have real data files in this example #### +# region_data_individual <- load_multitask_regional_data( +# region = region, +# genotype_list = genotype_list, +# phenotype_list = phenotype_list, +# covariate_list = covariate_list, +# conditions_list_individual = conditions_list_individual, +# match_geno_pheno = match_geno_pheno, +# association_window = association_window, +# region_name_col = region_name_col, +# extract_region_name = extract_region_name, +# keep_indel = keep_indel, +# keep_samples = keep_samples, +# maf_cutoff = maf_cutoff, +# mac_cutoff = mac_cutoff, +# xvar_cutoff = xvar_cutoff, +# imiss_cutoff = imiss_cutoff +# ) +#### End of comment out + +``` + + + +## 1.2. Loading summary statistics from multiple cohorts or datasets + +inputs: +- **`sumstat_path_list`**: Character vector; Paths to the summary statistics. +- **`column_file_path_list`**: Character vector; Paths to the column mapping files. See below for expected format. +- **`LD_meta_file_path_list`**: Character vector; Paths to LD metadata files. See below for expected format. +- **`conditions_list_sumstat`**: Character vector; Strings representing different sumstats used for naming. Must have the same length as the sumstat file vector. +- **`match_LD_sumstat`**: List of character vectors; Mapping each LD metadata file to the summary-statistics conditions to pair with it. Length must equal `LD_meta_file_path_list`. Each element is a character vector of names present in `conditions_list_sumstat`. If omitted or an element is empty, defaults to all conditions for the first LD. +- **`association_window`**: String; Genomic region of interest in the format of `chr:start-end` for the association analysis window of variants to test (cis or trans). If not provided, all genotype data will be loaded. +- **`n_samples`**: Integer vector; Sample size. Set a 0 if `n_cases`/`n_controls` are passed explicitly. If unknown, set as 0 and include `n_samples` column in the column mapping file to retrieve from the sumstat file. +- **`n_cases`**: Integer vector; Number of cases. Set a 0 if `n_samples` is passed explicitly. If unknown, set as 0 and include `n_cases` column in the column mapping file to retrieve from the sumstat file. +- **`n_controls`**: Integer vector; Number of controls. Set a 0 if `n_samples` is passed explicitly. If unknown, set as 0 and include `n_controls` column in the column mapping file to retrieve from the sumstat file. + +outputs: +- **`region_data`**: List (with `individual_data`, `sumstat_data`); Output of the `load_multitask_regional_data` function. If only summary statistics data is loaded, `individual_data` will be `NULL`. + +**Summary statistics loading example** + +The following example demonstrates how to set up input data with 2 summary statistics and one LD reference. + +```{r, data-loader-sumstat} +# Example of loading summary statistics +sumstat_path_list = c("sumstat1.tsv.gz", "sumstat2.tsv.gz") +column_file_path_list = c("column_mapping_sumstat1.yml", "column_mapping_sumstat2.yml") +LD_meta_file_path_list = c("ld_meta_file.tsv") +conditions_list_sumstat = c("sumstat_1", "sumstat_2") +match_LD_sumstat = c("sumstat_1", "sumstat_2") +association_window = "chr1:1000000-2000000" + +# Following parameters need to be set according to your data +n_samples = c(300000, 0) +n_cases = c(0, 20000) +n_controls = c(0, 40000) + + +# More advanced parameters see pecotmr::load_multitask_regional_data() + +#### Comment out to avoid running this code here, as we do not have real data files in this example #### +# region_data_sumstat <- load_multitask_regional_data( +# sumstat_path_list = sumstat_path_list, +# column_file_path_list = column_file_path_list, +# LD_meta_file_path_list = LD_meta_file_path_list, +# conditions_list_sumstat = conditions_list_sumstat, +# match_LD_sumstat = match_LD_sumstat, +# association_window = association_window, +# n_samples = n_samples, +# n_cases = n_cases, +# n_controls = n_controls +# ) +#### End of comment out +``` + + + +**Expected format for column mapping file** +The column mapping file is YAML (`.yml`) with key: value pairs mapping your input column names to the standardized names expected by the loader. +Required columns are `chrom`, `pos`, `A1`, and `A2`, and either `z` or `beta` and `sebeta`. +Either 'n_case' and 'n_control' or 'n_samples' can be passed as part of the column mapping, but will be overwritten by the n_cases and n_controls or n_samples parameterspassed explicitly. +```yaml +# required +chrom: chromosome +pos: position +A1: effect_allele +A2: non_effect_allele + +z: zscore +# or +beta: beta +sebeta: sebeta + +# optional, will be overridden by n_samples or n_cases and n_controls if passed explicitly +n_case: NCASE +n_control: NCONTROL +# or +n_sample: N +``` + + +**Expected format for LD metadata file** +LD files sould be in the format generated by for instance `plink --r squared`, then xz compressed. +The LD metadata file is a tab-separated file with the following columns: +- `chrom`: chromosome +- `start`: start position +- `end`: end position +- `ld_path, bim_path`: path to the LD block file and bim file +``` +1 1000000 2000000 ld_block_1.ld.gz,ld_block_1.bim +``` + + +# 2. Perform ColocBoost using `colocboost_analysis_pipeline` function + +In this section, we load region data for a combination of individual-level and summary statistics data, then perform the colocalization analysis using the `colocboost_analysis_pipeline` function. +The colocalization analysis can be run in any one of three modes, or in a combination of these modes (names assume that individual-level data is xQTL data and summary statistics data is GWAS data): + +- **`xQTL-only mode`**: Only perform colocalization analysis on the individual-level data. Summary statistics data is not used. +- **`joint GWAS mode`**: Perform colocalization analysis in disease-agnostic mode on the individual-level and summary statistics data together. +- **`separate GWAS mode`**: Perform colocalization analysis in disease-prioritized mode on the the individual-level data and each summary statistics dataset separately, treating each summary statistics dataset as the focal trait. + +inputs: +- **`region_data`**: List (with `individual_data`, `sumstat_data`); Output of the `load_multitask_regional_data` function. +- **`focal_trait`**: String; For xQTL-only mode, the name of the trait to perform disease-prioritized ColocBoost, from `conditions_list_individual`. If not provided, xQTL-only mode will be run without disease-prioritized mode. +- **`event_filters`**: List of character vectors; Patterns for filtering events based on context names. +Example: for sQTL, `list(type_pattern = ".*clu_(\\d+_[+-?]).*", valid_pattern = "clu_(\\d+_[+-?]):PR:", exclude_pattern = "clu_(\\d+_[+-?]):IN:")`. +- **`maf_cutoff`**: Numeric; Minor allele frequency cutoff. Default is 0.005. +- **`pip_cutoff_to_skip_ind`**: Integer vector; Cutoff values for skipping analysis based on pre-screening with single-effect SuSiE (L=1). Context is skipped if none of the variants in the context have PIP values greater than the cutoff. Default is 0 (does not run single-effect SuSiE). Passing a negative value sets the cutoff to 3/number of variants. +- **`pip_cutoff_to_skip_sumstat`**: Integer vector; Cutoff values for skipping analysis based on pre-screening with single-effect SuSiE (L=1). Sumstat is skipped if none of the variants in the sumstat have PIP values greater than the cutoff. Default is 0 (does not run single-effect SuSiE). Passing a negative value sets the cutoff to 3/number of variants. +- **`qc_method`**: String; Quality control method to use. Options are "rss_qc", "dentist", or "slalom". Default is `rss_qc`. +- **`impute`**: Logical; if TRUE, performs imputation for outliers identified in the analysis. Default is `TRUE`. +- **`impute_opts`**: List of lists; Imputation options including rcond, R2_threshold, and minimum_ld. Default is `list(rcond = 0.01, R2_threshold = 0.6, minimum_ld = 5)`. +- **`xqtl_coloc`**: Logical; if TRUE, performs xQTL-only mode. Default is `TRUE`. +- **`joint_gwas`**: Logical; if TRUE, performs joint GWAS mode, mapping all individual-level and sumstat data together.Default is `FALSE`. +- **`separate_gwas`**: Logical; if TRUE, runs separate GWAS mode, where each sumstat dataset is analyzed separately with all individual-level data, treating each sumstat as the focal trait in disease-prioritized mode. Default is `FALSE`. + +outputs: +- **`colocboost_results`**: List of colocboost objects (with `xqtl_coloc`, `joint_gwas`, `separate_gwas`); Output of the `colocboost_analysis_pipeline` function. If the mode is not run, the corresponding element will be `NULL`. + +```{r, colocboost-analysis} + +#### Comment out to avoid running this code here, as we do not have real data files in this example #### +#### Please check the example code below #### + +# # load in individual-level and sumstat data +# region_data_combined <- load_multitask_regional_data( +# region = region, +# genotype_list = genotype_list, +# phenotype_list = phenotype_list, +# covariate_list = covariate_list, +# conditions_list_individual = conditions_list_individual, +# match_geno_pheno = match_geno_pheno, +# association_window = association_window, +# region_name_col = region_name_col, +# extract_region_name = extract_region_name, +# keep_indel = keep_indel, +# keep_samples = keep_samples, +# maf_cutoff = maf_cutoff, +# mac_cutoff = mac_cutoff, +# xvar_cutoff = xvar_cutoff, +# imiss_cutoff = imiss_cutoff, +# sumstat_path_list = sumstat_path_list, +# column_file_path_list = column_file_path_list, +# LD_meta_file_path_list = LD_meta_file_path_list, +# conditions_list_sumstat = conditions_list_sumstat, +# match_LD_sumstat = match_LD_sumstat, +# n_samples = n_samples, +# n_cases = n_cases, +# n_controls = n_controls +# ) + +# maf_cutoff = 0.01 +# pip_cutoff_to_skip_ind = rep(0, length(phenotype_list)) +# pip_cutoff_to_skip_sumstat = rep(0, length(sumstat_path_list)) +# qc_method = "rss_qc" + +# # run colocboost analysis +# colocboost_results <- colocboost_analysis_pipeline( +# region_data_combined, +# maf_cutoff = maf_cutoff, +# pip_cutoff_to_skip_ind = pip_cutoff_to_skip_ind, +# pip_cutoff_to_skip_sumstat = pip_cutoff_to_skip_sumstat, +# qc_method = qc_method, +# xqtl_coloc = TRUE, +# joint_gwas = TRUE, +# separate_gwas = TRUE +# ) + +# # visualize results for xQTL-only mode +# colocboost_plot(colocboost_results$xqtl_coloc) + +# # visualize results for joint GWAS mode +# colocboost_plot(colocboost_results$joint_gwas) + +# # visualize results for separate GWAS mode +# for (i in 1:length(colocboost_results$separate_gwas)) { +# colocboost_plot(colocboost_results$separate_gwas[[i]]) +# } + + +#### End of comment out +``` \ No newline at end of file From 339ba4de26361c610d41a991ad549c52631942bc Mon Sep 17 00:00:00 2001 From: xuewei cao <36172337+xueweic@users.noreply.github.com> Date: Thu, 6 Nov 2025 14:56:41 -0500 Subject: [PATCH 4/5] move load regional data function to file utils --- R/colocboost_pipeline.R | 202 +--------------------------------------- R/file_utils.R | 201 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 202 insertions(+), 201 deletions(-) diff --git a/R/colocboost_pipeline.R b/R/colocboost_pipeline.R index 9abe8676..c56dcdb6 100644 --- a/R/colocboost_pipeline.R +++ b/R/colocboost_pipeline.R @@ -1,203 +1,3 @@ -#' This function loads a mixture data sets for a specific region, including individual-level data (genotype, phenotype, covariate data) -#' or summary statistics (sumstats, LD). Run \code{load_regional_univariate_data} and \code{load_rss_data} multiple times for different datasets -#' -#' @section Loading individual level data from multiple corhorts -#' @param region A string of chr:start-end for the phenotype region. -#' @param genotype_list a vector of PLINK bed file containing genotype data. -#' @param phenotype_list A vector of phenotype file names. -#' @param covariate_list A vector of covariate file names corresponding to the phenotype file vector. -#' @param conditions_list_individual A vector of strings representing different conditions or groups. -#' @param match_geno_pheno A vector of index of phentoypes matched to genotype if mulitple genotype PLINK files -#' @param maf_cutoff Minimum minor allele frequency (MAF) cutoff. Default is 0. -#' @param mac_cutoff Minimum minor allele count (MAC) cutoff. Default is 0. -#' @param xvar_cutoff Minimum variance cutoff. Default is 0. -#' @param imiss_cutoff Maximum individual missingness cutoff. Default is 0. -#' @param association_window A string of chr:start-end for the association analysis window (cis or trans). If not provided, all genotype data will be loaded. -#' @param extract_region_name A list of vectors of strings (e.g., gene ID ENSG00000269699) to subset the information when there are multiple regions available. Default is NULL. -#' @param region_name_col Column name containing the region name. Default is NULL. -#' @param keep_indel Logical indicating whether to keep insertions/deletions (INDELs). Default is TRUE. -#' @param keep_samples A vector of sample names to keep. Default is NULL. -#' @param phenotype_header Number of rows to skip at the beginning of the transposed phenotype file (default is 4 for chr, start, end, and ID). -#' @param scale_residuals Logical indicating whether to scale residuals. Default is FALSE. -#' @param tabix_header Logical indicating whether the tabix file has a header. Default is TRUE. -#' -#' @section Loading summary statistics from multiple corhorts or data set -#' @param sumstat_path_list A vector of file path to the summary statistics. -#' @param column_file_path_list A vector of file path to the column file for mapping. -#' @param LD_meta_file_path_list A vector of path of LD_metadata, LD_metadata is a data frame specifying LD blocks with columns "chrom", "start", "end", and "path". "start" and "end" denote the positions of LD blocks. "path" is the path of each LD block, optionally including bim file paths. -#' @param match_LD_sumstat A vector of index of sumstat matched to LD if mulitple sumstat files -#' @param conditions_list_sumstat A vector of strings representing different sumstats. -#' @param n_samples User-specified sample size. If unknown, set as 0 to retrieve from the sumstat file. -#' @param n_cases User-specified number of cases. -#' @param n_controls User-specified number of controls. -#' @param region The region where tabix use to subset the input dataset. -#' @param extract_sumstats_region_name User-specified gene/phenotype name used to further subset the phenotype data. -#' @param sumstats_region_name_col Filter this specific column for the extract_sumstats_region_name. -#' @param comment_string comment sign in the column_mapping file, default is # -#' @param extract_coordinates Optional data frame with columns "chrom" and "pos" for specific coordinates extraction. -#' -#' @return A list containing the individual_data and sumstat_data: -#' individual_data contains the following components if exist -#' \itemize{ -#' \item residual_Y: A list of residualized phenotype values (either a vector or a matrix). -#' \item residual_X: A list of residualized genotype matrices for each condition. -#' \item residual_Y_scalar: Scaling factor for residualized phenotype values. -#' \item residual_X_scalar: Scaling factor for residualized genotype values. -#' \item dropped_sample: A list of dropped samples for X, Y, and covariates. -#' \item covar: Covariate data. -#' \item Y: Original phenotype data. -#' \item X_data: Original genotype data. -#' \item X: Filtered genotype matrix. -#' \item maf: Minor allele frequency (MAF) for each variant. -#' \item chrom: Chromosome of the region. -#' \item grange: Genomic range of the region (start and end positions). -#' \item Y_coordinates: Phenotype coordinates if a region is specified. -#' } -#' sumstat_data contains the following components if exist -#' \itemize{ -#' \item sumstats: A list of summary statistics for the matched LD_info, each sublist contains sumstats, n, var_y from \code{load_rss_data}. -#' \item LD_info: A list of LD information, each sublist contains combined_LD_variants, combined_LD_matrix, ref_panel \code{load_LD_matrix}. -#' } -#' -#' @export -load_multitask_regional_data <- function(region, # a string of chr:start-end for phenotype region - genotype_list = NULL, # PLINK file - phenotype_list = NULL, # a vector of phenotype file names - covariate_list = NULL, # a vector of covariate file names corresponding to the phenotype file vector - conditions_list_individual = NULL, # a vector of strings - match_geno_pheno = NULL, # a vector of index of phentoypes matched to genotype if mulitple genotype files - maf_cutoff = 0, - mac_cutoff = 0, - xvar_cutoff = 0, - imiss_cutoff = 0, - association_window = NULL, - extract_region_name = NULL, - region_name_col = NULL, - keep_indel = TRUE, - keep_samples = NULL, - keep_variants = NULL, - phenotype_header = 4, # skip first 4 rows of transposed phenotype for chr, start, end and ID - scale_residuals = FALSE, - tabix_header = TRUE, - # sumstat if need - sumstat_path_list = NULL, - column_file_path_list = NULL, - LD_meta_file_path_list = NULL, - match_LD_sumstat = NULL, # a vector of index of sumstat matched to LD if mulitple sumstat files - conditions_list_sumstat = NULL, - n_samples = 0, - n_cases = 0, - n_controls = 0, - extract_sumstats_region_name = NULL, - sumstats_region_name_col = NULL, - comment_string = "#", - extract_coordinates = NULL) { - if (is.null(genotype_list) & is.null(sumstat_path_list)) { - stop("Data load error. Please make sure at least one data set (sumstat_path_list or genotype_list) exists.") - } - - # - if exist individual level data - individual_data <- NULL - if (!is.null(genotype_list)) { - #### FIXME: later if we have mulitple genotype list - if (length(genotype_list) != 1 & is.null(match_geno_pheno)) { - stop("Data load error. Please make sure 'match_geno_pheno' exists if you load data from multiple individual-level data.") - } else if (length(genotype_list) == 1 & is.null(match_geno_pheno)) { - match_geno_pheno <- rep(1, length(phenotype_list)) - } - - # - load individual data from multiple datasets - n_dataset <- unique(match_geno_pheno) ### FIXME - for (i_data in 1:n_dataset) { - # extract genotype file name - genotype <- genotype_list[i_data] - # extract phenotype and covariate file names - pos <- which(match_geno_pheno == i_data) - phenotype <- phenotype_list[pos] - covariate <- covariate_list[pos] - conditions <- conditions_list_individual[pos] - dat <- load_regional_univariate_data( - genotype = genotype, phenotype = phenotype, - covariate = covariate, region = region, - association_window = association_window, - conditions = conditions, xvar_cutoff = xvar_cutoff, - maf_cutoff = maf_cutoff, mac_cutoff = mac_cutoff, - imiss_cutoff = imiss_cutoff, keep_indel = keep_indel, - keep_samples = keep_samples, keep_variants = keep_variants, - extract_region_name = extract_region_name, - phenotype_header = phenotype_header, - region_name_col = region_name_col, - scale_residuals = scale_residuals - ) - if (is.null(individual_data)) { - individual_data <- dat - } else { - individual_data <- lapply(names(dat), function(k) { - c(individual_data[[k]], dat[[k]]) - }) - individual_data$chrom <- dat$chrom - individual_data$grange <- dat$grange - } - } - } - - # - if exist summstat data - sumstat_data <- NULL - if (!is.null(sumstat_path_list)) { - if (length(match_LD_sumstat) == 0) { - match_LD_sumstat[[1]] <- conditions_list_sumstat - } - if (length(match_LD_sumstat) != length(LD_meta_file_path_list)) { - stop("Please make sure 'match_LD_sumstat' matched 'LD_meta_file_path_list' if you load data from multiple sumstats.") - } - # - load sumstat data from multiple datasets - n_LD <- length(match_LD_sumstat) - for (i_ld in 1:n_LD) { - # extract LD meta file path name - LD_meta_file_path <- LD_meta_file_path_list[i_ld] - LD_info <- load_LD_matrix(LD_meta_file_path, - region = association_window, - extract_coordinates = extract_coordinates - ) - # extract sumstat information - conditions <- match_LD_sumstat[[i_ld]] - pos <- match(conditions, conditions_list_sumstat) - sumstats <- lapply(pos, function(ii) { - sumstat_path <- sumstat_path_list[ii] - column_file_path <- column_file_path_list[ii] - # FIXME later: when consider multiple LD reference - tmp <- load_rss_data( - sumstat_path = sumstat_path, column_file_path = column_file_path, - n_sample = n_samples[ii], n_case = n_cases[ii], n_control = n_controls[ii], - region = association_window, extract_region_name = extract_sumstats_region_name, - region_name_col = sumstats_region_name_col, comment_string = comment_string - ) - if (nrow(tmp$sumstats) == 0){ return(NULL) } - if (!("variant_id" %in% colnames(tmp$sumstats))) { - tmp$sumstats <- tmp$sumstats %>% - rowwise() %>% - mutate(variant_id = paste0(c(chrom, pos, A1, A2), collapse = ":")) - } - return(tmp) - }) - names(sumstats) <- conditions - if_no_variants <- sapply(sumstats, is.null) - if (sum(if_no_variants)!=0){ - pos_no_variants <- which(if_no_variants) - sumstats <- sumstats[-pos_no_variants] - } - sumstat_data$sumstats <- c(sumstat_data$sumstats, list(sumstats)) - sumstat_data$LD_info <- c(sumstat_data$LD_info, list(LD_info)) - } - names(sumstat_data$sumstats) <- names(sumstat_data$LD_info) <- names(match_LD_sumstat) - } - - return(list( - individual_data = individual_data, - sumstat_data = sumstat_data - )) -} - #' Multi-trait colocalization analysis pipeline #' #' This function perform a multi-trait colocalization using ColocBoost @@ -221,7 +21,7 @@ load_multitask_regional_data <- function(region, # a string of chr:start-end for #' } #' sumstat_data contains the following components if exist #' \itemize{ -#' \item sumstats: A list of summary statistics for the matched LD_info, each sublist contains sumstats, n, var_y from \code{load_rss_data}. +#' \item sumstats: A list of summary statistics f or the matched LD_info, each sublist contains sumstats, n, var_y from \code{load_rss_data}. #' \item LD_info: A list of LD information, each sublist contains combined_LD_variants, combined_LD_matrix, ref_panel \code{load_LD_matrix}. #' } #' diff --git a/R/file_utils.R b/R/file_utils.R index ce3253ff..6e425dbc 100644 --- a/R/file_utils.R +++ b/R/file_utils.R @@ -873,6 +873,207 @@ load_rss_data <- function(sumstat_path, column_file_path, n_sample = 0, n_case = return(list(sumstats = sumstats, n = n, var_y = var_y)) } + +#' This function loads a mixture data sets for a specific region, including individual-level data (genotype, phenotype, covariate data) +#' or summary statistics (sumstats, LD). Run \code{load_regional_univariate_data} and \code{load_rss_data} multiple times for different datasets +#' +#' @section Loading individual level data from multiple corhorts +#' @param region A string of chr:start-end for the phenotype region. +#' @param genotype_list a vector of PLINK bed file containing genotype data. +#' @param phenotype_list A vector of phenotype file names. +#' @param covariate_list A vector of covariate file names corresponding to the phenotype file vector. +#' @param conditions_list_individual A vector of strings representing different conditions or groups. +#' @param match_geno_pheno A vector of index of phentoypes matched to genotype if mulitple genotype PLINK files +#' @param maf_cutoff Minimum minor allele frequency (MAF) cutoff. Default is 0. +#' @param mac_cutoff Minimum minor allele count (MAC) cutoff. Default is 0. +#' @param xvar_cutoff Minimum variance cutoff. Default is 0. +#' @param imiss_cutoff Maximum individual missingness cutoff. Default is 0. +#' @param association_window A string of chr:start-end for the association analysis window (cis or trans). If not provided, all genotype data will be loaded. +#' @param extract_region_name A list of vectors of strings (e.g., gene ID ENSG00000269699) to subset the information when there are multiple regions available. Default is NULL. +#' @param region_name_col Column name containing the region name. Default is NULL. +#' @param keep_indel Logical indicating whether to keep insertions/deletions (INDELs). Default is TRUE. +#' @param keep_samples A vector of sample names to keep. Default is NULL. +#' @param phenotype_header Number of rows to skip at the beginning of the transposed phenotype file (default is 4 for chr, start, end, and ID). +#' @param scale_residuals Logical indicating whether to scale residuals. Default is FALSE. +#' @param tabix_header Logical indicating whether the tabix file has a header. Default is TRUE. +#' +#' @section Loading summary statistics from multiple corhorts or data set +#' @param sumstat_path_list A vector of file path to the summary statistics. +#' @param column_file_path_list A vector of file path to the column file for mapping. +#' @param LD_meta_file_path_list A vector of path of LD_metadata, LD_metadata is a data frame specifying LD blocks with columns "chrom", "start", "end", and "path". "start" and "end" denote the positions of LD blocks. "path" is the path of each LD block, optionally including bim file paths. +#' @param match_LD_sumstat A vector of index of sumstat matched to LD if mulitple sumstat files +#' @param conditions_list_sumstat A vector of strings representing different sumstats. +#' @param n_samples User-specified sample size. If unknown, set as 0 to retrieve from the sumstat file. +#' @param n_cases User-specified number of cases. +#' @param n_controls User-specified number of controls. +#' @param region The region where tabix use to subset the input dataset. +#' @param extract_sumstats_region_name User-specified gene/phenotype name used to further subset the phenotype data. +#' @param sumstats_region_name_col Filter this specific column for the extract_sumstats_region_name. +#' @param comment_string comment sign in the column_mapping file, default is # +#' @param extract_coordinates Optional data frame with columns "chrom" and "pos" for specific coordinates extraction. +#' +#' @return A list containing the individual_data and sumstat_data: +#' individual_data contains the following components if exist +#' \itemize{ +#' \item residual_Y: A list of residualized phenotype values (either a vector or a matrix). +#' \item residual_X: A list of residualized genotype matrices for each condition. +#' \item residual_Y_scalar: Scaling factor for residualized phenotype values. +#' \item residual_X_scalar: Scaling factor for residualized genotype values. +#' \item dropped_sample: A list of dropped samples for X, Y, and covariates. +#' \item covar: Covariate data. +#' \item Y: Original phenotype data. +#' \item X_data: Original genotype data. +#' \item X: Filtered genotype matrix. +#' \item maf: Minor allele frequency (MAF) for each variant. +#' \item chrom: Chromosome of the region. +#' \item grange: Genomic range of the region (start and end positions). +#' \item Y_coordinates: Phenotype coordinates if a region is specified. +#' } +#' sumstat_data contains the following components if exist +#' \itemize{ +#' \item sumstats: A list of summary statistics for the matched LD_info, each sublist contains sumstats, n, var_y from \code{load_rss_data}. +#' \item LD_info: A list of LD information, each sublist contains combined_LD_variants, combined_LD_matrix, ref_panel \code{load_LD_matrix}. +#' } +#' +#' @export +load_multitask_regional_data <- function(region, # a string of chr:start-end for phenotype region + genotype_list = NULL, # PLINK file + phenotype_list = NULL, # a vector of phenotype file names + covariate_list = NULL, # a vector of covariate file names corresponding to the phenotype file vector + conditions_list_individual = NULL, # a vector of strings + match_geno_pheno = NULL, # a vector of index of phentoypes matched to genotype if mulitple genotype files + maf_cutoff = 0, + mac_cutoff = 0, + xvar_cutoff = 0, + imiss_cutoff = 0, + association_window = NULL, + extract_region_name = NULL, + region_name_col = NULL, + keep_indel = TRUE, + keep_samples = NULL, + keep_variants = NULL, + phenotype_header = 4, # skip first 4 rows of transposed phenotype for chr, start, end and ID + scale_residuals = FALSE, + tabix_header = TRUE, + # sumstat if need + sumstat_path_list = NULL, + column_file_path_list = NULL, + LD_meta_file_path_list = NULL, + match_LD_sumstat = NULL, # a vector of index of sumstat matched to LD if mulitple sumstat files + conditions_list_sumstat = NULL, + n_samples = 0, + n_cases = 0, + n_controls = 0, + extract_sumstats_region_name = NULL, + sumstats_region_name_col = NULL, + comment_string = "#", + extract_coordinates = NULL) { + if (is.null(genotype_list) & is.null(sumstat_path_list)) { + stop("Data load error. Please make sure at least one data set (sumstat_path_list or genotype_list) exists.") + } + + # - if exist individual level data + individual_data <- NULL + if (!is.null(genotype_list)) { + #### FIXME: later if we have mulitple genotype list + if (length(genotype_list) != 1 & is.null(match_geno_pheno)) { + stop("Data load error. Please make sure 'match_geno_pheno' exists if you load data from multiple individual-level data.") + } else if (length(genotype_list) == 1 & is.null(match_geno_pheno)) { + match_geno_pheno <- rep(1, length(phenotype_list)) + } + + # - load individual data from multiple datasets + n_dataset <- unique(match_geno_pheno) ### FIXME + for (i_data in 1:n_dataset) { + # extract genotype file name + genotype <- genotype_list[i_data] + # extract phenotype and covariate file names + pos <- which(match_geno_pheno == i_data) + phenotype <- phenotype_list[pos] + covariate <- covariate_list[pos] + conditions <- conditions_list_individual[pos] + dat <- load_regional_univariate_data( + genotype = genotype, phenotype = phenotype, + covariate = covariate, region = region, + association_window = association_window, + conditions = conditions, xvar_cutoff = xvar_cutoff, + maf_cutoff = maf_cutoff, mac_cutoff = mac_cutoff, + imiss_cutoff = imiss_cutoff, keep_indel = keep_indel, + keep_samples = keep_samples, keep_variants = keep_variants, + extract_region_name = extract_region_name, + phenotype_header = phenotype_header, + region_name_col = region_name_col, + scale_residuals = scale_residuals + ) + if (is.null(individual_data)) { + individual_data <- dat + } else { + individual_data <- lapply(names(dat), function(k) { + c(individual_data[[k]], dat[[k]]) + }) + individual_data$chrom <- dat$chrom + individual_data$grange <- dat$grange + } + } + } + + # - if exist summstat data + sumstat_data <- NULL + if (!is.null(sumstat_path_list)) { + if (length(match_LD_sumstat) == 0) { + match_LD_sumstat[[1]] <- conditions_list_sumstat + } + if (length(match_LD_sumstat) != length(LD_meta_file_path_list)) { + stop("Please make sure 'match_LD_sumstat' matched 'LD_meta_file_path_list' if you load data from multiple sumstats.") + } + # - load sumstat data from multiple datasets + n_LD <- length(match_LD_sumstat) + for (i_ld in 1:n_LD) { + # extract LD meta file path name + LD_meta_file_path <- LD_meta_file_path_list[i_ld] + LD_info <- load_LD_matrix(LD_meta_file_path, + region = association_window, + extract_coordinates = extract_coordinates + ) + # extract sumstat information + conditions <- match_LD_sumstat[[i_ld]] + pos <- match(conditions, conditions_list_sumstat) + sumstats <- lapply(pos, function(ii) { + sumstat_path <- sumstat_path_list[ii] + column_file_path <- column_file_path_list[ii] + # FIXME later: when consider multiple LD reference + tmp <- load_rss_data( + sumstat_path = sumstat_path, column_file_path = column_file_path, + n_sample = n_samples[ii], n_case = n_cases[ii], n_control = n_controls[ii], + region = association_window, extract_region_name = extract_sumstats_region_name, + region_name_col = sumstats_region_name_col, comment_string = comment_string + ) + if (nrow(tmp$sumstats) == 0){ return(NULL) } + if (!("variant_id" %in% colnames(tmp$sumstats))) { + tmp$sumstats <- tmp$sumstats %>% + rowwise() %>% + mutate(variant_id = paste0(c(chrom, pos, A1, A2), collapse = ":")) + } + return(tmp) + }) + names(sumstats) <- conditions + if_no_variants <- sapply(sumstats, is.null) + if (sum(if_no_variants)!=0){ + pos_no_variants <- which(if_no_variants) + sumstats <- sumstats[-pos_no_variants] + } + sumstat_data$sumstats <- c(sumstat_data$sumstats, list(sumstats)) + sumstat_data$LD_info <- c(sumstat_data$LD_info, list(LD_info)) + } + names(sumstat_data$sumstats) <- names(sumstat_data$LD_info) <- names(match_LD_sumstat) + } + + return(list( + individual_data = individual_data, + sumstat_data = sumstat_data + )) +} + #' Load and filter tabular data with optional region subsetting #' #' This function loads summary statistics data from tabular files (TSV, TXT). From 66cac31f195e514ca50ca64196b42246aa91b77c Mon Sep 17 00:00:00 2001 From: xueweic Date: Thu, 6 Nov 2025 19:57:56 +0000 Subject: [PATCH 5/5] Update documentation --- man/colocboost_analysis_pipeline.Rd | 2 +- man/load_multitask_regional_data.Rd | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/man/colocboost_analysis_pipeline.Rd b/man/colocboost_analysis_pipeline.Rd index bfa7f1bb..a96cbfe5 100644 --- a/man/colocboost_analysis_pipeline.Rd +++ b/man/colocboost_analysis_pipeline.Rd @@ -49,7 +49,7 @@ individual_data contains the following components if exist } sumstat_data contains the following components if exist \itemize{ - \item sumstats: A list of summary statistics for the matched LD_info, each sublist contains sumstats, n, var_y from \code{load_rss_data}. + \item sumstats: A list of summary statistics f or the matched LD_info, each sublist contains sumstats, n, var_y from \code{load_rss_data}. \item LD_info: A list of LD information, each sublist contains combined_LD_variants, combined_LD_matrix, ref_panel \code{load_LD_matrix}. } } diff --git a/man/load_multitask_regional_data.Rd b/man/load_multitask_regional_data.Rd index 1e3d1457..69528b43 100644 --- a/man/load_multitask_regional_data.Rd +++ b/man/load_multitask_regional_data.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/colocboost_pipeline.R +% Please edit documentation in R/file_utils.R \name{load_multitask_regional_data} \alias{load_multitask_regional_data} \title{This function loads a mixture data sets for a specific region, including individual-level data (genotype, phenotype, covariate data)