StatFunGen · gaow · Nov 7, 2025 · May 14, 2025 · May 14, 2025 · Nov 6, 2025
diff --git a/R/colocboost_pipeline.R b/R/colocboost_pipeline.R
@@ -1,197 +1,3 @@
-#' This function loads a mixture data sets for a specific region, including individual-level data (genotype, phenotype, covariate data)
-#' or summary statistics (sumstats, LD). Run \code{load_regional_univariate_data} and \code{load_rss_data} multiple times for different datasets
-#'
-#' @section Loading individual level data from multiple corhorts
-#' @param region A string of chr:start-end for the phenotype region.
-#' @param genotype_list a vector of PLINK bed file containing genotype data.
-#' @param phenotype_list A vector of phenotype file names.
-#' @param covariate_list A vector of covariate file names corresponding to the phenotype file vector.
-#' @param conditions_list_individual A vector of strings representing different conditions or groups.
-#' @param match_geno_pheno A vector of index of phentoypes matched to genotype if mulitple genotype PLINK files
-#' @param maf_cutoff Minimum minor allele frequency (MAF) cutoff. Default is 0.
-#' @param mac_cutoff Minimum minor allele count (MAC) cutoff. Default is 0.
-#' @param xvar_cutoff Minimum variance cutoff. Default is 0.
-#' @param imiss_cutoff Maximum individual missingness cutoff. Default is 0.
-#' @param association_window A string of chr:start-end for the association analysis window (cis or trans). If not provided, all genotype data will be loaded.
-#' @param extract_region_name A list of vectors of strings (e.g., gene ID ENSG00000269699) to subset the information when there are multiple regions available. Default is NULL.
-#' @param region_name_col Column name containing the region name. Default is NULL.
-#' @param keep_indel Logical indicating whether to keep insertions/deletions (INDELs). Default is TRUE.
-#' @param keep_samples A vector of sample names to keep. Default is NULL.
-#' @param phenotype_header Number of rows to skip at the beginning of the transposed phenotype file (default is 4 for chr, start, end, and ID).
-#' @param scale_residuals Logical indicating whether to scale residuals. Default is FALSE.
-#' @param tabix_header Logical indicating whether the tabix file has a header. Default is TRUE.
-#'
-#' @section Loading summary statistics from multiple corhorts or data set
-#' @param sumstat_path_list A vector of file path to the summary statistics.
-#' @param column_file_path_list A vector of file path to the column file for mapping.
-#' @param LD_meta_file_path_list A vector of path of LD_metadata, LD_metadata is a data frame specifying LD blocks with columns "chrom", "start", "end", and "path". "start" and "end" denote the positions of LD blocks. "path" is the path of each LD block, optionally including bim file paths.
-#' @param match_LD_sumstat A vector of index of sumstat matched to LD if mulitple sumstat files
-#' @param conditions_list_sumstat A vector of strings representing different sumstats.
-#' @param n_samples User-specified sample size. If unknown, set as 0 to retrieve from the sumstat file.
-#' @param n_cases User-specified number of cases.
-#' @param n_controls User-specified number of controls.
-#' @param region The region where tabix use to subset the input dataset.
-#' @param extract_sumstats_region_name User-specified gene/phenotype name used to further subset the phenotype data.
-#' @param sumstats_region_name_col Filter this specific column for the extract_sumstats_region_name.
-#' @param comment_string comment sign in the column_mapping file, default is #
-#' @param extract_coordinates Optional data frame with columns "chrom" and "pos" for specific coordinates extraction.
-#'
-#' @return A list containing the individual_data and sumstat_data:
-#' individual_data contains the following components if exist
-#' \itemize{
-#'   \item residual_Y: A list of residualized phenotype values (either a vector or a matrix).
-#'   \item residual_X: A list of residualized genotype matrices for each condition.
-#'   \item residual_Y_scalar: Scaling factor for residualized phenotype values.
-#'   \item residual_X_scalar: Scaling factor for residualized genotype values.
-#'   \item dropped_sample: A list of dropped samples for X, Y, and covariates.
-#'   \item covar: Covariate data.
-#'   \item Y: Original phenotype data.
-#'   \item X_data: Original genotype data.
-#'   \item X: Filtered genotype matrix.
-#'   \item maf: Minor allele frequency (MAF) for each variant.
-#'   \item chrom: Chromosome of the region.
-#'   \item grange: Genomic range of the region (start and end positions).
-#'   \item Y_coordinates: Phenotype coordinates if a region is specified.
-#' }
-#' sumstat_data contains the following components if exist
-#' \itemize{
-#'   \item sumstats: A list of summary statistics for the matched LD_info, each sublist contains sumstats, n, var_y from \code{load_rss_data}.
-#'   \item LD_info: A list of LD information, each sublist contains combined_LD_variants, combined_LD_matrix, ref_panel  \code{load_LD_matrix}.
-#' }
-#'
-#' @export
-load_multitask_regional_data <- function(region, # a string of chr:start-end for phenotype region
-                                         genotype_list = NULL, # PLINK file
-                                         phenotype_list = NULL, # a vector of phenotype file names
-                                         covariate_list = NULL, # a vector of covariate file names corresponding to the phenotype file vector
-                                         conditions_list_individual = NULL, # a vector of strings
-                                         match_geno_pheno = NULL, # a vector of index of phentoypes matched to genotype if mulitple genotype files
-                                         maf_cutoff = 0,
-                                         mac_cutoff = 0,
-                                         xvar_cutoff = 0,
-                                         imiss_cutoff = 0,
-                                         association_window = NULL,
-                                         extract_region_name = NULL,
-                                         region_name_col = NULL,
-                                         keep_indel = TRUE,
-                                         keep_samples = NULL,
-                                         keep_variants = NULL,
-                                         phenotype_header = 4, # skip first 4 rows of transposed phenotype for chr, start, end and ID
-                                         scale_residuals = FALSE,
-                                         tabix_header = TRUE,
-                                         # sumstat if need
-                                         sumstat_path_list = NULL,
-                                         column_file_path_list = NULL,
-                                         LD_meta_file_path_list = NULL,
-                                         match_LD_sumstat = NULL, # a vector of index of sumstat matched to LD if mulitple sumstat files
-                                         conditions_list_sumstat = NULL,
-                                         n_samples = 0,
-                                         n_cases = 0,
-                                         n_controls = 0,
-                                         extract_sumstats_region_name = NULL,
-                                         sumstats_region_name_col = NULL,
-                                         comment_string = "#",
-                                         extract_coordinates = NULL) {
-  if (is.null(genotype_list) & is.null(sumstat_path_list)) {
-    stop("Data load error. Please make sure at least one data set (sumstat_path_list or genotype_list) exists.")
-  }
-
-  # - if exist individual level data
-  individual_data <- NULL
-  if (!is.null(genotype_list)) {
-    #### FIXME: later if we have mulitple genotype list
-    if (length(genotype_list) != 1 & is.null(match_geno_pheno)) {
-      stop("Data load error. Please make sure 'match_geno_pheno' exists if you load data from multiple individual-level data.")
-    } else if (length(genotype_list) == 1 & is.null(match_geno_pheno)) {
-      match_geno_pheno <- rep(1, length(phenotype_list))
-    }
-
-    # - load individual data from multiple datasets
-    n_dataset <- unique(match_geno_pheno) ### FIXME
-    for (i_data in 1:n_dataset) {
-      # extract genotype file name
-      genotype <- genotype_list[i_data]
-      # extract phenotype and covariate file names
-      pos <- which(match_geno_pheno == i_data)
-      phenotype <- phenotype_list[pos]
-      covariate <- covariate_list[pos]
-      conditions <- conditions_list_individual[pos]
-      dat <- load_regional_univariate_data(
-        genotype = genotype, phenotype = phenotype,
-        covariate = covariate, region = region,
-        association_window = association_window,
-        conditions = conditions, xvar_cutoff = xvar_cutoff,
-        maf_cutoff = maf_cutoff, mac_cutoff = mac_cutoff,
-        imiss_cutoff = imiss_cutoff, keep_indel = keep_indel,
-        keep_samples = keep_samples, keep_variants = keep_variants,
-        extract_region_name = extract_region_name,
-        phenotype_header = phenotype_header,
-        region_name_col = region_name_col,
-        scale_residuals = scale_residuals
-      )
-      if (is.null(individual_data)) {
-        individual_data <- dat
-      } else {
-        individual_data <- lapply(names(dat), function(k) {
-          c(individual_data[[k]], dat[[k]])
-        })
-        individual_data$chrom <- dat$chrom
-        individual_data$grange <- dat$grange
-      }
-    }
-  }
-
-  # - if exist summstat data
-  sumstat_data <- NULL
-  if (!is.null(sumstat_path_list)) {
-    if (length(match_LD_sumstat) == 0) {
-      match_LD_sumstat[[1]] <- conditions_list_sumstat
-    }
-    if (length(match_LD_sumstat) != length(LD_meta_file_path_list)) {
-      stop("Please make sure 'match_LD_sumstat' matched 'LD_meta_file_path_list' if you load data from multiple sumstats.")
-    }
-    # - load sumstat data from multiple datasets
-    n_LD <- length(match_LD_sumstat)
-    for (i_ld in 1:n_LD) {
-      # extract LD meta file path name
-      LD_meta_file_path <- LD_meta_file_path_list[i_ld]
-      LD_info <- load_LD_matrix(LD_meta_file_path,
-        region = association_window,
-        extract_coordinates = extract_coordinates
-      )
-      # extract sumstat information
-      conditions <- match_LD_sumstat[[i_ld]]
-      pos <- match(conditions, conditions_list_sumstat)
-      sumstats <- lapply(pos, function(ii) {
-        sumstat_path <- sumstat_path_list[ii]
-        column_file_path <- column_file_path_list[ii]
-        # FIXME later: when consider multiple LD reference
-        tmp <- load_rss_data(
-          sumstat_path = sumstat_path, column_file_path = column_file_path,
-          n_sample = n_samples[ii], n_case = n_cases[ii], n_control = n_controls[ii],
-          region = association_window, extract_region_name = extract_sumstats_region_name,
-          region_name_col = sumstats_region_name_col, comment_string = comment_string
-        )
-        if (!("variant_id" %in% colnames(tmp$sumstats))) {
-          tmp$sumstats <- tmp$sumstats %>%
-            rowwise() %>%
-            mutate(variant_id = paste0(c(chrom, pos, A1, A2), collapse = ":"))
-        }
-        return(tmp)
-      })
-      names(sumstats) <- conditions
-      sumstat_data$sumstats <- c(sumstat_data$sumstats, list(sumstats))
-      sumstat_data$LD_info <- c(sumstat_data$LD_info, list(LD_info))
-    }
-    names(sumstat_data$sumstats) <- names(sumstat_data$LD_info) <- names(match_LD_sumstat)
-  }
-
-  return(list(
-    individual_data = individual_data,
-    sumstat_data = sumstat_data
-  ))
-}
-
 #' Multi-trait colocalization analysis pipeline
 #'
 #' This function perform a multi-trait colocalization using ColocBoost
@@ -215,7 +21,7 @@ load_multitask_regional_data <- function(region, # a string of chr:start-end for
 #' }
 #' sumstat_data contains the following components if exist
 #' \itemize{
-#'   \item sumstats: A list of summary statistics for the matched LD_info, each sublist contains sumstats, n, var_y from \code{load_rss_data}.
+#'   \item sumstats: A list of summary statistics f or the matched LD_info, each sublist contains sumstats, n, var_y from \code{load_rss_data}.
 #'   \item LD_info: A list of LD information, each sublist contains combined_LD_variants, combined_LD_matrix, ref_panel  \code{load_LD_matrix}.
 #' }
 #'
@@ -744,7 +550,7 @@ qc_regional_data <- function(region_data,
             final_LD <- c(final_LD, list(LD_mat) %>% setNames(conditions_sumstat))
             LD_match <- c(LD_match, conditions_sumstat)
           } else {
-            LD_match <- c(LD_match, LD_match[pos[1]])
+            LD_match <- c(LD_match, names(final_LD)[pos[1]])
           }
         }
       }