diff --git a/R/colocboost_pipeline.R b/R/colocboost_pipeline.R index 2b22d712..5b98aab4 100644 --- a/R/colocboost_pipeline.R +++ b/R/colocboost_pipeline.R @@ -554,6 +554,7 @@ qc_regional_data <- function(region_data, qc_method = c("rss_qc", "dentist", "slalom"), impute = TRUE, impute_opts = list(rcond = 0.01, R2_threshold = 0.6, minimum_ld = 5, lamb = 0.01)) { + if (!is.null(qc_method)) qc_method <- match.arg(qc_method) #### related internal functions # Add context names to colname of Y if missing add_context_to_Y <- function(res_Y) { @@ -614,6 +615,15 @@ qc_regional_data <- function(region_data, # - add context to colname of Y Y <- add_context_to_Y(Y) n_context <- length(X) + # Validate pip_cutoff_to_skip_ind length: allow scalar or exact length; otherwise error + if (length(pip_cutoff_to_skip_ind) == 1) { + pip_cutoff_vec <- rep(pip_cutoff_to_skip_ind, n_context) + } else if (length(pip_cutoff_to_skip_ind) == n_context) { + pip_cutoff_vec <- pip_cutoff_to_skip_ind + } else { + stop(sprintf("pip_cutoff_to_skip_ind length (%d) must be 1 or equal to number of contexts (%d)", + length(pip_cutoff_to_skip_ind), n_context)) + } residual_X <- residual_Y <- list() keep_contexts <- c() for (i_context in 1:n_context) { @@ -625,8 +635,8 @@ qc_regional_data <- function(region_data, # - remove variants with maf < maf_cutoff # tmp <- filter_resX_maf(resX, maf, maf_cutoff = maf_cutoff) resX <- filter_X(resX, missing_rate_thresh = NULL, maf_thresh = maf_cutoff, maf = maf) - # Initial PIP check - resY <- filter_resY_pip(resX, resY, pip_cutoff = pip_cutoff_to_skip_ind[i_context], context = context) + # Initial PIP check (use recycled vector element) + resY <- filter_resY_pip(resX, resY, pip_cutoff = pip_cutoff_vec[i_context], context = context) if (!is.null(resY)) { residual_X <- c(residual_X, list(resX)) residual_Y <- c(residual_Y, list(resY)) @@ -675,6 +685,7 @@ qc_regional_data <- function(region_data, qc_method = c("rss_qc", "dentist", "slalom"), impute = TRUE, impute_opts = list(rcond = 0.01, R2_threshold = 0.6, minimum_ld = 5, lamb = 0.01)) { + if (!is.null(qc_method)) qc_method <- match.arg(qc_method) n_LD <- length(sumstat_data$LD_info) final_sumstats <- final_LD <- NULL LD_match <- c() @@ -687,7 +698,26 @@ qc_regional_data <- function(region_data, n <- sumstat$n var_y <- sumstat$var_y conditions_sumstat <- names(sumstats)[ii] - pip_cutoff_to_skip_ld <- pip_cutoff_to_skip_sumstat[conditions_sumstat] %>% as.numeric() + # Determine PIP cutoff per sumstat with strict validation: + # - scalar applies to all studies + # - length equals number of studies -> positional mapping + # - named vector that covers ALL study names -> name-based mapping + # Otherwise: throw error (do not silently default) + if (length(pip_cutoff_to_skip_sumstat) == 1) { + pip_cutoff_to_skip_ld <- as.numeric(pip_cutoff_to_skip_sumstat) + } else if (length(pip_cutoff_to_skip_sumstat) == length(sumstats) && is.null(names(pip_cutoff_to_skip_sumstat))) { + pip_cutoff_to_skip_ld <- as.numeric(pip_cutoff_to_skip_sumstat[ii]) + } else if (!is.null(names(pip_cutoff_to_skip_sumstat))) { + missing_names <- setdiff(names(sumstats), names(pip_cutoff_to_skip_sumstat)) + if (length(missing_names) > 0) { + stop(sprintf("pip_cutoff_to_skip_sumstat must provide values for all studies; missing: %s", + paste(missing_names, collapse = ", "))) + } + pip_cutoff_to_skip_ld <- as.numeric(pip_cutoff_to_skip_sumstat[conditions_sumstat]) + } else { + stop(sprintf("pip_cutoff_to_skip_sumstat length (%d) must be 1, equal to number of studies (%d), or be a named vector covering all studies", + length(pip_cutoff_to_skip_sumstat), length(sumstats))) + } # Preprocess the input data preprocess_results <- rss_basic_qc(sumstat$sumstats, LD_data, remove_indels = remove_indels) diff --git a/R/file_utils.R b/R/file_utils.R index ce3253ff..6e2444a6 100644 --- a/R/file_utils.R +++ b/R/file_utils.R @@ -243,7 +243,20 @@ load_phenotype_data <- function(phenotype_path, region, extract_region_name = NU stop("region_name_col is out of bounds for the number of columns in tabix_data.") } } else { - return(tabix_data %>% t()) + # No extract_region_name filter: still transpose and assign names appropriately + result <- tabix_data %>% t() + if (!is.null(region_name_col) && (region_name_col %% 1 == 0)) { + if (region_name_col <= ncol(tabix_data)) { + # After transpose, the desired name row is at index region_name_col + colnames(result) <- result[region_name_col, ] + } else { + stop("region_name_col is out of bounds for the number of columns in tabix_data.") + } + } else if (is.null(colnames(result))) { + # Fallback: synthesize names if still missing + colnames(result) <- paste0("region_", seq_len(ncol(result))) + } + return(result) } }))