From 28dcfa95b797213d9fe8d72c60c60a44c3b4f137 Mon Sep 17 00:00:00 2001 From: scheidec Date: Tue, 4 Nov 2025 15:03:13 -0500 Subject: [PATCH 01/10] Add medianNormalize() function - added new medianNormalize() function for median normalization of soma_adat objects - supports different methods for passing a reference: calculating an internal from the provided soma_adat with the ability to select specific samples, or passing an external ADAT, tab or comma delimited file or data.frame with Dilution and Reference columns - includes validation for required hybridization and plate scale normalization steps --- NAMESPACE | 2 + NEWS.md | 14 +- R/medianNormalize.R | 667 +++++++++++++++++++++++ inst/WORDLIST | 2 + man/medianNormalize.Rd | 141 +++++ man/soma_adat.Rd | 22 +- tests/testthat/_snaps/medianNormalize.md | 17 + tests/testthat/test-medianNormalize.R | 363 ++++++++++++ 8 files changed, 1217 insertions(+), 11 deletions(-) create mode 100644 R/medianNormalize.R create mode 100644 man/medianNormalize.Rd create mode 100644 tests/testthat/_snaps/medianNormalize.md create mode 100644 tests/testthat/test-medianNormalize.R diff --git a/NAMESPACE b/NAMESPACE index d0754cee..ffe655c4 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -112,6 +112,7 @@ export(lift_adat) export(loadAdatsAsList) export(locateSeqId) export(matchSeqIds) +export(medianNormalize) export(meltExpressionSet) export(merge_clin) export(mutate) @@ -218,5 +219,6 @@ importFrom(utils,capture.output) importFrom(utils,head) importFrom(utils,read.csv) importFrom(utils,read.delim) +importFrom(utils,read.table) importFrom(utils,tail) importFrom(utils,write.table) diff --git a/NEWS.md b/NEWS.md index 1c689a83..35010b9f 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,17 @@ # SomaDataIO (6.5.0.9000) +### New Functions + +* Added `medianNormalize()` function + - performs median normalization on `soma_adat` objects that have been + hybridization normalized and plate scaled + - includes validation to ensure required normalization steps have been applied + - supports multiple reference approaches: + - Internal reference built from study samples + - Reference extracted from existing `soma_adat` object + - External reference as a data.frame + - supports custom grouping by multiple clinical variables + ### Function and Object Improvements * Removed restrictive file validation from `read_annotations()` @@ -35,7 +47,7 @@ * Updated `dplyr` verb tests to no longer explicitly test for ordering of attributes (#165) - + # SomaDataIO (6.4.0) ### New Functions diff --git a/R/medianNormalize.R b/R/medianNormalize.R new file mode 100644 index 00000000..ba24e334 --- /dev/null +++ b/R/medianNormalize.R @@ -0,0 +1,667 @@ +#' Perform Median Normalization +#' +#' @description Performs median normalization on a `soma_adat` object that has +#' already undergone hybridization normalization and contains plate scale +#' normalization factors. +#' +#' Median normalization is a technique used to adjust data sets to remove +#' certain assay artifacts and biases prior to analysis, particularly those that +#' may arise due to differences in overall protein concentration, pipetting +#' errors, reagent concentration changes, assay timing, and other systematic +#' variabilities. Median normalization can improve assay precision and reduce +#' technical variations that can mask true biological signals. +#' +#' This function works by calculating or using reference median values for each +#' dilution group present in the ADAT. Dilution groups are determined from the +#' analyte metadata and represent different sample dilutions used in the assay +#' (e.g., 1:1, 1:200, 1:2000). For each sample, the median RFU across all +#' analytes within a dilution group is calculated and compared to the reference +#' median for that group. Scale factors are then computed and applied to adjust +#' each sample's values toward the reference, ensuring consistent signal levels +#' across samples within each dilution group. This function supports multiple +#' reference approaches: calculating an internal reference from study samples, +#' or using a calculated reference from a different ADAT or external reference +#' file. +#' +#' @section Important Considerations: +#' \itemize{ +#' \item If there is a known change in total protein concentration due to analysis +#' conditions, perform normalization _within_ groups using the `by` parameter +#' \item In newer SomaScan assay versions, population references improve +#' inter-plate consistency compared to within-plate references +#' } +#' +#' @param adat A `soma_adat` object created using [read_adat()], containing +#' RFU values that have been hybridization normalized and plate scaled. +#' @param reference Optional. Reference for median normalization. Can be: +#' \itemize{ +#' \item `NULL` (default): Use internal reference from study samples +#' \item A `soma_adat` object: Extract reference from this ADAT +#' \item A file path (character): Read external reference from tab/comma-separated file +#' \item A data.frame: Use provided reference data directly +#' } +#' When providing an external reference file or data.frame it must contain: +#' \describe{ +#' \item{Dilution}{Character column specifying the dilution group names +#' (e.g., "0_005", "0_5", "20"). These should match the dilution +#' groups present in the ADAT analyte data.} +#' \item{Reference}{Numeric column containing the reference median values +#' for each dilution group. These values will be used as the target +#' medians for normalization.} +#' \item{SeqId}{Optional character column. When included, SeqId-specific +#' reference values are used for more precise normalization. Values +#' must be in "10000-28" format, not "seq.10000.28" format.} +#' } +#' @param ref_field Character. Field used to select reference samples when +#' `reference = NULL`. Default is `"SampleType"`. +#' @param ref_value Character. Value(s) in `ref_field` to use as reference +#' when `reference = NULL`. Default is `c("QC", "Sample")`. +#' @param by Character vector. Grouping variable(s) for grouped median normalization. +#' Must be column name(s) in the ADAT. Use grouping when there are known changes +#' in total protein load due to analysis conditions (e.g., disease vs. control, +#' treatment vs. baseline). Normalization will be performed within each group +#' separately. Default is `"SampleType"`. +#' @param do_field Character. The field used to select samples for normalization +#' (others keep original values). Default is `"SampleType"`. +#' @param do_regexp Character. A regular expression pattern to select samples +#' from `do_field` for normalization. Default is `"QC|Sample"`. +#' @param verbose Logical. Should progress messages be printed? Default is `TRUE`. +#' @return A `soma_adat` object with median normalization applied and RFU values +#' adjusted. The existing `NormScale_*` columns are updated to include the +#' effects of both plate scale normalization and median normalization. +#' @examples +#' \dontrun{ +#' # Internal reference from study samples (default) +#' # Use when you have representative QC or control samples in your study +#' med_norm_adat <- medianNormalize(adat) +#' +#' # Reference from specific samples in the ADAT +#' # Use when you want to normalize against only QC samples +#' med_norm_adat <- medianNormalize(adat, +#' ref_field = "SampleType", +#' ref_value = "QC") +#' +#' # Reference from another ADAT +#' # Use when you have a reference population or control study +#' ref_adat <- read_adat("reference_file.adat") +#' med_norm_adat <- medianNormalize(adat, reference = ref_adat) +#' +#' # External reference file +#' # Use when you have pre-calculated reference medians for each dilution +#' med_norm_adat <- medianNormalize(adat, reference = "reference_file.csv") +#' +#' # External reference as data.frame +#' # Use for programmatic control over reference values +#' ref_data <- data.frame( +#' Dilution = c("0_005", "0_5", "20"), +#' Reference = c(1500.2, 3200.8, 4100.5) +#' ) +#' med_norm_adat <- medianNormalize(adat, reference = ref_data) +#' +#' # Custom grouping by multiple variables +#' # Use when total protein load changes due to analysis conditions +#' # (normalize within groups to account for expected biological differences) +#' med_norm_adat <- medianNormalize(adat, by = c("Sex", "SampleType")) +#' +#' # Normalize only specific sample types +#' # Use when you want to preserve original values for certain sample types +#' med_norm_adat <- medianNormalize(adat, +#' do_field = "SampleType", +#' do_regexp = "Sample") +#' } +#' @importFrom dplyr filter +#' @importFrom stats median +#' @importFrom utils read.table read.csv +#' @export +medianNormalize <- function(adat, + reference = NULL, + ref_field = "SampleType", + ref_value = c("QC", "Sample"), + by = "SampleType", + do_field = "SampleType", + do_regexp = "QC|Sample", + verbose = TRUE) { + + # Input validation ---- + stopifnot("`adat` must be a class `soma_adat` object" = is.soma_adat(adat)) + + # Check that required normalization steps have been applied ---- + header <- attr(adat, "Header.Meta")$HEADER + + if (is.null(header)) { + stop("ADAT header metadata is missing", call. = FALSE) + } + + # Check for hybrid normalization + if (!"ProcessSteps" %in% names(header) || + !grepl("Hyb Normalization", header$ProcessSteps, ignore.case = TRUE)) { + stop( + "Hybrid normalization step not detected in ProcessSteps. ", + "Please apply hybrid normalization before median normalization.", + call. = FALSE + ) + } + + # Check for plate scale factors + norm_cols <- grep("^(Norm|norm)[Ss]cale", names(adat), value = TRUE) + if (length(norm_cols) == 0) { + stop( + "No normalization scale factor columns found. ", + "Please ensure plate scale normalization has been applied.", + call. = FALSE + ) + } + + # Create dilution groups ---- + apt_data <- getAnalyteInfo(adat) + + if (!"Dilution" %in% names(apt_data)) { + stop("Dilution information not found in analyte data", call. = FALSE) + } + + # Filter out hybridization controls + apt_data <- filter(apt_data, !grepl("^Hybridization", Type, ignore.case = TRUE)) + + # Create dilution groups + dil_groups <- split(apt_data$AptName, apt_data$Dilution) + + # Clean up dilution names + names(dil_groups) <- gsub("\\.", "_", names(dil_groups)) + names(dil_groups) <- gsub("[.]0$|%|^[.]", "", names(dil_groups)) + + if (length(dil_groups) < 2L) { + warning("Fewer than 2 dilution groups detected", call. = FALSE) + } + + # Check for existing normalization scale factors ---- + existing_norm_sf <- grep("^NormScale_", names(adat), value = TRUE) + + if ( verbose ) { + if (length(existing_norm_sf) > 0) { + .todo("Normalization scale factors already exist: {.val {paste0(existing_norm_sf, collapse = ', ')}} - they will be replaced with new scale factors") + } + } + + + # Determine which samples to normalize ---- + if (!is.null(do_field) && !is.null(do_regexp)) { + if (!do_field %in% names(adat)) { + stop("Field `", do_field, "` not found in adat columns", call. = FALSE) + } + + do_samples <- grep(do_regexp, adat[[do_field]]) + if (length(do_samples) == 0L) { + stop( + "No samples selected for normalization with pattern: ", do_regexp, + call. = FALSE + ) + } + dont_samples <- setdiff(seq_len(nrow(adat)), do_samples) + } else { + do_samples <- seq_len(nrow(adat)) + dont_samples <- NULL + } + + # Process reference ---- + if (is.null(reference)) { + # Check if ref_field conflicts with grouping variables + conflicts_with_grouping <- FALSE + if (ref_field %in% by) { + samples_to_normalize <- adat[do_samples, ] + group_values <- unique(samples_to_normalize[[ref_field]]) + group_values <- group_values[!is.na(group_values)] + all_groups_in_ref <- all(group_values %in% ref_value) + conflicts_with_grouping <- !identical(by, ref_field) || !all_groups_in_ref + } + + if (conflicts_with_grouping) { + # Calculate global reference to avoid groups lacking reference samples + if (verbose) { + .todo("Building global internal reference from field: {.val {ref_field}} with values: {.val {ref_value}}") + } + ref_data <- .buildInternalReference(adat, ref_field, ref_value, dil_groups) + } else { + # Standard internal reference - calculate per group + ref_data <- NULL + if (verbose) { + .todo("Building internal reference from field: {.val {ref_field}} with values: {.val {ref_value}}") + } + } + } else { + ref_data <- .processReference(reference, ref_field, ref_value, adat, dil_groups, apt_data, verbose) + } + + # Add row identifier to maintain order + adat$.rowid <- seq_len(nrow(adat)) + + # Perform median normalization on selected samples + if (length(do_samples) > 0) { + norm_adat <- .performMedianNorm( + adat[do_samples, ], + dil_groups = dil_groups, + by = by, + ref_data = ref_data, + ref_field = ref_field, + ref_value = ref_value, + verbose = verbose + ) + } + + # Handle samples that were not normalized + if (!is.null(dont_samples) && length(dont_samples) > 0) { + unnorm_adat <- adat[dont_samples, ] + sf_cols <- paste0("NormScale_", names(dil_groups)) + + # Check if scale factor columns already exist in the original data + existing_sf_cols <- intersect(sf_cols, names(adat)) + + # For all scale factor columns, preserve existing values or set to 1.0 + for (col in sf_cols) { + if (col %in% existing_sf_cols) { + # Keep the existing value as-is - unnorm_adat[[col]] already contains it + } else { + # Set to 1.0 for new scale factor columns + unnorm_adat[[col]] <- 1.0 + } + } + + # Ensure column order matches + unnorm_adat <- unnorm_adat[, names(norm_adat)] + norm_adat <- rbind(norm_adat, unnorm_adat) + } + + # Restore original order + norm_adat <- norm_adat[order(norm_adat$.rowid), ] + norm_adat$.rowid <- NULL + + # Update header metadata + .updateHeaderMetadata(norm_adat, reference) +} + + +#' Process Reference Data +#' @noRd +.processReference <- function(reference, ref_field, ref_value, adat, dil_groups, apt_data, verbose) { + + if (is.soma_adat(reference)) { + # Use reference from provided ADAT + if (verbose) { + .todo("Using reference from provided ADAT object") + } + return(.extractReferenceFromAdat(reference, dil_groups)) + + } else if (is.character(reference) && length(reference) == 1) { + # External reference file + if (verbose) { + .todo("Reading external reference from file: {.val {reference}}") + } + return(.readExternalReference(reference, dil_groups, apt_data)) + + } else if (is.data.frame(reference)) { + # Reference data provided directly + if (verbose) { + .todo("Using provided reference data.frame") + } + return(.validateReferenceData(reference, dil_groups, apt_data)) + + } else { + stop( + "Invalid reference type. Must be NULL, soma_adat, file path, or data.frame", + call. = FALSE + ) + } +} + +#' Build Internal Reference from Study Samples +#' @noRd +.buildInternalReference <- function(adat, ref_field, ref_value, dil_groups) { + + if (!ref_field %in% names(adat)) { + stop("Reference field `", ref_field, "` not found", call. = FALSE) + } + + # Select reference samples + ref_samples <- adat[[ref_field]] %in% ref_value + if (sum(ref_samples) == 0) { + stop( + "No reference samples found with field `", ref_field, + "` and values: ", paste(ref_value, collapse = ", "), + call. = FALSE + ) + } + + ref_adat <- adat[ref_samples, ] + + # Calculate reference medians for each dilution group + ref_data <- list() + for (dil_name in names(dil_groups)) { + dil_apts <- intersect(dil_groups[[dil_name]], getAnalytes(adat)) + if (length(dil_apts) > 0) { + ref_data[[dil_name]] <- apply(ref_adat[, dil_apts, drop = FALSE], 2, median, na.rm = TRUE) + } + } + + ref_data +} + +#' Extract Reference from ADAT +#' @noRd +.extractReferenceFromAdat <- function(ref_adat, dil_groups) { + + # Calculate reference medians for each dilution group + ref_data <- list() + for (dil_name in names(dil_groups)) { + dil_apts <- intersect(dil_groups[[dil_name]], getAnalytes(ref_adat)) + if (length(dil_apts) > 0) { + ref_data[[dil_name]] <- apply(ref_adat[, dil_apts, drop = FALSE], 2, median, na.rm = TRUE) + } + } + + ref_data +} + +#' Read External Reference File +#' @noRd +.readExternalReference <- function(file_path, dil_groups, apt_data) { + + if (!file.exists(file_path)) { + stop("Reference file not found: ", file_path, call. = FALSE) + } + + # Determine file type and read + file_ext_val <- file_ext(file_path) + + if (file_ext_val %in% c("csv")) { + ref_df <- read.csv(file_path, stringsAsFactors = FALSE) + } else if (file_ext_val %in% c("txt", "tsv")) { + ref_df <- utils::read.table(file_path, header = TRUE, sep = "\t", stringsAsFactors = FALSE) + } else { + # Try to auto-detect + ref_df <- tryCatch({ + read.csv(file_path, stringsAsFactors = FALSE) + }, error = function(e) { + utils::read.table(file_path, header = TRUE, sep = "\t", stringsAsFactors = FALSE) + }) + } + + .validateReferenceData(ref_df, dil_groups, apt_data) +} + +#' Validate Reference Data +#' @noRd +.validateReferenceData <- function(ref_df, dil_groups, apt_data = NULL) { + + # Check that required columns are present + required_cols <- c("Dilution", "Reference") + if (!all(required_cols %in% names(ref_df))) { + missing_cols <- setdiff(required_cols, names(ref_df)) + stop( + "Reference data must contain columns: ", paste(missing_cols, collapse = ", "), + call. = FALSE + ) + } + + # Check if we have SeqId-specific information + if ("SeqId" %in% names(ref_df)) { + # Process as SeqId-specific reference + return(.processSeqIdReference(ref_df, dil_groups, apt_data)) + } else { + # Process as simple dilution-level reference + return(.processSimpleReference(ref_df, dil_groups)) + } +} + +#' Process SeqId-Specific Reference Data +#' @noRd +.processSeqIdReference <- function(ref_df, dil_groups, apt_data) { + # Convert dilution values to character for consistent matching + ref_df$Dilution <- as.character(ref_df$Dilution) + + if (!is.null(apt_data) && !"SeqId" %in% names(apt_data)) { + stop("ADAT analyte data must contain SeqId column for SeqId reference matching", call. = FALSE) + } + + # Create a global reference mapping from all dilutions + # Expect SeqId column in reference data + if (!"SeqId" %in% names(ref_df)) { + stop("Reference data must contain SeqId column for SeqId reference matching", call. = FALSE) + } + global_seqid_refs <- setNames(ref_df$Reference, ref_df$SeqId) + + ref_data <- list() + + # Process each dilution group using the global reference + for (dil_name in names(dil_groups)) { + dil_apt_names <- dil_groups[[dil_name]] + + # Convert AptNames to SeqIds and find matches in global reference + dil_seq_ids <- apt_data$SeqId[apt_data$AptName %in% dil_apt_names] + matching_seq_ids <- intersect(dil_seq_ids, names(global_seqid_refs)) + + if (length(matching_seq_ids) > 0) { + # Convert back to AptNames for the final result + matching_apt_names <- apt_data$AptName[apt_data$SeqId %in% matching_seq_ids] + ref_values <- global_seqid_refs[matching_seq_ids] + names(ref_values) <- matching_apt_names + ref_data[[dil_name]] <- ref_values + } + } + + # Check that we have some references + if (length(ref_data) == 0) { + stop("No matching SeqIds or dilution groups found in reference data", call. = FALSE) + } + + ref_data +} + +#' Process Simple Reference Data (Dilution -> Reference) +#' @noRd +.processSimpleReference <- function(ref_df, dil_groups) { + # Convert to named list + ref_data <- setNames(ref_df$Reference, ref_df$Dilution) + + # Check that we have references for all dilution groups + missing_dils <- setdiff(names(dil_groups), names(ref_data)) + if (length(missing_dils) > 0) { + stop( + "Missing reference values for dilution groups: ", paste(missing_dils, collapse = ", "), + call. = FALSE + ) + } + + ref_data +} + +#' Perform Median Normalization +#' @noRd +.performMedianNorm <- function(adat, dil_groups, by, ref_data, ref_field, ref_value, verbose) { + + # Store original rownames to restore later + original_rownames <- rownames(adat) + + # Validate grouping variables + if (is.character(by) && length(by) > 0) { + missing_cols <- setdiff(by, names(adat)) + if (length(missing_cols) > 0) { + stop("Grouping column(s) not found: ", paste(missing_cols, collapse = ", "), + call. = FALSE) + } + } + + # Create grouping variable + if (length(by) == 1L) { + group_var <- adat[[by]] + } else if (length(by) > 1L) { + group_var <- apply(adat[, by, drop = FALSE], 1, paste, collapse = "__") + } else { + group_var <- rep("all", nrow(adat)) + } + + # Report grouping strategy if verbose + if (verbose) { + if (length(by) > 1L) { + .todo("Performing grouped median normalization by: {.val {paste(by, collapse = ', ')}}") + } else if (length(by) == 1L && by != "SampleType") { + .todo("Performing grouped median normalization by: {.val {by}}") + } else if (length(unique(group_var)) > 1) { + .todo("Performing grouped median normalization by: {.val {by}} ({.val {length(unique(group_var))}} groups)") + } + } + + adat$.group <- group_var + + # Split data by groups and process each group separately + groups <- unique(group_var) + result_list <- list() + + for (grp in groups) { + grp_samples <- which(group_var == grp) + grp_adat <- adat[grp_samples, , drop = FALSE] + + if (verbose && length(groups) > 1) { + .todo("Processing group: {.val {grp}} ({.val {length(grp_samples)}} samples)") + } + + # Calculate scale factors for each dilution group within this sample group + for (dil_name in names(dil_groups)) { + sf_col <- paste0("NormScale_", dil_name) + + # Initialize scale factor column + if (!sf_col %in% names(grp_adat)) { + grp_adat[[sf_col]] <- 1.0 + } + + # Get analytes in this dilution + dil_apts <- intersect(dil_groups[[dil_name]], getAnalytes(grp_adat)) + + if (length(dil_apts) == 0) { + next + } + + if (verbose) { + .done("Processing dilution '{dil_name}' with {length(dil_apts)} analytes") + } + + # Calculate reference values for this dilution + if (!is.null(ref_data) && dil_name %in% names(ref_data)) { + # Use external reference + ref_values <- ref_data[[dil_name]] + + if (is.numeric(ref_values) && length(ref_values) == 1) { + # Single reference value for the whole dilution group + grp_ref_values <- rep(ref_values, length(dil_apts)) + names(grp_ref_values) <- dil_apts + } else if (is.numeric(ref_values) && length(ref_values) > 1) { + # Aptamer-specific reference values + grp_ref_values <- ref_values[dil_apts] + has_ref <- !is.na(grp_ref_values) + + if (any(has_ref)) { + dil_apts <- dil_apts[has_ref] + grp_ref_values <- grp_ref_values[has_ref] + } else { + next # No references available, skip this dilution group + } + } else { + # Fallback to group-specific calculation + grp_ref_values <- apply(grp_adat[, dil_apts, drop = FALSE], 2, median, na.rm = TRUE) + } + } else { + # Internal reference: Use samples from this group only + if (!ref_field %in% names(grp_adat)) { + stop("Reference field `", ref_field, "` not found", call. = FALSE) + } + + ref_samples_mask <- grp_adat[[ref_field]] %in% ref_value + if (sum(ref_samples_mask) == 0) { + stop( + "No reference samples found with field `", ref_field, + "` and values: ", paste(ref_value, collapse = ", "), + call. = FALSE + ) + } + + ref_sample_data <- grp_adat[ref_samples_mask, dil_apts, drop = FALSE] + grp_ref_values <- apply(ref_sample_data, 2, median, na.rm = TRUE) + } + + # Calculate scale factors for each sample in this group + for (i in seq_len(nrow(grp_adat))) { + sample_values <- as.numeric(grp_adat[i, dil_apts, drop = FALSE]) + ratios <- grp_ref_values / sample_values + med_scale_factor <- median(ratios[is.finite(ratios)], na.rm = TRUE) + + if (!is.finite(med_scale_factor)) { + med_scale_factor <- 1.0 + } + + grp_adat[[sf_col]][i] <- med_scale_factor + } + + # Apply scale factors to analytes + for (apt in dil_apts) { + grp_adat[[apt]] <- grp_adat[[apt]] * grp_adat[[sf_col]] + } + } + + result_list[[as.character(grp)]] <- grp_adat + } + + # Combine results and restore original order + adat <- do.call(rbind, result_list) + adat <- adat[order(adat$.rowid), ] + + # Restore original rownames + rownames(adat) <- original_rownames + + # Remove temporary grouping column + adat$.group <- NULL + + # Round to 1 decimal place (standard for SomaScan data) + apts <- getAnalytes(adat) + for (apt in apts) { + adat[[apt]] <- round(adat[[apt]], 1) + } + + adat +} + +#' Update Header Metadata +#' @noRd +.updateHeaderMetadata <- function(adat, reference) { + header_meta <- attr(adat, "Header.Meta") + + if (!is.null(header_meta) && !is.null(header_meta$HEADER)) { + # Add median normalization to process steps + if ("ProcessSteps" %in% names(header_meta$HEADER)) { + if (!grepl("medNormInt", header_meta$HEADER$ProcessSteps)) { + header_meta$HEADER$ProcessSteps <- paste( + header_meta$HEADER$ProcessSteps, + "medNormInt (SampleId)", + sep = ", " + ) + } + } else { + header_meta$HEADER$ProcessSteps <- "medNormInt (SampleId)" + } + + # Set normalization algorithm + header_meta$HEADER$NormalizationAlgorithm <- "MedNorm" + + # Set reference type + if (is.null(reference)) { + header_meta$HEADER$MedNormReference <- "intraplate" + } else if (is.soma_adat(reference)) { + header_meta$HEADER$MedNormReference <- "external_adat" + } else if (is.character(reference)) { + header_meta$HEADER$MedNormReference <- basename(reference) + } else { + header_meta$HEADER$MedNormReference <- "external_data" + } + + attr(adat, "Header.Meta") <- header_meta + } + + adat +} diff --git a/inst/WORDLIST b/inst/WORDLIST index efd92edc..4c2a38cc 100644 --- a/inst/WORDLIST +++ b/inst/WORDLIST @@ -113,6 +113,7 @@ md medNormRef nd normals +pipetting pkgdown plex pre @@ -128,4 +129,5 @@ th tibble tidyr untransformed +variabilities vectorized diff --git a/man/medianNormalize.Rd b/man/medianNormalize.Rd new file mode 100644 index 00000000..00a22ad8 --- /dev/null +++ b/man/medianNormalize.Rd @@ -0,0 +1,141 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/medianNormalize.R +\name{medianNormalize} +\alias{medianNormalize} +\title{Perform Median Normalization} +\usage{ +medianNormalize( + adat, + reference = NULL, + ref_field = "SampleType", + ref_value = c("QC", "Sample"), + by = "SampleType", + do_field = "SampleType", + do_regexp = "QC|Sample", + verbose = TRUE +) +} +\arguments{ +\item{adat}{A \code{soma_adat} object created using \code{\link[=read_adat]{read_adat()}}, containing +RFU values that have been hybridization normalized and plate scaled.} + +\item{reference}{Optional. Reference for median normalization. Can be: +\itemize{ +\item \code{NULL} (default): Use internal reference from study samples +\item A \code{soma_adat} object: Extract reference from this ADAT +\item A file path (character): Read external reference from tab/comma-separated file +\item A data.frame: Use provided reference data directly +} +When providing an external reference file or data.frame it must contain: +\describe{ +\item{Dilution}{Character column specifying the dilution group names +(e.g., "0_005", "0_5", "20"). These should match the dilution +groups present in the ADAT analyte data.} +\item{Reference}{Numeric column containing the reference median values +for each dilution group. These values will be used as the target +medians for normalization.} +\item{SeqId}{Optional character column. When included, SeqId-specific +reference values are used for more precise normalization. Values +must be in "10000-28" format, not "seq.10000.28" format.} +}} + +\item{ref_field}{Character. Field used to select reference samples when +\code{reference = NULL}. Default is \code{"SampleType"}.} + +\item{ref_value}{Character. Value(s) in \code{ref_field} to use as reference +when \code{reference = NULL}. Default is \code{c("QC", "Sample")}.} + +\item{by}{Character vector. Grouping variable(s) for grouped median normalization. +Must be column name(s) in the ADAT. Use grouping when there are known changes +in total protein load due to analysis conditions (e.g., disease vs. control, +treatment vs. baseline). Normalization will be performed within each group +separately. Default is \code{"SampleType"}.} + +\item{do_field}{Character. The field used to select samples for normalization +(others keep original values). Default is \code{"SampleType"}.} + +\item{do_regexp}{Character. A regular expression pattern to select samples +from \code{do_field} for normalization. Default is \code{"QC|Sample"}.} + +\item{verbose}{Logical. Should progress messages be printed? Default is \code{TRUE}.} +} +\value{ +A \code{soma_adat} object with median normalization applied and RFU values +adjusted. The existing \verb{NormScale_*} columns are updated to include the +effects of both plate scale normalization and median normalization. +} +\description{ +Performs median normalization on a \code{soma_adat} object that has +already undergone hybridization normalization and contains plate scale +normalization factors. + +Median normalization is a technique used to adjust data sets to remove +certain assay artifacts and biases prior to analysis, particularly those that +may arise due to differences in overall protein concentration, pipetting +errors, reagent concentration changes, assay timing, and other systematic +variabilities. Median normalization can improve assay precision and reduce +technical variations that can mask true biological signals. + +This function works by calculating or using reference median values for each +dilution group present in the ADAT. Dilution groups are determined from the +analyte metadata and represent different sample dilutions used in the assay +(e.g., 1:1, 1:200, 1:2000). For each sample, the median RFU across all +analytes within a dilution group is calculated and compared to the reference +median for that group. Scale factors are then computed and applied to adjust +each sample's values toward the reference, ensuring consistent signal levels +across samples within each dilution group. This function supports multiple +reference approaches: calculating an internal reference from study samples, +or using a calculated reference from a different ADAT or external reference +file. +} +\section{Important Considerations}{ + +\itemize{ +\item If there is a known change in total protein concentration due to analysis +conditions, perform normalization \emph{within} groups using the \code{by} parameter +\item In newer SomaScan assay versions, population references improve +inter-plate consistency compared to within-plate references +} +} + +\examples{ +\dontrun{ +# Internal reference from study samples (default) +# Use when you have representative QC or control samples in your study +med_norm_adat <- medianNormalize(adat) + +# Reference from specific samples in the ADAT +# Use when you want to normalize against only QC samples +med_norm_adat <- medianNormalize(adat, + ref_field = "SampleType", + ref_value = "QC") + +# Reference from another ADAT +# Use when you have a reference population or control study +ref_adat <- read_adat("reference_file.adat") +med_norm_adat <- medianNormalize(adat, reference = ref_adat) + +# External reference file +# Use when you have pre-calculated reference medians for each dilution +med_norm_adat <- medianNormalize(adat, reference = "reference_file.csv") + +# External reference as data.frame +# Use for programmatic control over reference values +ref_data <- data.frame( + Dilution = c("0_005", "0_5", "20"), + Reference = c(1500.2, 3200.8, 4100.5) +) +med_norm_adat <- medianNormalize(adat, reference = ref_data) + +# Custom grouping by multiple variables +# Use when total protein load changes due to analysis conditions +# (normalize within groups to account for expected biological differences) +med_norm_adat <- medianNormalize(adat, by = c("Sex", "SampleType")) + +# Normalize only specific sample types +# Use when you want to preserve original values for certain sample types +med_norm_adat <- medianNormalize(adat, + do_field = "SampleType", + do_regexp = "Sample") +} +} diff --git a/man/soma_adat.Rd b/man/soma_adat.Rd index 26127fdc..d117ebd1 100644 --- a/man/soma_adat.Rd +++ b/man/soma_adat.Rd @@ -91,16 +91,18 @@ dispatched on class \code{soma_adat}. Below is a list of \emph{all} currently available S3 methods that dispatch on the \code{soma_adat} class: -\if{html}{\out{
}}\preformatted{#> [1] [ [[ [[<- [<- -#> [5] == $ $<- anti_join -#> [9] arrange count filter full_join -#> [13] getAdatVersion getAnalytes getMeta group_by -#> [17] inner_join is_seqFormat left_join Math -#> [21] median merge mutate Ops -#> [25] print rename right_join row.names<- -#> [29] sample_frac sample_n select semi_join -#> [33] separate slice_sample slice summary -#> [37] Summary transform ungroup unite +\if{html}{\out{
}}\preformatted{#> [1] [ [[ [[<- [<- +#> [5] == $ $<- anti_join +#> [9] arrange centerScaleData convertAptNames count +#> [13] filter full_join getAdatVersion getAnalytes +#> [17] getMeta group_by imputeNAs inner_join +#> [21] is_seqFormat is.logspace left_join Math +#> [25] median merge mutate Ops +#> [29] print rename revertAptNames right_join +#> [33] row.names<- sample_frac sample_n select +#> [37] semi_join separate slice_sample slice +#> [41] summary Summary transform ungroup +#> [45] unite #> see '?methods' for accessing help and source code }\if{html}{\out{
}} diff --git a/tests/testthat/_snaps/medianNormalize.md b/tests/testthat/_snaps/medianNormalize.md new file mode 100644 index 00000000..8ee3491c --- /dev/null +++ b/tests/testthat/_snaps/medianNormalize.md @@ -0,0 +1,17 @@ +# `medianNormalize` produces expected verbose output + + Code + result <- medianNormalize(test_data, verbose = TRUE) + Message + > Normalization scale factors already exist: "NormScale_20, NormScale_0_005, NormScale_0_5" - they will be replaced with new scale factors + > Building internal reference from field: "SampleType" with values: "QC" and "Sample" + > Performing grouped median normalization by: "SampleType" (2 groups) + > Processing group: "Sample" (12 samples) + v Processing dilution '0_005' with 173 analytes + v Processing dilution '0_5' with 828 analytes + v Processing dilution '20' with 4271 analytes + > Processing group: "QC" (4 samples) + v Processing dilution '0_005' with 173 analytes + v Processing dilution '0_5' with 828 analytes + v Processing dilution '20' with 4271 analytes + diff --git a/tests/testthat/test-medianNormalize.R b/tests/testthat/test-medianNormalize.R new file mode 100644 index 00000000..6634c43f --- /dev/null +++ b/tests/testthat/test-medianNormalize.R @@ -0,0 +1,363 @@ +# Setup ---- +# Create a minimal test dataset +create_test_data <- function() { + data("example_data", package = "SomaDataIO") + # Select samples ensuring we have both Sample and QC types + sample_indices <- which(example_data$SampleType == "Sample")[1:12] + qc_indices <- which(example_data$SampleType == "QC")[1:4] + + # Combine indices and subset + subset_indices <- c(sample_indices, qc_indices) + + # Return subset with representative samples + example_data[subset_indices, ] +} + +test_data <- create_test_data() + +# Testing ---- +test_that("`medianNormalize` Method 1: Internal reference (default)", { + + # Test default internal reference method + expect_no_error( + result <- medianNormalize(test_data, verbose = FALSE) + ) + + # Check result structure + expect_true(is.soma_adat(result)) + norm_cols <- grep("^NormScale_", names(result), value = TRUE) + expect_equal(length(norm_cols), 3) # Should have 3 dilution groups + expect_true(all(norm_cols %in% names(result))) + + # Test scale factor output + expect_equal(result$NormScale_20[1:3], c(1.067845, 1.057102, 1.091748), tolerance = 0.0001) + expect_equal(result$NormScale_0_005[1:3], c(1.022533, 1.001174, 1.012215), tolerance = 0.0001) + expect_equal(result$NormScale_0_5[1:3], c(1.103112, 1.054358, 1.108711), tolerance = 0.0001) + + # Test specific SeqId columns + expect_equal(result$seq.10000.28[1:3], c(508.8, 501.5, 453.7)) + expect_equal(result$seq.10008.43[1:3], c(599.9, 572.8, 462.8)) + + # Check header metadata + result_header <- attr(result, "Header.Meta")$HEADER + expect_true(grepl("medNormInt", result_header$ProcessSteps)) + expect_equal(result_header$NormalizationAlgorithm, "MedNorm") + expect_equal(result_header$MedNormReference, "intraplate") +}) + +test_that("`medianNormalize` Method 2: Reference from specific samples", { + + # Test with specific reference field and value + expect_no_error( + result <- medianNormalize(test_data, + ref_field = "SampleType", + ref_value = "QC", + do_regexp = "QC", + verbose = FALSE) + ) + + # Check result structure + expect_true(is.soma_adat(result)) + norm_cols <- grep("^NormScale_", names(result), value = TRUE) + expect_equal(length(norm_cols), 3) # Should have 3 dilution groups + + # Test scale factor output + expect_equal(result$NormScale_20[1:3], c(1.0369358, 0.9602251, 0.9841162), tolerance = 0.0001) + expect_equal(result$NormScale_0_005[1:3], c(0.8570162, 0.8485842, 1.0327016), tolerance = 0.0001) + expect_equal(result$NormScale_0_5[1:3], c(0.7771749, 0.8520195, 0.9151915), tolerance = 0.0001) + + # Test specific SeqId columns + expect_equal(result$seq.10000.28[1:3], c(476.5, 474.4, 415.6)) + expect_equal(result$seq.10008.43[1:3], c(561.8, 541.9, 423.9)) + + # Check header metadata + result_header <- attr(result, "Header.Meta")$HEADER + expect_equal(result_header$MedNormReference, "intraplate") +}) + +test_that("`medianNormalize` Method 3: Reference from another ADAT", { + # Create a minimal reference ADAT from sample subset + ref_adat <- test_data[1:4, ] # Use first 4 samples as reference + + expect_no_error( + result <- medianNormalize(test_data, reference = ref_adat, verbose = FALSE) + ) + + # Check result structure + expect_true(is.soma_adat(result)) + norm_cols <- grep("^NormScale_", names(result), value = TRUE) + expect_equal(length(norm_cols), 3) # Should have 3 dilution groups + + # Test scale factor output + expect_equal(result$NormScale_20[1:3], c(0.9907154, 1.0109152, 0.9931271), tolerance = 0.0001) + expect_equal(result$NormScale_0_005[1:3], c(1.005733, 1.008454, 0.986035), tolerance = 0.0001) + expect_equal(result$NormScale_0_5[1:3], c(1.0090878, 0.9909594, 1.0002446), tolerance = 0.0001) + + # Test specific SeqId columns + expect_equal(result$seq.10000.28[1:3], c(472.1, 479.6, 412.7)) + expect_equal(result$seq.10008.43[1:3], c(556.6, 547.8, 421.0)) + + # Check header metadata + result_header <- attr(result, "Header.Meta")$HEADER + expect_equal(result_header$MedNormReference, "external_adat") +}) + +test_that("`medianNormalize` Method 4: External reference file", { + # Create a temporary CSV file with reference data + # Use realistic dilution groups from example_data: "20", "0.5", "0.005", "0" + ref_data <- data.frame( + Dilution = c("20", "0_5", "0_005", "0"), + Reference = c(2500.5, 1800.2, 3200.8, 1500.0), + stringsAsFactors = FALSE + ) + + # Create temporary CSV file + temp_csv <- tempfile(fileext = ".csv") + write.csv(ref_data, temp_csv, row.names = FALSE) + + expect_no_error( + result <- medianNormalize(test_data, reference = temp_csv, verbose = FALSE) + ) + + # Check result structure + expect_true(is.soma_adat(result)) + norm_cols <- grep("^NormScale_", names(result), value = TRUE) + expect_equal(length(norm_cols), 3) # Should have 3 dilution groups + + # Test scale factor output + expect_equal(result$NormScale_20[1:3], c(2.937272, 2.982111, 2.936927), tolerance = 0.0001) + expect_equal(result$NormScale_0_005[1:3], c(0.6313838, 0.5966632, 0.6624721), tolerance = 0.0001) + expect_equal(result$NormScale_0_5[1:3], c(0.7147623, 0.7780614, 0.7704032), tolerance = 0.0001) + + # Test specific SeqId columns + expect_equal(result$seq.10000.28[1:3], c(1399.6, 1414.7, 1220.6)) + expect_equal(result$seq.10008.43[1:3], c(1650.2, 1616.0, 1245.0)) + + # Check header metadata + result_header <- attr(result, "Header.Meta")$HEADER + expect_equal(result_header$MedNormReference, basename(temp_csv)) + + # Clean up + unlink(temp_csv) + + # Test with TSV file + temp_tsv <- tempfile(fileext = ".tsv") + write.table(ref_data, temp_tsv, sep = "\t", row.names = FALSE) + + expect_no_error( + result2 <- medianNormalize(test_data, reference = temp_tsv, verbose = FALSE) + ) + + expect_true(is.soma_adat(result2)) + + # Test scale factor output + expect_equal(result2$NormScale_20[1:3], c(2.937272, 2.982111, 2.936927), tolerance = 0.0001) + expect_equal(result2$NormScale_0_005[1:3], c(0.6313838, 0.5966632, 0.6624721), tolerance = 0.0001) + expect_equal(result2$NormScale_0_5[1:3], c(0.7147623, 0.7780614, 0.7704032), tolerance = 0.0001) + + # Test specific SeqId columns + expect_equal(result2$seq.10000.28[1:3], c(1399.6, 1414.7, 1220.6)) + expect_equal(result2$seq.10008.43[1:3], c(1650.2, 1616.0, 1245.0)) + + # Clean up + unlink(temp_tsv) +}) + +test_that("`medianNormalize` Method 5: External reference as data.frame", { + + # Create reference data.frame with realistic dilution groups + ref_data <- data.frame( + Dilution = c("20", "0_5", "0_005", "0"), + Reference = c(2500.5, 1800.2, 3200.8, 1500.0), + stringsAsFactors = FALSE + ) + + expect_no_error( + result <- medianNormalize(test_data, reference = ref_data, verbose = FALSE) + ) + + # Check result structure + expect_true(is.soma_adat(result)) + norm_cols <- grep("^NormScale_", names(result), value = TRUE) + expect_equal(length(norm_cols), 3) # Should have 3 dilution groups + + # Test scale factor output + expect_equal(result$NormScale_20[1:3], c(2.937272, 2.982111, 2.936927), tolerance = 0.0001) + expect_equal(result$NormScale_0_005[1:3], c(0.6313838, 0.5966632, 0.6624721), tolerance = 0.0001) + expect_equal(result$NormScale_0_5[1:3], c(0.7147623, 0.7780614, 0.7704032), tolerance = 0.0001) + + # Test specific SeqId columns + expect_equal(result$seq.10000.28[1:3], c(1399.6, 1414.7, 1220.6)) + expect_equal(result$seq.10008.43[1:3], c(1650.2, 1616.0, 1245.0)) + + # Check header metadata + result_header <- attr(result, "Header.Meta")$HEADER + expect_equal(result_header$MedNormReference, "external_data") +}) + +test_that("`medianNormalize` validates input requirements", { + + # Test with non-soma_adat object + expect_error( + medianNormalize(data.frame(a = 1:3, b = 4:6)), + "`adat` must be a class `soma_adat` object" + ) + + # Test with missing hybridization normalization + test_data_no_hyb <- test_data + header_meta <- list(HEADER = list(ProcessSteps = "SomeOtherStep")) + attr(test_data_no_hyb, "Header.Meta") <- header_meta + + expect_error( + medianNormalize(test_data_no_hyb), + "Hybrid normalization step not detected" + ) + + # Test with missing normalization scale factors + test_data_no_norm <- test_data |> + dplyr::select(-c(NormScale_0_005, NormScale_0_5, NormScale_20)) + header_meta <- list(HEADER = list(ProcessSteps = "Hyb Normalization")) + attr(test_data_no_norm, "Header.Meta") <- header_meta + + expect_error( + medianNormalize(test_data_no_norm), + "No normalization scale factor columns found" + ) + + # Test with invalid reference field + expect_error( + medianNormalize(test_data, ref_field = "NonExistentField", verbose = FALSE), + "Reference field `NonExistentField` not found" + ) + + # Test with invalid reference data.frame (missing required columns) + invalid_ref <- data.frame(Wrong = c("20", "0_5"), Column = c(1000, 2000)) + expect_error( + medianNormalize(test_data, reference = invalid_ref, verbose = FALSE), + "Reference data must contain columns: Dilution, Reference" + ) + + # Test with non-existent reference file + expect_error( + medianNormalize(test_data, reference = "non_existent_file.csv", verbose = FALSE), + "Reference file not found" + ) + + # Test with invalid reference type + expect_error( + medianNormalize(test_data, reference = 123, verbose = FALSE), + "Invalid reference type" + ) +}) + +test_that("`medianNormalize` external reference file validation", { + # Test with missing dilution groups in reference + incomplete_ref <- data.frame( + Dilution = c("20", "0_5"), # Missing "0_005" and "0" + Reference = c(2500, 1800), + stringsAsFactors = FALSE + ) + + expect_error( + medianNormalize(test_data, reference = incomplete_ref, verbose = FALSE), + "Missing reference values for dilution groups" + ) +}) + +test_that("`medianNormalize` handles grouping correctly", { + # Test grouping by Sex + expect_no_error( + result1 <- medianNormalize(test_data, by = "Sex", do_regexp = "Sample", verbose = FALSE) + ) + + # Check result structure + expect_true(is.soma_adat(result1)) + norm_cols <- grep("^NormScale_", names(result1), value = TRUE) + expect_equal(length(norm_cols), 3) # Should have 3 dilution groups + + # Test scale factor output + expect_equal(result1$NormScale_20[1:3], c(1.066964, 1.060173, 1.020341), tolerance = 0.0001) + expect_equal(result1$NormScale_0_005[1:3], c(1.000455, 1.005372, 1.000000), tolerance = 0.0001) + expect_equal(result1$NormScale_0_5[1:3], c(1.115562, 1.050018, 1.030397), tolerance = 0.0001) + + # Test specific SeqId columns + expect_equal(result1$seq.10000.28[1:3], c(508.4, 502.9, 424.1)) + expect_equal(result1$seq.10008.43[1:3], c(599.4, 574.5, 432.5)) + + # Test grouping by multiple variables + if ("PlateId" %in% names(test_data)) { + expect_no_error( + result2 <- medianNormalize(test_data, by = c("PlateId", "SampleType"), verbose = FALSE) + ) + + # Check result structure + expect_true(is.soma_adat(result2)) + norm_cols <- grep("^NormScale_", names(result2), value = TRUE) + expect_equal(length(norm_cols), 3) # Should have 3 dilution groups + + # Test scale factor output + expect_equal(result2$NormScale_20[1:3], c(1.053656, 1.059554, 1.068602), tolerance = 0.0001) + expect_equal(result2$NormScale_0_005[1:3], c(1.0124749, 1.0053543, 0.9992799), tolerance = 0.0001) + expect_equal(result2$NormScale_0_5[1:3], c(1.080347, 1.056874, 1.074292), tolerance = 0.0001) + + # Test specific SeqId columns + expect_equal(result2$seq.10000.28[1:3], c(502.1, 502.7, 444.1)) + expect_equal(result2$seq.10008.43[1:3], c(591.9, 574.2, 453.0)) + } + + # Test error with non-existent grouping column + expect_error( + medianNormalize(test_data, by = "NonExistentColumn", verbose = FALSE), + "Grouping column\\(s\\) not found" + ) +}) + +test_that("`medianNormalize` handles sample selection correctly", { + # Test selective normalization + expect_no_error( + result <- medianNormalize(test_data, + do_field = "SampleType", + do_regexp = "QC", + verbose = FALSE) + ) + expect_true(is.soma_adat(result)) + + # Check result structure + expect_true(is.soma_adat(result)) + norm_cols <- grep("^NormScale_", names(result), value = TRUE) + expect_equal(length(norm_cols), 3) # Should have 3 dilution groups + + # Test scale factor output + expect_equal(result$NormScale_20[1:3], c(1.0369358, 0.9602251, 0.9841162), tolerance = 0.0001) + expect_equal(result$NormScale_0_005[1:3], c(0.8570162, 0.8485842, 1.0327016), tolerance = 0.0001) + expect_equal(result$NormScale_0_5[1:3], c(0.7771749, 0.8520195, 0.9151915), tolerance = 0.0001) + + # Test specific SeqId columns + expect_equal(result$seq.10000.28[1:3], c(476.5, 474.4, 415.6)) + expect_equal(result$seq.10008.43[1:3], c(561.8, 541.9, 423.9)) + + # Test with non-existent field + expect_error( + medianNormalize(test_data, do_field = "NonExistentField", verbose = FALSE), + "Field `NonExistentField` not found" + ) + + # Test with pattern that matches no samples + expect_error( + medianNormalize(test_data, do_regexp = "NoMatchPattern", verbose = FALSE), + "No samples selected for normalization" + ) +}) + +test_that("`medianNormalize` produces expected verbose output", { + # specific verbose output messages + expect_snapshot( + result <- medianNormalize(test_data, verbose = TRUE) + ) + + # no output + expect_message( + result <- medianNormalize(test_data, verbose = FALSE), + NA # NA indicates no output expected + ) +}) From 49ef92cd2c9dd1f157e61f6115ae6fb7c81d74f6 Mon Sep 17 00:00:00 2001 From: scheidec Date: Tue, 13 Jan 2026 16:42:17 -0500 Subject: [PATCH 02/10] Enhance medianNormalize() with comprehensive validation - updated documentation to provide guidance on what type of data the function can be applied to - added data validation for hybridization normalization, plate scale factors, and existing processing - implemented `reverse_existing` parameter to handle already-normalized data - added dilution count validation, accepting 1 or 3, along with enhanced error messaging - now recalculates RowCheck to adjust for new MedNorm values - now adds `medNormSMP_ReferenceRFU` field to SeqId annotation to denote the median mormalization reference - now updates the header metadata to add `crossplate` entry to `MedNormReference` field --- R/medianNormalize.R | 348 ++++++++++++++++++++++- inst/WORDLIST | 10 +- man/medianNormalize.Rd | 41 +++ man/soma_adat.Rd | 22 +- tests/testthat/_snaps/medianNormalize.md | 13 + tests/testthat/test-medianNormalize.R | 24 +- 6 files changed, 426 insertions(+), 32 deletions(-) diff --git a/R/medianNormalize.R b/R/medianNormalize.R index ba24e334..4b44d55e 100644 --- a/R/medianNormalize.R +++ b/R/medianNormalize.R @@ -23,12 +23,38 @@ #' or using a calculated reference from a different ADAT or external reference #' file. #' +#' @section Data Requirements: +#' This function is designed for data in standard SomaLogic deliverable formats: +#' \enumerate{ +#' \item \strong{Study samples not median normalized}, with all controls median normalized. +#' Calibrators and buffers are intra-plate median normalized and QC samples are ANML +#' normalized if present on standard 3-dilution setups. QC assay controls undergo +#' full ANML data standardization even when study samples don't, in order to QC +#' the plate. Cell & tissue single dilution runs don't have QC samples. +#' \item \strong{Same as above}, but study samples undergo either ANML or intra-study +#' standard median normalization. +#' } +#' +#' All data should be hybridization normalized, median normalized internally +#' (buffer + calibrator), and plate scaled regardless of matrix prior to applying +#' median normalization. Calibration is not applied to all study types. +#' +#' \strong{Primary use cases:} +#' \itemize{ +#' \item 3-dilution setups (standard SomaScan) +#' \item Single dilution cell & tissue studies +#' } +#' #' @section Important Considerations: #' \itemize{ #' \item If there is a known change in total protein concentration due to analysis #' conditions, perform normalization _within_ groups using the `by` parameter #' \item In newer SomaScan assay versions, population references improve #' inter-plate consistency compared to within-plate references +#' \item The function will not proceed on already ANML or median normalized data +#' unless `reverse_existing = TRUE` is specified +#' \item When reversing existing normalization, only study samples are reversed; +#' QC, Calibrator, and Buffer samples retain their normalization #' } #' #' @param adat A `soma_adat` object created using [read_adat()], containing @@ -65,6 +91,11 @@ #' (others keep original values). Default is `"SampleType"`. #' @param do_regexp Character. A regular expression pattern to select samples #' from `do_field` for normalization. Default is `"QC|Sample"`. +#' @param reverse_existing Logical. Should existing median normalization be +#' reversed before applying new normalization? When `TRUE`, existing median +#' normalization scale factors are reversed for study samples only (QC, +#' Calibrator, and Buffer samples retain their normalization). This allows +#' re-normalization of data that has already been median normalized. Default is `FALSE`. #' @param verbose Logical. Should progress messages be printed? Default is `TRUE`. #' @return A `soma_adat` object with median normalization applied and RFU values #' adjusted. The existing `NormScale_*` columns are updated to include the @@ -108,6 +139,12 @@ #' med_norm_adat <- medianNormalize(adat, #' do_field = "SampleType", #' do_regexp = "Sample") +#' +#' # Re-normalize data that has already been median normalized +#' # Use when you want to apply different normalization to previously normalized data +#' med_norm_adat <- medianNormalize(adat, +#' reverse_existing = TRUE, +#' reference = new_reference) #' } #' @importFrom dplyr filter #' @importFrom stats median @@ -120,6 +157,7 @@ medianNormalize <- function(adat, by = "SampleType", do_field = "SampleType", do_regexp = "QC|Sample", + reverse_existing = FALSE, verbose = TRUE) { # Input validation ---- @@ -152,6 +190,17 @@ medianNormalize <- function(adat, ) } + # Check data state and existing normalization ---- + has_existing_norm <- .validateDataState(adat, header, verbose, reverse_existing) + + # Reverse existing normalization if requested ---- + if (reverse_existing && has_existing_norm) { + if (verbose) { + cat("Reversing existing median normalization for study samples...\n") + } + adat <- .reverseMedNormSMP(adat, verbose) + } + # Create dilution groups ---- apt_data <- getAnalyteInfo(adat) @@ -169,9 +218,8 @@ medianNormalize <- function(adat, names(dil_groups) <- gsub("\\.", "_", names(dil_groups)) names(dil_groups) <- gsub("[.]0$|%|^[.]", "", names(dil_groups)) - if (length(dil_groups) < 2L) { - warning("Fewer than 2 dilution groups detected", call. = FALSE) - } + # Validate dilution count ---- + .validateDilutionCount(dil_groups, verbose) # Check for existing normalization scale factors ---- existing_norm_sf <- grep("^NormScale_", names(adat), value = TRUE) @@ -274,6 +322,12 @@ medianNormalize <- function(adat, norm_adat <- norm_adat[order(norm_adat$.rowid), ] norm_adat$.rowid <- NULL + # Add medNorm reference to SeqId annotations ---- + norm_adat <- .addMedNormReference(norm_adat, ref_data, dil_groups) + + # Recalculate RowCheck to adjust for new MedNorm values ---- + norm_adat <- .recalculateRowCheck(norm_adat, verbose) + # Update header metadata .updateHeaderMetadata(norm_adat, reference) } @@ -649,15 +703,21 @@ medianNormalize <- function(adat, # Set normalization algorithm header_meta$HEADER$NormalizationAlgorithm <- "MedNorm" - # Set reference type + # Set reference type - add "crossplate" as new comma-separated entry if (is.null(reference)) { - header_meta$HEADER$MedNormReference <- "intraplate" + # For internal references, use crossplate + current_ref <- header_meta$HEADER$MedNormReference %||% "" + if (current_ref == "") { + header_meta$HEADER$MedNormReference <- "intraplate, crossplate" + } else if (!grepl("crossplate", current_ref)) { + header_meta$HEADER$MedNormReference <- paste(current_ref, "crossplate", sep = ", ") + } } else if (is.soma_adat(reference)) { - header_meta$HEADER$MedNormReference <- "external_adat" + header_meta$HEADER$MedNormReference <- "external_adat, crossplate" } else if (is.character(reference)) { - header_meta$HEADER$MedNormReference <- basename(reference) + header_meta$HEADER$MedNormReference <- paste(basename(reference), "crossplate", sep = ", ") } else { - header_meta$HEADER$MedNormReference <- "external_data" + header_meta$HEADER$MedNormReference <- "external_data, crossplate" } attr(adat, "Header.Meta") <- header_meta @@ -665,3 +725,275 @@ medianNormalize <- function(adat, adat } + + +#' Validate Data State for Median Normalization +#' @noRd +.validateDataState <- function(adat, header, verbose, reverse_existing = FALSE) { + + # Check if data is in a standard deliverable state + process_steps <- header$ProcessSteps %||% "" + + # Check for existing median normalization + has_mednorm <- grepl("medNormInt|MedNorm", process_steps, ignore.case = TRUE) + has_anml <- grepl("ANML", process_steps, ignore.case = TRUE) + + if (has_mednorm && !reverse_existing) { + stop( + "Data appears to already be median normalized. ", + "Set reverse_existing = TRUE to reverse existing median normalization before applying new normalization. ", + "ProcessSteps: ", process_steps, + call. = FALSE + ) + } + + if (has_anml) { + stop( + "Data appears to be ANML normalized. ", + "ANML normalized data cannot be directly processed with this function. ", + "ProcessSteps: ", process_steps, + call. = FALSE + ) + } + + # Check for required normalization steps + has_hyb <- grepl("Hyb|hybridization", process_steps, ignore.case = TRUE) + has_plate_scale <- grepl("PlateScale|plate.?scale", process_steps, ignore.case = TRUE) + + # Warn if not in standard deliverable state + if (!has_hyb || !has_plate_scale) { + warning( + "Data may not be in standard deliverable format. ", + "Standard format requires hybridization normalization, median normalization of controls ", + "(buffer + calibrator), and plate scale normalization before applying median normalization. ", + "Current ProcessSteps: ", process_steps, + call. = FALSE + ) + } + + if (verbose) { + cat("Data validation passed for median normalization.\n") + cat("Standard deliverable checks:\n") + cat(" - Hybridization normalization:", if(has_hyb) "PASS" else "WARN", "\n") + cat(" - Plate scale normalization:", if(has_plate_scale) "PASS" else "WARN", "\n") + cat(" - No existing MedNorm/ANML:", if(!has_mednorm && !has_anml) "PASS" else if(reverse_existing && has_mednorm) "WARN (will reverse)" else "FAIL", "\n") + } + + return(has_mednorm || has_anml) +} + + +#' Validate Dilution Count +#' @noRd +.validateDilutionCount <- function(dil_groups, verbose) { + + num_dilutions <- length(dil_groups) + + # Primary use cases are 1 or 3 dilutions + if (!num_dilutions %in% c(1, 3)) { + warning( + "Non-standard dilution count detected: ", num_dilutions, " dilutions. ", + "Primary use cases are 1 dilution (cell & tissue studies) or 3 dilutions (standard setups). ", + "Found dilutions: ", paste(names(dil_groups), collapse = ", "), + call. = FALSE + ) + } + + if (verbose) { + if (num_dilutions == 1) { + cat("Single dilution setup detected (typical for cell & tissue studies).\n") + } else if (num_dilutions == 3) { + cat("Three dilution setup detected (standard setup).\n") + } else { + cat("Non-standard dilution setup detected (", num_dilutions, " dilutions).\n") + } + } + + invisible(NULL) +} + + +#' Add MedNorm Reference to SeqId Annotations +#' @noRd +.addMedNormReference <- function(adat, ref_data, dil_groups) { + + # Get analyte info + apt_data <- getAnalyteInfo(adat) + + # Initialize the column if it doesn't exist + if (!"medNormSMP_ReferenceRFU" %in% names(apt_data)) { + apt_data$medNormSMP_ReferenceRFU <- NA_real_ + } + + # For each analyte, add medNormSMP_ReferenceRFU + if (!is.null(ref_data)) { + for (dil_name in names(dil_groups)) { + if (dil_name %in% names(ref_data)) { + dil_apts <- intersect(dil_groups[[dil_name]], getAnalytes(adat)) + + if (length(dil_apts) > 0) { + ref_values <- ref_data[[dil_name]] + + # Handle both single reference and aptamer-specific references + if (is.numeric(ref_values) && length(ref_values) == 1) { + # Single reference value for the whole dilution group + apt_data$medNormSMP_ReferenceRFU[apt_data$AptName %in% dil_apts] <- ref_values + } else if (is.numeric(ref_values) && length(ref_values) > 1) { + # Aptamer-specific reference values + for (apt in dil_apts) { + if (apt %in% names(ref_values)) { + apt_data$medNormSMP_ReferenceRFU[apt_data$AptName == apt] <- ref_values[apt] + } + } + } + } + } + } + + # Update the analyte metadata + attr(adat, "Col.Meta") <- apt_data + } + + invisible(adat) +} + + +#' Reverse Existing Median Normalization for Study Samples +#' @noRd +.reverseMedNormSMP <- function(adat, verbose) { + + # Get existing scale factors + sf_cols <- grep("^NormScale_", names(adat), value = TRUE) + + if (length(sf_cols) == 0) { + if (verbose) { + cat("No existing median normalization scale factors found to reverse.\n") + } + return(adat) + } + + if (verbose) { + cat("Reversing existing median normalization for study samples...\n") + } + + # Get dilution groups + apt_data <- getAnalyteInfo(adat) + dil_groups <- split(apt_data$AptName, apt_data$Dilution) + names(dil_groups) <- gsub("\\.", "_", names(dil_groups)) + names(dil_groups) <- gsub("[.]0$|%|^[.]", "", names(dil_groups)) + + # Only reverse for study samples, leave QC/Calibrator/Buffer alone + sample_mask <- grepl("Sample|sample", adat$SampleType %||% adat$SampleId %||% "", ignore.case = TRUE) + + if (sum(sample_mask) == 0) { + if (verbose) { + cat("No study samples found to reverse normalization.\n") + } + return(adat) + } + + # For each dilution group, reverse normalization + for (dil_name in names(dil_groups)) { + sf_col <- paste0("NormScale_", dil_name) + + if (sf_col %in% names(adat)) { + dil_apts <- intersect(dil_groups[[dil_name]], getAnalytes(adat)) + + if (length(dil_apts) > 0) { + for (i in which(sample_mask)) { + scale_factor <- adat[[sf_col]][i] + if (!is.na(scale_factor) && scale_factor != 0) { + # Apply inverse of scale factor + adat[i, dil_apts] <- adat[i, dil_apts] / scale_factor + # Reset scale factor to 1.0 + adat[[sf_col]][i] <- 1.0 + } + } + } + } + } + + # Update ProcessSteps to indicate reversal + header_meta <- attr(adat, "Header.Meta") + if (!is.null(header_meta) && !is.null(header_meta$HEADER)) { + current_steps <- header_meta$HEADER$ProcessSteps %||% "" + header_meta$HEADER$ProcessSteps <- paste(current_steps, "reverseMedNormSMP", sep = ", ") + attr(adat, "Header.Meta") <- header_meta + } + + if (verbose) { + cat("Median normalization reversed for", sum(sample_mask), "study samples.\n") + } + + adat +} + + +#' Recalculate RowCheck after Median Normalization +#' +#' Recalculates RowCheck values as PASS or FLAG based on normalization acceptance +#' criteria for row scale factors after median normalization. Samples with all +#' row scale factors within the acceptance range (0.4 to 2.5) receive "PASS", +#' while samples with any scale factor outside this range receive "FLAG". +#' +#' @param adat A `soma_adat` object after median normalization +#' @param verbose Logical. Whether to print progress messages +#' @return The `soma_adat` object with updated RowCheck values +#' @noRd +.recalculateRowCheck <- function(adat, verbose) { + + if (verbose) { + cat("Recalculating RowCheck values based on normalization acceptance criteria...\n") + } + + # Check if RowCheck column exists + if (!"RowCheck" %in% names(adat)) { + if (verbose) { + cat("No RowCheck column found to recalculate.\n") + } + return(adat) + } + + # Find all normalization scale factor columns (NormScale_*) + scale_factor_cols <- grep("^NormScale_", names(adat), value = TRUE) + + if (length(scale_factor_cols) == 0) { + if (verbose) { + cat("No normalization scale factor columns found. Setting all RowCheck to PASS.\n") + } + adat$RowCheck <- "PASS" + return(adat) + } + + # Define acceptance criteria range for row scale factors + min_scale <- 0.4 + max_scale <- 2.5 + + # Calculate RowCheck for each sample + for (i in seq_len(nrow(adat))) { + # Get all scale factor values for this sample + scale_values <- as.numeric(adat[i, scale_factor_cols, drop = FALSE]) + scale_values <- scale_values[!is.na(scale_values)] + + # Check if all scale factors are within acceptance range + if (length(scale_values) == 0) { + # No scale factors available - default to PASS + adat$RowCheck[i] <- "PASS" + } else if (all(scale_values >= min_scale & scale_values <= max_scale)) { + adat$RowCheck[i] <- "PASS" + } else { + adat$RowCheck[i] <- "FLAG" + } + } + + if (verbose) { + pass_count <- sum(adat$RowCheck == "PASS", na.rm = TRUE) + flag_count <- sum(adat$RowCheck == "FLAG", na.rm = TRUE) + cat("RowCheck values updated for", nrow(adat), "samples.\n") + cat(" - PASS:", pass_count, "samples\n") + cat(" - FLAG:", flag_count, "samples\n") + cat(" - Acceptance criteria: scale factors within [", min_scale, ", ", max_scale, "]\n") + } + + adat +} diff --git a/inst/WORDLIST b/inst/WORDLIST index 4c2a38cc..8bd5756d 100644 --- a/inst/WORDLIST +++ b/inst/WORDLIST @@ -5,6 +5,7 @@ Agilent Analyte Analytes AptName +AptNames AssayNotes Barcode BioTools @@ -36,9 +37,10 @@ HybControlNormScale Kuei LF Lifecycle +MADs MERCHANTABILITY MacOS -MADs +MedNorm NHS NormScale ORCID @@ -55,7 +57,6 @@ RFU RFUs RUO ReferenceRFU -Reproducibility RowCheck SELEX SG @@ -87,8 +88,8 @@ TimePoint TubeUniqueID Un UniProt -YAML adat +adats aliquot analyte analytes @@ -112,7 +113,6 @@ magrittr md medNormRef nd -normals pipetting pkgdown plex @@ -128,6 +128,4 @@ tada th tibble tidyr -untransformed variabilities -vectorized diff --git a/man/medianNormalize.Rd b/man/medianNormalize.Rd index 00a22ad8..d8ac0a30 100644 --- a/man/medianNormalize.Rd +++ b/man/medianNormalize.Rd @@ -12,6 +12,7 @@ medianNormalize( by = "SampleType", do_field = "SampleType", do_regexp = "QC|Sample", + reverse_existing = FALSE, verbose = TRUE ) } @@ -57,6 +58,12 @@ separately. Default is \code{"SampleType"}.} \item{do_regexp}{Character. A regular expression pattern to select samples from \code{do_field} for normalization. Default is \code{"QC|Sample"}.} +\item{reverse_existing}{Logical. Should existing median normalization be +reversed before applying new normalization? When \code{TRUE}, existing median +normalization scale factors are reversed for study samples only (QC, +Calibrator, and Buffer samples retain their normalization). This allows +re-normalization of data that has already been median normalized. Default is \code{FALSE}.} + \item{verbose}{Logical. Should progress messages be printed? Default is \code{TRUE}.} } \value{ @@ -88,6 +95,30 @@ reference approaches: calculating an internal reference from study samples, or using a calculated reference from a different ADAT or external reference file. } +\section{Data Requirements}{ + +This function is designed for data in standard SomaLogic deliverable formats: +\enumerate{ +\item \strong{Study samples not median normalized}, with all controls median normalized. +Calibrators and buffers are intra-plate median normalized and QC samples are ANML +normalized if present on standard 3-dilution setups. QC assay controls undergo +full ANML data standardization even when study samples don't, in order to QC +the plate. Cell & tissue single dilution runs don't have QC samples. +\item \strong{Same as above}, but study samples undergo either ANML or intra-study +standard median normalization. +} + +All data should be hybridization normalized, median normalized internally +(buffer + calibrator), and plate scaled regardless of matrix prior to applying +median normalization. Calibration is not applied to all study types. + +\strong{Primary use cases:} +\itemize{ +\item 3-dilution setups (standard SomaScan) +\item Single dilution cell & tissue studies +} +} + \section{Important Considerations}{ \itemize{ @@ -95,6 +126,10 @@ file. conditions, perform normalization \emph{within} groups using the \code{by} parameter \item In newer SomaScan assay versions, population references improve inter-plate consistency compared to within-plate references +\item The function will not proceed on already ANML or median normalized data +unless \code{reverse_existing = TRUE} is specified +\item When reversing existing normalization, only study samples are reversed; +QC, Calibrator, and Buffer samples retain their normalization } } @@ -137,5 +172,11 @@ med_norm_adat <- medianNormalize(adat, by = c("Sex", "SampleType")) med_norm_adat <- medianNormalize(adat, do_field = "SampleType", do_regexp = "Sample") + +# Re-normalize data that has already been median normalized +# Use when you want to apply different normalization to previously normalized data +med_norm_adat <- medianNormalize(adat, + reverse_existing = TRUE, + reference = new_reference) } } diff --git a/man/soma_adat.Rd b/man/soma_adat.Rd index d117ebd1..26127fdc 100644 --- a/man/soma_adat.Rd +++ b/man/soma_adat.Rd @@ -91,18 +91,16 @@ dispatched on class \code{soma_adat}. Below is a list of \emph{all} currently available S3 methods that dispatch on the \code{soma_adat} class: -\if{html}{\out{
}}\preformatted{#> [1] [ [[ [[<- [<- -#> [5] == $ $<- anti_join -#> [9] arrange centerScaleData convertAptNames count -#> [13] filter full_join getAdatVersion getAnalytes -#> [17] getMeta group_by imputeNAs inner_join -#> [21] is_seqFormat is.logspace left_join Math -#> [25] median merge mutate Ops -#> [29] print rename revertAptNames right_join -#> [33] row.names<- sample_frac sample_n select -#> [37] semi_join separate slice_sample slice -#> [41] summary Summary transform ungroup -#> [45] unite +\if{html}{\out{
}}\preformatted{#> [1] [ [[ [[<- [<- +#> [5] == $ $<- anti_join +#> [9] arrange count filter full_join +#> [13] getAdatVersion getAnalytes getMeta group_by +#> [17] inner_join is_seqFormat left_join Math +#> [21] median merge mutate Ops +#> [25] print rename right_join row.names<- +#> [29] sample_frac sample_n select semi_join +#> [33] separate slice_sample slice summary +#> [37] Summary transform ungroup unite #> see '?methods' for accessing help and source code }\if{html}{\out{
}} diff --git a/tests/testthat/_snaps/medianNormalize.md b/tests/testthat/_snaps/medianNormalize.md index 8ee3491c..8a528fe1 100644 --- a/tests/testthat/_snaps/medianNormalize.md +++ b/tests/testthat/_snaps/medianNormalize.md @@ -2,6 +2,13 @@ Code result <- medianNormalize(test_data, verbose = TRUE) + Output + Data validation passed for median normalization. + Standard deliverable checks: + - Hybridization normalization: PASS + - Plate scale normalization: PASS + - No existing MedNorm/ANML: PASS + Three dilution setup detected (standard setup). Message > Normalization scale factors already exist: "NormScale_20, NormScale_0_005, NormScale_0_5" - they will be replaced with new scale factors > Building internal reference from field: "SampleType" with values: "QC" and "Sample" @@ -14,4 +21,10 @@ v Processing dilution '0_005' with 173 analytes v Processing dilution '0_5' with 828 analytes v Processing dilution '20' with 4271 analytes + Output + Recalculating RowCheck values based on normalization acceptance criteria... + RowCheck values updated for 16 samples. + - PASS: 16 samples + - FLAG: 0 samples + - Acceptance criteria: scale factors within [ 0.4 , 2.5 ] diff --git a/tests/testthat/test-medianNormalize.R b/tests/testthat/test-medianNormalize.R index 6634c43f..fb7d4b7d 100644 --- a/tests/testthat/test-medianNormalize.R +++ b/tests/testthat/test-medianNormalize.R @@ -8,9 +8,18 @@ create_test_data <- function() { # Combine indices and subset subset_indices <- c(sample_indices, qc_indices) + test_adat <- example_data[subset_indices, ] + + # Modify header to simulate pre-ANML state (remove ANML steps) + header_meta <- attr(test_adat, "Header.Meta") + if (!is.null(header_meta) && !is.null(header_meta$HEADER)) { + # Remove ANML and median normalization steps to simulate pre-processed state + header_meta$HEADER$ProcessSteps <- "Raw RFU, Hyb Normalization, plateScale, Calibration" + attr(test_adat, "Header.Meta") <- header_meta + } # Return subset with representative samples - example_data[subset_indices, ] + test_adat } test_data <- create_test_data() @@ -42,7 +51,7 @@ test_that("`medianNormalize` Method 1: Internal reference (default)", { result_header <- attr(result, "Header.Meta")$HEADER expect_true(grepl("medNormInt", result_header$ProcessSteps)) expect_equal(result_header$NormalizationAlgorithm, "MedNorm") - expect_equal(result_header$MedNormReference, "intraplate") + expect_true(grepl("intraplate, crossplate", result_header$MedNormReference)) }) test_that("`medianNormalize` Method 2: Reference from specific samples", { @@ -72,7 +81,7 @@ test_that("`medianNormalize` Method 2: Reference from specific samples", { # Check header metadata result_header <- attr(result, "Header.Meta")$HEADER - expect_equal(result_header$MedNormReference, "intraplate") + expect_true(grepl("intraplate, crossplate", result_header$MedNormReference)) }) test_that("`medianNormalize` Method 3: Reference from another ADAT", { @@ -99,7 +108,8 @@ test_that("`medianNormalize` Method 3: Reference from another ADAT", { # Check header metadata result_header <- attr(result, "Header.Meta")$HEADER - expect_equal(result_header$MedNormReference, "external_adat") + expect_true(grepl("external_adat", result_header$MedNormReference)) + expect_true(grepl("crossplate", result_header$MedNormReference)) }) test_that("`medianNormalize` Method 4: External reference file", { @@ -135,7 +145,8 @@ test_that("`medianNormalize` Method 4: External reference file", { # Check header metadata result_header <- attr(result, "Header.Meta")$HEADER - expect_equal(result_header$MedNormReference, basename(temp_csv)) + expect_true(grepl(basename(temp_csv), result_header$MedNormReference)) + expect_true(grepl("crossplate", result_header$MedNormReference)) # Clean up unlink(temp_csv) @@ -192,7 +203,8 @@ test_that("`medianNormalize` Method 5: External reference as data.frame", { # Check header metadata result_header <- attr(result, "Header.Meta")$HEADER - expect_equal(result_header$MedNormReference, "external_data") + expect_true(grepl("external_data", result_header$MedNormReference)) + expect_true(grepl("crossplate", result_header$MedNormReference)) }) test_that("`medianNormalize` validates input requirements", { From 8b5be65e6134dbdd5d5f4e11d34cfde26f566dfa Mon Sep 17 00:00:00 2001 From: scheidec Date: Tue, 13 Jan 2026 18:14:09 -0500 Subject: [PATCH 03/10] Streamline unit tests for medianNormalize() - use subset of the exmaple data for unit tests to reduce execution time --- tests/testthat/_snaps/medianNormalize.md | 20 +-- tests/testthat/test-medianNormalize.R | 148 +++++++++++------------ 2 files changed, 83 insertions(+), 85 deletions(-) diff --git a/tests/testthat/_snaps/medianNormalize.md b/tests/testthat/_snaps/medianNormalize.md index 8a528fe1..390f7f7c 100644 --- a/tests/testthat/_snaps/medianNormalize.md +++ b/tests/testthat/_snaps/medianNormalize.md @@ -13,18 +13,18 @@ > Normalization scale factors already exist: "NormScale_20, NormScale_0_005, NormScale_0_5" - they will be replaced with new scale factors > Building internal reference from field: "SampleType" with values: "QC" and "Sample" > Performing grouped median normalization by: "SampleType" (2 groups) - > Processing group: "Sample" (12 samples) - v Processing dilution '0_005' with 173 analytes - v Processing dilution '0_5' with 828 analytes - v Processing dilution '20' with 4271 analytes - > Processing group: "QC" (4 samples) - v Processing dilution '0_005' with 173 analytes - v Processing dilution '0_5' with 828 analytes - v Processing dilution '20' with 4271 analytes + > Processing group: "Sample" (2 samples) + v Processing dilution '0_005' with 50 analytes + v Processing dilution '0_5' with 50 analytes + v Processing dilution '20' with 50 analytes + > Processing group: "QC" (1 samples) + v Processing dilution '0_005' with 50 analytes + v Processing dilution '0_5' with 50 analytes + v Processing dilution '20' with 50 analytes Output Recalculating RowCheck values based on normalization acceptance criteria... - RowCheck values updated for 16 samples. - - PASS: 16 samples + RowCheck values updated for 3 samples. + - PASS: 3 samples - FLAG: 0 samples - Acceptance criteria: scale factors within [ 0.4 , 2.5 ] diff --git a/tests/testthat/test-medianNormalize.R b/tests/testthat/test-medianNormalize.R index fb7d4b7d..afc8ebcf 100644 --- a/tests/testthat/test-medianNormalize.R +++ b/tests/testthat/test-medianNormalize.R @@ -1,14 +1,33 @@ # Setup ---- -# Create a minimal test dataset -create_test_data <- function() { +# Create a minimal test dataset with options for small vs full size +create_test_data <- function(small = FALSE) { data("example_data", package = "SomaDataIO") - # Select samples ensuring we have both Sample and QC types - sample_indices <- which(example_data$SampleType == "Sample")[1:12] - qc_indices <- which(example_data$SampleType == "QC")[1:4] - # Combine indices and subset - subset_indices <- c(sample_indices, qc_indices) - test_adat <- example_data[subset_indices, ] + if (small) { + # Ultra-small dataset for most tests: 3 samples, subset of analytes + sample_indices <- which(example_data$SampleType == "Sample")[1:2] + qc_indices <- which(example_data$SampleType == "QC")[1:1] + subset_indices <- c(sample_indices, qc_indices) + + # Select subset of analytes from different dilution groups for speed + analytes <- getAnalytes(example_data) + analyte_info <- getAnalyteInfo(example_data) + + # Get first 50 analytes from each dilution group for faster processing + selected_analytes <- c() + for (dil in unique(analyte_info$Dilution)) { + dil_analytes <- analytes[analyte_info$Dilution == dil] + selected_analytes <- c(selected_analytes, head(dil_analytes, 50)) + } + + test_adat <- example_data[subset_indices, c(getMeta(example_data), selected_analytes)] + } else { + # Standard test dataset: 16 samples, all analytes (for comprehensive tests) + sample_indices <- which(example_data$SampleType == "Sample")[1:12] + qc_indices <- which(example_data$SampleType == "QC")[1:4] + subset_indices <- c(sample_indices, qc_indices) + test_adat <- example_data[subset_indices, ] + } # Modify header to simulate pre-ANML state (remove ANML steps) header_meta <- attr(test_adat, "Header.Meta") @@ -22,7 +41,8 @@ create_test_data <- function() { test_adat } -test_data <- create_test_data() +test_data <- create_test_data(small = TRUE) +test_data_full <- create_test_data(small = FALSE) # Testing ---- test_that("`medianNormalize` Method 1: Internal reference (default)", { @@ -39,13 +59,13 @@ test_that("`medianNormalize` Method 1: Internal reference (default)", { expect_true(all(norm_cols %in% names(result))) # Test scale factor output - expect_equal(result$NormScale_20[1:3], c(1.067845, 1.057102, 1.091748), tolerance = 0.0001) - expect_equal(result$NormScale_0_005[1:3], c(1.022533, 1.001174, 1.012215), tolerance = 0.0001) - expect_equal(result$NormScale_0_5[1:3], c(1.103112, 1.054358, 1.108711), tolerance = 0.0001) + expect_equal(result$NormScale_20[1:3], c(0.971895, 1.029779, 1), tolerance = 0.000001) + expect_equal(result$NormScale_0_005[1:3], c(1.002922, 0.997176, 1), tolerance = 0.000001) + expect_equal(result$NormScale_0_5[1:3], c(0.973205, 1.028323, 1), tolerance = 0.000001) # Test specific SeqId columns - expect_equal(result$seq.10000.28[1:3], c(508.8, 501.5, 453.7)) - expect_equal(result$seq.10008.43[1:3], c(599.9, 572.8, 462.8)) + expect_equal(result$seq.10000.28[1:3], c(463.1, 488.5, 501.5)) + expect_equal(result$seq.10001.7[1:3], c(301.4, 302.2, 207.4)) # Check header metadata result_header <- attr(result, "Header.Meta")$HEADER @@ -71,13 +91,13 @@ test_that("`medianNormalize` Method 2: Reference from specific samples", { expect_equal(length(norm_cols), 3) # Should have 3 dilution groups # Test scale factor output - expect_equal(result$NormScale_20[1:3], c(1.0369358, 0.9602251, 0.9841162), tolerance = 0.0001) - expect_equal(result$NormScale_0_005[1:3], c(0.8570162, 0.8485842, 1.0327016), tolerance = 0.0001) - expect_equal(result$NormScale_0_5[1:3], c(0.7771749, 0.8520195, 0.9151915), tolerance = 0.0001) + expect_equal(result$NormScale_20[1:3], c(1.0369358, 0.9602251, 1), tolerance = 0.000001) + expect_equal(result$NormScale_0_005[1:3], c(0.8570162, 0.8485842, 1), tolerance = 0.000001) + expect_equal(result$NormScale_0_5[1:3], c(0.7771749, 0.8520195, 1), tolerance = 0.000001) # Test specific SeqId columns - expect_equal(result$seq.10000.28[1:3], c(476.5, 474.4, 415.6)) - expect_equal(result$seq.10008.43[1:3], c(561.8, 541.9, 423.9)) + expect_equal(result$seq.10000.28[1:3], c(476.5, 474.4, 501.5)) + expect_equal(result$seq.10008.43[1:3], c(561.8, 541.9, 510.1)) # Check header metadata result_header <- attr(result, "Header.Meta")$HEADER @@ -86,7 +106,7 @@ test_that("`medianNormalize` Method 2: Reference from specific samples", { test_that("`medianNormalize` Method 3: Reference from another ADAT", { # Create a minimal reference ADAT from sample subset - ref_adat <- test_data[1:4, ] # Use first 4 samples as reference + ref_adat <- test_data[1:2, ] # Use first 2 samples as reference expect_no_error( result <- medianNormalize(test_data, reference = ref_adat, verbose = FALSE) @@ -98,13 +118,13 @@ test_that("`medianNormalize` Method 3: Reference from another ADAT", { expect_equal(length(norm_cols), 3) # Should have 3 dilution groups # Test scale factor output - expect_equal(result$NormScale_20[1:3], c(0.9907154, 1.0109152, 0.9931271), tolerance = 0.0001) - expect_equal(result$NormScale_0_005[1:3], c(1.005733, 1.008454, 0.986035), tolerance = 0.0001) - expect_equal(result$NormScale_0_5[1:3], c(1.0090878, 0.9909594, 1.0002446), tolerance = 0.0001) + expect_equal(result$NormScale_20[1:3], c(0.9718953, 1.0297787, 1.0841026), tolerance = 0.0001) + expect_equal(result$NormScale_0_005[1:3], c(1.0029224, 0.9971762, 1.0548762), tolerance = 0.0001) + expect_equal(result$NormScale_0_5[1:3], c(0.9732045, 1.0283228, 0.9601410), tolerance = 0.0001) # Test specific SeqId columns - expect_equal(result$seq.10000.28[1:3], c(472.1, 479.6, 412.7)) - expect_equal(result$seq.10008.43[1:3], c(556.6, 547.8, 421.0)) + expect_equal(result$seq.10000.28[1:3], c(463.1, 488.5, 543.7)) + expect_equal(result$seq.10008.43[1:3], c(546, 558, 553)) # Check header metadata result_header <- attr(result, "Header.Meta")$HEADER @@ -135,13 +155,13 @@ test_that("`medianNormalize` Method 4: External reference file", { expect_equal(length(norm_cols), 3) # Should have 3 dilution groups # Test scale factor output - expect_equal(result$NormScale_20[1:3], c(2.937272, 2.982111, 2.936927), tolerance = 0.0001) - expect_equal(result$NormScale_0_005[1:3], c(0.6313838, 0.5966632, 0.6624721), tolerance = 0.0001) - expect_equal(result$NormScale_0_5[1:3], c(0.7147623, 0.7780614, 0.7704032), tolerance = 0.0001) + expect_equal(result$NormScale_20[1:3], c(3.31, 3.71, 4.21), tolerance = 0.01) + expect_equal(result$NormScale_0_005[1:3], c(0.577, 0.561, 0.561), tolerance = 0.001) + expect_equal(result$NormScale_0_5[1:3], c(0.90, 1.18, 1.03), tolerance = 0.01) # Test specific SeqId columns - expect_equal(result$seq.10000.28[1:3], c(1399.6, 1414.7, 1220.6)) - expect_equal(result$seq.10008.43[1:3], c(1650.2, 1616.0, 1245.0)) + expect_equal(result$seq.10000.28[1:3], c(1579.0, 1759.9, 2109.7)) + expect_equal(result$seq.10008.43[1:3], c(1861.7, 2010.3, 2145.9)) # Check header metadata result_header <- attr(result, "Header.Meta")$HEADER @@ -150,28 +170,6 @@ test_that("`medianNormalize` Method 4: External reference file", { # Clean up unlink(temp_csv) - - # Test with TSV file - temp_tsv <- tempfile(fileext = ".tsv") - write.table(ref_data, temp_tsv, sep = "\t", row.names = FALSE) - - expect_no_error( - result2 <- medianNormalize(test_data, reference = temp_tsv, verbose = FALSE) - ) - - expect_true(is.soma_adat(result2)) - - # Test scale factor output - expect_equal(result2$NormScale_20[1:3], c(2.937272, 2.982111, 2.936927), tolerance = 0.0001) - expect_equal(result2$NormScale_0_005[1:3], c(0.6313838, 0.5966632, 0.6624721), tolerance = 0.0001) - expect_equal(result2$NormScale_0_5[1:3], c(0.7147623, 0.7780614, 0.7704032), tolerance = 0.0001) - - # Test specific SeqId columns - expect_equal(result2$seq.10000.28[1:3], c(1399.6, 1414.7, 1220.6)) - expect_equal(result2$seq.10008.43[1:3], c(1650.2, 1616.0, 1245.0)) - - # Clean up - unlink(temp_tsv) }) test_that("`medianNormalize` Method 5: External reference as data.frame", { @@ -193,13 +191,13 @@ test_that("`medianNormalize` Method 5: External reference as data.frame", { expect_equal(length(norm_cols), 3) # Should have 3 dilution groups # Test scale factor output - expect_equal(result$NormScale_20[1:3], c(2.937272, 2.982111, 2.936927), tolerance = 0.0001) - expect_equal(result$NormScale_0_005[1:3], c(0.6313838, 0.5966632, 0.6624721), tolerance = 0.0001) - expect_equal(result$NormScale_0_5[1:3], c(0.7147623, 0.7780614, 0.7704032), tolerance = 0.0001) + expect_equal(result$NormScale_20[1:3], c(3.31, 3.71, 4.21), tolerance = 0.01) + expect_equal(result$NormScale_0_005[1:3], c(0.577, 0.561, 0.561), tolerance = 0.001) + expect_equal(result$NormScale_0_5[1:3], c(0.90, 1.18, 1.03), tolerance = 0.01) # Test specific SeqId columns - expect_equal(result$seq.10000.28[1:3], c(1399.6, 1414.7, 1220.6)) - expect_equal(result$seq.10008.43[1:3], c(1650.2, 1616.0, 1245.0)) + expect_equal(result$seq.10000.28[1:3], c(1579.0, 1759.9, 2109.7)) + expect_equal(result$seq.10008.43[1:3], c(1861.7, 2010.3, 2145.9)) # Check header metadata result_header <- attr(result, "Header.Meta")$HEADER @@ -287,19 +285,19 @@ test_that("`medianNormalize` handles grouping correctly", { norm_cols <- grep("^NormScale_", names(result1), value = TRUE) expect_equal(length(norm_cols), 3) # Should have 3 dilution groups - # Test scale factor output - expect_equal(result1$NormScale_20[1:3], c(1.066964, 1.060173, 1.020341), tolerance = 0.0001) - expect_equal(result1$NormScale_0_005[1:3], c(1.000455, 1.005372, 1.000000), tolerance = 0.0001) - expect_equal(result1$NormScale_0_5[1:3], c(1.115562, 1.050018, 1.030397), tolerance = 0.0001) + # Test scale factor output - will vary based on grouping by sex + expect_equal(result1$NormScale_20[1:3], c(0.971895, 1.029779, 1.199944), tolerance = 0.000001) + expect_equal(result1$NormScale_0_005[1:3], c(1.002922, 0.997176, 1.058289), tolerance = 0.000001) + expect_equal(result1$NormScale_0_5[1:3], c(0.973205, 1.028323, 1.104745), tolerance = 0.000001) # Test specific SeqId columns - expect_equal(result1$seq.10000.28[1:3], c(508.4, 502.9, 424.1)) - expect_equal(result1$seq.10008.43[1:3], c(599.4, 574.5, 432.5)) + expect_equal(result1$seq.10000.28[1:3], c(463.1, 488.5, 501.5)) + expect_equal(result1$seq.10008.43[1:3], c(546.0, 558.0, 510.1)) - # Test grouping by multiple variables - if ("PlateId" %in% names(test_data)) { + # Test grouping by multiple variables - use full example data + if ("PlateId" %in% names(test_data_full)) { expect_no_error( - result2 <- medianNormalize(test_data, by = c("PlateId", "SampleType"), verbose = FALSE) + result2 <- medianNormalize(test_data_full, by = c("PlateId", "SampleType"), verbose = FALSE) ) # Check result structure @@ -308,9 +306,9 @@ test_that("`medianNormalize` handles grouping correctly", { expect_equal(length(norm_cols), 3) # Should have 3 dilution groups # Test scale factor output - expect_equal(result2$NormScale_20[1:3], c(1.053656, 1.059554, 1.068602), tolerance = 0.0001) - expect_equal(result2$NormScale_0_005[1:3], c(1.0124749, 1.0053543, 0.9992799), tolerance = 0.0001) - expect_equal(result2$NormScale_0_5[1:3], c(1.080347, 1.056874, 1.074292), tolerance = 0.0001) + expect_equal(result2$NormScale_20[1:3], c(1.054, 1.060, 1.069), tolerance = 0.001) + expect_equal(result2$NormScale_0_005[1:3], c(1.0125, 1.0054, 0.9993), tolerance = 0.001) + expect_equal(result2$NormScale_0_5[1:3], c(1.080, 1.057, 1.074), tolerance = 0.001) # Test specific SeqId columns expect_equal(result2$seq.10000.28[1:3], c(502.1, 502.7, 444.1)) @@ -339,14 +337,14 @@ test_that("`medianNormalize` handles sample selection correctly", { norm_cols <- grep("^NormScale_", names(result), value = TRUE) expect_equal(length(norm_cols), 3) # Should have 3 dilution groups - # Test scale factor output - expect_equal(result$NormScale_20[1:3], c(1.0369358, 0.9602251, 0.9841162), tolerance = 0.0001) - expect_equal(result$NormScale_0_005[1:3], c(0.8570162, 0.8485842, 1.0327016), tolerance = 0.0001) - expect_equal(result$NormScale_0_5[1:3], c(0.7771749, 0.8520195, 0.9151915), tolerance = 0.0001) + # Test scale factor output - same as Method 2 for QC selection + expect_equal(result$NormScale_20[1:3], c(1.036936, 0.960225, 1), tolerance = 0.0001) + expect_equal(result$NormScale_0_005[1:3], c(0.857016, 0.848584, 1), tolerance = 0.0001) + expect_equal(result$NormScale_0_5[1:3], c(0.777175, 0.85202, 1), tolerance = 0.0001) # Test specific SeqId columns - expect_equal(result$seq.10000.28[1:3], c(476.5, 474.4, 415.6)) - expect_equal(result$seq.10008.43[1:3], c(561.8, 541.9, 423.9)) + expect_equal(result$seq.10000.28[1:3], c(476.5, 474.4, 501.5)) + expect_equal(result$seq.10008.43[1:3], c(561.8, 541.9, 510.1)) # Test with non-existent field expect_error( From 071f956f301f7a6b0b574f6784482ca13e8700e0 Mon Sep 17 00:00:00 2001 From: Alex Poole Date: Tue, 27 Jan 2026 17:10:22 -0700 Subject: [PATCH 04/10] Update medianNormalize function documentation --- R/medianNormalize.R | 133 +++++++++++++++++++++++--------------------- 1 file changed, 71 insertions(+), 62 deletions(-) diff --git a/R/medianNormalize.R b/R/medianNormalize.R index 4b44d55e..410d2c10 100644 --- a/R/medianNormalize.R +++ b/R/medianNormalize.R @@ -1,101 +1,110 @@ #' Perform Median Normalization #' #' @description Performs median normalization on a `soma_adat` object that has -#' already undergone hybridization normalization and contains plate scale -#' normalization factors. +#' already undergone standard data processing for array-based SomaScan studies. #' -#' Median normalization is a technique used to adjust data sets to remove -#' certain assay artifacts and biases prior to analysis, particularly those that -#' may arise due to differences in overall protein concentration, pipetting -#' errors, reagent concentration changes, assay timing, and other systematic -#' variabilities. Median normalization can improve assay precision and reduce -#' technical variations that can mask true biological signals. +#' Median normalization is a common, scale-based normalization technique that +#' corrects for assay-derived technical variation by applying sample-specific +#' linear scaling to expression measurements. Typical sources of assay +#' variation include robotic and manual liquid handling, manufactured +#' consumables such as buffers and plastic goods, laboratory instrument +#' calibration, ambient environmental conditions, inter-operator differences, +#' and other sources of technical variation. Median normalization +#' can improve assay precision and reduce technical variation that can mask +#' true biological signal. #' -#' This function works by calculating or using reference median values for each -#' dilution group present in the ADAT. Dilution groups are determined from the -#' analyte metadata and represent different sample dilutions used in the assay -#' (e.g., 1:1, 1:200, 1:2000). For each sample, the median RFU across all -#' analytes within a dilution group is calculated and compared to the reference -#' median for that group. Scale factors are then computed and applied to adjust -#' each sample's values toward the reference, ensuring consistent signal levels -#' across samples within each dilution group. This function supports multiple -#' reference approaches: calculating an internal reference from study samples, -#' or using a calculated reference from a different ADAT or external reference -#' file. +#' The method scales each sample so that the center of the within-sample analyte +#' distribution aligns to a defined reference, thereby correcting +#' global intensity shifts without altering relative differences between +#' measurements within a sample. For assay formats with multiple dilution groups +#' (e.g., 1:5 or 20%; 1:200 or 0.5%, 1:20,000 or 0.005%), separate scale +#' factors are calculated for each dilution because each dilution group is +#' processed separately during the assay. For each sample, the ratio of +#' reference RFU / observed RFU is calculated for every SeqId. The median ratio +#' within each dilution group is selected as the scale factor and applied to +#' all SeqIds for that sample within the associated dilution bin. #' #' @section Data Requirements: -#' This function is designed for data in standard SomaLogic deliverable formats: +#' This function is designed for data in standard SomaLogic deliverable formats. +#' Specific ADAT file requirements: #' \enumerate{ -#' \item \strong{Study samples not median normalized}, with all controls median normalized. -#' Calibrators and buffers are intra-plate median normalized and QC samples are ANML -#' normalized if present on standard 3-dilution setups. QC assay controls undergo -#' full ANML data standardization even when study samples don't, in order to QC -#' the plate. Cell & tissue single dilution runs don't have QC samples. -#' \item \strong{Same as above}, but study samples undergo either ANML or intra-study -#' standard median normalization. +#' \item \strong{Intact ADAT file}, with available data processing information +#' in the header section. Specifically, the `ProcessSteps` field must be +#' present and correctly represent the data processing steps present in +#' the data table. +#' \item \strong{Minimal standard processing}, the function assumes a standard +#' SomaScan data deliverable with minimally standard HybNorm and PlateScale +#' steps applied. #' } #' -#' All data should be hybridization normalized, median normalized internally -#' (buffer + calibrator), and plate scaled regardless of matrix prior to applying -#' median normalization. Calibration is not applied to all study types. -#' #' \strong{Primary use cases:} #' \itemize{ -#' \item 3-dilution setups (standard SomaScan) -#' \item Single dilution cell & tissue studies +#' \item Combining data sets from the same overarching experiment or sample +#' population and normalize to a common reference that were originally +#' processed separately and each normalized "withing study". +#' \item Normalize fundamentally different types of samples separately (by +#' group). For instance, lysate samples from different cell lines that +#' will be analyzed separately should likely be median normalized within +#' each cell type. Lysis buffer background samples would also be expected +#' to be normalized separately. #' } #' #' @section Important Considerations: #' \itemize{ -#' \item If there is a known change in total protein concentration due to analysis -#' conditions, perform normalization _within_ groups using the `by` parameter -#' \item In newer SomaScan assay versions, population references improve -#' inter-plate consistency compared to within-plate references -#' \item The function will not proceed on already ANML or median normalized data -#' unless `reverse_existing = TRUE` is specified -#' \item When reversing existing normalization, only study samples are reversed; -#' QC, Calibrator, and Buffer samples retain their normalization +#' \item A core assumption of median normalization is that the majority of +#' analytes are not differentially expressed; consequently, users should +#' validate this assumption by inspecting scale-factor distributions for +#' systematic bias between the biological groups intended for comparison. +#' \item Note this function does not perform the adaptive normalization by +#' maximum likelihood (ANML) method which leverages a population-based +#' reference that iteratively down-selects the set of analytes to include +#' for the normalization calculation. +#' \item The function requires `reverse_existing = TRUE` to be set in order +#' to process data where study samples have already undergone ANML or +#' standard median normalization. +#' \item When reversing existing normalization, only study samples are +#' reversed; QC, Calibrator, and Buffer samples retain their normalization #' } #' #' @param adat A `soma_adat` object created using [read_adat()], containing #' RFU values that have been hybridization normalized and plate scaled. #' @param reference Optional. Reference for median normalization. Can be: #' \itemize{ -#' \item `NULL` (default): Use internal reference from study samples +#' \item `NULL` (default): Calculate an internal reference from study +#' samples by taking the median of each SeqId within the sample +#' grouping. For multi-plate studies, the median of all plate medians +#' is used. #' \item A `soma_adat` object: Extract reference from this ADAT -#' \item A file path (character): Read external reference from tab/comma-separated file +#' \item A file path (character): Read external reference from +#' tab/comma-separated file #' \item A data.frame: Use provided reference data directly #' } #' When providing an external reference file or data.frame it must contain: #' \describe{ -#' \item{Dilution}{Character column specifying the dilution group names -#' (e.g., "0_005", "0_5", "20"). These should match the dilution -#' groups present in the ADAT analyte data.} -#' \item{Reference}{Numeric column containing the reference median values -#' for each dilution group. These values will be used as the target -#' medians for normalization.} -#' \item{SeqId}{Optional character column. When included, SeqId-specific -#' reference values are used for more precise normalization. Values -#' must be in "10000-28" format, not "seq.10000.28" format.} +#' \item{SeqId}{Character column containing the SeqId identifiers mapping +#' to those in the `soma_adat` object. Must be in "10000-28" format, not +#' "seq.10000.28" format.} +#' \item{Reference}{Numeric column containing the reference RFU values +#' for each SeqId.} +#' #' } #' @param ref_field Character. Field used to select reference samples when #' `reference = NULL`. Default is `"SampleType"`. #' @param ref_value Character. Value(s) in `ref_field` to use as reference -#' when `reference = NULL`. Default is `c("QC", "Sample")`. -#' @param by Character vector. Grouping variable(s) for grouped median normalization. -#' Must be column name(s) in the ADAT. Use grouping when there are known changes -#' in total protein load due to analysis conditions (e.g., disease vs. control, -#' treatment vs. baseline). Normalization will be performed within each group -#' separately. Default is `"SampleType"`. +#' when `reference = NULL`. Default is `c("Sample")`. +#' @param by Character vector. Grouping variable(s) for grouped median +#' normalization. Must be column name(s) in the ADAT. Normalization will be +#' performed within each group separately. Default is `"SampleType"`. #' @param do_field Character. The field used to select samples for normalization #' (others keep original values). Default is `"SampleType"`. #' @param do_regexp Character. A regular expression pattern to select samples -#' from `do_field` for normalization. Default is `"QC|Sample"`. +#' from `do_field` for normalization. Default is `"Sample"`. #' @param reverse_existing Logical. Should existing median normalization be #' reversed before applying new normalization? When `TRUE`, existing median #' normalization scale factors are reversed for study samples only (QC, #' Calibrator, and Buffer samples retain their normalization). This allows -#' re-normalization of data that has already been median normalized. Default is `FALSE`. +#' re-normalization of data that has already been median normalized. Default +#' is `FALSE`. #' @param verbose Logical. Should progress messages be printed? Default is `TRUE`. #' @return A `soma_adat` object with median normalization applied and RFU values #' adjusted. The existing `NormScale_*` columns are updated to include the @@ -153,10 +162,10 @@ medianNormalize <- function(adat, reference = NULL, ref_field = "SampleType", - ref_value = c("QC", "Sample"), + ref_value = c("Sample"), by = "SampleType", do_field = "SampleType", - do_regexp = "QC|Sample", + do_regexp = "Sample", reverse_existing = FALSE, verbose = TRUE) { From 8e47546eff534912636dce535e5f1e817b496baa Mon Sep 17 00:00:00 2001 From: scheidec Date: Mon, 9 Feb 2026 12:00:18 -0500 Subject: [PATCH 05/10] Simplify API, use sample-only defaults, add reverse ANML support - adjusted defaults to sample-only processing (SampleType == Sample) - added ANML normalization reversal support - simplified external reference to data.frame-only input - now require only SeqId and Reference columns for external reference data - added medNormSMP_ReferenceRFU annotation field with 2 decimal precision - updated ProcessSteps formatting to use "MedNormSMP" with reversal tracking --- NAMESPACE | 1 - R/medianNormalize.R | 256 ++++++++++++++--------- inst/WORDLIST | 1 + man/medianNormalize.Rd | 162 +++++++------- tests/testthat/_snaps/medianNormalize.md | 8 +- tests/testthat/test-medianNormalize.R | 224 ++++++++++++-------- 6 files changed, 367 insertions(+), 285 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index ffe655c4..a78cf5cf 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -219,6 +219,5 @@ importFrom(utils,capture.output) importFrom(utils,head) importFrom(utils,read.csv) importFrom(utils,read.delim) -importFrom(utils,read.table) importFrom(utils,tail) importFrom(utils,write.table) diff --git a/R/medianNormalize.R b/R/medianNormalize.R index 410d2c10..a74ba896 100644 --- a/R/medianNormalize.R +++ b/R/medianNormalize.R @@ -75,18 +75,15 @@ #' grouping. For multi-plate studies, the median of all plate medians #' is used. #' \item A `soma_adat` object: Extract reference from this ADAT -#' \item A file path (character): Read external reference from -#' tab/comma-separated file #' \item A data.frame: Use provided reference data directly #' } -#' When providing an external reference file or data.frame it must contain: +#' When providing an external reference data.frame it must contain: #' \describe{ #' \item{SeqId}{Character column containing the SeqId identifiers mapping #' to those in the `soma_adat` object. Must be in "10000-28" format, not #' "seq.10000.28" format.} #' \item{Reference}{Numeric column containing the reference RFU values #' for each SeqId.} -#' #' } #' @param ref_field Character. Field used to select reference samples when #' `reference = NULL`. Default is `"SampleType"`. @@ -99,12 +96,12 @@ #' (others keep original values). Default is `"SampleType"`. #' @param do_regexp Character. A regular expression pattern to select samples #' from `do_field` for normalization. Default is `"Sample"`. -#' @param reverse_existing Logical. Should existing median normalization be +#' @param reverse_existing Logical. Should existing median or ANML normalization be #' reversed before applying new normalization? When `TRUE`, existing median -#' normalization scale factors are reversed for study samples only (QC, -#' Calibrator, and Buffer samples retain their normalization). This allows -#' re-normalization of data that has already been median normalized. Default -#' is `FALSE`. +#' normalization scale factors or ANML normalization effects are reversed for +#' study samples only (QC, Calibrator, and Buffer samples retain their +#' normalization). This allows re-normalization of data that has already been +#' median normalized or ANML normalized. Default is `FALSE`. #' @param verbose Logical. Should progress messages be printed? Default is `TRUE`. #' @return A `soma_adat` object with median normalization applied and RFU values #' adjusted. The existing `NormScale_*` columns are updated to include the @@ -112,30 +109,19 @@ #' @examples #' \dontrun{ #' # Internal reference from study samples (default) -#' # Use when you have representative QC or control samples in your study #' med_norm_adat <- medianNormalize(adat) #' #' # Reference from specific samples in the ADAT -#' # Use when you want to normalize against only QC samples #' med_norm_adat <- medianNormalize(adat, #' ref_field = "SampleType", -#' ref_value = "QC") +#' ref_value = "Sample") #' #' # Reference from another ADAT -#' # Use when you have a reference population or control study #' ref_adat <- read_adat("reference_file.adat") #' med_norm_adat <- medianNormalize(adat, reference = ref_adat) #' -#' # External reference file -#' # Use when you have pre-calculated reference medians for each dilution -#' med_norm_adat <- medianNormalize(adat, reference = "reference_file.csv") -#' -#' # External reference as data.frame -#' # Use for programmatic control over reference values -#' ref_data <- data.frame( -#' Dilution = c("0_005", "0_5", "20"), -#' Reference = c(1500.2, 3200.8, 4100.5) -#' ) +#' # External reference as a data.frame - requires `SeqId` and `Reference` columns +#' ref_data <- read.csv("reference_file.csv") #' med_norm_adat <- medianNormalize(adat, reference = ref_data) #' #' # Custom grouping by multiple variables @@ -144,12 +130,11 @@ #' med_norm_adat <- medianNormalize(adat, by = c("Sex", "SampleType")) #' #' # Normalize only specific sample types -#' # Use when you want to preserve original values for certain sample types #' med_norm_adat <- medianNormalize(adat, #' do_field = "SampleType", #' do_regexp = "Sample") #' -#' # Re-normalize data that has already been median normalized +#' # Re-normalize data that has already been median or ANML normalized #' # Use when you want to apply different normalization to previously normalized data #' med_norm_adat <- medianNormalize(adat, #' reverse_existing = TRUE, @@ -157,7 +142,6 @@ #' } #' @importFrom dplyr filter #' @importFrom stats median -#' @importFrom utils read.table read.csv #' @export medianNormalize <- function(adat, reference = NULL, @@ -172,6 +156,14 @@ medianNormalize <- function(adat, # Input validation ---- stopifnot("`adat` must be a class `soma_adat` object" = is.soma_adat(adat)) + # Validate reference type early + if (!is.null(reference) && !is.soma_adat(reference) && !is.data.frame(reference)) { + stop( + "Invalid reference type. Must be NULL, soma_adat, or data.frame", + call. = FALSE + ) + } + # Check that required normalization steps have been applied ---- header <- attr(adat, "Header.Meta")$HEADER @@ -205,9 +197,20 @@ medianNormalize <- function(adat, # Reverse existing normalization if requested ---- if (reverse_existing && has_existing_norm) { if (verbose) { - cat("Reversing existing median normalization for study samples...\n") + if (grepl("ANML", header$ProcessSteps %||% "", ignore.case = TRUE)) { + cat("Reversing existing ANML normalization for study samples...\n") + adat <- .reverseANMLSMP(adat, verbose) + } else { + cat("Reversing existing median normalization for study samples...\n") + adat <- .reverseMedNormSMP(adat, verbose) + } + } else { + if (grepl("ANML", header$ProcessSteps %||% "", ignore.case = TRUE)) { + adat <- .reverseANMLSMP(adat, verbose) + } else { + adat <- .reverseMedNormSMP(adat, verbose) + } } - adat <- .reverseMedNormSMP(adat, verbose) } # Create dilution groups ---- @@ -353,13 +356,6 @@ medianNormalize <- function(adat, } return(.extractReferenceFromAdat(reference, dil_groups)) - } else if (is.character(reference) && length(reference) == 1) { - # External reference file - if (verbose) { - .todo("Reading external reference from file: {.val {reference}}") - } - return(.readExternalReference(reference, dil_groups, apt_data)) - } else if (is.data.frame(reference)) { # Reference data provided directly if (verbose) { @@ -369,7 +365,7 @@ medianNormalize <- function(adat, } else { stop( - "Invalid reference type. Must be NULL, soma_adat, file path, or data.frame", + "Invalid reference type. Must be NULL, soma_adat, or data.frame", call. = FALSE ) } @@ -423,62 +419,31 @@ medianNormalize <- function(adat, ref_data } -#' Read External Reference File -#' @noRd -.readExternalReference <- function(file_path, dil_groups, apt_data) { - - if (!file.exists(file_path)) { - stop("Reference file not found: ", file_path, call. = FALSE) - } - # Determine file type and read - file_ext_val <- file_ext(file_path) - - if (file_ext_val %in% c("csv")) { - ref_df <- read.csv(file_path, stringsAsFactors = FALSE) - } else if (file_ext_val %in% c("txt", "tsv")) { - ref_df <- utils::read.table(file_path, header = TRUE, sep = "\t", stringsAsFactors = FALSE) - } else { - # Try to auto-detect - ref_df <- tryCatch({ - read.csv(file_path, stringsAsFactors = FALSE) - }, error = function(e) { - utils::read.table(file_path, header = TRUE, sep = "\t", stringsAsFactors = FALSE) - }) - } - - .validateReferenceData(ref_df, dil_groups, apt_data) -} #' Validate Reference Data #' @noRd .validateReferenceData <- function(ref_df, dil_groups, apt_data = NULL) { - # Check that required columns are present - required_cols <- c("Dilution", "Reference") + # Check for required SeqId and Reference columns + required_cols <- c("SeqId", "Reference") if (!all(required_cols %in% names(ref_df))) { missing_cols <- setdiff(required_cols, names(ref_df)) stop( - "Reference data must contain columns: ", paste(missing_cols, collapse = ", "), + "Reference data must contain 'SeqId' and 'Reference' columns.\n", + "Missing columns: ", paste(missing_cols, collapse = ", "), "\n", + "Found columns: ", paste(names(ref_df), collapse = ", "), call. = FALSE ) } - # Check if we have SeqId-specific information - if ("SeqId" %in% names(ref_df)) { - # Process as SeqId-specific reference - return(.processSeqIdReference(ref_df, dil_groups, apt_data)) - } else { - # Process as simple dilution-level reference - return(.processSimpleReference(ref_df, dil_groups)) - } + # Process as SeqId-specific reference + return(.processSeqIdReference(ref_df, dil_groups, apt_data)) } #' Process SeqId-Specific Reference Data #' @noRd .processSeqIdReference <- function(ref_df, dil_groups, apt_data) { - # Convert dilution values to character for consistent matching - ref_df$Dilution <- as.character(ref_df$Dilution) if (!is.null(apt_data) && !"SeqId" %in% names(apt_data)) { stop("ADAT analyte data must contain SeqId column for SeqId reference matching", call. = FALSE) @@ -518,23 +483,7 @@ medianNormalize <- function(adat, ref_data } -#' Process Simple Reference Data (Dilution -> Reference) -#' @noRd -.processSimpleReference <- function(ref_df, dil_groups) { - # Convert to named list - ref_data <- setNames(ref_df$Reference, ref_df$Dilution) - - # Check that we have references for all dilution groups - missing_dils <- setdiff(names(dil_groups), names(ref_data)) - if (length(missing_dils) > 0) { - stop( - "Missing reference values for dilution groups: ", paste(missing_dils, collapse = ", "), - call. = FALSE - ) - } - ref_data -} #' Perform Median Normalization #' @noRd @@ -698,15 +647,15 @@ medianNormalize <- function(adat, if (!is.null(header_meta) && !is.null(header_meta$HEADER)) { # Add median normalization to process steps if ("ProcessSteps" %in% names(header_meta$HEADER)) { - if (!grepl("medNormInt", header_meta$HEADER$ProcessSteps)) { + if (!grepl("MedNormSMP", header_meta$HEADER$ProcessSteps)) { header_meta$HEADER$ProcessSteps <- paste( header_meta$HEADER$ProcessSteps, - "medNormInt (SampleId)", + "MedNormSMP", sep = ", " ) } } else { - header_meta$HEADER$ProcessSteps <- "medNormInt (SampleId)" + header_meta$HEADER$ProcessSteps <- "MedNormSMP" } # Set normalization algorithm @@ -756,10 +705,10 @@ medianNormalize <- function(adat, ) } - if (has_anml) { + if (has_anml && !reverse_existing) { stop( "Data appears to be ANML normalized. ", - "ANML normalized data cannot be directly processed with this function. ", + "Set reverse_existing = TRUE to reverse existing ANML normalization before applying new normalization. ", "ProcessSteps: ", process_steps, call. = FALSE ) @@ -845,13 +794,15 @@ medianNormalize <- function(adat, # Handle both single reference and aptamer-specific references if (is.numeric(ref_values) && length(ref_values) == 1) { - # Single reference value for the whole dilution group - apt_data$medNormSMP_ReferenceRFU[apt_data$AptName %in% dil_apts] <- ref_values + # Single reference value for the whole dilution group (round to 2 decimal places) + rounded_value <- round(ref_values, 2) + apt_data$medNormSMP_ReferenceRFU[apt_data$AptName %in% dil_apts] <- rounded_value } else if (is.numeric(ref_values) && length(ref_values) > 1) { - # Aptamer-specific reference values + # Aptamer-specific reference values (round to 2 decimal places) for (apt in dil_apts) { if (apt %in% names(ref_values)) { - apt_data$medNormSMP_ReferenceRFU[apt_data$AptName == apt] <- ref_values[apt] + rounded_value <- round(ref_values[apt], 2) + apt_data$medNormSMP_ReferenceRFU[apt_data$AptName == apt] <- rounded_value } } } @@ -926,7 +877,16 @@ medianNormalize <- function(adat, header_meta <- attr(adat, "Header.Meta") if (!is.null(header_meta) && !is.null(header_meta$HEADER)) { current_steps <- header_meta$HEADER$ProcessSteps %||% "" - header_meta$HEADER$ProcessSteps <- paste(current_steps, "reverseMedNormSMP", sep = ", ") + + # Add reversal step - look for what type of median norm was reversed + if (grepl("MedNormSMP", current_steps, ignore.case = TRUE)) { + header_meta$HEADER$ProcessSteps <- paste(current_steps, "rev-MedNormSMP", sep = ", ") + } else if (grepl("medNormInt", current_steps, ignore.case = TRUE)) { + header_meta$HEADER$ProcessSteps <- paste(current_steps, "rev-medNormInt", sep = ", ") + } else { + header_meta$HEADER$ProcessSteps <- paste(current_steps, "rev-MedNorm", sep = ", ") + } + attr(adat, "Header.Meta") <- header_meta } @@ -938,6 +898,102 @@ medianNormalize <- function(adat, } +#' Reverse Existing ANML Normalization for Study Samples +#' @noRd +.reverseANMLSMP <- function(adat, verbose) { + + # Get existing scale factors + sf_cols <- grep("^NormScale_", names(adat), value = TRUE) + + if (length(sf_cols) == 0) { + if (verbose) { + cat("No existing ANML normalization scale factors found to reverse.\n") + } + return(adat) + } + + if (verbose) { + cat("Reversing existing ANML normalization for study samples...\n") + } + + # Get dilution groups + apt_data <- getAnalyteInfo(adat) + dil_groups <- split(apt_data$AptName, apt_data$Dilution) + names(dil_groups) <- gsub("\\.", "_", names(dil_groups)) + names(dil_groups) <- gsub("[.]0$|%|^[.]", "", names(dil_groups)) + + # Only reverse for study samples, leave QC/Calibrator/Buffer alone + sample_mask <- grepl("Sample|sample", adat$SampleType %||% adat$SampleId %||% "", ignore.case = TRUE) + + if (sum(sample_mask) == 0) { + if (verbose) { + cat("No study samples found to reverse ANML normalization.\n") + } + return(adat) + } + + # For each dilution group, reverse ANML normalization using log space + for (dil_name in names(dil_groups)) { + sf_col <- paste0("NormScale_", dil_name) + + if (sf_col %in% names(adat)) { + dil_apts <- intersect(dil_groups[[dil_name]], getAnalytes(adat)) + + if (length(dil_apts) > 0) { + for (i in which(sample_mask)) { + scale_factor <- adat[[sf_col]][i] + if (!is.na(scale_factor) && scale_factor != 0) { + # ANML uses log space scaling - reverse by subtracting log scale factor + log_sf <- log10(scale_factor) + log_data <- log10(as.numeric(adat[i, dil_apts])) + reversed_log_data <- log_data - log_sf + adat[i, dil_apts] <- 10^reversed_log_data + # Reset scale factor to 1.0 + adat[[sf_col]][i] <- 1.0 + } + } + } + } + } + + # Remove ANMLFractionUsed columns if they exist + anml_frac_cols <- grep("^ANMLFractionUsed_", names(adat), value = TRUE) + if (length(anml_frac_cols) > 0) { + for (col in anml_frac_cols) { + if (all(is.na(adat[[col]][sample_mask]))) { + adat[[col]][sample_mask] <- NA + } + } + } + + # Update ProcessSteps to indicate reversal + header_meta <- attr(adat, "Header.Meta") + if (!is.null(header_meta) && !is.null(header_meta$HEADER)) { + current_steps <- header_meta$HEADER$ProcessSteps %||% "" + + # Add reversal step - look for what type of ANML was reversed + if (grepl("anmlSMP", current_steps, ignore.case = TRUE)) { + header_meta$HEADER$ProcessSteps <- paste(current_steps, "rev-anmlSMP", sep = ", ") + } else if (grepl("ANML", current_steps, ignore.case = TRUE)) { + header_meta$HEADER$ProcessSteps <- paste(current_steps, "rev-ANML", sep = ", ") + } + + attr(adat, "Header.Meta") <- header_meta + } + + # Reset RowCheck for reversed samples + if ("RowCheck" %in% names(adat)) { + adat$RowCheck[sample_mask] <- "PASS" + } + + if (verbose) { + cat("ANML normalization reversed for", sum(sample_mask), "study samples.\n") + } + + adat +} + + #' Recalculate RowCheck after Median Normalization #' #' Recalculates RowCheck values as PASS or FLAG based on normalization acceptance diff --git a/inst/WORDLIST b/inst/WORDLIST index 8bd5756d..606b0080 100644 --- a/inst/WORDLIST +++ b/inst/WORDLIST @@ -34,6 +34,7 @@ ExtIdentifier FC Hiser HybControlNormScale +HybNorm Kuei LF Lifecycle diff --git a/man/medianNormalize.Rd b/man/medianNormalize.Rd index d8ac0a30..34ca2032 100644 --- a/man/medianNormalize.Rd +++ b/man/medianNormalize.Rd @@ -8,10 +8,10 @@ medianNormalize( adat, reference = NULL, ref_field = "SampleType", - ref_value = c("QC", "Sample"), + ref_value = c("Sample"), by = "SampleType", do_field = "SampleType", - do_regexp = "QC|Sample", + do_regexp = "Sample", reverse_existing = FALSE, verbose = TRUE ) @@ -22,47 +22,44 @@ RFU values that have been hybridization normalized and plate scaled.} \item{reference}{Optional. Reference for median normalization. Can be: \itemize{ -\item \code{NULL} (default): Use internal reference from study samples +\item \code{NULL} (default): Calculate an internal reference from study +samples by taking the median of each SeqId within the sample +grouping. For multi-plate studies, the median of all plate medians +is used. \item A \code{soma_adat} object: Extract reference from this ADAT -\item A file path (character): Read external reference from tab/comma-separated file \item A data.frame: Use provided reference data directly } -When providing an external reference file or data.frame it must contain: +When providing an external reference data.frame it must contain: \describe{ -\item{Dilution}{Character column specifying the dilution group names -(e.g., "0_005", "0_5", "20"). These should match the dilution -groups present in the ADAT analyte data.} -\item{Reference}{Numeric column containing the reference median values -for each dilution group. These values will be used as the target -medians for normalization.} -\item{SeqId}{Optional character column. When included, SeqId-specific -reference values are used for more precise normalization. Values -must be in "10000-28" format, not "seq.10000.28" format.} +\item{SeqId}{Character column containing the SeqId identifiers mapping +to those in the \code{soma_adat} object. Must be in "10000-28" format, not +"seq.10000.28" format.} +\item{Reference}{Numeric column containing the reference RFU values +for each SeqId.} }} \item{ref_field}{Character. Field used to select reference samples when \code{reference = NULL}. Default is \code{"SampleType"}.} \item{ref_value}{Character. Value(s) in \code{ref_field} to use as reference -when \code{reference = NULL}. Default is \code{c("QC", "Sample")}.} +when \code{reference = NULL}. Default is \code{c("Sample")}.} -\item{by}{Character vector. Grouping variable(s) for grouped median normalization. -Must be column name(s) in the ADAT. Use grouping when there are known changes -in total protein load due to analysis conditions (e.g., disease vs. control, -treatment vs. baseline). Normalization will be performed within each group -separately. Default is \code{"SampleType"}.} +\item{by}{Character vector. Grouping variable(s) for grouped median +normalization. Must be column name(s) in the ADAT. Normalization will be +performed within each group separately. Default is \code{"SampleType"}.} \item{do_field}{Character. The field used to select samples for normalization (others keep original values). Default is \code{"SampleType"}.} \item{do_regexp}{Character. A regular expression pattern to select samples -from \code{do_field} for normalization. Default is \code{"QC|Sample"}.} +from \code{do_field} for normalization. Default is \code{"Sample"}.} -\item{reverse_existing}{Logical. Should existing median normalization be +\item{reverse_existing}{Logical. Should existing median or ANML normalization be reversed before applying new normalization? When \code{TRUE}, existing median -normalization scale factors are reversed for study samples only (QC, -Calibrator, and Buffer samples retain their normalization). This allows -re-normalization of data that has already been median normalized. Default is \code{FALSE}.} +normalization scale factors or ANML normalization effects are reversed for +study samples only (QC, Calibrator, and Buffer samples retain their +normalization). This allows re-normalization of data that has already been +median normalized or ANML normalized. Default is \code{FALSE}.} \item{verbose}{Logical. Should progress messages be printed? Default is \code{TRUE}.} } @@ -73,93 +70,91 @@ effects of both plate scale normalization and median normalization. } \description{ Performs median normalization on a \code{soma_adat} object that has -already undergone hybridization normalization and contains plate scale -normalization factors. - -Median normalization is a technique used to adjust data sets to remove -certain assay artifacts and biases prior to analysis, particularly those that -may arise due to differences in overall protein concentration, pipetting -errors, reagent concentration changes, assay timing, and other systematic -variabilities. Median normalization can improve assay precision and reduce -technical variations that can mask true biological signals. - -This function works by calculating or using reference median values for each -dilution group present in the ADAT. Dilution groups are determined from the -analyte metadata and represent different sample dilutions used in the assay -(e.g., 1:1, 1:200, 1:2000). For each sample, the median RFU across all -analytes within a dilution group is calculated and compared to the reference -median for that group. Scale factors are then computed and applied to adjust -each sample's values toward the reference, ensuring consistent signal levels -across samples within each dilution group. This function supports multiple -reference approaches: calculating an internal reference from study samples, -or using a calculated reference from a different ADAT or external reference -file. +already undergone standard data processing for array-based SomaScan studies. + +Median normalization is a common, scale-based normalization technique that +corrects for assay-derived technical variation by applying sample-specific +linear scaling to expression measurements. Typical sources of assay +variation include robotic and manual liquid handling, manufactured +consumables such as buffers and plastic goods, laboratory instrument +calibration, ambient environmental conditions, inter-operator differences, +and other sources of technical variation. Median normalization +can improve assay precision and reduce technical variation that can mask +true biological signal. + +The method scales each sample so that the center of the within-sample analyte +distribution aligns to a defined reference, thereby correcting +global intensity shifts without altering relative differences between +measurements within a sample. For assay formats with multiple dilution groups +(e.g., 1:5 or 20\%; 1:200 or 0.5\%, 1:20,000 or 0.005\%), separate scale +factors are calculated for each dilution because each dilution group is +processed separately during the assay. For each sample, the ratio of +reference RFU / observed RFU is calculated for every SeqId. The median ratio +within each dilution group is selected as the scale factor and applied to +all SeqIds for that sample within the associated dilution bin. } \section{Data Requirements}{ -This function is designed for data in standard SomaLogic deliverable formats: +This function is designed for data in standard SomaLogic deliverable formats. +Specific ADAT file requirements: \enumerate{ -\item \strong{Study samples not median normalized}, with all controls median normalized. -Calibrators and buffers are intra-plate median normalized and QC samples are ANML -normalized if present on standard 3-dilution setups. QC assay controls undergo -full ANML data standardization even when study samples don't, in order to QC -the plate. Cell & tissue single dilution runs don't have QC samples. -\item \strong{Same as above}, but study samples undergo either ANML or intra-study -standard median normalization. +\item \strong{Intact ADAT file}, with available data processing information +in the header section. Specifically, the \code{ProcessSteps} field must be +present and correctly represent the data processing steps present in +the data table. +\item \strong{Minimal standard processing}, the function assumes a standard +SomaScan data deliverable with minimally standard HybNorm and PlateScale +steps applied. } -All data should be hybridization normalized, median normalized internally -(buffer + calibrator), and plate scaled regardless of matrix prior to applying -median normalization. Calibration is not applied to all study types. - \strong{Primary use cases:} \itemize{ -\item 3-dilution setups (standard SomaScan) -\item Single dilution cell & tissue studies +\item Combining data sets from the same overarching experiment or sample +population and normalize to a common reference that were originally +processed separately and each normalized "withing study". +\item Normalize fundamentally different types of samples separately (by +group). For instance, lysate samples from different cell lines that +will be analyzed separately should likely be median normalized within +each cell type. Lysis buffer background samples would also be expected +to be normalized separately. } } \section{Important Considerations}{ \itemize{ -\item If there is a known change in total protein concentration due to analysis -conditions, perform normalization \emph{within} groups using the \code{by} parameter -\item In newer SomaScan assay versions, population references improve -inter-plate consistency compared to within-plate references -\item The function will not proceed on already ANML or median normalized data -unless \code{reverse_existing = TRUE} is specified -\item When reversing existing normalization, only study samples are reversed; -QC, Calibrator, and Buffer samples retain their normalization +\item A core assumption of median normalization is that the majority of +analytes are not differentially expressed; consequently, users should +validate this assumption by inspecting scale-factor distributions for +systematic bias between the biological groups intended for comparison. +\item Note this function does not perform the adaptive normalization by +maximum likelihood (ANML) method which leverages a population-based +reference that iteratively down-selects the set of analytes to include +for the normalization calculation. +\item The function requires \code{reverse_existing = TRUE} to be set in order +to process data where study samples have already undergone ANML or +standard median normalization. +\item When reversing existing normalization, only study samples are +reversed; QC, Calibrator, and Buffer samples retain their normalization } } \examples{ \dontrun{ # Internal reference from study samples (default) -# Use when you have representative QC or control samples in your study med_norm_adat <- medianNormalize(adat) # Reference from specific samples in the ADAT -# Use when you want to normalize against only QC samples med_norm_adat <- medianNormalize(adat, ref_field = "SampleType", - ref_value = "QC") + ref_value = "Sample") # Reference from another ADAT -# Use when you have a reference population or control study ref_adat <- read_adat("reference_file.adat") med_norm_adat <- medianNormalize(adat, reference = ref_adat) -# External reference file -# Use when you have pre-calculated reference medians for each dilution -med_norm_adat <- medianNormalize(adat, reference = "reference_file.csv") - -# External reference as data.frame -# Use for programmatic control over reference values -ref_data <- data.frame( - Dilution = c("0_005", "0_5", "20"), - Reference = c(1500.2, 3200.8, 4100.5) -) +# External reference as a data.frame - requires `SeqId` and `Reference` columns +ref_data <- read.csv("reference_file.csv") med_norm_adat <- medianNormalize(adat, reference = ref_data) # Custom grouping by multiple variables @@ -168,12 +163,11 @@ med_norm_adat <- medianNormalize(adat, reference = ref_data) med_norm_adat <- medianNormalize(adat, by = c("Sex", "SampleType")) # Normalize only specific sample types -# Use when you want to preserve original values for certain sample types med_norm_adat <- medianNormalize(adat, do_field = "SampleType", do_regexp = "Sample") -# Re-normalize data that has already been median normalized +# Re-normalize data that has already been median or ANML normalized # Use when you want to apply different normalization to previously normalized data med_norm_adat <- medianNormalize(adat, reverse_existing = TRUE, diff --git a/tests/testthat/_snaps/medianNormalize.md b/tests/testthat/_snaps/medianNormalize.md index 390f7f7c..1c6b9627 100644 --- a/tests/testthat/_snaps/medianNormalize.md +++ b/tests/testthat/_snaps/medianNormalize.md @@ -11,13 +11,7 @@ Three dilution setup detected (standard setup). Message > Normalization scale factors already exist: "NormScale_20, NormScale_0_005, NormScale_0_5" - they will be replaced with new scale factors - > Building internal reference from field: "SampleType" with values: "QC" and "Sample" - > Performing grouped median normalization by: "SampleType" (2 groups) - > Processing group: "Sample" (2 samples) - v Processing dilution '0_005' with 50 analytes - v Processing dilution '0_5' with 50 analytes - v Processing dilution '20' with 50 analytes - > Processing group: "QC" (1 samples) + > Building internal reference from field: "SampleType" with values: "Sample" v Processing dilution '0_005' with 50 analytes v Processing dilution '0_5' with 50 analytes v Processing dilution '20' with 50 analytes diff --git a/tests/testthat/test-medianNormalize.R b/tests/testthat/test-medianNormalize.R index afc8ebcf..f8ad5541 100644 --- a/tests/testthat/test-medianNormalize.R +++ b/tests/testthat/test-medianNormalize.R @@ -59,9 +59,9 @@ test_that("`medianNormalize` Method 1: Internal reference (default)", { expect_true(all(norm_cols %in% names(result))) # Test scale factor output - expect_equal(result$NormScale_20[1:3], c(0.971895, 1.029779, 1), tolerance = 0.000001) - expect_equal(result$NormScale_0_005[1:3], c(1.002922, 0.997176, 1), tolerance = 0.000001) - expect_equal(result$NormScale_0_5[1:3], c(0.973205, 1.028323, 1), tolerance = 0.000001) + expect_equal(result$NormScale_20[1:3], c(0.971895, 1.029779, 1.199944), tolerance = 0.000001) + expect_equal(result$NormScale_0_005[1:3], c(1.002922, 0.997176, 1.058289), tolerance = 0.000001) + expect_equal(result$NormScale_0_5[1:3], c(0.973205, 1.028323, 1.104745), tolerance = 0.000001) # Test specific SeqId columns expect_equal(result$seq.10000.28[1:3], c(463.1, 488.5, 501.5)) @@ -69,19 +69,19 @@ test_that("`medianNormalize` Method 1: Internal reference (default)", { # Check header metadata result_header <- attr(result, "Header.Meta")$HEADER - expect_true(grepl("medNormInt", result_header$ProcessSteps)) + expect_true(grepl("MedNormSMP", result_header$ProcessSteps)) expect_equal(result_header$NormalizationAlgorithm, "MedNorm") expect_true(grepl("intraplate, crossplate", result_header$MedNormReference)) }) test_that("`medianNormalize` Method 2: Reference from specific samples", { - # Test with specific reference field and value + # Test with specific reference field and value - using QC as reference but normalizing both QC and Sample expect_no_error( result <- medianNormalize(test_data, ref_field = "SampleType", ref_value = "QC", - do_regexp = "QC", + do_regexp = "QC|Sample", # Normalize both types verbose = FALSE) ) @@ -90,14 +90,14 @@ test_that("`medianNormalize` Method 2: Reference from specific samples", { norm_cols <- grep("^NormScale_", names(result), value = TRUE) expect_equal(length(norm_cols), 3) # Should have 3 dilution groups - # Test scale factor output - expect_equal(result$NormScale_20[1:3], c(1.0369358, 0.9602251, 1), tolerance = 0.000001) - expect_equal(result$NormScale_0_005[1:3], c(0.8570162, 0.8485842, 1), tolerance = 0.000001) - expect_equal(result$NormScale_0_5[1:3], c(0.7771749, 0.8520195, 1), tolerance = 0.000001) + # Test scale factor output - normalized both QC and Sample types, so QC (3rd) gets scale factor 1.0 + expect_equal(result$NormScale_20[1:3], c(0.9089, 0.9578, 1.0000), tolerance = 0.001) + expect_equal(result$NormScale_0_005[1:3], c(0.9783, 0.9806, 1.0000), tolerance = 0.001) + expect_equal(result$NormScale_0_5[1:3], c(0.9879, 1.1445, 1.0000), tolerance = 0.001) # Test specific SeqId columns - expect_equal(result$seq.10000.28[1:3], c(476.5, 474.4, 501.5)) - expect_equal(result$seq.10008.43[1:3], c(561.8, 541.9, 510.1)) + expect_equal(result$seq.10000.28[1:3], c(433.1, 454.4, 501.5), tolerance = 0.1) + expect_equal(result$seq.10008.43[1:3], c(510.6, 519.0, 510.1), tolerance = 0.1) # Check header metadata result_header <- attr(result, "Header.Meta")$HEADER @@ -118,13 +118,13 @@ test_that("`medianNormalize` Method 3: Reference from another ADAT", { expect_equal(length(norm_cols), 3) # Should have 3 dilution groups # Test scale factor output - expect_equal(result$NormScale_20[1:3], c(0.9718953, 1.0297787, 1.0841026), tolerance = 0.0001) - expect_equal(result$NormScale_0_005[1:3], c(1.0029224, 0.9971762, 1.0548762), tolerance = 0.0001) - expect_equal(result$NormScale_0_5[1:3], c(0.9732045, 1.0283228, 0.9601410), tolerance = 0.0001) + expect_equal(result$NormScale_20[1:3], c(0.9719, 1.0298, 1.1999), tolerance = 0.0001) + expect_equal(result$NormScale_0_005[1:3], c(1.0029, 0.9972, 1.0583), tolerance = 0.0001) + expect_equal(result$NormScale_0_5[1:3], c(0.9732, 1.0283, 1.1047), tolerance = 0.0001) # Test specific SeqId columns - expect_equal(result$seq.10000.28[1:3], c(463.1, 488.5, 543.7)) - expect_equal(result$seq.10008.43[1:3], c(546, 558, 553)) + expect_equal(result$seq.10000.28[1:3], c(463.1, 488.5, 501.5), tolerance = 0.1) + expect_equal(result$seq.10008.43[1:3], c(546, 558, 510.1), tolerance = 0.1) # Check header metadata result_header <- attr(result, "Header.Meta")$HEADER @@ -132,52 +132,12 @@ test_that("`medianNormalize` Method 3: Reference from another ADAT", { expect_true(grepl("crossplate", result_header$MedNormReference)) }) -test_that("`medianNormalize` Method 4: External reference file", { - # Create a temporary CSV file with reference data - # Use realistic dilution groups from example_data: "20", "0.5", "0.005", "0" - ref_data <- data.frame( - Dilution = c("20", "0_5", "0_005", "0"), - Reference = c(2500.5, 1800.2, 3200.8, 1500.0), - stringsAsFactors = FALSE - ) - - # Create temporary CSV file - temp_csv <- tempfile(fileext = ".csv") - write.csv(ref_data, temp_csv, row.names = FALSE) - - expect_no_error( - result <- medianNormalize(test_data, reference = temp_csv, verbose = FALSE) - ) - - # Check result structure - expect_true(is.soma_adat(result)) - norm_cols <- grep("^NormScale_", names(result), value = TRUE) - expect_equal(length(norm_cols), 3) # Should have 3 dilution groups - - # Test scale factor output - expect_equal(result$NormScale_20[1:3], c(3.31, 3.71, 4.21), tolerance = 0.01) - expect_equal(result$NormScale_0_005[1:3], c(0.577, 0.561, 0.561), tolerance = 0.001) - expect_equal(result$NormScale_0_5[1:3], c(0.90, 1.18, 1.03), tolerance = 0.01) - - # Test specific SeqId columns - expect_equal(result$seq.10000.28[1:3], c(1579.0, 1759.9, 2109.7)) - expect_equal(result$seq.10008.43[1:3], c(1861.7, 2010.3, 2145.9)) - - # Check header metadata - result_header <- attr(result, "Header.Meta")$HEADER - expect_true(grepl(basename(temp_csv), result_header$MedNormReference)) - expect_true(grepl("crossplate", result_header$MedNormReference)) - - # Clean up - unlink(temp_csv) -}) - -test_that("`medianNormalize` Method 5: External reference as data.frame", { - - # Create reference data.frame with realistic dilution groups +test_that("`medianNormalize` External reference data.frame", { + # Create reference data.frame with SeqId-Reference format + analyte_info <- getAnalyteInfo(test_data) ref_data <- data.frame( - Dilution = c("20", "0_5", "0_005", "0"), - Reference = c(2500.5, 1800.2, 3200.8, 1500.0), + SeqId = analyte_info$SeqId[1:15], # Use first 15 SeqIds + Reference = runif(15, 2000, 4000), # Random reference values stringsAsFactors = FALSE ) @@ -190,19 +150,19 @@ test_that("`medianNormalize` Method 5: External reference as data.frame", { norm_cols <- grep("^NormScale_", names(result), value = TRUE) expect_equal(length(norm_cols), 3) # Should have 3 dilution groups - # Test scale factor output - expect_equal(result$NormScale_20[1:3], c(3.31, 3.71, 4.21), tolerance = 0.01) - expect_equal(result$NormScale_0_005[1:3], c(0.577, 0.561, 0.561), tolerance = 0.001) - expect_equal(result$NormScale_0_5[1:3], c(0.90, 1.18, 1.03), tolerance = 0.01) - - # Test specific SeqId columns - expect_equal(result$seq.10000.28[1:3], c(1579.0, 1759.9, 2109.7)) - expect_equal(result$seq.10008.43[1:3], c(1861.7, 2010.3, 2145.9)) - # Check header metadata result_header <- attr(result, "Header.Meta")$HEADER expect_true(grepl("external_data", result_header$MedNormReference)) expect_true(grepl("crossplate", result_header$MedNormReference)) + + # Check that medNormSMP_ReferenceRFU field was added with proper formatting + result_analyte_info <- getAnalyteInfo(result) + expect_true("medNormSMP_ReferenceRFU" %in% names(result_analyte_info)) + + # Check reference values are rounded to 2 decimal places + ref_values <- result_analyte_info$medNormSMP_ReferenceRFU + rounded_values <- round(ref_values, 2) + expect_equal(ref_values[!is.na(ref_values)], rounded_values[!is.na(rounded_values)]) }) test_that("`medianNormalize` validates input requirements", { @@ -241,36 +201,48 @@ test_that("`medianNormalize` validates input requirements", { ) # Test with invalid reference data.frame (missing required columns) - invalid_ref <- data.frame(Wrong = c("20", "0_5"), Column = c(1000, 2000)) + invalid_ref <- data.frame(Wrong = c("10000-28", "10001-7"), Column = c(1000, 2000)) expect_error( medianNormalize(test_data, reference = invalid_ref, verbose = FALSE), - "Reference data must contain columns: Dilution, Reference" + "Reference data must contain 'SeqId' and 'Reference' columns" ) - # Test with non-existent reference file + # Test with invalid reference type (non-data.frame) expect_error( - medianNormalize(test_data, reference = "non_existent_file.csv", verbose = FALSE), - "Reference file not found" + medianNormalize(test_data, reference = "invalid_string", verbose = FALSE), + "Invalid reference type" ) - # Test with invalid reference type + # Test with numeric input expect_error( medianNormalize(test_data, reference = 123, verbose = FALSE), "Invalid reference type" ) }) -test_that("`medianNormalize` external reference file validation", { - # Test with missing dilution groups in reference - incomplete_ref <- data.frame( - Dilution = c("20", "0_5"), # Missing "0_005" and "0" +test_that("`medianNormalize` external reference data.frame validation", { + # Test with invalid SeqId format (should error when no matches found) + invalid_seqid_ref <- data.frame( + SeqId = c("invalid_format_1", "invalid_format_2"), # Invalid SeqId format Reference = c(2500, 1800), stringsAsFactors = FALSE ) expect_error( - medianNormalize(test_data, reference = incomplete_ref, verbose = FALSE), - "Missing reference values for dilution groups" + medianNormalize(test_data, reference = invalid_seqid_ref, verbose = FALSE), + "No matching SeqIds or dilution groups found in reference data" + ) + + # Test with empty reference data + empty_ref <- data.frame( + SeqId = character(0), + Reference = numeric(0), + stringsAsFactors = FALSE + ) + + expect_error( + medianNormalize(test_data, reference = empty_ref, verbose = FALSE), + "No matching SeqIds or dilution groups found in reference data" ) }) @@ -306,13 +278,13 @@ test_that("`medianNormalize` handles grouping correctly", { expect_equal(length(norm_cols), 3) # Should have 3 dilution groups # Test scale factor output - expect_equal(result2$NormScale_20[1:3], c(1.054, 1.060, 1.069), tolerance = 0.001) - expect_equal(result2$NormScale_0_005[1:3], c(1.0125, 1.0054, 0.9993), tolerance = 0.001) - expect_equal(result2$NormScale_0_5[1:3], c(1.080, 1.057, 1.074), tolerance = 0.001) + expect_equal(result2$NormScale_20[1:3], c(1.0678, 1.0571, 1.0917), tolerance = 0.001) + expect_equal(result2$NormScale_0_005[1:3], c(1.0225, 1.0012, 1.0122), tolerance = 0.001) + expect_equal(result2$NormScale_0_5[1:3], c(1.1031, 1.0544, 1.1087), tolerance = 0.001) # Test specific SeqId columns - expect_equal(result2$seq.10000.28[1:3], c(502.1, 502.7, 444.1)) - expect_equal(result2$seq.10008.43[1:3], c(591.9, 574.2, 453.0)) + expect_equal(result2$seq.10000.28[1:3], c(508.8, 501.5, 453.7)) + expect_equal(result2$seq.10008.43[1:3], c(599.9, 572.8, 462.8)) } # Test error with non-existent grouping column @@ -338,13 +310,13 @@ test_that("`medianNormalize` handles sample selection correctly", { expect_equal(length(norm_cols), 3) # Should have 3 dilution groups # Test scale factor output - same as Method 2 for QC selection - expect_equal(result$NormScale_20[1:3], c(1.036936, 0.960225, 1), tolerance = 0.0001) - expect_equal(result$NormScale_0_005[1:3], c(0.857016, 0.848584, 1), tolerance = 0.0001) - expect_equal(result$NormScale_0_5[1:3], c(0.777175, 0.85202, 1), tolerance = 0.0001) + expect_equal(result$NormScale_20[1:3], c(1.036936, 0.960225, 1.08410), tolerance = 0.0001) + expect_equal(result$NormScale_0_005[1:3], c(0.857016, 0.848584, 1.05488), tolerance = 0.0001) + expect_equal(result$NormScale_0_5[1:3], c(0.777175, 0.85202, 0.96014), tolerance = 0.0001) # Test specific SeqId columns - expect_equal(result$seq.10000.28[1:3], c(476.5, 474.4, 501.5)) - expect_equal(result$seq.10008.43[1:3], c(561.8, 541.9, 510.1)) + expect_equal(result$seq.10000.28[1:3], c(476.5, 474.4, 543.7)) + expect_equal(result$seq.10008.43[1:3], c(561.8, 541.9, 553.0)) # Test with non-existent field expect_error( @@ -371,3 +343,69 @@ test_that("`medianNormalize` produces expected verbose output", { NA # NA indicates no output expected ) }) + +test_that("`medianNormalize` can reverse ANML normalization", { + # Create ANML-like test data by modifying test_data + anml_test_data <- test_data + + # Modify header to simulate ANML-processed data + header_meta <- attr(anml_test_data, "Header.Meta") + header_meta$HEADER$ProcessSteps <- "Raw RFU, Hyb Normalization, medNormInt (SampleId), plateScale, Calibration, anmlQC, qcCheck, anmlSMP" + header_meta$HEADER$NormalizationAlgorithm <- "ANML" + attr(anml_test_data, "Header.Meta") <- header_meta + + # Add ANML-like normalization scale factors (non-1.0 values to simulate ANML) + anml_test_data$NormScale_20 <- c(1.15, 0.95, 1.08) + anml_test_data$NormScale_0_5 <- c(1.22, 0.88, 1.12) + anml_test_data$NormScale_0_005 <- c(0.93, 1.07, 0.98) + + # Add ANMLFractionUsed columns to mimic ANML data + anml_test_data$ANMLFractionUsed_20 <- c(0.85, 0.92, 0.78) + anml_test_data$ANMLFractionUsed_0_5 <- c(0.88, 0.91, 0.83) + anml_test_data$ANMLFractionUsed_0_005 <- c(0.90, 0.87, 0.85) + + # Test ANML reversal + expect_no_error( + result <- medianNormalize(anml_test_data, reverse_existing = TRUE, verbose = FALSE) + ) + + # Check that result is valid + expect_true(is.soma_adat(result)) + + # Check that ProcessSteps includes reversal and new normalization + result_header <- attr(result, "Header.Meta")$HEADER + expect_true(grepl("rev-anmlSMP", result_header$ProcessSteps)) + expect_true(grepl("MedNormSMP", result_header$ProcessSteps)) + + # Test that error occurs without reverse_existing flag + expect_error( + medianNormalize(anml_test_data, reverse_existing = FALSE, verbose = FALSE), + ) +}) + +test_that("`medianNormalize` handles SeqId reference format properly", { + # Test with SeqId-Reference format + analytes <- getAnalytes(test_data) + seqids <- getAnalyteInfo(test_data)$SeqId[1:10] # First 10 SeqIds + + ref_data <- data.frame( + SeqId = seqids, + Reference = runif(10, 1000, 5000) # Random reference values + ) + + expect_no_error( + result <- medianNormalize(test_data, reference = ref_data, verbose = FALSE) + ) + + # Check that result is valid + expect_true(is.soma_adat(result)) + + # Check that medNormSMP_ReferenceRFU field was added with 2 decimal places + analyte_info <- getAnalyteInfo(result) + expect_true("medNormSMP_ReferenceRFU" %in% names(analyte_info)) + + # Check reference values are rounded to 2 decimal places + ref_values <- analyte_info$medNormSMP_ReferenceRFU + rounded_values <- round(ref_values, 2) + expect_equal(ref_values[!is.na(ref_values)], rounded_values[!is.na(rounded_values)]) +}) From 959a7236b03191cd29a2b1201e41cb624de92787 Mon Sep 17 00:00:00 2001 From: scheidec Date: Mon, 9 Feb 2026 12:46:12 -0500 Subject: [PATCH 06/10] Remove sample selection parameters, use study samples only - removed sample selection parameters (ref_field, ref_value, do_field, do_regexp) in favor of simplification in alignment with default of using study samples only --- R/medianNormalize.R | 99 +++++++++--------------- man/medianNormalize.Rd | 30 +------ tests/testthat/_snaps/medianNormalize.md | 2 +- tests/testthat/test-medianNormalize.R | 77 ++---------------- 4 files changed, 44 insertions(+), 164 deletions(-) diff --git a/R/medianNormalize.R b/R/medianNormalize.R index a74ba896..bf1aac89 100644 --- a/R/medianNormalize.R +++ b/R/medianNormalize.R @@ -85,17 +85,11 @@ #' \item{Reference}{Numeric column containing the reference RFU values #' for each SeqId.} #' } -#' @param ref_field Character. Field used to select reference samples when -#' `reference = NULL`. Default is `"SampleType"`. -#' @param ref_value Character. Value(s) in `ref_field` to use as reference -#' when `reference = NULL`. Default is `c("Sample")`. #' @param by Character vector. Grouping variable(s) for grouped median #' normalization. Must be column name(s) in the ADAT. Normalization will be -#' performed within each group separately. Default is `"SampleType"`. -#' @param do_field Character. The field used to select samples for normalization -#' (others keep original values). Default is `"SampleType"`. -#' @param do_regexp Character. A regular expression pattern to select samples -#' from `do_field` for normalization. Default is `"Sample"`. +#' performed within each group separately. Default is `"SampleType"`. Note +#' that only study samples (SampleType == 'Sample') are normalized; QC, +#' Calibrator, and Buffer samples are automatically excluded. #' @param reverse_existing Logical. Should existing median or ANML normalization be #' reversed before applying new normalization? When `TRUE`, existing median #' normalization scale factors or ANML normalization effects are reversed for @@ -111,11 +105,6 @@ #' # Internal reference from study samples (default) #' med_norm_adat <- medianNormalize(adat) #' -#' # Reference from specific samples in the ADAT -#' med_norm_adat <- medianNormalize(adat, -#' ref_field = "SampleType", -#' ref_value = "Sample") -#' #' # Reference from another ADAT #' ref_adat <- read_adat("reference_file.adat") #' med_norm_adat <- medianNormalize(adat, reference = ref_adat) @@ -129,11 +118,6 @@ #' # (normalize within groups to account for expected biological differences) #' med_norm_adat <- medianNormalize(adat, by = c("Sex", "SampleType")) #' -#' # Normalize only specific sample types -#' med_norm_adat <- medianNormalize(adat, -#' do_field = "SampleType", -#' do_regexp = "Sample") -#' #' # Re-normalize data that has already been median or ANML normalized #' # Use when you want to apply different normalization to previously normalized data #' med_norm_adat <- medianNormalize(adat, @@ -145,11 +129,7 @@ #' @export medianNormalize <- function(adat, reference = NULL, - ref_field = "SampleType", - ref_value = c("Sample"), by = "SampleType", - do_field = "SampleType", - do_regexp = "Sample", reverse_existing = FALSE, verbose = TRUE) { @@ -243,52 +223,47 @@ medianNormalize <- function(adat, } - # Determine which samples to normalize ---- - if (!is.null(do_field) && !is.null(do_regexp)) { - if (!do_field %in% names(adat)) { - stop("Field `", do_field, "` not found in adat columns", call. = FALSE) - } + # Determine which samples to normalize - only Sample types + if (!"SampleType" %in% names(adat)) { + stop("Field 'SampleType' not found in adat columns", call. = FALSE) + } - do_samples <- grep(do_regexp, adat[[do_field]]) - if (length(do_samples) == 0L) { - stop( - "No samples selected for normalization with pattern: ", do_regexp, - call. = FALSE - ) - } - dont_samples <- setdiff(seq_len(nrow(adat)), do_samples) - } else { - do_samples <- seq_len(nrow(adat)) - dont_samples <- NULL + do_samples <- grep("Sample", adat[["SampleType"]]) + if (length(do_samples) == 0L) { + stop( + "No samples selected for normalization with pattern: Sample", + call. = FALSE + ) } + dont_samples <- setdiff(seq_len(nrow(adat)), do_samples) # Process reference ---- if (is.null(reference)) { - # Check if ref_field conflicts with grouping variables + # Check if SampleType conflicts with grouping variables conflicts_with_grouping <- FALSE - if (ref_field %in% by) { + if ("SampleType" %in% by) { samples_to_normalize <- adat[do_samples, ] - group_values <- unique(samples_to_normalize[[ref_field]]) + group_values <- unique(samples_to_normalize[["SampleType"]]) group_values <- group_values[!is.na(group_values)] - all_groups_in_ref <- all(group_values %in% ref_value) - conflicts_with_grouping <- !identical(by, ref_field) || !all_groups_in_ref + all_groups_in_ref <- all(group_values %in% "Sample") + conflicts_with_grouping <- !identical(by, "SampleType") || !all_groups_in_ref } if (conflicts_with_grouping) { # Calculate global reference to avoid groups lacking reference samples if (verbose) { - .todo("Building global internal reference from field: {.val {ref_field}} with values: {.val {ref_value}}") + .todo("Building global internal reference from study samples (SampleType == 'Sample')") } - ref_data <- .buildInternalReference(adat, ref_field, ref_value, dil_groups) + ref_data <- .buildInternalReference(adat, dil_groups) } else { # Standard internal reference - calculate per group ref_data <- NULL if (verbose) { - .todo("Building internal reference from field: {.val {ref_field}} with values: {.val {ref_value}}") + .todo("Building internal reference from study samples (SampleType == 'Sample')") } } } else { - ref_data <- .processReference(reference, ref_field, ref_value, adat, dil_groups, apt_data, verbose) + ref_data <- .processReference(reference, adat, dil_groups, apt_data, verbose) } # Add row identifier to maintain order @@ -301,8 +276,6 @@ medianNormalize <- function(adat, dil_groups = dil_groups, by = by, ref_data = ref_data, - ref_field = ref_field, - ref_value = ref_value, verbose = verbose ) } @@ -347,7 +320,7 @@ medianNormalize <- function(adat, #' Process Reference Data #' @noRd -.processReference <- function(reference, ref_field, ref_value, adat, dil_groups, apt_data, verbose) { +.processReference <- function(reference, adat, dil_groups, apt_data, verbose) { if (is.soma_adat(reference)) { # Use reference from provided ADAT @@ -373,18 +346,17 @@ medianNormalize <- function(adat, #' Build Internal Reference from Study Samples #' @noRd -.buildInternalReference <- function(adat, ref_field, ref_value, dil_groups) { +.buildInternalReference <- function(adat, dil_groups) { - if (!ref_field %in% names(adat)) { - stop("Reference field `", ref_field, "` not found", call. = FALSE) + if (!"SampleType" %in% names(adat)) { + stop("Reference field 'SampleType' not found", call. = FALSE) } # Select reference samples - ref_samples <- adat[[ref_field]] %in% ref_value + ref_samples <- adat[["SampleType"]] %in% "Sample" if (sum(ref_samples) == 0) { stop( - "No reference samples found with field `", ref_field, - "` and values: ", paste(ref_value, collapse = ", "), + "No reference samples found with field 'SampleType' and value: Sample", call. = FALSE ) } @@ -487,7 +459,7 @@ medianNormalize <- function(adat, #' Perform Median Normalization #' @noRd -.performMedianNorm <- function(adat, dil_groups, by, ref_data, ref_field, ref_value, verbose) { +.performMedianNorm <- function(adat, dil_groups, by, ref_data, verbose) { # Store original rownames to restore later original_rownames <- rownames(adat) @@ -580,16 +552,15 @@ medianNormalize <- function(adat, grp_ref_values <- apply(grp_adat[, dil_apts, drop = FALSE], 2, median, na.rm = TRUE) } } else { - # Internal reference: Use samples from this group only - if (!ref_field %in% names(grp_adat)) { - stop("Reference field `", ref_field, "` not found", call. = FALSE) + # Internal reference: Use Sample types from this group only + if (!"SampleType" %in% names(grp_adat)) { + stop("Reference field 'SampleType' not found", call. = FALSE) } - ref_samples_mask <- grp_adat[[ref_field]] %in% ref_value + ref_samples_mask <- grp_adat[["SampleType"]] %in% "Sample" if (sum(ref_samples_mask) == 0) { stop( - "No reference samples found with field `", ref_field, - "` and values: ", paste(ref_value, collapse = ", "), + "No reference samples found with field 'SampleType' and value: Sample", call. = FALSE ) } diff --git a/man/medianNormalize.Rd b/man/medianNormalize.Rd index 34ca2032..88e8fcc5 100644 --- a/man/medianNormalize.Rd +++ b/man/medianNormalize.Rd @@ -7,11 +7,7 @@ medianNormalize( adat, reference = NULL, - ref_field = "SampleType", - ref_value = c("Sample"), by = "SampleType", - do_field = "SampleType", - do_regexp = "Sample", reverse_existing = FALSE, verbose = TRUE ) @@ -38,21 +34,11 @@ to those in the \code{soma_adat} object. Must be in "10000-28" format, not for each SeqId.} }} -\item{ref_field}{Character. Field used to select reference samples when -\code{reference = NULL}. Default is \code{"SampleType"}.} - -\item{ref_value}{Character. Value(s) in \code{ref_field} to use as reference -when \code{reference = NULL}. Default is \code{c("Sample")}.} - \item{by}{Character vector. Grouping variable(s) for grouped median normalization. Must be column name(s) in the ADAT. Normalization will be -performed within each group separately. Default is \code{"SampleType"}.} - -\item{do_field}{Character. The field used to select samples for normalization -(others keep original values). Default is \code{"SampleType"}.} - -\item{do_regexp}{Character. A regular expression pattern to select samples -from \code{do_field} for normalization. Default is \code{"Sample"}.} +performed within each group separately. Default is \code{"SampleType"}. Note +that only study samples (SampleType == 'Sample') are normalized; QC, +Calibrator, and Buffer samples are automatically excluded.} \item{reverse_existing}{Logical. Should existing median or ANML normalization be reversed before applying new normalization? When \code{TRUE}, existing median @@ -144,11 +130,6 @@ reversed; QC, Calibrator, and Buffer samples retain their normalization # Internal reference from study samples (default) med_norm_adat <- medianNormalize(adat) -# Reference from specific samples in the ADAT -med_norm_adat <- medianNormalize(adat, - ref_field = "SampleType", - ref_value = "Sample") - # Reference from another ADAT ref_adat <- read_adat("reference_file.adat") med_norm_adat <- medianNormalize(adat, reference = ref_adat) @@ -162,11 +143,6 @@ med_norm_adat <- medianNormalize(adat, reference = ref_data) # (normalize within groups to account for expected biological differences) med_norm_adat <- medianNormalize(adat, by = c("Sex", "SampleType")) -# Normalize only specific sample types -med_norm_adat <- medianNormalize(adat, - do_field = "SampleType", - do_regexp = "Sample") - # Re-normalize data that has already been median or ANML normalized # Use when you want to apply different normalization to previously normalized data med_norm_adat <- medianNormalize(adat, diff --git a/tests/testthat/_snaps/medianNormalize.md b/tests/testthat/_snaps/medianNormalize.md index 1c6b9627..a0b80f67 100644 --- a/tests/testthat/_snaps/medianNormalize.md +++ b/tests/testthat/_snaps/medianNormalize.md @@ -11,7 +11,7 @@ Three dilution setup detected (standard setup). Message > Normalization scale factors already exist: "NormScale_20, NormScale_0_005, NormScale_0_5" - they will be replaced with new scale factors - > Building internal reference from field: "SampleType" with values: "Sample" + > Building internal reference from study samples (SampleType == 'Sample') v Processing dilution '0_005' with 50 analytes v Processing dilution '0_5' with 50 analytes v Processing dilution '20' with 50 analytes diff --git a/tests/testthat/test-medianNormalize.R b/tests/testthat/test-medianNormalize.R index f8ad5541..8abef934 100644 --- a/tests/testthat/test-medianNormalize.R +++ b/tests/testthat/test-medianNormalize.R @@ -74,37 +74,7 @@ test_that("`medianNormalize` Method 1: Internal reference (default)", { expect_true(grepl("intraplate, crossplate", result_header$MedNormReference)) }) -test_that("`medianNormalize` Method 2: Reference from specific samples", { - - # Test with specific reference field and value - using QC as reference but normalizing both QC and Sample - expect_no_error( - result <- medianNormalize(test_data, - ref_field = "SampleType", - ref_value = "QC", - do_regexp = "QC|Sample", # Normalize both types - verbose = FALSE) - ) - - # Check result structure - expect_true(is.soma_adat(result)) - norm_cols <- grep("^NormScale_", names(result), value = TRUE) - expect_equal(length(norm_cols), 3) # Should have 3 dilution groups - - # Test scale factor output - normalized both QC and Sample types, so QC (3rd) gets scale factor 1.0 - expect_equal(result$NormScale_20[1:3], c(0.9089, 0.9578, 1.0000), tolerance = 0.001) - expect_equal(result$NormScale_0_005[1:3], c(0.9783, 0.9806, 1.0000), tolerance = 0.001) - expect_equal(result$NormScale_0_5[1:3], c(0.9879, 1.1445, 1.0000), tolerance = 0.001) - - # Test specific SeqId columns - expect_equal(result$seq.10000.28[1:3], c(433.1, 454.4, 501.5), tolerance = 0.1) - expect_equal(result$seq.10008.43[1:3], c(510.6, 519.0, 510.1), tolerance = 0.1) - - # Check header metadata - result_header <- attr(result, "Header.Meta")$HEADER - expect_true(grepl("intraplate, crossplate", result_header$MedNormReference)) -}) - -test_that("`medianNormalize` Method 3: Reference from another ADAT", { +test_that("`medianNormalize` Method 2: Reference from another ADAT", { # Create a minimal reference ADAT from sample subset ref_adat <- test_data[1:2, ] # Use first 2 samples as reference @@ -194,10 +164,10 @@ test_that("`medianNormalize` validates input requirements", { "No normalization scale factor columns found" ) - # Test with invalid reference field + # Test with invalid adat object (not soma_adat) expect_error( - medianNormalize(test_data, ref_field = "NonExistentField", verbose = FALSE), - "Reference field `NonExistentField` not found" + medianNormalize(data.frame(wrong = 1:3), verbose = FALSE), + "`adat` must be a class `soma_adat` object" ) # Test with invalid reference data.frame (missing required columns) @@ -249,7 +219,7 @@ test_that("`medianNormalize` external reference data.frame validation", { test_that("`medianNormalize` handles grouping correctly", { # Test grouping by Sex expect_no_error( - result1 <- medianNormalize(test_data, by = "Sex", do_regexp = "Sample", verbose = FALSE) + result1 <- medianNormalize(test_data, by = "Sex", verbose = FALSE) ) # Check result structure @@ -294,43 +264,6 @@ test_that("`medianNormalize` handles grouping correctly", { ) }) -test_that("`medianNormalize` handles sample selection correctly", { - # Test selective normalization - expect_no_error( - result <- medianNormalize(test_data, - do_field = "SampleType", - do_regexp = "QC", - verbose = FALSE) - ) - expect_true(is.soma_adat(result)) - - # Check result structure - expect_true(is.soma_adat(result)) - norm_cols <- grep("^NormScale_", names(result), value = TRUE) - expect_equal(length(norm_cols), 3) # Should have 3 dilution groups - - # Test scale factor output - same as Method 2 for QC selection - expect_equal(result$NormScale_20[1:3], c(1.036936, 0.960225, 1.08410), tolerance = 0.0001) - expect_equal(result$NormScale_0_005[1:3], c(0.857016, 0.848584, 1.05488), tolerance = 0.0001) - expect_equal(result$NormScale_0_5[1:3], c(0.777175, 0.85202, 0.96014), tolerance = 0.0001) - - # Test specific SeqId columns - expect_equal(result$seq.10000.28[1:3], c(476.5, 474.4, 543.7)) - expect_equal(result$seq.10008.43[1:3], c(561.8, 541.9, 553.0)) - - # Test with non-existent field - expect_error( - medianNormalize(test_data, do_field = "NonExistentField", verbose = FALSE), - "Field `NonExistentField` not found" - ) - - # Test with pattern that matches no samples - expect_error( - medianNormalize(test_data, do_regexp = "NoMatchPattern", verbose = FALSE), - "No samples selected for normalization" - ) -}) - test_that("`medianNormalize` produces expected verbose output", { # specific verbose output messages expect_snapshot( From 7e2c44579f546c9e936528a4de61d8ac716083c5 Mon Sep 17 00:00:00 2001 From: scheidec Date: Thu, 26 Feb 2026 15:50:21 -0500 Subject: [PATCH 07/10] Export reverseMedianNormalize() function - removed `reverse_existing` parameter from `medianNormalize()`, which now requires an unnormalized adat input - updated `medianNormalize()` to now point to reverseMedianNormalize() if an already normalized adat is passed --- NAMESPACE | 1 + R/medianNormalize.R | 421 +++++-------------- R/reverseMedianNormalize.R | 301 +++++++++++++ man/medianNormalize.Rd | 44 +- man/reverseMedianNormalize.Rd | 59 +++ tests/testthat/_snaps/medianNormalize.md | 3 +- tests/testthat/test-medianNormalize.R | 24 +- tests/testthat/test-reverseMedianNormalize.R | 84 ++++ 8 files changed, 585 insertions(+), 352 deletions(-) create mode 100644 R/reverseMedianNormalize.R create mode 100644 man/reverseMedianNormalize.Rd create mode 100644 tests/testthat/test-reverseMedianNormalize.R diff --git a/NAMESPACE b/NAMESPACE index a78cf5cf..f4834c9c 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -124,6 +124,7 @@ export(read_adat) export(read_annotations) export(regexSeqId) export(rename) +export(reverseMedianNormalize) export(right_join) export(rm_rn) export(rn2col) diff --git a/R/medianNormalize.R b/R/medianNormalize.R index bf1aac89..856da4f0 100644 --- a/R/medianNormalize.R +++ b/R/medianNormalize.R @@ -59,11 +59,9 @@ #' maximum likelihood (ANML) method which leverages a population-based #' reference that iteratively down-selects the set of analytes to include #' for the normalization calculation. -#' \item The function requires `reverse_existing = TRUE` to be set in order -#' to process data where study samples have already undergone ANML or -#' standard median normalization. -#' \item When reversing existing normalization, only study samples are -#' reversed; QC, Calibrator, and Buffer samples retain their normalization +#' \item This function requires unnormalized data as input. If study samples +#' have already undergone median normalization (ANML or standard), first use +#' `reverseMedianNormalize()` to remove existing normalization. #' } #' #' @param adat A `soma_adat` object created using [read_adat()], containing @@ -90,39 +88,34 @@ #' performed within each group separately. Default is `"SampleType"`. Note #' that only study samples (SampleType == 'Sample') are normalized; QC, #' Calibrator, and Buffer samples are automatically excluded. -#' @param reverse_existing Logical. Should existing median or ANML normalization be -#' reversed before applying new normalization? When `TRUE`, existing median -#' normalization scale factors or ANML normalization effects are reversed for -#' study samples only (QC, Calibrator, and Buffer samples retain their -#' normalization). This allows re-normalization of data that has already been -#' median normalized or ANML normalized. Default is `FALSE`. #' @param verbose Logical. Should progress messages be printed? Default is `TRUE`. #' @return A `soma_adat` object with median normalization applied and RFU values #' adjusted. The existing `NormScale_*` columns are updated to include the #' effects of both plate scale normalization and median normalization. #' @examples #' \dontrun{ +#' # Starting with unnormalized ADAT +#' unnormalized_adat <- read_adat("unnormalized_study_data.adat") +#' #' # Internal reference from study samples (default) -#' med_norm_adat <- medianNormalize(adat) +#' med_norm_adat <- medianNormalize(unnormalized_adat) #' #' # Reference from another ADAT #' ref_adat <- read_adat("reference_file.adat") -#' med_norm_adat <- medianNormalize(adat, reference = ref_adat) +#' med_norm_adat <- medianNormalize(unnormalized_adat, reference = ref_adat) #' #' # External reference as a data.frame - requires `SeqId` and `Reference` columns #' ref_data <- read.csv("reference_file.csv") -#' med_norm_adat <- medianNormalize(adat, reference = ref_data) +#' med_norm_adat <- medianNormalize(unnormalized_adat, reference = ref_data) #' #' # Custom grouping by multiple variables #' # Use when total protein load changes due to analysis conditions -#' # (normalize within groups to account for expected biological differences) -#' med_norm_adat <- medianNormalize(adat, by = c("Sex", "SampleType")) +#' med_norm_adat <- medianNormalize(unnormalized_adat, by = c("Sex", "SampleType")) #' -#' # Re-normalize data that has already been median or ANML normalized -#' # Use when you want to apply different normalization to previously normalized data -#' med_norm_adat <- medianNormalize(adat, -#' reverse_existing = TRUE, -#' reference = new_reference) +#' # If you already have normalized data, first reverse the normalization +#' normalized_adat <- read_adat("normalized_study_data.adat") +#' unnormalized_adat <- reverseMedianNormalize(normalized_adat) +#' custom_norm_adat <- medianNormalize(unnormalized_adat, reference = new_reference) #' } #' @importFrom dplyr filter #' @importFrom stats median @@ -130,7 +123,6 @@ medianNormalize <- function(adat, reference = NULL, by = "SampleType", - reverse_existing = FALSE, verbose = TRUE) { # Input validation ---- @@ -172,26 +164,7 @@ medianNormalize <- function(adat, } # Check data state and existing normalization ---- - has_existing_norm <- .validateDataState(adat, header, verbose, reverse_existing) - - # Reverse existing normalization if requested ---- - if (reverse_existing && has_existing_norm) { - if (verbose) { - if (grepl("ANML", header$ProcessSteps %||% "", ignore.case = TRUE)) { - cat("Reversing existing ANML normalization for study samples...\n") - adat <- .reverseANMLSMP(adat, verbose) - } else { - cat("Reversing existing median normalization for study samples...\n") - adat <- .reverseMedNormSMP(adat, verbose) - } - } else { - if (grepl("ANML", header$ProcessSteps %||% "", ignore.case = TRUE)) { - adat <- .reverseANMLSMP(adat, verbose) - } else { - adat <- .reverseMedNormSMP(adat, verbose) - } - } - } + .validateDataState(adat, header, verbose) # Create dilution groups ---- apt_data <- getAnalyteInfo(adat) @@ -257,10 +230,13 @@ medianNormalize <- function(adat, ref_data <- .buildInternalReference(adat, dil_groups) } else { # Standard internal reference - calculate per group - ref_data <- NULL + # Build internal reference for adding to metadata if (verbose) { .todo("Building internal reference from study samples (SampleType == 'Sample')") } + ref_data <- .buildInternalReference(adat, dil_groups) + # Mark as group-specific for normalization calculations + attr(ref_data, "group_specific") <- TRUE } } else { ref_data <- .processReference(reference, adat, dil_groups, apt_data, verbose) @@ -318,6 +294,76 @@ medianNormalize <- function(adat, } +#' Recalculate RowCheck after Median Normalization +#' +#' Recalculates RowCheck values as PASS or FLAG based on normalization acceptance +#' criteria for row scale factors after median normalization. Samples with all +#' row scale factors within the acceptance range (0.4 to 2.5) receive "PASS", +#' while samples with any scale factor outside this range receive "FLAG". +#' +#' @param adat A `soma_adat` object after median normalization +#' @param verbose Logical. Whether to print progress messages +#' @return The `soma_adat` object with updated RowCheck values +#' @noRd +.recalculateRowCheck <- function(adat, verbose) { + + if (verbose) { + cat("Recalculating RowCheck values based on normalization acceptance criteria...\n") + } + + # Check if RowCheck column exists + if (!"RowCheck" %in% names(adat)) { + if (verbose) { + cat("No RowCheck column found to recalculate.\n") + } + return(adat) + } + + # Find all normalization scale factor columns (NormScale_*) + scale_factor_cols <- grep("^NormScale_", names(adat), value = TRUE) + + if (length(scale_factor_cols) == 0) { + if (verbose) { + cat("No normalization scale factor columns found. Setting all RowCheck to PASS.\n") + } + adat$RowCheck <- "PASS" + return(adat) + } + + # Define acceptance criteria range for row scale factors + min_scale <- 0.4 + max_scale <- 2.5 + + # Calculate RowCheck for each sample + for (i in seq_len(nrow(adat))) { + # Get all scale factor values for this sample + scale_values <- as.numeric(adat[i, scale_factor_cols, drop = FALSE]) + scale_values <- scale_values[!is.na(scale_values)] + + # Check if all scale factors are within acceptance range + if (length(scale_values) == 0) { + # No scale factors available - default to PASS + adat$RowCheck[i] <- "PASS" + } else if (all(scale_values >= min_scale & scale_values <= max_scale)) { + adat$RowCheck[i] <- "PASS" + } else { + adat$RowCheck[i] <- "FLAG" + } + } + + if (verbose) { + pass_count <- sum(adat$RowCheck == "PASS", na.rm = TRUE) + flag_count <- sum(adat$RowCheck == "FLAG", na.rm = TRUE) + cat("RowCheck values updated for", nrow(adat), "samples.\n") + cat(" - PASS:", pass_count, "samples\n") + cat(" - FLAG:", flag_count, "samples\n") + cat(" - Acceptance criteria: scale factors within [", min_scale, ", ", max_scale, "]\n") + } + + adat +} + + #' Process Reference Data #' @noRd .processReference <- function(reference, adat, dil_groups, apt_data, verbose) { @@ -528,7 +574,7 @@ medianNormalize <- function(adat, } # Calculate reference values for this dilution - if (!is.null(ref_data) && dil_name %in% names(ref_data)) { + if (!is.null(ref_data) && dil_name %in% names(ref_data) && !isTRUE(attr(ref_data, "group_specific"))) { # Use external reference ref_values <- ref_data[[dil_name]] @@ -618,7 +664,8 @@ medianNormalize <- function(adat, if (!is.null(header_meta) && !is.null(header_meta$HEADER)) { # Add median normalization to process steps if ("ProcessSteps" %in% names(header_meta$HEADER)) { - if (!grepl("MedNormSMP", header_meta$HEADER$ProcessSteps)) { + # Check for MedNormSMP that is not preceded by "rev-" + if (!grepl("(? 0) { - for (i in which(sample_mask)) { - scale_factor <- adat[[sf_col]][i] - if (!is.na(scale_factor) && scale_factor != 0) { - # Apply inverse of scale factor - adat[i, dil_apts] <- adat[i, dil_apts] / scale_factor - # Reset scale factor to 1.0 - adat[[sf_col]][i] <- 1.0 - } - } - } - } - } - - # Update ProcessSteps to indicate reversal - header_meta <- attr(adat, "Header.Meta") - if (!is.null(header_meta) && !is.null(header_meta$HEADER)) { - current_steps <- header_meta$HEADER$ProcessSteps %||% "" - - # Add reversal step - look for what type of median norm was reversed - if (grepl("MedNormSMP", current_steps, ignore.case = TRUE)) { - header_meta$HEADER$ProcessSteps <- paste(current_steps, "rev-MedNormSMP", sep = ", ") - } else if (grepl("medNormInt", current_steps, ignore.case = TRUE)) { - header_meta$HEADER$ProcessSteps <- paste(current_steps, "rev-medNormInt", sep = ", ") - } else { - header_meta$HEADER$ProcessSteps <- paste(current_steps, "rev-MedNorm", sep = ", ") - } - - attr(adat, "Header.Meta") <- header_meta - } - - if (verbose) { - cat("Median normalization reversed for", sum(sample_mask), "study samples.\n") - } - - adat -} - - -#' Reverse Existing ANML Normalization for Study Samples -#' @noRd -.reverseANMLSMP <- function(adat, verbose) { - - # Get existing scale factors - sf_cols <- grep("^NormScale_", names(adat), value = TRUE) - - if (length(sf_cols) == 0) { - if (verbose) { - cat("No existing ANML normalization scale factors found to reverse.\n") - } - return(adat) - } - - if (verbose) { - cat("Reversing existing ANML normalization for study samples...\n") - } - - # Get dilution groups - apt_data <- getAnalyteInfo(adat) - dil_groups <- split(apt_data$AptName, apt_data$Dilution) - names(dil_groups) <- gsub("\\.", "_", names(dil_groups)) - names(dil_groups) <- gsub("[.]0$|%|^[.]", "", names(dil_groups)) - - # Only reverse for study samples, leave QC/Calibrator/Buffer alone - sample_mask <- grepl("Sample|sample", adat$SampleType %||% adat$SampleId %||% "", ignore.case = TRUE) - if (sum(sample_mask) == 0) { - if (verbose) { - cat("No study samples found to reverse ANML normalization.\n") - } - return(adat) - } - # For each dilution group, reverse ANML normalization using log space - for (dil_name in names(dil_groups)) { - sf_col <- paste0("NormScale_", dil_name) - - if (sf_col %in% names(adat)) { - dil_apts <- intersect(dil_groups[[dil_name]], getAnalytes(adat)) - - if (length(dil_apts) > 0) { - for (i in which(sample_mask)) { - scale_factor <- adat[[sf_col]][i] - if (!is.na(scale_factor) && scale_factor != 0) { - # ANML uses log space scaling - reverse by subtracting log scale factor - log_sf <- log10(scale_factor) - log_data <- log10(as.numeric(adat[i, dil_apts])) - reversed_log_data <- log_data - log_sf - adat[i, dil_apts] <- 10^reversed_log_data - # Reset scale factor to 1.0 - adat[[sf_col]][i] <- 1.0 - } - } - } - } - } - - # Remove ANMLFractionUsed columns if they exist - anml_frac_cols <- grep("^ANMLFractionUsed_", names(adat), value = TRUE) - if (length(anml_frac_cols) > 0) { - for (col in anml_frac_cols) { - if (all(is.na(adat[[col]][sample_mask]))) { - adat[[col]][sample_mask] <- NA - } - } - } - - # Update ProcessSteps to indicate reversal - header_meta <- attr(adat, "Header.Meta") - if (!is.null(header_meta) && !is.null(header_meta$HEADER)) { - current_steps <- header_meta$HEADER$ProcessSteps %||% "" - - # Add reversal step - look for what type of ANML was reversed - if (grepl("anmlSMP", current_steps, ignore.case = TRUE)) { - header_meta$HEADER$ProcessSteps <- paste(current_steps, "rev-anmlSMP", sep = ", ") - } else if (grepl("ANML", current_steps, ignore.case = TRUE)) { - header_meta$HEADER$ProcessSteps <- paste(current_steps, "rev-ANML", sep = ", ") - } - - attr(adat, "Header.Meta") <- header_meta - } - - # Reset RowCheck for reversed samples - if ("RowCheck" %in% names(adat)) { - adat$RowCheck[sample_mask] <- "PASS" - } - - if (verbose) { - cat("ANML normalization reversed for", sum(sample_mask), "study samples.\n") - } - - adat -} - - -#' Recalculate RowCheck after Median Normalization -#' -#' Recalculates RowCheck values as PASS or FLAG based on normalization acceptance -#' criteria for row scale factors after median normalization. Samples with all -#' row scale factors within the acceptance range (0.4 to 2.5) receive "PASS", -#' while samples with any scale factor outside this range receive "FLAG". -#' -#' @param adat A `soma_adat` object after median normalization -#' @param verbose Logical. Whether to print progress messages -#' @return The `soma_adat` object with updated RowCheck values -#' @noRd -.recalculateRowCheck <- function(adat, verbose) { - - if (verbose) { - cat("Recalculating RowCheck values based on normalization acceptance criteria...\n") - } - - # Check if RowCheck column exists - if (!"RowCheck" %in% names(adat)) { - if (verbose) { - cat("No RowCheck column found to recalculate.\n") - } - return(adat) - } - - # Find all normalization scale factor columns (NormScale_*) - scale_factor_cols <- grep("^NormScale_", names(adat), value = TRUE) - - if (length(scale_factor_cols) == 0) { - if (verbose) { - cat("No normalization scale factor columns found. Setting all RowCheck to PASS.\n") - } - adat$RowCheck <- "PASS" - return(adat) - } - - # Define acceptance criteria range for row scale factors - min_scale <- 0.4 - max_scale <- 2.5 - - # Calculate RowCheck for each sample - for (i in seq_len(nrow(adat))) { - # Get all scale factor values for this sample - scale_values <- as.numeric(adat[i, scale_factor_cols, drop = FALSE]) - scale_values <- scale_values[!is.na(scale_values)] - - # Check if all scale factors are within acceptance range - if (length(scale_values) == 0) { - # No scale factors available - default to PASS - adat$RowCheck[i] <- "PASS" - } else if (all(scale_values >= min_scale & scale_values <= max_scale)) { - adat$RowCheck[i] <- "PASS" - } else { - adat$RowCheck[i] <- "FLAG" - } - } - - if (verbose) { - pass_count <- sum(adat$RowCheck == "PASS", na.rm = TRUE) - flag_count <- sum(adat$RowCheck == "FLAG", na.rm = TRUE) - cat("RowCheck values updated for", nrow(adat), "samples.\n") - cat(" - PASS:", pass_count, "samples\n") - cat(" - FLAG:", flag_count, "samples\n") - cat(" - Acceptance criteria: scale factors within [", min_scale, ", ", max_scale, "]\n") - } - - adat -} diff --git a/R/reverseMedianNormalize.R b/R/reverseMedianNormalize.R new file mode 100644 index 00000000..9c43f024 --- /dev/null +++ b/R/reverseMedianNormalize.R @@ -0,0 +1,301 @@ +#' Reverse Median Normalization from Study Samples +#' +#' @description Reverses median normalization (including ANML) that was +#' previously applied to study samples (`SampleType == "Sample"`). This function +#' is designed to work with standard SomaScan deliverable ADAT files where +#' study samples have undergone median normalization as the final processing step. +#' +#' This function validates that: +#' \enumerate{ +#' \item Study samples have a median normalization step applied +#' \item The normalization was the last transformation applied to study samples +#' \item The correct reversal method is applied based on the normalization type +#' } +#' +#' @section Use Cases: +#' \itemize{ +#' \item Converting from normalized ADAT to unnormalized ADAT for custom normalization +#' \item Preparing normalized delivery data for use with `medianNormalize()` function +#' \item Backing out normalization to apply different normalization strategies +#' } +#' +#' @section Data Requirements: +#' \itemize{ +#' \item ADAT file with study samples (`SampleType == "Sample"`) that have been +#' median normalized (either standard median normalization or ANML) +#' \item Intact header metadata with `ProcessSteps` field indicating the +#' normalization history +#' \item Median normalization must be the last processing step applied to study samples +#' } +#' +#' @param adat A `soma_adat` object with study samples that have been median normalized +#' @param verbose Logical. Should progress messages be printed? Default is `TRUE`. +#' @return A `soma_adat` object with median normalization reversed for study samples. +#' QC, Calibrator, and Buffer samples retain their original normalization. +#' The `ProcessSteps` header is updated to reflect the reversal operation, +#' and median normalization-specific metadata fields are cleared. +#' @examples +#' \dontrun{ +#' # Reverse normalization from a delivered ADAT file +#' normalized_adat <- read_adat("normalized_study_data.adat") +#' unnormalized_adat <- reverseMedianNormalize(normalized_adat) +#' } +#' @export +reverseMedianNormalize <- function(adat, verbose = TRUE) { + + # Input validation ---- + stopifnot("`adat` must be a class `soma_adat` object" = is.soma_adat(adat)) + + # Get header metadata + header <- attr(adat, "Header.Meta")$HEADER + + if (is.null(header)) { + stop("ADAT header metadata is missing", call. = FALSE) + } + + # Validate that study samples have been normalized ---- + process_steps <- header$ProcessSteps %||% "" + + if (process_steps == "") { + stop( + "ProcessSteps field is empty. Cannot determine normalization history.", + call. = FALSE + ) + } + + # Check for evidence of study sample median normalization (MedNormSMP only) + has_mednorm_smp <- grepl("MedNormSMP", process_steps, ignore.case = TRUE) + has_anml_smp <- grepl("anmlSMP", process_steps, ignore.case = TRUE) + + if (!has_mednorm_smp && !has_anml_smp) { + stop( + "No evidence of median normalization applied to study samples. ", + "ProcessSteps: ", process_steps, ". ", + "This function is designed to reverse median normalization from study samples.", + call. = FALSE + ) + } + + # Check that normalization hasn't already been reversed + has_reversal <- grepl("rev-(?:MedNormSMP|medNormInt|anmlSMP|ANML)", process_steps, ignore.case = TRUE, perl = TRUE) + + if (has_reversal) { + stop( + "Data appears to have already been denormalized. ", + "ProcessSteps: ", process_steps, + call. = FALSE + ) + } + + # Determine which normalization type was applied to study samples ---- + # Only one type should be applied, check which was last + if (has_anml_smp) { + normalization_type <- "anml" + } else if (has_mednorm_smp) { + normalization_type <- "median" + } else { + stop( + "Could not determine normalization type from ProcessSteps: ", process_steps, + call. = FALSE + ) + } + + # Apply the appropriate denormalization ---- + if (normalization_type == "anml") { + adat <- .reverseANMLSMP(adat, verbose) + } else if (normalization_type == "median") { + adat <- .reverseMedNormSMP(adat, verbose) + } + + # Remove medNormSMP_ReferenceRFU field from analyte metadata after reversal ---- + col_meta <- attr(adat, "Col.Meta") + if (!is.null(col_meta) && "medNormSMP_ReferenceRFU" %in% names(col_meta)) { + col_meta$medNormSMP_ReferenceRFU <- NULL + attr(adat, "Col.Meta") <- col_meta + if (verbose) { + cat("Removed medNormSMP_ReferenceRFU field from analyte metadata.\n") + } + } + + adat +} + + +#' Reverse Existing Median Normalization for Study Samples +#' @noRd +.reverseMedNormSMP <- function(adat, verbose) { + + # Get existing scale factors + sf_cols <- grep("^NormScale_", names(adat), value = TRUE) + + if (length(sf_cols) == 0) { + if (verbose) { + cat("No existing median normalization scale factors found to reverse.\n") + } + return(adat) + } + + if (verbose) { + cat("Reversing existing median normalization for study samples...\n") + } + + # Get dilution groups + apt_data <- getAnalyteInfo(adat) + dil_groups <- split(apt_data$AptName, apt_data$Dilution) + names(dil_groups) <- gsub("\\.", "_", names(dil_groups)) + names(dil_groups) <- gsub("[.]0$|%|^[.]", "", names(dil_groups)) + + # Only reverse for study samples, leave QC/Calibrator/Buffer alone + sample_mask <- grepl("Sample|sample", adat$SampleType %||% adat$SampleId %||% "", ignore.case = TRUE) + + if (sum(sample_mask) == 0) { + if (verbose) { + cat("No study samples found to reverse normalization.\n") + } + return(adat) + } + + # For each dilution group, reverse normalization + for (dil_name in names(dil_groups)) { + sf_col <- paste0("NormScale_", dil_name) + + if (sf_col %in% names(adat)) { + dil_apts <- intersect(dil_groups[[dil_name]], getAnalytes(adat)) + + if (length(dil_apts) > 0) { + for (i in which(sample_mask)) { + scale_factor <- adat[[sf_col]][i] + if (!is.na(scale_factor) && scale_factor != 0) { + # Apply inverse of scale factor + adat[i, dil_apts] <- adat[i, dil_apts] / scale_factor + # Reset scale factor to 1.0 + adat[[sf_col]][i] <- 1.0 + } + } + } + } + } + + # Update ProcessSteps to indicate reversal + header_meta <- attr(adat, "Header.Meta") + if (!is.null(header_meta) && !is.null(header_meta$HEADER)) { + current_steps <- header_meta$HEADER$ProcessSteps %||% "" + + # Add reversal step + if (grepl("MedNormSMP", current_steps, ignore.case = TRUE)) { + header_meta$HEADER$ProcessSteps <- paste(current_steps, "rev-MedNormSMP", sep = ", ") + } + + # Clear median normalization specific header fields + header_meta$HEADER$NormalizationAlgorithm <- NULL + header_meta$HEADER$MedNormReference <- NULL + + attr(adat, "Header.Meta") <- header_meta + } + + if (verbose) { + cat("Median normalization reversed for", sum(sample_mask), "study samples.\n") + } + + adat +} + + +#' Reverse Existing ANML Normalization for Study Samples +#' @noRd +.reverseANMLSMP <- function(adat, verbose) { + + # Get existing scale factors + sf_cols <- grep("^NormScale_", names(adat), value = TRUE) + + if (length(sf_cols) == 0) { + if (verbose) { + cat("No existing ANML normalization scale factors found to reverse.\n") + } + return(adat) + } + + if (verbose) { + cat("Reversing existing ANML normalization for study samples...\n") + } + + # Get dilution groups + apt_data <- getAnalyteInfo(adat) + dil_groups <- split(apt_data$AptName, apt_data$Dilution) + names(dil_groups) <- gsub("\\.", "_", names(dil_groups)) + names(dil_groups) <- gsub("[.]0$|%|^[.]", "", names(dil_groups)) + + # Only reverse for study samples, leave QC/Calibrator/Buffer alone + sample_mask <- grepl("Sample|sample", adat$SampleType %||% adat$SampleId %||% "", ignore.case = TRUE) + + if (sum(sample_mask) == 0) { + if (verbose) { + cat("No study samples found to reverse ANML normalization.\n") + } + return(adat) + } + + # For each dilution group, reverse ANML normalization using log space + for (dil_name in names(dil_groups)) { + sf_col <- paste0("NormScale_", dil_name) + + if (sf_col %in% names(adat)) { + dil_apts <- intersect(dil_groups[[dil_name]], getAnalytes(adat)) + + if (length(dil_apts) > 0) { + for (i in which(sample_mask)) { + scale_factor <- adat[[sf_col]][i] + if (!is.na(scale_factor) && scale_factor != 0) { + # ANML uses log space scaling - reverse by subtracting log scale factor + log_sf <- log10(scale_factor) + log_data <- log10(as.numeric(adat[i, dil_apts])) + reversed_log_data <- log_data - log_sf + adat[i, dil_apts] <- 10^reversed_log_data + # Reset scale factor to 1.0 + adat[[sf_col]][i] <- 1.0 + } + } + } + } + } + + # Remove ANMLFractionUsed columns if they exist + anml_frac_cols <- grep("^ANMLFractionUsed_", names(adat), value = TRUE) + if (length(anml_frac_cols) > 0) { + for (col in anml_frac_cols) { + if (all(is.na(adat[[col]][sample_mask]))) { + adat[[col]][sample_mask] <- NA + } + } + } + + # Update ProcessSteps to indicate reversal + header_meta <- attr(adat, "Header.Meta") + if (!is.null(header_meta) && !is.null(header_meta$HEADER)) { + current_steps <- header_meta$HEADER$ProcessSteps %||% "" + + # Add reversal step - look for what type of ANML was reversed + if (grepl("anmlSMP", current_steps, ignore.case = TRUE)) { + header_meta$HEADER$ProcessSteps <- paste(current_steps, "rev-anmlSMP", sep = ", ") + } else if (grepl("ANML", current_steps, ignore.case = TRUE)) { + header_meta$HEADER$ProcessSteps <- paste(current_steps, "rev-ANML", sep = ", ") + } + + # Clear median normalization specific header fields + header_meta$HEADER$NormalizationAlgorithm <- NULL + header_meta$HEADER$MedNormReference <- NULL + + attr(adat, "Header.Meta") <- header_meta + } + + # Reset RowCheck for reversed samples + if ("RowCheck" %in% names(adat)) { + adat$RowCheck[sample_mask] <- "PASS" + } + + if (verbose) { + cat("ANML normalization reversed for", sum(sample_mask), "study samples.\n") + } + + adat +} diff --git a/man/medianNormalize.Rd b/man/medianNormalize.Rd index 88e8fcc5..0d33b250 100644 --- a/man/medianNormalize.Rd +++ b/man/medianNormalize.Rd @@ -4,13 +4,7 @@ \alias{medianNormalize} \title{Perform Median Normalization} \usage{ -medianNormalize( - adat, - reference = NULL, - by = "SampleType", - reverse_existing = FALSE, - verbose = TRUE -) +medianNormalize(adat, reference = NULL, by = "SampleType", verbose = TRUE) } \arguments{ \item{adat}{A \code{soma_adat} object created using \code{\link[=read_adat]{read_adat()}}, containing @@ -40,13 +34,6 @@ performed within each group separately. Default is \code{"SampleType"}. Note that only study samples (SampleType == 'Sample') are normalized; QC, Calibrator, and Buffer samples are automatically excluded.} -\item{reverse_existing}{Logical. Should existing median or ANML normalization be -reversed before applying new normalization? When \code{TRUE}, existing median -normalization scale factors or ANML normalization effects are reversed for -study samples only (QC, Calibrator, and Buffer samples retain their -normalization). This allows re-normalization of data that has already been -median normalized or ANML normalized. Default is \code{FALSE}.} - \item{verbose}{Logical. Should progress messages be printed? Default is \code{TRUE}.} } \value{ @@ -117,36 +104,35 @@ systematic bias between the biological groups intended for comparison. maximum likelihood (ANML) method which leverages a population-based reference that iteratively down-selects the set of analytes to include for the normalization calculation. -\item The function requires \code{reverse_existing = TRUE} to be set in order -to process data where study samples have already undergone ANML or -standard median normalization. -\item When reversing existing normalization, only study samples are -reversed; QC, Calibrator, and Buffer samples retain their normalization +\item This function requires unnormalized data as input. If study samples +have already undergone median normalization (ANML or standard), first use +\code{reverseMedianNormalize()} to remove existing normalization. } } \examples{ \dontrun{ +# Starting with unnormalized ADAT +unnormalized_adat <- read_adat("unnormalized_study_data.adat") + # Internal reference from study samples (default) -med_norm_adat <- medianNormalize(adat) +med_norm_adat <- medianNormalize(unnormalized_adat) # Reference from another ADAT ref_adat <- read_adat("reference_file.adat") -med_norm_adat <- medianNormalize(adat, reference = ref_adat) +med_norm_adat <- medianNormalize(unnormalized_adat, reference = ref_adat) # External reference as a data.frame - requires `SeqId` and `Reference` columns ref_data <- read.csv("reference_file.csv") -med_norm_adat <- medianNormalize(adat, reference = ref_data) +med_norm_adat <- medianNormalize(unnormalized_adat, reference = ref_data) # Custom grouping by multiple variables # Use when total protein load changes due to analysis conditions -# (normalize within groups to account for expected biological differences) -med_norm_adat <- medianNormalize(adat, by = c("Sex", "SampleType")) +med_norm_adat <- medianNormalize(unnormalized_adat, by = c("Sex", "SampleType")) -# Re-normalize data that has already been median or ANML normalized -# Use when you want to apply different normalization to previously normalized data -med_norm_adat <- medianNormalize(adat, - reverse_existing = TRUE, - reference = new_reference) +# If you already have normalized data, first reverse the normalization +normalized_adat <- read_adat("normalized_study_data.adat") +unnormalized_adat <- reverseMedianNormalize(normalized_adat) +custom_norm_adat <- medianNormalize(unnormalized_adat, reference = new_reference) } } diff --git a/man/reverseMedianNormalize.Rd b/man/reverseMedianNormalize.Rd new file mode 100644 index 00000000..8d63423a --- /dev/null +++ b/man/reverseMedianNormalize.Rd @@ -0,0 +1,59 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/reverseMedianNormalize.R +\name{reverseMedianNormalize} +\alias{reverseMedianNormalize} +\title{Reverse Median Normalization from Study Samples} +\usage{ +reverseMedianNormalize(adat, verbose = TRUE) +} +\arguments{ +\item{adat}{A \code{soma_adat} object with study samples that have been median normalized} + +\item{verbose}{Logical. Should progress messages be printed? Default is \code{TRUE}.} +} +\value{ +A \code{soma_adat} object with median normalization reversed for study samples. +QC, Calibrator, and Buffer samples retain their original normalization. +The \code{ProcessSteps} header is updated to reflect the reversal operation, +and median normalization-specific metadata fields are cleared. +} +\description{ +Reverses median normalization (including ANML) that was +previously applied to study samples (\code{SampleType == "Sample"}). This function +is designed to work with standard SomaScan deliverable ADAT files where +study samples have undergone median normalization as the final processing step. + +This function validates that: +\enumerate{ +\item Study samples have a median normalization step applied +\item The normalization was the last transformation applied to study samples +\item The correct reversal method is applied based on the normalization type +} +} +\section{Use Cases}{ + +\itemize{ +\item Converting from normalized ADAT to unnormalized ADAT for custom normalization +\item Preparing normalized delivery data for use with \code{medianNormalize()} function +\item Backing out normalization to apply different normalization strategies +} +} + +\section{Data Requirements}{ + +\itemize{ +\item ADAT file with study samples (\code{SampleType == "Sample"}) that have been +median normalized (either standard median normalization or ANML) +\item Intact header metadata with \code{ProcessSteps} field indicating the +normalization history +\item Median normalization must be the last processing step applied to study samples +} +} + +\examples{ +\dontrun{ +# Reverse normalization from a delivered ADAT file +normalized_adat <- read_adat("normalized_study_data.adat") +unnormalized_adat <- reverseMedianNormalize(normalized_adat) +} +} diff --git a/tests/testthat/_snaps/medianNormalize.md b/tests/testthat/_snaps/medianNormalize.md index a0b80f67..fb68f036 100644 --- a/tests/testthat/_snaps/medianNormalize.md +++ b/tests/testthat/_snaps/medianNormalize.md @@ -6,8 +6,9 @@ Data validation passed for median normalization. Standard deliverable checks: - Hybridization normalization: PASS + - medNormInt (controls): PASS - Plate scale normalization: PASS - - No existing MedNorm/ANML: PASS + - Study samples not already normalized: PASS Three dilution setup detected (standard setup). Message > Normalization scale factors already exist: "NormScale_20, NormScale_0_005, NormScale_0_5" - they will be replaced with new scale factors diff --git a/tests/testthat/test-medianNormalize.R b/tests/testthat/test-medianNormalize.R index 8abef934..13d93ca1 100644 --- a/tests/testthat/test-medianNormalize.R +++ b/tests/testthat/test-medianNormalize.R @@ -33,7 +33,7 @@ create_test_data <- function(small = FALSE) { header_meta <- attr(test_adat, "Header.Meta") if (!is.null(header_meta) && !is.null(header_meta$HEADER)) { # Remove ANML and median normalization steps to simulate pre-processed state - header_meta$HEADER$ProcessSteps <- "Raw RFU, Hyb Normalization, plateScale, Calibration" + header_meta$HEADER$ProcessSteps <- "Raw RFU, Hyb Normalization, medNormInt, plateScale, Calibration" attr(test_adat, "Header.Meta") <- header_meta } @@ -277,7 +277,7 @@ test_that("`medianNormalize` produces expected verbose output", { ) }) -test_that("`medianNormalize` can reverse ANML normalization", { +test_that("`medianNormalize` errors on already normalized data", { # Create ANML-like test data by modifying test_data anml_test_data <- test_data @@ -297,22 +297,16 @@ test_that("`medianNormalize` can reverse ANML normalization", { anml_test_data$ANMLFractionUsed_0_5 <- c(0.88, 0.91, 0.83) anml_test_data$ANMLFractionUsed_0_005 <- c(0.90, 0.87, 0.85) - # Test ANML reversal - expect_no_error( - result <- medianNormalize(anml_test_data, reverse_existing = TRUE, verbose = FALSE) + # Test that medianNormalize errors on already normalized data + expect_error( + medianNormalize(anml_test_data, verbose = FALSE), + "Study samples appear to already be normalized" ) - # Check that result is valid - expect_true(is.soma_adat(result)) - - # Check that ProcessSteps includes reversal and new normalization - result_header <- attr(result, "Header.Meta")$HEADER - expect_true(grepl("rev-anmlSMP", result_header$ProcessSteps)) - expect_true(grepl("MedNormSMP", result_header$ProcessSteps)) - - # Test that error occurs without reverse_existing flag + # Test that the error message suggests reverseMedianNormalize expect_error( - medianNormalize(anml_test_data, reverse_existing = FALSE, verbose = FALSE), + medianNormalize(anml_test_data, verbose = FALSE), + "reverseMedianNormalize" ) }) diff --git a/tests/testthat/test-reverseMedianNormalize.R b/tests/testthat/test-reverseMedianNormalize.R new file mode 100644 index 00000000..fd35c405 --- /dev/null +++ b/tests/testthat/test-reverseMedianNormalize.R @@ -0,0 +1,84 @@ +# Setup ---- +data("example_data", package = "SomaDataIO") + +# Create small representative subset for faster tests +test_data_full <- example_data[1:3, ] + +# Testing ---- +test_that("`reverseMedianNormalize` reverses ANML normalization", { + # Use example_data which already has anmlSMP ProcessStep + anml_data <- test_data_full + + # Verify it has anmlSMP in ProcessSteps (should already be there) + header_meta <- attr(anml_data, "Header.Meta") + expect_true(grepl("anmlSMP", header_meta$HEADER$ProcessSteps)) + + # Add ANML-like normalization scale factors + anml_data$NormScale_20 <- c(1.15, 0.95, 1.08) + anml_data$NormScale_0_5 <- c(1.22, 0.88, 1.12) + anml_data$NormScale_0_005 <- c(0.93, 1.07, 0.98) + + # Reverse ANML normalization + expect_no_error( + result <- reverseMedianNormalize(anml_data, verbose = FALSE) + ) + + # Check that result is valid + expect_true(is.soma_adat(result)) + + # Check that ProcessSteps includes reversal + result_header <- attr(result, "Header.Meta")$HEADER + expect_true(grepl("rev-anmlSMP", result_header$ProcessSteps)) + + # Check that scale factors are reset to 1.0 for study samples + sample_mask <- result$SampleType == "Sample" + expect_true(all(result$NormScale_20[sample_mask] == 1.0)) + expect_true(all(result$NormScale_0_5[sample_mask] == 1.0)) + expect_true(all(result$NormScale_0_005[sample_mask] == 1.0)) +}) + +test_that("`reverseMedianNormalize` error conditions", { + # Create unnormalized data by modifying ProcessSteps + unnormalized_data <- test_data_full + header_meta <- attr(unnormalized_data, "Header.Meta") + header_meta$HEADER$ProcessSteps <- "Raw RFU, Hyb Normalization, medNormInt, plateScale, Calibration" + attr(unnormalized_data, "Header.Meta") <- header_meta + + # Test with unnormalized data (should error) + expect_error( + reverseMedianNormalize(unnormalized_data, verbose = FALSE), + "No evidence of median normalization applied to study samples" + ) + + # Test with non-soma_adat object + expect_error( + reverseMedianNormalize(data.frame(a = 1:3, b = 4:6)), + "`adat` must be a class `soma_adat` object" + ) +}) + +test_that("`reverseMedianNormalize` + `medianNormalize` workflow", { + # Create normalized data with MedNormSMP + normalized_data <- test_data_full + header_meta <- attr(normalized_data, "Header.Meta") + header_meta$HEADER$ProcessSteps <- "Raw RFU, Hyb Normalization, medNormInt (SampleId), plateScale, Calibration, MedNormSMP" + attr(normalized_data, "Header.Meta") <- header_meta + normalized_data$NormScale_20 <- c(1.05, 0.98, 1.12) + normalized_data$NormScale_0_5 <- c(1.08, 0.92, 1.15) + normalized_data$NormScale_0_005 <- c(0.97, 1.03, 0.95) + + # Test complete workflow: reverse then re-normalize + expect_no_error( + unnormalized <- reverseMedianNormalize(normalized_data, verbose = FALSE) + ) + + expect_no_error( + renormalized <- medianNormalize(unnormalized, verbose = FALSE) + ) + + # Check final result structure + expect_true(is.soma_adat(renormalized)) + final_header <- attr(renormalized, "Header.Meta")$HEADER + expect_true(grepl("rev-MedNormSMP", final_header$ProcessSteps)) + expect_true(grepl("MedNormSMP", final_header$ProcessSteps)) +}) From 7be8264b6f1d7a6a327ad75b3e4813ec1fa1c7e0 Mon Sep 17 00:00:00 2001 From: Alex Poole Date: Thu, 12 Mar 2026 14:52:32 -0600 Subject: [PATCH 08/10] update medianNormalize function documentation --- R/medianNormalize.R | 7 +++---- man/plot.Map.Rd | 4 ++-- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/R/medianNormalize.R b/R/medianNormalize.R index 856da4f0..bc2b9aa5 100644 --- a/R/medianNormalize.R +++ b/R/medianNormalize.R @@ -17,7 +17,7 @@ #' distribution aligns to a defined reference, thereby correcting #' global intensity shifts without altering relative differences between #' measurements within a sample. For assay formats with multiple dilution groups -#' (e.g., 1:5 or 20%; 1:200 or 0.5%, 1:20,000 or 0.005%), separate scale +#' (e.g., 1:5 or 20%; 1:200 or 0.5%; 1:20,000 or 0.005%), separate scale #' factors are calculated for each dilution because each dilution group is #' processed separately during the assay. For each sample, the ratio of #' reference RFU / observed RFU is calculated for every SeqId. The median ratio @@ -41,7 +41,7 @@ #' \itemize{ #' \item Combining data sets from the same overarching experiment or sample #' population and normalize to a common reference that were originally -#' processed separately and each normalized "withing study". +#' processed separately and each normalized "within study". #' \item Normalize fundamentally different types of samples separately (by #' group). For instance, lysate samples from different cell lines that #' will be analyzed separately should likely be median normalized within @@ -70,8 +70,7 @@ #' \itemize{ #' \item `NULL` (default): Calculate an internal reference from study #' samples by taking the median of each SeqId within the sample -#' grouping. For multi-plate studies, the median of all plate medians -#' is used. +#' grouping. #' \item A `soma_adat` object: Extract reference from this ADAT #' \item A data.frame: Use provided reference data directly #' } diff --git a/man/plot.Map.Rd b/man/plot.Map.Rd index 1743ba08..67d9ce36 100644 --- a/man/plot.Map.Rd +++ b/man/plot.Map.Rd @@ -36,8 +36,8 @@ \item \code{\link[grDevices:palettes]{grDevices::terrain.colors()}} \item \code{\link[grDevices:palettes]{grDevices::topo.colors()}} \item \code{\link[RColorBrewer:ColorBrewer]{RColorBrewer::brewer.pal()}} -\item \code{\link[viridis:reexports]{viridis::viridis()}} -\item \code{\link[viridis:reexports]{viridis::magma()}} +\item \code{\link[viridis:viridis]{viridis::viridis()}} +\item \code{\link[viridis:magma]{viridis::magma()}} }} \item{legend.ticks}{How many ticks to place on the color legend.} From 0fd12b6e7ec07ad450bf54736292657aa896f379 Mon Sep 17 00:00:00 2001 From: scheidec Date: Mon, 16 Mar 2026 10:06:25 -0400 Subject: [PATCH 09/10] Minor medianNormalize msg and documentation edits --- R/medianNormalize.R | 20 +++++++++++++------- man/medianNormalize.Rd | 7 +++---- man/plot.Map.Rd | 4 ++-- 3 files changed, 18 insertions(+), 13 deletions(-) diff --git a/R/medianNormalize.R b/R/medianNormalize.R index bc2b9aa5..d0183616 100644 --- a/R/medianNormalize.R +++ b/R/medianNormalize.R @@ -188,13 +188,6 @@ medianNormalize <- function(adat, # Check for existing normalization scale factors ---- existing_norm_sf <- grep("^NormScale_", names(adat), value = TRUE) - if ( verbose ) { - if (length(existing_norm_sf) > 0) { - .todo("Normalization scale factors already exist: {.val {paste0(existing_norm_sf, collapse = ', ')}} - they will be replaced with new scale factors") - } - } - - # Determine which samples to normalize - only Sample types if (!"SampleType" %in% names(adat)) { stop("Field 'SampleType' not found in adat columns", call. = FALSE) @@ -209,6 +202,19 @@ medianNormalize <- function(adat, } dont_samples <- setdiff(seq_len(nrow(adat)), do_samples) + # Check existing normalization scale factors and provide appropriate message ---- + if (verbose && length(existing_norm_sf) > 0) { + # Check if all scale factors are 1.0 for samples being normalized + sample_sf_data <- adat[do_samples, existing_norm_sf, drop = FALSE] + all_ones <- all(sapply(sample_sf_data, function(x) all(abs(x - 1.0) < 1e-10, na.rm = TRUE))) + + if (all_ones) { + .todo("Normalization scale factor columns already exist: {.val {paste0(existing_norm_sf, collapse = ', ')}} - they will be replaced with new scale factors") + } else { + .todo("Normalization scale factors already exist: {.val {paste0(existing_norm_sf, collapse = ', ')}} - they will be replaced with new scale factors") + } + } + # Process reference ---- if (is.null(reference)) { # Check if SampleType conflicts with grouping variables diff --git a/man/medianNormalize.Rd b/man/medianNormalize.Rd index 0d33b250..1273f902 100644 --- a/man/medianNormalize.Rd +++ b/man/medianNormalize.Rd @@ -14,8 +14,7 @@ RFU values that have been hybridization normalized and plate scaled.} \itemize{ \item \code{NULL} (default): Calculate an internal reference from study samples by taking the median of each SeqId within the sample -grouping. For multi-plate studies, the median of all plate medians -is used. +grouping. \item A \code{soma_adat} object: Extract reference from this ADAT \item A data.frame: Use provided reference data directly } @@ -59,7 +58,7 @@ The method scales each sample so that the center of the within-sample analyte distribution aligns to a defined reference, thereby correcting global intensity shifts without altering relative differences between measurements within a sample. For assay formats with multiple dilution groups -(e.g., 1:5 or 20\%; 1:200 or 0.5\%, 1:20,000 or 0.005\%), separate scale +(e.g., 1:5 or 20\%; 1:200 or 0.5\%; 1:20,000 or 0.005\%), separate scale factors are calculated for each dilution because each dilution group is processed separately during the assay. For each sample, the ratio of reference RFU / observed RFU is calculated for every SeqId. The median ratio @@ -84,7 +83,7 @@ steps applied. \itemize{ \item Combining data sets from the same overarching experiment or sample population and normalize to a common reference that were originally -processed separately and each normalized "withing study". +processed separately and each normalized "within study". \item Normalize fundamentally different types of samples separately (by group). For instance, lysate samples from different cell lines that will be analyzed separately should likely be median normalized within diff --git a/man/plot.Map.Rd b/man/plot.Map.Rd index 67d9ce36..1743ba08 100644 --- a/man/plot.Map.Rd +++ b/man/plot.Map.Rd @@ -36,8 +36,8 @@ \item \code{\link[grDevices:palettes]{grDevices::terrain.colors()}} \item \code{\link[grDevices:palettes]{grDevices::topo.colors()}} \item \code{\link[RColorBrewer:ColorBrewer]{RColorBrewer::brewer.pal()}} -\item \code{\link[viridis:viridis]{viridis::viridis()}} -\item \code{\link[viridis:magma]{viridis::magma()}} +\item \code{\link[viridis:reexports]{viridis::viridis()}} +\item \code{\link[viridis:reexports]{viridis::magma()}} }} \item{legend.ticks}{How many ticks to place on the color legend.} From 346fb5681bc36f354535fc783d2217cf0bc77e54 Mon Sep 17 00:00:00 2001 From: scheidec Date: Mon, 16 Mar 2026 15:38:21 -0400 Subject: [PATCH 10/10] Add more robust checks to medNorm functions - properly handle data extraction from multiple scale factor columns - added logic to preserve soma_adat attributes during rbind operations - improved SampleType matching precision and clearing of ANMLFractionUsed column in reverseMedianMormalize() --- R/medianNormalize.R | 35 +++++++++-- R/reverseMedianNormalize.R | 63 +++++++++++++++----- tests/testthat/test-reverseMedianNormalize.R | 37 ++++++++++++ 3 files changed, 113 insertions(+), 22 deletions(-) diff --git a/R/medianNormalize.R b/R/medianNormalize.R index d0183616..63d89eaf 100644 --- a/R/medianNormalize.R +++ b/R/medianNormalize.R @@ -193,10 +193,10 @@ medianNormalize <- function(adat, stop("Field 'SampleType' not found in adat columns", call. = FALSE) } - do_samples <- grep("Sample", adat[["SampleType"]]) + do_samples <- which(adat$SampleType %in% "Sample") if (length(do_samples) == 0L) { stop( - "No samples selected for normalization with pattern: Sample", + "No samples selected for normalization with SampleType == 'Sample'", call. = FALSE ) } @@ -207,7 +207,7 @@ medianNormalize <- function(adat, # Check if all scale factors are 1.0 for samples being normalized sample_sf_data <- adat[do_samples, existing_norm_sf, drop = FALSE] all_ones <- all(sapply(sample_sf_data, function(x) all(abs(x - 1.0) < 1e-10, na.rm = TRUE))) - + if (all_ones) { .todo("Normalization scale factor columns already exist: {.val {paste0(existing_norm_sf, collapse = ', ')}} - they will be replaced with new scale factors") } else { @@ -281,7 +281,18 @@ medianNormalize <- function(adat, # Ensure column order matches unnorm_adat <- unnorm_adat[, names(norm_adat)] - norm_adat <- rbind(norm_adat, unnorm_adat) + + # Combine normalized and unnormalized samples in a way that preserves + # soma_adat attributes and column metadata. + norm_attrs <- attributes(norm_adat) + combined_df <- rbind(as.data.frame(norm_adat), as.data.frame(unnorm_adat)) + # Restore non-dimension attributes (e.g., Col.Meta, header metadata) + # while keeping the combined row.names and names. + for (att in setdiff(names(norm_attrs), c("row.names", "names", "class"))) { + attr(combined_df, att) <- norm_attrs[[att]] + } + class(combined_df) <- norm_attrs[["class"]] + norm_adat <- combined_df } # Restore original order @@ -342,7 +353,7 @@ medianNormalize <- function(adat, # Calculate RowCheck for each sample for (i in seq_len(nrow(adat))) { # Get all scale factor values for this sample - scale_values <- as.numeric(adat[i, scale_factor_cols, drop = FALSE]) + scale_values <- unlist(adat[i, scale_factor_cols, drop = FALSE], use.names = FALSE) scale_values <- scale_values[!is.na(scale_values)] # Check if all scale factors are within acceptance range @@ -515,6 +526,9 @@ medianNormalize <- function(adat, # Store original rownames to restore later original_rownames <- rownames(adat) + # Preserve original attributes to restore later + original_attrs <- attributes(adat) + # Validate grouping variables if (is.character(by) && length(by) > 0) { missing_cols <- setdiff(by, names(adat)) @@ -622,7 +636,7 @@ medianNormalize <- function(adat, # Calculate scale factors for each sample in this group for (i in seq_len(nrow(grp_adat))) { - sample_values <- as.numeric(grp_adat[i, dil_apts, drop = FALSE]) + sample_values <- unlist(grp_adat[i, dil_apts, drop = FALSE], use.names = FALSE) ratios <- grp_ref_values / sample_values med_scale_factor <- median(ratios[is.finite(ratios)], na.rm = TRUE) @@ -652,6 +666,15 @@ medianNormalize <- function(adat, # Remove temporary grouping column adat$.group <- NULL + # Reattach original soma_adat attributes lost during rbind + if (!is.null(original_attrs)) { + # Preserve current column names and row.names; restore all other attributes + attrs_to_restore <- setdiff(names(original_attrs), c("names", "row.names")) + for (nm in attrs_to_restore) { + attr(adat, nm) <- original_attrs[[nm]] + } + } + # Round to 1 decimal place (standard for SomaScan data) apts <- getAnalytes(adat) for (apt in apts) { diff --git a/R/reverseMedianNormalize.R b/R/reverseMedianNormalize.R index 9c43f024..649145ac 100644 --- a/R/reverseMedianNormalize.R +++ b/R/reverseMedianNormalize.R @@ -32,7 +32,7 @@ #' @param verbose Logical. Should progress messages be printed? Default is `TRUE`. #' @return A `soma_adat` object with median normalization reversed for study samples. #' QC, Calibrator, and Buffer samples retain their original normalization. -#' The `ProcessSteps` header is updated to reflect the reversal operation, +#' The `ProcessSteps` header is updated to reflect the reversal operation, #' and median normalization-specific metadata fields are cleared. #' @examples #' \dontrun{ @@ -63,9 +63,13 @@ reverseMedianNormalize <- function(adat, verbose = TRUE) { ) } - # Check for evidence of study sample median normalization (MedNormSMP only) - has_mednorm_smp <- grepl("MedNormSMP", process_steps, ignore.case = TRUE) - has_anml_smp <- grepl("anmlSMP", process_steps, ignore.case = TRUE) + # Tokenize ProcessSteps so we can reason about ordering of steps ---- + step_tokens <- unlist(strsplit(process_steps, "\\s*[;,]\\s*")) + step_tokens <- step_tokens[nzchar(step_tokens)] + + # Check for evidence of study sample median normalization (MedNormSMP / anmlSMP) + has_mednorm_smp <- any(grepl("MedNormSMP", step_tokens, ignore.case = TRUE)) + has_anml_smp <- any(grepl("anmlSMP", step_tokens, ignore.case = TRUE)) if (!has_mednorm_smp && !has_anml_smp) { stop( @@ -77,7 +81,8 @@ reverseMedianNormalize <- function(adat, verbose = TRUE) { } # Check that normalization hasn't already been reversed - has_reversal <- grepl("rev-(?:MedNormSMP|medNormInt|anmlSMP|ANML)", process_steps, ignore.case = TRUE, perl = TRUE) + has_reversal <- any(grepl("rev-(?:MedNormSMP|medNormInt|anmlSMP|ANML)", step_tokens, + ignore.case = TRUE, perl = TRUE)) if (has_reversal) { stop( @@ -89,9 +94,30 @@ reverseMedianNormalize <- function(adat, verbose = TRUE) { # Determine which normalization type was applied to study samples ---- # Only one type should be applied, check which was last - if (has_anml_smp) { + # Identify the last normalization-related token and ensure it is the final step + norm_idx <- which( + grepl("MedNormSMP", step_tokens, ignore.case = TRUE) | + grepl("anmlSMP", step_tokens, ignore.case = TRUE) + ) + if (length(norm_idx) == 0L) { + stop( + "Could not determine normalization type from ProcessSteps: ", process_steps, + call. = FALSE + ) + } + last_norm_idx <- norm_idx[length(norm_idx)] + if (last_norm_idx != length(step_tokens)) { + stop( + "Median/ANML normalization of study samples is not the final processing step. ", + "ProcessSteps: ", process_steps, ". ", + "Reversal requires normalization to be the last transformation applied to study samples.", + call. = FALSE + ) + } + last_norm_token <- step_tokens[last_norm_idx] + if (grepl("anmlSMP", last_norm_token, ignore.case = TRUE)) { normalization_type <- "anml" - } else if (has_mednorm_smp) { + } else if (grepl("MedNormSMP", last_norm_token, ignore.case = TRUE)) { normalization_type <- "median" } else { stop( @@ -146,9 +172,12 @@ reverseMedianNormalize <- function(adat, verbose = TRUE) { names(dil_groups) <- gsub("[.]0$|%|^[.]", "", names(dil_groups)) # Only reverse for study samples, leave QC/Calibrator/Buffer alone - sample_mask <- grepl("Sample|sample", adat$SampleType %||% adat$SampleId %||% "", ignore.case = TRUE) + if (is.null(adat$SampleType)) { + stop("`SampleType` column is missing; cannot identify study samples to reverse normalization.", call. = FALSE) + } + sample_mask <- !is.na(adat$SampleType) & (adat$SampleType %in% "Sample") - if (sum(sample_mask) == 0) { + if (!any(sample_mask)) { if (verbose) { cat("No study samples found to reverse normalization.\n") } @@ -226,9 +255,12 @@ reverseMedianNormalize <- function(adat, verbose = TRUE) { names(dil_groups) <- gsub("[.]0$|%|^[.]", "", names(dil_groups)) # Only reverse for study samples, leave QC/Calibrator/Buffer alone - sample_mask <- grepl("Sample|sample", adat$SampleType %||% adat$SampleId %||% "", ignore.case = TRUE) + if (is.null(adat$SampleType)) { + stop("`SampleType` column is missing; cannot identify study samples to reverse normalization.", call. = FALSE) + } + sample_mask <- !is.na(adat$SampleType) & (adat$SampleType %in% "Sample") - if (sum(sample_mask) == 0) { + if (!any(sample_mask)) { if (verbose) { cat("No study samples found to reverse ANML normalization.\n") } @@ -248,7 +280,7 @@ reverseMedianNormalize <- function(adat, verbose = TRUE) { if (!is.na(scale_factor) && scale_factor != 0) { # ANML uses log space scaling - reverse by subtracting log scale factor log_sf <- log10(scale_factor) - log_data <- log10(as.numeric(adat[i, dil_apts])) + log_data <- log10(as.numeric(unlist(adat[i, dil_apts], use.names = FALSE))) reversed_log_data <- log_data - log_sf adat[i, dil_apts] <- 10^reversed_log_data # Reset scale factor to 1.0 @@ -259,13 +291,12 @@ reverseMedianNormalize <- function(adat, verbose = TRUE) { } } - # Remove ANMLFractionUsed columns if they exist + # Clear ANMLFractionUsed columns for reversed study samples anml_frac_cols <- grep("^ANMLFractionUsed_", names(adat), value = TRUE) if (length(anml_frac_cols) > 0) { for (col in anml_frac_cols) { - if (all(is.na(adat[[col]][sample_mask]))) { - adat[[col]][sample_mask] <- NA - } + # Clear ANML-specific metadata for study samples that were reversed + adat[[col]][sample_mask] <- NA } } diff --git a/tests/testthat/test-reverseMedianNormalize.R b/tests/testthat/test-reverseMedianNormalize.R index fd35c405..9041c6fe 100644 --- a/tests/testthat/test-reverseMedianNormalize.R +++ b/tests/testthat/test-reverseMedianNormalize.R @@ -82,3 +82,40 @@ test_that("`reverseMedianNormalize` + `medianNormalize` workflow", { expect_true(grepl("rev-MedNormSMP", final_header$ProcessSteps)) expect_true(grepl("MedNormSMP", final_header$ProcessSteps)) }) + +test_that("`reverseMedianNormalize` validates normalization is final step", { + # Test error when normalization is not the final step + test_data_invalid <- test_data_full + header_meta <- attr(test_data_invalid, "Header.Meta") + header_meta$HEADER$ProcessSteps <- "Raw RFU, Hyb Normalization, medNormInt, anmlSMP, plateScale" # anmlSMP not final + attr(test_data_invalid, "Header.Meta") <- header_meta + + expect_error( + reverseMedianNormalize(test_data_invalid, verbose = FALSE), + "Median/ANML normalization of study samples is not the final processing step" + ) +}) + +test_that("`reverseMedianNormalize` handles ANMLFractionUsed columns properly", { + # Test clearing ANML-specific metadata columns + test_data_anml <- test_data_full + header_meta <- attr(test_data_anml, "Header.Meta") + header_meta$HEADER$ProcessSteps <- "Raw RFU, Hyb Normalization, medNormInt, plateScale, anmlSMP" + attr(test_data_anml, "Header.Meta") <- header_meta + + # Add ANML metadata columns + test_data_anml$ANMLFractionUsed_20 <- c(0.85, 0.92, 0.78) + test_data_anml$ANMLFractionUsed_0_5 <- c(0.91, 0.87, 0.95) + test_data_anml$NormScale_20 <- c(1.15, 0.95, 1.08) + test_data_anml$NormScale_0_5 <- c(1.22, 0.88, 1.12) + + result <- reverseMedianNormalize(test_data_anml, verbose = FALSE) + + # ANMLFractionUsed columns should be cleared for study samples + expect_true(is.na(result$ANMLFractionUsed_20[1])) # Sample should be NA + expect_true(is.na(result$ANMLFractionUsed_0_5[1])) # Sample should be NA + + # Scale factors should be reset to 1.0 for study samples + expect_equal(result$NormScale_20[1], 1.0) + expect_equal(result$NormScale_0_5[1], 1.0) +})