diff --git a/NAMESPACE b/NAMESPACE index d0754ce..f4834c9 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -112,6 +112,7 @@ export(lift_adat) export(loadAdatsAsList) export(locateSeqId) export(matchSeqIds) +export(medianNormalize) export(meltExpressionSet) export(merge_clin) export(mutate) @@ -123,6 +124,7 @@ export(read_adat) export(read_annotations) export(regexSeqId) export(rename) +export(reverseMedianNormalize) export(right_join) export(rm_rn) export(rn2col) diff --git a/NEWS.md b/NEWS.md index 1c689a8..35010b9 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,17 @@ # SomaDataIO (6.5.0.9000) +### New Functions + +* Added `medianNormalize()` function + - performs median normalization on `soma_adat` objects that have been + hybridization normalized and plate scaled + - includes validation to ensure required normalization steps have been applied + - supports multiple reference approaches: + - Internal reference built from study samples + - Reference extracted from existing `soma_adat` object + - External reference as a data.frame + - supports custom grouping by multiple clinical variables + ### Function and Object Improvements * Removed restrictive file validation from `read_annotations()` @@ -35,7 +47,7 @@ * Updated `dplyr` verb tests to no longer explicitly test for ordering of attributes (#165) - + # SomaDataIO (6.4.0) ### New Functions diff --git a/R/medianNormalize.R b/R/medianNormalize.R new file mode 100644 index 0000000..63d89ea --- /dev/null +++ b/R/medianNormalize.R @@ -0,0 +1,870 @@ +#' Perform Median Normalization +#' +#' @description Performs median normalization on a `soma_adat` object that has +#' already undergone standard data processing for array-based SomaScan studies. +#' +#' Median normalization is a common, scale-based normalization technique that +#' corrects for assay-derived technical variation by applying sample-specific +#' linear scaling to expression measurements. Typical sources of assay +#' variation include robotic and manual liquid handling, manufactured +#' consumables such as buffers and plastic goods, laboratory instrument +#' calibration, ambient environmental conditions, inter-operator differences, +#' and other sources of technical variation. Median normalization +#' can improve assay precision and reduce technical variation that can mask +#' true biological signal. +#' +#' The method scales each sample so that the center of the within-sample analyte +#' distribution aligns to a defined reference, thereby correcting +#' global intensity shifts without altering relative differences between +#' measurements within a sample. For assay formats with multiple dilution groups +#' (e.g., 1:5 or 20%; 1:200 or 0.5%; 1:20,000 or 0.005%), separate scale +#' factors are calculated for each dilution because each dilution group is +#' processed separately during the assay. For each sample, the ratio of +#' reference RFU / observed RFU is calculated for every SeqId. The median ratio +#' within each dilution group is selected as the scale factor and applied to +#' all SeqIds for that sample within the associated dilution bin. +#' +#' @section Data Requirements: +#' This function is designed for data in standard SomaLogic deliverable formats. +#' Specific ADAT file requirements: +#' \enumerate{ +#' \item \strong{Intact ADAT file}, with available data processing information +#' in the header section. Specifically, the `ProcessSteps` field must be +#' present and correctly represent the data processing steps present in +#' the data table. +#' \item \strong{Minimal standard processing}, the function assumes a standard +#' SomaScan data deliverable with minimally standard HybNorm and PlateScale +#' steps applied. +#' } +#' +#' \strong{Primary use cases:} +#' \itemize{ +#' \item Combining data sets from the same overarching experiment or sample +#' population and normalize to a common reference that were originally +#' processed separately and each normalized "within study". +#' \item Normalize fundamentally different types of samples separately (by +#' group). For instance, lysate samples from different cell lines that +#' will be analyzed separately should likely be median normalized within +#' each cell type. Lysis buffer background samples would also be expected +#' to be normalized separately. +#' } +#' +#' @section Important Considerations: +#' \itemize{ +#' \item A core assumption of median normalization is that the majority of +#' analytes are not differentially expressed; consequently, users should +#' validate this assumption by inspecting scale-factor distributions for +#' systematic bias between the biological groups intended for comparison. +#' \item Note this function does not perform the adaptive normalization by +#' maximum likelihood (ANML) method which leverages a population-based +#' reference that iteratively down-selects the set of analytes to include +#' for the normalization calculation. +#' \item This function requires unnormalized data as input. If study samples +#' have already undergone median normalization (ANML or standard), first use +#' `reverseMedianNormalize()` to remove existing normalization. +#' } +#' +#' @param adat A `soma_adat` object created using [read_adat()], containing +#' RFU values that have been hybridization normalized and plate scaled. +#' @param reference Optional. Reference for median normalization. Can be: +#' \itemize{ +#' \item `NULL` (default): Calculate an internal reference from study +#' samples by taking the median of each SeqId within the sample +#' grouping. +#' \item A `soma_adat` object: Extract reference from this ADAT +#' \item A data.frame: Use provided reference data directly +#' } +#' When providing an external reference data.frame it must contain: +#' \describe{ +#' \item{SeqId}{Character column containing the SeqId identifiers mapping +#' to those in the `soma_adat` object. Must be in "10000-28" format, not +#' "seq.10000.28" format.} +#' \item{Reference}{Numeric column containing the reference RFU values +#' for each SeqId.} +#' } +#' @param by Character vector. Grouping variable(s) for grouped median +#' normalization. Must be column name(s) in the ADAT. Normalization will be +#' performed within each group separately. Default is `"SampleType"`. Note +#' that only study samples (SampleType == 'Sample') are normalized; QC, +#' Calibrator, and Buffer samples are automatically excluded. +#' @param verbose Logical. Should progress messages be printed? Default is `TRUE`. +#' @return A `soma_adat` object with median normalization applied and RFU values +#' adjusted. The existing `NormScale_*` columns are updated to include the +#' effects of both plate scale normalization and median normalization. +#' @examples +#' \dontrun{ +#' # Starting with unnormalized ADAT +#' unnormalized_adat <- read_adat("unnormalized_study_data.adat") +#' +#' # Internal reference from study samples (default) +#' med_norm_adat <- medianNormalize(unnormalized_adat) +#' +#' # Reference from another ADAT +#' ref_adat <- read_adat("reference_file.adat") +#' med_norm_adat <- medianNormalize(unnormalized_adat, reference = ref_adat) +#' +#' # External reference as a data.frame - requires `SeqId` and `Reference` columns +#' ref_data <- read.csv("reference_file.csv") +#' med_norm_adat <- medianNormalize(unnormalized_adat, reference = ref_data) +#' +#' # Custom grouping by multiple variables +#' # Use when total protein load changes due to analysis conditions +#' med_norm_adat <- medianNormalize(unnormalized_adat, by = c("Sex", "SampleType")) +#' +#' # If you already have normalized data, first reverse the normalization +#' normalized_adat <- read_adat("normalized_study_data.adat") +#' unnormalized_adat <- reverseMedianNormalize(normalized_adat) +#' custom_norm_adat <- medianNormalize(unnormalized_adat, reference = new_reference) +#' } +#' @importFrom dplyr filter +#' @importFrom stats median +#' @export +medianNormalize <- function(adat, + reference = NULL, + by = "SampleType", + verbose = TRUE) { + + # Input validation ---- + stopifnot("`adat` must be a class `soma_adat` object" = is.soma_adat(adat)) + + # Validate reference type early + if (!is.null(reference) && !is.soma_adat(reference) && !is.data.frame(reference)) { + stop( + "Invalid reference type. Must be NULL, soma_adat, or data.frame", + call. = FALSE + ) + } + + # Check that required normalization steps have been applied ---- + header <- attr(adat, "Header.Meta")$HEADER + + if (is.null(header)) { + stop("ADAT header metadata is missing", call. = FALSE) + } + + # Check for hybrid normalization + if (!"ProcessSteps" %in% names(header) || + !grepl("Hyb Normalization", header$ProcessSteps, ignore.case = TRUE)) { + stop( + "Hybrid normalization step not detected in ProcessSteps. ", + "Please apply hybrid normalization before median normalization.", + call. = FALSE + ) + } + + # Check for plate scale factors + norm_cols <- grep("^(Norm|norm)[Ss]cale", names(adat), value = TRUE) + if (length(norm_cols) == 0) { + stop( + "No normalization scale factor columns found. ", + "Please ensure plate scale normalization has been applied.", + call. = FALSE + ) + } + + # Check data state and existing normalization ---- + .validateDataState(adat, header, verbose) + + # Create dilution groups ---- + apt_data <- getAnalyteInfo(adat) + + if (!"Dilution" %in% names(apt_data)) { + stop("Dilution information not found in analyte data", call. = FALSE) + } + + # Filter out hybridization controls + apt_data <- filter(apt_data, !grepl("^Hybridization", Type, ignore.case = TRUE)) + + # Create dilution groups + dil_groups <- split(apt_data$AptName, apt_data$Dilution) + + # Clean up dilution names + names(dil_groups) <- gsub("\\.", "_", names(dil_groups)) + names(dil_groups) <- gsub("[.]0$|%|^[.]", "", names(dil_groups)) + + # Validate dilution count ---- + .validateDilutionCount(dil_groups, verbose) + + # Check for existing normalization scale factors ---- + existing_norm_sf <- grep("^NormScale_", names(adat), value = TRUE) + + # Determine which samples to normalize - only Sample types + if (!"SampleType" %in% names(adat)) { + stop("Field 'SampleType' not found in adat columns", call. = FALSE) + } + + do_samples <- which(adat$SampleType %in% "Sample") + if (length(do_samples) == 0L) { + stop( + "No samples selected for normalization with SampleType == 'Sample'", + call. = FALSE + ) + } + dont_samples <- setdiff(seq_len(nrow(adat)), do_samples) + + # Check existing normalization scale factors and provide appropriate message ---- + if (verbose && length(existing_norm_sf) > 0) { + # Check if all scale factors are 1.0 for samples being normalized + sample_sf_data <- adat[do_samples, existing_norm_sf, drop = FALSE] + all_ones <- all(sapply(sample_sf_data, function(x) all(abs(x - 1.0) < 1e-10, na.rm = TRUE))) + + if (all_ones) { + .todo("Normalization scale factor columns already exist: {.val {paste0(existing_norm_sf, collapse = ', ')}} - they will be replaced with new scale factors") + } else { + .todo("Normalization scale factors already exist: {.val {paste0(existing_norm_sf, collapse = ', ')}} - they will be replaced with new scale factors") + } + } + + # Process reference ---- + if (is.null(reference)) { + # Check if SampleType conflicts with grouping variables + conflicts_with_grouping <- FALSE + if ("SampleType" %in% by) { + samples_to_normalize <- adat[do_samples, ] + group_values <- unique(samples_to_normalize[["SampleType"]]) + group_values <- group_values[!is.na(group_values)] + all_groups_in_ref <- all(group_values %in% "Sample") + conflicts_with_grouping <- !identical(by, "SampleType") || !all_groups_in_ref + } + + if (conflicts_with_grouping) { + # Calculate global reference to avoid groups lacking reference samples + if (verbose) { + .todo("Building global internal reference from study samples (SampleType == 'Sample')") + } + ref_data <- .buildInternalReference(adat, dil_groups) + } else { + # Standard internal reference - calculate per group + # Build internal reference for adding to metadata + if (verbose) { + .todo("Building internal reference from study samples (SampleType == 'Sample')") + } + ref_data <- .buildInternalReference(adat, dil_groups) + # Mark as group-specific for normalization calculations + attr(ref_data, "group_specific") <- TRUE + } + } else { + ref_data <- .processReference(reference, adat, dil_groups, apt_data, verbose) + } + + # Add row identifier to maintain order + adat$.rowid <- seq_len(nrow(adat)) + + # Perform median normalization on selected samples + if (length(do_samples) > 0) { + norm_adat <- .performMedianNorm( + adat[do_samples, ], + dil_groups = dil_groups, + by = by, + ref_data = ref_data, + verbose = verbose + ) + } + + # Handle samples that were not normalized + if (!is.null(dont_samples) && length(dont_samples) > 0) { + unnorm_adat <- adat[dont_samples, ] + sf_cols <- paste0("NormScale_", names(dil_groups)) + + # Check if scale factor columns already exist in the original data + existing_sf_cols <- intersect(sf_cols, names(adat)) + + # For all scale factor columns, preserve existing values or set to 1.0 + for (col in sf_cols) { + if (col %in% existing_sf_cols) { + # Keep the existing value as-is - unnorm_adat[[col]] already contains it + } else { + # Set to 1.0 for new scale factor columns + unnorm_adat[[col]] <- 1.0 + } + } + + # Ensure column order matches + unnorm_adat <- unnorm_adat[, names(norm_adat)] + + # Combine normalized and unnormalized samples in a way that preserves + # soma_adat attributes and column metadata. + norm_attrs <- attributes(norm_adat) + combined_df <- rbind(as.data.frame(norm_adat), as.data.frame(unnorm_adat)) + # Restore non-dimension attributes (e.g., Col.Meta, header metadata) + # while keeping the combined row.names and names. + for (att in setdiff(names(norm_attrs), c("row.names", "names", "class"))) { + attr(combined_df, att) <- norm_attrs[[att]] + } + class(combined_df) <- norm_attrs[["class"]] + norm_adat <- combined_df + } + + # Restore original order + norm_adat <- norm_adat[order(norm_adat$.rowid), ] + norm_adat$.rowid <- NULL + + # Add medNorm reference to SeqId annotations ---- + norm_adat <- .addMedNormReference(norm_adat, ref_data, dil_groups) + + # Recalculate RowCheck to adjust for new MedNorm values ---- + norm_adat <- .recalculateRowCheck(norm_adat, verbose) + + # Update header metadata + .updateHeaderMetadata(norm_adat, reference) +} + + +#' Recalculate RowCheck after Median Normalization +#' +#' Recalculates RowCheck values as PASS or FLAG based on normalization acceptance +#' criteria for row scale factors after median normalization. Samples with all +#' row scale factors within the acceptance range (0.4 to 2.5) receive "PASS", +#' while samples with any scale factor outside this range receive "FLAG". +#' +#' @param adat A `soma_adat` object after median normalization +#' @param verbose Logical. Whether to print progress messages +#' @return The `soma_adat` object with updated RowCheck values +#' @noRd +.recalculateRowCheck <- function(adat, verbose) { + + if (verbose) { + cat("Recalculating RowCheck values based on normalization acceptance criteria...\n") + } + + # Check if RowCheck column exists + if (!"RowCheck" %in% names(adat)) { + if (verbose) { + cat("No RowCheck column found to recalculate.\n") + } + return(adat) + } + + # Find all normalization scale factor columns (NormScale_*) + scale_factor_cols <- grep("^NormScale_", names(adat), value = TRUE) + + if (length(scale_factor_cols) == 0) { + if (verbose) { + cat("No normalization scale factor columns found. Setting all RowCheck to PASS.\n") + } + adat$RowCheck <- "PASS" + return(adat) + } + + # Define acceptance criteria range for row scale factors + min_scale <- 0.4 + max_scale <- 2.5 + + # Calculate RowCheck for each sample + for (i in seq_len(nrow(adat))) { + # Get all scale factor values for this sample + scale_values <- unlist(adat[i, scale_factor_cols, drop = FALSE], use.names = FALSE) + scale_values <- scale_values[!is.na(scale_values)] + + # Check if all scale factors are within acceptance range + if (length(scale_values) == 0) { + # No scale factors available - default to PASS + adat$RowCheck[i] <- "PASS" + } else if (all(scale_values >= min_scale & scale_values <= max_scale)) { + adat$RowCheck[i] <- "PASS" + } else { + adat$RowCheck[i] <- "FLAG" + } + } + + if (verbose) { + pass_count <- sum(adat$RowCheck == "PASS", na.rm = TRUE) + flag_count <- sum(adat$RowCheck == "FLAG", na.rm = TRUE) + cat("RowCheck values updated for", nrow(adat), "samples.\n") + cat(" - PASS:", pass_count, "samples\n") + cat(" - FLAG:", flag_count, "samples\n") + cat(" - Acceptance criteria: scale factors within [", min_scale, ", ", max_scale, "]\n") + } + + adat +} + + +#' Process Reference Data +#' @noRd +.processReference <- function(reference, adat, dil_groups, apt_data, verbose) { + + if (is.soma_adat(reference)) { + # Use reference from provided ADAT + if (verbose) { + .todo("Using reference from provided ADAT object") + } + return(.extractReferenceFromAdat(reference, dil_groups)) + + } else if (is.data.frame(reference)) { + # Reference data provided directly + if (verbose) { + .todo("Using provided reference data.frame") + } + return(.validateReferenceData(reference, dil_groups, apt_data)) + + } else { + stop( + "Invalid reference type. Must be NULL, soma_adat, or data.frame", + call. = FALSE + ) + } +} + +#' Build Internal Reference from Study Samples +#' @noRd +.buildInternalReference <- function(adat, dil_groups) { + + if (!"SampleType" %in% names(adat)) { + stop("Reference field 'SampleType' not found", call. = FALSE) + } + + # Select reference samples + ref_samples <- adat[["SampleType"]] %in% "Sample" + if (sum(ref_samples) == 0) { + stop( + "No reference samples found with field 'SampleType' and value: Sample", + call. = FALSE + ) + } + + ref_adat <- adat[ref_samples, ] + + # Calculate reference medians for each dilution group + ref_data <- list() + for (dil_name in names(dil_groups)) { + dil_apts <- intersect(dil_groups[[dil_name]], getAnalytes(adat)) + if (length(dil_apts) > 0) { + ref_data[[dil_name]] <- apply(ref_adat[, dil_apts, drop = FALSE], 2, median, na.rm = TRUE) + } + } + + ref_data +} + +#' Extract Reference from ADAT +#' @noRd +.extractReferenceFromAdat <- function(ref_adat, dil_groups) { + + # Calculate reference medians for each dilution group + ref_data <- list() + for (dil_name in names(dil_groups)) { + dil_apts <- intersect(dil_groups[[dil_name]], getAnalytes(ref_adat)) + if (length(dil_apts) > 0) { + ref_data[[dil_name]] <- apply(ref_adat[, dil_apts, drop = FALSE], 2, median, na.rm = TRUE) + } + } + + ref_data +} + + + +#' Validate Reference Data +#' @noRd +.validateReferenceData <- function(ref_df, dil_groups, apt_data = NULL) { + + # Check for required SeqId and Reference columns + required_cols <- c("SeqId", "Reference") + if (!all(required_cols %in% names(ref_df))) { + missing_cols <- setdiff(required_cols, names(ref_df)) + stop( + "Reference data must contain 'SeqId' and 'Reference' columns.\n", + "Missing columns: ", paste(missing_cols, collapse = ", "), "\n", + "Found columns: ", paste(names(ref_df), collapse = ", "), + call. = FALSE + ) + } + + # Process as SeqId-specific reference + return(.processSeqIdReference(ref_df, dil_groups, apt_data)) +} + +#' Process SeqId-Specific Reference Data +#' @noRd +.processSeqIdReference <- function(ref_df, dil_groups, apt_data) { + + if (!is.null(apt_data) && !"SeqId" %in% names(apt_data)) { + stop("ADAT analyte data must contain SeqId column for SeqId reference matching", call. = FALSE) + } + + # Create a global reference mapping from all dilutions + # Expect SeqId column in reference data + if (!"SeqId" %in% names(ref_df)) { + stop("Reference data must contain SeqId column for SeqId reference matching", call. = FALSE) + } + global_seqid_refs <- setNames(ref_df$Reference, ref_df$SeqId) + + ref_data <- list() + + # Process each dilution group using the global reference + for (dil_name in names(dil_groups)) { + dil_apt_names <- dil_groups[[dil_name]] + + # Convert AptNames to SeqIds and find matches in global reference + dil_seq_ids <- apt_data$SeqId[apt_data$AptName %in% dil_apt_names] + matching_seq_ids <- intersect(dil_seq_ids, names(global_seqid_refs)) + + if (length(matching_seq_ids) > 0) { + # Convert back to AptNames for the final result + matching_apt_names <- apt_data$AptName[apt_data$SeqId %in% matching_seq_ids] + ref_values <- global_seqid_refs[matching_seq_ids] + names(ref_values) <- matching_apt_names + ref_data[[dil_name]] <- ref_values + } + } + + # Check that we have some references + if (length(ref_data) == 0) { + stop("No matching SeqIds or dilution groups found in reference data", call. = FALSE) + } + + ref_data +} + + + +#' Perform Median Normalization +#' @noRd +.performMedianNorm <- function(adat, dil_groups, by, ref_data, verbose) { + + # Store original rownames to restore later + original_rownames <- rownames(adat) + + # Preserve original attributes to restore later + original_attrs <- attributes(adat) + + # Validate grouping variables + if (is.character(by) && length(by) > 0) { + missing_cols <- setdiff(by, names(adat)) + if (length(missing_cols) > 0) { + stop("Grouping column(s) not found: ", paste(missing_cols, collapse = ", "), + call. = FALSE) + } + } + + # Create grouping variable + if (length(by) == 1L) { + group_var <- adat[[by]] + } else if (length(by) > 1L) { + group_var <- apply(adat[, by, drop = FALSE], 1, paste, collapse = "__") + } else { + group_var <- rep("all", nrow(adat)) + } + + # Report grouping strategy if verbose + if (verbose) { + if (length(by) > 1L) { + .todo("Performing grouped median normalization by: {.val {paste(by, collapse = ', ')}}") + } else if (length(by) == 1L && by != "SampleType") { + .todo("Performing grouped median normalization by: {.val {by}}") + } else if (length(unique(group_var)) > 1) { + .todo("Performing grouped median normalization by: {.val {by}} ({.val {length(unique(group_var))}} groups)") + } + } + + adat$.group <- group_var + + # Split data by groups and process each group separately + groups <- unique(group_var) + result_list <- list() + + for (grp in groups) { + grp_samples <- which(group_var == grp) + grp_adat <- adat[grp_samples, , drop = FALSE] + + if (verbose && length(groups) > 1) { + .todo("Processing group: {.val {grp}} ({.val {length(grp_samples)}} samples)") + } + + # Calculate scale factors for each dilution group within this sample group + for (dil_name in names(dil_groups)) { + sf_col <- paste0("NormScale_", dil_name) + + # Initialize scale factor column + if (!sf_col %in% names(grp_adat)) { + grp_adat[[sf_col]] <- 1.0 + } + + # Get analytes in this dilution + dil_apts <- intersect(dil_groups[[dil_name]], getAnalytes(grp_adat)) + + if (length(dil_apts) == 0) { + next + } + + if (verbose) { + .done("Processing dilution '{dil_name}' with {length(dil_apts)} analytes") + } + + # Calculate reference values for this dilution + if (!is.null(ref_data) && dil_name %in% names(ref_data) && !isTRUE(attr(ref_data, "group_specific"))) { + # Use external reference + ref_values <- ref_data[[dil_name]] + + if (is.numeric(ref_values) && length(ref_values) == 1) { + # Single reference value for the whole dilution group + grp_ref_values <- rep(ref_values, length(dil_apts)) + names(grp_ref_values) <- dil_apts + } else if (is.numeric(ref_values) && length(ref_values) > 1) { + # Aptamer-specific reference values + grp_ref_values <- ref_values[dil_apts] + has_ref <- !is.na(grp_ref_values) + + if (any(has_ref)) { + dil_apts <- dil_apts[has_ref] + grp_ref_values <- grp_ref_values[has_ref] + } else { + next # No references available, skip this dilution group + } + } else { + # Fallback to group-specific calculation + grp_ref_values <- apply(grp_adat[, dil_apts, drop = FALSE], 2, median, na.rm = TRUE) + } + } else { + # Internal reference: Use Sample types from this group only + if (!"SampleType" %in% names(grp_adat)) { + stop("Reference field 'SampleType' not found", call. = FALSE) + } + + ref_samples_mask <- grp_adat[["SampleType"]] %in% "Sample" + if (sum(ref_samples_mask) == 0) { + stop( + "No reference samples found with field 'SampleType' and value: Sample", + call. = FALSE + ) + } + + ref_sample_data <- grp_adat[ref_samples_mask, dil_apts, drop = FALSE] + grp_ref_values <- apply(ref_sample_data, 2, median, na.rm = TRUE) + } + + # Calculate scale factors for each sample in this group + for (i in seq_len(nrow(grp_adat))) { + sample_values <- unlist(grp_adat[i, dil_apts, drop = FALSE], use.names = FALSE) + ratios <- grp_ref_values / sample_values + med_scale_factor <- median(ratios[is.finite(ratios)], na.rm = TRUE) + + if (!is.finite(med_scale_factor)) { + med_scale_factor <- 1.0 + } + + grp_adat[[sf_col]][i] <- med_scale_factor + } + + # Apply scale factors to analytes + for (apt in dil_apts) { + grp_adat[[apt]] <- grp_adat[[apt]] * grp_adat[[sf_col]] + } + } + + result_list[[as.character(grp)]] <- grp_adat + } + + # Combine results and restore original order + adat <- do.call(rbind, result_list) + adat <- adat[order(adat$.rowid), ] + + # Restore original rownames + rownames(adat) <- original_rownames + + # Remove temporary grouping column + adat$.group <- NULL + + # Reattach original soma_adat attributes lost during rbind + if (!is.null(original_attrs)) { + # Preserve current column names and row.names; restore all other attributes + attrs_to_restore <- setdiff(names(original_attrs), c("names", "row.names")) + for (nm in attrs_to_restore) { + attr(adat, nm) <- original_attrs[[nm]] + } + } + + # Round to 1 decimal place (standard for SomaScan data) + apts <- getAnalytes(adat) + for (apt in apts) { + adat[[apt]] <- round(adat[[apt]], 1) + } + + adat +} + +#' Update Header Metadata +#' @noRd +.updateHeaderMetadata <- function(adat, reference) { + header_meta <- attr(adat, "Header.Meta") + + if (!is.null(header_meta) && !is.null(header_meta$HEADER)) { + # Add median normalization to process steps + if ("ProcessSteps" %in% names(header_meta$HEADER)) { + # Check for MedNormSMP that is not preceded by "rev-" + if (!grepl("(? 0) { + ref_values <- ref_data[[dil_name]] + + # Handle both single reference and aptamer-specific references + if (is.numeric(ref_values) && length(ref_values) == 1) { + # Single reference value for the whole dilution group (round to 2 decimal places) + rounded_value <- round(ref_values, 2) + apt_data$medNormSMP_ReferenceRFU[apt_data$AptName %in% dil_apts] <- rounded_value + } else if (is.numeric(ref_values) && length(ref_values) > 1) { + # Aptamer-specific reference values (round to 2 decimal places) + for (apt in dil_apts) { + if (apt %in% names(ref_values)) { + rounded_value <- round(ref_values[apt], 2) + apt_data$medNormSMP_ReferenceRFU[apt_data$AptName == apt] <- rounded_value + } + } + } + } + } + } + + # Update the analyte metadata + attr(adat, "Col.Meta") <- apt_data + } + + invisible(adat) +} + + + + diff --git a/R/reverseMedianNormalize.R b/R/reverseMedianNormalize.R new file mode 100644 index 0000000..649145a --- /dev/null +++ b/R/reverseMedianNormalize.R @@ -0,0 +1,332 @@ +#' Reverse Median Normalization from Study Samples +#' +#' @description Reverses median normalization (including ANML) that was +#' previously applied to study samples (`SampleType == "Sample"`). This function +#' is designed to work with standard SomaScan deliverable ADAT files where +#' study samples have undergone median normalization as the final processing step. +#' +#' This function validates that: +#' \enumerate{ +#' \item Study samples have a median normalization step applied +#' \item The normalization was the last transformation applied to study samples +#' \item The correct reversal method is applied based on the normalization type +#' } +#' +#' @section Use Cases: +#' \itemize{ +#' \item Converting from normalized ADAT to unnormalized ADAT for custom normalization +#' \item Preparing normalized delivery data for use with `medianNormalize()` function +#' \item Backing out normalization to apply different normalization strategies +#' } +#' +#' @section Data Requirements: +#' \itemize{ +#' \item ADAT file with study samples (`SampleType == "Sample"`) that have been +#' median normalized (either standard median normalization or ANML) +#' \item Intact header metadata with `ProcessSteps` field indicating the +#' normalization history +#' \item Median normalization must be the last processing step applied to study samples +#' } +#' +#' @param adat A `soma_adat` object with study samples that have been median normalized +#' @param verbose Logical. Should progress messages be printed? Default is `TRUE`. +#' @return A `soma_adat` object with median normalization reversed for study samples. +#' QC, Calibrator, and Buffer samples retain their original normalization. +#' The `ProcessSteps` header is updated to reflect the reversal operation, +#' and median normalization-specific metadata fields are cleared. +#' @examples +#' \dontrun{ +#' # Reverse normalization from a delivered ADAT file +#' normalized_adat <- read_adat("normalized_study_data.adat") +#' unnormalized_adat <- reverseMedianNormalize(normalized_adat) +#' } +#' @export +reverseMedianNormalize <- function(adat, verbose = TRUE) { + + # Input validation ---- + stopifnot("`adat` must be a class `soma_adat` object" = is.soma_adat(adat)) + + # Get header metadata + header <- attr(adat, "Header.Meta")$HEADER + + if (is.null(header)) { + stop("ADAT header metadata is missing", call. = FALSE) + } + + # Validate that study samples have been normalized ---- + process_steps <- header$ProcessSteps %||% "" + + if (process_steps == "") { + stop( + "ProcessSteps field is empty. Cannot determine normalization history.", + call. = FALSE + ) + } + + # Tokenize ProcessSteps so we can reason about ordering of steps ---- + step_tokens <- unlist(strsplit(process_steps, "\\s*[;,]\\s*")) + step_tokens <- step_tokens[nzchar(step_tokens)] + + # Check for evidence of study sample median normalization (MedNormSMP / anmlSMP) + has_mednorm_smp <- any(grepl("MedNormSMP", step_tokens, ignore.case = TRUE)) + has_anml_smp <- any(grepl("anmlSMP", step_tokens, ignore.case = TRUE)) + + if (!has_mednorm_smp && !has_anml_smp) { + stop( + "No evidence of median normalization applied to study samples. ", + "ProcessSteps: ", process_steps, ". ", + "This function is designed to reverse median normalization from study samples.", + call. = FALSE + ) + } + + # Check that normalization hasn't already been reversed + has_reversal <- any(grepl("rev-(?:MedNormSMP|medNormInt|anmlSMP|ANML)", step_tokens, + ignore.case = TRUE, perl = TRUE)) + + if (has_reversal) { + stop( + "Data appears to have already been denormalized. ", + "ProcessSteps: ", process_steps, + call. = FALSE + ) + } + + # Determine which normalization type was applied to study samples ---- + # Only one type should be applied, check which was last + # Identify the last normalization-related token and ensure it is the final step + norm_idx <- which( + grepl("MedNormSMP", step_tokens, ignore.case = TRUE) | + grepl("anmlSMP", step_tokens, ignore.case = TRUE) + ) + if (length(norm_idx) == 0L) { + stop( + "Could not determine normalization type from ProcessSteps: ", process_steps, + call. = FALSE + ) + } + last_norm_idx <- norm_idx[length(norm_idx)] + if (last_norm_idx != length(step_tokens)) { + stop( + "Median/ANML normalization of study samples is not the final processing step. ", + "ProcessSteps: ", process_steps, ". ", + "Reversal requires normalization to be the last transformation applied to study samples.", + call. = FALSE + ) + } + last_norm_token <- step_tokens[last_norm_idx] + if (grepl("anmlSMP", last_norm_token, ignore.case = TRUE)) { + normalization_type <- "anml" + } else if (grepl("MedNormSMP", last_norm_token, ignore.case = TRUE)) { + normalization_type <- "median" + } else { + stop( + "Could not determine normalization type from ProcessSteps: ", process_steps, + call. = FALSE + ) + } + + # Apply the appropriate denormalization ---- + if (normalization_type == "anml") { + adat <- .reverseANMLSMP(adat, verbose) + } else if (normalization_type == "median") { + adat <- .reverseMedNormSMP(adat, verbose) + } + + # Remove medNormSMP_ReferenceRFU field from analyte metadata after reversal ---- + col_meta <- attr(adat, "Col.Meta") + if (!is.null(col_meta) && "medNormSMP_ReferenceRFU" %in% names(col_meta)) { + col_meta$medNormSMP_ReferenceRFU <- NULL + attr(adat, "Col.Meta") <- col_meta + if (verbose) { + cat("Removed medNormSMP_ReferenceRFU field from analyte metadata.\n") + } + } + + adat +} + + +#' Reverse Existing Median Normalization for Study Samples +#' @noRd +.reverseMedNormSMP <- function(adat, verbose) { + + # Get existing scale factors + sf_cols <- grep("^NormScale_", names(adat), value = TRUE) + + if (length(sf_cols) == 0) { + if (verbose) { + cat("No existing median normalization scale factors found to reverse.\n") + } + return(adat) + } + + if (verbose) { + cat("Reversing existing median normalization for study samples...\n") + } + + # Get dilution groups + apt_data <- getAnalyteInfo(adat) + dil_groups <- split(apt_data$AptName, apt_data$Dilution) + names(dil_groups) <- gsub("\\.", "_", names(dil_groups)) + names(dil_groups) <- gsub("[.]0$|%|^[.]", "", names(dil_groups)) + + # Only reverse for study samples, leave QC/Calibrator/Buffer alone + if (is.null(adat$SampleType)) { + stop("`SampleType` column is missing; cannot identify study samples to reverse normalization.", call. = FALSE) + } + sample_mask <- !is.na(adat$SampleType) & (adat$SampleType %in% "Sample") + + if (!any(sample_mask)) { + if (verbose) { + cat("No study samples found to reverse normalization.\n") + } + return(adat) + } + + # For each dilution group, reverse normalization + for (dil_name in names(dil_groups)) { + sf_col <- paste0("NormScale_", dil_name) + + if (sf_col %in% names(adat)) { + dil_apts <- intersect(dil_groups[[dil_name]], getAnalytes(adat)) + + if (length(dil_apts) > 0) { + for (i in which(sample_mask)) { + scale_factor <- adat[[sf_col]][i] + if (!is.na(scale_factor) && scale_factor != 0) { + # Apply inverse of scale factor + adat[i, dil_apts] <- adat[i, dil_apts] / scale_factor + # Reset scale factor to 1.0 + adat[[sf_col]][i] <- 1.0 + } + } + } + } + } + + # Update ProcessSteps to indicate reversal + header_meta <- attr(adat, "Header.Meta") + if (!is.null(header_meta) && !is.null(header_meta$HEADER)) { + current_steps <- header_meta$HEADER$ProcessSteps %||% "" + + # Add reversal step + if (grepl("MedNormSMP", current_steps, ignore.case = TRUE)) { + header_meta$HEADER$ProcessSteps <- paste(current_steps, "rev-MedNormSMP", sep = ", ") + } + + # Clear median normalization specific header fields + header_meta$HEADER$NormalizationAlgorithm <- NULL + header_meta$HEADER$MedNormReference <- NULL + + attr(adat, "Header.Meta") <- header_meta + } + + if (verbose) { + cat("Median normalization reversed for", sum(sample_mask), "study samples.\n") + } + + adat +} + + +#' Reverse Existing ANML Normalization for Study Samples +#' @noRd +.reverseANMLSMP <- function(adat, verbose) { + + # Get existing scale factors + sf_cols <- grep("^NormScale_", names(adat), value = TRUE) + + if (length(sf_cols) == 0) { + if (verbose) { + cat("No existing ANML normalization scale factors found to reverse.\n") + } + return(adat) + } + + if (verbose) { + cat("Reversing existing ANML normalization for study samples...\n") + } + + # Get dilution groups + apt_data <- getAnalyteInfo(adat) + dil_groups <- split(apt_data$AptName, apt_data$Dilution) + names(dil_groups) <- gsub("\\.", "_", names(dil_groups)) + names(dil_groups) <- gsub("[.]0$|%|^[.]", "", names(dil_groups)) + + # Only reverse for study samples, leave QC/Calibrator/Buffer alone + if (is.null(adat$SampleType)) { + stop("`SampleType` column is missing; cannot identify study samples to reverse normalization.", call. = FALSE) + } + sample_mask <- !is.na(adat$SampleType) & (adat$SampleType %in% "Sample") + + if (!any(sample_mask)) { + if (verbose) { + cat("No study samples found to reverse ANML normalization.\n") + } + return(adat) + } + + # For each dilution group, reverse ANML normalization using log space + for (dil_name in names(dil_groups)) { + sf_col <- paste0("NormScale_", dil_name) + + if (sf_col %in% names(adat)) { + dil_apts <- intersect(dil_groups[[dil_name]], getAnalytes(adat)) + + if (length(dil_apts) > 0) { + for (i in which(sample_mask)) { + scale_factor <- adat[[sf_col]][i] + if (!is.na(scale_factor) && scale_factor != 0) { + # ANML uses log space scaling - reverse by subtracting log scale factor + log_sf <- log10(scale_factor) + log_data <- log10(as.numeric(unlist(adat[i, dil_apts], use.names = FALSE))) + reversed_log_data <- log_data - log_sf + adat[i, dil_apts] <- 10^reversed_log_data + # Reset scale factor to 1.0 + adat[[sf_col]][i] <- 1.0 + } + } + } + } + } + + # Clear ANMLFractionUsed columns for reversed study samples + anml_frac_cols <- grep("^ANMLFractionUsed_", names(adat), value = TRUE) + if (length(anml_frac_cols) > 0) { + for (col in anml_frac_cols) { + # Clear ANML-specific metadata for study samples that were reversed + adat[[col]][sample_mask] <- NA + } + } + + # Update ProcessSteps to indicate reversal + header_meta <- attr(adat, "Header.Meta") + if (!is.null(header_meta) && !is.null(header_meta$HEADER)) { + current_steps <- header_meta$HEADER$ProcessSteps %||% "" + + # Add reversal step - look for what type of ANML was reversed + if (grepl("anmlSMP", current_steps, ignore.case = TRUE)) { + header_meta$HEADER$ProcessSteps <- paste(current_steps, "rev-anmlSMP", sep = ", ") + } else if (grepl("ANML", current_steps, ignore.case = TRUE)) { + header_meta$HEADER$ProcessSteps <- paste(current_steps, "rev-ANML", sep = ", ") + } + + # Clear median normalization specific header fields + header_meta$HEADER$NormalizationAlgorithm <- NULL + header_meta$HEADER$MedNormReference <- NULL + + attr(adat, "Header.Meta") <- header_meta + } + + # Reset RowCheck for reversed samples + if ("RowCheck" %in% names(adat)) { + adat$RowCheck[sample_mask] <- "PASS" + } + + if (verbose) { + cat("ANML normalization reversed for", sum(sample_mask), "study samples.\n") + } + + adat +} diff --git a/inst/WORDLIST b/inst/WORDLIST index efd92ed..606b008 100644 --- a/inst/WORDLIST +++ b/inst/WORDLIST @@ -5,6 +5,7 @@ Agilent Analyte Analytes AptName +AptNames AssayNotes Barcode BioTools @@ -33,12 +34,14 @@ ExtIdentifier FC Hiser HybControlNormScale +HybNorm Kuei LF Lifecycle +MADs MERCHANTABILITY MacOS -MADs +MedNorm NHS NormScale ORCID @@ -55,7 +58,6 @@ RFU RFUs RUO ReferenceRFU -Reproducibility RowCheck SELEX SG @@ -87,8 +89,8 @@ TimePoint TubeUniqueID Un UniProt -YAML adat +adats aliquot analyte analytes @@ -112,7 +114,7 @@ magrittr md medNormRef nd -normals +pipetting pkgdown plex pre @@ -127,5 +129,4 @@ tada th tibble tidyr -untransformed -vectorized +variabilities diff --git a/man/medianNormalize.Rd b/man/medianNormalize.Rd new file mode 100644 index 0000000..1273f90 --- /dev/null +++ b/man/medianNormalize.Rd @@ -0,0 +1,137 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/medianNormalize.R +\name{medianNormalize} +\alias{medianNormalize} +\title{Perform Median Normalization} +\usage{ +medianNormalize(adat, reference = NULL, by = "SampleType", verbose = TRUE) +} +\arguments{ +\item{adat}{A \code{soma_adat} object created using \code{\link[=read_adat]{read_adat()}}, containing +RFU values that have been hybridization normalized and plate scaled.} + +\item{reference}{Optional. Reference for median normalization. Can be: +\itemize{ +\item \code{NULL} (default): Calculate an internal reference from study +samples by taking the median of each SeqId within the sample +grouping. +\item A \code{soma_adat} object: Extract reference from this ADAT +\item A data.frame: Use provided reference data directly +} +When providing an external reference data.frame it must contain: +\describe{ +\item{SeqId}{Character column containing the SeqId identifiers mapping +to those in the \code{soma_adat} object. Must be in "10000-28" format, not +"seq.10000.28" format.} +\item{Reference}{Numeric column containing the reference RFU values +for each SeqId.} +}} + +\item{by}{Character vector. Grouping variable(s) for grouped median +normalization. Must be column name(s) in the ADAT. Normalization will be +performed within each group separately. Default is \code{"SampleType"}. Note +that only study samples (SampleType == 'Sample') are normalized; QC, +Calibrator, and Buffer samples are automatically excluded.} + +\item{verbose}{Logical. Should progress messages be printed? Default is \code{TRUE}.} +} +\value{ +A \code{soma_adat} object with median normalization applied and RFU values +adjusted. The existing \verb{NormScale_*} columns are updated to include the +effects of both plate scale normalization and median normalization. +} +\description{ +Performs median normalization on a \code{soma_adat} object that has +already undergone standard data processing for array-based SomaScan studies. + +Median normalization is a common, scale-based normalization technique that +corrects for assay-derived technical variation by applying sample-specific +linear scaling to expression measurements. Typical sources of assay +variation include robotic and manual liquid handling, manufactured +consumables such as buffers and plastic goods, laboratory instrument +calibration, ambient environmental conditions, inter-operator differences, +and other sources of technical variation. Median normalization +can improve assay precision and reduce technical variation that can mask +true biological signal. + +The method scales each sample so that the center of the within-sample analyte +distribution aligns to a defined reference, thereby correcting +global intensity shifts without altering relative differences between +measurements within a sample. For assay formats with multiple dilution groups +(e.g., 1:5 or 20\%; 1:200 or 0.5\%; 1:20,000 or 0.005\%), separate scale +factors are calculated for each dilution because each dilution group is +processed separately during the assay. For each sample, the ratio of +reference RFU / observed RFU is calculated for every SeqId. The median ratio +within each dilution group is selected as the scale factor and applied to +all SeqIds for that sample within the associated dilution bin. +} +\section{Data Requirements}{ + +This function is designed for data in standard SomaLogic deliverable formats. +Specific ADAT file requirements: +\enumerate{ +\item \strong{Intact ADAT file}, with available data processing information +in the header section. Specifically, the \code{ProcessSteps} field must be +present and correctly represent the data processing steps present in +the data table. +\item \strong{Minimal standard processing}, the function assumes a standard +SomaScan data deliverable with minimally standard HybNorm and PlateScale +steps applied. +} + +\strong{Primary use cases:} +\itemize{ +\item Combining data sets from the same overarching experiment or sample +population and normalize to a common reference that were originally +processed separately and each normalized "within study". +\item Normalize fundamentally different types of samples separately (by +group). For instance, lysate samples from different cell lines that +will be analyzed separately should likely be median normalized within +each cell type. Lysis buffer background samples would also be expected +to be normalized separately. +} +} + +\section{Important Considerations}{ + +\itemize{ +\item A core assumption of median normalization is that the majority of +analytes are not differentially expressed; consequently, users should +validate this assumption by inspecting scale-factor distributions for +systematic bias between the biological groups intended for comparison. +\item Note this function does not perform the adaptive normalization by +maximum likelihood (ANML) method which leverages a population-based +reference that iteratively down-selects the set of analytes to include +for the normalization calculation. +\item This function requires unnormalized data as input. If study samples +have already undergone median normalization (ANML or standard), first use +\code{reverseMedianNormalize()} to remove existing normalization. +} +} + +\examples{ +\dontrun{ +# Starting with unnormalized ADAT +unnormalized_adat <- read_adat("unnormalized_study_data.adat") + +# Internal reference from study samples (default) +med_norm_adat <- medianNormalize(unnormalized_adat) + +# Reference from another ADAT +ref_adat <- read_adat("reference_file.adat") +med_norm_adat <- medianNormalize(unnormalized_adat, reference = ref_adat) + +# External reference as a data.frame - requires `SeqId` and `Reference` columns +ref_data <- read.csv("reference_file.csv") +med_norm_adat <- medianNormalize(unnormalized_adat, reference = ref_data) + +# Custom grouping by multiple variables +# Use when total protein load changes due to analysis conditions +med_norm_adat <- medianNormalize(unnormalized_adat, by = c("Sex", "SampleType")) + +# If you already have normalized data, first reverse the normalization +normalized_adat <- read_adat("normalized_study_data.adat") +unnormalized_adat <- reverseMedianNormalize(normalized_adat) +custom_norm_adat <- medianNormalize(unnormalized_adat, reference = new_reference) +} +} diff --git a/man/reverseMedianNormalize.Rd b/man/reverseMedianNormalize.Rd new file mode 100644 index 0000000..8d63423 --- /dev/null +++ b/man/reverseMedianNormalize.Rd @@ -0,0 +1,59 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/reverseMedianNormalize.R +\name{reverseMedianNormalize} +\alias{reverseMedianNormalize} +\title{Reverse Median Normalization from Study Samples} +\usage{ +reverseMedianNormalize(adat, verbose = TRUE) +} +\arguments{ +\item{adat}{A \code{soma_adat} object with study samples that have been median normalized} + +\item{verbose}{Logical. Should progress messages be printed? Default is \code{TRUE}.} +} +\value{ +A \code{soma_adat} object with median normalization reversed for study samples. +QC, Calibrator, and Buffer samples retain their original normalization. +The \code{ProcessSteps} header is updated to reflect the reversal operation, +and median normalization-specific metadata fields are cleared. +} +\description{ +Reverses median normalization (including ANML) that was +previously applied to study samples (\code{SampleType == "Sample"}). This function +is designed to work with standard SomaScan deliverable ADAT files where +study samples have undergone median normalization as the final processing step. + +This function validates that: +\enumerate{ +\item Study samples have a median normalization step applied +\item The normalization was the last transformation applied to study samples +\item The correct reversal method is applied based on the normalization type +} +} +\section{Use Cases}{ + +\itemize{ +\item Converting from normalized ADAT to unnormalized ADAT for custom normalization +\item Preparing normalized delivery data for use with \code{medianNormalize()} function +\item Backing out normalization to apply different normalization strategies +} +} + +\section{Data Requirements}{ + +\itemize{ +\item ADAT file with study samples (\code{SampleType == "Sample"}) that have been +median normalized (either standard median normalization or ANML) +\item Intact header metadata with \code{ProcessSteps} field indicating the +normalization history +\item Median normalization must be the last processing step applied to study samples +} +} + +\examples{ +\dontrun{ +# Reverse normalization from a delivered ADAT file +normalized_adat <- read_adat("normalized_study_data.adat") +unnormalized_adat <- reverseMedianNormalize(normalized_adat) +} +} diff --git a/tests/testthat/_snaps/medianNormalize.md b/tests/testthat/_snaps/medianNormalize.md new file mode 100644 index 0000000..fb68f03 --- /dev/null +++ b/tests/testthat/_snaps/medianNormalize.md @@ -0,0 +1,25 @@ +# `medianNormalize` produces expected verbose output + + Code + result <- medianNormalize(test_data, verbose = TRUE) + Output + Data validation passed for median normalization. + Standard deliverable checks: + - Hybridization normalization: PASS + - medNormInt (controls): PASS + - Plate scale normalization: PASS + - Study samples not already normalized: PASS + Three dilution setup detected (standard setup). + Message + > Normalization scale factors already exist: "NormScale_20, NormScale_0_005, NormScale_0_5" - they will be replaced with new scale factors + > Building internal reference from study samples (SampleType == 'Sample') + v Processing dilution '0_005' with 50 analytes + v Processing dilution '0_5' with 50 analytes + v Processing dilution '20' with 50 analytes + Output + Recalculating RowCheck values based on normalization acceptance criteria... + RowCheck values updated for 3 samples. + - PASS: 3 samples + - FLAG: 0 samples + - Acceptance criteria: scale factors within [ 0.4 , 2.5 ] + diff --git a/tests/testthat/test-medianNormalize.R b/tests/testthat/test-medianNormalize.R new file mode 100644 index 0000000..13d93ca --- /dev/null +++ b/tests/testthat/test-medianNormalize.R @@ -0,0 +1,338 @@ +# Setup ---- +# Create a minimal test dataset with options for small vs full size +create_test_data <- function(small = FALSE) { + data("example_data", package = "SomaDataIO") + + if (small) { + # Ultra-small dataset for most tests: 3 samples, subset of analytes + sample_indices <- which(example_data$SampleType == "Sample")[1:2] + qc_indices <- which(example_data$SampleType == "QC")[1:1] + subset_indices <- c(sample_indices, qc_indices) + + # Select subset of analytes from different dilution groups for speed + analytes <- getAnalytes(example_data) + analyte_info <- getAnalyteInfo(example_data) + + # Get first 50 analytes from each dilution group for faster processing + selected_analytes <- c() + for (dil in unique(analyte_info$Dilution)) { + dil_analytes <- analytes[analyte_info$Dilution == dil] + selected_analytes <- c(selected_analytes, head(dil_analytes, 50)) + } + + test_adat <- example_data[subset_indices, c(getMeta(example_data), selected_analytes)] + } else { + # Standard test dataset: 16 samples, all analytes (for comprehensive tests) + sample_indices <- which(example_data$SampleType == "Sample")[1:12] + qc_indices <- which(example_data$SampleType == "QC")[1:4] + subset_indices <- c(sample_indices, qc_indices) + test_adat <- example_data[subset_indices, ] + } + + # Modify header to simulate pre-ANML state (remove ANML steps) + header_meta <- attr(test_adat, "Header.Meta") + if (!is.null(header_meta) && !is.null(header_meta$HEADER)) { + # Remove ANML and median normalization steps to simulate pre-processed state + header_meta$HEADER$ProcessSteps <- "Raw RFU, Hyb Normalization, medNormInt, plateScale, Calibration" + attr(test_adat, "Header.Meta") <- header_meta + } + + # Return subset with representative samples + test_adat +} + +test_data <- create_test_data(small = TRUE) +test_data_full <- create_test_data(small = FALSE) + +# Testing ---- +test_that("`medianNormalize` Method 1: Internal reference (default)", { + + # Test default internal reference method + expect_no_error( + result <- medianNormalize(test_data, verbose = FALSE) + ) + + # Check result structure + expect_true(is.soma_adat(result)) + norm_cols <- grep("^NormScale_", names(result), value = TRUE) + expect_equal(length(norm_cols), 3) # Should have 3 dilution groups + expect_true(all(norm_cols %in% names(result))) + + # Test scale factor output + expect_equal(result$NormScale_20[1:3], c(0.971895, 1.029779, 1.199944), tolerance = 0.000001) + expect_equal(result$NormScale_0_005[1:3], c(1.002922, 0.997176, 1.058289), tolerance = 0.000001) + expect_equal(result$NormScale_0_5[1:3], c(0.973205, 1.028323, 1.104745), tolerance = 0.000001) + + # Test specific SeqId columns + expect_equal(result$seq.10000.28[1:3], c(463.1, 488.5, 501.5)) + expect_equal(result$seq.10001.7[1:3], c(301.4, 302.2, 207.4)) + + # Check header metadata + result_header <- attr(result, "Header.Meta")$HEADER + expect_true(grepl("MedNormSMP", result_header$ProcessSteps)) + expect_equal(result_header$NormalizationAlgorithm, "MedNorm") + expect_true(grepl("intraplate, crossplate", result_header$MedNormReference)) +}) + +test_that("`medianNormalize` Method 2: Reference from another ADAT", { + # Create a minimal reference ADAT from sample subset + ref_adat <- test_data[1:2, ] # Use first 2 samples as reference + + expect_no_error( + result <- medianNormalize(test_data, reference = ref_adat, verbose = FALSE) + ) + + # Check result structure + expect_true(is.soma_adat(result)) + norm_cols <- grep("^NormScale_", names(result), value = TRUE) + expect_equal(length(norm_cols), 3) # Should have 3 dilution groups + + # Test scale factor output + expect_equal(result$NormScale_20[1:3], c(0.9719, 1.0298, 1.1999), tolerance = 0.0001) + expect_equal(result$NormScale_0_005[1:3], c(1.0029, 0.9972, 1.0583), tolerance = 0.0001) + expect_equal(result$NormScale_0_5[1:3], c(0.9732, 1.0283, 1.1047), tolerance = 0.0001) + + # Test specific SeqId columns + expect_equal(result$seq.10000.28[1:3], c(463.1, 488.5, 501.5), tolerance = 0.1) + expect_equal(result$seq.10008.43[1:3], c(546, 558, 510.1), tolerance = 0.1) + + # Check header metadata + result_header <- attr(result, "Header.Meta")$HEADER + expect_true(grepl("external_adat", result_header$MedNormReference)) + expect_true(grepl("crossplate", result_header$MedNormReference)) +}) + +test_that("`medianNormalize` External reference data.frame", { + # Create reference data.frame with SeqId-Reference format + analyte_info <- getAnalyteInfo(test_data) + ref_data <- data.frame( + SeqId = analyte_info$SeqId[1:15], # Use first 15 SeqIds + Reference = runif(15, 2000, 4000), # Random reference values + stringsAsFactors = FALSE + ) + + expect_no_error( + result <- medianNormalize(test_data, reference = ref_data, verbose = FALSE) + ) + + # Check result structure + expect_true(is.soma_adat(result)) + norm_cols <- grep("^NormScale_", names(result), value = TRUE) + expect_equal(length(norm_cols), 3) # Should have 3 dilution groups + + # Check header metadata + result_header <- attr(result, "Header.Meta")$HEADER + expect_true(grepl("external_data", result_header$MedNormReference)) + expect_true(grepl("crossplate", result_header$MedNormReference)) + + # Check that medNormSMP_ReferenceRFU field was added with proper formatting + result_analyte_info <- getAnalyteInfo(result) + expect_true("medNormSMP_ReferenceRFU" %in% names(result_analyte_info)) + + # Check reference values are rounded to 2 decimal places + ref_values <- result_analyte_info$medNormSMP_ReferenceRFU + rounded_values <- round(ref_values, 2) + expect_equal(ref_values[!is.na(ref_values)], rounded_values[!is.na(rounded_values)]) +}) + +test_that("`medianNormalize` validates input requirements", { + + # Test with non-soma_adat object + expect_error( + medianNormalize(data.frame(a = 1:3, b = 4:6)), + "`adat` must be a class `soma_adat` object" + ) + + # Test with missing hybridization normalization + test_data_no_hyb <- test_data + header_meta <- list(HEADER = list(ProcessSteps = "SomeOtherStep")) + attr(test_data_no_hyb, "Header.Meta") <- header_meta + + expect_error( + medianNormalize(test_data_no_hyb), + "Hybrid normalization step not detected" + ) + + # Test with missing normalization scale factors + test_data_no_norm <- test_data |> + dplyr::select(-c(NormScale_0_005, NormScale_0_5, NormScale_20)) + header_meta <- list(HEADER = list(ProcessSteps = "Hyb Normalization")) + attr(test_data_no_norm, "Header.Meta") <- header_meta + + expect_error( + medianNormalize(test_data_no_norm), + "No normalization scale factor columns found" + ) + + # Test with invalid adat object (not soma_adat) + expect_error( + medianNormalize(data.frame(wrong = 1:3), verbose = FALSE), + "`adat` must be a class `soma_adat` object" + ) + + # Test with invalid reference data.frame (missing required columns) + invalid_ref <- data.frame(Wrong = c("10000-28", "10001-7"), Column = c(1000, 2000)) + expect_error( + medianNormalize(test_data, reference = invalid_ref, verbose = FALSE), + "Reference data must contain 'SeqId' and 'Reference' columns" + ) + + # Test with invalid reference type (non-data.frame) + expect_error( + medianNormalize(test_data, reference = "invalid_string", verbose = FALSE), + "Invalid reference type" + ) + + # Test with numeric input + expect_error( + medianNormalize(test_data, reference = 123, verbose = FALSE), + "Invalid reference type" + ) +}) + +test_that("`medianNormalize` external reference data.frame validation", { + # Test with invalid SeqId format (should error when no matches found) + invalid_seqid_ref <- data.frame( + SeqId = c("invalid_format_1", "invalid_format_2"), # Invalid SeqId format + Reference = c(2500, 1800), + stringsAsFactors = FALSE + ) + + expect_error( + medianNormalize(test_data, reference = invalid_seqid_ref, verbose = FALSE), + "No matching SeqIds or dilution groups found in reference data" + ) + + # Test with empty reference data + empty_ref <- data.frame( + SeqId = character(0), + Reference = numeric(0), + stringsAsFactors = FALSE + ) + + expect_error( + medianNormalize(test_data, reference = empty_ref, verbose = FALSE), + "No matching SeqIds or dilution groups found in reference data" + ) +}) + +test_that("`medianNormalize` handles grouping correctly", { + # Test grouping by Sex + expect_no_error( + result1 <- medianNormalize(test_data, by = "Sex", verbose = FALSE) + ) + + # Check result structure + expect_true(is.soma_adat(result1)) + norm_cols <- grep("^NormScale_", names(result1), value = TRUE) + expect_equal(length(norm_cols), 3) # Should have 3 dilution groups + + # Test scale factor output - will vary based on grouping by sex + expect_equal(result1$NormScale_20[1:3], c(0.971895, 1.029779, 1.199944), tolerance = 0.000001) + expect_equal(result1$NormScale_0_005[1:3], c(1.002922, 0.997176, 1.058289), tolerance = 0.000001) + expect_equal(result1$NormScale_0_5[1:3], c(0.973205, 1.028323, 1.104745), tolerance = 0.000001) + + # Test specific SeqId columns + expect_equal(result1$seq.10000.28[1:3], c(463.1, 488.5, 501.5)) + expect_equal(result1$seq.10008.43[1:3], c(546.0, 558.0, 510.1)) + + # Test grouping by multiple variables - use full example data + if ("PlateId" %in% names(test_data_full)) { + expect_no_error( + result2 <- medianNormalize(test_data_full, by = c("PlateId", "SampleType"), verbose = FALSE) + ) + + # Check result structure + expect_true(is.soma_adat(result2)) + norm_cols <- grep("^NormScale_", names(result2), value = TRUE) + expect_equal(length(norm_cols), 3) # Should have 3 dilution groups + + # Test scale factor output + expect_equal(result2$NormScale_20[1:3], c(1.0678, 1.0571, 1.0917), tolerance = 0.001) + expect_equal(result2$NormScale_0_005[1:3], c(1.0225, 1.0012, 1.0122), tolerance = 0.001) + expect_equal(result2$NormScale_0_5[1:3], c(1.1031, 1.0544, 1.1087), tolerance = 0.001) + + # Test specific SeqId columns + expect_equal(result2$seq.10000.28[1:3], c(508.8, 501.5, 453.7)) + expect_equal(result2$seq.10008.43[1:3], c(599.9, 572.8, 462.8)) + } + + # Test error with non-existent grouping column + expect_error( + medianNormalize(test_data, by = "NonExistentColumn", verbose = FALSE), + "Grouping column\\(s\\) not found" + ) +}) + +test_that("`medianNormalize` produces expected verbose output", { + # specific verbose output messages + expect_snapshot( + result <- medianNormalize(test_data, verbose = TRUE) + ) + + # no output + expect_message( + result <- medianNormalize(test_data, verbose = FALSE), + NA # NA indicates no output expected + ) +}) + +test_that("`medianNormalize` errors on already normalized data", { + # Create ANML-like test data by modifying test_data + anml_test_data <- test_data + + # Modify header to simulate ANML-processed data + header_meta <- attr(anml_test_data, "Header.Meta") + header_meta$HEADER$ProcessSteps <- "Raw RFU, Hyb Normalization, medNormInt (SampleId), plateScale, Calibration, anmlQC, qcCheck, anmlSMP" + header_meta$HEADER$NormalizationAlgorithm <- "ANML" + attr(anml_test_data, "Header.Meta") <- header_meta + + # Add ANML-like normalization scale factors (non-1.0 values to simulate ANML) + anml_test_data$NormScale_20 <- c(1.15, 0.95, 1.08) + anml_test_data$NormScale_0_5 <- c(1.22, 0.88, 1.12) + anml_test_data$NormScale_0_005 <- c(0.93, 1.07, 0.98) + + # Add ANMLFractionUsed columns to mimic ANML data + anml_test_data$ANMLFractionUsed_20 <- c(0.85, 0.92, 0.78) + anml_test_data$ANMLFractionUsed_0_5 <- c(0.88, 0.91, 0.83) + anml_test_data$ANMLFractionUsed_0_005 <- c(0.90, 0.87, 0.85) + + # Test that medianNormalize errors on already normalized data + expect_error( + medianNormalize(anml_test_data, verbose = FALSE), + "Study samples appear to already be normalized" + ) + + # Test that the error message suggests reverseMedianNormalize + expect_error( + medianNormalize(anml_test_data, verbose = FALSE), + "reverseMedianNormalize" + ) +}) + +test_that("`medianNormalize` handles SeqId reference format properly", { + # Test with SeqId-Reference format + analytes <- getAnalytes(test_data) + seqids <- getAnalyteInfo(test_data)$SeqId[1:10] # First 10 SeqIds + + ref_data <- data.frame( + SeqId = seqids, + Reference = runif(10, 1000, 5000) # Random reference values + ) + + expect_no_error( + result <- medianNormalize(test_data, reference = ref_data, verbose = FALSE) + ) + + # Check that result is valid + expect_true(is.soma_adat(result)) + + # Check that medNormSMP_ReferenceRFU field was added with 2 decimal places + analyte_info <- getAnalyteInfo(result) + expect_true("medNormSMP_ReferenceRFU" %in% names(analyte_info)) + + # Check reference values are rounded to 2 decimal places + ref_values <- analyte_info$medNormSMP_ReferenceRFU + rounded_values <- round(ref_values, 2) + expect_equal(ref_values[!is.na(ref_values)], rounded_values[!is.na(rounded_values)]) +}) diff --git a/tests/testthat/test-reverseMedianNormalize.R b/tests/testthat/test-reverseMedianNormalize.R new file mode 100644 index 0000000..9041c6f --- /dev/null +++ b/tests/testthat/test-reverseMedianNormalize.R @@ -0,0 +1,121 @@ +# Setup ---- +data("example_data", package = "SomaDataIO") + +# Create small representative subset for faster tests +test_data_full <- example_data[1:3, ] + +# Testing ---- +test_that("`reverseMedianNormalize` reverses ANML normalization", { + # Use example_data which already has anmlSMP ProcessStep + anml_data <- test_data_full + + # Verify it has anmlSMP in ProcessSteps (should already be there) + header_meta <- attr(anml_data, "Header.Meta") + expect_true(grepl("anmlSMP", header_meta$HEADER$ProcessSteps)) + + # Add ANML-like normalization scale factors + anml_data$NormScale_20 <- c(1.15, 0.95, 1.08) + anml_data$NormScale_0_5 <- c(1.22, 0.88, 1.12) + anml_data$NormScale_0_005 <- c(0.93, 1.07, 0.98) + + # Reverse ANML normalization + expect_no_error( + result <- reverseMedianNormalize(anml_data, verbose = FALSE) + ) + + # Check that result is valid + expect_true(is.soma_adat(result)) + + # Check that ProcessSteps includes reversal + result_header <- attr(result, "Header.Meta")$HEADER + expect_true(grepl("rev-anmlSMP", result_header$ProcessSteps)) + + # Check that scale factors are reset to 1.0 for study samples + sample_mask <- result$SampleType == "Sample" + expect_true(all(result$NormScale_20[sample_mask] == 1.0)) + expect_true(all(result$NormScale_0_5[sample_mask] == 1.0)) + expect_true(all(result$NormScale_0_005[sample_mask] == 1.0)) +}) + +test_that("`reverseMedianNormalize` error conditions", { + # Create unnormalized data by modifying ProcessSteps + unnormalized_data <- test_data_full + header_meta <- attr(unnormalized_data, "Header.Meta") + header_meta$HEADER$ProcessSteps <- "Raw RFU, Hyb Normalization, medNormInt, plateScale, Calibration" + attr(unnormalized_data, "Header.Meta") <- header_meta + + # Test with unnormalized data (should error) + expect_error( + reverseMedianNormalize(unnormalized_data, verbose = FALSE), + "No evidence of median normalization applied to study samples" + ) + + # Test with non-soma_adat object + expect_error( + reverseMedianNormalize(data.frame(a = 1:3, b = 4:6)), + "`adat` must be a class `soma_adat` object" + ) +}) + +test_that("`reverseMedianNormalize` + `medianNormalize` workflow", { + # Create normalized data with MedNormSMP + normalized_data <- test_data_full + header_meta <- attr(normalized_data, "Header.Meta") + header_meta$HEADER$ProcessSteps <- "Raw RFU, Hyb Normalization, medNormInt (SampleId), plateScale, Calibration, MedNormSMP" + attr(normalized_data, "Header.Meta") <- header_meta + normalized_data$NormScale_20 <- c(1.05, 0.98, 1.12) + normalized_data$NormScale_0_5 <- c(1.08, 0.92, 1.15) + normalized_data$NormScale_0_005 <- c(0.97, 1.03, 0.95) + + # Test complete workflow: reverse then re-normalize + expect_no_error( + unnormalized <- reverseMedianNormalize(normalized_data, verbose = FALSE) + ) + + expect_no_error( + renormalized <- medianNormalize(unnormalized, verbose = FALSE) + ) + + # Check final result structure + expect_true(is.soma_adat(renormalized)) + final_header <- attr(renormalized, "Header.Meta")$HEADER + expect_true(grepl("rev-MedNormSMP", final_header$ProcessSteps)) + expect_true(grepl("MedNormSMP", final_header$ProcessSteps)) +}) + +test_that("`reverseMedianNormalize` validates normalization is final step", { + # Test error when normalization is not the final step + test_data_invalid <- test_data_full + header_meta <- attr(test_data_invalid, "Header.Meta") + header_meta$HEADER$ProcessSteps <- "Raw RFU, Hyb Normalization, medNormInt, anmlSMP, plateScale" # anmlSMP not final + attr(test_data_invalid, "Header.Meta") <- header_meta + + expect_error( + reverseMedianNormalize(test_data_invalid, verbose = FALSE), + "Median/ANML normalization of study samples is not the final processing step" + ) +}) + +test_that("`reverseMedianNormalize` handles ANMLFractionUsed columns properly", { + # Test clearing ANML-specific metadata columns + test_data_anml <- test_data_full + header_meta <- attr(test_data_anml, "Header.Meta") + header_meta$HEADER$ProcessSteps <- "Raw RFU, Hyb Normalization, medNormInt, plateScale, anmlSMP" + attr(test_data_anml, "Header.Meta") <- header_meta + + # Add ANML metadata columns + test_data_anml$ANMLFractionUsed_20 <- c(0.85, 0.92, 0.78) + test_data_anml$ANMLFractionUsed_0_5 <- c(0.91, 0.87, 0.95) + test_data_anml$NormScale_20 <- c(1.15, 0.95, 1.08) + test_data_anml$NormScale_0_5 <- c(1.22, 0.88, 1.12) + + result <- reverseMedianNormalize(test_data_anml, verbose = FALSE) + + # ANMLFractionUsed columns should be cleared for study samples + expect_true(is.na(result$ANMLFractionUsed_20[1])) # Sample should be NA + expect_true(is.na(result$ANMLFractionUsed_0_5[1])) # Sample should be NA + + # Scale factors should be reset to 1.0 for study samples + expect_equal(result$NormScale_20[1], 1.0) + expect_equal(result$NormScale_0_5[1], 1.0) +})