diff --git a/.gitignore b/.gitignore index bd1be61b0..8db067bcb 100644 --- a/.gitignore +++ b/.gitignore @@ -10,3 +10,8 @@ docs/ .DS_Store mia.BiocCheck/ +src/*.dll +inst/extdata/taxonomic_profiles_mgx.tsv +inst/extdata/ecs_relab.tsv +inst/extdata/hmp2_metadata_2018-08-20.csv +tools/cache/ \ No newline at end of file diff --git a/DESCRIPTION b/DESCRIPTION index 36df187b1..f806d90e4 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -40,7 +40,8 @@ Authors@R: person(given = "Danielle", family = "Callan", role=c("ctb")), person(given = "Sam", family = "Hillman", role=c("ctb")), person(given = "Jesse", family = "Pasanen", role=c("ctb")), - person(given = "Eetu", family = "Tammi", role=c("ctb"))) + person(given = "Eetu", family = "Tammi", role=c("ctb")), + person(given = "Aituar", family = "Bektanov", role=c("ctb"))) Title: Microbiome analysis Description: mia implements tools for microbiome analysis based on the diff --git a/NAMESPACE b/NAMESPACE index 851cc6637..318c0d894 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -71,6 +71,7 @@ export(getDominant) export(getExperimentCrossAssociation) export(getExperimentCrossCorrelation) export(getHierarchyTree) +export(getJointRPCA) export(getLDA) export(getLowAbundant) export(getMDS) @@ -474,6 +475,7 @@ importFrom(vegan,dbrda) importFrom(vegan,decostand) importFrom(vegan,eigenvals) importFrom(vegan,monoMDS) +importFrom(vegan,optspace) importFrom(vegan,permutest) importFrom(vegan,rrarefy) importFrom(vegan,scores) diff --git a/R/getJointRPCA.R b/R/getJointRPCA.R new file mode 100644 index 000000000..f8d8b6f55 --- /dev/null +++ b/R/getJointRPCA.R @@ -0,0 +1,1191 @@ +## Joint RPCA front-end helpers +## +## Exported: +## - jointRPCAuniversal() +## - getJointRPCA() +## + +#' Run Joint-RPCA and store embedding in reducedDim +#' @name getJointRPCA +#' +#' @details +#' Convenience wrapper that runs Joint Robust PCA on one or more compositional +#' tables and stores the resulting sample embedding in \code{reducedDim(x, name)}, +#' similar to \code{runMDS()} and \code{runPCA()}. +#' +#' @param x A \code{SummarizedExperiment}, \code{TreeSummarizedExperiment}, +#' \code{MultiAssayExperiment}, or a related object supported by +#' \code{jointRPCAuniversal()}. +#' @param experiments Optional character vector of experiment names to use when +#' \code{x} is a \code{MultiAssayExperiment} (i.e. \code{names(experiments(x))}). +#' Ignored for \code{SummarizedExperiment} inputs. +#' @param altexp Optional name of an alternative experiment. If supplied, +#' Joint-RPCA is run on \code{altExp(x, altexp)} instead of \code{x}. +#' @param name Character scalar giving the name of the \code{reducedDim} slot +#' in which to store the joint sample embedding. Defaults to \code{"JointRPCA"}. +#' @param transform Character string specifying preprocessing applied to each +#' input table before ordination. Use \code{"rclr"} to apply the robust CLR +#' transform (via \code{decostand(method = "rclr")}) or \code{"none"} to +#' disable transformation (data are used as-is after masking non-finite values). +#' @param optspace.tol Numeric tolerance passed to \code{optspace()}. +#' @param center Logical; whether to center the reconstructed low-rank matrix +#' (double-centering) prior to SVD/PCA steps. +#' @param scale Logical; whether to scale the reconstructed matrix prior to +#' SVD/PCA steps. Defaults to \code{FALSE}. +#' @param ... Additional arguments passed to \code{jointRPCAuniversal()} and then +#' to the internal \code{.joint_rpca()} engine (e.g. 
\code{n.components}, +#' \code{min.sample.count}, \code{min.feature.count}, \code{min.feature.frequency}, +#' \code{max.iterations}, \code{sample.metadata}). +#' +#' @return The input object \code{x} with a new entry in +#' \code{reducedDim(x, name)} containing the Joint-RPCA sample embedding. +#' The full Joint-RPCA result (including distances, cross-validation +#' statistics and transformed tables) is stored in +#' \code{metadata(x)$JointRPCA[[name]]}. +#' +#' @export +NULL + +getJointRPCA <- function(x, + experiments = NULL, + altexp = NULL, + name = "JointRPCA", + transform = c("rclr", "none"), + optspace.tol = 1e-5, + center = TRUE, + scale = FALSE, + ...) { + transform <- match.arg(transform) + + # Select the object to operate on + y <- x + if (!is.null(altexp)) { + y <- altExp(x, altexp) + } + + # Use universal front-end to build tables + run .joint_rpca() + res <- jointRPCAuniversal( + y, + experiments = experiments, + transform = transform, + optspace.tol = optspace.tol, + center = center, + scale = scale, + ... + ) + + # Extract sample embedding + emb <- res[["ord_res"]][["samples"]] + + if (is.null(emb)) { + stop( + "Internal error: JointRPCA did not return a sample embedding. ", + "Please report this and include sessionInfo().", + call. = FALSE + ) + } + + emb <- as.matrix(emb) + + # Ensure embedding rownames match colnames of the target object (Bioconductor requirement) + target_cols <- colnames(x) + if (!is.null(altexp)) { + target_cols <- colnames(x) + } + + if (is.null(target_cols)) { + stop("Cannot store reducedDim: 'x' has no colnames().", call. = FALSE) + } + + if (is.null(rownames(emb))) { + stop("Cannot store reducedDim: embedding has no rownames().", call. = FALSE) + } + + # Require the same set of samples/cells + if (!setequal(rownames(emb), target_cols)) { + missing_in_emb <- setdiff(target_cols, rownames(emb)) + extra_in_emb <- setdiff(rownames(emb), target_cols) + stop( + "Cannot store reducedDim: embedding rownames do not match colnames(x).\n", + "Missing in embedding: ", paste(missing_in_emb, collapse = ", "), "\n", + "Extra in embedding: ", paste(extra_in_emb, collapse = ", "), + call. = FALSE + ) + } + + # Reorder embedding to exactly match colnames(x) + emb <- emb[target_cols, , drop = FALSE] + rownames(emb) <- target_cols + + if (nrow(emb) == 0L || ncol(emb) == 0L || is.null(rownames(emb))) { + stop( + "Internal error: JointRPCA returned an invalid sample embedding. ", + "Please report this and include sessionInfo().", + call. = FALSE + ) + } + + # Store embedding in reducedDim only if supported (SCE / TreeSE / mia-specific) + cls <- class(x) + if (any(cls %in% c("SingleCellExperiment", "TreeSummarizedExperiment"))) { + reducedDim(x, name) <- emb + } + + # Store full result in metadata + if (is.null(metadata(x)$JointRPCA)) { + metadata(x)$JointRPCA <- list() + } + metadata(x)$JointRPCA[[name]] <- res + + return(x) +} + +#' Universal Joint RPCA Wrapper +#' @name getJointRPCA +#' @param x Input object: \code{MultiAssayExperiment}, \code{SummarizedExperiment} +#' (including \code{TreeSummarizedExperiment}), list of matrices, or single matrix. +#' @param experiments Character vector of experiment names to extract when \code{x} +#' is a \code{MultiAssayExperiment} (i.e. \code{names(experiments(x))}). +#' If \code{NULL}, all experiments are used. +#' +#' For \code{MultiAssayExperiment} inputs, \strong{one assay per experiment} is +#' used: by default the first assay returned by +#' \code{assayNames()} (or index \code{1L} if unnamed). 
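+#'
+#' For example (a sketch only; \code{mae} stands for a hypothetical
+#' \code{MultiAssayExperiment} whose experiments are named
+#' \code{"microbiota"} and \code{"metabolites"}):
+#' \preformatted{
+#' res <- jointRPCAuniversal(mae, experiments = c("microbiota", "metabolites"))
+#' res$assay_names_used
+#' }
+#'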
+#' The actually used assay names are recorded in \code{$assay_names_used} in +#' the result. If you need a different assay (e.g. \code{"relab"} instead of +#' \code{"counts"}), subset or reorder assays in \code{x} before calling +#' \code{jointRPCAuniversal()}. +#' @param transform Character string specifying preprocessing applied to each +#' input table before ordination. Use \code{"rclr"} to apply the robust CLR +#' transform (via \code{decostand(method = "rclr")}) or \code{"none"} to +#' disable transformation (data are used as-is after masking non-finite values). +#' @param optspace.tol Numeric tolerance passed to \code{optspace()}. +#' @param center Logical; whether to center the reconstructed low-rank matrix +#' (double-centering) prior to SVD/PCA steps. +#' @param scale Logical; whether to scale the reconstructed matrix prior to +#' SVD/PCA steps. Defaults to \code{FALSE}. +#' @param ... Additional arguments passed to \code{.joint_rpca()}. +#' +#' @return Output from \code{.joint_rpca()} with extra fields when \code{x} is a +#' \code{MultiAssayExperiment}: +#' \itemize{ +#' \item \code{$experiment_names}: character vector of experiments used. +#' \item \code{$assay_names_used}: named character vector giving, for each +#' experiment, the assay name that was used (typically the first in +#' \code{assayNames()}). +#' } +#' @importFrom SummarizedExperiment assayNames +#' @importFrom SummarizedExperiment assay +#' @importFrom MultiAssayExperiment experiments +#' @importFrom vegan decostand +#' @importFrom vegan optspace +#' @export +NULL + +jointRPCAuniversal <- function(x, experiments = NULL, + transform = c("rclr", "none"), + optspace.tol = 1e-5, + center = TRUE, + scale = FALSE, + ...) { + transform <- match.arg(transform) + + assay_names_used <- NULL + + if (inherits(x, "MultiAssayExperiment")) { + mae <- .extract_mae_tables(x, experiments) + tables <- mae$tables + experiments <- mae$experiments + assay_names_used <- mae$assay_names_used + } else if (inherits(x, "SummarizedExperiment")) { + tables <- list(assay(x)) + anm <- assayNames(x) + nm <- if (length(anm) && !is.na(anm[1])) anm[1] else "assay1" + names(tables) <- nm + } else if (is.list(x) && all(vapply(x, is.matrix, logical(1)))) { + tables <- x + if (is.null(names(tables))) { + names(tables) <- paste0("view", seq_along(tables)) + } + } else if (is.matrix(x)) { + tables <- list(x) + names(tables) <- "assay1" + } else { + stop( + "Unsupported input type for jointRPCAuniversal(): ", + paste(class(x), collapse = ", "), + call. = FALSE + ) + } + + res <- .joint_rpca( + tables = tables, + transform = transform, + optspace.tol = optspace.tol, + center = center, + scale = scale, + ... + ) + + if (inherits(x, "MultiAssayExperiment")) { + res$experiment_names <- experiments + res$assay_names_used <- assay_names_used + } + + return(res) +} + +#' Joint Robust PCA on Multiple Compositional Tables +#' +#' Internal engine for Joint Robust Principal Component Analysis (RPCA) using +#' OptSpace on multiple compositional tables. +#' +#' This function assumes a list of already extracted tables and is typically +#' called via \code{jointRPCAuniversal()} or \code{getJointRPCA()}. +#' +#' @param tables A list of compositional data tables (matrices or data frames). +#' @param n.test.samples Integer specifying the number of samples to hold out for testing +#' (only used if \code{sample.metadata} is \code{NULL}). Default is 10. +#' @param sample.metadata Optional data frame containing sample-level metadata. 
+#' @param train.test.column The name of the column in \code{sample.metadata} +#' that defines training vs test samples. +#' @param n.components Integer specifying the number of principal components to compute. +#' @param transform Character string specifying preprocessing applied to each +#' input table before ordination: \code{"rclr"} or \code{"none"}. +#' @param optspace.tol Numeric tolerance passed to \code{vegan::optspace()}. +#' @param center,scale Logical; whether to center/scale the reconstructed matrix +#' prior to SVD/PCA steps. +#' @param min.sample.count Minimum total count required for a sample to be retained. +#' @param min.feature.count Minimum total count required for a feature to be retained. +#' @param min.feature.frequency Minimum percentage (0–100) of samples in which a +#' feature must be non-zero to be retained. +#' @param max.iterations Maximum number of optimization iterations. +#' +#' @return A list with \code{ord_res}, \code{dist}, \code{cv_stats}, and +#' \code{rclr_tables}. +#' +#' @keywords internal +#' @noRd + +.joint_rpca <- function(tables, + n.test.samples = 10, + sample.metadata = NULL, + train.test.column = NULL, + n.components = 3, + transform = c("rclr", "none"), + min.sample.count = 0, + min.feature.count = 0, + min.feature.frequency = 0, + max.iterations = 5, + optspace.tol = 1e-5, + center = TRUE, + scale = FALSE) { + transform <- match.arg(transform) + + if (is.null(names(tables))) { + names(tables) <- paste0("view", seq_along(tables)) + } + + if (n.components < 2) { + stop("n.components must be at least 2.", call. = FALSE) + } + if (max.iterations < 1) { + stop("max.iterations must be at least 1.", call. = FALSE) + } + + # Filtering (always done, independent of rclr) + tables <- lapply(tables, function(tbl) { + out <- .rpca_table_processing( + tbl, + min.sample.count = min.sample.count, + min.feature.count = min.feature.count, + min.feature.frequency = min.feature.frequency + ) + if (nrow(out) == 0L || ncol(out) == 0L) { + stop( + "Filtering removed all data in at least one table (0 features or 0 samples). ", + "Try relaxing filtering thresholds (min.sample.count / min.feature.count / ", + "min.feature.frequency) or check your input.", + call. = FALSE + ) + } + out + }) + + # Find shared samples across views + sample.sets <- lapply(tables, colnames) + shared.all.samples <- Reduce(intersect, sample.sets) + if (length(shared.all.samples) == 0L) { + stop( + "No samples overlap between all tables. ", + "Check that colnames are consistent across views, or", + "if you are using pre-transformed tables set transform = 'none'.", + call. = FALSE + ) + } + if (length(shared.all.samples) < (n.components + 1L)) { + stop( + "Too few shared samples across all tables after filtering (", + length(shared.all.samples), "). Need at least n.components + 1 shared samples. ", + "Try lowering n.components or relaxing filtering thresholds.", + call. 
= FALSE + ) + } + unshared.samples <- setdiff(unique(unlist(sample.sets)), shared.all.samples) + if (length(unshared.samples) > 0) { + warning(sprintf("Removing %d sample(s) that do not overlap in tables.", length(unshared.samples))) + } + + # Restrict each table to the shared sample set + tables <- lapply(tables, function(tbl) { + tbl[, shared.all.samples, drop = FALSE] + }) + shared.all.samples <- Reduce(intersect, lapply(tables, colnames)) + + # Transform tables: rCLR or masking + rclr_tables <- lapply(tables, function(tbl) { + mat <- as.matrix(tbl) + rown <- rownames(mat) + coln <- colnames(mat) + + if (transform == "rclr") { + mat[!is.finite(mat)] <- 0 + mat[mat < 0] <- 0 + out <- vegan::decostand(mat, method = "rclr", MARGIN = 2) + dimnames(out) <- list(rown, coln) + out + } else { + out <- .mask_value_only(mat)$data + dimnames(out) <- list(rown, coln) + out + } + }) + names(rclr_tables) <- names(tables) + + # Determine train/test split + if (!is.null(sample.metadata) && !is.null(train.test.column)) { + md <- as.data.frame(sample.metadata) + md <- md[shared.all.samples, , drop = FALSE] + train.samples <- rownames(md)[md[[train.test.column]] == "train"] + test.samples <- rownames(md)[md[[train.test.column]] == "test"] + } else { + ord.tmp <- .optspace_helper( + rclr.table = t(rclr_tables[[1]]), + feature.ids = rownames(rclr_tables[[1]]), + sample.ids = colnames(rclr_tables[[1]]), + n.components = n.components, + max.iterations = max.iterations, + tol = optspace.tol, + center = center, + scale = scale + )$ord_res + sorted.ids <- rownames(ord.tmp$samples[order(ord.tmp$samples[, 1]), ]) + idx <- round(seq(1, length(sorted.ids), length.out = n.test.samples)) + test.samples <- sorted.ids[idx] + train.samples <- setdiff(shared.all.samples, test.samples) + } + + # Run joint OptSpace + result <- .joint_optspace_helper( + tables = rclr_tables, + n.components = n.components, + max.iterations = max.iterations, + test.samples = test.samples, + train.samples = train.samples, + sample.order = shared.all.samples, + tol = optspace.tol, + center = center, + scale = scale + ) + + return(list( + ord_res = result$ord_res, + dist = result$dist, + cv_stats = result$cv_stats, + rclr_tables = rclr_tables + )) +} + +#' Joint RPCA Ordination Across Multiple Compositional Tables +#' +#' Internal function that performs Robust PCA via joint OptSpace decomposition across multiple compositional tables. +#' It splits each table into train/test sets, applies joint factorization, reconstructs sample and feature embeddings, +#' optionally projects test samples, computes a sample distance matrix, and returns cross-validation error statistics. +#' +#' @param tables A list of compositional matrices or data frames with features as rows and samples as columns. +#' @param n.components Number of principal components to compute. +#' @param max.iterations Maximum number of optimization iterations for OptSpace. +#' @param test.samples Character vector of sample IDs to be projected into the ordination space. +#' @param train.samples Character vector of sample IDs used to fit the ordination. 
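+#' @param sample.order Optional character vector giving the order in which the
+#' shared samples should appear in the returned distance matrix.
+#' @param tol Numeric tolerance passed to \code{vegan::optspace()}.
+#' @param center,scale Logical; whether to center/scale the reconstructed
+#' low-rank matrix prior to the final SVD step.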
+#' +#' @return A list with: +#' \describe{ +#' \item{ord_res}{An \code{OrdinationResults} object containing embeddings, loadings, and variance explained.} +#' \item{dist}{A \code{DistanceMatrix} object for sample embeddings.} +#' \item{cv_stats}{A data frame summarizing reconstruction error across iterations and tables.} +#' } +#' +#' @keywords internal +#' @noRd + +.joint_optspace_helper <- function(tables, + n.components, + max.iterations, + test.samples, + train.samples, + sample.order = NULL, + tol = 1e-5, + center = TRUE, + scale = FALSE) { + # Coerce to matrices and enforce colnames presence + tables <- lapply(tables, function(tbl) { + mat <- as.matrix(tbl) + if (is.null(colnames(mat))) { + stop("[.joint_optspace_helper] Input table is missing column names (sample IDs).") + } + mat + }) + + # Global set of samples present in all views + all_samples <- Reduce(intersect, lapply(tables, colnames)) + + # Align train/test to actually available samples + test.samples <- intersect(test.samples, all_samples) + train.samples <- intersect(train.samples, all_samples) + + if (!length(test.samples) || !length(train.samples)) { + stop("[.joint_optspace_helper] Empty train/test split after aligning sample IDs.") + } + + # Split and transpose training/test data per table + tables.split <- lapply(tables, function(tbl) { + list( + t(tbl[, test.samples, drop = FALSE]), + t(tbl[, train.samples, drop = FALSE]) + ) + }) + + # Format input for solver + tables.for.solver <- lapply(tables.split, function(pair) { + lapply(pair, as.matrix) + }) + + # Run joint OptSpace solver + opt.result <- .joint_optspace_solve( + train.test.pairs = tables.for.solver, + n.components = n.components, + max.iter = max.iterations, + tol = tol + ) + + U <- opt.result$U + S <- opt.result$S + V_list <- opt.result$V_list + dists <- opt.result$dists + + # Assign row/column names to loadings + pc.names <- paste0("PC", seq_len(n.components)) + + # Combine feature loadings with table-derived row names + vjoint <- do.call(rbind, Map(function(tbl, V) { + rownames(V) <- rownames(tbl) + colnames(V) <- pc.names + V + }, tables, V_list)) + + U <- U[seq_along(train.samples), , drop = FALSE] + rownames(U) <- train.samples + colnames(U) <- pc.names + + # Recenter & re-factor via SVD + X <- U %*% S %*% t(vjoint) + + if (center) { + X <- sweep(X, 2, colMeans(X)) + X <- sweep(X, 1, rowMeans(X)) + } + if (scale) { + X <- scale(X, center = FALSE, scale = TRUE) + } + svd.res <- svd(X) + u <- svd.res$u[, seq_len(n.components), drop = FALSE] + v <- svd.res$v[, seq_len(n.components), drop = FALSE] + s.eig <- svd.res$d[seq_len(n.components)] + + rownames(u) <- train.samples + rownames(v) <- rownames(vjoint) + pc.names <- paste0("PC", seq_len(n.components)) + colnames(u) <- colnames(v) <- pc.names + + # Build a named per-view features list + features_list <- lapply(seq_along(tables), function(i) { + rid <- rownames(tables[[i]]) + v[rid, , drop = FALSE] + }) + names(features_list) <- names(tables) + + prop.exp <- s.eig^2 / sum(s.eig^2) + ord_res <- .ordination_results( + method = "rpca", + eigvals = setNames(s.eig, pc.names), + samples = u, + features = features_list, + proportion.explained = setNames(prop.exp, pc.names) + ) + + # Project test samples + if (length(test.samples) > 0) { + test.matrices <- lapply(tables, function(tbl) tbl[, test.samples, drop = FALSE]) + names(test.matrices) <- names(tables) + ord_res <- .transform(ord_res, test.matrices, apply.rclr = FALSE) + } + + # Compute distance matrix and CV error summary + dist.base <- 
as.matrix(dist(ord_res$samples)) + + if (!is.null(sample.order)) { + order_use <- intersect(sample.order, rownames(dist.base)) + dist.mat <- dist.base[order_use, order_use, drop = FALSE] + } else { + dist.mat <- dist.base + order_use <- rownames(dist.base) + } + + dist.res <- .distance_matrix(dist.mat, ids = order_use) + + cv.dist <- data.frame(t(dists)) + colnames(cv.dist) <- c("mean_CV", "std_CV") + cv.dist$run <- sprintf( + "tables_%d.n.components_%d.max.iterations_%d.n.test_%d", + length(tables), n.components, max.iterations, length(test.samples) + ) + cv.dist$iteration <- seq_len(nrow(cv.dist)) + rownames(cv.dist) <- seq_len(nrow(cv.dist)) + + return(list(ord_res = ord_res, dist = dist.res, cv_stats = cv.dist)) +} + +#' Apply Projection of New Compositional Tables to Existing Ordination +#' +#' Internal function that transforms and projects new sample tables into an existing Joint RPCA ordination space. +#' It handles feature alignment, optional rCLR preprocessing, padding of missing features, +#' and merges projected samples with existing ones. +#' +#' @param ordination A list containing previous ordination results: `samples`, `features`, and `eigvals`. +#' @param tables A named list of new compositional tables (matrices or data frames) +#' with features as rows and samples as columns. +#' @param apply.rclr Logical; whether to apply rCLR transformation to new input tables before projection. Default is `TRUE`. +#' +#' @return An updated ordination list with the `samples` matrix extended to include new projected samples. +#' @keywords internal +#' @noRd + +.transform <- function(ordination, tables, + apply.rclr = TRUE) { + Udf <- ordination$samples + Vobj <- ordination$features + s.eig <- ordination$eigvals + + # Ensure tables is a list of views + if (!is.list(tables)) { + stop("[.transform] 'tables' must be a list of view matrices (features x samples).") + } + + if (is.list(Vobj) && !is.null(names(Vobj))) { + if (is.null(names(tables))) { + names(tables) <- names(Vobj)[seq_along(tables)] + } + if (is.null(names(tables))) { + stop("[.transform] 'tables' must be a *named* list of view matrices (features x samples).") + } + } + + # rCLR if requested + prep_view <- function(tab) { + mat <- as.matrix(tab) + rown <- rownames(mat) + coln <- colnames(mat) + + if (apply.rclr) { + mat <- vegan::decostand(mat, method = "rclr", MARGIN = 2) + + dimnames(mat) <- list(rown, coln) + } + + storage.mode(mat) <- "double" + mat[!is.finite(mat)] <- 0 + mat + } + tables <- lapply(tables, prep_view) + + if (is.matrix(Vobj)) { + all.features <- rownames(Vobj) + tables <- lapply(tables, function(mat) { + miss <- setdiff(all.features, rownames(mat)) + if (length(miss)) { + pad <- matrix(0, + nrow = length(miss), ncol = ncol(mat), + dimnames = list(miss, colnames(mat)) + ) + mat <- rbind(mat, pad) + } + mat[all.features, , drop = FALSE] + }) + proj.mat <- do.call(cbind, tables) + colnames(proj.mat) <- make.unique(colnames(proj.mat), sep = "_") + ordination$samples <- .transform_helper(Udf, Vobj, s.eig, proj.mat) + return(ordination) + } + + # 3+-omic path: V is a named list per view + if (!is.list(Vobj) || is.null(names(Vobj))) { + stop("[.transform] ordination$features is neither a matrix nor a named list.") + } + + # Intersect views by name, preserve training order + views <- intersect(names(Vobj), names(tables)) + if (!length(views)) stop("[.transform] No overlapping view names between ordination and new tables.") + + test.matrices <- list() + for (vw in views) { + Vvw <- Vobj[[vw]] + 
stopifnot(is.matrix(Vvw), !is.null(rownames(Vvw))) + mat <- tables[[vw]] + + train_feats <- rownames(Vvw) + miss <- setdiff(train_feats, rownames(mat)) + if (length(miss)) { + pad <- matrix(0, + nrow = length(miss), ncol = ncol(mat), + dimnames = list(miss, colnames(mat)) + ) + mat <- rbind(mat, pad) + } + mat <- mat[train_feats, , drop = FALSE] + + test.matrices[[vw]] <- mat + } + + ordination$samples <- .transform_helper(Udf, Vobj, s.eig, test.matrices) + return(ordination) +} + +#' Project New Data into Existing Ordination Space +#' +#' Internal function to align rCLR-transformed samples to an existing RPCA ordination space. +#' Handles feature alignment, deduplication of sample names, double-centering normalization, +#' and projection into low-rank space using previously learned components. +#' +#' @param Udf Matrix of training sample embeddings (samples × components). +#' @param Vdf Matrix or list of feature loadings (features × components, or per-view list). +#' @param s.eig Singular values from the RPCA decomposition. +#' @param table.rclr.project New rCLR-transformed table(s) for projection (features × samples). +#' If `Vdf` is a matrix, provide a single matrix; if `Vdf` is a list, provide a named list of matrices per view. +#' @param dedup.samples Logical; whether to merge samples with identical names (e.g. suffixes like `_1`, `_2`) +#' by averaging their feature values. Default is `TRUE`. Set to `FALSE` to preserve all duplicate sample IDs. +#' +#' @return A combined matrix of training and projected samples (samples × components). +#' @keywords internal +#' @noRd + +.transform_helper <- function(Udf, Vdf, s.eig, table.rclr.project, + dedup.samples = TRUE) { + # Legacy path (single view) + if (is.matrix(Vdf)) { + stopifnot(is.matrix(table.rclr.project)) + # Align rows by name + common <- intersect(rownames(Vdf), rownames(table.rclr.project)) + if (length(common) < ncol(Udf)) { + stop(sprintf("[.transform_helper] Too few matching features: %d", length(common))) + } + + M <- t(as.matrix(table.rclr.project[common, , drop = FALSE])) + V <- as.matrix(Vdf[common, , drop = FALSE]) + + # Dedup of sample IDs + if (dedup.samples) { + sid <- sub("_\\d+$", "", rownames(M)) + if (any(duplicated(sid))) { + M <- rowsum(M, group = sid, reorder = FALSE) / as.vector(table(sid)) + } else { + rownames(M) <- sid + } + } + + # Projection (match training scaling) + Uproj <- M %*% V + # Scale by singular values + if (length(s.eig)) { + Sinv <- diag(1 / s.eig, nrow = length(s.eig)) + Uproj <- Uproj %*% Sinv + } + + colnames(Uproj) <- colnames(Udf) + U.combined <- rbind(Udf[setdiff(rownames(Udf), rownames(Uproj)), , drop = FALSE], Uproj) + return(U.combined) + } + + # Multi-view path (named lists) + stopifnot(is.list(Vdf), is.list(table.rclr.project)) + views <- intersect(names(Vdf), names(table.rclr.project)) + if (!length(views)) stop("[.transform_helper] No overlapping views.") + + # Project per view, then sum contributions in the shared latent space + Usum <- NULL + ncomp <- ncol(Udf) + for (vw in views) { + Vvw <- Vdf[[vw]] + Tvw <- table.rclr.project[[vw]] + stopifnot(is.matrix(Vvw), is.matrix(Tvw)) + + common <- intersect(rownames(Vvw), rownames(Tvw)) + if (length(common) < ncomp) { + stop(sprintf("[.transform_helper] View '%s': too few matching features (%d).", vw, length(common))) + } + + M <- t(as.matrix(Tvw[common, , drop = FALSE])) + V <- as.matrix(Vvw[common, , drop = FALSE]) + + # Accumulate per-view U + Uvw <- M %*% V + if (is.null(Usum)) { + Usum <- Uvw + } else { + # Align rows (samples) 
by name before summing + all_s <- union(rownames(Usum), rownames(Uvw)) + Utmp <- matrix(0, + nrow = length(all_s), ncol = ncol(Udf), + dimnames = list(all_s, colnames(Udf)) + ) + Utmp[rownames(Usum), ] <- Usum + Utmp[rownames(Uvw), ] <- Utmp[rownames(Uvw), ] + Uvw + Usum <- Utmp + } + } + + # Sample dedup (after combining views) + if (dedup.samples) { + sid <- sub("_\\d+$", "", rownames(Usum)) + if (any(duplicated(sid))) { + Usum <- rowsum(Usum, group = sid, reorder = FALSE) / as.vector(table(sid)) + } else { + rownames(Usum) <- sid + } + } + + # Scale by S + if (length(s.eig)) { + Sinv <- diag(1 / s.eig, nrow = length(s.eig)) + Usum <- Usum %*% Sinv + } + colnames(Usum) <- colnames(Udf) + + # Merge with training U, avoiding duplicates + keep_train <- setdiff(rownames(Udf), rownames(Usum)) + rbind(Udf[keep_train, , drop = FALSE], Usum) +} + +#' RPCA Table Filtering and Preprocessing +#' +#' Internal function that performs filtering and cleanup on a compositional data table +#' prior to Robust PCA analysis. Removes low-count features/samples, enforces non-zero frequency thresholds, +#' checks for ID duplication, and returns a matrix suitable for transformation and ordination. +#' +#' @param table A matrix or data frame with features as rows and samples as columns. +#' @param min.sample.count Minimum total count required for a sample to be retained. Default is 0. +#' @param min.feature.count Minimum total count required for a feature to be retained. Default is 0. +#' @param min.feature.frequency Minimum percentage (0–100) of samples in which a feature must be non-zero. Default is 0. +#' +#' @return A filtered numeric matrix containing non-empty features and samples. +#' @keywords internal +#' @noRd + +.rpca_table_processing <- function(table, + min.sample.count = 0, + min.feature.count = 0, + min.feature.frequency = 0) { + # Ensure the input is a matrix + if (is.data.frame(table)) { + table <- as.matrix(table) + } + + n.features <- nrow(table) + n.samples <- ncol(table) + + # Filter features by total count + if (!is.null(min.feature.count)) { + feature.totals <- rowSums(table, na.rm = TRUE) + keep.features <- feature.totals > min.feature.count + table <- table[keep.features, , drop = FALSE] + } + + # Filter features by frequency across samples + if (!is.null(min.feature.frequency)) { + freq.threshold <- min.feature.frequency / 100 + feature.freq <- rowMeans(table > 0, na.rm = TRUE) + keep.features <- feature.freq > freq.threshold + table <- table[keep.features, , drop = FALSE] + } + + # Filter samples by total count + if (!is.null(min.sample.count)) { + sample.totals <- colSums(table, na.rm = TRUE) + keep.samples <- sample.totals > min.sample.count + table <- table[, keep.samples, drop = FALSE] + } + + # Check for duplicate IDs + if (any(duplicated(colnames(table)))) { + stop("Data table contains duplicate sample (column) IDs.", call. = FALSE) + } + if (any(duplicated(rownames(table)))) { + stop("Data table contains duplicate feature (row) IDs.", call. 
= FALSE) + } + + # Remove empty rows and columns if sample filtering applied + if (!is.null(min.sample.count)) { + nonzero.features <- rowSums(table, na.rm = TRUE) > 0 + nonzero.samples <- colSums(table, na.rm = TRUE) > 0 + table <- table[nonzero.features, nonzero.samples, drop = FALSE] + } + + return(table) +} + +#' Generate a MaskedMatrix from Numeric Input +#' +#' Internal helper that ensures input is a 2D numeric matrix and returns a masked version, +#' replacing non-finite values (e.g., NA, NaN, Inf) with `NA` and recording their positions in a logical mask. +#' +#' @param mat A numeric matrix or vector. If a vector, it will be converted to a 1-row matrix. +#' +#' @return A list of class \code{"MaskedMatrix"} with two elements: +#' \describe{ +#' \item{data}{The original matrix with non-finite values replaced by \code{NA}.} +#' \item{mask}{Logical matrix indicating non-finite entries (TRUE if missing).} +#' } +#' +#' @keywords internal +#' @noRd + +.mask_value_only <- function(mat) { + # Ensure matrix is at least 2D + if (is.vector(mat)) { + mat <- matrix(mat, nrow = 1) + } + + # Ensure matrix is not more than 2D + if (length(dim(mat)) > 2) { + stop("Input matrix can only have two dimensions or less") + } + + # Generate logical mask: TRUE where values are missing + mask <- !is.finite(mat) + + # Create masked matrix + masked.mat <- mat + masked.mat[!is.finite(mat)] <- NA + + # Return as a masked matrix + return(structure(list( + data = masked.mat, + mask = mask + ), class = "MaskedMatrix")) +} + +#' Internal constructor for ordination results +#' @keywords internal +#' @noRd +.ordination_results <- function(method, eigvals, samples, features, + proportion.explained, dist = NULL, metadata = list()) { + return(structure(list( + method = method, + eigvals = eigvals, + samples = samples, + features = features, + proportion.explained = proportion.explained, + dist = dist, + metadata = metadata + ), class = "OrdinationResults")) +} + +#' Internal constructor for a distance matrix object +#' @keywords internal +#' @noRd +.distance_matrix <- function(matrix, ids = NULL, method = "euclidean") { + if (!is.matrix(matrix)) stop("Input must be a matrix.") + if (!isSymmetric(matrix)) stop("Distance matrix must be symmetric.") + if (!is.null(ids)) { + if (length(ids) != nrow(matrix)) stop("Length of 'ids' must match matrix dimensions.") + rownames(matrix) <- ids + colnames(matrix) <- ids + } + return(structure(list( + data = matrix, + ids = rownames(matrix), + method = method + ), class = "DistanceMatrix")) +} + +#' Extract per-experiment assay tables from a MultiAssayExperiment +#' +#' @param x A MultiAssayExperiment. +#' @param experiments Character vector of experiment names (or NULL for all). +#' +#' @return A list with `tables`, `experiments`, and `assay_names_used`. +#' +#' @keywords internal +#' @noRd +.extract_mae_tables <- function(x, experiments = NULL) { + exps <- experiments(x) + + if (is.null(experiments)) { + experiments <- names(exps) + } + if (length(experiments) == 0L) { + stop("No experiments found in 'x'.", call. = FALSE) + } + + assay_names_used <- setNames(character(length(experiments)), experiments) + tables <- vector("list", length(experiments)) + names(tables) <- experiments + + for (i in seq_along(experiments)) { + e <- experiments[[i]] + exp_se <- exps[[e]] + if (is.null(exp_se)) { + stop(sprintf("Experiment '%s' not found in 'x'.", e), call. 
= FALSE) + } + + anm <- assayNames(exp_se) + default_assay <- if (length(anm)) anm[[1]] else 1L + + assay_names_used[[e]] <- if (is.character(default_assay)) default_assay else as.character(default_assay) + tables[[e]] <- assay(exp_se, default_assay) + } + + list( + tables = tables, + experiments = experiments, + assay_names_used = assay_names_used + ) +} + +#' OptSpace back-end (joint & single-view) +#' +#' OptSpace-based solvers built on top of \code{vegan::optspace()}. +#' +#' This file contains: +#' \itemize{ +#' \item \code{.optspace_helper()} — single-view rCLR → OptSpace → biplot +#' \item \code{.joint_optspace_solve()} — multi-view joint factorization +#' } +#' +#' @keywords internal +#' @noRd +NULL + +#' OptSpace-Based Dimensionality Reduction and RPCA Biplot Generation +#' +#' Internal function that fits an OptSpace model to a rCLR-transformed compositional table, +#' reconstructs the low-rank matrix, applies PCA, and constructs an ordination result +#' capturing sample embeddings, feature loadings, and explained variance. A distance matrix +#' is also generated using Aitchison geometry. +#' +#' @param rclr.table A numeric matrix representing rCLR-transformed compositional data. +#' @param feature.ids Character vector of feature names (used for row labeling of loadings). +#' @param sample.ids Character vector of sample names (used for row labeling of embeddings). +#' @param n.components Integer specifying number of principal components to retain. Default is 3. +#' @param max.iterations Maximum number of iterations to run OptSpace optimization. Default is 5. +#' +#' @return A list with: +#' \describe{ +#' \item{ord_res}{An \code{OrdinationResults} object containing PCA scores, loadings, and metadata.} +#' \item{dist}{A sample-by-sample \code{DistanceMatrix} object using Aitchison geometry.} +#' \item{opt_fit}{The raw OptSpace fit result containing matrices \code{X}, \code{Y}, and \code{S}.} +#' } +#' +#' @keywords internal +#' @noRd + +.optspace_helper <- function(rclr.table, + feature.ids, + sample.ids, + n.components = 3, + max.iterations = 5, + tol = 1e-5, + center = TRUE, + scale = FALSE) { + opt.result <- vegan::optspace( + x = rclr.table, + ropt = n.components, + niter = max.iterations, + tol = tol, + verbose = FALSE + ) + + n.components <- ncol(opt.result$S) + + # Reconstruct and re-center matrix + X.hat <- opt.result$X %*% opt.result$S %*% t(opt.result$Y) + X.hat <- scale(X.hat, center = center, scale = scale) + X.hat <- t(scale(t(X.hat), center = center, scale = scale)) + + # PCA + svd.out <- svd(X.hat) + u <- svd.out$u[, 1:n.components, drop = FALSE] + s <- svd.out$d[1:n.components] + v <- svd.out$v[, 1:n.components, drop = FALSE] + + # Label loadings + rename.cols <- paste0("PC", seq_len(n.components)) + sample.scores <- u + feature.scores <- data.frame(v, row.names = feature.ids) + feature.scores <- as.matrix(feature.scores) + rownames(sample.scores) <- sample.ids + colnames(sample.scores) <- rename.cols + colnames(feature.scores) <- rename.cols + + # Proportion explained + prop.var <- s^2 / sum(svd.out$d^2) + names(prop.var) <- rename.cols + names(s) <- rename.cols + + # Add PC3 for 2D case + if (n.components == 2) { + sample.scores <- cbind(sample.scores, PC3 = 0) + feature.scores <- cbind(feature.scores, PC3 = 0) + sample.scores <- as.matrix(sample.scores) + feature.scores <- as.matrix(feature.scores) + + s <- c(s, PC3 = 0) + prop.var <- c(prop.var, PC3 = 0) + rename.cols <- c(rename.cols, "PC3") + } + + # Compute distance in sample PC-space + dist.matrix.raw <- 
as.matrix(dist(u)) + rownames(dist.matrix.raw) <- sample.ids + colnames(dist.matrix.raw) <- sample.ids + + dist.res <- .distance_matrix(dist.matrix.raw, + ids = sample.ids, + method = "aitchison" + ) + + ord_res <- .ordination_results( + method = "rpca_biplot", + eigvals = s, + samples = sample.scores, + features = feature.scores, + proportion.explained = prop.var, + dist = dist.matrix.raw, + metadata = list( + long.method.name = "(Robust Aitchison) RPCA Biplot", + run.id = sprintf( + "optspace_helper_n.components_%d.max.iterations_%d", + n.components, max.iterations + ) + ) + ) + + return(list( + ord_res = ord_res, + dist = dist.res, + opt_fit = opt.result + )) +} + +#' Joint OptSpace Optimization Across Multiple Train/Test Splits +#' +#' Internal function that performs joint matrix factorization using OptSpace across a set of paired train/test compositional tables. +#' Stacks training matrices horizontally, applies low-rank optimization, splits feature loadings per table, +#' and evaluates projection error on test data via Frobenius norm. +#' +#' @param train.test.pairs A list of paired matrices where each element is a two-item list: \code{[[test, train]]}. +#' @param n.components Integer specifying number of components to retain in the OptSpace model. +#' @param max.iter Maximum number of optimization iterations. Default is 50. +#' @param verbose Logical; whether to print progress messages. Default is \code{TRUE}. +#' +#' @return A list with: +#' \describe{ +#' \item{U}{Shared sample embedding matrix across all input tables.} +#' \item{S}{Singular values matrix from OptSpace decomposition.} +#' \item{V_list}{List of per-table feature loading matrices.} +#' \item{dists}{Matrix of reconstruction errors (rows: error type, columns: tables).} +#' } +#' +#' @keywords internal +#' @noRd + +.joint_optspace_solve <- function(train.test.pairs, n.components, + max.iter = 50, verbose = TRUE, + tol = 1e-5) { + # Prepare lists to hold training matrices and dimensions + train.matrices <- list() + test.matrices <- list() + dims <- list() + + for (pair in train.test.pairs) { + test.mat <- pair[[1]] + train.mat <- pair[[2]] + train.matrices <- append(train.matrices, list(train.mat)) + test.matrices <- append(test.matrices, list(test.mat)) + dims <- append(dims, list(dim(train.mat))) + } + + # Stack training matrices horizontally + train.stacked <- do.call(cbind, train.matrices) + + # Apply OptSpace to stacked matrix via vegan + if (verbose) { + message("Running vegan::optspace() on stacked training data...") + } + + fit <- vegan::optspace( + x = train.stacked, + ropt = n.components, + niter = max.iter, + tol = tol, + verbose = verbose + ) + + # Extract sample loadings + U.shared <- fit$X + S.shared <- fit$S + + # Split V back into per-table pieces + feat.indices <- cumsum(vapply(dims, function(d) d[2], numeric(1))) + feat.starts <- c(1, head(feat.indices, -1) + 1) + V_list <- Map( + function(start, end) fit$Y[start:end, , drop = FALSE], + feat.starts, feat.indices + ) + + # Reconstruction error per view (mean_CV / std_CV placeholder) + n_views <- length(test.matrices) + dists <- matrix(0, nrow = 2, ncol = n_views) + + for (i in seq_along(test.matrices)) { + V.k <- V_list[[i]] + test.mat <- test.matrices[[i]] + + # Project test samples: U.test = test × V + U.test <- as.matrix(test.mat) %*% V.k + U.test <- sweep(U.test, 2, diag(S.shared), "/") + recon.test <- U.test %*% S.shared %*% t(V.k) + + # Center for consistency + recon.test <- scale(recon.test, center = TRUE, scale = FALSE) + recon.test <- 
t(scale(t(recon.test), center = TRUE, scale = FALSE)) + + error <- test.mat - recon.test + error[is.na(error)] <- 0 + error.val <- norm(error, "F") / sqrt(sum(!is.na(test.mat))) + + dists[1, i] <- error.val + dists[2, i] <- 0 + } + + return(list(U = U.shared, S = S.shared, V_list = V_list, dists = dists)) +} diff --git a/R/mia.R b/R/mia.R index f0f419f19..fc62a795b 100644 --- a/R/mia.R +++ b/R/mia.R @@ -27,7 +27,7 @@ NULL #' The datasets represent instances of the TreeSummarizedExperiment and #' MultiAssayExperiment containers and can serve as tools to practice the #' mia functionality. -#' +#' #' Currently, the following datasets are available: #' \itemize{ #' \item{\code{\link{dmn_se}}: A SummarizedExperiment with 130 features and @@ -38,6 +38,8 @@ NULL #' and 3 samples} #' \item{\code{\link{GlobalPatterns}}: A TreeSummarizedExperiment with 19216 #' features and 26 samples} +#' \item{\code{\link{ibdmdb_2omic_demo}}: A compact +#' \code{MultiAssayExperiment} demo dataset with 2 experiments (MGX + MTX)} #' \item{\code{\link{HintikkaXOData}}: A MultiAssayExperiment with 3 #' experiments (microbiota, metabolites and biomarkers)} #' \item{\code{\link{peerj13075}}: A TreeSummarizedExperiment with 674 @@ -47,19 +49,19 @@ NULL #' \item{\code{\link{Tito2024QMP}}: A TreeSummarizedExperiment with 676 #' features and 589 samples} #' } -#' +#' #' @name mia-datasets #' @docType data #' @keywords datasets -#' +#' #' @examples #' # Load dataset from mia #' library(mia) #' data("GlobalPatterns", package = "mia") -#' +#' #' # In this case, the dataset is a TreeSE, so it is renamed as tse #' tse <- GlobalPatterns -#' +#' #' # Print summary #' tse NULL @@ -70,17 +72,17 @@ NULL #' GlobalPatterns compared the microbial communities from 25 environmental #' samples #' and three known "mock communities" at a an average depth of 3.1 million reads -#' per sample. Authors reproduced diversity patterns seen in many other +#' per sample. Authors reproduced diversity patterns seen in many other #' published studies, while investigating technical bias by applying the same #' techniques to simulated microbial communities of known composition. Special #' thanks are given to J. Gregory Caporaso for providing the OTU-clustered data #' files for inclusion in the \pkg{phyloseq} package, from which this data was #' converted to \code{TreeSummarizedExperiment}. -#' +#' #' @format A TreeSummarizedExperiment with 19216 features and 26 samples. The #' rowData contains taxonomic information at Kingdom, Phylum, Class, Order, #' Family, Genus and Species levels. The colData includes: -#' +#' #' \describe{ #' \item{X.SampleID}{Sample ID taken from the corresponding study} #' \item{Primer}{primer used for sequencing} @@ -102,7 +104,7 @@ NULL #' @seealso \code{\link{mia-datasets}} #' @author Caporaso, J. G., et al. #' @references -#' Caporaso, J. G., et al. (2011). +#' Caporaso, J. G., et al. (2011). #' Global patterns of 16S rRNA diversity at a depth of millions of sequences per #' sample. #' PNAS, 108, 4516-4522. \url{https://doi.org/10.1073/pnas.1000080107} @@ -121,7 +123,7 @@ NULL #' #' @format A TreeSummarizedExperiment with 553 features and 280 samples. The #' rowData contains taxonomic information at Genus level. 
The colData includes: -#' +#' #' \describe{ #' \item{Enterotype}{enterotype the sample belongs to (1, 2 and 3)} #' \item{Sample_ID}{sample ID of samples from all studies} @@ -136,7 +138,7 @@ NULL #' \item{ClinicalStatus}{participant's clinical status (healthy, obese, CD, #' UC and elderly)} #' } -#' +#' #' @name enterotype #' @docType data #' @keywords datasets @@ -150,18 +152,18 @@ NULL #' Arumugam, M., et al. (2014). Addendum: Enterotypes of the human gut #' microbiome. #' Nature 506, 516 (2014). \url{https://doi.org/10.1038/nature13075} -#' +#' #' @source \url{http://www.bork.embl.de/Docu/Arumugam_et_al_2011/downloads.html} NULL #' Human esophageal community from 3 individuals -#' +#' #' This small dataset from a human esophageal community includes 3 samples from #' 3 human adults based on biopsies analysed with 16S rDNA PCR. The 16S rRNA #' sequence processing is provided in the mothur wiki from the link below. It #' was #' converted into a TreeSummarizedExperiment from the \pkg{phyloseq} package. -#' +#' #' @format A TreeSummarizedExperiment with 58 features and 3 samples. The #' rowData contains no taxonomic information. The colData is empty. #' @@ -171,9 +173,9 @@ NULL #' @usage data(esophagus) #' @seealso \code{\link{mia-datasets}} #' @author Pei et al. \email{zhiheng.pei@@med.nyu.edu}. -#' @references +#' @references #' Pei, Z., Bini, E. J., Yang, L., Zhou, M., Francois, F., & Blaser, M. J. -#' (2004). +#' (2004). #' Bacterial biota in the human distal esophagus. #' Proceedings of the National Academy of Sciences of the United States of #' America, 101(12), 4250-4255. @@ -192,14 +194,14 @@ NULL #' dmn_se is a dataset on twins' microbiome where samples are stratified by #' their community composition through Dirichlet Multinomial Mixtures (DMM). It #' was derived from the \pkg{DirichletMultinomial} package. -#' +#' #' @format A SummarizedExperiment with 130 features and 278 samples. The #' rowData contains no taxonomic information. The colData includes: -#' +#' #' \describe{ #' \item{pheno}{participant's weight condition (Lean, Overwt and Obese)} #' } -#' +#' #' @name dmn_se #' @docType data #' @aliases twins @@ -215,23 +217,23 @@ NULL #' PLoS ONE 7(2): e30126. \url{https://doi.org/10.1371/journal.pone.0030126} #' #' Turnbaugh PJ, Hamady M, Yatsunenko T, Cantarel BL, Duncan A, et al. (2009). -#' A core gut microbiome in obese and lean twins. Nature 457: 480–484. +#' A core gut microbiome in obese and lean twins. Nature 457: 480–484. #' \url{https://doi.org/10.1038/nature07540} NULL #' Skin microbial profiles 58 genetically unrelated individuals -#' +#' #' peerj13075 includes skin microbial profiles of 58 volunteers with multiple #' factors. 16S r-RNA sequencing of V3-V4 regions was done to generate millions #' of read using illumina platform. A standard bioinformatic and statistical #' analysis done to explore skin bacterial diversity and its association with #' age, diet, geographical locations. The authors investigated significant #' association of skin microbiota with individual’s geographical location. -#' +#' #' @format A TreeSummarizedExperiment with 674 features and 58 samples. The #' rowData contains taxonomic information at kingdom, phylum, class, order, #' family and genus level. The colData includes: -#' +#' #' \describe{ #' \item{Sample}{sample ID} #' \item{Geographical_location}{city where participant lives (Ahmednagar, @@ -248,18 +250,18 @@ NULL #' @seealso \code{\link{mia-datasets}} #' @author Potbhare, R., et al. 
#' @references -#' Potbhare, R., RaviKumar, A., Munukka, E., Lahti, L., & Ashma, R. (2022). +#' Potbhare, R., RaviKumar, A., Munukka, E., Lahti, L., & Ashma, R. (2022). #' Skin microbiota diversity among genetically unrelated individuals of Indian -#' origin. +#' origin. #' PeerJ, 10, e13075. \url{https://doi.org/10.7717/peerj.13075} #' Supplemental information includes OTU table and taxonomy table -#' publicly-accessible from: +#' publicly-accessible from: #' \url{https://www.doi.org/10.7717/peerj.13075/supp-1} #' \url{https://www.doi.org/10.7717/peerj.13075/supp-2} NULL #' Multiomics dataset from 40 rat samples -#' +#' #' HintikkaXO is a multiomics dataset from a rat experiment studying effect of #' fat and prebiotics in diet. It contains high-throughput profiling data from #' 40 rat samples, including 39 biomarkers, 38 metabolites (NMR), and 12706 OTUs @@ -267,7 +269,7 @@ NULL #' High/Low fat diet and xylo-oligosaccaride supplementation. Column metadata is #' common for all experiments (microbiota, metabolites, biomarkers) and is #' described below. -#' +#' #' @format A MultiAssayExperiment with 3 experiments (microbiota, metabolites #' and #' biomarkers). rowData of the microbiota experiment contains taxonomic @@ -276,7 +278,7 @@ NULL #' metabolites #' and biomarkers experiments contain 38 NMR metabolites and 39 biomarkers, #' respectively. The colData includes: -#' +#' #' \describe{ #' \item{Sample}{Sample ID (character)} #' \item{Rat}{Rat ID (factor)} @@ -285,7 +287,7 @@ NULL #' \item{Fat}{Fat in Diet (factor; Low/High)} #' \item{XOS}{XOS Diet Supplement (numeric; 0/1)} #' } -#' +#' #' @name HintikkaXOData #' @docType data #' @keywords datasets @@ -293,16 +295,16 @@ NULL #' @seealso \code{\link{mia-datasets}} #' @author Hintikka L et al. #' @references -#' Hintikka L et al. (2021): Xylo-oligosaccharides in prevention of hepatic -#' steatosis and adipose tissue inflammation: associating taxonomic and -#' metabolomic patterns in fecal microbiota with biclustering. International +#' Hintikka L et al. (2021): Xylo-oligosaccharides in prevention of hepatic +#' steatosis and adipose tissue inflammation: associating taxonomic and +#' metabolomic patterns in fecal microbiota with biclustering. International #' Journal of Environmental Research and Public Health 18(8):4049. #' \url{https://doi.org/10.3390/ijerph18084049} -#' +#' NULL #' Gut microbiota profiles of 27 individuals with ADHD and healthy controls -#' +#' #' Tengeler2020 includes gut microbiota profiles of 27 persons with ADHD. A #' standard bioinformatic and statistical analysis done to demonstrate that #' altered microbial composition could be a driver of altered brain structure @@ -313,7 +315,7 @@ NULL #' @format A TreeSummarizedExperiment with 151 features and 27 samples. The #' rowData contains taxonomic information at Kingdom, Phylum, Class, Order, #' Family and Genus level. The colData includes: -#' +#' #' \describe{ #' \item{patient_status}{clinical status of the patient (ADHD or Control)} #' \item{cohort}{cohort to which the patient belongs (Cohort_1, Cohort_2 and @@ -321,7 +323,7 @@ NULL #' \item{patient_status_vs_cohort}{combination of patient_status and cohort} #' \item{sample_name}{unique sample ID} #' } -#' +#' #' @name Tengeler2020 #' @docType data #' @keywords datasets @@ -329,42 +331,42 @@ NULL #' @seealso \code{\link{mia-datasets}} #' @author A.C. Tengeler, et al. #' @references -#' Tengeler, A.C., Dam, S.A., Wiesmann, M. et al. +#' Tengeler, A.C., Dam, S.A., Wiesmann, M. et al. 
#' Gut microbiota from persons with attention-deficit/hyperactivity disorder -#' affects the brain in mice. +#' affects the brain in mice. #' Microbiome 8, 44 (2020). \url{https://doi.org/10.1186/s40168-020-00816-x} -#' -#' Supplemental information includes Home-cage activity, methods, results and imaging parameters and publicly-accessible from: +#' +#' Supplemental information includes Home-cage activity, methods, results and imaging parameters and publicly-accessible from: #' \url{https://static-content.springer.com/esm/art%3A10.1186%2Fs40168-020-00816-x/MediaObjects/40168_2020_816_MOESM1_ESM.docx} #' \url{https://static-content.springer.com/esm/art%3A10.1186%2Fs40168-020-00816-x/MediaObjects/40168_2020_816_MOESM2_ESM.docx} #' \url{https://static-content.springer.com/esm/art%3A10.1186%2Fs40168-020-00816-x/MediaObjects/40168_2020_816_MOESM3_ESM.docx} -#' +#' NULL #' Fecal microbiota samples from 589 patients across different colorectal #' cancer stages #' -#' The study combined Quantitative Microbiome Profiling (QMP) with -#' extensive patient phenotyping from a group of 589 colorectal cancer (CRC) -#' patients, advanced adenoma (AA) patients, and healthy controls. +#' The study combined Quantitative Microbiome Profiling (QMP) with +#' extensive patient phenotyping from a group of 589 colorectal cancer (CRC) +#' patients, advanced adenoma (AA) patients, and healthy controls. #' By implementing confounder control and quantitative profiling methods, -#' the study +#' the study #' was able to reveal potential misleading associations between microbial -#' markers +#' markers #' and colorectal cancer development that were driven by other factors like -#' intestinal +#' intestinal #' inflammation, rather than the cancer diagnosis itself. #' -#' @format A TreeSummarizedExperiment with 676 features and 589 samples. +#' @format A TreeSummarizedExperiment with 676 features and 589 samples. #' The rowData contains species. The colData includes: -#' +#' #' \describe{ #' \item{sampleID}{(character) Sample ID from the corresponding study} #' \item{diagnosis}{(factor) Diagnosis type, with possible values: "ADE" -#' (advanced adenoma), +#' (advanced adenoma), #' "CRC" (colorectal cancer), "CTL" (control)} #' \item{colonoscopy}{(factor) Colonoscopy result, with possible values: -#' "FIT_Positive", +#' "FIT_Positive", #' "familial_risk_familial_CRC_FCC", "familial_risk_no", "abdomil_complaints"} #' } #' @@ -373,16 +375,59 @@ NULL #' @keywords datasets #' @usage data(Tito2024QMP) #' @seealso \code{\link{mia-datasets}} -#' @author +#' @author #' Shadman Ishraq #' @references #' Raúl Y. Tito, Sara Verbandt, Marta Aguirre Vazquez, Leo Lahti, Chloe #' Verspecht, Verónica Lloréns-Rico, Sara Vieira-Silva, #' Janine Arts, Gwen Falony, Evelien Dekker, Joke Reumers, Sabine Tejpar & -#' Jeroen Raes (2024). +#' Jeroen Raes (2024). #' Microbiome confounders and quantitative profiling challenge predicted -#' microbial targets in colorectal cancer development. -#' Nature Medicine,30, 1339-1348. +#' microbial targets in colorectal cancer development. +#' Nature Medicine,30, 1339-1348. #' \url{https://doi.org/10.1038/s41591-024-02963-2} -#' +#' +NULL + +#' IBDMDB 2-omic demo dataset (MGX + MTX) +#' +#' A compact example derived from the Integrative Human Microbiome Project (iHMP) +#' Inflammatory Bowel Disease (IBD) cohort. 
+#' +#' The dataset \code{ibdmdb_2omic_demo} is a named list with: +#' \itemize{ +#' \item \code{se_mgx}: metagenomic taxonomic profiles (MGX) +#' \item \code{se_mtx}: metatranscriptomic taxonomic profiles (MTX) +#' \item \code{mae2}: a \link[MultiAssayExperiment]{MultiAssayExperiment} +#' containing both experiments +#' } +#' +#' These compact objects are intended for quick examples and vignettes. +#' Load with: +#' \code{data("ibdmdb_2omic_demo")}. +#' +#' @name ibdmdb_2omic_demo +#' @docType data +#' @keywords datasets +#' @usage data("ibdmdb_2omic_demo") +#' +#' @format A \code{MultiAssayExperiment} with two experiments: +#' \describe{ +#' \item{se_mgx}{A \code{SummarizedExperiment} with MGX abundance data.} +#' \item{se_mtx}{A \code{SummarizedExperiment} with MTX abundance data.} +#' \item{mae2}{A \code{MultiAssayExperiment} containing both experiments.} +#' } +#' +#' @source Prepared by \code{inst/scripts/prepare_ibdmdb_demo.R}. The script is for +#' developers and downloads publicly available IBDMDB/HMP2 taxonomic profiles +#' and metadata from Zenodo into a local cache (e.g. \code{tools/cache/}), +#' then subsets and filters the raw data to create a small example suitable +#' for vignettes. +#' +#' @references +#' Lloyd-Price J, Arze C, Ananthakrishnan AN, et al. +#' \emph{Multi-omics of the gut microbial ecosystem in inflammatory bowel +#' diseases.} +#' Nature 569, 655–662 (2019). +#' \doi{10.1038/s41586-019-1237-9} NULL \ No newline at end of file diff --git a/data/ibdmdb_2omic_demo.rda b/data/ibdmdb_2omic_demo.rda new file mode 100644 index 000000000..84770ee71 Binary files /dev/null and b/data/ibdmdb_2omic_demo.rda differ diff --git a/inst/scripts/prepare_ibdmdb_demo.R b/inst/scripts/prepare_ibdmdb_demo.R new file mode 100644 index 000000000..92fd922b6 --- /dev/null +++ b/inst/scripts/prepare_ibdmdb_demo.R @@ -0,0 +1,277 @@ +# Build compact demo objects (.rda) for IBDMDB examples/vignettes. +# +# Produces: +# data/ibdmdb_2omic_demo.rda (ibdmdb_2omic_demo) +# +# Source raw inputs from inst/extdata and pre-process for speed/size. +# +# This script is for DEVELOPERS only. It is NOT run during R CMD check. + +message("== IBDMDB demo data preparation ==") + +# ------------------------------------------------------------------------------ +# Config +# ------------------------------------------------------------------------------ + +# This script is for DEVELOPERS only. +# It downloads large raw files into a local cache (NOT committed to git), +# then creates a compact demo dataset in data/ibdmdb_2omic_demo.rda. +# It is NOT run during R CMD check. 
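+# Downstream usage sketch (illustrative only, not executed here). It assumes the
+# generated object is the named list documented in ?ibdmdb_2omic_demo, with the
+# MultiAssayExperiment stored as $mae2 and experiments named "se_mgx"/"se_mtx":
+#
+#   data("ibdmdb_2omic_demo", package = "mia")
+#   mae <- ibdmdb_2omic_demo$mae2
+#   res <- jointRPCAuniversal(mae, experiments = c("se_mgx", "se_mtx"))
+#   head(res$ord_res$samples)
+#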
+ +cache_dir <- file.path("tools", "cache", "mia_ibdmdb_cache") +dir.create(cache_dir, recursive = TRUE, showWarnings = FALSE) + +# Stable download URLs (Zenodo) +url_meta <- "https://zenodo.org/records/18280405/files/hmp2_metadata_2018-08-20.csv" +url_mtx <- "https://zenodo.org/records/18280535/files/ecs_relab.tsv" +url_mgx <- "https://zenodo.org/records/18280521/files/taxonomic_profiles_mgx.tsv" + +# Local cached file paths (downloaded if missing) +f_mgx <- file.path(cache_dir, "taxonomic_profiles_mgx.tsv") +f_mtx <- file.path(cache_dir, "ecs_relab.tsv") +f_meta <- file.path(cache_dir, "hmp2_metadata_2018-08-20.csv") + +# Prevalence thresholds (fraction of samples) +prev_mgx_frac <- 0.05 +prev_mtx_frac <- 0.02 + +# Cap feature counts for speed/size +cap_mgx <- 800L +cap_mtx <- 800L +max_samples <- 60L + +# ------------------------------------------------------------------------------ +# Dependencies +# ------------------------------------------------------------------------------ + +need <- c("data.table", "matrixStats", "SummarizedExperiment", "MultiAssayExperiment", + "S4Vectors") +miss <- setdiff(need, rownames(installed.packages())) +if (length(miss)) { + stop( + "Missing packages: ", + paste(miss, collapse = ", "), + ". Install them, then re-run this script." + ) +} + +library(data.table) +library(matrixStats) +library(SummarizedExperiment) +library(MultiAssayExperiment) +library(S4Vectors) + +# ------------------------------------------------------------------------------ +# Helpers +# ------------------------------------------------------------------------------ + +download_if_missing <- function(url, dest) { + if (file.exists(dest)) return(invisible(dest)) + message("Downloading: ", basename(dest)) + utils::download.file(url, dest, mode = "wb", quiet = FALSE) + invisible(dest) +} + +read_ibdmdb_tsv <- function(path) { + stopifnot(file.exists(path)) + first <- readLines(path, n = 200L, warn = FALSE) + comment_idx <- which(grepl("^#", first)) + if (length(comment_idx) == 0L) { + stop("No commented header line found in: ", path) + } + + header_line <- first[max(comment_idx)] + header_line <- sub("^#\\s*", "", header_line) + header_line <- sub("^\ufeff", "", header_line) + header_vec <- strsplit(header_line, "\t", fixed = TRUE)[[1]] + header_vec <- gsub('^"|"$', "", header_vec) + + dt <- data.table::fread( + path, + skip = length(comment_idx), + header = FALSE, + sep = "\t", + quote = "", + fill = TRUE + ) + if (ncol(dt) < length(header_vec)) { + stop(sprintf( + "Header columns (%d) > data columns (%d) in %s", + length(header_vec), ncol(dt), path + )) + } + if (ncol(dt) > length(header_vec)) { + dt <- dt[, seq_len(length(header_vec)), with = FALSE] + } + data.table::setnames(dt, header_vec) + dt +} + +to_matrix <- function(dt) { + rn <- dt[[1]] + mat <- as.matrix(dt[, -1, with = FALSE]) + rownames(mat) <- rn + storage.mode(mat) <- "numeric" + mat[is.na(mat)] <- 0 + mat +} + +dedup_rownames <- function(mat) { + stopifnot(!is.null(rownames(mat))) + rn <- rownames(mat) + empty_or_na <- rn == "" | is.na(rn) + if (any(empty_or_na)) { + rn[empty_or_na] <- paste0("feat_", seq_len(sum(empty_or_na))) + } + rownames(mat) <- make.unique(as.character(rn), sep = "_") + mat +} + +sanitize_matrix <- function(mat) { + mat <- as.matrix(mat) + storage.mode(mat) <- "numeric" + mat[!is.finite(mat)] <- 0 + mat +} + +cap_by_var <- function(M, cap) { + if (nrow(M) <= cap) return(M) + ord <- order(matrixStats::rowVars(M), decreasing = TRUE) + M[ord[seq_len(cap)], , drop = FALSE] +} + +read_metadata <- 
function(path) { + stopifnot(file.exists(path)) + read.csv(path, stringsAsFactors = FALSE, check.names = FALSE) +} + +make_SE <- function(mat, meta_df = NULL, assay_name = "counts") { + if (is.null(meta_df)) { + return(SummarizedExperiment::SummarizedExperiment( + assays = setNames(list(mat), assay_name), + colData = S4Vectors::DataFrame(row.names = colnames(mat)) + )) + } + + md <- as.data.frame(meta_df, stringsAsFactors = FALSE, check.names = FALSE) + + overlaps <- vapply( + md, + function(col) sum(as.character(col) %in% colnames(mat)), + numeric(1) + ) + best <- names(overlaps)[which.max(overlaps)] + + if (length(best) == 0 || overlaps[[best]] == 0) { + return(SummarizedExperiment::SummarizedExperiment( + assays = setNames(list(mat), assay_name), + colData = S4Vectors::DataFrame(row.names = colnames(mat)) + )) + } + + md_sub <- md[md[[best]] %in% colnames(mat), , drop = FALSE] + md_sub <- md_sub[!duplicated(md_sub[[best]]), , drop = FALSE] + + rownames(md_sub) <- as.character(md_sub[[best]]) + md_sub <- md_sub[colnames(mat), , drop = FALSE] + + if (anyDuplicated(rownames(md_sub))) { + rownames(md_sub) <- make.unique(rownames(md_sub), sep = "_dup") + } + stopifnot(identical(rownames(md_sub), colnames(mat))) + + SummarizedExperiment::SummarizedExperiment( + assays = setNames(list(mat), assay_name), + colData = S4Vectors::DataFrame(md_sub) + ) +} + +# ------------------------------------------------------------------------------ +# I/O guards + download raw inputs +# ------------------------------------------------------------------------------ + +if (!dir.exists("data")) dir.create("data", recursive = TRUE) + +download_if_missing(url_mgx, f_mgx) +download_if_missing(url_mtx, f_mtx) +download_if_missing(url_meta, f_meta) + +missing_files <- c(f_mgx, f_mtx, f_meta)[!file.exists(c(f_mgx, f_mtx, f_meta))] +if (length(missing_files)) { + stop( + "Missing cached file(s):\n- ", + paste(missing_files, collapse = "\n- "), + "\n\nDownload failed or paths are wrong." 
+ ) +} + +# ------------------------------------------------------------------------------ +# Metadata +# ------------------------------------------------------------------------------ + +meta_full <- read_metadata(f_meta) + +# ------------------------------------------------------------------------------ +# Prepare 2-omic (MGX + MTX) +# ------------------------------------------------------------------------------ + +message("Preparing 2-omic MGX + MTX demo ...") + +dt_mgx <- read_ibdmdb_tsv(f_mgx) +dt_mtx <- read_ibdmdb_tsv(f_mtx) + +M_mgx <- sanitize_matrix(dedup_rownames(to_matrix(dt_mgx))) +M_mtx <- sanitize_matrix(dedup_rownames(to_matrix(dt_mtx))) + +shared <- intersect(colnames(M_mgx), colnames(M_mtx)) +shared <- sort(unique(shared[nchar(shared) > 0])) +if (length(shared) < 20) { + warning("Few shared samples for 2-omic: ", length(shared), " (keeping anyway).") +} +M_mgx <- M_mgx[, shared, drop = FALSE] +M_mtx <- M_mtx[, shared, drop = FALSE] + +n_samp <- ncol(M_mgx) +keep_mgx <- rowSums(M_mgx > 0) >= ceiling(prev_mgx_frac * n_samp) +keep_mtx <- rowSums(M_mtx > 0) >= ceiling(prev_mtx_frac * n_samp) +M_mgx <- M_mgx[keep_mgx, , drop = FALSE] +M_mtx <- M_mtx[keep_mtx, , drop = FALSE] + +M_mgx <- M_mgx[, colSums(M_mgx) > 0, drop = FALSE] +M_mtx <- M_mtx[, colSums(M_mtx) > 0, drop = FALSE] + +shared2 <- intersect(colnames(M_mgx), colnames(M_mtx)) +shared2 <- sort(unique(shared2)) +set.seed(1) +if (length(shared2) > max_samples) { + shared2 <- sort(sample(shared2, max_samples)) +} +M_mgx <- M_mgx[, shared2, drop = FALSE] +M_mtx <- M_mtx[, shared2, drop = FALSE] + +M_mgx <- cap_by_var(M_mgx, cap_mgx) +M_mtx <- cap_by_var(M_mtx, cap_mtx) + +meta_df <- meta_full +se_mgx <- make_SE(M_mgx, meta_df, assay_name = "mgx") +se_mtx <- make_SE(M_mtx, meta_df, assay_name = "mtx") + +mae2 <- MultiAssayExperiment::MultiAssayExperiment( + experiments = list(MGX = se_mgx, MTX = se_mtx) +) + +if (!is.null(SummarizedExperiment::colData(se_mgx))) { + MultiAssayExperiment::colData(mae2) <- SummarizedExperiment::colData(se_mgx) +} + +# Save only one dataset object +ibdmdb_2omic_demo <- mae2 + +save( + ibdmdb_2omic_demo, + file = file.path("data", "ibdmdb_2omic_demo.rda"), + compress = "xz" +) + +message("Saved: data/ibdmdb_2omic_demo.rda") +message("== Done. Re-run devtools::document(); devtools::check(); BiocCheck::BiocCheck(). ==") \ No newline at end of file diff --git a/man/getJointRPCA.Rd b/man/getJointRPCA.Rd new file mode 100644 index 000000000..ca1b1cd79 --- /dev/null +++ b/man/getJointRPCA.Rd @@ -0,0 +1,68 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/getJointRPCA.R +\name{getJointRPCA} +\alias{getJointRPCA} +\title{Run Joint-RPCA and store embedding in reducedDim} +\arguments{ +\item{altexp}{Optional name of an alternative experiment. If supplied, +Joint-RPCA is run on \code{altExp(x, altexp)} instead of \code{x}.} + +\item{name}{Character scalar giving the name of the \code{reducedDim} slot +in which to store the joint sample embedding. Defaults to \code{"JointRPCA"}.} + +\item{x}{Input object: \code{MultiAssayExperiment}, \code{SummarizedExperiment} +(including \code{TreeSummarizedExperiment}), list of matrices, or single matrix.} + +\item{experiments}{Character vector of experiment names to extract when \code{x} +is a \code{MultiAssayExperiment} (i.e. \code{names(experiments(x))}). +If \code{NULL}, all experiments are used. 
+ +For \code{MultiAssayExperiment} inputs, \strong{one assay per experiment} is +used: by default the first assay returned by +\code{assayNames()} (or index \code{1L} if unnamed). +The actually used assay names are recorded in \code{$assay_names_used} in +the result. If you need a different assay (e.g. \code{"relab"} instead of +\code{"counts"}), subset or reorder assays in \code{x} before calling +\code{jointRPCAuniversal()}.} + +\item{transform}{Character string specifying preprocessing applied to each +input table before ordination. Use \code{"rclr"} to apply the robust CLR +transform (via \code{decostand(method = "rclr")}) or \code{"none"} to +disable transformation (data are used as-is after masking non-finite values).} + +\item{optspace.tol}{Numeric tolerance passed to \code{optspace()}.} + +\item{center}{Logical; whether to center the reconstructed low-rank matrix +(double-centering) prior to SVD/PCA steps.} + +\item{scale}{Logical; whether to scale the reconstructed matrix prior to +SVD/PCA steps. Defaults to \code{FALSE}.} + +\item{...}{Additional arguments passed to \code{.joint_rpca()}.} +} +\value{ +The input object \code{x} with a new entry in +\code{reducedDim(x, name)} containing the Joint-RPCA sample embedding. +The full Joint-RPCA result (including distances, cross-validation +statistics and transformed tables) is stored in +\code{metadata(x)$JointRPCA[[name]]}. + +Output from \code{.joint_rpca()} with extra fields when \code{x} is a +\code{MultiAssayExperiment}: +\itemize{ +\item \code{$experiment_names}: character vector of experiments used. +\item \code{$assay_names_used}: named character vector giving, for each +experiment, the assay name that was used (typically the first in +\code{assayNames()}). +} +} +\description{ +Run Joint-RPCA and store embedding in reducedDim + +Universal Joint RPCA Wrapper +} +\details{ +Convenience wrapper that runs Joint Robust PCA on one or more compositional +tables and stores the resulting sample embedding in \code{reducedDim(x, name)}, +similar to \code{runMDS()} and \code{runPCA()}. +} diff --git a/man/ibdmdb_2omic_demo.Rd b/man/ibdmdb_2omic_demo.Rd new file mode 100644 index 000000000..6d490683b --- /dev/null +++ b/man/ibdmdb_2omic_demo.Rd @@ -0,0 +1,49 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/mia.R +\docType{data} +\name{ibdmdb_2omic_demo} +\alias{ibdmdb_2omic_demo} +\title{IBDMDB 2-omic demo dataset (MGX + MTX)} +\format{ +A \code{MultiAssayExperiment} with two experiments: +\describe{ +\item{se_mgx}{A \code{SummarizedExperiment} with MGX abundance data.} +\item{se_mtx}{A \code{SummarizedExperiment} with MTX abundance data.} +\item{mae2}{A \code{MultiAssayExperiment} containing both experiments.} +} +} +\source{ +Prepared by \code{inst/scripts/prepare_ibdmdb_demo.R}. The script is for +developers and downloads publicly available IBDMDB/HMP2 taxonomic profiles +and metadata from Zenodo into a local cache (e.g. \code{tools/cache/}), +then subsets and filters the raw data to create a small example suitable +for vignettes. +} +\usage{ +data("ibdmdb_2omic_demo") +} +\description{ +A compact example derived from the Integrative Human Microbiome Project (iHMP) +Inflammatory Bowel Disease (IBD) cohort. 
+} +\details{ +The dataset \code{ibdmdb_2omic_demo} is a named list with: +\itemize{ +\item \code{se_mgx}: metagenomic taxonomic profiles (MGX) +\item \code{se_mtx}: metatranscriptomic taxonomic profiles (MTX) +\item \code{mae2}: a \link[MultiAssayExperiment]{MultiAssayExperiment} +containing both experiments +} + +These compact objects are intended for quick examples and vignettes. +Load with: +\code{data("ibdmdb_2omic_demo")}. +} +\references{ +Lloyd-Price J, Arze C, Ananthakrishnan AN, et al. +\emph{Multi-omics of the gut microbial ecosystem in inflammatory bowel +diseases.} +Nature 569, 655–662 (2019). +\doi{10.1038/s41586-019-1237-9} +} +\keyword{datasets} diff --git a/man/mia-datasets.Rd b/man/mia-datasets.Rd index 00b590f18..48dd28a1a 100644 --- a/man/mia-datasets.Rd +++ b/man/mia-datasets.Rd @@ -21,6 +21,8 @@ features and 280 samples} and 3 samples} \item{\code{\link{GlobalPatterns}}: A TreeSummarizedExperiment with 19216 features and 26 samples} +\item{\code{\link{ibdmdb_2omic_demo}}: A compact +\code{MultiAssayExperiment} demo dataset with 2 experiments (MGX + MTX)} \item{\code{\link{HintikkaXOData}}: A MultiAssayExperiment with 3 experiments (microbiota, metabolites and biomarkers)} \item{\code{\link{peerj13075}}: A TreeSummarizedExperiment with 674 diff --git a/man/mia-package.Rd b/man/mia-package.Rd index 80908b87a..b42682002 100644 --- a/man/mia-package.Rd +++ b/man/mia-package.Rd @@ -55,6 +55,7 @@ Other contributors: \item Sam Hillman [contributor] \item Jesse Pasanen [contributor] \item Eetu Tammi [contributor] + \item Aituar Bektanov [contributor] } } diff --git a/pkgdown/_pkgdown.yml b/pkgdown/_pkgdown.yml index 875ec9258..f1f2887a6 100644 --- a/pkgdown/_pkgdown.yml +++ b/pkgdown/_pkgdown.yml @@ -8,6 +8,10 @@ reference: - getMediation - getPERMANOVA +- title: Joint-RPCA + contents: + - getJointRPCA + - title: Clustering - contents: - calculateDMN @@ -39,6 +43,7 @@ reference: - peerj13075 - Tengeler2020 - Tito2024QMP + - ibdmdb_2omic_demo - title: Dataset description - contents: - getPrevalence diff --git a/tests/testthat/test-getJointRPCA.R b/tests/testthat/test-getJointRPCA.R new file mode 100644 index 000000000..07eea3a63 --- /dev/null +++ b/tests/testthat/test-getJointRPCA.R @@ -0,0 +1,306 @@ +test_that("getJointRPCA stores embedding in reducedDim and metadata", { + skip_if_not(requireNamespace("SingleCellExperiment", quietly = TRUE)) + + set.seed(123) + X <- abs(matrix(rnorm(20 * 10), nrow = 20, ncol = 10)) + rownames(X) <- paste0("f", 1:20) + colnames(X) <- paste0("s", 1:10) + + sce <- SingleCellExperiment::SingleCellExperiment( + assays = list(counts = X) + ) + + sce2 <- mia:::getJointRPCA( + sce, + name = "JointRPCA_test", + n.components = 2, + max.iterations = 2, + transform = "none", + n.test.samples = 3 + ) + + emb <- reducedDim(sce2, "JointRPCA_test") + expect_true(is.matrix(emb)) + expect_equal(nrow(emb), ncol(X)) + expect_equal(rownames(emb), colnames(X)) + + jr <- metadata(sce2)$JointRPCA[["JointRPCA_test"]] + expect_type(jr, "list") + expect_true(all(c("ord_res", "dist", "cv_stats", "rclr_tables") %in% names(jr))) +}) + +test_that("single-view .joint_rpca returns the expected structure", { + set.seed(123) + # Features x samples + X <- abs(matrix(rnorm(20 * 12), nrow = 20, ncol = 12)) + rownames(X) <- paste0("f", 1:20) + colnames(X) <- paste0("s", 1:12) + + fit <- mia:::.joint_rpca( + tables = list(assay1 = X), + n.components = 3, + max.iterations = 2, + transform = "none", + n.test.samples = 4 + ) + + # Top-level structure + expect_type(fit, "list") + 
expect_true(all(c("ord_res", "dist", "cv_stats", "rclr_tables") %in% names(fit))) + + # Ordination structure + OR <- fit$ord_res + expect_true(all(c("eigvals", "samples", "features", "proportion.explained") %in% names(OR))) + + # Samples: should include both train + projected test = all original samples + expect_equal(nrow(OR$samples), ncol(X)) + expect_equal(colnames(OR$samples), paste0("PC", 1:3)) + expect_true(all(colnames(OR$samples) %in% names(OR$proportion.explained))) + + # Features: now per-view list, even for single-view + expect_true(is.list(OR$features)) + expect_true("assay1" %in% names(OR$features)) + + F1 <- OR$features$assay1 + if (is.data.frame(F1)) F1 <- as.matrix(F1) + expect_equal(nrow(F1), nrow(X)) + expect_equal(colnames(F1), paste0("PC", 1:3)) + + # CV stats & dist + expect_true(is.data.frame(fit$cv_stats)) + expect_true(all(c("mean_CV", "std_CV", "run", "iteration") %in% names(fit$cv_stats))) + + expect_s3_class(fit$dist, "DistanceMatrix") + expect_true(is.matrix(fit$dist$data)) + expect_equal(nrow(fit$dist$data), ncol(X)) + expect_equal(colnames(fit$dist$data), colnames(X)) +}) + +test_that("multi-view .joint_rpca preserves per-view feature loadings", { + set.seed(42) + # Two views, same samples + S <- paste0("s", 1:10) + A <- abs(matrix(rnorm(25 * 10), 25, 10, + dimnames = list(paste0("a", 1:25), S))) + B <- abs(matrix(rnorm(30 * 10), 30, 10, + dimnames = list(paste0("b", 1:30), S))) + + fit <- mia:::.joint_rpca( + tables = list(MGX = A, MTX = B), + n.components = 2, + max.iterations = 2, + transform = "none", + n.test.samples = 3 + ) + + OR <- fit$ord_res + expect_true(is.list(OR$features)) + expect_true(all(c("MGX", "MTX") %in% names(OR$features))) + + # Check each view's loading matrix dimensions & rownames + MGXv <- OR$features$MGX; MTXv <- OR$features$MTX + if (is.data.frame(MGXv)) MGXv <- as.matrix(MGXv) + if (is.data.frame(MTXv)) MTXv <- as.matrix(MTXv) + + expect_equal(nrow(MGXv), nrow(A)) + expect_equal(nrow(MTXv), nrow(B)) + expect_equal(colnames(MGXv), paste0("PC", 1:2)) + expect_equal(colnames(MTXv), paste0("PC", 1:2)) + expect_true(all(rownames(MGXv) %in% rownames(A))) + expect_true(all(rownames(MTXv) %in% rownames(B))) + + # Samples include all training + projected test + expect_equal(nrow(OR$samples), length(S)) + expect_equal(colnames(OR$samples), paste0("PC", 1:2)) +}) + +test_that("unshared samples are dropped with a warning and alignment is correct", { + set.seed(7) + S1 <- paste0("s", 1:8) + S2 <- c(paste0("s", 3:10)) + A <- abs(matrix(rnorm(20 * length(S1)), 20, length(S1), + dimnames = list(paste0("a", 1:20), S1))) + B <- abs(matrix(rnorm(15 * length(S2)), 15, length(S2), + dimnames = list(paste0("b", 1:15), S2))) + + expect_warning( + fit <- mia:::.joint_rpca( + tables = list(MGX = A, MTX = B), + n.components = 2, + max.iterations = 2, + transform = "none", + n.test.samples = 2 + ), + regexp = "Removing.*sample\\(s\\).*overlap", + all = FALSE + ) + + OR <- fit$ord_res + expect_equal(nrow(OR$samples), 6) +}) + +test_that("projection of new samples via .transform appends rows and keeps component names", { + set.seed(99) + S_train <- paste0("s", 1:8) + featsA <- paste0("a", 1:18) + featsB <- paste0("b", 1:22) + + A <- abs(matrix(rnorm(length(featsA) * length(S_train)), length(featsA), length(S_train), + dimnames = list(featsA, S_train))) + B <- abs(matrix(rnorm(length(featsB) * length(S_train)), length(featsB), length(S_train), + dimnames = list(featsB, S_train))) + + fit <- mia:::.joint_rpca( + tables = list(MGX = A, MTX = B), + n.components = 
3, + max.iterations = 2, + transform = "none", + n.test.samples = 3 + ) + + OR <- fit$ord_res + n_before <- nrow(OR$samples) + + # Create new samples (same features, new sample IDs) + S_new <- c("s9", "s10") + A_new <- abs(matrix(rnorm(length(featsA) * length(S_new)), length(featsA), length(S_new), + dimnames = list(featsA, S_new))) + B_new <- abs(matrix(rnorm(length(featsB) * length(S_new)), length(featsB), length(S_new), + dimnames = list(featsB, S_new))) + + # Project new samples + OR2 <- mia:::.transform( + ordination = OR, + tables = list(MGX = A_new, MTX = B_new), + apply.rclr = FALSE + ) + + expect_true(is.list(OR2)) + expect_true(all(c("samples", "features", "eigvals", "proportion.explained") %in% names(OR2))) + expect_equal(colnames(OR2$samples), colnames(OR$samples)) + expect_true(all(S_new %in% rownames(OR2$samples))) + expect_equal(nrow(OR2$samples), n_before + length(S_new)) +}) + +test_that("errors surface for duplicated sample IDs during preprocessing", { + set.seed(101) + X <- abs(matrix(rpois(24, lambda = 5), 6, 4)) + rownames(X) <- paste0("f", 1:6) + colnames(X) <- c("s1", "s1", "s2", "s3") + + expect_error( + mia:::.joint_rpca( + tables = list(assay1 = X), + n.components = 2, + max.iterations = 2, + transform = "none", + n.test.samples = 2 + ), + regexp = "duplicate sample \\(column\\) IDs", + ignore.case = TRUE + ) +}) + +test_that("jointRPCAuniversal works on MultiAssayExperiment", { + set.seed(2025) + + # Two views, same samples + S <- paste0("s", 1:8) + MGX_mat <- abs(matrix( + rnorm(15 * length(S)), + nrow = 15, ncol = length(S), + dimnames = list(paste0("mgx", 1:15), S) + )) + MTX_mat <- abs(matrix( + rnorm(10 * length(S)), + nrow = 10, ncol = length(S), + dimnames = list(paste0("mtx", 1:10), S) + )) + + se_mgx <- SummarizedExperiment::SummarizedExperiment( + assays = list(counts = MGX_mat) + ) + se_mtx <- SummarizedExperiment::SummarizedExperiment( + assays = list(counts = MTX_mat) + ) + + mae2 <- MultiAssayExperiment::MultiAssayExperiment( + experiments = list(MGX = se_mgx, MTX = se_mtx) + ) + + fit <- mia:::jointRPCAuniversal( + x = mae2, + experiments = c("MGX", "MTX"), + n.components = 2, + max.iterations = 2, + transform = "none", + n.test.samples = 3 + ) + + # Basic structure + expect_type(fit, "list") + expect_true(all(c("ord_res", "dist", "cv_stats", "rclr_tables") %in% names(fit))) + + # rclr_tables should be named by experiment + expect_true(is.list(fit$rclr_tables)) + expect_equal(names(fit$rclr_tables), c("MGX", "MTX")) + + # Per-view feature loadings should also carry MGX/MTX names + OR <- fit$ord_res + expect_true(is.list(OR$features)) + expect_true(all(c("MGX", "MTX") %in% names(OR$features))) + + MGXv <- OR$features$MGX + MTXv <- OR$features$MTX + if (is.data.frame(MGXv)) MGXv <- as.matrix(MGXv) + if (is.data.frame(MTXv)) MTXv <- as.matrix(MTXv) + + expect_equal(nrow(MGXv), nrow(MGX_mat)) + expect_equal(nrow(MTXv), nrow(MTX_mat)) + expect_equal(colnames(MGXv), paste0("PC", 1:2)) + expect_equal(colnames(MTXv), paste0("PC", 1:2)) +}) + +test_that("jointRPCAuniversal uses default assays per experiment in a MAE and records them", { + skip_if_not_installed("MultiAssayExperiment") + skip_if_not_installed("SummarizedExperiment") + + S <- paste0("s", 1:6) + A1_counts <- matrix(abs(rnorm(10 * 6)), nrow = 10, + dimnames = list(paste0("a", 1:10), S)) + A1_alt <- A1_counts * 2 + + A2_counts <- matrix(abs(rnorm(8 * 6)), nrow = 8, + dimnames = list(paste0("b", 1:8), S)) + A2_alt <- A2_counts * 3 + + se1 <- SummarizedExperiment::SummarizedExperiment( + assays = 
list(counts = A1_counts, alt = A1_alt) + ) + se2 <- SummarizedExperiment::SummarizedExperiment( + assays = list(counts = A2_counts, alt = A2_alt) + ) + + mae <- MultiAssayExperiment::MultiAssayExperiment( + experiments = list(MGX = se1, MTX = se2) + ) + + fit <- mia:::jointRPCAuniversal( + x = mae, + experiments = c("MGX", "MTX"), + n.components = 2, + max.iterations = 2, + transform = "none", + n.test.samples = 2 + ) + + # Basic structure + expect_true(is.list(fit)) + expect_true(all(c("MGX", "MTX") %in% names(fit$rclr_tables))) + + expect_true(all(c("experiment_names", "assay_names_used") %in% names(fit))) + expect_equal(fit$experiment_names, c("MGX", "MTX")) + expect_named(fit$assay_names_used, c("MGX", "MTX")) + expect_equal(fit$assay_names_used[["MGX"]], "counts") + expect_equal(fit$assay_names_used[["MTX"]], "counts") +}) \ No newline at end of file
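
For readers of this patch, here is a minimal end-to-end sketch of the workflow the tests above exercise, assuming the package built from this branch is installed. The reducedDim name "JointRPCA_demo" and the simulated matrix are illustrative only; the call mirrors the first test case rather than defining the canonical usage.

    library(SingleCellExperiment)
    library(mia)

    # Simulate a small positive matrix: 20 features x 10 samples
    set.seed(1)
    X <- abs(matrix(rnorm(20 * 10), nrow = 20,
                    dimnames = list(paste0("f", 1:20), paste0("s", 1:10))))
    sce <- SingleCellExperiment(assays = list(counts = X))

    # Run Joint-RPCA and store the sample embedding under "JointRPCA_demo"
    sce <- getJointRPCA(sce, name = "JointRPCA_demo",
                        n.components = 2, max.iterations = 2,
                        transform = "none", n.test.samples = 3)

    # Sample scores live in reducedDim(); the full result (ord_res, dist,
    # cv_stats, rclr_tables) is kept in metadata()
    head(reducedDim(sce, "JointRPCA_demo"))
    names(metadata(sce)$JointRPCA[["JointRPCA_demo"]])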