From a04d77d865a12e1765a9e468405dbb934e23a1fc Mon Sep 17 00:00:00 2001 From: Laura19993 Date: Wed, 19 Mar 2025 13:36:43 +0100 Subject: [PATCH 1/6] Added functionality for Parse and 10x FLEX and updated cellbender functions to work with the newest cellbender release --- .gitignore | 4 +- R/CRMetrics.R | 179 +++++++++++++++++++++++++---- R/inner_functions.R | 268 +++++++++++++++++++++++++++++++++++++++----- 3 files changed, 399 insertions(+), 52 deletions(-) diff --git a/.gitignore b/.gitignore index f357572..f180aac 100644 --- a/.gitignore +++ b/.gitignore @@ -27,9 +27,11 @@ vignettes/*.pdf # R Environment Variables .Renviron # Project files -.Rproj +*.Rproj cran-comments.md .Rbuildignore CRMetrics.Rproj CRAN-SUBMISSION .github/ +# test data +control_6 diff --git a/R/CRMetrics.R b/R/CRMetrics.R index 81d24f3..5a11792 100644 --- a/R/CRMetrics.R +++ b/R/CRMetrics.R @@ -1165,7 +1165,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, #' @param n.cores integer Number of cores for the calculations (default = self$n.cores). #' @param arg.buildGraph list A list with additional arguments for the `buildGraph` function in Conos (default = list()) #' @param arg.findCommunities list A list with additional arguments for the `findCommunities` function in Conos (default = list()) - #' @param arg.embedGraph list A list with additional arguments for the `embedGraph` function in Conos (default = list(method = "UMAP)) + #' @param arg.embedGraph list A list with additional arguments for the `embedGraph` function in Conos (default = list(method = "UMAP")) #' @return Conos object #' @examples #' \donttest{ @@ -1706,14 +1706,23 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, if (is.null(total.droplets)) total.droplets <- self$getTotalDroplets(samples) # Read CMs from HDF5 files + + #add parse case, use readParse function if (!is.null(cms.raw)) { - if (verbose) message(paste0(Sys.time()," Using stored HDF5 Cell Ranger outputs. To overwrite, set $cms.raw <- NULL")) + if (verbose) message(paste0(Sys.time()," Using stored HDF5 Cell Ranger/Parse outputs. To overwrite, set $cms.raw <- NULL")) } else { - if (verbose) message(paste0(Sys.time()," Loading HDF5 Cell Ranger outputs")) + checkDataPath(data.path) + + if (grepl("combined", data.path)) { + if (verbose) message("Loading Parse outputs") + cms.raw <- readParse(data.path, samples, "raw", n.cores = n.cores, verbose = verbose, unique.names = unique.names, sep = sep) + } else { + if (verbose) message(paste0(Sys.time()," Loading HDF5 Cell Ranger outputs")) cms.raw <- read10xH5(data.path, samples, "raw", n.cores = n.cores, verbose = verbose, unique.names = unique.names, sep = sep) - self$cms.raw <- cms.raw - } + } + self$cms.raw <- cms.raw + } # Get UMI counts if (!is.null(umi.counts)) { @@ -1760,15 +1769,54 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, g <- g + facet_wrap(~ sample, scales = "free") if (verbose) message(paste0(Sys.time()," Done!")) + + + # transform parse output to work as input for cellbender + if (grepl("combined", data.path)) { + if (verbose) message("Transforming Parse outputs as preparation for cellbender") + # check if transformed data already exists + full.path <- data.path %>% + pathsToList(samples) %>% + sapply(\(sample) { + paste0(sample[2],"/",sample[1],"/DGE_unfiltered") + }) + lapply(full.path, function(x) { + + # 1. Check and create genes.tsv + if (!file.exists(paste0(x,"/","genes.tsv"))) { + if(verbose) message("Creating genes.tsv in ", x) + fread(paste0(x,"/","all_genes.csv"), sep = ",", header = TRUE) %>% + mutate(genome = "Gene Expression") %>% + fwrite(paste0(x,"/","genes.tsv"), sep = "\t", col.names = FALSE) + } + #2. Check and create barcodes.tsv + if (!file.exists(paste0(x,"/","barcodes.tsv"))) { + if(verbose) message("Creating barcodes.tsv in ", x) + fread(paste0(x,"/","cell_metadata.csv"), sep = ",", header = TRUE) %>% + .[, "bc_wells"] %>% + write.table(paste0(x,"/","barcodes.tsv"), row.names=F, col.names=F) + } + #2. Check and create matrix.mtx + if (!file.exists(paste0(x,"/","matrix.mtx"))) { + if(verbose) message("Creating matrix.mtx in ", x) + fread(paste0(full.path[1],"/","count_matrix.mtx"), header = F) %>% + select(V2, V1, V3) %>% + fwrite(paste0(full.path[1],"/","matrix.mtx"), col.names=F) + } + }) + } + # return the plot object return(g) }, + + #' @param file character File name for CellBender script. Will be stored in `data.path` (default: "cellbender_script.sh") #' @param fpr numeric False positive rate for CellBender (default = 0.01) #' @param epochs integer Number of epochs for CellBender (default = 150) #' @param use.gpu logical Use CUDA capable GPU (default = TRUE) - #' @param expected.cells named numeric If NULL, expected cells will be deduced from the number of cells per sample identified by Cell Ranger. Otherwise, a named vector of expected cells with sample IDs as names. Sample IDs must match those in summary.metrics (default: stored named vector) - #' @param total.droplets named numeric If NULL, total droplets included will be deduced from expected cells multiplied by 3. Otherwise, a named vector of total droplets included with sample IDs as names. Sample IDs must match those in summary.metrics (default: stored named vector) + #' @param expected.cells logical or named numeric vector If FALSE, the --expected-cells argument is omitted in the generated script.If TRUE, expected cells will be deduced from the number of cells per sample identified by Cell Ranger/Parse. Otherwise, a named vector of expected cells with sample IDs as names. Sample IDs must match those in summary.metrics (default: FALSE) + #' @param total.droplets logical or named numeric vector If FALSE, the --total.droplets argument is omitted in the generated script.If TRUE, total droplets included will be deduced from expected cells multiplied by 3. Otherwise, a named vector of total droplets included with sample IDs as names. Sample IDs must match those in summary.metrics (default: FALSE) #' @param data.path character Path to Cell Ranger outputs (default = self$data.path) #' @param samples character Sample names to include (default = self$metadata$sample) #' @param args character (optional) Additional parameters for CellBender @@ -1783,26 +1831,84 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, fpr = 0.01, epochs = 150, use.gpu = TRUE, - expected.cells = NULL, - total.droplets = NULL, + expected.cells = FALSE, + total.droplets = FALSE, data.path = self$data.path, samples = self$metadata$sample, args = NULL) { # Preparations checkDataPath(data.path) - inputs <- getH5Paths(data.path, samples, "raw") + + if (grepl("combined", data.path)) { + # in case of parse we use the genes, matrix, barcodes files stored in DGE_unfiltered and therefor we just need to give the path to DGE_unfiltered + inputs <- data.path %>% + pathsToList(samples) %>% + sapply(\(sample) { + paste0(sample[2], "/", sample[1], "/DGE_unfiltered")}) + + #check that transformed files exists for parse + lapply(inputs, function(x) { + + # 1. Check genes.tsv + if (!file.exists(paste0(x,"/","genes.tsv"))) { + message("genes.tsv missing in ", x, " run prepareCellbender first.") + + } + #2. Check barcodes.tsv + if (!file.exists(paste0(x,"/","barcodes.tsv"))) { + message("barcodes.tsv missing in ", x, " run prepareCellbender first.") + } + + #2. Check matrix.mtx + if (!file.exists(paste0(x,"/","matrix.mtx"))) { + message("matrix.mtx missing in ", x, " run prepareCellbender first.") + } + }) + + } else { + inputs <- getH5Paths(data.path, samples, "raw") + } + outputs <- data.path %>% pathsToList(samples) %>% - sapply(\(sample) paste0(sample[2],sample[1],"/outs/cellbender.h5")) %>% + sapply(\(sample) { + if (grepl("combined", data.path)) { + paste0(sample[2], "/", sample[1], "/DGE_unfiltered/cellbender.h5") + } else if (grepl("per_sample_outs", data.path)) { + paste0(sample[2], "/", sample[1], "/count/cellbender.h5") + } else { + paste0(sample[2], "/", sample[1], "/outs/cellbender.h5") + } + }) %>% setNames(samples) - if (is.null(expected.cells)) expected.cells <- self$getExpectedCells(samples) - if (is.null(total.droplets)) total.droplets <- self$getTotalDroplets(samples) + if (isTRUE(expected.cells)) { + expected.cells <- self$getExpectedCells(samples) + } else if (isFALSE(expected.cells)) { + expected.cells <- NULL + } else if (is.vector(expected.cells) && is.null(names(expected.cells))) { + stop("If expected.cells is a vector, it must be named with sample names.") + } + + if (isTRUE(total.droplets)) { + total.droplets <- self$getTotalDroplets(samples) + } else if (isFALSE(total.droplets)) { + total.droplets <- NULL + } else if (is.vector(total.droplets) && is.null(names(total.droplets))) { + stop("If total.droplets is a vector, it must be named with sample names.") + } # Create CellBender shell scripts script.list <- samples %>% lapply(\(sample) { - paste0("cellbender remove-background --input ",inputs[sample]," --output ",outputs[sample],if (use.gpu) c(" --cuda ") else c(" "),"--expected-cells ",expected.cells[sample]," --total-droplets-included ",total.droplets[sample]," --fpr ",fpr," --epochs ",epochs," ",if (!is.null(args)) paste(args, collapse = " ")) + paste0("cellbender remove-background --input ",inputs[sample], + " --output ",outputs[sample], + if (use.gpu) c(" --cuda ") else c(" "), + if (!is.null(expected.cells)) paste0("--expected-cells ", expected.cells[sample], " ") else c(" "), + if (!is.null(total.droplets)) paste0("--total-droplets-included ", total.droplets[sample], " ") else c(" "), + " --fpr ",fpr, + " --epochs ",epochs, + " ",if (!is.null(args)) paste(args, collapse = " ")) }) out <- list("#! /bin/sh", script.list) %>% @@ -1832,9 +1938,12 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, #' #' # Get no. cells #' crm$getExpectedCells() + #' + + #included parse and 10x flex nomenclature getExpectedCells = function(samples = self$metadata$sample) { expected.cells <- self$summary.metrics %>% - filter(metric == "estimated number of cells") %$% + filter(metric %in% c("estimated number of cells", "number_of_cells", "cells")) %$% setNames(value, sample) %>% .[samples] @@ -1877,7 +1986,9 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, #' @param data.path character Path to cellranger count data (default = self$data.path). #' @param samples character Vector of sample names. If NULL, samples are extracted from cms (default = self$metadata$sample) #' @param cellbender logical Add CellBender filtered count matrices in HDF5 format. Requires that "cellbender" is in the names of the files (default = FALSE) - #' @param raw logical Add raw count matrices from Cell Ranger output. Cannot be combined with `cellbender=TRUE` (default = FALSE) + #' @param parse logical Add Parse filtered count matrices. Cannot be combined with `cellbender=TRUE`. Cannot be combined with `raw=TRUE`. (default = FALSE) + #' @param flex logical Add 10x Flex filtered count matrices from cell ranger. Can be combined with `raw=TRUE`. Cannot be combined with `cellbender=TRUE`. (default = FALSE) + #' @param raw logical Add raw count matrices from Cell Ranger output. Cannot be combined with `cellbender=TRUE`. (default = FALSE) #' @param symbol character The type of gene IDs to use, SYMBOL (TRUE) or ENSEMBLE (default = TRUE) #' @param unique.names logical Make cell names unique based on `sep` parameter (default = TRUE) #' @param sep character Separator used to create unique cell names (default = "!!") @@ -1904,6 +2015,8 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, data.path = self$data.path, samples = self$metadata$sample, cellbender = FALSE, + flex = FALSE, + parse = FALSE, raw = FALSE, symbol = TRUE, unique.names = TRUE, @@ -1945,9 +2058,17 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, if (unique.names) cms %<>% createUniqueCellNames(samples, sep) } else { # Add from data.path argument + if (parse && cellbender) { + stop("Error: If you run CellBender on Parse files, 'parse' must be set to FALSE and only 'cellbender' set to TRUE.") + } if (cellbender) { cms <- read10xH5(data.path = data.path, samples = samples, symbol = symbol, type = "cellbender_filtered", sep = sep, n.cores = n.cores, verbose = verbose, unique.names = unique.names) - } else { + } else if (parse) { + cms <- readParse(data.path = data.path, samples = samples, sep = sep, n.cores = n.cores, verbose = verbose, unique.names = unique.names) + } else if (flex) { + cms <- readFlex(data.path = data.path, samples = samples, raw = raw, symbol = symbol, sep = sep, n.cores = n.cores, verbose = verbose, unique.names = unique.names) + } + else { cms <- read10x(data.path = data.path, samples = samples, raw = raw, symbol = symbol, sep = sep, n.cores = n.cores, verbose = verbose, unique.names = unique.names) } } @@ -1989,7 +2110,9 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, train.df <- samples %>% lapply(\(id) { - rhdf5::h5read(paths[id], "matrix/training_elbo_per_epoch") %>% + # rhdf5::h5read(paths[id], "matrix/training_elbo_per_epoch") %>% + #for cellbender v0.3.0 + rhdf5::h5read(paths[id], "metadata/learning_curve_train_elbo") %>% {data.frame(ELBO = ., Epoch = seq_len(length(.)), sample = id)} @@ -2000,8 +2123,12 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, test.df <- samples %>% lapply(\(id) { path <- paths[id] - data.frame(ELBO = rhdf5::h5read(path, "matrix/test_elbo"), - Epoch = rhdf5::h5read(path, "matrix/test_epoch"), + # data.frame(ELBO = rhdf5::h5read(path, "matrix/test_elbo"), + # Epoch = rhdf5::h5read(path, "matrix/test_epoch"), + # sample = id) + #for cellbender v0.3.0 + data.frame(ELBO = rhdf5::h5read(path, "metadata/learning_curve_test_elbo"), + Epoch = rhdf5::h5read(path, "metadata/learning_curve_test_epoch"), sample = id) }) %>% setNames(samples) %>% @@ -2045,7 +2172,9 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, cell.prob <- samples %>% lapply(\(id) { - rhdf5::h5read(paths[id], "matrix/latent_cell_probability") %>% + #rhdf5::h5read(paths[id], "matrix/latent_cell_probability") %>% + # for cellbender v0.3.0 + rhdf5::h5read(paths[id], "droplet_latents/cell_probability") %>% {data.frame(prob = ., cell = seq_len(length(.)), sample = id)} @@ -2083,7 +2212,9 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, amb <- samples %>% lapply(\(id) { - rhdf5::h5read(paths[id], "matrix/ambient_expression") %>% + # rhdf5::h5read(paths[id], "matrix/ambient_expression") %>% + #for cellbender version 3 + rhdf5::h5read(paths[id], "global_latents/ambient_expression") %>% {data.frame(exp = ., cell = seq_len(length(.)), gene.names = rhdf5::h5read(paths[id], "matrix/features/name") %>% as.character(), @@ -2127,7 +2258,9 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, amb <- samples %>% lapply(\(id) { - rhdf5::h5read(paths[id], "matrix/ambient_expression") %>% + #rhdf5::h5read(paths[id], "matrix/ambient_expression") %>% + #for cellbender version 3 + rhdf5::h5read(paths[id], "global_latents/ambient_expression") %>% {data.frame(exp = ., cell = seq_len(length(.)), gene.names = rhdf5::h5read(paths[id], "matrix/features/name") %>% as.character(), diff --git a/R/inner_functions.R b/R/inner_functions.R index ed72edd..8935513 100644 --- a/R/inner_functions.R +++ b/R/inner_functions.R @@ -111,6 +111,152 @@ read10x <- function(data.path, return(tmp) } +#' @title Load 10x flex count matrices +#' @description Load gene expression count data +#' @param data.path Path to cellranger count data. +#' @param samples Vector of sample names (default = NULL) +#' @param raw logical Add raw count matrices (default = FALSE) +#' @param symbol The type of gene IDs to use, SYMBOL (TRUE) or ENSEMBLE (default = TRUE). +#' @param sep Separator for cell names (default = "!!"). +#' @param n.cores Number of cores for the calculations (default = 1). +#' @param verbose Print messages (default = TRUE). +#' @keywords internal +#' @return data frame +#' @examples +#' \dontrun{ +#' cms <- readFlex(data.path = "/path/to/count/data", +#' samples = crm$metadata$samples, +#' raw = FALSE, +#' symbol = TRUE, +#' n.cores = crm$n.cores) +#' } +#' @export +readFlex <- function(data.path, + samples = NULL, + raw = FALSE, + symbol = TRUE, + sep = "!!", + unique.names = TRUE, + n.cores = 1, + verbose = TRUE) { + checkPackageInstalled("data.table", cran = TRUE) + if (is.null(samples)) samples <- list.dirs(data.path, full.names = FALSE, recursive = FALSE) + + full.path <- data.path %>% + pathsToList(samples) %>% + sapply(\(sample) { + if (raw) pat <- glob2rx("sample_raw_*_bc_matri*") else pat <- glob2rx("sample_filtered_*_bc_matri*") + dir(paste(sample[2],sample[1],"count", sep = "/"), pattern = pat, full.names = TRUE) %>% + .[!grepl(".h5", .)] + }) + + if (verbose) message(paste0(Sys.time()," Loading ",length(full.path)," count matrices using ", if (n.cores > length(full.path)) length(full.path) else n.cores," cores")) + tmp <- full.path %>% + plapply(\(sample) { + tmp.dir <- dir(sample, full.names = TRUE) + + # Read matrix + mat.path <- tmp.dir %>% + .[grepl("mtx", .)] + if (grepl("gz", mat.path)) { + mat <- as(Matrix::readMM(gzcon(file(mat.path, "rb"))), "CsparseMatrix") + } else { + mat <- as(Matrix::readMM(mat.path), "CsparseMatrix") + } + + # Add features + feat <- tmp.dir %>% + .[grepl(ifelse(any(grepl("features.tsv", .)),"features.tsv","genes.tsv"), .)] %>% + data.table::fread(header = FALSE) + if (symbol) rownames(mat) <- feat %>% pull(V2) else rownames(mat) <- feat %>% pull(V1) + + # Add barcodes + barcodes <- tmp.dir %>% + .[grepl("barcodes.tsv", .)] %>% + data.table::fread(header = FALSE) + colnames(mat) <- barcodes %>% pull(V1) + return(mat) + }, n.cores = n.cores) %>% + setNames(samples) + + if (unique.names) tmp %<>% createUniqueCellNames(samples, sep) + + if (verbose) message(paste0(Sys.time()," Done!")) + + return(tmp) +} + +#' @title Load filtered Parse count matrices +#' @description Load gene expression count data +#' @param data.path Path to Parse count data. +#' @param samples Vector of sample names (default = NULL) +#' @param raw logical Add raw count matrices (default = FALSE) +#' @param sep Separator for cell names (default = "!!"). +#' @param n.cores Number of cores for the calculations (default = 1). +#' @param verbose Print messages (default = TRUE). +#' @keywords internal +#' @return data frame +#' +#' @export +readParse <- function(data.path, + samples = NULL, + raw = FALSE, + sep = "!!", + unique.names = TRUE, + n.cores = 1, + verbose = TRUE) { + + checkPackageInstalled("data.table", cran = TRUE) + checkPackageInstalled("Matrix", cran = TRUE) + + # If no samples provided, infer from directories + if (is.null(samples)) samples <- list.dirs(data.path, full.names = FALSE, recursive = FALSE) + + full.path <- data.path %>% + pathsToList(samples) %>% + sapply(\(sample) { + if (isFALSE(raw)) { + paste0(sample[2],"/",sample[1],"/", "DGE_filtered", sep = "/") + } else paste0(sample[2],"/",sample[1],"/", "DGE_unfiltered", sep = "/") + }) + + if (verbose) message(paste0(Sys.time()," Loading ",length(full.path)," count matrices using ", if (n.cores > length(full.path)) length(full.path) else n.cores," cores")) + tmp <- full.path %>% + plapply(\(sample) { + tmp.dir <- dir(sample, full.names = TRUE) + + # Read matrix, attention, in case of raw there can be the case that we have 2 .mtx because for cellbender we need to create a new one + mat.path <- tmp.dir %>% + .[grepl("count_matrix.mtx", .)] + mat <- as(Matrix::readMM(mat.path), "CsparseMatrix") + # Transpose the matrix to have genes in rows and cells in columns + mat <- t(mat) + + # Add genes + genes <- tmp.dir %>% + .[grepl("all_genes.csv", .)] %>% + data.table::fread(header = T) + rownames(mat) <- genes$gene_name + + # Add barcodes + barcodes <- tmp.dir %>% + .[grepl("cell_metadata.csv", .)] %>% + data.table::fread(header = T) + colnames(mat) <- barcodes$bc_wells + + return(mat) + }, n.cores = n.cores) %>% + setNames(samples) + + + # Make cell names unique across samples if needed + if (unique.names) tmp %<>% createUniqueCellNames(samples, sep) + + if (verbose) message(paste0(Sys.time(), " Done!")) + + return(tmp) +} + #' @title Add detailed metrics #' @description Add detailed metrics, requires to load raw count matrices using pagoda2. #' @param cms List containing the count matrices. @@ -263,26 +409,71 @@ addSummaryMetrics <- function(data.path, metrics <- data.path %>% pathsToList(metadata$sample) %>% plapply(\(s) { - tmp <- read.table(dir(paste(s[2],s[1],"outs", sep = "/"), glob2rx("*ummary.csv"), full.names = TRUE), header = TRUE, sep = ",", colClasses = numeric()) %>% - mutate(., across(.cols = grep("%", .), - ~ as.numeric(gsub("%", "", .x)) / 100), - across(.cols = grep(",", .), - ~ as.numeric(gsub(",", "", .x)))) - # Take into account multiomics - if ("Sample.ID" %in% colnames(tmp)) tmp %<>% select(-c("Sample.ID","Genome","Pipeline.version")) - - tmp %>% - mutate(sample = s[1]) %>% - pivot_longer(cols = -c(sample), - names_to = "metric", - values_to = "value") %>% + # define possible directories to search for summary files (outs in case of Cell Ranger, report in case of Parse, sample base dir in case of 10X flex) + outs_dir <- paste(s[2], s[1], "outs", sep = "/") + report_dir <- paste(s[2], s[1], "report", sep = "/") + sample_base_dir <- paste(s[2], s[1], sep = "/") + + # Check which folder exists and use it to read the summary file + dir_to_use <- if (dir.exists(outs_dir)) outs_dir else if (dir.exists(report_dir)) report_dir else sample_base_dir + + if (!dir.exists(dir_to_use)) { + warning(paste("No 'outs', 'report', or summary file found for sample:", s[1])) + return(NULL) + } + + # Read the summary file differently based on the folder type + tmp <- NULL + if (dir_to_use == outs_dir) { + # Logic for 'outs' folder + tmp <- read.table(dir(dir_to_use, glob2rx("*ummary.csv"), full.names = TRUE), header = TRUE, sep = ",", colClasses = numeric()) %>% + mutate(., across(.cols = grep("%", .), ~ as.numeric(gsub("%", "", .x)) / 100), + across(.cols = grep(",", .), ~ as.numeric(gsub(",", "", .x)))) + + # Take into account multiomics + if ("Sample.ID" %in% colnames(tmp)) tmp %<>% select(-c("Sample.ID","Genome","Pipeline.version")) + + + # Add the sample column, pivot to long format, and clean metric names + tmp %<>% + mutate(sample = s[1]) %>% + pivot_longer(cols = -c(sample), + names_to = "metric", + values_to = "value") %>% mutate(metric = metric %>% gsub(".", " ", ., fixed = TRUE) %>% tolower()) - }, n.cores = n.cores) %>% - bind_rows() - if (verbose) message(paste0(Sys.time()," Done!")) - return(metrics) -} + } + + else if (dir_to_use == report_dir) { + + # Logic for 'report' folder (adjust this based on the structure of files in 'report') + tmp <- read.table(dir(dir_to_use, glob2rx("*ummary.csv"), full.names = TRUE), + header = TRUE, sep = ",", stringsAsFactors = FALSE) %>% + select(statistic, combined) %>% # Only select 'statistic' and 'combined' columns + rename(metric = statistic, value = combined) %>% # Rename columns + mutate(sample = s[1]) # Add sample column + } + + if (is.null(tmp)) { + # in case of 10X Flex summary metrics is not in outs nor report directory + tmp <- read.table(dir(sample_base_dir, glob2rx("*ummary.csv"), full.names = TRUE), + header = TRUE, sep = ",", stringsAsFactors = FALSE) %>% + select(Metric.Name, Metric.Value) %>% # Only select 'Metric Name' and 'Metric Value' columns + rename(metric = Metric.Name, value = Metric.Value) %>% + slice(1:12) %>% + mutate(value = gsub(",", "", value), # Remove commas + value = ifelse(grepl("%", value), as.numeric(gsub("%", "", value)) / 100, as.numeric(value))) %>% # Handle percentages + mutate(sample = s[1]) %>% # Add sample column + mutate(metric = metric %>% gsub(".", " ", ., fixed = TRUE) %>% tolower()) + } + + return(tmp) +}, n.cores = n.cores) %>% + bind_rows() #combine all data frames +if (verbose) message(paste0(Sys.time()," Done!")) +return(metrics) +} + #' @title Plot the data as points, as bars as a histogram, or as a violin #' @description Plot the data as points, barplot, histogram or violin @@ -375,7 +566,7 @@ labelsFilter <- function(filter.data) { return(tmp) } -#' @title Read 10x HDF5 files +#' @title Read 10x or cellbenders HDF5 files #' @param data.path character #' @param samples character vector, select specific samples for processing (default = NULL) #' @param type name of H5 file to search for, "raw" and "filtered" are Cell Ranger count outputs, "cellbender" is output from CellBender after running script from saveCellbenderScript @@ -467,9 +658,9 @@ createUniqueCellNames <- function(cms, #' @title Get H5 file paths #' @description Get file paths for H5 files -#' @param data.path character Path for directory containing sample-wise directories with Cell Ranger count outputs +#' @param data.path character Path for directory containing sample-wise directories with Cell Ranger count/Parse outputs #' @param samples character Sample names to include (default = NULL) -#' @param type character Type of H5 files to get paths for, one of "raw", "filtered" (Cell Ranger count outputs), "cellbender" (raw CellBender outputs), "cellbender_filtered" (CellBender filtered outputs) (default = "type") +#' @param type character Type of H5 files to get paths for, one of "raw", "filtered" (Cell Ranger count outputs, does not work for Parse), "cellbender" (raw CellBender outputs), "cellbender_filtered" (CellBender filtered outputs) (default = "type") #' @keywords internal getH5Paths <- function(data.path, samples = NULL, @@ -479,14 +670,32 @@ getH5Paths <- function(data.path, tolower() %>% match.arg(c("raw","filtered","cellbender","cellbender_filtered")) + + # Determine which folder to use based on whether 'combined' (parse) is in the data path or 'per_sample_outs' (10x flex) is in the data path + use_dge_unfiltered <- grepl("combined", data.path) + use_count <- grepl("per_sample_outs", data.path) + + # Get the appropriate folder path for all samples (helper function) + get_folder_path <- function(sample) { + if (use_dge_unfiltered) { + return(paste0(sample[2], "/", sample[1], "/DGE_unfiltered")) + } else if (use_count) { + return(paste0(sample[2], "/", sample[1], "/count")) + } else { + return(paste0(sample[2], "/", sample[1], "/outs")) + } + } + + # Get H5 paths paths <- data.path %>% pathsToList(samples) %>% sapply(\(sample) { + folder <- get_folder_path(sample) if (grepl("cellbender", type)) { - paste0(sample[2],"/",sample[1],"/outs/",type,".h5") + paste0(folder,"/",type,".h5") } else { - dir(paste0(sample[2],sample[1],"/outs"), glob2rx(paste0(type,"*.h5")), full.names = TRUE) + dir(paste0(folder), glob2rx(paste0("*", type,"*.h5")), full.names = TRUE) } }) %>% setNames(samples) @@ -500,12 +709,13 @@ getH5Paths <- function(data.path, miss <- miss.names %>% sapply(\(sample) { + folder <- get_folder_path(sample) if (type == "raw") { - paste0(data.path,sample,"/outs/raw_[feature/gene]_bc_matrix.h5") + paste0(folder, "/raw_[feature/gene]_bc_matrix.h5") } else if (type == "filtered") { - paste0(data.path,sample,"/outs/filtered_[feature/gene]_bc_matrix.h5") + paste0(folder,"/filtered_[feature/gene]_bc_matrix.h5") } else { - paste0(data.path,sample,"/outs/",type,".h5") + paste0(folder,"/",type,".h5") } }) %>% setNames(miss.names) @@ -569,6 +779,8 @@ checkDataPath <- function(data.path) { if (is.null(data.path)) stop("'data.path' cannot be NULL.") } + +# this fucntion is wired misses a title and so on... pathsToList <- function(data.path, samples) { data.path %>% lapply(\(path) list.dirs(path, recursive = F, full.names = F) %>% @@ -577,5 +789,5 @@ pathsToList <- function(data.path, samples) { bind_rows() %>% t() %>% data.frame() %>% - as.list() -} \ No newline at end of file + as.list() +} From 8a63533e415b68dd98fa5c602c920c3a79867309 Mon Sep 17 00:00:00 2001 From: Laura19993 Date: Fri, 11 Jul 2025 15:17:03 +0200 Subject: [PATCH 2/6] Fixes_nr_1 --- .gitignore | 4 ++ R/CRMetrics.R | 135 ++++++++++++++++++++++++++++++++------------ R/inner_functions.R | 47 ++++++++------- 3 files changed, 126 insertions(+), 60 deletions(-) diff --git a/.gitignore b/.gitignore index f180aac..6a9414d 100644 --- a/.gitignore +++ b/.gitignore @@ -35,3 +35,7 @@ CRAN-SUBMISSION .github/ # test data control_6 +control_7 +metadata.csv +# summary of updates for users +CRMetrics_updates_for_users.Rmd \ No newline at end of file diff --git a/R/CRMetrics.R b/R/CRMetrics.R index 5a11792..296e004 100644 --- a/R/CRMetrics.R +++ b/R/CRMetrics.R @@ -319,7 +319,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, #' @param exact logical Whether to calculate exact p values (default = FALSE). #' @param metadata data.frame Metadata for samples (default = self$metadata). #' @param summary.metrics data.frame Summary metrics (default = self$summary.metrics). - #' @param plot.geom character Which geometric is used to plot the data (default = "point"). + #' @param plot.geom character Which geometric is used to plot the data (default = "bar"). #' @param se logical For regression lines, show SE (default = FALSE) #' @param group.reg.lines logical For regression lines, if FALSE show one line, if TRUE show line per group defined by second.comp.group (default = FALSE) #' @param secondary.testing logical Whether to show post hoc testing (default = TRUE) @@ -1715,7 +1715,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, checkDataPath(data.path) if (grepl("combined", data.path)) { - if (verbose) message("Loading Parse outputs") + if (verbose) message(paste0(Sys.time()," Loading Parse outputs")) cms.raw <- readParse(data.path, samples, "raw", n.cores = n.cores, verbose = verbose, unique.names = unique.names, sep = sep) } else { if (verbose) message(paste0(Sys.time()," Loading HDF5 Cell Ranger outputs")) @@ -1773,7 +1773,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, # transform parse output to work as input for cellbender if (grepl("combined", data.path)) { - if (verbose) message("Transforming Parse outputs as preparation for cellbender") + if (verbose) message(paste0(Sys.time()," Transforming Parse outputs as preparation for cellbender")) # check if transformed data already exists full.path <- data.path %>% pathsToList(samples) %>% @@ -1783,22 +1783,28 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, lapply(full.path, function(x) { # 1. Check and create genes.tsv - if (!file.exists(paste0(x,"/","genes.tsv"))) { - if(verbose) message("Creating genes.tsv in ", x) + if (file.exists(paste0(x,"/","genes.tsv"))) { + warning(paste0("genes.tsv already exists in ", x)) + } else { + if(verbose) message(paste0(Sys.time()," Creating genes.tsv in ", x)) fread(paste0(x,"/","all_genes.csv"), sep = ",", header = TRUE) %>% mutate(genome = "Gene Expression") %>% fwrite(paste0(x,"/","genes.tsv"), sep = "\t", col.names = FALSE) } - #2. Check and create barcodes.tsv - if (!file.exists(paste0(x,"/","barcodes.tsv"))) { - if(verbose) message("Creating barcodes.tsv in ", x) + # 2. Check and create barcodes.tsv + if (file.exists(paste0(x,"/","barcodes.tsv"))) { + warning(paste0("barcodes.tsv already exists in ", x)) + } else { + if(verbose) message(paste0(Sys.time()," Creating barcodes.tsv in ", x)) fread(paste0(x,"/","cell_metadata.csv"), sep = ",", header = TRUE) %>% .[, "bc_wells"] %>% write.table(paste0(x,"/","barcodes.tsv"), row.names=F, col.names=F) } - #2. Check and create matrix.mtx - if (!file.exists(paste0(x,"/","matrix.mtx"))) { - if(verbose) message("Creating matrix.mtx in ", x) + # 3. Check and create matrix.mtx + if (file.exists(paste0(x,"/","matrix.mtx"))) { + warning(paste0("matrix.mtx already exists in ", x)) + } else { + if(verbose) message(paste0(Sys.time()," Creating matrix.mtx in ", x)) fread(paste0(full.path[1],"/","count_matrix.mtx"), header = F) %>% select(V2, V1, V3) %>% fwrite(paste0(full.path[1],"/","matrix.mtx"), col.names=F) @@ -1851,17 +1857,17 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, # 1. Check genes.tsv if (!file.exists(paste0(x,"/","genes.tsv"))) { - message("genes.tsv missing in ", x, " run prepareCellbender first.") + error("genes.tsv missing in ", x, " run prepareCellbender first.") } #2. Check barcodes.tsv if (!file.exists(paste0(x,"/","barcodes.tsv"))) { - message("barcodes.tsv missing in ", x, " run prepareCellbender first.") + error("barcodes.tsv missing in ", x, " run prepareCellbender first.") } #2. Check matrix.mtx if (!file.exists(paste0(x,"/","matrix.mtx"))) { - message("matrix.mtx missing in ", x, " run prepareCellbender first.") + error("matrix.mtx missing in ", x, " run prepareCellbender first.") } }) @@ -1983,11 +1989,11 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, #' @description Add a list of count matrices to the CRMetrics object. #' @param cms list List of (sparse) count matrices (default = NULL) - #' @param data.path character Path to cellranger count data (default = self$data.path). + #' @param data.path character Path to Cell Ranger or Parse count data (default = self$data.path). #' @param samples character Vector of sample names. If NULL, samples are extracted from cms (default = self$metadata$sample) #' @param cellbender logical Add CellBender filtered count matrices in HDF5 format. Requires that "cellbender" is in the names of the files (default = FALSE) #' @param parse logical Add Parse filtered count matrices. Cannot be combined with `cellbender=TRUE`. Cannot be combined with `raw=TRUE`. (default = FALSE) - #' @param flex logical Add 10x Flex filtered count matrices from cell ranger. Can be combined with `raw=TRUE`. Cannot be combined with `cellbender=TRUE`. (default = FALSE) + #' @param flex logical Add 10x Flex filtered count matrices from Cell Ranger. Can be combined with `raw=TRUE`. Cannot be combined with `cellbender=TRUE`. (default = FALSE) #' @param raw logical Add raw count matrices from Cell Ranger output. Cannot be combined with `cellbender=TRUE`. (default = FALSE) #' @param symbol character The type of gene IDs to use, SYMBOL (TRUE) or ENSEMBLE (default = TRUE) #' @param unique.names logical Make cell names unique based on `sep` parameter (default = TRUE) @@ -2110,12 +2116,23 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, train.df <- samples %>% lapply(\(id) { - # rhdf5::h5read(paths[id], "matrix/training_elbo_per_epoch") %>% - #for cellbender v0.3.0 - rhdf5::h5read(paths[id], "metadata/learning_curve_train_elbo") %>% - {data.frame(ELBO = ., - Epoch = seq_len(length(.)), - sample = id)} + # for cellbender v0.3 + if ("learning_curve_train_elbo" %in% unique(rhdf5::h5ls(paths[id])$name)) { + message(paste0(Sys.time(), " Detected CellBender v0.3, loading training data.")) + rhdf5::h5read(paths[id], "metadata/learning_curve_train_elbo") %>% + {data.frame(ELBO = ., + Epoch = seq_len(length(.)), + sample = id)} + # for cellbender v0.2 + } else if ("training_elbo_per_epoch" %in% unique(rhdf5::h5ls(paths[id])$name)) { + message(paste0(Sys.time(), " Detected CellBender v0.2, loading training data.")) + rhdf5::h5read(paths[id], "matrix/training_elbo_per_epoch") %>% + {data.frame(ELBO = ., + Epoch = seq_len(length(.)), + sample = id)} + } else { + stop(paste0("Neither CellBender v0.2 nor v0.3 output found in file(s): ", paths[id])) + } }) %>% setNames(samples) %>% bind_rows() @@ -2123,13 +2140,21 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, test.df <- samples %>% lapply(\(id) { path <- paths[id] - # data.frame(ELBO = rhdf5::h5read(path, "matrix/test_elbo"), - # Epoch = rhdf5::h5read(path, "matrix/test_epoch"), - # sample = id) - #for cellbender v0.3.0 - data.frame(ELBO = rhdf5::h5read(path, "metadata/learning_curve_test_elbo"), + # for cellbender v0.3 + if ("learning_curve_test_elbo" %in% unique(rhdf5::h5ls(paths[id])$name)) { + message(paste0(Sys.time(), " Detected CellBender v0.3, loading test data.")) + data.frame(ELBO = rhdf5::h5read(path, "metadata/learning_curve_test_elbo"), Epoch = rhdf5::h5read(path, "metadata/learning_curve_test_epoch"), sample = id) + # for cellbender v0.2 + } else if ("test_elbo" %in% unique(rhdf5::h5ls(paths[id])$name)) { + message(paste0(Sys.time(), " Detected CellBender v0.2, loading test data.")) + data.frame(ELBO = rhdf5::h5read(path, "matrix/test_elbo"), + Epoch = rhdf5::h5read(path, "matrix/test_epoch"), + sample = id) + } else { + stop(paste0("Neither CellBender v0.2 nor v0.3 output found in file(s): ", paths[id])) + } }) %>% setNames(samples) %>% bind_rows() @@ -2172,18 +2197,29 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, cell.prob <- samples %>% lapply(\(id) { - #rhdf5::h5read(paths[id], "matrix/latent_cell_probability") %>% # for cellbender v0.3.0 + if ("cell_probability" %in% unique(rhdf5::h5ls(paths[id])$name)) { + message(paste0(Sys.time(), " Detected CellBender v0.3, loading cell probabilities.")) rhdf5::h5read(paths[id], "droplet_latents/cell_probability") %>% {data.frame(prob = ., cell = seq_len(length(.)), sample = id)} + # for cellbender v0.2 + } else if ("latent_cell_probability" %in% unique(rhdf5::h5ls(paths[id])$name)) { + message(paste0(Sys.time(), " Detected CellBender v0.2, loading cell probabilities.")) + rhdf5::h5read(paths[id], "matrix/latent_cell_probability") %>% + {data.frame(prob = ., + cell = seq_len(length(.)), + sample = id)} + } else { + stop(paste0("Neither CellBender v0.2 nor v0.3 output found in file(s): ", paths[id])) + } }) %>% setNames(samples) %>% bind_rows() ggplot(cell.prob, aes(cell, prob, col = prob)) + - geom_point() + + rasterise(geom_point(), dpi = 300) + scale_color_gradient(low=low.col, high=high.col) + self$theme + labs(x = "Cells", y = "Cell probability", col = "") + @@ -2212,13 +2248,26 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, amb <- samples %>% lapply(\(id) { - # rhdf5::h5read(paths[id], "matrix/ambient_expression") %>% - #for cellbender version 3 + # for cellbender v0.3 + if ("/global_latents" %in% unique(rhdf5::h5ls(paths[id])$group)) { + message(paste0(Sys.time(), " Detected CellBender v0.3, loading ambient expression data.")) rhdf5::h5read(paths[id], "global_latents/ambient_expression") %>% {data.frame(exp = ., cell = seq_len(length(.)), gene.names = rhdf5::h5read(paths[id], "matrix/features/name") %>% as.character(), sample = id)} + # for cellbender v0.2, using a "name" from above to differentiate versions + } else if ("latent_cell_probability" %in% unique(rhdf5::h5ls(paths[id])$name)) { + message(paste0(Sys.time(), " Detected CellBender v0.2, loading ambient expression data.")) + rhdf5::h5read(paths[id], "matrix/ambient_expression") %>% + {data.frame(exp = ., + cell = seq_len(length(.)), + gene.names = rhdf5::h5read(paths[id], "matrix/features/name") %>% as.character(), + sample = id)} + } + else { + stop(paste0("Neither CellBender v0.2 nor v0.3 output found in file(s): ", paths[id])) + } }) %>% setNames(samples) %>% bind_rows() @@ -2258,14 +2307,28 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, amb <- samples %>% lapply(\(id) { - #rhdf5::h5read(paths[id], "matrix/ambient_expression") %>% - #for cellbender version 3 - rhdf5::h5read(paths[id], "global_latents/ambient_expression") %>% + # for cellbender version 3 + if ("/global_latents" %in% unique(rhdf5::h5ls(paths[id])$group)) { + message(paste0(Sys.time(), " Detected CellBender v0.3, loading ambient expression data.")) + rhdf5::h5read(paths[id], "global_latents/ambient_expression") %>% {data.frame(exp = ., cell = seq_len(length(.)), gene.names = rhdf5::h5read(paths[id], "matrix/features/name") %>% as.character(), sample = id)} %>% filter(exp >= cutoff) + # for cellbender v0.2 + } else if ("latent_cell_probability" %in% unique(rhdf5::h5ls(paths[id])$name)) { + message(paste0(Sys.time(), " Detected CellBender v0.2, loading ambient expression data.")) + rhdf5::h5read(paths[id], "matrix/ambient_expression") %>% + {data.frame(exp = ., + cell = seq_len(length(.)), + gene.names = rhdf5::h5read(paths[id], "matrix/features/name") %>% as.character(), + sample = id)} %>% + filter(exp >= cutoff) + } + else { + stop(paste0("Neither CellBender v0.2 nor v0.3 output found in file(s): ", paths[id])) + } }) %>% setNames(samples) %>% bind_rows() %$% @@ -2279,7 +2342,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, geom_bar(stat = "identity") + self$theme + labs(x = "", y = "Proportion") + - theme(axis.text.x = element_text(angle = 90)) + + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + guides(fill = "none") if (!is.null(pal)) { @@ -2508,7 +2571,7 @@ plotCbCells = function(data.path = self$data.path, g <- ggplot(df, aes(sample, value, fill = sample)) + geom_bar(stat = "identity") + self$theme + - theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) + + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + guides(fill = "none") + labs(x = "", y = "") + facet_wrap(~variable, scales = "free_y", nrow = 2, ncol = 2) diff --git a/R/inner_functions.R b/R/inner_functions.R index 8935513..00eca7e 100644 --- a/R/inner_functions.R +++ b/R/inner_functions.R @@ -207,7 +207,6 @@ readParse <- function(data.path, verbose = TRUE) { checkPackageInstalled("data.table", cran = TRUE) - checkPackageInstalled("Matrix", cran = TRUE) # If no samples provided, infer from directories if (is.null(samples)) samples <- list.dirs(data.path, full.names = FALSE, recursive = FALSE) @@ -230,7 +229,7 @@ readParse <- function(data.path, .[grepl("count_matrix.mtx", .)] mat <- as(Matrix::readMM(mat.path), "CsparseMatrix") # Transpose the matrix to have genes in rows and cells in columns - mat <- t(mat) + mat <- Matrix::t(mat) # Add genes genes <- tmp.dir %>% @@ -670,28 +669,11 @@ getH5Paths <- function(data.path, tolower() %>% match.arg(c("raw","filtered","cellbender","cellbender_filtered")) - - # Determine which folder to use based on whether 'combined' (parse) is in the data path or 'per_sample_outs' (10x flex) is in the data path - use_dge_unfiltered <- grepl("combined", data.path) - use_count <- grepl("per_sample_outs", data.path) - - # Get the appropriate folder path for all samples (helper function) - get_folder_path <- function(sample) { - if (use_dge_unfiltered) { - return(paste0(sample[2], "/", sample[1], "/DGE_unfiltered")) - } else if (use_count) { - return(paste0(sample[2], "/", sample[1], "/count")) - } else { - return(paste0(sample[2], "/", sample[1], "/outs")) - } - } - - # Get H5 paths paths <- data.path %>% pathsToList(samples) %>% - sapply(\(sample) { - folder <- get_folder_path(sample) + sapply(\(i) { + folder <- getFolderPaths(i, data.path = data.path) if (grepl("cellbender", type)) { paste0(folder,"/",type,".h5") } else { @@ -708,8 +690,8 @@ getH5Paths <- function(data.path, names() miss <- miss.names %>% - sapply(\(sample) { - folder <- get_folder_path(sample) + sapply(\(i) { + folder <- getFolderPaths(i, data.path= data.path) if (type == "raw") { paste0(folder, "/raw_[feature/gene]_bc_matrix.h5") } else if (type == "filtered") { @@ -734,6 +716,8 @@ getH5Paths <- function(data.path, return(paths) } + + #' @title Create filtering vector #' @description Create logical filtering vector based on a numeric vector and a (sample-wise) cutoff #' @param num.vec numeric Numeric vector to create filter on @@ -780,7 +764,7 @@ checkDataPath <- function(data.path) { } -# this fucntion is wired misses a title and so on... +# no title? pathsToList <- function(data.path, samples) { data.path %>% lapply(\(path) list.dirs(path, recursive = F, full.names = F) %>% @@ -791,3 +775,18 @@ pathsToList <- function(data.path, samples) { data.frame() %>% as.list() } + + +# this function uses the output of pathsToList() as input "i" +getFolderPaths <- function(i, data.path) { + # Determine which folder to use based on whether 'combined' (parse) is in the data path or 'per_sample_outs' (10x flex) is in the data path + use_dge_unfiltered <- grepl("combined", data.path) + use_count <- grepl("per_sample_outs", data.path) + if (use_dge_unfiltered) { + return(file.path(i[2], i[1], "DGE_unfiltered")) + } else if (use_count) { + return(file.path(i[2], i[1], "count")) + } else { + return(file.path(i[2], i[1], "outs")) + } +} From 30af8d052bd51cdb3d7411c52aeb27b96a14028d Mon Sep 17 00:00:00 2001 From: Laura19993 Date: Tue, 23 Sep 2025 14:28:18 +0200 Subject: [PATCH 3/6] Fixes nr 2: introduce technology flag --- .gitignore | 4 +++ R/CRMetrics.R | 72 +++++++++++++++++++++++++++------------------ R/inner_functions.R | 48 +++++++++++++++++------------- 3 files changed, 75 insertions(+), 49 deletions(-) diff --git a/.gitignore b/.gitignore index 6a9414d..119d079 100644 --- a/.gitignore +++ b/.gitignore @@ -37,5 +37,9 @@ CRAN-SUBMISSION control_6 control_7 metadata.csv +metadata_test.csv +metadata_test2.csv +metadata_test_multiome.csv +metadata_parse.csv # summary of updates for users CRMetrics_updates_for_users.Rmd \ No newline at end of file diff --git a/R/CRMetrics.R b/R/CRMetrics.R index 296e004..79edd22 100644 --- a/R/CRMetrics.R +++ b/R/CRMetrics.R @@ -58,8 +58,9 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, n.cores = 1, #' Initialize a CRMetrics object - #' @description To initialize new object, 'data.path' or 'cms' is needed. 'metadata' is also recommended, but not required. - #' @param data.path character Path to directory with Cell Ranger count data, one directory per sample (default = NULL). + #' @description To initialize new object, 'data.path' or 'cms' and 'technology' is needed. 'metadata' is also recommended, but not required. + #' @param data.path character Path to directory with count data, one directory per sample (default = NULL). + #' @param technology character applied single cell technology (default = c("10x", "10xflex", "10xmultiome", "parse")) #' @param metadata data.frame or character Path to metadata file (comma-separated) or name of metadata dataframe object. Metadata must contain a column named 'sample' containing sample names that must match folder names in 'data.path' (default = NULL) #' @param cms list List with count matrices (default = NULL) #' @param samples character Sample names. Only relevant is cms is provided (default = NULL) @@ -77,7 +78,8 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, #' \dontrun{ #' crm <- CRMetrics$new(data.path = "/path/to/count/data/") #' } - initialize = function(data.path = NULL, + initialize = function(data.path = NULL, + technology = c("10x", "10xflex", "10xmultiome", "parse"), metadata = NULL, cms = NULL, samples = NULL, @@ -114,9 +116,13 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, }) } + # check that technology is provided and converted to lower case + technology %<>% tolower() %>% match.arg(c("10x", "10xflex", "10xmultiome", "parse")) + # Write stuff to object self$n.cores <- as.integer(n.cores) self$data.path <- data.path + self$technology <- technology self$verbose <- verbose self$theme <- theme self$pal <- pal @@ -1673,6 +1679,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, #' @param expected.cells named numeric If NULL, expected cells will be deduced from the number of cells per sample identified by Cell Ranger. Otherwise, a named vector of expected cells with sample IDs as names. Sample IDs must match those in summary.metrics (default: stored named vector) #' @param total.droplets named numeric If NULL, total droplets included will be deduced from expected cells multiplied by 3. Otherwise, a named vector of total droplets included with sample IDs as names. Sample IDs must match those in summary.metrics (default: stored named vector) #' @param cms.raw list Raw count matrices from HDF5 Cell Ranger outputs (default = self$cms.raw) + #' @param technology character applied single cell technology (default = c("10x", "10xflex", "10xmultiome", "parse")) #' @param umi.counts list UMI counts calculated as column sums of raw count matrices from HDF5 Cell Ranger outputs (default: stored list) #' @param data.path character Path to Cell Ranger outputs (default = self$data.path) #' @param samples character Sample names to include (default = self$metadata$sample) @@ -1692,6 +1699,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, expected.cells = NULL, total.droplets = NULL, cms.raw = self$cms.raw, + technology = self$technology, umi.counts = self$cellbender$umi.counts, data.path = self$data.path, samples = self$metadata$sample, @@ -1714,12 +1722,12 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, checkDataPath(data.path) - if (grepl("combined", data.path)) { + if (technology == "parse") { if (verbose) message(paste0(Sys.time()," Loading Parse outputs")) cms.raw <- readParse(data.path, samples, "raw", n.cores = n.cores, verbose = verbose, unique.names = unique.names, sep = sep) } else { if (verbose) message(paste0(Sys.time()," Loading HDF5 Cell Ranger outputs")) - cms.raw <- read10xH5(data.path, samples, "raw", n.cores = n.cores, verbose = verbose, unique.names = unique.names, sep = sep) + cms.raw <- read10xH5(data.path, samples, "raw", technology = self$technology, n.cores = n.cores, verbose = verbose, unique.names = unique.names, sep = sep) } self$cms.raw <- cms.raw } @@ -1772,7 +1780,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, # transform parse output to work as input for cellbender - if (grepl("combined", data.path)) { + if (technology == "parse") { if (verbose) message(paste0(Sys.time()," Transforming Parse outputs as preparation for cellbender")) # check if transformed data already exists full.path <- data.path %>% @@ -1818,6 +1826,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, #' @param file character File name for CellBender script. Will be stored in `data.path` (default: "cellbender_script.sh") + #' @param technology character Applied single cell technology (default = self$technology). #' @param fpr numeric False positive rate for CellBender (default = 0.01) #' @param epochs integer Number of epochs for CellBender (default = 150) #' @param use.gpu logical Use CUDA capable GPU (default = TRUE) @@ -1834,6 +1843,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, #' crm$saveCellbenderScript() #' } saveCellbenderScript = function(file = "cellbender_script.sh", + technology = self$technology, fpr = 0.01, epochs = 150, use.gpu = TRUE, @@ -1845,7 +1855,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, # Preparations checkDataPath(data.path) - if (grepl("combined", data.path)) { + if (technology == "parse") { # in case of parse we use the genes, matrix, barcodes files stored in DGE_unfiltered and therefor we just need to give the path to DGE_unfiltered inputs <- data.path %>% pathsToList(samples) %>% @@ -1872,15 +1882,15 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, }) } else { - inputs <- getH5Paths(data.path, samples, "raw") + inputs <- getH5Paths(data.path, samples, "raw", technology) } outputs <- data.path %>% pathsToList(samples) %>% sapply(\(sample) { - if (grepl("combined", data.path)) { + if (technology == "parse") { paste0(sample[2], "/", sample[1], "/DGE_unfiltered/cellbender.h5") - } else if (grepl("per_sample_outs", data.path)) { + } else if (technology == "10xflex") { paste0(sample[2], "/", sample[1], "/count/cellbender.h5") } else { paste0(sample[2], "/", sample[1], "/outs/cellbender.h5") @@ -1990,11 +2000,10 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, #' @description Add a list of count matrices to the CRMetrics object. #' @param cms list List of (sparse) count matrices (default = NULL) #' @param data.path character Path to Cell Ranger or Parse count data (default = self$data.path). + #' @param technology character Applied single cell technology (default = self$technology). #' @param samples character Vector of sample names. If NULL, samples are extracted from cms (default = self$metadata$sample) #' @param cellbender logical Add CellBender filtered count matrices in HDF5 format. Requires that "cellbender" is in the names of the files (default = FALSE) - #' @param parse logical Add Parse filtered count matrices. Cannot be combined with `cellbender=TRUE`. Cannot be combined with `raw=TRUE`. (default = FALSE) - #' @param flex logical Add 10x Flex filtered count matrices from Cell Ranger. Can be combined with `raw=TRUE`. Cannot be combined with `cellbender=TRUE`. (default = FALSE) - #' @param raw logical Add raw count matrices from Cell Ranger output. Cannot be combined with `cellbender=TRUE`. (default = FALSE) + #' @param raw logical Add raw count matrices from Cell Ranger or Parse output. Cannot be combined with `cellbender=TRUE`. (default = FALSE) #' @param symbol character The type of gene IDs to use, SYMBOL (TRUE) or ENSEMBLE (default = TRUE) #' @param unique.names logical Make cell names unique based on `sep` parameter (default = TRUE) #' @param sep character Separator used to create unique cell names (default = "!!") @@ -2019,10 +2028,9 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, #' } addCms = function(cms = NULL, data.path = self$data.path, + technology = self$technology, samples = self$metadata$sample, cellbender = FALSE, - flex = FALSE, - parse = FALSE, raw = FALSE, symbol = TRUE, unique.names = TRUE, @@ -2064,14 +2072,11 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, if (unique.names) cms %<>% createUniqueCellNames(samples, sep) } else { # Add from data.path argument - if (parse && cellbender) { - stop("Error: If you run CellBender on Parse files, 'parse' must be set to FALSE and only 'cellbender' set to TRUE.") - } if (cellbender) { - cms <- read10xH5(data.path = data.path, samples = samples, symbol = symbol, type = "cellbender_filtered", sep = sep, n.cores = n.cores, verbose = verbose, unique.names = unique.names) - } else if (parse) { - cms <- readParse(data.path = data.path, samples = samples, sep = sep, n.cores = n.cores, verbose = verbose, unique.names = unique.names) - } else if (flex) { + cms <- read10xH5(data.path = data.path, samples = samples, symbol = symbol, type = "cellbender_filtered", technology = self$technology, sep = sep, n.cores = n.cores, verbose = verbose, unique.names = unique.names) + } else if (technology == "parse") { + cms <- readParse(data.path = data.path, samples = samples, raw = raw, sep = sep, n.cores = n.cores, verbose = verbose, unique.names = unique.names) + } else if (technology == "10xflex") { cms <- readFlex(data.path = data.path, samples = samples, raw = raw, symbol = symbol, sep = sep, n.cores = n.cores, verbose = verbose, unique.names = unique.names) } else { @@ -2097,6 +2102,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, #' @description Plot the results from the CellBender estimations #' @param data.path character Path to Cell Ranger outputs (default = self$data.path) #' @param samples character Sample names to include (default = self$metadata$sample) + #' @param technology character applied single cell technology (default = c("10x", "10xflex", "10xmultiome", "parse")) #' @param pal character Plotting palette (default = self$pal) #' @return A ggplot2 object #' @examples @@ -2109,10 +2115,11 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, #' } plotCbTraining = function(data.path = self$data.path, samples = self$metadata$sample, + technology = self$technology, pal = self$pal) { checkDataPath(data.path) checkPackageInstalled("rhdf5", bioc = TRUE) - paths <- getH5Paths(data.path, samples, "cellbender") + paths <- getH5Paths(data.path, samples, "cellbender", technology) train.df <- samples %>% lapply(\(id) { @@ -2176,6 +2183,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, #' @description Plot the CellBender assigned cell probabilities #' @param data.path character Path to Cell Ranger outputs (default = self$data.path) #' @param samples character Sample names to include (default = self$metadata$sample) + #' @param technology character applied single cell technology (default = c("10x", "10xflex", "10xmultiome", "parse")) #' @param low.col character Color for low probabilities (default = "gray") #' @param high.col character Color for high probabilities (default = "red") #' @return A ggplot2 object @@ -2189,11 +2197,12 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, #' } plotCbCellProbs = function(data.path = self$data.path, samples = self$metadata$sample, + technology = self$technology, low.col = "gray", high.col = "red") { checkDataPath(data.path) checkPackageInstalled("rhdf5", bioc = TRUE) - paths <- getH5Paths(data.path, samples, "cellbender") + paths <- getH5Paths(data.path, samples, "cellbender", technology) cell.prob <- samples %>% lapply(\(id) { @@ -2219,7 +2228,6 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, bind_rows() ggplot(cell.prob, aes(cell, prob, col = prob)) + - rasterise(geom_point(), dpi = 300) + scale_color_gradient(low=low.col, high=high.col) + self$theme + labs(x = "Cells", y = "Cell probability", col = "") + @@ -2230,6 +2238,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, #' @param cutoff numeric Horizontal line included in the plot to indicate highly expressed ambient genes (default = 0.005) #' @param data.path character Path to Cell Ranger outputs (default = self$data.path) #' @param samples character Sample names to include (default = self$metadata$sample) + #' @param technology character applied single cell technology (default = c("10x", "10xflex", "10xmultiome", "parse")) #' @return A ggplot2 object #' @examples #' \dontrun{ @@ -2241,10 +2250,11 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, #' } plotCbAmbExp = function(cutoff = 0.005, data.path = self$data.path, - samples = self$metadata$sample) { + samples = self$metadata$sample, + technology = self$technology) { checkDataPath(data.path) checkPackageInstalled("rhdf5", bioc = TRUE) - paths <- getH5Paths(data.path, samples, "cellbender") + paths <- getH5Paths(data.path, samples, "cellbender", technology) amb <- samples %>% lapply(\(id) { @@ -2287,6 +2297,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, #' @param cutoff numeric Cutoff of ambient gene expression to use to extract ambient genes per sample #' @param data.path character Path to Cell Ranger outputs (default = self$data.path) #' @param samples character Sample names to include (default = self$metadata$sample) + #' @param technology character applied single cell technology (default = c("10x", "10xflex", "10xmultiome", "parse")) #' @param pal character Plotting palette (default = self$pal) #' @return A ggplot2 object #' @examples @@ -2300,10 +2311,11 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, plotCbAmbGenes = function(cutoff = 0.005, data.path = self$data.path, samples = self$metadata$sample, + technology = self$technology, pal = self$pal) { checkDataPath(data.path) checkPackageInstalled("rhdf5", bioc = TRUE) - paths <- getH5Paths(data.path, samples, "cellbender") + paths <- getH5Paths(data.path, samples, "cellbender", technology) amb <- samples %>% lapply(\(id) { @@ -2535,6 +2547,7 @@ plotSoupX = function(plot.df = self$soupx$plot.df) { #' @description Plot CellBender cell estimations against the estimated cell numbers from Cell Ranger #' @param data.path character Path to Cell Ranger outputs (default = self$data.path) #' @param samples character Sample names to include (default = self$metadata$sample) +#' @param technology character applied single cell technology (default = c("10x", "10xflex", "10xmultiome", "parse")) description #' @param pal character Plotting palette (default = self$pal) #' @return A ggplot2 object #' @examples @@ -2547,10 +2560,11 @@ plotSoupX = function(plot.df = self$soupx$plot.df) { #' } plotCbCells = function(data.path = self$data.path, samples = self$metadata$sample, + technology = self$technology, pal = self$pal) { checkDataPath(data.path) checkPackageInstalled("rhdf5", bioc = TRUE) - paths <- getH5Paths(data.path, samples, "cellbender_filtered") + paths <- getH5Paths(data.path, samples, "cellbender_filtered", technology) df <- samples %>% sapply(\(id) rhdf5::h5read(paths[id], "matrix/shape")[2]) %>% diff --git a/R/inner_functions.R b/R/inner_functions.R index 00eca7e..5c8d008 100644 --- a/R/inner_functions.R +++ b/R/inner_functions.R @@ -389,14 +389,15 @@ addPlotStatsSamples <- function(p, #' @param verbose Print messages (default = TRUE). #' @keywords internal #' @return data frame -addSummaryMetrics <- function(data.path, +addSummaryMetrics <- function(data.path, metadata, n.cores = 1, verbose = TRUE) { samples.tmp <- list.dirs(data.path, recursive = FALSE, full.names = FALSE) samples <- intersect(samples.tmp, metadata$sample) - doubles <- table(samples.tmp) %>% + # checking if there are duplicated sample names only among the samples that will be used/are in metadata + doubles <- table(samples) %>% .[. > 1] %>% names() @@ -409,16 +410,16 @@ addSummaryMetrics <- function(data.path, pathsToList(metadata$sample) %>% plapply(\(s) { - # define possible directories to search for summary files (outs in case of Cell Ranger, report in case of Parse, sample base dir in case of 10X flex) + # define possible directories to search for summary files (outs in case of 10x and 10xmultiome, report in case of Parse, sample base dir in case of 10X flex) outs_dir <- paste(s[2], s[1], "outs", sep = "/") report_dir <- paste(s[2], s[1], "report", sep = "/") sample_base_dir <- paste(s[2], s[1], sep = "/") - # Check which folder exists and use it to read the summary file + # check which path exists to read the summary file dir_to_use <- if (dir.exists(outs_dir)) outs_dir else if (dir.exists(report_dir)) report_dir else sample_base_dir if (!dir.exists(dir_to_use)) { - warning(paste("No 'outs', 'report', or summary file found for sample:", s[1])) + warning(paste("No 'outs' folder, 'report' folder, or summary file found for sample:", s[1])) return(NULL) } @@ -431,7 +432,7 @@ addSummaryMetrics <- function(data.path, across(.cols = grep(",", .), ~ as.numeric(gsub(",", "", .x)))) # Take into account multiomics - if ("Sample.ID" %in% colnames(tmp)) tmp %<>% select(-c("Sample.ID","Genome","Pipeline.version")) + if ("Sample.ID" %in% colnames(tmp)) tmp %<>% select(-c("Sample.ID","Genome","Pipeline.version")) # Add the sample column, pivot to long format, and clean metric names @@ -445,11 +446,11 @@ addSummaryMetrics <- function(data.path, else if (dir_to_use == report_dir) { - # Logic for 'report' folder (adjust this based on the structure of files in 'report') + # Logic for 'report' folder tmp <- read.table(dir(dir_to_use, glob2rx("*ummary.csv"), full.names = TRUE), header = TRUE, sep = ",", stringsAsFactors = FALSE) %>% - select(statistic, combined) %>% # Only select 'statistic' and 'combined' columns - rename(metric = statistic, value = combined) %>% # Rename columns + select(1:2) %>% # Select the first two columns + rename(metric = 1, value = 2) %>% # Rename them mutate(sample = s[1]) # Add sample column } @@ -569,6 +570,7 @@ labelsFilter <- function(filter.data) { #' @param data.path character #' @param samples character vector, select specific samples for processing (default = NULL) #' @param type name of H5 file to search for, "raw" and "filtered" are Cell Ranger count outputs, "cellbender" is output from CellBender after running script from saveCellbenderScript +#' @param technology character Used single-cell technology (c("10x", "10xmultiome", "flex")) #' @param symbol logical Use gene SYMBOLs (TRUE) or ENSEMBL IDs (FALSE) (default = TRUE) #' @param sep character Separator for creating unique cell names from sample IDs and cell IDs (default = "!!") #' @param n.cores integer Number of cores (default = 1) @@ -583,6 +585,7 @@ labelsFilter <- function(filter.data) { read10xH5 <- function(data.path, samples = NULL, type = c("raw","filtered","cellbender","cellbender_filtered"), + technology = c("10x", "10xmultiome", "flex"), symbol = TRUE, sep = "!!", n.cores = 1, @@ -592,7 +595,7 @@ read10xH5 <- function(data.path, if (is.null(samples)) samples <- list.dirs(data.path, full.names = FALSE, recursive = FALSE) - full.path <- getH5Paths(data.path, samples, type) + full.path <- getH5Paths(data.path, samples, type, technology) if (verbose) message(paste0(Sys.time()," Loading ",length(full.path)," count matrices using ", if (n.cores < length(full.path)) n.cores else length(full.path)," cores")) out <- full.path %>% @@ -660,20 +663,27 @@ createUniqueCellNames <- function(cms, #' @param data.path character Path for directory containing sample-wise directories with Cell Ranger count/Parse outputs #' @param samples character Sample names to include (default = NULL) #' @param type character Type of H5 files to get paths for, one of "raw", "filtered" (Cell Ranger count outputs, does not work for Parse), "cellbender" (raw CellBender outputs), "cellbender_filtered" (CellBender filtered outputs) (default = "type") +#' @param technology character Used technology ("parse", "10x", "10xflex", "10xmultiome") (default = NULL) #' @keywords internal getH5Paths <- function(data.path, samples = NULL, - type = NULL) { + type = NULL, + technology = NULL) { + # Check input type %<>% tolower() %>% match.arg(c("raw","filtered","cellbender","cellbender_filtered")) + technology %<>% + tolower() %>% + match.arg(c("parse", "10x", "10xflex", "10xmultiome")) + # Get H5 paths paths <- data.path %>% pathsToList(samples) %>% sapply(\(i) { - folder <- getFolderPaths(i, data.path = data.path) + folder <- getFolderPaths(i, technology = technology) if (grepl("cellbender", type)) { paste0(folder,"/",type,".h5") } else { @@ -691,7 +701,7 @@ getH5Paths <- function(data.path, miss <- miss.names %>% sapply(\(i) { - folder <- getFolderPaths(i, data.path= data.path) + folder <- getFolderPaths(i, technology = technology) if (type == "raw") { paste0(folder, "/raw_[feature/gene]_bc_matrix.h5") } else if (type == "filtered") { @@ -764,7 +774,7 @@ checkDataPath <- function(data.path) { } -# no title? +# returns a list where the first element contains all sample names and the second element the path pathsToList <- function(data.path, samples) { data.path %>% lapply(\(path) list.dirs(path, recursive = F, full.names = F) %>% @@ -778,13 +788,11 @@ pathsToList <- function(data.path, samples) { # this function uses the output of pathsToList() as input "i" -getFolderPaths <- function(i, data.path) { - # Determine which folder to use based on whether 'combined' (parse) is in the data path or 'per_sample_outs' (10x flex) is in the data path - use_dge_unfiltered <- grepl("combined", data.path) - use_count <- grepl("per_sample_outs", data.path) - if (use_dge_unfiltered) { +getFolderPaths <- function(i, technology) { + # Determine which folder to use based on which technology was used + if (technology == "parse") { return(file.path(i[2], i[1], "DGE_unfiltered")) - } else if (use_count) { + } else if (technology == "10xflex") { return(file.path(i[2], i[1], "count")) } else { return(file.path(i[2], i[1], "outs")) From 317698aa160bde8639cbfe7966e50ec0fac4428a Mon Sep 17 00:00:00 2001 From: Laura19993 Date: Fri, 14 Nov 2025 11:30:59 +0100 Subject: [PATCH 4/6] read in of duplicated samples fixed, new function plotRawDepth --- R/CRMetrics.R | 150 ++++++++++++++++++++++++++++++++++---------- R/inner_functions.R | 7 ++- 2 files changed, 120 insertions(+), 37 deletions(-) diff --git a/R/CRMetrics.R b/R/CRMetrics.R index 83a217f..2cb5cd4 100644 --- a/R/CRMetrics.R +++ b/R/CRMetrics.R @@ -63,7 +63,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, #' @param technology character applied single cell technology (default = c("10x", "10xflex", "10xmultiome", "parse")) #' @param metadata data.frame or character Path to metadata file (comma-separated) or name of metadata dataframe object. Metadata must contain a column named 'sample' containing sample names that must match folder names in 'data.path' (default = NULL) #' @param cms list List with count matrices (default = NULL) - #' @param samples character Sample names. Only relevant is cms is provided (default = NULL) + #' @param samples character Sample names. Only relevant if cms is provided (default = NULL) #' @param unique.names logical Create unique cell names. Only relevant if cms is provided (default = TRUE) #' @param sep.cells character Sample-cell separator. Only relevant if cms is provided and `unique.names=TRUE` (default = "!!") #' @param comp.group character A group present in the metadata to compare the metrics by, can be added with addComparison (default = NULL) @@ -128,15 +128,23 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, self$pal <- pal # Metadata + # behaviour if metadata file is not given if (is.null(metadata)) { if (!is.null(data.path)) { self$metadata <- lapply(data.path, list.dirs, recursive = FALSE, full.names = FALSE) %>% - Reduce(c, .) %>% - .[pathsToList(data.path, .) %>% sapply(\(path) file.exists(paste0(path[2],"/",path[1],"/outs")))] %>% - {data.frame(sample = .)} + Reduce(c, .) %>% + { subfolder <- switch(technology, + "parse" = "DGE_filtered", + "10xflex" = "count", + "10x" = "outs") + # filter the directory names vector (.) to keep only those directories for which a corresponding subfolder exists, problem in Parse: all-sample folder also has that subfolder + .[pathsToList(data.path, .) + %>% sapply(function(path) file.exists(file.path(path[2], path[1], subfolder)))] + } %>% + { data.frame(sample = .) } } else { if (is.null(names(cms))) { if (is.null(samples)) { @@ -164,8 +172,22 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, if (!is.null(metadata)) { if (!raw.meta) self$metadata %<>% lapply(type.convert, as.is = FALSE) %>% bind_cols() } + # check if metadata has duplicated samples + doubles <- table(self$metadata$sample) %>% + .[. > 1] %>% + names() + if (length(doubles) > 0) stop(paste0("One or more samples are present twice in provided/derived 'metadata'. + Sample names must be unique. Affected sample(s): ",paste(doubles, collapse = " "))) + # check if metadata contains 'all-sample' which is created by parse pipeline automatically and remove it + all_sample_rows <- grepl("^all[-_ ]?sample$", self$metadata$sample, ignore.case = TRUE) + if (any(all_sample_rows)) { + message("Removed 'all-sample' entries automatically created by the Parse pipeline: ", + paste(self$metadata$sample[all_sample_rows], collapse = ", ")) + self$metadata <- self$metadata[!all_sample_rows, , drop = FALSE] + } + - # Add CMs + # Add CMs if (!is.null(cms)) { self$addCms(cms = cms, samples = samples, @@ -639,7 +661,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, if (length(depth.cutoff) > 1) { main <- "Cells with low depth with sample-specific cutoff" } else { - main <- paste0("Cells with low depth, < ",depth.cutoff) + main <- paste0("Cells with depth < ",depth.cutoff) } g <- self$con$plotGraph(colors = (!depths) * 1, title = main, size = size, ...) } @@ -675,8 +697,8 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, return(g) }, - #' @description Plot the sequencing depth in histogram. - #' @param cutoff numeric The depth cutoff to color the cells in the embedding (default = 1e3). + #' @description Plot per-sample density histograms of cell depths (=total UMI counts per cell). The plot highlights cells below or above a specified UMI cutoff with distinct colors, helping to assess data quality and filtering thresholds. + #' @param cutoff numeric Depth cutoff used to distinguish filtered and retained cells. Can be a single value or a named numeric vector specifying cutoffs per sample (default = 1e3). #' @param samples character Sample names to include for plotting (default = $metadata$sample). #' @param sep character Separator for creating unique cell names (default = "!!") #' @param keep.col character Color for density of cells that are kept (default = "#E7CDC2") @@ -722,7 +744,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, stop("No Conos object found, please run createEmbedding.") } - if (length(cutoff) > 1 & length(self$con$samples) != length(cutoff)) stop(paste0("'cutoff' has a length of ",length(cutoff),", but the conos object contains ",length(tmp)," samples. Please adjust.")) + if (length(cutoff) > 1 & length(self$con$samples) != length(cutoff)) stop(paste0("'cutoff' has a length of ",length(cutoff),", but the conos object contains ",length(self$con$samples)," samples. Please adjust.")) depths <- self$getDepth() @@ -732,7 +754,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, mutate(sample = sample %>% strsplit(sep, TRUE) %>% sapply(`[[`, 1)) %>% split(., .$sample) %>% .[samples] %>% - lapply(\(z) with(density(z$depth, adjust = 1/10), data.frame(x,y))) %>% + lapply(\(z) with(density(z$depth, adjust = 1/10, from = 0), data.frame(x,y))) %>% {lapply(names(.), \(x) data.frame(.[[x]], sample = x))} %>% bind_rows() @@ -757,7 +779,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, geom_line() + xlim(0,xmax) + theme(legend.position = "none", axis.text.x = element_text(angle = 45, hjust = 1), plot.margin = unit(c(0, 0, 0, 0.5), "cm")) + - labs(title = id, y = "Density [AU]", x = "") + labs(title = id, y = "Density [AU]", x = "UMI counts") if (length(cutoff) == 1) { plot.cutoff <- cutoff @@ -780,6 +802,64 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, return(depth.plot) }, + + #' @description Plot per-sample histograms of total UMI counts per cell (= cell depth), allowing visualization of sequencing depth distributions across samples. + #' @param cutoff numeric Depth (UMI count) cutoff used to distinguish filtered and retained cells. Can be a single value or a named numeric vector specifying cutoffs per sample (Default = 1e3). + #' @param samples character Vector of sample names to include in the plot (default = self$metadata$sample). + #' @param sep character Separator for creating unique cell names (default = "!!") + #' @param binwidth numeric Width of the histogram bins (default = 10). + #' @param xupper numeric Optional upper x-axis limit (default = 2e4). + #' @return A combined `ggplot` object showing UMI count histograms per sample. + + plotRawDepth = function(cutoff = 1e3, + samples = self$metadata$sample, + sep = "!!", + binwidth = 10, + xupper = 2e4){ + + # Checks + checkPackageInstalled("conos", cran = TRUE) + if (is.null(self$con)) { + stop("No Conos object found, please run createEmbedding.") + } + + if (length(cutoff) > 1 & length(self$con$samples) != length(cutoff)) stop(paste0("'cutoff' has a length of ",length(cutoff),", but the conos object contains ",length(self$con$samples)," samples. Please adjust.")) + + depths <- self$getDepth() + # build tidy df + tmp <- depths %>% + {data.frame(depth = unname(.), sample = names(.))} %>% + mutate(sample = sample %>% strsplit(sep, TRUE) %>% sapply(`[[`, 1)) %>% + filter(sample %in% samples) + + ncol.plot <- samples %>% + length() %>% + pmin(3) + + # Plot + depth.plot <- lapply(unique(tmp$sample), function(id) { + tmp.plot <- tmp %>% filter(sample == id) + + g <- ggplot(tmp.plot, aes(x = depth)) + + geom_histogram(binwidth = binwidth, boundary = 0, closed = "left") + + labs(title = id, x = "UMI counts", y = "Number of cells") + + self$theme + + xlim(0,xupper) + geom_vline(xintercept = cutoff, color="red", linetype = "dashed") + # optional: + scale_x_log10() + + if (length(cutoff) == 1) { + plot.cutoff <- cutoff + } else { + plot.cutoff <- cutoff[names(cutoff) == id] + } + + return(g) + }) %>% + plot_grid(plotlist = ., ncol = ncol.plot, label_size = 5) + + return(depth.plot) + }, + #' @description Plot the mitochondrial fraction in histogram. #' @param cutoff numeric The mito. fraction cutoff to color the embedding (default = 0.05) @@ -1572,7 +1652,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, return(g) }, - #' @description Extract sequencing depth from Conos object. + #' @description Extract sum of UMI counts per cell (= cell depth) #' @param cms list List of (sparse) count matrices (default = self$cms) #' @return data frame #' @examples @@ -1786,36 +1866,36 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, full.path <- data.path %>% pathsToList(samples) %>% sapply(\(sample) { - paste0(sample[2],"/",sample[1],"/DGE_unfiltered") + paste0(sample[2],sample[1], "/DGE_unfiltered") }) lapply(full.path, function(x) { # 1. Check and create genes.tsv - if (file.exists(paste0(x,"/","genes.tsv"))) { + if (file.exists(file.path(x,"genes.tsv"))) { warning(paste0("genes.tsv already exists in ", x)) } else { if(verbose) message(paste0(Sys.time()," Creating genes.tsv in ", x)) - fread(paste0(x,"/","all_genes.csv"), sep = ",", header = TRUE) %>% + data.table::fread(file.path(x,"all_genes.csv"), sep = ",", header = TRUE) %>% mutate(genome = "Gene Expression") %>% - fwrite(paste0(x,"/","genes.tsv"), sep = "\t", col.names = FALSE) + data.table::fwrite(file.path(x,"genes.tsv"), sep = "\t", col.names = FALSE) } # 2. Check and create barcodes.tsv - if (file.exists(paste0(x,"/","barcodes.tsv"))) { + if (file.exists(file.path(x,"barcodes.tsv"))) { warning(paste0("barcodes.tsv already exists in ", x)) } else { if(verbose) message(paste0(Sys.time()," Creating barcodes.tsv in ", x)) - fread(paste0(x,"/","cell_metadata.csv"), sep = ",", header = TRUE) %>% + data.table::fread(file.path(x,"cell_metadata.csv"), sep = ",", header = TRUE) %>% .[, "bc_wells"] %>% - write.table(paste0(x,"/","barcodes.tsv"), row.names=F, col.names=F) + write.table(file.path(x,"barcodes.tsv"), row.names=F, col.names=F) } # 3. Check and create matrix.mtx - if (file.exists(paste0(x,"/","matrix.mtx"))) { + if (file.exists(file.path(x,"matrix.mtx"))) { warning(paste0("matrix.mtx already exists in ", x)) } else { if(verbose) message(paste0(Sys.time()," Creating matrix.mtx in ", x)) - fread(paste0(full.path[1],"/","count_matrix.mtx"), header = F) %>% + data.table::fread(file.path(x,"count_matrix.mtx"), header = F) %>% select(V2, V1, V3) %>% - fwrite(paste0(full.path[1],"/","matrix.mtx"), col.names=F) + data.table::fwrite(file.path(x,"matrix.mtx"), col.names=F) } }) } @@ -1860,24 +1940,24 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, inputs <- data.path %>% pathsToList(samples) %>% sapply(\(sample) { - paste0(sample[2], "/", sample[1], "/DGE_unfiltered")}) + file.path(sample[2],sample[1],"DGE_unfiltered")}) #check that transformed files exists for parse lapply(inputs, function(x) { # 1. Check genes.tsv - if (!file.exists(paste0(x,"/","genes.tsv"))) { - error("genes.tsv missing in ", x, " run prepareCellbender first.") + if (!file.exists(file.path(x,"genes.tsv"))) { + stop("genes.tsv missing in ", x, " run prepareCellbender first.") } #2. Check barcodes.tsv - if (!file.exists(paste0(x,"/","barcodes.tsv"))) { - error("barcodes.tsv missing in ", x, " run prepareCellbender first.") + if (!file.exists(file.path(x,"barcodes.tsv"))) { + stop("barcodes.tsv missing in ", x, " run prepareCellbender first.") } #2. Check matrix.mtx - if (!file.exists(paste0(x,"/","matrix.mtx"))) { - error("matrix.mtx missing in ", x, " run prepareCellbender first.") + if (!file.exists(file.path(x,"matrix.mtx"))) { + stop("matrix.mtx missing in ", x, " run prepareCellbender first.") } }) @@ -1889,11 +1969,11 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, pathsToList(samples) %>% sapply(\(sample) { if (technology == "parse") { - paste0(sample[2], "/", sample[1], "/DGE_unfiltered/cellbender.h5") + file.path(sample[2], sample[1], "DGE_unfiltered/cellbender.h5") } else if (technology == "10xflex") { - paste0(sample[2], "/", sample[1], "/count/cellbender.h5") + file.path(sample[2],sample[1], "count/cellbender.h5") } else { - paste0(sample[2], "/", sample[1], "/outs/cellbender.h5") + file.path(sample[2],sample[1], "outs/cellbender.h5") } }) %>% setNames(samples) @@ -1929,8 +2009,9 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, out <- list("#! /bin/sh", script.list) %>% unlist() - - cat(out, file = paste0(data.path,file), sep = "\n") + # if there are several data.path "file" is saved to the first element of data.paht vector + cat(out, file = file.path(data.path[1],file), sep = "\n") + message(paste0(file ," was saved into ", data.path[1])) }, #' @description Extract the expected number of cells per sample based on the Cell Ranger summary metrics @@ -2228,6 +2309,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, bind_rows() ggplot(cell.prob, aes(cell, prob, col = prob)) + + geom_point() + scale_color_gradient(low=low.col, high=high.col) + self$theme + labs(x = "Cells", y = "Cell probability", col = "") + diff --git a/R/inner_functions.R b/R/inner_functions.R index 5c8d008..ed1ce9a 100644 --- a/R/inner_functions.R +++ b/R/inner_functions.R @@ -396,12 +396,13 @@ addSummaryMetrics <- function(data.path, samples.tmp <- list.dirs(data.path, recursive = FALSE, full.names = FALSE) samples <- intersect(samples.tmp, metadata$sample) - # checking if there are duplicated sample names only among the samples that will be used/are in metadata - doubles <- table(samples) %>% + # checking if there are duplicated sample name directories among the samples in metadata + doubles <- table(samples.tmp) %>% .[. > 1] %>% names() + overlap <- intersect(doubles, metadata$sample) - if (length(doubles) > 0) stop(paste0("One or more samples are present twice in 'data.path'. Sample names must be unique. Affected sample(s): ",paste(doubles, collapse = " "))) + if (length(overlap) > 0) stop(paste0("One or more samples are present twice in 'data.path' and are provided in 'metadata'. Sample names must be unique. Affected sample(s): ",paste(doubles, collapse = " "))) if (length(samples) != length(samples.tmp)) message("'metadata' doesn't contain the following sample(s) derived from 'data.path' (dropped): ",setdiff(samples.tmp, samples) %>% paste(collapse = " ")) if (verbose) message(paste0(Sys.time()," Adding ",length(samples)," samples")) From 8118ccbf2aefc93396d8129bc9a8ff775b5f9f29 Mon Sep 17 00:00:00 2001 From: Laura19993 Date: Mon, 26 Jan 2026 13:38:22 +0100 Subject: [PATCH 5/6] introduced depth.cutoff.upper, plotBarcodeRankPlot, reformulated some parameter descriptions --- .gitignore | 1 + R/CRMetrics.R | 336 ++++++++++++++++++++++++++++++++++++-------- R/inner_functions.R | 4 +- 3 files changed, 279 insertions(+), 62 deletions(-) diff --git a/.gitignore b/.gitignore index 119d079..58df564 100644 --- a/.gitignore +++ b/.gitignore @@ -16,6 +16,7 @@ # produced vignettes vignettes/*.html vignettes/*.pdf +vignettes/*.Rmd # OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3 .httr-oauth # knitr and R markdown default cache directories diff --git a/R/CRMetrics.R b/R/CRMetrics.R index 2cb5cd4..c177469 100644 --- a/R/CRMetrics.R +++ b/R/CRMetrics.R @@ -24,7 +24,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, #' @field metadata data.frame or character Path to metadata file or name of metadata data.frame object. Metadata must contain a column named 'sample' containing sample names that must match folder names in 'data.path' (default = NULL) metadata = NULL, - #' @field data.path character Path(s) to Cell Ranger count data, one directory per sample. If multiple paths, do c("path1","path2") (default = NULL) + #' @field data.path character Path(s) to Cell Ranger or Parse count data, one directory per sample. If multiple paths, do c("path1","path2") (default = NULL) data.path = NULL, #' @field cms list List with count matrices (default = NULL) @@ -33,10 +33,10 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, #' @field cms.preprocessed list List with preprocessed count matrices after $doPreprocessing() (default = NULL) cms.preprocessed = NULL, - #' @field cms.raw list List with raw, unfiltered count matrices, i.e., including all CBs detected also empty droplets (default = NULL) + #' @field cms.raw list List with raw, unfiltered count matrices, i.e., including all cell barcodes detected, also empty ones (default = NULL) cms.raw = NULL, - #' @field summary.metrics data.frame Summary metrics from Cell Ranger (default = NULL) + #' @field summary.metrics data.frame Summary metrics from Cell Ranger or Parse pipeline (default = NULL) summary.metrics = NULL, #' @field detailed.metrics data.frame Detailed metrics, i.e., no. genes and UMIs per cell (default = NULL) @@ -245,6 +245,108 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, self$detailed.metrics <- addDetailedMetricsInner(cms = cms, verbose = verbose, n.cores = n.cores) }, + + #' @description Read in raw count matrices and save UMI counts to object in "cellbender" slot, then create barcode rank plots for quality control + #' @param shrinkage integer Select every nth UMI count per cell for plotting. Improves plotting speed drastically. To plot all cells, set to 1 (default = 100) + #' @param show.expected.cells logical Plot line depicting called cells by Parse or Cell Ranger pipeline (default = TRUE) + #' @param cms.raw list Raw count matrices from HDF5 Cell Ranger or Parse outputs (default = self$cms.raw) + #' @param technology character applied single cell technology (default = c("10x", "10xflex", "10xmultiome", "parse")) + #' @param umi.counts list UMI counts calculated as column sums of raw count matrices from HDF5 Cell Ranger or Parse outputs (default: stored list) + #' @param data.path character Path to Cell Ranger or Parse outputs (default = self$data.path) + #' @param samples character Sample names to include (default = self$metadata$sample) + #' @param verbose logical Show progress (default: stored vector) + #' @param n.cores integer Number of cores (default: stored vector) + #' @param unique.names logical Create unique cell names (default = FALSE) + #' @param sep character Separator for creating unique cell names (default = "!!") + #' @return ggplot2 object and bash script + #' @examples + #' \dontrun{ + #' crm <- CRMetrics$new(data.path = "/path/to/count/data") + #' crm$prepareCellbender() + #' } + plotBarcodeRankPlot = function(shrinkage = 100, + show.expected.cells = TRUE, + cms.raw = self$cms.raw, + technology = self$technology, + umi.counts = self$cellbender$umi.counts, + data.path = self$data.path, + samples = self$metadata$sample, + verbose = self$verbose, + n.cores = self$n.cores, + unique.names = FALSE, + sep = "!!") { + checkPackageInstalled("sparseMatrixStats", bioc = TRUE) + # Preparations + if (verbose) message(paste0(Sys.time()," Started run using ", if (n.cores < length(samples)) n.cores else length(samples)," cores")) + expected.cells <- self$getExpectedCells(samples) + + # Read raw CMs from HDF5 files + + if (!is.null(cms.raw)) { + if (verbose) message(paste0(Sys.time()," Using stored HDF5 Cell Ranger/Parse outputs. To overwrite, set $cms.raw <- NULL")) + } else { + + checkDataPath(data.path) + + if (technology == "parse") { + if (verbose) message(paste0(Sys.time()," Loading Parse outputs")) + cms.raw <- readParse(data.path, samples, "raw", n.cores = n.cores, verbose = verbose, unique.names = unique.names, sep = sep) + } else { + if (verbose) message(paste0(Sys.time()," Loading HDF5 Cell Ranger outputs")) + cms.raw <- read10xH5(data.path, samples, "raw", technology = self$technology, n.cores = n.cores, verbose = verbose, unique.names = unique.names, sep = sep) + } + self$cms.raw <- cms.raw + } + + # Get UMI counts + if (!is.null(umi.counts)) { + if (verbose) message(paste0(Sys.time()," Using stored UMI counts calculations. To overwrite, set $cellbender$umi.counts <- NULL")) + } else { + if (verbose) message(paste0(Sys.time()," Calculating UMI counts per sample")) + umi.counts <- cms.raw[samples] %>% + plapply(\(cm) { + sparseMatrixStats::colSums2(cm) %>% + sort(decreasing = TRUE) %>% + {data.frame(y = .)} %>% + filter(y > 0) %>% + mutate(., x = seq_len(nrow(.))) + }, n.cores = n.cores) %>% + setNames(samples) + self$cellbender$umi.counts <- umi.counts + } + + # Create plot + if (verbose) message(paste0(Sys.time()," Plotting")) + data.df <- umi.counts[samples] %>% + names() %>% + lapply(\(sample) { + umi.counts[[sample]] %>% + mutate(sample = sample) %>% + .[seq(1, nrow(.), shrinkage),] + }) %>% + bind_rows() + + line.df <- expected.cells %>% + {data.frame(sample = names(.), exp = .)} + + g <- ggplot(data.df, aes(x, y)) + + geom_line(color = "red") + + scale_x_log10(labels = scales::comma) + + scale_y_log10(labels = scales::comma) + + self$theme + + labs(x = "Barcode ID ranked by count", y = "UMI count per barcode", col = "") + + if (show.expected.cells) g <- g + geom_vline(data = line.df, aes(xintercept = exp, col = "Expected cells")) + + g <- g + facet_wrap(~ sample, scales = "free") + + if (verbose) message(paste0(Sys.time()," Done!")) + + # return the plot object + return(g) + }, + + #' @description Add comparison group for statistical testing. #' @param comp.group character Comparison metric (default = self$comp.group). #' @param metadata data.frame Metadata for samples (default = self$metadata). @@ -595,7 +697,8 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, #' @param depth logical Plot depth or not (default = FALSE). #' @param doublet.method character Doublet detection method (default = NULL). #' @param doublet.scores logical Plot doublet scores or not (default = FALSE). - #' @param depth.cutoff numeric Depth cutoff (default = 1e3). + #' @param depth.cutoff numeric Lower depth cutoff. Can be a single value or a named numeric vector specifying cutoffs per sample (default = NULL). + #' @param depth.cutoff.upper numeric Upper depth cutoff. Can be a single value or a named numeric vector specifying cutoffs per sample (default = NULL). #' @param mito.frac logical Plot mitochondrial fraction or not (default = FALSE). #' @param mito.cutoff numeric Mitochondrial fraction cutoff (default = 0.05). #' @param species character Species to calculate the mitochondrial fraction for (default = c("human","mouse")). @@ -635,7 +738,8 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, plotEmbedding = function(depth = FALSE, doublet.method = NULL, doublet.scores = FALSE, - depth.cutoff = 1e3, + depth.cutoff = NULL, + depth.cutoff.upper = NULL, mito.frac = FALSE, mito.cutoff = 0.05, species = c("human","mouse"), @@ -656,16 +760,85 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, # Depth if (depth) { - depths <- self$getDepth() %>% - filterVector("depth.cutoff", depth.cutoff, self$con$samples %>% names(), sep) - if (length(depth.cutoff) > 1) { - main <- "Cells with low depth with sample-specific cutoff" + + depth_vals <- self$getDepth() + cell_names <- names(depth_vals) + + # LOWER cutoff + if (!is.null(depth.cutoff)) { + + pass_lower <- filterVector( + num.vec = depth_vals, + name = "depth.cutoff", + filter = depth.cutoff, + samples = self$con$samples %>% names(), + sep = sep + ) + low_depth <- !pass_lower + + } else { + low_depth <- rep(FALSE, length(depth_vals)) + names(low_depth) <- cell_names + } + + # UPPER cutoff + if (!is.null(depth.cutoff.upper)) { + + pass_upper <- filterVector( + num.vec = -depth_vals, + name = "depth.cutoff.upper", + filter = -depth.cutoff.upper, + samples = self$con$samples %>% names(), + sep = sep + ) + high_depth <- !pass_upper + + } else { + high_depth <- rep(FALSE, length(depth_vals)) # default is FALSE + names(high_depth) <- cell_names + } + + + # combine into 3-state vector + depth_flag <- integer(length(depth_vals)) + names(depth_flag) <- cell_names + + depth_flag[low_depth] <- 1 + depth_flag[high_depth] <- 2 + + # title + lower_sample_specific <- !is.null(depth.cutoff) && + length(depth.cutoff) > 1 + upper_sample_specific <- !is.null(depth.cutoff.upper) && + length(depth.cutoff.upper) > 1 + + if (!is.null(depth.cutoff.upper)) { + if (lower_sample_specific || upper_sample_specific) { + main <- "Cells with low (red) or high (blue) depth (sample-specific cutoffs)" + } else if (!is.null(depth.cutoff)) { + main <- paste0( + "Cells with depth < ", depth.cutoff , " (red) or > ", depth.cutoff.upper , " (blue)" + ) + } else { + main <- paste0("Cells with depth > ", depth.cutoff.upper) + } } else { - main <- paste0("Cells with depth < ",depth.cutoff) + if (lower_sample_specific) { + main <- "Cells with low depth (sample-specific cutoff)" + } else { + main <- paste0("Cells with depth < ", depth.cutoff) + } } - g <- self$con$plotGraph(colors = (!depths) * 1, title = main, size = size, ...) + + g <- self$con$plotGraph( + groups = depth_flag, + title = main, + size = size, mark.groups=F, + ...)+scale_color_manual(values = c("0"= "lightgrey", "1" = "red", "2" = "blue")) } + + # Doublets if (!is.null(doublet.method)) { dres <- self$doublets[[doublet.method]]$result @@ -698,7 +871,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, }, #' @description Plot per-sample density histograms of cell depths (=total UMI counts per cell). The plot highlights cells below or above a specified UMI cutoff with distinct colors, helping to assess data quality and filtering thresholds. - #' @param cutoff numeric Depth cutoff used to distinguish filtered and retained cells. Can be a single value or a named numeric vector specifying cutoffs per sample (default = 1e3). + #' @param cutoff numeric Depth cutoff (lower). Can be a single value or a named numeric vector specifying cutoffs per sample (default = 1e3). #' @param samples character Sample names to include for plotting (default = $metadata$sample). #' @param sep character Separator for creating unique cell names (default = "!!") #' @param keep.col character Color for density of cells that are kept (default = "#E7CDC2") @@ -804,7 +977,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, }, #' @description Plot per-sample histograms of total UMI counts per cell (= cell depth), allowing visualization of sequencing depth distributions across samples. - #' @param cutoff numeric Depth (UMI count) cutoff used to distinguish filtered and retained cells. Can be a single value or a named numeric vector specifying cutoffs per sample (Default = 1e3). + #' @param cutoff numeric Depth cutoff (lower) used to distinguish filtered and retained cells. Can be a single value or a named numeric vector specifying cutoffs per sample (Default = 1e3). #' @param samples character Vector of sample names to include in the plot (default = self$metadata$sample). #' @param sep character Separator for creating unique cell names (default = "!!") #' @param binwidth numeric Width of the histogram bins (default = 10). @@ -837,22 +1010,24 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, pmin(3) # Plot + + depth.plot <- lapply(unique(tmp$sample), function(id) { tmp.plot <- tmp %>% filter(sample == id) - + + if (length(cutoff) == 1) { + plot.cutoff <- cutoff + } else { + plot.cutoff <- cutoff[names(cutoff) == id] + } + g <- ggplot(tmp.plot, aes(x = depth)) + geom_histogram(binwidth = binwidth, boundary = 0, closed = "left") + labs(title = id, x = "UMI counts", y = "Number of cells") + self$theme + - xlim(0,xupper) + geom_vline(xintercept = cutoff, color="red", linetype = "dashed") + xlim(0,xupper) + geom_vline(xintercept = plot.cutoff, color="red", linetype = "dashed") # optional: + scale_x_log10() - if (length(cutoff) == 1) { - plot.cutoff <- cutoff - } else { - plot.cutoff <- cutoff[names(cutoff) == id] - } - return(g) }) %>% plot_grid(plotlist = ., ncol = ncol.plot, label_size = 5) @@ -910,7 +1085,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, stop("No Conos object found, please run createEmbedding.") } - if (length(cutoff) > 1 & length(self$con$samples) != length(cutoff)) stop(paste0("'cutoff' has a length of ",length(cutoff),", but the conos object contains ",length(tmp)," samples. Please adjust.")) + if (length(cutoff) > 1 & length(self$con$samples) != length(cutoff)) stop(paste0("'cutoff' has a length of ",length(cutoff),", but the conos object contains ",length(self$con$samples)," samples. Please adjust.")) mf <- self$getMitoFraction(species = species) @@ -924,7 +1099,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, mutate(sample = sample %>% strsplit(sep, TRUE) %>% sapply(`[[`, 1)) %>% split(., .$sample) %>% .[samples] %>% - lapply(\(z) with(density(z$mito.frac, adjust = 1/10), data.frame(x,y))) %>% + lapply(\(z) with(density(z$mito.frac, adjust = 1/10, from = 0), data.frame(x,y))) %>% {lapply(names(.), \(x) data.frame(.[[x]], sample = x))} %>% bind_rows() @@ -1308,7 +1483,8 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, }, #' @description Filter cells based on depth, mitochondrial fraction and doublets from the count matrix. - #' @param depth.cutoff numeric Depth (transcripts per cell) cutoff (default = NULL). + #' @param depth.cutoff numeric Lower depth (total UMI counts per cell) cutoff (default = NULL). + #' @param depth.cutoff.upper numeric Upper depth (total UMI counts per cell) cutoff (default = NULL). #' @param mito.cutoff numeric Mitochondrial fraction cutoff (default = NULL). #' @param doublets character Doublet detection method to use (default = NULL). #' @param species character Species to calculate the mitochondrial fraction for (default = "human"). @@ -1348,6 +1524,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, #' } #' } filterCms = function(depth.cutoff = NULL, + depth.cutoff.upper = NULL, mito.cutoff = NULL, doublets = NULL, species = c("human","mouse"), @@ -1358,7 +1535,8 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, if (verbose) { filters <- c() - if (!is.null(depth.cutoff)) filters %<>% c(paste0("depth.cutoff = ",depth.cutoff)) + if (!is.null(depth.cutoff)) filters %<>% c(paste0("depth.cutoff.lower = ", depth.cutoff)) + if (!is.null(depth.cutoff.upper)) filters %<>% c(paste0("depth.cutoff.upper = ", depth.cutoff.upper)) if (!is.null(mito.cutoff)) filters %<>% c(paste0("mito.cutoff = ",mito.cutoff," and species = ",species)) if (!is.null(doublets)) filters %<>% c(paste0("doublet method = ",doublets)) @@ -1388,9 +1566,24 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, names() # Depth - if (!is.null(depth.cutoff)) { - depth.filter <- self$getDepth() %>% - filterVector("depth.cutoff", depth.cutoff, samples, sep) + # Depth + if (!is.null(depth.cutoff) || !is.null(depth.cutoff.upper)) { + depth_vals <- self$getDepth() + depth.keep <- rep(TRUE, length(depth_vals)) + names(depth.keep) <- names(depth_vals) + + # lower cutoff + if (!is.null(depth.cutoff)) { + keep_lower <- filterVector(depth_vals, "depth.cutoff", depth.cutoff, samples, sep) + depth.keep <- depth.keep & keep_lower + } + # upper cutoff + if (!is.null(depth.cutoff.upper)) { + keep_upper <- !filterVector(depth_vals, "depth.cutoff.upper", depth.cutoff.upper, samples, sep) + depth.keep <- depth.keep & keep_upper + } + depth.filter <- depth.keep + } else { depth.filter <- NULL } @@ -1483,8 +1676,9 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, #' @description Plot filtered cells in an embedding, in a bar plot, on a tile or export the data frame #' @param type character The type of plot to use: embedding, bar, tile or export (default = c("embedding","bar","tile","export")). - #' @param depth logical Plot the depth or not (default = TRUE). - #' @param depth.cutoff numeric Depth cutoff, either a single number or a vector with cutoff per sample and with sampleIDs as names (default = 1e3). + #' @param depth logical Plot the depth (= total amount of UMI counts per cell) or not (default = TRUE). + #' @param depth.cutoff numeric Lower depth cutoff, either a single number or a vector with cutoff per sample and with sample IDs as names (default = 1e3). + #' @param depth.cutoff.upper numeric Upper depth cutoff, either a single number or a vector with cutoff per sample and with sample IDs as names (default = NULL). #' @param doublet.method character Method to detect doublets (default = NULL). #' @param mito.frac logical Plot the mitochondrial fraction or not (default = TRUE). #' @param mito.cutoff numeric Mitochondrial fraction cutoff, either a single number or a vector with cutoff per sample and with sampleIDs as names (default = 0.05). @@ -1527,6 +1721,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, plotFilteredCells = function(type = c("embedding","bar","tile","export"), depth = TRUE, depth.cutoff = 1e3, + depth.cutoff.upper = NULL, doublet.method = NULL, mito.frac = TRUE, mito.cutoff = 0.05, @@ -1542,14 +1737,35 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, if (mito.frac) species %<>% tolower() %>% match.arg(c("human","mouse")) # Prepare data + # depth if (depth) { - depths <- self$getDepth() %>% - filterVector("depth.cutoff", depth.cutoff, depth.cutoff %>% names(), sep) %>% - {ifelse(!., "depth", "")} + depth_vals <- self$getDepth() + cell_names <- names(depth_vals) + samples <- self$con$samples[cell_names] + + low_depth <- !filterVector(depth_vals, "depth.cutoff", depth.cutoff, self$con$samples %>% names(), sep) + + # upper cutoff + if (!is.null(depth.cutoff.upper)) { + + high_depth <- filterVector(depth_vals,"depth.cutoff.upper",depth.cutoff.upper,self$con$samples %>% names(),sep) + } else { + high_depth <- rep(FALSE, length(depth_vals)) + names(high_depth) <- cell_names + } + + # encode as labels + depths <- rep("", length(depth_vals)) + names(depths) <- cell_names + + depths[low_depth] <- "low.depth" + depths[high_depth] <- "high.depth" + } else { depths <- NULL } - + + if (mito.frac) { mf <- self$getMitoFraction(species = species) %>% filterVector("mito.cutoff", mito.cutoff, mito.cutoff %>% names(), sep) %>% @@ -1752,12 +1968,12 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, return(tmp) }, - #' @description Create plots and script call for CellBender + #' @description Read in raw cms and save umi counts to object, then plot barcode rank plots and in case of Parse data generate input files for CellBender #' @param shrinkage integer Select every nth UMI count per cell for plotting. Improves plotting speed drastically. To plot all cells, set to 1 (default = 100) #' @param show.expected.cells logical Plot line depicting expected number of cells (default = TRUE) - #' @param show.total.droplets logical Plot line depicting total droplets included for CellBender run (default = TRUE) - #' @param expected.cells named numeric If NULL, expected cells will be deduced from the number of cells per sample identified by Cell Ranger. Otherwise, a named vector of expected cells with sample IDs as names. Sample IDs must match those in summary.metrics (default: stored named vector) - #' @param total.droplets named numeric If NULL, total droplets included will be deduced from expected cells multiplied by 3. Otherwise, a named vector of total droplets included with sample IDs as names. Sample IDs must match those in summary.metrics (default: stored named vector) + #' @param show.total.barcodes logical Plot line depicting total barcodes included for CellBender run (default = TRUE) + #' @param expected.cells named numeric If NULL, expected cells will be deduced from the number of cells per sample identified by respective pipeline. Otherwise, a named vector of expected cells with sample IDs as names. Sample IDs must match those in summary.metrics (default: stored named vector) + #' @param total.barcodes named numeric If NULL, total barcodes included will be deduced from expected cells multiplied by 3. Otherwise, a named vector of total barcodes included with sample IDs as names. Sample IDs must match those in summary.metrics (default: stored named vector) #' @param cms.raw list Raw count matrices from HDF5 Cell Ranger outputs (default = self$cms.raw) #' @param technology character applied single cell technology (default = c("10x", "10xflex", "10xmultiome", "parse")) #' @param umi.counts list UMI counts calculated as column sums of raw count matrices from HDF5 Cell Ranger outputs (default: stored list) @@ -1775,9 +1991,9 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, #' } prepareCellbender = function(shrinkage = 100, show.expected.cells = TRUE, - show.total.droplets = TRUE, + show.total.barcodes = TRUE, expected.cells = NULL, - total.droplets = NULL, + total.barcodes = NULL, cms.raw = self$cms.raw, technology = self$technology, umi.counts = self$cellbender$umi.counts, @@ -1791,7 +2007,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, # Preparations if (verbose) message(paste0(Sys.time()," Started run using ", if (n.cores < length(samples)) n.cores else length(samples)," cores")) if (is.null(expected.cells)) expected.cells <- self$getExpectedCells(samples) - if (is.null(total.droplets)) total.droplets <- self$getTotalDroplets(samples) + if (is.null(total.barcodes)) total.barcodes <- self$getTotalBarcodes(samples) # Read CMs from HDF5 files @@ -1842,17 +2058,17 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, line.df <- expected.cells %>% {data.frame(sample = names(.), exp = .)} %>% - mutate(total = total.droplets %>% unname()) + mutate(total = total.barcodes %>% unname()) g <- ggplot(data.df, aes(x, y)) + geom_line(color = "red") + scale_x_log10(labels = scales::comma) + scale_y_log10(labels = scales::comma) + self$theme + - labs(x = "Droplet ID ranked by count", y = "UMI count per droplet", col = "") + labs(x = "Barcode ID ranked by count", y = "UMI count per barcode", col = "") if (show.expected.cells) g <- g + geom_vline(data = line.df, aes(xintercept = exp, col = "Expected cells")) - if (show.total.droplets) g <- g + geom_vline(data = line.df, aes(xintercept = total, col = "Total droplets included")) + if (show.total.barcodes) g <- g + geom_vline(data = line.df, aes(xintercept = total, col = "Total barcodes included")) g <- g + facet_wrap(~ sample, scales = "free") @@ -1904,14 +2120,14 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, }, - + #' @description Create CellBender script #' @param file character File name for CellBender script. Will be stored in `data.path` (default: "cellbender_script.sh") #' @param technology character Applied single cell technology (default = self$technology). #' @param fpr numeric False positive rate for CellBender (default = 0.01) #' @param epochs integer Number of epochs for CellBender (default = 150) #' @param use.gpu logical Use CUDA capable GPU (default = TRUE) #' @param expected.cells logical or named numeric vector If FALSE, the --expected-cells argument is omitted in the generated script.If TRUE, expected cells will be deduced from the number of cells per sample identified by Cell Ranger/Parse. Otherwise, a named vector of expected cells with sample IDs as names. Sample IDs must match those in summary.metrics (default: FALSE) - #' @param total.droplets logical or named numeric vector If FALSE, the --total.droplets argument is omitted in the generated script.If TRUE, total droplets included will be deduced from expected cells multiplied by 3. Otherwise, a named vector of total droplets included with sample IDs as names. Sample IDs must match those in summary.metrics (default: FALSE) + #' @param total.barcodes logical or named numeric vector If FALSE, the --total.barcodes argument is omitted in the generated script.If TRUE, total barcodes included will be deduced from expected cells multiplied by 3. Otherwise, a named vector of total barcodes included with sample IDs as names. Sample IDs must match those in summary.metrics (default: FALSE) #' @param data.path character Path to Cell Ranger outputs (default = self$data.path) #' @param samples character Sample names to include (default = self$metadata$sample) #' @param args character (optional) Additional parameters for CellBender @@ -1928,7 +2144,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, epochs = 150, use.gpu = TRUE, expected.cells = FALSE, - total.droplets = FALSE, + total.barcodes = FALSE, data.path = self$data.path, samples = self$metadata$sample, args = NULL) { @@ -1986,12 +2202,12 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, stop("If expected.cells is a vector, it must be named with sample names.") } - if (isTRUE(total.droplets)) { - total.droplets <- self$getTotalDroplets(samples) - } else if (isFALSE(total.droplets)) { - total.droplets <- NULL - } else if (is.vector(total.droplets) && is.null(names(total.droplets))) { - stop("If total.droplets is a vector, it must be named with sample names.") + if (isTRUE(total.barcodes)) { + total.barcodes <- self$getTotalBarcodes(samples) + } else if (isFALSE(total.barcodes)) { + total.barcodes <- NULL + } else if (is.vector(total.barcodes) && is.null(names(total.barcodes))) { + stop("If total.barcodes is a vector, it must be named with sample names.") } # Create CellBender shell scripts @@ -2001,7 +2217,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, " --output ",outputs[sample], if (use.gpu) c(" --cuda ") else c(" "), if (!is.null(expected.cells)) paste0("--expected-cells ", expected.cells[sample], " ") else c(" "), - if (!is.null(total.droplets)) paste0("--total-droplets-included ", total.droplets[sample], " ") else c(" "), + if (!is.null(total.barcodes)) paste0("--total-barcodes-included ", total.barcodes[sample], " ") else c(" "), " --fpr ",fpr, " --epochs ",epochs, " ",if (!is.null(args)) paste(args, collapse = " ")) @@ -2047,7 +2263,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, return(expected.cells) }, - #' @description Get the total number of droplets included in the CellBender estimations. Based on the Cell Ranger summary metrics and multiplied by a preset multiplier. + #' @description Get the total number of barcodes included in the CellBender estimations. Based on the Cell Ranger summary metrics and multiplied by a preset multiplier. #' @param samples character Samples names to include (default = self$metadata$sample) #' @param multiplier numeric Number to multiply expected number of cells with (default = 3) #' @return A numeric vector @@ -2067,15 +2283,15 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, #' # Add summary #' crm$addSummaryFromCms() #' - #' # Get no. droplets - #' crm$getTotalDroplets() - getTotalDroplets = function(samples = self$metadata$sample, + #' # Get no. barcodes + #' crm$getTotalBarcodes() + getTotalBarcodes = function(samples = self$metadata$sample, multiplier = 3) { if (!is.numeric(multiplier)) stop("'multiplier' must be numeric.") expected.cells <- self$getExpectedCells(samples = samples) - total.droplets <- expected.cells * multiplier + total.barcodes <- expected.cells * multiplier - return(total.droplets) + return(total.barcodes) }, #' @description Add a list of count matrices to the CRMetrics object. diff --git a/R/inner_functions.R b/R/inner_functions.R index ed1ce9a..041ecee 100644 --- a/R/inner_functions.R +++ b/R/inner_functions.R @@ -186,7 +186,7 @@ readFlex <- function(data.path, return(tmp) } -#' @title Load filtered Parse count matrices +#' @title Load Parse count matrices #' @description Load gene expression count data #' @param data.path Path to Parse count data. #' @param samples Vector of sample names (default = NULL) @@ -461,7 +461,7 @@ addSummaryMetrics <- function(data.path, header = TRUE, sep = ",", stringsAsFactors = FALSE) %>% select(Metric.Name, Metric.Value) %>% # Only select 'Metric Name' and 'Metric Value' columns rename(metric = Metric.Name, value = Metric.Value) %>% - slice(1:12) %>% + slice(1:15) %>% mutate(value = gsub(",", "", value), # Remove commas value = ifelse(grepl("%", value), as.numeric(gsub("%", "", value)) / 100, as.numeric(value))) %>% # Handle percentages mutate(sample = s[1]) %>% # Add sample column From 30814f3b5c8fffdd470254bd3a12ed307055182f Mon Sep 17 00:00:00 2001 From: Laura19993 Date: Wed, 11 Mar 2026 15:26:44 +0100 Subject: [PATCH 6/6] changed prepareCellbender function to not plot Barcode-Rank-plots anymore and just prepares Parse data as input for CellBender --- .gitignore | 1 + R/CRMetrics.R | 247 +++++++++++--------------------------------- R/inner_functions.R | 2 +- 3 files changed, 65 insertions(+), 185 deletions(-) diff --git a/.gitignore b/.gitignore index 58df564..f1301d2 100644 --- a/.gitignore +++ b/.gitignore @@ -14,6 +14,7 @@ # RStudio files .Rproj.user/ # produced vignettes +vignettes vignettes/*.html vignettes/*.pdf vignettes/*.Rmd diff --git a/R/CRMetrics.R b/R/CRMetrics.R index c177469..966bbeb 100644 --- a/R/CRMetrics.R +++ b/R/CRMetrics.R @@ -24,13 +24,13 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, #' @field metadata data.frame or character Path to metadata file or name of metadata data.frame object. Metadata must contain a column named 'sample' containing sample names that must match folder names in 'data.path' (default = NULL) metadata = NULL, - #' @field data.path character Path(s) to Cell Ranger or Parse count data, one directory per sample. If multiple paths, do c("path1","path2") (default = NULL) + #' @field data.path character Path(s) to Cell Ranger or Parse's split-pipe count data, one directory per sample. If multiple paths, do c("path1","path2") (default = NULL) data.path = NULL, - #' @field cms list List with count matrices (default = NULL) + #' @field cms list List with count matrices from Cell Ranger or Parse's split-pipe (default = NULL) cms = NULL, - #' @field cms.preprocessed list List with preprocessed count matrices after $doPreprocessing() (default = NULL) + #' @field cms.preprocessed list List with pagoda2- or seurat-preprocessed count matrices after $doPreprocessing() (default = NULL) cms.preprocessed = NULL, #' @field cms.raw list List with raw, unfiltered count matrices, i.e., including all cell barcodes detected, also empty ones (default = NULL) @@ -204,7 +204,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, if (is.null(cms)) self$summary.metrics <- addSummaryMetrics(data.path = data.path, metadata = self$metadata, n.cores = self$n.cores, verbose = verbose) }, - #' @description Function to read in detailed metrics. This is not done upon initialization for speed. + #' @description Function to read in detailed metrics (total UMI count and gene count per cell). This is not done upon object initialization for speed. #' @param cms list List of (sparse) count matrices (default = self$cms) #' @param min.transcripts.per.cell numeric Minimal number of transcripts per cell (default = 100) #' @param n.cores integer Number of cores for the calculations (default = self$n.cores). @@ -268,7 +268,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, show.expected.cells = TRUE, cms.raw = self$cms.raw, technology = self$technology, - umi.counts = self$cellbender$umi.counts, + umi.counts = self$umi.counts, data.path = self$data.path, samples = self$metadata$sample, verbose = self$verbose, @@ -300,7 +300,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, # Get UMI counts if (!is.null(umi.counts)) { - if (verbose) message(paste0(Sys.time()," Using stored UMI counts calculations. To overwrite, set $cellbender$umi.counts <- NULL")) + if (verbose) message(paste0(Sys.time()," Using stored UMI counts calculations. To overwrite, set crm$umi.counts <- NULL")) } else { if (verbose) message(paste0(Sys.time()," Calculating UMI counts per sample")) umi.counts <- cms.raw[samples] %>% @@ -312,7 +312,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, mutate(., x = seq_len(nrow(.))) }, n.cores = n.cores) %>% setNames(samples) - self$cellbender$umi.counts <- umi.counts + self$umi.counts <- umi.counts } # Create plot @@ -346,6 +346,36 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, return(g) }, + #' @description Extract the expected number of cells per sample based on the Cell Ranger or Parse's split-pipe summary metrics. + #' @param samples character Sample names to include (default = self$metadata$sample) + #' @return A numeric vector + #' @examples + #' # Simulate data + #' testdata.cms <- lapply(seq_len(2), \(x) { + #' out <- Matrix::rsparsematrix(2e3, 1e3, 0.1) + #' out[out < 0] <- 1 + #' dimnames(out) <- list(sapply(seq_len(2e3), \(x) paste0("gene",x)), + #' sapply(seq_len(1e3), \(x) paste0("cell",x))) + #' return(out) + #' }) + #' + #' # Initialize + #' crm <- CRMetrics$new(cms = testdata.cms, samples = c("sample1", "sample2"), n.cores = 1) + #' + #' # Get summary + #' crm$addSummaryFromCms() + #' + #' # Get no. cells + #' crm$getExpectedCells() + getExpectedCells = function(samples = self$metadata$sample) { + expected.cells <- self$summary.metrics %>% + filter(metric == "estimated number of cells") %$% + setNames(value, sample) %>% + .[samples] + + return(expected.cells) + }, + #' @description Add comparison group for statistical testing. #' @param comp.group character Comparison metric (default = self$comp.group). @@ -1968,116 +1998,35 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, return(tmp) }, - #' @description Read in raw cms and save umi counts to object, then plot barcode rank plots and in case of Parse data generate input files for CellBender - #' @param shrinkage integer Select every nth UMI count per cell for plotting. Improves plotting speed drastically. To plot all cells, set to 1 (default = 100) - #' @param show.expected.cells logical Plot line depicting expected number of cells (default = TRUE) - #' @param show.total.barcodes logical Plot line depicting total barcodes included for CellBender run (default = TRUE) - #' @param expected.cells named numeric If NULL, expected cells will be deduced from the number of cells per sample identified by respective pipeline. Otherwise, a named vector of expected cells with sample IDs as names. Sample IDs must match those in summary.metrics (default: stored named vector) - #' @param total.barcodes named numeric If NULL, total barcodes included will be deduced from expected cells multiplied by 3. Otherwise, a named vector of total barcodes included with sample IDs as names. Sample IDs must match those in summary.metrics (default: stored named vector) - #' @param cms.raw list Raw count matrices from HDF5 Cell Ranger outputs (default = self$cms.raw) + #' @description Generates input files for CellBender from Parse's split-pipe output. #' @param technology character applied single cell technology (default = c("10x", "10xflex", "10xmultiome", "parse")) - #' @param umi.counts list UMI counts calculated as column sums of raw count matrices from HDF5 Cell Ranger outputs (default: stored list) #' @param data.path character Path to Cell Ranger outputs (default = self$data.path) #' @param samples character Sample names to include (default = self$metadata$sample) #' @param verbose logical Show progress (default: stored vector) #' @param n.cores integer Number of cores (default: stored vector) #' @param unique.names logical Create unique cell names (default = FALSE) #' @param sep character Separator for creating unique cell names (default = "!!") - #' @return ggplot2 object and bash script + #' @return Saves files to disc. #' @examples #' \dontrun{ #' crm <- CRMetrics$new(data.path = "/path/to/count/data") #' crm$prepareCellbender() #' } - prepareCellbender = function(shrinkage = 100, - show.expected.cells = TRUE, - show.total.barcodes = TRUE, - expected.cells = NULL, - total.barcodes = NULL, - cms.raw = self$cms.raw, - technology = self$technology, - umi.counts = self$cellbender$umi.counts, + prepareCellbender = function(technology = self$technology, data.path = self$data.path, samples = self$metadata$sample, verbose = self$verbose, n.cores = self$n.cores, unique.names = FALSE, sep = "!!") { - checkPackageInstalled("sparseMatrixStats", bioc = TRUE) - # Preparations - if (verbose) message(paste0(Sys.time()," Started run using ", if (n.cores < length(samples)) n.cores else length(samples)," cores")) - if (is.null(expected.cells)) expected.cells <- self$getExpectedCells(samples) - if (is.null(total.barcodes)) total.barcodes <- self$getTotalBarcodes(samples) - # Read CMs from HDF5 files - - #add parse case, use readParse function - if (!is.null(cms.raw)) { - if (verbose) message(paste0(Sys.time()," Using stored HDF5 Cell Ranger/Parse outputs. To overwrite, set $cms.raw <- NULL")) - } else { - - checkDataPath(data.path) - - if (technology == "parse") { - if (verbose) message(paste0(Sys.time()," Loading Parse outputs")) - cms.raw <- readParse(data.path, samples, "raw", n.cores = n.cores, verbose = verbose, unique.names = unique.names, sep = sep) - } else { - if (verbose) message(paste0(Sys.time()," Loading HDF5 Cell Ranger outputs")) - cms.raw <- read10xH5(data.path, samples, "raw", technology = self$technology, n.cores = n.cores, verbose = verbose, unique.names = unique.names, sep = sep) - } - self$cms.raw <- cms.raw - } - - # Get UMI counts - if (!is.null(umi.counts)) { - if (verbose) message(paste0(Sys.time()," Using stored UMI counts calculations. To overwrite, set $cellbender$umi.counts <- NULL")) - } else { - if (verbose) message(paste0(Sys.time()," Calculating UMI counts per sample")) - umi.counts <- cms.raw[samples] %>% - plapply(\(cm) { - sparseMatrixStats::colSums2(cm) %>% - sort(decreasing = TRUE) %>% - {data.frame(y = .)} %>% - filter(y > 0) %>% - mutate(., x = seq_len(nrow(.))) - }, n.cores = n.cores) %>% - setNames(samples) - self$cellbender$umi.counts <- umi.counts + if (technology != "parse") { + message(paste0(Sys.time()," This function only needs to be run for Parse Biosciences data. ")) } - - # Create plot - if (verbose) message(paste0(Sys.time()," Plotting")) - data.df <- umi.counts[samples] %>% - names() %>% - lapply(\(sample) { - umi.counts[[sample]] %>% - mutate(sample = sample) %>% - .[seq(1, nrow(.), shrinkage),] - }) %>% - bind_rows() - - line.df <- expected.cells %>% - {data.frame(sample = names(.), exp = .)} %>% - mutate(total = total.barcodes %>% unname()) - - g <- ggplot(data.df, aes(x, y)) + - geom_line(color = "red") + - scale_x_log10(labels = scales::comma) + - scale_y_log10(labels = scales::comma) + - self$theme + - labs(x = "Barcode ID ranked by count", y = "UMI count per barcode", col = "") - - if (show.expected.cells) g <- g + geom_vline(data = line.df, aes(xintercept = exp, col = "Expected cells")) - if (show.total.barcodes) g <- g + geom_vline(data = line.df, aes(xintercept = total, col = "Total barcodes included")) - - g <- g + facet_wrap(~ sample, scales = "free") - - if (verbose) message(paste0(Sys.time()," Done!")) - - + # transform parse output to work as input for cellbender if (technology == "parse") { - if (verbose) message(paste0(Sys.time()," Transforming Parse outputs as preparation for cellbender")) + if (verbose) message(paste0(Sys.time()," Transforming Parse's split-pipe outputs as preparation for CellBender")) # check if transformed data already exists full.path <- data.path %>% pathsToList(samples) %>% @@ -2115,8 +2064,6 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, } }) } - # return the plot object - return(g) }, @@ -2126,9 +2073,9 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, #' @param fpr numeric False positive rate for CellBender (default = 0.01) #' @param epochs integer Number of epochs for CellBender (default = 150) #' @param use.gpu logical Use CUDA capable GPU (default = TRUE) - #' @param expected.cells logical or named numeric vector If FALSE, the --expected-cells argument is omitted in the generated script.If TRUE, expected cells will be deduced from the number of cells per sample identified by Cell Ranger/Parse. Otherwise, a named vector of expected cells with sample IDs as names. Sample IDs must match those in summary.metrics (default: FALSE) - #' @param total.barcodes logical or named numeric vector If FALSE, the --total.barcodes argument is omitted in the generated script.If TRUE, total barcodes included will be deduced from expected cells multiplied by 3. Otherwise, a named vector of total barcodes included with sample IDs as names. Sample IDs must match those in summary.metrics (default: FALSE) - #' @param data.path character Path to Cell Ranger outputs (default = self$data.path) + #' @param expected.cells named numeric vector By default, the --expected-cells argument is omitted in the generated script. Otherwise, a named vector of expected cell numbers with sample IDs as names. Sample IDs must match those in summary.metrics (default = NULL). + #' @param total.droplets named numeric vector By default, the --total-droplets-included argument is omitted in the generated script. Otherwise, a named vector of total droplets included with sample IDs as names. Sample IDs must match those in summary.metrics (default = NULL). + #' @param data.path character Path to Cell Ranger or Parse's split-pipe outputs (default = self$data.path) #' @param samples character Sample names to include (default = self$metadata$sample) #' @param args character (optional) Additional parameters for CellBender #' @return bash script @@ -2143,8 +2090,8 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, fpr = 0.01, epochs = 150, use.gpu = TRUE, - expected.cells = FALSE, - total.barcodes = FALSE, + expected.cells = NULL, + total.droplets = NULL, data.path = self$data.path, samples = self$metadata$sample, args = NULL) { @@ -2152,11 +2099,11 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, checkDataPath(data.path) if (technology == "parse") { - # in case of parse we use the genes, matrix, barcodes files stored in DGE_unfiltered and therefor we just need to give the path to DGE_unfiltered + # in case of parse we use the genes, matrix, barcodes files stored in DGE_unfiltered and therefore we just need to give the path to DGE_unfiltered inputs <- data.path %>% pathsToList(samples) %>% sapply(\(sample) { - file.path(sample[2],sample[1],"DGE_unfiltered")}) + paste0(sample[2],sample[1],"/DGE_unfiltered")}) #check that transformed files exists for parse lapply(inputs, function(x) { @@ -2185,29 +2132,23 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, pathsToList(samples) %>% sapply(\(sample) { if (technology == "parse") { - file.path(sample[2], sample[1], "DGE_unfiltered/cellbender.h5") + paste0(sample[2], sample[1],"/DGE_unfiltered/cellbender.h5") } else if (technology == "10xflex") { - file.path(sample[2],sample[1], "count/cellbender.h5") + paste0(sample[2],sample[1],"/count/cellbender.h5") } else { - file.path(sample[2],sample[1], "outs/cellbender.h5") + paste0(sample[2],sample[1],"/outs/cellbender.h5") } }) %>% setNames(samples) - if (isTRUE(expected.cells)) { - expected.cells <- self$getExpectedCells(samples) - } else if (isFALSE(expected.cells)) { - expected.cells <- NULL - } else if (is.vector(expected.cells) && is.null(names(expected.cells))) { + + if (is.vector(expected.cells) && is.null(names(expected.cells))) { stop("If expected.cells is a vector, it must be named with sample names.") } - if (isTRUE(total.barcodes)) { - total.barcodes <- self$getTotalBarcodes(samples) - } else if (isFALSE(total.barcodes)) { - total.barcodes <- NULL - } else if (is.vector(total.barcodes) && is.null(names(total.barcodes))) { - stop("If total.barcodes is a vector, it must be named with sample names.") + + if (is.vector(total.droplets) && is.null(names(total.droplets))) { + stop("If total.droplets is a vector, it must be named with sample names.") } # Create CellBender shell scripts @@ -2217,7 +2158,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, " --output ",outputs[sample], if (use.gpu) c(" --cuda ") else c(" "), if (!is.null(expected.cells)) paste0("--expected-cells ", expected.cells[sample], " ") else c(" "), - if (!is.null(total.barcodes)) paste0("--total-barcodes-included ", total.barcodes[sample], " ") else c(" "), + if (!is.null(total.droplets)) paste0("--total-droplets-included ", total.droplets[sample], " ") else c(" "), " --fpr ",fpr, " --epochs ",epochs, " ",if (!is.null(args)) paste(args, collapse = " ")) @@ -2225,82 +2166,20 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, out <- list("#! /bin/sh", script.list) %>% unlist() - # if there are several data.path "file" is saved to the first element of data.paht vector + # if there are several data.path "file" is saved to the first element of data.path vector cat(out, file = file.path(data.path[1],file), sep = "\n") message(paste0(file ," was saved into ", data.path[1])) }, - #' @description Extract the expected number of cells per sample based on the Cell Ranger summary metrics - #' @param samples character Sample names to include (default = self$metadata$sample) - #' @return A numeric vector - #' @examples - #' # Simulate data - #' testdata.cms <- lapply(seq_len(2), \(x) { - #' out <- Matrix::rsparsematrix(2e3, 1e3, 0.1) - #' out[out < 0] <- 1 - #' dimnames(out) <- list(sapply(seq_len(2e3), \(x) paste0("gene",x)), - #' sapply(seq_len(1e3), \(x) paste0("cell",x))) - #' return(out) - #' }) - #' - #' # Initialize - #' crm <- CRMetrics$new(cms = testdata.cms, samples = c("sample1", "sample2"), n.cores = 1) - #' - #' # Get summary - #' crm$addSummaryFromCms() - #' - #' # Get no. cells - #' crm$getExpectedCells() - #' - - #included parse and 10x flex nomenclature - getExpectedCells = function(samples = self$metadata$sample) { - expected.cells <- self$summary.metrics %>% - filter(metric %in% c("estimated number of cells", "number_of_cells", "cells")) %$% - setNames(value, sample) %>% - .[samples] - - return(expected.cells) - }, - #' @description Get the total number of barcodes included in the CellBender estimations. Based on the Cell Ranger summary metrics and multiplied by a preset multiplier. - #' @param samples character Samples names to include (default = self$metadata$sample) - #' @param multiplier numeric Number to multiply expected number of cells with (default = 3) - #' @return A numeric vector - #' @examples - #' # Simulate data - #' testdata.cms <- lapply(seq_len(2), \(x) { - #' out <- Matrix::rsparsematrix(2e3, 1e3, 0.1) - #' out[out < 0] <- 1 - #' dimnames(out) <- list(sapply(seq_len(2e3), \(x) paste0("gene",x)), - #' sapply(seq_len(1e3), \(x) paste0("cell",x))) - #' return(out) - #' }) - #' - #' # Initialize - #' crm <- CRMetrics$new(cms = testdata.cms, samples = c("sample1", "sample2"), n.cores = 1) - #' - #' # Add summary - #' crm$addSummaryFromCms() - #' - #' # Get no. barcodes - #' crm$getTotalBarcodes() - getTotalBarcodes = function(samples = self$metadata$sample, - multiplier = 3) { - if (!is.numeric(multiplier)) stop("'multiplier' must be numeric.") - expected.cells <- self$getExpectedCells(samples = samples) - total.barcodes <- expected.cells * multiplier - - return(total.barcodes) - }, #' @description Add a list of count matrices to the CRMetrics object. #' @param cms list List of (sparse) count matrices (default = NULL) - #' @param data.path character Path to Cell Ranger or Parse count data (default = self$data.path). + #' @param data.path character Path to Cell Ranger or Parse's split-pipe count data (default = self$data.path). #' @param technology character Applied single cell technology (default = self$technology). #' @param samples character Vector of sample names. If NULL, samples are extracted from cms (default = self$metadata$sample) #' @param cellbender logical Add CellBender filtered count matrices in HDF5 format. Requires that "cellbender" is in the names of the files (default = FALSE) - #' @param raw logical Add raw count matrices from Cell Ranger or Parse output. Cannot be combined with `cellbender=TRUE`. (default = FALSE) + #' @param raw logical Add raw count matrices from Cell Ranger or Parse's split-pipe output. Cannot be combined with `cellbender=TRUE`. (default = FALSE) #' @param symbol character The type of gene IDs to use, SYMBOL (TRUE) or ENSEMBLE (default = TRUE) #' @param unique.names logical Make cell names unique based on `sep` parameter (default = TRUE) #' @param sep character Separator used to create unique cell names (default = "!!") diff --git a/R/inner_functions.R b/R/inner_functions.R index 041ecee..6691b3c 100644 --- a/R/inner_functions.R +++ b/R/inner_functions.R @@ -27,7 +27,7 @@ checkCompGroup <- function(comp.group, #' @title Check whether 'comp.group' is in metadata #' @description Checks whether 'comp.group' is any of the column names in metadata. -#' @param comp.group Comparison metric. +#' @param comp.group Comparison metric.? #' @param metadata Metadata for samples. #' @keywords internal #' @return nothing or stop