From a04d77d865a12e1765a9e468405dbb934e23a1fc Mon Sep 17 00:00:00 2001
From: Laura19993 <laura.wolbeck93@gmail.com>
Date: Wed, 19 Mar 2025 13:36:43 +0100
Subject: [PATCH 1/6] Added functionality for Parse and 10x FLEX and updated
 cellbender functions to work with the newest cellbender release

---
 .gitignore          |   4 +-
 R/CRMetrics.R       | 179 +++++++++++++++++++++++++----
 R/inner_functions.R | 268 +++++++++++++++++++++++++++++++++++++++-----
 3 files changed, 399 insertions(+), 52 deletions(-)

diff --git a/.gitignore b/.gitignore
index f357572..f180aac 100644
--- a/.gitignore
+++ b/.gitignore
@@ -27,9 +27,11 @@ vignettes/*.pdf
 # R Environment Variables
 .Renviron
 # Project files
-.Rproj
+*.Rproj
 cran-comments.md
 .Rbuildignore
 CRMetrics.Rproj
 CRAN-SUBMISSION
 .github/
+# test data
+control_6
diff --git a/R/CRMetrics.R b/R/CRMetrics.R
index 81d24f3..5a11792 100644
--- a/R/CRMetrics.R
+++ b/R/CRMetrics.R
@@ -1165,7 +1165,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE,
   #' @param n.cores integer Number of cores for the calculations (default = self$n.cores).
   #' @param arg.buildGraph list A list with additional arguments for the `buildGraph` function in Conos (default = list())
   #' @param arg.findCommunities list A list with additional arguments for the `findCommunities` function in Conos (default = list())
-  #' @param arg.embedGraph list A list with additional arguments for the `embedGraph` function in Conos (default = list(method = "UMAP))
+  #' @param arg.embedGraph list A list with additional arguments for the `embedGraph` function in Conos (default = list(method = "UMAP"))
   #' @return Conos object
   #' @examples 
   #' \donttest{
@@ -1706,14 +1706,23 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE,
     if (is.null(total.droplets)) total.droplets <- self$getTotalDroplets(samples)
     
     # Read CMs from HDF5 files
+    
+    #add parse case, use readParse function
     if (!is.null(cms.raw)) {
-      if (verbose) message(paste0(Sys.time()," Using stored HDF5 Cell Ranger outputs. To overwrite, set $cms.raw <- NULL"))
+      if (verbose) message(paste0(Sys.time()," Using stored HDF5 Cell Ranger/Parse outputs. To overwrite, set $cms.raw <- NULL"))
     } else {
-      if (verbose) message(paste0(Sys.time()," Loading HDF5 Cell Ranger outputs"))
+      
       checkDataPath(data.path)
+      
+      if (grepl("combined", data.path)) {
+        if (verbose) message("Loading Parse outputs")
+        cms.raw <- readParse(data.path, samples, "raw", n.cores = n.cores, verbose = verbose, unique.names = unique.names, sep = sep)
+    } else {
+      if (verbose) message(paste0(Sys.time()," Loading HDF5 Cell Ranger outputs"))
       cms.raw <- read10xH5(data.path, samples, "raw", n.cores = n.cores, verbose = verbose, unique.names = unique.names, sep = sep)
-      self$cms.raw <- cms.raw
-    }
+     }
+    self$cms.raw <- cms.raw
+  }
     
     # Get UMI counts
     if (!is.null(umi.counts)) {
@@ -1760,15 +1769,54 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE,
     g <- g + facet_wrap(~ sample, scales = "free")
     
     if (verbose) message(paste0(Sys.time()," Done!"))
+   
+  
+  # transform parse output to work as input for cellbender
+    if (grepl("combined", data.path)) {
+      if (verbose) message("Transforming Parse outputs as preparation for cellbender")
+      # check if transformed data already exists
+      full.path <- data.path %>% 
+        pathsToList(samples) %>% 
+        sapply(\(sample) {
+           paste0(sample[2],"/",sample[1],"/DGE_unfiltered")
+        })
+      lapply(full.path, function(x) {
+        
+        # 1. Check and create genes.tsv
+        if (!file.exists(paste0(x,"/","genes.tsv"))) {
+          if(verbose) message("Creating genes.tsv in ", x)
+          fread(paste0(x,"/","all_genes.csv"), sep = ",", header = TRUE) %>%
+            mutate(genome = "Gene Expression") %>%
+            fwrite(paste0(x,"/","genes.tsv"), sep = "\t", col.names = FALSE)
+        }
+        #2. Check and create barcodes.tsv
+        if (!file.exists(paste0(x,"/","barcodes.tsv"))) {
+          if(verbose) message("Creating barcodes.tsv in ", x)
+          fread(paste0(x,"/","cell_metadata.csv"), sep = ",", header = TRUE) %>%
+            .[, "bc_wells"] %>%
+            write.table(paste0(x,"/","barcodes.tsv"), row.names=F, col.names=F)
+        }
+        #2. Check and create matrix.mtx
+        if (!file.exists(paste0(x,"/","matrix.mtx"))) {
+          if(verbose) message("Creating matrix.mtx in ", x)
+        fread(paste0(full.path[1],"/","count_matrix.mtx"), header = F) %>%
+          select(V2, V1, V3)  %>%
+          fwrite(paste0(full.path[1],"/","matrix.mtx"), col.names=F)
+        }
+      })
+    }
+    # return the plot object
     return(g)
   },
   
+  
+  
   #' @param file character File name for CellBender script. Will be stored in `data.path` (default: "cellbender_script.sh")
   #' @param fpr numeric False positive rate for CellBender (default = 0.01)
   #' @param epochs integer Number of epochs for CellBender (default = 150)
   #' @param use.gpu logical Use CUDA capable GPU (default = TRUE)
-  #' @param expected.cells named numeric If NULL, expected cells will be deduced from the number of cells per sample identified by Cell Ranger. Otherwise, a named vector of expected cells with sample IDs as names. Sample IDs must match those in summary.metrics (default: stored named vector)
-  #' @param total.droplets named numeric If NULL, total droplets included will be deduced from expected cells multiplied by 3. Otherwise, a named vector of total droplets included with sample IDs as names. Sample IDs must match those in summary.metrics (default: stored named vector)
+  #' @param expected.cells logical or named numeric vector If FALSE, the --expected-cells argument is omitted in the generated script.If TRUE, expected cells will be deduced from the number of cells per sample identified by Cell Ranger/Parse. Otherwise, a named vector of expected cells with sample IDs as names. Sample IDs must match those in summary.metrics (default: FALSE) 
+  #' @param total.droplets logical or named numeric vector If FALSE, the --total.droplets argument is omitted in the generated script.If TRUE, total droplets included will be deduced from expected cells multiplied by 3. Otherwise, a named vector of total droplets included with sample IDs as names. Sample IDs must match those in summary.metrics (default: FALSE)
   #' @param data.path character Path to Cell Ranger outputs (default = self$data.path)
   #' @param samples character Sample names to include (default = self$metadata$sample)
   #' @param args character (optional) Additional parameters for CellBender
@@ -1783,26 +1831,84 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE,
                                   fpr = 0.01, 
                                   epochs = 150, 
                                   use.gpu = TRUE, 
-                                  expected.cells = NULL, 
-                                  total.droplets = NULL, 
+                                  expected.cells = FALSE, 
+                                  total.droplets = FALSE, 
                                   data.path = self$data.path, 
                                   samples = self$metadata$sample, 
                                   args = NULL) {
     # Preparations
     checkDataPath(data.path)
-    inputs <- getH5Paths(data.path, samples, "raw")
+    
+    if (grepl("combined", data.path)) {
+      # in case of parse we use the genes, matrix, barcodes files stored in DGE_unfiltered and therefor we just need to give the path to DGE_unfiltered
+      inputs <- data.path %>%
+        pathsToList(samples) %>% 
+        sapply(\(sample) {
+            paste0(sample[2], "/", sample[1], "/DGE_unfiltered")})
+      
+      #check that transformed files exists for parse
+      lapply(inputs, function(x) {
+        
+        # 1. Check genes.tsv
+        if (!file.exists(paste0(x,"/","genes.tsv"))) {
+          message("genes.tsv missing in ", x, " run prepareCellbender first.")
+          
+        }
+        #2. Check barcodes.tsv
+        if (!file.exists(paste0(x,"/","barcodes.tsv"))) {
+          message("barcodes.tsv missing in ", x, " run prepareCellbender first.")
+        }
+        
+        #2. Check matrix.mtx
+        if (!file.exists(paste0(x,"/","matrix.mtx"))) {
+          message("matrix.mtx missing in ", x, " run prepareCellbender first.")
+        }
+      })
+      
+    } else {
+      inputs <- getH5Paths(data.path, samples, "raw")
+    }
+    
     outputs <- data.path %>% 
       pathsToList(samples) %>% 
-      sapply(\(sample) paste0(sample[2],sample[1],"/outs/cellbender.h5")) %>% 
+    sapply(\(sample) {
+        if (grepl("combined", data.path)) {
+          paste0(sample[2], "/", sample[1], "/DGE_unfiltered/cellbender.h5")
+        } else if (grepl("per_sample_outs", data.path)) {
+          paste0(sample[2], "/", sample[1], "/count/cellbender.h5")
+        } else {
+          paste0(sample[2], "/", sample[1], "/outs/cellbender.h5")
+        }
+      }) %>%
       setNames(samples)
     
-    if (is.null(expected.cells)) expected.cells <- self$getExpectedCells(samples)
-    if (is.null(total.droplets)) total.droplets <- self$getTotalDroplets(samples)
+    if (isTRUE(expected.cells)) {
+      expected.cells <- self$getExpectedCells(samples)
+    } else if (isFALSE(expected.cells)) { 
+      expected.cells <- NULL
+    } else if (is.vector(expected.cells) && is.null(names(expected.cells))) {
+        stop("If expected.cells is a vector, it must be named with sample names.")
+      }
+    
+    if (isTRUE(total.droplets)) {
+      total.droplets <- self$getTotalDroplets(samples)
+    } else if (isFALSE(total.droplets)) {
+      total.droplets <- NULL
+    } else if (is.vector(total.droplets) && is.null(names(total.droplets))) {
+       stop("If total.droplets is a vector, it must be named with sample names.")
+    }
     
     # Create CellBender shell scripts
     script.list <- samples %>% 
       lapply(\(sample) {
-        paste0("cellbender remove-background --input ",inputs[sample]," --output ",outputs[sample],if (use.gpu) c(" --cuda ") else c(" "),"--expected-cells ",expected.cells[sample]," --total-droplets-included ",total.droplets[sample]," --fpr ",fpr," --epochs ",epochs," ",if (!is.null(args)) paste(args, collapse = " "))
+        paste0("cellbender remove-background --input ",inputs[sample],
+               " --output ",outputs[sample],
+               if (use.gpu) c(" --cuda ") else c(" "),
+               if (!is.null(expected.cells)) paste0("--expected-cells ", expected.cells[sample], " ") else c(" "),
+               if (!is.null(total.droplets)) paste0("--total-droplets-included ", total.droplets[sample], " ") else c(" "),
+               " --fpr ",fpr,
+               " --epochs ",epochs,
+               " ",if (!is.null(args)) paste(args, collapse = " "))
       })
     
     out <- list("#! /bin/sh", script.list) %>% 
@@ -1832,9 +1938,12 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE,
   #' 
   #' # Get no. cells
   #' crm$getExpectedCells()
+  #' 
+
+  #included parse and 10x flex nomenclature
   getExpectedCells = function(samples = self$metadata$sample) {
     expected.cells <- self$summary.metrics %>% 
-      filter(metric == "estimated number of cells") %$% 
+      filter(metric %in% c("estimated number of cells", "number_of_cells", "cells")) %$% 
       setNames(value, sample) %>%
       .[samples]
     
@@ -1877,7 +1986,9 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE,
   #' @param data.path character Path to cellranger count data (default = self$data.path).
   #' @param samples character Vector of sample names. If NULL, samples are extracted from cms (default = self$metadata$sample)
   #' @param cellbender logical Add CellBender filtered count matrices in HDF5 format. Requires that "cellbender" is in the names of the files (default = FALSE)
-  #' @param raw logical Add raw count matrices from Cell Ranger output. Cannot be combined with `cellbender=TRUE` (default = FALSE)
+  #' @param parse logical Add Parse filtered count matrices. Cannot be combined with `cellbender=TRUE`. Cannot be combined with `raw=TRUE`. (default = FALSE)
+  #' @param flex logical Add 10x Flex filtered count matrices from cell ranger. Can be combined with `raw=TRUE`. Cannot be combined with `cellbender=TRUE`. (default = FALSE)
+  #' @param raw logical Add raw count matrices from Cell Ranger output. Cannot be combined with `cellbender=TRUE`. (default = FALSE)
   #' @param symbol character The type of gene IDs to use, SYMBOL (TRUE) or ENSEMBLE (default = TRUE)
   #' @param unique.names logical Make cell names unique based on `sep` parameter (default = TRUE)
   #' @param sep character Separator used to create unique cell names (default = "!!")
@@ -1904,6 +2015,8 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE,
                     data.path = self$data.path,
                     samples = self$metadata$sample,
                     cellbender = FALSE,
+                    flex = FALSE,
+                    parse = FALSE,
                     raw = FALSE,
                     symbol = TRUE,
                     unique.names = TRUE, 
@@ -1945,9 +2058,17 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE,
       if (unique.names) cms %<>% createUniqueCellNames(samples, sep)
     } else {
       # Add from data.path argument
+      if (parse && cellbender) {
+        stop("Error: If you run CellBender on Parse files, 'parse' must be set to FALSE and only 'cellbender' set to TRUE.")
+      }
       if (cellbender) {
         cms <- read10xH5(data.path = data.path, samples = samples, symbol = symbol, type = "cellbender_filtered", sep = sep, n.cores = n.cores, verbose = verbose, unique.names = unique.names)
-      } else {
+        }  else if (parse) {       
+          cms <- readParse(data.path = data.path, samples = samples, sep = sep, n.cores = n.cores, verbose = verbose, unique.names = unique.names)
+        }  else if (flex) {
+          cms <- readFlex(data.path = data.path, samples = samples, raw = raw, symbol = symbol, sep = sep, n.cores = n.cores, verbose = verbose, unique.names = unique.names)   
+           }
+       else {
         cms <- read10x(data.path = data.path, samples = samples, raw = raw, symbol = symbol, sep = sep, n.cores = n.cores, verbose = verbose, unique.names = unique.names)
       }
     }
@@ -1989,7 +2110,9 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE,
     
     train.df <- samples %>% 
       lapply(\(id) {
-        rhdf5::h5read(paths[id], "matrix/training_elbo_per_epoch") %>%
+       # rhdf5::h5read(paths[id], "matrix/training_elbo_per_epoch") %>%
+      #for cellbender v0.3.0
+        rhdf5::h5read(paths[id], "metadata/learning_curve_train_elbo") %>%
           {data.frame(ELBO = ., 
                       Epoch = seq_len(length(.)), 
                       sample = id)}
@@ -2000,8 +2123,12 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE,
     test.df <- samples %>%
       lapply(\(id) {
         path <- paths[id]
-        data.frame(ELBO = rhdf5::h5read(path, "matrix/test_elbo"), 
-                   Epoch = rhdf5::h5read(path, "matrix/test_epoch"), 
+       # data.frame(ELBO = rhdf5::h5read(path, "matrix/test_elbo"), 
+      #             Epoch = rhdf5::h5read(path, "matrix/test_epoch"), 
+        #           sample = id)
+        #for cellbender v0.3.0
+        data.frame(ELBO = rhdf5::h5read(path, "metadata/learning_curve_test_elbo"), 
+                   Epoch = rhdf5::h5read(path, "metadata/learning_curve_test_epoch"), 
                    sample = id)
       }) %>% 
       setNames(samples) %>% 
@@ -2045,7 +2172,9 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE,
     
     cell.prob <- samples %>%
       lapply(\(id) {
-        rhdf5::h5read(paths[id], "matrix/latent_cell_probability") %>%
+        #rhdf5::h5read(paths[id], "matrix/latent_cell_probability") %>%
+        # for cellbender v0.3.0
+        rhdf5::h5read(paths[id], "droplet_latents/cell_probability") %>%
           {data.frame(prob = ., 
                       cell = seq_len(length(.)), 
                       sample = id)}
@@ -2083,7 +2212,9 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE,
     
     amb <- samples %>% 
       lapply(\(id) {
-        rhdf5::h5read(paths[id], "matrix/ambient_expression") %>% 
+      #  rhdf5::h5read(paths[id], "matrix/ambient_expression") %>% 
+        #for cellbender version 3
+        rhdf5::h5read(paths[id], "global_latents/ambient_expression") %>% 
           {data.frame(exp = ., 
                       cell = seq_len(length(.)), 
                       gene.names = rhdf5::h5read(paths[id], "matrix/features/name") %>% as.character(), 
@@ -2127,7 +2258,9 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE,
     
     amb <- samples %>% 
       lapply(\(id) {
-        rhdf5::h5read(paths[id], "matrix/ambient_expression") %>% 
+        #rhdf5::h5read(paths[id], "matrix/ambient_expression") %>% 
+          #for cellbender version 3
+          rhdf5::h5read(paths[id], "global_latents/ambient_expression") %>% 
           {data.frame(exp = ., 
                       cell = seq_len(length(.)), 
                       gene.names = rhdf5::h5read(paths[id], "matrix/features/name") %>% as.character(), 
diff --git a/R/inner_functions.R b/R/inner_functions.R
index ed72edd..8935513 100644
--- a/R/inner_functions.R
+++ b/R/inner_functions.R
@@ -111,6 +111,152 @@ read10x <- function(data.path,
   return(tmp)
 }
 
+#' @title Load 10x flex count matrices
+#' @description Load gene expression count data
+#' @param data.path Path to cellranger count data.
+#' @param samples Vector of sample names (default = NULL)
+#' @param raw logical Add raw count matrices (default = FALSE)
+#' @param symbol The type of gene IDs to use, SYMBOL (TRUE) or ENSEMBLE (default = TRUE).
+#' @param sep Separator for cell names (default = "!!").
+#' @param n.cores Number of cores for the calculations (default = 1).
+#' @param verbose Print messages (default = TRUE).
+#' @keywords internal
+#' @return data frame
+#' @examples 
+#' \dontrun{
+#' cms <- readFlex(data.path = "/path/to/count/data", 
+#' samples = crm$metadata$samples, 
+#' raw = FALSE, 
+#' symbol = TRUE, 
+#' n.cores = crm$n.cores)
+#' }
+#' @export
+readFlex <- function(data.path, 
+                    samples = NULL, 
+                    raw = FALSE, 
+                    symbol = TRUE, 
+                    sep = "!!", 
+                    unique.names = TRUE, 
+                    n.cores = 1, 
+                    verbose = TRUE) {
+  checkPackageInstalled("data.table", cran = TRUE)
+  if (is.null(samples)) samples <- list.dirs(data.path, full.names = FALSE, recursive = FALSE)
+  
+  full.path <- data.path %>% 
+    pathsToList(samples) %>% 
+    sapply(\(sample) {
+      if (raw) pat <- glob2rx("sample_raw_*_bc_matri*") else pat <- glob2rx("sample_filtered_*_bc_matri*")
+      dir(paste(sample[2],sample[1],"count", sep = "/"), pattern = pat, full.names = TRUE) %>% 
+        .[!grepl(".h5", .)]
+    })
+  
+  if (verbose) message(paste0(Sys.time()," Loading ",length(full.path)," count matrices using ", if (n.cores > length(full.path)) length(full.path) else n.cores," cores"))
+  tmp <- full.path %>%
+    plapply(\(sample) {
+      tmp.dir <- dir(sample, full.names = TRUE)
+      
+      # Read matrix
+      mat.path <- tmp.dir %>%
+        .[grepl("mtx", .)]
+      if (grepl("gz", mat.path)) {
+        mat <- as(Matrix::readMM(gzcon(file(mat.path, "rb"))), "CsparseMatrix")
+      } else {
+        mat <- as(Matrix::readMM(mat.path), "CsparseMatrix")
+      }
+      
+      # Add features
+      feat <- tmp.dir %>%
+        .[grepl(ifelse(any(grepl("features.tsv", .)),"features.tsv","genes.tsv"), .)] %>%
+        data.table::fread(header = FALSE)
+      if (symbol) rownames(mat) <- feat %>% pull(V2) else rownames(mat) <- feat %>% pull(V1)
+      
+      # Add barcodes
+      barcodes <- tmp.dir %>%
+        .[grepl("barcodes.tsv", .)] %>%
+        data.table::fread(header = FALSE)
+      colnames(mat) <- barcodes %>% pull(V1)
+      return(mat)
+    }, n.cores = n.cores) %>%
+    setNames(samples)
+  
+  if (unique.names) tmp %<>% createUniqueCellNames(samples, sep)
+  
+  if (verbose) message(paste0(Sys.time()," Done!"))
+  
+  return(tmp)
+}
+
+#' @title Load filtered Parse count matrices
+#' @description Load gene expression count data
+#' @param data.path Path to Parse count data.
+#' @param samples Vector of sample names (default = NULL)
+#' @param raw logical Add raw count matrices (default = FALSE)
+#' @param sep Separator for cell names (default = "!!").
+#' @param n.cores Number of cores for the calculations (default = 1).
+#' @param verbose Print messages (default = TRUE).
+#' @keywords internal
+#' @return data frame
+#' 
+#' @export
+readParse <- function(data.path, 
+                    samples = NULL, 
+                    raw = FALSE,
+                    sep = "!!", 
+                    unique.names = TRUE, 
+                    n.cores = 1, 
+                    verbose = TRUE) {
+  
+  checkPackageInstalled("data.table", cran = TRUE)
+  checkPackageInstalled("Matrix", cran = TRUE)
+  
+  # If no samples provided, infer from directories
+  if (is.null(samples)) samples <- list.dirs(data.path, full.names = FALSE, recursive = FALSE)
+  
+  full.path <- data.path %>% 
+    pathsToList(samples) %>% 
+    sapply(\(sample) {
+      if (isFALSE(raw)) {
+      paste0(sample[2],"/",sample[1],"/", "DGE_filtered", sep = "/")
+    } else paste0(sample[2],"/",sample[1],"/", "DGE_unfiltered", sep = "/")
+      })
+  
+  if (verbose) message(paste0(Sys.time()," Loading ",length(full.path)," count matrices using ", if (n.cores > length(full.path)) length(full.path) else n.cores," cores"))
+  tmp <- full.path %>%
+    plapply(\(sample) {
+      tmp.dir <- dir(sample, full.names = TRUE)
+      
+      # Read matrix, attention, in case of raw there can be the case that we have 2 .mtx because for cellbender we need to create a new one
+      mat.path <- tmp.dir %>%
+        .[grepl("count_matrix.mtx", .)]
+        mat <- as(Matrix::readMM(mat.path), "CsparseMatrix")
+        # Transpose the matrix to have genes in rows and cells in columns
+        mat <- t(mat)
+        
+      # Add genes
+      genes <- tmp.dir %>%
+        .[grepl("all_genes.csv", .)] %>%
+        data.table::fread(header = T)
+        rownames(mat) <- genes$gene_name 
+      
+      # Add barcodes
+      barcodes <- tmp.dir %>%
+        .[grepl("cell_metadata.csv", .)] %>%
+        data.table::fread(header = T)
+      colnames(mat) <- barcodes$bc_wells
+      
+      return(mat)
+    }, n.cores = n.cores) %>%
+    setNames(samples)
+  
+  
+  # Make cell names unique across samples if needed
+  if (unique.names) tmp %<>% createUniqueCellNames(samples, sep)
+  
+  if (verbose) message(paste0(Sys.time(), " Done!"))
+  
+  return(tmp)
+}
+  
 #' @title Add detailed metrics
 #' @description Add detailed metrics, requires to load raw count matrices using pagoda2.
 #' @param cms List containing the count matrices. 
@@ -263,26 +409,71 @@ addSummaryMetrics <- function(data.path,
   metrics <- data.path %>% 
     pathsToList(metadata$sample) %>% 
     plapply(\(s) {
-      tmp <- read.table(dir(paste(s[2],s[1],"outs", sep = "/"), glob2rx("*ummary.csv"), full.names = TRUE), header = TRUE, sep = ",", colClasses = numeric()) %>%
-        mutate(., across(.cols = grep("%", .),
-                         ~ as.numeric(gsub("%", "", .x)) / 100),
-               across(.cols = grep(",", .),
-                         ~ as.numeric(gsub(",", "", .x))))
       
-      # Take into account multiomics
-      if ("Sample.ID" %in% colnames(tmp)) tmp %<>% select(-c("Sample.ID","Genome","Pipeline.version"))
-                  
-      tmp %>%
-        mutate(sample = s[1]) %>% 
-        pivot_longer(cols = -c(sample),
-                     names_to = "metric",
-                     values_to = "value") %>% 
+      # define possible directories to search for summary files (outs in case of Cell Ranger, report in case of Parse, sample base dir in case of 10X flex)
+      outs_dir <- paste(s[2], s[1], "outs", sep = "/")
+      report_dir <- paste(s[2], s[1], "report", sep = "/")
+      sample_base_dir <- paste(s[2], s[1], sep = "/")
+      
+      # Check which folder exists and use it to read the summary file
+      dir_to_use <- if (dir.exists(outs_dir)) outs_dir else if (dir.exists(report_dir)) report_dir else sample_base_dir
+      
+      if (!dir.exists(dir_to_use)) {
+        warning(paste("No 'outs', 'report', or summary file found for sample:", s[1]))
+        return(NULL)
+      }
+      
+      # Read the summary file differently based on the folder type
+      tmp <- NULL
+      if (dir_to_use == outs_dir) {
+        # Logic for 'outs' folder
+        tmp <- read.table(dir(dir_to_use, glob2rx("*ummary.csv"), full.names = TRUE), header = TRUE, sep = ",", colClasses = numeric()) %>% 
+          mutate(., across(.cols = grep("%", .), ~ as.numeric(gsub("%", "", .x)) / 100),
+               across(.cols = grep(",", .), ~ as.numeric(gsub(",", "", .x))))
+      
+       # Take into account multiomics
+       if ("Sample.ID" %in% colnames(tmp)) tmp %<>% select(-c("Sample.ID","Genome","Pipeline.version"))
+  
+        
+     # Add the sample column, pivot to long format, and clean metric names            
+     tmp %<>%
+       mutate(sample = s[1]) %>% 
+       pivot_longer(cols = -c(sample),
+                 names_to = "metric",
+                 values_to = "value") %>% 
         mutate(metric = metric %>% gsub(".", " ", ., fixed = TRUE) %>% tolower())
-    }, n.cores = n.cores) %>% 
-    bind_rows()
-  if (verbose) message(paste0(Sys.time()," Done!"))
-  return(metrics)
-}
+      } 
+      
+      else if (dir_to_use == report_dir) {
+      
+      # Logic for 'report' folder (adjust this based on the structure of files in 'report')
+      tmp <- read.table(dir(dir_to_use, glob2rx("*ummary.csv"), full.names = TRUE), 
+                        header = TRUE, sep = ",", stringsAsFactors = FALSE) %>%
+        select(statistic, combined) %>% # Only select 'statistic' and 'combined' columns
+        rename(metric = statistic, value = combined) %>%  # Rename columns
+        mutate(sample = s[1])   # Add sample column
+      }
+      
+      if (is.null(tmp)) {
+        # in case of 10X Flex summary metrics is not in outs nor report directory
+          tmp <- read.table(dir(sample_base_dir, glob2rx("*ummary.csv"), full.names = TRUE), 
+                            header = TRUE, sep = ",", stringsAsFactors = FALSE) %>%
+            select(Metric.Name, Metric.Value) %>% # Only select 'Metric Name' and 'Metric Value' columns
+            rename(metric = Metric.Name, value = Metric.Value) %>%
+            slice(1:12) %>%
+            mutate(value = gsub(",", "", value),  # Remove commas
+                   value = ifelse(grepl("%", value), as.numeric(gsub("%", "", value)) / 100, as.numeric(value)))  %>%  # Handle percentages
+            mutate(sample = s[1]) %>% # Add sample column
+            mutate(metric = metric %>% gsub(".", " ", ., fixed = TRUE) %>% tolower())
+        }
+     
+  return(tmp)   
+}, n.cores = n.cores) %>% 
+  bind_rows() #combine all data frames
+if (verbose) message(paste0(Sys.time()," Done!"))
+return(metrics)  
+}      
+
 
 #' @title Plot the data as points, as bars as a histogram, or as a violin
 #' @description Plot the data as points, barplot, histogram or violin
@@ -375,7 +566,7 @@ labelsFilter <- function(filter.data) {
   return(tmp)
 }
 
-#' @title Read 10x HDF5 files
+#' @title Read 10x or cellbenders HDF5 files
 #' @param data.path character
 #' @param samples character vector, select specific samples for processing (default = NULL)
 #' @param type name of H5 file to search for, "raw" and "filtered" are Cell Ranger count outputs, "cellbender" is output from CellBender after running script from saveCellbenderScript
@@ -467,9 +658,9 @@ createUniqueCellNames <- function(cms,
 
 #' @title Get H5 file paths
 #' @description Get file paths for H5 files
-#' @param data.path character Path for directory containing sample-wise directories with Cell Ranger count outputs
+#' @param data.path character Path for directory containing sample-wise directories with Cell Ranger count/Parse outputs
 #' @param samples character Sample names to include (default = NULL)
-#' @param type character Type of H5 files to get paths for, one of "raw", "filtered" (Cell Ranger count outputs), "cellbender" (raw CellBender outputs), "cellbender_filtered" (CellBender filtered outputs) (default = "type")
+#' @param type character Type of H5 files to get paths for, one of "raw", "filtered" (Cell Ranger count outputs, does not work for Parse), "cellbender" (raw CellBender outputs), "cellbender_filtered" (CellBender filtered outputs) (default = "type")
 #' @keywords internal
 getH5Paths <- function(data.path, 
                        samples = NULL, 
@@ -479,14 +670,32 @@ getH5Paths <- function(data.path,
     tolower() %>% 
     match.arg(c("raw","filtered","cellbender","cellbender_filtered"))
   
+  
+  # Determine which folder to use based on whether 'combined' (parse) is in the data path or 'per_sample_outs' (10x flex) is in the data path
+  use_dge_unfiltered <- grepl("combined", data.path)
+  use_count <- grepl("per_sample_outs", data.path)
+  
+  # Get the appropriate folder path for all samples (helper function)
+  get_folder_path <- function(sample) {
+    if (use_dge_unfiltered) {
+      return(paste0(sample[2], "/", sample[1], "/DGE_unfiltered"))
+    } else if (use_count) {
+      return(paste0(sample[2], "/", sample[1], "/count"))
+    } else {
+      return(paste0(sample[2], "/", sample[1], "/outs"))
+    }
+  }
+  
+
   # Get H5 paths
   paths <- data.path %>% 
     pathsToList(samples) %>% 
     sapply(\(sample) {
+      folder <- get_folder_path(sample)
       if (grepl("cellbender", type)) {
-        paste0(sample[2],"/",sample[1],"/outs/",type,".h5")
+        paste0(folder,"/",type,".h5")
       } else {
-        dir(paste0(sample[2],sample[1],"/outs"), glob2rx(paste0(type,"*.h5")), full.names = TRUE)
+        dir(paste0(folder), glob2rx(paste0("*", type,"*.h5")), full.names = TRUE)
       }
     }) %>% 
     setNames(samples)
@@ -500,12 +709,13 @@ getH5Paths <- function(data.path,
     
     miss <- miss.names %>% 
       sapply(\(sample) {
+        folder <- get_folder_path(sample)
         if (type == "raw") {
-          paste0(data.path,sample,"/outs/raw_[feature/gene]_bc_matrix.h5")
+          paste0(folder, "/raw_[feature/gene]_bc_matrix.h5")
         } else if (type == "filtered") {
-          paste0(data.path,sample,"/outs/filtered_[feature/gene]_bc_matrix.h5")
+          paste0(folder,"/filtered_[feature/gene]_bc_matrix.h5")
         } else {
-          paste0(data.path,sample,"/outs/",type,".h5")
+          paste0(folder,"/",type,".h5")
         }
       }) %>% 
       setNames(miss.names)
@@ -569,6 +779,8 @@ checkDataPath <- function(data.path) {
   if (is.null(data.path)) stop("'data.path' cannot be NULL.")
 }
 
+
+# this fucntion is wired misses a title and so on...
 pathsToList <- function(data.path, samples) {
   data.path %>% 
     lapply(\(path) list.dirs(path, recursive = F, full.names = F) %>% 
@@ -577,5 +789,5 @@ pathsToList <- function(data.path, samples) {
     bind_rows() %>% 
     t() %>% 
     data.frame() %>% 
-    as.list()
-}
\ No newline at end of file
+    as.list() 
+}

From 8a63533e415b68dd98fa5c602c920c3a79867309 Mon Sep 17 00:00:00 2001
From: Laura19993 <laura.wolbeck93@gmail.com>
Date: Fri, 11 Jul 2025 15:17:03 +0200
Subject: [PATCH 2/6] Fixes_nr_1

---
 .gitignore          |   4 ++
 R/CRMetrics.R       | 135 ++++++++++++++++++++++++++++++++------------
 R/inner_functions.R |  47 ++++++++-------
 3 files changed, 126 insertions(+), 60 deletions(-)

diff --git a/.gitignore b/.gitignore
index f180aac..6a9414d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -35,3 +35,7 @@ CRAN-SUBMISSION
 .github/
 # test data
 control_6
+control_7
+metadata.csv
+# summary of updates for users
+CRMetrics_updates_for_users.Rmd
\ No newline at end of file
diff --git a/R/CRMetrics.R b/R/CRMetrics.R
index 5a11792..296e004 100644
--- a/R/CRMetrics.R
+++ b/R/CRMetrics.R
@@ -319,7 +319,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE,
   #' @param exact logical Whether to calculate exact p values (default = FALSE).
   #' @param metadata data.frame Metadata for samples (default = self$metadata).
   #' @param summary.metrics data.frame Summary metrics (default = self$summary.metrics).
-  #' @param plot.geom character Which geometric is used to plot the data (default = "point").
+  #' @param plot.geom character Which geometric is used to plot the data (default = "bar").
   #' @param se logical For regression lines, show SE (default = FALSE)
   #' @param group.reg.lines logical For regression lines, if FALSE show one line, if TRUE show line per group defined by second.comp.group (default = FALSE)
   #' @param secondary.testing logical Whether to show post hoc testing (default = TRUE)
@@ -1715,7 +1715,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE,
       checkDataPath(data.path)
       
       if (grepl("combined", data.path)) {
-        if (verbose) message("Loading Parse outputs")
+        if (verbose) message(paste0(Sys.time()," Loading Parse outputs"))
         cms.raw <- readParse(data.path, samples, "raw", n.cores = n.cores, verbose = verbose, unique.names = unique.names, sep = sep)
     } else {
       if (verbose) message(paste0(Sys.time()," Loading HDF5 Cell Ranger outputs"))
@@ -1773,7 +1773,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE,
   
   # transform parse output to work as input for cellbender
     if (grepl("combined", data.path)) {
-      if (verbose) message("Transforming Parse outputs as preparation for cellbender")
+      if (verbose) message(paste0(Sys.time()," Transforming Parse outputs as preparation for cellbender"))
       # check if transformed data already exists
       full.path <- data.path %>% 
         pathsToList(samples) %>% 
@@ -1783,22 +1783,28 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE,
       lapply(full.path, function(x) {
         
         # 1. Check and create genes.tsv
-        if (!file.exists(paste0(x,"/","genes.tsv"))) {
-          if(verbose) message("Creating genes.tsv in ", x)
+        if (file.exists(paste0(x,"/","genes.tsv"))) {
+          warning(paste0("genes.tsv already exists in ", x))
+        } else {
+          if(verbose) message(paste0(Sys.time()," Creating genes.tsv in ", x))
           fread(paste0(x,"/","all_genes.csv"), sep = ",", header = TRUE) %>%
             mutate(genome = "Gene Expression") %>%
             fwrite(paste0(x,"/","genes.tsv"), sep = "\t", col.names = FALSE)
         }
-        #2. Check and create barcodes.tsv
-        if (!file.exists(paste0(x,"/","barcodes.tsv"))) {
-          if(verbose) message("Creating barcodes.tsv in ", x)
+        # 2. Check and create barcodes.tsv
+        if (file.exists(paste0(x,"/","barcodes.tsv"))) {
+          warning(paste0("barcodes.tsv already exists in ", x))
+        } else {
+          if(verbose) message(paste0(Sys.time()," Creating barcodes.tsv in ", x))
           fread(paste0(x,"/","cell_metadata.csv"), sep = ",", header = TRUE) %>%
             .[, "bc_wells"] %>%
             write.table(paste0(x,"/","barcodes.tsv"), row.names=F, col.names=F)
         }
-        #2. Check and create matrix.mtx
-        if (!file.exists(paste0(x,"/","matrix.mtx"))) {
-          if(verbose) message("Creating matrix.mtx in ", x)
+        # 3. Check and create matrix.mtx
+        if (file.exists(paste0(x,"/","matrix.mtx"))) {
+          warning(paste0("matrix.mtx already exists in ", x))
+        } else {
+          if(verbose) message(paste0(Sys.time()," Creating matrix.mtx in ", x))
         fread(paste0(full.path[1],"/","count_matrix.mtx"), header = F) %>%
           select(V2, V1, V3)  %>%
           fwrite(paste0(full.path[1],"/","matrix.mtx"), col.names=F)
@@ -1851,17 +1857,17 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE,
         
         # 1. Check genes.tsv
         if (!file.exists(paste0(x,"/","genes.tsv"))) {
-          message("genes.tsv missing in ", x, " run prepareCellbender first.")
+          error("genes.tsv missing in ", x, " run prepareCellbender first.")
           
         }
         #2. Check barcodes.tsv
         if (!file.exists(paste0(x,"/","barcodes.tsv"))) {
-          message("barcodes.tsv missing in ", x, " run prepareCellbender first.")
+          error("barcodes.tsv missing in ", x, " run prepareCellbender first.")
         }
         
         #2. Check matrix.mtx
         if (!file.exists(paste0(x,"/","matrix.mtx"))) {
-          message("matrix.mtx missing in ", x, " run prepareCellbender first.")
+          error("matrix.mtx missing in ", x, " run prepareCellbender first.")
         }
       })
       
@@ -1983,11 +1989,11 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE,
   
   #' @description Add a list of count matrices to the CRMetrics object.
   #' @param cms list List of (sparse) count matrices (default = NULL)
-  #' @param data.path character Path to cellranger count data (default = self$data.path).
+  #' @param data.path character Path to Cell Ranger or Parse count data (default = self$data.path).
   #' @param samples character Vector of sample names. If NULL, samples are extracted from cms (default = self$metadata$sample)
   #' @param cellbender logical Add CellBender filtered count matrices in HDF5 format. Requires that "cellbender" is in the names of the files (default = FALSE)
   #' @param parse logical Add Parse filtered count matrices. Cannot be combined with `cellbender=TRUE`. Cannot be combined with `raw=TRUE`. (default = FALSE)
-  #' @param flex logical Add 10x Flex filtered count matrices from cell ranger. Can be combined with `raw=TRUE`. Cannot be combined with `cellbender=TRUE`. (default = FALSE)
+  #' @param flex logical Add 10x Flex filtered count matrices from Cell Ranger. Can be combined with `raw=TRUE`. Cannot be combined with `cellbender=TRUE`. (default = FALSE)
   #' @param raw logical Add raw count matrices from Cell Ranger output. Cannot be combined with `cellbender=TRUE`. (default = FALSE)
   #' @param symbol character The type of gene IDs to use, SYMBOL (TRUE) or ENSEMBLE (default = TRUE)
   #' @param unique.names logical Make cell names unique based on `sep` parameter (default = TRUE)
@@ -2110,12 +2116,23 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE,
     
     train.df <- samples %>% 
       lapply(\(id) {
-       # rhdf5::h5read(paths[id], "matrix/training_elbo_per_epoch") %>%
-      #for cellbender v0.3.0
-        rhdf5::h5read(paths[id], "metadata/learning_curve_train_elbo") %>%
-          {data.frame(ELBO = ., 
-                      Epoch = seq_len(length(.)), 
-                      sample = id)}
+        # for cellbender v0.3
+        if ("learning_curve_train_elbo" %in% unique(rhdf5::h5ls(paths[id])$name)) {
+          message(paste0(Sys.time(), " Detected CellBender v0.3, loading training data."))
+          rhdf5::h5read(paths[id], "metadata/learning_curve_train_elbo") %>%
+            {data.frame(ELBO = ., 
+                        Epoch = seq_len(length(.)), 
+                        sample = id)}  
+        # for cellbender v0.2
+        } else if ("training_elbo_per_epoch" %in% unique(rhdf5::h5ls(paths[id])$name)) {
+          message(paste0(Sys.time(), " Detected CellBender v0.2, loading training data."))
+          rhdf5::h5read(paths[id], "matrix/training_elbo_per_epoch") %>%
+            {data.frame(ELBO = ., 
+                        Epoch = seq_len(length(.)), 
+                        sample = id)} 
+        } else {
+          stop(paste0("Neither CellBender v0.2 nor v0.3 output found in file(s): ", paths[id]))
+        }
       }) %>% 
       setNames(samples) %>% 
       bind_rows()
@@ -2123,13 +2140,21 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE,
     test.df <- samples %>%
       lapply(\(id) {
         path <- paths[id]
-       # data.frame(ELBO = rhdf5::h5read(path, "matrix/test_elbo"), 
-      #             Epoch = rhdf5::h5read(path, "matrix/test_epoch"), 
-        #           sample = id)
-        #for cellbender v0.3.0
-        data.frame(ELBO = rhdf5::h5read(path, "metadata/learning_curve_test_elbo"), 
+        # for cellbender v0.3
+        if ("learning_curve_test_elbo" %in% unique(rhdf5::h5ls(paths[id])$name)) {
+          message(paste0(Sys.time(), " Detected CellBender v0.3, loading test data."))
+          data.frame(ELBO = rhdf5::h5read(path, "metadata/learning_curve_test_elbo"), 
                    Epoch = rhdf5::h5read(path, "metadata/learning_curve_test_epoch"), 
                    sample = id)
+          # for cellbender v0.2
+        } else if ("test_elbo" %in% unique(rhdf5::h5ls(paths[id])$name)) {
+          message(paste0(Sys.time(), " Detected CellBender v0.2, loading test data."))
+          data.frame(ELBO = rhdf5::h5read(path, "matrix/test_elbo"), 
+                  Epoch = rhdf5::h5read(path, "matrix/test_epoch"), 
+                  sample = id)
+        } else {
+          stop(paste0("Neither CellBender v0.2 nor v0.3 output found in file(s): ", paths[id]))
+        }
       }) %>% 
       setNames(samples) %>% 
       bind_rows()
@@ -2172,18 +2197,29 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE,
     
     cell.prob <- samples %>%
       lapply(\(id) {
-        #rhdf5::h5read(paths[id], "matrix/latent_cell_probability") %>%
         # for cellbender v0.3.0
+        if ("cell_probability" %in% unique(rhdf5::h5ls(paths[id])$name)) {
+        message(paste0(Sys.time(), " Detected CellBender v0.3, loading cell probabilities."))
         rhdf5::h5read(paths[id], "droplet_latents/cell_probability") %>%
           {data.frame(prob = ., 
                       cell = seq_len(length(.)), 
                       sample = id)}
+        # for cellbender v0.2
+        } else if ("latent_cell_probability" %in% unique(rhdf5::h5ls(paths[id])$name)) {
+        message(paste0(Sys.time(), " Detected CellBender v0.2, loading cell probabilities."))
+        rhdf5::h5read(paths[id], "matrix/latent_cell_probability") %>%
+          {data.frame(prob = ., 
+                      cell = seq_len(length(.)), 
+                      sample = id)}
+        } else {
+          stop(paste0("Neither CellBender v0.2 nor v0.3 output found in file(s): ", paths[id]))
+        }
       }) %>% 
       setNames(samples) %>% 
       bind_rows()
     
     ggplot(cell.prob, aes(cell, prob, col = prob)) + 
-      geom_point() +
+      rasterise(geom_point(), dpi = 300) +
       scale_color_gradient(low=low.col, high=high.col) +
       self$theme +
       labs(x = "Cells", y = "Cell probability", col = "") +
@@ -2212,13 +2248,26 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE,
     
     amb <- samples %>% 
       lapply(\(id) {
-      #  rhdf5::h5read(paths[id], "matrix/ambient_expression") %>% 
-        #for cellbender version 3
+        # for cellbender v0.3
+        if ("/global_latents" %in% unique(rhdf5::h5ls(paths[id])$group)) {
+        message(paste0(Sys.time(), " Detected CellBender v0.3, loading ambient expression data."))
         rhdf5::h5read(paths[id], "global_latents/ambient_expression") %>% 
           {data.frame(exp = ., 
                       cell = seq_len(length(.)), 
                       gene.names = rhdf5::h5read(paths[id], "matrix/features/name") %>% as.character(), 
                       sample = id)}
+        # for cellbender v0.2, using a "name" from above to differentiate versions
+        } else if ("latent_cell_probability" %in% unique(rhdf5::h5ls(paths[id])$name)) {
+          message(paste0(Sys.time(), " Detected CellBender v0.2, loading ambient expression data."))
+        rhdf5::h5read(paths[id], "matrix/ambient_expression") %>% 
+          {data.frame(exp = ., 
+                      cell = seq_len(length(.)), 
+                      gene.names = rhdf5::h5read(paths[id], "matrix/features/name") %>% as.character(), 
+                      sample = id)}
+        }
+        else {
+          stop(paste0("Neither CellBender v0.2 nor v0.3 output found in file(s): ", paths[id]))
+          }    
       }) %>% 
       setNames(samples) %>% 
       bind_rows()
@@ -2258,14 +2307,28 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE,
     
     amb <- samples %>% 
       lapply(\(id) {
-        #rhdf5::h5read(paths[id], "matrix/ambient_expression") %>% 
-          #for cellbender version 3
-          rhdf5::h5read(paths[id], "global_latents/ambient_expression") %>% 
+        # for cellbender version 3
+        if ("/global_latents" %in% unique(rhdf5::h5ls(paths[id])$group)) {
+        message(paste0(Sys.time(), " Detected CellBender v0.3, loading ambient expression data."))
+        rhdf5::h5read(paths[id], "global_latents/ambient_expression") %>% 
           {data.frame(exp = ., 
                       cell = seq_len(length(.)), 
                       gene.names = rhdf5::h5read(paths[id], "matrix/features/name") %>% as.character(), 
                       sample = id)} %>% 
           filter(exp >= cutoff)
+        # for cellbender v0.2
+        } else if ("latent_cell_probability" %in% unique(rhdf5::h5ls(paths[id])$name)) {
+        message(paste0(Sys.time(), " Detected CellBender v0.2, loading ambient expression data."))
+        rhdf5::h5read(paths[id], "matrix/ambient_expression") %>% 
+          {data.frame(exp = ., 
+                      cell = seq_len(length(.)), 
+                      gene.names = rhdf5::h5read(paths[id], "matrix/features/name") %>% as.character(), 
+                      sample = id)} %>% 
+          filter(exp >= cutoff)
+        }
+        else {
+          stop(paste0("Neither CellBender v0.2 nor v0.3 output found in file(s): ", paths[id]))
+        }   
       }) %>% 
       setNames(samples) %>% 
       bind_rows() %$%
@@ -2279,7 +2342,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE,
       geom_bar(stat = "identity") +
       self$theme +
       labs(x = "", y = "Proportion") +
-      theme(axis.text.x = element_text(angle = 90)) + 
+      theme(axis.text.x = element_text(angle = 45, hjust = 1)) + 
       guides(fill = "none")
     
     if (!is.null(pal)) {
@@ -2508,7 +2571,7 @@ plotCbCells = function(data.path = self$data.path,
   g <- ggplot(df, aes(sample, value, fill = sample)) +
     geom_bar(stat = "identity") +
     self$theme + 
-    theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
+    theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
     guides(fill = "none") +
     labs(x = "", y = "") +
     facet_wrap(~variable, scales = "free_y", nrow = 2, ncol = 2)
diff --git a/R/inner_functions.R b/R/inner_functions.R
index 8935513..00eca7e 100644
--- a/R/inner_functions.R
+++ b/R/inner_functions.R
@@ -207,7 +207,6 @@ readParse <- function(data.path,
                     verbose = TRUE) {
   
   checkPackageInstalled("data.table", cran = TRUE)
-  checkPackageInstalled("Matrix", cran = TRUE)
   
   # If no samples provided, infer from directories
   if (is.null(samples)) samples <- list.dirs(data.path, full.names = FALSE, recursive = FALSE)
@@ -230,7 +229,7 @@ readParse <- function(data.path,
         .[grepl("count_matrix.mtx", .)]
         mat <- as(Matrix::readMM(mat.path), "CsparseMatrix")
         # Transpose the matrix to have genes in rows and cells in columns
-        mat <- t(mat)
+        mat <- Matrix::t(mat)
         
       # Add genes
       genes <- tmp.dir %>%
@@ -670,28 +669,11 @@ getH5Paths <- function(data.path,
     tolower() %>% 
     match.arg(c("raw","filtered","cellbender","cellbender_filtered"))
   
-  
-  # Determine which folder to use based on whether 'combined' (parse) is in the data path or 'per_sample_outs' (10x flex) is in the data path
-  use_dge_unfiltered <- grepl("combined", data.path)
-  use_count <- grepl("per_sample_outs", data.path)
-  
-  # Get the appropriate folder path for all samples (helper function)
-  get_folder_path <- function(sample) {
-    if (use_dge_unfiltered) {
-      return(paste0(sample[2], "/", sample[1], "/DGE_unfiltered"))
-    } else if (use_count) {
-      return(paste0(sample[2], "/", sample[1], "/count"))
-    } else {
-      return(paste0(sample[2], "/", sample[1], "/outs"))
-    }
-  }
-  
-
   # Get H5 paths
   paths <- data.path %>% 
     pathsToList(samples) %>% 
-    sapply(\(sample) {
-      folder <- get_folder_path(sample)
+    sapply(\(i) {
+      folder <- getFolderPaths(i, data.path = data.path)
       if (grepl("cellbender", type)) {
         paste0(folder,"/",type,".h5")
       } else {
@@ -708,8 +690,8 @@ getH5Paths <- function(data.path,
       names()
     
     miss <- miss.names %>% 
-      sapply(\(sample) {
-        folder <- get_folder_path(sample)
+      sapply(\(i) {
+        folder <- getFolderPaths(i, data.path= data.path)
         if (type == "raw") {
           paste0(folder, "/raw_[feature/gene]_bc_matrix.h5")
         } else if (type == "filtered") {
@@ -734,6 +716,8 @@ getH5Paths <- function(data.path,
   return(paths)
 }
 
+
+
 #' @title Create filtering vector
 #' @description Create logical filtering vector based on a numeric vector and a (sample-wise) cutoff
 #' @param num.vec numeric Numeric vector to create filter on
@@ -780,7 +764,7 @@ checkDataPath <- function(data.path) {
 }
 
 
-# this fucntion is wired misses a title and so on...
+# no title?
 pathsToList <- function(data.path, samples) {
   data.path %>% 
     lapply(\(path) list.dirs(path, recursive = F, full.names = F) %>% 
@@ -791,3 +775,18 @@ pathsToList <- function(data.path, samples) {
     data.frame() %>% 
     as.list() 
 }
+
+
+# this function uses the output of pathsToList() as input "i"
+getFolderPaths <- function(i, data.path) {
+  # Determine which folder to use based on whether 'combined' (parse) is in the data path or 'per_sample_outs' (10x flex) is in the data path
+  use_dge_unfiltered <- grepl("combined", data.path)
+  use_count <- grepl("per_sample_outs", data.path)
+  if (use_dge_unfiltered) {
+    return(file.path(i[2], i[1], "DGE_unfiltered"))
+  } else if (use_count) {
+    return(file.path(i[2], i[1], "count"))
+  } else {
+    return(file.path(i[2], i[1], "outs"))
+  }
+}

From 30af8d052bd51cdb3d7411c52aeb27b96a14028d Mon Sep 17 00:00:00 2001
From: Laura19993 <laura.wolbeck93@gmail.com>
Date: Tue, 23 Sep 2025 14:28:18 +0200
Subject: [PATCH 3/6] Fixes nr 2: introduce technology flag

---
 .gitignore          |  4 +++
 R/CRMetrics.R       | 72 +++++++++++++++++++++++++++------------------
 R/inner_functions.R | 48 +++++++++++++++++-------------
 3 files changed, 75 insertions(+), 49 deletions(-)

diff --git a/.gitignore b/.gitignore
index 6a9414d..119d079 100644
--- a/.gitignore
+++ b/.gitignore
@@ -37,5 +37,9 @@ CRAN-SUBMISSION
 control_6
 control_7
 metadata.csv
+metadata_test.csv
+metadata_test2.csv
+metadata_test_multiome.csv
+metadata_parse.csv
 # summary of updates for users
 CRMetrics_updates_for_users.Rmd
\ No newline at end of file
diff --git a/R/CRMetrics.R b/R/CRMetrics.R
index 296e004..79edd22 100644
--- a/R/CRMetrics.R
+++ b/R/CRMetrics.R
@@ -58,8 +58,9 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE,
    n.cores = 1,
   
   #' Initialize a CRMetrics object
-  #' @description To initialize new object, 'data.path' or 'cms' is needed. 'metadata' is also recommended, but not required.
-  #' @param data.path character Path to directory with Cell Ranger count data, one directory per sample (default = NULL).
+  #' @description To initialize new object, 'data.path' or 'cms' and 'technology' is needed. 'metadata' is also recommended, but not required.
+  #' @param data.path character Path to directory with count data, one directory per sample (default = NULL).
+  #' @param technology character applied single cell technology (default = c("10x", "10xflex", "10xmultiome", "parse")) 
   #' @param metadata data.frame or character Path to metadata file (comma-separated) or name of metadata dataframe object. Metadata must contain a column named 'sample' containing sample names that must match folder names in 'data.path' (default = NULL)
   #' @param cms list List with count matrices (default = NULL)
   #' @param samples character Sample names. Only relevant is cms is provided (default = NULL)
@@ -77,7 +78,8 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE,
   #' \dontrun{
   #' crm <- CRMetrics$new(data.path = "/path/to/count/data/")
   #' }
-  initialize = function(data.path = NULL, 
+  initialize = function(data.path = NULL,
+                        technology = c("10x", "10xflex", "10xmultiome", "parse"),
                         metadata = NULL, 
                         cms = NULL,
                         samples = NULL,
@@ -114,9 +116,13 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE,
       })
     }
     
+    # check that technology is provided and converted to lower case
+    technology %<>% tolower() %>% match.arg(c("10x", "10xflex", "10xmultiome", "parse"))
+    
     # Write stuff to object
     self$n.cores <- as.integer(n.cores)
     self$data.path <- data.path
+    self$technology <- technology
     self$verbose <- verbose
     self$theme <- theme
     self$pal <- pal
@@ -1673,6 +1679,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE,
   #' @param expected.cells named numeric If NULL, expected cells will be deduced from the number of cells per sample identified by Cell Ranger. Otherwise, a named vector of expected cells with sample IDs as names. Sample IDs must match those in summary.metrics (default: stored named vector)
   #' @param total.droplets named numeric If NULL, total droplets included will be deduced from expected cells multiplied by 3. Otherwise, a named vector of total droplets included with sample IDs as names. Sample IDs must match those in summary.metrics (default: stored named vector)
   #' @param cms.raw list Raw count matrices from HDF5 Cell Ranger outputs (default = self$cms.raw)
+  #' @param technology character applied single cell technology (default = c("10x", "10xflex", "10xmultiome", "parse")) 
   #' @param umi.counts list UMI counts calculated as column sums of raw count matrices from HDF5 Cell Ranger outputs (default: stored list)
   #' @param data.path character Path to Cell Ranger outputs (default = self$data.path)
   #' @param samples character Sample names to include (default = self$metadata$sample)
@@ -1692,6 +1699,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE,
                                expected.cells = NULL, 
                                total.droplets = NULL, 
                                cms.raw = self$cms.raw, 
+                               technology = self$technology,
                                umi.counts = self$cellbender$umi.counts, 
                                data.path = self$data.path, 
                                samples = self$metadata$sample, 
@@ -1714,12 +1722,12 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE,
       
       checkDataPath(data.path)
       
-      if (grepl("combined", data.path)) {
+      if (technology == "parse") {
         if (verbose) message(paste0(Sys.time()," Loading Parse outputs"))
         cms.raw <- readParse(data.path, samples, "raw", n.cores = n.cores, verbose = verbose, unique.names = unique.names, sep = sep)
     } else {
       if (verbose) message(paste0(Sys.time()," Loading HDF5 Cell Ranger outputs"))
-      cms.raw <- read10xH5(data.path, samples, "raw", n.cores = n.cores, verbose = verbose, unique.names = unique.names, sep = sep)
+      cms.raw <- read10xH5(data.path, samples, "raw", technology = self$technology, n.cores = n.cores, verbose = verbose, unique.names = unique.names, sep = sep)
      }
     self$cms.raw <- cms.raw
   }
@@ -1772,7 +1780,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE,
    
   
   # transform parse output to work as input for cellbender
-    if (grepl("combined", data.path)) {
+    if (technology == "parse") {
       if (verbose) message(paste0(Sys.time()," Transforming Parse outputs as preparation for cellbender"))
       # check if transformed data already exists
       full.path <- data.path %>% 
@@ -1818,6 +1826,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE,
   
   
   #' @param file character File name for CellBender script. Will be stored in `data.path` (default: "cellbender_script.sh")
+  #' @param technology character Applied single cell technology (default = self$technology).
   #' @param fpr numeric False positive rate for CellBender (default = 0.01)
   #' @param epochs integer Number of epochs for CellBender (default = 150)
   #' @param use.gpu logical Use CUDA capable GPU (default = TRUE)
@@ -1834,6 +1843,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE,
   #' crm$saveCellbenderScript()
   #' }
   saveCellbenderScript = function(file = "cellbender_script.sh", 
+                                  technology = self$technology,
                                   fpr = 0.01, 
                                   epochs = 150, 
                                   use.gpu = TRUE, 
@@ -1845,7 +1855,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE,
     # Preparations
     checkDataPath(data.path)
     
-    if (grepl("combined", data.path)) {
+    if (technology == "parse") {
       # in case of parse we use the genes, matrix, barcodes files stored in DGE_unfiltered and therefor we just need to give the path to DGE_unfiltered
       inputs <- data.path %>%
         pathsToList(samples) %>% 
@@ -1872,15 +1882,15 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE,
       })
       
     } else {
-      inputs <- getH5Paths(data.path, samples, "raw")
+      inputs <- getH5Paths(data.path, samples, "raw", technology)
     }
     
     outputs <- data.path %>% 
       pathsToList(samples) %>% 
     sapply(\(sample) {
-        if (grepl("combined", data.path)) {
+        if (technology == "parse") {
           paste0(sample[2], "/", sample[1], "/DGE_unfiltered/cellbender.h5")
-        } else if (grepl("per_sample_outs", data.path)) {
+        } else if (technology == "10xflex") {
           paste0(sample[2], "/", sample[1], "/count/cellbender.h5")
         } else {
           paste0(sample[2], "/", sample[1], "/outs/cellbender.h5")
@@ -1990,11 +2000,10 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE,
   #' @description Add a list of count matrices to the CRMetrics object.
   #' @param cms list List of (sparse) count matrices (default = NULL)
   #' @param data.path character Path to Cell Ranger or Parse count data (default = self$data.path).
+  #' @param technology character Applied single cell technology (default = self$technology).
   #' @param samples character Vector of sample names. If NULL, samples are extracted from cms (default = self$metadata$sample)
   #' @param cellbender logical Add CellBender filtered count matrices in HDF5 format. Requires that "cellbender" is in the names of the files (default = FALSE)
-  #' @param parse logical Add Parse filtered count matrices. Cannot be combined with `cellbender=TRUE`. Cannot be combined with `raw=TRUE`. (default = FALSE)
-  #' @param flex logical Add 10x Flex filtered count matrices from Cell Ranger. Can be combined with `raw=TRUE`. Cannot be combined with `cellbender=TRUE`. (default = FALSE)
-  #' @param raw logical Add raw count matrices from Cell Ranger output. Cannot be combined with `cellbender=TRUE`. (default = FALSE)
+  #' @param raw logical Add raw count matrices from Cell Ranger or Parse output. Cannot be combined with `cellbender=TRUE`. (default = FALSE)
   #' @param symbol character The type of gene IDs to use, SYMBOL (TRUE) or ENSEMBLE (default = TRUE)
   #' @param unique.names logical Make cell names unique based on `sep` parameter (default = TRUE)
   #' @param sep character Separator used to create unique cell names (default = "!!")
@@ -2019,10 +2028,9 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE,
   #' }
   addCms = function(cms = NULL, 
                     data.path = self$data.path,
+                    technology = self$technology,
                     samples = self$metadata$sample,
                     cellbender = FALSE,
-                    flex = FALSE,
-                    parse = FALSE,
                     raw = FALSE,
                     symbol = TRUE,
                     unique.names = TRUE, 
@@ -2064,14 +2072,11 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE,
       if (unique.names) cms %<>% createUniqueCellNames(samples, sep)
     } else {
       # Add from data.path argument
-      if (parse && cellbender) {
-        stop("Error: If you run CellBender on Parse files, 'parse' must be set to FALSE and only 'cellbender' set to TRUE.")
-      }
       if (cellbender) {
-        cms <- read10xH5(data.path = data.path, samples = samples, symbol = symbol, type = "cellbender_filtered", sep = sep, n.cores = n.cores, verbose = verbose, unique.names = unique.names)
-        }  else if (parse) {       
-          cms <- readParse(data.path = data.path, samples = samples, sep = sep, n.cores = n.cores, verbose = verbose, unique.names = unique.names)
-        }  else if (flex) {
+        cms <- read10xH5(data.path = data.path, samples = samples, symbol = symbol, type = "cellbender_filtered", technology = self$technology, sep = sep, n.cores = n.cores, verbose = verbose, unique.names = unique.names)
+        }  else if (technology == "parse") {       
+          cms <- readParse(data.path = data.path, samples = samples, raw = raw, sep = sep, n.cores = n.cores, verbose = verbose, unique.names = unique.names)
+        }  else if (technology == "10xflex") {
           cms <- readFlex(data.path = data.path, samples = samples, raw = raw, symbol = symbol, sep = sep, n.cores = n.cores, verbose = verbose, unique.names = unique.names)   
            }
        else {
@@ -2097,6 +2102,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE,
   #' @description Plot the results from the CellBender estimations
   #' @param data.path character Path to Cell Ranger outputs (default = self$data.path)
   #' @param samples character Sample names to include (default = self$metadata$sample)
+  #' @param technology character applied single cell technology (default = c("10x", "10xflex", "10xmultiome", "parse"))
   #' @param pal character Plotting palette (default = self$pal)
   #' @return A ggplot2 object
   #' @examples 
@@ -2109,10 +2115,11 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE,
   #' }
   plotCbTraining = function(data.path = self$data.path, 
                             samples = self$metadata$sample,
+                            technology = self$technology,
                             pal = self$pal) {
     checkDataPath(data.path)
     checkPackageInstalled("rhdf5", bioc = TRUE)
-    paths <- getH5Paths(data.path, samples, "cellbender")
+    paths <- getH5Paths(data.path, samples, "cellbender", technology)
     
     train.df <- samples %>% 
       lapply(\(id) {
@@ -2176,6 +2183,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE,
   #' @description Plot the CellBender assigned cell probabilities
   #' @param data.path character Path to Cell Ranger outputs (default = self$data.path)
   #' @param samples character Sample names to include (default = self$metadata$sample)
+  #' @param technology character applied single cell technology (default = c("10x", "10xflex", "10xmultiome", "parse"))
   #' @param low.col character Color for low probabilities (default = "gray")
   #' @param high.col character Color for high probabilities (default = "red")
   #' @return A ggplot2 object
@@ -2189,11 +2197,12 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE,
   #' }
   plotCbCellProbs = function(data.path = self$data.path, 
                              samples = self$metadata$sample,
+                             technology = self$technology,
                              low.col = "gray",
                              high.col = "red") {
     checkDataPath(data.path)
     checkPackageInstalled("rhdf5", bioc = TRUE)
-    paths <- getH5Paths(data.path, samples, "cellbender")
+    paths <- getH5Paths(data.path, samples, "cellbender", technology)
     
     cell.prob <- samples %>%
       lapply(\(id) {
@@ -2219,7 +2228,6 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE,
       bind_rows()
     
     ggplot(cell.prob, aes(cell, prob, col = prob)) + 
-      rasterise(geom_point(), dpi = 300) +
       scale_color_gradient(low=low.col, high=high.col) +
       self$theme +
       labs(x = "Cells", y = "Cell probability", col = "") +
@@ -2230,6 +2238,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE,
   #' @param cutoff numeric Horizontal line included in the plot to indicate highly expressed ambient genes (default = 0.005)
   #' @param data.path character Path to Cell Ranger outputs (default = self$data.path)
   #' @param samples character Sample names to include (default = self$metadata$sample)
+  #' @param technology character applied single cell technology (default = c("10x", "10xflex", "10xmultiome", "parse"))
   #' @return A ggplot2 object
   #' @examples 
   #' \dontrun{
@@ -2241,10 +2250,11 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE,
   #' }
   plotCbAmbExp = function(cutoff = 0.005, 
                           data.path = self$data.path, 
-                          samples = self$metadata$sample) {
+                          samples = self$metadata$sample,
+                          technology = self$technology) {
     checkDataPath(data.path)
     checkPackageInstalled("rhdf5", bioc = TRUE)
-    paths <- getH5Paths(data.path, samples, "cellbender")
+    paths <- getH5Paths(data.path, samples, "cellbender", technology)
     
     amb <- samples %>% 
       lapply(\(id) {
@@ -2287,6 +2297,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE,
   #' @param cutoff numeric Cutoff of ambient gene expression to use to extract ambient genes per sample
   #' @param data.path character Path to Cell Ranger outputs (default = self$data.path)
   #' @param samples character Sample names to include (default = self$metadata$sample)
+  #' @param technology character applied single cell technology (default = c("10x", "10xflex", "10xmultiome", "parse"))
   #' @param pal character Plotting palette (default = self$pal)
   #' @return A ggplot2 object
   #' @examples 
@@ -2300,10 +2311,11 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE,
   plotCbAmbGenes = function(cutoff = 0.005, 
                             data.path = self$data.path, 
                             samples = self$metadata$sample,
+                            technology = self$technology,
                             pal = self$pal) {
     checkDataPath(data.path)
     checkPackageInstalled("rhdf5", bioc = TRUE)
-    paths <- getH5Paths(data.path, samples, "cellbender")
+    paths <- getH5Paths(data.path, samples, "cellbender", technology)
     
     amb <- samples %>% 
       lapply(\(id) {
@@ -2535,6 +2547,7 @@ plotSoupX = function(plot.df = self$soupx$plot.df) {
 #' @description Plot CellBender cell estimations against the estimated cell numbers from Cell Ranger
 #' @param data.path character Path to Cell Ranger outputs (default = self$data.path)
 #' @param samples character Sample names to include (default = self$metadata$sample)
+#' @param technology character applied single cell technology (default = c("10x", "10xflex", "10xmultiome", "parse")) description
 #' @param pal character Plotting palette (default = self$pal)
 #' @return A ggplot2 object
 #' @examples 
@@ -2547,10 +2560,11 @@ plotSoupX = function(plot.df = self$soupx$plot.df) {
 #' }
 plotCbCells = function(data.path = self$data.path, 
                        samples = self$metadata$sample,
+                       technology = self$technology,
                        pal = self$pal) {
   checkDataPath(data.path)
   checkPackageInstalled("rhdf5", bioc = TRUE)
-  paths <- getH5Paths(data.path, samples, "cellbender_filtered")
+  paths <- getH5Paths(data.path, samples, "cellbender_filtered", technology)
   
   df <- samples %>% 
     sapply(\(id) rhdf5::h5read(paths[id], "matrix/shape")[2]) %>% 
diff --git a/R/inner_functions.R b/R/inner_functions.R
index 00eca7e..5c8d008 100644
--- a/R/inner_functions.R
+++ b/R/inner_functions.R
@@ -389,14 +389,15 @@ addPlotStatsSamples <- function(p,
 #' @param verbose Print messages (default = TRUE).
 #' @keywords internal
 #' @return data frame
-addSummaryMetrics <- function(data.path, 
+addSummaryMetrics <- function(data.path,
                               metadata, 
                               n.cores = 1, 
                               verbose = TRUE) {
   samples.tmp <- list.dirs(data.path, recursive = FALSE, full.names = FALSE)
   samples <- intersect(samples.tmp, metadata$sample)
   
-  doubles <- table(samples.tmp) %>% 
+  # checking if there are duplicated sample names only among the samples that will be used/are in metadata
+  doubles <- table(samples) %>% 
     .[. > 1] %>% 
     names()
   
@@ -409,16 +410,16 @@ addSummaryMetrics <- function(data.path,
     pathsToList(metadata$sample) %>% 
     plapply(\(s) {
       
-      # define possible directories to search for summary files (outs in case of Cell Ranger, report in case of Parse, sample base dir in case of 10X flex)
+      # define possible directories to search for summary files (outs in case of 10x and 10xmultiome, report in case of Parse, sample base dir in case of 10X flex)
       outs_dir <- paste(s[2], s[1], "outs", sep = "/")
       report_dir <- paste(s[2], s[1], "report", sep = "/")
       sample_base_dir <- paste(s[2], s[1], sep = "/")
       
-      # Check which folder exists and use it to read the summary file
+      # check which path exists to read the summary file
       dir_to_use <- if (dir.exists(outs_dir)) outs_dir else if (dir.exists(report_dir)) report_dir else sample_base_dir
       
       if (!dir.exists(dir_to_use)) {
-        warning(paste("No 'outs', 'report', or summary file found for sample:", s[1]))
+        warning(paste("No 'outs' folder, 'report' folder, or summary file found for sample:", s[1]))
         return(NULL)
       }
       
@@ -431,7 +432,7 @@ addSummaryMetrics <- function(data.path,
                across(.cols = grep(",", .), ~ as.numeric(gsub(",", "", .x))))
       
        # Take into account multiomics
-       if ("Sample.ID" %in% colnames(tmp)) tmp %<>% select(-c("Sample.ID","Genome","Pipeline.version"))
+      if ("Sample.ID" %in% colnames(tmp)) tmp %<>% select(-c("Sample.ID","Genome","Pipeline.version"))
   
         
      # Add the sample column, pivot to long format, and clean metric names            
@@ -445,11 +446,11 @@ addSummaryMetrics <- function(data.path,
       
       else if (dir_to_use == report_dir) {
       
-      # Logic for 'report' folder (adjust this based on the structure of files in 'report')
+      # Logic for 'report' folder 
       tmp <- read.table(dir(dir_to_use, glob2rx("*ummary.csv"), full.names = TRUE), 
                         header = TRUE, sep = ",", stringsAsFactors = FALSE) %>%
-        select(statistic, combined) %>% # Only select 'statistic' and 'combined' columns
-        rename(metric = statistic, value = combined) %>%  # Rename columns
+        select(1:2) %>%    # Select the first two columns
+        rename(metric = 1, value = 2) %>%  # Rename them
         mutate(sample = s[1])   # Add sample column
       }
       
@@ -569,6 +570,7 @@ labelsFilter <- function(filter.data) {
 #' @param data.path character
 #' @param samples character vector, select specific samples for processing (default = NULL)
 #' @param type name of H5 file to search for, "raw" and "filtered" are Cell Ranger count outputs, "cellbender" is output from CellBender after running script from saveCellbenderScript
+#' @param technology character Used single-cell technology (c("10x", "10xmultiome", "flex"))
 #' @param symbol logical Use gene SYMBOLs (TRUE) or ENSEMBL IDs (FALSE) (default = TRUE)
 #' @param sep character Separator for creating unique cell names from sample IDs and cell IDs (default = "!!")
 #' @param n.cores integer Number of cores (default = 1)
@@ -583,6 +585,7 @@ labelsFilter <- function(filter.data) {
 read10xH5 <- function(data.path, 
                       samples = NULL, 
                       type = c("raw","filtered","cellbender","cellbender_filtered"), 
+                      technology = c("10x", "10xmultiome", "flex"),
                       symbol = TRUE, 
                       sep = "!!", 
                       n.cores = 1, 
@@ -592,7 +595,7 @@ read10xH5 <- function(data.path,
   
   if (is.null(samples)) samples <- list.dirs(data.path, full.names = FALSE, recursive = FALSE)
   
-  full.path <- getH5Paths(data.path, samples, type)
+  full.path <- getH5Paths(data.path, samples, type, technology)
   
   if (verbose) message(paste0(Sys.time()," Loading ",length(full.path)," count matrices using ", if (n.cores < length(full.path)) n.cores else length(full.path)," cores"))
   out <- full.path %>%
@@ -660,20 +663,27 @@ createUniqueCellNames <- function(cms,
 #' @param data.path character Path for directory containing sample-wise directories with Cell Ranger count/Parse outputs
 #' @param samples character Sample names to include (default = NULL)
 #' @param type character Type of H5 files to get paths for, one of "raw", "filtered" (Cell Ranger count outputs, does not work for Parse), "cellbender" (raw CellBender outputs), "cellbender_filtered" (CellBender filtered outputs) (default = "type")
+#' @param technology character Used technology ("parse", "10x", "10xflex", "10xmultiome") (default = NULL)
 #' @keywords internal
 getH5Paths <- function(data.path, 
                        samples = NULL, 
-                       type = NULL) {
+                       type = NULL,
+                       technology = NULL) {
+  
   # Check input
   type %<>%
     tolower() %>% 
     match.arg(c("raw","filtered","cellbender","cellbender_filtered"))
   
+  technology %<>%
+    tolower() %>% 
+    match.arg(c("parse", "10x", "10xflex", "10xmultiome"))
+  
   # Get H5 paths
   paths <- data.path %>% 
     pathsToList(samples) %>% 
     sapply(\(i) {
-      folder <- getFolderPaths(i, data.path = data.path)
+      folder <- getFolderPaths(i, technology = technology)
       if (grepl("cellbender", type)) {
         paste0(folder,"/",type,".h5")
       } else {
@@ -691,7 +701,7 @@ getH5Paths <- function(data.path,
     
     miss <- miss.names %>% 
       sapply(\(i) {
-        folder <- getFolderPaths(i, data.path= data.path)
+        folder <- getFolderPaths(i, technology = technology)
         if (type == "raw") {
           paste0(folder, "/raw_[feature/gene]_bc_matrix.h5")
         } else if (type == "filtered") {
@@ -764,7 +774,7 @@ checkDataPath <- function(data.path) {
 }
 
 
-# no title?
+# returns a list where the first element contains all sample names and the second element the path
 pathsToList <- function(data.path, samples) {
   data.path %>% 
     lapply(\(path) list.dirs(path, recursive = F, full.names = F) %>% 
@@ -778,13 +788,11 @@ pathsToList <- function(data.path, samples) {
 
 
 # this function uses the output of pathsToList() as input "i"
-getFolderPaths <- function(i, data.path) {
-  # Determine which folder to use based on whether 'combined' (parse) is in the data path or 'per_sample_outs' (10x flex) is in the data path
-  use_dge_unfiltered <- grepl("combined", data.path)
-  use_count <- grepl("per_sample_outs", data.path)
-  if (use_dge_unfiltered) {
+getFolderPaths <- function(i, technology) {
+  # Determine which folder to use based on which technology was used
+  if (technology == "parse") {
     return(file.path(i[2], i[1], "DGE_unfiltered"))
-  } else if (use_count) {
+  } else if (technology == "10xflex") {
     return(file.path(i[2], i[1], "count"))
   } else {
     return(file.path(i[2], i[1], "outs"))

From 317698aa160bde8639cbfe7966e50ec0fac4428a Mon Sep 17 00:00:00 2001
From: Laura19993 <laura.wolbeck93@gmail.com>
Date: Fri, 14 Nov 2025 11:30:59 +0100
Subject: [PATCH 4/6] read in of duplicated samples fixed, new function
 plotRawDepth

---
 R/CRMetrics.R       | 150 ++++++++++++++++++++++++++++++++++----------
 R/inner_functions.R |   7 ++-
 2 files changed, 120 insertions(+), 37 deletions(-)

diff --git a/R/CRMetrics.R b/R/CRMetrics.R
index 83a217f..2cb5cd4 100644
--- a/R/CRMetrics.R
+++ b/R/CRMetrics.R
@@ -63,7 +63,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE,
   #' @param technology character applied single cell technology (default = c("10x", "10xflex", "10xmultiome", "parse")) 
   #' @param metadata data.frame or character Path to metadata file (comma-separated) or name of metadata dataframe object. Metadata must contain a column named 'sample' containing sample names that must match folder names in 'data.path' (default = NULL)
   #' @param cms list List with count matrices (default = NULL)
-  #' @param samples character Sample names. Only relevant is cms is provided (default = NULL)
+  #' @param samples character Sample names. Only relevant if cms is provided (default = NULL)
   #' @param unique.names logical Create unique cell names. Only relevant if cms is provided (default = TRUE)
   #' @param sep.cells character Sample-cell separator. Only relevant if cms is provided and `unique.names=TRUE` (default = "!!")
   #' @param comp.group character A group present in the metadata to compare the metrics by, can be added with addComparison (default = NULL)
@@ -128,15 +128,23 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE,
     self$pal <- pal
     
     # Metadata
+    # behaviour if metadata file is not given
     if (is.null(metadata)) {
       if (!is.null(data.path)) {
         self$metadata <- lapply(data.path, 
                                 list.dirs, 
                                 recursive = FALSE, 
                                 full.names = FALSE) %>% 
-          Reduce(c, .) %>% 
-   .[pathsToList(data.path, .) %>% sapply(\(path) file.exists(paste0(path[2],"/",path[1],"/outs")))] %>% 
-          {data.frame(sample = .)}
+          Reduce(c, .) %>%
+          { subfolder <- switch(technology,
+                                "parse"   = "DGE_filtered",
+                                "10xflex" = "count", 
+                                "10x" = "outs")
+          # filter the directory names vector (.) to keep only those directories for which a corresponding subfolder exists, problem in Parse: all-sample folder also has that subfolder
+          .[pathsToList(data.path, .) 
+          %>% sapply(function(path) file.exists(file.path(path[2], path[1], subfolder)))] 
+          } %>% 
+          { data.frame(sample = .) }
       } else {
         if (is.null(names(cms))) {
           if (is.null(samples)) {
@@ -164,8 +172,22 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE,
     if (!is.null(metadata)) {
       if (!raw.meta) self$metadata %<>% lapply(type.convert, as.is = FALSE) %>% bind_cols()
     }
+    # check if metadata has duplicated samples 
+    doubles <- table(self$metadata$sample) %>% 
+      .[. > 1] %>% 
+      names()
+    if (length(doubles) > 0) stop(paste0("One or more samples are present twice in provided/derived 'metadata'. 
+                                         Sample names must be unique. Affected sample(s): ",paste(doubles, collapse = " ")))
+    # check if metadata contains 'all-sample' which is created by parse pipeline automatically and remove it
+    all_sample_rows <- grepl("^all[-_ ]?sample$", self$metadata$sample, ignore.case = TRUE)
+    if (any(all_sample_rows)) {
+      message("Removed 'all-sample' entries automatically created by the Parse pipeline: ",
+              paste(self$metadata$sample[all_sample_rows], collapse = ", "))
+      self$metadata <- self$metadata[!all_sample_rows, , drop = FALSE]
+    }
+    
     
-    # Add CMs
+     # Add CMs
     if (!is.null(cms)) {
       self$addCms(cms = cms, 
                   samples = samples, 
@@ -639,7 +661,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE,
       if (length(depth.cutoff) > 1) {
         main <- "Cells with low depth with sample-specific cutoff"
       } else {
-        main <- paste0("Cells with low depth, < ",depth.cutoff)
+        main <- paste0("Cells with depth < ",depth.cutoff)
       }
         g <- self$con$plotGraph(colors = (!depths) * 1, title = main, size = size, ...)
     }
@@ -675,8 +697,8 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE,
     return(g)
   },
   
-  #' @description Plot the sequencing depth in histogram.
-  #' @param cutoff numeric The depth cutoff to color the cells in the embedding (default = 1e3).
+  #' @description Plot per-sample density histograms of cell depths (=total UMI counts per cell). The plot highlights cells below or above a specified UMI cutoff with distinct colors, helping to assess data quality and filtering thresholds.
+  #' @param cutoff numeric  Depth cutoff used to distinguish filtered and retained cells. Can be a single value or a named numeric vector specifying cutoffs per sample (default = 1e3).
   #' @param samples character Sample names to include for plotting (default = $metadata$sample).
   #' @param sep character Separator for creating unique cell names (default = "!!")
   #' @param keep.col character Color for density of cells that are kept (default = "#E7CDC2")
@@ -722,7 +744,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE,
       stop("No Conos object found, please run createEmbedding.")
     }
     
-    if (length(cutoff) > 1 & length(self$con$samples) != length(cutoff)) stop(paste0("'cutoff' has a length of ",length(cutoff),", but the conos object contains ",length(tmp)," samples. Please adjust."))
+    if (length(cutoff) > 1 & length(self$con$samples) != length(cutoff)) stop(paste0("'cutoff' has a length of ",length(cutoff),", but the conos object contains ",length(self$con$samples)," samples. Please adjust."))
     
     depths <- self$getDepth()
     
@@ -732,7 +754,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE,
       mutate(sample = sample %>% strsplit(sep, TRUE) %>% sapply(`[[`, 1)) %>%
       split(., .$sample) %>% 
       .[samples] %>% 
-      lapply(\(z) with(density(z$depth, adjust = 1/10), data.frame(x,y))) %>% 
+      lapply(\(z) with(density(z$depth, adjust = 1/10, from = 0), data.frame(x,y))) %>% 
       {lapply(names(.), \(x) data.frame(.[[x]], sample = x))} %>% 
       bind_rows()
     
@@ -757,7 +779,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE,
           geom_line() +
           xlim(0,xmax) +
           theme(legend.position = "none", axis.text.x = element_text(angle = 45, hjust = 1), plot.margin = unit(c(0, 0, 0, 0.5), "cm")) +
-          labs(title = id, y = "Density [AU]", x = "")
+          labs(title = id, y = "Density [AU]", x = "UMI counts")
         
         if (length(cutoff) == 1) {
           plot.cutoff <- cutoff
@@ -780,6 +802,64 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE,
     
     return(depth.plot)
   },
+
+  #' @description Plot per-sample histograms of total UMI counts per cell (= cell depth), allowing visualization of sequencing depth distributions across samples.
+  #' @param cutoff numeric Depth (UMI count) cutoff used to distinguish filtered and retained cells. Can be a single value or a named numeric vector specifying cutoffs per sample (Default = 1e3).
+  #' @param samples character Vector of sample names to include in the plot (default = self$metadata$sample).
+  #' @param sep  character Separator for creating unique cell names (default = "!!")
+  #' @param binwidth numeric Width of the histogram bins (default = 10).
+  #' @param xupper numeric Optional upper x-axis limit (default = 2e4).
+  #' @return A combined `ggplot` object showing UMI count histograms per sample.
+
+  plotRawDepth = function(cutoff = 1e3,
+                           samples = self$metadata$sample,
+                           sep = "!!",
+                           binwidth = 10,
+                           xupper = 2e4){
+    
+    # Checks
+    checkPackageInstalled("conos", cran = TRUE)
+    if (is.null(self$con)) {
+     stop("No Conos object found, please run createEmbedding.")
+   }
+  
+  if (length(cutoff) > 1 & length(self$con$samples) != length(cutoff)) stop(paste0("'cutoff' has a length of ",length(cutoff),", but the conos object contains ",length(self$con$samples)," samples. Please adjust."))
+  
+  depths <- self$getDepth()
+    # build tidy df
+    tmp <- depths %>% 
+      {data.frame(depth = unname(.), sample = names(.))} %>% 
+      mutate(sample = sample %>% strsplit(sep, TRUE) %>% sapply(`[[`, 1)) %>%
+      filter(sample %in% samples)
+    
+    ncol.plot <- samples %>% 
+      length() %>% 
+      pmin(3)
+    
+    # Plot
+    depth.plot <- lapply(unique(tmp$sample), function(id) {
+      tmp.plot <- tmp %>% filter(sample == id)
+        
+    g <- ggplot(tmp.plot, aes(x = depth)) +
+          geom_histogram(binwidth = binwidth, boundary = 0, closed = "left") +
+          labs(title = id, x = "UMI counts", y = "Number of cells") +
+          self$theme +
+          xlim(0,xupper) + geom_vline(xintercept = cutoff, color="red", linetype = "dashed")
+        # optional: + scale_x_log10()
+        
+    if (length(cutoff) == 1) {
+      plot.cutoff <- cutoff
+    } else {
+      plot.cutoff <- cutoff[names(cutoff) == id]
+    }
+        
+  return(g)
+    }) %>% 
+    plot_grid(plotlist = ., ncol = ncol.plot, label_size = 5)
+    
+    return(depth.plot)
+  },   
+  
   
   #' @description Plot the mitochondrial fraction in histogram.
   #' @param cutoff numeric The mito. fraction cutoff to color the embedding (default = 0.05)
@@ -1572,7 +1652,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE,
     return(g)
   },
   
-  #' @description Extract sequencing depth from Conos object.
+  #' @description Extract sum of UMI counts per cell (= cell depth) 
   #' @param cms list List of (sparse) count matrices (default = self$cms)
   #' @return data frame
   #' @examples 
@@ -1786,36 +1866,36 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE,
       full.path <- data.path %>% 
         pathsToList(samples) %>% 
         sapply(\(sample) {
-           paste0(sample[2],"/",sample[1],"/DGE_unfiltered")
+           paste0(sample[2],sample[1], "/DGE_unfiltered")
         })
       lapply(full.path, function(x) {
         
         # 1. Check and create genes.tsv
-        if (file.exists(paste0(x,"/","genes.tsv"))) {
+        if (file.exists(file.path(x,"genes.tsv"))) {
           warning(paste0("genes.tsv already exists in ", x))
         } else {
           if(verbose) message(paste0(Sys.time()," Creating genes.tsv in ", x))
-          fread(paste0(x,"/","all_genes.csv"), sep = ",", header = TRUE) %>%
+          data.table::fread(file.path(x,"all_genes.csv"), sep = ",", header = TRUE) %>%
             mutate(genome = "Gene Expression") %>%
-            fwrite(paste0(x,"/","genes.tsv"), sep = "\t", col.names = FALSE)
+            data.table::fwrite(file.path(x,"genes.tsv"), sep = "\t", col.names = FALSE)
         }
         # 2. Check and create barcodes.tsv
-        if (file.exists(paste0(x,"/","barcodes.tsv"))) {
+        if (file.exists(file.path(x,"barcodes.tsv"))) {
           warning(paste0("barcodes.tsv already exists in ", x))
         } else {
           if(verbose) message(paste0(Sys.time()," Creating barcodes.tsv in ", x))
-          fread(paste0(x,"/","cell_metadata.csv"), sep = ",", header = TRUE) %>%
+          data.table::fread(file.path(x,"cell_metadata.csv"), sep = ",", header = TRUE) %>%
             .[, "bc_wells"] %>%
-            write.table(paste0(x,"/","barcodes.tsv"), row.names=F, col.names=F)
+            write.table(file.path(x,"barcodes.tsv"), row.names=F, col.names=F)
         }
         # 3. Check and create matrix.mtx
-        if (file.exists(paste0(x,"/","matrix.mtx"))) {
+        if (file.exists(file.path(x,"matrix.mtx"))) {
           warning(paste0("matrix.mtx already exists in ", x))
         } else {
           if(verbose) message(paste0(Sys.time()," Creating matrix.mtx in ", x))
-        fread(paste0(full.path[1],"/","count_matrix.mtx"), header = F) %>%
+          data.table::fread(file.path(x,"count_matrix.mtx"), header = F) %>%
           select(V2, V1, V3)  %>%
-          fwrite(paste0(full.path[1],"/","matrix.mtx"), col.names=F)
+            data.table::fwrite(file.path(x,"matrix.mtx"), col.names=F)
         }
       })
     }
@@ -1860,24 +1940,24 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE,
       inputs <- data.path %>%
         pathsToList(samples) %>% 
         sapply(\(sample) {
-            paste0(sample[2], "/", sample[1], "/DGE_unfiltered")})
+            file.path(sample[2],sample[1],"DGE_unfiltered")})
       
       #check that transformed files exists for parse
       lapply(inputs, function(x) {
         
         # 1. Check genes.tsv
-        if (!file.exists(paste0(x,"/","genes.tsv"))) {
-          error("genes.tsv missing in ", x, " run prepareCellbender first.")
+        if (!file.exists(file.path(x,"genes.tsv"))) {
+          stop("genes.tsv missing in ", x, " run prepareCellbender first.")
           
         }
         #2. Check barcodes.tsv
-        if (!file.exists(paste0(x,"/","barcodes.tsv"))) {
-          error("barcodes.tsv missing in ", x, " run prepareCellbender first.")
+        if (!file.exists(file.path(x,"barcodes.tsv"))) {
+          stop("barcodes.tsv missing in ", x, " run prepareCellbender first.")
         }
         
         #2. Check matrix.mtx
-        if (!file.exists(paste0(x,"/","matrix.mtx"))) {
-          error("matrix.mtx missing in ", x, " run prepareCellbender first.")
+        if (!file.exists(file.path(x,"matrix.mtx"))) {
+          stop("matrix.mtx missing in ", x, " run prepareCellbender first.")
         }
       })
       
@@ -1889,11 +1969,11 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE,
       pathsToList(samples) %>% 
     sapply(\(sample) {
         if (technology == "parse") {
-          paste0(sample[2], "/", sample[1], "/DGE_unfiltered/cellbender.h5")
+          file.path(sample[2], sample[1], "DGE_unfiltered/cellbender.h5")
         } else if (technology == "10xflex") {
-          paste0(sample[2], "/", sample[1], "/count/cellbender.h5")
+          file.path(sample[2],sample[1], "count/cellbender.h5")
         } else {
-          paste0(sample[2], "/", sample[1], "/outs/cellbender.h5")
+          file.path(sample[2],sample[1], "outs/cellbender.h5")
         }
       }) %>%
       setNames(samples)
@@ -1929,8 +2009,9 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE,
     
     out <- list("#! /bin/sh", script.list) %>% 
       unlist()
-    
-    cat(out, file = paste0(data.path,file), sep = "\n")
+    # if there are several data.path "file" is saved to the first element of data.paht vector
+    cat(out, file = file.path(data.path[1],file), sep = "\n")
+    message(paste0(file ," was saved into ", data.path[1]))
   },
   
   #' @description Extract the expected number of cells per sample based on the Cell Ranger summary metrics
@@ -2228,6 +2309,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE,
       bind_rows()
     
     ggplot(cell.prob, aes(cell, prob, col = prob)) + 
+      geom_point() +
       scale_color_gradient(low=low.col, high=high.col) +
       self$theme +
       labs(x = "Cells", y = "Cell probability", col = "") +
diff --git a/R/inner_functions.R b/R/inner_functions.R
index 5c8d008..ed1ce9a 100644
--- a/R/inner_functions.R
+++ b/R/inner_functions.R
@@ -396,12 +396,13 @@ addSummaryMetrics <- function(data.path,
   samples.tmp <- list.dirs(data.path, recursive = FALSE, full.names = FALSE)
   samples <- intersect(samples.tmp, metadata$sample)
   
-  # checking if there are duplicated sample names only among the samples that will be used/are in metadata
-  doubles <- table(samples) %>% 
+  # checking if there are duplicated sample name directories among the samples in metadata
+  doubles <- table(samples.tmp) %>% 
     .[. > 1] %>% 
     names()
+  overlap <- intersect(doubles, metadata$sample)
   
-  if (length(doubles) > 0) stop(paste0("One or more samples are present twice in 'data.path'. Sample names must be unique. Affected sample(s): ",paste(doubles, collapse = " ")))
+  if (length(overlap) > 0) stop(paste0("One or more samples are present twice in 'data.path' and are provided in 'metadata'. Sample names must be unique. Affected sample(s): ",paste(doubles, collapse = " ")))
   if (length(samples) != length(samples.tmp)) message("'metadata' doesn't contain the following sample(s) derived from 'data.path' (dropped): ",setdiff(samples.tmp, samples) %>% paste(collapse = " "))
   
   if (verbose) message(paste0(Sys.time()," Adding ",length(samples)," samples"))

From 8118ccbf2aefc93396d8129bc9a8ff775b5f9f29 Mon Sep 17 00:00:00 2001
From: Laura19993 <laura.wolbeck93@gmail.com>
Date: Mon, 26 Jan 2026 13:38:22 +0100
Subject: [PATCH 5/6] introduced depth.cutoff.upper, plotBarcodeRankPlot,
 reformulated some parameter descriptions

---
 .gitignore          |   1 +
 R/CRMetrics.R       | 336 ++++++++++++++++++++++++++++++++++++--------
 R/inner_functions.R |   4 +-
 3 files changed, 279 insertions(+), 62 deletions(-)

diff --git a/.gitignore b/.gitignore
index 119d079..58df564 100644
--- a/.gitignore
+++ b/.gitignore
@@ -16,6 +16,7 @@
 # produced vignettes
 vignettes/*.html
 vignettes/*.pdf
+vignettes/*.Rmd
 # OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3
 .httr-oauth
 # knitr and R markdown default cache directories
diff --git a/R/CRMetrics.R b/R/CRMetrics.R
index 2cb5cd4..c177469 100644
--- a/R/CRMetrics.R
+++ b/R/CRMetrics.R
@@ -24,7 +24,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE,
    #' @field metadata data.frame or character Path to metadata file or name of metadata data.frame object. Metadata must contain a column named 'sample' containing sample names that must match folder names in 'data.path' (default = NULL)
    metadata = NULL,
    
-   #' @field data.path character Path(s) to Cell Ranger count data, one directory per sample. If multiple paths, do c("path1","path2") (default = NULL)
+   #' @field data.path character Path(s) to Cell Ranger or Parse count data, one directory per sample. If multiple paths, do c("path1","path2") (default = NULL)
    data.path = NULL, 
    
    #' @field cms list List with count matrices (default = NULL)
@@ -33,10 +33,10 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE,
    #' @field cms.preprocessed list List with preprocessed count matrices after $doPreprocessing() (default = NULL)
    cms.preprocessed = NULL,
    
-   #' @field cms.raw list List with raw, unfiltered count matrices, i.e., including all CBs detected also empty droplets (default = NULL)
+   #' @field cms.raw list List with raw, unfiltered count matrices, i.e., including all cell barcodes detected, also empty ones (default = NULL)
    cms.raw = NULL,
    
-   #' @field summary.metrics data.frame Summary metrics from Cell Ranger (default = NULL)
+   #' @field summary.metrics data.frame Summary metrics from Cell Ranger or Parse pipeline (default = NULL)
    summary.metrics = NULL,
    
    #' @field detailed.metrics data.frame Detailed metrics, i.e., no. genes and UMIs per cell (default = NULL)
@@ -245,6 +245,108 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE,
     self$detailed.metrics <- addDetailedMetricsInner(cms = cms, verbose = verbose, n.cores = n.cores)
   },
   
+  
+  #' @description Read in raw count matrices and save UMI counts to object in "cellbender" slot, then create barcode rank plots for quality control
+  #' @param shrinkage integer Select every nth UMI count per cell for plotting. Improves plotting speed drastically. To plot all cells, set to 1 (default = 100)
+  #' @param show.expected.cells logical Plot line depicting called cells by Parse or Cell Ranger pipeline (default = TRUE)
+  #' @param cms.raw list Raw count matrices from HDF5 Cell Ranger or Parse outputs (default = self$cms.raw)
+  #' @param technology character applied single cell technology (default = c("10x", "10xflex", "10xmultiome", "parse")) 
+  #' @param umi.counts list UMI counts calculated as column sums of raw count matrices from HDF5 Cell Ranger or Parse outputs (default: stored list)
+  #' @param data.path character Path to Cell Ranger or Parse outputs (default = self$data.path)
+  #' @param samples character Sample names to include (default = self$metadata$sample)
+  #' @param verbose logical Show progress (default: stored vector)
+  #' @param n.cores integer Number of cores (default: stored vector)
+  #' @param unique.names logical Create unique cell names (default = FALSE)
+  #' @param sep character Separator for creating unique cell names (default = "!!")
+  #' @return ggplot2 object and bash script
+  #' @examples 
+  #' \dontrun{
+  #' crm <- CRMetrics$new(data.path = "/path/to/count/data")
+  #' crm$prepareCellbender()
+  #' }
+  plotBarcodeRankPlot = function(shrinkage = 100, 
+                               show.expected.cells = TRUE, 
+                               cms.raw = self$cms.raw, 
+                               technology = self$technology,
+                               umi.counts = self$cellbender$umi.counts, 
+                               data.path = self$data.path, 
+                               samples = self$metadata$sample, 
+                               verbose = self$verbose, 
+                               n.cores = self$n.cores, 
+                               unique.names = FALSE,
+                               sep = "!!") {
+    checkPackageInstalled("sparseMatrixStats", bioc = TRUE)
+    # Preparations
+    if (verbose) message(paste0(Sys.time()," Started run using ", if (n.cores < length(samples)) n.cores else length(samples)," cores"))
+    expected.cells <- self$getExpectedCells(samples)
+    
+    # Read raw CMs from HDF5 files
+    
+    if (!is.null(cms.raw)) {
+      if (verbose) message(paste0(Sys.time()," Using stored HDF5 Cell Ranger/Parse outputs. To overwrite, set $cms.raw <- NULL"))
+    } else {
+      
+      checkDataPath(data.path)
+      
+      if (technology == "parse") {
+        if (verbose) message(paste0(Sys.time()," Loading Parse outputs"))
+        cms.raw <- readParse(data.path, samples, "raw", n.cores = n.cores, verbose = verbose, unique.names = unique.names, sep = sep)
+      } else {
+        if (verbose) message(paste0(Sys.time()," Loading HDF5 Cell Ranger outputs"))
+        cms.raw <- read10xH5(data.path, samples, "raw", technology = self$technology, n.cores = n.cores, verbose = verbose, unique.names = unique.names, sep = sep)
+      }
+      self$cms.raw <- cms.raw
+    }
+    
+    # Get UMI counts
+    if (!is.null(umi.counts)) {
+      if (verbose) message(paste0(Sys.time()," Using stored UMI counts calculations. To overwrite, set $cellbender$umi.counts <- NULL"))
+    } else {
+      if (verbose) message(paste0(Sys.time()," Calculating UMI counts per sample"))
+      umi.counts <- cms.raw[samples] %>% 
+        plapply(\(cm) {
+          sparseMatrixStats::colSums2(cm) %>%
+            sort(decreasing = TRUE) %>% 
+            {data.frame(y = .)} %>% 
+            filter(y > 0) %>% 
+            mutate(., x = seq_len(nrow(.)))
+        }, n.cores = n.cores) %>% 
+        setNames(samples)
+      self$cellbender$umi.counts <- umi.counts
+    }
+    
+    # Create plot
+    if (verbose) message(paste0(Sys.time()," Plotting"))
+    data.df <- umi.counts[samples] %>% 
+      names() %>% 
+      lapply(\(sample) {
+        umi.counts[[sample]] %>% 
+          mutate(sample = sample) %>% 
+          .[seq(1, nrow(.), shrinkage),]
+      }) %>% 
+      bind_rows()
+    
+    line.df <- expected.cells %>% 
+      {data.frame(sample = names(.), exp = .)}
+    
+    g <- ggplot(data.df, aes(x, y)) + 
+      geom_line(color = "red") + 
+      scale_x_log10(labels = scales::comma) +
+      scale_y_log10(labels = scales::comma) +
+      self$theme +
+      labs(x = "Barcode ID ranked by count", y = "UMI count per barcode", col = "")
+    
+    if (show.expected.cells) g <- g + geom_vline(data = line.df, aes(xintercept = exp, col = "Expected cells"))
+    
+    g <- g + facet_wrap(~ sample, scales = "free")
+    
+    if (verbose) message(paste0(Sys.time()," Done!"))
+    
+    # return the plot object
+    return(g)
+  },
+  
+  
   #' @description Add comparison group for statistical testing.
   #' @param comp.group character Comparison metric (default = self$comp.group).
   #' @param metadata data.frame Metadata for samples (default = self$metadata).
@@ -595,7 +697,8 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE,
   #' @param depth logical Plot depth or not (default = FALSE).
   #' @param doublet.method character Doublet detection method (default = NULL).
   #' @param doublet.scores logical Plot doublet scores or not (default = FALSE).
-  #' @param depth.cutoff numeric Depth cutoff (default = 1e3).
+  #' @param depth.cutoff numeric Lower depth cutoff. Can be a single value or a named numeric vector specifying cutoffs per sample (default = NULL).
+  #' @param depth.cutoff.upper numeric Upper depth cutoff. Can be a single value or a named numeric vector specifying cutoffs per sample (default = NULL).
   #' @param mito.frac logical Plot mitochondrial fraction or not (default = FALSE).
   #' @param mito.cutoff numeric Mitochondrial fraction cutoff (default = 0.05).
   #' @param species character Species to calculate the mitochondrial fraction for (default = c("human","mouse")).
@@ -635,7 +738,8 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE,
   plotEmbedding = function(depth = FALSE, 
                            doublet.method = NULL, 
                            doublet.scores = FALSE, 
-                           depth.cutoff = 1e3, 
+                           depth.cutoff = NULL,
+                           depth.cutoff.upper = NULL,
                            mito.frac = FALSE, 
                            mito.cutoff = 0.05, 
                            species = c("human","mouse"), 
@@ -656,16 +760,85 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE,
     
     # Depth
     if (depth) {
-      depths <- self$getDepth() %>% 
-        filterVector("depth.cutoff", depth.cutoff, self$con$samples %>% names(), sep)
-      if (length(depth.cutoff) > 1) {
-        main <- "Cells with low depth with sample-specific cutoff"
+      
+      depth_vals <- self$getDepth()
+      cell_names <- names(depth_vals)
+      
+      # LOWER cutoff
+      if (!is.null(depth.cutoff)) {
+        
+        pass_lower <- filterVector(
+          num.vec = depth_vals,
+          name    = "depth.cutoff",
+          filter  = depth.cutoff,
+          samples = self$con$samples %>% names(),
+          sep     = sep
+        )
+        low_depth <- !pass_lower
+        
+      } else {
+        low_depth <- rep(FALSE, length(depth_vals))
+        names(low_depth) <- cell_names
+      }
+      
+      # UPPER cutoff
+      if (!is.null(depth.cutoff.upper)) {
+        
+        pass_upper <- filterVector(
+          num.vec = -depth_vals,
+          name    = "depth.cutoff.upper",
+          filter  = -depth.cutoff.upper,
+          samples = self$con$samples %>% names(),
+          sep     = sep
+        )
+        high_depth <- !pass_upper
+        
+      } else {
+        high_depth <- rep(FALSE, length(depth_vals)) # default is FALSE
+        names(high_depth) <- cell_names
+      }
+  
+      
+      # combine into 3-state vector
+      depth_flag <- integer(length(depth_vals))
+      names(depth_flag) <- cell_names
+      
+      depth_flag[low_depth]  <- 1
+      depth_flag[high_depth] <- 2
+      
+      # title
+      lower_sample_specific <- !is.null(depth.cutoff) &&
+        length(depth.cutoff) > 1
+      upper_sample_specific <- !is.null(depth.cutoff.upper) &&
+        length(depth.cutoff.upper) > 1
+      
+      if (!is.null(depth.cutoff.upper)) {
+        if (lower_sample_specific || upper_sample_specific) {
+          main <- "Cells with low (red) or high (blue) depth (sample-specific cutoffs)"
+        } else if (!is.null(depth.cutoff)) {
+          main <- paste0(
+            "Cells with depth < ", depth.cutoff , " (red) or > ", depth.cutoff.upper , " (blue)"
+          )
+        } else {
+          main <- paste0("Cells with depth > ", depth.cutoff.upper)
+        }
       } else {
-        main <- paste0("Cells with depth < ",depth.cutoff)
+        if (lower_sample_specific) {
+          main <- "Cells with low depth (sample-specific cutoff)"
+        } else {
+          main <- paste0("Cells with depth < ", depth.cutoff)
+        }
       }
-        g <- self$con$plotGraph(colors = (!depths) * 1, title = main, size = size, ...)
+      
+      g <- self$con$plotGraph(
+        groups  = depth_flag,
+        title   = main,
+        size    = size, mark.groups=F,
+        ...)+scale_color_manual(values = c("0"= "lightgrey", "1" = "red", "2" = "blue"))
     }
     
+
+    
     # Doublets
     if (!is.null(doublet.method)) {
       dres <- self$doublets[[doublet.method]]$result
@@ -698,7 +871,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE,
   },
   
   #' @description Plot per-sample density histograms of cell depths (=total UMI counts per cell). The plot highlights cells below or above a specified UMI cutoff with distinct colors, helping to assess data quality and filtering thresholds.
-  #' @param cutoff numeric  Depth cutoff used to distinguish filtered and retained cells. Can be a single value or a named numeric vector specifying cutoffs per sample (default = 1e3).
+  #' @param cutoff numeric Depth cutoff (lower). Can be a single value or a named numeric vector specifying cutoffs per sample (default = 1e3).
   #' @param samples character Sample names to include for plotting (default = $metadata$sample).
   #' @param sep character Separator for creating unique cell names (default = "!!")
   #' @param keep.col character Color for density of cells that are kept (default = "#E7CDC2")
@@ -804,7 +977,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE,
   },
 
   #' @description Plot per-sample histograms of total UMI counts per cell (= cell depth), allowing visualization of sequencing depth distributions across samples.
-  #' @param cutoff numeric Depth (UMI count) cutoff used to distinguish filtered and retained cells. Can be a single value or a named numeric vector specifying cutoffs per sample (Default = 1e3).
+  #' @param cutoff numeric Depth cutoff (lower) used to distinguish filtered and retained cells. Can be a single value or a named numeric vector specifying cutoffs per sample (Default = 1e3).
   #' @param samples character Vector of sample names to include in the plot (default = self$metadata$sample).
   #' @param sep  character Separator for creating unique cell names (default = "!!")
   #' @param binwidth numeric Width of the histogram bins (default = 10).
@@ -837,22 +1010,24 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE,
       pmin(3)
     
     # Plot
+    
+    
     depth.plot <- lapply(unique(tmp$sample), function(id) {
       tmp.plot <- tmp %>% filter(sample == id)
-        
+      
+      if (length(cutoff) == 1) {
+        plot.cutoff <- cutoff
+      } else {
+        plot.cutoff <- cutoff[names(cutoff) == id]
+      } 
+      
     g <- ggplot(tmp.plot, aes(x = depth)) +
           geom_histogram(binwidth = binwidth, boundary = 0, closed = "left") +
           labs(title = id, x = "UMI counts", y = "Number of cells") +
           self$theme +
-          xlim(0,xupper) + geom_vline(xintercept = cutoff, color="red", linetype = "dashed")
+          xlim(0,xupper) + geom_vline(xintercept = plot.cutoff, color="red", linetype = "dashed")
         # optional: + scale_x_log10()
         
-    if (length(cutoff) == 1) {
-      plot.cutoff <- cutoff
-    } else {
-      plot.cutoff <- cutoff[names(cutoff) == id]
-    }
-        
   return(g)
     }) %>% 
     plot_grid(plotlist = ., ncol = ncol.plot, label_size = 5)
@@ -910,7 +1085,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE,
       stop("No Conos object found, please run createEmbedding.")
     }
     
-    if (length(cutoff) > 1 & length(self$con$samples) != length(cutoff)) stop(paste0("'cutoff' has a length of ",length(cutoff),", but the conos object contains ",length(tmp)," samples. Please adjust."))
+    if (length(cutoff) > 1 & length(self$con$samples) != length(cutoff)) stop(paste0("'cutoff' has a length of ",length(cutoff),", but the conos object contains ",length(self$con$samples)," samples. Please adjust."))
     
     mf <- self$getMitoFraction(species = species)
     
@@ -924,7 +1099,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE,
       mutate(sample = sample %>% strsplit(sep, TRUE) %>% sapply(`[[`, 1)) %>%
       split(., .$sample) %>% 
       .[samples] %>% 
-      lapply(\(z) with(density(z$mito.frac, adjust = 1/10), data.frame(x,y))) %>% 
+      lapply(\(z) with(density(z$mito.frac, adjust = 1/10, from = 0), data.frame(x,y))) %>% 
       {lapply(names(.), \(x) data.frame(.[[x]], sample = x))} %>% 
       bind_rows()
     
@@ -1308,7 +1483,8 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE,
   },
   
   #' @description Filter cells based on depth, mitochondrial fraction and doublets from the count matrix.
-  #' @param depth.cutoff numeric Depth (transcripts per cell) cutoff (default = NULL).
+  #' @param depth.cutoff numeric Lower depth (total UMI counts per cell) cutoff (default = NULL).
+  #' @param depth.cutoff.upper numeric Upper depth (total UMI counts per cell) cutoff (default = NULL).
   #' @param mito.cutoff numeric Mitochondrial fraction cutoff (default = NULL).
   #' @param doublets character Doublet detection method to use (default = NULL).
   #' @param species character Species to calculate the mitochondrial fraction for (default = "human").
@@ -1348,6 +1524,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE,
   #' }
   #' }
   filterCms = function(depth.cutoff = NULL, 
+                       depth.cutoff.upper = NULL,
                        mito.cutoff = NULL, 
                        doublets = NULL,
                        species = c("human","mouse"),
@@ -1358,7 +1535,8 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE,
     
     if (verbose) {
       filters <- c()
-      if (!is.null(depth.cutoff)) filters %<>% c(paste0("depth.cutoff = ",depth.cutoff))
+      if (!is.null(depth.cutoff)) filters %<>% c(paste0("depth.cutoff.lower = ", depth.cutoff))
+      if (!is.null(depth.cutoff.upper)) filters %<>% c(paste0("depth.cutoff.upper = ", depth.cutoff.upper))
       if (!is.null(mito.cutoff)) filters %<>% c(paste0("mito.cutoff = ",mito.cutoff," and species = ",species))
       if (!is.null(doublets)) filters %<>% c(paste0("doublet method = ",doublets))
       
@@ -1388,9 +1566,24 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE,
       names()
     
     # Depth
-    if (!is.null(depth.cutoff)) {
-      depth.filter <- self$getDepth() %>% 
-        filterVector("depth.cutoff", depth.cutoff, samples, sep)
+    # Depth
+    if (!is.null(depth.cutoff) || !is.null(depth.cutoff.upper)) {
+      depth_vals <- self$getDepth()
+      depth.keep <- rep(TRUE, length(depth_vals))
+      names(depth.keep) <- names(depth_vals)
+      
+      # lower cutoff
+      if (!is.null(depth.cutoff)) {
+        keep_lower <- filterVector(depth_vals, "depth.cutoff", depth.cutoff, samples, sep)
+        depth.keep <- depth.keep & keep_lower
+      }
+      # upper cutoff 
+      if (!is.null(depth.cutoff.upper)) {
+        keep_upper <- !filterVector(depth_vals, "depth.cutoff.upper", depth.cutoff.upper, samples, sep)
+        depth.keep <- depth.keep & keep_upper
+      }
+      depth.filter <- depth.keep
+      
     } else {
       depth.filter <- NULL
     }
@@ -1483,8 +1676,9 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE,
   
   #' @description Plot filtered cells in an embedding, in a bar plot, on a tile or export the data frame
   #' @param type character The type of plot to use: embedding, bar, tile or export (default = c("embedding","bar","tile","export")).
-  #' @param depth logical Plot the depth or not (default = TRUE).
-  #' @param depth.cutoff numeric Depth cutoff, either a single number or a vector with cutoff per sample and with sampleIDs as names (default = 1e3).
+  #' @param depth logical Plot the depth (= total amount of UMI counts per cell) or not (default = TRUE).
+  #' @param depth.cutoff numeric Lower depth cutoff, either a single number or a vector with cutoff per sample and with sample IDs as names (default = 1e3).
+  #' @param depth.cutoff.upper numeric Upper depth cutoff, either a single number or a vector with cutoff per sample and with sample IDs as names (default = NULL).
   #' @param doublet.method character Method to detect doublets (default = NULL).
   #' @param mito.frac logical Plot the mitochondrial fraction or not (default = TRUE).
   #' @param mito.cutoff numeric Mitochondrial fraction cutoff, either a single number or a vector with cutoff per sample and with sampleIDs as names (default = 0.05).
@@ -1527,6 +1721,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE,
   plotFilteredCells = function(type = c("embedding","bar","tile","export"), 
                                depth = TRUE, 
                                depth.cutoff = 1e3, 
+                               depth.cutoff.upper = NULL,
                                doublet.method = NULL, 
                                mito.frac = TRUE, 
                                mito.cutoff = 0.05, 
@@ -1542,14 +1737,35 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE,
     if (mito.frac) species %<>% tolower() %>% match.arg(c("human","mouse"))
     
     # Prepare data
+    # depth
     if (depth) {
-      depths <- self$getDepth() %>% 
-        filterVector("depth.cutoff", depth.cutoff, depth.cutoff %>% names(), sep) %>% 
-        {ifelse(!., "depth", "")}
+      depth_vals <- self$getDepth()
+      cell_names <- names(depth_vals)
+      samples <- self$con$samples[cell_names]
+      
+      low_depth <- !filterVector(depth_vals, "depth.cutoff", depth.cutoff, self$con$samples %>% names(), sep)
+    
+      # upper cutoff
+      if (!is.null(depth.cutoff.upper)) {
+        
+        high_depth <- filterVector(depth_vals,"depth.cutoff.upper",depth.cutoff.upper,self$con$samples %>% names(),sep)
+      } else {
+        high_depth <- rep(FALSE, length(depth_vals))
+        names(high_depth) <- cell_names
+      }
+      
+      # encode as labels
+      depths <- rep("", length(depth_vals))
+      names(depths) <- cell_names
+      
+      depths[low_depth]  <- "low.depth"
+      depths[high_depth] <- "high.depth"
+      
     } else {
       depths <- NULL
     }
-    
+   
+
     if (mito.frac) {
       mf <- self$getMitoFraction(species = species) %>% 
         filterVector("mito.cutoff", mito.cutoff, mito.cutoff %>% names(), sep) %>% 
@@ -1752,12 +1968,12 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE,
     return(tmp)
   },
   
-  #' @description Create plots and script call for CellBender
+  #' @description Read in raw cms and save umi counts to object, then plot barcode rank plots and in case of Parse data generate input files for CellBender
   #' @param shrinkage integer Select every nth UMI count per cell for plotting. Improves plotting speed drastically. To plot all cells, set to 1 (default = 100)
   #' @param show.expected.cells logical Plot line depicting expected number of cells (default = TRUE)
-  #' @param show.total.droplets logical Plot line depicting total droplets included for CellBender run (default = TRUE)
-  #' @param expected.cells named numeric If NULL, expected cells will be deduced from the number of cells per sample identified by Cell Ranger. Otherwise, a named vector of expected cells with sample IDs as names. Sample IDs must match those in summary.metrics (default: stored named vector)
-  #' @param total.droplets named numeric If NULL, total droplets included will be deduced from expected cells multiplied by 3. Otherwise, a named vector of total droplets included with sample IDs as names. Sample IDs must match those in summary.metrics (default: stored named vector)
+  #' @param show.total.barcodes logical Plot line depicting total barcodes included for CellBender run (default = TRUE)
+  #' @param expected.cells named numeric If NULL, expected cells will be deduced from the number of cells per sample identified by respective pipeline. Otherwise, a named vector of expected cells with sample IDs as names. Sample IDs must match those in summary.metrics (default: stored named vector)
+  #' @param total.barcodes named numeric If NULL, total barcodes included will be deduced from expected cells multiplied by 3. Otherwise, a named vector of total barcodes included with sample IDs as names. Sample IDs must match those in summary.metrics (default: stored named vector)
   #' @param cms.raw list Raw count matrices from HDF5 Cell Ranger outputs (default = self$cms.raw)
   #' @param technology character applied single cell technology (default = c("10x", "10xflex", "10xmultiome", "parse")) 
   #' @param umi.counts list UMI counts calculated as column sums of raw count matrices from HDF5 Cell Ranger outputs (default: stored list)
@@ -1775,9 +1991,9 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE,
   #' }
   prepareCellbender = function(shrinkage = 100, 
                                show.expected.cells = TRUE, 
-                               show.total.droplets = TRUE, 
+                               show.total.barcodes = TRUE, 
                                expected.cells = NULL, 
-                               total.droplets = NULL, 
+                               total.barcodes = NULL, 
                                cms.raw = self$cms.raw, 
                                technology = self$technology,
                                umi.counts = self$cellbender$umi.counts, 
@@ -1791,7 +2007,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE,
     # Preparations
     if (verbose) message(paste0(Sys.time()," Started run using ", if (n.cores < length(samples)) n.cores else length(samples)," cores"))
     if (is.null(expected.cells)) expected.cells <- self$getExpectedCells(samples)
-    if (is.null(total.droplets)) total.droplets <- self$getTotalDroplets(samples)
+    if (is.null(total.barcodes)) total.barcodes <- self$getTotalBarcodes(samples)
     
     # Read CMs from HDF5 files
     
@@ -1842,17 +2058,17 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE,
     
     line.df <- expected.cells %>% 
       {data.frame(sample = names(.), exp = .)} %>% 
-      mutate(total = total.droplets %>% unname())
+      mutate(total = total.barcodes %>% unname())
     
     g <- ggplot(data.df, aes(x, y)) + 
       geom_line(color = "red") + 
       scale_x_log10(labels = scales::comma) +
       scale_y_log10(labels = scales::comma) +
       self$theme +
-      labs(x = "Droplet ID ranked by count", y = "UMI count per droplet", col = "")
+      labs(x = "Barcode ID ranked by count", y = "UMI count per barcode", col = "")
     
     if (show.expected.cells) g <- g + geom_vline(data = line.df, aes(xintercept = exp, col = "Expected cells"))
-    if (show.total.droplets) g <- g + geom_vline(data = line.df, aes(xintercept = total, col = "Total droplets included"))
+    if (show.total.barcodes) g <- g + geom_vline(data = line.df, aes(xintercept = total, col = "Total barcodes included"))
     
     g <- g + facet_wrap(~ sample, scales = "free")
     
@@ -1904,14 +2120,14 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE,
   },
   
   
-  
+  #' @description Create CellBender script
   #' @param file character File name for CellBender script. Will be stored in `data.path` (default: "cellbender_script.sh")
   #' @param technology character Applied single cell technology (default = self$technology).
   #' @param fpr numeric False positive rate for CellBender (default = 0.01)
   #' @param epochs integer Number of epochs for CellBender (default = 150)
   #' @param use.gpu logical Use CUDA capable GPU (default = TRUE)
   #' @param expected.cells logical or named numeric vector If FALSE, the --expected-cells argument is omitted in the generated script.If TRUE, expected cells will be deduced from the number of cells per sample identified by Cell Ranger/Parse. Otherwise, a named vector of expected cells with sample IDs as names. Sample IDs must match those in summary.metrics (default: FALSE) 
-  #' @param total.droplets logical or named numeric vector If FALSE, the --total.droplets argument is omitted in the generated script.If TRUE, total droplets included will be deduced from expected cells multiplied by 3. Otherwise, a named vector of total droplets included with sample IDs as names. Sample IDs must match those in summary.metrics (default: FALSE)
+  #' @param total.barcodes logical or named numeric vector If FALSE, the --total.barcodes argument is omitted in the generated script.If TRUE, total barcodes included will be deduced from expected cells multiplied by 3. Otherwise, a named vector of total barcodes included with sample IDs as names. Sample IDs must match those in summary.metrics (default: FALSE)
   #' @param data.path character Path to Cell Ranger outputs (default = self$data.path)
   #' @param samples character Sample names to include (default = self$metadata$sample)
   #' @param args character (optional) Additional parameters for CellBender
@@ -1928,7 +2144,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE,
                                   epochs = 150, 
                                   use.gpu = TRUE, 
                                   expected.cells = FALSE, 
-                                  total.droplets = FALSE, 
+                                  total.barcodes = FALSE, 
                                   data.path = self$data.path, 
                                   samples = self$metadata$sample, 
                                   args = NULL) {
@@ -1986,12 +2202,12 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE,
         stop("If expected.cells is a vector, it must be named with sample names.")
       }
     
-    if (isTRUE(total.droplets)) {
-      total.droplets <- self$getTotalDroplets(samples)
-    } else if (isFALSE(total.droplets)) {
-      total.droplets <- NULL
-    } else if (is.vector(total.droplets) && is.null(names(total.droplets))) {
-       stop("If total.droplets is a vector, it must be named with sample names.")
+    if (isTRUE(total.barcodes)) {
+      total.barcodes <- self$getTotalBarcodes(samples)
+    } else if (isFALSE(total.barcodes)) {
+      total.barcodes <- NULL
+    } else if (is.vector(total.barcodes) && is.null(names(total.barcodes))) {
+       stop("If total.barcodes is a vector, it must be named with sample names.")
     }
     
     # Create CellBender shell scripts
@@ -2001,7 +2217,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE,
                " --output ",outputs[sample],
                if (use.gpu) c(" --cuda ") else c(" "),
                if (!is.null(expected.cells)) paste0("--expected-cells ", expected.cells[sample], " ") else c(" "),
-               if (!is.null(total.droplets)) paste0("--total-droplets-included ", total.droplets[sample], " ") else c(" "),
+               if (!is.null(total.barcodes)) paste0("--total-barcodes-included ", total.barcodes[sample], " ") else c(" "),
                " --fpr ",fpr,
                " --epochs ",epochs,
                " ",if (!is.null(args)) paste(args, collapse = " "))
@@ -2047,7 +2263,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE,
     return(expected.cells)
   },
   
-  #' @description Get the total number of droplets included in the CellBender estimations. Based on the Cell Ranger summary metrics and multiplied by a preset multiplier.
+  #' @description Get the total number of barcodes included in the CellBender estimations. Based on the Cell Ranger summary metrics and multiplied by a preset multiplier.
   #' @param samples character Samples names to include (default = self$metadata$sample)
   #' @param multiplier numeric Number to multiply expected number of cells with (default = 3)
   #' @return A numeric vector
@@ -2067,15 +2283,15 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE,
   #' # Add summary
   #' crm$addSummaryFromCms()
   #' 
-  #' # Get no. droplets
-  #' crm$getTotalDroplets()
-  getTotalDroplets = function(samples = self$metadata$sample, 
+  #' # Get no. barcodes
+  #' crm$getTotalBarcodes()
+  getTotalBarcodes = function(samples = self$metadata$sample, 
                               multiplier = 3) {
     if (!is.numeric(multiplier)) stop("'multiplier' must be numeric.")
     expected.cells <- self$getExpectedCells(samples = samples)
-    total.droplets <- expected.cells * multiplier
+    total.barcodes <- expected.cells * multiplier
     
-    return(total.droplets)
+    return(total.barcodes)
   },
   
   #' @description Add a list of count matrices to the CRMetrics object.
diff --git a/R/inner_functions.R b/R/inner_functions.R
index ed1ce9a..041ecee 100644
--- a/R/inner_functions.R
+++ b/R/inner_functions.R
@@ -186,7 +186,7 @@ readFlex <- function(data.path,
   return(tmp)
 }
 
-#' @title Load filtered Parse count matrices
+#' @title Load Parse count matrices
 #' @description Load gene expression count data
 #' @param data.path Path to Parse count data.
 #' @param samples Vector of sample names (default = NULL)
@@ -461,7 +461,7 @@ addSummaryMetrics <- function(data.path,
                             header = TRUE, sep = ",", stringsAsFactors = FALSE) %>%
             select(Metric.Name, Metric.Value) %>% # Only select 'Metric Name' and 'Metric Value' columns
             rename(metric = Metric.Name, value = Metric.Value) %>%
-            slice(1:12) %>%
+            slice(1:15) %>%
             mutate(value = gsub(",", "", value),  # Remove commas
                    value = ifelse(grepl("%", value), as.numeric(gsub("%", "", value)) / 100, as.numeric(value)))  %>%  # Handle percentages
             mutate(sample = s[1]) %>% # Add sample column

From 30814f3b5c8fffdd470254bd3a12ed307055182f Mon Sep 17 00:00:00 2001
From: Laura19993 <laura.wolbeck93@gmail.com>
Date: Wed, 11 Mar 2026 15:26:44 +0100
Subject: [PATCH 6/6] changed prepareCellbender function to not plot
 Barcode-Rank-plots anymore and just prepares Parse data as input for
 CellBender

---
 .gitignore          |   1 +
 R/CRMetrics.R       | 247 +++++++++++---------------------------------
 R/inner_functions.R |   2 +-
 3 files changed, 65 insertions(+), 185 deletions(-)

diff --git a/.gitignore b/.gitignore
index 58df564..f1301d2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -14,6 +14,7 @@
 # RStudio files
 .Rproj.user/
 # produced vignettes
+vignettes
 vignettes/*.html
 vignettes/*.pdf
 vignettes/*.Rmd
diff --git a/R/CRMetrics.R b/R/CRMetrics.R
index c177469..966bbeb 100644
--- a/R/CRMetrics.R
+++ b/R/CRMetrics.R
@@ -24,13 +24,13 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE,
    #' @field metadata data.frame or character Path to metadata file or name of metadata data.frame object. Metadata must contain a column named 'sample' containing sample names that must match folder names in 'data.path' (default = NULL)
    metadata = NULL,
    
-   #' @field data.path character Path(s) to Cell Ranger or Parse count data, one directory per sample. If multiple paths, do c("path1","path2") (default = NULL)
+   #' @field data.path character Path(s) to Cell Ranger or Parse's split-pipe count data, one directory per sample. If multiple paths, do c("path1","path2") (default = NULL)
    data.path = NULL, 
    
-   #' @field cms list List with count matrices (default = NULL)
+   #' @field cms list List with count matrices from Cell Ranger or Parse's split-pipe (default = NULL)
    cms = NULL,
    
-   #' @field cms.preprocessed list List with preprocessed count matrices after $doPreprocessing() (default = NULL)
+   #' @field cms.preprocessed list List with pagoda2- or seurat-preprocessed count matrices after $doPreprocessing() (default = NULL)
    cms.preprocessed = NULL,
    
    #' @field cms.raw list List with raw, unfiltered count matrices, i.e., including all cell barcodes detected, also empty ones (default = NULL)
@@ -204,7 +204,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE,
     if (is.null(cms)) self$summary.metrics <- addSummaryMetrics(data.path = data.path, metadata = self$metadata, n.cores = self$n.cores, verbose = verbose)
   },
   
-  #' @description Function to read in detailed metrics. This is not done upon initialization for speed.
+  #' @description Function to read in detailed metrics (total UMI count and gene count per cell). This is not done upon object initialization for speed.
   #' @param cms list List of (sparse) count matrices (default = self$cms)
   #' @param min.transcripts.per.cell numeric Minimal number of transcripts per cell (default = 100)
   #' @param n.cores integer Number of cores for the calculations (default = self$n.cores).
@@ -268,7 +268,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE,
                                show.expected.cells = TRUE, 
                                cms.raw = self$cms.raw, 
                                technology = self$technology,
-                               umi.counts = self$cellbender$umi.counts, 
+                               umi.counts = self$umi.counts, 
                                data.path = self$data.path, 
                                samples = self$metadata$sample, 
                                verbose = self$verbose, 
@@ -300,7 +300,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE,
     
     # Get UMI counts
     if (!is.null(umi.counts)) {
-      if (verbose) message(paste0(Sys.time()," Using stored UMI counts calculations. To overwrite, set $cellbender$umi.counts <- NULL"))
+      if (verbose) message(paste0(Sys.time()," Using stored UMI counts calculations. To overwrite, set crm$umi.counts <- NULL"))
     } else {
       if (verbose) message(paste0(Sys.time()," Calculating UMI counts per sample"))
       umi.counts <- cms.raw[samples] %>% 
@@ -312,7 +312,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE,
             mutate(., x = seq_len(nrow(.)))
         }, n.cores = n.cores) %>% 
         setNames(samples)
-      self$cellbender$umi.counts <- umi.counts
+      self$umi.counts <- umi.counts
     }
     
     # Create plot
@@ -346,6 +346,36 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE,
     return(g)
   },
   
+  #' @description Extract the expected number of cells per sample based on the Cell Ranger or Parse's split-pipe summary metrics.
+  #' @param samples character Sample names to include (default = self$metadata$sample) 
+  #' @return A numeric vector
+  #' @examples
+  #' # Simulate data
+  #' testdata.cms <- lapply(seq_len(2), \(x) {
+  #' out <- Matrix::rsparsematrix(2e3, 1e3, 0.1)
+  #' out[out < 0] <- 1
+  #' dimnames(out) <- list(sapply(seq_len(2e3), \(x) paste0("gene",x)),
+  #' sapply(seq_len(1e3), \(x) paste0("cell",x)))
+  #' return(out)
+  #' })
+  #' 
+  #' # Initialize
+  #' crm <- CRMetrics$new(cms = testdata.cms, samples = c("sample1", "sample2"), n.cores = 1)
+  #' 
+  #' # Get summary
+  #' crm$addSummaryFromCms()
+  #' 
+  #' # Get no. cells
+  #' crm$getExpectedCells()
+  getExpectedCells = function(samples = self$metadata$sample) {
+    expected.cells <- self$summary.metrics %>% 
+      filter(metric == "estimated number of cells") %$% 
+      setNames(value, sample) %>%
+      .[samples]
+    
+    return(expected.cells)
+  },
+  
   
   #' @description Add comparison group for statistical testing.
   #' @param comp.group character Comparison metric (default = self$comp.group).
@@ -1968,116 +1998,35 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE,
     return(tmp)
   },
   
-  #' @description Read in raw cms and save umi counts to object, then plot barcode rank plots and in case of Parse data generate input files for CellBender
-  #' @param shrinkage integer Select every nth UMI count per cell for plotting. Improves plotting speed drastically. To plot all cells, set to 1 (default = 100)
-  #' @param show.expected.cells logical Plot line depicting expected number of cells (default = TRUE)
-  #' @param show.total.barcodes logical Plot line depicting total barcodes included for CellBender run (default = TRUE)
-  #' @param expected.cells named numeric If NULL, expected cells will be deduced from the number of cells per sample identified by respective pipeline. Otherwise, a named vector of expected cells with sample IDs as names. Sample IDs must match those in summary.metrics (default: stored named vector)
-  #' @param total.barcodes named numeric If NULL, total barcodes included will be deduced from expected cells multiplied by 3. Otherwise, a named vector of total barcodes included with sample IDs as names. Sample IDs must match those in summary.metrics (default: stored named vector)
-  #' @param cms.raw list Raw count matrices from HDF5 Cell Ranger outputs (default = self$cms.raw)
+  #' @description Generates input files for CellBender from Parse's split-pipe output.
   #' @param technology character applied single cell technology (default = c("10x", "10xflex", "10xmultiome", "parse")) 
-  #' @param umi.counts list UMI counts calculated as column sums of raw count matrices from HDF5 Cell Ranger outputs (default: stored list)
   #' @param data.path character Path to Cell Ranger outputs (default = self$data.path)
   #' @param samples character Sample names to include (default = self$metadata$sample)
   #' @param verbose logical Show progress (default: stored vector)
   #' @param n.cores integer Number of cores (default: stored vector)
   #' @param unique.names logical Create unique cell names (default = FALSE)
   #' @param sep character Separator for creating unique cell names (default = "!!")
-  #' @return ggplot2 object and bash script
+  #' @return Saves files to disc.
   #' @examples 
   #' \dontrun{
   #' crm <- CRMetrics$new(data.path = "/path/to/count/data")
   #' crm$prepareCellbender()
   #' }
-  prepareCellbender = function(shrinkage = 100, 
-                               show.expected.cells = TRUE, 
-                               show.total.barcodes = TRUE, 
-                               expected.cells = NULL, 
-                               total.barcodes = NULL, 
-                               cms.raw = self$cms.raw, 
-                               technology = self$technology,
-                               umi.counts = self$cellbender$umi.counts, 
+  prepareCellbender = function(technology = self$technology, 
                                data.path = self$data.path, 
                                samples = self$metadata$sample, 
                                verbose = self$verbose, 
                                n.cores = self$n.cores, 
                                unique.names = FALSE,
                                sep = "!!") {
-    checkPackageInstalled("sparseMatrixStats", bioc = TRUE)
-    # Preparations
-    if (verbose) message(paste0(Sys.time()," Started run using ", if (n.cores < length(samples)) n.cores else length(samples)," cores"))
-    if (is.null(expected.cells)) expected.cells <- self$getExpectedCells(samples)
-    if (is.null(total.barcodes)) total.barcodes <- self$getTotalBarcodes(samples)
     
-    # Read CMs from HDF5 files
-    
-    #add parse case, use readParse function
-    if (!is.null(cms.raw)) {
-      if (verbose) message(paste0(Sys.time()," Using stored HDF5 Cell Ranger/Parse outputs. To overwrite, set $cms.raw <- NULL"))
-    } else {
-      
-      checkDataPath(data.path)
-      
-      if (technology == "parse") {
-        if (verbose) message(paste0(Sys.time()," Loading Parse outputs"))
-        cms.raw <- readParse(data.path, samples, "raw", n.cores = n.cores, verbose = verbose, unique.names = unique.names, sep = sep)
-    } else {
-      if (verbose) message(paste0(Sys.time()," Loading HDF5 Cell Ranger outputs"))
-      cms.raw <- read10xH5(data.path, samples, "raw", technology = self$technology, n.cores = n.cores, verbose = verbose, unique.names = unique.names, sep = sep)
-     }
-    self$cms.raw <- cms.raw
-  }
-    
-    # Get UMI counts
-    if (!is.null(umi.counts)) {
-      if (verbose) message(paste0(Sys.time()," Using stored UMI counts calculations. To overwrite, set $cellbender$umi.counts <- NULL"))
-    } else {
-      if (verbose) message(paste0(Sys.time()," Calculating UMI counts per sample"))
-      umi.counts <- cms.raw[samples] %>% 
-        plapply(\(cm) {
-          sparseMatrixStats::colSums2(cm) %>%
-            sort(decreasing = TRUE) %>% 
-            {data.frame(y = .)} %>% 
-            filter(y > 0) %>% 
-            mutate(., x = seq_len(nrow(.)))
-        }, n.cores = n.cores) %>% 
-        setNames(samples)
-      self$cellbender$umi.counts <- umi.counts
+    if (technology != "parse") {
+      message(paste0(Sys.time()," This function only needs to be run for Parse Biosciences data. "))
     }
-    
-    # Create plot
-    if (verbose) message(paste0(Sys.time()," Plotting"))
-    data.df <- umi.counts[samples] %>% 
-      names() %>% 
-      lapply(\(sample) {
-        umi.counts[[sample]] %>% 
-          mutate(sample = sample) %>% 
-          .[seq(1, nrow(.), shrinkage),]
-      }) %>% 
-      bind_rows()
-    
-    line.df <- expected.cells %>% 
-      {data.frame(sample = names(.), exp = .)} %>% 
-      mutate(total = total.barcodes %>% unname())
-    
-    g <- ggplot(data.df, aes(x, y)) + 
-      geom_line(color = "red") + 
-      scale_x_log10(labels = scales::comma) +
-      scale_y_log10(labels = scales::comma) +
-      self$theme +
-      labs(x = "Barcode ID ranked by count", y = "UMI count per barcode", col = "")
-    
-    if (show.expected.cells) g <- g + geom_vline(data = line.df, aes(xintercept = exp, col = "Expected cells"))
-    if (show.total.barcodes) g <- g + geom_vline(data = line.df, aes(xintercept = total, col = "Total barcodes included"))
-    
-    g <- g + facet_wrap(~ sample, scales = "free")
-    
-    if (verbose) message(paste0(Sys.time()," Done!"))
-   
-  
+      
   # transform parse output to work as input for cellbender
     if (technology == "parse") {
-      if (verbose) message(paste0(Sys.time()," Transforming Parse outputs as preparation for cellbender"))
+      if (verbose) message(paste0(Sys.time()," Transforming Parse's split-pipe outputs as preparation for CellBender"))
       # check if transformed data already exists
       full.path <- data.path %>% 
         pathsToList(samples) %>% 
@@ -2115,8 +2064,6 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE,
         }
       })
     }
-    # return the plot object
-    return(g)
   },
   
   
@@ -2126,9 +2073,9 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE,
   #' @param fpr numeric False positive rate for CellBender (default = 0.01)
   #' @param epochs integer Number of epochs for CellBender (default = 150)
   #' @param use.gpu logical Use CUDA capable GPU (default = TRUE)
-  #' @param expected.cells logical or named numeric vector If FALSE, the --expected-cells argument is omitted in the generated script.If TRUE, expected cells will be deduced from the number of cells per sample identified by Cell Ranger/Parse. Otherwise, a named vector of expected cells with sample IDs as names. Sample IDs must match those in summary.metrics (default: FALSE) 
-  #' @param total.barcodes logical or named numeric vector If FALSE, the --total.barcodes argument is omitted in the generated script.If TRUE, total barcodes included will be deduced from expected cells multiplied by 3. Otherwise, a named vector of total barcodes included with sample IDs as names. Sample IDs must match those in summary.metrics (default: FALSE)
-  #' @param data.path character Path to Cell Ranger outputs (default = self$data.path)
+  #' @param expected.cells named numeric vector By default, the --expected-cells argument is omitted in the generated script. Otherwise, a named vector of expected cell numbers with sample IDs as names. Sample IDs must match those in summary.metrics (default = NULL).
+  #' @param total.droplets named numeric vector By default, the --total-droplets-included argument is omitted in the generated script. Otherwise, a named vector of total droplets included with sample IDs as names. Sample IDs must match those in summary.metrics (default = NULL).
+  #' @param data.path character Path to Cell Ranger or Parse's split-pipe outputs (default = self$data.path)
   #' @param samples character Sample names to include (default = self$metadata$sample)
   #' @param args character (optional) Additional parameters for CellBender
   #' @return bash script
@@ -2143,8 +2090,8 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE,
                                   fpr = 0.01, 
                                   epochs = 150, 
                                   use.gpu = TRUE, 
-                                  expected.cells = FALSE, 
-                                  total.barcodes = FALSE, 
+                                  expected.cells = NULL, 
+                                  total.droplets = NULL, 
                                   data.path = self$data.path, 
                                   samples = self$metadata$sample, 
                                   args = NULL) {
@@ -2152,11 +2099,11 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE,
     checkDataPath(data.path)
     
     if (technology == "parse") {
-      # in case of parse we use the genes, matrix, barcodes files stored in DGE_unfiltered and therefor we just need to give the path to DGE_unfiltered
+      # in case of parse we use the genes, matrix, barcodes files stored in DGE_unfiltered and therefore we just need to give the path to DGE_unfiltered
       inputs <- data.path %>%
         pathsToList(samples) %>% 
         sapply(\(sample) {
-            file.path(sample[2],sample[1],"DGE_unfiltered")})
+          paste0(sample[2],sample[1],"/DGE_unfiltered")})
       
       #check that transformed files exists for parse
       lapply(inputs, function(x) {
@@ -2185,29 +2132,23 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE,
       pathsToList(samples) %>% 
     sapply(\(sample) {
         if (technology == "parse") {
-          file.path(sample[2], sample[1], "DGE_unfiltered/cellbender.h5")
+          paste0(sample[2], sample[1],"/DGE_unfiltered/cellbender.h5")
         } else if (technology == "10xflex") {
-          file.path(sample[2],sample[1], "count/cellbender.h5")
+          paste0(sample[2],sample[1],"/count/cellbender.h5")
         } else {
-          file.path(sample[2],sample[1], "outs/cellbender.h5")
+          paste0(sample[2],sample[1],"/outs/cellbender.h5")
         }
       }) %>%
       setNames(samples)
     
-    if (isTRUE(expected.cells)) {
-      expected.cells <- self$getExpectedCells(samples)
-    } else if (isFALSE(expected.cells)) { 
-      expected.cells <- NULL
-    } else if (is.vector(expected.cells) && is.null(names(expected.cells))) {
+
+     if (is.vector(expected.cells) && is.null(names(expected.cells))) {
         stop("If expected.cells is a vector, it must be named with sample names.")
       }
     
-    if (isTRUE(total.barcodes)) {
-      total.barcodes <- self$getTotalBarcodes(samples)
-    } else if (isFALSE(total.barcodes)) {
-      total.barcodes <- NULL
-    } else if (is.vector(total.barcodes) && is.null(names(total.barcodes))) {
-       stop("If total.barcodes is a vector, it must be named with sample names.")
+
+     if (is.vector(total.droplets) && is.null(names(total.droplets))) {
+       stop("If total.droplets is a vector, it must be named with sample names.")
     }
     
     # Create CellBender shell scripts
@@ -2217,7 +2158,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE,
                " --output ",outputs[sample],
                if (use.gpu) c(" --cuda ") else c(" "),
                if (!is.null(expected.cells)) paste0("--expected-cells ", expected.cells[sample], " ") else c(" "),
-               if (!is.null(total.barcodes)) paste0("--total-barcodes-included ", total.barcodes[sample], " ") else c(" "),
+               if (!is.null(total.droplets)) paste0("--total-droplets-included ", total.droplets[sample], " ") else c(" "),
                " --fpr ",fpr,
                " --epochs ",epochs,
                " ",if (!is.null(args)) paste(args, collapse = " "))
@@ -2225,82 +2166,20 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE,
     
     out <- list("#! /bin/sh", script.list) %>% 
       unlist()
-    # if there are several data.path "file" is saved to the first element of data.paht vector
+    # if there are several data.path "file" is saved to the first element of data.path vector
     cat(out, file = file.path(data.path[1],file), sep = "\n")
     message(paste0(file ," was saved into ", data.path[1]))
   },
   
-  #' @description Extract the expected number of cells per sample based on the Cell Ranger summary metrics
-  #' @param samples character Sample names to include (default = self$metadata$sample) 
-  #' @return A numeric vector
-  #' @examples
-  #' # Simulate data
-  #' testdata.cms <- lapply(seq_len(2), \(x) {
-  #' out <- Matrix::rsparsematrix(2e3, 1e3, 0.1)
-  #' out[out < 0] <- 1
-  #' dimnames(out) <- list(sapply(seq_len(2e3), \(x) paste0("gene",x)),
-  #' sapply(seq_len(1e3), \(x) paste0("cell",x)))
-  #' return(out)
-  #' })
-  #' 
-  #' # Initialize
-  #' crm <- CRMetrics$new(cms = testdata.cms, samples = c("sample1", "sample2"), n.cores = 1)
-  #' 
-  #' # Get summary
-  #' crm$addSummaryFromCms()
-  #' 
-  #' # Get no. cells
-  #' crm$getExpectedCells()
-  #' 
-
-  #included parse and 10x flex nomenclature
-  getExpectedCells = function(samples = self$metadata$sample) {
-    expected.cells <- self$summary.metrics %>% 
-      filter(metric %in% c("estimated number of cells", "number_of_cells", "cells")) %$% 
-      setNames(value, sample) %>%
-      .[samples]
-    
-    return(expected.cells)
-  },
   
-  #' @description Get the total number of barcodes included in the CellBender estimations. Based on the Cell Ranger summary metrics and multiplied by a preset multiplier.
-  #' @param samples character Samples names to include (default = self$metadata$sample)
-  #' @param multiplier numeric Number to multiply expected number of cells with (default = 3)
-  #' @return A numeric vector
-  #' @examples
-  #' # Simulate data
-  #' testdata.cms <- lapply(seq_len(2), \(x) {
-  #' out <- Matrix::rsparsematrix(2e3, 1e3, 0.1)
-  #' out[out < 0] <- 1
-  #' dimnames(out) <- list(sapply(seq_len(2e3), \(x) paste0("gene",x)),
-  #' sapply(seq_len(1e3), \(x) paste0("cell",x)))
-  #' return(out)
-  #' })
-  #' 
-  #' # Initialize
-  #' crm <- CRMetrics$new(cms = testdata.cms, samples = c("sample1", "sample2"), n.cores = 1)
-  #' 
-  #' # Add summary
-  #' crm$addSummaryFromCms()
-  #' 
-  #' # Get no. barcodes
-  #' crm$getTotalBarcodes()
-  getTotalBarcodes = function(samples = self$metadata$sample, 
-                              multiplier = 3) {
-    if (!is.numeric(multiplier)) stop("'multiplier' must be numeric.")
-    expected.cells <- self$getExpectedCells(samples = samples)
-    total.barcodes <- expected.cells * multiplier
-    
-    return(total.barcodes)
-  },
   
   #' @description Add a list of count matrices to the CRMetrics object.
   #' @param cms list List of (sparse) count matrices (default = NULL)
-  #' @param data.path character Path to Cell Ranger or Parse count data (default = self$data.path).
+  #' @param data.path character Path to Cell Ranger or Parse's split-pipe count data (default = self$data.path).
   #' @param technology character Applied single cell technology (default = self$technology).
   #' @param samples character Vector of sample names. If NULL, samples are extracted from cms (default = self$metadata$sample)
   #' @param cellbender logical Add CellBender filtered count matrices in HDF5 format. Requires that "cellbender" is in the names of the files (default = FALSE)
-  #' @param raw logical Add raw count matrices from Cell Ranger or Parse output. Cannot be combined with `cellbender=TRUE`. (default = FALSE)
+  #' @param raw logical Add raw count matrices from Cell Ranger or Parse's split-pipe output. Cannot be combined with `cellbender=TRUE`. (default = FALSE)
   #' @param symbol character The type of gene IDs to use, SYMBOL (TRUE) or ENSEMBLE (default = TRUE)
   #' @param unique.names logical Make cell names unique based on `sep` parameter (default = TRUE)
   #' @param sep character Separator used to create unique cell names (default = "!!")
diff --git a/R/inner_functions.R b/R/inner_functions.R
index 041ecee..6691b3c 100644
--- a/R/inner_functions.R
+++ b/R/inner_functions.R
@@ -27,7 +27,7 @@ checkCompGroup <- function(comp.group,
 
 #' @title Check whether 'comp.group' is in metadata
 #' @description Checks whether 'comp.group' is any of the column names in metadata.
-#' @param comp.group Comparison metric.
+#' @param comp.group Comparison metric.?
 #' @param metadata Metadata for samples.
 #' @keywords internal
 #' @return nothing or stop