diff --git a/r/DESCRIPTION b/r/DESCRIPTION index 17faadb9..54090e64 100644 --- a/r/DESCRIPTION +++ b/r/DESCRIPTION @@ -30,6 +30,7 @@ Imports: Matrix, Rcpp, rlang, + tools, vctrs, lifecycle, stringr, @@ -50,5 +51,5 @@ Suggests: matrixStats, igraph Depends: - R (>= 3.5.0) + R (>= 4.0.0) Config/Needs/website: pkgdown, devtools, uwot, irlba, RcppHNSW, igraph, BiocManager, bioc::BSgenome.Hsapiens.UCSC.hg38, github::GreenleafLab/motifmatchr, github::GreenleafLab/chromVARmotifs diff --git a/r/NAMESPACE b/r/NAMESPACE index 982ca440..527f06ca 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -47,6 +47,8 @@ export(gene_region) export(gene_score_archr) export(gene_score_tiles_archr) export(gene_score_weights_archr) +export(get_demo_frags) +export(get_demo_mat) export(get_trackplot_height) export(import_matrix_market) export(import_matrix_market_10x) @@ -93,6 +95,7 @@ export(read_gencode_transcripts) export(read_gtf) export(read_ucsc_chrom_sizes) export(regress_out) +export(remove_demo_data) export(rotate_x_labels) export(rowMaxs) export(rowMaxs.IterableMatrix) diff --git a/r/NEWS.md b/r/NEWS.md index 6cab91a5..a3531348 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -10,6 +10,7 @@ Contributions welcome :) ## Features - Add `write_matrix_anndata_hdf5_dense()` which allows writing matrices in AnnData's dense format, most commonly used for `obsm` or `varm` matrices. (Thanks to @ycli1995 for pull request #166) +- Add `get_demo_mat()`, `get_demo_frags()` and `remove_demo_data()` to retrieve a small test matrix/fragments object from the [PBMC 3k dataset from 10X Genomics](https://www.10xgenomics.com/datasets/pbmc-from-a-healthy-donor-granulocytes-removed-through-cell-sorting-3-k-1-standard-2-0-0). (pull request #193) ## Bug-fixes - Fix error message printing when MACS crashes during `call_peaks_macs()` (pull request #175) diff --git a/r/R/data.R b/r/R/data.R index 9f66faa5..b43a3abb 100644 --- a/r/R/data.R +++ b/r/R/data.R @@ -6,6 +6,252 @@ # option. 
This file may not be copied, modified, or distributed
 # except according to those terms.
 
+#' Create a small demo matrix and fragment object.
+#'
+#' Downloads a
+#' [10x Genomics dataset](https://www.10xgenomics.com/datasets/pbmc-from-a-healthy-donor-granulocytes-removed-through-cell-sorting-3-k-1-standard-2-0-0),
+#' consisting of 3k cells, then performs optional QC filtering and subsetting. Stores the resulting objects on disk,
+#' and returns a list with both the matrix and fragments.
+#' @param directory (character) The directory where all the input/output data will be stored.
+#' Downloaded intermediates will be stored in subdir `intermediates`, which is deleted when the function exits.
+#' If `NULL`, the R session's temporary directory (`tempdir()`) is used.
+#' @param filter_qc (bool) Whether to filter both the RNA and ATAC data using QC information.
+#' @param subset (bool) Whether to subset to only genes/insertions on chromosome 4 and 11.
+#' @param timeout (numeric) Timeout in seconds for downloading the raw matrix and fragments files.
+#' @return (list) A list with the RNA matrix under the name `mat`, and the ATAC fragments under the name `frags`.
+#' @details
+#' This function downloads the 10x Genomics PBMC 3k dataset unless a previously written copy can be opened from `directory`.
+#' Filtering using QC information on the fragments and matrix keeps cells with more than 1000 RNA reads, more than 1000 fragments, and a TSS enrichment greater than 10.
+#' Subsetting provides only genes and insertions on chromosomes 4 and 11.
+#' The matrix and fragments folders are named `demo_mat` and `demo_frags` respectively.
+#' Additionally, choosing to qc filter appends a `_filtered`, and choosing to subset data appends a `_subsetted` to the name.
+#' @keywords internal
+prepare_demo_data <- function(directory = NULL, filter_qc = TRUE, subset = TRUE, timeout = 300) {
+  if (is.null(directory)) {
+    directory <- tempdir()
+  }
+  intermediate_dir <- file.path(directory, "intermediates")
+  dir.create(intermediate_dir, recursive = TRUE, showWarnings = FALSE)
+  on.exit(unlink(intermediate_dir, recursive = TRUE), add = TRUE)
+
+  mat_name <- "demo_mat"
+  frags_name <- "demo_frags"
+  # Download matrix/frags if not done previously, and open
+  url_base <- "https://cf.10xgenomics.com/samples/cell-arc/2.0.0/pbmc_granulocyte_sorted_3k/"
+  # Reuse the matrix written by an earlier call when it can be opened; if opening
+  # fails (missing or malformed directory), rebuild it from the raw 10x download.
+  # tryCatch() returns the handler's value, so no `<<-` assignment is needed.
+  mat <- tryCatch({
+    open_matrix_dir(file.path(directory, mat_name))
+  }, error = function(e) {
+    rna_raw_url <- paste0(url_base, "pbmc_granulocyte_sorted_3k_raw_feature_bc_matrix.h5")
+    ensure_downloaded(file.path(intermediate_dir, "pbmc_3k_10x.h5"), rna_raw_url, timeout = timeout)
+    open_matrix_10x_hdf5(file.path(intermediate_dir, "pbmc_3k_10x.h5"), feature_type = "Gene Expression") %>%
+      write_matrix_dir(file.path(directory, mat_name), overwrite = TRUE)
+  })
+  # Recreate frags if frags are missing or malformed
+  frags <- tryCatch({
+    open_fragments_dir(file.path(directory, frags_name))
+  }, error = function(e) {
+    atac_raw_url <- paste0(url_base, "pbmc_granulocyte_sorted_3k_atac_fragments.tsv.gz")
+    ensure_downloaded(file.path(intermediate_dir, "pbmc_3k_10x.fragments.tsv.gz"), atac_raw_url, timeout = timeout)
+    open_fragments_10x(file.path(intermediate_dir, "pbmc_3k_10x.fragments.tsv.gz")) %>%
+      write_fragments_dir(file.path(directory, frags_name), overwrite = TRUE)
+  })
+  if (filter_qc) {
+    # Download annotations for transcripts
+    transcripts <- read_gencode_transcripts(
+      intermediate_dir,
+      release = "42",
+      transcript_choice = "MANE_Select",
+      annotation_set = "basic",
+      features = "transcript"
+    )
+    blacklist <- read_encode_blacklist(intermediate_dir, genome = "hg38")
+    atac_qc <- qc_scATAC(frags, transcripts, blacklist)
+    # Keep only cells with more than 1000 reads on the RNA side,
+    # more than 1000 fragments, and a TSS enrichment greater than 10
+    pass_atac <- atac_qc %>%
+      dplyr::filter(nFrags > 1000, TSSEnrichment > 10) %>%
+      dplyr::pull(cellName)
+    pass_rna <- colnames(mat)[colSums(mat) > 1000]
+    filtered_cells <- intersect(pass_atac, pass_rna)
+    frags <- select_cells(frags, filtered_cells)
+    mat <- mat[, filtered_cells]
+  }
+  if (subset) {
+    # Subset to only genes/fragments that exist on chr4 and 11
+    genes_demo <- read_gencode_genes(
+      intermediate_dir,
+      release = "42",
+      annotation_set = "basic"
+    )
+    filtered_genes <- genes_demo[genes_demo$chr %in% c("chr4", "chr11"), ]$gene_id
+    # remove version numbers
+    filtered_genes <- gsub("\\..*", "", filtered_genes)
+    mat <- mat[which(rownames(mat) %in% filtered_genes), ]
+    frags <- frags %>% select_chromosomes(c("chr4", "chr11"))
+  }
+  # Rename mat and frags depending on state of filtering and subsetting
+  if (filter_qc) {
+    mat_name <- paste0(mat_name, "_filtered")
+    frags_name <- paste0(frags_name, "_filtered")
+  }
+  if (subset) {
+    mat_name <- paste0(mat_name, "_subsetted")
+    frags_name <- paste0(frags_name, "_subsetted")
+  }
+  # Write changes to directory
+  if (filter_qc || subset) {
+    mat <- write_matrix_dir(mat, file.path(directory, mat_name), overwrite = TRUE)
+    frags <- write_fragments_dir(frags, file.path(directory, frags_name), overwrite = TRUE)
+  }
+  return(list(mat = mat, frags = frags))
+}
+
+#' Retrieve BPCells demo data
+#'
+#' `r lifecycle::badge("experimental")` \cr Functions to download matrices and fragments derived from a
+#' [10X Genomics PBMC 3k dataset](https://www.10xgenomics.com/datasets/pbmc-from-a-healthy-donor-granulocytes-removed-through-cell-sorting-3-k-1-standard-2-0-0),
+#' with options to filter with common qc metrics, and to subset genes and fragments to only chromosome 4 and 11.
+#' @rdname demo_data
+#' @param filter_qc (bool) Whether to filter both the RNA and ATAC data using qc metrics (described in `details`).
+#' @param subset (bool) Whether to subset to only genes/insertions on chromosome 4 and 11.
+#' @return
+#' - `get_demo_mat()`: (IterableMatrix) A `(features x cells)` matrix.
+#' @details
+#' These data functions are experimental.
+#' The interface, as well as the demo dataset itself will likely undergo changes in the near future.
+#'
+#' **Data Processing**:
+#'
+#' The first time either `get_demo_mat()` or `get_demo_frags()` is run,
+#' demo data is downloaded and stored in the BPCells data directory (under `file.path(tools::R_user_dir("BPCells", which="data"), "demo_data")`).
+#'
+#' Subsequent calls to this function will use the previously downloaded matrix/fragments, given that the same combination of filtering and
+#' subsetting has been performed previously.
+#'
+#' The preparation of this matrix can be reproduced by running the internal function `prepare_demo_data()` with `directory` set to the BPCells data directory.
+#'
+#' In the case that demo data is not pre-downloaded and demo data download fails, `prepare_demo_data()` will act
+#' as a fallback.
+#'
+#' Both the matrix from `get_demo_mat()` and the fragments from `get_demo_frags()`
+#' may be removed by running `remove_demo_data()`.
+#'
+#' Filtering using QC information on the fragments and matrix object keeps cells with more than 1000 RNA reads, more than 1000 fragments, and a TSS enrichment greater than 10.
+#' Subsetting provides only genes and insertions on chromosomes 4 and 11.
+#'
+#' **Dimensions**:
+#' \tabular{lll}{
+#' \strong{Condition} \tab \strong{RNA matrix (features x cells)} \tab \strong{Fragments (chromosomes x cells)} \cr
+#' Raw \tab (36601 x 650165) \tab (39 x 462264) \cr
+#' Filter \tab (36601 x 2600) \tab (39 x 2600) \cr
+#' Subset \tab (3582 x 650165) \tab (2 x 462264) \cr
+#' Filter + Subset \tab (3582 x 2600) \tab (2 x 2600) \cr
+#' }
+#' **Data size**:
+#' \tabular{lll}{
+#' \strong{Condition} \tab \strong{RNA matrix (MB)} \tab \strong{Fragments (MB)} \cr
+#' Raw \tab 31.9 \tab 200 \cr
+#' Filter \tab 9.4 \tab 137 \cr
+#' Subset \tab 18.3 \tab 25.6 \cr
+#' Filter + Subset \tab 1.2 \tab 12.3 \cr
+#' }
+#'
+#' **Function Description**:
+#'
+#' - `get_demo_mat()`: Retrieve a demo `IterableMatrix` object representing the 10X Genomics PBMC 3k dataset.
+#' @examples
+#' #######################################################################
+#' ## get_demo_mat() example
+#' #######################################################################
+#' get_demo_mat()
+#'
+#'
+#' @export
+get_demo_mat <- function(filter_qc = TRUE, subset = TRUE) {
+  # Use the data directory for BPCells
+  data_dir <- file.path(tools::R_user_dir("BPCells", which = "data"), "demo_data")
+  dir.create(data_dir, recursive = TRUE, showWarnings = FALSE)
+  mat_name <- "demo_mat"
+  if (filter_qc) mat_name <- paste0(mat_name, "_filtered")
+  if (subset) mat_name <- paste0(mat_name, "_subsetted")
+  if (!dir.exists(file.path(data_dir, mat_name))) {
+    tarball <- file.path(data_dir, paste0(mat_name, ".tar.gz"))
+    # Always delete the archive on exit, including a partial one left by a failed download
+    on.exit(unlink(tarball), add = TRUE)
+    url <- paste0("https://pub-c4e56988ff67429e9856ffa33aecb0c1.r2.dev/", mat_name, ".tar.gz")
+    # download.file() reports failure either by throwing or by returning a non-zero status
+    status <- suppressWarnings(tryCatch(download.file(url, tarball), error = function(e) 1L))
+    if (!isTRUE(status == 0) || !file.exists(tarball)) {
+      prepare_demo_data(data_dir, filter_qc = filter_qc, subset = subset)
+    } else {
+      untar(tarball, exdir = data_dir)
+    }
+  }
+  return(open_matrix_dir(file.path(data_dir, mat_name)))
+}
+
+#' @rdname demo_data
+#' @return
+#' - `get_demo_frags()`: (IterableFragments) A Fragments object.
+#' @details
+#' - `get_demo_frags()`: Retrieve a demo `IterableFragments` object representing the 10X Genomics PBMC 3k dataset.
+#' @examples
+#' #######################################################################
+#' ## get_demo_frags() example
+#' #######################################################################
+#' get_demo_frags()
+#'
+#'
+#' @export
+get_demo_frags <- function(filter_qc = TRUE, subset = TRUE) {
+  data_dir <- file.path(tools::R_user_dir("BPCells", which = "data"), "demo_data")
+  dir.create(data_dir, recursive = TRUE, showWarnings = FALSE)
+  frags_name <- "demo_frags"
+  if (filter_qc) frags_name <- paste0(frags_name, "_filtered")
+  if (subset) frags_name <- paste0(frags_name, "_subsetted")
+  if (!dir.exists(file.path(data_dir, frags_name))) {
+    tarball <- file.path(data_dir, paste0(frags_name, ".tar.gz"))
+    on.exit(unlink(tarball), add = TRUE)
+    url <- paste0("https://pub-c4e56988ff67429e9856ffa33aecb0c1.r2.dev/", frags_name, ".tar.gz")
+    # Forward filter_qc/subset to the fallback so it builds the directory that is opened below
+    status <- suppressWarnings(tryCatch(download.file(url, tarball), error = function(e) 1L))
+    if (!isTRUE(status == 0) || !file.exists(tarball)) {
+      prepare_demo_data(data_dir, filter_qc = filter_qc, subset = subset)
+    } else {
+      untar(tarball, exdir = data_dir)
+    }
+  }
+  return(open_fragments_dir(file.path(data_dir, frags_name)))
+}
+
+#' @rdname demo_data
+#' @return
+#' - `remove_demo_data()`: `NULL`
+#' @details
+#' - `remove_demo_data()`: Remove the demo data from the BPCells data directory.
+#' @examples
+#' #######################################################################
+#' ## remove_demo_data() example
+#' #######################################################################
+#' remove_demo_data()
+#'
+#'
+#' ## Demo data folder is now empty
+#' data_dir <- file.path(tools::R_user_dir("BPCells", which = "data"), "demo_data")
+#' list.files(data_dir)
+#'
+#'
+#' @export
+remove_demo_data <- function() {
+  data_dir <- file.path(tools::R_user_dir("BPCells", which = "data"), "demo_data")
+  # unlink() does not treat a missing path as a failure, so no existence check is needed
+  unlink(data_dir, recursive = TRUE)
+  invisible(NULL)
+}
+
 #' Gene Symbol Mapping data
 #'
 #' Mapping of the canonical gene symbols corresponding to each
@@ -36,3 +282,5 @@
 #'
 #'
 "mouse_gene_mapping"
+
+
diff --git a/r/man/demo_data.Rd b/r/man/demo_data.Rd
new file mode 100644
index 00000000..8dc1e870
--- /dev/null
+++ b/r/man/demo_data.Rd
@@ -0,0 +1,115 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/data.R
+\name{get_demo_mat}
+\alias{get_demo_mat}
+\alias{get_demo_frags}
+\alias{remove_demo_data}
+\title{Retrieve BPCells demo data}
+\usage{
+get_demo_mat(filter_qc = TRUE, subset = TRUE)
+
+get_demo_frags(filter_qc = TRUE, subset = TRUE)
+
+remove_demo_data()
+}
+\arguments{
+\item{filter_qc}{(bool) Whether to filter both the RNA and ATAC data using qc metrics (described in \code{details}).}
+
+\item{subset}{(bool) Whether to subset to only genes/insertions on chromosome 4 and 11.}
+}
+\value{
+\itemize{
+\item \code{get_demo_mat()}: (IterableMatrix) A \verb{(features x cells)} matrix.
+}
+
+\itemize{
+\item \code{get_demo_frags()}: (IterableFragments) A Fragments object.
+}
+
+\itemize{
+\item \code{remove_demo_data()}: \code{NULL}
+}
+}
+\description{
+\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} \cr Functions to download matrices and fragments derived from a
+\href{https://www.10xgenomics.com/datasets/pbmc-from-a-healthy-donor-granulocytes-removed-through-cell-sorting-3-k-1-standard-2-0-0}{10X Genomics PBMC 3k dataset},
+with options to filter with common qc metrics, and to subset genes and fragments to only chromosome 4 and 11.
+}
+\details{
+These data functions are experimental.
+The interface, as well as the demo dataset itself will likely undergo changes in the near future.
+
+\strong{Data Processing}:
+
+The first time either \code{get_demo_mat()} or \code{get_demo_frags()} is run,
+demo data is downloaded and stored in the BPCells data directory (under \code{file.path(tools::R_user_dir("BPCells", which="data"), "demo_data")}).
+
+Subsequent calls to this function will use the previously downloaded matrix/fragments, given that the same combination of filtering and
+subsetting has been performed previously.
+
+The preparation of this matrix can be reproduced by running the internal function \code{prepare_demo_data()} with \code{directory} set to the BPCells data directory.
+
+In the case that demo data is not pre-downloaded and demo data download fails, \code{prepare_demo_data()} will act
+as a fallback.
+
+Both the matrix from \code{get_demo_mat()} and the fragments from \code{get_demo_frags()}
+may be removed by running \code{remove_demo_data()}.
+
+Filtering using QC information on the fragments and matrix object keeps cells with more than 1000 RNA reads, more than 1000 fragments, and a TSS enrichment greater than 10.
+Subsetting provides only genes and insertions on chromosomes 4 and 11.
+ +\strong{Dimensions}: +\tabular{lll}{ +\strong{Condition} \tab \strong{RNA matrix (features x cells)} \tab \strong{Fragments (chromosomes x cells)} \cr +Raw \tab (36601 x 650165) \tab (39 x 462264) \cr +Filter \tab (36601 x 2600) \tab (39 x 2600) \cr +Subset \tab (3582 x 650165) \tab (2 x 462264) \cr +Filter + Subset \tab (3582 x 2600) \tab (2 x 2600) \cr +} +\strong{Data size}: +\tabular{lll}{ +\strong{Condition} \tab \strong{RNA matrix (MB)} \tab \strong{Fragments (MB)} \cr +Raw \tab 31.9 \tab 200 \cr +Filter \tab 9.4 \tab 137 \cr +Subset \tab 18.3 \tab 25.6 \cr +Filter + Subset \tab 1.2 \tab 12.3 \cr +} + +\strong{Function Description}: +\itemize{ +\item \code{get_demo_mat()}: Retrieve a demo \code{IterableMatrix} object representing the 10X Genomics PBMC 3k dataset. +} + +\itemize{ +\item \code{get_demo_frags()}: Retrieve a demo \code{IterableFragments} object representing the 10X Genomics PBMC 3k dataset. +} + +\itemize{ +\item \code{remove_demo_data()}: Remove the demo data from the BPCells data directory. 
+} +} +\examples{ +####################################################################### +## get_demo_mat() example +####################################################################### +get_demo_mat() + + +####################################################################### +## get_demo_frags() example +####################################################################### +get_demo_frags() + + +####################################################################### +## remove_demo_data() example +####################################################################### +remove_demo_data() + + +## Demo data folder is now empty +data_dir <- file.path(tools::R_user_dir("BPCells", which = "data"), "demo_data") +list.files(data_dir) + + +} diff --git a/r/man/prepare_demo_data.Rd b/r/man/prepare_demo_data.Rd new file mode 100644 index 00000000..ea019c82 --- /dev/null +++ b/r/man/prepare_demo_data.Rd @@ -0,0 +1,41 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data.R +\name{prepare_demo_data} +\alias{prepare_demo_data} +\title{Create a small demo matrix and fragment object.} +\usage{ +prepare_demo_data( + directory = NULL, + filter_qc = TRUE, + subset = TRUE, + timeout = 300 +) +} +\arguments{ +\item{directory}{(character) The directory where all the input/output data will be stored. +Downloaded intermediates will be stored in subdir \code{intermediates}. +If \code{NULL}, a temporary directory is created.} + +\item{filter_qc}{(bool) Whether to filter both the RNA and ATAC data using QC information.} + +\item{subset}{(bool) Whether to subset to only genes/insertions on chromosome 4 and 11.} + +\item{timeout}{(numeric) Timeout for downloading files in seconds.} +} +\value{ +(list) A list with the RNA matrix under the name \code{mat}, and the ATAC fragments under the name \code{frags}. 
+}
+\description{
+Downloads a
+\href{https://www.10xgenomics.com/datasets/pbmc-from-a-healthy-donor-granulocytes-removed-through-cell-sorting-3-k-1-standard-2-0-0}{10x Genomics dataset},
+consisting of 3k cells then performs optional QC and subsetting. Holds subsetted objects in disk,
+and returns a list with both the matrix and fragments.
+}
+\details{
+This function downloads the 10x Genomics PBMC 3k dataset.
+Filtering using QC information on the fragments and matrix provides cells with at least 1000 reads, 1000 frags, and a minimum tss enrichment of 10.
+Subsetting provides only genes and insertions on chromosomes 4 and 11.
+The name of the matrix and fragments folders are \code{demo_mat} and \code{demo_frags} respectively.
+Additionally, choosing to qc filter appends a \verb{_filtered}, and choosing to subset data appends a \verb{_subsetted} to the name.
+}
+\keyword{internal}
diff --git a/r/pkgdown/_pkgdown.yml b/r/pkgdown/_pkgdown.yml
index 64d16f01..89166cc8 100644
--- a/r/pkgdown/_pkgdown.yml
+++ b/r/pkgdown/_pkgdown.yml
@@ -176,3 +176,7 @@ reference:
   - discrete_palette
   - collect_features
   - rotate_x_labels
+
+- title: "Data"
+- contents:
+  - get_demo_mat
diff --git a/r/tests/testthat/test-data.R b/r/tests/testthat/test-data.R
new file mode 100644
index 00000000..4ee151b1
--- /dev/null
+++ b/r/tests/testthat/test-data.R
@@ -0,0 +1,22 @@
+# Copyright 2025 BPCells contributors
+#
+# Licensed under the Apache License, Version 2.0 or the MIT license
+# , at your
+# option. This file may not be copied, modified, or distributed
+# except according to those terms.
+
+
+test_that("Getting test data works", {
+  mat <- get_demo_mat()
+  frags <- get_demo_frags()
+  expect_true(is(mat, "IterableMatrix"))
+  expect_true(is(frags, "IterableFragments"))
+  remove_demo_data()
+})
+
+test_that("Preparing demo data from scratch works", {
+  # Skipped by default; kept in its own test so no expectation sits unreachable after skip()
+  skip("Skip preparing demo data")
+  expect_no_error(BPCells:::prepare_demo_data(file.path(tools::R_user_dir("BPCells", which = "data"), "demo_data")))
+})