From 9824d304aab5135454523c5687cfb445463d2b0c Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Tue, 28 Jan 2025 13:15:23 -0800 Subject: [PATCH 01/15] [r] add test data --- r/R/utils.R | 72 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) diff --git a/r/R/utils.R b/r/R/utils.R index 4ea62d15..bb569b3c 100644 --- a/r/R/utils.R +++ b/r/R/utils.R @@ -56,4 +56,76 @@ log_progress <- function(msg, add_timestamp = TRUE){ } else { message(msg) } +} + +#' Prepare a test matrix and test fragments for BPCells. +#' +#' @param directory (character) Where the input/output data should be stored. If NULL, a temporary directory is created. +#' @param mat_name (character) Name of the RNA matrix file. If NULL, the matrix is named "test_mat." +#' @param frags_name (character) Name of the ATAC fragments file. If NULL, the fragments are named "test_frags". +#' @return (list) A list with the RNA matrix under the name "mat", and the ATAC fragments under the name "frags". +#' @details +#' This function downloads the 10x Genomics PBMC 3k dataset, and filters the fragments and matrix to cells with at least 1000 reads. +#' Following, both fragments and the matrix is subset to only genes and insertions on chromosomes 4 and 11. +#' The RNA matrix is 1 MB and the fragments are 12.5 MB, after BPCells compression. 
+#' @keywords internal +prepare_test_data <- function(directory = NULL, mat_name = NULL, frags_name = NULL) { + if (is.null(directory)) { + directory <- file.path(tempdir()) + dir.create(directory, recursive = TRUE, showWarnings = FALSE) + } + if (is.null(mat_name)) { + mat_name <- "test_rna" + } + if (is.null(frags_name)) { + frags_name <- "test_frags" + } + url_base <- "https://cf.10xgenomics.com/samples/cell-arc/2.0.0/pbmc_granulocyte_sorted_3k/" + rna_raw_url <- paste0(url_base, "pbmc_granulocyte_sorted_3k_raw_feature_bc_matrix.h5") + atac_raw_url <- paste0(url_base, "pbmc_granulocyte_sorted_3k_atac_fragments.tsv.gz") + options(timeout=300) + if (!file.exists(file.path(directory,"pbmc_3k_10x.h5"))) { + download.file(rna_raw_url, file.path(directory, "pbmc_3k_10x.h5"), mode="wb") + } + if (!file.exists(file.path(directory,"pbmc_3k_10x.fragments.tsv.gz"))) { + download.file(atac_raw_url, file.path(directory, "pbmc_3k_10x.fragments.tsv.gz"), mode="wb") + } + if (!file.exists(file.path(directory,"pbmc_3k_rna_raw"))) { + mat_raw <- open_matrix_10x_hdf5(file.path(directory, "pbmc_3k_10x.h5"), feature_type="Gene Expression") %>% + write_matrix_dir(file.path(directory,"pbmc_3k_rna_raw")) + } else { + mat_raw <- open_matrix_dir(file.path(directory,"pbmc_3k_rna_raw")) + } + # Check if we already ran import + if (!file.exists(file.path(directory,"pbmc_3k_frags"))) { + frags_raw <- open_fragments_10x(file.path(directory,"pbmc_3k_10x.fragments.tsv.gz")) %>% + write_fragments_dir(file.path(directory,"pbmc_3k_frags")) + } else { + frags_raw <- open_fragments_dir(file.path(directory,"pbmc_3k_frags")) + } + # for atac transcripts + transcripts <- read_gencode_transcripts( + file.path(directory,"references_transcripts"), + release="42", + transcript_choice="MANE_Select", + annotation_set = "basic", + features="transcript" + ) + # for RNA genes + genes_test <- read_gencode_genes( + file.path(directory,"./reference_genes"), + release = "42", + annotation_set = "basic", + ) + # 
Filter to only cells that have at least 1000 reads on the RNA side + # and only genes/fragments that exist on chr 4 and 11 + filtered_cells <- colnames(mat_raw)[reads_per_cell >= 1e3] + filtered_genes <- genes[genes$chr %in% c("chr4", "chr11"),]$gene_id + # remove version numbers + filtered_genes <- gsub("\\..*", "", filtered_genes) + mat <- mat_raw[which(rownames(mat_raw) %in% filtered_genes), pass_rna] + frags <- select_cells(frags_raw, pass_rna) %>% select_chromosomes(c("chr4", "chr11")) + mat <- write_matrix_dir(mat, file.path(directory, mat_name)) + frags <- write_fragments_dir(frags, file.path(directory, frags_name)) + return(list(mat = mat, frags = frags)) } \ No newline at end of file From f685ad882f2d4343e23a964606b3bda75734b6fb Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Mon, 10 Feb 2025 14:21:15 -0800 Subject: [PATCH 02/15] [r] update test data functions based on feedback --- r/DESCRIPTION | 3 +- r/NAMESPACE | 3 + r/NEWS.md | 1 + r/R/data.R | 156 +++++++++++++++++++++++++++++++++++ r/R/utils.R | 72 ---------------- r/man/demo_data.Rd | 53 ++++++++++++ r/man/prepare_demo_data.Rd | 38 +++++++++ r/pkgdown/_pkgdown.yml | 4 + r/tests/testthat/test-data.R | 16 ++++ 9 files changed, 273 insertions(+), 73 deletions(-) create mode 100644 r/man/demo_data.Rd create mode 100644 r/man/prepare_demo_data.Rd create mode 100644 r/tests/testthat/test-data.R diff --git a/r/DESCRIPTION b/r/DESCRIPTION index 17faadb9..54090e64 100644 --- a/r/DESCRIPTION +++ b/r/DESCRIPTION @@ -30,6 +30,7 @@ Imports: Matrix, Rcpp, rlang, + tools, vctrs, lifecycle, stringr, @@ -50,5 +51,5 @@ Suggests: matrixStats, igraph Depends: - R (>= 3.5.0) + R (>= 4.0.0) Config/Needs/website: pkgdown, devtools, uwot, irlba, RcppHNSW, igraph, BiocManager, bioc::BSgenome.Hsapiens.UCSC.hg38, github::GreenleafLab/motifmatchr, github::GreenleafLab/chromVARmotifs diff --git a/r/NAMESPACE b/r/NAMESPACE index 982ca440..527f06ca 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -47,6 +47,8 @@ export(gene_region) 
export(gene_score_archr) export(gene_score_tiles_archr) export(gene_score_weights_archr) +export(get_demo_frags) +export(get_demo_mat) export(get_trackplot_height) export(import_matrix_market) export(import_matrix_market_10x) @@ -93,6 +95,7 @@ export(read_gencode_transcripts) export(read_gtf) export(read_ucsc_chrom_sizes) export(regress_out) +export(remove_demo_data) export(rotate_x_labels) export(rowMaxs) export(rowMaxs.IterableMatrix) diff --git a/r/NEWS.md b/r/NEWS.md index 6cab91a5..abfceaa4 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -10,6 +10,7 @@ Contributions welcome :) ## Features - Add `write_matrix_anndata_hdf5_dense()` which allows writing matrices in AnnData's dense format, most commonly used for `obsm` or `varm` matrices. (Thanks to @ycli1995 for pull request #166) +- Add `get_demo_mat()`, `get_demo_frags()` and `remove_demo_data()` to retrieve a small test matrix subsetted from the PBMC 3k dataset from 10X Genomics. (pull request #193) ## Bug-fixes - Fix error message printing when MACS crashes during `call_peaks_macs()` (pull request #175) diff --git a/r/R/data.R b/r/R/data.R index 9f66faa5..7aa9bdef 100644 --- a/r/R/data.R +++ b/r/R/data.R @@ -6,6 +6,160 @@ # option. This file may not be copied, modified, or distributed # except according to those terms. +#' Prepare a demo matrix and demo fragments for BPCells. +#' +#' @param directory (character) Where the input/output data should be stored. If NULL, a temporary directory is created. +#' @param mat_name (character) Name of the RNA matrix file. If NULL, the matrix is named "demo_mat." +#' @param frags_name (character) Name of the ATAC fragments file. If NULL, the fragments are named "demo_frags". +#' @param timeout (numeric) Timeout for downloading files in seconds. +#' @param remove_input_data (logical) Whether to remove the downloaded non-processed matrix, frags, gencode transcripts, and gencode genes +#' after processing. 
+#' @return (list) A list with the RNA matrix under the name "mat", and the ATAC fragments under the name "frags".
+#' @details
+#' This function downloads the 10x Genomics PBMC 3k dataset, and filters the fragments and matrix to cells with at least 1000 reads.
+#' Following, both fragments and the matrix is subset to only genes and insertions on chromosomes 4 and 11.
+#' The RNA matrix is 1 MB and the fragments are 12.5 MB, after BPCells compression.
+#' @keywords internal
+prepare_demo_data <- function(directory = NULL, mat_name = NULL, frags_name = NULL, timeout = 300, remove_input_data = TRUE) {
+  if (is.null(directory)) {
+    directory <- file.path(tempdir())
+    dir.create(directory, recursive = TRUE, showWarnings = FALSE)
+  }
+  if (is.null(mat_name)) {
+    mat_name <- "demo_mat"
+  }
+  if (is.null(frags_name)) {
+    frags_name <- "demo_frags"
+  }
+  url_base <- "https://cf.10xgenomics.com/samples/cell-arc/2.0.0/pbmc_granulocyte_sorted_3k/"
+  rna_raw_url <- paste0(url_base, "pbmc_granulocyte_sorted_3k_raw_feature_bc_matrix.h5")
+  atac_raw_url <- paste0(url_base, "pbmc_granulocyte_sorted_3k_atac_fragments.tsv.gz")
+  ensure_downloaded(file.path(directory, "pbmc_3k_10x.h5"), rna_raw_url, timeout = timeout)
+  ensure_downloaded(file.path(directory, "pbmc_3k_10x.fragments.tsv.gz"), atac_raw_url, timeout = timeout)
+  if (!file.exists(file.path(directory,"pbmc_3k_rna_raw"))) {
+    mat_raw <- open_matrix_10x_hdf5(file.path(directory, "pbmc_3k_10x.h5"), feature_type="Gene Expression") %>%
+      write_matrix_dir(file.path(directory, "pbmc_3k_rna_raw"))
+  } else {
+    mat_raw <- open_matrix_dir(file.path(directory, "pbmc_3k_rna_raw"))
+  }
+  # Check if we already ran import
+  if (!file.exists(file.path(directory, "pbmc_3k_frags"))) {
+    frags_raw <- open_fragments_10x(file.path(directory, "pbmc_3k_10x.fragments.tsv.gz")) %>%
+      write_fragments_dir(file.path(directory, "pbmc_3k_frags"))
+  } else {
+    frags_raw <- open_fragments_dir(file.path(directory, "pbmc_3k_frags"))
+  }
+  # for atac transcripts
+  transcripts <- read_gencode_transcripts(
+    file.path(directory, "references_transcripts"),
+    release = "42",
+    transcript_choice = "MANE_Select",
+    annotation_set = "basic",
+    features = "transcript"
+  )
+  # for RNA genes
+  genes_demo <- read_gencode_genes(
+    file.path(directory, "./reference_genes"),
+    release = "42",
+    annotation_set = "basic",
+  )
+  # Filter to only cells that have at least 1000 reads on the RNA side
+  # and only genes/fragments that exist on chr 4 and 11
+  reads_per_cell <- colSums(mat_raw)
+  filtered_cells <- colnames(mat_raw)[reads_per_cell >= 1e3]
+  filtered_genes <- genes_demo[genes_demo$chr %in% c("chr4", "chr11"),]$gene_id
+  # remove version numbers
+  filtered_genes <- gsub("\\..*", "", filtered_genes)
+  mat <- mat_raw[which(rownames(mat_raw) %in% filtered_genes), filtered_cells]
+  frags <- select_cells(frags_raw, filtered_cells) %>% select_chromosomes(c("chr4", "chr11"))
+  mat <- write_matrix_dir(mat, file.path(directory, mat_name), overwrite = TRUE)
+  frags <- write_fragments_dir(frags, file.path(directory, frags_name), overwrite = TRUE)
+  if (remove_input_data) {
+    unlink(file.path(directory, "pbmc_3k_10x.h5"))
+    unlink(file.path(directory, "pbmc_3k_10x.fragments.tsv.gz"))
+    unlink(file.path(directory, "pbmc_3k_rna_raw"), recursive = TRUE) # directory: unlink() skips it unless recursive = TRUE
+    unlink(file.path(directory, "pbmc_3k_frags"), recursive = TRUE) # directory: unlink() skips it unless recursive = TRUE
+  }
+  return(list(mat = mat, frags = frags))
+}
+
+#' Retrieve BPCells demo data
+#'
+#' The demo dataset is a subset of the 10x Genomics PBMC 3k dataset, and filters both the matrix and the fragments
+#' to cells with at least 1000 reads. Both the matrix and the fragments are subset to only genes on chromosomes 4 and 11.
+#' @rdname demo_data
+#' @return
+#' - `get_demo_mat()`: (IterableMatrix) A `(features x cells)` matrix of shape `(1984 x 2724)`.
+#' @details
+#' The first time either `get_demo_mat()` are ran `get_demo_frags()`, the demo data is downloaded and stored in the BPCells data directory
+#' (under `file.path(tools::R_user_dir("BPcells", which="data"), "demo_data")`). Subsequent calls to this function will use the previously downloaded matrix.
+#' The preperation of this matrix can be reproduced by running the internal function `prepare_demo_data()`.
+#'
+#' In the case that demo data is not pre-downloaded and demo data download fails, `prepare_demo_data()` will be run,
+#' which manually builds the demo dataset from the 10x Genomics PBMC 3k dataset.
+#'
+#' Both the matrix from `get_demo_mat()` and the fragments from `get_demo_frags()`
+#' may be removed by running `remove_demo_data()`.
+#'
+#' - `get_demo_mat()`: Retrieve a 1 MB demo matrix, representing a subset of the 10X Genomics PBMC 3k dataset.
+#' @export
+get_demo_mat <- function() {
+  # Use the data directory for BPCells
+  data_dir <- file.path(tools::R_user_dir("BPCells", which = "data"), "demo_data")
+  if (!dir.exists(data_dir)) {
+    dir.create(data_dir, recursive = TRUE)
+  }
+  if (!dir.exists(file.path(data_dir, "demo_mat"))) {
+    url <- "https://pub-c4e56988ff67429e9856ffa33aecb0c1.r2.dev/demo_mat.tar.gz"
+    status <- tryCatch(download.file(url, file.path(data_dir, "demo_mat.tar.gz"), mode = "wb"), error = function(e) 1L)
+    # Check if file download failed: download.file() may throw or return non-zero, so test both before falling back
+    if (status != 0L || !file.exists(file.path(data_dir, "demo_mat.tar.gz"))) {
+      prepare_demo_data(data_dir)
+    } else {
+      untar(file.path(data_dir, "demo_mat.tar.gz"), exdir=data_dir)
+      file.remove(file.path(data_dir, "demo_mat.tar.gz"))
+    }
+  }
+  return(open_matrix_dir(file.path(data_dir, "demo_mat")))
+}
+
+#' @rdname demo_data
+#' @return
+#' - `get_demo_frags()`: (IterableFragments) A Fragments object with 2724 cells and fragments on chromosomes 4 and 11.
+#' @details
+#' - `get_demo_frags()`: Retrieve a 12.5 MB demo fragments object, representing a subset of the 10X Genomics PBMC 3k dataset.
+#' @export
+get_demo_frags <- function() {
+  data_dir <- file.path(tools::R_user_dir("BPCells", which = "data"), "demo_data")
+  if (!dir.exists(data_dir)) {
+    dir.create(data_dir, recursive = TRUE)
+  }
+  if (!dir.exists(file.path(data_dir, "demo_frags"))) {
+    url <- "https://pub-c4e56988ff67429e9856ffa33aecb0c1.r2.dev/demo_frags.tar.gz"
+    status <- tryCatch(download.file(url, file.path(data_dir, "demo_frags.tar.gz"), mode = "wb"), error = function(e) 1L)
+    if (status != 0L || !file.exists(file.path(data_dir, "demo_frags.tar.gz"))) { # fall back when the download errored or returned non-zero
+      prepare_demo_data(data_dir)
+    } else {
+      untar(file.path(data_dir, "demo_frags.tar.gz"), exdir = data_dir)
+      file.remove(file.path(data_dir, "demo_frags.tar.gz"))
+    }
+  }
+  return(open_fragments_dir(file.path(data_dir, "demo_frags")))
+}
+
+#' @rdname demo_data
+#' @return
+#' - `remove_demo_data()`: NULL
+#' @details
+#' - `remove_demo_data()`: Remove the demo data from the BPCells data directory.
+#' @export
+remove_demo_data <- function() {
+  data_dir <- file.path(tools::R_user_dir("BPCells", which = "data"), "demo_data")
+  if (dir.exists(data_dir)) {
+    unlink(data_dir, recursive = TRUE)
+  }
+}
+
 #' Gene Symbol Mapping data
 #'
 #' Mapping of the canonical gene symbols corresponding to each
@@ -36,3 +190,5 @@
 #'
 #'
 "mouse_gene_mapping"
+
+
diff --git a/r/R/utils.R b/r/R/utils.R
index bb569b3c..4ea62d15 100644
--- a/r/R/utils.R
+++ b/r/R/utils.R
@@ -56,76 +56,4 @@ log_progress <- function(msg, add_timestamp = TRUE){
   } else {
     message(msg)
   }
-}
-
-#' Prepare a test matrix and test fragments for BPCells.
-#'
-#' @param directory (character) Where the input/output data should be stored. If NULL, a temporary directory is created.
-#' @param mat_name (character) Name of the RNA matrix file. If NULL, the matrix is named "test_mat."
-#' @param frags_name (character) Name of the ATAC fragments file. If NULL, the fragments are named "test_frags".
-#' @return (list) A list with the RNA matrix under the name "mat", and the ATAC fragments under the name "frags".
-#' @details -#' This function downloads the 10x Genomics PBMC 3k dataset, and filters the fragments and matrix to cells with at least 1000 reads. -#' Following, both fragments and the matrix is subset to only genes and insertions on chromosomes 4 and 11. -#' The RNA matrix is 1 MB and the fragments are 12.5 MB, after BPCells compression. -#' @keywords internal -prepare_test_data <- function(directory = NULL, mat_name = NULL, frags_name = NULL) { - if (is.null(directory)) { - directory <- file.path(tempdir()) - dir.create(directory, recursive = TRUE, showWarnings = FALSE) - } - if (is.null(mat_name)) { - mat_name <- "test_rna" - } - if (is.null(frags_name)) { - frags_name <- "test_frags" - } - url_base <- "https://cf.10xgenomics.com/samples/cell-arc/2.0.0/pbmc_granulocyte_sorted_3k/" - rna_raw_url <- paste0(url_base, "pbmc_granulocyte_sorted_3k_raw_feature_bc_matrix.h5") - atac_raw_url <- paste0(url_base, "pbmc_granulocyte_sorted_3k_atac_fragments.tsv.gz") - options(timeout=300) - if (!file.exists(file.path(directory,"pbmc_3k_10x.h5"))) { - download.file(rna_raw_url, file.path(directory, "pbmc_3k_10x.h5"), mode="wb") - } - if (!file.exists(file.path(directory,"pbmc_3k_10x.fragments.tsv.gz"))) { - download.file(atac_raw_url, file.path(directory, "pbmc_3k_10x.fragments.tsv.gz"), mode="wb") - } - if (!file.exists(file.path(directory,"pbmc_3k_rna_raw"))) { - mat_raw <- open_matrix_10x_hdf5(file.path(directory, "pbmc_3k_10x.h5"), feature_type="Gene Expression") %>% - write_matrix_dir(file.path(directory,"pbmc_3k_rna_raw")) - } else { - mat_raw <- open_matrix_dir(file.path(directory,"pbmc_3k_rna_raw")) - } - # Check if we already ran import - if (!file.exists(file.path(directory,"pbmc_3k_frags"))) { - frags_raw <- open_fragments_10x(file.path(directory,"pbmc_3k_10x.fragments.tsv.gz")) %>% - write_fragments_dir(file.path(directory,"pbmc_3k_frags")) - } else { - frags_raw <- open_fragments_dir(file.path(directory,"pbmc_3k_frags")) - } - # for atac transcripts - transcripts 
<- read_gencode_transcripts( - file.path(directory,"references_transcripts"), - release="42", - transcript_choice="MANE_Select", - annotation_set = "basic", - features="transcript" - ) - # for RNA genes - genes_test <- read_gencode_genes( - file.path(directory,"./reference_genes"), - release = "42", - annotation_set = "basic", - ) - # Filter to only cells that have at least 1000 reads on the RNA side - # and only genes/fragments that exist on chr 4 and 11 - filtered_cells <- colnames(mat_raw)[reads_per_cell >= 1e3] - filtered_genes <- genes[genes$chr %in% c("chr4", "chr11"),]$gene_id - # remove version numbers - filtered_genes <- gsub("\\..*", "", filtered_genes) - mat <- mat_raw[which(rownames(mat_raw) %in% filtered_genes), pass_rna] - frags <- select_cells(frags_raw, pass_rna) %>% select_chromosomes(c("chr4", "chr11")) - mat <- write_matrix_dir(mat, file.path(directory, mat_name)) - frags <- write_fragments_dir(frags, file.path(directory, frags_name)) - return(list(mat = mat, frags = frags)) } \ No newline at end of file diff --git a/r/man/demo_data.Rd b/r/man/demo_data.Rd new file mode 100644 index 00000000..89d149c2 --- /dev/null +++ b/r/man/demo_data.Rd @@ -0,0 +1,53 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data.R +\name{get_demo_mat} +\alias{get_demo_mat} +\alias{get_demo_frags} +\alias{remove_demo_data} +\title{Retrieve BPCells demo data} +\usage{ +get_demo_mat() + +get_demo_frags() + +remove_demo_data() +} +\value{ +\itemize{ +\item \code{get_demo_mat()}: (IterableMatrix) A \verb{(features x cells)} matrix of shape \verb{(1984 x 2724)}. +} + +\itemize{ +\item \code{get_demo_frags()}: (IterableFragments) A Fragments object with 2724 cells and fragments on chromosomes 4 and 11. +} + +\itemize{ +\item \code{remove_demo_data()}: NULL +} +} +\description{ +The demo dataset is a subset of the 10x Genomics PBMC 3k dataset, and filters both the matrix and the fragments +to cells with at least 1000 reads. 
Both the matrix and the fragments are subset to only genes on chromosomes 4 and 11. +} +\details{ +The first time either \code{get_demo_mat()} are ran \code{get_demo_frags()}, the demo data is downloaded and stored in the BPCells data directory +(under \code{file.path(tools::R_user_dir("BPcells", which="data"), "demo_data")}). Subsequent calls to this function will use the previously downloaded matrix. +The preperation of this matrix can be reproduced by running the internal function \code{prepare_demo_data()}. + +In the case that demo data is not pre-downloaded and demo data download fails, \code{prepare_demo_data()} will be run, +which manually builds the demo dataset from the 10x Genomics PBMC 3k dataset. + +Both the matrix from \code{get_demo_mat()} and the fragments from \code{get_demo_frags()} +may be removed by running \code{remove_demo_data()}. +\itemize{ +\item \code{get_demo_mat()}: Retrieve a 1 MB demo matrix, representing a subset of the 10X Genomics PBMC 3k dataset. +} + +\itemize{ +\item \code{get_demo_frags()}: Retrieve a 12.5 MB demo fragments object, representing a subset of the 10X Genomics PBMC 3k dataset. +} + +\itemize{ +\item \code{remove_demo_data()}: Remove the demo data from the BPCells data directory. +} +} diff --git a/r/man/prepare_demo_data.Rd b/r/man/prepare_demo_data.Rd new file mode 100644 index 00000000..7b8f14a9 --- /dev/null +++ b/r/man/prepare_demo_data.Rd @@ -0,0 +1,38 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data.R +\name{prepare_demo_data} +\alias{prepare_demo_data} +\title{Prepare a demo matrix and demo fragments for BPCells.} +\usage{ +prepare_demo_data( + directory = NULL, + mat_name = NULL, + frags_name = NULL, + timeout = 300, + remove_input_data = TRUE +) +} +\arguments{ +\item{directory}{(character) Where the input/output data should be stored. If NULL, a temporary directory is created.} + +\item{mat_name}{(character) Name of the RNA matrix file. 
If NULL, the matrix is named "demo_mat."} + +\item{frags_name}{(character) Name of the ATAC fragments file. If NULL, the fragments are named "demo_frags".} + +\item{timeout}{(numeric) Timeout for downloading files in seconds.} + +\item{remove_input_data}{(logical) Whether to remove the downloaded non-procesed matrix, frags, gencode transcripts, and gencode genes +after processing.} +} +\value{ +(list) A list with the RNA matrix under the name "mat", and the ATAC fragments under the name "frags". +} +\description{ +Prepare a demo matrix and demo fragments for BPCells. +} +\details{ +This function downloads the 10x Genomics PBMC 3k dataset, and filters the fragments and matrix to cells with at least 1000 reads. +Following, both fragments and the matrix is subset to only genes and insertions on chromosomes 4 and 11. +The RNA matrix is 1 MB and the fragments are 12.5 MB, after BPCells compression. +} +\keyword{internal} diff --git a/r/pkgdown/_pkgdown.yml b/r/pkgdown/_pkgdown.yml index 64d16f01..89166cc8 100644 --- a/r/pkgdown/_pkgdown.yml +++ b/r/pkgdown/_pkgdown.yml @@ -176,3 +176,7 @@ reference: - discrete_palette - collect_features - rotate_x_labels + +- title: "Data" +- contents: + - get_demo_mat diff --git a/r/tests/testthat/test-data.R b/r/tests/testthat/test-data.R new file mode 100644 index 00000000..4583a0c9 --- /dev/null +++ b/r/tests/testthat/test-data.R @@ -0,0 +1,16 @@ +# Copyright 2025 BPCells contributors +# +# Licensed under the Apache License, Version 2.0 or the MIT license +# , at your +# option. This file may not be copied, modified, or distributed +# except according to those terms. 
+ + +test_that("Getting test data works", { + mat <- get_demo_mat() + frags <- get_demo_frags() + expect_true(is(mat, "IterableMatrix")) + expect_true(is(frags, "IterableFragments")) + remove_demo_data() +}) \ No newline at end of file From e1ec1cffba673af40219d903da20f400dbe413af Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Wed, 26 Feb 2025 17:21:11 -0800 Subject: [PATCH 03/15] [r] rewrite some demo data documentation --- r/R/data.R | 17 ++++++++++------- r/man/demo_data.Rd | 11 ++++++----- r/man/prepare_demo_data.Rd | 5 +++-- 3 files changed, 19 insertions(+), 14 deletions(-) diff --git a/r/R/data.R b/r/R/data.R index 7aa9bdef..2865c307 100644 --- a/r/R/data.R +++ b/r/R/data.R @@ -6,8 +6,10 @@ # option. This file may not be copied, modified, or distributed # except according to those terms. -#' Prepare a demo matrix and demo fragments for BPCells. -#' +#' Create a small demo matrix and fragment object +#' +#' Downloads a 10x Genomics dataset, then performs QC and subsetting. Holds subsetted objects in disk, +#' and returns a list with both the matrix and fragments. #' @param directory (character) Where the input/output data should be stored. If NULL, a temporary directory is created. #' @param mat_name (character) Name of the RNA matrix file. If NULL, the matrix is named "demo_mat." #' @param frags_name (character) Name of the ATAC fragments file. If NULL, the fragments are named "demo_frags". @@ -91,12 +93,13 @@ prepare_demo_data <- function(directory = NULL, mat_name = NULL, frags_name = NU #' @return #' - `get_demo_mat()`: (IterableMatrix) A `(features x cells)` matrix of shape `(1984 x 2724)`. #' @details -#' The first time either `get_demo_mat()` are ran `get_demo_frags()`, the demo data is downloaded and stored in the BPCells data directory -#' (under `file.path(tools::R_user_dir("BPcells", which="data"), "demo_data")`). Subsequent calls to this function will use the previously downloaded matrix. 
-#' The preperation of this matrix can be reproduced by running the internal function `prepare_demo_data()`. +#' The first time either `get_demo_mat()` are ran `get_demo_frags()`, the pre-subsetted +#' demo data is downloaded and stored in the BPCells data directory (under `file.path(tools::R_user_dir("BPcells", which="data"), "demo_data")`). +#' Subsequent calls to this function will use the previously downloaded matrix. +#' The preperation of this matrix can be reproduced by running the internal function `prepare_demo_data()` with `directory` set to the BPCells data directory. #' -#' In the case that demo data is not pre-downloaded and demo data download fails, `prepare_demo_data()` will be run, -#' which manually builds the demo dataset from the 10x Genomics PBMC 3k dataset. +#' In the case that demo data is not pre-downloaded and demo data download fails, `prepare_demo_data()` will act +#' as a fallback. #' #' Both the matrix from `get_demo_mat()` and the fragments from `get_demo_frags()` #' may be removed by running `remove_demo_data()`. diff --git a/r/man/demo_data.Rd b/r/man/demo_data.Rd index 89d149c2..df58d7cb 100644 --- a/r/man/demo_data.Rd +++ b/r/man/demo_data.Rd @@ -30,12 +30,13 @@ The demo dataset is a subset of the 10x Genomics PBMC 3k dataset, and filters b to cells with at least 1000 reads. Both the matrix and the fragments are subset to only genes on chromosomes 4 and 11. } \details{ -The first time either \code{get_demo_mat()} are ran \code{get_demo_frags()}, the demo data is downloaded and stored in the BPCells data directory -(under \code{file.path(tools::R_user_dir("BPcells", which="data"), "demo_data")}). Subsequent calls to this function will use the previously downloaded matrix. -The preperation of this matrix can be reproduced by running the internal function \code{prepare_demo_data()}. 
+The first time either \code{get_demo_mat()} are ran \code{get_demo_frags()}, the pre-subsetted +demo data is downloaded and stored in the BPCells data directory (under \code{file.path(tools::R_user_dir("BPcells", which="data"), "demo_data")}). +Subsequent calls to this function will use the previously downloaded matrix. +The preperation of this matrix can be reproduced by running the internal function \code{prepare_demo_data()} with \code{directory} set to the BPCells data directory. -In the case that demo data is not pre-downloaded and demo data download fails, \code{prepare_demo_data()} will be run, -which manually builds the demo dataset from the 10x Genomics PBMC 3k dataset. +In the case that demo data is not pre-downloaded and demo data download fails, \code{prepare_demo_data()} will act +as a fallback. Both the matrix from \code{get_demo_mat()} and the fragments from \code{get_demo_frags()} may be removed by running \code{remove_demo_data()}. diff --git a/r/man/prepare_demo_data.Rd b/r/man/prepare_demo_data.Rd index 7b8f14a9..ee498b19 100644 --- a/r/man/prepare_demo_data.Rd +++ b/r/man/prepare_demo_data.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/data.R \name{prepare_demo_data} \alias{prepare_demo_data} -\title{Prepare a demo matrix and demo fragments for BPCells.} +\title{Create a small demo matrix and fragment object} \usage{ prepare_demo_data( directory = NULL, @@ -28,7 +28,8 @@ after processing.} (list) A list with the RNA matrix under the name "mat", and the ATAC fragments under the name "frags". } \description{ -Prepare a demo matrix and demo fragments for BPCells. +Downloads a 10x Genomics dataset, then performs QC and subsetting. Holds subsetted objects in disk, +and returns a list with both the matrix and fragments. } \details{ This function downloads the 10x Genomics PBMC 3k dataset, and filters the fragments and matrix to cells with at least 1000 reads. 
From 4b748b91f3748ff0fa0ba4be58732be9ffac3aaf Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Thu, 27 Feb 2025 15:10:14 -0800 Subject: [PATCH 04/15] [r] update `get_demo_mat()` docs --- r/R/data.R | 10 +++++----- r/man/demo_data.Rd | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/r/R/data.R b/r/R/data.R index 2865c307..52190ea3 100644 --- a/r/R/data.R +++ b/r/R/data.R @@ -94,17 +94,17 @@ prepare_demo_data <- function(directory = NULL, mat_name = NULL, frags_name = NU #' - `get_demo_mat()`: (IterableMatrix) A `(features x cells)` matrix of shape `(1984 x 2724)`. #' @details #' The first time either `get_demo_mat()` are ran `get_demo_frags()`, the pre-subsetted -#' demo data is downloaded and stored in the BPCells data directory (under `file.path(tools::R_user_dir("BPcells", which="data"), "demo_data")`). -#' Subsequent calls to this function will use the previously downloaded matrix. +#' demo data is downloaded and stored in the BPCells data directory (under `file.path(tools::R_user_dir("BPcells", which="data"), "demo_data")`). +#' Subsequent calls to this function will use the previously downloaded matrix/fragments. #' The preperation of this matrix can be reproduced by running the internal function `prepare_demo_data()` with `directory` set to the BPCells data directory. #' -#' In the case that demo data is not pre-downloaded and demo data download fails, `prepare_demo_data()` will act +#' In the case that demo data is not pre-downloaded and demo data download fails, `prepare_demo_data()` will act #' as a fallback. #' -#' Both the matrix from `get_demo_mat()` and the fragments from `get_demo_frags()` +#' Both the matrix from `get_demo_mat()` and the fragments from `get_demo_frags()` #' may be removed by running `remove_demo_data()`. #' -#' - `get_demo_mat()`: Retrieve a 1 MB demo matrix, representing a subset of the 10X Genomics PBMC 3k dataset. 
+#' - `get_demo_mat()`: Retrieve a 1 MB demo matrix, representing a subset of the 10X Genomics PBMC 3k dataset. #' @export get_demo_mat <- function() { # Use the data directory for BPCells diff --git a/r/man/demo_data.Rd b/r/man/demo_data.Rd index df58d7cb..5fec63db 100644 --- a/r/man/demo_data.Rd +++ b/r/man/demo_data.Rd @@ -32,7 +32,7 @@ to cells with at least 1000 reads. Both the matrix and the fragments are subset \details{ The first time either \code{get_demo_mat()} are ran \code{get_demo_frags()}, the pre-subsetted demo data is downloaded and stored in the BPCells data directory (under \code{file.path(tools::R_user_dir("BPcells", which="data"), "demo_data")}). -Subsequent calls to this function will use the previously downloaded matrix. +Subsequent calls to this function will use the previously downloaded matrix/fragments. The preperation of this matrix can be reproduced by running the internal function \code{prepare_demo_data()} with \code{directory} set to the BPCells data directory. In the case that demo data is not pre-downloaded and demo data download fails, \code{prepare_demo_data()} will act From 8311ca82ea9fd68a2c699c6ce88f57cff057dd06 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Fri, 7 Mar 2025 15:37:03 -0800 Subject: [PATCH 05/15] [r] clean up wording for demo data docstring --- r/R/data.R | 10 +++++----- r/man/demo_data.Rd | 10 +++++----- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/r/R/data.R b/r/R/data.R index 52190ea3..761c050f 100644 --- a/r/R/data.R +++ b/r/R/data.R @@ -87,8 +87,8 @@ prepare_demo_data <- function(directory = NULL, mat_name = NULL, frags_name = NU #' Retrieve BPCells demo data #' -#' The demo dataset is a subset of the 10x Genomics PBMC 3k dataset, and filters both the matrix and the fragments -#' to cells with at least 1000 reads. Both the matrix and the fragments are subset to only genes on chromosomes 4 and 11. 
+#' The demo dataset is a subset of the 10x Genomics PBMC 3k dataset, with the matrix and fragments +#' filtered to cells with at least 1000 reads. Both the matrix and the fragments are also subset to only genes on chromosomes 4 and 11. #' @rdname demo_data #' @return #' - `get_demo_mat()`: (IterableMatrix) A `(features x cells)` matrix of shape `(1984 x 2724)`. @@ -104,7 +104,7 @@ prepare_demo_data <- function(directory = NULL, mat_name = NULL, frags_name = NU #' Both the matrix from `get_demo_mat()` and the fragments from `get_demo_frags()` #' may be removed by running `remove_demo_data()`. #' -#' - `get_demo_mat()`: Retrieve a 1 MB demo matrix, representing a subset of the 10X Genomics PBMC 3k dataset. +#' - `get_demo_mat()`: Retrieve a 1 MB demo `IterableMatrix`, representing a subset of the 10X Genomics PBMC 3k dataset. #' @export get_demo_mat <- function() { # Use the data directory for BPCells @@ -130,7 +130,7 @@ get_demo_mat <- function() { #' @return #' - `get_demo_frags()`: (IterableFragments) A Fragments object with 2724 cells and fragments on chromosomes 4 and 11. #' @details -#' - `get_demo_frags()`: Retrieve a 12.5 MB demo fragments object, representing a subset of the 10X Genomics PBMC 3k dataset. +#' - `get_demo_frags()`: Retrieve a 12.5 MB demo `IterableFragments` object, representing a subset of the 10X Genomics PBMC 3k dataset. #' @export get_demo_frags <- function() { data_dir <- file.path(tools::R_user_dir("BPCells", which = "data"), "demo_data") @@ -152,7 +152,7 @@ get_demo_frags <- function() { #' @rdname demo_data #' @return -#' - `remove_demo_data()`: NULL +#' - `remove_demo_data()`: `NULL` #' @details #' - `remove_demo_data()`: Remove the demo data from the BPCells data directory. 
#' @export diff --git a/r/man/demo_data.Rd b/r/man/demo_data.Rd index 5fec63db..6e677812 100644 --- a/r/man/demo_data.Rd +++ b/r/man/demo_data.Rd @@ -22,12 +22,12 @@ remove_demo_data() } \itemize{ -\item \code{remove_demo_data()}: NULL +\item \code{remove_demo_data()}: \code{NULL} } } \description{ -The demo dataset is a subset of the 10x Genomics PBMC 3k dataset, and filters both the matrix and the fragments -to cells with at least 1000 reads. Both the matrix and the fragments are subset to only genes on chromosomes 4 and 11. +The demo dataset is a subset of the 10x Genomics PBMC 3k dataset, with the matrix and fragments +filtered to cells with at least 1000 reads. Both the matrix and the fragments are also subset to only genes on chromosomes 4 and 11. } \details{ The first time either \code{get_demo_mat()} are ran \code{get_demo_frags()}, the pre-subsetted @@ -41,11 +41,11 @@ as a fallback. Both the matrix from \code{get_demo_mat()} and the fragments from \code{get_demo_frags()} may be removed by running \code{remove_demo_data()}. \itemize{ -\item \code{get_demo_mat()}: Retrieve a 1 MB demo matrix, representing a subset of the 10X Genomics PBMC 3k dataset. +\item \code{get_demo_mat()}: Retrieve a 1 MB demo \code{IterableMatrix}, representing a subset of the 10X Genomics PBMC 3k dataset. } \itemize{ -\item \code{get_demo_frags()}: Retrieve a 12.5 MB demo fragments object, representing a subset of the 10X Genomics PBMC 3k dataset. +\item \code{get_demo_frags()}: Retrieve a 12.5 MB demo \code{IterableFragments} object, representing a subset of the 10X Genomics PBMC 3k dataset. 
} \itemize{ From c96db5456907c78d0468c3b89386581fde30d763 Mon Sep 17 00:00:00 2001 From: Immanuel Abdi Date: Wed, 26 Mar 2025 22:12:21 -0700 Subject: [PATCH 06/15] [r] expand demo data parameterization with subsetting and filtering --- r/R/data.R | 201 +++++++++++++++++++++++-------------- r/man/demo_data.Rd | 50 +++++++-- r/man/prepare_demo_data.Rd | 20 ++-- 3 files changed, 177 insertions(+), 94 deletions(-) diff --git a/r/R/data.R b/r/R/data.R index 761c050f..abe38505 100644 --- a/r/R/data.R +++ b/r/R/data.R @@ -6,74 +6,92 @@ # option. This file may not be copied, modified, or distributed # except according to those terms. -#' Create a small demo matrix and fragment object +#' Create a small demo matrix and fragment object. #' -#' Downloads a 10x Genomics dataset, then performs QC and subsetting. Holds subsetted objects in disk, +#' Downloads a 10x Genomics dataset consisting of 3k cells, then performs optional QC and subsetting. Holds subsetted objects in disk, #' and returns a list with both the matrix and fragments. #' @param directory (character) Where the input/output data should be stored. If NULL, a temporary directory is created. -#' @param mat_name (character) Name of the RNA matrix file. If NULL, the matrix is named "demo_mat." -#' @param frags_name (character) Name of the ATAC fragments file. If NULL, the fragments are named "demo_frags". +#' @param filter_qc (bool) Whether to filter both the RNA and ATAC data using QC information. +#' @param subset (bool) Whether to subset to only genes/insertions on chromosome 4 and 11. #' @param timeout (numeric) Timeout for downloading files in seconds. #' @param remove_input_data (logical) Whether to remove the downloaded non-procesed matrix, frags, gencode transcripts, and gencode genes #' after processing. #' @return (list) A list with the RNA matrix under the name "mat", and the ATAC fragments under the name "frags". 
#' @details -#' This function downloads the 10x Genomics PBMC 3k dataset, and filters the fragments and matrix to cells with at least 1000 reads. -#' Following, both fragments and the matrix is subset to only genes and insertions on chromosomes 4 and 11. -#' The RNA matrix is 1 MB and the fragments are 12.5 MB, after BPCells compression. +#' This function downloads the 10x Genomics PBMC 3k dataset. +#' Filtering using QC information on the fragments and matrix provides cells with at least 1000 reads, 1000 frags, and a minimum tss enrichment of 10. +#' Subsetting provides only genes and insertions on chromosomes 4 and 11. +#' The name of the matrix and fragments folders are `demo_mat` and `demo_frags` respectively. +#' Additionally, choosing to qc filter appends a `_filtered`, and choosing to subset data appends a `_subsetted` to the name. #' @keywords internal -prepare_demo_data <- function(directory = NULL, mat_name = NULL, frags_name = NULL, timeout = 300, remove_input_data = TRUE) { +prepare_demo_data <- function(directory = NULL, filter_qc = TRUE, subset = TRUE, timeout = 300, remove_input_data = TRUE) { if (is.null(directory)) { - directory <- file.path(tempdir()) - dir.create(directory, recursive = TRUE, showWarnings = FALSE) + directory <- file.path(tempdir()) + dir.create(directory, recursive = TRUE, showWarnings = FALSE) } - if (is.null(mat_name)) { - mat_name <- "demo_mat" + mat_name <- "demo_mat" + frags_name <- "demo_frags" + if (filter_qc) { + mat_name <- paste0(mat_name, "_filtered") + frags_name <- paste0(frags_name, "_filtered") } - if (is.null(frags_name)) { - frags_name <- "demo_frags" + if (subset) { + mat_name <- paste0(mat_name, "_subsetted") + frags_name <- paste0(frags_name, "_subsetted") } + # Download matrix/frags if not done previously, and open url_base <- "https://cf.10xgenomics.com/samples/cell-arc/2.0.0/pbmc_granulocyte_sorted_3k/" - rna_raw_url <- paste0(url_base, "pbmc_granulocyte_sorted_3k_raw_feature_bc_matrix.h5") - atac_raw_url 
<- paste0(url_base, "pbmc_granulocyte_sorted_3k_atac_fragments.tsv.gz") - ensure_downloaded(file.path(directory, "pbmc_3k_10x.h5"), rna_raw_url, timeout = timeout) - ensure_downloaded(file.path(directory, "pbmc_3k_10x.fragments.tsv.gz"), atac_raw_url, timeout = timeout) - if (!file.exists(file.path(directory,"pbmc_3k_rna_raw"))) { - mat_raw <- open_matrix_10x_hdf5(file.path(directory, "pbmc_3k_10x.h5"), feature_type="Gene Expression") %>% - write_matrix_dir(file.path(directory, "pbmc_3k_rna_raw")) + if (!file.exists(file.path(directory, "pbmc_3k_rna_raw"))) { + rna_raw_url <- paste0(url_base, "pbmc_granulocyte_sorted_3k_raw_feature_bc_matrix.h5") + ensure_downloaded(file.path(directory, "pbmc_3k_10x.h5"), rna_raw_url, timeout = timeout) + mat <- open_matrix_10x_hdf5(file.path(directory, "pbmc_3k_10x.h5"), feature_type="Gene Expression") %>% + write_matrix_dir(file.path(directory, "pbmc_3k_rna_raw")) } else { - mat_raw <- open_matrix_dir(file.path(directory, "pbmc_3k_rna_raw")) + mat <- open_matrix_dir(file.path(directory, "pbmc_3k_rna_raw")) } # Check if we already ran import if (!file.exists(file.path(directory, "pbmc_3k_frags"))) { - frags_raw <- open_fragments_10x(file.path(directory, "pbmc_3k_10x.fragments.tsv.gz")) %>% - write_fragments_dir(file.path(directory, "pbmc_3k_frags")) + atac_raw_url <- paste0(url_base, "pbmc_granulocyte_sorted_3k_atac_fragments.tsv.gz") + ensure_downloaded(file.path(directory, "pbmc_3k_10x.fragments.tsv.gz"), atac_raw_url, timeout = timeout) + frags <- open_fragments_10x(file.path(directory, "pbmc_3k_10x.fragments.tsv.gz")) %>% + write_fragments_dir(file.path(directory, "pbmc_3k_frags")) } else { - frags_raw <- open_fragments_dir(file.path(directory, "pbmc_3k_frags")) + frags <- open_fragments_dir(file.path(directory, "pbmc_3k_frags")) } - # for atac transcripts - transcripts <- read_gencode_transcripts( - file.path(directory, "references_transcripts"), - release = "42", - transcript_choice = "MANE_Select", - annotation_set = 
"basic", - features = "transcript" - ) - # for RNA genes - genes_demo <- read_gencode_genes( - file.path(directory, "./reference_genes"), - release = "42", - annotation_set = "basic", - ) - # Filter to only cells that have at least 1000 reads on the RNA side - # and only genes/fragments that exist on chr 4 and 11 - reads_per_cell <- colSums(mat_raw) - filtered_cells <- colnames(mat_raw)[reads_per_cell >= 1e3] - filtered_genes <- genes_demo[genes_demo$chr %in% c("chr4", "chr11"),]$gene_id - # remove version numbers - filtered_genes <- gsub("\\..*", "", filtered_genes) - mat <- mat_raw[which(rownames(mat_raw) %in% filtered_genes), filtered_cells] - frags <- select_cells(frags_raw, filtered_cells) %>% select_chromosomes(c("chr4", "chr11")) + if (filter_qc) { + # Download annotations for transcripts + transcripts <- read_gencode_transcripts( + file.path(directory, "references"), + release = "42", + transcript_choice = "MANE_Select", + annotation_set = "basic", + features = "transcript" + ) + blacklist <- read_encode_blacklist(file.path(directory, "references"), genome="hg38") + atac_qc <- qc_scATAC(frags, transcripts, blacklist) + # Filter to only cells that have at least 1000 reads on the RNA side + # a minimum of 1000 frag reads, and greater than 10 tss enrichment + pass_atac <- atac_qc %>% + dplyr::filter(nFrags > 1000, TSSEnrichment > 10) %>% + dplyr::pull(cellName) + pass_rna <- colnames(mat)[colSums(mat) > 1000] + filtered_cells <- intersect(pass_atac, pass_rna) + frags <- select_cells(frags, filtered_cells) + mat <- mat[, filtered_cells] + } + if (subset) { + # Subset to only genes/fragments that exist on chr4 and 11 + genes_demo <- read_gencode_genes( + file.path(directory, "./references"), + release = "42", + annotation_set = "basic", + ) + filtered_genes <- genes_demo[genes_demo$chr %in% c("chr4", "chr11"),]$gene_id + # remove version numbers + filtered_genes <- gsub("\\..*", "", filtered_genes) + mat <- mat[which(rownames(mat) %in% filtered_genes), ] + frags 
<- frags %>% select_chromosomes(c("chr4", "chr11")) + } mat <- write_matrix_dir(mat, file.path(directory, mat_name), overwrite = TRUE) frags <- write_fragments_dir(frags, file.path(directory, frags_name), overwrite = TRUE) if (remove_input_data) { @@ -87,15 +105,20 @@ prepare_demo_data <- function(directory = NULL, mat_name = NULL, frags_name = NU #' Retrieve BPCells demo data #' -#' The demo dataset is a subset of the 10x Genomics PBMC 3k dataset, with the matrix and fragments -#' filtered to cells with at least 1000 reads. Both the matrix and the fragments are also subset to only genes on chromosomes 4 and 11. +#' Functions to download matrices and fragments derived from the 10X Genomics PBMC 3k dataset, +#' with options to filter with common qc metrics, and to subset genes and fragments to only chromosome 4 and 11. #' @rdname demo_data +#' @param filter_qc (bool) Whether to filter both the RNA and ATAC data using qc metrics (described in `details`). +#' @param subset (bool) Whether to subset to only genes/insertions on chromosome 4 and 11. #' @return -#' - `get_demo_mat()`: (IterableMatrix) A `(features x cells)` matrix of shape `(1984 x 2724)`. -#' @details -#' The first time either `get_demo_mat()` are ran `get_demo_frags()`, the pre-subsetted +#' - `get_demo_mat()`: (IterableMatrix) A `(features x cells)` matrix. +#' @details +#' **Data Processing**: +#' +#' The first time either `get_demo_mat()` are ran `get_demo_frags()`, #' demo data is downloaded and stored in the BPCells data directory (under `file.path(tools::R_user_dir("BPcells", which="data"), "demo_data")`). -#' Subsequent calls to this function will use the previously downloaded matrix/fragments. +#' Subsequent calls to this function will use the previously downloaded matrix/fragments, given that the same combination of filtering and +#' subsetting has been performed previously. 
#' The preperation of this matrix can be reproduced by running the internal function `prepare_demo_data()` with `directory` set to the BPCells data directory. #' #' In the case that demo data is not pre-downloaded and demo data download fails, `prepare_demo_data()` will act @@ -104,57 +127,85 @@ prepare_demo_data <- function(directory = NULL, mat_name = NULL, frags_name = NU #' Both the matrix from `get_demo_mat()` and the fragments from `get_demo_frags()` #' may be removed by running `remove_demo_data()`. #' -#' - `get_demo_mat()`: Retrieve a 1 MB demo `IterableMatrix`, representing a subset of the 10X Genomics PBMC 3k dataset. +#' Filtering using QC information on the fragments and matrix object chooses cells with at least 1000 reads, 1000 frags, and a minimum tss enrichment of 10. +#' Subsetting provides only genes and insertions on chromosomes 4 and 11. +#' +#' **Dimensions**: +#' \tabular{lll}{ +#' \strong{Condition} \tab \strong{RNA matrix (features x cells)} \tab \strong{Fragments (chromosomes x cells)} \cr +#' Raw \tab (36601 x 650165) \tab (39 x 462264) \cr +#' Filter \tab (36601 x 2600) \tab (39 x 2600) \cr +#' Subset \tab (3582 x 650165) \tab (2 x 462264) \cr +#' Filter + Subset \tab (3582 x 2600) \tab (2 x 2600) \cr +#' } +#' **Data size**: +#' \tabular{lll}{ +#' \strong{Condition} \tab \strong{RNA matrix (MB)} \tab \strong{Fragments (MB)} \cr +#' Raw \tab 31.9 \tab 200 \cr +#' Filter \tab 9.4 \tab 137 \cr +#' Subset \tab 18.3 \tab 25.6 \cr +#' Filter + Subset \tab 1.2 \tab 12.3 \cr +#' } +#' +#' **Function Description**: +#' +#' - `get_demo_mat()`: Retrieve a demo `IterableMatrix` object representing the 10X Genomics PBMC 3k dataset. 
#' @export -get_demo_mat <- function() { +get_demo_mat <- function(filter_qc = TRUE, subset = TRUE) { # Use the data directory for BPCells data_dir <- file.path(tools::R_user_dir("BPCells", which = "data"), "demo_data") if (!dir.exists(data_dir)) { dir.create(data_dir, recursive = TRUE) } - if (!dir.exists(file.path(data_dir, "demo_mat"))) { - url <- "https://pub-c4e56988ff67429e9856ffa33aecb0c1.r2.dev/demo_mat.tar.gz" - download.file(url, file.path(data_dir, "demo_mat.tar.gz")) + mat_name = "demo_mat" + if (filter_qc) mat_name <- paste0(mat_name, "_filtered") + if (subset) mat_name <- paste0(mat_name, "_subsetted") + if (!dir.exists(file.path(data_dir, mat_name))) { + url <- paste0("https://pub-c4e56988ff67429e9856ffa33aecb0c1.r2.dev/", mat_name, ".tar.gz") + suppressWarnings(try(download.file(url, file.path(data_dir, paste0(mat_name, ".tar.gz"))), silent = TRUE)) # Check if file download failed - if (!file.exists(file.path(data_dir, "demo_mat.tar.gz"))) { - prepare_demo_data(data_dir) + if (!file.exists(file.path(data_dir, paste0(mat_name, ".tar.gz")))) { + prepare_demo_data(data_dir, filter_qc = filter_qc, subset = subset) } else { - untar(file.path(data_dir, "demo_mat.tar.gz"), exdir=data_dir) - file.remove(file.path(data_dir, "demo_mat.tar.gz")) + untar(file.path(data_dir, paste0(mat_name, ".tar.gz")), exdir=data_dir) + file.remove(file.path(data_dir, paste0(mat_name, ".tar.gz"))) } } - return(open_matrix_dir(file.path(data_dir, "demo_mat"))) + return(open_matrix_dir(file.path(data_dir, mat_name))) } #' @rdname demo_data #' @return -#' - `get_demo_frags()`: (IterableFragments) A Fragments object with 2724 cells and fragments on chromosomes 4 and 11. +#' - `get_demo_frags()`: (IterableFragments) A Fragments object. #' @details -#' - `get_demo_frags()`: Retrieve a 12.5 MB demo `IterableFragments` object, representing a subset of the 10X Genomics PBMC 3k dataset. 
+#' - `get_demo_frags()`: Retrieve a demo `IterableFragments` object representing the 10X Genomics PBMC 3k dataset. #' @export -get_demo_frags <- function() { +get_demo_frags <- function(filter_qc = TRUE, subset = TRUE) { data_dir <- file.path(tools::R_user_dir("BPCells", which = "data"), "demo_data") if (!dir.exists(data_dir)) { dir.create(data_dir, recursive = TRUE) } - if (!dir.exists(file.path(data_dir, "demo_frags"))) { - url <- "https://pub-c4e56988ff67429e9856ffa33aecb0c1.r2.dev/demo_frags.tar.gz" - download.file(url, file.path(data_dir, "demo_frags.tar.gz")) - if (!file.exists(file.path(data_dir, "demo_frags.tar.gz"))) { + frags_name <- "demo_frags" + if (filter_qc) frags_name <- paste0(frags_name, "_filtered") + if (subset) frags_name <- paste0(frags_name, "_subsetted") + if (!dir.exists(file.path(data_dir, frags_name))) { + url <- paste0("https://pub-c4e56988ff67429e9856ffa33aecb0c1.r2.dev/", frags_name, ".tar.gz") + suppressWarnings(try(download.file(url, file.path(data_dir, paste0(frags_name, ".tar.gz"))), silent = TRUE)) + if (!file.exists(file.path(data_dir, paste0(frags_name, ".tar.gz")))) { prepare_demo_data(data_dir) } else { - untar(file.path(data_dir, "demo_frags.tar.gz"), exdir = data_dir) - file.remove(file.path(data_dir, "demo_frags.tar.gz")) + untar(file.path(data_dir, paste0(frags_name, ".tar.gz")), exdir = data_dir) + file.remove(file.path(data_dir, paste0(frags_name, ".tar.gz"))) } } - return(open_fragments_dir(file.path(data_dir, "demo_frags"))) + return(open_fragments_dir(file.path(data_dir, frags_name))) } #' @rdname demo_data #' @return #' - `remove_demo_data()`: `NULL` #' @details -#' - `remove_demo_data()`: Remove the demo data from the BPCells data directory. +#' - `remove_demo_data()`: Remove the demo data from the BPCells data directory. 
#' @export remove_demo_data <- function() { data_dir <- file.path(tools::R_user_dir("BPCells", which = "data"), "demo_data") diff --git a/r/man/demo_data.Rd b/r/man/demo_data.Rd index 6e677812..89d16a7e 100644 --- a/r/man/demo_data.Rd +++ b/r/man/demo_data.Rd @@ -6,19 +6,24 @@ \alias{remove_demo_data} \title{Retrieve BPCells demo data} \usage{ -get_demo_mat() +get_demo_mat(filter_qc = TRUE, subset = TRUE) -get_demo_frags() +get_demo_frags(filter_qc = TRUE, subset = TRUE) remove_demo_data() } +\arguments{ +\item{filter_qc}{(bool) Whether to filter both the RNA and ATAC data using qc metrics (described in \code{details}).} + +\item{subset}{(bool) Whether to subset to only genes/insertions on chromosome 4 and 11.} +} \value{ \itemize{ -\item \code{get_demo_mat()}: (IterableMatrix) A \verb{(features x cells)} matrix of shape \verb{(1984 x 2724)}. +\item \code{get_demo_mat()}: (IterableMatrix) A \verb{(features x cells)} matrix. } \itemize{ -\item \code{get_demo_frags()}: (IterableFragments) A Fragments object with 2724 cells and fragments on chromosomes 4 and 11. +\item \code{get_demo_frags()}: (IterableFragments) A Fragments object. } \itemize{ @@ -26,13 +31,16 @@ remove_demo_data() } } \description{ -The demo dataset is a subset of the 10x Genomics PBMC 3k dataset, with the matrix and fragments -filtered to cells with at least 1000 reads. Both the matrix and the fragments are also subset to only genes on chromosomes 4 and 11. +Functions to download matrices and fragments derived from the 10X Genomics PBMC 3k dataset, +with options to filter with common qc metrics, and to subset genes and fragments to only chromosome 4 and 11. 
} \details{ -The first time either \code{get_demo_mat()} are ran \code{get_demo_frags()}, the pre-subsetted +\strong{Data Processing}: + +The first time either \code{get_demo_mat()} are ran \code{get_demo_frags()}, demo data is downloaded and stored in the BPCells data directory (under \code{file.path(tools::R_user_dir("BPcells", which="data"), "demo_data")}). -Subsequent calls to this function will use the previously downloaded matrix/fragments. +Subsequent calls to this function will use the previously downloaded matrix/fragments, given that the same combination of filtering and +subsetting has been performed previously. The preperation of this matrix can be reproduced by running the internal function \code{prepare_demo_data()} with \code{directory} set to the BPCells data directory. In the case that demo data is not pre-downloaded and demo data download fails, \code{prepare_demo_data()} will act @@ -40,12 +48,34 @@ as a fallback. Both the matrix from \code{get_demo_mat()} and the fragments from \code{get_demo_frags()} may be removed by running \code{remove_demo_data()}. + +Filtering using QC information on the fragments and matrix object chooses cells with at least 1000 reads, 1000 frags, and a minimum tss enrichment of 10. +Subsetting provides only genes and insertions on chromosomes 4 and 11. 
+ +\strong{Dimensions}: +\tabular{lll}{ +\strong{Condition} \tab \strong{RNA matrix (features x cells)} \tab \strong{Fragments (chromosomes x cells)} \cr +Raw \tab (36601 x 650165) \tab (39 x 462264) \cr +Filter \tab (36601 x 2600) \tab (39 x 2600) \cr +Subset \tab (3582 x 650165) \tab (2 x 462264) \cr +Filter + Subset \tab (3582 x 2600) \tab (2 x 2600) \cr +} +\strong{Data size}: +\tabular{lll}{ +\strong{Condition} \tab \strong{RNA matrix (MB)} \tab \strong{Fragments (MB)} \cr +Raw \tab 31.9 \tab 200 \cr +Filter \tab 9.4 \tab 137 \cr +Subset \tab 18.3 \tab 25.6 \cr +Filter + Subset \tab 1.2 \tab 12.3 \cr +} + +\strong{Function Description}: \itemize{ -\item \code{get_demo_mat()}: Retrieve a 1 MB demo \code{IterableMatrix}, representing a subset of the 10X Genomics PBMC 3k dataset. +\item \code{get_demo_mat()}: Retrieve a demo \code{IterableMatrix} object representing the 10X Genomics PBMC 3k dataset. } \itemize{ -\item \code{get_demo_frags()}: Retrieve a 12.5 MB demo \code{IterableFragments} object, representing a subset of the 10X Genomics PBMC 3k dataset. +\item \code{get_demo_frags()}: Retrieve a demo \code{IterableFragments} object representing the 10X Genomics PBMC 3k dataset. } \itemize{ diff --git a/r/man/prepare_demo_data.Rd b/r/man/prepare_demo_data.Rd index ee498b19..c1921924 100644 --- a/r/man/prepare_demo_data.Rd +++ b/r/man/prepare_demo_data.Rd @@ -2,12 +2,12 @@ % Please edit documentation in R/data.R \name{prepare_demo_data} \alias{prepare_demo_data} -\title{Create a small demo matrix and fragment object} +\title{Create a small demo matrix and fragment object.} \usage{ prepare_demo_data( directory = NULL, - mat_name = NULL, - frags_name = NULL, + filter_qc = TRUE, + subset = TRUE, timeout = 300, remove_input_data = TRUE ) @@ -15,9 +15,9 @@ prepare_demo_data( \arguments{ \item{directory}{(character) Where the input/output data should be stored. If NULL, a temporary directory is created.} -\item{mat_name}{(character) Name of the RNA matrix file. 
If NULL, the matrix is named "demo_mat."} +\item{filter_qc}{(bool) Whether to filter both the RNA and ATAC data using QC information.} -\item{frags_name}{(character) Name of the ATAC fragments file. If NULL, the fragments are named "demo_frags".} +\item{subset}{(bool) Whether to subset to only genes/insertions on chromosome 4 and 11.} \item{timeout}{(numeric) Timeout for downloading files in seconds.} @@ -28,12 +28,14 @@ after processing.} (list) A list with the RNA matrix under the name "mat", and the ATAC fragments under the name "frags". } \description{ -Downloads a 10x Genomics dataset, then performs QC and subsetting. Holds subsetted objects in disk, +Downloads a 10x Genomics dataset consisting of 3k cells, then performs optional QC and subsetting. Holds subsetted objects in disk, and returns a list with both the matrix and fragments. } \details{ -This function downloads the 10x Genomics PBMC 3k dataset, and filters the fragments and matrix to cells with at least 1000 reads. -Following, both fragments and the matrix is subset to only genes and insertions on chromosomes 4 and 11. -The RNA matrix is 1 MB and the fragments are 12.5 MB, after BPCells compression. +This function downloads the 10x Genomics PBMC 3k dataset. +Filtering using QC information on the fragments and matrix provides cells with at least 1000 reads, 1000 frags, and a minimum tss enrichment of 10. +Subsetting provides only genes and insertions on chromosomes 4 and 11. +The name of the matrix and fragments folders are \code{demo_mat} and \code{demo_frags} respectively. +Additionally, choosing to qc filter appends a \verb{_filtered}, and choosing to subset data appends a \verb{_subsetted} to the name. 
} \keyword{internal} From af1eba714f41cbf54d5b9c4c510f8be2e1127f77 Mon Sep 17 00:00:00 2001 From: Immanuel Abdi <56730419+immanuelazn@users.noreply.github.com> Date: Mon, 7 Apr 2025 11:57:06 -0700 Subject: [PATCH 07/15] Apply suggestions from code review Co-authored-by: Ben Parks --- r/R/data.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/r/R/data.R b/r/R/data.R index abe38505..a0aec9a7 100644 --- a/r/R/data.R +++ b/r/R/data.R @@ -14,7 +14,7 @@ #' @param filter_qc (bool) Whether to filter both the RNA and ATAC data using QC information. #' @param subset (bool) Whether to subset to only genes/insertions on chromosome 4 and 11. #' @param timeout (numeric) Timeout for downloading files in seconds. -#' @param remove_input_data (logical) Whether to remove the downloaded non-procesed matrix, frags, gencode transcripts, and gencode genes +#' @param remove_input_data (logical) Whether to remove the downloaded non-processed matrix, frags, gencode transcripts, and gencode genes #' after processing. #' @return (list) A list with the RNA matrix under the name "mat", and the ATAC fragments under the name "frags". #' @details @@ -119,7 +119,7 @@ prepare_demo_data <- function(directory = NULL, filter_qc = TRUE, subset = TRUE, #' demo data is downloaded and stored in the BPCells data directory (under `file.path(tools::R_user_dir("BPcells", which="data"), "demo_data")`). #' Subsequent calls to this function will use the previously downloaded matrix/fragments, given that the same combination of filtering and #' subsetting has been performed previously. -#' The preperation of this matrix can be reproduced by running the internal function `prepare_demo_data()` with `directory` set to the BPCells data directory. +#' The preparation of this matrix can be reproduced by running the internal function `prepare_demo_data()` with `directory` set to the BPCells data directory. 
#' #' In the case that demo data is not pre-downloaded and demo data download fails, `prepare_demo_data()` will act #' as a fallback. From c6872dc81dcf2684ca13a694f857df0894727039 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Mon, 7 Apr 2025 12:53:12 -0700 Subject: [PATCH 08/15] [r] add examples to demo data delete intermediates when exiting clean up docs wording --- r/R/data.R | 63 ++++++++++++++++++++++++++------------ r/man/demo_data.Rd | 26 ++++++++++++++-- r/man/prepare_demo_data.Rd | 10 +++--- 3 files changed, 73 insertions(+), 26 deletions(-) diff --git a/r/R/data.R b/r/R/data.R index a0aec9a7..d3d8582a 100644 --- a/r/R/data.R +++ b/r/R/data.R @@ -10,13 +10,15 @@ #' #' Downloads a 10x Genomics dataset consisting of 3k cells, then performs optional QC and subsetting. Holds subsetted objects in disk, #' and returns a list with both the matrix and fragments. -#' @param directory (character) Where the input/output data should be stored. If NULL, a temporary directory is created. +#' @param directory (character) The directory where all the input/output data will be stored. +#' Downloaded intermediates will be stored in subdir `intermediates`. +#' If `NULL`, a temporary directory is created. #' @param filter_qc (bool) Whether to filter both the RNA and ATAC data using QC information. #' @param subset (bool) Whether to subset to only genes/insertions on chromosome 4 and 11. #' @param timeout (numeric) Timeout for downloading files in seconds. -#' @param remove_input_data (logical) Whether to remove the downloaded non-processed matrix, frags, gencode transcripts, and gencode genes -#' after processing. -#' @return (list) A list with the RNA matrix under the name "mat", and the ATAC fragments under the name "frags". +#' @param remove_input_data (logical) Whether to remove the intermediate non-processed matrix, frags, gencode transcripts, +#' and gencode genes after processing. 
If this function errors out, will also remove intermediate data if `remove_input_data` is `TRUE`. +#' @return (list) A list with the RNA matrix under the name `mat`, and the ATAC fragments under the name `frags`. #' @details #' This function downloads the 10x Genomics PBMC 3k dataset. #' Filtering using QC information on the fragments and matrix provides cells with at least 1000 reads, 1000 frags, and a minimum tss enrichment of 10. @@ -27,8 +29,12 @@ prepare_demo_data <- function(directory = NULL, filter_qc = TRUE, subset = TRUE, timeout = 300, remove_input_data = TRUE) { if (is.null(directory)) { directory <- file.path(tempdir()) - dir.create(directory, recursive = TRUE, showWarnings = FALSE) } + intermediate_dir <- file.path(directory, "intermediates") + dir.create(intermediate_dir, recursive = TRUE, showWarnings = FALSE) + # Delete all intermediates during exit if remove_input_data is TRUE. + on.exit(if (remove_input_data) unlink(intermediate_dir, recursive = TRUE)) + mat_name <- "demo_mat" frags_name <- "demo_frags" if (filter_qc) { @@ -41,10 +47,10 @@ prepare_demo_data <- function(directory = NULL, filter_qc = TRUE, subset = TRUE, } # Download matrix/frags if not done previously, and open url_base <- "https://cf.10xgenomics.com/samples/cell-arc/2.0.0/pbmc_granulocyte_sorted_3k/" - if (!file.exists(file.path(directory, "pbmc_3k_rna_raw"))) { + if (!file.exists(directory, "pbmc_3k_rna_raw")) { rna_raw_url <- paste0(url_base, "pbmc_granulocyte_sorted_3k_raw_feature_bc_matrix.h5") - ensure_downloaded(file.path(directory, "pbmc_3k_10x.h5"), rna_raw_url, timeout = timeout) - mat <- open_matrix_10x_hdf5(file.path(directory, "pbmc_3k_10x.h5"), feature_type="Gene Expression") %>% + ensure_downloaded(file.path(intermediate_dir, "pbmc_3k_10x.h5"), rna_raw_url, timeout = timeout) + mat <- open_matrix_10x_hdf5(file.path(intermediate_dir, "pbmc_3k_10x.h5"), feature_type="Gene Expression") %>% write_matrix_dir(file.path(directory, "pbmc_3k_rna_raw")) } else { mat <- 
open_matrix_dir(file.path(directory, "pbmc_3k_rna_raw")) @@ -52,8 +58,8 @@ prepare_demo_data <- function(directory = NULL, filter_qc = TRUE, subset = TRUE, # Check if we already ran import if (!file.exists(file.path(directory, "pbmc_3k_frags"))) { atac_raw_url <- paste0(url_base, "pbmc_granulocyte_sorted_3k_atac_fragments.tsv.gz") - ensure_downloaded(file.path(directory, "pbmc_3k_10x.fragments.tsv.gz"), atac_raw_url, timeout = timeout) - frags <- open_fragments_10x(file.path(directory, "pbmc_3k_10x.fragments.tsv.gz")) %>% + ensure_downloaded(file.path(intermediate_dir, "pbmc_3k_10x.fragments.tsv.gz"), atac_raw_url, timeout = timeout) + frags <- open_fragments_10x(file.path(intermediate_dir, "pbmc_3k_10x.fragments.tsv.gz")) %>% write_fragments_dir(file.path(directory, "pbmc_3k_frags")) } else { frags <- open_fragments_dir(file.path(directory, "pbmc_3k_frags")) @@ -61,13 +67,13 @@ prepare_demo_data <- function(directory = NULL, filter_qc = TRUE, subset = TRUE, if (filter_qc) { # Download annotations for transcripts transcripts <- read_gencode_transcripts( - file.path(directory, "references"), + intermediate_dir, release = "42", transcript_choice = "MANE_Select", annotation_set = "basic", features = "transcript" ) - blacklist <- read_encode_blacklist(file.path(directory, "references"), genome="hg38") + blacklist <- read_encode_blacklist(intermediate_dir, genome="hg38") atac_qc <- qc_scATAC(frags, transcripts, blacklist) # Filter to only cells that have at least 1000 reads on the RNA side # a minimum of 1000 frag reads, and greater than 10 tss enrichment @@ -82,7 +88,7 @@ prepare_demo_data <- function(directory = NULL, filter_qc = TRUE, subset = TRUE, if (subset) { # Subset to only genes/fragments that exist on chr4 and 11 genes_demo <- read_gencode_genes( - file.path(directory, "./references"), + intermediate_dir, release = "42", annotation_set = "basic", ) @@ -94,18 +100,12 @@ prepare_demo_data <- function(directory = NULL, filter_qc = TRUE, subset = TRUE, } mat <- 
write_matrix_dir(mat, file.path(directory, mat_name), overwrite = TRUE) frags <- write_fragments_dir(frags, file.path(directory, frags_name), overwrite = TRUE) - if (remove_input_data) { - unlink(file.path(directory, "pbmc_3k_10x.h5")) - unlink(file.path(directory, "pbmc_3k_10x.fragments.tsv.gz")) - unlink(file.path(directory, "pbmc_3k_rna_raw")) - unlink(file.path(directory, "pbmc_3k_frags")) - } return(list(mat = mat, frags = frags)) } #' Retrieve BPCells demo data #' -#' Functions to download matrices and fragments derived from the 10X Genomics PBMC 3k dataset, +#' `r lifecycle::badge("experimental")` \cr Functions to download matrices and fragments derived from the 10X Genomics PBMC 3k dataset, #' with options to filter with common qc metrics, and to subset genes and fragments to only chromosome 4 and 11. #' @rdname demo_data #' @param filter_qc (bool) Whether to filter both the RNA and ATAC data using qc metrics (described in `details`). @@ -113,6 +113,9 @@ prepare_demo_data <- function(directory = NULL, filter_qc = TRUE, subset = TRUE, #' @return #' - `get_demo_mat()`: (IterableMatrix) A `(features x cells)` matrix. #' @details +#' These data functions are experimental. +#' The interface, as well as the demo dataset itself will likely undergo changes in the near future. +#' #' **Data Processing**: #' #' The first time either `get_demo_mat()` are ran `get_demo_frags()`, @@ -150,6 +153,11 @@ prepare_demo_data <- function(directory = NULL, filter_qc = TRUE, subset = TRUE, #' **Function Description**: #' #' - `get_demo_mat()`: Retrieve a demo `IterableMatrix` object representing the 10X Genomics PBMC 3k dataset. 
+#' @examples +#' ####################################################################### +#' ## get_demo_mat() example +#' get_demo_mat() +#' ####################################################################### #' @export get_demo_mat <- function(filter_qc = TRUE, subset = TRUE) { # Use the data directory for BPCells @@ -179,6 +187,11 @@ get_demo_mat <- function(filter_qc = TRUE, subset = TRUE) { #' - `get_demo_frags()`: (IterableFragments) A Fragments object. #' @details #' - `get_demo_frags()`: Retrieve a demo `IterableFragments` object representing the 10X Genomics PBMC 3k dataset. +#' @examples +#' ####################################################################### +#' ## get_demo_frags() example +#' get_demo_frags() +#' ####################################################################### #' @export get_demo_frags <- function(filter_qc = TRUE, subset = TRUE) { data_dir <- file.path(tools::R_user_dir("BPCells", which = "data"), "demo_data") @@ -206,6 +219,16 @@ get_demo_frags <- function(filter_qc = TRUE, subset = TRUE) { #' - `remove_demo_data()`: `NULL` #' @details #' - `remove_demo_data()`: Remove the demo data from the BPCells data directory. 
+#' @examples +#' ####################################################################### +#' ## remove_demo_data() example +#' remove_demo_data() +#' +#' +#' ## Demo data folder is now empty +#' data_dir <- file.path(tools::R_user_dir("BPCells", which = "data"), "demo_data") +#' list.files(data_dir) +#' ####################################################################### #' @export remove_demo_data <- function() { data_dir <- file.path(tools::R_user_dir("BPCells", which = "data"), "demo_data") diff --git a/r/man/demo_data.Rd b/r/man/demo_data.Rd index 89d16a7e..b19795a4 100644 --- a/r/man/demo_data.Rd +++ b/r/man/demo_data.Rd @@ -31,17 +31,20 @@ remove_demo_data() } } \description{ -Functions to download matrices and fragments derived from the 10X Genomics PBMC 3k dataset, +\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} \cr Functions to download matrices and fragments derived from the 10X Genomics PBMC 3k dataset, with options to filter with common qc metrics, and to subset genes and fragments to only chromosome 4 and 11. } \details{ +These data functions are experimental. +The interface, as well as the demo dataset itself will likely undergo changes in the near future. + \strong{Data Processing}: The first time either \code{get_demo_mat()} are ran \code{get_demo_frags()}, demo data is downloaded and stored in the BPCells data directory (under \code{file.path(tools::R_user_dir("BPcells", which="data"), "demo_data")}). Subsequent calls to this function will use the previously downloaded matrix/fragments, given that the same combination of filtering and subsetting has been performed previously. -The preperation of this matrix can be reproduced by running the internal function \code{prepare_demo_data()} with \code{directory} set to the BPCells data directory. 
+The preparation of this matrix can be reproduced by running the internal function \code{prepare_demo_data()} with \code{directory} set to the BPCells data directory. In the case that demo data is not pre-downloaded and demo data download fails, \code{prepare_demo_data()} will act as a fallback. @@ -82,3 +85,22 @@ Filter + Subset \tab 1.2 \tab 12.3 \item \code{remove_demo_data()}: Remove the demo data from the BPCells data directory. } } +\examples{ +####################################################################### +## get_demo_mat() example +get_demo_mat() +####################################################################### +####################################################################### +## get_demo_frags() example +get_demo_frags() +####################################################################### +####################################################################### +## remove_demo_data() example +remove_demo_data() + + +## Demo data folder is now empty +data_dir <- file.path(tools::R_user_dir("BPCells", which = "data"), "demo_data") +list.files(data_dir) +####################################################################### +} diff --git a/r/man/prepare_demo_data.Rd b/r/man/prepare_demo_data.Rd index c1921924..b6368db8 100644 --- a/r/man/prepare_demo_data.Rd +++ b/r/man/prepare_demo_data.Rd @@ -13,7 +13,9 @@ prepare_demo_data( ) } \arguments{ -\item{directory}{(character) Where the input/output data should be stored. If NULL, a temporary directory is created.} +\item{directory}{(character) The directory where all the input/output data will be stored. +Downloaded intermediates will be stored in subdir \code{intermediates}. 
+If \code{NULL}, a temporary directory is created.} \item{filter_qc}{(bool) Whether to filter both the RNA and ATAC data using QC information.} @@ -21,11 +23,11 @@ prepare_demo_data( \item{timeout}{(numeric) Timeout for downloading files in seconds.} -\item{remove_input_data}{(logical) Whether to remove the downloaded non-procesed matrix, frags, gencode transcripts, and gencode genes -after processing.} +\item{remove_input_data}{(logical) Whether to remove the intermediate non-processed matrix, frags, gencode transcripts, +and gencode genes after processing. If this function errors out, will also remove intermediate data if \code{remove_input_data} is \code{TRUE}.} } \value{ -(list) A list with the RNA matrix under the name "mat", and the ATAC fragments under the name "frags". +(list) A list with the RNA matrix under the name \code{mat}, and the ATAC fragments under the name \code{frags}. } \description{ Downloads a 10x Genomics dataset consisting of 3k cells, then performs optional QC and subsetting. Holds subsetted objects in disk, From a321917e5a5f76e8c661da977e058a2c29c480f1 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Mon, 7 Apr 2025 13:48:03 -0700 Subject: [PATCH 09/15] [r] add in link for specifying dataset for demo data --- r/R/data.R | 7 +++++-- r/man/demo_data.Rd | 3 ++- r/man/prepare_demo_data.Rd | 4 +++- 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/r/R/data.R b/r/R/data.R index d3d8582a..e3bdaa59 100644 --- a/r/R/data.R +++ b/r/R/data.R @@ -8,7 +8,9 @@ #' Create a small demo matrix and fragment object. #' -#' Downloads a 10x Genomics dataset consisting of 3k cells, then performs optional QC and subsetting. Holds subsetted objects in disk, +#' Downloads a +#' [10x Genomics dataset](https://cf.10xgenomics.com/samples/cell-arc/2.0.0/pbmc_granulocyte_sorted_3k/pbmc_granulocyte_sorted_3k_web_summary.html), +#' consisting of 3k cells then performs optional QC and subsetting. 
Holds subsetted objects in disk, #' and returns a list with both the matrix and fragments. #' @param directory (character) The directory where all the input/output data will be stored. #' Downloaded intermediates will be stored in subdir `intermediates`. @@ -105,7 +107,8 @@ prepare_demo_data <- function(directory = NULL, filter_qc = TRUE, subset = TRUE, #' Retrieve BPCells demo data #' -#' `r lifecycle::badge("experimental")` \cr Functions to download matrices and fragments derived from the 10X Genomics PBMC 3k dataset, +#' `r lifecycle::badge("experimental")` \cr Functions to download matrices and fragments derived from a +#' [10X Genomics PBMC 3k dataset](https://cf.10xgenomics.com/samples/cell-arc/2.0.0/pbmc_granulocyte_sorted_3k/pbmc_granulocyte_sorted_3k_web_summary.html), #' with options to filter with common qc metrics, and to subset genes and fragments to only chromosome 4 and 11. #' @rdname demo_data #' @param filter_qc (bool) Whether to filter both the RNA and ATAC data using qc metrics (described in `details`). 
diff --git a/r/man/demo_data.Rd b/r/man/demo_data.Rd index b19795a4..cf581e12 100644 --- a/r/man/demo_data.Rd +++ b/r/man/demo_data.Rd @@ -31,7 +31,8 @@ remove_demo_data() } } \description{ -\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} \cr Functions to download matrices and fragments derived from the 10X Genomics PBMC 3k dataset, +\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} \cr Functions to download matrices and fragments derived from a +\href{https://cf.10xgenomics.com/samples/cell-arc/2.0.0/pbmc_granulocyte_sorted_3k/pbmc_granulocyte_sorted_3k_web_summary.html}{10X Genomics PBMC 3k dataset}, with options to filter with common qc metrics, and to subset genes and fragments to only chromosome 4 and 11. } \details{ diff --git a/r/man/prepare_demo_data.Rd b/r/man/prepare_demo_data.Rd index b6368db8..d66426dc 100644 --- a/r/man/prepare_demo_data.Rd +++ b/r/man/prepare_demo_data.Rd @@ -30,7 +30,9 @@ and gencode genes after processing. If this function errors out, will also remo (list) A list with the RNA matrix under the name \code{mat}, and the ATAC fragments under the name \code{frags}. } \description{ -Downloads a 10x Genomics dataset consisting of 3k cells, then performs optional QC and subsetting. Holds subsetted objects in disk, +Downloads a +\href{https://cf.10xgenomics.com/samples/cell-arc/2.0.0/pbmc_granulocyte_sorted_3k/pbmc_granulocyte_sorted_3k_web_summary.html}{10x Genomics dataset}, +consisting of 3k cells then performs optional QC and subsetting. Holds subsetted objects in disk, and returns a list with both the matrix and fragments. 
} \details{ From b166d101260728cade421fbe9d4469a829a84866 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Mon, 7 Apr 2025 20:18:26 -0700 Subject: [PATCH 10/15] [r] add in better error handling in `prepare_demo_data()`, remove unnecessary parameterization + fix example styling --- r/R/data.R | 52 +++++++++++++++++++----------------- r/man/demo_data.Rd | 14 +++++++--- r/man/prepare_demo_data.Rd | 8 ++---- r/tests/testthat/test-data.R | 1 + 4 files changed, 41 insertions(+), 34 deletions(-) diff --git a/r/R/data.R b/r/R/data.R index e3bdaa59..9486cf86 100644 --- a/r/R/data.R +++ b/r/R/data.R @@ -9,7 +9,7 @@ #' Create a small demo matrix and fragment object. #' #' Downloads a -#' [10x Genomics dataset](https://cf.10xgenomics.com/samples/cell-arc/2.0.0/pbmc_granulocyte_sorted_3k/pbmc_granulocyte_sorted_3k_web_summary.html), +#' [10x Genomics dataset](https://www.10xgenomics.com/datasets/pbmc-from-a-healthy-donor-granulocytes-removed-through-cell-sorting-3-k-1-standard-2-0-0), #' consisting of 3k cells then performs optional QC and subsetting. Holds subsetted objects in disk, #' and returns a list with both the matrix and fragments. #' @param directory (character) The directory where all the input/output data will be stored. @@ -18,8 +18,6 @@ #' @param filter_qc (bool) Whether to filter both the RNA and ATAC data using QC information. #' @param subset (bool) Whether to subset to only genes/insertions on chromosome 4 and 11. #' @param timeout (numeric) Timeout for downloading files in seconds. -#' @param remove_input_data (logical) Whether to remove the intermediate non-processed matrix, frags, gencode transcripts, -#' and gencode genes after processing. If this function errors out, will also remove intermediate data if `remove_input_data` is `TRUE`. #' @return (list) A list with the RNA matrix under the name `mat`, and the ATAC fragments under the name `frags`. #' @details #' This function downloads the 10x Genomics PBMC 3k dataset. 
@@ -28,14 +26,13 @@ #' The name of the matrix and fragments folders are `demo_mat` and `demo_frags` respectively. #' Additionally, choosing to qc filter appends a `_filtered`, and choosing to subset data appends a `_subsetted` to the name. #' @keywords internal -prepare_demo_data <- function(directory = NULL, filter_qc = TRUE, subset = TRUE, timeout = 300, remove_input_data = TRUE) { +prepare_demo_data <- function(directory = NULL, filter_qc = TRUE, subset = TRUE, timeout = 300) { if (is.null(directory)) { directory <- file.path(tempdir()) } intermediate_dir <- file.path(directory, "intermediates") dir.create(intermediate_dir, recursive = TRUE, showWarnings = FALSE) - # Delete all intermediates during exit if remove_input_data is TRUE. - on.exit(if (remove_input_data) unlink(intermediate_dir, recursive = TRUE)) + on.exit(unlink(intermediate_dir, recursive = TRUE)) mat_name <- "demo_mat" frags_name <- "demo_frags" @@ -49,23 +46,24 @@ prepare_demo_data <- function(directory = NULL, filter_qc = TRUE, subset = TRUE, } # Download matrix/frags if not done previously, and open url_base <- "https://cf.10xgenomics.com/samples/cell-arc/2.0.0/pbmc_granulocyte_sorted_3k/" - if (!file.exists(directory, "pbmc_3k_rna_raw")) { + # Recreate mat if mat is malformed + tryCatch({ + mat <- open_matrix_dir(file.path(directory, "pbmc_3k_rna_raw")) + }, error = function(e) { rna_raw_url <- paste0(url_base, "pbmc_granulocyte_sorted_3k_raw_feature_bc_matrix.h5") ensure_downloaded(file.path(intermediate_dir, "pbmc_3k_10x.h5"), rna_raw_url, timeout = timeout) - mat <- open_matrix_10x_hdf5(file.path(intermediate_dir, "pbmc_3k_10x.h5"), feature_type="Gene Expression") %>% - write_matrix_dir(file.path(directory, "pbmc_3k_rna_raw")) - } else { - mat <- open_matrix_dir(file.path(directory, "pbmc_3k_rna_raw")) - } - # Check if we already ran import - if (!file.exists(file.path(directory, "pbmc_3k_frags"))) { + mat <<- open_matrix_10x_hdf5(file.path(intermediate_dir, "pbmc_3k_10x.h5"), 
feature_type="Gene Expression") %>% + write_matrix_dir(file.path(directory, "pbmc_3k_rna_raw"), overwrite = TRUE) + }) + # Recreate frags if frags are malformed + tryCatch({ + frags <- open_fragments_dir(file.path(directory, "pbmc_3k_frags")) + }, error = function(e) { atac_raw_url <- paste0(url_base, "pbmc_granulocyte_sorted_3k_atac_fragments.tsv.gz") ensure_downloaded(file.path(intermediate_dir, "pbmc_3k_10x.fragments.tsv.gz"), atac_raw_url, timeout = timeout) - frags <- open_fragments_10x(file.path(intermediate_dir, "pbmc_3k_10x.fragments.tsv.gz")) %>% - write_fragments_dir(file.path(directory, "pbmc_3k_frags")) - } else { - frags <- open_fragments_dir(file.path(directory, "pbmc_3k_frags")) - } + frags <<- open_fragments_10x(file.path(intermediate_dir, "pbmc_3k_10x.fragments.tsv.gz")) %>% + write_fragments_dir(file.path(directory, "pbmc_3k_frags"), overwrite = TRUE) + }) if (filter_qc) { # Download annotations for transcripts transcripts <- read_gencode_transcripts( @@ -99,7 +97,7 @@ prepare_demo_data <- function(directory = NULL, filter_qc = TRUE, subset = TRUE, filtered_genes <- gsub("\\..*", "", filtered_genes) mat <- mat[which(rownames(mat) %in% filtered_genes), ] frags <- frags %>% select_chromosomes(c("chr4", "chr11")) - } + } mat <- write_matrix_dir(mat, file.path(directory, mat_name), overwrite = TRUE) frags <- write_fragments_dir(frags, file.path(directory, frags_name), overwrite = TRUE) return(list(mat = mat, frags = frags)) @@ -108,7 +106,7 @@ prepare_demo_data <- function(directory = NULL, filter_qc = TRUE, subset = TRUE, #' Retrieve BPCells demo data #' #' `r lifecycle::badge("experimental")` \cr Functions to download matrices and fragments derived from a -#' [10X Genomics PBMC 3k dataset](https://cf.10xgenomics.com/samples/cell-arc/2.0.0/pbmc_granulocyte_sorted_3k/pbmc_granulocyte_sorted_3k_web_summary.html), +#' [10X Genomics PBMC 3k 
dataset](https://www.10xgenomics.com/datasets/pbmc-from-a-healthy-donor-granulocytes-removed-through-cell-sorting-3-k-1-standard-2-0-0), #' with options to filter with common qc metrics, and to subset genes and fragments to only chromosome 4 and 11. #' @rdname demo_data #' @param filter_qc (bool) Whether to filter both the RNA and ATAC data using qc metrics (described in `details`). @@ -159,8 +157,10 @@ prepare_demo_data <- function(directory = NULL, filter_qc = TRUE, subset = TRUE, #' @examples #' ####################################################################### #' ## get_demo_mat() example -#' get_demo_mat() #' ####################################################################### +#' get_demo_mat() +#' +#' #' @export get_demo_mat <- function(filter_qc = TRUE, subset = TRUE) { # Use the data directory for BPCells @@ -193,8 +193,10 @@ get_demo_mat <- function(filter_qc = TRUE, subset = TRUE) { #' @examples #' ####################################################################### #' ## get_demo_frags() example -#' get_demo_frags() #' ####################################################################### +#' get_demo_frags() +#' +#' #' @export get_demo_frags <- function(filter_qc = TRUE, subset = TRUE) { data_dir <- file.path(tools::R_user_dir("BPCells", which = "data"), "demo_data") @@ -225,13 +227,15 @@ get_demo_frags <- function(filter_qc = TRUE, subset = TRUE) { #' @examples #' ####################################################################### #' ## remove_demo_data() example +#' ####################################################################### #' remove_demo_data() #' #' #' ## Demo data folder is now empty #' data_dir <- file.path(tools::R_user_dir("BPCells", which = "data"), "demo_data") #' list.files(data_dir) -#' ####################################################################### +#' +#' #' @export remove_demo_data <- function() { data_dir <- file.path(tools::R_user_dir("BPCells", which = "data"), "demo_data") diff --git 
a/r/man/demo_data.Rd b/r/man/demo_data.Rd index cf581e12..e503aff3 100644 --- a/r/man/demo_data.Rd +++ b/r/man/demo_data.Rd @@ -32,7 +32,7 @@ remove_demo_data() } \description{ \ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} \cr Functions to download matrices and fragments derived from a -\href{https://cf.10xgenomics.com/samples/cell-arc/2.0.0/pbmc_granulocyte_sorted_3k/pbmc_granulocyte_sorted_3k_web_summary.html}{10X Genomics PBMC 3k dataset}, +\href{https://www.10xgenomics.com/datasets/pbmc-from-a-healthy-donor-granulocytes-removed-through-cell-sorting-3-k-1-standard-2-0-0}{10X Genomics PBMC 3k dataset}, with options to filter with common qc metrics, and to subset genes and fragments to only chromosome 4 and 11. } \details{ @@ -89,19 +89,25 @@ Filter + Subset \tab 1.2 \tab 12.3 \examples{ ####################################################################### ## get_demo_mat() example -get_demo_mat() ####################################################################### +get_demo_mat() + + ####################################################################### ## get_demo_frags() example -get_demo_frags() ####################################################################### +get_demo_frags() + + ####################################################################### ## remove_demo_data() example +####################################################################### remove_demo_data() ## Demo data folder is now empty data_dir <- file.path(tools::R_user_dir("BPCells", which = "data"), "demo_data") list.files(data_dir) -####################################################################### + + } diff --git a/r/man/prepare_demo_data.Rd b/r/man/prepare_demo_data.Rd index d66426dc..ea019c82 100644 --- a/r/man/prepare_demo_data.Rd +++ b/r/man/prepare_demo_data.Rd @@ -8,8 +8,7 @@ prepare_demo_data( directory = NULL, filter_qc = TRUE, 
subset = TRUE, - timeout = 300, - remove_input_data = TRUE + timeout = 300 ) } \arguments{ @@ -22,16 +21,13 @@ If \code{NULL}, a temporary directory is created.} \item{subset}{(bool) Whether to subset to only genes/insertions on chromosome 4 and 11.} \item{timeout}{(numeric) Timeout for downloading files in seconds.} - -\item{remove_input_data}{(logical) Whether to remove the intermediate non-processed matrix, frags, gencode transcripts, -and gencode genes after processing. If this function errors out, will also remove intermediate data if \code{remove_input_data} is \code{TRUE}.} } \value{ (list) A list with the RNA matrix under the name \code{mat}, and the ATAC fragments under the name \code{frags}. } \description{ Downloads a -\href{https://cf.10xgenomics.com/samples/cell-arc/2.0.0/pbmc_granulocyte_sorted_3k/pbmc_granulocyte_sorted_3k_web_summary.html}{10x Genomics dataset}, +\href{https://www.10xgenomics.com/datasets/pbmc-from-a-healthy-donor-granulocytes-removed-through-cell-sorting-3-k-1-standard-2-0-0}{10x Genomics dataset}, consisting of 3k cells then performs optional QC and subsetting. Holds subsetted objects in disk, and returns a list with both the matrix and fragments. 
} diff --git a/r/tests/testthat/test-data.R b/r/tests/testthat/test-data.R index 4583a0c9..7afebac8 100644 --- a/r/tests/testthat/test-data.R +++ b/r/tests/testthat/test-data.R @@ -8,6 +8,7 @@ test_that("Getting test data works", { + expect_no_error(BPCells:::prepare_demo_data(file.path(tools::R_user_dir("BPCells", which = "data"), "demo_data"))) mat <- get_demo_mat() frags <- get_demo_frags() expect_true(is(mat, "IterableMatrix")) From fc5cb7ef9681499701290e4743f8310d227d2fec Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Wed, 9 Apr 2025 14:47:12 -0700 Subject: [PATCH 11/15] [r] update `demo_data.Rd` styling --- r/R/data.R | 4 +++- r/man/demo_data.Rd | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/r/R/data.R b/r/R/data.R index 9486cf86..b80b56d3 100644 --- a/r/R/data.R +++ b/r/R/data.R @@ -119,10 +119,12 @@ prepare_demo_data <- function(directory = NULL, filter_qc = TRUE, subset = TRUE, #' #' **Data Processing**: #' -#' The first time either `get_demo_mat()` are ran `get_demo_frags()`, +#' The first time either `get_demo_mat()` or `get_demo_frags()` is run, #' demo data is downloaded and stored in the BPCells data directory (under `file.path(tools::R_user_dir("BPcells", which="data"), "demo_data")`). +#' #' Subsequent calls to this function will use the previously downloaded matrix/fragments, given that the same combination of filtering and #' subsetting has been performed previously. +#' #' The preparation of this matrix can be reproduced by running the internal function `prepare_demo_data()` with `directory` set to the BPCells data directory. 
#' In the case that demo data is not pre-downloaded and demo data download fails, `prepare_demo_data()` will act diff --git a/r/man/demo_data.Rd b/r/man/demo_data.Rd index e503aff3..8dc1e870 100644 --- a/r/man/demo_data.Rd +++ b/r/man/demo_data.Rd @@ -41,10 +41,12 @@ The interface, as well as the demo dataset itself will likely undergo changes in \strong{Data Processing}: -The first time either \code{get_demo_mat()} are ran \code{get_demo_frags()}, +The first time either \code{get_demo_mat()} or \code{get_demo_frags()} is run, demo data is downloaded and stored in the BPCells data directory (under \code{file.path(tools::R_user_dir("BPcells", which="data"), "demo_data")}). + Subsequent calls to this function will use the previously downloaded matrix/fragments, given that the same combination of filtering and subsetting has been performed previously. + The preparation of this matrix can be reproduced by running the internal function \code{prepare_demo_data()} with \code{directory} set to the BPCells data directory. 
In the case that demo data is not pre-downloaded and demo data download fails, \code{prepare_demo_data()} will act From b37279e4d4a079b84121c579e2187cdb06cfd249 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Thu, 10 Apr 2025 10:17:20 -0700 Subject: [PATCH 12/15] [r] add fix to `tryCatch` block in `prepare_demo_data()` to not overwrite global `mat` --- r/R/data.R | 1 + 1 file changed, 1 insertion(+) diff --git a/r/R/data.R b/r/R/data.R index b80b56d3..eaec39ea 100644 --- a/r/R/data.R +++ b/r/R/data.R @@ -47,6 +47,7 @@ prepare_demo_data <- function(directory = NULL, filter_qc = TRUE, subset = TRUE, # Download matrix/frags if not done previously, and open url_base <- "https://cf.10xgenomics.com/samples/cell-arc/2.0.0/pbmc_granulocyte_sorted_3k/" # Recreate mat if mat is malformed + mat <- NULL tryCatch({ mat <- open_matrix_dir(file.path(directory, "pbmc_3k_rna_raw")) }, error = function(e) { From f959e0c73260da42af0a0865317cd6ce896aa4e6 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Thu, 10 Apr 2025 10:44:48 -0700 Subject: [PATCH 13/15] [r] refactor `prepare_demo_data()`, remove usage of intermediate raw rna/frags file --- r/R/data.R | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/r/R/data.R b/r/R/data.R index eaec39ea..b43a3abb 100644 --- a/r/R/data.R +++ b/r/R/data.R @@ -36,34 +36,27 @@ prepare_demo_data <- function(directory = NULL, filter_qc = TRUE, subset = TRUE, mat_name <- "demo_mat" frags_name <- "demo_frags" - if (filter_qc) { - mat_name <- paste0(mat_name, "_filtered") - frags_name <- paste0(frags_name, "_filtered") - } - if (subset) { - mat_name <- paste0(mat_name, "_subsetted") - frags_name <- paste0(frags_name, "_subsetted") - } # Download matrix/frags if not done previously, and open url_base <- "https://cf.10xgenomics.com/samples/cell-arc/2.0.0/pbmc_granulocyte_sorted_3k/" # Recreate mat if mat is malformed mat <- NULL + frags <- NULL tryCatch({ - mat <- open_matrix_dir(file.path(directory, 
"pbmc_3k_rna_raw")) + mat <- open_matrix_dir(file.path(directory, mat_name)) }, error = function(e) { rna_raw_url <- paste0(url_base, "pbmc_granulocyte_sorted_3k_raw_feature_bc_matrix.h5") ensure_downloaded(file.path(intermediate_dir, "pbmc_3k_10x.h5"), rna_raw_url, timeout = timeout) mat <<- open_matrix_10x_hdf5(file.path(intermediate_dir, "pbmc_3k_10x.h5"), feature_type="Gene Expression") %>% - write_matrix_dir(file.path(directory, "pbmc_3k_rna_raw"), overwrite = TRUE) + write_matrix_dir(file.path(directory, mat_name), overwrite = TRUE) }) # Recreate frags if frags are malformed tryCatch({ - frags <- open_fragments_dir(file.path(directory, "pbmc_3k_frags")) + frags <- open_fragments_dir(file.path(directory, frags_name)) }, error = function(e) { atac_raw_url <- paste0(url_base, "pbmc_granulocyte_sorted_3k_atac_fragments.tsv.gz") ensure_downloaded(file.path(intermediate_dir, "pbmc_3k_10x.fragments.tsv.gz"), atac_raw_url, timeout = timeout) frags <<- open_fragments_10x(file.path(intermediate_dir, "pbmc_3k_10x.fragments.tsv.gz")) %>% - write_fragments_dir(file.path(directory, "pbmc_3k_frags"), overwrite = TRUE) + write_fragments_dir(file.path(directory, frags_name), overwrite = TRUE) }) if (filter_qc) { # Download annotations for transcripts @@ -99,8 +92,20 @@ prepare_demo_data <- function(directory = NULL, filter_qc = TRUE, subset = TRUE, mat <- mat[which(rownames(mat) %in% filtered_genes), ] frags <- frags %>% select_chromosomes(c("chr4", "chr11")) } - mat <- write_matrix_dir(mat, file.path(directory, mat_name), overwrite = TRUE) - frags <- write_fragments_dir(frags, file.path(directory, frags_name), overwrite = TRUE) + # Rename mat and frags depending on state of filtering and subsetting + if (filter_qc) { + mat_name <- paste0(mat_name, "_filtered") + frags_name <- paste0(frags_name, "_filtered") + } + if (subset) { + mat_name <- paste0(mat_name, "_subsetted") + frags_name <- paste0(frags_name, "_subsetted") + } + # Write changes to directory + if (filter_qc || 
subset) { + mat <- write_matrix_dir(mat, file.path(directory, mat_name), overwrite = TRUE) + frags <- write_fragments_dir(frags, file.path(directory, frags_name), overwrite = TRUE) + } return(list(mat = mat, frags = frags)) } From a24160481ab60871650a47010013c6ef151b95da Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Thu, 10 Apr 2025 12:06:13 -0700 Subject: [PATCH 14/15] [r] skip test for preparing demo data --- r/tests/testthat/test-data.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/r/tests/testthat/test-data.R b/r/tests/testthat/test-data.R index 7afebac8..4ee151b1 100644 --- a/r/tests/testthat/test-data.R +++ b/r/tests/testthat/test-data.R @@ -8,10 +8,11 @@ test_that("Getting test data works", { - expect_no_error(BPCells:::prepare_demo_data(file.path(tools::R_user_dir("BPCells", which = "data"), "demo_data"))) mat <- get_demo_mat() frags <- get_demo_frags() expect_true(is(mat, "IterableMatrix")) expect_true(is(frags, "IterableFragments")) remove_demo_data() + skip("Skip preparing demo data") + expect_no_error(BPCells:::prepare_demo_data(file.path(tools::R_user_dir("BPCells", which = "data"), "demo_data"))) }) \ No newline at end of file From 5a4e25e27e3ddfe6e030878e30f28f3e5e1ee1cc Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Thu, 10 Apr 2025 12:14:35 -0700 Subject: [PATCH 15/15] [r] update NEWS.md --- r/NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/NEWS.md b/r/NEWS.md index abfceaa4..a3531348 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -10,7 +10,7 @@ Contributions welcome :) ## Features - Add `write_matrix_anndata_hdf5_dense()` which allows writing matrices in AnnData's dense format, most commonly used for `obsm` or `varm` matrices. (Thanks to @ycli1995 for pull request #166) -- Add `get_demo_mat()`, `get_demo_frags()` and `remove_demo_data()` to retrieve a small test matrix subsetted from the PBMC 3k dataset from 10X Genomics. 
(pull request #193) +- Add `get_demo_mat()`, `get_demo_frags()` and `remove_demo_data()` to retrieve a small test matrix/fragments object from the [PBMC 3k dataset from 10X Genomics](https://www.10xgenomics.com/datasets/pbmc-from-a-healthy-donor-granulocytes-removed-through-cell-sorting-3-k-1-standard-2-0-0). (pull request #193) ## Bug-fixes - Fix error message printing when MACS crashes during `call_peaks_macs()` (pull request #175)