From 9824d304aab5135454523c5687cfb445463d2b0c Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Tue, 28 Jan 2025 13:15:23 -0800 Subject: [PATCH 01/15] [r] add test data --- r/R/utils.R | 72 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) diff --git a/r/R/utils.R b/r/R/utils.R index 4ea62d15..bb569b3c 100644 --- a/r/R/utils.R +++ b/r/R/utils.R @@ -56,4 +56,76 @@ log_progress <- function(msg, add_timestamp = TRUE){ } else { message(msg) } +} + +#' Prepare a test matrix and test fragments for BPCells. +#' +#' @param directory (character) Where the input/output data should be stored. If NULL, a temporary directory is created. +#' @param mat_name (character) Name of the RNA matrix file. If NULL, the matrix is named "test_mat." +#' @param frags_name (character) Name of the ATAC fragments file. If NULL, the fragments are named "test_frags". +#' @return (list) A list with the RNA matrix under the name "mat", and the ATAC fragments under the name "frags". +#' @details +#' This function downloads the 10x Genomics PBMC 3k dataset, and filters the fragments and matrix to cells with at least 1000 reads. +#' Following, both fragments and the matrix is subset to only genes and insertions on chromosomes 4 and 11. +#' The RNA matrix is 1 MB and the fragments are 12.5 MB, after BPCells compression. +#' @keywords internal +prepare_test_data <- function(directory = NULL, mat_name = NULL, frags_name = NULL) { + if (is.null(directory)) { + directory <- file.path(tempdir()) + dir.create(directory, recursive = TRUE, showWarnings = FALSE) + } + if (is.null(mat_name)) { + mat_name <- "test_rna" + } + if (is.null(frags_name)) { + frags_name <- "test_frags" + } + url_base <- "https://cf.10xgenomics.com/samples/cell-arc/2.0.0/pbmc_granulocyte_sorted_3k/" + rna_raw_url <- paste0(url_base, "pbmc_granulocyte_sorted_3k_raw_feature_bc_matrix.h5") + atac_raw_url <- paste0(url_base, "pbmc_granulocyte_sorted_3k_atac_fragments.tsv.gz") + options(timeout=300) + if (!file.exists(file.path(directory,"pbmc_3k_10x.h5"))) { + download.file(rna_raw_url, file.path(directory, "pbmc_3k_10x.h5"), mode="wb") + } + if (!file.exists(file.path(directory,"pbmc_3k_10x.fragments.tsv.gz"))) { + download.file(atac_raw_url, file.path(directory, "pbmc_3k_10x.fragments.tsv.gz"), mode="wb") + } + if (!file.exists(file.path(directory,"pbmc_3k_rna_raw"))) { + mat_raw <- open_matrix_10x_hdf5(file.path(directory, "pbmc_3k_10x.h5"), feature_type="Gene Expression") %>% + write_matrix_dir(file.path(directory,"pbmc_3k_rna_raw")) + } else { + mat_raw <- open_matrix_dir(file.path(directory,"pbmc_3k_rna_raw")) + } + # Check if we already ran import + if (!file.exists(file.path(directory,"pbmc_3k_frags"))) { + frags_raw <- open_fragments_10x(file.path(directory,"pbmc_3k_10x.fragments.tsv.gz")) %>% + write_fragments_dir(file.path(directory,"pbmc_3k_frags")) + } else { + frags_raw <- open_fragments_dir(file.path(directory,"pbmc_3k_frags")) + } + # for atac transcripts + transcripts <- read_gencode_transcripts( + file.path(directory,"references_transcripts"), + release="42", + transcript_choice="MANE_Select", + annotation_set = "basic", + features="transcript" + ) + # for RNA genes + genes_test <- read_gencode_genes( + file.path(directory,"./reference_genes"), + release = "42", + annotation_set = "basic", + ) + # Filter to only cells that have at least 1000 reads on the RNA side + # and only genes/fragments that exist on chr 4 and 11 + filtered_cells <- colnames(mat_raw)[reads_per_cell >= 1e3] + filtered_genes <- genes[genes$chr %in% c("chr4", "chr11"),]$gene_id + # remove version numbers + filtered_genes <- gsub("\\..*", "", filtered_genes) + mat <- mat_raw[which(rownames(mat_raw) %in% filtered_genes), pass_rna] + frags <- select_cells(frags_raw, pass_rna) %>% select_chromosomes(c("chr4", "chr11")) + mat <- write_matrix_dir(mat, file.path(directory, mat_name)) + frags <- write_fragments_dir(frags, file.path(directory, frags_name)) + return(list(mat = mat, frags = frags)) } \ No newline at end of file From f685ad882f2d4343e23a964606b3bda75734b6fb Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Mon, 10 Feb 2025 14:21:15 -0800 Subject: [PATCH 02/15] [r] update test data functions based on feedback --- r/DESCRIPTION | 3 +- r/NAMESPACE | 3 + r/NEWS.md | 1 + r/R/data.R | 156 +++++++++++++++++++++++++++++++++++ r/R/utils.R | 72 ---------------- r/man/demo_data.Rd | 53 ++++++++++++ r/man/prepare_demo_data.Rd | 38 +++++++++ r/pkgdown/_pkgdown.yml | 4 + r/tests/testthat/test-data.R | 16 ++++ 9 files changed, 273 insertions(+), 73 deletions(-) create mode 100644 r/man/demo_data.Rd create mode 100644 r/man/prepare_demo_data.Rd create mode 100644 r/tests/testthat/test-data.R diff --git a/r/DESCRIPTION b/r/DESCRIPTION index 17faadb9..54090e64 100644 --- a/r/DESCRIPTION +++ b/r/DESCRIPTION @@ -30,6 +30,7 @@ Imports: Matrix, Rcpp, rlang, + tools, vctrs, lifecycle, stringr, @@ -50,5 +51,5 @@ Suggests: matrixStats, igraph Depends: - R (>= 3.5.0) + R (>= 4.0.0) Config/Needs/website: pkgdown, devtools, uwot, irlba, RcppHNSW, igraph, BiocManager, bioc::BSgenome.Hsapiens.UCSC.hg38, github::GreenleafLab/motifmatchr, github::GreenleafLab/chromVARmotifs diff --git a/r/NAMESPACE b/r/NAMESPACE index 982ca440..527f06ca 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -47,6 +47,8 @@ export(gene_region) export(gene_score_archr) export(gene_score_tiles_archr) export(gene_score_weights_archr) +export(get_demo_frags) +export(get_demo_mat) export(get_trackplot_height) export(import_matrix_market) export(import_matrix_market_10x) @@ -93,6 +95,7 @@ export(read_gencode_transcripts) export(read_gtf) export(read_ucsc_chrom_sizes) export(regress_out) +export(remove_demo_data) export(rotate_x_labels) export(rowMaxs) export(rowMaxs.IterableMatrix) diff --git a/r/NEWS.md b/r/NEWS.md index 6cab91a5..abfceaa4 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -10,6 +10,7 @@ Contributions welcome :) ## Features - Add `write_matrix_anndata_hdf5_dense()` which allows writing matrices in AnnData's dense format, most commonly used for `obsm` or `varm` matrices. (Thanks to @ycli1995 for pull request #166) +- Add `get_demo_mat()`, `get_demo_frags()` and `remove_demo_data()` to retrieve a small test matrix subsetted from the PBMC 3k dataset from 10X Genomics. (pull request #193) ## Bug-fixes - Fix error message printing when MACS crashes during `call_peaks_macs()` (pull request #175) diff --git a/r/R/data.R b/r/R/data.R index 9f66faa5..7aa9bdef 100644 --- a/r/R/data.R +++ b/r/R/data.R @@ -6,6 +6,160 @@ # option. This file may not be copied, modified, or distributed # except according to those terms. +#' Prepare a demo matrix and demo fragments for BPCells. +#' +#' @param directory (character) Where the input/output data should be stored. If NULL, a temporary directory is created. +#' @param mat_name (character) Name of the RNA matrix file. If NULL, the matrix is named "demo_mat." +#' @param frags_name (character) Name of the ATAC fragments file. If NULL, the fragments are named "demo_frags". +#' @param timeout (numeric) Timeout for downloading files in seconds. +#' @param remove_input_data (logical) Whether to remove the downloaded non-procesed matrix, frags, gencode transcripts, and gencode genes +#' after processing. +#' @return (list) A list with the RNA matrix under the name "mat", and the ATAC fragments under the name "frags". +#' @details +#' This function downloads the 10x Genomics PBMC 3k dataset, and filters the fragments and matrix to cells with at least 1000 reads. +#' Following, both fragments and the matrix is subset to only genes and insertions on chromosomes 4 and 11. +#' The RNA matrix is 1 MB and the fragments are 12.5 MB, after BPCells compression. +#' @keywords internal +prepare_demo_data <- function(directory = NULL, mat_name = NULL, frags_name = NULL, timeout = 300, remove_input_data = TRUE) { + if (is.null(directory)) { + directory <- file.path(tempdir()) + dir.create(directory, recursive = TRUE, showWarnings = FALSE) + } + if (is.null(mat_name)) { + mat_name <- "demo_mat" + } + if (is.null(frags_name)) { + frags_name <- "demo_frags" + } + url_base <- "https://cf.10xgenomics.com/samples/cell-arc/2.0.0/pbmc_granulocyte_sorted_3k/" + rna_raw_url <- paste0(url_base, "pbmc_granulocyte_sorted_3k_raw_feature_bc_matrix.h5") + atac_raw_url <- paste0(url_base, "pbmc_granulocyte_sorted_3k_atac_fragments.tsv.gz") + ensure_downloaded(file.path(directory, "pbmc_3k_10x.h5"), rna_raw_url, timeout = timeout) + ensure_downloaded(file.path(directory, "pbmc_3k_10x.fragments.tsv.gz"), atac_raw_url, timeout = timeout) + if (!file.exists(file.path(directory,"pbmc_3k_rna_raw"))) { + mat_raw <- open_matrix_10x_hdf5(file.path(directory, "pbmc_3k_10x.h5"), feature_type="Gene Expression") %>% + write_matrix_dir(file.path(directory, "pbmc_3k_rna_raw")) + } else { + mat_raw <- open_matrix_dir(file.path(directory, "pbmc_3k_rna_raw")) + } + # Check if we already ran import + if (!file.exists(file.path(directory, "pbmc_3k_frags"))) { + frags_raw <- open_fragments_10x(file.path(directory, "pbmc_3k_10x.fragments.tsv.gz")) %>% + write_fragments_dir(file.path(directory, "pbmc_3k_frags")) + } else { + frags_raw <- open_fragments_dir(file.path(directory, "pbmc_3k_frags")) + } + # for atac transcripts + transcripts <- read_gencode_transcripts( + file.path(directory, "references_transcripts"), + release = "42", + transcript_choice = "MANE_Select", + annotation_set = "basic", + features = "transcript" + ) + # for RNA genes + genes_demo <- read_gencode_genes( + file.path(directory, "./reference_genes"), + release = "42", + annotation_set = "basic", + ) + # Filter to only cells that have at least 1000 reads on the RNA side + # and only genes/fragments that exist on chr 4 and 11 + reads_per_cell <- colSums(mat_raw) + filtered_cells <- colnames(mat_raw)[reads_per_cell >= 1e3] + filtered_genes <- genes_demo[genes_demo$chr %in% c("chr4", "chr11"),]$gene_id + # remove version numbers + filtered_genes <- gsub("\\..*", "", filtered_genes) + mat <- mat_raw[which(rownames(mat_raw) %in% filtered_genes), filtered_cells] + frags <- select_cells(frags_raw, filtered_cells) %>% select_chromosomes(c("chr4", "chr11")) + mat <- write_matrix_dir(mat, file.path(directory, mat_name), overwrite = TRUE) + frags <- write_fragments_dir(frags, file.path(directory, frags_name), overwrite = TRUE) + if (remove_input_data) { + unlink(file.path(directory, "pbmc_3k_10x.h5")) + unlink(file.path(directory, "pbmc_3k_10x.fragments.tsv.gz")) + unlink(file.path(directory, "pbmc_3k_rna_raw")) + unlink(file.path(directory, "pbmc_3k_frags")) + } + return(list(mat = mat, frags = frags)) +} + +#' Retrieve BPCells demo data +#' +#' The demo dataset is a subset of the 10x Genomics PBMC 3k dataset, and filters both the matrix and the fragments +#' to cells with at least 1000 reads. Both the matrix and the fragments are subset to only genes on chromosomes 4 and 11. +#' @rdname demo_data +#' @return +#' - `get_demo_mat()`: (IterableMatrix) A `(features x cells)` matrix of shape `(1984 x 2724)`. +#' @details +#' The first time either `get_demo_mat()` are ran `get_demo_frags()`, the demo data is downloaded and stored in the BPCells data directory +#' (under `file.path(tools::R_user_dir("BPcells", which="data"), "demo_data")`). Subsequent calls to this function will use the previously downloaded matrix. +#' The preperation of this matrix can be reproduced by running the internal function `prepare_demo_data()`. +#' +#' In the case that demo data is not pre-downloaded and demo data download fails, `prepare_demo_data()` will be run, +#' which manually builds the demo dataset from the 10x Genomics PBMC 3k dataset. +#' +#' Both the matrix from `get_demo_mat()` and the fragments from `get_demo_frags()` +#' may be removed by running `remove_demo_data()`. +#' +#' - `get_demo_mat()`: Retrieve a 1 MB demo matrix, representing a subset of the 10X Genomics PBMC 3k dataset. +#' @export +get_demo_mat <- function() { + # Use the data directory for BPCells + data_dir <- file.path(tools::R_user_dir("BPCells", which = "data"), "demo_data") + if (!dir.exists(data_dir)) { + dir.create(data_dir, recursive = TRUE) + } + if (!dir.exists(file.path(data_dir, "demo_mat"))) { + url <- "https://pub-c4e56988ff67429e9856ffa33aecb0c1.r2.dev/demo_mat.tar.gz" + download.file(url, file.path(data_dir, "demo_mat.tar.gz")) + # Check if file download failed + if (!file.exists(file.path(data_dir, "demo_mat.tar.gz"))) { + prepare_demo_data(data_dir) + } else { + untar(file.path(data_dir, "demo_mat.tar.gz"), exdir=data_dir) + file.remove(file.path(data_dir, "demo_mat.tar.gz")) + } + } + return(open_matrix_dir(file.path(data_dir, "demo_mat"))) +} + +#' @rdname demo_data +#' @return +#' - `get_demo_frags()`: (IterableFragments) A Fragments object with 2724 cells and fragments on chromosomes 4 and 11. +#' @details +#' - `get_demo_frags()`: Retrieve a 12.5 MB demo fragments object, representing a subset of the 10X Genomics PBMC 3k dataset. +#' @export +get_demo_frags <- function() { + data_dir <- file.path(tools::R_user_dir("BPCells", which = "data"), "demo_data") + if (!dir.exists(data_dir)) { + dir.create(data_dir, recursive = TRUE) + } + if (!dir.exists(file.path(data_dir, "demo_frags"))) { + url <- "https://pub-c4e56988ff67429e9856ffa33aecb0c1.r2.dev/demo_frags.tar.gz" + download.file(url, file.path(data_dir, "demo_frags.tar.gz")) + if (!file.exists(file.path(data_dir, "demo_frags.tar.gz"))) { + prepare_demo_data(data_dir) + } else { + untar(file.path(data_dir, "demo_frags.tar.gz"), exdir = data_dir) + file.remove(file.path(data_dir, "demo_frags.tar.gz")) + } + } + return(open_fragments_dir(file.path(data_dir, "demo_frags"))) +} + +#' @rdname demo_data +#' @return +#' - `remove_demo_data()`: NULL +#' @details +#' - `remove_demo_data()`: Remove the demo data from the BPCells data directory. +#' @export +remove_demo_data <- function() { + data_dir <- file.path(tools::R_user_dir("BPCells", which = "data"), "demo_data") + if (dir.exists(data_dir)) { + unlink(data_dir, recursive = TRUE) + } +} + #' Gene Symbol Mapping data #' #' Mapping of the canonical gene symbols corresponding to each @@ -36,3 +190,5 @@ #' #' "mouse_gene_mapping" + + diff --git a/r/R/utils.R b/r/R/utils.R index bb569b3c..4ea62d15 100644 --- a/r/R/utils.R +++ b/r/R/utils.R @@ -56,76 +56,4 @@ log_progress <- function(msg, add_timestamp = TRUE){ } else { message(msg) } -} - -#' Prepare a test matrix and test fragments for BPCells. -#' -#' @param directory (character) Where the input/output data should be stored. If NULL, a temporary directory is created. -#' @param mat_name (character) Name of the RNA matrix file. If NULL, the matrix is named "test_mat." -#' @param frags_name (character) Name of the ATAC fragments file. If NULL, the fragments are named "test_frags". -#' @return (list) A list with the RNA matrix under the name "mat", and the ATAC fragments under the name "frags". -#' @details -#' This function downloads the 10x Genomics PBMC 3k dataset, and filters the fragments and matrix to cells with at least 1000 reads. -#' Following, both fragments and the matrix is subset to only genes and insertions on chromosomes 4 and 11. -#' The RNA matrix is 1 MB and the fragments are 12.5 MB, after BPCells compression. -#' @keywords internal -prepare_test_data <- function(directory = NULL, mat_name = NULL, frags_name = NULL) { - if (is.null(directory)) { - directory <- file.path(tempdir()) - dir.create(directory, recursive = TRUE, showWarnings = FALSE) - } - if (is.null(mat_name)) { - mat_name <- "test_rna" - } - if (is.null(frags_name)) { - frags_name <- "test_frags" - } - url_base <- "https://cf.10xgenomics.com/samples/cell-arc/2.0.0/pbmc_granulocyte_sorted_3k/" - rna_raw_url <- paste0(url_base, "pbmc_granulocyte_sorted_3k_raw_feature_bc_matrix.h5") - atac_raw_url <- paste0(url_base, "pbmc_granulocyte_sorted_3k_atac_fragments.tsv.gz") - options(timeout=300) - if (!file.exists(file.path(directory,"pbmc_3k_10x.h5"))) { - download.file(rna_raw_url, file.path(directory, "pbmc_3k_10x.h5"), mode="wb") - } - if (!file.exists(file.path(directory,"pbmc_3k_10x.fragments.tsv.gz"))) { - download.file(atac_raw_url, file.path(directory, "pbmc_3k_10x.fragments.tsv.gz"), mode="wb") - } - if (!file.exists(file.path(directory,"pbmc_3k_rna_raw"))) { - mat_raw <- open_matrix_10x_hdf5(file.path(directory, "pbmc_3k_10x.h5"), feature_type="Gene Expression") %>% - write_matrix_dir(file.path(directory,"pbmc_3k_rna_raw")) - } else { - mat_raw <- open_matrix_dir(file.path(directory,"pbmc_3k_rna_raw")) - } - # Check if we already ran import - if (!file.exists(file.path(directory,"pbmc_3k_frags"))) { - frags_raw <- open_fragments_10x(file.path(directory,"pbmc_3k_10x.fragments.tsv.gz")) %>% - write_fragments_dir(file.path(directory,"pbmc_3k_frags")) - } else { - frags_raw <- open_fragments_dir(file.path(directory,"pbmc_3k_frags")) - } - # for atac transcripts - transcripts <- read_gencode_transcripts( - file.path(directory,"references_transcripts"), - release="42", - transcript_choice="MANE_Select", - annotation_set = "basic", - features="transcript" - ) - # for RNA genes - genes_test <- read_gencode_genes( - file.path(directory,"./reference_genes"), - release = "42", - annotation_set = "basic", - ) - # Filter to only cells that have at least 1000 reads on the RNA side - # and only genes/fragments that exist on chr 4 and 11 - filtered_cells <- colnames(mat_raw)[reads_per_cell >= 1e3] - filtered_genes <- genes[genes$chr %in% c("chr4", "chr11"),]$gene_id - # remove version numbers - filtered_genes <- gsub("\\..*", "", filtered_genes) - mat <- mat_raw[which(rownames(mat_raw) %in% filtered_genes), pass_rna] - frags <- select_cells(frags_raw, pass_rna) %>% select_chromosomes(c("chr4", "chr11")) - mat <- write_matrix_dir(mat, file.path(directory, mat_name)) - frags <- write_fragments_dir(frags, file.path(directory, frags_name)) - return(list(mat = mat, frags = frags)) } \ No newline at end of file diff --git a/r/man/demo_data.Rd b/r/man/demo_data.Rd new file mode 100644 index 00000000..89d149c2 --- /dev/null +++ b/r/man/demo_data.Rd @@ -0,0 +1,53 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data.R +\name{get_demo_mat} +\alias{get_demo_mat} +\alias{get_demo_frags} +\alias{remove_demo_data} +\title{Retrieve BPCells demo data} +\usage{ +get_demo_mat() + +get_demo_frags() + +remove_demo_data() +} +\value{ +\itemize{ +\item \code{get_demo_mat()}: (IterableMatrix) A \verb{(features x cells)} matrix of shape \verb{(1984 x 2724)}. +} + +\itemize{ +\item \code{get_demo_frags()}: (IterableFragments) A Fragments object with 2724 cells and fragments on chromosomes 4 and 11. +} + +\itemize{ +\item \code{remove_demo_data()}: NULL +} +} +\description{ +The demo dataset is a subset of the 10x Genomics PBMC 3k dataset, and filters both the matrix and the fragments +to cells with at least 1000 reads. Both the matrix and the fragments are subset to only genes on chromosomes 4 and 11. +} +\details{ +The first time either \code{get_demo_mat()} are ran \code{get_demo_frags()}, the demo data is downloaded and stored in the BPCells data directory +(under \code{file.path(tools::R_user_dir("BPcells", which="data"), "demo_data")}). Subsequent calls to this function will use the previously downloaded matrix. +The preperation of this matrix can be reproduced by running the internal function \code{prepare_demo_data()}. + +In the case that demo data is not pre-downloaded and demo data download fails, \code{prepare_demo_data()} will be run, +which manually builds the demo dataset from the 10x Genomics PBMC 3k dataset. + +Both the matrix from \code{get_demo_mat()} and the fragments from \code{get_demo_frags()} +may be removed by running \code{remove_demo_data()}. +\itemize{ +\item \code{get_demo_mat()}: Retrieve a 1 MB demo matrix, representing a subset of the 10X Genomics PBMC 3k dataset. +} + +\itemize{ +\item \code{get_demo_frags()}: Retrieve a 12.5 MB demo fragments object, representing a subset of the 10X Genomics PBMC 3k dataset. +} + +\itemize{ +\item \code{remove_demo_data()}: Remove the demo data from the BPCells data directory. +} +} diff --git a/r/man/prepare_demo_data.Rd b/r/man/prepare_demo_data.Rd new file mode 100644 index 00000000..7b8f14a9 --- /dev/null +++ b/r/man/prepare_demo_data.Rd @@ -0,0 +1,38 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data.R +\name{prepare_demo_data} +\alias{prepare_demo_data} +\title{Prepare a demo matrix and demo fragments for BPCells.} +\usage{ +prepare_demo_data( + directory = NULL, + mat_name = NULL, + frags_name = NULL, + timeout = 300, + remove_input_data = TRUE +) +} +\arguments{ +\item{directory}{(character) Where the input/output data should be stored. If NULL, a temporary directory is created.} + +\item{mat_name}{(character) Name of the RNA matrix file. If NULL, the matrix is named "demo_mat."} + +\item{frags_name}{(character) Name of the ATAC fragments file. If NULL, the fragments are named "demo_frags".} + +\item{timeout}{(numeric) Timeout for downloading files in seconds.} + +\item{remove_input_data}{(logical) Whether to remove the downloaded non-procesed matrix, frags, gencode transcripts, and gencode genes +after processing.} +} +\value{ +(list) A list with the RNA matrix under the name "mat", and the ATAC fragments under the name "frags". +} +\description{ +Prepare a demo matrix and demo fragments for BPCells. +} +\details{ +This function downloads the 10x Genomics PBMC 3k dataset, and filters the fragments and matrix to cells with at least 1000 reads. +Following, both fragments and the matrix is subset to only genes and insertions on chromosomes 4 and 11. +The RNA matrix is 1 MB and the fragments are 12.5 MB, after BPCells compression. +} +\keyword{internal} diff --git a/r/pkgdown/_pkgdown.yml b/r/pkgdown/_pkgdown.yml index 64d16f01..89166cc8 100644 --- a/r/pkgdown/_pkgdown.yml +++ b/r/pkgdown/_pkgdown.yml @@ -176,3 +176,7 @@ reference: - discrete_palette - collect_features - rotate_x_labels + +- title: "Data" +- contents: + - get_demo_mat diff --git a/r/tests/testthat/test-data.R b/r/tests/testthat/test-data.R new file mode 100644 index 00000000..4583a0c9 --- /dev/null +++ b/r/tests/testthat/test-data.R @@ -0,0 +1,16 @@ +# Copyright 2025 BPCells contributors +# +# Licensed under the Apache License, Version 2.0 or the MIT license +# , at your +# option. This file may not be copied, modified, or distributed +# except according to those terms. + + +test_that("Getting test data works", { + mat <- get_demo_mat() + frags <- get_demo_frags() + expect_true(is(mat, "IterableMatrix")) + expect_true(is(frags, "IterableFragments")) + remove_demo_data() +}) \ No newline at end of file From e1ec1cffba673af40219d903da20f400dbe413af Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Wed, 26 Feb 2025 17:21:11 -0800 Subject: [PATCH 03/15] [r] rewrite some demo data documentation --- r/R/data.R | 17 ++++++++++------- r/man/demo_data.Rd | 11 ++++++----- r/man/prepare_demo_data.Rd | 5 +++-- 3 files changed, 19 insertions(+), 14 deletions(-) diff --git a/r/R/data.R b/r/R/data.R index 7aa9bdef..2865c307 100644 --- a/r/R/data.R +++ b/r/R/data.R @@ -6,8 +6,10 @@ # option. This file may not be copied, modified, or distributed # except according to those terms. -#' Prepare a demo matrix and demo fragments for BPCells. -#' +#' Create a small demo matrix and fragment object +#' +#' Downloads a 10x Genomics dataset, then performs QC and subsetting. Holds subsetted objects in disk, +#' and returns a list with both the matrix and fragments. #' @param directory (character) Where the input/output data should be stored. If NULL, a temporary directory is created. #' @param mat_name (character) Name of the RNA matrix file. If NULL, the matrix is named "demo_mat." #' @param frags_name (character) Name of the ATAC fragments file. If NULL, the fragments are named "demo_frags". @@ -91,12 +93,13 @@ prepare_demo_data <- function(directory = NULL, mat_name = NULL, frags_name = NU #' @return #' - `get_demo_mat()`: (IterableMatrix) A `(features x cells)` matrix of shape `(1984 x 2724)`. #' @details -#' The first time either `get_demo_mat()` are ran `get_demo_frags()`, the demo data is downloaded and stored in the BPCells data directory -#' (under `file.path(tools::R_user_dir("BPcells", which="data"), "demo_data")`). Subsequent calls to this function will use the previously downloaded matrix. -#' The preperation of this matrix can be reproduced by running the internal function `prepare_demo_data()`. +#' The first time either `get_demo_mat()` are ran `get_demo_frags()`, the pre-subsetted +#' demo data is downloaded and stored in the BPCells data directory (under `file.path(tools::R_user_dir("BPcells", which="data"), "demo_data")`). +#' Subsequent calls to this function will use the previously downloaded matrix. +#' The preperation of this matrix can be reproduced by running the internal function `prepare_demo_data()` with `directory` set to the BPCells data directory. #' -#' In the case that demo data is not pre-downloaded and demo data download fails, `prepare_demo_data()` will be run, -#' which manually builds the demo dataset from the 10x Genomics PBMC 3k dataset. +#' In the case that demo data is not pre-downloaded and demo data download fails, `prepare_demo_data()` will act +#' as a fallback. #' #' Both the matrix from `get_demo_mat()` and the fragments from `get_demo_frags()` #' may be removed by running `remove_demo_data()`. diff --git a/r/man/demo_data.Rd b/r/man/demo_data.Rd index 89d149c2..df58d7cb 100644 --- a/r/man/demo_data.Rd +++ b/r/man/demo_data.Rd @@ -30,12 +30,13 @@ The demo dataset is a subset of the 10x Genomics PBMC 3k dataset, and filters b to cells with at least 1000 reads. Both the matrix and the fragments are subset to only genes on chromosomes 4 and 11. } \details{ -The first time either \code{get_demo_mat()} are ran \code{get_demo_frags()}, the demo data is downloaded and stored in the BPCells data directory -(under \code{file.path(tools::R_user_dir("BPcells", which="data"), "demo_data")}). Subsequent calls to this function will use the previously downloaded matrix. -The preperation of this matrix can be reproduced by running the internal function \code{prepare_demo_data()}. +The first time either \code{get_demo_mat()} are ran \code{get_demo_frags()}, the pre-subsetted +demo data is downloaded and stored in the BPCells data directory (under \code{file.path(tools::R_user_dir("BPcells", which="data"), "demo_data")}). +Subsequent calls to this function will use the previously downloaded matrix. +The preperation of this matrix can be reproduced by running the internal function \code{prepare_demo_data()} with \code{directory} set to the BPCells data directory. -In the case that demo data is not pre-downloaded and demo data download fails, \code{prepare_demo_data()} will be run, -which manually builds the demo dataset from the 10x Genomics PBMC 3k dataset. +In the case that demo data is not pre-downloaded and demo data download fails, \code{prepare_demo_data()} will act +as a fallback. Both the matrix from \code{get_demo_mat()} and the fragments from \code{get_demo_frags()} may be removed by running \code{remove_demo_data()}. diff --git a/r/man/prepare_demo_data.Rd b/r/man/prepare_demo_data.Rd index 7b8f14a9..ee498b19 100644 --- a/r/man/prepare_demo_data.Rd +++ b/r/man/prepare_demo_data.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/data.R \name{prepare_demo_data} \alias{prepare_demo_data} -\title{Prepare a demo matrix and demo fragments for BPCells.} +\title{Create a small demo matrix and fragment object} \usage{ prepare_demo_data( directory = NULL, @@ -28,7 +28,8 @@ after processing.} (list) A list with the RNA matrix under the name "mat", and the ATAC fragments under the name "frags". } \description{ -Prepare a demo matrix and demo fragments for BPCells. +Downloads a 10x Genomics dataset, then performs QC and subsetting. Holds subsetted objects in disk, +and returns a list with both the matrix and fragments. } \details{ This function downloads the 10x Genomics PBMC 3k dataset, and filters the fragments and matrix to cells with at least 1000 reads. From 4b748b91f3748ff0fa0ba4be58732be9ffac3aaf Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Thu, 27 Feb 2025 15:10:14 -0800 Subject: [PATCH 04/15] [r] update `get_demo_mat()` docs --- r/R/data.R | 10 +++++----- r/man/demo_data.Rd | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/r/R/data.R b/r/R/data.R index 2865c307..52190ea3 100644 --- a/r/R/data.R +++ b/r/R/data.R @@ -94,17 +94,17 @@ prepare_demo_data <- function(directory = NULL, mat_name = NULL, frags_name = NU #' - `get_demo_mat()`: (IterableMatrix) A `(features x cells)` matrix of shape `(1984 x 2724)`. #' @details #' The first time either `get_demo_mat()` are ran `get_demo_frags()`, the pre-subsetted -#' demo data is downloaded and stored in the BPCells data directory (under `file.path(tools::R_user_dir("BPcells", which="data"), "demo_data")`). -#' Subsequent calls to this function will use the previously downloaded matrix. +#' demo data is downloaded and stored in the BPCells data directory (under `file.path(tools::R_user_dir("BPcells", which="data"), "demo_data")`). +#' Subsequent calls to this function will use the previously downloaded matrix/fragments. #' The preperation of this matrix can be reproduced by running the internal function `prepare_demo_data()` with `directory` set to the BPCells data directory. #' -#' In the case that demo data is not pre-downloaded and demo data download fails, `prepare_demo_data()` will act +#' In the case that demo data is not pre-downloaded and demo data download fails, `prepare_demo_data()` will act #' as a fallback. #' -#' Both the matrix from `get_demo_mat()` and the fragments from `get_demo_frags()` +#' Both the matrix from `get_demo_mat()` and the fragments from `get_demo_frags()` #' may be removed by running `remove_demo_data()`. #' -#' - `get_demo_mat()`: Retrieve a 1 MB demo matrix, representing a subset of the 10X Genomics PBMC 3k dataset. +#' - `get_demo_mat()`: Retrieve a 1 MB demo matrix, representing a subset of the 10X Genomics PBMC 3k dataset. #' @export get_demo_mat <- function() { # Use the data directory for BPCells diff --git a/r/man/demo_data.Rd b/r/man/demo_data.Rd index df58d7cb..5fec63db 100644 --- a/r/man/demo_data.Rd +++ b/r/man/demo_data.Rd @@ -32,7 +32,7 @@ to cells with at least 1000 reads. Both the matrix and the fragments are subset \details{ The first time either \code{get_demo_mat()} are ran \code{get_demo_frags()}, the pre-subsetted demo data is downloaded and stored in the BPCells data directory (under \code{file.path(tools::R_user_dir("BPcells", which="data"), "demo_data")}). -Subsequent calls to this function will use the previously downloaded matrix. +Subsequent calls to this function will use the previously downloaded matrix/fragments. The preperation of this matrix can be reproduced by running the internal function \code{prepare_demo_data()} with \code{directory} set to the BPCells data directory. In the case that demo data is not pre-downloaded and demo data download fails, \code{prepare_demo_data()} will act From 8311ca82ea9fd68a2c699c6ce88f57cff057dd06 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Fri, 7 Mar 2025 15:37:03 -0800 Subject: [PATCH 05/15] [r] clean up wording for demo data docstring --- r/R/data.R | 10 +++++----- r/man/demo_data.Rd | 10 +++++----- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/r/R/data.R b/r/R/data.R index 52190ea3..761c050f 100644 --- a/r/R/data.R +++ b/r/R/data.R @@ -87,8 +87,8 @@ prepare_demo_data <- function(directory = NULL, mat_name = NULL, frags_name = NU #' Retrieve BPCells demo data #' -#' The demo dataset is a subset of the 10x Genomics PBMC 3k dataset, and filters both the matrix and the fragments -#' to cells with at least 1000 reads. Both the matrix and the fragments are subset to only genes on chromosomes 4 and 11. +#' The demo dataset is a subset of the 10x Genomics PBMC 3k dataset, with the matrix and fragments +#' filtered to cells with at least 1000 reads. Both the matrix and the fragments are also subset to only genes on chromosomes 4 and 11. #' @rdname demo_data #' @return #' - `get_demo_mat()`: (IterableMatrix) A `(features x cells)` matrix of shape `(1984 x 2724)`. @@ -104,7 +104,7 @@ prepare_demo_data <- function(directory = NULL, mat_name = NULL, frags_name = NU #' Both the matrix from `get_demo_mat()` and the fragments from `get_demo_frags()` #' may be removed by running `remove_demo_data()`. #' -#' - `get_demo_mat()`: Retrieve a 1 MB demo matrix, representing a subset of the 10X Genomics PBMC 3k dataset. +#' - `get_demo_mat()`: Retrieve a 1 MB demo `IterableMatrix`, representing a subset of the 10X Genomics PBMC 3k dataset. #' @export get_demo_mat <- function() { # Use the data directory for BPCells @@ -130,7 +130,7 @@ get_demo_mat <- function() { #' @return #' - `get_demo_frags()`: (IterableFragments) A Fragments object with 2724 cells and fragments on chromosomes 4 and 11. #' @details -#' - `get_demo_frags()`: Retrieve a 12.5 MB demo fragments object, representing a subset of the 10X Genomics PBMC 3k dataset. +#' - `get_demo_frags()`: Retrieve a 12.5 MB demo `IterableFragments` object, representing a subset of the 10X Genomics PBMC 3k dataset. #' @export get_demo_frags <- function() { data_dir <- file.path(tools::R_user_dir("BPCells", which = "data"), "demo_data") @@ -152,7 +152,7 @@ get_demo_frags <- function() { #' @rdname demo_data #' @return -#' - `remove_demo_data()`: NULL +#' - `remove_demo_data()`: `NULL` #' @details #' - `remove_demo_data()`: Remove the demo data from the BPCells data directory. #' @export diff --git a/r/man/demo_data.Rd b/r/man/demo_data.Rd index 5fec63db..6e677812 100644 --- a/r/man/demo_data.Rd +++ b/r/man/demo_data.Rd @@ -22,12 +22,12 @@ remove_demo_data() } \itemize{ -\item \code{remove_demo_data()}: NULL +\item \code{remove_demo_data()}: \code{NULL} } } \description{ -The demo dataset is a subset of the 10x Genomics PBMC 3k dataset, and filters both the matrix and the fragments -to cells with at least 1000 reads. Both the matrix and the fragments are subset to only genes on chromosomes 4 and 11. +The demo dataset is a subset of the 10x Genomics PBMC 3k dataset, with the matrix and fragments +filtered to cells with at least 1000 reads. Both the matrix and the fragments are also subset to only genes on chromosomes 4 and 11. } \details{ The first time either \code{get_demo_mat()} are ran \code{get_demo_frags()}, the pre-subsetted @@ -41,11 +41,11 @@ as a fallback. Both the matrix from \code{get_demo_mat()} and the fragments from \code{get_demo_frags()} may be removed by running \code{remove_demo_data()}. \itemize{ -\item \code{get_demo_mat()}: Retrieve a 1 MB demo matrix, representing a subset of the 10X Genomics PBMC 3k dataset. +\item \code{get_demo_mat()}: Retrieve a 1 MB demo \code{IterableMatrix}, representing a subset of the 10X Genomics PBMC 3k dataset. } \itemize{ -\item \code{get_demo_frags()}: Retrieve a 12.5 MB demo fragments object, representing a subset of the 10X Genomics PBMC 3k dataset. +\item \code{get_demo_frags()}: Retrieve a 12.5 MB demo \code{IterableFragments} object, representing a subset of the 10X Genomics PBMC 3k dataset. } \itemize{ From c96db5456907c78d0468c3b89386581fde30d763 Mon Sep 17 00:00:00 2001 From: Immanuel Abdi Date: Wed, 26 Mar 2025 22:12:21 -0700 Subject: [PATCH 06/15] [r] expand demo data parameterization with subsetting and filtering --- r/R/data.R | 201 +++++++++++++++++++++++-------------- r/man/demo_data.Rd | 50 +++++++-- r/man/prepare_demo_data.Rd | 20 ++-- 3 files changed, 177 insertions(+), 94 deletions(-) diff --git a/r/R/data.R b/r/R/data.R index 761c050f..abe38505 100644 --- a/r/R/data.R +++ b/r/R/data.R @@ -6,74 +6,92 @@ # option. This file may not be copied, modified, or distributed # except according to those terms. -#' Create a small demo matrix and fragment object +#' Create a small demo matrix and fragment object. #' -#' Downloads a 10x Genomics dataset, then performs QC and subsetting. Holds subsetted objects in disk, +#' Downloads a 10x Genomics dataset consisting of 3k cells, then performs optional QC and subsetting. Holds subsetted objects in disk, #' and returns a list with both the matrix and fragments. #' @param directory (character) Where the input/output data should be stored. If NULL, a temporary directory is created. -#' @param mat_name (character) Name of the RNA matrix file. If NULL, the matrix is named "demo_mat." -#' @param frags_name (character) Name of the ATAC fragments file. If NULL, the fragments are named "demo_frags". +#' @param filter_qc (bool) Whether to filter both the RNA and ATAC data using QC information. +#' @param subset (bool) Whether to subset to only genes/insertions on chromosome 4 and 11. #' @param timeout (numeric) Timeout for downloading files in seconds. #' @param remove_input_data (logical) Whether to remove the downloaded non-procesed matrix, frags, gencode transcripts, and gencode genes #' after processing. #' @return (list) A list with the RNA matrix under the name "mat", and the ATAC fragments under the name "frags". #' @details -#' This function downloads the 10x Genomics PBMC 3k dataset, and filters the fragments and matrix to cells with at least 1000 reads. -#' Following, both fragments and the matrix is subset to only genes and insertions on chromosomes 4 and 11. -#' The RNA matrix is 1 MB and the fragments are 12.5 MB, after BPCells compression. +#' This function downloads the 10x Genomics PBMC 3k dataset. +#' Filtering using QC information on the fragments and matrix provides cells with at least 1000 reads, 1000 frags, and a minimum tss enrichment of 10. +#' Subsetting provides only genes and insertions on chromosomes 4 and 11. +#' The name of the matrix and fragments folders are `demo_mat` and `demo_frags` respectively. +#' Additionally, choosing to qc filter appends a `_filtered`, and choosing to subset data appends a `_subsetted` to the name. #' @keywords internal -prepare_demo_data <- function(directory = NULL, mat_name = NULL, frags_name = NULL, timeout = 300, remove_input_data = TRUE) { +prepare_demo_data <- function(directory = NULL, filter_qc = TRUE, subset = TRUE, timeout = 300, remove_input_data = TRUE) { if (is.null(directory)) { - directory <- file.path(tempdir()) - dir.create(directory, recursive = TRUE, showWarnings = FALSE) + directory <- file.path(tempdir()) + dir.create(directory, recursive = TRUE, showWarnings = FALSE) } - if (is.null(mat_name)) { - mat_name <- "demo_mat" + mat_name <- "demo_mat" + frags_name <- "demo_frags" + if (filter_qc) { + mat_name <- paste0(mat_name, "_filtered") + frags_name <- paste0(frags_name, "_filtered") } - if (is.null(frags_name)) { - frags_name <- "demo_frags" + if (subset) { + mat_name <- paste0(mat_name, "_subsetted") + frags_name <- paste0(frags_name, "_subsetted") } + # Download matrix/frags if not done previously, and open url_base <- "https://cf.10xgenomics.com/samples/cell-arc/2.0.0/pbmc_granulocyte_sorted_3k/" - rna_raw_url <- paste0(url_base, "pbmc_granulocyte_sorted_3k_raw_feature_bc_matrix.h5") - atac_raw_url <- paste0(url_base, "pbmc_granulocyte_sorted_3k_atac_fragments.tsv.gz") - ensure_downloaded(file.path(directory, "pbmc_3k_10x.h5"), rna_raw_url, timeout = timeout) - ensure_downloaded(file.path(directory, "pbmc_3k_10x.fragments.tsv.gz"), atac_raw_url, timeout = timeout) - if (!file.exists(file.path(directory,"pbmc_3k_rna_raw"))) { - mat_raw <- open_matrix_10x_hdf5(file.path(directory, "pbmc_3k_10x.h5"), feature_type="Gene Expression") %>% - write_matrix_dir(file.path(directory, "pbmc_3k_rna_raw")) + if (!file.exists(file.path(directory, "pbmc_3k_rna_raw"))) { + rna_raw_url <- paste0(url_base, "pbmc_granulocyte_sorted_3k_raw_feature_bc_matrix.h5") + ensure_downloaded(file.path(directory, "pbmc_3k_10x.h5"), rna_raw_url, timeout = timeout) + mat <- open_matrix_10x_hdf5(file.path(directory, "pbmc_3k_10x.h5"), feature_type="Gene Expression") %>% + write_matrix_dir(file.path(directory, "pbmc_3k_rna_raw")) } else { - mat_raw <- open_matrix_dir(file.path(directory, "pbmc_3k_rna_raw")) + mat <- open_matrix_dir(file.path(directory, "pbmc_3k_rna_raw")) } # Check if we already ran import if (!file.exists(file.path(directory, "pbmc_3k_frags"))) { - frags_raw <- open_fragments_10x(file.path(directory, "pbmc_3k_10x.fragments.tsv.gz")) %>% - write_fragments_dir(file.path(directory, "pbmc_3k_frags")) + atac_raw_url <- paste0(url_base, "pbmc_granulocyte_sorted_3k_atac_fragments.tsv.gz") + ensure_downloaded(file.path(directory, "pbmc_3k_10x.fragments.tsv.gz"), atac_raw_url, timeout = timeout) + frags <- open_fragments_10x(file.path(directory, "pbmc_3k_10x.fragments.tsv.gz")) %>% + write_fragments_dir(file.path(directory, "pbmc_3k_frags")) } else { - frags_raw <- open_fragments_dir(file.path(directory, "pbmc_3k_frags")) + frags <- open_fragments_dir(file.path(directory, "pbmc_3k_frags")) } - # for atac transcripts - transcripts <- read_gencode_transcripts( - file.path(directory, "references_transcripts"), - release = "42", - transcript_choice = "MANE_Select", - annotation_set = "basic", - features = "transcript" - ) - # for RNA genes - genes_demo <- read_gencode_genes( - file.path(directory, "./reference_genes"), - release = "42", - annotation_set = "basic", - ) - # Filter to only cells that have at least 1000 reads on the RNA side - # and only genes/fragments that exist on chr 4 and 11 - reads_per_cell <- colSums(mat_raw) - filtered_cells <- colnames(mat_raw)[reads_per_cell >= 1e3] - filtered_genes <- genes_demo[genes_demo$chr %in% c("chr4", "chr11"),]$gene_id - # remove version numbers - filtered_genes <- gsub("\\..*", "", filtered_genes) - mat <- mat_raw[which(rownames(mat_raw) %in% filtered_genes), filtered_cells] - frags <- select_cells(frags_raw, filtered_cells) %>% select_chromosomes(c("chr4", "chr11")) + if (filter_qc) { + # Download annotations for transcripts + transcripts <- read_gencode_transcripts( + file.path(directory, "references"), + release = "42", + transcript_choice = "MANE_Select", + annotation_set = "basic", + features = "transcript" + ) + blacklist <- read_encode_blacklist(file.path(directory, "references"), genome="hg38") + atac_qc <- qc_scATAC(frags, transcripts, blacklist) + # Filter to only cells that have at least 1000 reads on the RNA side + # a minimum of 1000 frag reads, and greater than 10 tss enrichment + pass_atac <- atac_qc %>% + dplyr::filter(nFrags > 1000, TSSEnrichment > 10) %>% + dplyr::pull(cellName) + pass_rna <- colnames(mat)[colSums(mat) > 1000] + filtered_cells <- intersect(pass_atac, pass_rna) + frags <- select_cells(frags, filtered_cells) + mat <- mat[, filtered_cells] + } + if (subset) { + # Subset to only genes/fragments that exist on chr4 and 11 + genes_demo <- read_gencode_genes( + file.path(directory, "./references"), + release = "42", + annotation_set = "basic", + ) + filtered_genes <- genes_demo[genes_demo$chr %in% c("chr4", "chr11"),]$gene_id + # remove version numbers + filtered_genes <- gsub("\\..*", "", filtered_genes) + mat <- mat[which(rownames(mat) %in% filtered_genes), ] + frags <- frags %>% select_chromosomes(c("chr4", "chr11")) + } mat <- write_matrix_dir(mat, file.path(directory, mat_name), overwrite = TRUE) frags <- write_fragments_dir(frags, file.path(directory, frags_name), overwrite = TRUE) if (remove_input_data) { @@ -87,15 +105,20 @@ prepare_demo_data <- function(directory = NULL, mat_name = NULL, frags_name = NU #' Retrieve BPCells demo data #' -#' The demo dataset is a subset of the 10x Genomics PBMC 3k dataset, with the matrix and fragments -#' filtered to cells with at least 1000 reads. Both the matrix and the fragments are also subset to only genes on chromosomes 4 and 11. +#' Functions to download matrices and fragments derived from the 10X Genomics PBMC 3k dataset, +#' with options to filter with common qc metrics, and to subset genes and fragments to only chromosome 4 and 11. #' @rdname demo_data +#' @param filter_qc (bool) Whether to filter both the RNA and ATAC data using qc metrics (described in `details`). +#' @param subset (bool) Whether to subset to only genes/insertions on chromosome 4 and 11. #' @return -#' - `get_demo_mat()`: (IterableMatrix) A `(features x cells)` matrix of shape `(1984 x 2724)`. -#' @details -#' The first time either `get_demo_mat()` are ran `get_demo_frags()`, the pre-subsetted +#' - `get_demo_mat()`: (IterableMatrix) A `(features x cells)` matrix. +#' @details +#' **Data Processing**: +#' +#' The first time either `get_demo_mat()` are ran `get_demo_frags()`, #' demo data is downloaded and stored in the BPCells data directory (under `file.path(tools::R_user_dir("BPcells", which="data"), "demo_data")`). -#' Subsequent calls to this function will use the previously downloaded matrix/fragments. +#' Subsequent calls to this function will use the previously downloaded matrix/fragments, given that the same combination of filtering and +#' subsetting has been performed previously. #' The preperation of this matrix can be reproduced by running the internal function `prepare_demo_data()` with `directory` set to the BPCells data directory. #' #' In the case that demo data is not pre-downloaded and demo data download fails, `prepare_demo_data()` will act @@ -104,57 +127,85 @@ prepare_demo_data <- function(directory = NULL, mat_name = NULL, frags_name = NU #' Both the matrix from `get_demo_mat()` and the fragments from `get_demo_frags()` #' may be removed by running `remove_demo_data()`. #' -#' - `get_demo_mat()`: Retrieve a 1 MB demo `IterableMatrix`, representing a subset of the 10X Genomics PBMC 3k dataset. +#' Filtering using QC information on the fragments and matrix object chooses cells with at least 1000 reads, 1000 frags, and a minimum tss enrichment of 10. +#' Subsetting provides only genes and insertions on chromosomes 4 and 11. +#' +#' **Dimensions**: +#' \tabular{lll}{ +#' \strong{Condition} \tab \strong{RNA matrix (features x cells)} \tab \strong{Fragments (chromosomes x cells)} \cr +#' Raw \tab (36601 x 650165) \tab (39 x 462264) \cr +#' Filter \tab (36601 x 2600) \tab (39 x 2600) \cr +#' Subset \tab (3582 x 650165) \tab (2 x 462264) \cr +#' Filter + Subset \tab (3582 x 2600) \tab (2 x 2600) \cr +#' } +#' **Data size**: +#' \tabular{lll}{ +#' \strong{Condition} \tab \strong{RNA matrix (MB)} \tab \strong{Fragments (MB)} \cr +#' Raw \tab 31.9 \tab 200 \cr +#' Filter \tab 9.4 \tab 137 \cr +#' Subset \tab 18.3 \tab 25.6 \cr +#' Filter + Subset \tab 1.2 \tab 12.3 \cr +#' } +#' +#' **Function Description**: +#' +#' - `get_demo_mat()`: Retrieve a demo `IterableMatrix` object representing the 10X Genomics PBMC 3k dataset. #' @export -get_demo_mat <- function() { +get_demo_mat <- function(filter_qc = TRUE, subset = TRUE) { # Use the data directory for BPCells data_dir <- file.path(tools::R_user_dir("BPCells", which = "data"), "demo_data") if (!dir.exists(data_dir)) { dir.create(data_dir, recursive = TRUE) } - if (!dir.exists(file.path(data_dir, "demo_mat"))) { - url <- "https://pub-c4e56988ff67429e9856ffa33aecb0c1.r2.dev/demo_mat.tar.gz" - download.file(url, file.path(data_dir, "demo_mat.tar.gz")) + mat_name = "demo_mat" + if (filter_qc) mat_name <- paste0(mat_name, "_filtered") + if (subset) mat_name <- paste0(mat_name, "_subsetted") + if (!dir.exists(file.path(data_dir, mat_name))) { + url <- paste0("https://pub-c4e56988ff67429e9856ffa33aecb0c1.r2.dev/", mat_name, ".tar.gz") + suppressWarnings(try(download.file(url, file.path(data_dir, paste0(mat_name, ".tar.gz"))), silent = TRUE)) # Check if file download failed - if (!file.exists(file.path(data_dir, "demo_mat.tar.gz"))) { - prepare_demo_data(data_dir) + if (!file.exists(file.path(data_dir, paste0(mat_name, ".tar.gz")))) { + prepare_demo_data(data_dir, filter_qc = filter_qc, subset = subset) } else { - untar(file.path(data_dir, "demo_mat.tar.gz"), exdir=data_dir) - file.remove(file.path(data_dir, "demo_mat.tar.gz")) + untar(file.path(data_dir, paste0(mat_name, ".tar.gz")), exdir=data_dir) + file.remove(file.path(data_dir, paste0(mat_name, ".tar.gz"))) } } - return(open_matrix_dir(file.path(data_dir, "demo_mat"))) + return(open_matrix_dir(file.path(data_dir, mat_name))) } #' @rdname demo_data #' @return -#' - `get_demo_frags()`: (IterableFragments) A Fragments object with 2724 cells and fragments on chromosomes 4 and 11. +#' - `get_demo_frags()`: (IterableFragments) A Fragments object. #' @details -#' - `get_demo_frags()`: Retrieve a 12.5 MB demo `IterableFragments` object, representing a subset of the 10X Genomics PBMC 3k dataset. +#' - `get_demo_frags()`: Retrieve a demo `IterableFragments` object representing the 10X Genomics PBMC 3k dataset. #' @export -get_demo_frags <- function() { +get_demo_frags <- function(filter_qc = TRUE, subset = TRUE) { data_dir <- file.path(tools::R_user_dir("BPCells", which = "data"), "demo_data") if (!dir.exists(data_dir)) { dir.create(data_dir, recursive = TRUE) } - if (!dir.exists(file.path(data_dir, "demo_frags"))) { - url <- "https://pub-c4e56988ff67429e9856ffa33aecb0c1.r2.dev/demo_frags.tar.gz" - download.file(url, file.path(data_dir, "demo_frags.tar.gz")) - if (!file.exists(file.path(data_dir, "demo_frags.tar.gz"))) { + frags_name <- "demo_frags" + if (filter_qc) frags_name <- paste0(frags_name, "_filtered") + if (subset) frags_name <- paste0(frags_name, "_subsetted") + if (!dir.exists(file.path(data_dir, frags_name))) { + url <- paste0("https://pub-c4e56988ff67429e9856ffa33aecb0c1.r2.dev/", frags_name, ".tar.gz") + suppressWarnings(try(download.file(url, file.path(data_dir, paste0(frags_name, ".tar.gz"))), silent = TRUE)) + if (!file.exists(file.path(data_dir, paste0(frags_name, ".tar.gz")))) { prepare_demo_data(data_dir) } else { - untar(file.path(data_dir, "demo_frags.tar.gz"), exdir = data_dir) - file.remove(file.path(data_dir, "demo_frags.tar.gz")) + untar(file.path(data_dir, paste0(frags_name, ".tar.gz")), exdir = data_dir) + file.remove(file.path(data_dir, paste0(frags_name, ".tar.gz"))) } } - return(open_fragments_dir(file.path(data_dir, "demo_frags"))) + return(open_fragments_dir(file.path(data_dir, frags_name))) } #' @rdname demo_data #' @return #' - `remove_demo_data()`: `NULL` #' @details -#' - `remove_demo_data()`: Remove the demo data from the BPCells data directory. +#' - `remove_demo_data()`: Remove the demo data from the BPCells data directory. #' @export remove_demo_data <- function() { data_dir <- file.path(tools::R_user_dir("BPCells", which = "data"), "demo_data") diff --git a/r/man/demo_data.Rd b/r/man/demo_data.Rd index 6e677812..89d16a7e 100644 --- a/r/man/demo_data.Rd +++ b/r/man/demo_data.Rd @@ -6,19 +6,24 @@ \alias{remove_demo_data} \title{Retrieve BPCells demo data} \usage{ -get_demo_mat() +get_demo_mat(filter_qc = TRUE, subset = TRUE) -get_demo_frags() +get_demo_frags(filter_qc = TRUE, subset = TRUE) remove_demo_data() } +\arguments{ +\item{filter_qc}{(bool) Whether to filter both the RNA and ATAC data using qc metrics (described in \code{details}).} + +\item{subset}{(bool) Whether to subset to only genes/insertions on chromosome 4 and 11.} +} \value{ \itemize{ -\item \code{get_demo_mat()}: (IterableMatrix) A \verb{(features x cells)} matrix of shape \verb{(1984 x 2724)}. +\item \code{get_demo_mat()}: (IterableMatrix) A \verb{(features x cells)} matrix. } \itemize{ -\item \code{get_demo_frags()}: (IterableFragments) A Fragments object with 2724 cells and fragments on chromosomes 4 and 11. +\item \code{get_demo_frags()}: (IterableFragments) A Fragments object. } \itemize{ @@ -26,13 +31,16 @@ remove_demo_data() } } \description{ -The demo dataset is a subset of the 10x Genomics PBMC 3k dataset, with the matrix and fragments -filtered to cells with at least 1000 reads. Both the matrix and the fragments are also subset to only genes on chromosomes 4 and 11. +Functions to download matrices and fragments derived from the 10X Genomics PBMC 3k dataset, +with options to filter with common qc metrics, and to subset genes and fragments to only chromosome 4 and 11. } \details{ -The first time either \code{get_demo_mat()} are ran \code{get_demo_frags()}, the pre-subsetted +\strong{Data Processing}: + +The first time either \code{get_demo_mat()} are ran \code{get_demo_frags()}, demo data is downloaded and stored in the BPCells data directory (under \code{file.path(tools::R_user_dir("BPcells", which="data"), "demo_data")}). -Subsequent calls to this function will use the previously downloaded matrix/fragments. +Subsequent calls to this function will use the previously downloaded matrix/fragments, given that the same combination of filtering and +subsetting has been performed previously. The preperation of this matrix can be reproduced by running the internal function \code{prepare_demo_data()} with \code{directory} set to the BPCells data directory. In the case that demo data is not pre-downloaded and demo data download fails, \code{prepare_demo_data()} will act @@ -40,12 +48,34 @@ as a fallback. Both the matrix from \code{get_demo_mat()} and the fragments from \code{get_demo_frags()} may be removed by running \code{remove_demo_data()}. + +Filtering using QC information on the fragments and matrix object chooses cells with at least 1000 reads, 1000 frags, and a minimum tss enrichment of 10. +Subsetting provides only genes and insertions on chromosomes 4 and 11. + +\strong{Dimensions}: +\tabular{lll}{ +\strong{Condition} \tab \strong{RNA matrix (features x cells)} \tab \strong{Fragments (chromosomes x cells)} \cr +Raw \tab (36601 x 650165) \tab (39 x 462264) \cr +Filter \tab (36601 x 2600) \tab (39 x 2600) \cr +Subset \tab (3582 x 650165) \tab (2 x 462264) \cr +Filter + Subset \tab (3582 x 2600) \tab (2 x 2600) \cr +} +\strong{Data size}: +\tabular{lll}{ +\strong{Condition} \tab \strong{RNA matrix (MB)} \tab \strong{Fragments (MB)} \cr +Raw \tab 31.9 \tab 200 \cr +Filter \tab 9.4 \tab 137 \cr +Subset \tab 18.3 \tab 25.6 \cr +Filter + Subset \tab 1.2 \tab 12.3 \cr +} + +\strong{Function Description}: \itemize{ -\item \code{get_demo_mat()}: Retrieve a 1 MB demo \code{IterableMatrix}, representing a subset of the 10X Genomics PBMC 3k dataset. +\item \code{get_demo_mat()}: Retrieve a demo \code{IterableMatrix} object representing the 10X Genomics PBMC 3k dataset. } \itemize{ -\item \code{get_demo_frags()}: Retrieve a 12.5 MB demo \code{IterableFragments} object, representing a subset of the 10X Genomics PBMC 3k dataset. +\item \code{get_demo_frags()}: Retrieve a demo \code{IterableFragments} object representing the 10X Genomics PBMC 3k dataset. } \itemize{ diff --git a/r/man/prepare_demo_data.Rd b/r/man/prepare_demo_data.Rd index ee498b19..c1921924 100644 --- a/r/man/prepare_demo_data.Rd +++ b/r/man/prepare_demo_data.Rd @@ -2,12 +2,12 @@ % Please edit documentation in R/data.R \name{prepare_demo_data} \alias{prepare_demo_data} -\title{Create a small demo matrix and fragment object} +\title{Create a small demo matrix and fragment object.} \usage{ prepare_demo_data( directory = NULL, - mat_name = NULL, - frags_name = NULL, + filter_qc = TRUE, + subset = TRUE, timeout = 300, remove_input_data = TRUE ) @@ -15,9 +15,9 @@ prepare_demo_data( \arguments{ \item{directory}{(character) Where the input/output data should be stored. If NULL, a temporary directory is created.} -\item{mat_name}{(character) Name of the RNA matrix file. If NULL, the matrix is named "demo_mat."} +\item{filter_qc}{(bool) Whether to filter both the RNA and ATAC data using QC information.} -\item{frags_name}{(character) Name of the ATAC fragments file. If NULL, the fragments are named "demo_frags".} +\item{subset}{(bool) Whether to subset to only genes/insertions on chromosome 4 and 11.} \item{timeout}{(numeric) Timeout for downloading files in seconds.} @@ -28,12 +28,14 @@ after processing.} (list) A list with the RNA matrix under the name "mat", and the ATAC fragments under the name "frags". } \description{ -Downloads a 10x Genomics dataset, then performs QC and subsetting. Holds subsetted objects in disk, +Downloads a 10x Genomics dataset consisting of 3k cells, then performs optional QC and subsetting. Holds subsetted objects in disk, and returns a list with both the matrix and fragments. } \details{ -This function downloads the 10x Genomics PBMC 3k dataset, and filters the fragments and matrix to cells with at least 1000 reads. -Following, both fragments and the matrix is subset to only genes and insertions on chromosomes 4 and 11. -The RNA matrix is 1 MB and the fragments are 12.5 MB, after BPCells compression. +This function downloads the 10x Genomics PBMC 3k dataset. +Filtering using QC information on the fragments and matrix provides cells with at least 1000 reads, 1000 frags, and a minimum tss enrichment of 10. +Subsetting provides only genes and insertions on chromosomes 4 and 11. +The name of the matrix and fragments folders are \code{demo_mat} and \code{demo_frags} respectively. +Additionally, choosing to qc filter appends a \verb{_filtered}, and choosing to subset data appends a \verb{_subsetted} to the name. } \keyword{internal} From af1eba714f41cbf54d5b9c4c510f8be2e1127f77 Mon Sep 17 00:00:00 2001 From: Immanuel Abdi <56730419+immanuelazn@users.noreply.github.com> Date: Mon, 7 Apr 2025 11:57:06 -0700 Subject: [PATCH 07/15] Apply suggestions from code review Co-authored-by: Ben Parks --- r/R/data.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/r/R/data.R b/r/R/data.R index abe38505..a0aec9a7 100644 --- a/r/R/data.R +++ b/r/R/data.R @@ -14,7 +14,7 @@ #' @param filter_qc (bool) Whether to filter both the RNA and ATAC data using QC information. #' @param subset (bool) Whether to subset to only genes/insertions on chromosome 4 and 11. #' @param timeout (numeric) Timeout for downloading files in seconds. -#' @param remove_input_data (logical) Whether to remove the downloaded non-procesed matrix, frags, gencode transcripts, and gencode genes +#' @param remove_input_data (logical) Whether to remove the downloaded non-processed matrix, frags, gencode transcripts, and gencode genes #' after processing. #' @return (list) A list with the RNA matrix under the name "mat", and the ATAC fragments under the name "frags". #' @details @@ -119,7 +119,7 @@ prepare_demo_data <- function(directory = NULL, filter_qc = TRUE, subset = TRUE, #' demo data is downloaded and stored in the BPCells data directory (under `file.path(tools::R_user_dir("BPcells", which="data"), "demo_data")`). #' Subsequent calls to this function will use the previously downloaded matrix/fragments, given that the same combination of filtering and #' subsetting has been performed previously. -#' The preperation of this matrix can be reproduced by running the internal function `prepare_demo_data()` with `directory` set to the BPCells data directory. +#' The preparation of this matrix can be reproduced by running the internal function `prepare_demo_data()` with `directory` set to the BPCells data directory. #' #' In the case that demo data is not pre-downloaded and demo data download fails, `prepare_demo_data()` will act #' as a fallback. From c6872dc81dcf2684ca13a694f857df0894727039 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Mon, 7 Apr 2025 12:53:12 -0700 Subject: [PATCH 08/15] [r] add examples to demo data delete intermediates when exiting clean up docs wording --- r/R/data.R | 63 ++++++++++++++++++++++++++------------ r/man/demo_data.Rd | 26 ++++++++++++++-- r/man/prepare_demo_data.Rd | 10 +++--- 3 files changed, 73 insertions(+), 26 deletions(-) diff --git a/r/R/data.R b/r/R/data.R index a0aec9a7..d3d8582a 100644 --- a/r/R/data.R +++ b/r/R/data.R @@ -10,13 +10,15 @@ #' #' Downloads a 10x Genomics dataset consisting of 3k cells, then performs optional QC and subsetting. Holds subsetted objects in disk, #' and returns a list with both the matrix and fragments. -#' @param directory (character) Where the input/output data should be stored. If NULL, a temporary directory is created. +#' @param directory (character) The directory where all the input/output data will be stored. +#' Downloaded intermediates will be stored in subdir `intermediates`. +#' If `NULL`, a temporary directory is created. #' @param filter_qc (bool) Whether to filter both the RNA and ATAC data using QC information. #' @param subset (bool) Whether to subset to only genes/insertions on chromosome 4 and 11. #' @param timeout (numeric) Timeout for downloading files in seconds. -#' @param remove_input_data (logical) Whether to remove the downloaded non-processed matrix, frags, gencode transcripts, and gencode genes -#' after processing. -#' @return (list) A list with the RNA matrix under the name "mat", and the ATAC fragments under the name "frags". +#' @param remove_input_data (logical) Whether to remove the intermediate non-processed matrix, frags, gencode transcripts, +#' and gencode genes after processing. If this function errors out, will also remove intermediate data if `remove_input_data` is `TRUE`. +#' @return (list) A list with the RNA matrix under the name `mat`, and the ATAC fragments under the name `frags`. #' @details #' This function downloads the 10x Genomics PBMC 3k dataset. #' Filtering using QC information on the fragments and matrix provides cells with at least 1000 reads, 1000 frags, and a minimum tss enrichment of 10. @@ -27,8 +29,12 @@ prepare_demo_data <- function(directory = NULL, filter_qc = TRUE, subset = TRUE, timeout = 300, remove_input_data = TRUE) { if (is.null(directory)) { directory <- file.path(tempdir()) - dir.create(directory, recursive = TRUE, showWarnings = FALSE) } + intermediate_dir <- file.path(directory, "intermediates") + dir.create(intermediate_dir, recursive = TRUE, showWarnings = FALSE) + # Delete all intermediates during exit if remove_input_data is TRUE. + on.exit(if (remove_input_data) unlink(intermediate_dir, recursive = TRUE)) + mat_name <- "demo_mat" frags_name <- "demo_frags" if (filter_qc) { @@ -41,10 +47,10 @@ prepare_demo_data <- function(directory = NULL, filter_qc = TRUE, subset = TRUE, } # Download matrix/frags if not done previously, and open url_base <- "https://cf.10xgenomics.com/samples/cell-arc/2.0.0/pbmc_granulocyte_sorted_3k/" - if (!file.exists(file.path(directory, "pbmc_3k_rna_raw"))) { + if (!file.exists(directory, "pbmc_3k_rna_raw")) { rna_raw_url <- paste0(url_base, "pbmc_granulocyte_sorted_3k_raw_feature_bc_matrix.h5") - ensure_downloaded(file.path(directory, "pbmc_3k_10x.h5"), rna_raw_url, timeout = timeout) - mat <- open_matrix_10x_hdf5(file.path(directory, "pbmc_3k_10x.h5"), feature_type="Gene Expression") %>% + ensure_downloaded(file.path(intermediate_dir, "pbmc_3k_10x.h5"), rna_raw_url, timeout = timeout) + mat <- open_matrix_10x_hdf5(file.path(intermediate_dir, "pbmc_3k_10x.h5"), feature_type="Gene Expression") %>% write_matrix_dir(file.path(directory, "pbmc_3k_rna_raw")) } else { mat <- open_matrix_dir(file.path(directory, "pbmc_3k_rna_raw")) @@ -52,8 +58,8 @@ prepare_demo_data <- function(directory = NULL, filter_qc = TRUE, subset = TRUE, # Check if we already ran import if (!file.exists(file.path(directory, "pbmc_3k_frags"))) { atac_raw_url <- paste0(url_base, "pbmc_granulocyte_sorted_3k_atac_fragments.tsv.gz") - ensure_downloaded(file.path(directory, "pbmc_3k_10x.fragments.tsv.gz"), atac_raw_url, timeout = timeout) - frags <- open_fragments_10x(file.path(directory, "pbmc_3k_10x.fragments.tsv.gz")) %>% + ensure_downloaded(file.path(intermediate_dir, "pbmc_3k_10x.fragments.tsv.gz"), atac_raw_url, timeout = timeout) + frags <- open_fragments_10x(file.path(intermediate_dir, "pbmc_3k_10x.fragments.tsv.gz")) %>% write_fragments_dir(file.path(directory, "pbmc_3k_frags")) } else { frags <- open_fragments_dir(file.path(directory, "pbmc_3k_frags")) @@ -61,13 +67,13 @@ prepare_demo_data <- function(directory = NULL, filter_qc = TRUE, subset = TRUE, if (filter_qc) { # Download annotations for transcripts transcripts <- read_gencode_transcripts( - file.path(directory, "references"), + intermediate_dir, release = "42", transcript_choice = "MANE_Select", annotation_set = "basic", features = "transcript" ) - blacklist <- read_encode_blacklist(file.path(directory, "references"), genome="hg38") + blacklist <- read_encode_blacklist(intermediate_dir, genome="hg38") atac_qc <- qc_scATAC(frags, transcripts, blacklist) # Filter to only cells that have at least 1000 reads on the RNA side # a minimum of 1000 frag reads, and greater than 10 tss enrichment @@ -82,7 +88,7 @@ prepare_demo_data <- function(directory = NULL, filter_qc = TRUE, subset = TRUE, if (subset) { # Subset to only genes/fragments that exist on chr4 and 11 genes_demo <- read_gencode_genes( - file.path(directory, "./references"), + intermediate_dir, release = "42", annotation_set = "basic", ) @@ -94,18 +100,12 @@ prepare_demo_data <- function(directory = NULL, filter_qc = TRUE, subset = TRUE, } mat <- write_matrix_dir(mat, file.path(directory, mat_name), overwrite = TRUE) frags <- write_fragments_dir(frags, file.path(directory, frags_name), overwrite = TRUE) - if (remove_input_data) { - unlink(file.path(directory, "pbmc_3k_10x.h5")) - unlink(file.path(directory, "pbmc_3k_10x.fragments.tsv.gz")) - unlink(file.path(directory, "pbmc_3k_rna_raw")) - unlink(file.path(directory, "pbmc_3k_frags")) - } return(list(mat = mat, frags = frags)) } #' Retrieve BPCells demo data #' -#' Functions to download matrices and fragments derived from the 10X Genomics PBMC 3k dataset, +#' `r lifecycle::badge("experimental")` \cr Functions to download matrices and fragments derived from the 10X Genomics PBMC 3k dataset, #' with options to filter with common qc metrics, and to subset genes and fragments to only chromosome 4 and 11. #' @rdname demo_data #' @param filter_qc (bool) Whether to filter both the RNA and ATAC data using qc metrics (described in `details`). @@ -113,6 +113,9 @@ prepare_demo_data <- function(directory = NULL, filter_qc = TRUE, subset = TRUE, #' @return #' - `get_demo_mat()`: (IterableMatrix) A `(features x cells)` matrix. #' @details +#' These data functions are experimental. +#' The interface, as well as the demo dataset itself will likely undergo changes in the near future. +#' #' **Data Processing**: #' #' The first time either `get_demo_mat()` are ran `get_demo_frags()`, @@ -150,6 +153,11 @@ prepare_demo_data <- function(directory = NULL, filter_qc = TRUE, subset = TRUE, #' **Function Description**: #' #' - `get_demo_mat()`: Retrieve a demo `IterableMatrix` object representing the 10X Genomics PBMC 3k dataset. +#' @examples +#' ####################################################################### +#' ## get_demo_mat() example +#' get_demo_mat() +#' ####################################################################### #' @export get_demo_mat <- function(filter_qc = TRUE, subset = TRUE) { # Use the data directory for BPCells @@ -179,6 +187,11 @@ get_demo_mat <- function(filter_qc = TRUE, subset = TRUE) { #' - `get_demo_frags()`: (IterableFragments) A Fragments object. #' @details #' - `get_demo_frags()`: Retrieve a demo `IterableFragments` object representing the 10X Genomics PBMC 3k dataset. +#' @examples +#' ####################################################################### +#' ## get_demo_frags() example +#' get_demo_frags() +#' ####################################################################### #' @export get_demo_frags <- function(filter_qc = TRUE, subset = TRUE) { data_dir <- file.path(tools::R_user_dir("BPCells", which = "data"), "demo_data") @@ -206,6 +219,16 @@ get_demo_frags <- function(filter_qc = TRUE, subset = TRUE) { #' - `remove_demo_data()`: `NULL` #' @details #' - `remove_demo_data()`: Remove the demo data from the BPCells data directory. +#' @examples +#' ####################################################################### +#' ## remove_demo_data() example +#' remove_demo_data() +#' +#' +#' ## Demo data folder is now empty +#' data_dir <- file.path(tools::R_user_dir("BPCells", which = "data"), "demo_data") +#' list.files(data_dir) +#' ####################################################################### #' @export remove_demo_data <- function() { data_dir <- file.path(tools::R_user_dir("BPCells", which = "data"), "demo_data") diff --git a/r/man/demo_data.Rd b/r/man/demo_data.Rd index 89d16a7e..b19795a4 100644 --- a/r/man/demo_data.Rd +++ b/r/man/demo_data.Rd @@ -31,17 +31,20 @@ remove_demo_data() } } \description{ -Functions to download matrices and fragments derived from the 10X Genomics PBMC 3k dataset, +\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} \cr Functions to download matrices and fragments derived from the 10X Genomics PBMC 3k dataset, with options to filter with common qc metrics, and to subset genes and fragments to only chromosome 4 and 11. } \details{ +These data functions are experimental. +The interface, as well as the demo dataset itself will likely undergo changes in the near future. + \strong{Data Processing}: The first time either \code{get_demo_mat()} are ran \code{get_demo_frags()}, demo data is downloaded and stored in the BPCells data directory (under \code{file.path(tools::R_user_dir("BPcells", which="data"), "demo_data")}). Subsequent calls to this function will use the previously downloaded matrix/fragments, given that the same combination of filtering and subsetting has been performed previously. -The preperation of this matrix can be reproduced by running the internal function \code{prepare_demo_data()} with \code{directory} set to the BPCells data directory. +The preparation of this matrix can be reproduced by running the internal function \code{prepare_demo_data()} with \code{directory} set to the BPCells data directory. In the case that demo data is not pre-downloaded and demo data download fails, \code{prepare_demo_data()} will act as a fallback. @@ -82,3 +85,22 @@ Filter + Subset \tab 1.2 \tab 12.3 \item \code{remove_demo_data()}: Remove the demo data from the BPCells data directory. } } +\examples{ +####################################################################### +## get_demo_mat() example +get_demo_mat() +####################################################################### +####################################################################### +## get_demo_frags() example +get_demo_frags() +####################################################################### +####################################################################### +## remove_demo_data() example +remove_demo_data() + + +## Demo data folder is now empty +data_dir <- file.path(tools::R_user_dir("BPCells", which = "data"), "demo_data") +list.files(data_dir) +####################################################################### +} diff --git a/r/man/prepare_demo_data.Rd b/r/man/prepare_demo_data.Rd index c1921924..b6368db8 100644 --- a/r/man/prepare_demo_data.Rd +++ b/r/man/prepare_demo_data.Rd @@ -13,7 +13,9 @@ prepare_demo_data( ) } \arguments{ -\item{directory}{(character) Where the input/output data should be stored. If NULL, a temporary directory is created.} +\item{directory}{(character) The directory where all the input/output data will be stored. +Downloaded intermediates will be stored in subdir \code{intermediates}. +If \code{NULL}, a temporary directory is created.} \item{filter_qc}{(bool) Whether to filter both the RNA and ATAC data using QC information.} @@ -21,11 +23,11 @@ prepare_demo_data( \item{timeout}{(numeric) Timeout for downloading files in seconds.} -\item{remove_input_data}{(logical) Whether to remove the downloaded non-procesed matrix, frags, gencode transcripts, and gencode genes -after processing.} +\item{remove_input_data}{(logical) Whether to remove the intermediate non-processed matrix, frags, gencode transcripts, +and gencode genes after processing. If this function errors out, will also remove intermediate data if \code{remove_input_data} is \code{TRUE}.} } \value{ -(list) A list with the RNA matrix under the name "mat", and the ATAC fragments under the name "frags". +(list) A list with the RNA matrix under the name \code{mat}, and the ATAC fragments under the name \code{frags}. } \description{ Downloads a 10x Genomics dataset consisting of 3k cells, then performs optional QC and subsetting. Holds subsetted objects in disk, From a321917e5a5f76e8c661da977e058a2c29c480f1 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Mon, 7 Apr 2025 13:48:03 -0700 Subject: [PATCH 09/15] [r] add in link for specifying dataset for demo data --- r/R/data.R | 7 +++++-- r/man/demo_data.Rd | 3 ++- r/man/prepare_demo_data.Rd | 4 +++- 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/r/R/data.R b/r/R/data.R index d3d8582a..e3bdaa59 100644 --- a/r/R/data.R +++ b/r/R/data.R @@ -8,7 +8,9 @@ #' Create a small demo matrix and fragment object. #' -#' Downloads a 10x Genomics dataset consisting of 3k cells, then performs optional QC and subsetting. Holds subsetted objects in disk, +#' Downloads a +#' [10x Genomics dataset](https://cf.10xgenomics.com/samples/cell-arc/2.0.0/pbmc_granulocyte_sorted_3k/pbmc_granulocyte_sorted_3k_web_summary.html), +#' consisting of 3k cells then performs optional QC and subsetting. Holds subsetted objects in disk, #' and returns a list with both the matrix and fragments. #' @param directory (character) The directory where all the input/output data will be stored. #' Downloaded intermediates will be stored in subdir `intermediates`. @@ -105,7 +107,8 @@ prepare_demo_data <- function(directory = NULL, filter_qc = TRUE, subset = TRUE, #' Retrieve BPCells demo data #' -#' `r lifecycle::badge("experimental")` \cr Functions to download matrices and fragments derived from the 10X Genomics PBMC 3k dataset, +#' `r lifecycle::badge("experimental")` \cr Functions to download matrices and fragments derived from a +#' [10X Genomics PBMC 3k dataset](https://cf.10xgenomics.com/samples/cell-arc/2.0.0/pbmc_granulocyte_sorted_3k/pbmc_granulocyte_sorted_3k_web_summary.html), #' with options to filter with common qc metrics, and to subset genes and fragments to only chromosome 4 and 11. #' @rdname demo_data #' @param filter_qc (bool) Whether to filter both the RNA and ATAC data using qc metrics (described in `details`). diff --git a/r/man/demo_data.Rd b/r/man/demo_data.Rd index b19795a4..cf581e12 100644 --- a/r/man/demo_data.Rd +++ b/r/man/demo_data.Rd @@ -31,7 +31,8 @@ remove_demo_data() } } \description{ -\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} \cr Functions to download matrices and fragments derived from the 10X Genomics PBMC 3k dataset, +\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} \cr Functions to download matrices and fragments derived from a +\href{https://cf.10xgenomics.com/samples/cell-arc/2.0.0/pbmc_granulocyte_sorted_3k/pbmc_granulocyte_sorted_3k_web_summary.html}{10X Genomics PBMC 3k dataset}, with options to filter with common qc metrics, and to subset genes and fragments to only chromosome 4 and 11. } \details{ diff --git a/r/man/prepare_demo_data.Rd b/r/man/prepare_demo_data.Rd index b6368db8..d66426dc 100644 --- a/r/man/prepare_demo_data.Rd +++ b/r/man/prepare_demo_data.Rd @@ -30,7 +30,9 @@ and gencode genes after processing. If this function errors out, will also remo (list) A list with the RNA matrix under the name \code{mat}, and the ATAC fragments under the name \code{frags}. } \description{ -Downloads a 10x Genomics dataset consisting of 3k cells, then performs optional QC and subsetting. Holds subsetted objects in disk, +Downloads a +\href{https://cf.10xgenomics.com/samples/cell-arc/2.0.0/pbmc_granulocyte_sorted_3k/pbmc_granulocyte_sorted_3k_web_summary.html}{10x Genomics dataset}, +consisting of 3k cells then performs optional QC and subsetting. Holds subsetted objects in disk, and returns a list with both the matrix and fragments. } \details{ From 4d7105936c6aa9d8d09e6daf237cfe1312aef472 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Mon, 7 Apr 2025 17:45:55 -0700 Subject: [PATCH 10/15] [r] add in better error handling in `prepare_demo_data()`, remove unnecessary parameterization + fix example styling --- r/R/data.R | 49 ++++++++++++++++++++---------------- r/man/demo_data.Rd | 14 ++++++++--- r/man/prepare_demo_data.Rd | 8 ++---- r/tests/testthat/test-data.R | 2 ++ 4 files changed, 41 insertions(+), 32 deletions(-) diff --git a/r/R/data.R b/r/R/data.R index e3bdaa59..8a743ac6 100644 --- a/r/R/data.R +++ b/r/R/data.R @@ -9,7 +9,7 @@ #' Create a small demo matrix and fragment object. #' #' Downloads a -#' [10x Genomics dataset](https://cf.10xgenomics.com/samples/cell-arc/2.0.0/pbmc_granulocyte_sorted_3k/pbmc_granulocyte_sorted_3k_web_summary.html), +#' [10x Genomics dataset](https://www.10xgenomics.com/datasets/pbmc-from-a-healthy-donor-granulocytes-removed-through-cell-sorting-3-k-1-standard-2-0-0), #' consisting of 3k cells then performs optional QC and subsetting. Holds subsetted objects in disk, #' and returns a list with both the matrix and fragments. #' @param directory (character) The directory where all the input/output data will be stored. @@ -18,8 +18,6 @@ #' @param filter_qc (bool) Whether to filter both the RNA and ATAC data using QC information. #' @param subset (bool) Whether to subset to only genes/insertions on chromosome 4 and 11. #' @param timeout (numeric) Timeout for downloading files in seconds. -#' @param remove_input_data (logical) Whether to remove the intermediate non-processed matrix, frags, gencode transcripts, -#' and gencode genes after processing. If this function errors out, will also remove intermediate data if `remove_input_data` is `TRUE`. #' @return (list) A list with the RNA matrix under the name `mat`, and the ATAC fragments under the name `frags`. #' @details #' This function downloads the 10x Genomics PBMC 3k dataset. @@ -28,14 +26,13 @@ #' The name of the matrix and fragments folders are `demo_mat` and `demo_frags` respectively. #' Additionally, choosing to qc filter appends a `_filtered`, and choosing to subset data appends a `_subsetted` to the name. #' @keywords internal -prepare_demo_data <- function(directory = NULL, filter_qc = TRUE, subset = TRUE, timeout = 300, remove_input_data = TRUE) { +prepare_demo_data <- function(directory = NULL, filter_qc = TRUE, subset = TRUE, timeout = 300) { if (is.null(directory)) { directory <- file.path(tempdir()) } intermediate_dir <- file.path(directory, "intermediates") dir.create(intermediate_dir, recursive = TRUE, showWarnings = FALSE) - # Delete all intermediates during exit if remove_input_data is TRUE. - on.exit(if (remove_input_data) unlink(intermediate_dir, recursive = TRUE)) + on.exit(unlink(intermediate_dir, recursive = TRUE)) mat_name <- "demo_mat" frags_name <- "demo_frags" @@ -49,23 +46,24 @@ prepare_demo_data <- function(directory = NULL, filter_qc = TRUE, subset = TRUE, } # Download matrix/frags if not done previously, and open url_base <- "https://cf.10xgenomics.com/samples/cell-arc/2.0.0/pbmc_granulocyte_sorted_3k/" - if (!file.exists(directory, "pbmc_3k_rna_raw")) { + # Recreate mat if mat is malformed + tryCatch({ + mat <- open_matrix_dir(file.path(directory, "pbmc_3k_rna_raw")) + }, error = function(e) { rna_raw_url <- paste0(url_base, "pbmc_granulocyte_sorted_3k_raw_feature_bc_matrix.h5") ensure_downloaded(file.path(intermediate_dir, "pbmc_3k_10x.h5"), rna_raw_url, timeout = timeout) mat <- open_matrix_10x_hdf5(file.path(intermediate_dir, "pbmc_3k_10x.h5"), feature_type="Gene Expression") %>% - write_matrix_dir(file.path(directory, "pbmc_3k_rna_raw")) - } else { - mat <- open_matrix_dir(file.path(directory, "pbmc_3k_rna_raw")) - } - # Check if we already ran import - if (!file.exists(file.path(directory, "pbmc_3k_frags"))) { + write_matrix_dir(file.path(directory, "pbmc_3k_rna_raw"), overwrite = TRUE) + }) + # Recreate frags if frags are malformed + tryCatch({ + frags <- open_fragments_dir(file.path(directory, "pbmc_3k_frags")) + }, error = function(e) { atac_raw_url <- paste0(url_base, "pbmc_granulocyte_sorted_3k_atac_fragments.tsv.gz") ensure_downloaded(file.path(intermediate_dir, "pbmc_3k_10x.fragments.tsv.gz"), atac_raw_url, timeout = timeout) frags <- open_fragments_10x(file.path(intermediate_dir, "pbmc_3k_10x.fragments.tsv.gz")) %>% - write_fragments_dir(file.path(directory, "pbmc_3k_frags")) - } else { - frags <- open_fragments_dir(file.path(directory, "pbmc_3k_frags")) - } + write_fragments_dir(file.path(directory, "pbmc_3k_frags"), overwrite = TRUE) + }) if (filter_qc) { # Download annotations for transcripts transcripts <- read_gencode_transcripts( @@ -99,7 +97,8 @@ prepare_demo_data <- function(directory = NULL, filter_qc = TRUE, subset = TRUE, filtered_genes <- gsub("\\..*", "", filtered_genes) mat <- mat[which(rownames(mat) %in% filtered_genes), ] frags <- frags %>% select_chromosomes(c("chr4", "chr11")) - } + } + dir.create(file.path(directory, frags_name), recursive = TRUE, showWarnings = FALSE) mat <- write_matrix_dir(mat, file.path(directory, mat_name), overwrite = TRUE) frags <- write_fragments_dir(frags, file.path(directory, frags_name), overwrite = TRUE) return(list(mat = mat, frags = frags)) @@ -108,7 +107,7 @@ prepare_demo_data <- function(directory = NULL, filter_qc = TRUE, subset = TRUE, #' Retrieve BPCells demo data #' #' `r lifecycle::badge("experimental")` \cr Functions to download matrices and fragments derived from a -#' [10X Genomics PBMC 3k dataset](https://cf.10xgenomics.com/samples/cell-arc/2.0.0/pbmc_granulocyte_sorted_3k/pbmc_granulocyte_sorted_3k_web_summary.html), +#' [10X Genomics PBMC 3k dataset](https://www.10xgenomics.com/datasets/pbmc-from-a-healthy-donor-granulocytes-removed-through-cell-sorting-3-k-1-standard-2-0-0), #' with options to filter with common qc metrics, and to subset genes and fragments to only chromosome 4 and 11. #' @rdname demo_data #' @param filter_qc (bool) Whether to filter both the RNA and ATAC data using qc metrics (described in `details`). @@ -159,8 +158,10 @@ prepare_demo_data <- function(directory = NULL, filter_qc = TRUE, subset = TRUE, #' @examples #' ####################################################################### #' ## get_demo_mat() example -#' get_demo_mat() #' ####################################################################### +#' get_demo_mat() +#' +#' #' @export get_demo_mat <- function(filter_qc = TRUE, subset = TRUE) { # Use the data directory for BPCells @@ -193,8 +194,10 @@ get_demo_mat <- function(filter_qc = TRUE, subset = TRUE) { #' @examples #' ####################################################################### #' ## get_demo_frags() example -#' get_demo_frags() #' ####################################################################### +#' get_demo_frags() +#' +#' #' @export get_demo_frags <- function(filter_qc = TRUE, subset = TRUE) { data_dir <- file.path(tools::R_user_dir("BPCells", which = "data"), "demo_data") @@ -225,13 +228,15 @@ get_demo_frags <- function(filter_qc = TRUE, subset = TRUE) { #' @examples #' ####################################################################### #' ## remove_demo_data() example +#' ####################################################################### #' remove_demo_data() #' #' #' ## Demo data folder is now empty #' data_dir <- file.path(tools::R_user_dir("BPCells", which = "data"), "demo_data") #' list.files(data_dir) -#' ####################################################################### +#' +#' #' @export remove_demo_data <- function() { data_dir <- file.path(tools::R_user_dir("BPCells", which = "data"), "demo_data") diff --git a/r/man/demo_data.Rd b/r/man/demo_data.Rd index cf581e12..e503aff3 100644 --- a/r/man/demo_data.Rd +++ b/r/man/demo_data.Rd @@ -32,7 +32,7 @@ remove_demo_data() } \description{ \ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} \cr Functions to download matrices and fragments derived from a -\href{https://cf.10xgenomics.com/samples/cell-arc/2.0.0/pbmc_granulocyte_sorted_3k/pbmc_granulocyte_sorted_3k_web_summary.html}{10X Genomics PBMC 3k dataset}, +\href{https://www.10xgenomics.com/datasets/pbmc-from-a-healthy-donor-granulocytes-removed-through-cell-sorting-3-k-1-standard-2-0-0}{10X Genomics PBMC 3k dataset}, with options to filter with common qc metrics, and to subset genes and fragments to only chromosome 4 and 11. } \details{ @@ -89,19 +89,25 @@ Filter + Subset \tab 1.2 \tab 12.3 \examples{ ####################################################################### ## get_demo_mat() example -get_demo_mat() ####################################################################### +get_demo_mat() + + ####################################################################### ## get_demo_frags() example -get_demo_frags() ####################################################################### +get_demo_frags() + + ####################################################################### ## remove_demo_data() example +####################################################################### remove_demo_data() ## Demo data folder is now empty data_dir <- file.path(tools::R_user_dir("BPCells", which = "data"), "demo_data") list.files(data_dir) -####################################################################### + + } diff --git a/r/man/prepare_demo_data.Rd b/r/man/prepare_demo_data.Rd index d66426dc..ea019c82 100644 --- a/r/man/prepare_demo_data.Rd +++ b/r/man/prepare_demo_data.Rd @@ -8,8 +8,7 @@ prepare_demo_data( directory = NULL, filter_qc = TRUE, subset = TRUE, - timeout = 300, - remove_input_data = TRUE + timeout = 300 ) } \arguments{ @@ -22,16 +21,13 @@ If \code{NULL}, a temporary directory is created.} \item{subset}{(bool) Whether to subset to only genes/insertions on chromosome 4 and 11.} \item{timeout}{(numeric) Timeout for downloading files in seconds.} - -\item{remove_input_data}{(logical) Whether to remove the intermediate non-processed matrix, frags, gencode transcripts, -and gencode genes after processing. If this function errors out, will also remove intermediate data if \code{remove_input_data} is \code{TRUE}.} } \value{ (list) A list with the RNA matrix under the name \code{mat}, and the ATAC fragments under the name \code{frags}. } \description{ Downloads a -\href{https://cf.10xgenomics.com/samples/cell-arc/2.0.0/pbmc_granulocyte_sorted_3k/pbmc_granulocyte_sorted_3k_web_summary.html}{10x Genomics dataset}, +\href{https://www.10xgenomics.com/datasets/pbmc-from-a-healthy-donor-granulocytes-removed-through-cell-sorting-3-k-1-standard-2-0-0}{10x Genomics dataset}, consisting of 3k cells then performs optional QC and subsetting. Holds subsetted objects in disk, and returns a list with both the matrix and fragments. } diff --git a/r/tests/testthat/test-data.R b/r/tests/testthat/test-data.R index 4583a0c9..6fa7e84f 100644 --- a/r/tests/testthat/test-data.R +++ b/r/tests/testthat/test-data.R @@ -12,5 +12,7 @@ test_that("Getting test data works", { frags <- get_demo_frags() expect_true(is(mat, "IterableMatrix")) expect_true(is(frags, "IterableFragments")) + ## Also make sure data can be built + expect_no_error(prepare_demo_data(file.path(tools::R_user_dir("BPCells", which = "data"), "demo_data"))) remove_demo_data() }) \ No newline at end of file From 09112029134666ef3d791f263ea485302aa2c1ed Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Mon, 7 Apr 2025 15:31:44 -0700 Subject: [PATCH 11/15] [r] add example for `trackplot_gene()` --- r/R/trackplots.R | 18 ++++++++++++++++++ r/man/trackplot_gene.Rd | 19 +++++++++++++++++++ 2 files changed, 37 insertions(+) diff --git a/r/R/trackplots.R b/r/R/trackplots.R index 998bfecd..956497c6 100644 --- a/r/R/trackplots.R +++ b/r/R/trackplots.R @@ -510,6 +510,24 @@ trackplot_coverage <- function(fragments, region, groups, #' @param label_size size for transcript labels in units of mm #' @return Plot of gene locations #' @seealso `trackplot_combine()`, `trackplot_coverage()`, `trackplot_loop()`, `trackplot_scalebar()` +#' @examples +#' transcripts <- read_gencode_transcripts( +#' file.path(tempdir(), "references"), release = "42", +#' annotation_set = "basic" +#' ) +#' +#' region <- gene_region(transcripts, "CD19", extend_bp = 1e5) +#' plot <- trackplot_gene(transcripts, region) +#' +#' ## Boiler plate for getting plot sizing correct +#' image_path <- file.path(tempdir(), "trackplot_gene.png") +#' ggsave( +#' image_path, +#' trackplot_gene(transcripts, region), +#' width = 6, height = 1 +#' ) +#' img <- png::readPNG(image_path) +#' grid::grid.raster(img) #' @export trackplot_gene <- function(transcripts, region, exon_size = 2.5, gene_size = 0.5, label_size = 11*.8/ggplot2::.pt, track_label="Genes", return_data = FALSE) { region <- normalize_ranges(region) diff --git a/r/man/trackplot_gene.Rd b/r/man/trackplot_gene.Rd index 7991e14a..cc52545b 100644 --- a/r/man/trackplot_gene.Rd +++ b/r/man/trackplot_gene.Rd @@ -47,6 +47,25 @@ Plot of gene locations \description{ Plot transcript models } +\examples{ +transcripts <- read_gencode_transcripts( + file.path(tempdir(), "references"), release = "42", + annotation_set = "basic" +) + +region <- gene_region(transcripts, "CD19", extend_bp = 1e5) +plot <- trackplot_gene(transcripts, region) + +## Boiler plate for getting plot sizing correct +image_path <- file.path(tempdir(), "trackplot_gene.png") +ggsave( + image_path, + trackplot_gene(transcripts, region), + width = 6, height = 1 +) +img <- png::readPNG(image_path) +grid::grid.raster(img) +} \seealso{ \code{trackplot_combine()}, \code{trackplot_coverage()}, \code{trackplot_loop()}, \code{trackplot_scalebar()} } From 6a850dedb4e801d1eb12543d2cbbd3e5b95c83ff Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Tue, 8 Apr 2025 18:58:40 -0700 Subject: [PATCH 12/15] [r] add examples for a trackplot functions --- r/R/genomeAnnotations.R | 10 +++ r/R/trackplots.R | 127 ++++++++++++++++++++++++--- r/man/gene_region.Rd | 11 +++ r/man/render_plot_from_storage.Rd | 20 +++++ r/man/trackplot_combine.Rd | 34 +++++++ r/man/trackplot_coverage.Rd | 20 +++++ r/man/trackplot_gene.Rd | 18 ++-- r/man/trackplot_genome_annotation.Rd | 21 +++++ r/man/trackplot_loop.Rd | 14 +++ r/man/trackplot_scalebar.Rd | 6 ++ 10 files changed, 257 insertions(+), 24 deletions(-) create mode 100644 r/man/render_plot_from_storage.Rd diff --git a/r/R/genomeAnnotations.R b/r/R/genomeAnnotations.R index 158a6378..33620ee7 100644 --- a/r/R/genomeAnnotations.R +++ b/r/R/genomeAnnotations.R @@ -328,6 +328,16 @@ canonical_gene_symbol <- function(query, gene_mapping = human_gene_mapping) { #' @param extend_bp Bases to extend region upstream and downstream of gene. If length 1, extension is #' symmetric. If length 2, provide upstream extension then downstream extension as positive distances. #' @return List of chr, start, end positions for use with trackplot functions. +#' @examples +#' ## Prep data +#' genes <- read_gencode_transcripts( +#' file.path(tempdir(), "references"), release = "42", +#' annotation_set = "basic", +#' features = "transcript" +#' ) +#' +#' ## Get gene region +#' gene_region(genes, "CD19", extend_bp = 1e5) #' @export gene_region <- function(genes, gene_symbol, extend_bp = c(1e4, 1e4), gene_mapping = human_gene_mapping) { genes <- normalize_ranges(genes, metadata_cols = c("strand", "gene_name")) diff --git a/r/R/trackplots.R b/r/R/trackplots.R index 956497c6..ed377750 100644 --- a/r/R/trackplots.R +++ b/r/R/trackplots.R @@ -252,6 +252,22 @@ trackplot_normalize_ranges_with_metadata <- function(data, metadata) { return(data) } +#' Render a plot with intermediate disk storage step +#' +#' Take a plotting object and save in temp storage, so it can be outputted with exact dimensions. +#' Primarily used to allow for adjusting plot dimensions within function reference examples. +#' @param plot (ggplot) ggplot output from a plotting function +#' @param width (numeric) width of rendered plot +#' @param height (numeric) height of rendered plot +#' @keywords internal +render_plot_from_storage <- function(plot, width = 6, height) { + assert_is(plot, "ggplot") + image_path <- tempfile(fileext = ".png") + ggplot2::ggsave(image_path, plot, width = width, height = height) + img <- png::readPNG(image_path) + grid::grid.raster(img) +} + #' Combine track plots #' #' Combines multiple track plots of the same region into a single grid. @@ -268,6 +284,39 @@ trackplot_normalize_ranges_with_metadata <- function(data, metadata) { #' the text label, y-axis, and plot body. The relative height of each row is given #' by heights. A shared title and x-axis are put at the top. #' @seealso `trackplot_coverage()`, `trackplot_gene()`, `trackplot_loop()`, `trackplot_scalebar()` +#' @examples +#' ## Prep data +#' frags <- get_demo_frags() +#' +#' ## Use genes and blacklist to determine proper number of reads per cell +#' genes <- read_gencode_transcripts( +#' file.path(tempdir(), "references"), release = "42", +#' annotation_set = "basic", +#' features = "transcript" +#' ) +#' blacklist <- read_encode_blacklist(file.path(tempdir(), "references"), genome="hg38") +#' read_counts <- qc_scATAC(frags, genes, blacklist)$nFrags +#' region <- "chr4:3034877-4034877" +#' cell_types <- paste("Group", rep(1:3, length.out = length(cellNames(frags)))) +#' transcripts <- read_gencode_transcripts( +#' file.path(tempdir(), "references"), release = "42", +#' annotation_set = "basic" +#' ) +#' region <- "chr4:3034877-4034877" +#' +#' +#' ## Get all trackplots and scalebars to combine +#' plot_scalebar <- trackplot_scalebar(region) +#' plot_gene <- trackplot_gene(transcripts, region) +#' plot_coverage <- trackplot_coverage(frags, region, groups = cell_types, cell_read_counts = read_counts) +#' +#' +#' ## Combine trackplots and render +#' ## Also remove colors from gene track +#' plot <- trackplot_combine( +#' list(plot_scalebar, plot_coverage, plot_gene + ggplot2::guides(color = "none")) +#' ) +#' BPCells:::render_plot_from_storage(plot, width = 6, height = 4) #' @export trackplot_combine <- function(tracks, side_plot = NULL, title = NULL, side_plot_width = 0.3) { for (plot in tracks) { @@ -407,6 +456,26 @@ trackplot_combine <- function(tracks, side_plot = NULL, title = NULL, side_plot_ #' specify the labels for each track. If `return_data` or `return_plot_list` is #' `TRUE`, the return value will be modified accordingly. #' @seealso `trackplot_combine()`, `trackplot_gene()`, `trackplot_loop()`, `trackplot_scalebar()` +#' @examples +## Prep data +#' frags <- get_demo_frags() +#' +#' ## Use genes and blacklist to determine proper number of reads per cell +#' genes <- read_gencode_transcripts( +#' file.path(tempdir(), "references"), release = "42", +#' annotation_set = "basic", +#' features = "transcript" +#' ) +#' blacklist <- read_encode_blacklist(file.path(tempdir(), "references"), genome="hg38") +#' read_counts <- qc_scATAC(frags, genes, blacklist)$nFrags +#' region <- "chr4:3034877-4034877" +#' cell_types <- paste("Group", rep(1:3, length.out = length(cellNames(frags)))) +#' +#' +#' BPCells:::render_plot_from_storage( +#' trackplot_coverage(frags, region, groups = cell_types, cell_read_counts = read_counts), +#' width = 6, height = 3 +#' ) #' @export trackplot_coverage <- function(fragments, region, groups, cell_read_counts, @@ -511,23 +580,17 @@ trackplot_coverage <- function(fragments, region, groups, #' @return Plot of gene locations #' @seealso `trackplot_combine()`, `trackplot_coverage()`, `trackplot_loop()`, `trackplot_scalebar()` #' @examples +#' ## Prep data #' transcripts <- read_gencode_transcripts( #' file.path(tempdir(), "references"), release = "42", -#' annotation_set = "basic" +#' annotation_set = "basic", features = "transcript" #' ) +#' region <- "chr4:3034877-4034877" #' -#' region <- gene_region(transcripts, "CD19", extend_bp = 1e5) -#' plot <- trackplot_gene(transcripts, region) #' -#' ## Boiler plate for getting plot sizing correct -#' image_path <- file.path(tempdir(), "trackplot_gene.png") -#' ggsave( -#' image_path, -#' trackplot_gene(transcripts, region), -#' width = 6, height = 1 -#' ) -#' img <- png::readPNG(image_path) -#' grid::grid.raster(img) +#' ## Plot gene trackplot +#' plot <- trackplot_gene(transcripts, region) +#' BPCells:::render_plot_from_storage(plot, width = 6, height = 1) #' @export trackplot_gene <- function(transcripts, region, exon_size = 2.5, gene_size = 0.5, label_size = 11*.8/ggplot2::.pt, track_label="Genes", return_data = FALSE) { region <- normalize_ranges(region) @@ -625,6 +688,28 @@ trackplot_gene <- function(transcripts, region, exon_size = 2.5, gene_size = 0.5 #' @param show_strand If TRUE, show strand direction as arrows #' @return Plot of genomic loci if return_data is FALSE, otherwise returns the data frame used to generate the plot #' @seealso `trackplot_combine()`, `trackplot_coverage()`, `trackplot_loop()`, `trackplot_scalebar()`, `trackplot_gene()` +#' @examples +## Prep data +## Peaks generated from demo frags, as input into `call_peaks_tile()` +#' peaks <- tibble::tibble( +#' chr = factor(rep("chr4", 16)), +#' start = c(3041400, 3041733, 3037400, 3041933, 3040466, 3041200, +#' 3038200, 3038000, 3040266, 3037733, 3040800, 3042133, +#' 3038466, 3037200, 3043333, 3040066), +#' end = c(3041600, 3041933, 3037600, 3042133, 3040666, 3041400, +#' 3038400, 3038200, 3040466, 3037933, 3041000, 3042333, +#' 3038666, 3037400, 3043533, 3040266), +#' enrichment = c(46.4, 43.5, 28.4, 27.3, 17.3, 11.7, +#' 10.5, 7.95, 7.22, 6.86, 6.32, 6.14, +#' 5.96, 5.06, 4.51, 3.43) +#' ) +#' region <- "chr4:3034877-3044877" +#' +#' ## Plot peaks +#' BPCells:::render_plot_from_storage( +#' trackplot_genome_annotation(peaks, region, color_by = "enrichment"), +#' width = 6, height = 1 +#' ) #' @export trackplot_genome_annotation <- function(loci, region, color_by = NULL, colors = NULL, label_by = NULL, label_size = 11*.8/ggplot2::.pt, show_strand=FALSE, annotation_size = 2.5, track_label="Peaks", return_data = FALSE) { @@ -747,6 +832,19 @@ trackplot_genome_annotation <- function(loci, region, color_by = NULL, colors = #' #' @return Plot of loops connecting genomic coordinates #' @seealso `trackplot_combine()`, `trackplot_coverage()`, `trackplot_gene()`, `trackplot_scalebar()`, `trackplot_genome_annotation()` +#' @examples +#' peaks <-c(3054877, 3334877, 3534877, 3634877, 3734877) +#' loops <- tibble::tibble( +#' chr = "chr4", +#' start = fake_peaks[c(1,1,2,3)], +#' end = fake_peaks[c(2,3,4,5)], +#' score = c(4,1,3,2) +#' ) +#' region <- "chr4:3034877-4034877" +#' +#' ## Plot loops +#' plot <- trackplot_loop(loops, region, color_by = "score") +#' BPCells:::render_plot_from_storage(plot, width = 6, height = 1) #' @export trackplot_loop <- function(loops, region, color_by=NULL, colors=NULL, allow_truncated=TRUE, curvature=0.75, track_label="Links", return_data = FALSE) { region <- normalize_ranges(region) @@ -844,6 +942,11 @@ trackplot_loop <- function(loops, region, color_by=NULL, colors=NULL, allow_trun #' #' @return Plot with coordinates and scalebar for plotted genomic region #' @seealso `trackplot_combine()`, `trackplot_coverage()`, `trackplot_gene()`, `trackplot_loop()` +#' @examples +#' region <- "chr4:3034877-3044877" +#' BPCells:::render_plot_from_storage( +#' trackplot_scalebar(region), width = 6, height = 1 +#' ) #' @export trackplot_scalebar <- function(region, font_pt=11) { region <- normalize_ranges(region) diff --git a/r/man/gene_region.Rd b/r/man/gene_region.Rd index 81082017..4855da39 100644 --- a/r/man/gene_region.Rd +++ b/r/man/gene_region.Rd @@ -35,3 +35,14 @@ Conveniently look up the region of a gene by gene symbol. The value returned by can be used as the \code{region} argument for trackplot functions such as \code{trackplot_coverage()} or \code{trackplot_gene()} } +\examples{ +## Prep data +genes <- read_gencode_transcripts( + file.path(tempdir(), "references"), release = "42", + annotation_set = "basic", + features = "transcript" +) + +## Get gene region +gene_region(genes, "CD19", extend_bp = 1e5) +} diff --git a/r/man/render_plot_from_storage.Rd b/r/man/render_plot_from_storage.Rd new file mode 100644 index 00000000..1c031e62 --- /dev/null +++ b/r/man/render_plot_from_storage.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/trackplots.R +\name{render_plot_from_storage} +\alias{render_plot_from_storage} +\title{Render a plot with intermediate disk storage step} +\usage{ +render_plot_from_storage(plot, width = 6, height) +} +\arguments{ +\item{plot}{(ggplot) ggplot output from a plotting function} + +\item{width}{(numeric) width of rendered plot} + +\item{height}{(numeric) height of rendered plot} +} +\description{ +Take a plotting object and save in temp storage, so it can be outputted with exact dimensions. +Primarily used to allow for adjusting plot dimensions within function reference examples. +} +\keyword{internal} diff --git a/r/man/trackplot_combine.Rd b/r/man/trackplot_combine.Rd index 6e0bc283..a18d0b34 100644 --- a/r/man/trackplot_combine.Rd +++ b/r/man/trackplot_combine.Rd @@ -32,6 +32,40 @@ by heights. A shared title and x-axis are put at the top. Combines multiple track plots of the same region into a single grid. Uses the \code{patchwork} package to perform the alignment. } +\examples{ +## Prep data +frags <- get_demo_frags() + +## Use genes and blacklist to determine proper number of reads per cell +genes <- read_gencode_transcripts( + file.path(tempdir(), "references"), release = "42", + annotation_set = "basic", + features = "transcript" +) +blacklist <- read_encode_blacklist(file.path(tempdir(), "references"), genome="hg38") +read_counts <- qc_scATAC(frags, genes, blacklist)$nFrags +region <- "chr4:3034877-4034877" +cell_types <- paste("Group", rep(1:3, length.out = length(cellNames(frags)))) +transcripts <- read_gencode_transcripts( + file.path(tempdir(), "references"), release = "42", + annotation_set = "basic" +) +region <- "chr4:3034877-4034877" + + +## Get all trackplots and scalebars to combine +plot_scalebar <- trackplot_scalebar(region) +plot_gene <- trackplot_gene(transcripts, region) +plot_coverage <- trackplot_coverage(frags, region, groups = cell_types, cell_read_counts = read_counts) + + +## Combine trackplots and render +## Also remove colors from gene track +plot <- trackplot_combine( + list(plot_scalebar, plot_coverage, plot_gene + ggplot2::guides(color = "none")) +) +BPCells:::render_plot_from_storage(plot, width = 6, height = 4) +} \seealso{ \code{trackplot_coverage()}, \code{trackplot_gene()}, \code{trackplot_loop()}, \code{trackplot_scalebar()} } diff --git a/r/man/trackplot_coverage.Rd b/r/man/trackplot_coverage.Rd index c6225c9f..9abf0fe1 100644 --- a/r/man/trackplot_coverage.Rd +++ b/r/man/trackplot_coverage.Rd @@ -58,6 +58,26 @@ specify the labels for each track. If \code{return_data} or \code{return_plot_li Plot a pseudobulk genome track, showing the number of fragment insertions across a region for each cell type or group. } +\examples{ +frags <- get_demo_frags() + +## Use genes and blacklist to determine proper number of reads per cell +genes <- read_gencode_transcripts( + file.path(tempdir(), "references"), release = "42", + annotation_set = "basic", + features = "transcript" +) +blacklist <- read_encode_blacklist(file.path(tempdir(), "references"), genome="hg38") +read_counts <- qc_scATAC(frags, genes, blacklist)$nFrags +region <- "chr4:3034877-4034877" +cell_types <- paste("Group", rep(1:3, length.out = length(cellNames(frags)))) + + +BPCells:::render_plot_from_storage( + trackplot_coverage(frags, region, groups = cell_types, cell_read_counts = read_counts), + width = 6, height = 3 +) +} \seealso{ \code{trackplot_combine()}, \code{trackplot_gene()}, \code{trackplot_loop()}, \code{trackplot_scalebar()} } diff --git a/r/man/trackplot_gene.Rd b/r/man/trackplot_gene.Rd index cc52545b..96285492 100644 --- a/r/man/trackplot_gene.Rd +++ b/r/man/trackplot_gene.Rd @@ -48,23 +48,17 @@ Plot of gene locations Plot transcript models } \examples{ +## Prep data transcripts <- read_gencode_transcripts( file.path(tempdir(), "references"), release = "42", - annotation_set = "basic" + annotation_set = "basic", features = "transcript" ) +region <- "chr4:3034877-4034877" -region <- gene_region(transcripts, "CD19", extend_bp = 1e5) -plot <- trackplot_gene(transcripts, region) -## Boiler plate for getting plot sizing correct -image_path <- file.path(tempdir(), "trackplot_gene.png") -ggsave( - image_path, - trackplot_gene(transcripts, region), - width = 6, height = 1 -) -img <- png::readPNG(image_path) -grid::grid.raster(img) +## Plot gene trackplot +plot <- trackplot_gene(transcripts, region) +BPCells:::render_plot_from_storage(plot, width = 6, height = 1) } \seealso{ \code{trackplot_combine()}, \code{trackplot_coverage()}, \code{trackplot_loop()}, \code{trackplot_scalebar()} diff --git a/r/man/trackplot_genome_annotation.Rd b/r/man/trackplot_genome_annotation.Rd index 71af5964..2f9065a2 100644 --- a/r/man/trackplot_genome_annotation.Rd +++ b/r/man/trackplot_genome_annotation.Rd @@ -47,6 +47,27 @@ Plot of genomic loci if return_data is FALSE, otherwise returns the data frame u \description{ Plot range-based annotation tracks (e.g. peaks) } +\examples{ +peaks <- tibble::tibble( + chr = factor(rep("chr4", 16)), + start = c(3041400, 3041733, 3037400, 3041933, 3040466, 3041200, + 3038200, 3038000, 3040266, 3037733, 3040800, 3042133, + 3038466, 3037200, 3043333, 3040066), + end = c(3041600, 3041933, 3037600, 3042133, 3040666, 3041400, + 3038400, 3038200, 3040466, 3037933, 3041000, 3042333, + 3038666, 3037400, 3043533, 3040266), + enrichment = c(46.4, 43.5, 28.4, 27.3, 17.3, 11.7, + 10.5, 7.95, 7.22, 6.86, 6.32, 6.14, + 5.96, 5.06, 4.51, 3.43) +) +region <- "chr4:3034877-3044877" + +## Plot peaks +BPCells:::render_plot_from_storage( + trackplot_genome_annotation(peaks, region, color_by = "enrichment"), + width = 6, height = 1 +) +} \seealso{ \code{trackplot_combine()}, \code{trackplot_coverage()}, \code{trackplot_loop()}, \code{trackplot_scalebar()}, \code{trackplot_gene()} } diff --git a/r/man/trackplot_loop.Rd b/r/man/trackplot_loop.Rd index 188517f9..c96e0c29 100644 --- a/r/man/trackplot_loop.Rd +++ b/r/man/trackplot_loop.Rd @@ -41,6 +41,20 @@ Plot of loops connecting genomic coordinates \description{ Plot loops } +\examples{ +peaks <-c(3054877, 3334877, 3534877, 3634877, 3734877) +loops <- tibble::tibble( + chr = "chr4", + start = fake_peaks[c(1,1,2,3)], + end = fake_peaks[c(2,3,4,5)], + score = c(4,1,3,2) +) +region <- "chr4:3034877-4034877" + +## Plot loops +plot <- trackplot_loop(loops, region, color_by = "score") +BPCells:::render_plot_from_storage(plot, width = 6, height = 1) +} \seealso{ \code{trackplot_combine()}, \code{trackplot_coverage()}, \code{trackplot_gene()}, \code{trackplot_scalebar()}, \code{trackplot_genome_annotation()} } diff --git a/r/man/trackplot_scalebar.Rd b/r/man/trackplot_scalebar.Rd index 9de670db..f52f663e 100644 --- a/r/man/trackplot_scalebar.Rd +++ b/r/man/trackplot_scalebar.Rd @@ -18,6 +18,12 @@ Plot with coordinates and scalebar for plotted genomic region \description{ Plots a human-readable scale bar and coordinates of the region being plotted } +\examples{ +region <- "chr4:3034877-3044877" +BPCells:::render_plot_from_storage( + trackplot_scalebar(region), width = 6, height = 1 +) +} \seealso{ \code{trackplot_combine()}, \code{trackplot_coverage()}, \code{trackplot_gene()}, \code{trackplot_loop()} } From 20bf3ec8351b3f4124d534b48b51700c71c39fc6 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Wed, 9 Apr 2025 14:39:14 -0700 Subject: [PATCH 13/15] [r] add examples for single cell plots --- r/R/plots.R | 88 +++++++++++++++++++++++++++++++++++ r/man/plot_dot.Rd | 12 +++++ r/man/plot_embedding.Rd | 42 +++++++++++++++++ r/man/plot_fragment_length.Rd | 4 ++ r/man/plot_read_count_knee.Rd | 8 ++++ r/man/plot_tss_profile.Rd | 12 +++++ r/man/plot_tss_scatter.Rd | 15 ++++++ 7 files changed, 181 insertions(+) diff --git a/r/R/plots.R b/r/R/plots.R index 5da3aaeb..e37ad784 100644 --- a/r/R/plots.R +++ b/r/R/plots.R @@ -108,6 +108,13 @@ continuous_palette <- function(name) { #' @param return_data If true, return data from just before plotting rather than a plot. #' @param apply_styling If false, return a plot without pretty styling applied #' @return ggplot2 plot object +#' @examples +#' ## Prep data +#' mat <- get_demo_mat(filter_qc = FALSE, subset = FALSE) +#' reads_per_cell <- colSums(mat) +#' +#' # Render knee plot +#' plot_read_count_knee(reads_per_cell, cutoff = 1e3) #' @export plot_read_count_knee <- function(read_counts, cutoff = NULL, return_data = FALSE, apply_styling = TRUE) { # Keep ~1k cells per order of magnitude to speed up plotting @@ -178,6 +185,20 @@ plot_read_count_knee <- function(read_counts, cutoff = NULL, return_data = FALSE #' @param min_tss Minimum TSS Enrichment cutoff #' @param bins Number of bins for density calculation #' @inheritParams plot_embedding +#' @examples +#' ## Prep data +#' frags <- get_demo_frags(filter_qc = FALSE, subset = FALSE) +#' genes <- read_gencode_transcripts( +#' file.path(tempdir(), "references"), release = "42", +#' annotation_set = "basic", +#' features = "transcript" +#' ) +#' blacklist <- read_encode_blacklist(file.path(tempdir(), "references"), genome="hg38") +#' atac_qc <- qc_scATAC(frags, genes, blacklist) +#' +#' +#' ## Render tss enrichment vs fragment plot +#' plot_tss_scatter(atac_qc, min_frags = 1000, min_tss = 10) #' @export plot_tss_scatter <- function(atac_qc, min_frags = NULL, min_tss = NULL, bins = 100, apply_styling = TRUE) { assert_is(atac_qc, "data.frame") @@ -255,6 +276,9 @@ plot_tss_scatter <- function(atac_qc, min_frags = NULL, min_tss = NULL, bins = 1 #' @param max_length Maximum length to show on the plot #' @inheritParams plot_embedding #' @return Numeric vector where index i contans the number of length-i fragments +#' @examples +#' frags <- get_demo_frags(filter_qc = FALSE, subset = FALSE) +#' plot_fragment_length(frags) #' @export plot_fragment_length <- function(fragments, max_length = 500, return_data = FALSE, apply_styling = TRUE) { assert_is(fragments, "IterableFragments") @@ -297,6 +321,17 @@ plot_fragment_length <- function(fragments, max_length = 500, return_data = FALS #' @param genes Coordinate ranges for genes (must include strand) #' @param smooth Number of bases to smooth over (rolling average) #' @seealso `footprint()`, `plot_tf_footprint()` +#' @examples +#' ## Prep data +#' frags <- get_demo_frags(filter_qc = FALSE, subset = FALSE) +#' genes <- read_gencode_transcripts( +#' file.path(tempdir(), "references"), release = "42", +#' annotation_set = "basic", +#' features = "transcript" +#' ) +#' +#' ## Plot tss profile +#' plot_tss_profile(frags, genes) #' @export plot_tss_profile <- function(fragments, genes, cell_groups = rlang::rep_along(cellNames(fragments), "all"), flank = 2000L, smooth = 0L, zero_based_coords = !is(genes, "GRanges"), @@ -485,6 +520,48 @@ collect_features <- function(source, features = NULL, gene_mapping = human_gene_ #' @return By default, returns a ggplot2 object with all the requested features plotted #' in a grid. If `return_data` or `return_plot_list` is called, the return value will #' match that argument. +#' @examples +## Prep data +#' set.seed(123) +#' mat <- get_demo_mat() +#' ## Normalize matrix +#' mat_norm <- log1p(multiply_cols(mat, 1/colSums(mat)) * 10000) %>% write_matrix_memory(compress = FALSE) +#' ## Get variable genes +#' stats <- matrix_stats(mat, row_stats = "variance") +#' variable_genes <- order(stats$row_stats["variance",], decreasing=TRUE) %>% +#' head(1000) %>% +#' sort() +#' # Z score normalize genes +#' mat_norm <- mat[variable_genes, ] +#' gene_means <- stats$row_stats['mean', variable_genes] +#' gene_vars <- stats$row_stats['variance', variable_genes] +#' mat_norm <- (mat_norm - gene_means) / gene_vars +#' ## Save matrix to memory +#' mat_norm <- mat_norm %>% write_matrix_memory(compress = FALSE) +#' ## Run SVD +#' svd <- BPCells::svds(mat_norm, k = 10) +#' pca <- multiply_cols(svd$v, svd$d) +#' ## Get UMAP +#' umap <- uwot::umap(pca) +#' ## Get clusters +#' clusts <- knn_hnsw(pca, ef = 500) %>% +#' knn_to_snn_graph() %>% +#' cluster_graph_louvain() +#' +#' +#' ## Plot embedding +#' print(length(clusts)) +#' +#' plot_embedding(clusts, umap) +#' +#' +#' ### Can also plot by features +#' #plot_embedding( +#' # source = mat, +#' # umap, +#' # features = c("MS4A1", "CD3E"), +#' #) +#' #' @export plot_embedding <- function(source, embedding, features = NULL, quantile_range = c(0.01, 0.99), @@ -741,6 +818,17 @@ rotate_x_labels <- function(degrees = 45) { #' @inheritParams plot_embedding #' @param gene_mapping An optional vector for gene name matching with match_gene_symbol(). #' @param colors Color scale for plot +#' @examples +#' ## Prep data +#' mat <- get_demo_mat() +#' cell_types <- paste("Group", rep(1:3, length.out = length(colnames(mat)))) +#' +#' ## Plot dot +#' plot <- plot_dot(mat, c("MS4A1", "CD3E"), cell_types) +#' +#' BPCells:::render_plot_from_storage( +#' plot, width = 4, height = 5 +#' ) #' @export plot_dot <- function(source, features, groups, group_order = NULL, gene_mapping = human_gene_mapping, colors = c("lightgrey", "#4682B4"), diff --git a/r/man/plot_dot.Rd b/r/man/plot_dot.Rd index 61bd55fe..e92da530 100644 --- a/r/man/plot_dot.Rd +++ b/r/man/plot_dot.Rd @@ -38,3 +38,15 @@ map to zero)} Plot feature levels per group or cluster as a grid of dots. Dots are colored by z-score normalized average expression, and sized by percent non-zero. } +\examples{ +## Prep data +mat <- get_demo_mat() +cell_types <- paste("Group", rep(1:3, length.out = length(colnames(mat)))) + +## Plot dot +plot <- plot_dot(mat, c("MS4A1", "CD3E"), cell_types) + +BPCells:::render_plot_from_storage( + plot, width = 4, height = 5 +) +} diff --git a/r/man/plot_embedding.Rd b/r/man/plot_embedding.Rd index 10d4dc9c..281c49ae 100644 --- a/r/man/plot_embedding.Rd +++ b/r/man/plot_embedding.Rd @@ -96,3 +96,45 @@ are repeatedly multiplied by the smoothing matrix and re-scaled so the average value stays the same. } } +\examples{ +set.seed(123) +mat <- get_demo_mat() +## Normalize matrix +mat_norm <- log1p(multiply_cols(mat, 1/colSums(mat)) * 10000) \%>\% write_matrix_memory(compress = FALSE) +## Get variable genes +stats <- matrix_stats(mat, row_stats = "variance") +variable_genes <- order(stats$row_stats["variance",], decreasing=TRUE) \%>\% + head(1000) \%>\% + sort() +# Z score normalize genes +mat_norm <- mat[variable_genes, ] +gene_means <- stats$row_stats['mean', variable_genes] +gene_vars <- stats$row_stats['variance', variable_genes] +mat_norm <- (mat_norm - gene_means) / gene_vars +## Save matrix to memory +mat_norm <- mat_norm \%>\% write_matrix_memory(compress = FALSE) +## Run SVD +svd <- BPCells::svds(mat_norm, k = 10) +pca <- multiply_cols(svd$v, svd$d) +## Get UMAP +umap <- uwot::umap(pca) +## Get clusters +clusts <- knn_hnsw(pca, ef = 500) \%>\% + knn_to_snn_graph() \%>\% + cluster_graph_louvain() + + +## Plot embedding +print(length(clusts)) + +plot_embedding(clusts, umap) + + +### Can also plot by features +#plot_embedding( +# source = mat, +# umap, +# features = c("MS4A1", "CD3E"), +#) + +} diff --git a/r/man/plot_fragment_length.Rd b/r/man/plot_fragment_length.Rd index c86f0ca4..0931b560 100644 --- a/r/man/plot_fragment_length.Rd +++ b/r/man/plot_fragment_length.Rd @@ -29,3 +29,7 @@ x-axis, and proportion of fragments on the y-axis. Typical plots will show 10-basepair periodicity, as well as humps spaced at multiples of a nucleosome width (about 150bp). } +\examples{ +frags <- get_demo_frags(filter_qc = FALSE, subset = FALSE) +plot_fragment_length(frags) +} diff --git a/r/man/plot_read_count_knee.Rd b/r/man/plot_read_count_knee.Rd index 3fe07c7b..b4ffbaad 100644 --- a/r/man/plot_read_count_knee.Rd +++ b/r/man/plot_read_count_knee.Rd @@ -29,3 +29,11 @@ Plots read count rank vs. number of reads on a log-log scale. \details{ Performs logarithmic downsampling to reduce the number of points plotted } +\examples{ +## Prep data +mat <- get_demo_mat(filter_qc = FALSE, subset = FALSE) +reads_per_cell <- colSums(mat) + +# Render knee plot +plot_read_count_knee(reads_per_cell, cutoff = 1e3) +} diff --git a/r/man/plot_tss_profile.Rd b/r/man/plot_tss_profile.Rd index e2f99919..08a5334f 100644 --- a/r/man/plot_tss_profile.Rd +++ b/r/man/plot_tss_profile.Rd @@ -40,6 +40,18 @@ Plot the enrichmment of insertions relative to transcription start sites (TSS). Typically, this plot shows strong enrichment of insertions near a TSS, and a small bump downstream around 220bp downstream of the TSS for the +1 nucleosome. } +\examples{ +## Prep data +frags <- get_demo_frags(filter_qc = FALSE, subset = FALSE) +genes <- read_gencode_transcripts( + file.path(tempdir(), "references"), release = "42", + annotation_set = "basic", + features = "transcript" +) + +## Plot tss profile +plot_tss_profile(frags, genes) +} \seealso{ \code{footprint()}, \code{plot_tf_footprint()} } diff --git a/r/man/plot_tss_scatter.Rd b/r/man/plot_tss_scatter.Rd index 94d3ddb0..9c75ffd4 100644 --- a/r/man/plot_tss_scatter.Rd +++ b/r/man/plot_tss_scatter.Rd @@ -28,3 +28,18 @@ Density scatter plot with log10(fragment_count) on the x-axis and TSS enrichment on the y-axis. This plot is most useful to select which cell barcodes in an experiment correspond to high-quality cells } +\examples{ +## Prep data +frags <- get_demo_frags(filter_qc = FALSE, subset = FALSE) +genes <- read_gencode_transcripts( + file.path(tempdir(), "references"), release = "42", + annotation_set = "basic", + features = "transcript" +) +blacklist <- read_encode_blacklist(file.path(tempdir(), "references"), genome="hg38") +atac_qc <- qc_scATAC(frags, genes, blacklist) + + +## Render tss enrichment vs fragment plot +plot_tss_scatter(atac_qc, min_frags = 1000, min_tss = 10) +} From 920007b0f5528a12681a81321e328f1a2ab522c9 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Wed, 9 Apr 2025 15:30:06 -0700 Subject: [PATCH 14/15] [r] remove default from `render_plot_from_storage()` --- r/R/trackplots.R | 2 +- r/man/render_plot_from_storage.Rd | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/r/R/trackplots.R b/r/R/trackplots.R index ed377750..4cf9761f 100644 --- a/r/R/trackplots.R +++ b/r/R/trackplots.R @@ -260,7 +260,7 @@ trackplot_normalize_ranges_with_metadata <- function(data, metadata) { #' @param width (numeric) width of rendered plot #' @param height (numeric) height of rendered plot #' @keywords internal -render_plot_from_storage <- function(plot, width = 6, height) { +render_plot_from_storage <- function(plot, width, height) { assert_is(plot, "ggplot") image_path <- tempfile(fileext = ".png") ggplot2::ggsave(image_path, plot, width = width, height = height) diff --git a/r/man/render_plot_from_storage.Rd b/r/man/render_plot_from_storage.Rd index 1c031e62..bfcea1f6 100644 --- a/r/man/render_plot_from_storage.Rd +++ b/r/man/render_plot_from_storage.Rd @@ -4,7 +4,7 @@ \alias{render_plot_from_storage} \title{Render a plot with intermediate disk storage step} \usage{ -render_plot_from_storage(plot, width = 6, height) +render_plot_from_storage(plot, width, height) } \arguments{ \item{plot}{(ggplot) ggplot output from a plotting function} From 14407a11a53b2e7e7fffbe5d449abcc74c783abb Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Wed, 9 Apr 2025 15:44:42 -0700 Subject: [PATCH 15/15] [r] add some small syntax fixes to `trackplot_genome_annotation()` `trackplot_loop()` --- r/R/trackplots.R | 12 ++++++------ r/man/trackplot_genome_annotation.Rd | 2 ++ r/man/trackplot_loop.Rd | 8 ++++---- 3 files changed, 12 insertions(+), 10 deletions(-) diff --git a/r/R/trackplots.R b/r/R/trackplots.R index 4cf9761f..5f04cbb9 100644 --- a/r/R/trackplots.R +++ b/r/R/trackplots.R @@ -689,8 +689,8 @@ trackplot_gene <- function(transcripts, region, exon_size = 2.5, gene_size = 0.5 #' @return Plot of genomic loci if return_data is FALSE, otherwise returns the data frame used to generate the plot #' @seealso `trackplot_combine()`, `trackplot_coverage()`, `trackplot_loop()`, `trackplot_scalebar()`, `trackplot_gene()` #' @examples -## Prep data -## Peaks generated from demo frags, as input into `call_peaks_tile()` +#' ## Prep data +#' ## Peaks generated from demo frags, as input into `call_peaks_tile()` #' peaks <- tibble::tibble( #' chr = factor(rep("chr4", 16)), #' start = c(3041400, 3041733, 3037400, 3041933, 3040466, 3041200, @@ -833,18 +833,18 @@ trackplot_genome_annotation <- function(loci, region, color_by = NULL, colors = #' @return Plot of loops connecting genomic coordinates #' @seealso `trackplot_combine()`, `trackplot_coverage()`, `trackplot_gene()`, `trackplot_scalebar()`, `trackplot_genome_annotation()` #' @examples -#' peaks <-c(3054877, 3334877, 3534877, 3634877, 3734877) +#' peaks <- c(3054877, 3334877, 3534877, 3634877, 3734877) #' loops <- tibble::tibble( #' chr = "chr4", -#' start = fake_peaks[c(1,1,2,3)], -#' end = fake_peaks[c(2,3,4,5)], +#' start = peaks[c(1,1,2,3)], +#' end = peaks[c(2,3,4,5)], #' score = c(4,1,3,2) #' ) #' region <- "chr4:3034877-4034877" #' #' ## Plot loops #' plot <- trackplot_loop(loops, region, color_by = "score") -#' BPCells:::render_plot_from_storage(plot, width = 6, height = 1) +#' BPCells:::render_plot_from_storage(plot, width = 6, height = 1.5) #' @export trackplot_loop <- function(loops, region, color_by=NULL, colors=NULL, allow_truncated=TRUE, curvature=0.75, track_label="Links", return_data = FALSE) { region <- normalize_ranges(region) diff --git a/r/man/trackplot_genome_annotation.Rd b/r/man/trackplot_genome_annotation.Rd index 2f9065a2..f5bf7129 100644 --- a/r/man/trackplot_genome_annotation.Rd +++ b/r/man/trackplot_genome_annotation.Rd @@ -48,6 +48,8 @@ Plot of genomic loci if return_data is FALSE, otherwise returns the data frame u Plot range-based annotation tracks (e.g. peaks) } \examples{ +## Prep data +## Peaks generated from demo frags, as input into `call_peaks_tile()` peaks <- tibble::tibble( chr = factor(rep("chr4", 16)), start = c(3041400, 3041733, 3037400, 3041933, 3040466, 3041200, diff --git a/r/man/trackplot_loop.Rd b/r/man/trackplot_loop.Rd index c96e0c29..5ebbb94b 100644 --- a/r/man/trackplot_loop.Rd +++ b/r/man/trackplot_loop.Rd @@ -42,18 +42,18 @@ Plot of loops connecting genomic coordinates Plot loops } \examples{ -peaks <-c(3054877, 3334877, 3534877, 3634877, 3734877) +peaks <- c(3054877, 3334877, 3534877, 3634877, 3734877) loops <- tibble::tibble( chr = "chr4", - start = fake_peaks[c(1,1,2,3)], - end = fake_peaks[c(2,3,4,5)], + start = peaks[c(1,1,2,3)], + end = peaks[c(2,3,4,5)], score = c(4,1,3,2) ) region <- "chr4:3034877-4034877" ## Plot loops plot <- trackplot_loop(loops, region, color_by = "score") -BPCells:::render_plot_from_storage(plot, width = 6, height = 1) +BPCells:::render_plot_from_storage(plot, width = 6, height = 1.5) } \seealso{ \code{trackplot_combine()}, \code{trackplot_coverage()}, \code{trackplot_gene()}, \code{trackplot_scalebar()}, \code{trackplot_genome_annotation()}