diff --git a/DESCRIPTION b/DESCRIPTION
index 1784ceed..b838f52e 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -3,16 +3,17 @@ Type: Package
 Title: A Toolbox for Spatial Gene Expression Analysis
 Version: 0.1.0
 Authors@R: as.person(c(
-    "Jan Kueckelhaus [aut, cre]",
+    "Jan Kueckelhaus [aut, cre]",
     "Dieter-Henrik Heiland [aut]"
     ))
-Description: This package provides a framework of functions and shiny-applications to make it as easy and intuitive as possible to work with spatial gene expression data.
+Description: This package provides a framework of functions and shiny-applications to make it as easy and intuitive as possible to work with spatial gene expression data.
 Encoding: UTF-8
 LazyData: true
 License: GPL-3
 BugReports: themilolab-spata@gmx.de
-RoxygenNote: 7.1.1
-Imports:
+RoxygenNote: 7.2.3
+Imports:
+    biomaRt,
     broom,
     concaveman,
     colorspace,
@@ -33,19 +34,23 @@ Imports:
     rlang,
     readr,
     reticulate,
+    Seurat,
     shiny,
     shinyWidgets,
     shinybusy,
     shinydashboard,
+    SingleCellExperiment,
     sp,
     stringr,
     stringi,
+    SummarizedExperiment,
     tibble,
     tidyr,
     tidytext,
     viridis,
     umap
-Collate:
+Collate:
+    'GetPositions.R'
     'S4-documentation.R'
     'S4-generic-functions.R'
     'S4-programming-aid.R'
@@ -109,6 +114,6 @@ Collate:
     'update-spata-object.R'
     'valid-input-options.R'
     'validation.R'
-Depends:
+Depends:
     R (>= 2.10)
 URL: https://themilolab.com/
diff --git a/R/GetPositions.R b/R/GetPositions.R
new file mode 100644
index 00000000..e18b1baf
--- /dev/null
+++ b/R/GetPositions.R
@@ -0,0 +1,115 @@
+#' Recode chromosome names and keep one locus per gene
+#'
+#' Keeps a single row per gene symbol, preferring the locus on the primary
+#' assembly, and recodes the chromosome names so that they can be sorted
+#' numerically. No row is dropped because of its chromosome: \code{"X"}
+#' becomes \code{"23"}, \code{"Y"} becomes \code{"24"}, and \code{"MT"} as
+#' well as every name longer than two characters (for example the alternative
+#' contig \code{"CHR_HSCHR3_1_CTG2_1"}) becomes \code{"0"}.
+#'
+#' @param gene_positions_df A data frame containing gene positions. The gene
+#'   symbol is expected in the second column and the chromosome name in the
+#'   third column, as returned by \code{getGenePositions()}.
+#' @return The input data frame with duplicated gene symbols removed, the
+#'   chromosome column recoded to character codes and the rows ordered by the
+#'   numeric value of these codes.
+#' @examples
+#' gene_positions <- data.frame(
+#'   ensembl_gene_id = c("ENSG00000261846", "ENSG00000197953", "ENSG00000262466"),
+#'   hgnc_symbol = c("AADACL2", "AADACL2", "AADACL2-AS1"),
+#'   chromosome_name = c("CHR_HSCHR3_1_CTG2_1", "3", "CHR_HSCHR3_1_CTG2_1"),
+#'   start_position = c(151744454, 151733916, 151761981),
+#'   end_position = c(151770036, 151761339, 151765669)
+#' )
+#' ignoreAlternative(gene_positions)
+#'
+#' @export
+ignoreAlternative <- function(gene_positions_df) {
+  chromosome <- as.character(gene_positions_df[[3]])
+
+  # Names longer than two characters are not primary-assembly chromosomes.
+  is_alternative <- !is.na(chromosome) & nchar(chromosome) > 2
+
+  # Put primary-assembly rows first so that duplicated() keeps them. A plain
+  # alphabetical sort is not enough: "CHR_..." sorts before "MT", "X" and "Y".
+  primary_first <- order(is_alternative, chromosome)
+  gene_positions_df <- gene_positions_df[primary_first, , drop = FALSE]
+  is_first_locus <- !duplicated(gene_positions_df[[2]])
+  gene_positions_df <- gene_positions_df[is_first_locus, , drop = FALSE]
+
+  # Recode the chromosome names to numeric codes stored as character.
+  chromosome <- as.character(gene_positions_df[[3]])
+  chromosome[!is.na(chromosome) & nchar(chromosome) > 2] <- "0"
+  chromosome[chromosome %in% "MT"] <- "0"
+  chromosome[chromosome %in% "X"] <- "23"
+  chromosome[chromosome %in% "Y"] <- "24"
+  gene_positions_df[[3]] <- chromosome
+
+  gene_positions_df[order(as.numeric(chromosome)), , drop = FALSE]
+}
+
+#' Receive genomic coordinates of a gene list
+#'
+#' Queries Ensembl BioMart via \pkg{biomaRt} for the genomic positions of a
+#' vector of gene symbols. An internet connection is required.
+#'
+#' @param gene_names A character vector of gene symbols (HGNC symbols for
+#'   human, MGI symbols for mouse). If empty, the positions of all symbols listed
+#'   in the chosen dataset are retrieved.
+#' @param ensembl_version URL of the Ensembl host that is queried. Defaults to
+#'   the archive of Ensembl v109. Use \code{biomaRt::listEnsemblArchives()} to
+#'   check the available versions.
+#' @param species Character value. Either \code{"human"} or \code{"mouse"}.
+#' @param ignoreAlt If set to TRUE: Ignore if multiple loci are reported for a
+#'   gene, pick the one from the primary assembly (see
+#'   \code{ignoreAlternative()}).
+#' @return A data frame with the variables \emph{ensembl_gene_id},
+#'   \emph{hgnc_symbol} (human) or \emph{mgi_symbol} (mouse),
+#'   \emph{chromosome_name}, \emph{start_position} and \emph{end_position}.
+#' @keywords Chromosomal positions
+#' @export
+#' @examples
+#' \dontrun{
+#' getGenePositions(gene_names = c("EGFR", "PDGFRA"))
+#' }
+getGenePositions <- function(gene_names = character(0),
+                             ensembl_version = "https://feb2023.archive.ensembl.org",
+                             species = "human",
+                             ignoreAlt = FALSE) {
+  # Resolve the species first so that unsupported values fail before any
+  # request is sent to Ensembl.
+  biomart_spec <- switch(
+    species,
+    human = list(dataset = "hsapiens_gene_ensembl", symbol = "hgnc_symbol"),
+    mouse = list(dataset = "mmusculus_gene_ensembl", symbol = "mgi_symbol"),
+    stop("Species other than human and mouse are not supported.")
+  )
+  symbol <- biomart_spec$symbol
+
+  ensembl <- biomaRt::useMart(
+    biomart = "ENSEMBL_MART_ENSEMBL",
+    dataset = biomart_spec$dataset,
+    host = ensembl_version
+  )
+
+  # Without gene names, query the positions of every symbol in the dataset.
+  if (length(gene_names) == 0) {
+    gene_names <- biomaRt::getBM(attributes = symbol, mart = ensembl)[[symbol]]
+  }
+
+  gene_positions <- biomaRt::getBM(
+    attributes = c(
+      "ensembl_gene_id", symbol, "chromosome_name",
+      "start_position", "end_position"
+    ),
+    filters = symbol,
+    values = gene_names,
+    mart = ensembl
+  )
+
+  if (isTRUE(ignoreAlt)) {
+    gene_positions <- ignoreAlternative(gene_positions)
+  }
+
+  gene_positions
+}
diff --git a/R/cnv-analysis.R b/R/cnv-analysis.R
index 6690e4c1..adeed52f 100644
--- a/R/cnv-analysis.R
+++ b/R/cnv-analysis.R
@@ -167,11 +167,17 @@ hlpr_run_cnva_pca <- function(object, n_pcs = 30, of_sample = NA, ...){
 #' character variables \emph{ensembl_gene_id}, \emph{hgnc_symbol}, \emph{chromosome_name}
 #' and two numeric variables \emph{start_position} and \emph{end_position.}.
 #'
-#' If NULL the data.frame is created via \code{CONICsmat::getGenePositions()} using
-#' all gene names that appear in the count matrix and in the reference matrix.
+#' If NULL, the data.frame is created via the custom function \code{getGenePositions()},
+#' which is adapted from \code{CONICsmat::getGenePositions()},
+#' using all gene names that are retrieved from Ensembl.
 #'
 #' Defaults to the SPATA2 intern data.frame \code{SPATA2::gene_pos_df}.
 #'
+#' @param remove_alternative_chr Logical value.
+#' If TRUE, genes on chromosome 0 (mitochondria and contigs), 23 (X) and 24 (Y) are removed.
+#' If FALSE, genes on chromosome 0 (mitochondria and contigs), 23 (X) and 24 (Y) are kept.
+#' Defaults to TRUE to reduce confusion of the user.
+#'
 #' @param cnv_prefix Character value. Denotes the string with which the
 #' the feature variables in which the information about the chromosomal gains and
 #' losses are stored are prefixed.
@@ -253,6 +259,7 @@ runCnvAnalysis <- function(object,
                            ref_mtr = cnv_ref[["mtr"]], # reference data set of healthy tissue
                            ref_regions = cnv_ref[["regions"]], # chromosome positions
                            gene_pos_df = SPATA2::gene_pos_df,
+                           remove_alternative_chr = TRUE,
                            directory_cnv_folder = "data-development/cnv-results", # output folder
                            directory_regions_df = NA, # deprecated (chromosome positions)
                            n_pcs = 30,
@@ -401,28 +408,43 @@ runCnvAnalysis <- function(object,
 
   }
 
-  if(base::is.data.frame(gene_pos_df)){
+  if (base::is.data.frame(gene_pos_df)){
+    base::message(
+      "Default or user-provided dataframe is used."
+    )
 
-    confuns::check_data_frame(
-      df = gene_pos_df,
-      var.class = list(
-        ensembl_gene_id = "character",
-        hgnc_symbol = "character",
-        chromosome_name = "character",
-        start_position = "integer",
-        end_position = "integer"
-      )
+  } else if (base::is.null(gene_pos_df)){
+    base::message(
+      "The function getGenePositions() will be used to extract the gene position dataframe."
     )
+    # custom getGenePositions() is adapted from CONICSmat::getGenePositions()
+    gene_pos_df <- getGenePositions(ignoreAlt = TRUE)
+    # usethis::use_data(gene_pos_df, overwrite = TRUE) # how to make the built-in rda.
 
   } else {
+    base::stop("No other options for gene position dataframe.")
+  }
 
-    gene_pos_df <-
-      CONICSmat::getGenePositions(gene_names = base::rownames(expr_inter))
+  # Validate the column type of the gene_pos_df.
+  confuns::check_data_frame(
+    df = gene_pos_df,
+    var.class = list(
+      ensembl_gene_id = "character",
+      hgnc_symbol = "character",
+      chromosome_name = "character",
+      start_position = "integer",
+      end_position = "integer"
+    )
+  )
 
+  # Remove the chromosome of 0 (mitochondria and contigs), 23 (X), and 24 (Y)
+  if (base::isTRUE(remove_alternative_chr)) {
+    gene_pos_df <- dplyr::filter(gene_pos_df, !(chromosome_name %in% c("0", "23", "24")))
   }
 
 
 
+
   # -----
 
 
@@ -687,7 +709,7 @@ runCnvAnalysis <- function(object,
   result_dir <-
     stringr::str_c(directory_cnv_folder, "/", plot_cnv$output_filename, ".observations.txt")
 
-  results <- utils::read.table(result_dir)
+  results <- utils::read.table(result_dir, check.names = FALSE)
 
   bcs_object <-
     getFeatureDf(object) %>%
@@ -724,7 +746,6 @@ runCnvAnalysis <- function(object,
     base::as.data.frame() %>%
     tibble::rownames_to_column(var = "barcodes") %>%
     magrittr::set_colnames(value = cnames) %>%
-    dplyr::mutate(barcodes = stringr::str_replace_all(string = barcodes, pattern = "\\.", replacement = "-")) %>%
     dplyr::mutate(dplyr::across(dplyr::starts_with(match = cnv_prefix), .fns = base::as.numeric)) %>%
     tibble::as_tibble()
 
@@ -740,13 +761,6 @@ runCnvAnalysis <- function(object,
   )
 
   # cnv matrix
-  base::colnames(results) <-
-    stringr::str_replace_all(
-      string = base::colnames(results),
-      pattern = "\\.",
-      replacement = "-"
-    )
-
   cnv_mtr <- base::as.matrix(results)
 
   # cnv list
diff --git a/data/gene_pos_df.rda b/data/gene_pos_df.rda
index 7635fd7c..d89fe430 100644
Binary files a/data/gene_pos_df.rda and b/data/gene_pos_df.rda differ
diff --git a/man/gene_pos_df.Rd b/man/gene_pos_df.Rd
index fae1038b..f150cef4 100644
--- a/man/gene_pos_df.Rd
+++ b/man/gene_pos_df.Rd
@@ -6,7 +6,7 @@
 \title{A data.frame necessary for cnv-analysis. Contains information about the gene positions
 on chromosomes. Contains the following variables:}
 \format{
-An object of class \code{data.frame} with 17208 rows and 5 columns.
+An object of class \code{data.frame} with 40249 rows and 5 columns.
 }
 \usage{
 gene_pos_df
diff --git a/man/getGenePositions.Rd b/man/getGenePositions.Rd
new file mode 100644
index 00000000..9878d071
--- /dev/null
+++ b/man/getGenePositions.Rd
@@ -0,0 +1,44 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/GetPositions.R
+\name{getGenePositions}
+\alias{getGenePositions}
+\title{Receive genomic coordinates of a gene list}
+\usage{
+getGenePositions(
+  gene_names = character(0),
+  ensembl_version = "https://feb2023.archive.ensembl.org",
+  species = "human",
+  ignoreAlt = FALSE
+)
+}
+\arguments{
+\item{gene_names}{A character vector of gene symbols (HGNC symbols for
+human, MGI symbols for mouse). If empty, the positions of all symbols listed
+in the chosen dataset are retrieved.}
+
+\item{ensembl_version}{URL of the Ensembl host that is queried. Defaults to
+the archive of Ensembl v109. Use \code{biomaRt::listEnsemblArchives()} to
+check the available versions.}
+
+\item{species}{Character value. Either \code{"human"} or \code{"mouse"}.}
+
+\item{ignoreAlt}{If set to TRUE: Ignore if multiple loci are reported for a
+gene, pick the one from the primary assembly (see
+\code{ignoreAlternative()}).}
+}
+\value{
+A data frame with the variables \emph{ensembl_gene_id},
+\emph{hgnc_symbol} (human) or \emph{mgi_symbol} (mouse),
+\emph{chromosome_name}, \emph{start_position} and \emph{end_position}.
+}
+\description{
+Queries Ensembl BioMart via \pkg{biomaRt} for the genomic positions of a
+vector of gene symbols. An internet connection is required.
+}
+\examples{
+\dontrun{
+getGenePositions(gene_names = c("EGFR", "PDGFRA"))
+}
+}
+\keyword{Chromosomal}
+\keyword{positions}
diff --git a/man/ignoreAlternative.Rd b/man/ignoreAlternative.Rd
new file mode 100644
index 00000000..43165767
--- /dev/null
+++ b/man/ignoreAlternative.Rd
@@ -0,0 +1,37 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/GetPositions.R
+\name{ignoreAlternative}
+\alias{ignoreAlternative}
+\title{Recode chromosome names and keep one locus per gene}
+\usage{
+ignoreAlternative(gene_positions_df)
+}
+\arguments{
+\item{gene_positions_df}{A data frame containing gene positions. The gene
+symbol is expected in the second column and the chromosome name in the
+third column, as returned by \code{getGenePositions()}.}
+}
+\value{
+The input data frame with duplicated gene symbols removed, the
+chromosome column recoded to character codes and the rows ordered by the
+numeric value of these codes.
+}
+\description{
+Keeps a single row per gene symbol, preferring the locus on the primary
+assembly, and recodes the chromosome names so that they can be sorted
+numerically. No row is dropped because of its chromosome: \code{"X"}
+becomes \code{"23"}, \code{"Y"} becomes \code{"24"}, and \code{"MT"} as
+well as every name longer than two characters (for example the alternative
+contig \code{"CHR_HSCHR3_1_CTG2_1"}) becomes \code{"0"}.
+}
+\examples{
+gene_positions <- data.frame(
+  ensembl_gene_id = c("ENSG00000261846", "ENSG00000197953", "ENSG00000262466"),
+  hgnc_symbol = c("AADACL2", "AADACL2", "AADACL2-AS1"),
+  chromosome_name = c("CHR_HSCHR3_1_CTG2_1", "3", "CHR_HSCHR3_1_CTG2_1"),
+  start_position = c(151744454, 151733916, 151761981),
+  end_position = c(151770036, 151761339, 151765669)
+)
+ignoreAlternative(gene_positions)
+
+}